com.sleepycat.je.rep.impl.node.Replay Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of je Show documentation
Berkeley DB Java Edition is a open source, transactional storage solution for Java applications. The Direct Persistence Layer (DPL) API is faster and easier to develop, deploy, and manage than serialized object files or ORM-based Java persistence solutions. The Collections API enhances the standard java.util.collections classes allowing them to be persisted to a local file system and accessed concurrently while protected by ACID transactions. Data is stored by serializing objects and managing class and instance data separately so as not to waste space. Berkeley DB Java Edition is the reliable drop-in solution for complex, fast, and scalable storage. Source for this release is in 'je-4.0.92-sources.jar', the Javadoc is located at 'http://download.oracle.com/berkeley-db/docs/je/4.0.92/'.
There is a newer version: 5.0.73
Show newest version
/*-
 * Copyright (C) 2002, 2018, Oracle and/or its affiliates. All rights reserved.
 *
 * This file was distributed by Oracle as part of a version of Oracle Berkeley
 * DB Java Edition made available at:
 *
 * http://www.oracle.com/technetwork/database/database-technologies/berkeleydb/downloads/index.html
 *
 * Please see the LICENSE file included in the top-level directory of the
 * appropriate version of Oracle Berkeley DB Java Edition for a copy of the
 * license and additional information.
 */

package com.sleepycat.je.rep.impl.node;

import static com.sleepycat.je.ExtinctionFilter.ExtinctionStatus.NOT_EXTINCT;
import static com.sleepycat.je.log.LogEntryType.LOG_NAMELN_TRANSACTIONAL;
import static com.sleepycat.je.log.LogEntryType.LOG_TXN_ABORT;
import static com.sleepycat.je.log.LogEntryType.LOG_TXN_COMMIT;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.ELAPSED_TXN_95_MS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.ELAPSED_TXN_99_MS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.ELAPSED_TXN_AVG_MS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.ELAPSED_TXN_MAX_MS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.LATEST_COMMIT_LAG_MS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.MAX_COMMIT_PROCESSING_NANOS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.MIN_COMMIT_PROCESSING_NANOS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.N_ABORTS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.N_COMMITS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.N_COMMIT_ACKS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.N_COMMIT_NO_SYNCS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.N_COMMIT_SYNCS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.N_COMMIT_WRITE_NO_SYNCS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.N_GROUP_COMMITS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.N_GROUP_COMMIT_MAX_EXCEEDED;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.N_GROUP_COMMIT_TIMEOUTS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.N_GROUP_COMMIT_TXNS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.N_LNS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.N_MESSAGE_QUEUE_OVERFLOWS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.N_NAME_LNS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.OUTPUT_QUEUE_95_DELAY_MS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.OUTPUT_QUEUE_99_DELAY_MS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.OUTPUT_QUEUE_AVG_DELAY_MS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.OUTPUT_QUEUE_MAX_DELAY_MS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.REPLAY_QUEUE_95_DELAY_MS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.REPLAY_QUEUE_99_DELAY_MS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.REPLAY_QUEUE_AVG_DELAY_MS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.REPLAY_QUEUE_MAX_DELAY_MS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.TOTAL_COMMIT_PROCESSING_NANOS;
import static java.util.concurrent.TimeUnit.MILLISECONDS;
import static java.util.concurrent.TimeUnit.NANOSECONDS;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.logging.Level;
import java.util.logging.Logger;

import com.sleepycat.je.Cursor;
import com.sleepycat.je.DatabaseConfig;
import com.sleepycat.je.DatabaseEntry;
import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.DatabaseNotFoundException;
import com.sleepycat.je.DbInternal;
import com.sleepycat.je.Durability.SyncPolicy;
import com.sleepycat.je.EnvironmentFailureException;
import com.sleepycat.je.ExtinctionFilter.ExtinctionStatus;
import com.sleepycat.je.LockMode;
import com.sleepycat.je.OperationResult;
import com.sleepycat.je.StatsConfig;
import com.sleepycat.je.TransactionConfig;
import com.sleepycat.je.dbi.DatabaseId;
import com.sleepycat.je.dbi.DatabaseImpl;
import com.sleepycat.je.dbi.DbConfigManager;
import com.sleepycat.je.dbi.DbTree.TruncateDbResult;
import com.sleepycat.je.dbi.DbType;
import com.sleepycat.je.dbi.EnvironmentFailureReason;
import com.sleepycat.je.dbi.PutMode;
import com.sleepycat.je.dbi.SearchMode;
import com.sleepycat.je.dbi.TTL;
import com.sleepycat.je.dbi.TriggerManager;
import com.sleepycat.je.log.DbOpReplicationContext;
import com.sleepycat.je.log.FileManager;
import com.sleepycat.je.log.LogEntryType;
import com.sleepycat.je.log.LogManager;
import com.sleepycat.je.log.ReplicationContext;
import com.sleepycat.je.log.entry.DbOperationType;
import com.sleepycat.je.log.entry.LNLogEntry;
import com.sleepycat.je.log.entry.LogEntry;
import com.sleepycat.je.log.entry.NameLNLogEntry;
import com.sleepycat.je.log.entry.SingleItemEntry;
import com.sleepycat.je.recovery.RecoveryInfo;
import com.sleepycat.je.recovery.RollbackTracker;
import com.sleepycat.je.rep.LogFileRewriteListener;
import com.sleepycat.je.rep.SyncupProgress;
import com.sleepycat.je.rep.impl.RepGroupDB;
import com.sleepycat.je.rep.impl.RepImpl;
import com.sleepycat.je.rep.impl.RepParams;
import com.sleepycat.je.rep.stream.InputWireRecord;
import com.sleepycat.je.rep.stream.MasterStatus.MasterSyncException;
import com.sleepycat.je.rep.stream.Protocol;
import com.sleepycat.je.rep.txn.ReplayTxn;
import com.sleepycat.je.rep.utilint.LongMinZeroStat;
import com.sleepycat.je.rep.utilint.SimpleTxnMap;
import com.sleepycat.je.rep.vlsn.VLSNRange;
import com.sleepycat.je.tree.Key;
import com.sleepycat.je.tree.LN;
import com.sleepycat.je.tree.NameLN;
import com.sleepycat.je.txn.RollbackEnd;
import com.sleepycat.je.txn.RollbackStart;
import com.sleepycat.je.txn.Txn;
import com.sleepycat.je.txn.TxnAbort;
import com.sleepycat.je.txn.TxnCommit;
import com.sleepycat.je.txn.TxnEnd;
import com.sleepycat.je.utilint.DbLsn;
import com.sleepycat.je.utilint.LatencyPercentileStat;
import com.sleepycat.je.utilint.LoggerUtils;
import com.sleepycat.je.utilint.LongAvgStat;
import com.sleepycat.je.utilint.LongMaxStat;
import com.sleepycat.je.utilint.LongMaxZeroStat;
import com.sleepycat.je.utilint.LongMinStat;
import com.sleepycat.je.utilint.LongStat;
import com.sleepycat.je.utilint.NanoTimeUtil;
import com.sleepycat.je.utilint.StatGroup;
import com.sleepycat.je.utilint.VLSN;
import com.sleepycat.utilint.StringUtils;

/**
 * Replays log records from the replication stream, and manages the
 * transactions for those records.
 *
 * The Replay module has a lifetime equivalent to the environment owned by
 * this replicator. Its lifetime is longer than the feeder/replica stream.
 * For example, suppose this is nodeX:
 *
 * t1 - Node X is a replica, node A is master. Replay X is alive
 * t2 - Node X is a replica, node B takes over as master. X's Replay module
 *      is still alive and has the same set of active txns. It doesn't matter
 *      to X that the master has changed.
 * t3 - Node X becomes the master. Now its Replay unit is cleared, because
 *      anything managed by the Replay is defunct.
 */
public class Replay {

    /* These are strings for the rollback logging. */
    private static final String RBSTATUS_START =
        "Started Rollback";
    private static final String RBSTATUS_NO_ACTIVE =
        "No active txns, nothing to rollback";
    private static final String RBSTATUS_RANGE_EQUALS =
        "End of range equals matchpoint, nothing to rollback";
    private static final String RBSTATUS_LOG_RBSTART =
        "Logged RollbackStart entry";
    private static final String RBSTATUS_MEM_ROLLBACK =
        "Finished in-memory rollback";
    private static final String RBSTATUS_INVISIBLE =
        "Finished invisible setting";
    private static final String RBSTATUS_FINISH =
        "Finished rollback";

    /*
     * DatabaseEntry objects reused during replay, to minimize allocation in
     * high performance replay path.
     */
    final DatabaseEntry replayKeyEntry = new DatabaseEntry();
    final DatabaseEntry replayDataEntry = new DatabaseEntry();
    final DatabaseEntry delDataEntry = new DatabaseEntry();

    private final RepImpl repImpl;

    /**
     *  If a commit replay operation takes more than this threshold, it's
     *  logged. This information helps determine whether ack timeouts on the
     *  master are due to a slow replica, or the network.
     */
    private final long ackTimeoutLogThresholdNs;

    /**
     * ActiveTxns is a collection of txn objects used for applying replicated
     * transactions. This collection should be empty if the node is a master.
     *
     * Note that there is an interesting relationship between ActiveTxns and
     * the txn collection managed by the environment TxnManager. ActiveTxns is
     * effectively a subset of the set of txns held by the
     * TxnManager. ReplayTxns must be sure to register and unregister
     * themselves from ActiveTxns, just as all Txns must register and
     * unregister with the TxnManager's set. One implementation alternative to
     * having an ActiveTxns map here is to search the TxnManager set (which is
     * a set, not a map) for a given ReplayTxn. Another is to subclass
     * TxnManager so that replicated nodes have their own replayTxn map, just
     * as XATxns have a XID->Txn map.
     *
     * Both alternatives seemed too costly in terms of performance or elaborate
     * in terms of code to do for the current function. It seems clearer to
     * make the ActiveTxns a map in the one place that it is currently
     * used. This choice may change over time, and should be reevaluated if the
     * implementation changes.
     *
     * The ActiveTxns key is the transaction id. These transactions are closed
     * when:
     * - the replay unit executes a commit received over the replication stream
     * - the replay unit executes an abort received over the replication stream
     * - the replication node realizes that it has just become the new master,
     *   and was not previously the master.
     *
     * Note that the Replay class has a lifetime that is longer than that of a
     * RepNode. This means in particular, that transactions may be left open,
     * and will be resumed when a replica switches from one master to another,
     * creating a new RepNode in the process. Because of that, part of the
     * transaction may be implemented by the rep stream from one master and
     * another part by another.
     *
     * The map is synchronized, so simple get/put operations do not require
     * additional synchronization.  However, iteration requires synchronization
     * and copyActiveTxns can be used in most cases.
     */
    private final SimpleTxnMap activeTxns;

    /*
     * The entry representing the last replayed txn commit. Supports the
     * replica's notion of consistency.
     */
    private volatile TxnInfo lastReplayedTxn = null;

    /*
     * The last replayed entry of any kind. Supports PointConsistencyPolicy.
     */
    private volatile VLSN lastReplayedVLSN = null;

    /*
     * The last replayed DTVLSN in the stream. It's used to ensure that the
     * DTVLSNs in the stream are correctly sequenced.
     */
    private long lastReplayedDTVLSN = VLSN.NULL_VLSN_SEQUENCE;

    /*
     * The sync policy to be used in the absence of an ACK request. The replica
     * in this case has some latitude about how it syncs the commit.
     */
    private final SyncPolicy noAckSyncPolicy = SyncPolicy.NO_SYNC;

    /**
     *  The RepParams.REPLAY_LOGGING_THRESHOLD configured logging threshold.
     */
    private final long replayLoggingThresholdNs;

    /**
     * State that is reinitialized by the reinit() method each time a replay
     * loop is started with a new feeder.
     */

    /**
     *  All writes (predominantly acks) are queued here, so they do not block
     *  the replay thread.
     */
    private final BlockingQueue outputQueue;

    /**
     * Holds the state associated with group commits.
     */
    private final GroupCommit groupCommit;

    /* Maintains the statistics associated with stream replay. */
    private final StatGroup statistics;
    private final LongStat nCommits;
    private final LongStat nCommitAcks;
    private final LongStat nCommitSyncs;
    private final LongStat nCommitNoSyncs;
    private final LongStat nCommitWriteNoSyncs;
    private final LongStat nAborts;
    private final LongStat nNameLNs;
    private final LongStat nLNs;
    private final LongStat nMessageQueueOverflows;
    private final LongMinStat minCommitProcessingNanos;
    private final LongMaxStat maxCommitProcessingNanos;
    private final LongStat totalCommitProcessingNanos;
    private final LongStat latestCommitLagMs;

    private volatile long heartbeatRequestEnqueueTime = 0;
    private volatile long heartbeatRequestMasterNow = 0;
    private final LongAvgStat replayQueueAvgDelayMs;
    private final LatencyPercentileStat replayQueue95DelayMs;
    private final LatencyPercentileStat replayQueue99DelayMs;
    private final LongMaxStat replayQueueMaxDelayMs;

    private volatile long heartbeatResponseEnqueueTime = 0;
    private final LongAvgStat outputQueueAvgDelayMs;
    private final LatencyPercentileStat outputQueue95DelayMs;
    private final LatencyPercentileStat outputQueue99DelayMs;
    private final LongMaxStat outputQueueMaxDelayMs;

    private final LongAvgStat elapsedTxnAvgMs;
    private final LatencyPercentileStat elapsedTxn95Ms;
    private final LatencyPercentileStat elapsedTxn99Ms;
    private final LongMaxStat elapsedTxnMaxMs;

    private final Logger logger;
    public Replay(RepImpl repImpl,
                  @SuppressWarnings("unused") NameIdPair nameIdPair) {

        /*
         * This should have already been caught in
         * ReplicatedEnvironment.setupRepConfig, but it is checked here anyway
         * as an added sanity check. [#17643]
         */
        if (repImpl.isReadOnly()) {
            throw EnvironmentFailureException.unexpectedState
                ("Replay created with readonly ReplicatedEnvironment");
        }

        this.repImpl = repImpl;
        final DbConfigManager configManager = repImpl.getConfigManager();

        ackTimeoutLogThresholdNs = MILLISECONDS.toNanos(configManager.
            getDuration(RepParams.REPLICA_ACK_TIMEOUT));

        /**
         * The factor of 2 below is somewhat arbitrary. It should be > 1 X so
         * that the ReplicaOutputThread can completely process the buffered
         * messages in the face of a network drop and 2X to allow for
         * additional headroom and minimize the chances that the replay might
         * be blocked due to the limited queue length.
         */
        final int outputQueueSize = 2 *
            configManager.getInt(RepParams.REPLICA_MESSAGE_QUEUE_SIZE);
        outputQueue = new ArrayBlockingQueue(outputQueueSize);

        /*
         * The Replay module manages all write transactions and mimics a
         * writing application thread. When the node comes up, it populates
         * the activeTxn collection with ReplayTxns that were resurrected
         * at recovery time.
         */
        activeTxns = new SimpleTxnMap<>(1024);

        /*
         * Configure the data entry used for deletion to avoid fetching the
         * old data during deletion replay.
         */
        delDataEntry.setPartial(0, 0, true);

        logger = LoggerUtils.getLogger(getClass());
        statistics = new StatGroup(ReplayStatDefinition.GROUP_NAME,
                                   ReplayStatDefinition.GROUP_DESC);

        groupCommit = new GroupCommit(configManager);

        nCommits = new LongStat(statistics, N_COMMITS);
        nCommitAcks = new LongStat(statistics, N_COMMIT_ACKS);
        nCommitSyncs = new LongStat(statistics, N_COMMIT_SYNCS);
        nCommitNoSyncs = new LongStat(statistics, N_COMMIT_NO_SYNCS);
        nCommitWriteNoSyncs =
            new LongStat(statistics, N_COMMIT_WRITE_NO_SYNCS);
        nAborts = new LongStat(statistics, N_ABORTS);
        nNameLNs = new LongStat(statistics, N_NAME_LNS);
        nLNs = new LongStat(statistics, N_LNS);
        nMessageQueueOverflows =
            new LongStat(statistics, N_MESSAGE_QUEUE_OVERFLOWS);
        minCommitProcessingNanos =
            new LongMinZeroStat(statistics, MIN_COMMIT_PROCESSING_NANOS);
        maxCommitProcessingNanos =
            new LongMaxZeroStat(statistics, MAX_COMMIT_PROCESSING_NANOS);
        totalCommitProcessingNanos =
            new LongStat(statistics, TOTAL_COMMIT_PROCESSING_NANOS);
        latestCommitLagMs = new LongStat(statistics, LATEST_COMMIT_LAG_MS);

        replayQueueAvgDelayMs =
            new LongAvgStat(statistics, REPLAY_QUEUE_AVG_DELAY_MS);
        replayQueue95DelayMs = new LatencyPercentileStat(
            statistics, REPLAY_QUEUE_95_DELAY_MS, 0.95f);
        replayQueue99DelayMs = new LatencyPercentileStat(
            statistics, REPLAY_QUEUE_99_DELAY_MS, 0.99f);
        replayQueueMaxDelayMs = new LongMaxStat(
            statistics, REPLAY_QUEUE_MAX_DELAY_MS);

        outputQueueAvgDelayMs =
            new LongAvgStat(statistics, OUTPUT_QUEUE_AVG_DELAY_MS);
        outputQueue95DelayMs = new LatencyPercentileStat(
            statistics, OUTPUT_QUEUE_95_DELAY_MS, 0.95f);
        outputQueue99DelayMs = new LatencyPercentileStat(
            statistics, OUTPUT_QUEUE_99_DELAY_MS, 0.99f);
        outputQueueMaxDelayMs = new LongMaxStat(
            statistics, OUTPUT_QUEUE_MAX_DELAY_MS);

        elapsedTxnAvgMs = new LongAvgStat(statistics, ELAPSED_TXN_AVG_MS);
        elapsedTxn95Ms =
            new LatencyPercentileStat(statistics, ELAPSED_TXN_95_MS, 0.95f);
        elapsedTxn99Ms =
            new LatencyPercentileStat(statistics, ELAPSED_TXN_99_MS, 0.99f);
        elapsedTxnMaxMs = new LongMaxStat(statistics, ELAPSED_TXN_MAX_MS);

        replayLoggingThresholdNs = MILLISECONDS.toNanos(configManager.
           getDuration(RepParams.REPLAY_LOGGING_THRESHOLD));
    }

    /**
     * Note that a heartbeat request has been received over the network and is
     * about to be added to the replay queue. If no heartbeat request is being
     * tracked, note the enqueuing time and the heartbeat's master time. When
     * we dequeue this same request, as determined by matching the master time,
     * we will compute the time it spent in the queue and use that to tally
     * statistics about the replay queue delays.
     */
    void noteEnqueueHeartbeatRequest(Protocol.Heartbeat heartbeat) {
        if (heartbeatRequestEnqueueTime == 0) {
            heartbeatRequestEnqueueTime = System.currentTimeMillis();
            heartbeatRequestMasterNow = heartbeat.getMasterNow();
        }
    }

    /**
     * Note that an incoming heartbeat request has been retrieved from the
     * replay queue in preparation for processing. If we are tracking a
     * heartbeat request and this request's master time matches the entry being
     * tracked, tally the replay queue delay statistics.
     *
     * Then, if no heartbeat response is being tracked, note the time of
     * enqueuing the heartbeat response to the output queue.
     *
     * @return whether to track the heartbeat response time
     */
    boolean noteDequeueHeartbeatRequest(Protocol.Heartbeat heartbeat) {
        long now = 0;
        if ((heartbeatRequestEnqueueTime != 0) &&
            (heartbeatRequestMasterNow == heartbeat.getMasterNow())) {
            now = System.currentTimeMillis();
            final long delay = now - heartbeatRequestEnqueueTime;
            replayQueueAvgDelayMs.add(delay);
            replayQueue95DelayMs.add(delay);
            replayQueue99DelayMs.add(delay);
            replayQueueMaxDelayMs.setMax(delay);
            heartbeatRequestEnqueueTime = 0;
        }
        if (heartbeatResponseEnqueueTime == 0) {
            heartbeatResponseEnqueueTime =
                (now == 0) ? System.currentTimeMillis() : now;
            return true;
        }
        return false;
    }

    /**
     * Note that an outgoing heartbeat response has been retrieved from the
     * output queue in preparation for sending it out over the network.  Tally
     * output queue delay statistics if we were tracking a heartbeat response.
     */
    void noteDequeueHeartbeatResponse() {
        if (heartbeatResponseEnqueueTime != 0) {
            final long delay =
                System.currentTimeMillis() - heartbeatResponseEnqueueTime;
            outputQueueAvgDelayMs.add(delay);
            outputQueue95DelayMs.add(delay);
            outputQueue99DelayMs.add(delay);
            outputQueueMaxDelayMs.setMax(delay);
            heartbeatResponseEnqueueTime = 0;
        }
    }

    public BlockingQueue getOutputQueue() {
        return outputQueue;
    }

    /**
     * Reinitialize for replay from a new feeder
     */
    public void reset() {
        outputQueue.clear();
    }

    LongStat getMessageQueueOverflows() {
        return nMessageQueueOverflows;
    }

    /**
     * Actions that must be taken before the recovery checkpoint, whether
     * the environment is read/write or read/only.
     */
    public void preRecoveryCheckpointInit(RecoveryInfo recoveryInfo) {
        for (Txn txn : recoveryInfo.replayTxns.values()) {

            /*
             * ReplayTxns need to know about their owning activeTxn map,
             * so they can remove themselves at close. We are casting upwards,
             * because the non-HA code is prohibited from referencing
             * Replication classes, and the RecoveryInfo.replayTxns collection
             * doesn't know that it's got ReplayTxns.
             */
            ((ReplayTxn) txn).registerWithActiveTxns(activeTxns);
        }
        lastReplayedVLSN = repImpl.getVLSNIndex().getRange().getLast();
    }

    public TxnInfo getLastReplayedTxn() {
        return lastReplayedTxn;
    }

    public VLSN getLastReplayedVLSN() {
        return lastReplayedVLSN;
    }

    /**
     * When mastership changes, all inflight replay transactions are aborted.
     * Replay transactions need only be aborted by the node that has become
     * the new master (who was previously a Replica). The replay transactions
     * on the other replicas who have not changed roles are
     * resolved by the abort record issued by said new master.
     */
    public void abortOldTxns()
        throws DatabaseException {

        final int masterNodeId = repImpl.getNodeId();
        for (ReplayTxn replayTxn : copyActiveTxns().values()) {
            /*
             * Use NULL for the DTVLSN since it's being written as the MASTER
             * despite being a ReplayTxn; it will be corrected when it's
             * written to the log.
             */
            replayTxn.abort(ReplicationContext.MASTER, masterNodeId,
                            VLSN.NULL_VLSN_SEQUENCE);
        }
        assert activeTxns.isEmpty() : "Unexpected txns in activeTxns = " +
            activeTxns;
    }

    private void updateCommitStats(final boolean needsAck,
                                   final SyncPolicy syncPolicy,
                                   final long startTimeNanos,
                                   final long masterCommitTimeMs,
                                   final long replicaCommitTimeMs,
                                   final long masterTxnTimeMs) {

        final long now = System.nanoTime();
        final long commitNanos = now - startTimeNanos;

        if (commitNanos > ackTimeoutLogThresholdNs &&
            logger.isLoggable(Level.INFO)) {
            LoggerUtils.info
                (logger, repImpl,
                 "Replay commit time: " + (commitNanos / 1000000) +
                 " ms exceeded log threshold: " +
                 (ackTimeoutLogThresholdNs / 1000000));
        }

        nCommits.increment();

        if (needsAck) {
            nCommitAcks.increment();
        }

        if (syncPolicy == SyncPolicy.SYNC) {
            nCommitSyncs.increment();
        } else if (syncPolicy == SyncPolicy.NO_SYNC) {
            nCommitNoSyncs.increment();
        } else if (syncPolicy == SyncPolicy.WRITE_NO_SYNC) {
            nCommitWriteNoSyncs.increment();
        } else {
            throw EnvironmentFailureException.unexpectedState
                ("Unknown sync policy: " + syncPolicy);
        }

        totalCommitProcessingNanos.add(commitNanos);
        minCommitProcessingNanos.setMin(commitNanos);
        maxCommitProcessingNanos.setMax(commitNanos);

        updateElapsedTxnStats(masterTxnTimeMs);

        /*
         * Tally the lag between master and replica commits, even if clock skew
         * makes the lag appear negative.  The documentation already warns that
         * the value will be affected by clock skew, so users can adjust for
         * that, but only if we don't throw the information way.
         */
        final long replicaLagMs = replicaCommitTimeMs - masterCommitTimeMs;
        latestCommitLagMs.set(replicaLagMs);
    }

    private void updateElapsedTxnStats(long masterTxnTimeMs) {
        elapsedTxnAvgMs.add(masterTxnTimeMs);
        elapsedTxn95Ms.add(masterTxnTimeMs);
        elapsedTxn99Ms.add(masterTxnTimeMs);
        elapsedTxnMaxMs.setMax(masterTxnTimeMs);
    }

    /**
     * Apply the operation represented by this log entry on this replica node.
     */
    public void replayEntry(long startNs,
                            Protocol.Entry entry)
        throws DatabaseException,
               IOException,
               InterruptedException,
               MasterSyncException {

        final InputWireRecord wireRecord = entry.getWireRecord();
        final LogEntry logEntry = wireRecord.getLogEntry();

        /*
         * Sanity check that the replication stream is in sequence. We want to
         * forestall any possible corruption from replaying invalid entries.
         */
        if (!wireRecord.getVLSN().follows(lastReplayedVLSN)) {
            throw EnvironmentFailureException.unexpectedState
                (repImpl,
                 "Rep stream not sequential. Current VLSN: " +
                 lastReplayedVLSN +
                 " next log entry VLSN: " + wireRecord.getVLSN());
        }

        if (logger.isLoggable(Level.FINEST)) {
            LoggerUtils.finest(logger, repImpl, "Replaying " + wireRecord);
        }

        final ReplayTxn repTxn = getReplayTxn(logEntry.getTransactionId(), true);
        updateReplicaSequences(logEntry);
        final byte entryType = wireRecord.getEntryType();

        lastReplayedVLSN = wireRecord.getVLSN();

        try {
            final long txnId = repTxn.getId();

            if (LOG_TXN_COMMIT.equalsType(entryType)) {
                Protocol.Commit commitEntry = (Protocol.Commit) entry;

                final boolean needsAck = commitEntry.getNeedsAck();
                final SyncPolicy txnSyncPolicy =
                    commitEntry.getReplicaSyncPolicy();
                final SyncPolicy implSyncPolicy =
                    needsAck ?
                    groupCommit.getImplSyncPolicy(txnSyncPolicy) :
                    noAckSyncPolicy;

                logReplay(repTxn, needsAck, implSyncPolicy);

                final TxnCommit commit = (TxnCommit) logEntry.getMainItem();
                final long dtvlsn = updateDTVLSN(commit);

                if (needsAck) {

                    /*
                     * Only wait if the replica is not lagging and the
                     * durability requires it.
                     */
                    repImpl.getRepNode().getVLSNFreezeLatch().awaitThaw();
                    repImpl.getRepNode().getMasterStatus().assertSync();
                }

                repTxn.commit(implSyncPolicy,
                              new ReplicationContext(lastReplayedVLSN),
                              commit.getMasterNodeId(),
                              dtvlsn);

                final long masterCommitTimeMs = commit.getTime().getTime();
                lastReplayedTxn = new TxnInfo(lastReplayedVLSN,
                                              masterCommitTimeMs);

                updateCommitStats(needsAck, implSyncPolicy, startNs,
                                  masterCommitTimeMs, repTxn.getEndTime(),
                                  repTxn.elapsedTime());

                /* Respond to the feeder. */
                if (needsAck) {
                    /*
                     * Need an ack, either buffer it, for sync group commit, or
                     * queue it.
                     */
                    if (!groupCommit.bufferAck(startNs, repTxn,
                                               txnSyncPolicy)) {
                        queueAck(txnId);
                    }
                }

                /*
                 * The group refresh and recalculation can be expensive, since
                 * it may require a database read. Do it after the ack.
                 */
                if (repTxn.getRepGroupDbChange() && canRefreshGroup(repTxn)) {
                    repImpl.getRepNode().refreshCachedGroup();
                    repImpl.getRepNode().recalculateGlobalCBVLSN();
                }

            } else if (LOG_TXN_ABORT.equalsType(entryType)) {

                nAborts.increment();
                final TxnAbort abort = (TxnAbort) logEntry.getMainItem();
                final ReplicationContext abortContext =
                    new ReplicationContext(wireRecord.getVLSN());
                if (logger.isLoggable(Level.FINEST)) {
                    LoggerUtils.finest(logger, repImpl,
                                       "abort called for " + txnId +
                                       " masterId=" +
                                       abort.getMasterNodeId() +
                                       " repContext=" + abortContext);
                }

                long dtvlsn = updateDTVLSN(abort);
                repTxn.abort(abortContext, abort.getMasterNodeId(), dtvlsn);
                lastReplayedTxn = new TxnInfo(lastReplayedVLSN,
                                              abort.getTime().getTime());
                if (repTxn.getRepGroupDbChange() && canRefreshGroup(repTxn)) {

                    /*
                     * Refresh is the safe thing to do on an abort, since a
                     * refresh may have been held back from an earlier commit
                     * due to this active transaction.
                     */
                    repImpl.getRepNode().refreshCachedGroup();
                }
                updateElapsedTxnStats(repTxn.elapsedTime());

            } else if (LOG_NAMELN_TRANSACTIONAL.equalsType(entryType)) {

                repImpl.getRepNode().getReplica().clearDbTreeCache();
                nNameLNs.increment();
                applyNameLN(repTxn, wireRecord);

            } else {
                nLNs.increment();
                /* A data operation. */
                assert wireRecord.getLogEntry() instanceof LNLogEntry;
                applyLN(repTxn, wireRecord);
            }

            /* Remember the last VLSN applied by this txn. */
            repTxn.setLastAppliedVLSN(lastReplayedVLSN);

        } catch (DatabaseException e) {
            e.addErrorMessage("Problem seen replaying entry " + wireRecord);
            throw e;
        } finally {
            final long elapsedNs = System.nanoTime() - startNs;
            if (elapsedNs > replayLoggingThresholdNs) {
                LoggerUtils.info(logger, repImpl,
                                 "Replay time for entry type:" +
                                 LogEntryType.findType(entryType) + " " +
                                 NANOSECONDS.toMillis(elapsedNs) + "ms " +
                                 "exceeded threshold:" +
                                 NANOSECONDS.
                                     toMillis(replayLoggingThresholdNs) +
                                 "ms");
            }
        }
    }

    /**
     * Update the replica's in-memory DTVLSN using the value in the
     * commit/abort entry.
     *
     * In the normal course of events, DTVLSNs should not decrease. However,
     * there is just one exception: if the rep stream transitions from a post
     * to a pre-dtvlsn stream, it will transition from a positive to the
     * UNINITIALIZED_VLSN_SEQUENCE.
     *
     * A transition from a pre to a post-dtvlsn transition (from zero to some
     * positive value), observes the "DTVLSNs should not decrease" rule
     * automatically.
     *
     * @return the DTVLSN entry in the log txnEnd record so it can be used in
     * the commit/abort operation for replay
     */
    private long updateDTVLSN(final TxnEnd txnEnd) {
        final long txnDTVLSN = txnEnd.getDTVLSN();

        if (txnDTVLSN == VLSN.UNINITIALIZED_VLSN_SEQUENCE) {
            /*
             * A pre DTVLSN format entry, simply set it as the in-memory DTVLSN
             */
            final long prevDTVLSN = repImpl.getRepNode().setDTVLSN(txnDTVLSN);
            if (prevDTVLSN != VLSN.UNINITIALIZED_VLSN_SEQUENCE) {
                LoggerUtils.info(logger, repImpl,
                                 "Transitioned to pre DTVLSN stream." +
                                 " DTVLSN:" + prevDTVLSN +
                                 " at VLSN:" + lastReplayedVLSN);

            }
            lastReplayedDTVLSN = txnDTVLSN;
            return txnDTVLSN;
        }

        /* Sanity check. */
        if (txnDTVLSN < lastReplayedDTVLSN) {
            String msg = "DTVLSNs must be in ascending order in the stream. " +
                " prev DTVLSN:" + lastReplayedDTVLSN +
                " next DTVLSN:" + txnDTVLSN + " at VLSN: " +
                lastReplayedVLSN.getSequence();
          throw EnvironmentFailureException.unexpectedState(repImpl, msg);
        }

        if ((lastReplayedDTVLSN == VLSN.UNINITIALIZED_VLSN_SEQUENCE) &&
            (txnDTVLSN > 0)) {
            LoggerUtils.info(logger, repImpl,
                             "Transitioned to post DTVLSN stream." +
                             " DTVLSN:" + txnDTVLSN +
                             " at VLSN:" + lastReplayedVLSN);
        }

        lastReplayedDTVLSN = txnDTVLSN;
        repImpl.getRepNode().setDTVLSN(txnDTVLSN);

        return txnDTVLSN;
    }

    /**
     * Queue the request ack for an async ack write to the network.
     */
    void queueAck(final long txnId) throws IOException {
        try {
            outputQueue.put(txnId);
        } catch (InterruptedException ie) {
            /*
             * Have the higher levels treat it like an IOE and
             * exit the thread.
             */
            throw new IOException("Ack I/O interrupted", ie);
        }
    }

    /**
     * Logs information associated with the replay of the txn commit
     */
    private void logReplay(ReplayTxn repTxn,
                           boolean needsAck,
                           SyncPolicy syncPolicy) {

        if (!logger.isLoggable(Level.FINE)) {
            return;
        }

        if (needsAck) {
            LoggerUtils.fine(logger, repImpl,
                             "Replay: got commit for txn=" + repTxn.getId() +
                             ", ack needed, replica sync policy=" +
                             syncPolicy +
                             " vlsn=" + lastReplayedVLSN);
        } else {
            LoggerUtils.fine(logger, repImpl,
                             "Replay: got commit for txn=" + repTxn.getId() +
                             " ack not needed" +
                             " vlsn=" + lastReplayedVLSN);
        }
    }

    /**
     * Returns true if there are no other activeTxns that have also modified
     * the membership database and are still open, since they could potentially
     * hold write locks that would block the read locks acquired during the
     * refresh operation.
     *
     * @param txn the current txn being committed or aborted
     *
     * @return true if there are no open transactions that hold locks on the
     * membership database.
     */
    private boolean canRefreshGroup(ReplayTxn txn) {

        /*
         * Use synchronized rather than copyActiveTxns, since this is called
         * during replay and there is no nested locking to worry about.
         */
        synchronized (activeTxns) {
            // TODO: very inefficient
            for (ReplayTxn atxn : activeTxns.getMap().values()) {
                if (atxn == txn) {
                    continue;
                }
                if (atxn.getRepGroupDbChange()) {
                    return false;
                }
            }
        }
        return true;
    }

    /**
     * Update this replica's node, txn and database sequences with any ids in
     * this log entry. We can call update, even if the replay id doesn't
     * represent a new lowest-id point, or if the apply is not successful,
     * because the apply checks that the replay id is < the sequence on the
     * replica. We just want to ensure that if this node becomes the master,
     * its sequences are in sync with what came before in the replication
     * stream, and ids are not incorrectly reused.
     */
    private void updateReplicaSequences(LogEntry logEntry) {

        /* For now, we assume all replay entries have a txn id. */
        repImpl.getTxnManager().updateFromReplay(logEntry.getTransactionId());

        /* If it's a database operation, update the database id. */
        if (logEntry instanceof NameLNLogEntry) {
            NameLNLogEntry nameLogEntry = (NameLNLogEntry) logEntry;
            nameLogEntry.postFetchInit(false /*isDupDb*/);
            NameLN nameLN = (NameLN) nameLogEntry.getLN();
            repImpl.getDbTree().updateFromReplay(nameLN.getId());
        }
    }

    /**
     * Obtain a ReplayTxn to represent the incoming operation.
     */
    public ReplayTxn getReplayTxn(long txnId, boolean registerTxnImmediately)
        throws DatabaseException {

        ReplayTxn useTxn = null;
        synchronized (activeTxns) {
            useTxn = activeTxns.get(txnId);
            if (useTxn == null) {

                /*
                 * Durability will be explicitly specified when
                 * ReplayTxn.commit is called, so TransactionConfig.DEFAULT is
                 * fine.
                 */
                if (registerTxnImmediately) {
                    useTxn = new ReplayTxn(repImpl, TransactionConfig.DEFAULT,
                                           txnId, activeTxns, logger);
                } else {
                    useTxn = new ReplayTxn(repImpl, TransactionConfig.DEFAULT,
                                           txnId, activeTxns, logger) {
                            @Override
                            protected
                            boolean registerImmediately() {
                                return false;
                            }
                    };
                }
            }
        }
        return useTxn;
    }

    /**
     * Replays the NameLN.
     *
     * Note that the operations: remove, rename and truncate need to establish
     * write locks on the database. Any open handles are closed by this
     * operation by virtue of the ReplayTxn's importunate property.  The
     * application will receive a LockPreemptedException if it subsequently
     * accesses the database handle.
     */
    private void applyNameLN(ReplayTxn repTxn,
                             InputWireRecord wireRecord)
        throws DatabaseException {

        NameLNLogEntry nameLNEntry = (NameLNLogEntry) wireRecord.getLogEntry();
        final NameLN nameLN = (NameLN) nameLNEntry.getLN();

        String databaseName = StringUtils.fromUTF8(nameLNEntry.getKey());

        final DbOpReplicationContext repContext =
            new DbOpReplicationContext(wireRecord.getVLSN(), nameLNEntry);

        DbOperationType opType = repContext.getDbOperationType();
        DatabaseImpl dbImpl = null;
        try {
            switch (opType) {
                case CREATE:
                {
                    DatabaseConfig dbConfig =
                        repContext.getCreateConfig().getReplicaConfig(repImpl);

                    dbImpl = repImpl.getDbTree().createReplicaDb
                      (repTxn, databaseName, dbConfig, nameLN, repContext);

                    /*
                     * We rely on the RepGroupDB.DB_ID value, so make sure
                     * it's what we expect for this internal replicated
                     * database.
                     */
                    if ((dbImpl.getId().getId() == RepGroupDB.DB_ID) &&
                        !DbType.REP_GROUP.getInternalName().equals
                        (databaseName)) {
                        throw EnvironmentFailureException.unexpectedState
                            ("Database: " +
                             DbType.REP_GROUP.getInternalName() +
                             " is associated with id: " +
                             dbImpl.getId().getId() +
                             " and not the reserved database id: " +
                             RepGroupDB.DB_ID);
                    }

                    TriggerManager.runOpenTriggers(repTxn, dbImpl, true);
                    break;
                }

                case REMOVE: {
                    dbImpl = repImpl.getDbTree().getDb(nameLN.getId());
                    try {
                        repImpl.getDbTree().removeReplicaDb
                            (repTxn, databaseName, nameLN.getId(), repContext);
                        TriggerManager.runRemoveTriggers(repTxn, dbImpl);
                    } catch (DatabaseNotFoundException e) {
                        throw EnvironmentFailureException.unexpectedState
                            ("Database: " + dbImpl.getName() +
                             " Id: " + nameLN.getId() +
                             " not found on the Replica.");
                    }
                    break;
                }

                case TRUNCATE: {
                    dbImpl = repImpl.getDbTree().getDb
                        (repContext.getTruncateOldDbId());
                    try {
                        TruncateDbResult result =
                        repImpl.getDbTree().truncateReplicaDb
                            (repTxn, databaseName, false, nameLN, repContext);
                        TriggerManager.runTruncateTriggers(repTxn, result.newDb);
                    } catch (DatabaseNotFoundException e) {
                        throw EnvironmentFailureException.unexpectedState
                            ("Database: " + dbImpl.getName() +
                             " Id: " + nameLN.getId() +
                             " not found on the Replica.");
                    }

                    break;
                }

                case RENAME: {
                    dbImpl = repImpl.getDbTree().getDb(nameLN.getId());
                    try {
                        dbImpl =
                        repImpl.getDbTree().renameReplicaDb
                            (repTxn, dbImpl.getName(), databaseName, nameLN,
                             repContext);
                        TriggerManager.runRenameTriggers(repTxn, dbImpl,
                                                         databaseName);
                    } catch (DatabaseNotFoundException e) {
                        throw EnvironmentFailureException.unexpectedState
                            ("Database rename from: " + dbImpl.getName() +
                             " to " + databaseName +
                             " failed, name not found on the Replica.");
                    }
                    break;
                }

                case UPDATE_CONFIG: {
                    /* Get the replicated database configurations. */
                    DatabaseConfig dbConfig =
                        repContext.getCreateConfig().getReplicaConfig(repImpl);

                    /* Update the NameLN and write it to the log. */
                    dbImpl = repImpl.getDbTree().getDb(nameLN.getId());
                    final String dbName = dbImpl.getName();
                    repImpl.getDbTree().updateNameLN
                        (repTxn, dbName, repContext);

                    /* Set the new configurations to DatabaseImpl. */
                    dbImpl.setConfigProperties
                        (repTxn, dbName, dbConfig, repImpl);

                    repImpl.getDbTree().modifyDbRoot(dbImpl);

                    break;
                }

                default:
                    throw EnvironmentFailureException.unexpectedState
                        ("Illegal database op type of " + opType.toString() +
                         " from " + wireRecord + " database=" + databaseName);
            }
        } finally {
            if (dbImpl != null) {
                repImpl.getDbTree().releaseDb(dbImpl);
            }
        }
    }

    private void applyLN(
        final ReplayTxn repTxn,
        final InputWireRecord wireRecord)
        throws DatabaseException {

        final LNLogEntry lnEntry = (LNLogEntry) wireRecord.getLogEntry();
        final DatabaseId dbId = lnEntry.getDbId();

        /*
         * If this is a change to the rep group db, remember at commit time,
         * and refresh this node's group metadata.
         */
        if (dbId.getId() == RepGroupDB.DB_ID) {
            repTxn.noteRepGroupDbChange();
        }

        /*
         * Note that we don't have to worry about serializable isolation when
         * applying a replicated txn; serializable isolation in only an issue
         * for txns that take read locks, and a replicated txn consists only of
         * write operations.
         */
        final DatabaseImpl dbImpl =
            repImpl.getRepNode().getReplica().getDbCache().get(dbId, repTxn);

        lnEntry.postFetchInit(dbImpl);

        final ReplicationContext repContext =
            new ReplicationContext(wireRecord.getVLSN());

        try (final Cursor cursor = DbInternal.makeCursor(
                dbImpl, repTxn, null /*cursorConfig*/)) {

            OperationResult result;
            final LN ln = lnEntry.getLN();

            /* In a dup DB, do not expect embedded LNs or non-empty data. */
            if (dbImpl.getSortedDuplicates() &&
                (lnEntry.isEmbeddedLN() ||
                 (ln.getData() != null && ln.getData().length > 0))) {

                throw EnvironmentFailureException.unexpectedState(
                    dbImpl.getEnv(),
                    "[#25288] emb=" + lnEntry.isEmbeddedLN() +
                    " key=" + Key.getNoFormatString(lnEntry.getKey()) +
                    " data=" + Key.getNoFormatString(ln.getData()) +
                    " vlsn=" + ln.getVLSNSequence());
            }

            if (ln.isDeleted()) {

                /*
                 * Perform an exact search by key. Use a partial data entry
                 * (delDataEntry) to avoid reading old data.
                 */
                replayKeyEntry.setData(lnEntry.getKey());

                result = DbInternal.searchForReplay(
                    cursor, replayKeyEntry, delDataEntry,
                    LockMode.RMW, SearchMode.SET);

                if (result != null) {
                    result =
                        DbInternal.deleteWithRepContext(cursor, repContext);
                }
            } else {
                replayKeyEntry.setData(lnEntry.getKey());
                replayDataEntry.setData(ln.getData());

                result = DbInternal.putForReplay(
                    cursor, replayKeyEntry, replayDataEntry, ln,
                    lnEntry.getExpiration(), lnEntry.isExpirationInHours(),
                    PutMode.OVERWRITE, repContext);
            }

            /*
             * For a deletion we expect the record to previously exist. Note
             * that the put call never fails since PutMode.OVERWRITE is used.
             *
             * Ignore deletion errors for records that are expired (within TTL
             * clock tolerance), since these can be deleted from the Btree by
             * background IN compression. A record may expire after the master
             * deletion op and before the replica replay, e.g., due to a
             * lagging replica or a replica that was down during the deletion.
             *
             * Also ignore deletion errors for EXTINCT and MAYBE_EXTINCT
             * records to account for apps that try to delete extinct records.
             * This is illegal but should not bring down the RN.
             */
            if (result == null) {

                final ExtinctionStatus extinctionState =
                    repImpl.getExtinctionState(dbImpl, lnEntry.getKey());

                final boolean isExpired = repImpl.expiresWithin(
                    lnEntry.getExpiration(), lnEntry.isExpirationInHours(),
                    repImpl.getTtlClockTolerance());

                final String expTime = (lnEntry.getExpiration() == 0) ?
                    "none" :
                    TTL.formatExpiration(
                        lnEntry.getExpiration(),
                        lnEntry.isExpirationInHours());

                if (!isExpired && extinctionState == NOT_EXTINCT) {

                    throw new EnvironmentFailureException(
                        repImpl,
                        EnvironmentFailureReason.LOG_INCOMPLETE,
                        "Replicated operation could  not be applied. " +
                            " vlsn=" + wireRecord.getVLSN() +
                            " expirationTime=" + expTime +
                            // TODO remove key after debugging
                            " key=" + Key.dumpString(lnEntry.getKey(), 0) +
                            wireRecord);
                }
            }
        }

        /*
         * Queue new extinct scans for later processing. Must be done after
         * applying the LN so the record is present in the Btree when the
         * extinct scanner thread tries to read it.
         */
        if (lnEntry.getLogType().equals(
            LogEntryType.LOG_EXTINCT_SCAN_LN_TRANSACTIONAL)) {

            repImpl.getExtinctionScanner().replay(
                lnEntry.getKey(), lnEntry.getData());
        }
    }

    /**
     * Go through all active txns and rollback up to but not including the log
     * entry represented by the matchpoint VLSN.
     *
     * Effectively truncate these rolled back log entries by making them
     * invisible. Flush the log first, to make sure these log entries are out
     * of the log buffers and are on disk, so we can reliably find them through
     * the FileManager.
     *
     * Rollback steps are described in
     * https://sleepycat.oracle.com/trac/wiki/Logging#Recoverysteps. In
     * summary,
     *
     * 1. Log and fsync a new RollbackStart record
     * 2. Do the rollback in memory. There is no need to explicitly
     *    log INs made dirty by the rollback operation.
     * 3. Do invisibility masking by overwriting LNs.
     * 4. Fsync all overwritten log files at this point.
     * 5. Write a RollbackEnd record, for ease of debugging
     *
     * Note that application read txns  can continue to run during syncup.
     * Reader txns cannot access records that are being rolled back, because
     * they are in txns that are not committed, i.e, they are write locked.
     * The rollback interval never includes committed txns, and we do a hard
     * recovery if it would include them.
     */
    public void rollback(VLSN matchpointVLSN, long matchpointLsn) {

        String rollbackStatus = RBSTATUS_START;

        final Map localActiveTxns = copyActiveTxns();
        try {
            if (localActiveTxns.size() == 0) {
                /* no live read/write txns, nothing to do. */
                rollbackStatus = RBSTATUS_NO_ACTIVE;
                return;
            }

            VLSNRange range = repImpl.getVLSNIndex().getRange();
            if (range.getLast().equals(matchpointVLSN)) {
                /* nothing to roll back. */
                rollbackStatus = RBSTATUS_RANGE_EQUALS;
                return;
            }

            repImpl.setSyncupProgress(SyncupProgress.DO_ROLLBACK);

            /*
             * Stop the log file backup service, since the files will be in an
             * inconsistent state while the rollback is in progress.
             */
            repImpl.getRepNode().shutdownNetworkBackup();

            /*
             * Set repImpl's isRollingBack to true, and invalidate all the in
             * progress DbBackup.
             */
            repImpl.setBackupProhibited(true);
            repImpl.invalidateBackups(DbLsn.getFileNumber(matchpointLsn));

            /*
             * 1. Log RollbackStart. The fsync guarantees that this marker will
             * be present in the log for recovery. It also ensures that all log
             * entries will be flushed to disk and the TxnChain will not have
             * to worry about entries that are in log buffers when constructing
             * the rollback information.
             */
            LogManager logManager = repImpl.getLogManager();
            LogEntry rollbackStart = SingleItemEntry.create(
                LogEntryType.LOG_ROLLBACK_START,
                new RollbackStart(
                    matchpointVLSN, matchpointLsn, localActiveTxns.keySet()));
            long rollbackStartLsn =
                logManager.logForceFlush(rollbackStart,
                                         true, // fsyncRequired,
                                         ReplicationContext.NO_REPLICATE);
            rollbackStatus = RBSTATUS_LOG_RBSTART;

            /*
             * 2. Do rollback in memory. Undo any operations that were logged
             * after the matchpointLsn, and save the LSNs for those log
             * entries.. There should be something to undo, because we checked
             * earlier that there were log entries after the matchpoint.
             */
            List rollbackLsns = new ArrayList();
            for (ReplayTxn replayTxn : localActiveTxns.values()) {
                Collection txnRollbackLsns =
                    replayTxn.rollback(matchpointLsn);

                /*
                 * Txns that were entirely rolled back should have been removed
                 * from the activeTxns map.
                 */
                assert checkRemoved(replayTxn) :
                    "Should have removed " + replayTxn;

                rollbackLsns.addAll(txnRollbackLsns);
            }
            rollbackStatus = RBSTATUS_MEM_ROLLBACK;
            assert rollbackLsns.size() != 0 : dumpActiveTxns(matchpointLsn);

            /*
             * 3 & 4 - Mark the rolled back log entries as invisible.  But
             * before doing so, invoke any registered rewrite listeners, so the
             * application knows that existing log files will be modified.
             *
             * After all are done, fsync the set of files. By waiting, some may
             * have made it out on their own.
             */
            LogFileRewriteListener listener = repImpl.getLogRewriteListener();
            if (listener != null) {
                listener.rewriteLogFiles(getFileNames(rollbackLsns));
            }
            RollbackTracker.makeInvisible(repImpl, rollbackLsns);
            rollbackStatus = RBSTATUS_INVISIBLE;

            /*
             * 5. Log RollbackEnd. Flush it so that we can use it to optimize
             * recoveries later on. If the RollbackEnd exists, we can skip the
             * step of re-making LNs invisible.
             */
            logManager.logForceFlush(
                SingleItemEntry.create(LogEntryType.LOG_ROLLBACK_END,
                                       new RollbackEnd(matchpointLsn,
                                                       rollbackStartLsn)),
                 true, // fsyncRequired
                 ReplicationContext.NO_REPLICATE);

            /*
             * Restart the backup service only if all the steps of the
             * rollback were successful.
             */
            repImpl.getRepNode().restartNetworkBackup();
            repImpl.setBackupProhibited(false);
            rollbackStatus = RBSTATUS_FINISH;
        } finally {

            /* Reset the lastReplayedVLSN so it's correct when we resume. */
            lastReplayedVLSN = matchpointVLSN;
            LoggerUtils.info(logger, repImpl,
                             "Rollback to matchpoint " + matchpointVLSN +
                             " at " + DbLsn.getNoFormatString(matchpointLsn) +
                             " status=" + rollbackStatus);
        }
    }

    /* For debugging support */
    private String dumpActiveTxns(long matchpointLsn) {
        StringBuilder sb = new StringBuilder();
        sb.append("matchpointLsn=");
        sb.append(DbLsn.getNoFormatString(matchpointLsn));
        for (ReplayTxn replayTxn : copyActiveTxns().values()) {
            sb.append("txn id=").append(replayTxn.getId());
            sb.append(" locks=").append(replayTxn.getWriteLockIds());
            sb.append("lastLogged=");
            sb.append(DbLsn.getNoFormatString(replayTxn.getLastLsn()));
            sb.append("\n");
        }

        return sb.toString();
    }

    private Set getFileNames(List lsns) {
        Set fileNums = new HashSet();
        Set files = new HashSet();

        for (long lsn : lsns) {
            fileNums.add(DbLsn.getFileNumber(lsn));
        }
        for (long fileNum : fileNums) {
            files.add(new File(FileManager.getFileName(fileNum)));
        }
        return files;
    }

    private boolean checkRemoved(ReplayTxn txn) {
        if (txn.isClosed()) {
            if (activeTxns.get(txn.getId()) != null) {
                return false;
            }
        }

        return true;
    }

    /**
     * Make a copy of activeTxns to avoid holding its mutex while iterating.
     * Can be used whenever the cost of the HashMap copy is not significant.
     */
    private Map copyActiveTxns() {
        return activeTxns.getMap();
    }

    /**
     * Release all transactions, database handles, etc held by the replay
     * unit. The Replicator is closing down and Replay will not be invoked
     * again.
     */
    public void close() {

        for (ReplayTxn replayTxn : copyActiveTxns().values()) {
            try {
                if (logger.isLoggable(Level.FINE)) {
                    LoggerUtils.fine(logger, repImpl,
                                     "Unregistering open replay txn: " +
                                     replayTxn.getId());
                }
                replayTxn.cleanup();
            } catch (DatabaseException e) {
                LoggerUtils.fine(logger, repImpl,
                                 "Replay txn: " + replayTxn.getId() +
                                 " unregistration failed: " + e.getMessage());
            }
        }
        assert activeTxns.isEmpty();
    }

    /**
     * Returns a copy of the statistics associated with Replay
     */
    public StatGroup getStats(StatsConfig config) {
        StatGroup ret = statistics.cloneGroup(config.getClear());

        return ret;
    }

    public void resetStats() {
        statistics.clear();
    }

    /* For unit tests */
    public SimpleTxnMap getActiveTxns() {
        return activeTxns;
    }

    public String dumpState() {
        StringBuilder sb = new StringBuilder();
        sb.append("lastReplayedTxn=").append(lastReplayedTxn);
        sb.append(" lastReplayedVLSN=").append(lastReplayedVLSN);
        sb.append(" numActiveReplayTxns=").append(activeTxns.size());
        sb.append("\n");
        return sb.toString();
    }

    /**
     * Write out any pending acknowledgments. See GroupCommit.flushPendingAcks
     * for details. This method is invoked after each log entry is read from
     * the replication stream.
     *
     * @param nowNs the time at the reading of the log entry
     */
    void flushPendingAcks(long nowNs)
        throws IOException {

        groupCommit.flushPendingAcks(nowNs);
    }

    /**
     * See GroupCommit.getPollIntervalNs(long)
     */
    long getPollIntervalNs(long defaultNs) {
        return groupCommit.getPollIntervalNs(defaultNs);
    }

    /**
     * Implements group commit. It's really a substructure of Replay and exists
     * mainly for modularity reasons.
     * 
     * Since replay is single threaded, the group commit mechanism works
     * differently in the replica than in the master. In the replica, SYNC
     * transactions are converted into NO_SYNC transactions and executed
     * immediately, but their acknowledgments are delayed until after either
     * the REPLICA_GROUP_COMMIT_INTERVAL (the max amount the first transaction
     * in the group is delayed) has expired, or the size of the group (as
     * specified by REPLICA_MAX_GROUP_COMMIT) has been exceeded.
     */
    private class GroupCommit {

        /* Size determines max fsync commits that can be grouped. */
        private final long pendingCommitAcks[];

        /* Number of entries currently in pendingCommitAcks */
        private int nPendingAcks;

        /*
         * If this time limit is reached, the group will be forced to commit.
         * Invariant: nPendingAcks > 0 ==> limitGroupCommitNs > 0
         */
        private long limitGroupCommitNs = 0;

        /* The time interval that an open group is held back. */
        private final long groupCommitIntervalNs;

        private final LongStat nGroupCommitTimeouts;
        private final LongStat nGroupCommitMaxExceeded;
        private final LongStat nGroupCommits;
        private final LongStat nGroupCommitTxns;

        private GroupCommit(DbConfigManager configManager) {
            pendingCommitAcks = new long[configManager.
                getInt(RepParams.REPLICA_MAX_GROUP_COMMIT)];

            nPendingAcks = 0;

            final long groupCommitIntervalMs = configManager.
                getDuration(RepParams.REPLICA_GROUP_COMMIT_INTERVAL);

            groupCommitIntervalNs =
                NANOSECONDS.convert(groupCommitIntervalMs, MILLISECONDS);
            nGroupCommitTimeouts =
                new LongStat(statistics, N_GROUP_COMMIT_TIMEOUTS);

            nGroupCommitMaxExceeded =
                new LongStat(statistics, N_GROUP_COMMIT_MAX_EXCEEDED);

            nGroupCommitTxns =
                new LongStat(statistics, N_GROUP_COMMIT_TXNS);

            nGroupCommits =
                new LongStat(statistics, N_GROUP_COMMITS);
        }

        /**
         * Returns true if group commits are enabled at the replica.
         */
        private boolean isEnabled() {
            return pendingCommitAcks.length > 0;
        }

        /**
         * The interval used to poll for incoming log entries. The time is
         * lowered from the defaultNs time, if there are pending
         * acknowledgments.
         *
         * @param defaultNs the default poll interval
         *
         * @return the actual poll interval
         */
        private long getPollIntervalNs(long defaultNs) {
            if (nPendingAcks == 0) {
                return defaultNs;
            }
            final long now = System.nanoTime();

            final long interval = limitGroupCommitNs - now;
            return Math.min(interval, defaultNs);
        }

        /**
         * Returns the sync policy to be implemented at the replica. If
         * group commit is active, and SYNC is requested it will return
         * NO_SYNC instead to delay the fsync.
         *
         * @param txnSyncPolicy the sync policy as stated in the txn
         *
         * @return the sync policy to be implemented by the replica
         */
        private SyncPolicy getImplSyncPolicy(SyncPolicy txnSyncPolicy) {
            return ((txnSyncPolicy ==  SyncPolicy.SYNC) && isEnabled()) ?
                   SyncPolicy.NO_SYNC : txnSyncPolicy;
        }

        /**
         * Buffers the acknowledgment if the commit calls for a sync, or if
         * there are pending acknowledgments to ensure that acks are sent
         * in order.
         *
         * @param nowNs the current time
         * @param ackTxn the txn associated with the ack
         * @param txnSyncPolicy the sync policy as request by the committing
         * txn
         *
         * @return true if the ack has been buffered
         */
        private final boolean bufferAck(long nowNs,
                                        ReplayTxn ackTxn,
                                        SyncPolicy txnSyncPolicy)
            throws IOException {

            if (!isEnabled() ||
                !((txnSyncPolicy == SyncPolicy.SYNC) || (nPendingAcks > 0))) {
                return false;
            }

            pendingCommitAcks[nPendingAcks++] = ackTxn.getId();

            if (nPendingAcks == 1) {
                /* First txn in group, start the clock. */
                limitGroupCommitNs = nowNs + groupCommitIntervalNs;
            } else {
                flushPendingAcks(nowNs);
            }
            return true;
        }

        /**
         * Flush if there are pending acks and either the buffer limit or the
         * group interval has been reached.
         *
         * @param nowNs the current time (passed in to minimize system calls)
         */
        private final void flushPendingAcks(long nowNs)
            throws IOException {

            if ((nPendingAcks == 0) ||
                ((nPendingAcks != pendingCommitAcks.length) &&
                 (NanoTimeUtil.compare(nowNs, limitGroupCommitNs) < 0))) {

                return;
            }

            /* Update statistics. */
            nGroupCommits.increment();
            nGroupCommitTxns.add(nPendingAcks);
            if (NanoTimeUtil.compare(nowNs, limitGroupCommitNs) >= 0) {
                nGroupCommitTimeouts.increment();
            } else if (nPendingAcks >= pendingCommitAcks.length) {
                nGroupCommitMaxExceeded.increment();
            }

            /* flush log buffer and fsync to disk */
            repImpl.getLogManager().flushSync();

            /* commits are on disk, send out acknowledgments on the network. */
            for (int i=0; i < nPendingAcks; i++) {
                queueAck(pendingCommitAcks[i]);
                pendingCommitAcks[i] = 0;
            }

            nPendingAcks = 0;
            limitGroupCommitNs = 0;
        }
    }

    /**
     * Simple helper class to package a Txn vlsn and its associated commit
     * time.
     */
    public static class TxnInfo {
        final VLSN txnVLSN;
        final long masterTxnEndTime;

        private TxnInfo(VLSN txnVLSN, long masterTxnEndTime) {
            this.txnVLSN = txnVLSN;
            this.masterTxnEndTime = masterTxnEndTime;
        }

        public VLSN getTxnVLSN() {
            return txnVLSN;
        }

        public long getMasterTxnEndTime() {
            return masterTxnEndTime;
        }

        @Override
        public String toString() {
            return " VLSN: " + txnVLSN +
                " masterTxnEndTime=" + new Date(masterTxnEndTime);
        }
    }
}