com.sleepycat.je.rep.impl.node.Replay Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of starrocks-bdb-je Show documentation
starrocks managed bdb je
The newest version!
/*-
 * Copyright (C) 2002, 2018, Oracle and/or its affiliates. All rights reserved.
 *
 * This file was distributed by Oracle as part of a version of Oracle Berkeley
 * DB Java Edition made available at:
 *
 * http://www.oracle.com/technetwork/database/database-technologies/berkeleydb/downloads/index.html
 *
 * Please see the LICENSE file included in the top-level directory of the
 * appropriate version of Oracle Berkeley DB Java Edition for a copy of the
 * license and additional information.
 */

package com.sleepycat.je.rep.impl.node;

import static com.sleepycat.je.ExtinctionFilter.ExtinctionStatus.NOT_EXTINCT;
import static com.sleepycat.je.log.LogEntryType.LOG_NAMELN_TRANSACTIONAL;
import static com.sleepycat.je.log.LogEntryType.LOG_TXN_ABORT;
import static com.sleepycat.je.log.LogEntryType.LOG_TXN_COMMIT;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.ELAPSED_TXN_95_MS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.ELAPSED_TXN_99_MS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.ELAPSED_TXN_AVG_MS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.ELAPSED_TXN_MAX_MS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.LATEST_COMMIT_LAG_MS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.MAX_COMMIT_PROCESSING_NANOS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.MIN_COMMIT_PROCESSING_NANOS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.N_ABORTS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.N_COMMITS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.N_COMMIT_ACKS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.N_COMMIT_NO_SYNCS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.N_COMMIT_SYNCS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.N_COMMIT_WRITE_NO_SYNCS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.N_GROUP_COMMITS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.N_GROUP_COMMIT_MAX_EXCEEDED;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.N_GROUP_COMMIT_TIMEOUTS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.N_GROUP_COMMIT_TXNS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.N_LNS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.N_MESSAGE_QUEUE_OVERFLOWS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.N_NAME_LNS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.OUTPUT_QUEUE_95_DELAY_MS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.OUTPUT_QUEUE_99_DELAY_MS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.OUTPUT_QUEUE_AVG_DELAY_MS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.OUTPUT_QUEUE_MAX_DELAY_MS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.REPLAY_QUEUE_95_DELAY_MS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.REPLAY_QUEUE_99_DELAY_MS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.REPLAY_QUEUE_AVG_DELAY_MS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.REPLAY_QUEUE_MAX_DELAY_MS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.TOTAL_COMMIT_PROCESSING_NANOS;
import static java.util.concurrent.TimeUnit.MILLISECONDS;
import static java.util.concurrent.TimeUnit.NANOSECONDS;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.logging.Level;
import java.util.logging.Logger;

import com.sleepycat.je.Cursor;
import com.sleepycat.je.DatabaseConfig;
import com.sleepycat.je.DatabaseEntry;
import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.DatabaseNotFoundException;
import com.sleepycat.je.DbInternal;
import com.sleepycat.je.Durability.SyncPolicy;
import com.sleepycat.je.EnvironmentFailureException;
import com.sleepycat.je.ExtinctionFilter.ExtinctionStatus;
import com.sleepycat.je.LockMode;
import com.sleepycat.je.OperationResult;
import com.sleepycat.je.StatsConfig;
import com.sleepycat.je.TransactionConfig;
import com.sleepycat.je.dbi.DatabaseId;
import com.sleepycat.je.dbi.DatabaseImpl;
import com.sleepycat.je.dbi.DbConfigManager;
import com.sleepycat.je.dbi.DbTree.TruncateDbResult;
import com.sleepycat.je.dbi.DbType;
import com.sleepycat.je.dbi.EnvironmentFailureReason;
import com.sleepycat.je.dbi.PutMode;
import com.sleepycat.je.dbi.SearchMode;
import com.sleepycat.je.dbi.TTL;
import com.sleepycat.je.dbi.TriggerManager;
import com.sleepycat.je.log.DbOpReplicationContext;
import com.sleepycat.je.log.FileManager;
import com.sleepycat.je.log.LogEntryType;
import com.sleepycat.je.log.LogManager;
import com.sleepycat.je.log.ReplicationContext;
import com.sleepycat.je.log.entry.DbOperationType;
import com.sleepycat.je.log.entry.LNLogEntry;
import com.sleepycat.je.log.entry.LogEntry;
import com.sleepycat.je.log.entry.NameLNLogEntry;
import com.sleepycat.je.log.entry.SingleItemEntry;
import com.sleepycat.je.recovery.RecoveryInfo;
import com.sleepycat.je.recovery.RollbackTracker;
import com.sleepycat.je.rep.LogFileRewriteListener;
import com.sleepycat.je.rep.SyncupProgress;
import com.sleepycat.je.rep.impl.RepGroupDB;
import com.sleepycat.je.rep.impl.RepImpl;
import com.sleepycat.je.rep.impl.RepParams;
import com.sleepycat.je.rep.stream.InputWireRecord;
import com.sleepycat.je.rep.stream.MasterStatus.MasterSyncException;
import com.sleepycat.je.rep.stream.Protocol;
import com.sleepycat.je.rep.txn.ReplayTxn;
import com.sleepycat.je.rep.utilint.LongMinZeroStat;
import com.sleepycat.je.rep.utilint.SimpleTxnMap;
import com.sleepycat.je.rep.vlsn.VLSNRange;
import com.sleepycat.je.tree.Key;
import com.sleepycat.je.tree.LN;
import com.sleepycat.je.tree.NameLN;
import com.sleepycat.je.txn.RollbackEnd;
import com.sleepycat.je.txn.RollbackStart;
import com.sleepycat.je.txn.Txn;
import com.sleepycat.je.txn.TxnAbort;
import com.sleepycat.je.txn.TxnCommit;
import com.sleepycat.je.txn.TxnEnd;
import com.sleepycat.je.utilint.DbLsn;
import com.sleepycat.je.utilint.LatencyPercentileStat;
import com.sleepycat.je.utilint.LoggerUtils;
import com.sleepycat.je.utilint.LongAvgStat;
import com.sleepycat.je.utilint.LongMaxStat;
import com.sleepycat.je.utilint.LongMaxZeroStat;
import com.sleepycat.je.utilint.LongMinStat;
import com.sleepycat.je.utilint.LongStat;
import com.sleepycat.je.utilint.NanoTimeUtil;
import com.sleepycat.je.utilint.StatGroup;
import com.sleepycat.je.utilint.VLSN;
import com.sleepycat.utilint.StringUtils;

/**
 * Replays log records from the replication stream, and manages the
 * transactions for those records.
 *
 * The Replay module has a lifetime equivalent to the environment owned by
 * this replicator. Its lifetime is longer than the feeder/replica stream.
 * For example, suppose this is nodeX:
 *
 * t1 - Node X is a replica, node A is master. Replay X is alive
 * t2 - Node X is a replica, node B takes over as master. X's Replay module
 *      is still alive and has the same set of active txns. It doesn't matter
 *      to X that the master has changed.
 * t3 - Node X becomes the master. Now its Replay unit is cleared, because
 *      anything managed by the Replay is defunct.
 */
public class Replay {

    /* These are strings for the rollback logging. */
    private static final String RBSTATUS_START =
        "Started Rollback";
    private static final String RBSTATUS_NO_ACTIVE =
        "No active txns, nothing to rollback";
    private static final String RBSTATUS_RANGE_EQUALS =
        "End of range equals matchpoint, nothing to rollback";
    private static final String RBSTATUS_LOG_RBSTART =
        "Logged RollbackStart entry";
    private static final String RBSTATUS_MEM_ROLLBACK =
        "Finished in-memory rollback";
    private static final String RBSTATUS_INVISIBLE =
        "Finished invisible setting";
    private static final String RBSTATUS_FINISH =
        "Finished rollback";

    /*
     * DatabaseEntry objects reused during replay, to minimize allocation in
     * high performance replay path.
     */
    final DatabaseEntry replayKeyEntry = new DatabaseEntry();
    final DatabaseEntry replayDataEntry = new DatabaseEntry();
    final DatabaseEntry delDataEntry = new DatabaseEntry();

    private final RepImpl repImpl;

    /**
     *  If a commit replay operation takes more than this threshold, it's
     *  logged. This information helps determine whether ack timeouts on the
     *  master are due to a slow replica, or the network.
     */
    private final long ackTimeoutLogThresholdNs;

    /**
     * ActiveTxns is a collection of txn objects used for applying replicated
     * transactions. This collection should be empty if the node is a master.
     *
     * Note that there is an interesting relationship between ActiveTxns and
     * the txn collection managed by the environment TxnManager. ActiveTxns is
     * effectively a subset of the set of txns held by the
     * TxnManager. ReplayTxns must be sure to register and unregister
     * themselves from ActiveTxns, just as all Txns must register and
     * unregister with the TxnManager's set. One implementation alternative to
     * having an ActiveTxns map here is to search the TxnManager set (which is
     * a set, not a map) for a given ReplayTxn. Another is to subclass
     * TxnManager so that replicated nodes have their own replayTxn map, just
     * as XATxns have a XID->Txn map.
     *
     * Both alternatives seemed too costly in terms of performance or elaborate
     * in terms of code to do for the current function. It seems clearer to
     * make the ActiveTxns a map in the one place that it is currently
     * used. This choice may change over time, and should be reevaluated if the
     * implementation changes.
     *
     * The ActiveTxns key is the transaction id. These transactions are closed
     * when:
     * - the replay unit executes a commit received over the replication stream
     * - the replay unit executes an abort received over the replication stream
     * - the replication node realizes that it has just become the new master,
     *   and was not previously the master.
     *
     * Note that the Replay class has a lifetime that is longer than that of a
     * RepNode. This means in particular, that transactions may be left open,
     * and will be resumed when a replica switches from one master to another,
     * creating a new RepNode in the process. Because of that, part of the
     * transaction may be implemented by the rep stream from one master and
     * another part by another.
     *
     * The map is synchronized, so simple get/put operations do not require
     * additional synchronization.  However, iteration requires synchronization
     * and copyActiveTxns can be used in most cases.
     */
    private final SimpleTxnMap activeTxns;

    /*
     * The entry representing the last replayed txn commit. Supports the
     * replica's notion of consistency.
     */
    private volatile TxnInfo lastReplayedTxn = null;

    /*
     * The last replayed entry of any kind. Supports PointConsistencyPolicy.
     */
    private volatile VLSN lastReplayedVLSN = null;

    /*
     * The last replayed DTVLSN in the stream. It's used to ensure that the
     * DTVLSNs in the stream are correctly sequenced.
     */
    private long lastReplayedDTVLSN = VLSN.NULL_VLSN_SEQUENCE;

    /*
     * The sync policy to be used in the absence of an ACK request. The replica
     * in this case has some latitude about how it syncs the commit.
     */
    private final SyncPolicy noAckSyncPolicy = SyncPolicy.NO_SYNC;

    /**
     *  The RepParams.REPLAY_LOGGING_THRESHOLD configured logging threshold.
     */
    private final long replayLoggingThresholdNs;

    /**
     * State that is reinitialized by the reinit() method each time a replay
     * loop is started with a new feeder.
     */

    /**
     *  All writes (predominantly acks) are queued here, so they do not block
     *  the replay thread.
     */
    private final BlockingQueue outputQueue;

    /**
     * Holds the state associated with group commits.
     */
    private final GroupCommit groupCommit;

    /* Maintains the statistics associated with stream replay. */
    private final StatGroup statistics;
    private final LongStat nCommits;
    private final LongStat nCommitAcks;
    private final LongStat nCommitSyncs;
    private final LongStat nCommitNoSyncs;
    private final LongStat nCommitWriteNoSyncs;
    private final LongStat nAborts;
    private final LongStat nNameLNs;
    private final LongStat nLNs;
    private final LongStat nMessageQueueOverflows;
    private final LongMinStat minCommitProcessingNanos;
    private final LongMaxStat maxCommitProcessingNanos;
    private final LongStat totalCommitProcessingNanos;
    private final LongStat latestCommitLagMs;

    private volatile long heartbeatRequestEnqueueTime = 0;
    private volatile long heartbeatRequestMasterNow = 0;
    private final LongAvgStat replayQueueAvgDelayMs;
    private final LatencyPercentileStat replayQueue95DelayMs;
    private final LatencyPercentileStat replayQueue99DelayMs;
    private final LongMaxStat replayQueueMaxDelayMs;

    private volatile long heartbeatResponseEnqueueTime = 0;
    private final LongAvgStat outputQueueAvgDelayMs;
    private final LatencyPercentileStat outputQueue95DelayMs;
    private final LatencyPercentileStat outputQueue99DelayMs;
    private final LongMaxStat outputQueueMaxDelayMs;

    private final LongAvgStat elapsedTxnAvgMs;
    private final LatencyPercentileStat elapsedTxn95Ms;
    private final LatencyPercentileStat elapsedTxn99Ms;
    private final LongMaxStat elapsedTxnMaxMs;

    private final Logger logger;
    public Replay(RepImpl repImpl,
                  @SuppressWarnings("unused") NameIdPair nameIdPair) {

        /*
         * This should have already been caught in
         * ReplicatedEnvironment.setupRepConfig, but it is checked here anyway
         * as an added sanity check. [#17643]
         */
        if (repImpl.isReadOnly()) {
            throw EnvironmentFailureException.unexpectedState
                ("Replay created with readonly ReplicatedEnvironment");
        }

        this.repImpl = repImpl;
        final DbConfigManager configManager = repImpl.getConfigManager();

        ackTimeoutLogThresholdNs = MILLISECONDS.toNanos(configManager.
            getDuration(RepParams.REPLICA_ACK_TIMEOUT));

        /**
         * The factor of 2 below is somewhat arbitrary. It should be > 1 X so
         * that the ReplicaOutputThread can completely process the buffered
         * messages in the face of a network drop and 2X to allow for
         * additional headroom and minimize the chances that the replay might
         * be blocked due to the limited queue length.
         */
        final int outputQueueSize = 2 *
            configManager.getInt(RepParams.REPLICA_MESSAGE_QUEUE_SIZE);
        outputQueue = new ArrayBlockingQueue(outputQueueSize);

        /*
         * The Replay module manages all write transactions and mimics a
         * writing application thread. When the node comes up, it populates
         * the activeTxn collection with ReplayTxns that were resurrected
         * at recovery time.
         */
        activeTxns = new SimpleTxnMap<>(1024);

        /*
         * Configure the data entry used for deletion to avoid fetching the
         * old data during deletion replay.
         */
        delDataEntry.setPartial(0, 0, true);

        logger = LoggerUtils.getLogger(getClass());
        statistics = new StatGroup(ReplayStatDefinition.GROUP_NAME,
                                   ReplayStatDefinition.GROUP_DESC);

        groupCommit = new GroupCommit(configManager);

        nCommits = new LongStat(statistics, N_COMMITS);
        nCommitAcks = new LongStat(statistics, N_COMMIT_ACKS);
        nCommitSyncs = new LongStat(statistics, N_COMMIT_SYNCS);
        nCommitNoSyncs = new LongStat(statistics, N_COMMIT_NO_SYNCS);
        nCommitWriteNoSyncs =
            new LongStat(statistics, N_COMMIT_WRITE_NO_SYNCS);
        nAborts = new LongStat(statistics, N_ABORTS);
        nNameLNs = new LongStat(statistics, N_NAME_LNS);
        nLNs = new LongStat(statistics, N_LNS);
        nMessageQueueOverflows =
            new LongStat(statistics, N_MESSAGE_QUEUE_OVERFLOWS);
        minCommitProcessingNanos =
            new LongMinZeroStat(statistics, MIN_COMMIT_PROCESSING_NANOS);
        maxCommitProcessingNanos =
            new LongMaxZeroStat(statistics, MAX_COMMIT_PROCESSING_NANOS);
        totalCommitProcessingNanos =
            new LongStat(statistics, TOTAL_COMMIT_PROCESSING_NANOS);
        latestCommitLagMs = new LongStat(statistics, LATEST_COMMIT_LAG_MS);

        replayQueueAvgDelayMs =
            new LongAvgStat(statistics, REPLAY_QUEUE_AVG_DELAY_MS);
        replayQueue95DelayMs = new LatencyPercentileStat(
            statistics, REPLAY_QUEUE_95_DELAY_MS, 0.95f);
        replayQueue99DelayMs = new LatencyPercentileStat(
            statistics, REPLAY_QUEUE_99_DELAY_MS, 0.99f);
        replayQueueMaxDelayMs = new LongMaxStat(
            statistics, REPLAY_QUEUE_MAX_DELAY_MS);

        outputQueueAvgDelayMs =
            new LongAvgStat(statistics, OUTPUT_QUEUE_AVG_DELAY_MS);
        outputQueue95DelayMs = new LatencyPercentileStat(
            statistics, OUTPUT_QUEUE_95_DELAY_MS, 0.95f);
        outputQueue99DelayMs = new LatencyPercentileStat(
            statistics, OUTPUT_QUEUE_99_DELAY_MS, 0.99f);
        outputQueueMaxDelayMs = new LongMaxStat(
            statistics, OUTPUT_QUEUE_MAX_DELAY_MS);

        elapsedTxnAvgMs = new LongAvgStat(statistics, ELAPSED_TXN_AVG_MS);
        elapsedTxn95Ms =
            new LatencyPercentileStat(statistics, ELAPSED_TXN_95_MS, 0.95f);
        elapsedTxn99Ms =
            new LatencyPercentileStat(statistics, ELAPSED_TXN_99_MS, 0.99f);
        elapsedTxnMaxMs = new LongMaxStat(statistics, ELAPSED_TXN_MAX_MS);

        replayLoggingThresholdNs = MILLISECONDS.toNanos(configManager.
           getDuration(RepParams.REPLAY_LOGGING_THRESHOLD));
    }

    /**
     * Note that a heartbeat request has been received over the network and is
     * about to be added to the replay queue. If no heartbeat request is being
     * tracked, note the enqueuing time and the heartbeat's master time. When
     * we dequeue this same request, as determined by matching the master time,
     * we will compute the time it spent in the queue and use that to tally
     * statistics about the replay queue delays.
     */
    void noteEnqueueHeartbeatRequest(Protocol.Heartbeat heartbeat) {
        if (heartbeatRequestEnqueueTime == 0) {
            heartbeatRequestEnqueueTime = System.currentTimeMillis();
            heartbeatRequestMasterNow = heartbeat.getMasterNow();
        }
    }

    /**
     * Note that an incoming heartbeat request has been retrieved from the
     * replay queue in preparation for processing. If we are tracking a
     * heartbeat request and this request's master time matches the entry being
     * tracked, tally the replay queue delay statistics.
     *
     * Then, if no heartbeat response is being tracked, note the time of
     * enqueuing the heartbeat response to the output queue.
     *
     * @return whether to track the heartbeat response time
     */
    boolean noteDequeueHeartbeatRequest(Protocol.Heartbeat heartbeat) {
        long now = 0;
        if ((heartbeatRequestEnqueueTime != 0) &&
            (heartbeatRequestMasterNow == heartbeat.getMasterNow())) {
            now = System.currentTimeMillis();
            final long delay = now - heartbeatRequestEnqueueTime;
            replayQueueAvgDelayMs.add(delay);
            replayQueue95DelayMs.add(delay);
            replayQueue99DelayMs.add(delay);
            replayQueueMaxDelayMs.setMax(delay);
            heartbeatRequestEnqueueTime = 0;
        }
        if (heartbeatResponseEnqueueTime == 0) {
            heartbeatResponseEnqueueTime =
                (now == 0) ? System.currentTimeMillis() : now;
            return true;
        }
        return false;
    }

    /**
     * Note that an outgoing heartbeat response has been retrieved from the
     * output queue in preparation for sending it out over the network.  Tally
     * output queue delay statistics if we were tracking a heartbeat response.
     */
    void noteDequeueHeartbeatResponse() {
        if (heartbeatResponseEnqueueTime != 0) {
            final long delay =
                System.currentTimeMillis() - heartbeatResponseEnqueueTime;
            outputQueueAvgDelayMs.add(delay);
            outputQueue95DelayMs.add(delay);
            outputQueue99DelayMs.add(delay);
            outputQueueMaxDelayMs.setMax(delay);
            heartbeatResponseEnqueueTime = 0;
        }
    }

    public BlockingQueue getOutputQueue() {
        return outputQueue;
    }

    /**
     * Reinitialize for replay from a new feeder
     */
    public void reset() {
        outputQueue.clear();
    }

    LongStat getMessageQueueOverflows() {
        return nMessageQueueOverflows;
    }

    /**
     * Actions that must be taken before the recovery checkpoint, whether
     * the environment is read/write or read/only.
     */
    public void preRecoveryCheckpointInit(RecoveryInfo recoveryInfo) {
        for (Txn txn : recoveryInfo.replayTxns.values()) {

            /*
             * ReplayTxns need to know about their owning activeTxn map,
             * so they can remove themselves at close. We are casting upwards,
             * because the non-HA code is prohibited from referencing
             * Replication classes, and the RecoveryInfo.replayTxns collection
             * doesn't know that it's got ReplayTxns.
             */
            ((ReplayTxn) txn).registerWithActiveTxns(activeTxns);
        }
        lastReplayedVLSN = repImpl.getVLSNIndex().getRange().getLast();
    }

    public TxnInfo getLastReplayedTxn() {
        return lastReplayedTxn;
    }

    public VLSN getLastReplayedVLSN() {
        return lastReplayedVLSN;
    }

    /**
     * When mastership changes, all inflight replay transactions are aborted.
     * Replay transactions need only be aborted by the node that has become
     * the new master (who was previously a Replica). The replay transactions
     * on the other replicas who have not changed roles are
     * resolved by the abort record issued by said new master.
     */
    public void abortOldTxns()
        throws DatabaseException {

        final int masterNodeId = repImpl.getNodeId();
        for (ReplayTxn replayTxn : copyActiveTxns().values()) {
            /*
             * Use NULL for the DTVLSN since it's being written as the MASTER
             * despite being a ReplayTxn; it will be corrected when it's
             * written to the log.
             */
            replayTxn.abort(ReplicationContext.MASTER, masterNodeId,
                            VLSN.NULL_VLSN_SEQUENCE);
        }
        assert activeTxns.isEmpty() : "Unexpected txns in activeTxns = " +
            activeTxns;
    }

    private void updateCommitStats(final boolean needsAck,
                                   final SyncPolicy syncPolicy,
                                   final long startTimeNanos,
                                   final long masterCommitTimeMs,
                                   final long replicaCommitTimeMs,
                                   final long masterTxnTimeMs) {

        final long now = System.nanoTime();
        final long commitNanos = now - startTimeNanos;

        if (commitNanos > ackTimeoutLogThresholdNs &&
            logger.isLoggable(Level.INFO)) {
            LoggerUtils.info
                (logger, repImpl,
                 "Replay commit time: " + (commitNanos / 1000000) +
                 " ms exceeded log threshold: " +
                 (ackTimeoutLogThresholdNs / 1000000));
        }

        nCommits.increment();

        if (needsAck) {
            nCommitAcks.increment();
        }

        if (syncPolicy == SyncPolicy.SYNC) {
            nCommitSyncs.increment();
        } else if (syncPolicy == SyncPolicy.NO_SYNC) {
            nCommitNoSyncs.increment();
        } else if (syncPolicy == SyncPolicy.WRITE_NO_SYNC) {
            nCommitWriteNoSyncs.increment();
        } else {
            throw EnvironmentFailureException.unexpectedState
                ("Unknown sync policy: " + syncPolicy);
        }

        totalCommitProcessingNanos.add(commitNanos);
        minCommitProcessingNanos.setMin(commitNanos);
        maxCommitProcessingNanos.setMax(commitNanos);

        updateElapsedTxnStats(masterTxnTimeMs);

        /*
         * Tally the lag between master and replica commits, even if clock skew
         * makes the lag appear negative.  The documentation already warns that
         * the value will be affected by clock skew, so users can adjust for
         * that, but only if we don't throw the information way.
         */
        final long replicaLagMs = replicaCommitTimeMs - masterCommitTimeMs;
        latestCommitLagMs.set(replicaLagMs);
    }

    private void updateElapsedTxnStats(long masterTxnTimeMs) {
        elapsedTxnAvgMs.add(masterTxnTimeMs);
        elapsedTxn95Ms.add(masterTxnTimeMs);
        elapsedTxn99Ms.add(masterTxnTimeMs);
        elapsedTxnMaxMs.setMax(masterTxnTimeMs);
    }

    /**
     * Apply the operation represented by this log entry on this replica node.
     */
    public void replayEntry(long startNs,
                            Protocol.Entry entry)
        throws DatabaseException,
               IOException,
               InterruptedException,
               MasterSyncException {

        final InputWireRecord wireRecord = entry.getWireRecord();
        final LogEntry logEntry = wireRecord.getLogEntry();

        /*
         * Sanity check that the replication stream is in sequence. We want to
         * forestall any possible corruption from replaying invalid entries.
         */
        if (!wireRecord.getVLSN().follows(lastReplayedVLSN)) {
            throw EnvironmentFailureException.unexpectedState
                (repImpl,
                 "Rep stream not sequential. Current VLSN: " +
                 lastReplayedVLSN +
                 " next log entry VLSN: " + wireRecord.getVLSN());
        }

        if (logger.isLoggable(Level.FINEST)) {
            LoggerUtils.finest(logger, repImpl, "Replaying " + wireRecord);
        }

        final ReplayTxn repTxn = getReplayTxn(logEntry.getTransactionId(), true);
        updateReplicaSequences(logEntry);
        final byte entryType = wireRecord.getEntryType();

        lastReplayedVLSN = wireRecord.getVLSN();

        try {
            final long txnId = repTxn.getId();

            if (LOG_TXN_COMMIT.equalsType(entryType)) {
                Protocol.Commit commitEntry = (Protocol.Commit) entry;

                final boolean needsAck = commitEntry.getNeedsAck();
                final SyncPolicy txnSyncPolicy =
                    commitEntry.getReplicaSyncPolicy();
                final SyncPolicy implSyncPolicy =
                    needsAck ?
                    groupCommit.getImplSyncPolicy(txnSyncPolicy) :
                    noAckSyncPolicy;

                logReplay(repTxn, needsAck, implSyncPolicy);

                final TxnCommit commit = (TxnCommit) logEntry.getMainItem();
                final long dtvlsn = updateDTVLSN(commit);

                if (needsAck) {

                    /*
                     * Only wait if the replica is not lagging and the
                     * durability requires it.
                     */
                    repImpl.getRepNode().getVLSNFreezeLatch().awaitThaw();
                    repImpl.getRepNode().getMasterStatus().assertSync();
                }

                repTxn.commit(implSyncPolicy,
                              new ReplicationContext(lastReplayedVLSN),
                              commit.getMasterNodeId(),
                              dtvlsn);

                final long masterCommitTimeMs = commit.getTime().getTime();
                lastReplayedTxn = new TxnInfo(lastReplayedVLSN,
                                              masterCommitTimeMs);

                updateCommitStats(needsAck, implSyncPolicy, startNs,
                                  masterCommitTimeMs, repTxn.getEndTime(),
                                  repTxn.elapsedTime());

                /* Respond to the feeder. */
                if (needsAck) {
                    /*
                     * Need an ack, either buffer it, for sync group commit, or
                     * queue it.
                     */
                    if (!groupCommit.bufferAck(startNs, repTxn,
                                               txnSyncPolicy)) {
                        queueAck(txnId);
                    }
                }

                /*
                 * The group refresh and recalculation can be expensive, since
                 * it may require a database read. Do it after the ack.
                 */
                if (repTxn.getRepGroupDbChange() && canRefreshGroup(repTxn)) {
                    repImpl.getRepNode().refreshCachedGroup();
                    repImpl.getRepNode().recalculateGlobalCBVLSN();
                }

            } else if (LOG_TXN_ABORT.equalsType(entryType)) {

                nAborts.increment();
                final TxnAbort abort = (TxnAbort) logEntry.getMainItem();
                final ReplicationContext abortContext =
                    new ReplicationContext(wireRecord.getVLSN());
                if (logger.isLoggable(Level.FINEST)) {
                    LoggerUtils.finest(logger, repImpl,
                                       "abort called for " + txnId +
                                       " masterId=" +
                                       abort.getMasterNodeId() +
                                       " repContext=" + abortContext);
                }

                long dtvlsn = updateDTVLSN(abort);
                repTxn.abort(abortContext, abort.getMasterNodeId(), dtvlsn);
                lastReplayedTxn = new TxnInfo(lastReplayedVLSN,
                                              abort.getTime().getTime());
                if (repTxn.getRepGroupDbChange() && canRefreshGroup(repTxn)) {

                    /*
                     * Refresh is the safe thing to do on an abort, since a
                     * refresh may have been held back from an earlier commit
                     * due to this active transaction.
                     */
                    repImpl.getRepNode().refreshCachedGroup();
                }
                updateElapsedTxnStats(repTxn.elapsedTime());

            } else if (LOG_NAMELN_TRANSACTIONAL.equalsType(entryType)) {

                repImpl.getRepNode().getReplica().clearDbTreeCache();
                nNameLNs.increment();
                applyNameLN(repTxn, wireRecord);

            } else {
                nLNs.increment();
                /* A data operation. */
                assert wireRecord.getLogEntry() instanceof LNLogEntry;
                applyLN(repTxn, wireRecord);
            }

            /* Remember the last VLSN applied by this txn. */
            repTxn.setLastAppliedVLSN(lastReplayedVLSN);

        } catch (DatabaseException e) {
            e.addErrorMessage("Problem seen replaying entry " + wireRecord);
            throw e;
        } finally {
            final long elapsedNs = System.nanoTime() - startNs;
            if (elapsedNs > replayLoggingThresholdNs) {
                LoggerUtils.info(logger, repImpl,
                                 "Replay time for entry type:" +
                                 LogEntryType.findType(entryType) + " " +
                                 NANOSECONDS.toMillis(elapsedNs) + "ms " +
                                 "exceeded threshold:" +
                                 NANOSECONDS.
                                     toMillis(replayLoggingThresholdNs) +
                                 "ms");
            }
        }
    }

    /**
     * Update the replica's in-memory DTVLSN using the value in the
     * commit/abort entry.
     *
     * In the normal course of events, DTVLSNs should not decrease. However,
     * there is just one exception: if the rep stream transitions from a post
     * to a pre-dtvlsn stream, it will transition from a positive to the
     * UNINITIALIZED_VLSN_SEQUENCE.
     *
     * A transition from a pre to a post-dtvlsn transition (from zero to some
     * positive value), observes the "DTVLSNs should not decrease" rule
     * automatically.
     *
     * @return the DTVLSN entry in the log txnEnd record so it can be used in
     * the commit/abort operation for replay
     */
    private long updateDTVLSN(final TxnEnd txnEnd) {
        final long txnDTVLSN = txnEnd.getDTVLSN();

        if (txnDTVLSN == VLSN.UNINITIALIZED_VLSN_SEQUENCE) {
            /*
             * A pre DTVLSN format entry, simply set it as the in-memory DTVLSN
             */
            final long prevDTVLSN = repImpl.getRepNode().setDTVLSN(txnDTVLSN);
            if (prevDTVLSN != VLSN.UNINITIALIZED_VLSN_SEQUENCE) {
                LoggerUtils.info(logger, repImpl,
                                 "Transitioned to pre DTVLSN stream." +
                                 " DTVLSN:" + prevDTVLSN +
                                 " at VLSN:" + lastReplayedVLSN);

            }
            lastReplayedDTVLSN = txnDTVLSN;
            return txnDTVLSN;
        }

        /* Sanity check. */
        if (txnDTVLSN < lastReplayedDTVLSN) {
            String msg = "DTVLSNs must be in ascending order in the stream. " +
                " prev DTVLSN:" + lastReplayedDTVLSN +
                " next DTVLSN:" + txnDTVLSN + " at VLSN: " +
                lastReplayedVLSN.getSequence();
          throw EnvironmentFailureException.unexpectedState(repImpl, msg);
        }

        if ((lastReplayedDTVLSN == VLSN.UNINITIALIZED_VLSN_SEQUENCE) &&
            (txnDTVLSN > 0)) {
            LoggerUtils.info(logger, repImpl,
                             "Transitioned to post DTVLSN stream." +
                             " DTVLSN:" + txnDTVLSN +
                             " at VLSN:" + lastReplayedVLSN);
        }

        lastReplayedDTVLSN = txnDTVLSN;
        repImpl.getRepNode().setDTVLSN(txnDTVLSN);

        return txnDTVLSN;
    }

    /**
     * Queue the request ack for an async ack write to the network.
     */
    void queueAck(final long txnId) throws IOException {
        try {
            outputQueue.put(txnId);
        } catch (InterruptedException ie) {
            /*
             * Have the higher levels treat it like an IOE and
             * exit the thread.
             */
            throw new IOException("Ack I/O interrupted", ie);
        }
    }

    /**
     * Logs information associated with the replay of the txn commit
     */
    private void logReplay(ReplayTxn repTxn,
                           boolean needsAck,
                           SyncPolicy syncPolicy) {

        if (!logger.isLoggable(Level.FINE)) {
            return;
        }

        if (needsAck) {
            LoggerUtils.fine(logger, repImpl,
                             "Replay: got commit for txn=" + repTxn.getId() +
                             ", ack needed, replica sync policy=" +
                             syncPolicy +
                             " vlsn=" + lastReplayedVLSN);
        } else {
            LoggerUtils.fine(logger, repImpl,
                             "Replay: got commit for txn=" + repTxn.getId() +
                             " ack not needed" +
                             " vlsn=" + lastReplayedVLSN);
        }
    }

    /**
     * Returns true if there are no other activeTxns that have also modified
     * the membership database and are still open, since they could potentially
     * hold write locks that would block the read locks acquired during the
     * refresh operation.
     *
     * @param txn the current txn being committed or aborted
     *
     * @return true if there are no open transactions that hold locks on the
     * membership database.
     */
    private boolean canRefreshGroup(ReplayTxn txn) {

        /*
         * Use synchronized rather than copyActiveTxns, since this is called
         * during replay and there is no nested locking to worry about.
         */
        synchronized (activeTxns) {
            // TODO: very inefficient
            for (ReplayTxn atxn : activeTxns.getMap().values()) {
                if (atxn == txn) {
                    continue;
                }
                if (atxn.getRepGroupDbChange()) {
                    return false;
                }
            }
        }
        return true;
    }

    /**
     * Update this replica's node, txn and database sequences with any ids in
     * this log entry. We can call update, even if the replay id doesn't
     * represent a new lowest-id point, or if the apply is not successful,
     * because the apply checks that the replay id is < the sequence on the
     * replica. We just want to ensure that if this node becomes the master,
     * its sequences are in sync with what came before in the replication
     * stream, and ids are not incorrectly reused.
     */
    private void updateReplicaSequences(LogEntry logEntry) {

        /* For now, we assume all replay entries have a txn id. */
        repImpl.getTxnManager().updateFromReplay(logEntry.getTransactionId());

        /* If it's a database operation, update the database id. */
        if (logEntry instanceof NameLNLogEntry) {
            NameLNLogEntry nameLogEntry = (NameLNLogEntry) logEntry;
            nameLogEntry.postFetchInit(false /*isDupDb*/);
            NameLN nameLN = (NameLN) nameLogEntry.getLN();
            repImpl.getDbTree().updateFromReplay(nameLN.getId());
        }
    }

    /**
     * Obtain a ReplayTxn to represent the incoming operation.
     */
    public ReplayTxn getReplayTxn(long txnId, boolean registerTxnImmediately)
        throws DatabaseException {

        ReplayTxn useTxn = null;
        synchronized (activeTxns) {
            useTxn = activeTxns.get(txnId);
            if (useTxn == null) {

                /*
                 * Durability will be explicitly specified when
                 * ReplayTxn.commit is called, so TransactionConfig.DEFAULT is
                 * fine.
                 */
                if (registerTxnImmediately) {
                    useTxn = new ReplayTxn(repImpl, TransactionConfig.DEFAULT,
                                           txnId, activeTxns, logger);
                } else {
                    useTxn = new ReplayTxn(repImpl, TransactionConfig.DEFAULT,
                                           txnId, activeTxns, logger) {
                            @Override
                            protected
                            boolean registerImmediately() {
                                return false;
                            }
                    };
                }
            }
        }
        return useTxn;
    }

    /**
     * Replays the NameLN.
     *
     * Note that the operations: remove, rename and truncate need to establish
     * write locks on the database. Any open handles are closed by this
     * operation by virtue of the ReplayTxn's importunate property.  The
     * application will receive a LockPreemptedException if it subsequently
     * accesses the database handle.
     */
    private void applyNameLN(ReplayTxn repTxn,
                             InputWireRecord wireRecord)
        throws DatabaseException {

        NameLNLogEntry nameLNEntry = (NameLNLogEntry) wireRecord.getLogEntry();
        final NameLN nameLN = (NameLN) nameLNEntry.getLN();

        String databaseName = StringUtils.fromUTF8(nameLNEntry.getKey());

        final DbOpReplicationContext repContext =
            new DbOpReplicationContext(wireRecord.getVLSN(), nameLNEntry);

        DbOperationType opType = repContext.getDbOperationType();
        DatabaseImpl dbImpl = null;
        try {
            switch (opType) {
                case CREATE:
                {
                    DatabaseConfig dbConfig =
                        repContext.getCreateConfig().getReplicaConfig(repImpl);

                    dbImpl = repImpl.getDbTree().createReplicaDb
                      (repTxn, databaseName, dbConfig, nameLN, repContext);

                    /*
                     * We rely on the RepGroupDB.DB_ID value, so make sure
                     * it's what we expect for this internal replicated
                     * database.
                     */
                    if ((dbImpl.getId().getId() == RepGroupDB.DB_ID) &&
                        !DbType.REP_GROUP.getInternalName().equals
                        (databaseName)) {
                        throw EnvironmentFailureException.unexpectedState
                            ("Database: " +
                             DbType.REP_GROUP.getInternalName() +
                             " is associated with id: " +
                             dbImpl.getId().getId() +
                             " and not the reserved database id: " +
                             RepGroupDB.DB_ID);
                    }

                    TriggerManager.runOpenTriggers(repTxn, dbImpl, true);
                    break;
                }

                case REMOVE: {
                    dbImpl = repImpl.getDbTree().getDb(nameLN.getId());
                    try {
                        repImpl.getDbTree().removeReplicaDb
                            (repTxn, databaseName, nameLN.getId(), repContext);
                        TriggerManager.runRemoveTriggers(repTxn, dbImpl);
                    } catch (DatabaseNotFoundException e) {
                        throw EnvironmentFailureException.unexpectedState
                            ("Database: " + dbImpl.getName() +
                             " Id: " + nameLN.getId() +
                             " not found on the Replica.");
                    }
                    break;
                }

                case TRUNCATE: {
                    dbImpl = repImpl.getDbTree().getDb
                        (repContext.getTruncateOldDbId());
                    try {
                        TruncateDbResult result =
                        repImpl.getDbTree().truncateReplicaDb
                            (repTxn, databaseName, false, nameLN, repContext);
                        TriggerManager.runTruncateTriggers(repTxn, result.newDb);
                    } catch (DatabaseNotFoundException e) {
                        throw EnvironmentFailureException.unexpectedState
                            ("Database: " + dbImpl.getName() +
                             " Id: " + nameLN.getId() +
                             " not found on the Replica.");
                    }

                    break;
                }

                case RENAME: {
                    dbImpl = repImpl.getDbTree().getDb(nameLN.getId());
                    try {
                        dbImpl =
                        repImpl.getDbTree().renameReplicaDb
                            (repTxn, dbImpl.getName(), databaseName, nameLN,
                             repContext);
                        TriggerManager.runRenameTriggers(repTxn, dbImpl,
                                                         databaseName);
                    } catch (DatabaseNotFoundException e) {
                        throw EnvironmentFailureException.unexpectedState
                            ("Database rename from: " + dbImpl.getName() +
                             " to " + databaseName +
                             " failed, name not found on the Replica.");
                    }
                    break;
                }

                case UPDATE_CONFIG: {
                    /* Get the replicated database configurations. */
                    DatabaseConfig dbConfig =
                        repContext.getCreateConfig().getReplicaConfig(repImpl);

                    /* Update the NameLN and write it to the log. */
                    dbImpl = repImpl.getDbTree().getDb(nameLN.getId());
                    final String dbName = dbImpl.getName();
                    repImpl.getDbTree().updateNameLN
                        (repTxn, dbName, repContext);

                    /* Set the new configurations to DatabaseImpl. */
                    dbImpl.setConfigProperties
                        (repTxn, dbName, dbConfig, repImpl);

                    repImpl.getDbTree().modifyDbRoot(dbImpl);

                    break;
                }

                default:
                    throw EnvironmentFailureException.unexpectedState
                        ("Illegal database op type of " + opType.toString() +
                         " from " + wireRecord + " database=" + databaseName);
            }
        } finally {
            if (dbImpl != null) {
                repImpl.getDbTree().releaseDb(dbImpl);
            }
        }
    }

    private void applyLN(
        final ReplayTxn repTxn,
        final InputWireRecord wireRecord)
        throws DatabaseException {

        final LNLogEntry lnEntry = (LNLogEntry) wireRecord.getLogEntry();
        final DatabaseId dbId = lnEntry.getDbId();

        /*
         * If this is a change to the rep group db, remember at commit time,
         * and refresh this node's group metadata.
         */
        if (dbId.getId() == RepGroupDB.DB_ID) {
            repTxn.noteRepGroupDbChange();
        }

        /*
         * Note that we don't have to worry about serializable isolation when
         * applying a replicated txn; serializable isolation in only an issue
         * for txns that take read locks, and a replicated txn consists only of
         * write operations.
         */
        final DatabaseImpl dbImpl =
            repImpl.getRepNode().getReplica().getDbCache().get(dbId, repTxn);

        lnEntry.postFetchInit(dbImpl);

        final ReplicationContext repContext =
            new ReplicationContext(wireRecord.getVLSN());

        try (final Cursor cursor = DbInternal.makeCursor(
                dbImpl, repTxn, null /*cursorConfig*/)) {

            OperationResult result;
            final LN ln = lnEntry.getLN();

            /* In a dup DB, do not expect embedded LNs or non-empty data. */
            if (dbImpl.getSortedDuplicates() &&
                (lnEntry.isEmbeddedLN() ||
                 (ln.getData() != null && ln.getData().length > 0))) {

                throw EnvironmentFailureException.unexpectedState(
                    dbImpl.getEnv(),
                    "[#25288] emb=" + lnEntry.isEmbeddedLN() +
                    " key=" + Key.getNoFormatString(lnEntry.getKey()) +
                    " data=" + Key.getNoFormatString(ln.getData()) +
                    " vlsn=" + ln.getVLSNSequence());
            }

            if (ln.isDeleted()) {

                /*
                 * Perform an exact search by key. Use a partial data entry
                 * (delDataEntry) to avoid reading old data.
                 */
                replayKeyEntry.setData(lnEntry.getKey());

                result = DbInternal.searchForReplay(
                    cursor, replayKeyEntry, delDataEntry,
                    LockMode.RMW, SearchMode.SET);

                if (result != null) {
                    result =
                        DbInternal.deleteWithRepContext(cursor, repContext);
                }
            } else {
                replayKeyEntry.setData(lnEntry.getKey());
                replayDataEntry.setData(ln.getData());

                result = DbInternal.putForReplay(
                    cursor, replayKeyEntry, replayDataEntry, ln,
                    lnEntry.getExpiration(), lnEntry.isExpirationInHours(),
                    PutMode.OVERWRITE, repContext);
            }

            /*
             * For a deletion we expect the record to previously exist. Note
             * that the put call never fails since PutMode.OVERWRITE is used.
             *
             * Ignore deletion errors for records that are expired (within TTL
             * clock tolerance), since these can be deleted from the Btree by
             * background IN compression. A record may expire after the master
             * deletion op and before the replica replay, e.g., due to a
             * lagging replica or a replica that was down during the deletion.
             *
             * Also ignore deletion errors for EXTINCT and MAYBE_EXTINCT
             * records to account for apps that try to delete extinct records.
             * This is illegal but should not bring down the RN.
             */
            if (result == null) {

                final ExtinctionStatus extinctionState =
                    repImpl.getExtinctionState(dbImpl, lnEntry.getKey());

                final boolean isExpired = repImpl.expiresWithin(
                    lnEntry.getExpiration(), lnEntry.isExpirationInHours(),
                    repImpl.getTtlClockTolerance());

                final String expTime = (lnEntry.getExpiration() == 0) ?
                    "none" :
                    TTL.formatExpiration(
                        lnEntry.getExpiration(),
                        lnEntry.isExpirationInHours());

                if (!isExpired && extinctionState == NOT_EXTINCT) {

                    throw new EnvironmentFailureException(
                        repImpl,
                        EnvironmentFailureReason.LOG_INCOMPLETE,
                        "Replicated operation could  not be applied. " +
                            " vlsn=" + wireRecord.getVLSN() +
                            " expirationTime=" + expTime +
                            // TODO remove key after debugging
                            " key=" + Key.dumpString(lnEntry.getKey(), 0) +
                            wireRecord);
                }
            }
        }

        /*
         * Queue new extinct scans for later processing. Must be done after
         * applying the LN so the record is present in the Btree when the
         * extinct scanner thread tries to read it.
         */
        if (lnEntry.getLogType().equals(
            LogEntryType.LOG_EXTINCT_SCAN_LN_TRANSACTIONAL)) {

            repImpl.getExtinctionScanner().replay(
                lnEntry.getKey(), lnEntry.getData());
        }
    }

    /**
     * Go through all active txns and rollback up to but not including the log
     * entry represented by the matchpoint VLSN.
     *
     * Effectively truncate these rolled back log entries by making them
     * invisible. Flush the log first, to make sure these log entries are out
     * of the log buffers and are on disk, so we can reliably find them through
     * the FileManager.
     *
     * Rollback steps are described in
     * https://sleepycat.oracle.com/trac/wiki/Logging#Recoverysteps. In
     * summary,
     *
     * 1. Log and fsync a new RollbackStart record
     * 2. Do the rollback in memory. There is no need to explicitly
     *    log INs made dirty by the rollback operation.
     * 3. Do invisibility masking by overwriting LNs.
     * 4. Fsync all overwritten log files at this point.
     * 5. Write a RollbackEnd record, for ease of debugging
     *
     * Note that application read txns  can continue to run during syncup.
     * Reader txns cannot access records that are being rolled back, because
     * they are in txns that are not committed, i.e, they are write locked.
     * The rollback interval never includes committed txns, and we do a hard
     * recovery if it would include them.
     */
    public void rollback(VLSN matchpointVLSN, long matchpointLsn) {

        String rollbackStatus = RBSTATUS_START;

        final Map localActiveTxns = copyActiveTxns();
        try {
            if (localActiveTxns.size() == 0) {
                /* no live read/write txns, nothing to do. */
                rollbackStatus = RBSTATUS_NO_ACTIVE;
                return;
            }

            VLSNRange range = repImpl.getVLSNIndex().getRange();
            if (range.getLast().equals(matchpointVLSN)) {
                /* nothing to roll back. */
                rollbackStatus = RBSTATUS_RANGE_EQUALS;
                return;
            }

            repImpl.setSyncupProgress(SyncupProgress.DO_ROLLBACK);

            /*
             * Stop the log file backup service, since the files will be in an
             * inconsistent state while the rollback is in progress.
             */
            repImpl.getRepNode().shutdownNetworkBackup();

            /*
             * Set repImpl's isRollingBack to true, and invalidate all the in
             * progress DbBackup.
             */
            repImpl.setBackupProhibited(true);
            repImpl.invalidateBackups(DbLsn.getFileNumber(matchpointLsn));

            /*
             * 1. Log RollbackStart. The fsync guarantees that this marker will
             * be present in the log for recovery. It also ensures that all log
             * entries will be flushed to disk and the TxnChain will not have
             * to worry about entries that are in log buffers when constructing
             * the rollback information.
             */
            LogManager logManager = repImpl.getLogManager();
            LogEntry rollbackStart = SingleItemEntry.create(
                LogEntryType.LOG_ROLLBACK_START,
                new RollbackStart(
                    matchpointVLSN, matchpointLsn, localActiveTxns.keySet()));
            long rollbackStartLsn =
                logManager.logForceFlush(rollbackStart,
                                         true, // fsyncRequired,
                                         ReplicationContext.NO_REPLICATE);
            rollbackStatus = RBSTATUS_LOG_RBSTART;

            /*
             * 2. Do rollback in memory. Undo any operations that were logged
             * after the matchpointLsn, and save the LSNs for those log
             * entries.. There should be something to undo, because we checked
             * earlier that there were log entries after the matchpoint.
             */
            List rollbackLsns = new ArrayList();
            for (ReplayTxn replayTxn : localActiveTxns.values()) {
                Collection txnRollbackLsns =
                    replayTxn.rollback(matchpointLsn);

                /*
                 * Txns that were entirely rolled back should have been removed
                 * from the activeTxns map.
                 */
                assert checkRemoved(replayTxn) :
                    "Should have removed " + replayTxn;

                rollbackLsns.addAll(txnRollbackLsns);
            }
            rollbackStatus = RBSTATUS_MEM_ROLLBACK;
            assert rollbackLsns.size() != 0 : dumpActiveTxns(matchpointLsn);

            /*
             * 3 & 4 - Mark the rolled back log entries as invisible.  But
             * before doing so, invoke any registered rewrite listeners, so the
             * application knows that existing log files will be modified.
             *
             * After all are done, fsync the set of files. By waiting, some may
             * have made it out on their own.
             */
            LogFileRewriteListener listener = repImpl.getLogRewriteListener();
            if (listener != null) {
                listener.rewriteLogFiles(getFileNames(rollbackLsns));
            }
            RollbackTracker.makeInvisible(repImpl, rollbackLsns);
            rollbackStatus = RBSTATUS_INVISIBLE;

            /*
             * 5. Log RollbackEnd. Flush it so that we can use it to optimize
             * recoveries later on. If the RollbackEnd exists, we can skip the
             * step of re-making LNs invisible.
             */
            logManager.logForceFlush(
                SingleItemEntry.create(LogEntryType.LOG_ROLLBACK_END,
                                       new RollbackEnd(matchpointLsn,
                                                       rollbackStartLsn)),
                 true, // fsyncRequired
                 ReplicationContext.NO_REPLICATE);

            /*
             * Restart the backup service only if all the steps of the
             * rollback were successful.
             */
            repImpl.getRepNode().restartNetworkBackup();
            repImpl.setBackupProhibited(false);
            rollbackStatus = RBSTATUS_FINISH;
        } finally {

            /* Reset the lastReplayedVLSN so it's correct when we resume. */
            lastReplayedVLSN = matchpointVLSN;
            LoggerUtils.info(logger, repImpl,
                             "Rollback to matchpoint " + matchpointVLSN +
                             " at " + DbLsn.getNoFormatString(matchpointLsn) +
                             " status=" + rollbackStatus);
        }
    }

    /* For debugging support */
    private String dumpActiveTxns(long matchpointLsn) {
        StringBuilder sb = new StringBuilder();
        sb.append("matchpointLsn=");
        sb.append(DbLsn.getNoFormatString(matchpointLsn));
        for (ReplayTxn replayTxn : copyActiveTxns().values()) {
            sb.append("txn id=").append(replayTxn.getId());
            sb.append(" locks=").append(replayTxn.getWriteLockIds());
            sb.append("lastLogged=");
            sb.append(DbLsn.getNoFormatString(replayTxn.getLastLsn()));
            sb.append("\n");
        }

        return sb.toString();
    }

    private Set getFileNames(List lsns) {
        Set fileNums = new HashSet();
        Set files = new HashSet();

        for (long lsn : lsns) {
            fileNums.add(DbLsn.getFileNumber(lsn));
        }
        for (long fileNum : fileNums) {
            files.add(new File(FileManager.getFileName(fileNum)));
        }
        return files;
    }

    private boolean checkRemoved(ReplayTxn txn) {
        if (txn.isClosed()) {
            if (activeTxns.get(txn.getId()) != null) {
                return false;
            }
        }

        return true;
    }

    /**
     * Make a copy of activeTxns to avoid holding its mutex while iterating.
     * Can be used whenever the cost of the HashMap copy is not significant.
     */
    private Map copyActiveTxns() {
        return activeTxns.getMap();
    }

    /**
     * Release all transactions, database handles, etc held by the replay
     * unit. The Replicator is closing down and Replay will not be invoked
     * again.
     */
    public void close() {

        for (ReplayTxn replayTxn : copyActiveTxns().values()) {
            try {
                if (logger.isLoggable(Level.FINE)) {
                    LoggerUtils.fine(logger, repImpl,
                                     "Unregistering open replay txn: " +
                                     replayTxn.getId());
                }
                replayTxn.cleanup();
            } catch (DatabaseException e) {
                LoggerUtils.fine(logger, repImpl,
                                 "Replay txn: " + replayTxn.getId() +
                                 " unregistration failed: " + e.getMessage());
            }
        }
        assert activeTxns.isEmpty();
    }

    /**
     * Returns a copy of the statistics associated with Replay
     */
    public StatGroup getStats(StatsConfig config) {
        StatGroup ret = statistics.cloneGroup(config.getClear());

        return ret;
    }

    public void resetStats() {
        statistics.clear();
    }

    /* For unit tests */
    public SimpleTxnMap getActiveTxns() {
        return activeTxns;
    }

    public String dumpState() {
        StringBuilder sb = new StringBuilder();
        sb.append("lastReplayedTxn=").append(lastReplayedTxn);
        sb.append(" lastReplayedVLSN=").append(lastReplayedVLSN);
        sb.append(" numActiveReplayTxns=").append(activeTxns.size());
        sb.append("\n");
        return sb.toString();
    }

    /**
     * Write out any pending acknowledgments. See GroupCommit.flushPendingAcks
     * for details. This method is invoked after each log entry is read from
     * the replication stream.
     *
     * @param nowNs the time at the reading of the log entry
     */
    void flushPendingAcks(long nowNs)
        throws IOException {

        groupCommit.flushPendingAcks(nowNs);
    }

    /**
     * See GroupCommit.getPollIntervalNs(long)
     */
    long getPollIntervalNs(long defaultNs) {
        return groupCommit.getPollIntervalNs(defaultNs);
    }

    /**
     * Implements group commit. It's really a substructure of Replay and exists
     * mainly for modularity reasons.
     * 
     * Since replay is single threaded, the group commit mechanism works
     * differently in the replica than in the master. In the replica, SYNC
     * transactions are converted into NO_SYNC transactions and executed
     * immediately, but their acknowledgments are delayed until after either
     * the REPLICA_GROUP_COMMIT_INTERVAL (the max amount the first transaction
     * in the group is delayed) has expired, or the size of the group (as
     * specified by REPLICA_MAX_GROUP_COMMIT) has been exceeded.
     */
    private class GroupCommit {

        /* Size determines max fsync commits that can be grouped. */
        private final long pendingCommitAcks[];

        /* Number of entries currently in pendingCommitAcks */
        private int nPendingAcks;

        /*
         * If this time limit is reached, the group will be forced to commit.
         * Invariant: nPendingAcks > 0 ==> limitGroupCommitNs > 0
         */
        private long limitGroupCommitNs = 0;

        /* The time interval that an open group is held back. */
        private final long groupCommitIntervalNs;

        private final LongStat nGroupCommitTimeouts;
        private final LongStat nGroupCommitMaxExceeded;
        private final LongStat nGroupCommits;
        private final LongStat nGroupCommitTxns;

        private GroupCommit(DbConfigManager configManager) {
            pendingCommitAcks = new long[configManager.
                getInt(RepParams.REPLICA_MAX_GROUP_COMMIT)];

            nPendingAcks = 0;

            final long groupCommitIntervalMs = configManager.
                getDuration(RepParams.REPLICA_GROUP_COMMIT_INTERVAL);

            groupCommitIntervalNs =
                NANOSECONDS.convert(groupCommitIntervalMs, MILLISECONDS);
            nGroupCommitTimeouts =
                new LongStat(statistics, N_GROUP_COMMIT_TIMEOUTS);

            nGroupCommitMaxExceeded =
                new LongStat(statistics, N_GROUP_COMMIT_MAX_EXCEEDED);

            nGroupCommitTxns =
                new LongStat(statistics, N_GROUP_COMMIT_TXNS);

            nGroupCommits =
                new LongStat(statistics, N_GROUP_COMMITS);
        }

        /**
         * Returns true if group commits are enabled at the replica.
         */
        private boolean isEnabled() {
            return pendingCommitAcks.length > 0;
        }

        /**
         * The interval used to poll for incoming log entries. The time is
         * lowered from the defaultNs time, if there are pending
         * acknowledgments.
         *
         * @param defaultNs the default poll interval
         *
         * @return the actual poll interval
         */
        private long getPollIntervalNs(long defaultNs) {
            if (nPendingAcks == 0) {
                return defaultNs;
            }
            final long now = System.nanoTime();

            final long interval = limitGroupCommitNs - now;
            return Math.min(interval, defaultNs);
        }

        /**
         * Returns the sync policy to be implemented at the replica. If
         * group commit is active, and SYNC is requested it will return
         * NO_SYNC instead to delay the fsync.
         *
         * @param txnSyncPolicy the sync policy as stated in the txn
         *
         * @return the sync policy to be implemented by the replica
         */
        private SyncPolicy getImplSyncPolicy(SyncPolicy txnSyncPolicy) {
            return ((txnSyncPolicy ==  SyncPolicy.SYNC) && isEnabled()) ?
                   SyncPolicy.NO_SYNC : txnSyncPolicy;
        }

        /**
         * Buffers the acknowledgment if the commit calls for a sync, or if
         * there are pending acknowledgments to ensure that acks are sent
         * in order.
         *
         * @param nowNs the current time
         * @param ackTxn the txn associated with the ack
         * @param txnSyncPolicy the sync policy as request by the committing
         * txn
         *
         * @return true if the ack has been buffered
         */
        private final boolean bufferAck(long nowNs,
                                        ReplayTxn ackTxn,
                                        SyncPolicy txnSyncPolicy)
            throws IOException {

            if (!isEnabled() ||
                !((txnSyncPolicy == SyncPolicy.SYNC) || (nPendingAcks > 0))) {
                return false;
            }

            pendingCommitAcks[nPendingAcks++] = ackTxn.getId();

            if (nPendingAcks == 1) {
                /* First txn in group, start the clock. */
                limitGroupCommitNs = nowNs + groupCommitIntervalNs;
            } else {
                flushPendingAcks(nowNs);
            }
            return true;
        }

        /**
         * Flush if there are pending acks and either the buffer limit or the
         * group interval has been reached.
         *
         * @param nowNs the current time (passed in to minimize system calls)
         */
        private final void flushPendingAcks(long nowNs)
            throws IOException {

            if ((nPendingAcks == 0) ||
                ((nPendingAcks != pendingCommitAcks.length) &&
                 (NanoTimeUtil.compare(nowNs, limitGroupCommitNs) < 0))) {

                return;
            }

            /* Update statistics. */
            nGroupCommits.increment();
            nGroupCommitTxns.add(nPendingAcks);
            if (NanoTimeUtil.compare(nowNs, limitGroupCommitNs) >= 0) {
                nGroupCommitTimeouts.increment();
            } else if (nPendingAcks >= pendingCommitAcks.length) {
                nGroupCommitMaxExceeded.increment();
            }

            /* flush log buffer and fsync to disk */
            repImpl.getLogManager().flushSync();

            /* commits are on disk, send out acknowledgments on the network. */
            for (int i=0; i < nPendingAcks; i++) {
                queueAck(pendingCommitAcks[i]);
                pendingCommitAcks[i] = 0;
            }

            nPendingAcks = 0;
            limitGroupCommitNs = 0;
        }
    }

    /**
     * Simple helper class to package a Txn vlsn and its associated commit
     * time.
     */
    public static class TxnInfo {
        final VLSN txnVLSN;
        final long masterTxnEndTime;

        private TxnInfo(VLSN txnVLSN, long masterTxnEndTime) {
            this.txnVLSN = txnVLSN;
            this.masterTxnEndTime = masterTxnEndTime;
        }

        public VLSN getTxnVLSN() {
            return txnVLSN;
        }

        public long getMasterTxnEndTime() {
            return masterTxnEndTime;
        }

        @Override
        public String toString() {
            return " VLSN: " + txnVLSN +
                " masterTxnEndTime=" + new Date(masterTxnEndTime);
        }
    }
}