All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.sleepycat.je.rep.impl.node.Replay Maven / Gradle / Ivy

The newest version!
/*-
 * Copyright (C) 2002, 2018, Oracle and/or its affiliates. All rights reserved.
 *
 * This file was distributed by Oracle as part of a version of Oracle Berkeley
 * DB Java Edition made available at:
 *
 * http://www.oracle.com/technetwork/database/database-technologies/berkeleydb/downloads/index.html
 *
 * Please see the LICENSE file included in the top-level directory of the
 * appropriate version of Oracle Berkeley DB Java Edition for a copy of the
 * license and additional information.
 */

package com.sleepycat.je.rep.impl.node;

import static com.sleepycat.je.ExtinctionFilter.ExtinctionStatus.NOT_EXTINCT;
import static com.sleepycat.je.log.LogEntryType.LOG_NAMELN_TRANSACTIONAL;
import static com.sleepycat.je.log.LogEntryType.LOG_TXN_ABORT;
import static com.sleepycat.je.log.LogEntryType.LOG_TXN_COMMIT;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.ELAPSED_TXN_95_MS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.ELAPSED_TXN_99_MS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.ELAPSED_TXN_AVG_MS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.ELAPSED_TXN_MAX_MS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.LATEST_COMMIT_LAG_MS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.MAX_COMMIT_PROCESSING_NANOS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.MIN_COMMIT_PROCESSING_NANOS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.N_ABORTS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.N_COMMITS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.N_COMMIT_ACKS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.N_COMMIT_NO_SYNCS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.N_COMMIT_SYNCS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.N_COMMIT_WRITE_NO_SYNCS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.N_GROUP_COMMITS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.N_GROUP_COMMIT_MAX_EXCEEDED;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.N_GROUP_COMMIT_TIMEOUTS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.N_GROUP_COMMIT_TXNS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.N_LNS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.N_MESSAGE_QUEUE_OVERFLOWS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.N_NAME_LNS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.OUTPUT_QUEUE_95_DELAY_MS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.OUTPUT_QUEUE_99_DELAY_MS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.OUTPUT_QUEUE_AVG_DELAY_MS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.OUTPUT_QUEUE_MAX_DELAY_MS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.REPLAY_QUEUE_95_DELAY_MS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.REPLAY_QUEUE_99_DELAY_MS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.REPLAY_QUEUE_AVG_DELAY_MS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.REPLAY_QUEUE_MAX_DELAY_MS;
import static com.sleepycat.je.rep.impl.node.ReplayStatDefinition.TOTAL_COMMIT_PROCESSING_NANOS;
import static java.util.concurrent.TimeUnit.MILLISECONDS;
import static java.util.concurrent.TimeUnit.NANOSECONDS;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.logging.Level;
import java.util.logging.Logger;

import com.sleepycat.je.Cursor;
import com.sleepycat.je.DatabaseConfig;
import com.sleepycat.je.DatabaseEntry;
import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.DatabaseNotFoundException;
import com.sleepycat.je.DbInternal;
import com.sleepycat.je.Durability.SyncPolicy;
import com.sleepycat.je.EnvironmentFailureException;
import com.sleepycat.je.ExtinctionFilter.ExtinctionStatus;
import com.sleepycat.je.LockMode;
import com.sleepycat.je.OperationResult;
import com.sleepycat.je.StatsConfig;
import com.sleepycat.je.TransactionConfig;
import com.sleepycat.je.dbi.DatabaseId;
import com.sleepycat.je.dbi.DatabaseImpl;
import com.sleepycat.je.dbi.DbConfigManager;
import com.sleepycat.je.dbi.DbTree.TruncateDbResult;
import com.sleepycat.je.dbi.DbType;
import com.sleepycat.je.dbi.EnvironmentFailureReason;
import com.sleepycat.je.dbi.PutMode;
import com.sleepycat.je.dbi.SearchMode;
import com.sleepycat.je.dbi.TTL;
import com.sleepycat.je.dbi.TriggerManager;
import com.sleepycat.je.log.DbOpReplicationContext;
import com.sleepycat.je.log.FileManager;
import com.sleepycat.je.log.LogEntryType;
import com.sleepycat.je.log.LogManager;
import com.sleepycat.je.log.ReplicationContext;
import com.sleepycat.je.log.entry.DbOperationType;
import com.sleepycat.je.log.entry.LNLogEntry;
import com.sleepycat.je.log.entry.LogEntry;
import com.sleepycat.je.log.entry.NameLNLogEntry;
import com.sleepycat.je.log.entry.SingleItemEntry;
import com.sleepycat.je.recovery.RecoveryInfo;
import com.sleepycat.je.recovery.RollbackTracker;
import com.sleepycat.je.rep.LogFileRewriteListener;
import com.sleepycat.je.rep.SyncupProgress;
import com.sleepycat.je.rep.impl.RepGroupDB;
import com.sleepycat.je.rep.impl.RepImpl;
import com.sleepycat.je.rep.impl.RepParams;
import com.sleepycat.je.rep.stream.InputWireRecord;
import com.sleepycat.je.rep.stream.MasterStatus.MasterSyncException;
import com.sleepycat.je.rep.stream.Protocol;
import com.sleepycat.je.rep.txn.ReplayTxn;
import com.sleepycat.je.rep.utilint.LongMinZeroStat;
import com.sleepycat.je.rep.utilint.SimpleTxnMap;
import com.sleepycat.je.rep.vlsn.VLSNRange;
import com.sleepycat.je.tree.Key;
import com.sleepycat.je.tree.LN;
import com.sleepycat.je.tree.NameLN;
import com.sleepycat.je.txn.RollbackEnd;
import com.sleepycat.je.txn.RollbackStart;
import com.sleepycat.je.txn.Txn;
import com.sleepycat.je.txn.TxnAbort;
import com.sleepycat.je.txn.TxnCommit;
import com.sleepycat.je.txn.TxnEnd;
import com.sleepycat.je.utilint.DbLsn;
import com.sleepycat.je.utilint.LatencyPercentileStat;
import com.sleepycat.je.utilint.LoggerUtils;
import com.sleepycat.je.utilint.LongAvgStat;
import com.sleepycat.je.utilint.LongMaxStat;
import com.sleepycat.je.utilint.LongMaxZeroStat;
import com.sleepycat.je.utilint.LongMinStat;
import com.sleepycat.je.utilint.LongStat;
import com.sleepycat.je.utilint.NanoTimeUtil;
import com.sleepycat.je.utilint.StatGroup;
import com.sleepycat.je.utilint.VLSN;
import com.sleepycat.utilint.StringUtils;

/**
 * Replays log records from the replication stream, and manages the
 * transactions for those records.
 *
 * The Replay module has a lifetime equivalent to the environment owned by
 * this replicator. Its lifetime is longer than the feeder/replica stream.
 * For example, suppose this is nodeX:
 *
 * t1 - Node X is a replica, node A is master. Replay X is alive
 * t2 - Node X is a replica, node B takes over as master. X's Replay module
 *      is still alive and has the same set of active txns. It doesn't matter
 *      to X that the master has changed.
 * t3 - Node X becomes the master. Now its Replay unit is cleared, because
 *      anything managed by the Replay is defunct.
 */
public class Replay {

    /* These are strings for the rollback logging. */
    private static final String RBSTATUS_START =
        "Started Rollback";
    private static final String RBSTATUS_NO_ACTIVE =
        "No active txns, nothing to rollback";
    private static final String RBSTATUS_RANGE_EQUALS =
        "End of range equals matchpoint, nothing to rollback";
    private static final String RBSTATUS_LOG_RBSTART =
        "Logged RollbackStart entry";
    private static final String RBSTATUS_MEM_ROLLBACK =
        "Finished in-memory rollback";
    private static final String RBSTATUS_INVISIBLE =
        "Finished invisible setting";
    private static final String RBSTATUS_FINISH =
        "Finished rollback";

    /*
     * DatabaseEntry objects reused during replay, to minimize allocation in
     * high performance replay path.
     */
    final DatabaseEntry replayKeyEntry = new DatabaseEntry();
    final DatabaseEntry replayDataEntry = new DatabaseEntry();
    final DatabaseEntry delDataEntry = new DatabaseEntry();

    private final RepImpl repImpl;

    /**
     *  If a commit replay operation takes more than this threshold, it's
     *  logged. This information helps determine whether ack timeouts on the
     *  master are due to a slow replica, or the network.
     */
    private final long ackTimeoutLogThresholdNs;

    /**
     * ActiveTxns is a collection of txn objects used for applying replicated
     * transactions. This collection should be empty if the node is a master.
     *
     * Note that there is an interesting relationship between ActiveTxns and
     * the txn collection managed by the environment TxnManager. ActiveTxns is
     * effectively a subset of the set of txns held by the
     * TxnManager. ReplayTxns must be sure to register and unregister
     * themselves from ActiveTxns, just as all Txns must register and
     * unregister with the TxnManager's set. One implementation alternative to
     * having an ActiveTxns map here is to search the TxnManager set (which is
     * a set, not a map) for a given ReplayTxn. Another is to subclass
     * TxnManager so that replicated nodes have their own replayTxn map, just
     * as XATxns have a XID->Txn map.
     *
     * Both alternatives seemed too costly in terms of performance or elaborate
     * in terms of code to do for the current function. It seems clearer to
     * make the ActiveTxns a map in the one place that it is currently
     * used. This choice may change over time, and should be reevaluated if the
     * implementation changes.
     *
     * The ActiveTxns key is the transaction id. These transactions are closed
     * when:
     * - the replay unit executes a commit received over the replication stream
     * - the replay unit executes an abort received over the replication stream
     * - the replication node realizes that it has just become the new master,
     *   and was not previously the master.
     *
     * Note that the Replay class has a lifetime that is longer than that of a
     * RepNode. This means in particular, that transactions may be left open,
     * and will be resumed when a replica switches from one master to another,
     * creating a new RepNode in the process. Because of that, part of the
     * transaction may be implemented by the rep stream from one master and
     * another part by another.
     *
     * The map is synchronized, so simple get/put operations do not require
     * additional synchronization.  However, iteration requires synchronization
     * and copyActiveTxns can be used in most cases.
     */
    private final SimpleTxnMap<ReplayTxn> activeTxns;

    /*
     * The entry representing the last replayed txn commit. Supports the
     * replica's notion of consistency.
     */
    private volatile TxnInfo lastReplayedTxn = null;

    /*
     * The last replayed entry of any kind. Supports PointConsistencyPolicy.
     */
    private volatile VLSN lastReplayedVLSN = null;

    /*
     * The last replayed DTVLSN in the stream. It's used to ensure that the
     * DTVLSNs in the stream are correctly sequenced.
     */
    private long lastReplayedDTVLSN = VLSN.NULL_VLSN_SEQUENCE;

    /*
     * The sync policy to be used in the absence of an ACK request. The replica
     * in this case has some latitude about how it syncs the commit.
     */
    private final SyncPolicy noAckSyncPolicy = SyncPolicy.NO_SYNC;

    /**
     *  The RepParams.REPLAY_LOGGING_THRESHOLD configured logging threshold.
     */
    private final long replayLoggingThresholdNs;

    /**
     * State that is reinitialized by the reset() method each time a replay
     * loop is started with a new feeder.
     */

    /**
     *  All writes (predominantly acks, identified by txn id) are queued here,
     *  so they do not block the replay thread.
     */
    private final BlockingQueue<Long> outputQueue;

    /**
     * Holds the state associated with group commits.
     */
    private final GroupCommit groupCommit;

    /* Maintains the statistics associated with stream replay. */
    private final StatGroup statistics;
    private final LongStat nCommits;
    private final LongStat nCommitAcks;
    private final LongStat nCommitSyncs;
    private final LongStat nCommitNoSyncs;
    private final LongStat nCommitWriteNoSyncs;
    private final LongStat nAborts;
    private final LongStat nNameLNs;
    private final LongStat nLNs;
    private final LongStat nMessageQueueOverflows;
    private final LongMinStat minCommitProcessingNanos;
    private final LongMaxStat maxCommitProcessingNanos;
    private final LongStat totalCommitProcessingNanos;
    private final LongStat latestCommitLagMs;

    /*
     * Time (ms) at which the currently tracked heartbeat request was put on
     * the replay queue, or 0 when no request is being tracked. Used together
     * with the request's master time to compute replay queue delays.
     */
    private volatile long heartbeatRequestEnqueueTime = 0;
    /* Master time of the tracked heartbeat request; matches it at dequeue. */
    private volatile long heartbeatRequestMasterNow = 0;
    private final LongAvgStat replayQueueAvgDelayMs;
    private final LatencyPercentileStat replayQueue95DelayMs;
    private final LatencyPercentileStat replayQueue99DelayMs;
    private final LongMaxStat replayQueueMaxDelayMs;

    /*
     * Time (ms) at which the tracked heartbeat response was put on the output
     * queue, or 0 when no response is being tracked.
     */
    private volatile long heartbeatResponseEnqueueTime = 0;
    private final LongAvgStat outputQueueAvgDelayMs;
    private final LatencyPercentileStat outputQueue95DelayMs;
    private final LatencyPercentileStat outputQueue99DelayMs;
    private final LongMaxStat outputQueueMaxDelayMs;

    private final LongAvgStat elapsedTxnAvgMs;
    private final LatencyPercentileStat elapsedTxn95Ms;
    private final LatencyPercentileStat elapsedTxn99Ms;
    private final LongMaxStat elapsedTxnMaxMs;

    private final Logger logger;
    /**
     * Creates the Replay module for this environment. Configures the ack
     * timeout and replay logging thresholds, allocates the async output
     * queue and the group-commit state, and registers all replay statistics.
     *
     * @param repImpl the replicated environment; must not be read-only
     * @param nameIdPair unused; retained for signature compatibility
     * @throws EnvironmentFailureException if the environment is read-only
     */
    public Replay(RepImpl repImpl,
                  @SuppressWarnings("unused") NameIdPair nameIdPair) {

        /*
         * This should have already been caught in
         * ReplicatedEnvironment.setupRepConfig, but it is checked here anyway
         * as an added sanity check. [#17643]
         */
        if (repImpl.isReadOnly()) {
            throw EnvironmentFailureException.unexpectedState
                ("Replay created with readonly ReplicatedEnvironment");
        }

        this.repImpl = repImpl;
        final DbConfigManager configManager = repImpl.getConfigManager();

        ackTimeoutLogThresholdNs = MILLISECONDS.toNanos(configManager.
            getDuration(RepParams.REPLICA_ACK_TIMEOUT));

        /*
         * The factor of 2 below is somewhat arbitrary. It should be > 1 X so
         * that the ReplicaOutputThread can completely process the buffered
         * messages in the face of a network drop and 2X to allow for
         * additional headroom and minimize the chances that the replay might
         * be blocked due to the limited queue length.
         */
        final int outputQueueSize = 2 *
            configManager.getInt(RepParams.REPLICA_MESSAGE_QUEUE_SIZE);
        outputQueue = new ArrayBlockingQueue<>(outputQueueSize);

        /*
         * The Replay module manages all write transactions and mimics a
         * writing application thread. When the node comes up, it populates
         * the activeTxn collection with ReplayTxns that were resurrected
         * at recovery time.
         */
        activeTxns = new SimpleTxnMap<>(1024);

        /*
         * Configure the data entry used for deletion to avoid fetching the
         * old data during deletion replay.
         */
        delDataEntry.setPartial(0, 0, true);

        logger = LoggerUtils.getLogger(getClass());
        statistics = new StatGroup(ReplayStatDefinition.GROUP_NAME,
                                   ReplayStatDefinition.GROUP_DESC);

        groupCommit = new GroupCommit(configManager);

        nCommits = new LongStat(statistics, N_COMMITS);
        nCommitAcks = new LongStat(statistics, N_COMMIT_ACKS);
        nCommitSyncs = new LongStat(statistics, N_COMMIT_SYNCS);
        nCommitNoSyncs = new LongStat(statistics, N_COMMIT_NO_SYNCS);
        nCommitWriteNoSyncs =
            new LongStat(statistics, N_COMMIT_WRITE_NO_SYNCS);
        nAborts = new LongStat(statistics, N_ABORTS);
        nNameLNs = new LongStat(statistics, N_NAME_LNS);
        nLNs = new LongStat(statistics, N_LNS);
        nMessageQueueOverflows =
            new LongStat(statistics, N_MESSAGE_QUEUE_OVERFLOWS);
        minCommitProcessingNanos =
            new LongMinZeroStat(statistics, MIN_COMMIT_PROCESSING_NANOS);
        maxCommitProcessingNanos =
            new LongMaxZeroStat(statistics, MAX_COMMIT_PROCESSING_NANOS);
        totalCommitProcessingNanos =
            new LongStat(statistics, TOTAL_COMMIT_PROCESSING_NANOS);
        latestCommitLagMs = new LongStat(statistics, LATEST_COMMIT_LAG_MS);

        replayQueueAvgDelayMs =
            new LongAvgStat(statistics, REPLAY_QUEUE_AVG_DELAY_MS);
        replayQueue95DelayMs = new LatencyPercentileStat(
            statistics, REPLAY_QUEUE_95_DELAY_MS, 0.95f);
        replayQueue99DelayMs = new LatencyPercentileStat(
            statistics, REPLAY_QUEUE_99_DELAY_MS, 0.99f);
        replayQueueMaxDelayMs = new LongMaxStat(
            statistics, REPLAY_QUEUE_MAX_DELAY_MS);

        outputQueueAvgDelayMs =
            new LongAvgStat(statistics, OUTPUT_QUEUE_AVG_DELAY_MS);
        outputQueue95DelayMs = new LatencyPercentileStat(
            statistics, OUTPUT_QUEUE_95_DELAY_MS, 0.95f);
        outputQueue99DelayMs = new LatencyPercentileStat(
            statistics, OUTPUT_QUEUE_99_DELAY_MS, 0.99f);
        outputQueueMaxDelayMs = new LongMaxStat(
            statistics, OUTPUT_QUEUE_MAX_DELAY_MS);

        elapsedTxnAvgMs = new LongAvgStat(statistics, ELAPSED_TXN_AVG_MS);
        elapsedTxn95Ms =
            new LatencyPercentileStat(statistics, ELAPSED_TXN_95_MS, 0.95f);
        elapsedTxn99Ms =
            new LatencyPercentileStat(statistics, ELAPSED_TXN_99_MS, 0.99f);
        elapsedTxnMaxMs = new LongMaxStat(statistics, ELAPSED_TXN_MAX_MS);

        replayLoggingThresholdNs = MILLISECONDS.toNanos(configManager.
           getDuration(RepParams.REPLAY_LOGGING_THRESHOLD));
    }

    /**
     * Note that a heartbeat request has been received over the network and is
     * about to be added to the replay queue. If no heartbeat request is being
     * tracked, note the enqueuing time and the heartbeat's master time. When
     * we dequeue this same request, as determined by matching the master time,
     * we will compute the time it spent in the queue and use that to tally
     * statistics about the replay queue delays.
     */
    void noteEnqueueHeartbeatRequest(Protocol.Heartbeat heartbeat) {
        /* Already tracking an earlier request; leave it undisturbed. */
        if (heartbeatRequestEnqueueTime != 0) {
            return;
        }
        /* Start tracking: record enqueue time and the request's master time. */
        heartbeatRequestEnqueueTime = System.currentTimeMillis();
        heartbeatRequestMasterNow = heartbeat.getMasterNow();
    }

    /**
     * Note that an incoming heartbeat request has been retrieved from the
     * replay queue in preparation for processing. If we are tracking a
     * heartbeat request and this request's master time matches the entry
     * being tracked, tally the replay queue delay statistics.
     *
     * Then, if no heartbeat response is being tracked, note the time of
     * enqueuing the heartbeat response to the output queue.
     *
     * @return whether to track the heartbeat response time
     */
    boolean noteDequeueHeartbeatRequest(Protocol.Heartbeat heartbeat) {
        long now = 0;
        if ((heartbeatRequestEnqueueTime != 0) &&
            (heartbeatRequestMasterNow == heartbeat.getMasterNow())) {
            now = System.currentTimeMillis();
            final long delay = now - heartbeatRequestEnqueueTime;
            replayQueueAvgDelayMs.add(delay);
            replayQueue95DelayMs.add(delay);
            replayQueue99DelayMs.add(delay);
            replayQueueMaxDelayMs.setMax(delay);
            /* Stop tracking this request. */
            heartbeatRequestEnqueueTime = 0;
        }
        if (heartbeatResponseEnqueueTime == 0) {
            /* Reuse the timestamp taken above when available. */
            heartbeatResponseEnqueueTime =
                (now == 0) ? System.currentTimeMillis() : now;
            return true;
        }
        return false;
    }

    /**
     * Note that an outgoing heartbeat response has been retrieved from the
     * output queue in preparation for sending it out over the network. Tally
     * output queue delay statistics if we were tracking a heartbeat response.
     */
    void noteDequeueHeartbeatResponse() {
        if (heartbeatResponseEnqueueTime != 0) {
            final long delay =
                System.currentTimeMillis() - heartbeatResponseEnqueueTime;
            outputQueueAvgDelayMs.add(delay);
            outputQueue95DelayMs.add(delay);
            outputQueue99DelayMs.add(delay);
            outputQueueMaxDelayMs.setMax(delay);
            heartbeatResponseEnqueueTime = 0;
        }
    }

    /**
     * Returns the queue of pending writes (predominantly ack txn ids) that
     * the output thread drains and sends over the network.
     */
    public BlockingQueue<Long> getOutputQueue() {
        return outputQueue;
    }

    /**
     * Reinitialize for replay from a new feeder
     */
    public void reset() {
        outputQueue.clear();
    }

    LongStat getMessageQueueOverflows() {
        return nMessageQueueOverflows;
    }

    /**
     * Actions that must be taken before the recovery checkpoint, whether
     * the environment is read/write or read/only.
     */
    public void preRecoveryCheckpointInit(RecoveryInfo recoveryInfo) {
        for (Txn txn : recoveryInfo.replayTxns.values()) {

            /*
             * ReplayTxns need to know about their owning activeTxn map,
             * so they can remove themselves at close.
             * We are casting upwards, because the non-HA code is prohibited
             * from referencing Replication classes, and the
             * RecoveryInfo.replayTxns collection doesn't know that it's got
             * ReplayTxns.
             */
            ((ReplayTxn) txn).registerWithActiveTxns(activeTxns);
        }

        lastReplayedVLSN = repImpl.getVLSNIndex().getRange().getLast();
    }

    /* Returns the last replayed commit/abort entry; see lastReplayedTxn. */
    public TxnInfo getLastReplayedTxn() {
        return lastReplayedTxn;
    }

    /* Returns the VLSN of the last replayed entry of any kind. */
    public VLSN getLastReplayedVLSN() {
        return lastReplayedVLSN;
    }

    /**
     * When mastership changes, all inflight replay transactions are aborted.
     * Replay transactions need only be aborted by the node that has become
     * the new master (who was previously a Replica). The replay transactions
     * on the other replicas who have not changed roles are
     * resolved by the abort record issued by said new master.
     */
    public void abortOldTxns()
        throws DatabaseException {

        final int masterNodeId = repImpl.getNodeId();

        for (ReplayTxn replayTxn : copyActiveTxns().values()) {
            /*
             * Use NULL for the DTVLSN since it's being written as the MASTER
             * despite being a ReplayTxn; it will be corrected when it's
             * written to the log.
             */
            replayTxn.abort(ReplicationContext.MASTER, masterNodeId,
                            VLSN.NULL_VLSN_SEQUENCE);
        }
        assert activeTxns.isEmpty() :
            "Unexpected txns in activeTxns = " + activeTxns;
    }

    /*
     * Tallies statistics for a replayed commit and logs commits whose replay
     * exceeded the ack timeout threshold.
     */
    private void updateCommitStats(final boolean needsAck,
                                   final SyncPolicy syncPolicy,
                                   final long startTimeNanos,
                                   final long masterCommitTimeMs,
                                   final long replicaCommitTimeMs,
                                   final long masterTxnTimeMs) {

        final long now = System.nanoTime();
        final long commitNanos = now - startTimeNanos;

        if (commitNanos > ackTimeoutLogThresholdNs &&
            logger.isLoggable(Level.INFO)) {
            LoggerUtils.info
                (logger, repImpl,
                 "Replay commit time: " + (commitNanos / 1000000) +
                 " ms exceeded log threshold: " +
                 (ackTimeoutLogThresholdNs / 1000000));
        }

        nCommits.increment();

        if (needsAck) {
            nCommitAcks.increment();
        }

        if (syncPolicy == SyncPolicy.SYNC) {
            nCommitSyncs.increment();
        } else if (syncPolicy == SyncPolicy.NO_SYNC) {
            nCommitNoSyncs.increment();
        } else if (syncPolicy == SyncPolicy.WRITE_NO_SYNC) {
            nCommitWriteNoSyncs.increment();
        } else {
            throw EnvironmentFailureException.unexpectedState
                ("Unknown sync policy: " + syncPolicy);
        }

        totalCommitProcessingNanos.add(commitNanos);
        minCommitProcessingNanos.setMin(commitNanos);
        maxCommitProcessingNanos.setMax(commitNanos);

        updateElapsedTxnStats(masterTxnTimeMs);

        /*
         * Tally the lag between master and replica commits, even if clock
         * skew makes the lag appear negative. The documentation already warns
         * that the value will be affected by clock skew, so users can adjust
         * for that, but only if we don't throw the information away.
         */
        final long replicaLagMs = replicaCommitTimeMs - masterCommitTimeMs;
        latestCommitLagMs.set(replicaLagMs);
    }

    /* Folds the master-side txn elapsed time into the elapsed-txn stats. */
    private void updateElapsedTxnStats(long masterTxnTimeMs) {
        elapsedTxnAvgMs.add(masterTxnTimeMs);
        elapsedTxn95Ms.add(masterTxnTimeMs);
        elapsedTxn99Ms.add(masterTxnTimeMs);
        elapsedTxnMaxMs.setMax(masterTxnTimeMs);
    }

    /**
     * Apply the operation represented by this log entry on this replica
     * node.
     */
    public void replayEntry(long startNs,
                            Protocol.Entry entry)
        throws DatabaseException,
               IOException,
               InterruptedException,
               MasterSyncException {

        final InputWireRecord wireRecord = entry.getWireRecord();
        final LogEntry logEntry = wireRecord.getLogEntry();

        /*
         * Sanity check that the replication stream is in sequence. We want to
         * forestall any possible corruption from replaying invalid entries.
         */
        if (!wireRecord.getVLSN().follows(lastReplayedVLSN)) {
            throw EnvironmentFailureException.unexpectedState
                (repImpl,
                 "Rep stream not sequential. Current VLSN: " +
                 lastReplayedVLSN +
                 " next log entry VLSN: " + wireRecord.getVLSN());
        }

        if (logger.isLoggable(Level.FINEST)) {
            LoggerUtils.finest(logger, repImpl, "Replaying " + wireRecord);
        }

        final ReplayTxn repTxn =
            getReplayTxn(logEntry.getTransactionId(), true);
        updateReplicaSequences(logEntry);
        final byte entryType = wireRecord.getEntryType();

        lastReplayedVLSN = wireRecord.getVLSN();

        try {
            final long txnId = repTxn.getId();

            if (LOG_TXN_COMMIT.equalsType(entryType)) {
                Protocol.Commit commitEntry = (Protocol.Commit) entry;

                final boolean needsAck = commitEntry.getNeedsAck();
                final SyncPolicy txnSyncPolicy =
                    commitEntry.getReplicaSyncPolicy();
                final SyncPolicy implSyncPolicy =
                    needsAck ?
                    groupCommit.getImplSyncPolicy(txnSyncPolicy) :
                    noAckSyncPolicy;

                logReplay(repTxn, needsAck, implSyncPolicy);

                final TxnCommit commit = (TxnCommit) logEntry.getMainItem();
                final long dtvlsn = updateDTVLSN(commit);

                if (needsAck) {

                    /*
                     * Only wait if the replica is not lagging and the
                     * durability requires it.
                     */
                    repImpl.getRepNode().getVLSNFreezeLatch().awaitThaw();
                    repImpl.getRepNode().getMasterStatus().assertSync();
                }

                repTxn.commit(implSyncPolicy,
                              new ReplicationContext(lastReplayedVLSN),
                              commit.getMasterNodeId(),
                              dtvlsn);

                final long masterCommitTimeMs = commit.getTime().getTime();
                lastReplayedTxn = new TxnInfo(lastReplayedVLSN,
                                              masterCommitTimeMs);

                updateCommitStats(needsAck, implSyncPolicy, startNs,
                                  masterCommitTimeMs, repTxn.getEndTime(),
                                  repTxn.elapsedTime());

                /* Respond to the feeder. */
                if (needsAck) {

                    /*
                     * Need an ack, either buffer it, for sync group commit,
                     * or queue it.
                     */
                    if (!groupCommit.bufferAck(startNs, repTxn,
                                               txnSyncPolicy)) {
                        queueAck(txnId);
                    }
                }

                /*
                 * The group refresh and recalculation can be expensive, since
                 * it may require a database read. Do it after the ack.
                 */
                if (repTxn.getRepGroupDbChange() && canRefreshGroup(repTxn)) {
                    repImpl.getRepNode().refreshCachedGroup();
                    repImpl.getRepNode().recalculateGlobalCBVLSN();
                }
            } else if (LOG_TXN_ABORT.equalsType(entryType)) {

                nAborts.increment();
                final TxnAbort abort = (TxnAbort) logEntry.getMainItem();
                final ReplicationContext abortContext =
                    new ReplicationContext(wireRecord.getVLSN());
                if (logger.isLoggable(Level.FINEST)) {
                    LoggerUtils.finest(logger, repImpl,
                                       "abort called for " + txnId +
                                       " masterId=" +
                                       abort.getMasterNodeId() +
                                       " repContext=" + abortContext);
                }
                long dtvlsn = updateDTVLSN(abort);
                repTxn.abort(abortContext, abort.getMasterNodeId(), dtvlsn);
                lastReplayedTxn = new TxnInfo(lastReplayedVLSN,
                                              abort.getTime().getTime());
                if (repTxn.getRepGroupDbChange() && canRefreshGroup(repTxn)) {

                    /*
                     * Refresh is the safe thing to do on an abort, since a
                     * refresh may have been held back from an earlier commit
                     * due to this active transaction.
                     */
                    repImpl.getRepNode().refreshCachedGroup();
                }
                updateElapsedTxnStats(repTxn.elapsedTime());
            } else if (LOG_NAMELN_TRANSACTIONAL.equalsType(entryType)) {

                repImpl.getRepNode().getReplica().clearDbTreeCache();
                nNameLNs.increment();
                applyNameLN(repTxn, wireRecord);
            } else {
                nLNs.increment();
                /* A data operation. */
                assert wireRecord.getLogEntry() instanceof LNLogEntry;
                applyLN(repTxn, wireRecord);
            }

            /* Remember the last VLSN applied by this txn. */
            repTxn.setLastAppliedVLSN(lastReplayedVLSN);
        } catch (DatabaseException e) {
            e.addErrorMessage("Problem seen replaying entry " + wireRecord);
            throw e;
        } finally {
            final long elapsedNs = System.nanoTime() - startNs;
            if (elapsedNs > replayLoggingThresholdNs) {
                LoggerUtils.info(logger, repImpl,
                                 "Replay time for entry type:" +
                                 LogEntryType.findType(entryType) + " " +
                                 NANOSECONDS.toMillis(elapsedNs) + "ms " +
                                 "exceeded threshold:" +
                                 NANOSECONDS.
                                     toMillis(replayLoggingThresholdNs) +
                                 "ms");
            }
        }
    }

    /**
     * Update the replica's in-memory DTVLSN using the value in the
     * commit/abort entry.
     *
     * In the normal course of events, DTVLSNs should not decrease. However,
     * there is just one exception: if the rep stream transitions from a post
     * to a pre-dtvlsn stream, it will transition from a positive to the
     * UNINITIALIZED_VLSN_SEQUENCE.
     *
     * A transition from a pre to a post-dtvlsn transition (from zero to some
     * positive value), observes the "DTVLSNs should not decrease" rule
     * automatically.
     *
     * @return the DTVLSN entry in the log txnEnd record so it can be used in
     * the commit/abort operation for replay
     */
    private long updateDTVLSN(final TxnEnd txnEnd) {
        final long txnDTVLSN = txnEnd.getDTVLSN();

        if (txnDTVLSN == VLSN.UNINITIALIZED_VLSN_SEQUENCE) {

            /*
             * A pre DTVLSN format entry, simply set it as the in-memory
             * DTVLSN
             */
            final long prevDTVLSN = repImpl.getRepNode().setDTVLSN(txnDTVLSN);
            if (prevDTVLSN != VLSN.UNINITIALIZED_VLSN_SEQUENCE) {
                LoggerUtils.info(logger, repImpl,
                                 "Transitioned to pre DTVLSN stream." +
                                 " DTVLSN:" + prevDTVLSN +
                                 " at VLSN:" + lastReplayedVLSN);
            }
            lastReplayedDTVLSN = txnDTVLSN;
            return txnDTVLSN;
        }

        /* Sanity check. */
        if (txnDTVLSN < lastReplayedDTVLSN) {
            String msg = "DTVLSNs must be in ascending order in the stream. " +
                " prev DTVLSN:" + lastReplayedDTVLSN +
                " next DTVLSN:" + txnDTVLSN +
                " at VLSN: " + lastReplayedVLSN.getSequence();
            throw EnvironmentFailureException.unexpectedState(repImpl, msg);
        }

        if ((lastReplayedDTVLSN == VLSN.UNINITIALIZED_VLSN_SEQUENCE) &&
            (txnDTVLSN > 0)) {
            LoggerUtils.info(logger, repImpl,
                             "Transitioned to post DTVLSN stream." +
                             " DTVLSN:" + txnDTVLSN +
                             " at VLSN:" + lastReplayedVLSN);
        }

        lastReplayedDTVLSN = txnDTVLSN;
        repImpl.getRepNode().setDTVLSN(txnDTVLSN);

        return txnDTVLSN;
    }

    /**
     * Queue the request ack for an async ack write to the network.
     */
    void queueAck(final long txnId)
        throws IOException {

        try {
            outputQueue.put(txnId);
        } catch (InterruptedException ie) {

            /*
             * Have the higher levels treat it like an IOE and
             * exit the thread.
             */
            throw new IOException("Ack I/O interrupted", ie);
        }
    }

    /**
     * Logs information associated with the replay of the txn commit
     */
    private void logReplay(ReplayTxn repTxn,
                           boolean needsAck,
                           SyncPolicy syncPolicy) {

        if (!logger.isLoggable(Level.FINE)) {
            return;
        }

        if (needsAck) {
            LoggerUtils.fine(logger, repImpl,
                             "Replay: got commit for txn=" + repTxn.getId() +
                             ", ack needed, replica sync policy=" +
                             syncPolicy +
                             " vlsn=" + lastReplayedVLSN);
        } else {
            LoggerUtils.fine(logger, repImpl,
                             "Replay: got commit for txn=" + repTxn.getId() +
                             " ack not needed" +
                             " vlsn=" + lastReplayedVLSN);
        }
    }

    /**
     * Returns true if there are no other activeTxns that have also modified
     * the membership database and are still open, since they could
     * potentially hold write locks that would block the read locks acquired
     * during the refresh operation.
     *
     * @param txn the current txn being committed or aborted
     *
     * @return true if there are no open transactions that hold locks on the
     * membership database.
*/ private boolean canRefreshGroup(ReplayTxn txn) { /* * Use synchronized rather than copyActiveTxns, since this is called * during replay and there is no nested locking to worry about. */ synchronized (activeTxns) { // TODO: very inefficient for (ReplayTxn atxn : activeTxns.getMap().values()) { if (atxn == txn) { continue; } if (atxn.getRepGroupDbChange()) { return false; } } } return true; } /** * Update this replica's node, txn and database sequences with any ids in * this log entry. We can call update, even if the replay id doesn't * represent a new lowest-id point, or if the apply is not successful, * because the apply checks that the replay id is < the sequence on the * replica. We just want to ensure that if this node becomes the master, * its sequences are in sync with what came before in the replication * stream, and ids are not incorrectly reused. */ private void updateReplicaSequences(LogEntry logEntry) { /* For now, we assume all replay entries have a txn id. */ repImpl.getTxnManager().updateFromReplay(logEntry.getTransactionId()); /* If it's a database operation, update the database id. */ if (logEntry instanceof NameLNLogEntry) { NameLNLogEntry nameLogEntry = (NameLNLogEntry) logEntry; nameLogEntry.postFetchInit(false /*isDupDb*/); NameLN nameLN = (NameLN) nameLogEntry.getLN(); repImpl.getDbTree().updateFromReplay(nameLN.getId()); } } /** * Obtain a ReplayTxn to represent the incoming operation. */ public ReplayTxn getReplayTxn(long txnId, boolean registerTxnImmediately) throws DatabaseException { ReplayTxn useTxn = null; synchronized (activeTxns) { useTxn = activeTxns.get(txnId); if (useTxn == null) { /* * Durability will be explicitly specified when * ReplayTxn.commit is called, so TransactionConfig.DEFAULT is * fine. 
                 */
                if (registerTxnImmediately) {
                    useTxn = new ReplayTxn(repImpl, TransactionConfig.DEFAULT,
                                           txnId, activeTxns, logger);
                } else {
                    /* Defer registration via the overridden hook below. */
                    useTxn = new ReplayTxn(repImpl, TransactionConfig.DEFAULT,
                                           txnId, activeTxns, logger) {
                        @Override
                        protected boolean registerImmediately() {
                            return false;
                        }
                    };
                }
            }
        }
        return useTxn;
    }

    /**
     * Replays the NameLN.
     *
     * Note that the operations: remove, rename and truncate need to establish
     * write locks on the database. Any open handles are closed by this
     * operation by virtue of the ReplayTxn's importunate property. The
     * application will receive a LockPreemptedException if it subsequently
     * accesses the database handle.
     */
    private void applyNameLN(ReplayTxn repTxn,
                             InputWireRecord wireRecord)
        throws DatabaseException {

        NameLNLogEntry nameLNEntry = (NameLNLogEntry) wireRecord.getLogEntry();
        final NameLN nameLN = (NameLN) nameLNEntry.getLN();

        String databaseName = StringUtils.fromUTF8(nameLNEntry.getKey());

        final DbOpReplicationContext repContext =
            new DbOpReplicationContext(wireRecord.getVLSN(), nameLNEntry);

        DbOperationType opType = repContext.getDbOperationType();

        DatabaseImpl dbImpl = null;
        try {
            switch (opType) {

                case CREATE: {
                    DatabaseConfig dbConfig =
                        repContext.getCreateConfig().getReplicaConfig(repImpl);

                    dbImpl = repImpl.getDbTree().createReplicaDb
                        (repTxn, databaseName, dbConfig, nameLN, repContext);

                    /*
                     * We rely on the RepGroupDB.DB_ID value, so make sure
                     * it's what we expect for this internal replicated
                     * database.
                     */
                    if ((dbImpl.getId().getId() == RepGroupDB.DB_ID) &&
                        !DbType.REP_GROUP.getInternalName().equals
                        (databaseName)) {
                        throw EnvironmentFailureException.unexpectedState
                            ("Database: " +
                             DbType.REP_GROUP.getInternalName() +
                             " is associated with id: " +
                             dbImpl.getId().getId() +
                             " and not the reserved database id: " +
                             RepGroupDB.DB_ID);
                    }

                    TriggerManager.runOpenTriggers(repTxn, dbImpl, true);
                    break;
                }

                case REMOVE: {
                    dbImpl = repImpl.getDbTree().getDb(nameLN.getId());
                    try {
                        repImpl.getDbTree().removeReplicaDb
                            (repTxn, databaseName, nameLN.getId(), repContext);
                        TriggerManager.runRemoveTriggers(repTxn, dbImpl);
                    } catch (DatabaseNotFoundException e) {
                        throw EnvironmentFailureException.unexpectedState
                            ("Database: " + dbImpl.getName() +
                             " Id: " + nameLN.getId() +
                             " not found on the Replica.");
                    }
                    break;
                }

                case TRUNCATE: {
                    /* Look up by the pre-truncate db id. */
                    dbImpl = repImpl.getDbTree().getDb
                        (repContext.getTruncateOldDbId());
                    try {
                        TruncateDbResult result =
                            repImpl.getDbTree().truncateReplicaDb
                            (repTxn, databaseName, false, nameLN, repContext);
                        TriggerManager.runTruncateTriggers(repTxn,
                                                           result.newDb);
                    } catch (DatabaseNotFoundException e) {
                        throw EnvironmentFailureException.unexpectedState
                            ("Database: " + dbImpl.getName() +
                             " Id: " + nameLN.getId() +
                             " not found on the Replica.");
                    }
                    break;
                }

                case RENAME: {
                    dbImpl = repImpl.getDbTree().getDb(nameLN.getId());
                    try {
                        dbImpl =
                            repImpl.getDbTree().renameReplicaDb
                            (repTxn, dbImpl.getName(), databaseName, nameLN,
                             repContext);
                        TriggerManager.runRenameTriggers(repTxn, dbImpl,
                                                         databaseName);
                    } catch (DatabaseNotFoundException e) {
                        throw EnvironmentFailureException.unexpectedState
                            ("Database rename from: " + dbImpl.getName() +
                             " to " + databaseName +
                             " failed, name not found on the Replica.");
                    }
                    break;
                }

                case UPDATE_CONFIG: {
                    /* Get the replicated database configurations. */
                    DatabaseConfig dbConfig =
                        repContext.getCreateConfig().getReplicaConfig(repImpl);

                    /* Update the NameLN and write it to the log.
                     */
                    dbImpl = repImpl.getDbTree().getDb(nameLN.getId());
                    final String dbName = dbImpl.getName();
                    repImpl.getDbTree().updateNameLN
                        (repTxn, dbName, repContext);

                    /* Set the new configurations to DatabaseImpl. */
                    dbImpl.setConfigProperties
                        (repTxn, dbName, dbConfig, repImpl);

                    /* Persist the updated config in the db root. */
                    repImpl.getDbTree().modifyDbRoot(dbImpl);
                    break;
                }

                default:
                    throw EnvironmentFailureException.unexpectedState
                        ("Illegal database op type of " + opType.toString() +
                         " from " + wireRecord + " database=" + databaseName);
            }
        } finally {
            if (dbImpl != null) {
                repImpl.getDbTree().releaseDb(dbImpl);
            }
        }
    }

    /**
     * Applies a replicated record (LN) operation -- put or delete -- in the
     * scope of the given replay txn.
     */
    private void applyLN(
        final ReplayTxn repTxn,
        final InputWireRecord wireRecord)
        throws DatabaseException {

        final LNLogEntry lnEntry = (LNLogEntry) wireRecord.getLogEntry();
        final DatabaseId dbId = lnEntry.getDbId();

        /*
         * If this is a change to the rep group db, remember at commit time,
         * and refresh this node's group metadata.
         */
        if (dbId.getId() == RepGroupDB.DB_ID) {
            repTxn.noteRepGroupDbChange();
        }

        /*
         * Note that we don't have to worry about serializable isolation when
         * applying a replicated txn; serializable isolation is only an issue
         * for txns that take read locks, and a replicated txn consists only
         * of write operations.
         */
        final DatabaseImpl dbImpl =
            repImpl.getRepNode().getReplica().getDbCache().get(dbId, repTxn);

        lnEntry.postFetchInit(dbImpl);

        final ReplicationContext repContext =
            new ReplicationContext(wireRecord.getVLSN());

        try (final Cursor cursor = DbInternal.makeCursor(
            dbImpl, repTxn, null /*cursorConfig*/)) {

            OperationResult result;
            final LN ln = lnEntry.getLN();

            /* In a dup DB, do not expect embedded LNs or non-empty data.
             */
            if (dbImpl.getSortedDuplicates() &&
                (lnEntry.isEmbeddedLN() ||
                 (ln.getData() != null && ln.getData().length > 0))) {
                throw EnvironmentFailureException.unexpectedState(
                    dbImpl.getEnv(),
                    "[#25288] emb=" + lnEntry.isEmbeddedLN() +
                    " key=" + Key.getNoFormatString(lnEntry.getKey()) +
                    " data=" + Key.getNoFormatString(ln.getData()) +
                    " vlsn=" + ln.getVLSNSequence());
            }

            if (ln.isDeleted()) {

                /*
                 * Perform an exact search by key. Use a partial data entry
                 * (delDataEntry) to avoid reading old data.
                 */
                replayKeyEntry.setData(lnEntry.getKey());

                result = DbInternal.searchForReplay(
                    cursor, replayKeyEntry, delDataEntry,
                    LockMode.RMW, SearchMode.SET);

                if (result != null) {
                    result = DbInternal.deleteWithRepContext(cursor,
                                                             repContext);
                }
            } else {
                replayKeyEntry.setData(lnEntry.getKey());
                replayDataEntry.setData(ln.getData());

                result = DbInternal.putForReplay(
                    cursor, replayKeyEntry, replayDataEntry, ln,
                    lnEntry.getExpiration(), lnEntry.isExpirationInHours(),
                    PutMode.OVERWRITE, repContext);
            }

            /*
             * For a deletion we expect the record to previously exist. Note
             * that the put call never fails since PutMode.OVERWRITE is used.
             *
             * Ignore deletion errors for records that are expired (within TTL
             * clock tolerance), since these can be deleted from the Btree by
             * background IN compression. A record may expire after the master
             * deletion op and before the replica replay, e.g., due to a
             * lagging replica or a replica that was down during the deletion.
             *
             * Also ignore deletion errors for EXTINCT and MAYBE_EXTINCT
             * records to account for apps that try to delete extinct records.
             * This is illegal but should not bring down the RN.
             */
            if (result == null) {

                final ExtinctionStatus extinctionState =
                    repImpl.getExtinctionState(dbImpl, lnEntry.getKey());

                final boolean isExpired = repImpl.expiresWithin(
                    lnEntry.getExpiration(), lnEntry.isExpirationInHours(),
                    repImpl.getTtlClockTolerance());

                final String expTime = (lnEntry.getExpiration() == 0) ?
                    "none" :
                    TTL.formatExpiration(
                        lnEntry.getExpiration(),
                        lnEntry.isExpirationInHours());

                if (!isExpired && extinctionState == NOT_EXTINCT) {
                    throw new EnvironmentFailureException(
                        repImpl,
                        EnvironmentFailureReason.LOG_INCOMPLETE,
                        "Replicated operation could not be applied. " +
                        " vlsn=" + wireRecord.getVLSN() +
                        " expirationTime=" + expTime +
                        // TODO remove key after debugging
                        " key=" + Key.dumpString(lnEntry.getKey(), 0) +
                        wireRecord);
                }
            }
        }

        /*
         * Queue new extinct scans for later processing. Must be done after
         * applying the LN so the record is present in the Btree when the
         * extinct scanner thread tries to read it.
         */
        if (lnEntry.getLogType().equals(
            LogEntryType.LOG_EXTINCT_SCAN_LN_TRANSACTIONAL)) {
            repImpl.getExtinctionScanner().replay(
                lnEntry.getKey(), lnEntry.getData());
        }
    }

    /**
     * Go through all active txns and rollback up to but not including the log
     * entry represented by the matchpoint VLSN.
     *
     * Effectively truncate these rolled back log entries by making them
     * invisible. Flush the log first, to make sure these log entries are out
     * of the log buffers and are on disk, so we can reliably find them
     * through the FileManager.
     *
     * Rollback steps are described in
     * https://sleepycat.oracle.com/trac/wiki/Logging#Recoverysteps. In
     * summary,
     *
     * 1. Log and fsync a new RollbackStart record
     * 2. Do the rollback in memory. There is no need to explicitly
     *    log INs made dirty by the rollback operation.
     * 3. Do invisibility masking by overwriting LNs.
     * 4. Fsync all overwritten log files at this point.
     * 5. Write a RollbackEnd record, for ease of debugging
     *
     * Note that application read txns can continue to run during syncup.
     * Reader txns cannot access records that are being rolled back, because
     * they are in txns that are not committed, i.e, they are write locked.
     * The rollback interval never includes committed txns, and we do a hard
     * recovery if it would include them.
     */
    public void rollback(VLSN matchpointVLSN, long matchpointLsn) {

        String rollbackStatus = RBSTATUS_START;

        /* Snapshot so the activeTxns mutex isn't held while iterating. */
        final Map localActiveTxns = copyActiveTxns();
        try {
            if (localActiveTxns.size() == 0) {
                /* no live read/write txns, nothing to do. */
                rollbackStatus = RBSTATUS_NO_ACTIVE;
                return;
            }

            VLSNRange range = repImpl.getVLSNIndex().getRange();
            if (range.getLast().equals(matchpointVLSN)) {
                /* nothing to roll back. */
                rollbackStatus = RBSTATUS_RANGE_EQUALS;
                return;
            }

            repImpl.setSyncupProgress(SyncupProgress.DO_ROLLBACK);

            /*
             * Stop the log file backup service, since the files will be in an
             * inconsistent state while the rollback is in progress.
             */
            repImpl.getRepNode().shutdownNetworkBackup();

            /*
             * Set repImpl's isRollingBack to true, and invalidate all the in
             * progress DbBackup.
             */
            repImpl.setBackupProhibited(true);
            repImpl.invalidateBackups(DbLsn.getFileNumber(matchpointLsn));

            /*
             * 1. Log RollbackStart. The fsync guarantees that this marker
             * will be present in the log for recovery. It also ensures that
             * all log entries will be flushed to disk and the TxnChain will
             * not have to worry about entries that are in log buffers when
             * constructing the rollback information.
             */
            LogManager logManager = repImpl.getLogManager();
            LogEntry rollbackStart = SingleItemEntry.create(
                LogEntryType.LOG_ROLLBACK_START,
                new RollbackStart(
                    matchpointVLSN, matchpointLsn, localActiveTxns.keySet()));
            long rollbackStartLsn =
                logManager.logForceFlush(rollbackStart,
                                         true, // fsyncRequired,
                                         ReplicationContext.NO_REPLICATE);
            rollbackStatus = RBSTATUS_LOG_RBSTART;

            /*
             * 2. Do rollback in memory. Undo any operations that were logged
             * after the matchpointLsn, and save the LSNs for those log
             * entries. There should be something to undo, because we checked
             * earlier that there were log entries after the matchpoint.
             */
            List rollbackLsns = new ArrayList();
            for (ReplayTxn replayTxn : localActiveTxns.values()) {
                Collection txnRollbackLsns =
                    replayTxn.rollback(matchpointLsn);

                /*
                 * Txns that were entirely rolled back should have been
                 * removed from the activeTxns map.
                 */
                assert checkRemoved(replayTxn) :
                    "Should have removed " + replayTxn;

                rollbackLsns.addAll(txnRollbackLsns);
            }
            rollbackStatus = RBSTATUS_MEM_ROLLBACK;
            assert rollbackLsns.size() != 0 : dumpActiveTxns(matchpointLsn);

            /*
             * 3 & 4 - Mark the rolled back log entries as invisible. But
             * before doing so, invoke any registered rewrite listeners, so
             * the application knows that existing log files will be modified.
             *
             * After all are done, fsync the set of files. By waiting, some
             * may have made it out on their own.
             */
            LogFileRewriteListener listener = repImpl.getLogRewriteListener();
            if (listener != null) {
                listener.rewriteLogFiles(getFileNames(rollbackLsns));
            }
            RollbackTracker.makeInvisible(repImpl, rollbackLsns);
            rollbackStatus = RBSTATUS_INVISIBLE;

            /*
             * 5. Log RollbackEnd. Flush it so that we can use it to optimize
             * recoveries later on. If the RollbackEnd exists, we can skip the
             * step of re-making LNs invisible.
             */
            logManager.logForceFlush(
                SingleItemEntry.create(LogEntryType.LOG_ROLLBACK_END,
                                       new RollbackEnd(matchpointLsn,
                                                       rollbackStartLsn)),
                true, // fsyncRequired
                ReplicationContext.NO_REPLICATE);

            /*
             * Restart the backup service only if all the steps of the
             * rollback were successful.
             */
            repImpl.getRepNode().restartNetworkBackup();
            repImpl.setBackupProhibited(false);
            rollbackStatus = RBSTATUS_FINISH;
        } finally {

            /* Reset the lastReplayedVLSN so it's correct when we resume.
             */
            lastReplayedVLSN = matchpointVLSN;
            LoggerUtils.info(logger, repImpl,
                             "Rollback to matchpoint " + matchpointVLSN +
                             " at " + DbLsn.getNoFormatString(matchpointLsn) +
                             " status=" + rollbackStatus);
        }
    }

    /* For debugging support */
    private String dumpActiveTxns(long matchpointLsn) {
        StringBuilder sb = new StringBuilder();
        sb.append("matchpointLsn=");
        sb.append(DbLsn.getNoFormatString(matchpointLsn));

        for (ReplayTxn replayTxn : copyActiveTxns().values()) {
            sb.append("txn id=").append(replayTxn.getId());
            sb.append(" locks=").append(replayTxn.getWriteLockIds());
            sb.append("lastLogged=");
            sb.append(DbLsn.getNoFormatString(replayTxn.getLastLsn()));
            sb.append("\n");
        }

        return sb.toString();
    }

    /*
     * Maps the given LSNs to the (de-duplicated) set of log files that
     * contain them.
     */
    private Set getFileNames(List lsns) {
        Set fileNums = new HashSet();
        Set files = new HashSet();

        for (long lsn : lsns) {
            fileNums.add(DbLsn.getFileNumber(lsn));
        }
        for (long fileNum : fileNums) {
            files.add(new File(FileManager.getFileName(fileNum)));
        }
        return files;
    }

    /*
     * Returns false only if a closed txn is still present in the activeTxns
     * map; open txns trivially pass the check.
     */
    private boolean checkRemoved(ReplayTxn txn) {
        if (txn.isClosed()) {
            if (activeTxns.get(txn.getId()) != null) {
                return false;
            }
        }

        return true;
    }

    /**
     * Make a copy of activeTxns to avoid holding its mutex while iterating.
     * Can be used whenever the cost of the HashMap copy is not significant.
     *
     * NOTE(review): this simply returns activeTxns.getMap(); presumably
     * SimpleTxnMap.getMap() returns a copy of the underlying map -- confirm
     * against SimpleTxnMap, since callers iterate without holding the
     * activeTxns mutex.
     */
    private Map copyActiveTxns() {
        return activeTxns.getMap();
    }

    /**
     * Release all transactions, database handles, etc held by the replay
     * unit. The Replicator is closing down and Replay will not be invoked
     * again.
     */
    public void close() {

        for (ReplayTxn replayTxn : copyActiveTxns().values()) {
            try {
                if (logger.isLoggable(Level.FINE)) {
                    LoggerUtils.fine(logger, repImpl,
                                     "Unregistering open replay txn: " +
                                     replayTxn.getId());
                }
                replayTxn.cleanup();
            } catch (DatabaseException e) {
                /* Best-effort cleanup; just log and continue. */
                LoggerUtils.fine(logger, repImpl,
                                 "Replay txn: " + replayTxn.getId() +
                                 " unregistration failed: " + e.getMessage());
            }
        }
        assert activeTxns.isEmpty();
    }

    /**
     * Returns a copy of the statistics associated with Replay
     */
    public StatGroup getStats(StatsConfig config) {
        StatGroup ret = statistics.cloneGroup(config.getClear());

        return ret;
    }

    public void resetStats() {
        statistics.clear();
    }

    /* For unit tests */
    public SimpleTxnMap getActiveTxns() {
        return activeTxns;
    }

    /* One-line summary of replay progress, for debugging. */
    public String dumpState() {
        StringBuilder sb = new StringBuilder();
        sb.append("lastReplayedTxn=").append(lastReplayedTxn);
        sb.append(" lastReplayedVLSN=").append(lastReplayedVLSN);
        sb.append(" numActiveReplayTxns=").append(activeTxns.size());
        sb.append("\n");
        return sb.toString();
    }

    /**
     * Write out any pending acknowledgments. See GroupCommit.flushPendingAcks
     * for details. This method is invoked after each log entry is read from
     * the replication stream.
     *
     * @param nowNs the time at the reading of the log entry
     */
    void flushPendingAcks(long nowNs)
        throws IOException {

        groupCommit.flushPendingAcks(nowNs);
    }

    /**
     * See GroupCommit.getPollIntervalNs(long)
     */
    long getPollIntervalNs(long defaultNs) {
        return groupCommit.getPollIntervalNs(defaultNs);
    }

    /**
     * Implements group commit. It's really a substructure of Replay and
     * exists mainly for modularity reasons.
     *
     * Since replay is single threaded, the group commit mechanism works
     * differently in the replica than in the master. In the replica, SYNC
     * transactions are converted into NO_SYNC transactions and executed
     * immediately, but their acknowledgments are delayed until after either
     * the REPLICA_GROUP_COMMIT_INTERVAL (the max amount the first transaction
     * in the group is delayed) has expired, or the size of the group (as
     * specified by REPLICA_MAX_GROUP_COMMIT) has been exceeded.
     */
    private class GroupCommit {

        /* Size determines max fsync commits that can be grouped. */
        private final long pendingCommitAcks[];

        /* Number of entries currently in pendingCommitAcks */
        private int nPendingAcks;

        /*
         * If this time limit is reached, the group will be forced to commit.
         * Invariant: nPendingAcks > 0 ==> limitGroupCommitNs > 0
         */
        private long limitGroupCommitNs = 0;

        /* The time interval that an open group is held back. */
        private final long groupCommitIntervalNs;

        private final LongStat nGroupCommitTimeouts;
        private final LongStat nGroupCommitMaxExceeded;
        private final LongStat nGroupCommits;
        private final LongStat nGroupCommitTxns;

        private GroupCommit(DbConfigManager configManager) {
            /* A zero-length array disables group commit entirely. */
            pendingCommitAcks = new long[configManager.
                getInt(RepParams.REPLICA_MAX_GROUP_COMMIT)];

            nPendingAcks = 0;

            final long groupCommitIntervalMs = configManager.
                getDuration(RepParams.REPLICA_GROUP_COMMIT_INTERVAL);

            groupCommitIntervalNs =
                NANOSECONDS.convert(groupCommitIntervalMs, MILLISECONDS);

            nGroupCommitTimeouts =
                new LongStat(statistics, N_GROUP_COMMIT_TIMEOUTS);

            nGroupCommitMaxExceeded =
                new LongStat(statistics, N_GROUP_COMMIT_MAX_EXCEEDED);

            nGroupCommitTxns =
                new LongStat(statistics, N_GROUP_COMMIT_TXNS);

            nGroupCommits = new LongStat(statistics, N_GROUP_COMMITS);
        }

        /**
         * Returns true if group commits are enabled at the replica.
         */
        private boolean isEnabled() {
            return pendingCommitAcks.length > 0;
        }

        /**
         * The interval used to poll for incoming log entries.
         * The time is lowered from the defaultNs time, if there are pending
         * acknowledgments.
         *
         * @param defaultNs the default poll interval
         *
         * @return the actual poll interval
         */
        private long getPollIntervalNs(long defaultNs) {
            if (nPendingAcks == 0) {
                return defaultNs;
            }
            final long now = System.nanoTime();

            /* Don't wait past the group-commit deadline. */
            final long interval = limitGroupCommitNs - now;
            return Math.min(interval, defaultNs);
        }

        /**
         * Returns the sync policy to be implemented at the replica. If
         * group commit is active, and SYNC is requested it will return
         * NO_SYNC instead to delay the fsync.
         *
         * @param txnSyncPolicy the sync policy as stated in the txn
         *
         * @return the sync policy to be implemented by the replica
         */
        private SyncPolicy getImplSyncPolicy(SyncPolicy txnSyncPolicy) {
            return ((txnSyncPolicy == SyncPolicy.SYNC) && isEnabled()) ?
                SyncPolicy.NO_SYNC : txnSyncPolicy;
        }

        /**
         * Buffers the acknowledgment if the commit calls for a sync, or if
         * there are pending acknowledgments to ensure that acks are sent
         * in order.
         *
         * @param nowNs the current time
         * @param ackTxn the txn associated with the ack
         * @param txnSyncPolicy the sync policy as requested by the committing
         * txn
         *
         * @return true if the ack has been buffered
         */
        private final boolean bufferAck(long nowNs,
                                        ReplayTxn ackTxn,
                                        SyncPolicy txnSyncPolicy)
            throws IOException {

            if (!isEnabled() ||
                !((txnSyncPolicy == SyncPolicy.SYNC) ||
                  (nPendingAcks > 0))) {
                return false;
            }

            pendingCommitAcks[nPendingAcks++] = ackTxn.getId();

            if (nPendingAcks == 1) {
                /* First txn in group, start the clock. */
                limitGroupCommitNs = nowNs + groupCommitIntervalNs;
            } else {
                /* May flush now if the group size limit has been reached. */
                flushPendingAcks(nowNs);
            }
            return true;
        }

        /**
         * Flush if there are pending acks and either the buffer limit or the
         * group interval has been reached.
         *
         * @param nowNs the current time (passed in to minimize system calls)
         */
        private final void flushPendingAcks(long nowNs)
            throws IOException {

            /* Nothing to flush, or neither size nor time limit is due. */
            if ((nPendingAcks == 0) ||
                ((nPendingAcks != pendingCommitAcks.length) &&
                 (NanoTimeUtil.compare(nowNs, limitGroupCommitNs) < 0))) {

                return;
            }

            /* Update statistics. */
            nGroupCommits.increment();
            nGroupCommitTxns.add(nPendingAcks);
            if (NanoTimeUtil.compare(nowNs, limitGroupCommitNs) >= 0) {
                nGroupCommitTimeouts.increment();
            } else if (nPendingAcks >= pendingCommitAcks.length) {
                nGroupCommitMaxExceeded.increment();
            }

            /* flush log buffer and fsync to disk */
            repImpl.getLogManager().flushSync();

            /* commits are on disk, send out acknowledgments on the network. */
            for (int i=0; i < nPendingAcks; i++) {
                queueAck(pendingCommitAcks[i]);
                pendingCommitAcks[i] = 0;
            }

            nPendingAcks = 0;
            limitGroupCommitNs = 0;
        }
    }

    /**
     * Simple helper class to package a Txn vlsn and its associated commit
     * time.
     */
    public static class TxnInfo {

        /* VLSN of the txn's commit/abort entry. */
        final VLSN txnVLSN;

        /* The txn end time as recorded at the master, in millis. */
        final long masterTxnEndTime;

        private TxnInfo(VLSN txnVLSN, long masterTxnEndTime) {
            this.txnVLSN = txnVLSN;
            this.masterTxnEndTime = masterTxnEndTime;
        }

        public VLSN getTxnVLSN() {
            return txnVLSN;
        }

        public long getMasterTxnEndTime() {
            return masterTxnEndTime;
        }

        @Override
        public String toString() {
            return " VLSN: " + txnVLSN +
                " masterTxnEndTime=" + new Date(masterTxnEndTime);
        }
    }
}





© 2015 - 2024 Weber Informatics LLC | Privacy Policy