com.sleepycat.je.recovery.Checkpointer Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of starrocks-bdb-je Show documentation
starrocks managed bdb je
The newest version!
/*-
 * Copyright (C) 2002, 2018, Oracle and/or its affiliates. All rights reserved.
 *
 * This file was distributed by Oracle as part of a version of Oracle Berkeley
 * DB Java Edition made available at:
 *
 * http://www.oracle.com/technetwork/database/database-technologies/berkeleydb/downloads/index.html
 *
 * Please see the LICENSE file included in the top-level directory of the
 * appropriate version of Oracle Berkeley DB Java Edition for a copy of the
 * license and additional information.
 */

package com.sleepycat.je.recovery;

import static com.sleepycat.je.recovery.CheckpointStatDefinition.CKPT_CHECKPOINTS;
import static com.sleepycat.je.recovery.CheckpointStatDefinition.CKPT_DELTA_IN_FLUSH;
import static com.sleepycat.je.recovery.CheckpointStatDefinition.CKPT_FULL_BIN_FLUSH;
import static com.sleepycat.je.recovery.CheckpointStatDefinition.CKPT_FULL_IN_FLUSH;
import static com.sleepycat.je.recovery.CheckpointStatDefinition.CKPT_LAST_CKPTID;
import static com.sleepycat.je.recovery.CheckpointStatDefinition.CKPT_LAST_CKPT_END;
import static com.sleepycat.je.recovery.CheckpointStatDefinition.CKPT_LAST_CKPT_INTERVAL;
import static com.sleepycat.je.recovery.CheckpointStatDefinition.CKPT_LAST_CKPT_START;
import static com.sleepycat.je.recovery.CheckpointStatDefinition.GROUP_DESC;
import static com.sleepycat.je.recovery.CheckpointStatDefinition.GROUP_NAME;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.logging.Level;

import com.sleepycat.je.CacheMode;
import com.sleepycat.je.CheckpointConfig;
import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.DiskLimitException;
import com.sleepycat.je.EnvironmentMutableConfig;
import com.sleepycat.je.StatsConfig;
import com.sleepycat.je.cleaner.Cleaner;
import com.sleepycat.je.cleaner.ExtinctionScanner;
import com.sleepycat.je.cleaner.FileSelector.CheckpointStartCleanerState;
import com.sleepycat.je.config.EnvironmentParams;
import com.sleepycat.je.dbi.DatabaseId;
import com.sleepycat.je.dbi.DatabaseImpl;
import com.sleepycat.je.dbi.DbConfigManager;
import com.sleepycat.je.dbi.DbTree;
import com.sleepycat.je.dbi.EnvConfigObserver;
import com.sleepycat.je.dbi.EnvironmentImpl;
import com.sleepycat.je.evictor.OffHeapCache;
import com.sleepycat.je.log.LogEntryType;
import com.sleepycat.je.log.LogManager;
import com.sleepycat.je.log.Provisional;
import com.sleepycat.je.log.ReplicationContext;
import com.sleepycat.je.log.entry.INLogEntry;
import com.sleepycat.je.log.entry.SingleItemEntry;
import com.sleepycat.je.tree.BIN;
import com.sleepycat.je.tree.ChildReference;
import com.sleepycat.je.tree.IN;
import com.sleepycat.je.tree.SearchResult;
import com.sleepycat.je.tree.Tree;
import com.sleepycat.je.tree.WithRootLatched;
import com.sleepycat.je.utilint.DaemonThread;
import com.sleepycat.je.utilint.DbLsn;
import com.sleepycat.je.utilint.LSNStat;
import com.sleepycat.je.utilint.LoggerUtils;
import com.sleepycat.je.utilint.LongStat;
import com.sleepycat.je.utilint.StatGroup;
import com.sleepycat.je.utilint.TestHook;
import com.sleepycat.je.utilint.TestHookExecute;
import com.sleepycat.je.utilint.VLSN;

/**
 * The Checkpointer looks through the tree for internal nodes that must be
 * flushed to the log. Checkpoint flushes must be done in ascending order from
 * the bottom of the tree up.
 *
 * Checkpoint and IN Logging Rules
 * -------------------------------
 * The checkpoint must log, and make accessible via non-provisional ancestors,
 * all INs that are dirty at CkptStart.  If we crash and recover from that
 * CkptStart onward, any IN that became dirty (before the crash) after the
 * CkptStart must become dirty again as the result of replaying the action that
 * caused it to originally become dirty.
 *
 * Therefore, when an IN is dirtied at some point in the checkpoint interval,
 * but is not logged by the checkpoint, the log entry representing the action
 * that dirtied the IN must follow either the CkptStart or the FirstActiveLSN
 * that is recorded in the CkptEnd entry.  The FirstActiveLSN is less than or
 * equal to the CkptStart LSN.  Recovery will process LNs between the
 * FirstActiveLSN and the end of the log.  Other entries are only processed
 * from the CkptStart forward.  And provisional entries are not processed.
 *
 * Example: Non-transactional LN logging.  We take two actions: 1) log the LN
 * and then 2) dirty the parent BIN.  What if the LN is logged before CkptStart
 * and the BIN is dirtied after CkptStart?  How do we avoid breaking the rules?
 * The answer is that we log the LN while holding the latch on the parent BIN,
 * and we don't release the latch until after we dirty the BIN.   The
 * construction of the checkpoint dirty map requires latching the BIN.  Since
 * the LN was logged before CkptStart, the BIN will be dirtied before the
 * checkpointer latches it during dirty map construction.  So the BIN will
 * always be included in the dirty map and logged by the checkpoint.
 *
 * Example: Abort.  We take two actions: 1) log the abort and then 2) undo the
 * changes, which modifies (dirties) the BIN parents of the undone LNs.  There
 * is nothing to prevent logging CkptStart in between these two actions, so how
 * do we avoid breaking the rules?  The answer is that we do not unregister the
 * transaction until after the undo phase.  So although the BINs may be dirtied
 * by the undo after CkptStart is logged, the FirstActiveLSN will be prior to
 * CkptStart.  Therefore, we will process the Abort and replay the action that
 * modifies the BINs.
 *
 * Exception: Lazy migration.  The log cleaner will make an IN dirty without
 * logging an action that makes it dirty.  This is an exception to the general
 * rule that actions should be logged when they cause dirtiness.   The reasons
 * this is safe are:
 * 1. The IN contents are not modified, so there is no information lost if the
 *    IN is never logged, or is logged provisionally and no ancestor is logged
 *    non-provisionally.
 * 2. If the IN is logged non-provisionally, this will have the side effect of
 *    recording the old LSN as being obsolete. However, the general rules for
 *    checkpointing and recovery will ensure that the new version is used in
 *    the Btree.  The new version will either be replayed by recovery or
 *    referenced in the active Btree via a non-provisional ancestor.
 *
 * Checkpoint Algorithm TODO update this
 * --------------------
 * The final checkpointDirtyMap field is used to hold (in addition to the dirty
 * INs) the state of the checkpoint and highest flush levels.  Access to this
 * object is synchronized so that eviction and checkpointing can access it
 * concurrently.  When a checkpoint is not active, the state is CkptState.NONE
 * and the dirty map is empty.  When a checkpoint runs, we do this:
 *
 * 1. Get set of files from cleaner that can be deleted after this checkpoint.
 * 2. Set checkpointDirtyMap state to DIRTY_MAP_INCOMPLETE, meaning that dirty
 *    map construction is in progress.
 * 3. Log CkptStart
 * 4. Construct dirty map, organized by Btree level, from dirty INs in INList.
 *    The highest flush levels are calculated during dirty map construction.
 *    Set checkpointDirtyMap state to DIRTY_MAP_COMPLETE.
 * 5. Flush INs in dirty map.
 *        + First, flush the bottom two levels a sub-tree at a time, where a
 *          sub-tree is one IN at level two and all its BIN children.  Higher
 *          levels (above level two) are logged strictly by level, not using
 *          subtrees.
 *              o If je.checkpointer.highPriority=false, we log one IN at a
 *                time, whether or not the IN is logged as part of a subtree,
 *                and do a Btree search for the parent of each IN.
 *              o If je.checkpointer.highPriority=true, for the bottom two
 *                levels we log each sub-tree in a single call to the
 *                LogManager with the parent IN latched, and we only do one
 *                Btree search for each level two IN.  Higher levels are logged
 *                one IN at a time as with highPriority=false.
 *        + The Provisional property is set as follows, depending on the level
 *          of the IN:
 *              o level is max flush level:  Provisional.NO
 *              o level is bottom level: Provisional.YES
 *              o Otherwise (middle levels): Provisional.BEFORE_CKPT_END
 *  6. Flush VLSNIndex cache to make VLSNIndex recoverable.
 *  7. Flush UtilizationTracker (write FileSummaryLNs) to persist all
 *     tracked obsolete offsets and utilization summary info, to make this info
 *     recoverable.
 *  8. Log CkptEnd
 *  9. Delete cleaned files from step 1.
 * 10. Set checkpointDirtyMap state to NONE.
 *
 * Per-DB Highest Flush Level
 * --------------------------
 * As mentioned above, when the dirty map is constructed we also determine the
 * highest flush level for each database.  This is the maximum Btree level at
 * which a dirty node exists in the DB.
 *
 * When logging a node below the maxFlushLevel, we add the parent to the dirty
 * map.  It may or may not have been added when the dirty map was constructed.
 * The idea is to flush all ancestors of all nodes in the dirty map, up to and
 * including the maxFlushLevel, even if those ancestors were not dirty when the
 * dirty map was constructed.
 *
 * This is done to avoid orphaning a dirty node as shown in this example.
 *
 *          IN-A       (root level=4)
 *         /    \
 *   (d) IN-B   IN-C   (maxFlushLevel=3)
 *               \
 *           (d) IN-D
 *
 * IN-C is not dirty (d) when the dirty map is constructed, but it will be
 * logged because its child (IN-D) is dirty, and it is not above maxFlushLevel.
 *
 * If IN-C were not logged, and there were a crash after the checkpoint, the
 * changes to IN-D would be lost.  IN-D would not be replayed by recovery
 * because it is logged provisionally, and it would not be accessible via its
 * parent.  This is because only nodes at maxFlushLevel are logged
 * non-provisionally.  The actions that led to the changes in IN-D may not be
 * replayed either, because they may appear before the firstActiveLsn
 * associated with the checkpoint.
 *
 * When log files are to be deleted at the end of the checkpoint (after being
 * processed by the log cleaner), the maxFlushLevel is increased by one.
 * This is to ensure that LSNs in deleted files will not be fetched during
 * recovery.  Such files are in the FileSelector.CLEANED state, which means
 * they have been processed by the cleaner since the last checkpoint.
 *
 * TODO: Document circumstances and motivation for the extra flush level.
 *
 * Lastly, for Database.sync or a checkpoint with MinimizeRecoveryTime
 * configured, we will flush all the way to the root rather than using the
 * maxFlushLevel computed as described above.
 *
 * Provisional.BEFORE_CKPT_END
 * ---------------------------
 * See Provisional.java for a description of the relationship between the
 * checkpoint algorithm above and the BEFORE_CKPT_END property.
 *
 * Coordination of Eviction and Checkpointing
 * ------------------------------------------
 * Eviction can proceed concurrently with all phases of a checkpoint, and
 * eviction may take place concurrently in multiple threads.  This concurrency
 * is crucial to avoid blocking application threads that perform eviction and
 * to reduce the amount of eviction required in application threads.
 *
 * Eviction calls Checkpointer.coordinateEvictionWithCheckpoint, which calls
 * DirtyINMap.coordinateEvictionWithCheckpoint, just before logging an IN.
 * coordinateEvictionWithCheckpoint returns whether the IN should be logged
 * provisionally (Provisional.YES) or non-provisionally (Provisional.NO).
 *
 * Other coordination necessary depends on the state of the checkpoint:
 *   + NONE: No additional action.
 *      o return Provisional.NO
 *   + DIRTY_MAP_INCOMPLETE: The parent IN is added to the dirty map, exactly
 *     as if it were encountered as dirty in the INList during dirty map
 *     construction.
 *      o IN is root: return Provisional.NO
 *      o IN is not root: return Provisional.YES
 *   + DIRTY_MAP_COMPLETE:
 *      o IN level GTE highest flush level: return Provisional.NO
 *      o IN level LT highest flush level: return Provisional.YES
 *
 * In general this is designed so that eviction will use the same provisional
 * value that would be used by the checkpoint, as if the checkpoint itself were
 * logging the IN.  However, there are several conditions where this is not
 * exactly the case.
 *
 * 1. Eviction may log an IN with Provisional.YES when the IN was not dirty at
 *    the time of dirty map creation, if it became dirty afterwards.  In this
 *    case, the checkpointer would not have logged the IN at all.  This is safe
 *    because the actions that made that IN dirty are logged in the recovery
 *    period.
 * 2. Eviction may log an IN with Provisional.YES after the checkpoint has
 *    logged it, if it becomes dirty again.  In this case the IN is logged
 *    twice, which would not have been done by the checkpoint alone.  This is
 *    safe because the actions that made that IN dirty are logged in the
 *    recovery period.
 * 3. An intermediate level IN (not bottom most and not the highest flush
 *    level) will be logged by the checkpoint with Provisional.BEFORE_CKPT_END
 *    but will be logged by eviction with Provisional.YES.  See below for why
 *    this is safe.
 * 4. Between checkpoint step 8 (log CkptEnd) and 10 (set checkpointDirtyMap
 *    state to NONE), eviction may log an IN with Provisional.YES, although a
 *    checkpoint is not strictly active during this interval.  See below for
 *    why this is safe.
 *
 * It is safe for eviction to log an IN as Provisional.YES for the last two
 * special cases, because this does not cause incorrect recovery behavior.  For
 * recovery to work properly, it is only necessary that:
 *
 *  + Provisional.NO is used for INs at the max flush level during an active
 *    checkpoint.
 *  + Provisional.YES or BEFORE_CKPT_END is used for INs below the max flush
 *    level, to avoid replaying an IN during recovery that may depend on a file
 *    deleted as the result of the checkpoint.
 *
 * You may ask why we don't use Provisional.YES for eviction when a checkpoint
 * is not active.  There are two reasons, both related to performance:
 *
 * 1. This would be wasteful when an IN is evicted in between checkpoints, and
 *    that portion of the log is processed by recovery later, in the event of a
 *    crash.  The evicted INs would be ignored by recovery, but the actions
 *    that caused them to be dirty would be replayed and the INs would be
 *    logged again redundantly.
 * 2. Logging an IN provisionally will not count the old LSN as obsolete
 *    immediately, so cleaner utilization will be inaccurate until a
 *    non-provisional parent is logged, typically by the next checkpoint.  It
 *    is always important to keep the cleaner from stalling and spiking, to
 *    keep latency and throughput as level as possible.
 *
 * Therefore, it is safe to log with Provisional.YES in between checkpoints,
 * but not desirable.
 *
 * Although we don't do this, it would be safe and optimal to evict with
 * BEFORE_CKPT_END in between checkpoints, because it would be treated by
 * recovery as if it were Provisional.NO.  This is because the interval between
 * checkpoints is only processed by recovery if it follows the last CkptEnd,
 * and BEFORE_CKPT_END is treated as Provisional.NO if the IN follows the last
 * CkptEnd.
 *
 * However, it would not be safe to evict an IN with BEFORE_CKPT_END during a
 * checkpoint, when logging of the IN's ancestors does not occur according to
 * the rules of the checkpoint.  If this were done, then if the checkpoint
 * completes and is used during a subsequent recovery, an obsolete offset for
 * the old version of the IN will mistakenly be recorded.  Below are two cases
 * where BEFORE_CKPT_END is used correctly and one showing how it could be used
 * incorrectly.
 *
 * 1. Correct use of BEFORE_CKPT_END when the checkpoint does not complete.
 *
 *        050 BIN-A
 *        060 IN-B parent of BIN-A
 *        100 CkptStart
 *        200 BIN-A logged with BEFORE_CKPT_END
 *        300 FileSummaryLN with obsolete offset for BIN-A at 050
 *        Crash and recover
 *
 *    Recovery will process BIN-A at 200 (it will be considered
 *    non-provisional) because there is no following CkptEnd.  It is
 *    therefore correct that BIN-A at 050 is obsolete.
 *
 * 2. Correct use of BEFORE_CKPT_END when the checkpoint does complete.
 *
 *        050 BIN-A
 *        060 IN-B parent of BIN-A
 *        100 CkptStart
 *        200 BIN-A logged with BEFORE_CKPT_END
 *        300 FileSummaryLN with obsolete offset for BIN-A at 050
 *        400 IN-B parent of BIN-A, non-provisional
 *        500 CkptEnd
 *        Crash and recover
 *
 *    Recovery will not process BIN-A at 200 (it will be considered
 *    provisional) because there is a following CkptEnd, but it will
 *    process its parent IN-B at 400, and therefore the BIN-A at 200 will be
 *    active in the tree.  It is therefore correct that BIN-A at 050 is
 *    obsolete.
 *
 * 3. Incorrect use of BEFORE_CKPT_END when the checkpoint does complete.
 *
 *        050 BIN-A
 *        060 IN-B parent of BIN-A
 *        100 CkptStart
 *        200 BIN-A logged with BEFORE_CKPT_END
 *        300 FileSummaryLN with obsolete offset for BIN-A at 050
 *        400 CkptEnd
 *        Crash and recover
 *
 *    Recovery will not process BIN-A at 200 (it will be considered
 *    provisional) because there is a following CkptEnd, but no parent
 *    IN-B is logged, and therefore the IN-B at 060 and BIN-A at 050 will be
 *    active in the tree.  It is therefore incorrect that BIN-A at 050 is
 *    obsolete.
 *
 * This last case is what caused the LFNF in SR [#19422], when BEFORE_CKPT_END
 * was mistakenly used for logging evicted BINs via CacheMode.EVICT_BIN.
 * During the checkpoint, we evict BIN-A and log it with BEFORE_CKPT_END, yet
 * neither it nor its parent are part of the checkpoint.  After being counted
 * obsolete, we crash and recover.  Then the file containing the BIN (BIN-A at
 * 050 above) is cleaned and deleted.  During cleaning, it is not migrated
 * because an obsolete offset was previously recorded.  The LFNF occurs when
 * trying to access this BIN during a user operation.
 *
 * CacheMode.EVICT_BIN
 * -------------------
 * Unlike in JE 4.0 where EVICT_BIN was first introduced, in JE 4.1 and later
 * we do not use special rules when an IN is evicted.  Since concurrent
 * eviction and checkpointing are supported in JE 4.1, the above rules apply to
 * EVICT_BIN as well as all other types of eviction.
 */
public class Checkpointer extends DaemonThread implements EnvConfigObserver {

    /**
     * For unit testing only.  Called before we flush the max level.  This
     * field is static because it is called from the static flushIN method.
     */
    private static TestHook maxFlushLevelHook = null;

    /*
     * Test hook; presumably for unit testing only, like maxFlushLevelHook.
     * NOTE(review): its call site is outside this section -- confirm.
     */
    private static TestHook beforeFlushHook = null;

    /*
     * Package-visible test hook; judging by the name it is consulted while
     * examining INs for the checkpoint.  NOTE(review): its call site is
     * outside this section -- confirm.
     */
    static TestHook examineINForCheckpointHook = null;

    /* Checkpoint sequence, initialized at recovery. */
    private long checkpointId;

    /*
     * How much the log should grow between checkpoints. If 0, we're using time
     * based checkpointing.
     */
    private final long logSizeBytesInterval;

    /*
     * Value of EnvironmentParams.LOG_FILE_MAX, read in the constructor and
     * passed to DbLsn.getNoCleaningDistance when measuring log growth.
     */
    private final long logFileMax;

    /*
     * The daemon wait time passed to the constructor; used by isRunnable as
     * the time interval when no other interval applies.
     */
    private final long timeInterval;

    /*
     * System.currentTimeMillis() recorded by doCheckpoint, or the value
     * supplied to initIntervals; zero until then.
     */
    private long lastCheckpointMillis;

    /*
     * Set to true by wakeupAfterNoWrites() before waking the daemon and reset
     * to false by onWakeup() after the checkpoint attempt.
     */
    private volatile boolean wakeupAfterNoWrites;

    /* Configured to true to minimize checkpoint duration. */
    private boolean highPriority;

    /*
     * Number of checkpoints performed; incremented by doCheckpoint and zeroed
     * by loadStats when the stats config requests a clear.
     */
    private long nCheckpoints;

    /* LSN reported as the CKPT_LAST_CKPT_START stat. */
    private long lastCheckpointStart;

    /* LSN reported as the CKPT_LAST_CKPT_END stat. */
    private long lastCheckpointEnd;

    /* Value reported as the CKPT_LAST_CKPT_INTERVAL stat. */
    private long lastCheckpointInterval;

    /* The firstActiveLsn of the last completed checkpoint. */
    private volatile long lastCheckpointFirstActiveLsn;

    /* Flush counters reported (and optionally cleared) by loadStats. */
    private final FlushStats flushStats;

    /**
     * The DirtyINMap for checkpointing is created once and is reset after each
     * checkpoint is complete.  Access to this object is synchronized so that
     * eviction and checkpointing can access it concurrently.
     */
    private final DirtyINMap checkpointDirtyMap;

    public Checkpointer(EnvironmentImpl envImpl,
                        long waitTime,
                        String name) {
        super(waitTime, name, envImpl);
        logSizeBytesInterval =
            envImpl.getConfigManager().getLong
                (EnvironmentParams.CHECKPOINTER_BYTES_INTERVAL);
        logFileMax =
            envImpl.getConfigManager().getLong(EnvironmentParams.LOG_FILE_MAX);
        timeInterval = waitTime;
        lastCheckpointMillis = 0;

        nCheckpoints = 0;
        flushStats = new FlushStats();

        checkpointDirtyMap = new DirtyINMap(envImpl);

        /* Initialize mutable properties and register for notifications. */
        envConfigUpdate(envImpl.getConfigManager(), null);
        envImpl.addConfigObserver(this);
    }

    /**
     * Reloads the mutable checkpointer properties from the config manager.
     * Currently this is only the high priority setting.
     *
     * @param cm the config manager to read the current values from.
     * @param ignore not used.
     */
    @Override
    public void envConfigUpdate(DbConfigManager cm,
                                EnvironmentMutableConfig ignore) {
        final boolean configuredHighPriority =
            cm.getBoolean(EnvironmentParams.CHECKPOINTER_HIGH_PRIORITY);
        highPriority = configuredHighPriority;
    }

    /**
     * Seeds the checkpoint interval state when the environment is opened
     * without performing a checkpoint.
     *
     * @param lastCheckpointStart LSN to record as the last checkpoint start.
     * @param lastCheckpointEnd LSN to record as the last checkpoint end.
     * @param lastCheckpointFirstActiveLsn firstActiveLsn to record for the
     * last checkpoint.
     * @param lastCheckpointMillis time, in millis, to record for the last
     * checkpoint.
     */
    void initIntervals(final long lastCheckpointStart,
                       final long lastCheckpointEnd,
                       final long lastCheckpointFirstActiveLsn,
                       final long lastCheckpointMillis) {

        /* LSN positions. */
        this.lastCheckpointStart = lastCheckpointStart;
        this.lastCheckpointEnd = lastCheckpointEnd;
        this.lastCheckpointFirstActiveLsn = lastCheckpointFirstActiveLsn;

        /* Wall clock time used for time based checkpointing. */
        this.lastCheckpointMillis = lastCheckpointMillis;
    }

    /**
     * Returns the firstActiveLsn recorded for the last completed checkpoint.
     *
     * @return the current value of the volatile lastCheckpointFirstActiveLsn
     * field.
     */
    public long getLastCheckpointFirstActiveLsn() {
        final long firstActiveLsn = lastCheckpointFirstActiveLsn;
        return firstActiveLsn;
    }

    /**
     * Coordinates an eviction with an in-progress checkpoint by delegating to
     * the checkpoint dirty map, and reports how the evicted target should be
     * logged.
     *
     * @param db the database containing the target.
     * @param targetLevel the Btree level of the target being evicted.
     * @param parent the parent IN of the target.
     *
     * @return the provisional status to use for logging the target.
     */
    public Provisional coordinateEvictionWithCheckpoint(
        final DatabaseImpl db,
        final int targetLevel,
        final IN parent) {

        final Provisional provisional =
            checkpointDirtyMap.coordinateEvictionWithCheckpoint(
                db, targetLevel, parent);

        return provisional;
    }

    /**
     * Coordinates a split with an in-progress checkpoint by delegating to the
     * checkpoint dirty map.
     *
     * @param newSibling the sibling IN created by the split.
     */
    public void coordinateSplitWithCheckpoint(final IN newSibling) {
        final DirtyINMap dirtyMap = checkpointDirtyMap;
        dirtyMap.coordinateSplitWithCheckpoint(newSibling);
    }

    /**
     * Computes the daemon wakeup period.  This is a static method because the
     * period must be passed to the superclass, so the calculation has to be
     * done outside the constructor.
     *
     * @param configManager the source of the checkpointer interval settings.
     *
     * @return the configured wakeup interval when no byte interval is
     * configured, or 0 when a byte interval is configured.
     *
     * @throws IllegalArgumentException via Environment ctor and
     * setMutableConfig, if both the wakeup interval and the byte interval
     * are 0.
     */
    public static long getWakeupPeriod(DbConfigManager configManager)
        throws IllegalArgumentException {

        final long timePeriod = configManager.getDuration(
            EnvironmentParams.CHECKPOINTER_WAKEUP_INTERVAL);
        final long sizePeriod = configManager.getLong(
            EnvironmentParams.CHECKPOINTER_BYTES_INTERVAL);

        final boolean sizeBased = (sizePeriod != 0);

        /* Checkpointing period must be set either by time or by log size. */
        if (!sizeBased && (timePeriod == 0)) {
            throw new IllegalArgumentException(
                EnvironmentParams.CHECKPOINTER_BYTES_INTERVAL.getName() +
                " and " +
                EnvironmentParams.CHECKPOINTER_WAKEUP_INTERVAL.getName() +
                " cannot both be 0. ");
        }

        /* Checkpointing by log size takes precedence over the time period. */
        return sizeBased ? 0 : timePeriod;
    }

    /**
     * Sets the checkpoint id -- can only be done after recovery.
     *
     * @param lastCheckpointId the id to store as the current checkpoint id.
     */
    synchronized void setCheckpointId(long lastCheckpointId) {
        this.checkpointId = lastCheckpointId;
    }

    /**
     * Builds a stat group describing the checkpointer.
     *
     * @param config if its clear flag is set, the checkpoint count and the
     * flush counters are zeroed after being captured in the returned group.
     *
     * @return a new group holding the current checkpointer stats.
     */
    @SuppressWarnings("unused")
    public StatGroup loadStats(StatsConfig config) {
        final StatGroup group = new StatGroup(GROUP_NAME, GROUP_DESC);

        /*
         * The stat objects are not retained here; presumably each one
         * attaches itself to the group passed to its constructor.
         */
        new LongStat(group, CKPT_LAST_CKPTID, checkpointId);
        new LongStat(group, CKPT_CHECKPOINTS, nCheckpoints);
        new LongStat(group, CKPT_LAST_CKPT_INTERVAL, lastCheckpointInterval);
        new LSNStat(group, CKPT_LAST_CKPT_START, lastCheckpointStart);
        new LSNStat(group, CKPT_LAST_CKPT_END, lastCheckpointEnd);
        new LongStat(group, CKPT_FULL_IN_FLUSH, flushStats.nFullINFlush);
        new LongStat(group, CKPT_FULL_BIN_FLUSH, flushStats.nFullBINFlush);
        new LongStat(group, CKPT_DELTA_IN_FLUSH, flushStats.nDeltaINFlush);

        if (!config.getClear()) {
            return group;
        }

        /* Clear was requested: reset the cumulative counters. */
        nCheckpoints = 0;
        flushStats.nFullINFlush = 0;
        flushStats.nFullBINFlush = 0;
        flushStats.nDeltaINFlush = 0;

        return group;
    }

    /**
     * Returns the number of retries to perform when a deadlock exception
     * occurs, as configured by EnvironmentParams.CHECKPOINTER_RETRY.
     */
    @Override
    protected long nDeadlockRetries() {
        final DbConfigManager configManager = envImpl.getConfigManager();
        return configManager.getInt(EnvironmentParams.CHECKPOINTER_RETRY);
    }

    /**
     * Called whenever the DaemonThread wakes up from a sleep.  Unless the
     * environment is closing, attempts a default checkpoint on behalf of the
     * daemon and then resets the wakeupAfterNoWrites flag.
     */
    @Override
    protected void onWakeup() {

        if (!envImpl.isClosing()) {

            doCheckpoint(
                CheckpointConfig.DEFAULT,
                "daemon",
                true /*invokedFromDaemon*/);

            /* The idle wakeup request, if any, has now been serviced. */
            wakeupAfterNoWrites = false;
        }
    }

    /**
     * Wakes up the checkpointer if a checkpoint log interval is configured,
     * the daemon is not already running, and the distance from the last
     * checkpoint start to the next LSN has reached the size of the interval.
     */
    public void wakeupAfterWrite() {

        /* Nothing to do when size based checkpointing is not configured. */
        if (logSizeBytesInterval == 0) {
            return;
        }

        if (isRunning()) {
            return;
        }

        final long nextLsn = envImpl.getFileManager().getNextLsn();

        final long bytesSinceCkptStart = DbLsn.getNoCleaningDistance(
            nextLsn, lastCheckpointStart, logFileMax);

        if (bytesSinceCkptStart >= logSizeBytesInterval) {
            wakeup();
        }
    }

    /**
     * Wakes up the checkpointer if a checkpoint is needed to reclaim disk
     * space for already cleaned files.  This method is called after an idle
     * period with no writes.  The wakeupAfterNoWrites flag is set before the
     * wakeup so that isRunnable can recognize this request.
     */
    public void wakeupAfterNoWrites() {

        if (isRunning() || !needCheckpointForCleanedFiles()) {
            return;
        }

        wakeupAfterNoWrites = true;
        wakeup();
    }

    /**
     * Returns whether the cleaner's file selector reports that a checkpoint
     * is needed.
     */
    private boolean needCheckpointForCleanedFiles() {
        final Cleaner cleaner = envImpl.getCleaner();
        return cleaner.getFileSelector().isCheckpointNeeded();
    }

    /**
     * Determines whether a checkpoint should be run.
     *
     * A checkpoint is runnable when any of the following holds:
     *  + the config forces it;
     *  + the daemon was woken after an idle period and the cleaner still
     *    needs a checkpoint for its cleaned files;
     *  + a size interval applies and the distance from the last checkpoint
     *    start to the next LSN has reached that interval;
     *  + a time interval applies, at least that much time has passed since
     *    lastCheckpointMillis, and the last used LSN differs from the last
     *    checkpoint end LSN.
     *
     * The interval is chosen in this order: the config's KBytes, the config's
     * Minutes, the environment's byte interval, and finally the daemon time
     * interval.
     *
     * The decision and its inputs are traced at Level.FINEST.
     *
     * @param config the checkpoint configuration supplied by the caller.
     *
     * @return true if a checkpoint should be performed now.
     */
    private boolean isRunnable(CheckpointConfig config) {
        /* Figure out if we're using log size or time to determine interval.*/
        long useBytesInterval = 0;
        long useTimeInterval = 0;
        long nextLsn = DbLsn.NULL_LSN;
        boolean runnable = false;
        try {
            if (config.getForce()) {
                runnable = true;
                return true;
            }

            if (wakeupAfterNoWrites && needCheckpointForCleanedFiles()) {
                runnable = true;
                return true;
            }

            if (config.getKBytes() != 0) {
                /*
                 * Convert to bytes.  Widen to long before shifting so that
                 * large KBytes values cannot overflow int arithmetic and
                 * wrap to a negative or zero interval.
                 */
                useBytesInterval = ((long) config.getKBytes()) << 10;

            } else if (config.getMinutes() != 0) {
                /*
                 * Convert to millis, multiplying in long arithmetic so that
                 * large Minutes values cannot overflow.
                 */
                useTimeInterval = config.getMinutes() * 60L * 1000L;

            } else if (logSizeBytesInterval != 0) {
                useBytesInterval = logSizeBytesInterval;

            } else {
                useTimeInterval = timeInterval;
            }

            /*
             * If our checkpoint interval is defined by log size, check on how
             * much log has grown since the last checkpoint.
             */
            if (useBytesInterval != 0) {
                nextLsn = envImpl.getFileManager().getNextLsn();

                if (DbLsn.getNoCleaningDistance(
                    nextLsn, lastCheckpointStart, logFileMax) >=
                    useBytesInterval) {

                    runnable = true;
                }

            } else if (useTimeInterval != 0) {

                /*
                 * Our checkpoint is determined by time.  If enough time has
                 * passed and some log data has been written, do a checkpoint.
                 */
                final long lastUsedLsn =
                    envImpl.getFileManager().getLastUsedLsn();

                if (((System.currentTimeMillis() - lastCheckpointMillis) >=
                     useTimeInterval) &&
                    (DbLsn.compareTo(lastUsedLsn, lastCheckpointEnd) != 0)) {

                    runnable = true;
                }
            }
            return runnable;

        } finally {
            /* Trace the inputs and the outcome, whichever path returned. */
            if (logger.isLoggable(Level.FINEST)) {
                final StringBuilder sb = new StringBuilder();
                sb.append("size interval=").append(useBytesInterval);
                if (nextLsn != DbLsn.NULL_LSN) {
                    sb.append(" nextLsn=").
                        append(DbLsn.getNoFormatString(nextLsn));
                }
                if (lastCheckpointEnd != DbLsn.NULL_LSN) {
                    sb.append(" lastCkpt=");
                    sb.append(DbLsn.getNoFormatString(lastCheckpointEnd));
                }
                sb.append(" time interval=").append(useTimeInterval);
                sb.append(" force=").append(config.getForce());
                sb.append(" runnable=").append(runnable);

                LoggerUtils.finest(logger, envImpl, sb.toString());
            }
        }
    }

    /**
     * The real work to do a checkpoint. This may be called by the checkpoint
     * thread when waking up, or it may be invoked programmatically through
     * the API.
     *
     * @param invokingSource a debug aid, to indicate who invoked this
     *       checkpoint. (e.g. recovery, the checkpointer daemon, the cleaner,
     *       programmatically)
     */
    public synchronized void doCheckpoint(CheckpointConfig config,
                                          String invokingSource,
                                          boolean invokedFromDaemon) {
        if (envImpl.isReadOnly()) {
            return;
        }

        if (!isRunnable(config)) {
            return;
        }

        /* Stop if we cannot write because of a disk limit violation. */
        try {
            envImpl.checkDiskLimitViolation();
        } catch (DiskLimitException e) {
            if (!invokedFromDaemon) {
                throw e;
            }
            return;
        }

        /*
         * If minimizing recovery time is desired, then flush all the way to
         * the top of the dbtree instead of stopping at the highest level last
         * modified, so that only the root INs are processed by recovery.
         */
        final boolean flushAll = config.getMinimizeRecoveryTime();

        /*
         * If there are cleaned files to be deleted, flush an extra level to
         * write out the parents of cleaned nodes.  This ensures that no node
         * will contain the LSN of a cleaned file.
         */
        final Cleaner cleaner = envImpl.getCleaner();

        final CheckpointStartCleanerState cleanerState =
            cleaner.getFilesAtCheckpointStart();

        /*
         * Any scans that are complete before the checkpoint, can be deleted
         * after the checkpoint.
         */
        final ExtinctionScanner extinctionScanner =
            envImpl.getExtinctionScanner();

        final Set completedScans =
            extinctionScanner.getCompletedRecordScans();

        final boolean flushExtraLevel = !cleanerState.isEmpty();

        lastCheckpointMillis = System.currentTimeMillis();
        flushStats.resetPerRunCounters();

        /* Get the next checkpoint id. */
        checkpointId++;
        nCheckpoints++;

        boolean success = false;
        boolean traced = false;

        final LogManager logManager = envImpl.getLogManager();

        /*
         * Set the checkpoint state so that concurrent eviction can be
         * coordinated.
         */
        checkpointDirtyMap.beginCheckpoint(flushAll, flushExtraLevel);

        try {
            /* Log the checkpoint start. */
            final SingleItemEntry startEntry =
                SingleItemEntry.create(
                    LogEntryType.LOG_CKPT_START,
                    new CheckpointStart(checkpointId, invokingSource));

            final long checkpointStart =
                logManager.log(startEntry, ReplicationContext.NO_REPLICATE);

            /*
             * Note the first active LSN point. The definition of
             * firstActiveLsn is that all log entries for active transactions
             * are equal to or after that LSN.  This is the starting point for
             * replaying LNs during recovery and will be stored in the CkptEnd
             * entry.
             *
             * Use the checkpointStart as the firstActiveLsn if firstActiveLsn
             * is null, meaning that no txns are active.
             *
             * The current value must be retrieved from TxnManager after
             * logging CkptStart. If it were instead retrieved before logging
             * CkptStart, the following failure could occur.  [#20270]
             *
             *  ... getFirstActiveLsn returns NULL_LSN, will use 200 CkptStart
             *  100 LN-A in Txn-1
             *  200 CkptStart
             *  300 BIN-B refers to 100 LN-A
             *  400 CkptEnd
             *  ... Crash and recover.  Recovery does not undo 100 LN-A.
             *  ... Txn-1 is uncommitted, yet 100 LN-A takes effect.
             */
            long firstActiveLsn = envImpl.getTxnManager().getFirstActiveLsn();
            if (firstActiveLsn == DbLsn.NULL_LSN) {
                firstActiveLsn = checkpointStart;
            }

            /*
             * In a replicated system, the checkpointer will be flushing out
             * the VLSNIndex, which is HA metadata. Check that the in-memory
             * version encompasses all metadata up to the point of the
             * CheckpointStart record. This is no-op for non-replicated
             * systems. [#19754]
             */
            envImpl.awaitVLSNConsistency();

            /* Find the set of dirty INs that must be logged. */
            checkpointDirtyMap.selectDirtyINsForCheckpoint();

            /* Call hook after dirty map creation and before flushing. */
            TestHookExecute.doHookIfSet(beforeFlushHook);

            /* Flush IN nodes. */
            flushDirtyNodes(
                envImpl, checkpointDirtyMap, checkpointStart, highPriority,
                flushStats);

            if (DirtyINMap.DIRTY_SET_DEBUG_TRACE) {
                LoggerUtils.logMsg(
                    envImpl.getLogger(), envImpl, Level.INFO,
                    "Ckpt flushed" +
                        " nFullINFlushThisRun = " +
                        flushStats.nFullINFlushThisRun +
                        " nFullBINFlushThisRun = " +
                        flushStats.nFullBINFlushThisRun +
                        " nDeltaINFlushThisRun = " +
                        flushStats.nDeltaINFlushThisRun);

            }

            /*
             * Flush MapLNs if not already done by flushDirtyNodes.  Only flush
             * a database if it has not already been flushed since checkpoint
             * start.  Lastly, flush the DB mapping tree root.
             */
            checkpointDirtyMap.flushMapLNs(checkpointStart);
            checkpointDirtyMap.flushRoot(checkpointStart);

            /*
             * Flush replication information if necessary so that the VLSNIndex
             * cache is flushed and is recoverable.
             */
            envImpl.preCheckpointEndFlush();

            /*
             * Flush utilization info AFTER flushing IN nodes to reduce the
             * inaccuracies caused by the sequence FileSummaryLN-LN-BIN. This
             * also reduces the chance of lost IN obsolete info when there is
             * a crash after logging the non-provisional INs but before
             * logging the FileSummaryLNs.
             */
            envImpl.getUtilizationProfile().flushFileUtilization
                (envImpl.getUtilizationTracker().getTrackedFiles());

            final DbTree dbTree = envImpl.getDbTree();
            final boolean willDeleteFiles = !cleanerState.isEmpty();

            final CheckpointEnd ckptEnd = new CheckpointEnd(
                invokingSource, checkpointStart, envImpl.getRootLsn(),
                firstActiveLsn,
                envImpl.getNodeSequence().getLastLocalNodeId(),
                envImpl.getNodeSequence().getLastReplicatedNodeId(),
                dbTree.getLastLocalDbId(), dbTree.getLastReplicatedDbId(),
                envImpl.getTxnManager().getLastLocalTxnId(),
                envImpl.getTxnManager().getLastReplicatedTxnId(),
                envImpl.getExtinctionScanner().getLastLocalId(),
                envImpl.getExtinctionScanner().getLastReplicatedId(),
                checkpointId, willDeleteFiles);

            final SingleItemEntry endEntry =
                SingleItemEntry.create(LogEntryType.LOG_CKPT_END, ckptEnd);

            /*
             * Log checkpoint end and update state kept about the last
             * checkpoint location. Send a trace message *before* the
             * checkpoint end log entry. This is done so that the normal trace
             * message doesn't affect the time-based isRunnable() calculation,
             * which only issues a checkpoint if a log record has been written
             * since the last checkpoint.
             */
            trace(envImpl, invokingSource, true);
            traced = true;

            lastCheckpointInterval = DbLsn.getNoCleaningDistance(
                checkpointStart, lastCheckpointStart, logFileMax);

            /*
             * We must flush and fsync to ensure that cleaned files are not
             * referenced. This also ensures that this checkpoint is not wasted
             * if we crash.
             */
            lastCheckpointEnd = logManager.logForceFlush(
                endEntry, true /*fsyncRequired*/,
                ReplicationContext.NO_REPLICATE);

            lastCheckpointStart = checkpointStart;
            lastCheckpointFirstActiveLsn = firstActiveLsn;

            success = true;
            cleaner.updateFilesAtCheckpointEnd(cleanerState);
            extinctionScanner.deleteCompletedRecordScans(completedScans);

        } catch (DiskLimitException e) {

            LoggerUtils.logMsg(
                envImpl.getLogger(), envImpl, Level.WARNING,
                "Ckpt id=" + checkpointId + " success=" + success +
                " aborted because of disk limit violation: " + e);

            if (!invokedFromDaemon) {
                throw e;
            }

        } catch (DatabaseException e) {
            LoggerUtils.traceAndLogException(envImpl, "Checkpointer",
                                             "doCheckpoint", "checkpointId=" +
                                             checkpointId, e);
            throw e;
        } finally {

            /*
             * Reset the checkpoint state so evictor activity knows there's no
             * further requirement for provisional logging. SR 11163.
             */
            checkpointDirtyMap.reset();

            if (!traced) {
                trace(envImpl, invokingSource, success);
            }
        }
    }

    /**
     * Logs a one-line, CONFIG level summary of the current checkpoint: its
     * id, the invoking source, the outcome, and the number of full and delta
     * INs flushed during this run.
     *
     * @param envImpl the environment passed through to the logging utility.
     *
     * @param invokingSource the debug label identifying who requested the
     *       checkpoint.
     *
     * @param success the outcome to report in the message.
     */
    private void trace(EnvironmentImpl envImpl,
                       String invokingSource,
                       boolean success ) {

        final String summary =
            "Checkpoint " + checkpointId +
            ": source=" + invokingSource +
            " success=" + success +
            " nFullINFlushThisRun=" + flushStats.nFullINFlushThisRun +
            " nDeltaINFlushThisRun=" + flushStats.nDeltaINFlushThisRun;

        LoggerUtils.logMsg(logger, envImpl, Level.CONFIG, summary);
    }

    /**
     * Writes the dirty INs of a single database to the log. As with a
     * checkpoint, nodes are logged bottom-up so that each parent correctly
     * refers to its logged children.
     *
     * Nothing is done for a read-only environment, and a disk limit violation
     * is reported to the caller before any work starts.
     *
     * @param envImpl the environment that owns the database.
     *
     * @param dbImpl the database whose dirty INs are flushed.
     *
     * @param flushLog if true, and at least one dirty IN was found, the log
     *       is synced after the INs are written.
     */
    public void syncDatabase(EnvironmentImpl envImpl,
                             DatabaseImpl dbImpl,
                             boolean flushLog) {
        if (envImpl.isReadOnly()) {
            return;
        }

        envImpl.checkDiskLimitViolation();

        final DirtyINMap syncMap = new DirtyINMap(envImpl);
        final FlushStats syncStats = new FlushStats();

        try {
            /* Collect the dirty INs belonging to this database. */
            syncMap.selectDirtyINsForDbSync(dbImpl);

            /* An empty dirty set means there is nothing to write or sync. */
            if (syncMap.getNumEntries() <= 0) {
                return;
            }

            /* Log the dirty INs; this is not part of a checkpoint. */
            flushDirtyNodes(
                envImpl, syncMap, DbLsn.NULL_LSN  /*ckptStart*/,
                false /*highPriority*/, syncStats);

            /* Make changes durable. [#15254] */
            if (flushLog) {
                envImpl.getLogManager().flushSync();
            }
        } catch (DatabaseException e) {

            /* Disk limit violations are re-thrown without being traced. */
            if (!(e instanceof DiskLimitException)) {
                LoggerUtils.traceAndLogException
                    (envImpl, "Checkpointer", "syncDatabase",
                     "of " + dbImpl.getName(), e);
            }
            throw e;
        } finally {
            syncMap.reset();
        }
    }

    /**
     * For unit testing only.  Installs the hook that flushIN executes, from
     * within an assertion, when the target node's level is not below the max
     * flush level.
     *
     * @param hook the hook to install.
     */
    public static void setMaxFlushLevelHook(final TestHook hook) {
        maxFlushLevelHook = hook;
    }

    /**
     * For unit testing only.  Installs the hook that doCheckpoint executes
     * after the dirty IN set has been selected and before any node is
     * flushed.
     *
     * @param hook the hook to install.
     */
    public static void setBeforeFlushHook(final TestHook hook) {
        beforeFlushHook = hook;
    }

    /**
     * Writes out the nodes in the dirty map one level at a time, always
     * taking the lowest level that is still present.  Flushing a node dirties
     * its parent, which is then added to the dirty map, so the writes cascade
     * up the tree.  Unless flushAll was specified, the cascade stops at the
     * highest level that was set when the checkpoint began.
     *
     * All INs except those at the top level are logged provisionally: recovery
     * has no need to process the lower INs because the higher INs will end up
     * pointing at them.
     *
     * FileSummaryLNs/MapLNs are deliberately not flushed here (there is no
     * call to UtilizationProfile.flushLocalTracker) because that flushing is
     * already done by the checkpoint.
     *
     * @param envImpl the environment being flushed.
     *
     * @param dirtyMap the set of nodes to log; it is drained by this method.
     *
     * @param checkpointStart the LSN passed to flushMapLNs when the mapping
     *       tree level is reached.
     *
     * @param highPriority passed through to flushIN.
     *
     * @param fstats counters that are updated as nodes are logged.
     */
    private static void flushDirtyNodes(EnvironmentImpl envImpl,
                                        DirtyINMap dirtyMap,
                                        long checkpointStart,
                                        boolean highPriority,
                                        FlushStats fstats) {

        final DbTree dbTree = envImpl.getDbTree();
        final Map dbCache = new HashMap<>();

        try {
            while (dirtyMap.getNumLevels() > 0) {

                /* Take the levels in ascending order, one at a time. */
                final Integer level = dirtyMap.getLowestLevelSet();
                final int levelValue = level.intValue();

                /*
                 * MapLNs are flushed immediately before the first level of
                 * the mapping tree.  A database is only flushed if that has
                 * not already happened since checkpoint start.
                 */
                if (levelValue == IN.DBMAP_LEVEL) {
                    dirtyMap.flushMapLNs(checkpointStart);
                }

                /* Drain every node recorded for this level. */
                for (CheckpointReference ref = dirtyMap.removeNextNode(level);
                     ref != null;
                     ref = dirtyMap.removeNextNode(level)) {

                    envImpl.checkDiskLimitViolation();

                    /*
                     * The DB may have been deleted after the node was placed
                     * in the dirty map.  Getting it here also prevents it
                     * from being deleted while we are working with it.
                     */
                    final DatabaseImpl database = dbTree.getDb(
                        ref.dbId, -1 /*lockTimeout*/, dbCache);

                    if (database != null) {

                        /* Only flush levels up to the DB's flush ceiling. */
                        final int flushCeiling =
                            dirtyMap.getHighestFlushLevel(database);

                        if (levelValue <= flushCeiling) {

                            flushIN(
                                database, ref, dirtyMap, flushCeiling,
                                highPriority, fstats, true /*allowLogSubtree*/);

                            /*
                             * Sleep if background read/write limit was
                             * exceeded.
                             */
                            envImpl.sleepAfterBackgroundIO();
                        }
                    }

                    /*
                     * Stop, by re-throwing the invalidating exception, if
                     * other activity invalidated the environment; the
                     * checkpoint did not succeed in that case.
                     */
                    envImpl.checkIfInvalid();
                }

                /* This level is finished. */
                dirtyMap.removeLevel(level);
            }
        } finally {
            dbTree.releaseDbs(dbCache);
        }
    }

    /**
     * Flush the target IN.
     *
     * Where applicable, also attempt to flush the subtree that houses this
     * target, which means we flush the siblings of this target to promote
     * better cleaning throughput. The problem lies in the fact that
     * provisionally logged nodes are not available for log cleaning until
     * their parent is logged non-provisionally.  On the other hand, we want to
     * log nodes in provisional mode as much as possible, both for recovery
     * performance, and for correctness to avoid fetches against cleaned log
     * files. (See [#16037].) These conflicting goals are reconciled by
     * flushing nodes in subtree grouping, because writing the non-provisional
     * parent of a set of provisionally written nodes frees the cleaner to work
     * on that set of provisional nodes as soon as possible. For example, if a
     * tree consists of:
     *
     *             INa
     *       +------+-------+
     *      INb            INc
     * +-----+----+         +-----+
     * BINd BINe BINf      BINg BINh
     *
     * It is more efficient for cleaning throughput to log in this order:
     *       BINd, BINe, BINf, INb, BINg, BINh, INc, INa
     * rather than:
     *       BINd, BINe, BINf, BINg, BINh, INb, INc, INa
     *
     * Suppose the subtree in question is INb->{BINd, BINe, BINf}
     *
     * Suppose we see BINd in the dirty map first, before BINe and BINf.
     *  - flushIN(BINd) is called
     *  - we fetch and latch its parent, INb
     *
     * If this is a high priority checkpoint, we'll hold the INb latch across
     * the time it takes to flush all three children.  In flushIN(BINd), we
     * walk through INb, create a local map of all the siblings that can be
     * found in the dirty map, and then call logSiblings with that local map.
     * Then we'll write out INb.
     *
     * If high priority is false, we will not hold the INb latch across
     * multiple IOs. Instead, we
     *  - write BINd out, using logSiblings
     *  - while still holding the INb latch, we create a list of dirty siblings
     *  - release the INb latch
     *  - call flushIN() recursively on each entry in the local sibling map,
     *    which will result in a search and write of each sibling.  These
     *    recursive calls to flushIN are called with the allowLogSubtree
     *    parameter of false to halt the recursion and prevent a repeat of the
     *    sibling examination.
     *  - write INb
     *
     * @param db the database containing the target.
     *
     * @param targetRef identifies the node to flush.
     *
     * @param dirtyMap the dirty set being flushed; the parent is added to it
     *       and siblings/parent may be removed from it here.
     *
     * @param maxFlushLevel nodes at or above this level are logged
     *       non-provisionally, and the parent is only logged from here if its
     *       level does not exceed it.
     *
     * @param highPriority if true (and the DB is not durable deferred-write),
     *       siblings are logged while the parent latch is held.
     *
     * @param fstats counters that are updated as nodes are logged.
     *
     * @param allowLogSubtree false for the recursive calls made during
     *       sub-tree logging, so that siblings are not examined again.
     */
    private static void flushIN(final DatabaseImpl db,
                                final CheckpointReference targetRef,
                                final DirtyINMap dirtyMap,
                                final int maxFlushLevel,
                                final boolean highPriority,
                                final FlushStats fstats,
                                final boolean allowLogSubtree) {

        final EnvironmentImpl envImpl = db.getEnv();
        final Tree tree = db.getTree();
        final int targetLevel = targetRef.nodeLevel;

        /* Call test hook when we reach the max level. */
        assert (targetLevel < maxFlushLevel) ||
            TestHookExecute.doHookIfSet(maxFlushLevelHook);

        if (targetRef.isRoot) {

            final RootFlusher flusher =
                new RootFlusher(db, targetRef.nodeId);

            tree.withRootLatchedExclusive(flusher);

            /*
             * Update the tree's owner, whether it's the env root or the
             * db-mapping tree.
             */
            if (flusher.getFlushed()) {
                DbTree dbTree = envImpl.getDbTree();
                dbTree.modifyDbRoot(db);
                fstats.nFullINFlushThisRun++;
                fstats.nFullINFlush++;
            }

            /*
             * If this target isn't the root anymore, we'll have to handle it
             * like a regular node.
             */
            if (flusher.stillRoot()) {
                return;
            }
        }

        /*
         * The following applies to two cases:
         * (1) the target was not ever the root
         * (2) the target was the root, when the checkpoint dirty set was
         * assembled but is not the root now.
         */
        final SearchResult result = tree.getParentINForChildIN(
            -1 /*nodeId*/, targetRef.treeKey,
            targetRef.nodeLevel /*targetLevel*/,
            targetRef.nodeLevel + 1 /*exclusiveLevel*/,
            false  /*requireExactMatch*/, false /*doFetch*/,
            CacheMode.UNCHANGED, null /*trackingList*/);

        /*
         * If no possible parent is found, the compressor may have deleted
         * this item before we got to processing it. (Although it seems this
         * cannot currently happen since we never delete the root node.)
         */
        if (result.parent == null) {
            return;
        }

        final IN parent = result.parent;
        final int index = result.index;
        final int parentLevel = parent.getLevel();
        final CheckpointReference parentRef;

        /*
         * List of siblings to log after releasing the parent latch.  The
         * element type must be declared so that the list can be iterated as
         * CheckpointReferences below.
         */
        final List<CheckpointReference> logSiblingsSeparately;

        try {
            /*
             * If bottomLevelTarget is true, the parent IN contains bottom
             * level BINs.  The masking is used to normalize the level for
             * ordinary DBs and the mapping tree DB.
             */
            final boolean bottomLevelTarget =
                ((parentLevel & IN.LEVEL_MASK) == 2);

            /*
             * INs at the max flush level are always non-provisional and
             * INs at the bottom level (when this is not also the max flush
             * level) are always provisional.  In between INs are
             * provisional BEFORE_CKPT_END (see Provisional).
             */
            final Provisional provisional;
            if (targetLevel >= maxFlushLevel) {
                provisional = Provisional.NO;
            } else if (bottomLevelTarget) {
                provisional = Provisional.YES;
            } else {
                provisional = Provisional.BEFORE_CKPT_END;
            }

            /*
             * If we didn't reach the target level, a child wasn't resident
             * and there is nothing to log at this level. To be on the safe
             * side, we'll put the parent into the dirty set to be logged when
             * that level is processed.
             *
             * Only do this if the parent we found is at a higher level than
             * the child.  This ensures that the non-exact search does not
             * find a sibling rather than a parent. [#11555]
             */
            if (!result.exactParentFound) {
                if (parentLevel > targetLevel) {
                    dirtyMap.addIN(
                        parent, -1 /*index*/,
                        false /*updateFlushLevels*/,
                        true /*updateMemoryBudget*/);
                }
                return;
            }

            /*
             * We found the parent. Add it unconditionally to the dirty map. We
             * must make sure that every IN that was selected for the
             * checkpointer's dirty IN set at the beginning of checkpoint is
             * written into the log and can be properly accessed from
             * ancestors. Eviction or a split may have written out a member of
             * this dirty set before the checkpointer got to it. See [#10249].
             */
            assert parentLevel == targetLevel + 1;

            dirtyMap.addIN(
                parent, -1 /*index*/,
                false /*updateFlushLevels*/,
                true /*updateMemoryBudget*/);

            /*
             * Determine whether our search found the IN identified by either
             * targetRef.nodeId or targetRef.lsn. If there is not a match, then
             * the node was deleted, logged or split since creating the
             * reference.
             *
             * For a non-DW DB, targetRef.lsn will be not null and we match on
             * it. If the LSN has changed then of course the node was logged,
             * and possibly split, and we will not log this target here.
             *
             * For a DW DB we also match on LSN if it is non-null. If the LSN
             * is null then the reference was created for a never-logged IN and
             * targetRef.nodeId >= 0. In that case we match on the nodeId. If
             * the LSN or nodeId doesn't match, there must have been a split,
             * and we will not log this target here. However, because splits
             * are not logged for DW, this is not sufficient to cause both
             * siblings that were part of split to be logged, when one node was
             * added to the dirty map. We account for this when the parent is
             * logged by calling logDirtyChildren. This approach relies on the
             * fact that a split will dirty the parent.
             *
             * TODO:
             * Why not always call logDirtyIN for a DW IN, whether or not the
             * LSN or nodeId matches? logDirtyChildren is going to log it
             * anyway if it is dirty.
             */
            if (targetRef.lsn != DbLsn.NULL_LSN) {

                if (targetRef.lsn != parent.getLsn(index)) {
                    return;
                }
            } else {
                assert targetRef.nodeId >= 0;
                assert db.isDeferredWriteMode();

                final IN target = (IN) parent.getTarget(index);

                if (target == null ||
                    targetRef.nodeId != target.getNodeId()) {
                    return;
                }
            }

            /* Log the target, if dirty. */
            logDirtyIN(envImpl, parent, index, provisional, fstats);

            /*
             * We will log a sub-tree when the target is at the bottom level
             * and this is not a recursive call to flushIN during sub-tree
             * logging. Return if we are only logging the target node here.
             */
            if (!bottomLevelTarget || !allowLogSubtree) {
                return;
            }

            /*
             * Log sub-tree siblings with the latch held when highPriority
             * is configured and this is not a DW DB. For a DW DB, dirty LNs
             * are logged for each BIN.  If we were to log a DW sub-tree with
             * the parent latch held, the amount of logging may cause the latch
             * to be held for too long a period.
             */
            if (highPriority && !db.isDurableDeferredWrite()) {
                logSiblingsSeparately = null;
            } else {
                logSiblingsSeparately = new ArrayList<>();
            }

            for (int i = 0; i < parent.getNEntries(); i += 1) {

                /* The target itself was logged above. */
                if (i == index) {
                    continue;
                }

                final IN child = (IN) parent.getTarget(i);
                final long childId = (child != null) ? child.getNodeId() : -1;
                final long childLsn = parent.getLsn(i);

                final CheckpointReference childRef =
                    dirtyMap.removeNode(targetLevel, childLsn, childId);

                /* Siblings that are not in the dirty map are skipped. */
                if (childRef == null) {
                    continue;
                }

                if (logSiblingsSeparately != null) {
                    logSiblingsSeparately.add(childRef);
                } else {
                    logDirtyIN(envImpl, parent, i, provisional, fstats);
                }
            }

            /* Get parentRef before releasing the latch. */
            if (parentLevel <= maxFlushLevel) {
                parentRef = dirtyMap.removeNode(
                    parentLevel, parent.getLastLoggedLsn(),
                    parent.getNodeId());
            } else {
                parentRef = null;
            }
        } finally {
            parent.releaseLatch();
        }

        /*
         * If highPriority is false, we don't hold the latch while logging
         * the bottom level siblings.  We log them here with flushIN,
         * performing a separate search for each one, after releasing the
         * parent latch above.
         */
        if (logSiblingsSeparately != null) {
            for (final CheckpointReference childRef : logSiblingsSeparately) {
                flushIN(
                    db, childRef, dirtyMap, maxFlushLevel, highPriority,
                    fstats, false /*allowLogSubtree*/);
            }
        }

        /*
         * Log the sub-tree parent, which will be logged non-provisionally,
         * in order to update cleaner utilization. This must be done with
         * flushIN after releasing the parent latch above, since we must search
         * and acquire the grandparent latch.
         */
        if (parentRef != null) {
            flushIN(
                db, parentRef, dirtyMap, maxFlushLevel, highPriority, fstats,
                false /*allowLogSubtree*/);
        }
    }

    /**
     * Logs the child at the given slot of the latched parent, if it needs
     * logging, then records the new LSN in the parent slot and updates the
     * flush counters.
     *
     * Note that if this method is called, the parent must also be logged. This
     * is true even if this method finds that the child is not dirty. In that
     * case the child has already been flushed (e.g., by eviction) and the
     * parent must be logged according to the rule for max flush level.
     *
     * @param envImpl the environment, used to reach the off-heap cache.
     *
     * @param parent the parent IN holding the child slot.
     *
     * @param index the slot of the child within the parent.
     *
     * @param provisional the provisional setting to log the child with.
     *
     * @param fstats counters incremented for a delta or full flush.
     */
    private static void logDirtyIN(
        final EnvironmentImpl envImpl,
        final IN parent,
        final int index,
        final Provisional provisional,
        final FlushStats fstats) {

        final IN residentChild = (IN) parent.getTarget(index);
        final long loggedLsn;
        final boolean loggedBIN;
        final boolean loggedDelta;

        if (residentChild == null) {

            /*
             * The slot has no resident target.  Ask the off-heap cache for a
             * BIN log entry; when it has none to offer there is nothing to
             * log for this slot.
             */
            final OffHeapCache offHeap = envImpl.getOffHeapCache();

            final INLogEntry offHeapEntry =
                offHeap.createBINLogEntryForCheckpoint(parent, index);

            if (offHeapEntry == null) {
                return;
            }

            loggedBIN = true;
            loggedDelta = offHeapEntry.isBINDelta();

            loggedLsn = IN.logEntry(
                offHeapEntry, provisional, true /*backgroundIO*/, parent);

            offHeap.postBINLog(parent, index, offHeapEntry, loggedLsn);

        } else {
            residentChild.latch(CacheMode.UNCHANGED);
            try {
                /* A clean child has already been flushed by someone else. */
                if (!residentChild.getDirty()) {
                    return;
                }

                if (residentChild.getDatabase().isDurableDeferredWrite()) {

                    /*
                     * Find dirty descendants to avoid logging nodes with
                     * never-logged children. See [#13936] and
                     * IN.logDirtyChildren for description of the case.
                     *
                     * Both dirty and never-logged descendants must be logged
                     * to get a consistent view of a split.  Otherwise the log
                     * could hold the post-split version of a new sibling
                     * together with the pre-split version of the split
                     * sibling, and recovery could then duplicate descendants
                     * that appear in both.
                     */
                    residentChild.logDirtyChildren();
                }

                loggedLsn = residentChild.log(
                    true /*allowDeltas*/, provisional,
                    true /*backgroundIO*/, parent);

                assert (loggedLsn != DbLsn.NULL_LSN);

                loggedBIN = residentChild.isBIN();
                loggedDelta = (loggedLsn == residentChild.getLastDeltaLsn());
            } finally {
                residentChild.releaseLatch();
            }
        }

        /* Point the parent slot at the newly logged version. */
        parent.updateEntry(index, loggedLsn, VLSN.NULL_VLSN_SEQUENCE, 0);

        if (!loggedDelta) {
            fstats.nFullINFlushThisRun++;
            fstats.nFullINFlush++;
            if (loggedBIN) {
                fstats.nFullBINFlush++;
                fstats.nFullBINFlushThisRun++;
            }
        } else {
            fstats.nDeltaINFlushThisRun++;
            fstats.nDeltaINFlush++;
        }
    }

    /*
     * RootFlusher lets us write out the root IN within the root latch.  It
     * records two outcomes for the caller: whether the node found at the root
     * is still the targeted node, and whether that node was actually logged.
     */
    private static class RootFlusher implements WithRootLatched {
        private final DatabaseImpl db;
        private final long targetNodeId;
        private boolean flushed = false;
        private boolean stillRoot = false;

        RootFlusher(final DatabaseImpl db,
                    final long targetNodeId) {
            this.db = db;
            this.targetNodeId = targetNodeId;
        }

        /**
         * Flush the rootIN if it is the targeted node and is dirty.  Always
         * returns null.
         */
        @Override
        public IN doWork(ChildReference root) {

            if (root == null) {
                return null;
            }

            final IN currentRoot = (IN) root.fetchTarget(db, null);
            currentRoot.latch(CacheMode.UNCHANGED);
            try {
                /*
                 * A different node id means the root changed after the
                 * target was placed in the checkpointer's dirty set; leave
                 * stillRoot false.
                 */
                if (currentRoot.getNodeId() != targetNodeId) {
                    return null;
                }

                /*
                 * Find dirty descendants to avoid logging nodes with
                 * never-logged children. See [#13936]
                 */
                if (currentRoot.getDatabase().isDurableDeferredWrite()) {
                    currentRoot.logDirtyChildren();
                }

                /*
                 * stillRoot handles the situation where the root was split
                 * after it was placed in the checkpointer's dirty set.
                 */
                stillRoot = true;

                if (currentRoot.getDirty()) {
                    root.setLsn(currentRoot.log());
                    flushed = true;
                }
            } finally {
                currentRoot.releaseLatch();
            }
            return null;
        }

        /* True if doWork logged the root IN. */
        boolean getFlushed() {
            return flushed;
        }

        /* True if doWork found the targeted node at the root. */
        boolean stillRoot() {
            return stillRoot;
        }
    }

    /*
     * CheckpointReferences are used to identify nodes that must be flushed as
     * part of the checkpoint. We don't keep an actual reference to the node
     * because that prevents nodes from being GC'ed during checkpoint.
     *
     * Using a CheckpointReference introduces a window between the point when
     * the checkpoint dirty set is created and when the node is flushed. Some
     * of the fields saved in the reference are immutable: db, nodeId.  The
     * others are not and we have to handle potential change:
     *
     * isRoot: it's possible for isRoot to go from true->false, but not
     *         false->true. True->false is handled by the flushIN method
     *         by finding the root and checking if it is the target.
     * treeKey: This can change only in the event of a split. If it does, there
     *         is the chance that the checkpointer will find the wrong node to
     *         flush, but that's okay because the split guarantees flushing to
     *         the root, so the target will be properly logged within the
     *         checkpoint period.
     *
     * The class is public for the Sizeof program. Note that the ctor is
     * currently declared without an access modifier (package-private).
     */
    public static class CheckpointReference {

        /* Fields are copied verbatim from the ctor arguments. */
        final DatabaseId dbId;
        final long nodeId;
        final int nodeLevel;
        final boolean isRoot;
        final byte[] treeKey;
        final long lsn;

        /**
         * Captures the given values; the treeKey array is stored by
         * reference, not copied.
         */
        CheckpointReference(final DatabaseId dbId,
                            final long nodeId,
                            final int nodeLevel,
                            final boolean isRoot,
                            final byte[] treeKey,
                            final long lsn) {
            this.lsn = lsn;
            this.treeKey = treeKey;
            this.isRoot = isRoot;
            this.nodeLevel = nodeLevel;
            this.nodeId = nodeId;
            this.dbId = dbId;
        }

        /**
         * Two references are equal when they carry the same nodeId; no
         * other field takes part in the comparison.
         */
        @Override
        public boolean equals(Object o) {
            return (o instanceof CheckpointReference) &&
                ((CheckpointReference) o).nodeId == nodeId;
        }

        /**
         * Hash is the nodeId narrowed to an int, which is consistent with
         * equals since equal references have equal nodeIds.
         */
        @Override
        public int hashCode() {
            return (int) nodeId;
        }

        /**
         * Returns a short description containing only dbId and nodeId.
         */
        @Override
        public String toString() {
            return "db=" + dbId + " nodeId=" + nodeId;
        }
    }

    /**
     * A struct to hold log flushing stats for checkpoint and database sync.
     */
    public static class FlushStats {

        /*
         * Counters that are left untouched by resetPerRunCounters().
         */
        public long nFullINFlush;
        public long nFullBINFlush;
        public long nDeltaINFlush;

        /*
         * Per-run counterparts of the counters above; these are the only
         * fields cleared by resetPerRunCounters().
         */
        long nFullINFlushThisRun;
        long nFullBINFlushThisRun;
        long nDeltaINFlushThisRun;

        /* For future addition to stats:
           private int nAlreadyEvictedThisRun;
        */

        /**
         * Zeroes the three "ThisRun" counters and leaves the other
         * counters as they are.
         */
        void resetPerRunCounters() {
            nFullINFlushThisRun =
                nFullBINFlushThisRun =
                nDeltaINFlushThisRun = 0;
            /* nAlreadyEvictedThisRun = 0; -- for future */
        }
    }
}