/*-
* Copyright (C) 2002, 2018, Oracle and/or its affiliates. All rights reserved.
*
* This file was distributed by Oracle as part of a version of Oracle Berkeley
* DB Java Edition made available at:
*
* http://www.oracle.com/technetwork/database/database-technologies/berkeleydb/downloads/index.html
*
* Please see the LICENSE file included in the top-level directory of the
* appropriate version of Oracle Berkeley DB Java Edition for a copy of the
* license and additional information.
*/
package com.sleepycat.je.cleaner;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.NavigableSet;
import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import java.util.logging.Logger;
import com.sleepycat.bind.tuple.SortedPackedLongBinding;
import com.sleepycat.bind.tuple.TupleBase;
import com.sleepycat.bind.tuple.TupleInput;
import com.sleepycat.bind.tuple.TupleOutput;
import com.sleepycat.je.CacheMode;
import com.sleepycat.je.Cursor;
import com.sleepycat.je.DatabaseConfig;
import com.sleepycat.je.DatabaseEntry;
import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.DatabaseNotFoundException;
import com.sleepycat.je.DbInternal;
import com.sleepycat.je.Durability;
import com.sleepycat.je.Environment;
import com.sleepycat.je.EnvironmentFailureException;
import com.sleepycat.je.EnvironmentMutableConfig;
import com.sleepycat.je.Get;
import com.sleepycat.je.JEVersion;
import com.sleepycat.je.LockConflictException;
import com.sleepycat.je.LockMode;
import com.sleepycat.je.OperationFailureException;
import com.sleepycat.je.ReadOptions;
import com.sleepycat.je.ScanFilter;
import com.sleepycat.je.ThreadInterruptedException;
import com.sleepycat.je.TransactionConfig;
import com.sleepycat.je.config.EnvironmentParams;
import com.sleepycat.je.dbi.CursorImpl;
import com.sleepycat.je.dbi.DatabaseId;
import com.sleepycat.je.dbi.DatabaseImpl;
import com.sleepycat.je.dbi.DbConfigManager;
import com.sleepycat.je.dbi.DbTree;
import com.sleepycat.je.dbi.DbType;
import com.sleepycat.je.dbi.DupKeyData;
import com.sleepycat.je.dbi.EnvConfigObserver;
import com.sleepycat.je.dbi.EnvironmentImpl;
import com.sleepycat.je.dbi.PutMode;
import com.sleepycat.je.dbi.SortedLSNTreeWalker;
import com.sleepycat.je.log.LogEntryType;
import com.sleepycat.je.log.ReplicationContext;
import com.sleepycat.je.tree.BIN;
import com.sleepycat.je.tree.IN;
import com.sleepycat.je.tree.Key;
import com.sleepycat.je.tree.LN;
import com.sleepycat.je.tree.Node;
import com.sleepycat.je.tree.Tree;
import com.sleepycat.je.txn.BasicLocker;
import com.sleepycat.je.txn.Locker;
import com.sleepycat.je.txn.LockerFactory;
import com.sleepycat.je.txn.Txn;
import com.sleepycat.je.utilint.DbLsn;
import com.sleepycat.je.utilint.LoggerUtils;
import com.sleepycat.je.utilint.StoppableThreadFactory;
import com.sleepycat.je.utilint.TestHook;
import com.sleepycat.je.utilint.TestHookExecute;
import org.checkerframework.checker.nullness.qual.NonNull;
import org.checkerframework.checker.nullness.qual.Nullable;
/**
* Performs async processing for Record Extinction and Database Extinction,
* via the {@link RecordExtinction} and {@link DatabaseExtinction} tasks.
*
* When the application calls {@link Environment#discardExtinctRecords}
* (record extinction) or {@link Environment#removeDatabase} or
* {@link Environment#truncateDatabase} (DB extinction), we store a scan
* record in an internal scan DB and queue the scan task.
*
* The scan DB is a replicated DB, but the scan records themselves may
* be replicated or non-replicated as described below. (This mixing is only
* possible for internal DBs, and is also used for the DbTree naming DB.)
 *
* - For record extinction, the scan record is replicated and is
* inserted in the user transaction passed to discardExtinctRecords.
* The durability of this record is therefore the same as the durability
* of the record extinction, and if this txn aborts, the scan record
* insertion will also be aborted. When this record is replicated, the
* replica replay will initiate the scan on the replica.
*
* - For DB extinction, the scan record is inserted after the commit of
* the txn passed to removeDatabase or truncateDatabase. This insertion
* is not replicated, and instead the replay of NameLN causes insertion
* of the scan record on each replica. If there is a crash between the
* user txn commit and insertion of the scan record, recovery redo of the
* NameLN will cause insertion of the scan record.
 *
* The scan task is always performed independently on each node in a rep
* group, and therefore update and deletion of the scan record are
* non-replicated operations. Recovery will re-queue the scan task for all
* scan records not yet deleted.
*
 * Although a thread pool executor is used, it is currently limited to a
 * single thread, so all extinction tasks execute serially. If using
 * multiple threads becomes necessary, some synchronization changes will
 * be needed.
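 *
 * A sketch of the record extinction flow, assuming the public
 * {@link Environment#discardExtinctRecords} mirrors the internal
 * signature in this class (the names below are illustrative only):
 * <pre>{@code
 * Transaction txn = env.beginTransaction(null, null);
 * long scanId = env.discardExtinctRecords(
 *     txn, Collections.singleton("myDb"),
 *     beginKey, endKey, null, "drop-partition-42");
 * txn.commit(); // the scan record commits or aborts with the user txn
 * // The scans themselves run later in the JEExtinctRecordScanner thread.
 * }</pre>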
*/
public class ExtinctionScanner implements EnvConfigObserver {
/* ExtinctionTask record types. */
private static final int RECORD_EXTINCTION = 0;
private static final int DATABASE_EXTINCTION = 1;
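    /*
     * Minimum JE version required to use extinction, presumably enforced
     * group-wide since the scan DB and record-extinction records are
     * replicated. Tests may override it via TEST_MIN_JE_VERSION.
     */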
private static final JEVersion MIN_JE_VERSION = new JEVersion("18.1.7");
public static JEVersion TEST_MIN_JE_VERSION = null;
private static final ReadOptions NOLOCK_UNCHANGED = new ReadOptions()
.setCacheMode(CacheMode.UNCHANGED)
.setLockMode(LockMode.READ_UNCOMMITTED);
private static final TransactionConfig NO_SYNC_TXN =
new TransactionConfig().
setLocalWrite(false).
setDurability(Durability.COMMIT_NO_SYNC);
public static JEVersion getMinJEVersion() {
if (TEST_MIN_JE_VERSION != null) {
return TEST_MIN_JE_VERSION;
}
return MIN_JE_VERSION;
}
/*
* Max time to wait for the discardExtinctRecords user txn to commit. If
* the timeout elapses we'll try again later, but we need a limit for
* blocking in the scanner thread.
*/
private static final int COMMIT_LOCK_TIMEOUT_MS = 500;
@NonNull private final EnvironmentImpl envImpl;
@NonNull private final Logger logger;
@Nullable private DatabaseImpl scanDb;
@Nullable private final ThreadPoolExecutor threadPool;
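    /*
     * Replicated scan IDs are negative and decrease over time, while
     * non-replicated (local) scan IDs are positive and increase, so the
     * two ID spaces never collide. ID zero is reserved for non-persistent
     * tasks that are executed synchronously and never stored.
     */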
private final AtomicLong lastRepScanID = new AtomicLong();
private final AtomicLong lastNonRepScanID = new AtomicLong();
    private final Map<Long, RecordExtinction> activeRecordTasks =
        Collections.synchronizedMap(new HashMap<>());
    private final Set<Long> completedRecordScans =
        Collections.synchronizedSet(new HashSet<>());
private final boolean enabled;
private final int batchSize;
private final int batchDelayMs;
private final long flushObsoleteBytes;
private volatile boolean shutdownRequested;
private int terminateMillis;
    private volatile List<ExtinctionTask> recoveredTasks = new ArrayList<>();
    TestHook<?> beforeScan1Hook;
    TestHook<?> beforeScan1FlushHook;
    TestHook<?> beforeScan2Hook;
    public TestHook<?> dbBeforeWriteTaskHook;
    TestHook<?> dbBeforeExecTaskHook;
    TestHook<?> dbBeforeDeleteMapLNHook;
    TestHook<?> dbBeforeDeleteTaskLNHook;
public ExtinctionScanner(@NonNull final EnvironmentImpl envImpl) {
this.envImpl = envImpl;
logger = LoggerUtils.getLogger(getClass());
final DbConfigManager configManager = envImpl.getConfigManager();
enabled = configManager.getBoolean(
EnvironmentParams.ENV_RUN_EXTINCT_RECORD_SCANNER);
batchSize = configManager.getInt(
EnvironmentParams.CLEANER_EXTINCT_SCAN_BATCH_SIZE);
batchDelayMs = configManager.getDuration(
EnvironmentParams.CLEANER_EXTINCT_SCAN_BATCH_DELAY);
flushObsoleteBytes = configManager.getLong(
EnvironmentParams.CLEANER_FLUSH_EXTINCT_OBSOLETE);
terminateMillis = configManager.getDuration(
EnvironmentParams.EVICTOR_TERMINATE_TIMEOUT);
envImpl.addConfigObserver(this);
/* Use a single thread (for now) and an unbounded queue. */
threadPool = envImpl.isReadOnly() ?
null :
new ThreadPoolExecutor(
1 /*corePoolSize*/, 1 /*maxPoolThreads*/,
0 /*keepAliveTime*/, TimeUnit.MILLISECONDS,
new LinkedBlockingQueue<>(),
new StoppableThreadFactory(
envImpl, "JEExtinctRecordScanner", logger));
}
public void shutdown() {
if (threadPool == null) {
return;
}
/*
* Set the shutdown flag so that outstanding tasks end early. The
* call to threadPool.shutdown is an orderly shutdown that waits for
* in-flight tasks to end.
*/
shutdownRequested = true;
threadPool.shutdown();
/*
* awaitTermination() will wait for the timeout period, or will be
* interrupted, but we don't really care which it is. The scan
* shouldn't be interrupted, but if it is, something urgent is
* happening.
*/
boolean shutdownFinished = false;
try {
shutdownFinished = threadPool.awaitTermination(
terminateMillis, TimeUnit.MILLISECONDS);
} catch (InterruptedException e) {
/* We've been interrupted, just give up and end. */
} finally {
if (!shutdownFinished) {
threadPool.shutdownNow();
}
}
}
public void requestShutdown() {
if (threadPool == null) {
return;
}
shutdownRequested = true;
threadPool.shutdown();
}
@Override
public void envConfigUpdate(
final DbConfigManager configManager,
final EnvironmentMutableConfig ignore) {
terminateMillis = configManager.getDuration(
EnvironmentParams.EVICTOR_TERMINATE_TIMEOUT);
}
public boolean isEnabled() {
return enabled;
}
public boolean isEmpty() {
assert threadPool != null;
return threadPool.getQueue().isEmpty() &&
threadPool.getActiveCount() == 0;
}
public long getLastLocalId() {
return lastNonRepScanID.get();
}
public long getLastReplicatedId() {
return lastRepScanID.get();
}
public void setLastIds(final long lastRepScanID,
final long lastNonRepScanID) {
this.lastRepScanID.set(
Math.min(this.lastRepScanID.get(), lastRepScanID));
this.lastNonRepScanID.set(
Math.max(this.lastNonRepScanID.get(), lastNonRepScanID));
}
/**
* Opens the DB if it is not already open.
* Assumes that envImpl.isReadOnly() is false.
*
* @throws com.sleepycat.je.rep.ReplicaWriteException if the scan DB
* NameLN has not yet been replicated, including when the master node is
* running older software and the scan DB has not been created on the
* master. To check for this exception in JE standalone (i.e., in this
* class), call {@link OperationFailureException#isReplicaWrite()}.
*/
private synchronized void openDb() {
if (scanDb != null) {
return;
}
final DbTree dbTree = envImpl.getDbTree();
final TransactionConfig txnConfig = new TransactionConfig();
Locker locker = Txn.createLocalAutoTxn(envImpl, txnConfig);
try {
scanDb = dbTree.getDb(
locker, DbType.EXTINCT_SCANS.getInternalName(),
null /*databaseHandle*/, false);
} finally {
locker.operationEnd(true);
}
if (scanDb != null) {
return;
}
if (envImpl.isReadOnly()) {
/* This should have been checked earlier. */
throw EnvironmentFailureException.unexpectedState();
}
final DatabaseConfig dbConfig = new DatabaseConfig();
dbConfig.setReplicated(true);
dbConfig.setTransactional(true);
final ReplicationContext repContext = envImpl.isReplicated() ?
ReplicationContext.MASTER : ReplicationContext.NO_REPLICATE;
txnConfig.setDurability(Durability.COMMIT_WRITE_NO_SYNC);
txnConfig.setConsistencyPolicy(
envImpl.getConsistencyPolicy("NoConsistencyRequiredPolicy"));
locker = Txn.createAutoTxn(envImpl, txnConfig, repContext);
boolean success = false;
try {
scanDb = dbTree.createInternalDb(
locker, DbType.EXTINCT_SCANS.getInternalName(), dbConfig);
success = true;
} finally {
locker.operationEnd(success);
}
}
/**
* @return whether the given ID is still present in the extinction scan DB,
 * meaning that the scan is not yet complete.
*/
public boolean isScanTaskActive(final long id) {
try {
openDb();
} catch (OperationFailureException e) {
if (e.isReplicaWrite()) {
/* Scan DB NameLN has not yet been replayed. */
return true;
}
throw e;
}
final DatabaseEntry keyEntry = new DatabaseEntry();
serializeKey(id, keyEntry);
final Locker locker =
BasicLocker.createBasicLocker(envImpl, false /*noWait*/);
try (final Cursor cursor =
DbInternal.makeCursor(scanDb, locker, null)) {
return cursor.get(keyEntry, null, Get.SEARCH, null) != null;
} finally {
locker.operationEnd();
}
}
/**
* Fires an assertion, or logs a warning if assertions are disabled.
*/
private void assertOrWarn(final String msg) {
assert false : msg;
LoggerUtils.warning(logger, envImpl, msg);
}
private void deleteTask(final long id) {
if (id == 0) {
/* Non-persistent task. */
return;
}
/*
* Use non-replicated locker so the deletion is not replicated.
* The extinction records are deleted independently on each node.
*/
final Locker locker = Txn.createLocalAutoTxn(envImpl, NO_SYNC_TXN);
boolean success = false;
try (final Cursor cursor =
DbInternal.makeCursor(scanDb, locker, null)) {
final DatabaseEntry keyEntry = new DatabaseEntry();
serializeKey(id, keyEntry);
if (cursor.get(keyEntry, null, Get.SEARCH,
LockMode.RMW.toReadOptions()) == null) {
assertOrWarn(
"Completed extinction task not found in DB, id=" +
id);
return;
}
DbInternal.deleteWithRepContext(
cursor, ReplicationContext.NO_REPLICATE);
success = true;
} finally {
locker.operationEnd(success);
}
}
/**
* During recovery, save incomplete scan tasks after reading them from the
* scans DB. Incomplete scans are held until after recovery is finished,
* and then executed by {@link #executeRecoveredTasks()}. If an extinction
* is requested at the end of recovery (e.g., for temp DB deletion), it
* is queued rather than being executed immediately.
*/
public void recoverIncompleteTasks() {
assert recoveredTasks != null;
if (envImpl.isReadOnly() || !enabled) {
return;
}
try {
openDb();
} catch (OperationFailureException e) {
if (e.isReplicaWrite()) {
/* Scan DB NameLN not yet replayed. */
return;
}
throw e;
}
final DatabaseEntry keyEntry = new DatabaseEntry();
final DatabaseEntry dataEntry = new DatabaseEntry();
final Locker locker =
BasicLocker.createBasicLocker(envImpl, false /*noWait*/);
try (final Cursor cursor =
DbInternal.makeCursor(scanDb, locker, null)) {
while (cursor.get(
keyEntry, dataEntry, Get.NEXT, null) != null) {
final ExtinctionTask task =
materializeTask(keyEntry, dataEntry);
if (task instanceof RecordExtinction) {
final RecordExtinction rTask = (RecordExtinction) task;
activeRecordTasks.put(rTask.id, rTask);
}
recoveredTasks.add(task);
}
} finally {
locker.operationEnd();
}
}
/**
* After recovery, queue any incomplete scans for execution.
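 *
 * Expected call sequence (a sketch; the recovery-side calls are
 * simplified):
 * <pre>{@code
 * scanner.recoverIncompleteTasks(); // during recovery: read the scan DB
 * // ... recovery finishes ...
 * scanner.executeRecoveredTasks();  // queue the saved tasks exactly once
 * }</pre>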
*/
    public synchronized void executeRecoveredTasks() {
        if (envImpl.isReadOnly() || !enabled) {
            return;
        }
        assert threadPool != null;
        /*
         * This method is called more than once when multiple handles are
         * opened for a single env. The method is synchronized and
         * recoveredTasks is nulled so that recovered tasks are executed
         * only once.
         */
        if (recoveredTasks == null) {
            return;
        }
        for (final ExtinctionTask task : recoveredTasks) {
            threadPool.execute(task);
        }
        recoveredTasks = null;
    }
/**
* Queues a task to process a scan LN that was replayed on this replica.
* The LN replay must be done before this method is called, since {@link
* RecordExtinction#isTaskCommitted()} assumes that the record is in the
* Btree.
*/
public void replay(final byte[] scanKey, final byte[] scanData) {
assert !envImpl.isReadOnly();
assert threadPool != null;
if (!enabled) {
return;
}
final ExtinctionTask task = materializeTask(
new DatabaseEntry(scanKey),
new DatabaseEntry(scanData));
if (task instanceof RecordExtinction) {
final RecordExtinction rTask = (RecordExtinction) task;
activeRecordTasks.put(rTask.id, rTask);
}
/*
* Do as little work as possible in the replay thread. Open the DB
* as part of the async task.
*/
threadPool.execute(() -> {
openDb();
task.run();
});
}
/**
* Used during recovery or replica replay to create a scan record that is
* stored in the database. During recovery, these are incomplete scans.
* During replay, they're new scans coming from the master.
*/
private ExtinctionTask materializeTask(final DatabaseEntry keyEntry,
final DatabaseEntry dataEntry) {
final long id = materializeKey(keyEntry);
/*
* Must update last scan IDs during replay in case this node is
* elected master. This is also important when called via
 * recoverIncompleteTasks when recovery has read an old-format
* CheckpointEnd, since extinction IDs were not added to CheckpointEnd
* until log version 17 [#26973].
*
* We don't have to worry about an atomic update (for now) because
* recovery is single threaded and there is only one scanner thread.
*/
if (id < 0) {
if (lastRepScanID.get() > id) {
lastRepScanID.set(id);
}
} else {
if (lastNonRepScanID.get() < id) {
lastNonRepScanID.set(id);
}
}
final TupleInput in = TupleBase.entryToInput(dataEntry);
switch (in.readPackedInt()) {
case RECORD_EXTINCTION:
return new RecordExtinction(id, in);
case DATABASE_EXTINCTION:
return new DatabaseExtinction(id, in);
default:
throw EnvironmentFailureException.unexpectedState();
}
}
private static void serializeKey(final long id,
final DatabaseEntry keyEntry) {
SortedPackedLongBinding.longToEntry(id, keyEntry);
}
public static long materializeKey(final DatabaseEntry keyEntry) {
return SortedPackedLongBinding.entryToLong(keyEntry);
}
private static TupleOutput serializeType(int taskType) {
final TupleOutput out = new TupleOutput();
out.writePackedInt(taskType);
return out;
}
private interface ExtinctionTask extends Runnable {
boolean isExtinctionForDb(DatabaseId dbId);
}
/* ------------------------*
* Record Extinction methods
* ------------------------*/
/**
* Inserts the scan record in the DB and queues the scan for execution.
*
* Caller must check for a read-write env and {@link #isEnabled()}.
*
* @throws DatabaseNotFoundException if a DB name does not exist.
*/
public long discardExtinctRecords(
@NonNull final Locker locker,
        @NonNull final Set<String> dbNames,
@Nullable final DatabaseEntry beginKey,
@Nullable final DatabaseEntry endKey,
@Nullable final ScanFilter filter,
@NonNull final String label) {
assert enabled;
assert threadPool != null;
openDb();
final long id = envImpl.isReplicated() ?
lastRepScanID.decrementAndGet() :
lastNonRepScanID.incrementAndGet();
final RecordExtinction task = new RecordExtinction(
id, dbNamesToIds(dbNames), beginKey, endKey, filter, label);
/* Insert a record using a user-level locker. It will be replicated. */
task.writeTask(0, null, locker, true /*insert*/);
activeRecordTasks.put(id, task);
/*
* It is important to queue the task _after_ writing the task record,
* so that the record exists in the Btree when read by the scan, since
* RecordExtinction.isTaskCommitted assumes that the record is in
* the Btree.
*/
threadPool.execute(task);
return task.id;
}
/**
* Transforms a set of DB names into a sorted set of DB IDs.
*
* @throws DatabaseNotFoundException if a DB name does not exist.
*/
    private NavigableSet<Long> dbNamesToIds(final Set<String> dbNames) {
        final DbTree dbTree = envImpl.getDbTree();
        final NavigableSet<Long> dbIds = new TreeSet<>();
for (final String dbName : dbNames) {
final Locker locker = BasicLocker.createBasicLocker(envImpl);
try {
final DatabaseId id =
dbTree.getDbIdFromName(locker, dbName, null, false);
if (id == null) {
throw new DatabaseNotFoundException(
"DB does not exist: " + dbName);
}
dbIds.add(id.getId());
} finally {
locker.operationEnd();
}
}
return dbIds;
}
/**
* Returns whether the given DB and key are targeted by an active record
* extinction.
*/
boolean isRecordExtinctionIncomplete(final DatabaseImpl dbImpl,
final byte[] key) {
synchronized (activeRecordTasks) {
for (final RecordExtinction task : activeRecordTasks.values()) {
if (task.isKeyTargeted(dbImpl, key)) {
return true;
}
}
}
return false;
}
/**
* Get a copy of the completed scan IDs before starting a checkpoint.
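 *
 * Intended use by the checkpointer, per the life cycle described in the
 * RecordExtinction javadoc (a sketch):
 * <pre>{@code
 * Set<Long> ids = scanner.getCompletedRecordScans(); // before ckpt
 * // ... checkpoint flushes the deleted BIN slots ...
 * scanner.deleteCompletedRecordScans(ids);           // after ckpt
 * }</pre>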
*/
    public Set<Long> getCompletedRecordScans() {
synchronized (completedRecordScans) {
return new HashSet<>(completedRecordScans);
}
}
/**
* After a checkpoint, delete the scans previously returned by
* {@link #getCompletedRecordScans()}.
*/
    public void deleteCompletedRecordScans(final Set<Long> scanIds) {
if (scanIds.isEmpty()) {
return;
}
assert scanDb != null;
for (final long id : scanIds) {
deleteTask(id);
}
completedRecordScans.removeAll(scanIds);
activeRecordTasks.keySet().removeAll(scanIds);
LoggerUtils.info(
logger, envImpl,
"Deleted complete extinct record scans after checkpoint, " +
"ids=" + scanIds);
}
/**
* The RecordExtinction task makes a set of records extinct as specified by
* {@link Environment#discardExtinctRecords}.
*
* The user passes a key range and an optional serializable
* ScanFilter to discardExtinctRecords. The serialized ScanFilter is
* stored in the scan record so it can be used on all nodes for
* executing the scan.
*
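 * An illustrative filter (a sketch only: PrefixFilter is a hypothetical
 * app class, and the ScanResult values are assumed to be those handled
 * by scanBIN below):
 * <pre>{@code
 * class PrefixFilter implements ScanFilter, Serializable {
 *     private final byte[] prefix;
 *     PrefixFilter(final byte[] prefix) { this.prefix = prefix; }
 *     public ScanResult checkKey(final byte[] key) {
 *         // A record is extinct iff its key starts with the prefix.
 *         if (key.length < prefix.length) {
 *             return ScanResult.EXCLUDE;
 *         }
 *         for (int i = 0; i < prefix.length; i += 1) {
 *             if (key[i] != prefix[i]) {
 *                 return ScanResult.EXCLUDE;
 *             }
 *         }
 *         return ScanResult.INCLUDE;
 *     }
 * }
 * }</pre>
 *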
* Two key-only cursor scans of the specified key range are performed,
* using the optional ScanFilter to identify the extinct records. The
* life cycle of the scan task is as follows.
*
 * 1. The first scan counts LN utilization using the lastLoggedSize that
* is stored in the BIN slots. When counting is complete, utilization
* is flushed (FileSummaryLNs are written for each impacted file), the
* countComplete field is set to true, and the scan record is updated
* to record this persistently. The first scan is skipped for a
* duplicates DB, since all LNs are already obsolete.
*
 * 2. The second scan sets the KnownDeleted flags on the BIN slots of all
* extinct records, and queues the BINs for the compressor. When the
* second scan is done, its record ID is added to a
* completedRecordScans set.
*
 * 3. When a checkpoint starts, the completedRecordScans set is copied.
* When the checkpoint finishes, the deletion of all slots for those
* scans will have been persisted, either after compressing them
* (hopefully) or via the KnownDeleted flag. The completed scan
* records are then deleted.
 *
* If a crash occurs, the incomplete scan LNs are read during recovery.
* countComplete is set to true or false, depending on whether step 1
* finished earlier. If not, countComplete is false and step 1 is re-run.
* Steps 2 and 3 are always re-run.
*
 * The app can optionally supply an environment-wide ExtinctionFilter
* callback. As described in its javadoc, if it is not supplied then
* cleaning of extinct LNs will be more expensive because a Btree lookup is
* required. Since this callback can cause LNs to be cleaned _before_ their
 * BIN slot is marked KnownDeleted, a LOG_FILE_NOT_FOUND LN fetch error is
* ignored (record is considered deleted) if the callback indicates the
* record is extinct.
*
* The reason for two scans is that we cannot start marking BIN slots
* deleted until LN utilization changes are flushed to disk, or the counts
* might be lost due to a crash. If the deletion of BIN slots is flushed by
* a checkpoint, then after a crash there would be no way to find them
* again in order to count the LNs.
*
* In the future we may be able to use a single scan, combining counting
* and slot deletion for non-duplicates DBs, if we can flush the LN
* utilization changes before a non-provisional IN is logged. This would
* require that splits are not logged, INs are logged provisionally by
* recovery, as well as changes to checkpointing and eviction. This
* complexity may not be worthwhile, since the scans are very quick.
*
* Discarded approach: calculate overall utilization using full scans to
* get rid of the complexity of transactional utilization counting, while
* using partial scans to update utilization more quickly for specific
* changes like drop table or DB removal. This doesn't work well because
* there is no way to accurately update overall utilization at the end
* of a full scan, because of variable overlap between what is counted
* by full and partial scans. A worst case scenario may be dropping a
* large table that is counted obsolete by both a partial and a full
* scan.
 *
* Ideas for implementing dynamic TTL using a similar approach.
* - Dynamic TTL and record extinction are different:
* - Extinction has no filtering requirement. Make the app do it.
* - Extinction has a trigger event for scanning.
 * - Since an expensive callback should not be used for filtering, the app
 *   must supply the TTL in ReadOptions. JE will filter using this value.
* - The TTL must also be specified using WriteOptions in order to update
* the histogram.
* - Do scans, similar to for record extinction, but instead of using a
* callback, use old/new TTL values supplied by the app.
* - When TTL is reduced, some slots will become obsolete and can be
* deleted as above.
* - Histogram is updated by subtracting slot/LN size for old TTL and
* adding it for new TTL.
* - How to prevent further TTL changes until the partial scan is complete?
* Perhaps add an API to return incomplete partial scans.
* Or do multiple scans really produce incorrect results?
* - An expensive (table lookup) dynamic TTL callback must still be used
* for eviction (as well as cleaning). This callback is distinct from
* the record extinction callback and it simply maps key to expiration
* time.
* - For cleaning LNs we must call the dynamic TTL callback and the record
* extinction callback, both of which do a table lookup. We should
* combine them somehow, to avoid two table lookups.
* How to handle TTL decrease?
* - Problem: May be dangling references: BIN slot still exists, but LN
* does not.
* - Solution: Consider it expired if slot has dynamic TTL. Table
* metadata must have a "TTL existed at one time" flag, even if it has
* been set to zero.
* How to handle TTL increase?
* - Problem: When TTL is increased, some already expired records will be
* resurrected.
* - Solution:
* - Calculate max record creation time for which records are expired
* using the _old_ TTL. Save that maxCreationTime in the table
* metadata.
* - When determining expiration, any record having a creation time LTE
* maxCreationTime should be considered expired. This is in addition to
* records with later creation times but that are expired according to
* the _new_ TTL.
* For example:
* - TTL is 1 month. Data is written during January-June.
* - On July 15, TTL is changed to 2 months.
* - Data for June has not been purged and will now have a TTL of 2 months.
* - Data prior to June 1 may have been purged and we must consider it
* expired.
* - TTL-min-creationDate is set to June 1. Data before this time is
* considered expired.
*
*/
private class RecordExtinction implements ExtinctionTask {
/* Fields describing the scan. */
private final long id;
        private final NavigableSet<Long> dbIds;
private final byte[] beginKey;
private final byte[] endKey;
private final ScanFilter filter;
private final String label;
private boolean countComplete;
/*
* When LN utilization is flushed before the COUNT phase is complete,
* we save the DB ID and the key of the last record read. On redo of a
* scan, LN utilization is counted only for records past this position.
*/
private final long flushAtLastDb;
private final byte[] flushAtLastKey;
/* Transient fields. */
private LocalUtilizationTracker tracker;
private long unflushedBytes;
private boolean countLNsThisDB;
private boolean checkFlushKeyThisDB;
private long scannedRecords;
private long extinctRecords;
/**
* Create a new scan when discardExtinctRecords is called.
*/
RecordExtinction(final long id,
                         final NavigableSet<Long> dbIds,
final DatabaseEntry beginKey,
final DatabaseEntry endKey,
final ScanFilter filter,
final String label) {
tracker = new LocalUtilizationTracker(envImpl);
this.id = id;
this.dbIds = dbIds;
this.beginKey =
(beginKey != null) ? LN.copyEntryData(beginKey) : null;
this.endKey =
(endKey != null) ? LN.copyEntryData(endKey) : null;
this.filter = filter;
this.label = label;
countComplete = false;
flushAtLastDb = 0;
flushAtLastKey = null;
}
/**
* Materialize a scan record that was read from the scans DB.
*/
RecordExtinction(final long id,
final TupleInput in) {
tracker = new LocalUtilizationTracker(envImpl);
this.id = id;
in.readPackedInt(); // Discard version for now
countComplete = in.readBoolean();
            /* DB IDs are delta encoded (differences between sorted IDs). */
final int nDbs = in.readPackedInt();
dbIds = new TreeSet<>();
long prevDbId = 0;
for (int i = 0; i < nDbs; i += 1) {
prevDbId += in.readPackedLong();
dbIds.add(prevDbId);
}
int len = in.readPackedInt();
if (len == 0) {
beginKey = null;
} else {
beginKey = new byte[len];
in.read(beginKey);
}
len = in.readPackedInt();
if (len == 0) {
endKey = null;
} else {
endKey = new byte[len];
in.read(endKey);
}
len = in.readPackedInt();
if (len == 0) {
filter = null;
} else {
final byte[] bytes = new byte[len];
in.read(bytes);
filter = (ScanFilter) DatabaseImpl.bytesToObject(
bytes, "ScanFilter", envImpl.getClassLoader());
}
label = in.readString();
flushAtLastDb = in.readPackedLong();
len = in.readPackedInt();
if (len == 0) {
flushAtLastKey = null;
} else {
flushAtLastKey = new byte[len];
in.read(flushAtLastKey);
}
}
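        /**
         * Writes the scan fields in the format read by the constructor
         * above: version, countComplete, the DB ID count followed by the
         * delta-encoded DB IDs, the begin/end keys, the serialized filter,
         * the label, and the flushed db/key position. Zero lengths encode
         * null keys and a null filter. For example, sorted DB IDs
         * {7, 9, 12} are written as the count 3 followed by the packed
         * longs 7, 2 and 3.
         */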
void serializeData(final long flushedDbId,
final byte[] flushedKey,
final TupleOutput out) {
out.writePackedInt(0); // Version
out.writeBoolean(countComplete);
            /* DB IDs are delta encoded (differences between sorted IDs). */
out.writePackedInt(dbIds.size());
long prevDbId = 0;
for (final long id : dbIds) {
out.writePackedLong(id - prevDbId);
prevDbId = id;
}
if (beginKey == null) {
out.writePackedInt(0);
} else {
out.writePackedInt(beginKey.length);
out.write(beginKey);
}
if (endKey == null) {
out.writePackedInt(0);
} else {
out.writePackedInt(endKey.length);
out.write(endKey);
}
if (filter == null) {
out.writePackedInt(0);
} else {
final byte[] bytes = DatabaseImpl.objectToBytes(
filter, "ScanFilter");
out.writePackedInt(bytes.length);
out.write(bytes);
}
out.writeString(label);
out.writePackedLong(flushedDbId);
if (flushedKey == null) {
out.writePackedInt(0);
} else {
out.writePackedInt(flushedKey.length);
out.write(flushedKey);
}
}
/**
* Inserts or updates a RecordExtinction task record.
*
* flushedDbId and flushedKey are specified when LN utilization has
* been flushed and we want to save the point where this happened by
* updating the record. In this case, insert should be false.
*
* When the env is replicated, the locker is used to determine whether
         * to replicate the write. A replicated locker should be used when
* insert is true. A non-replicated locker should be used to prevent
* replication when insert is false.
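         *
         * Call patterns in this class (from discardExtinctRecords and
         * flushUtilization; userLocker and localLocker are illustrative
         * names):
         * <pre>{@code
         * // Initial insert, replicated via the user's locker:
         * task.writeTask(0, null, userLocker, true);
         * // Mid-scan flush position update, local txn, not replicated:
         * writeTask(currentDbId, currentKey, localLocker, false);
         * }</pre>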
*/
void writeTask(final long flushedDbId,
final byte[] flushedKey,
final Locker locker,
final boolean insert) {
assert !envImpl.isReplicated() || insert == locker.isReplicated();
assert scanDb != null;
final DatabaseEntry keyEntry = new DatabaseEntry();
final DatabaseEntry dataEntry = new DatabaseEntry();
serializeKey(id, keyEntry);
final TupleOutput out = serializeType(RECORD_EXTINCTION);
serializeData(flushedDbId, flushedKey, out);
TupleBase.outputToEntry(out, dataEntry);
try (final Cursor cursor =
DbInternal.makeCursor(scanDb, locker, null)) {
if (DbInternal.putWithRepContext(
cursor, keyEntry, dataEntry,
insert ? PutMode.NO_OVERWRITE : PutMode.OVERWRITE,
locker.isReplicated() ?
scanDb.getRepContext() :
ReplicationContext.NO_REPLICATE) == null) {
throw EnvironmentFailureException.unexpectedState();
}
}
}
        @Override
        public boolean isExtinctionForDb(DatabaseId dbId) {
            return false;
        }
/**
* Ensures that the discardExtinctRecords txn has committed before a
* scan can proceed. This is important whether discardExtinctRecords
* was called on this master node, or is being replayed on a replica.
*
* If the record does not exist, the txn was aborted and this
* method removes the task from activeRecordTasks and returns false.
* For this to work reliably, the scan LN must be added to the Btree
* before this method is called.
*
         * If a lock timeout occurs, the commit/abort outcome is not yet
         * known, so we re-queue the task to check again later and return
         * false.
         *
         * @return true if the task is committed and can be run. Returns false
         * if the user txn was aborted or the record is still locked.
         */
private boolean isTaskCommitted() {
final Locker locker =
BasicLocker.createBasicLocker(envImpl, false /*noWait*/);
            /* Override the user's default timeout. */
locker.setLockTimeout(COMMIT_LOCK_TIMEOUT_MS);
final DatabaseEntry keyEntry = new DatabaseEntry();
serializeKey(id, keyEntry);
try (final Cursor cursor =
DbInternal.makeCursor(scanDb, locker, null)) {
if (cursor.get(keyEntry, null, Get.SEARCH, null) != null) {
return true;
} else {
activeRecordTasks.remove(id);
return false;
}
} catch (LockConflictException e) {
LoggerUtils.info(
logger, envImpl,
"Delaying extinct record scan," +
" discardExtinctRecord txn is still open, id=" +
id +" label=" + label);
assert threadPool != null;
threadPool.execute(this);
return false;
} finally {
locker.operationEnd();
}
}
/**
* Execute the two scans for all requested DBs. If we're re-running a
* scan, only count LN utilization if counting was not previously
* completed, and only from the saved db/key onward.
*/
@Override
public void run() {
assert !completedRecordScans.contains(id);
try {
if (!isTaskCommitted()) {
return;
}
LoggerUtils.info(
logger, envImpl,
"Start extinct record scan, id=" + id +" label=" + label);
final long pass1Scanned;
final long pass1Extinct;
if (!countComplete) {
if (beforeScan1Hook != null) {
beforeScan1Hook.doHook();
}
for (final long dbId : dbIds) {
if (shutdownRequested) {
return;
}
scanDb(dbId);
}
if (beforeScan1FlushHook != null) {
beforeScan1FlushHook.doHook();
}
countComplete = true;
flushUtilization(0, null);
pass1Scanned = scannedRecords;
pass1Extinct = extinctRecords;
scannedRecords = 0;
extinctRecords = 0;
} else {
pass1Scanned = 0;
pass1Extinct = 0;
}
if (beforeScan2Hook != null) {
beforeScan2Hook.doHook();
}
for (final long dbId : dbIds) {
if (shutdownRequested) {
return;
}
scanDb(dbId);
}
completedRecordScans.add(id);
LoggerUtils.info(
logger, envImpl,
"End extinct record scan, wait for checkpoint, id=" + id +
" pass1Scanned=" + pass1Scanned +
" pass1Extinct=" + pass1Extinct +
" pass2Scanned=" + scannedRecords +
" pass2Extinct=" + extinctRecords +
" label=" + label);
} catch (Exception e) {
LoggerUtils.warning(
logger, envImpl,
"Fatal exception during extinct record scan, id=" + id +
" label=" + label + ": " + e);
throw e;
}
}
/**
* Execute scan of a single DB. Perform the scan in batches. We must
* release the DB after each batch to allow for DB removal/truncation
* operations. We also delay after each batch to avoid hogging CPU.
*/
private void scanDb(final long dbId) {
final DbTree dbTree = envImpl.getDbTree();
final DatabaseEntry keyEntry = new DatabaseEntry();
/* dataEntry is used only for dup DBs. */
DatabaseEntry dataEntry = null;
boolean firstBatch = true;
while (true) {
envImpl.checkOpen();
if (shutdownRequested) {
return;
}
final DatabaseImpl dbImpl =
dbTree.getDb(new DatabaseId(dbId));
try {
if (dbImpl == null || dbImpl.isDeleting()) {
LoggerUtils.warning(
logger, envImpl,
"DB=" + dbId +
" deleted during extinct record scan," +
" id=" + id + " label=" + label);
return;
}
if (firstBatch) {
if (countComplete) {
countLNsThisDB = false;
checkFlushKeyThisDB = false;
} else {
/*
* LNs were already counted obsolete in a dup
* DB, so no need to count them here. Also no
* need to count LNs if the entire DB was
* counted in a prior scan.
*/
countLNsThisDB =
!dbImpl.isLNImmediatelyObsolete() &&
(flushAtLastKey == null ||
dbId >= flushAtLastDb);
if (!countLNsThisDB) {
return;
}
/* Only check the flush key if it is in this DB. */
checkFlushKeyThisDB =
flushAtLastKey != null &&
dbId == flushAtLastDb;
}
/* Setup scan begin key for initial positioning. */
if (beginKey != null) {
keyEntry.setData(beginKey);
}
/*
* Need data entry for repositioning in a dup db. The
* dup data is embedded, so this doesn't cause a fetch.
*/
if (dbImpl.getSortedDuplicates()) {
dataEntry = new DatabaseEntry();
}
}
if (!scanBatch(dbImpl, keyEntry, dataEntry, firstBatch)) {
return;
}
firstBatch = false;
} finally {
dbTree.releaseDb(dbImpl);
}
if (batchDelayMs > 0) {
try {
Thread.sleep(batchDelayMs);
} catch (InterruptedException e) {
LoggerUtils.warning(
logger, envImpl,
"Extinct record scan interrupted, id=" + id +
" label=" + label);
throw new ThreadInterruptedException(envImpl, e);
}
}
}
}
/**
* Scan a single batch (within a given DB).
*
* Returns true if the batch size is reached and there are more
* records in the DB. When true is returned, the key/data pair is
* the next record to be processed.
*
* Returns false if DB is finished or we are shutting down.
*/
private boolean scanBatch(
final DatabaseImpl dbImpl,
final DatabaseEntry keyEntry,
final DatabaseEntry dataEntry,
final boolean firstBatch) {
final Locker locker =
LockerFactory.getInternalReadOperationLocker(envImpl);
try (final Cursor cursor =
DbInternal.makeCursor(dbImpl, locker, null, false)) {
DbInternal.excludeFromOpStats(cursor);
cursor.setCacheMode(CacheMode.UNCHANGED);
/*
* If the scan does not have a begin key, start at the first
* record in the DB.
*
* Else if the scan has a begin key and this is the first
* batch, use SEARCH_GTE to position at the start of the first
* batch. Note that this will position at the first dup for
* a given secondary key.
*
* Otherwise use SEARCH_ANY_GTE to move to the next record,
* which was saved in the key/data entry at the end of the
* prior batch. If that record has been deleted, this will
* position at the next highest existing key/data pair.
*/
if (cursor.get(
keyEntry, dataEntry,
(keyEntry.getData() == null) ? Get.FIRST :
(firstBatch ? Get.SEARCH_GTE : Get.SEARCH_ANY_GTE),
NOLOCK_UNCHANGED) == null) {
return false;
}
final long prevScanned = scannedRecords;
while (true) {
if (shutdownRequested) {
return false;
}
final CursorImpl cursorImpl =
DbInternal.getCursorImpl(cursor);
cursorImpl.latchBIN();
try {
final BIN bin = cursorImpl.getBIN();
bin.mutateToFullBIN(false /*leaveFreeSlot*/);
if (!scanBIN(bin, cursorImpl.getIndex())) {
return false;
}
/* Cause NEXT op below to move to next BIN. */
cursorImpl.setIndex(bin.getNEntries() - 1);
} finally {
cursorImpl.releaseBIN();
}
if (cursor.get(
keyEntry, dataEntry, Get.NEXT,
NOLOCK_UNCHANGED) == null) {
return false;
}
                    if (unflushedBytes >= flushObsoleteBytes) {
                        /*
                         * This feature is disabled for now. It is not clear
                         * whether it is needed, and it is not well tested.
                         *
                        flushUtilization(
                            dbImpl.getId().getId(),
                            cursorImpl.getCurrentKey());
                        */
                        /* Reset the counter even though nothing is flushed. */
                        unflushedBytes = 0;
                    }
if (scannedRecords - prevScanned >= batchSize) {
return true;
}
}
} finally {
locker.operationEnd();
}
}
/**
* Flush tracked LN utilization when it has exceeded the size limit.
* Update the scan record in the DB to save the db/key of the last
* counted LN.
*/
void flushUtilization(final long currentDbId,
final byte[] currentKey) {
/*
* Flush FileSummaryLNs prior to flushing task records. If there
* is a crash between the two, double counting (over cleaning) can
* occur, but under counting (a disk leak) cannot occur.
*/
envImpl.getUtilizationProfile().flushLocalTracker(tracker);
tracker = new LocalUtilizationTracker(envImpl);
/*
* Use non-replicated locker so the update is not replicated.
* The extinction records are updated independently on each node.
*/
final Locker locker = Txn.createLocalAutoTxn(envImpl, NO_SYNC_TXN);
boolean success = false;
try {
writeTask(
currentDbId, currentKey, locker, false /*insert*/);
success = true;
} finally {
locker.operationEnd(success);
}
}
/**
* Scan a single BIN, latched by the caller. Marks all extinct slots
* as known-deleted and preempts any locks held on them. If there are
* any extinct slots, adds the BIN to the compressor queue.
*
* For each extinct slot the LN is counted obsolete in the local
* tracker, but only if it was not previously counted obsolete.
*
* Returns true if there may be more records for this DB, or false if
* DB is finished.
*/
private boolean scanBIN(final BIN bin, final int startIndex) {
final DatabaseImpl dbImpl = bin.getDatabase();
final int nEntries = bin.getNEntries();
boolean checkEndKeyThisBIN = (endKey != null);
boolean countLNsThisBIN = countLNsThisDB;
boolean checkFlushKeyThisBIN =
countLNsThisBIN && checkFlushKeyThisDB;
/*
* Avoid checking every key in the BIN by checking the last key,
* whenever possible. This is only worthwhile if there are more
* than just a couple keys in the BIN.
*/
if (nEntries - startIndex > 3 &&
(checkEndKeyThisBIN || checkFlushKeyThisBIN)) {
final byte[] lastBinKey = bin.getKey(nEntries - 1);
if (checkEndKeyThisBIN) {
/*
* If the BIN's last key is LT the scan end key, then all
* keys in the BIN are also LT the scan end key.
*/
if (!passedEndKey(lastBinKey, dbImpl)) {
checkEndKeyThisBIN = false;
}
}
if (checkFlushKeyThisBIN) {
/*
* If the BIN's last key is LTE the key of the last LN
* counted, then all LNs in this BIN have been counted.
*/
if (Key.compareKeys(
lastBinKey, flushAtLastKey,
dbImpl.getKeyComparator()) <= 0) {
countLNsThisBIN = false;
checkFlushKeyThisBIN = false;
}
}
}
boolean moreKeys = true;
boolean addToCompressorQueue = false;
for (int i = startIndex; i < nEntries && moreKeys; i += 1) {
scannedRecords += 1;
/*
* Don't get the BIN key unless we need it, since this
* normally involves creating a new byte array.
*/
byte[] slotKey = null;
if (checkEndKeyThisBIN) {
slotKey = bin.getKey(i);
if (passedEndKey(slotKey, dbImpl)) {
moreKeys = false;
break;
}
}
if (filter != null) {
if (slotKey == null) {
slotKey = bin.getKey(i);
}
final byte[] priKey = dbImpl.getSortedDuplicates() ?
DupKeyData.getData(slotKey, 0, slotKey.length) :
slotKey;
                    switch (filter.checkKey(priKey)) {
                    case INCLUDE:
                        break;
                    case EXCLUDE:
                        continue;
                    case INCLUDE_STOP:
                        /* Count this record extinct, then stop the scan. */
                        moreKeys = false;
                        break;
                    case EXCLUDE_STOP:
                        /* Skip this record and stop the scan. */
                        moreKeys = false;
                        continue;
                    }
}
extinctRecords += 1;
/* Mark known-deleted if we're doing the 2nd scan. */
if (countComplete) {
bin.setKnownDeleted(i);
addToCompressorQueue = true;
}
/*
* It is extinct, but do not count obsolete if already counted.
*
* An LN was previously counted obsolete if in a dup DB, and
* countLNsThisBIN will be false in this case. If it is
* embedded or defunct then it was also previously counted.
*/
if (!countLNsThisBIN ||
bin.isEmbeddedLN(i) ||
bin.isDefunct(i)) {
continue;
}
/* If it is transient or null, then it was never logged. */
final long lsn = bin.getLsn(i);
if (DbLsn.isTransientOrNull(lsn)) {
continue;
}
/*
* It may have been counted by a previous incomplete scan.
* We must check flushAtLastKey if it is in this DB and may be
* in this BIN. Note that countLNsThisBIN is false if the
* flush key comes after the keys in this BIN.
*/
if (checkFlushKeyThisBIN) {
if (slotKey == null) {
slotKey = bin.getKey(i);
}
if (Key.compareKeys(
slotKey, flushAtLastKey,
dbImpl.getKeyComparator()) < 0) {
continue;
}
/* BIN key is GT the flush key. */
checkFlushKeyThisBIN = false;
checkFlushKeyThisDB = false;
}
final int size = bin.getLastLoggedSize(i);
tracker.countObsolete(
lsn, null, size,
false /*trackOffset*/, false /*checkDupOffset*/);
unflushedBytes += size;
}
if (addToCompressorQueue) {
/*
* To support erasure of extinct data, ensure that extinct
* slots are deleted before logging by prohibiting the next
* BIN-delta. Slots cannot be deleted when logging a delta.
*/
bin.setProhibitNextDelta(true);
envImpl.addToCompressorQueue(bin);
}
return moreKeys;
}
/**
* Returns whether the given BIN key is GTE the scan's end key.
*/
private boolean passedEndKey(final byte[] binKey,
final DatabaseImpl dbImpl) {
return dbImpl.getSortedDuplicates() ?
DupKeyData.compareMainKey(
binKey, endKey, 0, endKey.length,
dbImpl.getBtreeComparator()) >= 0 :
Key.compareKeys(
binKey, endKey,
dbImpl.getBtreeComparator()) >= 0;
}
private boolean isKeyTargeted(final DatabaseImpl dbImpl,
final byte[] key) {
if (!dbIds.contains(dbImpl.getId().getId())) {
return false;
}
if (beginKey != null &&
compareMainKey(key, beginKey, dbImpl) < 0) {
return false;
}
if (endKey != null &&
compareMainKey(key, endKey, dbImpl) > 0) {
return false;
}
if (filter != null && !filter.checkKey(key).getInclude()) {
return false;
}
return true;
}
}
private static int compareMainKey(final byte[] key1,
final byte[] key2,
final DatabaseImpl dbImpl) {
return dbImpl.getSortedDuplicates() ?
DupKeyData.compareMainKey(
key1, key2,
dbImpl.getBtreeComparator()) :
Key.compareKeys(
key1, key2,
dbImpl.getBtreeComparator());
}
/* ---------------------------*
* Database Extinction methods
* ---------------------------*/
/**
* Must be called before writing a NameLN, when {@link #startDbExtinction}
* will be called, i.e., for a DB remove/truncate operation. This method
* ensures that the scan DB can be opened on the replica when replaying
* the NameLN.
*
* Returns without taking any action if this is a replica and the NameLN
* for the scan DB has not yet been replayed (the DB does not exist), and
* the truncate/remove operation itself is not replicated. See
* startDbExtinction for handling of this special case.
*
* @throws com.sleepycat.je.rep.ReplicaWriteException if this is a replica
* node and the scan DB NameLN has not yet been replayed.
*/
public void prepareForDbExtinction(final ReplicationContext repContext) {
try {
openDb();
} catch (OperationFailureException e) {
if (e.isReplicaWrite() && !repContext.inReplicationStream()) {
/* startDbExtinction will execute extinction synchronously. */
return;
}
throw e;
}
}
/**
* Used after transaction commit or non-transactional operation end in
* these cases:
*
* - purge the deleted database after a commit of
* Environment.removeDatabase
*
* - purge the deleted database after a commit of
* Environment.truncateDatabase
*
* - purge the newly created database after an abort of
* Environment.truncateDatabase
     *
* Note that the processing of the naming tree means the MapLN is never
* actually accessible from the current tree, but making the DB extinct
* is necessary to reclaim the memory and disk space it occupies.
* This method initiates an async task to perform the following:
*
* - Release the INs for the deleted database,
* - count all log entries for this database as obsolete,
* - delete the MapLN,
* - set the DatabaseImpl state to DELETE_FINISHED,
* - and call DbTree.releaseDb.
     *
* In one special case the task is executed synchronously (see
* below).
*/
public void startDbExtinction(final DatabaseImpl dbImpl)
throws DatabaseException {
assert dbImpl.isDeleting();
assert !dbImpl.isDeleteFinished();
assert threadPool != null;
assert TestHookExecute.doHookIfSet(dbBeforeWriteTaskHook);
/*
* If scanDb is null then prepareForDbExtinction encountered a
* ReplicaWriteException when trying to open the scan DB because it
* does not yet exist on this replica, and the truncate/remove
* operation is not replicated. In this corner case, we have no choice
* but to execute the DB extinction task synchronously. This is
* acceptable in that it should be rare and will not cause a delay
* during replica replay (because the operation is not replicated).
*/
if (scanDb == null) {
/* Task ID zero indicates it is non-persistent. */
final DatabaseExtinction task = new DatabaseExtinction(0, dbImpl);
task.run();
} else {
/* Otherwise, add task to the scan DB. */
final DatabaseExtinction task = new DatabaseExtinction(
lastNonRepScanID.incrementAndGet(), dbImpl);
task.writeTask();
/*
* If recovery is complete then execute the task, else save it
* until recovery is complete.
*/
if (recoveredTasks == null) {
threadPool.execute(task);
} else {
recoveredTasks.add(task);
}
}
}
/**
* Called during recovery when a NameLN delete/truncate is not followed by
* the appropriate MapLN deletion. Add an async task for the DB extinction
* if one does not already exist. Ensures that DbTree.releaseDb is called.
*/
public void ensureDbExtinction(final DatabaseImpl dbImpl) {
assert recoveredTasks != null;
final DatabaseId dbId = dbImpl.getId();
for (final ExtinctionTask task : recoveredTasks) {
if (task.isExtinctionForDb(dbId)) {
envImpl.getDbTree().releaseDb(dbImpl);
return;
}
}
dbImpl.setDeleteStarted();
startDbExtinction(dbImpl);
}
/**
* Entire databases become extinct when they are removed or truncated by
* the app (via {@link Environment#removeDatabase} or {@link
* Environment#truncateDatabase), or automatically in the case of
* temporary DBs. We cannot perform utilization counting synchronously
* (in the app operation thread) because it could cause timeouts, most
* importantly during replica replay. Therefore, a record is added to
* the scan DB and the database extinction is performed asynchronously,
* like record extinction.
*
* You may ask, why can't we rely on redo of the NameLN or MapLN to
* trigger retry of utilization counting after a crash, to avoid inserting
* a record in the scan DB? The reasons are:
*
* - The NameLN is logged/committed before counting. So it will
* not be replayed by recovery when a checkpoint occurs after the
* commit. So counting will not be retried if the following sequence
* occurs before counting is complete: NameLN commit, checkpoint,
* crash.
*
* - The MapLN cannot be deleted until after counting, since
* deleting it before counting would make its Btree inaccessible for
* retrying after a crash. This is because deletion flushes the
* MapLN and its possibly outdated rootLSN, without first flushing
* its descendant INs.
*
*
* The statement about MapLN deletion above also implies that dirty
* nodes in a DB that has not yet been counted must be flushed by the
* checkpointer. Otherwise, their children would become inaccessible
* after the checkpoint, if a crash occurs before counting is complete.
* This scenario is tested by testRemoveWithIntermediateCheckpoint in
* TruncateAndRemoveTest.
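     *
     * The trigger sequence within JE (a sketch; see prepareForDbExtinction
     * and startDbExtinction above):
     * <pre>{@code
     * scanner.prepareForDbExtinction(repContext); // before writing NameLN
     * // ... NameLN is written and the user txn commits ...
     * scanner.startDbExtinction(dbImpl); // store task record, queue task
     * }</pre>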
*/
private class DatabaseExtinction implements ExtinctionTask {
private final long id;
private final DatabaseId dbId;
/* dbImpl is null when task is materialized from scan DB. */
private DatabaseImpl dbImpl;
DatabaseExtinction(final long id, final DatabaseImpl dbImpl) {
this.id = id;
dbId = dbImpl.getId();
this.dbImpl = dbImpl;
}
DatabaseExtinction(final long id, final TupleInput in) {
this.id = id;
in.readPackedInt(); // Discard version for now
dbId = new DatabaseId(in.readPackedLong());
}
void serializeData(final TupleOutput out) {
out.writePackedInt(0); // Version
out.writePackedLong(dbId.getId());
}
void writeTask() {
assert scanDb != null;
assert id >= 0;
final DatabaseEntry keyEntry = new DatabaseEntry();
final DatabaseEntry dataEntry = new DatabaseEntry();
serializeKey(id, keyEntry);
final TupleOutput out = serializeType(DATABASE_EXTINCTION);
serializeData(out);
TupleBase.outputToEntry(out, dataEntry);
/*
* Use non-replicated locker so the update is not replicated.
* The NameLN, which is replicated, is the trigger for inserting
* the extinction record independently on each node.
*/
final Locker locker = Txn.createLocalAutoTxn(envImpl, NO_SYNC_TXN);
boolean success = false;
try (final Cursor cursor =
DbInternal.makeCursor(scanDb, locker, null)) {
if (DbInternal.putWithRepContext(
cursor, keyEntry, dataEntry,
PutMode.NO_OVERWRITE,
ReplicationContext.NO_REPLICATE) == null) {
throw EnvironmentFailureException.unexpectedState();
}
success = true;
} finally {
locker.operationEnd(success);
}
}
        @Override
        public boolean isExtinctionForDb(final DatabaseId dbId) {
return dbId.equals(this.dbId);
}
@Override
public void run() {
assert TestHookExecute.doHookIfSet(dbBeforeExecTaskHook);
final DbTree dbTree = envImpl.getDbTree();
if (dbImpl == null) {
/* Task was materialized from DB. */
dbImpl = dbTree.getDb(dbId);
if (dbImpl == null) {
/* Extinction was completed earlier. */
deleteTask(id);
return;
}
dbImpl.setDeleteStarted();
} else {
/*
* The truncate/remove operation has already called
* DbTree.getDb and DatabaseImpl.setDeleteStarted().
*/
assert dbImpl.isInUse();
assert dbImpl.isDeleting();
assert !dbImpl.isDeleteFinished();
}
try {
final Tree tree = dbImpl.getTree();
final long rootLsn = tree.getRootLsn();
final IN rootIN = tree.getResidentRootIN(false);
final LocalUtilizationTracker tracker =
new LocalUtilizationTracker(envImpl);
if (rootLsn != DbLsn.NULL_LSN) {
tracker.countObsoleteNodeInexact(
rootLsn, LogEntryType.LOG_IN, 0);
}
/* Fetch LNs to count LN sizes only if so configured. */
final boolean fetchLNSize =
envImpl.getCleaner().getFetchObsoleteSize(dbImpl);
/* Use SLTW to visit every child LSN in the tree. */
final ObsoleteProcessor obsoleteProcessor =
new ObsoleteProcessor(dbImpl, tracker, id);
final SortedLSNTreeWalker walker = new ObsoleteTreeWalker(
dbImpl, rootLsn, fetchLNSize, obsoleteProcessor, rootIN);
LoggerUtils.info(
logger, envImpl,
"Start DB remove/truncate scan, id=" + id +
" dbId=" + dbImpl.getId() +
" dbName=" + dbImpl.getName());
/*
* At this point, it's possible for the evictor to find an IN
* for this database on the INList. It should be ignored.
*/
walker.walk();
LoggerUtils.info(
logger, envImpl,
"End DB remove/truncate scan, id=" + id +
" scanned=" + obsoleteProcessor.scannedRecords +
" dbId=" + dbImpl.getId() +
" dbName=" + dbImpl.getName());
/*
* Delete MapLN after flushing FileSummaryLNs, since the
* presence of the MapLN is used to determine whether counting
* is needed after a crash. More importantly, if the MapLN
* were deleted before utilization counting is complete (and
* flushed), the INs of the extinct DB would become unavailable
* (and perhaps even cleaned) making it impossible to count
* utilization of the IN's child nodes.
*/
envImpl.getUtilizationProfile().flushLocalTracker(tracker);
assert TestHookExecute.doHookIfSet(dbBeforeDeleteMapLNHook);
dbTree.deleteMapLN(dbImpl.getId());
/*
* Flush the FileSummaryLNs and MapLN deletion to reduce the
* window for double-counting after a crash.
*/
envImpl.getLogManager().flushNoSync();
assert TestHookExecute.doHookIfSet(dbBeforeDeleteTaskLNHook);
/*
* The task record is no longer needed. While we could flush
* the log here, it isn't necessary -- if this task is
* processed again after a crash, it will find the MapLN
* deleted and will do nothing.
*/
deleteTask(id);
/*
* Remove INs from cache only after deleting the MapLN, since
* it references the root IN.
*/
removeFromINList();
/*
* Deletion is finished. The cleaner can now delete the files
* containing the DB's entries. And the evictor will expect
* that no INs for deleted DBs are in the INList.
*/
dbImpl.setDeleteFinished();
} finally {
/*
* Unconditionally call releaseDb to balance the call to getDb
* by the truncate or remove method.
*/
dbTree.releaseDb(dbImpl);
}
}
private void removeFromINList() {
long memoryChange = 0;
try {
                final Iterator<IN> iter =
                    envImpl.getInMemoryINs().iterator();
while (iter.hasNext()) {
final IN in = iter.next();
if (in.getDatabase() != dbImpl) {
continue;
}
iter.remove();
memoryChange -= in.getBudgetedMemorySize();
}
} finally {
envImpl.getMemoryBudget().
updateTreeMemoryUsage(memoryChange);
}
}
}
private static class ObsoleteTreeWalker extends SortedLSNTreeWalker {
private final IN rootIN;
private ObsoleteTreeWalker(final DatabaseImpl dbImpl,
final long rootLsn,
final boolean fetchLNSize,
final TreeNodeProcessor callback,
final IN rootIN)
throws DatabaseException {
super(new DatabaseImpl[] { dbImpl },
new long[] { rootLsn },
callback,
null, /* savedException */
null); /* exception predicate */
accumulateLNs = fetchLNSize;
this.rootIN = rootIN;
/*
* FUTURE: Set mem limit to remainder of JE cache size during
* recovery. For now, use a smallish value (50 MB).
*/
setInternalMemoryLimit(50L * 1024L * 1024L);
}
@Override
public IN getResidentRootIN(@SuppressWarnings("unused")
final DatabaseImpl ignore) {
if (rootIN != null) {
rootIN.latchShared();
}
return rootIN;
}
}
/* Mark each LSN obsolete in the utilization tracker. */
private class ObsoleteProcessor
implements SortedLSNTreeWalker.TreeNodeProcessor {
private final LocalUtilizationTracker tracker;
private final boolean isLnImmediatelyObsolete;
private final DatabaseImpl dbImpl;
private final long id;
private long scannedRecords;
ObsoleteProcessor(final DatabaseImpl dbImpl,
final LocalUtilizationTracker tracker,
final long id) {
this.dbImpl = dbImpl;
this.tracker = tracker;
this.id = id;
this.isLnImmediatelyObsolete = dbImpl.isLNImmediatelyObsolete();
}
@Override
public void processLSN(final long childLsn,
final LogEntryType childType,
final Node node,
final byte[] lnKey,
final int lastLoggedSize,
final boolean isEmbedded) {
if (isEmbedded || DbLsn.isTransientOrNull(childLsn)) {
return;
}
/*
* Count the LN log size if an LN node and key are available, i.e.,
* we are certain this is an LN. [#15365]
*/
int size = 0;
if (lnKey != null && childType.isLNType()) {
if (isLnImmediatelyObsolete) {
return;
}
size = lastLoggedSize;
}
tracker.countObsoleteNodeInexact(childLsn, childType, size);
noteScanned();
}
@Override
public void processDirtyDeletedLN(final long childLsn,
final LN ln,
@SuppressWarnings("unused")
final byte[] lnKey) {
assert ln != null;
/*
* Do not count the size (pass zero) because the LN is dirty and
* the logged LN is not available.
*/
tracker.countObsoleteNodeInexact(
childLsn, ln.getGenericLogType(), 0);
noteScanned();
}
private void noteScanned() {
scannedRecords += 1;
if (scannedRecords % batchSize != 0) {
return;
}
if (batchDelayMs <= 0) {
return;
}
try {
Thread.sleep(batchDelayMs);
} catch (InterruptedException e) {
LoggerUtils.warning(
logger, envImpl,
"DB remove/truncate scan interrupted, id=" + id +
" dbId=" + dbImpl.getId() +
" dbName=" + dbImpl.getName());
throw new ThreadInterruptedException(envImpl, e);
}
}
@Override
public void noteMemoryExceeded() {
/* Do nothing. */
}
}
}