/*-
 * Copyright (C) 2002, 2018, Oracle and/or its affiliates. All rights reserved.
 *
 * This file was distributed by Oracle as part of a version of Oracle Berkeley
 * DB Java Edition made available at:
 *
 * http://www.oracle.com/technetwork/database/database-technologies/berkeleydb/downloads/index.html
 *
 * Please see the LICENSE file included in the top-level directory of the
 * appropriate version of Oracle Berkeley DB Java Edition for a copy of the
 * license and additional information.
 */
package com.sleepycat.je.cleaner;

import static com.sleepycat.je.ExtinctionFilter.ExtinctionStatus.EXTINCT;
import static com.sleepycat.je.ExtinctionFilter.ExtinctionStatus.MAYBE_EXTINCT;

import java.io.EOFException;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.text.DateFormat;
import java.util.Collections;
import java.util.Date;
import java.util.NavigableMap;
import java.util.NavigableSet;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.logging.Level;
import java.util.logging.Logger;

import com.sleepycat.bind.tuple.TupleBase;
import com.sleepycat.bind.tuple.TupleInput;
import com.sleepycat.bind.tuple.TupleOutput;
import com.sleepycat.je.CacheMode;
import com.sleepycat.je.DatabaseEntry;
import com.sleepycat.je.EnvironmentFailureException;
import com.sleepycat.je.EnvironmentMutableConfig;
import com.sleepycat.je.ExtinctionFilter;
import com.sleepycat.je.StatsConfig;
import com.sleepycat.je.config.EnvironmentParams;
import com.sleepycat.je.dbi.DatabaseId;
import com.sleepycat.je.dbi.DatabaseImpl;
import com.sleepycat.je.dbi.DbConfigManager;
import com.sleepycat.je.dbi.EnvConfigObserver;
import com.sleepycat.je.dbi.EnvironmentFailureReason;
import com.sleepycat.je.dbi.EnvironmentImpl;
import com.sleepycat.je.dbi.MetadataStore;
import com.sleepycat.je.log.ChecksumException;
import com.sleepycat.je.log.FileManager;
import com.sleepycat.je.log.FileReader;
import com.sleepycat.je.log.LogEntryHeader;
import com.sleepycat.je.log.LogEntryType;
import com.sleepycat.je.log.entry.ErasedLogEntry;
import com.sleepycat.je.log.entry.LNLogEntry;
import com.sleepycat.je.log.entry.LogEntry;
import com.sleepycat.je.tree.BIN;
import com.sleepycat.je.tree.SearchResult;
import com.sleepycat.je.utilint.DbLsn;
import com.sleepycat.je.utilint.IntStat;
import com.sleepycat.je.utilint.LoggerUtils;
import com.sleepycat.je.utilint.LongStat;
import com.sleepycat.je.utilint.Pair;
import com.sleepycat.je.utilint.PollCondition;
import com.sleepycat.je.utilint.PropUtil;
import com.sleepycat.je.utilint.StatGroup;
import com.sleepycat.je.utilint.StoppableThread;
import com.sleepycat.je.utilint.StringStat;
import com.sleepycat.je.utilint.TestHook;
import com.sleepycat.je.utilint.TracerFormatter;
import com.sleepycat.je.utilint.VLSN;
import com.sleepycat.utilint.FormatUtil;

/**
 * Erases obsolete data from disk during a configured interval. [#26954]
 *
 * Basics
 * ======
 * - The purpose of the erasure feature is to erase obsolete user data within
 *   some reasonable period after becoming obsolete, and to do this without
 *   adding significant overhead that might impact application performance.
 *
 * - There are no strict guarantees about when erasure is complete. This
 *   means it is best to think of erasure as a "best effort". Some examples
 *   where data is not always erased according to the erasure period are:
 *
 *   1. If a node is down for an extended time, obviously no erasure will
 *      occur until it comes back up. After it comes up, there may not be
 *      enough time left in the cycle for erasure to be completed. The rate
 *      of erasure is not increased in this situation. The erasure rate is
 *      kept constant throughout each cycle to avoid resource usage spikes.
 *
 *   2. Data at the tail end of the data log will not be erased because it is
 *      needed for recovery and replication. In this case, the data will
 *      eventually be erased as more writing occurs.
 *
 *   3. UINs (upper internal nodes in the Btree) are not erased, and
 *      therefore keys in UINs, which can be considered user data, are not
 *      erased. Erasing UINs would be very difficult because ancestor slot
 *      keys would have to be adjusted and zeroing the entire log entry
 *      would not be possible. For now this is simply a limitation due to
 *      operational considerations.
 *
 *   4. From a performance viewpoint, erasure is designed so that each erasure
 *      cycle will be completed over a multi-day period. If the erasure period
 *      (see below) is set to a small value for testing, the cycle may not
 *      be completed and this is acceptable behavior. The minimum time
 *      needed to complete the cycle is dependent on the total size of the
 *      files to be erased in the cycle, the load on the system, and other
 *      factors.
 *
 * - Currently the erasure feature is hidden in the JE API. It may be exposed
 *   in the future.
 *
 * - The eraser overall may be enabled or disabled using the following param.
 *   It is disabled by default.
 *
 *     je.env.runEraser -- boolean, default: false, mutable: yes
 *
 * - Currently only erasure of data in deleted DBs (removed or truncated) and
 *   erasure of extinct records are supported. These are enabled via the
 *   following two params. These are both false by default.
 *
 *     je.erase.deletedDatabases -- boolean, default: false, mutable: yes
 *     je.erase.extinctRecords -- boolean, default: false, mutable: yes
 *
 *   By setting both of these parameters to true in NoSQL DB, obsolete data
 *   will be removed for dropped tables and dropped indexes.
 *
 * - In the future we may add the ability to erase expired data (via TTL) and
 *   data that has been updated or deleted. This will be more expensive and
 *   complex, and will probably be enabled via additional config params.
 *
 * - During each N day cycle, we erase all files that were N days old (or
 *   older) at the start of the cycle. Therefore, the first erasure for a file
 *   occurs when it is between N and N*2 days old. Thereafter, the file is
 *   erased every N days. The erasure period is set using the following param,
 *   which is zero by default; the period must be explicitly set to perform
 *   erasure.
 *
 *     je.erase.period -- duration, default: 0, min: 0, max: none, mutable: yes
 *
 * - Here's an example for how the erasure period works in practice.
 *
 *     Let's say the erasure period is 30 min. At the start of each 30 min
 *     erasure cycle, the data files older than 30 min are selected. Those
 *     files should normally be erased by the end of the cycle. Then a new
 *     cycle starts and a new set of files older than 30 minutes is
 *     selected.
 *
 *     Let's assume that writing is continuous and the checkpoint interval
 *     is fairly small, and therefore we won't run into the problem that
 *     data at the end of the log cannot be erased. (This problem is
 *     described in more detail in a later section.)
 *
 *     Then at any time, roughly speaking, all data that became obsolete more
 *     than 60 minutes ago should be erased. The erasure period is meant to be
 *     half of the desired interval after which obsolete data should disappear.
 *
 *     The erasure doesn't have to be complete exactly at the end of each
 *     erasure period. In production we expect the erasure period to be set to
 *     less than half of the desired interval to allow for impreciseness,
 *     problems, etc. For example, if the desired interval is 60 days in
 *     production, the erasure period would normally be set to around 25 days.
 *
 *  - Here's an example of how params could be set to ensure that obsolete
 *    data is removed within 60 days of becoming obsolete:
 *
 *      je.env.runEraser=true
 *      je.erase.deletedDatabases=true
 *      je.erase.extinctRecords=true
 *      je.erase.period=25 days
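 *
 *    As a sketch only (hedged, not part of this class): since the feature
 *    is hidden, the params are set by name via setConfigParam; "envHome"
 *    below is a hypothetical directory:
 *
 *      EnvironmentConfig config = new EnvironmentConfig();
 *      config.setAllowCreate(true);
 *      config.setConfigParam("je.env.runEraser", "true");
 *      config.setConfigParam("je.erase.deletedDatabases", "true");
 *      config.setConfigParam("je.erase.extinctRecords", "true");
 *      config.setConfigParam("je.erase.period", "25 days");
 *      Environment env = new Environment(envHome, config);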
 *
 * - WARNING: Erasure cannot be used for applications that rely on the usual
 *   immutable nature of JE data files. For example, if DbBackup is used to
 *   create a snapshot using file links, the assumption that the files will
 *   not be modified by JE no longer holds when erasure is enabled. For
 *   example, NoSQL DB snapshots (which are just links to data files) are
 *   invalidated by data erasure. To perform a valid backup, NoSQL DB must
 *   actually copy the files while they're protected by DbBackup (between
 *   the calls to startBackup and endBackup).
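 *
 *   A minimal copy-based backup sketch using the DbBackup API (hedged:
 *   copyFile, backupDir and envHome are hypothetical helpers, not part
 *   of JE):
 *
 *     DbBackup backup = new DbBackup(env);
 *     backup.startBackup();
 *     try {
 *         for (String name : backup.getLogFilesInBackupSet()) {
 *             copyFile(new File(envHome, name), backupDir);
 *         }
 *     } finally {
 *         backup.endBackup();
 *     }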
 *
 * Trace Logging
 * =============
 * Trace logging is performed for the following erasure cycle events. All
 * messages are INFO-level messages except when stated otherwise.
 *
 *  - A fresh cycle starts. This occurs at startup, or after the prior
 *    cycle finishes and its end time passes, or when the prior cycle is
 *    aborted because its end time has passed.
 *
 *  - A cycle finishes because erasure of all files is complete. The
 *    message is logged immediately after all files have been erased, even
 *    if the cycle end time has not yet passed. The eraser will then be
 *    idle until the cycle end time passes.
 *
 *  - A cycle aborts because its end time has passed but is incomplete
 *    because not all files were erased. A fresh cycle is then started.
 *    This is a WARNING-level message when files cannot be erased because
 *    they are protected for reasons that are expected to be short lived
 *    (backup, replication) but have persisted until the end of the cycle.
 *    In other situations this is an INFO-level message, for example:
 *    - when the files are needed for recovery,
 *    - when the files are needed to retain at least 1000 VLSNs (or
 *      {@link com.sleepycat.je.rep.impl.RepParams#MIN_VLSN_INDEX_SIZE}),
 *    - when the files do not contain any VLSNs and therefore the VLSNIndex
 *      cannot be truncated at any specific point (this is a corner case).
 *
 *  - A cycle is suspended because the JE Environment is closed and not
 *    all files were erased. The cycle will be resumed at the next
 *    Environment open, if the cycle end time has not passed.
 *
 *  - An incomplete cycle is resumed at startup. The cycle was incomplete
 *    at the last Environment close, and its end time has not yet passed.
 *
 *  - An incomplete cycle cannot be resumed at startup because its end
 *    time has now passed. A fresh cycle is then started.
 *
 * All messages start with ERASER and include the cycle start/end times, the
 * files processed, and the files that are yet to be processed.
 *
 * Stats
 * =====
 * Stat Group: Eraser
 *
 * eraserCycleStart - Erasure cycle start time (UTC).
 * eraserCycleEnd - Erasure cycle end time (UTC).
 * eraserFilesRemaining - Number of files still to be processed in erasure
 *                        cycle.
 * eraserFilesErased - Number of files erased by overwriting obsolete entries.
 * eraserFilesDeleted - Number of reserved files deleted by the eraser.
 * eraserFilesAlreadyDeleted - Number of reserved files deleted coincidentally
 *                             by the cleaner.
 * eraserFSyncs - Number of fsyncs performed by the eraser.
 * eraserReads - Number of file reads performed by the eraser.
 * eraserReadBytes - Number of bytes read by the eraser.
 * eraserWrites - Number of file writes performed by the eraser.
 * eraserWriteBytes - Number of bytes written by the eraser.
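 *
 * These stats are loaded into the "Eraser" group by {@link #loadStats}. A
 * minimal sketch of reading them via the public stats API, assuming env is
 * an open Environment with erasure enabled:
 *
 *   StatsConfig config = new StatsConfig();
 *   config.setClear(false);
 *   EnvironmentStats stats = env.getStats(config);
 *   System.out.println(stats); // output includes the Eraser group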
 *
 * Erasing data at the end of log
 * ==============================
 * When writing stops, the user data at the tail end of the log may need
 * erasure at some point. This would happen if a table is dropped and then
 * there is no writing for a long time (or very little writing). This is a
 * corner case that we do not expect in production, but it can still happen
 * there, and it certainly happens in tests.
 *
 * Currently JE does not allow cleaning of any file in the recovery interval.
 * This is to ensure that recovery works, of course. The same restriction
 * probably applies to erasure. If we were to treat any slot referencing an
 * erased LSN as if the slot were deleted, we might get recovery to work.
 * But this would be complex to analyze and test thoroughly, especially for IN
 * replay. Therefore if we need to erase items in the recovery interval, we
 * would need to detect this situation and force a checkpoint before erasing.
 *
 * In addition, to erase all entries at the end of the log means that the
 * VLSNIndex would need to be completely truncated, i.e., made empty. This
 * means the node could not be used as a feeder when functioning as a master,
 * and could not perform syncup when functioning as a replica. Rather than
 * emptying the VLSNIndex completely we could log a benign replicated entry
 * so there is at least one entry. But that doesn't address the broader
 * problem of syncup. Perhaps this doesn't matter when writing has stopped
 * for an extended period because the replicas will be up-to-date. And
 * perhaps network restore is fine for other unusual cases. But it is a risk.
 * Right now we always leave at least 1,000 VLSNs in the VLSNIndex to guard
 * against problems, but we don't really know what problems might arise. So
 * it would take some work to figure this out and test it.
 *
 * Therefore, we simply do not erase obsolete data when it appears at the tail
 * end of the log. One way to explain this is to say that we can't erase
 * data at the very end of the transaction log, because this would prevent
 * recovery in certain situations. We do log a warning message in this
 * situation.
 *
 * Aborting Erasure
 * ================
 * Erasure of a file is aborted in the following cases.
 *
 * - The file is needed for a backup or network restore. In both cases, it is
 *   DbBackup.startBackup that aborts the erasure. The aborted file will then
 *   be protected and won't be selected again for erasure until the backup or
 *   network restore is finished.
 *
 * - The extinction filter returns MAYBE_EXTINCT. This is due to a temporary
 *   situation at startup time, while NoSQL DB (or another app) has not yet
 *   initialized its metadata (table metadata in the case of NoSQL DB). It
 *   will be retried repeatedly until this no longer occurs.
 *
 *     Discarded idea: We could add a way to know that the filter is fully
 *     initialized and the eraser thread could delay starting until then. But
 *     we would still have to abort if MAYBE_EXTINCT is returned after that
 *     point. So for simplicity we just do the abort.
 *
 * Reserved Files, VLSNIndex, Cleaning
 * ===================================
 * - Reserved files are an exception. Because an erased file cannot be used for
 *   replication, reserved files are deleted rather than erased.
 *   Therefore, reserved files are never older than N*2 days.
 *
 * - Before erasing a file covered by the VLSNIndex, we truncate the
 *   VLSNIndex to remove the file from its range. Because the VLSNIndex range
 *   never retreats (only advances), files protected by the VLSNIndex will
 *   never have been erased.
 *
 * - However, other files may be erased and subsequently become protected. This
 *   is OK, because such protection only needs to guarantee that files are not
 *   changed while they are protected. These include:
 *    - Backup and network restore.
 *    - DiskOrderedCursor and Database.count.
 *
 * - Cleaning is not coordinated with erasure (except for the treatment of
 *   reserved files discussed above). A given file may be erased and cleaned
 *   concurrently, and this should not cause problems. This is a waste of
 *   resources, but since it is unlikely we do not try to detect it or
 *   optimize for it. Such coordination would add a lot of complexity.
 *
 * Throttling
 * ==========
 * Throttling is performed by estimating the total amount of work in the
 * cycle at the beginning of the cycle, dividing up the total cycle time
 * by this work amount, and throttling (waiting) at various points in
 * order to spread the work out fairly evenly over the cycle period.
 * The {@link WorkThrottle} class calculates the wait time based on work
 * done so far.
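 *
 * In essence, the computation in {@link WorkThrottle#throttle} below is:
 *
 *   msPerUnitOfWork = durationMs / totalWork;
 *   delayMs = (workDone * msPerUnitOfWork) - elapsedMs;
 *   if (delayMs >= MIN_WORK_DELAY_MS) wait up to delayMs;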
 *
 * It is possible that we cannot complete the work within the cycle period,
 * for example, because a node is down for an extended period. In such
 * cases we do _not_ attempt to catch up by speeding up the rate of work,
 * since this could cause performance spikes. Instead we intentionally
 * overestimate the amount of work, leaving spare time to account for such
 * problems. In the end, if erasure of all selected files cannot be
 * completed by the end of the cycle, the cycle will be aborted and a new
 * cycle is started with a recalculated set of files. This is acceptable
 * behavior in unusual conditions.
 *
 *   Note: Such problems could be addressed differently by integrating
 *   with the TaskCoordinator. In that case we would use a different
 *   approach entirely: we would simply work at the maximum rate allowed by
 *   the TaskCoordinator. But for this to work well, other JE components would
 *   also need to be integrated with the TaskCoordinator. So for now we simply
 *   perform work at a fixed rate within each cycle.
 *
 * Before the cycle starts we have to open each file to get its creation
 * time, which has a cost. Throttling for that one-time task is performed
 * separately in {@link #startCycle}. The remaining time in the cycle is also
 * allocated by {@link #startCycle}, which initializes {@link #cycleThrottle}
 * and related fields. Work is divided as described below.
 *
 * For each file to be erased there are several components of work:
 *
 * 1. We may have to read files, or parts of files, for two reasons that
 *    are in addition to the file erasure process itself:
 *
 *    a. We may have to read the file to find its last VLSN, to determine
 *       where to truncate the VLSNIndex. In the worst case scenario we have
 *       to do this for every file, but it is much more likely that it will
 *       only have to be done for a small fraction of the files, and only a
 *       fraction (the end) of each file will normally be read. In addition,
 *       each time we truncate the VLSNIndex we perform an fsync; however,
 *       in the normal case we do this only once per cycle.
 *
 *    b. We read a file redundantly when erasure of a file is aborted and
 *       restarted later. See Aborting Erasure above.
 *
 * 2. Read through the file and overwrite the type byte for each entry
 *    that should be erased. Since we know the length of each file, the
 *    read cost is known. We don't know how many erasures will take place,
 *    but the worst case is that every entry is erased.
 *
 *    Note that reserved files are simply deleted rather than erased,
 *    which is cheaper than erasure. So when there are reserved files,
 *    we will overestimate the amount of work.
 *
 * 3. Before any overwriting, at the time we determine that at least one entry
 *    must be erased, touch the file and perform an fsync to ensure that the
 *    lastModifiedTime is updated persistently. The fsync is assumed to be
 *    expensive.
 *
 * 4. After overwriting all type bytes, perform a second fsync to make the
 *    type changes persistent. The fsync is assumed to be expensive.
 *
 * 5. Overwrite the item in each entry that was erased. This cost is
 *    unknown, but the worst case is that every entry is erased.
 *
 * 6. Perform the third and final fsync to make the erasure persistent. The
 *    fsync is assumed to be expensive.
 *
 * Work units are defined as follows.
 *
 * - For component 1 we assign a work unit to each byte read (the file
 *   length). This is a very large overestimate and is intended to
 *   account for processing delays, such as when a node is down.
 *
 * - For component 2 we also assign a work unit to each byte read (the file
 *   length). The overwrite of type bytes is variable and included in this
 *   cost for simplicity.
 *
 * - For components 3, 4, 5 and 6 together we also assign the file length as
 *   a very rough estimate, and this work is divided between these components
 *   as follows:
 *     18% for component 3 (first fsync, after touching the file)
 *     16% for component 4 (second fsync, after type-byte overwrites)
 *     50% for component 5 (overwriting the erased items)
 *     16% for component 6 (third and final fsync)
 *
 * Therefore the total amount of work is simply three times the length of the
 * files to be erased.
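 *
 * For example, with illustrative numbers: for a single 100 MB file, the
 * budget is 100 MB of work units for component 1, 100 MB for component 2,
 * and 100 MB for components 3-6 combined (split 18/16/50/16), i.e.,
 * 300 MB of work units in total.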
 *
 * Other
 * =====
 * - To support network restore, before erasing a file we remove its cached
 *   info (which includes a checksum) from the response cache in LogFileFeeder.
 *   See {@link
 *   com.sleepycat.je.rep.impl.RepImpl#clearedCachedFileChecksum(String)}.
 *
 * - Prohibiting erasure of protected files prevents changes to the file while
 *   it is being read by the protecting entity. Even so, if we read from the
 *   log buffer cache or tip cache, we could also get different results than
 *   reading directly from the file. To be safe, erasure could clear any
 *   cached data for the file. But is this necessary? No, because reading
 *   from the tip cache and log buffer cache is done only by feeders, and
 *   the files read by feeders are protected from erasure.
 *
 * - BtreeVerifier, when configured to read LNs, checks for LSN references to
 *   erased entries since it simply reads the LNs via a cursor. If the LN has
 *   been erased and it is not extinct, the environment is invalidated as
 *   usual.
 *
 * - The checksum for the LOG_ERASED type cannot be verified, and its
 *   {@link LogEntryHeader#hasChecksum()} method will return false. Because
 *   an entry may be erased in the middle of checksum calculation, the header
 *   may have to be re-read from disk in rare cases. See
 *   {@link com.sleepycat.je.log.ChecksumValidator#validate(long, long)}.
 *
 * - The LOG_ERASED type is not counted as an LN or IN, which could throw off
 *   utilization counts. This may only impact tests and debugging. A thorough
 *   analysis of this issue has not been performed.
 */
public class DataEraser extends StoppableThread implements EnvConfigObserver {

    private static final int MAX_FILE_INFO_MS = 1000;
    private static final int MIN_WORK_DELAY_MS = 5;
    private static final int MAX_SLEEP_MS = 100;
    private static final int FOREVER_TIMEOUT_MS = 5 * 60 * 1000;
    private static final int NO_UNPROTECTED_FILES_DELAY_MS = 1000;
    private static final String TEST_ERASE_PERIOD = "test.erasePeriod";
    private static TestHook testEventHook;

    private static final DateFormat DATE_FORMAT =
        TracerFormatter.makeDateFormat();

    private static final int WRITE_WORK_PCT = 50;
    private static final int FSYNC1_WORK_PCT = 18;
    private static final int FSYNC2_WORK_PCT = 16;
    private static final int FSYNC3_WORK_PCT = 16;

    private final Cleaner cleaner;
    private final FileProtector fileProtector;
    private final FileManager fileManager;
    private final Logger logger;
    private volatile boolean shutdownRequested = false;
    private volatile boolean enabled = false;
    private int terminateMillis;
    private volatile long cycleMs = 0;
    private boolean eraseDeletedDbs = false;
    private boolean eraseExtinctRecords = false;
    private int pollCheckMs;
    private long[] eraseOffsets = new long[5 * 1024];
    private int[] eraseSizes = new int[eraseOffsets.length];
    private byte[] zeros = new byte[10 * 1024];
    private final Object pollMutex = new Object();
    private final NavigableMap<Long, FileInfo> fileInfoCache =
        new TreeMap<>();
    private WorkThrottle cycleThrottle;
    private long totalCycleWork;
    private String lastProtectedFilesMsg;
    private Level lastProtectedFilesMsgLevel;

    /**
     * The eraser is single-threaded but there is occasional multi-threaded
     * access to the following fields due to stat loading.
     */
    private volatile long startTime;
    private volatile long endTime;
    private volatile long completionTime;
    private NavigableSet<Long> filesRemaining =
        Collections.emptyNavigableSet();
    private NavigableSet<Long> filesCompleted =
        Collections.emptyNavigableSet();
    private final AtomicInteger filesErased = new AtomicInteger();
    private final AtomicInteger filesDeleted = new AtomicInteger();
    private final AtomicInteger filesAlreadyDeleted = new AtomicInteger();
    private final AtomicInteger fSyncs = new AtomicInteger();
    private final AtomicLong reads = new AtomicLong();
    private final AtomicLong readBytes = new AtomicLong();
    private final AtomicLong writes = new AtomicLong();
    private final AtomicLong writeBytes = new AtomicLong();

    /**
     * currentFileMutex protects currentFile and abortCurrentFile.
     */
    private final Object currentFileMutex = new Object();
    private volatile Long currentFile;
    private boolean abortCurrentFile;
    private int abortTimeoutMs;

    public DataEraser(final EnvironmentImpl envImpl) {

        super(envImpl, "JEErasure");
        cleaner = envImpl.getCleaner();
        fileProtector = envImpl.getFileProtector();
        fileManager = envImpl.getFileManager();
        logger = LoggerUtils.getLogger(getClass());

        envConfigUpdate(envImpl.getConfigManager(), null);
        envImpl.addConfigObserver(this);
    }

    @Override
    public void envConfigUpdate(
        final DbConfigManager configManager,
        final EnvironmentMutableConfig ignore) {

        /*
         * If the TEST_ERASE_PERIOD system property is specified and
         * ENV_RUN_ERASER is not specified, enable erasure and use the test
         * period.
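         * For example, a hypothetical test invocation:
         *   java -Dtest.erasePeriod="30 min" ...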
         */
        final boolean runErase;
        final String testErasePeriod = System.getProperty(TEST_ERASE_PERIOD);

        if (testErasePeriod != null &&
            !configManager.isSpecified(EnvironmentParams.ENV_RUN_ERASER)) {

            runErase = true;
            cycleMs = PropUtil.parseDuration(testErasePeriod);
            eraseDeletedDbs = true;
            eraseExtinctRecords = true;
        } else {

            runErase = configManager.getBoolean(
                EnvironmentParams.ENV_RUN_ERASER);

            cycleMs = configManager.getDuration(
                EnvironmentParams.ERASE_PERIOD);

            eraseDeletedDbs = configManager.getBoolean(
                EnvironmentParams.ERASE_DELETED_DATABASES);

            eraseExtinctRecords = configManager.getBoolean(
                EnvironmentParams.ERASE_EXTINCT_RECORDS);
        }

        enabled = runErase && cycleMs > 0 &&
            (eraseDeletedDbs || eraseExtinctRecords);

        terminateMillis = configManager.getDuration(
            EnvironmentParams.EVICTOR_TERMINATE_TIMEOUT);

        pollCheckMs = Math.min(MAX_SLEEP_MS, terminateMillis / 4);

        abortTimeoutMs = configManager.getDuration(
            EnvironmentParams.ERASE_ABORT_TIMEOUT);
    }

    public StatGroup loadStats(StatsConfig config) {

        final StatGroup statGroup = new StatGroup(
            EraserStatDefinition.GROUP_NAME, EraserStatDefinition.GROUP_DESC);

        /* Add CUMULATIVE stats. */

        final DateFormat dateFormat = TracerFormatter.makeDateFormat();

        new StringStat(
            statGroup, EraserStatDefinition.ERASER_CYCLE_START,
            (startTime == 0) ? "" : dateFormat.format(new Date(startTime)));

        new StringStat(
            statGroup, EraserStatDefinition.ERASER_CYCLE_END,
            (endTime == 0) ? "" : dateFormat.format(new Date(endTime)));

        new IntStat(
            statGroup, EraserStatDefinition.ERASER_FILES_REMAINING,
            filesRemaining.size());

        /* INCREMENTAL stats must be cleared. */

        final boolean clear = config.getClear();

        new IntStat(
            statGroup, EraserStatDefinition.ERASER_FILES_ERASED,
            getStat(filesErased, clear));

        new IntStat(
            statGroup, EraserStatDefinition.ERASER_FILES_DELETED,
            getStat(filesDeleted, clear));

        new IntStat(
            statGroup, EraserStatDefinition.ERASER_FILES_ALREADY_DELETED,
            getStat(filesAlreadyDeleted, clear));

        new IntStat(
            statGroup, EraserStatDefinition.ERASER_FSYNCS,
            getStat(fSyncs, clear));

        new LongStat(
            statGroup, EraserStatDefinition.ERASER_READS,
            getStat(reads, clear));

        new LongStat(
            statGroup, EraserStatDefinition.ERASER_READ_BYTES,
            getStat(readBytes, clear));

        new LongStat(
            statGroup, EraserStatDefinition.ERASER_WRITES,
            getStat(writes, clear));

        new LongStat(
            statGroup, EraserStatDefinition.ERASER_WRITE_BYTES,
            getStat(writeBytes, clear));

        return statGroup;
    }

    private long getStat(final AtomicLong val, final boolean clear) {
        return clear ? val.getAndSet(0) : val.get();
    }

    private int getStat(final AtomicInteger val, final boolean clear) {
        return clear ? val.getAndSet(0) : val.get();
    }

    @Override
    public Logger getLogger() {
        return logger;
    }

    @Override
    public int initiateSoftShutdown() {
        shutdownRequested = true;
        synchronized (pollMutex) {
            pollMutex.notify();
        }
        return terminateMillis;
    }

    public void startThread() {

        if (enabled &&
            !envImpl.isMemOnly() &&
            !envImpl.isReadOnly() &&
            !isAlive()) {

            /*
             * Ensure metadata DB is opened before Environment ctor finishes,
             * to avoid timing problems in tests that are sensitive to the
             * number of open DBs.
             */
            envImpl.getMetadataStore().openDb();

            start();
        }
    }

    @Override
    public void run() {
        boolean isInitialized = false;
        boolean isStarted = false;

        /*
         * Internal exceptions are used to signal state changes, and they are
         * handled in this method. Other than ShutdownRequestedException, all
         * such exceptions must be handled inside the while loop.
         */
        try {
            while (true) {
                checkShutdown();
                try {
                    if (!isInitialized) {
                        isInitialized = true;
                        if (resumeCycle()) {
                            isStarted = true;
                        }
                    }
                    if (!isStarted) {
                        startCycle();
                        isStarted = true;
                    }
                    if (checkForCycleEnd()) {
                        waitForEnabled();
                        isStarted = false;
                        continue;
                    }
                    final Long file = getNextFile();
                    if (file != null) {
                        try {
                            eraseFile(file);
                            filesCompleted.add(file);
                            filesRemaining.remove(file);
                            storeState();
                        } finally {
                            clearCurrentFile();
                        }
                    } else {
                        waitForCycleEnd();
                        waitForEnabled();
                        isStarted = false;
                    }
                } catch (NoUnprotectedFilesException e) {
                    waitForUnprotectedFiles(e);
                } catch (ErasureDisabledException e) {
                    waitForEnabled();
                } catch (PeriodChangedException e) {
                    logPeriodChanged();
                    isStarted = false;
                } catch (AbortCurrentFileException e) {
                    /* Continue. */
                }
            }
        } catch (ShutdownRequestedException e) {
            if (!filesRemaining.isEmpty()) {
                logCycleSuspend();
            }
        }
    }

    /**
     * If we can load the last stored state and the endTime of the last cycle
     * has not yet arrived, resume execution and return true.
     * Otherwise, return false.
     */
    private boolean resumeCycle() {

        if (!loadState()) {
            return false;
        }

        if (System.currentTimeMillis() >= endTime) {
            if (!filesRemaining.isEmpty()) {
                logCycleCannotResume();
            }
            return false;
        }

        logCycleResume();

        final WorkThrottle throttle =
            createFileInfoThrottle(filesRemaining.size(), cycleMs);

        /* Populate file info cache for files remaining in cycle. */
        for (final Long file : filesRemaining) {

            fileInfoCache.put(
                file,
                new FileInfo(getFileCreationTime(file), getFileLength(file)));

            throttle.throttle(1);
        }

        cycleThrottle = new WorkThrottle(totalCycleWork, endTime - startTime);
        return true;
    }

    /**
     * Resets per-cycle info, including determining the set of files to be
     * erased in the next cycle.
     */
    private void startCycle() {

        /* Ignore config changes during calculations. */
        final long localCycleMs = cycleMs;

        startTime = System.currentTimeMillis();
        endTime = startTime + localCycleMs;
        final long fileAgeCutoff = startTime - localCycleMs;
        filesCompleted = Collections.synchronizedNavigableSet(new TreeSet<>());
        filesRemaining = Collections.synchronizedNavigableSet(new TreeSet<>());
        completionTime = 0;

        final NavigableSet<Long> allFiles =
            fileProtector.getAllCompletedFiles();

        final WorkThrottle throttle =
            createFileInfoThrottle(allFiles.size(), localCycleMs);

        logCycleInit(allFiles.size());

        /*
         * Iterate through all files except for the last file. We can't erase
         * the last file anyway, because it is in the recovery interval.
         */
        for (final Long file : allFiles) {

            /*
             * Try using cached file info from previous cycles. A file in a
             * previous cycle will normally have a creationTime that qualifies,
             * but we check again in case the erasure period has changed.
             */
            final FileInfo prevInfo = fileInfoCache.get(file);
            if (prevInfo != null) {
                if (prevInfo.creationTime <= fileAgeCutoff) {
                    filesRemaining.add(file);
                }
                continue;
            }

            final long creationTime = getFileCreationTime(file);

            throttle.throttle(1);

            if (creationTime > fileAgeCutoff) {
                continue;
            }

            filesRemaining.add(file);

            fileInfoCache.put(
                file, new FileInfo(creationTime, getFileLength(file)));
        }

        /*
         * Prevent the file cache from growing without bounds. The cache need
         * only contain entries for the files in filesRemaining.
         */
        fileInfoCache.navigableKeySet().retainAll(filesRemaining);

        /*
         * Use three times the sum of the file lengths as the work for the
         * remaining time in the cycle, as described in the class comments.
         */
        totalCycleWork = filesRemaining.stream()
            .mapToLong(file -> fileInfoCache.get(file).length)
            .sum() * 3;

        cycleThrottle = new WorkThrottle(totalCycleWork, endTime - startTime);
        logCycleStart();
    }

    /**
     * Creates a throttle for reading file info when a cycle is started or
     * restored.
     *
     * Use only a small portion of the cycle for collecting file info.
     * Use at most 0.1% of the cycle time and at most MAX_FILE_INFO_MS
     * per file. The idea is just to do a reasonable amount of throttling
     * between calls to getFileCreationTime.
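     *
     * For example, with illustrative numbers: for a 25 day cycle
     * (2,160,000,000 ms) and 500 files, workTime is
     * min(2,160,000 ms, 500 * 1,000 ms) = 500,000 ms, so the per-file
     * reads are spread over roughly 8.3 minutes.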
     */
    private WorkThrottle createFileInfoThrottle(final int files,
                                                final long localCycleMs) {
        final long workTime = Math.min(
            localCycleMs / 1000,
            files * MAX_FILE_INFO_MS);

        return new WorkThrottle(files, workTime);
    }

    /**
     * Returns the file creation time by opening the file and reading its
     * header entry.
     *
     * @return creation time, or Long.MAX_VALUE if the file does not exist.
     */
    private long getFileCreationTime(final long file) {
        try {
            return fileManager.getFileHeaderTimestamp(file).getTime();

        } catch (ChecksumException e) {
            throw new EnvironmentFailureException(
                envImpl, EnvironmentFailureReason.LOG_CHECKSUM,
                "Exception erasing file 0x" + Long.toHexString(file),
                e);

        } catch (FileNotFoundException e) {
            /* File was deleted by the cleaner. */
            return Long.MAX_VALUE;
        }
    }

    private long getFileLength(final long file) {
        return new File(fileManager.getFullFileName(file)).length();
    }

    /**
     * Return the next unprotected file to be processed.
     *
     * First return files that are unprotected, to defer VLSNIndex
     * truncation until it is necessary.
     *
     * If all files are protected, truncate the VLSNIndex and try again.
     * This is the same approach used by {@link Cleaner#manageDiskUsage}.
     *
     * A WARNING level is specified in the NoUnprotectedFilesException
     * when files are protected for reasons that are expected to be short
     * lived (backup, replication). The WARNING is actually logged only if
     * these reasons persist until the end of the erasure cycle.
     *
     * @return the next file to be processed, or null if all files have been
     * processed.
     *
     * @throws NoUnprotectedFilesException if all remaining files are
     * protected.
     */
    private Long getNextFile() {

        lastProtectedFilesMsg = null;
        lastProtectedFilesMsgLevel = null;

        if (filesRemaining.isEmpty()) {
            return null;
        }

        Long file = getNextUnprotectedFile();
        if (file != null) {
            return file;
        }

        if (!cleaner.isFileDeletionEnabled()) {
            throw new NoUnprotectedFilesException(
                "Test mode prohibits VLSNIndex truncation.",
                Level.INFO);
        }

        final long lastRecoveryFile = getFirstFileInRecoveryInterval();

        if (filesRemaining.first() >= lastRecoveryFile) {
            throw new NoUnprotectedFilesException(
                "All remaining files are in the recovery interval," +
                    " lastRecoveryFile=0x" +
                    Long.toHexString(lastRecoveryFile) + ".",
                Level.INFO);
        }

        /*
         * Determine whether it is worthwhile and possible to truncate the
         * VLSNIndex.
         */
        if (!envImpl.isReplicated()) {
            /* VLSNIndex does not exist in a non-replicated env. */
            throw new NoUnprotectedFilesException(
                "Protected files are not in the recovery interval.",
                Level.WARNING);
        }

        final long vlsnIndexStartFile = fileProtector.getVLSNIndexStartFile();
        if (vlsnIndexStartFile > filesRemaining.last()) {
            throw new NoUnprotectedFilesException(
                "Protected files are not in the recovery interval and not " +
                    "protected by the VLSNIndex vlsnIndexStartFile=0x" +
                    Long.toHexString(vlsnIndexStartFile) + ".",
                Level.WARNING);
        }

        final Pair<VLSN, Long> truncateInfo = getVLSNIndexTruncationInfo();

        if (truncateInfo == null) {
            /* This is a corner case and seems unlikely to ever happen. */
            throw new NoUnprotectedFilesException(
                "Cannot truncate VLSNIndex because no remaining files " +
                    "contain VLSNs.",
                Level.INFO);
        }

        /*
         * Truncate the VLSNIndex, then try again to get an unprotected file.
         */
        if (!envImpl.tryTruncateVlsnHead(
                truncateInfo.first(), truncateInfo.second())) {

            /*
             * There are several reasons for tryTruncateVlsnHead to return
             * false as explained by VLSNTracker.tryTruncateFromHead(VLSN,
             * long, LogItemCache) javadoc. Only one of them -- that the file
             * is protected -- should cause a WARNING message. To avoid false
             * alarms we log an INFO message here for now. TODO: Return more
             * info from tryTruncateVlsnHead so we can log a WARNING when
             * appropriate.
             */
            throw new NoUnprotectedFilesException(
                "VLSNIndex already truncated or cannot be truncated further.",
                Level.INFO);
        }

        file = getNextUnprotectedFile();
        if (file != null) {
            return file;
        }

        throw new NoUnprotectedFilesException(
            "Protected files are not in the recovery interval and " +
                "VLSNIndex was successfully truncated.",
            Level.WARNING);
    }

    /**
     * Returns a human readable description of what may be preventing erasure.
     * These are the reasons that {@link NoUnprotectedFilesException} may be
     * thrown by {@link #getNextFile()}.
     */
    private String getFileProtectionMessage() {

        final long firstRecoveryFile = getFirstFileInRecoveryInterval();

        return "Files protected by the current recovery interval: [" +
            FormatUtil.asHexString(
                filesRemaining.tailSet(firstRecoveryFile)) +
            "]. Other protected files: " +
            fileProtector.getProtectedFileMap(filesRemaining) +
            ". FirstRecoveryFile: 0x" +
            Long.toHexString(firstRecoveryFile) +
            ". FirstVLSNIndexFile: 0x" +
            Long.toHexString(fileProtector.getVLSNIndexStartFile()) +
            ".";
    }

    /**
     * Returns the first file remaining that is unprotected. If a file is
     * selected, the currentFile field is updated to support aborting.
     * This method synchronizes with {@link #abortErase} to ensure that a
     * file passed to abortErase will not be selected again for erasure.
     */
    private Long getNextUnprotectedFile() {
        synchronized (currentFileMutex) {

            final Long file =
                fileProtector.getFirstUnprotectedFile(filesRemaining);

            if (file == null) {
                return null;
            }

            if (file >= getFirstFileInRecoveryInterval()) {
                return null;
            }

            currentFile = file;
            abortCurrentFile = false;
            return file;
        }
    }

    /**
     * Called after a file returned by {@link #getNextUnprotectedFile()} is no
     * longer being processed.
     */
    private void clearCurrentFile() {
        synchronized (currentFileMutex) {
            currentFile = null;
            abortCurrentFile = false;
        }
    }

    /**
     * Returns the file being processed or null. A volatile field is accessed
     * without synchronization.
     */
    Long getCurrentFile() {
        return currentFile;
    }

    /**
     * Used to ensure that erasure of a file stops before copying that file
     * during a backup or network restore. A short wait may be performed to
     * ensure that the eraser thread notices the abort and any in-process
     * writes are completed.
     *
     * @param fileSet erasure of the current file is aborted if the current
     * file is protected by this protected file set. The given fileSet must
     * be protected at the time this method is called.
     *
     * @throws EraserAbortException if we can't abort erasure of a target
     * file within {@link EnvironmentParams#ERASE_ABORT_TIMEOUT}. The
     * timeout is long, so this should not happen unless the eraser thread
     * is wedged or starved.
     */
    public void abortErase(final FileProtector.ProtectedFileSet fileSet) {

        final Long abortFile;

        /*
         * Synchronize with getNextUnprotectedFile to ensure that a file
         * passed to abortErase will not be selected again for erasure.
         */
        synchronized (currentFileMutex) {

            /*
             * The check for isReservedFile is used to avoid aborting when we
             * will not erase the file. This is a minor optimization and is
             * not required for correctness.
             */
            if (currentFile != null &&
                fileSet.isProtected(currentFile, null) &&
                !fileProtector.isReservedFile(currentFile)) {

                abortCurrentFile = true;
                abortFile = currentFile;
            } else {
                return;
            }
        }

        /*
         * Wait for the eraser thread to notice that the abortCurrentFile flag
         * is set, at which time it will set currentFile to null. The wait is
         * necessary to ensure that a write will not be performed after this
         * method returns.
         *
         * It is important that we return ASAP, so we use Object.notify to
         * wake up any waiters. All other polling is done with Object.wait on
         * the pollMutex object, to ensure we can abort promptly.
         */
        if (PollCondition.await(1, abortTimeoutMs, () -> {
            synchronized (pollMutex) {
                pollMutex.notify();
            }
            synchronized (currentFileMutex) {
                return !abortFile.equals(currentFile);
            }
        })) {
            return; /* Aborted. */
        }

        /*
         * We don't expect this to ever occur, because the timeout is quite
         * long and all erasure operations are checking the abortCurrentFile
         * flag quite frequently. EraserAbortException is thrown (rather than
         * EnvironmentWedgedException) because we don't want to shut down the
         * environment when DbBackup.start fails.
         */
        final String msg =
            "Unable to abort erasure of file 0x" +
                Long.toHexString(abortFile) + " within " +
                abortTimeoutMs + "ms.";

        LoggerUtils.warning(logger, envImpl, msg);
        throw new EraserAbortException(msg);
    }

    /**
     * Returns the file number of the firstActiveLsn for the last completed
     * checkpoint. Files GTE this file would be in the recovery interval if
     * a crash occurs, and are protected from erasure.
     */
    private long getFirstFileInRecoveryInterval() {
        return DbLsn.getFileNumber(
            envImpl.getCheckpointer().getLastCheckpointFirstActiveLsn());
    }

    /**
     * Returns the {endVLSN,endFile} pair that is needed to truncate the
     * VLSNIndex.
     *
     * Returns null if no remaining files-to-erase contain VLSNs. A file that
     * does not contain VLSNs (is "barren") may contain BINs or migrated LNs
     * for data that needs erasure. However, this is expected to be a rare
     * condition so we tolerate it. It will be corrected during a subsequent
     * cycle if any file following this cycle's files need erasure and do
     * contain VLSNs, or the VLSNIndex is truncated for other reasons, for
     * example, to reclaim disk space.
     *
     * Selects the last file-to-erase containing a VLSN. This reduces the
     * number of (fairly expensive) VLSNIndex truncations to the minimum.
     * However, it means that the index will be truncated for a large number
     * of files at once, rather than truncating it incrementally.
     */
    private Pair<VLSN, Long> getVLSNIndexTruncationInfo() {

        for (final Long file : filesRemaining.descendingSet()) {

            checkContinue();

            /*
             * We avoid searching the file if it happens to be a reserved
             * file, since the last VLSN is known to the FileProtector.
             */
            VLSN lastVlsn = fileProtector.getReservedFileLastVLSN(file);

            if (lastVlsn != null) {
                if (lastVlsn.isNull()) {
                    continue;
                }
                return new Pair<>(lastVlsn, file);
            }

            /* Must search. */
            lastVlsn = searchFileForLastVLSN(file);

            if (lastVlsn != null) {
                if (lastVlsn.isNull()) {
                    continue;
                }
                return new Pair<>(lastVlsn, file);
            }
        }

        return null;
    }

    /**
     * Returns the last VLSN in the given file, or NULL_VLSN if the file does
     * not contain any VLSNs, or null if the file does not exist.
     *
     * Only VLSNs in replicated entries are considered. VLSNs in migrated LNs
     * are ignored.
     */
    private VLSN searchFileForLastVLSN(final Long file) {

        final FileInfo fileInfo = fileInfoCache.get(file);

        if (fileInfo != null && fileInfo.lastVlsn != null) {
            return fileInfo.lastVlsn;
        }

        final long fileLength =
            (fileInfo != null) ? fileInfo.length : getFileLength(file);

        boolean forward;
        long startLsn;
        long finishLsn;
        long endOfFileLsn;
        try {
            /*
             * If file+1 exists, read its header to get the offset of the
             * previous entry, and then read the file backwards to find the
             * last VLSN.
             */
            final long prevOffset =
                fileManager.getFileHeaderPrevOffset(file + 1);
            forward = false;
            startLsn = DbLsn.makeLsn(file, prevOffset);
            finishLsn = DbLsn.makeLsn(file, 0);
            endOfFileLsn = DbLsn.makeLsn(file, fileLength);

        } catch (FileNotFoundException e) {
            /*
             * If file+1 does not exist, read the entire file from the start.
             */
            forward = true;
            startLsn = DbLsn.makeLsn(file, 0);
            finishLsn = DbLsn.NULL_LSN;
            endOfFileLsn = DbLsn.NULL_LSN;

        } catch (ChecksumException e) {
            throw new EnvironmentFailureException(
                envImpl, EnvironmentFailureReason.LOG_CHECKSUM, e);
        }

        final FileReader reader = new FileReader(
            envImpl, cleaner.readBufferSize, forward, startLsn,
            file /*singleFileNumber*/, endOfFileLsn, finishLsn) {

            @Override
            protected boolean processEntry(ByteBuffer entryBuffer) {

                final int readOps = getAndResetNReads();
                reads.addAndGet(readOps);
                readBytes.addAndGet(readOps * cleaner.readBufferSize);

                cycleThrottle.throttle(currentEntryHeader.getEntrySize());
                checkContinue();

                /* Skip the data, no need to materialize the entry. */
                skipEntry(entryBuffer);

                /*
                 * Only consider replicated entries. Note that this includes
                 * Erased entries.
                 */
                return currentEntryHeader.getReplicated();
            }
        };

        VLSN lastVlsn = VLSN.NULL_VLSN;
        try {
            while (reader.readNextEntryAllowExceptions()) {

                lastVlsn = reader.getLastVlsn();

                if (lastVlsn == null || lastVlsn.isNull()) {
                    throw EnvironmentFailureException.unexpectedState(
                        "Replicated entries must have a VLSN.");
                }

                if (!forward) {
                    break;
                }
            }
        } catch (ChecksumException e) {
            throw new EnvironmentFailureException(
                envImpl, EnvironmentFailureReason.LOG_CHECKSUM, e);
        } catch (FileNotFoundException e) {
            return null;
        }

        if (fileInfo != null) {
            fileInfo.lastVlsn = lastVlsn;
        }

        return lastVlsn;
    }

    /**
     * Erases the targeted entries in the given file and makes the changes
     * persistent using fsync.
     */
    private void eraseFile(final Long file) {

        /*
         * If a reserved file can be deleted, this is cheaper than erasure.
         * Use 25% of the file size for throttling.
         *
         * If cleaner.deleteReservedFile returns false then we proceed with
         * erasure. This is safe because the VLSNIndex has been truncated.
         */
        final long fileLength = fileInfoCache.get(file).length;

        if (fileProtector.isReservedFile(file) &&
            cleaner.deleteReservedFile(file, "ERASER")) {

            filesDeleted.incrementAndGet();
            cycleThrottle.throttle((fileLength * 25) / 100);
            return;
        }

        final EraserReader reader = new EraserReader(file);
        final DbCache dbCache = new DbCache(envImpl, cleaner);
        final ExtinctionScanner extinctionScanner =
            envImpl.getExtinctionScanner();
        int entriesToErase = 0;
        long firstWriteTime = 0;
        boolean completed = false;

        final PackedOffsets obsoleteOffsets =
            envImpl.getUtilizationProfile().getObsoleteDetailPacked(
                file, false /*logUpdate*/, this::checkContinue);
        final PackedOffsets.Iterator obsoleteIter = obsoleteOffsets.iterator();
        long nextObsolete = -1;

        final String fullFileName = fileManager.getFullFileName(file);
        RandomAccessFile raf = null;

        try {
            /* Clear checksum saved by network restore. */
            envImpl.clearedCachedFileChecksum(
                new File(fullFileName).getName());

            while (reader.readNextEntryAllowExceptions()) {

                dbCache.clearCachePeriodically();

                final long logLsn = reader.getLastLsn();
                final long fileOffset = DbLsn.getFileOffset(logLsn);

                while (nextObsolete < fileOffset && obsoleteIter.hasNext()) {
                    nextObsolete = obsoleteIter.next();
                }
                final boolean isKnownObsolete = (nextObsolete == fileOffset);

                boolean doErase = false;

                if (reader.isErased) {
                    final ErasedLogEntry erasedEntry =
                        (ErasedLogEntry) reader.logEntry;

                    if (!erasedEntry.isAllZeros()) {
                        doErase = true;
                    }
                } else {
                    final DatabaseId dbId = reader.logEntry.getDbId();
                    DbCache.DbInfo dbInfo = dbCache.getDbInfo(dbId);

                    if (dbInfo.deleted) {
                        /*
                         * All LNs and BINs in deleted DBs can be erased. But
                         * when eraseDeletedDbs is false, if the DB is deleted
                         * then we cannot erase its extinct records either,
                         * because we don't have the DB name and dups status.
                         *
                         * If the DB is being deleted (DbInfo.deleting is
                         * true) then we cannot erase its active INs because
                         * they may not have been counted obsolete yet.
                         * However, it is OK to erase its LNs and obsolete
                         * INs, and we do that further below.
                         */
                        if (eraseDeletedDbs) {
                            doErase = true;
                        }
                    } else if (reader.isLN && eraseExtinctRecords) {
                        /*
                         * All extinct LNs (in non-deleted DBs) can be erased.
                         */
                        final LNLogEntry<?> lnEntry =
                            (LNLogEntry<?>) reader.logEntry;

                        lnEntry.postFetchInit(dbInfo.dups);

                        if (envImpl.getExtinctionState(
                                dbInfo.name, dbInfo.dups, dbInfo.internal,
                                lnEntry.getKey()) == EXTINCT) {
                            doErase = true;
                        }
                    } else if (reader.isBIN && eraseExtinctRecords) {
                        /*
                         * A BIN with an extinct slot can be erased only if
                         * the BIN is obsolete/dead.
                         */
                        final BIN bin = (BIN) reader.logEntry.getMainItem();

                        for (int i = 0; i < bin.getNEntries(); i += 1) {

                            checkContinue();

                            final byte[] key = bin.getKey(i);

                            final ExtinctionFilter.ExtinctionStatus status =
                                envImpl.getExtinctionState(
                                    dbInfo.name, dbInfo.dups, dbInfo.internal,
                                    key);

                            if (status == MAYBE_EXTINCT) {
                                /*
                                 * This is a crude way to restart and give
                                 * the app a chance to initialize its
                                 * metadata.
                                 */
                                throw new AbortCurrentFileException(file);
                            }

                            if (status != EXTINCT) {
                                continue;
                            }

                            if (isKnownObsolete) {
                                doErase = true;
                                break;
                            }

                            /*
                             * Defer expensive getDbImpl and Btree lookup
                             * (isBINDead call) until we encounter an extinct
                             * slot in a BIN that is not known-obsolete. We
                             * must do a Btree lookup because the known-
                             * obsolete metadata is not 100% complete.
                             */
                            dbInfo = dbCache.getDbImpl(dbId);

                            /*
                             * If we need to do a Btree lookup and dbImpl is
                             * null, then the DB was deleted since the last
                             * DB lookup and erasure must wait until the next
                             * cycle.
                             *
                             * If the key for this DB is part of an active
                             * extinction, then the BIN cannot be persistently
                             * obsolete and we must wait for the task to
                             * finish before erasing the BIN.
                             *
                             * In addition, as an extra precaution, do not
                             * erase the BIN if isBINDead returns false, which
                             * means the BIN is still active in the in-memory
                             * Btree.
                             */
                            if (dbInfo.dbImpl != null &&
                                !extinctionScanner.
                                    isRecordExtinctionIncomplete(
                                        dbInfo.dbImpl, key) &&
                                isBINDead(dbInfo.dbImpl, bin, logLsn)) {

                                /* BIN is dead/obsolete. */
                                doErase = true;
                            }

                            /*
                             * If the BIN is not dead, we cannot erase it.
                             * This is the edge case where a record extinction
                             * has started (a slot is extinct) but not yet
                             * complete (the BIN is not obsolete yet).
                             */
                            break;
                        }
                    }
                }

                if (doErase) {
                    if (raf == null) {
                        firstWriteTime = System.currentTimeMillis();
                        raf = fileManager.openFileReadWrite(fullFileName);
                        touchAndFsync(raf, fileLength);
                    }

                    if (!reader.isErased) {
                        writeErasedType(raf, fileOffset);
                    }

                    entriesToErase = addEraseEntry(
                        entriesToErase,
                        fileOffset + reader.header.getSize(),
                        reader.header.getItemSize());
                }
            }

            /* Call releaseDbImpls before eraseEntries, which throttles. */
            dbCache.releaseDbImpls();

            eraseEntries(raf, fileLength, entriesToErase);
            filesErased.incrementAndGet();
            completed = true;

        } catch (ChecksumException e) {
            throw new EnvironmentFailureException(
                envImpl, EnvironmentFailureReason.LOG_CHECKSUM,
                "Exception erasing file 0x" + Long.toHexString(file), e);

        } catch (FileNotFoundException e) {
            /* File was deleted by the cleaner. */
            filesAlreadyDeleted.incrementAndGet();

        } catch (IOException e) {
            throw new EnvironmentFailureException(
                envImpl, EnvironmentFailureReason.LOG_WRITE,
                "Exception erasing file 0x" + Long.toHexString(file), e);

        } finally {
            if (firstWriteTime != 0) {
                LoggerUtils.info(
                    logger, envImpl,
                    "ERASER attempted to erase " + entriesToErase +
                        " entries in file 0x" + Long.toHexString(file) +
                        ", first write at " + formatTime(firstWriteTime) +
                        ", erasure is " +
                        (completed ? "complete" : "incomplete") + ".");
            }

            dbCache.releaseDbImpls();

            if (raf != null) {
                try {
                    raf.close();
                } catch (IOException e) {
                    LoggerUtils.warning(
                        logger, envImpl,
                        "DataEraser.eraseFile exception when closing " +
                            "file 0x" + Long.toHexString(file) + ": " + e);
                }
            }
        }
    }

    /**
     * File reader that materializes only user-database LNs, BINs, and
     * BIN-deltas.
     */
    private class EraserReader extends FileReader {

        LogEntryHeader header;
        LogEntryType entryType;
        LogEntry logEntry;
        boolean isLN;
        boolean isBIN;
        boolean isErased;

        EraserReader(final Long file) {
            super(DataEraser.this.envImpl, cleaner.readBufferSize,
                true /*forward*/, DbLsn.makeLsn(file, 0) /*startLsn*/,
                file /*singleFileNumber*/, DbLsn.NULL_LSN /*endOfFileLsn*/,
                DbLsn.NULL_LSN /*finishLsn*/);
        }

        @Override
        protected boolean processEntry(final ByteBuffer entryBuffer) {

            final int readOps = getAndResetNReads();
            reads.addAndGet(readOps);
            readBytes.addAndGet(readOps * cleaner.readBufferSize);

            cycleThrottle.throttle(currentEntryHeader.getEntrySize());
            checkContinue();

            header = currentEntryHeader;
            isLN = false;
            isBIN = false;
            isErased = false;
            logEntry = null;

            entryType = LogEntryType.findType(header.getType());
            assert entryType != null;

            if (entryType.isUserLNType()) {
                isLN = true;
            } else if (entryType.equals(LogEntryType.LOG_BIN) ||
                entryType.equals(LogEntryType.LOG_BIN_DELTA)) {
                isBIN = true;
            } else if (entryType.equals(LogEntryType.LOG_ERASED)) {
                isErased = true;
            } else {
                skipEntry(entryBuffer);
                return false;
            }

            logEntry = entryType.getNewLogEntry();
            logEntry.readEntry(envImpl, header, entryBuffer);
            return true;
        }
    }

    /**
     * Returns whether the given BIN or BIN-delta is absent from the Btree.
     * This only checks the in-memory cache, so returning true does not
     * guarantee that the BIN is persistently obsolete. Therefore this method
     * is only useful to provide added safety.
     */
    private boolean isBINDead(final DatabaseImpl db,
                              final BIN binFromLog,
                              final long logLsn) {

        binFromLog.latchNoUpdateLRU(db);

        final SearchResult result = db.getTree().getParentINForChildIN(
            binFromLog, true /*useTargetLevel*/,
            true /*doFetch*/, CacheMode.UNCHANGED);

        if (result.parent == null) {
            /*
             * Btree is completely empty (no root IN). Since we never reduce
             * the number of levels in the Btree, this should only occur if
             * the DB was truncated.
             */
            return true;
        }

        try {
            /*
             * For a parent search with useTargetLevel=true, a non-null parent
             * implies that a match was found.
             */
            assert result.exactParentFound;

            /*
             * Check whether the BIN is active by comparing the active Btree
             * LSN to the LSN of the BIN we read from the log.
             */
            final long treeLsn = result.parent.getLsn(result.index);

            if (binFromLog.isBINDelta(false)) {
                /*
                 * A BIN-delta read from the log is obsolete if its LSN is
                 * not equal to the tree LSN.
                 */
                return logLsn != treeLsn;
            }

            /*
             * A full BIN read from the log is active if its LSN is equal to
             * the tree LSN.
             */
            if (logLsn == treeLsn) {
                return false;
            }

            /*
             * If the tree and log LSNs are unequal, then we must get the full
             * version LSN in case the tree LSN is actually for a BIN-delta.
             * The only way to do that is to fetch the IN in the tree.
             */
            final BIN treeBin = (BIN) result.parent.fetchIN(
                result.index, CacheMode.UNCHANGED);

            return logLsn != treeBin.getLastFullLsn();

        } finally {
            result.parent.releaseLatch();
        }
    }

    /**
     * Adds the offset/size of one entry to be erased.
     */
    private int addEraseEntry(final int n, final long offset, final int size) {

        if (n == eraseOffsets.length) {
            /* Enlarge arrays. */
            final long[] newOffsets = new long[n * 2];
            System.arraycopy(eraseOffsets, 0, newOffsets, 0, n);
            eraseOffsets = newOffsets;

            final int[] newSizes = new int[newOffsets.length];
            System.arraycopy(eraseSizes, 0, newSizes, 0, n);
            eraseSizes = newSizes;
        }

        eraseOffsets[n] = offset;
        eraseSizes[n] = size;
        return n + 1;
    }

    /**
     * Ensures that the file's lastModifiedTime is persistently updated,
     * before performing any modifications to the file.
    /**
     * Ensures that the file's lastModifiedTime is persistently updated,
     * before performing any modifications to the file. This allows
     * applications to detect erasure by checking for a change to
     * lastModifiedTime across erasure cycles, without the worry that a
     * crash during erasure may have prevented the file system from updating
     * the lastModifiedTime.
     */
    private void touchAndFsync(final RandomAccessFile file,
                               final long fileLength)
        throws IOException {

        checkContinue();
        file.seek(0);
        final byte b = file.readByte();
        file.seek(0);
        file.writeByte(b);
        writes.incrementAndGet();
        writeBytes.incrementAndGet();

        file.getChannel().force(false);
        fSyncs.incrementAndGet();
        cycleThrottle.throttle((fileLength * FSYNC1_WORK_PCT) / 100);

        // TODO update FileManager stats?
    }

    /**
     * Changes the type of the entry on disk to the LOG_ERASED type.
     */
    private void writeErasedType(final RandomAccessFile file,
                                 final long headerOffset)
        throws IOException {

        file.seek(headerOffset + LogEntryHeader.ENTRYTYPE_OFFSET);
        checkContinue();
        file.writeByte(LogEntryType.LOG_ERASED.getTypeNum());
        writes.incrementAndGet();
        writeBytes.incrementAndGet();

        // TODO update FileManager stats?
    }

    /**
     * Writes zeros in the entries in the erase list and fsyncs to persist
     * the changes.
     */
    private void eraseEntries(final RandomAccessFile raf,
                              final long fileLength,
                              final int entries)
        throws IOException {

        if (entries == 0) {
            return;
        }

        final long fsync1Work = (fileLength * FSYNC1_WORK_PCT) / 100;
        final long fsync2Work = (fileLength * FSYNC2_WORK_PCT) / 100;
        final long fsync3Work = (fileLength * FSYNC3_WORK_PCT) / 100;
        final long writeWork = ((fileLength * WRITE_WORK_PCT) / 100) / entries;

        /* Don't neglect work lost due to int truncation. */
        final long extraWork = fileLength -
            ((writeWork * entries) + fsync1Work + fsync2Work + fsync3Work);

        /*
         * Make type changes persistent before writing zeros. This is the only
         * way to be sure that an entry's type is changed before it is zero
         * filled, since write ordering is not guaranteed.
         */
        checkContinue();
        raf.getChannel().force(false);
        fSyncs.incrementAndGet();
        cycleThrottle.throttle(fsync2Work);

        for (int i = 0; i < entries; i += 1) {
            final long offset = eraseOffsets[i];
            final int size = eraseSizes[i];

            if (zeros.length < size) {
                zeros = new byte[size * 2];
            }

            raf.seek(offset);
            checkContinue();
            raf.write(zeros, 0, size);
            writes.incrementAndGet();
            writeBytes.addAndGet(size);
            cycleThrottle.throttle(writeWork);
        }

        /* Make complete erasure of this file persistent. */
        checkContinue();
        raf.getChannel().force(false);
        fSyncs.incrementAndGet();
        cycleThrottle.throttle(fsync3Work + extraWork);

        // TODO update FileManager stats?
    }
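
    /*
     * Illustrative sketch (not part of the JE source): how the throttling
     * budget in eraseEntries above divides a file's length into units of
     * "work". The percentages here are hypothetical stand-ins for the
     * FSYNC1/2/3_WORK_PCT and WRITE_WORK_PCT constants. E.g. with 5/5/5/85
     * and a 10 MB file containing 1000 entries, each fsync accounts for
     * 524288 units, each zero-fill write for 8912 units, and the remainder
     * lost to long division (896 units) is repaid after the final fsync.
     */
    private static long[] sketchWorkBudget(final long fileLength,
                                           final int entries) {
        final int fsyncPct = 5;  /* hypothetical *_WORK_PCT values */
        final int writePct = 85;

        final long fsyncWork = (fileLength * fsyncPct) / 100;
        final long writeWork = ((fileLength * writePct) / 100) / entries;
        final long extraWork =
            fileLength - ((writeWork * entries) + 3 * fsyncWork);

        return new long[] {fsyncWork, writeWork, extraWork};
    }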
    /**
     * Returns whether the log entry at the given LSN has been erased.
     *
     * Is not particularly efficient (it opens and closes the file each time
     * it is called) and is meant for exceptional circumstances such as when a
     * checksum verification fails.
     */
    public boolean isEntryErased(final long lsn) {

        final long file = DbLsn.getFileNumber(lsn);
        final long offset = DbLsn.getFileOffset(lsn);

        /*
         * Can't use FileManager.getFileHandle here because we may be called
         * with the FileHandle already latched and latching is not reentrant.
         */
        RandomAccessFile fileHandle = null;
        try {
            fileHandle = new RandomAccessFile(
                fileManager.getFullFileName(file),
                FileManager.FileMode.READ_MODE.getModeValue());

            fileHandle.seek(offset + LogEntryHeader.ENTRYTYPE_OFFSET);

            return fileHandle.readByte() ==
                LogEntryType.LOG_ERASED.getTypeNum();

        } catch (FileNotFoundException|EOFException e) {
            return false;
        } catch (IOException e) {
            throw new EnvironmentFailureException(
                envImpl, EnvironmentFailureReason.LOG_WRITE,
                "Exception checking erasure file 0x" +
                    Long.toHexString(file), e);
        } finally {
            if (fileHandle != null) {
                try {
                    fileHandle.close();
                } catch (IOException e) {
                    LoggerUtils.warning(
                        logger, envImpl,
                        "DataEraser.isEntryErased exception when closing " +
                            "file 0x" + Long.toHexString(file) + ": " + e);
                }
            }
        }
    }

    /**
     * Utility to help spread a given amount of totalWork over a given
     * durationMs.
     */
    private class WorkThrottle {

        private final float msPerUnitOfWork;
        private final long startTime;
        private long workDone;

        WorkThrottle(final long totalWork, final long durationMs) {
            msPerUnitOfWork = ((float) durationMs) / totalWork;
            startTime = System.currentTimeMillis();
            workDone = 0;
        }

        /**
         * Waits until the ratio of elapsed time to durationMs is GTE the
         * ratio of workDone to totalWork. Elapsed time is time since the
         * constructor was called.
         *
         * If the computed wait time is less than MIN_WORK_DELAY_MS, no wait
         * is performed. This is meant to prevent large numbers of waits for
         * relatively small amounts of work done. Note that this method is
         * called frequently to add very small amounts of work.
         */
        void throttle(final long addWork) {

            workDone += addWork;

            final long workDoneMs = (long) (workDone * msPerUnitOfWork);

            final long delayMs =
                workDoneMs - (System.currentTimeMillis() - startTime);

            if (delayMs >= MIN_WORK_DELAY_MS) {
                final long checkMs = Math.min(delayMs, pollCheckMs);

                PollCondition.await(checkMs, delayMs, pollMutex, () -> {
                    checkContinue();
                    return false;
                });
            }
        }
    }

    private void checkShutdown() {
        if (shutdownRequested || !envImpl.isValid()) {
            throw new ShutdownRequestedException();
        }
    }

    private void checkContinue() {

        checkShutdown();

        if (!enabled) {
            throw new ErasureDisabledException();
        }

        if (endTime - startTime != cycleMs) {
            throw new PeriodChangedException();
        }

        synchronized (currentFileMutex) {
            if (abortCurrentFile) {
                final long file = currentFile;
                currentFile = null;
                abortCurrentFile = false;
                throw new AbortCurrentFileException(file);
            }
        }
    }

    private boolean checkForCycleEnd() {

        if (System.currentTimeMillis() >= endTime) {

            if (filesRemaining.isEmpty()) {
                logCycleComplete();
            } else {
                logCycleIncomplete();
            }

            return true;
        }

        return false;
    }

    private static class NoUnprotectedFilesException extends RuntimeException {

        final Level logLevel;

        NoUnprotectedFilesException(final String msg, final Level logLevel) {
            super(msg);
            this.logLevel = logLevel;
        }
    }

    private static class ErasureDisabledException extends RuntimeException {
    }

    private static class PeriodChangedException extends RuntimeException {
    }

    private static class ShutdownRequestedException extends RuntimeException {
    }

    private static class AbortCurrentFileException extends RuntimeException {

        final long file;

        AbortCurrentFileException(final long file) {
            this.file = file;
        }
    }
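
    /*
     * Illustrative sketch (not part of the JE source): how WorkThrottle
     * above paces a task. With totalWork=1000 and durationMs=10000,
     * msPerUnitOfWork is 10, so after reporting 100 units the caller is
     * held back until at least 1000 ms have elapsed since construction.
     */
    private void sketchThrottleUsage() {
        final WorkThrottle throttle = new WorkThrottle(1000, 10_000);

        for (int i = 0; i < 10; i += 1) {
            /* ...perform 100 units of real work here... */
            throttle.throttle(100); /* waits if we are ahead of schedule */
        }
    }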
    /**
     * If enabled has been set to false, wait for it to be set to true again.
     *
     * Throws ShutdownRequestedException but no other internal exceptions.
     * Calling checkContinue is unnecessary because currentFile is null.
     */
    private void waitForEnabled() {

        while (true) {
            if (PollCondition.await(
                pollCheckMs, FOREVER_TIMEOUT_MS, pollMutex,
                () -> {
                    checkShutdown();
                    return enabled;
                })) {
                return;
            }
        }
    }

    /**
     * Wait for the current time to pass the end time.
     */
    private void waitForCycleEnd() {

        completionTime = System.currentTimeMillis();
        logCycleComplete();

        PollCondition.await(
            pollCheckMs, endTime - completionTime, pollMutex,
            () -> {
                checkContinue();
                return false;
            });
    }

    /**
     * Sleep for a time and dream about things that cause files to become
     * unprotected. When we wake up, our dreams may have come true.
     *
     * Throws ShutdownRequestedException but no other internal exceptions.
     * Calling checkContinue is unnecessary because currentFile is null.
     */
    private void waitForUnprotectedFiles(final NoUnprotectedFilesException e) {

        lastProtectedFilesMsg = e.getMessage();
        lastProtectedFilesMsgLevel = e.logLevel;

        PollCondition.await(
            pollCheckMs, NO_UNPROTECTED_FILES_DELAY_MS, pollMutex,
            () -> {
                checkShutdown();
                return false;
            });
    }

    private void storeState() {

        final TupleOutput out = new TupleOutput();
        out.writeLong(startTime);
        out.writeLong(endTime);
        out.writePackedLong(totalCycleWork);
        storeFileSet(out, filesCompleted);
        storeFileSet(out, filesRemaining);

        final DatabaseEntry data = new DatabaseEntry();
        TupleBase.outputToEntry(out, data);

        envImpl.getMetadataStore().put(MetadataStore.KEY_ERASER, data);
    }

    private boolean loadState() {

        final DatabaseEntry data = new DatabaseEntry();

        if (envImpl.getMetadataStore().get(
            MetadataStore.KEY_ERASER, data) == null) {
            return false;
        }

        final TupleInput in = TupleBase.entryToInput(data);
        startTime = in.readLong();
        endTime = in.readLong();
        totalCycleWork = in.readPackedLong();
        filesCompleted = loadFileSet(in);
        filesRemaining = loadFileSet(in);
        return true;
    }

    private void storeFileSet(final TupleOutput out,
                              final NavigableSet<Long> set) {

        out.writePackedInt(set.size());
        long priorFile = 0;

        for (final long file : set) {
            out.writePackedLong(file - priorFile);
            priorFile = file;
        }
    }

    private NavigableSet<Long> loadFileSet(final TupleInput in) {

        final int size = in.readPackedInt();

        final NavigableSet<Long> set =
            Collections.synchronizedNavigableSet(new TreeSet<>());

        long file = 0;

        for (int i = 0; i < size; i += 1) {
            file += in.readPackedLong();
            set.add(file);
        }

        return set;
    }
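
    /*
     * Illustrative sketch (not part of the JE source): storeFileSet and
     * loadFileSet above delta-encode the sorted file numbers so that each
     * packed long stays small. For example, {0x100, 0x101, 0x105} is written
     * as the size 3 followed by the deltas 0x100, 1 and 4, and reading it
     * back reverses the running sum.
     */
    private NavigableSet<Long> sketchFileSetRoundTrip() {

        final NavigableSet<Long> files =
            Collections.synchronizedNavigableSet(new TreeSet<>());
        files.add(0x100L);
        files.add(0x101L);
        files.add(0x105L);

        final TupleOutput out = new TupleOutput();
        storeFileSet(out, files); /* writes size=3, deltas 0x100, 1, 4 */

        final DatabaseEntry entry = new DatabaseEntry();
        TupleBase.outputToEntry(out, entry);

        /* Returns a set equal to the original. */
        return loadFileSet(TupleBase.entryToInput(entry));
    }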
    /** Cached info about a file to be erased. */
    private static class FileInfo {
        final long creationTime;
        final long length;
        VLSN lastVlsn;

        FileInfo(final long creationTime, final long length) {
            this.creationTime = creationTime;
            this.length = length;
        }
    }

    private void logCycleInit(final int filesToExamine) {
        LoggerUtils.info(logger, envImpl,
            "ERASER initializing new cycle. Total files: " + filesToExamine);
        callTestEventHook(TestEvent.Type.INIT);
    }

    private void logCycleStart() {
        LoggerUtils.info(logger, envImpl,
            "ERASER new cycle started. " + getCycleStatus());
        callTestEventHook(TestEvent.Type.START);
    }

    private void logCycleComplete() {
        LoggerUtils.info(logger, envImpl,
            "ERASER cycle completed. " + getCycleStatus());
        callTestEventHook(TestEvent.Type.COMPLETE);
    }

    private void logCycleIncomplete() {

        final String msg = "ERASER unable to erase files " +
            "within the erasure period. " +
            ((lastProtectedFilesMsg != null) ?
                lastProtectedFilesMsg :
                "File protection did not prevent erasure, so probably " +
                    "just ran out of time.") +
            " " + getFileProtectionMessage() +
            " " + getCycleStatus();

        LoggerUtils.logMsg(
            logger, envImpl,
            (lastProtectedFilesMsgLevel != null) ?
                lastProtectedFilesMsgLevel : Level.INFO,
            msg);

        callTestEventHook(TestEvent.Type.INCOMPLETE);
    }

    private void logCycleResume() {
        LoggerUtils.info(logger, envImpl,
            "ERASER previously incomplete cycle resumed at startup. " +
                getCycleStatus());
        callTestEventHook(TestEvent.Type.RESUME);
    }

    private void logCycleCannotResume() {
        LoggerUtils.warning(logger, envImpl,
            "ERASER previously incomplete cycle not resumed at startup" +
                " because end time has passed. " + getCycleStatus());
        callTestEventHook(TestEvent.Type.CANNOT_RESUME);
    }

    private void logCycleSuspend() {
        LoggerUtils.info(logger, envImpl,
            "ERASER incomplete cycle suspended at shutdown. " +
                getFileProtectionMessage() + " " + getCycleStatus());
        callTestEventHook(TestEvent.Type.SUSPEND);
    }

    private void logPeriodChanged() {
        LoggerUtils.info(logger, envImpl,
            "ERASER period param was changed, current cycle aborted. " +
                getCycleStatus());
        callTestEventHook(TestEvent.Type.PERIOD_CHANGED);
    }

    private String getCycleStatus() {
        return "Cycle start: " + formatTime(startTime) +
            ", end: " + formatTime(endTime) +
            ", filesCompleted: [" + FormatUtil.asHexString(filesCompleted) +
            "], filesRemaining: [" + FormatUtil.asHexString(filesRemaining) +
            "].";
    }

    private static String formatTime(final long time) {
        return DATE_FORMAT.format(new Date(time));
    }

    private void callTestEventHook(final TestEvent.Type type) {

        if (testEventHook == null) {
            return;
        }

        testEventHook.doHook(new TestEvent(type, this));
    }

    static void setTestEventHook(final TestHook<TestEvent> hook) {
        testEventHook = hook;
    }

    static class TestEvent {

        enum Type {
            INIT,
            START,
            COMPLETE,
            INCOMPLETE,
            RESUME,
            CANNOT_RESUME,
            SUSPEND,
            PERIOD_CHANGED,
        }

        final Type type;
        final long startTime;
        final long endTime;
        final long completionTime;
        final NavigableSet<Long> filesCompleted;
        final NavigableSet<Long> filesRemaining;

        TestEvent(final Type type, final DataEraser eraser) {
            this.type = type;
            startTime = eraser.startTime;
            endTime = eraser.endTime;
            completionTime = eraser.completionTime;
            filesCompleted = new TreeSet<>(eraser.filesCompleted);
            filesRemaining = new TreeSet<>(eraser.filesRemaining);
        }

        @Override
        public String toString() {
            return type.toString() +
                " startTime=" + formatTime(startTime) +
                " endTime=" + formatTime(endTime) +
                " completionTime=" + formatTime(completionTime) +
                " filesCompleted=[" +
                FormatUtil.asHexString(filesCompleted) +
                "] filesRemaining=[" +
                FormatUtil.asHexString(filesRemaining) + "]";
        }
    }
}



