/*-
 * Copyright (C) 2002, 2018, Oracle and/or its affiliates. All rights reserved.
 *
 * This file was distributed by Oracle as part of a version of Oracle Berkeley
 * DB Java Edition made available at:
 *
 * http://www.oracle.com/technetwork/database/database-technologies/berkeleydb/downloads/index.html
 *
 * Please see the LICENSE file included in the top-level directory of the
 * appropriate version of Oracle Berkeley DB Java Edition for a copy of the
 * license and additional information.
 */
package com.sleepycat.je.cleaner;

import static com.sleepycat.je.ExtinctionFilter.ExtinctionStatus.EXTINCT;
import static com.sleepycat.je.ExtinctionFilter.ExtinctionStatus.MAYBE_EXTINCT;

import java.io.EOFException;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.text.DateFormat;
import java.util.Collections;
import java.util.Date;
import java.util.NavigableMap;
import java.util.NavigableSet;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.logging.Level;
import java.util.logging.Logger;

import com.sleepycat.bind.tuple.TupleBase;
import com.sleepycat.bind.tuple.TupleInput;
import com.sleepycat.bind.tuple.TupleOutput;
import com.sleepycat.je.CacheMode;
import com.sleepycat.je.DatabaseEntry;
import com.sleepycat.je.EnvironmentFailureException;
import com.sleepycat.je.EnvironmentMutableConfig;
import com.sleepycat.je.ExtinctionFilter;
import com.sleepycat.je.StatsConfig;
import com.sleepycat.je.config.EnvironmentParams;
import com.sleepycat.je.dbi.DatabaseId;
import com.sleepycat.je.dbi.DatabaseImpl;
import com.sleepycat.je.dbi.DbConfigManager;
import com.sleepycat.je.dbi.EnvConfigObserver;
import com.sleepycat.je.dbi.EnvironmentFailureReason;
import com.sleepycat.je.dbi.EnvironmentImpl;
import com.sleepycat.je.dbi.MetadataStore;
import com.sleepycat.je.log.ChecksumException;
import com.sleepycat.je.log.FileManager;
import com.sleepycat.je.log.FileReader;
import com.sleepycat.je.log.LogEntryHeader;
import com.sleepycat.je.log.LogEntryType;
import com.sleepycat.je.log.entry.ErasedLogEntry;
import com.sleepycat.je.log.entry.LNLogEntry;
import com.sleepycat.je.log.entry.LogEntry;
import com.sleepycat.je.tree.BIN;
import com.sleepycat.je.tree.SearchResult;
import com.sleepycat.je.utilint.DbLsn;
import com.sleepycat.je.utilint.IntStat;
import com.sleepycat.je.utilint.LoggerUtils;
import com.sleepycat.je.utilint.LongStat;
import com.sleepycat.je.utilint.Pair;
import com.sleepycat.je.utilint.PollCondition;
import com.sleepycat.je.utilint.PropUtil;
import com.sleepycat.je.utilint.StatGroup;
import com.sleepycat.je.utilint.StoppableThread;
import com.sleepycat.je.utilint.StringStat;
import com.sleepycat.je.utilint.TestHook;
import com.sleepycat.je.utilint.TracerFormatter;
import com.sleepycat.je.utilint.VLSN;
import com.sleepycat.utilint.FormatUtil;

/**
 * Erases obsolete data from disk during a configured interval. [#26954]
 *
 * Basics
 * ======
 * - The purpose of the erasure feature is to erase obsolete user data within
 *   some reasonable period after becoming obsolete, and to do this without
 *   adding significant overhead that might impact application performance.
 *
 * - There are no strict guarantees about when erasure is complete. This
 *   means it is best to think of erasure as a "best effort". Some examples
 *   where data is not always erased according to the erasure period are:
 *
 *   1. If a node is down for an extended time, obviously no erasure will
 *      occur until it comes back up. After it comes up, there may not be
 *      enough time left in the cycle for erasure to be completed. The rate
 *      of erasure is not increased in this situation. The erasure rate is
 *      kept constant throughout each cycle to avoid resource usage spikes.
 *
 *   2. Data at the tail end of the data log will not be erased because it is
 *      needed for recovery and replication. In this case, the data will
 *      eventually be erased as more writing occurs.
 *
 *   3. UINs (upper internal nodes in the Btree) are not erased, and
 *      therefore keys in UINs, which can be considered user data, are not
 *      erased. Erasing UINs would be very difficult because ancestor slot
 *      keys would have to be adjusted and zeroing the entire log entry
 *      would not be possible. For now this is simply a limitation due to
 *      operational considerations.
 *
 *   4. From a performance viewpoint, erasure is designed so that each erasure
 *      cycle will be completed over a multi-day period. If the erasure period
 *      (see below) is set to a small value for testing, the cycle may not
 *      be completed and this is acceptable behavior. The minimum time
 *      needed to complete the cycle is dependent on the total size of the
 *      files to be erased in the cycle, the load on the system, and other
 *      factors.
 *
 * - Currently the erasure feature is hidden in the JE API. It may be exposed
 *   in the future.
 *
 * - The eraser overall may be enabled or disabled using the following param.
 *   It is disabled by default.
 *
 *     je.env.runEraser -- boolean, default: false, mutable: yes
 *
 * - Currently only erasure of data in deleted DBs (removed or truncated) and
 *   erasure of extinct records are supported. These are enabled via the
 *   following two params. These are both false by default.
 *
 *     je.erase.deletedDatabases -- boolean, default: false, mutable: yes
 *     je.erase.extinctRecords -- boolean, default: false, mutable: yes
 *
 *   By setting both of these parameters to true in NoSQL DB, obsolete data
 *   will be removed for dropped tables and dropped indexes.
 *
 * - In the future we may add the ability to erase expired data (via TTL) and
 *   data that has been updated or deleted. This will be more expensive and
 *   complex, and will probably be enabled via additional config params.
 *
 * - During each N day cycle, we erase all files that were N days old (or
 *   older) at the start of the cycle. Therefore, the first erasure for a file
 *   occurs when it is between N and N*2 days old. Thereafter, the file is
 *   erased every N days. The erasure period is set using the following param,
 *   which is zero by default; the period must be explicitly set to perform
 *   erasure.
 *
 *     je.erase.period -- duration, default: 0, min: 0, max: none, mutable: yes
 *
 * - Here's an example for how the erasure period works in practice.
 *
 *     Let's say the erasure period is 30 min. At the start of each 30 min
 *     erasure cycle, the data files older than 30 min are selected. Those
 *     files should normally be erased by the end of the cycle. Then a new
 *     cycle starts and a new set of files older than 30 minutes is
 *     selected.
 *
 *     Let's assume that writing is continuous and the checkpoint interval
 *     is fairly small, and therefore we won't run into the problem that
 *     data at the end of the log cannot be erased. (This problem is
 *     described in more detail in a later section.)
 *
 *     Then at any time, roughly speaking, all data that became obsolete more
 *     than 60 minutes ago should be erased. The erasure period is meant to be
 *     half of the desired interval after which obsolete data should disappear.
 *
 *     The erasure doesn't have to be complete exactly at the end of each
 *     erasure period. In production we expect the erasure period to be set to
 *     less than half of the desired interval to allow for impreciseness,
 *     problems, etc. For example, if the desired interval is 60 days in
 *     production, the erasure period would normally be set to around 25 days.
 *
 *  - Here's an example of how params could be set to ensure that obsolete
 *    data is removed within 60 days of becoming obsolete:
 *
 *      je.env.runEraser=true
 *      je.erase.deletedDatabases=true
 *      je.erase.extinctRecords=true
 *      je.erase.period=25 days
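 *
 *    As a sketch only (hedged, not part of this class): since the feature
 *    is hidden, the params are set by name via setConfigParam; "envHome"
 *    below is a hypothetical directory:
 *
 *      EnvironmentConfig config = new EnvironmentConfig();
 *      config.setAllowCreate(true);
 *      config.setConfigParam("je.env.runEraser", "true");
 *      config.setConfigParam("je.erase.deletedDatabases", "true");
 *      config.setConfigParam("je.erase.extinctRecords", "true");
 *      config.setConfigParam("je.erase.period", "25 days");
 *      Environment env = new Environment(envHome, config);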
 *
 * - WARNING: Erasure cannot be used for applications that rely on the usual
 *   immutable nature of JE data files. For example, if DbBackup is used to
 *   create a snapshot using file links, the assumption that the files will
 *   not be modified by JE no longer holds when erasure is enabled. For
 *   example, NoSQL DB snapshots (which are just links to data files) are
 *   invalidated by data erasure. To perform a valid backup, NoSQL DB must
 *   actually copy the files while they're protected by DbBackup (between
 *   the calls to startBackup and endBackup).
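 *
 *   A minimal copy-based backup sketch using the DbBackup API (hedged:
 *   copyFile, backupDir and envHome are hypothetical helpers, not part
 *   of JE):
 *
 *     DbBackup backup = new DbBackup(env);
 *     backup.startBackup();
 *     try {
 *         for (String name : backup.getLogFilesInBackupSet()) {
 *             copyFile(new File(envHome, name), backupDir);
 *         }
 *     } finally {
 *         backup.endBackup();
 *     }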
 *
 * Trace Logging
 * =============
 * Trace logging is performed for the following erasure cycle events. All
 * messages are INFO-level messages except when stated otherwise.
 *
 *  - A fresh cycle starts. This occurs at startup, or after the prior
 *    cycle finishes and its end time passes, or when the prior cycle is
 *    aborted because its end time has passed.
 *
 *  - A cycle finishes because erasure of all files is complete. The
 *    message is logged immediately after all files have been erased, even
 *    if the cycle end time has not yet passed. The eraser will then be
 *    idle until the cycle end time passes.
 *
 *  - A cycle aborts because its end time has passed but is incomplete
 *    because not all files were erased. A fresh cycle is then started.
 *    This is a WARNING-level message when files cannot be erased because
 *    they are protected for reasons that are expected to be short lived
 *    (backup, replication) but have persisted until the end of the cycle.
 *    In other situations this is an INFO-level message, for example:
 *    - when the files are needed for recovery,
 *    - when the files are needed to retain at least 1000 VLSNs (or
 *      {@link com.sleepycat.je.rep.impl.RepParams#MIN_VLSN_INDEX_SIZE}),
 *    - when the files do not contain any VLSNs and therefore the VLSNIndex
 *      cannot be truncated at any specific point (this is a corner case).
 *
 *  - A cycle is suspended because the JE Environment is closed and not
 *    all files were erased. The cycle will be resumed at the next
 *    Environment open, if the cycle end time has not passed.
 *
 *  - An incomplete cycle is resumed at startup. The cycle was incomplete
 *    at the last Environment close, and its end time has not yet passed.
 *
 *  - An incomplete cycle cannot be resumed at startup because its end
 *    time has now passed. A fresh cycle is then started.
 *
 * All messages start with ERASER and include the cycle start/end times, the
 * files processed, and the files that are yet to be processed.
 *
 * Stats
 * =====
 * Stat Group: Eraser
 *
 * eraserCycleStart - Erasure cycle start time (UTC).
 * eraserCycleEnd - Erasure cycle end time (UTC).
 * eraserFilesRemaining - Number of files still to be processed in erasure
 *                        cycle.
 * eraserFilesErased - Number of files erased by overwriting obsolete entries.
 * eraserFilesDeleted - Number of reserved files deleted by the eraser.
 * eraserFilesAlreadyDeleted - Number of reserved files deleted coincidentally
 *                             by the cleaner.
 * eraserFSyncs - Number of fsyncs performed by the eraser.
 * eraserReads - Number of file reads performed by the eraser.
 * eraserReadBytes - Number of bytes read by the eraser.
 * eraserWrites - Number of file writes performed by the eraser.
 * eraserWriteBytes - Number of bytes written by the eraser.
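 *
 * These stats are loaded into the "Eraser" group by {@link #loadStats}. A
 * minimal sketch of reading them via the public stats API, assuming env is
 * an open Environment with erasure enabled:
 *
 *   StatsConfig config = new StatsConfig();
 *   config.setClear(false);
 *   EnvironmentStats stats = env.getStats(config);
 *   System.out.println(stats); // output includes the Eraser group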
 *
 * Erasing data at the end of log
 * ==============================
 * When writing stops, the user data at the tail end of the log may need
 * erasure at some point. This would happen if a table is dropped and then
 * there is no writing for a long time (or very little writing). This is a
 * corner case that we do not expect in production, but it can still happen
 * there, and it certainly happens in tests.
 *
 * Currently JE does not allow cleaning of any file in the recovery interval.
 * This is to ensure that recovery works, of course. The same restriction
 * probably applies to erasure. If we were to treat any slot referencing an
 * erased LSN as if the slot were deleted, we might get recovery to work.
 * But this would be complex to analyze and test thoroughly, especially for IN
 * replay. Therefore if we need to erase items in the recovery interval, we
 * would need to detect this situation and force a checkpoint before erasing.
 *
 * In addition, to erase all entries at the end of the log means that the
 * VLSNIndex would need to be completely truncated, i.e., made empty. This
 * means the node could not be used as a feeder when functioning as a master,
 * and could not perform syncup when functioning as a replica. Rather than
 * emptying the VLSNIndex completely we could log a benign replicated entry
 * so there is at least one entry. But that doesn't address the broader
 * problem of syncup. Perhaps this doesn't matter when writing has stopped
 * for an extended period because the replicas will be up-to-date. And
 * perhaps network restore is fine for other unusual cases. But it is a risk.
 * Right now we always leave at least 1,000 VLSNs in the VLSNIndex to guard
 * against problems, but we don't really know what problems might arise. So
 * it would take some work to figure this out and test it.
 *
 * Therefore, we simply do not erase obsolete data when it appears at the tail
 * end of the log. One way to explain this is to say that we can't erase
 * data at the very end of the transaction log, because this would prevent
 * recovery in certain situations. We do log a warning message in this
 * situation.
 *
 * Aborting Erasure
 * ================
 * Erasure of a file is aborted in the following cases.
 *
 * - The file is needed for a backup or network restore. In both cases, it is
 *   DbBackup.startBackup that aborts the erasure. The aborted file will then
 *   be protected and won't be selected again for erasure until the backup or
 *   network restore is finished.
 *
 * - The extinction filter returns MAYBE_EXTINCT. This is due to a temporary
 *   situation at startup time, while NoSQL DB (or another app) has not yet
 *   initialized its metadata (table metadata in the case of NoSQL DB). It
 *   will be retried repeatedly until this no longer occurs.
 *
 *     Discarded idea: We could add a way to know that the filter is fully
 *     initialized and the eraser thread could delay starting until then. But
 *     we would still have to abort if MAYBE_EXTINCT is returned after that
 *     point. So for simplicity we just do the abort.
 *
 * Reserved Files, VLSNIndex, Cleaning
 * ===================================
 * - Reserved files are an exception. Because an erased file cannot be used for
 *   replication, reserved files are deleted rather than erased.
 *   Therefore, reserved files are never older than N*2 days.
 *
 * - Before erasing a file covered by the VLSNIndex, we truncate the
 *   VLSNIndex to remove the file from its range. Because the VLSNIndex range
 *   never retreats (only advances), files protected by the VLSNIndex will
 *   never have been erased.
 *
 * - However, other files may be erased and subsequently become protected. This
 *   is OK, because such protection only needs to guarantee that files are not
 *   changed while they are protected. These include:
 *    - Backup and network restore.
 *    - DiskOrderedCursor and Database.count.
 *
 * - Cleaning is not coordinated with erasure (except for the treatment of
 *   reserved files discussed above). A given file may be erased and cleaned
 *   concurrently, and this should not cause problems. This is a waste of
 *   resources, but since it is unlikely we do not try to detect it or
 *   optimize for it. Such coordination would add a lot of complexity.
 *
 * Throttling
 * ==========
 * Throttling is performed by estimating the total amount of work in the
 * cycle at the beginning of the cycle, dividing up the total cycle time
 * by this work amount, and throttling (waiting) at various points in
 * order to spread the work out fairly evenly over the cycle period.
 * The {@link WorkThrottle} class calculates the wait time based on work
 * done so far.
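 *
 * In essence, the computation in {@link WorkThrottle#throttle} below is:
 *
 *   msPerUnitOfWork = durationMs / totalWork;
 *   delayMs = (workDone * msPerUnitOfWork) - elapsedMs;
 *   if (delayMs >= MIN_WORK_DELAY_MS) wait up to delayMs;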
 *
 * It is possible that we cannot complete the work within the cycle period,
 * for example, because a node is down for an extended period. In such
 * cases we do _not_ attempt to catch up by speeding up the rate of work,
 * since this could cause performance spikes. Instead we intentionally
 * overestimate the amount of work, leaving spare time to account for such
 * problems. In the end, if erasure of all selected files cannot be
 * completed by the end of the cycle, the cycle will be aborted and a new
 * cycle is started with a recalculated set of files. This is acceptable
 * behavior in unusual conditions.
 *
 *   Note: Such problems could be addressed differently by integrating
 *   with the TaskCoordinator. In that case we would use a different
 *   approach entirely: we would simply work at the maximum rate allowed by
 *   the TaskCoordinator. But for this to work well, other JE components would
 *   also need to be integrated with the TaskCoordinator. So for now we simply
 *   perform work at a fixed rate within each cycle.
 *
 * Before the cycle starts we have to open each file to get its creation
 * time, which has a cost. Throttling for that one-time task is performed
 * separately in {@link #startCycle}. The remaining time in the cycle is also
 * allocated by {@link #startCycle}, which initializes {@link #cycleThrottle}
 * and related fields. Work is divided as described below.
 *
 * For each file to be erased there are several components of work:
 *
 * 1. We may have to read files, or parts of files, for two reasons that
 *    are in addition to the file erasure process itself:
 *
 *    a. We may have to read the file to find its last VLSN, to determine
 *       where to truncate the VLSNIndex. In the worst case scenario we have
 *       to do this for every file, but it is much more likely that it will
 *       only have to be done for a small fraction of the files, and only a
 *       fraction (the end) of each file will normally be read. In addition,
 *       each time we truncate the VLSNIndex we perform an fsync; however,
 *       in the normal case we do this only once per cycle.
 *
 *    b. We read a file redundantly when erasure of a file is aborted and
 *       restarted later. See Aborting Erasure above.
 *
 * 2. Read through the file and overwrite the type byte for each entry
 *    that should be erased. Since we know the length of each file, the
 *    read cost is known. We don't know how many erasures will take place,
 *    but the worst case is that every entry is erased.
 *
 *    Note that reserved files are simply deleted rather than erased,
 *    which is cheaper than erasure. So when there are reserved files,
 *    we will overestimate the amount of work.
 *
 * 3. Before any overwriting, at the time we determine that at least one entry
 *    must be erased, touch the file and perform an fsync to ensure that the
 *    lastModifiedTime is updated persistently. The fsync is assumed to be
 *    expensive.
 *
 * 4. After overwriting all type bytes, perform a second fsync to make the
 *    type changes persistent. The fsync is assumed to be expensive.
 *
 * 5. Overwrite the item in each entry that was erased. This cost is
 *    unknown, but the worst case is that every entry is erased.
 *
 * 6. Perform the third and final fsync to make the erasure persistent. The
 *    fsync is assumed to be expensive.
 *
 * Work units are defined as follows.
 *
 * - For component 1 we assign a work unit to each byte read (the file
 *   length). This is a very large overestimate and is intended to
 *   account for processing delays, such as when a node is down.
 *
 * - For component 2 we also assign a work unit to each byte read (the file
 *   length). The overwrite of type bytes is variable and included in this
 *   cost for simplicity.
 *
 * - For components 3, 4, 5 and 6 together we also assign the file length as
 *   a very rough estimate, and this work is divided between these components
 *   as follows:
 *     18% for component 3 (first fsync, after touching the file)
 *     16% for component 4 (second fsync, after type-byte overwrites)
 *     50% for component 5 (overwriting the erased items)
 *     16% for component 6 (third and final fsync)
 *
 * Therefore the total amount of work is simply three times the length of the
 * files to be erased.
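 *
 * For example, with illustrative numbers: for a single 100 MB file, the
 * budget is 100 MB of work units for component 1, 100 MB for component 2,
 * and 100 MB for components 3-6 combined (split 18/16/50/16), i.e.,
 * 300 MB of work units in total.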
 *
 * Other
 * =====
 * - To support network restore, before erasing a file we remove its cached
 *   info (which includes a checksum) from the response cache in LogFileFeeder.
 *   See {@link
 *   com.sleepycat.je.rep.impl.RepImpl#clearedCachedFileChecksum(String)}.
 *
 * - Prohibiting erasure of protected files prevents changes to the file while
 *   it is being read by the protecting entity. Even so, if we read from the
 *   log buffer cache or tip cache, we could also get different results than
 *   reading directly from the file. To be safe, erasure could clear any
 *   cached data for the file. But is this necessary? No, because reading
 *   from the tip cache and log buffer cache is done only by feeders, and
 *   the files read by feeders are protected from erasure.
 *
 * - BtreeVerifier, when configured to read LNs, checks for LSN references to
 *   erased entries since it simply reads the LNs via a cursor. If the LN has
 *   been erased and it is not extinct, the environment is invalidated as
 *   usual.
 *
 * - The checksum for the LOG_ERASED type cannot be verified, and its
 *   {@link LogEntryHeader#hasChecksum()} method will return false. Because
 *   an entry may be erased in the middle of checksum calculation, the header
 *   may have to be re-read from disk in rare cases. See
 *   {@link com.sleepycat.je.log.ChecksumValidator#validate(long, long)}.
 *
 * - The LOG_ERASED type is not counted as an LN or IN, which could throw off
 *   utilization counts. This may only impact tests and debugging. A thorough
 *   analysis of this issue has not been performed.
 */
public class DataEraser extends StoppableThread implements EnvConfigObserver {

    private static final int MAX_FILE_INFO_MS = 1000;
    private static final int MIN_WORK_DELAY_MS = 5;
    private static final int MAX_SLEEP_MS = 100;
    private static final int FOREVER_TIMEOUT_MS = 5 * 60 * 1000;
    private static final int NO_UNPROTECTED_FILES_DELAY_MS = 1000;
    private static final String TEST_ERASE_PERIOD = "test.erasePeriod";
    private static TestHook testEventHook;

    private static final DateFormat DATE_FORMAT =
        TracerFormatter.makeDateFormat();

    private static final int WRITE_WORK_PCT = 50;
    private static final int FSYNC1_WORK_PCT = 18;
    private static final int FSYNC2_WORK_PCT = 16;
    private static final int FSYNC3_WORK_PCT = 16;

    private final Cleaner cleaner;
    private final FileProtector fileProtector;
    private final FileManager fileManager;
    private final Logger logger;
    private volatile boolean shutdownRequested = false;
    private volatile boolean enabled = false;
    private int terminateMillis;
    private volatile long cycleMs = 0;
    private boolean eraseDeletedDbs = false;
    private boolean eraseExtinctRecords = false;
    private int pollCheckMs;
    private long[] eraseOffsets = new long[5 * 1024];
    private int[] eraseSizes = new int[eraseOffsets.length];
    private byte[] zeros = new byte[10 * 1024];
    private final Object pollMutex = new Object();
    private final NavigableMap<Long, FileInfo> fileInfoCache =
        new TreeMap<>();
    private WorkThrottle cycleThrottle;
    private long totalCycleWork;
    private String lastProtectedFilesMsg;
    private Level lastProtectedFilesMsgLevel;

    /**
     * The eraser is single-threaded but there is occasional multi-threaded
     * access to the following fields due to stat loading.
     */
    private volatile long startTime;
    private volatile long endTime;
    private volatile long completionTime;
    private NavigableSet<Long> filesRemaining =
        Collections.emptyNavigableSet();
    private NavigableSet<Long> filesCompleted =
        Collections.emptyNavigableSet();
    private final AtomicInteger filesErased = new AtomicInteger();
    private final AtomicInteger filesDeleted = new AtomicInteger();
    private final AtomicInteger filesAlreadyDeleted = new AtomicInteger();
    private final AtomicInteger fSyncs = new AtomicInteger();
    private final AtomicLong reads = new AtomicLong();
    private final AtomicLong readBytes = new AtomicLong();
    private final AtomicLong writes = new AtomicLong();
    private final AtomicLong writeBytes = new AtomicLong();

    /**
     * currentFileMutex protects currentFile and abortCurrentFile.
     */
    private final Object currentFileMutex = new Object();
    private volatile Long currentFile;
    private boolean abortCurrentFile;
    private int abortTimeoutMs;

    public DataEraser(final EnvironmentImpl envImpl) {

        super(envImpl, "JEErasure");
        cleaner = envImpl.getCleaner();
        fileProtector = envImpl.getFileProtector();
        fileManager = envImpl.getFileManager();
        logger = LoggerUtils.getLogger(getClass());

        envConfigUpdate(envImpl.getConfigManager(), null);
        envImpl.addConfigObserver(this);
    }

    @Override
    public void envConfigUpdate(
        final DbConfigManager configManager,
        final EnvironmentMutableConfig ignore) {

        /*
         * If the TEST_ERASE_PERIOD system property is specified and
         * ENV_RUN_ERASER is not specified, enable erasure and use the test
         * period.
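         * For example, a hypothetical test invocation:
         *   java -Dtest.erasePeriod="30 min" ...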
         */
        final boolean runErase;
        final String testErasePeriod = System.getProperty(TEST_ERASE_PERIOD);

        if (testErasePeriod != null &&
            !configManager.isSpecified(EnvironmentParams.ENV_RUN_ERASER)) {

            runErase = true;
            cycleMs = PropUtil.parseDuration(testErasePeriod);
            eraseDeletedDbs = true;
            eraseExtinctRecords = true;
        } else {

            runErase = configManager.getBoolean(
                EnvironmentParams.ENV_RUN_ERASER);

            cycleMs = configManager.getDuration(
                EnvironmentParams.ERASE_PERIOD);

            eraseDeletedDbs = configManager.getBoolean(
                EnvironmentParams.ERASE_DELETED_DATABASES);

            eraseExtinctRecords = configManager.getBoolean(
                EnvironmentParams.ERASE_EXTINCT_RECORDS);
        }

        enabled = runErase && cycleMs > 0 &&
            (eraseDeletedDbs || eraseExtinctRecords);

        terminateMillis = configManager.getDuration(
            EnvironmentParams.EVICTOR_TERMINATE_TIMEOUT);

        pollCheckMs = Math.min(MAX_SLEEP_MS, terminateMillis / 4);

        abortTimeoutMs = configManager.getDuration(
            EnvironmentParams.ERASE_ABORT_TIMEOUT);
    }

    public StatGroup loadStats(StatsConfig config) {

        final StatGroup statGroup = new StatGroup(
            EraserStatDefinition.GROUP_NAME, EraserStatDefinition.GROUP_DESC);

        /* Add CUMULATIVE stats. */

        final DateFormat dateFormat = TracerFormatter.makeDateFormat();

        new StringStat(
            statGroup, EraserStatDefinition.ERASER_CYCLE_START,
            (startTime == 0) ? "" : dateFormat.format(new Date(startTime)));

        new StringStat(
            statGroup, EraserStatDefinition.ERASER_CYCLE_END,
            (endTime == 0) ? "" : dateFormat.format(new Date(endTime)));

        new IntStat(
            statGroup, EraserStatDefinition.ERASER_FILES_REMAINING,
            filesRemaining.size());

        /* INCREMENTAL stats must be cleared. */

        final boolean clear = config.getClear();

        new IntStat(
            statGroup, EraserStatDefinition.ERASER_FILES_ERASED,
            getStat(filesErased, clear));

        new IntStat(
            statGroup, EraserStatDefinition.ERASER_FILES_DELETED,
            getStat(filesDeleted, clear));

        new IntStat(
            statGroup, EraserStatDefinition.ERASER_FILES_ALREADY_DELETED,
            getStat(filesAlreadyDeleted, clear));

        new IntStat(
            statGroup, EraserStatDefinition.ERASER_FSYNCS,
            getStat(fSyncs, clear));

        new LongStat(
            statGroup, EraserStatDefinition.ERASER_READS,
            getStat(reads, clear));

        new LongStat(
            statGroup, EraserStatDefinition.ERASER_READ_BYTES,
            getStat(readBytes, clear));

        new LongStat(
            statGroup, EraserStatDefinition.ERASER_WRITES,
            getStat(writes, clear));

        new LongStat(
            statGroup, EraserStatDefinition.ERASER_WRITE_BYTES,
            getStat(writeBytes, clear));

        return statGroup;
    }

    private long getStat(final AtomicLong val, final boolean clear) {
        return clear ? val.getAndSet(0) : val.get();
    }

    private int getStat(final AtomicInteger val, final boolean clear) {
        return clear ? val.getAndSet(0) : val.get();
    }

    @Override
    public Logger getLogger() {
        return logger;
    }

    @Override
    public int initiateSoftShutdown() {
        shutdownRequested = true;
        synchronized (pollMutex) {
            pollMutex.notify();
        }
        return terminateMillis;
    }

    public void startThread() {

        if (enabled &&
            !envImpl.isMemOnly() &&
            !envImpl.isReadOnly() &&
            !isAlive()) {

            /*
             * Ensure metadata DB is opened before Environment ctor finishes,
             * to avoid timing problems in tests that are sensitive to the
             * number of open DBs.
             */
            envImpl.getMetadataStore().openDb();

            start();
        }
    }

    @Override
    public void run() {
        boolean isInitialized = false;
        boolean isStarted = false;

        /*
         * Internal exceptions are used to signal state changes, and they are
         * handled in this method. Other than ShutdownRequestedException, all
         * such exceptions must be handled inside the while loop.
         */
        try {
            while (true) {
                checkShutdown();
                try {
                    if (!isInitialized) {
                        isInitialized = true;
                        if (resumeCycle()) {
                            isStarted = true;
                        }
                    }
                    if (!isStarted) {
                        startCycle();
                        isStarted = true;
                    }
                    if (checkForCycleEnd()) {
                        waitForEnabled();
                        isStarted = false;
                        continue;
                    }
                    final Long file = getNextFile();
                    if (file != null) {
                        try {
                            eraseFile(file);
                            filesCompleted.add(file);
                            filesRemaining.remove(file);
                            storeState();
                        } finally {
                            clearCurrentFile();
                        }
                    } else {
                        waitForCycleEnd();
                        waitForEnabled();
                        isStarted = false;
                    }
                } catch (NoUnprotectedFilesException e) {
                    waitForUnprotectedFiles(e);
                } catch (ErasureDisabledException e) {
                    waitForEnabled();
                } catch (PeriodChangedException e) {
                    logPeriodChanged();
                    isStarted = false;
                } catch (AbortCurrentFileException e) {
                    /* Continue. */
                }
            }
        } catch (ShutdownRequestedException e) {
            if (!filesRemaining.isEmpty()) {
                logCycleSuspend();
            }
        }
    }

    /**
     * If we can load the last stored state and the endTime of the last cycle
     * has not yet arrived, resume execution and return true.
     * Otherwise, return false.
     */
    private boolean resumeCycle() {

        if (!loadState()) {
            return false;
        }

        if (System.currentTimeMillis() >= endTime) {
            if (!filesRemaining.isEmpty()) {
                logCycleCannotResume();
            }
            return false;
        }

        logCycleResume();

        final WorkThrottle throttle =
            createFileInfoThrottle(filesRemaining.size(), cycleMs);

        /* Populate file info cache for files remaining in cycle. */
        for (final Long file : filesRemaining) {

            fileInfoCache.put(
                file,
                new FileInfo(getFileCreationTime(file), getFileLength(file)));

            throttle.throttle(1);
        }

        cycleThrottle = new WorkThrottle(totalCycleWork, endTime - startTime);
        return true;
    }

    /**
     * Resets per-cycle info, including determining the set of files to be
     * erased in the next cycle.
     */
    private void startCycle() {

        /* Ignore config changes during calculations. */
        final long localCycleMs = cycleMs;

        startTime = System.currentTimeMillis();
        endTime = startTime + localCycleMs;
        final long fileAgeCutoff = startTime - localCycleMs;
        filesCompleted = Collections.synchronizedNavigableSet(new TreeSet<>());
        filesRemaining = Collections.synchronizedNavigableSet(new TreeSet<>());
        completionTime = 0;

        final NavigableSet<Long> allFiles =
            fileProtector.getAllCompletedFiles();

        final WorkThrottle throttle =
            createFileInfoThrottle(allFiles.size(), localCycleMs);

        logCycleInit(allFiles.size());

        /*
         * Iterate through all files except for the last file. We can't erase
         * the last file anyway, because it is in the recovery interval.
         */
        for (final Long file : allFiles) {

            /*
             * Try using cached file info from previous cycles. A file in a
             * previous cycle will normally have a creationTime that qualifies,
             * but we check again in case the erasure period has changed.
             */
            final FileInfo prevInfo = fileInfoCache.get(file);
            if (prevInfo != null) {
                if (prevInfo.creationTime <= fileAgeCutoff) {
                    filesRemaining.add(file);
                }
                continue;
            }

            final long creationTime = getFileCreationTime(file);

            throttle.throttle(1);

            if (creationTime > fileAgeCutoff) {
                continue;
            }

            filesRemaining.add(file);

            fileInfoCache.put(
                file, new FileInfo(creationTime, getFileLength(file)));
        }

        /*
         * Prevent the file cache from growing without bounds. The cache need
         * only contain entries for the files in filesRemaining.
         */
        fileInfoCache.navigableKeySet().retainAll(filesRemaining);

        /*
         * Use three times the sum of the file lengths as the work for the
         * remaining time in the cycle, as described in the class comments.
         */
        totalCycleWork = filesRemaining.stream()
            .mapToLong(file -> fileInfoCache.get(file).length)
            .sum() * 3;

        cycleThrottle = new WorkThrottle(totalCycleWork, endTime - startTime);
        logCycleStart();
    }

    /**
     * Creates a throttle for reading file info when a cycle is started or
     * restored.
     *
     * Use only a small portion of the cycle for collecting file info.
     * Use at most 0.1% of the cycle time and at most MAX_FILE_INFO_MS
     * per file. The idea is just to do a reasonable amount of throttling
     * between calls to getFileCreationTime.
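     *
     * For example, with illustrative numbers: for a 25 day cycle
     * (2,160,000,000 ms) and 500 files, workTime is
     * min(2,160,000 ms, 500 * 1,000 ms) = 500,000 ms, so the per-file
     * reads are spread over roughly 8.3 minutes.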
     */
    private WorkThrottle createFileInfoThrottle(final int files,
                                                final long localCycleMs) {
        final long workTime = Math.min(
            localCycleMs / 1000,
            files * MAX_FILE_INFO_MS);

        return new WorkThrottle(files, workTime);
    }

    /**
     * Returns the file creation time by opening the file and reading its
     * header entry.
     *
     * @return creation time, or Long.MAX_VALUE if the file does not exist.
     */
    private long getFileCreationTime(final long file) {
        try {
            return fileManager.getFileHeaderTimestamp(file).getTime();

        } catch (ChecksumException e) {
            throw new EnvironmentFailureException(
                envImpl, EnvironmentFailureReason.LOG_CHECKSUM,
                "Exception erasing file 0x" + Long.toHexString(file),
                e);

        } catch (FileNotFoundException e) {
            /* File was deleted by the cleaner. */
            return Long.MAX_VALUE;
        }
    }

    private long getFileLength(final long file) {
        return new File(fileManager.getFullFileName(file)).length();
    }

    /**
     * Return the next unprotected file to be processed.
     *
     * First return files that are unprotected, to defer VLSNIndex
     * truncation until it is necessary.
     *
     * If all files are protected, truncate the VLSNIndex and try again.
     * This is the same approach used by {@link Cleaner#manageDiskUsage}.
     *
     * A WARNING level is specified in the NoUnprotectedFilesException
     * when files are protected for reasons that are expected to be short
     * lived (backup, replication). The WARNING is actually logged only if
     * these reasons persist until the end of the erasure cycle.
     *
     * @return the next file to be processed, or null if all files have been
     * processed.
     *
     * @throws NoUnprotectedFilesException if all remaining files are
     * protected.
     */
    private Long getNextFile() {

        lastProtectedFilesMsg = null;
        lastProtectedFilesMsgLevel = null;

        if (filesRemaining.isEmpty()) {
            return null;
        }

        Long file = getNextUnprotectedFile();
        if (file != null) {
            return file;
        }

        if (!cleaner.isFileDeletionEnabled()) {
            throw new NoUnprotectedFilesException(
                "Test mode prohibits VLSNIndex truncation.",
                Level.INFO);
        }

        final long lastRecoveryFile = getFirstFileInRecoveryInterval();

        if (filesRemaining.first() >= lastRecoveryFile) {
            throw new NoUnprotectedFilesException(
                "All remaining files are in the recovery interval," +
                    " lastRecoveryFile=0x" +
                    Long.toHexString(lastRecoveryFile) + ".",
                Level.INFO);
        }

        /*
         * Determine whether it is worthwhile and possible to truncate the
         * VLSNIndex.
         */
        if (!envImpl.isReplicated()) {
            /* VLSNIndex does not exist in a non-replicated env. */
            throw new NoUnprotectedFilesException(
                "Protected files are not in the recovery interval.",
                Level.WARNING);
        }

        final long vlsnIndexStartFile = fileProtector.getVLSNIndexStartFile();
        if (vlsnIndexStartFile > filesRemaining.last()) {
            throw new NoUnprotectedFilesException(
                "Protected files are not in the recovery interval and not " +
                    "protected by the VLSNIndex vlsnIndexStartFile=0x" +
                    Long.toHexString(vlsnIndexStartFile) + ".",
                Level.WARNING);
        }

        final Pair<VLSN, Long> truncateInfo = getVLSNIndexTruncationInfo();

        if (truncateInfo == null) {
            /* This is a corner case and seems unlikely to ever happen. */
            throw new NoUnprotectedFilesException(
                "Cannot truncate VLSNIndex because no remaining files " +
                    "contain VLSNs.",
                Level.INFO);
        }

        /*
         * Truncate the VLSNIndex, then try again to get an unprotected file.
         */
        if (!envImpl.tryTruncateVlsnHead(
                truncateInfo.first(), truncateInfo.second())) {

            /*
             * There are several reasons for tryTruncateVlsnHead to return
             * false as explained by VLSNTracker.tryTruncateFromHead(VLSN,
             * long, LogItemCache) javadoc. Only one of them -- that the file
             * is protected -- should cause a WARNING message. To avoid false
             * alarms we log an INFO message here for now. TODO: Return more
             * info from tryTruncateVlsnHead so we can log a WARNING when
             * appropriate.
             */
            throw new NoUnprotectedFilesException(
                "VLSNIndex already truncated or cannot be truncated further.",
                Level.INFO);
        }

        file = getNextUnprotectedFile();
        if (file != null) {
            return file;
        }

        throw new NoUnprotectedFilesException(
            "Protected files are not in the recovery interval and " +
                "VLSNIndex was successfully truncated.",
            Level.WARNING);
    }

    /**
     * Returns a human readable description of what may be preventing erasure.
     * These are the reasons that {@link NoUnprotectedFilesException} may be
     * thrown by {@link #getNextFile()}.
     */
    private String getFileProtectionMessage() {

        final long firstRecoveryFile = getFirstFileInRecoveryInterval();

        return "Files protected by the current recovery interval: [" +
            FormatUtil.asHexString(
                filesRemaining.tailSet(firstRecoveryFile)) +
            "]. Other protected files: " +
            fileProtector.getProtectedFileMap(filesRemaining) +
            ". FirstRecoveryFile: 0x" +
            Long.toHexString(firstRecoveryFile) +
            ". FirstVLSNIndexFile: 0x" +
            Long.toHexString(fileProtector.getVLSNIndexStartFile()) +
            ".";
    }

    /**
     * Returns the first file remaining that is unprotected. If a file is
     * selected, the currentFile field is updated to support aborting.
     * This method synchronizes with {@link #abortErase} to ensure that a
     * file passed to abortErase will not be selected again for erasure.
     */
    private Long getNextUnprotectedFile() {
        synchronized (currentFileMutex) {

            final Long file =
                fileProtector.getFirstUnprotectedFile(filesRemaining);

            if (file == null) {
                return null;
            }

            if (file >= getFirstFileInRecoveryInterval()) {
                return null;
            }

            currentFile = file;
            abortCurrentFile = false;
            return file;
        }
    }

    /**
     * Called after a file returned by {@link #getNextUnprotectedFile()} is no
     * longer being processed.
     */
    private void clearCurrentFile() {
        synchronized (currentFileMutex) {
            currentFile = null;
            abortCurrentFile = false;
        }
    }

    /**
     * Returns the file being processed or null. A volatile field is accessed
     * without synchronization.
     */
    Long getCurrentFile() {
        return currentFile;
    }

    /**
     * Used to ensure that erasure of a file stops before copying that file
     * during a backup or network restore. A short wait may be performed to
     * ensure that the eraser thread notices the abort and any in-process
     * writes are completed.
     *
     * @param fileSet erasure of the current file is aborted if the current
     * file is protected by this protected file set. The given fileSet must
     * be protected at the time this method is called.
     *
     * @throws EraserAbortException if we can't abort erasure of a target
     * file within {@link EnvironmentParams#ERASE_ABORT_TIMEOUT}. The
     * timeout is long, so this should not happen unless the eraser thread
     * is wedged or starved.
     */
    public void abortErase(final FileProtector.ProtectedFileSet fileSet) {

        final Long abortFile;

        /*
         * Synchronize with getNextUnprotectedFile to ensure that a file
         * passed to abortErase will not be selected again for erasure.
         */
        synchronized (currentFileMutex) {

            /*
             * The check for isReservedFile is used to avoid aborting when we
             * will not erase the file. This is a minor optimization and is
             * not required for correctness.
             */
            if (currentFile != null &&
                fileSet.isProtected(currentFile, null) &&
                !fileProtector.isReservedFile(currentFile)) {

                abortCurrentFile = true;
                abortFile = currentFile;
            } else {
                return;
            }
        }

        /*
         * Wait for the eraser thread to notice that the abortCurrentFile flag
         * is set, at which time it will set currentFile to null. The wait is
         * necessary to ensure that a write will not be performed after this
         * method returns.
         *
         * It is important that we return ASAP, so we use Object.notify to
         * wake up any waiters. All other polling is done with Object.wait on
         * the pollMutex object, to ensure we can abort promptly.
         */
        if (PollCondition.await(1, abortTimeoutMs, () -> {
            synchronized (pollMutex) {
                pollMutex.notify();
            }
            synchronized (currentFileMutex) {
                return !abortFile.equals(currentFile);
            }
        })) {
            return; /* Aborted. */
        }

        /*
         * We don't expect this to ever occur, because the timeout is quite
         * long and all erasure operations are checking the abortCurrentFile
         * flag quite frequently. EraserAbortException is thrown (rather than
         * EnvironmentWedgedException) because we don't want to shut down the
         * environment when DbBackup.start fails.
         */
        final String msg =
            "Unable to abort erasure of file 0x" +
                Long.toHexString(abortFile) + " within " +
                abortTimeoutMs + "ms.";

        LoggerUtils.warning(logger, envImpl, msg);
        throw new EraserAbortException(msg);
    }

    /**
     * Returns the file number of the firstActiveLsn for the last completed
     * checkpoint. Files GTE this file would be in the recovery interval if
     * a crash occurs, and are protected from erasure.
     */
    private long getFirstFileInRecoveryInterval() {
        return DbLsn.getFileNumber(
            envImpl.getCheckpointer().getLastCheckpointFirstActiveLsn());
    }

    /**
     * Returns the {endVLSN,endFile} pair that is needed to truncate the
     * VLSNIndex.
     *
     * Returns null if no remaining files-to-erase contain VLSNs. A file that
     * does not contain VLSNs (is "barren") may contain BINs or migrated LNs
     * for data that needs erasure. However, this is expected to be a rare
     * condition so we tolerate it. It will be corrected during a subsequent
     * cycle if any file following this cycle's files need erasure and do
     * contain VLSNs, or the VLSNIndex is truncated for other reasons, for
     * example, to reclaim disk space.
     *
     * Selects the last file-to-erase containing a VLSN. This reduces the
     * number of (fairly expensive) VLSNIndex truncations to the minimum.
     * However, it means that the index will be truncated for a large number
     * of files at once, rather than truncating it incrementally.
     */
    private Pair<VLSN, Long> getVLSNIndexTruncationInfo() {

        for (final Long file : filesRemaining.descendingSet()) {

            checkContinue();

            /*
             * We avoid searching the file if it happens to be a reserved
             * file, since the last VLSN is known to the FileProtector.
             */
            VLSN lastVlsn = fileProtector.getReservedFileLastVLSN(file);

            if (lastVlsn != null) {
                if (lastVlsn.isNull()) {
                    continue;
                }
                return new Pair<>(lastVlsn, file);
            }

            /* Must search. */
            lastVlsn = searchFileForLastVLSN(file);

            if (lastVlsn != null) {
                if (lastVlsn.isNull()) {
                    continue;
                }
                return new Pair<>(lastVlsn, file);
            }
        }

        return null;
    }

    /**
     * Returns the last VLSN in the given file, or NULL_VLSN if the file does
     * not contain any VLSNs, or null if the file does not exist.
     *
     * Only VLSNs in replicated entries are considered. VLSNs in migrated LNs
     * are ignored.
     */
    private VLSN searchFileForLastVLSN(final Long file) {

        final FileInfo fileInfo = fileInfoCache.get(file);

        if (fileInfo != null && fileInfo.lastVlsn != null) {
            return fileInfo.lastVlsn;
        }

        final long fileLength =
            (fileInfo != null) ? fileInfo.length : getFileLength(file);

        boolean forward;
        long startLsn;
        long finishLsn;
        long endOfFileLsn;
        try {
            /*
             * If file+1 exists, read its header to get the offset of the
             * previous entry, and then read the file backwards to find the
             * last VLSN.
             */
            final long prevOffset =
                fileManager.getFileHeaderPrevOffset(file + 1);
            forward = false;
            startLsn = DbLsn.makeLsn(file, prevOffset);
            finishLsn = DbLsn.makeLsn(file, 0);
            endOfFileLsn = DbLsn.makeLsn(file, fileLength);

        } catch (FileNotFoundException e) {
            /*
             * If file+1 does not exist, read the entire file from the start.
             */
            forward = true;
            startLsn = DbLsn.makeLsn(file, 0);
            finishLsn = DbLsn.NULL_LSN;
            endOfFileLsn = DbLsn.NULL_LSN;

        } catch (ChecksumException e) {
            throw new EnvironmentFailureException(
                envImpl, EnvironmentFailureReason.LOG_CHECKSUM, e);
        }

        final FileReader reader = new FileReader(
            envImpl, cleaner.readBufferSize, forward, startLsn,
            file /*singleFileNumber*/, endOfFileLsn, finishLsn) {

            @Override
            protected boolean processEntry(ByteBuffer entryBuffer) {

                final int readOps = getAndResetNReads();
                reads.addAndGet(readOps);
                readBytes.addAndGet(readOps * cleaner.readBufferSize);

                cycleThrottle.throttle(currentEntryHeader.getEntrySize());
                checkContinue();

                /* Skip the data, no need to materialize the entry. */
                skipEntry(entryBuffer);

                /*
                 * Only consider replicated entries. Note that this includes
                 * Erased entries.
                 */
                return currentEntryHeader.getReplicated();
            }
        };

        VLSN lastVlsn = VLSN.NULL_VLSN;
        try {
            while (reader.readNextEntryAllowExceptions()) {

                lastVlsn = reader.getLastVlsn();

                if (lastVlsn == null || lastVlsn.isNull()) {
                    throw EnvironmentFailureException.unexpectedState(
                        "Replicated entries must have a VLSN.");
                }

                if (!forward) {
                    break;
                }
            }
        } catch (ChecksumException e) {
            throw new EnvironmentFailureException(
                envImpl, EnvironmentFailureReason.LOG_CHECKSUM, e);
        } catch (FileNotFoundException e) {
            return null;
        }

        if (fileInfo != null) {
            fileInfo.lastVlsn = lastVlsn;
        }

        return lastVlsn;
    }

    /**
     * Erases the targeted entries in the given file and makes the changes
     * persistent using fsync.
     */
    private void eraseFile(final Long file) {

        /*
         * If a reserved file can be deleted, this is cheaper than erasure.
         * Use 25% of the file size for throttling.
         *
         * If cleaner.deleteReservedFile returns false then we proceed with
         * erasure. This is safe because the VLSNIndex has been truncated.
         */
        final long fileLength = fileInfoCache.get(file).length;

        if (fileProtector.isReservedFile(file) &&
            cleaner.deleteReservedFile(file, "ERASER")) {

            filesDeleted.incrementAndGet();
            cycleThrottle.throttle((fileLength * 25) / 100);
            return;
        }

        final EraserReader reader = new EraserReader(file);
        final DbCache dbCache = new DbCache(envImpl, cleaner);
        final ExtinctionScanner extinctionScanner =
            envImpl.getExtinctionScanner();
        int entriesToErase = 0;
        long firstWriteTime = 0;
        boolean completed = false;

        final PackedOffsets obsoleteOffsets =
            envImpl.getUtilizationProfile().getObsoleteDetailPacked(
                file, false /*logUpdate*/, this::checkContinue);
        final PackedOffsets.Iterator obsoleteIter = obsoleteOffsets.iterator();
        long nextObsolete = -1;

        final String fullFileName = fileManager.getFullFileName(file);
        RandomAccessFile raf = null;

        try {
            /* Clear checksum saved by network restore. */
            envImpl.clearedCachedFileChecksum(
                new File(fullFileName).getName());

            while (reader.readNextEntryAllowExceptions()) {

                dbCache.clearCachePeriodically();

                final long logLsn = reader.getLastLsn();
                final long fileOffset = DbLsn.getFileOffset(logLsn);

                while (nextObsolete < fileOffset && obsoleteIter.hasNext()) {
                    nextObsolete = obsoleteIter.next();
                }
                final boolean isKnownObsolete = (nextObsolete == fileOffset);

                boolean doErase = false;

                if (reader.isErased) {
                    final ErasedLogEntry erasedEntry =
                        (ErasedLogEntry) reader.logEntry;

                    if (!erasedEntry.isAllZeros()) {
                        doErase = true;
                    }
                } else {
                    final DatabaseId dbId = reader.logEntry.getDbId();
                    DbCache.DbInfo dbInfo = dbCache.getDbInfo(dbId);

                    if (dbInfo.deleted) {
                        /*
                         * All LNs and BINs in deleted DBs can be erased. But
                         * when eraseDeletedDbs is false, if the DB is deleted
                         * then we cannot erase its extinct records either,
                         * because we don't have the DB name and dups status.
                         *
                         * If the DB is being deleted (DbInfo.deleting is
                         * true) then we cannot erase its active INs because
                         * they may not have been counted obsolete yet.
                         * However, it is OK to erase its LNs and obsolete
                         * INs, and we do that further below.
                         */
                        if (eraseDeletedDbs) {
                            doErase = true;
                        }
                    } else if (reader.isLN && eraseExtinctRecords) {
                        /*
                         * All extinct LNs (in non-deleted DBs) can be erased.
                         */
                        final LNLogEntry<?> lnEntry =
                            (LNLogEntry<?>) reader.logEntry;

                        lnEntry.postFetchInit(dbInfo.dups);

                        if (envImpl.getExtinctionState(
                                dbInfo.name, dbInfo.dups, dbInfo.internal,
                                lnEntry.getKey()) == EXTINCT) {
                            doErase = true;
                        }
                    } else if (reader.isBIN && eraseExtinctRecords) {
                        /*
                         * A BIN with an extinct slot can be erased only if
                         * the BIN is obsolete/dead.
                         */
                        final BIN bin = (BIN) reader.logEntry.getMainItem();

                        for (int i = 0; i < bin.getNEntries(); i += 1) {

                            checkContinue();

                            final byte[] key = bin.getKey(i);

                            final ExtinctionFilter.ExtinctionStatus status =
                                envImpl.getExtinctionState(
                                    dbInfo.name, dbInfo.dups, dbInfo.internal,
                                    key);

                            if (status == MAYBE_EXTINCT) {
                                /*
                                 * This is a crude way to restart and give
                                 * the app a chance to initialize its
                                 * metadata.
                                 */
                                throw new AbortCurrentFileException(file);
                            }

                            if (status != EXTINCT) {
                                continue;
                            }

                            if (isKnownObsolete) {
                                doErase = true;
                                break;
                            }

                            /*
                             * Defer expensive getDbImpl and Btree lookup
                             * (isBINDead call) until we encounter an extinct
                             * slot in a BIN that is not known-obsolete. We
                             * must do a Btree lookup because the known-
                             * obsolete metadata is not 100% complete.
                             */
                            dbInfo = dbCache.getDbImpl(dbId);

                            /*
                             * If we need to do a Btree lookup and dbImpl is
                             * null, then the DB was deleted since the last
                             * DB lookup and erasure must wait until the next
                             * cycle.
                             *
                             * If the key for this DB is part of an active
                             * extinction, then the BIN cannot be persistently
                             * obsolete and we must wait for the task to
                             * finish before erasing the BIN.
                             *
                             * In addition, as an extra precaution, do not
                             * erase the BIN if isBINDead returns false, which
                             * means the BIN is still active in the in-memory
                             * Btree.
                             */
                            if (dbInfo.dbImpl != null &&
                                !extinctionScanner.
                                    isRecordExtinctionIncomplete(
                                        dbInfo.dbImpl, key) &&
                                isBINDead(dbInfo.dbImpl, bin, logLsn)) {

                                /* BIN is dead/obsolete. */
                                doErase = true;
                            }

                            /*
                             * If the BIN is not dead, we cannot erase it.
                             * This is the edge case where a record extinction
                             * has started (a slot is extinct) but not yet
                             * complete (the BIN is not obsolete yet).
                             */
                            break;
                        }
                    }
                }

                if (doErase) {
                    if (raf == null) {
                        firstWriteTime = System.currentTimeMillis();
                        raf = fileManager.openFileReadWrite(fullFileName);
                        touchAndFsync(raf, fileLength);
                    }

                    if (!reader.isErased) {
                        writeErasedType(raf, fileOffset);
                    }

                    entriesToErase = addEraseEntry(
                        entriesToErase,
                        fileOffset + reader.header.getSize(),
                        reader.header.getItemSize());
                }
            }

            /* Call releaseDbImpls before eraseEntries, which throttles. */
            dbCache.releaseDbImpls();

            eraseEntries(raf, fileLength, entriesToErase);
            filesErased.incrementAndGet();
            completed = true;

        } catch (ChecksumException e) {
            throw new EnvironmentFailureException(
                envImpl, EnvironmentFailureReason.LOG_CHECKSUM,
                "Exception erasing file 0x" + Long.toHexString(file), e);

        } catch (FileNotFoundException e) {
            /* File was deleted by the cleaner. */
            filesAlreadyDeleted.incrementAndGet();

        } catch (IOException e) {
            throw new EnvironmentFailureException(
                envImpl, EnvironmentFailureReason.LOG_WRITE,
                "Exception erasing file 0x" + Long.toHexString(file), e);

        } finally {
            if (firstWriteTime != 0) {
                LoggerUtils.info(
                    logger, envImpl,
                    "ERASER attempted to erase " + entriesToErase +
                        " entries in file 0x" + Long.toHexString(file) +
                        ", first write at " + formatTime(firstWriteTime) +
                        ", erasure is " +
                        (completed ? "complete" : "incomplete") + ".");
            }

            dbCache.releaseDbImpls();

            if (raf != null) {
                try {
                    raf.close();
                } catch (IOException e) {
                    LoggerUtils.warning(
                        logger, envImpl,
                        "DataEraser.eraseFile exception when closing " +
                            "file 0x" + Long.toHexString(file) + ": " + e);
                }
            }
        }
    }

    /**
     * File reader that materializes only user-database LNs, BINs, and
     * BIN-deltas.
     */
    private class EraserReader extends FileReader {

        LogEntryHeader header;
        LogEntryType entryType;
        LogEntry logEntry;
        boolean isLN;
        boolean isBIN;
        boolean isErased;

        EraserReader(final Long file) {
            super(DataEraser.this.envImpl, cleaner.readBufferSize,
                true /*forward*/, DbLsn.makeLsn(file, 0) /*startLsn*/,
                file /*singleFileNumber*/, DbLsn.NULL_LSN /*endOfFileLsn*/,
                DbLsn.NULL_LSN /*finishLsn*/);
        }

        @Override
        protected boolean processEntry(final ByteBuffer entryBuffer) {

            final int readOps = getAndResetNReads();
            reads.addAndGet(readOps);
            readBytes.addAndGet(readOps * cleaner.readBufferSize);

            cycleThrottle.throttle(currentEntryHeader.getEntrySize());
            checkContinue();

            header = currentEntryHeader;
            isLN = false;
            isBIN = false;
            isErased = false;
            logEntry = null;

            entryType = LogEntryType.findType(header.getType());
            assert entryType != null;

            if (entryType.isUserLNType()) {
                isLN = true;
            } else if (entryType.equals(LogEntryType.LOG_BIN) ||
                entryType.equals(LogEntryType.LOG_BIN_DELTA)) {
                isBIN = true;
            } else if (entryType.equals(LogEntryType.LOG_ERASED)) {
                isErased = true;
            } else {
                skipEntry(entryBuffer);
                return false;
            }

            logEntry = entryType.getNewLogEntry();
            logEntry.readEntry(envImpl, header, entryBuffer);
            return true;
        }
    }

    /**
     * Returns whether the given BIN or BIN-delta is absent from the Btree.
     * This only checks the in-memory cache, so returning true does not
     * guarantee that the BIN is persistently obsolete. Therefore this method
     * is only useful to provide added safety.
     */
    private boolean isBINDead(final DatabaseImpl db,
                              final BIN binFromLog,
                              final long logLsn) {

        binFromLog.latchNoUpdateLRU(db);

        final SearchResult result = db.getTree().getParentINForChildIN(
            binFromLog, true /*useTargetLevel*/,
            true /*doFetch*/, CacheMode.UNCHANGED);

        if (result.parent == null) {
            /*
             * Btree is completely empty (no root IN). Since we never reduce
             * the number of levels in the Btree, this should only occur if
             * the DB was truncated.
             */
            return true;
        }

        try {
            /*
             * For a parent search with useTargetLevel=true, a non-null parent
             * implies that a match was found.
             */
            assert result.exactParentFound;

            /*
             * Check whether the BIN is active by comparing the active Btree
             * LSN to the LSN of the BIN we read from the log.
             */
            final long treeLsn = result.parent.getLsn(result.index);

            if (binFromLog.isBINDelta(false)) {
                /*
                 * A BIN-delta read from the log is obsolete if its LSN is
                 * not equal to the tree LSN.
                 */
                return logLsn != treeLsn;
            }

            /*
             * A full BIN read from the log is active if its LSN is equal to
             * the tree LSN.
             */
            if (logLsn == treeLsn) {
                return false;
            }

            /*
             * If the tree and log LSNs are unequal, then we must get the full
             * version LSN in case the tree LSN is actually for a BIN-delta.
             * The only way to do that is to fetch the IN in the tree.
             */
            final BIN treeBin = (BIN) result.parent.fetchIN(
                result.index, CacheMode.UNCHANGED);

            return logLsn != treeBin.getLastFullLsn();

        } finally {
            result.parent.releaseLatch();
        }
    }

    /**
     * Adds the offset/size of one entry to be erased.
     */
    private int addEraseEntry(final int n, final long offset, final int size) {

        if (n == eraseOffsets.length) {
            /* Enlarge arrays. */
            final long[] newOffsets = new long[n * 2];
            System.arraycopy(eraseOffsets, 0, newOffsets, 0, n);
            eraseOffsets = newOffsets;

            final int[] newSizes = new int[newOffsets.length];
            System.arraycopy(eraseSizes, 0, newSizes, 0, n);
            eraseSizes = newSizes;
        }

        eraseOffsets[n] = offset;
        eraseSizes[n] = size;
        return n + 1;
    }

    /**
     * Ensures that the file's lastModifiedTime is persistently updated,
     * before performing any modifications to the file.
    /**
     * Ensures that the file's lastModifiedTime is persistently updated,
     * before performing any modifications to the file. This allows
     * applications to detect erasure by checking for a change to
     * lastModifiedTime across erasure cycles, without the worry that a
     * crash during erasure may have prevented the file system from updating
     * the lastModifiedTime.
     */
    private void touchAndFsync(final RandomAccessFile file,
                               final long fileLength)
        throws IOException {

        checkContinue();
        file.seek(0);
        final byte b = file.readByte();
        file.seek(0);
        file.writeByte(b);
        writes.incrementAndGet();
        writeBytes.incrementAndGet();

        file.getChannel().force(false);
        fSyncs.incrementAndGet();
        cycleThrottle.throttle((fileLength * FSYNC1_WORK_PCT) / 100);

        // TODO update FileManager stats?
    }

    /**
     * Changes the type of the entry on disk to the LOG_ERASED type.
     */
    private void writeErasedType(final RandomAccessFile file,
                                 final long headerOffset)
        throws IOException {

        file.seek(headerOffset + LogEntryHeader.ENTRYTYPE_OFFSET);
        checkContinue();
        file.writeByte(LogEntryType.LOG_ERASED.getTypeNum());
        writes.incrementAndGet();
        writeBytes.incrementAndGet();

        // TODO update FileManager stats?
    }

    /**
     * Writes zeros in the entries in the erase list and fsyncs to persist
     * the changes.
     */
    private void eraseEntries(final RandomAccessFile raf,
                              final long fileLength,
                              final int entries)
        throws IOException {

        if (entries == 0) {
            return;
        }

        final long fsync1Work = (fileLength * FSYNC1_WORK_PCT) / 100;
        final long fsync2Work = (fileLength * FSYNC2_WORK_PCT) / 100;
        final long fsync3Work = (fileLength * FSYNC3_WORK_PCT) / 100;
        final long writeWork = ((fileLength * WRITE_WORK_PCT) / 100) / entries;

        /* Don't neglect work lost due to int truncation. */
        final long extraWork = fileLength -
            ((writeWork * entries) + fsync1Work + fsync2Work + fsync3Work);

        /*
         * Make type changes persistent before writing zeros. This is the only
         * way to be sure that an entry's type is changed before it is zero
         * filled, since write ordering is not guaranteed.
         */
        checkContinue();
        raf.getChannel().force(false);
        fSyncs.incrementAndGet();
        cycleThrottle.throttle(fsync2Work);

        for (int i = 0; i < entries; i += 1) {
            final long offset = eraseOffsets[i];
            final int size = eraseSizes[i];

            if (zeros.length < size) {
                zeros = new byte[size * 2];
            }

            raf.seek(offset);
            checkContinue();
            raf.write(zeros, 0, size);
            writes.incrementAndGet();
            writeBytes.addAndGet(size);
            cycleThrottle.throttle(writeWork);
        }

        /* Make complete erasure of this file persistent. */
        checkContinue();
        raf.getChannel().force(false);
        fSyncs.incrementAndGet();
        cycleThrottle.throttle(fsync3Work + extraWork);

        // TODO update FileManager stats?
    }
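
    /*
     * Illustrative sketch (not part of the JE source): how the throttling
     * budget in eraseEntries above divides a file's length into units of
     * "work". The percentages here are hypothetical stand-ins for the
     * FSYNC1/2/3_WORK_PCT and WRITE_WORK_PCT constants. E.g. with 5/5/5/85
     * and a 10 MB file containing 1000 entries, each fsync accounts for
     * 524288 units, each zero-fill write for 8912 units, and the remainder
     * lost to long division (896 units) is repaid after the final fsync.
     */
    private static long[] sketchWorkBudget(final long fileLength,
                                           final int entries) {
        final int fsyncPct = 5;  /* hypothetical *_WORK_PCT values */
        final int writePct = 85;

        final long fsyncWork = (fileLength * fsyncPct) / 100;
        final long writeWork = ((fileLength * writePct) / 100) / entries;
        final long extraWork =
            fileLength - ((writeWork * entries) + 3 * fsyncWork);

        return new long[] {fsyncWork, writeWork, extraWork};
    }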
    /**
     * Returns whether the log entry at the given LSN has been erased.
     *
     * Is not particularly efficient (it opens and closes the file each time
     * it is called) and is meant for exceptional circumstances such as when a
     * checksum verification fails.
     */
    public boolean isEntryErased(final long lsn) {

        final long file = DbLsn.getFileNumber(lsn);
        final long offset = DbLsn.getFileOffset(lsn);

        /*
         * Can't use FileManager.getFileHandle here because we may be called
         * with the FileHandle already latched and latching is not reentrant.
         */
        RandomAccessFile fileHandle = null;
        try {
            fileHandle = new RandomAccessFile(
                fileManager.getFullFileName(file),
                FileManager.FileMode.READ_MODE.getModeValue());

            fileHandle.seek(offset + LogEntryHeader.ENTRYTYPE_OFFSET);

            return fileHandle.readByte() ==
                LogEntryType.LOG_ERASED.getTypeNum();

        } catch (FileNotFoundException|EOFException e) {
            return false;
        } catch (IOException e) {
            throw new EnvironmentFailureException(
                envImpl, EnvironmentFailureReason.LOG_WRITE,
                "Exception checking erasure file 0x" +
                    Long.toHexString(file), e);
        } finally {
            if (fileHandle != null) {
                try {
                    fileHandle.close();
                } catch (IOException e) {
                    LoggerUtils.warning(
                        logger, envImpl,
                        "DataEraser.isEntryErased exception when closing " +
                            "file 0x" + Long.toHexString(file) + ": " + e);
                }
            }
        }
    }

    /**
     * Utility to help spread a given amount of totalWork over a given
     * durationMs.
     */
    private class WorkThrottle {

        private final float msPerUnitOfWork;
        private final long startTime;
        private long workDone;

        WorkThrottle(final long totalWork, final long durationMs) {
            msPerUnitOfWork = ((float) durationMs) / totalWork;
            startTime = System.currentTimeMillis();
            workDone = 0;
        }

        /**
         * Waits until the ratio of elapsed time to durationMs is GTE the
         * ratio of workDone to totalWork. Elapsed time is time since the
         * constructor was called.
         *
         * If the computed wait time is less than MIN_WORK_DELAY_MS, no wait
         * is performed. This is meant to prevent large numbers of waits for
         * relatively small amounts of work done. Note that this method is
         * called frequently to add very small amounts of work.
         */
        void throttle(final long addWork) {

            workDone += addWork;

            final long workDoneMs = (long) (workDone * msPerUnitOfWork);

            final long delayMs =
                workDoneMs - (System.currentTimeMillis() - startTime);

            if (delayMs >= MIN_WORK_DELAY_MS) {
                final long checkMs = Math.min(delayMs, pollCheckMs);

                PollCondition.await(checkMs, delayMs, pollMutex, () -> {
                    checkContinue();
                    return false;
                });
            }
        }
    }

    private void checkShutdown() {
        if (shutdownRequested || !envImpl.isValid()) {
            throw new ShutdownRequestedException();
        }
    }

    private void checkContinue() {

        checkShutdown();

        if (!enabled) {
            throw new ErasureDisabledException();
        }

        if (endTime - startTime != cycleMs) {
            throw new PeriodChangedException();
        }

        synchronized (currentFileMutex) {
            if (abortCurrentFile) {
                final long file = currentFile;
                currentFile = null;
                abortCurrentFile = false;
                throw new AbortCurrentFileException(file);
            }
        }
    }

    private boolean checkForCycleEnd() {

        if (System.currentTimeMillis() >= endTime) {

            if (filesRemaining.isEmpty()) {
                logCycleComplete();
            } else {
                logCycleIncomplete();
            }

            return true;
        }

        return false;
    }

    private static class NoUnprotectedFilesException extends RuntimeException {

        final Level logLevel;

        NoUnprotectedFilesException(final String msg, final Level logLevel) {
            super(msg);
            this.logLevel = logLevel;
        }
    }

    private static class ErasureDisabledException extends RuntimeException {
    }

    private static class PeriodChangedException extends RuntimeException {
    }

    private static class ShutdownRequestedException extends RuntimeException {
    }

    private static class AbortCurrentFileException extends RuntimeException {

        final long file;

        AbortCurrentFileException(final long file) {
            this.file = file;
        }
    }
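
    /*
     * Illustrative sketch (not part of the JE source): how WorkThrottle
     * above paces a task. With totalWork=1000 and durationMs=10000,
     * msPerUnitOfWork is 10, so after reporting 100 units the caller is
     * held back until at least 1000 ms have elapsed since construction.
     */
    private void sketchThrottleUsage() {
        final WorkThrottle throttle = new WorkThrottle(1000, 10_000);

        for (int i = 0; i < 10; i += 1) {
            /* ...perform 100 units of real work here... */
            throttle.throttle(100); /* waits if we are ahead of schedule */
        }
    }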
    /**
     * If enabled has been set to false, wait for it to be set to true again.
     *
     * Throws ShutdownRequestedException but no other internal exceptions.
     * Calling checkContinue is unnecessary because currentFile is null.
     */
    private void waitForEnabled() {

        while (true) {
            if (PollCondition.await(
                pollCheckMs, FOREVER_TIMEOUT_MS, pollMutex,
                () -> {
                    checkShutdown();
                    return enabled;
                })) {
                return;
            }
        }
    }

    /**
     * Wait for the current time to pass the end time.
     */
    private void waitForCycleEnd() {

        completionTime = System.currentTimeMillis();
        logCycleComplete();

        PollCondition.await(
            pollCheckMs, endTime - completionTime, pollMutex,
            () -> {
                checkContinue();
                return false;
            });
    }

    /**
     * Sleep for a time and dream about things that cause files to become
     * unprotected. When we wake up, our dreams may have come true.
     *
     * Throws ShutdownRequestedException but no other internal exceptions.
     * Calling checkContinue is unnecessary because currentFile is null.
     */
    private void waitForUnprotectedFiles(final NoUnprotectedFilesException e) {

        lastProtectedFilesMsg = e.getMessage();
        lastProtectedFilesMsgLevel = e.logLevel;

        PollCondition.await(
            pollCheckMs, NO_UNPROTECTED_FILES_DELAY_MS, pollMutex,
            () -> {
                checkShutdown();
                return false;
            });
    }

    private void storeState() {

        final TupleOutput out = new TupleOutput();
        out.writeLong(startTime);
        out.writeLong(endTime);
        out.writePackedLong(totalCycleWork);
        storeFileSet(out, filesCompleted);
        storeFileSet(out, filesRemaining);

        final DatabaseEntry data = new DatabaseEntry();
        TupleBase.outputToEntry(out, data);

        envImpl.getMetadataStore().put(MetadataStore.KEY_ERASER, data);
    }

    private boolean loadState() {

        final DatabaseEntry data = new DatabaseEntry();

        if (envImpl.getMetadataStore().get(
            MetadataStore.KEY_ERASER, data) == null) {
            return false;
        }

        final TupleInput in = TupleBase.entryToInput(data);
        startTime = in.readLong();
        endTime = in.readLong();
        totalCycleWork = in.readPackedLong();
        filesCompleted = loadFileSet(in);
        filesRemaining = loadFileSet(in);
        return true;
    }

    private void storeFileSet(final TupleOutput out,
                              final NavigableSet<Long> set) {

        out.writePackedInt(set.size());
        long priorFile = 0;

        for (final long file : set) {
            out.writePackedLong(file - priorFile);
            priorFile = file;
        }
    }

    private NavigableSet<Long> loadFileSet(final TupleInput in) {

        final int size = in.readPackedInt();

        final NavigableSet<Long> set =
            Collections.synchronizedNavigableSet(new TreeSet<>());

        long file = 0;

        for (int i = 0; i < size; i += 1) {
            file += in.readPackedLong();
            set.add(file);
        }

        return set;
    }
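
    /*
     * Illustrative sketch (not part of the JE source): storeFileSet and
     * loadFileSet above delta-encode the sorted file numbers so that each
     * packed long stays small. For example, {0x100, 0x101, 0x105} is written
     * as the size 3 followed by the deltas 0x100, 1 and 4, and reading it
     * back reverses the running sum.
     */
    private NavigableSet<Long> sketchFileSetRoundTrip() {

        final NavigableSet<Long> files =
            Collections.synchronizedNavigableSet(new TreeSet<>());
        files.add(0x100L);
        files.add(0x101L);
        files.add(0x105L);

        final TupleOutput out = new TupleOutput();
        storeFileSet(out, files); /* writes size=3, deltas 0x100, 1, 4 */

        final DatabaseEntry entry = new DatabaseEntry();
        TupleBase.outputToEntry(out, entry);

        /* Returns a set equal to the original. */
        return loadFileSet(TupleBase.entryToInput(entry));
    }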
    /** Cached info about a file to be erased. */
    private static class FileInfo {
        final long creationTime;
        final long length;
        VLSN lastVlsn;

        FileInfo(final long creationTime, final long length) {
            this.creationTime = creationTime;
            this.length = length;
        }
    }

    private void logCycleInit(final int filesToExamine) {
        LoggerUtils.info(logger, envImpl,
            "ERASER initializing new cycle. Total files: " + filesToExamine);
        callTestEventHook(TestEvent.Type.INIT);
    }

    private void logCycleStart() {
        LoggerUtils.info(logger, envImpl,
            "ERASER new cycle started. " + getCycleStatus());
        callTestEventHook(TestEvent.Type.START);
    }

    private void logCycleComplete() {
        LoggerUtils.info(logger, envImpl,
            "ERASER cycle completed. " + getCycleStatus());
        callTestEventHook(TestEvent.Type.COMPLETE);
    }

    private void logCycleIncomplete() {

        final String msg = "ERASER unable to erase files " +
            "within the erasure period. " +
            ((lastProtectedFilesMsg != null) ?
                lastProtectedFilesMsg :
                "File protection did not prevent erasure, so probably " +
                    "just ran out of time.") +
            " " + getFileProtectionMessage() +
            " " + getCycleStatus();

        LoggerUtils.logMsg(
            logger, envImpl,
            (lastProtectedFilesMsgLevel != null) ?
                lastProtectedFilesMsgLevel : Level.INFO,
            msg);

        callTestEventHook(TestEvent.Type.INCOMPLETE);
    }

    private void logCycleResume() {
        LoggerUtils.info(logger, envImpl,
            "ERASER previously incomplete cycle resumed at startup. " +
                getCycleStatus());
        callTestEventHook(TestEvent.Type.RESUME);
    }

    private void logCycleCannotResume() {
        LoggerUtils.warning(logger, envImpl,
            "ERASER previously incomplete cycle not resumed at startup" +
                " because end time has passed. " + getCycleStatus());
        callTestEventHook(TestEvent.Type.CANNOT_RESUME);
    }

    private void logCycleSuspend() {
        LoggerUtils.info(logger, envImpl,
            "ERASER incomplete cycle suspended at shutdown. " +
                getFileProtectionMessage() + " " + getCycleStatus());
        callTestEventHook(TestEvent.Type.SUSPEND);
    }

    private void logPeriodChanged() {
        LoggerUtils.info(logger, envImpl,
            "ERASER period param was changed, current cycle aborted. " +
                getCycleStatus());
        callTestEventHook(TestEvent.Type.PERIOD_CHANGED);
    }

    private String getCycleStatus() {
        return "Cycle start: " + formatTime(startTime) +
            ", end: " + formatTime(endTime) +
            ", filesCompleted: [" + FormatUtil.asHexString(filesCompleted) +
            "], filesRemaining: [" + FormatUtil.asHexString(filesRemaining) +
            "].";
    }

    private static String formatTime(final long time) {
        return DATE_FORMAT.format(new Date(time));
    }

    private void callTestEventHook(final TestEvent.Type type) {

        if (testEventHook == null) {
            return;
        }

        testEventHook.doHook(new TestEvent(type, this));
    }

    static void setTestEventHook(final TestHook<TestEvent> hook) {
        testEventHook = hook;
    }

    static class TestEvent {

        enum Type {
            INIT,
            START,
            COMPLETE,
            INCOMPLETE,
            RESUME,
            CANNOT_RESUME,
            SUSPEND,
            PERIOD_CHANGED,
        }

        final Type type;
        final long startTime;
        final long endTime;
        final long completionTime;
        final NavigableSet<Long> filesCompleted;
        final NavigableSet<Long> filesRemaining;

        TestEvent(final Type type, final DataEraser eraser) {
            this.type = type;
            startTime = eraser.startTime;
            endTime = eraser.endTime;
            completionTime = eraser.completionTime;
            filesCompleted = new TreeSet<>(eraser.filesCompleted);
            filesRemaining = new TreeSet<>(eraser.filesRemaining);
        }

        @Override
        public String toString() {
            return type.toString() +
                " startTime=" + formatTime(startTime) +
                " endTime=" + formatTime(endTime) +
                " completionTime=" + formatTime(completionTime) +
                " filesCompleted=[" +
                FormatUtil.asHexString(filesCompleted) +
                "] filesRemaining=[" +
                FormatUtil.asHexString(filesRemaining) + "]";
        }
    }
}



