com.sleepycat.je.dbi.DiskOrderedScanner Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of starrocks-bdb-je Show documentation
starrocks managed bdb je
The newest version!
/*-
 * Copyright (C) 2002, 2018, Oracle and/or its affiliates. All rights reserved.
 *
 * This file was distributed by Oracle as part of a version of Oracle Berkeley
 * DB Java Edition made available at:
 *
 * http://www.oracle.com/technetwork/database/database-technologies/berkeleydb/downloads/index.html
 *
 * Please see the LICENSE file included in the top-level directory of the
 * appropriate version of Oracle Berkeley DB Java Edition for a copy of the
 * license and additional information.
 */

package com.sleepycat.je.dbi;

import java.lang.ref.WeakReference;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Map;

import com.sleepycat.je.CacheMode;
import com.sleepycat.je.DatabaseEntry;
import com.sleepycat.je.EnvironmentFailureException;
import com.sleepycat.je.ThreadInterruptedException;
import com.sleepycat.je.cleaner.FileProtector;
import com.sleepycat.je.evictor.Evictor;
import com.sleepycat.je.evictor.OffHeapCache;
import com.sleepycat.je.log.LogEntryType;
import com.sleepycat.je.log.LogManager;
import com.sleepycat.je.log.entry.LNLogEntry;
import com.sleepycat.je.log.entry.LogEntry;
import com.sleepycat.je.tree.BIN;
import com.sleepycat.je.tree.IN;
import com.sleepycat.je.tree.Key;
import com.sleepycat.je.tree.LN;
import com.sleepycat.je.tree.OldBINDelta;
import com.sleepycat.je.tree.SearchResult;
import com.sleepycat.je.tree.Tree;
import com.sleepycat.je.utilint.DbLsn;
import com.sleepycat.je.utilint.TestHook;
import com.sleepycat.je.utilint.TestHookExecute;

/**
 * Provides an enumeration of all key/data pairs in a database, striving to
 * fetch in disk order.
 *
 * Unlike SortedLSNTreeWalker, for which the primary use case is preload, this
 * class notifies the callback while holding a latch only if that can be done
 * without blocking (e.g., when the callback can buffer the data without
 * blocking).  In other words, while holding a latch, the callback will not be
 * notified if it might block.  This is appropriate for the DOS
 * (DiskOrderedCursor) use case, since the callback will block if the DOS queue
 * is full, and the user's consumer thread may not empty the queue as quickly
 * as it can be filled by the producer thread.  If the callback were allowed to
 * block while a latch is held, this would block other threads attempting to
 * access the database, including JE internal threads, which would have a very
 * detrimental impact.
 *
 * Algorithm
 * =========
 *
 * Terminology
 * -----------
 * callback: object implementing the RecordProcessor interface
 * process: invoking the callback with a key-data pair
 * iteration: top level iteration consisting of phase I and II
 * phase I: accumulate LSNs
 * phase II: sort, fetch and process LSNs
 *
 * Phase I and II
 * --------------
 * To avoid processing resident nodes (invoking the callback with a latch
 * held), a non-recursive algorithm is used.  Instead of recursively
 * accumulating LSNs in a depth-first iteration of the tree (like the
 * SortedLSNTreeWalker algorithm), level 2 INs are traversed in phase I and
 * LSNs are accumulated for LNs or BINs (more on this below).  When the memory
 * or LSN batch size limit is exceeded, phase I ends and all tree latches are
 * released.  During phase II the previously accumulated LSNs are fetched and
 * the callback is invoked for each key or key-data pair.  Since no latches are
 * held, it is permissible for the callback to block.
 *
 * One iteration of phase I and II processes some subset of the database.
 * Since INs are traversed in tree order in phase I, this subset is described
 * by a range of keys.  When performing the next iteration, the IN traversal is
 * restarted at the highest key that was processed by the previous iteration.
 * The previous highest key is used to avoid duplication of entries, since some
 * overlap between iterations may occur.
 *
 * LN and BIN modes
 * ----------------
 * As mentioned above, we accumulate LSNs for either LNs or BINs.  The BIN
 * accumulation mode provides an optimization for key-only traversals and for
 * all traversals of duplicate DBs (in a dup DB, the data is included in the
 * key).  In these cases we never need to fetch the LN, so we can sort and
 * fetch the BIN LSNs instead.  This supports at least some types of traversals
 * that are efficient when all BINs are not in the JE cache.
 *
 * We must only accumulate LN or BIN LSNs, never both, and never the LSNs of
 * other INs (above level 1).  If we broke this rule, there would be no way to
 * constrain memory usage in our non-recursive approach, since we could not
 * easily predict in advance how much memory would be needed to fetch the
 * nested nodes.  Even if we were able to predict the memory needed, it would be
 * of little advantage to sort and fetch a small number of higher level nodes,
 * only to accumulate the LSNs of their descendants (which are much larger in
 * number).  The much smaller number of higher level nodes would likely be
 * fetched via random IO anyway in a large data set.
 *
 * The above justification also applies to the algorithm we use in LN mode, in
 * which we accumulate and fetch only LN LSNs.  In this mode we always fetch
 * BINs explicitly (not in LSN sorted order), if they are not resident, for the
 * reasons stated above.
 *
 * Furthermore, in BIN mode we must account for BIN-deltas.  Phase I must keep
 * a copy of any BIN-deltas encountered in the cache.  And phase II must make
 * two passes for the accumulated LSNs: one pass to load the deltas and another
 * to load the full BINs and merge the previously loaded deltas.  Unfortunately
 * we must budget memory for the deltas during phase I; since most BIN LSNs are
 * for deltas, not full BINs, we assume that we will need to temporarily save a
 * delta for each LSN.  This two pass approach is not like the recursive
 * algorithm we rejected above, however, in two respects: 1) we know in advance
 * (roughly anyway) how much memory we will need for both passes, and 2) the
 * number of LSNs fetched in each pass is roughly the same.
 *
 * Data Lag
 * --------
 * In phase I, as an exception to what was said above, we sometimes process
 * nodes that are resident in the Btree (in the JE cache) if this is possible
 * without blocking.  The primary intention of this is to provide more recent
 * data to the callback.  When accumulating BINs, if the BIN is dirty then
 * fetching its LSN later means that some recently written LNs will not be
 * included.  Therefore, if the callback would not block, we process the keys
 * in a dirty BIN during phase I.  Likewise, when accumulating LNs in a
 * deferred-write database, we process dirty LNs if the callback would not
 * block.  When accumulating LN LSNs for a non-deferred-write database, we can
 * go further and process all resident LNs, as long as the callback would not
 * block, since we know that no LNs are dirty.
 *
 * In spite of our attempt to process resident nodes, we may not be able to
 * process all of them if doing so would cause the callback to block.  When we
 * can't process a dirty, resident node, the information added (new, deleted or
 * updated records) since the node was last flushed will not be visible to the
 * callback.
 *
 * In other words, the data presented to the callback may lag back to the time
 * of the last checkpoint.  It cannot lag further back than the last
 * checkpoint, because: 1) the scan doesn't accumulate LSNs any higher than the
 * BIN level, and 2) checkpoints flush all dirty BINs.  For a DOS, the user may
 * decrease the likelihood of stale data by increasing the DOS queue size,
 * decreasing the LSN batch size, decreasing the memory limit, or performing a
 * checkpoint immediately before the start of the scan.  Even so, it may be
 * impossible to guarantee that all records written at the start of the scan
 * are visible to the callback.
 */
public class DiskOrderedScanner {

    /**
     * Interface implemented by the callback.
     *
     * "Processing" a record means invoking this callback with a key-data
     * pair.  The scanner consults canProcessWithoutBlocking() before calling
     * process() in situations where blocking is not permissible.
     */
    interface RecordProcessor {

        /**
         * Process a key-data pair, in user format (dup DB keys are already
         * split into key and data).
         *
         * @param dbIdx presumably the index of the scanned database that the
         * record belongs to (compare DBContext.dbIdx) -- TODO confirm.
         * @param key always non-null.
         * @param data is null only in keys-only mode.
         * @param expiration NOTE(review): looks like the record's expiration
         * time; confirm the encoding against the implementations.
         * @param expirationInHours presumably selects the unit in which
         * expiration is expressed -- TODO confirm.
         */
        void process(
            int dbIdx,
            byte[] key,
            byte[] data,
            int expiration,
            boolean expirationInHours);

        /**
         * Returns whether process() can be called nRecords times, immediately
         * after calling this method, without any possibility of blocking.
         * For example, with DOS this method returns true if the DOS queue has
         * nRecords or more remaining capacity.
         */
        boolean canProcessWithoutBlocking(int nRecords);

        /**
         * NOTE(review): undocumented in the original.  Presumably the total
         * number of records the callback can buffer (e.g. the DOS queue
         * size) -- confirm against the implementations.
         */
        int getCapacity();

        /**
         * NOTE(review): undocumented in the original.  By its name this lets
         * the callback react to a shutdown request, probably by throwing --
         * confirm against the implementations.
         */
        void checkShutdown();
    }

    /*
     * Mutable scan state kept for one of the databases being scanned.  An
     * instance is identified by the index of its database (dbIdx) and holds
     * the DatabaseImpl itself; every other field is working state that the
     * scanner updates as it goes.
     *
     * Fields without an explicit initializer rely on the Java defaults
     * (false / null / 0).
     */
    private static class DBContext {

        /* Identity of the database; fixed at construction. */
        final int dbIdx;
        final DatabaseImpl dbImpl;

        /* NOTE(review): presumably set once this DB is fully scanned. */
        boolean done;

        /*
         * NOTE(review): by their names, the highest key reached by the
         * previous iteration and by the current one -- confirm at the use
         * sites.
         */
        byte[] prevEndingKey;
        byte[] newEndingKey;

        long lastBinLsn = DbLsn.NULL_LSN;
        boolean safeToUseCachedDelta;

        /*
         * NOTE(review): the p* fields look like a saved position in a parent
         * IN (the IN, whether it is latched, and a slot's index, LSN and
         * key) -- confirm at the use sites.
         */
        IN parent;
        boolean parentIsLatched;
        int pidx;
        long plsn;
        byte[] pkey;

        boolean checkLevel2Keys = true;

        byte[] binKey;
        boolean reuseBin;

        DBContext(int dbIdx, DatabaseImpl db) {
            this.dbImpl = db;
            this.dbIdx = dbIdx;
        }
    }

    /*
     * WeakBinRefs are used to reduce the memory consumption for BIN deltas
     * that are found in the je cache during phase 1. In an older implementation
     * of DOS all such deltas were copied locally and the copies retained
     * until phase 2b, when they would be merged with their associated full
     * bins. Obviously this could consume a lot of memory.
     *
     * In the current implementation, dirty deltas are still treated the old
     * way, i.e. copied (see below for a justification). But for clean deltas,
     * a WeakBinRef is created when, during phase 1, such a delta is found in
     * the je cache (the WeakBinRef is created while the delta is latched). At
     * creation time, the WeakBinRef stores (a) a WeakReference to the BIN obj,
     * (b) the lsn found in the parent slot for this bin (the on-disk image
     * pointed to by this lsn must be the same as the latched, in-memory image,
     * given that we use WeakBinRefs for clean deltas only) (c) the lsn of
     * the full bin associated with this bin delta (copied out from the delta
     * itself), and (d) the average size of deltas for the database and log
     * file that the delta belongs to.
     *
     * DiskOrderedScanner.binDeltas is an ArrayList of Object refs, pointing
     * to either WeakBinRefs (for clean deltas) or BINs (for copies of dirty
     * deltas). The list is built during phase 1 and is used during phase 2b
     * to merge the referenced deltas with their associated full BINs. The
     * list is first sorted by full-bin LSN so that the full bins are fetched
     * in disk order.
     *
     * Shortly after a WeakBinRef is created, its associated bin gets unlatched
     * and remains unlatched until the WeakBinRef is used in phase 2b. As a
     * result, anything can happen to the bin in between. The bin may be evicted
     * and then garbage-collected, or it may be converted to a full BIN, split,
     * logged, and mutated to a delta again. If the BIN obj gets GC-ed,
     * this.get() will be null when we process this WeakBinRef in phase 2b.
     * In this case, to avoid doing one or two random I/Os in the middle of
     * sequential I/Os, this.binLsn is saved in a set of "deferred LSNs"
     * (see below) to be processed in a subsequent iteration. If the BIN obj
     * is still accessible, we compare its current full-bin LSN with the one
     * saved in this.fullBinLsn. If they are different, we again add
     * this.binLsn to deferred-LSNs set. Again this is done to avoid
     * disturbing the sequential I/O (we could use the current full-bin LSN to
     * merge the delta with the correct full bin, but this would require a
     * random I/O), but also because the bin could have split, and there is no
     * easy way to find the other bin(s) where slots from this bin were moved
     * to. Note that detecting splits via comparing the 2 full-bin LSNs is a
     * conservative approach that relies on splits being logged immediately. 
     *
     * WeakReferences are used to ref cached clean deltas in order to avoid
     * counting the size of the delta in the DOS budget or having to pin the
     * bin for a long period of time. Of course, if this.binLsn needs to be
     * deferred for the reasons mentioned above, then an average delta size will
     * be added to the memory usage for the DOS as well as the je cache.
     *
     * What about dirty cached deltas? We could be doing the same as for clean
     * deltas. However, for dirty deltas, their unsaved updates could have
     * happened before the start of the DOS and we would lose these updates
     * if instead of processing the original bin delta, we have to fetch
     * this.binLsn from disk. So, we would be violating the DOS contract that
     * says that the returned data are not older than the start of the DOS.  
     */
    /**
     * Weak reference to a BIN, together with the LSNs recorded when the
     * reference was created.
     *
     * The referent type is declared as BIN so that get() returns a BIN
     * without an unchecked cast.
     */
    public static class WeakBinRef extends WeakReference<BIN> {

        /* LSN passed to the constructor for this bin. */
        final long binLsn;

        /* Value of delta.getLastFullLsn() at construction time. */
        final long fullBinLsn;

        /* DELTA_MEM_SIZE if binLsn differs from fullBinLsn, else 0. */
        final int memSize;

        /* For Sizeof util program */
        public WeakBinRef() {
            super(null);
            binLsn = 0;
            fullBinLsn = 0;
            memSize = 0;
        }

        /**
         * @param scanner unused here; kept for interface compatibility.
         * @param ctx unused here; kept for interface compatibility.
         * @param lsn the LSN to record as binLsn.
         * @param delta the BIN to reference weakly; its last-full LSN is
         * read once and recorded as fullBinLsn.
         */
        WeakBinRef(
            DiskOrderedScanner scanner,
            DBContext ctx,
            long lsn,
            BIN delta) {

            super(delta);
            binLsn = lsn;
            fullBinLsn = delta.getLastFullLsn();

            /*
             * Compare against the value just stored, not a fresh read of the
             * BIN, so that memSize always agrees with fullBinLsn.
             */
            assert(lsn != fullBinLsn);

            /* Defensive when assertions are disabled. */
            memSize = (lsn != fullBinLsn) ? DELTA_MEM_SIZE : 0;
        }
    }

    /**
     * Plain value holder: an int id (ohBinId) plus two LSNs and a memory
     * size.
     *
     * NOTE(review): by its name, ohBinId identifies a BIN in the off-heap
     * cache -- confirm at the use sites.
     */
    public static class OffHeapBinRef {

        final int ohBinId;
        final long binLsn;
        final long fullBinLsn;

        /* DELTA_MEM_SIZE for real instances; 0 for the Sizeof instance. */
        final int memSize;

        /* For Sizeof util program */
        public OffHeapBinRef() {
            this.memSize = 0;
            this.fullBinLsn = 0;
            this.binLsn = 0;
            this.ohBinId = 0;
        }

        /**
         * Records the given id and LSNs.  The scanner and ctx arguments are
         * not used by this constructor.
         */
        OffHeapBinRef(
            DiskOrderedScanner scanner,
            DBContext ctx,
            long binLsn,
            long fullBinLsn,
            int ohBinId) {

            this.memSize = DELTA_MEM_SIZE;
            this.fullBinLsn = fullBinLsn;
            this.binLsn = binLsn;
            this.ohBinId = ohBinId;
        }
    }

    /*
     * A DeferredLsnsBatch obj stores a set of lsns whose processing cannot be
     * done in the current iteration, and as a result, must be deferred until
     * one of the subsequent iterations. 
     *
     * The DeferredLsnsBatch obj also stores the total (approximate) memory
     * that will be consumed when, during a subsequent phase 1a, for each bin
     * pointed-to by these lsns, the bin is fetched from disk and, if it is a
     * delta, stored in memory until it is merged with its associated full bin
     * (which is done in phase 2b).
     *
     * Given that we cannot exceed the DOS memory budget during each iteration,
     * no more LSNs are added to a DeferredLsnsBatch once its memoryUsage gets
     * >= memoryLimit. Instead, a new DeferredLsnsBatch is created and added
     * at the tail of the DiskOrderedScanner.deferredLsns queue. Only the batch
     * at the head of the deferredLsns queue can be processed during each
     * iteration. So, if we have N DeferredLsnsBatches in the queue, the next
     * N iterations will process only deferred lsns (no phase 1 will be done
     * during these N iterations).
     */
    /**
     * A set of deferred LSNs plus the (approximate) memory budgeted for
     * them.
     *
     * Two counters are maintained: the scanner's global memory, which is
     * charged only for what is held right now (the batch object and one set
     * entry per LSN), and this.memoryUsage, which additionally includes the
     * memory that processing each LSN is expected to need later.
     */
    public static class DeferredLsnsBatch {

        /*
         * Fixed overhead for adding an lsn to this batch
         */
        final static int LSN_MEM_OVERHEAD =
            (MemoryBudget.HASHSET_ENTRY_OVERHEAD +
             MemoryBudget.LONG_OVERHEAD);

        /* For storing one lsn in the lsns array created in phase 2a. */
        private static final int LSN_ARRAY_SLOT_SIZE = 8;

        final HashSet<Long> lsns;
        long memoryUsage = 0;

        /* For Sizeof util program */
        public DeferredLsnsBatch() {
            this.lsns = new HashSet<>();
        }

        /**
         * Creates an empty batch and charges its fixed size to both the
         * scanner's global memory and this.memoryUsage.
         */
        DeferredLsnsBatch(DiskOrderedScanner scanner) {
            this.lsns = new HashSet<>();
            scanner.addGlobalMemory(SIZEOF_DeferredLsnsBatch);
            memoryUsage += SIZEOF_DeferredLsnsBatch;
        }

        /**
         * Returns the batch's fixed size to the scanner.  All LSNs must have
         * been removed beforehand; the assertion checks that the per-LSN
         * accounting has been fully unwound.
         */
        void free(DiskOrderedScanner scanner) {
            scanner.addGlobalMemory(-SIZEOF_DeferredLsnsBatch);
            memoryUsage -= SIZEOF_DeferredLsnsBatch;
            assert(memoryUsage == 0);
        }

        boolean containsLsn(long lsn) {
            return lsns.contains(lsn);
        }

        /*
         * Total amount by which this.memoryUsage changes for one lsn.  Used
         * by addLsn, removeLsn and undoLsn so that the amount charged and the
         * amount credited cannot drift apart.
         *
         * It consists of:
         * - LSN_MEM_OVERHEAD, for the entry in this.lsns;
         * - memSize, the memory that will be needed to store the delta
         *   fetched via the lsn, if the lsn is indeed pointing to a delta;
         *   0 if it points to a full bin;
         * - LSN_ARRAY_SLOT_SIZE, for storing the lsn in the lsns array
         *   created in phase 2a;
         * - SIZEOF_DeferredDeltaRef, for the DeferredDeltaRef created in
         *   phase 2a to reference the fetched delta.
         */
        private static int budgetedSize(int memSize) {
            return LSN_MEM_OVERHEAD +
                   memSize +
                   LSN_ARRAY_SLOT_SIZE +
                   SIZEOF_DeferredDeltaRef;
        }

        /*
         * Called during phase 2b, when a WeakBinRef or OffHeapBinRef has to
         * be deferred.
         *
         * Returns the result of scanner.accLimitExceeded() for the updated
         * memoryUsage and batch size.
         *
         * NOTE(review): the memory is charged even if the lsn was already in
         * the set; callers are presumably expected never to add an lsn
         * twice -- confirm.
         */
        boolean addLsn(DiskOrderedScanner scanner, long lsn, int memSize) {

            lsns.add(lsn);

            /* Only the set entry is consumed right now. */
            scanner.addGlobalMemory(LSN_MEM_OVERHEAD);

            memoryUsage += budgetedSize(memSize);

            return scanner.accLimitExceeded(memoryUsage, lsns.size());
        }

        /*
         * Called during phase 2a after fetching the bin pointed-to by the
         * given lsn.
         *
         * Returns true if the lsn was in this batch (and has been removed).
         */
        boolean removeLsn(
            DiskOrderedScanner scanner,
            long lsn,
            int memSize) {

            boolean found = lsns.remove(lsn);

            if (found) {

                scanner.addGlobalMemory(-LSN_MEM_OVERHEAD);

                /*
                 * We don't really need to subtract from this.memoryUsage here.
                 * It is done only for sanity checking (the assertion in
                 * this.free()).
                 */
                memoryUsage -= budgetedSize(memSize);
            }

            return found;
        }

        /*
         * Reverses a previous addLsn() for the given lsn and memSize.  The
         * lsn is expected to be present (asserted); the accounting is
         * unwound unconditionally.
         */
        void undoLsn(
            DiskOrderedScanner scanner,
            long lsn,
            int memSize) {

            boolean found = lsns.remove(lsn);
            assert(found);

            scanner.addGlobalMemory(-LSN_MEM_OVERHEAD);

            memoryUsage -= budgetedSize(memSize);
        }
    }

    /*
     * Wrapper for a BIN ref. Used to distinguish whether the fetching of a
     * bin delta during phase 1a was done via a deferred lsn or not. This info
     * is needed during phase 2b: after the delta is merged with its associated
     * full bin, we need to know whether it is a deferred bin, in which case
     * its records will be processed without checking their keys against
     * prevEndingKey.
     */
    /**
     * Holds a reference to a BIN and accounts for its own size in the
     * scanner's global memory: the size is charged on construction and
     * credited back by free().
     */
    public static class DeferredDeltaRef {

        final BIN delta;

        /* For Sizeof util program */
        public DeferredDeltaRef() {
            this.delta = null;
        }

        /**
         * Wraps the given delta and charges SIZEOF_DeferredDeltaRef to the
         * scanner.
         */
        DeferredDeltaRef(DiskOrderedScanner scanner, BIN delta) {
            this.delta = delta;

            final int wrapperSize = SIZEOF_DeferredDeltaRef;
            scanner.addGlobalMemory(wrapperSize);
        }

        /**
         * Credits SIZEOF_DeferredDeltaRef back to the scanner.
         */
        void free(DiskOrderedScanner scanner) {
            final int wrapperSize = SIZEOF_DeferredDeltaRef;
            scanner.addGlobalMemory(-wrapperSize);
        }
    }

    /*
     * Fixed sets of log entry types.
     */

    /* Any LN type will do. */
    private static final LogEntryType[] LN_ONLY = {
        LogEntryType.LOG_INS_LN
    };

    /* Full BINs only. */
    private static final LogEntryType[] BIN_ONLY = {
        LogEntryType.LOG_BIN
    };

    /* Full BINs as well as both kinds of BIN-delta entries. */
    private static final LogEntryType[] BIN_OR_DELTA = {
        LogEntryType.LOG_BIN,
        LogEntryType.LOG_BIN_DELTA,
        LogEntryType.LOG_OLD_BIN_DELTA,
    };

    /*
     * Memory-accounting sizes, all taken from MemoryBudget.
     */

    /* Size of one item in an Object array. */
    private static final int SIZEOF_JAVA_REF =
        MemoryBudget.OBJECT_ARRAY_ITEM_OVERHEAD;

    /* Overhead of one WeakBinRef. */
    private static final int SIZEOF_WeakBinRef =
        MemoryBudget.DOS_WEAK_BINREF_OVERHEAD;

    /* Overhead of one OffHeapBinRef. */
    private static final int SIZEOF_OffHeapBinRef =
        MemoryBudget.DOS_OFFHEAP_BINREF_OVERHEAD;

    /* Overhead of one DeferredDeltaRef. */
    private static final int SIZEOF_DeferredDeltaRef =
        MemoryBudget.DOS_DEFERRED_DELTAREF_OVERHEAD;

    /* Overhead of one (empty) DeferredLsnsBatch. */
    private static final int SIZEOF_DeferredLsnsBatch =
        MemoryBudget.DOS_DEFERRED_LSN_BATCH_OVERHEAD;

    /* In bytes. */
    private static final int ACCUMULATED_MEM_LIMIT = 100000;

    /* In milliseconds. */
    private static final int SUSPENSION_INTERVAL = 50;

    /* A rough estimate of the memory needed for a BIN-delta object. */
    private static final int DELTA_MEM_SIZE = 1536;

    /*
     * NOTE(review): by its name, selects scanning the databases one after
     * another rather than interleaved -- confirm against the constructor.
     */
    private final boolean scanSerial;

    /*
     * Scan-mode flags.  NOTE(review): keysOnly presumably corresponds to the
     * key-only traversals and binsOnly to the BIN accumulation mode described
     * in the class comment; countOnly is not described there -- confirm all
     * three against the constructor.
     */
    private final boolean countOnly;
    private final boolean keysOnly;
    private final boolean binsOnly;

    /*
     * NOTE(review): presumably the LSN batch size limit and the memory limit
     * that end phase I when exceeded (see the class comment) -- confirm.
     */
    private final long lsnBatchSize;
    private final long memoryLimit;

    private final EnvironmentImpl env;

    /* The callback that receives the scanned records. */
    private final RecordProcessor processor;

    /*
     * Per-database scan contexts.  NOTE(review): numDBs is presumably
     * dbs.length -- confirm against the constructor.
     */
    private final int numDBs;

    private final DBContext[] dbs;

    /*
     * NOTE(review): declared as a raw Map here.  By its name it maps a
     * database id to that database's index in dbs; confirm the key and value
     * types before parameterizing it.
     */
    private final Map dbid2dbidxMap;

    /*
     * NOTE(review): presumably true when the scanned databases are
     * duplicates DBs -- confirm against the constructor.
     */
    private final boolean dupDBs;

    private final ArrayList