/*-
* Copyright (C) 2002, 2018, Oracle and/or its affiliates. All rights reserved.
*
* This file was distributed by Oracle as part of a version of Oracle Berkeley
* DB Java Edition made available at:
*
* http://www.oracle.com/technetwork/database/database-technologies/berkeleydb/downloads/index.html
*
* Please see the LICENSE file included in the top-level directory of the
* appropriate version of Oracle Berkeley DB Java Edition for a copy of the
* license and additional information.
*/
package com.sleepycat.je.dbi;
import java.lang.ref.WeakReference;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Map;
import com.sleepycat.je.CacheMode;
import com.sleepycat.je.DatabaseEntry;
import com.sleepycat.je.EnvironmentFailureException;
import com.sleepycat.je.ThreadInterruptedException;
import com.sleepycat.je.cleaner.FileProtector;
import com.sleepycat.je.evictor.Evictor;
import com.sleepycat.je.evictor.OffHeapCache;
import com.sleepycat.je.log.LogEntryType;
import com.sleepycat.je.log.LogManager;
import com.sleepycat.je.log.entry.LNLogEntry;
import com.sleepycat.je.log.entry.LogEntry;
import com.sleepycat.je.tree.BIN;
import com.sleepycat.je.tree.IN;
import com.sleepycat.je.tree.Key;
import com.sleepycat.je.tree.LN;
import com.sleepycat.je.tree.OldBINDelta;
import com.sleepycat.je.tree.SearchResult;
import com.sleepycat.je.tree.Tree;
import com.sleepycat.je.utilint.DbLsn;
import com.sleepycat.je.utilint.TestHook;
import com.sleepycat.je.utilint.TestHookExecute;
/**
* Provides an enumeration of all key/data pairs in a database, striving to
* fetch in disk order.
*
* Unlike SortedLSNTreeWalker, for which the primary use case is preload, this
* class notifies the callback while holding a latch only if that can be done
* without blocking (e.g., when the callback can buffer the data without
* blocking). In other words, while holding a latch, the callback will not be
* notified if it might block. This is appropriate for the DOS
* (DiskOrderedCursor) use case, since the callback will block if the DOS queue
* is full, and the user's consumer thread may not empty the queue as quickly
* as it can be filled by the producer thread. If the callback were allowed to
* block while a latch is held, this would block other threads attempting to
* access the database, including JE internal threads, which would have a very
* detrimental impact.
*
* Algorithm
* =========
*
* Terminology
* -----------
* callback: object implementing the RecordProcessor interface
* process: invoking the callback with a key-data pair
* iteration: top level iteration consisting of phase I and II
* phase I: accumulate LSNs
* phase II: sort, fetch and process LSNs
*
* Phase I and II
* --------------
* To avoid processing resident nodes (invoking the callback with a latch
* held), a non-recursive algorithm is used. Instead of recursively
* accumulating LSNs in a depth-first iteration of the tree (like the
* SortedLSNTreeWalker algorithm), level 2 INs are traversed in phase I and
* LSNs are accumulated for LNs or BINs (more on this below). When the memory
* or LSN batch size limit is exceeded, phase I ends and all tree latches are
* released. During phase II the previously accumulated LSNs are fetched and
* the callback is invoked for each key or key-data pair. Since no latches are
* held, it is permissible for the callback to block.
*
* One iteration of phase I and II processes some subset of the database.
* Since INs are traversed in tree order in phase I, this subset is described
* by a range of keys. When performing the next iteration, the IN traversal is
* restarted at the highest key that was processed by the previous iteration.
* The previous highest key is used to avoid duplication of entries, since some
* overlap between iterations may occur.
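*
* Roughly, each iteration looks like this (an illustrative sketch only;
* the method and variable names are hypothetical, not the actual
* implementation):
*
*   byte[] prevEndingKey = null;
*   while (!done) {
*       // Phase I: traverse level 2 INs in key order starting at
*       // prevEndingKey, accumulating LN or BIN LSNs until the LSN
*       // batch size or memory limit is exceeded; then release all
*       // tree latches.
*       long[] lsns = accumulateLsns(prevEndingKey);
*       // Phase II: fetch the LSNs in disk order and invoke the
*       // callback for each record above prevEndingKey; blocking is
*       // permitted here since no latches are held.
*       Arrays.sort(lsns);
*       prevEndingKey = fetchAndProcess(lsns, prevEndingKey);
*   }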
*
* LN and BIN modes
* ----------------
* As mentioned above, we accumulate LSNs for either LNs or BINs. The BIN
* accumulation mode provides an optimization for key-only traversals and for
* all traversals of duplicate DBs (in a dup DB, the data is included in the
* key). In these cases we never need to fetch the LN, so we can sort and
* fetch the BIN LSNs instead. This keeps at least these types of traversals
* efficient even when not all BINs are in the JE cache.
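*
* The mode selection therefore boils down to something like this (a
* hypothetical sketch; the actual selection logic may differ):
*
*   boolean binsOnly = keysOnly || dbImpl.getSortedDuplicates();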
*
* We must only accumulate LN or BIN LSNs, never both, and never the LSNs of
* other INs (above level 1). If we broke this rule, there would be no way to
* constrain memory usage in our non-recursive approach, since we could not
* easily predict in advance how much memory would be needed to fetch the
* nested nodes. Even if we were able to predict the memory needed, it would be
* of little advantage to sort and fetch a small number of higher level nodes,
* only to accumulate the LSNs of their descendants (which are much larger in
* number). In a large data set, the much smaller number of higher level
* nodes would likely be fetched via random IO anyway.
*
* The above justification also applies to the algorithm we use in LN mode, in
* which we accumulate and fetch only LN LSNs. In this mode we always fetch
* BINs explicitly (not in LSN sorted order), if they are not resident, for the
* reasons stated above.
*
* Furthermore, in BIN mode we must account for BIN-deltas. Phase I must keep
* a copy of any BIN-deltas encountered in the cache. And phase II must make two
* passes for the accumulated LSNs: one pass to load the deltas and another to
* load the full BINs and merge the previously loaded deltas. Unfortunately
* we must budget memory for the deltas during phase I; since most BIN LSNs are
* for deltas, not full BINs, we assume that we will need to temporarily save a
* delta for each LSN. This two-pass approach differs from the recursive
* algorithm we rejected above in two respects: 1) we know in advance
* (roughly anyway) how much memory we will need for both passes, and 2) the
* number of LSNs fetched in each pass is roughly the same.
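*
* Sketched (illustrative pseudocode with hypothetical helpers), phase II
* in BIN mode is:
*
*   Arrays.sort(binLsns);                            // disk order
*   List<BIN> deltas = fetchAndSaveDeltas(binLsns);  // pass 1
*   sortByFullBinLsn(deltas);
*   for (BIN delta : deltas) {
*       BIN full = fetchFullBin(delta.getLastFullLsn()); // pass 2
*       mergeAndProcess(delta, full);
*   }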
*
* Data Lag
* --------
* In phase I, as an exception to what was said above, we sometimes process
* nodes that are resident in the Btree (in the JE cache) if this is possible
* without blocking. The primary intention of this is to provide more recent
* data to the callback. When accumulating BINs, if the BIN is dirty then
* fetching its LSN later means that some recently written LNs will not be
* included. Therefore, if the callback would not block, we process the keys
* in a dirty BIN during phase I. Likewise, when accumulating LNs in a
* deferred-write database, we process dirty LNs if the callback would not
* block. When accumulating LN LSNs for a non-deferred-write database, we can
* go further and process all resident LNs, as long as the callback would not
* block, since we know that no LNs are dirty.
*
* In spite of our attempt to process resident nodes, we may not be able to
* process all of them if doing so would cause the callback to block. When we
* can't process a dirty, resident node, the information added (new, deleted or
* updated records) since the node was last flushed will not be visible to the
* callback.
*
* In other words, the data presented to the callback may lag back to the time
* of the last checkpoint. It cannot lag further back than the last
* checkpoint, because: 1) the scan doesn't accumulate LSNs any higher than the
* BIN level, and 2) checkpoints flush all dirty BINs. For a DOS, the user may
* decrease the likelihood of stale data by increasing the DOS queue size,
* decreasing the LSN batch size, decreasing the memory limit, or performing a
* checkpoint immediately before the start of the scan. Even so, it may be
* impossible to guarantee that all records written at the start of the scan
* are visible to the callback.
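*
* For example, a DOS user might reduce the lag with settings like the
* following (a sketch only; see DiskOrderedCursorConfig for the actual
* API and defaults):
*
*   DiskOrderedCursorConfig config = new DiskOrderedCursorConfig();
*   config.setQueueSize(10000);              // larger queue
*   config.setLSNBatchSize(1000);            // smaller LSN batches
*   config.setInternalMemoryLimit(1L << 20); // smaller memory limit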
*/
public class DiskOrderedScanner {
/**
* Interface implemented by the callback.
*/
interface RecordProcessor {
/**
* Process a key-data pair, in user format (dup DB keys are already
* split into key and data).
*
* @param key always non-null.
* @param data is null only in keys-only mode.
*/
void process(
int dbIdx,
byte[] key,
byte[] data,
int expiration,
boolean expirationInHours);
/**
* Returns whether process() can be called nRecords times, immediately
* after calling this method, without any possibility of blocking.
* For example, with DOS this method returns true if the DOS queue has
* nRecords or more remaining capacity.
*/
boolean canProcessWithoutBlocking(int nRecords);
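/**
 * Returns the total number of records the callback can buffer; for
 * DOS this is presumably the queue capacity.
 */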
int getCapacity();
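/**
 * Gives the callback an opportunity to throw if the scan is being
 * shut down (assumed semantics, based on the method name).
 */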
void checkShutdown();
}
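/*
 * A minimal RecordProcessor sketch (hypothetical, for illustration
 * only; the real DOS processor enqueues records and blocks when the
 * queue is full):
 *
 *   RecordProcessor counter = new RecordProcessor() {
 *       long count = 0;
 *       public void process(int dbIdx, byte[] key, byte[] data,
 *                           int expiration, boolean expirationInHours) {
 *           count++; // never blocks
 *       }
 *       public boolean canProcessWithoutBlocking(int nRecords) {
 *           return true;
 *       }
 *       public int getCapacity() {
 *           return Integer.MAX_VALUE;
 *       }
 *       public void checkShutdown() {
 *       }
 *   };
 */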
/*
 * Per-database scan state. A DBContext is kept for each database being
 * scanned; it records the traversal position (the level 2 parent IN,
 * slot index and keys) and the key range processed so far.
 */
private static class DBContext {
final int dbIdx;
final DatabaseImpl dbImpl;
boolean done = false;
byte[] prevEndingKey = null;
byte[] newEndingKey = null;
long lastBinLsn = DbLsn.NULL_LSN;
boolean safeToUseCachedDelta = false;
IN parent = null;
boolean parentIsLatched;
int pidx = 0;
long plsn;
byte[] pkey;
boolean checkLevel2Keys = true;
byte[] binKey;
boolean reuseBin = false;
DBContext(int dbIdx, DatabaseImpl db) {
this.dbIdx = dbIdx;
dbImpl = db;
}
}
/*
* WeakBinRefs are used to reduce the memory consumption for BIN deltas
* that are found in the JE cache during phase 1. In an older implementation
* of DOS all such deltas were copied locally and the copies retained
* until phase 2b, when they would be merged with their associated full
* bins. Obviously this could consume a lot of memory.
*
* In the current implementation, dirty deltas are still treated the old
* way, i.e. copied (see below for a justification). But for clean deltas,
* a WeakBinRef is created when, during phase 1, such a delta is found in
* the JE cache (the WeakBinRef is created while the delta is latched). At
* creation time, the WeakBinRef stores (a) a WeakReference to the BIN obj,
* (b) the lsn found in the parent slot for this bin (the on-disk image
* pointed to by this lsn must be the same as the latched, in-memory image,
* given that we use WeakBinRefs for clean deltas only) (c) the lsn of
* the full bin associated with this bin delta (copied out from the delta
* itself), and (d) the average size of deltas for the database and log
* file that the delta belongs to.
*
* DiskOrderedScanner.binDeltas is an ArrayList of Object refs, pointing
* to either WeakBinRefs (for clean deltas) or BINs (for copies of dirty
* deltas). The list is built during phase 1 and is used during phase 2b
* to merge the referenced deltas with their associated full BINs. The
* list is first sorted by full-bin LSN so that the full bins are fetched
* in disk order.
*
* Shortly after a WeakBinRef is created, its associated bin gets unlatched
* and remains unlatched until the WeakBinRef is used in phase 2b. As a
* result, anything can happen to the bin in between. The bin may be evicted
* and then garbage-collected, or it may be converted to a full BIN, split,
* logged, and mutated to a delta again. If the BIN obj gets GC-ed,
* this.get() will be null when we process this WeakBinRef in phase 2b.
* In this case, to avoid doing one or two random I/Os in the middle of
* sequential I/Os, this.binLsn is saved in a set of "deferred LSNs"
* (see below) to be processed in a subsequent iteration. If the BIN obj
* is still accessible, we compare its current full-bin LSN with the one
* saved in this.fullBinLsn. If they are different, we again add
* this.binLsn to deferred-LSNs set. Again this is done to avoid
* disturbing the sequential I/O (we could use the current full-bin LSN to
* merge the delta with the correct full bin, but this would require a
* random I/O), but also because the bin could have split, and there is no
* easy way to find the other bin(s) where slots from this bin were moved
* to. Note that detecting splits via comparing the 2 full-bin LSNs is a
* conservative approach that relies on splits being logged immediately.
*
* WeakReferences are used to ref cached clean deltas in order to avoid
* counting the size of the delta in the DOS budget or having to pin the
* bin for a long period of time. Of course, if this.binLsn needs to be
* deferred for the reasons mentioned above, then an average delta size will
* be added to the memory usage for the DOS as well as the JE cache.
*
* What about dirty cached deltas? We could be doing the same as for clean
* deltas. However, for dirty deltas, their unsaved updates could have
* happened before the start of the DOS and we would lose these updates
* if instead of processing the original bin delta, we have to fetch
* this.binLsn from disk. So, we would be violating the DOS contract that
* says that the returned data are not older than the start of the DOS.
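*
* Sketch of the phase 2b check described above (illustrative only):
*
*   BIN delta = weakBinRef.get();
*   if (delta == null ||
*       delta.getLastFullLsn() != weakBinRef.fullBinLsn) {
*       // GC-ed, or logged/split since phase 1: defer binLsn rather
*       // than disturb the sequential fetch order.
*       deferredBatch.addLsn(scanner, weakBinRef.binLsn,
*                            weakBinRef.memSize);
*   } else {
*       // Safe to merge the cached delta with the full bin fetched
*       // via weakBinRef.fullBinLsn.
*   }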
*/
public static class WeakBinRef extends WeakReference<BIN> {
final long binLsn;
final long fullBinLsn;
final int memSize;
/* For Sizeof util program */
public WeakBinRef() {
super(null);
binLsn = 0;
fullBinLsn = 0;
memSize = 0;
}
WeakBinRef(
DiskOrderedScanner scanner,
DBContext ctx,
long lsn,
BIN delta) {
super(delta);
binLsn = lsn;
fullBinLsn = delta.getLastFullLsn();
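/*
 * WeakBinRefs are created for cached deltas only, so lsn (the LSN
 * in the parent slot) should never equal the LSN of the full bin;
 * the else branch below is only a defensive fallback for when
 * assertions are disabled.
 */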
assert(lsn != delta.getLastFullLsn());
if (lsn != delta.getLastFullLsn()) {
memSize = DELTA_MEM_SIZE;
} else {
memSize = 0;
}
}
}
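/*
 * The analog of WeakBinRef for deltas found in the off-heap cache:
 * the delta is identified by its off-heap cache id (ohBinId) rather
 * than a weak reference (assumed from the field names; see
 * OffHeapCache).
 */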
public static class OffHeapBinRef {
final int ohBinId;
final long binLsn;
final long fullBinLsn;
final int memSize;
/* For Sizeof util program */
public OffHeapBinRef() {
ohBinId = 0;
binLsn = 0;
fullBinLsn = 0;
memSize = 0;
}
OffHeapBinRef(
DiskOrderedScanner scanner,
DBContext ctx,
long binLsn,
long fullBinLsn,
int ohBinId) {
this.ohBinId = ohBinId;
this.binLsn = binLsn;
this.fullBinLsn = fullBinLsn;
memSize = DELTA_MEM_SIZE;
}
}
/*
* A DeferredLsnsBatch obj stores a set of lsns whose processing cannot be
* done in the current iteration, and as a result, must be deferred until
* one of the subsequent iterations.
*
* The DeferredLsnsBatch obj also stores the total (approximate) memory
* that will be consumed when, during a subsequent phase 1a, for each bin
* pointed-to by these lsns, the bin is fetched from disk and, if it is a
* delta, stored in memory until it is merged with its associated full bin
* (which is done in phase 2b).
*
* Given that we cannot exceed the DOS memory budget during each iteration,
* no more LSNs are added to a DeferredLsnsBatch once its memoryUsage gets
* >= memoryLimit. Instead, a new DeferredLsnsBatch is created and added
* at the tail of the DiskOrderedScanner.deferredLsns queue. Only the batch
* at the head of the deferredLsns queue can be processed during each
* iteration. So, if we have N DeferredLsnsBatches in the queue, the next
* N iterations will process only deferred lsns (no phase 1 will be done
* during these N iterations).
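*
* For example (illustrative numbers): with a memoryLimit of 1MB, if
* deferred lsns accumulate faster than they are consumed, batch B1
* fills to ~1MB and a new batch B2 is added at the queue tail; the
* next iteration processes only B1, the one after it only B2, and
* normal phase 1 traversal resumes afterwards.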
*/
public static class DeferredLsnsBatch {
/*
* Fixed overhead for adding an lsn to this batch
*/
final static int LSN_MEM_OVERHEAD =
(MemoryBudget.HASHSET_ENTRY_OVERHEAD +
MemoryBudget.LONG_OVERHEAD);
final HashSet<Long> lsns;
long memoryUsage = 0;
/* For Sizeof util program */
public DeferredLsnsBatch() {
this.lsns = new HashSet<>();
memoryUsage = 0;
}
DeferredLsnsBatch(DiskOrderedScanner scanner) {
this.lsns = new HashSet<>();
scanner.addGlobalMemory(SIZEOF_DeferredLsnsBatch);
memoryUsage += SIZEOF_DeferredLsnsBatch;
}
void free(DiskOrderedScanner scanner) {
scanner.addGlobalMemory(-SIZEOF_DeferredLsnsBatch);
memoryUsage -= SIZEOF_DeferredLsnsBatch;
assert(memoryUsage == 0);
}
boolean containsLsn(long lsn) {
return lsns.contains(lsn);
}
/*
* Called during phase 2b, when a WeakBinRef or OffHeapBinRef has to
* be deferred.
*/
boolean addLsn(DiskOrderedScanner scanner, long lsn, int memSize) {
this.lsns.add(lsn);
int currentMemConsumption = LSN_MEM_OVERHEAD;
scanner.addGlobalMemory(currentMemConsumption);
/*
* memSize (copied from the deferred ref's memSize by the caller) is
* the memory that will be needed during a subsequent phase 2a to
* store the delta fetched via the ref's binLsn, if that lsn is indeed
* pointing to a delta; 0 if it points to a full bin.
*/
int futureMemConsumption = memSize;
/* For storing ref.binLsn in the lsns array created in phase 2a. */
futureMemConsumption += 8;
/*
* For the DeferredDeltaRef created in phase 2a to reference the
* delta fetched via ref.binLsn.
*/
futureMemConsumption += SIZEOF_DeferredDeltaRef;
memoryUsage += (currentMemConsumption + futureMemConsumption);
return scanner.accLimitExceeded(memoryUsage, lsns.size());
}
/*
* Called during phase 2a after fetching the bin pointed-to by the
* given lsn.
*/
boolean removeLsn(
DiskOrderedScanner scanner,
long lsn,
int memSize) {
boolean found = lsns.remove(lsn);
if (found) {
scanner.addGlobalMemory(-LSN_MEM_OVERHEAD);
/*
* We don't really need to subtract from this.memoryUsage here.
* It is done only for sanity checking (the assertion in
* this.free()).
*/
int memDelta = LSN_MEM_OVERHEAD;
memDelta += memSize;
memDelta += 8;
memDelta += SIZEOF_DeferredDeltaRef;
memoryUsage -= memDelta;
}
return found;
}
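/*
 * Reverses the accounting done by addLsn() for an lsn that must
 * still be in this batch (hence the assertion), mirroring the
 * bookkeeping in removeLsn().
 */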
void undoLsn(
DiskOrderedScanner scanner,
long lsn,
int memSize) {
boolean found = lsns.remove(lsn);
assert(found);
scanner.addGlobalMemory(-LSN_MEM_OVERHEAD);
int memDelta = LSN_MEM_OVERHEAD;
memDelta += memSize;
memDelta += 8;
memDelta += SIZEOF_DeferredDeltaRef;
memoryUsage -= memDelta;
}
}
/*
* Wrapper for a BIN ref. Used to distinguish whether the fetching of a
* bin delta during phase 1a was done via a deferred lsn or not. This info
* is needed during phase 2b: after the delta is merged with its associated
* full bin, we need to know whether it is a deferred bin, in which case
* its records will be processed without checking their keys against
* prevEndingKey.
*/
public static class DeferredDeltaRef {
final BIN delta;
/* For Sizeof util program */
public DeferredDeltaRef() {
delta = null;
}
DeferredDeltaRef(DiskOrderedScanner scanner, BIN delta) {
this.delta = delta;
scanner.addGlobalMemory(SIZEOF_DeferredDeltaRef);
}
void free(DiskOrderedScanner scanner) {
scanner.addGlobalMemory(-SIZEOF_DeferredDeltaRef);
}
}
private static final LogEntryType[] LN_ONLY = new LogEntryType[] {
LogEntryType.LOG_INS_LN /* Any LN type will do. */
};
private static final LogEntryType[] BIN_ONLY = new LogEntryType[] {
LogEntryType.LOG_BIN
};
private static final LogEntryType[] BIN_OR_DELTA = new LogEntryType[] {
LogEntryType.LOG_BIN,
LogEntryType.LOG_BIN_DELTA,
LogEntryType.LOG_OLD_BIN_DELTA,
};
private final static int SIZEOF_JAVA_REF =
MemoryBudget.OBJECT_ARRAY_ITEM_OVERHEAD;
private final static int SIZEOF_WeakBinRef =
MemoryBudget.DOS_WEAK_BINREF_OVERHEAD;
private final static int SIZEOF_OffHeapBinRef =
MemoryBudget.DOS_OFFHEAP_BINREF_OVERHEAD;
private final static int SIZEOF_DeferredDeltaRef =
MemoryBudget.DOS_DEFERRED_DELTAREF_OVERHEAD;
private final static int SIZEOF_DeferredLsnsBatch =
MemoryBudget.DOS_DEFERRED_LSN_BATCH_OVERHEAD;
private final static int ACCUMULATED_MEM_LIMIT = 100000; // bytes
private final static int SUSPENSION_INTERVAL = 50; // in milliseconds
/* A rough estimate of the memory needed for a BIN-delta object. */
private static final int DELTA_MEM_SIZE = 1536;
private final boolean scanSerial;
private final boolean countOnly;
private final boolean keysOnly;
private final boolean binsOnly;
private final long lsnBatchSize;
private final long memoryLimit;
private final EnvironmentImpl env;
private final RecordProcessor processor;
private final int numDBs;
private final DBContext[] dbs;
private final Map<DatabaseId, Integer> dbid2dbidxMap;
private final boolean dupDBs;
private final ArrayList<Object> binDeltas;