com.sleepycat.je.rep.vlsn.VLSNIndex Maven / Gradle / Ivy
/*-
* Copyright (C) 2002, 2018, Oracle and/or its affiliates. All rights reserved.
*
* This file was distributed by Oracle as part of a version of Oracle Berkeley
* DB Java Edition made available at:
*
* http://www.oracle.com/technetwork/database/database-technologies/berkeleydb/downloads/index.html
*
* Please see the LICENSE file included in the top-level directory of the
* appropriate version of Oracle Berkeley DB Java Edition for a copy of the
* license and additional information.
*/
package com.sleepycat.je.rep.vlsn;
import static com.sleepycat.je.utilint.VLSN.NULL_VLSN;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import java.util.logging.Level;
import java.util.logging.Logger;
import com.sleepycat.bind.tuple.LongBinding;
import com.sleepycat.je.Cursor;
import com.sleepycat.je.CursorConfig;
import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseConfig;
import com.sleepycat.je.DatabaseEntry;
import com.sleepycat.je.DatabaseException;
import com.sleepycat.je.DbInternal;
import com.sleepycat.je.Durability;
import com.sleepycat.je.Environment;
import com.sleepycat.je.EnvironmentFailureException;
import com.sleepycat.je.LockMode;
import com.sleepycat.je.OperationStatus;
import com.sleepycat.je.StatsConfig;
import com.sleepycat.je.TransactionConfig;
import com.sleepycat.je.cleaner.FileProtector.ProtectedFileSet;
import com.sleepycat.je.dbi.DatabaseImpl;
import com.sleepycat.je.dbi.DbTree;
import com.sleepycat.je.dbi.DbType;
import com.sleepycat.je.dbi.EnvironmentImpl;
import com.sleepycat.je.log.LogEntryType;
import com.sleepycat.je.log.LogItem;
import com.sleepycat.je.recovery.RecoveryInfo;
import com.sleepycat.je.rep.impl.RepParams;
import com.sleepycat.je.rep.impl.node.NameIdPair;
import com.sleepycat.je.rep.vlsn.VLSNRange.VLSNRangeBinding;
import com.sleepycat.je.txn.BasicLocker;
import com.sleepycat.je.txn.Locker;
import com.sleepycat.je.txn.Txn;
import com.sleepycat.je.utilint.DbLsn;
import com.sleepycat.je.utilint.LoggerUtils;
import com.sleepycat.je.utilint.LongStat;
import com.sleepycat.je.utilint.Pair;
import com.sleepycat.je.utilint.StatGroup;
import com.sleepycat.je.utilint.TestHook;
import com.sleepycat.je.utilint.TestHookExecute;
import com.sleepycat.je.utilint.VLSN;
/**
* {@literal
* A VLSN (Virtual LSN) is used to identify every log entry shared between
* members of the replication group. Since a JE log is identified by LSNs, we
* must have a way to map VLSN->LSNs in order to fetch a replicated log record
* from the local log, using the VLSN. The VLSNIndex implements those
* mappings. The VLSNIndex has these responsibilities:
*
* Generating new VLSNs.
* Only masters need to generate VLSNs, but any node may have the potential
* to be a master. The VLSN sequence must ascend over time and across
* recoveries, so the VLSN id must be preserved much like the database, node
* and txn ids.
* Maintaining the VLSN range.
* Although each node needs to receive and store each log entry from the
* replication stream, over time the part of the stream that is stored can be
* reduced, either by log cleaning, or by syncups which can truncate the
* replication stream. A node always holds a contiguous portion of the
* replication stream. The VLSN range identifies that portion by having the
* start and end VLSNs, as well as key landmarks such as the lastSync-able
* log entry and the last commit log entry. VLSN range information is used by
* elections and syncup.
* Gatekeeper for waiting for the most recently logged entries.
* Feeders block upon the VLSNIndex when they are trying to fetch the most
* recently logged entries. These recent log entries are held in a two level
* cache within the VLSNIndex. A call to VLSNIndex.waitForVLSN() goes through
* this sequence:
* 1) check the log item stored in the vlsn wait latch, if the call did wait.
* 2) check the log item cache
* If both fail, the FeederReader will fetch the required log entry from log
* buffers or disk
* Providing the LSN mapping for a log record identified by its VLSN.
* The Feeders and the syncup protocol both need to retrieve log records
* by VLSN. To do that, we need an LSN mapping.
*
* Mappings are added to VLSNIndex when replicated log entries are written into
* the local log. Although all mappings are registered, the VLSNIndex does not
* keep every one, in order to save on disk and in-memory storage. Only a
* sparse set is kept. When searching for a log entry by VLSN, the caller uses
* the closest available mapping and then scans the log looking for that entry.
*
* The VLSNIndex relies on the assumption that VLSN tagged log entries are
* ordered and contiguous in the log. That is, the LSN for VLSN 1 is < the LSN
* for VLSN 2 < LSN for VLSN 3, and there is never a gap in the VLSNs. However,
* at node syncup, the replication stream may need to be truncated when rolling
* back a non-committed log entry. We can't literally truncate the log files
* because the JE logs contain intermingled transactional and non transactional
* information. Instead, the truncation is done both logically by amending the
* VLSNIndex, and physically by overmarking those entries in the JE
* logs. Because of that, a physical dump of the log may show some VLSN tagged
* entries as duplicate and/or out of order because they're abandoned log
* entries that are not logically part of the replication stream any more
* For example, the log can look like this:
* LSN 100, VLSN 1
* LSN 200, VLSN 2 <- overmarked
* LSN 300, VLSN 3 <- overmarked
* --- syncup, rollback to VLSN 1, restart at VLSN 2
* LSN 400, VLSN 2
* LSN 500, VLSN 3
*
* VLSN->LSN mappings are created under the log write latch, which ensures that
* all VLSN tagged log entries are ordered in the logical replication stream in
* the log. However, the mapping is added to the VLSNIndex outside the log
* write latch, so the VLSNIndex database may have a momentary gap. For
* example,
*
* t0- thread 1 logs entry at VLSN=1, LSN=100, within log write latch
* t1- thread 2 logs entry at VLSN=2, LSN=150, within log write latch
* t2- thread 3 logs entry at VLSN=3, LSN=200, within log write latch
* t3- thread 1 calls VLSNIndex.put(VLSN=1/LSN=100)
* t4- thread 3 calls VLSNIndex.put(VLSN=3/LSN=200)
* t5- thread 2 calls VLSNIndex.put(VLSN=2/LSN=150)
*
* At t4, the VLSNIndex contains 1/100, 3/200, but not 2/150. However, we know
* that the VLSNIndex always represents a contiguous range of VLSNs, so the
* fact that 2/150 is not yet present is handled, and is just like the case where
* the VLSNIndex optimized away the mapping in order to keep the index sparse.
*
* We do guarantee that the start and end VLSNs in the range have mappings, in
* order to always be able to provide a LTE and GTE mapping for all valid
* VLSNs. Because of that, if a VLSN comes out of order, it does not update the
* range. Care must be taken when truncating the VLSNIndex from the head or the
* tail to ensure that the guaranteed existence of the start and end range
* mapping remains valid.
*
* Cache and persistent storage:
*
* The VLSN->LSN mappings in the range are grouped into instances of
* com.sleepycat.je.rep.vlsn.VLSNBucket. Each bucket knows the first and last VLSN
* within its mini-range. We observe these invariants
* - buckets are ordered by VLSN in the database and the bucket cache,
* - only the last bucket is the target of updates at any time,
* - a single bucket corresponds to a single file, but a single file may
* have multiple buckets covering it.
*
* While it would be nice to also guarantee that there are no gaps between
* buckets, ie:
* bucket(N-1).last == bucket(N).first - 1
* bucket(N).last == bucket(N+1).first - 1
* it is not possible to do so because the put() call is not serialized,
* since we don't want to add overhead to the log write latch. In order
* to permit out of order puts(), and to require that only the last bucket
* is updated, we must permit gaps between buckets.
*
* Mappings start out being cached in VLSNBuckets held in memory by the
* VLSNTracker. As the tracker fills, the buckets are flushed to persistent
* storage in a internal, non-replicated database. Both the database and
* the tracker cache hold key/value pairs where
*
* key = bucket.first
* data = bucket
*
* Since the first valid VLSN is 1, key = -1 is reserved for storage of the
* VLSNRange.
*
* Buckets are filled up as new VLSNs arrive (either because they've been
* generated by write operations on the master, or because they're incoming
* operations on the replica). They're flushed to disk periodically rather than
* with every new VLSN, because the update rate would have too much of a
* performance impact. Since there is this level of caching happening, we must
* be careful to write in-memory buckets to disk at well known points to
* support recoverability. The flushing must be instigated by a third party
* activity, such as checkpointing, rather than by the action of adding a new
* mapping. That's because mappings are registered by the logging system, and
* although we are not holding the log write latch at that point, it seems
* inadvisable to recursively generate another logging call on behalf of the
* flush. Currently the VLSNIndex is flushed to disk at every checkpoint. It
* can also optionally happen more often, and (TODO) we may want to do so
* because we've seen cases where checkpoints take a very long time. Perhaps we
* should flush when we flip to a new log file?
*
* Once written to disk, the buckets are generally not updated. Updates can
* happen when the range is truncated, such as for syncup rollback, but the
* system is quiescent at that time, and there are no new mappings created. Log
* cleaning can read the vlsnIndex and delete buckets, but will not modify
* mappings. The VLSNRange does naturally change often, and that data record
* does get updated.
*
* Recovery:
*
* The VLSN database is restored at recovery time just as all other databases
* are. However, there may be a portion of the VLSN range that was not flushed
* to disk. At recovery, we piggyback onto the log scanning done and re-track
* any mappings found within the recovery range. Those mappings are merged
* into those stored on disk, so that the VLSNIndex correctly reflects the
* entire replication stream at startup. For example, suppose a log has:
*
* LSN
* 100 firstActiveLSN
* 200 Checkpoint start
* 300 VLSN 78
* 400 VLSNIndex flushed here
* 500 Checkpoint end
* 600 VLSN 79
*
* The VLSNIndex is initially populated with the version of the index found
* at LSN 400. That doesn't include VLSN 79. A tracking pass is done from
* checkpoint start -> end of log, which sweeps up VLSN 78 and VLSN 79 into
* a temporary tracker. That tracker is merged in the VLSNIndex, to update
* its mappings to VLSN 79.
*
* Note that the checkpoint VLSNIndex must encompass all vlsn mappings that are
* prior to the checkpoint start of that recovery period. This follows the
* general philosophy that checkpoint flushes all metadata, and recovery reads
* from checkpoint start onwards to add on any needed extra data.
*
* Retrieving mappings:
*
* Callers who need to retrieve mappings obtain a VLSNScanner, which acts as a
* cursor over the VLSNIndex. A VLSNScanner finds and saves the applicable
* VLSNBucket, and queries the bucket directly as long as it can provide
* mappings. This reduces the level of contention between multiple readers
* (feeders) and writers (application threads, or the replay thread)
*
* Synchronization hierarchy:
*
* To write a new mapping, you must have the mutex on the VLSNIndex, and then
* the tracker, which lets you obtain the correct bucket, and then you must
* have a mutex on the bucket. To read a mapping, you must have the tracker
* mutex to obtain the right bucket. If you already have the right bucket in
* hand, you only need the bucket mutex.
*
* In truth, buckets which are not the "currentBucket" are not modified again,
* so a future optimization would allow for reading a mapping on a finished
* bucket without synchronization.
*
* The VLSNRange is updated as an atomic assignment to a volatile field after
* taking the mutex on the current bucket. It is read without a mutex, by
* looking at it as a volatile field.
*
* The hierarchy is
* VLSNIndex -> VLSNTracker -> VLSNBucket
* VLSNIndex -> VLSNTracker -> VLSNRange
* VLSNIndex -> VLSNIndex.mappingSynchronizer
* VLSNIndex.flushSynchronizer -> VLSNTracker
*
* Removing mappings vs reading mappings - sync on the range.
*
* We also need to consider that fact that callers of the VLSNIndex may be
* holding other mutex, or IN latches, and that the VLSNIndex methods may do
* database operations to read or write to the internal VLSN database. That can
* result in a nested database operation, and we need to be careful to avoid
* deadlocks. To be safe, we disable critical eviction [#18475]
* VLSNBucket.writeDatabase().
*
* Writers
* -------
* Allocating a new VLSN: bump()
* - sync on log write latch
* Note that since there is no synchronization on the VLSNIndex itself,
* [allocating new VLSN, logging its entry] and [flushing the vlsn index
* to disk] is not atomic. See awaitConsistency().
*
* Adding a mapping: put()
* - sync on VLSNIndex
* -sync on VLSNTracker to access the right bucket, and possibly
* create a new bucket. Atomically modify the VLSNRange.
*
* Flushing mappings to disk: writeToDatabase()
* - sync on VLSNIndex.flushSynchronizer -> VLSNTracker
*
* Replica side syncup truncates the VLSNIndex from the end:
* - no synchronization needed, the system is quiescent, and we can assume
* that VLSNs are neither read nor written by other threads.
*
* Log cleaning truncates the VLSNIndex from the beginning:
* We assume that the log cleaner is prohibited from deleting files that are
* being used for current feeding. We can also assume that the end of the
* log is not being deleted, and that we do not conflict with put(). We do
* have to worry about conflicting with backwards scans when executing
* syncup as a feeder, and with flushing mappings to disk. Shall we
* disable log file deletion at this point?
*
* Steps to take:
*
* First change the VLSNRange:
* - sync on VLSNIndex
* - atomically modify the VLSNRange to ensure that no readers or
* writers touch the buckets that will be deleted.
* - sync on VLSNTracker to delete any dead buckets. Do that before
* updating the on-disk database, so that we don't lose any
* buckets to writeToDatabase().
* - without synchronization, scan the database and non-transactionally
* delete any on-disk buckets that are <= the log cleaned file.
*
* Readers
* -------
* Active forward feeder checks if a mapping exists, and waits if necessary
* - read the current VLSNRange w/out a mutex. If not satisfactory
* - sync on VLSNIndex
* - sync on VLSNIndex.mappingSynchronizer
*
* Active forward feeder reads a mapping:
* first - getBucket()
* - sync on VLSNTracker to access the right bucket
* if bucket is in hand
* - sync on target bucket to read bucket
* }
*/
public class VLSNIndex {
/*
* The length of time that a checkpoint will wait for the vlsn index to
* contain all vlsn->lsn mappings before the checkpoint start.
* Presumably expressed in milliseconds, judging by the name. The field is
* public, static and non-final, so it can be reassigned from outside.
*/
public static int AWAIT_CONSISTENCY_MS = 60000;
/* The owning environment; assigned once by the constructor. */
private final EnvironmentImpl envImpl;
/*
* VLSN waiting: A Feeder may block waiting for the next available record
* in the replication stream.
* vlsnPutLatch - Latch used to wait for the next VLSN put operation.
* putWaitVLSN - The VLSN associated with the vlsnPutLatch, it's only
* meaningful in the presence of a latch.
*
* Both fields are installed by setupWait(), cleared by put() once the
* awaited (or a later) VLSN arrives, and cleared by initAsReplica().
*/
private VLSNAwaitLatch vlsnPutLatch = null;
private VLSN putWaitVLSN = null;
/*
* Consider replacing the mapping synchronizer with a lower overhead and
* multi-processor friendly CAS style nowait code sequence.
*
* put() and waitForVLSN() take this lock, nested inside the VLSNIndex
* monitor, while they read or update vlsnPutLatch and putWaitVLSN.
*/
private final Object mappingSynchronizer = new Object();
/*
* Held by truncateDatabaseFromHead() while it prunes the head of the
* mapping database and flushes the tracker to it.
*/
private final Object flushSynchronizer = new Object();
/* Obtained from LoggerUtils.getLogger() at the start of the constructor. */
private final Logger logger;
/*
* nextVLSNCounter is incremented under the log write latch, when used on
* the master. If this node transitions from replica to master, this
* counter must be initialized before write operations begin. It can also
* be used by both masters and replicas when checking vlsn consistency
* before checkpoints.
*
* initAsMaster() installs a fresh counter; initAsReplica() sets the field
* to null, in which case getLatestAllocatedVal() falls back to
* replicaLatestVLSNSeq.
*/
private AtomicLong nextVLSNCounter;
/*
* For replica node to record the latest vlsn seq copied from master.
* Written by setReplicaLatestVLSNSeq() and reset by initAsMaster() and
* initAsReplica().
*/
private volatile long replicaLatestVLSNSeq = VLSN.NULL_VLSN_SEQUENCE;
/*
* For storing the persistent version of the VLSNIndex. For keys > 0,
* the key is the VLSN sequence number, data = VLSNBucket. Key = -1 has
* a special data item, which is the VLSNRange.
*/
private DatabaseImpl mappingDbImpl;
/*
* The tracker handles the real mechanics of maintaining the VLSN range
* and mappings.
*/
private VLSNTracker tracker;
/*
* A wait-free cache of the most recent log items in the VLSN index. These
* items are important since they are the ones needed by the feeders that
* are responsible for supplying timely commit acknowledgments.
*
* Created last in the constructor, sized from
* RepParams.VLSN_LOG_CACHE_SIZE.
*/
private final LogItemCache logItemCache;
/*
* Statistics associated with the VLSN index. The two LongStats below are
* registered in this group by the constructor.
*/
private final StatGroup statistics;
private final LongStat nHeadBucketsDeleted;
private final LongStat nTailBucketsDeleted;
/* For testing [#20726] flushToDatabase while getGTEBucket is executing */
private TestHook> searchGTEHook;
/**
* The mapping db's name is passed in as a parameter instead of the more
* intuitive approach of defining it within the class to facilitate unit
* testing of the VLSNIndex.
*/
public VLSNIndex(EnvironmentImpl envImpl,
String mappingDbName,
@SuppressWarnings("unused")
NameIdPair nameIdPair,
int vlsnStride,
int vlsnMaxMappings,
int vlsnMaxDistance,
RecoveryInfo recoveryInfo)
throws DatabaseException {
this.envImpl = envImpl;
/*
* initialize the logger early so it can be used by the following
* methods.
*/
logger = LoggerUtils.getLogger(getClass());
statistics = new StatGroup(VLSNIndexStatDefinition.GROUP_NAME,
VLSNIndexStatDefinition.GROUP_DESC);
nHeadBucketsDeleted =
new LongStat(statistics,
VLSNIndexStatDefinition.N_HEAD_BUCKETS_DELETED);
nTailBucketsDeleted =
new LongStat(statistics,
VLSNIndexStatDefinition.N_TAIL_BUCKETS_DELETED);
init(mappingDbName,
vlsnStride,
vlsnMaxMappings,
vlsnMaxDistance,
recoveryInfo);
logItemCache = new LogItemCache(envImpl.getConfigManager().
getInt(RepParams.VLSN_LOG_CACHE_SIZE),
statistics);
}
/**
 * Prepares this index for use on a master. The node may become master
 * right after recovery, or after having run as a replica for a while.
 *
 * The VLSN counter is re-seeded so that the next value handed out by
 * {@link #bump()} follows the last VLSN of the current range, and the
 * replica-side "latest VLSN" value is reset.
 */
public void initAsMaster() {
    final VLSN lastInRange = tracker.getRange().getLast();

    final long seed;
    if (!lastInRange.equals(VLSN.NULL_VLSN)) {
        /* Continue the sequence after the last VLSN in the range. */
        seed = lastInRange.getSequence();
    } else if (envImpl.needRepConvert()) {
        /*
         * When the master performs the conversion, the first VLSN handed
         * out must be 2, so that a Replica throws a
         * LogRefreshRequiredException and does a NetworkRestore to copy
         * the master logs.
         */
        seed = 1;
    } else {
        seed = 0;
    }

    nextVLSNCounter = new AtomicLong(seed);
    replicaLatestVLSNSeq = VLSN.NULL_VLSN_SEQUENCE;
}
/**
 * Prepares this index for use on a replica after the node has been a
 * master.
 *
 * Any outstanding VLSN await latch (used for feeding and for checkpoint
 * precondition checking) is terminated and dropped, the master-side VLSN
 * counter is discarded, and the replica-side "latest VLSN" value is marked
 * as uninitialized.
 */
public synchronized void initAsReplica() {
    final VLSNAwaitLatch pendingLatch = vlsnPutLatch;
    if (pendingLatch != null) {
        /* Release whoever is still blocked on the latch. */
        pendingLatch.terminate();
        vlsnPutLatch = null;
    }
    putWaitVLSN = null;

    /* A replica does not allocate VLSNs. */
    nextVLSNCounter = null;
    replicaLatestVLSNSeq = VLSN.UNINITIALIZED_VLSN_SEQUENCE;
}
/*
 * Allocates the VLSN that tags the next replicated log entry, by advancing
 * the counter and wrapping the new value. Callers must hold the log write
 * latch.
 */
public VLSN bump() {
    final long nextSequence = nextVLSNCounter.incrementAndGet();
    return new VLSN(nextSequence);
}
/**
 * Returns the latest VLSN sequence known to this node: the current value
 * of the allocation counter when one is installed, otherwise the value
 * held in {@code replicaLatestVLSNSeq}.
 *
 * This method is not synchronized, while {@link #initAsReplica()} may set
 * the counter field to null at any time. The field is therefore read
 * exactly once into a local, so that the null check and the subsequent
 * use always see the same reference.
 */
public long getLatestAllocatedVal() {
    final AtomicLong counter = nextVLSNCounter;
    return (counter != null) ? counter.get() : replicaLatestVLSNSeq;
}
/**
 * Records the given sequence as the replica-side latest VLSN sequence.
 *
 * @param seq the value to store
 */
public void setReplicaLatestVLSNSeq(long seq) {
    this.replicaLatestVLSNSeq = seq;
}
/*
 * Registers the VLSN->LSN mapping carried by a newly logged item. Invoked
 * from within the LogManager log() call, but outside the log write latch.
 * It must not do any logging of its own and should not cause I/O.
 */
public void put(LogItem logItem) {
    final VLSN newVlsn = logItem.header.getVLSN();
    final long newLsn = logItem.lsn;
    final byte newType = logItem.header.getType();

    logItemCache.put(newVlsn, logItem);

    synchronized (this) {
        tracker.track(newVlsn, newLsn, newType);

        synchronized (mappingSynchronizer) {
            /*
             * Calls to put() can arrive out of order: a feeder may be
             * waiting for VLSN 100 while put(101) shows up before
             * put(100). So the waiter is released whenever the incoming
             * VLSN is at or beyond the one being waited for.
             */
            final boolean releaseWaiter =
                (vlsnPutLatch != null) &&
                (newVlsn.compareTo(putWaitVLSN) >= 0);

            if (releaseWaiter) {
                vlsnPutLatch.setLogItem(logItem);
                vlsnPutLatch.countDown();
                vlsnPutLatch = null;
                putWaitVLSN = null;
            }
        }
    }

    if (logger.isLoggable(Level.FINEST)) {
        LoggerUtils.finest(logger, envImpl, "vlsnIndex put " + newVlsn);
    }
}
/**
 * Blocks until the given vlsn, or a higher numbered one, has been
 * registered in the VLSN index.
 *
 * @param vlsn the VLSN to wait for
 * @param waitTime maximum time to wait, in milliseconds
 *
 * @return the LogItem associated with the vlsn, or null if the entry is
 * now present in the log, but is not available in the LogItemCache.
 *
 * @throws InterruptedException
 * @throws WaitTimeOutException if the VLSN did not appear within waitTime
 * or the latch was explicitly terminated.
 */
public LogItem waitForVLSN(VLSN vlsn, int waitTime)
    throws InterruptedException, WaitTimeOutException {

    /* Fast path: consult the volatile range without taking any lock. */
    if (tracker.getRange().getLast().compareTo(vlsn) >= 0) {
        return logItemCache.get(vlsn);
    }

    final VLSNAwaitLatch latch;
    synchronized (this) {
        /* Re-check under the lock; the mapping may have just arrived. */
        if (tracker.getRange().getLast().compareTo(vlsn) >= 0) {
            return logItemCache.get(vlsn);
        }

        synchronized (mappingSynchronizer) {
            /* Still missing: register the wait and grab the latch. */
            setupWait(vlsn);
            latch = vlsnPutLatch;
        }
    }

    /*
     * The actual wait happens with no locks held. Should the VLSN have
     * arrived in the meantime, the latch is already counted down and the
     * await returns at once.
     */
    final boolean countedDown =
        latch.await(waitTime, TimeUnit.MILLISECONDS);
    if (!countedDown || latch.isTerminated()) {
        /* Either the wait timed out or the latch was terminated. */
        throw new WaitTimeOutException();
    }

    if (tracker.getRange().getLast().compareTo(vlsn) < 0) {
        throw EnvironmentFailureException.
            unexpectedState(envImpl, "Waited for vlsn:" + vlsn +
                            " should be greater than last in range:" +
                            tracker.getRange().getLast());
    }

    /* After a successful wait the latch always carries a log item. */
    final LogItem delivered = latch.getLogItem();
    if (delivered.header.getVLSN().equals(vlsn)) {
        return delivered;
    }

    /*
     * A later VLSN reached the index before the requested one (an out of
     * order put). The log item cache is deliberately not consulted: given
     * how the out of order put came about, the item is unlikely to be
     * there and a failed lookup is not worth its cost.
     */
    return null;
}
/**
 * Test-only accessor for the VLSN currently being waited on.
 *
 * @return the value of putWaitVLSN, which may be null
 */
synchronized VLSN getPutWaitVLSN() {
    return this.putWaitVLSN;
}
/**
 * Establishes the wait context for a VLSN that has not been registered
 * yet. If no latch exists, one is created for the given VLSN. If a latch
 * already exists, the given VLSN must be the one already being waited on.
 *
 * @param vlsn the VLSN the caller is about to wait for
 */
private void setupWait(VLSN vlsn) {
    if (vlsnPutLatch != null) {
        /* Only one VLSN can be waited on at any time. */
        if (vlsn.equals(putWaitVLSN)) {
            return;
        }
        throw EnvironmentFailureException.unexpectedState
            (envImpl, "unexpected get for VLSN: " + vlsn +
             " already waiting for VLSN: " + putWaitVLSN +
             " current range=" + getRange());
    }

    putWaitVLSN = vlsn;
    vlsnPutLatch = new VLSNAwaitLatch();
}
/**
 * Blocks truncation of the head (lower bound) of the index range; called
 * at the start of a syncup. Once this returns, the head of the range can
 * no longer change and the files in the range will not be deleted. Handing
 * the returned value to {@link
 * com.sleepycat.je.cleaner.FileProtector#removeFileProtection} lifts the
 * protection again, so the head may move and files may be deleted.
 *
 * A syncup must not synchronize on the VLSNIndex, because it could then
 * block behind an expensive operation such as truncation. Protection of
 * the range head is therefore synchronized on the VLSNTracker, to which
 * this call is delegated.
 *
 * @param lockerName the name of the protecting entity, i.e., the syncup,
 * to be used in LogSizeStats.
 *
 * @return the ProtectedFileSet protecting the files in the VLSNIndex
 * range.
 */
public ProtectedFileSet protectRangeHead(String lockerName) {
    final ProtectedFileSet protection =
        tracker.protectRangeHead(lockerName);
    return protection;
}
/**
 * Reports the file at the lower bound of the current range, as supplied
 * by the tracker. No synchronization is done here.
 */
public long getProtectedRangeStartFile() {
    final long startFile = tracker.getProtectedRangeStartFile();
    return startFile;
}
/**
 * Try to advance the VLSNIndex ProtectedFileRange and truncate the head
 * of the VLSNIndex range, so that bytesNeeded can be freed by deleting
 * files in this range.
 *
 * Remove all information from the VLSNIndex for {@literal VLSNs <=
 * deleteEnd}, where deleteEnd is chosen by the tracker. Used by log
 * cleaning. To properly coordinate with readers of the VLSNIndex, the
 * range (held by the tracker) is updated before the buckets in the
 * database are.
 *
 * We assume that deleteEnd is always the last vlsn in a file, and because
 * of that, truncations will never split a bucket.
 *
 * A truncation may leave a gap at the head of the vlsn index though.
 * This could occur if the buckets have a gap, due to out of order VLSNs.
 * For example, it's possible that the index has these buckets:
 *
 * bucket A: firstVLSN = 10, lastVLSN = 20
 * bucket B: firstVLSN = 22, lastVLSN = 30
 *
 * If we truncate the index at 20 (deleteEnd == 20), then the resulting
 * start of the range is 21, but the first bucket value is 22. In this
 * case, we need to insert a ghost bucket.
 *
 * This method ensures that any changes are fsynced to disk before file
 * deletion occurs. [#20702]
 *
 * @param bytesNeeded the number of bytes the caller would like to free
 *
 * @return true if the range head was truncated, false if no change to the
 * range was needed or possible.
 */
public synchronized boolean tryTruncateFromHead(final long bytesNeeded) {

    /*
     * The pair is {deleteEnd, deleteFileNum}. It must be declared with its
     * type arguments: with a raw Pair, first() and second() would yield
     * Object and could not be passed on below.
     */
    final Pair<VLSN, Long> truncateInfo =
        tracker.tryTruncateFromHead(bytesNeeded, logItemCache);

    if (truncateInfo == null) {
        /* No change to the range was needed/possible. */
        return false;
    }

    truncateDatabaseFromHead(truncateInfo.first(), truncateInfo.second());
    return true;
}
/**
 * Variant of {@link #tryTruncateFromHead(long)} in which the caller names
 * the exact {deleteEnd, deleteFileNum} to truncate at.
 *
 * @param deleteEnd the last VLSN to be truncated
 * @param deleteFileNum the file number paired with deleteEnd
 *
 * @return true if the range head was truncated, false if the tracker
 * reported that no change was needed or possible.
 */
public synchronized boolean tryTruncateFromHead(final VLSN deleteEnd,
                                                final long deleteFileNum) {
    final boolean rangeChanged =
        tracker.tryTruncateFromHead(deleteEnd, deleteFileNum, logItemCache);

    if (rangeChanged) {
        /* The tracker moved the range; bring the database in line. */
        truncateDatabaseFromHead(deleteEnd, deleteFileNum);
    }
    return rangeChanged;
}
/**
 * Unconditionally truncates the head of the VLSNIndex range. Meant for
 * situations where the environment is quiescent and truncation is known
 * to be safe.
 *
 * @param deleteEnd the last VLSN to be truncated.
 *
 * @param deleteFileNum the file having deleteEnd as its last VLSN.
 */
public synchronized void truncateFromHead(VLSN deleteEnd,
                                          long deleteFileNum) {
    LoggerUtils.fine(logger, envImpl,
                     "head truncate with " + deleteEnd +
                     " delete file#:" + deleteFileNum);

    /*
     * The range acts as the gatekeeper, so the tracker cache is adjusted
     * first and the database only afterwards.
     */
    final boolean rangeChanged =
        tracker.truncateFromHead(deleteEnd, deleteFileNum, logItemCache);

    if (rangeChanged) {
        truncateDatabaseFromHead(deleteEnd, deleteFileNum);
    }
    /* Otherwise no change to the range was needed. */
}
/**
 * Brings the on-disk mapping database in line with a head truncation that
 * the tracker has already applied, and makes the result durable.
 *
 * @param deleteEnd the last VLSN that was truncated
 * @param deleteFileNum the file number paired with deleteEnd
 */
private synchronized void truncateDatabaseFromHead(VLSN deleteEnd,
                                                   long deleteFileNum) {
    /*
     * Be sure that the changes are fsynced before deleting any files. The
     * changed vlsn index must be persisted so that there are no references
     * to the deleted, cleaned files. Instead of using COMMIT_SYNC, use
     * COMMIT_NO_SYNC with an explicit environment flush and fsync, because
     * the latter ends the txn and releases locks sooner, and reduces
     * possible lock contention on the VLSNIndex. Both feeders and write
     * operations need to lock the VLSNIndex, so keeping lock contention
     * minimal is essential.
     * [#20702]
     */
    final TransactionConfig config = new TransactionConfig();
    config.setDurability(Durability.COMMIT_NO_SYNC);
    final Txn txn = Txn.createLocalTxn(envImpl, config);

    boolean committed = false;
    try {
        synchronized (flushSynchronizer) {
            pruneDatabaseHead(deleteEnd, deleteFileNum, txn);
            flushToDatabase(txn);
        }
        txn.commit();
        committed = true;
    } finally {
        /*
         * Abort only a txn that did not commit. The flag is set right
         * after commit() so that a later failure can never lead to an
         * abort() of an already committed txn, which could hide the
         * original exception.
         */
        if (!committed) {
            txn.abort();
        }
    }

    /* Reached only after a successful commit. */
    envImpl.flushLog(true /*fsync required*/);
}
/**
* Remove all information from the VLSNIndex for {@literal VLSNs >=
* deleteStart}. Used by replica side syncup, when the log is
* truncated. Assumes that the vlsnIndex is quiescent, and no writes are
* happening, although the cleaner may read the vlsnIndex.
*
* The order of work is: clear the log item cache, truncate the tracker,
* prune the database, reset the tracker's on-disk boundary, make sure the
* new end of the range has a mapping, then flush and commit.
*
* @param deleteStart the first VLSN to be removed; the new end of the
* range is deleteStart.getPrev().
* @param lastLsn the LSN used when a mapping has to be created for the new
* end of the range; it is also handed to the tracker and to the database
* pruning step.
* @throws DatabaseException
*/
public synchronized void truncateFromTail(VLSN deleteStart, long lastLsn)
throws DatabaseException {
/*
* The cached log items are dropped unconditionally, i.e. even when the
* check below finds that nothing needs to be truncated.
*/
logItemCache.clear();
VLSNRange currentRange = tracker.getRange();
if (currentRange.getLast().getNext().equals(deleteStart)) {
/*
* deleteStart directly follows what's in this range, no need to
* delete anything.
*/
return;
}
/*
* The VLSNIndex has two parts -- the in-memory cache, and the
* database. Update the tracker, which holds the cache first, and then
* update the database.
*/
tracker.truncateFromTail(deleteStart, lastLsn);
TransactionConfig config = new TransactionConfig();
/*
* Be sure to commit synchronously so that changes to the vlsn index
* are persisted before the log is truncated. There are no feeders or
* repstream write operations at this time, so the use of COMMIT_SYNC
* does not introduce any lock contention. [#20702]
*/
config.setDurability(Durability.COMMIT_SYNC);
Txn txn = Txn.createLocalTxn(envImpl, config);
/* Stays false unless commit() returns normally; drives the abort below. */
boolean success = false;
try {
/*
* The tracker knows the boundary between VLSNs that are on disk
* and VLSNs that are within its cache, and maintains that info
* as mappings are added, and as the tracker/cache is flushed.
* But since we're potentially truncating mappings that were on
* disk, we need to update the tracker's notion of where the flush
* boundary is.
*/
VLSN lastOnDisk = pruneDatabaseTail(deleteStart, lastLsn, txn);
tracker.setLastOnDiskVLSN(lastOnDisk);
/*
* Because mappings can come out of order, it's possible that
* buckets are not completely contiguous, and that truncating
* will result in the loss of the mapping for the end of the range.
* For example, suppose the buckets are like this:
* On disk: vlsn 13 -> bucket for vlsns 13-16
* In tracker: vlsn 18 -> bucket for vlsns 18 -23
* Truncating the vlsnIndex at 18 will make the last VLSN become
* 17, and removing the vlsn 18 bucket will result in no mapping
* for the new end range, vlsn 17. If so, the tracker should
* create a new mapping, of vlsn 17 -> lastlsn, to cap off the
* range and ensure that there are mappings for the start and end
* lsns.
*/
tracker.ensureRangeEndIsMapped(deleteStart.getPrev(), lastLsn);
flushToDatabase(txn);
txn.commit();
success = true;
} finally {
/* Any failure before the commit completed rolls the txn back. */
if (!success) {
txn.abort();
}
}
}
/**
 * Returns the tracker's current range. The range points (first, last,
 * etc) are meant to be viewed together as one consistent group. For that
 * reason VLSNIndex offers no getLastVLSN or getFirstVLSN style methods,
 * which would invite reading range points from two different range sets.
 */
public VLSNRange getRange() {
    final VLSNRange currentRange = tracker.getRange();
    return currentRange;
}
/**
 * Returns a clone of the statistics group of the VLSNIndex.
 *
 * @param config supplies the clear flag passed on to the clone operation
 *
 * @return the vlsn statistics.
 */
public StatGroup getStats(StatsConfig config) {
    final boolean clear = config.getClear();
    return statistics.cloneGroup(clear);
}
/**
 * Return the nearest file number {@literal <=} the log file that houses
 * this VLSN. This method is meant to be efficient and will not incur
 * I/O. If no exact answer is available, it does an approximation. The
 * requested VLSN must be within the VLSNIndex range.
 *
 * @param vlsn the VLSN to look up
 * @throws DatabaseException
 */
public long getLTEFileNumber(VLSN vlsn)
    throws DatabaseException {

    return getLTEBucket(vlsn).getLTEFileNumber();
}
/**
 * Returns the file number reported by the bucket that owns or follows
 * this VLSN. The caller must ensure that the requested VLSN is within
 * the VLSNIndex range; a valid bucket is assumed to exist.
 *
 * @param vlsn the VLSN to locate.
 * @return the file number obtained from the bucket.
 * @throws DatabaseException
 */
public long getGTEFileNumber(VLSN vlsn)
    throws DatabaseException {

    return getGTEBucket(vlsn, null).getGTEFileNumber();
}
/**
 * Returns the LSN that the owning (or following) bucket reports for
 * this VLSN. The requested VLSN must be within the VLSNIndex range; a
 * valid bucket is assumed to exist.
 *
 * @param vlsn the VLSN to locate.
 * @return the LSN obtained from the bucket.
 */
public long getGTELsn(VLSN vlsn) {
    final VLSNBucket owner = getGTEBucket(vlsn, null);
    final long gteLsn = owner.getGTELsn(vlsn);
    return gteLsn;
}
/**
 * Returns the vlsnBucket that owns this VLSN or, if there is no such
 * bucket, the bucket that follows this VLSN. Must always return a
 * bucket. The tracker is asked first; the mapping database is searched
 * only when the tracker has no answer.
 *
 * This method is intentionally unsynchronized. There is a remote chance
 * that it observes the VLSNIndex while truncateFromTail() is in
 * progress and, should the index contain non-contiguous buckets, sees
 * it in a logically inconsistent state. In that case the caller gets an
 * EnvironmentFailureException. The window is exceedingly small (log
 * cleaning and a rollback must collide in a very particular way), the
 * problem is transient, and adding synchronization would complicate the
 * locking hierarchy, so the risk is accepted. [#23491]
 *
 * @param vlsn the VLSN to locate.
 * @param currentBucketInUse is used only for debugging, to add to the
 * error message if the database search for a GTE bucket fails.
 * @return the owning or following bucket.
 * @throws DatabaseException
 */
public VLSNBucket getGTEBucket(VLSN vlsn, VLSNBucket currentBucketInUse)
    throws DatabaseException {

    final VLSNBucket tracked = tracker.getGTEBucket(vlsn);
    return (tracked != null) ?
        tracked :
        getGTEBucketFromDatabase(vlsn, currentBucketInUse);
}
/**
 * Returns the vlsnBucket that owns this VLSN or, if there is no such
 * bucket, the bucket that precedes this VLSN. Must always return a
 * bucket. The tracker is asked first; the mapping database is searched
 * only when the tracker has no answer.
 *
 * @param vlsn the VLSN to locate.
 * @return the owning or preceding bucket.
 * @throws DatabaseException
 */
VLSNBucket getLTEBucket(VLSN vlsn)
    throws DatabaseException {

    final VLSNBucket tracked = tracker.getLTEBucket(vlsn);
    return (tracked != null) ? tracked : getLTEBucketFromDatabase(vlsn);
}
/**
 * Tells whether a cursor operation landed on a bucket record. Recall
 * that the VLSNRange is stored in the same database, under the key
 * VLSNRange.RANGE_KEY, so a successful read is not necessarily a
 * bucket.
 *
 * @param status the status returned by the cursor operation.
 * @param key the key filled in by the cursor operation; only decoded
 * when the status is SUCCESS.
 * @return true if the status is SUCCESS and the key is not the range
 * record's key.
 */
private boolean isValidBucket(OperationStatus status,
                              DatabaseEntry key) {
    if (status != OperationStatus.SUCCESS) {
        return false;
    }
    final long keyValue = LongBinding.entryToLong(key);
    return keyValue != VLSNRange.RANGE_KEY;
}
/*
 * Read from the mapping database the bucket that matches this VLSN. If
 * this vlsn is Y, then we want the bucket at key X where X <= Y. A
 * non-null bucket is always returned; if none can be found the
 * situation is reported as an unexpected state.
 *
 * The cursor and its locker are always released before returning.
 */
public VLSNBucket getLTEBucketFromDatabase(VLSN vlsn)
    throws DatabaseException {

    final DatabaseEntry foundKey = new DatabaseEntry();
    final DatabaseEntry foundData = new DatabaseEntry();
    Cursor dbCursor = null;
    Locker readLocker = null;
    try {
        readLocker = BasicLocker.createBasicLocker(envImpl);
        dbCursor = makeCursor(readLocker);

        if (!positionBeforeOrEqual(dbCursor, vlsn, foundKey, foundData)) {
            /* Shouldn't get here. */
            throw EnvironmentFailureException.unexpectedState
                (envImpl, "Couldn't find bucket for LTE VLSN " + vlsn +
                 " in database. tracker=" + tracker);
        }

        return VLSNBucket.readFromDatabase(foundData);
    } finally {
        if (dbCursor != null) {
            dbCursor.close();
        }
        if (readLocker != null) {
            readLocker.operationEnd(true);
        }
    }
}
/**
 * Return the bucket that holds a mapping >= this VLSN, reading from the
 * mapping database. If this method is called, we guarantee that a
 * non-null bucket will be returned; failure to find one is reported as
 * an unexpected state after the database contents have been logged.
 *
 * At this point, we are sure that the target vlsn is within the range of
 * vlsns held in the database. However, there is no explicit
 * synchronization between this search and VLSNTracker.flushToDatabase,
 * which might be appending buckets while we look. That matters whenever
 * the cursor search does not find an exact key match. [#20726]
 *
 * For example, suppose the database looks like this:
 *   key=vlsn 10, data = bucket: vlsn 10 -> lsn 0x10/100
 *                               vlsn 15 -> lsn 0x10/150
 *   key=vlsn 20, data = bucket: vlsn 20 -> lsn 0x11/100
 *                               vlsn 25 -> lsn 0x11/150
 * A search for vlsn 22 via cursor.getSearchKeyRange(key=22) finds
 * nothing. While we then fall back to examining the last bucket, a new
 * bucket such as
 *
 *   key=vlsn 30, data = bucket: vlsn 30 -> lsn 0x12/100
 *                               vlsn 35 -> lsn 0x12/150
 *
 * may have been written, and the fallback must cope with that.
 *
 * @param target the VLSN to locate.
 * @param currentBucketInUse only included in the failure message.
 */
private VLSNBucket getGTEBucketFromDatabase(VLSN target,
                                            VLSNBucket currentBucketInUse)
    throws DatabaseException {

    Cursor dbCursor = null;
    Locker readLocker = null;
    try {
        readLocker = BasicLocker.createBasicLocker(envImpl);
        dbCursor = makeCursor(readLocker);

        /* First attempt: a bucket at key >= target, null if none. */
        final VLSNBucket firstTry = examineGTEBucket(target, dbCursor);
        if (firstTry != null) {
            return firstTry;
        }

        /*
         * No bucket at a key >= target. Look at the last bucket in the
         * database, which is expected to be either
         *
         * 1) a bucket whose key is < target but which owns the mapping,
         *    e.g. for target 22:
         *      key=vlsn 20, data = bucket: vlsn 20 -> lsn 0x11/100
         *                                  vlsn 25 -> lsn 0x11/150
         * 2) a bucket that is > or >= target because
         *    VLSNTracker.flushToDatabase appended to the index while
         *    this search was running, e.g.
         *      key=vlsn 30, data = bucket: vlsn 30 -> lsn 0x12/100
         *                                  vlsn 35 -> lsn 0x12/150
         */
        assert(TestHookExecute.doHookIfSet(searchGTEHook));
        final DatabaseEntry key = new DatabaseEntry();
        final DatabaseEntry data = new DatabaseEntry();
        VLSNBucket endBucket = null;
        OperationStatus lastStatus =
            dbCursor.getLast(key, data, LockMode.DEFAULT);
        if (isValidBucket(lastStatus, key)) {
            endBucket = VLSNBucket.readFromDatabase(data);

            /* Case 1: the last bucket owns the target. */
            if (endBucket.owns(target)) {
                return endBucket;
            }

            /*
             * Case 2: the last bucket follows the target, so it must
             * have been inserted by a concurrent flush after the first
             * range search. Repeat the search for the owning bucket.
             */
            if (endBucket.follows(target)) {
                final VLSNBucket retry = examineGTEBucket(target, dbCursor);
                if (retry != null) {
                    return retry;
                }
            }
        }

        /*
         * Shouldn't get here! There should have been a bucket in this
         * database >= this target. Dump the whole database for
         * debugging; the first record is decoded as the range, all
         * later ones as buckets.
         */
        final StringBuilder dump = new StringBuilder();
        boolean atFirstRecord = true;
        for (lastStatus = dbCursor.getFirst(key, data, LockMode.DEFAULT);
             lastStatus == OperationStatus.SUCCESS;
             lastStatus = dbCursor.getNext(key, data, LockMode.DEFAULT)) {

            dump.append("key => ")
                .append(LongBinding.entryToLong(key))
                .append("\n");
            if (atFirstRecord) {
                dump.append("range =>")
                    .append(VLSNRange.readFromDatabase(data))
                    .append("\n");
                atFirstRecord = false;
            } else {
                dump.append("bucket => ")
                    .append(VLSNBucket.readFromDatabase(data))
                    .append("\n");
            }
        }
        LoggerUtils.severe(logger, envImpl, "VLSNIndex Dump: " +
                           dump.toString());

        throw EnvironmentFailureException.unexpectedState
            (envImpl, "Couldn't find bucket for GTE VLSN " + target +
             " in database. EndBucket=" + endBucket + "currentBucket=" +
             currentBucketInUse + " tracker = " + tracker);
    } finally {
        if (dbCursor != null) {
            dbCursor.close();
        }
        if (readLocker != null) {
            readLocker.operationEnd(true);
        }
    }
}
/**
 * Searches for a bucket whose key is >= the target and checks whether
 * it owns the target. If it does not, the preceding bucket is checked
 * as well. When neither owns the target, the greater bucket is
 * returned.
 *
 * @param target the VLSN to locate.
 * @param cursor an open cursor on the mapping database; it is
 * repositioned by this call.
 * @return null if no bucket exists at a key >= target.
 */
private VLSNBucket examineGTEBucket(VLSN target, Cursor cursor) {

    final DatabaseEntry searchKey = new DatabaseEntry();
    final DatabaseEntry found = new DatabaseEntry();
    LongBinding.longToEntry(target.getSequence(), searchKey);

    /* getSearchKeyRange will return a bucket >= target if one exists */
    final OperationStatus searchStatus =
        cursor.getSearchKeyRange(searchKey, found, LockMode.DEFAULT);
    if (searchStatus != OperationStatus.SUCCESS) {
        /* No bucket at a key >= the target. */
        return null;
    }

    final VLSNBucket gteBucket = VLSNBucket.readFromDatabase(found);
    if (gteBucket.owns(target)) {
        return gteBucket;
    }

    /* The bucket found is greater than the target; try the one before. */
    final OperationStatus prevStatus =
        cursor.getPrev(searchKey, found, LockMode.DEFAULT);
    if (isValidBucket(prevStatus, searchKey)) {
        final VLSNBucket prevBucket = VLSNBucket.readFromDatabase(found);
        if (prevBucket.owns(target)) {
            return prevBucket;
        }
    }

    /* No bucket owns this target, return the greater one. */
    return gteBucket;
}
/*
 * Position this cursor at the largest value bucket which is <= the
 * target VLSN. On a true return, key and data hold the record the
 * cursor is positioned on.
 *
 * @return true if there is a bucket that fits this criteria,
 */
private boolean positionBeforeOrEqual(Cursor cursor,
                                      VLSN vlsn,
                                      DatabaseEntry key,
                                      DatabaseEntry data)
    throws DatabaseException {

    LongBinding.longToEntry(vlsn.getSequence(), key);

    /* getSearchKeyRange will give us a bucket >= Y. */
    final OperationStatus searchStatus =
        cursor.getSearchKeyRange(key, data, LockMode.DEFAULT);

    if (searchStatus != OperationStatus.SUCCESS) {
        /*
         * There was no bucket >= Y. Use the last record in the database
         * instead, provided it is a bucket; it should be one that's < Y.
         */
        final OperationStatus lastStatus =
            cursor.getLast(key, data, LockMode.DEFAULT);
        return isValidBucket(lastStatus, key);
    }

    if (VLSNBucket.readFromDatabase(data).owns(vlsn)) {
        return true;
    }

    /*
     * The bucket we found is > than our VLSN. Step back to the previous
     * record; if that is not a bucket there is nothing else in the
     * database.
     */
    final OperationStatus prevStatus =
        cursor.getPrev(key, data, LockMode.DEFAULT);
    return isValidBucket(prevStatus, key);
}
/*
 * Position this cursor at the bucket that owns the target VLSN or,
 * failing that, at the smallest value bucket which is >= the target
 * VLSN. On a true return, key and data hold the record the cursor is
 * positioned on.
 *
 * @return true if there is a bucket that fits this criteria,
 */
private boolean positionAfterOrEqual(Cursor cursor,
                                     VLSN vlsn,
                                     DatabaseEntry key,
                                     DatabaseEntry data)
    throws DatabaseException {

    LongBinding.longToEntry(vlsn.getSequence(), key);

    /* getSearchKeyRange will give us a bucket >= Y. */
    OperationStatus status =
        cursor.getSearchKeyRange(key, data, LockMode.DEFAULT);

    if (status != OperationStatus.SUCCESS) {
        /*
         * There was no bucket >= Y. The last bucket in the database
         * should be one that's < Y; it only qualifies if it owns Y.
         */
        status = cursor.getLast(key, data, LockMode.DEFAULT);
        return isValidBucket(status, key) &&
            VLSNBucket.readFromDatabase(data).owns(vlsn);
    }

    if (VLSNBucket.readFromDatabase(data).owns(vlsn)) {
        return true;
    }

    /*
     * This bucket is > our VLSN. Check the record before it, which may
     * be
     *  - a bucket that owns this VLSN,
     *  - a bucket that precedes this VLSN, or
     *  - the range record.
     * One way or another, there should always be a record before any
     * bucket -- it's the range.
     */
    status = cursor.getPrev(key, data, LockMode.DEFAULT);
    assert status == OperationStatus.SUCCESS;
    if (isValidBucket(status, key) &&
        VLSNBucket.readFromDatabase(data).owns(vlsn)) {
        return true;
    }

    /*
     * Move back to the original bucket, the preceding record was
     * unsatisfactory. The status of this move is not inspected.
     */
    cursor.getNext(key, data, LockMode.DEFAULT);
    return true;
}
/*
* Remove all VLSN->LSN mappings <= deleteEnd from the mapping database,
* using a cursor opened under the supplied txn.
*
* The bucket located by positionBeforeOrEqual and every bucket before it
* are deleted; the walk backwards stops at the range record. Afterwards
* the first remaining bucket is examined: if it does not begin at
* deleteEnd+1, a GhostBucket placeholder is written so that there is a
* bucket matching the new start of the range.
*
* @param deleteEnd the last VLSN whose mapping is to be removed.
* @param deleteFileNum passed to FileManager.getFollowingFileNum to obtain
* the file number used for the ghost bucket's first LSN.
* @param txn the transaction under which the cursor operates.
*/
private void pruneDatabaseHead(VLSN deleteEnd,
long deleteFileNum,
Txn txn)
throws DatabaseException {
Cursor cursor = null;
try {
cursor = makeCursor(txn);
DatabaseEntry key = new DatabaseEntry();
DatabaseEntry data = new DatabaseEntry();
if (!positionBeforeOrEqual(cursor, deleteEnd, key, data)) {
/* No bucket at or before deleteEnd: nothing to do. */
return;
}
/* Delete this bucket and everything before this bucket. */
/* Avoid fetching the bucket itself, since it's not needed */
final DatabaseEntry noData = new DatabaseEntry();
noData.setPartial(0, 0, true);
int deleteCount = 0;
do {
long keyValue = LongBinding.entryToLong(key);
/* The range record is not a bucket; stop before deleting it. */
if (keyValue == VLSNRange.RANGE_KEY) {
break;
}
OperationStatus status = cursor.delete();
/* Note: the count is bumped before the status is checked. */
deleteCount++;
if (status != OperationStatus.SUCCESS) {
throw EnvironmentFailureException.unexpectedState
(envImpl, "Couldn't delete, got status of " + status +
" for delete of bucket " + keyValue + " deleteEnd=" +
deleteEnd);
}
} while (cursor.getPrev(key, noData, LockMode.DEFAULT) ==
OperationStatus.SUCCESS);
nHeadBucketsDeleted.add(deleteCount);
/*
* Check the first real bucket, and see if we need to insert
* a ghost bucket.
*/
VLSN newStart = deleteEnd.getNext();
/* Search from key 1 so that the range record is skipped. */
LongBinding.longToEntry(1, key);
OperationStatus status =
cursor.getSearchKeyRange(key, data, LockMode.DEFAULT);
/* No real buckets, nothing to adjust. */
if (status != OperationStatus.SUCCESS) {
return;
}
VLSNBucket firstBucket = VLSNBucket.readFromDatabase(data);
/* First bucket matches the range, nothing to adjust. */
if (firstBucket.getFirst().equals(newStart)) {
return;
}
/*
* A remaining bucket that starts before newStart is unexpected at
* this point and is reported as a failure.
*/
if (firstBucket.getFirst().compareTo(newStart) < 0) {
throw EnvironmentFailureException.unexpectedState
(envImpl, "newStart " + newStart +
" should be < first bucket:" + firstBucket);
}
/*
* Add a ghost bucket so that there is a bucket to match the
* first item in the range. It is built from newStart, offset 0 of
* the file following deleteFileNum, and the LSN of the first
* mapping in the first remaining bucket.
*/
long nextFile = envImpl.getFileManager().
getFollowingFileNum(deleteFileNum,
true /* forward */);
long lastPossibleLsn = firstBucket.getLsn(firstBucket.getFirst());
VLSNBucket placeholder =
new GhostBucket(newStart, DbLsn.makeLsn(nextFile, 0),
lastPossibleLsn);
placeholder.writeToDatabase(envImpl, cursor);
} finally {
if (cursor != null) {
cursor.close();
}
}
}
/*
* Remove all VLSN->LSN mappings >= deleteStart. Recall that the
* mappingDb is keyed by the first VLSN in the bucket. The replication
* stream will be quiescent when this is called. The caller must be
* sure that there are buckets in the database that cover deleteStart.
*
* A bucket that straddles deleteStart is trimmed and rewritten in place;
* every bucket from there on is deleted.
*
* @param deleteStart the first VLSN whose mapping is to be removed.
* @param lastLsn is the location, if known, of the vlsn at deleteStart -1.
* If the location is not known, NULL_LSN is used. In that case the pruning
* may need to delete mappings < deleteStart, in order to keep the bucket
* capped with a legitimate lastLSN. If lastLsn is not NULL_LSN, then the
* deletion can precisely delete only mappings >= deleteStart, because it
* can always create a new deleteStart-1 -> lastLsn mapping to cap off the
* end range.
* @param txn the transaction under which the cursor operates.
* @return lastVLSN left on disk.
*/
VLSN pruneDatabaseTail(VLSN deleteStart, long lastLsn, Txn txn)
throws DatabaseException {
/*
* At this point, the tracker is accurate as to which vlsn is last
* on disk.
*/
VLSN lastOnDiskVLSN = tracker.getLastOnDisk();
Cursor cursor = null;
try {
cursor = makeCursor(txn);
DatabaseEntry key = new DatabaseEntry();
DatabaseEntry data = new DatabaseEntry();
if (!positionAfterOrEqual(cursor, deleteStart, key, data)) {
/*
* No bucket that matches this criteria, everything on disk is
* < deleteStart, nothing to do.
*/
return lastOnDiskVLSN;
}
/*
* Does this bucket straddle deleteStart? Then prune off part of
* the bucket.
*/
VLSNBucket bucket = VLSNBucket.readFromDatabase(data);
if (bucket.getFirst().compareTo(deleteStart) < 0) {
bucket.removeFromTail(deleteStart, lastLsn);
lastOnDiskVLSN = bucket.getLast();
/* Write the trimmed bucket back over the current record. */
bucket.fillDataEntry(data);
OperationStatus status = cursor.putCurrent(data);
if (status != OperationStatus.SUCCESS) {
throw EnvironmentFailureException.unexpectedState
(envImpl, "Couldn't update " + bucket);
}
/*
* Step past the trimmed bucket. If nothing follows it, the
* trimmed bucket is the last one on disk and we are done.
*/
status = cursor.getNext(key, data, LockMode.DEFAULT);
if (status != OperationStatus.SUCCESS) {
return lastOnDiskVLSN;
}
}
/* Delete the current bucket and everything after it. */
/* Avoid fetching the bucket itself, since it's not needed */
final DatabaseEntry noData = new DatabaseEntry();
noData.setPartial(0, 0, true);
int deleteCount = 0;
do {
OperationStatus status = cursor.delete();
if (status != OperationStatus.SUCCESS) {
throw EnvironmentFailureException.unexpectedState
(envImpl, "Couldn't delete after vlsn " + deleteStart +
" status=" + status);
}
deleteCount++;
} while (cursor.getNext(key, noData, LockMode.DEFAULT) ==
OperationStatus.SUCCESS);
nTailBucketsDeleted.add(deleteCount);
/*
* We've deleted some part of what was on disk. See what we're left
* with, and find the last mapping in the last bucket so we can say
* precisely which is the last vlsn mapped on disk and update the
* tracker cache. This last mapping may not be exactly
* deleteStart-1, if there is a gap in the mappings.
*/
OperationStatus status = cursor.getLast(key, data,
LockMode.DEFAULT);
if (isValidBucket(status, key)) {
/* A valid bucket was returned */
bucket = VLSNBucket.readFromDatabase(data);
lastOnDiskVLSN = bucket.getLast();
} else {
/*
* No mappings in the database -- either there is nothing in
* the database, or we only have the special range record at
* key=-1
*/
lastOnDiskVLSN = NULL_VLSN;
}
} finally {
if (cursor != null) {
cursor.close();
}
}
return lastOnDiskVLSN;
}
/**
 * Startup processing for the index:
 * - get a handle onto the internal database which stores the VLSN index
 * - create the tracker, which reads the latest on-disk version
 * - merge in any VLSN->LSN mappings which were not saved in the on-disk
 *   version. These mappings weren't flushed because they occurred after
 *   the checkpoint end. They're found by the recovery procedure, and
 *   are added in now.
 * - set up the protected file range and, if reserved files are missing,
 *   truncate the head of the index.
 *
 * This method will execute when the map is quiescent, and needs no
 * synchronization.
 */
private void init(String mappingDbName,
                  int vlsnStride,
                  int vlsnMaxMappings,
                  int vlsnMaxDistance,
                  RecoveryInfo recoveryInfo)
    throws DatabaseException {

    openMappingDatabase(mappingDbName);
    tracker = new VLSNTracker(envImpl, mappingDbImpl, vlsnStride,
                              vlsnMaxMappings, vlsnMaxDistance,
                              statistics);

    /*
     * Fold the in-memory mappings discovered during recovery into the
     * index, so that mappings preceding this recovery's checkpoint are
     * preserved.
     *
     * For example, suppose the log looks like this:
     *
     * VLSN1
     * VLSN2
     * checkpoint start for this recovery, for the instantiation of the
     *          replicator
     * checkpoint end for this recovery
     * <- the VLSN index is created here, after the env comes up.
     *
     * VLSN1 and VLSN2 were discovered during recovery and are recorded
     * in memory. Normally a checkpoint flushes the VLSNIndex, but the
     * VLSNIndex did not exist yet because it needs an initialized
     * environment.
     */
    final VLSNRecoveryTracker recovered =
        (VLSNRecoveryTracker) recoveryInfo.vlsnProxy;
    merge(recovered);

    /* Initialize ProtectedFileRange after VLSN range is determined. */
    final VLSNRange mergedRange = tracker.getRange();
    if (!mergedRange.isEmpty()) {
        tracker.initProtectedFileRange(
            getLTEFileNumber(mergedRange.getFirst()));
    }

    /*
     * When one or more reserved files are missing, truncate the index so
     * it reflects the VLSN range for existing files.
     */
    final boolean filesMissing =
        !recoveryInfo.lastMissingFileVLSN.isNull();
    if (filesMissing) {
        truncateFromHead(
            recoveryInfo.lastMissingFileVLSN,
            recoveryInfo.lastMissingFileNumber);
    }
}
/*
 * Update this index, which was initialized with what's on disk, with
 * mappings found during recovery. These mappings ought to either overlap
 * what's on disk, or cover the range immediately after what's on disk. If
 * they don't, the recovery mechanism, which flushes the mapping db at
 * checkpoint, is faulty and we've lost mappings.
 *
 * In other words, if this tracker holds the VLSN range a -> c, then the
 * recovery tracker will have the VLSN range b -> d, where
 *
 *   a <= b
 *   c <= d
 *   if c < b, then b == c+1
 *
 * This method must be called when the index and tracker are quiescent, and
 * there are no calls to track().
 *
 * The recoveryTracker is the authoritative voice on what should be in the
 * VLSN index.
 */
void merge(VLSNRecoveryTracker recoveryTracker) {

    /* Nothing was recovered: just persist what the tracker holds. */
    if (recoveryTracker == null) {
        flushToDatabase(Durability.COMMIT_SYNC);
        return;
    }

    if (recoveryTracker.isEmpty()) {

        /*
         * Even though the recovery tracker has no mappings, it may have
         * seen a rollback start that indicates that the VLSNIndex should
         * be truncated. Make the recovery tracker look as if it has a
         * single mapping -- the matchpoint VLSN and LSN -- and proceed.
         * This is preferred over truncating the index directly, because
         * the matchpoint mapping may be needed to cap off the VLSN range.
         *
         * For example, suppose an index has mappings for VLSN 1, 5, 10,
         * and the rollback is going to matchpoint 7. A pure truncation
         * would lop off VLSN 10, making VLSN 5 the last mapping. We
         * would then need to add on VLSN 7.
         */
        final VLSN matchpointVLSN =
            recoveryTracker.getLastMatchpointVLSN();
        if (matchpointVLSN.isNull()) {
            return;
        }

        /*
         * Use a MATCHPOINT log entry to indicate that this is a syncable
         * entry. This purposefully leaves the recovery tracker's range's
         * lastTxnEnd null, so it will not overwrite the on disk
         * tracker. This assumes that we will never rollback past the last
         * txn end.
         */
        recoveryTracker.track(matchpointVLSN,
                              recoveryTracker.getLastMatchpointLsn(),
                              LogEntryType.LOG_MATCHPOINT.getTypeNum());
    }

    /*
     * The mappings held in the recoveryTracker must either overlap what's
     * on disk or immediately follow the last mapping on disk. A gap
     * means something went awry with the checkpoint scheme, which
     * flushes the VLSN index at each checkpoint, and we're in danger of
     * losing mappings. Most importantly, the last txnEnd VLSN in the
     * range might not be right.
     *
     * The one exception is when the Environment has been converted from
     * non-replicated and there are no VLSN entries in the VLSNIndex. In
     * that case a gap is valid: in a newly converted environment the
     * VLSN index range has NULL_VLSN as its last entry, but the first
     * replicated log entry will start with 2.
     *
     * Note: EnvironmentImpl.needRepConvert() would more accurately convey
     * the fact that this is the very first recovery following a
     * conversion. But needRepConvert() on a replica is never true, and we
     * need to disable this check on the replica's first recovery too.
     */
    final VLSN persistentLast = tracker.getRange().getLast();
    final VLSN recoveryFirst = recoveryTracker.getRange().getFirst();
    final boolean convertedWithEmptyIndex =
        envImpl.isRepConverted() && persistentLast.isNull();
    if (!convertedWithEmptyIndex &&
        recoveryFirst.compareTo(persistentLast.getNext()) > 0) {
        throw EnvironmentFailureException.unexpectedState
            (envImpl, "recoveryTracker should overlap or follow on disk " +
             "last VLSN of " + persistentLast + " recoveryFirst= " +
             recoveryFirst);
    }

    final VLSN indexLast = tracker.getRange().getLast();
    if (indexLast.getNext().equals(recoveryFirst)) {
        /* No overlap, just append mappings found at recovery. */
        tracker.append(recoveryTracker);
        flushToDatabase(Durability.COMMIT_SYNC);
        return;
    }

    /*
     * Overlap: the mappings in the recovery tracker overwrite those in
     * the VLSN index. Prune, merge and flush under a single synchronous
     * transaction, which is aborted unless the commit completes.
     */
    final TransactionConfig syncConfig = new TransactionConfig();
    syncConfig.setDurability(Durability.COMMIT_SYNC);
    final Txn txn = Txn.createLocalTxn(envImpl, syncConfig);
    boolean committed = false;
    try {
        final VLSN lastOnDiskVLSN =
            pruneDatabaseTail(recoveryFirst, DbLsn.NULL_LSN, txn);
        tracker.merge(lastOnDiskVLSN, recoveryTracker);
        flushToDatabase(txn);
        txn.commit();
        committed = true;
    } finally {
        if (!committed) {
            txn.abort();
        }
    }
}
/*
 * Look up the internal mapping database by name, creating it as a
 * non-replicated internal database if it does not exist yet, and store
 * the result in mappingDbImpl. Creation is refused for a read-only
 * environment. The auto-txn locker is ended in all cases.
 */
private void openMappingDatabase(String mappingDbName)
    throws DatabaseException {

    final Locker autoLocker =
        Txn.createLocalAutoTxn(envImpl, new TransactionConfig());
    try {
        final DbTree tree = envImpl.getDbTree();
        DatabaseImpl mappingDb = tree.getDb(autoLocker,
                                            mappingDbName,
                                            null /* databaseHandle */,
                                            false);
        if (mappingDb == null) {
            if (envImpl.isReadOnly()) {
                /* This should have been caught earlier. */
                throw EnvironmentFailureException.unexpectedState
                   ("A replicated environment can't be opened read only.");
            }
            final DatabaseConfig nonReplicated = new DatabaseConfig();
            nonReplicated.setReplicated(false);
            mappingDb = tree.createInternalDb(autoLocker, mappingDbName,
                                              nonReplicated);
        }
        mappingDbImpl = mappingDb;
    } finally {
        autoLocker.operationEnd(true);
    }
}
/**
 * Closes the index, flushing it to the database first.
 */
public synchronized void close() {
    final boolean doFlush = true;
    close(doFlush);
}
/**
 * Closes the index without flushing it to the database.
 */
public synchronized void abnormalClose() {
    final boolean doFlush = false;
    close(doFlush);
}
/**
 * Closes the index: optionally flushes the tracker to the database,
 * terminates any outstanding VLSN put latch, releases the mapping
 * database handle and closes the tracker.
 *
 * Cleanup is layered so that a failure in one step does not prevent the
 * following ones: the database handle is released even if the flush
 * fails, and the tracker is closed even if releasing the database
 * fails.
 *
 * @param doFlush if true, the index is flushed with COMMIT_SYNC
 * durability before closing.
 * @throws DatabaseException
 */
public void close(boolean doFlush)
    throws DatabaseException {

    try {
        if (doFlush) {
            flushToDatabase(Durability.COMMIT_SYNC);
        }
        if (vlsnPutLatch != null) {

            /*
             * This should be harmless because the feeders using the latch
             * should all have been interrupted and shutdown. So just log
             * this fact.
             */
            vlsnPutLatch.terminate();
            LoggerUtils.fine
                (logger, envImpl,
                 "Outstanding VLSN put latch cleared at close");
        }
    } finally {
        try {
            if (mappingDbImpl != null) {
                envImpl.getDbTree().releaseDb(mappingDbImpl);
                mappingDbImpl = null;
            }
        } finally {
            /* Must run even when releaseDb throws. */
            tracker.close();
        }
    }
}
/**
 * For unit testing.
 *
 * @return the mapping database handle currently held by this index.
 */
public DatabaseImpl getDatabaseImpl() {
    final DatabaseImpl current = mappingDbImpl;
    return current;
}
/**
 * Mappings are flushed to disk at close, and at checkpoints. The flush
 * runs in its own local transaction, which is committed on success and
 * aborted if the flush or the commit fails.
 *
 * @param useDurability the durability configured on the transaction.
 */
public void flushToDatabase(Durability useDurability) {

    final TransactionConfig flushConfig = new TransactionConfig();
    flushConfig.setDurability(useDurability);
    final Txn flushTxn = Txn.createLocalTxn(envImpl, flushConfig);

    boolean committed = false;
    try {
        flushToDatabase(flushTxn);
        flushTxn.commit();
        committed = true;
    } finally {
        if (!committed) {
            flushTxn.abort();
        }
    }
}
/**
 * Mappings are flushed to disk at close, and at checkpoints. The tracker
 * writes to the mapping database while flushSynchronizer is held.
 *
 * @param txn the transaction under which the tracker writes.
 */
private void flushToDatabase(Txn txn)
    throws DatabaseException {

    synchronized (flushSynchronizer) {
        final DatabaseImpl target = mappingDbImpl;
        tracker.flushToDatabase(target, txn);
    }
}
/**
 * For debugging and unit tests. Walks the whole mapping database and
 * collects every VLSN->LSN mapping found in its buckets.
 *
 * @param display if true, the tracker, each key, the range and each
 * bucket are also printed to System.out.
 * @return a map from VLSN to LSN holding every mapping whose LSN is not
 * DbLsn.NULL_LSN.
 * @throws DatabaseException
 */
public Map<VLSN, Long> dumpDb(boolean display) {

    Cursor cursor = null;
    Locker locker = null;
    if (display) {
        System.out.println(tracker);
    }

    final Map<VLSN, Long> mappings = new HashMap<VLSN, Long>();

    try {
        locker = BasicLocker.createBasicLocker(envImpl);
        cursor = makeCursor(locker);

        DatabaseEntry key = new DatabaseEntry();
        DatabaseEntry data = new DatabaseEntry();

        /*
         * The first item in the database is the VLSNRange. All subsequent
         * items are VLSNBuckets.
         */
        int count = 0;
        while (cursor.getNext(key, data, LockMode.DEFAULT) ==
               OperationStatus.SUCCESS) {

            final long keyValue = LongBinding.entryToLong(key);

            if (display) {
                System.out.println("key => " + keyValue);
            }
            if (count == 0) {
                VLSNRange range = VLSNRange.readFromDatabase(data);
                if (display) {
                    System.out.println("range =>");
                    System.out.println(range);
                }
            } else {
                VLSNBucket bucket = VLSNBucket.readFromDatabase(data);

                /* Probe every VLSN the bucket spans; skip unmapped ones. */
                for (long i = bucket.getFirst().getSequence();
                     i <= bucket.getLast().getSequence();
                     i++) {
                    VLSN v = new VLSN(i);
                    long lsn = bucket.getLsn(v);

                    if (lsn != DbLsn.NULL_LSN) {
                        mappings.put(v, lsn);
                    }
                }
                if (display) {
                    System.out.println("bucket =>");
                    bucket.dump(System.out);
                }
            }
            count++;
        }
    } finally {
        if (cursor != null) {
            cursor.close();
        }
        if (locker != null) {
            locker.operationEnd(true);
        }
    }

    return mappings;
}
/**
 * For DbStreamVerify utility. Verify the on-disk database, disregarding
 * the cached tracker. Problems are reported by printing to the supplied
 * stream; nothing is thrown for a verification failure.
 *
 * The checks are: the first record has the range key, buckets are in
 * ascending non-overlapping order, and the first/last VLSNs seen in the
 * buckets match the first/last VLSNs of the stored range.
 *
 * @param env the environment holding the VLSN mapping database.
 * @param out receives all verification output.
 * @param verbose if true, the range and every bucket are printed too.
 * @throws DatabaseException
 */
@SuppressWarnings("null")
public static void verifyDb(Environment env,
                            PrintStream out,
                            boolean verbose)
    throws DatabaseException {

    DatabaseConfig dbConfig = new DatabaseConfig();
    dbConfig.setReadOnly(true);
    Database db = env.openDatabase
        (null, DbType.VLSN_MAP.getInternalName(), dbConfig);
    Cursor cursor = null;
    try {
        if (verbose) {
            /* Report on the caller's stream, like all other output. */
            out.println("Verifying VLSN index");
        }

        cursor = db.openCursor(null, CursorConfig.READ_COMMITTED);

        DatabaseEntry key = new DatabaseEntry();
        DatabaseEntry data = new DatabaseEntry();

        /*
         * The first item in the database is the VLSNRange. All subsequent
         * items are VLSNBuckets.
         */
        int count = 0;
        VLSNRange range = null;
        VLSNBucket lastBucket = null;
        Long lastKey = null;
        VLSN firstVLSNSeen = VLSN.NULL_VLSN;
        VLSN lastVLSNSeen = VLSN.NULL_VLSN;
        while (cursor.getNext(key, data, null) ==
               OperationStatus.SUCCESS) {

            final long keyValue = LongBinding.entryToLong(key);

            if (count == 0) {
                if (keyValue != VLSNRange.RANGE_KEY) {
                    /*
                     * Print the offending key; the range has not been
                     * read yet at this point.
                     */
                    out.println("Wrong key value for range! " + keyValue);
                }
                range = VLSNRange.readFromDatabase(data);
                if (verbose) {
                    out.println("range=>" + range);
                }
            } else {
                VLSNBucket bucket = VLSNBucket.readFromDatabase(data);
                if (verbose) {
                    out.print("key=> " + keyValue);
                    out.println(" bucket=>" + bucket);
                }

                /* Each bucket must start after the previous one ends. */
                if (lastBucket != null) {
                    if (lastBucket.getLast().compareTo(bucket.getFirst())
                        >= 0) {
                        out.println("Buckets out of order.");
                        out.println("Last = " + lastKey + "/" +
                                    lastBucket);
                        out.println("Current = " + keyValue + "/" +
                                    bucket);
                    }
                }

                lastBucket = bucket;
                lastKey = keyValue;
                /* Remember the first VLSN of the very first bucket. */
                if (firstVLSNSeen.isNull()) {
                    firstVLSNSeen = bucket.getFirst();
                }
                lastVLSNSeen = bucket.getLast();
            }
            count++;
        }

        if (count == 0) {
            out.println("VLSNIndex not on disk");
            return;
        }

        if (firstVLSNSeen.compareTo(range.getFirst()) != 0) {
            out.println("First VLSN in bucket = " + firstVLSNSeen +
                        " and doesn't match range " + range.getFirst());
        }

        if (lastVLSNSeen.compareTo(range.getLast()) != 0) {
            out.println("Last VLSN in bucket = " + lastVLSNSeen +
                        " and doesn't match range " + range.getLast());
        }

    } finally {
        if (cursor != null) {
            cursor.close();
        }

        db.close();
    }
}
/*
 * For unit test support. Index needs to be quiescent.
 *
 * Verifies the tracker itself, then reads the on-disk range and buckets
 * and checks that
 *  - each bucket is stored under the key of its first VLSN,
 *  - the tracker range passes verifySubset against the on-disk range,
 *  - the bucket boundaries are consistent,
 *  - the on-disk range starts at the first bucket,
 *  - the tracker's lastOnDisk matches the last bucket, and
 *  - the first tracked VLSN does not precede the end of the on-disk
 *    buckets.
 *
 * @param verbose if true, explanations of failures are printed.
 * @return true if all checks pass.
 */
@SuppressWarnings("null")
public synchronized boolean verify(boolean verbose)
    throws DatabaseException {

    if (!tracker.verify(verbose)) {
        return false;
    }

    VLSNRange dbRange = null;
    final ArrayList<VLSN> firstVLSN = new ArrayList<VLSN>();
    final ArrayList<VLSN> lastVLSN = new ArrayList<VLSN>();
    final Locker locker = BasicLocker.createBasicLocker(envImpl);
    Cursor cursor = null;

    /*
     * Synchronize so we don't try to verify while the checkpointer
     * thread is calling flushToDatabase on the vlsnIndex.
     */
    synchronized (flushSynchronizer) {
        /*
         * Read the on-disk range and buckets.
         * -The tracker and the database buckets should not intersect.
         * -The on-disk range should be a subset of the tracker range.
         */
        try {
            cursor = makeCursor(locker);

            DatabaseEntry key = new DatabaseEntry();
            DatabaseEntry data = new DatabaseEntry();

            /*
             * Read the on-disk range and all the buckets. The first
             * record is decoded as the range.
             */
            OperationStatus status =
                cursor.getFirst(key, data, LockMode.DEFAULT);
            if (status == OperationStatus.SUCCESS) {
                VLSNRangeBinding rangeBinding = new VLSNRangeBinding();
                dbRange = rangeBinding.entryToObject(data);

                /* Collect info about the buckets. */
                while (cursor.getNext(key, data, LockMode.DEFAULT) ==
                       OperationStatus.SUCCESS) {

                    VLSNBucket bucket = VLSNBucket.readFromDatabase(data);

                    /* A bucket must be keyed by its first VLSN. */
                    final long keyValue = LongBinding.entryToLong(key);
                    if (bucket.getFirst().getSequence() != keyValue) {
                        return false;
                    }

                    firstVLSN.add(bucket.getFirst());
                    lastVLSN.add(bucket.getLast());
                }
            }
        } finally {
            if (cursor != null) {
                cursor.close();
            }

            locker.operationEnd(true);
        }

        /*
         * Verify range.
         */
        VLSNRange trackerRange = tracker.getRange();
        if (!trackerRange.verifySubset(verbose, dbRange)) {
            return false;
        }
    }

    VLSN firstTracked = tracker.getFirstTracked();

    /* The db range and the buckets need to be consistent. */
    VLSN firstOnDisk = null;
    VLSN lastOnDisk = null;
    if (firstVLSN.size() > 0) {
        /* There are buckets in the database. */
        lastOnDisk = lastVLSN.get(lastVLSN.size() - 1);
        firstOnDisk = firstVLSN.get(0);

        if (!VLSNTracker.verifyBucketBoundaries(firstVLSN, lastVLSN)) {
            return false;
        }

        /*
         * A VLSNIndex invariant is that there is always a mapping for the
         * first and last VLSN in the range. However, if the log cleaner
         * lops off the head of the index, leaving a bucket gap at the
         * beginning of the index, we break this invariant. For example,
         * suppose the index has
         *
         * bucketA - VLSNs 10
         * no bucket, due to out of order mapping - VLSN 11, 12
         * bucket B - VLSNs 13-15
         *
         * If the cleaner deletes VLSN 10->11, VLSN 12 will be the start
         * of the range, and needs a bucket. We'll do this by adding a
         * bucket placeholder.
         */
        if (dbRange.getFirst().compareTo(firstOnDisk) != 0) {
            dumpMsg(verbose, "Range doesn't match buckets " +
                    dbRange + " firstOnDisk = " + firstOnDisk);
            return false;
        }

        /* The tracker should know what the last VLSN on disk is. */
        if (!lastOnDisk.equals(tracker.getLastOnDisk())) {
            dumpMsg(verbose, "lastOnDisk=" + lastOnDisk +
                    " tracker=" + tracker.getLastOnDisk());
            return false;
        }

        if (!firstTracked.equals(NULL_VLSN)) {

            /*
             * The last bucket VLSN should precede the first tracker VLSN.
             */
            if (firstTracked.compareTo(lastOnDisk.getNext()) < 0) {
                dumpMsg(verbose, "lastOnDisk=" + lastOnDisk +
                        " firstTracked=" + firstTracked);
                return false;
            }
        }
    }
    return true;
}
/**
 * Prints a diagnostic message to standard output, but only when verbose
 * output was requested.
 *
 * @param verbose if false, the message is dropped
 * @param msg the text to print
 */
private void dumpMsg(boolean verbose, String msg) {
    if (!verbose) {
        return;
    }
    System.out.println(msg);
}
/*
 * For unit test support only. Can only be called when the replication
 * stream is quiescent.
 *
 * The answer is whatever the tracker reports for its own flushed state.
 */
public boolean isFlushedToDisk() {
    final boolean trackerFlushed = tracker.isFlushedToDisk();
    return trackerFlushed;
}
/**
 * Ensure that the in-memory vlsn index encompasses all logged entries
 * before it is flushed to disk. A no-op for non-replicated systems.
 *
 * The problem is in the interaction of logging and VLSN tracking.
 * Allocating a new VLSN and logging a replicated log entry is done within
 * the log write latch, without any VLSNIndex synchronization. That must be
 * done to keep the log write latch critical section as small as possible,
 * and to avoid any lock hierarchy issues.
 *
 * The VLSNIndex is updated after the log write latch critical section. The
 * VLSNIndex is flushed to disk by checkpoint, and it is assumed that this
 * persistent version of the index encompasses all VLSN entries prior to
 * checkpoint start. Since the logging of a new VLSN and the flushing of
 * the index are not atomic, it's possible that the checkpointer may start
 * the flush of the vlsnIndex before the last vlsn's mapping is recorded
 * in the index. To obey the requirement that the checkpointed vlsn index
 * encompass all mappings < checkpoint start, check that the vlsn index
 * is up to date before the flush.
 * [#19754]
 *
 * awaitConsistency() works by using the same waitForVLSN() method used by
 * the Feeders. waitForVLSN asserts that all feeders are waiting on a single
 * vlsn, to assure that no feeders are left in limbo, awaiting a vlsn that
 * has gone by. This contract is valid for the feeders, because they wait
 * for vlsns sequentially, consuming each one by one. However, this ckpter
 * awaitConsistency functionality uses the nextVLSNCounter, which can
 * leapfrog ahead arbitrarily, in this case:
 *
 * {@literal
 * vlsn range holds 1 -> N-1
 * Feeder is present, awaiting vlsn N
 * thread A bumps vlsn to N and writes record under log write latch
 * thread B bumps vlsn to N + 1 and writes record under log write latch
 * ckpter awaits consistency, using N+1, while feeders are awaiting N
 * thread A puts VLSN N outside log write latch
 * thread B puts VLSN N+1 outside log write latch
 * }
 *
 * Because of this, the ckpter must distinguish between what it is really
 * waiting on (VLSN N+1) and what it can next wait on to fulfil the
 * feeder waiting contract (VLSN N).
 */
public void awaitConsistency() {

    /* VLSNIndex is not initialized and in use yet, no need to wait. */
    if (nextVLSNCounter == null &&
        replicaLatestVLSNSeq == VLSN.NULL_VLSN_SEQUENCE) {
        return;
    }

    /* The VLSN that must be covered by the range before we may return. */
    VLSN ckptTarget = null;

    for (;;) {

        /*
         * The first pass adopts the latest allocated VLSN as the target.
         * A retry only replaces the target when the latest allocated value
         * has dropped below it, i.e. the previously chosen target was
         * decremented due to a logging failure.
         */
        final VLSN latestAllocated = new VLSN(getLatestAllocatedVal());
        if (ckptTarget == null) {
            ckptTarget = latestAllocated;
        } else if (latestAllocated.compareTo(ckptTarget) < 0) {
            LoggerUtils.info(logger, envImpl,
                             "Reducing awaitConsistency VLSN from " +
                             ckptTarget + " to " +
                             latestAllocated);
            ckptTarget = latestAllocated;
        }

        /*
         * [#20165] Since the await is based on the nextVLSNCounter, it's
         * possible that a feeder is already waiting on an earlier VLSN.
         * Safeguard against that by only waiting for one more than the
         * end of the range, to avoid conflict with feeders. See the method
         * comments.
         */
        final VLSN waitTarget = tracker.getRange().getLast().getNext();
        if (ckptTarget.compareTo(waitTarget) < 0) {
            /* Every vlsn allocated before the checkpoint is in the range. */
            return;
        }

        if (logger.isLoggable(Level.FINE)) {
            LoggerUtils.fine(logger, envImpl, "awaitConsistency target=" +
                             waitTarget + " allocatedBeforeCkpt=" +
                             ckptTarget);
        }

        try {
            waitForVLSN(waitTarget, AWAIT_CONSISTENCY_MS);
            if (waitTarget.compareTo(ckptTarget) >= 0) {
                /* We reached the real target. */
                return;
            }

            /*
             * The VLSN we waited for arrived, but it is still earlier than
             * the checkpoint target. Go around again.
             */
        } catch (WaitTimeOutException e) {
            LoggerUtils.severe(logger, envImpl,
                               "Retrying for vlsn index consistency " +
                               " before checkpoint, awaiting vlsn " +
                               waitTarget +
                               " with ckpt consistency target of " +
                               ckptTarget);
        } catch (InterruptedException e) {
            LoggerUtils.severe(logger, envImpl,
                               "Interrupted while awaiting vlsn index " +
                               "consistency before checkpoint, awaiting " +
                               "vlsn " + waitTarget +
                               " with ckpt consistency target of " +
                               ckptTarget + ", will retry");
        }

        /*
         * If the environment was invalidated by other activity, get out of
         * this loop because the vlsn we are waiting for may never come.
         * Re-throw the invalidating exception to indicate that the
         * checkpoint did not succeed. [#20919]
         */
        envImpl.checkIfInvalid();
    }
}
void setGTEHook(TestHook> hook) {
searchGTEHook = hook;
}
/**
 * A cursor over the VLSNIndex.
 */
private abstract static class VLSNScanner {

    /* The index this scanner reads its buckets from. */
    final VLSNIndex vlsnIndex;

    /* The bucket most recently fetched by the concrete scanner. */
    VLSNBucket currentBucket;

    /*
     * This is purely for assertions. The VLSNScanner assumes that
     * getStartingLsn() is called once before any LSN lookup is made; the
     * concrete scanners bump this counter in getStartingLsn().
     */
    int startingLsnInvocations = 0;

    VLSNScanner(VLSNIndex vlsnIndex) {
        this.vlsnIndex = vlsnIndex;
    }

    /**
     * @param vlsn We're requesting a LSN mapping for this vlsn
     * @return If there is a mapping for this VLSN, return it, else return
     * NULL_LSN. We assume that we checked that this VLSN is in the
     * VLSNIndex's range.
     */
    public abstract long getPreciseLsn(VLSN vlsn);

    /**
     * @param vlsn the vlsn at which the scan is to begin
     * @return the lsn from which a scan for this vlsn should start
     */
    public abstract long getStartingLsn(VLSN vlsn);
}
/**
 * Assumes that VLSNs are scanned backwards. May be used by syncup to
 * optimally search for matchpoints.
 */
public static class BackwardVLSNScanner extends VLSNScanner {

    public BackwardVLSNScanner(VLSNIndex vlsnIndex) {
        super(vlsnIndex);
    }

    /*
     * Use the >= mapping for the requested VLSN to find the starting lsn
     * to use for a scan. This can only be used on a VLSN that is known to
     * be in the range.
     */
    @Override
    public long getStartingLsn(VLSN vlsn) {
        startingLsnInvocations++;
        final VLSNBucket gteBucket = vlsnIndex.getGTEBucket(vlsn, null);
        currentBucket = gteBucket;
        return gteBucket.getGTELsn(vlsn);
    }

    /**
     * @see VLSNScanner#getPreciseLsn
     */
    @Override
    public long getPreciseLsn(VLSN vlsn) {
        assert startingLsnInvocations == 1 : "startingLsns() called " +
            startingLsnInvocations + " times";

        /*
         * Ideally the bucket we already hold has the mappings for this
         * VLSN. When it does not own the VLSN, that is either because
         * (a) there's a gap and two buckets don't abut, or (b) we walked
         * off the end of the current bucket and need a new one. Case (a)
         * is recognized by the current bucket still being needed for an
         * upcoming VLSN.
         */
        if (currentBucket != null && !currentBucket.owns(vlsn)) {
            if (currentBucket.precedes(vlsn)) {
                /* Case A: no bucket holds a mapping for this VLSN. */
                return DbLsn.NULL_LSN;
            }
            /* Case B: we've walked off the end of the current bucket. */
            currentBucket = null;
        }

        /*
         * No usable bucket in hand: fetch the closest bucket that would
         * hold this mapping. Note: vlsnIndex.getLTEBucket guards against
         * returning null.
         */
        if (currentBucket == null) {
            currentBucket = vlsnIndex.getLTEBucket(vlsn);
            if (!currentBucket.owns(vlsn)) {
                /* The vlsn lies in a gap between two buckets. */
                return DbLsn.NULL_LSN;
            }
        }

        assert currentBucket.owns(vlsn) : "vlsn = " + vlsn +
            " currentBucket=" + currentBucket;

        /* We're in the right bucket. */
        return currentBucket.getLsn(vlsn);
    }
}
/**
 * Opens a cursor on the mapping database with eviction disallowed.
 *
 * Critical eviction is disabled for all VLSNIndex cursors. [#18475] An
 * improvement would be to enable eviction, and do all database operations
 * that are in a loop asynchronously.
 *
 * @param locker the locker under which the cursor operates
 * @return a new cursor using the default cursor configuration
 */
private Cursor makeCursor(Locker locker) {
    final Cursor mappingCursor =
        DbInternal.makeCursor(mappingDbImpl, locker, CursorConfig.DEFAULT);

    /* Switch eviction off before the cursor is handed to the caller. */
    DbInternal.getCursorImpl(mappingCursor).setAllowEviction(false);
    return mappingCursor;
}
/**
 * Scans VLSNs in a forward direction, used by feeders.
 */
public static class ForwardVLSNScanner extends VLSNScanner {

    public ForwardVLSNScanner(VLSNIndex vlsnIndex) {
        super(vlsnIndex);
    }

    /**
     * Use the {@literal <=} mapping to the requested VLSN to find the
     * starting lsn to use for a scan. This can only be used on a VLSN
     * that is known to be in the range.
     */
    @Override
    public long getStartingLsn(VLSN vlsn) {
        startingLsnInvocations++;
        final VLSNBucket lteBucket = vlsnIndex.getLTEBucket(vlsn);
        currentBucket = lteBucket;
        return lteBucket.getLTELsn(vlsn);
    }

    /**
     * @see VLSNScanner#getPreciseLsn
     */
    @Override
    public long getPreciseLsn(VLSN vlsn) {
        return getLsn(vlsn, false /* approximate */);
    }

    /**
     * {@literal
     * When doing an approximate search, the target vlsn may be a non-mapped
     * vlsn within a bucket, or it may be between two different buckets.
     * For example, suppose we have two buckets:
     *
     * vlsn 1 -> lsn 10
     * vlsn 5 -> lsn 50
     * vlsn 7 -> lsn 70
     *
     * vlsn 20 -> lsn 120
     * vlsn 25 -> lsn 125
     *
     * If the vlsn we are looking for is 4, the LTE lsn for an approximate
     * return value will be vlsn 1-> lsn 10, in the same bucket. If we are
     * looking for vlsn 9, the LTE lsn for an approximate return value will
     * be vlsn 7->lsn 70, which is the last mapping in an earlier bucket.
     * }
     *
     * @param vlsn We're requesting a LSN mapping for this vlsn
     * @return If there is a mapping for this VLSN, return it. If it does
     * not exist, return the nearest non-null mapping, where nearest means
     * the {@literal <=} LSN. We assume that we checked that this VLSN is
     * in the VLSNIndex's range.
     */
    public long getApproximateLsn(VLSN vlsn) {
        return getLsn(vlsn, true /* approximate */);
    }

    /*
     * Shared lookup behind getPreciseLsn and getApproximateLsn. When
     * approximate is false, a vlsn that falls in a gap between buckets
     * yields NULL_LSN; when true, it yields the last lsn of the preceding
     * bucket.
     */
    private long getLsn(VLSN vlsn, boolean approximate) {
        assert startingLsnInvocations == 1 : "startingLsns() called " +
            startingLsnInvocations + " times";

        /* Remembered so it can be handed to getGTEBucket below. */
        final VLSNBucket bucketOnEntry = currentBucket;

        /*
         * Ideally the bucket we already hold has the mappings for this
         * VLSN. When it does not own the VLSN, that is either because
         * (a) there's a gap and two buckets don't abut, or (b) we walked
         * off the end of the current bucket and need a new one. Case (a)
         * is recognized by the current bucket still being needed for an
         * upcoming VLSN.
         */
        if (bucketOnEntry != null && !bucketOnEntry.owns(vlsn)) {
            if (bucketOnEntry.follows(vlsn)) {
                /* Case A: No bucket available for this VLSN. */
                return lsnForGap(vlsn, approximate);
            }
            /* Case B: We've walked off the end of the bucket. */
            currentBucket = null;
        }

        /*
         * No usable bucket in hand: fetch the closest bucket that would
         * hold this mapping. Note: vlsnIndex.getGTEBucket guards against
         * returning null.
         */
        if (currentBucket == null) {
            currentBucket = vlsnIndex.getGTEBucket(vlsn, bucketOnEntry);
            if (!currentBucket.owns(vlsn)) {
                /* The vlsn lies in a gap between two buckets. */
                return lsnForGap(vlsn, approximate);
            }
        }

        assert currentBucket.owns(vlsn) : "vlsn = " + vlsn +
            " currentBucket=" + currentBucket;

        /*
         * We're in the right bucket, and it owns this VLSN. Nevertheless,
         * the bucket may or may not contain a mapping for this VLSN, so an
         * approximate lookup returns the LTE version of the mapping.
         */
        return approximate ?
            currentBucket.getLTELsn(vlsn) : currentBucket.getLsn(vlsn);
    }

    /*
     * Result for a vlsn that no bucket owns: the preceding mapping for an
     * approximate lookup, NULL_LSN for a precise one.
     */
    private long lsnForGap(VLSN vlsn, boolean approximate) {
        if (approximate) {
            return findPrevLsn(vlsn);
        }
        return DbLsn.NULL_LSN;
    }

    /*
     * Find the lsn mapping that precedes the target. This assumes that
     * no bucket owns the target vlsn -- that it's a vlsn that falls
     * between buckets.
     */
    private long findPrevLsn(VLSN target) {
        final VLSNBucket earlier = vlsnIndex.getLTEBucket(target);
        assert !earlier.owns(target) : "target=" + target +
            "prevBucket=" + earlier + " currentBucket=" + currentBucket;
        return earlier.getLastLsn();
    }
}
/**
 * Associates the logItem with the latch, so that it's readily available
 * when the latch is released.
 */
public static class VLSNAwaitLatch extends CountDownLatch {

    /* Set once terminate() has been called. */
    private boolean terminated;

    /* The LogItem whose addition to the VLSN released the latch. */
    private LogItem logItem;

    /* The latch is created with a count of one. */
    public VLSNAwaitLatch() {
        super(1);
    }

    public void setLogItem(LogItem logItem) {
        this.logItem = logItem;
    }

    /**
     * Returns the log item that caused the latch to be released. It's only
     * meaningful after the latch has been released.
     *
     * @return log item or null if the latch timed out or its wait was
     * terminated
     */
    public LogItem getLogItem() {
        return logItem;
    }

    /* The lsn of the stored log item; a log item must have been set. */
    public long getTriggerLSN() {
        final LogItem trigger = logItem;
        return trigger.lsn;
    }

    /* The VLSN in the stored log item's header; a log item must be set. */
    public VLSN getTriggerVLSN() {
        final LogItem trigger = logItem;
        return trigger.header.getVLSN();
    }

    /* Free up any waiters on this latch and shutdown. */
    public void terminate() {
        terminated = true;
        countDown();
    }

    public boolean isTerminated() {
        return terminated;
    }
}
/*
 * An exception primarily intended to implement non-local control flow
 * upon a vlsn wait latch timeout.
 */
@SuppressWarnings("serial")
public static class WaitTimeOutException extends Exception {

    /**
     * Eliminates unnecessary overhead by not capturing a stack trace; the
     * exception is used for control flow only.
     *
     * @return this exception, as the Throwable.fillInStackTrace contract
     * requires, so that idioms such as {@code throw e.fillInStackTrace()}
     * keep working
     */
    @Override
    public synchronized Throwable fillInStackTrace() {
        return this;
    }
}
}