/**
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
[email protected]
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on Nov 15, 2006
*/
package com.bigdata.btree;
import java.lang.ref.WeakReference;
import java.lang.reflect.Constructor;
import java.util.Iterator;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.locks.Lock;
import com.bigdata.BigdataStatics;
import com.bigdata.btree.AbstractBTreeTupleCursor.MutableBTreeTupleCursor;
import com.bigdata.btree.Leaf.ILeafListener;
import com.bigdata.btree.data.ILeafData;
import com.bigdata.btree.data.INodeData;
import com.bigdata.io.ByteArrayBuffer;
import com.bigdata.journal.AbstractJournal;
import com.bigdata.journal.IIndexManager;
import com.bigdata.mdi.IResourceMetadata;
import com.bigdata.mdi.JournalMetadata;
import com.bigdata.mdi.LocalPartitionMetadata;
import com.bigdata.rawstore.IRawStore;
import com.bigdata.rwstore.IRWStrategy;
import com.bigdata.util.Bytes;
/**
*
* This class implements a variant of a B+Tree in which all values are stored in
* leaves, but the leaves are not connected with prior-next links. This
* constraint arises from the requirement to support a copy-on-write policy.
*
*
* Note: No mechanism is exposed for fetching a node or leaf of the tree other
* than the root by its key. This is because the parent reference on the node
* (or leaf) can only be set when it is read from the store in context by its
* parent node.
*
*
* Note: the leaves can not be stitched together with prior and next references
* without forming cycles that make it impossible to write out the leaves of the
* btree. This restriction arises because each time we write out a node or leaf
* it is assigned a persistent identifier as an unavoidable artifact of
* providing isolation for the object index.
*
*
* Note: This implementation is thread-safe for concurrent readers BUT NOT for
* concurrent writers. If a writer has access to a {@link BTree} then there MUST
* NOT be any other reader -or- writer operating on the {@link BTree} at the
* same time.
*
*
* @todo consider also tracking the #of deleted entries in a key range (parallel
* to how we track the #of entries in a key range) so that we can report
* the exact #of non-deleted entries when the btree supports isolation
* (right now you have to do a key range scan and count the #of
* non-deleted entries).
*
* @todo Choose the split point in the branch nodes within a parameterized
* region such that we choose the shortest separator keys to promote to
* the parent. This is a bit more tricky for the
* {@link IndexSegmentBuilder}.
*
* @todo reuse a buffer when copying keys and values out of the index, e.g., for
* the {@link ITupleIterator}, in order to minimize heap churn.
*
* @todo The B+Tree implementation does not support limits on the serialized
* size of a node or leaf. The design strategy is to allow flexible sizes
* for serialized node and leaf records since larger branching factors and
* continuous IO are cheaper and nothing in particular requires the use of
* fixed size records when accessing disk. However, it would still be
* useful to be able to place an upper bound on the serialized size of a
* node or leaf. Nodes and leaves are only serialized on eviction, so this
* would require the ability to split nodes and/or leaves immediately
* before eviction if they would be likely to overflow the upper bound on
* the record size. Probably the node or leaf needs to be made immutable,
* the serialized size checked, and then a split operation invoked if the
* record would exceed the upper bound. If this happens a lot, then the
* branching factor is too high for the data and would have to be lowered
* to regain performance.
*
* Another use case for limits on the size of a node/leaf is maintaining
* the data in a serialized form. This can have several benefits,
* including reducing heap churn by copying keys and values (not their
* references) into the record structure, and reducing (eliminating)
* deserialization costs (which are substantially more expensive than
* serialization). This can be approached using new implementations of the
* key buffer and a value buffer implementation (one that can be used for
* nodes or for leaves). If an object implements both then it is a small
* additional step towards using it for the entire serialized record
* (minus some header/tailer).
*
* Adaptive packed memory arrays can be used in this context to minimize
* the amount of copying that needs to be performed on mutation of a node
* or leaf.
*
* Leading key compression is intrinsically part of how the key buffer
* maintains its representation so, for example, a micro-index would have
* to be maintained as part of the key buffer.
*
* @todo we could defer splits by redistributing keys to left/right siblings
* that are under capacity - this makes the tree a b*-tree. however, this
* is not critical since the journal is designed to be fully buffered and
* the index segments are read-only but it would reduce memory by reducing
* the #of nodes -- and we can expect that the siblings will be either
* resident or in the direct buffer for the journal
*
* @todo evict subranges by touching the node on the way up so that a node that
* is evicted from the hard reference cache will span a subrange that can
* be evicted together. this will help to preserve locality on disk. with
* this approach leaves might be evicted independently, but also as part
* of a node subrange eviction.
*
* @todo since forward scans are much more common, change the post-order
* iterator for commit processing to use a reverse traversal so that we
* can write the next leaf field whenever we evict a sequence of leaves as
* part of a sub-range commit (probably not much of an issue since the
* journal is normally fully buffered and the perfect index segments will
* not have this problem).
*
* @todo pre-fetch leaves for range scans? if we define a parallel iterator
* flag?
*
* @todo Note that efficient support for large branching factors requires a more
* sophisticated approach to maintaining the key order within a node or
* leaf. E.g., using a red-black tree or adaptive packed memory array.
* However, we can use smaller branching factors for btrees in the journal
* and use a separate implementation for bulk generating and reading
* "perfect" read-only key range segments.
*
* @todo derive a string index that uses patricia trees in the leaves per
* several published papers.
*
* @author Bryan Thompson
*/
public class BTree extends AbstractBTree implements //ICommitter,
ICheckpointProtocol {// ILocalBTreeView {
@Override
final public int getHeight() {
return height;
}
@Override
final public long getNodeCount() {
return nnodes;
}
@Override
final public long getLeafCount() {
return nleaves;
}
@Override
final public long getEntryCount() {
return nentries;
}
@Override
final public IRawStore getStore() {
return store;
}
/**
* Returns an {@link ICounter}. The {@link ICounter} is mutable iff the
* {@link BTree} is mutable. All {@link ICounter}s returned by this method
* report and increment the same underlying counter.
*
* Note: When the {@link BTree} is part of a scale-out index then the
* counter will assign values within a namespace defined by the partition
* identifier.
*/
@Override
public ICounter getCounter() {
ICounter counter = new Counter(this);
final LocalPartitionMetadata pmd = metadata.getPartitionMetadata();
if (pmd != null) {
counter = new PartitionedCounter(pmd.getPartitionId(), counter);
}
if (isReadOnly()) {
return new ReadOnlyCounter(counter);
}
return counter;
}
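/*
* Illustrative usage sketch (not part of the original source): how a caller
* might obtain and advance the index-local counter. For a scale-out index
* partition the returned ICounter is a PartitionedCounter, so the partition
* identifier occupies the high int32 word of the returned value. The [btree]
* variable is assumed to be a mutable BTree.
*
* final ICounter c = btree.getCounter();
* final long v = c.incrementAndGet(); // fires a dirty event the first time it advances past the checkpoint value.
* final int pid = PartitionedCounter.getPartitionId(v); // high word (scale-out only).
* final int ctr = PartitionedCounter.getLocalCounter(v); // low word.
*/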
/**
* The constructor sets this field initially based on a {@link Checkpoint}
* record containing the only address of the {@link IndexMetadata} for the
* index. Thereafter this reference is maintained as the {@link Checkpoint}
* record last written by {@link #writeCheckpoint()} or read by
* {@link #load(IRawStore, long, boolean)}.
*/
private Checkpoint checkpoint = null;
// /**
// * The root of the btree. This is initially a leaf until the leaf is split,
// * at which point it is replaced by a node. The root is also replaced each
// * time copy-on-write triggers a cascade of updates.
// */
// protected AbstractNode root;
/**
* The height of the btree. The height is the #of levels minus one. A btree
* with only a root leaf is said to have height := 0. Note
* that the height only changes when we split the root node.
*/
protected int height;
/**
* The #of non-leaf nodes in the btree. This is zero (0) for a new btree.
*/
protected long nnodes;
/**
* The #of leaf nodes in the btree. This is one (1) for a new btree.
*/
protected long nleaves;
/**
* The #of entries in the btree. This is zero (0) for a new btree.
*/
protected long nentries;
/**
* The value of the record version number that will be assigned to the next
* node or leaf written onto the backing store. This number is incremented
* each time a node or leaf is written onto the backing store. The initial
* value is ZERO (0). The first value assigned to a node or leaf will be
* ZERO (0).
*/
protected long recordVersion;
/**
* The mutable counter exposed by {@link #getCounter()}.
*
* Note: This is protected so that it will be visible to
* {@link Checkpoint} which needs to write the actual value stored in this
* counter into its serialized record (without the partition identifier).
*/
protected AtomicLong counter;
/**
* A buffer used to encode a raw record address for a mutable {@link BTree}
* and otherwise null.
*/
private final ByteArrayBuffer recordAddrBuf;
// /**
// * The last address from which the {@link IndexMetadata} record was read or
// * on which it was written.
// */
// private long lastMetadataAddr;
// @Override
// final protected int getReadRetentionQueueCapacity() {
//
// return metadata.getBTreeReadRetentionQueueCapacity();
//
// }
//
// @Override
// final protected int getReadRetentionQueueScan() {
//
// return metadata.getBTreeReadRetentionQueueScan();
//
// }
/**
* Required constructor form for {@link BTree} and any derived subclasses.
* This constructor is used both to create a new {@link BTree}, and to load
* a {@link BTree} from the store using a {@link Checkpoint} record.
*
* @param store
* The store.
* @param checkpoint
* A {@link Checkpoint} record for that {@link BTree}.
* @param metadata
* The metadata record for that {@link BTree}.
* @param readOnly
* When true the {@link BTree} will be immutable.
*
* @see BTree#create(IRawStore, IndexMetadata)
* @see BTree#load(IRawStore, long, boolean)
*/
public BTree(final IRawStore store, final Checkpoint checkpoint,
final IndexMetadata metadata, final boolean readOnly) {
super( store,
NodeFactory.INSTANCE, //
readOnly, // read-only
metadata,//
metadata.getBtreeRecordCompressorFactory()
);
// this.readOnly = readOnly;
if (checkpoint == null) {
throw new IllegalArgumentException();
}
if (store != null) {
if (checkpoint.getMetadataAddr() != metadata.getMetadataAddr()) {
// must agree.
throw new IllegalArgumentException();
}
}
if (metadata.getConflictResolver() != null && !metadata.isIsolatable()) {
throw new IllegalArgumentException(
"Conflict resolver may only be used with isolatable indices");
}
setCheckpoint(checkpoint);
// // save the address from which the index metadata record was read.
// this.lastMetadataAddr = metadata.getMetadataAddr();
/*
* Note: the mutable BTree has a limit here so that split() will always
* succeed. That limit does not apply for an immutable btree.
*/
assert readOnly || writeRetentionQueue.capacity() >= IndexMetadata.Options.MIN_WRITE_RETENTION_QUEUE_CAPACITY;
/*
* Note: Re-open is deferred so that we can mark the BTree as read-only
* before we read the root node.
*/
// reopen();
/*
* Buffer used to encode addresses into the tuple value for a mutable
* B+Tree.
*/
recordAddrBuf = readOnly ? null
: new ByteArrayBuffer(Bytes.SIZEOF_LONG);
}
/**
* Encode a raw record address into a byte[] suitable for storing in the
* value associated with a tuple and decoding using
* {@link AbstractBTree#decodeRecordAddr(byte[])}. This method is only
* supported for a mutable {@link BTree} instance. Per the contract of the
* mutable {@link BTree}, it is not thread-safe.
*
* @param addr
* The raw record address.
*
* @return A newly allocated byte[] which encodes that address.
*/
byte[] encodeRecordAddr(final long addr) {
return AbstractBTree.encodeRecordAddr(recordAddrBuf, addr);
}
/**
* Sets the {@link #checkpoint} and initializes the mutable fields from the
* checkpoint record. In order for this operation to be atomic, the caller
* must be synchronized on the {@link BTree} or otherwise guaranteed to have
* exclusive access, e.g., during the ctor or when the {@link BTree} is
* mutable and access is therefore required to be single-threaded.
*/
private void setCheckpoint(final Checkpoint checkpoint) {
this.checkpoint = checkpoint; // save reference.
this.height = checkpoint.getHeight();
this.nnodes = checkpoint.getNodeCount();
this.nleaves = checkpoint.getLeafCount();
this.nentries = checkpoint.getEntryCount();
this.counter = new AtomicLong( checkpoint.getCounter() );
this.recordVersion = checkpoint.getRecordVersion();
}
/**
* Creates and sets new root {@link Leaf} on the B+Tree and (re)sets the
* various counters to be consistent with that root. This is used both
* by the constructor for a new {@link BTree} and by {@link #removeAll()}.
*
* Note: The {@link #getCounter()} is NOT changed by this method.
*/
final private void newRootLeaf() {
height = 0;
nnodes = 0;
nentries = 0;
final boolean wasDirty = root != null && root.dirty;
root = new Leaf(this);
nleaves = 1;
// Note: Counter is unchanged!
// counter = new AtomicLong( 0L );
if (metadata.getBloomFilterFactory() != null) {
/*
* Note: Allocate a new bloom filter since the btree is now empty
* and set its reference on the BTree. We need to do this here in
* order to (a) overwrite the old reference when we are replacing
* the root, e.g., for removeAll(); and (b) to make sure that the
* bloomFilter reference is defined since the checkpoint will not
* have its address until the next time we call writeCheckpoint()
* and therefore readBloomFilter() would fail if we did not create
* and assign the bloom filter here.
*/
bloomFilter = metadata.getBloomFilterFactory().newBloomFilter();
}
/*
* Note: a new root leaf is created when an empty btree is (re-)opened.
* However, if the BTree is marked as read-only then do not fire off a
* dirty event.
*/
if(!wasDirty && !readOnly) {
fireDirtyEvent();
}
}
@Override
protected void _reopen() {
if (checkpoint.getRootAddr() == 0L) {
/*
* Create the root leaf.
*
* Note: if there is an optional bloom filter, then it is created
* now.
*/
newRootLeaf();
} else {
/*
* Read the root node of the btree.
*/
root = readNodeOrLeaf(checkpoint.getRootAddr());
// Note: The optional bloom filter will be read lazily.
}
}
/**
* Lazily reads the bloom filter from the backing store if it exists and is
* not already in memory.
*/
@Override
final public BloomFilter getBloomFilter() {
// make sure the index is open.
reopen();
if (bloomFilter == null) {
if (checkpoint.getBloomFilterAddr() == 0L) {
// No bloom filter.
return null;
}
synchronized(this) {
if (bloomFilter == null) {
bloomFilter = readBloomFilter();
}
}
}
if (bloomFilter != null && !bloomFilter.isEnabled()) {
/*
* Do NOT return the reference if the bloom filter has been
* disabled. This simplifies things for the caller. The only code
* that needs the reference regardless of whether it is enabled is
* writeCheckpoint() and Checkpoint and those bits use the reference
* directly.
*/
return null;
}
// Note: will be null if there is no bloom filter!
return bloomFilter;
}
/**
* Read the bloom filter from the backing store using the address stored in
* the last {@link #checkpoint} record. This method will be invoked by
* {@link #getBloomFilter()} when the bloom filter reference is
* null but the bloom filter is known to exist and the bloom
* filter object is requested.
*
* Note: A bloom filter can be relatively large. A bloom filter requires
* approximately one byte per index entry, so a filter for an
* index with 10M index entries will be on the order of 10MB. Therefore this
* method will typically have high latency.
*
* Note: the {@link Checkpoint} record initially stores 0L
* for the bloom filter address. {@link #newRootLeaf()} is responsible for
* allocating the bloom filter (if one is to be used) when the root leaf is
* (re-)created. The address then gets stored in the {@link Checkpoint}
* record by {@link #writeCheckpoint()} (if invoked and once the bloom
* filter is dirty).
*
* @throws IllegalStateException
* if the bloom filter does not exist (the caller should check
* this first to avoid obtaining a lock).
*/
final protected BloomFilter readBloomFilter() {
final long bloomFilterAddr = checkpoint.getBloomFilterAddr();
if (bloomFilterAddr == 0L) {
// No bloom filter.
throw new IllegalStateException();
}
return BloomFilter.read(store, bloomFilterAddr);
}
// final public boolean isReadOnly() {
//
// return readOnly;
//
// }
// /**
// * Mark the B+Tree as read-only. Once the B+Tree is marked as read-only,
// * that instance will remain read-only.
// *
// * @param readOnly
// * true if you want to mark the B+Tree as
// * read-only.
// *
// * @throws UnsupportedOperationException
// * if the B+Tree is already read-only and you pass
// * false.
// */
// final public void setReadOnly(final boolean readOnly) {
//
// if (this.readOnly && !readOnly) {
//
// throw new UnsupportedOperationException(ERROR_READ_ONLY);
//
// }
//
// this.readOnly = readOnly;
//
// }
// final private boolean readOnly;
@Override
final public long getLastCommitTime() {
return lastCommitTime;
}
@Override
final public long getRevisionTimestamp() {
if (readOnly)
throw new UnsupportedOperationException(ERROR_READ_ONLY);
return lastCommitTime + 1;
}
@Override
final public void setLastCommitTime(final long lastCommitTime) {
if (lastCommitTime == 0L)
throw new IllegalArgumentException();
if (this.lastCommitTime == lastCommitTime) {
// No change.
return;
}
if (INFO)
log.info("old=" + this.lastCommitTime + ", new=" + lastCommitTime);
// Note: Commented out to allow replay of historical transactions.
/*if (this.lastCommitTime != 0L && this.lastCommitTime > lastCommitTime) {
throw new IllegalStateException("Updated lastCommitTime: old="
+ this.lastCommitTime + ", new=" + lastCommitTime);
}*/
this.lastCommitTime = lastCommitTime;
}
/**
* The lastCommitTime of the {@link Checkpoint} record from which the
* {@link BTree} was loaded.
*
* Note: Made volatile on 8/2/2010 since it is not otherwise obvious what
* would guarantee visibility of this field, though I do seem to remember
* that visibility might be guaranteed by how the BTree class is discovered
* and returned to the caller. Still, it does no harm to make this a volatile
* read.
*/
volatile private long lastCommitTime = 0L;// Until the first commit.
/**
* Return the {@link IDirtyListener}.
*/
@Override
final public IDirtyListener getDirtyListener() {
return listener;
}
/**
* Set or clear the listener (there can be only one).
*
* @param listener The listener.
*/
@Override
final public void setDirtyListener(final IDirtyListener listener) {
assertNotReadOnly();
this.listener = listener;
}
private IDirtyListener listener;
/**
* Fire an event to the listener (iff set).
*/
final protected void fireDirtyEvent() {
assertNotReadOnly();
final IDirtyListener l = this.listener;
if (l == null)
return;
if (Thread.interrupted()) {
throw new RuntimeException(new InterruptedException());
}
l.dirtyEvent(this);
}
/**
* Flush the nodes of the {@link BTree} to the backing store. After invoking
* this method the root of the {@link BTree} will be clean.
*
* Note: This does NOT flush all persistent state. See
* {@link #writeCheckpoint()} which also handles the optional bloom filter,
* the {@link IndexMetadata}, and the {@link Checkpoint} record itself.
*
* @return true if anything was written.
*/
final private boolean flush() {
assertNotTransient();
assertNotReadOnly();
if (root != null && root.dirty) {
writeNodeRecursive(root);
if(INFO)
log.info("flushed root: addr=" + root.identity);
return true;
}
return false;
}
/**
* Returns an immutable view of this {@link BTree}. If {@link BTree} is
* already read-only, then this instance is returned. Otherwise, a
* read-only {@link BTree} is loaded from the last checkpoint and returned.
*
* @throws IllegalStateException
* If the {@link BTree} is dirty.
*
* @todo The {@link Checkpoint} could hold a {@link WeakReference} singleton
* to the read-only view loaded from that checkpoint.
*/
public BTree asReadOnly() {
if (isReadOnly()) {
return this;
}
if(needsCheckpoint())
throw new IllegalStateException();
return BTree.load(store, checkpoint.addrCheckpoint, true/* readOnly */);
}
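/*
* Illustrative usage sketch (assumptions noted in the comments): asReadOnly()
* requires a clean BTree, so a caller would typically checkpoint first and
* then request the immutable view.
*
* if (btree.needsCheckpoint()) btree.writeCheckpoint(); // make the current state restart safe.
* final BTree readOnlyView = btree.asReadOnly(); // loaded from the last checkpoint.
*/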
// /**
// * Converts this {@link BTree} to a read-only {@link BTree}, stealing its
// * cached nodes and leaves. If {@link BTree} is already read-only, then
// * this instance is returned. Otherwise, a read-only {@link BTree} is
// * loaded from the last checkpoint, populated with new instances of cached
// * nodes and leaves from this {@link BTree} using a pre-order dirty node
// * traversal, and then returned. When the cached nodes and leaves are
// * populated, new instances are used but they will wrap the data records
// * already in memory. The new nodes and leaves are touched as they are
// * brought into the new {@link BTree} so they will be entered onto the write
// * retention cache.
// *
// * @throws IllegalStateException
// * If the {@link BTree} is dirty.
// * */
// public BTree convertToReadOnly() {
//
// if(isReadOnly()) {
//
// return this;
//
// }
//
// if(needsCheckpoint())
// throw new IllegalStateException();
//
// final BTree tmp = BTree
// .load(store, checkpoint.addrCheckpoint, true/* readOnly */);
//
// /*
// * @todo We do not have a pre-order iterator which only visits the
// * materialized children. That is what we need here. Then for each
// * visited Node or Leaf, we would create an appropriate child of the
// * corresponding Node in the new read-only BTree. The visitation has to
// * be pre-order to ensure that the parent node already exists in the new
// * read-only BTree. It needs to only visit the materialized children to
// * avoid overhead with materializing nodes from the backing store.
// */
//// {
////
//// final Iterator itr = getRoot()
//// .postOrderNodeIterator(true/* dirtyNodesOnly */);
////
//// }
//
// return tmp;
//
// }
/**
* {@inheritDoc}
*
* This checkpoint operation {@link #flush()}es dirty nodes, the optional
* {@link IBloomFilter} (if dirty), the {@link IndexMetadata} (if dirty),
* and then writes a new {@link Checkpoint} record on the backing store,
* saves a reference to the current {@link Checkpoint} and returns the
* address of that {@link Checkpoint} record.
*
* @see #writeCheckpoint2(), which returns the {@link Checkpoint} record
* itself.
*
* @see #load(IRawStore, long, boolean)
*/
@Override
final public long writeCheckpoint() {
// write checkpoint and return address of that checkpoint record.
return writeCheckpoint2().getCheckpointAddr();
}
/**
* {@inheritDoc}
*
* @see #load(IRawStore, long, boolean)
*/
@Override
final public Checkpoint writeCheckpoint2() {
assertNotTransient();
assertNotReadOnly();
/*
* Note: Acquiring this lock provides for atomicity of the checkpoint of
* the BTree during the commit protocol. Without this lock, users of the
* UnisolatedReadWriteIndex could be concurrently modifying the BTree
* while we are attempting to snapshot it for the commit.
*
* Note: An alternative design would declare a global read/write lock
* for mutation of the indices in addition to the per-BTree read/write
* lock provided by UnisolatedReadWriteIndex. Rather than taking the
* per-BTree write lock here, we would take the global write lock in the
* AbstractJournal's commit protocol, e.g., commitNow(). The global read
* lock would be taken by UnisolatedReadWriteIndex before taking the
* per-BTree write lock. This is effectively a hierarchical locking
* scheme and could provide a workaround if deadlocks are found to occur
* due to lock ordering problems with the acquisition of the
* UnisolatedReadWriteIndex lock (the absence of lock ordering problems
* really hinges around UnisolatedReadWriteLocks not being taken for
* more than one index at a time.)
*
* @see https://sourceforge.net/apps/trac/bigdata/ticket/278
* @see https://sourceforge.net/apps/trac/bigdata/ticket/284
* @see https://sourceforge.net/apps/trac/bigdata/ticket/288
* @see https://sourceforge.net/apps/trac/bigdata/ticket/343
* @see https://sourceforge.net/apps/trac/bigdata/ticket/440
*/
final Lock lock = writeLock();
lock.lock();
try {
/**
* Do not permit checkpoint if the index is in an error state.
*
* @see Invalidate
* BTree objects if error occurs during eviction
*/
if (error != null)
throw new IllegalStateException(ERROR_ERROR_STATE, error);
//synchronized(this) {
if (/* autoCommit && */needsCheckpoint()) {
/*
* Flush the btree, write a checkpoint record, and return the
* address of that checkpoint record. The [checkpoint] reference
* is also updated.
*/
return _writeCheckpoint2();
}
//}
/*
* There have not been any writes on this btree or auto-commit is
* disabled.
*
* Note: if the application has explicitly invoked writeCheckpoint()
* then the returned address will be the address of that checkpoint
* record and the BTree will have a new checkpoint address made
* restart safe on the backing store.
*/
return checkpoint;
} finally {
lock.unlock();
}
}
/**
* Core implementation invoked by {@link #writeCheckpoint2()} while holding
* the lock - DO NOT INVOKE THIS METHOD DIRECTLY.
*
* @return the checkpoint.
*/
private final Checkpoint _writeCheckpoint2() {
assertNotTransient();
assertNotReadOnly();
// assert root != null : "root is null"; // i.e., isOpen().
// flush any dirty nodes.
flush();
// pre-condition: all nodes in the tree are clean.
assert root == null || !root.dirty;
{
/*
* Note: Use the [AbstractBtree#bloomFilter] reference here!!!
*
* If that reference is [null] then the bloom filter is either
* clean, disabled, or was not configured. For any of those (3)
* conditions we will use the address of the bloom filter from the
* last checkpoint record. If the bloom filter is clean, then we
* will just carry forward its old address. Otherwise the address in
* the last checkpoint record will be 0L and that will be carried
* forward.
*/
final BloomFilter filter = this.bloomFilter;
if (filter != null && filter.isDirty() && filter.isEnabled()) {
/*
* The bloom filter is enabled, is loaded and is dirty, so write
* it on the store now.
*
* @see https://sourceforge.net/apps/trac/bigdata/ticket/440
*/
recycle(filter.getAddr());
filter.write(store);
}
}
if (metadata.getMetadataAddr() == 0L) {
/*
* The index metadata has been modified so we write out a new
* metadata record on the store.
*/
if (checkpoint != null) {
// Recycle the old IndexMetadata record (if any).
recycle(checkpoint.getMetadataAddr());
}
metadata.write(store);
}
/*
* Recycle the old checkpoint record.
*
* @see https://sourceforge.net/apps/trac/bigdata/ticket/440
*/
recycle(checkpoint != null ? checkpoint.addrCheckpoint : IRawStore.NULL);
/*
* Delete old root data iff changed.
*
* @see https://sourceforge.net/apps/trac/bigdata/ticket/473
* (PhysicalAddressResolutionException after reopen using RWStore and
* recycler)
*/
if (checkpoint != null && getRoot() != null
&& checkpoint.getRootAddr() != getRoot().identity) {
recycle(checkpoint != null ? checkpoint.getRootAddr()
: IRawStore.NULL);
}
// create new checkpoint record.
checkpoint = newCheckpoint();
// write it on the store.
checkpoint.write(store);
if (BigdataStatics.debug||INFO) {
final String msg = "name=" + metadata.getName()
+ ", writeQueue{size=" + writeRetentionQueue.size()
+ ",distinct=" + ndistinctOnWriteRetentionQueue + "} : "
+ checkpoint;
if (BigdataStatics.debug)
System.err.println(msg);
if (INFO)
log.info(msg);
}
// return the checkpoint record.
return checkpoint;
}
/**
* Create a {@link Checkpoint} for a {@link BTree}.
*
* The caller is responsible for writing the {@link Checkpoint} record onto
* the store.
*
* The class identified by {@link IndexMetadata#getCheckpointClassName()}
* MUST declare a public constructor with the following method signature
*
*
* ...( BTree btree )
*
*
* @return The {@link Checkpoint}.
*/
@SuppressWarnings("unchecked")
final private Checkpoint newCheckpoint() {
try {
@SuppressWarnings("rawtypes")
final Class cl = Class.forName(metadata.getCheckpointClassName());
/*
* Note: A NoSuchMethodException thrown here means that you did not
* declare the required public constructor on the Checkpoint class
* [cl].
*/
@SuppressWarnings("rawtypes")
final Constructor ctor = cl.getConstructor(new Class[] {
BTree.class //
});
final Checkpoint checkpoint = (Checkpoint) ctor
.newInstance(new Object[] { //
this //
});
return checkpoint;
} catch(Exception ex) {
throw new RuntimeException(ex);
}
}
@Override
final public Checkpoint getCheckpoint() {
if (checkpoint == null)
throw new AssertionError();
return checkpoint;
}
@Override
final public long getRecordVersion() {
return recordVersion;
}
@Override
final public long getMetadataAddr() {
return metadata.getMetadataAddr();
}
@Override
final public long getRootAddr() {
return (root == null ? getCheckpoint().getRootAddr() : root
.getIdentity());
}
// /**
// * Return true iff the state of this B+Tree has been modified since the last
// * {@link Checkpoint} record associated with the given address.
// *
// * @param checkpointAddr
// * The historical metadata address for this B+Tree.
// *
// * @return true iff the B+Tree has been modified since the identified
// * historical version.
// *
// * @see #getCheckpoint()
// * @see Checkpoint#getMetadataAddr()
// *
// * @todo this test might not work with overflow() handling since the
// * {@link Checkpoint} addresses would be independent in each store
// * file.
// */
// public boolean modifiedSince(long checkpointAddr) {
//
// return needsCheckpoint() || checkpoint.getCheckpointAddr() != checkpointAddr;
//
// }
/**
* Return true iff changes would be lost unless the B+Tree is flushed to the
* backing store using {@link #writeCheckpoint()}.
*
* Note: In order to avoid needless checkpoints this method will return
* false if:
*
* - the metadata record is persistent -AND-
*
* - EITHER the root is null, indicating that the index
* is closed (in which case there are no buffered writes);
* - OR the root of the btree is NOT dirty, the persistent address of the
* root of the btree agrees with {@link Checkpoint#getRootAddr()}, and the
* {@link #counter} value agrees with {@link Checkpoint#getCounter()}.
*
* @return true iff changes would be lost unless the
* B+Tree was flushed to the backing store using
* {@link #writeCheckpoint()}.
*/
public boolean needsCheckpoint() {
if(checkpoint.addrCheckpoint == 0L) {
/*
* The checkpoint record needs to be written.
*/
return true;
}
if(metadata.getMetadataAddr() == 0L) {
/*
* The index metadata record was replaced and has not yet been
* written onto the store.
*/
return true;
}
if(metadata.getMetadataAddr() != checkpoint.getMetadataAddr()) {
/*
* The index metadata record was replaced and has been written on
* the store but the checkpoint record does not reflect the new
* address yet.
*/
return true;
}
if(checkpoint.getCounter() != counter.get()) {
// The counter has been modified.
return true;
}
if(root != null ) {
if (root.isDirty()) {
// The root node is dirty.
return true;
}
if(checkpoint.getRootAddr() != root.identity) {
// The root node has a different persistent identity.
return true;
}
}
/*
* No apparent change in persistent state so we do NOT need to do a
* checkpoint.
*/
return false;
// if (metadata.getMetadataAddr() != 0L && //
// (root == null || //
// ( !root.dirty //
// && checkpoint.getRootAddr() == root.identity //
// && checkpoint.getCounter() == counter.get())
// )
// ) {
//
// return false;
//
// }
//
// return true;
}
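/*
* Illustrative sketch: needsCheckpoint() lets a caller avoid writing a new
* Checkpoint record when nothing has changed since the last one. The [btree]
* variable is assumed to be a mutable, non-transient BTree.
*
* final long addr;
* if (btree.needsCheckpoint()) {
* addr = btree.writeCheckpoint(); // flush dirty nodes and write a new Checkpoint record.
* } else {
* addr = btree.getCheckpoint().getCheckpointAddr(); // reuse the last Checkpoint.
* }
*/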
/**
* Method updates the index metadata associated with this {@link BTree}.
* The new metadata record will be written out as part of the next index
* {@link #writeCheckpoint()}.
*
* Note: this method should be used with caution.
*
* @param indexMetadata
* The new metadata description for the {@link BTree}.
*
* @throws IllegalArgumentException
* if the new value is null
* @throws IllegalArgumentException
* if the new {@link IndexMetadata} record has already been
* written on the store - see
* {@link IndexMetadata#getMetadataAddr()}
* @throws UnsupportedOperationException
* if the index is read-only.
*/
final public void setIndexMetadata(final IndexMetadata indexMetadata) {
assertNotReadOnly();
if (indexMetadata == null)
throw new IllegalArgumentException();
if (indexMetadata.getMetadataAddr() != 0)
throw new IllegalArgumentException();
this.metadata = indexMetadata;
// gets us on the commit list for Name2Addr.
fireDirtyEvent();
}
/**
* Handle request for a commit by {@link #writeCheckpoint()}ing dirty nodes
* and leaves onto the store, writing a new metadata record, and returning
* the address of that metadata record.
*
* Note: In order to avoid needless writes the existing metadata record is
* always returned if {@link #needsCheckpoint()} is false.
*
* Note: The address of the existing {@link Checkpoint} record is always
* returned if {@link #getAutoCommit() autoCommit} is disabled.
*
* @return The address of a {@link Checkpoint} record from which the btree
* may be reloaded.
*/
@Override
public long handleCommit(final long commitTime) {
return writeCheckpoint2().getCheckpointAddr();
}
/**
* Remove all entries in the B+Tree.
*
* When delete markers are not enabled this simply replaces the root with a
* new root leaf and resets the counters (height, #of nodes, #of leaves,
* etc). to be consistent with that new root. If the btree is then made
* restart-safe by the commit protocol of the backing store then the effect
* is as if all entries had been deleted. Old nodes and leaves will be swept
* from the store eventually when the journal overflows.
*
* When delete markers are enabled all un-deleted entries in the index are
* overwritten with a delete marker.
*
* Note: The {@link IIndexManager} defines methods for registering (adding)
* and dropping indices vs removing the entries in an individual
* {@link BTree}.
*/
@Override
final public void removeAll() {
assertNotReadOnly();
/*
* FIXME simplify conditionals - mgc
*/
if (!getIndexMetadata().getDeleteMarkers()
&& getStore() instanceof IRWStrategy) {
/*
* Per https://sourceforge.net/apps/trac/bigdata/ticket/221, we
* special case this for the RWStore when delete markers are not
* enabled and just issue deletes against each node and leaf in the
* BTree. This is done using a post-order traversal of the nodes and
* leaves such that the parent is not removed from the store until
* its children have been removed. The deletes are low-level
* IRawStore#delete(addr) invocations without maintenance to the
* B+Tree data structures. Afterwards replaceRootWithEmptyLeaf() is
* invoked to discard the hard reference ring buffer and associate a
* new root leaf with the B+Tree.
*
* FIXME https://sourceforge.net/apps/trac/bigdata/ticket/217 - we
* should update the performance counters when we use this short
* cut to release the storage associated with the B+Tree.
*/
/*
* Visit all Nodes using a post-order traversal, but do not
* materialize the leaves.
*/
final Iterator itr = getRoot().postOrderNodeIterator(
false/* dirtyNodesOnly */, true/* nodesOnly */);
while(itr.hasNext()) {
final Node node = (Node) itr.next();
final int nchildren = node.getChildCount();
for (int i = 0; i < nchildren; i++) {
final long childAddr = node.getChildAddr(i);
if(childAddr != 0L) {
// delete persistent child.
deleteNodeOrLeaf(childAddr);
}
}
}
final long raddr = getRoot().getIdentity();
if (raddr != IRawStore.NULL) {
// delete root iff persistent.
deleteNodeOrLeaf(raddr);
}
// @todo update bytesOnStore to ZERO.
replaceRootWithEmptyLeaf();
} else if (getIndexMetadata().getDeleteMarkers()
|| getStore() instanceof IRWStrategy//
|| metadata.getRawRecords()//
) {
/*
* Visit each tuple.
*
* If deletion markers are enabled, then this will write deletion
* markers for each non-deleted entry. When the transaction commits,
* those delete markers will have to validate against the global
* state of the tree. If the transaction validates, then the merge
* down onto the global state will cause the corresponding entries
* to be removed from the global tree.
*
* If raw record support is enabled, then the raw records for the
* tuples visited will be deleted on the backing store.
*
* Note: This operation can change the tree structure by triggering
* copy-on-write for immutable node or leaves.
*/
final ITupleIterator itr = rangeIterator(null, null,
0/* capacity */, REMOVEALL/* flags */, null/* filter */);
while (itr.hasNext()) {
itr.next();
}
} else {
replaceRootWithEmptyLeaf();
}
}
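/*
* Illustrative sketch: removing all tuples and making the change restart safe.
* Whether tuples are deleted in place (delete markers) or the root is simply
* replaced depends on the IndexMetadata and the backing store, as described
* above. The counter exposed by getCounter() is NOT reset.
*
* btree.removeAll(); // discard all entries.
* btree.writeCheckpoint(); // make the empty state restart safe.
*/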
/**
* Clears the hard reference cache and replaces the root node with an empty
* root leaf. This is a low level method.
*/
private void replaceRootWithEmptyLeaf() {
/*
* Clear the hard reference cache (sets the head, tail and count to
* zero).
*
* Note: This is important since the cache will typically contain
* dirty nodes and leaves. If those get evicted then an exception
* will be thrown since their parents are not grounded on the new
* root leaf.
*
* Note: By clearing the references to null we also facilitate
* garbage collection of the nodes and leaves in the cache.
*/
writeRetentionQueue.clear(true/* clearRefs */);
ndistinctOnWriteRetentionQueue = 0;
// if (readRetentionQueue != null) {
//
// // Note: MUST NOT clear the read retention queue if global!
// readRetentionQueue.clear(true/* clearRefs */);
//
// }
/*
* Replace the root with a new root leaf.
*/
newRootLeaf();
}
/**
* Create a new checkpoint for a mutable {@link BTree} in which the view is
* redefined to include the previous view of the {@link BTree} (the one from
* which this {@link BTree} instance was loaded) plus the current view of
* the {@link BTree}. The root of the {@link BTree} is replaced with an
* empty leaf as part of this operation. The {@link LocalPartitionMetadata}
* is updated to reflect the new view.
*
* Note: This method is used by the scale-out architecture for which it
* performs a very specific function. It should not be used for any other
* purpose and should not be invoked by user code. This encapsulates all of
* the trickery for creating the necessary checkpoint without exposing any
* methods which could be used to replace the root node with an empty root
* leaf.
*
* @return The timestamp associated with the checkpoint record from which
* this {@link BTree} was loaded. This is the timestamp that was
* placed on the 2nd element of the view. Any read-only build or
* merge task will read from this timestamp.
*
* @throws IllegalStateException
* if the {@link BTree} is read only.
* @throws IllegalStateException
* if the {@link BTree} is dirty.
* @throws IllegalStateException
* if the {@link BTree} has never been committed.
*/
public long createViewCheckpoint() {
if(isReadOnly()) {
/*
* This can only be done for the mutable BTree.
*/
throw new IllegalStateException();
}
if(needsCheckpoint()) {
/*
* It is an error if any writes are buffered when you call this
* method. While not strictly a problem, this is indicative that the
* caller does not understand what the method is supposed to do.
*/
throw new IllegalStateException();
}
/*
* The commitTime associated with the commit point from which we loaded
* the BTree view.
*/
final long priorCommitTime = getLastCommitTime();
if (priorCommitTime == 0L) {
/*
* The BTree has never been committed.
*/
throw new IllegalStateException();
}
/*
* Redefine the view.
*/
{
// clone the current metadata record for the live index.
final IndexMetadata indexMetadata = getIndexMetadata().clone();
final LocalPartitionMetadata oldPmd = indexMetadata
.getPartitionMetadata();
final IResourceMetadata[] oldResources = oldPmd.getResources();
/*
* The new view uses the current journal in both the 1st and 2nd
* positions.
*/
final IResourceMetadata[] newResources = new IResourceMetadata[oldResources.length + 1];
// unchanged.
newResources[0] = oldResources[0];
// the same store, but reading from the same commit time as this
// view of this BTree.
newResources[1] = new JournalMetadata((AbstractJournal) getStore(),
priorCommitTime);
// any other stores in the view are copied.
for (int i = 1; i < oldResources.length; i++) {
newResources[i + 1] = oldResources[i];
}
final LocalPartitionMetadata newPmd = new LocalPartitionMetadata(
oldPmd.getPartitionId(), // partitionId
-1, // sourcePartitionId
oldPmd.getLeftSeparatorKey(), //
oldPmd.getRightSeparatorKey(),//
newResources,//
oldPmd.getIndexPartitionCause()//
// "" // history is deprecated.
);
// update the local partition metadata on our cloned IndexMetadata.
indexMetadata.setPartitionMetadata(newPmd);
// update the metadata associated with the btree so it will get written out.
setIndexMetadata(indexMetadata);
// verify BTree is now dirty.
assert needsCheckpoint();
}
/*
* Replace the root node with an empty root leaf.
*/
{
replaceRootWithEmptyLeaf();
}
return priorCommitTime;
}
/**
* Create a new {@link BTree} or derived class. This method works by writing
* the {@link IndexMetadata} record on the store and then loading the
* {@link BTree} from the {@link IndexMetadata} record.
*
* @param store
* The store.
*
* @param metadata
* The metadata record.
*
* @return The newly created {@link BTree}.
*
* @see #load(IRawStore, long, boolean)
*
* @exception IllegalStateException
* If you attempt to create two btree objects from the same
* metadata record since the metadata address will have
* already been noted on the {@link IndexMetadata} object.
* You can use {@link IndexMetadata#clone()} to obtain a new
* copy of the metadata object with the metadata address set
* to 0L.
* @exception IllegalStateException
* if the {@link IndexTypeEnum} in the supplied
* {@link IndexMetadata} object is not
* {@link IndexTypeEnum#BTree}.
*/
public static BTree create(final IRawStore store, final IndexMetadata metadata) {
if (metadata.getMetadataAddr() != 0L) {
throw new IllegalStateException("Metadata record already in use");
}
if (metadata.getIndexType() != IndexTypeEnum.BTree) {
throw new IllegalStateException("Wrong index type: "
+ metadata.getIndexType());
}
/*
* Write metadata record on store. The address of that record is set as
* a side-effect on the metadata object.
*/
metadata.write(store);
/*
* Create checkpoint for the new B+Tree.
*/
final Checkpoint firstCheckpoint = metadata.firstCheckpoint();
/*
* Write the checkpoint record on the store. The address of the
* checkpoint record is set on the object as a side effect.
*/
firstCheckpoint.write(store);
/*
* Load the B+Tree from the store using that checkpoint record. There is
* no root so a new root leaf will be created when the B+Tree is opened.
*/
return load(store, firstCheckpoint.getCheckpointAddr(), false/* readOnly */);
}
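/*
* Illustrative sketch (the [store] variable and the byte[]-based insert call
* inherited from AbstractBTree are assumptions about the surrounding API, not
* part of this file): creating a new mutable B+Tree against an IRawStore and
* checkpointing it.
*
* final IndexMetadata md = new IndexMetadata(UUID.randomUUID()); // assumed ctor.
* final BTree btree = BTree.create(store, md); // [store] is some IRawStore.
* btree.insert(new byte[]{1}, new byte[]{42}); // unsigned byte[] key and value.
* final long checkpointAddr = btree.writeCheckpoint();
* // Later: BTree.load(store, checkpointAddr, true) returns a read-only view.
*/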
/**
* Create a new {@link BTree} or derived class that is fully transient (NO
* backing {@link IRawStore}).
*
* Fully transient {@link BTree}s provide the functionality of a B+Tree
* without a backing persistence store. Internally, reachable nodes and
* leaves of the transient {@link BTree} use hard references to ensure that
* remain strongly reachable. Deleted nodes and leaves simply clear their
* references and will be swept by the garbage collector shortly thereafter.
*
* Operations which attempt to write on the backing store will fail.
*
* While nodes and leaves are never persisted, the keys and values of the
* transient {@link BTree} are unsigned byte[]s. This means that application
* keys and values are always converted into unsigned byte[]s before being
* stored in the {@link BTree}. Hence if an object is inserted into
* the {@link BTree} and then looked up using the {@link BTree} API, you
* WILL NOT get back the same object reference.
*
* Note: CLOSING A TRANSIENT INDEX WILL DISCARD ALL DATA!
*
* @param metadata
* The metadata record.
*
* @return The transient {@link BTree}.
*/
@SuppressWarnings("unchecked")
public static BTree createTransient(final IndexMetadata metadata) {
/*
* Create checkpoint for the new B+Tree.
*/
final Checkpoint firstCheckpoint = metadata.firstCheckpoint();
/*
* Create B+Tree object instance.
*/
try {
final Class cl = Class.forName(metadata.getBTreeClassName());
/*
* Note: A NoSuchMethodException thrown here means that you did not
* declare the required public constructor for a class derived from
* BTree.
*/
final Constructor ctor = cl.getConstructor(new Class[] {
IRawStore.class,//
Checkpoint.class,//
IndexMetadata.class,//
Boolean.TYPE//
});
final BTree btree = (BTree) ctor.newInstance(new Object[] { //
null , // store
firstCheckpoint, //
metadata, //
false// readOnly
});
// create the root node.
btree.reopen();
return btree;
} catch (Exception ex) {
throw new RuntimeException(ex);
}
}
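/*
* Illustrative sketch: a fully transient B+Tree. Keys and values are stored as
* unsigned byte[]s, so the reference returned by a lookup is not the reference
* that was inserted. The byte[]-based insert/lookup calls inherited from
* AbstractBTree are assumed here, as are the [key] and [val] byte[] variables.
*
* final BTree tmp = BTree.createTransient(new IndexMetadata(UUID.randomUUID()));
* tmp.insert(key, val); // key/val are byte[]s.
* final byte[] copy = tmp.lookup(key); // equal bytes, but a different reference.
*/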
// /**
// * Load an instance of a {@link BTree} or derived class from the store. The
// * {@link BTree} or derived class MUST declare a constructor with the
// * following signature:
// *
// * className(IRawStore store, Checkpoint checkpoint, BTreeMetadata metadata, boolean readOnly)
// *
// *
// *
// * @param store
// * The store.
// *
// * @param addrCheckpoint
// * The address of a {@link Checkpoint} record for the index.
// *
// * @return The {@link BTree} or derived class loaded from that
// * {@link Checkpoint} record.
// *
// * @deprecated by {@link #load(IRawStore, long, boolean)} which specifies
// * whether or not the {@link BTree} will be opened in a
// * read-only mode and therefore allows for certain
// * optimizations within the {@link BTree} constructor.
// */
// public static BTree load(final IRawStore store, final long addrCheckpoint) {
//
// return load(store, addrCheckpoint, false/* readOnly */);
//
// }
/**
* Load an instance of a {@link BTree} or derived class from the store. The
* {@link BTree} or derived class MUST declare a constructor with the
* following signature:
*
* className(IRawStore store, Checkpoint checkpoint, IndexMetadata metadata, boolean readOnly)
*
*
*
* @param store
* The store.
* @param addrCheckpoint
* The address of a {@link Checkpoint} record for the index.
* @param readOnly
* When true the {@link BTree} will be marked as
* read-only. Marking has some advantages relating to the locking
* scheme used by {@link Node#getChild(int)} since the root node
* is known to be read-only at the time that it is allocated as
* per-child locking is therefore in place for all nodes in the
* read-only {@link BTree}. It also results in much higher
* concurrency for {@link AbstractBTree#touch(AbstractNode)}.
*
* @return The {@link BTree} or derived class loaded from that
* {@link Checkpoint} record.
*
* @throws IllegalArgumentException
* if store is null.
*/
@SuppressWarnings("unchecked")
public static BTree load(final IRawStore store, final long addrCheckpoint,
final boolean readOnly) {
if (store == null)
throw new IllegalArgumentException();
/*
* Read checkpoint record from store.
*/
final Checkpoint checkpoint;
try {
checkpoint = Checkpoint.load(store, addrCheckpoint);
} catch (Throwable t) {
throw new RuntimeException("Could not load Checkpoint: store="
+ store + ", addrCheckpoint="
+ store.toString(addrCheckpoint), t);
}
if (checkpoint.getIndexType() != IndexTypeEnum.BTree)
throw new RuntimeException("Not a BTree checkpoint: " + checkpoint);
/*
* Read metadata record from store.
*/
final IndexMetadata metadata;
try {
metadata = IndexMetadata.read(store, checkpoint.getMetadataAddr());
} catch (Throwable t) {
throw new RuntimeException("Could not read IndexMetadata: store="
+ store + ", checkpoint=" + checkpoint, t);
}
if (INFO) {
// Note: this is the scale-out index name for a partitioned index.
final String name = metadata.getName();
log.info((name == null ? "" : "name=" + name + ", ")
+ "readCheckpoint=" + checkpoint);
}
/*
* Create B+Tree object instance.
*/
try {
final Class cl = Class.forName(metadata.getBTreeClassName());
/*
* Note: A NoSuchMethodException thrown here means that you did not
* declare the required public constructor for a class derived from
* BTree.
*/
final Constructor ctor = cl.getConstructor(new Class[] {
IRawStore.class,//
Checkpoint.class,//
IndexMetadata.class, //
Boolean.TYPE
});
final BTree btree = (BTree) ctor.newInstance(new Object[] { //
store,//
checkpoint, //
metadata, //
readOnly
});
// if (readOnly) {
//
// btree.setReadOnly(true);
//
// }
// read the root node.
btree.reopen();
return btree;
} catch (Exception ex) {
throw new RuntimeException(ex);
}
}
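/*
* Illustrative sketch: reloading a BTree from a Checkpoint address, e.g., one
* previously returned by writeCheckpoint(). The [store] and [checkpointAddr]
* variables are assumptions.
*
* // Mutable (unisolated) view.
* final BTree live = BTree.load(store, checkpointAddr, false); // readOnly := false
* // Read-only view of the same checkpoint (allows higher read concurrency).
* final BTree view = BTree.load(store, checkpointAddr, true); // readOnly := true
*/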
/**
* Factory for mutable nodes and leaves used by the {@link NodeSerializer}.
*/
protected static class NodeFactory implements INodeFactory {
public static final INodeFactory INSTANCE = new NodeFactory();
private NodeFactory() {
}
@Override
public Leaf allocLeaf(final AbstractBTree btree, final long addr,
final ILeafData data) {
return new Leaf(btree, addr, data);
}
@Override
public Node allocNode(final AbstractBTree btree, final long addr,
final INodeData data) {
return new Node(btree, addr, data);
}
}
/**
* Mutable counter.
*
* @author Bryan Thompson
*/
public static class Counter implements ICounter {
private final BTree btree;
public Counter(final BTree btree) {
if (btree == null)
throw new IllegalArgumentException();
this.btree = btree;
}
@Override
public long get() {
return btree.counter.get();
}
@Override
public long incrementAndGet() {
final long counter = btree.counter.incrementAndGet();
if (counter == btree.checkpoint.getCounter() + 1) {
/*
* The first time the counter is incremented beyond the value in
* the checkpoint record we fire off a dirty event to put the
* BTree on the commit list.
*/
btree.fireDirtyEvent();
}
if (counter == 0L) {
/*
* The counter has wrapped back to ZERO.
*/
throw new RuntimeException("Counter overflow");
}
return counter;
}
} // class Counter
/**
* Places the counter values into a namespace formed by the partition
* identifier. The partition identifier is found in the high int32 word and
* the counter value from the underlying {@link BTree} is found in the low
* int32 word.
*
* @author Bryan Thompson
*/
public static class PartitionedCounter implements ICounter {
private final int partitionId;
private final ICounter src;
private static final long MAX_LOCAL_COUNTER = 1L << 32;
public PartitionedCounter(final int partitionId, final ICounter src) {
if (src == null)
throw new IllegalArgumentException();
this.partitionId = partitionId;
this.src = src;
}
/**
* Range checks the source counter (it must fit in the lower 32-bit
* word) and then jams the partition identifier into the high word.
*
* @param tmp
* The source counter value.
*
* @return The partition-specific counter value.
*
* @throws RuntimeException
* if the source counter has overflowed.
*
* @see TermIdEncoder#combine(pid,ctr), which performs the same
* operation and MUST be consistent with this method.
*/
private long wrap(final long tmp) {
if (tmp >= MAX_LOCAL_COUNTER) {
/*
* Note: Checkpoint MUST persist the raw counter in scale-out, or else
* this code will observe the partitionId in the high word and
* throw an exception. The whole point of this check is to
* verify that the underlying index local counter has not
* overflowed. Checkpoint must persist the raw counter for us
* to test this.
*/
throw new RuntimeException("Counter overflow: counter=" + tmp
+ ", pid=" + getPartitionId(tmp) + ", ctr="
+ getLocalCounter(tmp));
}
return combine(partitionId, (int) tmp);
// /*
// * Place the partition identifier into the high int32 word and place
// * the truncated counter value into the low int32 word.
// */
//
//// return (((long)partitionId) << 32) | (int) tmp;
// return ((long) partitionId) << 32 | (0xFFFFFFFFL & (long) tmp);
}
@Override
public long get() {
return wrap( src.get() );
}
@Override
public long incrementAndGet() {
return wrap(src.incrementAndGet());
}
/**
* Return the partition identifier from the high word of a partitioned
* counter.
*
* @param v
* The partitioned counter.
*
* @return The high word.
*/
public static int getPartitionId(final long v) {
return (int) (v >>> 32);
}
/**
* Return the local counter from the low word of a partitioned counter.
*
* @param v
* The partitioned counter.
*
* @return The low word.
*/
public static int getLocalCounter(final long v) {
return (int) v;
}
/**
* Combines the partition identifier and the local counter using the same
* logic as the {@link PartitionedCounter}.
*
* @param pid
* The partition identifier.
* @param ctr
* The local counter.
*
* @return The long counter assembled from those values.
*/
public static long combine(final int pid, final int ctr) {
return ((long) pid) << 32 | (0xFFFFFFFFL & (long) ctr);
}
} // class PartitionedCounter
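/*
* Illustrative sketch: round-tripping a partitioned counter value using the
* static helpers on PartitionedCounter.
*
* final long v = PartitionedCounter.combine(7, 12); // pid := 7, local counter := 12.
* assert PartitionedCounter.getPartitionId(v) == 7;
* assert PartitionedCounter.getLocalCounter(v) == 12;
*/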
/**
* Returns ONE (1).
*/
@Override
final public int getSourceCount() {
return 1;
}
/**
* An array containing this {@link BTree}.
*/
@Override
final public AbstractBTree[] getSources() {
return new AbstractBTree[]{this};
}
@Override
final public BTree getMutableBTree() {
return this;
}
@Override
public LeafCursor newLeafCursor(final SeekEnum where) {
return new LeafCursor(where);
}
@Override
public LeafCursor newLeafCursor(final byte[] key) {
return new LeafCursor(key);
}
/**
* A simple stack based on an array used to maintain hard references for the
* parent {@link Node}s in the {@link LeafCursor}. This class is optimized
* for light-weight push/pop and copy operations. In particular, the copy
* operation is used to support atomic state changes for
* {@link LeafCursor#prior()} and {@link LeafCursor#next()}.
*
* @author Bryan Thompson
*/
protected static class Stack {
/** backing array used to store the elements in the stack. */
private Node[] a;
/** #of elements in the stack. */
private int n = 0;
/**
* Stack with initial capacity of 10.
*
* Note: The initial capacity is relatively small since a B+Tree of
* depth 10 will have a very large number of tuples even with a small
* branching factor.
*/
public Stack() {
this( 10 );
}
public Stack(final int capacity) {
a = new Node[capacity];
}
/**
* The size of the backing array.
*/
final public int capacity() {
return a.length;
}
/**
* The #of elements on the stack.
*/
final public int size() {
return n;
}
/**
* Push an element onto the stack.
*
* @param item
* The element (required).
*/
public void push(final Node item) {
assert item != null;
if (n == a.length) {
// extend the size of the backing array.
final Node[] t = new Node[a.length * 2];
// copy the data into the new array.
System.arraycopy(a, 0, t, 0, n);
// replace the backing array.
a = t;
}
a[n++] = item;
}
/**
* Pop an element off of the stack.
*
* @return The element that was on the top of the stack.
*
* @throws IllegalStateException
* if the stack is empty.
*/
public Node pop() {
if (n == 0)
throw new IllegalStateException();
final Node item = a[--n];
/*
* Note: it is important to clear the reference since this stack
* is being used to manage weak references to parent nodes.
*/
a[n] = null;
return item;
}
/**
* Return the element on the top of the stack.
*
* @throws IllegalStateException
* if the stack is empty.
*/
public Node peek() {
if (n == 0)
throw new IllegalStateException();
return a[n-1];
}
/**
* Replace the state of this {@link Stack} with the state of the source
* {@link Stack}.
*
* @param src
* The source {@link Stack}.
*/
public void copyFrom(final Stack src) {
assert src != null;
if (src.n > a.length) {
// new backing array of same size as the source's backing array.
a = new Node[src.a.length];
} else {
clear();
}
// copy the data into the backing array.
System.arraycopy(src.a, 0, a, 0, src.n);
// update the size of the stack.
n = src.n;
}
/**
* Clear the stack.
*/
public void clear() {
while (n > 0) {
/*
* Again, clearing the reference is important since we are
* managing weak reference reachability with this stack.
*/
a[--n] = null;
}
}
}
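/*
* Illustrative sketch: how the LeafCursor (below) uses this Stack to keep hard
* references to the ancestors of the current leaf and to make prior()/next()
* atomic with respect to the stack state. The [root] and [childNode] variables
* are assumptions standing in for Nodes discovered during top-down navigation.
*
* final Stack path = new Stack();
* path.push(root); // the root Node of the B+Tree.
* path.push(childNode); // ... down to the parent of the target leaf.
* final Stack saved = new Stack(path.capacity());
* saved.copyFrom(path); // snapshot before attempting prior()/next().
* final Node parent = path.peek(); // parent of the current leaf.
*/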
/**
* A cursor that may be used to traverse {@link Leaf}s.
*
* Note: Instances of this class do NOT register an {@link ILeafListener}
* and therefore do NOT notice if mutation causes the current leaf to become
* invalid. In general, you need to have a specific key in mind
* in order to re-locate the appropriate leaf after such mutation events.
*
* Note: The {@link MutableBTreeTupleCursor} does register such listeners.
*
* @author Bryan Thompson
*/
public class LeafCursor implements ILeafCursor {
/**
* A stack containing the ordered ancestors of the current leaf. The
* root of the B+Tree will always be on the bottom of the stack. Nodes
* are pushed onto the stack during top-down navigation by the various
* methods that (re-)locate the current leaf.
*
* The purpose of this stack is to ensure that the ancestors of the
* current leaf remain strongly reachable while the cursor is positioned
* on that leaf. This is necessary in order for the methods that change
* the cursor position ({@link #next()} and {@link #prior()}) to succeed
* since they depend on the parent references which could otherwise be
* cleared (parent references of nodes and leaves are weak references).
* This means that hard references MUST be held to all {@link Node}s
* that are ancestors of the current {@link Leaf}. This criterion is
* satisfied by normal top-down navigation since the hard references are
* on the stack frame. However, non-recursive constructions violate this
* requirement since the hard references to the parents are not on the
* stack frame. This {@link Stack} of {@link Node}s is therefore
* maintained by all of the methods on this class such that we always
* have hard references for the ancestors of the leaf that is being
* visited.
*
* Another benefit of this stack is that it ensures that not only the
* current leaf but also all nodes above the current leaf remain
* strongly reachable and hence require NO IOs. This should have the
* most pronounced effect on a read-only {@link BTree} with a large
* number of concurrent cursors as the hard reference queue for the
* index might otherwise let the relevant nodes become weakly reachable.
* It will have less of a performance effect for a mutable {@link BTree}
* since that class is single-threaded for writers.
*
* Note: The {@link IndexSegment} handles this differently since it has
* the address of the prior and next leaf on hand and does not need the
* parent references for navigation.
*/
private Stack stack = new Stack();
/**
* {@link #prior()} and {@link #next()} use this in order to make their
* operations on the {@link #stack} atomic. If they are unable to find
* the prior/next leaf (because the cursor is already on the first/last
* leaf) then they undo their operations on the stack by restoring it
* from this copy.
*/
private Stack backup = null;
/**
* Save a copy of the {@link #stack}.
*/
// @SuppressWarnings("unchecked")
private void backup() {
assert stack != null;
if (backup == null) {
backup = new Stack(stack.capacity());
}
backup.copyFrom(stack);
}
/**
* Restore the {@link #stack} from its backup copy.
*/
private void restore() {
assert backup != null;
stack = backup;
backup = null;
}
/**
* The current leaf (always defined).
*/
private Leaf leaf;
@Override
public Leaf leaf() {
return leaf;
}
@Override
public BTree getBTree() {
return BTree.this;
}
@Override
public LeafCursor clone() {
return new LeafCursor(this);
}
/**
* Copy constructor used by {@link #clone()}.
*/
private LeafCursor(final LeafCursor src) {
if (src == null)
throw new IllegalArgumentException();
// copy the stack state from the source cursor.
stack = new Stack(src.stack.capacity());
stack.copyFrom(src.stack);
leaf = src.leaf();
}
public LeafCursor(final SeekEnum where) {
switch (where) {
case First:
first();
break;
case Last:
last();
break;
default:
throw new AssertionError("Unknown seek directive: " + where);
}
}
public LeafCursor(final byte[] key) {
seek(key);
}
@Override
public Leaf first() {
stack.clear();
AbstractNode<?> node = getRoot();
while (!node.isLeaf()) {
final Node n = (Node) node;
stack.push(n);
// left-most child.
node = n.getChild(0);
}
return leaf = (Leaf)node;
}
@Override
public Leaf last() {
stack.clear();
AbstractNode<?> node = getRoot();
while (!node.isLeaf()) {
final Node n = (Node) node;
stack.push(n);
// right-most child.
node = n.getChild(n.getKeyCount());
}
return leaf = (Leaf)node;
}
/**
* Descend from the root node to the leaf spanning that key. Note that
* the leaf may not actually contain the key, in which case it is the
* leaf that contains the insertion point for the key.
*/
@Override
public Leaf seek(final byte[] key) {
stack.clear();
AbstractNode<?> node = getRoot();
while(!node.isLeaf()) {
final Node n = (Node)node;
final int index = n.findChild(key);
stack.push( n );
node = n.getChild( index );
}
return leaf = (Leaf)node;
}
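/*
 * Usage sketch (illustrative only, not part of the original source):
 * positioning a cursor on the leaf spanning a given key. The key bytes
 * below are arbitrary; real keys come from the index's key coder.
 *
 *   final byte[] key = new byte[] { 1, 2, 3 };
 *   final Leaf leaf = cursor.seek(key);
 *   // The leaf spans the key but may not contain an entry for it.
 */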
@Override
public Leaf seek(final ILeafCursor src) {
if (src == null)
throw new IllegalArgumentException();
if (src == this) {
// NOP
return leaf;
}
if (src.getBTree() != BTree.this) {
throw new IllegalArgumentException();
}
// copy the stack state from the source cursor.
stack.copyFrom(((LeafCursor) src).stack);
return leaf = src.leaf();
}
@Override
public Leaf next() {
// make sure that the current leaf is valid.
if (leaf.isDeleted())
throw new IllegalStateException("deleted");
// save a copy of the stack.
backup();
/*
* Starting with the current leaf, ascend until there is a
* right-sibling of the current child.
*/
AbstractNode<?> sibling = null;
{
AbstractNode<?> child = leaf;
Node p = child.getParent();
while (true) {
if (p == null) {
/*
* No right-sibling (must be the last leaf).
*/
// undo changes to the stack.
restore();
// do not change the cursor position.
return null;
}
sibling = p.getRightSibling(child, true/*materialize*/);
if(sibling != null) break;
if (p != stack.pop())
throw new AssertionError();
// check the parent.
child = p;
p = p.getParent();
}
}
/*
* Recursive descent to the left-most child.
*/
{
while(!sibling.isLeaf()) {
stack.push((Node)sibling);
sibling = ((Node) sibling).getChild(0);
}
}
return leaf = (Leaf) sibling;
}
/**
* Materialize the prior leaf in the natural order of the index (this is
* more general than the left sibling, which is restricted to leaves that
* are children of the same direct parent). The algorithm is:
*
* - Recursive ascent via the parent until there is a left-sibling of
* the child node (the index of the child in the parent must be GT
* zero). If we reach the root and there is no left-sibling then we are
* already on the left-most leaf and we are done.
*
* - t = leftSibling
*
* - while(!t.isLeaf()) right-descend
*
* As we go up we pop each parent off of the hard reference stack.
*
* As we go down we add each non-leaf node to the hard reference stack.
*
* In order to make this operation atomic, a copy of the stack is saved
* before the ascent and restored if there is no prior leaf.
*
* @return The prior leaf -or- null if there is no predecessor of this
* leaf.
*/
@Override
public Leaf prior() {
// make sure that the current leaf is valid.
if (leaf.isDeleted())
throw new IllegalStateException("deleted");
// save a copy of the stack.
backup();
/*
* Starting with the current leaf, ascend until there is a
* left-sibling of the current child.
*/
AbstractNode<?> sibling = null;
{
AbstractNode<?> child = leaf;
Node p = child.getParent();
while (true) {
if(p == null) {
/*
* No left-sibling (must be the first leaf).
*/
// undo changes to the stack.
restore();
// do not change the cursor position.
return null;
}
sibling = p.getLeftSibling(child, true/*materialize*/);
if(sibling != null) break;
if (p != stack.pop())
throw new AssertionError();
// check the parent.
child = p;
p = p.getParent();
}
}
/*
* Recursive descent to the right-most child.
*/
{
while(!sibling.isLeaf()) {
stack.push((Node)sibling);
sibling = ((Node) sibling)
.getChild(sibling.getKeyCount());
}
}
return leaf = (Leaf) sibling;
}
}
}