/**
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
[email protected]
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on Nov 15, 2006
*/
package com.bigdata.btree;
import java.io.PrintStream;
import java.nio.ByteBuffer;
import java.util.Iterator;
import java.util.WeakHashMap;
import org.apache.log4j.Level;
import com.bigdata.btree.data.DefaultLeafCoder;
import com.bigdata.btree.data.ILeafData;
import com.bigdata.btree.filter.EmptyTupleIterator;
import com.bigdata.btree.isolation.IsolatedFusedView;
import com.bigdata.btree.raba.IRaba;
import com.bigdata.btree.raba.MutableKeyBuffer;
import com.bigdata.btree.raba.MutableValueBuffer;
import com.bigdata.io.AbstractFixedByteArrayBuffer;
import com.bigdata.journal.ITransactionService;
import com.bigdata.rawstore.IRawStore;
import com.bigdata.util.BytesUtil;
import cutthecrap.utils.striterators.EmptyIterator;
import cutthecrap.utils.striterators.SingleValueIterator;
/**
*
* A B+-Tree leaf.
*
* <h2>Tuple revision timestamps</h2>
*
* When tuple revision timestamps are maintained, they must be propagated to the
* parents if we insert or remove a tuple, but also need to be propagated if we
* update a tuple in a manner which changes the min/max version timestamp. This
* is done either by {@link Node#updateEntryCount(AbstractNode, int)}, when the
* #of tuples in the leaf was changed, or by
* {@link Node#updateMinMaxVersionTimestamp(AbstractNode)} when the #of tuples
* in the leaf is unchanged.
*
* The {@link #getMinimumVersionTimestamp()} and
* {@link #getMaximumVersionTimestamp()} can be used to efficiently filter
* iterators so as to only visit those nodes and leaves which have updates for
* some revision timestamp range. This filtering is effective because if we
* reject a node as not having data for the revision range of interest, then we
* do not need to consider any of the nodes or leaves spanned by that node.
*
* Note that revision timestamps ARE NOT commit timestamps. See
* {@link ITransactionService} and {@link IsolatedFusedView} for more about
* this and how to obtain and work with revision timestamps.
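*
* For example, a revision-range filter might prune an entire leaf using the
* min/max version timestamps. A minimal sketch (not part of this class;
* fromRev and toRev are hypothetical bounds supplied by the caller):
*
* <pre>
* static boolean mayHaveTuplesInRevisionRange(final Leaf leaf,
*         final long fromRev, final long toRev) {
*     if (!leaf.hasVersionTimestamps())
*         return true; // timestamps are not maintained, so we can not prune.
*     // Prune unless the leaf's [min,max] interval intersects [fromRev,toRev).
*     return leaf.getMaximumVersionTimestamp() >= fromRev
*             && leaf.getMinimumVersionTimestamp() < toRev;
* }
* </pre>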
*
* @author Bryan Thompson
*/
public class Leaf extends AbstractNode<Leaf> implements ILeafData, IRawRecordAccess {
/**
* The data record. {@link MutableLeafData} is used for all mutation
* operations. {@link ReadOnlyLeafData} is used when the {@link Leaf} is
* made persistent. A read-only data record is automatically converted into
* a {@link MutableLeafData} record when a mutation operation is requested.
*
* Note: This is package private in order to expose it to {@link Node}.
*/
ILeafData data;
/**
* Return (branchingFactor + 1) >> 1, which is the minimum #of keys for a
* {@link Leaf}.
*
* Note: the root may have fewer keys.
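*
* For example, with branchingFactor = 4 a non-root leaf must retain at
* least (4 + 1) >> 1 = 2 keys.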
*/
@Override
protected final int minKeys() {
// /*
// * Compute the minimum #of children/values. This is the same whether
// * this is a Node or a Leaf.
// */
// final int minChildren = (btree.branchingFactor + 1) >> 1;
//
// // this.minKeys = isLeaf() ? minChildren : minChildren - 1;
//
// return minChildren;
return btree.minChildren;
}
/**
* Return branchingFactor, which is the maximum #of keys for a
* {@link Leaf}.
*/
@Override
protected final int maxKeys() {
// // The maximum #of keys is easy to compute.
// this.maxKeys = isLeaf() ? branchingFactor : branchingFactor - 1;
return btree.branchingFactor;
}
@Override
final public ILeafData getDelegate() {
return data;
}
/*
* ILeafData
*/
/**
* Always returns true.
*/
@Override
final public boolean isLeaf() {
return true;
}
/**
* The result depends on the implementation. The {@link Leaf} will be
* mutable when it is first created and is made immutable when it is
* persisted. If there is a mutation operation, the backing
* {@link ILeafData} is automatically converted into a mutable instance.
*/
@Override
final public boolean isReadOnly() {
return data.isReadOnly();
}
@Override
final public boolean isCoded() {
return data.isCoded();
}
@Override
final public AbstractFixedByteArrayBuffer data() {
return data.data();
}
@Override
final public boolean getDeleteMarker(final int index) {
return data.getDeleteMarker(index);
}
@Override
final public int getKeyCount() {
return data.getKeyCount();
}
@Override
// See https://sourceforge.net/apps/trac/bigdata/ticket/550 (NPE in Leaf.getKey())
final public IRaba getKeys() {
if(data==null) throw new NullPointerException("leaf="+toString());
return data.getKeys();
}
// final public int getSpannedTupleCount() {
//
// return data.getSpannedTupleCount();
//
// }
@Override
final public int getValueCount() {
return data.getValueCount();
}
@Override
final public IRaba getValues() {
return data.getValues();
}
/**
* Convenience method returns the byte[] for the given index in the leaf. If
* the tuple at that index is a raw record, then the record is read from the
* backing store. When copying the value into a tuple, prefer the more
* efficient {@link AbstractTuple#copy(int, Leaf)}.
*
* @param index
* The index in the leaf.
*
* @return The data.
*
* @see AbstractTuple#copy(int, Leaf)
*/
public byte[] getValue(final int index) {
if (!hasRawRecords()) {
return getValues().get(index);
}
final long addr = getRawRecord(index);
if( addr == IRawStore.NULL) {
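// A NULL address means the value is stored inline in the leaf's values raba.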
return getValues().get(index);
}
final ByteBuffer tmp = btree.readRawRecord(addr);
if (tmp.hasArray() && tmp.arrayOffset() == 0 && tmp.position() == 0
&& tmp.limit() == tmp.capacity()) {
/*
* Return the backing array.
*/
return tmp.array();
}
/*
* Copy the data into a byte[].
*/
final int len = tmp.remaining();
final byte[] a = new byte[len];
tmp.get(a);
return a;
}
final public long getVersionTimestamp(final int index) {
return data.getVersionTimestamp(index);
}
final public long getRawRecord(final int index) {
return data.getRawRecord(index);
}
final public boolean hasDeleteMarkers() {
return data.hasDeleteMarkers();
}
final public boolean hasVersionTimestamps() {
return data.hasVersionTimestamps();
}
final public long getMinimumVersionTimestamp() {
return data.getMinimumVersionTimestamp();
}
final public long getMaximumVersionTimestamp() {
return data.getMaximumVersionTimestamp();
}
final public boolean hasRawRecords() {
return data.hasRawRecords();
}
final public boolean isDoubleLinked() {
return data.isDoubleLinked();
}
final public long getPriorAddr() {
return data.getPriorAddr();
}
final public long getNextAddr() {
return data.getNextAddr();
}
/**
* De-serialization constructor.
*
* Note: The de-serialization constructor (and ONLY the de-serialization
* constructor) ALWAYS creates a clean leaf. Therefore the {@link PO#dirty}
* flag passed up from this constructor has the value false.
*
* @param btree
* The tree to which the leaf belongs.
* @param addr
* The address of this leaf.
* @param data
* The data record.
*/
protected Leaf(final AbstractBTree btree, final long addr,
final ILeafData data) {
super(btree, false /* The leaf is NOT dirty. */);
assert data != null;
/*
* Cross check flags against the B+Tree when we wrap the record in a
* Leaf.
*/
assert data.hasDeleteMarkers() == btree.getIndexMetadata()
.getDeleteMarkers();
assert data.hasVersionTimestamps() == btree.getIndexMetadata()
.getVersionTimestamps();
setIdentity(addr);
this.data = data;
// // must clear the dirty since we just de-serialized this leaf.
// setDirty(false);
// // Add to the hard reference queue.
// btree.touch(this);
}
/**
* Creates a new mutable leaf.
*
* @param btree
* A mutable B+Tree.
*/
protected Leaf(final AbstractBTree btree) {
super(btree, true /*dirty*/ );
final IndexMetadata md = btree.getIndexMetadata();
data = new MutableLeafData(//
btree.branchingFactor, //
md.getVersionTimestamps(),//
md.getDeleteMarkers(),//
md.getRawRecords()//
);
// final int branchingFactor = btree.branchingFactor;
//
// this.keys = new MutableKeyBuffer(branchingFactor + 1);
//
// values = new MutableValueBuffer(0/* size */,
// new byte[branchingFactor + 1][]);
//
// if(btree.getIndexMetadata().getVersionTimestamps()) {
//
// versionTimestamps = new long[branchingFactor + 1];
//
// }
//
// if (btree.getIndexMetadata().getDeleteMarkers()) {
//
// deleteMarkers = new boolean[branchingFactor + 1];
//
// }
// /*
// * Add to the hard reference queue. If the queue is full, then this will
// * force the incremental write whatever gets evicted from the queue.
// */
// btree.touch(this);
}
/**
* Copy constructor.
*
* @param src
* The source node (must be immutable).
*
* @see AbstractNode#copyOnWrite()
*/
protected Leaf(final Leaf src) {
super(src);
assert !src.isDirty();
assert src.isReadOnly();
// assert src.isPersistent();
// steal/clone the data record.
this.data = src.isReadOnly() ? new MutableLeafData(src
.getBranchingFactor(), src.data) : src.data;
// clear reference on source.
src.data = null;
// /*
// * Steal/copy the keys.
// *
// * Note: The copy constructor is invoked when we need to begin mutation
// * operations on an immutable node or leaf, so make sure that the keys
// * are mutable.
// */
// {
//
//// nkeys = src.nkeys;
//
// if (src.getKeys() instanceof MutableKeyBuffer) {
//
// keys = src.getKeys();
//
// } else {
//
// keys = new MutableKeyBuffer(src.getBranchingFactor(), src
// .getKeys());
//
// }
//
// // release reference on the source node.
//// src.nkeys = 0;
// src.keys = null;
//
// }
//
//// /*
//// * Steal the values[].
//// */
////
//// // steal reference and clear reference on the source node.
//// values = src.values;
//
// /*
// * Steal/copy the values[].
// *
// * Note: The copy constructor is invoked when we need to begin mutation
// * operations on an immutable node or leaf, so make sure that the values
// * are mutable.
// */
// {
//
// if (src.values instanceof MutableValueBuffer) {
//
// values = src.values;
//
// } else {
//
// values = new MutableValueBuffer(src.getBranchingFactor(),
// src.values);
//
// }
//
// // release reference on the source node.
// src.values = null;
//
// }
//
// versionTimestamps = src.versionTimestamps;
//
// deleteMarkers = src.deleteMarkers;
// // Add to the hard reference queue.
// btree.touch(this);
}
@Override
public void delete() {
/*
* Note: This event MUST go out before we clear [leafListeners].
*
* Note: Since we fire this event here we do NOT need to fire it
* explicitly after a copy-on-write since copy-on-write ALWAYS calls
* delete() on the original leaf if it makes a copy.
*/
fireInvalidateLeafEvent();
super.delete();
// clear references.
data = null;
// keys = null;
//
// values = null;
//
// versionTimestamps = null;
//
// deleteMarkers = null;
leafListeners = null;
}
/**
* Insert or update an entry in the leaf as appropriate. The caller MUST
* ensure by appropriate navigation of parent nodes that the key for the
* next tuple either exists in or belongs in this leaf. If the leaf
* overflows then it is split after the insert.
*
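* A typical conditional insert might look like the following minimal,
* hedged sketch, where [key] and [value] are the tuple's key and value,
* [keyTuple] is a caller-allocated Tuple used to report a pre-existing
* value, and [revisionTimestamp] is a revision timestamp obtained as
* described in the class documentation (all of these names are
* assumptions, not defined here):
*
* <pre>
* final Tuple oldTuple = leaf.insert(key, value,
*         false, // delete
*         true,  // putIfAbsent
*         revisionTimestamp, keyTuple);
* if (oldTuple != null) {
*     // An entry already existed under [key]; the leaf was not modified.
* }
* </pre>
*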
* FIXME maintain min/max version timestamps.
*/
@Override
public Tuple insert(final byte[] searchKey, final byte[] newval,
final boolean delete, final boolean putIfAbsent, final long timestamp, final Tuple tuple) {
if (delete && !data.hasDeleteMarkers()) {
/*
* You may not specify the delete flag unless delete markers are
* being maintained.
*/
throw new UnsupportedOperationException();
}
if(btree.debug) assertInvariants();
// btree.touch(this); // Note: Invoked by copyOnWrite() (immediately below)
int entryIndex = Integer.MAX_VALUE; // shut up the compiler. complains about not assigned on all code paths.
if (putIfAbsent) {
/*
* putIfAbsent code path. We look for the search key in the leaf
* before triggering copy-on-write. If the key is found, then we
* will (optionally) return the value under the key and WILL NOT
* modify the leaf.
*
* Note: If we search for the entryIndex here, then we DO NOT
* search again below.
*
* See BLZG-1539
*/
entryIndex = this.getKeys().search(searchKey);
if (entryIndex >= 0) {
// found entry for that key.
if (!hasDeleteMarkers() || !getDeleteMarker(entryIndex)) {
/*
* Found an existing (non-deleted) entry under the key.
*
* Do NOT mutate the leaf.
*/
// copy tuple (optional).
if (tuple != null)
tuple.copy(entryIndex, this);
// return caller's tuple.
return tuple;
}
}
}
/*
* Note: This is one of the few gateways for mutation of a leaf via the
* main btree API (insert, lookup, delete). By ensuring that we have a
* mutable leaf here, we can assert that the leaf must be mutable in
* other methods.
*/
final Leaf copy = (Leaf) copyOnWrite();
if (copy != this) {
/*
* This leaf has been copied so delegate the operation to the new
* leaf.
*
* Note: copy-on-write deletes [this] leaf and delete() notifies any
* leaf listeners before it clears the [leafListeners] reference so
* not only don't we have to do that here, but we can't since the
* listeners would be cleared before we could fire off the event
* ourselves.
*
* Note: putIfAbsent() is handled above without triggering
* copy-on-write, so when we did trigger copy-on-write we clear the
* flag and the mutation is applied unconditionally to the copy of
* the leaf data.
*/
return copy.insert(searchKey, newval, delete, false/*putIfAbsent*/, timestamp, tuple);
}
/*
* Search for the key.
*
* Note: We do NOT search before triggering copy-on-write for an object
* index since an insert/update always triggers a mutation.
*/
// look for the search key in the leaf.
if (!putIfAbsent) {
/*
* When putIfAbsent is true, we already found the entryIndex above.
* Otherwise we find it now. See BLZG-1539.
*/
entryIndex = this.getKeys().search(searchKey);
}
if (entryIndex >= 0) {
/*
* The key is already present in the leaf, so we are updating an
* existing entry.
*/
if (tuple != null) {
/*
* Copy data and metadata for the old value stored under the
* search key.
*/
tuple.copy(entryIndex, this);
}
// Tunnel through to the mutable object.
final MutableLeafData data = (MutableLeafData) this.data;
/*
* Update the entry on the leaf.
*/
if (hasRawRecords()) {
/*
* Note: If the old value was a raw record, we need to delete
* that raw record now.
*
* Note: If the new value will be a raw record, we need to write
* that raw record onto the store now and save its address into
* the values[] raba.
*/
final long oaddr = getRawRecord(entryIndex);
if(oaddr != IRawStore.NULL) {
btree.deleteRawRecord(oaddr);
}
final long maxRecLen = btree.getMaxRecLen();
if (newval != null && newval.length > maxRecLen) {
// write the value on the backing store.
final long naddr = btree.writeRawRecord(newval);
// save its address in the values raba.
data.vals.values[entryIndex] = ((BTree) btree)
.encodeRecordAddr(naddr);
// flag as a raw record.
data.rawRecords[entryIndex] = true;
} else {
data.vals.values[entryIndex] = newval;
data.rawRecords[entryIndex] = false;
}
} else {
data.vals.values[entryIndex] = newval;
}
if (data.deleteMarkers != null) {
if (!data.deleteMarkers[entryIndex] && delete) {
/*
* Changing from a non-deleted to a deleted tuple (we don't
* count re-deletes of an already deleted tuple).
*/
btree.getBtreeCounters().ntupleUpdateDelete++;
} else if(!delete) {
/*
* Either changing from a deleted to a non-deleted tuple or
* just overwriting an existing non-deleted tuple.
*/
btree.getBtreeCounters().ntupleUpdateValue++;
}
data.deleteMarkers[entryIndex] = delete;
} else {
/*
* Update value for existing tuple (delete markers are not in
* use).
*/
btree.getBtreeCounters().ntupleUpdateValue++;
}
if (data.versionTimestamps != null) {
boolean propagateMinMax = false;
data.versionTimestamps[entryIndex] = timestamp;
if (data.minimumVersionTimestamp > timestamp) {
data.minimumVersionTimestamp = timestamp;
propagateMinMax = true;
}
if (data.maximumVersionTimestamp < timestamp) {
data.maximumVersionTimestamp = timestamp;
propagateMinMax = true;
}
if (propagateMinMax && parent != null) {
parent.get().updateMinMaxVersionTimestamp(this);
}
}
// // notify any listeners that this tuple's state has been changed.
// fireInvalidateTuple(entryIndex);
// return the old value.
return tuple;
}
/*
* The insert goes into this leaf.
*/
// Convert the position to obtain the insertion point.
entryIndex = -entryIndex - 1;
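// Note: IRaba.search() follows the Arrays.binarySearch() contract, so a
// negative result encodes -(insertionPoint) - 1 for a key which is not found.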
// insert an entry under that key.
{
final int nkeys = getKeyCount();
if (entryIndex < nkeys) {
/* index = 2;
* nkeys = 6;
*
* [ 0 1 2 3 4 5 ]
* ^ index
*
* count = keys - index = 4;
*/
final int count = nkeys - entryIndex;
assert count >= 1;
copyDown(entryIndex, count);
}
/*
* Insert at index.
*/
// Tunnel through to the mutable object.
final MutableLeafData data = (MutableLeafData) this.data;
final MutableKeyBuffer keys = data.keys;
final MutableValueBuffer vals = data.vals;
// copyKey(entryIndex, searchKeys, tupleIndex);
keys.keys[entryIndex] = searchKey; // note: presumes caller does not reuse the searchKeys!
if (hasRawRecords()) {
final long maxRecLen = btree.getMaxRecLen();
if (newval != null && newval.length > maxRecLen) {
// write the value on the backing store.
final long naddr = btree.writeRawRecord(newval);
// save its address in the values raba.
data.vals.values[entryIndex] = ((BTree) btree)
.encodeRecordAddr(naddr);
// flag as a raw record.
data.rawRecords[entryIndex] = true;
} else {
data.vals.values[entryIndex] = newval;
data.rawRecords[entryIndex] = false;
}
} else {
vals.values[entryIndex] = newval;
}
if (data.deleteMarkers != null) {
if (delete) {
// Inserting a deleted tuple.
btree.getBtreeCounters().ntupleInsertDelete++;
} else if (!delete) {
// Inserting a non-deleted tuple.
btree.getBtreeCounters().ntupleInsertValue++;
}
data.deleteMarkers[entryIndex] = delete;
} else {
// Inserting a tuple (delete markers not in use).
btree.getBtreeCounters().ntupleInsertValue++;
}
if (data.versionTimestamps != null) {
data.versionTimestamps[entryIndex] = timestamp;
if (data.minimumVersionTimestamp > timestamp)
data.minimumVersionTimestamp = timestamp;
if (data.maximumVersionTimestamp < timestamp)
data.maximumVersionTimestamp = timestamp;
}
/*nkeys++;*/keys.nkeys++; vals.nvalues++;
}
// one more entry in the btree.
((BTree)btree).nentries++;
if( parent != null ) {
// update spanned tuple count and min/max version timestamp.
parent.get().updateEntryCount(this, 1);
}
// if (INFO) {
// log.info("this="+this+", key="+key+", value="+entry);
// if(DEBUG) {
// System.err.println("this"); dump(Level.DEBUG,System.err);
// }
// }
if (data.getKeyCount() == maxKeys() + 1) {
/*
* The insert caused the leaf to overflow, so now we split the leaf.
*/
final Leaf rightSibling = (Leaf) split();
// assert additional invariants post-split.
if(btree.debug) {
rightSibling.assertInvariants();
getParent().assertInvariants();
}
}
// assert invariants post-split.
if(btree.debug) assertInvariants();
/*
* Notify any listeners that the tuples found in the leaf have been
* changed (one was added but others will have been moved into a new
* right sibling if the leaf was split).
*/
fireInvalidateLeafEvent();
// return null since there was no pre-existing entry.
return null;
}
@Override
public Tuple lookup(final byte[] searchKey, final Tuple tuple) {
btree.touch(this);
final int entryIndex = getKeys().search(searchKey);
if (entryIndex < 0) {
// Not found.
return null;
}
// Found.
tuple.copy(entryIndex, this);
return tuple;
}
@Override
public long indexOf(final byte[] key) {
btree.touch(this);
return (long) getKeys().search(key);
}
@Override
public byte[] keyAt(final long entryIndex) {
rangeCheck2(entryIndex);
return getKeys().get((int) entryIndex);
}
@Override
public void valueAt(final long entryIndex, final Tuple tuple) {
rangeCheck2(entryIndex);
tuple.copy((int) entryIndex, this);
}
/** Used for the {@link ILinearList} methods. */
final protected boolean rangeCheck2(final long index)
throws IndexOutOfBoundsException {
if (index > Integer.MAX_VALUE)
throw new IndexOutOfBoundsException();
return rangeCheck((int) index);
}
/** Used for methods which have an index into the leaf. */
final protected boolean rangeCheck(final int index)
throws IndexOutOfBoundsException {
final int nkeys = data.getKeyCount();
if (index < 0 || index >= nkeys) {
throw new IndexOutOfBoundsException("index=" + index + ", nkeys="
+ nkeys);
}
return true;
}
/**
*
* Split an over-capacity leaf (a leaf with maxKeys+1 keys), creating a new
* rightSibling. The splitIndex (the index of the first key to move to the
* rightSibling) is (maxKeys+1)/2. The separatorKey inserted into the parent
* is the shortest key which is greater than the key at [splitIndex-1] and
* less than or equal to the key at [splitIndex].
* All keys and values starting with splitIndex are moved to the new
* rightSibling. If this leaf is the root of the tree (no parent), then a
* new root {@link Node} is created without any keys and is made the parent
* of this leaf. In any case, we then insert( separatorKey, rightSibling )
* into the parent node, which may cause the parent node itself to split.
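*
* For example, with maxKeys = 4 an over-capacity leaf holds 5 tuples;
* splitIndex = (4 + 1) / 2 = 2, so the tuples at indices 2..4 move to the
* new rightSibling and the separatorKey is chosen between the keys at
* indices 1 and 2.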
*
*
* @return The new rightSibling leaf.
*
* FIXME maintain min/max version timestamps.
*/
@Override
protected IAbstractNode split() {
final int maxKeys = this.maxKeys();
// MUST be mutable.
assert isDirty();
// MUST be an overflow.
assert getKeyCount() == maxKeys + 1;
final BTree btree = (BTree) this.btree;
btree.getBtreeCounters().leavesSplit++;
// #of entries in the leaf before it is split.
final int nentriesBeforeSplit = getKeyCount();
/*
* The splitIndex is the index of the first key/value to move to the new
* rightSibling.
*/
final int splitIndex = (maxKeys + 1) / 2;
/*
* The separatorKey is the shortest key that is less than or equal to
* the key at the splitIndex and greater than the key at [splitIndex-1].
* This also serves as the separator key when we insert( separatorKey,
* rightSibling ) into the parent.
*/
// final byte[] separatorKey = getKeys().get(splitIndex);
final byte[] separatorKey = BytesUtil.getSeparatorKey(//
getKeys().get(splitIndex),//
getKeys().get(splitIndex - 1)//
);
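// Illustration only (assuming unsigned lexicographic byte order): for a
// prior key [5,2,9] and a split key [5,3,1] the shortest separator is [5,3],
// which is greater than the prior key and less than or equal to the split key.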
if (getParent() != null) {
/*
* Note: This code block was introduced to track down an error
* observed where a leaf split chose a separator key which already
* existed in the parent node. However, I am beginning to suspect
* that the error was introduced by a cache consistency problem
* (since fixed) in the LRUNexus.
*/
// the index of this leaf in the parent.
final int leafIndex = getParent().getIndexOf(this);
// the index of the proposed separator key in the parent (should not
// exist).
final int separatorIndex = getParent().getKeys().search(
separatorKey);
if (separatorIndex >= 0) {
/*
* The separator key should not be pre-existing in the parent.
*/
throw new AssertionError("Split on existing key: leafIndex="
+ leafIndex + ", splitIndexInLeaf=" + splitIndex
+ ", separatorIndexInParent=" + separatorIndex
+ ", separatorKey=" + keyAsString(separatorKey)
+ "\nparent=" + getParent() + "\nleaf=" + this);
}
}
// The new rightSibling of this leaf (this will be a mutable leaf).
final Leaf rightSibling = new Leaf(btree);
// Tunnel through to the mutable objects.
final MutableLeafData data = (MutableLeafData) this.data;
final MutableLeafData sdata = (MutableLeafData) rightSibling.data;
// increment #of leaves in the tree.
btree.nleaves++;
if (DEBUG) {
log.debug("this=" + this + ", nkeys=" + getKeyCount() + ", splitIndex="
+ splitIndex + ", separatorKey="
+ keyAsString(separatorKey)
);
// if(DEBUG) dump(Level.DEBUG,System.err);
}
int j = 0;
for (int i = splitIndex; i <= maxKeys; i++, j++) {
// copy key and value to the new leaf.
// rightSibling.setKey(j, getKey(i));
rightSibling.copyKey(j, this.getKeys(), i);
sdata.vals.values[j] = data.vals.values[i];
if (data.deleteMarkers != null) {
sdata.deleteMarkers[j] = data.deleteMarkers[i];
}
if (data.versionTimestamps != null) {
sdata.versionTimestamps[j] = data.versionTimestamps[i];
}
if (data.rawRecords != null) {
sdata.rawRecords[j] = data.rawRecords[i];
}
// clear out the old keys and values.
data.keys.keys[i] = null;
data.vals.values[i] = null;
if (data.deleteMarkers != null)
data.deleteMarkers[i] = false;
if (data.versionTimestamps != null)
data.versionTimestamps[i] = 0L;
if (data.rawRecords != null)
data.rawRecords[i] = false;
// one less key here.
/* nkeys--; */
data.keys.nkeys--;
data.vals.nvalues--;
// one more key there.
/*rightSibling.nkeys++;*/
sdata.keys.nkeys++;
sdata.vals.nvalues++;
}
/*
* Recalculate the version timestamps. This is easier than tracking the
* changes on a per-tuple basis in the loop above, and we would have to
* recalculate the version timestamps anyway if we moved a tuple whose
* value is equal to the min or max.
*/
if (data.versionTimestamps != null)
data.recalcMinMaxVersionTimestamp();
if (sdata.versionTimestamps != null)
sdata.recalcMinMaxVersionTimestamp();
/*
* Now consider the parent. It will have to be updated. If there is no
* parent (if this is the root leaf), then we need to create that
* parent.
*/
Node p = getParent();
if (p == null) {
/*
* Use a special constructor to split the root leaf. The result is a
* new node with zero keys and one child (this leaf). The #of entries
* spanned by the new root node is the same as the #of entries found
* on this leaf _before_ the split.
*/
p = new Node((BTree) btree, this, nentriesBeforeSplit);
} else {
assert !p.isReadOnly();
// FIXME must update min/max on parent, which requires a child scan :-(
// this leaf now has fewer entries
((MutableNodeData) p.data).childEntryCounts[p.getIndexOf(this)] -= rightSibling
.getKeyCount();
if (p != btree.root && p.isRightMostNode()) {
/*
* If the leaf that is split is a child of the right most node
* in the tree then that is counted as a "tail split".
*
* Note: We DO NOT count tail splits when the leaf is the root
* leaf and we DO NOT count tail splits when the parent of the
* leaf is the root leaf. In both of those cases any leaf split
* would qualify, which is too liberal to be a useful measure.
*
* Note: The ratio of tail splits to leaf splits may be used as
* an indication of a pattern of index writes that bears heavily
* on the tail of the index.
*/
btree.getBtreeCounters().tailSplit++;
} else if (p != btree.root && p.isLeftMostNode()) {
/*
* If the leaf that is split is a child of the left-most node in
* the tree then that is counted as a "head split".
*
* Note: We DO NOT count head splits when the leaf is the root
* leaf and we DO NOT count head splits when the parent of the
* leaf is the root leaf. In both of those cases any leaf split
* would qualify, which is too liberal to be a useful measure.
*
* Note: The ratio of head splits to leaf splits may be used as
* an indication of a pattern of index writes that bears heavily
* on the head of the index.
*/
btree.getBtreeCounters().headSplit++;
}
}
/*
* Insert(splitKey,rightSibling) into the parent node. This may cause
* the parent node itself to split.
*
* Note: This operation can not cause the min/max on the parent Node
* to change. However, insertChild() does need to record the min/max
* for the new rightSibling.
*/
p.insertChild(separatorKey, rightSibling);
// Return the high leaf.
return rightSibling;
}
/**
* Redistributes a key from the specified sibling into this leaf in order to
* bring this leaf up to the minimum #of keys. This also updates the
* separator key in the parent for the right most of (this, sibling). While
* the #of entries spanned by the children of the common parent is changed
* by this method note that there is no net change in the #of entries
* spanned by that parent node.
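*
* For example, with minKeys = 2 a deficient leaf holding one key borrows a
* key from a sibling holding three keys; afterwards both leaves hold two
* keys and the separator key in the parent is updated to reflect the new
* boundary.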
*
* @param sibling
* A direct sibling of this leaf (either the left or right
* sibling). The sibling MUST be mutable.
*
* @todo Modify to always choose the shortest separator key from within a
* region in which the split is reasonable. This will help keep down
* the size of the separator keys in the nodes.
*
* FIXME maintain min/max version timestamps.
*/
@Override
protected void redistributeKeys(final AbstractNode sibling,
final boolean isRightSibling) {
// the sibling of a leaf must be a leaf.
final Leaf s = (Leaf) sibling;
assert s != null;
final int nkeys = this.getKeyCount();
final int snkeys = s.getKeyCount();
final int minKeys = this.minKeys();
assert dirty;
assert !deleted;
assert !isPersistent();
// verify that this leaf is deficient.
assert nkeys < minKeys;
// verify that this leaf is under minimum capacity by one key.
assert nkeys == minKeys - 1;
// the sibling MUST be _OVER_ the minimum #of keys/values.
assert snkeys > minKeys;
assert s.dirty;
assert !s.deleted;
assert !s.isPersistent();
final Node p = getParent();
// children of the same node.
assert s.getParent() == p;
if (DEBUG) {
log.debug("this="+this+", sibling="+sibling+", rightSibling="+isRightSibling);
// if(DEBUG) {
// System.err.println("this"); dump(Level.DEBUG,System.err);
// System.err.println("sibling"); sibling.dump(Level.DEBUG,System.err);
// System.err.println("parent"); p.dump(Level.DEBUG,System.err);
// }
}
/*
* The index of this leaf in its parent. We note this before we
* start mucking with the keys.
*/
final int index = p.getIndexOf(this);
// Tunnel through to the mutable objects.
final MutableLeafData data = (MutableLeafData) this.data;
final MutableLeafData sdata = (MutableLeafData) s.data;
final MutableNodeData pdata = (MutableNodeData) p.data;
final MutableKeyBuffer keys = data.keys;
final MutableKeyBuffer skeys = sdata.keys;
final MutableValueBuffer vals = data.vals;
final MutableValueBuffer svals = sdata.vals;
/*
* Determine which leaf is earlier in the key ordering and get the
* index of the sibling.
*/
if (isRightSibling) {
/*
* redistributeKeys(this,rightSibling). all we have to do is move
* the first key from the rightSibling to the end of the keys in
* this leaf. we then close up the hole that this left at index 0 in
* the rightSibling. finally, we update the separator key for the
* rightSibling to the new key in its first index position.
*/
// copy the first key from the rightSibling.
// setKey(nkeys, s.getKey(0));
copyKey(nkeys, s.getKeys(), 0);
vals.values[nkeys] = svals.values[0];
if (data.deleteMarkers != null)
data.deleteMarkers[nkeys] = sdata.deleteMarkers[0];
boolean updateMinMaxVersionTimestampOnSibling = false;
if (data.versionTimestamps != null) {
final long t = sdata.versionTimestamps[0];
data.versionTimestamps[nkeys] = t;
if (t < data.minimumVersionTimestamp)
data.minimumVersionTimestamp = t;
if (t > data.maximumVersionTimestamp)
data.maximumVersionTimestamp = t;
if (t == sdata.minimumVersionTimestamp
|| t == sdata.maximumVersionTimestamp)
updateMinMaxVersionTimestampOnSibling = true;
}
if (data.rawRecords != null)
data.rawRecords[nkeys] = sdata.rawRecords[0];
// copy down the keys on the right sibling to cover up the hole.
System.arraycopy(skeys.keys, 1, skeys.keys, 0, snkeys-1);
System.arraycopy(svals.values, 1, svals.values, 0, snkeys-1);
if(data.deleteMarkers!=null)
System.arraycopy(sdata.deleteMarkers, 1, sdata.deleteMarkers, 0, snkeys-1);
if(data.versionTimestamps!=null)
System.arraycopy(sdata.versionTimestamps, 1, sdata.versionTimestamps, 0, snkeys-1);
if(data.rawRecords !=null)
System.arraycopy(sdata.rawRecords, 1, sdata.rawRecords, 0, snkeys-1);
// erase exposed key/value on rightSibling that is no longer defined.
skeys.keys[snkeys-1] = null;
svals.values[snkeys-1] = null;
if (data.deleteMarkers != null)
sdata.deleteMarkers[snkeys - 1] = false;
if (data.versionTimestamps != null)
sdata.versionTimestamps[snkeys - 1] = 0L;
if (data.rawRecords != null)
sdata.rawRecords[snkeys - 1] = false;
/*s.nkeys--;*/ skeys.nkeys--; svals.nvalues--;
/*this.nkeys++;*/keys.nkeys++; vals.nvalues++;
if(updateMinMaxVersionTimestampOnSibling)
sdata.recalcMinMaxVersionTimestamp();
// update the separator key for the rightSibling.
// p.setKey(index, s.getKey(0));
p.copyKey(index, s.getKeys(), 0);
// update parent : one more key on this child.
pdata.childEntryCounts[index]++;
// update parent : one less key on our right sibling.
pdata.childEntryCounts[index + 1]--;
// FIXME update min/max on parent for this leaf and the rightSibling.
if (btree.debug) {
assertInvariants();
s.assertInvariants();
}
} else {
/*
* redistributeKeys(leftSibling,this). all we have to do is copy
* down the keys in this leaf by one position and move the last key
* from the leftSibling into the first position in this leaf. We
* then replace the separation key for this leaf on the parent with
* the key that we copied from the leftSibling.
*/
// copy down by one.
System.arraycopy(keys.keys, 0, keys.keys, 1, nkeys);
System.arraycopy(vals.values, 0, vals.values, 1, nkeys);
if(data.deleteMarkers!=null)
System.arraycopy(data.deleteMarkers, 0, data.deleteMarkers, 1, nkeys);
if(data.versionTimestamps!=null)
System.arraycopy(data.versionTimestamps, 0, data.versionTimestamps, 1, nkeys);
if(data.rawRecords!=null)
System.arraycopy(data.rawRecords, 0, data.rawRecords, 1, nkeys);
// move the last key/value from the leftSibling to this leaf (copy, then clear).
// copy.
// setKey(0, s.getKey(s.nkeys-1));
copyKey(0,s.getKeys(),snkeys-1);
vals.values[0] = svals.values[snkeys-1];
if (data.deleteMarkers != null)
data.deleteMarkers[0] = sdata.deleteMarkers[snkeys - 1];
// if (data.versionTimestamps != null)
// data.versionTimestamps[0] = sdata.versionTimestamps[snkeys - 1];
boolean updateMinMaxVersionTimestampOnSibling = false;
if (data.versionTimestamps != null) {
final long t = sdata.versionTimestamps[snkeys - 1];
data.versionTimestamps[0] = t;
if (t < data.minimumVersionTimestamp)
data.minimumVersionTimestamp = t;
if (t > data.maximumVersionTimestamp)
data.maximumVersionTimestamp = t;
if (t == sdata.minimumVersionTimestamp
|| t == sdata.maximumVersionTimestamp)
updateMinMaxVersionTimestampOnSibling = true;
}
if (data.rawRecords != null)
data.rawRecords[0] = sdata.rawRecords[snkeys - 1];
// clear
skeys.keys[snkeys-1] = null;
svals.values[snkeys-1] = null;
if (data.deleteMarkers != null)
sdata.deleteMarkers[snkeys - 1] = false;
if (data.versionTimestamps != null)
sdata.versionTimestamps[snkeys - 1] = 0L;
if (data.rawRecords != null)
sdata.rawRecords[snkeys - 1] = false;
/*s.nkeys--;*/ skeys.nkeys--; svals.nvalues--;
/*this.nkeys++;*/ keys.nkeys++; vals.nvalues++;
if(updateMinMaxVersionTimestampOnSibling)
sdata.recalcMinMaxVersionTimestamp();
// update the separator key for this leaf.
// p.setKey(index-1,getKey(0));
p.copyKey(index-1, this.getKeys(), 0);
// update parent : one more key on this child.
pdata.childEntryCounts[index]++;
// update parent : one less key on our left sibling.
pdata.childEntryCounts[index-1]--;
// FIXME update min/max on parent for this leaf and the rightSibling.
if (btree.debug) {
assertInvariants();
s.assertInvariants();
}
}
}
/**
* Merge the keys and values from the sibling into this leaf, delete the
* sibling from the store and remove the sibling from the parent. This will
* trigger recursive {@link AbstractNode#join()} if the parent node is now
* deficient. While this changes the #of entries spanned by the current node
* it does NOT affect the #of entries spanned by the parent. Likewise, while
* the min/max tuple revision timestamp may change for this {@link Leaf}, it
* WILL NOT change for its parent {@link Node} (since this operation does
* not remove any tuples).
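*
* For example, with minKeys = 2 a deficient leaf holding one key is merged
* with a sibling holding exactly two keys, leaving a single leaf with three
* keys; the sibling is then removed from the parent and deleted from the
* store.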
*
* @param sibling
* A direct sibling of this leaf (does NOT need to be mutable).
* The sibling MUST have exactly the minimum #of keys.
*
* FIXME maintain min/max version timestamps.
*/
@Override
protected void merge(final AbstractNode sibling,
final boolean isRightSibling) {
// the sibling of a leaf must be a leaf.
final Leaf s = (Leaf)sibling;
assert s != null;
final int nkeys = this.getKeyCount();
final int snkeys = s.getKeyCount();
assert !s.deleted;
// verify that this leaf is deficient.
assert nkeys < minKeys();
// verify that this leaf is under minimum capacity by one key.
assert nkeys == minKeys() - 1;
// the sibling MUST be at the minimum #of keys/values.
assert snkeys == s.minKeys();
final Node p = getParent();
// children of the same node.
assert s.getParent() == p : "this.parent="
+ (p == null ? null : p)
+ " != s.parent="
+ (s.getParent() == null ? null : s.getParent());
if (DEBUG) {
log.debug("this="+this+", sibling="+sibling+", rightSibling="+isRightSibling);
// if(DEBUG) {
// System.err.println("this"); dump(Level.DEBUG,System.err);
// System.err.println("sibling"); sibling.dump(Level.DEBUG,System.err);
// System.err.println("parent"); p.dump(Level.DEBUG,System.err);
// }
}
/*
* The index of this leaf in its parent. We note this before we
* start mucking with the keys.
*/
final int index = p.getIndexOf(this);
/*
* Tunnel through to the mutable data records.
*
* Note: We do not require the sibling to be mutable. If it is not, then
* we create a mutable copy of the sibling for use during this method.
*/
final MutableLeafData data = (MutableLeafData) this.data;
final MutableLeafData sdata = s.isReadOnly() ? new MutableLeafData(
getBranchingFactor(), s.data) : (MutableLeafData) s.data;
final MutableNodeData pdata = (MutableNodeData) p.data;
/*
* Determine which leaf is earlier in the key ordering so that we know
* whether the sibling's keys will be inserted at the front of this
* leaf's keys or appended to this leaf's keys.
*/
if( isRightSibling /*keys[nkeys-1] < s.keys[0]*/) {
/*
* merge( this, rightSibling ). the keys and values from this leaf
* will appear in their current position and the keys and values
* from the rightSibling will be appended after the last key/value
* in this leaf.
*/
/*
* Copy in the keys and values from the sibling.
*/
System.arraycopy(sdata.keys.keys, 0, data.keys.keys, nkeys, snkeys);
System.arraycopy(sdata.vals.values, 0, data.vals.values, nkeys,
snkeys);
if (data.deleteMarkers != null) {
System.arraycopy(sdata.deleteMarkers, 0, data.deleteMarkers,
nkeys, snkeys);
}
if (data.versionTimestamps != null) {
System.arraycopy(sdata.versionTimestamps, 0,
data.versionTimestamps, nkeys, snkeys);
if (sdata.minimumVersionTimestamp < data.minimumVersionTimestamp)
data.minimumVersionTimestamp = sdata.minimumVersionTimestamp;
if (sdata.maximumVersionTimestamp > data.maximumVersionTimestamp)
data.maximumVersionTimestamp = sdata.maximumVersionTimestamp;
}
if (data.rawRecords != null) {
System.arraycopy(sdata.rawRecords, 0, data.rawRecords,
nkeys, snkeys);
}
/*
* Adjust the #of keys in this leaf.
*/
// this.nkeys += s.nkeys;
data.keys.nkeys += snkeys;
data.vals.nvalues += snkeys;
/*
* Note: in this case we have to replace the separator key for this
* leaf with the separator key for its right sibling.
*
* Note: This temporarily causes the duplication of a separator key
* in the parent. However, the separator key for the right sibling
* will be deleted when the sibling is removed from the parent
* below.
*/
// p.setKey(index, p.getKey(index+1));
p.copyKey(index, p.getKeys(), index+1 );
// reallocate spanned entries from the sibling to this node.
// FIXME Update min/max on the parent for this leaf.
pdata.childEntryCounts[index] += s.getKeyCount();
if(btree.debug) assertInvariants();
} else {
/*
* merge( leftSibling, this ). The keys and values from this leaf
* will be moved down by sibling.nkeys positions and then the keys
* and values from the sibling will be copied into this leaf
* starting at index zero(0).
*
* Note: we do not update the separator key in the parent because
* the separatorKey will be removed when we remove the leftSibling
* from the parent at the end of this method. This also has the
* effect of giving this leaf its left sibling's separatorKey.
*/
// move keys and values down by sibling.nkeys positions.
System.arraycopy(data.keys.keys, 0, data.keys.keys, snkeys, nkeys);
System.arraycopy(data.vals.values, 0, data.vals.values, snkeys,
nkeys);
if (data.deleteMarkers != null) {
System.arraycopy(data.deleteMarkers, 0, data.deleteMarkers,
snkeys, nkeys);
}
if (data.versionTimestamps != null) {
System.arraycopy(data.versionTimestamps, 0,
data.versionTimestamps, snkeys, nkeys);
}
if (data.rawRecords != null) {
System.arraycopy(data.rawRecords, 0, data.rawRecords,
snkeys, nkeys);
}
// copy keys and values from the sibling to index 0 of this leaf.
System.arraycopy(sdata.keys.keys, 0, data.keys.keys, 0, snkeys);
System.arraycopy(sdata.vals.values, 0, data.vals.values, 0, snkeys);
if (data.deleteMarkers != null) {
System.arraycopy(sdata.deleteMarkers, 0, data.deleteMarkers, 0,
snkeys);
}
if (data.versionTimestamps != null) {
System.arraycopy(sdata.versionTimestamps, 0,
data.versionTimestamps, 0, snkeys);
if (sdata.minimumVersionTimestamp < data.minimumVersionTimestamp)
data.minimumVersionTimestamp = sdata.minimumVersionTimestamp;
if (sdata.maximumVersionTimestamp > data.maximumVersionTimestamp)
data.maximumVersionTimestamp = sdata.maximumVersionTimestamp;
}
if (data.rawRecords != null) {
System.arraycopy(sdata.rawRecords, 0, data.rawRecords, 0,
snkeys);
}
// this.nkeys += s.nkeys;
data.keys.nkeys += snkeys;
data.vals.nvalues += snkeys;
// FIXME update min/max on the parent for this leaf.
// reallocate spanned entries from the sibling to this node.
pdata.childEntryCounts[index] += s.getKeyCount();
if(btree.debug) assertInvariants();
}
/*
* The sibling leaf is now empty. We need to detach the leaf from its
* parent node and then delete the leaf from the store.
*
* Note: We have already adjusted the min/max for this Leaf on the
* parent. The min/max for the parent itself will be unchanged by the
* merge(). Therefore this method need only clear out the min/max for
* the deleted child.
*/
p.removeChild(s);
}
/**
* Copies all keys and values from the specified start index down by one in
* order to make room to insert a key and value at that index.
*
* @param entryIndex
* The index of the first key and value to be copied.
* @param count
* The #of keys and values to be copied.
*/
protected void copyDown(final int entryIndex, final int count) {
/*
* copy down per-key data (#values == nkeys).
*/
// Tunnel through to the mutable keys and values objects.
final MutableLeafData data = (MutableLeafData) this.data;
final MutableKeyBuffer keys = data.keys;
final MutableValueBuffer vals = data.vals;
System.arraycopy(keys.keys, entryIndex, keys.keys, entryIndex + 1,
count);
System.arraycopy(vals.values, entryIndex, vals.values, entryIndex + 1,
count);
if (data.deleteMarkers != null) {
System.arraycopy(data.deleteMarkers, entryIndex,
data.deleteMarkers, entryIndex + 1, count);
}
if (data.versionTimestamps != null) {
System.arraycopy(data.versionTimestamps, entryIndex,
data.versionTimestamps, entryIndex + 1, count);
}
if (data.rawRecords != null) {
System.arraycopy(data.rawRecords, entryIndex,
data.rawRecords, entryIndex + 1, count);
}
/*
* Clear the entry at the index. This is partly a paranoia check and
* partly critical. Some per-key elements MUST be cleared and it is much
* safer (and quite cheap) to clear them during copyDown() rather than
* relying on maintenance elsewhere.
*/
keys.keys[entryIndex] = null;
vals.values[entryIndex] = null;
if (data.deleteMarkers != null) {
data.deleteMarkers[entryIndex] = false;
}
if (data.versionTimestamps != null) {
data.versionTimestamps[entryIndex] = 0L;
// Note: caller MUST update min/max if they are invalidated!
}
if (data.rawRecords != null) {
data.rawRecords[entryIndex] = false;
}
}
@Override
public Tuple remove(final byte[] key, final Tuple tuple) {
if(btree.debug) assertInvariants();
btree.touch(this);
final int entryIndex = getKeys().search(key);
if (entryIndex < 0) {
// Not found.
return null;
}
/*
* Note: This is one of the few gateways for mutation of a leaf via
* the main btree API (insert, lookup, delete). By ensuring that we
* have a mutable leaf here, we can assert that the leaf must be
* mutable in other methods.
*/
final Leaf copy = (Leaf) copyOnWrite();
if (copy != this) {
/*
* Note: This leaf was copied so delegate to the new leaf (the old
* leaf is now unused).
*
* Note: copy-on-write deletes [this] leaf and delete() notifies any
* leaf listeners before it clears the [leafListeners] reference so
* not only don't we have to do that here, but we can't since the
* listeners would be cleared before we could fire off the event
* ourselves.
*/
return copy.remove(key, tuple);
}
// // The value that is being removed.
// final Object oldval = this.values[entryIndex];
if (tuple != null) {
/*
* Copy data and metadata for the index entry that is being removed.
*/
tuple.copy(entryIndex, this);
}
if (data.hasDeleteMarkers()) {
/*
* This operation is not allowed when delete markers are being
* maintained. You use an insert(...) instead and specify delete :=
* true.
*/
throw new UnsupportedOperationException();
}
/*
* If the tuple was associated with a raw record address, then delete
* the raw record from the backing store.
*
* Note: The general copy-on-write contract of the B+Tree combined with
* the semantics of the WORM, RW, and scale-out persistence layers will
* ensure the actual delete of the raw record is deferred until the
* commit point from which the tuple was deleted is no longer visible.
*/
if (data.hasRawRecords()) {
final long addr = data.getRawRecord(entryIndex);
if (addr != IRawStore.NULL) {
btree.deleteRawRecord(addr);
}
}
// if (INFO) {
// log.info("this="+this+", key="+key+", value="+entry+", index="+entryIndex);
// if(DEBUG) {
// System.err.println("this"); dump(Level.DEBUG,System.err);
// }
// }
/*
* Copy over the hole created when the key and value were removed
* from the leaf.
*
* Given:
* keys : [ 1 2 3 4 ]
* vals : [ a b c d ]
*
* Remove(1):
* index := 0
* length = nkeys(4) - index(0) - 1 = 3;
*
* Remove(3):
* index := 2;
* length = nkeys(4) - index(2) - 1 = 1;
*
* Remove(4):
* index := 3
* length = nkeys(4) - index(3) - 1 = 0;
*
* Given:
* keys : [ 1 ]
* vals : [ a ]
*
* Remove(1):
* index := 0
* length = nkeys(1) - index(0) - 1 = 0;
*/
{
/*
* Copy down to cover up the hole.
*/
final int nkeys = getKeyCount();
final int length = nkeys - entryIndex - 1;
// Tunnel through to the mutable objects.
final MutableLeafData data = (MutableLeafData) this.data;
final MutableKeyBuffer keys = data.keys;
final MutableValueBuffer vals = data.vals;
if (length > 0) {
System.arraycopy(keys.keys, entryIndex + 1, keys.keys, entryIndex,
length);
System.arraycopy(vals.values, entryIndex + 1, vals.values,
entryIndex, length);
if (data.versionTimestamps != null) {
System.arraycopy(data.versionTimestamps, entryIndex + 1,
data.versionTimestamps, entryIndex, length);
}
if (data.rawRecords != null) {
System.arraycopy(data.rawRecords, entryIndex + 1,
data.rawRecords, entryIndex, length);
}
}
/*
* Erase the key/value that was exposed by this operation.
*/
keys.keys[nkeys - 1] = null;
vals.values[nkeys - 1] = null;
if (data.versionTimestamps != null) {
data.versionTimestamps[nkeys - 1] = 0L;
}
if (data.rawRecords != null) {
data.rawRecords[nkeys - 1] = false;
}
// One less key in the leaf.
/*nkeys--;*/ keys.nkeys--; vals.nvalues--;
// One less entry in the tree.
((BTree)btree).nentries--;
assert ((BTree)btree).nentries >= 0;
// One more deleted tuple.
btree.getBtreeCounters().ntupleRemove++;
if (data.versionTimestamps != null) {
/*
* If the tuple with the min/max version timestamp was removed
* then we need to recalculate the min/max version timestamp.
* This needs to happen after we update nkeys/nvalues (so the
* new min/max considers only the valid tuples) and before we
* update the spanned tuple counts on the parent (so the new
* min/max will be propagated correctly).
*/
final long oldVersionTimestamp = tuple.getVersionTimestamp();
if (oldVersionTimestamp == data.minimumVersionTimestamp
|| oldVersionTimestamp == data.maximumVersionTimestamp)
data.recalcMinMaxVersionTimestamp();
}
}
if( btree.root != this ) {
/*
* this is not the root leaf.
*/
// update entry count and min/max version timestamp on ancestors.
parent.get().updateEntryCount(this, -1);
if (data.getKeyCount() < minKeys()) {
/*
* The leaf is deficient. Join it with a sibling, causing their
* keys to be redistributed such that neither leaf is deficient.
* If there is only one other sibling and it has only the
* minimum #of values then the two siblings will be merged into
* a single leaf and their parent will have only a single child.
* Since the minimum #of children is two (2), having a single
* child makes the parent of this node deficient and it will be
* joined with one of its siblings. If necessary, this process
* will continue recursively up the tree. The root leaf never
* has any siblings and never experiences underflow so it may be
* legally reduced to zero values.
*
* Note that the minimum branching factor (3) and the invariants
* together guarantee that there is at least one sibling. Also
* note that the minimum #of children for a node with the
* minimum branching factor is two (2) so a valid tree never has
* a node with a single sibling.
*
* Note that we must invoke copy-on-write before modifying a
* sibling. However, the parent of the leaf MUST already be
* mutable (aka dirty) since that is a precondition for removing
* a key from the leaf. This means that copy-on-write will not
* force the parent to be cloned.
*/
join();
}
}
if(btree.debug) assertInvariants();
/*
* Notify any listeners that the tuple(s) in the leaf have been changed.
*/
fireInvalidateLeafEvent();
return tuple;
}
/**
* Visits this leaf, unless dirtyNodesOnly is true and this leaf is not dirty,
* or nodesOnly is true, in which case the returned iterator will not visit
* anything.
*
* {@inheritDoc}
*/
@Override
@SuppressWarnings("unchecked")
public Iterator postOrderNodeIterator(
final boolean dirtyNodesOnly, final boolean nodesOnly) {
if (dirtyNodesOnly && ! isDirty() ) {
return EmptyIterator.DEFAULT;
} else if(nodesOnly) {
return EmptyIterator.DEFAULT;
} else {
return new SingleValueIterator(this);
}
}
/**
* Visits this leaf.
*/
@Override
@SuppressWarnings("unchecked")
public Iterator postOrderIterator(final byte[] fromKey,
final byte[] toKey) {
return new SingleValueIterator(this);
}
/**
* Iterator visits the tuples in this leaf in key order.
*/
@Override
public ITupleIterator entryIterator() {
if (getKeys().isEmpty()) {
return EmptyTupleIterator.INSTANCE;
}
return new LeafTupleIterator(this);
}
@Override
public boolean dump(final Level level, final PrintStream out,
final int height, final boolean recursive) {
final boolean debug = level.toInt() <= Level.DEBUG.toInt();
// Set to false iff an inconsistency is detected.
boolean ok = true;
final int branchingFactor = this.getBranchingFactor();
final int nkeys = this.getKeyCount();
final int minKeys = this.minKeys();
final int maxKeys = this.maxKeys();
/*
* Since the index segment does not materialize the root when running a
* leaf cursor we can not rely on [btree.root != this].
*/
final boolean isRoot = (btree.root == this)
|| ((btree instanceof IndexSegment) && btree.getEntryCount() == 0);
if (!isRoot
&& (nkeys < minKeys)) {
/*
* Min keys failure.
*
* Note: the root may have fewer keys.
*/
out.println(indent(height) + "ERROR: too few keys: m="
+ branchingFactor + ", minKeys=" + minKeys + ", nkeys="
+ nkeys + ", isLeaf=" + isLeaf() + ", isRoot=" + isRoot);
ok = false;
}
if (nkeys > branchingFactor) {
// max keys failure.
out.println(indent(height) + "ERROR: too many keys: m="
+ branchingFactor + ", maxKeys=" + maxKeys + ", nkeys="
+ nkeys + ", isLeaf=" + isLeaf() + ", isRoot=" + isRoot);
ok = false;
}
if (height != -1 && height != btree.getHeight()) {
out.println(indent(height) + "WARN: height=" + height
+ ", but btree height=" + btree.getHeight());
ok = false;
}
// verify keys are monotonically increasing.
try {
assertKeysMonotonic();
} catch (AssertionError ex) {
out.println(indent(height) + " ERROR: "+ex);
ok = false;
}
if (debug || ! ok ) {
out.println(indent(height) + toString());
// out.println(indent(height) + " parent="
// + (parent == null ? null : parent.get()));
//
// out.println(indent(height) + " isRoot=" + (btree.root == this)
// + ", dirty=" + isDirty() + ", nkeys=" + nkeys
// + ", minKeys=" + minKeys + ", maxKeys=" + maxKeys
// + ", branchingFactor=" + branchingFactor);
//
// // Note: key format is dumped by its object.
// out.println(indent(height) + " keys=" + getKeys());
//
// // Note: signed byte[]s.
// out.println(indent(height) + " vals=" + getValues());
//
// if(hasDeleteMarkers()) {
//
// out.print(indent(height) + " deleted=[");
// for (int i = 0; i <= nkeys; i++) {
// if (i > 0)
// out.print(", ");
// out.print(getDeleteMarker(i));
// }
// out.println("]");
//
// }
//
// if(hasVersionTimestamps()) {
//
// out.print(indent(height) + " timestamps=[");
// for (int i = 0; i <= nkeys; i++) {
// if (i > 0)
// out.print(", ");
// out.print(getVersionTimestamp(i));
// }
// out.println("]");
//
// }
}
return ok;
}
// /**
// * Formats the data into a {@link String}.
// *
// * @param data
// * An array of signed byte arrays.
// */
// static private String toString(final int n, final IRaba data) {
//
// final StringBuilder sb = new StringBuilder();
//
// sb.append("data(n=" + n + ")={");
//
// for (int i = 0; i < n; i++) {
//
// final byte[] a = data.get(i);
//
// sb.append("\n");
//
// sb.append("data[" + i + "]=");
//
// sb.append(Arrays.toString(a));
//
// if (i + 1 < n)
// sb.append(",");
//
// }
//
// sb.append("}");
//
// return sb.toString();
//
// }
/**
* Human readable representation of the {@link ILeafData} plus transient
* information associated with the {@link Leaf}.
*/
@Override
public String toString() {
final StringBuilder sb = new StringBuilder();
// sb.append(getClass().getName());
sb.append(super.toString());
sb.append("{ isDirty="+isDirty());
sb.append(", isDeleted="+isDeleted());
sb.append(", addr=" + identity);
final Node p = (parent == null ? null : parent.get());
sb.append(", parent=" + (p == null ? "N/A" : p.toShortString()));
sb.append(", isRoot=" + (btree.root == this));
if (data == null) {
// No data record? (Generally, this means it was stolen by copy on
// write).
sb.append(", data=NA}");
return sb.toString();
}
sb.append(", nkeys=" + getKeyCount());
sb.append(", minKeys=" + minKeys());
sb.append(", maxKeys=" + maxKeys());
DefaultLeafCoder.toString(this, sb);
sb.append("}");
return sb.toString();
}
/**
* An interface that may be used to register for and receive events when the
* state of a {@link Leaf} is changed. This includes (a) adding a new tuple
* to a leaf; (b) removing a tuple from a leaf (but not flagging an existing
* tuple as deleted); and (c) when the leaf is discarded by copy-on-write.
*
* @author Bryan Thompson
* @version $Id$
*
* @todo another listener API could be developed for tuple state changes.
* that would be useful if there was a desire for pre- or
* post-processing for each tuple. This might be useful for
* introducing triggers.
*/
public static interface ILeafListener {
/**
* Notice that the leaf state has changed and that the listener must not
* assume: (a) that a tuple of interest still resides within the leaf
* (it may have been moved up or down within the leaf or it may be in
* another leaf altogether as a result of underflow or overflow); (b)
* that the leaf is still in use (it may have been discarded by a
* copy-on-write operation).
*/
public void invalidateLeaf();
// /**
// * Notice that the state of a tuple in the leaf has been changed (the
// * tuple is still known to be located within the leaf).
// *
// * @param index
// * The index of the tuple whose state was changed.
// */
// public void invalidateTuple(int index);
}
/**
* Listeners for {@link ILeafListener} events.
*
* Note: The values in the map are null.
*
* Note: Listeners are cleared from the map automatically by the JVM soon
* after the listener becomes only weakly reachable.
*
* Note: Mutable {@link BTree}s are single-threaded so there is no need to
* synchronize access to this collection.
*
* Note: These listeners are primarily used to support {@link ITupleCursor}s.
* The #of listeners at any one time is therefore directly related to the
* #of open iterators on the owning mutable {@link BTree}.
* Normally that is ONE (1) since the {@link BTree} is not thread-safe for
* mutation and each cursor has a current, prior, and next position meaning
* that we have typically either NO listeners or the current and either
* prior or next listener. This tends to make visiting the members of the
* collection (when it is defined) very fast, especially since we do not
* need to synchronize on anything.
*
* Note: The trigger conditions for the events of interest to the listeners
* are scattered throughout the {@link Leaf} class.
*/
private transient WeakHashMap<ILeafListener, Void> leafListeners = null;
/**
* Register an {@link ILeafListener} with this {@link Leaf}. Listeners are
* automatically removed by the JVM shortly after they become only weakly
* reachable.
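*
* A minimal sketch (the cursor re-positioning logic is hypothetical):
*
* <pre>
* final ILeafListener listener = new ILeafListener() {
*     public void invalidateLeaf() {
*         // Re-locate the cursor position from the B+Tree root on the
*         // next access (hypothetical cursor logic).
*     }
* };
* // The caller must retain a strong reference to [listener] since only
* // weakly reachable listeners are dropped from the map automatically.
* leaf.addLeafListener(listener);
* </pre>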
*
* @param l
* The listener.
*
* @throws IllegalStateException
* if the owning {@link AbstractBTree} is read-only.
*/
final public void addLeafListener(ILeafListener l) {
if (l == null)
throw new IllegalArgumentException();
btree.assertNotReadOnly();
if(leafListeners==null) {
leafListeners = new WeakHashMap<ILeafListener, Void>();
}
leafListeners.put(l, null);
}
/**
* Fire an {@link ILeafListener#invalidateLeaf()} event to any registered
* listeners.
*/
final protected void fireInvalidateLeafEvent() {
if(leafListeners == null) return;
for(ILeafListener l : leafListeners.keySet()) {
l.invalidateLeaf();
}
}
// /**
// * Fire an {@link ILeafListener#invalidateTuple(int)} event to any
// * registered listeners.
// *
// * @param index
// * The index of the tuple whose state was changed.
// */
// final protected void fireInvalidateTuple(int index) {
//
// if(leafListeners == null) return;
//
// for(ILeafListener l : leafListeners.keySet()) {
//
// l.invalidateTuple(index);
//
// }
//
// }
final public ByteBuffer readRawRecord(long addr) {
return btree.readRawRecord(addr);
}
}