
// com.bigdata.htree.DirectoryPage -- source listing from the bigdata-core
// artifact (available via Maven / Gradle / Ivy).
/**
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
licenses@blazegraph.com
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
package com.bigdata.htree;
import java.io.PrintStream;
import java.lang.ref.Reference;
import java.util.HashSet;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.Set;
import java.util.concurrent.FutureTask;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import com.bigdata.btree.ITuple;
import com.bigdata.btree.ITupleIterator;
import com.bigdata.btree.Node;
import com.bigdata.htree.AbstractHTree.ChildMemoizer;
import com.bigdata.htree.AbstractHTree.LoadChildRequest;
import com.bigdata.htree.data.IDirectoryData;
import com.bigdata.io.AbstractFixedByteArrayBuffer;
import com.bigdata.rawstore.IRawStore;
import com.bigdata.util.BytesUtil;
import com.bigdata.util.concurrent.Memoizer;
import cutthecrap.utils.striterators.EmptyIterator;
import cutthecrap.utils.striterators.Expander;
import cutthecrap.utils.striterators.SingleValueIterator;
import cutthecrap.utils.striterators.Striterator;
/**
* An {@link HTree} directory page (node). Each directory page will hold one or
* more "buddy" hash tables. The #of buddy hash tables on the page depends on
* the globalDepth of the page and the addressBits as computed by
* {@link DirectoryPage#getNumBuddies()}.
*/
class DirectoryPage extends AbstractPage implements IDirectoryData {
/**
* The depth of a bucket page which overflows is always addressBits.
*
* @return The value of <code>htree.addressBits</code>. This is the value
*         assigned as the <code>globalDepth</code> of an overflow directory
*         child when it is materialized by
*         {@link #_getChild(int, LoadChildRequest)}.
*/
final int getOverflowPageDepth() {
return htree.addressBits;
}
/**
* Transient references to the children.
* <p>
* The array is allocated with one element per slot on the page
* (<code>1 &lt;&lt; htree.addressBits</code>). An element is <code>null</code>
* until a child has been assigned to, or materialized for, that slot. Several
* adjacent slots may hold the same reference when more than one pointer
* addresses the same child.
*/
// Note: cleared by copyOnWrite when we steal the array.
/*final*/ Reference[] childRefs;
/**
* Persistent data.
* <p>
* The copy constructor replaces a read-only record with a
* {@link MutableDirectoryPageData} and sets this field to <code>null</code>
* on the source page, so a <code>null</code> value indicates that the record
* was stolen by copy-on-write.
*/
IDirectoryData data;
/**
* Get the child indexed by the key.
*
* Note: The recursive descent pattern requires the caller to separately
* compute the buddy index before each descent into a child.
*
* @param hashBits
* The relevant bits from the key.
* @param buddyOffset
* The offset into the child of the first slot for the buddy hash
* table or buddy hash bucket.
*
* @return The child indexed by the key.
*
* @see HTreeUtil#getBuddyOffset(int, int, int)
*/
protected AbstractPage getChild(final int hashBits, final int buddyOffset) {
    // A buddy hash table spans (1 << globalDepth) pointer slots. The first
    // slot of the addressed table is therefore that width times buddyOffset
    // and the hash bits select the slot within that table.
    final int slotsPerBuddy = 1 << globalDepth;
    return getChild(slotsPerBuddy * buddyOffset + hashBits);
}
/**
* Split the child {@link BucketPage}.
*
* Rather than immediately creating 2 buckets, we want first to just
* fill half the original slots (by testing the first key value) and
* leaving the other half as null.
*
* This will result in a lazy creation of the second BucketPage
* if required. Such that we could end up introducing several
* new layers without the needless creation of empty BucketPages.
*/
void _splitBucketPage(final BucketPage bucketPage) {
// Note: this.getChildCount() is less direct.
// The first key on the page decides which half of the slot run receives the
// replacement page (presumably bucketSlot is the slot in this directory
// that the key hashes to - confirm against getLocalHashCode()).
final byte[] tstkey = bucketPage.getFirstKey();
final int bucketSlot = getLocalHashCode(tstkey, getPrefixLength());
final int slotsOnPage = 1 << htree.addressBits;
// Find the first slot referencing the page (reference identity). Note that
// [start] silently remains 0 when no slot references the page; only the
// assertions below would detect that.
int start = 0;
for (int s = 0; s < slotsOnPage; s++) {
if (bucketPage.self == childRefs[s]) {
start = s;
break;
}
}
// Extend over the contiguous run of slots referencing the same page.
int last = start;
for (int s = start + 1; s < slotsOnPage; s++) {
if (bucketPage.self == childRefs[s]) {
last++;
} else {
break;
}
}
// #of slots in [start:last] which reference the page being split.
final int npointers = last - start + 1;
// The page can only be split here if at least two slots reference it, and
// the #of references must agree with the depth recorded on the page.
assert npointers > 1;
assert npointers % 2 == 0;
assert npointers == 1 << (htree.addressBits - bucketPage.globalDepth);
final int crefs = npointers >> 1; // half references for each new child
final int newDepth = bucketPage.globalDepth + 1;
// true iff the slot for the first key lies in the lower half of the run.
final boolean fillLowerSlots = bucketSlot < start + crefs;
assert bucketSlot >= start && bucketSlot <= last;
final BucketPage newPage = createPage(newDepth);
// Lower half: reference the new page, or clear the slots (lazy creation).
final Reference a = (Reference) (fillLowerSlots ? newPage.self : null);
for (int s = start; s < start + crefs; s++) {
childRefs[s] = (Reference) a;
}
// Upper half: the complement of the lower half.
final Reference b = (Reference) (!fillLowerSlots ? newPage.self : null);
for (int s = start + crefs; s <= last; s++) {
childRefs[s] = (Reference) b;
}
assert !newPage.isPersistent();
((HTree) htree).nleaves++; // only definitely add one
// remove original page and correct leaf count
bucketPage.delete();
((HTree) htree).nleaves--;
// insert raw values from original page
// Note: every slot index of the old page is handed to insertRawTuple().
final int bucketSlotsPerPage = bucketPage.slotsOnPage();
for (int i = 0; i < bucketSlotsPerPage; i++) {
((HTree) htree).insertRawTuple(bucketPage, i);
}
// The slot for the first key must now reference a non-empty bucket page.
assert ((BucketPage) childRefs[bucketSlot].get()).data.getKeyCount() > 0;
// ...and finally delete old page data
if (bucketPage.isPersistent()) {
htree.deleteNodeOrLeaf(bucketPage.getIdentity());
}
}
/**
 * Create a new {@link BucketPage} whose parent is this directory page.
 * <p>
 * Note: The new page is NOT entered into {@link #childRefs} by this method;
 * the caller is responsible for assigning it to the appropriate slots.
 *
 * @param depth
 *            The depth passed to the {@link BucketPage} constructor.
 *
 * @return The new page.
 */
private BucketPage createPage(final int depth) {
    final BucketPage page = new BucketPage((HTree) htree, depth);
    page.parent = (Reference) self;
    return page;
}
/**
* This is a lazy creation of a BucketPage to fill the relevant number of slots
* based on the number of empty slots surrounding it.
*
* If all slots are empty, then they will all be filled with references to the
* new page.
*
* The regions tested follow the pattern below:
* [A A A A A A A A]
* [A A A A B B B B]
* [A A B B C C D D]
* [A B C D E F G H]
*
* Examples with 8 slots, test slot 'T'
* [- - - - T - - -] // all empty
* [S S S S S S S S] // set ALL 8
*
* [F F - - T - - -] // other half not empty
* [F F - - S S S S] // set 4 slots
*
* [F F - T - - - -] // same half not empty
* [F F S S - - - -] // set 2 slots
*
* @param slot - the target slot to be filled
* @param bucketPage - the page to replicate
*/
private void fillEmptySlots(final int slot, final BucketPage bucketPage) {
    // The search starts with the whole page as the candidate region.
    final int filled = fillEmptySlots(slot, bucketPage, 0, 1 << htree.addressBits);
    assert bucketPage.globalDepth == htree.addressBits;
    // A page referenced from a single slot has depth addressBits. Each
    // doubling of the #of slots which reference the page lowers its depth
    // by one.
    int depth = htree.addressBits;
    for (int span = filled; span > 1; span >>= 1) {
        depth--;
    }
    bucketPage.globalDepth = depth;
    // assert (1 << (htree.addressBits - bucketPage.globalDepth)) == countChildRefs(bucketPage);
}
/**
 * Count the slots on this page which reference the given page. The comparison
 * uses reference identity against <code>pge.self</code>.
 *
 * @param pge
 *            The page whose references are to be counted.
 *
 * @return The #of elements of {@link #childRefs} which are that reference.
 */
int countChildRefs(AbstractPage pge) {
    int matches = 0;
    for (final Reference ref : childRefs) {
        if (ref == pge.self) {
            matches++;
        }
    }
    return matches;
}
/**
 * Recursive helper which assigns the page to the largest completely empty
 * region containing the target slot. A region is the half-open slot range
 * <code>[from, to)</code>. If any slot in the region is in use (it has a
 * child reference or a child address), the search continues in whichever
 * half of the region contains the target slot. A region of one slot is always
 * assigned.
 *
 * @param slot
 *            The target slot; it must lie in <code>[from, to)</code>.
 * @param bucketPage
 *            The page to be referenced from the slots (must not be
 *            persistent).
 * @param from
 *            The first slot of the region (inclusive).
 * @param to
 *            The end of the region (exclusive).
 *
 * @return The #of slots which were set to reference the page.
 *
 * @throws IllegalArgumentException
 *             if the slot does not lie in <code>[from, to)</code>.
 */
private int fillEmptySlots(final int slot, final BucketPage bucketPage, final int from, final int to) {
    // Note: [to] is exclusive, so slot == to is outside of the region.
    if (slot < from || slot >= to) {
        throw new IllegalArgumentException("slot=" + slot
                + " is not in [" + from + "," + to + ")");
    }
    assert !bucketPage.isPersistent();
    if (from == slot && to == slot + 1) {
        // The region is just the target slot.
        childRefs[slot] = (Reference) bucketPage.self;
        return 1;
    }
    // test for full range null
    boolean regionEmpty = true;
    for (int i = from; i < to; i++) {
        if (childRefs[i] != null || getChildAddr(i) != IRawStore.NULL) {
            regionEmpty = false;
            break;
        }
    }
    if (!regionEmpty) {
        // recurse into the half of the region which contains the slot.
        final int childlen = (to - from) >> 1; // half
        if (slot < (from + childlen)) {
            return fillEmptySlots(slot, bucketPage, from, from + childlen);
        }
        return fillEmptySlots(slot, bucketPage, to - childlen, to);
    }
    // assign to validated null range
    for (int i = from; i < to; i++) {
        assert childRefs[i] == null;
        childRefs[i] = (Reference) bucketPage.self;
    }
    return to - from;
}
/**
* Return the {@link Reference} for the child at that index.
* <p>
* Note: This returns the current element of {@link #childRefs} as is. It does
* not materialize the child, so the result may be <code>null</code>.
*
* @param index
* The index
*
* @return The {@link Reference} (may be <code>null</code>).
*/
Reference getChildRef(final int index) {
return childRefs[index];
}
/**
 * This method must be invoked on a parent to notify the parent that the child
 * has become persistent. The method scans the references for the children
 * and, for each slot in the first contiguous run of slots which reference the
 * child, records the child's identity in the corresponding element of the
 * array of child addresses.
 * <p>
 * Note: If no slot references the child then the method returns without
 * modifying anything. It does NOT throw an exception in that case.
 *
 * @param child
 *            The child.
 *
 * @exception IllegalStateException
 *                if the child is not persistent.
 * @exception IllegalStateException
 *                if {@link #childRefs} is <code>null</code>.
 */
void setChildAddr(final AbstractPage child) {
    assert !isReadOnly();
    if (!child.isPersistent()) {
        // The child does not have persistent identity.
        throw new IllegalStateException();
    }
    final int slotLimit = 1 << htree.addressBits;
    if (childRefs == null)
        throw new IllegalStateException("childRefs must not be NULL");
    boolean inRun = false;
    for (int slot = 0; slot < slotLimit; slot++) {
        if (childRefs[slot] == child.self) {
            ((MutableDirectoryPageData) data).childAddr[slot] = child
                    .getIdentity();
            inRun = true;
            continue;
        }
        if (inRun) {
            // we are past the last pointer to that child.
            break;
        }
    }
}
/**
* Return the child at the specified index in the {@link DirectoryPage}.
*
* @param index
* The index of the slot in the {@link DirectoryPage}. If the
* child must be materialized, the buddyOffset will be computed
* based on the globalDepth and the #of pointers to that child
* will be computed so its depth may be set.
*
* @return The child at that index.
*/
AbstractPage getChild(final int index) {
// See BLZG-1657 (Add BTreeCounters for cache hit and cache miss)
htree.getBtreeCounters().cacheTests.increment();
/**
* FIXME: Need to check whether we are using unnecessary synchronization
*
* Note: checkLazyChild() returns the child if its reference is already
* resident. If the slot has no persistent address it lazily creates a new
* bucket page for the slot instead. It returns null only when the child has
* an address and must be read from the backing store.
*/
AbstractPage ret = checkLazyChild(index);
if (ret != null) {
return ret;
}
if (htree.memo == null) {
/*
* Optimization for the mutable B+Tree.
*
* Note: This optimization depends on the assumption that concurrent
* operations are never submitted to the mutable B+Tree. In fact,
* the UnisolatedReadWriteIndex *DOES* allow concurrent readers (it
* uses a ReentrantReadWriteLock). Therefore this code path is now
* expressed conditionally on whether or not the Memoizer object is
* initialized by AbstractBTree.
*
* Note: Since the caller is single-threaded for the mutable B+Tree
* we do not need to use the Memoizer, which just delegates to
* _getChild(index). This saves us some object creation and overhead
* for this case.
*/
// See BLZG-1657 (Add BTreeCounters for cache hit and cache miss)
htree.getBtreeCounters().cacheMisses.increment();
return _getChild(index, null/* req */);
}
/*
* If we can resolve a hard reference to the child then we do not need
* to look any further.
*/
// synchronized (childRefs)
{
/*
* Note: The synchronized statement above is commented out, so this
* second test of childRefs[index] is NOT synchronized. The synchronized
* test of the same slot has already been made by checkLazyChild().
*/
final Reference childRef = childRefs[index];
final AbstractPage child = childRef == null ? null : childRef.get();
if (child != null) {
// Already materialized.
htree.touch(child);
return child;
}
}
/*
* Otherwise we need to go through the Memoizer pattern to achieve
* non-blocking access. It will wind up delegating to _getChild(int),
* which is immediately below. However, it will ensure that one and only
* one thread executes _getChild(int) for a given parent and child
* index. That thread will update childRefs[index]. Any concurrent
* requests for the same child will wait for the FutureTask inside of
* the Memoizer and then return the new value of childRefs[index].
*/
/*
* See BLZG-1657 (Add BTreeCounters for cache hit and cache miss)
*
* Note: This is done in the caller rather than _getChild() since the
* latter may be called from the memoizer, in which case only one thread
* will actually invoke _getChild() while the others will just obtain
* the child through the memoized Future.
*/
htree.getBtreeCounters().cacheMisses.increment();
return htree.loadChild(this, index);
}
/**
 * Resolve the child at the index without reading from the backing store.
 * <p>
 * If the reference for the slot is resident then that child is touched and
 * returned. Otherwise, if the slot has no persistent address then a new
 * bucket page is lazily created for the slot (after copy-on-write of this
 * page) and returned.
 *
 * @param index
 *            The index of the slot.
 *
 * @return The child -or- <code>null</code> if the child has a persistent
 *         address but is not resident (the caller must load it).
 */
private AbstractPage checkLazyChild(final int index) {
    synchronized (childRefs) {
        /*
         * Note: synchronized to ensure visibility for childRefs[index] (in
         * case it was updated in another thread).
         */
        final Reference ref = childRefs[index];
        final AbstractPage resident = ref == null ? null : ref.get();
        if (resident != null) {
            // Already materialized.
            htree.touch(resident);
            return resident;
        }
    }
    if (getChildAddr(index) != IRawStore.NULL) {
        // The child is persistent and must be read by the caller.
        return null;
    }
    // Lazy BucketPage creation.
    final DirectoryPage mutablePage = (DirectoryPage) copyOnWrite(IRawStore.NULL);
    return mutablePage.setLazyChild(index);
}
/**
 * Create a new {@link BucketPage} for an empty slot, assign it to that slot
 * (and to any surrounding empty slots as decided by
 * {@link #fillEmptySlots(int, BucketPage)}), and count it as a new leaf.
 *
 * @param index
 *            The index of the (empty) slot.
 *
 * @return The new page.
 */
private BucketPage setLazyChild(final int index) {
    // This page is expected to be mutable already (see the assertion).
    final DirectoryPage mutablePage = (DirectoryPage) copyOnWrite(IRawStore.NULL);
    assert mutablePage == this;
    assert childRefs[index] == null;
    // New page at the maximum depth, with this page as its parent.
    final BucketPage bucket = createPage(htree.addressBits);
    ((HTree) htree).nleaves++;
    fillEmptySlots(index, bucket);
    htree.touch(bucket);
    return bucket;
}
/**
* Method conditionally reads the child at the specified index from the
* backing store and sets its reference on the appropriate element of
* {@link #childRefs}. This method assumes that external mechanisms
* guarantee that no other thread is requesting the same child via this
* method at the same time. For the mutable B+Tree, that guarantee is
* trivially given by its single-threaded constraint. For the read-only
* B+Tree, {@link AbstractHTree#loadChild(DirectoryPage, int)} provides this
* guarantee using a {@link Memoizer} pattern. This method explicitly
* handshakes with the {@link ChildMemoizer} to clear the {@link FutureTask}
* from the memoizer's internal cache as soon as the reference to the child
* has been set on the appropriate element of {@link #childRefs}.
*
* @param index
* The index of the child.
* @param req
* The key we need to remove the request from the
* {@link ChildMemoizer} cache (and <code>null</code> if this
* method is not invoked by the memoizer pattern).
*
* @return The child and never <code>null</code>.
*/
AbstractPage _getChild(final int index, final LoadChildRequest req) {
/*
* Make sure that the child is not reachable. It could have been
* concurrently set even if the caller had tested this and we do not
* want to read through to the backing store unless we need to.
*
* Note: synchronizing on childRefs[] should not be necessary. For a
* read-only B+Tree, the synchronization is provided by the Memoizer
* pattern. For a mutable B+Tree, the synchronization is provided by the
* single-threaded contract for mutation and by the requirement to use a
* construct, such as a Queue or the UnisolatedReadWriteIndex, which
* imposes a memory barrier when passing a B+Tree instance between
* threads.
*
* See http://www.cs.umd.edu/~pugh/java/memoryModel/archive/1096.html
*/
AbstractPage child;
synchronized (childRefs) {
/*
* Note: we need to synchronize on here to ensure visibility for
* childRefs[index] (in case it was updated in another thread).
*/
final Reference childRef = childRefs[index];
child = childRef == null ? null : childRef.get();
if (child != null) {
// Already materialized.
htree.touch(child);
return child;
}
}
/*
* The child needs to be read from the backing store.
*/
// width of a buddy hash table (#of pointer slots).
final int tableWidth = 1 << globalDepth;
// offset in [0:nbuddies-1] to the start of the buddy spanning that
// index.
//
// NOTE(review): (index / tableWidth) is the NUMBER of the buddy table, but
// both scans below use it directly as the first SLOT index and add
// tableWidth (a slot count) to obtain the end of the scan. The two only
// agree when the value is zero, which is the case whenever globalDepth ==
// htree.addressBits (assuming index is a valid slot index). If a page can
// hold more than one buddy table this probably needs to be
// (index / tableWidth) * tableWidth - confirm.
final int tableOffset = index / tableWidth;
/*
* We need to get the address of the child, figure out the local depth
* of the child (by counting the #of points in the buddy bucket to that
* child), and then materialize the child from its address.
*/
final long addr = data.getChildAddr(index);
if (addr == IRawStore.NULL) {
// dump(Level.DEBUG, System.err);
/*
* Note: It appears that this can be triggered by a full disk, but I
* am not quite certain how a full disk leads to this condition.
* Presumably the full disk would cause a write of the child to
* fail. In turn, that should cause the thread writing on the B+Tree
* to fail. If group commit is being used, the B+Tree should then be
* discarded and reloaded from its last commit point.
*/
throw new AssertionError(
"Child does not have persistent identity: this=" + this
+ ", index=" + index);
}
/*
* Scan to count pointers to child within the buddy hash table.
*/
final int npointers;
{
int n = 0; // npointers
final int lastIndex = (tableOffset + tableWidth);
for (int i = tableOffset; i < lastIndex; i++) {
if (data.getChildAddr(i) == addr)
n++;
}
assert n > 0;
npointers = n;
}
/*
* Find the local depth of the child within this node. this becomes the
* global depth of the child.
*
* Note: This winds up being invoked to compute the depth of an overflow
* directory page when we first descend from a normal directory page.
*/
final int localDepth = HTreeUtil.getLocalDepth(htree.addressBits,
globalDepth, npointers);
assert npointers == 1 << (htree.addressBits - localDepth);
/*
* Read the child from the backing store (potentially reads through to
* the disk).
*
* Note: This is guaranteed to not do duplicate reads. There are two
* cases. (A) The mutable B+Tree. Since the mutable B+Tree is single
* threaded, this case is trivial. (B) The read-only B+Tree. Here our
* guarantee is that the caller is in ft.run() inside of the Memoizer,
* and that ensures that only one thread is executing for a given
* LoadChildRequest object (the input to the Computable). Note that
* LoadChildRequest MUST meet the criteria for a hash map for this
* guarantee to obtain.
*/
child = htree.readNodeOrLeaf(addr);
// set the depth on the child.
if (!child.isLeaf() && ((DirectoryPage) child).isOverflowDirectory()) {
/*
* Note: The global depth of an overflow page is always set to this
* constant. It should be ignored for code paths which maintain the
* overflow directory and overflow bucket pages.
*/
child.globalDepth = getOverflowPageDepth();
} else {
child.globalDepth = localDepth;
}
/*
* Set the reference for each slot in the buddy bucket which pointed at
* that child. There will be [npointers] such slots.
*
* Note: This code block is synchronized in order to facilitate the safe
* publication of the change in childRefs[index] to other threads.
*/
synchronized (childRefs) {
int n = 0;
final int lastIndex = (tableOffset + tableWidth);
for (int i = tableOffset; i < lastIndex; i++) {
if (data.getChildAddr(i) == addr) {
/*
* Since the childRefs[index] element has not been updated we do so
* now while we are synchronized.
*
* Note: This paranoia test could be tripped if the caller allowed
* concurrent requests to enter this method for the same child. In
* that case childRefs[index] could have an uncleared reference to
* the child. This would indicate a breakdown in the guarantee we
* require of the caller.
*/
assert childRefs[i] == null || childRefs[i].get() == null : "Child is already set: this="
+ this + ", index=" + i;
childRefs[i] = (Reference) child.self;
n++;
}
}
// patch parent reference since loaded from store.
child.parent = (Reference) this.self;
// The second scan must find the same #of pointers as the first scan.
assert n == npointers;
}
/*
* Clear the future task from the memoizer cache.
*
* Note: This is necessary in order to prevent the cache from retaining
* a hard reference to each child materialized for the B+Tree.
*
* Note: This does not depend on any additional synchronization. The
* Memoizer pattern guarantees that only one thread actually call
* ft.run() and hence runs this code.
*/
if (req != null) {
htree.memo.removeFromCache(req);
}
htree.touch(child);
return child;
}
// private AbstractPage fixme(final int index) {
//
// /*
// * Look at the entry in the buddy hash table. If there is a reference to
// * the child and that reference has not been cleared, then we are done
// * and we can return the child reference and the offset of the buddy
// * table or bucket within the child.
// */
// final Reference ref = childRefs[index];
//
// AbstractPage child = ref == null ? null : ref.get();
//
// if (child != null) {
//
// return child;
//
// }
//
// // width of a buddy hash table (#of pointer slots).
// final int tableWidth = 1 << globalDepth;
//
// // offset in [0:nbuddies-1] to the start of the buddy spanning that
// // index.
// final int tableOffset = index / tableWidth;
//
// /*
// * We need to get the address of the child, figure out the local depth
// * of the child (by counting the #of points in the buddy bucket to that
// * child), and then materialize the child from its address.
// *
// * fixme MEMORIZER : This all needs to go through a memoizer pattern.
// * The hooks for that should be on AbstractHTree(.memo), but I have not
// * yet ported that code.
// */
// final long addr = data.getChildAddr(index);
//
// /*
// * Scan to count pointers to child within the buddy hash table.
// */
// final int npointers;
// {
//
// int n = 0; // npointers
//
// final int lastIndex = (tableOffset + tableWidth);
//
// for (int i = tableOffset; i < lastIndex; i++) {
//
// if (data.getChildAddr(i) == addr)
// n++;
//
// }
//
// assert n > 0;
//
// npointers = n;
//
// }
//
// /*
// * Find the local depth of the child within this node. this becomes the
// * global depth of the child.
// */
// final int localDepth = HTreeUtil.getLocalDepth(htree.addressBits,
// globalDepth, npointers);
//
// child = htree.readNodeOrLeaf(addr, localDepth/* globalDepthOfChild */);
//
// /*
// * Set the reference for each slot in the buddy bucket which pointed at
// * that child. There will be [npointers] such slots.
// */
// {
//
// int n = 0;
//
// final int lastIndex = (tableOffset + tableWidth);
//
// for (int i = tableOffset; i < lastIndex; i++) {
//
// if (data.getChildAddr(i) == addr) {
//
// childRefs[i] = (Reference) child.self;
//
// n++;
//
// }
//
// }
//
// assert n == npointers;
//
// }
//
// return child;
//
// }
/**
* Return the buffer reported by the backing {@link IDirectoryData} record
* (delegates to {@link #data}).
*/
public AbstractFixedByteArrayBuffer data() {
return data.data();
}
/**
* {@inheritDoc}
*
* Note: This method can only be used once you have decoded the hash bits
* from the key and looked up the offset of the buddy hash table on the page
* and the offset of the slot within the buddy hash table for the desired
* key. You must also know the localDepth of the child when it is
* materialized so that information can be set on the child, where is
* becomes the globalDepth of the child.
*
* @see #getChildAddrByHashCode(int, int)
*/
public long getChildAddr(int index) {
// Delegates to the backing data record. Elsewhere in this class a result
// of IRawStore.NULL is treated as "no persistent child for this slot".
return data.getChildAddr(index);
}
/**
* Return the child count reported by the backing data record (delegates to
* {@link #data}).
*/
public int getChildCount() {
return data.getChildCount();
}
/**
* Return the maximum version timestamp reported by the backing data record
* (delegates to {@link #data}).
*/
public long getMaximumVersionTimestamp() {
return data.getMaximumVersionTimestamp();
}
/**
* Return the minimum version timestamp reported by the backing data record
* (delegates to {@link #data}).
*/
public long getMinimumVersionTimestamp() {
return data.getMinimumVersionTimestamp();
}
/**
* Return whether the backing data record reports version timestamps
* (delegates to {@link #data}).
*/
public boolean hasVersionTimestamps() {
return data.hasVersionTimestamps();
}
/**
* Return whether the backing data record reports itself as coded (delegates
* to {@link #data}).
*/
public boolean isCoded() {
return data.isCoded();
}
/**
* Return whether the backing data record reports itself as a leaf (delegates
* to {@link #data}).
*/
public boolean isLeaf() {
return data.isLeaf();
}
/**
* The result depends on the backing {@link IDirectoryData} implementation.
* The {@link DirectoryPage} will be mutable when it is first created and is
* made immutable when it is persisted. If there is a mutation operation,
* the backing {@link IDirectoryData} is automatically converted into a
* mutable instance.
* <p>
* Note: This method simply delegates to the current {@link #data} record.
* The conversion to a mutable record is performed by the copy constructor,
* which wraps a read-only record in a {@link MutableDirectoryPageData}.
*/
public boolean isReadOnly() {
return data.isReadOnly();
}
/**
* @param htree
* The owning hash tree.
* @param overflowDirectory
* true
iff this is an overflow directory page.
* @param globalDepth
* The size of the address space (in bits) for each buddy hash
* table on a directory page. The global depth of a node is
* defined recursively as the local depth of that node within its
* parent. The global/local depth are not stored explicitly.
* Instead, the local depth is computed dynamically when the
* child will be materialized by counting the #of pointers to the
* the child in the appropriate buddy hash table in the parent.
* This local depth value is passed into the constructor when the
* child is materialized and set as the global depth of the
* child.
*/
@SuppressWarnings("unchecked")
public DirectoryPage(final HTree htree, final byte[] overflowKey,
final int globalDepth) {
super(htree, true/* dirty */, globalDepth);
childRefs = new Reference[(1 << htree.addressBits)];
data = new MutableDirectoryPageData(overflowKey, htree.addressBits,
htree.versionTimestamps);
}
/**
* Deserialization constructor - {@link #globalDepth} MUST be set by the
* caller.
*
* @param htree
* @param addr
* @param data
*/
DirectoryPage(final HTree htree, final long addr, final IDirectoryData data) {
super(htree, false/* dirty */, 0/*unknownGlobalDepth*/);
setIdentity(addr);
this.data = data;
childRefs = new Reference[(1 << htree.addressBits)];
}
/**
* Copy constructor.
*
* @param src
* The source node (must be immutable).
*
* @param triggeredByChildId
* The persistent identity of the child that triggered the copy
* constructor. This should be the immutable child NOT the one
* that was already cloned. This information is used to avoid
* stealing the original child since we already made a copy of
* it. It is {@link #NULL} when this information is not
* available, e.g., when the copyOnWrite action is triggered by a
* join() and we are cloning the sibling before we redistribute a
* key to the node/leaf on which the join was invoked.
*
* @todo We could perhaps replace this with the conversion of the
* INodeData:data field to a mutable field since the code which
* invokes copyOnWrite() no longer needs to operate on a new Node
* reference. However, I need to verify that nothing else depends on
* the new Node, e.g., the dirty flag, addr, etc.
*
* @todo Can't we just test to see if the child already has this node as its
* parent reference and then skip it? If so, then that would remove a
* troublesome parameter from the API.
*/
protected DirectoryPage(final DirectoryPage src,
final long triggeredByChildId) {
super(src);
// The source must be a clean, read-only page.
assert !src.isDirty();
assert src.isReadOnly();
// assert src.isPersistent();
/*
* Steal/clone the data record.
*
* Note: The copy constructor is invoked when we need to begin mutation
* operations on an immutable node or leaf, so make sure that the data
* record is mutable.
*/
final int slotsOnPage = 1 << htree.addressBits;
assert src.data != null;
// A read-only record is wrapped by a new mutable record; a record which is
// already mutable is taken over as is.
this.data = src.isReadOnly() ? new MutableDirectoryPageData(
htree.addressBits, src.data) : src.data;
assert this.data != null;
// clear reference on source.
src.data = null;
/*
* Steal strongly reachable unmodified children by setting their parent
* fields to the new node. Stealing the child means that it MUST NOT be
* used by its previous ancestor (our source node for this copy).
*
* Note: The childRefs array itself is taken from the source (it is not
* copied) and the source's reference to it is cleared.
*/
childRefs = src.childRefs;
src.childRefs = null;
// childLocks = src.childLocks; src.childLocks = null;
for (int i = 0; i < slotsOnPage; i++) {
// Note: a null result is skipped below.
final AbstractPage child = deref(i);
/*
* Note: Both child.identity and triggeredByChildId will always be
* 0L for a transient B+Tree since we never assign persistent
* identity to the nodes and leaves. Therefore [child.identity !=
* triggeredByChildId] will fail for ALL children, including the
* trigger, and therefore fail to set the parent on any of them. The
* [btree.store==null] test handles this condition and always steals
* the child, setting its parent to this new node.
*
* FIXME It is clear that testing on child.identity is broken in
* some other places for the transient store. [This comment is
* carried over from the B+Tree code.]
*/
if (child != null
&& (htree.store == null || child.getIdentity() != triggeredByChildId)) {
/*
* Copy on write should never trigger for a dirty node and only
* a dirty node can have dirty children.
*/
assert !child.isDirty();
// Steal the child.
child.parent = (Reference) this.self;
// child.parent = btree.newRef(this);
// // Keep a reference to the clean child.
// childRefs[i] = new WeakReference(child);
}
}
}
/**
 * Post-order traversal: visits the children (each child recursively expanded
 * in post-order) and visits this page last.
 *
 * @param dirtyNodesOnly
 *            When <code>true</code> and this page is not dirty, nothing is
 *            visited.
 * @param nodesOnly
 *            Passed through to the iterator over the children.
 *
 * @return The iterator.
 */
@Override
@SuppressWarnings("unchecked")
public Iterator postOrderNodeIterator(
        final boolean dirtyNodesOnly, final boolean nodesOnly) {
    final boolean nothingToVisit = dirtyNodesOnly && !dirty;
    if (nothingToVisit) {
        return EmptyIterator.DEFAULT;
    }
    // The children come first...
    final Iterator children = postOrderIterator1(dirtyNodesOnly, nodesOnly);
    // ...and this page is appended in the post-order position.
    final Iterator thisPageLast = new SingleValueIterator(this);
    return new Striterator(children).append(thisPageLast);
}
/**
 * Post-order traversal: visits the children (each child recursively expanded
 * in post-order) and visits this page last.
 *
 * @return The iterator.
 */
@SuppressWarnings("unchecked")
public Iterator postOrderIterator() {
    // The children come first...
    final Iterator children = postOrderIterator2();
    // ...and this page is appended in the post-order position.
    final Iterator thisPageLast = new SingleValueIterator(this);
    return new Striterator(children).append(thisPageLast);
}
/**
 * Visits the children (recursively) using post-order traversal, but does NOT
 * visit this page itself.
 *
 * @return The iterator.
 */
@SuppressWarnings("unchecked")
private Iterator postOrderIterator2() {
    // The direct children, each of which is expanded by the filter below.
    final Striterator directChildren = new Striterator(childIterator());
    return directChildren.addFilter(new Expander() {
        private static final long serialVersionUID = 1L;

        /*
         * Expand one child: a leaf is visited as is; a directory page is
         * replaced by the post-order traversal of its own children followed
         * by the directory page itself.
         */
        protected Iterator expand(final Object childObj) {
            final AbstractPage page = (AbstractPage) childObj;
            if (page.isLeaf()) {
                // Visit the leaf itself.
                return new SingleValueIterator(page);
            }
            // Recursive post-order traversal of the child's children.
            final Striterator subtree = new Striterator(
                    ((DirectoryPage) page).postOrderIterator2());
            // append the child in the post-order position.
            subtree.append(new SingleValueIterator(page));
            return subtree;
        }
    });
}
/**
* Iterator visits the direct children of this page in slot order, visiting
* each distinct child once (see {@link ChildIterator}).
* <p>
* Note: Creating the iterator already advances to the first child, which may
* cause that child to be materialized.
*/
Iterator childIterator() {
return new ChildIterator();
}
/**
 * Visit the distinct children exactly once (if there are multiple pointers
 * to a given child, that child is visited just once).
 */
private class ChildIterator implements Iterator {
    /** The #of slots on the page. */
    final private int slotsPerPage = 1 << htree.addressBits;
    /** The slot of the child to be returned by next() (or slotsPerPage). */
    private int slot = 0;
    /** The child to be returned by next(). */
    private AbstractPage child = null;

    private ChildIterator() {
        nextChild(); // materialize the first child.
    }

    /**
     * Advance to the next distinct child, materializing it if necessary.
     * Slots which have neither a resident child nor a child address are
     * skipped, as are further pointers to the child which was found last.
     * <p>
     * Note: A <code>null</code> pointer for a child at a given index is
     * allowed either in an overflow directory or with lazy child creation.
     * <p>
     * Note: When a child is found, {@link #slot} is left on the slot of that
     * child. {@link #hasNext()} depends on this.
     *
     * @return <code>true</code> iff there is another distinct child
     *         reference.
     */
    private boolean nextChild() {
        while (slot < slotsPerPage) {
            AbstractPage candidate = deref(slot);
            if (candidate == null) {
                if (data.getChildAddr(slot) == NULL) {
                    // Nothing in this slot.
                    slot++;
                    continue;
                }
                // Not resident, but it has an address: materialize it.
                candidate = getChild(slot);
            }
            if (candidate != child) {
                child = candidate;
                return true;
            }
            // Another pointer to the same child.
            slot++;
        }
        return false;
    }

    @Override
    public boolean hasNext() {
        /*
         * Note: This depends on nextChild() being invoked by the constructor
         * and by next(). The slot only reaches slotsPerPage once no further
         * distinct child was found.
         */
        return slot < slotsPerPage;
    }

    @Override
    public AbstractPage next() {
        if (!hasNext())
            throw new NoSuchElementException();
        final AbstractPage current = child;
        nextChild(); // advance to the next child (if any).
        return current;
    }

    @Override
    public void remove() {
        throw new UnsupportedOperationException();
    }
}
/**
 * Human readable representation of this directory page: its state flags,
 * address, parent, depth and, for each child slot reported by
 * {@link #getChildCount()}, whether the child is loaded ("L") or unloaded
 * ("U"). If the data record is gone (generally because it was stolen by
 * copy-on-write) the representation is cut short with "data=NA".
 */
@Override
public String toString() {
    final StringBuilder sb = new StringBuilder();
    sb.append(super.toString());
    sb.append("{ isDirty=").append(isDirty());
    sb.append(", isDeleted=").append(isDeleted());
    sb.append(", addr=").append(identity);
    final DirectoryPage p = (parent == null ? null : parent.get());
    sb.append(", parent=").append(p == null ? "N/A" : p.toShortString());
    sb.append(", isRoot=").append(htree.root == this);
    if (data == null) {
        // No data record? (Generally, this means it was stolen by copy on
        // write).
        sb.append(", data=NA}");
        return sb.toString();
    }
    final int slotsPerBuddy = 1 << globalDepth;
    sb.append(", globalDepth=").append(getGlobalDepth());
    sb.append(", nbuddies=").append((1 << htree.addressBits) / slotsPerBuddy);
    sb.append(", slotsPerBuddy=").append(slotsPerBuddy);
    toString(this, sb);
    // indicate if each child is loaded or unloaded.
    final int nchildren = getChildCount();
    sb.append(", children=[");
    for (int i = 0; i < nchildren; i++) {
        if (i > 0) {
            sb.append(", ");
        }
        final Reference ref = childRefs[i];
        final boolean loaded = ref != null && ref.get() != null;
        sb.append(loaded ? "L" : "U");
    }
    sb.append("]");
    sb.append("}");
    return sb.toString();
}
/**
 * Visits the descendants of this page (recursively) using post-order
 * traversal. This page itself is NOT visited.
 * <p>
 * The direct children are obtained from {@link #childIterator(boolean)}.
 * When <i>dirtyNodesOnly</i> is <code>true</code> that iterator makes a
 * best effort to visit only dirty children and MUST NOT force children to
 * be loaded from disk (dirty nodes are always resident). Any child which
 * is nevertheless clean is filtered out here, together with its subtree.
 *
 * @param dirtyNodesOnly
 *            When <code>true</code>, only dirty pages are visited.
 * @param nodesOnly
 *            When <code>true</code>, only {@link DirectoryPage}s are
 *            visited (other children are not reported).
 *
 * @return The iterator.
 */
@SuppressWarnings("unchecked")
private Iterator postOrderIterator1(
        final boolean dirtyNodesOnly, final boolean nodesOnly) {
    final Striterator directChildren = new Striterator(
            childIterator(dirtyNodesOnly));
    return directChildren.addFilter(new Expander() {
        private static final long serialVersionUID = 1L;

        /*
         * Expand each direct child into the post-order visitation of its
         * subtree.
         */
        protected Iterator expand(final Object childObj) {
            final AbstractPage page = (AbstractPage) childObj;
            if (dirtyNodesOnly && !page.isDirty()) {
                // Clean child: nothing below it is reported.
                return EmptyIterator.DEFAULT;
            }
            if (!(page instanceof DirectoryPage)) {
                // Not a directory page: report it unless only nodes were
                // requested.
                if (nodesOnly) {
                    return EmptyIterator.DEFAULT;
                }
                return new SingleValueIterator(page);
            }
            // A directory page: first its descendants (recursive
            // post-order traversal) ...
            final Striterator subtree = new Striterator(
                    ((DirectoryPage) page).postOrderIterator1(
                            dirtyNodesOnly, nodesOnly));
            // ... then the directory page itself, in post-order position.
            subtree.append(new SingleValueIterator(page));
            return subtree;
        }
    });
}
/**
 * Return an iterator over the direct children of this directory page.
 *
 * @param dirtyNodesOnly
 *            When <code>true</code> a {@link DirtyChildIterator} is
 *            returned; otherwise a {@link ChildIterator} is returned.
 *
 * @return The iterator.
 */
public Iterator childIterator(final boolean dirtyNodesOnly) {
    if (!dirtyNodesOnly) {
        return new ChildIterator();
    }
    return new DirtyChildIterator(this);
}
/**
 * Write a diagnostic dump of this directory page onto <i>out</i> and check
 * it for structural inconsistencies. Only children which are already
 * resident (reachable through <code>childRefs</code>) are examined; this
 * method does not load children from their persistent address.
 * <p>
 * TODO We should dump each bucket page once. This could be done either by
 * dumping each buddy bucket on the page separately or by skipping through
 * the directory page until we get to the next bucket page and then dumping
 * that.
 * <p>
 * TODO The directory page validation should include checks on the bucket
 * references and addresses. For a given buddy hash table, the reference and
 * address pairs should be consistent if either the reference or the address
 * appears in another slot of that table. Also, there can not be "gaps"
 * between observations of a reference to a given bucket - once you see
 * another bucket reference a previously observed reference can not then
 * appear.
 *
 * @param level
 *            The logging level. The page itself is written out iff the
 *            level is DEBUG or finer. Errors are always written.
 * @param out
 *            Where to write the dump.
 * @param height
 *            The indentation level used for the messages.
 * @param recursive
 *            When <code>true</code> the resident children are dumped as
 *            well.
 * @param materialize
 *            Passed through to the children when recursing; not otherwise
 *            used by this method.
 *
 * @return <code>true</code> iff no inconsistency was detected.
 *
 * @see HTree#validatePointersInParent(DirectoryPage, int, AbstractPage)
 */
@Override
protected boolean dump(final Level level, final PrintStream out, final int height,
        final boolean recursive, final boolean materialize) {
    // True iff we will write out the node structure.
    final boolean debug = level.toInt() <= Level.DEBUG.toInt();
    // Set false iff an inconsistency is detected.
    boolean ok = true;
    if (this == htree.root) {
        if (parent != null) {
            out.println(indent(height)
                    + "ERROR: this is the root, but the parent is not null.");
            ok = false;
        }
    } else {
        /*
         * Note: there is a difference between having a parent reference and
         * having the parent be strongly reachable. However, we actually
         * want to maintain both -- a parent MUST always be strongly
         * reachable ... UNLESS you are doing a fast forward or reverse leaf
         * scan since the node hierarchy is not being traversed in that
         * case.
         */
        if (parent == null) {
            out.println(indent(height)
                    + "ERROR: the parent reference MUST be defined for a non-root node.");
            ok = false;
        } else if (parent.get() == null) {
            out.println(indent(height)
                    + "ERROR: the parent is not strongly reachable.");
            ok = false;
        }
    }
    if (debug) {
        out.println(indent(height) + toString());
    }
    // #of slots on the page (invariant for the scan).
    final int slotsOnPage = 1 << htree.addressBits;
    /*
     * Look for inconsistencies in the resident children. A dirty child MUST
     * NOT have an entry in childAddr[] since it is not persistent. Likewise
     * if a child is NOT dirty, then it MUST have an entry in childAddr.
     *
     * Note: This does not fetch a child which is not in memory.
     */
    for (int i = 0; i < slotsOnPage; i++) {
        final AbstractPage child = (childRefs[i] == null ? null
                : (AbstractPage) childRefs[i].get());
        if (child == null) {
            continue;
        }
        if (child.parent == null || child.parent.get() == null) {
            /*
             * The reference to the parent MUST exist since we are the
             * parent and therefore the parent is strongly reachable.
             */
            out.println(indent(height) + " ERROR child[" + i
                    + "] does not have parent reference.");
            ok = false;
        }
        /*
         * Guard against a missing parent reference: it was already
         * reported above and must not abort the dump with a
         * NullPointerException.
         */
        if (child.parent != null && child.parent.get() != this) {
            out.println(indent(height) + " ERROR child[" + i
                    + "] has wrong parent.");
            ok = false;
        }
        if (child.isDirty()) {
            /*
             * Dirty child. The parent of a dirty child MUST also be dirty.
             */
            if (!isDirty()) {
                out.println(indent(height) + " ERROR child[" + i
                        + "] is dirty, but its parent is clean");
                ok = false;
            }
            if (getChildAddr(i) != NULL) {
                out.println(indent(height) + " ERROR childAddr[" + i
                        + "]=" + getChildAddr(i) + ", but MUST be "
                        + NULL + " since the child is dirty");
                ok = false;
            }
        } else {
            /*
             * Clean child (ie, persistent). The parent of a clean child may
             * be either clean or dirty.
             */
            if (getChildAddr(i) == NULL) {
                out.println(indent(height) + " ERROR childKey[" + i
                        + "] is " + NULL + ", but child is not dirty");
                ok = false;
            }
        }
    }
    // @todo When (!ok && !debug), show the node structure with the errors
    // since we would not have seen it otherwise.
    if (recursive) {
        /*
         * Dump children using pre-order traversal.
         */
        for (int i = 0; i < slotsOnPage; i++) {
            if (childRefs[i] == null && !isReadOnly()
                    && ((MutableDirectoryPageData) data).childAddr[i] == 0) {
                /*
                 * This lets us dump a tree with some kinds of structural
                 * problems (missing child reference or key).
                 */
                out.println(indent(height + 1)
                        + "ERROR can not find child at index=" + i
                        + ", skipping this index.");
                ok = false;
                continue;
            }
            /*
             * Note: this works around the assert test for the index in
             * getChild(index) but is not able/willing to follow a childKey
             * to a child that is not memory resident.
             */
            final AbstractPage child = childRefs[i] == null ? null
                    : (AbstractPage) childRefs[i].get();
            if (child == null) {
                continue;
            }
            if (child.parent == null) {
                out.println(indent(height + 1)
                        + "ERROR child does not have parent reference at index="
                        + i);
                ok = false;
            } else if (child.parent.get() != this) {
                // Only dereference the parent when the reference exists.
                out.println(indent(height + 1)
                        + "ERROR child has incorrect parent reference at index="
                        + i);
                ok = false;
            }
            if (!child.dump(level, out, height + 1, true, materialize)) {
                ok = false;
            }
        }
    }
    return ok;
}
/**
 * {@inheritDoc}
 * <p>
 * This page is reported to <i>stats</i> first. When <i>recursive</i> is
 * <code>true</code> the children are then materialized through
 * {@link #childIterator()} and visited in turn.
 *
 * @see "pre-heat the journal on startup"
 */
@Override
public void dumpPages(final boolean recursive, final boolean visitLeaves,
        final HTreePageStats stats) {
    stats.visit(htree, this);
    if (recursive) {
        // materialize children.
        for (final Iterator itr = childIterator(); itr.hasNext();) {
            final AbstractPage child = (AbstractPage) itr.next();
            /*
             * Note: This always reads the child and then filters out
             * leaves.
             *
             * Note: It might not be possible to lift this constraint into
             * the childIterator() due to the HTree design (per Martyn's
             * recollection).
             */
            if (visitLeaves || !(child instanceof BucketPage)) {
                // recursion into children.
                child.dumpPages(recursive, visitLeaves, stats);
            }
        }
    }
}
/**
 * Utility method formats an {@link IDirectoryData} record: the #of
 * children, the child addresses and, when the record has version
 * timestamps, the minimum and maximum version timestamp.
 *
 * @param data
 *            A data record.
 * @param sb
 *            The representation will be written onto this object.
 *
 * @return The <i>sb</i> parameter.
 */
static public StringBuilder toString(final IDirectoryData data,
        final StringBuilder sb) {
    final int nchildren = data.getChildCount();
    sb.append(", nchildren=").append(nchildren);
    sb.append(",\nchildAddr=[");
    for (int i = 0; i < nchildren; i++) {
        final String separator = i > 0 ? ", " : "";
        sb.append(separator).append(data.getChildAddr(i));
    }
    sb.append("]");
    if (data.hasVersionTimestamps()) {
        // Read both values before writing anything onto the buffer.
        final String range = "{min=" + data.getMinimumVersionTimestamp()
                + ",max=" + data.getMaximumVersionTimestamp() + "}";
        sb.append(",\nversionTimestamps=").append(range);
    }
    return sb;
}
/**
 * Pretty-print this directory page and, recursively, its children onto
 * <i>sb</i>.
 * <p>
 * The page is written on one line as its id, its global depth, an
 * indentation for its level and an address map listing the id of the child
 * in each slot ("-" for an empty slot). Slots are separated by "," and
 * buddy hash tables by ";".
 *
 * @param sb
 *            The representation is appended to this object.
 * @param showBinary
 *            Passed through to the children.
 */
@Override
public void PP(final StringBuilder sb, final boolean showBinary) {
    sb.append(PPID() + " [" + globalDepth + "] " + indent(getLevel()));
    // #of buddy tables on a page.
    final int nbuddies = (1 << htree.addressBits) / (1 << globalDepth);
    // #of address slots in each buddy hash table.
    final int slotsPerBuddy = (1 << globalDepth);
    // #of slots covered by the buddy tables.
    final int nslots = nbuddies * slotsPerBuddy;
    sb.append("("); // start of address map.
    for (int slot = 0; slot < nslots; slot++) {
        if (slot > 0) {
            // ";" marks a buddy boundary, "," a slot boundary.
            sb.append(slot % slotsPerBuddy == 0 ? ";" : ",");
        }
        final AbstractPage occupant = getChildIfPresent(slot);
        sb.append(occupant == null ? "-" : occupant.PPID());
    }
    sb.append(")"); // end of address map.
    sb.append("\n");
    // Now each distinct child in turn.
    for (final Iterator itr = childIterator(); itr.hasNext();) {
        final AbstractPage child = (AbstractPage) itr.next();
        child.PP(sb, showBinary);
    }
}
/**
 * Return the child for a slot, tolerating an empty slot.
 *
 * @param slot
 *            The slot to check.
 *
 * @return <code>null</code> when the slot has neither a child reference
 *         nor a child address; otherwise the result of
 *         {@link #getChild(int)} for that slot.
 */
AbstractPage getChildIfPresent(final int slot) {
    final boolean emptySlot = childRefs[slot] == null
            && data.getChildAddr(slot) == IRawStore.NULL;
    return emptySlot ? null : getChild(slot);
}
/**
 * Invoked by {@link #copyOnWrite()} to clear the persistent address for a
 * child on a cloned parent and set the reference to the cloned child.
 * <p>
 * Every slot in the contiguous run of slots whose child address is
 * <i>oldChildAddr</i> is redirected to <i>newChild</i>. The old address is
 * handed to <code>htree.deleteNodeOrLeaf()</code> once, when the first such
 * slot is found.
 *
 * @param oldChildAddr
 *            The persistent address of the old child. The entries to be
 *            updated are located based on this argument.
 * @param newChild
 *            The reference to the new child.
 *
 * @throws IllegalArgumentException
 *             if <i>oldChildAddr</i> is not found among the child
 *             addresses of this page.
 */
// FIXME Reconcile two versions of replaceChildRef
// FIXME Reconcile pattern for deleting a persistent object (htree AND btree)
void replaceChildRef(final long oldChildAddr, final AbstractPage newChild) {
    assert oldChildAddr != NULL || htree.store == null;
    assert newChild != null;
    // This node MUST have been cloned as a pre-condition, so it can not
    // be persistent.
    assert !isPersistent();
    assert !isReadOnly();
    // The newChild MUST have been cloned and therefore MUST NOT be
    // persistent.
    assert !newChild.isPersistent();
    assert !isReadOnly();
    final MutableDirectoryPageData mutableData = (MutableDirectoryPageData) this.data;
    final int slotsOnPage = 1 << htree.addressBits;
    // #of slots redirected so far.
    int npointers = 0;
    for (int i = 0; i < slotsOnPage; i++) {
        if (mutableData.childAddr[i] != oldChildAddr) {
            if (npointers > 0) {
                // Past the run: no more pointers to that child.
                break;
            }
            continue;
        }
        // TODO If the store cache is added back in, remove oldChildAddr
        // from that cache here.
        if (npointers == 0) {
            // Free the oldChildAddr if the Strategy supports it - and only
            // if not already deleted!
            htree.deleteNodeOrLeaf(oldChildAddr);
        }
        // Clear the old key.
        mutableData.childAddr[i] = NULL;
        // Stash reference to the new child.
        childRefs[i] = (Reference) newChild.self;
        if (newChild.isPersistent()) {
            mutableData.childAddr[i] = newChild.getIdentity();
        }
        // Set the parent on the new child.
        newChild.parent = (Reference) this.self;
        npointers++;
    }
    if (npointers == 0) {
        throw new IllegalArgumentException("Not our child : oldChildAddr="
                + oldChildAddr);
    }
}
/**
 * Redirect every slot which holds the reference <i>oldRef</i> to
 * <i>newChild</i>, or clear those slots when <i>newChild</i> is
 * <code>null</code>. The child address of each such slot is reset to
 * <code>NULL</code>, then set to the identity of <i>newChild</i> if that
 * child is persistent.
 *
 * @param oldRef
 *            The reference to be replaced (compared by identity).
 * @param newChild
 *            The replacement child, or <code>null</code> to clear the
 *            slots.
 *
 * @return The index of the first slot which held <i>oldRef</i>, or
 *         <code>-1</code> if there was none.
 */
int replaceChildRef(final Reference oldRef, final AbstractPage newChild) {
    final int slotsOnPage = 1 << htree.addressBits;
    final MutableDirectoryPageData mutableData = (MutableDirectoryPageData) this.data;
    int firstSlot = -1;
    int npointers = 0;
    for (int i = 0; i < slotsOnPage; i++) {
        if (childRefs[i] != oldRef) {
            continue;
        }
        if (firstSlot < 0) {
            firstSlot = i;
        }
        // Clear the old key.
        mutableData.childAddr[i] = NULL;
        if (newChild == null) {
            childRefs[i] = null;
        } else {
            // Stash reference to the new child.
            childRefs[i] = (Reference) newChild.self;
            if (newChild.isPersistent()) {
                mutableData.childAddr[i] = newChild.getIdentity();
            }
            newChild.parent = (Reference) this.self;
        }
        npointers++;
    }
    assert npointers > 0;
    return firstSlot;
}
/**
* Replace a bucket page of this directory by a new child directory page and
* re-insert the tuples of the old bucket page through the {@link HTree}.
* <p>
* A new {@link DirectoryPage} is created with a global depth of
* <code>htree.addressBits</code> together with one new bucket page. That
* bucket page is linked into either the lower or the upper half of the new
* directory's slots, depending on where the first key of the old bucket
* page hashes; the other half is left <code>null</code>. The new directory
* then replaces the old bucket page in this directory, the old bucket page
* is deleted, its slots are re-inserted via
* <code>insertRawTuple()</code>, and its persistent record (if any) is
* released.
*
* @param bucketPage
*            The bucket page to be replaced.
*/
void _addLevel(final BucketPage bucketPage) {
assert !isReadOnly();
assert !isOverflowDirectory();
/**
* TBD: Since for _addLevel to be called there should only be a single reference to
* bucketPage, this directory MUST be at global depth. BUT, rather than
* replacing only the old bucket page with the reference to the new
* directory, we should/could create a directory of half this directory's depth
* and insert the requisite references
*/
// Create new directory to insert
final DirectoryPage ndir = new DirectoryPage((HTree) htree,
null, // overflowKey
htree.addressBits/* globalDepth */);
((HTree) htree).nnodes++;
// And new bucket page for the new directory
// only add one initially - to cover the first key
final BucketPage newPage = createPage(1);
newPage.parent = (Reference) ndir.self;
// Decide which half of the new directory receives the new bucket page
// from the hash of the old page's first key.
final byte[] tstkey = bucketPage.getFirstKey();
final int bucketSlot = getLocalHashCode(tstkey, getPrefixLength() + globalDepth);
final int bucketRefs = (1 << htree.addressBits) >> 1; // half total number of directory slots
final boolean fillLowerSlots = bucketSlot < bucketRefs;
// a is stored in the lower half of the slots, b in the upper half;
// exactly one of them is the new bucket page, the other is null.
final Reference a = (Reference) (fillLowerSlots ? newPage.self : null);
final Reference b = (Reference) (fillLowerSlots ? null : newPage.self);
// nleaves is unchanged since we will create one new page and delete the old
// ((HTree) htree).nleaves++; // Note: only +1 since we will delete the oldPage.
// Link the new bucket pages into the new parent directory page.
for (int i = 0; i < bucketRefs; i++) {
ndir.childRefs[i] = a;
ndir.childRefs[i+bucketRefs] = b;
}
// now replace the reference to the old bucket page with the new directory
replaceChildRef(bucketPage.self, ndir);
// ensure that the bucket page doesn't evict
bucketPage.delete();
// insert old tuples (one call per slot of the old bucket page)
final int bucketSlotsPerPage = bucketPage.slotsOnPage();
for (int i = 0; i < bucketSlotsPerPage; i++) {
((HTree) htree).insertRawTuple(bucketPage, i);
}
// ...and finally delete old data
if (bucketPage.isPersistent()) {
htree.deleteNodeOrLeaf(bucketPage.getIdentity());
}
}
/**
 * Add a child to this overflow directory. This method should only be
 * called when using a DirectoryPage as a BucketPage blob. The final child
 * in the blob is used for insert by default.
 * <p>
 * The child is placed in the first slot which has neither a resident child
 * nor a child address. When no such slot exists, a new overflow directory
 * is created holding this page and the child, and that new directory
 * replaces this page in the parent directory.
 *
 * @param child
 *            The child to be added.
 */
void _addChild(final AbstractPage child) {
    assert isOverflowDirectory();
    assert !isReadOnly();
    final MutableDirectoryPageData pdata = (MutableDirectoryPageData) data;
    // Find the first available slot (if any).
    int freeSlot = -1;
    for (int i = 0; i < pdata.childAddr.length && freeSlot < 0; i++) {
        final boolean resident = childRefs[i] != null
                && childRefs[i].get() != null;
        if (!resident && pdata.childAddr[i] == NULL) {
            freeSlot = i;
        }
    }
    if (freeSlot >= 0) {
        childRefs[freeSlot] = (Reference) child.self;
        assert !child.isPersistent();
        child.parent = (Reference) self;
        return;
    }
    // This page is full: insert a new level above this one and add the
    // child to that.
    final DirectoryPage pd = getParentDirectory();
    if (pd.isOverflowDirectory()) { // already handles blobs
        assert false; // unsure for now
        return;
    }
    final DirectoryPage blob = new DirectoryPage((HTree) htree,
            getOverflowKey(),
            getOverflowPageDepth());
    blob._addChild(this);
    blob._addChild(child);
    pd.replaceChildRef(this.self, blob);
    if (INFO) {
        log.info("New Overflow Level: " + getLevel());
    }
}
/**
 * Return the last non-<code>null</code> child of an overflow directory
 * page. The slots are scanned from the highest child index downwards; a
 * child which is not resident is materialized from its address.
 *
 * @return The last child.
 *
 * @throws AssertionError
 *             if no slot has either a resident child or a child address.
 */
BucketPage lastChild() {
    assert isOverflowDirectory();
    int i = data.getChildCount();
    while (--i >= 0) {
        final BucketPage resident = (BucketPage) deref(i);
        if (resident != null) {
            return resident;
        }
        if (data.getChildAddr(i) != NULL) {
            // Not resident, but persistent: load it.
            return (BucketPage) getChild(i);
        }
    }
    throw new AssertionError();
}
/**
 * Double indirection dereference for the specified index. If the
 * {@link Reference} is <code>null</code>, returns <code>null</code>.
 * Otherwise returns {@link Reference#get()}.
 *
 * @param index
 *            The index of the slot.
 *
 * @return The child reachable through that slot's reference, or
 *         <code>null</code>.
 */
protected AbstractPage deref(final int index) {
    if (childRefs[index] == null) {
        return null;
    }
    return (AbstractPage) childRefs[index].get();
}
/**
* Report whether this page is an overflow directory, as answered by the
* data record of this page.
*/
@Override
public boolean isOverflowDirectory() {
return data.isOverflowDirectory();
}
/**
 * {@inheritDoc}
 * <p>
 * If this is an overflow directory then the depth-based hashCode is
 * irrelevant since the page is used as a blob container for BucketPage
 * references; ZERO is returned in that case. The key does not need to be
 * checked here: that is handled when the BucketPage is checked for a
 * precise match.
 */
@Override
public int getLocalHashCode(final byte[] key, final int prefixLength) {
    if (!isOverflowDirectory()) {
        return super.getLocalHashCode(key, prefixLength);
    }
    return 0;
}
/**
* This method is never called at present since DirectoryPages are
* always created at maximum depth. Whether there is any advantage
* in supporting pages of lesser depths is yet to be determined.
*
* @param buddyOffset
* @param oldChild
*/
public void split(final int buddyOffset, final DirectoryPage oldChild) {
if (true) {
/*
* FIXME We need to update this code to handle the directory page
* not being at maximum depth to work without the concept of buddy
* buckets.
*/
throw new UnsupportedOperationException();
}
if (parent == null)
throw new IllegalArgumentException();
if (oldChild == null)
throw new IllegalArgumentException();
if (oldChild.globalDepth >= globalDepth) {
/*
* In this case we have to introduce a new directory level instead
* (increasing the height of the tree at that point).
*/
throw new IllegalStateException();
}
if (buddyOffset < 0)
throw new IllegalArgumentException();
if (buddyOffset >= (1 << htree.addressBits)) {
/*
* Note: This check is against the maximum possible slot index. The
* actual max buddyOffset depends on parent.globalBits also since
* (1<> 1); i < npointers; i++) {
if (childRefs[i] != oldChild.self)
throw new RuntimeException("Does not point to old child.");
// update the references to the new bucket.
childRefs[i] = (Reference) newChild.self;
assert !newChild.isPersistent();
}
} // updatePointersInParent
/**
* Redistribute the buddy hash tables in a {@link DirectoryPage}.
* <p>
* Note: We are not changing the #of hash tables, just their size and the
* page on which they are found. Any reference in a source buddy hash table
* will wind up in the "same" buddy hash table afterwards, but the page and
* offset on the page of the buddy hash table may have been changed and the
* size of the buddy hash table will have doubled.
* <p>
* When a {@link DirectoryPage} is split, the size of each buddy hash table
* is doubled. The additional slots in each buddy hash table are filled in
* by (a) spacing out the old slot entries in each buddy hash table; and (b)
* filling in the uncovered slot with a copy of the previous slot.
* <p>
* We proceed backwards, moving the upper half of the buddy hash tables to
* the new directory page first and then spreading out the lower half of the
* source page among the new buddy hash table boundaries on the source page.
* <p>
* Both the child addresses and the child references are copied; each source
* slot is written to two adjacent destination slots.
*
* @param oldDepth
*            The depth of the old {@link DirectoryPage} before the split.
* @param newDepth
*            The depth of the old and new {@link DirectoryPage} after the
*            split (this is just oldDepth+1).
* @param oldDir
*            The old {@link DirectoryPage}.
* @param newDir
*            The new {@link DirectoryPage}.
*
* @deprecated with
*             {@link #splitDirectoryPage(DirectoryPage, int, DirectoryPage)}
*/
private void redistributeBuddyTables(final int oldDepth,
final int newDepth, final DirectoryPage oldDir,
final DirectoryPage newDir) {
assert oldDepth + 1 == newDepth;
// #of slots on the directory page (invariant given addressBits).
final int slotsOnPage = (1 << htree.addressBits);
// #of address slots in each old buddy hash table.
final int slotsPerOldBuddy = (1 << oldDepth);
// #of address slots in each new buddy hash table.
final int slotsPerNewBuddy = (1 << newDepth);
// #of buddy tables on the old bucket directory.
final int oldBuddyCount = slotsOnPage / slotsPerOldBuddy;
// #of buddy tables on the directory page after the split.
final int newBuddyCount = slotsOnPage / slotsPerNewBuddy;
// The source of both copies is the old page.
final DirectoryPage srcPage = oldDir;
final long[] srcAddrs = ((MutableDirectoryPageData) oldDir.data).childAddr;
final Reference[] srcRefs = oldDir.childRefs;
/*
* Move top 1/2 of the buddy hash tables from the child to the new page.
*/
{
// target is the new page.
final DirectoryPage dstPage = newDir;
final long[] dstAddrs = ((MutableDirectoryPageData) dstPage.data).childAddr;
final Reference[] dstRefs = dstPage.childRefs;
// index (vs offset) of first buddy in upper half of src page.
final int firstSrcBuddyIndex = (oldBuddyCount >> 1);
// exclusive upper bound for index (vs offset) of last buddy in
// upper half of src page.
final int lastSrcBuddyIndex = oldBuddyCount;
// exclusive upper bound for index (vs offset) of last buddy in
// upper half of target page.
final int lastDstBuddyIndex = newBuddyCount;
// work backwards over buddies to avoid stomping data!
for (int srcBuddyIndex = lastSrcBuddyIndex - 1, dstBuddyIndex = lastDstBuddyIndex - 1; //
srcBuddyIndex >= firstSrcBuddyIndex; //
srcBuddyIndex--, dstBuddyIndex--//
) {
final int firstSrcSlot = srcBuddyIndex * slotsPerOldBuddy;
final int lastSrcSlot = (srcBuddyIndex + 1) * slotsPerOldBuddy;
final int firstDstSlot = dstBuddyIndex * slotsPerNewBuddy;
// Forward over the slots: each source slot feeds two destination
// slots, hence dstSlot advances by 2.
for (int srcSlot = firstSrcSlot, dstSlot = firstDstSlot; srcSlot < lastSrcSlot; srcSlot++, dstSlot += 2) {
if (TRACE)
log.trace("moving: page(" + srcPage.toShortString()
+ "=>" + dstPage.toShortString() + ")"
+ ", buddyIndex(" + srcBuddyIndex + "=>"
+ dstBuddyIndex + ")" + ", slot(" + srcSlot
+ "=>" + dstSlot + ")");
for (int i = 0; i < 2; i++) {
// Copy data to slot
dstAddrs[dstSlot + i] = srcAddrs[srcSlot];
dstRefs[dstSlot + i] = srcRefs[srcSlot];
}
}
}
}
/*
* Reposition the bottom 1/2 of the buddy buckets on the old page.
*
* Again, we have to move backwards through the buddy tables on the
* source page to avoid overwrites of data which has not yet been
* copied. Also, notice that the buddy table at index ZERO does not
* move - it is already in place even though its size has doubled.
*/
{
// target is the old page.
final DirectoryPage dstPage = oldDir;
final long[] dstAddrs = ((MutableDirectoryPageData) dstPage.data).childAddr;
final Reference[] dstRefs = dstPage.childRefs;
// index (vs offset) of first buddy in lower half of src page.
final int firstSrcBuddyIndex = 0;
// exclusive upper bound for index (vs offset) of last buddy in
// lower half of src page.
final int lastSrcBuddyIndex = (oldBuddyCount >> 1);
// exclusive upper bound for index (vs offset) of last buddy in
// upper half of target page (which is also the source page).
final int lastDstBuddyIndex = newBuddyCount;
/*
* Work backwards over buddy buckets to avoid stomping data!
*
* Note: Unlike with a BucketPage, we have to spread out the data in
* the slots of the first buddy hash table on the lower half of the
* page as well to fill in the uncovered slots. This means that we
* have to work backwards over the slots in each source buddy table
* to avoid stomping our data.
*/
for (int srcBuddyIndex = lastSrcBuddyIndex - 1, dstBuddyIndex = lastDstBuddyIndex - 1; //
srcBuddyIndex >= firstSrcBuddyIndex; //
srcBuddyIndex--, dstBuddyIndex--//
) {
final int firstSrcSlot = srcBuddyIndex * slotsPerOldBuddy;
final int lastSrcSlot = (srcBuddyIndex + 1) * slotsPerOldBuddy;
// final int firstDstSlot = dstBuddyIndex * slotsPerNewBuddy;
final int lastDstSlot = (dstBuddyIndex + 1) * slotsPerNewBuddy;
// Backward over the slots: the destination pair ends at dstSlot,
// hence the (dstSlot - i) indexing below.
for (int srcSlot = lastSrcSlot-1, dstSlot = lastDstSlot-1; srcSlot >= firstSrcSlot; srcSlot--, dstSlot -= 2) {
if (TRACE)
log.trace("moving: page(" + srcPage.toShortString()
+ "=>" + dstPage.toShortString() + ")"
+ ", buddyIndex(" + srcBuddyIndex + "=>"
+ dstBuddyIndex + ")" + ", slot(" + srcSlot
+ "=>" + dstSlot + ")");
for (int i = 0; i < 2; i++) {
// Copy data to slot.
dstAddrs[dstSlot - i] = srcAddrs[srcSlot];
dstRefs[dstSlot - i] = srcRefs[srcSlot];
}
}
}
}
}
/**
 * Return an iterator over the tuples beneath this directory page.
 * <p>
 * A {@link Striterator} over the distinct children is expanded by an
 * {@link Expander}: a {@link BucketPage} contributes its own tuples, any
 * other child is expanded recursively through its own
 * {@link #getTuples()}.
 *
 * @return The iterator. It does not support removal.
 */
public ITupleIterator getTuples() {
    // expands a child into the tuples found on or beneath it.
    final Expander toTuples = new Expander() {
        protected Iterator expand(Object obj) {
            if (!(obj instanceof BucketPage)) {
                return ((DirectoryPage) obj).getTuples();
            }
            return ((BucketPage) obj).tuples();
        }
    };
    // start with child nodes
    final Striterator tups = new Striterator(childIterator());
    // expand child contents
    tups.addFilter(toTuples);
    // wrap striterator for return type
    final ITupleIterator wrapper = new ITupleIterator() {
        @Override
        public boolean hasNext() {
            return tups.hasNext();
        }

        @Override
        public ITuple next() {
            return (ITuple) tups.next();
        }

        @Override
        public void remove() {
            throw new UnsupportedOperationException();
        }
    };
    return wrapper;
}
/**
 * Return <code>true</code> iff this page is not dirty and every resident
 * child reports that it is clean. Children which are not resident are not
 * examined.
 */
@Override
boolean isClean() {
    for (int slot = 0; slot < childRefs.length; slot++) {
        final AbstractPage resident = deref(slot);
        if (resident == null) {
            continue;
        }
        if (!resident.isClean()) {
            return false;
        }
    }
    return !isDirty();
}
/**
* Return the overflow key as reported by the data record of this page.
*/
public byte[] getOverflowKey() {
return data.getOverflowKey();
}
/**
* Called on the condition that an attempt is made to insert a key/value
* into an overflow directory with a different key.
* <p>
* In this case, a new directory is created to extend the bit resolution
* so that the new key/value is resolved to a different slot and a new
* BucketPage. The new directory replaces <i>current</i> in this page,
* <i>current</i> becomes the child of the new directory at the slot selected
* by its overflow key, and the remaining slots of the new directory are
* populated by {@link #_fillChildSlots(int, int, int, int)}.
* <p>
* If this page is read-only, the request is re-issued against the page
* returned by <code>copyOnWrite(current.getIdentity())</code>.
*
* @param current
*            The overflow directory which is a child of this page and
*            which will be pushed down one level.
*
* @return The new directory page.
*/
DirectoryPage _addLevelForOverflow(final DirectoryPage current) {
if (isReadOnly()) {
DirectoryPage copy = (DirectoryPage) copyOnWrite(current.getIdentity());
return copy._addLevelForOverflow(current);
}
// EvictionProtection ep = new EvictionProtection(this);
// try {
final DirectoryPage newdir = new DirectoryPage((HTree) htree, null /*overflowKey*/, htree.addressBits);
if (isReadOnly()) // TBD: Remove debug point
assert !isReadOnly();
// The new directory takes the place of [current] in this page.
replaceChildRef(current.self, newdir);
// do not want or need to push all bucket values to redistribute, just want to
// set the directory to relevant key.
// Logically we want to split the bucket pages until there is only one at the key provided
final byte[] overflowKey = current.getOverflowKey();
final int slot = newdir.getLocalHashCode(overflowKey, newdir._getPrefixLength());
// Hang [current] beneath the new directory at that slot.
newdir.childRefs[slot] = (Reference) current.self;
((MutableDirectoryPageData) newdir.data).childAddr[slot] = current.identity;
current.parent = (Reference) newdir.self;
try {
// Raise the reference counts from [current] up to the root while the
// sibling slots are being populated.
current._protectFromEviction();
// now fill others with BucketPage references
// (the range length is taken from this page's childRefs array)
newdir._fillChildSlots(slot, 0, childRefs.length, 0);
} finally {
current._releaseProtection();
}
// System.out.println("_addLevelForOverflow, level: " + newdir.getLevel());
return newdir;
// } finally {
// ep.release();
// }
}
/**
 * Crawl up the hierarchy from this page to the root, invoking
 * <code>htree.touch()</code> on each directory page.
 */
private void _touchHierarchy() {
    for (DirectoryPage dp = this; dp != null; dp = dp.getParentDirectory()) {
        htree.touch(dp);
    }
}
/**
 * Crawl up the hierarchy from this page to the root, decrementing the
 * reference count of each directory page. Any page whose count reaches
 * zero is explicitly written via <code>htree.writeNodeRecursive()</code>.
 */
private void _releaseProtection() {
    for (DirectoryPage dp = this; dp != null; dp = dp.getParentDirectory()) {
        dp.referenceCount--;
        if (dp.referenceCount == 0) {
            htree.writeNodeRecursive(dp);
        }
    }
}
/**
 * Crawl up the hierarchy from this page to the root, incrementing the
 * reference count of each directory page.
 */
private void _protectFromEviction() {
    for (DirectoryPage dp = this; dp != null; dp = dp.getParentDirectory()) {
        dp.referenceCount++;
    }
}
/**
 * Distribute new BucketPages over the child slots around a specified slot.
 * <p>
 * The purpose is to create the minimum number of pages to enable the
 * specified slot to have unique contents. A range which contains the slot
 * is halved recursively; a range which does not contain it is covered by a
 * single new BucketPage. Examples where X indicates the specified slot and
 * the digits are the depth of the page covering each slot:
 *
 * <pre>
 * 2bits
 * X211
 * 112X
 *
 * 3bits
 * X3221111
 * 3X221111
 * 22X31111
 * 1111X322
 * </pre>
 *
 * @param slot
 *            The slot to NOT fill.
 * @param offset
 *            The first slot of the range to fill.
 * @param length
 *            The #of slots in the range.
 * @param depth
 *            The depth given to a page created for this range.
 */
private void _fillChildSlots(final int slot, final int offset, final int length, final int depth) {
    assert !isReadOnly();
    if (slot == offset && length == 1) {
        // The range is exactly the reserved slot, which must be populated.
        assert childRefs[offset] != null;
        return;
    }
    final int end = offset + length;
    final boolean rangeHoldsSlot = slot >= offset && slot < end;
    if (!rangeHoldsSlot) {
        // One new bucket page covers the whole range.
        final BucketPage bp = new BucketPage((HTree) htree, depth);
        bp.parent = (Reference) self;
        ((HTree) htree).nleaves++;
        for (int s = offset; s < end; s++) {
            if (isReadOnly())
                assert !isReadOnly();
            assert childRefs[s] == null;
            childRefs[s] = (Reference) bp.self;
        }
        return;
    }
    // Halve the range and recurse, one level deeper, into both halves.
    final int half = length / 2;
    _fillChildSlots(slot, offset, half, depth + 1);
    _fillChildSlots(slot, offset + half, half, depth + 1);
}
/**
 * Return the sum of the global depths of all ancestors of this page (the
 * global depth of this page itself is not included).
 */
int _getPrefixLength() {
    int prefixLength = 0;
    for (DirectoryPage p = getParentDirectory(); p != null; p = p.getParentDirectory()) {
        prefixLength += p.globalDepth;
    }
    return prefixLength;
}
/**
 * Ensure there is a single reference to the BucketPage. This assumption is
 * required in the case of overflow, since an overflow Bucket will hold
 * values for a single key.
 * <p>
 * Every slot holding <i>bp</i>, other than the slot to which <i>key</i>
 * hashes, has its reference cleared and its child address reset to
 * <code>NULL</code>. The cleared slots are NOT refilled.
 *
 * @param key
 *            The key whose slot keeps the reference.
 * @param bp
 *            The reference to the bucket page.
 */
void _ensureUniqueBucketPage(final byte[] key,
        Reference bp) {
    final int keepSlot = getLocalHashCode(key, _getPrefixLength());
    assert childRefs[keepSlot] == bp;
    // #of slots which referenced the bucket page.
    int refs = 0;
    for (int i = 0; i < childRefs.length; i++) {
        if (childRefs[i] != bp) {
            continue;
        }
        refs++;
        if (i == keepSlot) {
            continue;
        }
        childRefs[i] = null;
        ((MutableDirectoryPageData) data).childAddr[i] = IRawStore.NULL;
    }
    assert Integer.bitCount(refs) == 1; // MUST be power of 2
    // Don't fill slots!
}
/**
 * Remove all entries for a key from this overflow directory.
 * <p>
 * <code>removeAll(key)</code> is invoked on each child in slot order
 * (materializing a child which is not resident), stopping at the first
 * slot which has no child. Afterwards every non-<code>NULL</code> child
 * address on this page is passed to <code>htree.deleteNodeOrLeaf()</code>.
 * If this page is read-only the request is re-issued against its
 * copy-on-write clone.
 *
 * @param key
 *            The key.
 *
 * @return The sum of the values returned by the children.
 *
 * @throws UnsupportedOperationException
 *             if this page is not an overflow directory.
 */
final public int removeAll(final byte[] key) {
    if (!isOverflowDirectory()) {
        throw new UnsupportedOperationException("Only valid if page is an overflow directory");
    }
    if (isReadOnly()) {
        final DirectoryPage copy = (DirectoryPage) copyOnWrite(getIdentity());
        return copy.removeAll(key);
    }
    // call removeAll on each child, then htree.remove on existing child store
    int removed = 0;
    for (int i = 0; i < childRefs.length; i++) {
        AbstractPage page = deref(i);
        if (page == null) {
            if (data.getChildAddr(i) == IRawStore.NULL) {
                // Empty slot: there are no more children.
                break;
            }
            page = getChild(i);
            if (page == null) {
                break;
            }
        }
        removed += page.removeAll(key);
    }
    // Now remove all childAddrs IFF there is a persistent identity
    for (int i = 0; i < childRefs.length; i++) {
        final long addr = data.getChildAddr(i);
        if (addr == IRawStore.NULL) {
            continue;
        }
        htree.deleteNodeOrLeaf(addr);
    }
    return removed;
}
/**
 * Removes the first value for the given key from this overflow directory.
 *
 * Only a key equal to the overflow key is handled; for any other key
 * {@code null} is returned. The removal is delegated to the last child. If
 * that child is a leaf, it is replaced by its {@code copyOnWrite()} version
 * before the removal and, should it end up empty, its reference in its parent
 * directory is cleared and the page is deleted.
 *
 * This is further complicated if the previous sibling is a directory page:
 * when the cleared slot was slot 1 and the child at slot 0 is not a leaf,
 * that child (asserted to be an overflow directory) replaces the parent in
 * the grandparent and the parent is deleted.
 *
 * @param key
 *            the key to remove.
 * @return the value returned by the last child's {@code removeFirst(key)},
 *         or {@code null} when the key is not the overflow key.
 * @throws UnsupportedOperationException
 *             if this page is not an overflow directory.
 */
final public byte[] removeFirst(final byte[] key) {
    if (!isOverflowDirectory()) {
        throw new UnsupportedOperationException("Only valid if page is an overflow directory");
    }
    if (BytesUtil.compareBytes(key, getOverflowKey()) != 0) {
        // Not the key held by this overflow directory.
        return null;
    }
    final AbstractPage tail = lastChild();
    if (!tail.isLeaf()) {
        return tail.removeFirst(key);
    }
    final BucketPage bucket = (BucketPage) tail.copyOnWrite();
    final byte[] removed = bucket.removeFirst(key);
    if (!bucket.getValues().isEmpty()) {
        return removed;
    }
    // The last bucket page is now empty: unlink it and delete it.
    final DirectoryPage owner = bucket.getParentDirectory();
    final int slot = owner.replaceChildRef(bucket.self, null);
    bucket.delete();
    if (slot != 1) {
        return removed;
    }
    // check first child for OverflowDirectory
    final AbstractPage firstChild = owner.getChild(0);
    if (firstChild.isLeaf()) {
        return removed;
    }
    final DirectoryPage nested = (DirectoryPage) firstChild;
    assert nested.isOverflowDirectory();
    // The owner is left with a single child which is already an overflow
    // directory, so the grandparent takes that child in place of the owner.
    final DirectoryPage grandparent = owner.getParentDirectory();
    assert grandparent != null;
    grandparent.replaceChildRef(owner.self, nested);
    owner.delete();
    if (DEBUG) {
        log.debug("Promoted child overflowDirectory after remove from last BucketPage");
    }
    return removed;
}
/**
 * Deletes the pages below this directory from the backing store and then
 * this directory page itself.
 *
 * Each child produced by {@code childIterator(false)} is handled according to
 * its type: a {@link BucketPage} is deleted from the store if it is
 * persistent, while any other child is treated as a {@link DirectoryPage}
 * and processed recursively. Finally this page is deleted from the store if
 * it has a non-NULL identity.
 *
 * NOTE(review): the meaning of the {@code false} argument to
 * {@code childIterator} is not visible here -- presumably it requests all
 * children rather than a subset; confirm against its declaration.
 */
public void removeAll() {
    // The iterator is parameterized so that next() yields an AbstractPage;
    // with a raw Iterator the assignment below does not compile.
    final Iterator<AbstractPage> chs = childIterator(false);
    while (chs.hasNext()) {
        final AbstractPage ch = chs.next();
        if (ch instanceof BucketPage) {
            if (TRACE)
                log.trace("Removing bucket page: " + ch.PPID());
            // Only pages that were written have storage to release.
            if (ch.isPersistent())
                htree.store.delete(ch.getIdentity());
        } else {
            if (TRACE)
                log.trace("Removing child directory page: " + ch.PPID());
            ((DirectoryPage) ch).removeAll();
        }
    }
    if (identity != IRawStore.NULL)
        htree.store.delete(identity);
}
}