com.bigdata.htree.DirectoryPage Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of bigdata-core Show documentation
Blazegraph(TM) DB Core Platform. It contains all Blazegraph DB dependencies other than Blueprints.
There is a newer version: 2.1.4
/**

Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016.  All rights reserved.

Contact:
     SYSTAP, LLC DBA Blazegraph
     2501 Calvert ST NW #106
     Washington, DC 20008
     licenses@blazegraph.com

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
package com.bigdata.htree;

import java.io.PrintStream;
import java.lang.ref.Reference;
import java.util.HashSet;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.Set;
import java.util.concurrent.FutureTask;

import org.apache.log4j.Level;
import org.apache.log4j.Logger;

import com.bigdata.btree.ITuple;
import com.bigdata.btree.ITupleIterator;
import com.bigdata.btree.Node;
import com.bigdata.htree.AbstractHTree.ChildMemoizer;
import com.bigdata.htree.AbstractHTree.LoadChildRequest;
import com.bigdata.htree.data.IDirectoryData;
import com.bigdata.io.AbstractFixedByteArrayBuffer;
import com.bigdata.rawstore.IRawStore;
import com.bigdata.util.BytesUtil;
import com.bigdata.util.concurrent.Memoizer;

import cutthecrap.utils.striterators.EmptyIterator;
import cutthecrap.utils.striterators.Expander;
import cutthecrap.utils.striterators.SingleValueIterator;
import cutthecrap.utils.striterators.Striterator;

/**
 * An {@link HTree} directory page (node). Each directory page will hold one or
 * more "buddy" hash tables. The #of buddy hash tables on the page depends on
 * the globalDepth of the page and the addressBits as computed by
 * {@link DirectoryPage#getNumBuddies()}.
 */
class DirectoryPage extends AbstractPage implements IDirectoryData {
    
    /**
     * The depth of a bucket page which overflows is always addressBits.
     */
    final int getOverflowPageDepth() {
     
        return htree.addressBits;
        
    }
	
	/**
	 * Transient references to the children.
	 */
	// Note: cleared by copyOnWrite when we steal the array.
	/*final*/ Reference[] childRefs;

	/**
	 * Persistent data.
	 */
	IDirectoryData data;

	/**
	 * Get the child indexed by the key.
	 * 
	 * Note: The recursive descent pattern requires the caller to separately
	 * compute the buddy index before each descent into a child.
	 * 
	 * @param hashBits
	 *            The relevant bits from the key.
	 * @param buddyOffset
	 *            The offset into the child of the first slot for the buddy hash
	 *            table or buddy hash bucket.
	 * 
	 * @return The child indexed by the key.
	 * 
	 * @see HTreeUtil#getBuddyOffset(int, int, int)
	 */
	protected AbstractPage getChild(final int hashBits, final int buddyOffset) {

		// width of a buddy hash table in pointer slots.
		final int tableWidth = (1 << globalDepth);

		// index position of the start of the buddy hash table in the page.
		final int tableOffset = (tableWidth * buddyOffset);

		// index of the slot in the buddy hash table for the given hash
		// bits.
		final int index = tableOffset + hashBits;

		return getChild(index);

	}

	/**
	 * Split the child {@link BucketPage}.
	 * 

	 * Rather than immediately creating 2 buckets, we want first to just
	 * fill half the original slots (by testing the first key value) and
	 * leaving the other half as null.
	 * 

	 * This will result in a lazy creation of the second BucketPage
	 * if required.  Such that we could end up introducing several
	 * new layers without the needless creation of empty BucketPages.
	 */
	void _splitBucketPage(final BucketPage bucketPage) {

        // Note: this.getChildCount() is less direct.
		final byte[] tstkey = bucketPage.getFirstKey();
		final int bucketSlot = getLocalHashCode(tstkey, getPrefixLength());
				
		final int slotsOnPage = 1 << htree.addressBits;
		int start = 0;
		for (int s = 0; s < slotsOnPage; s++) {
			if (bucketPage.self == childRefs[s]) {
				start = s;
				break;
			}
		}
		int last = start;
		for (int s = start + 1; s < slotsOnPage; s++) {
			if (bucketPage.self == childRefs[s]) {
				last++;
			} else {
				break;
			}
		}
		final int npointers = last - start + 1;

		assert npointers > 1;
		assert npointers % 2 == 0;
		assert npointers == 1 << (htree.addressBits - bucketPage.globalDepth);

		final int crefs = npointers >> 1; // half references for each new child
		final int newDepth = bucketPage.globalDepth + 1;
		final boolean fillLowerSlots = bucketSlot < start + crefs;
		assert bucketSlot >= start && bucketSlot <= last;
		
		final BucketPage newPage = createPage(newDepth);
		
		final Reference a = (Reference) (fillLowerSlots ? newPage.self : null);
		for (int s = start; s < start + crefs; s++) {
			childRefs[s] = (Reference) a;
		}
		
		final Reference b = (Reference) (!fillLowerSlots ? newPage.self : null);
		for (int s = start + crefs; s <= last; s++) {
			childRefs[s] = (Reference) b;
		}
		
		assert !newPage.isPersistent();
		((HTree) htree).nleaves++; // only definitely add one


        // remove original page and correct leaf count
        bucketPage.delete();
		((HTree) htree).nleaves--;

		// insert raw values from original page
		final int bucketSlotsPerPage = bucketPage.slotsOnPage();
        for (int i = 0; i < bucketSlotsPerPage; i++) {
            ((HTree) htree).insertRawTuple(bucketPage, i);
        }
        
        assert ((BucketPage) childRefs[bucketSlot].get()).data.getKeyCount() > 0;
        
        // ...and finally delete old page data
        if (bucketPage.isPersistent()) {
            htree.deleteNodeOrLeaf(bucketPage.getIdentity());
        }
        
	}

	private BucketPage createPage(final int depth) {
		final BucketPage ret =  new BucketPage((HTree) htree, depth);
		ret.parent = (Reference) self;
		
		return ret;
	}
	
	/**
	 * This is a lazy creation of a BucketPage to fill the relevant number of slots
	 * based on the number of empty slots surrounding it.
	 * 
	 * If all slots are empty, then they will all be filled with references to the
	 * new page.
	 *
	 * The regions tested follow the pattern below:
	 * [A A A A A A A A]
	 * [A A A A B B B B]
	 * [A A B B C C D D]
	 * [A B C D E F G H]
	 *
	 * Examples with 8 slots, test slot 'T'
	 * [- - - - T - - -] // all empty
	 * [S S S S S S S S] // set ALL 8
	 * 
	 * [F F - - T - - -] // other half not empty
	 * [F F - - S S S S] // set 4 slots
	 * 
	 * [F F - T - - - -] // same half not empty
	 * [F F S S - - - -] // set 2 slots
	 * 
	 * @param slot - the target slot to be filled
	 * @param bucketPage - the page to replicate
	 */
	private void fillEmptySlots(final int slot, final BucketPage bucketPage) {
		// create with initial max depth
		int slots = fillEmptySlots(slot, bucketPage, 0, 1 << htree.addressBits); 
		
		assert bucketPage.globalDepth == htree.addressBits;
		
		bucketPage.globalDepth = htree.addressBits;
		while (slots > 1) {
			slots >>= 1;
			bucketPage.globalDepth--;
		}
		
		// assert (1 << (htree.addressBits - bucketPage.globalDepth)) == countChildRefs(bucketPage);
	}
	
	int countChildRefs(AbstractPage pge) {
		int count = 0;
		for (int i = 0; i < childRefs.length; i++) {
			if (childRefs[i] == pge.self) {
				count++;
			}
		}
		return count;
	}

	private int fillEmptySlots(final int slot, final BucketPage bucketPage, final int from, final int to) {
		if (from > slot | to < slot) {
			throw new IllegalArgumentException();
		}
		
		assert !bucketPage.isPersistent();
		
		if (from == slot && to == slot+1) {
			childRefs[slot] = (Reference) bucketPage.self;
			
			return 1;
		}
		
		// test for full range null
		for (int i = from; i < to; i++) {
			if (childRefs[i] != null || getChildAddr(i) != IRawStore.NULL) {
				// recurse and return
				final int childlen = (to-from) >> 1; // half
				if (slot < (from + childlen)) {
					return fillEmptySlots(slot, bucketPage, from, from+childlen);
				} else {
					return fillEmptySlots(slot, bucketPage, to-childlen, to);
				}
			}
		}
		
		// assign to validated null range
		for (int i = from; i < to; i++) {
			assert childRefs[i] == null;
			
			childRefs[i] = (Reference) bucketPage.self;
		}
		
		return to - from;
	}
	
	/**
	 * Return the {@link Reference} for the child at that index.
	 * 
	 * @param index
	 *            The index
	 * 
	 * @return The {@link Reference}.
	 */
	Reference getChildRef(final int index) {

		return childRefs[index];

	}

	/**
	 * This method must be invoked on a parent to notify the parent that the
	 * child has become persistent. The method scans the weak references for the
	 * children, finds the index for the specified child, and then sets the
	 * corresponding index in the array of child addresses.
	 * 
	 * @param child
	 *            The child.
	 * 
	 * @exception IllegalStateException
	 *                if the child is not persistent.
	 * @exception IllegalArgumentException
	 *                if the child is not a child of this node.
	 */
	void setChildAddr(final AbstractPage child) {

		assert !isReadOnly();

		if (!child.isPersistent()) {

			// The child does not have persistent identity.
			throw new IllegalStateException();

		}

		final int slotsPerPage = 1 << htree.addressBits;
		
		if (childRefs == null)
			throw new IllegalStateException("childRefs must not be NULL");

		boolean found = false;
		for (int i = 0; i < slotsPerPage; i++) {

			if (childRefs[i] == child.self) {

				((MutableDirectoryPageData) data).childAddr[i] = child
						.getIdentity();

                found = true;

            } else if (found) {

                // no more pointers to that child.
                break;
                
            }

		}

	}

	/**
	 * Return the child at the specified index in the {@link DirectoryPage}.
	 * 
	 * @param index
	 *            The index of the slot in the {@link DirectoryPage}. If the
	 *            child must be materialized, the buddyOffset will be computed
	 *            based on the globalDepth and the #of pointers to that child
	 *            will be computed so its depth may be set.
	 * 
	 * @return The child at that index.
	 */
	AbstractPage getChild(final int index) {
		
        // See BLZG-1657 (Add BTreeCounters for cache hit and cache miss)
        htree.getBtreeCounters().cacheTests.increment();
        
		/**
		 * FIXME: Need to check whether we are using unnecessary synchronization
		 */
		AbstractPage ret = checkLazyChild(index);
		
		if (ret != null) {
			return ret;
		}

        if (htree.memo == null) {

            /*
             * Optimization for the mutable B+Tree.
             * 
             * Note: This optimization depends on the assumption that concurrent
             * operations are never submitted to the mutable B+Tree. In fact,
             * the UnisolatedReadWriteIndex *DOES* allow concurrent readers (it
             * uses a ReentrantReadWriteLock). Therefore this code path is now
             * expressed conditionally on whether or not the Memoizer object is
             * initialized by AbstractBTree.
             * 
             * Note: Since the caller is single-threaded for the mutable B+Tree
             * we do not need to use the Memoizer, which just delegates to
             * _getChild(index). This saves us some object creation and overhead
             * for this case.
             */

            // See BLZG-1657 (Add BTreeCounters for cache hit and cache miss)
            htree.getBtreeCounters().cacheMisses.increment();
            
            return _getChild(index, null/* req */);
            
        }

        /*
         * If we can resolve a hard reference to the child then we do not need
         * to look any further.
         */
//        synchronized (childRefs) 
        {

            /*
             * Note: we need to synchronize on here to ensure visibility for
             * childRefs[index] (in case it was updated in another thread). This
             * is true even for the mutable B+Tree since the caller could use
             * different threads for different operations. However, this
             * synchronization will never be contended for the mutable B+Tree.
             */
      	
            final Reference childRef = childRefs[index];

            final AbstractPage child = childRef == null ? null : childRef.get();

            if (child != null) {

                // Already materialized.
        		htree.touch(child);
        		
                return child;

            }

        }

        /*
         * Otherwise we need to go through the Memoizer pattern to achieve
         * non-blocking access. It will wind up delegating to _getChild(int),
         * which is immediately below. However, it will ensure that one and only
         * one thread executes _getChild(int) for a given parent and child
         * index. That thread will update childRefs[index]. Any concurrent
         * requests for the same child will wait for the FutureTask inside of
         * the Memoizer and then return the new value of childRefs[index].
         */

        /*
         * See BLZG-1657 (Add BTreeCounters for cache hit and cache miss)
         * 
         * Note: This is done in the caller rather than _getChild() since the
         * latter may be called from the memoizer, in which case only one thread
         * will actually invoke _getChild() while the others will just obtain
         * the child through the memoized Future.
         */
        htree.getBtreeCounters().cacheMisses.increment();
        
        return htree.loadChild(this, index);

    }

    private AbstractPage checkLazyChild(final int index) {

      final AbstractPage child;
      synchronized (childRefs) {

         /*
          * Note: we need to synchronize on here to ensure visibility for
          * childRefs[index] (in case it was updated in another thread).
          */
         final Reference childRef = childRefs[index];

         child = childRef == null ? null : childRef.get();

         if (child != null) {

            // Already materialized.
            htree.touch(child);

            return child;

         }

      }

      // Check for lazy BucketPage creation
      if (getChildAddr(index) == IRawStore.NULL) {

         final DirectoryPage copy = (DirectoryPage) copyOnWrite(IRawStore.NULL);

         return copy.setLazyChild(index);
         
      }

      return null;

    }
    
    private BucketPage setLazyChild(final int index) {

       final DirectoryPage copy = (DirectoryPage) copyOnWrite(IRawStore.NULL);
    	
       assert copy == this;

     	assert childRefs[index] == null;
    	
    	final BucketPage lazyChild = new BucketPage((HTree) htree, htree.addressBits);
    	lazyChild.parent = (Reference) self;
    	((HTree) htree).nleaves++;
    	
    	fillEmptySlots(index, lazyChild);
    	
    	htree.touch(lazyChild);
    	
    	return lazyChild;
    	
    }

	/**
     * Method conditionally reads the child at the specified index from the
     * backing store and sets its reference on the appropriate element of
     * {@link #childRefs}. This method assumes that external mechanisms
     * guarantee that no other thread is requesting the same child via this
     * method at the same time. For the mutable B+Tree, that guarantee is
     * trivially given by its single-threaded constraint. For the read-only
     * B+Tree, {@link AbstractHTree#loadChild(DirectoryPage, int)} provides this
     * guarantee using a {@link Memoizer} pattern. This method explicitly
     * handshakes with the {@link ChildMemoizer} to clear the {@link FutureTask}
     * from the memoizer's internal cache as soon as the reference to the child
     * has been set on the appropriate element of {@link #childRefs}.
     * 
     * @param index
     *            The index of the child.
     * @param req
     *            The key we need to remove the request from the
     *            {@link ChildMemoizer} cache (and null if this
     *            method is not invoked by the memoizer pattern).
     * 
     * @return The child and never null.
     */
    AbstractPage _getChild(final int index, final LoadChildRequest req) {

		/*
		 * Make sure that the child is not reachable. It could have been
		 * concurrently set even if the caller had tested this and we do not
		 * want to read through to the backing store unless we need to.
		 * 
		 * Note: synchronizing on childRefs[] should not be necessary. For a
		 * read-only B+Tree, the synchronization is provided by the Memoizer
		 * pattern. For a mutable B+Tree, the synchronization is provided by the
		 * single-threaded contract for mutation and by the requirement to use a
		 * construct, such as a Queue or the UnisolatedReadWriteIndex, which
		 * imposes a memory barrier when passing a B+Tree instance between
		 * threads.
		 * 
		 * See http://www.cs.umd.edu/~pugh/java/memoryModel/archive/1096.html
		 */
        AbstractPage child;
        synchronized (childRefs) {

            /*
             * Note: we need to synchronize on here to ensure visibility for
             * childRefs[index] (in case it was updated in another thread).
             */
            final Reference childRef = childRefs[index];

            child = childRef == null ? null : childRef.get();

            if (child != null) {

                // Already materialized.
        		htree.touch(child);
        		
                return child;

            }

        }

        /*
         * The child needs to be read from the backing store.
         */

		// width of a buddy hash table (#of pointer slots).
		final int tableWidth = 1 << globalDepth;

		// offset in [0:nbuddies-1] to the start of the buddy spanning that
		// index.
		final int tableOffset = index / tableWidth;

		/*
		 * We need to get the address of the child, figure out the local depth
		 * of the child (by counting the #of points in the buddy bucket to that
		 * child), and then materialize the child from its address.
		 */
		final long addr = data.getChildAddr(index);

        if (addr == IRawStore.NULL) {

            // dump(Level.DEBUG, System.err);
            /*
             * Note: It appears that this can be triggered by a full disk, but I
             * am not quite certain how a full disk leads to this condition.
             * Presumably the full disk would cause a write of the child to
             * fail. In turn, that should cause the thread writing on the B+Tree
             * to fail. If group commit is being used, the B+Tree should then be
             * discarded and reloaded from its last commit point.
             */
            throw new AssertionError(
                    "Child does not have persistent identity: this=" + this
                            + ", index=" + index);

        }

		/*
		 * Scan to count pointers to child within the buddy hash table.
		 */
		final int npointers;
		{

			int n = 0; // npointers

			final int lastIndex = (tableOffset + tableWidth);

			for (int i = tableOffset; i < lastIndex; i++) {

				if (data.getChildAddr(i) == addr)
					n++;

			}

			assert n > 0;

			npointers = n;

		}

        /*
         * Find the local depth of the child within this node. this becomes the
         * global depth of the child.
         * 
         * Note: This winds up being invoked to compute the depth of an overflow
         * directory page when we first descend from a normal directory page.
         */
        final int localDepth = HTreeUtil.getLocalDepth(htree.addressBits,
                globalDepth, npointers);
        assert npointers == 1 << (htree.addressBits - localDepth);
        /*
         * Read the child from the backing store (potentially reads through to
         * the disk).
         * 
         * Note: This is guaranteed to not do duplicate reads. There are two
         * cases. (A) The mutable B+Tree. Since the mutable B+Tree is single
         * threaded, this case is trivial. (B) The read-only B+Tree. Here our
         * guarantee is that the caller is in ft.run() inside of the Memoizer,
         * and that ensures that only one thread is executing for a given
         * LoadChildRequest object (the input to the Computable). Note that
         * LoadChildRequest MUST meet the criteria for a hash map for this
         * guarantee to obtain.
         */

		child = htree.readNodeOrLeaf(addr);
		
        // set the depth on the child.
        if (!child.isLeaf() && ((DirectoryPage) child).isOverflowDirectory()) {
            /*
             * Note: The global depth of an overflow page is always set to this
             * constant. It should be ignored for code paths which maintain the
             * overflow directory and overflow bucket pages.
             */
            child.globalDepth = getOverflowPageDepth();
        } else {
            child.globalDepth = localDepth;
        }

		/*
		 * Set the reference for each slot in the buddy bucket which pointed at
		 * that child. There will be [npointers] such slots.
         * 
         * Note: This code block is synchronized in order to facilitate the safe
         * publication of the change in childRefs[index] to other threads.
         */
        synchronized (childRefs) {

			int n = 0;

			final int lastIndex = (tableOffset + tableWidth);

			for (int i = tableOffset; i < lastIndex; i++) {

				if (data.getChildAddr(i) == addr) {

		            /*
		             * Since the childRefs[index] element has not been updated we do so
		             * now while we are synchronized.
		             * 
		             * Note: This paranoia test could be tripped if the caller allowed
		             * concurrent requests to enter this method for the same child. In
		             * that case childRefs[index] could have an uncleared reference to
		             * the child. This would indicate a breakdown in the guarantee we
		             * require of the caller.
		             */
		            assert childRefs[i] == null || childRefs[i].get() == null : "Child is already set: this="
		                    + this + ", index=" + i;

		            childRefs[i] = (Reference) child.self;

					n++;

				}

			}

			// patch parent reference since loaded from store.
			child.parent = (Reference) this.self;

			assert n == npointers;

		}

        /*
         * Clear the future task from the memoizer cache.
         * 
         * Note: This is necessary in order to prevent the cache from retaining
         * a hard reference to each child materialized for the B+Tree.
         * 
         * Note: This does not depend on any additional synchronization. The
         * Memoizer pattern guarantees that only one thread actually call
         * ft.run() and hence runs this code.
         */
        if (req != null) {

            htree.memo.removeFromCache(req);

        }

		htree.touch(child);
		
        return child;

    }
    
//    private AbstractPage fixme(final int index) {
//    	
//		/*
//		 * Look at the entry in the buddy hash table. If there is a reference to
//		 * the child and that reference has not been cleared, then we are done
//		 * and we can return the child reference and the offset of the buddy
//		 * table or bucket within the child.
//		 */
//		final Reference ref = childRefs[index];
//
//		AbstractPage child = ref == null ? null : ref.get();
//
//		if (child != null) {
//
//			return child;
//
//		}
//
//		// width of a buddy hash table (#of pointer slots).
//		final int tableWidth = 1 << globalDepth;
//
//		// offset in [0:nbuddies-1] to the start of the buddy spanning that
//		// index.
//		final int tableOffset = index / tableWidth;
//
//		/*
//		 * We need to get the address of the child, figure out the local depth
//		 * of the child (by counting the #of points in the buddy bucket to that
//		 * child), and then materialize the child from its address.
//		 * 
//		 * fixme MEMORIZER : This all needs to go through a memoizer pattern.
//		 * The hooks for that should be on AbstractHTree(.memo), but I have not
//		 * yet ported that code.
//		 */
//		final long addr = data.getChildAddr(index);
//
//		/*
//		 * Scan to count pointers to child within the buddy hash table.
//		 */
//		final int npointers;
//		{
//
//			int n = 0; // npointers
//
//			final int lastIndex = (tableOffset + tableWidth);
//
//			for (int i = tableOffset; i < lastIndex; i++) {
//
//				if (data.getChildAddr(i) == addr)
//					n++;
//
//			}
//
//			assert n > 0;
//
//			npointers = n;
//
//		}
//
//		/*
//		 * Find the local depth of the child within this node. this becomes the
//		 * global depth of the child.
//		 */
//		final int localDepth = HTreeUtil.getLocalDepth(htree.addressBits,
//				globalDepth, npointers);
//
//		child = htree.readNodeOrLeaf(addr, localDepth/* globalDepthOfChild */);
//
//		/*
//		 * Set the reference for each slot in the buddy bucket which pointed at
//		 * that child. There will be [npointers] such slots.
//		 */
//		{
//
//			int n = 0;
//
//			final int lastIndex = (tableOffset + tableWidth);
//
//			for (int i = tableOffset; i < lastIndex; i++) {
//
//				if (data.getChildAddr(i) == addr) {
//
//					childRefs[i] = (Reference) child.self;
//
//					n++;
//
//				}
//
//			}
//
//			assert n == npointers;
//
//		}
//
//		return child;
//
//	}

	public AbstractFixedByteArrayBuffer data() {
		return data.data();
	}

	/**
	 * {@inheritDoc}
	 * 

	 * Note: This method can only be used once you have decoded the hash bits
	 * from the key and looked up the offset of the buddy hash table on the page
	 * and the offset of the slot within the buddy hash table for the desired
	 * key. You must also know the localDepth of the child when it is
	 * materialized so that information can be set on the child, where is
	 * becomes the globalDepth of the child.
	 * 
	 * @see #getChildAddrByHashCode(int, int)
	 */
	public long getChildAddr(int index) {
		return data.getChildAddr(index);
	}

	public int getChildCount() {
		return data.getChildCount();
	}

	public long getMaximumVersionTimestamp() {
		return data.getMaximumVersionTimestamp();
	}

	public long getMinimumVersionTimestamp() {
		return data.getMinimumVersionTimestamp();
	}

	public boolean hasVersionTimestamps() {
		return data.hasVersionTimestamps();
	}

	public boolean isCoded() {
		return data.isCoded();
	}

	public boolean isLeaf() {
		return data.isLeaf();
	}

    /**
     * The result depends on the backing {@link IDirectoryData} implementation.
     * The {@link DirectoryPage} will be mutable when it is first created and is
     * made immutable when it is persisted. If there is a mutation operation,
     * the backing {@link IDirectoryData} is automatically converted into a
     * mutable instance.
     */
	public boolean isReadOnly() {
		return data.isReadOnly();
	}

    /**
     * @param htree
     *            The owning hash tree.
     * @param overflowDirectory
     *            true iff this is an overflow directory page.
     * @param globalDepth
     *            The size of the address space (in bits) for each buddy hash
     *            table on a directory page. The global depth of a node is
     *            defined recursively as the local depth of that node within its
     *            parent. The global/local depth are not stored explicitly.
     *            Instead, the local depth is computed dynamically when the
     *            child will be materialized by counting the #of pointers to the
     *            the child in the appropriate buddy hash table in the parent.
     *            This local depth value is passed into the constructor when the
     *            child is materialized and set as the global depth of the
     *            child.
     */
	@SuppressWarnings("unchecked")
    public DirectoryPage(final HTree htree, final byte[] overflowKey,
            final int globalDepth) {

		super(htree, true/* dirty */, globalDepth);

		childRefs = new Reference[(1 << htree.addressBits)];

		data = new MutableDirectoryPageData(overflowKey, htree.addressBits,
				htree.versionTimestamps);

	}

	/**
	 * Deserialization constructor - {@link #globalDepth} MUST be set by the
	 * caller.
	 * 
	 * @param htree
	 * @param addr
	 * @param data
	 */
	DirectoryPage(final HTree htree, final long addr, final IDirectoryData data) {

		super(htree, false/* dirty */, 0/*unknownGlobalDepth*/);

        setIdentity(addr);

		this.data = data;

        childRefs = new Reference[(1 << htree.addressBits)];

	}

    /**
     * Copy constructor.
     * 
     * @param src
     *            The source node (must be immutable).
     * 
     * @param triggeredByChildId
     *            The persistent identity of the child that triggered the copy
     *            constructor. This should be the immutable child NOT the one
     *            that was already cloned. This information is used to avoid
     *            stealing the original child since we already made a copy of
     *            it. It is {@link #NULL} when this information is not
     *            available, e.g., when the copyOnWrite action is triggered by a
     *            join() and we are cloning the sibling before we redistribute a
     *            key to the node/leaf on which the join was invoked.
     * 
     * @todo We could perhaps replace this with the conversion of the
     *       INodeData:data field to a mutable field since the code which
     *       invokes copyOnWrite() no longer needs to operate on a new Node
     *       reference. However, I need to verify that nothing else depends on
     *       the new Node, e.g., the dirty flag, addr, etc.
     * 
     * @todo Can't we just test to see if the child already has this node as its
     *       parent reference and then skip it? If so, then that would remove a
     *       troublesome parameter from the API.
     */
	protected DirectoryPage(final DirectoryPage src,
			final long triggeredByChildId) {

        super(src);

        assert !src.isDirty();

        assert src.isReadOnly();
        // assert src.isPersistent();

		/*
		 * Steal/clone the data record.
		 * 
		 * Note: The copy constructor is invoked when we need to begin mutation
		 * operations on an immutable node or leaf, so make sure that the data
		 * record is mutable.
		 */
		final int slotsOnPage = 1 << htree.addressBits;

        assert src.data != null;
		this.data = src.isReadOnly() ? new MutableDirectoryPageData(
				htree.addressBits, src.data) : src.data;
        assert this.data != null;

        // clear reference on source.
        src.data = null;

        /*
         * Steal strongly reachable unmodified children by setting their parent
         * fields to the new node. Stealing the child means that it MUST NOT be
         * used by its previous ancestor (our source node for this copy).
         */

        childRefs = src.childRefs;
        src.childRefs = null;
        
        // childLocks = src.childLocks; src.childLocks = null;

        for (int i = 0; i < slotsOnPage; i++) {
        	
            final AbstractPage child = deref(i);

			/*
			 * Note: Both child.identity and triggeredByChildId will always be
			 * 0L for a transient B+Tree since we never assign persistent
			 * identity to the nodes and leaves. Therefore [child.identity !=
			 * triggeredByChildId] will fail for ALL children, including the
			 * trigger, and therefore fail to set the parent on any of them. The
			 * [btree.store==null] test handles this condition and always steals
			 * the child, setting its parent to this new node.
			 * 
			 * FIXME It is clear that testing on child.identity is broken in
			 * some other places for the transient store. [This comment is
			 * carried over from the B+Tree code.]
			 */
            if (child != null
                    && (htree.store == null || child.getIdentity() != triggeredByChildId)) {

                /*
                 * Copy on write should never trigger for a dirty node and only
                 * a dirty node can have dirty children.
                 */
                assert !child.isDirty();

                // Steal the child.
                child.parent = (Reference) this.self;
                // child.parent = btree.newRef(this);

                // // Keep a reference to the clean child.
                // childRefs[i] = new WeakReference(child);

            }

        }

    }

    /**
	 * Iterator visits children, recursively expanding each child with a
	 * post-order traversal of its children and finally visits this node itself.
	 */
	@Override
	@SuppressWarnings("unchecked")
	public Iterator postOrderNodeIterator(
			final boolean dirtyNodesOnly, final boolean nodesOnly) {

		if (dirtyNodesOnly && !dirty) {

			return EmptyIterator.DEFAULT;

		}

		/*
		 * Iterator append this node to the iterator in the post-order position.
		 */

		return new Striterator(postOrderIterator1(dirtyNodesOnly, nodesOnly))
				.append(new SingleValueIterator(this));

	}

	/**
	 * Iterator visits children recursively expanding each child with a
	 * post-order traversal of its children and finally visits this node itself.
	 */
	@SuppressWarnings("unchecked")
	public Iterator postOrderIterator() {

		/*
		 * Iterator append this node to the iterator in the post-order position.
		 */

		return new Striterator(postOrderIterator2())
				.append(new SingleValueIterator(this));

	}

	/**
	 * Visits the children (recursively) using post-order traversal, but does
	 * NOT visit this node.
	 */
	@SuppressWarnings("unchecked")
	private Iterator postOrderIterator2() {

		/*
		 * Iterator visits the direct children, expanding them in turn with a
		 * recursive application of the post-order iterator.
		 */

		return new Striterator(childIterator()).addFilter(new Expander() {

			private static final long serialVersionUID = 1L;

			/*
			 * Expand each child in turn.
			 */
			protected Iterator expand(final Object childObj) {

				/*
				 * A child of this node.
				 */

				final AbstractPage child = (AbstractPage) childObj;

				if (!child.isLeaf()) {

					/*
					 * The child is a Node (has children).
					 * 
					 * Visit the children (recursive post-order traversal).
					 */

					// BTree.log.debug("child is node: " + child);
					final Striterator itr = new Striterator(
							((DirectoryPage) child).postOrderIterator2());

					// append this node in post-order position.
					itr.append(new SingleValueIterator(child));

					return itr;

				} else {

					/*
					 * The child is a leaf.
					 */

					// BTree.log.debug("child is leaf: " + child);

					// Visit the leaf itself.
					return new SingleValueIterator(child);

				}
			}
		});

	}

	/**
	 * Iterator visits the direct child nodes in the external key ordering.
	 */
	Iterator childIterator() {

		return new ChildIterator();

	}

	/**
	 * Visit the distinct children exactly once (if there are multiple pointers
	 * to a given child, that child is visited just once).
	 */
	private class ChildIterator implements Iterator {

		final private int slotsPerPage = 1 << htree.addressBits;
		private int slot = 0;
		private AbstractPage child = null;

		private ChildIterator() {
			nextChild(); // materialize the first child.
		}

        /**
         * Advance to the next distinct child, materializing it if necessary.
         * The first time, this will always return the child in slot zero on the
         * page. Thereafter, it will skip over pointers to the same child and
         * return the next distinct child.
         * 

         * Note: For the special case of an overflow directory we allow a
         * null pointer for a child at a given index.
         * 
         * @return true iff there is another distinct child
         *         reference.
         */
		private boolean nextChild() {
//		    final boolean isOverflowDirectory = isOverflowDirectory();
			for (; slot < slotsPerPage; slot++) {
			    AbstractPage tmp = deref(slot);
                if (tmp == null
                        && data.getChildAddr(slot) == NULL) {
			        // Null pointers allowed either in an overflow or with lazy child creation.
			        continue;
			    }
                tmp = tmp == null ? getChild(slot) : tmp;
				if (tmp != child) {
					child = tmp;
					return true;
				}
			}
			return false;
		}

		@Override
		public boolean hasNext() {
			/*
			 * Return true if there is another child to be visited.
			 * 
			 * Note: This depends on nextChild() being invoked by the
			 * constructor and by next().
			 */
			return slot < slotsPerPage;
		}

		@Override
		public AbstractPage next() {
			if (!hasNext())
				throw new NoSuchElementException();
			final AbstractPage tmp = child;
			nextChild(); // advance to the next child (if any).
			return tmp;
		}

		@Override
		public void remove() {
			throw new UnsupportedOperationException();
		}

	}

	/**
	 * Human readable representation of the {@link Node}.
	 */
	@Override
	public String toString() {

		final StringBuilder sb = new StringBuilder();

		// sb.append(getClass().getName());
		sb.append(super.toString());

		sb.append("{ isDirty=" + isDirty());

		sb.append(", isDeleted=" + isDeleted());

		sb.append(", addr=" + identity);

		final DirectoryPage p = (parent == null ? null : parent.get());

		sb.append(", parent=" + (p == null ? "N/A" : p.toShortString()));

		sb.append(", isRoot=" + (htree.root == this));

		if (data == null) {

			// No data record? (Generally, this means it was stolen by copy on
			// write).
			sb.append(", data=NA}");

			return sb.toString();

		}
		sb.append(", globalDepth=" + getGlobalDepth());
		sb.append(", nbuddies=" + (1 << htree.addressBits) / (1 << globalDepth));
		sb.append(", slotsPerBuddy=" + (1 << globalDepth));
		// sb.append(", minKeys=" + minKeys());
		//
		// sb.append(", maxKeys=" + maxKeys());

		toString(this, sb);

		// indicate if each child is loaded or unloaded.
		{

			final int nchildren = getChildCount();

			sb.append(", children=[");

			for (int i = 0; i < nchildren; i++) {

				if (i > 0)
					sb.append(", ");

				final AbstractPage child = childRefs[i] == null ? null
						: childRefs[i].get();

				sb.append(child == null ? "U" : "L");

			}

			sb.append("]");

		}

		sb.append("}");

		return sb.toString();

	}

	/**
	 * Visits the children (recursively) using post-order traversal, but does
	 * NOT visit this node.
	 */
	@SuppressWarnings("unchecked")
	private Iterator postOrderIterator1(
			final boolean dirtyNodesOnly, final boolean nodesOnly) {

		/*
		 * Iterator visits the direct children, expanding them in turn with a
		 * recursive application of the post-order iterator.
		 * 
		 * When dirtyNodesOnly is true we use a child iterator that makes a best
		 * effort to only visit dirty nodes. Especially, the iterator MUST NOT
		 * force children to be loaded from disk if the are not resident since
		 * dirty nodes are always resident.
		 * 
		 * The iterator must touch the node in order to guarantee that a node
		 * will still be dirty by the time that the caller visits it. This
		 * places the node onto the hard reference queue and increments its
		 * reference counter. Evictions do NOT cause IO when the reference is
		 * non-zero, so the node will not be made persistent as a result of
		 * other node touches. However, the node can still be made persistent if
		 * the caller explicitly writes the node onto the store.
		 */

		// BTree.log.debug("node: " + this);
		return new Striterator(childIterator(dirtyNodesOnly))
				.addFilter(new Expander() {

					private static final long serialVersionUID = 1L;

					/*
					 * Expand each child in turn.
					 */
					protected Iterator expand(final Object childObj) {

						/*
						 * A child of this node.
						 */

						final AbstractPage child = (AbstractPage) childObj;

						if (dirtyNodesOnly && !child.isDirty()) {

							return EmptyIterator.DEFAULT;

						}

						if (child instanceof DirectoryPage) {

							/*
							 * The child is a Node (has children).
							 */

							// visit the children (recursive post-order
							// traversal).
							final Striterator itr = new Striterator(
									((DirectoryPage) child).postOrderIterator1(
											dirtyNodesOnly, nodesOnly));

							// append this node in post-order position.
							itr.append(new SingleValueIterator(child));

							return itr;

						} else {

							/*
							 * The child is a leaf.
							 */

							// Visit the leaf itself.
							if (nodesOnly)
								return EmptyIterator.DEFAULT;

							return new SingleValueIterator(child);

						}
					}
				});

	}

	/**
	 * Iterator visits the direct child nodes in the external key ordering.
	 * 
	 * @param dirtyNodesOnly
	 *            When true, only the direct dirty child nodes will be visited.
	 */
	public Iterator childIterator(final boolean dirtyNodesOnly) {

		if (dirtyNodesOnly) {

			return new DirtyChildIterator(this);

		} else {

			return new ChildIterator();

		}

	}

	/**
	 * TODO We should dump each bucket page once. This could be done either by
	 * dumping each buddy bucket on the page separately or by skipping through
	 * the directory page until we get to the next bucket page and then dumping
	 * that.
	 * 
	 * TODO The directory page validation should include checks on the bucket
	 * references and addresses. For a given buddy hash table, the reference and
	 * address should pairs should be consistent if either the reference or the
	 * address appears in another slot of that table. Also, there can not be
	 * "gaps" between observations of a reference to a given bucket - once you
	 * see another bucket reference a previously observed reference can not then
	 * appear.
	 * 
	 * @see HTree#validatePointersInParent(DirectoryPage, int, AbstractPage)
	 */
	@Override
	protected boolean dump(final Level level, final PrintStream out, final int height,
			final boolean recursive, final boolean materialize) {

		// True iff we will write out the node structure.
		final boolean debug = level.toInt() <= Level.DEBUG.toInt();

		// Set true iff an inconsistency is detected.
		boolean ok = true;

		// final int branchingFactor = this.getBranchingFactor();
		// final int nkeys = getKeyCount();
		// final int minKeys = this.minKeys();
		// final int maxKeys = this.maxKeys();

		if (this == htree.root) {
			if (parent != null) {
				out.println(indent(height)
						+ "ERROR: this is the root, but the parent is not null.");
				ok = false;
			}
		} else {
			/*
			 * Note: there is a difference between having a parent reference and
			 * having the parent be strongly reachable. However, we actually
			 * want to maintain both -- a parent MUST always be strongly
			 * reachable ... UNLESS you are doing a fast forward or reverse leaf
			 * scan since the node hierarchy is not being traversed in that
			 * case.
			 */
			if (parent == null) {
				out.println(indent(height)
						+ "ERROR: the parent reference MUST be defined for a non-root node.");
				ok = false;
			} else if (parent.get() == null) {
				out.println(indent(height)
						+ "ERROR: the parent is not strongly reachable.");
				ok = false;
			}
		}

		if (debug) {
			out.println(indent(height) + toString());
		}

		/*
		 * Look for inconsistencies for children. A dirty child MUST NOT have an
		 * entry in childAddr[] since it is not persistent and MUST show up in
		 * dirtyChildren. Likewise if a child is NOT dirty, then it MUST have an
		 * entry in childAddr and MUST NOT show up in dirtyChildren.
		 * 
		 * This also verifies that all entries beyond nchildren (nkeys+1) are
		 * unused.
		 */
		for (int i = 0; i < (1 << htree.addressBits); i++) {

			/*
			 * Scanning a valid child index.
			 * 
			 * Note: This is not fetching the child if it is not in memory --
			 * perhaps it should using its persistent id?
			 */

			final AbstractPage child = (childRefs[i] == null ? null
					: childRefs[i].get());

			if (child != null) {

				if (child.parent == null || child.parent.get() == null) {
					/*
					 * the reference to the parent MUST exist since the we are
					 * the parent and therefore the parent is strongly
					 * reachable.
					 */
					out.println(indent(height) + "  ERROR child[" + i
							+ "] does not have parent reference.");
					ok = false;
				}

				if (child.parent.get() != this) {
					out.println(indent(height) + "  ERROR child[" + i
							+ "] has wrong parent.");
					ok = false;
				}

				if (child.isDirty()) {
					/*
					 * Dirty child. The parent of a dirty child MUST also be
					 * dirty.
					 */
					if (!isDirty()) {
						out.println(indent(height) + "  ERROR child[" + i
								+ "] is dirty, but its parent is clean");
						ok = false;
					}
					if (childRefs[i] == null) {
						out.println(indent(height) + "  ERROR childRefs[" + i
								+ "] is null, but the child is dirty");
						ok = false;
					}
					if (getChildAddr(i) != NULL) {
						out.println(indent(height) + "  ERROR childAddr[" + i
								+ "]=" + getChildAddr(i) + ", but MUST be "
								+ NULL + " since the child is dirty");
						ok = false;
					}
				} else {
					/*
					 * Clean child (ie, persistent). The parent of a clean child
					 * may be either clear or dirty.
					 */
					if (getChildAddr(i) == NULL) {
						out.println(indent(height) + "  ERROR childKey[" + i
								+ "] is " + NULL + ", but child is not dirty");
						ok = false;
					}
				}

			}

		}

		if (!ok && !debug) {

			// @todo show the node structure with the errors since we would not
			// have seen it otherwise.

		}

		if (recursive) {

			/*
			 * Dump children using pre-order traversal.
			 */

			final Set dirty = new HashSet();

			for (int i = 0; i < (1 << htree.addressBits); i++) {

				if (childRefs[i] == null && !isReadOnly()
						&& ((MutableDirectoryPageData) data).childAddr[i] == 0) {

					/*
					 * This let's us dump a tree with some kinds of structural
					 * problems (missing child reference or key).
					 */

					out.println(indent(height + 1)
							+ "ERROR can not find child at index=" + i
							+ ", skipping this index.");

					ok = false;

					continue;

				}

				/*
				 * Note: this works around the assert test for the index in
				 * getChild(index) but is not able/willing to follow a childKey
				 * to a child that is not memory resident.
				 */
				// AbstractNode child = getChild(i);
				final AbstractPage child = childRefs[i] == null ? null
						: childRefs[i].get();

				if (child != null) {

					if (child.parent == null) {

						out.println(indent(height + 1)
								+ "ERROR child does not have parent reference at index="
								+ i);

						ok = false;

					}

					if (child.parent.get() != this) {

						out.println(indent(height + 1)
								+ "ERROR child has incorrect parent reference at index="
								+ i);

						ok = false;

					}

					if (child.isDirty()) {

						dirty.add(child);

					}

					if (!child.dump(level, out, height + 1, true, materialize)) {

						ok = false;

					}

				}

			}

		}

		return ok;

	}

   /**
    * {@inheritDoc}
    * 
    * @see  pre-heat the journal
    *      on startup 
    */
   @Override
   public void dumpPages(final boolean recursive, final boolean visitLeaves,
         final HTreePageStats stats) {

      stats.visit(htree, this);

      if (!recursive)
         return;

      // materialize children.
      final Iterator itr = childIterator();

      while (itr.hasNext()) {

         final AbstractPage child = itr.next();

         if (!visitLeaves && child instanceof BucketPage) {

            /*
             * Note: This always reads the child and then filters out leaves.
             * 
             * Note: It might not be possible to lift this constraint into the
             * childIterator() due to the HTree design (per Martyn's
             * recollection).
             */
            continue;
            
         }

         // recursion into children.
         child.dumpPages(recursive, visitLeaves, stats);

      }

   }

	/**
	 * Utility method formats the {@link IDirectoryData}.
	 * 
	 * @param data
	 *            A data record.
	 * @param sb
	 *            The representation will be written onto this object.
	 * 
	 * @return The sb parameter.
	 */
	static public StringBuilder toString(final IDirectoryData data,
			final StringBuilder sb) {

		final int nchildren = data.getChildCount();

		sb.append(", nchildren=" + nchildren);

		// sb.append(", spannedTupleCount=" + data.getSpannedTupleCount());
		//
		// sb.append(",\nkeys=" + data.getKeys());

		{

			sb.append(",\nchildAddr=[");

			for (int i = 0; i < nchildren; i++) {

				if (i > 0)
					sb.append(", ");

				sb.append(data.getChildAddr(i));

			}

			sb.append("]");

		}

		// {
		//
		// sb.append(",\nchildEntryCount=[");
		//
		// for (int i = 0; i < nchildren; i++) {
		//
		// if (i > 0)
		// sb.append(", ");
		//
		// sb.append(data.getChildEntryCount(i));
		//
		// }
		//
		// sb.append("]");
		//
		// }

		if (data.hasVersionTimestamps()) {

			sb.append(",\nversionTimestamps={min="
					+ data.getMinimumVersionTimestamp() + ",max="
					+ data.getMaximumVersionTimestamp() + "}");

		}

		return sb;

	}

	@Override
	public void PP(final StringBuilder sb, final boolean showBinary) {

		sb.append(PPID() + " [" + globalDepth + "] " + indent(getLevel()));

		sb.append("("); // start of address map.

		// #of buddy tables on a page.
		final int nbuddies = (1 << htree.addressBits) / (1 << globalDepth);

		// #of address slots in each buddy hash table.
		final int slotsPerBuddy = (1 << globalDepth);

		for (int i = 0; i < nbuddies; i++) {

			if (i > 0) // buddy boundary marker
				sb.append(";");

			for (int j = 0; j < slotsPerBuddy; j++) {

				final int slot = i * slotsPerBuddy + j;

				if (j > 0) // slot boundary marker.
					sb.append(",");

				final AbstractPage child = getChildIfPresent(slot);

				sb.append(child == null ? "-" : child.PPID());

			}

		}

		sb.append(")"); // end of address map.

		sb.append("\n");

		final Iterator itr = childIterator();

		while (itr.hasNext()) {

			final AbstractPage child = itr.next();

			child.PP(sb, showBinary);

		}

	}

	/**
	 * Tests the slot for content
	 * 
	 * @param slot - the slot to check
	 * @return an existing AbstractPage if present and null otherwise
	 */
	AbstractPage getChildIfPresent(final int slot) {
		if (childRefs[slot] == null && data.getChildAddr(slot) == IRawStore.NULL) {
			return null;
		} else {
			return getChild(slot);
		}
	}

    /**
     * Invoked by {@link #copyOnWrite()} to clear the persistent address for a
     * child on a cloned parent and set the reference to the cloned child.
     * 
     * @param oldChildAddr
     *            The persistent address of the old child. The entries to be
     *            updated are located based on this argument. It is an error if
     *            this address is not found in the list of child addresses for
     *            this {@link Node}.
     * @param newChild
     *            The reference to the new child.
     */
	// FIXME Reconcile two versions of replaceChildRef
	// FIXME Reconcile pattern for deleting a persistent object (htree AND btree)
    void replaceChildRef(final long oldChildAddr, final AbstractPage newChild) {

        assert oldChildAddr != NULL || htree.store == null;
        assert newChild != null;

        // This node MUST have been cloned as a pre-condition, so it can not
        // be persistent.
        assert !isPersistent();
        assert !isReadOnly();

        // The newChild MUST have been cloned and therefore MUST NOT be
        // persistent.
        assert !newChild.isPersistent();

        assert !isReadOnly();

		final MutableDirectoryPageData data = (MutableDirectoryPageData) this.data;

		final int slotsOnPage = 1 << htree.addressBits;

		// Scan for location in weak references.
		int npointers = 0;
		boolean found = false;
		for (int i = 0; i < slotsOnPage; i++) {

            if (data.childAddr[i] == oldChildAddr) {

                found = true;
                
                // remove from cache and free the oldChildAddr if the Strategy
                // supports it.
                // TODO keep this in case we add in the store cache again.
//				if (htree.storeCache != null) {
//					// remove from cache.
//					htree.storeCache.remove(oldChildAddr);
//				}
                // free the oldChildAddr if the Strategy supports it
            	// - and only if not already deleted!
            	if (npointers == 0)
            		htree.deleteNodeOrLeaf(oldChildAddr);
                // System.out.println("Deleting " + oldChildAddr);

                // Clear the old key.
                data.childAddr[i] = NULL;

                // Stash reference to the new child.
                // childRefs[i] = btree.newRef(newChild);
                childRefs[i] = (Reference) newChild.self;
                if (newChild.isPersistent()) {
                	data.childAddr[i] = newChild.getIdentity();
                }

                // // Add the new child to the dirty list.
                // dirtyChildren.add(newChild);

                // Set the parent on the new child.
                // newChild.parent = btree.newRef(this);
                newChild.parent = (Reference) this.self;

                npointers++;
                
            } else if (found) {

                // No more pointers to that child.
                break;
                
            }

        }

		if (npointers == 0)
			throw new IllegalArgumentException("Not our child : oldChildAddr="
					+ oldChildAddr);

    }
    
    int replaceChildRef(final Reference oldRef, final AbstractPage newChild) {
    	
        final int slotsOnPage = 1 << htree.addressBits;

		final MutableDirectoryPageData data = (MutableDirectoryPageData) this.data;
		
		// Scan for location in weak references.
		int firstSlot = -1;
		int npointers = 0;
		for (int i = 0; i < slotsOnPage; i++) {

            if (childRefs[i] == oldRef) {
            	
            	if (firstSlot == -1) firstSlot = i; 

                // Clear the old key.
                data.childAddr[i] = NULL;

                // Stash reference to the new child.
                // childRefs[i] = btree.newRef(newChild);
                if (newChild != null) {
	                childRefs[i] = (Reference) newChild.self;
	                
	                if (newChild.isPersistent()) {
	                	data.childAddr[i] = newChild.getIdentity();
	                }
	
	                newChild.parent = (Reference) this.self;
                } else {
                	childRefs[i] = null;
                }

                npointers++;
            }

        }
		
		assert npointers > 0;
		
		return firstSlot;
    	
    }

	void _addLevel(final BucketPage bucketPage) {
		assert !isReadOnly();
		assert !isOverflowDirectory();

		/**
		 * TBD: Since for _addLevel to be called there should only be a single reference to
		 * bucketPage, this directory MUST be at global depth.  BUT, rather than
		 * replacing the only the old bucket page with the reference to the new
		 * directory, we should/could create a directory of half this directory's depth
		 * and insert the requisite references
		 */
		
        // Create new directory to insert
        final DirectoryPage ndir = new DirectoryPage((HTree) htree,
                null, // overflowKey
                htree.addressBits/* globalDepth */);

        ((HTree) htree).nnodes++;

		// And new bucket page for the new directory
        // only add one initially - to cover the first key
		final BucketPage newPage = createPage(1);
		newPage.parent = (Reference) ndir.self;

		final byte[] tstkey = bucketPage.getFirstKey();
		final int bucketSlot = getLocalHashCode(tstkey, getPrefixLength() + globalDepth);
		final int bucketRefs = (1 << htree.addressBits) >> 1; // half total number of directory slots
		final boolean fillLowerSlots = bucketSlot < bucketRefs;

		final Reference a =  (Reference) (fillLowerSlots ? newPage.self : null);
		final Reference b =  (Reference) (fillLowerSlots ? null : newPage.self);
		
		// nleaves is unchanged since we will create one new page and delete the old
		// ((HTree) htree).nleaves++; // Note: only +1 since we will delete the oldPage.
		
		// Link the new bucket pages into the new parent directory page.
		for (int i = 0; i < bucketRefs; i++) {
			ndir.childRefs[i] = a;
			ndir.childRefs[i+bucketRefs] = b;
		}
		
		// now replace the reference to the old bucket page with the new directory
		replaceChildRef(bucketPage.self, ndir);
		
		// ensure that the bucket page doesn't evict		
		bucketPage.delete();
		
		// insert old tuples
		final int bucketSlotsPerPage = bucketPage.slotsOnPage();
		for (int i = 0; i < bucketSlotsPerPage; i++) {
			((HTree) htree).insertRawTuple(bucketPage, i);
		}

		// ...and finally delete old data
		if (bucketPage.isPersistent()) {
			htree.deleteNodeOrLeaf(bucketPage.getIdentity());
		}
		
	}

	/**
	 * This method should only be called when using a DirectoryPage as a BucketPage
	 * Blob.  The final child in the blob is used for insert by default.
	 * 
	 * @param child - the child to be added
	 */
	void _addChild(final AbstractPage child) {
		assert isOverflowDirectory();
		assert !isReadOnly();
		
		// find available slot
		final MutableDirectoryPageData pdata = (MutableDirectoryPageData) data;
		
		for (int i = 0; i < pdata.childAddr.length; i++) {

		    final AbstractPage aChild = childRefs[i] == null ? null
                    : childRefs[i].get();
            
		    if (aChild == null && pdata.childAddr[i] == NULL) {

				childRefs[i] = (Reference) child.self;
				
				assert !child.isPersistent();
				
				child.parent = (Reference) self;
				
				return;
			
		    }
		    
		}
		
		// else insert new level above this one and add child to that
		final DirectoryPage pd = getParentDirectory();
		if (pd.isOverflowDirectory()) { // already handles blobs
			assert false; // unsure for now
		} else {
            final DirectoryPage blob = new DirectoryPage((HTree) htree,
                    getOverflowKey(),
                    getOverflowPageDepth());
			blob._addChild(this);
			blob._addChild(child);
			
			pd.replaceChildRef(this.self, blob);
			
			if (INFO)
				log.info("New Overflow Level: " + getLevel());
		}
	}

    /**
     * Return the last non-null child of an overflow directory
     * page.
     */
	BucketPage lastChild() {
        assert isOverflowDirectory();
        for (int i = data.getChildCount() - 1; i >= 0; i--) {
            final BucketPage aChild = (BucketPage) deref(i);
            if (aChild != null) {
                // htree.touch(aChild);
                return aChild;
            }
            if (data.getChildAddr(i) != NULL)
                return (BucketPage) getChild(i);
        }
		
		throw new AssertionError();
	}

    /**
     * Double indirection dereference for the specified index. If the
     * {@link Reference} is null, returns null.
     * Otherwise returns {@link Reference#get()}.
     */
	protected AbstractPage deref(final int index) {

	    return childRefs[index] == null ? null  : childRefs[index].get();

	}
	
	@Override
	public boolean isOverflowDirectory() {
	    
	    return data.isOverflowDirectory();
	    
	}
	
	/**
	 * If this is an overflow directory then the depth-based hashCode is irrelevant
	 * since it is used as a blob container for BucketPage references.
	 */
	@Override
	public int getLocalHashCode(final byte[] key, final int prefixLength) {
		if (isOverflowDirectory()) {
			/*
			 * Shouldn't need to check the key, this will be handled when
			 * the BucketPage is checked for a precise match
			 */
			return 0;
		}
		
		return super.getLocalHashCode(key,  prefixLength);

	}

	/**
	 * This method is never called at present since DirectoryPages are
	 * always created at maximum depth.  Whether there is any advantage
	 * in supporting pages of lesser depths is yet to be determined.
	 * 
	 * @param buddyOffset
	 * @param oldChild
	 */
    public void split(final int buddyOffset, final DirectoryPage oldChild) {

        if (true) {
            /*
             * FIXME We need to update this code to handle the directory page
             * not being at maximum depth to work without the concept of buddy
             * buckets.
             */
            throw new UnsupportedOperationException();
        }
        
        if (parent == null)
            throw new IllegalArgumentException();
        if (oldChild == null)
            throw new IllegalArgumentException();
        if (oldChild.globalDepth >= globalDepth) {
            /*
             * In this case we have to introduce a new directory level instead
             * (increasing the height of the tree at that point).
             */
            throw new IllegalStateException();
        }
        if (buddyOffset < 0)
            throw new IllegalArgumentException();
        if (buddyOffset >= (1 << htree.addressBits)) {
            /*
             * Note: This check is against the maximum possible slot index. The
             * actual max buddyOffset depends on parent.globalBits also since
             * (1<> 1); i < npointers; i++) {

			if (childRefs[i] != oldChild.self)
				throw new RuntimeException("Does not point to old child.");
			
			// update the references to the new bucket.
			childRefs[i] = (Reference) newChild.self;
			
			assert !newChild.isPersistent();
			
		}
			
	} // updatePointersInParent
	
	/**
	 * Redistribute the buddy hash tables in a {@link DirectoryPage}.
	 * 

	 * Note: We are not changing the #of hash tables, just their size and the
	 * page on which they are found. Any reference in a source buddy hash table
	 * will wind up in the "same" buddy hash table afterwards, but the page and
	 * offset on the page of the buddy hash table may have been changed and the
	 * size of the buddy hash table will have doubled.
	 * 

	 * When a {@link DirectoryPage} is split, the size of each buddy hash table
	 * is doubled. The additional slots in each buddy hash table are filled in
	 * by (a) spacing out the old slot entries in each buddy hash table; and (b)
	 * filling in the uncovered slot with a copy of the previous slot.
	 * 
	 * We proceed backwards, moving the upper half of the buddy hash tables to
	 * the new directory page first and then spreading out the lower half of the
	 * source page among the new buddy hash table boundaries on the source page.
	 * 
	 * @param oldDepth
	 *            The depth of the old {@link DirectoryPage} before the split.
	 * @param newDepth
	 *            The depth of the old and new {@link DirectoryPage} after the
	 *            split (this is just oldDepth+1).
	 * @param oldDir
	 *            The old {@link DirectoryPage}.
	 * @param newDir
	 *            The new {@link DirectoryPage}.
	 * 
	 * @deprecated with
	 *             {@link #splitDirectoryPage(DirectoryPage, int, DirectoryPage)}
	 */
    private void redistributeBuddyTables(final int oldDepth,
            final int newDepth, final DirectoryPage oldDir,
            final DirectoryPage newDir) {

        assert oldDepth + 1 == newDepth;
        
        // #of slots on the directory page (invariant given addressBits).
        final int slotsOnPage = (1 << htree.addressBits);

        // #of address slots in each old buddy hash table.
        final int slotsPerOldBuddy = (1 << oldDepth);

        // #of address slots in each new buddy hash table.
        final int slotsPerNewBuddy = (1 << newDepth);

        // #of buddy tables on the old bucket directory.
        final int oldBuddyCount = slotsOnPage / slotsPerOldBuddy;

        // #of buddy tables on the directory page after the split.
        final int newBuddyCount = slotsOnPage / slotsPerNewBuddy;

        final DirectoryPage srcPage = oldDir;
        final long[] srcAddrs = ((MutableDirectoryPageData) oldDir.data).childAddr;
        final Reference[] srcRefs = oldDir.childRefs;

        /*
         * Move top 1/2 of the buddy hash tables from the child to the new page.
         */
        {

            // target is the new page.
            final DirectoryPage dstPage = newDir;
            final long[] dstAddrs = ((MutableDirectoryPageData) dstPage.data).childAddr;
            final Reference[] dstRefs = dstPage.childRefs;

            // index (vs offset) of first buddy in upper half of src page.
            final int firstSrcBuddyIndex = (oldBuddyCount >> 1);

            // exclusive upper bound for index (vs offset) of last buddy in
            // upper half of src page.
            final int lastSrcBuddyIndex = oldBuddyCount;

            // exclusive upper bound for index (vs offset) of last buddy in
            // upper half of target page.
            final int lastDstBuddyIndex = newBuddyCount;

            // work backwards over buddies to avoid stomping data!
            for (int srcBuddyIndex = lastSrcBuddyIndex - 1, dstBuddyIndex = lastDstBuddyIndex - 1; //
            srcBuddyIndex >= firstSrcBuddyIndex; //
            srcBuddyIndex--, dstBuddyIndex--//
            ) {

                final int firstSrcSlot = srcBuddyIndex * slotsPerOldBuddy;

                final int lastSrcSlot = (srcBuddyIndex + 1) * slotsPerOldBuddy;

                final int firstDstSlot = dstBuddyIndex * slotsPerNewBuddy;

                for (int srcSlot = firstSrcSlot, dstSlot = firstDstSlot; srcSlot < lastSrcSlot; srcSlot++, dstSlot += 2) {

                    if (TRACE)
                        log.trace("moving: page(" + srcPage.toShortString()
                                + "=>" + dstPage.toShortString() + ")"
                                + ", buddyIndex(" + srcBuddyIndex + "=>"
                                + dstBuddyIndex + ")" + ", slot(" + srcSlot
                                + "=>" + dstSlot + ")");

                    for (int i = 0; i < 2; i++) {
                        // Copy data to slot
                        dstAddrs[dstSlot + i] = srcAddrs[srcSlot];
                        dstRefs[dstSlot + i] = srcRefs[srcSlot];
                    }

                }

            }

        }

        /*
         * Reposition the bottom 1/2 of the buddy buckets on the old page.
         * 
         * Again, we have to move backwards through the buddy tables on the
         * source page to avoid overwrites of data which has not yet been
         * copied. Also, notice that the buddy table at index ZERO does not
         * move - it is already in place even though it's size has doubled.
         */
        {

            // target is the old page.
            final DirectoryPage dstPage = oldDir;
            final long[] dstAddrs = ((MutableDirectoryPageData) dstPage.data).childAddr;
            final Reference[] dstRefs = dstPage.childRefs;

            // index (vs offset) of first buddy in lower half of src page.
            final int firstSrcBuddyIndex = 0;

            // exclusive upper bound for index (vs offset) of last buddy in
            // lower half of src page.
            final int lastSrcBuddyIndex = (oldBuddyCount >> 1);

            // exclusive upper bound for index (vs offset) of last buddy in
            // upper half of target page (which is also the source page).
            final int lastDstBuddyIndex = newBuddyCount;

            /*
             * Work backwards over buddy buckets to avoid stomping data!
             * 
             * Note: Unlike with a BucketPage, we have to spread out the data in
             * the slots of the first buddy hash table on the lower half of the
             * page as well to fill in the uncovered slots. This means that we
             * have to work backwards over the slots in each source buddy table
             * to avoid stomping our data.
             */
            for (int srcBuddyIndex = lastSrcBuddyIndex - 1, dstBuddyIndex = lastDstBuddyIndex - 1; //
            srcBuddyIndex >= firstSrcBuddyIndex; // 
            srcBuddyIndex--, dstBuddyIndex--//
            ) {

                final int firstSrcSlot = srcBuddyIndex * slotsPerOldBuddy;

                final int lastSrcSlot = (srcBuddyIndex + 1) * slotsPerOldBuddy;

//                final int firstDstSlot = dstBuddyIndex * slotsPerNewBuddy;

                final int lastDstSlot = (dstBuddyIndex + 1) * slotsPerNewBuddy;

                for (int srcSlot = lastSrcSlot-1, dstSlot = lastDstSlot-1; srcSlot >= firstSrcSlot; srcSlot--, dstSlot -= 2) {

                    if (TRACE)
                        log.trace("moving: page(" + srcPage.toShortString()
                                + "=>" + dstPage.toShortString() + ")"
                                + ", buddyIndex(" + srcBuddyIndex + "=>"
                                + dstBuddyIndex + ")" + ", slot(" + srcSlot
                                + "=>" + dstSlot + ")");

                    for (int i = 0; i < 2; i++) {
                        // Copy data to slot.
                        dstAddrs[dstSlot - i] = srcAddrs[srcSlot];
                        dstRefs[dstSlot - i] = srcRefs[srcSlot];
                    }
                    
                }

            }

        }

    }

    /**
	 * Use Striterator Expander to optimize iteration
	 */
	public ITupleIterator getTuples() {
		// start with child nodes
		final Striterator tups = new Striterator(childIterator());
		
		// expand child contents
		tups.addFilter(new Expander() {

			protected Iterator expand(Object obj) {
				if (obj instanceof BucketPage) {
					return ((BucketPage) obj).tuples();
				} else {
					return ((DirectoryPage) obj).getTuples();
				}					
			}
			
		});
		
		// wrap striterator for return type
		return new ITupleIterator() {
			
			public ITuple next() {
				return (ITuple) tups.next();
			}

			public boolean hasNext() {
				return tups.hasNext();
			}

			public void remove() {
				throw new UnsupportedOperationException();
			}
			
		};
		
	}

	@Override
	boolean isClean() {
		for (int i = 0; i < childRefs.length; i++) {
			final AbstractPage node = deref(i);
			if (node != null && !node.isClean()) {
				return false;
			}
		}
		
		return !isDirty();
	}

	public byte[] getOverflowKey() {
		return data.getOverflowKey();
	}

	/**
	 * Called on the condition that an attempt is made to insert a key/value
	 * into an overflow directory with a different key.
	 * 
	 * In this case, this directory has been created to extend the bit resolution
	 * so that the new key/value is resolved to a different slot and a new BucketPage
	 * 
	 * @param overflowKey
	 * @param child
	 */
	DirectoryPage _addLevelForOverflow(final DirectoryPage current) {
		
		if (isReadOnly()) {
			DirectoryPage copy = (DirectoryPage) copyOnWrite(current.getIdentity());
			return copy._addLevelForOverflow(current);
		}
		
		// EvictionProtection ep = new EvictionProtection(this);
		// try {		
			final DirectoryPage newdir = new DirectoryPage((HTree) htree, null /*overflowKey*/, htree.addressBits);
			
			if (isReadOnly()) // TBD: Remove debug point
				assert !isReadOnly();
			
			replaceChildRef(current.self, newdir);
			
			
			// do not want or need to push all bucket values to redistribute, just want to
			// set the directory to relevant key.
			// Logically we want to split the bucket pages until there is only one at the key provided
			final byte[] overflowKey = current.getOverflowKey();
			
			final int slot = newdir.getLocalHashCode(overflowKey, newdir._getPrefixLength());
			
			newdir.childRefs[slot] = (Reference) current.self;
			((MutableDirectoryPageData) newdir.data).childAddr[slot] = current.identity;
			current.parent = (Reference) newdir.self;
			
			try {
				current._protectFromEviction();
				
				// now fill others with BucketPage references
				newdir._fillChildSlots(slot, 0, childRefs.length, 0);
			} finally {
				current._releaseProtection();
			}
			
			// System.out.println("_addLevelForOverflow, level: " + newdir.getLevel());
			
			return newdir;
		// } finally {
		//	ep.release();
		// }
	}
	
	private void _touchHierarchy() {
		DirectoryPage dp = this;
		while (dp != null) {
			htree.touch(dp);
			dp = dp.getParentDirectory();
		}
	}

	/**
	 * crawl up the hierarchy decrementing the eviction count
	 * if any hit zero then explicitly evict
	 */
	private void _releaseProtection() {
		DirectoryPage dp = this;
		while (dp != null) {
			if (--dp.referenceCount == 0) {
				htree.writeNodeRecursive(dp);
			}
			dp = dp.getParentDirectory();
		}
	}

	/**
	 * crawl up the hierarchy incrementing the eviction count
	 */
	private void _protectFromEviction() {
		DirectoryPage dp = this;
		while (dp != null) {
			dp.referenceCount++;
			dp = dp.getParentDirectory();
		}
	}

	/**
	 * This method distributes new BucketPages in the child slots around a specified slot
	 * 
	 * The purpose is to create the minimum number of pages to enable the specified slot
	 * to have unique contents. Examples where X indicates specified slot:
	 * 
	 * 2bits
	 * X211
	 * 112X
	 * 
	 * 3bits
	 * X3221111
	 * 3X221111
	 * 22X31111
	 * 1111X322
	 * 
	 * @param slot - the slot to NOT fill
	 * @param offset from where to fill
	 * @param length of range
	 * @param depth of pages
	 */
	private void _fillChildSlots(final int slot, final int offset, final int length, final int depth) {
		assert !isReadOnly();
		
		if (slot == offset && length == 1) {
			assert childRefs[offset] != null;
			
			return;
		}
		
		if (slot >= offset && slot < (offset + length)) {
			final int delta = length/2;
			_fillChildSlots(slot, offset, delta, depth+1);
			_fillChildSlots(slot, offset+delta, delta, depth+1);
		} else {
			final BucketPage bp = new BucketPage((HTree) htree, depth);
			bp.parent = (Reference) self;
			((HTree) htree).nleaves++;

			for (int s = 0; s < length; s++) {
				if (isReadOnly())
					assert !isReadOnly();
				
				assert childRefs[offset+s] == null;
				
				childRefs[offset+s] = (Reference) bp.self;
			}
		}
	}

	int _getPrefixLength() {
		int prefixLength = 0;
		DirectoryPage p = getParentDirectory();
		while (p != null) {
			prefixLength += p.globalDepth;
			p = p.getParentDirectory();
		}
		
		return prefixLength;
	}


	/**
     * Ensure their is a single reference to the BucketPage. This assumption is
     * required in the case of overflow, since an overflow Bucket will hold
     * values for a single key.
     * 
     * @param key
     * @param bp
     */
	void _ensureUniqueBucketPage(final byte[] key,
		Reference bp) {
		final int prefixLength = _getPrefixLength();
	    final int hashBits = getLocalHashCode(key, prefixLength);
	    assert childRefs[hashBits] == bp;
	    
	    int start = -1;
	    int refs = 0;
	    for (int i = 0; i < childRefs.length; i++) {
	    	if (childRefs[i] == bp) {
	    		start = start == -1 ? i : start;
	    		refs++;
	    		if (i != hashBits) {
	    			childRefs[i] = null;
	    			((MutableDirectoryPageData) data).childAddr[i] = IRawStore.NULL;
	    		}
	    	}
	    }
	    
	    assert Integer.bitCount(refs) == 1; // MUST be power of 2
	    
	    // Don't fill slots!
	    // _fillChildSlots(hashBits, start, refs, 0);
	}

   final public int removeAll(final byte[] key) {
	   if (!isOverflowDirectory())
    	throw new UnsupportedOperationException("Only valid if page is an overflow directory");
	   
		if (isReadOnly()) {
			DirectoryPage copy = (DirectoryPage) copyOnWrite(getIdentity());
			return copy.removeAll(key);
		}
		
		// call removeAll on each child, then htree.remove on existing child store
		int ret = 0;
		for (int i = 0; i < childRefs.length; i++) {
		    AbstractPage tmp = deref(i);
		    if (tmp == null && data.getChildAddr(i) != IRawStore.NULL) {
		    	tmp = getChild(i);
		    }
		    if (tmp == null) {
		    	break;
		    }
		    ret += tmp.removeAll(key);
		}
		
		// Now remove all childAddrs IFF there is a persistent identity
		for (int i = 0; i < childRefs.length; i++) {
		    final long addr = data.getChildAddr(i);
		    if (addr != IRawStore.NULL)
		    	htree.deleteNodeOrLeaf(addr);
		}
		
		return ret;
    }

    final public byte[] removeFirst(final byte[] key) {
 	   if (!isOverflowDirectory())
 	    	throw new UnsupportedOperationException("Only valid if page is an overflow directory");
 	   
 	   /**
 	    * We should check the overflowKey to see if we need to ask the last
 	    * BucketPage to remove the key.
 	    * 
 	    * If so, then we need to ensure the BucketPage is not empty after removal
 	    * and, if so, to remove the reference and mark as deleted.
 	    * 
 	    * This is further complicated if the previous sibling is a DirectoryPage.
 	    * In this case, the original parent directory should be replaced
 	    */
 	   if (BytesUtil.compareBytes(key, getOverflowKey()) == 0) {
 		   
			final AbstractPage last = lastChild();
			
			if (last.isLeaf()) {
				final BucketPage lastbp = (BucketPage) last.copyOnWrite();
				final byte[] ret = lastbp.removeFirst(key);
				
				if (lastbp.getValues().isEmpty()) {
					DirectoryPage dp = lastbp.getParentDirectory();
					
					final int slot = dp.replaceChildRef(lastbp.self, null);
					lastbp.delete();
					if (slot == 1) { // check first child for OverflowDirectory
						AbstractPage odc = dp.getChild(0);
						if (!odc.isLeaf()) {
							DirectoryPage od = (DirectoryPage) odc;
							
							assert od.isOverflowDirectory();
							
							// Right then, since this (dp) only has a single child
							// AND that child is already an overflowDirectory, then
							// its parent must replace the dp.self with the
							// odc.self
							DirectoryPage gdp = dp.getParentDirectory();
							assert gdp != null;
							gdp.replaceChildRef(dp.self, od);
							dp.delete();
							if (DEBUG) {
								log.debug("Promoted child overflowDirectory after remove from last BucketPage");
							}
						}
					} 
				}
				
				return ret;
			} else {
				return last.removeFirst(key);
			}
 	   } else {
 		   return null;
 	   }
    }

	public void removeAll() {
		Iterator chs = childIterator(false);
		while (chs.hasNext()) {
			final AbstractPage ch = chs.next();
			if (ch instanceof BucketPage) {
				if (TRACE)
					log.trace("Removing bucket page: " + ch.PPID());
				
				if (ch.isPersistent())
					htree.store.delete(ch.getIdentity());		
			} else {
				if (TRACE)
					log.trace("Removing child directory page: " + ch.PPID());
				
				((DirectoryPage) ch).removeAll();
			}
		}
		if (identity != IRawStore.NULL)
			htree.store.delete(identity);		
	}

}