com.bigdata.htree.BucketPage Maven / Gradle / Ivy

Go to download
/**

Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016.  All rights reserved.

Contact:
     SYSTAP, LLC DBA Blazegraph
     2501 Calvert ST NW #106
     Washington, DC 20008
     [email protected]

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
/*
 * Created on Dec 19, 2006
 */
package com.bigdata.htree;

import java.io.PrintStream;
import java.nio.ByteBuffer;
import java.util.Iterator;
import java.util.NoSuchElementException;

import org.apache.log4j.Level;
import org.apache.log4j.Logger;

import com.bigdata.btree.AbstractTuple;
import com.bigdata.btree.IRangeQuery;
import com.bigdata.btree.IRawRecordAccess;
import com.bigdata.btree.ITuple;
import com.bigdata.btree.ITupleIterator;
import com.bigdata.btree.data.DefaultLeafCoder;
import com.bigdata.btree.data.ILeafData;
import com.bigdata.btree.raba.IRaba;
import com.bigdata.htree.raba.MutableKeyBuffer;
import com.bigdata.htree.raba.MutableValueBuffer;
import com.bigdata.io.AbstractFixedByteArrayBuffer;
import com.bigdata.rawstore.IRawStore;
import com.bigdata.util.BytesUtil;

import cutthecrap.utils.striterators.EmptyIterator;
import cutthecrap.utils.striterators.SingleValueIterator;

/**
 * An {@link HTree} bucket page (leaf). The bucket page is comprised of one or
 * more buddy hash buckets. The #of buddy hash buckets is determined by the
 * address bits of the hash tree and the global depth of the bucket page.
 * 
 * The entire bucket page is logically a fixed size array of tuples. The
 * individual buddy hash buckets are simply offsets into that logical tuple
 * array. While inserts of distinct keys drive splits, inserts of identical keys
 * do not. Therefore, this simple logical picture is upset by the need for a
 * bucket to hold an arbitrary number of tuples having the same key.
 * 

 * Each tuple is represented by a key, a value, and various metadata bits using
 * the {@link ILeafData} API. The tuple keys are always inline within the page
 * and are often 32-bit integers. The tuple values may be either "raw records"
 * on the backing {@link IRawStore} or inline within the page.
 * 

 * The {@link ILeafData#getPriorAddr()} and {@link ILeafData#getNextAddr()}
 * fields of the {@link ILeafData} record are reserved by the hash tree to
 * encode the search order for range queries when used in combination with an
 * order preserving hash function.
 * 
 * TODO One way to tradeoff the simplicity of a local tuple array with the
 * requirement to hold an arbitrary number of duplicate keys within a bucket is
 * to split the bucket if it becomes full regardless of whether or not there are
 * duplicate keys.
 * 

 * Splitting a bucket doubles its size which causes a new bucket page to be
 * allocated to store 1/2 of the data. If the keys can be differentiated by
 * increasing the local depth, then this is the normal case and the tuples are
 * just redistributed among buddy buckets on the original bucket page and the
 * new bucket page. If the keys are identical but we force a split anyway, then
 * we will still have fewer buckets on the original page and they will be twice
 * as large. The duplicate keys will all wind up in the same buddy bucket after
 * the split, but at least the buddy bucket is 2x larger. This process can
 * continue until there is only a single buddy bucket on the page (the global
 * depth of the parent is the same as the global depth of the buddy bucket). At
 * that point, a "full" page can simply "grow" by permitting more and more
 * tuples into the page (as long as those tuples have the same key). We could
 * also chain overflow pages at that point - it all amounts to the same thing.
 * An attempt to insert a tuple having a different key into a page in which all
 * keys are known to be identical will immediately trigger a split. In this case
 * we could avoid some effort if we modified the directory structure to impose a
 * split since we already know that we have only two distinct keys (the one
 * found in all tuples of the bucket and the new key). When a bucket page
 * reaches this condition of containing only duplicate keys we could of course
 * compress the keys enormously since they are all the same.
 * 

 * This works well when we only store the address of the objects in the bucket
 * (rather than the objects themselves, e.g., raw records mode) and choose the
 * address bits based on the expected size of a tuple record. However, we can
 * also accommodate tuples with varying size (think binding sets) with in-page
 * locality if we split the buddy bucket with the most bytes when an insert
 * into the page would exceed the target page size. This looks pretty much like
 * the case above, except that we split buddy buckets based not only on whether
 * their allotted #of slots are filled by tuples but also on the data on the
 * page.
 * 
 * TODO Delete markers will also require some thought. Unless we can purge them
 * out at the tx commit point, we can wind up with a full bucket page consisting
 * of a single buddy bucket filled with deleted tuples all having the same key.
 * 
 * TODO Explore cracking. Concerning cracking, we need to be careful about the
 * thread-safety guarantee. If we did cracking for a mutation operation, that
 * would be Ok since we are single threaded. However, we could not do cracking
 * for a read operation even against a mutable HTree since we allow concurrent
 * read operations as long as there is no writer.
 */
class BucketPage extends AbstractPage implements ILeafData, IRawRecordAccess {

	/**
	 * The data record. {@link MutableBucketData} is used for all mutation
	 * operations. {@link ReadOnlyLeafData} is used when the {@link BucketPage}
	 * is made persistent. A read-only data record is automatically converted
	 * into a {@link MutableBucketData} record when a mutation operation is
	 * requested.
	 * <p>
	 * Note: This is package private in order to expose it to {@link HTree}.
	 * <p>
	 * Note: The copy constructor sets this reference to <code>null</code> on
	 * the source page once the record has been handed to the new page.
	 */
	ILeafData data;
	
	/**
	 * Delegates to the backing {@link #data} record.
	 * 
	 * @return The buffer returned by {@link ILeafData#data()} for the backing
	 *         data record.
	 */
	@Override
	public AbstractFixedByteArrayBuffer data() {
		return data.data();
	}

   /**
    * Delegates to the backing {@link #data} record.
    * 
    * @param index
    *            The index of the tuple on the page.
    * 
    * @return The delete marker reported by the data record for that tuple.
    */
   @Override
	public boolean getDeleteMarker(final int index) {
		return data.getDeleteMarker(index);
	}

   /**
    * Delegates to the backing {@link #data} record.
    * 
    * @return The key count reported by the data record.
    */
   @Override
	public int getKeyCount() {
		return data.getKeyCount();
	}

   /**
    * Delegates to the backing {@link #data} record.
    * 
    * @return The keys of the data record.
    */
   @Override
	public IRaba getKeys() {
		return data.getKeys();
	}

   /**
    * Delegates to the backing {@link #data} record.
    * 
    * @return The maximum version timestamp reported by the data record.
    */
   @Override
	public long getMaximumVersionTimestamp() {
		return data.getMaximumVersionTimestamp();
	}

   /**
    * Delegates to the backing {@link #data} record.
    * 
    * @return The minimum version timestamp reported by the data record.
    */
   @Override
	public long getMinimumVersionTimestamp() {
		return data.getMinimumVersionTimestamp();
	}

   /**
    * Delegates to the backing {@link #data} record.
    * 
    * @return The value of {@link ILeafData#getNextAddr()} for the data
    *         record.
    */
   @Override
	public long getNextAddr() {
		return data.getNextAddr();
	}

   /**
    * Delegates to the backing {@link #data} record.
    * 
    * @return The value of {@link ILeafData#getPriorAddr()} for the data
    *         record.
    */
   @Override
	public long getPriorAddr() {
		return data.getPriorAddr();
	}

   /**
    * Delegates to the backing {@link #data} record.
    * 
    * @param index
    *            The index of the tuple on the page.
    * 
    * @return The raw record address reported by the data record for that
    *         tuple. Callers in this class compare the result against
    *         {@link IRawStore#NULL} to decide whether the value is stored as
    *         a raw record.
    */
   @Override
	public long getRawRecord(int index) {
		return data.getRawRecord(index);
	}

	// public int getSpannedTupleCount() {
	// return data.getSpannedTupleCount();
	// }

   /**
    * Delegates to the backing {@link #data} record.
    * 
    * @return The value count reported by the data record.
    */
   @Override
	public int getValueCount() {
		return data.getValueCount();
	}

   /**
    * Delegates to the backing {@link #data} record.
    * 
    * @return The values of the data record.
    */
   @Override
	public IRaba getValues() {
		return data.getValues();
	}

   /**
    * Delegates to the backing {@link #data} record.
    * 
    * @param index
    *            The index of the tuple on the page.
    * 
    * @return The version timestamp reported by the data record for that
    *         tuple.
    */
   @Override
	public long getVersionTimestamp(final int index) {
		return data.getVersionTimestamp(index);
	}

   /**
    * Delegates to the backing {@link #data} record.
    * 
    * @return The result of {@link ILeafData#hasDeleteMarkers()} for the data
    *         record.
    */
   @Override
	public boolean hasDeleteMarkers() {
		return data.hasDeleteMarkers();
	}

   /**
    * Delegates to the backing {@link #data} record.
    * 
    * @return The result of {@link ILeafData#hasRawRecords()} for the data
    *         record.
    */
   @Override
	public boolean hasRawRecords() {
		return data.hasRawRecords();
	}

   /**
    * Delegates to the backing {@link #data} record.
    * 
    * @return The result of {@link ILeafData#hasVersionTimestamps()} for the
    *         data record.
    */
   @Override
	public boolean hasVersionTimestamps() {
		return data.hasVersionTimestamps();
	}

   /**
    * Delegates to the backing {@link #data} record.
    * 
    * @return The result of <code>isCoded()</code> for the data record.
    */
   @Override
	public boolean isCoded() {
		return data.isCoded();
	}

   /**
    * Delegates to the backing {@link #data} record.
    * 
    * @return The result of {@link ILeafData#isDoubleLinked()} for the data
    *         record.
    */
   @Override
	public boolean isDoubleLinked() {
		return data.isDoubleLinked();
	}

   /**
    * Delegates to the backing {@link #data} record.
    * 
    * @return The result of <code>isLeaf()</code> for the data record.
    */
   @Override
	public boolean isLeaf() {
		return data.isLeaf();
	}

   /**
    * Delegates to the backing {@link #data} record.
    * 
    * @return The result of <code>isReadOnly()</code> for the data record.
    */
   @Override
	public boolean isReadOnly() {
		return data.isReadOnly();
	}

	/**
	 * Create a new empty bucket. The page is created dirty and is given a
	 * fresh {@link MutableBucketData} record configured from the owning
	 * {@link HTree}.
	 * 
	 * @param htree
	 *            A reference to the owning {@link HTree}.
	 * @param globalDepth
	 *            The size of the address space (in bits) for each buddy hash
	 *            table on a directory page. The global depth of a node is
	 *            defined recursively as the local depth of that node within its
	 *            parent. The global/local depth are not stored explicitly.
	 *            Instead, the local depth is computed dynamically when the
	 *            child is materialized by counting the #of pointers to the
	 *            child in the appropriate buddy hash table in the parent.
	 *            This local depth value is passed into the constructor when the
	 *            child is materialized and set as the global depth of the
	 *            child.
	 */
	BucketPage(final HTree htree, final int globalDepth) {

		super(htree, true/* dirty */, globalDepth);
		
		data = new MutableBucketData(//
				htree.bucketSlots, // fan-out
				htree.versionTimestamps,//
				htree.deleteMarkers,//
				htree.rawRecords//
		);

	}

	/**
	 * Deserialization constructor - {@link #globalDepth} MUST be set by the
	 * caller. The page is created clean, with a global depth of zero as a
	 * placeholder, and its identity is set from <i>addr</i>.
	 * 
	 * @param htree
	 *            A reference to the owning {@link HTree}.
	 * @param addr
	 *            The address passed to <code>setIdentity(long)</code>.
	 * @param data
	 *            The data record for the page (stored as-is, not copied).
	 */
	BucketPage(final HTree htree, final long addr, final ILeafData data) {

		super(htree, false/* dirty */, 0/*unknownGlobalDepth*/);

		this.data = data;
		
        setIdentity(addr);

	}

    /**
     * Copy constructor. The data record of the source page is handed over to
     * the new page and the reference on the source page is cleared.
     * 
     * @param src
     *            The source page. It must be clean and read-only; this is
     *            checked by assertions only.
     * 
     * @see AbstractPage#copyOnWrite()
     */
    protected BucketPage(final BucketPage src) {

        super(src);

        assert !src.isDirty();
        assert src.isReadOnly();

        if (src.isReadOnly()) {

            // Read-only record: clone it into a mutable record.
            this.data = new MutableBucketData(src.slotsOnPage(), src.data);

        } else {

            // Not read-only (only reachable when assertions are disabled):
            // steal the reference.
            this.data = src.data;

        }

        // The source page no longer owns a data record.
        src.data = null;

    }

    /**
	 * Return <code>true</code> if there is at least one tuple on this page for
	 * the specified key.
	 * 
	 * @param key
	 *            The key.
	 * @param buddyOffset
	 *            The offset within the {@link BucketPage} of the buddy hash
	 *            bucket to be searched. This argument is ignored: the keys of
	 *            the whole page are searched.
	 * 
	 * @return <code>true</code> if the key search reports a match.
	 * 
	 * @throws IllegalArgumentException
	 *             if the key is <code>null</code>.
	 */
	boolean contains(final byte[] key, final int buddyOffset) {

		if (key == null) {
			throw new IllegalArgumentException();
		}

		// A non-negative search result is treated as a hit.
		return getKeys().search(key) >= 0;

	}

	/**
	 * There is no reason why the number of slots in a BucketPage should be the
	 * same as the number in a DirectoryPage. The value is therefore taken from
	 * <code>htree.bucketSlots</code> rather than being derived from the
	 * address bits.
	 * 
	 * @return number of slots available in this BucketPage
	 */
	final int slotsOnPage() {
		return htree.bucketSlots;
		// return 1 << htree.addressBits;
	}

	/**
	 * Return the value of the tuple which {@link #lookupIndex(byte[])} reports
	 * for the specified key.
	 * 
	 * @param key
	 *            The key.
	 * @param buddyOffset
	 *            The offset within the {@link BucketPage} of the buddy hash
	 *            bucket to be searched (not used by this implementation).
	 * 
	 * @return The value associated with the tuple found for the key and
	 *         <code>null</code> if no such tuple was found. When the tuple is
	 *         stored as a raw record, the record is read from the backing
	 *         store and returned as a byte[]. Note that the return value is
	 *         not diagnostic if the application allows <code>null</code>
	 *         values into the index.
	 * 
	 * @throws IllegalArgumentException
	 *             if the key is <code>null</code>.
	 */
	final byte[] lookupFirst(final byte[] key, final int buddyOffset) {

		final int slot = lookupIndex(key);

		if (slot == -1) {
			// No tuple for that key.
			return null;
		}

		if (hasRawRecords()) {

			final long rawAddr = getRawRecord(slot);

			if (rawAddr != IRawStore.NULL) {
				// The value lives on the backing store: materialize it.
				return getBytes(readRawRecord(rawAddr));
			}

		}

		// The value is inline within the page.
		return getValues().get(slot);

	}
	
	/**
	 * Return the bytes between the position and the limit of the buffer as a
	 * byte[]. The position, mark and limit of the caller's buffer are not
	 * modified.
	 * <p>
	 * When the buffer exposes its <em>entire</em> backing array (array offset
	 * and position are zero, the limit equals the capacity, and the capacity
	 * equals the array length) the backing array itself is returned without
	 * copying. Otherwise the bytes are copied into a new array.
	 * 
	 * @param buf
	 *            The buffer.
	 * 
	 * @return a byte array representing the data view of the ByteBuffer
	 */
	final byte[] getBytes(final ByteBuffer buf) {

		if (buf.hasArray() && buf.arrayOffset() == 0 && buf.position() == 0
				&& buf.limit() == buf.capacity()) {

			final byte[] backing = buf.array();

			/*
			 * A slice taken at offset zero of a larger array also has
			 * arrayOffset == 0 and limit == capacity, but its backing array
			 * is longer than the view. Only hand out the backing array when
			 * it is exactly the view.
			 */
			if (backing.length == buf.capacity()) {

				return backing;

			}

		}

		/*
		 * Copy the expected data into a byte[] using a read-only view on the
		 * buffer so that we do not mess with its position, mark, or limit.
		 */
		final ByteBuffer view = buf.asReadOnlyBuffer();

		final byte[] a = new byte[view.remaining()];

		view.get(a);

		return a;

	}

	/**
	 * Search the keys of this page for the specified key.
	 * 
	 * @param key
	 *            The key.
	 * 
	 * @return The (non-negative) index reported by the key search -or-
	 *         <code>-1</code> if the search reported a negative result.
	 * 
	 * @throws IllegalArgumentException
	 *             if the key is <code>null</code>.
	 */
	final int lookupIndex(final byte[] key) {

		if (key == null) {
			throw new IllegalArgumentException();
		}

		final int pos = getKeys().search(key);

		// Collapse every negative search result into -1.
		return pos >= 0 ? pos : -1;

	}

	/**
	 * Return an iterator which will visit each tuple on this page for the
	 * specified key. The iterator is a new {@link BuddyBucketTupleIterator}
	 * constructed over this page.
	 * <p>
	 * Note: An earlier signature also accepted the offset of the buddy hash
	 * bucket to be searched; that argument is no longer part of the method.
	 * 
	 * @param key
	 *            The key.
	 * 
	 * @return An iterator which will visit each tuple on this page for the
	 *         specified key and never <code>null</code>.
	 * 
	 *         TODO Specify the contract for concurrent modification both here
	 *         and on the {@link HTree#lookupAll(byte[])} methods.
	 */
	final ITupleIterator lookupAll(final byte[] key) {

		return new BuddyBucketTupleIterator(key, this);

	}

	/**
	 * Insert the tuple into this bucket page.
	 * <p>
	 * There are three outcomes:
	 * <ol>
	 * <li>The page has a free slot. The value is converted to a raw record if
	 * {@link #checkRawRecord(byte[])} requires it, the tuple is stored, and
	 * the entry count of the {@link HTree} is incremented.</li>
	 * <li>The page is full and holds at least one key which differs from
	 * <i>key</i>. Nothing is modified and <code>false</code> is returned so
	 * the caller can split the page.</li>
	 * <li>The page is full and every key on the page equals <i>key</i>. A new
	 * {@link BucketPage} is allocated. If the parent is already an overflow
	 * directory the new page is added to it. Otherwise this page and the new
	 * page are placed beneath a new overflow {@link DirectoryPage} which
	 * replaces this page in the parent. The tuple is then inserted into the
	 * new page.</li>
	 * </ol>
	 * Note: The raw record conversion is deliberately performed only on the
	 * path which stores the tuple on this page. On the other two paths the
	 * insert is retried (by the caller or against the new page) with the
	 * original value, so converting up front would write a raw record to the
	 * backing store which nothing references.
	 * 
	 * @param key
	 *            The key (all bits, all bytes).
	 * @param val
	 *            The value (optional).
	 * 
	 * @return <code>false</code> iff the page is full and must be split.
	 * 
	 * @throws IllegalArgumentException
	 *             if <i>key</i> is <code>null</code>.
	 * @throws IllegalArgumentException
	 *             if the parent reference is <code>null</code>.
	 * @throws AssertionError
	 *             if an overflow directory must be introduced but the level of
	 *             the parent already consumes more bits than the key has.
	 */
	boolean insert(final byte[] key, final byte[] val) {

		if (key == null)
			throw new IllegalArgumentException();

		if (parent == null)
			throw new IllegalArgumentException();

		// #of slots on the page.
		final int slotsOnPage = slotsOnPage();

		/*
		 * Note: This is one of the few gateways for mutation of a leaf via the
		 * main btree API (insert, lookup, delete). By ensuring that we have a
		 * mutable leaf here, we can assert that the leaf must be mutable in
		 * other methods.
		 */
		final BucketPage copy = (BucketPage) copyOnWrite();

		// assert copy.dirtyHierarchy();

		if (copy != this) {

			/*
			 * This leaf has been copied so delegate the operation to the new
			 * leaf.
			 * 
			 * Note: copy-on-write deletes [this] leaf and delete() notifies any
			 * leaf listeners before it clears the [leafListeners] reference so
			 * not only don't we have to do that here, but we can't since the
			 * listeners would be cleared before we could fire off the event
			 * ourselves.
			 */

			return copy.insert(key, val);

		}

		final MutableKeyBuffer keys = (MutableKeyBuffer) getKeys();

		if (keys.nkeys < keys.capacity()) {

			/*
			 * There is room on the page. Convert to a raw record if necessary.
			 * This is done here, rather than before the capacity test, so a
			 * raw record is only written when the tuple is actually stored.
			 */
			final byte[] ival = checkRawRecord(val);

			int insIndex;
			// Check if an overflow BucketPage (by checking parent)
			// and if so, then just append to end
			if (this.getParentDirectory().isOverflowDirectory()) {
				insIndex = keys.nkeys;
			} else {
				insIndex = keys.search(key);
			}
			if (insIndex < 0) {
				// Not found: decode the insertion point.
				insIndex = -insIndex - 1;
			} else if (TRACE) {
				log.trace("Insert duplicate key");
			}

			// [ival != val] is an identity test: it is true iff the value was
			// replaced by an encoded raw record address.
			((MutableBucketData) data).insert(insIndex, key, ival, ival != val);

			((HTree) htree).nentries++;

			return true;
		}

		/*
		 * The page is full. Now we have to figure out whether or not all keys
		 * are duplicates.
		 */
		boolean identicalKeys = true;
		for (int i = 0; i < slotsOnPage; i++) {
			if (!BytesUtil.bytesEqual(key, keys.get(i))) {
				identicalKeys = false;
				break;
			}
		}
		if (!identicalKeys) {
			/*
			 * Force a split since it is possible to redistribute some tuples.
			 */
			return false;
		}

		/*
		 * Rather than overflow a BucketPage by some chaining structure it
		 * turns out to be a lot simpler to introduce a new DirectoryPage
		 * for this bucketPage since the serialization and dirty protocols
		 * need not change at all.
		 * 
		 * In any event: create new bucket page and insert key/value.
		 */
		final BucketPage newPage = new BucketPage((HTree) htree, globalDepth);
		((HTree) htree).nleaves++;

		final DirectoryPage pd = getParentDirectory();
		if (pd.isOverflowDirectory()) { // already handles blobs
			assert globalDepth == htree.addressBits;
			pd._addChild(newPage); // may result in extra level insertion
		} else {
			if (pd.getLevel() * htree.addressBits > key.length * 8)
				throw new AssertionError();

			// Must ensure that there is only a single reference to this BucketPage
			// and that the "active" page is for the overflow key
			pd._ensureUniqueBucketPage(key, this.self);
			globalDepth = htree.addressBits;
			newPage.globalDepth = htree.addressBits;

			final DirectoryPage blob = new DirectoryPage((HTree) htree,
					key,// overflowKey
					pd.getOverflowPageDepth());
			// now add in blob
			pd.replaceChildRef(this.self, blob);

			blob._addChild(this);
			blob._addChild(newPage); // Directories MUST have at least 2 slots!

		}

		// Note: pass the original value; the new page does its own raw record
		// conversion.
		newPage.insert(key, val);

		// assert (1 << (htree.addressBits - this.globalDepth)) == getParentDirectory().countChildRefs(this);
		// assert (1 << (htree.addressBits - newPage.globalDepth)) == getParentDirectory().countChildRefs(newPage);

		assert dirtyHierarchy();

		return true;
	}

	/**
	 * Decide whether the supplied value must be stored as a raw record and, if
	 * so, write it on the backing store.
	 * <p>
	 * The value is converted only when this page has raw records enabled, the
	 * value is non-null, and its length exceeds
	 * <code>htree.getMaxRecLen()</code>.
	 * 
	 * @param val
	 *            The value to be checked.
	 * 
	 * @return The byte[] to be stored in the page: either <i>val</i> itself
	 *         (same reference) or the encoded address of the raw record which
	 *         was written.
	 */
	private byte[] checkRawRecord(final byte[] val) {

		if (!hasRawRecords() || val == null
				|| val.length <= htree.getMaxRecLen()) {

			// Stored inline, unchanged.
			return val;

		}

		// write the value on the backing store.
		final long recordAddr = htree.writeRawRecord(val);

		// convert the address to byte[].
		return ((HTree) htree).encodeRecordAddr(recordAddr);

	}
	
	/**
	 * Copy a tuple from an existing bucket page into the first empty (null
	 * key) slot of this bucket page. According to the original notes this is
	 * used when addLevel() is invoked and when a bucket page is split. This
	 * method is similar to {@link #insert(byte[], byte[])}. The critical
	 * differences are: (a) raw records are not materialized during the copy
	 * (the stored value bytes and the raw record flag are carried over);
	 * (b) the delete marker and version timestamp of the source tuple are
	 * copied; and (c) the #of tuples in the index is unchanged.
	 * <p>
	 * NOTE(review): {@link #insert(byte[], byte[])} locates its slot with a
	 * key search whereas this method uses the first null slot. Confirm that
	 * callers do not depend on the key order of the page after a raw insert.
	 * 
	 * @param srcPage
	 *            The source {@link BucketPage}.
	 * @param srcSlot
	 *            The slot in that {@link BucketPage} having the tuple to be
	 *            copied.
	 * @param key
	 *            The key (already materialized).
	 * 
	 * @return <code>true</code> if the tuple was copied into this page;
	 *         <code>false</code> iff the page is full and holds at least one
	 *         key which differs from <i>key</i> (the page must be split).
	 * 
	 * @throws IllegalArgumentException
	 *             if <i>key</i> is <code>null</code>.
	 * @throws AssertionError
	 *             if the page is full and every key on the page equals
	 *             <i>key</i>.
	 */
	boolean insertRawTuple(final BucketPage srcPage, final int srcSlot,
			final byte[] key) {//, final int buddyOffset) {
		
		if (key == null)
			throw new IllegalArgumentException();

        /*
         * Note: This is one of the few gateways for mutation of a leaf via the
         * main btree API (insert, lookup, delete). By ensuring that we have a
         * mutable leaf here, we can assert that the leaf must be mutable in
         * other methods.
         */
        final BucketPage copy = (BucketPage) copyOnWrite();

        if (copy != this) {

            /*
             * This leaf has been copied so delegate the operation to the new
             * leaf.
             * 
             * Note: copy-on-write deletes [this] leaf and delete() notifies any
             * leaf listeners before it clears the [leafListeners] reference so
             * not only don't we have to do that here, but we can't since the
             * listeners would be cleared before we could fire off the event
             * ourselves.
             */

            return copy.insertRawTuple(srcPage, srcSlot, key);//, buddyOffset);

        }
        
        // just fit somewhere in page: take the first slot whose key is null.
		final int slotsOnPage = slotsOnPage();
        final MutableKeyBuffer keys = (MutableKeyBuffer) getKeys();
        final MutableValueBuffer vals = (MutableValueBuffer) getValues();
        for (int i = 0; i < slotsOnPage; i++) {
            if (keys.isNull(i)) {
                // The key and value buffers are updated directly, including
                // their entry counters.
                keys.nkeys++;
                keys.keys[i] = key;
                vals.nvalues++;
                
                // Note: DOES NOT Materialize a raw record!!!!
                vals.values[i] = srcPage.getValues().get(srcSlot);
                
                // deleteMarkers
                if (srcPage.hasDeleteMarkers()) {
                	((MutableBucketData) data).deleteMarkers[i] = srcPage.getDeleteMarker(srcSlot);
                }
                
                // version timestamp
                if (srcPage.hasVersionTimestamps()) {
                	((MutableBucketData) data).versionTimestamps[i] = srcPage.getVersionTimestamp(srcSlot);
                }
                
                // copy raw record info: flag the slot when the source tuple
                // has a raw record address.
                if (srcPage.hasRawRecords() && srcPage.getRawRecord(srcSlot) != IRawStore.NULL) {
                	((MutableBucketData) data).rawRecords[i] = true;
                }
                
                // do not increment on raw insert, since this is only ever
                // (for now) a re-organisation

                // insert Ok.
                return true;
            }

        } // next slot on page

        /*
         * The page is full. Now we have to figure out whether or not all keys
         * are duplicates.
         */
		boolean identicalKeys = true;
		for (int i = 0; i < slotsOnPage; i++) {
			if (!BytesUtil.bytesEqual(key, keys.get(i))) {
				identicalKeys = false;
				break;
			}
		}
		if (!identicalKeys) {
			/*
			 * Force a split since it is possible to redistribute some tuples.
			 */
			return false;
		}

        /*
         * Note: insertRawTuple() is invoked when we split a bucket page.
         * Therefore, it is not possible that a bucket page to which we must
         * redistribute a tuple could be full. If it were full, then we would
         * not split it. If we split it, then we can not wind up with more
         * tuples in the target bucket page than were present in the original
         * bucket page.
         */

		throw new AssertionError();

	}

	/**
	 * Return an iterator visiting the tuples on this {@link BucketPage} whose
	 * key slot is not null. The iterator is created with the
	 * {@link IRangeQuery#DEFAULT} flags.
	 * <p>
	 * NOTE(review): The original documentation states that deleted tuples are
	 * not visited. The iterator itself only skips null key slots, so any
	 * filtering of deleted tuples would have to come from elsewhere - confirm.
	 */
	ITupleIterator tuples() {

		return new InnerBucketPageTupleIterator(IRangeQuery.DEFAULT);

	}

	/**
	 * Visits the tuples of the enclosing {@link BucketPage} whose key slot is
	 * not null, in slot order. The same {@link Tuple} buffer is returned by
	 * every call to {@link #next()}; it is overwritten on each call.
	 */
	private class InnerBucketPageTupleIterator implements ITupleIterator {

		private final int slotsPerPage = slotsOnPage();

		/** The index of the slot which the next call to next() will visit. */
		private int nextNonEmptySlot = 0;

		/** The shared tuple buffer (null when there is nothing to visit). */
		private final Tuple tuple;

		/**
		 * @param flags
		 *            The flags passed to the {@link Tuple} buffer.
		 */
		InnerBucketPageTupleIterator(final int flags) {

			// Position on the first non-null slot. The buffer is only
			// allocated when at least one tuple will be visited.
			this.tuple = findNextSlot() ? new Tuple(htree, flags) : null;

		}

		/**
		 * Advance {@link #nextNonEmptySlot} (starting from its current value)
		 * until it addresses a slot whose key is not null.
		 * 
		 * @return <code>true</code> iff such a slot was found. Otherwise the
		 *         cursor is left at the size of the keys.
		 */
		private boolean findNextSlot() {

			final IRaba pageKeys = getKeys();

			final int limit = pageKeys.size();

			while (nextNonEmptySlot < limit) {

				if (!pageKeys.isNull(nextNonEmptySlot)) {
					return true;
				}

				nextNonEmptySlot++;

			}

			// The page is exhausted.
			return false;

		}

		public boolean hasNext() {

			return nextNonEmptySlot < getKeys().size();

		}

		public ITuple next() {

			if (!hasNext()) {
				throw new NoSuchElementException();
			}

			// Copy the data for the current tuple into the Tuple buffer.
			tuple.copy(nextNonEmptySlot, BucketPage.this);

			// Step over the current tuple and pre-position on the next
			// non-null slot (if any) so that hasNext() is a simple test.
			nextNonEmptySlot++;
			findNextSlot();

			return tuple;

		}

		/**
		 * Not supported.
		 * 
		 * @throws UnsupportedOperationException
		 *             always.
		 */
		public void remove() {

			throw new UnsupportedOperationException();

		}

	}

	/**
	 * Visits this leaf, unless the leaf is excluded by one of the flags, in
	 * which case the returned iterator does not visit anything. The leaf is
	 * excluded when <i>dirtyNodesOnly</i> is <code>true</code> and the leaf is
	 * not dirty, or when <i>nodesOnly</i> is <code>true</code>.
	 * 
	 * {@inheritDoc}
	 */
	@Override
	@SuppressWarnings("unchecked")
	public Iterator postOrderNodeIterator(
			final boolean dirtyNodesOnly, final boolean nodesOnly) {

		final boolean excluded = (dirtyNodesOnly && !isDirty()) || nodesOnly;

		if (excluded) {

			return EmptyIterator.DEFAULT;

		}

		return new SingleValueIterator(this);

	}

	/**
	 * Pretty print this page onto the buffer: the page identifier, the global
	 * depth, an indent for the level of the page, and then a parenthesized,
	 * comma separated list with one entry per slot on the page. A continuation
	 * line is started after every 16 slots. The output is terminated by a
	 * newline.
	 * 
	 * @param sb
	 *            The buffer onto which the page is written.
	 * @param showBinary
	 *            Passed through to the per-slot rendering.
	 */
	@Override
	public void PP(final StringBuilder sb, final boolean showBinary) {

		sb.append(PPID() + " [" + globalDepth + "] " + indent(getLevel()));

		sb.append("("); // start of address map.

		// The page is rendered as a single run of slots.
		final int nslots = slotsOnPage();

		for (int slot = 0; slot < nslots; slot++) {

			if (slot > 0) {

				sb.append(","); // slot boundary marker.

				if (slot % 16 == 0) {
					// continuation line.
					sb.append("\n----------" + indent(getLevel()));
				}

			}

			sb.append(PPVAL(slot, showBinary));

		}

		sb.append(")"); // end of tuples

		sb.append("\n");

	}

	/**
	 * Pretty print the tuple at the specified slot on the page. Only the key
	 * is rendered since the rendering of values is switched off inside the
	 * method.
	 * 
	 * @param index
	 *            The slot on the page.
	 * @param showBinary
	 *            When <code>true</code> the bit string of the key is appended
	 *            (in parentheses) to the key.
	 * 
	 * @return The pretty print representation of the tuple at that slot -or-
	 *         <code>"-"</code> if the slot lies at or beyond the size of the
	 *         keys.
	 */
	private String PPVAL(final int index, final boolean showBinary) {

		final IRaba pageKeys = getKeys();

		if (index >= pageKeys.size()) {
			return "-";
		}

		if (index > pageKeys.capacity()) {
			throw new RuntimeException("index=" + index + ", keys.size="
					+ pageKeys.size() + ", keys.capacity="
					+ pageKeys.capacity());
		}

		final byte[] key = pageKeys.get(index);

		String keyStr = BytesUtil.toString(key);

		if (showBinary) {
			keyStr = keyStr + "(" + BytesUtil.toBitString(key) + ")";
		}

		// Developer switch: values are not rendered.
		final boolean showValues = false;

		if (!showValues) {
			return keyStr;
		}

		final long addr = hasRawRecords() ? getRawRecord(index)
				: IRawStore.NULL;

		final String valStr;

		if (addr != IRawStore.NULL) {
			// A raw record
			valStr = "@" + htree.getStore().toString(addr);
		} else {
			valStr = BytesUtil.toString(getValues().get(index));
		}

		return valStr == null ? keyStr : keyStr + "=>" + valStr;

	}

	/**
	 * Human readable representation of the {@link ILeafData} plus transient
	 * information associated with the {@link BucketPage}. When the data
	 * record is <code>null</code> only the transient information is reported.
	 */
	@Override
	public String toString() {

		final DirectoryPage p = (parent == null ? null : parent.get());

		final StringBuilder sb = new StringBuilder(super.toString());

		sb.append("{ isDirty=").append(isDirty());
		sb.append(", isDeleted=").append(isDeleted());
		sb.append(", addr=").append(identity);
		sb.append(", parent=").append(p == null ? "N/A" : p.toShortString());
		sb.append(", globalDepth=").append(getGlobalDepth());
		sb.append(", nbuddies=").append(
				(1 << htree.addressBits) / (1 << globalDepth));
		sb.append(", slotsPerBuddy=").append(1 << globalDepth);

		if (data == null) {

			// No data record? (Generally, this means it was stolen by copy on
			// write).
			return sb.append(", data=NA}").toString();

		}

		sb.append(", nkeys=").append(getKeyCount());

		DefaultLeafCoder.toString(this, sb);

		return sb.append("}").toString();

	}

   /**
    * Check this page for inconsistencies and optionally print it.
    * <p>
    * Two conditions are checked: (a) the parent reference is set and can be
    * resolved; and (b) the global depth of this page does not exceed the
    * global depth of its parent. The second check is only made when the
    * parent could be resolved, so a missing parent is reported as an error
    * instead of failing with a {@link NullPointerException}.
    * <p>
    * The page itself is printed when the level is DEBUG or finer, or when an
    * inconsistency was detected. The <i>recursive</i> and <i>materialize</i>
    * arguments are not used by this implementation.
    * 
    * @param level
    *            The logging level controlling the verbosity.
    * @param out
    *            Where to write the output.
    * @param height
    *            Used to indent the output.
    * @param recursive
    *            Not used.
    * @param materialize
    *            Not used.
    * 
    * @return <code>true</code> unless an inconsistency was detected.
    */
   @Override
	protected boolean dump(final Level level, final PrintStream out,
			final int height, final boolean recursive, final boolean materialize) {

		final boolean debug = level.toInt() <= Level.DEBUG.toInt();

		// Set to false iff an inconsistency is detected.
		boolean ok = true;

		// Resolve the parent once; it is null if the reference is not set or
		// has been cleared.
		final DirectoryPage p = (parent == null ? null : parent.get());

		if (p == null) {

			out.println(indent(height) + "ERROR: parent not set");
			ok = false;

		} else if (globalDepth > p.globalDepth) {

			out.println(indent(height)
					+ "ERROR: localDepth exceeds globalDepth of parent");
			ok = false;

		}

		/*
		 * TODO Count the #of pointers in each buddy hash table of the parent
		 * to each buddy bucket in this bucket page and verify that the
		 * globalDepth on the child is consistent with the pointers in the
		 * parent.
		 * 
		 * TODO The same check must be performed for the directory page to
		 * cross validate the parent child linking pattern with the transient
		 * cached globalDepth fields.
		 */

		if (debug || !ok) {

			out.println(indent(height) + toString());

		}

		return ok;

	}

   /**
    * Report this page to the statistics collector, but only when leaves are
    * to be visited.
    * 
    * @param recursive
    *            Not used by this implementation.
    * @param visitLeaves
    *            When <code>false</code> this method does nothing.
    * @param stats
    *            The collector which is notified of this page.
    */
   @Override
   public void dumpPages(final boolean recursive, final boolean visitLeaves,
         final HTreePageStats stats) {

      if (visitLeaves) {

         stats.visit(htree, this);

      }

   }
	
	/**
	 * From the current bit resolution, determines how many extra bits are
	 * required to ensure the current set of bucket values can be split.
	 * <p>
	 * The additional complexity of determining whether the page can really be
	 * split is left to the parent. A new directory, covering the required
	 * prefixBits would initially be created with depth 1. But if the specified
	 * bit is discriminated within buddy buckets AND other bits do not further
	 * separate the buckets then the depth of the directory will need to be
	 * increased before the bucket page can be split.
	 * <p>
	 * The scan starts one bit past the prefix length of this page and stops
	 * at the first bit position where some key at index 1 or above differs
	 * from the key at index 0. A <code>null</code> key at index 1 or above is
	 * treated as having a clear bit.
	 * 
	 * @return bit depth increase from current offset required -or-
	 *         <code>-1</code> if it is not possible to split the page no matter
	 *         how many bits we have.
	 */
	int distinctBitsRequired() {
		final int currentResolution = getPrefixLength(); // start offset of this
															// page
		// first bit position to be tested.
		int testPrefix = currentResolution + 1;

		final IRaba keys = data.getKeys();
		final int nkeys = keys.size();

		// The length in bytes of the longest key among keys[1..nkeys-1].
		// NOTE(review): the key at index 0 is not considered here - confirm
		// that this is intended (e.g., because all keys have the same length).
		int maxPrefix = 0;
		for (int t = 1; t < nkeys; t++) {
			final byte[] k = keys.get(t);
			final int klen = k == null ? 0 : k.length;
			maxPrefix = maxPrefix > klen ? maxPrefix : klen;
		}
		maxPrefix *= 8; // convert max bytes to max bits

		assert nkeys > 1;

		while (testPrefix < maxPrefix) {
			// The reference bit is taken from the key at index 0.
			// NOTE(review): unlike the other keys, keys.get(0) is not guarded
			// against null - presumably slot 0 is always populated; verify.
			final boolean bitset = BytesUtil.getBit(keys.get(0), testPrefix);
			for (int t = 1; t < nkeys; t++) {
				final byte[] k = keys.get(t);
				if (bitset != (k == null ? false : BytesUtil.getBit(
						keys.get(t), testPrefix))) {
					// This bit distinguishes at least two keys.
					return testPrefix - currentResolution;
				}
			}
			testPrefix++;
		}

		// No tested bit position distinguishes the keys.
		return -1;
	}

//	/**
//	 * To insert in a BucketPage must handle split
//	 * 
//	 * @see com.bigdata.htree.AbstractPage#insertRawTuple(byte[], byte[], int)
//	 */
//	void insertRawTuple(final byte[] key, final byte[] val, final int buddy) {
//		final int slotsPerBuddy = slotsOnPage(); // (1 << htree.addressBits);
//		final MutableKeyBuffer keys = (MutableKeyBuffer) getKeys();
//		final MutableValueBuffer vals = (MutableValueBuffer) getValues();
//
//		if (true) {
//			// just fit somewhere in page
//			for (int i = 0; i < slotsPerBuddy; i++) {
//				if (keys.isNull(i)) {
//					keys.nkeys++;
//					keys.keys[i] = key;
//					vals.nvalues++;
//					vals.values[i] = val;
//					// TODO deleteMarker:=false
//					// TODO versionTimestamp:=...
//					// do not increment on raw insert, since this is only ever
//					// (for now) a re-organisation
//					// ((HTree)htree).nentries++;
//					// insert Ok.
//					return;
//				}
//			}
//		} else { // if mapping buddy explicitly
//			final int buddyStart = buddy * slotsPerBuddy;
//			final int lastSlot = buddyStart + slotsPerBuddy;
//
//			for (int i = buddyStart; i < lastSlot; i++) {
//				if (keys.isNull(i)) {
//					keys.nkeys++;
//					keys.keys[i] = key;
//					vals.nvalues++;
//					setValue(i, val);
//					// TODO deleteMarker:=false
//					// TODO versionTimestamp:=...
//					((HTree) htree).nentries++;
//					// insert Ok.
//					return;
//				}
//			}
//		}
//
//		// unable to insert
//		final DirectoryPage np;
//		if (globalDepth == htree.addressBits) {
//			// max depth so add level
//			np = ((HTree) htree).addLevel2(this);
//		} else {
//			// otherwise split page by asking parent to split and re-inserting
//			// values
//
//			np = getParentDirectory();
//			np.split(this); // will re-insert tuples from original page
//		}
//		
//		np.insertRawTuple(key, val, 0);
//	}
//
//	// setValue() is unused. only reference is in insertRawTuple (above).
//	private void setValue(final int entryIndex, final byte[] newval) {
//
//		// Tunnel through to the mutable object.
//        final MutableBucketData data = (MutableBucketData) this.data;
//		
//        /*
//		 * Update the entry on the leaf.
//		 */
//		if (hasRawRecords()) {
//
//			/*
//			 * Note: If the old value was a raw record, we need to delete
//			 * that raw record now.
//			 * 
//			 * Note: If the new value will be a raw record, we need to write
//			 * that raw record onto the store now and save its address into
//			 * the values[] raba.
//			 */
//			final long oaddr = getRawRecord(entryIndex);
//
//			if(oaddr != IRawStore.NULL) {
//				
//				htree.deleteRawRecord(oaddr);
//				
//			}
//			
//			final long maxRecLen = htree.getMaxRecLen();
//			
//			if (newval != null && newval.length > maxRecLen) {
//
//				// write the value on the backing store.
//				final long naddr = htree.writeRawRecord(newval);
//
//				// save its address in the values raba.
//				data.vals.values[entryIndex] = ((HTree) htree)
//						.encodeRecordAddr(naddr);
//				
//				// flag as a raw record.
//				data.rawRecords[entryIndex] = true;
//
//			} else {
//				
//				data.vals.values[entryIndex] = newval;
//			
//				data.rawRecords[entryIndex] = false;
//				
//			}
//			
//		} else {
//
//			data.vals.values[entryIndex] = newval;
//			
//		}
//
//	}
	
	/**
	 * Convenience method returns the byte[] value for the tuple at the given
	 * index in this bucket page. When the tuple at that index is a raw record,
	 * the record is read from the backing store. More efficient operations
	 * should be performed when copying the value into a tuple.
	 * 
	 * @param index
	 *            The index in the leaf.
	 * 
	 * @return The data. For a raw record this is either the array backing the
	 *         buffer that was read (when the buffer exposes the whole array)
	 *         or a copy of the buffer's remaining bytes.
	 * 
	 * @see AbstractTuple#copy(int, ILeafData)
	 */
	public byte[] getValue(final int index) {

		// Only consult the raw record address when raw records are in use.
		final long addr = hasRawRecords() ? getRawRecord(index)
				: IRawStore.NULL;

		if (addr == IRawStore.NULL) {

			// The value is stored inline in the values raba.
			return getValues().get(index);

		}

		final ByteBuffer buf = htree.readRawRecord(addr);

		// true iff the buffer's view is exactly its accessible backing array.
		final boolean exposesWholeArray = buf.hasArray()
				&& buf.arrayOffset() == 0 && buf.position() == 0
				&& buf.limit() == buf.capacity();

		if (exposesWholeArray) {

			// Return the backing array.
			return buf.array();

		}

		// Copy the data into a byte[].
		final byte[] copy = new byte[buf.remaining()];

		buf.get(copy);

		return copy;

	}

    /**
     * Reads the raw record at the given address by delegating to the owning
     * htree.
     * 
     * @param addr
     *            The address of the raw record.
     * 
     * @return The buffer which the htree returned for that address.
     */
    @Override
	final public ByteBuffer readRawRecord(long addr) {

		final ByteBuffer record = htree.readRawRecord(addr);

		return record;

	}

	/*
	 * TODO When writing a method to remove a key/value, the following logic
	 * should be applied to delete the corresponding raw record on the backing
	 * store when the tuple is deleted.
	 */
//	/*
//	 * If the tuple was associated with a raw record address, then delete
//	 * the raw record from the backing store.
//	 * 
//	 * Note: The general copy-on-write contract of the B+Tree combined with
//	 * the semantics of the WORM, RW, and scale-out persistence layers will
//	 * ensure the actual delete of the raw record is deferred until the
//	 * commit point from which the tuple was deleted is no longer visible.
//	 */
//	if (data.hasRawRecords()) {
//
//		final long addr = data.getRawRecord(entryIndex);
//
//		if (addr != IRawStore.NULL) {
//
//			btree.deleteRawRecord(addr);
//
//		}
//
//	}

    /**
     * Split the bucket page into two, updating the pointers in the parent
     * accordingly.
     * <p>
     * Note: This is one of the few gateways for mutation of a BucketPage via
     * the main htree API (insert, lookup, delete). By ensuring that we have a
     * mutable directory here, we can assert that the directory must be mutable
     * in other methods.
     * <p>
     * FIXME We are doing too much work here since the BucketPage which is
     * being split does NOT need to be made mutable. However, this follows the
     * established copy-on-write pattern by starting from the leaf. Revisit and
     * optimize this once we have a stable eviction pattern for the HTree.
     */
	void split() {

        // Either [this] or the page which replaced it.
        final BucketPage mutable = (BucketPage) copyOnWrite();

        if (mutable == this) {

            // No copy was made: the parent directory does the actual work.
            getParentDirectory()._splitBucketPage(this);

        } else {

            /*
             * This leaf has been copied so the operation is delegated to the
             * new leaf.
             * 
             * Note: copy-on-write deletes [this] leaf and delete() notifies
             * any leaf listeners before it clears the [leafListeners]
             * reference. We therefore do not (and could not) fire that event
             * ourselves here.
             */
            mutable.split();

        }

	}
	
	/**
	 * Asks the parent directory to add a level for this bucket page.
	 * <p>
	 * The parent directory must be mutable first - at present this requires
	 * making the leaf mutable. If copy-on-write produced a different page then
	 * the request is forwarded to that page.
	 */
	void addLevel() {

		final BucketPage mutable = (BucketPage) copyOnWrite();

		if (mutable == this) {

			getParentDirectory()._addLevel(this);

		} else {

			mutable.addLevel();

		}

	}

	/**
	 * A bucket page is clean exactly when it is not dirty.
	 * 
	 * @return The negation of {@link #isDirty()}.
	 */
	@Override
	boolean isClean() {

		final boolean dirty = isDirty();

		return !dirty;

	}

   /**
    * Removes every tuple stored under the given key from this bucket page.
    * <p>
    * If the page is read-only, a mutable copy is obtained via copy-on-write
    * and the request is applied to that copy instead.
    * <p>
    * The loop is driven by {@link #lookupIndex(byte[])} rather than by the
    * value returned from {@link #removeFirst(byte[])}. That method returns the
    * removed value, which is <code>null</code> both when nothing matched and
    * when the removed tuple had a <code>null</code> value. Testing the
    * returned value would stop early and under-count in the latter case.
    * 
    * @param key
    *            The key whose tuples are to be removed.
    * 
    * @return The number of tuples removed.
    */
   @Override
	public int removeAll(final byte[] key) {

		if (isReadOnly()) {

			final BucketPage copy = (BucketPage) copyOnWrite(getIdentity());

			return copy.removeAll(key);

		}

		// non-optimal: each removal repeats the key lookup.
		int ret = 0;

		while (lookupIndex(key) != -1) {

			// Also deletes the associated raw record, if any.
			removeFirst(key);

			ret++;

		}

		return ret;

	}
	
	/**
	 * Removes the first tuple found for the given key and returns its value.
	 * <p>
	 * If the page is read-only, the request is forwarded to the mutable page
	 * obtained via copy-on-write. When the tuple's value is held as a raw
	 * record, the record is read and then deleted from the store before the
	 * tuple is removed from the page data.
	 * 
	 * @param key
	 *            The key.
	 * 
	 * @return The value of the removed tuple, or <code>null</code> if no tuple
	 *         was found for that key.
	 */
   @Override
	final public byte[] removeFirst(final byte[] key) {

		if (isReadOnly()) {

			return ((BucketPage) copyOnWrite(getIdentity())).removeFirst(key);

		}

		final int slot = lookupIndex(key);

		if (slot == -1) {

			// Nothing stored under that key.
			return null;

		}

		// Only consult the raw record address when raw records are in use.
		final long rawAddr;
		if (hasRawRecords()) {
			rawAddr = getRawRecord(slot);
		} else {
			rawAddr = IRawStore.NULL;
		}

		final byte[] removed;

		if (rawAddr == IRawStore.NULL) {

			// Inline value.
			removed = data.getValues().get(slot);

		} else {

			// Materialize the raw record, then release it on the store.
			removed = getBytes(htree.readRawRecord(rawAddr));

			htree.deleteRawRecord(rawAddr);

		}

		// Now remove reference from data
		((MutableBucketData) data).remove(slot);

		return removed;

	}

	/**
	 * Returns the key at index zero. Since the BucketPage orders its keys,
	 * that key will be the "lowest" in sort order.
	 * <p>
	 * The page is expected to hold at least one key (checked by an assertion
	 * only).
	 * 
	 * @return first key value
	 */
	public byte[] getFirstKey() {

		final IRaba keys = data.getKeys();

		assert keys.size() > 0;

		return keys.get(0);

	}

}