/**
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
[email protected]
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on Apr 19, 2011
*/
package com.bigdata.htree;
import java.lang.ref.Reference;
import java.lang.ref.WeakReference;
import java.lang.reflect.Constructor;
import java.util.Arrays;
import java.util.NoSuchElementException;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.locks.Lock;
import org.apache.log4j.Logger;
import com.bigdata.BigdataStatics;
import com.bigdata.btree.AbstractBTree;
import com.bigdata.btree.AbstractNode;
import com.bigdata.btree.BTree;
import com.bigdata.btree.BTreeCounters;
import com.bigdata.btree.BaseIndexStats;
import com.bigdata.btree.Checkpoint;
import com.bigdata.btree.HTreeIndexMetadata;
import com.bigdata.btree.ICounter;
import com.bigdata.btree.IDirtyListener;
import com.bigdata.btree.IIndexLocalCounter;
import com.bigdata.btree.ITuple;
import com.bigdata.btree.ITupleIterator;
import com.bigdata.btree.IndexMetadata;
import com.bigdata.btree.IndexTypeEnum;
import com.bigdata.btree.Leaf;
import com.bigdata.btree.Node;
import com.bigdata.btree.ReadOnlyCounter;
import com.bigdata.btree.keys.IKeyBuilder;
import com.bigdata.io.ByteArrayBuffer;
import com.bigdata.io.SerializerUtil;
import com.bigdata.mdi.LocalPartitionMetadata;
import com.bigdata.rawstore.IRawStore;
import com.bigdata.util.Bytes;
import com.bigdata.util.BytesUtil;
/**
* A mutable, persistence-capable, extensible hash tree.
*
* @author Bryan Thompson
*
* @see "A Robust Scheme for Multilevel Extendible Hashing" by Sven Helmer,
* Thomas Neumann, Guido Moerkotte. ISCIS 2003: 220-227.
*
* TODO It should be possible to define a native int32 hash table in
* parallel to the unsigned byte[] hash table simply by having an
* alternative descent passing an int32 key all the way down and using the
* variant of the getBits() method which operates on int32 values. We could
* also optimize the storage and retrieval of the int32 keys, perhaps with
* a custom mutable bucket page and mutable bucket data implementation for
* int32 keys. This optimization is likely to be quite worthwhile as the
* majority of use cases for the hash tree use int32 keys.
*
* TODO It is quite possible to define a range query interface for the hash
* tree. You have to use an order-preserving hash function, which is
* external to the HTree implementation. Internally, the HTree must either
* double-link the pages or crawl the directory structure.
*
* TODO The keys should be declared as a computed key based on the data
* fields in the record. The {@link HTree} supports arbitrary bit length
* keys, but can be optimized for int32 keys easily enough.
*
* TODO Instrument performance counters for structural modifications,
* insert, remove, etc. per {@link BTreeCounters}.
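*
* A minimal usage sketch (hedged: it assumes an IRawStore [store] and a
* configured HTreeIndexMetadata [md] are available; see
* {@link #create(IRawStore, HTreeIndexMetadata)} and {@link #lookupAll(int)}):
*
* final HTree htree = HTree.create(store, md);
* htree.insert("alice".hashCode(), SerializerUtil.serialize("alice"));
* final ITupleIterator itr = htree.lookupAll("alice".hashCode());
* while (itr.hasNext()) { itr.next(); } // visits every tuple for that key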
*/
public class HTree extends AbstractHTree
implements
IIndexLocalCounter
// ICheckpointProtocol // interface declaration moved to subclass
// IIndex,
// ISimpleBTree//, IAutoboxBTree, ILinearList, IBTreeStatistics, ILocalBTreeView
// IRangeQuery
{
// private static final transient Logger log = Logger.getLogger(HTree.class);
/**
* The #of bits of distinction to be made each time we split a directory
* page in the {@link HTree}. See
* {@link #splitDirectoryPage(DirectoryPage, int, AbstractPage)} for a write
* up on this.
*/
/*private*/ final int splitBits = 1; // in [1:addressBits];
/*
* metadata about the index.
*
* @todo this data should be rolled into the IndexMetadata object.
*/
final boolean versionTimestamps = false;
final boolean deleteMarkers = false;
final boolean rawRecords;
/**
* The #of {@link DirectoryPage}s in the {@link HTree}. This is ONE (1) for a
* new {@link HTree}.
*/
protected long nnodes;
/**
* The #of {@link BucketPage}s in the {@link HTree}. This is one (1) for a
* new {@link HTree} (one directory page and one bucket page).
*/
protected long nleaves;
/**
* The #of entries in the {@link HTree}. This is ZERO (0) for a new
* {@link HTree}.
*/
protected long nentries;
/**
* The value of the record version number that will be assigned to the next
* node or leaf written onto the backing store. This number is incremented
* each time a node or leaf is written onto the backing store. The initial
* value is ZERO (0). The first value assigned to a node or leaf will be
* ZERO (0).
*/
protected long recordVersion;
/**
* The mutable counter exposed by {@link #getCounter()}.
*
* Note: This is protected so that it will be visible to
* {@link Checkpoint}, which needs to write the actual value stored in this
* counter into its serialized record (without the partition identifier).
*/
protected AtomicLong counter;
/**
* A buffer used to encode a raw record address for a mutable {@link BTree}
* and otherwise null.
*/
private final ByteArrayBuffer recordAddrBuf;
@Override
final public long getNodeCount() {
return nnodes;
}
@Override
final public long getLeafCount() {
return nleaves;
}
@Override
final public long getEntryCount() {
return nentries;
}
/**
* The constructor sets this field initially based on a {@link Checkpoint}
* record containing the only address of the {@link IndexMetadata} for the
* index. Thereafter this reference is maintained as the {@link Checkpoint}
* record last written by {@link #writeCheckpoint()} or read by
* {@link #load(IRawStore, long)}.
*/
private Checkpoint checkpoint = null;
@Override
final public Checkpoint getCheckpoint() {
if (checkpoint == null)
throw new AssertionError();
return checkpoint;
}
@Override
final public long getRecordVersion() {
return recordVersion;
}
@Override
final public long getMetadataAddr() {
return metadata.getMetadataAddr();
}
@Override
final public long getRootAddr() {
return (root == null ? getCheckpoint().getRootAddr() : root
.getIdentity());
}
/**
* Return true iff changes would be lost unless the B+Tree is flushed to the
* backing store using {@link #writeCheckpoint()}.
*
* Note: In order to avoid needless checkpoints this method will return
* false if:
*
* - the metadata record is persistent -AND-
*
* - EITHER the root is null, indicating that the index is closed (in which
* case there are no buffered writes);
*
* - OR the root of the btree is NOT dirty, the persistent address of the
* root of the btree agrees with {@link Checkpoint#getRootAddr()}, and the
* {@link #counter} value agrees with {@link Checkpoint#getCounter()}.
*
* @return true iff changes would be lost unless the B+Tree was flushed to
* the backing store using {@link #writeCheckpoint()}.
*/
public boolean needsCheckpoint() {
if(!checkpoint.hasCheckpointAddr()) {
/*
* The checkpoint record needs to be written.
*/
return true;
}
if(metadata.getMetadataAddr() == 0L) {
/*
* The index metadata record was replaced and has not yet been
* written onto the store.
*/
return true;
}
if(metadata.getMetadataAddr() != checkpoint.getMetadataAddr()) {
/*
* The index metadata record was replaced and has been written on
* the store but the checkpoint record does not reflect the new
* address yet.
*/
return true;
}
if(checkpoint.getCounter() != counter.get()) {
// The counter has been modified.
return true;
}
if(root != null ) {
if (root.isDirty()) {
// The root node is dirty.
return true;
}
if(checkpoint.getRootAddr() != root.getIdentity()) {
// The root node has a different persistent identity.
return true;
}
}
/*
* No apparent change in persistent state so we do NOT need to do a
* checkpoint.
*/
return false;
// if (metadata.getMetadataAddr() != 0L && //
// (root == null || //
// ( !root.dirty //
// && checkpoint.getRootAddr() == root.identity //
// && checkpoint.getCounter() == counter.get())
// )
// ) {
//
// return false;
//
// }
//
// return true;
}
/**
* Method updates the index metadata associated with this {@link BTree}.
* The new metadata record will be written out as part of the next index
* {@link #writeCheckpoint()}.
*
* Note: this method should be used with caution.
*
* @param indexMetadata
* The new metadata description for the {@link BTree}.
*
* @throws IllegalArgumentException
* if the new value is null
* @throws IllegalArgumentException
* if the new {@link IndexMetadata} record has already been
* written on the store - see
* {@link IndexMetadata#getMetadataAddr()}
* @throws UnsupportedOperationException
* if the index is read-only.
*/
final public void setIndexMetadata(final HTreeIndexMetadata indexMetadata) {
assertNotReadOnly();
if (indexMetadata == null)
throw new IllegalArgumentException();
if (indexMetadata.getMetadataAddr() != 0)
throw new IllegalArgumentException();
this.metadata = indexMetadata;
// gets us on the commit list for Name2Addr.
fireDirtyEvent();
}
// /**
// * Handle request for a commit by {@link #writeCheckpoint()}ing dirty nodes
// * and leaves onto the store, writing a new metadata record, and returning
// * the address of that metadata record.
// *
// * Note: In order to avoid needless writes the existing metadata record is
// * always returned if {@link #needsCheckpoint()} is false.
// *
// * Note: The address of the existing {@link Checkpoint} record is always
// * returned if {@link #getAutoCommit() autoCommit} is disabled.
// *
// * @return The address of a {@link Checkpoint} record from which the btree
// * may be reloaded.
// */
public long handleCommit(final long commitTime) {
return writeCheckpoint2().getCheckpointAddr();
}
/**
* Returns an {@link ICounter}. The {@link ICounter} is mutable iff the
* {@link BTree} is mutable. All {@link ICounter}s returned by this method
* report and increment the same underlying counter.
*
* Note: When the {@link BTree} is part of a scale-out index then the
* counter will assign values within a namespace defined by the partition
* identifier.
*/
public ICounter getCounter() {
ICounter counter = new Counter(this);
final LocalPartitionMetadata pmd = metadata.getPartitionMetadata();
if (pmd != null) {
throw new UnsupportedOperationException();
// counter = new PartitionedCounter(pmd.getPartitionId(), counter);
}
if (isReadOnly()) {
return new ReadOnlyCounter(counter);
}
return counter;
}
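/*
* Hedged usage sketch for the note above: every ICounter returned by
* getCounter() reports and increments the same underlying AtomicLong, so
* an increment through one instance is visible through any other (this
* assumes a mutable, non-partitioned index and that ICounter exposes
* get() and incrementAndGet()).
*
* final ICounter c1 = htree.getCounter();
* final ICounter c2 = htree.getCounter();
* final long n = c1.incrementAndGet(); // c2.get() now reports n as well
*/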
/**
* Required constructor form for {@link HTree} and any derived subclasses.
* This constructor is used both to create a new {@link HTree}, and to load
* a {@link HTree} from the store using a {@link Checkpoint} record.
*
* @param store
* The store.
* @param checkpoint
* A {@link Checkpoint} record for that {@link HTree}.
* @param metadata
* The metadata record for that {@link HTree}.
* @param readOnly
* When true, the {@link HTree} will be immutable.
*
* @see HTree#create(IRawStore, IndexMetadata)
* @see HTree#load(IRawStore, long, boolean)
*/
// * @throws IllegalArgumentException
// * if addressBits is LT ONE (1).
// * @throws IllegalArgumentException
// * if addressBits is GT (16) (fan out of 65536).
public HTree(final IRawStore store, final Checkpoint checkpoint,
final IndexMetadata metadata, final boolean readOnly) {
super( store,
NodeFactory.INSTANCE, //
readOnly, // read-only
(HTreeIndexMetadata)metadata,//
metadata.getBtreeRecordCompressorFactory()
);
// this.readOnly = readOnly;
if (checkpoint == null) {
throw new IllegalArgumentException();
}
if (store != null) {
if (checkpoint.getMetadataAddr() != metadata.getMetadataAddr()) {
// must agree.
throw new IllegalArgumentException();
}
}
if (metadata.getConflictResolver() != null && !metadata.isIsolatable()) {
throw new IllegalArgumentException(
"Conflict resolver may only be used with isolatable indices");
}
setCheckpoint(checkpoint);
// // save the address from which the index metadata record was read.
// this.lastMetadataAddr = metadata.getMetadataAddr();
/*
* Note: the mutable BTree has a limit here so that split() will always
* succeed. That limit does not apply for an immutable btree.
*/
assert readOnly || writeRetentionQueue.capacity() >= IndexMetadata.Options.MIN_WRITE_RETENTION_QUEUE_CAPACITY;
/*
* Note: Re-open is deferred so that we can mark the BTree as read-only
* before we read the root node.
*/
// reopen();
/*
* Buffer used to encode addresses into the tuple value for a mutable
* B+Tree.
*/
recordAddrBuf = readOnly ? null
: new ByteArrayBuffer(Bytes.SIZEOF_LONG);
this.rawRecords = metadata.getRawRecords();
}
/**
* Encode a raw record address into a byte[] suitable for storing in the
* value associated with a tuple and decoding using
* {@link AbstractBTree#decodeRecordAddr(byte[])}. This method is only
* supported for a mutable {@link BTree} instance. Per the contract of the
* mutable {@link BTree}, it is not thread-safe.
*
* @param addr
* The raw record address.
*
* @return A newly allocated byte[] which encodes that address.
*/
byte[] encodeRecordAddr(final long addr) {
return AbstractBTree.encodeRecordAddr(recordAddrBuf, addr);
}
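/*
* Hedged round-trip sketch for the method above, assuming the decode
* helper referenced in the javadoc:
*
* final byte[] b = htree.encodeRecordAddr(addr);
* assert AbstractBTree.decodeRecordAddr(b) == addr;
*/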
/**
* Sets the {@link #checkpoint} and initializes the mutable fields from the
* checkpoint record. In order for this operation to be atomic, the caller
* must be synchronized on the {@link HTree} or otherwise guaranteed to have
* exclusive access, e.g., during the ctor or when the {@link HTree} is
* mutable and access is therefore required to be single-threaded.
*/
protected void setCheckpoint(final Checkpoint checkpoint) {
this.checkpoint = checkpoint; // save reference.
// Note: Height is not uniform for an HTree.
// this.height = checkpoint.getHeight();
this.nnodes = checkpoint.getNodeCount();
this.nleaves = checkpoint.getLeafCount();
this.nentries = checkpoint.getEntryCount();
this.counter = new AtomicLong( checkpoint.getCounter() );
this.recordVersion = checkpoint.getRecordVersion();
}
/**
* Creates and sets a new root {@link DirectoryPage} (with a single child
* {@link BucketPage}) and (re)sets the various counters to be consistent
* with that root. This is used both by the constructor for a new
* {@link HTree} and by {@link #removeAll()}.
*
* Note: The {@link #getCounter()} is NOT changed by this method.
*/
final private void newRoot() {
// height = 0;
nnodes = 0;
nentries = 0;
final boolean wasDirty = root != null && root.isDirty();
nleaves = 0;
/*
* The initial setup of the hash tree is a root directory page whose
* global depth is the #of address bits (which is the maximum value that
* global depth can take on for a given hash tree). There is a single
* bucket page and all entries in the root directory page point to that
* bucket. This means that the local depth of the initial bucket page
* will be zero (you can prove this for yourself by consulting the
* tables generated by TestHTree). With a depth of zero, the initial
* bucket page will have buddy hash buckets which can hold only a single
* distinct hash key (buckets always have to handle duplicates of a hash
* key).
*
* From this initial configuration, inserts of 2 distinct keys which
* fall into the same buddy hash table will cause the buddy hash
* buckets in the initial bucket page to split, increasing the depth of
* the resulting bucket page by one. Additional splits driven by inserts
* of distinct keys will eventually cause the local depth of some bucket
* page to exceed the global depth of the root and a new level will be
* introduced below the root. The hash tree can continue to grow in this
* manner, gradually adding depth where there are bit prefixes for which
* there exists a lot of variety in the observed keys.
*/
// Initial root.
final DirectoryPage r = new DirectoryPage(//
this,// the owning htree instance
null, // overflowKey
addressBits // the global depth of the root.
);
nnodes++;
assert r.getSlotsPerBuddy() == (1 << addressBits) : "slotsPerBuddy="
+ r.getSlotsPerBuddy();
assert r.getNumBuddies() == 1 : "numBuddies=" + r.getNumBuddies();
// Data for the root.
final MutableDirectoryPageData rdata = (MutableDirectoryPageData) r.data;
// Initial bucket.
final BucketPage b = new BucketPage(this, 0/* globalDepth */);
nleaves++;
final int nslots = r.getSlotsPerBuddy() * r.getNumBuddies();
for (int i = 0; i < nslots; i++) {
b.parent = (Reference) r.self;
r.childRefs[i] = (Reference) b.self;
rdata.childAddr[i] = 0L;
}
this.root = r;
// Note: Counter is unchanged!
// counter = new AtomicLong( 0L );
// TODO support bloom filter?
// if (metadata.getBloomFilterFactory() != null) {
//
// /*
// * Note: Allocate a new bloom filter since the btree is now empty
// * and set its reference on the BTree. We need to do this here in
// * order to (a) overwrite the old reference when we are replacing
// * the root, e.g., for removeAll(); and (b) to make sure that the
// * bloomFilter reference is defined since the checkpoint will not
// * have its address until the next time we call writeCheckpoint()
// * and therefore readBloomFilter() would fail if we did not create
// * and assign the bloom filter here.
// */
//
// bloomFilter = metadata.getBloomFilterFactory().newBloomFilter();
//
// }
/*
* Note: a new root leaf is created when an empty btree is (re-)opened.
* However, if the BTree is marked as read-only then do not fire off a
* dirty event.
*/
if(!wasDirty && !readOnly) {
fireDirtyEvent();
}
}
@Override
protected void _reopen() {
if (checkpoint.getRootAddr() == 0L) {
/*
* Create the root leaf.
*
* Note: if there is an optional bloom filter, then it is created
* now.
*/
newRoot();
} else {
/*
* Read the root node of the HTree and set its depth, which is
* always [addressBits].
*/
root = (DirectoryPage) readNodeOrLeaf(checkpoint.getRootAddr());
root.globalDepth = addressBits;
// Note: The optional bloom filter will be read lazily.
}
}
final public long getLastCommitTime() {
return lastCommitTime;
}
final public long getRevisionTimestamp() {
if (readOnly)
throw new UnsupportedOperationException(ERROR_READ_ONLY);
return lastCommitTime + 1;
}
@Override
final public void setLastCommitTime(final long lastCommitTime) {
if (lastCommitTime == 0L)
throw new IllegalArgumentException();
if (this.lastCommitTime == lastCommitTime) {
// No change.
return;
}
if (INFO)
log.info("old=" + this.lastCommitTime + ", new=" + lastCommitTime);
// Note: Commented out to allow replay of historical transactions.
/*if (this.lastCommitTime != 0L && this.lastCommitTime > lastCommitTime) {
throw new IllegalStateException("Updated lastCommitTime: old="
+ this.lastCommitTime + ", new=" + lastCommitTime);
}*/
this.lastCommitTime = lastCommitTime;
}
/**
* The lastCommitTime of the {@link Checkpoint} record from which the
* {@link BTree} was loaded.
*
* Note: Made volatile on 8/2/2010 since it is not otherwise obvious what
* would guarantee visibility of this field, though I do seem to remember
* that visibility might be guaranteed by how the BTree class is discovered
* and returned to the caller. Still, it does no harm to make this a
* volatile read.
*/
volatile private long lastCommitTime = 0L;// Until the first commit.
final public IDirtyListener getDirtyListener() {
return listener;
}
final public void setDirtyListener(final IDirtyListener listener) {
assertNotReadOnly();
this.listener = listener;
}
private IDirtyListener listener;
/**
* Fire an event to the listener (iff set).
*/
final protected void fireDirtyEvent() {
assertNotReadOnly();
final IDirtyListener l = this.listener;
if (l == null)
return;
if (Thread.interrupted()) {
throw new RuntimeException(new InterruptedException());
}
l.dirtyEvent(this);
}
/**
* Flush the nodes of the {@link BTree} to the backing store. After invoking
* this method the root of the {@link BTree} will be clean.
*
* Note: This does NOT flush all persistent state. See
* {@link #writeCheckpoint()} which also handles the optional bloom filter,
* the {@link IndexMetadata}, and the {@link Checkpoint} record itself.
*
* @return true if anything was written.
*/
final public boolean flush() {
assertNotTransient();
assertNotReadOnly();
if (root != null && root.isDirty()) {
writeNodeRecursive(root);
if(INFO)
log.info("flushed root: addr=" + root.getIdentity());
return true;
}
return false;
}
/**
* Returns an immutable view of this {@link HTree}. If the {@link HTree} is
* already read-only, then this instance is returned. Otherwise, a
* read-only {@link HTree} is loaded from the last checkpoint and returned.
*
* @throws IllegalStateException
* If the {@link BTree} is dirty.
*
* @todo The {@link Checkpoint} could hold a {@link WeakReference} singleton
* to the read-only view loaded from that checkpoint.
*/
public HTree asReadOnly() {
if (isReadOnly()) {
return this;
}
if(needsCheckpoint())
throw new IllegalStateException();
return HTree
.load(store, checkpoint.getCheckpointAddr(), true/* readOnly */);
}
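/*
* Hedged usage sketch: a dirty HTree must be checkpointed before asking
* for the read-only view, since asReadOnly() throws IllegalStateException
* when needsCheckpoint() is true.
*
* if (htree.needsCheckpoint()) htree.writeCheckpoint();
* final HTree ro = htree.asReadOnly();
*/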
// * Checkpoint operation {@link #flush()}es dirty nodes, the optional
// * {@link IBloomFilter} (if dirty), the {@link IndexMetadata} (if dirty),
// * and then writes a new {@link Checkpoint} record on the backing store,
// * saves a reference to the current {@link Checkpoint} and returns the
// * address of that {@link Checkpoint} record.
// *
// * Note: A checkpoint by itself is NOT an atomic commit. The commit protocol
// * is at the store level and uses {@link Checkpoint}s to ensure that the
// * state of the {@link BTree} is current on the backing store.
// *
// * @return The address at which the {@link Checkpoint} record for the
// * {@link BTree} was written onto the store. The {@link BTree} can
// * be reloaded from this {@link Checkpoint} record.
// *
// * @see #writeCheckpoint2(), which returns the {@link Checkpoint} record
// * itself.
/**
* {@inheritDoc}
*
* @see #load(IRawStore, long)
*/
@Override
final public long writeCheckpoint() {
// write checkpoint and return address of that checkpoint record.
return writeCheckpoint2().getCheckpointAddr();
}
/**
* {@inheritDoc}
*
* @see #load(IRawStore, long)
*/
@Override
final public Checkpoint writeCheckpoint2() {
assertNotTransient();
assertNotReadOnly();
/*
* Note: Acquiring this lock provides for atomicity of the checkpoint of
* the BTree during the commit protocol. Without this lock, users of the
* UnisolatedReadWriteIndex could be concurrently modifying the BTree
* while we are attempting to snapshot it for the commit.
*
* Note: An alternative design would declare a global read/write lock
* for mutation of the indices in addition to the per-BTree read/write
* lock provided by UnisolatedReadWriteIndex. Rather than taking the
* per-BTree write lock here, we would take the global write lock in the
* AbstractJournal's commit protocol, e.g., commitNow(). The global read
* lock would be taken by UnisolatedReadWriteIndex before taking the
* per-BTree write lock. This is effectively a hierarchical locking
* scheme and could provide a workaround if deadlocks are found to occur
* due to lock ordering problems with the acquisition of the
* UnisolatedReadWriteIndex lock (the absence of lock ordering problems
* really hinges around UnisolatedReadWriteLocks not being taken for
* more than one index at a time.)
*
* @see https://sourceforge.net/apps/trac/bigdata/ticket/278
* @see https://sourceforge.net/apps/trac/bigdata/ticket/284
* @see https://sourceforge.net/apps/trac/bigdata/ticket/288
* @see https://sourceforge.net/apps/trac/bigdata/ticket/343
* @see https://sourceforge.net/apps/trac/bigdata/ticket/440
*/
final Lock lock = writeLock();
lock.lock();
try {
/**
* Do not permit checkpoint if the index is in an error state.
*
* @see Invalidate BTree objects if error occurs during eviction
*/
if (error != null)
throw new IllegalStateException(ERROR_ERROR_STATE, error);
if (/* autoCommit && */needsCheckpoint()) {
/*
* Flush the btree, write a checkpoint record, and return the
* address of that checkpoint record. The [checkpoint] reference
* is also updated.
*/
return _writeCheckpoint2();
}
/*
* There have not been any writes on this btree or auto-commit is
* disabled.
*
* Note: if the application has explicitly invoked writeCheckpoint()
* then the returned address will be the address of that checkpoint
* record and the BTree will have a new checkpoint address made
* restart safe on the backing store.
*/
return checkpoint;
} finally {
lock.unlock();
}
}
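// Hedged illustration of the no-op path above (assumes no intervening
// writes): a second writeCheckpoint() sees needsCheckpoint() == false and
// returns the address of the existing checkpoint record.
//
//   final long a1 = htree.writeCheckpoint();
//   final long a2 = htree.writeCheckpoint(); // a1 == a2, nothing rewritten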
/**
* Core implementation invoked by {@link #writeCheckpoint2()} while holding
* the lock - DO NOT INVOKE THIS METHOD DIRECTLY.
*
* @return the checkpoint.
*/
final private Checkpoint _writeCheckpoint2() {
assertNotTransient();
assertNotReadOnly();
// assert root != null : "root is null"; // i.e., isOpen().
// flush any dirty nodes.
flush();
// pre-condition: all nodes in the tree are clean.
assert root == null || !root.isDirty();
// { // TODO support bloom filter?
// /*
// * Note: Use the [AbstractBtree#bloomFilter] reference here!!!
// *
// * If that reference is [null] then the bloom filter is either
// * clean, disabled, or was not configured. For any of those (3)
// * conditions we will use the address of the bloom filter from the
// * last checkpoint record. If the bloom filter is clean, then we
// * will just carry forward its old address. Otherwise the address in
// * the last checkpoint record will be 0L and that will be carried
// * forward.
// */
// final BloomFilter filter = this.bloomFilter;
//
// if (filter != null && filter.isDirty() && filter.isEnabled()) {
//
// /*
// * The bloom filter is enabled, is loaded and is dirty, so write
// * it on the store now.
// */
//
// filter.write(store);
//
// }
//
// }
if (metadata.getMetadataAddr() == 0L) {
/*
* Is there an old metadata addr in need of recycling?
*/
if (checkpoint != null) {
final long addr = checkpoint.getMetadataAddr();
if (addr != IRawStore.NULL)
store.delete(addr);
}
/*
* The index metadata has been modified so we write out a new
* metadata record on the store.
*/
metadata.write(store);
}
if (checkpoint != null && getRoot() != null
&& checkpoint.getRootAddr() != getRoot().getIdentity()) {
recycle(checkpoint.getRootAddr());
}
if (checkpoint != null) {
recycle(checkpoint.getCheckpointAddr());
}
// create new checkpoint record.
checkpoint = newCheckpoint();
// write it on the store.
checkpoint.write(store);
if (BigdataStatics.debug||INFO) {
final String msg = "name=" + metadata.getName()
+ ", writeQueue{size=" + writeRetentionQueue.size()
+ ",distinct=" + ndistinctOnWriteRetentionQueue + "} : "
+ checkpoint;
if (BigdataStatics.debug)
System.err.println(msg);
if (INFO)
log.info(msg);
}
// return the checkpoint record.
return checkpoint;
}
/**
* Create a {@link Checkpoint} for a {@link HTree}.
*
* The caller is responsible for writing the {@link Checkpoint} record onto
* the store.
*
* The class identified by {@link IndexMetadata#getCheckpointClassName()}
* MUST declare a public constructor with the following method signature:
*
* ...( HTree htree )
*
* @return The {@link Checkpoint}.
*/
@SuppressWarnings("unchecked")
final private Checkpoint newCheckpoint() {
try {
@SuppressWarnings("rawtypes")
final Class cl = Class.forName(metadata.getCheckpointClassName());
/*
* Note: A NoSuchMethodException thrown here means that you did not
* declare the required public constructor on the Checkpoint class
* [cl].
*/
@SuppressWarnings("rawtypes")
final Constructor ctor = cl.getConstructor(new Class[] {
HTree.class //
});
final Checkpoint checkpoint = (Checkpoint) ctor
.newInstance(new Object[] { //
this //
});
return checkpoint;
} catch(Exception ex) {
throw new RuntimeException(ex);
}
}
/*
* CRUD API.
*
* TODO The hash tree intrinsically supports duplicate keys. This means that
* some method semantics must be different. E.g., insert() does not update
* an existing tuple for the same key. remove() removes an arbitrary tuple
* matching the key, lookup() will return an arbitrary match or null if
* there is no match, etc. Also, what corresponds to a lookup scenario in a
* BTree will normally be a bag traversal in an HTree requiring an iterator
* construct.
*
* TODO The hash tree will normally be used with int32 keys, and those keys
* will typically be obtained from Object#hashCode(). Special methods should
* be provided for this purpose. E.g., ISimpleHTree. Custom serialization
* and even a custom mutable leaf data record should be used with this case.
*
* TODO insert() return for htree is never the old value since it always
* adds() a new tuple and never replaces() an existing tuple. Further, if the key and
* val are equals() then we might even define the semantics as a NOP rather
* than appending a duplicate item into the bucket (it's worth thinking on
* this further as scanning for equals is more work and sometimes we might
* want to allow fully duplicated items into the bucket.).
*/
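/*
* A hedged sketch of the duplicate-key semantics described above (assumes
* an initially empty, mutable htree and some key [k]): a second insert()
* with the same key appends a second tuple rather than replacing the
* first, and lookupAll() visits both.
*
* htree.insert(k, v1);
* htree.insert(k, v2); // does NOT replace v1
* final ITupleIterator itr = htree.lookupAll(k); // visits both tuples
* assert htree.rangeCount() == 2;
*/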
/**
* Convert an int32 hash code key into an unsigned byte[4].
*
* Note: This encoding MUST be consistent with
* {@link IKeyBuilder#append(int)}.
*/
private byte[] i2k(final int key) {
// lexicographic ordering as unsigned int.
int v = key;
if (v < 0) {
v = v - 0x80000000;
} else {
v = 0x80000000 + v;
}
final byte[] buf = new byte[4];
buf[0] = (byte) (v >>> 24);
buf[1] = (byte) (v >>> 16);
buf[2] = (byte) (v >>> 8);
buf[3] = (byte) (v >>> 0);
return buf;
}
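// Worked examples of the encoding above (flip the sign bit, i.e. offset
// binary), so the unsigned byte[] order agrees with the signed int order:
//   i2k(Integer.MIN_VALUE) -> 00 00 00 00
//   i2k(-1)                -> 7f ff ff ff
//   i2k(0)                 -> 80 00 00 00
//   i2k(Integer.MAX_VALUE) -> ff ff ff ff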
// TODO contains with an Object clearly needs to test for the Object, not
// just the key. Maybe do this as a wrapper similar to BigdataMap?
public boolean contains(final Object obj) {
//contains(obj.hashCode());
throw new UnsupportedOperationException();
}
public boolean contains(final int key) {
return contains(i2k(key));
}
/**
* Return true iff there is at least one tuple in the hash tree having the
* specified key.
*
* @param key
* The key.
*
* @return true iff the hash tree contains at least one tuple with that
* key.
*
* @throws IllegalArgumentException
* if the key is null.
*
* TODO Parallel to insert(), consider a contains() signature
* which permits testing for a specific value as well. Maybe
* in a wrapper class?
*/
public boolean contains(final byte[] key) {
if (key == null)
throw new IllegalArgumentException();
DirectoryPage current = getRoot(); // start at the root.
int prefixLength = 0;// prefix length of the root is always zero.
int buddyOffset = 0; // buddyOffset of the root is always zero.
while (true) {
// skip prefixLength bits and then extract globalDepth bits.
final int hashBits = current.getLocalHashCode(key, prefixLength);
// find the child directory page or bucket page but do not lazily
// create new Directory if not present
final AbstractPage child = current.getChildIfPresent(hashBits);
if (child == null)
return false;
if (child.isLeaf()) {
/*
* Found the bucket page, test it for the key.
*/
final BucketPage bucketPage = (BucketPage) child;
return bucketPage.contains(key, buddyOffset);
}
/*
* Recursive descent into a child directory page. We have to update
* the prefixLength and compute the offset of the buddy hash table
* within the child before descending into the child.
*/
prefixLength = prefixLength + current.globalDepth;
buddyOffset = HTreeUtil
.getBuddyOffset(hashBits, current.globalDepth,
child.globalDepth/* localDepthOfChild */);
current = (DirectoryPage) child;
}
}
// public void lookup(final Object obj) {
// lookup(obj.hashCode());
// }
/**
* Return the first value for the key.
*
* @param key
* The key.
*
* @return The first value for the key -or- null if there are no tuples in
* the index having that key. Note that the return value is not
* diagnostic if the application allows null values into the index.
*/
public byte[] lookupFirst(final int key) {
return lookupFirst(i2k(key));
}
/**
* Return the first value for the key.
*
* @param key
* The key.
*
* @return The first value for the key -or- null if there are no tuples in
* the index having that key. Note that the return value is not
* diagnostic if the application allows null values into the index.
*/
public byte[] lookupFirst(final byte[] key) {
if (key == null)
throw new IllegalArgumentException();
DirectoryPage current = getRoot(); // start at the root.
int prefixLength = 0;// prefix length of the root is always zero.
int buddyOffset = 0; // buddyOffset of the root is always zero.
while (true) {
// skip prefixLength bits and then extract globalDepth bits.
final int hashBits = current.getLocalHashCode(key, prefixLength);
// find the child directory page or bucket page and ensure a new
// child is not created on lookup
final AbstractPage child = current.getChildIfPresent(hashBits);
if (child == null)
return null;
if (child.isLeaf()) {
/*
* Found the bucket page, search it for a match.
*/
final BucketPage bucketPage = (BucketPage) child;
return bucketPage.lookupFirst(key, buddyOffset);
}
/*
* Recursive descent into a child directory page. We have to update
* the prefixLength and compute the offset of the buddy hash table
* within the child before descending into the child.
*/
prefixLength = prefixLength + current.globalDepth;
buddyOffset = HTreeUtil
.getBuddyOffset(hashBits, current.globalDepth,
child.globalDepth/* localDepthOfChild */);
current = (DirectoryPage) child;
}
}
/**
* Return an iterator which will visit each tuple in the index having the
* specified key.
*
* @param key
* The key.
*
* @return The iterator and never null.
*/
public ITupleIterator lookupAll(final int key) {
return lookupAll(i2k(key));
}
/**
* Return an iterator which will visit each tuple in the index having the
* specified key.
*
* @param key
* The key.
*
* @return The iterator and never null.
*/
public ITupleIterator lookupAll(final byte[] key) {
if (key == null)
throw new IllegalArgumentException();
DirectoryPage current = getRoot(); // start at the root.
int prefixLength = 0;// prefix length of the root is always zero.
int buddyOffset = 0; // buddyOffset of the root is always zero.
while (true) {
// skip prefixLength bits and then extract globalDepth bits.
final int hashBits = current.getLocalHashCode(key, prefixLength);
// find the child directory page or bucket page.
// Ensure we do not create a child for lookup
final AbstractPage child = current.getChildIfPresent(hashBits);
if (child == null)
return emptyTupleIterator();
if (child.isLeaf()) {
/*
* Found the bucket page, search it for a match.
*/
final BucketPage bucketPage = (BucketPage) child;
return bucketPage.lookupAll(key);//, buddyOffset);
} else if (((DirectoryPage)child).isOverflowDirectory()) {
return ((DirectoryPage) child).getTuples();
}
/*
* Recursive descent into a child directory page. We have to update
* the prefixLength and compute the offset of the buddy hash table
* within the child before descending into the child.
*/
prefixLength = prefixLength + current.globalDepth;
buddyOffset = HTreeUtil
.getBuddyOffset(hashBits, current.globalDepth,
child.globalDepth/* localDepthOfChild */);
current = (DirectoryPage) child;
}
}
private ITupleIterator emptyTupleIterator() {
return new ITupleIterator() {
public ITuple next() {
throw new NoSuchElementException();
}
public boolean hasNext() {
return false;
}
public void remove() {
throw new UnsupportedOperationException();
}
};
}
public void insert(final Object obj) {
insert(obj.hashCode(), SerializerUtil.serialize(obj));
}
public void insert(final int key, final byte[] val) {
insert(i2k(key), val);
}
/**
* Insert a tuple into the hash tree. Tuples with duplicate keys and even
* tuples with duplicate keys and values are allowed and will result in
* multiple tuples.
*
* @param key
* The key.
* @param value
* The value.
* @return null (always).
*
* TODO If the application wants to restrict the hash tree such that
* it does not contain duplicate tuples then it must first
* search in the tree for an exact match (key and value). It is
* easier to do that from within the insert logic so expand the
* method signature to pass an insert enum {ALLDUPS,DUPKEYS,NODUPS}.
*/
public byte[] insert(final byte[] key, final byte[] value) {
if (DEBUG)
log.debug("key=" + BytesUtil.toString(key) + ", value="
+ Arrays.toString(value));
if (key == null)
throw new IllegalArgumentException();
// the current directory page.
DirectoryPage current = getRoot(); // start at the root.
// #of prefix bits already consumed.
int prefixLength = 0;// prefix length of the root is always zero.
// buddyOffset into [current].
int buddyOffset = 0; // buddyOffset of the root is always zero.
while (true) {
// skip prefixLength bits and then extract globalDepth bits.
assert !(prefixLength >= key.length * 8 && !current.isOverflowDirectory());
final int hashBits = current.getLocalHashCode(key, prefixLength);
// find the child directory page or bucket page.
final AbstractPage child;
if (current.isOverflowDirectory()) {
if (!BytesUtil.bytesEqual(key, current.getOverflowKey())) {
// if not the overflowKey, then insert extra level
final DirectoryPage pd = current.getParentDirectory();
assert !pd.isOverflowDirectory();
assert pd.isReadOnly() ? current.isReadOnly() : true;
current = pd._addLevelForOverflow(current);
// we need to fix this since we have introduced a new level
// between the old current and its parent directory
child = current;
// and frig prefixlength to come back in
prefixLength -= current.globalDepth;
// if (DEBUG)
// log.debug("frig prefixLength: " + prefixLength);
} else {
child = current.lastChild();
}
} else {
child = current.getChild(hashBits, buddyOffset);
}
if (child.isLeaf()) {
/*
* Found the bucket page, update it.
*
* We must copyOnWrite now since the insert will otherwise call it
* and potentially invalidate the bucketPage reference.
*/
final BucketPage bucketPage = (BucketPage) child.copyOnWrite();
// Attempt to insert the tuple into the bucket.
if (!bucketPage.insert(key, value)) {
if (current.globalDepth == child.globalDepth) {
/*
* The child is at the same depth as the parent. Either
* we can split a directory page which is a parent of
* that bucket or we have to add a new level below the
* root on the path to that bucket.
*/
if (child.globalDepth == addressBits) {
bucketPage.addLevel();
} else {
current.getParentDirectory().split(buddyOffset, current/* oldChild */);
}
} else {
// globalDepth >= localDepth
bucketPage.split();
}
return insert(key,value);
}
return null; // TODO should be Void return? or depends on enum controlling dups behavior?
}
/*
* Recursive descent into a child directory page. We have to update
* the prefixLength and compute the offset of the buddy hash table
* within the child before descending into the child.
*/
// increase prefix length by the #of address bits consumed by the
// buddy hash table in the current directory page as we descend.
prefixLength = prefixLength + current.globalDepth;
// find the offset of the buddy hash table in the child.
buddyOffset = HTreeUtil
.getBuddyOffset(hashBits, current.globalDepth,
child.globalDepth/* localDepthOfChild */);
// update current so we can search in the child.
current = (DirectoryPage) child;
}
} // insert()
/**
* Insert a tuple into the hash tree. Tuples with duplicate keys and even
* tuples with duplicate keys and values are allowed and will result in
* multiple tuples.
*
* @param src
* The source {@link BucketPage}.
* @param slot
* The slot in the source {@link BucketPage} whose tuple will be
* inserted (really copied).
*/
protected void insertRawTuple(final BucketPage src, final int slot) {
if (src == null)
throw new IllegalArgumentException();
if (slot < 0 || slot >= (1 << addressBits)) // [0:slotsPerPage].
throw new IllegalArgumentException();
// the key to insert
final byte[] key = src.getKeys().get(slot);
if (key == null)
throw new IllegalArgumentException();
if (DEBUG)
log.debug("key=" + BytesUtil.toString(key));
// the current directory page.
DirectoryPage current = getRoot(); // start at the root.
// #of prefix bits already consumed.
int prefixLength = 0;// prefix length of the root is always zero.
// buddyOffset into [current].
int buddyOffset = 0; // buddyOffset of the root is always zero.
while (true) {
// skip prefixLength bits and then extract globalDepth bits.
final int hashBits = current.getLocalHashCode(key, prefixLength);
// find the child directory page or bucket page.
final AbstractPage child = current.getChild(hashBits, buddyOffset);
if (child.isLeaf()) {
/*
* Found the bucket page, update it.
*
* Must ensure we have copyOnWriteVersion since otherwise the bucketPage
* later referenced will not be valid
*/
final BucketPage bucketPage = (BucketPage) child.copyOnWrite();
// Attempt to insert the tuple into the bucket.
if (!bucketPage.insertRawTuple(src, slot, key)) {
if (current.globalDepth == child.globalDepth) {
/*
* The child is at the same depth as the parent. Either
* we can split a directory page which is a parent of
* that bucket or we have to add a new level below the
* root on the path to that bucket.
*/
if (child.globalDepth == addressBits) {
// addLevel2(bucketPage);
bucketPage.addLevel();
} else {
current.getParentDirectory().split(
buddyOffset, current/* oldChild */);
}
} else {
// globalDepth >= localDepth
bucketPage.split();
}
insertRawTuple(src, slot);
return;
}
return;
}
/*
* Recursive descent into a child directory page. We have to update
* the prefixLength and compute the offset of the buddy hash table
* within the child before descending into the child.
*/
// increase prefix length by the #of address bits consumed by the
// buddy hash table in the current directory page as we descend.
prefixLength = prefixLength + current.globalDepth;
// find the offset of the buddy hash table in the child.
buddyOffset = HTreeUtil
.getBuddyOffset(hashBits, current.globalDepth,
child.globalDepth/* localDepthOfChild */);
// update current so we can search in the child.
current = (DirectoryPage) child;
}
} // insertRawTuple()
/**
* Removes a single entry matching the supplied key. The HTree provides no
* guarantee that values with duplicate keys will be removed in the order
* in which they were added (FIFO); removal order is more likely to be
* LIFO.
*
* @param key
* The key.
*
* @return The value removed -or- null if none was found.
*/
public byte[] remove(final byte[] key) {
if (key == null)
throw new IllegalArgumentException();
final AbstractPage page = locatePageForKey(key);
final byte[] ret = page == null ? null : page.removeFirst(key);
if (ret != null) nentries--;
return ret;
}
public int removeAll(final byte[] key) {
if (key == null)
throw new IllegalArgumentException();
final AbstractPage page = locatePageForKey(key);
final int removals = page == null ? 0 : page.removeAll(key);
nentries -= removals;
return removals;
}
/**
* Locate the page that would contain the given key: either a
* {@link BucketPage} or an overflow {@link DirectoryPage}. Unlike the
* insert path, this method does not create child pages as a side effect
* and returns null if there is no such page.
*/
protected AbstractPage locatePageForKey(final byte[] key) {
if (key == null)
throw new IllegalArgumentException();
DirectoryPage current = getRoot(); // start at the root.
int prefixLength = 0;// prefix length of the root is always zero.
while (true) {
// skip prefixLength bits and then extract globalDepth bits.
final int hashBits = current.getLocalHashCode(key, prefixLength);
// find the child directory page or bucket page, ensuring that
// new Directory is not created if not present
final AbstractPage child = current.getChildIfPresent(hashBits);
if (child == null || child.isLeaf() || ((DirectoryPage) child).isOverflowDirectory()) {
return child;
}
/*
* Recursive descent into a child directory page. We have to update
* the prefixLength before descending into the child.
*/
prefixLength = prefixLength + current.globalDepth;
current = (DirectoryPage) child;
}
}
public void removeAll() {
DirectoryPage root = getRoot();
root.removeAll();
newRoot();
}
public long rangeCount() {
return nentries;
}
/**
* Pretty print the {@link HTree}.
*/
String PP() {
return PP(true);
}
String PP(final boolean showBinary) {
final StringBuilder sb = new StringBuilder();
sb.append("#nodes=" + nnodes + ", #leaves=" + nleaves + ", #entries="
+ nentries + "\n");
root.PP(sb, showBinary);
return sb.toString();
}
@Override
public BaseIndexStats dumpPages(final boolean recursive, final boolean visitLeaves) {
if (!recursive) {
return new BaseIndexStats(this);
}
final HTreePageStats stats = new HTreePageStats();
getRoot().dumpPages(recursive, visitLeaves, stats);
return stats;
}
/**
* Create a new {@link HTree} or derived class. This method works by writing
* the {@link IndexMetadata} record on the store and then loading the
* {@link HTree} from the {@link IndexMetadata} record.
*
* @param store
* The store.
*
* @param metadata
* The metadata record.
*
* @return The newly created {@link HTree}.
*
* @see #load(IRawStore, long, boolean)
*
* @throws IllegalStateException
* If you attempt to create two {@link HTree} objects from the
* same metadata record since the metadata address will have
* already been noted on the {@link IndexMetadata} object. You
* can use {@link IndexMetadata#clone()} to obtain a new copy of
* the metadata object with the metadata address set to 0L.
* @exception IllegalStateException
* if the {@link IndexTypeEnum} in the supplied
* {@link IndexMetadata} object is not
* {@link IndexTypeEnum#HTree}.
*/
public static HTree create(final IRawStore store,
final HTreeIndexMetadata metadata) {
if (metadata.getIndexType() != IndexTypeEnum.HTree) {
throw new IllegalStateException("Wrong index type: "
+ metadata.getIndexType());
}
if (store == null) {
// Create an htree which is NOT backed by a persistence store.
return createTransient(metadata);
}
if (metadata.getMetadataAddr() != 0L) {
throw new IllegalStateException("Metadata record already in use");
}
/*
* Write metadata record on store. The address of that record is set as
* a side-effect on the metadata object.
*/
metadata.write(store);
/*
* Create checkpoint for the new HTree.
*/
final Checkpoint firstCheckpoint = metadata.firstCheckpoint();
/*
* Write the checkpoint record on the store. The address of the
* checkpoint record is set on the object as a side effect.
*/
firstCheckpoint.write(store);
/*
* Load the HTree from the store using that checkpoint record. There is
* no root so a new root leaf will be created when the HTree is opened.
*/
return load(store, firstCheckpoint.getCheckpointAddr(), false/* readOnly */);
}
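// Hedged lifecycle sketch (assumes an IRawStore [store] and a configured
// HTreeIndexMetadata [md]): create, insert, checkpoint, then reload a
// read-only view from the checkpoint address.
//
//   final HTree ht = HTree.create(store, md);
//   ht.insert(12, new byte[] { 1 });
//   final long addrCheckpoint = ht.writeCheckpoint();
//   final HTree ro = HTree.load(store, addrCheckpoint, true/*readOnly*/);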
/**
* Create a new {@link HTree} or derived class that is fully transient (NO
* backing {@link IRawStore}).
*
* Fully transient {@link HTree}s provide the functionality of a {@link HTree}
* without a backing persistence store. Internally, reachable nodes and
* leaves of the transient {@link HTree} use hard references to ensure that
* they remain strongly reachable. Deleted nodes and leaves simply clear
* their references and will be swept by the garbage collector shortly
* thereafter.
*
* Operations which attempt to write on the backing store will fail.
*
* While nodes and leaves are never persisted, the keys and values of the
* transient {@link HTree} are unsigned byte[]s. This means that application
* keys and values are always converted into unsigned byte[]s before being
* stored in the {@link HTree}. Hence if an object is inserted into
* the {@link HTree} and then looked up using the {@link HTree} API, you
* WILL NOT get back the same object reference.
*
* Note: CLOSING A TRANSIENT INDEX WILL DISCARD ALL DATA!
*
* @param metadata
* The metadata record.
*
* @return The transient {@link HTree}.
*/
@SuppressWarnings("unchecked")
public static HTree createTransient(final HTreeIndexMetadata metadata) {
if (metadata.getIndexType() != IndexTypeEnum.HTree) {
throw new IllegalStateException("Wrong index type: "
+ metadata.getIndexType());
}
/*
* Create checkpoint for the new HTree.
*/
final Checkpoint firstCheckpoint = metadata.firstCheckpoint();
/*
* Create HTree object instance.
*/
try {
final Class cl = Class.forName(metadata.getHTreeClassName());
/*
* Note: A NoSuchMethodException thrown here means that you did not
* declare the required public constructor for a class derived from
* HTree.
*/
final Constructor ctor = cl.getConstructor(new Class[] {
IRawStore.class,//
Checkpoint.class,//
IndexMetadata.class,//
Boolean.TYPE//
});
final HTree htree = (HTree) ctor.newInstance(new Object[] { //
null , // store
firstCheckpoint, //
metadata, //
false// readOnly
});
// create the root node.
htree.reopen();
return htree;
} catch (Exception ex) {
throw new RuntimeException(ex);
}
}
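// Hedged sketch for the transient variant (assumes a configured
// HTreeIndexMetadata [md]): no backing store is required, and closing the
// index discards all data per the javadoc above.
//
//   final HTree t = HTree.createTransient(md);
//   t.insert(42, new byte[] { 7 }); // int key is coerced to unsigned byte[4]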
/**
* Load an instance of a {@link HTree} or derived class from the store. The
* {@link HTree} or derived class MUST declare a constructor with the
* following signature:
*
* className(IRawStore store, Checkpoint checkpoint, IndexMetadata metadata, boolean readOnly)
*
*
*
* @param store
* The store.
* @param addrCheckpoint
* The address of a {@link Checkpoint} record for the index.
* @param readOnly
* When true, the {@link HTree} will be marked as
* read-only. Marking has some advantages relating to the locking
* scheme used by {@link Node#getChild(int)} since the root node
* is known to be read-only at the time that it is allocated as
* per-child locking is therefore in place for all nodes in the
* read-only {@link BTree}. It also results in much higher
* concurrency for {@link AbstractBTree#touch(AbstractNode)}.
*
* @return The {@link HTree} or derived class loaded from that
* {@link Checkpoint} record.
*
* @throws IllegalArgumentException
* if store is null.
*/
@SuppressWarnings("unchecked")
public static HTree load(final IRawStore store, final long addrCheckpoint,
final boolean readOnly) {
if (store == null)
throw new IllegalArgumentException();
/*
* Read checkpoint record from store.
*/
final Checkpoint checkpoint;
try {
checkpoint = Checkpoint.load(store, addrCheckpoint);
} catch (Throwable t) {
throw new RuntimeException("Could not load Checkpoint: store="
+ store + ", addrCheckpoint="
+ store.toString(addrCheckpoint), t);
}
if (checkpoint.getIndexType() != IndexTypeEnum.HTree)
throw new RuntimeException("Not an HTree checkpoint: " + checkpoint);
/*
* Read metadata record from store.
*/
final HTreeIndexMetadata metadata;
try {
metadata = (HTreeIndexMetadata) IndexMetadata.read(store,
checkpoint.getMetadataAddr());
} catch (Throwable t) {
throw new RuntimeException("Could not read IndexMetadata: store="
+ store + ", checkpoint=" + checkpoint, t);
}
if (INFO) {
// Note: this is the scale-out index name for a partitioned index.
final String name = metadata.getName();
log.info((name == null ? "" : "name=" + name + ", ")
+ "readCheckpoint=" + checkpoint);
}
/*
* Create HTree object instance.
*/
try {
final Class cl = Class.forName(metadata.getHTreeClassName());
/*
* Note: A NoSuchMethodException thrown here means that you did not
* declare the required public constructor for a class derived from
* HTree.
*/
final Constructor ctor = cl.getConstructor(new Class[] {
IRawStore.class,//
Checkpoint.class,//
IndexMetadata.class, //
Boolean.TYPE
});
final HTree htree = (HTree) ctor.newInstance(new Object[] { //
store,//
checkpoint, //
metadata, //
readOnly
});
// if (readOnly) {
//
// btree.setReadOnly(true);
//
// }
// read the root node.
htree.reopen();
return htree;
} catch (Exception ex) {
throw new RuntimeException(ex);
}
}
/**
* Recycle (aka delete) the allocation. This method also adjusts the #of
* bytes released in the {@link BTreeCounters}.
*
* @param addr
* The address to be recycled.
*
* @return The #of bytes which were recycled and ZERO (0) if the address is
* {@link IRawStore#NULL}.
*/
protected int recycle(final long addr) {
if (addr == IRawStore.NULL)
return 0;
final int nbytes = store.getByteCount(addr);
// getBtreeCounters().bytesReleased += nbytes;
store.delete(addr);
return nbytes;
}
}