/**
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
[email protected]
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on Nov 15, 2006
*/
package com.bigdata.btree;
import java.io.PrintStream;
import java.nio.ByteBuffer;
import java.util.Iterator;
import java.util.WeakHashMap;
import org.apache.log4j.Level;
import com.bigdata.btree.data.DefaultLeafCoder;
import com.bigdata.btree.data.ILeafData;
import com.bigdata.btree.filter.EmptyTupleIterator;
import com.bigdata.btree.isolation.IsolatedFusedView;
import com.bigdata.btree.raba.IRaba;
import com.bigdata.btree.raba.MutableKeyBuffer;
import com.bigdata.btree.raba.MutableValueBuffer;
import com.bigdata.io.AbstractFixedByteArrayBuffer;
import com.bigdata.journal.ITransactionService;
import com.bigdata.rawstore.IRawStore;
import com.bigdata.util.BytesUtil;
import cutthecrap.utils.striterators.EmptyIterator;
import cutthecrap.utils.striterators.SingleValueIterator;
/**
*
* A B+-Tree leaf.
*
* <h2>Tuple revision timestamps</h2>
*
* When tuple revision timestamps are maintained, they must be propagated to the
* parents if we insert or remove a tuple, but also need to be propagated if we
* update a tuple in a manner which changes the min/max version timestamp. This
* is done either by {@link Node#updateEntryCount(AbstractNode, int)}, when the
* #of tuples in the leaf was changed, or by
* {@link Node#updateMinMaxVersionTimestamp(AbstractNode)} when the #of tuples
* in the leaf is unchanged.
*
* The {@link #getMinimumVersionTimestamp()} and
* {@link #getMaximumVersionTimestamp()} can be used to efficiently filter
* iterators so as to only visit those nodes and leaves which have updates for
* some revision timestamp range. This filtering is effective because if we
* reject a node as not having data for the revision range of interest, then we
* do not need to consider any of the nodes or leaves spanned by that node.
*
* Note that revision timestamps ARE NOT commit timestamps. See
* {@link ITransactionService} and {@link IsolatedFusedView} for more about
* this and how to obtain and work with revision timestamps.
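*
* For example, a revision-range filter might prune an entire leaf using the
* min/max version timestamps. A minimal sketch (not part of this class;
* fromRev and toRev are hypothetical bounds supplied by the caller):
*
* <pre>
* static boolean mayHaveTuplesInRevisionRange(final Leaf leaf,
*         final long fromRev, final long toRev) {
*     if (!leaf.hasVersionTimestamps())
*         return true; // timestamps are not maintained, so we can not prune.
*     // Prune unless the leaf's [min,max] interval intersects [fromRev,toRev).
*     return leaf.getMaximumVersionTimestamp() >= fromRev
*             && leaf.getMinimumVersionTimestamp() < toRev;
* }
* </pre>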
*
* @author Bryan Thompson
*/
public class Leaf extends AbstractNode<Leaf> implements ILeafData, IRawRecordAccess {
/**
* The data record. {@link MutableLeafData} is used for all mutation
* operations. {@link ReadOnlyLeafData} is used when the {@link Leaf} is
* made persistent. A read-only data record is automatically converted into
* a {@link MutableLeafData} record when a mutation operation is requested.
*
* Note: This is package private in order to expose it to {@link Node}.
*/
ILeafData data;
/**
* Return (branchingFactor + 1) >> 1, which is the minimum #of keys for a
* {@link Leaf}.
*
* Note: the root may have fewer keys.
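*
* For example, with branchingFactor = 4 a non-root leaf must retain at
* least (4 + 1) >> 1 = 2 keys.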
*/
@Override
protected final int minKeys() {
// /*
// * Compute the minimum #of children/values. This is the same whether
// * this is a Node or a Leaf.
// */
// final int minChildren = (btree.branchingFactor + 1) >> 1;
//
// // this.minKeys = isLeaf() ? minChildren : minChildren - 1;
//
// return minChildren;
return btree.minChildren;
}
/**
* Return branchingFactor, which is the maximum #of keys for a
* {@link Leaf}.
*/
@Override
protected final int maxKeys() {
// // The maximum #of keys is easy to compute.
// this.maxKeys = isLeaf() ? branchingFactor : branchingFactor - 1;
return btree.branchingFactor;
}
@Override
final public ILeafData getDelegate() {
return data;
}
/*
* ILeafData
*/
/**
* Always returns true.
*/
@Override
final public boolean isLeaf() {
return true;
}
/**
* The result depends on the implementation. The {@link Leaf} will be
* mutable when it is first created and is made immutable when it is
* persisted. If there is a mutation operation, the backing
* {@link ILeafData} is automatically converted into a mutable instance.
*/
@Override
final public boolean isReadOnly() {
return data.isReadOnly();
}
@Override
final public boolean isCoded() {
return data.isCoded();
}
@Override
final public AbstractFixedByteArrayBuffer data() {
return data.data();
}
@Override
final public boolean getDeleteMarker(final int index) {
return data.getDeleteMarker(index);
}
@Override
final public int getKeyCount() {
return data.getKeyCount();
}
@Override
// See https://sourceforge.net/apps/trac/bigdata/ticket/550 (NPE in Leaf.getKey())
final public IRaba getKeys() {
if(data==null) throw new NullPointerException("leaf="+toString());
return data.getKeys();
}
// final public int getSpannedTupleCount() {
//
// return data.getSpannedTupleCount();
//
// }
@Override
final public int getValueCount() {
return data.getValueCount();
}
@Override
final public IRaba getValues() {
return data.getValues();
}
/**
* Convenience method returns the byte[] for the given index in the leaf. If
* the tuple at that index is a raw record, then the record is read from the
* backing store. When copying the value into a tuple, prefer the more
* efficient {@link AbstractTuple#copy(int, Leaf)}.
*
* @param index
* The index in the leaf.
*
* @return The data.
*
* @see AbstractTuple#copy(int, Leaf)
*/
public byte[] getValue(final int index) {
if (!hasRawRecords()) {
return getValues().get(index);
}
final long addr = getRawRecord(index);
if( addr == IRawStore.NULL) {
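// A NULL address means the value is stored inline in the leaf's values raba.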
return getValues().get(index);
}
final ByteBuffer tmp = btree.readRawRecord(addr);
if (tmp.hasArray() && tmp.arrayOffset() == 0 && tmp.position() == 0
&& tmp.limit() == tmp.capacity()) {
/*
* Return the backing array.
*/
return tmp.array();
}
/*
* Copy the data into a byte[].
*/
final int len = tmp.remaining();
final byte[] a = new byte[len];
tmp.get(a);
return a;
}
final public long getVersionTimestamp(final int index) {
return data.getVersionTimestamp(index);
}
final public long getRawRecord(final int index) {
return data.getRawRecord(index);
}
final public boolean hasDeleteMarkers() {
return data.hasDeleteMarkers();
}
final public boolean hasVersionTimestamps() {
return data.hasVersionTimestamps();
}
final public long getMinimumVersionTimestamp() {
return data.getMinimumVersionTimestamp();
}
final public long getMaximumVersionTimestamp() {
return data.getMaximumVersionTimestamp();
}
final public boolean hasRawRecords() {
return data.hasRawRecords();
}
final public boolean isDoubleLinked() {
return data.isDoubleLinked();
}
final public long getPriorAddr() {
return data.getPriorAddr();
}
final public long getNextAddr() {
return data.getNextAddr();
}
/**
* De-serialization constructor.
*
* Note: The de-serialization constructor (and ONLY the de-serialization
* constructor) ALWAYS creates a clean leaf. Therefore the {@link PO#dirty}
* flag passed up from this constructor has the value false.
*
* @param btree
* The tree to which the leaf belongs.
* @param addr
* The address of this leaf.
* @param data
* The data record.
*/
protected Leaf(final AbstractBTree btree, final long addr,
final ILeafData data) {
super(btree, false /* The leaf is NOT dirty. */);
assert data != null;
/*
* Cross check flags against the B+Tree when we wrap the record in a
* Leaf.
*/
assert data.hasDeleteMarkers() == btree.getIndexMetadata()
.getDeleteMarkers();
assert data.hasVersionTimestamps() == btree.getIndexMetadata()
.getVersionTimestamps();
setIdentity(addr);
this.data = data;
// // must clear the dirty since we just de-serialized this leaf.
// setDirty(false);
// // Add to the hard reference queue.
// btree.touch(this);
}
/**
* Creates a new mutable leaf.
*
* @param btree
* A mutable B+Tree.
*/
protected Leaf(final AbstractBTree btree) {
super(btree, true /*dirty*/ );
final IndexMetadata md = btree.getIndexMetadata();
data = new MutableLeafData(//
btree.branchingFactor, //
md.getVersionTimestamps(),//
md.getDeleteMarkers(),//
md.getRawRecords()//
);
// final int branchingFactor = btree.branchingFactor;
//
// this.keys = new MutableKeyBuffer(branchingFactor + 1);
//
// values = new MutableValueBuffer(0/* size */,
// new byte[branchingFactor + 1][]);
//
// if(btree.getIndexMetadata().getVersionTimestamps()) {
//
// versionTimestamps = new long[branchingFactor + 1];
//
// }
//
// if (btree.getIndexMetadata().getDeleteMarkers()) {
//
// deleteMarkers = new boolean[branchingFactor + 1];
//
// }
// /*
// * Add to the hard reference queue. If the queue is full, then this will
// * force the incremental write whatever gets evicted from the queue.
// */
// btree.touch(this);
}
/**
* Copy constructor.
*
* @param src
* The source node (must be immutable).
*
* @see AbstractNode#copyOnWrite()
*/
protected Leaf(final Leaf src) {
super(src);
assert !src.isDirty();
assert src.isReadOnly();
// assert src.isPersistent();
// steal/clone the data record.
this.data = src.isReadOnly() ? new MutableLeafData(src
.getBranchingFactor(), src.data) : src.data;
// clear reference on source.
src.data = null;
// /*
// * Steal/copy the keys.
// *
// * Note: The copy constructor is invoked when we need to begin mutation
// * operations on an immutable node or leaf, so make sure that the keys
// * are mutable.
// */
// {
//
//// nkeys = src.nkeys;
//
// if (src.getKeys() instanceof MutableKeyBuffer) {
//
// keys = src.getKeys();
//
// } else {
//
// keys = new MutableKeyBuffer(src.getBranchingFactor(), src
// .getKeys());
//
// }
//
// // release reference on the source node.
//// src.nkeys = 0;
// src.keys = null;
//
// }
//
//// /*
//// * Steal the values[].
//// */
////
//// // steal reference and clear reference on the source node.
//// values = src.values;
//
// /*
// * Steal/copy the values[].
// *
// * Note: The copy constructor is invoked when we need to begin mutation
// * operations on an immutable node or leaf, so make sure that the values
// * are mutable.
// */
// {
//
// if (src.values instanceof MutableValueBuffer) {
//
// values = src.values;
//
// } else {
//
// values = new MutableValueBuffer(src.getBranchingFactor(),
// src.values);
//
// }
//
// // release reference on the source node.
// src.values = null;
//
// }
//
// versionTimestamps = src.versionTimestamps;
//
// deleteMarkers = src.deleteMarkers;
// // Add to the hard reference queue.
// btree.touch(this);
}
@Override
public void delete() {
/*
* Note: This event MUST go out before we clear [leafListeners].
*
* Note: Since we fire this event here we do NOT need to fire it
* explicitly after a copy-on-write since copy-on-write ALWAYS calls
* delete() on the original leaf if it makes a copy.
*/
fireInvalidateLeafEvent();
super.delete();
// clear references.
data = null;
// keys = null;
//
// values = null;
//
// versionTimestamps = null;
//
// deleteMarkers = null;
leafListeners = null;
}
/**
* Insert or update an entry in the leaf as appropriate. The caller MUST
* ensure by appropriate navigation of parent nodes that the key for the
* next tuple either exists in or belongs in this leaf. If the leaf
* overflows then it is split after the insert.
*
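* A typical conditional insert might look like the following minimal,
* hedged sketch, where [key] and [value] are the tuple's key and value,
* [keyTuple] is a caller-allocated Tuple used to report a pre-existing
* value, and [revisionTimestamp] is a revision timestamp obtained as
* described in the class documentation (all of these names are
* assumptions, not defined here):
*
* <pre>
* final Tuple oldTuple = leaf.insert(key, value,
*         false, // delete
*         true,  // putIfAbsent
*         revisionTimestamp, keyTuple);
* if (oldTuple != null) {
*     // An entry already existed under [key]; the leaf was not modified.
* }
* </pre>
*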
* FIXME maintain min/max version timestamps.
*/
@Override
public Tuple insert(final byte[] searchKey, final byte[] newval,
final boolean delete, final boolean putIfAbsent, final long timestamp, final Tuple tuple) {
if (delete && !data.hasDeleteMarkers()) {
/*
* You may not specify the delete flag unless delete markers are
* being maintained.
*/
throw new UnsupportedOperationException();
}
if(btree.debug) assertInvariants();
// btree.touch(this); // Note: Invoked by copyOnWrite() (immediately below)
int entryIndex = Integer.MAX_VALUE; // shut up the compiler. complains about not assigned on all code paths.
if (putIfAbsent) {
/*
* putIfAbsent code path. We look for the search key in the leaf
* before triggering copy-on-write. If the key is found, then we
* will (optionally) return the value under the key and WILL NOT
* modify the leaf.
*
* Note: If we search for the entryIndex here, then we DO NOT
* search again below.
*
* See BLZG-1539
*/
entryIndex = this.getKeys().search(searchKey);
if (entryIndex >= 0) {
// found entry for that key.
if (!hasDeleteMarkers() || !getDeleteMarker(entryIndex)) {
/*
* Found an existing (non-deleted) entry under the key.
*
* Do NOT mutate the leaf.
*/
// copy tuple (optional).
if (tuple != null)
tuple.copy(entryIndex, this);
// return caller's tuple.
return tuple;
}
}
}
/*
* Note: This is one of the few gateways for mutation of a leaf via the
* main btree API (insert, lookup, delete). By ensuring that we have a
* mutable leaf here, we can assert that the leaf must be mutable in
* other methods.
*/
final Leaf copy = (Leaf) copyOnWrite();
if (copy != this) {
/*
* This leaf has been copied so delegate the operation to the new
* leaf.
*
* Note: copy-on-write deletes [this] leaf and delete() notifies any
* leaf listeners before it clears the [leafListeners] reference so
* not only don't we have to do that here, but we can't since the
* listeners would be cleared before we could fire off the event
* ourselves.
*
* Note: putIfAbsent() is handled above without triggering
* copy-on-write, so when we did trigger copy-on-write we clear the
* flag and the mutation is applied unconditionally to the copy of
* the leaf data.
*/
return copy.insert(searchKey, newval, delete, false/*putIfAbsent*/, timestamp, tuple);
}
/*
* Search for the key.
*
* Note: We do NOT search before triggering copy-on-write for an object
* index since an insert/update always triggers a mutation.
*/
// look for the search key in the leaf.
if (!putIfAbsent) {
/*
* When putIfAbsent is true, we already found the entryIndex above.
* Otherwise we find it now. See BLZG-1539.
*/
entryIndex = this.getKeys().search(searchKey);
}
if (entryIndex >= 0) {
/*
* The key is already present in the leaf, so we are updating an
* existing entry.
*/
if (tuple != null) {
/*
* Copy data and metadata for the old value stored under the
* search key.
*/
tuple.copy(entryIndex, this);
}
// Tunnel through to the mutable object.
final MutableLeafData data = (MutableLeafData) this.data;
/*
* Update the entry on the leaf.
*/
if (hasRawRecords()) {
/*
* Note: If the old value was a raw record, we need to delete
* that raw record now.
*
* Note: If the new value will be a raw record, we need to write
* that raw record onto the store now and save its address into
* the values[] raba.
*/
final long oaddr = getRawRecord(entryIndex);
if(oaddr != IRawStore.NULL) {
btree.deleteRawRecord(oaddr);
}
final long maxRecLen = btree.getMaxRecLen();
if (newval != null && newval.length > maxRecLen) {
// write the value on the backing store.
final long naddr = btree.writeRawRecord(newval);
// save its address in the values raba.
data.vals.values[entryIndex] = ((BTree) btree)
.encodeRecordAddr(naddr);
// flag as a raw record.
data.rawRecords[entryIndex] = true;
} else {
data.vals.values[entryIndex] = newval;
data.rawRecords[entryIndex] = false;
}
} else {
data.vals.values[entryIndex] = newval;
}
if (data.deleteMarkers != null) {
if (!data.deleteMarkers[entryIndex] && delete) {
/*
* Changing from a non-deleted to a deleted tuple (we don't
* count re-deletes of an already deleted tuple).
*/
btree.getBtreeCounters().ntupleUpdateDelete++;
} else if(!delete) {
/*
* Either changing from a deleted to a non-deleted tuple or
* just overwriting an existing non-deleted tuple.
*/
btree.getBtreeCounters().ntupleUpdateValue++;
}
data.deleteMarkers[entryIndex] = delete;
} else {
/*
* Update value for existing tuple (delete markers are not in
* use).
*/
btree.getBtreeCounters().ntupleUpdateValue++;
}
if (data.versionTimestamps != null) {
boolean propagateMinMax = false;
data.versionTimestamps[entryIndex] = timestamp;
if (data.minimumVersionTimestamp > timestamp) {
data.minimumVersionTimestamp = timestamp;
propagateMinMax = true;
}
if (data.maximumVersionTimestamp < timestamp) {
data.maximumVersionTimestamp = timestamp;
propagateMinMax = true;
}
if (propagateMinMax && parent != null) {
parent.get().updateMinMaxVersionTimestamp(this);
}
}
// // notify any listeners that this tuple's state has been changed.
// fireInvalidateTuple(entryIndex);
// return the old value.
return tuple;
}
/*
* The insert goes into this leaf.
*/
// Convert the position to obtain the insertion point.
entryIndex = -entryIndex - 1;
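// Note: IRaba.search() follows the Arrays.binarySearch() contract, so a
// negative result encodes -(insertionPoint) - 1 for a key which is not found.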
// insert an entry under that key.
{
final int nkeys = getKeyCount();
if (entryIndex < nkeys) {
/* index = 2;
* nkeys = 6;
*
* [ 0 1 2 3 4 5 ]
* ^ index
*
* count = keys - index = 4;
*/
final int count = nkeys - entryIndex;
assert count >= 1;
copyDown(entryIndex, count);
}
/*
* Insert at index.
*/
// Tunnel through to the mutable object.
final MutableLeafData data = (MutableLeafData) this.data;
final MutableKeyBuffer keys = data.keys;
final MutableValueBuffer vals = data.vals;
// copyKey(entryIndex, searchKeys, tupleIndex);
keys.keys[entryIndex] = searchKey; // note: presumes caller does not reuse the searchKeys!
if (hasRawRecords()) {
final long maxRecLen = btree.getMaxRecLen();
if (newval != null && newval.length > maxRecLen) {
// write the value on the backing store.
final long naddr = btree.writeRawRecord(newval);
// save its address in the values raba.
data.vals.values[entryIndex] = ((BTree) btree)
.encodeRecordAddr(naddr);
// flag as a raw record.
data.rawRecords[entryIndex] = true;
} else {
data.vals.values[entryIndex] = newval;
data.rawRecords[entryIndex] = false;
}
} else {
vals.values[entryIndex] = newval;
}
if (data.deleteMarkers != null) {
if (delete) {
// Inserting a deleted tuple.
btree.getBtreeCounters().ntupleInsertDelete++;
} else if (!delete) {
// Inserting a non-deleted tuple.
btree.getBtreeCounters().ntupleInsertValue++;
}
data.deleteMarkers[entryIndex] = delete;
} else {
// Inserting a tuple (delete markers not in use).
btree.getBtreeCounters().ntupleInsertValue++;
}
if (data.versionTimestamps != null) {
data.versionTimestamps[entryIndex] = timestamp;
if (data.minimumVersionTimestamp > timestamp)
data.minimumVersionTimestamp = timestamp;
if (data.maximumVersionTimestamp < timestamp)
data.maximumVersionTimestamp = timestamp;
}
/*nkeys++;*/keys.nkeys++; vals.nvalues++;
}
// one more entry in the btree.
((BTree)btree).nentries++;
if( parent != null ) {
// update spanned tuple count and min/max version timestamp.
parent.get().updateEntryCount(this, 1);
}
// if (INFO) {
// log.info("this="+this+", key="+key+", value="+entry);
// if(DEBUG) {
// System.err.println("this"); dump(Level.DEBUG,System.err);
// }
// }
if (data.getKeyCount() == maxKeys() + 1) {
/*
* The insert caused the leaf to overflow, so now we split the leaf.
*/
final Leaf rightSibling = (Leaf) split();
// assert additional invariants post-split.
if(btree.debug) {
rightSibling.assertInvariants();
getParent().assertInvariants();
}
}
// assert invariants post-split.
if(btree.debug) assertInvariants();
/*
* Notify any listeners that the tuples found in the leaf have been
* changed (one was added but others will have been moved into a new
* right sibling if the leaf was split).
*/
fireInvalidateLeafEvent();
// return null since there was no pre-existing entry.
return null;
}
@Override
public Tuple lookup(final byte[] searchKey, final Tuple tuple) {
btree.touch(this);
final int entryIndex = getKeys().search(searchKey);
if (entryIndex < 0) {
// Not found.
return null;
}
// Found.
tuple.copy(entryIndex, this);
return tuple;
}
@Override
public long indexOf(final byte[] key) {
btree.touch(this);
return (long) getKeys().search(key);
}
@Override
public byte[] keyAt(final long entryIndex) {
rangeCheck2(entryIndex);
return getKeys().get((int) entryIndex);
}
@Override
public void valueAt(final long entryIndex, final Tuple tuple) {
rangeCheck2(entryIndex);
tuple.copy((int) entryIndex, this);
}
/** Used for the {@link ILinearList} methods. */
final protected boolean rangeCheck2(final long index)
throws IndexOutOfBoundsException {
if (index > Integer.MAX_VALUE)
throw new IndexOutOfBoundsException();
return rangeCheck((int) index);
}
/** Used for methods which have an index into the leaf. */
final protected boolean rangeCheck(final int index)
throws IndexOutOfBoundsException {
final int nkeys = data.getKeyCount();
if (index < 0 || index >= nkeys) {
throw new IndexOutOfBoundsException("index=" + index + ", nkeys="
+ nkeys);
}
return true;
}
/**
*
* Split an over-capacity leaf (a leaf with maxKeys+1 keys), creating a new
* rightSibling. The splitIndex (the index of the first key to move to the
* rightSibling) is (maxKeys+1)/2. The separatorKey inserted into the parent
* is the shortest key which is greater than the key at [splitIndex-1] and
* less than or equal to the key at [splitIndex].
* All keys and values starting with splitIndex are moved to the new
* rightSibling. If this leaf is the root of the tree (no parent), then a
* new root {@link Node} is created without any keys and is made the parent
* of this leaf. In any case, we then insert( separatorKey, rightSibling )
* into the parent node, which may cause the parent node itself to split.
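*
* For example, with maxKeys = 4 an over-capacity leaf holds 5 tuples;
* splitIndex = (4 + 1) / 2 = 2, so the tuples at indices 2..4 move to the
* new rightSibling and the separatorKey is chosen between the keys at
* indices 1 and 2.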
*
*
* @return The new rightSibling leaf.
*
* FIXME maintain min/max version timestamps.
*/
@Override
protected IAbstractNode split() {
final int maxKeys = this.maxKeys();
// MUST be mutable.
assert isDirty();
// MUST be an overflow.
assert getKeyCount() == maxKeys + 1;
final BTree btree = (BTree) this.btree;
btree.getBtreeCounters().leavesSplit++;
// #of entries in the leaf before it is split.
final int nentriesBeforeSplit = getKeyCount();
/*
* The splitIndex is the index of the first key/value to move to the new
* rightSibling.
*/
final int splitIndex = (maxKeys + 1) / 2;
/*
* The separatorKey is the shortest key that is less than or equal to
* the key at the splitIndex and greater than the key at [splitIndex-1].
* This also serves as the separator key when we insert( separatorKey,
* rightSibling ) into the parent.
*/
// final byte[] separatorKey = getKeys().get(splitIndex);
final byte[] separatorKey = BytesUtil.getSeparatorKey(//
getKeys().get(splitIndex),//
getKeys().get(splitIndex - 1)//
);
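// Illustration only (assuming unsigned lexicographic byte order): for a
// prior key [5,2,9] and a split key [5,3,1] the shortest separator is [5,3],
// which is greater than the prior key and less than or equal to the split key.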
if (getParent() != null) {
/*
* Note: This code block was introduced to track down an error
* observed where a leaf split chose a separator key which already
* existed in the parent node. However, I am beginning to suspect
* that the error was introduced by a cache consistency problem
* (since fixed) in the LRUNexus.
*/
// the index of this leaf in the parent.
final int leafIndex = getParent().getIndexOf(this);
// the index of the proposed separator key in the parent (should not
// exist).
final int separatorIndex = getParent().getKeys().search(
separatorKey);
if (separatorIndex >= 0) {
/*
* The separator key should not be pre-existing in the parent.
*/
throw new AssertionError("Split on existing key: leafIndex="
+ leafIndex + ", splitIndexInLeaf=" + splitIndex
+ ", separatorIndexInParent=" + separatorIndex
+ ", separatorKey=" + keyAsString(separatorKey)
+ "\nparent=" + getParent() + "\nleaf=" + this);
}
}
// The new rightSibling of this leaf (this will be a mutable leaf).
final Leaf rightSibling = new Leaf(btree);
// Tunnel through to the mutable objects.
final MutableLeafData data = (MutableLeafData) this.data;
final MutableLeafData sdata = (MutableLeafData) rightSibling.data;
// increment #of leaves in the tree.
btree.nleaves++;
if (DEBUG) {
log.debug("this=" + this + ", nkeys=" + getKeyCount() + ", splitIndex="
+ splitIndex + ", separatorKey="
+ keyAsString(separatorKey)
);
// if(DEBUG) dump(Level.DEBUG,System.err);
}
int j = 0;
for (int i = splitIndex; i <= maxKeys; i++, j++) {
// copy key and value to the new leaf.
// rightSibling.setKey(j, getKey(i));
rightSibling.copyKey(j, this.getKeys(), i);
sdata.vals.values[j] = data.vals.values[i];
if (data.deleteMarkers != null) {
sdata.deleteMarkers[j] = data.deleteMarkers[i];
}
if (data.versionTimestamps != null) {
sdata.versionTimestamps[j] = data.versionTimestamps[i];
}
if (data.rawRecords != null) {
sdata.rawRecords[j] = data.rawRecords[i];
}
// clear out the old keys and values.
data.keys.keys[i] = null;
data.vals.values[i] = null;
if (data.deleteMarkers != null)
data.deleteMarkers[i] = false;
if (data.versionTimestamps != null)
data.versionTimestamps[i] = 0L;
if (data.rawRecords != null)
data.rawRecords[i] = false;
// one less key here.
/* nkeys--; */
data.keys.nkeys--;
data.vals.nvalues--;
// one more key there.
/*rightSibling.nkeys++;*/
sdata.keys.nkeys++;
sdata.vals.nvalues++;
}
/*
* Recalculate the version timestamps. This is easier than tracking the
* changes on a per-tuple basis in the loop above, and we would have to
* recalculate the version timestamps anyway if we moved a tuple whose
* value is equal to the min or max.
*/
if (data.versionTimestamps != null)
data.recalcMinMaxVersionTimestamp();
if (sdata.versionTimestamps != null)
sdata.recalcMinMaxVersionTimestamp();
/*
* Now consider the parent. It will have to be updated. If there is no
* parent (if this is the root leaf), then we need to create that
* parent.
*/
Node p = getParent();
if (p == null) {
/*
* Use a special constructor to split the root leaf. The result is a
* new node with zero keys and one child (this leaf). The #of entries
* spanned by the new root node is the same as the #of entries found
* on this leaf _before_ the split.
*/
p = new Node((BTree) btree, this, nentriesBeforeSplit);
} else {
assert !p.isReadOnly();
// FIXME must update min/max on parent, which requires a child scan :-(
// this leaf now has fewer entries
((MutableNodeData) p.data).childEntryCounts[p.getIndexOf(this)] -= rightSibling
.getKeyCount();
if (p != btree.root && p.isRightMostNode()) {
/*
* If the leaf that is split is a child of the right most node
* in the tree then that is counted as a "tail split".
*
* Note: We DO NOT count tail splits when the leaf is the root
* leaf and we DO NOT count tail splits when the parent of the
* leaf is the root leaf. In both of those cases any leaf split
* would qualify, which is too liberal to be a useful measure.
*
* Note: The ratio of tail splits to leaf splits may be used as
* an indication of a pattern of index writes that bears heavily
* on the tail of the index.
*/
btree.getBtreeCounters().tailSplit++;
} else if (p != btree.root && p.isLeftMostNode()) {
/*
* If the leaf that is split is a child of the left-most node in
* the tree then that is counted as a "head split".
*
* Note: We DO NOT count head splits when the leaf is the root
* leaf and we DO NOT count head splits when the parent of the
* leaf is the root leaf. In both of those cases any leaf split
* would qualify, which is too liberal to be a useful measure.
*
* Note: The ratio of head splits to leaf splits may be used as
* an indication of a pattern of index writes that bears heavily
* on the head of the index.
*/
btree.getBtreeCounters().headSplit++;
}
}
/*
* Insert(splitKey,rightSibling) into the parent node. This may cause
* the parent node itself to split.
*
* Note: This operation can not cause the min/max on the parent Node
* to change. However, insertChild() does need to record the min/max
* for the new rightSibling.
*/
p.insertChild(separatorKey, rightSibling);
// Return the high leaf.
return rightSibling;
}
/**
* Redistributes a key from the specified sibling into this leaf in order to
* bring this leaf up to the minimum #of keys. This also updates the
* separator key in the parent for the right most of (this, sibling). While
* the #of entries spanned by the children of the common parent is changed
* by this method note that there is no net change in the #of entries
* spanned by that parent node.
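*
* For example, with minKeys = 2 a deficient leaf holding one key borrows a
* key from a sibling holding three keys; afterwards both leaves hold two
* keys and the separator key in the parent is updated to reflect the new
* boundary.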
*
* @param sibling
* A direct sibling of this leaf (either the left or right
* sibling). The sibling MUST be mutable.
*
* @todo Modify to always choose the shortest separator key from within a
* region in which the split is reasonable. This will help keep down
* the size of the separator keys in the nodes.
*
* FIXME maintain min/max version timestamps.
*/
@Override
protected void redistributeKeys(final AbstractNode sibling,
final boolean isRightSibling) {
// the sibling of a leaf must be a leaf.
final Leaf s = (Leaf) sibling;
assert s != null;
final int nkeys = this.getKeyCount();
final int snkeys = s.getKeyCount();
final int minKeys = this.minKeys();
assert dirty;
assert !deleted;
assert !isPersistent();
// verify that this leaf is deficient.
assert nkeys < minKeys;
// verify that this leaf is under minimum capacity by one key.
assert nkeys == minKeys - 1;
// the sibling MUST be _OVER_ the minimum #of keys/values.
assert snkeys > minKeys;
assert s.dirty;
assert !s.deleted;
assert !s.isPersistent();
final Node p = getParent();
// children of the same node.
assert s.getParent() == p;
if (DEBUG) {
log.debug("this="+this+", sibling="+sibling+", rightSibling="+isRightSibling);
// if(DEBUG) {
// System.err.println("this"); dump(Level.DEBUG,System.err);
// System.err.println("sibling"); sibling.dump(Level.DEBUG,System.err);
// System.err.println("parent"); p.dump(Level.DEBUG,System.err);
// }
}
/*
* The index of this leaf in its parent. We note this before we
* start mucking with the keys.
*/
final int index = p.getIndexOf(this);
// Tunnel through to the mutable objects.
final MutableLeafData data = (MutableLeafData) this.data;
final MutableLeafData sdata = (MutableLeafData) s.data;
final MutableNodeData pdata = (MutableNodeData) p.data;
final MutableKeyBuffer keys = data.keys;
final MutableKeyBuffer skeys = sdata.keys;
final MutableValueBuffer vals = data.vals;
final MutableValueBuffer svals = sdata.vals;
/*
* Determine which leaf is earlier in the key ordering and get the
* index of the sibling.
*/
if (isRightSibling) {
/*
* redistributeKeys(this,rightSibling). all we have to do is move
* the first key from the rightSibling to the end of the keys in
* this leaf. we then close up the hole that this left at index 0 in
* the rightSibling. finally, we update the separator key for the
* rightSibling to the new key in its first index position.
*/
// copy the first key from the rightSibling.
// setKey(nkeys, s.getKey(0));
copyKey(nkeys, s.getKeys(), 0);
vals.values[nkeys] = svals.values[0];
if (data.deleteMarkers != null)
data.deleteMarkers[nkeys] = sdata.deleteMarkers[0];
boolean updateMinMaxVersionTimestampOnSibling = false;
if (data.versionTimestamps != null) {
final long t = sdata.versionTimestamps[0];
data.versionTimestamps[nkeys] = t;
if (t < data.minimumVersionTimestamp)
data.minimumVersionTimestamp = t;
if (t > data.maximumVersionTimestamp)
data.maximumVersionTimestamp = t;
if (t == sdata.minimumVersionTimestamp
|| t == sdata.maximumVersionTimestamp)
updateMinMaxVersionTimestampOnSibling = true;
}
if (data.rawRecords != null)
data.rawRecords[nkeys] = sdata.rawRecords[0];
// copy down the keys on the right sibling to cover up the hole.
System.arraycopy(skeys.keys, 1, skeys.keys, 0, snkeys-1);
System.arraycopy(svals.values, 1, svals.values, 0, snkeys-1);
if(data.deleteMarkers!=null)
System.arraycopy(sdata.deleteMarkers, 1, sdata.deleteMarkers, 0, snkeys-1);
if(data.versionTimestamps!=null)
System.arraycopy(sdata.versionTimestamps, 1, sdata.versionTimestamps, 0, snkeys-1);
if(data.rawRecords !=null)
System.arraycopy(sdata.rawRecords, 1, sdata.rawRecords, 0, snkeys-1);
// erase exposed key/value on rightSibling that is no longer defined.
skeys.keys[snkeys-1] = null;
svals.values[snkeys-1] = null;
if (data.deleteMarkers != null)
sdata.deleteMarkers[snkeys - 1] = false;
if (data.versionTimestamps != null)
sdata.versionTimestamps[snkeys - 1] = 0L;
if (data.rawRecords != null)
sdata.rawRecords[snkeys - 1] = false;
/*s.nkeys--;*/ skeys.nkeys--; svals.nvalues--;
/*this.nkeys++;*/keys.nkeys++; vals.nvalues++;
if(updateMinMaxVersionTimestampOnSibling)
sdata.recalcMinMaxVersionTimestamp();
// update the separator key for the rightSibling.
// p.setKey(index, s.getKey(0));
p.copyKey(index, s.getKeys(), 0);
// update parent : one more key on this child.
pdata.childEntryCounts[index]++;
// update parent : one less key on our right sibling.
pdata.childEntryCounts[index + 1]--;
// FIXME update min/max on parent for this leaf and the rightSibling.
if (btree.debug) {
assertInvariants();
s.assertInvariants();
}
} else {
/*
* redistributeKeys(leftSibling,this). all we have to do is copy
* down the keys in this leaf by one position and move the last key
* from the leftSibling into the first position in this leaf. We
* then replace the separation key for this leaf on the parent with
* the key that we copied from the leftSibling.
*/
// copy down by one.
System.arraycopy(keys.keys, 0, keys.keys, 1, nkeys);
System.arraycopy(vals.values, 0, vals.values, 1, nkeys);
if(data.deleteMarkers!=null)
System.arraycopy(data.deleteMarkers, 0, data.deleteMarkers, 1, nkeys);
if(data.versionTimestamps!=null)
System.arraycopy(data.versionTimestamps, 0, data.versionTimestamps, 1, nkeys);
if(data.rawRecords!=null)
System.arraycopy(data.rawRecords, 0, data.rawRecords, 1, nkeys);
// move the last key/value from the leftSibling to this leaf (copy, then clear).
// copy.
// setKey(0, s.getKey(s.nkeys-1));
copyKey(0,s.getKeys(),snkeys-1);
vals.values[0] = svals.values[snkeys-1];
if (data.deleteMarkers != null)
data.deleteMarkers[0] = sdata.deleteMarkers[snkeys - 1];
// if (data.versionTimestamps != null)
// data.versionTimestamps[0] = sdata.versionTimestamps[snkeys - 1];
boolean updateMinMaxVersionTimestampOnSibling = false;
if (data.versionTimestamps != null) {
final long t = sdata.versionTimestamps[snkeys - 1];
data.versionTimestamps[0] = t;
if (t < data.minimumVersionTimestamp)
data.minimumVersionTimestamp = t;
if (t > data.maximumVersionTimestamp)
data.maximumVersionTimestamp = t;
if (t == sdata.minimumVersionTimestamp
|| t == sdata.maximumVersionTimestamp)
updateMinMaxVersionTimestampOnSibling = true;
}
if (data.rawRecords != null)
data.rawRecords[0] = sdata.rawRecords[snkeys - 1];
// clear
skeys.keys[snkeys-1] = null;
svals.values[snkeys-1] = null;
if (data.deleteMarkers != null)
sdata.deleteMarkers[snkeys - 1] = false;
if (data.versionTimestamps != null)
sdata.versionTimestamps[snkeys - 1] = 0L;
if (data.rawRecords != null)
sdata.rawRecords[snkeys - 1] = false;
/*s.nkeys--;*/ skeys.nkeys--; svals.nvalues--;
/*this.nkeys++;*/ keys.nkeys++; vals.nvalues++;
if(updateMinMaxVersionTimestampOnSibling)
sdata.recalcMinMaxVersionTimestamp();
// update the separator key for this leaf.
// p.setKey(index-1,getKey(0));
p.copyKey(index-1, this.getKeys(), 0);
// update parent : one more key on this child.
pdata.childEntryCounts[index]++;
// update parent : one less key on our left sibling.
pdata.childEntryCounts[index-1]--;
// FIXME update min/max on parent for this leaf and the rightSibling.
if (btree.debug) {
assertInvariants();
s.assertInvariants();
}
}
}
/**
* Merge the keys and values from the sibling into this leaf, delete the
* sibling from the store and remove the sibling from the parent. This will
* trigger recursive {@link AbstractNode#join()} if the parent node is now
* deficient. While this changes the #of entries spanned by the current node
* it does NOT affect the #of entries spanned by the parent. Likewise, while
* the min/max tuple revision timestamp may change for this {@link Leaf}, it
* WILL NOT change for its parent {@link Node} (since this operation does
* not remove any tuples).
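*
* For example, with minKeys = 2 a deficient leaf holding one key is merged
* with a sibling holding exactly two keys, leaving a single leaf with three
* keys; the sibling is then removed from the parent and deleted from the
* store.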
*
* @param sibling
* A direct sibling of this leaf (does NOT need to be mutable).
* The sibling MUST have exactly the minimum #of keys.
*
* FIXME maintain min/max version timestamps.
*/
@Override
protected void merge(final AbstractNode sibling,
final boolean isRightSibling) {
// the sibling of a leaf must be a leaf.
final Leaf s = (Leaf)sibling;
assert s != null;
final int nkeys = this.getKeyCount();
final int snkeys = s.getKeyCount();
assert !s.deleted;
// verify that this leaf is deficient.
assert nkeys < minKeys();
// verify that this leaf is under minimum capacity by one key.
assert nkeys == minKeys() - 1;
// the sibling MUST be at the minimum #of keys/values.
assert snkeys == s.minKeys();
final Node p = getParent();
// children of the same node.
assert s.getParent() == p : "this.parent="
+ (p == null ? null : p)
+ " != s.parent="
+ (s.getParent() == null ? null : s.getParent());
if (DEBUG) {
log.debug("this="+this+", sibling="+sibling+", rightSibling="+isRightSibling);
// if(DEBUG) {
// System.err.println("this"); dump(Level.DEBUG,System.err);
// System.err.println("sibling"); sibling.dump(Level.DEBUG,System.err);
// System.err.println("parent"); p.dump(Level.DEBUG,System.err);
// }
}
/*
* The index of this leaf in its parent. We note this before we
* start mucking with the keys.
*/
final int index = p.getIndexOf(this);
/*
* Tunnel through to the mutable data records.
*
* Note: We do not require the sibling to be mutable. If it is not, then
* we create a mutable copy of the sibling for use during this method.
*/
final MutableLeafData data = (MutableLeafData) this.data;
final MutableLeafData sdata = s.isReadOnly() ? new MutableLeafData(
getBranchingFactor(), s.data) : (MutableLeafData) s.data;
final MutableNodeData pdata = (MutableNodeData) p.data;
/*
* Determine which leaf is earlier in the key ordering so that we know
* whether the sibling's keys will be inserted at the front of this
* leaf's keys or appended to this leaf's keys.
*/
if( isRightSibling /*keys[nkeys-1] < s.keys[0]*/) {
/*
* merge( this, rightSibling ). the keys and values from this leaf
* will appear in their current position and the keys and values
* from the rightSibling will be appended after the last key/value
* in this leaf.
*/
/*
* Copy in the keys and values from the sibling.
*/
System.arraycopy(sdata.keys.keys, 0, data.keys.keys, nkeys, snkeys);
System.arraycopy(sdata.vals.values, 0, data.vals.values, nkeys,
snkeys);
if (data.deleteMarkers != null) {
System.arraycopy(sdata.deleteMarkers, 0, data.deleteMarkers,
nkeys, snkeys);
}
if (data.versionTimestamps != null) {
System.arraycopy(sdata.versionTimestamps, 0,
data.versionTimestamps, nkeys, snkeys);
if (sdata.minimumVersionTimestamp < data.minimumVersionTimestamp)
data.minimumVersionTimestamp = sdata.minimumVersionTimestamp;
if (sdata.maximumVersionTimestamp > data.maximumVersionTimestamp)
data.maximumVersionTimestamp = sdata.maximumVersionTimestamp;
}
if (data.rawRecords != null) {
System.arraycopy(sdata.rawRecords, 0, data.rawRecords,
nkeys, snkeys);
}
/*
* Adjust the #of keys in this leaf.
*/
// this.nkeys += s.nkeys;
data.keys.nkeys += snkeys;
data.vals.nvalues += snkeys;
/*
* Note: in this case we have to replace the separator key for this
* leaf with the separator key for its right sibling.
*
* Note: This temporarily causes the duplication of a separator key
* in the parent. However, the separator key for the right sibling
* will be deleted when the sibling is removed from the parent
* below.
*/
// p.setKey(index, p.getKey(index+1));
p.copyKey(index, p.getKeys(), index+1 );
// reallocate spanned entries from the sibling to this node.
// FIXME Update min/max on the parent for this leaf.
pdata.childEntryCounts[index] += s.getKeyCount();
if(btree.debug) assertInvariants();
} else {
/*
* merge( leftSibling, this ). The keys and values from this leaf
* will be moved down by sibling.nkeys positions and then the keys
* and values from the sibling will be copied into this leaf
* starting at index zero(0).
*
* Note: we do not update the separator key in the parent because
* the separatorKey will be removed when we remove the leftSibling
* from the parent at the end of this method. This also has the
* effect of giving this leaf its left sibling's separatorKey.
*/
// move keys and values down by sibling.nkeys positions.
System.arraycopy(data.keys.keys, 0, data.keys.keys, snkeys, nkeys);
System.arraycopy(data.vals.values, 0, data.vals.values, snkeys,
nkeys);
if (data.deleteMarkers != null) {
System.arraycopy(data.deleteMarkers, 0, data.deleteMarkers,
snkeys, nkeys);
}
if (data.versionTimestamps != null) {
System.arraycopy(data.versionTimestamps, 0,
data.versionTimestamps, snkeys, nkeys);
}
if (data.rawRecords != null) {
System.arraycopy(data.rawRecords, 0, data.rawRecords,
snkeys, nkeys);
}
// copy keys and values from the sibling to index 0 of this leaf.
System.arraycopy(sdata.keys.keys, 0, data.keys.keys, 0, snkeys);
System.arraycopy(sdata.vals.values, 0, data.vals.values, 0, snkeys);
if (data.deleteMarkers != null) {
System.arraycopy(sdata.deleteMarkers, 0, data.deleteMarkers, 0,
snkeys);
}
if (data.versionTimestamps != null) {
System.arraycopy(sdata.versionTimestamps, 0,
data.versionTimestamps, 0, snkeys);
if (sdata.minimumVersionTimestamp < data.minimumVersionTimestamp)
data.minimumVersionTimestamp = sdata.minimumVersionTimestamp;
if (sdata.maximumVersionTimestamp > data.maximumVersionTimestamp)
data.maximumVersionTimestamp = sdata.maximumVersionTimestamp;
}
if (data.rawRecords != null) {
System.arraycopy(sdata.rawRecords, 0, data.rawRecords, 0,
snkeys);
}
// this.nkeys += s.nkeys;
data.keys.nkeys += snkeys;
data.vals.nvalues += snkeys;
// FIXME update min/max on the parent for this leaf.
// reallocate spanned entries from the sibling to this node.
pdata.childEntryCounts[index] += s.getKeyCount();
if(btree.debug) assertInvariants();
}
/*
* The sibling leaf is now empty. We need to detach the leaf from its
* parent node and then delete the leaf from the store.
*
* Note: We have already adjusted the min/max for this Leaf on the
* parent. The min/max for the parent itself will be unchanged by the
* merge(). Therefore this method need only clear out the min/max for
* the deleted child.
*/
p.removeChild(s);
}
/**
* Copies all keys and values from the specified start index down by one in
* order to make room to insert a key and value at that index.
*
* @param entryIndex
* The index of the first key and value to be copied.
* @param count
* The #of keys and values to be copied.
*/
protected void copyDown(final int entryIndex, final int count) {
/*
* copy down per-key data (#values == nkeys).
*/
// Tunnel through to the mutable keys and values objects.
final MutableLeafData data = (MutableLeafData) this.data;
final MutableKeyBuffer keys = data.keys;
final MutableValueBuffer vals = data.vals;
System.arraycopy(keys.keys, entryIndex, keys.keys, entryIndex + 1,
count);
System.arraycopy(vals.values, entryIndex, vals.values, entryIndex + 1,
count);
if (data.deleteMarkers != null) {
System.arraycopy(data.deleteMarkers, entryIndex,
data.deleteMarkers, entryIndex + 1, count);
}
if (data.versionTimestamps != null) {
System.arraycopy(data.versionTimestamps, entryIndex,
data.versionTimestamps, entryIndex + 1, count);
}
if (data.rawRecords != null) {
System.arraycopy(data.rawRecords, entryIndex,
data.rawRecords, entryIndex + 1, count);
}
/*
* Clear the entry at the index. This is partly a paranoia check and
* partly critical. Some per-key elements MUST be cleared and it is much
* safer (and quite cheap) to clear them during copyDown() rather than
* relying on maintenance elsewhere.
*/
keys.keys[entryIndex] = null;
vals.values[entryIndex] = null;
if (data.deleteMarkers != null) {
data.deleteMarkers[entryIndex] = false;
}
if (data.versionTimestamps != null) {
data.versionTimestamps[entryIndex] = 0L;
// Note: caller MUST update min/max if they are invalidated!
}
if (data.rawRecords != null) {
data.rawRecords[entryIndex] = false;
}
}
@Override
public Tuple remove(final byte[] key, final Tuple tuple) {
if(btree.debug) assertInvariants();
btree.touch(this);
final int entryIndex = getKeys().search(key);
if (entryIndex < 0) {
// Not found.
return null;
}
/*
* Note: This is one of the few gateways for mutation of a leaf via
* the main btree API (insert, lookup, delete). By ensuring that we
* have a mutable leaf here, we can assert that the leaf must be
* mutable in other methods.
*/
final Leaf copy = (Leaf) copyOnWrite();
if (copy != this) {
/*
* Note: This leaf was copied so delegate to the new leaf (the old
* leaf is now unused).
*
* Note: copy-on-write deletes [this] leaf and delete() notifies any
* leaf listeners before it clears the [leafListeners] reference so
* not only don't we have to do that here, but we can't since the
* listeners would be cleared before we could fire off the event
* ourselves.
*/
return copy.remove(key, tuple);
}
// // The value that is being removed.
// final Object oldval = this.values[entryIndex];
if (tuple != null) {
/*
* Copy data and metadata for the index entry that is being removed.
*/
tuple.copy(entryIndex, this);
}
if (data.hasDeleteMarkers()) {
/*
* This operation is not allowed when delete markers are being
* maintained. You use an insert(...) instead and specify delete :=
* true.
*/
throw new UnsupportedOperationException();
}
/*
* If the tuple was associated with a raw record address, then delete
* the raw record from the backing store.
*
* Note: The general copy-on-write contract of the B+Tree combined with
* the semantics of the WORM, RW, and scale-out persistence layers will
* ensure the actual delete of the raw record is deferred until the
* commit point from which the tuple was deleted is no longer visible.
*/
if (data.hasRawRecords()) {
final long addr = data.getRawRecord(entryIndex);
if (addr != IRawStore.NULL) {
btree.deleteRawRecord(addr);
}
}
// if (INFO) {
// log.info("this="+this+", key="+key+", value="+entry+", index="+entryIndex);
// if(DEBUG) {
// System.err.println("this"); dump(Level.DEBUG,System.err);
// }
// }
/*
* Copy over the hole created when the key and value were removed
* from the leaf.
*
* Given:
* keys : [ 1 2 3 4 ]
* vals : [ a b c d ]
*
* Remove(1):
* index := 0
* length = nkeys(4) - index(0) - 1 = 3;
*
* Remove(3):
* index := 2;
* length = nkeys(4) - index(2) - 1 = 1;
*
* Remove(4):
* index := 3
* length = nkeys(4) - index(3) - 1 = 0;
*
* Given:
* keys : [ 1 ]
* vals : [ a ]
*
* Remove(1):
* index := 0
* length = nkeys(1) - index(0) - 1 = 0;
*/
{
/*
* Copy down to cover up the hole.
*/
final int nkeys = getKeyCount();
final int length = nkeys - entryIndex - 1;
// Tunnel through to the mutable objects.
final MutableLeafData data = (MutableLeafData) this.data;
final MutableKeyBuffer keys = data.keys;
final MutableValueBuffer vals = data.vals;
if (length > 0) {
System.arraycopy(keys.keys, entryIndex + 1, keys.keys, entryIndex,
length);
System.arraycopy(vals.values, entryIndex + 1, vals.values,
entryIndex, length);
if (data.versionTimestamps != null) {
System.arraycopy(data.versionTimestamps, entryIndex + 1,
data.versionTimestamps, entryIndex, length);
}
if (data.rawRecords != null) {
System.arraycopy(data.rawRecords, entryIndex + 1,
data.rawRecords, entryIndex, length);
}
}
/*
* Erase the key/value that was exposed by this operation.
*/
keys.keys[nkeys - 1] = null;
vals.values[nkeys - 1] = null;
if (data.versionTimestamps != null) {
data.versionTimestamps[nkeys - 1] = 0L;
}
if (data.rawRecords != null) {
data.rawRecords[nkeys - 1] = false;
}
// One less key in the leaf.
/*nkeys--;*/ keys.nkeys--; vals.nvalues--;
// One less entry in the tree.
((BTree)btree).nentries--;
assert ((BTree)btree).nentries >= 0;
// One more deleted tuple.
btree.getBtreeCounters().ntupleRemove++;
if (data.versionTimestamps != null) {
/*
* If the tuple with the min/max version timestamp was removed
* then we need to recalculate the min/max version timestamp.
* This needs to happen after we update nkeys/nvalues (so the
* new min/max considers only the valid tuples) and before we
* update the spanned tuple counts on the parent (so the new
* min/max will be propagated correctly).
*/
final long oldVersionTimestamp = tuple.getVersionTimestamp();
if (oldVersionTimestamp == data.minimumVersionTimestamp
|| oldVersionTimestamp == data.maximumVersionTimestamp)
data.recalcMinMaxVersionTimestamp();
}
}
if( btree.root != this ) {
/*
* this is not the root leaf.
*/
// update entry count and min/max version timestamp on ancestors.
parent.get().updateEntryCount(this, -1);
if (data.getKeyCount() < minKeys()) {
/*
* The leaf is deficient. Join it with a sibling, causing their
* keys to be redistributed such that neither leaf is deficient.
* If there is only one other sibling and it has only the
* minimum #of values then the two siblings will be merged into
* a single leaf and their parent will have only a single child.
* Since the minimum #of children is two (2), having a single
* child makes the parent of this node deficient and it will be
* joined with one of its siblings. If necessary, this process
* will continue recursively up the tree. The root leaf never
* has any siblings and never experiences underflow so it may be
* legally reduced to zero values.
*
* Note that the minimum branching factor (3) and the invariants
* together guarantee that there is at least one sibling. Also
* note that the minimum #of children for a node with the
* minimum branching factor is two (2) so a valid tree never has
* a node with a single sibling.
*
* Note that we must invoke copy-on-write before modifying a
* sibling. However, the parent of the leaf MUST already be
* mutable (aka dirty) since that is a precondition for removing
* a key from the leaf. This means that copy-on-write will not
* force the parent to be cloned.
*/
join();
}
}
if(btree.debug) assertInvariants();
/*
* Notify any listeners that the tuple(s) in the leaf have been changed.
*/
fireInvalidateLeafEvent();
return tuple;
}
/**
* Visits this leaf, unless dirtyNodesOnly is true and this leaf is not dirty,
* or nodesOnly is true, in which case the returned iterator will not visit
* anything.
*
* {@inheritDoc}
*/
@Override
@SuppressWarnings("unchecked")
public Iterator postOrderNodeIterator(
final boolean dirtyNodesOnly, final boolean nodesOnly) {
if (dirtyNodesOnly && ! isDirty() ) {
return EmptyIterator.DEFAULT;
} else if(nodesOnly) {
return EmptyIterator.DEFAULT;
} else {
return new SingleValueIterator(this);
}
}
/**
* Visits this leaf.
*/
@Override
@SuppressWarnings("unchecked")
public Iterator postOrderIterator(final byte[] fromKey,
final byte[] toKey) {
return new SingleValueIterator(this);
}
/**
* Iterator visits the tuples in this leaf in key order.
*/
@Override
public ITupleIterator entryIterator() {
if (getKeys().isEmpty()) {
return EmptyTupleIterator.INSTANCE;
}
return new LeafTupleIterator(this);
}
@Override
public boolean dump(final Level level, final PrintStream out,
final int height, final boolean recursive) {
final boolean debug = level.toInt() <= Level.DEBUG.toInt();
// Set to false iff an inconsistency is detected.
boolean ok = true;
final int branchingFactor = this.getBranchingFactor();
final int nkeys = this.getKeyCount();
final int minKeys = this.minKeys();
final int maxKeys = this.maxKeys();
/*
* Since the index segment does not materialize the root when running a
* leaf cursor we can not rely on [btree.root != this].
*/
final boolean isRoot = (btree.root == this)
|| ((btree instanceof IndexSegment) && btree.getEntryCount() == 0);
if (!isRoot
&& (nkeys < minKeys)) {
/*
* Min keys failure.
*
* Note: the root may have fewer keys.
*/
out.println(indent(height) + "ERROR: too few keys: m="
+ branchingFactor + ", minKeys=" + minKeys + ", nkeys="
+ nkeys + ", isLeaf=" + isLeaf() + ", isRoot=" + isRoot);
ok = false;
}
if (nkeys > branchingFactor) {
// max keys failure.
out.println(indent(height) + "ERROR: too many keys: m="
+ branchingFactor + ", maxKeys=" + maxKeys + ", nkeys="
+ nkeys + ", isLeaf=" + isLeaf() + ", isRoot=" + isRoot);
ok = false;
}
if (height != -1 && height != btree.getHeight()) {
out.println(indent(height) + "WARN: height=" + height
+ ", but btree height=" + btree.getHeight());
ok = false;
}
// verify keys are monotonically increasing.
try {
assertKeysMonotonic();
} catch (AssertionError ex) {
out.println(indent(height) + " ERROR: "+ex);
ok = false;
}
if (debug || ! ok ) {
out.println(indent(height) + toString());
// out.println(indent(height) + " parent="
// + (parent == null ? null : parent.get()));
//
// out.println(indent(height) + " isRoot=" + (btree.root == this)
// + ", dirty=" + isDirty() + ", nkeys=" + nkeys
// + ", minKeys=" + minKeys + ", maxKeys=" + maxKeys
// + ", branchingFactor=" + branchingFactor);
//
// // Note: key format is dumped by its object.
// out.println(indent(height) + " keys=" + getKeys());
//
// // Note: signed byte[]s.
// out.println(indent(height) + " vals=" + getValues());
//
// if(hasDeleteMarkers()) {
//
// out.print(indent(height) + " deleted=[");
// for (int i = 0; i <= nkeys; i++) {
// if (i > 0)
// out.print(", ");
// out.print(getDeleteMarker(i));
// }
// out.println("]");
//
// }
//
// if(hasVersionTimestamps()) {
//
// out.print(indent(height) + " timestamps=[");
// for (int i = 0; i <= nkeys; i++) {
// if (i > 0)
// out.print(", ");
// out.print(getVersionTimestamp(i));
// }
// out.println("]");
//
// }
}
return ok;
}
// /**
// * Formats the data into a {@link String}.
// *
// * @param data
// * An array of signed byte arrays.
// */
// static private String toString(final int n, final IRaba data) {
//
// final StringBuilder sb = new StringBuilder();
//
// sb.append("data(n=" + n + ")={");
//
// for (int i = 0; i < n; i++) {
//
// final byte[] a = data.get(i);
//
// sb.append("\n");
//
// sb.append("data[" + i + "]=");
//
// sb.append(Arrays.toString(a));
//
// if (i + 1 < n)
// sb.append(",");
//
// }
//
// sb.append("}");
//
// return sb.toString();
//
// }
/**
* Human readable representation of the {@link ILeafData} plus transient
* information associated with the {@link Leaf}.
*/
@Override
public String toString() {
final StringBuilder sb = new StringBuilder();
// sb.append(getClass().getName());
sb.append(super.toString());
sb.append("{ isDirty="+isDirty());
sb.append(", isDeleted="+isDeleted());
sb.append(", addr=" + identity);
final Node p = (parent == null ? null : parent.get());
sb.append(", parent=" + (p == null ? "N/A" : p.toShortString()));
sb.append(", isRoot=" + (btree.root == this));
if (data == null) {
// No data record? (Generally, this means it was stolen by copy on
// write).
sb.append(", data=NA}");
return sb.toString();
}
sb.append(", nkeys=" + getKeyCount());
sb.append(", minKeys=" + minKeys());
sb.append(", maxKeys=" + maxKeys());
DefaultLeafCoder.toString(this, sb);
sb.append("}");
return sb.toString();
}
/**
* An interface that may be used to register for and receive events when the
* state of a {@link Leaf} is changed. This includes (a) adding a new tuple
* to a leaf; (b) removing a tuple from a leaf (but not flagging an existing
* tuple as deleted); and (c) when the leaf is discarded by copy-on-write.
*
* @author Bryan Thompson
* @version $Id$
*
* @todo another listener API could be developed for tuple state changes.
* that would be useful if there was a desire for pre- or
* post-processing for each tuple. This might be useful for
* introducing triggers.
*/
public static interface ILeafListener {
/**
* Notice that the leaf state has changed and that the listener must not
* assume: (a) that a tuple of interest still resides within the leaf
* (it may have been moved up or down within the leaf or it may be in
* another leaf altogether as a result of underflow or overflow); (b)
* that the leaf is still in use (it may have been discarded by a
* copy-on-write operation).
*/
public void invalidateLeaf();
// /**
// * Notice that the state of a tuple in the leaf has been changed (the
// * tuple is still known to be located within the leaf).
// *
// * @param index
// * The index of the tuple whose state was changed.
// */
// public void invalidateTuple(int index);
}
/**
* Listeners for {@link ILeafListener} events.
*
* Note: The values in the map are null.
*
* Note: Listeners are cleared from the map automatically by the JVM soon
* after the listener becomes only weakly reachable.
*
* Note: Mutable {@link BTree}s are single-threaded so there is no need to
* synchronize access to this collection.
*
* Note: These listeners are primarily used to support {@link ITupleCursor}s.
* The #of listeners at any one time is therefore directly related to the
* #of open iterators on the owning mutable {@link BTree}.
* Normally that is ONE (1) since the {@link BTree} is not thread-safe for
* mutation and each cursor has a current, prior, and next position meaning
* that we have typically either NO listeners or the current and either
* prior or next listener. This tends to make visiting the members of the
* collection (when it is defined) very fast, especially since we do not
* need to synchronize on anything.
*
* Note: The trigger conditions for the events of interest to the listeners
* are scattered throughout the {@link Leaf} class.
*/
private transient WeakHashMap<ILeafListener, Void> leafListeners = null;
/**
* Register an {@link ILeafListener} with this {@link Leaf}. Listeners are
* automatically removed by the JVM shortly after they become only weakly
* reachable.
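*
* A minimal sketch (the cursor re-positioning logic is hypothetical):
*
* <pre>
* final ILeafListener listener = new ILeafListener() {
*     public void invalidateLeaf() {
*         // Re-locate the cursor position from the B+Tree root on the
*         // next access (hypothetical cursor logic).
*     }
* };
* // The caller must retain a strong reference to [listener] since only
* // weakly reachable listeners are dropped from the map automatically.
* leaf.addLeafListener(listener);
* </pre>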
*
* @param l
* The listener.
*
* @throws IllegalStateException
* if the owning {@link AbstractBTree} is read-only.
*/
final public void addLeafListener(ILeafListener l) {
if (l == null)
throw new IllegalArgumentException();
btree.assertNotReadOnly();
if(leafListeners==null) {
leafListeners = new WeakHashMap<ILeafListener, Void>();
}
leafListeners.put(l, null);
}
/**
* Fire an {@link ILeafListener#invalidateLeaf()} event to any registered
* listeners.
*/
final protected void fireInvalidateLeafEvent() {
if(leafListeners == null) return;
for(ILeafListener l : leafListeners.keySet()) {
l.invalidateLeaf();
}
}
// /**
// * Fire an {@link ILeafListener#invalidateTuple(int)} event to any
// * registered listeners.
// *
// * @param index
// * The index of the tuple whose state was changed.
// */
// final protected void fireInvalidateTuple(int index) {
//
// if(leafListeners == null) return;
//
// for(ILeafListener l : leafListeners.keySet()) {
//
// l.invalidateTuple(index);
//
// }
//
// }
final public ByteBuffer readRawRecord(long addr) {
return btree.readRawRecord(addr);
}
}