/**
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
[email protected]
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
package com.bigdata.journal;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectOutput;
import java.lang.ref.WeakReference;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Properties;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.log4j.Logger;
import com.bigdata.btree.BTree;
import com.bigdata.btree.Checkpoint;
import com.bigdata.btree.DefaultTupleSerializer;
import com.bigdata.btree.ICheckpointProtocol;
import com.bigdata.btree.IDirtyListener;
import com.bigdata.btree.IIndex;
import com.bigdata.btree.ITuple;
import com.bigdata.btree.ITupleIterator;
import com.bigdata.btree.IndexMetadata;
import com.bigdata.btree.keys.DefaultKeyBuilderFactory;
import com.bigdata.btree.keys.IKeyBuilder;
import com.bigdata.btree.keys.IKeyBuilderFactory;
import com.bigdata.btree.keys.KeyBuilder;
import com.bigdata.btree.keys.StrengthEnum;
import com.bigdata.btree.keys.SuccessorUtil;
import com.bigdata.cache.ConcurrentWeakValueCache;
import com.bigdata.cache.ConcurrentWeakValueCacheWithTimeout;
import com.bigdata.cache.LRUCache;
import com.bigdata.cache.WeakValueCache;
import com.bigdata.counters.CounterSet;
import com.bigdata.counters.ICounterSet;
import com.bigdata.io.DataInputBuffer;
import com.bigdata.mdi.LocalPartitionMetadata;
import com.bigdata.rawstore.IRawStore;
import com.bigdata.resources.IndexManager;
import com.bigdata.resources.ResourceManager;
import com.bigdata.util.Bytes;
import com.bigdata.util.BytesUtil;
import com.bigdata.util.concurrent.ExecutionExceptions;
import cutthecrap.utils.striterators.IStriterator;
import cutthecrap.utils.striterators.Resolver;
import cutthecrap.utils.striterators.Striterator;
/**
*
* {@link Name2Addr} is a {@link BTree} mapping index names to an {@link Entry}
* containing the last {@link Checkpoint} record committed for the named index
* and the timestamp of that commit. The keys are Unicode strings using the
* default {@link Locale}. The {@link Entry}s in {@link Name2Addr} are the set
* of registered named indices for an {@link AbstractJournal}.
*
*
* The {@link AbstractJournal} maintains an instance of this class that evolves
* with each {@link AbstractJournal#commit()} and tracks the {@link Checkpoint}
* records of the registered {@link ITx#UNISOLATED} indices. However, the
* journal also makes use of historical states for the {@link Name2Addr} index
* in order to resolve the historical state of a named index. Of necessity, the
* {@link Name2Addr} objects used for this latter purpose MUST be distinct from
* the evolving instance otherwise the current version of the named index would
* be resolved. Note further that the historical {@link Name2Addr} states are
* accessed using a canonicalizing mapping but that current evolving
* {@link Name2Addr} instance is NOT part of that mapping.
*
*
* Concurrent reads are permitted against the historical {@link Name2Addr}
* objects since the {@link BTree} is thread-safe for read-only operations.
* Likewise, writes are only allowed on the {@link ITx#UNISOLATED}
* {@link Name2Addr} instance. Write access to the underlying
* {@link BTree} MUST be synchronized on the {@link ITx#UNISOLATED}
* {@link Name2Addr} instance since the {@link BTree} is NOT safe for concurrent
* writers. Further, read access to {@link ITx#UNISOLATED}
* {@link Name2Addr} object MUST be synchronized so as to not conflict with
* writes on that {@link BTree}. Therefore all write methods on this
* class are declared as synchronized but the caller MUST synchronize
* on {@link Name2Addr} if they are performing reads on the {@link ITx#UNISOLATED}
* {@link Name2Addr} instance. This allows readers on historical {@link Name2Addr}
* instances to have full concurrency.
*
*
* Note: {@link Name2Addr} by itself is NOT sufficient to handle commits with
* concurrent task execution, such as arises with the group commit protocol. The
* problem is concurrency in the data structure that keeps track of add/drop for
* named indices and also tracks which named indices are dirty. In order to account for
* tasks running concurrent with commit processing, {@link AbstractTask} isolates
* {@link Name2Addr} and makes the set of changes (registering indices, dropping
* indices, and updating the {@link Entry} in {@link Name2Addr} to reflect the
* current {@link Checkpoint} record for an index) an atomic state change that
* is performed IFF the task completes successfully and is synchronized on
* {@link Name2Addr} to prevent that happening concurrent with commit
* processing.
*
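* A minimal sketch of the read-side discipline described above, assuming a
* reference <code>name2Addr</code> to the live ({@link ITx#UNISOLATED}) instance
* held by the journal (the variable name is illustrative only):
*
* <pre>{@code
* // Reads on the live Name2Addr must be synchronized by the caller so that
* // they do not interleave with writes on its backing mutable BTree.
* synchronized (name2Addr) {
*     final Name2Addr.Entry e = name2Addr.getEntry("myIndex");
*     // e is null if no index is registered under that name.
* }
* }</pre>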
*/
public class Name2Addr extends BTree {
private static final Logger log = Logger.getLogger(Name2Addr.class);
/**
* Cache of added/retrieved btrees by _name_. This cache is ONLY used by the
* "live" {@link Name2Addr} instance.
*
* Map from the name of an index to a weak reference for the corresponding
* "live" version of the named index. Entries will be cleared from this map
* after they have become only weakly reachable. In order to prevent dirty
* indices from being cleared, we register an {@link IDirtyListener}. When
* it is informed that an index is dirty it places a hard reference to that
* index into the {@link #commitList}.
*
* Note: The capacity of the backing hard reference LRU affects how many
* _clean_ indices can be held in the cache. Dirty indices remain strongly
* reachable owing to their existence in the {@link #commitList}.
*/
private ConcurrentWeakValueCache<String, ICheckpointProtocol> indexCache = null;
/**
* Holds hard references for the dirty indices along with the index name.
* This collection prevents dirty indices from being cleared from the
* {@link #indexCache}, which would result in lost updates.
*
* Note: Operations on unisolated indices always occur on the "current"
* state of that index. The "current" state is either unchanged (following a
* successful commit) or rolled back to the last saved state (by an abort
* following an unsuccessful commit). Therefore all unisolated index write
* operations MUST complete before a commit and new unisolated operations
* MUST NOT begin until the commit has either succeeded or been rolled back.
* Failure to observe this constraint can result in new unisolated
* operations writing on indices that should have been rolled back if the
* commit is not successful.
*/
private ConcurrentHashMap<String, DirtyListener> commitList = new ConcurrentHashMap<String, DirtyListener>();
/**
* An instance of this {@link DirtyListener} is registered with each named
* index that we administer to listen for events indicating that the index
* is dirty. When we get that event we stick the {@link DirtyListener} on
* the {@link #commitList}. This makes the commit protocol simpler since
* the {@link DirtyListener} has both the name of the index and the
* reference to the index and we need both on hand to do the commit.
*
* @author Bryan Thompson
*/
private class DirtyListener implements IDirtyListener, Comparable<DirtyListener> {
final String name;
final ICheckpointProtocol btree;
boolean needsCheckpoint;
long checkpointAddr = 0L;
@Override
public String toString() {
return "DirtyListener{name="
+ name
+ ","
+ (needsCheckpoint ? "needsCheckpoint" : "checkpointAddr="
+ checkpointAddr) + "}";
}
private DirtyListener(final String name,
final ICheckpointProtocol btree, final boolean needsCheckpoint) {
assert name!=null;
assert btree!=null;
this.name = name;
this.btree = btree;
this.needsCheckpoint = needsCheckpoint;
if(!needsCheckpoint) {
/*
* Grab the checkpointAddr for the object.
*/
try {
checkpointAddr = btree.getCheckpoint().getCheckpointAddr();
} catch(IllegalStateException ex) {
throw new RuntimeException(
"Checkpoint record not written: " + name);
}
}
}
/**
* Return the {@link Name2Addr} instance to which this listener is
* reporting.
*/
private Name2Addr getName2Addr() {
return Name2Addr.this;
}
/**
* Add this to the {@link Name2Addr#commitList} and set
* {@link #needsCheckpoint} to <code>true</code>.
*
* @param btree
*/
@Override
public void dirtyEvent(final ICheckpointProtocol btree) {
assert btree == this.btree;
@SuppressWarnings("unused")
final boolean added;
synchronized(Name2Addr.this) {
final ICheckpointProtocol cached = indexCache.get(name);
if (cached == null) {
/*
* There is no index in the cache for this name. This can
* occur if someone is holding a reference to a mutable
* BTree and they write on it after a commit or abort.
*/
throw new RuntimeException("No index in cache: name="+name);
}
if (cached != btree) {
/*
* There is a different index in the cache for this name.
* This can occur if someone is holding a reference to a
* mutable BTree and they write on it after a commit or
* abort but the named index has already been re-loaded into
* the cache.
*/
throw new RuntimeException("Different index in cache: "+name);
}
/*
* Note: This MUST be synchronized to prevent loss of dirty
* notifications that arrive while a concurrent commit is in
* progress.
*/
added = commitList.putIfAbsent(name, this) != null;
needsCheckpoint = true;
checkpointAddr = 0L;
if(log.isInfoEnabled()) {
/*
* Note: The size of the commit list can appear to increment
* by more than one if there are concurrent writes on
* different indices (e.g., if this log message is written
* outside of the synchronized block).
*/
log.info("name=" + name + ", commitListSize="
+ commitList.size() + ", file="
+ getStore().getFile());
}
} // synchronized.
}
/**
* Puts instances into order by their {@link #name}.
*/
@Override
public int compareTo(final DirtyListener arg0) {
return name.compareTo(arg0.name);
}
}
/**
* Create a new instance.
*
* @param store
* The backing store.
*
* @return The new instance.
*/
static public Name2Addr create(final IRawStore store) {
final IndexMetadata metadata = new IndexMetadata(UUID.randomUUID());
metadata.setBTreeClassName(Name2Addr.class.getName());
/*
* TODO configure unicode sort key behavior explicitly?
*
* Note: This only applies to new Name2Addr objects. However, historical
* Name2Addr objects were created using the default collator for the
* platform. When the ICU library is available, that is the default.
*/
final Properties p = new Properties();
// p.setProperty(KeyBuilder.Options.COLLATOR, CollatorEnum.ASCII.name());
p.setProperty(KeyBuilder.Options.STRENGTH,
StrengthEnum.Identical.name());
metadata.setTupleSerializer(new Name2AddrTupleSerializer(
new DefaultKeyBuilderFactory(p)));
return (Name2Addr) BTree.create(store, metadata);
}
/**
* Load from the store (de-serialization constructor).
*
* @param store
* The backing store.
* @param checkpoint
* The {@link Checkpoint} record.
* @param metadata
* The metadata record for the index.
*/
public Name2Addr(final IRawStore store, final Checkpoint checkpoint,
final IndexMetadata metadata, final boolean readOnly) {
super(store, checkpoint, metadata, readOnly);
}
/**
* Many methods on this class will throw an {@link IllegalStateException}
* unless they are invoked on the {@link ITx#UNISOLATED} {@link Name2Addr}
* instance. This method is used to test that assertion for those methods.
*
* @throws IllegalStateException
* unless this is the {@link ITx#UNISOLATED} {@link Name2Addr}
* instance.
*/
final protected void assertUnisolatedInstance() {
if(indexCache == null) {
throw new IllegalStateException();
}
}
/**
* Return <code>true</code> iff this is the {@link ITx#UNISOLATED}
* {@link Name2Addr} instance.
*/
final protected boolean isUnisolatedInstance() {
return indexCache != null;
}
/**
* Setup the {@link #indexCache}.
*
* Note: This cache is <code>null</code> unless initialized and is ONLY
* used by the "live" version of the {@link Name2Addr} index. The only
* method that creates or loads the "live" {@link Name2Addr} index is
* {@link AbstractJournal#setupName2AddrBTree()}.
*
* @param cacheCapacity
* The capacity of the inner {@link LRUCache} for the
* {@link WeakValueCache}.
* @param cacheTimeout
* The timeout in milliseconds for stale entries in the cache.
*
* @see Options#LIVE_INDEX_CACHE_CAPACITY
* @see Options#LIVE_INDEX_CACHE_TIMEOUT
*/
protected void setupCache(final int cacheCapacity, final long cacheTimeout) {
if (indexCache != null) {
// Cache was already configured.
throw new IllegalStateException();
}
// indexCache = new WeakValueCache(
// new LRUCache(cacheCapacity));
indexCache = new ConcurrentWeakValueCacheWithTimeout<String, ICheckpointProtocol>(
cacheCapacity, TimeUnit.MILLISECONDS.toNanos(cacheTimeout));
}
/**
* An iterator that visits the entries in the internal {@link #indexCache}.
* You must test the weak reference for each entry in order to determine
* whether its value has been cleared as of the moment that you request that
* value.
*
* @throws IllegalStateException
* unless this is the {@link ITx#UNISOLATED} instance.
*/
private Iterator<java.util.Map.Entry<String, WeakReference<ICheckpointProtocol>>> indexCacheEntryIterator() {
assertUnisolatedInstance();
return indexCache.entryIterator();
}
/**
* Return the approximate number of indices in the live index cache.
*
* @throws IllegalStateException
* unless this is the {@link ITx#UNISOLATED} instance.
*/
public int getIndexCacheSize() {
assertUnisolatedInstance();
return indexCache.size();
}
/**
* Return <code>true</code> iff the named index is on the commit list.
*
* Note: This is synchronized even though the commitList is thread-safe in
* order to make the test atomic with respect to {@link #handleCommit(long)}.
*
* @param name
* The index name.
*
* @throws IllegalStateException
* unless this is the {@link ITx#UNISOLATED} instance.
*/
synchronized public boolean willCommit(final String name) {
assertUnisolatedInstance();
return commitList.containsKey(name);
}
/**
* Flush a dirty index to the disk. This is used to flush each dirty index
* in parallel, providing increased IO throughput and reduced latency during
* the commit.
*
* @author Bryan Thompson
* @see "Flush indices in parallel during checkpoint to reduce IO latency"
*/
private static class CommitIndexTask implements Callable<CommitIndexTask> {
private final DirtyListener l;
private final long commitTime;
private final AtomicLong checkpointAddr = new AtomicLong(0L);
/**
* Return the address of the {@link Checkpoint} record.
*/
public long getCheckpointAddr() {
return checkpointAddr.get();
}
/**
*
* @param l
* The {@link DirtyListener}.
* @param commitTime
* The commitTime associated with the commitPoint.
*/
public CommitIndexTask(final DirtyListener l, final long commitTime) {
if (l == null)
throw new IllegalArgumentException();
this.l = l;
this.commitTime = commitTime;
}
/**
* @return self
*/
@Override
public CommitIndexTask call() throws Exception {
if (log.isInfoEnabled())
log.info("Will commit: " + l.name);
final long checkpointAddr;
if (l.needsCheckpoint) {
/*
* Note: AbstractTask flags [needsCheckpoint := false] on the
* DirtyListener and handles the BTree checkpoint itself in
* order to avoid the possibility of a concurrent modification
* by this code during commit processing.
*/
try {
// checkpoint the index.
checkpointAddr = l.btree.handleCommit(commitTime);
// we just did the checkpoint.
l.needsCheckpoint = false;
} catch (Throwable t) {
// adds the name to the stack trace.
throw new RuntimeException("Could not commit index: name="
+ l.name, t);
}
} else {
/*
* Note: AbstractTask avoids concurrent modification of the
* BTree checkpoint record during a commit by synchronizing on
* Name2Addr.
*
* Note: The DirtyListener grabs the current checkpointAddr from
* the BTree when [needsCheckpoint := false]. This allows us to
* have asynchronous checkpoints of the BTree (by concurrent
* tasks) without causing the checkpoint address on the
* commitList to be advanced until the next atomic transfer of
* state from a completed AbstractTask down to Name2Addr.
* (Without this, the checkpointAddr on the BTree winds up
* updated concurrently but we only notice the problem when it
* happens to be 0L because the BTree is in the midst of
* updating its checkpoint!)
*/
// use last recorded checkpoint.
checkpointAddr = l.checkpointAddr;
if (checkpointAddr == 0L) {
throw new RuntimeException(
"Checkpoint address not written: name=" + l.name);
}
}
// set commitTime on the btree (transient field).
l.btree.setLastCommitTime(commitTime);
// publish the checkpoint address.
this.checkpointAddr.set(checkpointAddr);
// Done.
return this;
}
} // CommitIndexTask
/**
* Commit processing for named indices.
*
* This method applies the {@link #commitList} and then flushes the backing
* {@link ICheckpointProtocol} object to the store. The {@link #commitList}
* consists of {@link DirtyListener}s. If the listener has its
* {@link DirtyListener#needsCheckpoint} flag set, then the
* {@link ICheckpointProtocol} implementation to which that listener is
* attached will have its {@link BTree#writeCheckpoint() checkpoint written}
* . Otherwise the current {@link Checkpoint} address is recovered. Either
* way, the {@link Entry} in {@link Name2Addr}'s backing {@link BTree} is
* updated to reflect the commitTime and {@link Checkpoint} address
* for the index.
*
* Finally {@link Name2Addr} {@link Checkpoint}s itself using
* {@link ICommitter#handleCommit(long)} and returns the address from which
* {@link Name2Addr} may be reloaded.
*
* Note: The {@link #commitList} MUST be protected against concurrent
* modification during the commit otherwise concurrent tasks could be
* reporting dirty objects while we are doing a commit and those notices
* would be lost. Persistence capable objects ({@link ICheckpointProtocol}
* implementations) get onto the {@link #commitList} via the
* {@link DirtyListener}, so it is also synchronized.
*
* Note: {@link Name2Addr} DOES NOT obtain a resource lock on the
* {@link ICheckpointProtocol} implementation. Therefore it MUST NOT
* checkpoint an index on which an {@link AbstractTask} has obtained a
* resource lock. Otherwise we have concurrent writers on the {@link BTree}
* and the {@link BTree} is not thread-safe for concurrent writers. Instead,
* the {@link AbstractTask} checkpoints the {@link BTree} itself while it is
* holding the resource lock and then sets
* {@link DirtyListener#needsCheckpoint} to <code>false</code> using
* {@link #putOnCommitList(String, ICheckpointProtocol, boolean)} as an
* indication to {@link Name2Addr} that it MUST persist the current
* checkpointAddr for the {@link BTree} on its next commit (and MUST NOT
* write on the index when it does that commit).
*
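* A condensed sketch of the per-index decision made by this method (the field
* names below are those of the private {@link DirtyListener}; this is
* illustrative pseudo-Java, not additional API):
*
* <pre>{@code
* // For each DirtyListener l drained from the commit list (tasks run in parallel):
* final long addr = l.needsCheckpoint
*         ? l.btree.handleCommit(commitTime) // write a new Checkpoint record now
*         : l.checkpointAddr;                // reuse the checkpoint taken by AbstractTask
* // The Entry for l.name is then updated to (addr, commitTime) in this BTree and
* // Name2Addr finally checkpoints itself via super.handleCommit(commitTime).
* }</pre>
*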
* @see "Flush indices in parallel during checkpoint to reduce IO latency"
*/
@Override
synchronized
public long handleCommit(final long commitTime) {
assertUnisolatedInstance();
// snapshot the commit list
final DirtyListener[] a = commitList.values().toArray(new DirtyListener[] {});
// clear the commit list.
commitList.clear();
/*
* Place into sorted order as an aid to debugging when examining the
* commit list.
*
* Note: This also approximates the order of the generated keys for the
* indices which makes the operations on the underlying BTree somewhat
* more efficient as they are more or less in key order. (The order is
* only approximate since a Unicode collator determines the real order
* for the sort keys generated from the index names).
*/
Arrays.sort(a);
if (log.isInfoEnabled()) {
log.info("Store file="+getStore().getFile());
log.info("There are " + a.length + " dirty indices : "
+ Arrays.toString(a));
}
// for each entry in the snapshot of the commit list.
final List<CommitIndexTask> tasks = new ArrayList<CommitIndexTask>(a.length);
for (int i = 0; i < a.length; i++) {
final DirtyListener l = a[i];
if (log.isInfoEnabled())
log.info("Will commit: " + l.name);
tasks.add(new CommitIndexTask(l, commitTime));
}
/*
* Submit checkpoint tasks in parallel.
*
* Note: This relies on getStore() providing access to the IIndexManager
* interface.
*/
final List<Future<CommitIndexTask>> futures;
try {
final ExecutorService executorService = ((IIndexManager) getStore())
.getExecutorService();
/*
* Invoke tasks.
*
* Note: Blocks until all tasks are done. Hence we do NOT have to
* cancel these Futures. If we obtain them, then they are already
* done.
*/
futures = executorService.invokeAll(tasks);
} catch (InterruptedException e) {
// Interrupted while awaiting checkpoint(s).
throw new RuntimeException(e);
}
// for each entry in the snapshot of the commit list.
final List<Throwable> causes = new LinkedList<Throwable>();
for (Future<CommitIndexTask> f : futures) {
try {
final CommitIndexTask task = f.get();
final DirtyListener l = task.l;
final long checkpointAddr = task.getCheckpointAddr();
// encode the index name as a key.
final byte[] key = getKey(l.name);
// lookup the current entry (if any) for that index.
final byte[] val = lookup(key);
// de-serialize iff entry was found.
final Entry oldEntry = (val == null ? null
: EntrySerializer.INSTANCE.deserialize(new DataInputBuffer(
val)));
/*
* Update if there is no existing entry or if the checkpointAddr has
* changed or if there was no commit time on the old entry.
*/
if (oldEntry == null || oldEntry.checkpointAddr != checkpointAddr
|| oldEntry.commitTime == 0L) {
final Entry entry = new Entry(l.name, checkpointAddr, commitTime);
// update persistent mapping.
insert(key, EntrySerializer.INSTANCE.serialize( entry ));
}
} catch (InterruptedException e) {
log.error("l.name: " + e, e);
causes.add(e);
} catch (ExecutionException e) {
log.error("l.name: " + e, e);
causes.add(e);
}
} // next Future.
/*
* If there were any errors, then throw an exception listing them.
*/
if (!causes.isEmpty()) {
// Throw exception back to the leader.
if (causes.size() == 1)
throw new RuntimeException(causes.get(0));
throw new RuntimeException("nerrors=" + causes.size(),
new ExecutionExceptions(causes));
}
// and flushes out this btree as well.
return super.handleCommit(commitTime);
}
/**
* Encodes a Unicode string into a key.
*
* @param name
* The name of the btree.
*
* @return The corresponding key.
*/
private byte[] getKey(final String name) {
final byte[] a = metadata.getTupleSerializer().serializeKey(name);
// log.error("name=" + name + ", key=" + BytesUtil.toString(a));
return a;
// return KeyBuilder.asSortKey(name);
}
/**
* Return the {@link ITx#UNISOLATED} view of the named persistence capable
* data structure - this method tests a cache of the named persistence
* capable data structures and will return the existing instance if the
* index is found in the cache and will otherwise load the
* {@link ITx#UNISOLATED} view of the data structure from the backing store.
*
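* A usage sketch against the live instance (names are illustrative; the cast
* assumes the index was registered as a plain {@link BTree}):
*
* <pre>{@code
* final ICheckpointProtocol ndx = name2Addr.getIndex("myIndex");
* if (ndx != null) {
*     final BTree btree = (BTree) ndx; // unisolated, mutable view
* }
* }</pre>
*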
* @param name
* The index name.
*
* @return The named index or <code>null</code> iff there is no index with
* that name.
*
* @throws IllegalArgumentException
* if <i>name</i> is <code>null</code>.
* @throws IllegalStateException
* if this is not the {@link ITx#UNISOLATED} {@link Name2Addr}
* instance.
*/
public ICheckpointProtocol getIndex(final String name) {
assertUnisolatedInstance();
if (name == null) {
throw new IllegalArgumentException();
}
ICheckpointProtocol ndx;
synchronized(this) {
/*
* Note: Synchronized since some operations (remove+add) are not
* otherwise atomic.
*/
ndx = indexCache.get(name);
}
if (ndx != null) {
if (ndx.getDirtyListener() == null) {
/*
* Note: We can't return an unisolated view of a BTree to the
* caller without having a dirty listener set on it that will
* report any changes back to this name2addr instance. An
* exception thrown here indicates that the BTree was able to
* remain in (or enter into) the indexCache without having its
* dirty listener set.
*/
throw new AssertionError();
}
/*
* Further verify that the dirty listener is reporting to this
* name2addr instance.
*/
assert ((DirtyListener)ndx.getDirtyListener()).getName2Addr() == this;
return ndx;
}
final byte[] val = super.lookup(getKey(name));
if (val == null) {
return null;
}
// deserialize entry.
// final Entry entry = EntrySerializer.INSTANCE.deserialize(new DataInputBuffer(val));
final Entry entry = EntrySerializer.INSTANCE
.deserialize(new DataInputStream(new ByteArrayInputStream(val)));
// Load from the backing store.
ndx = Checkpoint.loadFromCheckpoint(store, entry.checkpointAddr,
false/* readOnly */);
// Set the lastCommitTime on the index.
ndx.setLastCommitTime(entry.commitTime);
// Save name -> btree mapping in transient cache.
putIndexCache(name, ndx, false/*replace*/);
// listen for dirty events so that we know when to add this to the commit list.
final DirtyListener l = new DirtyListener(name, ndx, false/* needsCheckpoint */);
ndx.setDirtyListener( l );
// report event (loaded btree).
ResourceManager.openUnisolatedIndex(name);
// return btree.
return ndx;
}
/**
* Return the {@link Entry} for the named index.
*
* Note: This is a lower-level access mechanism that is used by
* {@link Journal#getIndex(String, ICommitRecord)} when accessing historical
* named indices from an {@link ICommitRecord}.
*
* @param name
* The index name.
*
* @return The {@link Entry} for the named index -or- <code>null</code> if
* there is no entry for that name.
*/
public Entry getEntry(final String name) {
// lookup in the index.
final byte[] val = super.lookup(getKey(name));
Entry entry = null;
if (val != null) {
// deserialize entry.
entry = EntrySerializer.INSTANCE.deserialize(new DataInputBuffer(
val));
}
return entry;
}
/**
* Add an entry for the named index.
*
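* A minimal registration sketch, assuming a backing {@link IRawStore}
* <code>store</code> and the live {@link Name2Addr} instance
* <code>name2Addr</code> (applications normally register indices through the
* journal rather than calling this method directly):
*
* <pre>{@code
* final IndexMetadata md = new IndexMetadata("myIndex", UUID.randomUUID());
* final BTree btree = BTree.create(store, md);
* name2Addr.registerIndex("myIndex", btree); // now on the commit list
* }</pre>
*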
* @param name
* The index name.
*
* @param btree
* The index.
*
* @exception IllegalArgumentException
* if <i>name</i> is <code>null</code>.
* @exception IllegalArgumentException
* if <i>btree</i> is <code>null</code>.
* @exception IndexExistsException
* if there is already an index registered under that name.
*/
synchronized public void registerIndex(final String name,
final ICheckpointProtocol btree) {
assertUnisolatedInstance();
if (name == null)
throw new IllegalArgumentException();
if (btree == null)
throw new IllegalArgumentException();
final byte[] key = getKey(name);
if (super.contains(key)) {
throw new IndexExistsException(name);
}
// flush btree to the store to get the checkpoint record address.
final long checkpointAddr = btree.writeCheckpoint();
/*
* Add a serialized entry to the persistent index.
*
* Note: The commit time here is a placeholder. It will be replaced with
* the actual commit time by the next commit since the newly created
* B+Tree is on our commit list. If there is an abort, then the entry is
* simply discarded along with the rest of the Name2Addr state.
*/
final Entry entry = new Entry(name, checkpointAddr, 0L/* commitTime */);
super.insert(key, EntrySerializer.INSTANCE.serialize( entry ));
putOnCommitList(name, btree, false/* needsCheckpoint */);
// report event (the application has access to the named index).
ResourceManager.openUnisolatedIndex(name);
}
/**
* Adds the named index to the commit list and sets a {@link DirtyListener}
* on the {@link ICheckpointProtocol} so that this {@link Name2Addr} object
* will be informed if the associated persistent data structure becomes
* dirty.
*
* @param name
* The index name.
* @param btree
* The persistence capable data structure.
* @param needsCheckpoint
* Specify <code>true</code> if {@link Name2Addr} should invoke
* {@link ICheckpointProtocol#writeCheckpoint()} rather than just
* updating the {@link Entry} for the persistence capable data
* structure using {@link ICheckpointProtocol#getCheckpoint()}.
*/
synchronized protected void putOnCommitList(final String name,
final ICheckpointProtocol btree, final boolean needsCheckpoint) {
assertUnisolatedInstance();
if (name == null)
throw new IllegalArgumentException();
if (btree == null)
throw new IllegalArgumentException();
// setup a dirty listener.
final DirtyListener l = new DirtyListener(name, btree, needsCheckpoint);
// and set it on the btree.
btree.setDirtyListener(l);
putIndexCache(name, btree, true/*replace*/);
// add to the commit list.
commitList.put(name, l);
if(log.isInfoEnabled()) {
log.info("name=" + name + ", commitListSize=" + commitList.size()
+ ", needsCheckpoint=" + needsCheckpoint + ", file="
+ getStore().getFile());
}
}
/**
* Adds the named index to the {@link ITx#UNISOLATED} index cache.
*
* @param name
* The index name.
* @param btree
* The {@link ITx#UNISOLATED} view of the persistence capable
* data structure.
* @param replace
* If an existing entry for that name may be replaced.
*/
synchronized protected void putIndexCache(final String name,
final ICheckpointProtocol btree, final boolean replace) {
assertUnisolatedInstance();
// /*
// * Note: the WeakValueCache does not let you replace an existing entry
// * so we first remove the old entry under the key if there is one.
// */
// if (replace) {
//
// indexCache.remove(name);
//
// }
//
// // add name -> btree mapping to the transient cache.
// indexCache.put(name, btree, true/*dirty*/);
if (replace) {
indexCache.put(name, btree);
} else {
indexCache.putIfAbsent(name, btree);
}
}
/**
* Return the current entry, if any, for the named {@link ITx#UNISOLATED}
* index in the {@link #indexCache}.
*
* Note: This method is more direct than {@link #getIndex(String)}.
* {@link AbstractTask} uses this method together with
* {@link #putIndexCache(String, ICheckpointProtocol, boolean)} to allow
* different tasks access to the same pool of {@link ITx#UNISOLATED}
* indices.
*
* @param name
* The index name.
*
* @return The index iff it was found in the cache.
*/
synchronized protected ICheckpointProtocol getIndexCache(final String name) {
assertUnisolatedInstance();
return indexCache.get(name);
}
/**
* Removes the entry for the named index. The named index will no longer
* participate in commits.
*
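* A usage sketch on the live instance (illustrative; applications normally
* drop indices through the journal):
*
* <pre>{@code
* name2Addr.dropIndex("myIndex"); // unregisters the index and clears its dirty listener
* // The drop becomes restart safe at the next commit of the journal.
* }</pre>
*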
* @param name
* The index name.
*
* @exception IllegalArgumentException
* if <i>name</i> is <code>null</code>.
* @exception NoSuchIndexException
* if the index does not exist.
*/
synchronized public void dropIndex(final String name) {
assertUnisolatedInstance();
if (name == null)
throw new IllegalArgumentException();
if (log.isInfoEnabled())
log.info("name=" + name);
final byte[] key = getKey(name);
if(!super.contains(key)) {
throw new NoSuchIndexException("Not registered: "+name);
}
// remove the name -> btree mapping from the transient cache.
final ICommitter btree = indexCache.remove(name);
if (btree != null) {
/*
* Make sure that the index is not on the commit list.
*
* Note: If the index is not in the index cache then it WILL NOT be
* in the commit list.
*/
commitList.remove(name);
// clear our listener.
((ICheckpointProtocol) btree).setDirtyListener(null);
}
/*
* Remove the entry from the persistent index. After a commit you will
* no longer be able to find the metadata record for this index from the
* current commit record (it will still exist of course in historical
* commit records).
*/
super.remove(key);
// report event.
ResourceManager.dropUnisolatedIndex(name);
}
/**
* Return a {@link CounterSet} reflecting the named indices that are
* currently open (more accurately, those open named indices whose
* references are in {@link Name2Addr}'s internal {@link #indexCache}). When
* index partitions are in use their {@link CounterSet}s are reported under
* a path formed from name of the scale-out index and partition identifier.
* Otherwise the {@link CounterSet}s are reported directly under the index
* name.
*
* @param counterSet
* When non-<code>null</code>, the performance counters are
* entered into the caller's collection. Otherwise they are
* entered into a new collection.
* @param found
* When non-<code>null</code>, the names of the indices whose
* performance counters are being returned are reported as a
* side-effect on this {@link Set}.
*
* @return A new {@link CounterSet} reflecting the named indices that were
* open as of the time that this method was invoked.
*
* @see IndexManager#getIndexCounters()
*
* @see "Expose performance counters for read-only indices"
*/
protected CounterSet getIndexCounters(final CounterSet counterSet,
final Set<String> found) {
assertUnisolatedInstance();
final CounterSet tmp = counterSet == null ? new CounterSet()
: counterSet;
final Iterator<java.util.Map.Entry<String, WeakReference<ICheckpointProtocol>>> itr = indexCacheEntryIterator();
while (itr.hasNext()) {
final java.util.Map.Entry<String, WeakReference<ICheckpointProtocol>> entry = itr.next();
final String name = entry.getKey();
final ICheckpointProtocol btree = entry.getValue().get();
if (btree == null) {
// Note: Weak reference has been cleared.
continue;
}
final IndexMetadata md = btree.getIndexMetadata();
final LocalPartitionMetadata pmd = md.getPartitionMetadata();
final String path;
if (pmd != null) {
// Note: [name] already includes the partition identifier.
path = md.getName() + ICounterSet.pathSeparator + name;
} else {
path = name;
}
/*
* Attach the B+Tree performance counters.
*
* Note: These counters MUST NOT embed a hard reference to the
* AbstractBTree. That could cause the BTree to be retained as long
* as the caller holds the CounterSet object!
*/
tmp.makePath(path).attach(btree.getCounters());
if (found != null) {
/*
* Report out the names of the indices whose counters are being
* returned.
*/
found.add(name);
}
}
return tmp;
}
/**
* An entry in the persistent index.
*
* The {@link Entry} reports the {@link #name} of the index, its
* {@link #checkpointAddr}, and the {@link #commitTime} when it was last
* updated. If you want to know more about the index, then you need to open
* it.
*
* Do NOT open the index directly from the {@link #checkpointAddr}. That
* will circumvent the canonical mapping imposed by the
* {@link IIndexManager} on the open indices for a given name and commit
* time. Instead, just ask the {@link IIndexManager} to open the index
* having the specified {@link #name} as of the desired commit time.
*
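* For example, given a {@link Journal} <code>jnl</code> (an {@link IIndexManager})
* and an {@link Entry} <code>entry</code>, the named index is resolved like this
* rather than from {@link #checkpointAddr} (a sketch; the variable names are
* illustrative):
*
* <pre>{@code
* final IIndex ndx = jnl.getIndex(entry.name, entry.commitTime);
* }</pre>
*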
* @author Bryan Thompson
*/
public static class Entry {
/**
* The name of the index.
*/
public final String name;
/**
* The address of the last known {@link Checkpoint} record for the
* index with that name.
*/
public final long checkpointAddr;
/**
* The commit time associated with the last commit point for the named
* index.
*/
public final long commitTime;
public Entry(final String name, final long checkpointAddr,
final long commitTime) {
this.name = name;
this.checkpointAddr = checkpointAddr;
this.commitTime = commitTime;
}
@Override
public String toString() {
return "Entry{name=" + name + ",checkpointAddr=" + checkpointAddr
+ ",commitTime=" + commitTime + "}";
}
}
/**
* The values are {@link Entry}s.
*
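* The record layout is simply the {@link Entry#commitTime} (long), the
* {@link Entry#checkpointAddr} (long), and then the index name in modified
* UTF-8. A round-trip sketch (the <code>addr</code> and <code>commitTime</code>
* values are placeholders):
*
* <pre>{@code
* final byte[] b = EntrySerializer.INSTANCE.serialize(new Entry("myIndex", addr, commitTime));
* final Entry e = EntrySerializer.INSTANCE.deserialize(new DataInputBuffer(b));
* }</pre>
*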
* @author Bryan Thompson
*/
public static class EntrySerializer {
public static transient final EntrySerializer INSTANCE = new EntrySerializer();
private EntrySerializer() {
}
public byte[] serialize(final Entry entry) {
try {
// estimate capacity
final int capacity = Bytes.SIZEOF_LONG + entry.name.length() * 2;
final ByteArrayOutputStream baos = new ByteArrayOutputStream(capacity);
final DataOutput os = new DataOutputStream(baos);
os.writeLong(entry.commitTime);
os.writeLong(entry.checkpointAddr);
os.writeUTF(entry.name);
return baos.toByteArray();
} catch (IOException e) {
throw new RuntimeException(e);
}
}
public Entry deserialize(final DataInput in) {
try {
final long commitTime = in.readLong();
final long checkpointAddr = in.readLong();
final String name = in.readUTF();
return new Entry(name, checkpointAddr, commitTime);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
}
/**
* Encapsulates key and value formation for {@link Name2Addr}.
*
* @author Bryan Thompson
*/
static public class Name2AddrTupleSerializer extends
DefaultTupleSerializer<String, Entry> {
/**
*
*/
private static final long serialVersionUID = 5699568938604974463L;
/**
* Used to (de-)serialize {@link Entry}s (NOT thread-safe).
*/
private final EntrySerializer ser;
/**
* De-serialization ctor.
*/
public Name2AddrTupleSerializer() {
super();
this.ser = EntrySerializer.INSTANCE;
}
/**
* Ctor when creating a new instance.
*
* @param keyBuilderFactory
*/
public Name2AddrTupleSerializer(
final IKeyBuilderFactory keyBuilderFactory) {
super(keyBuilderFactory);
this.ser = EntrySerializer.INSTANCE;
}
/**
* Return the unsigned byte[] key for an index name.
*
* @param obj
* The name of an index.
*/
@Override
public byte[] serializeKey(final Object obj) {
final IKeyBuilder keyBuilder = getKeyBuilder();
final byte[] a = keyBuilder.reset().append((String) obj).getKey();
// log.error("name=" + obj + ", key=" + BytesUtil.toString(a)+", keyBuilder="+keyBuilder);
return a;
}
/**
* Return the byte[] value for an {@link Entry}.
*
* @param entry
* An Entry.
*/
@Override
public byte[] serializeVal(final Entry entry) {
return ser.serialize(entry);
}
@SuppressWarnings("rawtypes")
@Override
public Entry deserialize(final ITuple tuple) {
return ser.deserialize(tuple.getValueStream());
}
/**
* The initial version (no additional persistent state).
*/
private final static transient byte VERSION0 = 0;
/**
* The current version.
*/
private final static transient byte VERSION = VERSION0;
@Override
public void readExternal(final ObjectInput in) throws IOException,
ClassNotFoundException {
super.readExternal(in);
final byte version = in.readByte();
switch (version) {
case VERSION0:
break;
default:
throw new UnsupportedOperationException("Unknown version: "
+ version);
}
}
@Override
public void writeExternal(final ObjectOutput out) throws IOException {
super.writeExternal(out);
out.writeByte(VERSION);
}
} // Name2AddrTupleSerializer
/**
* Prefix scan of a {@link Name2Addr} index. This scan assumes that the
* caller has provided for any possible thread-safety issues.
*
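* A usage sketch, assuming <code>n2a</code> is a {@link Name2Addr} view for
* some commit point (the prefix value is illustrative):
*
* <pre>{@code
* final Iterator<String> names = Name2Addr.indexNameScan("kb.", n2a);
* while (names.hasNext()) {
*     System.out.println(names.next());
* }
* }</pre>
*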
* @param prefix
* The prefix.
* @param n2a
* The index.
*
* @return The names of the indices spanned by that prefix in that index.
*
* @see "Name2Addr.indexNameScan(prefix) uses scan + filter"
* @see "AbstractTripleStore.destroy() does not filter for correct prefix"
*
*/
@SuppressWarnings("unchecked")
public static final Iterator<String> indexNameScan(final String prefix,
final IIndex n2a) {
final byte[] fromKey;
final byte[] toKey;
final boolean hasPrefix = prefix != null && prefix.length() > 0;
// final boolean restrictScan = true;
if (hasPrefix ) //&& restrictScan)
{
/*
* When the namespace prefix was given, generate the toKey as the
* fixed length successor of the fromKey.
*
* Note: We MUST use StrengthEnum:=PRIMARY for the prefix scan in
* order to avoid the secondary collation ordering effects.
*/
// final IKeyBuilder keyBuilder = n2a.getIndexMetadata()
// .getTupleSerializer().getKeyBuilder();
// final Properties properties = new Properties();
//
// properties.setProperty(KeyBuilder.Options.STRENGTH,
// StrengthEnum.Primary.toString());
//
// final IKeyBuilder keyBuilder = new DefaultKeyBuilderFactory(
// properties).getKeyBuilder();
final IKeyBuilder keyBuilder = n2a.getIndexMetadata()
.getPrimaryKeyBuilder();
fromKey = keyBuilder.reset().append(prefix).getKey();
toKey = SuccessorUtil.successor(fromKey.clone());
if (log.isDebugEnabled()) {
log.debug("fromKey=" + BytesUtil.toString(fromKey));
log.debug("toKey =" + BytesUtil.toString(toKey));
}
} else {
// Do not restrict the scan.
fromKey = null;
toKey = null;
}
final ITupleIterator itr = n2a.rangeIterator(fromKey, toKey);
/*
* Add resolver from the tuple to the name of the index.
*/
final IStriterator sitr = new Striterator(itr).addFilter(new Resolver() {
private static final long serialVersionUID = 1L;
@Override
protected Object resolve(Object obj) {
return ((ITuple) obj).getObject().name;
}
});
// if (hasPrefix && !restrictScan) {
//
// /*
// * Only report the names that match the prefix.
// *
// * Note: For the moment, the filter is hacked by examining the
// * de-serialized Entry objects and only reporting those that start
// * with the [prefix].
// */
//
// sitr = sitr.addFilter(new Filter() {
//
// private static final long serialVersionUID = 1L;
//
// @Override
// public boolean isValid(final Object obj) {
//
// final String name = (String) obj;
//
// if (name.startsWith(prefix)) {
//
// // acceptable.
// return true;
// }
// return false;
// }
// });
//
// }
return sitr;
}
// /**
// * The SuccessorUtil does not work with CollatedKeys since it bumps the "meta/control" data
// * at the end of the key, rather than the "value" data of the key.
// *
// * It has been observed that the key data is delimited with a 01 byte, followed by meta/control
// * data with the key itself delimited by a 00 byte.
// *
// * Note that this has only been analyzed for the ICU collator, the standard Java collator does include
// * 00 bytes in the key. However, it too appears to delimit the value key with a 01 byte so the
// * same method should work.
// *
// * @param src - original key
// * @return the next key
// */
// private static byte[] successor(final byte[] src) {
// final byte[] nxt = src.clone();
// for (int i = 1; i < nxt.length; i++) {
// if (nxt[i] == 01) { // end of data
// nxt[i-1]++;
// break;
// }
// }
//
// return nxt;
// }
}