
/**
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
licenses@blazegraph.com
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
package com.bigdata.journal;
import java.io.File;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.Collection;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.UUID;
import java.util.concurrent.BrokenBarrierException;
import java.util.concurrent.Callable;
import java.util.concurrent.CancellationException;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.CyclicBarrier;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.FutureTask;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.Semaphore;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicReference;
import java.util.concurrent.locks.ReentrantLock;
import org.apache.log4j.Logger;
import cern.colt.Arrays;
import com.bigdata.bfs.BigdataFileSystem;
import com.bigdata.bfs.GlobalFileSystemHelper;
import com.bigdata.bop.engine.QueryEngine;
import com.bigdata.bop.fed.QueryEngineFactory;
import com.bigdata.btree.AbstractBTree;
import com.bigdata.btree.BTree;
import com.bigdata.btree.BTreeCounters;
import com.bigdata.btree.BaseIndexStats;
import com.bigdata.btree.ILocalBTreeView;
import com.bigdata.btree.IndexMetadata;
import com.bigdata.btree.IndexSegment;
import com.bigdata.btree.ReadCommittedView;
import com.bigdata.cache.HardReferenceQueue;
import com.bigdata.config.IntegerValidator;
import com.bigdata.config.LongValidator;
import com.bigdata.counters.AbstractStatisticsCollector;
import com.bigdata.counters.CounterSet;
import com.bigdata.counters.httpd.CounterSetHTTPD;
import com.bigdata.ha.HAGlue;
import com.bigdata.ha.HAStatusEnum;
import com.bigdata.ha.HATXSGlue;
import com.bigdata.ha.QuorumService;
import com.bigdata.ha.msg.HAGatherReleaseTimeRequest;
import com.bigdata.ha.msg.HANotifyReleaseTimeRequest;
import com.bigdata.ha.msg.HANotifyReleaseTimeResponse;
import com.bigdata.ha.msg.IHAGatherReleaseTimeRequest;
import com.bigdata.ha.msg.IHANotifyReleaseTimeRequest;
import com.bigdata.ha.msg.IHANotifyReleaseTimeResponse;
import com.bigdata.journal.JournalTransactionService.ValidateWriteSetTask;
import com.bigdata.quorum.Quorum;
import com.bigdata.quorum.QuorumException;
import com.bigdata.rawstore.IRawStore;
import com.bigdata.rdf.task.IApiTask;
import com.bigdata.relation.locator.DefaultResourceLocator;
import com.bigdata.relation.locator.ILocatableResource;
import com.bigdata.relation.locator.IResourceLocator;
import com.bigdata.resources.IndexManager;
import com.bigdata.resources.ResourceManager;
import com.bigdata.resources.StaleLocatorReason;
import com.bigdata.rwstore.IHistoryManager;
import com.bigdata.rwstore.IRawTx;
import com.bigdata.rwstore.RWStore;
import com.bigdata.service.AbstractTransactionService;
import com.bigdata.service.DataService;
import com.bigdata.service.IBigdataFederation;
import com.bigdata.sparse.GlobalRowStoreHelper;
import com.bigdata.sparse.SparseRowStore;
import com.bigdata.util.DaemonThreadFactory;
import com.bigdata.util.InnerCause;
import com.bigdata.util.concurrent.LatchedExecutor;
import com.bigdata.util.concurrent.ShutdownHelper;
import com.bigdata.util.concurrent.ThreadPoolExecutorBaseStatisticsTask;
/**
* Concrete implementation suitable for a local and unpartitioned database.
*
* Note: This implementation does NOT support partitioned indices. Because
* all data must reside on a single journal resource there is no point to a
* view. Views are designed to have data on a mixture of the live journal, one
* or more historical journals, and one or more {@link IndexSegment}s.
*
* @see ResourceManager, which supports views.
*/
public class Journal extends AbstractJournal implements IConcurrencyManager,
/*ILocalTransactionManager,*/ IResourceManager {
/**
* Logger.
*/
static final Logger log = Logger.getLogger(Journal.class);
/**
* @see http://sourceforge.net/apps/trac/bigdata/ticket/443 (Logger for
* RWStore transaction service and recycler)
*/
private static final Logger txLog = Logger.getLogger("com.bigdata.txLog");
/**
* Object used to manage local transactions.
*/
private final AbstractLocalTransactionManager localTransactionManager;
/**
* Object used to manage tasks executing against named indices.
*/
private final ConcurrencyManager concurrencyManager;
/**
* true iff the journal has been configured to use group commit.
*/
private final boolean isGroupCommit;
/**
* Options understood by the {@link Journal}.
*
* @author Bryan Thompson
*/
public interface Options extends com.bigdata.journal.Options,
com.bigdata.journal.ConcurrencyManager.Options,
com.bigdata.journal.TemporaryStoreFactory.Options,
com.bigdata.journal.QueueStatsPlugIn.Options,
com.bigdata.journal.PlatformStatsPlugIn.Options,
com.bigdata.journal.HttpPlugin.Options
// Note: Do not import. Forces bigdata-ganglia dependency.
// com.bigdata.journal.GangliaPlugIn.Options
{
/**
* The name of a boolean option that conditionally enables group commit
* semantics for the Journal based on task-oriented concurrent writers.
* This option indicates a contract that the application agrees to
* respect. When false, the application controls when the database goes
* through a commit point by invoking {@link Journal#commit()}. When
* true, ALL mutation operations MUST be submitted to the
* {@link IConcurrencyManager} of the journal. Note that the journal has
* always supported group commit - this is how it is used in scale-out.
* However, embedded applications have historically made the decision
* about when the database should commit.
* When you specify this option, you are asserting that your code will
* always submit tasks for evaluation using the
* {@link IConcurrencyManager}.
*
* There are several benefits of group commit.
*
* First, you can have multiple tenants in the same database instance and
* the updates for one tenant will no longer block the updates for the
* other tenants. Thus, one tenant can be safely running a long running
* update and other tenants can still enjoy low latency updates.
*
* Second, group commit automatically combines a sequence of updates on
* one (or more) tenant(s) into a single commit point on the disk. This
* provides higher potential throughput. It also means that it is no
* longer as important for applications to batch their updates since group
* commit will automatically perform some batching.
*
* Third, writes on independent indices may be independent (this behavior
* has been around for a long time).
*
* There are a few "gotchas" with the group commit support. This is
* because commits are decided by {@link IApiTask} or {@link AbstractTask}
* completion and tasks are scheduled by the concurrency manager, lock
* manager, and write executor service.
*
* - Mutation tasks that do not complete normally MUST throw an exception!
* - Applications MUST NOT call Journal.commit(). Instead, they submit
* an IApiTask using AbstractApiTask.submit(). The database will meld the
* write set of the task into a group commit sometime after the task
* completes successfully.
*
* @see #566 (NSS GROUP COMMIT).
*/
String GROUP_COMMIT = Journal.class.getName() + ".groupCommit";
String DEFAULT_GROUP_COMMIT = "false";
/**
* The capacity of the {@link HardReferenceQueue} backing the
* {@link IResourceLocator} maintained by the {@link Journal}. The
* capacity of this cache indirectly controls how many
* {@link ILocatableResource}s the {@link Journal} will hold open.
*
* The effect of this parameter is indirect owing to the semantics of
* weak references and the control of the JVM over when they are
* cleared. Once an {@link ILocatableResource} becomes weakly reachable,
* the JVM will eventually GC the object. Since objects which are
* strongly reachable are never cleared, this provides our guarantee
* that resources are never closed if they are in use.
*
* @see #DEFAULT_LOCATOR_CACHE_CAPACITY
*/
String LOCATOR_CACHE_CAPACITY = Journal.class.getName()
+ ".locatorCacheCapacity";
String DEFAULT_LOCATOR_CACHE_CAPACITY = "20";
/**
* The timeout in milliseconds for stale entries in the
* {@link IResourceLocator} cache -or- ZERO (0) to disable the timeout
* (default {@value #DEFAULT_LOCATOR_CACHE_TIMEOUT}). When this timeout
* expires, the reference for the entry in the backing
* {@link HardReferenceQueue} will be cleared. Note that the entry will
* remain in the {@link IResourceLocator} cache regardless as long as it
* is strongly reachable.
*/
String LOCATOR_CACHE_TIMEOUT = Journal.class.getName()
+ ".locatorCacheTimeout";
String DEFAULT_LOCATOR_CACHE_TIMEOUT = "" + (60 * 1000);
/**
* The #of threads that will be used to read on the local disk.
*
* @see Journal#getReadExecutor()
*/
String READ_POOL_SIZE = Journal.class.getName() + ".readPoolSize";
String DEFAULT_READ_POOL_SIZE = "0";
}
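/*
 * A minimal configuration sketch (illustrative only, not part of this
 * class): the option names above are plain property keys, so they can be
 * supplied through a java.util.Properties object when the Journal is
 * instantiated. The specific values shown are assumptions for the sake of
 * the example.
 *
 *   final Properties properties = new Properties();
 *   // Opt in to group commit (default is "false").
 *   properties.setProperty(Journal.Options.GROUP_COMMIT, "true");
 *   // Hold up to 50 locatable resources open via the locator cache.
 *   properties.setProperty(Journal.Options.LOCATOR_CACHE_CAPACITY, "50");
 *   // Use 4 threads to read on the local disk (0 disables the read pool).
 *   properties.setProperty(Journal.Options.READ_POOL_SIZE, "4");
 *   final Journal journal = new Journal(properties);
 */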
/**
* Create or re-open a journal.
*
* @param properties
* See {@link com.bigdata.journal.Options}.
*/
public Journal(final Properties properties) {
this(properties, null/* quorum */);
}
public Journal(final Properties properties,
final Quorum<HAGlue, QuorumService<HAGlue>> quorum) {
super(properties, quorum);
isGroupCommit = Boolean.parseBoolean(properties.getProperty(
Options.GROUP_COMMIT, Options.DEFAULT_GROUP_COMMIT));
tempStoreFactory = new TemporaryStoreFactory(properties);
executorService = (ThreadPoolExecutor) Executors
.newCachedThreadPool(new DaemonThreadFactory(getClass()
.getName()
+ ".executorService"));
// if (Boolean.valueOf(properties.getProperty(
// Options.COLLECT_QUEUE_STATISTICS,
// Options.DEFAULT_COLLECT_QUEUE_STATISTICS))) {
scheduledExecutorService = Executors
.newSingleThreadScheduledExecutor(new DaemonThreadFactory(
getClass().getName() + ".sampleService"));
// } else {
//
// scheduledExecutorService = null;
//
// }
{
final int readPoolSize = Integer.valueOf(properties.getProperty(
Options.READ_POOL_SIZE, Options.DEFAULT_READ_POOL_SIZE));
if (readPoolSize > 0) {
readService = new LatchedExecutor(executorService,
readPoolSize);
} else {
readService = null;
}
}
resourceLocator = newResourceLocator();
resourceLockManager = new ResourceLockService();
localTransactionManager = newLocalTransactionManager();
concurrencyManager = new ConcurrencyManager(properties,
localTransactionManager, this);
getExecutorService().execute(new StartDeferredTasksTask());
if (isGroupCommit() && !(this.isHAJournal())
&& getRootBlockView().getCommitCounter() == 0L) {
/*
* GROUP_COMMIT: See #566.
*
* This hacks in an initial commit to force the GRS to spring into
* existence. Maybe we should just do this for the initial create of
* the Journal when using group commit? Or maybe we should make the
* AbstractTask smart enough to merge two versions of the GRS - it
* should be simple enough to provide write-write conflict resolution
* for the GRS.
*
* Note: This needs to be done by the leader in HA, hence the
* conditional test above.
*/
getGlobalRowStore(); // Force the GRS index to exist.
commit(); // Commit.
}
}
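/*
 * Usage sketch (illustrative only): when GROUP_COMMIT is left at its
 * default of "false", the application decides when a commit point is
 * created by calling commit() directly, e.g. after registering an index.
 * The index name below is a hypothetical example.
 *
 *   final Journal journal = new Journal(properties);
 *   try {
 *       final IndexMetadata metadata =
 *               new IndexMetadata("exampleIndex", UUID.randomUUID());
 *       journal.registerIndex(metadata);
 *       journal.commit(); // make the registration restart-safe.
 *   } finally {
 *       journal.close();
 *   }
 */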
/**
* Ensure that the WORM mode of the journal always uses
* {@link Long#MAX_VALUE} for
* {@link AbstractTransactionService.Options#MIN_RELEASE_AGE}.
*
* @param properties
* The properties.
*
* @return The argument, with the minReleaseAge overridden if necessary.
*
* @see https://sourceforge.net/apps/trac/bigdata/ticket/391
*/
private Properties checkProperties(final Properties properties) {
if (getBufferStrategy() instanceof WORMStrategy) {
properties.setProperty(
AbstractTransactionService.Options.MIN_RELEASE_AGE, ""
+ Long.MAX_VALUE);
}
return properties;
}
/**
* Factory for the {@link IResourceLocator} for the {@link Journal}.
*/
@SuppressWarnings({ "unchecked", "rawtypes" })
protected IResourceLocator<?> newResourceLocator() {
final int cacheCapacity = getProperty(Options.LOCATOR_CACHE_CAPACITY,
Options.DEFAULT_LOCATOR_CACHE_CAPACITY,
IntegerValidator.GT_ZERO);
final long cacheTimeout = getProperty(Options.LOCATOR_CACHE_TIMEOUT,
Options.DEFAULT_LOCATOR_CACHE_TIMEOUT, LongValidator.GTE_ZERO);
return new DefaultResourceLocator(this, null/* delegate */,
cacheCapacity, cacheTimeout);
}
/**
* Inner class used to coordinate the distributed protocol for achieving an
* atomic consensus on the new releaseTime for the services joined
* with a met quorum.
*
* @author Bryan Thompson
*/
private class BarrierState implements Runnable {
/**
* The token that must remain valid.
*/
final private long token;
/**
* The commit counter that will be assigned to the commit point. This is
* used to ensure that the GATHER and PREPARE are for the same commit
* point and that the follower is at the previous commit point.
*/
final private long newCommitCounter;
/**
* The commit time that will be assigned to the commit point. This is
* used to ensure that the GATHER and PREPARE are for the same commit
* point and that the follower is at the previous commit point.
*/
final private long newCommitTime;
/**
* Local HA service implementation (non-Remote).
*/
final private QuorumService<HAGlue> quorumService;
/** The services joined with the met quorum, in their join order. */
final private UUID[] joinedServiceIds;
/**
* {@link CyclicBarrier} used to coordinate the protocol for achieving an
* atomic consensus on the new releaseTime for the services
* joined with a met quorum.
*
* Note: The {@link #barrier} provides visibility for the fields that are
* modified by {@link #run()} so we do not need additional locks or
* atomics for synchronizing these state updates.
*/
final private CyclicBarrier barrier;
// /**
// * The {@link Future} for the RMI to each follower that is joined with
// * the met quorum.
// */
// final private Map> futures = new HashMap>();
/**
* A timestamp taken on the leader when we start the protocol to
* discover the new releaseTime consensus.
*/
final private long timestampOnLeader;
/**
* The {@link UUID} of the quorum leader.
*/
final private UUID leaderId;
/**
* This is the earliest visible commit point on the leader.
*/
final private IHANotifyReleaseTimeRequest leadersValue;
/**
* Exception is set by {@link #run()} if there is a problem when the
* barrier breaks. The exception is then thrown out to the thread on
* the leader that is running commitNow(), forcing the commit to fail.
*/
volatile Throwable cause = null;
/**
* The message from each of those followers providing their local
* earliest visible commit point.
*
* Note: The {@link ConcurrentHashMap} does NOT allow null
* values. Further the {@link IHANotifyReleaseTimeRequest} specifies the
* serviceId of the follower. Therefore, a follower whose
* {@link GatherTask} fails MUST provide a "mock"
* {@link IHANotifyReleaseTimeRequest} that it will use to wait at the
* {@link CyclicBarrier}.
*
* @see InnerJournalTransactionService#notifyEarliestCommitTime(IHANotifyReleaseTimeRequest)
* @see GatherTask
*/
final private Map<UUID, IHANotifyReleaseTimeRequest> followerResponses = new ConcurrentHashMap<UUID, IHANotifyReleaseTimeRequest>();
/**
* The value from {@link #followerResponses} associated with the earliest commit
* point. This is the basis for the "consensus" across the services.
*/
private IHANotifyReleaseTimeRequest minimumResponse = null;
/**
* The consensus value. This is a restatement of the data in from the
* {@link #minimumResponse}. This is set by {@link #run()}.
*/
protected volatile IHANotifyReleaseTimeResponse consensus = null;
// private Quorum> getQuorum() {
//
// return Journal.this.getQuorum();
//
// }
private HATXSGlue getService(final UUID serviceId) {
return quorumService.getService(serviceId);
}
// /**
// * Cancel the requests on the remote services (RMI). This is a best effort
// * implementation. Any RMI related errors are trapped and ignored in order
// * to be robust to failures in RMI when we try to cancel the futures.
// */
// private , T> void cancelRemoteFutures(
// final F[] remoteFutures) {
//
// if (log.isInfoEnabled())
// log.info("");
//
// for (F rf : remoteFutures) {
//
// try {
//
// rf.cancel(true/* mayInterruptIfRunning */);
//
// } catch (Throwable t) {
//
// // ignored (to be robust).
//
// }
//
// }
//
// }
/** The services joined with the met quorum, in their join order. */
public BarrierState(final long newCommitCounter,
final long newCommitTime, final UUID[] joinedServiceIds) {
token = getQuorum().token();
this.newCommitCounter = newCommitCounter;
this.newCommitTime = newCommitTime;
getQuorum().assertLeader(token);
// Local HA service implementation (non-Remote).
quorumService = getQuorum().getClient();
this.joinedServiceIds = joinedServiceIds;
this.leaderId = quorumService.getServiceId();
leadersValue = ((InnerJournalTransactionService) getTransactionService())
.newHANotifyReleaseTimeRequest(leaderId, newCommitCounter,
newCommitTime);
// Note: Local method call.
timestampOnLeader = leadersValue.getTimestamp();
// /*
// * Only the followers will countDown() at the barrier. The leader
// * will await() until the barrier breaks.
// */
final int nparties = joinedServiceIds.length;// - 1;
barrier = new CyclicBarrier(nparties, this);
}
/**
* Find the minimum value across the responses when the {@link #barrier}
* breaks.
*/
@Override
public void run() {
try {
if (log.isInfoEnabled())
log.info("leader: " + leadersValue);
// This is the timestamp from the BarrierState ctor.
final long timeLeader = leadersValue.getTimestamp();
// This is the timestamp for right now.
final long timeNow = newConsensusProtocolTimestamp();
// // The local clock must be moving forward.
// assertBefore(timeLeader, timeNow);
// Start with the leader's value (from ctor).
minimumResponse = leadersValue;
for (IHANotifyReleaseTimeRequest response : followerResponses
.values()) {
if (log.isTraceEnabled())
log.trace("follower: " + response);
if (response.isMock()) {
/**
* The mock response should not have any influence on
* the consensus release time. The follower provides the
* mock response when it is unable to execute the
* GatherTask correctly, typically because it is not yet
* HAReady. The mock response preserves liveness for the
* GATHER protocol. The follower that provided the mock
* response will vote NO for the PREPARE because its
* GatherTask will have thrown an exception.
*
* @see HA3 simultaneous service start failure
*/
log.warn("Ignoring mock response: " + response);
continue;
}
final UUID followerId = response.getServiceUUID();
if (minimumResponse.getPinnedCommitCounter() > response
.getPinnedCommitCounter()) {
minimumResponse = response;
}
/*
* Verify that the timestamp from the ctor is BEFORE the
* timestamp assigned by the follower in the GatherTask.
*/
assertBefore(leaderId, followerId, timeLeader,
response.getTimestamp());
/*
* Verify that the timestamp from the GatherTask on the
* follower is before the timestamp obtained at the top of
* this run() method.
*/
assertBefore(followerId, leaderId, response.getTimestamp(),
timeNow);
}
// Restate the consensus as an appropriate message object.
consensus = new HANotifyReleaseTimeResponse(
minimumResponse.getPinnedCommitTime(),
minimumResponse.getPinnedCommitCounter());
if (log.isInfoEnabled())
log.info("consensus: " + consensus);
} catch (Throwable t) {
// Set the cause.
cause = t;
}
}
/**
* Task does an RMI to the follower to start the GatherTask on the
* follower.
*/
private final class StartGatherOnFollowerTask implements Callable<Void> {
private final UUID serviceId;
private final IHAGatherReleaseTimeRequest msg;
public StartGatherOnFollowerTask(final UUID serviceId,
final IHAGatherReleaseTimeRequest msg) {
this.serviceId = serviceId;
this.msg = msg;
}
public Void call() throws Exception {
// Resolve joined service.
final HATXSGlue service = getService(serviceId);
// Message remote service.
// Note: NPE if [service] is gone.
service.gatherMinimumVisibleCommitTime(msg);
// Done.
return null;
}
} // class StartGatherOnFollowerTask
/**
* Send an {@link IHAGatherReleaseTimeRequest} message to each follower.
* Block until the responses are received.
*
* Note: Like the 2-phase commit, the overall protocol should succeed if
* we can get ((k+1)/2) services that do not fail this
* step. Thus for HA3, we should allow one error on a follower, the
* leader is sending the messages and is presumed to succeed, and one
* follower COULD fail without failing the protocol. If the protocol
* does fail we have to fail the commit, so getting this right is
* NECESSARY. At a minimum, we must not fail if all joined services on
* entry to this method respond without failing (that is, succeed if no
* services fail during this protocol) - this is implemented.
*
* @throws InterruptedException
* @throws BrokenBarrierException
* @throws TimeoutException
*
* @see Native thread leak in HAJournalServer process
*/
private void messageFollowers(final long token, final long timeoutNanos)
throws IOException, InterruptedException,
BrokenBarrierException, TimeoutException {
final long begin = System.nanoTime();
final long nanos = timeoutNanos;
long remaining = nanos;
// Verify that this service is still the leader.
getQuorum().assertLeader(token);
// /*
// * Future for gather task for each follower.
// *
// * Note: These are asynchronous remote Futures. They must not escape
// * the local scope and must be cancelled regardless of the outcome.
// *
// * Note: DGC for these remote causes a native thread leak on the
// * followers. To avoid that, I am attempting to rely on proxies for
// * remote futures that do not use DGC. In this case, I believe that
// * it will work since the Future is in scope on the follower (it is
// * the future for the GatherTask running on the follower) and thus
// * we should not need DGC to keep the follower from finalizing the
// * remote futures on which this method is relying.
// *
// * @see https://sourceforge.net/apps/trac/bigdata/ticket/673
// */
// @SuppressWarnings("unchecked")
// final Future[] remoteFutures = new Future[joinedServiceIds.length];
// final boolean[] remoteDone = new boolean[joinedServiceIds.length];
// Local future for RMI requesting GATHER on each follower.
final List<Future<Void>> futures = new LinkedList<Future<Void>>();
try {
final IHAGatherReleaseTimeRequest msg = new HAGatherReleaseTimeRequest(
token, timestampOnLeader, leaderId, newCommitCounter,
newCommitTime);
// Do not send message to self (leader is at index 0).
for (int i = 1; i < joinedServiceIds.length; i++) {
final UUID serviceId = joinedServiceIds[i];
/*
* Message each follower.
*
* Note: The invoked RMI method submits the GatherTask that
* executes on the follower and returns. It does not block
* waiting for the outcome of the task on the follower.
* Instead, we wait until the barrier breaks. A thread will
* monitor the quorum state and break the barrier if the
* quorum breaks or if a joined service leaves during the
* consensus protocol.
*
* Note: This uses multiple threads to issue the requests in
* parallel against the followers in order to minimize the
* latency of the protocol.
*/
// Note: throws RejectedExecutionException if shutdown.
futures.add(getExecutorService().submit(
new StartGatherOnFollowerTask(serviceId, msg)));
// // add to list of futures we will check.
// remoteFutures[i] = rf;
}
// /*
// * Check the futures for the other services in the quorum.
// */
// final List causes = new LinkedList();
// for (int i = 1; i < remoteFutures.length; i++) {
// final Future rf = remoteFutures[i];
// boolean success = false;
// try {
// rf.get();
// success = true;
// remoteDone[i] = true;
// } catch (InterruptedException ex) {
// log.error(ex, ex);
// causes.add(ex);
// } catch (ExecutionException ex) {
// log.error(ex, ex);
// causes.add(ex);
// remoteDone[i] = true;
// } catch (RuntimeException ex) {
// /*
// * Note: ClientFuture.get() can throw a RuntimeException
// * if there is a problem with the RMI call. In this case
// * we do not know whether the Future is done.
// */
// log.error(ex, ex);
// causes.add(ex);
// } finally {
// if (!success) {
// // Cancel the request on the remote service (RMI).
// try {
// rf.cancel(true/* mayInterruptIfRunning */);
// } catch (Throwable t) {
// // ignored.
// }
// remoteDone[i] = true;
// }
// }
// }
// spinWaitBarrier(getQuorum(), barrier, token, timeout, units);
/*
* This sets up a task that will monitor the quorum state and
* then interrupt this Thread if it is blocked at the barrier
* [actually, it uses barrier.reset(), which appears to be a
* little safer].
*
* If this service is no longer the quorum leader or if any of
* the services leave that were joined with the met quorum when
* we started the release time consensus protocol, then we have
* to reset() the barrier. We achieve this by interrupting the
* Thread (actually it now uses barrier.reset()).
*
* Note: CyclicBarrier.await(timeout,unit) causes the barrier to
* break if the timeout is exceeded (as opposed to simply throwing
* the TimeoutException and allowing the thread to retry the
* CyclicBarrier.await()). Therefore it CAN NOT be used in
* preference to this pattern. However, we could replace the use of
* the CyclicBarrier with a Phaser (JDK 1.7 or jsr166).
*/
{
// final Thread blockedAtBarrier = Thread.currentThread();
final Quorum<HAGlue, QuorumService<HAGlue>> quorum = getQuorum();
final long initialDelay = 100; // milliseconds.
final long delay = initialDelay;
final ScheduledFuture<?> scheduledFuture = scheduledExecutorService
.scheduleWithFixedDelay(new Runnable() {
public void run() {
try {
// Verify service is still leader.
quorum.assertLeader(token);
// Verify service self-recognizes as leader.
if (getHAStatus() != HAStatusEnum.Leader) {
throw new QuorumException();
}
// Verify messaged services still
// joined.
assertServicesStillJoined(quorum);
for (Future<Void> f : futures) {
if (f.isDone()) {
/*
* Note: If any follower fails
* on the RMI, then that is
* noticed here and the GATHER
* will fail on the leader.
*
* TODO This should be robust as
* long as a majority of the
* services succeed. Right now
* this will stop the GATHER if
* any service fails on the RMI.
*/
f.get();
}
}
} catch (Throwable ex) {
if (InnerCause.isInnerCause(ex,
InterruptedException.class)) {
// Normal termination.
return;
}
logErrorAndResetBarrier(ex);
}
}
}, initialDelay, delay, TimeUnit.MILLISECONDS);
// Update time remaining.
remaining = nanos - (System.nanoTime() - begin);
try {
/*
* Throws InterruptedException, BrokenBarrierException,
* TimeoutException.
*
* Note: If TimeoutException is thrown, then the barrier
* will be broken.
*/
barrier.await(remaining, TimeUnit.NANOSECONDS);
} finally {
scheduledFuture.cancel(true/* mayInterruptIfRunning */);
}
}
// /*
// * If there were any errors, then throw an exception listing them.
// */
// if (!causes.isEmpty()) {
// // Note: Cancelled below.
//// // Cancel remote futures.
//// cancelRemoteFutures(remoteFutures);
// // Throw exception back to the leader.
// if (causes.size() == 1)
// throw new RuntimeException(causes.get(0));
// throw new RuntimeException("remote errors: nfailures="
// + causes.size(), new ExecutionExceptions(causes));
// }
} finally {
/*
* Cancel local futures for RMI messages to followers.
*
* Note: Regardless of outcome or errors above, ensure that the
* futures used to initiate the GatherTask on the followers are
* cancelled. These are local Futures that do RMIs. The RMIs
* should not block when they execute on the follower.
*/
for (Future<Void> f : futures) {
/*
* Await the Future with a short timeout for the task that
* did the RMI to each follower to participate in the GATHER
* protocol. Ensure that these Futures are cancelled if they
* are not already done, but prefer not to attempt to cancel
* the Future if it would be finished if we waited for a few
* milliseconds.
*/
boolean done = false;
try {
f.get(10, TimeUnit.MILLISECONDS);
done = true;
} catch (Exception ex) {
// IGNORE
} finally {
if (!done) {
// cancel local future.
f.cancel(true/* mayInterruptIfRunning */);
}
}
/*
* Future is done. Either cancelled, error, or successfully
* completed per the code block immediately above.
*/
try {
// check outcome of future.
f.get();
} catch (CancellationException e) {
/*
* There is a race between the code path returning from
* the RMI (to request the caller to participate in the
* gather protocol) and the code path in which the
* leader awakens from the broken barrier (above). We
* have explicitly cancelled the future for that RMI and
* then checked the Future. If the leader wakes up
* before the RMI returns, then the Future will be
* cancelled rather than done. This case is a race, not an
* error.
*/
if (log.isInfoEnabled())
log.info(e);// , e);
} catch (ExecutionException e) {
// Probably error on the RMI.
log.error(e, e);
}
}
if (consensus == null) {
/*
* If there were any followers that did not message the
* leader and cause the barrier to be decremented and hence
* the [consensus] to become defined, then we need to
* decrement the barrier for those followers now in order
* for it to break.
*
* There is no method to decrement by a specific number
* (unlike a semaphore), but you can reset() the barrier,
* which will cause a BrokenBarrierException for all Threads
* waiting on the barrier.
*
* Note: It appears that [barrier.isBroken()] always reports
* [false] here. Hence, the test was changed to
* [consensus==null]. See
* HA3 simultaneous service start failure
*
* FIXME HA TXS: A reset() here does not allow us to proceed
* with the consensus protocol unless all services
* "vote yes". Thus, a single node failure during the
* release time consensus protocol will cause the commit to
* fail. [Actually, we could use getNumberWaiting(). If it
* is a bare majority, then we could take the barrier break
* action ourselves. E.g., in the thread that calls
* barrier.reset()]. [Actually, this might not be a problem
* for cases where the GatherTask is able to send back a
* mock IHANotifyReleaseTimeRequest message, only when we
* are interrupted by the Runnable above that is monitoring
* the quorum state for an invariant change.]
*/
log.error("Forcing barrier break");
barrier.reset();
}
}// finally
}
// /**
// * Wait on the {@link CyclicBarrier}, but do this in a loop so we can
// * watch for a quorum break or service leave.
// *
// * @param quorum
// * @param barrier
// * @param timeout
// * @param units
// *
// * @throws BrokenBarrierException
// * @throws InterruptedException
// * @throws TimeoutException
// */
// private void spinWaitBarrier(
// final Quorum> quorum,
// final CyclicBarrier barrier, final long token,
// final long timeout, final TimeUnit unit)
// throws BrokenBarrierException, InterruptedException,
// TimeoutException {
//
// if (log.isInfoEnabled())
// log.info("Waiting at barrier: #parties=" + barrier.getParties()
// + ", #waiting=" + barrier.getNumberWaiting()
// + ", isBroken=" + barrier.isBroken() + ", token="
// + token + ", timeout=" + timeout + ", unit=" + unit);
//
// // How lock to block in each iteration.
// final long blockNanos = TimeUnit.MILLISECONDS.toNanos(10000);
//
// final long begin = System.nanoTime();
// final long nanos = unit.toNanos(timeout);
// long remaining = nanos;
// long nspin = 0L;
//
// try {
//
// while (remaining > 0) {
//
// nspin++;
//
// remaining = nanos - (System.nanoTime() - begin);
//
// try {
//
// // Verify that this service remains the leader.
// quorum.assertLeader(token);
//
// // Verify messaged services are still joined.
// assertServicesStillJoined(quorum);
//
// /*
// * If we observe a serviceLeave for any service that we
// * are awaiting, then we need to stop waiting on that
// * service. This could be achieved by running a Thread
// * that did a barrier.await() on the behalf of that
// * service, but only if that service has not yet
// * responded with its input for the consensus protocol
// * [if it has responded then it is probably already at
// * barrier.await() in a Thread on the leader for that
// * follower.]
// */
// final long awaitNanos = Math.min(blockNanos, remaining);
//
// /*
// * Await barrier, up to the timeout.
// *
// * Note: Contrary to the javadoc, barrier.await(timeout)
// * will break the barrier if the timeout is exceeded!!!
// */
// barrier.await(awaitNanos, TimeUnit.NANOSECONDS);
//
// // Done.
// return;
//
// } catch (TimeoutException e) {
// // Spin.
// continue;
// } catch (InterruptedException e) {
// throw e;
// } catch (BrokenBarrierException e) {
// throw e;
// }
//
// }
//
// } finally {
//
// /*
// * Note: On exit, the caller must reset() the barrier if it is
// * not yet broken.
// */
//
// if (log.isInfoEnabled())
// log.info("barrier: #parties=" + barrier.getParties()
// + ", #waiting=" + barrier.getNumberWaiting()
// + ", isBroken=" + barrier.isBroken() + ", #spin="
// + nspin);
//
// }
//
// }
private void logErrorAndResetBarrier(final Throwable ex) {
log.error(ex, ex);
if (!barrier.isBroken()) {
log.error("Forcing barrier break");
barrier.reset();
}
}
/**
* Verify that the services that were messaged for the release time
* consensus protocol are still joined with the met quorum.
*
* @throws QuorumException
* if one of the joined services leaves.
*/
private void assertServicesStillJoined(
final Quorum<HAGlue, QuorumService<HAGlue>> quorum)
throws QuorumException {
final UUID[] tmp = quorum.getJoined();
for (UUID serviceId : joinedServiceIds) {
boolean found = false;
for (UUID t : tmp) {
if (serviceId.equals(t)) {
found = true;
break;
}
}
if (!found) {
throw new QuorumException(
"Service leave during consensus protocol: "
+ serviceId);
}
}
}
}
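/*
 * The shape of the protocol implemented by BarrierState, reduced to a
 * self-contained sketch (illustrative only; the names and values below are
 * invented for the example and are not part of this class). Each party
 * publishes its locally pinned value into a concurrent map and then awaits
 * the barrier; the barrier action runs exactly once when the last party
 * arrives, folding the published values into a single consensus minimum
 * that all parties can then read.
 *
 *   final ConcurrentHashMap<UUID, Long> responses = new ConcurrentHashMap<UUID, Long>();
 *   final AtomicReference<Long> consensus = new AtomicReference<Long>();
 *   final int nparties = 3;
 *   final CyclicBarrier barrier = new CyclicBarrier(nparties, new Runnable() {
 *       public void run() {
 *           long min = Long.MAX_VALUE;
 *           for (Long v : responses.values())
 *               min = Math.min(min, v);
 *           consensus.set(min); // visible to all parties once the barrier breaks.
 *       }
 *   });
 *   // Each party, in its own thread:
 *   //   responses.put(myServiceId, myPinnedCommitTime);
 *   //   barrier.await(); // or await(timeout, unit) with a monitor that reset()s on error.
 *   //   final long agreed = consensus.get();
 */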
/**
* Note: This deliberately uses the (non-remote) method
* {@link BasicHA#nextTimestamp()}. This is done so we can write a unit test
* of the {@link GatherTask} that imposes clock skew by overriding the next
* value to be returned by that method.
*/
private long newConsensusProtocolTimestamp() {
return ((BasicHA) getQuorum().getClient().getService()).nextTimestamp();
}
/**
* {@inheritDoc}
*
* Extends the {@link JournalTransactionService} to provide protection for
* the session protection mode of the {@link RWStore} and to support the
* {@link HATXSGlue} interface.
*
* @author Bryan Thompson
*
* @see HA TXS Design Document
*
* @see HA TXS / TXS Bottleneck
*/
private class InnerJournalTransactionService extends
JournalTransactionService {
protected InnerJournalTransactionService() {
super(checkProperties(properties), Journal.this);
final long lastCommitTime = Journal.this.getLastCommitTime();
if (lastCommitTime != 0L) {
/*
* Notify the transaction service on startup so it can set the
* effective release time based on the last commit time for the
* store.
*
* Note: For HA, the releaseTime is updated by the consensus
* protocol once a quorum is met. Before the quorum meets (and
* before a service joins with a met quorum) each service will
* track its own releaseTime. Therefore, during startup, the
* quorum will be null or HAStatusEnum will be NotReady so the
* TXS will automatically track the release time until the
* service joins with a met quorum.
*/
if (log.isInfoEnabled())
log.info("Startup: lastCommitTime=" + lastCommitTime);
updateReleaseTimeForBareCommit(lastCommitTime);
}
}
/**
* @see RWStore does not track tx release correctly
*/
final private ConcurrentHashMap<Long, IRawTx> m_rawTxs = new ConcurrentHashMap<Long, IRawTx>();
/**
* This lock is used to ensure that the following actions are MUTEX:
*
* - The barrier where we obtain a consensus among the services joined
* with the met quorum concerning the new release time.
* - A remote service that wishes to join an already met quorum.
* - A new transaction start that would read on a commit point which
* is LT than the readsOnCommitTime of the earliestActiveTx for this
* service but GT earliest visible commit point for this service (as
* determined by the releaseTime on the transaction service).
*
* Any of these actions must contend for the {@link #barrierLock}.
*
* @see #updateReleaseTimeConsensus(long, TimeUnit)
* @see GatherTask#call()
* @see #newTx(long)
* @see #runWithBarrierLock(Runnable)
*/
final private ReentrantLock barrierLock = new ReentrantLock();
// final private Condition barrierBroke = barrierLock.newCondition();
/**
* This is used to coordinate the protocol for achieving an atomic
* consensus on the new releaseTime for the services joined with
* a met quorum.
*/
final private AtomicReference<BarrierState> barrierRef = new AtomicReference<BarrierState>();
@Override
public void runWithBarrierLock(final Runnable r) {
barrierLock.lock();
try {
haLog.info("Will run with barrier lock.");
try {
r.run();
} catch(Throwable t) {
/*
* Note: An Interrupt here is not really an ERROR. It
* could be caused by a change in the RunState of the
* HAJournalServer.
*/
haLog.error(t, t);
} finally {
haLog.info("Did run with barrier lock.");
}
} finally {
barrierLock.unlock();
}
}
/**
* {@inheritDoc}
*
* We need to obtain a distributed consensus for the services joined with
* the met quorum concerning the earliest commit point that is pinned by
* the combination of the active transactions and the minReleaseAge on
* the TXS.
*
* New transaction starts during this critical section will block (on
* the leader or the follower) unless they are guaranteed to be
* allowable, e.g., based on the current minReleaseAge, the new tx would
* read from the most recent commit point, the new tx would read from a
* commit point that is already pinned by an active transaction on that
* node, etc.
*
* @throws IOException
* @throws BrokenBarrierException
*/
// Note: Executed on the leader.
@Override
public IHANotifyReleaseTimeResponse updateReleaseTimeConsensus(
final long newCommitCounter,
final long newCommitTime,
final UUID[] joinedServiceIds, final long timeout,
final TimeUnit units) throws IOException, InterruptedException,
TimeoutException, BrokenBarrierException {
final long begin = System.nanoTime();
final long nanos = units.toNanos(timeout);
long remaining = nanos;
final long token = getQuorum().token();
if (haLog.isInfoEnabled())
haLog.info("GATHER PROTOCOL: commitCounter=" + newCommitCounter
+ ", token=" + token + ", joinedServiceIds="
+ Arrays.toString(joinedServiceIds));
final BarrierState barrierState;
barrierLock.lock();
try {
getQuorum().assertLeader(token);
if (!barrierRef.compareAndSet(null/* expectedValue */,
barrierState = new BarrierState(newCommitCounter,
newCommitTime, joinedServiceIds)/* newValue */)) {
throw new IllegalStateException();
}
try {
/*
* Message the followers and block until the barrier breaks.
*/
// Update time remaining.
remaining = nanos - (System.nanoTime() - begin);
barrierState.messageFollowers(token, remaining);
} finally {
// Clear the barrierRef.
if (!barrierRef.compareAndSet(barrierState/* expected */,
null)) {
throw new AssertionError();
}
}
if (barrierState.cause != null) {
/*
* If an exception was recorded, re-throw it in the thread
* that invoked commitNow().
*/
throw new RuntimeException(barrierState.cause);
}
/**
* Update the release time on the leader.
*
* Note: The follower has the new release time, but it is
* running asynchronously and might not have updated its release
* time locally by the time the leader leaves the consensus
* protocol. prepare2Phase() (on the follower) will check the
* Future of the GatherTask and block until it is complete. Thus
* all services will be in a state where they are known to have
* updated their release time (based on the consensus protocol)
* before we finish prepare2Phase() and hence before we run
* commit2Phase().
*
* @see Native thread leak in HAJournalServer process
*/
final IHANotifyReleaseTimeResponse consensus = barrierState.consensus;
if (consensus == null) {
throw new RuntimeException("No consensus");
}
final long consensusValue = consensus.getCommitTime();
final long newReleaseTime = Math.max(0L, consensusValue - 1);
if (log.isInfoEnabled())
log.info("Advancing releaseTime on leader: "
+ newReleaseTime);
setReleaseTime(newReleaseTime);
return consensus;
} finally {
barrierLock.unlock();
}
}
/**
* {@inheritDoc}
*
* Overridden to notice whether this service is using the consensus
* protocol to update the releaseTime or updating it automatically as
* transactions complete.
*
* @see Journal HA
*/
@Override
protected boolean isReleaseTimeConsensusProtocol() {
final HAStatusEnum haStatus = getHAStatus();
if (haStatus == null || haStatus == HAStatusEnum.NotReady) {
/*
* Since we are not HA or this service is not HAReady, we will
* not use the consensus protocol to update the releaseTime.
*
* Therefore the releaseTime is updated here since we will not
* (actually, did not) run the consensus protocol to update it.
*/
return false;
}
/*
* Note: When we are using a 2-phase commit, the leader can not
* update the release time from commit() using this method. It
* must rely on the consensus protocol to update the release
* time instead.
*/
return true;
}
// /**
// * {@inheritDoc}
// *
// * Note: When we are using a 2-phase commit, the leader can not update
// * the release time from commit() using this methods. It must rely on
// * the consensus protocol to update the release time instead.
// *
// * @see
// * Journal HA
// */
// @Override
// protected void updateReleaseTimeForBareCommit(final long commitTime) {
//
// final HAStatusEnum haStatus = getHAStatus();
//
// if (haStatus == null || haStatus == HAStatusEnum.NotReady) {
//
// /*
// * Since we are not HA or this service is not HAReady, we will
// * not use the consensus protocol to update the releaseTime.
// *
// * Therefore the releaseTime is updated here since we will not
// * (actually, did not) run the consensus protocol to update it.
// */
// super.updateReleaseTimeForBareCommit(commitTime);
//
// } else {
//
// /*
// * Note: When we are using a 2-phase commit, the leader can not
// * update the release time from commit() using this methods. It
// * must rely on the consensus protocol to update the release
// * time instead.
// */
//
// }
//
// }
/**
* {@inheritDoc}
*
* Overridden to take the necessary lock since we are invoking this
* method from contexts in which the lock would not otherwise be held.
*
* Note: This is also used to callback on service join to set the
* consensus release time from the leader of a newly joined follower.
* This ensures that live joiners need not take part in a gather and can
* still confidently join in an HA 2 phase commit.
*
* @param newReleaseTime
* The new release time for the local journal.
*/
@Override
public void setReleaseTime(final long newValue) {
if (newValue < 0)
throw new IllegalArgumentException();
lock.lock();
try {
super.setReleaseTime(newValue);
} finally {
lock.unlock();
}
}
/**
* Factory for {@link IHANotifyReleaseTimeRequest} messages. This is
* used by both the leader and the followers.
*
* @param serviceId
* The {@link UUID} for this service.
* @return The new message.
*/
protected IHANotifyReleaseTimeRequest newHANotifyReleaseTimeRequest(
final UUID serviceId, final long newCommitCounter,
final long newCommitTime) {
// On AbstractTransactionService.
final long effectiveReleaseTimeForHA = getEffectiveReleaseTimeForHA();
// On AbstractJournal
final ICommitRecord commitRecord = getEarliestVisibleCommitRecordForHA(effectiveReleaseTimeForHA);
final long commitCounter = commitRecord == null ? 0
: commitRecord.getCommitCounter();
final long commitTime = commitRecord == null ? 0
: commitRecord.getTimestamp();
// final long now = getLocalTransactionManager().nextTimestamp();
final long now = newConsensusProtocolTimestamp();
final IHANotifyReleaseTimeRequest req = new HANotifyReleaseTimeRequest(
serviceId, commitTime, commitCounter, now, false/* isMock */,
newCommitCounter, newCommitTime);
if (log.isTraceEnabled())
log.trace("releaseTime=" + getReleaseTime()//
+ ",effectiveReleaseTimeForHA="
+ effectiveReleaseTimeForHA //
+ ",rootBlock=" + getRootBlockView() //
+ ",req=" + req//
);
return req;
}
@Override
public Callable<IHANotifyReleaseTimeResponse> newGatherMinimumVisibleCommitTimeTask(
final HAGlue leader, final UUID serviceId,
final IHAGatherReleaseTimeRequest req) {
return new GatherTask(leader, serviceId, req);
}
/**
* {@inheritDoc}
*
* Note: This method is implemented by {@link AbstractJournal.BasicHA}
* which calls through to
* {@link #newGatherMinimumVisibleCommitTimeTask(IHAGatherReleaseTimeRequest)}
*
* @throws UnsupportedOperationException
*/
@Override
public void gatherMinimumVisibleCommitTime(
final IHAGatherReleaseTimeRequest req) throws IOException {
throw new UnsupportedOperationException();
}
/**
* "Gather" task runs on the followers.
*
* Note: The gather task scopes the consensus protocol on the follower.
* It contends for the {@link #barrierLock} (on the follower) in order
* to be MUTEX with new read-only tx starts on the follower which (a)
* occur during the consensus protocol; and (b) would read on a commit
* point that is not pinned by any of an active transaction on the
* follower, the minReleaseAge, or being the most recent commit point.
* These are the criteria that allow {@link #newTx(long)} to NOT contend
* for the {@link #barrierLock}.
*
* @see #newTx(long)
*/
private class GatherTask implements Callable<IHANotifyReleaseTimeResponse> {
/**
* The proxy for the leader (the service that made this request).
* This is used to RMI back to the leader and therefore MUST be non-null.
*/
private final HAGlue leader;
/**
* The {@link UUID} of this service. This is required as
* part of the RMI back to the leader (so the leader knows which
* services responded) and therefore MUST be non-null.
*/
private final UUID serviceId;
private final IHAGatherReleaseTimeRequest req;
/**
* This variable is set in the try {} block in {@link #call()}. We
* eventually respond (sending an RMI to the leader) either in the
* try{} or in the finally{}, depending on whether or not the
* {@link GatherTask} encounters an error when it executes.
*/
volatile private boolean didNotifyLeader = false;
public GatherTask(final HAGlue leader, final UUID serviceId,
final IHAGatherReleaseTimeRequest req) {
if (leader == null)
throw new IllegalArgumentException();
if (serviceId == null)
throw new IllegalArgumentException();
if (req == null)
throw new IllegalArgumentException();
this.leader = leader;
this.serviceId = serviceId;
this.req = req;
}
/**
* Note: This needs to be robust to most kinds of errors. However,
* if the quorum breaks (leader leaves) or if a follower leaves that
* was joined with the met quorum as of the atomic decision point in
* commitNow(), then that change will be detected by the leader and
* it will break the {@link CyclicBarrier}.
*/
@Override
public IHANotifyReleaseTimeResponse call() throws Exception {
if (haLog.isInfoEnabled())
haLog.info("Running gather on follower");
/*
* This variable is set in the try {} below. We eventually
* respond either in the try{} or in the finally{}, depending on
* whether or not the GatherTask encounters an error when it
* executes.
*/
didNotifyLeader = false;
try {
/*
* Test pre-conditions BEFORE getting the barrierLock. This
* allows a service that is not yet properly joined to
* refuse to do the GATHER before it obtains the barrierLock
* that makes the GatherTask MUTEX with
* doCastLeadersVoteAndServiceJoin().
*/
preconditionTest();
barrierLock.lock(); // take lock on follower!
try {
// Re-test the pre-conditions.
preconditionTest();
return doRunWithBarrierLock();
} finally {
barrierLock.unlock();
}
} catch (Throwable t) {
log.error(t, t);
if (!didNotifyLeader) {
/**
* Send mock response to the leader so it does not block
* forever waiting for our response. The mock response
* MUST include our correct serviceId.
*
* @see HA3 simultaneous service start failure
*/
try {
final IHANotifyReleaseTimeRequest resp = new HANotifyReleaseTimeRequest(
serviceId, 0L/* pinnedCommitTime */,
0L/* pinnedCommitCounter */,
nextTimestamp()/* timestamp */,
true/* isMock */,
req.getNewCommitCounter(),
req.getNewCommitTime());
log.warn("Sending mock response for gather protocol: cause="
+ t);
// Will block until barrier breaks on leader.
leader.notifyEarliestCommitTime(resp);
} catch (Throwable t2) {
log.error(t2, t2);
}
}
/*
* This exception will force PREPARE to fail on this service
* when it checks the GatherTask's Future.
*/
throw new Exception(t);
}
}
/**
* Check various conditions that need to be true.
*
* Note: We do this once before we take the barrier lock and once
* after. We need to do this before we take the barrier lock to
* avoid a distributed deadlock when a service is attempting to do
* runWithBarrierLock() to join concurrent with the GATHER of a
* 2-phase commit. We do it after we take the barrier lock to ensure
* that the conditions are still satisfied - they are all light
* weight tests, but the conditions could become invalidated so it
* does not hurt to check again.
*/
private void preconditionTest() {
final long token = req.token();
/*
* we do not need to handle the case where the token is
* invalid. The leader will reset() the CyclicBarrier for
* this case.
*/
// Verify quorum valid for token (implies leader valid)
getQuorum().assertQuorum(token);
// Verify this service is HAReady for token.
assertHAReady(token);
/*
* If the quorumService is null because this service is
* shutting down then the leader will notice the
* serviceLeave() and reset() the CyclicBarrier.
*/
final QuorumService<HAGlue> quorumService = getQuorum()
.getClient();
// /*
// * This timestamp is used to help detect clock skew.
// */
// now = newConsensusProtocolTimestamp();
/*
* Note: At this point we have everything we need to form up
* our response. If we hit an assertion, we will still
* respond in the finally {} block below.
*/
/*
* Note: This assert has been moved to the leader when it
* analyzes the messages from the followers. This allows us
* to report out the nature of the exception on the leader
* and thence back to the client.
*/
// /* Verify event on leader occurs before event on follower.
// */
// assertBefore(req.getTimestampOnLeader(), now);
if (!quorumService.isFollower(token))
throw new QuorumException();
final long localCommitCounter = getRootBlockView()
.getCommitCounter();
if (req.getNewCommitCounter() != localCommitCounter + 1) {
throw new RuntimeException(
"leader is preparing for commitCounter="
+ req.getNewCommitCounter()
+ ", but follower is at localCommitCounter="
+ localCommitCounter);
}
}
/**
* This code is MUTEX with runWithBarrierLock() in HAJournalServer's
* doCastLeadersVoteAndJoin().
*/
private IHANotifyReleaseTimeResponse doRunWithBarrierLock()
throws Exception {
final IHANotifyReleaseTimeRequest req2 = newHANotifyReleaseTimeRequest(
serviceId, req.getNewCommitCounter(),
req.getNewCommitTime());
/*
* RMI to leader.
*
* Note: Will block until barrier breaks on the leader.
*/
didNotifyLeader = true;
final IHANotifyReleaseTimeResponse consensusReleaseTime = leader
.notifyEarliestCommitTime(req2);
/*
* Now spot check the earliest active tx on this follower. We
* want to make sure that this tx is not reading against a
* commit point whose state would be released by the new
* [consensusReleaseTime] that we just obtained from the leader.
*
* If everything is Ok, we update the releaseTime on the
* follower.
*/
lock.lock();
try {
if (log.isInfoEnabled())
log.info("Validating consensus releaseTime on follower: consensus="
+ consensusReleaseTime);
// the earliest active tx on this follower.
final TxState txState = getEarliestActiveTx();
// Consensus for new earliest visible commit time.
final long t2 = consensusReleaseTime.getCommitTime();
if (txState != null && txState.getReadsOnCommitTime() < t2) {
/*
* At least one transaction exists on the follower that
* is reading on a commit point LT the commit point
* which would be released. This is either a failure in
* the logic to compute the consensus releaseTime or a
* failure to exclude new transaction starts on the
* follower while computing the new consensus
* releaseTime.
*/
throw new AssertionError(
"The releaseTime consensus would release a commit point with active readers"
+ ": consensus=" + consensusReleaseTime
+ ", earliestActiveTx=" + txState);
}
final long newReleaseTime = Math.max(0L,
consensusReleaseTime.getCommitTime() - 1);
if (log.isInfoEnabled())
log.info("Advancing releaseTime on follower: "
+ newReleaseTime);
// Update the releaseTime on the follower
setReleaseTime(newReleaseTime);
} finally {
lock.unlock();
}
// Done.
return consensusReleaseTime;
} // doRunWithBarrierLock
} // GatherTask
/**
* {@inheritDoc}
*
* Note: Message sent by follower (RMI). Method executes on leader.
*
* We pass the message through to the {@link BarrierState} object.
*
* Note: We MUST NOT contend for the {@link #barrierLock} here. That
* lock is held by the Thread that invoked
* {@link #updateReleaseTimeConsensus()}.
*/
@Override
public IHANotifyReleaseTimeResponse notifyEarliestCommitTime(
final IHANotifyReleaseTimeRequest req) throws IOException,
InterruptedException, BrokenBarrierException {
/*
* Note: Do NOT error check [req] until we are in the try{} /
* finally {} below that will do the CyclicBarrier.await().
*/
final BarrierState barrierState = barrierRef.get();
if (barrierState == null) {
/*
* If the BarrierState reference has been cleared then it is not
* possible for us to count down at the barrier for this message
* (since the CyclicBarrier is gone). Otherwise, we will await()
* at the CyclicBarrier regardless of the message.
*/
throw new IllegalStateException();
}
try {
if (haLog.isInfoEnabled())
haLog.info("resp=" + req);
getQuorum().assertLeader(barrierState.token);
if (barrierState.newCommitCounter != req.getNewCommitCounter()) {
/*
* Response is for the wrong GATHER request.
*/
throw new RuntimeException(
"Wrong newCommitCounter: expected="
+ barrierState.newCommitCounter
+ ", actual=" + req.getNewCommitCounter());
}
if (barrierState.newCommitTime != req.getNewCommitTime()) {
/*
* Response is for the wrong GATHER request.
*/
throw new RuntimeException("Wrong newCommitTime: expected="
+ barrierState.newCommitTime + ", actual="
+ req.getNewCommitTime());
}
// ServiceId of the follower (NPE if req is null).
final UUID followerId = req.getServiceUUID();
// Make a note of the message from this follower.
barrierState.followerResponses.put(followerId, req);
} catch(RuntimeException e) {
/*
* Note: The try {} block can throw RuntimeException but not
* Exception. If anything is thrown, then reset the barrier and
* rethrow the exception.
*/
haLog.error(e, e);
// Reset the barrier (barrier will break).
barrierState.barrier.reset();
// Rethrow the exception.
throw new RuntimeException(e);
} finally {
/*
* Block until barrier breaks.
*
* Note: The barrier will break immediately if it was reset in
* the catch{} block above.
*/
try {
if (haLog.isInfoEnabled()) {
haLog.info("Awaiting barrier: #followerResponses="
+ barrierState.followerResponses.size() + ", #parties="
+ barrierState.barrier.getParties()
+ ", #joinedUUIDs="
+ barrierState.joinedServiceIds.length);
}
} finally {
/*
* Follower blocks on Thread on the leader here.
*
* Note: This will throw InterruptedException -or-
* BrokenBarrierException if the barrier is reset().
*/
barrierState.barrier.await();
}
}
/*
* Check for an error in the consensus protocol.
*/
final Throwable t = barrierState.cause;
if (t != null) {
/*
* Log error.
*/
haLog.error(t, t);
// rethrow cause.
throw new RuntimeException(t);
}
// Return the consensus.
final IHANotifyReleaseTimeResponse resp = barrierState.consensus;
if (resp == null) {
/*
* Log error, but return anyway.
*/
haLog.error("No consensus");
}
return resp;
}
/**
* Helper method returns the {@link HAStatusEnum} -or- null
* if this is not HA or if the {@link Quorum} is not running. This is a
* low latency local method call. The code path is against the
* local (non-remote) HAGlue object. It is NOT an RMI.
*
* @return The {@link HAStatusEnum} or null.
*/
private final HAStatusEnum getHAStatus() {
// Quorum iff HA.
final Quorum<HAGlue, QuorumService<HAGlue>> quorum = getQuorum();
if(quorum == null) {
// Not HA.
return null;
}
// Note: This is the local service interface.
final HAGlue localService;
try {
localService = quorum.getClient().getService();
} catch (IllegalStateException ex) {
/*
* Quorum client is not running (not started or terminated).
*/
return null;
}
// Note: Invocation against local HAGlue object (NOT RMI).
try {
return localService.getHAStatus();
} catch (IOException ex) {
// Note: Exception is never thrown (not RMI).
throw new RuntimeException(ex);
}
}
@Override
public long newTx(final long timestamp) {
if (TimestampUtility.isReadWriteTx(timestamp)) {
// The caller has provided a TxId, not a timestamp.
throw new IllegalArgumentException();
}
// The HAStatusEnum -or- null if not HA.
final HAStatusEnum haStatus = getHAStatus();
if (haStatus == null) {
// Not HA.
return _newTx(timestamp);
}
if (haStatus == HAStatusEnum.NotReady) {
// Not ready.
throw new QuorumException();
}
if (timestamp == ITx.UNISOLATED && haStatus != HAStatusEnum.Leader) {
// Read/Write Tx starts are only allowed on the Leader.
throw new QuorumException("Not quorum leader");
}
if (timestamp == ITx.UNISOLATED || timestamp == ITx.READ_COMMITTED) {
/*
* A read-write tx reads on the current commit point.
*
* A read-committed tx reads on the current commit point.
*
* The current commit point is always visible, so these requests
* are non-blocking.
*
* Note: We have verified that this service is the quorum leader
* above if the request is for a read-write tx.
*/
return _newTx(timestamp);
}
/*
* The request is a read-only tx against some specific historical
* commit point. It will be allowed (without blocking at the
* barrier) if the commit point is known to be pinned based on
* either the minReleaseAge or the earliestActiveTx. We use the
* AbstractTransactionService's lock to make these inspections
* atomic.
*/
lock.lock(); // Note: AbstractTransactionService.lock
try {
final long now = nextTimestamp();
final long minReleaseAge = getMinReleaseAge();
final long ageOfTxView = now - timestamp;
if (ageOfTxView < minReleaseAge) {
// Start tx. Commit point pinned by minReleaseAge.
return _newTx(timestamp);
}
/*
* Handle commit point pinned by earliestActiveTx's
* readsOnCommitTime.
*/
{
final TxState state = getEarliestActiveTx();
if (state != null && state.getReadsOnCommitTime() <= timestamp) {
// Start Tx. Commit point pinned by earliestActiveTx.
return _newTx(timestamp);
}
}
final IRootBlockView rootBlock = getRootBlockView();
if (rootBlock.getCommitCounter() == 0L) {
// Start Tx. No commits so nothing could be released.
return _newTx(timestamp);
}
if (rootBlock.getLastCommitTime() <= timestamp) {
// Tx reads on most recent commit point.
return _newTx(timestamp);
}
} finally {
lock.unlock();
}
/*
* Must block at barrier.
*/
barrierLock.lock();
try {
if (log.isInfoEnabled())
log.info("NewTx with barrierLock");
return _newTx(timestamp);
} finally {
barrierLock.unlock();
}
}
/**
* Core impl.
*
* This code pre-increments the active transaction count within the
* RWStore before requesting a new transaction from the transaction
* service. This ensures that the RWStore does not falsely believe
* that there are no open transactions during the call to
* AbstractTransactionService#newTx().
*
* Note: This code was moved into the inner class extending the
* {@link JournalTransactionService} in order to ensure that we
* follow this pre-incremental pattern for an {@link HAJournal} as
* well.
*
* @see BTree can not be cast to Name2Addr
* @see Journal HA
*/
private final long _newTx(final long timestamp) {
IRawTx tx = null;
try {
final IBufferStrategy bufferStrategy = getBufferStrategy();
if (bufferStrategy instanceof IHistoryManager) {
// pre-increment the active tx count.
tx = ((IHistoryManager) bufferStrategy).newTx();
}
return super.newTx(timestamp);
} finally {
if (tx != null) {
/*
* If we had pre-incremented the transaction counter in
* the RWStore, then we decrement it before leaving this
* method.
*/
tx.close();
}
}
}
@Override
public long commit(final long tx) {
final TxState state = getTxState(tx);
final Quorum<HAGlue, QuorumService<HAGlue>> quorum = getQuorum();
if (quorum != null && state != null && !state.isReadOnly()) {
/*
* Commit on write transaction. We must be the quorum leader.
*/
final long token = getQuorumToken();
getQuorum().assertLeader(token);
}
return super.commit(tx);
}
@Override
protected void activateTx(final TxState state) {
if (txLog.isInfoEnabled())
txLog.info("OPEN : txId=" + state.tx
+ ", readsOnCommitTime=" + state.getReadsOnCommitTime());
final IBufferStrategy bufferStrategy = Journal.this.getBufferStrategy();
if (bufferStrategy instanceof IHistoryManager) {
final IRawTx tx = ((IHistoryManager)bufferStrategy).newTx();
if (m_rawTxs.put(state.tx, tx) != null) {
throw new IllegalStateException(
"Unexpected existing RawTx");
}
}
super.activateTx(state);
}
@Override
protected void deactivateTx(final TxState state) {
if (txLog.isInfoEnabled())
txLog.info("CLOSE: txId=" + state.tx
+ ", readsOnCommitTime=" + state.getReadsOnCommitTime());
/*
* Note: We need to deactivate the tx before RawTx.close() is
* invoked otherwise the activeTxCount will never be zero inside
* of RawTx.close() and the session protection mode of the
* RWStore will never be able to release storage.
*/
super.deactivateTx(state);
final IRawTx tx = m_rawTxs.remove(state.tx);
if (tx != null) {
tx.close();
}
}
/**
* Extended to cancel any running or queued tasks on the {@link WriteExecutorService}.
*
* @see HA doLocalAbort() should interrupt NSS requests and AbstractTasks
*/
@Override
public void abortAllTx() {
super.abortAllTx();
concurrencyManager.abortAllTx();
}
} // class InnerJournalTransactionService
protected JournalTransactionService newTransactionService() {
final JournalTransactionService abstractTransactionService = new InnerJournalTransactionService();
return abstractTransactionService;
}
protected AbstractLocalTransactionManager newLocalTransactionManager() {
final JournalTransactionService abstractTransactionService = newTransactionService();
abstractTransactionService.start();
return new AbstractLocalTransactionManager() {
@Override
public AbstractTransactionService getTransactionService() {
return abstractTransactionService;
}
/**
* Extended to shutdown the embedded transaction service.
*/
@Override
public void shutdown() {
((JournalTransactionService) getTransactionService())
.shutdown();
super.shutdown();
}
/**
* Extended to shutdown the embedded transaction service.
*/
@Override
public void shutdownNow() {
((JournalTransactionService) getTransactionService())
.shutdownNow();
super.shutdownNow();
}
};
}
@Override
public AbstractLocalTransactionManager getLocalTransactionManager() {
return localTransactionManager;
}
@Override
public boolean isGroupCommit() {
return isGroupCommit;
}
/**
* Interface defines and documents the counters and counter namespaces
* reported by the {@link Journal} and the various services which it uses.
*
* @author Bryan Thompson
*/
public static interface IJournalCounters extends
ConcurrencyManager.IConcurrencyManagerCounters,
// ...TransactionManager.XXXCounters,
ResourceManager.IResourceManagerCounters
{
/**
* The namespace for the counters pertaining to the {@link ConcurrencyManager}.
*/
String concurrencyManager = "Concurrency Manager";
/**
* The namespace for the counters pertaining to the named indices.
*/
String indexManager = "Index Manager";
/**
* The namespace for the counters pertaining to the {@link ILocalTransactionService}.
*/
String transactionManager = "Transaction Manager";
/**
* The namespace for counters pertaining to the
* {@link Journal#getExecutorService()}.
*/
String executorService = "Executor Service";
/**
* Performance counters for the query engine associated with this
* journal (if any).
*/
String queryEngine = "Query Engine";
}
/**
* {@inheritDoc}
*
* Overridden to attach additional performance counters.
*/
@Override
public CounterSet getCounters() {
final CounterSet root = new CounterSet();
// Host wide performance counters (collected from the OS).
{
final AbstractStatisticsCollector t = getPlatformStatisticsCollector();
if (t != null) {
root.attach(t.getCounters());
}
}
// JVM wide performance counters.
{
final CounterSet tmp = root.makePath("JVM");
tmp.attach(AbstractStatisticsCollector.getMemoryCounterSet());
}
// Journal performance counters.
{
final CounterSet tmp = root.makePath("Journal");
tmp.attach(super.getCounters());
// Live index counters iff available.
{
final CounterSet indexCounters = getIndexCounters();
if (indexCounters != null) {
tmp.makePath(IJournalCounters.indexManager).attach(
indexCounters);
}
}
tmp.makePath(IJournalCounters.concurrencyManager)
.attach(concurrencyManager.getCounters());
tmp.makePath(IJournalCounters.transactionManager)
.attach(localTransactionManager.getCounters());
{
final IPlugIn<Journal, ThreadPoolExecutorBaseStatisticsTask> plugin = pluginQueueStats
.get();
if (plugin != null) {
final ThreadPoolExecutorBaseStatisticsTask t = plugin
.getService();
if (t != null) {
tmp.makePath(IJournalCounters.executorService).attach(
t.getCounters());
}
}
}
}
// Lookup an existing query engine, but do not cause one to be created.
final QueryEngine queryEngine = QueryEngineFactory.getInstance()
.getExistingQueryController(this);
if (queryEngine != null) {
final CounterSet tmp = root.makePath(IJournalCounters.queryEngine);
tmp.attach(queryEngine.getCounters());
}
return root;
}
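/*
 * Usage sketch (illustrative only, not part of the original class): an
 * application holding an open Journal reference [jnl] (an assumed
 * variable name) could render the attached performance counters for
 * ad-hoc inspection; toString() gives a textual rendering of the
 * counter hierarchy rooted at the returned CounterSet.
 *
 *   final CounterSet counters = jnl.getCounters();
 *   System.out.println(counters.toString());
 */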
/*
* IResourceManager
*/
@Override
public File getTmpDir() {
return tmpDir;
}
/**
* The directory in which the journal's file is located -or- null if
* the journal is not backed by a file.
*/
@Override
public File getDataDir() {
final File file = getFile();
if (file == null) {
return null;
}
return file.getParentFile();
}
/**
* Note: This will only succeed if the uuid identifies this
* journal.
*/
public IRawStore openStore(final UUID uuid) {
if(uuid == getRootBlockView().getUUID()) {
return this;
}
throw new UnsupportedOperationException();
}
/**
* Always returns an array containing a single {@link BTree} which is the
* {@link BTree} loaded from the commit record whose commit timestamp is
* less than or equal to timestamp -or- null if there
* are no {@link ICommitRecord}s that satisfy the probe or if the named
* index was not registered as of that timestamp.
*
* @param name
* @param timestamp
*
* @throws UnsupportedOperationException
* If the timestamp is {@link ITx#READ_COMMITTED}. You
* MUST use {@link #getIndex(String, long)} in order to obtain a
* view that has {@link ITx#READ_COMMITTED} semantics.
*/
public AbstractBTree[] getIndexSources(final String name,
final long timestamp) {
final BTree btree;
if (timestamp == ITx.UNISOLATED) {
/*
* Unisolated operation on the live index.
*/
// MAY be null.
btree = getIndex(name);
} else if (timestamp == ITx.READ_COMMITTED) {
/*
* BTree does not know how to update its view with intervening
* commits. Further, for a variety of reasons including the
* synchronization problems that would be imposed, there are no
* plans for BTree to be able to provide read-committed semantics.
* Instead a ReadCommittedView is returned by
* getIndex(name,timestamp) when ITx#READ_COMMITTED is requested and
* this method is not invoked.
*/
throw new UnsupportedOperationException("Read-committed view");
// /*
// * Read committed operation against the most recent commit point.
// *
// * Note: This commit record is always defined, but that does not
// * mean that any indices have been registered.
// */
//
// final ICommitRecord commitRecord = getCommitRecord();
//
// final long ts = commitRecord.getTimestamp();
//
// if (ts == 0L) {
//
// log.warn("Nothing committed: name="+name+" - read-committed operation.");
//
// return null;
//
// }
//
// // MAY be null.
// btree = getIndex(name, commitRecord);
//
// if (btree != null) {
//
//// /*
//// * Mark the B+Tree as read-only.
//// */
////
//// btree.setReadOnly(true);
//
// assert ((BTree) btree).getLastCommitTime() != 0;
//// btree.setLastCommitTime(commitRecord.getTimestamp());
//
// }
} else {
/*
* A specified historical index commit point.
*
* @see Add cache for access to historical index views on the Journal
* by name and commitTime.
*/
final long ts = Math.abs(timestamp);
// final ICommitRecord commitRecord = getCommitRecord(ts);
//
// if (commitRecord == null) {
//
// log.warn("No commit record: name=" + name + ", timestamp=" + ts);
//
// return null;
//
// }
//
// // MAY be null
// btree = getIndex(name, commitRecord);
// MAY be null
btree = (BTree) super.getIndex(name, ts);
if (btree != null) {
// /*
// * Mark the B+Tree as read-only.
// */
//
// btree.setReadOnly(true);
assert btree.getLastCommitTime() != 0;
// btree.setLastCommitTime(commitRecord.getTimestamp());
}
}
/*
* No such index as of that timestamp.
*/
if (btree == null) {
if (log.isInfoEnabled())
log.info("No such index: name=" + name + ", timestamp="
+ timestamp);
return null;
}
return new AbstractBTree[] {
btree
};
}
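/*
 * Usage sketch (illustrative only): probing for a historical view of a
 * named index. The journal reference [jnl], the index name and the
 * commit time [commitTime] are assumptions for the example.
 *
 *   final AbstractBTree[] sources = jnl.getIndexSources("myIndex", commitTime);
 *   if (sources == null) {
 *       // Either no commit record satisfies the probe or the index was
 *       // not registered as of that commit time.
 *   } else {
 *       final BTree historicalView = (BTree) sources[0]; // read-only view
 *   }
 */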
/**
* Always returns this.
*/
final public AbstractJournal getLiveJournal() {
return this;
}
/**
* Always returns this.
*/
final public AbstractJournal getJournal(final long timestamp) {
return this;
}
/**
* Compacts the named indices found on this journal as of the most recent
* commit point, writing their view onto a new Journal. This method MAY be
* used concurrently with the {@link Journal} but writes after the selected
* commit point WILL NOT be reflected in the output file. Typical uses are
* to reduce the space required by the backing store, to improve locality in
* the backing store, and to make a backup of the most recent commit point.
*
* @param outFile
* The file on which the new journal will be created.
*
* @return The {@link Future} on which you must {@link Future#get() wait}
* for the {@link CompactTask} to complete. The already open journal
* is accessible using {@link Future#get()}. If you are backing up
* data, then be sure to shutdown the returned {@link Journal} so
* that it can release its resources.
*/
public Future<Journal> compact(final File outFile) {
return executorService.submit(new CompactTask(this, outFile,
getLastCommitTime()));
}
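/*
 * Usage sketch (illustrative only): compacting the most recent commit
 * point onto a new file and waiting for the task to complete. The
 * journal reference [jnl] and the output file name are assumptions for
 * the example; exception handling for Future#get() is elided.
 *
 *   final Future<Journal> f = jnl.compact(new File("compact.jnl"));
 *   final Journal compacted = f.get(); // blocks until the CompactTask is done
 *   compacted.shutdown(); // release resources if this was just a backup
 */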
/**
* Submit a task that will take a snapshot of the journal and return the
* {@link Future} for that task. The snapshot is taken on a temporary file.
* Iff the snapshot is successful, the temporary file is renamed to the
* application-determined file. Thus all snapshots are either valid or
* were not written. A snapshot of an empty journal is not permitted. Also,
* the backing store MUST implement the {@link IHABufferStrategy}.
*
* Note: This method supports application controlled snapshots and is
* primarily intended for non-HA deployments. HA has an integrated snapshot
* and transaction log mechanism which is preferred in HA deployments and
* also provides the ability for an application to take snapshots on demand.
*
* @param snapshotFactory
* The factory that will provide the name of the file on which the
* snapshot will be written.
*
* @return The {@link Future} for the snapshot.
*
* @throws UnsupportedOperationException
* if the backing store does not implement the
* {@link IHABufferStrategy} interface.
*
* @see Online backup for Journal
* @since 1.5.2
*/
public Future snapshot(
final ISnapshotFactory snapshotFactory) {
if (!(getBufferStrategy() instanceof IHABufferStrategy)) {
throw new UnsupportedOperationException();
}
return executorService.submit(new SnapshotTask(this, snapshotFactory));
}
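/*
 * Usage sketch (illustrative only): requesting an application-controlled
 * snapshot. The journal reference [jnl] and the ISnapshotFactory
 * instance [mySnapshotFactory] are assumptions for the example; the
 * factory implementation (which names the target file) is not shown.
 *
 *   final Future<?> f = jnl.snapshot(mySnapshotFactory);
 *   f.get(); // waits until the temporary file has been renamed into place
 */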
@Override
public void dropIndex(final String name) {
final BTreeCounters btreeCounters = getIndexCounters(name);
super.dropIndex(name);
if (btreeCounters != null) {
// Conditionally remove the counters for the old index.
indexCounters.remove(name, btreeCounters);
}
}
/**
* {@inheritDoc}
*
* Note: {@link ITx#READ_COMMITTED} views are given read-committed semantics
* using a {@link ReadCommittedView}. This means that they can be cached
* since the view will update automatically as commits are made against
* the {@link Journal}.
*
* @see IndexManager#getIndex(String, long)
*/
@Override
public ILocalBTreeView getIndex(final String name, final long timestamp) {
if (name == null) {
throw new IllegalArgumentException();
}
final boolean isReadWriteTx = TimestampUtility.isReadWriteTx(timestamp);
// final Tx tx = (Tx) (isReadWriteTx ? getConcurrencyManager()
// .getTransactionManager().getTx(timestamp) : null);
final Tx tx = (Tx) /*getConcurrencyManager().*/getTransactionManager()
.getTx(timestamp);
if (isReadWriteTx) {
if (tx == null) {
log.warn("Unknown transaction: name=" + name + ", tx="
+ timestamp);
return null;
}
tx.lock.lock();
try {
if (!tx.isActive()) {
// typically this means that the transaction has already
// prepared.
log.warn("Transaction not active: name=" + name + ", tx="
+ timestamp + ", prepared=" + tx.isPrepared()
+ ", complete=" + tx.isComplete() + ", aborted="
+ tx.isAborted());
return null;
}
} finally {
tx.lock.unlock();
}
}
if( isReadWriteTx && tx == null ) {
/*
* Note: This will happen both if you attempt to use a transaction
* identified that has not been registered or if you attempt to use
* a transaction manager after the transaction has been either
* committed or aborted.
*/
log.warn("No such transaction: name=" + name + ", tx=" + timestamp);
return null;
}
final boolean readOnly = TimestampUtility.isReadOnly(timestamp);
// final boolean readOnly = (timestamp < ITx.UNISOLATED)
// || (isReadWriteTx && tx.isReadOnly());
final ILocalBTreeView tmp;
if (isReadWriteTx) {
/*
* Isolated operation.
*
* Note: The backing index is always a historical state of the named
* index.
*
* Note: Tx.getIndex() will pass through the actual commit time of
* the ground state against which the transaction is reading (if it
* is available, which it is on the local Journal).
*
* @see Refactor native long tx id to thin object
*/
final ILocalBTreeView isolatedIndex = tx.getIndex(name);
if (isolatedIndex == null) {
log.warn("No such index: name=" + name + ", tx=" + timestamp);
return null;
}
tmp = isolatedIndex;
} else {
/*
* Non-transactional view.
*/
if (readOnly) {
if (timestamp == ITx.READ_COMMITTED) {
// read-committed
tmp = new ReadCommittedView(this, name);
} else {
if (tx != null) {
/*
* read-only transaction
*
* @see Add cache for access to historical index views on
* the Journal by name and commitTime.
*/
final AbstractBTree[] sources = getIndexSources(name,
tx.getReadsOnCommitTime());
if (sources == null) {
log.warn("No such index: name=" + name
+ ", timestamp=" + timestamp);
return null;
}
assert sources[0].isReadOnly();
tmp = (BTree) sources[0];
} else {
// historical read not protected by a transaction
final AbstractBTree[] sources = getIndexSources(name,
timestamp);
if (sources == null) {
log.warn("No such index: name=" + name
+ ", timestamp=" + timestamp);
return null;
}
assert sources[0].isReadOnly();
tmp = (BTree) sources[0];
}
}
} else {
/*
* Writable unisolated index.
*
* Note: This is the "live" mutable index. This index is NOT
* thread-safe. A lock manager is used to ensure that at most
* one task has access to this index at a time.
*/
assert timestamp == ITx.UNISOLATED;
final AbstractBTree[] sources = getIndexSources(name, ITx.UNISOLATED);
if (sources == null) {
if (log.isInfoEnabled())
log.info("No such index: name="+name+", timestamp="+timestamp);
return null;
}
assert ! sources[0].isReadOnly();
tmp = (BTree) sources[0];
}
}
/*
* Make sure that it is using the canonical counters for that index.
*
* Note: AbstractTask also does this for UNISOLATED indices which it
* loads by itself as part of providing ACID semantics for add/drop
* of indices.
*/
tmp.getMutableBTree().setBTreeCounters(getIndexCounters(name));
return tmp;
}
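/*
 * Usage sketch (illustrative only): obtaining different views of a named
 * index. The journal reference [jnl] and the index name are assumptions
 * for the example.
 *
 *   // Read-committed view: may be cached, tracks new commits automatically.
 *   final ILocalBTreeView readCommitted = jnl.getIndex("myIndex", ITx.READ_COMMITTED);
 *
 *   // Live unisolated view: mutable and NOT thread-safe; normally accessed
 *   // through tasks submitted to the concurrency manager.
 *   final ILocalBTreeView unisolated = jnl.getIndex("myIndex", ITx.UNISOLATED);
 */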
/**
* Always returns the {@link BTree} as the sole element of the array since
* partitioned indices are not supported.
*/
@Override
public AbstractBTree[] getIndexSources(final String name,
final long timestamp, final BTree btree) {
return new AbstractBTree[] { btree };
}
/**
* Create a new transaction on the {@link Journal}.
*
* Note: This is a convenience method. The implementation of this method is
* delegated to the object returned by {@link #getTransactionService()}.
*
* @param timestamp
* A positive timestamp for a historical read-only transaction as
* of the first commit point LTE the given timestamp,
* {@link ITx#READ_COMMITTED} for a historical read-only
* transaction as of the most current commit point on the
* {@link Journal} as of the moment that the transaction is
* created, or {@link ITx#UNISOLATED} for a read-write
* transaction.
*
* @return The transaction identifier.
*
* @see ITransactionService#newTx(long)
*/
final public long newTx(final long timestamp) {
/*
* Note: The RWStore native tx pre-increment logic is now handled by
* _newTx() in the inner class that extends JournalTransactionService.
*/
try {
return getTransactionService().newTx(timestamp);
} catch (IOException ioe) {
/*
* Note: IOException is declared for RMI but will not be thrown
* since the transaction service is in fact local.
*/
throw new RuntimeException(ioe);
}
}
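/*
 * Usage sketch (illustrative only): a minimal transaction lifecycle using
 * the convenience methods on this class. The journal reference [jnl] and
 * the index name are assumptions for the example.
 *
 *   // Read-only tx against the most recent commit point.
 *   final long readTx = jnl.newTx(ITx.READ_COMMITTED);
 *   try {
 *       final ILocalBTreeView view = jnl.getIndex("myIndex", readTx);
 *       // ... read against [view] ...
 *   } finally {
 *       jnl.abort(readTx); // releases the read lock held by the tx
 *   }
 *
 *   // Read-write tx: isolated writes are validated and merged at commit.
 *   final long writeTx = jnl.newTx(ITx.UNISOLATED);
 *   // ... isolated writes ...
 *   jnl.commit(writeTx);
 */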
/**
* Abort a transaction.
*
* Note: This is a convenience method. The implementation of this method is
* delegated to the object returned by {@link #getTransactionService()}.
*
* @param tx
* The transaction identifier.
*
* @see ITransactionService#abort(long)
*/
final public void abort(final long tx) {
try {
/*
* Note: TransactionService will make call back to the
* localTransactionManager to handle the client side of the
* protocol.
*/
getTransactionService().abort(tx);
} catch (IOException e) {
/*
* Note: IOException is declared for RMI but will not be thrown
* since the transaction service is in fact local.
*/
throw new RuntimeException(e);
}
}
/**
* Commit a transaction.
*
* Note: This is a convenience method. The implementation of this method is
* delegated to the object returned by {@link #getTransactionService()}.
*
* @param tx
* The transaction identifier.
*
* @return The commit time assigned to that transaction.
*
* @see ITransactionService#commit(long)
*/
final public long commit(final long tx) throws ValidationError {
try {
/*
* Note: TransactionService will make call back to the
* localTransactionManager to handle the client side of the
* protocol.
*/
return getTransactionService().commit(tx);
} catch (IOException e) {
/*
* Note: IOException is declared for RMI but will not be thrown
* since the transaction service is in fact local.
*/
throw new RuntimeException(e);
}
}
/**
* Validate the write set for a transaction. This operation is not required.
* Validation will be performed during commit processing for a transaction
* regardless.
*
* @param txId
* The transaction identifier.
*
* @return true iff the write set of the transaction could be
* validated.
*
* @throws TransactionNotFoundException
* if no such transaction exists.
*/
final public boolean prepare(final long txId) {
final Tx localState = getLocalTransactionManager().getTx(txId);
if (localState == null)
throw new TransactionNotFoundException(txId);
if (localState.isReadOnly()) {
// Trivially validated.
return true;
}
try {
final AbstractTask task = new ValidateWriteSetTask(
concurrencyManager, getLocalTransactionManager(), localState);
/*
* Submit the task and wait for the result.
*
* Note: This task MUST go through the ConcurrencyManager to obtain its
* locks.
*/
final boolean ok = concurrencyManager.submit(task).get();
return ok;
} catch (Exception ex) {
throw new RuntimeException(ex);
}
}
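/*
 * Usage sketch (illustrative only): optional early validation of a
 * read-write tx. Validation is performed again during commit processing
 * regardless, so this call is purely advisory. [jnl] is an assumed
 * journal reference.
 *
 *   final long tx = jnl.newTx(ITx.UNISOLATED);
 *   // ... isolated writes ...
 *   if (jnl.prepare(tx)) {
 *       jnl.commit(tx);
 *   } else {
 *       jnl.abort(tx);
 *   }
 */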
// /**
// * @deprecated This method in particular should be hidden from the
// * {@link Journal} as it exposes the {@link ITx} which really
// * deals with the client-side state of a transaction and which
// * should not be visible to applications - they should just use
// * the [long] transaction identifier.
// */
// public ITx getTx(long startTime) {
//
// return localTransactionManager.getTx(startTime);
//
// }
/**
* Returns the next timestamp from the {@link ILocalTransactionManager}.
*
* Note: This is a convenience method. The implementation of this method is
* delegated to the object returned by {@link #getTransactionService()}.
*
* @deprecated This is here for historical reasons and is only used by the
* test suite. Use {@link #getLocalTransactionManager()} and
* {@link ITransactionService#nextTimestamp()}.
*
* @see ITransactionService#nextTimestamp()
*/
final public long nextTimestamp() {
return localTransactionManager.nextTimestamp();
}
/*
* IConcurrencyManager
*/
public ConcurrencyManager getConcurrencyManager() {
return concurrencyManager;
}
/**
* Note: The transaction service is shutdown first, then the
* {@link #executorService}, then the {@link IConcurrencyManager}, the
* {@link ITransactionService} and finally the {@link IResourceLockService}.
*/
@Override
synchronized public void shutdown() {
if (!isOpen())
return;
/*
* Shutdown the transaction service. This will not permit new
* transactions to start and will wait until running transactions either
* commit or abort.
*/
localTransactionManager.shutdown();
{
final IPlugIn<?, ?> plugIn = pluginGanglia.get();
if (plugIn != null) {
// stop if running.
plugIn.stopService(false/* immediateShutdown */);
}
}
{
final IPlugIn<?, ?> plugIn = pluginQueueStats.get();
if (plugIn != null) {
// stop if running.
plugIn.stopService(false/* immediateShutdown */);
}
}
{
final IPlugIn<?, ?> plugIn = pluginPlatformStats.get();
if (plugIn != null) {
// stop if running.
plugIn.stopService(false/* immediateShutdown */);
}
}
if (scheduledExecutorService != null) {
scheduledExecutorService.shutdown();
}
// optional httpd service for the local counters.
{
final IPlugIn<?, ?> plugIn = pluginHttpd.get();
if (plugIn != null) {
// stop if running.
plugIn.stopService(false/* immediateShutdown */);
}
}
/*
* Shutdown the executor service. This will wait for any tasks being run
* on that service by the application to complete.
*/
try {
new ShutdownHelper(executorService, 1000/* logTimeout */,
TimeUnit.MILLISECONDS) {
@Override
protected void logTimeout() {
log.warn("Waiting on task(s)"
+ ": elapsed="
+ TimeUnit.NANOSECONDS.toMillis(elapsed())
+ "ms, #active="
+ ((ThreadPoolExecutor) executorService)
.getActiveCount());
}
};
} catch (InterruptedException ex) {
log.warn("Immediate shutdown: "+ex);
// convert to immediate shutdown.
shutdownNow();
return;
}
/*
* Shutdown the concurrency manager - this will allow existing
* non-transactional operations to complete but prevent additional
* operations from starting.
*/
concurrencyManager.shutdown();
super.shutdown();
}
/**
* Note: The {@link IConcurrencyManager} is shutdown first, then the
* {@link ITransactionService} and finally the {@link IResourceManager}.
*/
@Override
synchronized public void shutdownNow() {
if (!isOpen())
return;
/*
* Note: The ganglia plug in is executed on the main thread pool. We
* need to terminate it in order for the thread pool to shutdown.
*/
{
final IPlugIn<?, ?> plugIn = pluginGanglia.get();
if (plugIn != null) {
// stop if running.
plugIn.stopService(true/* immediateShutdown */);
}
}
{
final IPlugIn<?, ?> plugIn = pluginQueueStats.get();
if (plugIn != null) {
// stop if running.
plugIn.stopService(true/* immediateShutdown */);
}
}
{
final IPlugIn<?, ?> plugIn = pluginPlatformStats.get();
if (plugIn != null) {
// stop if running.
plugIn.stopService(true/* immediateShutdown */);
}
}
if (scheduledExecutorService != null)
scheduledExecutorService.shutdownNow();
// optional httpd service for the local counters.
{
final IPlugIn<?, ?> plugIn = pluginHttpd.get();
if (plugIn != null) {
// stop if running.
plugIn.stopService(false/* immediateShutdown */);
}
}
// Note: can be null if error in ctor.
if (executorService != null)
executorService.shutdownNow();
// Note: can be null if error in ctor.
if (concurrencyManager != null)
concurrencyManager.shutdownNow();
// Note: can be null if error in ctor.
if (localTransactionManager != null)
localTransactionManager.shutdownNow();
super.shutdownNow();
}
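/*
 * Usage sketch (illustrative only): orderly vs. immediate termination of
 * an open Journal [jnl] (assumed variable name).
 *
 *   jnl.shutdown();    // waits for running transactions and tasks to complete
 *   // -or-
 *   jnl.shutdownNow(); // cancels running/queued work and terminates services
 */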
// public void deleteResources() {
//
// super.deleteResources();
//
// // Note: can be null if error in ctor.
// if (tempStoreFactory != null)
// tempStoreFactory.closeAll();
//
// }
/**
* {@inheritDoc}
*
* Overridden to close the {@link TemporaryStoreFactory}.
*/
@Override
protected void _close() {
super._close();
// Note: can be null if error in ctor.
if (tempStoreFactory != null)
tempStoreFactory.closeAll();
}
@Override
public <T> FutureTask<T> submit(AbstractTask<T> task) {
return concurrencyManager.submit(task);
}
@Override
public <T> List<Future<T>> invokeAll(
final Collection<? extends AbstractTask<T>> tasks,
final long timeout, final TimeUnit unit)
throws InterruptedException {
return concurrencyManager.invokeAll(tasks, timeout, unit);
}
@Override
public <T> List<Future<T>> invokeAll(
Collection<? extends AbstractTask<T>> tasks)
throws InterruptedException {
return concurrencyManager.invokeAll(tasks);
}
@Override
public IResourceManager getResourceManager() {
return concurrencyManager.getResourceManager();
}
@Override
public ILocalTransactionManager getTransactionManager() {
// return concurrencyManager.getTransactionManager();
return localTransactionManager;
}
public ITransactionService getTransactionService() {
// return getTransactionManager().getTransactionService();
return localTransactionManager.getTransactionService();
}
@Override
public WriteExecutorService getWriteService() {
return concurrencyManager.getWriteService();
}
/*
* IResourceManager
*/
/**
* Note: This implementation always returns false. As a
* consequence the journal capacity will simply be extended by
* {@link #write(ByteBuffer)} until the available disk space is exhausted.
*
* @return This implementation returns false since overflow
* is NOT supported.
*/
@Override
public boolean shouldOverflow() {
return false;
}
/**
* Note: This implementation always returns false.
*/
@Override
public boolean isOverflowEnabled() {
return false;
}
@Override
public Future