
/**
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
licenses@blazegraph.com
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
package com.bigdata.journal;
import java.io.File;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.Collection;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.UUID;
import java.util.concurrent.BrokenBarrierException;
import java.util.concurrent.Callable;
import java.util.concurrent.CancellationException;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.CyclicBarrier;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.FutureTask;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.Semaphore;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicReference;
import java.util.concurrent.locks.ReentrantLock;
import org.apache.log4j.Logger;
import cern.colt.Arrays;
import com.bigdata.bfs.BigdataFileSystem;
import com.bigdata.bfs.GlobalFileSystemHelper;
import com.bigdata.bop.engine.QueryEngine;
import com.bigdata.bop.fed.QueryEngineFactory;
import com.bigdata.btree.AbstractBTree;
import com.bigdata.btree.BTree;
import com.bigdata.btree.BTreeCounters;
import com.bigdata.btree.BaseIndexStats;
import com.bigdata.btree.ILocalBTreeView;
import com.bigdata.btree.IndexMetadata;
import com.bigdata.btree.IndexSegment;
import com.bigdata.btree.ReadCommittedView;
import com.bigdata.cache.HardReferenceQueue;
import com.bigdata.config.IntegerValidator;
import com.bigdata.config.LongValidator;
import com.bigdata.counters.AbstractStatisticsCollector;
import com.bigdata.counters.CounterSet;
import com.bigdata.counters.httpd.CounterSetHTTPD;
import com.bigdata.ha.HAGlue;
import com.bigdata.ha.HAStatusEnum;
import com.bigdata.ha.HATXSGlue;
import com.bigdata.ha.QuorumService;
import com.bigdata.ha.msg.HAGatherReleaseTimeRequest;
import com.bigdata.ha.msg.HANotifyReleaseTimeRequest;
import com.bigdata.ha.msg.HANotifyReleaseTimeResponse;
import com.bigdata.ha.msg.IHAGatherReleaseTimeRequest;
import com.bigdata.ha.msg.IHANotifyReleaseTimeRequest;
import com.bigdata.ha.msg.IHANotifyReleaseTimeResponse;
import com.bigdata.journal.JournalTransactionService.ValidateWriteSetTask;
import com.bigdata.quorum.Quorum;
import com.bigdata.quorum.QuorumException;
import com.bigdata.rawstore.IRawStore;
import com.bigdata.rdf.task.IApiTask;
import com.bigdata.relation.locator.DefaultResourceLocator;
import com.bigdata.relation.locator.ILocatableResource;
import com.bigdata.relation.locator.IResourceLocator;
import com.bigdata.resources.IndexManager;
import com.bigdata.resources.ResourceManager;
import com.bigdata.resources.StaleLocatorReason;
import com.bigdata.rwstore.IHistoryManager;
import com.bigdata.rwstore.IRawTx;
import com.bigdata.rwstore.RWStore;
import com.bigdata.service.AbstractTransactionService;
import com.bigdata.service.DataService;
import com.bigdata.service.IBigdataFederation;
import com.bigdata.sparse.GlobalRowStoreHelper;
import com.bigdata.sparse.SparseRowStore;
import com.bigdata.util.DaemonThreadFactory;
import com.bigdata.util.InnerCause;
import com.bigdata.util.concurrent.LatchedExecutor;
import com.bigdata.util.concurrent.ShutdownHelper;
import com.bigdata.util.concurrent.ThreadPoolExecutorBaseStatisticsTask;
/**
* Concrete implementation suitable for a local and unpartitioned database.
*
* Note: This implementation does NOT support partitioned indices. Because
* all data must reside on a single journal resource there is no point to a
* view. Views are designed to have data on a mixture of the live journal, one
* or more historical journals, and one or more {@link IndexSegment}s.
*
* @see ResourceManager, which supports views.
*/
public class Journal extends AbstractJournal implements IConcurrencyManager,
/*ILocalTransactionManager,*/ IResourceManager {
/**
* Logger.
*/
static final Logger log = Logger.getLogger(Journal.class);
/**
* @see http://sourceforge.net/apps/trac/bigdata/ticket/443 (Logger for
* RWStore transaction service and recycler)
*/
private static final Logger txLog = Logger.getLogger("com.bigdata.txLog");
/**
* Object used to manage local transactions.
*/
private final AbstractLocalTransactionManager localTransactionManager;
/**
* Object used to manage tasks executing against named indices.
*/
private final ConcurrencyManager concurrencyManager;
/**
* true iff the journal has been configured to use group commit.
*/
private final boolean isGroupCommit;
/**
* Options understood by the {@link Journal}.
*
* @author Bryan Thompson
*/
public interface Options extends com.bigdata.journal.Options,
com.bigdata.journal.ConcurrencyManager.Options,
com.bigdata.journal.TemporaryStoreFactory.Options,
com.bigdata.journal.QueueStatsPlugIn.Options,
com.bigdata.journal.PlatformStatsPlugIn.Options,
com.bigdata.journal.HttpPlugin.Options
// Note: Do not import. Forces bigdata-ganglia dependency.
// com.bigdata.journal.GangliaPlugIn.Options
{
/**
* The name of a boolean option that conditionally enables group commit
* semantics for the Journal based on task-oriented concurrent writers.
* This option indicates a contract that the application agrees to
* respect. When false, the application controls when the database goes
* through a commit point by invoking {@link Journal#commit()}. When
* true, ALL mutation operations MUST be submitted to the
* {@link IConcurrencyManager} of the journal. Note that the journal has
* always supported group commit - this is how it is used in scale-out.
* However, embedded applications have historically made the decision
* about when the database should commit.
* When you specify this option, you are asserting that your code will
* always submit tasks for evaluation using the
* {@link IConcurrencyManager}.
*
* There are several benefits of group commit.
*
* First, you can have multiple tenants in the same database instance and
* the updates for one tenant will no longer block the updates for the
* other tenants. Thus, one tenant can be safely running a long running
* update and other tenants can still enjoy low latency updates.
*
* Second, group commit automatically combines a sequence of updates on
* one (or more) tenant(s) into a single commit point on the disk. This
* provides higher potential throughput. It also means that it is no
* longer as important for applications to batch their updates since group
* commit will automatically perform some batching.
*
* Third, writes on independent indices may be independent (this behavior
* has been around for a long time).
*
* There are a few "gotchas" with the group commit support. This is
* because commits are decided by {@link IApiTask} or {@link AbstractTask}
* completion and tasks are scheduled by the concurrency manager, lock
* manager, and write executor service.
*
* - Mutation tasks that do not complete normally MUST throw an exception!
* - Applications MUST NOT call Journal.commit(). Instead, they submit
* an IApiTask using AbstractApiTask.submit(). The database will meld the
* write set of the task into a group commit sometime after the task
* completes successfully.
*
* @see #566 (NSS GROUP COMMIT).
*/
String GROUP_COMMIT = Journal.class.getName() + ".groupCommit";
String DEFAULT_GROUP_COMMIT = "false";
/**
* The capacity of the {@link HardReferenceQueue} backing the
* {@link IResourceLocator} maintained by the {@link Journal}. The
* capacity of this cache indirectly controls how many
* {@link ILocatableResource}s the {@link Journal} will hold open.
*
* The effect of this parameter is indirect owing to the semantics of
* weak references and the control of the JVM over when they are
* cleared. Once an {@link ILocatableResource} becomes weakly reachable,
* the JVM will eventually GC the object. Since objects which are
* strongly reachable are never cleared, this provides our guarantee
* that resources are never closed if they are in use.
*
* @see #DEFAULT_LOCATOR_CACHE_CAPACITY
*/
String LOCATOR_CACHE_CAPACITY = Journal.class.getName()
+ ".locatorCacheCapacity";
String DEFAULT_LOCATOR_CACHE_CAPACITY = "20";
/**
* The timeout in milliseconds for stale entries in the
* {@link IResourceLocator} cache -or- ZERO (0) to disable the timeout
* (default {@value #DEFAULT_LOCATOR_CACHE_TIMEOUT}). When this timeout
* expires, the reference for the entry in the backing
* {@link HardReferenceQueue} will be cleared. Note that the entry will
* remain in the {@link IResourceLocator} cache regardless as long as it
* is strongly reachable.
*/
String LOCATOR_CACHE_TIMEOUT = Journal.class.getName()
+ ".locatorCacheTimeout";
String DEFAULT_LOCATOR_CACHE_TIMEOUT = "" + (60 * 1000);
/**
* The #of threads that will be used to read on the local disk.
*
* @see Journal#getReadExecutor()
*/
String READ_POOL_SIZE = Journal.class.getName() + ".readPoolSize";
String DEFAULT_READ_POOL_SIZE = "0";
}
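/*
 * A minimal configuration sketch (illustrative only, not part of this
 * class): the option names above are plain property keys, so they can be
 * supplied through a java.util.Properties object when the Journal is
 * instantiated. The specific values shown are assumptions for the sake of
 * the example.
 *
 *   final Properties properties = new Properties();
 *   // Opt in to group commit (default is "false").
 *   properties.setProperty(Journal.Options.GROUP_COMMIT, "true");
 *   // Hold up to 50 locatable resources open via the locator cache.
 *   properties.setProperty(Journal.Options.LOCATOR_CACHE_CAPACITY, "50");
 *   // Use 4 threads to read on the local disk (0 disables the read pool).
 *   properties.setProperty(Journal.Options.READ_POOL_SIZE, "4");
 *   final Journal journal = new Journal(properties);
 */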
/**
* Create or re-open a journal.
*
* @param properties
* See {@link com.bigdata.journal.Options}.
*/
public Journal(final Properties properties) {
this(properties, null/* quorum */);
}
public Journal(final Properties properties,
final Quorum<HAGlue, QuorumService<HAGlue>> quorum) {
super(properties, quorum);
isGroupCommit = Boolean.parseBoolean(properties.getProperty(
Options.GROUP_COMMIT, Options.DEFAULT_GROUP_COMMIT));
tempStoreFactory = new TemporaryStoreFactory(properties);
executorService = (ThreadPoolExecutor) Executors
.newCachedThreadPool(new DaemonThreadFactory(getClass()
.getName()
+ ".executorService"));
// if (Boolean.valueOf(properties.getProperty(
// Options.COLLECT_QUEUE_STATISTICS,
// Options.DEFAULT_COLLECT_QUEUE_STATISTICS))) {
scheduledExecutorService = Executors
.newSingleThreadScheduledExecutor(new DaemonThreadFactory(
getClass().getName() + ".sampleService"));
// } else {
//
// scheduledExecutorService = null;
//
// }
{
final int readPoolSize = Integer.valueOf(properties.getProperty(
Options.READ_POOL_SIZE, Options.DEFAULT_READ_POOL_SIZE));
if (readPoolSize > 0) {
readService = new LatchedExecutor(executorService,
readPoolSize);
} else {
readService = null;
}
}
resourceLocator = newResourceLocator();
resourceLockManager = new ResourceLockService();
localTransactionManager = newLocalTransactionManager();
concurrencyManager = new ConcurrencyManager(properties,
localTransactionManager, this);
getExecutorService().execute(new StartDeferredTasksTask());
if (isGroupCommit() && !(this.isHAJournal())
&& getRootBlockView().getCommitCounter() == 0L) {
/*
* GROUP_COMMIT: See #566.
*
* This hacks in an initial commit to force the GRS to spring into
* existence. Maybe we should just do this for the initial create of
* the Journal when using group commit? Or maybe we should make the
* AbstractTask smart enough to merge two versions of the GRS - it
* should be simple enough to provide write-write conflict resolution
* for the GRS.
*
* Note: This needs to be done by the leader in HA, hence the
* conditional test above.
*/
getGlobalRowStore(); // Force the GRS index to exist.
commit(); // Commit.
}
}
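/*
 * Usage sketch (illustrative only): when GROUP_COMMIT is left at its
 * default of "false", the application decides when a commit point is
 * created by calling commit() directly, e.g. after registering an index.
 * The index name below is a hypothetical example.
 *
 *   final Journal journal = new Journal(properties);
 *   try {
 *       final IndexMetadata metadata =
 *               new IndexMetadata("exampleIndex", UUID.randomUUID());
 *       journal.registerIndex(metadata);
 *       journal.commit(); // make the registration restart-safe.
 *   } finally {
 *       journal.close();
 *   }
 */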
/**
* Ensure that the WORM mode of the journal always uses
* {@link Long#MAX_VALUE} for
* {@link AbstractTransactionService.Options#MIN_RELEASE_AGE}.
*
* @param properties
* The properties.
*
* @return The argument, with the minReleaseAge overridden if necessary.
*
* @see https://sourceforge.net/apps/trac/bigdata/ticket/391
*/
private Properties checkProperties(final Properties properties) {
if (getBufferStrategy() instanceof WORMStrategy) {
properties.setProperty(
AbstractTransactionService.Options.MIN_RELEASE_AGE, ""
+ Long.MAX_VALUE);
}
return properties;
}
/**
* Factory for the {@link IResourceLocator} for the {@link Journal}.
*/
@SuppressWarnings({ "unchecked", "rawtypes" })
protected IResourceLocator<?> newResourceLocator() {
final int cacheCapacity = getProperty(Options.LOCATOR_CACHE_CAPACITY,
Options.DEFAULT_LOCATOR_CACHE_CAPACITY,
IntegerValidator.GT_ZERO);
final long cacheTimeout = getProperty(Options.LOCATOR_CACHE_TIMEOUT,
Options.DEFAULT_LOCATOR_CACHE_TIMEOUT, LongValidator.GTE_ZERO);
return new DefaultResourceLocator(this, null/* delegate */,
cacheCapacity, cacheTimeout);
}
/**
* Inner class used to coordinate the distributed protocol for achieving an
* atomic consensus on the new releaseTime for the services joined
* with a met quorum.
*
* @author Bryan Thompson
*/
private class BarrierState implements Runnable {
/**
* The token that must remain valid.
*/
final private long token;
/**
* The commit counter that will be assigned to the commit point. This is
* used to ensure that the GATHER and PREPARE are for the same commit
* point and that the follower is at the previous commit point.
*/
final private long newCommitCounter;
/**
* The commit time that will be assigned to the commit point. This is
* used to ensure that the GATHER and PREPARE are for the same commit
* point and that the follower is at the previous commit point.
*/
final private long newCommitTime;
/**
* Local HA service implementation (non-Remote).
*/
final private QuorumService<HAGlue> quorumService;
/** The services joined with the met quorum, in their join order. */
final private UUID[] joinedServiceIds;
/**
* {@link CyclicBarrier} used to coordinate the protocol for achieving an
* atomic consensus on the new releaseTime for the services
* joined with a met quorum.
*
* Note: The {@link #barrier} provides visibility for the fields that are
* modified by {@link #run()} so we do not need additional locks or
* atomics for synchronizing these state updates.
*/
final private CyclicBarrier barrier;
// /**
// * The {@link Future} for the RMI to each follower that is joined with
// * the met quorum.
// */
// final private Map> futures = new HashMap>();
/**
* A timestamp taken on the leader when we start the protocol to
* discover the new releaseTime consensus.
*/
final private long timestampOnLeader;
/**
* The {@link UUID} of the quorum leader.
*/
final private UUID leaderId;
/**
* This is the earliest visible commit point on the leader.
*/
final private IHANotifyReleaseTimeRequest leadersValue;
/**
* Exception is set by {@link #run()} if there is a problem when the
* barrier breaks. The exception is then thrown out to the thread on
* the leader that is running commitNow(), forcing the commit to fail.
*/
volatile Throwable cause = null;
/**
* The message from each of those followers providing their local
* earliest visible commit point.
*
* Note: The {@link ConcurrentHashMap} does NOT allow null
* values. Further the {@link IHANotifyReleaseTimeRequest} specifies the
* serviceId of the follower. Therefore, a follower whose
* {@link GatherTask} fails MUST provide a "mock"
* {@link IHANotifyReleaseTimeRequest} that it will use to wait at the
* {@link CyclicBarrier}.
*
* @see InnerJournalTransactionService#notifyEarliestCommitTime(IHANotifyReleaseTimeRequest)
* @see GatherTask
*/
final private Map<UUID, IHANotifyReleaseTimeRequest> followerResponses = new ConcurrentHashMap<UUID, IHANotifyReleaseTimeRequest>();
/**
* The value from {@link #followerResponses} associated with the earliest commit
* point. This is the basis for the "consensus" across the services.
*/
private IHANotifyReleaseTimeRequest minimumResponse = null;
/**
* The consensus value. This is a restatement of the data in from the
* {@link #minimumResponse}. This is set by {@link #run()}.
*/
protected volatile IHANotifyReleaseTimeResponse consensus = null;
// private Quorum> getQuorum() {
//
// return Journal.this.getQuorum();
//
// }
private HATXSGlue getService(final UUID serviceId) {
return quorumService.getService(serviceId);
}
// /**
// * Cancel the requests on the remote services (RMI). This is a best effort
// * implementation. Any RMI related errors are trapped and ignored in order
// * to be robust to failures in RMI when we try to cancel the futures.
// */
// private , T> void cancelRemoteFutures(
// final F[] remoteFutures) {
//
// if (log.isInfoEnabled())
// log.info("");
//
// for (F rf : remoteFutures) {
//
// try {
//
// rf.cancel(true/* mayInterruptIfRunning */);
//
// } catch (Throwable t) {
//
// // ignored (to be robust).
//
// }
//
// }
//
// }
/** The services joined with the met quorum, in their join order. */
public BarrierState(final long newCommitCounter,
final long newCommitTime, final UUID[] joinedServiceIds) {
token = getQuorum().token();
this.newCommitCounter = newCommitCounter;
this.newCommitTime = newCommitTime;
getQuorum().assertLeader(token);
// Local HA service implementation (non-Remote).
quorumService = getQuorum().getClient();
this.joinedServiceIds = joinedServiceIds;
this.leaderId = quorumService.getServiceId();
leadersValue = ((InnerJournalTransactionService) getTransactionService())
.newHANotifyReleaseTimeRequest(leaderId, newCommitCounter,
newCommitTime);
// Note: Local method call.
timestampOnLeader = leadersValue.getTimestamp();
// /*
// * Only the followers will countDown() at the barrier. The leader
// * will await() until the barrier breaks.
// */
final int nparties = joinedServiceIds.length;// - 1;
barrier = new CyclicBarrier(nparties, this);
}
/**
* Find the minimum value across the responses when the {@link #barrier}
* breaks.
*/
@Override
public void run() {
try {
if (log.isInfoEnabled())
log.info("leader: " + leadersValue);
// This is the timestamp from the BarrierState ctor.
final long timeLeader = leadersValue.getTimestamp();
// This is the timestamp for right now.
final long timeNow = newConsensusProtocolTimestamp();
// // The local clock must be moving forward.
// assertBefore(timeLeader, timeNow);
// Start with the leader's value (from ctor).
minimumResponse = leadersValue;
for (IHANotifyReleaseTimeRequest response : followerResponses
.values()) {
if (log.isTraceEnabled())
log.trace("follower: " + response);
if (response.isMock()) {
/**
* The mock response should not have any influence on
* the consensus release time. The follower provides the
* mock response when it is unable to execute the
* GatherTask correctly, typically because it is not yet
* HAReady. The mock response preserves liveness for the
* GATHER protocol. The follower that provided the mock
* response will vote NO for the PREPARE because its
* GatherTask will have thrown an exception.
*
* @see HA3 simultaneous service start failure
*/
log.warn("Ignoring mock response: " + response);
continue;
}
final UUID followerId = response.getServiceUUID();
if (minimumResponse.getPinnedCommitCounter() > response
.getPinnedCommitCounter()) {
minimumResponse = response;
}
/*
* Verify that the timestamp from the ctor is BEFORE the
* timestamp assigned by the follower in the GatherTask.
*/
assertBefore(leaderId, followerId, timeLeader,
response.getTimestamp());
/*
* Verify that the timestamp from the GatherTask on the
* follower is before the timestamp obtained at the top of
* this run() method.
*/
assertBefore(followerId, leaderId, response.getTimestamp(),
timeNow);
}
// Restate the consensus as an appropriate message object.
consensus = new HANotifyReleaseTimeResponse(
minimumResponse.getPinnedCommitTime(),
minimumResponse.getPinnedCommitCounter());
if (log.isInfoEnabled())
log.info("consensus: " + consensus);
} catch (Throwable t) {
// Set the cause.
cause = t;
}
}
/**
* Task does an RMI to the follower to start the GatherTask on the
* follower.
*/
private final class StartGatherOnFollowerTask implements Callable<Void> {
private final UUID serviceId;
private final IHAGatherReleaseTimeRequest msg;
public StartGatherOnFollowerTask(final UUID serviceId,
final IHAGatherReleaseTimeRequest msg) {
this.serviceId = serviceId;
this.msg = msg;
}
public Void call() throws Exception {
// Resolve joined service.
final HATXSGlue service = getService(serviceId);
// Message remote service.
// Note: NPE if [service] is gone.
service.gatherMinimumVisibleCommitTime(msg);
// Done.
return null;
}
} // class StartGatherOnFollowerTask
/**
* Send an {@link IHAGatherReleaseTimeRequest} message to each follower.
* Block until the responses are received.
*
* Note: Like the 2-phase commit, the overall protocol should succeed if
* we can get ((k+1)/2) services that do not fail this
* step. Thus for HA3, we should allow one error on a follower, the
* leader is sending the messages and is presumed to succeed, and one
* follower COULD fail without failing the protocol. If the protocol
* does fail we have to fail the commit, so getting this right is
* NECESSARY. At a minimum, we must not fail if all joined services on
* entry to this method respond without failing (that is, succeed if no
* services fail during this protocol) - this is implemented.
*
* @throws InterruptedException
* @throws BrokenBarrierException
* @throws TimeoutException
*
* @see Native thread leak in HAJournalServer process
*/
private void messageFollowers(final long token, final long timeoutNanos)
throws IOException, InterruptedException,
BrokenBarrierException, TimeoutException {
final long begin = System.nanoTime();
final long nanos = timeoutNanos;
long remaining = nanos;
// Verify that this service is still the leader.
getQuorum().assertLeader(token);
// /*
// * Future for gather task for each follower.
// *
// * Note: These are asynchronous remote Futures. They must not escape
// * the local scope and must be cancelled regardless of the outcome.
// *
// * Note: DGC for these remote causes a native thread leak on the
// * followers. To avoid that, I am attempting to rely on proxies for
// * remote futures that do not use DGC. In this case, I believe that
// * it will work since the Future is in scope on the follower (it is
// * the future for the GatherTask running on the follower) and thus
// * we should not need DGC to keep the follower from finalizing the
// * remote futures on which this method is relying.
// *
// * @see https://sourceforge.net/apps/trac/bigdata/ticket/673
// */
// @SuppressWarnings("unchecked")
// final Future[] remoteFutures = new Future[joinedServiceIds.length];
// final boolean[] remoteDone = new boolean[joinedServiceIds.length];
// Local future for RMI requesting GATHER on each follower.
final List<Future<Void>> futures = new LinkedList<Future<Void>>();
try {
final IHAGatherReleaseTimeRequest msg = new HAGatherReleaseTimeRequest(
token, timestampOnLeader, leaderId, newCommitCounter,
newCommitTime);
// Do not send message to self (leader is at index 0).
for (int i = 1; i < joinedServiceIds.length; i++) {
final UUID serviceId = joinedServiceIds[i];
/*
* Message each follower.
*
* Note: The invoked RMI method submits the GatherTask that
* executes on the follower and returns. It does not block
* waiting for the outcome of the task on the follower.
* Instead, we wait until the barrier breaks. A thread will
* monitor the quorum state and break the barrier if the
* quorum breaks or if a joined service leaves during the
* consensus protocol.
*
* Note: This uses multiple threads to issue the requests in
* parallel against the followers in order to minimize the
* latency of the protocol.
*/
// Note: throws RejectedExecutionException if shutdown.
futures.add(getExecutorService().submit(
new StartGatherOnFollowerTask(serviceId, msg)));
// // add to list of futures we will check.
// remoteFutures[i] = rf;
}
// /*
// * Check the futures for the other services in the quorum.
// */
// final List causes = new LinkedList();
// for (int i = 1; i < remoteFutures.length; i++) {
// final Future rf = remoteFutures[i];
// boolean success = false;
// try {
// rf.get();
// success = true;
// remoteDone[i] = true;
// } catch (InterruptedException ex) {
// log.error(ex, ex);
// causes.add(ex);
// } catch (ExecutionException ex) {
// log.error(ex, ex);
// causes.add(ex);
// remoteDone[i] = true;
// } catch (RuntimeException ex) {
// /*
// * Note: ClientFuture.get() can throw a RuntimeException
// * if there is a problem with the RMI call. In this case
// * we do not know whether the Future is done.
// */
// log.error(ex, ex);
// causes.add(ex);
// } finally {
// if (!success) {
// // Cancel the request on the remote service (RMI).
// try {
// rf.cancel(true/* mayInterruptIfRunning */);
// } catch (Throwable t) {
// // ignored.
// }
// remoteDone[i] = true;
// }
// }
// }
// spinWaitBarrier(getQuorum(), barrier, token, timeout, units);
/*
* This sets up a task that will monitor the quorum state and
* then interrupt this Thread if it is blocked at the barrier
* [actually, it uses barrier.reset(), which appears to be a
* little safer].
*
* If this service is no longer the quorum leader or if any of
* the services leave that were joined with the met quorum when
* we started the release time consensus protocol, then we have
* to reset() the barrier. We achieve this by interrupting the
* Thread (actually it now uses barrier.reset()).
*
* Note: CyclicBarrier.await(timeout,unit) causes the barrier to
* break if the timeout is exceeded (as opposed to simply throwing
* the TimeoutException and allowing the thread to retry the
* CyclicBarrier.await()). Therefore it CAN NOT be used in
* preference to this pattern. However, we could replace the use of
* the CyclicBarrier with a Phaser (JDK 1.7 or jsr166).
*/
{
// final Thread blockedAtBarrier = Thread.currentThread();
final Quorum<HAGlue, QuorumService<HAGlue>> quorum = getQuorum();
final long initialDelay = 100; // milliseconds.
final long delay = initialDelay;
final ScheduledFuture<?> scheduledFuture = scheduledExecutorService
.scheduleWithFixedDelay(new Runnable() {
public void run() {
try {
// Verify service is still leader.
quorum.assertLeader(token);
// Verify service self-recognizes as leader.
if (getHAStatus() != HAStatusEnum.Leader) {
throw new QuorumException();
}
// Verify messaged services still
// joined.
assertServicesStillJoined(quorum);
for (Future<Void> f : futures) {
if (f.isDone()) {
/*
* Note: If any follower fails
* on the RMI, then that is
* noticed here and the GATHER
* will fail on the leader.
*
* TODO This should be robust as
* long as a majority of the
* services succeed. Right now
* this will stop the GATHER if
* any service fails on the RMI.
*/
f.get();
}
}
} catch (Throwable ex) {
if (InnerCause.isInnerCause(ex,
InterruptedException.class)) {
// Normal termination.
return;
}
logErrorAndResetBarrier(ex);
}
}
}, initialDelay, delay, TimeUnit.MILLISECONDS);
// Update time remaining.
remaining = nanos - (System.nanoTime() - begin);
try {
/*
* Throws InterruptedException, BrokenBarrierException,
* TimeoutException.
*
* Note: If TimeoutException is thrown, then the barrier
* will be broken.
*/
barrier.await(remaining, TimeUnit.NANOSECONDS);
} finally {
scheduledFuture.cancel(true/* mayInterruptIfRunning */);
}
}
// /*
// * If there were any errors, then throw an exception listing them.
// */
// if (!causes.isEmpty()) {
// // Note: Cancelled below.
//// // Cancel remote futures.
//// cancelRemoteFutures(remoteFutures);
// // Throw exception back to the leader.
// if (causes.size() == 1)
// throw new RuntimeException(causes.get(0));
// throw new RuntimeException("remote errors: nfailures="
// + causes.size(), new ExecutionExceptions(causes));
// }
} finally {
/*
* Cancel local futures for RMI messages to followers.
*
* Note: Regardless of outcome or errors above, ensure that the
* futures used to initiate the GatherTask on the followers are
* cancelled. These are local Futures that do RMIs. The RMIs
* should not block when they execute on the follower.
*/
for (Future<Void> f : futures) {
/*
* Await the Future with a short timeout for the task that
* did the RMI to each follower to participate in the GATHER
* protocol. Ensure that these Futures are cancelled if they
* are not already done, but prefer not to attempt to cancel
* the Future if it would be finished if we waited for a few
* milliseconds.
*/
boolean done = false;
try {
f.get(10, TimeUnit.MILLISECONDS);
done = true;
} catch (Exception ex) {
// IGNORE
} finally {
if (!done) {
// cancel local future.
f.cancel(true/* mayInterruptIfRunning */);
}
}
/*
* Future is done. Either cancelled, error, or successfully
* completed per the code block immediately above.
*/
try {
// check outcome of future.
f.get();
} catch (CancellationException e) {
/*
* There is a race between the code path returning from
* the RMI (to request the caller to participate in the
* gather protocol) and the code path in which the
* leader awakens from the broken barrier (above). We
* have explicitly cancelled the future for that RMI and
* then checked the Future. If the leader wakes up
* before the RMI returns, then the Future will be
* cancelled rather than done. This case is a race, not an
* error.
*/
if (log.isInfoEnabled())
log.info(e);// , e);
} catch (ExecutionException e) {
// Probably error on the RMI.
log.error(e, e);
}
}
if (consensus == null) {
/*
* If there were any followers that did not message the
* leader and cause the barrier to be decremented and hence
* the [consensus] to become defined, then we need to
* decrement the barrier for those followers now in order
* for it to break.
*
* There is no method to decrement by a specific number
* (unlike a semaphore), but you can reset() the barrier,
* which will cause a BrokenBarrierException for all Threads
* waiting on the barrier.
*
* Note: It appears that [barrier.isBroken()] always reports
* [false] here. Hence, the test was changed to
* [consensus==null]. See
* HA3 simultaneous service start failure
*
* FIXME HA TXS: A reset() here does not allow us to proceed
* with the consensus protocol unless all services
* "vote yes". Thus, a single node failure during the
* release time consensus protocol will cause the commit to
* fail. [Actually, we could use getNumberWaiting(). If it
* is a bare majority, then we could take the barrier break
* action ourselves. E.g., in the thread that calls
* barrier.reset()]. [Actually, this might not be a problem
* for cases where the GatherTask is able to send back a
* mock IHANotifyReleaseTimeRequest message, only when we
* are interrupted by the Runnable above that is monitoring
* the quorum state for an invariant change.]
*/
log.error("Forcing barrier break");
barrier.reset();
}
}// finally
}
// /**
// * Wait on the {@link CyclicBarrier}, but do this in a loop so we can
// * watch for a quorum break or service leave.
// *
// * @param quorum
// * @param barrier
// * @param timeout
// * @param units
// *
// * @throws BrokenBarrierException
// * @throws InterruptedException
// * @throws TimeoutException
// */
// private void spinWaitBarrier(
// final Quorum> quorum,
// final CyclicBarrier barrier, final long token,
// final long timeout, final TimeUnit unit)
// throws BrokenBarrierException, InterruptedException,
// TimeoutException {
//
// if (log.isInfoEnabled())
// log.info("Waiting at barrier: #parties=" + barrier.getParties()
// + ", #waiting=" + barrier.getNumberWaiting()
// + ", isBroken=" + barrier.isBroken() + ", token="
// + token + ", timeout=" + timeout + ", unit=" + unit);
//
// // How lock to block in each iteration.
// final long blockNanos = TimeUnit.MILLISECONDS.toNanos(10000);
//
// final long begin = System.nanoTime();
// final long nanos = unit.toNanos(timeout);
// long remaining = nanos;
// long nspin = 0L;
//
// try {
//
// while (remaining > 0) {
//
// nspin++;
//
// remaining = nanos - (System.nanoTime() - begin);
//
// try {
//
// // Verify that this service remains the leader.
// quorum.assertLeader(token);
//
// // Verify messaged services are still joined.
// assertServicesStillJoined(quorum);
//
// /*
// * If we observe a serviceLeave for any service that we
// * are awaiting, then we need to stop waiting on that
// * service. This could be achieved by running a Thread
// * that did a barrier.await() on the behalf of that
// * service, but only if that service has not yet
// * responded with its input for the consensus protocol
// * [if it has responded then it is probably already at
// * barrier.await() in a Thread on the leader for that
// * follower.]
// */
// final long awaitNanos = Math.min(blockNanos, remaining);
//
// /*
// * Await barrier, up to the timeout.
// *
// * Note: Contrary to the javadoc, barrier.await(timeout)
// * will break the barrier if the timeout is exceeded!!!
// */
// barrier.await(awaitNanos, TimeUnit.NANOSECONDS);
//
// // Done.
// return;
//
// } catch (TimeoutException e) {
// // Spin.
// continue;
// } catch (InterruptedException e) {
// throw e;
// } catch (BrokenBarrierException e) {
// throw e;
// }
//
// }
//
// } finally {
//
// /*
// * Note: On exit, the caller must reset() the barrier if it is
// * not yet broken.
// */
//
// if (log.isInfoEnabled())
// log.info("barrier: #parties=" + barrier.getParties()
// + ", #waiting=" + barrier.getNumberWaiting()
// + ", isBroken=" + barrier.isBroken() + ", #spin="
// + nspin);
//
// }
//
// }
private void logErrorAndResetBarrier(final Throwable ex) {
log.error(ex, ex);
if (!barrier.isBroken()) {
log.error("Forcing barrier break");
barrier.reset();
}
}
/**
* Verify that the services that were messaged for the release time
* consensus protocol are still joined with the met quorum.
*
* @throws QuorumException
* if one of the joined services leaves.
*/
private void assertServicesStillJoined(
final Quorum<HAGlue, QuorumService<HAGlue>> quorum)
throws QuorumException {
final UUID[] tmp = quorum.getJoined();
for (UUID serviceId : joinedServiceIds) {
boolean found = false;
for (UUID t : tmp) {
if (serviceId.equals(t)) {
found = true;
break;
}
}
if (!found) {
throw new QuorumException(
"Service leave during consensus protocol: "
+ serviceId);
}
}
}
}
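/*
 * The shape of the protocol implemented by BarrierState, reduced to a
 * self-contained sketch (illustrative only; the names and values below are
 * invented for the example and are not part of this class). Each party
 * publishes its locally pinned value into a concurrent map and then awaits
 * the barrier; the barrier action runs exactly once when the last party
 * arrives, folding the published values into a single consensus minimum
 * that all parties can then read.
 *
 *   final ConcurrentHashMap<UUID, Long> responses = new ConcurrentHashMap<UUID, Long>();
 *   final AtomicReference<Long> consensus = new AtomicReference<Long>();
 *   final int nparties = 3;
 *   final CyclicBarrier barrier = new CyclicBarrier(nparties, new Runnable() {
 *       public void run() {
 *           long min = Long.MAX_VALUE;
 *           for (Long v : responses.values())
 *               min = Math.min(min, v);
 *           consensus.set(min); // visible to all parties once the barrier breaks.
 *       }
 *   });
 *   // Each party, in its own thread:
 *   //   responses.put(myServiceId, myPinnedCommitTime);
 *   //   barrier.await(); // or await(timeout, unit) with a monitor that reset()s on error.
 *   //   final long agreed = consensus.get();
 */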
/**
* Note: This deliberately uses the (non-remote) method
* {@link BasicHA#nextTimestamp()}. This is done so we can write a unit test
* of the {@link GatherTask} that imposes clock skew by overriding the next
* value to be returned by that method.
*/
private long newConsensusProtocolTimestamp() {
return ((BasicHA) getQuorum().getClient().getService()).nextTimestamp();
}
/**
* {@inheritDoc}
*
* Extends the {@link JournalTransactionService} to provide protection for
* the session protection mode of the {@link RWStore} and to support the
* {@link HATXSGlue} interface.
*
* @author Bryan Thompson
*
* @see HA TXS Design Document
*
* @see HA TXS / TXS Bottleneck
*/
private class InnerJournalTransactionService extends
JournalTransactionService {
protected InnerJournalTransactionService() {
super(checkProperties(properties), Journal.this);
final long lastCommitTime = Journal.this.getLastCommitTime();
if (lastCommitTime != 0L) {
/*
* Notify the transaction service on startup so it can set the
* effective release time based on the last commit time for the
* store.
*
* Note: For HA, the releaseTime is updated by the consensus
* protocol once a quorum is met. Before the quorum meets (and
* before a service joins with a met quorum) each service will
* track its own releaseTime. Therefore, during startup, the
* quorum will be null or HAStatusEnum will be NotReady so the
* TXS will automatically track the release time until the
* service joins with a met quorum.
*/
if (log.isInfoEnabled())
log.info("Startup: lastCommitTime=" + lastCommitTime);
updateReleaseTimeForBareCommit(lastCommitTime);
}
}
/**
* @see RWStore does not track tx release correctly
*/
final private ConcurrentHashMap<Long, IRawTx> m_rawTxs = new ConcurrentHashMap<Long, IRawTx>();
/**
* This lock is used to ensure that the following actions are MUTEX:
*
* - The barrier where we obtain a consensus among the services joined
* with the met quorum concerning the new release time.
* - A remote service that wishes to join an already met quorum.
* - A new transaction start that would read on a commit point which
* is LT than the readsOnCommitTime of the earliestActiveTx for this
* service but GT earliest visible commit point for this service (as
* determined by the releaseTime on the transaction service).
*
* Any of these actions must contend for the {@link #barrierLock}.
*
* @see #updateReleaseTimeConsensus(long, TimeUnit)
* @see GatherTask#call()
* @see #newTx(long)
* @see #runWithBarrierLock(Runnable)
*/
final private ReentrantLock barrierLock = new ReentrantLock();
// final private Condition barrierBroke = barrierLock.newCondition();
/**
* This is used to coordinate the protocol for achieving an atomic
* consensus on the new releaseTime for the services joined with
* a met quorum.
*/
final private AtomicReference<BarrierState> barrierRef = new AtomicReference<BarrierState>();
@Override
public void runWithBarrierLock(final Runnable r) {
barrierLock.lock();
try {
haLog.info("Will run with barrier lock.");
try {
r.run();
} catch(Throwable t) {
/*
* Note: An Interrupt here is not really an ERROR. It
* could be caused by a change in the RunState of the
* HAJournalServer.
*/
haLog.error(t, t);
} finally {
haLog.info("Did run with barrier lock.");
}
} finally {
barrierLock.unlock();
}
}
/**
* {@inheritDoc}
*
* We need to obtain a distributed consensus for the services joined with
* the met quorum concerning the earliest commit point that is pinned by
* the combination of the active transactions and the minReleaseAge on
* the TXS.
*
* New transaction starts during this critical section will block (on
* the leader or the follower) unless they are guaranteed to be
* allowable, e.g., based on the current minReleaseAge, the new tx would
* read from the most recent commit point, the new tx would read from a
* commit point that is already pinned by an active transaction on that
* node, etc.
*
* @throws IOException
* @throws BrokenBarrierException
*/
// Note: Executed on the leader.
@Override
public IHANotifyReleaseTimeResponse updateReleaseTimeConsensus(
final long newCommitCounter,
final long newCommitTime,
final UUID[] joinedServiceIds, final long timeout,
final TimeUnit units) throws IOException, InterruptedException,
TimeoutException, BrokenBarrierException {
final long begin = System.nanoTime();
final long nanos = units.toNanos(timeout);
long remaining = nanos;
final long token = getQuorum().token();
if (haLog.isInfoEnabled())
haLog.info("GATHER PROTOCOL: commitCounter=" + newCommitCounter
+ ", token=" + token + ", joinedServiceIds="
+ Arrays.toString(joinedServiceIds));
final BarrierState barrierState;
barrierLock.lock();
try {
getQuorum().assertLeader(token);
if (!barrierRef.compareAndSet(null/* expectedValue */,
barrierState = new BarrierState(newCommitCounter,
newCommitTime, joinedServiceIds)/* newValue */)) {
throw new IllegalStateException();
}
try {
/*
* Message the followers and block until the barrier breaks.
*/
// Update time remaining.
remaining = nanos - (System.nanoTime() - begin);
barrierState.messageFollowers(token, remaining);
} finally {
// Clear the barrierRef.
if (!barrierRef.compareAndSet(barrierState/* expected */,
null)) {
throw new AssertionError();
}
}
if (barrierState.cause != null) {
/*
* If an exception was recorded, re-throw it in the thread
* that invoked commitNow().
*/
throw new RuntimeException(barrierState.cause);
}
/**
* Update the release time on the leader.
*
* Note: The follower has the new release time, but it is
* running asynchronously and might not have updated its release
* time locally by the time the leader leaves the consensus
* protocol. prepare2Phase() (on the follower) will check the
* Future of the GatherTask and block until it is complete. Thus
* all services will be in a state where they are known to have
* updated their release time (based on the consensus protocol)
* before we finish prepare2Phase() and hence before we run
* commit2Phase().
*
* @see Native thread leak in HAJournalServer process
*/
final IHANotifyReleaseTimeResponse consensus = barrierState.consensus;
if (consensus == null) {
throw new RuntimeException("No consensus");
}
final long consensusValue = consensus.getCommitTime();
final long newReleaseTime = Math.max(0L, consensusValue - 1);
if (log.isInfoEnabled())
log.info("Advancing releaseTime on leader: "
+ newReleaseTime);
setReleaseTime(newReleaseTime);
return consensus;
} finally {
barrierLock.unlock();
}
}
/**
* {@inheritDoc}
*
* Overridden to notice whether this service is using the consensus
* protocol to update the releaseTime or updating it automatically as
* transactions complete.
*
* @see Journal HA
*/
@Override
protected boolean isReleaseTimeConsensusProtocol() {
final HAStatusEnum haStatus = getHAStatus();
if (haStatus == null || haStatus == HAStatusEnum.NotReady) {
/*
* Since we are not HA or this service is not HAReady, we will
* not use the consensus protocol to update the releaseTime.
*
* Therefore the releaseTime is updated here since we will not
* (actually, did not) run the consensus protocol to update it.
*/
return false;
}
/*
* Note: When we are using a 2-phase commit, the leader can not
* update the release time from commit() using this method. It
* must rely on the consensus protocol to update the release
* time instead.
*/
return true;
}
// /**
// * {@inheritDoc}
// *
// * Note: When we are using a 2-phase commit, the leader can not update
// * the release time from commit() using this methods. It must rely on
// * the consensus protocol to update the release time instead.
// *
// * @see
// * Journal HA
// */
// @Override
// protected void updateReleaseTimeForBareCommit(final long commitTime) {
//
// final HAStatusEnum haStatus = getHAStatus();
//
// if (haStatus == null || haStatus == HAStatusEnum.NotReady) {
//
// /*
// * Since we are not HA or this service is not HAReady, we will
// * not use the consensus protocol to update the releaseTime.
// *
// * Therefore the releaseTime is updated here since we will not
// * (actually, did not) run the consensus protocol to update it.
// */
// super.updateReleaseTimeForBareCommit(commitTime);
//
// } else {
//
// /*
// * Note: When we are using a 2-phase commit, the leader can not
// * update the release time from commit() using this methods. It
// * must rely on the consensus protocol to update the release
// * time instead.
// */
//
// }
//
// }
/**
* {@inheritDoc}
*
* Overridden to take the necessary lock since we are invoking this
* method from contexts in which the lock would not otherwise be held.
*
* Note: This is also used to callback on service join to set the
* consensus release time from the leader of a newly joined follower.
* This ensures that live joiners need not take part in a gather and can
* still confidently join in an HA 2 phase commit.
*
* @param newReleaseTime
* The new release time for the local journal.
*/
@Override
public void setReleaseTime(final long newValue) {
if (newValue < 0)
throw new IllegalArgumentException();
lock.lock();
try {
super.setReleaseTime(newValue);
} finally {
lock.unlock();
}
}
/**
* Factory for {@link IHANotifyReleaseTimeRequest} messages. This is
* used by both the leader and the followers.
*
* @param serviceId
* The {@link UUID} for this service.
* @return The new message.
*/
protected IHANotifyReleaseTimeRequest newHANotifyReleaseTimeRequest(
final UUID serviceId, final long newCommitCounter,
final long newCommitTime) {
// On AbstractTransactionService.
final long effectiveReleaseTimeForHA = getEffectiveReleaseTimeForHA();
// On AbstractJournal
final ICommitRecord commitRecord = getEarliestVisibleCommitRecordForHA(effectiveReleaseTimeForHA);
final long commitCounter = commitRecord == null ? 0
: commitRecord.getCommitCounter();
final long commitTime = commitRecord == null ? 0
: commitRecord.getTimestamp();
// final long now = getLocalTransactionManager().nextTimestamp();
final long now = newConsensusProtocolTimestamp();
final IHANotifyReleaseTimeRequest req = new HANotifyReleaseTimeRequest(
serviceId, commitTime, commitCounter, now, false/* isMock */,
newCommitCounter, newCommitTime);
if (log.isTraceEnabled())
log.trace("releaseTime=" + getReleaseTime()//
+ ",effectiveReleaseTimeForHA="
+ effectiveReleaseTimeForHA //
+ ",rootBlock=" + getRootBlockView() //
+ ",req=" + req//
);
return req;
}
@Override
public Callable<IHANotifyReleaseTimeResponse> newGatherMinimumVisibleCommitTimeTask(
final HAGlue leader, final UUID serviceId,
final IHAGatherReleaseTimeRequest req) {
return new GatherTask(leader, serviceId, req);
}
/**
* {@inheritDoc}
*
* Note: This method is implemented by {@link AbstractJournal.BasicHA}
* which calls through to
* {@link #newGatherMinimumVisibleCommitTimeTask(IHAGatherReleaseTimeRequest)}
*
* @throws UnsupportedOperationException
*/
@Override
public void gatherMinimumVisibleCommitTime(
final IHAGatherReleaseTimeRequest req) throws IOException {
throw new UnsupportedOperationException();
}
/**
* "Gather" task runs on the followers.
*
* Note: The gather task scopes the consensus protocol on the follower.
* It contends for the {@link #barrierLock} (on the follower) in order
* to be MUTEX with new read-only tx starts on the follower which (a)
* occur during the consensus protocol; and (b) would read on a commit
* point that is not pinned by any of an active transaction on the
* follower, the minReleaseAge, or being the most recent commit point.
* These are the criteria that allow {@link #newTx(long)} to NOT contend
* for the {@link #barrierLock}.
*
* @see #newTx(long)
*/
private class GatherTask implements Callable<IHANotifyReleaseTimeResponse> {
/**
* The proxy for the leader (the service that made this request).
* This is used to RMI back to the leader and therefore MUST be non-null.
*/
private final HAGlue leader;
/**
* The {@link UUID} of this service. This is required as
* part of the RMI back to the leader (so the leader knows which
* services responded) and therefore MUST be non-null.
*/
private final UUID serviceId;
private final IHAGatherReleaseTimeRequest req;
/**
* This variable is set in the try {} block in {@link #call()}. We
* eventually respond (sending an RMI to the leader) either in the
* try{} or in the finally{}, depending on whether or not the
* {@link GatherTask} encounters an error when it executes.
*/
volatile private boolean didNotifyLeader = false;
public GatherTask(final HAGlue leader, final UUID serviceId,
final IHAGatherReleaseTimeRequest req) {
if (leader == null)
throw new IllegalArgumentException();
if (serviceId == null)
throw new IllegalArgumentException();
if (req == null)
throw new IllegalArgumentException();
this.leader = leader;
this.serviceId = serviceId;
this.req = req;
}
/**
* Note: This needs to be robust to most kinds of errors. However,
* if the quorum breaks (leader leaves) or if a follower leaves that
* was joined with the met quorum as of the atomic decision point in
* commitNow(), then that change will be detected by the leader and
* it will break the {@link CyclicBarrier}.
*/
@Override
public IHANotifyReleaseTimeResponse call() throws Exception {
if (haLog.isInfoEnabled())
haLog.info("Running gather on follower");
/*
* This variable is set in the try {} below. We eventually
* respond either in the try{} or in the finally{}, depending on
* whether or not the GatherTask encounters an error when it
* executes.
*/
didNotifyLeader = false;
try {
/*
* Test pre-conditions BEFORE getting the barrierLock. This
* allows a service that is not yet properly joined to
* refuse to do the GATHER before it obtains the barrierLock
* that makes the GatherTask MUTEX with
* doCastLeadersVoteAndServiceJoin().
*/
preconditionTest();
barrierLock.lock(); // take lock on follower!
try {
// Re-test the pre-conditions.
preconditionTest();
return doRunWithBarrierLock();
} finally {
barrierLock.unlock();
}
} catch (Throwable t) {
log.error(t, t);
if (!didNotifyLeader) {
/**
* Send mock response to the leader so it does not block
* forever waiting for our response. The mock response
* MUST include our correct serviceId.
*
* @see HA3 simultaneous service start failure
*/
try {
final IHANotifyReleaseTimeRequest resp = new HANotifyReleaseTimeRequest(
serviceId, 0L/* pinnedCommitTime */,
0L/* pinnedCommitCounter */,
nextTimestamp()/* timestamp */,
true/* isMock */,
req.getNewCommitCounter(),
req.getNewCommitTime());
log.warn("Sending mock response for gather protocol: cause="
+ t);
// Will block until barrier breaks on leader.
leader.notifyEarliestCommitTime(resp);
} catch (Throwable t2) {
log.error(t2, t2);
}
}
/*
* This exception will force PREPARE to fail on this service
* when it checks the GatherTask's Future.
*/
throw new Exception(t);
}
}
/**
* Check various conditions that need to be true.
*
* Note: We do this once before we take the barrier lock and once
* after. We need to do this before we take the barrier lock to
* avoid a distributed deadlock when a service is attempting to do
* runWithBarrierLock() to join concurrent with the GATHER of a
* 2-phase commit. We do it after we take the barrier lock to ensure
* that the conditions are still satisfied - they are all light
* weight tests, but the conditions could become invalidated so it
* does not hurt to check again.
*/
private void preconditionTest() {
final long token = req.token();
/*
* we do not need to handle the case where the token is
* invalid. The leader will reset() the CyclicBarrier for
* this case.
*/
// Verify quorum valid for token (implies leader valid)
getQuorum().assertQuorum(token);
// Verify this service is HAReady for token.
assertHAReady(token);
/*
* If the quorumService is null because this service is
* shutting down then the leader will notice the
* serviceLeave() and reset() the CyclicBarrier.
*/
final QuorumService<HAGlue> quorumService = getQuorum()
.getClient();
// /*
// * This timestamp is used to help detect clock skew.
// */
// now = newConsensusProtocolTimestamp();
/*
* Note: At this point we have everything we need to form up
* our response. If we hit an assertion, we will still
* respond in the finally {} block below.
*/
/*
* Note: This assert has been moved to the leader when it
* analyzes the messages from the followers. This allows us
* to report out the nature of the exception on the leader
* and thence back to the client.
*/
// /* Verify event on leader occurs before event on follower.
// */
// assertBefore(req.getTimestampOnLeader(), now);
if (!quorumService.isFollower(token))
throw new QuorumException();
final long localCommitCounter = getRootBlockView()
.getCommitCounter();
if (req.getNewCommitCounter() != localCommitCounter + 1) {
throw new RuntimeException(
"leader is preparing for commitCounter="
+ req.getNewCommitCounter()
+ ", but follower is at localCommitCounter="
+ localCommitCounter);
}
}
/**
* This code is MUTEX with runWithBarrierLock() in HAJournalServer's
* doCastLeadersVoteAndJoin().
*/
private IHANotifyReleaseTimeResponse doRunWithBarrierLock()
throws Exception {
final IHANotifyReleaseTimeRequest req2 = newHANotifyReleaseTimeRequest(
serviceId, req.getNewCommitCounter(),
req.getNewCommitTime());
/*
* RMI to leader.
*
* Note: Will block until barrier breaks on the leader.
*/
didNotifyLeader = true;
final IHANotifyReleaseTimeResponse consensusReleaseTime = leader
.notifyEarliestCommitTime(req2);
/*
* Now spot check the earliest active tx on this follower. We
* want to make sure that this tx is not reading against a
* commit point whose state would be released by the new
* [consensusReleaseTime] that we just obtained from the leader.
*
* If everything is Ok, we update the releaseTime on the
* follower.
*/
lock.lock();
try {
if (log.isInfoEnabled())
log.info("Validating consensus releaseTime on follower: consensus="
+ consensusReleaseTime);
// the earliest active tx on this follower.
final TxState txState = getEarliestActiveTx();
// Consensus for new earliest visible commit time.
final long t2 = consensusReleaseTime.getCommitTime();
if (txState != null && txState.getReadsOnCommitTime() < t2) {
/*
* At least one transaction exists on the follower that
* is reading on a commit point LT the commit point
* which would be released. This is either a failure in
* the logic to compute the consensus releaseTime or a
* failure to exclude new transaction starts on the
* follower while computing the new consensus
* releaseTime.
*/
throw new AssertionError(
"The releaseTime consensus would release a commit point with active readers"
+ ": consensus=" + consensusReleaseTime
+ ", earliestActiveTx=" + txState);
}
final long newReleaseTime = Math.max(0L,
consensusReleaseTime.getCommitTime() - 1);
if (log.isInfoEnabled())
log.info("Advancing releaseTime on follower: "
+ newReleaseTime);
// Update the releaseTime on the follower
setReleaseTime(newReleaseTime);
} finally {
lock.unlock();
}
// Done.
return consensusReleaseTime;
} // doRunWithBarrierLock
} // GatherTask
/**
* {@inheritDoc}
*
* Note: Message sent by follower (RMI). Method executes on leader.
*
* We pass the message through to the {@link BarrierState} object.
*
* Note: We MUST NOT contend for the {@link #barrierLock} here. That
* lock is held by the Thread that invoked
* {@link #updateReleaseTimeConsensus()}.
*/
@Override
public IHANotifyReleaseTimeResponse notifyEarliestCommitTime(
final IHANotifyReleaseTimeRequest req) throws IOException,
InterruptedException, BrokenBarrierException {
/*
* Note: Do NOT error check [req] until we are in the try{} /
* finally {} below that will do the CyclicBarrier.await().
*/
final BarrierState barrierState = barrierRef.get();
if (barrierState == null) {
/*
* If the BarrierState reference has been cleared then it is not
* possible for us to count down at the barrier for this message
* (since the CyclicBarrier is gone). Otherwise, we will await()
* at the CyclicBarrier regardless of the message.
*/
throw new IllegalStateException();
}
try {
if (haLog.isInfoEnabled())
haLog.info("resp=" + req);
getQuorum().assertLeader(barrierState.token);
if (barrierState.newCommitCounter != req.getNewCommitCounter()) {
/*
* Response is for the wrong GATHER request.
*/
throw new RuntimeException(
"Wrong newCommitCounter: expected="
+ barrierState.newCommitCounter
+ ", actual=" + req.getNewCommitCounter());
}
if (barrierState.newCommitTime != req.getNewCommitTime()) {
/*
* Response is for the wrong GATHER request.
*/
throw new RuntimeException("Wrong newCommitTime: expected="
+ barrierState.newCommitTime + ", actual="
+ req.getNewCommitTime());
}
// ServiceId of the follower (NPE if req is null).
final UUID followerId = req.getServiceUUID();
// Make a note of the message from this follower.
barrierState.followerResponses.put(followerId, req);
} catch(RuntimeException e) {
/*
* Note: The try {} block can throw RuntimeException but not
* Exception. If anything is thrown, then reset the barrier and
* rethrow the exception.
*/
haLog.error(e, e);
// Reset the barrier (barrier will break).
barrierState.barrier.reset();
// Rethrow the exception.
throw new RuntimeException(e);
} finally {
/*
* Block until barrier breaks.
*
* Note: The barrier will break immediately if it was reset in
* the catch{} block above.
*/
try {
if (haLog.isInfoEnabled()) {
haLog.info("Awaiting barrier: #followerResponses="
+ barrierState.followerResponses.size() + ", #parties="
+ barrierState.barrier.getParties()
+ ", #joinedUUIDs="
+ barrierState.joinedServiceIds.length);
}
} finally {
/*
* Follower blocks on Thread on the leader here.
*
* Note: This will throw InterruptedException -or-
* BrokenBarrierException if the barrier is reset().
*/
barrierState.barrier.await();
}
}
/*
* Check for an error in the consensus protocol.
*/
final Throwable t = barrierState.cause;
if (t != null) {
/*
* Log error.
*/
haLog.error(t, t);
// rethrow cause.
throw new RuntimeException(t);
}
// Return the consensus.
final IHANotifyReleaseTimeResponse resp = barrierState.consensus;
if (resp == null) {
/*
* Log error, but return anyway.
*/
haLog.error("No consensus");
}
return resp;
}
/**
* Helper method returns the {@link HAStatusEnum} -or- null
* if this is not HA or if the {@link Quorum} is not running. This is a
* low latency local method call. The code path is against the
* local (non-remote) HAGlue object. It is NOT an RMI.
*
* @return The {@link HAStatusEnum} or null.
*/
private final HAStatusEnum getHAStatus() {
// Quorum iff HA.
final Quorum<HAGlue, QuorumService<HAGlue>> quorum = getQuorum();
if(quorum == null) {
// Not HA.
return null;
}
// Note: This is the local service interface.
final HAGlue localService;
try {
localService = quorum.getClient().getService();
} catch (IllegalStateException ex) {
/*
* Quorum client is not running (not started or terminated).
*/
return null;
}
// Note: Invocation against local HAGlue object (NOT RMI).
try {
return localService.getHAStatus();
} catch (IOException ex) {
// Note: Exception is never thrown (not RMI).
throw new RuntimeException(ex);
}
}
@Override
public long newTx(final long timestamp) {
if (TimestampUtility.isReadWriteTx(timestamp)) {
// The caller has provided a TxId, not a timestamp.
throw new IllegalArgumentException();
}
// The HAStatusEnum -or- null if not HA.
final HAStatusEnum haStatus = getHAStatus();
if (haStatus == null) {
// Not HA.
return _newTx(timestamp);
}
if (haStatus == HAStatusEnum.NotReady) {
// Not ready.
throw new QuorumException();
}
if (timestamp == ITx.UNISOLATED && haStatus != HAStatusEnum.Leader) {
// Read/Write Tx starts are only allowed on the Leader.
throw new QuorumException("Not quorum leader");
}
if (timestamp == ITx.UNISOLATED || timestamp == ITx.READ_COMMITTED) {
/*
* A read-write tx reads on the current commit point.
*
* A read-committed tx reads on the current commit point.
*
* The current commit point is always visible, so these requests
* are non-blocking.
*
* Note: We have verified that this service is the quorum leader
* above if the request is for a read-write tx.
*/
return _newTx(timestamp);
}
/*
* The request is a read-only tx against some specific historical
* commit point. It will be allowed (without blocking at the
* barrier) if the commit point is known to be pinned based on
* either the minReleaseAge or the earliestActiveTx. We use the
* AbstractTransactionService's lock to make these inspections
* atomic.
*/
lock.lock(); // Note: AbstractTransactionService.lock
try {
final long now = nextTimestamp();
final long minReleaseAge = getMinReleaseAge();
final long ageOfTxView = now - timestamp;
if (ageOfTxView < minReleaseAge) {
// Start tx. Commit point pinned by minReleaseAge.
return _newTx(timestamp);
}
/*
* Handle commit point pinned by earliestActiveTx's
* readsOnCommitTime.
*/
{
final TxState state = getEarliestActiveTx();
if (state != null && state.getReadsOnCommitTime() <= timestamp) {
// Start Tx. Commit point pinned by earliestActiveTx.
return _newTx(timestamp);
}
}
final IRootBlockView rootBlock = getRootBlockView();
if (rootBlock.getCommitCounter() == 0L) {
// Start Tx. No commits so nothing could be released.
return _newTx(timestamp);
}
if (rootBlock.getLastCommitTime() <= timestamp) {
// Tx reads on most recent commit point.
return _newTx(timestamp);
}
} finally {
lock.unlock();
}
/*
* Must block at barrier.
*/
barrierLock.lock();
try {
if (log.isInfoEnabled())
log.info("NewTx with barrierLock");
return _newTx(timestamp);
} finally {
barrierLock.unlock();
}
}
/**
* Core impl.
*
* This code pre-increments the active transaction count within the
* RWStore before requesting a new transaction from the transaction
* service. This ensures that the RWStore does not falsely believe
* that there are no open transactions during the call to
* AbstractTransactionService#newTx().
*
* Note: This code was moved into the inner class extending the
* {@link JournalTransactionService} in order to ensure that we
* follow this pre-incremental pattern for an {@link HAJournal} as
* well.
*
* @see BTree can not be cast to Name2Addr
* @see Journal HA
*/
private final long _newTx(final long timestamp) {
IRawTx tx = null;
try {
final IBufferStrategy bufferStrategy = getBufferStrategy();
if (bufferStrategy instanceof IHistoryManager) {
// pre-increment the active tx count.
tx = ((IHistoryManager) bufferStrategy).newTx();
}
return super.newTx(timestamp);
} finally {
if (tx != null) {
/*
* If we had pre-incremented the transaction counter in
* the RWStore, then we decrement it before leaving this
* method.
*/
tx.close();
}
}
}
@Override
public long commit(final long tx) {
final TxState state = getTxState(tx);
final Quorum<HAGlue, QuorumService<HAGlue>> quorum = getQuorum();
if (quorum != null && state != null && !state.isReadOnly()) {
/*
* Commit on write transaction. We must be the quorum leader.
*/
final long token = getQuorumToken();
getQuorum().assertLeader(token);
}
return super.commit(tx);
}
@Override
protected void activateTx(final TxState state) {
if (txLog.isInfoEnabled())
txLog.info("OPEN : txId=" + state.tx
+ ", readsOnCommitTime=" + state.getReadsOnCommitTime());
final IBufferStrategy bufferStrategy = Journal.this.getBufferStrategy();
if (bufferStrategy instanceof IHistoryManager) {
final IRawTx tx = ((IHistoryManager)bufferStrategy).newTx();
if (m_rawTxs.put(state.tx, tx) != null) {
throw new IllegalStateException(
"Unexpected existing RawTx");
}
}
super.activateTx(state);
}
@Override
protected void deactivateTx(final TxState state) {
if (txLog.isInfoEnabled())
txLog.info("CLOSE: txId=" + state.tx
+ ", readsOnCommitTime=" + state.getReadsOnCommitTime());
/*
* Note: We need to deactivate the tx before RawTx.close() is
* invoked otherwise the activeTxCount will never be zero inside
* of RawTx.close() and the session protection mode of the
* RWStore will never be able to release storage.
*/
super.deactivateTx(state);
final IRawTx tx = m_rawTxs.remove(state.tx);
if (tx != null) {
tx.close();
}
}
/**
* Extended to cancel any running or queued tasks on the {@link WriteExecutorService}.
*
* @see HA doLocalAbort() should interrupt NSS requests and AbstractTasks
*/
@Override
public void abortAllTx() {
super.abortAllTx();
concurrencyManager.abortAllTx();
}
} // class InnerJournalTransactionService
protected JournalTransactionService newTransactionService() {
final JournalTransactionService abstractTransactionService = new InnerJournalTransactionService();
return abstractTransactionService;
}
protected AbstractLocalTransactionManager newLocalTransactionManager() {
final JournalTransactionService abstractTransactionService = newTransactionService();
abstractTransactionService.start();
return new AbstractLocalTransactionManager() {
@Override
public AbstractTransactionService getTransactionService() {
return abstractTransactionService;
}
/**
* Extended to shutdown the embedded transaction service.
*/
@Override
public void shutdown() {
((JournalTransactionService) getTransactionService())
.shutdown();
super.shutdown();
}
/**
* Extended to shutdown the embedded transaction service.
*/
@Override
public void shutdownNow() {
((JournalTransactionService) getTransactionService())
.shutdownNow();
super.shutdownNow();
}
};
}
@Override
public AbstractLocalTransactionManager getLocalTransactionManager() {
return localTransactionManager;
}
@Override
public boolean isGroupCommit() {
return isGroupCommit;
}
/**
* Interface defines and documents the counters and counter namespaces
* reported by the {@link Journal} and the various services which it uses.
*
* @author Bryan Thompson
*/
public static interface IJournalCounters extends
ConcurrencyManager.IConcurrencyManagerCounters,
// ...TransactionManager.XXXCounters,
ResourceManager.IResourceManagerCounters
{
/**
* The namespace for the counters pertaining to the {@link ConcurrencyManager}.
*/
String concurrencyManager = "Concurrency Manager";
/**
* The namespace for the counters pertaining to the named indices.
*/
String indexManager = "Index Manager";
/**
* The namespace for the counters pertaining to the {@link ILocalTransactionService}.
*/
String transactionManager = "Transaction Manager";
/**
* The namespace for counters pertaining to the
* {@link Journal#getExecutorService()}.
*/
String executorService = "Executor Service";
/**
* Performance counters for the query engine associated with this
* journal (if any).
*/
String queryEngine = "Query Engine";
}
/**
* {@inheritDoc}
*
* Overridden to attach additional performance counters.
*/
@Override
public CounterSet getCounters() {
final CounterSet root = new CounterSet();
// Host wide performance counters (collected from the OS).
{
final AbstractStatisticsCollector t = getPlatformStatisticsCollector();
if (t != null) {
root.attach(t.getCounters());
}
}
// JVM wide performance counters.
{
final CounterSet tmp = root.makePath("JVM");
tmp.attach(AbstractStatisticsCollector.getMemoryCounterSet());
}
// Journal performance counters.
{
final CounterSet tmp = root.makePath("Journal");
tmp.attach(super.getCounters());
// Live index counters iff available.
{
final CounterSet indexCounters = getIndexCounters();
if (indexCounters != null) {
tmp.makePath(IJournalCounters.indexManager).attach(
indexCounters);
}
}
tmp.makePath(IJournalCounters.concurrencyManager)
.attach(concurrencyManager.getCounters());
tmp.makePath(IJournalCounters.transactionManager)
.attach(localTransactionManager.getCounters());
{
final IPlugIn<Journal, ThreadPoolExecutorBaseStatisticsTask> plugin = pluginQueueStats
.get();
if (plugin != null) {
final ThreadPoolExecutorBaseStatisticsTask t = plugin
.getService();
if (t != null) {
tmp.makePath(IJournalCounters.executorService).attach(
t.getCounters());
}
}
}
}
// Lookup an existing query engine, but do not cause one to be created.
final QueryEngine queryEngine = QueryEngineFactory.getInstance()
.getExistingQueryController(this);
if (queryEngine != null) {
final CounterSet tmp = root.makePath(IJournalCounters.queryEngine);
tmp.attach(queryEngine.getCounters());
}
return root;
}
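/*
 * Usage sketch (illustrative only, not part of the original class): an
 * application holding an open Journal reference [jnl] (an assumed
 * variable name) could render the attached performance counters for
 * ad-hoc inspection; toString() gives a textual rendering of the
 * counter hierarchy rooted at the returned CounterSet.
 *
 *   final CounterSet counters = jnl.getCounters();
 *   System.out.println(counters.toString());
 */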
/*
* IResourceManager
*/
@Override
public File getTmpDir() {
return tmpDir;
}
/**
* The directory in which the journal's file is located -or- null if
* the journal is not backed by a file.
*/
@Override
public File getDataDir() {
final File file = getFile();
if (file == null) {
return null;
}
return file.getParentFile();
}
/**
* Note: This will only succeed if the uuid identifies this
* journal.
*/
public IRawStore openStore(final UUID uuid) {
if(uuid == getRootBlockView().getUUID()) {
return this;
}
throw new UnsupportedOperationException();
}
/**
* Always returns an array containing a single {@link BTree} which is the
* {@link BTree} loaded from the commit record whose commit timestamp is
* less than or equal to timestamp -or- null if there
* are no {@link ICommitRecord}s that satisfy the probe or if the named
* index was not registered as of that timestamp.
*
* @param name
* @param timestamp
*
* @throws UnsupportedOperationException
* If the timestamp is {@link ITx#READ_COMMITTED}. You
* MUST use {@link #getIndex(String, long)} in order to obtain a
* view that has {@link ITx#READ_COMMITTED} semantics.
*/
public AbstractBTree[] getIndexSources(final String name,
final long timestamp) {
final BTree btree;
if (timestamp == ITx.UNISOLATED) {
/*
* Unisolated operation on the live index.
*/
// MAY be null.
btree = getIndex(name);
} else if (timestamp == ITx.READ_COMMITTED) {
/*
* BTree does not know how to update its view with intervening
* commits. Further, for a variety of reasons including the
* synchronization problems that would be imposed, there are no
* plans for BTree to be able to provide read-committed semantics.
* Instead a ReadCommittedView is returned by
* getIndex(name,timestamp) when ITx#READ_COMMITTED is requested and
* this method is not invoked.
*/
throw new UnsupportedOperationException("Read-committed view");
// /*
// * Read committed operation against the most recent commit point.
// *
// * Note: This commit record is always defined, but that does not
// * mean that any indices have been registered.
// */
//
// final ICommitRecord commitRecord = getCommitRecord();
//
// final long ts = commitRecord.getTimestamp();
//
// if (ts == 0L) {
//
// log.warn("Nothing committed: name="+name+" - read-committed operation.");
//
// return null;
//
// }
//
// // MAY be null.
// btree = getIndex(name, commitRecord);
//
// if (btree != null) {
//
//// /*
//// * Mark the B+Tree as read-only.
//// */
////
//// btree.setReadOnly(true);
//
// assert ((BTree) btree).getLastCommitTime() != 0;
//// btree.setLastCommitTime(commitRecord.getTimestamp());
//
// }
} else {
/*
* A specified historical index commit point.
*
* @see Add cache for access to historical index views on the Journal
* by name and commitTime.
*/
final long ts = Math.abs(timestamp);
// final ICommitRecord commitRecord = getCommitRecord(ts);
//
// if (commitRecord == null) {
//
// log.warn("No commit record: name=" + name + ", timestamp=" + ts);
//
// return null;
//
// }
//
// // MAY be null
// btree = getIndex(name, commitRecord);
// MAY be null
btree = (BTree) super.getIndex(name, ts);
if (btree != null) {
// /*
// * Mark the B+Tree as read-only.
// */
//
// btree.setReadOnly(true);
assert btree.getLastCommitTime() != 0;
// btree.setLastCommitTime(commitRecord.getTimestamp());
}
}
/*
* No such index as of that timestamp.
*/
if (btree == null) {
if (log.isInfoEnabled())
log.info("No such index: name=" + name + ", timestamp="
+ timestamp);
return null;
}
return new AbstractBTree[] {
btree
};
}
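/*
 * Usage sketch (illustrative only): probing for a historical view of a
 * named index. The journal reference [jnl], the index name and the
 * commit time [commitTime] are assumptions for the example.
 *
 *   final AbstractBTree[] sources = jnl.getIndexSources("myIndex", commitTime);
 *   if (sources == null) {
 *       // Either no commit record satisfies the probe or the index was
 *       // not registered as of that commit time.
 *   } else {
 *       final BTree historicalView = (BTree) sources[0]; // read-only view
 *   }
 */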
/**
* Always returns this.
*/
final public AbstractJournal getLiveJournal() {
return this;
}
/**
* Always returns this.
*/
final public AbstractJournal getJournal(final long timestamp) {
return this;
}
/**
* Compacts the named indices found on this journal as of the most recent
* commit point, writing their view onto a new Journal. This method MAY be
* used concurrently with the {@link Journal} but writes after the selected
* commit point WILL NOT be reflected in the output file. Typical uses are
* to reduce the space required by the backing store, to improve locality in
* the backing store, and to make a backup of the most recent commit point.
*
* @param outFile
* The file on which the new journal will be created.
*
* @return The {@link Future} on which you must {@link Future#get() wait}
* for the {@link CompactTask} to complete. The already open journal
* is accessible using {@link Future#get()}. If you are backing up
* data, then be sure to shutdown the returned {@link Journal} so
* that it can release its resources.
*/
public Future<Journal> compact(final File outFile) {
return executorService.submit(new CompactTask(this, outFile,
getLastCommitTime()));
}
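/*
 * Usage sketch (illustrative only): compacting the most recent commit
 * point onto a new file and waiting for the task to complete. The
 * journal reference [jnl] and the output file name are assumptions for
 * the example; exception handling for Future#get() is elided.
 *
 *   final Future<Journal> f = jnl.compact(new File("compact.jnl"));
 *   final Journal compacted = f.get(); // blocks until the CompactTask is done
 *   compacted.shutdown(); // release resources if this was just a backup
 */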
/**
* Submit a task that will take a snapshot of the journal and return the
* {@link Future} for that task. The snapshot is taken on a temporary file.
* Iff the snapshot is successful, the temporary file is renamed to the
* application-determined file. Thus all snapshots are either valid or
* were not written. A snapshot of an empty journal is not permitted. Also,
* the backing store MUST implement the {@link IHABufferStrategy}.
*
* Note: This method supports application controlled snapshots and is
* primarily intended for non-HA deployments. HA has an integrated snapshot
* and transaction log mechanism which is preferred in HA deployments and
* also provides the ability for an application to take snapshots on demand.
*
* @param snapshotFactory
* The factory that will provide the name of the file on which the
* snapshot will be written.
*
* @return The {@link Future} for the snapshot.
*
* @throws UnsupportedOperationException
* if the backing store does not implement the
* {@link IHABufferStrategy} interface.
*
* @see Online backup for Journal
* @since 1.5.2
*/
public Future snapshot(
final ISnapshotFactory snapshotFactory) {
if (!(getBufferStrategy() instanceof IHABufferStrategy)) {
throw new UnsupportedOperationException();
}
return executorService.submit(new SnapshotTask(this, snapshotFactory));
}
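/*
 * Usage sketch (illustrative only): requesting an application-controlled
 * snapshot. The journal reference [jnl] and the ISnapshotFactory
 * instance [mySnapshotFactory] are assumptions for the example; the
 * factory implementation (which names the target file) is not shown.
 *
 *   final Future<?> f = jnl.snapshot(mySnapshotFactory);
 *   f.get(); // waits until the temporary file has been renamed into place
 */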
@Override
public void dropIndex(final String name) {
final BTreeCounters btreeCounters = getIndexCounters(name);
super.dropIndex(name);
if (btreeCounters != null) {
// Conditionally remove the counters for the old index.
indexCounters.remove(name, btreeCounters);
}
}
/**
* {@inheritDoc}
*
* Note: {@link ITx#READ_COMMITTED} views are given read-committed semantics
* using a {@link ReadCommittedView}. This means that they can be cached
* since the view will update automatically as commits are made against
* the {@link Journal}.
*
* @see IndexManager#getIndex(String, long)
*/
@Override
public ILocalBTreeView getIndex(final String name, final long timestamp) {
if (name == null) {
throw new IllegalArgumentException();
}
final boolean isReadWriteTx = TimestampUtility.isReadWriteTx(timestamp);
// final Tx tx = (Tx) (isReadWriteTx ? getConcurrencyManager()
// .getTransactionManager().getTx(timestamp) : null);
final Tx tx = (Tx) /*getConcurrencyManager().*/getTransactionManager()
.getTx(timestamp);
if (isReadWriteTx) {
if (tx == null) {
log.warn("Unknown transaction: name=" + name + ", tx="
+ timestamp);
return null;
}
tx.lock.lock();
try {
if (!tx.isActive()) {
// typically this means that the transaction has already
// prepared.
log.warn("Transaction not active: name=" + name + ", tx="
+ timestamp + ", prepared=" + tx.isPrepared()
+ ", complete=" + tx.isComplete() + ", aborted="
+ tx.isAborted());
return null;
}
} finally {
tx.lock.unlock();
}
}
if( isReadWriteTx && tx == null ) {
/*
* Note: This will happen both if you attempt to use a transaction
* identified that has not been registered or if you attempt to use
* a transaction manager after the transaction has been either
* committed or aborted.
*/
log.warn("No such transaction: name=" + name + ", tx=" + timestamp);
return null;
}
final boolean readOnly = TimestampUtility.isReadOnly(timestamp);
// final boolean readOnly = (timestamp < ITx.UNISOLATED)
// || (isReadWriteTx && tx.isReadOnly());
final ILocalBTreeView tmp;
if (isReadWriteTx) {
/*
* Isolated operation.
*
* Note: The backing index is always a historical state of the named
* index.
*
* Note: Tx.getIndex() will pass through the actual commit time of
* the ground state against which the transaction is reading (if it
* is available, which it is on the local Journal).
*
* @see Refactor native long tx id to thin object
*/
final ILocalBTreeView isolatedIndex = tx.getIndex(name);
if (isolatedIndex == null) {
log.warn("No such index: name=" + name + ", tx=" + timestamp);
return null;
}
tmp = isolatedIndex;
} else {
/*
* Non-transactional view.
*/
if (readOnly) {
if (timestamp == ITx.READ_COMMITTED) {
// read-committed
tmp = new ReadCommittedView(this, name);
} else {
if (tx != null) {
/*
* read-only transaction
*
* @see Add cache for access to historical index views on
* the Journal by name and commitTime.
*/
final AbstractBTree[] sources = getIndexSources(name,
tx.getReadsOnCommitTime());
if (sources == null) {
log.warn("No such index: name=" + name
+ ", timestamp=" + timestamp);
return null;
}
assert sources[0].isReadOnly();
tmp = (BTree) sources[0];
} else {
// historical read not protected by a transaction
final AbstractBTree[] sources = getIndexSources(name,
timestamp);
if (sources == null) {
log.warn("No such index: name=" + name
+ ", timestamp=" + timestamp);
return null;
}
assert sources[0].isReadOnly();
tmp = (BTree) sources[0];
}
}
} else {
/*
* Writable unisolated index.
*
* Note: This is the "live" mutable index. This index is NOT
* thread-safe. A lock manager is used to ensure that at most
* one task has access to this index at a time.
*/
assert timestamp == ITx.UNISOLATED;
final AbstractBTree[] sources = getIndexSources(name, ITx.UNISOLATED);
if (sources == null) {
if (log.isInfoEnabled())
log.info("No such index: name="+name+", timestamp="+timestamp);
return null;
}
assert ! sources[0].isReadOnly();
tmp = (BTree) sources[0];
}
}
/*
* Make sure that it is using the canonical counters for that index.
*
* Note: AbstractTask also does this for UNISOLATED indices which it
* loads by itself as part of providing ACID semantics for add/drop
* of indices.
*/
tmp.getMutableBTree().setBTreeCounters(getIndexCounters(name));
return tmp;
}
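/*
 * Usage sketch (illustrative only): obtaining different views of a named
 * index. The journal reference [jnl] and the index name are assumptions
 * for the example.
 *
 *   // Read-committed view: may be cached, tracks new commits automatically.
 *   final ILocalBTreeView readCommitted = jnl.getIndex("myIndex", ITx.READ_COMMITTED);
 *
 *   // Live unisolated view: mutable and NOT thread-safe; normally accessed
 *   // through tasks submitted to the concurrency manager.
 *   final ILocalBTreeView unisolated = jnl.getIndex("myIndex", ITx.UNISOLATED);
 */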
/**
* Always returns the {@link BTree} as the sole element of the array since
* partitioned indices are not supported.
*/
@Override
public AbstractBTree[] getIndexSources(final String name,
final long timestamp, final BTree btree) {
return new AbstractBTree[] { btree };
}
/**
* Create a new transaction on the {@link Journal}.
*
* Note: This is a convenience method. The implementation of this method is
* delegated to the object returned by {@link #getTransactionService()}.
*
* @param timestamp
* A positive timestamp for a historical read-only transaction as
* of the first commit point LTE the given timestamp,
* {@link ITx#READ_COMMITTED} for a historical read-only
* transaction as of the most current commit point on the
* {@link Journal} as of the moment that the transaction is
* created, or {@link ITx#UNISOLATED} for a read-write
* transaction.
*
* @return The transaction identifier.
*
* @see ITransactionService#newTx(long)
*/
final public long newTx(final long timestamp) {
/*
* Note: The RWStore native tx pre-increment logic is now handled by
* _newTx() in the inner class that extends JournalTransactionService.
*/
try {
return getTransactionService().newTx(timestamp);
} catch (IOException ioe) {
/*
* Note: IOException is declared for RMI but will not be thrown
* since the transaction service is in fact local.
*/
throw new RuntimeException(ioe);
}
}
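/*
 * Usage sketch (illustrative only): a minimal transaction lifecycle using
 * the convenience methods on this class. The journal reference [jnl] and
 * the index name are assumptions for the example.
 *
 *   // Read-only tx against the most recent commit point.
 *   final long readTx = jnl.newTx(ITx.READ_COMMITTED);
 *   try {
 *       final ILocalBTreeView view = jnl.getIndex("myIndex", readTx);
 *       // ... read against [view] ...
 *   } finally {
 *       jnl.abort(readTx); // releases the read lock held by the tx
 *   }
 *
 *   // Read-write tx: isolated writes are validated and merged at commit.
 *   final long writeTx = jnl.newTx(ITx.UNISOLATED);
 *   // ... isolated writes ...
 *   jnl.commit(writeTx);
 */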
/**
* Abort a transaction.
*
* Note: This is a convenience method. The implementation of this method is
* delegated to the object returned by {@link #getTransactionService()}.
*
* @param tx
* The transaction identifier.
*
* @see ITransactionService#abort(long)
*/
final public void abort(final long tx) {
try {
/*
* Note: TransactionService will make call back to the
* localTransactionManager to handle the client side of the
* protocol.
*/
getTransactionService().abort(tx);
} catch (IOException e) {
/*
* Note: IOException is declared for RMI but will not be thrown
* since the transaction service is in fact local.
*/
throw new RuntimeException(e);
}
}
/**
* Commit a transaction.
*
* Note: This is a convenience method. The implementation of this method is
* delegated to the object returned by {@link #getTransactionService()}.
*
* @param tx
* The transaction identifier.
*
* @return The commit time assigned to that transaction.
*
* @see ITransactionService#commit(long)
*/
final public long commit(final long tx) throws ValidationError {
try {
/*
* Note: TransactionService will make call back to the
* localTransactionManager to handle the client side of the
* protocol.
*/
return getTransactionService().commit(tx);
} catch (IOException e) {
/*
* Note: IOException is declared for RMI but will not be thrown
* since the transaction service is in fact local.
*/
throw new RuntimeException(e);
}
}
/**
* Validate the write set for a transaction. This operation is not required.
* Validation will be performed during commit processing for a transaction
* regardless.
*
* @param txId
* The transaction identifier.
*
* @return true iff the write set of the transaction could be
* validated.
*
* @throws TransactionNotFoundException
* if no such transaction exists.
*/
final public boolean prepare(final long txId) {
final Tx localState = getLocalTransactionManager().getTx(txId);
if (localState == null)
throw new TransactionNotFoundException(txId);
if (localState.isReadOnly()) {
// Trivially validated.
return true;
}
try {
final AbstractTask task = new ValidateWriteSetTask(
concurrencyManager, getLocalTransactionManager(), localState);
/*
* Submit the task and wait for the result.
*
* Note: This task MUST go through the ConcurrencyManager to obtain its
* locks.
*/
final boolean ok = concurrencyManager.submit(task).get();
return ok;
} catch (Exception ex) {
throw new RuntimeException(ex);
}
}
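/*
 * Usage sketch (illustrative only): optional early validation of a
 * read-write tx. Validation is performed again during commit processing
 * regardless, so this call is purely advisory. [jnl] is an assumed
 * journal reference.
 *
 *   final long tx = jnl.newTx(ITx.UNISOLATED);
 *   // ... isolated writes ...
 *   if (jnl.prepare(tx)) {
 *       jnl.commit(tx);
 *   } else {
 *       jnl.abort(tx);
 *   }
 */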
// /**
// * @deprecated This method in particular should be hidden from the
// * {@link Journal} as it exposes the {@link ITx} which really
// * deals with the client-side state of a transaction and which
// * should not be visible to applications - they should just use
// * the [long] transaction identifier.
// */
// public ITx getTx(long startTime) {
//
// return localTransactionManager.getTx(startTime);
//
// }
/**
* Returns the next timestamp from the {@link ILocalTransactionManager}.
*
* Note: This is a convenience method. The implementation of this method is
* delegated to the object returned by {@link #getTransactionService()}.
*
* @deprecated This is here for historical reasons and is only used by the
* test suite. Use {@link #getLocalTransactionManager()} and
* {@link ITransactionService#nextTimestamp()}.
*
* @see ITransactionService#nextTimestamp()
*/
final public long nextTimestamp() {
return localTransactionManager.nextTimestamp();
}
/*
* IConcurrencyManager
*/
public ConcurrencyManager getConcurrencyManager() {
return concurrencyManager;
}
/**
* Note: The transaction service is shutdown first, then the
* {@link #executorService}, then the {@link IConcurrencyManager}, the
* {@link ITransactionService} and finally the {@link IResourceLockService}.
*/
@Override
synchronized public void shutdown() {
if (!isOpen())
return;
/*
* Shutdown the transaction service. This will not permit new
* transactions to start and will wait until running transactions either
* commit or abort.
*/
localTransactionManager.shutdown();
{
final IPlugIn<?, ?> plugIn = pluginGanglia.get();
if (plugIn != null) {
// stop if running.
plugIn.stopService(false/* immediateShutdown */);
}
}
{
final IPlugIn<?, ?> plugIn = pluginQueueStats.get();
if (plugIn != null) {
// stop if running.
plugIn.stopService(false/* immediateShutdown */);
}
}
{
final IPlugIn<?, ?> plugIn = pluginPlatformStats.get();
if (plugIn != null) {
// stop if running.
plugIn.stopService(false/* immediateShutdown */);
}
}
if (scheduledExecutorService != null) {
scheduledExecutorService.shutdown();
}
// optional httpd service for the local counters.
{
final IPlugIn<?, ?> plugIn = pluginHttpd.get();
if (plugIn != null) {
// stop if running.
plugIn.stopService(false/* immediateShutdown */);
}
}
/*
* Shutdown the executor service. This will wait for any tasks being run
* on that service by the application to complete.
*/
try {
new ShutdownHelper(executorService, 1000/* logTimeout */,
TimeUnit.MILLISECONDS) {
@Override
protected void logTimeout() {
log.warn("Waiting on task(s)"
+ ": elapsed="
+ TimeUnit.NANOSECONDS.toMillis(elapsed())
+ "ms, #active="
+ ((ThreadPoolExecutor) executorService)
.getActiveCount());
}
};
} catch (InterruptedException ex) {
log.warn("Immediate shutdown: "+ex);
// convert to immediate shutdown.
shutdownNow();
return;
}
/*
* Shutdown the concurrency manager - this will allow existing
* non-transactional operations to complete but prevent additional
* operations from starting.
*/
concurrencyManager.shutdown();
super.shutdown();
}
/**
* Note: The {@link IConcurrencyManager} is shutdown first, then the
* {@link ITransactionService} and finally the {@link IResourceManager}.
*/
@Override
synchronized public void shutdownNow() {
if (!isOpen())
return;
/*
* Note: The ganglia plug in is executed on the main thread pool. We
* need to terminate it in order for the thread pool to shutdown.
*/
{
final IPlugIn<?, ?> plugIn = pluginGanglia.get();
if (plugIn != null) {
// stop if running.
plugIn.stopService(true/* immediateShutdown */);
}
}
{
final IPlugIn<?, ?> plugIn = pluginQueueStats.get();
if (plugIn != null) {
// stop if running.
plugIn.stopService(true/* immediateShutdown */);
}
}
{
final IPlugIn<?, ?> plugIn = pluginPlatformStats.get();
if (plugIn != null) {
// stop if running.
plugIn.stopService(true/* immediateShutdown */);
}
}
if (scheduledExecutorService != null)
scheduledExecutorService.shutdownNow();
// optional httpd service for the local counters.
{
final IPlugIn<?, ?> plugIn = pluginHttpd.get();
if (plugIn != null) {
// stop if running.
plugIn.stopService(false/* immediateShutdown */);
}
}
// Note: can be null if error in ctor.
if (executorService != null)
executorService.shutdownNow();
// Note: can be null if error in ctor.
if (concurrencyManager != null)
concurrencyManager.shutdownNow();
// Note: can be null if error in ctor.
if (localTransactionManager != null)
localTransactionManager.shutdownNow();
super.shutdownNow();
}
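/*
 * Usage sketch (illustrative only): orderly vs. immediate termination of
 * an open Journal [jnl] (assumed variable name).
 *
 *   jnl.shutdown();    // waits for running transactions and tasks to complete
 *   // -or-
 *   jnl.shutdownNow(); // cancels running/queued work and terminates services
 */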
// public void deleteResources() {
//
// super.deleteResources();
//
// // Note: can be null if error in ctor.
// if (tempStoreFactory != null)
// tempStoreFactory.closeAll();
//
// }
/**
* {@inheritDoc}
*
* Overridden to close the {@link TemporaryStoreFactory}.
*/
@Override
protected void _close() {
super._close();
// Note: can be null if error in ctor.
if (tempStoreFactory != null)
tempStoreFactory.closeAll();
}
@Override
public <T> FutureTask<T> submit(AbstractTask<T> task) {
return concurrencyManager.submit(task);
}
@Override
public <T> List<Future<T>> invokeAll(
final Collection<? extends AbstractTask<T>> tasks,
final long timeout, final TimeUnit unit)
throws InterruptedException {
return concurrencyManager.invokeAll(tasks, timeout, unit);
}
@Override
public <T> List<Future<T>> invokeAll(
Collection<? extends AbstractTask<T>> tasks)
throws InterruptedException {
return concurrencyManager.invokeAll(tasks);
}
@Override
public IResourceManager getResourceManager() {
return concurrencyManager.getResourceManager();
}
@Override
public ILocalTransactionManager getTransactionManager() {
// return concurrencyManager.getTransactionManager();
return localTransactionManager;
}
public ITransactionService getTransactionService() {
// return getTransactionManager().getTransactionService();
return localTransactionManager.getTransactionService();
}
@Override
public WriteExecutorService getWriteService() {
return concurrencyManager.getWriteService();
}
/*
* IResourceManager
*/
/**
* Note: This implementation always returns false. As a
* consequence the journal capacity will simply be extended by
* {@link #write(ByteBuffer)} until the available disk space is exhausted.
*
* @return This implementation returns false since overflow
* is NOT supported.
*/
@Override
public boolean shouldOverflow() {
return false;
}
/**
* Note: This implementation always returns false.
*/
@Override
public boolean isOverflowEnabled() {
return false;
}
@Override
public Future