/* com.bigdata.service.DistributedTransactionService (artifact-browser header: Maven / Gradle / Ivy) */
/*
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
[email protected]
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on Dec 18, 2008
*/
package com.bigdata.service;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Properties;
import java.util.UUID;
import java.util.concurrent.BrokenBarrierException;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.CyclicBarrier;
import java.util.concurrent.Future;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.Condition;
import java.util.zip.Adler32;
import com.bigdata.btree.BTree;
import com.bigdata.btree.IRangeQuery;
import com.bigdata.btree.ITuple;
import com.bigdata.btree.ITupleIterator;
import com.bigdata.concurrent.LockManager;
import com.bigdata.concurrent.LockManagerTask;
import com.bigdata.config.LongValidator;
import com.bigdata.counters.CounterSet;
import com.bigdata.counters.Instrument;
import com.bigdata.journal.IDistributedTransactionService;
import com.bigdata.journal.ITransactionService;
import com.bigdata.journal.ITx;
import com.bigdata.journal.Name2Addr;
import com.bigdata.journal.RunState;
import com.bigdata.util.concurrent.ExecutionExceptions;
/**
* Implementation for an {@link IBigdataFederation} supporting both single-phase
* commits (for transactions that execute on a single {@link IDataService}) and
* distributed commits.
*
* @author Bryan Thompson
* @version $Id$
*/
public abstract class DistributedTransactionService extends
AbstractTransactionService implements IDistributedTransactionService {
/**
* Options understood by this service.
*
* @author Bryan Thompson
*/
public interface Options extends AbstractTransactionService.Options {

    /**
     * The directory in which the persistent state of this service will be
     * stored (required, even when snapshots are disabled).
     */
    String DATA_DIR = DistributedTransactionService.class.getName()
            + ".dataDir";

    /**
     * The interval in milliseconds between writing a snapshot of the index
     * of accessible commit points into the {@link #DATA_DIR} ({@value #DEFAULT_SHAPSHOT_INTERVAL}).
     *
     * Two snapshots of the commit time index are retained so that the
     * historical commit times required for reading on committed states of
     * the database GT the releaseTime remain on hand after a service
     * restart. The older snapshot is overwritten each time. A snapshot is
     * written every N milliseconds, where N is configured using this
     * property, and also when the service is shutdown.
     *
     * This MAY be ZERO (0L) to disable snapshots - a feature that is used
     * by the {@link EmbeddedFederation} when run in a diskless mode.
     *
     * NOTE(review): the constant name contains a historical typo
     * ("SHAPSHOT"); it is part of the public API and must not be renamed.
     */
    String SHAPSHOT_INTERVAL = DistributedTransactionService.class
            .getName()
            + ".snapshotInterval";

    /** Default snapshot interval: 5 minutes (in milliseconds). */
    String DEFAULT_SHAPSHOT_INTERVAL = ""
            + (5 * 60 * 1000);
}
/**
 * A map of the distributed transactions that are currently committing.
 * The key is the transaction identifier ({@link TxState#tx}); the value
 * is the {@link DistributedTxCommitTask} running the commit protocol for
 * that transaction.
 *
 * @todo config for initial capacity and concurrency?
 */
private final ConcurrentHashMap commitList = new ConcurrentHashMap();

/**
 * The {@link LockManager} used to impose a partial ordering on the prepare
 * phase of distributed transaction commits using index partition names as
 * the named resources for which the tasks must contend.
 */
private final LockManager indexLockManager = new LockManager(
        0/* maxConcurrencyIsIgnored */, true/* predeclareLocks */);

/**
 * The {@link LockManager} used to impose a partial ordering on the commit
 * phase of distributed transaction commits using {@link IDataService}
 * {@link UUID}s as the named resources for which the tasks must contend.
 */
private final LockManager dataServiceLockManager = new LockManager(
        0/* maxConcurrencyIsIgnored */, true/* predeclareLocks */);

/**
 * A {@link BTree} containing a log of the historical commit points.
 *
 * The main things that it gives us are (a) the half-open ranges within
 * which we can allocate read-historical transactions; and (b) the last
 * commit time on record. It seems that creating an image of the log every N
 * seconds should be sufficient.
 *
 * Note: Read and write operations on this index MUST be synchronized on the
 * index object.
 */
protected final CommitTimeIndex commitTimeIndex;

/**
 * True iff the service does not write any state on the disk (snapshot
 * interval configured as ZERO).
 */
private final boolean isTransient;

/**
 * The data directory -or- null
 * iff the service is transient.
 */
protected final File dataDir;

/**
 * The interval in milliseconds between logging an image of the
 * {@link #commitTimeIndex}.
 *
 * @see Options#SHAPSHOT_INTERVAL
 */
private final long snapshotInterval;

/**
 * The last (known) commit time.
 */
private volatile long lastCommitTime = 0L;
/**
* @param properties
*/
public DistributedTransactionService(final Properties properties) {
super(properties);
if (properties.getProperty(Options.DATA_DIR) == null) {
throw new RuntimeException("Required property: " + Options.DATA_DIR);
}
snapshotInterval = LongValidator.GTE_ZERO.parse(
Options.SHAPSHOT_INTERVAL, properties.getProperty(
Options.SHAPSHOT_INTERVAL,
Options.DEFAULT_SHAPSHOT_INTERVAL));
if (log.isInfoEnabled())
log.info(Options.SHAPSHOT_INTERVAL + "=" + snapshotInterval);
isTransient = snapshotInterval == 0;
if (isTransient) {
dataDir = null;
} else {
dataDir = new File(properties.getProperty(Options.DATA_DIR));
if (log.isInfoEnabled())
log.info(Options.DATA_DIR + "=" + dataDir);
}
// Create transient BTree for the commit time log.
commitTimeIndex = CommitTimeIndex.createTransient();
setup();
if (log.isInfoEnabled())
log.info("lastCommitTime=" + lastCommitTime + ", #commitTimes="
+ commitTimeIndex.getEntryCount());
}
/**
 * Either creates the data directory or reads the {@link #commitTimeIndex}
 * from the most recent snapshot file in an existing data directory, and
 * initializes {@link #lastCommitTime} accordingly.
 */
private void setup() {

    if (isTransient) {

        // nothing is persisted, so nothing committed yet.
        lastCommitTime = 0L;

        return;

    }

    if (!dataDir.exists()) {

        /*
         * New service if its data directory does not exist.
         *
         * Note: mkdirs() returns [false] when the directory already
         * exists, so a failed mkdirs() followed by isDirectory() avoids a
         * spurious error if another process/thread created the directory
         * concurrently. (The original retried mkdirs(), which reports
         * [false] in exactly that race and then failed incorrectly.)
         */
        if (!dataDir.mkdirs() && !dataDir.isDirectory()) {

            throw new RuntimeException("Could not create: " + dataDir);

        }

        // nothing committed yet.
        lastCommitTime = 0L;

        return;

    }

    {

        // the files on which the images should have been written.
        final File file0 = new File(dataDir, BASENAME + "0" + EXT);

        final File file1 = new File(dataDir, BASENAME + "1" + EXT);

        if (!file0.exists() && !file1.exists()) {

            log.warn("No commit time logs - assuming new service: dataDir="
                    + dataDir);

            // nothing committed yet.
            lastCommitTime = 0L;

            return;

        }

        // timestamps on those files (zero if the file does not exist).
        final long time0 = file0.lastModified();
        final long time1 = file1.lastModified();

        // true iff file0 is the more recent (or only) snapshot.
        final boolean isFile0 = (time0 != 0L && time1 != 0L) //
                ? time0 > time1 // Note: both files exist.
                : time0 != 0L // Note: only one file exists.
        ;

        final File file = isFile0 ? file0 : file1;

        /*
         * Note: On restart the value of this counter is set to either
         * ONE(1) or TWO(2) depending on which snapshot file is more
         * current.
         *
         * It is ONE(1) if we read file0 since the counter would be ONE(1)
         * after we write file0 for the first time.
         *
         * It is TWO(2) if we read file1 since the counter would be TWO(2)
         * after we write file1 for the first time.
         */
        snapshotCount = isFile0 ? 1 : 2;

        try {

            // read the most recent image.
            final long entryCount = SnapshotHelper.read(commitTimeIndex,
                    file);

            log.warn("Read snapshot: entryCount=" + entryCount + ", file="
                    + file);

        } catch (IOException ex) {

            throw new RuntimeException("Could not read file: " + file, ex);

        }

    }

    if (commitTimeIndex.getEntryCount() == 0) {

        // nothing in the commit time log.
        lastCommitTime = 0;

    } else {

        // the last (most recent) commit time in the log.
        // @todo write unit test to verify on restart.
        lastCommitTime = commitTimeIndex.decodeKey(commitTimeIndex
                .keyAt(commitTimeIndex.getEntryCount() - 1));

    }

}
/**
 * Basename for the files written in the {@link #dataDir} containing images
 * of the {@link #commitTimeIndex}.
 */
static protected final String BASENAME = "commitTime";

/**
 * Extension for the files written in the {@link #dataDir} containing
 * snapshots of the {@link #commitTimeIndex}.
 */
static protected final String EXT = ".snapshot";

/**
 * #of times we have written a snapshot of the {@link #commitTimeIndex}.
 * The low bit selects which of the two snapshot files is written next
 * (see {@link SnapshotTask}); only incremented on a successful write.
 */
private long snapshotCount = 0L;
/**
 * Runs the {@link SnapshotTask} once, synchronously in the caller's
 * thread.
 */
public void snapshot() {

    final SnapshotTask task = new SnapshotTask();

    task.run();

}
/**
 * A task that writes a snapshot of the commit time index onto a pair of
 * alternating files. This is in the spirit of the Challis algorithm, but
 * the approach is less rigorous.
 *
 * @author Bryan Thompson
 */
private class SnapshotTask implements Runnable {

    /**
     * Note: Anything thrown out of this method will cause the task to no
     * longer be scheduled, so all errors are caught, logged and swallowed.
     */
    public void run() {

        if (isTransient) {

            // snapshot not supported for a transient service.
            throw new RuntimeException("Service is transient");

        }

        lock.lock();

        try {

            final long begin = System.currentTimeMillis();

            // alternates between 0 and 1, choosing the file to overwrite.
            final int i = (int) snapshotCount % 2;

            final File file = new File(dataDir, BASENAME + i + EXT);

            if (!dataDir.exists()) {

                /*
                 * Note: mkdirs() returns [false] when the directory
                 * already exists, so check isDirectory() before failing to
                 * guard against a concurrent create. (The original retried
                 * mkdirs(), which fails in exactly that race.)
                 */
                if (!dataDir.mkdirs() && !dataDir.isDirectory()) {

                    throw new RuntimeException("Could not create: "
                            + dataDir);

                }

            }

            final long entryCount;
            synchronized (commitTimeIndex) {

                entryCount = SnapshotHelper.write(commitTimeIndex, file);

            }

            // increment the counter iff the write was successful.
            snapshotCount++;

            final long elapsed = System.currentTimeMillis() - begin;

            log.warn("snapshot: snapshotCount=" + snapshotCount
                    + ", entryCount=" + entryCount + ", file=" + file
                    + ", elapsed=" + elapsed);

        } catch (Throwable t) {

            // log and swallow so the scheduled task keeps running.
            log.error(t.getMessage(), t);

            return;

        } finally {

            lock.unlock();

        }

    }

}
/**
 * A helper class for reading and writing snapshots of the commit time
 * index. The image contains the #of commit times followed by the commit
 * timestamps in key order.
 *
 * Note: The caller must prevent concurrent changes to the index.
 *
 * @todo write counters into the files since the system clock could be
 *       messed with before a restart but the counters will always be
 *       valid. we would then either read both and choose one, or have a
 *       method to report the header with the earlier counter.
 *
 * @todo Checksum the commit time log file? this is easily done either using
 *       a {@link ByteBuffer} or using {@link Adler32}.
 *
 * @author Bryan Thompson
 */
public static class SnapshotHelper {

    /**
     * Reads a snapshot image from a file into the given index.
     *
     * @return The #of commit times read from the file.
     */
    static public long read(CommitTimeIndex ndx, File file)
            throws IOException {

        final FileInputStream is = new FileInputStream(file);

        try {

            final BufferedInputStream bis = new BufferedInputStream(is);

            final DataInputStream dis = new DataInputStream(bis);

            return SnapshotHelper.read(ndx, dis);

        } finally {

            // Note: closing the underlying stream closes the wrappers too.
            is.close();

        }

    }

    /**
     * Reads a snapshot image from a stream into the given index.
     *
     * @return The #of commit times read.
     */
    static public long read(CommitTimeIndex ndx, DataInputStream is)
            throws IOException {

        final long n = is.readLong();

        // Note: [long] counter - the recorded count is a [long] and an
        // [int] counter (as originally written) would not cover it.
        for (long i = 0; i < n; i++) {

            ndx.add(is.readLong());

        }

        return n;

    }

    /**
     * Writes a snapshot image of the given index onto a file.
     *
     * @return The #of commit times written.
     */
    static public long write(final CommitTimeIndex ndx, final File file)
            throws IOException {

        final FileOutputStream os = new FileOutputStream(file);

        try {

            final BufferedOutputStream bos = new BufferedOutputStream(os);

            final DataOutputStream dos = new DataOutputStream(bos);

            // write the image on the file.
            final long entryCount = SnapshotHelper.write(ndx, dos);

            dos.flush();

            bos.flush();

            return entryCount;

        } finally {

            os.close();

        }

    }

    /**
     * Writes a snapshot image of the given index onto a stream.
     *
     * @return The #of commit times written.
     *
     * @throws AssertionError
     *             if the #of tuples visited differs from the reported
     *             entry count (the probable cause is the caller failing
     *             to prevent concurrent modification).
     */
    static public long write(final CommitTimeIndex ndx,
            final DataOutputStream os) throws IOException {

        final long entryCount = ndx.getEntryCount();

        os.writeLong(entryCount);

        final ITupleIterator itr = ndx.rangeIterator();

        // Note: [long] counter so the comparison with [entryCount] below
        // can not be confounded by [int] overflow.
        long n = 0;

        while (itr.hasNext()) {

            final ITuple tuple = itr.next();

            final long commitTime = ndx.decodeKey(tuple.getKey());

            os.writeLong(commitTime);

            n++;

        }

        if (n != entryCount) {

            /*
             * Note: probable error is the caller not preventing concurrent
             * modification.
             */
            throw new AssertionError();

        }

        return entryCount;

    }

}
/**
 * Starts the base service and schedules the periodic tasks.
 *
 * @return this service (fluent style).
 */
public DistributedTransactionService start() {

    /*
     * Note: the lock makes this operation _mostly_ atomic even though the
     * base class changes the runState. For example, new transactions can
     * not start without this lock.
     */
    lock.lock();

    try {

        super.start();

        addScheduledTasks();

    } finally {

        lock.unlock();

    }

    return this;

}
/**
 * Schedules the release time notification task and, unless snapshots are
 * disabled, the periodic snapshot task. The caller MUST hold {@link #lock}.
 */
protected void addScheduledTasks() {

    if (!lock.isHeldByCurrentThread())
        throw new IllegalMonitorStateException();

    final AbstractFederation fed = (AbstractFederation) getFederation();

    // @todo config options (verify units).
    notifyFuture = fed.addScheduledTask(new NotifyReleaseTimeTask(),
            60/* initialDelay */, 60/* delay */, TimeUnit.SECONDS);

    if (snapshotInterval == 0L) {

        // snapshots are disabled (transient service).
        return;

    }

    // start the periodic snapshot task.
    writeFuture = fed.addScheduledTask(new SnapshotTask(),
            snapshotInterval/* initialDelay */,
            snapshotInterval/* delay */,
            TimeUnit.MILLISECONDS);

}
// Future for the scheduled NotifyReleaseTimeTask -or- null if not yet started.
private ScheduledFuture notifyFuture = null;
// Future for the scheduled SnapshotTask -or- null if not started or disabled.
private ScheduledFuture writeFuture = null;
/**
 * Polite shutdown: waits for running transactions, cancels the scheduled
 * tasks without interrupting them, and writes a final snapshot.
 */
public void shutdown() {

    lock.lock();

    try {

        final RunState runState = getRunState();

        if (runState == RunState.Shutdown
                || runState == RunState.ShutdownNow
                || runState == RunState.Halted) {

            // already shutting down or halted - nothing to do.
            return;

        }

        /*
         * First make sure that all tx are terminated - this is important
         * otherwise we will write the commit time index image before we
         * have the last commit times on hand.
         */
        super.shutdown();

        /*
         * No need to interrupt this task. It will complete soon enough.
         * However, we do want to cancel it so it will stop running.
         */
        if (notifyFuture != null)
            notifyFuture.cancel(false/* mayInterruptIfRunning */);

        /*
         * Cancel this task, but DO NOT interrupt it to avoid a partial
         * write if there is a write in progress. If there is a write in
         * progress, then we will wind up writing it again immediately
         * since we do that below. This is Ok. We will just have a current
         * image and a nearly current image.
         */
        if (writeFuture != null)
            writeFuture.cancel(false/* mayInterruptIfRunning */);

        if (snapshotInterval != 0L) {

            // write a final image during shutdown.
            new SnapshotTask().run();

        }

    } finally {

        lock.unlock();

    }

}
/**
 * Fast shutdown: terminates transactions, cancels the scheduled tasks with
 * interrupt, and writes a final snapshot.
 */
public void shutdownNow() {

    lock.lock();

    try {

        final RunState runState = getRunState();

        if (runState == RunState.ShutdownNow
                || runState == RunState.Halted) {

            // already halted or an immediate shutdown is in progress.
            return;

        }

        /*
         * First make sure that all tx are terminated - this is important
         * otherwise we will write the commit time index image before we
         * have the last commit times on hand.
         */
        super.shutdownNow();

        /*
         * Cancel and interrupt if running.
         */
        if (notifyFuture != null)
            notifyFuture.cancel(true/* mayInterruptIfRunning */);

        /*
         * Cancel this task and interrupt if running. Interrupting this
         * will leave a partial snapshot on the disk, but we do not advance
         * the counter unless the snapshot is successful so we will
         * overwrite that partial snapshot below when we write a final
         * snapshot.
         */
        if (writeFuture != null)
            writeFuture.cancel(true/* mayInterruptIfRunning */);

        if (snapshotInterval != 0L) {

            // write a final snapshot during shutdown.
            snapshot();

        }

    } finally {

        lock.unlock();

    }

}
/**
 * Destroys the service, deleting both snapshot files and (iff it is then
 * empty) the data directory.
 */
public void destroy() {

    lock.lock();

    try {

        super.destroy();

        if (!isTransient) {

            // delete both commit time index log files.
            final File file0 = new File(dataDir, BASENAME + "0" + EXT);
            final File file1 = new File(dataDir, BASENAME + "1" + EXT);

            file0.delete();
            file1.delete();

            // delete the data directory (works iff it is empty).
            dataDir.delete();

        }

    } finally {

        lock.unlock();

    }

}
/**
 * Extended to truncate the head of the {@link #commitTimeIndex} such that
 * only the commit times required for reading on timestamps GTE the new
 * releaseTime are retained.
 */
protected void setReleaseTime(long releaseTime) {

    // let the base class update the release time first.
    super.setReleaseTime(releaseTime);

    /*
     * Truncate the head of the commit time index since we will no longer
     * grant transactions whose start time is LTE the new releaseTime.
     */

    // Note: Use the current value (may differ from the requested one).
    releaseTime = getReleaseTime();

    if (releaseTime > 0) {

        // Note: operations on the index MUST synchronize on the index.
        synchronized (commitTimeIndex) {

            /*
             * The exclusive upper bound is the timestamp of the earliest
             * commit point on which we can read with this [releaseTime].
             */
            final long toKey = commitTimeIndex.find(releaseTime + 1);

            final ITupleIterator itr = commitTimeIndex.rangeIterator(0L,
                    toKey, 0/* capacity */, IRangeQuery.KEYS
                            | IRangeQuery.CURSOR, null/* filter */);

            while (itr.hasNext()) {

                itr.next();

                // remove the tuple from the index.
                itr.remove();

            }

        }

    }

}
/**
 * Return the proxies for the services participating in a distributed
 * transaction commit or abort.
 *
 * Note: This method is here so that it may be readily overridden for unit
 * tests.
 *
 * @param uuids
 *            The {@link UUID}s of the participating services.
 *
 * @return The corresponding service proxies.
 */
protected ITxCommitProtocol[] getDataServices(UUID[] uuids) {

    final ITxCommitProtocol[] proxies = getFederation()
            .getDataServices(uuids);

    return proxies;

}
/**
 * Task runs {@link ITxCommitProtocol#abort(long)} against a single
 * participating service.
 *
 * Note: restored the {@link Callable} type argument which was lost in the
 * original source - {@link #call()} already returned {@link Void}.
 *
 * @author Bryan Thompson
 */
private static class AbortTask implements Callable<Void> {

    /** The service on which the abort message will be sent. */
    private final ITxCommitProtocol service;

    /** The transaction state (supplies the tx identifier). */
    private final TxState state;

    /**
     * @param service
     *            The service proxy (required).
     * @param state
     *            The transaction state (required).
     *
     * @throws IllegalArgumentException
     *             if either argument is null.
     */
    public AbortTask(final ITxCommitProtocol service, final TxState state) {

        if (service == null)
            throw new IllegalArgumentException();

        if (state == null)
            throw new IllegalArgumentException();

        this.service = service;

        this.state = state;

    }

    public Void call() throws Exception {

        service.abort(state.tx);

        return null;

    }

}
@Override
protected void abortImpl(final TxState state) throws Exception {
if(!state.lock.isHeldByCurrentThread())
throw new IllegalMonitorStateException();
if (!state.isActive())
throw new IllegalStateException();
if(state.isReadOnly()) {
/*
* Note: There is no local state for read-only tx so we do not need
* to message the data services.
*/
state.setRunState(RunState.Aborted);
return;
}
final UUID[] uuids = state.getDataServiceUUIDs();
final ITxCommitProtocol[] services = getDataServices(uuids);
final List> tasks = new ArrayList>(
uuids.length);
for (ITxCommitProtocol dataService : services) {
tasks.add(new AbortTask(dataService, state));
}
final List> futures = getFederation().getExecutorService()
.invokeAll(tasks);
List causes = null;
for (Future f : futures) {
try {
// verify no errors.
f.get();
} catch (Throwable t) {
/*
* Collect all causes and always log an error if any data
* service abort fails.
*
* Note: If an exception is thrown here the transaction will be
* aborted regardless. Howwever, the data service which threw
* the exception may still have local state on hand for the tx.
*/
log.error(t, t);
if (causes == null) {
causes = new LinkedList();
}
causes.add(t);
}
}
state.setRunState(RunState.Aborted);
if (causes != null) {
throw new ExecutionExceptions(state.toString(), causes);
}
}
/**
* There are two distinct commit protocols depending on whether the
* transaction write set is distributed across more than one
* {@link IDataService}. When write set of the transaction lies entirely on
* a single {@link IDataService}, an optimized commit protocol is used.
* When the write set of the transaction is distributed, a 3-phase commit is
* used with most of the work occurring during the "prepare" phase and a
* very rapid "commit" phase. If a distributed commit fails, even during the
* "commit", then the transaction will be rolled back on all participating
* {@link IDataService}s.
*
* Single phase commits
*
* A simple commit protocol is used when the write set of the transaction
* resides entirely on a single {@link IDataService}. Such commits DO NOT
* contend for named resource locks (either on the index names or on the
* {@link IDataService} {@link UUID}s). Since such transactions DO NOT have
* dependencies outside of the specific {@link IDataService}, a necessary
* and sufficient partial order will be imposed on the executing tasks
* locally by the {@link IDataService} on which they are executing based
* solely on the named resources which they declare. Without dependencies on
* distributed resources, this can not deadlock.
*
* Distributed commits
*
* Transaction commits for a distributed database MUST be prepared in a
* partial order so that they do not deadlock when acquiring the necessary
* locks on the named indices on the local data services. That partial order
* is imposed using the {@link #indexLockManager}. The named index locks
* are pre-declared at the start of the distributed commit protocol and are
* held through both the prepare and commit phases until the end of the
* commit protocol. The distributed commit must obtain a lock on all of the
* necessary named index resources before proceeding. If there is an
* existing commit using some of those resources, then any concurrent commit
* requiring any of those resources will block. The {@link LockManager} is
* configured to require pre-declaration of locks. Deadlocks are NOT
* possible when the locks are pre-declared.
*
* A secondary partial ordering is established based on the
* {@link IDataService} {@link UUID}s during the commit phase. This partial
* order is necessary to avoid deadlocks for concurrently executing commit
* phases of distributed transactions that DO NOT share named index locks.
* Without a partial order over the participating {@link IDataService}s,
* deadlocks could arise because each transaction will grab an exclusive
* lock on the write service for each participating {@link IDataService}.
* By ordering those lock requests, we again ensure that deadlocks can not
* occur.
*
* Note: The prepare phase for distributed commits allows the maximum
* possible concurrency. This is especially important as validation and
* merging down onto the unisolated indices can have significant length for
* large transactions.
*
* The commit phase should be very fast, with syncing the disk providing the
* primary source of latency. All participating indices on the participating
* data services have already been checkpointed. Once the commitTime is
* assigned by the {@link DistributedTransactionService}, the group commit
* need only update the root block on the live journal and sync to disk.
*
* @todo Place timeout on the commit phase where the tx will abort unless
* all participants join at the "committed" barrier within ~ 250ms.
* That should be a generous timeout, but track aborts for this reason
* specifically since they may indicate interesting problems (heavy
* swapping, network issues, etc).
*
* @todo make sure that we checkpoint the commit record index and
* {@link Name2Addr} before requesting the commitTime to remove even
* more latency.
*/
/**
 * Chooses and runs the appropriate commit protocol (see the discussion
 * above): a no-op commit for read-only/unused transactions, the
 * single-phase protocol when the write set lies on one data service, and
 * the distributed protocol otherwise.
 *
 * Note: the generic type arguments on {@link LockManagerTask} were lost
 * in the original source and have been restored - with the raw type,
 * {@code delegate.call()} returns {@link Object} and the method did not
 * compile. (Assumes LockManagerTask&lt;resource, result&gt; - confirm
 * against com.bigdata.concurrent.LockManagerTask.)
 *
 * @return The assigned commit time (0L for read-only/unused tx).
 */
@Override
protected long commitImpl(final TxState state) throws Exception {

    if (state.isReadOnly() || state.getDataServiceCount() == 0) {

        /*
         * Note: We do not maintain any transaction state on the client for
         * read-only transactions.
         *
         * Note: If the transaction was never started on any data service
         * then we do not need to notify any data service. In effect, the
         * tx was started but never used.
         */
        state.setRunState(RunState.Committed);

        return 0L;

    }

    if (!state.isDistributedTx()) {

        /*
         * The write set of the transaction is local to a single data
         * service. In this case we can do a much simpler commit protocol.
         */
        return singlePhaseCommit(state);

    }

    /*
     * The LockManagerTask will handle lock acquisition for the named
     * resources and then invoke our task to perform the commit.
     */
    final LockManagerTask<String, Long> delegate = new LockManagerTask<String, Long>(
            indexLockManager, state.getResources(),
            new DistributedTxCommitTask(state));

    /*
     * This queues the request until it holds the necessary locks (on the
     * named indices used by the transaction). It then prepares the
     * transaction and (if successful) requests the necessary locks for the
     * commit phase (on the data service UUIDs) and then commits the tx.
     */
    return delegate.call();

}
/**
 * Prepare and commit a read-write transaction whose entire write set lies
 * on a single data service. The caller MUST hold the {@link TxState#lock}.
 *
 * @return The assigned commit time.
 */
protected long singlePhaseCommit(final TxState state) throws Exception {

    if (!state.lock.isHeldByCurrentThread())
        throw new IllegalMonitorStateException();

    final UUID[] uuids = state.getDataServiceUUIDs();

    if (uuids.length != 1)
        throw new AssertionError();

    final UUID serviceUUID = uuids[0];

    final IDataService dataService = getFederation()
            .getDataService(serviceUUID);

    try {

        // delegate the commit to the single participating service.
        final long commitTime = dataService.singlePhaseCommit(state.tx);

        // success.
        state.setRunState(RunState.Committed);

        return commitTime;

    } catch (Throwable t) {

        // any failure aborts the tx.
        state.setRunState(RunState.Aborted);

        throw new RuntimeException(t);

    }

}
/**
*
* Task runs the distributed commit protocol transaction.
*
* Pre-conditions:
*
*
* - The transaction has a distributed write set (this does too much work
* for a transaction whose write set is local to a single data service).
* - The caller holds the locks for the named index resources declared by
* the transaction.
* - The transaction {@link TxState#isActive()}.
*
*
*
* Post-conditions (success):
*
* - The transaction was assigned a revisionTime.
* - All participating data services validated the write set of the
* transaction using that revisionTime and merge down the write set
* of the transaction onto the corresponding unisolated indices.
* - The transaction was assigned a commitTime.
* - All participating data services have made the write set of the
* transaction restart safe and marked the transaction as "committed" in
* their local data.
* - The transaction {@link TxState#isCommitted()}.
*
*
*
* Post-conditions (failure):
*
* - The transaction {@link TxState#isAborted()}.
* - Each participating data service has been notified that the
* transaction was aborted.
*
*
*
* @author Bryan Thompson
*/
private class DistributedTxCommitTask implements Callable {
/** The transaction whose distributed commit protocol is being run. */
private final TxState state;
/**
 * The {@link UUID}s of the participating {@link IDataService}s.
 */
private final UUID[] uuids;
/**
 * The proxies for the participating {@link IDataService}s.
 */
private final ITxCommitProtocol[] services;
/**
 * The #of participating {@link IDataService}s.
 */
private final int nservices;
/**
 * The revision time (assigned once the task begins to execute with all
 * locks held for the named index partitions).
 */
private long revisionTime;
/**
 * The commit time (assigned once the prepared barrier breaks and all
 * locks are held for the participating data services).
 *
 * Note: This field is for debugging only.
 */
private long commitTime;
/**
 * The thread in which the {@link DistributedTxCommitTask} is executing.
 * This is the {@link Thread} that is used to obtain the locks for the
 * commit phase using the
 * {@link DistributedTransactionService#dataServiceLockManager}.
 */
final Thread commitThread;
/**
 * Condition is signaled when the "prepared" barrier breaks.
 *
 * Note: If the barrier does not break because a participant fails then
 * the {@link #commitThread} MUST be interrupted in order for it to
 * awaken.
 */
final Condition prepared;
/**
 * Condition is signaled when the necessary locks are held for the
 * participating {@link IDataService}s.
 *
 * @see DistributedTransactionService#dataServiceLockManager
 */
final Condition locksHeld;
/**
 * Condition is signaled when the "committed" barrier breaks.
 */
final Condition committed;
/**
 * Barrier used to await the
 * {@link ITransactionService#prepared(long, UUID)} messages during a
 * distributed read-write transaction commit.
 */
CyclicBarrier preparedBarrier = null;
/**
 * Barrier used to await the
 * {@link ITransactionService#committed(long, UUID)} messages during a
 * distributed read-write transaction commit.
 */
CyclicBarrier committedBarrier = null;
/**
 * Captures the participating services and creates the coordination
 * {@link Condition}s. MUST be invoked by the thread that will also run
 * the task, while holding the {@link TxState#lock}.
 *
 * @param state
 *            The transaction state (required).
 */
public DistributedTxCommitTask(final TxState state) {

    if (state == null)
        throw new IllegalArgumentException();

    /*
     * Note: If this thread is holding the lock on [TxState] then no
     * other thread can access that object. This issue is resolved by
     * creating [Condition]s on which this thread awaits based on
     * TxState.lock.
     */
    if (!state.lock.isHeldByCurrentThread())
        throw new IllegalMonitorStateException();

    this.state = state;

    // The UUIDs of the participating (meta)dataServices.
    this.uuids = state.getDataServiceUUIDs();

    // The corresponding data services (resolve before acquiring locks).
    this.services = getDataServices(uuids);

    this.nservices = uuids.length;

    // Note: Same thread required for ctor and execution!
    this.commitThread = Thread.currentThread();

    // Conditions (on the TxState lock) coordinating the commit phases.
    this.prepared = state.lock.newCondition();
    this.locksHeld = state.lock.newCondition();
    this.committed = state.lock.newCondition();

}
/**
 * This method will be invoked by the {@link LockManagerTask} once it
 * holds all of the necessary named index resource locks. This is how we
 * impose a partial order for preparing the transaction. Deadlocks can
 * not arise because we predeclare the locks and {@link LockManager} can
 * guarantee no deadlocks in that case by sorting the requested
 * resources and acquiring the locks in the sorted order.
 *
 * @return The commit time assigned to the transaction (see
 *         {@link #distributedCommit(TxState)}).
 */
public Long call() throws Exception {
    // Must run in the same thread that constructed this task; that
    // thread is used to obtain the commit-phase locks (see commitThread).
    assert this.commitThread == Thread.currentThread();
    return distributedCommit(state);
}
/**
 * Prepare and commit a read-write transaction that has written on more
 * than one data service.
 *
 * Note: read-write transactions that have written on multiple journals
 * must use a distributed (2-/3-phase) commit protocol. As part of the
 * commit protocol, we obtain an exclusive write lock on each journal on
 * which the transaction has written. This is necessary in order for the
 * transaction as a whole to be assigned a single commit time. Latency
 * is critical in this commit protocol since the journals participating
 * in the commit will be unable to perform any unisolated operations
 * until the transaction either commits or aborts.
 *
 * Note: There is an assumption that the revisionTime does not need to
 * be the commitTime. This allows us to get all the heavy work done
 * before we reach the "prepared" barrier, which means that the commit
 * phase should be very fast. The assumption is that validation of
 * different transactions writing on the same unisolated indices is in
 * fact serialized. The transaction services MUST enforce that
 * assumption by serializing distributed commits (at least those which
 * touch the same index partitions (necessary constraint), the same
 * indices (sufficient constraint) or the same {@link IDataService}s
 * (sufficient constraint)). If it did not serialize distributed commits
 * then deadlocks could arise where two distributed
 * commits were each seeking the exclusive write lock on resources, one
 * of which was already held by the other commit.
 *
 * @throws Exception
 *             if anything goes wrong.
 *
 * @return The commit time for the transaction.
 */
protected long distributedCommit(final TxState state) throws Exception {
    if(!state.lock.isHeldByCurrentThread())
        throw new IllegalMonitorStateException();
    // choose the revision timestamp.
    this.revisionTime = nextTimestamp();
    // add to map of concurrently committing distributed transactions.
    commitList.put(state.tx, this);
    try {
        /*
         * Run the protocol which issues the prepare(tx,rev) messages to
         * each participating data service and awaits their outcome.
         */
        call2();
        // Note: [commitTime] is assigned as a side effect of call2().
        return commitTime;
    } finally {
        // remove from the commit list regardless of the outcome.
        commitList.remove(state.tx);
    }
}
/**
 * Sets up the {@link #preparedBarrier} and the {@link #committedBarrier}
 * and then runs the {@link PrepareTask} tasks.
 *
 * Post-conditions: {@link TxState#isComplete()} will be true. The
 * transaction will either have been aborted or committed on all
 * {@link IDataService}s.
 *
 * Note: the wildcard on the task runner's {@link Future} and the type
 * argument on the anonymous {@link Callable} were lost in the original
 * source ("Future>") and have been restored - the method did not compile
 * as written. All other statements are unchanged.
 *
 * @return The assigned commit time.
 *
 * @todo Allow interrupt of the data service committers if any task
 *       fails during prepare() rather than having to wait for all of
 *       those tasks to join at the {@link TxState#preparedBarrier}.
 *       This is only an optimization. We would cancel those tasks using
 *       the {@link TaskRunner}'s {@link Future}.
 */
public Void call2() throws Exception {

    Future<?> taskRunnerFuture = null;

    try {

        setupPreparedBarrier();

        setupCommittedBarrier();

        taskRunnerFuture = getFederation().getExecutorService().submit(
                new TaskRunner());

        /*
         * Signaled when the prepared barrier breaks. Interrupted if
         * the prepare phase fails.
         */
        prepared.await();

        /**
         * Runs an inner Callable once we have the data service UUID
         * locks.
         *
         * Note: The purpose of this task is to hold onto those locks
         * until the commit is finished (either success or failure). The
         * locks are automatically released once the inner Callable
         * completes regardless of the outcome.
         *
         * Note: This task will run in the same thread as the caller.
         * This means that the task will already hold the
         * {@link TxState#lock}.
         */
        new LockManagerTask(dataServiceLockManager, state
                .getDataServiceUUIDs(), new Callable<Void>() {

            public Void call() throws Exception {

                if (!state.lock.isHeldByCurrentThread()) {

                    /*
                     * Note: The task runs in its caller's thread and
                     * the caller should already be holding the TxState
                     * lock.
                     */
                    throw new IllegalMonitorStateException();

                }

                /*
                 * Signal so that the task which caused the prepared
                 * barrier to break can resume. In turn, when the
                 * prepared runnable finishes, all tasks awaiting that
                 * barrier will continue to execute and will enter their
                 * "commit" phase.
                 */
                locksHeld.signal();

                // Signaled when the committed barrier breaks.
                committed.await();

                return null;

            }

        }).call();

        // Done.
        return null;

    } finally {

        /*
         * Reset the barriers in case anyone is waiting.
         */
        if (preparedBarrier != null)
            preparedBarrier.reset();

        if (committedBarrier != null)
            committedBarrier.reset();

        /*
         * Await the future on the task running the PrepareTasks.
         *
         * Note: This task SHOULD complete very shortly after a
         * successful commit.
         *
         * Note: If any PrepareTask fails, then all PrepareTasks should
         * abort shortly thereafter.
         */
        if (taskRunnerFuture != null)
            taskRunnerFuture.get();

    }

}
/**
* Submits the {@link PrepareTask}s in a different thread, awaits their
* {@link Future}s and logs any errors.
*
* Note: The {@link PrepareTask}s are executed outside of the thread
* that runs the {@link DistributedTxCommitTask} so that we may use the
* thread running the {@link DistributedTxCommitTask} to obtain locks
* from the {@link DistributedTransactionService#dataServiceLockManager}.
*
* @author Bryan
* Thompson
*/
private class TaskRunner implements Callable {
public TaskRunner() {
}
public Void call() throws Exception {
// The task MUST NOT run in the commitThread.
assert commitThread != Thread.currentThread();
// This thread MUST NOT own the lock.
assert !state.lock.isHeldByCurrentThread();
/*
* The futures for the tasks used to invoke prepare(tx,rev) on
* each dataService.
*/
final List> futures;
final List> tasks = new ArrayList>(
nservices);
for (ITxCommitProtocol dataService : services) {
tasks.add(new PrepareTask(dataService));
}
try {
/*
* Await all futures, returning once they are all done.
*/
futures = getFederation().getExecutorService().invokeAll(
tasks);
// tx must be complete (either committed or aborted).
assert state.isComplete() : state.toString();
} catch (Throwable t) {
/*
* If we can not invoke all tasks then abort
*/
log.error(t.getLocalizedMessage(), t);
state.setRunState(RunState.Aborted);
throw new RuntimeException(t);
}
List causes = null;
for (Future f : futures) {
try {
f.get();
} catch (Throwable t) {
if (causes == null) {
causes = new LinkedList();
}
causes.add(t);
log.error(t.getLocalizedMessage(), t);
}
}
if (causes != null) {
final int nfailed = causes.size();
state.setRunState(RunState.Aborted);
throw new ExecutionExceptions("Committer(s) failed: n="
+ nservices + ", nfailed=" + nfailed, causes);
}
return null;
} // call()
} // class TaskRunner
/**
* Sets up the {@link TxState#preparedBarrier}. When the barrier action
* runs it will change {@link RunState} to {@link RunState#Prepared} and
* assign a commitTime to the transaction. When the barrier
* breaks, the assigned commitTime will be reported back to the
* {@link IDataService}s waiting in
* {@link ITransactionService#prepared(long, UUID)} as the return value
* for that method.
*/
        private void setupPreparedBarrier() {

            // One barrier party per participating data service. The Runnable
            // is the barrier action and executes in the last thread to arrive.
            preparedBarrier = new CyclicBarrier(nservices,

            new Runnable() {

                /**
                 * Method runs when the "prepared" barrier breaks.
                 */
                public void run() {

                    // The barrier action takes the TxState lock so that it may
                    // signal conditions associated with that lock.
                    state.lock.lock();

                    try {

                        state.setRunState(RunState.Prepared);

                        /*
                         * Wake up the main thread. It will obtain the necessary
                         * locks for the participating data services and then
                         * signal that we may continue.
                         */
                        prepared.signal();

                        try {

                            // wait until the necessary locks are held.
                            // (Condition#await releases [state.lock] while
                            // parked, allowing the main thread to proceed.)
                            locksHeld.await();

                        } catch (InterruptedException ex) {

                            log.warn("Interrupted", ex);

                            // re-throw the exception (breaks the barrier for
                            // all waiting parties).
                            throw new RuntimeException(ex);

                        }

                        // assign a commitTime to this tx.
                        final long commitTime = nextTimestamp();

                        // Set the commitTime on the outer task.
                        DistributedTxCommitTask.this.commitTime = commitTime;

                        // Set the commitTime on the tx.
                        state.setCommitTime(commitTime);

                    } finally {

                        state.lock.unlock();

                    }

                }

            });

        }
/**
* Sets up the {@link TxState#committedBarrier}. When the barrier
* action runs it will change the {@link RunState} to
* {@link RunState#Committed}.
*/
        protected void setupCommittedBarrier() {

            // One barrier party per participating data service. The Runnable
            // is the barrier action and executes in the last thread to arrive.
            committedBarrier = new CyclicBarrier(nservices,

            new Runnable() {

                /**
                 * Method runs when the "committed" barrier breaks. At this
                 * point the transaction is fully committed on the participating
                 * data services.
                 */
                public void run() {

                    // Take the lock so we may signal the [committed] condition.
                    state.lock.lock();

                    try {

                        // wake up the main thread.
                        committed.signal();

                        // Set the assigned commitTime on the TxState.
                        state.setCommitTime(commitTime);

                        // Change the tx run state.
                        state.setRunState(RunState.Committed);

                    } finally {

                        state.lock.unlock();

                    }

                }

            });

        }
/**
* Task issues {@link ITxCommitProtocol#prepare(long, long)} to an
* {@link IDataService} participating in a distributed commit.
*
* @author Bryan
* Thompson
*/
protected class PrepareTask implements Callable {
final ITxCommitProtocol service;
public PrepareTask(final ITxCommitProtocol service) {
this.service = service;
}
public Void call() throws Exception {
try {
service.prepare(state.tx, revisionTime);
} catch (Throwable e) {
/*
* If an exception is thrown, then make sure that the tx
* is in the [Abort] state.
*/
try {
log.error(e.getLocalizedMessage(), e);
} catch (Throwable t) {
// ignored
}
state.lock.lock();
try {
state.setRunState(RunState.Aborted);
} finally {
state.lock.unlock();
}
throw new RuntimeException(e);
}
return null;
}
}
}
/**
* Note: Only those {@link DataService}s on which a read-write transaction
* has started will participate in the commit. If there is only a single
* such {@link IDataService}, then a single-phase commit will be used.
* Otherwise a distributed transaction commit protocol will be used.
*
* Note: The commits requests are placed into a partial order by sorting the
* total set of resources which the transaction declares (via this method)
* across all operations executed by the transaction and then contending for
* locks on the named resources using a LockManager. This is
* handled by the {@link DistributedTransactionService}.
*/
@Override
public void declareResources(final long tx, final UUID dataServiceUUID,
final String[] resource) throws IllegalStateException {
setupLoggingContext();
lock.lock();
try {
switch (getRunState()) {
case Running:
case Shutdown:
break;
default:
throw new IllegalStateException(ERR_SERVICE_NOT_AVAIL);
}
if (dataServiceUUID == null)
throw new IllegalArgumentException();
if (resource == null)
throw new IllegalArgumentException();
final TxState state = getTxState(tx);
if (state == null) {
throw new IllegalStateException(ERR_NO_SUCH);
}
state.lock.lock();
try {
if (state.isReadOnly()) {
throw new IllegalStateException(ERR_READ_ONLY);
}
if (!state.isActive()) {
throw new IllegalStateException(ERR_NOT_ACTIVE);
}
state.declareResources(dataServiceUUID, resource);
} finally {
state.lock.unlock();
}
} finally {
lock.unlock();
clearLoggingContext();
}
}
/**
* Waits at the "prepared" barrier. When the barrier breaks, examine the
* {@link TxState}. If the transaction is aborted, then throw an
* {@link InterruptedException}. Otherwise return the commitTime assigned
* to the transaction.
*
* @throws InterruptedException
* if the barrier is reset while the caller is waiting.
*/
    @Override
    public long prepared(final long tx, final UUID dataService)
            throws IOException, InterruptedException, BrokenBarrierException {

        // Lookup the commit task. It is only present in [commitList] while a
        // distributed commit for [tx] is in progress.
        final DistributedTxCommitTask task = commitList.get(tx);

        if (task == null) {

            /*
             * Transaction is not committing.
             */

            throw new IllegalStateException();

        }

        final TxState state = task.state;

        state.lock.lock();

        try {

            if(!state.isStartedOn(dataService)) {

                // The caller is not a declared participant of this tx.
                throw new IllegalArgumentException();

            }

            /*
             * NOTE(review): the barrier await below occurs while holding
             * [state.lock]. CyclicBarrier#await does not release external
             * locks, so when more than one data service must reach this
             * barrier on the same TxState this looks like it could deadlock —
             * confirm against TxState's locking discipline and the barrier
             * action in setupPreparedBarrier(), which also acquires this lock.
             */

            // wait at the 'prepared' barrier.
            task.preparedBarrier.await();

            if (state.isAborted())
                throw new InterruptedException();

            // The commitTime assigned by the "prepared" barrier action.
            return state.getCommitTime();

        } finally {

            state.lock.unlock();

        }

    }
/**
* Wait at the "committed" barrier. When the barrier breaks, examine the
* {@link TxState}. If the transaction is aborted, then return
* <code>false</code>. Otherwise return <code>true</code>.
*
* Note: The {@link TxState} will be aborted if any of the committers throws
* an exception of their {@link ITxCommitProtocol#prepare(long, long)}
* method.
*/
    @Override
    public boolean committed(final long tx, final UUID dataService)
            throws IOException, InterruptedException, BrokenBarrierException {

        // Lookup the commit task. It is only present in [commitList] while a
        // distributed commit for [tx] is in progress.
        final DistributedTxCommitTask task = commitList.get(tx);

        if (task == null) {

            /*
             * Transaction is not committing.
             */

            throw new IllegalStateException();

        }

        final TxState state = task.state;

        state.lock.lock();

        try {

            if(!state.isStartedOn(dataService)) {

                // The caller is not a declared participant of this tx.
                throw new IllegalArgumentException();

            }

            /*
             * NOTE(review): as with prepared(), the barrier await below occurs
             * while holding [state.lock] — confirm that this can not deadlock
             * when several data services must reach the barrier.
             */

            // wait at the 'committed' barrier.
            task.committedBarrier.await();

            // false iff the transaction was aborted during the commit phase.
            if (state.isAborted())
                return false;

            return true;

        } finally {

            state.lock.unlock();

        }

    }
protected long findCommitTime(final long timestamp) {
synchronized(commitTimeIndex) {
return commitTimeIndex.find(timestamp);
}
}
protected long findNextCommitTime(long commitTime) {
synchronized(commitTimeIndex) {
return commitTimeIndex.findNext(commitTime);
}
}
/**
* @todo Is it a problem if the commit notices do not arrive in sequence?
* Because they will not. Unisolated operations will participate in
* group commits using timestamps obtained from the transaction
* service, but those commit operations will not be serialized and
* their reporting of the timestamps for the commits will likewise not
* be serialized.
*
* The danger is that we could assign a read-historical transaction
* start time based on the {@link #commitTimeIndex} and then have
* commit timestamps arrive that are within the interval in which we
* made the assignment. Essentially, our interval was too large and
* the assigned start time may have been on either side of a
* concurrent commit. However, this can only occur for unisolated
* operations (non-transactional commits). The selected timestamp will
* always be coherent with respect to transaction commits since those
* are coordinated and use a shared commit time.
*
* This issue can only arise when requesting historical reads for
* timestamps that are "close" to the most recent commit point since
* the latency involved would otherwise not affect the assignment of
* transaction start times. However, it can occur either when
* specifying the symbolic constant {@link ITx#READ_COMMITTED} to
* {@link #newTx(long)} or when specifying the exact commitTime
* reported by a transaction commit.
*
* Simply stated, there is NO protection against concurrently
* unisolated operations committing. If such operations are used on
* the same indices as transactions, then it IS possible that the
* application will be unable to read from exactly the post-commit
* state of the transaction for a brief period (10s of milliseconds)
* until the unisolated commit notices have been propagated to the
* {@link DistributedTransactionService}. This issue will only occur
* when there is also a lot of contention for reading on the desired
* timestamp since otherwise the commitTime itself may be used as a
* transaction start time.
*
* @todo depending on the latency involved and the issue described
* immediately above, it might be possible to simply queue these
* notices and consume them in an async thread. some operations (such
* as a distributed commit) might require that we catch up on the
* commit time notices in the queue. just thinking out loud here.
*/
    final public void notifyCommit(final long commitTime) {

        /*
         * Note: In order to avoid a deadlock, this must obtain the lock before
         * synchronizing on the commitTimeIndex since the method in the super
         * class will request that lock as well.
         *
         * Lock ordering is therefore: [lock] FIRST, then [commitTimeIndex].
         */
        lock.lock();

        try {

            synchronized (commitTimeIndex) {

                /*
                 * Add all commit times (even those which arrive out of order;
                 * the index keeps them sorted).
                 */
                commitTimeIndex.add(commitTime);

                /*
                 * Note: commit time notifications can be overlap such that they
                 * appear out of sequence with respect to their values. This is
                 * Ok. We just ignore any older commit times. However we do need
                 * to be synchronized here such that the commit time notices
                 * themselves are serialized so that we do not miss any.
                 */

                if (log.isDebugEnabled())
                    log.debug("commitTime="
                            + commitTime
                            + ", lastKnownCommitTime="
                            + lastCommitTime
                            + (lastCommitTime < commitTime ? " WILL UPDATE"
                                    : ""));

                // Only advance; never move the last commit time backwards.
                if (lastCommitTime < commitTime) {

                    lastCommitTime = commitTime;

                    super.notifyCommit(commitTime);

                }

            }

        } finally {

            lock.unlock();

        }

    }
    /**
     * The most recent commit time for which a notice has been received.
     *
     * NOTE(review): read without holding [lock]; presumes [lastCommitTime] is
     * declared volatile (or that a stale read is acceptable) — confirm the
     * field declaration.
     */
    final public long getLastCommitTime() {

        return lastCommitTime;

    }
/**
* Invokes {@link ITxCommitProtocol#setReleaseTime(long)} for a specific
* {@link IDataService}.
*
* @author Bryan Thompson
*/
private static class SetReleaseTimeTask implements Callable {
final IDataService dataService;
final long releaseTime;
public SetReleaseTimeTask(final IDataService dataService, final long releaseTime) {
if (dataService == null)
throw new IllegalArgumentException();
if (releaseTime <= 0L)
throw new IllegalArgumentException();
this.dataService = dataService;
this.releaseTime = releaseTime;
}
public Void call() throws Exception {
dataService.setReleaseTime(releaseTime);
return null;
}
}
/**
* Task periodically notifies the discovered {@link IDataService}s of the
* new release time.
*
* Note: Running a concurrent instance of this could cause release times to
* be distributed that do not strictly advance. If you need to do this,
* e.g., in order to immediately update the release time, then also
* introduce a lock for this task on the {@link AbstractTransactionService}
* so that instances of the task must run in sequence.
*
* @todo must also notify the metadata service once it is partitioned.
*
* @todo We could monitor data service joins (for jini) and immediately
* notify newly joined data services of the current release time.
*
* FIXME There is probably no reason to do this now that the resource
* manager reaches out for the current release time before deciding which
* resources it can release.
*
* @author Bryan Thompson
*/
protected class NotifyReleaseTimeTask implements Runnable {
private long lastReleaseTime = 0L;
/**
* Notifies all {@link IDataService}s of the current release time.
*
* Note: An {@link IDataService} WILL NOT release its most current
* commit point, regardless of the releaseTime that is sent to that
* service.
*
* Note: If this method throws an exception then the task will no longer
* be scheduled!
*/
public void run() {
try {
final long releaseTime = getReleaseTime();
if (releaseTime == lastReleaseTime) {
// The release time has not been advanced.
return;
}
final IBigdataFederation fed = getFederation();
final UUID[] a = fed.getDataServiceUUIDs(0/* maxCount */);
final IDataService[] services = getFederation()
.getDataServices(a);
final List> tasks = new ArrayList>(
a.length);
for (IDataService dataService : services) {
tasks.add(new SetReleaseTimeTask(dataService, releaseTime));
}
log.warn("Will set release time on " + a.length
+ " data services: releaseTime=" + releaseTime);
final List> futures = getFederation()
.getExecutorService().invokeAll(tasks);
for (Future f : futures) {
try {
// verify no errors.
f.get();
} catch (Throwable t) {
/*
* Log an error if any data service can not be notified.
*/
log.error(t.getLocalizedMessage(), t);
}
}
// update the last release time.
lastReleaseTime = releaseTime;
} catch (Throwable t) {
log.error(t.getLocalizedMessage(), t);
}
}
}
/**
* Adds counters for the {@link LockManager}.
*/
// synchronized
public CounterSet getCounters() {
// if (countersRoot == null) {
/*
* Setup basic counters.
*/
final CounterSet countersRoot = super.getCounters();
/**
* The lock manager imposing a partial ordering on the prepare phase
* of distributed transaction commits using the index partition
* names as the named resources.
*/
countersRoot.makePath("Index Lock Manager").attach(
((DistributedTransactionService) this).indexLockManager
.getCounters());
/**
* The lock manager imposing a partial ordering on the commit phase
* of distributed transaction commits using the data service UUIDs
* as the named resources.
*/
countersRoot.makePath("DataService Lock Manager").attach(
((DistributedTransactionService) this).dataServiceLockManager
.getCounters());
/**
* The #of snapshots of the commit time index that have been written
* to date.
*/
countersRoot.addCounter("snapshotCount",
new Instrument() {
protected void sample() {
setValue(snapshotCount);
}
});
/**
* The #of distributed transaction commits that are currently in
* progress.
*/
countersRoot.addCounter("distributedCommitsInProgressCount",
new Instrument() {
protected void sample() {
setValue(commitList.size());
}
});
/**
* The #of commit times that are currently accessible.
*/
countersRoot.addCounter("commitTimesCount",
new Instrument() {
protected void sample() {
/*
* Note: This uses a method which does not require
* synchronization. (The entryCount is reported
* without traversing the BTree.)
*/
setValue(commitTimeIndex.getEntryCount());
// synchronized (commitTimeIndex) {
// setValue(commitTimeIndex.getEntryCount());
// }
}
});
countersRoot.addCounter("dataDir", new Instrument() {
protected void sample() {
setValue(dataDir.toString());
}
});
// }
return countersRoot;
}
}