![JAR search and dependency download from the Maven repository](/logo.png)
com.bigdata.btree.isolation.IsolatedFusedView Maven / Gradle / Ivy
/**
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
[email protected]
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on Feb 12, 2007
*/
package com.bigdata.btree.isolation;
import java.util.concurrent.atomic.AtomicInteger;
import com.bigdata.btree.AbstractBTree;
import com.bigdata.btree.BTree;
import com.bigdata.btree.Checkpoint;
import com.bigdata.btree.ICounter;
import com.bigdata.btree.IIndex;
import com.bigdata.btree.ILocalBTreeView;
import com.bigdata.btree.IRangeQuery;
import com.bigdata.btree.ITuple;
import com.bigdata.btree.ITupleIterator;
import com.bigdata.btree.Tuple;
import com.bigdata.btree.view.FusedView;
import com.bigdata.journal.AbstractTask;
import com.bigdata.journal.ITx;
import com.bigdata.journal.TimestampUtility;
import com.bigdata.service.IBigdataFederation;
/**
*
* An index (or index partition) that has been isolated by a transaction.
* Isolation is achieved by the following mechanisms:
*
* - The writeSet of the transaction on the index is isolated on a
* {@link BTree} visible only to that transaction.
* - Version timestamps are maintained for index entries in both the isolated
* write set and groundState from which the transaction is reading.
* - The groundState is defined as the view of the index (partition) as of the
* abs(startTime) of the transaction.
* - Reads are performed against an ordered view defined by the writeSet
* followed by the ordered set of indices defining the groundState of the index.
*
* - Writes first read through the ordered view to locate the most recent
* version for an index entry. If the index entry is located in the isolated
* writeSet then it is overwritten and its timestamp is unchanged. If the index
* entry is located in the groundState then the timestamp is copied from that
* index entry and written on the new version in the isolated writeSet.
* - During validation, version timestamps in the isolated writeSet are
* compared against the then current view of the corresponding unisolated index.
* If the timestamp in the unisolated view differs from that in the writeSet
* then there is a write-write conflict. Write-write conflicts MAY be validated
* if the index has a registered {@link IConflictResolver}.
* - If the writeSet is validated then it is mergedDown (copied onto) the then
* current unisolated index view. During the mergeDown phase the revision
* timestamp of the transaction is applied to all index entries copied from the
* write set. Transactions that later try to commit will recognize write-write
* conflicts based on those updated timestamps. Note that revision timestamps
* ARE NOT commit timestamps. Revision timestamps are assigned at the start of
* the validation phase. All tuples modified by a transaction are annotated with
* the same revision timestamp during the validation phase of the transaction.
* Write-write conflicts are detected on the basis of the per-tuple revision
* timestamps. Commit timestamps are assigned once the write set has been
* validated and checkpointed and all shards participating in the commit
* protocol signal that they are prepared and ready to commit.
*
*
*
* Note: The timestamp from which the post-commit state of the transaction may
* be read IS NOT defined for an {@link IBigdataFederation}. It is not possible
* to define this timestamp without requiring concurrent commit processing to be
* paused on all data services on which the transaction has written, which is
* viewed as too high a cost. Instead, the commit timestamp identifies the state
* from which you can read the data written by the transaction. Tuples NOT
* updated by the transaction MAY have been changed by concurrent transactions.
*
* Note: The process of validating, merging down changes, and committing those
* changes MUST be atomic. Therefore no other operations may be permitted access
* to the unisolated indices corresponding to the isolated indices on which the
* transaction has written during this process. This constraint is generally
* achieved by holding a write lock on the unisolated indices corresponding to
* the indices isolated by the transaction, e.g., by declaring those indices to
* an {@link ITx#UNISOLATED} {@link AbstractTask} which handles this process.
*
* @author Bryan Thompson
*/
public class IsolatedFusedView extends FusedView {
/**
 * The transaction identifier (aka transaction start time).
 * <p>
 * Set once by the constructor from its <code>timestamp</code> argument. It
 * is used as the version timestamp for entries written under a key that
 * has no entry at all in the view, and for versions copied in from the
 * conflict resolver during {@link #validate(AbstractBTree[])}.
 */
private final long startTime;
/**
 * The isolated write set (the place where we record the intention of the
 * transaction). This is just a reference to the mutable {@link BTree} at
 * index zero(0) of sources in the view; the constructor rejects a view
 * whose first source is not a {@link BTree}.
 */
private final BTree writeSet;
/**
 * Returns the isolated write set of the transaction.
 * <p>
 * The write set is where the intention of the transaction is recorded. It
 * is the same object as the mutable {@link BTree} found at index zero (0)
 * of the sources in this view.
 *
 * @return The {@link BTree} absorbing the writes of the transaction.
 *
 * @see FusedView#getMutableBTree()
 */
public BTree getWriteSet() {
    return this.writeSet;
}
/**
* Constructor may be used either for a fully isolated transaction or an
* unisolated operation. In each case the groundState is the ordered
* set of read-only resources corresponding to the timestamp.
*
* Reads will read through the writeSet and then the resource(s) in
* the groundState in the order in which they are given. A read is
* satisfied by the first resource containing an index entry for the search
* key.
*
* Writes will first read through looking for a @todo javadoc
*
* @param timestamp
* The timestamp associated with the groundState.
* @param sources
* An ordered array of sources comprised of the {@link BTree}
* that will absorb writes and the historical ground state.
*/
public IsolatedFusedView(final long timestamp, final AbstractBTree[] sources ) {
super(sources);
if (!TimestampUtility.isCommitTime(timestamp))
throw new IllegalStateException();
this.startTime = timestamp;
if (sources.length < 2)
throw new IllegalArgumentException();
if( !( sources[0] instanceof BTree ) ) {
throw new IllegalArgumentException();
}
writeSet = (BTree) sources[0];
// verify all sources support timestamps.
for (int i = 0; i < sources.length; i++) {
if (!sources[i].getIndexMetadata().getVersionTimestamps()) {
throw new IllegalArgumentException();
}
}
}
/**
 * Reports whether the transaction has written anything on this isolated
 * index.
 *
 * @return <code>true</code> iff the entry count of the write set is zero.
 */
public boolean isEmptyWriteSet() {
    final boolean hasEntries = writeSet.getEntryCount() != 0;
    return !hasEntries;
}
/**
 * Always fails: counters are not available on an isolated view.
 * <p>
 * Counters are typically used to create one-up distinct values assigned to
 * keys. If the counter were stored on the write set then different
 * transactions could easily assign the same counter value under different
 * keys, leading to an undetectable write conflict.
 *
 * @todo counters could probably be enabled within transactions if we used
 *       the counter from the then current mutable btree. This would have to
 *       be passed into the constructor. In addition, the counter logic
 *       would have to be carefully checked to make sure that counter
 *       assignments remain consistent. The counter itself is an
 *       {@link AtomicInteger}. However additional care needs to be taken
 *       to ensure that the counter value is persisted if it is changed (by
 *       updating the {@link BTree} {@link Checkpoint} record). The cases
 *       where the tx bumps the counter need to be carefully examined since
 *       it could force the write of the unisolated btree when we actually
 *       do not want to commit the btree - some kind of locking may be
 *       required. So, for now, this is disabled.
 *
 * @return Never returns normally.
 *
 * @throws UnsupportedOperationException
 *             always
 */
@Override
public final ICounter getCounter() {
    throw new UnsupportedOperationException();
}
/**
* {@inheritDoc}
*
* Write an entry for the key on the write set.
*/
@Override
public byte[] insert(final byte[] key, final byte[] val) {
final Tuple> tuple = lookup(key, getMutableBTree().getLookupTuple());
if (tuple == null) {
/*
* There is no entry under that key in the view, not even a deleted
* entry. Therefore we insert a new entry using the startTime of the
* transaction. We return [null] since there was no value under that
* key.
*/
// srcs[0]
getMutableBTree().insert(key, val, false/*delete*/, false/*putIfAbsent*/, startTime, null/*tuple*/);
return null;
} else {
/*
* There is an (potentially deleted) entry under that key and we are
* going to overwrite it. We will use the timestamp from that entry.
* If the entry is NOT in the write set then the timestamp will be
* the [revisionTime] of the last write on that key before this
* transaction's start time and the timestamp will be copied into
* the write set. If the entry is in the write set then the
* timestamp will either have been copied already into the write set
* previously by this code branch or it will be the startTime of
* this transaction (the code branch above).
*/
final long timestamp = tuple.getVersionTimestamp();
// srcs[0]
getMutableBTree().insert(key, val, false/*delete*/, false/*putIfAbsent*/, timestamp, null/*tuple*/);
return tuple.isNull() || tuple.isDeletedVersion() ? null : tuple
.getValue();
}
}
/**
* {@inheritDoc}
*
* Write an entry for the key on the write set.
*/
@Override
public byte[] putIfAbsent(final byte[] key, final byte[] val) {
final Tuple> tuple = lookup(key, getMutableBTree().getLookupTuple());
if (tuple == null) {
/*
* There is no entry under that key in the view, not even a deleted
* entry. Therefore we insert a new entry using the startTime of the
* transaction. We return [null] since there was no value under that
* key.
*
* See BLZG-1539. Note for this code path the insert is unconditional
* since we already know that there is no entry under that key in the
* index.
*/
// srcs[0]
getMutableBTree().insert(key, val, false/*delete*/, false/*putIfAbsent*/, startTime, null/*tuple*/);
return null;
} else {
/*
* There is an (potentially deleted) entry under that key and we are
* going to overwrite it IFF it is a deleted entry (conditional
* insert). We will use the timestamp from that entry. If the entry
* is NOT in the write set then the timestamp will be the
* [revisionTime] of the last write on that key before this
* transaction's start time and the timestamp will be copied into
* the write set. If the entry is in the write set then the
* timestamp will either have been copied already into the write set
* previously by this code branch or it will be the startTime of
* this transaction (the code branch above).
*/
final long timestamp = tuple.getVersionTimestamp();
// srcs[0]
getMutableBTree().insert(key, val, false/*delete*/, true/*putIfAbsent*/, timestamp, null/*tuple*/);
return tuple.isNull() || tuple.isDeletedVersion() ? null : tuple
.getValue();
}
}
/**
* Write a deleted entry for the key on the write set.
*/
@Override
public byte[] remove(final byte[] key) {
final Tuple> tuple = lookup(key, getMutableBTree().getLookupTuple());
if (tuple == null) {
/*
* There is no entry under that key in the view, not even a deleted
* entry. Therefore we insert a new entry using the startTime of the
* transaction. We return [null] since there was no value under that
* key.
*/
// srcs[0]
getMutableBTree().insert(key, null, true/* delete */, false/*putIfAbsent*/, startTime, null/*tuple*/);
return null;
} else {
/*
* There is an (potentially deleted) entry under that key and we are
* going to overwrite it. We will use the timestamp from that entry.
* If the entry is NOT in the write set then the timestamp will be
* the [revisionTime] of the last write on that key before this
* transaction's start time and the timestamp will be copied into
* the write set. If the entry is in the write set then the
* timestamp will either have been copied into the write set
* previously by this code branch or it will be the startTime of
* this transaction (the code branch above).
*/
final long timestamp = tuple.getVersionTimestamp();
if (tuple.isDeletedVersion() && timestamp == this.startTime) {
/*
* Note: Avoid double-delete when the delete was performed by
* this transaction.
*/
} else {
/*
* Write a delete marker whose timestamp is copied from the
* groundState.
*/
// srcs[0]
getMutableBTree().insert(key, null, true/* delete */, false/*putIfAbsent*/, timestamp, null/* tuple */);
}
return tuple.isNull() || tuple.isDeletedVersion() ? null : tuple
.getValue();
}
}
/**
*
* Validate changes made to the index within a transaction against the last
* committed state of the index in the global scope. In general there are
* two kinds of conflicts: read-write conflicts and write-write conflicts.
* Read-write conflicts are handled by NEVER overwriting an existing version
* (an MVCC style strategy). Write-write conflicts are detected by backward
* validation against the last committed state of the journal. A write-write
* conflict exists IFF the version counter on the transaction index entry
* differs from the version counter in the global index scope. Once
* detected, the resolution of a write-write conflict is delegated to a
* {@link IConflictResolver conflict resolver}. If a write-write conflict
* can not be validated, then validation will fail and the transaction must
* abort.
*
*
* Validation occurs as part of the prepare/commit protocol. Concurrent
* transactions MAY continue to run without limitation. A concurrent commit
* (if permitted) would force re-validation since the transaction MUST now
* be validated against the new baseline. (It is possible that this
* validation could be optimized.)
*
*
* The version counters used to detect write-write conflicts are incremented
* during the commit as part of the {@link #mergeDown()} of the
* {@link IsolatedFusedView} onto the corresponding unisolated indices in
* the global scope.
*
*
* @param groundStateSources
* The ordered view of the unisolated index. This MUST be the
* current view of the ground state as of when the transaction is
* validated (NOT when it was created). This view WILL NOT the
* same as the groundState specified to the constructor if
* intervening transactions have committed on the index.
*
* @return True iff validation succeeds.
*/
public boolean validate(final AbstractBTree[] groundStateSources) {
if (isEmptyWriteSet()) {
// Nothing written on this isolated index.
return true;
}
/*
* Do not validate this index unless the groundState has been modified
* since the readState for the transaction.
*
* @todo the code below presumed that the source was a single BTree
* rather than an ordered view of AbstractBTrees. It would be better to
* compare a timestamp for the view definition, e.g., in the Resource[]
* defining an index partition.
*/
// {
//
// // Note: This is the state from which the transaction read.
// ReadOnlyFusedView readState = this.groundState;
//
// if (!currentGroundState.modifiedSince(readState.getMetadata()
// .getMetadataAddr())) {
//
// // No changes to the unisolated index since the readState.
//
// return true;
//
// }
//
// }
/*
* Note: Write-write conflicts can be validated iff a conflict resolver
* was declared when the Journal object was instantiated.
*/
final IConflictResolver conflictResolver = writeSet.getIndexMetadata()
.getConflictResolver();
/*
* The versions returned by the conflict resolver must be written on the
* isolated index so that they will overwrite the committed version when
* we mergeDown() onto the unisolated index. However, we have to take
* care since this can result in a concurrent modification of the
* IsolatedBTree that we are currently traversing.
*
* We handle this by inserting the results from the conflict resolver
* into an temporary tree and then writing them on the IsolatedBTree
* once we finish the validation pass.
*
* Note: It is NOT safe to update the value on the IsolatedBTree during
* traversal since it might trigger copy-on-write which would cause
* structural modifications that would break the iterator. [@todo
* actually, that is fine now if we specify the CURSOR flag].
*
* @todo Once we create this temporary tree we need to read from a fused
* view of it and the primary IsolatedFusedView if we are going to
* support conflict resolution that spans more than a single key-value
* at a time. However, we also need to expose the Tx to the conflict
* resolver for that to work (which is why we have not exposed the tx
* yet).
*/
BTree tmp = null;
/*
* A view onto the consistent state of the current global scope for that
* index. We use this ONLY for reading (clearly).
*/
final IIndex groundState = (groundStateSources.length == 1 ? groundStateSources[0]
: new FusedView(groundStateSources));
// /*
// * The btree that is absorbing writes for the index. We need this as a
// * BTree and not a FusedView or IIndex in order to handle all of the
// * cases as cleanly as possible - what matters is having access to the
// * core lookup() method on AbstractBTree.
// */
// final BTree groundStateWriteSet = (BTree) groundStateSources[0];
/*
* Scan the write set of the transaction.
*
* Note: Both indices have the same total ordering so we are essentially
* scanning both indices in order.
*
* Note: the iterator is chosen carefully in order to visit the IValue
* objects and see both deleted and undeleted entries.
*/
final ITupleIterator> itr = writeSet.rangeIterator(null, null,
0/* capacity */, ALL /* flags */, null);
// tuple for reading from the groundState index.
final Tuple> groundStateTuple = new Tuple(
((ILocalBTreeView)groundState).getMutableBTree(),
// (groundState instanceof AbstractBTree ? (AbstractBTree) groundState
// : ((FusedView) groundState).getSources()[0]),
KEYS | VALS);
while (itr.hasNext()) {
// The index entry in the transaction's write set.
final ITuple> txEntry = itr.next();
// The key for that index entry.
final byte[] key = txEntry.getKey();
// Lookup the entry in the global scope.
final ITuple> baseEntry; //= groundState.lookup(key, groundStateTuple);
if(groundState instanceof AbstractBTree) {
baseEntry = ((AbstractBTree) groundState).lookup(key,
groundStateTuple);
} else {
baseEntry = ((FusedView) groundState).lookup(key,
groundStateTuple);
}
/*
* If there is an entry in the global scope, then we MUST compare
* the version counters.
*/
if (baseEntry != null) {
/*
* If the version counters do not agree then we need to perform
* write-write conflict resolution.
*/
if (baseEntry.getVersionTimestamp() != txEntry
.getVersionTimestamp()) {
if (conflictResolver == null) {
// no conflict resolver - validation fails.
log.warn("Write-write conflict - no conflict resolver");
return false;
}
/*
* Create a temporary index to buffer the conflict
* resolver's decisions. Once the write set has been
* validated the versions written (or deleted) by the
* conflict resolver must be written on the isolated index
* so that it will overwrite the committed version when we
* mergeDown() onto the unisolated index.
*
* Note: This uses the same store as the writeSet, which is
* supposed to be a temporary store dedicated to a specific
* transaction.
*/
if (tmp == null) {
tmp = BTree.create(//
writeSet.getStore(), // same store.
writeSet.getIndexMetadata().clone() // same metadata
);
}
/*
* Apply the conflict resolver in an attempt to resolve the
* conflict.
*/
try {
if (!conflictResolver.resolveConflict(tmp, txEntry,
baseEntry)) {
log.warn("Write-write conflict NOT resolved.");
// Not validated.
return false;
}
} catch (Exception ex) {
log.error("Write-write conflict", ex);
// Not validated.
return false;
}
}
}
}
if (tmp != null) {
/*
* Copy in any updates resulting from conflict validation. It is
* safe to apply those updates now that we are no longer traversing
* the index.
*
* Note: We visit deleted entries in case conflict resolution
* decided to delete an index entry.
*
* Note: This sets the timestamp to the startTime of the
* transaction, but that it an arbitrary choice. Since we have
* already validated the transaction the timestamps in the write set
* will be ignored during the mergeDown() operation so the assigned
* value does NOT matter.
*/
final ITupleIterator> tmpItr = tmp.rangeIterator(null, null,
0/* capacity */,
IRangeQuery.DEFAULT | IRangeQuery.DELETED, null/* filter */);
while (tmpItr.hasNext()) {
final ITuple> tuple = tmpItr.next();
if(tuple.isDeletedVersion()) {
writeSet.insert(tuple.getKey(), null, true/* deleted */, false/*putIfAbsent*/,
startTime, null/*tuple*/);
} else {
writeSet.insert(tuple.getKey(), tuple.getValue(),
false/* deleted */, false/*putIfAbsent*/, startTime, null/* tuple */);
}
}
}
// validation succeeded.
return true;
}
/**
 * Merge the transaction scope index onto the then current unisolated index.
 * <p>
 * Note: This method is invoked by a transaction during commit processing to
 * merge the write set of an {@link IsolatedFusedView} into the global
 * scope. This operation does NOT check for conflicts. The pre-condition is
 * that the transaction has already been validated (hence, there will be no
 * conflicts).
 * <p>
 * Note: This method is also responsible for updating the version timestamps
 * that are used to detect write-write conflicts during validation - they
 * are set to the revisionTime.
 *
 * @param revisionTime
 *            The revision timestamp assigned to the commit point of the
 *            transaction. It is written on every entry copied down.
 *
 * @param groundStateSources
 *            The ordered view of the unisolated index. This MUST be the
 *            current view of the ground state as of when the transaction is
 *            validated (NOT when it was created). This view WILL NOT be the
 *            same as the groundState specified to the constructor if
 *            intervening transactions have committed on the index. The
 *            first element absorbs the writes and MUST be a {@link BTree}.
 *
 * @throws ClassCastException
 *             if the first element of <i>groundStateSources</i> is not a
 *             {@link BTree}.
 */
public void mergeDown(final long revisionTime,
        final AbstractBTree[] groundStateSources) {
    /*
     * A read-only view onto the consistent state of the current global
     * scope for that index. We use this ONLY for reading (clearly).
     */
    // final IIndex groundStateScope = new ReadOnlyFusedView(groundStateSources);
    /*
     * A view onto the consistent state of the current global scope for that
     * index. We use this ONLY for reading (clearly).
     */
    final IIndex groundState = (groundStateSources.length == 1 ? groundStateSources[0]
            : new FusedView(groundStateSources));
    /*
     * The btree that is absorbing writes for the index. We need this as a
     * BTree and not a FusedView or IIndex in order to handle all of the
     * cases as cleanly as possible - what matters is having access to the
     * core insert method on AbstractBTree.
     */
    final BTree groundStateWriteSet = (BTree) groundStateSources[0];
    /*
     * Note: the iterator is chosen carefully in order to visit the IValue
     * objects and see both deleted and undeleted entries.
     */
    final ITupleIterator<?> itr = writeSet.rangeIterator(null, null,
            0/* capacity */, ALL/* flags */, null);
    while (itr.hasNext()) {
        // The index entry in the isolated write set.
        final ITuple<?> entry = itr.next();
        // The corresponding key.
        final byte[] key = entry.getKey();
        if (entry.isDeletedVersion()) {
            /*
             * IFF there was a pre-existing version in the global scope then
             * we remove the key from the global scope so that it will now
             * have a "delete marker" for this key.
             */
            if (groundState.contains(key)) {
                // globalScope.remove(key);
                groundStateWriteSet.insert(key, null/* val */, true/* delete */, false/* putIfAbsent */,
                        revisionTime, null/* tuple */);
            } else {
                /*
                 * The deleted version never existed in the unisolated index
                 * so we do not need to record the entry.
                 */
            }
        } else {
            /*
             * Copy the entry down onto the global scope.
             *
             * Note: This writes the [revisionTime] of the transaction on
             * the unisolated index entry.
             */
            groundStateWriteSet.insert(key, entry.getValue(),
                    false/* delete */, false/* putIfAbsent */, revisionTime, null/* tuple */);
        }
    }
}
}