package com.bigdata.resources;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.TimeoutException;
import com.bigdata.btree.BTree;
import com.bigdata.btree.ILocalBTreeView;
import com.bigdata.btree.IndexMetadata;
import com.bigdata.btree.IndexSegment;
import com.bigdata.btree.ScatterSplitConfiguration;
import com.bigdata.btree.proc.BatchLookup;
import com.bigdata.btree.proc.AbstractKeyArrayIndexProcedure.ResultBuffer;
import com.bigdata.btree.proc.BatchLookup.BatchLookupConstructor;
import com.bigdata.io.SerializerUtil;
import com.bigdata.journal.AbstractTask;
import com.bigdata.journal.IConcurrencyManager;
import com.bigdata.journal.IResourceManager;
import com.bigdata.journal.ITx;
import com.bigdata.journal.TimestampUtility;
import com.bigdata.mdi.IResourceMetadata;
import com.bigdata.mdi.LocalPartitionMetadata;
import com.bigdata.mdi.PartitionLocator;
import com.bigdata.mdi.SegmentMetadata;
import com.bigdata.resources.OverflowManager.ResourceScores;
import com.bigdata.service.DataService;
import com.bigdata.service.Event;
import com.bigdata.service.EventResource;
import com.bigdata.service.ILoadBalancerService;
import com.bigdata.service.MetadataService;
import com.bigdata.service.ndx.ClientIndexView;
import com.bigdata.util.Bytes;
/**
* Task builds an {@link IndexSegment} from the fused view of an index partition
* as of some historical timestamp and then atomically updates the view (aka a
* compacting merge).
*
* Note: This task may be used after {@link IResourceManager#overflow()} in
* order to produce a compact view of the index as of the lastCommitTime
* on the old journal.
*
 * Note: As its last action, this task submits an
 * {@link AtomicUpdateCompactingMergeTask} which replaces the view with one
* defined by the current {@link BTree} on the journal and the newly built
* {@link IndexSegment}.
*
* Note: If the task fails, then the generated {@link IndexSegment} will be
* deleted.
*
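 * Note: A minimal usage sketch (hypothetical wiring; in practice this task
 * is created and submitted by asynchronous overflow processing rather than
 * by hand):
 *
 * <pre>
 * // Assumes a resourceManager and concurrencyManager for this data service
 * // and the name of an index partition on it (names are hypothetical).
 * final String name = DataService.getIndexPartitionName("myIndex", 0);
 * final ViewMetadata vmd = new ViewMetadata(resourceManager,
 *         resourceManager.getLiveJournal().getLastCommitTime(), name,
 *         resourceManager.getIndexCounters(name));
 *
 * final BuildResult result = (BuildResult) concurrencyManager.submit(
 *         new CompactingMergeTask(vmd)).get();
 * </pre>
 *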
* @author Bryan Thompson
* @version $Id$
*/
public class CompactingMergeTask extends AbstractPrepareTask<BuildResult> {
final protected ViewMetadata vmd;
/**
*
* @param vmd
* The {@link ViewMetadata} for the index partition.
*/
public CompactingMergeTask(final ViewMetadata vmd) {
super(vmd.resourceManager, TimestampUtility
.asHistoricalRead(vmd.commitTime), vmd.name);
this.vmd = vmd;
}
@Override
protected void clearRefs() {
vmd.clearRef();
}
/**
* Build an {@link IndexSegment} from the compacting merge of an index
* partition.
*
* @return The {@link BuildResult}.
*/
protected BuildResult doTask() throws Exception {
final Event e = new Event(resourceManager.getFederation(),
new EventResource(vmd.indexMetadata),
OverflowActionEnum.Merge, vmd.getParams()).start();
BuildResult buildResult = null;
try {
try {
if (resourceManager.isOverflowAllowed())
throw new IllegalStateException();
/*
* Build the index segment.
*
* Note: Since this is a compacting merge the view on the old
* journal as of the last commit time will be fully captured by
* the generated index segment. However, writes buffered by the
* live journal WILL NOT be present in that index segment and
* the post-condition view will include those writes.
*/
// build the index segment.
buildResult = resourceManager
.buildIndexSegment(vmd.name, vmd.getView(),
true/* compactingMerge */, vmd.commitTime,
null/* fromKey */, null/* toKey */, e);
} finally {
/*
* Release our hold on the source view - we only needed it when
* we did the index segment build.
*/
clearRefs();
}
if (buildResult.builder.getCheckpoint().length >= resourceManager.nominalShardSize) {
/*
* If the size of the generated segment exceeds the nominal shard
* size, then do a split here.
*/
// FIXME reconcile return type and enable post-merge split.
// return new SplitCompactViewTask(vmd.name, buildResult);
}
/*
* @todo error handling should be inside of the atomic update task
* since it has more visibility into the state changes and when we
* can no longer delete the new index segment.
*/
try {
// scale-out index UUID.
final UUID indexUUID = vmd.indexMetadata.getIndexUUID();
// submit task and wait for it to complete
concurrencyManager.submit(
new AtomicUpdateCompactingMergeTask(resourceManager,
concurrencyManager, vmd.name, indexUUID,
buildResult, e.newSubEvent(
OverflowSubtaskEnum.AtomicUpdate, vmd
.getParams()))).get();
// /*
// * Verify that the view was updated. If the atomic update task
// * runs correctly then it will replace the IndexMetadata object
// * on the mutable BTree with a new view containing only the live
// * journal and the new index segment (for a compacting merge).
// * We verify that right now to make sure that the state change
// * to the BTree was noticed and resulted in a commit before
// * returning control to us here.
// *
// * @todo comment this out or replicate for the index build task
// * also?
// */
// concurrencyManager
// .submit(
// new VerifyAtomicUpdateTask(resourceManager,
// concurrencyManager, vmd.name,
// indexUUID, result)).get();
} catch (Throwable t) {
// make it releasable.
resourceManager.retentionSetRemove(buildResult.segmentMetadata
.getUUID());
// delete the generated index segment.
resourceManager
.deleteResource(buildResult.segmentMetadata.getUUID(), false/* isJournal */);
// re-throw the exception
throw new Exception(t);
}
if (resourceManager.compactingMergeWithAfterAction) {
/*
* Consider possible after-actions now that the view is compact.
* If any is selected, then it will be executed in the current
* thread.
*/
final AbstractTask<?> afterActionTask = chooseAfterActionTask();
if (afterActionTask != null) {
afterActionTask.call();
}
}
return buildResult;
} finally {
if (buildResult != null) {
/*
* At this point the index segment was either incorporated into
* the new view in a restart safe manner or there was an error.
* Either way, we now remove the index segment store's UUID from
* the retentionSet so it will be subject to the release policy
* of the StoreManager.
*/
resourceManager.retentionSetRemove(buildResult.segmentMetadata
.getUUID());
}
e.end();
}
}
/**
* Now that the index partition is compact, decide whether we will take any
* after action, such as a move, join, split, tailSplit, or scatterSplit. All
* of these operations are much cheaper while the index is compact, which is
* why we do them here.
*
* Note: asynchronous overflow processing WILL NOT complete until the
* CompactingMergeTask is done. This means that we will still be reading
* from the same journal. As long as we are reading from the same ordered
* set of resources the lastCommitTime chosen here is somewhat arbitrary.
*
* We use the updated view metadata as of the last commit time on the live
* journal when choosing the after action.
*
* FIXME Concurrent operations can replace the view definition. However,
* what would not be good is if they changed the set of resources in the
* view. The AtomicUpdate of the after action task MUST check for this
* precondition (same set of resources in the view) and abort (and clean up
* any intermediate files) if the precondition has been violated (no harm is
* done if we abort, just some lost work).
*
* @todo split + move and friends seem unnecessarily complicated. We can
* just move anything that is compact. [Clean up the tasks to remove
* this stuff.]
*
* @todo We might be better off running {@link #chooseAfterActionTask()}
* from inside of the atomic update and then doing any work there
* while we have the lock on the shard. This will prevent any new data
* from building up and can help ensure that the preconditions for the
* operation remain valid. This might also help simplify the HA
* design.
*
* @todo Once we have flow control on writes we can save the DS a lot of
* work by not accepting new writes for an index partition when we are
* going to compact it, move it, split it, etc.
*/
private AbstractTask<?> chooseAfterActionTask() {
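/*
 * Note: We take the view metadata as of the current lastCommitTime on the
 * live journal: the compacting merge has already committed, so this
 * reflects the now-compact view rather than the pre-merge view.
 */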
final ViewMetadata vmd = new ViewMetadata(resourceManager,
resourceManager.getLiveJournal().getLastCommitTime(),
this.vmd.name, resourceManager.getIndexCounters(this.vmd.name));
/*
* Scatter split?
*
* Note: Scatter splits are considered before tail splits and normal
* splits since they can only be taken when there is a single index
* partition for a scale-out index. The other kinds of splits are used
* once the index has already been distributed onto the cluster by a
* scatter split.
*/
{
final ScatterSplitConfiguration ssc = vmd.indexMetadata
.getScatterSplitConfiguration();
if ( // only a single index partition?
(vmd.getIndexPartitionCount() == 1L)//
// scatter splits enabled for service
&& resourceManager.scatterSplitEnabled//
// scatter splits enabled for index
&& ssc.isEnabled()//
// The view is compact (only one segment).
&& vmd.compactView//
// trigger scatter split before too much data builds up in one place.
&& vmd.getPercentOfSplit() >= ssc.getPercentOfSplitThreshold()
) {
// Target data services for the new index partitions.
final UUID[] moveTargets = getScatterSplitTargets(ssc);
if (moveTargets != null) {
// #of splits.
final int nsplits = ssc.getIndexPartitionCount() == 0//
? (2 * moveTargets.length) // two per data service.
: ssc.getIndexPartitionCount()//
;
if (log.isInfoEnabled())
log.info("will scatter: " + vmd);
// scatter split task.
return new ScatterSplitTask(vmd, nsplits, moveTargets);
}
}
}
/*
* Tail split?
*
* Note: We can do a tail split as long as we are "close" to a full
* index partition. We have an expectation that the head of the split
* will be over the minimum capacity. While the tail of the split MIGHT
* be under the minimum capacity, if there are continued heavy writes on
* the tail then it should reach the minimum capacity for an index
* partition by the time the live journal overflows again.
*/
if (vmd.isTailSplit() && false) {
/*
* FIXME The current tailSplit implementation operates against the
* BTree, NOT the FusedView and NOT the IndexSegment. It needs to be
* refactored before it can be an after action for a compacting
* merge.
*
* It is written to identify the separator key based on an
* examination of the mutable BTree. Once it has the separator key
* it then does a normal build for each key-range. [@todo It
* probably should use a compacting merge in order to avoid sharing
* index segments across shards.]
*/
if (log.isInfoEnabled())
log.info("Will tailSpl" + vmd.name);
return new SplitTailTask(vmd, null/* moveTarget */);
}
/*
* Should split?
*
* Note: Split is NOT allowed if the index is currently being moved
* onto this data service. Split, join, and move are all disallowed
* until the index partition move is complete since each of them
* would cause the index partition to become invalidated.
*/
if (vmd.getPercentOfSplit() > 1.0) {
if (log.isInfoEnabled())
log.info("will split : " + vmd);
return new SplitIndexPartitionTask(vmd, (UUID) null/* moveTarget */);
}
/*
* Join undercapacity shard (either with local rightSibling or move to
* join with remote rightSibling).
*
* If the rightSibling of an undercapacity index partition is also local
* then a {@link JoinIndexPartitionTask} is used to join those index
* partitions.
*
* If the rightSibling of an undercapacity index partition is remote,
* then a {@link MoveTask} is created to move the undercapacity index
* partition to the remote data service.
*
* Note: joins are only considered when the rightSibling of an index
* partition exists. The last index partition has [rightSeparatorKey ==
* null] and there is no rightSibling for that index partition.
*
* @todo What kinds of guarantees do we have that a local rightSibling
* will be around by the time the JoinIndexPartitionTask runs?
*
* @todo This has even more assumptions about [lastCommitTime] than the
* other tasks. All these tasks need to be reviewed to make sure that
* there are no gaps created by this refactor. Running these after
* action tasks while we hold the write lock on the source shard could
* probably help us to reduce the possibility of any such problems but
* might require a revisit / refactor / simplification of the tasks.
*
* FIXME Make sure that we are not running compacting merges as part of
* the split, scatter split and other tasks. Some tasks used to do this
* in order to have a compact view.
*/
if (resourceManager.joinsEnabled
&& vmd.pmd.getRightSeparatorKey() != null
&& vmd.getPercentOfSplit() < resourceManager.percentOfJoinThreshold) {
final String scaleOutIndexName = vmd.indexMetadata.getName();
final PartitionLocator rightSiblingLocator = getRightSiblingLocator(
scaleOutIndexName, vmd.commitTime);
if (rightSiblingLocator != null) {
final UUID targetDataServiceUUID = rightSiblingLocator
.getDataServiceUUID();
final String[] resources = new String[2];
// the underutilized index partition.
resources[0] = DataService.getIndexPartitionName(
scaleOutIndexName, vmd.pmd.getPartitionId());
// its right sibling (may be local or remote).
resources[1] = DataService
.getIndexPartitionName(scaleOutIndexName,
rightSiblingLocator.getPartitionId());
if (resourceManager.getDataServiceUUID().equals(
targetDataServiceUUID)) {
/*
* JOIN underutilized index partition with its local
* rightSibling.
*
* Note: This is only joining two index partitions at a
* time. It's possible to do more than that if it happens
* that N > 2 underutilized sibling index partitions are on
* the same data service, but that is a relatively unlikely
* combination of events.
*/
if (log.isInfoEnabled())
log.info("Will JOIN: " + Arrays.toString(resources));
final String rightSiblingName = DataService
.getIndexPartitionName(scaleOutIndexName,
rightSiblingLocator.getPartitionId());
final ViewMetadata vmd2 = new ViewMetadata(resourceManager,
vmd.commitTime, rightSiblingName, resourceManager
.getIndexCounters(rightSiblingName));
return new JoinIndexPartitionTask(resourceManager,
vmd.commitTime, resources, new ViewMetadata[] {
vmd, vmd2 });
} else {
/*
* MOVE underutilized index partition to data service
* hosting the right sibling.
*
* @todo The decision to join shards is asymmetric (an
* undercapacity shard is moved to its rightSibling).
* However, it is possible that its rightSibling was also
* undercapacity and was either moved to or locally joined
* with its rightSibling (in which case its partition
* identifier would have been changed). To avoid these edge
* cases there could be a global synchronous agreement for
* move/join decisions.
*/
if (log.isInfoEnabled()) {
// get the target service name.
String targetDataServiceName;
try {
targetDataServiceName = resourceManager
.getFederation().getDataService(
targetDataServiceUUID)
.getServiceName();
} catch (Throwable t) {
targetDataServiceName = targetDataServiceUUID
.toString();
}
log.info("willMoveToJoinWithRightSibling" + "( "
+ vmd.name + " -> " + targetDataServiceName //
+ ", leftSibling=" + resources[0] //
+ ", rightSibling=" + resources[1] //
+ ")");
}
return new MoveTask(vmd, targetDataServiceUUID);
}
} // rightSibling != null
} // if(join)
/*
* Move (to shed or redistribute load).
*
* @todo We should prefer to move smaller shards (faster to move) or
* "hotter" shards (sheds more workload). There should be a way to
* estimate how much workload will be transferred so we know when we are
* done.
*
* FIXME We should limit the #of shards that we move in a given period
* of time to allow both this host and the target host an opportunity to
* adapt to their new load. [An exception would be if this host was
* critically overloaded, but that should probably be handled by
* different logic.]
*/
ILoadBalancerService loadBalancerService = null;
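// Move this shard only if: it is small enough to move cheaply, moves are
// enabled, enough active shards would remain on this service, the load
// balancer can be discovered, and it reports this service as highly
// utilized.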
if (vmd.getPercentOfSplit() < resourceManager.maximumMovePercentOfSplit
&& resourceManager.maximumMovesPerTarget != 0
&& resourceManager.getLiveJournal().getName2Addr().rangeCount() > resourceManager.minimumActiveIndexPartitions
&& (loadBalancerService = getLoadBalancerService()) != null
&& shouldMove(loadBalancerService)) {
// the UUID of this data service.
final UUID sourceServiceUUID = resourceManager.getDataServiceUUID();
// Obtain UUID of a relatively underutilized data service.
final UUID targetDataServiceUUID = getMoveTarget(sourceServiceUUID,
loadBalancerService);
if (targetDataServiceUUID != null) {
if (log.isInfoEnabled()) {
// get the target service name.
String targetDataServiceName;
try {
targetDataServiceName = resourceManager
.getFederation().getDataService(
targetDataServiceUUID)
.getServiceName();
} catch (Throwable t) {
targetDataServiceName = targetDataServiceUUID
.toString();
}
log.info("willMove" + "( " + vmd.name + " -> "
+ targetDataServiceName + ")");
}
// Move the shard to the target host.
return new MoveTask(vmd, targetDataServiceUUID);
}
}
// No after action was chosen.
return null;
}
/**
* Return the {@link ILoadBalancerService} if it can be discovered.
*
* @return the {@link ILoadBalancerService} if it can be discovered and
*         otherwise <code>null</code>.
*/
private ILoadBalancerService getLoadBalancerService() {
// lookup the load balancer service.
final ILoadBalancerService loadBalancerService;
try {
loadBalancerService = resourceManager.getFederation()
.getLoadBalancerService();
} catch (Exception ex) {
log.warn("Could not discover the load balancer service", ex);
return null;
}
if (loadBalancerService == null) {
log.warn("Could not discover the load balancer service");
return null;
}
return loadBalancerService;
}
/**
* Figure out if this data service is considered to be highly utilized, in
* which case the DS should shed some index partitions.
*
* Note: We consult the load balancer service on this since it is able to
* put the load of this service into perspective by also considering the
* load on the other services in the federation.
*
* @param loadBalancerService
* The load balancer.
*
* @return true iff this data service should shed some index partitions by
* moving them to another data service.
*/
protected boolean shouldMove(final ILoadBalancerService loadBalancerService) {
if (loadBalancerService == null)
throw new IllegalArgumentException();
// inquire if this service is highly utilized.
final boolean highlyUtilizedService;
try {
final UUID serviceUUID = resourceManager.getDataServiceUUID();
highlyUtilizedService = loadBalancerService
.isHighlyUtilizedDataService(serviceUUID);
} catch (Exception ex) {
log.warn("Could not determine if this data service is highly utilized");
return false;
}
if (!highlyUtilizedService) {
if(log.isInfoEnabled())
log.info("Service is not highly utilized.");
return false;
}
/*
* At this point we know that the LBS considers this host and service to
* be highly utilized (relative to the other hosts and services). If
* there is evidence of resource exhaustion for critical resources (CPU,
* RAM, or DISK) then we will MOVE index partitions in order to shed
* some load. Otherwise, we will SPLIT hot index partitions in order to
* increase the potential concurrency of the workload for this service.
*
* Note: CPU is the only fungible resource since things will just slow
* down if a host has 100% CPU while it can die if it runs out of DISK
* or RAM (including if it begins to swap heavily).
*
* @todo config options for these triggers.
*/
final ResourceScores resourceScores = resourceManager.getResourceScores();
final boolean shouldMove = //
// heavy CPU utilization.
(resourceScores.percentCPUTime >= resourceManager.movePercentCpuTimeThreshold) ||
// swapping heavily.
(resourceScores.majorPageFaultsPerSec > 20) ||
// running out of disk (data dir).
(resourceScores.dataDirBytesFree < Bytes.gigabyte * 5)||
// running out of disk (tmp dir).
(resourceScores.tmpDirBytesFree < Bytes.gigabyte * .5)
;
return shouldMove;
// if (shouldMove) {
//
// return chooseMoves(loadBalancerService);
//
// }
// return chooseHotSplits();
}
/**
* Obtain the UUID of some relatively underutilized data service.
*
* FIXME The LBS should interpret the excludedServiceUUID as the source
* service UUID and then provide a list of those services having an LBS
* computed service score which is significantly lower than the score for
* this service. Changing this will break some unit tests (for the LBS
* behavior).
*/
private UUID getMoveTarget(final UUID sourceServiceUUID,
final ILoadBalancerService loadBalancerService) {
try {
// request under utilized data service UUIDs (RMI).
final UUID[] uuids = loadBalancerService.getUnderUtilizedDataServices(//
0, // minCount - no lower bound.
1, // maxCount - at most one.
sourceServiceUUID // exclude this data service.
);
if (uuids != null && uuids.length > 0) {
// Found a move target.
return uuids[0];
}
// No move target.
return null;
} catch (TimeoutException t) {
log.warn(t.getMessage());
return null;
} catch (InterruptedException t) {
log.warn(t.getMessage());
return null;
} catch (Throwable t) {
log.error("Could not obtain target service UUIDs: ", t);
return null;
}
}
/**
* Locate the right sibling for this index partition.
*
* Note: default key/val serializers are used.
*
* @return The locator for the right sibling -or- <code>null</code> if no
*         right sibling could be found (which is an error).
*
* @todo This does not have to be a batch lookup any more. It could use the
* {@link ClientIndexView} class.
*/
private PartitionLocator getRightSiblingLocator(
final String scaleOutIndexName, final long lastCommitTime) {
final BatchLookup op = BatchLookupConstructor.INSTANCE.newInstance(
0/* fromIndex */, 1/* toIndex */, new byte[][] { vmd.pmd
.getRightSeparatorKey() }, null/* vals */);
final ResultBuffer resultBuffer;
try {
resultBuffer = (ResultBuffer) resourceManager.getFederation()
.getMetadataService().submit(
TimestampUtility.asHistoricalRead(lastCommitTime),
MetadataService
.getMetadataIndexName(scaleOutIndexName),
op).get();
} catch (Exception e) {
log.error("Could not locate rightSiblings: index="
+ scaleOutIndexName, e);
return null;
}
// the locator for the rightSibling.
return (PartitionLocator) SerializerUtil.deserialize(resultBuffer
.getValues().get(0));
}
/**
* Identify the target data services for the new index partitions.
*
* Note that when maxCount is ZERO (0) ALL joined data services will be
* reported.
*
* Note: This makes sure that _this_ data service is included in the array
* so that we will leave at least one of the post-split index partitions on
* this data service.
*
* @todo For a system which has been up and running for a while we would be
* better off using the LBS reported move targets rather than all
* discovered data services. However, for a new federation we are
* better off with all discovered data services since there is less
* uncertainty about which services will be reported.
*
* @todo move to OverflowManager?
*/
private UUID[] getScatterSplitTargets(final ScatterSplitConfiguration ssc) {
final UUID[] a = resourceManager
.getFederation()
.getDataServiceUUIDs(
ssc.getDataServiceCount()/* maxCount */);
if (a == null || a.length <= 1) {
if (log.isInfoEnabled())
log.info("Will not scatter split - insufficient data services discovered.");
// abort scatter split logic.
return null;
}
final Set<UUID> tmp = new HashSet<UUID>(Arrays.asList(a));
tmp.add(resourceManager.getDataServiceUUID());
return tmp.toArray(new UUID[tmp.size()]);
}
// /**
// * A paranoia test that verifies that the definition of the view was in fact
// * updated.
// *
// * @author Bryan Thompson
// * @version $Id$
// */
// static private class VerifyAtomicUpdateTask extends AbstractTask<Void> {
//
// protected final ResourceManager resourceManager;
//
// final protected BuildResult buildResult;
//
// final private Event updateEvent;
//
// /**
// * @param resourceManager
// * @param concurrencyManager
// * @param resource
// * @param buildResult
// */
// public VerifyAtomicUpdateTask(ResourceManager resourceManager,
// IConcurrencyManager concurrencyManager, String resource,
// UUID indexUUID, BuildResult buildResult, Event updateEvent) {
//
// super(concurrencyManager, ITx.UNISOLATED, resource);
//
// if (resourceManager == null)
// throw new IllegalArgumentException();
//
// if (buildResult == null)
// throw new IllegalArgumentException();
//
// if(!buildResult.compactingMerge)
// throw new IllegalArgumentException();
//
// if(!resource.equals(buildResult.name))
// throw new IllegalArgumentException();
//
// if (updateEvent == null)
// throw new IllegalArgumentException();
//
// this.resourceManager = resourceManager;
//
// this.buildResult = buildResult;
//
// this.updateEvent = updateEvent;
//
// }
//
// /**
// * Verify that the update was correctly registered on the mutable
// * {@link BTree}.
// *
// * @return null
// */
// @Override
// protected Void doTask() throws Exception {
//
// updateEvent.start();
//
// try {
//
// if (resourceManager.isOverflowAllowed())
// throw new IllegalStateException();
//
// final SegmentMetadata segmentMetadata = buildResult.segmentMetadata;
//
// // the correct view definition.
// final IResourceMetadata[] expected = new IResourceMetadata[] {
// // the live journal.
// getJournal().getResourceMetadata(),
// // the newly built index segment.
// segmentMetadata
// };
//
// /*
// * Open the unisolated B+Tree on the live journal that is absorbing
// * writes and verify the definition of the view.
// */
// final ILocalBTreeView view = (ILocalBTreeView) getIndex(getOnlyResource());
//
// // The live B+Tree.
// final BTree btree = view.getMutableBTree();
//
// final LocalPartitionMetadata pmd = btree.getIndexMetadata().getPartitionMetadata();
//
// final IResourceMetadata[] actual = pmd.getResources();
//
// if (expected.length != actual.length) {
//
// throw new RuntimeException("expected=" + expected
// + ", but actual=" + actual);
//
// }
//
// for (int i = 0; i < expected.length; i++) {
//
// if (!expected[i].equals(actual[i])) {
//
// throw new RuntimeException("Differs at index=" + i
// + ", expected=" + expected + ", but actual="
// + actual);
//
// }
//
// }
//
// return null;
//
// } finally {
//
// updateEvent.end();
//
// }
//
// }
//
// }
/**
*
* The source view is pre-overflow (the last writes are on the old journal)
* while the current view is post-overflow (reflects writes made since
* overflow). What we are doing is replacing the pre-overflow history with
* an {@link IndexSegment}.
*
* <pre>
* journal A
* view={A}
* ---- sync overflow begins ----
* create journal B
* view={B,A}
* Begin build segment from view={A} (identified by the lastCommitTime)
* ---- sync overflow ends ----
* ... build continues ...
* ... writes against view={B,A}
* ... index segment S0 complete (based on view={A}).
* ...
* atomic build update task runs: view={B,S0}
* ... writes continue.
* </pre>
*
* @author Bryan Thompson
* @version $Id$
*/
static protected class AtomicUpdateCompactingMergeTask extends
AbstractAtomicUpdateTask<Void> {
private final Event updateEvent;
/**
* The expected UUID of the scale-out index.
*/
final protected UUID indexUUID;
final protected BuildResult buildResult;
/**
* @param resourceManager
* @param concurrencyManager
* @param resource
* @param buildResult
*/
public AtomicUpdateCompactingMergeTask(ResourceManager resourceManager,
IConcurrencyManager concurrencyManager, String resource,
UUID indexUUID, BuildResult buildResult, Event updateEvent) {
super(resourceManager, ITx.UNISOLATED, resource);
if (indexUUID == null)
throw new IllegalArgumentException();
if (buildResult == null)
throw new IllegalArgumentException();
if(!buildResult.compactingMerge)
throw new IllegalArgumentException();
if(!resource.equals(buildResult.name))
throw new IllegalArgumentException();
if (updateEvent == null)
throw new IllegalArgumentException();
this.indexUUID = indexUUID;
this.buildResult = buildResult;
this.updateEvent = updateEvent;
}
/**
* Atomic update.
*
* @return <code>null</code>
*/
@Override
protected Void doTask() throws Exception {
updateEvent.start();
try {
if (resourceManager.isOverflowAllowed())
throw new IllegalStateException();
final SegmentMetadata segmentMetadata = buildResult.segmentMetadata;
if (INFO)
log.info("Begin: name=" + getOnlyResource()
+ ", newSegment=" + segmentMetadata);
/*
* Open the unisolated B+Tree on the live journal that is
* absorbing writes. We are going to update its index metadata.
*
* Note: I am using AbstractTask#getIndex(String name) so that
* the concurrency control logic will notice the changes to the
* BTree and cause it to be checkpointed if this task succeeds
* normally.
*/
final ILocalBTreeView view = (ILocalBTreeView) getIndex(getOnlyResource());
// make sure that this is the same scale-out index.
assertSameIndex(indexUUID, view.getMutableBTree());
if (view instanceof BTree) {
/*
* Note: there is an expectation that this is not a simple
* BTree because this the build task is supposed to be
* invoked after an overflow event, and that event should
* have re-defined the view to include the BTree on the new
* journal plus the historical view.
*
* One explanation for finding a simple view here is that
* the view was a simple BTree on the old journal and the
* data was copied from the old journal into the new journal
* and then someone decided to do a build even though a
* copy had already been done. However, this is not a very
* good explanation since we try to avoid doing a build if
* we have already done a copy!
*/
throw new RuntimeException("View is only a B+Tree: name="
+ buildResult.name + ", pmd="
+ view.getIndexMetadata().getPartitionMetadata());
}
// The live B+Tree.
final BTree btree = view.getMutableBTree();
assert btree != null : "Expecting index: " + getOnlyResource();
if (INFO)
log.info("src=" + getOnlyResource() + ",counter="
+ view.getCounter().get() + ",checkpoint="
+ btree.getCheckpoint());
// clone the current metadata record for the live index.
final IndexMetadata indexMetadata = btree.getIndexMetadata()
.clone();
/*
* This is the index partition definition on the live index -
* the one that will be replaced with a new view as the result
* of this atomic update.
*/
final LocalPartitionMetadata currentpmd = indexMetadata
.getPartitionMetadata();
// Check pre-conditions.
if (currentpmd == null) {
throw new IllegalStateException(
"Not an index partition: " + getOnlyResource());
}
final IResourceMetadata[] currentResources = currentpmd
.getResources();
{
if (!currentResources[0].getUUID().equals(
getJournal().getRootBlockView().getUUID())) {
throw new IllegalStateException(
"Expecting live journal to be the first resource: "
+ Arrays.toString(currentResources));
}
/*
* Note: I have commented out a bunch of pre-condition tests
* that are not valid for histories such as:
*
* history=create() register(0) split(0)
* copy(entryCount=314)
*
* This case arises when there are not enough index entries
* written on the journal after a split to warrant a build
* so the buffered writes are just copied to the new
* journal. The resources in the view are:
*
* 1. journal 2. segment
*
* And this update will replace the segment.
*/
// // the old journal's resource metadata.
// final IResourceMetadata oldJournalMetadata =
// oldResources[1];
// assert oldJournalMetadata != null;
// assert oldJournalMetadata instanceof JournalMetadata :
// "name="
// + getOnlyResource() + ", old pmd=" + oldpmd
// + ", segmentMetadata=" + buildResult.segmentMetadata;
//
// // live journal must be newer.
// assert journal.getRootBlockView().getCreateTime() >
// oldJournalMetadata
// .getCreateTime();
// new index segment build from a view that did not include
// data from the live journal.
assert segmentMetadata.getCreateTime() < getJournal()
.getRootBlockView().getFirstCommitTime() : "segment createTime LT journal 1st commit time"
+ ": segmentMetadata="
+ segmentMetadata
+ ", journal: " + getJournal().getRootBlockView();
// if (oldResources.length == 3) {
//
// // the old index segment's resource metadata.
// final IResourceMetadata oldSegmentMetadata =
// oldResources[2];
// assert oldSegmentMetadata != null;
// assert oldSegmentMetadata instanceof SegmentMetadata;
//
// assert oldSegmentMetadata.getCreateTime() <=
// oldJournalMetadata
// .getCreateTime();
//
// }
}
// new view definition.
final IResourceMetadata[] newResources = new IResourceMetadata[] {
// the live journal.
getJournal().getResourceMetadata(),
// the newly built index segment.
segmentMetadata };
// describe the index partition.
indexMetadata.setPartitionMetadata(new LocalPartitionMetadata(//
currentpmd.getPartitionId(),//
currentpmd.getSourcePartitionId(),//
currentpmd.getLeftSeparatorKey(),//
currentpmd.getRightSeparatorKey(),//
newResources, //
currentpmd.getIndexPartitionCause()
// currentpmd.getHistory()
// + OverflowActionEnum.Merge//
// + "(lastCommitTime="
// + segmentMetadata.getCreateTime()//
// + ",btreeEntryCount="
// + btree.getEntryCount()//
// + ",segmentEntryCount="
// + buildResult.builder.getCheckpoint().nentries//
// + ",segment="
// + segmentMetadata.getUUID()//
// + ",counter="
// + btree.getCounter().get()//
// + ",oldResources="
// + Arrays.toString(currentResources) + ") "
));
// update the metadata associated with the btree
btree.setIndexMetadata(indexMetadata);
if (INFO)
log.info("Updated view: name=" + getOnlyResource()
+ ", pmd=" + indexMetadata.getPartitionMetadata());
/*
* Verify that the btree recognizes that it needs to be
* checkpointed.
*
* Note: The atomic commit point is when this task commits.
*/
assert btree.needsCheckpoint();
// btree.writeCheckpoint();
// {
// final long id0 = btree.getCounter().get();
// final long pid = id0 >> 32;
// final long mask = 0xffffffffL;
// final int ctr = (int) (id0 & mask);
// log.warn("name="+getOnlyResource()+", counter="+id0+", pid="+pid+", ctr="+ctr);
// }
// notify successful index partition build.
resourceManager.overflowCounters.indexPartitionMergeCounter.incrementAndGet();
return null;
} finally {
updateEvent.end();
}
} // doTask()
} // class AtomicUpdate
}