
package com.bigdata.resources;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import com.bigdata.btree.BTree;
import com.bigdata.btree.ILocalBTreeView;
import com.bigdata.btree.IndexMetadata;
import com.bigdata.btree.IndexSegment;
import com.bigdata.btree.IndexSegmentStore;
import com.bigdata.journal.IConcurrencyManager;
import com.bigdata.journal.ITx;
import com.bigdata.journal.TimestampUtility;
import com.bigdata.mdi.IResourceMetadata;
import com.bigdata.mdi.LocalPartitionMetadata;
import com.bigdata.mdi.SegmentMetadata;
import com.bigdata.service.Event;
import com.bigdata.service.EventResource;
/**
* Task builds an {@link IndexSegment} from the mutable {@link BTree} and zero
* or more additional sources in the index partition view and then atomically
* updates the view (aka an incremental build).
*
* The build uses the mutable {@link BTree} as of the lastCommitTime on the old
* journal PLUS ZERO OR MORE additional source(s) taken in view order up to, but
* not including, the first source in the view with significant content. This
* lets us keep the #of {@link IndexSegment}s in the view down without incurring
* the cost of a compacting merge. (The cost of the compacting merge itself
* comes from having a large index segment in the view, generally in the last
* position of the view.) In turn, this keeps the cost of overflow down and can
* be a significant win if there are a number of large index partitions, each of
* which receives only a few writes in each overflow.
*
* For example, assuming a large index segment exists from a previous compacting
* merge, then once the #of writes exceeds the "copy" threshold there will be an
* index build. The view will then have [live, smallSeg1, largeSeg1]. The next
* time the copy threshold is exceeded we would get [live, smallSeg2, smallSeg1,
* largeSeg1]. However, if we include smallSeg1 in the build, then we get [live,
* smallSeg2, largeSeg1]. This can continue until we have enough data to warrant
* a split or until we have another "large" segment but not yet enough data to
* split, at which point we get [live, largeSeg2, largeSeg1] and then [live,
* smallSeg3, largeSeg2, largeSeg1].
*
* Note: As its last action, this task submits a
* {@link AtomicUpdateIncrementalBuildTask} which replaces the view with one
* defined by the current {@link BTree} on the journal and the newly built
* {@link IndexSegment}.
*
* Note: If the task fails, then the output {@link IndexSegment} will be
* deleted.
*
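* A minimal usage sketch. In practice the asynchronous overflow manager
* selects and submits this task, so the snippet below is illustrative only
* (how the {@link ViewMetadata} is obtained is assumed):
*
* <pre>
* final ViewMetadata vmd = ...; // metadata for the index partition view
* final BuildResult result = (BuildResult) concurrencyManager
*         .submit(new IncrementalBuildTask(vmd)).get();
* </pre>
*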
* @author Bryan Thompson
* @version $Id$
*/
public class IncrementalBuildTask extends AbstractPrepareTask {
final private ViewMetadata vmd;
/**
* @param vmd
* Metadata about the index partition view.
*/
public IncrementalBuildTask(final ViewMetadata vmd) {
super(vmd.resourceManager, TimestampUtility
.asHistoricalRead(vmd.commitTime), vmd.name);
this.vmd = vmd;
}
@Override
protected void clearRefs() {
// release soft references.
vmd.clearRef();
}
/**
* Build an {@link IndexSegment} from one or more sources for an index
* partition view. The sources are chosen in view order. New sources are
* incorporated until too much work would be performed for the lightweight
* semantics of "build". If all sources are incorporated by the build, then
* the result is identical to a compacting merge.
*
* @return The {@link BuildResult}.
*/
protected BuildResult doTask() throws Exception {
final Event e = new Event(resourceManager.getFederation(),
new EventResource(vmd.indexMetadata), OverflowActionEnum.Build,
vmd.getParams()).start();
BuildResult buildResult = null;
try {
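/*
* Sanity check: this task must only run during asynchronous overflow
* processing, while overflow is disallowed.
*/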
if (resourceManager.isOverflowAllowed())
throw new IllegalStateException();
try {
/*
* Figure out which sources will be used in the build operation.
* The sources are chosen in order. The first source is always
* a BTree on a journal and is always in the accepted view.
*
* Note: The order of the sources MUST be maintained. This
* ensures that the generated index segment will preserve only
* the most recently written tuple (or delete marker) for each
* tuple in the accepted view. We are only permitted to purge
* deleted tuples when all sources are accepted in the build
* view since that is the only time we have a guarantee that
* there is not a delete version of that tuple further back in
* history which would reemerge if we dropped the delete marker.
*/
final BuildViewMetadata buildViewMetadata = new BuildViewMetadata(
vmd.getView(),
resourceManager.maximumBuildSegmentBytes, e);
e.addDetails(buildViewMetadata.getParams());
if (INFO)
log.info("acceptedView: " + buildViewMetadata);
/*
* Build the index segment from a view comprised of just the
* accepted sources.
*/
buildResult = resourceManager.buildIndexSegment(vmd.name,
buildViewMetadata.acceptedView,
buildViewMetadata.compactingMerge, vmd.commitTime,
null/* fromKey */, null/* toKey */, e);
e.addDetails(buildResult.getParams());
if (buildResult.sourceCount != buildViewMetadata.naccepted) {
throw new AssertionError("Build result has "
+ buildResult.sourceCount + " sources, but expected "
+ buildViewMetadata.naccepted + " : acceptedView="
+ buildViewMetadata + ", buildResult=" + buildResult);
}
if (INFO)
log.info("buildResult=" + buildResult);
{
/*
* Verify that the resource manager can open the new index
* segment. This provides verification both that the index
* segment is registered with the store manager and that the
* index segment can be read. However, we do not actually
* read the leaves of the index segment here so there still
* could be errors on the disk.
*/
final IndexSegmentStore segStore = (IndexSegmentStore) resourceManager
.openStore(buildResult.segmentMetadata.getUUID());
assert segStore != null;
if (INFO)
log.info("indexSegmentStore="
+ segStore.loadIndexSegment());
}
} finally {
/*
* Release our hold on the source index partition view. We only
* needed it during the index partition build.
*/
clearRefs();
}
if (buildResult.compactingMerge
&& buildResult.builder.getCheckpoint().length >= resourceManager.nominalShardSize) {
/*
* If a compacting merge was performed and the generated segment
* exceeds the nominal shard size, then do a split here just as if a
* CompactingMerge had been run instead.
*
* Note: This is unlikely since the build does not accept sources if
* they would cause a lot of work. The most likely reasons why this
* would happen are a single index partition on the journal which
* receives all writes, or a journal size which is a healthy
* multiple of the target shard size.
*/
// FIXME reconcile return type and enable post-merge split.
// return new SplitCompactViewTask(vmd.name, buildResult);
}
try {
/*
* Submit task that will update the definition of the index
* partition view and wait for it to complete.
*/
concurrencyManager.submit(
new AtomicUpdateIncrementalBuildTask(resourceManager,
concurrencyManager, vmd.name, vmd.indexMetadata
.getIndexUUID(), buildResult, e)).get();
} catch (Throwable t) {
// make it releasable.
resourceManager.retentionSetRemove(buildResult.segmentMetadata
.getUUID());
// delete the generated index segment.
resourceManager
.deleteResource(buildResult.segmentMetadata.getUUID(), false/* isJournal */);
// wrap and rethrow the exception.
throw new Exception(t);
}
return buildResult;
} finally {
if (buildResult != null) {
/*
* At this point the index segment was either incorporated into
* the new view in a restart safe manner or there was an error.
* Either way, we now remove the index segment store's UUID from
* the retentionSet so it will be subject to the release policy
* of the StoreManager.
*/
resourceManager.retentionSetRemove(buildResult.segmentMetadata
.getUUID());
}
e.end();
}
}
/**
*
* The source is an {@link IndexSegment} that was built from the mutable
* {@link BTree} associated with the lastCommitTime on the old journal of
* some index partition. What we are doing is replacing the role of that
* {@link BTree} on the closed out journal with the {@link IndexSegment}.
* Note that the {@link IndexSegment} contains the same data as the
* {@link BTree} as of the lastCommitTime. The new view (as defined by this
* task) will be selected when the desired timestamp is GTE the
* lastCommitTime. The old view will be used whenever the desired timestamp
* is LT the lastCommitTime.
*
*
* <pre>
* journal A
* view={A,...}
* ---- sync overflow begins ----
* create journal B
* view={B,A,...}
* Begin incremental build of segment from A (just the BTree state as
* identified by the lastCommitTime)
* ---- sync overflow ends ----
* ... build continues ...
* ... writes against view={B,A,...} are written on B.
* ... index segment S0 complete (based on A).
* ...
* atomic update task runs: view={B,S0,...}
* ... writes continue.
* </pre>
*
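* The post-condition view is assembled as sketched below. This mirrors the
* logic in doTask(); the variable names are illustrative only:
*
* <pre>
* newView.add(liveJournal); // [0] the live journal absorbing writes
* newView.add(newIndexSegment); // [1] the newly built index segment
* // then the tail of the old view, skipping the sources consumed by the build
* for (int i = 1 + buildResult.sourceCount; i &lt; oldView.length; i++)
*     newView.add(oldView[i]);
* </pre>
*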
* @author Bryan Thompson
* @version $Id$
*/
static protected class AtomicUpdateIncrementalBuildTask extends
AbstractAtomicUpdateTask {
/**
* The expected UUID of the scale-out index.
*/
final protected UUID indexUUID;
final protected BuildResult buildResult;
final private Event parentEvent;
/**
* @param resourceManager
* @param concurrencyManager
* @param resource
* @param indexUUID
* @param buildResult
* @param parentEvent
*/
public AtomicUpdateIncrementalBuildTask(ResourceManager resourceManager,
IConcurrencyManager concurrencyManager, String resource,
UUID indexUUID, BuildResult buildResult, Event parentEvent) {
super(resourceManager, ITx.UNISOLATED, resource);
if (indexUUID == null)
    throw new IllegalArgumentException();
if (buildResult == null)
    throw new IllegalArgumentException();
if (!resource.equals(buildResult.name))
    throw new IllegalArgumentException();
if (parentEvent == null)
throw new IllegalArgumentException();
this.indexUUID = indexUUID;
this.buildResult = buildResult;
this.parentEvent = parentEvent;
}
/**
* Atomic update.
*
* @return The ordered array of resources that define the post-condition
*         view.
*/
@Override
protected IResourceMetadata[] doTask() throws Exception {
// populated with the description of the ordered sources of the new view.
final List<IResourceMetadata> newView = new LinkedList<IResourceMetadata>();
/*
* Note: The event is labeled a "build" even if all sources
* participate in the build. This makes it easier to identify the
* compacting merges in the events log. The compacting merges are of
* interest since they are only triggered when the #of sources in
* the view grows too large and they require more effort. By
* contrast, some "builds" will in fact be compacting merges, but
* they were selected as builds and they are compacting merges by
* virtue of having so little work to do that it is cheaper to use
* all sources in the view and thereby postpone a more intensive
* compacting merge somewhat longer.
*/
final Map v = buildResult.getParams();
v.put("summary", OverflowActionEnum.Build + "(" + buildResult.name
+ ")");
final Event updateEvent = parentEvent.newSubEvent(
OverflowSubtaskEnum.AtomicUpdate).start();
try {
if (resourceManager.isOverflowAllowed())
throw new IllegalStateException();
final SegmentMetadata segmentMetadata = buildResult.segmentMetadata;
if (INFO)
log.info(buildResult.toString());
/*
* Open the unisolated B+Tree on the live journal that is
* absorbing writes. We are going to update its index metadata.
*
* Note: I am using AbstractTask#getIndex(String name) so that
* the concurrency control logic will notice the changes to the
* BTree and cause it to be checkpointed if this task succeeds
* normally.
*/
final ILocalBTreeView view = getIndex(getOnlyResource());
// The live B+Tree.
final BTree btree = view.getMutableBTree();
// make sure that we are working with the same index.
assertSameIndex(indexUUID, btree);
if (view instanceof BTree) {
/*
* Note: there is an expectation that this is not a simple
* BTree because the build task is supposed to be
* invoked after an overflow event (or a view checkpoint),
* and that event should have re-defined the view to include
* the BTree on the new journal plus the historical view.
*
* One explanation for finding a simple view here is that
* the old index was deleted and a new one created in its
* place. We check that above.
*/
throw new RuntimeException("View is only a B+Tree: name="
+ buildResult.name + ", pmd="
+ view.getIndexMetadata().getPartitionMetadata());
}
if (INFO)
log.info("src=" + getOnlyResource() + ", counter="
+ view.getCounter().get() + ", checkpoint="
+ btree.getCheckpoint());
// clone the current metadata record for the live index.
final IndexMetadata indexMetadata = btree.getIndexMetadata()
.clone();
/*
* This is the index partition definition on the live index -
* the one that will be replaced with a new view as the result
* of this atomic update.
*/
final LocalPartitionMetadata currentpmd = indexMetadata
.getPartitionMetadata();
if (currentpmd == null) {
throw new IllegalStateException(
"Not an index partition: " + getOnlyResource());
}
// Check pre-conditions.
final IResourceMetadata[] currentResources = currentpmd
.getResources();
{
/*
* verify that there are at least two resources in the
* current view:
*
* 1. currentResources[0] is the mutable BTree on the live
* journal
*
* 2. currentResources[1] is either the BTree on the old
* journal (since closed out for writes so it is no longer
* mutable) or a previous snapshot of the mutable BTree
* decoupled from the mutable BTree by a view checkpoint
* operation.
*/
if (currentResources.length < 2) {
throw new IllegalStateException(
"Expecting at least 2 resources in the view: "
+ Arrays.toString(currentResources));
}
if (!currentResources[0].getUUID().equals(
getJournal().getRootBlockView().getUUID())) {
throw new IllegalStateException(
"Expecting live journal to be the first resource: "
+ Arrays.toString(currentResources));
}
/*
* verify that the 2nd resource in the view is also a BTree
* on a journal.
*/
if (!currentResources[1].isJournal()) {
    throw new IllegalStateException(
            "Expecting a journal as the second resource in the view: "
                    + Arrays.toString(currentResources));
}
// Note: This constraint does not apply when a view checkpoint was used.
// /*
// * Verify that the new index segment was built from a view
// * that did not include data from the live journal.
// */
// if (segmentMetadata.getCreateTime() >= getJournal()
// .getRootBlockView().getFirstCommitTime()) {
//
// throw new AssertionError(
// "IndexSegment includes data from the live journal?");
//
// }
}
// new view definition.
final IResourceMetadata[] newResources;
{
// the live journal.
newView.add(getJournal().getResourceMetadata());
/*
* The newly built index segment. This was built from at
* least one source, but it MAY have been built from more
* than one source.
*/
newView.add(segmentMetadata);
/*
* The rest of the components of the old view.
*
* Note: We start copying resources into the view AFTER the
* last source which was included in the view used to
* generate the index segment.
*
* For example, if the index segment was built from a single
* journal (the old journal), then [startIndex := 1 + 1 ==
* 2]. So we retain resources in the current view starting at
* currentResources[2].
*
* If there are 3 sources in the current view (new journal,
* old journal, and an index segment) and the sourceCount
* was 2, then the build was actually a compacting merge and
* [startIndex := 1 + 2 == 3]. Since 3 EQ
* currentResources.length we will not include ANY sources
* from the old view. This is the semantics of a compacting
* merge. All data in the view is captured by the data on
* the live journal and the newly built index segment [live,
* newSeg].
*/
final int startIndex = 1 + buildResult.sourceCount;
for (int i = startIndex; i < currentResources.length; i++) {
newView.add(currentResources[i]);
}
newResources = newView.toArray(new IResourceMetadata[0]);
}
// describe the index partition.
indexMetadata.setPartitionMetadata(new LocalPartitionMetadata(//
currentpmd.getPartitionId(),//
currentpmd.getSourcePartitionId(),//
currentpmd.getLeftSeparatorKey(),//
currentpmd.getRightSeparatorKey(),//
newResources, //
currentpmd.getIndexPartitionCause()
// , currentpmd.getHistory()
// + OverflowActionEnum.Build//
// + "(lastCommitTime="
// + segmentMetadata.getCreateTime()//
// + ",segment="
// + segmentMetadata.getUUID()//
// + ",#buildSources="
// + buildResult.sourceCount//
// + ",merge="
// + buildResult.compactingMerge//
// + ",counter="
// + btree.getCounter().get()//
// + ",oldResources="
// + Arrays.toString(currentResources) + ") "
));
// update the metadata associated with the btree
btree.setIndexMetadata(indexMetadata);
if (INFO)
log.info("Updated view: name=" + getOnlyResource()
+ ", pmd=" + indexMetadata.getPartitionMetadata()
+ toString("oldResources", currentResources)
+ toString("newResources", newResources));
/*
* Verify that the btree recognizes that it needs to be
* checkpointed.
*
* Note: The atomic commit point is when this task commits.
*/
assert btree.needsCheckpoint();
/*
* Update counter to reflect successful index partition build.
*
* Note: All build tasks are reported as builds so that we can
* readily distinguish the tasks which were selected as
* compacting merges from those which were selected as builds.
* If you want to see how many tasks were "effective" compacting
* merges (because all sources were used) then you need to look
* at the events log for the indexSegmentBuild operation.
*/
resourceManager.overflowCounters.indexPartitionBuildCounter.incrementAndGet();
updateEvent.addDetail("newView", newView.toString());
return newResources;
} finally {
updateEvent.end();
}
} // doTask()
} // AtomicUpdate
}