
/*
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
licenses@blazegraph.com
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on Feb 29, 2008
*/
package com.bigdata.resources;
import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.net.InetSocketAddress;
import java.util.Map;
import java.util.UUID;
import java.util.concurrent.ExecutionException;
import com.bigdata.btree.BTree;
import com.bigdata.btree.IIndex;
import com.bigdata.btree.ILocalBTreeView;
import com.bigdata.btree.IRangeQuery;
import com.bigdata.btree.IndexMetadata;
import com.bigdata.btree.IndexSegment;
import com.bigdata.btree.IndexSegmentStore;
import com.bigdata.btree.proc.IIndexProcedure;
import com.bigdata.journal.AbstractTask;
import com.bigdata.journal.ConcurrencyManager;
import com.bigdata.journal.ITx;
import com.bigdata.journal.NoSuchIndexException;
import com.bigdata.journal.TimestampUtility;
import com.bigdata.mdi.IResourceMetadata;
import com.bigdata.mdi.IndexPartitionCause;
import com.bigdata.mdi.LocalPartitionMetadata;
import com.bigdata.mdi.MetadataIndex;
import com.bigdata.mdi.PartitionLocator;
import com.bigdata.mdi.SegmentMetadata;
import com.bigdata.service.DataService;
import com.bigdata.service.DataServiceCallable;
import com.bigdata.service.Event;
import com.bigdata.service.EventResource;
import com.bigdata.service.IDataService;
import com.bigdata.service.IMetadataService;
import com.bigdata.service.MetadataService;
import com.bigdata.service.ResourceService;
/**
* Task moves an index partition to another {@link IDataService}.
*
* This task runs as a historical read operation and copies the view of the
* index partition as of the lastCommitTime of the old journal to another
* {@link IDataService}. Once that historical view has been copied, this task
* then submits an {@link AtomicUpdate} task. The atomic
* update is an {@link ITx#UNISOLATED} operation. It is responsible for copying
* any writes buffered for the index partition on the live journal to the
* target {@link IDataService} and then updating the {@link MetadataIndex}.
* Once the atomic update task is finished, clients will discover that the
* source index partition does not exist. When they query the
* {@link MetadataService} they will discover that the key(-range) is now
* handled by the new index partition on the target {@link IDataService}.
*
* Note: This task is run on the target {@link IDataService} and it copies the
* data from the source {@link IDataService}. This allows us to use standard
* {@link IRangeQuery} operations to copy the historical view. However, the
* {@link AtomicUpdate} task is run on the source
* {@link IDataService} since it needs to obtain an exclusive lock on the index
* partition that is being moved in order to prevent concurrent writes during
* the atomic cutover. For the same reason, the
* {@link AtomicUpdate} task cannot use standard
* {@link IRangeQuery} operations. Instead, it initiates a series of data
* transfers while holding onto the exclusive lock until the target
* {@link IDataService} has the current state of the index partition. At that
* point it notifies the {@link IMetadataService} to perform the atomic cutover
* to the new index partition.
*
* Note: This task does NOT cause any resources associated with the current view
* of the index partition to be released on the source {@link IDataService}.
* The reason is two-fold. First, the {@link IndexSegment}(s) associated with
* that view MAY be in use by historical views. Second, there MAY be historical
* commit points for the index partition on the live journal before the atomic
* cutover to the new {@link IDataService} - those historical commit points MUST
* be preserved until the release policy for those views has been satisfied.
*
* Note: The MOVE task MUST be explicitly coordinated with the target
* {@link IDataService}. Failure to coordinate the move results in an error
* message reported by the {@link MetadataService} indicating that the wrong
* partition locator was found under the key. The cause is a MOVE operation
* during which the target data service undergoes concurrent synchronous (and
* then asynchronous) overflow. What happens is that the {@link MoveTask}
* registers the new index partition on the target data service. Once
* registered on the {@link IDataService}, the index partition is visible
* during synchronous
* overflow BEFORE the MOVE is complete and BEFORE the index is registered with
* the {@link MetadataService} and hence discoverable to clients. If the target
* {@link IDataService} then undergoes synchronous and asynchronous overflow and
* chooses an action which would change the index partition definition (split,
* join, or move) WHILE the index partition is still being moved onto the target
* {@link IDataService} THEN the MOVE is not atomic and the definition of the
* index partition in the {@link MetadataService} will not coherently reflect
* either the MOVE or the action chosen by the target {@link IDataService},
* depending on which one makes its atomic update first.
*
* The target {@link IDataService} MAY undergo both synchronous and asynchronous
* overflow as {@link IDataService}s are designed to allow continued writes
* during those operations. Further, it MAY choose to copy, build, or compact
* the index partition while it is being moved. However, it MUST NOT choose any
* action (split, join, or move) that would change the index partition
* definition until the move is complete (whether it ends in success or
* failure).
*
* This issue is addressed by the following protocol:
* <ul>
*
* <li>The {@link MoveTask} sets the <code>sourcePartitionId</code> on the
* {@link LocalPartitionMetadata} when it registers the index partition on the
* target {@link IDataService}. When <code>sourcePartitionId != -1</code>,
* the target {@link IDataService} is restricted, for that index partition, to
* overflow actions which do not change the index partition definition (copy,
* build, or merge). Further, any index partition found on restart by the
* target {@link IDataService} whose <code>sourcePartitionId != -1</code> is
* deleted as it was never successfully put into play (this prevents partial
* moves from accumulating state which could not otherwise be released).</li>
*
* <li>The atomic update task causes the <code>sourcePartitionId</code> to be
* set to <code>-1</code> as one of its last actions, thereby allowing the
* target {@link IDataService} to use operations that could re-define the index
* partition (split, join, move) and also preventing the target index partition
* from being deleted on restart.</li>
*
* </ul>
*
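* A hedged sketch of the restart-time check implied by this protocol (the
* accessor name on {@link LocalPartitionMetadata} is an assumption based on
* the description above, so treat it as illustrative only):
*
* <pre>
* // On restart, discard any index partition whose move never completed.
* final LocalPartitionMetadata pmd = indexMetadata.getPartitionMetadata();
* if (pmd.getSourcePartitionId() != -1) {
*     // Never put into play: dropping it releases the partial move's state.
*     dropIndex(name);
* }
* </pre>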
*
* FIXME javadoc
*
* Note: There are only two entry points: a simple move and a move where the
* compacting merge has already been performed, e.g., by a split, and we just
* need to do the atomic update phase.
*
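* A minimal usage sketch, assuming the task is submitted via the source data
* service's concurrency manager (the wiring shown is illustrative rather than
* part of this class's contract):
*
* <pre>
* // Move the index partition described by [vmd] to the target data service.
* final MoveTask task = new MoveTask(vmd, targetDataServiceUUID);
* final MoveResult result = resourceManager.getConcurrencyManager()
*         .submit(task).get();
* </pre>
*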
* @author Bryan Thompson
*/
public class MoveTask extends AbstractPrepareTask {
private final ViewMetadata vmd;
/**
* {@link UUID} of the target {@link IDataService} (the one to which the index
* partition will be moved).
*/
private final UUID targetDataServiceUUID;
/**
* The partition identifier for the target index partition that will be
* created by the move (RMI).
*/
final int newPartitionId;
/**
* The name of the new index partition on the target data service.
*/
final String targetIndexName;
/**
* The summary used for the event description and the partition history
* record.
*/
private final String summary;
/**
* The event corresponding to this action.
*/
private final Event e;
/**
* @param vmd
* Metadata for the source index partition view.
* @param targetDataServiceUUID
* The UUID for the target data service.
*/
public MoveTask(//
final ViewMetadata vmd,//
final UUID targetDataServiceUUID//
) {
super(vmd.resourceManager, TimestampUtility
.asHistoricalRead(vmd.commitTime), vmd.name);
if (targetDataServiceUUID == null)
throw new IllegalArgumentException();
if (resourceManager.getDataServiceUUID().equals(targetDataServiceUUID)) {
throw new IllegalArgumentException("Same data service: "
+ targetDataServiceUUID);
}
this.vmd = vmd;
this.targetDataServiceUUID = targetDataServiceUUID;
this.newPartitionId = resourceManager.nextPartitionId(vmd.indexMetadata
.getName());
this.targetIndexName = DataService.getIndexPartitionName(
vmd.indexMetadata.getName(), newPartitionId);
this.summary = OverflowActionEnum.Move + "(" + vmd.name + "->"
+ targetIndexName + ")";
final Map<String, Object> params = vmd.getParams();
params.put("summary", summary);
this.e = new Event(resourceManager.getFederation(), new EventResource(
vmd.indexMetadata), OverflowActionEnum.Move, params);
}
@Override
protected void clearRefs() {
vmd.clearRef();
}
/**
* Builds a compact index segment from the historical view as of the last
* commit time on the old journal and then submits an atomic update
* operation to move the source index partition to the target data service.
*
* @return A {@link MoveResult} describing the move operation (this is
* returned mainly for historical reasons).
*/
@Override
protected MoveResult doTask() throws Exception {
e.start();
BuildResult historicalWritesBuildResult = null;
try {
if (resourceManager.isOverflowAllowed())
throw new IllegalStateException();
try {
// view of the source index partition.
final ILocalBTreeView src = getIndex(vmd.name);
/*
* Do a compacting merge of the historical view in order to
* obtain a dense index segment.
*
* Note: This will also apply the overflow handler, so any blobs
* managed by the index partition will be found in the blobs
* region of the generated index segment.
*/
historicalWritesBuildResult = resourceManager
.buildIndexSegment(vmd.name, src,
true/* compactingMerge */, vmd.commitTime,
null/* fromKey */, null/* toKey */, e);
if (INFO)
log.info("Generated compact index segment from historical view: "
+ historicalWritesBuildResult);
} finally {
/*
* While we still need to copy the buffered writes on the live
* journal to the target index partition, at this point we no
* longer require the source index partition view (the view on
* the old journal) so we clear our references for that index.
*/
clearRefs();
}
/*
* Atomic move of the index partition.
*/
final MoveResult moveResult = doAtomicUpdate(resourceManager,
vmd.name, historicalWritesBuildResult,
targetDataServiceUUID, newPartitionId, e);
if (INFO)
log.info("Successfully moved index partition: " + summary);
return moveResult;
} finally {
if (historicalWritesBuildResult != null) {
/*
* At this point the index segment was either MOVEd to the
* target data service or there was an error. Either way, we now
* remove the index segment store's UUID from the retentionSet
* so it will be subject to the release policy of the
* StoreManager.
*/
resourceManager
.retentionSetRemove(historicalWritesBuildResult.segmentMetadata
.getUUID());
/*
* Delete the index segment since it is no longer required and
* was not incorporated into a view used by this data service.
*/
resourceManager.deleteResource(
historicalWritesBuildResult.segmentMetadata.getUUID(),
false/* isJournal */);
}
e.end();
}
}
/**
* Submits an {@link AtomicUpdate} and awaits and returns its outcome.
*
* @param resourceManager
* The resource manager.
* @param sourceIndexName
* The name of the source index partition.
* @param historicalWritesBuildResult
* An index segment containing all data for the source view as of
* the last commit time on the old journal. This index segment
* should be generated by a compacting merge or by an index
* partition split with the same semantics so that we will move
* the minimum amount of data.
* @param targetDataServiceUUID
* The {@link UUID} of the target data service.
* @param targetIndexPartitionId
* The partition identifier assigned to the target index
* partition.
* @param parentEvent
* The parent event for this operation.
*
* @return The {@link MoveResult} describing the outcome of the atomic
* update.
*
* @throws ExecutionException
* @throws InterruptedException
*/
static protected MoveResult doAtomicUpdate(
final ResourceManager resourceManager,//
final String sourceIndexName,//
final BuildResult historicalWritesBuildResult,//
final UUID targetDataServiceUUID, //
final int targetIndexPartitionId,//
final Event parentEvent//
) throws InterruptedException, ExecutionException {
return resourceManager.getConcurrencyManager().submit(
new AtomicUpdate(resourceManager, sourceIndexName,
historicalWritesBuildResult, targetDataServiceUUID,
targetIndexPartitionId, parentEvent)).get();
}
/**
* Moves an index partition from this data service to another data service.
*
* This is an "atomic update" operation. It moves an index segment (supplied
* by the caller) containing the historical view of the source index
* partition and generates and moves an index segment containing any
* buffered writes on the live journal for the source index partition to the
* target data service. Once the target index partition is registered on the
* target data service and the {@link IMetadataService} has been updated to
* reflect the move, this task updates the stale locator cache. At that
* point clients addressing tasks to the source index partition will
* discover that it has been moved.
*
* Note: If the operation fails, then it has no side-effects, but the caller
* is responsible for deleting the <code>historicalWritesBuildResult</code> if
* that is deemed necessary (that is, if it is not in use then either put it
* to use or delete it -- an attractive alternative is to incorporate it
* into the source index partition view instead).
*
* Tasks executing after this one will discover that the source index
* partition no longer exists as of the timestamp when this task commits.
* Clients that submit tasks for the source index partition will be notified
* that it no longer exists. When the client queries the
* {@link MetadataService} it will discover that the key range has been
* assigned to a new index partition - the one on the target data service.
*
* Note: This task runs as an {@link ITx#UNISOLATED} operation since it MUST
* have an exclusive lock in order to ensure that the buffered writes are
* transferred to the target index partition without allowing concurrent
* writes on the source index partition.
*
* Note: I have placed the "receive" of the historical index partition view
* within the atomic update task deliberately. It should add at most a few
* seconds to the execution time of that task and makes it easier to write
* corrective actions for the atomic update, since we can offer a guarantee:
* the existence of the target index partition on the target data service is
* sufficient to determine that the entire operation was successful.
*
* @author Bryan Thompson
*
* @todo optimization to NOT send an empty index segment if there are no
* buffered writes on the live journal.
*/
protected static class AtomicUpdate extends AbstractAtomicUpdateTask {
private final ResourceManager resourceManager;
private final String sourceIndexName;
private final BuildResult historicalWritesBuildResult;
private final UUID targetDataServiceUUID;
private final int targetIndexPartitionId;
private final Event parentEvent;
// private final InetAddress thisInetAddr;
/**
*
* @param resourceManager
* The resource manager.
* @param sourceIndexName
* The name of the source index partition.
* @param historicalWritesBuildResult
* An index segment containing all data for the source view
* as of the last commit time on the old journal. This index
* segment should be generated by a compacting merge or by an
* index partition split with the same semantics so that we
* will move the minimum amount of data.
* @param targetDataServiceUUID
* The {@link UUID} of the target data service.
* @param targetIndexPartitionId
* The partition identifier assigned to the target index
* partition.
* @param parentEvent
* The parent event for this operation.
*/
protected AtomicUpdate(//
final ResourceManager resourceManager,//
final String sourceIndexName,//
final BuildResult historicalWritesBuildResult,//
final UUID targetDataServiceUUID,//
final int targetIndexPartitionId,//
final Event parentEvent//
) {
super(resourceManager, ITx.UNISOLATED, sourceIndexName);
if (historicalWritesBuildResult == null)
throw new IllegalArgumentException();
if (targetDataServiceUUID == null)
throw new IllegalArgumentException();
if (parentEvent == null)
throw new IllegalArgumentException();
this.resourceManager = resourceManager;
this.sourceIndexName = sourceIndexName;
this.historicalWritesBuildResult = historicalWritesBuildResult;
this.targetDataServiceUUID = targetDataServiceUUID;
this.targetIndexPartitionId = targetIndexPartitionId;
this.parentEvent = parentEvent;
// try {
// this.thisInetAddr = InetAddress.getByName(NicUtil.getIpAddress("default.nic", "default", false));
// } catch(Throwable t) {
// throw new IllegalArgumentException(t.getMessage(), t);
// }
}
/**
* Atomic update (move).
*
* @return A {@link MoveResult} describing the operation (this is
* returned mainly for historical reasons).
*/
@Override
public MoveResult doTask() throws Exception {
final Event e = parentEvent.newSubEvent(
OverflowSubtaskEnum.AtomicUpdate).start();
BuildResult bufferedWritesBuildResult = null;
try {
// Unisolated view of the source index partition.
final BTree src = getIndex(getOnlyResource()).getMutableBTree();
// The current index metadata record.
final IndexMetadata indexMetadata = src.getIndexMetadata();
// The name of the scale-out index whose index partition is
// being moved.
final String scaleOutIndexName = indexMetadata.getName();
// The name of the target index partition.
final String targetIndexName = DataService
.getIndexPartitionName(scaleOutIndexName,
targetIndexPartitionId);
// The current metadata for the source index partition view.
final LocalPartitionMetadata pmd = indexMetadata
.getPartitionMetadata();
// The current locator for the source index partition.
final PartitionLocator oldLocator = new PartitionLocator(//
pmd.getPartitionId(),//
resourceManager.getDataServiceUUID(),//
pmd.getLeftSeparatorKey(),//
pmd.getRightSeparatorKey()//
);
// The locator for the target index partition.
final PartitionLocator newLocator = new PartitionLocator(
targetIndexPartitionId,//
targetDataServiceUUID,//
pmd.getLeftSeparatorKey(),//
pmd.getRightSeparatorKey()//
);
/*
* Build an index segment from the buffered writes on the live
* journal for the source index partition.
*
* Note: DO NOT specify a compacting merge. That presumes that
* the entire view is being processed so that deleted tuples may
* be removed from the view. That is NOT the case here. There
* MAY be deleted tuples in the buffered writes and those MUST
* be included in the generated index segment so that the total
* view when reconstructed on the target data service will still
* report that those tuples are deleted. If you do not do this
* then a historical tuple could "reappear" after the move.
*
* Note: The [createTime] for the generated index segment store
* will reflect the commit point for the last buffered write on
* the source index partition.
*/
final long sourceCommitTime = src.getLastCommitTime();
bufferedWritesBuildResult = resourceManager.buildIndexSegment(
sourceIndexName, src, false/* compactingMerge */,
sourceCommitTime, null/* fromKey */, null/* toKey */,
parentEvent);
{
final IDataService targetDataService = resourceManager
.getFederation().getDataService(
targetDataServiceUUID);
if (targetDataService == null)
throw new Exception("No such data service: "
+ targetDataServiceUUID);
/*
* Submit task to the target data service that will copy the
* index segment store resources onto that data service and
* register the target index partition using the given
* IndexMetadata and the copied index segment store files.
*/
{
final Event receiveIndexPartitionEvent = parentEvent
.newSubEvent(
OverflowSubtaskEnum.ReceiveIndexPartition)
.start();
try {
targetDataService
.submit(
new ReceiveIndexPartitionTask(
indexMetadata,//
resourceManager
.getDataServiceUUID(),//
targetIndexPartitionId,//
historicalWritesBuildResult.segmentMetadata,//
bufferedWritesBuildResult.segmentMetadata,//
resourceManager
.getResourceService().getAddr()//
)).get();
} catch (ExecutionException ex) {
// The task failed.
rollbackMove(ex, scaleOutIndexName,
targetIndexName, targetDataService,
oldLocator, newLocator);
} catch (InterruptedException ex) {
// Task was interrupted.
rollbackMove(ex, scaleOutIndexName,
targetIndexName, targetDataService,
oldLocator, newLocator);
} catch (IOException ex) {
// RMI failure submitting the task or obtaining its
// outcome.
rollbackMove(ex, scaleOutIndexName,
targetIndexName, targetDataService,
oldLocator, newLocator);
} finally {
receiveIndexPartitionEvent.end();
}
}
/*
* The source index partition has been moved. All we need to
* do is drop the source index partition and notify clients
* that their locators for that key range are stale.
*/
/*
* The index manager will notify tasks that the index
* partition has moved.
*
* Note: At this point, if the commit for this task fails,
* then clients will still be notified that the source index
* partition was moved. That is Ok since it WAS moved.
*/
resourceManager.setIndexPartitionGone(getOnlyResource(),
StaleLocatorReason.Move);
/*
* Drop the old index partition.
*
* Note: This action is rolled back automatically if this
* task fails. The consequence here is that the
* source index partition will remain registered on this
* data service. However, clients are being redirected
* (using stale locator exceptions) to the target data
* service and the metadata index will direct new requests
* to the target data service as well. So the consequence of
* failure here is that the source index partition becomes a
* zombie. It will remain on this data service forever
* unless someone explicitly drops it.
*/
getJournal().dropIndex(getOnlyResource());
// notify successful index partition move.
resourceManager.overflowCounters.indexPartitionMoveCounter.incrementAndGet();
}
return new MoveResult(scaleOutIndexName,
src.getIndexMetadata(), targetDataServiceUUID,
targetIndexPartitionId, oldLocator, newLocator);
} finally {
if (bufferedWritesBuildResult != null) {
/*
* At this point the index segment was either MOVEd to the
* target data service or there was an error. Either way, we
* now remove the index segment store's UUID from the
* retentionSet so it will be subject to the release policy
* of the StoreManager.
*/
resourceManager
.retentionSetRemove(bufferedWritesBuildResult.segmentMetadata
.getUUID());
/*
* Delete the index segment containing the buffered writes
* since it is no longer required by this data service.
*/
resourceManager
.deleteResource(
bufferedWritesBuildResult.segmentMetadata
.getUUID(), false/* isJournal */);
}
e.end();
}
}
/**
* Invoked to roll back a partial move operation.
*
* This handles all cases: from a completely successful move, where only
* the RMI conveying the outcome failed (the exception is logged and this
* method returns normally since the move was in fact successful); to cases
* where it must roll back the change to the MDS (the commit of the target
* index partition failed after it had updated the MDS - in this case some
* clients may temporarily see the locator for the target index partition,
* but they will discover that the index partition does not exist); to cases
* where the move failed before the MDS was updated (no compensating action
* is required).
*
* @throws Exception
* the caller's {@link Throwable}, wrapped if necessary.
*/
private void rollbackMove(//
final Throwable t,//
final String scaleOutIndexName,//
final String targetIndexName,//
final IDataService targetDataService,//
final PartitionLocator oldLocator,//
final PartitionLocator newLocator//
) throws Exception {
/*
* 1. Query the target data service. If the target index partition
* was registered, then we are done and this method will return
* normally rather than re-throwing the exception.
*
* Note: It is not possible for the target index partition to be
* successfully registered on the target data service unless the MDS
* was also updated successfully - this is guaranteed because the
* MDS update occurs within the UNISOLATED task which registers the
* target index partition.
*/
try {
/*
* Figure out whether the target index partition was registered.
*
* Note: An UNISOLATED request and a custom IIndexProcedure
* (which IS NOT marked as read-only) are used deliberately.
* This way, even if the Future was not returned correctly for
* the task which we submitted to receive the index partition on
* the target data service (due to an RMI error) then we are
* guaranteed that our test WILL NOT execute until it can gain a
* lock on the target index partition. This prevents us from
* attempting to verify the outcome of the task before it has
* completed in the odd case where it is running asynchronously
* but we lack its Future.
*/
final IndexMetadata tmp = (IndexMetadata) targetDataService
.submit(ITx.UNISOLATED, targetIndexName,
new IsIndexRegistered_UsingWriteService())
.get();
if (tmp == null) {
/*
* This guards against a potential API change where
* AbstractTask#getIndex(String) would report [null] rather
* than throwing a NoSuchIndexException.
*/
throw new AssertionError("Not expecting [null] return.");
}
log.error("Move successful - ignoring spurious exception: " + t);
// return normally.
return;
} catch (ExecutionException ex) {
if (ex.getCause() instanceof NoSuchIndexException) {
/*
* The target index partition was not registered.
*/
// fall through
} else throw ex;
}
/*
* 2. Query the MDS.
*
* We know that the target index partition was not successfully
* registered on the target data service. Now we query the MDS. If
* the target index partition is assigned by the MDS to the key
* range for the source index partition, then the MDS was updated and
* we roll back that change using a compensating action.
*/
try {
final PartitionLocator current = resourceManager
.getFederation().getMetadataService().get(
scaleOutIndexName, ITx.UNISOLATED,
oldLocator.getLeftSeparatorKey());
final boolean mdsWasUpdated = current.getPartitionId() != oldLocator
.getPartitionId();
if (mdsWasUpdated) {
/*
* Roll back the MDS using a compensating action.
*/
try {
resourceManager.getFederation().getMetadataService()
.moveIndexPartition(scaleOutIndexName,
newLocator, oldLocator);
} catch (Throwable t2) {
log.error("Problem writing MDS? ", t2);
}
}
} catch (Throwable t2) {
log.error("Problem reading MDS? ", t2);
}
/*
* In any case, rethrow the original exception since the move was
* not successful.
*/
if (t instanceof Exception)
throw (Exception)t;
throw new RuntimeException(t);
}
}
/**
* Method used to test whether or not the target index partition was
* successfully registered on the target data service. This class explicitly
* uses the write service in order to guarantee that it cannot execute
* until the "receive" operation is complete.
*
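* A hedged sketch of how this probe is used (mirroring the call in
* rollbackMove above; the variable names are illustrative):
*
* <pre>
* // Blocks until any pending "receive" task releases the write lock.
* final IndexMetadata md = (IndexMetadata) targetDataService.submit(
*         ITx.UNISOLATED, targetIndexName,
*         new IsIndexRegistered_UsingWriteService()).get();
* // A NoSuchIndexException means the partition was never registered.
* </pre>
*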
* @author Bryan Thompson
*/
private static class IsIndexRegistered_UsingWriteService implements
IIndexProcedure {
/**
*
*/
private static final long serialVersionUID = -6492979226768348981L;
@Override
public IndexMetadata apply(final IIndex ndx) {
return ndx.getIndexMetadata();
}
/**
* Note: This procedure is deliberately marked as NOT read-only.
* This ensures that the procedure will not execute until it
* has the exclusive write lock for the index.
*/
@Override
public boolean isReadOnly() {
return false;
}
}
/**
* Receives an index partition, comprising a historical index segment store
* and an index segment store containing the buffered writes, and registers
* the index partition on the data service on which this procedure is
* executed. This class is actually a {@link Serializable} wrapper which
* submits the {@link InnerReceiveIndexPartitionTask} once it is running on
* the target data service.
*
* @see InnerReceiveIndexPartitionTask
*
* @author Bryan Thompson
*/
protected static class ReceiveIndexPartitionTask extends DataServiceCallable {
/**
*
*/
private static final long serialVersionUID = -4277343552510590741L;
final private IndexMetadata sourceIndexMetadata;
final private UUID sourceDataServiceUUID;
final private int targetIndexPartitionId;
final private SegmentMetadata historyIndexSegmentMetadata;
final private SegmentMetadata bufferedWritesIndexSegmentMetadata;
final private InetSocketAddress addr;
// final private int port;
/**
* @param sourceIndexMetadata
* The index metadata for the source index partition.
* @param sourceDataServiceUUID
* @param targetIndexPartitionId
* The index partition identifier assigned to the target
* index partition.
* @param historyIndexSegmentMetadata
* Describes the {@link IndexSegmentStore} containing the
* historical data for the source index partition.
* @param bufferedWritesIndexSegmentMetadata
* Describes the {@link IndexSegmentStore} containing the
* buffered writes from the live journal for the source index
* partition.
* @param addr
* The {@link InetSocketAddress} of the
* {@link ResourceService} running on the source data service
* (the one from which the resources will be copied during
* the move).
*/
ReceiveIndexPartitionTask(//
final IndexMetadata sourceIndexMetadata,//
final UUID sourceDataServiceUUID,//
final int targetIndexPartitionId,//
final SegmentMetadata historyIndexSegmentMetadata,//
final SegmentMetadata bufferedWritesIndexSegmentMetadata,//
final InetSocketAddress addr
) {
this.sourceIndexMetadata = sourceIndexMetadata;
this.sourceDataServiceUUID = sourceDataServiceUUID;
this.targetIndexPartitionId = targetIndexPartitionId;
this.historyIndexSegmentMetadata = historyIndexSegmentMetadata;
this.bufferedWritesIndexSegmentMetadata = bufferedWritesIndexSegmentMetadata;
this.addr = addr;
}
// private transient DataService dataService;
//
// public void setDataService(DataService dataService) {
//
// this.dataService = dataService;
//
// }
//
// protected DataService getDataService() {
//
// if (dataService == null)
// throw new IllegalArgumentException();
//
// return dataService;
//
// }
public Void call() throws Exception {
/*
* The name of the target index partition on the target data
* service. This is formed using the name of the scale-out index and
* the partition identifier that was assigned to the new index
* partition.
*/
final String targetIndexName = DataService.getIndexPartitionName(
sourceIndexMetadata.getName(), targetIndexPartitionId);
/*
* Run the inner task on the write service of the target data
* service.
*/
final ResourceManager resourceManager = getDataService()
.getResourceManager();
try {
getDataService().getConcurrencyManager().submit(
new InnerReceiveIndexPartitionTask(//
resourceManager,//
targetIndexName,//
sourceIndexMetadata,//
sourceDataServiceUUID,//
targetIndexPartitionId,//
historyIndexSegmentMetadata,//
bufferedWritesIndexSegmentMetadata,//
addr//
)).get();
// update the index partition receive counter.
resourceManager.overflowCounters.indexPartitionReceiveCounter.incrementAndGet();
return null;
} finally {
/*
* Regardless of whether the move was successful or not, we now
* remove the received index segment stores from the retention
* set so that these resources become releasable.
*/
resourceManager.retentionSetRemove(historyIndexSegmentMetadata
.getUUID());
resourceManager
.retentionSetRemove(bufferedWritesIndexSegmentMetadata
.getUUID());
}
}
}
/**
* Task submitted to the {@link ConcurrencyManager} on the target data
* service. It handles all the work required to receive the data for the
* index partition and to register the new index partition both on the
* target data service and in the {@link IMetadataService}.
*
* The new index partition initially will have three sources in the view:
* the live journal on the target data service, the buffered writes index
* segment from the source data service, and the history index segment from
* the source data service.
*
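* In outline, the initial view (this mirrors the resource array constructed
* in registerIndexPartition below):
*
* <pre>
* view = { liveJournal, bufferedWritesSegment, historySegment }
* </pre>
*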
* @author Bryan Thompson
*/
private static class InnerReceiveIndexPartitionTask extends AbstractTask {
final private ResourceManager resourceManager;
final private String scaleOutIndexName;
final private String sourceIndexName;
final private String targetIndexName;
final private IndexMetadata sourceIndexMetadata;
final private UUID sourceDataServiceUUID;
final private UUID targetDataServiceUUID;
final private int sourceIndexPartitionId;
final private int targetIndexPartitionId;
final private SegmentMetadata sourceHistorySegmentMetadata;
final private SegmentMetadata sourceBufferedWritesSegmentMetadata;
final private Event parentEvent;
final private String summary;
final InetSocketAddress addr;
// final int port;
/**
* @param resourceManager
* The resource manager (on the data service).
* @param targetIndexName
* The name of the target index partition.
* @param sourceIndexMetadata
* The index metadata for the source index partition.
* @param sourceDataServiceUUID
* @param targetIndexPartitionId
* The index partition identifier assigned to the target
* index partition.
* @param historyIndexSegmentMetadata
* Describes the {@link IndexSegmentStore} containing the
* historical data for the source index partition.
* @param bufferedWritesIndexSegmentMetadata
* Describes the {@link IndexSegmentStore} containing the
* buffered writes from the live journal for the source index
* partition.
* @param addr
* The {@link InetSocketAddress} of the
* {@link ResourceService} of the source data service (the
* one from which the resources will be copied).
*/
InnerReceiveIndexPartitionTask(final ResourceManager resourceManager,
final String targetIndexName,
final IndexMetadata sourceIndexMetadata,
final UUID sourceDataServiceUUID,
final int targetIndexPartitionId,
final SegmentMetadata historyIndexSegmentMetadata,
final SegmentMetadata bufferedWritesIndexSegmentMetadata,
final InetSocketAddress addr
) {
super(resourceManager.getConcurrencyManager(), ITx.UNISOLATED,
targetIndexName);
if (sourceIndexMetadata == null)
throw new IllegalArgumentException();
if (sourceDataServiceUUID == null)
throw new IllegalArgumentException();
if (historyIndexSegmentMetadata == null)
throw new IllegalArgumentException();
if (bufferedWritesIndexSegmentMetadata == null)
throw new IllegalArgumentException();
if (addr == null)
throw new IllegalArgumentException();
this.resourceManager = resourceManager;
this.scaleOutIndexName = sourceIndexMetadata.getName();
this.sourceIndexPartitionId = sourceIndexMetadata
.getPartitionMetadata().getPartitionId();
this.sourceIndexName = DataService.getIndexPartitionName(
scaleOutIndexName, sourceIndexPartitionId);
this.targetIndexName = targetIndexName;
this.sourceIndexMetadata = sourceIndexMetadata;
this.sourceDataServiceUUID = sourceDataServiceUUID;
this.targetDataServiceUUID = resourceManager.getDataServiceUUID();
this.targetIndexPartitionId = targetIndexPartitionId;
this.sourceHistorySegmentMetadata = historyIndexSegmentMetadata;
this.sourceBufferedWritesSegmentMetadata = bufferedWritesIndexSegmentMetadata;
this.addr = addr;
this.summary = OverflowActionEnum.Move + "(" + sourceIndexName
+ "->" + targetIndexName + ")";
this.parentEvent = new Event(resourceManager.getFederation(),
new EventResource(sourceIndexMetadata.getName(),
sourceIndexPartitionId), OverflowActionEnum.Move);
this.parentEvent.addDetail("summary", this.summary);
}
/**
* Copies the history and the buffered writes for the source index
* partition into the local {@link StoreManager}, registers the new
* index partition using those resources in its view, and notifies the
* {@link IMetadataService} of the move.
*
* The caller still needs to delete the moved resources on its end,
* delete the source index partition, and update the stale locator cache
* so that tasks in the write queue will be redirected to the new index
* partition.
*
* If this task throws an exception the caller MUST query the
* {@link IMetadataService} and determine whether the source index
* partition or the target index partition is now registered therein. If
* the target index partition is registered with the
* {@link IMetadataService}, then the caller needs to rollback that
* change using a compensating action.
*
* @throws Exception
*/
@Override
public Void doTask() throws Exception {
SegmentMetadata targetHistorySegmentMetadata = null;
SegmentMetadata targetBufferedWritesSegmentMetadata = null;
parentEvent.start();
try {
targetHistorySegmentMetadata = receiveIndexSegmentStore(sourceHistorySegmentMetadata);
targetBufferedWritesSegmentMetadata = receiveIndexSegmentStore(sourceBufferedWritesSegmentMetadata);
final MoveResult moveResult = registerIndexPartition(
targetHistorySegmentMetadata,
targetBufferedWritesSegmentMetadata);
updateMetadataIndex(moveResult);
return null;
} catch (Throwable t) {
if (targetHistorySegmentMetadata != null) {
try {
resourceManager
.retentionSetRemove(targetHistorySegmentMetadata
.getUUID());
resourceManager
.deleteResource(targetHistorySegmentMetadata
.getUUID(), false/* isJournal */);
} catch (Throwable t2) {
// ignore
}
}
if (targetBufferedWritesSegmentMetadata != null) {
try {
resourceManager
.retentionSetRemove(targetBufferedWritesSegmentMetadata
.getUUID());
resourceManager.deleteResource(
targetBufferedWritesSegmentMetadata.getUUID(),
false/* isJournal */);
} catch (Throwable t2) {
// ignore
}
}
if (t instanceof Exception)
throw (Exception) t;
throw new Exception(t);
} finally {
parentEvent.end();
}
}
/**
* This transfers the specified resource into its local data directory
* and registers it with the local {@link ResourceManager}.
*
* Note: Since the file (in this case an index segment) is being
* transferred from one data service to another, the dataDir, the target
* index partitionId, and the basename of the file will all be
* different. This code makes sure that the file winds up in the correct
* directory for the scale-out index _partition_ to which it belongs.
*
* @param sourceSegmentMetadata
* The {@link SegmentMetadata} for the resource on the source
* data service.
*
* @return The {@link SegmentMetadata} for the resource where it resides
* on this data service.
*/
protected SegmentMetadata receiveIndexSegmentStore(
final SegmentMetadata sourceSegmentMetadata) throws Exception {
// Validate before the argument is dereferenced for the sub-event params.
if (sourceSegmentMetadata == null)
throw new IllegalArgumentException();
final Event e = parentEvent.newSubEvent(
OverflowSubtaskEnum.ReceiveIndexSegment,
sourceSegmentMetadata.getParams()).start();
try {
final long begin = System.currentTimeMillis();
// name for the segFile on this data service.
final File file = resourceManager.getIndexSegmentFile(
scaleOutIndexName, sourceIndexMetadata.getIndexUUID(),
targetIndexPartitionId);
// make sure that the parent directory exists.
file.getParentFile().mkdirs();
/*
* Construct the metadata describing the index segment that we
* are going to receive.
*/
final SegmentMetadata targetSegmentMetadata = new SegmentMetadata(
file, sourceSegmentMetadata.getUUID(),
sourceSegmentMetadata.getCreateTime());
try {
// read the resource, writing onto that file.
new ResourceService.ReadResourceTask(addr,
sourceSegmentMetadata.getUUID(), file).call();
} catch (Throwable t) {
try {
file.delete();
} catch (Throwable t2) {
// ignore
}
if (t instanceof Exception)
throw (Exception) t;
throw new Exception(t);
}
// Put on the retentionSet first so the resource cannot be
// released before it is registered with this service.
resourceManager.retentionSetAdd(sourceSegmentMetadata.getUUID());
// add the resource to those managed by this service.
resourceManager.addResource(sourceSegmentMetadata, file);
if (INFO) {
final long elapsed = System.currentTimeMillis() - begin;
log.info("Received index segment: " + sourceSegmentMetadata
+ " in " + elapsed + "ms");
}
return targetSegmentMetadata;
} finally {
e.end();
}
}
/**
* Register the target index partition on this (the target) data
* service.
*
* @param historySegmentMetadata
* The metadata for the received index segment containing the
* historical writes up to the last commit point of the old
* journal for the source index partition.
* @param bufferedWritesSegmentMetadata
* The metadata for the received index segment containing the
* buffered writes from the live journal for the source index
* partition.
*
* @return {@link MoveResult} which contains the information we need to
* update the {@link IMetadataService} when the move is
* complete.
*/
protected MoveResult registerIndexPartition(
final SegmentMetadata historySegmentMetadata,
final SegmentMetadata bufferedWritesSegmentMetadata) {
final Event e = parentEvent.newSubEvent(
OverflowSubtaskEnum.RegisterIndex).addDetail(
"targetIndexName", targetIndexName).start();
try {
// clone metadata.
final IndexMetadata newMetadata = sourceIndexMetadata.clone();
// the partition metadata for the source index partition.
final LocalPartitionMetadata oldpmd = newMetadata
.getPartitionMetadata();
/*
* Note: We DO NOT specify the sourcePartitionId on the new
* index partition's view since the view will be made live
* within this UNISOLATED task. This approach leaves no gap
* while the move is in progress since it runs as a single
* unisolated task rather than a series of such tasks.
*/
newMetadata.setPartitionMetadata(new LocalPartitionMetadata(//
targetIndexPartitionId, // the new partition identifier.
-1, // The source partition identifier (unused here).
oldpmd.getLeftSeparatorKey(),//
oldpmd.getRightSeparatorKey(),//
/*
* Define the view for the target index partition.
*/
new IResourceMetadata[] {//
// The live journal (no data for this index partition yet).
getJournal().getResourceMetadata(), //
// Buffered writes from the live journal of the source DS.
bufferedWritesSegmentMetadata,//
// Historical writes from the source DS.
historySegmentMetadata//
},
IndexPartitionCause.move(resourceManager)
// // history line.
// ,oldpmd.getHistory() + summary + " "
));
/*
* Create the BTree to absorb writes for the target index
* partition. The metadata for this BTree was configured above
* and is associated with a view that captures all data received
* from the source index partition.
*/
final BTree btree = BTree.create(getJournal(), newMetadata);
/*
* Register the BTree on this data service.
*/
registerIndex(targetIndexName, btree);
if (INFO)
log.info("Registered new index partition on target data service: targetIndexName="
+ targetIndexName);
/*
* Formulate a MoveResult containing the information that we
* need to update the MDS.
*/
final LocalPartitionMetadata pmd = sourceIndexMetadata
.getPartitionMetadata();
final PartitionLocator oldLocator = new PartitionLocator(//
sourceIndexPartitionId,//
sourceDataServiceUUID,//
pmd.getLeftSeparatorKey(),//
pmd.getRightSeparatorKey()//
);
final PartitionLocator newLocator = new PartitionLocator(
targetIndexPartitionId,//
targetDataServiceUUID,//
pmd.getLeftSeparatorKey(),//
pmd.getRightSeparatorKey()//
);
return new MoveResult(sourceIndexName, sourceIndexMetadata,
targetDataServiceUUID, targetIndexPartitionId, oldLocator,
newLocator);
} finally {
e.end();
}
}
/**
* Notifies the {@link IMetadataService} that the index partition has
* been moved. Once the {@link IMetadataService} has been notified, new
* requests for the key range will be able to discover the target index
* partition. However, tasks already in the queue on the source data
* service will not be notified that the index partition has been moved
* until this task completes and the caller updates the stale locator
* cache and deletes the source index partition.
*
* @throws ExecutionException
* @throws InterruptedException
* @throws IOException
*/
protected void updateMetadataIndex(final MoveResult moveResult)
throws IOException, InterruptedException, ExecutionException {
if (INFO)
log.info("Updating metadata index: name=" + scaleOutIndexName
+ ", oldLocator=" + moveResult.oldLocator
+ ", newLocator=" + moveResult.newLocator);
// atomic update on the metadata server.
resourceManager.getFederation().getMetadataService()
.moveIndexPartition(scaleOutIndexName,
moveResult.oldLocator, moveResult.newLocator);
/*
* @todo This flag is unused for this impl since MDS update is done
* by the target data service - in fact, the flag can probably be
* discarded if this move procedure works out nicely since it offers
* better atomicity guarantees. The OverflowSubtaskEnum can also
* be pruned since we will no longer use certain subtasks which it
* declares.
*/
// moveResult.registeredInMDS.set(true);
}
}
}