/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the Server Side Public License, v 1; you may not use this file except
* in compliance with, at your election, the Elastic License 2.0 or the Server
* Side Public License, v 1.
*/
package org.elasticsearch.indices.recovery;
import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.message.ParameterizedMessage;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexCommit;
import org.apache.lucene.index.IndexFormatTooNewException;
import org.apache.lucene.index.IndexFormatTooOldException;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.RateLimiter;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.SetOnce;
import org.elasticsearch.ExceptionsHelper;
import org.elasticsearch.Version;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.StepListener;
import org.elasticsearch.action.support.ThreadedActionListener;
import org.elasticsearch.action.support.replication.ReplicationResponse;
import org.elasticsearch.cluster.routing.IndexShardRoutingTable;
import org.elasticsearch.cluster.routing.ShardRouting;
import org.elasticsearch.common.StopWatch;
import org.elasticsearch.common.bytes.BytesArray;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.bytes.ReleasableBytesReference;
import org.elasticsearch.common.logging.Loggers;
import org.elasticsearch.common.unit.ByteSizeValue;
import org.elasticsearch.common.util.CancellableThreads;
import org.elasticsearch.common.util.concurrent.CountDown;
import org.elasticsearch.common.util.concurrent.FutureUtils;
import org.elasticsearch.common.util.concurrent.ListenableFuture;
import org.elasticsearch.core.IOUtils;
import org.elasticsearch.core.Nullable;
import org.elasticsearch.core.Releasable;
import org.elasticsearch.core.Releasables;
import org.elasticsearch.core.TimeValue;
import org.elasticsearch.core.Tuple;
import org.elasticsearch.index.engine.Engine;
import org.elasticsearch.index.engine.RecoveryEngineException;
import org.elasticsearch.index.seqno.ReplicationTracker;
import org.elasticsearch.index.seqno.RetentionLease;
import org.elasticsearch.index.seqno.RetentionLeaseNotFoundException;
import org.elasticsearch.index.seqno.RetentionLeases;
import org.elasticsearch.index.seqno.SequenceNumbers;
import org.elasticsearch.index.shard.IndexShard;
import org.elasticsearch.index.shard.IndexShardClosedException;
import org.elasticsearch.index.shard.IndexShardRelocatedException;
import org.elasticsearch.index.shard.IndexShardState;
import org.elasticsearch.index.snapshots.blobstore.BlobStoreIndexShardSnapshot;
import org.elasticsearch.index.store.Store;
import org.elasticsearch.index.store.StoreFileMetadata;
import org.elasticsearch.index.translog.Translog;
import org.elasticsearch.indices.recovery.plan.RecoveryPlannerService;
import org.elasticsearch.indices.recovery.plan.ShardRecoveryPlan;
import org.elasticsearch.snapshots.SnapshotShardsService;
import org.elasticsearch.threadpool.ThreadPool;
import org.elasticsearch.transport.RemoteTransportException;
import org.elasticsearch.transport.Transports;
import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.Deque;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Objects;
import java.util.Set;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ConcurrentLinkedDeque;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.function.Consumer;
import java.util.function.IntSupplier;
import java.util.stream.StreamSupport;
import static org.elasticsearch.common.util.CollectionUtils.concatLists;
import static org.elasticsearch.core.Strings.format;
/**
* RecoverySourceHandler handles the three phases of shard recovery: everything relating to
* copying the segment files as well as sending translog operations across the wire once
* the segments have been copied.
*
* Note: There is always one source handler per recovery that handles all the
* file and translog transfer. This handler is completely isolated from other recoveries
* while the {@link RateLimiter} passed via {@link RecoverySettings} is shared across recoveries
* originating from this node to throttle the number of bytes sent during file transfer. The
* transaction log phase bypasses the rate limiter entirely.
*/
public class RecoverySourceHandler {
protected final Logger logger;
// Shard that is going to be recovered (the "source")
private final IndexShard shard;
private final int shardId;
// Request containing source and target node information
private final StartRecoveryRequest request;
private final int chunkSizeInBytes;
private final RecoveryTargetHandler recoveryTarget;
private final int maxConcurrentFileChunks;
private final int maxConcurrentOperations;
private final int maxConcurrentSnapshotFileDownloads;
private final boolean useSnapshots;
private final ThreadPool threadPool;
private final RecoveryPlannerService recoveryPlannerService;
private final CancellableThreads cancellableThreads = new CancellableThreads();
private final List<Closeable> resources = new CopyOnWriteArrayList<>();
private final ListenableFuture<RecoveryResponse> future = new ListenableFuture<>();
public RecoverySourceHandler(
IndexShard shard,
RecoveryTargetHandler recoveryTarget,
ThreadPool threadPool,
StartRecoveryRequest request,
int fileChunkSizeInBytes,
int maxConcurrentFileChunks,
int maxConcurrentOperations,
int maxConcurrentSnapshotFileDownloads,
boolean useSnapshots,
RecoveryPlannerService recoveryPlannerService
) {
this.shard = shard;
this.recoveryTarget = recoveryTarget;
this.threadPool = threadPool;
this.recoveryPlannerService = recoveryPlannerService;
this.request = request;
this.shardId = this.request.shardId().id();
this.logger = Loggers.getLogger(getClass(), request.shardId(), "recover to " + request.targetNode().getName());
this.chunkSizeInBytes = fileChunkSizeInBytes;
this.maxConcurrentFileChunks = maxConcurrentFileChunks;
this.maxConcurrentOperations = maxConcurrentOperations;
this.maxConcurrentSnapshotFileDownloads = maxConcurrentSnapshotFileDownloads;
this.useSnapshots = useSnapshots;
}
public StartRecoveryRequest getRequest() {
return request;
}
public void addListener(ActionListener<RecoveryResponse> listener) {
future.addListener(listener);
}
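// Minimal usage sketch (hypothetical wiring; in the real code path PeerRecoverySourceService creates the
// handler and drives it):
//
//   RecoverySourceHandler handler = ...; // obtained for a given StartRecoveryRequest and primary shard
//   handler.recoverToTarget(ActionListener.wrap(
//       response -> logger.trace("recovery to {} completed", handler.getRequest().targetNode()),
//       e -> logger.warn("recovery to " + handler.getRequest().targetNode() + " failed", e)
//   ));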
/**
* performs the recovery from the local engine to the target
*/
public void recoverToTarget(ActionListener<RecoveryResponse> listener) {
addListener(listener);
final Closeable releaseResources = () -> IOUtils.close(resources);
try {
cancellableThreads.setOnCancel((reason, beforeCancelEx) -> {
final RuntimeException e;
if (shard.state() == IndexShardState.CLOSED) { // check if the shard got closed on us
e = new IndexShardClosedException(shard.shardId(), "shard is closed and recovery was canceled reason [" + reason + "]");
} else {
e = new CancellableThreads.ExecutionCancelledException("recovery was canceled reason [" + reason + "]");
}
if (beforeCancelEx != null) {
e.addSuppressed(beforeCancelEx);
}
IOUtils.closeWhileHandlingException(releaseResources, () -> future.onFailure(e));
throw e;
});
final Consumer<Exception> onFailure = e -> {
assert Transports.assertNotTransportThread(RecoverySourceHandler.this + "[onFailure]");
IOUtils.closeWhileHandlingException(releaseResources, () -> future.onFailure(e));
};
final SetOnce<RetentionLease> retentionLeaseRef = new SetOnce<>();
runUnderPrimaryPermit(() -> {
final IndexShardRoutingTable routingTable = shard.getReplicationGroup().getRoutingTable();
ShardRouting targetShardRouting = routingTable.getByAllocationId(request.targetAllocationId());
if (targetShardRouting == null) {
logger.debug(
"delaying recovery of {} as it is not listed as assigned to target node {}",
request.shardId(),
request.targetNode()
);
throw new DelayRecoveryException("source node does not have the shard listed in its state as allocated on the node");
}
assert targetShardRouting.initializing() : "expected recovery target to be initializing but was " + targetShardRouting;
retentionLeaseRef.set(
shard.getRetentionLeases().get(ReplicationTracker.getPeerRecoveryRetentionLeaseId(targetShardRouting))
);
},
shardId + " validating recovery target [" + request.targetAllocationId() + "] registered ",
shard,
cancellableThreads,
logger
);
final Closeable retentionLock = shard.acquireHistoryRetentionLock();
resources.add(retentionLock);
final long startingSeqNo;
final boolean isSequenceNumberBasedRecovery = request.startingSeqNo() != SequenceNumbers.UNASSIGNED_SEQ_NO
&& isTargetSameHistory()
&& shard.hasCompleteHistoryOperations("peer-recovery", request.startingSeqNo())
&& ((retentionLeaseRef.get() == null && shard.useRetentionLeasesInPeerRecovery() == false)
|| (retentionLeaseRef.get() != null && retentionLeaseRef.get().retainingSequenceNumber() <= request.startingSeqNo()));
// NB check hasCompleteHistoryOperations when computing isSequenceNumberBasedRecovery, even if there is a retention lease,
// because when doing a rolling upgrade from earlier than 7.4 we may create some leases that are initially unsatisfied. It's
// possible there are other cases where we cannot satisfy all leases, because that's not a property we currently expect to hold.
// Also it's pretty cheap when soft deletes are enabled, and it'd be a disaster if we tried a sequence-number-based recovery
// without having a complete history.
if (isSequenceNumberBasedRecovery && retentionLeaseRef.get() != null) {
// all the history we need is retained by an existing retention lease, so we do not need a separate retention lock
retentionLock.close();
logger.trace("history is retained by {}", retentionLeaseRef.get());
} else {
// all the history we need is retained by the retention lock, obtained before calling shard.hasCompleteHistoryOperations()
// and before acquiring the safe commit we'll be using, so we can be certain that all operations after the safe commit's
// local checkpoint will be retained for the duration of this recovery.
logger.trace("history is retained by retention lock");
}
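// The recovery proper is a chain of four steps wired below: phase1 copies segment files (skipped for a
// sequence-number-based recovery), the engine on the target is then opened, phase2 replays translog
// operations from startingSeqNo, and finalization marks the target in-sync (handing off the primary
// context if this is a relocation).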
final StepListener<SendFileResult> sendFileStep = new StepListener<>();
final StepListener<TimeValue> prepareEngineStep = new StepListener<>();
final StepListener<SendSnapshotResult> sendSnapshotStep = new StepListener<>();
final StepListener<Void> finalizeStep = new StepListener<>();
if (isSequenceNumberBasedRecovery) {
logger.trace("performing sequence numbers based recovery. starting at [{}]", request.startingSeqNo());
startingSeqNo = request.startingSeqNo();
if (retentionLeaseRef.get() == null) {
createRetentionLease(startingSeqNo, sendFileStep.map(ignored -> SendFileResult.EMPTY));
} else {
sendFileStep.onResponse(SendFileResult.EMPTY);
}
} else {
final Engine.IndexCommitRef safeCommitRef;
try {
safeCommitRef = acquireSafeCommit(shard);
resources.add(safeCommitRef);
} catch (final Exception e) {
throw new RecoveryEngineException(shard.shardId(), 1, "snapshot failed", e);
}
// Try and copy enough operations to the recovering peer so that if it is promoted to primary then it has a chance of being
// able to recover other replicas using operations-based recoveries. If we are not using retention leases then we
// conservatively copy all available operations. If we are using retention leases then "enough operations" is just the
// operations from the local checkpoint of the safe commit onwards, because when using soft deletes the safe commit retains
// at least as much history as anything else. The safe commit will often contain all the history retained by the current set
// of retention leases, but this is not guaranteed: an earlier peer recovery from a different primary might have created a
// retention lease for some history that this primary already discarded, since we discard history when the global checkpoint
// advances and not when creating a new safe commit. In any case this is a best-effort thing since future recoveries can
// always fall back to file-based ones, and only really presents a problem if this primary fails before things have settled
// down.
startingSeqNo = Long.parseLong(safeCommitRef.getIndexCommit().getUserData().get(SequenceNumbers.LOCAL_CHECKPOINT_KEY)) + 1L;
logger.trace("performing file-based recovery followed by history replay starting at [{}]", startingSeqNo);
try {
final int estimateNumOps = estimateNumberOfHistoryOperations(startingSeqNo);
final Releasable releaseStore = acquireStore(shard.store());
resources.add(releaseStore);
sendFileStep.whenComplete(r -> IOUtils.close(safeCommitRef, releaseStore), e -> {
try {
IOUtils.close(safeCommitRef, releaseStore);
} catch (Exception ex) {
logger.warn("releasing snapshot caused exception", ex);
}
});
final StepListener<ReplicationResponse> deleteRetentionLeaseStep = new StepListener<>();
runUnderPrimaryPermit(() -> {
try {
// If the target previously had a copy of this shard then a file-based recovery might move its global
// checkpoint backwards. We must therefore remove any existing retention lease so that we can create a
// new one later on in the recovery.
shard.removePeerRecoveryRetentionLease(
request.targetNode().getId(),
new ThreadedActionListener<>(
logger,
shard.getThreadPool(),
ThreadPool.Names.GENERIC,
deleteRetentionLeaseStep,
false
)
);
} catch (RetentionLeaseNotFoundException e) {
logger.debug("no peer-recovery retention lease for " + request.targetAllocationId());
deleteRetentionLeaseStep.onResponse(null);
}
}, shardId + " removing retention lease for [" + request.targetAllocationId() + "]", shard, cancellableThreads, logger);
deleteRetentionLeaseStep.whenComplete(ignored -> {
assert Transports.assertNotTransportThread(RecoverySourceHandler.this + "[phase1]");
phase1(safeCommitRef.getIndexCommit(), startingSeqNo, () -> estimateNumOps, sendFileStep);
}, onFailure);
} catch (final Exception e) {
throw new RecoveryEngineException(shard.shardId(), 1, "sendFileStep failed", e);
}
}
assert startingSeqNo >= 0 : "startingSeqNo must be non negative. got: " + startingSeqNo;
sendFileStep.whenComplete(r -> {
assert Transports.assertNotTransportThread(RecoverySourceHandler.this + "[prepareTargetForTranslog]");
// For a sequence based recovery, the target can keep its local translog
prepareTargetForTranslog(estimateNumberOfHistoryOperations(startingSeqNo), prepareEngineStep);
}, onFailure);
prepareEngineStep.whenComplete(prepareEngineTime -> {
assert Transports.assertNotTransportThread(RecoverySourceHandler.this + "[phase2]");
/*
* add shard to replication group (shard will receive replication requests from this point on) now that engine is open.
* This means that any document indexed into the primary after this will be replicated to this replica as well.
* Make sure to do this before sampling the max sequence number in the next step, to ensure that we send
* all documents up to maxSeqNo in phase2.
*/
runUnderPrimaryPermit(
() -> shard.initiateTracking(request.targetAllocationId()),
shardId + " initiating tracking of " + request.targetAllocationId(),
shard,
cancellableThreads,
logger
);
final long endingSeqNo = shard.seqNoStats().getMaxSeqNo();
logger.trace("snapshot for recovery; current size is [{}]", estimateNumberOfHistoryOperations(startingSeqNo));
final Translog.Snapshot phase2Snapshot = shard.newChangesSnapshot(
"peer-recovery",
startingSeqNo,
Long.MAX_VALUE,
false,
false,
true
);
resources.add(phase2Snapshot);
retentionLock.close();
// we have to capture the max_seen_auto_id_timestamp and the max_seq_no_of_updates to make sure that these values
// are at least as high as the corresponding values on the primary when any of these operations were executed on it.
final long maxSeenAutoIdTimestamp = shard.getMaxSeenAutoIdTimestamp();
final long maxSeqNoOfUpdatesOrDeletes = shard.getMaxSeqNoOfUpdatesOrDeletes();
final RetentionLeases retentionLeases = shard.getRetentionLeases();
final long mappingVersionOnPrimary = shard.indexSettings().getIndexMetadata().getMappingVersion();
phase2(
startingSeqNo,
endingSeqNo,
phase2Snapshot,
maxSeenAutoIdTimestamp,
maxSeqNoOfUpdatesOrDeletes,
retentionLeases,
mappingVersionOnPrimary,
sendSnapshotStep
);
}, onFailure);
// Recovery target can trim all operations >= startingSeqNo as we have sent all these operations in phase 2
final long trimAboveSeqNo = startingSeqNo - 1;
sendSnapshotStep.whenComplete(r -> finalizeRecovery(r.targetLocalCheckpoint, trimAboveSeqNo, finalizeStep), onFailure);
finalizeStep.whenComplete(r -> {
final long phase1ThrottlingWaitTime = 0L; // TODO: return the actual throttle time
final SendSnapshotResult sendSnapshotResult = sendSnapshotStep.result();
final SendFileResult sendFileResult = sendFileStep.result();
final RecoveryResponse response = new RecoveryResponse(
sendFileResult.phase1FileNames,
sendFileResult.phase1FileSizes,
sendFileResult.phase1ExistingFileNames,
sendFileResult.phase1ExistingFileSizes,
sendFileResult.totalSize,
sendFileResult.existingTotalSize,
sendFileResult.took.millis(),
phase1ThrottlingWaitTime,
prepareEngineStep.result().millis(),
sendSnapshotResult.sentOperations,
sendSnapshotResult.tookTime.millis()
);
try {
future.onResponse(response);
} finally {
IOUtils.close(resources);
}
}, onFailure);
} catch (Exception e) {
IOUtils.closeWhileHandlingException(releaseResources, () -> future.onFailure(e));
}
}
private boolean isTargetSameHistory() {
final String targetHistoryUUID = request.metadataSnapshot().getHistoryUUID();
assert targetHistoryUUID != null : "incoming target history missing";
return targetHistoryUUID.equals(shard.getHistoryUUID());
}
private int estimateNumberOfHistoryOperations(long startingSeqNo) throws IOException {
return shard.countChanges("peer-recovery", startingSeqNo, Long.MAX_VALUE);
}
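/**
 * Runs {@code runnable} while holding a primary operation permit on {@code primary}, throwing
 * {@link IndexShardRelocatedException} if the shard handed off its primary authority in the meantime.
 * Both the permit acquisition and the runnable itself are subject to cancellation via {@code cancellableThreads}.
 */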
static void runUnderPrimaryPermit(
CancellableThreads.Interruptible runnable,
String reason,
IndexShard primary,
CancellableThreads cancellableThreads,
Logger logger
) {
cancellableThreads.execute(() -> {
CompletableFuture<Releasable> permit = new CompletableFuture<>();
final ActionListener<Releasable> onAcquired = new ActionListener<>() {
@Override
public void onResponse(Releasable releasable) {
if (permit.complete(releasable) == false) {
releasable.close();
}
}
@Override
public void onFailure(Exception e) {
permit.completeExceptionally(e);
}
};
primary.acquirePrimaryOperationPermit(onAcquired, ThreadPool.Names.SAME, reason);
try (Releasable ignored = FutureUtils.get(permit)) {
// check that the IndexShard still has the primary authority. This needs to be checked under operation permit to prevent
// races, as IndexShard will switch its authority only when it holds all operation permits, see IndexShard.relocated()
if (primary.isRelocatedPrimary()) {
throw new IndexShardRelocatedException(primary.shardId());
}
runnable.run();
} finally {
// just in case we got an exception (likely interrupted) while waiting for the get
permit.whenComplete((r, e) -> {
if (r != null) {
r.close();
}
if (e != null) {
logger.trace("suppressing exception on completion (it was already bubbled up or the operation was aborted)", e);
}
});
}
});
}
/**
* Increases the store reference and returns a {@link Releasable} that will decrease the store reference using the generic thread pool.
* We must never release the store using an interruptible thread as we can risk invalidating the node lock.
*/
private Releasable acquireStore(Store store) {
store.incRef();
return Releasables.releaseOnce(() -> closeOnGenericThreadPool(store::decRef));
}
/**
* Releasing a safe commit can access some commit files. It's better not to use {@link CancellableThreads} to interact
* with the file systems due to interrupt (see {@link org.apache.lucene.store.NIOFSDirectory} javadocs for more detail).
* This method acquires a safe commit and wraps it to make sure that it will be released using the generic thread pool.
*/
private Engine.IndexCommitRef acquireSafeCommit(IndexShard shard) {
final Engine.IndexCommitRef commitRef = shard.acquireSafeIndexCommit();
return new Engine.IndexCommitRef(commitRef.getIndexCommit(), () -> closeOnGenericThreadPool(commitRef));
}
private void closeOnGenericThreadPool(Closeable closeable) {
assert threadPool.generic().isShutdown() == false;
threadPool.generic().execute(() -> {
try {
closeable.close();
} catch (Exception e) {
assert false : e;
logger.warn(new ParameterizedMessage("Exception while closing [{}]", closeable), e);
}
});
}
static final class SendFileResult {
final List<String> phase1FileNames;
final List<Long> phase1FileSizes;
final long totalSize;
final List<String> phase1ExistingFileNames;
final List<Long> phase1ExistingFileSizes;
final long existingTotalSize;
final TimeValue took;
SendFileResult(
List<String> phase1FileNames,
List<Long> phase1FileSizes,
long totalSize,
List<String> phase1ExistingFileNames,
List<Long> phase1ExistingFileSizes,
long existingTotalSize,
TimeValue took
) {
this.phase1FileNames = phase1FileNames;
this.phase1FileSizes = phase1FileSizes;
this.totalSize = totalSize;
this.phase1ExistingFileNames = phase1ExistingFileNames;
this.phase1ExistingFileSizes = phase1ExistingFileSizes;
this.existingTotalSize = existingTotalSize;
this.took = took;
}
static final SendFileResult EMPTY = new SendFileResult(
Collections.emptyList(),
Collections.emptyList(),
0L,
Collections.emptyList(),
Collections.emptyList(),
0L,
TimeValue.ZERO
);
}
/**
* Perform phase1 of the recovery operations. Once this {@link IndexCommit}
* snapshot has been performed, no commit operations (files being fsync'd)
* are effectively allowed on this index until all recovery phases are done.
*
* Phase1 examines the segment files on the target node and copies over the
* segments that are missing. Only segments that have the same size and
* checksum can be reused
*/
void phase1(IndexCommit snapshot, long startingSeqNo, IntSupplier translogOps, ActionListener<SendFileResult> listener) {
cancellableThreads.checkForCancel();
final Store store = shard.store();
try {
StopWatch stopWatch = new StopWatch().start();
final Store.MetadataSnapshot recoverySourceMetadata;
final String shardStateIdentifier;
try {
recoverySourceMetadata = store.getMetadata(snapshot);
shardStateIdentifier = SnapshotShardsService.getShardStateId(shard, snapshot);
} catch (CorruptIndexException | IndexFormatTooOldException | IndexFormatTooNewException ex) {
shard.failShard("recovery", ex);
throw ex;
}
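// Sanity check: every file referenced by the commit point must be present in the metadata snapshot we
// just loaded, otherwise the on-disk index and the commit disagree and the recovery cannot proceed.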
for (String name : snapshot.getFileNames()) {
final StoreFileMetadata md = recoverySourceMetadata.get(name);
if (md == null) {
logger.info("Snapshot differs from actual index for file: {} meta: {}", name, recoverySourceMetadata.fileMetadataMap());
throw new CorruptIndexException(
"Snapshot differs from actual index - maybe index was removed metadata has "
+ recoverySourceMetadata.fileMetadataMap().size()
+ " files",
name
);
}
}
if (canSkipPhase1(recoverySourceMetadata, request.metadataSnapshot()) == false) {
cancellableThreads.checkForCancel();
final boolean canUseSnapshots = useSnapshots && request.canDownloadSnapshotFiles();
recoveryPlannerService.computeRecoveryPlan(
shard.shardId(),
shardStateIdentifier,
recoverySourceMetadata,
request.metadataSnapshot(),
startingSeqNo,
translogOps.getAsInt(),
getRequest().targetNode().getVersion(),
canUseSnapshots,
ActionListener.wrap(plan -> recoverFilesFromSourceAndSnapshot(plan, store, stopWatch, listener), listener::onFailure)
);
} else {
logger.trace("skipping [phase1] since source and target have identical sync id [{}]", recoverySourceMetadata.getSyncId());
// but we must still create a retention lease
final StepListener<RetentionLease> createRetentionLeaseStep = new StepListener<>();
createRetentionLease(startingSeqNo, createRetentionLeaseStep);
createRetentionLeaseStep.whenComplete(retentionLease -> {
final TimeValue took = stopWatch.totalTime();
logger.trace("recovery [phase1]: took [{}]", took);
listener.onResponse(
new SendFileResult(
Collections.emptyList(),
Collections.emptyList(),
0L,
Collections.emptyList(),
Collections.emptyList(),
0L,
took
)
);
}, listener::onFailure);
}
} catch (Exception e) {
throw new RecoverFilesRecoveryException(request.shardId(), 0, new ByteSizeValue(0L), e);
}
}
void recoverFilesFromSourceAndSnapshot(
ShardRecoveryPlan shardRecoveryPlan,
Store store,
StopWatch stopWatch,
ActionListener<SendFileResult> listener
) {
cancellableThreads.checkForCancel();
final List<String> filesToRecoverNames = shardRecoveryPlan.getFilesToRecoverNames();
final List<Long> filesToRecoverSizes = shardRecoveryPlan.getFilesToRecoverSizes();
final List<String> phase1ExistingFileNames = shardRecoveryPlan.getFilesPresentInTargetNames();
final List<Long> phase1ExistingFileSizes = shardRecoveryPlan.getFilesPresentInTargetSizes();
final long totalSize = shardRecoveryPlan.getTotalSize();
final long existingTotalSize = shardRecoveryPlan.getExistingSize();
if (logger.isTraceEnabled()) {
for (StoreFileMetadata md : shardRecoveryPlan.getFilesPresentInTarget()) {
logger.trace(
"recovery [phase1]: not recovering [{}], exist in local store and has checksum [{}]," + " size [{}]",
md.name(),
md.checksum(),
md.length()
);
}
for (StoreFileMetadata md : shardRecoveryPlan.getSourceFilesToRecover()) {
if (request.metadataSnapshot().fileMetadataMap().containsKey(md.name())) {
logger.trace(
"recovery [phase1]: recovering [{}], exists in local store, but is different: remote [{}], local [{}]",
md.name(),
request.metadataSnapshot().fileMetadataMap().get(md.name()),
md
);
} else {
logger.trace("recovery [phase1]: recovering [{}], does not exist in remote", md.name());
}
}
for (BlobStoreIndexShardSnapshot.FileInfo fileInfo : shardRecoveryPlan.getSnapshotFilesToRecover()) {
final StoreFileMetadata md = fileInfo.metadata();
if (request.metadataSnapshot().fileMetadataMap().containsKey(md.name())) {
logger.trace(
"recovery [phase1]: recovering [{}], exists in local store, but is different: remote [{}], local [{}]",
md.name(),
request.metadataSnapshot().fileMetadataMap().get(md.name()),
md
);
} else {
logger.trace("recovery [phase1]: recovering [{}], does not exist in remote", md.name());
}
}
logger.trace(
"recovery [phase1]: recovering_files [{}] with total_size [{}], reusing_files [{}] with total_size [{}]",
filesToRecoverNames.size(),
new ByteSizeValue(totalSize),
phase1ExistingFileNames.size(),
new ByteSizeValue(existingTotalSize)
);
}
// We need to pass the ShardRecoveryPlan between steps instead of capturing it in the closures,
// since the plan can change after a failure to recover files from the snapshot. In that case we
// have to start from scratch using the fallback recovery plan, which is then the plan used in
// the subsequent steps.
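// The chain below: announce the full file list to the target, restore whatever is possible from the
// snapshot, stream the remaining files from this node, establish a peer-recovery retention lease, and
// finally have the target rename temporary files and delete anything unrelated to this recovery.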
final StepListener<Void> sendFileInfoStep = new StepListener<>();
final StepListener<Tuple<ShardRecoveryPlan, List<StoreFileMetadata>>> recoverSnapshotFilesStep = new StepListener<>();
final StepListener<ShardRecoveryPlan> sendFilesStep = new StepListener<>();
final StepListener<Tuple<ShardRecoveryPlan, RetentionLease>> createRetentionLeaseStep = new StepListener<>();
final StepListener<ShardRecoveryPlan> cleanFilesStep = new StepListener<>();
final int translogOps = shardRecoveryPlan.getTranslogOps();
recoveryTarget.receiveFileInfo(
filesToRecoverNames,
filesToRecoverSizes,
phase1ExistingFileNames,
phase1ExistingFileSizes,
translogOps,
sendFileInfoStep
);
sendFileInfoStep.whenComplete(unused -> {
recoverSnapshotFiles(shardRecoveryPlan, new ActionListener<>() {
@Override
public void onResponse(List<StoreFileMetadata> filesFailedToRecoverFromSnapshot) {
recoverSnapshotFilesStep.onResponse(Tuple.tuple(shardRecoveryPlan, filesFailedToRecoverFromSnapshot));
}
@Override
public void onFailure(Exception e) {
if (shardRecoveryPlan.canRecoverSnapshotFilesFromSourceNode() == false
&& e instanceof CancellableThreads.ExecutionCancelledException == false) {
ShardRecoveryPlan fallbackPlan = shardRecoveryPlan.getFallbackPlan();
recoveryTarget.receiveFileInfo(
fallbackPlan.getFilesToRecoverNames(),
fallbackPlan.getFilesToRecoverSizes(),
fallbackPlan.getFilesPresentInTargetNames(),
fallbackPlan.getFilesPresentInTargetSizes(),
fallbackPlan.getTranslogOps(),
recoverSnapshotFilesStep.map(r -> Tuple.tuple(fallbackPlan, Collections.<StoreFileMetadata>emptyList()))
);
} else {
recoverSnapshotFilesStep.onFailure(e);
}
}
});
}, listener::onFailure);
recoverSnapshotFilesStep.whenComplete(planAndFilesFailedToRecoverFromSnapshot -> {
ShardRecoveryPlan recoveryPlan = planAndFilesFailedToRecoverFromSnapshot.v1();
List<StoreFileMetadata> filesFailedToRecoverFromSnapshot = planAndFilesFailedToRecoverFromSnapshot.v2();
final List<StoreFileMetadata> filesToRecoverFromSource;
if (filesFailedToRecoverFromSnapshot.isEmpty()) {
filesToRecoverFromSource = recoveryPlan.getSourceFilesToRecover();
} else {
filesToRecoverFromSource = concatLists(recoveryPlan.getSourceFilesToRecover(), filesFailedToRecoverFromSnapshot);
}
sendFiles(
store,
filesToRecoverFromSource.toArray(new StoreFileMetadata[0]),
recoveryPlan::getTranslogOps,
sendFilesStep.map(unused -> recoveryPlan)
);
}, listener::onFailure);
sendFilesStep.whenComplete(recoveryPlan -> {
createRetentionLease(
recoveryPlan.getStartingSeqNo(),
createRetentionLeaseStep.map(retentionLease -> Tuple.tuple(recoveryPlan, retentionLease))
);
}, listener::onFailure);
createRetentionLeaseStep.whenComplete(recoveryPlanAndRetentionLease -> {
final ShardRecoveryPlan recoveryPlan = recoveryPlanAndRetentionLease.v1();
final RetentionLease retentionLease = recoveryPlanAndRetentionLease.v2();
final Store.MetadataSnapshot recoverySourceMetadata = recoveryPlan.getSourceMetadataSnapshot();
final long lastKnownGlobalCheckpoint = shard.getLastKnownGlobalCheckpoint();
assert retentionLease == null || retentionLease.retainingSequenceNumber() - 1 <= lastKnownGlobalCheckpoint
: retentionLease + " vs " + lastKnownGlobalCheckpoint;
// Establishes new empty translog on the replica with global checkpoint set to lastKnownGlobalCheckpoint. We want
// the commit we just copied to be a safe commit on the replica, so why not set the global checkpoint on the replica
// to the max seqno of this commit? Because (in rare corner cases) this commit might not be a safe commit here on
// the primary, and in these cases the max seqno would be too high to be valid as a global checkpoint.
cleanFiles(
store,
recoverySourceMetadata,
() -> translogOps,
lastKnownGlobalCheckpoint,
cleanFilesStep.map(unused -> recoveryPlan)
);
}, listener::onFailure);
cleanFilesStep.whenComplete(recoveryPlan -> {
final TimeValue took = stopWatch.totalTime();
logger.trace("recovery [phase1]: took [{}]", took);
listener.onResponse(
new SendFileResult(
recoveryPlan.getFilesToRecoverNames(),
recoveryPlan.getFilesToRecoverSizes(),
recoveryPlan.getTotalSize(),
recoveryPlan.getFilesPresentInTargetNames(),
recoveryPlan.getFilesPresentInTargetSizes(),
recoveryPlan.getExistingSize(),
took
)
);
}, listener::onFailure);
}
/**
* Send requests to the target node to recover files from a given snapshot. In case of failure, the listener
* value contains the list of files that failed to be recovered from a snapshot.
*/
void recoverSnapshotFiles(ShardRecoveryPlan shardRecoveryPlan, ActionListener<List<StoreFileMetadata>> listener) {
ShardRecoveryPlan.SnapshotFilesToRecover snapshotFilesToRecover = shardRecoveryPlan.getSnapshotFilesToRecover();
if (snapshotFilesToRecover.isEmpty()) {
listener.onResponse(Collections.emptyList());
return;
}
new SnapshotRecoverFileRequestsSender(shardRecoveryPlan, listener).start();
}
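/**
 * Issues up to {@code maxConcurrentSnapshotFileDownloads} concurrent restore-from-snapshot requests to the
 * target, queueing the remaining files. Files that fail to download are collected so that the caller can
 * re-send them from the source node; when falling back to the source node is not possible, the whole
 * operation is cancelled and the failure is propagated once all in-flight requests have completed.
 */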
private class SnapshotRecoverFileRequestsSender {
private final ShardRecoveryPlan shardRecoveryPlan;
private final ShardRecoveryPlan.SnapshotFilesToRecover snapshotFilesToRecover;
private final ActionListener<List<StoreFileMetadata>> listener;
private final CountDown countDown;
private final BlockingQueue<BlobStoreIndexShardSnapshot.FileInfo> pendingSnapshotFilesToRecover;
private final AtomicBoolean cancelled = new AtomicBoolean();
private final Set<ListenableFuture<Void>> outstandingRequests = new HashSet<>(maxConcurrentSnapshotFileDownloads);
private List<StoreFileMetadata> filesFailedToDownloadFromSnapshot;
SnapshotRecoverFileRequestsSender(ShardRecoveryPlan shardRecoveryPlan, ActionListener<List<StoreFileMetadata>> listener) {
this.shardRecoveryPlan = shardRecoveryPlan;
this.snapshotFilesToRecover = shardRecoveryPlan.getSnapshotFilesToRecover();
this.listener = listener;
this.countDown = new CountDown(shardRecoveryPlan.getSnapshotFilesToRecover().size());
this.pendingSnapshotFilesToRecover = new LinkedBlockingQueue<>(
shardRecoveryPlan.getSnapshotFilesToRecover().getSnapshotFiles()
);
}
void start() {
for (int i = 0; i < maxConcurrentSnapshotFileDownloads; i++) {
sendRequest();
}
}
void sendRequest() {
BlobStoreIndexShardSnapshot.FileInfo snapshotFileToRecover = pendingSnapshotFilesToRecover.poll();
if (snapshotFileToRecover == null) {
return;
}
final ListenableFuture<Void> requestFuture = new ListenableFuture<>();
try {
cancellableThreads.checkForCancel();
ActionListener<Void> sendRequestListener = new ActionListener<>() {
@Override
public void onResponse(Void unused) {
onRequestCompletion(snapshotFileToRecover.metadata(), null);
}
@Override
public void onFailure(Exception e) {
if (cancelled.get() || e instanceof CancellableThreads.ExecutionCancelledException) {
logger.debug(
new ParameterizedMessage(
"cancelled while recovering file [{}] from snapshot",
snapshotFileToRecover.metadata()
),
e
);
} else {
logger.warn(
new ParameterizedMessage(
"failed to recover file [{}] from snapshot{}",
snapshotFileToRecover.metadata(),
shardRecoveryPlan.canRecoverSnapshotFilesFromSourceNode() ? ", will recover from primary instead" : ""
),
e
);
}
if (shardRecoveryPlan.canRecoverSnapshotFilesFromSourceNode()) {
onRequestCompletion(snapshotFileToRecover.metadata(), e);
} else {
cancel(e);
}
}
};
requestFuture.addListener(sendRequestListener);
trackOutstandingRequest(requestFuture);
recoveryTarget.restoreFileFromSnapshot(
snapshotFilesToRecover.getRepository(),
snapshotFilesToRecover.getIndexId(),
snapshotFileToRecover,
ActionListener.runBefore(requestFuture, () -> unTrackOutstandingRequest(requestFuture))
);
} catch (CancellableThreads.ExecutionCancelledException e) {
cancel(e);
} catch (Exception e) {
unTrackOutstandingRequest(requestFuture);
onRequestCompletion(snapshotFileToRecover.metadata(), e);
}
}
void cancel(Exception e) {
if (cancelled.compareAndSet(false, true)) {
pendingSnapshotFilesToRecover.clear();
notifyFailureOnceAllOutstandingRequestAreDone(e);
}
}
void onRequestCompletion(StoreFileMetadata storeFileMetadata, @Nullable Exception exception) {
if (cancelled.get()) {
return;
}
if (exception != null) {
addFileFailedToRecoverFromSnapshot(storeFileMetadata);
}
if (countDown.countDown()) {
final List<StoreFileMetadata> failedToRecoverFromSnapshotFiles = getFilesFailedToRecoverFromSnapshot();
listener.onResponse(failedToRecoverFromSnapshotFiles);
} else {
sendRequest();
}
}
synchronized void addFileFailedToRecoverFromSnapshot(StoreFileMetadata storeFileMetadata) {
if (filesFailedToDownloadFromSnapshot == null) {
filesFailedToDownloadFromSnapshot = new ArrayList<>();
}
filesFailedToDownloadFromSnapshot.add(storeFileMetadata);
}
synchronized List<StoreFileMetadata> getFilesFailedToRecoverFromSnapshot() {
return Objects.requireNonNullElse(filesFailedToDownloadFromSnapshot, Collections.emptyList());
}
private void trackOutstandingRequest(ListenableFuture<Void> future) {
boolean cancelled;
synchronized (outstandingRequests) {
cancelled = cancellableThreads.isCancelled() || this.cancelled.get();
if (cancelled == false) {
outstandingRequests.add(future);
}
}
if (cancelled) {
cancellableThreads.checkForCancel();
// If the recover snapshot files operation is cancelled but the recovery is still
// valid, it means that some of the snapshot files download failed and the snapshot files
// differ from the source index files. In that case we have to cancel all pending operations
// and wait until all the in-flight operations are done to reset the recovery and start from
// scratch using the source node index files.
assert this.cancelled.get();
throw new CancellableThreads.ExecutionCancelledException("Recover snapshot files cancelled");
}
}
private void unTrackOutstandingRequest(ListenableFuture<Void> future) {
synchronized (outstandingRequests) {
outstandingRequests.remove(future);
}
}
private void notifyFailureOnceAllOutstandingRequestAreDone(Exception e) {
assert cancelled.get();
final Set<ListenableFuture<Void>> pendingRequests;
synchronized (outstandingRequests) {
pendingRequests = new HashSet<>(outstandingRequests);
}
if (pendingRequests.isEmpty()) {
listener.onFailure(e);
return;
}
// The recovery was cancelled, so from this point onwards outstandingRequests won't track
// new requests; therefore we can safely wait until all the pending requests complete
// to notify the listener about the cancellation
final CountDown pendingRequestsCountDown = new CountDown(pendingRequests.size());
for (ListenableFuture<Void> outstandingFuture : pendingRequests) {
outstandingFuture.addListener(ActionListener.wrap(() -> {
if (pendingRequestsCountDown.countDown()) {
listener.onFailure(e);
}
}));
}
}
}
void createRetentionLease(final long startingSeqNo, ActionListener<RetentionLease> listener) {
runUnderPrimaryPermit(() -> {
// Clone the peer recovery retention lease belonging to the source shard. We are retaining history between the local
// checkpoint of the safe commit we're creating and this lease's retained seqno with the retention lock, and by cloning an
// existing lease we (approximately) know that all our peers are also retaining history as requested by the cloned lease. If
// the recovery now fails before copying enough history over then a subsequent attempt will find this lease, determine it is
// not enough, and fall back to a file-based recovery.
//
// (approximately) because we do not guarantee to be able to satisfy every lease on every peer.
logger.trace("cloning primary's retention lease");
try {
final StepListener<ReplicationResponse> cloneRetentionLeaseStep = new StepListener<>();
final RetentionLease clonedLease = shard.cloneLocalPeerRecoveryRetentionLease(
request.targetNode().getId(),
new ThreadedActionListener<>(logger, shard.getThreadPool(), ThreadPool.Names.GENERIC, cloneRetentionLeaseStep, false)
);
logger.trace("cloned primary's retention lease as [{}]", clonedLease);
cloneRetentionLeaseStep.addListener(listener.map(rr -> clonedLease));
} catch (RetentionLeaseNotFoundException e) {
// it's possible that the primary has no retention lease yet if we are doing a rolling upgrade from a version before
// 7.4, and in that case we just create a lease using the local checkpoint of the safe commit which we're using for
// recovery as a conservative estimate for the global checkpoint.
assert shard.indexSettings().getIndexVersionCreated().before(Version.V_7_4_0)
|| shard.indexSettings().isSoftDeleteEnabled() == false;
final StepListener<ReplicationResponse> addRetentionLeaseStep = new StepListener<>();
final long estimatedGlobalCheckpoint = startingSeqNo - 1;
final RetentionLease newLease = shard.addPeerRecoveryRetentionLease(
request.targetNode().getId(),
estimatedGlobalCheckpoint,
new ThreadedActionListener<>(logger, shard.getThreadPool(), ThreadPool.Names.GENERIC, addRetentionLeaseStep, false)
);
addRetentionLeaseStep.addListener(listener.map(rr -> newLease));
logger.trace("created retention lease with estimated checkpoint of [{}]", estimatedGlobalCheckpoint);
}
}, shardId + " establishing retention lease for [" + request.targetAllocationId() + "]", shard, cancellableThreads, logger);
}
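/**
 * Phase 1 can be skipped when the source and target share a sync id. In that case their doc counts and
 * sequence number stats must also match; a mismatch means the sync id marker itself is broken, which is
 * surfaced as a hard failure rather than silently falling back to a file copy.
 */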
boolean canSkipPhase1(Store.MetadataSnapshot source, Store.MetadataSnapshot target) {
if (source.getSyncId() == null || source.getSyncId().equals(target.getSyncId()) == false) {
return false;
}
if (source.numDocs() != target.numDocs()) {
throw new IllegalStateException(
"try to recover "
+ request.shardId()
+ " from primary shard with sync id but number "
+ "of docs differ: "
+ source.numDocs()
+ " ("
+ request.sourceNode().getName()
+ ", primary) vs "
+ target.numDocs()
+ "("
+ request.targetNode().getName()
+ ")"
);
}
SequenceNumbers.CommitInfo sourceSeqNos = SequenceNumbers.loadSeqNoInfoFromLuceneCommit(source.commitUserData().entrySet());
SequenceNumbers.CommitInfo targetSeqNos = SequenceNumbers.loadSeqNoInfoFromLuceneCommit(target.commitUserData().entrySet());
if (sourceSeqNos.localCheckpoint != targetSeqNos.localCheckpoint || targetSeqNos.maxSeqNo != sourceSeqNos.maxSeqNo) {
final String message = "try to recover "
+ request.shardId()
+ " with sync id but "
+ "seq_no stats are mismatched: ["
+ source.commitUserData()
+ "] vs ["
+ target.commitUserData()
+ "]";
assert false : message;
throw new IllegalStateException(message);
}
return true;
}
void prepareTargetForTranslog(int totalTranslogOps, ActionListener<TimeValue> listener) {
StopWatch stopWatch = new StopWatch().start();
final ActionListener<Void> wrappedListener = ActionListener.wrap(nullVal -> {
stopWatch.stop();
final TimeValue tookTime = stopWatch.totalTime();
logger.trace("recovery [phase1]: remote engine start took [{}]", tookTime);
listener.onResponse(tookTime);
}, e -> listener.onFailure(new RecoveryEngineException(shard.shardId(), 1, "prepare target for translog failed", e)));
// Send a request preparing the new shard's translog to receive operations. This ensures the shard engine is started and disables
// garbage collection (not the JVM's GC!) of tombstone deletes.
logger.trace("recovery [phase1]: prepare remote engine for translog");
cancellableThreads.checkForCancel();
recoveryTarget.prepareForTranslogOperations(totalTranslogOps, wrappedListener);
}
/**
* Perform phase two of the recovery process.
*
* Phase two uses a snapshot of the current translog *without* acquiring the write lock (however, the translog snapshot is a
* point-in-time view of the translog). It then sends each translog operation to the target node so it can be replayed into the new
* shard.
*
* @param startingSeqNo the sequence number to start recovery from, or {@link SequenceNumbers#UNASSIGNED_SEQ_NO} if all
* ops should be sent
* @param endingSeqNo the highest sequence number that should be sent
* @param snapshot a snapshot of the translog
* @param maxSeenAutoIdTimestamp the max auto_id_timestamp of append-only requests on the primary
* @param maxSeqNoOfUpdatesOrDeletes the max seq_no of updates or deletes on the primary after these operations were executed on it.
* @param listener a listener which will be notified with the local checkpoint on the target.
*/
void phase2(
final long startingSeqNo,
final long endingSeqNo,
final Translog.Snapshot snapshot,
final long maxSeenAutoIdTimestamp,
final long maxSeqNoOfUpdatesOrDeletes,
final RetentionLeases retentionLeases,
final long mappingVersion,
final ActionListener<SendSnapshotResult> listener
) throws IOException {
if (shard.state() == IndexShardState.CLOSED) {
throw new IndexShardClosedException(request.shardId());
}
logger.trace("recovery [phase2]: sending transaction log operations (from [" + startingSeqNo + "] to [" + endingSeqNo + "]");
final StopWatch stopWatch = new StopWatch().start();
final StepListener<Void> sendListener = new StepListener<>();
final OperationBatchSender sender = new OperationBatchSender(
startingSeqNo,
endingSeqNo,
snapshot,
maxSeenAutoIdTimestamp,
maxSeqNoOfUpdatesOrDeletes,
retentionLeases,
mappingVersion,
sendListener
);
sendListener.whenComplete(ignored -> {
final long skippedOps = sender.skippedOps.get();
final int totalSentOps = sender.sentOps.get();
final long targetLocalCheckpoint = sender.targetLocalCheckpoint.get();
assert snapshot.totalOperations() == snapshot.skippedOperations() + skippedOps + totalSentOps
: String.format(
Locale.ROOT,
"expected total [%d], overridden [%d], skipped [%d], total sent [%d]",
snapshot.totalOperations(),
snapshot.skippedOperations(),
skippedOps,
totalSentOps
);
stopWatch.stop();
final TimeValue tookTime = stopWatch.totalTime();
logger.trace("recovery [phase2]: took [{}]", tookTime);
listener.onResponse(new SendSnapshotResult(targetLocalCheckpoint, totalSentOps, tookTime));
}, listener::onFailure);
sender.start();
}
private static class OperationChunkRequest implements MultiChunkTransfer.ChunkRequest {
final List<Translog.Operation> operations;
final boolean lastChunk;
OperationChunkRequest(List<Translog.Operation> operations, boolean lastChunk) {
this.operations = operations;
this.lastChunk = lastChunk;
}
@Override
public boolean lastChunk() {
return lastChunk;
}
}
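// Batches translog operations into chunks of roughly chunkSizeInBytes and streams them to the target with
// up to maxConcurrentOperations chunk requests in flight, tracking the highest local checkpoint the target
// reports back.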
private class OperationBatchSender extends MultiChunkTransfer<Translog.Snapshot, OperationChunkRequest> {
private final long startingSeqNo;
private final long endingSeqNo;
private final Translog.Snapshot snapshot;
private final long maxSeenAutoIdTimestamp;
private final long maxSeqNoOfUpdatesOrDeletes;
private final RetentionLeases retentionLeases;
private final long mappingVersion;
private int lastBatchCount = 0; // used to estimate the count of the subsequent batch.
private final AtomicInteger skippedOps = new AtomicInteger();
private final AtomicInteger sentOps = new AtomicInteger();
private final AtomicLong targetLocalCheckpoint = new AtomicLong(SequenceNumbers.NO_OPS_PERFORMED);
OperationBatchSender(
long startingSeqNo,
long endingSeqNo,
Translog.Snapshot snapshot,
long maxSeenAutoIdTimestamp,
long maxSeqNoOfUpdatesOrDeletes,
RetentionLeases retentionLeases,
long mappingVersion,
ActionListener<Void> listener
) {
super(logger, threadPool.getThreadContext(), listener, maxConcurrentOperations, List.of(snapshot));
this.startingSeqNo = startingSeqNo;
this.endingSeqNo = endingSeqNo;
this.snapshot = snapshot;
this.maxSeenAutoIdTimestamp = maxSeenAutoIdTimestamp;
this.maxSeqNoOfUpdatesOrDeletes = maxSeqNoOfUpdatesOrDeletes;
this.retentionLeases = retentionLeases;
this.mappingVersion = mappingVersion;
}
@Override
protected synchronized OperationChunkRequest nextChunkRequest(Translog.Snapshot snapshot) throws IOException {
// We need to synchronize Snapshot#next() because it's called by different threads through sendBatch.
// Even though those calls are not concurrent, Snapshot#next() uses non-synchronized state and is not multi-thread-compatible.
assert Transports.assertNotTransportThread("[phase2]");
cancellableThreads.checkForCancel();
final List<Translog.Operation> ops = lastBatchCount > 0 ? new ArrayList<>(lastBatchCount) : new ArrayList<>();
long batchSizeInBytes = 0L;
Translog.Operation operation;
while ((operation = snapshot.next()) != null) {
if (shard.state() == IndexShardState.CLOSED) {
throw new IndexShardClosedException(request.shardId());
}
final long seqNo = operation.seqNo();
if (seqNo < startingSeqNo || seqNo > endingSeqNo) {
skippedOps.incrementAndGet();
continue;
}
ops.add(operation);
batchSizeInBytes += operation.estimateSize();
sentOps.incrementAndGet();
// check if this request is past bytes threshold, and if so, send it off
if (batchSizeInBytes >= chunkSizeInBytes) {
break;
}
}
lastBatchCount = ops.size();
return new OperationChunkRequest(ops, operation == null);
}
@Override
protected void executeChunkRequest(OperationChunkRequest request, ActionListener<Void> listener) {
cancellableThreads.checkForCancel();
recoveryTarget.indexTranslogOperations(
request.operations,
snapshot.totalOperations(),
maxSeenAutoIdTimestamp,
maxSeqNoOfUpdatesOrDeletes,
retentionLeases,
mappingVersion,
listener.delegateFailure((l, newCheckpoint) -> {
targetLocalCheckpoint.updateAndGet(curr -> SequenceNumbers.max(curr, newCheckpoint));
l.onResponse(null);
})
);
}
@Override
protected void handleError(Translog.Snapshot snapshot, Exception e) {
throw new RecoveryEngineException(shard.shardId(), 2, "failed to send/replay operations", e);
}
@Override
public void close() throws IOException {
snapshot.close();
}
}
void finalizeRecovery(long targetLocalCheckpoint, long trimAboveSeqNo, ActionListener<Void> listener) {
if (shard.state() == IndexShardState.CLOSED) {
throw new IndexShardClosedException(request.shardId());
}
cancellableThreads.checkForCancel();
StopWatch stopWatch = new StopWatch().start();
logger.trace("finalizing recovery");
/*
* Before marking the shard as in-sync we acquire an operation permit. We do this so that there is a barrier between marking a
* shard as in-sync and relocating a shard. If we acquire the permit then no relocation handoff can complete before we are done
* marking the shard as in-sync. If the relocation handoff holds all the permits then after the handoff completes and we acquire
* the permit then the state of the shard will be relocated and this recovery will fail.
*/
runUnderPrimaryPermit(
() -> shard.markAllocationIdAsInSync(request.targetAllocationId(), targetLocalCheckpoint),
shardId + " marking " + request.targetAllocationId() + " as in sync",
shard,
cancellableThreads,
logger
);
final long globalCheckpoint = shard.getLastKnownGlobalCheckpoint(); // this global checkpoint is persisted in finalizeRecovery
final StepListener<Void> finalizeListener = new StepListener<>();
cancellableThreads.checkForCancel();
recoveryTarget.finalizeRecovery(globalCheckpoint, trimAboveSeqNo, finalizeListener);
finalizeListener.whenComplete(r -> {
runUnderPrimaryPermit(
() -> shard.updateGlobalCheckpointForShard(request.targetAllocationId(), globalCheckpoint),
shardId + " updating " + request.targetAllocationId() + "'s global checkpoint",
shard,
cancellableThreads,
logger
);
if (request.isPrimaryRelocation()) {
logger.trace("performing relocation hand-off");
// this acquires all IndexShard operation permits and will thus delay new recoveries until it is done
cancellableThreads.execute(
() -> shard.relocated(request.targetAllocationId(), recoveryTarget::handoffPrimaryContext, ActionListener.wrap(v -> {
cancellableThreads.checkForCancel();
completeFinalizationListener(listener, stopWatch);
}, listener::onFailure))
);
/*
* if the recovery process fails after disabling primary mode on the source shard, both relocation source and
* target are failed (see {@link IndexShard#updateRoutingEntry}).
*/
} else {
completeFinalizationListener(listener, stopWatch);
}
}, listener::onFailure);
}
private void completeFinalizationListener(ActionListener<Void> listener, StopWatch stopWatch) {
stopWatch.stop();
logger.trace("finalizing recovery took [{}]", stopWatch.totalTime());
listener.onResponse(null);
}
static final class SendSnapshotResult {
final long targetLocalCheckpoint;
final int sentOperations;
final TimeValue tookTime;
SendSnapshotResult(final long targetLocalCheckpoint, final int sentOperations, final TimeValue tookTime) {
this.targetLocalCheckpoint = targetLocalCheckpoint;
this.sentOperations = sentOperations;
this.tookTime = tookTime;
}
}
/**
* Cancels the recovery and interrupts all eligible threads.
*/
public void cancel(String reason) {
cancellableThreads.cancel(reason);
recoveryTarget.cancel();
}
@Override
public String toString() {
return "ShardRecoveryHandler{"
+ "shardId="
+ request.shardId()
+ ", sourceNode="
+ request.sourceNode()
+ ", targetNode="
+ request.targetNode()
+ '}';
}
private static class FileChunk implements MultiChunkTransfer.ChunkRequest, Releasable {
final StoreFileMetadata md;
final BytesReference content;
final long position;
final boolean lastChunk;
final Releasable onClose;
FileChunk(StoreFileMetadata md, BytesReference content, long position, boolean lastChunk, Releasable onClose) {
this.md = md;
this.content = content;
this.position = position;
this.lastChunk = lastChunk;
this.onClose = onClose;
}
@Override
public boolean lastChunk() {
return lastChunk;
}
@Override
public void close() {
onClose.close();
}
}
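// Streams the given files to the target, smallest first, reading each file into a small pool of recycled
// buffers and keeping at most maxConcurrentFileChunks chunk requests in flight at a time.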
void sendFiles(Store store, StoreFileMetadata[] files, IntSupplier translogOps, ActionListener<Void> listener) {
ArrayUtil.timSort(files, Comparator.comparingLong(StoreFileMetadata::length)); // send smallest first
// use a smaller buffer than the configured chunk size if we only have files smaller than the chunk size
final int bufferSize = files.length == 0 ? 0 : (int) Math.min(chunkSizeInBytes, files[files.length - 1].length());
Releasable temporaryStoreRef = acquireStore(store);
try {
final Releasable storeRef = temporaryStoreRef;
final MultiChunkTransfer<StoreFileMetadata, FileChunk> multiFileSender = new MultiChunkTransfer<>(
logger,
threadPool.getThreadContext(),
listener,
maxConcurrentFileChunks,
Arrays.asList(files)
) {
final Deque<byte[]> buffers = new ConcurrentLinkedDeque<>();
final AtomicInteger liveBufferCount = new AtomicInteger(); // only used in assertions to verify proper recycling
IndexInput currentInput = null;
long offset = 0;
@Override
protected void onNewResource(StoreFileMetadata md) throws IOException {
offset = 0;
IOUtils.close(currentInput);
if (md.hashEqualsContents()) {
// we already have the file contents on heap, no need to open the file again
currentInput = null;
} else {
currentInput = store.directory().openInput(md.name(), IOContext.READONCE);
}
}
@Override
protected FileChunk nextChunkRequest(StoreFileMetadata md) throws IOException {
assert Transports.assertNotTransportThread("read file chunk");
cancellableThreads.checkForCancel();
if (currentInput == null) {
// no input => reading directly from the metadata
assert md.hashEqualsContents();
return new FileChunk(md, new BytesArray(md.hash()), 0, true, () -> {});
}
final byte[] buffer = Objects.requireNonNullElseGet(buffers.pollFirst(), () -> new byte[bufferSize]);
assert liveBufferCount.incrementAndGet() > 0;
final int toRead = Math.toIntExact(Math.min(md.length() - offset, buffer.length));
currentInput.readBytes(buffer, 0, toRead, false);
final boolean lastChunk = offset + toRead == md.length();
final FileChunk chunk = new FileChunk(md, new BytesArray(buffer, 0, toRead), offset, lastChunk, () -> {
assert liveBufferCount.decrementAndGet() >= 0;
buffers.addFirst(buffer);
});
offset += toRead;
return chunk;
}
@Override
protected void executeChunkRequest(FileChunk request, ActionListener<Void> listener) {
cancellableThreads.checkForCancel();
final ReleasableBytesReference content = new ReleasableBytesReference(request.content, request);
recoveryTarget.writeFileChunk(
request.md,
request.position,
content,
request.lastChunk,
translogOps.getAsInt(),
ActionListener.runBefore(listener, content::close)
);
}
@Override
protected void handleError(StoreFileMetadata md, Exception e) throws Exception {
handleErrorOnSendFiles(store, e, new StoreFileMetadata[] { md });
}
@Override
public void close() throws IOException {
IOUtils.close(currentInput, storeRef);
}
@Override
protected boolean assertOnSuccess() {
assert liveBufferCount.get() == 0 : "leaked [" + liveBufferCount + "] buffers";
return true;
}
};
resources.add(multiFileSender);
temporaryStoreRef = null; // now owned by multiFileSender, tracked in resources, so won't be leaked
multiFileSender.start();
} finally {
Releasables.close(temporaryStoreRef);
}
}
private void cleanFiles(
Store store,
Store.MetadataSnapshot sourceMetadata,
IntSupplier translogOps,
long globalCheckpoint,
ActionListener<Void> listener
) {
// Send the CLEAN_FILES request, which takes all of the files that
// were transferred and renames them from their temporary file
// names to the actual file names. It also writes checksums for
// the files after they have been renamed.
//
// Once the files have been renamed, any other files that are not
// related to this recovery (out of date segments, for example)
// are deleted
cancellableThreads.checkForCancel();
recoveryTarget.cleanFiles(
translogOps.getAsInt(),
globalCheckpoint,
sourceMetadata,
listener.delegateResponse((l, e) -> ActionListener.completeWith(l, () -> {
StoreFileMetadata[] mds = StreamSupport.stream(sourceMetadata.spliterator(), false).toArray(StoreFileMetadata[]::new);
ArrayUtil.timSort(mds, Comparator.comparingLong(StoreFileMetadata::length)); // check small files first
handleErrorOnSendFiles(store, e, mds);
throw e;
}))
);
}
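/**
 * Distinguishes local from remote corruption when a send-files or clean-files request fails: if any local
 * file fails its checksum the engine is failed because the primary itself is corrupt; otherwise the
 * corruption happened in transit or on the target and is reported back as a {@link RemoteTransportException}.
 */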
private void handleErrorOnSendFiles(Store store, Exception e, StoreFileMetadata[] mds) throws Exception {
final IOException corruptIndexException = ExceptionsHelper.unwrapCorruption(e);
assert Transports.assertNotTransportThread(RecoverySourceHandler.this + "[handle error on send/clean files]");
if (corruptIndexException != null) {
Exception localException = null;
for (StoreFileMetadata md : mds) {
cancellableThreads.checkForCancel();
logger.debug("checking integrity for file {} after remove corruption exception", md);
if (store.checkIntegrityNoException(md) == false) { // we are corrupted on the primary -- fail!
logger.warn("{} Corrupted file detected {} checksum mismatch", shardId, md);
if (localException == null) {
localException = corruptIndexException;
}
failEngine(corruptIndexException);
}
}
if (localException != null) {
throw localException;
} else { // corruption has happened on the way to replica
RemoteTransportException remoteException = new RemoteTransportException(
"File corruption occurred on recovery but checksums are ok",
null
);
remoteException.addSuppressed(e);
logger.warn(
() -> format(
"%s Remote file corruption on node %s, recovering %s. local checksum OK",
shardId,
request.targetNode(),
mds
),
corruptIndexException
);
throw remoteException;
}
}
throw e;
}
protected void failEngine(IOException cause) {
shard.failShard("recovery", cause);
}
}