/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0 and the Server Side Public License, v 1; you may not use this file except
 * in compliance with, at your election, the Elastic License 2.0 or the Server
 * Side Public License, v 1.
 */

package org.elasticsearch.indices.recovery;

import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.message.ParameterizedMessage;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexCommit;
import org.apache.lucene.index.IndexFormatTooNewException;
import org.apache.lucene.index.IndexFormatTooOldException;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.RateLimiter;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.SetOnce;
import org.elasticsearch.ExceptionsHelper;
import org.elasticsearch.Version;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.StepListener;
import org.elasticsearch.action.support.ThreadedActionListener;
import org.elasticsearch.action.support.replication.ReplicationResponse;
import org.elasticsearch.cluster.routing.IndexShardRoutingTable;
import org.elasticsearch.cluster.routing.ShardRouting;
import org.elasticsearch.common.StopWatch;
import org.elasticsearch.common.bytes.BytesArray;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.bytes.ReleasableBytesReference;
import org.elasticsearch.common.logging.Loggers;
import org.elasticsearch.common.unit.ByteSizeValue;
import org.elasticsearch.common.util.CancellableThreads;
import org.elasticsearch.common.util.concurrent.CountDown;
import org.elasticsearch.common.util.concurrent.FutureUtils;
import org.elasticsearch.common.util.concurrent.ListenableFuture;
import org.elasticsearch.core.IOUtils;
import org.elasticsearch.core.Nullable;
import org.elasticsearch.core.Releasable;
import org.elasticsearch.core.Releasables;
import org.elasticsearch.core.TimeValue;
import org.elasticsearch.core.Tuple;
import org.elasticsearch.index.engine.Engine;
import org.elasticsearch.index.engine.RecoveryEngineException;
import org.elasticsearch.index.seqno.ReplicationTracker;
import org.elasticsearch.index.seqno.RetentionLease;
import org.elasticsearch.index.seqno.RetentionLeaseNotFoundException;
import org.elasticsearch.index.seqno.RetentionLeases;
import org.elasticsearch.index.seqno.SequenceNumbers;
import org.elasticsearch.index.shard.IndexShard;
import org.elasticsearch.index.shard.IndexShardClosedException;
import org.elasticsearch.index.shard.IndexShardRelocatedException;
import org.elasticsearch.index.shard.IndexShardState;
import org.elasticsearch.index.snapshots.blobstore.BlobStoreIndexShardSnapshot;
import org.elasticsearch.index.store.Store;
import org.elasticsearch.index.store.StoreFileMetadata;
import org.elasticsearch.index.translog.Translog;
import org.elasticsearch.indices.recovery.plan.RecoveryPlannerService;
import org.elasticsearch.indices.recovery.plan.ShardRecoveryPlan;
import org.elasticsearch.snapshots.SnapshotShardsService;
import org.elasticsearch.threadpool.ThreadPool;
import org.elasticsearch.transport.RemoteTransportException;
import org.elasticsearch.transport.Transports;

import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.Deque;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Objects;
import java.util.Set;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ConcurrentLinkedDeque;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.function.Consumer;
import java.util.function.IntSupplier;
import java.util.stream.StreamSupport;

import static org.elasticsearch.common.util.CollectionUtils.concatLists;
import static org.elasticsearch.core.Strings.format;

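/*
 * Illustrative usage sketch: in Elasticsearch, PeerRecoverySourceService constructs and drives this
 * handler roughly as below. The variable names are placeholder assumptions for clarity; only the
 * constructor and recoverToTarget(ActionListener) signatures are taken from this class.
 *
 *   RecoverySourceHandler handler = new RecoverySourceHandler(
 *       shard,                              // primary IndexShard acting as the recovery source
 *       recoveryTargetHandler,              // transport proxy for the node being recovered
 *       threadPool,
 *       startRecoveryRequest,
 *       fileChunkSizeInBytes,
 *       maxConcurrentFileChunks,
 *       maxConcurrentOperations,
 *       maxConcurrentSnapshotFileDownloads,
 *       useSnapshots,
 *       recoveryPlannerService
 *   );
 *   handler.recoverToTarget(ActionListener.wrap(
 *       response -> logger.trace("recovery of {} completed", startRecoveryRequest.shardId()),
 *       e -> logger.warn("recovery failed", e)
 *   ));
 */
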
/**
 * RecoverySourceHandler handles the three phases of shard recovery, which is
 * everything relating to copying the segment files as well as sending translog
 * operations across the wire once the segments have been copied.
 *
 * Note: There is always one source handler per recovery that handles all the
 * file and translog transfer. This handler is completely isolated from other recoveries
 * while the {@link RateLimiter} passed via {@link RecoverySettings} is shared across recoveries
 * originating from this node to throttle the number of bytes sent during file transfer. The transaction log
 * phase bypasses the rate limiter entirely.
 */
public class RecoverySourceHandler {

    protected final Logger logger;
    // Shard that is going to be recovered (the "source")
    private final IndexShard shard;
    private final int shardId;
    // Request containing source and target node information
    private final StartRecoveryRequest request;
    private final int chunkSizeInBytes;
    private final RecoveryTargetHandler recoveryTarget;
    private final int maxConcurrentFileChunks;
    private final int maxConcurrentOperations;
    private final int maxConcurrentSnapshotFileDownloads;
    private final boolean useSnapshots;
    private final ThreadPool threadPool;
    private final RecoveryPlannerService recoveryPlannerService;
    private final CancellableThreads cancellableThreads = new CancellableThreads();
    private final List<Closeable> resources = new CopyOnWriteArrayList<>();
    private final ListenableFuture<RecoveryResponse> future = new ListenableFuture<>();

    public RecoverySourceHandler(
        IndexShard shard,
        RecoveryTargetHandler recoveryTarget,
        ThreadPool threadPool,
        StartRecoveryRequest request,
        int fileChunkSizeInBytes,
        int maxConcurrentFileChunks,
        int maxConcurrentOperations,
        int maxConcurrentSnapshotFileDownloads,
        boolean useSnapshots,
        RecoveryPlannerService recoveryPlannerService
    ) {
        this.shard = shard;
        this.recoveryTarget = recoveryTarget;
        this.threadPool = threadPool;
        this.recoveryPlannerService = recoveryPlannerService;
        this.request = request;
        this.shardId = this.request.shardId().id();
        this.logger = Loggers.getLogger(getClass(), request.shardId(), "recover to " + request.targetNode().getName());
        this.chunkSizeInBytes = fileChunkSizeInBytes;
        this.maxConcurrentFileChunks = maxConcurrentFileChunks;
        this.maxConcurrentOperations = maxConcurrentOperations;
        this.maxConcurrentSnapshotFileDownloads = maxConcurrentSnapshotFileDownloads;
        this.useSnapshots = useSnapshots;
    }

    public StartRecoveryRequest getRequest() {
        return request;
    }

    public void addListener(ActionListener<RecoveryResponse> listener) {
        future.addListener(listener);
    }

    /**
     * performs the recovery from the local engine to the target
     */
    public void recoverToTarget(ActionListener<RecoveryResponse> listener) {
        addListener(listener);
        final Closeable releaseResources = () -> IOUtils.close(resources);
        try {
            cancellableThreads.setOnCancel((reason, beforeCancelEx) -> {
                final RuntimeException e;
                if (shard.state() == IndexShardState.CLOSED) { // check if the shard got closed on us
                    e = new IndexShardClosedException(shard.shardId(), "shard is closed and recovery was canceled reason [" + reason + "]");
                } else {
                    e = new CancellableThreads.ExecutionCancelledException("recovery was canceled reason [" + reason + "]");
                }
                if (beforeCancelEx != null) {
                    e.addSuppressed(beforeCancelEx);
                }
                IOUtils.closeWhileHandlingException(releaseResources, () -> future.onFailure(e));
                throw e;
            });
            final Consumer<Exception> onFailure = e -> {
                assert Transports.assertNotTransportThread(RecoverySourceHandler.this + "[onFailure]");
                IOUtils.closeWhileHandlingException(releaseResources, () -> future.onFailure(e));
            };

            final SetOnce<RetentionLease> retentionLeaseRef = new SetOnce<>();

            runUnderPrimaryPermit(() -> {
                final IndexShardRoutingTable routingTable = shard.getReplicationGroup().getRoutingTable();
                ShardRouting targetShardRouting = routingTable.getByAllocationId(request.targetAllocationId());
                if (targetShardRouting == null) {
                    logger.debug(
                        "delaying recovery of {} as it is not listed as assigned to target node {}",
                        request.shardId(),
                        request.targetNode()
                    );
                    throw new DelayRecoveryException("source node does not have the shard listed in its state as allocated on the node");
                }
                assert targetShardRouting.initializing() : "expected recovery target to be initializing but was " + targetShardRouting;
                retentionLeaseRef.set(
                    shard.getRetentionLeases().get(ReplicationTracker.getPeerRecoveryRetentionLeaseId(targetShardRouting))
                );
            },
                shardId + " validating recovery target [" + request.targetAllocationId() + "] registered ",
                shard,
                cancellableThreads,
                logger
            );
            final Closeable retentionLock = shard.acquireHistoryRetentionLock();
            resources.add(retentionLock);
            final long startingSeqNo;
            final boolean isSequenceNumberBasedRecovery = request.startingSeqNo() != SequenceNumbers.UNASSIGNED_SEQ_NO
                && isTargetSameHistory()
                && shard.hasCompleteHistoryOperations("peer-recovery", request.startingSeqNo())
                && ((retentionLeaseRef.get() == null && shard.useRetentionLeasesInPeerRecovery() == false)
                    || (retentionLeaseRef.get() != null && retentionLeaseRef.get().retainingSequenceNumber() <= request.startingSeqNo()));
            // NB check hasCompleteHistoryOperations when computing isSequenceNumberBasedRecovery, even if there is a retention lease,
            // because when doing a rolling upgrade from earlier than 7.4 we may create some leases that are initially unsatisfied. It's
            // possible there are other cases where we cannot satisfy all leases, because that's not a property we currently expect to hold.
            // Also it's pretty cheap when soft deletes are enabled, and it'd be a disaster if we tried a sequence-number-based recovery
            // without having a complete history.
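            // Illustrative example (hypothetical numbers): if the target reports startingSeqNo = 100, shares this
            // primary's history UUID, the source retains complete history from seq_no 100 onwards, and any
            // peer-recovery retention lease for the target retains from a seq_no <= 100, then the file-copy phase
            // can be skipped and recovery reduces to replaying operations from seq_no 100.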

            if (isSequenceNumberBasedRecovery && retentionLeaseRef.get() != null) {
                // all the history we need is retained by an existing retention lease, so we do not need a separate retention lock
                retentionLock.close();
                logger.trace("history is retained by {}", retentionLeaseRef.get());
            } else {
                // all the history we need is retained by the retention lock, obtained before calling shard.hasCompleteHistoryOperations()
                // and before acquiring the safe commit we'll be using, so we can be certain that all operations after the safe commit's
                // local checkpoint will be retained for the duration of this recovery.
                logger.trace("history is retained by retention lock");
            }

            final StepListener<SendFileResult> sendFileStep = new StepListener<>();
            final StepListener<TimeValue> prepareEngineStep = new StepListener<>();
            final StepListener<SendSnapshotResult> sendSnapshotStep = new StepListener<>();
            final StepListener<Void> finalizeStep = new StepListener<>();

            if (isSequenceNumberBasedRecovery) {
                logger.trace("performing sequence numbers based recovery. starting at [{}]", request.startingSeqNo());
                startingSeqNo = request.startingSeqNo();
                if (retentionLeaseRef.get() == null) {
                    createRetentionLease(startingSeqNo, sendFileStep.map(ignored -> SendFileResult.EMPTY));
                } else {
                    sendFileStep.onResponse(SendFileResult.EMPTY);
                }
            } else {
                final Engine.IndexCommitRef safeCommitRef;
                try {
                    safeCommitRef = acquireSafeCommit(shard);
                    resources.add(safeCommitRef);
                } catch (final Exception e) {
                    throw new RecoveryEngineException(shard.shardId(), 1, "snapshot failed", e);
                }

                // Try and copy enough operations to the recovering peer so that if it is promoted to primary then it has a chance of being
                // able to recover other replicas using operations-based recoveries. If we are not using retention leases then we
                // conservatively copy all available operations. If we are using retention leases then "enough operations" is just the
                // operations from the local checkpoint of the safe commit onwards, because when using soft deletes the safe commit retains
                // at least as much history as anything else. The safe commit will often contain all the history retained by the current set
                // of retention leases, but this is not guaranteed: an earlier peer recovery from a different primary might have created a
                // retention lease for some history that this primary already discarded, since we discard history when the global checkpoint
                // advances and not when creating a new safe commit. In any case this is a best-effort thing since future recoveries can
                // always fall back to file-based ones, and only really presents a problem if this primary fails before things have settled
                // down.
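                // For example (hypothetical numbers): the computation below turns a safe commit whose user data
                // records a local checkpoint of 41 into startingSeqNo = 42, i.e. replay begins at the first
                // operation that is not guaranteed to be reflected in the copied segment files.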
                startingSeqNo = Long.parseLong(safeCommitRef.getIndexCommit().getUserData().get(SequenceNumbers.LOCAL_CHECKPOINT_KEY)) + 1L;
                logger.trace("performing file-based recovery followed by history replay starting at [{}]", startingSeqNo);

                try {
                    final int estimateNumOps = estimateNumberOfHistoryOperations(startingSeqNo);
                    final Releasable releaseStore = acquireStore(shard.store());
                    resources.add(releaseStore);
                    sendFileStep.whenComplete(r -> IOUtils.close(safeCommitRef, releaseStore), e -> {
                        try {
                            IOUtils.close(safeCommitRef, releaseStore);
                        } catch (Exception ex) {
                            logger.warn("releasing snapshot caused exception", ex);
                        }
                    });

                    final StepListener<ReplicationResponse> deleteRetentionLeaseStep = new StepListener<>();
                    runUnderPrimaryPermit(() -> {
                        try {
                            // If the target previously had a copy of this shard then a file-based recovery might move its global
                            // checkpoint backwards. We must therefore remove any existing retention lease so that we can create a
                            // new one later on in the recovery.
                            shard.removePeerRecoveryRetentionLease(
                                request.targetNode().getId(),
                                new ThreadedActionListener<>(
                                    logger,
                                    shard.getThreadPool(),
                                    ThreadPool.Names.GENERIC,
                                    deleteRetentionLeaseStep,
                                    false
                                )
                            );
                        } catch (RetentionLeaseNotFoundException e) {
                            logger.debug("no peer-recovery retention lease for " + request.targetAllocationId());
                            deleteRetentionLeaseStep.onResponse(null);
                        }
                    }, shardId + " removing retention lease for [" + request.targetAllocationId() + "]", shard, cancellableThreads, logger);

                    deleteRetentionLeaseStep.whenComplete(ignored -> {
                        assert Transports.assertNotTransportThread(RecoverySourceHandler.this + "[phase1]");
                        phase1(safeCommitRef.getIndexCommit(), startingSeqNo, () -> estimateNumOps, sendFileStep);
                    }, onFailure);

                } catch (final Exception e) {
                    throw new RecoveryEngineException(shard.shardId(), 1, "sendFileStep failed", e);
                }
            }
            assert startingSeqNo >= 0 : "startingSeqNo must be non negative. got: " + startingSeqNo;

            sendFileStep.whenComplete(r -> {
                assert Transports.assertNotTransportThread(RecoverySourceHandler.this + "[prepareTargetForTranslog]");
                // For a sequence based recovery, the target can keep its local translog
                prepareTargetForTranslog(estimateNumberOfHistoryOperations(startingSeqNo), prepareEngineStep);
            }, onFailure);

            prepareEngineStep.whenComplete(prepareEngineTime -> {
                assert Transports.assertNotTransportThread(RecoverySourceHandler.this + "[phase2]");
                /*
                 * add shard to replication group (shard will receive replication requests from this point on) now that engine is open.
                 * This means that any document indexed into the primary after this will be replicated to this replica as well
                 * make sure to do this before sampling the max sequence number in the next step, to ensure that we send
                 * all documents up to maxSeqNo in phase2.
                 */
                runUnderPrimaryPermit(
                    () -> shard.initiateTracking(request.targetAllocationId()),
                    shardId + " initiating tracking of " + request.targetAllocationId(),
                    shard,
                    cancellableThreads,
                    logger
                );

                final long endingSeqNo = shard.seqNoStats().getMaxSeqNo();
                logger.trace("snapshot for recovery; current size is [{}]", estimateNumberOfHistoryOperations(startingSeqNo));
                final Translog.Snapshot phase2Snapshot = shard.newChangesSnapshot(
                    "peer-recovery",
                    startingSeqNo,
                    Long.MAX_VALUE,
                    false,
                    false,
                    true
                );
                resources.add(phase2Snapshot);
                retentionLock.close();

                // we have to capture the max_seen_auto_id_timestamp and the max_seq_no_of_updates to make sure that these values
                // are at least as high as the corresponding values on the primary when any of these operations were executed on it.
                final long maxSeenAutoIdTimestamp = shard.getMaxSeenAutoIdTimestamp();
                final long maxSeqNoOfUpdatesOrDeletes = shard.getMaxSeqNoOfUpdatesOrDeletes();
                final RetentionLeases retentionLeases = shard.getRetentionLeases();
                final long mappingVersionOnPrimary = shard.indexSettings().getIndexMetadata().getMappingVersion();
                phase2(
                    startingSeqNo,
                    endingSeqNo,
                    phase2Snapshot,
                    maxSeenAutoIdTimestamp,
                    maxSeqNoOfUpdatesOrDeletes,
                    retentionLeases,
                    mappingVersionOnPrimary,
                    sendSnapshotStep
                );

            }, onFailure);

            // Recovery target can trim all operations >= startingSeqNo, as we have sent all these operations in phase 2
            final long trimAboveSeqNo = startingSeqNo - 1;
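            // For example (hypothetical numbers): with startingSeqNo = 100 this sets trimAboveSeqNo = 99, so the
            // target may discard any stale local copy of operations with seq_no >= 100, as phase 2 re-delivered them.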
            sendSnapshotStep.whenComplete(r -> finalizeRecovery(r.targetLocalCheckpoint, trimAboveSeqNo, finalizeStep), onFailure);

            finalizeStep.whenComplete(r -> {
                final long phase1ThrottlingWaitTime = 0L; // TODO: return the actual throttle time
                final SendSnapshotResult sendSnapshotResult = sendSnapshotStep.result();
                final SendFileResult sendFileResult = sendFileStep.result();
                final RecoveryResponse response = new RecoveryResponse(
                    sendFileResult.phase1FileNames,
                    sendFileResult.phase1FileSizes,
                    sendFileResult.phase1ExistingFileNames,
                    sendFileResult.phase1ExistingFileSizes,
                    sendFileResult.totalSize,
                    sendFileResult.existingTotalSize,
                    sendFileResult.took.millis(),
                    phase1ThrottlingWaitTime,
                    prepareEngineStep.result().millis(),
                    sendSnapshotResult.sentOperations,
                    sendSnapshotResult.tookTime.millis()
                );
                try {
                    future.onResponse(response);
                } finally {
                    IOUtils.close(resources);
                }
            }, onFailure);
        } catch (Exception e) {
            IOUtils.closeWhileHandlingException(releaseResources, () -> future.onFailure(e));
        }
    }

    private boolean isTargetSameHistory() {
        final String targetHistoryUUID = request.metadataSnapshot().getHistoryUUID();
        assert targetHistoryUUID != null : "incoming target history missing";
        return targetHistoryUUID.equals(shard.getHistoryUUID());
    }

    private int estimateNumberOfHistoryOperations(long startingSeqNo) throws IOException {
        return shard.countChanges("peer-recovery", startingSeqNo, Long.MAX_VALUE);
    }

    static void runUnderPrimaryPermit(
        CancellableThreads.Interruptible runnable,
        String reason,
        IndexShard primary,
        CancellableThreads cancellableThreads,
        Logger logger
    ) {
        cancellableThreads.execute(() -> {
            CompletableFuture<Releasable> permit = new CompletableFuture<>();
            final ActionListener<Releasable> onAcquired = new ActionListener<>() {
                @Override
                public void onResponse(Releasable releasable) {
                    if (permit.complete(releasable) == false) {
                        releasable.close();
                    }
                }

                @Override
                public void onFailure(Exception e) {
                    permit.completeExceptionally(e);
                }
            };
            primary.acquirePrimaryOperationPermit(onAcquired, ThreadPool.Names.SAME, reason);
            try (Releasable ignored = FutureUtils.get(permit)) {
                // check that the IndexShard still has the primary authority. This needs to be checked under operation permit to prevent
                // races, as IndexShard will switch its authority only when it holds all operation permits, see IndexShard.relocated()
                if (primary.isRelocatedPrimary()) {
                    throw new IndexShardRelocatedException(primary.shardId());
                }
                runnable.run();
            } finally {
                // just in case we got an exception (likely interrupted) while waiting for the get
                permit.whenComplete((r, e) -> {
                    if (r != null) {
                        r.close();
                    }
                    if (e != null) {
                        logger.trace("suppressing exception on completion (it was already bubbled up or the operation was aborted)", e);
                    }
                });
            }
        });
    }

    /**
     * Increases the store reference and returns a {@link Releasable} that will decrease the store reference using the generic thread pool.
     * We must never release the store using an interruptible thread as we can risk invalidating the node lock.
     */
    private Releasable acquireStore(Store store) {
        store.incRef();
        return Releasables.releaseOnce(() -> closeOnGenericThreadPool(store::decRef));
    }

    /**
     * Releasing a safe commit can access some commit files. It's better not to use {@link CancellableThreads} to interact
     * with the file systems due to interrupt (see {@link org.apache.lucene.store.NIOFSDirectory} javadocs for more detail).
     * This method acquires a safe commit and wraps it to make sure that it will be released using the generic thread pool.
     */
    private Engine.IndexCommitRef acquireSafeCommit(IndexShard shard) {
        final Engine.IndexCommitRef commitRef = shard.acquireSafeIndexCommit();
        return new Engine.IndexCommitRef(commitRef.getIndexCommit(), () -> closeOnGenericThreadPool(commitRef));
    }

    private void closeOnGenericThreadPool(Closeable closeable) {
        assert threadPool.generic().isShutdown() == false;
        threadPool.generic().execute(() -> {
            try {
                closeable.close();
            } catch (Exception e) {
                assert false : e;
                logger.warn(new ParameterizedMessage("Exception while closing [{}]", closeable), e);
            }
        });
    }

    static final class SendFileResult {
        final List<String> phase1FileNames;
        final List<Long> phase1FileSizes;
        final long totalSize;

        final List<String> phase1ExistingFileNames;
        final List<Long> phase1ExistingFileSizes;
        final long existingTotalSize;

        final TimeValue took;

        SendFileResult(
            List<String> phase1FileNames,
            List<Long> phase1FileSizes,
            long totalSize,
            List<String> phase1ExistingFileNames,
            List<Long> phase1ExistingFileSizes,
            long existingTotalSize,
            TimeValue took
        ) {
            this.phase1FileNames = phase1FileNames;
            this.phase1FileSizes = phase1FileSizes;
            this.totalSize = totalSize;
            this.phase1ExistingFileNames = phase1ExistingFileNames;
            this.phase1ExistingFileSizes = phase1ExistingFileSizes;
            this.existingTotalSize = existingTotalSize;
            this.took = took;
        }

        static final SendFileResult EMPTY = new SendFileResult(
            Collections.emptyList(),
            Collections.emptyList(),
            0L,
            Collections.emptyList(),
            Collections.emptyList(),
            0L,
            TimeValue.ZERO
        );
    }

    /**
     * Perform phase1 of the recovery operations. Once this {@link IndexCommit}
     * snapshot has been performed no commit operations (files being fsync'd)
     * are effectively allowed on this index until all recovery phases are done
     * <p>
     * Phase1 examines the segment files on the target node and copies over the
     * segments that are missing. Only segments that have the same size and
     * checksum can be reused
     */
    void phase1(IndexCommit snapshot, long startingSeqNo, IntSupplier translogOps, ActionListener<SendFileResult> listener) {
        cancellableThreads.checkForCancel();
        final Store store = shard.store();
        try {
            StopWatch stopWatch = new StopWatch().start();
            final Store.MetadataSnapshot recoverySourceMetadata;
            final String shardStateIdentifier;
            try {
                recoverySourceMetadata = store.getMetadata(snapshot);
                shardStateIdentifier = SnapshotShardsService.getShardStateId(shard, snapshot);
            } catch (CorruptIndexException | IndexFormatTooOldException | IndexFormatTooNewException ex) {
                shard.failShard("recovery", ex);
                throw ex;
            }
            for (String name : snapshot.getFileNames()) {
                final StoreFileMetadata md = recoverySourceMetadata.get(name);
                if (md == null) {
                    logger.info("Snapshot differs from actual index for file: {} meta: {}", name, recoverySourceMetadata.fileMetadataMap());
                    throw new CorruptIndexException(
                        "Snapshot differs from actual index - maybe index was removed metadata has "
                            + recoverySourceMetadata.fileMetadataMap().size()
                            + " files",
                        name
                    );
                }
            }
            if (canSkipPhase1(recoverySourceMetadata, request.metadataSnapshot()) == false) {
                cancellableThreads.checkForCancel();
                final boolean canUseSnapshots = useSnapshots && request.canDownloadSnapshotFiles();
                recoveryPlannerService.computeRecoveryPlan(
                    shard.shardId(),
                    shardStateIdentifier,
                    recoverySourceMetadata,
                    request.metadataSnapshot(),
                    startingSeqNo,
                    translogOps.getAsInt(),
                    getRequest().targetNode().getVersion(),
                    canUseSnapshots,
                    ActionListener.wrap(plan -> recoverFilesFromSourceAndSnapshot(plan, store, stopWatch, listener), listener::onFailure)
                );
            } else {
                logger.trace("skipping [phase1] since source and target have identical sync id [{}]", recoverySourceMetadata.getSyncId());
                // but we must still create a retention lease
                final StepListener<RetentionLease> createRetentionLeaseStep = new StepListener<>();
                createRetentionLease(startingSeqNo, createRetentionLeaseStep);
                createRetentionLeaseStep.whenComplete(retentionLease -> {
                    final TimeValue took = stopWatch.totalTime();
                    logger.trace("recovery [phase1]: took [{}]", took);
                    listener.onResponse(
                        new SendFileResult(
                            Collections.emptyList(),
                            Collections.emptyList(),
                            0L,
                            Collections.emptyList(),
                            Collections.emptyList(),
                            0L,
                            took
                        )
                    );
                }, listener::onFailure);
            }
        } catch (Exception e) {
            throw new RecoverFilesRecoveryException(request.shardId(), 0, new ByteSizeValue(0L), e);
        }
    }

    void recoverFilesFromSourceAndSnapshot(
        ShardRecoveryPlan shardRecoveryPlan,
        Store store,
        StopWatch stopWatch,
        ActionListener<SendFileResult> listener
    ) {
        cancellableThreads.checkForCancel();
        final List<String> filesToRecoverNames = shardRecoveryPlan.getFilesToRecoverNames();
        final List<Long> filesToRecoverSizes = shardRecoveryPlan.getFilesToRecoverSizes();
        final List<String> phase1ExistingFileNames = shardRecoveryPlan.getFilesPresentInTargetNames();
        final List<Long> phase1ExistingFileSizes = shardRecoveryPlan.getFilesPresentInTargetSizes();
        final long totalSize = shardRecoveryPlan.getTotalSize();
        final long existingTotalSize = shardRecoveryPlan.getExistingSize();
        if (logger.isTraceEnabled()) {
            for (StoreFileMetadata md : shardRecoveryPlan.getFilesPresentInTarget()) {
                logger.trace(
                    "recovery [phase1]: not recovering [{}], exist in local store and has checksum [{}]," + " size [{}]",
                    md.name(),
                    md.checksum(),
                    md.length()
                );
            }
            for (StoreFileMetadata md : shardRecoveryPlan.getSourceFilesToRecover()) {
                if (request.metadataSnapshot().fileMetadataMap().containsKey(md.name())) {
                    logger.trace(
                        "recovery [phase1]: recovering [{}], exists in local store, but is different: remote [{}], local [{}]",
                        md.name(),
                        request.metadataSnapshot().fileMetadataMap().get(md.name()),
                        md
                    );
                } else {
                    logger.trace("recovery [phase1]: recovering [{}], does not exist in remote", md.name());
                }
            }
            for (BlobStoreIndexShardSnapshot.FileInfo fileInfo : shardRecoveryPlan.getSnapshotFilesToRecover()) {
                final StoreFileMetadata md = fileInfo.metadata();
                if (request.metadataSnapshot().fileMetadataMap().containsKey(md.name())) {
                    logger.trace(
                        "recovery [phase1]: recovering [{}], exists in local store, but is different: remote [{}], local [{}]",
                        md.name(),
                        request.metadataSnapshot().fileMetadataMap().get(md.name()),
                        md
                    );
                } else {
                    logger.trace("recovery [phase1]: recovering [{}], does not exist in remote", md.name());
                }
            }
            logger.trace(
                "recovery [phase1]: recovering_files [{}] with total_size [{}], reusing_files [{}] with total_size [{}]",
                filesToRecoverNames.size(),
                new ByteSizeValue(totalSize),
                phase1ExistingFileNames.size(),
                new ByteSizeValue(existingTotalSize)
            );
        }

        // We need to pass the ShardRecovery plan between steps instead of capturing it in the closures
        // since the plan can change after a failure recovering files from the snapshots that cannot be
        // recovered from the source node, in that case we have to start from scratch using the fallback
        // recovery plan that would be used in subsequent steps.
        final StepListener<Void> sendFileInfoStep = new StepListener<>();
        final StepListener<Tuple<ShardRecoveryPlan, List<StoreFileMetadata>>> recoverSnapshotFilesStep = new StepListener<>();
        final StepListener<ShardRecoveryPlan> sendFilesStep = new StepListener<>();
        final StepListener<Tuple<ShardRecoveryPlan, RetentionLease>> createRetentionLeaseStep = new StepListener<>();
        final StepListener<ShardRecoveryPlan> cleanFilesStep = new StepListener<>();

        final int translogOps = shardRecoveryPlan.getTranslogOps();
        recoveryTarget.receiveFileInfo(
            filesToRecoverNames,
            filesToRecoverSizes,
            phase1ExistingFileNames,
            phase1ExistingFileSizes,
            translogOps,
            sendFileInfoStep
        );

        sendFileInfoStep.whenComplete(unused -> {
            recoverSnapshotFiles(shardRecoveryPlan, new ActionListener<>() {
                @Override
                public void onResponse(List<StoreFileMetadata> filesFailedToRecoverFromSnapshot) {
                    recoverSnapshotFilesStep.onResponse(Tuple.tuple(shardRecoveryPlan, filesFailedToRecoverFromSnapshot));
                }

                @Override
                public void onFailure(Exception e) {
                    if (shardRecoveryPlan.canRecoverSnapshotFilesFromSourceNode() == false
                        && e instanceof CancellableThreads.ExecutionCancelledException == false) {
                        ShardRecoveryPlan fallbackPlan = shardRecoveryPlan.getFallbackPlan();
                        recoveryTarget.receiveFileInfo(
                            fallbackPlan.getFilesToRecoverNames(),
                            fallbackPlan.getFilesToRecoverSizes(),
                            fallbackPlan.getFilesPresentInTargetNames(),
                            fallbackPlan.getFilesPresentInTargetSizes(),
                            fallbackPlan.getTranslogOps(),
                            recoverSnapshotFilesStep.map(r -> Tuple.tuple(fallbackPlan, Collections.emptyList()))
                        );
                    } else {
                        recoverSnapshotFilesStep.onFailure(e);
                    }
                }
            });
        }, listener::onFailure);

        recoverSnapshotFilesStep.whenComplete(planAndFilesFailedToRecoverFromSnapshot -> {
            ShardRecoveryPlan recoveryPlan = planAndFilesFailedToRecoverFromSnapshot.v1();
            List<StoreFileMetadata> filesFailedToRecoverFromSnapshot = planAndFilesFailedToRecoverFromSnapshot.v2();
            final List<StoreFileMetadata> filesToRecoverFromSource;
            if (filesFailedToRecoverFromSnapshot.isEmpty()) {
                filesToRecoverFromSource = recoveryPlan.getSourceFilesToRecover();
            } else {
                filesToRecoverFromSource = concatLists(recoveryPlan.getSourceFilesToRecover(), filesFailedToRecoverFromSnapshot);
            }

            sendFiles(
                store,
                filesToRecoverFromSource.toArray(new StoreFileMetadata[0]),
                recoveryPlan::getTranslogOps,
                sendFilesStep.map(unused -> recoveryPlan)
            );
        }, listener::onFailure);

        sendFilesStep.whenComplete(recoveryPlan -> {
            createRetentionLease(
                recoveryPlan.getStartingSeqNo(),
                createRetentionLeaseStep.map(retentionLease -> Tuple.tuple(recoveryPlan, retentionLease))
            );
        }, listener::onFailure);

        createRetentionLeaseStep.whenComplete(recoveryPlanAndRetentionLease -> {
            final ShardRecoveryPlan recoveryPlan = recoveryPlanAndRetentionLease.v1();
            final RetentionLease retentionLease = recoveryPlanAndRetentionLease.v2();

            final Store.MetadataSnapshot recoverySourceMetadata = recoveryPlan.getSourceMetadataSnapshot();
            final long lastKnownGlobalCheckpoint = shard.getLastKnownGlobalCheckpoint();
            assert retentionLease == null || retentionLease.retainingSequenceNumber() - 1 <= lastKnownGlobalCheckpoint
                : retentionLease + " vs " + lastKnownGlobalCheckpoint;

            // Establishes new empty translog on the replica with global checkpoint set to lastKnownGlobalCheckpoint. We want
            // the commit we just copied to be a safe commit on the replica, so why not set the global checkpoint on the replica
            // to the max seqno of this commit? Because (in rare corner cases) this commit might not be a safe commit here on
            // the primary, and in these cases the max seqno would be too high to be valid as a global checkpoint.
            cleanFiles(
                store,
                recoverySourceMetadata,
                () -> translogOps,
                lastKnownGlobalCheckpoint,
                cleanFilesStep.map(unused -> recoveryPlan)
            );
        }, listener::onFailure);

        cleanFilesStep.whenComplete(recoveryPlan -> {
            final TimeValue took = stopWatch.totalTime();
            logger.trace("recovery [phase1]: took [{}]", took);
            listener.onResponse(
                new SendFileResult(
                    recoveryPlan.getFilesToRecoverNames(),
                    recoveryPlan.getFilesToRecoverSizes(),
                    recoveryPlan.getTotalSize(),
                    recoveryPlan.getFilesPresentInTargetNames(),
                    recoveryPlan.getFilesPresentInTargetSizes(),
                    recoveryPlan.getExistingSize(),
                    took
                )
            );
        }, listener::onFailure);
    }

    /**
     * Send requests to the target node to recover files from a given snapshot. In case of failure, the listener
     * value contains the list of files that failed to be recovered from a snapshot.
     */
    void recoverSnapshotFiles(ShardRecoveryPlan shardRecoveryPlan, ActionListener<List<StoreFileMetadata>> listener) {
        ShardRecoveryPlan.SnapshotFilesToRecover snapshotFilesToRecover = shardRecoveryPlan.getSnapshotFilesToRecover();
        if (snapshotFilesToRecover.isEmpty()) {
            listener.onResponse(Collections.emptyList());
            return;
        }

        new SnapshotRecoverFileRequestsSender(shardRecoveryPlan, listener).start();
    }

    private class SnapshotRecoverFileRequestsSender {
        private final ShardRecoveryPlan shardRecoveryPlan;
        private final ShardRecoveryPlan.SnapshotFilesToRecover snapshotFilesToRecover;
        private final ActionListener<List<StoreFileMetadata>> listener;
        private final CountDown countDown;
        private final BlockingQueue<BlobStoreIndexShardSnapshot.FileInfo> pendingSnapshotFilesToRecover;
        private final AtomicBoolean cancelled = new AtomicBoolean();
        private final Set<ListenableFuture<Void>> outstandingRequests = new HashSet<>(maxConcurrentSnapshotFileDownloads);
        private List<StoreFileMetadata> filesFailedToDownloadFromSnapshot;

        SnapshotRecoverFileRequestsSender(ShardRecoveryPlan shardRecoveryPlan, ActionListener<List<StoreFileMetadata>> listener) {
            this.shardRecoveryPlan = shardRecoveryPlan;
            this.snapshotFilesToRecover = shardRecoveryPlan.getSnapshotFilesToRecover();
            this.listener = listener;
            this.countDown = new CountDown(shardRecoveryPlan.getSnapshotFilesToRecover().size());
            this.pendingSnapshotFilesToRecover = new LinkedBlockingQueue<>(
                shardRecoveryPlan.getSnapshotFilesToRecover().getSnapshotFiles()
            );
        }

        void start() {
            for (int i = 0; i < maxConcurrentSnapshotFileDownloads; i++) {
                sendRequest();
            }
        }

        void sendRequest() {
            BlobStoreIndexShardSnapshot.FileInfo snapshotFileToRecover = pendingSnapshotFilesToRecover.poll();
            if (snapshotFileToRecover == null) {
                return;
            }

            final ListenableFuture<Void> requestFuture = new ListenableFuture<>();
            try {
                cancellableThreads.checkForCancel();

                ActionListener<Void> sendRequestListener = new ActionListener<>() {
                    @Override
                    public void onResponse(Void unused) {
                        onRequestCompletion(snapshotFileToRecover.metadata(), null);
                    }

                    @Override
                    public void onFailure(Exception e) {
                        if (cancelled.get() || e instanceof CancellableThreads.ExecutionCancelledException) {
                            logger.debug(
                                new ParameterizedMessage(
                                    "cancelled while recovering file [{}] from snapshot",
                                    snapshotFileToRecover.metadata()
                                ),
                                e
                            );
                        } else {
                            logger.warn(
                                new ParameterizedMessage(
                                    "failed to recover file [{}] from snapshot{}",
                                    snapshotFileToRecover.metadata(),
", will recover from primary instead" : "" ), e ); } if (shardRecoveryPlan.canRecoverSnapshotFilesFromSourceNode()) { onRequestCompletion(snapshotFileToRecover.metadata(), e); } else { cancel(e); } } }; requestFuture.addListener(sendRequestListener); trackOutstandingRequest(requestFuture); recoveryTarget.restoreFileFromSnapshot( snapshotFilesToRecover.getRepository(), snapshotFilesToRecover.getIndexId(), snapshotFileToRecover, ActionListener.runBefore(requestFuture, () -> unTrackOutstandingRequest(requestFuture)) ); } catch (CancellableThreads.ExecutionCancelledException e) { cancel(e); } catch (Exception e) { unTrackOutstandingRequest(requestFuture); onRequestCompletion(snapshotFileToRecover.metadata(), e); } } void cancel(Exception e) { if (cancelled.compareAndSet(false, true)) { pendingSnapshotFilesToRecover.clear(); notifyFailureOnceAllOutstandingRequestAreDone(e); } } void onRequestCompletion(StoreFileMetadata storeFileMetadata, @Nullable Exception exception) { if (cancelled.get()) { return; } if (exception != null) { addFileFailedToRecoverFromSnapshot(storeFileMetadata); } if (countDown.countDown()) { final List failedToRecoverFromSnapshotFiles = getFilesFailedToRecoverFromSnapshot(); listener.onResponse(failedToRecoverFromSnapshotFiles); } else { sendRequest(); } } synchronized void addFileFailedToRecoverFromSnapshot(StoreFileMetadata storeFileMetadata) { if (filesFailedToDownloadFromSnapshot == null) { filesFailedToDownloadFromSnapshot = new ArrayList<>(); } filesFailedToDownloadFromSnapshot.add(storeFileMetadata); } synchronized List getFilesFailedToRecoverFromSnapshot() { return Objects.requireNonNullElse(filesFailedToDownloadFromSnapshot, Collections.emptyList()); } private void trackOutstandingRequest(ListenableFuture future) { boolean cancelled; synchronized (outstandingRequests) { cancelled = cancellableThreads.isCancelled() || this.cancelled.get(); if (cancelled == false) { outstandingRequests.add(future); } } if (cancelled) { cancellableThreads.checkForCancel(); // If the recover snapshot files operation is cancelled but the recovery is still // valid, it means that some of the snapshot files download failed and the snapshot files // differ from the source index files. In that case we have to cancel all pending operations // and wait until all the in-flight operations are done to reset the recovery and start from // scratch using the source node index files. 
                assert this.cancelled.get();
                throw new CancellableThreads.ExecutionCancelledException("Recover snapshot files cancelled");
            }
        }

        private void unTrackOutstandingRequest(ListenableFuture<Void> future) {
            synchronized (outstandingRequests) {
                outstandingRequests.remove(future);
            }
        }

        private void notifyFailureOnceAllOutstandingRequestAreDone(Exception e) {
            assert cancelled.get();

            final Set<ListenableFuture<Void>> pendingRequests;
            synchronized (outstandingRequests) {
                pendingRequests = new HashSet<>(outstandingRequests);
            }

            if (pendingRequests.isEmpty()) {
                listener.onFailure(e);
                return;
            }
            // The recovery was cancelled so from this point onwards outstandingRequests won't track
            // new requests and therefore we can safely wait until all the pending requests complete
            // to notify the listener about the cancellation
            final CountDown pendingRequestsCountDown = new CountDown(pendingRequests.size());
            for (ListenableFuture<Void> outstandingFuture : pendingRequests) {
                outstandingFuture.addListener(ActionListener.wrap(() -> {
                    if (pendingRequestsCountDown.countDown()) {
                        listener.onFailure(e);
                    }
                }));
            }
        }
    }

    void createRetentionLease(final long startingSeqNo, ActionListener<RetentionLease> listener) {
        runUnderPrimaryPermit(() -> {
            // Clone the peer recovery retention lease belonging to the source shard. We are retaining history between the local
            // checkpoint of the safe commit we're creating and this lease's retained seqno with the retention lock, and by cloning an
            // existing lease we (approximately) know that all our peers are also retaining history as requested by the cloned lease. If
            // the recovery now fails before copying enough history over then a subsequent attempt will find this lease, determine it is
            // not enough, and fall back to a file-based recovery.
            //
            // (approximately) because we do not guarantee to be able to satisfy every lease on every peer.
            logger.trace("cloning primary's retention lease");
            try {
                final StepListener<ReplicationResponse> cloneRetentionLeaseStep = new StepListener<>();
                final RetentionLease clonedLease = shard.cloneLocalPeerRecoveryRetentionLease(
                    request.targetNode().getId(),
                    new ThreadedActionListener<>(logger, shard.getThreadPool(), ThreadPool.Names.GENERIC, cloneRetentionLeaseStep, false)
                );
                logger.trace("cloned primary's retention lease as [{}]", clonedLease);
                cloneRetentionLeaseStep.addListener(listener.map(rr -> clonedLease));
            } catch (RetentionLeaseNotFoundException e) {
                // it's possible that the primary has no retention lease yet if we are doing a rolling upgrade from a version before
                // 7.4, and in that case we just create a lease using the local checkpoint of the safe commit which we're using for
                // recovery as a conservative estimate for the global checkpoint.
                assert shard.indexSettings().getIndexVersionCreated().before(Version.V_7_4_0)
                    || shard.indexSettings().isSoftDeleteEnabled() == false;
                final StepListener<ReplicationResponse> addRetentionLeaseStep = new StepListener<>();
                final long estimatedGlobalCheckpoint = startingSeqNo - 1;
                final RetentionLease newLease = shard.addPeerRecoveryRetentionLease(
                    request.targetNode().getId(),
                    estimatedGlobalCheckpoint,
                    new ThreadedActionListener<>(logger, shard.getThreadPool(), ThreadPool.Names.GENERIC, addRetentionLeaseStep, false)
                );
                addRetentionLeaseStep.addListener(listener.map(rr -> newLease));
                logger.trace("created retention lease with estimated checkpoint of [{}]", estimatedGlobalCheckpoint);
            }
        }, shardId + " establishing retention lease for [" + request.targetAllocationId() + "]", shard, cancellableThreads, logger);
    }

    boolean canSkipPhase1(Store.MetadataSnapshot source, Store.MetadataSnapshot target) {
        if (source.getSyncId() == null || source.getSyncId().equals(target.getSyncId()) == false) {
            return false;
        }
        if (source.numDocs() != target.numDocs()) {
            throw new IllegalStateException(
                "try to recover "
                    + request.shardId()
                    + " from primary shard with sync id but number "
                    + "of docs differ: "
                    + source.numDocs()
                    + " ("
                    + request.sourceNode().getName()
                    + ", primary) vs "
                    + target.numDocs()
                    + "("
                    + request.targetNode().getName()
                    + ")"
            );
        }
        SequenceNumbers.CommitInfo sourceSeqNos = SequenceNumbers.loadSeqNoInfoFromLuceneCommit(source.commitUserData().entrySet());
        SequenceNumbers.CommitInfo targetSeqNos = SequenceNumbers.loadSeqNoInfoFromLuceneCommit(target.commitUserData().entrySet());
        if (sourceSeqNos.localCheckpoint != targetSeqNos.localCheckpoint || targetSeqNos.maxSeqNo != sourceSeqNos.maxSeqNo) {
            final String message = "try to recover "
                + request.shardId()
                + " with sync id but seq_no stats are mismatched: ["
                + source.commitUserData()
                + "] vs ["
                + target.commitUserData()
                + "]";
            assert false : message;
            throw new IllegalStateException(message);
        }
        return true;
    }

    void prepareTargetForTranslog(int totalTranslogOps, ActionListener<TimeValue> listener) {
        StopWatch stopWatch = new StopWatch().start();
        final ActionListener<Void> wrappedListener = ActionListener.wrap(nullVal -> {
            stopWatch.stop();
            final TimeValue tookTime = stopWatch.totalTime();
            logger.trace("recovery [phase1]: remote engine start took [{}]", tookTime);
            listener.onResponse(tookTime);
        }, e -> listener.onFailure(new RecoveryEngineException(shard.shardId(), 1, "prepare target for translog failed", e)));
        // Send a request preparing the new shard's translog to receive operations. This ensures the shard engine is started and disables
        // garbage collection (not the JVM's GC!) of tombstone deletes.
        logger.trace("recovery [phase1]: prepare remote engine for translog");
        cancellableThreads.checkForCancel();
        recoveryTarget.prepareForTranslogOperations(totalTranslogOps, wrappedListener);
    }

    /**
     * Perform phase two of the recovery process.
     * <p>
     * Phase two uses a snapshot of the current translog *without* acquiring the write lock (however, the translog snapshot is
     * a point-in-time view of the translog). It then sends each translog operation to the target node so it can be replayed into the new
     * shard.
     *
     * @param startingSeqNo              the sequence number to start recovery from, or {@link SequenceNumbers#UNASSIGNED_SEQ_NO} if all
     *                                   ops should be sent
     * @param endingSeqNo                the highest sequence number that should be sent
     * @param snapshot                   a snapshot of the translog
     * @param maxSeenAutoIdTimestamp     the max auto_id_timestamp of append-only requests on the primary
     * @param maxSeqNoOfUpdatesOrDeletes the max seq_no of updates or deletes on the primary after these operations were executed on it.
     * @param listener                   a listener which will be notified with the local checkpoint on the target.
     */
    void phase2(
        final long startingSeqNo,
        final long endingSeqNo,
        final Translog.Snapshot snapshot,
        final long maxSeenAutoIdTimestamp,
        final long maxSeqNoOfUpdatesOrDeletes,
        final RetentionLeases retentionLeases,
        final long mappingVersion,
        final ActionListener<SendSnapshotResult> listener
    ) throws IOException {
        if (shard.state() == IndexShardState.CLOSED) {
            throw new IndexShardClosedException(request.shardId());
        }
        logger.trace("recovery [phase2]: sending transaction log operations (from [" + startingSeqNo + "] to [" + endingSeqNo + "]");

        final StopWatch stopWatch = new StopWatch().start();
        final StepListener<Void> sendListener = new StepListener<>();
        final OperationBatchSender sender = new OperationBatchSender(
            startingSeqNo,
            endingSeqNo,
            snapshot,
            maxSeenAutoIdTimestamp,
            maxSeqNoOfUpdatesOrDeletes,
            retentionLeases,
            mappingVersion,
            sendListener
        );
        sendListener.whenComplete(ignored -> {
            final long skippedOps = sender.skippedOps.get();
            final int totalSentOps = sender.sentOps.get();
            final long targetLocalCheckpoint = sender.targetLocalCheckpoint.get();
            assert snapshot.totalOperations() == snapshot.skippedOperations() + skippedOps + totalSentOps
                : String.format(
                    Locale.ROOT,
                    "expected total [%d], overridden [%d], skipped [%d], total sent [%d]",
                    snapshot.totalOperations(),
                    snapshot.skippedOperations(),
                    skippedOps,
                    totalSentOps
                );
            stopWatch.stop();
            final TimeValue tookTime = stopWatch.totalTime();
            logger.trace("recovery [phase2]: took [{}]", tookTime);
            listener.onResponse(new SendSnapshotResult(targetLocalCheckpoint, totalSentOps, tookTime));
        }, listener::onFailure);
        sender.start();
    }

    private static class OperationChunkRequest implements MultiChunkTransfer.ChunkRequest {
        final List<Translog.Operation> operations;
        final boolean lastChunk;

        OperationChunkRequest(List<Translog.Operation> operations, boolean lastChunk) {
            this.operations = operations;
            this.lastChunk = lastChunk;
        }

        @Override
        public boolean lastChunk() {
            return lastChunk;
        }
    }

    private class OperationBatchSender extends MultiChunkTransfer<Translog.Snapshot, OperationChunkRequest> {
        private final long startingSeqNo;
        private final long endingSeqNo;
        private final Translog.Snapshot snapshot;
        private final long maxSeenAutoIdTimestamp;
        private final long maxSeqNoOfUpdatesOrDeletes;
        private final RetentionLeases retentionLeases;
        private final long mappingVersion;
        private int lastBatchCount = 0; // used to estimate the count of the subsequent batch.
        private final AtomicInteger skippedOps = new AtomicInteger();
        private final AtomicInteger sentOps = new AtomicInteger();
        private final AtomicLong targetLocalCheckpoint = new AtomicLong(SequenceNumbers.NO_OPS_PERFORMED);

        OperationBatchSender(
            long startingSeqNo,
            long endingSeqNo,
            Translog.Snapshot snapshot,
            long maxSeenAutoIdTimestamp,
            long maxSeqNoOfUpdatesOrDeletes,
            RetentionLeases retentionLeases,
            long mappingVersion,
            ActionListener<Void> listener
        ) {
            super(logger, threadPool.getThreadContext(), listener, maxConcurrentOperations, List.of(snapshot));
            this.startingSeqNo = startingSeqNo;
            this.endingSeqNo = endingSeqNo;
            this.snapshot = snapshot;
            this.maxSeenAutoIdTimestamp = maxSeenAutoIdTimestamp;
            this.maxSeqNoOfUpdatesOrDeletes = maxSeqNoOfUpdatesOrDeletes;
            this.retentionLeases = retentionLeases;
            this.mappingVersion = mappingVersion;
        }
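        // Batching example (illustrative, hypothetical numbers): with chunkSizeInBytes = 512 KiB, nextChunkRequest
        // below keeps appending operations until their cumulative estimated size reaches 512 KiB, so roughly a
        // hundred ~5 KiB operations form one OperationChunkRequest; up to maxConcurrentOperations such batches are
        // in flight at any time.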
        @Override
        protected synchronized OperationChunkRequest nextChunkRequest(Translog.Snapshot snapshot) throws IOException {
            // We need to synchronize Snapshot#next() because it's called by different threads through sendBatch.
            // Even though those calls are not concurrent, Snapshot#next() uses non-synchronized state and is not multi-thread-compatible.
            assert Transports.assertNotTransportThread("[phase2]");
            cancellableThreads.checkForCancel();
            final List<Translog.Operation> ops = lastBatchCount > 0 ? new ArrayList<>(lastBatchCount) : new ArrayList<>();
            long batchSizeInBytes = 0L;
            Translog.Operation operation;
            while ((operation = snapshot.next()) != null) {
                if (shard.state() == IndexShardState.CLOSED) {
                    throw new IndexShardClosedException(request.shardId());
                }
                final long seqNo = operation.seqNo();
                if (seqNo < startingSeqNo || seqNo > endingSeqNo) {
                    skippedOps.incrementAndGet();
                    continue;
                }
                ops.add(operation);
                batchSizeInBytes += operation.estimateSize();
                sentOps.incrementAndGet();

                // check if this request is past bytes threshold, and if so, send it off
                if (batchSizeInBytes >= chunkSizeInBytes) {
                    break;
                }
            }
            lastBatchCount = ops.size();
            return new OperationChunkRequest(ops, operation == null);
        }

        @Override
        protected void executeChunkRequest(OperationChunkRequest request, ActionListener<Void> listener) {
            cancellableThreads.checkForCancel();
            recoveryTarget.indexTranslogOperations(
                request.operations,
                snapshot.totalOperations(),
                maxSeenAutoIdTimestamp,
                maxSeqNoOfUpdatesOrDeletes,
                retentionLeases,
                mappingVersion,
                listener.delegateFailure((l, newCheckpoint) -> {
                    targetLocalCheckpoint.updateAndGet(curr -> SequenceNumbers.max(curr, newCheckpoint));
                    l.onResponse(null);
                })
            );
        }

        @Override
        protected void handleError(Translog.Snapshot snapshot, Exception e) {
            throw new RecoveryEngineException(shard.shardId(), 2, "failed to send/replay operations", e);
        }

        @Override
        public void close() throws IOException {
            snapshot.close();
        }
    }

    void finalizeRecovery(long targetLocalCheckpoint, long trimAboveSeqNo, ActionListener<Void> listener) {
        if (shard.state() == IndexShardState.CLOSED) {
            throw new IndexShardClosedException(request.shardId());
        }
        cancellableThreads.checkForCancel();
        StopWatch stopWatch = new StopWatch().start();
        logger.trace("finalizing recovery");
        /*
         * Before marking the shard as in-sync we acquire an operation permit. We do this so that there is a barrier between marking a
         * shard as in-sync and relocating a shard. If we acquire the permit then no relocation handoff can complete before we are done
         * marking the shard as in-sync. If the relocation handoff holds all the permits then after the handoff completes and we acquire
         * the permit then the state of the shard will be relocated and this recovery will fail.
         */
        runUnderPrimaryPermit(
            () -> shard.markAllocationIdAsInSync(request.targetAllocationId(), targetLocalCheckpoint),
            shardId + " marking " + request.targetAllocationId() + " as in sync",
            shard,
            cancellableThreads,
            logger
        );
        final long globalCheckpoint = shard.getLastKnownGlobalCheckpoint(); // this global checkpoint is persisted in finalizeRecovery
        final StepListener<Void> finalizeListener = new StepListener<>();
        cancellableThreads.checkForCancel();
        recoveryTarget.finalizeRecovery(globalCheckpoint, trimAboveSeqNo, finalizeListener);
        finalizeListener.whenComplete(r -> {
            runUnderPrimaryPermit(
                () -> shard.updateGlobalCheckpointForShard(request.targetAllocationId(), globalCheckpoint),
                shardId + " updating " + request.targetAllocationId() + "'s global checkpoint",
                shard,
                cancellableThreads,
                logger
            );

            if (request.isPrimaryRelocation()) {
                logger.trace("performing relocation hand-off");
                // this acquires all IndexShard operation permits and will thus delay new recoveries until it is done
                cancellableThreads.execute(
                    () -> shard.relocated(request.targetAllocationId(), recoveryTarget::handoffPrimaryContext, ActionListener.wrap(v -> {
                        cancellableThreads.checkForCancel();
                        completeFinalizationListener(listener, stopWatch);
                    }, listener::onFailure))
                );
                /*
                 * if the recovery process fails after disabling primary mode on the source shard, both relocation source and
                 * target are failed (see {@link IndexShard#updateRoutingEntry}).
                 */
            } else {
                completeFinalizationListener(listener, stopWatch);
            }
        }, listener::onFailure);
    }

    private void completeFinalizationListener(ActionListener<Void> listener, StopWatch stopWatch) {
        stopWatch.stop();
        logger.trace("finalizing recovery took [{}]", stopWatch.totalTime());
        listener.onResponse(null);
    }

    static final class SendSnapshotResult {
        final long targetLocalCheckpoint;
        final int sentOperations;
        final TimeValue tookTime;

        SendSnapshotResult(final long targetLocalCheckpoint, final int sentOperations, final TimeValue tookTime) {
            this.targetLocalCheckpoint = targetLocalCheckpoint;
            this.sentOperations = sentOperations;
            this.tookTime = tookTime;
        }
    }

    /**
     * Cancels the recovery and interrupts all eligible threads.
     */
    public void cancel(String reason) {
        cancellableThreads.cancel(reason);
        recoveryTarget.cancel();
    }

    @Override
    public String toString() {
        return "ShardRecoveryHandler{"
            + "shardId="
            + request.shardId()
            + ", sourceNode="
            + request.sourceNode()
            + ", targetNode="
            + request.targetNode()
            + '}';
    }

    private static class FileChunk implements MultiChunkTransfer.ChunkRequest, Releasable {
        final StoreFileMetadata md;
        final BytesReference content;
        final long position;
        final boolean lastChunk;
        final Releasable onClose;

        FileChunk(StoreFileMetadata md, BytesReference content, long position, boolean lastChunk, Releasable onClose) {
            this.md = md;
            this.content = content;
            this.position = position;
            this.lastChunk = lastChunk;
            this.onClose = onClose;
        }

        @Override
        public boolean lastChunk() {
            return lastChunk;
        }

        @Override
        public void close() {
            onClose.close();
        }
    }

    void sendFiles(Store store, StoreFileMetadata[] files, IntSupplier translogOps, ActionListener<Void> listener) {
        ArrayUtil.timSort(files, Comparator.comparingLong(StoreFileMetadata::length)); // send smallest first

        // use a smaller buffer than the configured chunk size if we only have files smaller than the chunk size
        final int bufferSize = files.length == 0 ? 0 : (int) Math.min(chunkSizeInBytes, files[files.length - 1].length());
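        // Sizing example (illustrative, hypothetical numbers): with a 512 KiB chunk size but a largest file of only
        // 64 KiB, each recycled buffer below is allocated at 64 KiB rather than the full chunk size.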
        Releasable temporaryStoreRef = acquireStore(store);
        try {
            final Releasable storeRef = temporaryStoreRef;
            final MultiChunkTransfer<StoreFileMetadata, FileChunk> multiFileSender = new MultiChunkTransfer<>(
                logger,
                threadPool.getThreadContext(),
                listener,
                maxConcurrentFileChunks,
                Arrays.asList(files)
            ) {

                final Deque<byte[]> buffers = new ConcurrentLinkedDeque<>();
                final AtomicInteger liveBufferCount = new AtomicInteger(); // only used in assertions to verify proper recycling
                IndexInput currentInput = null;
                long offset = 0;

                @Override
                protected void onNewResource(StoreFileMetadata md) throws IOException {
                    offset = 0;
                    IOUtils.close(currentInput);
                    if (md.hashEqualsContents()) {
                        // we already have the file contents on heap no need to open the file again
                        currentInput = null;
                    } else {
                        currentInput = store.directory().openInput(md.name(), IOContext.READONCE);
                    }
                }

                @Override
                protected FileChunk nextChunkRequest(StoreFileMetadata md) throws IOException {
                    assert Transports.assertNotTransportThread("read file chunk");
                    cancellableThreads.checkForCancel();
                    if (currentInput == null) {
                        // no input => reading directly from the metadata
                        assert md.hashEqualsContents();
                        return new FileChunk(md, new BytesArray(md.hash()), 0, true, () -> {});
                    }
                    final byte[] buffer = Objects.requireNonNullElseGet(buffers.pollFirst(), () -> new byte[bufferSize]);
                    assert liveBufferCount.incrementAndGet() > 0;
                    final int toRead = Math.toIntExact(Math.min(md.length() - offset, buffer.length));
                    currentInput.readBytes(buffer, 0, toRead, false);
                    final boolean lastChunk = offset + toRead == md.length();
                    final FileChunk chunk = new FileChunk(md, new BytesArray(buffer, 0, toRead), offset, lastChunk, () -> {
                        assert liveBufferCount.decrementAndGet() >= 0;
                        buffers.addFirst(buffer);
                    });
                    offset += toRead;
                    return chunk;
                }

                @Override
                protected void executeChunkRequest(FileChunk request, ActionListener<Void> listener) {
                    cancellableThreads.checkForCancel();
                    final ReleasableBytesReference content = new ReleasableBytesReference(request.content, request);
                    recoveryTarget.writeFileChunk(
                        request.md,
                        request.position,
                        content,
                        request.lastChunk,
                        translogOps.getAsInt(),
                        ActionListener.runBefore(listener, content::close)
                    );
                }

                @Override
                protected void handleError(StoreFileMetadata md, Exception e) throws Exception {
                    handleErrorOnSendFiles(store, e, new StoreFileMetadata[] { md });
                }

                @Override
                public void close() throws IOException {
                    IOUtils.close(currentInput, storeRef);
                }

                @Override
                protected boolean assertOnSuccess() {
                    assert liveBufferCount.get() == 0 : "leaked [" + liveBufferCount + "] buffers";
                    return true;
                }
            };
            resources.add(multiFileSender);
            temporaryStoreRef = null; // now owned by multiFileSender, tracked in resources, so won't be leaked
            multiFileSender.start();
        } finally {
            Releasables.close(temporaryStoreRef);
        }
    }

    private void cleanFiles(
        Store store,
        Store.MetadataSnapshot sourceMetadata,
        IntSupplier translogOps,
        long globalCheckpoint,
        ActionListener<Void> listener
    ) {
        // Send the CLEAN_FILES request, which takes all of the files that
        // were transferred and renames them from their temporary file
        // names to the actual file names. It also writes checksums for
        // the files after they have been renamed.
        //
        // Once the files have been renamed, any other files that are not
        // related to this recovery (out of date segments, for example)
        // are deleted
        cancellableThreads.checkForCancel();
        recoveryTarget.cleanFiles(
            translogOps.getAsInt(),
            globalCheckpoint,
            sourceMetadata,
            listener.delegateResponse((l, e) -> ActionListener.completeWith(l, () -> {
                StoreFileMetadata[] mds = StreamSupport.stream(sourceMetadata.spliterator(), false).toArray(StoreFileMetadata[]::new);
                ArrayUtil.timSort(mds, Comparator.comparingLong(StoreFileMetadata::length)); // check small files first
                handleErrorOnSendFiles(store, e, mds);
                throw e;
            }))
        );
    }

    private void handleErrorOnSendFiles(Store store, Exception e, StoreFileMetadata[] mds) throws Exception {
        final IOException corruptIndexException = ExceptionsHelper.unwrapCorruption(e);
        assert Transports.assertNotTransportThread(RecoverySourceHandler.this + "[handle error on send/clean files]");
        if (corruptIndexException != null) {
            Exception localException = null;
            for (StoreFileMetadata md : mds) {
                cancellableThreads.checkForCancel();
                logger.debug("checking integrity for file {} after remove corruption exception", md);
                if (store.checkIntegrityNoException(md) == false) {
                    // we are corrupted on the primary -- fail!
                    logger.warn("{} Corrupted file detected {} checksum mismatch", shardId, md);
                    if (localException == null) {
                        localException = corruptIndexException;
                    }
                    failEngine(corruptIndexException);
                }
            }
            if (localException != null) {
                throw localException;
            } else {
                // corruption has happened on the way to replica
                RemoteTransportException remoteException = new RemoteTransportException(
                    "File corruption occurred on recovery but checksums are ok",
                    null
                );
                remoteException.addSuppressed(e);
                logger.warn(
                    () -> format(
                        "%s Remote file corruption on node %s, recovering %s. local checksum OK",
                        shardId,
                        request.targetNode(),
                        mds
                    ),
                    corruptIndexException
                );
                throw remoteException;
            }
        }
        throw e;
    }

    protected void failEngine(IOException cause) {
        shard.failShard("recovery", cause);
    }
}




