All downloads are free. Search and download functionality uses the official Maven repository.

org.elasticsearch.snapshots.SnapshotsService Maven / Gradle / Ivy

There is a newer version: 8.15.1
Show newest version
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0 and the Server Side Public License, v 1; you may not use this file except
 * in compliance with, at your election, the Elastic License 2.0 or the Server
 * Side Public License, v 1.
 */

package org.elasticsearch.snapshots;

import org.apache.logging.log4j.Level;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.elasticsearch.ExceptionsHelper;
import org.elasticsearch.Version;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.ActionResponse;
import org.elasticsearch.action.ActionRunnable;
import org.elasticsearch.action.StepListener;
import org.elasticsearch.action.admin.cluster.snapshots.clone.CloneSnapshotRequest;
import org.elasticsearch.action.admin.cluster.snapshots.create.CreateSnapshotRequest;
import org.elasticsearch.action.admin.cluster.snapshots.delete.DeleteSnapshotRequest;
import org.elasticsearch.action.support.ActionFilters;
import org.elasticsearch.action.support.ContextPreservingActionListener;
import org.elasticsearch.action.support.GroupedActionListener;
import org.elasticsearch.action.support.master.TransportMasterNodeAction;
import org.elasticsearch.cluster.ClusterChangedEvent;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.ClusterStateApplier;
import org.elasticsearch.cluster.ClusterStateTaskConfig;
import org.elasticsearch.cluster.ClusterStateTaskExecutor;
import org.elasticsearch.cluster.ClusterStateTaskListener;
import org.elasticsearch.cluster.ClusterStateUpdateTask;
import org.elasticsearch.cluster.NotMasterException;
import org.elasticsearch.cluster.RepositoryCleanupInProgress;
import org.elasticsearch.cluster.RestoreInProgress;
import org.elasticsearch.cluster.SnapshotDeletionsInProgress;
import org.elasticsearch.cluster.SnapshotsInProgress;
import org.elasticsearch.cluster.SnapshotsInProgress.ShardSnapshotStatus;
import org.elasticsearch.cluster.SnapshotsInProgress.ShardState;
import org.elasticsearch.cluster.SnapshotsInProgress.State;
import org.elasticsearch.cluster.block.ClusterBlockException;
import org.elasticsearch.cluster.coordination.FailedToCommitClusterStateException;
import org.elasticsearch.cluster.metadata.DataStream;
import org.elasticsearch.cluster.metadata.DataStreamAlias;
import org.elasticsearch.cluster.metadata.IndexMetadata;
import org.elasticsearch.cluster.metadata.IndexNameExpressionResolver;
import org.elasticsearch.cluster.metadata.Metadata;
import org.elasticsearch.cluster.metadata.RepositoriesMetadata;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.cluster.node.DiscoveryNodes;
import org.elasticsearch.cluster.routing.IndexRoutingTable;
import org.elasticsearch.cluster.routing.IndexShardRoutingTable;
import org.elasticsearch.cluster.routing.RoutingTable;
import org.elasticsearch.cluster.routing.ShardRouting;
import org.elasticsearch.cluster.service.ClusterService;
import org.elasticsearch.cluster.service.MasterService;
import org.elasticsearch.common.Priority;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.UUIDs;
import org.elasticsearch.common.collect.ImmutableOpenMap;
import org.elasticsearch.common.component.AbstractLifecycleComponent;
import org.elasticsearch.common.regex.Regex;
import org.elasticsearch.common.settings.Setting;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.ByteSizeValue;
import org.elasticsearch.common.util.CollectionUtils;
import org.elasticsearch.common.util.Maps;
import org.elasticsearch.common.util.concurrent.ListenableFuture;
import org.elasticsearch.core.Nullable;
import org.elasticsearch.core.SuppressForbidden;
import org.elasticsearch.core.Tuple;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.shard.ShardId;
import org.elasticsearch.indices.SystemDataStreamDescriptor;
import org.elasticsearch.indices.SystemIndices;
import org.elasticsearch.repositories.FinalizeSnapshotContext;
import org.elasticsearch.repositories.IndexId;
import org.elasticsearch.repositories.RepositoriesService;
import org.elasticsearch.repositories.Repository;
import org.elasticsearch.repositories.RepositoryData;
import org.elasticsearch.repositories.RepositoryException;
import org.elasticsearch.repositories.RepositoryMissingException;
import org.elasticsearch.repositories.RepositoryShardId;
import org.elasticsearch.repositories.ShardGeneration;
import org.elasticsearch.repositories.ShardGenerations;
import org.elasticsearch.repositories.ShardSnapshotResult;
import org.elasticsearch.tasks.Task;
import org.elasticsearch.threadpool.ThreadPool;
import org.elasticsearch.transport.TransportService;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Deque;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.Executor;
import java.util.function.Consumer;
import java.util.function.Function;
import java.util.function.Predicate;
import java.util.function.Supplier;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import static java.util.Collections.unmodifiableList;
import static org.elasticsearch.cluster.SnapshotsInProgress.completed;
import static org.elasticsearch.common.Strings.arrayToCommaDelimitedString;
import static org.elasticsearch.core.Strings.format;

/**
 * Service responsible for creating snapshots. This service runs all the steps executed on the master node during snapshot creation and
 * deletion.
 * See package level documentation of {@link org.elasticsearch.snapshots} for details.
 */
public class SnapshotsService extends AbstractLifecycleComponent implements ClusterStateApplier {

    // NOTE(review): the version constants below look like repository-format feature gates; the exact feature each one gates is
    // inferred from its name only and should be confirmed against its usages.

    /** Presumably the first version that tracks shard generations in the repository data; set to 7.6.0. */
    public static final Version SHARD_GEN_IN_REPO_DATA_VERSION = Version.V_7_6_0;

    /** Presumably the first version that tracks index metadata generations in the repository data; set to 7.9.0. */
    public static final Version INDEX_GEN_IN_REPO_DATA_VERSION = Version.V_7_9_0;

    /** Presumably the first version that stores UUIDs in the repository data; set to 7.12.0. */
    public static final Version UUIDS_IN_REPO_DATA_VERSION = Version.V_7_12_0;

    /** Presumably the first version that records file-info writer UUIDs in shard-level data; set to 7.16.0. */
    public static final Version FILE_INFO_WRITER_UUIDS_IN_SHARD_DATA_VERSION = Version.V_7_16_0;

    /** Presumably the last version using the old snapshot format; set to 7.5.0. */
    public static final Version OLD_SNAPSHOT_FORMAT = Version.V_7_5_0;

    /** Metadata field name {@code "policy"}; presumably the key under which a policy id is stored in snapshot user metadata. */
    public static final String POLICY_ID_METADATA_FIELD = "policy";

    private static final Logger logger = LogManager.getLogger(SnapshotsService.class);

    /** Name of the internal transport action used to update shard snapshot status. */
    public static final String UPDATE_SNAPSHOT_STATUS_ACTION_NAME = "internal:cluster/snapshot/update_snapshot_status";

    /**
     * Special {@code feature_states} request value. When it is the only requested value (compared ignoring case),
     * {@code createSnapshot} includes no feature states; combined with other values the request is rejected.
     */
    public static final String NO_FEATURE_STATES_VALUE = "none";

    private final ClusterService clusterService;

    private final IndexNameExpressionResolver indexNameExpressionResolver;

    private final RepositoriesService repositoriesService;

    private final ThreadPool threadPool;

    private final Map>> snapshotCompletionListeners = new ConcurrentHashMap<>();

    /**
     * Listeners for snapshot deletion keyed by delete uuid as returned from {@link SnapshotDeletionsInProgress.Entry#uuid()}
     */
    private final Map>> snapshotDeletionListeners = new HashMap<>();

    // Set of repositories currently running either a snapshot finalization or a snapshot delete.
    private final Set currentlyFinalizing = Collections.synchronizedSet(new HashSet<>());

    // Set of snapshots that are currently being ended by this node
    private final Set endingSnapshots = Collections.synchronizedSet(new HashSet<>());

    // Set of currently initializing clone operations
    private final Set initializingClones = Collections.synchronizedSet(new HashSet<>());

    private final UpdateSnapshotStatusAction updateSnapshotStatusHandler;

    private final TransportService transportService;

    private final OngoingRepositoryOperations repositoryOperations = new OngoingRepositoryOperations();

    private final SystemIndices systemIndices;

    /**
     * Setting that specifies the maximum number of allowed concurrent snapshot create and delete operations in the
     * cluster state. The number of concurrent operations in a cluster state is defined as the sum of
     * {@link SnapshotsInProgress#count()} and the size of {@link SnapshotDeletionsInProgress#getEntries()}.
     * <p>
     * Defaults to {@code 1000}, must be at least {@code 1}, and can be updated dynamically.
     */
    public static final Setting<Integer> MAX_CONCURRENT_SNAPSHOT_OPERATIONS_SETTING = Setting.intSetting(
        "snapshot.max_concurrent_operations",
        1000,
        1,
        Setting.Property.NodeScope,
        Setting.Property.Dynamic
    );

    // Current value of MAX_CONCURRENT_SNAPSHOT_OPERATIONS_SETTING. Only initialised and kept up to date on master-eligible nodes
    // (see the constructor); volatile because the settings update consumer writes it.
    private volatile int maxConcurrentOperations;

    /**
     * Creates the service, registers the shard snapshot status update action and, on master-eligible nodes only, subscribes to
     * cluster state updates and to changes of {@link #MAX_CONCURRENT_SNAPSHOT_OPERATIONS_SETTING}.
     *
     * @param settings                    node settings, used to detect master eligibility and to read the initial concurrency limit
     * @param clusterService              cluster service this service registers itself with
     * @param indexNameExpressionResolver resolver for index expressions in snapshot requests
     * @param repositoriesService         service used to look up repositories
     * @param transportService            transport service; also the source of the thread pool
     * @param actionFilters               filters handed to the status update action
     * @param systemIndices               registry of system indices and features
     */
    public SnapshotsService(
        Settings settings,
        ClusterService clusterService,
        IndexNameExpressionResolver indexNameExpressionResolver,
        RepositoriesService repositoriesService,
        TransportService transportService,
        ActionFilters actionFilters,
        SystemIndices systemIndices
    ) {
        this.transportService = transportService;
        this.threadPool = transportService.getThreadPool();
        this.clusterService = clusterService;
        this.repositoriesService = repositoriesService;
        this.indexNameExpressionResolver = indexNameExpressionResolver;
        this.systemIndices = systemIndices;

        // Constructing the action is sufficient: UpdateSnapshotStatusAction registers itself with the TransportService.
        this.updateSnapshotStatusHandler = new UpdateSnapshotStatusAction(
            transportService,
            clusterService,
            threadPool,
            actionFilters,
            indexNameExpressionResolver
        );

        if (DiscoveryNode.isMasterNode(settings) == false) {
            // nothing else to wire up on nodes that are not master-eligible
            return;
        }

        // addLowPriorityApplier to make sure that Repository will be created before snapshot
        clusterService.addLowPriorityApplier(this);
        maxConcurrentOperations = MAX_CONCURRENT_SNAPSHOT_OPERATIONS_SETTING.get(settings);
        clusterService.getClusterSettings()
            .addSettingsUpdateConsumer(MAX_CONCURRENT_SNAPSHOT_OPERATIONS_SETTING, newLimit -> maxConcurrentOperations = newLimit);
    }

    /**
     * Same as {@link #createSnapshot(CreateSnapshotRequest, ActionListener)} but invokes its callback on completion of
     * the snapshot.
     * <p>
     * Once the snapshot has been started, {@code listener} is registered as a completion listener for it; a failure to start the
     * snapshot is forwarded to {@code listener} directly.
     *
     * @param request snapshot request
     * @param listener snapshot completion listener
     */
    public void executeSnapshot(final CreateSnapshotRequest request, final ActionListener<SnapshotInfo> listener) {
        createSnapshot(request, ActionListener.wrap(snapshot -> addListener(snapshot, listener), listener::onFailure));
    }

    /**
     * Initializes the snapshotting process.
     * 

* This method is used by clients to start snapshot. It makes sure that there is no snapshots are currently running and * creates a snapshot record in cluster state metadata. * * @param request snapshot request * @param listener snapshot creation listener */ public void createSnapshot(final CreateSnapshotRequest request, final ActionListener listener) { final String repositoryName = request.repository(); final String snapshotName = IndexNameExpressionResolver.resolveDateMathExpression(request.snapshot()); validate(repositoryName, snapshotName); // TODO: create snapshot UUID in CreateSnapshotRequest and make this operation idempotent to cleanly deal with transport layer // retries final SnapshotId snapshotId = new SnapshotId(snapshotName, UUIDs.randomBase64UUID()); // new UUID for the snapshot Repository repository = repositoriesService.repository(request.repository()); if (repository.isReadOnly()) { listener.onFailure(new RepositoryException(repository.getMetadata().name(), "cannot create snapshot in a readonly repository")); return; } final Snapshot snapshot = new Snapshot(repositoryName, snapshotId); // We should only use the feature states logic if we're sure we'll be able to finish the snapshot without a lower-version // node taking over and causing problems. Therefore, if we're in a mixed cluster with versions that don't know how to handle // feature states, skip all feature states logic, and if `feature_states` is explicitly configured, throw an exception. 
final List requestedStates = Arrays.asList(request.featureStates()); final Set featureStatesSet; if (request.includeGlobalState() || requestedStates.isEmpty() == false) { if (request.includeGlobalState() && requestedStates.isEmpty()) { // If we're including global state and feature states aren't specified, include all of them featureStatesSet = systemIndices.getFeatureNames(); } else if (requestedStates.size() == 1 && NO_FEATURE_STATES_VALUE.equalsIgnoreCase(requestedStates.get(0))) { // If there's exactly one value and it's "none", include no states featureStatesSet = Collections.emptySet(); } else { // Otherwise, check for "none" then use the list of requested states if (requestedStates.contains(NO_FEATURE_STATES_VALUE)) { listener.onFailure( new IllegalArgumentException( "the feature_states value [" + SnapshotsService.NO_FEATURE_STATES_VALUE + "] indicates that no feature states should be snapshotted, " + "but other feature states were requested: " + requestedStates ) ); return; } featureStatesSet = new HashSet<>(requestedStates); featureStatesSet.retainAll(systemIndices.getFeatureNames()); } } else { featureStatesSet = Collections.emptySet(); } final Map userMeta = repository.adaptUserMetadata(request.userMetadata()); repository.executeConsistentStateUpdate(repositoryData -> new ClusterStateUpdateTask(request.masterNodeTimeout()) { private SnapshotsInProgress.Entry newEntry; @Override public ClusterState execute(ClusterState currentState) { ensureRepositoryExists(repositoryName, currentState); ensureSnapshotNameAvailableInRepo(repositoryData, snapshotName, repository); final SnapshotsInProgress snapshots = currentState.custom(SnapshotsInProgress.TYPE, SnapshotsInProgress.EMPTY); ensureSnapshotNameNotRunning(snapshots, repositoryName, snapshotName); validate(repositoryName, snapshotName, currentState); final SnapshotDeletionsInProgress deletionsInProgress = currentState.custom( SnapshotDeletionsInProgress.TYPE, SnapshotDeletionsInProgress.EMPTY ); 
ensureNoCleanupInProgress(currentState, repositoryName, snapshotName, "create snapshot"); ensureBelowConcurrencyLimit(repositoryName, snapshotName, snapshots, deletionsInProgress); // Store newSnapshot here to be processed in clusterStateProcessed Map> requestedIndices = Arrays.stream( indexNameExpressionResolver.concreteIndexNames(currentState, request) ).collect(Collectors.partitioningBy(systemIndices::isSystemIndex)); List requestedSystemIndices = requestedIndices.get(true); if (requestedSystemIndices.isEmpty() == false) { Set explicitlyRequestedSystemIndices = new HashSet<>(requestedSystemIndices); explicitlyRequestedSystemIndices.retainAll(Arrays.asList(request.indices())); if (explicitlyRequestedSystemIndices.isEmpty() == false) { throw new IllegalArgumentException( format( "the [indices] parameter includes system indices %s; to include or exclude system indices from a " + "snapshot, use the [include_global_state] or [feature_states] parameters", explicitlyRequestedSystemIndices ) ); } } List indices = requestedIndices.get(false); final Set featureStates = new HashSet<>(); final Set systemDataStreamNames = new HashSet<>(); // if we have any feature states in the snapshot, we add their required indices to the snapshot indices if they haven't // been requested by the request directly final Set indexNames = new HashSet<>(indices); for (String featureName : featureStatesSet) { SystemIndices.Feature feature = systemIndices.getFeature(featureName); Set featureSystemIndices = feature.getIndexDescriptors() .stream() .flatMap(descriptor -> descriptor.getMatchingIndices(currentState.metadata()).stream()) .collect(Collectors.toSet()); Set featureAssociatedIndices = feature.getAssociatedIndexDescriptors() .stream() .flatMap(descriptor -> descriptor.getMatchingIndices(currentState.metadata()).stream()) .collect(Collectors.toSet()); Set featureSystemDataStreams = new HashSet<>(); Set featureDataStreamBackingIndices = new HashSet<>(); for (SystemDataStreamDescriptor sdd : 
feature.getDataStreamDescriptors()) { List backingIndexNames = sdd.getBackingIndexNames(currentState.metadata()); if (backingIndexNames.size() > 0) { featureDataStreamBackingIndices.addAll(backingIndexNames); featureSystemDataStreams.add(sdd.getDataStreamName()); } } if (featureSystemIndices.size() > 0 || featureAssociatedIndices.size() > 0 || featureDataStreamBackingIndices.size() > 0) { featureStates.add(new SnapshotFeatureInfo(featureName, List.copyOf(featureSystemIndices))); indexNames.addAll(featureSystemIndices); indexNames.addAll(featureAssociatedIndices); indexNames.addAll(featureDataStreamBackingIndices); systemDataStreamNames.addAll(featureSystemDataStreams); } indices = List.copyOf(indexNames); } logger.trace("[{}][{}] creating snapshot for indices [{}]", repositoryName, snapshotName, indices); final Map allIndices = new HashMap<>(); for (SnapshotsInProgress.Entry runningSnapshot : snapshots.forRepo(repositoryName)) { allIndices.putAll(runningSnapshot.indices()); } final Map indexIds = repositoryData.resolveNewIndices(indices, allIndices); final Version version = minCompatibleVersion(currentState.nodes().getMinNodeVersion(), repositoryData, null); ImmutableOpenMap shards = shards( snapshots, deletionsInProgress, currentState, indexIds.values(), useShardGenerations(version), repositoryData, repositoryName ); if (request.partial() == false) { Set missing = new HashSet<>(); for (Map.Entry entry : shards.entrySet()) { if (entry.getValue().state() == ShardState.MISSING) { missing.add(entry.getKey().getIndex().getName()); } } if (missing.isEmpty() == false) { throw new SnapshotException(snapshot, "Indices don't have primary shards " + missing); } } newEntry = SnapshotsInProgress.startedEntry( snapshot, request.includeGlobalState(), request.partial(), indexIds, CollectionUtils.concatLists( indexNameExpressionResolver.dataStreamNames(currentState, request.indicesOptions(), request.indices()), systemDataStreamNames ), threadPool.absoluteTimeInMillis(), 
repositoryData.getGenId(), shards, userMeta, version, List.copyOf(featureStates) ); return ClusterState.builder(currentState).putCustom(SnapshotsInProgress.TYPE, snapshots.withAddedEntry(newEntry)).build(); } @Override public void onFailure(Exception e) { logger.warn(() -> format("[%s][%s] failed to create snapshot", repositoryName, snapshotName), e); listener.onFailure(e); } @Override public void clusterStateProcessed(ClusterState oldState, final ClusterState newState) { try { logger.info("snapshot [{}] started", snapshot); listener.onResponse(snapshot); } finally { if (newEntry.state().completed()) { endSnapshot(newEntry, newState.metadata(), repositoryData); } } } }, "create_snapshot [" + snapshotName + ']', listener::onFailure); } private static void ensureSnapshotNameNotRunning(SnapshotsInProgress runningSnapshots, String repositoryName, String snapshotName) { if (runningSnapshots.forRepo(repositoryName).stream().anyMatch(s -> s.snapshot().getSnapshotId().getName().equals(snapshotName))) { throw new SnapshotNameAlreadyInUseException(repositoryName, snapshotName, "snapshot with the same name is already in-progress"); } } // TODO: It is worth revisiting the design choice of creating a placeholder entry in snapshots-in-progress here once we have a cache // for repository metadata and loading it has predictable performance public void cloneSnapshot(CloneSnapshotRequest request, ActionListener listener) { final String repositoryName = request.repository(); Repository repository = repositoriesService.repository(repositoryName); if (repository.isReadOnly()) { listener.onFailure(new RepositoryException(repositoryName, "cannot create snapshot in a readonly repository")); return; } final String snapshotName = IndexNameExpressionResolver.resolveDateMathExpression(request.target()); validate(repositoryName, snapshotName); // TODO: create snapshot UUID in CloneSnapshotRequest and make this operation idempotent to cleanly deal with transport layer // retries final 
SnapshotId snapshotId = new SnapshotId(snapshotName, UUIDs.randomBase64UUID()); final Snapshot snapshot = new Snapshot(repositoryName, snapshotId); initializingClones.add(snapshot); repository.executeConsistentStateUpdate(repositoryData -> new ClusterStateUpdateTask(request.masterNodeTimeout()) { private SnapshotsInProgress.Entry newEntry; @Override public ClusterState execute(ClusterState currentState) { ensureRepositoryExists(repositoryName, currentState); ensureSnapshotNameAvailableInRepo(repositoryData, snapshotName, repository); ensureNoCleanupInProgress(currentState, repositoryName, snapshotName, "clone snapshot"); final SnapshotsInProgress snapshots = currentState.custom(SnapshotsInProgress.TYPE, SnapshotsInProgress.EMPTY); ensureSnapshotNameNotRunning(snapshots, repositoryName, snapshotName); validate(repositoryName, snapshotName, currentState); final SnapshotId sourceSnapshotId = repositoryData.getSnapshotIds() .stream() .filter(src -> src.getName().equals(request.source())) .findAny() .orElseThrow(() -> new SnapshotMissingException(repositoryName, request.source())); final SnapshotDeletionsInProgress deletionsInProgress = currentState.custom( SnapshotDeletionsInProgress.TYPE, SnapshotDeletionsInProgress.EMPTY ); if (deletionsInProgress.getEntries().stream().anyMatch(entry -> entry.getSnapshots().contains(sourceSnapshotId))) { throw new ConcurrentSnapshotExecutionException( repositoryName, sourceSnapshotId.getName(), "cannot clone from snapshot that is being deleted" ); } ensureBelowConcurrencyLimit(repositoryName, snapshotName, snapshots, deletionsInProgress); final List indicesForSnapshot = new ArrayList<>(); for (IndexId indexId : repositoryData.getIndices().values()) { if (repositoryData.getSnapshots(indexId).contains(sourceSnapshotId)) { indicesForSnapshot.add(indexId.getName()); } } final List matchingIndices = SnapshotUtils.filterIndices( indicesForSnapshot, request.indices(), request.indicesOptions() ); if (matchingIndices.isEmpty()) { throw new 
SnapshotException( new Snapshot(repositoryName, sourceSnapshotId), "No indices in the source snapshot [" + sourceSnapshotId + "] matched requested pattern [" + Strings.arrayToCommaDelimitedString(request.indices()) + "]" ); } newEntry = SnapshotsInProgress.startClone( snapshot, sourceSnapshotId, repositoryData.resolveIndices(matchingIndices), threadPool.absoluteTimeInMillis(), repositoryData.getGenId(), minCompatibleVersion(currentState.nodes().getMinNodeVersion(), repositoryData, null) ); return ClusterState.builder(currentState).putCustom(SnapshotsInProgress.TYPE, snapshots.withAddedEntry(newEntry)).build(); } @Override public void onFailure(Exception e) { initializingClones.remove(snapshot); logger.warn(() -> format("[%s][%s] failed to clone snapshot", repositoryName, snapshotName), e); listener.onFailure(e); } @Override public void clusterStateProcessed(ClusterState oldState, final ClusterState newState) { logger.info("snapshot clone [{}] started", snapshot); addListener(snapshot, ActionListener.wrap(r -> listener.onResponse(null), listener::onFailure)); startCloning(repository, newEntry); } }, "clone_snapshot [" + request.source() + "][" + snapshotName + ']', listener::onFailure); } private static void ensureNoCleanupInProgress( final ClusterState currentState, final String repositoryName, final String snapshotName, final String reason ) { final RepositoryCleanupInProgress repositoryCleanupInProgress = currentState.custom( RepositoryCleanupInProgress.TYPE, RepositoryCleanupInProgress.EMPTY ); if (repositoryCleanupInProgress.hasCleanupInProgress()) { throw new ConcurrentSnapshotExecutionException( repositoryName, snapshotName, "cannot " + reason + " while a repository cleanup is in-progress in " + repositoryCleanupInProgress.entries() .stream() .map(RepositoryCleanupInProgress.Entry::repository) .collect(Collectors.toSet()) ); } } private static void ensureSnapshotNameAvailableInRepo(RepositoryData repositoryData, String snapshotName, Repository repository) { 
// check if the snapshot name already exists in the repository if (repositoryData.getSnapshotIds().stream().anyMatch(s -> s.getName().equals(snapshotName))) { throw new SnapshotNameAlreadyInUseException( repository.getMetadata().name(), snapshotName, "snapshot with the same name already exists" ); } } /** * Determine the number of shards in each index of a clone operation and update the cluster state accordingly. * * @param repository repository to run operation on * @param cloneEntry clone operation in the cluster state */ private void startCloning(Repository repository, SnapshotsInProgress.Entry cloneEntry) { final Collection indices = cloneEntry.indices().values(); final SnapshotId sourceSnapshot = cloneEntry.source(); final Snapshot targetSnapshot = cloneEntry.snapshot(); final Executor executor = threadPool.executor(ThreadPool.Names.SNAPSHOT); // Exception handler for IO exceptions with loading index and repo metadata final Consumer onFailure = e -> { endingSnapshots.add(targetSnapshot); initializingClones.remove(targetSnapshot); logger.info(() -> "Failed to start snapshot clone [" + cloneEntry + "]", e); removeFailedSnapshotFromClusterState(targetSnapshot, e, null); }; // 1. step, load SnapshotInfo to make sure that source snapshot was successful for the indices we want to clone // TODO: we could skip this step for snapshots with state SUCCESS final StepListener snapshotInfoListener = new StepListener<>(); repository.getSnapshotInfo(sourceSnapshot, snapshotInfoListener); final StepListener>> allShardCountsListener = new StepListener<>(); final GroupedActionListener> shardCountListener = new GroupedActionListener<>( allShardCountsListener, indices.size() ); snapshotInfoListener.whenComplete(snapshotInfo -> { for (IndexId indexId : indices) { if (RestoreService.failed(snapshotInfo, indexId.getName())) { throw new SnapshotException( targetSnapshot, "Can't clone index [" + indexId + "] because its snapshot was not successful." ); } } // 2. 
step, load the number of shards we have in each index to be cloned from the index metadata. repository.getRepositoryData(ActionListener.wrap(repositoryData -> { for (IndexId index : indices) { executor.execute(ActionRunnable.supply(shardCountListener, () -> { final IndexMetadata metadata = repository.getSnapshotIndexMetaData(repositoryData, sourceSnapshot, index); return Tuple.tuple(index, metadata.getNumberOfShards()); })); } }, onFailure)); }, onFailure); // 3. step, we have all the shard counts, now update the cluster state to have clone jobs in the snap entry allShardCountsListener.whenComplete(counts -> repository.executeConsistentStateUpdate(repoData -> new ClusterStateUpdateTask() { private SnapshotsInProgress.Entry updatedEntry; @Override public ClusterState execute(ClusterState currentState) { final SnapshotsInProgress snapshotsInProgress = currentState.custom(SnapshotsInProgress.TYPE, SnapshotsInProgress.EMPTY); final String repoName = cloneEntry.repository(); final List existingEntries = snapshotsInProgress.forRepo(repoName); final List updatedEntries = new ArrayList<>(existingEntries.size()); final String localNodeId = currentState.nodes().getLocalNodeId(); final ShardGenerations shardGenerations = repoData.shardGenerations(); for (SnapshotsInProgress.Entry existing : existingEntries) { if (cloneEntry.snapshot().getSnapshotId().equals(existing.snapshot().getSnapshotId())) { final ImmutableOpenMap.Builder clonesBuilder = ImmutableOpenMap.builder(); final boolean readyToExecute = currentState.custom( SnapshotDeletionsInProgress.TYPE, SnapshotDeletionsInProgress.EMPTY ).hasExecutingDeletion(repoName) == false; final InFlightShardSnapshotStates inFlightShardStates; if (readyToExecute) { inFlightShardStates = InFlightShardSnapshotStates.forEntries(snapshotsInProgress.forRepo(repoName)); } else { // no need to compute these, we'll mark all shards as queued anyway because we wait for the delete inFlightShardStates = null; } for (Tuple count : counts) { for 
(int shardId = 0; shardId < count.v2(); shardId++) { final RepositoryShardId repoShardId = new RepositoryShardId(count.v1(), shardId); final String indexName = repoShardId.indexName(); if (readyToExecute == false || inFlightShardStates.isActive(indexName, shardId)) { clonesBuilder.put(repoShardId, ShardSnapshotStatus.UNASSIGNED_QUEUED); } else { clonesBuilder.put( repoShardId, new ShardSnapshotStatus( localNodeId, inFlightShardStates.generationForShard(repoShardId.index(), shardId, shardGenerations) ) ); } } } updatedEntry = cloneEntry.withClones(clonesBuilder.build()); } else { updatedEntries.add(existing); } } if (updatedEntry != null) { // Move the now ready to execute clone operation to the back of the snapshot operations order because its // shard snapshot state was based on all previous existing operations in progress // TODO: If we could eventually drop the snapshot clone init phase we don't need this any longer updatedEntries.add(updatedEntry); return updateWithSnapshots(currentState, snapshotsInProgress.withUpdatedEntriesForRepo(repoName, updatedEntries), null); } return currentState; } @Override public void onFailure(Exception e) { initializingClones.remove(targetSnapshot); logger.info(() -> "Failed to start snapshot clone [" + cloneEntry + "]", e); failAllListenersOnMasterFailOver(e); } @Override public void clusterStateProcessed(ClusterState oldState, ClusterState newState) { initializingClones.remove(targetSnapshot); if (updatedEntry != null) { final Snapshot target = updatedEntry.snapshot(); final SnapshotId sourceSnapshot = updatedEntry.source(); for (Map.Entry indexClone : updatedEntry.shardsByRepoShardId().entrySet()) { final ShardSnapshotStatus shardStatusBefore = indexClone.getValue(); if (shardStatusBefore.state() != ShardState.INIT) { continue; } final RepositoryShardId repoShardId = indexClone.getKey(); runReadyClone(target, sourceSnapshot, shardStatusBefore, repoShardId, repository); } } else { // Extremely unlikely corner case of master 
failing over between between starting the clone and // starting shard clones. logger.warn("Did not find expected entry [{}] in the cluster state", cloneEntry); } } }, "start snapshot clone", onFailure), onFailure); } private final Set currentlyCloning = Collections.synchronizedSet(new HashSet<>()); private void runReadyClone( Snapshot target, SnapshotId sourceSnapshot, ShardSnapshotStatus shardStatusBefore, RepositoryShardId repoShardId, Repository repository ) { final SnapshotId targetSnapshot = target.getSnapshotId(); final String localNodeId = clusterService.localNode().getId(); if (currentlyCloning.add(repoShardId)) { repository.cloneShardSnapshot( sourceSnapshot, targetSnapshot, repoShardId, shardStatusBefore.generation(), ActionListener.wrap( shardSnapshotResult -> innerUpdateSnapshotState( target, null, repoShardId, ShardSnapshotStatus.success(localNodeId, shardSnapshotResult), ActionListener.runBefore( ActionListener.wrap( v -> logger.trace( "Marked [{}] as successfully cloned from [{}] to [{}]", repoShardId, sourceSnapshot, targetSnapshot ), e -> { logger.warn("Cluster state update after successful shard clone [{}] failed", repoShardId); failAllListenersOnMasterFailOver(e); } ), () -> currentlyCloning.remove(repoShardId) ) ), e -> innerUpdateSnapshotState( target, null, repoShardId, new ShardSnapshotStatus( localNodeId, ShardState.FAILED, "failed to clone shard snapshot", shardStatusBefore.generation() ), ActionListener.runBefore( ActionListener.wrap( v -> logger.trace( "Marked [{}] as failed clone from [{}] to [{}]", repoShardId, sourceSnapshot, targetSnapshot ), ex -> { logger.warn("Cluster state update after failed shard clone [{}] failed", repoShardId); failAllListenersOnMasterFailOver(ex); } ), () -> currentlyCloning.remove(repoShardId) ) ) ) ); } } private void ensureBelowConcurrencyLimit( String repository, String name, SnapshotsInProgress snapshotsInProgress, SnapshotDeletionsInProgress deletionsInProgress ) { final int inProgressOperations = 
snapshotsInProgress.count() + deletionsInProgress.getEntries().size(); final int maxOps = maxConcurrentOperations; if (inProgressOperations >= maxOps) { throw new ConcurrentSnapshotExecutionException( repository, name, "Cannot start another operation, already running [" + inProgressOperations + "] operations and the current" + " limit for concurrent snapshot operations is set to [" + maxOps + "]" ); } } /** * Throws {@link RepositoryMissingException} if no repository by the given name is found in the given cluster state. */ public static void ensureRepositoryExists(String repoName, ClusterState state) { if (state.metadata().custom(RepositoriesMetadata.TYPE, RepositoriesMetadata.EMPTY).repository(repoName) == null) { throw new RepositoryMissingException(repoName); } } /** * Validates snapshot request * * @param repositoryName repository name * @param snapshotName snapshot name * @param state current cluster state */ private static void validate(String repositoryName, String snapshotName, ClusterState state) { RepositoriesMetadata repositoriesMetadata = state.getMetadata().custom(RepositoriesMetadata.TYPE, RepositoriesMetadata.EMPTY); if (repositoriesMetadata.repository(repositoryName) == null) { throw new RepositoryMissingException(repositoryName); } validate(repositoryName, snapshotName); } private static void validate(final String repositoryName, final String snapshotName) { if (Strings.hasLength(snapshotName) == false) { throw new InvalidSnapshotNameException(repositoryName, snapshotName, "cannot be empty"); } if (snapshotName.contains(" ")) { throw new InvalidSnapshotNameException(repositoryName, snapshotName, "must not contain whitespace"); } if (snapshotName.contains(",")) { throw new InvalidSnapshotNameException(repositoryName, snapshotName, "must not contain ','"); } if (snapshotName.contains("#")) { throw new InvalidSnapshotNameException(repositoryName, snapshotName, "must not contain '#'"); } if (snapshotName.charAt(0) == '_') { throw new 
InvalidSnapshotNameException(repositoryName, snapshotName, "must not start with '_'"); } if (snapshotName.toLowerCase(Locale.ROOT).equals(snapshotName) == false) { throw new InvalidSnapshotNameException(repositoryName, snapshotName, "must be lowercase"); } if (Strings.validFileName(snapshotName) == false) { throw new InvalidSnapshotNameException( repositoryName, snapshotName, "must not contain the following characters " + Strings.INVALID_FILENAME_CHARS ); } } private static ShardGenerations buildGenerations(SnapshotsInProgress.Entry snapshot, Metadata metadata) { ShardGenerations.Builder builder = ShardGenerations.builder(); if (snapshot.isClone()) { snapshot.shardsByRepoShardId().entrySet().forEach(c -> builder.put(c.getKey().index(), c.getKey().shardId(), c.getValue())); } else { snapshot.shardsByRepoShardId().entrySet().forEach(c -> { final Index index = snapshot.indexByName(c.getKey().indexName()); if (metadata.index(index) == null) { assert snapshot.partial() : "Index [" + index + "] was deleted during a snapshot but snapshot was not partial."; return; } builder.put(c.getKey().index(), c.getKey().shardId(), c.getValue()); }); } return builder.build(); } private static Metadata metadataForSnapshot(SnapshotsInProgress.Entry snapshot, Metadata metadata) { final Metadata.Builder builder; if (snapshot.includeGlobalState() == false) { // Remove global state from the cluster state builder = Metadata.builder(); for (IndexId index : snapshot.indices().values()) { final IndexMetadata indexMetadata = metadata.index(index.getName()); if (indexMetadata == null) { assert snapshot.partial() : "Index [" + index + "] was deleted during a snapshot but snapshot was not partial."; } else { builder.put(indexMetadata, false); } } } else { builder = Metadata.builder(metadata); } // Only keep those data streams in the metadata that were actually requested by the initial snapshot create operation and that have // all their indices contained in the snapshot final Map dataStreams = new 
HashMap<>(); final Set indicesInSnapshot = snapshot.indices().keySet(); for (String dataStreamName : snapshot.dataStreams()) { DataStream dataStream = metadata.dataStreams().get(dataStreamName); if (dataStream == null) { assert snapshot.partial() : "Data stream [" + dataStreamName + "] was deleted during a snapshot but snapshot was not partial."; } else { boolean missingIndex = false; for (Index index : dataStream.getIndices()) { final String indexName = index.getName(); if (builder.get(indexName) == null || indicesInSnapshot.contains(indexName) == false) { missingIndex = true; break; } } final DataStream reconciled = missingIndex ? dataStream.snapshot(indicesInSnapshot) : dataStream; if (reconciled != null) { dataStreams.put(dataStreamName, reconciled); } } } return builder.dataStreams(dataStreams, filterDataStreamAliases(dataStreams, metadata.dataStreamAliases())).build(); } /** * Returns status of the currently running snapshots *

* This method is executed on master node *

* * @param snapshotsInProgress snapshots in progress in the cluster state * @param repository repository id * @param snapshots list of snapshots that will be used as a filter, empty list means no snapshots are filtered * @return list of metadata for currently running snapshots */ public static List currentSnapshots( @Nullable SnapshotsInProgress snapshotsInProgress, String repository, List snapshots ) { if (snapshotsInProgress == null || snapshotsInProgress.isEmpty()) { return Collections.emptyList(); } if ("_all".equals(repository)) { return snapshotsInProgress.asStream().toList(); } if (snapshots.isEmpty()) { return snapshotsInProgress.forRepo(repository); } List builder = new ArrayList<>(); for (SnapshotsInProgress.Entry entry : snapshotsInProgress.forRepo(repository)) { for (String snapshot : snapshots) { if (entry.snapshot().getSnapshotId().getName().equals(snapshot)) { builder.add(entry); break; } } } return unmodifiableList(builder); } @Override public void applyClusterState(ClusterChangedEvent event) { try { if (event.localNodeMaster()) { // We don't remove old master when master flips anymore. So, we need to check for change in master SnapshotsInProgress snapshotsInProgress = event.state().custom(SnapshotsInProgress.TYPE, SnapshotsInProgress.EMPTY); final boolean newMaster = event.previousState().nodes().isLocalNodeElectedMaster() == false; processExternalChanges( newMaster || removedNodesCleanupNeeded(snapshotsInProgress, event.nodesDelta().removedNodes()), event.routingTableChanged() && waitingShardsStartedOrUnassigned(snapshotsInProgress, event) ); } else { if (snapshotCompletionListeners.isEmpty() == false) { // We have snapshot listeners but are not the master any more. Fail all waiting listeners except for those that already // have their snapshots finalizing (those that are already finalizing will fail on their own from to update the cluster // state). 
for (Snapshot snapshot : Set.copyOf(snapshotCompletionListeners.keySet())) { if (endingSnapshots.add(snapshot)) { failSnapshotCompletionListeners(snapshot, new SnapshotException(snapshot, "no longer master")); assert endingSnapshots.contains(snapshot) == false : snapshot; } } } if (snapshotDeletionListeners.isEmpty() == false) { final Exception e = new NotMasterException("no longer master"); for (String delete : Set.copyOf(snapshotDeletionListeners.keySet())) { failListenersIgnoringException(snapshotDeletionListeners.remove(delete), e); } } } } catch (Exception e) { assert false : new AssertionError(e); logger.warn("Failed to update snapshot state ", e); } assert assertConsistentWithClusterState(event.state()); assert assertNoDanglingSnapshots(event.state()); } private boolean assertConsistentWithClusterState(ClusterState state) { final SnapshotsInProgress snapshotsInProgress = state.custom(SnapshotsInProgress.TYPE, SnapshotsInProgress.EMPTY); if (snapshotsInProgress.isEmpty() == false) { synchronized (endingSnapshots) { final Set runningSnapshots = Stream.concat( snapshotsInProgress.asStream().map(SnapshotsInProgress.Entry::snapshot), endingSnapshots.stream() ).collect(Collectors.toSet()); final Set snapshotListenerKeys = snapshotCompletionListeners.keySet(); assert runningSnapshots.containsAll(snapshotListenerKeys) : "Saw completion listeners for unknown snapshots in " + snapshotListenerKeys + " but running snapshots are " + runningSnapshots; } } final SnapshotDeletionsInProgress snapshotDeletionsInProgress = state.custom( SnapshotDeletionsInProgress.TYPE, SnapshotDeletionsInProgress.EMPTY ); if (snapshotDeletionsInProgress.hasDeletionsInProgress()) { synchronized (repositoryOperations.runningDeletions) { final Set runningDeletes = Stream.concat( snapshotDeletionsInProgress.getEntries().stream().map(SnapshotDeletionsInProgress.Entry::uuid), repositoryOperations.runningDeletions.stream() ).collect(Collectors.toSet()); final Set deleteListenerKeys = 
snapshotDeletionListeners.keySet(); assert runningDeletes.containsAll(deleteListenerKeys) : "Saw deletions listeners for unknown uuids in " + deleteListenerKeys + " but running deletes are " + runningDeletes; } } return true; } // Assert that there are no snapshots that have a shard that is waiting to be assigned even though the cluster state would allow for it // to be assigned private static boolean assertNoDanglingSnapshots(ClusterState state) { final SnapshotsInProgress snapshotsInProgress = state.custom(SnapshotsInProgress.TYPE, SnapshotsInProgress.EMPTY); final SnapshotDeletionsInProgress snapshotDeletionsInProgress = state.custom( SnapshotDeletionsInProgress.TYPE, SnapshotDeletionsInProgress.EMPTY ); final Set reposWithRunningDelete = snapshotDeletionsInProgress.getEntries() .stream() .filter(entry -> entry.state() == SnapshotDeletionsInProgress.State.STARTED) .map(SnapshotDeletionsInProgress.Entry::repository) .collect(Collectors.toSet()); for (List repoEntry : snapshotsInProgress.entriesByRepo()) { final SnapshotsInProgress.Entry entry = repoEntry.get(0); for (ShardSnapshotStatus value : entry.shardsByRepoShardId().values()) { if (value.equals(ShardSnapshotStatus.UNASSIGNED_QUEUED)) { assert reposWithRunningDelete.contains(entry.repository()) : "Found shard snapshot waiting to be assigned in [" + entry + "] but it is not blocked by any running delete"; } else if (value.isActive()) { assert reposWithRunningDelete.contains(entry.repository()) == false : "Found shard snapshot actively executing in [" + entry + "] when it should be blocked by a running delete [" + Strings.toString(snapshotDeletionsInProgress) + "]"; } } } return true; } /** * Updates the state of in-progress snapshots in reaction to a change in the configuration of the cluster nodes (master fail-over or * disconnect of a data node that was executing a snapshot) or a routing change that started shards whose snapshot state is * {@link SnapshotsInProgress.ShardState#WAITING}. 
* * @param changedNodes true iff either a master fail-over occurred or a data node that was doing snapshot work got removed from the * cluster * @param startShards true iff any waiting shards were started due to a routing change */ private void processExternalChanges(boolean changedNodes, boolean startShards) { if (changedNodes == false && startShards == false) { // nothing to do, no relevant external change happened return; } final String source = "update snapshot after shards started [" + startShards + "] or node configuration changed [" + changedNodes + "]"; submitUnbatchedTask(source, new ClusterStateUpdateTask() { private final Collection finishedSnapshots = new ArrayList<>(); private final Collection deletionsToExecute = new ArrayList<>(); @Override public ClusterState execute(ClusterState currentState) { RoutingTable routingTable = currentState.routingTable(); final SnapshotsInProgress snapshots = currentState.custom(SnapshotsInProgress.TYPE, SnapshotsInProgress.EMPTY); final SnapshotDeletionsInProgress deletes = currentState.custom( SnapshotDeletionsInProgress.TYPE, SnapshotDeletionsInProgress.EMPTY ); DiscoveryNodes nodes = currentState.nodes(); final EnumSet statesToUpdate; // If we are reacting to a change in the cluster node configuration we have to update the shard states of both started // and // aborted snapshots to potentially fail shards running on the removed nodes if (changedNodes) { statesToUpdate = EnumSet.of(State.STARTED, State.ABORTED); } else { // We are reacting to shards that started only so which only affects the individual shard states of started // snapshots statesToUpdate = EnumSet.of(State.STARTED); } // We keep a cache of shards that failed in this map. 
If we fail a shardId for a given repository because of // a node leaving or shard becoming unassigned for one snapshot, we will also fail it for all subsequent enqueued // snapshots for the same repository // TODO: the code in this state update duplicates large chunks of the logic in #SHARD_STATE_EXECUTOR. // We should refactor it to ideally also go through #SHARD_STATE_EXECUTOR by hand-crafting shard state updates // that encapsulate nodes leaving or indices having been deleted and passing them to the executor instead. SnapshotsInProgress updated = snapshots; for (final List snapshotsInRepo : snapshots.entriesByRepo()) { boolean changed = false; final List updatedEntriesForRepo = new ArrayList<>(); final Map knownFailures = new HashMap<>(); final String repository = snapshotsInRepo.get(0).repository(); for (SnapshotsInProgress.Entry snapshot : snapshotsInRepo) { if (statesToUpdate.contains(snapshot.state())) { if (snapshot.isClone()) { if (snapshot.shardsByRepoShardId().isEmpty()) { // Currently initializing clone if (initializingClones.contains(snapshot.snapshot())) { updatedEntriesForRepo.add(snapshot); } else { logger.debug("removing not yet start clone operation [{}]", snapshot); changed = true; } } else { // see if any clones may have had a shard become available for execution because of failures if (deletes.hasExecutingDeletion(repository)) { // Currently executing a delete for this repo, no need to try and update any clone operations. // The logic for finishing the delete will update running clones with the latest changes. 
updatedEntriesForRepo.add(snapshot); continue; } ImmutableOpenMap.Builder clones = null; InFlightShardSnapshotStates inFlightShardSnapshotStates = null; for (Map.Entry failureEntry : knownFailures.entrySet()) { final RepositoryShardId repositoryShardId = failureEntry.getKey(); final ShardSnapshotStatus existingStatus = snapshot.shardsByRepoShardId().get(repositoryShardId); if (ShardSnapshotStatus.UNASSIGNED_QUEUED.equals(existingStatus)) { if (inFlightShardSnapshotStates == null) { inFlightShardSnapshotStates = InFlightShardSnapshotStates.forEntries(updatedEntriesForRepo); } if (inFlightShardSnapshotStates.isActive( repositoryShardId.indexName(), repositoryShardId.shardId() )) { // we already have this shard assigned to another task continue; } if (clones == null) { clones = ImmutableOpenMap.builder(snapshot.shardsByRepoShardId()); } // We can use the generation from the shard failure to start the clone operation here // because #processWaitingShardsAndRemovedNodes adds generations to failure statuses that // allow us to start another clone. // The usual route via InFlightShardSnapshotStates is not viable here because it would // require a consistent view of the RepositoryData which we don't have here because this // state update runs over all repositories at once. 
clones.put( repositoryShardId, new ShardSnapshotStatus(nodes.getLocalNodeId(), failureEntry.getValue().generation()) ); } } if (clones != null) { changed = true; updatedEntriesForRepo.add(snapshot.withClones(clones.build())); } else { updatedEntriesForRepo.add(snapshot); } } } else { ImmutableOpenMap shards = processWaitingShardsAndRemovedNodes( snapshot, routingTable, nodes, knownFailures ); if (shards != null) { final SnapshotsInProgress.Entry updatedSnapshot = snapshot.withShardStates(shards); changed = true; if (updatedSnapshot.state().completed()) { finishedSnapshots.add(updatedSnapshot); } updatedEntriesForRepo.add(updatedSnapshot); } else { updatedEntriesForRepo.add(snapshot); } } } else if (snapshot.repositoryStateId() == RepositoryData.UNKNOWN_REPO_GEN) { // BwC path, older versions could create entries with unknown repo GEN in INIT or ABORTED state that did not // yet write anything to the repository physically. This means we can simply remove these from the cluster // state without having to do any additional cleanup. changed = true; logger.debug("[{}] was found in dangling INIT or ABORTED state", snapshot); } else { if (snapshot.state().completed() || completed(snapshot.shardsByRepoShardId().values())) { finishedSnapshots.add(snapshot); } updatedEntriesForRepo.add(snapshot); } } if (changed) { updated = updated.withUpdatedEntriesForRepo(repository, updatedEntriesForRepo); } } final ClusterState res = readyDeletions( updated != snapshots ? 
ClusterState.builder(currentState).putCustom(SnapshotsInProgress.TYPE, updated).build() : currentState ).v1(); for (SnapshotDeletionsInProgress.Entry delete : res.custom( SnapshotDeletionsInProgress.TYPE, SnapshotDeletionsInProgress.EMPTY ).getEntries()) { if (delete.state() == SnapshotDeletionsInProgress.State.STARTED) { deletionsToExecute.add(delete); } } return res; } @Override public void onFailure(Exception e) { logger.warn(() -> format("failed to update snapshot state after shards started or nodes removed from [%s] ", source), e); } @Override public void clusterStateProcessed(ClusterState oldState, ClusterState newState) { final SnapshotDeletionsInProgress snapshotDeletionsInProgress = newState.custom( SnapshotDeletionsInProgress.TYPE, SnapshotDeletionsInProgress.EMPTY ); if (finishedSnapshots.isEmpty() == false) { // If we found snapshots that should be finalized as a result of the CS update we try to initiate finalization for // them // unless there is an executing snapshot delete already. 
If there is an executing snapshot delete we don't have to // enqueue the snapshot finalizations here because the ongoing delete will take care of that when removing the // delete // from the cluster state final Set reposWithRunningDeletes = snapshotDeletionsInProgress.getEntries() .stream() .filter(entry -> entry.state() == SnapshotDeletionsInProgress.State.STARTED) .map(SnapshotDeletionsInProgress.Entry::repository) .collect(Collectors.toSet()); for (SnapshotsInProgress.Entry entry : finishedSnapshots) { if (reposWithRunningDeletes.contains(entry.repository()) == false) { endSnapshot(entry, newState.metadata(), null); } } } startExecutableClones(newState.custom(SnapshotsInProgress.TYPE, SnapshotsInProgress.EMPTY), null); // run newly ready deletes for (SnapshotDeletionsInProgress.Entry entry : deletionsToExecute) { if (tryEnterRepoLoop(entry.repository())) { deleteSnapshotsFromRepository(entry, newState.nodes().getMinNodeVersion()); } } } }); } private static ImmutableOpenMap processWaitingShardsAndRemovedNodes( SnapshotsInProgress.Entry entry, RoutingTable routingTable, DiscoveryNodes nodes, Map knownFailures ) { assert entry.isClone() == false : "clones take a different path"; boolean snapshotChanged = false; ImmutableOpenMap.Builder shards = ImmutableOpenMap.builder(); for (Map.Entry shardEntry : entry.shardsByRepoShardId().entrySet()) { ShardSnapshotStatus shardStatus = shardEntry.getValue(); ShardId shardId = entry.shardId(shardEntry.getKey()); if (shardStatus.equals(ShardSnapshotStatus.UNASSIGNED_QUEUED)) { // this shard snapshot is waiting for a previous snapshot to finish execution for this shard final ShardSnapshotStatus knownFailure = knownFailures.get(shardEntry.getKey()); if (knownFailure == null) { final IndexRoutingTable indexShardRoutingTable = routingTable.index(shardId.getIndex()); if (indexShardRoutingTable == null) { // shard became unassigned while queued after a delete or clone operation so we can fail as missing here assert entry.partial(); 
snapshotChanged = true; logger.debug("failing snapshot of shard [{}] because index got deleted", shardId); shards.put(shardId, ShardSnapshotStatus.MISSING); knownFailures.put(shardEntry.getKey(), ShardSnapshotStatus.MISSING); } else { // if no failure is known for the shard we keep waiting shards.put(shardId, shardStatus); } } else { // If a failure is known for an execution we waited on for this shard then we fail with the same exception here // as well snapshotChanged = true; shards.put(shardId, knownFailure); } } else if (shardStatus.state() == ShardState.WAITING) { IndexRoutingTable indexShardRoutingTable = routingTable.index(shardId.getIndex()); if (indexShardRoutingTable != null) { IndexShardRoutingTable shardRouting = indexShardRoutingTable.shard(shardId.id()); if (shardRouting != null && shardRouting.primaryShard() != null) { if (shardRouting.primaryShard().started()) { // Shard that we were waiting for has started on a node, let's process it snapshotChanged = true; logger.trace("starting shard that we were waiting for [{}] on node [{}]", shardId, shardStatus.nodeId()); shards.put( shardId, new ShardSnapshotStatus(shardRouting.primaryShard().currentNodeId(), shardStatus.generation()) ); continue; } else if (shardRouting.primaryShard().initializing() || shardRouting.primaryShard().relocating()) { // Shard that we were waiting for hasn't started yet or still relocating - will continue to wait shards.put(shardId, shardStatus); continue; } } } // Shard that we were waiting for went into unassigned state or disappeared - giving up snapshotChanged = true; logger.warn("failing snapshot of shard [{}] on unassigned shard [{}]", shardId, shardStatus.nodeId()); final ShardSnapshotStatus failedState = new ShardSnapshotStatus( shardStatus.nodeId(), ShardState.FAILED, "shard is unassigned", shardStatus.generation() ); shards.put(shardId, failedState); knownFailures.put(shardEntry.getKey(), failedState); } else if (shardStatus.state().completed() == false && 
shardStatus.nodeId() != null) { if (nodes.nodeExists(shardStatus.nodeId())) { shards.put(shardId, shardStatus); } else { // TODO: Restart snapshot on another node? snapshotChanged = true; logger.warn("failing snapshot of shard [{}] on departed node [{}]", shardId, shardStatus.nodeId()); final ShardSnapshotStatus failedState = new ShardSnapshotStatus( shardStatus.nodeId(), ShardState.FAILED, "node left the cluster during snapshot", shardStatus.generation() ); shards.put(shardId, failedState); knownFailures.put(shardEntry.getKey(), failedState); } } else { shards.put(shardId, shardStatus); } } if (snapshotChanged) { return shards.build(); } else { return null; } } private static boolean waitingShardsStartedOrUnassigned(SnapshotsInProgress snapshotsInProgress, ClusterChangedEvent event) { for (List entries : snapshotsInProgress.entriesByRepo()) { for (SnapshotsInProgress.Entry entry : entries) { if (entry.state() == State.STARTED && entry.isClone() == false) { for (Map.Entry shardStatus : entry.shardsByRepoShardId().entrySet()) { final ShardState state = shardStatus.getValue().state(); if (state != ShardState.WAITING && state != ShardState.QUEUED) { continue; } final RepositoryShardId shardId = shardStatus.getKey(); if (event.indexRoutingTableChanged(shardId.indexName())) { IndexRoutingTable indexShardRoutingTable = event.state() .getRoutingTable() .index(entry.indexByName(shardId.indexName())); if (indexShardRoutingTable == null) { // index got removed concurrently and we have to fail WAITING or QUEUED state shards return true; } ShardRouting shardRouting = indexShardRoutingTable.shard(shardId.shardId()).primaryShard(); if (shardRouting != null && (shardRouting.started() || shardRouting.unassigned())) { return true; } } } } } } return false; } private static boolean removedNodesCleanupNeeded(SnapshotsInProgress snapshotsInProgress, List removedNodes) { if (removedNodes.isEmpty()) { // Nothing to do, no nodes removed return false; } final Set removedNodeIds = 
removedNodes.stream().map(DiscoveryNode::getId).collect(Collectors.toSet()); return snapshotsInProgress.asStream().anyMatch(snapshot -> { if (snapshot.state().completed() || snapshot.isClone()) { // nothing to do for already completed snapshots or clones that run on master anyways return false; } for (ShardSnapshotStatus shardSnapshotStatus : snapshot.shardsByRepoShardId().values()) { if (shardSnapshotStatus.state().completed() == false && removedNodeIds.contains(shardSnapshotStatus.nodeId())) { // Snapshot had an incomplete shard running on a removed node so we need to adjust that shard's snapshot status return true; } } return false; }); } /** * Finalizes the snapshot in the repository. * * @param entry snapshot */ private void endSnapshot(SnapshotsInProgress.Entry entry, Metadata metadata, @Nullable RepositoryData repositoryData) { final Snapshot snapshot = entry.snapshot(); final boolean newFinalization = endingSnapshots.add(snapshot); if (entry.isClone() && entry.state() == State.FAILED) { logger.debug("Removing failed snapshot clone [{}] from cluster state", entry); if (newFinalization) { removeFailedSnapshotFromClusterState(snapshot, new SnapshotException(snapshot, entry.failure()), null); } return; } final String repoName = snapshot.getRepository(); if (tryEnterRepoLoop(repoName)) { if (repositoryData == null) { repositoriesService.repository(repoName).getRepositoryData(new ActionListener<>() { @Override public void onResponse(RepositoryData repositoryData) { finalizeSnapshotEntry(snapshot, metadata, repositoryData); } @Override public void onFailure(Exception e) { submitUnbatchedTask("fail repo tasks for [" + repoName + "]", new FailPendingRepoTasksTask(repoName, e)); } }); } else { finalizeSnapshotEntry(snapshot, metadata, repositoryData); } } else { if (newFinalization) { repositoryOperations.addFinalization(snapshot, metadata); } } } /** * Try starting to run a snapshot finalization or snapshot delete for the given repository. 
If this method returns * {@code true} then snapshot finalizations and deletions for the repo may be executed. Once no more operations are * ready for the repository {@link #leaveRepoLoop(String)} should be invoked so that a subsequent state change that * causes another operation to become ready can execute. * * @return true if a finalization or snapshot delete may be started at this point */ private boolean tryEnterRepoLoop(String repository) { return currentlyFinalizing.add(repository); } /** * Stop polling for ready snapshot finalizations or deletes in state {@link SnapshotDeletionsInProgress.State#STARTED} to execute * for the given repository. */ private void leaveRepoLoop(String repository) { final boolean removed = currentlyFinalizing.remove(repository); assert removed; } private void finalizeSnapshotEntry(Snapshot snapshot, Metadata metadata, RepositoryData repositoryData) { assert currentlyFinalizing.contains(snapshot.getRepository()); try { SnapshotsInProgress.Entry entry = clusterService.state() .custom(SnapshotsInProgress.TYPE, SnapshotsInProgress.EMPTY) .snapshot(snapshot); final String failure = entry.failure(); logger.trace("[{}] finalizing snapshot in repository, state: [{}], failure[{}]", snapshot, entry.state(), failure); final ShardGenerations shardGenerations = buildGenerations(entry, metadata); final List finalIndices = shardGenerations.indices().stream().map(IndexId::getName).toList(); final Set indexNames = new HashSet<>(finalIndices); ArrayList shardFailures = new ArrayList<>(); for (Map.Entry shardStatus : entry.shardsByRepoShardId().entrySet()) { RepositoryShardId shardId = shardStatus.getKey(); if (indexNames.contains(shardId.indexName()) == false) { assert entry.partial() : "only ignoring shard failures for concurrently deleted indices for partial snapshots"; continue; } ShardSnapshotStatus status = shardStatus.getValue(); final ShardState state = status.state(); if (state.failed()) { shardFailures.add(new 
SnapshotShardFailure(status.nodeId(), entry.shardId(shardId), status.reason())); } else if (state.completed() == false) { shardFailures.add(new SnapshotShardFailure(status.nodeId(), entry.shardId(shardId), "skipped")); } else { assert state == ShardState.SUCCESS; } } final String repository = snapshot.getRepository(); final StepListener metadataListener = new StepListener<>(); final Repository repo = repositoriesService.repository(snapshot.getRepository()); if (entry.isClone()) { threadPool.executor(ThreadPool.Names.SNAPSHOT).execute(ActionRunnable.supply(metadataListener, () -> { final Metadata existing = repo.getSnapshotGlobalMetadata(entry.source()); final Metadata.Builder metaBuilder = Metadata.builder(existing); final Set existingIndices = new HashSet<>(); for (IndexId index : entry.indices().values()) { final IndexMetadata indexMetadata = repo.getSnapshotIndexMetaData(repositoryData, entry.source(), index); existingIndices.add(indexMetadata.getIndex()); metaBuilder.put(indexMetadata, false); } // remove those data streams from metadata for which we are missing indices Map dataStreamsToCopy = new HashMap<>(); for (Map.Entry dataStreamEntry : existing.dataStreams().entrySet()) { if (existingIndices.containsAll(dataStreamEntry.getValue().getIndices())) { dataStreamsToCopy.put(dataStreamEntry.getKey(), dataStreamEntry.getValue()); } } Map dataStreamAliasesToCopy = filterDataStreamAliases( dataStreamsToCopy, existing.dataStreamAliases() ); metaBuilder.dataStreams(dataStreamsToCopy, dataStreamAliasesToCopy); return metaBuilder.build(); })); } else { metadataListener.onResponse(metadata); } metadataListener.whenComplete(meta -> { final Metadata metaForSnapshot = metadataForSnapshot(entry, meta); final Map indexSnapshotDetails = Maps.newMapWithExpectedSize( finalIndices.size() ); for (Map.Entry shardEntry : entry.shardsByRepoShardId().entrySet()) { indexSnapshotDetails.compute(shardEntry.getKey().indexName(), (indexName, current) -> { if (current == 
SnapshotInfo.IndexSnapshotDetails.SKIPPED) { // already found an unsuccessful shard in this index, skip this shard return current; } final ShardSnapshotStatus shardSnapshotStatus = shardEntry.getValue(); if (shardSnapshotStatus.state() != ShardState.SUCCESS) { // first unsuccessful shard in this index found, record that this index should be skipped return SnapshotInfo.IndexSnapshotDetails.SKIPPED; } final ShardSnapshotResult result = shardSnapshotStatus.shardSnapshotResult(); if (result == null) { // detailed result not recorded, skip this index return SnapshotInfo.IndexSnapshotDetails.SKIPPED; } if (current == null) { return new SnapshotInfo.IndexSnapshotDetails(1, result.getSize(), result.getSegmentCount()); } else { return new SnapshotInfo.IndexSnapshotDetails( current.getShardCount() + 1, ByteSizeValue.ofBytes(current.getSize().getBytes() + result.getSize().getBytes()), Math.max(current.getMaxSegmentsPerShard(), result.getSegmentCount()) ); } }); } indexSnapshotDetails.entrySet().removeIf(e -> e.getValue().getShardCount() == 0); final SnapshotInfo snapshotInfo = new SnapshotInfo( snapshot, finalIndices, entry.dataStreams().stream().filter(metaForSnapshot.dataStreams()::containsKey).toList(), entry.partial() ? onlySuccessfulFeatureStates(entry, finalIndices) : entry.featureStates(), failure, threadPool.absoluteTimeInMillis(), entry.partial() ? 
shardGenerations.totalShards() : entry.shardsByRepoShardId().size(), shardFailures, entry.includeGlobalState(), entry.userMetadata(), entry.startTime(), indexSnapshotDetails ); final ListenableFuture>> snapshotListeners = new ListenableFuture<>(); repo.finalizeSnapshot( new FinalizeSnapshotContext( shardGenerations, repositoryData.getGenId(), metaForSnapshot, snapshotInfo, entry.version(), ActionListener.wrap(updatedRepositoryData -> { // get a hold of the listeners for this snapshot here and store them in the future so they can be used // by the snapshot info callback below and won't be failed needlessly if #runNextQueuedOperation runs into // a fatal like e.g. this node stopped being the master node snapshotListeners.onResponse(endAndGetListenersToResolve(snapshot)); runNextQueuedOperation(updatedRepositoryData, repository, true); }, e -> handleFinalizationFailure(e, snapshot, repositoryData)), snInfo -> snapshotListeners.addListener(new ActionListener<>() { @Override public void onResponse(List> actionListeners) { completeListenersIgnoringException(actionListeners, snInfo); logger.info("snapshot [{}] completed with state [{}]", snapshot, snInfo.state()); } @Override public void onFailure(Exception e) { // never fails assert false : e; } }) ) ); }, e -> handleFinalizationFailure(e, snapshot, repositoryData)); } catch (Exception e) { assert false : new AssertionError(e); handleFinalizationFailure(e, snapshot, repositoryData); } } /** * Removes all feature states which have missing or failed shards, as they are no longer safely restorable. * @param entry The "in progress" entry with a list of feature states and one or more failed shards. * @param finalIndices The final list of indices in the snapshot, after any indices that were concurrently deleted are removed. * @return The list of feature states which were completed successfully in the given entry. 
*/ private static List onlySuccessfulFeatureStates(SnapshotsInProgress.Entry entry, List finalIndices) { assert entry.partial() : "should not try to filter feature states from a non-partial entry"; // Figure out which indices have unsuccessful shards Set indicesWithUnsuccessfulShards = new HashSet<>(); entry.shardsByRepoShardId().entrySet().forEach(shard -> { final ShardState shardState = shard.getValue().state(); if (shardState.failed() || shardState.completed() == false) { indicesWithUnsuccessfulShards.add(shard.getKey().indexName()); } }); // Now remove any feature states which contain any of those indices, as the feature state is not intact and not safely restorable return entry.featureStates() .stream() .filter(stateInfo -> finalIndices.containsAll(stateInfo.getIndices())) .filter(stateInfo -> stateInfo.getIndices().stream().anyMatch(indicesWithUnsuccessfulShards::contains) == false) .toList(); } /** * Remove a snapshot from {@link #endingSnapshots} set and return its completion listeners that must be resolved. */ private List> endAndGetListenersToResolve(Snapshot snapshot) { // get listeners before removing from the ending snapshots set to not trip assertion in #assertConsistentWithClusterState that // makes sure we don't have listeners for snapshots that aren't tracked in any internal state of this class final List> listenersToComplete = snapshotCompletionListeners.remove(snapshot); endingSnapshots.remove(snapshot); return listenersToComplete; } /** * Handles failure to finalize a snapshot. If the exception indicates that this node was unable to publish a cluster state and stopped * being the master node, then fail all snapshot create and delete listeners executing on this node by delegating to * {@link #failAllListenersOnMasterFailOver}. Otherwise, i.e. 
as a result of failing to write to the snapshot repository for some * reason, remove the snapshot's {@link SnapshotsInProgress.Entry} from the cluster state and move on with other queued snapshot * operations if there are any. * * @param e exception encountered * @param snapshot snapshot that failed to finalize * @param repositoryData current repository data for the snapshot's repository */ private void handleFinalizationFailure(Exception e, Snapshot snapshot, RepositoryData repositoryData) { if (ExceptionsHelper.unwrap(e, NotMasterException.class, FailedToCommitClusterStateException.class) != null) { // Failure due to not being master any more, don't try to remove snapshot from cluster state the next master // will try ending this snapshot again logger.debug(() -> "[" + snapshot + "] failed to update cluster state during snapshot finalization", e); failSnapshotCompletionListeners( snapshot, new SnapshotException(snapshot, "Failed to update cluster state during snapshot finalization", e) ); failAllListenersOnMasterFailOver(e); } else { logger.warn(() -> "[" + snapshot + "] failed to finalize snapshot", e); removeFailedSnapshotFromClusterState(snapshot, e, repositoryData); } } /** * Run the next queued up repository operation for the given repository name. 
* * @param repositoryData current repository data * @param repository repository name * @param attemptDelete whether to try and run delete operations that are ready in the cluster state if no * snapshot create operations remain to execute */ private void runNextQueuedOperation(RepositoryData repositoryData, String repository, boolean attemptDelete) { assert currentlyFinalizing.contains(repository); final Tuple nextFinalization = repositoryOperations.pollFinalization(repository); if (nextFinalization == null) { if (attemptDelete) { runReadyDeletions(repositoryData, repository); } else { leaveRepoLoop(repository); } } else { logger.trace("Moving on to finalizing next snapshot [{}]", nextFinalization); finalizeSnapshotEntry(nextFinalization.v1(), nextFinalization.v2(), repositoryData); } } /** * Runs a cluster state update that checks whether we have outstanding snapshot deletions that can be executed and executes them. * * TODO: optimize this to execute in a single CS update together with finalizing the latest snapshot */ private void runReadyDeletions(RepositoryData repositoryData, String repository) { submitUnbatchedTask("Run ready deletions", new ClusterStateUpdateTask() { private SnapshotDeletionsInProgress.Entry deletionToRun; @Override public ClusterState execute(ClusterState currentState) { assert readyDeletions(currentState).v1() == currentState : "Deletes should have been set to ready by finished snapshot deletes and finalizations"; for (SnapshotDeletionsInProgress.Entry entry : currentState.custom( SnapshotDeletionsInProgress.TYPE, SnapshotDeletionsInProgress.EMPTY ).getEntries()) { if (entry.repository().equals(repository) && entry.state() == SnapshotDeletionsInProgress.State.STARTED) { deletionToRun = entry; break; } } return currentState; } @Override public void onFailure(Exception e) { logger.warn("Failed to run ready delete operations", e); failAllListenersOnMasterFailOver(e); } @Override public void clusterStateProcessed(ClusterState oldState, 
ClusterState newState) { if (deletionToRun == null) { runNextQueuedOperation(repositoryData, repository, false); } else { deleteSnapshotsFromRepository(deletionToRun, repositoryData, newState.nodes().getMinNodeVersion()); } } }); } /** * Finds snapshot delete operations that are ready to execute in the given {@link ClusterState} and computes a new cluster state that * has all executable deletes marked as executing. Returns a {@link Tuple} of the updated cluster state and all executable deletes. * This can either be {@link SnapshotDeletionsInProgress.Entry} that were already in state * {@link SnapshotDeletionsInProgress.State#STARTED} or waiting entries in state {@link SnapshotDeletionsInProgress.State#WAITING} * that were moved to {@link SnapshotDeletionsInProgress.State#STARTED} in the returned updated cluster state. * * @param currentState current cluster state * @return tuple of an updated cluster state and currently executable snapshot delete operations */ private static Tuple> readyDeletions(ClusterState currentState) { final SnapshotDeletionsInProgress deletions = currentState.custom( SnapshotDeletionsInProgress.TYPE, SnapshotDeletionsInProgress.EMPTY ); if (deletions.hasDeletionsInProgress() == false) { return Tuple.tuple(currentState, List.of()); } final SnapshotsInProgress snapshotsInProgress = currentState.custom(SnapshotsInProgress.TYPE); assert snapshotsInProgress != null; final Set repositoriesSeen = new HashSet<>(); boolean changed = false; final ArrayList readyDeletions = new ArrayList<>(); final List newDeletes = new ArrayList<>(); for (SnapshotDeletionsInProgress.Entry entry : deletions.getEntries()) { final String repo = entry.repository(); if (repositoriesSeen.add(entry.repository()) && entry.state() == SnapshotDeletionsInProgress.State.WAITING && snapshotsInProgress.forRepo(repo).stream().noneMatch(SnapshotsService::isWritingToRepository)) { changed = true; final SnapshotDeletionsInProgress.Entry newEntry = entry.started(); 
readyDeletions.add(newEntry); newDeletes.add(newEntry); } else { newDeletes.add(entry); } } return Tuple.tuple( changed ? ClusterState.builder(currentState) .putCustom(SnapshotDeletionsInProgress.TYPE, SnapshotDeletionsInProgress.of(newDeletes)) .build() : currentState, readyDeletions ); } /** * Computes the cluster state resulting from removing a given snapshot create operation from the given state. This method will update * the shard generations of snapshots that the given snapshot depended on so that finalizing them will not cause rolling back to an * outdated shard generation. * * @param state current cluster state * @param snapshot snapshot for which to remove the snapshot operation * @return updated cluster state */ public static ClusterState stateWithoutSnapshot(ClusterState state, Snapshot snapshot) { final SnapshotsInProgress snapshots = state.custom(SnapshotsInProgress.TYPE, SnapshotsInProgress.EMPTY); ClusterState result = state; int indexOfEntry = -1; final List entryList = snapshots.forRepo(snapshot.getRepository()); for (int i = 0; i < entryList.size(); i++) { SnapshotsInProgress.Entry entry = entryList.get(i); if (entry.snapshot().equals(snapshot)) { indexOfEntry = i; break; } } if (indexOfEntry >= 0) { final List entries = new ArrayList<>(entryList.size() - 1); final SnapshotsInProgress.Entry removedEntry = entryList.get(indexOfEntry); for (int i = 0; i < indexOfEntry; i++) { final SnapshotsInProgress.Entry previousEntry = entryList.get(i); if (removedEntry.isClone()) { if (previousEntry.isClone()) { ImmutableOpenMap.Builder updatedShardAssignments = null; for (Map.Entry finishedShardEntry : removedEntry.shardsByRepoShardId() .entrySet()) { final ShardSnapshotStatus shardState = finishedShardEntry.getValue(); if (shardState.state() == ShardState.SUCCESS) { updatedShardAssignments = maybeAddUpdatedAssignment( updatedShardAssignments, shardState, finishedShardEntry.getKey(), previousEntry.shardsByRepoShardId() ); } } addCloneEntry(entries, 
previousEntry, updatedShardAssignments); } else { ImmutableOpenMap.Builder updatedShardAssignments = null; for (Map.Entry finishedShardEntry : removedEntry.shardsByRepoShardId() .entrySet()) { final ShardSnapshotStatus shardState = finishedShardEntry.getValue(); final RepositoryShardId repositoryShardId = finishedShardEntry.getKey(); if (shardState.state() != ShardState.SUCCESS || previousEntry.shardsByRepoShardId().containsKey(repositoryShardId) == false) { continue; } updatedShardAssignments = maybeAddUpdatedAssignment( updatedShardAssignments, shardState, previousEntry.shardId(repositoryShardId), previousEntry.shards() ); } addSnapshotEntry(entries, previousEntry, updatedShardAssignments); } } else { if (previousEntry.isClone()) { ImmutableOpenMap.Builder updatedShardAssignments = null; for (Map.Entry finishedShardEntry : removedEntry.shardsByRepoShardId() .entrySet()) { final ShardSnapshotStatus shardState = finishedShardEntry.getValue(); final RepositoryShardId repositoryShardId = finishedShardEntry.getKey(); if (shardState.state() != ShardState.SUCCESS || previousEntry.shardsByRepoShardId().containsKey(repositoryShardId) == false) { continue; } updatedShardAssignments = maybeAddUpdatedAssignment( updatedShardAssignments, shardState, repositoryShardId, previousEntry.shardsByRepoShardId() ); } addCloneEntry(entries, previousEntry, updatedShardAssignments); } else { ImmutableOpenMap.Builder updatedShardAssignments = null; for (Map.Entry finishedShardEntry : removedEntry.shardsByRepoShardId() .entrySet()) { final ShardSnapshotStatus shardState = finishedShardEntry.getValue(); if (shardState.state() == ShardState.SUCCESS && previousEntry.shardsByRepoShardId().containsKey(finishedShardEntry.getKey())) { updatedShardAssignments = maybeAddUpdatedAssignment( updatedShardAssignments, shardState, previousEntry.shardId(finishedShardEntry.getKey()), previousEntry.shards() ); } } addSnapshotEntry(entries, previousEntry, updatedShardAssignments); } } } for (int i = 
indexOfEntry + 1; i < entryList.size(); i++) { entries.add(entryList.get(i)); } result = ClusterState.builder(state) .putCustom(SnapshotsInProgress.TYPE, snapshots.withUpdatedEntriesForRepo(snapshot.getRepository(), entries)) .build(); } return readyDeletions(result).v1(); } private static void addSnapshotEntry( List entries, SnapshotsInProgress.Entry entryToUpdate, @Nullable ImmutableOpenMap.Builder updatedShardAssignments ) { if (updatedShardAssignments == null) { entries.add(entryToUpdate); } else { final ImmutableOpenMap.Builder updatedStatus = ImmutableOpenMap.builder(entryToUpdate.shards()); updatedStatus.putAllFromMap(updatedShardAssignments.build()); entries.add(entryToUpdate.withShardStates(updatedStatus.build())); } } private static void addCloneEntry( List entries, SnapshotsInProgress.Entry entryToUpdate, @Nullable ImmutableOpenMap.Builder updatedShardAssignments ) { if (updatedShardAssignments == null) { entries.add(entryToUpdate); } else { final ImmutableOpenMap.Builder updatedStatus = ImmutableOpenMap.builder( entryToUpdate.shardsByRepoShardId() ); updatedStatus.putAllFromMap(updatedShardAssignments.build()); entries.add(entryToUpdate.withClones(updatedStatus.build())); } } @Nullable private static ImmutableOpenMap.Builder maybeAddUpdatedAssignment( @Nullable ImmutableOpenMap.Builder updatedShardAssignments, ShardSnapshotStatus finishedShardState, T shardId, Map statesToUpdate ) { final ShardGeneration newGeneration = finishedShardState.generation(); final ShardSnapshotStatus stateToUpdate = statesToUpdate.get(shardId); if (stateToUpdate != null && stateToUpdate.state() == ShardState.SUCCESS && Objects.equals(newGeneration, stateToUpdate.generation()) == false) { if (updatedShardAssignments == null) { updatedShardAssignments = ImmutableOpenMap.builder(); } updatedShardAssignments.put(shardId, stateToUpdate.withUpdatedGeneration(newGeneration)); } return updatedShardAssignments; } /** * Removes record of running snapshot from cluster state and notifies 
the listener when this action is complete. This method is only * used when the snapshot fails for some reason. During normal operation the snapshot repository will remove the * {@link SnapshotsInProgress.Entry} from the cluster state once it's done finalizing the snapshot. * * @param snapshot snapshot that failed * @param failure exception that failed the snapshot * @param repositoryData repository data if the next finalization operation on the repository should be attempted or {@code null} if * no further actions should be executed */ private void removeFailedSnapshotFromClusterState(Snapshot snapshot, Exception failure, @Nullable RepositoryData repositoryData) { assert failure != null : "Failure must be supplied"; submitUnbatchedTask(REMOVE_SNAPSHOT_METADATA_TASK_SOURCE, new ClusterStateUpdateTask() { @Override public ClusterState execute(ClusterState currentState) { final ClusterState updatedState = stateWithoutSnapshot(currentState, snapshot); assert updatedState == currentState || endingSnapshots.contains(snapshot) : "did not track [" + snapshot + "] in ending snapshots while removing it from the cluster state"; // now check if there are any delete operations that refer to the just failed snapshot and remove the snapshot from them return updateWithSnapshots( updatedState, null, deletionsWithoutSnapshots( updatedState.custom(SnapshotDeletionsInProgress.TYPE, SnapshotDeletionsInProgress.EMPTY), Collections.singletonList(snapshot.getSnapshotId()), snapshot.getRepository() ) ); } @Override public void onFailure(Exception e) { if (e instanceof NotMasterException) { failure.addSuppressed(new SnapshotException(snapshot, "no longer master")); } logger.log( MasterService.isPublishFailureException(e) ? 
Level.DEBUG : Level.WARN, () -> "[" + snapshot + "] failed to remove snapshot metadata", e ); failSnapshotCompletionListeners( snapshot, new SnapshotException(snapshot, "Failed to remove snapshot from cluster state", e) ); failAllListenersOnMasterFailOver(e); } @Override public void clusterStateProcessed(ClusterState oldState, ClusterState newState) { failSnapshotCompletionListeners(snapshot, failure); if (repositoryData != null) { runNextQueuedOperation(repositoryData, snapshot.getRepository(), true); } } }); } private static final String REMOVE_SNAPSHOT_METADATA_TASK_SOURCE = "remove snapshot metadata"; /** * Remove the given {@link SnapshotId}s for the given {@code repository} from an instance of {@link SnapshotDeletionsInProgress}. * If no deletion contained any of the snapshot ids to remove then return {@code null}. * * @param deletions snapshot deletions to update * @param snapshotIds snapshot ids to remove * @param repository repository that the snapshot ids belong to * @return updated {@link SnapshotDeletionsInProgress} or {@code null} if unchanged */ @Nullable private static SnapshotDeletionsInProgress deletionsWithoutSnapshots( SnapshotDeletionsInProgress deletions, Collection snapshotIds, String repository ) { boolean changed = false; List updatedEntries = new ArrayList<>(deletions.getEntries().size()); for (SnapshotDeletionsInProgress.Entry entry : deletions.getEntries()) { if (entry.repository().equals(repository)) { final List updatedSnapshotIds = new ArrayList<>(entry.getSnapshots()); if (updatedSnapshotIds.removeAll(snapshotIds)) { changed = true; updatedEntries.add(entry.withSnapshots(updatedSnapshotIds)); } else { updatedEntries.add(entry); } } else { updatedEntries.add(entry); } } return changed ? 
SnapshotDeletionsInProgress.of(updatedEntries) : null; } private void failSnapshotCompletionListeners(Snapshot snapshot, Exception e) { failListenersIgnoringException(endAndGetListenersToResolve(snapshot), e); assert repositoryOperations.assertNotQueued(snapshot); } /** * Deletes snapshots from the repository. In-progress snapshots matched by the delete will be aborted before deleting them. * * @param request delete snapshot request * @param listener listener */ public void deleteSnapshots(final DeleteSnapshotRequest request, final ActionListener listener) { final String repositoryName = request.repository(); final String[] snapshotNames = request.snapshots(); logger.info( () -> format("deleting snapshots [%s] from repository [%s]", arrayToCommaDelimitedString(snapshotNames), repositoryName) ); final Repository repository = repositoriesService.repository(repositoryName); repository.executeConsistentStateUpdate(repositoryData -> new ClusterStateUpdateTask(request.masterNodeTimeout()) { private SnapshotDeletionsInProgress.Entry newDelete = null; private boolean reusedExistingDelete = false; // Snapshots that had all of their shard snapshots in queued state and thus were removed from the // cluster state right away private final Collection completedNoCleanup = new ArrayList<>(); // Snapshots that were aborted and that already wrote data to the repository and now have to be deleted // from the repository after the cluster state update private final Collection completedWithCleanup = new ArrayList<>(); @Override public ClusterState execute(ClusterState currentState) { ensureRepositoryExists(repositoryName, currentState); final Set snapshotIds = new HashSet<>(); // find in-progress snapshots to delete in cluster state final SnapshotsInProgress snapshotsInProgress = currentState.custom(SnapshotsInProgress.TYPE, SnapshotsInProgress.EMPTY); for (SnapshotsInProgress.Entry entry : snapshotsInProgress.forRepo(repositoryName)) { final SnapshotId snapshotId = 
entry.snapshot().getSnapshotId(); if (Regex.simpleMatch(snapshotNames, snapshotId.getName())) { snapshotIds.add(snapshotId); } } // find snapshots to delete in repository data final Map snapshotsIdsInRepository = repositoryData.getSnapshotIds() .stream() .collect(Collectors.toMap(SnapshotId::getName, Function.identity())); for (String snapshotOrPattern : snapshotNames) { if (Regex.isSimpleMatchPattern(snapshotOrPattern)) { for (Map.Entry entry : snapshotsIdsInRepository.entrySet()) { if (Regex.simpleMatch(snapshotOrPattern, entry.getKey())) { snapshotIds.add(entry.getValue()); } } } else { final SnapshotId foundId = snapshotsIdsInRepository.get(snapshotOrPattern); if (foundId == null) { if (snapshotIds.stream().noneMatch(snapshotId -> snapshotId.getName().equals(snapshotOrPattern))) { throw new SnapshotMissingException(repositoryName, snapshotOrPattern); } } else { snapshotIds.add(foundId); } } } if (snapshotIds.isEmpty()) { return currentState; } final Set activeCloneSources = snapshotsInProgress.asStream() .filter(SnapshotsInProgress.Entry::isClone) .map(SnapshotsInProgress.Entry::source) .collect(Collectors.toSet()); for (SnapshotId snapshotId : snapshotIds) { if (activeCloneSources.contains(snapshotId)) { throw new ConcurrentSnapshotExecutionException( new Snapshot(repositoryName, snapshotId), "cannot delete snapshot while it is being cloned" ); } } ensureNoCleanupInProgress( currentState, repositoryName, snapshotIds.stream().findFirst().get().getName(), "delete snapshot" ); final SnapshotDeletionsInProgress deletionsInProgress = currentState.custom( SnapshotDeletionsInProgress.TYPE, SnapshotDeletionsInProgress.EMPTY ); final RestoreInProgress restoreInProgress = currentState.custom(RestoreInProgress.TYPE, RestoreInProgress.EMPTY); // don't allow snapshot deletions while a restore is taking place, // otherwise we could end up deleting a snapshot that is being restored // and the files the restore depends on would all be gone for (RestoreInProgress.Entry entry : 
restoreInProgress) { if (repositoryName.equals(entry.snapshot().getRepository()) && snapshotIds.contains(entry.snapshot().getSnapshotId())) { throw new ConcurrentSnapshotExecutionException( new Snapshot(repositoryName, snapshotIds.stream().findFirst().get()), "cannot delete snapshot during a restore in progress in [" + restoreInProgress + "]" ); } } // Snapshot ids that will have to be physically deleted from the repository final Set snapshotIdsRequiringCleanup = new HashSet<>(snapshotIds); final SnapshotsInProgress updatedSnapshots = snapshotsInProgress.withUpdatedEntriesForRepo( repositoryName, snapshotsInProgress.forRepo(repositoryName).stream().map(existing -> { if (existing.state() == State.STARTED && snapshotIdsRequiringCleanup.contains(existing.snapshot().getSnapshotId())) { // snapshot is started - mark every non completed shard as aborted final SnapshotsInProgress.Entry abortedEntry = existing.abort(); if (abortedEntry == null) { // No work has been done for this snapshot yet so we remove it from the cluster state directly final Snapshot existingNotYetStartedSnapshot = existing.snapshot(); // Adding the snapshot to #endingSnapshots since we still have to resolve its listeners to not trip // any leaked listener assertions if (endingSnapshots.add(existingNotYetStartedSnapshot)) { completedNoCleanup.add(existingNotYetStartedSnapshot); } snapshotIdsRequiringCleanup.remove(existingNotYetStartedSnapshot.getSnapshotId()); } else if (abortedEntry.state().completed()) { completedWithCleanup.add(abortedEntry); } return abortedEntry; } return existing; }).filter(Objects::nonNull).toList() ); if (snapshotIdsRequiringCleanup.isEmpty()) { // We only saw snapshots that could be removed from the cluster state right away, no need to update the deletions return updateWithSnapshots(currentState, updatedSnapshots, null); } // add the snapshot deletion to the cluster state final SnapshotDeletionsInProgress.Entry replacedEntry = deletionsInProgress.getEntries() .stream() 
.filter(entry -> entry.repository().equals(repositoryName)) .filter(entry -> entry.state() == SnapshotDeletionsInProgress.State.WAITING) .findFirst() .orElse(null); if (replacedEntry == null) { final Optional foundDuplicate = deletionsInProgress.getEntries() .stream() .filter( entry -> entry.repository().equals(repositoryName) && entry.state() == SnapshotDeletionsInProgress.State.STARTED && entry.getSnapshots().containsAll(snapshotIds) ) .findFirst(); if (foundDuplicate.isPresent()) { newDelete = foundDuplicate.get(); reusedExistingDelete = true; return currentState; } newDelete = new SnapshotDeletionsInProgress.Entry( List.copyOf(snapshotIdsRequiringCleanup), repositoryName, threadPool.absoluteTimeInMillis(), repositoryData.getGenId(), updatedSnapshots.forRepo(repositoryName).stream().noneMatch(SnapshotsService::isWritingToRepository) && deletionsInProgress.hasExecutingDeletion(repositoryName) == false ? SnapshotDeletionsInProgress.State.STARTED : SnapshotDeletionsInProgress.State.WAITING ); } else { newDelete = replacedEntry.withAddedSnapshots(snapshotIdsRequiringCleanup); } return updateWithSnapshots( currentState, updatedSnapshots, (replacedEntry == null ? 
deletionsInProgress : deletionsInProgress.withRemovedEntry(replacedEntry.uuid())) .withAddedEntry(newDelete) ); } @Override public void onFailure(Exception e) { endingSnapshots.removeAll(completedNoCleanup); listener.onFailure(e); } @Override public void clusterStateProcessed(ClusterState oldState, ClusterState newState) { if (completedNoCleanup.isEmpty() == false) { logger.info("snapshots {} aborted", completedNoCleanup); } for (Snapshot snapshot : completedNoCleanup) { failSnapshotCompletionListeners(snapshot, new SnapshotException(snapshot, SnapshotsInProgress.ABORTED_FAILURE_TEXT)); } if (newDelete == null) { listener.onResponse(null); } else { addDeleteListener(newDelete.uuid(), listener); if (reusedExistingDelete) { return; } if (newDelete.state() == SnapshotDeletionsInProgress.State.STARTED) { if (tryEnterRepoLoop(repositoryName)) { deleteSnapshotsFromRepository(newDelete, repositoryData, newState.nodes().getMinNodeVersion()); } else { logger.trace("Delete [{}] could not execute directly and was queued", newDelete); } } else { for (SnapshotsInProgress.Entry completedSnapshot : completedWithCleanup) { endSnapshot(completedSnapshot, newState.metadata(), repositoryData); } } } } }, "delete snapshot [" + repository + "]" + Arrays.toString(snapshotNames), listener::onFailure); } /** * Checks if the given {@link SnapshotsInProgress.Entry} is currently writing to the repository. 
*
 * @param entry snapshot entry
 * @return true if entry is currently writing to the repository
 */
private static boolean isWritingToRepository(SnapshotsInProgress.Entry entry) {
    if (entry.state().completed()) {
        // Entry is writing to the repo because it's finalizing on master
        return true;
    }
    for (ShardSnapshotStatus value : entry.shardsByRepoShardId().values()) {
        if (value.isActive()) {
            // Entry is writing to the repo because it's writing to a shard on a data node or waiting to do so for a concrete shard
            return true;
        }
    }
    return false;
}

/**
 * Registers a listener for the delete operation identified by the given UUID. The listener is wrapped with
 * {@link ContextPreservingActionListener#wrapPreservingContext} using the thread pool's thread context before it is stored.
 *
 * @param deleteUUID uuid of the delete entry to listen for
 * @param listener   listener to register
 */
private void addDeleteListener(String deleteUUID, ActionListener<Void> listener) {
    snapshotDeletionListeners.computeIfAbsent(deleteUUID, k -> new CopyOnWriteArrayList<>())
        .add(ContextPreservingActionListener.wrapPreservingContext(listener, threadPool.getThreadContext()));
}

/**
 * Determines the minimum {@link Version} that the snapshot repository must be compatible with from the current nodes in the cluster
 * and the contents of the repository. The minimum version is determined as the lowest version found across all snapshots in the
 * repository and all nodes in the cluster.
 *
 * @param minNodeVersion minimum node version in the cluster
 * @param repositoryData current {@link RepositoryData} of that repository
 * @param excluded       snapshot id to ignore when computing the minimum version
 *                       (used to use newer metadata version after a snapshot delete)
 * @return minimum node version that must still be able to read the repository metadata, or {@code OLD_SNAPSHOT_FORMAT} if the
 *         repository data tracks no version for one of the considered snapshots
 */
public static Version minCompatibleVersion(
    Version minNodeVersion,
    RepositoryData repositoryData,
    @Nullable Collection<SnapshotId> excluded
) {
    Version minCompatVersion = minNodeVersion;
    final Collection<SnapshotId> snapshotIds = repositoryData.getSnapshotIds();
    for (SnapshotId snapshotId : snapshotIds.stream()
        .filter(excluded == null ? sn -> true : Predicate.not(excluded::contains))
        .toList()) {
        final Version known = repositoryData.getVersion(snapshotId);
        if (known == null) {
            // No version is tracked in the repository data for this snapshot: fall back to the old snapshot format
            assert repositoryData.shardGenerations().totalShards() == 0
                : "Saw shard generations ["
                    + repositoryData.shardGenerations()
                    + "] but did not have versions tracked for snapshot ["
                    + snapshotId
                    + "]";
            return OLD_SNAPSHOT_FORMAT;
        } else {
            // keep the older of the two versions
            minCompatVersion = minCompatVersion.before(known) ? minCompatVersion : known;
        }
    }
    return minCompatVersion;
}

/**
 * Checks whether the metadata version supports writing {@link ShardGenerations} to the repository.
 *
 * @param repositoryMetaVersion version to check
 * @return true if version supports {@link ShardGenerations}
 */
public static boolean useShardGenerations(Version repositoryMetaVersion) {
    return repositoryMetaVersion.onOrAfter(SHARD_GEN_IN_REPO_DATA_VERSION);
}

/**
 * Checks whether the metadata version is on or after {@code INDEX_GEN_IN_REPO_DATA_VERSION}, i.e. whether index generations
 * can be written to the repository data.
 *
 * @param repositoryMetaVersion version to check
 * @return true if version is on or after {@code INDEX_GEN_IN_REPO_DATA_VERSION}
 */
public static boolean useIndexGenerations(Version repositoryMetaVersion) {
    return repositoryMetaVersion.onOrAfter(INDEX_GEN_IN_REPO_DATA_VERSION);
}

/**
 * Checks whether the metadata version supports writing the cluster- and repository-uuid to the repository.
 *
 * @param repositoryMetaVersion version to check
 * @return true if version supports writing cluster- and repository-uuid to the repository
 */
public static boolean includesUUIDs(Version repositoryMetaVersion) {
    return repositoryMetaVersion.onOrAfter(UUIDS_IN_REPO_DATA_VERSION);
}

/**
 * Checks whether the metadata version is on or after {@code FILE_INFO_WRITER_UUIDS_IN_SHARD_DATA_VERSION}.
 * NOTE(review): judging by the constant name this gates writer UUIDs in shard-level file info - confirm against the constant's docs.
 *
 * @param repositoryMetaVersion version to check
 * @return true if version is on or after {@code FILE_INFO_WRITER_UUIDS_IN_SHARD_DATA_VERSION}
 */
public static boolean includeFileInfoWriterUUID(Version repositoryMetaVersion) {
    return repositoryMetaVersion.onOrAfter(FILE_INFO_WRITER_UUIDS_IN_SHARD_DATA_VERSION);
}

/**
 * Deletes snapshots from the repository after first loading the current {@link RepositoryData} of the repository.
 * If loading the repository data fails, a {@link FailPendingRepoTasksTask} is submitted for the repository.
 *
 * @param deleteEntry    delete entry in cluster state
 * @param minNodeVersion minimum node version in the cluster
 */
private void deleteSnapshotsFromRepository(SnapshotDeletionsInProgress.Entry deleteEntry, Version minNodeVersion) {
    final long expectedRepoGen = deleteEntry.repositoryStateId();
    repositoriesService.getRepositoryData(deleteEntry.repository(), new ActionListener<>() {
        @Override
        public void onResponse(RepositoryData repositoryData) {
            assert repositoryData.getGenId() == expectedRepoGen
                : "Repository generation should not change as long as a ready delete is found in the cluster state but found ["
                    + expectedRepoGen
                    + "] in cluster state and ["
                    + repositoryData.getGenId()
                    + "] in the repository";
            deleteSnapshotsFromRepository(deleteEntry, repositoryData, minNodeVersion);
        }

        @Override
        public void onFailure(Exception e) {
            submitUnbatchedTask(
                "fail repo tasks for [" + deleteEntry.repository() + "]",
                new FailPendingRepoTasksTask(deleteEntry.repository(), e)
            );
        }
    });
}

/**
 * Submits the given task to the cluster service as an unbatched cluster state update.
 *
 * @param source description of the update, passed through to the cluster service
 * @param task   cluster state update task to run
 */
@SuppressForbidden(reason = "legacy usage of unbatched task") // TODO add support for batching here
private void submitUnbatchedTask(@SuppressWarnings("SameParameterValue") String source, ClusterStateUpdateTask task) {
    clusterService.submitUnbatchedStateUpdateTask(source, task);
}

/** Deletes snapshot from repository
 *
 * @param deleteEntry    delete entry in cluster state
 * @param repositoryData the {@link RepositoryData} of the repository to delete from
 * @param minNodeVersion minimum node version in the cluster
 */
private void deleteSnapshotsFromRepository( SnapshotDeletionsInProgress.Entry deleteEntry, RepositoryData repositoryData, Version minNodeVersion ) { if (repositoryOperations.startDeletion(deleteEntry.uuid())) { assert currentlyFinalizing.contains(deleteEntry.repository()); final List snapshotIds = deleteEntry.getSnapshots(); assert deleteEntry.state() == SnapshotDeletionsInProgress.State.STARTED : "incorrect state for entry [" + deleteEntry + "]"; if (snapshotIds.isEmpty()) { // this deletion overlapped one or more deletions that were successfully processed and there is no remaining snapshot to // delete now, we can avoid reaching to the repository and can complete the deletion. // TODO we should complete the deletion and resolve the listeners of SnapshotDeletionsInProgress with no snapshot sooner, // that would save some cluster state updates. removeSnapshotDeletionFromClusterState( deleteEntry, repositoryData, listeners -> completeListenersIgnoringException(listeners, null) ); return; } repositoriesService.repository(deleteEntry.repository()) .deleteSnapshots( snapshotIds, repositoryData.getGenId(), minCompatibleVersion(minNodeVersion, repositoryData, snapshotIds), new SnapshotDeleteListener() { private final ListenableFuture doneFuture = new ListenableFuture<>(); @Override public void onDone() { logger.info("snapshots {} deleted", snapshotIds); doneFuture.onResponse(null); } @Override public void onRepositoryDataWritten(RepositoryData updatedRepoData) { removeSnapshotDeletionFromClusterState( deleteEntry, updatedRepoData, listeners -> doneFuture.addListener(new ActionListener<>() { @Override public void onResponse(Void unused) { completeListenersIgnoringException(listeners, null); } @Override public void onFailure(Exception e) { // this should never be called, once updated repository metadata has been written to the // repository and the delete been removed from the cluster state, we ignore any further failures // and always complete the delete successfully 
assert false : e; } }) ); } @Override public void onFailure(Exception e) { submitUnbatchedTask( "remove snapshot deletion metadata after failed delete", new RemoveSnapshotDeletionAndContinueTask(deleteEntry, repositoryData) { @Override protected void handleListeners(List> deleteListeners) { failListenersIgnoringException(deleteListeners, e); } } ); } } ); } } /** * Removes a {@link SnapshotDeletionsInProgress.Entry} from {@link SnapshotDeletionsInProgress} in the cluster state after it executed * on the repository. * * @param deleteEntry delete entry to remove from the cluster state * @param repositoryData current {@link RepositoryData} for the repository we just ran the delete on. * @param listenersHandler consumer that gets passed a list of all listeners that had their delete entry successfully removed from the * cluster state */ private void removeSnapshotDeletionFromClusterState( final SnapshotDeletionsInProgress.Entry deleteEntry, final RepositoryData repositoryData, final Consumer>> listenersHandler ) { // We remove all snapshot ids that the delete successfully removed from the repository from enqueued snapshot delete entries during // the cluster state update. After the cluster state update we pass the list of listeners that had their entry removed from the // cluster state to the given handler submitUnbatchedTask("remove snapshot deletion metadata", new RemoveSnapshotDeletionAndContinueTask(deleteEntry, repositoryData) { @Override protected SnapshotDeletionsInProgress filterDeletions(SnapshotDeletionsInProgress deletions) { final SnapshotDeletionsInProgress updatedDeletions = deletionsWithoutSnapshots( deletions, deleteEntry.getSnapshots(), deleteEntry.repository() ); return updatedDeletions == null ? 
deletions : updatedDeletions; } @Override protected void handleListeners(List> deleteListeners) { assert repositoryData.getSnapshotIds().stream().noneMatch(deleteEntry.getSnapshots()::contains) : "Repository data contained snapshot ids " + repositoryData.getSnapshotIds() + " that should should been deleted by [" + deleteEntry + "]"; listenersHandler.accept(deleteListeners); } }); } /** * Handle snapshot or delete failure due to not being master any more so we don't try to do run additional cluster state updates. * The next master will try handling the missing operations. All we can do is fail all the listeners on this master node so that * transport requests return and we don't leak listeners. * * @param e exception that caused us to realize we are not master any longer */ private void failAllListenersOnMasterFailOver(Exception e) { logger.debug("Failing all snapshot operation listeners because this node is not master any longer", e); synchronized (currentlyFinalizing) { if (ExceptionsHelper.unwrap(e, NotMasterException.class, FailedToCommitClusterStateException.class) != null) { repositoryOperations.clear(); for (Snapshot snapshot : Set.copyOf(snapshotCompletionListeners.keySet())) { failSnapshotCompletionListeners(snapshot, new SnapshotException(snapshot, "no longer master")); } final Exception wrapped = new RepositoryException("_all", "Failed to update cluster state during repository operation", e); for (Iterator>> iterator = snapshotDeletionListeners.values().iterator(); iterator.hasNext();) { final List> listeners = iterator.next(); iterator.remove(); failListenersIgnoringException(listeners, wrapped); } assert snapshotDeletionListeners.isEmpty() : "No new listeners should have been added but saw " + snapshotDeletionListeners; } else { assert false : new AssertionError("Modifying snapshot state should only ever fail because we failed to publish new state", e); logger.error("Unexpected failure during cluster state update", e); } currentlyFinalizing.clear(); } } 
/** * A cluster state update that will remove a given {@link SnapshotDeletionsInProgress.Entry} from the cluster state * and trigger running the next snapshot-delete or -finalization operation available to execute if there is one * ready in the cluster state as a result of this state update. */ private abstract class RemoveSnapshotDeletionAndContinueTask extends ClusterStateUpdateTask { // Snapshots that can be finalized after the delete operation has been removed from the cluster state protected final List newFinalizations = new ArrayList<>(); private List readyDeletions = Collections.emptyList(); protected final SnapshotDeletionsInProgress.Entry deleteEntry; private final RepositoryData repositoryData; RemoveSnapshotDeletionAndContinueTask(SnapshotDeletionsInProgress.Entry deleteEntry, RepositoryData repositoryData) { this.deleteEntry = deleteEntry; this.repositoryData = repositoryData; } @Override public ClusterState execute(ClusterState currentState) { final SnapshotDeletionsInProgress deletions = currentState.custom(SnapshotDeletionsInProgress.TYPE); assert deletions != null : "We only run this if there were deletions in the cluster state before"; final SnapshotDeletionsInProgress updatedDeletions = deletions.withRemovedEntry(deleteEntry.uuid()); if (updatedDeletions == deletions) { return currentState; } final SnapshotDeletionsInProgress newDeletions = filterDeletions(updatedDeletions); final Tuple> res = readyDeletions( updateWithSnapshots(currentState, updatedSnapshotsInProgress(currentState, newDeletions), newDeletions) ); readyDeletions = res.v2(); return res.v1(); } @Override public void onFailure(Exception e) { logger.warn(() -> format("%s failed to remove snapshot deletion metadata", deleteEntry), e); repositoryOperations.finishDeletion(deleteEntry.uuid()); failAllListenersOnMasterFailOver(e); } protected SnapshotDeletionsInProgress filterDeletions(SnapshotDeletionsInProgress deletions) { return deletions; } @Override public final void 
clusterStateProcessed(ClusterState oldState, ClusterState newState) { repositoryOperations.finishDeletion(deleteEntry.uuid()); final List> deleteListeners = snapshotDeletionListeners.remove(deleteEntry.uuid()); handleListeners(deleteListeners); if (newFinalizations.isEmpty()) { if (readyDeletions.isEmpty()) { leaveRepoLoop(deleteEntry.repository()); } else { for (SnapshotDeletionsInProgress.Entry readyDeletion : readyDeletions) { deleteSnapshotsFromRepository(readyDeletion, repositoryData, newState.nodes().getMinNodeVersion()); } } } else { leaveRepoLoop(deleteEntry.repository()); assert readyDeletions.stream().noneMatch(entry -> entry.repository().equals(deleteEntry.repository())) : "New finalizations " + newFinalizations + " added even though deletes " + readyDeletions + " are ready"; for (SnapshotsInProgress.Entry entry : newFinalizations) { endSnapshot(entry, newState.metadata(), repositoryData); } } // TODO: be more efficient here, we could collect newly ready shard clones as we compute them and then directly start them // instead of looping over all possible clones to execute startExecutableClones(newState.custom(SnapshotsInProgress.TYPE, SnapshotsInProgress.EMPTY), null); } /** * Invoke snapshot delete listeners for {@link #deleteEntry}. * * @param deleteListeners delete snapshot listeners or {@code null} if there weren't any for {@link #deleteEntry}. */ protected abstract void handleListeners(@Nullable List> deleteListeners); /** * Computes an updated {@link SnapshotsInProgress} that takes into account an updated version of * {@link SnapshotDeletionsInProgress} that has a {@link SnapshotDeletionsInProgress.Entry} removed from it * relative to the {@link SnapshotDeletionsInProgress} found in {@code initialState}. * The removal of a delete from the cluster state can trigger two possible actions on in-progress snapshots: *
    * <ul>
    * <li>Snapshots that had unfinished shard snapshots in state {@link ShardSnapshotStatus#UNASSIGNED_QUEUED} that
    * could not be started because the delete was running can have those started.</li>
    * <li>Snapshots that had all their shards reach a completed state while a delete was running (e.g. as a result of
    * nodes dropping out of the cluster or another incoming delete aborting them) need not be updated in the cluster
    * state but need to have their finalization triggered now that it's possible with the removal of the delete
    * from the state.</li>
    * </ul>
* * @param currentState current cluster state * @param updatedDeletions deletions with removed entry * @return updated snapshot in progress instance or {@code null} if there are no changes to it */ @Nullable private SnapshotsInProgress updatedSnapshotsInProgress(ClusterState currentState, SnapshotDeletionsInProgress updatedDeletions) { final SnapshotsInProgress snapshotsInProgress = currentState.custom(SnapshotsInProgress.TYPE, SnapshotsInProgress.EMPTY); final List snapshotEntries = new ArrayList<>(); // Keep track of shardIds that we started snapshots for as a result of removing this delete so we don't assign // them to multiple snapshots by accident final Set reassignedShardIds = new HashSet<>(); boolean changed = false; final String localNodeId = currentState.nodes().getLocalNodeId(); final String repoName = deleteEntry.repository(); InFlightShardSnapshotStates inFlightShardStates = null; // Keep track of IndexId values that may have gone unreferenced due to the delete entry just executed. // See org.elasticsearch.cluster.SnapshotsInProgress.Entry#withUpdatedIndexIds for details. 
final Set newIndexIdsToRefresh = new HashSet<>(); for (SnapshotsInProgress.Entry entry : snapshotsInProgress.forRepo(repoName)) { if (entry.state().completed() == false) { // TODO: dry up redundant computation and code between clone and non-clone case, in particular reuse // `inFlightShardStates` across both clone and standard snapshot code if (entry.isClone()) { // Collect waiting shards from that entry that we can assign now that we are done with the deletion final List canBeUpdated = new ArrayList<>(); for (Map.Entry value : entry.shardsByRepoShardId().entrySet()) { if (value.getValue().equals(ShardSnapshotStatus.UNASSIGNED_QUEUED) && reassignedShardIds.contains(value.getKey()) == false) { canBeUpdated.add(value.getKey()); } } // TODO: the below logic is very similar to that in #startCloning and both could be dried up against each other // also the code for standard snapshots could make use of this breakout as well if (canBeUpdated.isEmpty() || updatedDeletions.hasExecutingDeletion(repoName)) { // No shards can be updated in this snapshot so we just add it as is again snapshotEntries.add(entry); } else { if (inFlightShardStates == null) { inFlightShardStates = InFlightShardSnapshotStates.forEntries(snapshotsInProgress.forRepo(repoName)); } final ImmutableOpenMap.Builder updatedAssignmentsBuilder = ImmutableOpenMap.builder(entry.shardsByRepoShardId()); for (RepositoryShardId shardId : canBeUpdated) { if (inFlightShardStates.isActive(shardId.indexName(), shardId.shardId()) == false) { markShardReassigned(shardId, reassignedShardIds); updatedAssignmentsBuilder.put( shardId, new ShardSnapshotStatus( localNodeId, inFlightShardStates.generationForShard( shardId.index(), shardId.shardId(), repositoryData.shardGenerations() ) ) ); } } snapshotEntries.add(entry.withClones(updatedAssignmentsBuilder.build())); changed = true; } } else { // Collect waiting shards that in entry that we can assign now that we are done with the deletion final List canBeUpdated = new 
ArrayList<>(); for (Map.Entry value : entry.shardsByRepoShardId().entrySet()) { final RepositoryShardId repositoryShardId = value.getKey(); if (value.getValue().equals(ShardSnapshotStatus.UNASSIGNED_QUEUED) && reassignedShardIds.contains(repositoryShardId) == false) { canBeUpdated.add(repositoryShardId); if (repositoryData.hasIndex(repositoryShardId.indexName()) == false) { newIndexIdsToRefresh.add(repositoryShardId.index()); } } } if (canBeUpdated.isEmpty()) { // No shards can be updated in this snapshot so we just add it as is again snapshotEntries.add(entry); } else { final ImmutableOpenMap shardAssignments = shards( snapshotsInProgress, updatedDeletions, currentState, entry.indices().values(), entry.version().onOrAfter(SHARD_GEN_IN_REPO_DATA_VERSION), repositoryData, repoName ); final ImmutableOpenMap.Builder updatedAssignmentsBuilder = ImmutableOpenMap .builder(entry.shards()); for (RepositoryShardId shardId : canBeUpdated) { final ShardId sid = entry.shardId(shardId); final ShardSnapshotStatus updated = shardAssignments.get(sid); if (updated == null) { // We don't have a new assignment for this shard because its index was concurrently deleted assert currentState.routingTable().hasIndex(sid.getIndex()) == false : "Missing assignment for [" + sid + "]"; updatedAssignmentsBuilder.put(sid, ShardSnapshotStatus.MISSING); } else { if (updated.isActive()) { markShardReassigned(shardId, reassignedShardIds); } updatedAssignmentsBuilder.put(sid, updated); } } final SnapshotsInProgress.Entry updatedEntry = entry.withShardStates(updatedAssignmentsBuilder.build()); snapshotEntries.add(updatedEntry); changed = true; if (updatedEntry.state().completed()) { newFinalizations.add(entry); } } } } else { // Entry is already completed so we will finalize it now that the delete doesn't block us after // this CS update finishes newFinalizations.add(entry); snapshotEntries.add(entry); } } if (changed && newIndexIdsToRefresh.isEmpty() == false) { final Map updatedIndexIds = 
Maps.newMapWithExpectedSize(newIndexIdsToRefresh.size()); for (IndexId indexIdToRefresh : newIndexIdsToRefresh) { updatedIndexIds.put(indexIdToRefresh, new IndexId(indexIdToRefresh.getName(), UUIDs.randomBase64UUID())); } for (int i = 0; i < snapshotEntries.size(); i++) { snapshotEntries.set(i, snapshotEntries.get(i).withUpdatedIndexIds(updatedIndexIds)); } } return changed ? snapshotsInProgress.withUpdatedEntriesForRepo(repoName, snapshotEntries) : null; } private static void markShardReassigned(RepositoryShardId shardId, Set reassignments) { final boolean added = reassignments.add(shardId); assert added : "should only ever reassign each shard once but assigned [" + shardId + "] multiple times"; } } /** * Shortcut to build new {@link ClusterState} from the current state and updated values of {@link SnapshotsInProgress} and * {@link SnapshotDeletionsInProgress}. * * @param state current cluster state * @param snapshotsInProgress new value for {@link SnapshotsInProgress} or {@code null} if it's unchanged * @param snapshotDeletionsInProgress new value for {@link SnapshotDeletionsInProgress} or {@code null} if it's unchanged * @return updated cluster state */ public static ClusterState updateWithSnapshots( ClusterState state, @Nullable SnapshotsInProgress snapshotsInProgress, @Nullable SnapshotDeletionsInProgress snapshotDeletionsInProgress ) { if (snapshotsInProgress == null && snapshotDeletionsInProgress == null) { return state; } ClusterState.Builder builder = ClusterState.builder(state); if (snapshotsInProgress != null) { builder.putCustom(SnapshotsInProgress.TYPE, snapshotsInProgress); } if (snapshotDeletionsInProgress != null) { builder.putCustom(SnapshotDeletionsInProgress.TYPE, snapshotDeletionsInProgress); } return builder.build(); } private static void failListenersIgnoringException(@Nullable List> listeners, Exception failure) { if (listeners != null) { try { ActionListener.onFailure(listeners, failure); } catch (Exception ex) { assert false : new 
AssertionError(ex); logger.warn("Failed to notify listeners", ex); } } } private static void completeListenersIgnoringException(@Nullable List> listeners, T result) { if (listeners != null) { try { ActionListener.onResponse(listeners, result); } catch (Exception ex) { assert false : new AssertionError(ex); logger.warn("Failed to notify listeners", ex); } } } /** * Calculates the assignment of shards to data nodes for a new snapshot based on the given cluster state and the * indices that should be included in the snapshot. * * @param indices Indices to snapshot * @param useShardGenerations whether to write {@link ShardGenerations} during the snapshot * @return list of shard to be included into current snapshot */ private static ImmutableOpenMap shards( SnapshotsInProgress snapshotsInProgress, SnapshotDeletionsInProgress deletionsInProgress, ClusterState currentState, Collection indices, boolean useShardGenerations, RepositoryData repositoryData, String repoName ) { ImmutableOpenMap.Builder builder = ImmutableOpenMap.builder(); final ShardGenerations shardGenerations = repositoryData.shardGenerations(); final InFlightShardSnapshotStates inFlightShardStates = InFlightShardSnapshotStates.forEntries( snapshotsInProgress.forRepo(repoName) ); final boolean readyToExecute = deletionsInProgress.hasExecutingDeletion(repoName) == false; for (IndexId index : indices) { final String indexName = index.getName(); final boolean isNewIndex = repositoryData.getIndices().containsKey(indexName) == false; IndexMetadata indexMetadata = currentState.metadata().index(indexName); if (indexMetadata == null) { // The index was deleted before we managed to start the snapshot - mark it as missing. 
builder.put(new ShardId(indexName, IndexMetadata.INDEX_UUID_NA_VALUE, 0), ShardSnapshotStatus.MISSING); } else { final IndexRoutingTable indexRoutingTable = currentState.routingTable().index(indexName); assert indexRoutingTable != null; for (int i = 0; i < indexMetadata.getNumberOfShards(); i++) { final ShardId shardId = indexRoutingTable.shard(i).shardId(); final ShardGeneration shardRepoGeneration; if (useShardGenerations) { final ShardGeneration inFlightGeneration = inFlightShardStates.generationForShard( index, shardId.id(), shardGenerations ); if (inFlightGeneration == null && isNewIndex) { assert shardGenerations.getShardGen(index, shardId.getId()) == null : "Found shard generation for new index [" + index + "]"; shardRepoGeneration = ShardGenerations.NEW_SHARD_GEN; } else { shardRepoGeneration = inFlightGeneration; } } else { shardRepoGeneration = null; } final ShardSnapshotStatus shardSnapshotStatus; if (readyToExecute == false || inFlightShardStates.isActive(shardId.getIndexName(), shardId.id())) { shardSnapshotStatus = ShardSnapshotStatus.UNASSIGNED_QUEUED; } else { shardSnapshotStatus = initShardSnapshotStatus(shardRepoGeneration, indexRoutingTable.shard(i).primaryShard()); } builder.put(shardId, shardSnapshotStatus); } } } return builder.build(); } /** * Compute the snapshot status for a given shard based on the current primary routing entry for the shard. 
* * @param shardRepoGeneration repository generation of the shard in the repository * @param primary primary routing entry for the shard * @return shard snapshot status */ private static ShardSnapshotStatus initShardSnapshotStatus(ShardGeneration shardRepoGeneration, ShardRouting primary) { ShardSnapshotStatus shardSnapshotStatus; if (primary == null || primary.assignedToNode() == false) { shardSnapshotStatus = new ShardSnapshotStatus(null, ShardState.MISSING, "primary shard is not allocated", shardRepoGeneration); } else if (primary.relocating() || primary.initializing()) { shardSnapshotStatus = new ShardSnapshotStatus(primary.currentNodeId(), ShardState.WAITING, shardRepoGeneration); } else if (primary.started() == false) { shardSnapshotStatus = new ShardSnapshotStatus( primary.currentNodeId(), ShardState.MISSING, "primary shard hasn't been started yet", shardRepoGeneration ); } else { shardSnapshotStatus = new ShardSnapshotStatus(primary.currentNodeId(), shardRepoGeneration); } return shardSnapshotStatus; } /** * Returns the data streams that are currently being snapshotted (with partial == false) and that are contained in the * indices-to-check set. */ public static Set snapshottingDataStreams(final ClusterState currentState, final Set dataStreamsToCheck) { Map dataStreams = currentState.metadata().dataStreams(); return currentState.custom(SnapshotsInProgress.TYPE, SnapshotsInProgress.EMPTY) .asStream() .filter(e -> e.partial() == false) .flatMap(e -> e.dataStreams().stream()) .filter(ds -> dataStreams.containsKey(ds) && dataStreamsToCheck.contains(ds)) .collect(Collectors.toSet()); } /** * Returns the indices that are currently being snapshotted (with partial == false) and that are contained in the indices-to-check set. 
*/ public static Set snapshottingIndices(final ClusterState currentState, final Set indicesToCheck) { final Set indices = new HashSet<>(); for (List snapshotsInRepo : currentState.custom(SnapshotsInProgress.TYPE, SnapshotsInProgress.EMPTY) .entriesByRepo()) { for (final SnapshotsInProgress.Entry entry : snapshotsInRepo) { if (entry.partial() == false && entry.isClone() == false) { for (String indexName : entry.indices().keySet()) { IndexMetadata indexMetadata = currentState.metadata().index(indexName); if (indexMetadata != null && indicesToCheck.contains(indexMetadata.getIndex())) { indices.add(indexMetadata.getIndex()); } } } } } return indices; } /** * Filters out the aliases that refer to data streams to do not exist in the provided data streams. * Also rewrites the list of data streams an alias point to to only contain data streams that exist in the provided data streams. * * The purpose of this method is to capture the relevant data stream aliases based on the data streams * that will be included in a snapshot. * * @param dataStreams The provided data streams, which will be included in a snapshot. * @param dataStreamAliases The data streams aliases that may contain aliases that refer to data streams * that don't exist in the provided data streams. * @return The filtered data streams aliases only referring to data streams in the provided data streams. 
*/ static Map filterDataStreamAliases( Map dataStreams, Map dataStreamAliases ) { return dataStreamAliases.values() .stream() .filter(alias -> alias.getDataStreams().stream().anyMatch(dataStreams::containsKey)) .map(alias -> alias.intersect(dataStreams::containsKey)) .collect(Collectors.toMap(DataStreamAlias::getName, Function.identity())); } /** * Adds snapshot completion listener * * @param snapshot Snapshot to listen for * @param listener listener */ private void addListener(Snapshot snapshot, ActionListener listener) { snapshotCompletionListeners.computeIfAbsent(snapshot, k -> new CopyOnWriteArrayList<>()) .add(ContextPreservingActionListener.wrapPreservingContext(listener, threadPool.getThreadContext())); } @Override protected void doStart() { assert this.updateSnapshotStatusHandler != null; assert transportService.getRequestHandler(UPDATE_SNAPSHOT_STATUS_ACTION_NAME) != null; } @Override protected void doStop() { } @Override protected void doClose() { clusterService.removeApplier(this); } /** * Assert that no in-memory state for any running snapshot-create or -delete operation exists in this instance. */ public boolean assertAllListenersResolved() { final DiscoveryNode localNode = clusterService.localNode(); assert endingSnapshots.isEmpty() : "Found leaked ending snapshots " + endingSnapshots + " on [" + localNode + "]"; assert snapshotCompletionListeners.isEmpty() : "Found leaked snapshot completion listeners " + snapshotCompletionListeners + " on [" + localNode + "]"; assert currentlyFinalizing.isEmpty() : "Found leaked finalizations " + currentlyFinalizing + " on [" + localNode + "]"; assert snapshotDeletionListeners.isEmpty() : "Found leaked snapshot delete listeners " + snapshotDeletionListeners + " on [" + localNode + "]"; assert repositoryOperations.isEmpty() : "Found leaked snapshots to finalize " + repositoryOperations + " on [" + localNode + "]"; return true; } /** * Executor that applies {@link ShardSnapshotUpdate}s to the current cluster state. 
The algorithm implemented below works as described * below: * Every shard snapshot or clone state update can result in multiple snapshots being updated. In order to determine whether or not a * shard update has an effect we use an outer loop over all current executing snapshot operations that iterates over them in the order * they were started in and an inner loop over the list of shard update tasks. * * If the inner loop finds that a shard update task applies to a given snapshot and either a shard-snapshot or shard-clone operation in * it then it will update the state of the snapshot entry accordingly. If that update was a noop, then the task is removed from the * iteration as it was already applied before and likely just arrived on the master node again due to retries upstream. * If the update was not a noop, then it means that the shard it applied to is now available for another snapshot or clone operation * to be re-assigned if there is another snapshot operation that is waiting for the shard to become available. We therefore record the * fact that a task was executed by adding it to a collection of executed tasks. If a subsequent execution of the outer loop finds that * a task in the executed tasks collection applied to a shard it was waiting for to become available, then the shard snapshot operation * will be started for that snapshot entry and the task removed from the collection of tasks that need to be applied to snapshot * entries since it can not have any further effects. * * Package private to allow for tests. */ static final ClusterStateTaskExecutor SHARD_STATE_EXECUTOR = batchExecutionContext -> new SnapshotShardsUpdateContext(batchExecutionContext).computeUpdatedState(); private static boolean isQueued(@Nullable ShardSnapshotStatus status) { return status != null && status.state() == ShardState.QUEUED; } /** * State machine for updating existing {@link SnapshotsInProgress.Entry} by applying a given list of {@link ShardSnapshotUpdate} to * them. 
*/ private static final class SnapshotShardsUpdateContext { // number of updated shard snapshot states as a result of applying updates to the snapshot entries seen so far private int changedCount = 0; // number of started tasks as a result of applying updates to the snapshot entries seen so far private int startedCount = 0; // batch execution context private final ClusterStateTaskExecutor.BatchExecutionContext batchExecutionContext; // initial cluster state for update computation private final ClusterState initialState; // updates outstanding to be applied to existing snapshot entries private final Map> updatesByRepo; // updates that were used to update an existing in-progress shard snapshot private final Set executedUpdates = new HashSet<>(); SnapshotShardsUpdateContext(ClusterStateTaskExecutor.BatchExecutionContext batchExecutionContext) { this.batchExecutionContext = batchExecutionContext; this.initialState = batchExecutionContext.initialState(); this.updatesByRepo = new HashMap<>(); for (final var taskContext : batchExecutionContext.taskContexts()) { updatesByRepo.computeIfAbsent(taskContext.getTask().snapshot.getRepository(), r -> new ArrayList<>()) .add(taskContext.getTask()); } } ClusterState computeUpdatedState() { final SnapshotsInProgress existing = initialState.custom(SnapshotsInProgress.TYPE, SnapshotsInProgress.EMPTY); SnapshotsInProgress updated = existing; for (Map.Entry> updates : updatesByRepo.entrySet()) { final String repoName = updates.getKey(); final List oldEntries = existing.forRepo(repoName); if (oldEntries.isEmpty()) { continue; } final List newEntries = new ArrayList<>(oldEntries.size()); for (SnapshotsInProgress.Entry entry : oldEntries) { newEntries.add(applyToEntry(entry, updates.getValue())); } updated = updated.withUpdatedEntriesForRepo(repoName, newEntries); } final var result = new ShardSnapshotUpdateResult(initialState.metadata(), updated); for (final var taskContext : batchExecutionContext.taskContexts()) { taskContext.success(() 
-> taskContext.getTask().listener.onResponse(result)); } if (changedCount > 0) { logger.trace( "changed cluster state triggered by [{}] snapshot state updates and resulted in starting " + "[{}] shard snapshots", changedCount, startedCount ); return ClusterState.builder(initialState).putCustom(SnapshotsInProgress.TYPE, updated).build(); } assert existing == updated; return initialState; } private SnapshotsInProgress.Entry applyToEntry(SnapshotsInProgress.Entry entry, List updates) { // Completed snapshots do not require any updates so we just add them to the output list and keep going. // Also we short circuit if there are no more unconsumed updates to apply. if (entry.state().completed() || updates.isEmpty()) { return entry; } return new EntryContext(entry, updates).computeUpdatedEntry(); } // Per snapshot entry state private final class EntryContext { private final SnapshotsInProgress.Entry entry; // iterator containing the updates yet to be applied to #entry private final Iterator iterator; // builder for updated shard snapshot status mappings if any could be computed private ImmutableOpenMap.Builder shardsBuilder = null; // builder for updated shard clone status mappings if any could be computed private ImmutableOpenMap.Builder clonesBuilder = null; EntryContext(SnapshotsInProgress.Entry entry, List updates) { this.entry = entry; this.iterator = updates.iterator(); } SnapshotsInProgress.Entry computeUpdatedEntry() { assert shardsBuilder == null && clonesBuilder == null : "update context was already used"; // loop over all the shard updates that are potentially applicable to the current snapshot entry while (iterator.hasNext()) { final ShardSnapshotUpdate update = iterator.next(); if (entry.snapshot().getSnapshotId().equals(update.snapshot.getSnapshotId())) { // update a currently running shard level operation if (update.isClone()) { executeShardSnapshotUpdate(entry.shardsByRepoShardId(), this::clonesBuilder, update, update.repoShardId); } else { 
executeShardSnapshotUpdate(entry.shards(), this::shardsBuilder, update, update.shardId); } } else if (executedUpdates.contains(update)) { // try starting a new shard level operation because one has completed if (update.isClone()) { tryStartNextTaskAfterCloneUpdated(update.repoShardId, update.updatedState); } else { tryStartNextTaskAfterSnapshotUpdated(update.shardId, update.updatedState); } } } if (shardsBuilder != null) { assert clonesBuilder == null : "Should not have updated clones when updating shard snapshots but saw " + clonesBuilder + " as well as " + shardsBuilder; return entry.withShardStates(shardsBuilder.build()); } else if (clonesBuilder != null) { return entry.withClones(clonesBuilder.build()); } else { return entry; } } /** * Start shard level operation for given {@code shardId}. * * @param newStates builder for updated shard states mapping * @param nodeId node id to execute started operation on * @param generation shard generation to base started operation on * @param shardId shard identifier of shard to start operation for * @param either {@link ShardId} for snapshots or {@link RepositoryShardId} for clones */ private void startShardOperation( ImmutableOpenMap.Builder newStates, String nodeId, ShardGeneration generation, T shardId ) { startShardOperation(newStates, shardId, new ShardSnapshotStatus(nodeId, generation)); } /** * Start shard level operation for given {@code shardId}. 
* * @param newStates builder for updated shard states mapping * @param shardId shard identifier of shard to start operation for * @param newState new shard task state for operation to start * @param either {@link ShardId} for snapshots or {@link RepositoryShardId} for clones */ private void startShardOperation( ImmutableOpenMap.Builder newStates, T shardId, ShardSnapshotStatus newState ) { logger.trace( "[{}] Starting [{}] on [{}] with generation [{}]", entry.snapshot(), shardId, newState.nodeId(), newState.generation() ); newStates.put(shardId, newState); iterator.remove(); startedCount++; } private void executeShardSnapshotUpdate( Map existingStates, Supplier> newStates, ShardSnapshotUpdate updateSnapshotState, T updatedShard ) { assert updateSnapshotState.snapshot.equals(entry.snapshot()); final ShardSnapshotStatus existing = existingStates.get(updatedShard); if (existing == null) { logger.warn("Received shard snapshot status update [{}] but this shard is not tracked in [{}]", updatedShard, entry); assert false : "This should never happen, should only receive updates for expected shards"; return; } if (existing.state().completed()) { // No point in doing noop updates that might happen if data nodes resends shard status after a disconnect. 
iterator.remove(); return; } logger.trace( "[{}] Updating shard [{}] with status [{}]", updateSnapshotState.snapshot, updatedShard, updateSnapshotState.updatedState.state() ); changedCount++; newStates.get().put(updatedShard, updateSnapshotState.updatedState); executedUpdates.add(updateSnapshotState); } private void tryStartNextTaskAfterCloneUpdated(RepositoryShardId repoShardId, ShardSnapshotStatus updatedState) { // the update was already executed on the clone operation it applied to, now we check if it may be possible to // start a shard snapshot or clone operation on the current entry if (entry.isClone() == false) { tryStartSnapshotAfterCloneFinish(repoShardId, updatedState.generation()); } else if (isQueued(entry.shardsByRepoShardId().get(repoShardId))) { final String localNodeId = initialState.nodes().getLocalNodeId(); assert updatedState.nodeId().equals(localNodeId) : "Clone updated with node id [" + updatedState.nodeId() + "] but local node id is [" + localNodeId + "]"; startShardOperation(clonesBuilder(), localNodeId, updatedState.generation(), repoShardId); } } private void tryStartNextTaskAfterSnapshotUpdated(ShardId shardId, ShardSnapshotStatus updatedState) { // We applied the update for a shard snapshot state to its snapshot entry, now check if we can update // either a clone or a snapshot final IndexId indexId = entry.indices().get(shardId.getIndexName()); if (indexId != null) { final RepositoryShardId repoShardId = new RepositoryShardId(indexId, shardId.id()); if (isQueued(entry.shardsByRepoShardId().get(repoShardId))) { if (entry.isClone()) { // shard snapshot was completed, we check if we can start a clone operation for the same repo shard startShardOperation( clonesBuilder(), initialState.nodes().getLocalNodeId(), updatedState.generation(), repoShardId ); } else { startShardSnapshot(repoShardId, updatedState.generation()); } } } } private void tryStartSnapshotAfterCloneFinish(RepositoryShardId repoShardId, ShardGeneration generation) { assert 
entry.source() == null; // current entry is a snapshot operation so we must translate the repository shard id to a routing shard id if (isQueued(entry.shardsByRepoShardId().get(repoShardId))) { startShardSnapshot(repoShardId, generation); } } private void startShardSnapshot(RepositoryShardId repoShardId, ShardGeneration generation) { final Index index = entry.indexByName(repoShardId.indexName()); assert index != null : "index [" + repoShardId.index() + "] must exist in snapshot entry [" + entry + "] because it's a normal snapshot but did not"; // work out the node to run the snapshot task on as it might have changed from the previous operation if it was a clone // or there was a primary failover final IndexRoutingTable indexRouting = initialState.routingTable().index(index); final ShardRouting shardRouting; if (indexRouting == null) { shardRouting = null; } else { shardRouting = indexRouting.shard(repoShardId.shardId()).primaryShard(); } final ShardSnapshotStatus shardSnapshotStatus = initShardSnapshotStatus(generation, shardRouting); final ShardId routingShardId = shardRouting != null ? shardRouting.shardId() : new ShardId(index, repoShardId.shardId()); if (shardSnapshotStatus.isActive()) { startShardOperation(shardsBuilder(), routingShardId, shardSnapshotStatus); } else { // update to queued snapshot did not result in an actual update execution so we just record it but keep applying // the update to e.g. 
fail all snapshots for a given shard if the primary for the shard went away shardsBuilder().put(routingShardId, shardSnapshotStatus); } } private ImmutableOpenMap.Builder clonesBuilder() { assert shardsBuilder == null; if (clonesBuilder == null) { clonesBuilder = ImmutableOpenMap.builder(entry.shardsByRepoShardId()); } return clonesBuilder; } private ImmutableOpenMap.Builder shardsBuilder() { assert clonesBuilder == null; if (shardsBuilder == null) { shardsBuilder = ImmutableOpenMap.builder(entry.shards()); } return shardsBuilder; } } } /** * The result of a {@link ShardSnapshotUpdate}, capturing the info needed to finalize the relevant snapshot if appropriate. */ record ShardSnapshotUpdateResult(Metadata metadata, SnapshotsInProgress snapshotsInProgress) {} /** * An update to the snapshot state of a shard. * * Package private for testing */ static final class ShardSnapshotUpdate implements ClusterStateTaskListener { private final Snapshot snapshot; private final ShardId shardId; private final RepositoryShardId repoShardId; private final ShardSnapshotStatus updatedState; private final ActionListener listener; ShardSnapshotUpdate( Snapshot snapshot, ShardId shardId, RepositoryShardId repoShardId, ShardSnapshotStatus updatedState, ActionListener listener ) { assert shardId != null ^ repoShardId != null; this.snapshot = snapshot; this.shardId = shardId; this.repoShardId = repoShardId; this.updatedState = updatedState; this.listener = listener; } public boolean isClone() { return repoShardId != null; } @Override public void onFailure(Exception e) { listener.onFailure(e); } @Override public boolean equals(Object other) { if (this == other) { return true; } if ((other instanceof ShardSnapshotUpdate) == false) { return false; } final ShardSnapshotUpdate that = (ShardSnapshotUpdate) other; return this.snapshot.equals(that.snapshot) && Objects.equals(this.shardId, that.shardId) && Objects.equals(this.repoShardId, that.repoShardId) && this.updatedState == that.updatedState; 
} @Override public int hashCode() { return Objects.hash(snapshot, shardId, updatedState, repoShardId); } @Override public String toString() { return "ShardSnapshotUpdate{" + "snapshot=" + snapshot + ", shardId=" + shardId + ", repoShardId=" + repoShardId + ", updatedState=" + updatedState + '}'; } } private void innerUpdateSnapshotState( Snapshot snapshot, ShardId shardId, RepositoryShardId repoShardId, ShardSnapshotStatus updatedState, ActionListener listener ) { var update = new ShardSnapshotUpdate(snapshot, shardId, repoShardId, updatedState, listener.delegateFailure((delegate, result) -> { try { delegate.onResponse(null); } finally { // Maybe this state update completed the snapshot. If we are not already ending it because of a concurrent // state update we check if its state is completed and end it if it is. final SnapshotsInProgress snapshotsInProgress = result.snapshotsInProgress(); if (endingSnapshots.contains(snapshot) == false) { final SnapshotsInProgress.Entry updatedEntry = snapshotsInProgress.snapshot(snapshot); // If the entry is still in the cluster state and is completed, try finalizing the snapshot in the repo if (updatedEntry != null && updatedEntry.state().completed()) { endSnapshot(updatedEntry, result.metadata(), null); } } startExecutableClones(snapshotsInProgress, snapshot.getRepository()); } })); logger.trace("received updated snapshot restore state [{}]", update); clusterService.submitStateUpdateTask( "update snapshot state", update, ClusterStateTaskConfig.build(Priority.NORMAL), SHARD_STATE_EXECUTOR ); } private void startExecutableClones(SnapshotsInProgress snapshotsInProgress, @Nullable String repoName) { if (repoName == null) { for (List entries : snapshotsInProgress.entriesByRepo()) { startExecutableClones(entries); } } else { startExecutableClones(snapshotsInProgress.forRepo(repoName)); } } private void startExecutableClones(List entries) { for (SnapshotsInProgress.Entry entry : entries) { if (entry.isClone() && entry.state() == 
State.STARTED) { // this is a clone, see if new work is ready for (Map.Entry clone : entry.shardsByRepoShardId().entrySet()) { if (clone.getValue().state() == ShardState.INIT) { runReadyClone( entry.snapshot(), entry.source(), clone.getValue(), clone.getKey(), repositoriesService.repository(entry.repository()) ); } } } } } private class UpdateSnapshotStatusAction extends TransportMasterNodeAction< UpdateIndexShardSnapshotStatusRequest, ActionResponse.Empty> { UpdateSnapshotStatusAction( TransportService transportService, ClusterService clusterService, ThreadPool threadPool, ActionFilters actionFilters, IndexNameExpressionResolver indexNameExpressionResolver ) { super( UPDATE_SNAPSHOT_STATUS_ACTION_NAME, false, transportService, clusterService, threadPool, actionFilters, UpdateIndexShardSnapshotStatusRequest::new, indexNameExpressionResolver, in -> ActionResponse.Empty.INSTANCE, ThreadPool.Names.SAME ); } @Override protected void masterOperation( Task task, UpdateIndexShardSnapshotStatusRequest request, ClusterState state, ActionListener listener ) { innerUpdateSnapshotState( request.snapshot(), request.shardId(), null, request.status(), listener.map(v -> ActionResponse.Empty.INSTANCE) ); } @Override protected ClusterBlockException checkBlock(UpdateIndexShardSnapshotStatusRequest request, ClusterState state) { return null; } } /** * Cluster state update task that removes all {@link SnapshotsInProgress.Entry} and {@link SnapshotDeletionsInProgress.Entry} for a * given repository from the cluster state and afterwards fails all relevant listeners in {@link #snapshotCompletionListeners} and * {@link #snapshotDeletionListeners}. 
*/
    private final class FailPendingRepoTasksTask extends ClusterStateUpdateTask {

        // Snapshots to fail after the state update
        private final List<Snapshot> snapshotsToFail = new ArrayList<>();

        // Delete uuids to fail after the state update
        private final List<String> deletionsToFail = new ArrayList<>();

        // Failure that caused the decision to fail all snapshots and deletes for a repo
        private final Exception failure;

        private final String repository;

        /**
         * @param repository name of the repository to remove all snapshot and delete operations for
         * @param failure    failure that the listeners of the removed operations are failed with
         */
        FailPendingRepoTasksTask(String repository, Exception failure) {
            this.repository = repository;
            this.failure = failure;
        }

        @Override
        public ClusterState execute(ClusterState currentState) {
            final SnapshotDeletionsInProgress deletionsInProgress = currentState.custom(
                SnapshotDeletionsInProgress.TYPE,
                SnapshotDeletionsInProgress.EMPTY
            );
            boolean changed = false;
            final List<SnapshotDeletionsInProgress.Entry> remainingEntries = deletionsInProgress.getEntries();
            List<SnapshotDeletionsInProgress.Entry> updatedEntries = new ArrayList<>(remainingEntries.size());
            for (SnapshotDeletionsInProgress.Entry entry : remainingEntries) {
                if (entry.repository().equals(repository)) {
                    changed = true;
                    deletionsToFail.add(entry.uuid());
                } else {
                    updatedEntries.add(entry);
                }
            }
            // null signals that the deletions in the cluster state do not need to be updated
            final SnapshotDeletionsInProgress updatedDeletions = changed ? SnapshotDeletionsInProgress.of(updatedEntries) : null;
            final SnapshotsInProgress snapshotsInProgress = currentState.custom(SnapshotsInProgress.TYPE, SnapshotsInProgress.EMPTY);
            boolean changedSnapshots = false;
            for (SnapshotsInProgress.Entry entry : snapshotsInProgress.forRepo(repository)) {
                // We failed to read repository data for this delete, it is not the job of SnapshotsService to
                // retry these kinds of issues so we fail all the pending snapshots
                snapshotsToFail.add(entry.snapshot());
                changedSnapshots = true;
            }
            // null signals that the snapshots in the cluster state do not need to be updated
            final SnapshotsInProgress updatedSnapshotsInProgress = changedSnapshots
                ? snapshotsInProgress.withUpdatedEntriesForRepo(repository, List.of())
                : null;
            return updateWithSnapshots(currentState, updatedSnapshotsInProgress, updatedDeletions);
        }

        @Override
        public void onFailure(Exception e) {
            logger.info(() -> "Failed to remove all snapshot tasks for repo [" + repository + "] from cluster state", e);
            failAllListenersOnMasterFailOver(e);
        }

        @Override
        public void clusterStateProcessed(ClusterState oldState, ClusterState newState) {
            logger.warn(
                () -> format("Removed all snapshot tasks for repository [%s] from cluster state, now failing listeners", repository),
                failure
            );
            synchronized (currentlyFinalizing) {
                // drain all finalizations queued for the repository, each of them must belong to a snapshot that is failed below
                Tuple<Snapshot, Metadata> finalization;
                while ((finalization = repositoryOperations.pollFinalization(repository)) != null) {
                    assert snapshotsToFail.contains(finalization.v1())
                        : "[" + finalization.v1() + "] not found in snapshots to fail " + snapshotsToFail;
                }
                leaveRepoLoop(repository);
                for (Snapshot snapshot : snapshotsToFail) {
                    failSnapshotCompletionListeners(snapshot, failure);
                }
                for (String delete : deletionsToFail) {
                    failListenersIgnoringException(snapshotDeletionListeners.remove(delete), failure);
                    repositoryOperations.finishDeletion(delete);
                }
            }
        }
    }

    /**
     * In-memory bookkeeping of the snapshots queued for finalization per repository and of the delete operations currently
     * running.
     */
    private static final class OngoingRepositoryOperations {

        /**
         * Map of repository name to a deque of {@link Snapshot} that need to be finalized for the repository and the
         * {@link Metadata to use when finalizing}.
         */
        private final Map<String, Deque<Snapshot>> snapshotsToFinalize = new HashMap<>();

        /**
         * Set of delete operations currently being executed against the repository. The values in this set are the delete UUIDs returned
         * by {@link SnapshotDeletionsInProgress.Entry#uuid()}.
         */
        private final Set<String> runningDeletions = Collections.synchronizedSet(new HashSet<>());

        // metadata passed to the most recent addFinalization call, reset once no finalizations are queued any more
        @Nullable
        private Metadata latestKnownMetaData;

        /**
         * Removes and returns the next snapshot queued for finalization in the given repository.
         *
         * @param repository name of the repository to poll
         * @return tuple of the next snapshot to finalize and the latest known metadata, or {@code null} if nothing is queued for the
         *         repository
         */
        @Nullable
        synchronized Tuple<Snapshot, Metadata> pollFinalization(String repository) {
            assert assertConsistent();
            final Snapshot nextEntry;
            final Deque<Snapshot> queued = snapshotsToFinalize.get(repository);
            if (queued == null) {
                return null;
            }
            nextEntry = queued.pollFirst();
            assert nextEntry != null;
            final Tuple<Snapshot, Metadata> res = Tuple.tuple(nextEntry, latestKnownMetaData);
            if (queued.isEmpty()) {
                // never keep empty queues around, see #assertConsistent
                snapshotsToFinalize.remove(repository);
            }
            if (snapshotsToFinalize.isEmpty()) {
                latestKnownMetaData = null;
            }
            assert assertConsistent();
            return res;
        }

        /**
         * Records the given delete as running.
         *
         * @param deleteUUID uuid of the delete operation
         * @return {@code true} if the delete was not already recorded as running
         */
        boolean startDeletion(String deleteUUID) {
            return runningDeletions.add(deleteUUID);
        }

        /**
         * Removes the given delete from the set of running deletes.
         *
         * @param deleteUUID uuid of the delete operation
         */
        void finishDeletion(String deleteUUID) {
            runningDeletions.remove(deleteUUID);
        }

        /**
         * Queues the given snapshot for finalization in its repository and records the given metadata as the latest known one.
         *
         * @param snapshot snapshot to finalize
         * @param metadata metadata to use when finalizing
         */
        synchronized void addFinalization(Snapshot snapshot, Metadata metadata) {
            snapshotsToFinalize.computeIfAbsent(snapshot.getRepository(), k -> new LinkedList<>()).add(snapshot);
            this.latestKnownMetaData = metadata;
            assert assertConsistent();
        }

        /**
         * Clear all state associated with running snapshots. To be used on master-failover if the current node stops
         * being master.
         */
        synchronized void clear() {
            snapshotsToFinalize.clear();
            runningDeletions.clear();
            latestKnownMetaData = null;
        }

        /**
         * @return {@code true} if no snapshot is queued for finalization; running deletes are not taken into account
         */
        synchronized boolean isEmpty() {
            return snapshotsToFinalize.isEmpty();
        }

        /**
         * Asserts that the given snapshot is not queued for finalization.
         *
         * @return always {@code true} so that the method can be invoked from within an {@code assert} statement
         */
        synchronized boolean assertNotQueued(Snapshot snapshot) {
            assert snapshotsToFinalize.getOrDefault(snapshot.getRepository(), new LinkedList<>())
                .stream()
                .noneMatch(entry -> entry.equals(snapshot)) : "Snapshot [" + snapshot + "] is still in finalization queue";
            return true;
        }

        /**
         * Asserts that metadata is held if and only if at least one snapshot is queued and that no empty queue is retained.
         *
         * @return always {@code true} so that the method can be invoked from within an {@code assert} statement
         */
        synchronized boolean assertConsistent() {
            assert (latestKnownMetaData == null && snapshotsToFinalize.isEmpty())
                || (latestKnownMetaData != null && snapshotsToFinalize.isEmpty() == false)
                : "Should not hold on to metadata if there are no more queued snapshots";
            assert snapshotsToFinalize.values().stream().noneMatch(Collection::isEmpty) : "Found empty queue in " + snapshotsToFinalize;
            return true;
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy