org.elasticsearch.repositories.blobstore.BlobStoreRepository Maven / Gradle / Ivy
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the Server Side Public License, v 1; you may not use this file except
* in compliance with, at your election, the Elastic License 2.0 or the Server
* Side Public License, v 1.
*/
package org.elasticsearch.repositories.blobstore;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.message.ParameterizedMessage;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexCommit;
import org.apache.lucene.index.IndexFormatTooNewException;
import org.apache.lucene.index.IndexFormatTooOldException;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RateLimiter;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.SetOnce;
import org.elasticsearch.ExceptionsHelper;
import org.elasticsearch.Version;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.ActionRunnable;
import org.elasticsearch.action.ResultDeduplicator;
import org.elasticsearch.action.StepListener;
import org.elasticsearch.action.support.GroupedActionListener;
import org.elasticsearch.action.support.ListenableActionFuture;
import org.elasticsearch.action.support.PlainActionFuture;
import org.elasticsearch.action.support.ThreadedActionListener;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.ClusterStateUpdateTask;
import org.elasticsearch.cluster.RepositoryCleanupInProgress;
import org.elasticsearch.cluster.SnapshotDeletionsInProgress;
import org.elasticsearch.cluster.SnapshotsInProgress;
import org.elasticsearch.cluster.metadata.IndexMetadata;
import org.elasticsearch.cluster.metadata.Metadata;
import org.elasticsearch.cluster.metadata.RepositoriesMetadata;
import org.elasticsearch.cluster.metadata.RepositoryMetadata;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.cluster.routing.allocation.AllocationService;
import org.elasticsearch.cluster.service.ClusterService;
import org.elasticsearch.common.Numbers;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.UUIDs;
import org.elasticsearch.common.blobstore.BlobContainer;
import org.elasticsearch.common.blobstore.BlobMetadata;
import org.elasticsearch.common.blobstore.BlobPath;
import org.elasticsearch.common.blobstore.BlobStore;
import org.elasticsearch.common.blobstore.DeleteResult;
import org.elasticsearch.common.blobstore.fs.FsBlobContainer;
import org.elasticsearch.common.bytes.BytesArray;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.component.AbstractLifecycleComponent;
import org.elasticsearch.common.compress.NotXContentException;
import org.elasticsearch.common.io.Streams;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.common.lucene.store.InputStreamIndexInput;
import org.elasticsearch.common.metrics.CounterMetric;
import org.elasticsearch.common.settings.Setting;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.ByteSizeUnit;
import org.elasticsearch.common.unit.ByteSizeValue;
import org.elasticsearch.common.util.BigArrays;
import org.elasticsearch.common.util.concurrent.AbstractRunnable;
import org.elasticsearch.common.util.concurrent.ConcurrentCollections;
import org.elasticsearch.common.util.concurrent.FutureUtils;
import org.elasticsearch.common.xcontent.LoggingDeprecationHandler;
import org.elasticsearch.core.CheckedConsumer;
import org.elasticsearch.core.Nullable;
import org.elasticsearch.core.Releasable;
import org.elasticsearch.core.SuppressForbidden;
import org.elasticsearch.core.Tuple;
import org.elasticsearch.index.shard.ShardId;
import org.elasticsearch.index.snapshots.IndexShardRestoreFailedException;
import org.elasticsearch.index.snapshots.IndexShardSnapshotFailedException;
import org.elasticsearch.index.snapshots.IndexShardSnapshotStatus;
import org.elasticsearch.index.snapshots.blobstore.BlobStoreIndexShardSnapshot;
import org.elasticsearch.index.snapshots.blobstore.BlobStoreIndexShardSnapshots;
import org.elasticsearch.index.snapshots.blobstore.RateLimitingInputStream;
import org.elasticsearch.index.snapshots.blobstore.SlicedInputStream;
import org.elasticsearch.index.snapshots.blobstore.SnapshotFiles;
import org.elasticsearch.index.store.Store;
import org.elasticsearch.index.store.StoreFileMetadata;
import org.elasticsearch.indices.recovery.RecoverySettings;
import org.elasticsearch.indices.recovery.RecoveryState;
import org.elasticsearch.repositories.FinalizeSnapshotContext;
import org.elasticsearch.repositories.GetSnapshotInfoContext;
import org.elasticsearch.repositories.IndexId;
import org.elasticsearch.repositories.IndexMetaDataGenerations;
import org.elasticsearch.repositories.RepositoriesService;
import org.elasticsearch.repositories.Repository;
import org.elasticsearch.repositories.RepositoryCleanupResult;
import org.elasticsearch.repositories.RepositoryData;
import org.elasticsearch.repositories.RepositoryData.SnapshotDetails;
import org.elasticsearch.repositories.RepositoryException;
import org.elasticsearch.repositories.RepositoryOperation;
import org.elasticsearch.repositories.RepositoryShardId;
import org.elasticsearch.repositories.RepositoryStats;
import org.elasticsearch.repositories.RepositoryVerificationException;
import org.elasticsearch.repositories.ShardGeneration;
import org.elasticsearch.repositories.ShardGenerations;
import org.elasticsearch.repositories.ShardSnapshotResult;
import org.elasticsearch.repositories.SnapshotShardContext;
import org.elasticsearch.snapshots.AbortedSnapshotException;
import org.elasticsearch.snapshots.SnapshotException;
import org.elasticsearch.snapshots.SnapshotId;
import org.elasticsearch.snapshots.SnapshotInfo;
import org.elasticsearch.snapshots.SnapshotMissingException;
import org.elasticsearch.snapshots.SnapshotsService;
import org.elasticsearch.tasks.TaskCancelledException;
import org.elasticsearch.threadpool.ThreadPool;
import org.elasticsearch.xcontent.NamedXContentRegistry;
import org.elasticsearch.xcontent.XContentBuilder;
import org.elasticsearch.xcontent.XContentFactory;
import org.elasticsearch.xcontent.XContentParser;
import org.elasticsearch.xcontent.XContentType;
import java.io.FilterInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.NoSuchFileException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Executor;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Consumer;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.stream.Collectors;
import java.util.stream.LongStream;
import java.util.stream.Stream;
import static org.elasticsearch.core.Strings.format;
import static org.elasticsearch.index.snapshots.blobstore.BlobStoreIndexShardSnapshot.FileInfo.canonicalName;
/**
* BlobStore - based implementation of Snapshot Repository
*
* This repository works with any {@link BlobStore} implementation. The blob store can be (and preferably is) lazily initialized in
* {@link #createBlobStore()}.
*
* For in depth documentation on how exactly implementations of this class interact with the snapshot functionality please refer to the
* documentation of the package {@link org.elasticsearch.repositories.blobstore}.
*/
public abstract class BlobStoreRepository extends AbstractLifecycleComponent implements Repository {
private static final Logger logger = LogManager.getLogger(BlobStoreRepository.class);
// Current repository metadata; volatile because it is replaced with the value from the cluster state in updateState(ClusterState).
protected volatile RepositoryMetadata metadata;
// Thread pool obtained from the cluster applier service in the constructor.
protected final ThreadPool threadPool;
// Name prefix of snapshot-level blobs, see SNAPSHOT_NAME_FORMAT.
public static final String SNAPSHOT_PREFIX = "snap-";
// Name prefix of the generational index blobs; also used to list the index blobs inside a shard container.
public static final String INDEX_FILE_PREFIX = "index-";
// Name of the blob that, per the SUPPORT_URL_REPO documentation, allows the repository contents to be used with a url-repository.
public static final String INDEX_LATEST_BLOB = "index.latest";
// NOTE(review): presumably the name prefix of blobs written during repository verification; usage is not visible here, confirm.
private static final String TESTS_FILE = "tests-";
// Name prefix of metadata blobs, see METADATA_NAME_FORMAT.
public static final String METADATA_PREFIX = "meta-";
// Blob name pattern shared by the global-metadata and index-metadata formats; "%s" is filled with an identifier.
public static final String METADATA_NAME_FORMAT = METADATA_PREFIX + "%s.dat";
// Blob name pattern shared by the snapshot-info and shard-snapshot formats; "%s" is filled with an identifier.
public static final String SNAPSHOT_NAME_FORMAT = SNAPSHOT_PREFIX + "%s.dat";
// Prefix and name pattern of the shard-level snapshots index blob (see INDEX_SHARD_SNAPSHOTS_FORMAT).
private static final String SNAPSHOT_INDEX_PREFIX = "index-";
private static final String SNAPSHOT_INDEX_NAME_FORMAT = SNAPSHOT_INDEX_PREFIX + "%s";
// NOTE(review): presumably the name prefix of physically uploaded data blobs (contrast with VIRTUAL_DATA_BLOB_PREFIX); confirm.
public static final String UPLOADED_DATA_BLOB_PREFIX = "__";
// Expose a copy of URLRepository#TYPE here too, for a better error message until https://github.com/elastic/elasticsearch/issues/68918
// is resolved.
public static final String URL_REPOSITORY_TYPE = "url";
/**
* All {@link BlobStoreRepository} implementations can be made read-only by setting this key to {@code true} in their settings.
*/
public static final String READONLY_SETTING_KEY = "readonly";
/**
* Prefix used for the identifiers of data blobs that were not actually written to the repository physically because their contents are
* already stored in the metadata referencing them, i.e. in {@link BlobStoreIndexShardSnapshot} and
* {@link BlobStoreIndexShardSnapshots}. This is the case for files for which {@link StoreFileMetadata#hashEqualsContents()} is
* {@code true}.
*/
private static final String VIRTUAL_DATA_BLOB_PREFIX = "v__";
/**
 * When set to true metadata files are stored in compressed format. This setting doesn't affect index
 * files that are already compressed by default. Changing the setting does not invalidate existing files since reads
 * do not observe the setting, instead they examine the file to see if it is compressed or not.
 */
public static final Setting<Boolean> COMPRESS_SETTING = Setting.boolSetting("compress", true, Setting.Property.NodeScope);
/**
 * Setting to disable caching of the latest repository data. Defaults to {@code true}; registered with the deprecation-warning
 * property.
 */
public static final Setting<Boolean> CACHE_REPOSITORY_DATA = Setting.boolSetting(
    "cache_repository_data",
    true,
    Setting.Property.DeprecatedWarning
);
/**
 * Size hint for the IO buffer size to use when reading from and writing to the repository. Defaults to 128kb and is bounded to the
 * range [8kb, 16mb].
 */
public static final Setting<ByteSizeValue> BUFFER_SIZE_SETTING = Setting.byteSizeSetting(
    "io_buffer_size",
    ByteSizeValue.parseBytesSizeValue("128kb", "io_buffer_size"),
    // the second argument is only a label used when the literal fails to parse; keep it consistent with the setting key
    ByteSizeValue.parseBytesSizeValue("8kb", "io_buffer_size"),
    ByteSizeValue.parseBytesSizeValue("16mb", "io_buffer_size"),
    Setting.Property.NodeScope
);
/**
 * Setting to disable writing the {@code index.latest} blob which enables the contents of this repository to be used with a
 * url-repository.
 */
public static final Setting<Boolean> SUPPORT_URL_REPO = Setting.boolSetting("support_url_repo", true, Setting.Property.NodeScope);
/**
 * Setting that defines the maximum number of snapshots to which the repository may grow. Trying to create a snapshot into the
 * repository that would move it above this size will throw an exception. Defaults to {@link Integer#MAX_VALUE}, minimum is 1.
 */
public static final Setting<Integer> MAX_SNAPSHOTS_SETTING = Setting.intSetting(
    "max_number_of_snapshots",
    Integer.MAX_VALUE,
    1,
    Setting.Property.NodeScope
);
/**
 * Setting that defines if the repository should be used to recover index files during peer recoveries. Defaults to {@code false}.
 */
public static final Setting<Boolean> USE_FOR_PEER_RECOVERY_SETTING = Setting.boolSetting("use_for_peer_recovery", false);
// Value of SUPPORT_URL_REPO read from the repository settings in the constructor.
protected final boolean supportURLRepo;
// Value of COMPRESS_SETTING read in the constructor; passed to the blob formats when writing metadata.
private final boolean compress;
// Value of CACHE_REPOSITORY_DATA read from the repository settings in the constructor.
private final boolean cacheRepositoryData;
// Rate limiters built from MAX_SNAPSHOT_BYTES_PER_SEC / MAX_RESTORE_BYTES_PER_SEC; volatile because updateState(ClusterState)
// replaces them whenever the repository settings change.
private volatile RateLimiter snapshotRateLimiter;
private volatile RateLimiter restoreRateLimiter;
// NOTE(review): presumably accumulate the time spent throttled by the two rate limiters above; the updates are not visible here.
private final CounterMetric snapshotRateLimitingTimeInNanos = new CounterMetric();
private final CounterMetric restoreRateLimitingTimeInNanos = new CounterMetric();
/** Format of the global cluster {@link Metadata} blob, named after {@link #METADATA_NAME_FORMAT}. */
public static final ChecksumBlobStoreFormat<Metadata> GLOBAL_METADATA_FORMAT = new ChecksumBlobStoreFormat<>(
    "metadata",
    METADATA_NAME_FORMAT,
    (repoName, parser) -> Metadata.fromXContent(parser)
);
/**
 * Format of the per-index {@link IndexMetadata} blob, named after {@link #METADATA_NAME_FORMAT}. Two readers are supplied, one for
 * the legacy and one for the current x-content layout.
 */
public static final ChecksumBlobStoreFormat<IndexMetadata> INDEX_METADATA_FORMAT = new ChecksumBlobStoreFormat<>(
    "index-metadata",
    METADATA_NAME_FORMAT,
    (repoName, parser) -> IndexMetadata.Builder.legacyFromXContent(parser),
    (repoName, parser) -> IndexMetadata.fromXContent(parser)
);
/** Codec name shared by the snapshot-info and the shard-snapshot formats. */
private static final String SNAPSHOT_CODEC = "snapshot";
/** Format of the {@link SnapshotInfo} blob, named after {@link #SNAPSHOT_NAME_FORMAT}. */
public static final ChecksumBlobStoreFormat<SnapshotInfo> SNAPSHOT_FORMAT = new ChecksumBlobStoreFormat<>(
    SNAPSHOT_CODEC,
    SNAPSHOT_NAME_FORMAT,
    SnapshotInfo::fromXContentInternal
);
/** Format of the shard-level {@link BlobStoreIndexShardSnapshot} blob, named after {@link #SNAPSHOT_NAME_FORMAT}. */
public static final ChecksumBlobStoreFormat<BlobStoreIndexShardSnapshot> INDEX_SHARD_SNAPSHOT_FORMAT = new ChecksumBlobStoreFormat<>(
    SNAPSHOT_CODEC,
    SNAPSHOT_NAME_FORMAT,
    (repoName, parser) -> BlobStoreIndexShardSnapshot.fromXContent(parser)
);
/** Format of the shard-level {@link BlobStoreIndexShardSnapshots} index blob, named after {@code SNAPSHOT_INDEX_NAME_FORMAT}. */
public static final ChecksumBlobStoreFormat<BlobStoreIndexShardSnapshots> INDEX_SHARD_SNAPSHOTS_FORMAT = new ChecksumBlobStoreFormat<>(
    "snapshots",
    SNAPSHOT_INDEX_NAME_FORMAT,
    (repoName, parser) -> BlobStoreIndexShardSnapshots.fromXContent(parser)
);
/** Upper bound for the snapshot write rate; defaults to 40mb and can be updated dynamically. */
public static final Setting<ByteSizeValue> MAX_SNAPSHOT_BYTES_PER_SEC = Setting.byteSizeSetting(
    "max_snapshot_bytes_per_sec",
    new ByteSizeValue(40, ByteSizeUnit.MB),
    Setting.Property.Dynamic,
    Setting.Property.NodeScope
);
/** Upper bound for the restore read rate; defaults to {@link ByteSizeValue#ZERO} and can be updated dynamically. */
public static final Setting<ByteSizeValue> MAX_RESTORE_BYTES_PER_SEC = Setting.byteSizeSetting(
    "max_restore_bytes_per_sec",
    ByteSizeValue.ZERO,
    Setting.Property.Dynamic,
    Setting.Property.NodeScope
);
/**
 * Repository settings that can be updated dynamically without having to create a new repository.
 */
private static final Set<String> DYNAMIC_SETTING_NAMES = Set.of(
    MAX_SNAPSHOT_BYTES_PER_SEC.getKey(),
    MAX_RESTORE_BYTES_PER_SEC.getKey()
);
/** Whether the repository was configured as read-only via {@link #READONLY_SETTING_KEY}. */
private final boolean readOnly;
/** Monitor guarding the lazy creation of {@link #blobStore} and {@link #blobContainer}. */
private final Object lock = new Object();
/** Lazily created container for the repository base path, see {@link #blobContainer()}. */
private final SetOnce<BlobContainer> blobContainer = new SetOnce<>();
/** Lazily created blob store, see {@link #blobStore()}. */
private final SetOnce<BlobStore> blobStore = new SetOnce<>();
private final BlobPath basePath;
private final ClusterService clusterService;
private final RecoverySettings recoverySettings;
private final NamedXContentRegistry namedXContentRegistry;
protected final BigArrays bigArrays;
/**
 * Flag that is set to {@code true} if this instance is started with {@link #metadata} that has a higher value for
 * {@link RepositoryMetadata#pendingGeneration()} than for {@link RepositoryMetadata#generation()} indicating a full cluster restart
 * potentially accounting for the last {@code index-N} write in the cluster state.
 * Note: While it is true that this value could also be set to {@code true} for an instance on a node that is just joining the cluster
 * during a new {@code index-N} write, this does not present a problem. The node will still load the correct {@link RepositoryData} in
 * all cases and simply do a redundant listing of the repository contents if it tries to load {@link RepositoryData} and falls back
 * to {@link #latestIndexBlobId()} to validate the value of {@link RepositoryMetadata#generation()}.
 */
private boolean uncleanStart;
/**
 * This flag indicates that the repository can not exclusively rely on the value stored in {@link #latestKnownRepoGen} to determine the
 * latest repository generation but must inspect its physical contents as well via {@link #latestIndexBlobId()}.
 * This flag is set in the following situations:
 * <ul>
 * <li>All repositories that are read-only, i.e. for which {@link #isReadOnly()} returns {@code true} because there are no
 * guarantees that another cluster is not writing to the repository at the same time</li>
 * <li>The value of {@link RepositoryMetadata#generation()} for this repository is {@link RepositoryData#UNKNOWN_REPO_GEN}
 * indicating that no consistent repository generation is tracked in the cluster state yet.</li>
 * <li>The {@link #uncleanStart} flag is set to {@code true}</li>
 * </ul>
 */
private volatile boolean bestEffortConsistency;
/**
 * IO buffer size hint for reading and writing to the underlying blob store.
 */
protected final int bufferSize;
/**
 * Maximum number of snapshots that this repository can hold.
 */
private final int maxSnapshotCount;
/**
* Constructs new BlobStoreRepository
* @param metadata The metadata for this repository including name and settings
* @param clusterService ClusterService
*/
protected BlobStoreRepository(
final RepositoryMetadata metadata,
final NamedXContentRegistry namedXContentRegistry,
final ClusterService clusterService,
final BigArrays bigArrays,
final RecoverySettings recoverySettings,
final BlobPath basePath
) {
this.metadata = metadata;
this.threadPool = clusterService.getClusterApplierService().threadPool();
this.clusterService = clusterService;
this.bigArrays = bigArrays;
this.recoverySettings = recoverySettings;
this.compress = COMPRESS_SETTING.get(metadata.settings());
this.supportURLRepo = SUPPORT_URL_REPO.get(metadata.settings());
snapshotRateLimiter = getRateLimiter(metadata.settings(), MAX_SNAPSHOT_BYTES_PER_SEC);
restoreRateLimiter = getRateLimiter(metadata.settings(), MAX_RESTORE_BYTES_PER_SEC);
readOnly = metadata.settings().getAsBoolean(READONLY_SETTING_KEY, false);
cacheRepositoryData = CACHE_REPOSITORY_DATA.get(metadata.settings());
bufferSize = Math.toIntExact(BUFFER_SIZE_SETTING.get(metadata.settings()).getBytes());
this.namedXContentRegistry = namedXContentRegistry;
this.basePath = basePath;
this.maxSnapshotCount = MAX_SNAPSHOTS_SETTING.get(metadata.settings());
this.repoDataDeduplicator = new ResultDeduplicator<>(threadPool.getThreadContext());
}
/**
 * Records whether the repository starts with a pending generation that differs from its safe generation and validates the
 * configured chunk size.
 *
 * @throws IllegalArgumentException if {@link #chunkSize()} returns a value of zero bytes or less
 */
@Override
protected void doStart() {
    final boolean hasPendingGeneration = metadata.pendingGeneration() > RepositoryData.EMPTY_REPO_GEN;
    uncleanStart = hasPendingGeneration && metadata.generation() != metadata.pendingGeneration();

    final ByteSizeValue configuredChunkSize = chunkSize();
    if (configuredChunkSize == null) {
        // no chunking configured, nothing to validate
        return;
    }
    if (configuredChunkSize.getBytes() <= 0) {
        // note: a chunk size of zero is rejected as well
        throw new IllegalArgumentException("the chunk size cannot be negative: [" + configuredChunkSize + "]");
    }
}
/** Stopping requires no work; the blob store is released in {@link #doClose()}. */
@Override
protected void doStop() {
    // intentionally empty
}
/**
 * Closes the blob store if one has been created. Failures while closing are logged at warn level and otherwise ignored.
 */
@Override
protected void doClose() {
    final BlobStore store;
    // read under the lock so that a blob store whose initialization was started during close is seen and closed
    synchronized (lock) {
        store = blobStore.get();
    }
    if (store == null) {
        return;
    }
    try {
        store.close();
    } catch (Exception e) {
        logger.warn("cannot close blob store", e);
    }
}
// listeners to invoke when a restore completes and there are no more restores running
@Nullable
private List> emptyListeners;
// Set of shard ids that this repository is currently restoring
private final Set ongoingRestores = new HashSet<>();
/**
 * Blocks the calling thread until no restores are running against this repository. Returns immediately when
 * {@link #ongoingRestores} is already empty; otherwise registers a future in {@link #emptyListeners} and waits for it.
 * Must only be called once the repository is stopped or closed (asserted).
 */
@Override
public void awaitIdle() {
    assert lifecycle.stoppedOrClosed();
    final PlainActionFuture<Void> future;
    synchronized (ongoingRestores) {
        if (ongoingRestores.isEmpty()) {
            return;
        }
        future = new PlainActionFuture<>();
        if (emptyListeners == null) {
            emptyListeners = new ArrayList<>();
        }
        emptyListeners.add(future);
    }
    // wait outside the monitor so that whoever completes the listeners can acquire it
    FutureUtils.get(future);
}
/**
 * Loads the current {@link RepositoryData}, builds a cluster state update task from it and submits a wrapper task that only
 * delegates to the created task if the repository metadata in the cluster state is still equal to the metadata seen when this
 * method started. If the metadata changed in the meantime, the cluster state is left untouched and the whole operation is retried.
 *
 * @param createUpdateTask function creating the cluster state update task from the loaded repository data
 * @param source           source description of the submitted cluster state update
 * @param onFailure        invoked if loading the repository data fails or if the wrapper task fails before it delegated to the
 *                         created task
 */
@Override
public void executeConsistentStateUpdate(
    Function<RepositoryData, ClusterStateUpdateTask> createUpdateTask,
    String source,
    Consumer<Exception> onFailure
) {
    final RepositoryMetadata repositoryMetadataStart = metadata;
    getRepositoryData(ActionListener.wrap(repositoryData -> {
        final ClusterStateUpdateTask updateTask = createUpdateTask.apply(repositoryData);
        submitUnbatchedTask(source, new ClusterStateUpdateTask(updateTask.priority(), updateTask.timeout()) {

            // set once the wrapped task was actually run; decides where failures and completion are routed
            private boolean executedTask = false;

            @Override
            public ClusterState execute(ClusterState currentState) throws Exception {
                // Comparing the full metadata here on purpose instead of simply comparing the safe generation.
                // If the safe generation has changed, then we have to reload repository data and start over.
                // If the pending generation has changed we are in the midst of a write operation and might pick up the
                // updated repository data and state on the retry. We don't want to wait for the write to finish though
                // because it could fail for any number of reasons so we just retry instead of waiting on the cluster state
                // to change in any form.
                if (repositoryMetadataStart.equals(getRepoMetadata(currentState))) {
                    executedTask = true;
                    return updateTask.execute(currentState);
                }
                return currentState;
            }

            @Override
            public void onFailure(Exception e) {
                if (executedTask) {
                    updateTask.onFailure(e);
                } else {
                    onFailure.accept(e);
                }
            }

            @Override
            public void clusterStateProcessed(ClusterState oldState, ClusterState newState) {
                if (executedTask) {
                    updateTask.clusterStateProcessed(oldState, newState);
                } else {
                    // the repository metadata changed underneath us: start over with freshly loaded repository data
                    executeConsistentStateUpdate(createUpdateTask, source, onFailure);
                }
            }
        });
    }, onFailure));
}
/**
 * Hands the given task to the cluster service as an unbatched cluster state update.
 *
 * @param source source description of the update
 * @param task   the task to submit
 */
@SuppressForbidden(reason = "legacy usage of unbatched task") // TODO add support for batching here
private void submitUnbatchedTask(@SuppressWarnings("SameParameterValue") String source, ClusterStateUpdateTask task) {
    this.clusterService.submitUnbatchedStateUpdateTask(source, task);
}
/**
 * Clones the shard-level metadata of snapshot {@code source} into a new snapshot {@code target} for a single shard. The work runs
 * on the {@link ThreadPool.Names#SNAPSHOT} executor and the outcome is reported to {@code listener}.
 * <p>
 * If the target already exists in the shard metadata with the same files as the source, the existing generation is returned without
 * writing anything. If it exists with different files, or if the source is missing, a {@link RepositoryException} is reported.
 *
 * @param source          snapshot to clone from
 * @param target          snapshot to create
 * @param shardId         shard to clone
 * @param shardGeneration current shard generation, or {@code null} to determine it by listing the {@code index-} blobs of the shard
 * @param listener        receives the result of the clone or the failure
 */
@Override
public void cloneShardSnapshot(
    SnapshotId source,
    SnapshotId target,
    RepositoryShardId shardId,
    @Nullable ShardGeneration shardGeneration,
    ActionListener<ShardSnapshotResult> listener
) {
    if (isReadOnly()) {
        listener.onFailure(new RepositoryException(metadata.name(), "cannot clone shard snapshot on a readonly repository"));
        return;
    }
    final IndexId index = shardId.index();
    final int shardNum = shardId.shardId();
    final Executor executor = threadPool.executor(ThreadPool.Names.SNAPSHOT);
    executor.execute(ActionRunnable.supply(listener, () -> {
        final long startTime = threadPool.absoluteTimeInMillis();
        final BlobContainer shardContainer = shardContainer(index, shardNum);
        final BlobStoreIndexShardSnapshots existingSnapshots;
        final ShardGeneration newGen;
        final ShardGeneration existingShardGen;
        if (shardGeneration == null) {
            // no generation known: derive it from the index blobs found in the shard folder and move to the next numeric one
            Tuple<BlobStoreIndexShardSnapshots, Long> tuple = buildBlobStoreIndexShardSnapshots(
                shardContainer.listBlobsByPrefix(INDEX_FILE_PREFIX).keySet(),
                shardContainer
            );
            existingShardGen = new ShardGeneration(tuple.v2());
            newGen = new ShardGeneration(tuple.v2() + 1);
            existingSnapshots = tuple.v1();
        } else {
            newGen = ShardGeneration.newGeneration();
            existingSnapshots = buildBlobStoreIndexShardSnapshots(Collections.emptySet(), shardContainer, shardGeneration).v1();
            existingShardGen = shardGeneration;
        }
        // locate source and (possibly pre-existing) target in the shard-level snapshot listing
        SnapshotFiles existingTargetFiles = null;
        SnapshotFiles sourceFiles = null;
        for (SnapshotFiles existingSnapshot : existingSnapshots) {
            final String snapshotName = existingSnapshot.snapshot();
            if (snapshotName.equals(target.getName())) {
                existingTargetFiles = existingSnapshot;
            } else if (snapshotName.equals(source.getName())) {
                sourceFiles = existingSnapshot;
            }
            if (sourceFiles != null && existingTargetFiles != null) {
                break;
            }
        }
        if (sourceFiles == null) {
            throw new RepositoryException(
                metadata.name(),
                "Can't create clone of ["
                    + shardId
                    + "] for snapshot ["
                    + target
                    + "]. The source snapshot ["
                    + source
                    + "] was not found in the shard metadata."
            );
        }
        if (existingTargetFiles != null) {
            if (existingTargetFiles.isSame(sourceFiles)) {
                // the clone already exists with identical files: report the existing generation, nothing to write
                return new ShardSnapshotResult(
                    existingShardGen,
                    ByteSizeValue.ofBytes(existingTargetFiles.totalSize()),
                    getSegmentInfoFileCount(existingTargetFiles.indexFiles())
                );
            }
            throw new RepositoryException(
                metadata.name(),
                "Can't create clone of ["
                    + shardId
                    + "] for snapshot ["
                    + target
                    + "]. A snapshot by that name already exists for this shard."
            );
        }
        final BlobStoreIndexShardSnapshot sourceMeta = loadShardSnapshot(shardContainer, source);
        logger.trace("[{}] [{}] writing shard snapshot file for clone", shardId, target);
        // first the snapshot blob of the clone, then the updated shard-level listing under the new generation
        INDEX_SHARD_SNAPSHOT_FORMAT.write(
            sourceMeta.asClone(target.getName(), startTime, threadPool.absoluteTimeInMillis() - startTime),
            shardContainer,
            target.getUUID(),
            compress
        );
        INDEX_SHARD_SNAPSHOTS_FORMAT.write(
            existingSnapshots.withClone(source.getName(), target.getName()),
            shardContainer,
            newGen.toBlobNamePart(),
            compress
        );
        return new ShardSnapshotResult(
            newGen,
            ByteSizeValue.ofBytes(sourceMeta.totalSize()),
            getSegmentInfoFileCount(sourceMeta.indexFiles())
        );
    }));
}
/**
 * Counts the files whose physical name ends with {@code .si}.
 *
 * @param indexFiles files of a shard snapshot
 * @return the number of matching files, capped at {@link Integer#MAX_VALUE}
 */
private static int getSegmentInfoFileCount(List<BlobStoreIndexShardSnapshot.FileInfo> indexFiles) {
    // noinspection ConstantConditions
    return Math.toIntExact(Math.min(Integer.MAX_VALUE, indexFiles.stream().filter(fi -> fi.physicalName().endsWith(".si")).count()));
}
/**
 * Decides whether the given settings can be applied to this instance without creating a new repository. This is the case when,
 * after disregarding {@code ignoredSettings}, every setting whose value differs is one of the dynamic settings.
 *
 * @param updatedSettings the new repository settings
 * @param ignoredSettings names of settings to exclude from the comparison
 * @return {@code true} if only ignored or dynamic settings changed
 */
@Override
public boolean canUpdateInPlace(Settings updatedSettings, Set<String> ignoredSettings) {
    final Settings current = metadata.settings();
    if (current.equals(updatedSettings)) {
        return true;
    }
    // start from the union of all keys so that added and removed settings are detected as well
    final Set<String> changedSettingNames = new HashSet<>(current.keySet());
    changedSettingNames.addAll(updatedSettings.keySet());
    changedSettingNames.removeAll(ignoredSettings);
    changedSettingNames.removeIf(setting -> Objects.equals(current.get(setting), updatedSettings.get(setting)));
    changedSettingNames.removeAll(DYNAMIC_SETTING_NAMES);
    return changedSettingNames.isEmpty();
}
// Inspects all cluster state elements that contain a hint about what the current repository generation is and updates
// #latestKnownRepoGen if a newer than currently known generation is found
@Override
public void updateState(ClusterState state) {
final Settings previousSettings = metadata.settings();
// Pick up the repository metadata from the new cluster state before doing anything else.
metadata = getRepoMetadata(state);
final Settings updatedSettings = metadata.settings();
if (updatedSettings.equals(previousSettings) == false) {
// Settings changed: rebuild both rate limiters from the new settings.
snapshotRateLimiter = getRateLimiter(metadata.settings(), MAX_SNAPSHOT_BYTES_PER_SEC);
restoreRateLimiter = getRateLimiter(metadata.settings(), MAX_RESTORE_BYTES_PER_SEC);
}
// The unclean-start flag can only be cleared here, never set: it stays true only while safe and pending generation differ.
uncleanStart = uncleanStart && metadata.generation() != metadata.pendingGeneration();
// Remember the previous mode; it is only used in the assertion below.
final boolean wasBestEffortConsistency = bestEffortConsistency;
bestEffortConsistency = uncleanStart || isReadOnly() || metadata.generation() == RepositoryData.UNKNOWN_REPO_GEN;
if (isReadOnly()) {
// No need to waste cycles, no operations can run against a read-only repository
return;
}
if (bestEffortConsistency) {
// Look for a generation hint in in-progress snapshots first, then deletions, then cleanups; the first hit wins.
final SnapshotsInProgress snapshotsInProgress = state.custom(SnapshotsInProgress.TYPE, SnapshotsInProgress.EMPTY);
long bestGenerationFromCS = bestGeneration(snapshotsInProgress.forRepo(this.metadata.name()));
// Don't use generation from the delete task if we already found a generation for an in progress snapshot.
// In this case, the generation points at the generation the repo will be in after the snapshot finishes so it may not yet
// exist
if (bestGenerationFromCS == RepositoryData.EMPTY_REPO_GEN) {
bestGenerationFromCS = bestGeneration(
state.custom(SnapshotDeletionsInProgress.TYPE, SnapshotDeletionsInProgress.EMPTY).getEntries()
);
}
if (bestGenerationFromCS == RepositoryData.EMPTY_REPO_GEN) {
bestGenerationFromCS = bestGeneration(
state.custom(RepositoryCleanupInProgress.TYPE, RepositoryCleanupInProgress.EMPTY).entries()
);
}
// Only ever move the known generation forward in best-effort mode.
final long finalBestGen = Math.max(bestGenerationFromCS, metadata.generation());
latestKnownRepoGen.updateAndGet(known -> Math.max(known, finalBestGen));
} else {
// Consistent mode: the generation from the repository metadata is authoritative and overwrites the known one.
final long previousBest = latestKnownRepoGen.getAndSet(metadata.generation());
if (previousBest != metadata.generation()) {
// Moving backwards is only legal when leaving best-effort mode or when the repository was marked as corrupted.
assert wasBestEffortConsistency
|| metadata.generation() == RepositoryData.CORRUPTED_REPO_GEN
|| previousBest < metadata.generation()
: "Illegal move from repository generation [" + previousBest + "] to generation [" + metadata.generation() + "]";
logger.debug("Updated repository generation from [{}] to [{}]", previousBest, metadata.generation());
}
}
}
/**
 * Finds the highest repository state id among the given operations that target this repository.
 *
 * @param operations in-progress repository operations, possibly for several repositories
 * @return the highest state id found, or {@link RepositoryData#EMPTY_REPO_GEN} if no operation targets this repository
 */
private long bestGeneration(Collection<? extends RepositoryOperation> operations) {
    final String repoName = metadata.name();
    return operations.stream()
        .filter(e -> e.repository().equals(repoName))
        .mapToLong(RepositoryOperation::repositoryStateId)
        .max()
        .orElse(RepositoryData.EMPTY_REPO_GEN);
}
/**
 * @return the thread pool used by this repository
 */
public ThreadPool threadPool() {
    return this.threadPool;
}
/**
 * Package private, only use for testing. Unlike {@link #blobContainer()} this never creates the container.
 *
 * @return the current container, or whatever the holder yields if none has been set yet
 */
BlobContainer getBlobContainer() {
    return this.blobContainer.get();
}
/**
 * For test purposes only. Unlike {@link #blobStore()} this never creates the blob store.
 *
 * @return the current blob store, or whatever the holder yields if none has been set yet
 */
protected BlobStore getBlobStore() {
    return this.blobStore.get();
}
/**
 * Maintains a single, lazily created instance of {@link BlobContainer} for the repository base path.
 *
 * @return the shared container, created through {@link #blobStore()} on first use
 */
protected BlobContainer blobContainer() {
    assertSnapshotOrGenericThread();
    if (lifecycle.started() == false) {
        throw notStartedException();
    }
    // fast path: already created
    final BlobContainer existing = this.blobContainer.get();
    if (existing != null) {
        return existing;
    }
    synchronized (lock) {
        // re-check under the lock, another thread may have created it in the meantime
        BlobContainer container = this.blobContainer.get();
        if (container == null) {
            container = blobStore().blobContainer(basePath());
            this.blobContainer.set(container);
        }
        return container;
    }
}
/**
 * Maintains a single, lazily created instance of {@link BlobStore}.
 * Public for testing.
 *
 * @return the shared blob store, created through {@link #createBlobStore()} on first use
 * @throws RepositoryException if the blob store cannot be created
 */
public BlobStore blobStore() {
    assertSnapshotOrGenericThread();
    // fast path: already created
    final BlobStore existing = blobStore.get();
    if (existing != null) {
        return existing;
    }
    synchronized (lock) {
        // re-check under the lock, another thread may have created it in the meantime
        final BlobStore current = blobStore.get();
        if (current != null) {
            return current;
        }
        if (lifecycle.started() == false) {
            throw notStartedException();
        }
        final BlobStore created;
        try {
            created = createBlobStore();
        } catch (RepositoryException e) {
            // already carries the repository context, pass through unchanged
            throw e;
        } catch (Exception e) {
            throw new RepositoryException(metadata.name(), "cannot create blob store", e);
        }
        blobStore.set(created);
        return created;
    }
}
/**
* Creates new BlobStore to read and write data.
* Invoked by {@link #blobStore()} while holding the internal lock, the first time a blob store is requested on a started repository.
* Any exception other than a {@link RepositoryException} thrown from here is wrapped into a {@link RepositoryException}.
*
* @return the blob store to use for this repository
* @throws Exception if the blob store cannot be created
*/
protected abstract BlobStore createBlobStore() throws Exception;
/**
 * Returns the base path of the repository as passed to the constructor.
 * Public for testing.
 *
 * @return the repository base path
 */
public BlobPath basePath() {
    return this.basePath;
}
/**
 * Tells whether metadata and snapshot files should be written compressed.
 *
 * @return true if compression is needed
 */
protected final boolean isCompress() {
    return this.compress;
}
/**
 * Returns the data file chunk size.
 *
 * This default implementation returns {@code null}, meaning no chunking is needed.
 *
 * @return chunk size, or {@code null} if no chunking is needed
 */
protected ByteSizeValue chunkSize() {
    // no chunking by default
    return null;
}
/**
 * @return the repository metadata currently held by this instance
 */
@Override
public RepositoryMetadata getMetadata() {
    return this.metadata;
}
/**
 * Returns the statistics of the underlying blob store without forcing its creation.
 *
 * @return {@link RepositoryStats#EMPTY_STATS} if no blob store has been created yet, otherwise stats wrapping the store's stats
 */
@Override
public RepositoryStats stats() {
    final BlobStore store = blobStore.get();
    return store == null ? RepositoryStats.EMPTY_STATS : new RepositoryStats(store.stats());
}
/**
 * Deletes the given snapshots from the repository. Fails the listener right away on a read-only repository; otherwise the work is
 * executed on the {@link ThreadPool.Names#SNAPSHOT} executor.
 *
 * @param snapshotIds           snapshots to delete
 * @param repositoryStateId     repository generation the caller expects the repository to be at
 * @param repositoryMetaVersion version of the repository metadata to write
 * @param listener              completed once the delete finished; failures are wrapped into a {@link RepositoryException}
 */
@Override
public void deleteSnapshots(
    Collection<SnapshotId> snapshotIds,
    long repositoryStateId,
    Version repositoryMetaVersion,
    ActionListener<RepositoryData> listener
) {
    if (isReadOnly()) {
        listener.onFailure(new RepositoryException(metadata.name(), "cannot delete snapshot from a readonly repository"));
    } else {
        threadPool.executor(ThreadPool.Names.SNAPSHOT).execute(new AbstractRunnable() {
            @Override
            protected void doRun() throws Exception {
                final Map<String, BlobMetadata> rootBlobs = blobContainer().listBlobs();
                final RepositoryData repositoryData = safeRepositoryData(repositoryStateId, rootBlobs);
                // Cache the indices that were found before writing out the new index-N blob so that a stuck master will never
                // delete an index that was created by another master node after writing this index-N blob.
                final Map<String, BlobContainer> foundIndices = blobStore().blobContainer(indicesPath()).children();
                doDeleteShardSnapshots(
                    snapshotIds,
                    repositoryStateId,
                    foundIndices,
                    rootBlobs,
                    repositoryData,
                    repositoryMetaVersion,
                    listener
                );
            }

            @Override
            public void onFailure(Exception e) {
                listener.onFailure(new RepositoryException(metadata.name(), "failed to delete snapshots " + snapshotIds, e));
            }
        });
    }
}
/**
 * Loads {@link RepositoryData} ensuring that it is consistent with the given {@code rootBlobs} as well as the assumed generation.
 *
 * @param repositoryStateId Expected repository generation
 * @param rootBlobs         Blobs at the repository root
 * @return RepositoryData at generation {@code repositoryStateId}, taken from the cache when it matches that generation
 * @throws RepositoryException if the generation to load differs from {@code repositoryStateId}
 */
private RepositoryData safeRepositoryData(long repositoryStateId, Map<String, BlobMetadata> rootBlobs) {
    final long generation = latestGeneration(rootBlobs.keySet());
    final long genToLoad;
    final RepositoryData cached;
    if (bestEffortConsistency) {
        // the known generation cannot be trusted on its own: advance it to the expected one and never use the cache
        genToLoad = latestKnownRepoGen.updateAndGet(known -> Math.max(known, repositoryStateId));
        cached = null;
    } else {
        genToLoad = latestKnownRepoGen.get();
        cached = latestKnownRepositoryData.get();
    }
    if (genToLoad > generation) {
        // It's always a possibility to not see the latest index-N in the listing here on an eventually consistent blob store, just
        // debug log it. Any blobs leaked as a result of an inconsistent listing here will be cleaned up in a subsequent cleanup or
        // snapshot delete run anyway.
        logger.debug(
            "Determined repository's generation from its contents to [{}] but current generation is at least [{}]",
            generation,
            genToLoad
        );
    }
    if (genToLoad != repositoryStateId) {
        throw new RepositoryException(
            metadata.name(),
            "concurrent modification of the index-N file, expected current generation ["
                + repositoryStateId
                + "], actual current generation ["
                + genToLoad
                + "]"
        );
    }
    if (cached != null && cached.getGenId() == genToLoad) {
        return cached;
    }
    return getRepositoryData(genToLoad);
}
/**
* Removes the given snapshots from the repository metadata and then deletes the blobs that are no longer referenced.
* <p>
* If {@code repoMetaVersion} supports shard generations, the updated shard-level metadata is written first, then the new
* {@link RepositoryData} and finally the clean-ups run. Otherwise the new {@link RepositoryData} is written first and afterwards
* each of the shards directories is individually moved to the next shard generation and then has all now unreferenced blobs in
* it deleted.
*
* @param snapshotIds SnapshotIds to delete
* @param repositoryStateId Expected repository state id
* @param foundIndices All indices folders found in the repository before executing any writes to the repository during this
* delete operation
* @param rootBlobs All blobs found at the root of the repository before executing any writes to the repository during this
* delete operation
* @param repositoryData RepositoryData found in the repository before executing this delete
* @param repoMetaVersion Version of the repository metadata to write; decides whether shard generations are used
* @param listener Listener to invoke once finished
*/
private void doDeleteShardSnapshots(
Collection snapshotIds,
long repositoryStateId,
Map foundIndices,
Map rootBlobs,
RepositoryData repositoryData,
Version repoMetaVersion,
ActionListener listener
) {
if (SnapshotsService.useShardGenerations(repoMetaVersion)) {
// First write the new shard state metadata (with the removed snapshot) and compute deletion targets
final StepListener> writeShardMetaDataAndComputeDeletesStep = new StepListener<>();
writeUpdatedShardMetaDataAndComputeDeletes(snapshotIds, repositoryData, true, writeShardMetaDataAndComputeDeletesStep);
// Once we have put the new shard-level metadata into place, we can update the repository metadata as follows:
// 1. Remove the snapshots from the list of existing snapshots
// 2. Update the index shard generations of all updated shard folders
//
// Note: If we fail updating any of the individual shard paths, none of them are changed since the newly created
// index-${gen_uuid} will not be referenced by the existing RepositoryData and new RepositoryData is only
// written if all shard paths have been successfully updated.
final StepListener writeUpdatedRepoDataStep = new StepListener<>();
writeShardMetaDataAndComputeDeletesStep.whenComplete(deleteResults -> {
// Collect the new generation of every shard whose metadata was rewritten
final ShardGenerations.Builder builder = ShardGenerations.builder();
for (ShardSnapshotMetaDeleteResult newGen : deleteResults) {
builder.put(newGen.indexId, newGen.shardId, newGen.newGeneration);
}
final RepositoryData updatedRepoData = repositoryData.removeSnapshots(snapshotIds, builder.build());
// A failed index-N write is reported straight to the caller's listener; the clean-up step below never runs in that case
writeIndexGen(
updatedRepoData,
repositoryStateId,
repoMetaVersion,
Function.identity(),
ActionListener.wrap(writeUpdatedRepoDataStep::onResponse, listener::onFailure)
);
}, listener::onFailure);
// Once we have updated the repository, run the clean-ups
writeUpdatedRepoDataStep.whenComplete(updatedRepoData -> {
// Run unreferenced blobs cleanup in parallel to shard-level snapshot deletion
// The grouped listener expects two completions: root/indices clean-up and shard-level blob clean-up
final ActionListener afterCleanupsListener = new GroupedActionListener<>(
ActionListener.wrap(() -> listener.onResponse(updatedRepoData)),
2
);
cleanupUnlinkedRootAndIndicesBlobs(snapshotIds, foundIndices, rootBlobs, updatedRepoData, afterCleanupsListener);
// The shard metadata step has necessarily completed by now since this step is only triggered from its completion handler
asyncCleanupUnlinkedShardLevelBlobs(
repositoryData,
snapshotIds,
writeShardMetaDataAndComputeDeletesStep.result(),
afterCleanupsListener
);
}, listener::onFailure);
} else {
// Write the new repository data first (with the removed snapshot), using no shard generations
final RepositoryData updatedRepoData = repositoryData.removeSnapshots(snapshotIds, ShardGenerations.EMPTY);
writeIndexGen(updatedRepoData, repositoryStateId, repoMetaVersion, Function.identity(), ActionListener.wrap(newRepoData -> {
// Run unreferenced blobs cleanup in parallel to shard-level snapshot deletion
// The grouped listener expects two completions: root/indices clean-up and shard-level blob clean-up
final ActionListener afterCleanupsListener = new GroupedActionListener<>(
ActionListener.wrap(() -> listener.onResponse(newRepoData)),
2
);
cleanupUnlinkedRootAndIndicesBlobs(snapshotIds, foundIndices, rootBlobs, newRepoData, afterCleanupsListener);
// Only now rewrite the shard-level metadata (numeric generations) and delete what it no longer references
final StepListener> writeMetaAndComputeDeletesStep = new StepListener<>();
writeUpdatedShardMetaDataAndComputeDeletes(snapshotIds, repositoryData, false, writeMetaAndComputeDeletesStep);
writeMetaAndComputeDeletesStep.whenComplete(
deleteResults -> asyncCleanupUnlinkedShardLevelBlobs(repositoryData, snapshotIds, deleteResults, afterCleanupsListener),
afterCleanupsListener::onFailure
);
}, listener::onFailure));
}
}
/**
 * Cleans up stale root level blobs and unreferenced index folders through {@link #cleanupStaleBlobs}, completing the given
 * listener with {@code null} instead of the delete result computed by that method.
 *
 * @param deletedSnapshots snapshots that were just deleted
 * @param foundIndices     index folders found in the repository before the delete wrote anything
 * @param rootBlobs        root level blobs found in the repository before the delete wrote anything
 * @param updatedRepoData  repository data written by the delete
 * @param listener         listener to complete once the cleanup has finished
 */
private void cleanupUnlinkedRootAndIndicesBlobs(
    Collection deletedSnapshots,
    Map foundIndices,
    Map rootBlobs,
    RepositoryData updatedRepoData,
    ActionListener listener
) {
    // the delete statistics are dropped here: the listener is always completed with null
    cleanupStaleBlobs(deletedSnapshots, foundIndices, rootBlobs, updatedRepoData, listener.map(deleteResult -> null));
}
/**
 * Deletes all shard level blobs and index metadata blobs that {@link #resolveFilesToDelete} reports as unreferenced. The
 * deletion runs on the {@code SNAPSHOT} thread pool; if there is nothing to delete the listener is completed right away on the
 * calling thread.
 *
 * @param oldRepositoryData repository data from before the delete
 * @param snapshotIds       snapshots that are being deleted
 * @param deleteResults     per-shard results of the shard metadata update
 * @param listener          listener completed with {@code null} on success or failed with the delete exception
 */
private void asyncCleanupUnlinkedShardLevelBlobs(
    RepositoryData oldRepositoryData,
    Collection snapshotIds,
    Collection deleteResults,
    ActionListener listener
) {
    final Iterator staleBlobPaths = resolveFilesToDelete(oldRepositoryData, snapshotIds, deleteResults);
    if (staleBlobPaths.hasNext()) {
        threadPool.executor(ThreadPool.Names.SNAPSHOT).execute(ActionRunnable.wrap(listener, delegate -> {
            try {
                deleteFromContainer(blobContainer(), staleBlobPaths);
                delegate.onResponse(null);
            } catch (Exception e) {
                // log here for context, then rethrow so that the wrapping runnable can fail the listener
                logger.warn(() -> format("%s Failed to delete some blobs during snapshot delete", snapshotIds), e);
                throw e;
            }
        }));
    } else {
        listener.onResponse(null);
    }
}
/**
* Updates the shard state metadata for shards of a snapshot that is to be deleted. Also computes the files to be cleaned up.
* <p>
* For every index returned by {@code indicesToUpdateAfterRemovingSnapshot} the number of shards is determined by reading the
* index metadata blobs of the snapshots being deleted. Every shard of that index is then processed by its own task on the
* {@code SNAPSHOT} executor.
*
* @param snapshotIds snapshots that are being deleted
* @param oldRepositoryData repository data from before the delete
* @param useUUIDs if {@code true} the current shard generation is looked up in {@code oldRepositoryData} and {@code -1} is
* passed on as the new numeric generation; if {@code false} the new generation is the one returned by
* {@code buildBlobStoreIndexShardSnapshots} plus one
* @param onAllShardsCompleted listener completed with the flattened per-shard results of all indices
*/
private void writeUpdatedShardMetaDataAndComputeDeletes(
Collection snapshotIds,
RepositoryData oldRepositoryData,
boolean useUUIDs,
ActionListener> onAllShardsCompleted
) {
final Executor executor = threadPool.executor(ThreadPool.Names.SNAPSHOT);
final List indices = oldRepositoryData.indicesToUpdateAfterRemovingSnapshot(snapshotIds);
if (indices.isEmpty()) {
// No shard metadata needs to change, complete right away with an empty result
onAllShardsCompleted.onResponse(Collections.emptyList());
return;
}
// Listener that flattens out the delete results for each index
final ActionListener> deleteIndexMetadataListener = new GroupedActionListener<>(
onAllShardsCompleted.map(res -> res.stream().flatMap(Collection::stream).toList()),
indices.size()
);
for (IndexId indexId : indices) {
final Set snapshotsWithIndex = Set.copyOf(oldRepositoryData.getSnapshots(indexId));
// Snapshots that still reference this index once the delete has gone through
final Set survivingSnapshots = snapshotsWithIndex.stream()
.filter(id -> snapshotIds.contains(id) == false)
.collect(Collectors.toSet());
final StepListener> shardCountListener = new StepListener<>();
// Distinct index metadata blobs of the to-be-deleted snapshots that contain this index
final Collection indexMetaGenerations = snapshotIds.stream()
.filter(snapshotsWithIndex::contains)
.map(id -> oldRepositoryData.indexMetaDataGenerations().indexMetaBlobId(id, indexId))
.collect(Collectors.toSet());
final ActionListener allShardCountsListener = new GroupedActionListener<>(
shardCountListener,
indexMetaGenerations.size()
);
final BlobContainer indexContainer = indexContainer(indexId);
// Read the shard count from every one of these metadata blobs in parallel
for (String indexMetaGeneration : indexMetaGenerations) {
executor.execute(ActionRunnable.supply(allShardCountsListener, () -> {
try {
return INDEX_METADATA_FORMAT.read(metadata.name(), indexContainer, indexMetaGeneration, namedXContentRegistry)
.getNumberOfShards();
} catch (Exception ex) {
logger.warn(
() -> format("[%s] [%s] failed to read metadata for index", indexMetaGeneration, indexId.getName()),
ex
);
// Just invoke the listener without any shard generations to count it down, this index will be cleaned up
// by the stale data cleanup in the end.
// TODO: Getting here means repository corruption. We should find a way of dealing with this instead of just
// ignoring it and letting the cleanup deal with it.
return null;
}
}));
}
shardCountListener.whenComplete(counts -> {
// Use the largest shard count found in any of the metadata blobs, 0 if none could be read
// NOTE(review): assumes the grouped listener drops the null results produced above before handing over `counts`,
// otherwise the unboxing in mapToInt would throw - confirm against GroupedActionListener
final int shardCount = counts.stream().mapToInt(i -> i).max().orElse(0);
if (shardCount == 0) {
// Nothing to do for this index, only count it down
deleteIndexMetadataListener.onResponse(null);
return;
}
// Listener for collecting the results of removing the snapshot from each shard's metadata in the current index
final ActionListener allShardsListener = new GroupedActionListener<>(
deleteIndexMetadataListener,
shardCount
);
for (int shardId = 0; shardId < shardCount; shardId++) {
final int finalShardId = shardId;
executor.execute(new AbstractRunnable() {
@Override
protected void doRun() throws Exception {
final BlobContainer shardContainer = shardContainer(indexId, finalShardId);
final Set blobs = shardContainer.listBlobs().keySet();
final BlobStoreIndexShardSnapshots blobStoreIndexShardSnapshots;
final long newGen;
if (useUUIDs) {
// Shard generations are tracked in the repository data: no numeric generation is computed here
newGen = -1L;
blobStoreIndexShardSnapshots = buildBlobStoreIndexShardSnapshots(
blobs,
shardContainer,
oldRepositoryData.shardGenerations().getShardGen(indexId, finalShardId)
).v1();
} else {
// Numeric generations: the new generation is the one found in the shard folder plus one
Tuple tuple = buildBlobStoreIndexShardSnapshots(blobs, shardContainer);
newGen = tuple.v2() + 1;
blobStoreIndexShardSnapshots = tuple.v1();
}
allShardsListener.onResponse(
deleteFromShardSnapshotMeta(
survivingSnapshots,
indexId,
finalShardId,
snapshotIds,
shardContainer,
blobs,
blobStoreIndexShardSnapshots,
newGen
)
);
}
@Override
public void onFailure(Exception ex) {
logger.warn(
() -> format(
"%s failed to delete shard data for shard [%s][%s]",
snapshotIds,
indexId.getName(),
finalShardId
),
ex
);
// Just passing null here to count down the listener instead of failing it, the stale data left behind
// here will be retried in the next delete or repository cleanup
allShardsListener.onResponse(null);
}
});
}
}, deleteIndexMetadataListener::onFailure);
}
}
private Iterator resolveFilesToDelete(
RepositoryData oldRepositoryData,
Collection snapshotIds,
Collection deleteResults
) {
final String basePath = basePath().buildAsString();
final int basePathLen = basePath.length();
final Map> indexMetaGenerations = oldRepositoryData.indexMetaDataToRemoveAfterRemovingSnapshots(
snapshotIds
);
return Stream.concat(deleteResults.stream().flatMap(shardResult -> {
final String shardPath = shardContainer(shardResult.indexId, shardResult.shardId).path().buildAsString();
return shardResult.blobsToDelete.stream().map(blob -> shardPath + blob);
}), indexMetaGenerations.entrySet().stream().flatMap(entry -> {
final String indexContainerPath = indexContainer(entry.getKey()).path().buildAsString();
return entry.getValue().stream().map(id -> indexContainerPath + INDEX_METADATA_FORMAT.blobName(id));
})).map(absolutePath -> {
assert absolutePath.startsWith(basePath);
return absolutePath.substring(basePathLen);
}).iterator();
}
/**
 * Removes stale blobs directly under the repository root as well as all index folders that are not referenced by the given
 * repository data. This method is only to be called directly after a new {@link RepositoryData} was written to the repository,
 * with {@code foundIndices} and {@code rootBlobs} reflecting what was found in the repository beforehand.
 *
 * @param deletedSnapshots if this method is called as part of a delete operation, the snapshot ids just deleted or empty if called as
 *                         part of a repository cleanup
 * @param foundIndices     all indices blob containers found in the repository before {@code newRepoData} was written
 * @param rootBlobs        all blobs found directly under the repository root
 * @param newRepoData      new repository data that was just written
 * @param listener         listener to invoke with the combined {@link DeleteResult} of all blobs removed in this operation
 */
private void cleanupStaleBlobs(
    Collection deletedSnapshots,
    Map foundIndices,
    Map rootBlobs,
    RepositoryData newRepoData,
    ActionListener listener
) {
    // two partial results are expected: one for the root level blobs and one for the index folders
    final GroupedActionListener groupedListener = new GroupedActionListener<>(ActionListener.wrap(partialResults -> {
        DeleteResult combined = DeleteResult.ZERO;
        for (DeleteResult partial : partialResults) {
            combined = combined.add(partial);
        }
        listener.onResponse(combined);
    }, listener::onFailure), 2);
    final Executor snapshotExecutor = threadPool.executor(ThreadPool.Names.SNAPSHOT);

    final List rootBlobsToDelete = staleRootBlobs(newRepoData, rootBlobs.keySet());
    if (rootBlobsToDelete.isEmpty() == false) {
        snapshotExecutor.execute(ActionRunnable.supply(groupedListener, () -> {
            List deletedBlobs = cleanupStaleRootFiles(newRepoData.getGenId() - 1, deletedSnapshots, rootBlobsToDelete);
            return new DeleteResult(deletedBlobs.size(), deletedBlobs.stream().mapToLong(name -> rootBlobs.get(name).length()).sum());
        }));
    } else {
        groupedListener.onResponse(DeleteResult.ZERO);
    }

    final Set survivingIndexIds = newRepoData.getIndices().values().stream().map(IndexId::getId).collect(Collectors.toSet());
    if (foundIndices.keySet().equals(survivingIndexIds) == false) {
        snapshotExecutor.execute(ActionRunnable.supply(groupedListener, () -> cleanupStaleIndices(foundIndices, survivingIndexIds)));
    } else {
        // every folder that was found is still referenced
        groupedListener.onResponse(DeleteResult.ZERO);
    }
}
/**
 * Runs cleanup actions on the repository. Increments the repository state id by one before executing any modifications on the
 * repository. If neither stale index folders nor stale root level blobs are found, nothing is written and the listener is
 * completed with an empty result.
 *
 * - Deleting stale indices {@link #cleanupStaleIndices}
 * - Deleting unreferenced root level blobs {@link #cleanupStaleRootFiles}
 *
 * TODO: Add shard level cleanups
 * TODO: Add unreferenced index metadata cleanup
 *
 * @param repositoryStateId     Current repository state id
 * @param repositoryMetaVersion version of the updated repository metadata to write
 * @param listener              Listener to complete when done
 */
public void cleanup(long repositoryStateId, Version repositoryMetaVersion, ActionListener listener) {
    try {
        if (isReadOnly()) {
            throw new RepositoryException(metadata.name(), "cannot run cleanup on readonly repository");
        }
        final Map rootBlobs = blobContainer().listBlobs();
        final RepositoryData repositoryData = safeRepositoryData(repositoryStateId, rootBlobs);
        final Map foundIndices = blobStore().blobContainer(indicesPath()).children();
        final Set survivingIndexIds = repositoryData.getIndices().values().stream().map(IndexId::getId).collect(Collectors.toSet());
        final List staleRootBlobNames = staleRootBlobs(repositoryData, rootBlobs.keySet());
        final boolean nothingToCleanUp = survivingIndexIds.equals(foundIndices.keySet()) && staleRootBlobNames.isEmpty();
        if (nothingToCleanUp) {
            listener.onResponse(new RepositoryCleanupResult(DeleteResult.ZERO));
            return;
        }
        // write new index-N blob to ensure concurrent operations will fail
        writeIndexGen(
            repositoryData,
            repositoryStateId,
            repositoryMetaVersion,
            Function.identity(),
            ActionListener.wrap(
                writtenRepoData -> cleanupStaleBlobs(
                    Collections.emptyList(),
                    foundIndices,
                    rootBlobs,
                    repositoryData,
                    listener.map(RepositoryCleanupResult::new)
                ),
                listener::onFailure
            )
        );
    } catch (Exception e) {
        listener.onFailure(e);
    }
}
/**
 * Finds all blobs directly under the repository root path that are not referenced by the given {@link RepositoryData}.
 * <p>
 * A root blob is considered stale if it is
 * <ul>
 *   <li>a temporary blob,</li>
 *   <li>a snapshot or global metadata {@code .dat} blob whose UUID does not belong to any snapshot in {@code repositoryData}, or</li>
 *   <li>an {@code index-N} blob with {@code N} lower than the generation of {@code repositoryData}.</li>
 * </ul>
 * Blobs that merely start with the index file prefix but do not end in a number are never reported as stale.
 *
 * @param repositoryData repository data to check the blobs against
 * @param rootBlobNames  names of all blobs found directly under the repository root
 * @return names of the stale blobs
 */
private static List<String> staleRootBlobs(RepositoryData repositoryData, Set<String> rootBlobNames) {
    final Set<String> allSnapshotIds = repositoryData.getSnapshotIds().stream().map(SnapshotId::getUUID).collect(Collectors.toSet());
    return rootBlobNames.stream().filter(blob -> {
        if (FsBlobContainer.isTempBlobName(blob)) {
            return true;
        }
        if (blob.endsWith(".dat")) {
            final String foundUUID;
            if (blob.startsWith(SNAPSHOT_PREFIX)) {
                foundUUID = blob.substring(SNAPSHOT_PREFIX.length(), blob.length() - ".dat".length());
                assert SNAPSHOT_FORMAT.blobName(foundUUID).equals(blob);
            } else if (blob.startsWith(METADATA_PREFIX)) {
                foundUUID = blob.substring(METADATA_PREFIX.length(), blob.length() - ".dat".length());
                assert GLOBAL_METADATA_FORMAT.blobName(foundUUID).equals(blob);
            } else {
                return false;
            }
            return allSnapshotIds.contains(foundUUID) == false;
        } else if (blob.startsWith(INDEX_FILE_PREFIX)) {
            // TODO: Include the current generation here once we remove keeping index-(N-1) around from #writeIndexGen
            try {
                return repositoryData.getGenId() > Long.parseLong(blob.substring(INDEX_FILE_PREFIX.length()));
            } catch (NumberFormatException nfe) {
                // A blob that only shares the index file prefix but carries no generation number is not ours to judge: leave it
                // alone instead of failing the whole delete or cleanup operation
                return false;
            }
        }
        return false;
    }).toList();
}
/**
 * Deletes the given root level blobs. Failures are logged and swallowed.
 *
 * @param previousGeneration generation of the index-N blob that preceded the one just written
 * @param deletedSnapshots   snapshots deleted by the current operation, empty when running as part of a repository cleanup
 * @param blobsToDelete      names of the root level blobs to delete
 * @return {@code blobsToDelete} if it was empty or if the deletion went through, an empty list if the deletion failed
 */
private List cleanupStaleRootFiles(
    long previousGeneration,
    Collection deletedSnapshots,
    List blobsToDelete
) {
    if (blobsToDelete.isEmpty()) {
        return blobsToDelete;
    }
    try {
        if (logger.isInfoEnabled()) {
            // If we're running root level cleanup as part of a snapshot delete we should not log the snapshot- and global metadata
            // blobs associated with the just deleted snapshots as they are expected to exist and not stale. Otherwise every snapshot
            // delete would also log a confusing INFO message about "stale blobs".
            final Set expectedBlobNames = deletedSnapshots.stream().flatMap(snapshotId -> {
                final String uuid = snapshotId.getUUID();
                return Stream.of(
                    GLOBAL_METADATA_FORMAT.blobName(uuid),
                    SNAPSHOT_FORMAT.blobName(uuid),
                    INDEX_FILE_PREFIX + previousGeneration
                );
            }).collect(Collectors.toSet());
            final List unexpectedBlobs = blobsToDelete.stream().filter(name -> expectedBlobNames.contains(name) == false).toList();
            if (unexpectedBlobs.isEmpty() == false) {
                logger.info("[{}] Found stale root level blobs {}. Cleaning them up", metadata.name(), unexpectedBlobs);
            }
        }
        deleteFromContainer(blobContainer(), blobsToDelete.iterator());
        return blobsToDelete;
    } catch (Exception e) {
        logger.warn(
            () -> format(
                "[%s] The following blobs are no longer part of any snapshot [%s] but failed to remove them",
                metadata.name(),
                blobsToDelete
            ),
            e
        );
        // report nothing as deleted when the bulk delete failed
        return Collections.emptyList();
    }
}
/**
 * Deletes every found index folder whose id is not contained in {@code survivingIndexIds}. A failure to delete one folder is
 * logged and does not stop the remaining folders from being processed.
 *
 * @param foundIndices      index folders found in the repository, keyed by index id
 * @param survivingIndexIds ids of the indices that are still referenced
 * @return the accumulated result of all successful folder deletions
 */
private DeleteResult cleanupStaleIndices(Map foundIndices, Set survivingIndexIds) {
    DeleteResult accumulated = DeleteResult.ZERO;
    for (Map.Entry indexEntry : foundIndices.entrySet()) {
        final String indexSnId = indexEntry.getKey();
        try {
            if (survivingIndexIds.contains(indexSnId)) {
                // still referenced, keep it
                continue;
            }
            logger.debug("[{}] Found stale index [{}]. Cleaning it up", metadata.name(), indexSnId);
            accumulated = accumulated.add(indexEntry.getValue().delete());
            logger.debug("[{}] Cleaned up stale index [{}]", metadata.name(), indexSnId);
        } catch (Exception e) {
            logger.warn(
                () -> format(
                    "[%s] index %s is no longer part of any snapshot in the repository, " + "but failed to clean up its index folder",
                    metadata.name(),
                    indexSnId
                ),
                e
            );
        }
    }
    return accumulated;
}
/**
* Finalizes a snapshot: writes the global metadata, the metadata of every index in the snapshot and the snapshot info blob on
* the {@code SNAPSHOT} pool and, once all of them have been written, writes the updated {@link RepositoryData} that includes
* the new snapshot. Success and failure are both reported through the given {@code finalizeSnapshotContext}.
*
* @param finalizeSnapshotContext context carrying the snapshot to finalize and acting as the completion listener
*/
@Override
public void finalizeSnapshot(final FinalizeSnapshotContext finalizeSnapshotContext) {
final long repositoryStateId = finalizeSnapshotContext.repositoryStateId();
final ShardGenerations shardGenerations = finalizeSnapshotContext.updatedShardGenerations();
final SnapshotInfo snapshotInfo = finalizeSnapshotContext.snapshotInfo();
assert repositoryStateId > RepositoryData.UNKNOWN_REPO_GEN
: "Must finalize based on a valid repository generation but received [" + repositoryStateId + "]";
final Collection indices = shardGenerations.indices();
final SnapshotId snapshotId = snapshotInfo.snapshotId();
// Once we are done writing the updated index-N blob we remove the now unreferenced index-${uuid} blobs in each shard
// directory if all nodes are at least at version SnapshotsService#SHARD_GEN_IN_REPO_DATA_VERSION
// If there are older version nodes in the cluster, we don't need to run this cleanup as it will have already happened
// when writing the index-${N} to each shard directory.
final Version repositoryMetaVersion = finalizeSnapshotContext.repositoryMetaVersion();
final boolean writeShardGens = SnapshotsService.useShardGenerations(repositoryMetaVersion);
// Every failure on the way is wrapped into a SnapshotException for this snapshot before it reaches the context
final Consumer onUpdateFailure = e -> finalizeSnapshotContext.onFailure(
new SnapshotException(metadata.name(), snapshotId, "failed to update snapshot in repository", e)
);
final Executor executor = threadPool.executor(ThreadPool.Names.SNAPSHOT);
final boolean writeIndexGens = SnapshotsService.useIndexGenerations(repositoryMetaVersion);
final StepListener repoDataListener = new StepListener<>();
getRepositoryData(repoDataListener);
repoDataListener.whenComplete(existingRepositoryData -> {
// Refuse to add the snapshot if the repository already holds the configured maximum number of snapshots
final int existingSnapshotCount = existingRepositoryData.getSnapshotIds().size();
if (existingSnapshotCount >= maxSnapshotCount) {
finalizeSnapshotContext.onFailure(
new RepositoryException(
metadata.name(),
"Cannot add another snapshot to this repository as it "
+ "already contains ["
+ existingSnapshotCount
+ "] snapshots and is configured to hold up to ["
+ maxSnapshotCount
+ "] snapshots only."
)
);
return;
}
// Filled concurrently by the index metadata tasks below when index generations are in use, null otherwise
final Map indexMetas;
final Map indexMetaIdentifiers;
if (writeIndexGens) {
indexMetaIdentifiers = ConcurrentCollections.newConcurrentMap();
indexMetas = ConcurrentCollections.newConcurrentMap();
} else {
indexMetas = null;
indexMetaIdentifiers = null;
}
// Runs once all metadata blobs are written: 2 + indices.size() completions are expected, i.e. global metadata, snapshot
// info and one per index
final ActionListener allMetaListener = new GroupedActionListener<>(ActionListener.wrap(v -> {
final String slmPolicy = slmPolicy(snapshotInfo);
final SnapshotDetails snapshotDetails = new SnapshotDetails(
snapshotInfo.state(),
Version.CURRENT,
snapshotInfo.startTime(),
snapshotInfo.endTime(),
slmPolicy
);
writeIndexGen(
existingRepositoryData.addSnapshot(snapshotId, snapshotDetails, shardGenerations, indexMetas, indexMetaIdentifiers),
repositoryStateId,
repositoryMetaVersion,
finalizeSnapshotContext::updatedClusterState,
ActionListener.wrap(newRepoData -> {
// The old shard generation blobs are cleaned up before the context is notified of the result
if (writeShardGens) {
cleanupOldShardGens(existingRepositoryData, newRepoData, finalizeSnapshotContext);
}
finalizeSnapshotContext.onResponse(Tuple.tuple(newRepoData, snapshotInfo));
}, onUpdateFailure)
);
}, onUpdateFailure), 2 + indices.size());
// We ignore all FileAlreadyExistsException when writing metadata since otherwise a master failover while in this method will
// mean that no snap-${uuid}.dat blob is ever written for this snapshot. This is safe because any updated version of the
// index or global metadata will be compatible with the segments written in this snapshot as well.
// Failing on an already existing index-${repoGeneration} below ensures that the index.latest blob is not updated in a way
// that decrements the generation it points at
final Metadata clusterMetadata = finalizeSnapshotContext.clusterMetadata();
// Write Global MetaData
executor.execute(
ActionRunnable.run(
allMetaListener,
() -> GLOBAL_METADATA_FORMAT.write(clusterMetadata, blobContainer(), snapshotId.getUUID(), compress)
)
);
// write the index metadata for each index in the snapshot
for (IndexId index : indices) {
executor.execute(ActionRunnable.run(allMetaListener, () -> {
final IndexMetadata indexMetaData = clusterMetadata.index(index.getName());
if (writeIndexGens) {
// Reuse an already existing metadata blob if the repository knows one for the same identifier
final String identifiers = IndexMetaDataGenerations.buildUniqueIdentifier(indexMetaData);
String metaUUID = existingRepositoryData.indexMetaDataGenerations().getIndexMetaBlobId(identifiers);
if (metaUUID == null) {
// We don't yet have this version of the metadata so we write it
metaUUID = UUIDs.base64UUID();
INDEX_METADATA_FORMAT.write(indexMetaData, indexContainer(index), metaUUID, compress);
indexMetaIdentifiers.put(identifiers, metaUUID);
}
indexMetas.put(index, identifiers);
} else {
// Without index generations the metadata blob is named after the snapshot UUID
INDEX_METADATA_FORMAT.write(
clusterMetadata.index(index.getName()),
indexContainer(index),
snapshotId.getUUID(),
compress
);
}
}));
}
// Write the snapshot info blob
executor.execute(
ActionRunnable.run(
allMetaListener,
() -> SNAPSHOT_FORMAT.write(snapshotInfo, blobContainer(), snapshotId.getUUID(), compress)
)
);
}, onUpdateFailure);
}
// Delete all old shard gen blobs that aren't referenced any longer as a result from moving to updated repository data
private void cleanupOldShardGens(
RepositoryData existingRepositoryData,
RepositoryData updatedRepositoryData,
FinalizeSnapshotContext finalizeSnapshotContext
) {
final Set toDelete = new HashSet<>();
final int prefixPathLen = basePath().buildAsString().length();
updatedRepositoryData.shardGenerations()
.obsoleteShardGenerations(existingRepositoryData.shardGenerations())
.forEach(
(indexId, gens) -> gens.forEach(
(shardId, oldGen) -> toDelete.add(
shardContainer(indexId, shardId).path().buildAsString().substring(prefixPathLen) + INDEX_FILE_PREFIX + oldGen
)
)
);
for (Map.Entry> obsoleteEntry : finalizeSnapshotContext.obsoleteShardGenerations()
.entrySet()) {
final String containerPath = shardContainer(obsoleteEntry.getKey().index(), obsoleteEntry.getKey().shardId()).path()
.buildAsString()
.substring(prefixPathLen) + INDEX_FILE_PREFIX;
for (ShardGeneration shardGeneration : obsoleteEntry.getValue()) {
toDelete.add(containerPath + shardGeneration);
}
}
try {
deleteFromContainer(blobContainer(), toDelete.iterator());
} catch (Exception e) {
logger.warn("Failed to clean up old shard generation blobs", e);
}
}
/**
 * Loads the {@link SnapshotInfo} of all snapshots requested by the given context. The snapshot ids are placed in a queue that
 * is drained by at most as many concurrent workers as the {@code SNAPSHOT_META} pool has threads.
 *
 * @param context context describing which snapshots to load and receiving the results
 */
@Override
public void getSnapshotInfo(GetSnapshotInfoContext context) {
    // put snapshot info downloads into a task queue instead of pushing them all into the queue to not completely monopolize the
    // snapshot meta pool for a single request
    final int workers = Math.min(threadPool.info(ThreadPool.Names.SNAPSHOT_META).getMax(), context.snapshotIds().size());
    final BlockingQueue queue = new LinkedBlockingQueue<>(context.snapshotIds());
    int workersToStart = workers;
    while (workersToStart > 0) {
        getOneSnapshotInfo(queue, context);
        workersToStart--;
    }
}
/**
 * Tries to poll a {@link SnapshotId} to load {@link SnapshotInfo} for from the given {@code queue}. If the queue is empty
 * nothing happens. Otherwise the snapshot info is read on the {@code SNAPSHOT_META} pool, the outcome is passed to the
 * {@code context} and the next snapshot id is polled from the queue.
 *
 * @param queue   remaining snapshot ids to load, shared between all workers of one request
 * @param context context receiving the loaded {@link SnapshotInfo} or the failure
 */
private void getOneSnapshotInfo(BlockingQueue<SnapshotId> queue, GetSnapshotInfoContext context) {
    final SnapshotId snapshotId = queue.poll();
    if (snapshotId == null) {
        return;
    }
    threadPool.executor(ThreadPool.Names.SNAPSHOT_META).execute(() -> {
        if (context.done()) {
            return;
        }
        if (context.isCancelled()) {
            // drop all outstanding work so that the other workers stop as well
            queue.clear();
            context.onFailure(new TaskCancelledException("task cancelled"));
            return;
        }
        Exception failure = null;
        SnapshotInfo snapshotInfo = null;
        try {
            snapshotInfo = SNAPSHOT_FORMAT.read(metadata.name(), blobContainer(), snapshotId.getUUID(), namedXContentRegistry);
        } catch (NoSuchFileException ex) {
            failure = new SnapshotMissingException(metadata.name(), snapshotId, ex);
        } catch (IOException | NotXContentException ex) {
            // separate the message from the snapshot id, they were previously glued together
            failure = new SnapshotException(metadata.name(), snapshotId, "failed to get snapshot info " + snapshotId, ex);
        } catch (Exception e) {
            failure = e instanceof SnapshotException
                ? e
                : new SnapshotException(metadata.name(), snapshotId, "Snapshot could not be read", e);
        }
        if (failure != null) {
            if (context.abortOnFailure()) {
                queue.clear();
            }
            context.onFailure(failure);
        } else {
            assert snapshotInfo != null;
            context.onResponse(snapshotInfo);
        }
        // continue with the next snapshot id, if any is left
        getOneSnapshotInfo(queue, context);
    });
}
/**
 * Reads the global cluster metadata that was stored with the given snapshot.
 *
 * @param snapshotId snapshot to read the global metadata of
 * @return the global metadata of the snapshot
 * @throws SnapshotMissingException if the metadata blob does not exist
 * @throws SnapshotException        if reading the metadata blob fails with an {@link IOException}
 */
@Override
public Metadata getSnapshotGlobalMetadata(final SnapshotId snapshotId) {
    final Metadata globalMetadata;
    try {
        globalMetadata = GLOBAL_METADATA_FORMAT.read(metadata.name(), blobContainer(), snapshotId.getUUID(), namedXContentRegistry);
    } catch (NoSuchFileException missingBlob) {
        throw new SnapshotMissingException(metadata.name(), snapshotId, missingBlob);
    } catch (IOException readFailure) {
        throw new SnapshotException(metadata.name(), snapshotId, "failed to read global metadata", readFailure);
    }
    return globalMetadata;
}
/**
 * Reads the metadata of the given index as it was stored with the given snapshot. The name of the metadata blob is looked up
 * in the index metadata generations of {@code repositoryData}.
 *
 * @param repositoryData repository data used to resolve the metadata blob
 * @param snapshotId     snapshot to read the index metadata from
 * @param index          index to read the metadata of
 * @return the index metadata
 * @throws SnapshotMissingException if the metadata blob does not exist
 * @throws IOException              if reading the metadata blob fails otherwise
 */
@Override
public IndexMetadata getSnapshotIndexMetaData(RepositoryData repositoryData, SnapshotId snapshotId, IndexId index) throws IOException {
    final IndexMetadata indexMetadata;
    try {
        indexMetadata = INDEX_METADATA_FORMAT.read(
            metadata.name(),
            indexContainer(index),
            repositoryData.indexMetaDataGenerations().indexMetaBlobId(snapshotId, index),
            namedXContentRegistry
        );
    } catch (NoSuchFileException missingBlob) {
        throw new SnapshotMissingException(metadata.name(), snapshotId, missingBlob);
    }
    return indexMetadata;
}
/**
 * Deletes the given blobs from the container, ignoring blobs that do not exist. With trace logging enabled every blob name is
 * logged at the moment it is pulled from the iterator.
 *
 * @param container container to delete from
 * @param blobs     names of the blobs to delete
 * @throws IOException if the container fails to delete the blobs
 */
private void deleteFromContainer(BlobContainer container, Iterator blobs) throws IOException {
    if (logger.isTraceEnabled() == false) {
        container.deleteBlobsIgnoringIfNotExists(blobs);
        return;
    }
    // decorate the iterator so that the names get logged lazily while the container consumes them
    container.deleteBlobsIgnoringIfNotExists(new Iterator<String>() {
        @Override
        public boolean hasNext() {
            return blobs.hasNext();
        }

        @Override
        public String next() {
            final String blobName = blobs.next();
            logger.trace("[{}] Deleting [{}] from [{}]", metadata.name(), blobName, container.path());
            return blobName;
        }
    });
}
/**
 * @return the path of the {@code indices} folder below the repository base path
 */
private BlobPath indicesPath() {
    final BlobPath repositoryBasePath = basePath();
    return repositoryBasePath.add("indices");
}
/**
 * @param indexId index to resolve the container for
 * @return the blob container of the folder named after the id of the given index, below {@link #indicesPath()}
 */
private BlobContainer indexContainer(IndexId indexId) {
    final String indexFolderName = indexId.getId();
    return blobStore().blobContainer(indicesPath().add(indexFolderName));
}
/**
 * Convenience overload of {@link #shardContainer(IndexId, int)} that takes the shard number from the given {@link ShardId}.
 */
private BlobContainer shardContainer(IndexId indexId, ShardId shardId) {
    final int shardNumber = shardId.getId();
    return shardContainer(indexId, shardNumber);
}
/**
 * @param indexId index the shard belongs to
 * @param shardId number of the shard
 * @return the blob container of the folder named after the shard number, inside the folder of the given index
 */
public BlobContainer shardContainer(IndexId indexId, int shardId) {
    final String shardFolderName = String.valueOf(shardId);
    return blobStore().blobContainer(indicesPath().add(indexId.getId()).add(shardFolderName));
}
/**
 * Configures RateLimiter based on repository and global settings
 *
 * @param repositorySettings repository settings
 * @param setting            setting to use to configure rate limiter
 * @return rate limiter or null if no throttling is needed, i.e. if the configured rate is not positive
 */
private static RateLimiter getRateLimiter(Settings repositorySettings, Setting setting) {
    ByteSizeValue maxSnapshotBytesPerSec = setting.get(repositorySettings);
    if (maxSnapshotBytesPerSec.getBytes() > 0) {
        return new RateLimiter.SimpleRateLimiter(maxSnapshotBytesPerSec.getMbFrac());
    }
    return null;
}
/**
 * @return the current value of the counter that tracks snapshot rate limiting time in nanoseconds
 */
@Override
public long getSnapshotThrottleTimeInNanos() {
    final long throttledNanos = snapshotRateLimitingTimeInNanos.count();
    return throttledNanos;
}
/**
 * @return the current value of the counter that tracks restore rate limiting time in nanoseconds
 */
@Override
public long getRestoreThrottleTimeInNanos() {
    final long throttledNanos = restoreRateLimitingTimeInNanos.count();
    return throttledNanos;
}
/**
 * Asserts that the name of the current thread contains the bracketed name of the {@code SNAPSHOT}, {@code SNAPSHOT_META} or
 * {@code GENERIC} thread pool. Does nothing when assertions are disabled.
 */
protected void assertSnapshotOrGenericThread() {
    assert Stream.of(ThreadPool.Names.SNAPSHOT, ThreadPool.Names.SNAPSHOT_META, ThreadPool.Names.GENERIC)
        .anyMatch(poolName -> Thread.currentThread().getName().contains('[' + poolName + ']'))
        : "Expected current thread [" + Thread.currentThread() + "] to be the snapshot or generic thread.";
}
/**
 * Starts the verification of this repository. A writable repository gets a {@code master.dat} blob written into a test
 * container whose name is derived from a random seed. A read-only repository is only checked by reading the latest index
 * blob id.
 *
 * @return the random seed used for the test container, or {@code "read-only"} for a read-only repository
 * @throws RepositoryVerificationException if any of the above fails
 */
@Override
public String startVerification() {
    try {
        if (isReadOnly() == false) {
            final String seed = UUIDs.randomBase64UUID();
            final byte[] seedBytes = Strings.toUTF8Bytes(seed);
            final BlobContainer testContainer = blobStore().blobContainer(basePath().add(testBlobPrefix(seed)));
            testContainer.writeBlobAtomic("master.dat", new BytesArray(seedBytes), true);
            return seed;
        }
        // It's readonly - so there is not much we can do here to verify it apart from reading the blob store metadata
        latestIndexBlobId();
        return "read-only";
    } catch (Exception exp) {
        throw new RepositoryVerificationException(metadata.name(), "path " + basePath() + " is not accessible on master node", exp);
    }
}
/**
 * Ends the verification started with the given seed by deleting the test container that belongs to it. Nothing is done for a
 * read-only repository.
 *
 * @param seed the seed that identifies the test container
 * @throws RepositoryVerificationException if the test container cannot be deleted
 */
@Override
public void endVerification(String seed) {
    if (isReadOnly()) {
        return;
    }
    try {
        final String testPrefix = testBlobPrefix(seed);
        blobStore().blobContainer(basePath().add(testPrefix)).delete();
    } catch (Exception exp) {
        throw new RepositoryVerificationException(metadata.name(), "cannot delete test data at " + basePath(), exp);
    }
}
// Tracks the latest known repository generation in a best-effort way to detect inconsistent listing of root level index-N blobs
// and concurrent modifications. Starts out as RepositoryData.UNKNOWN_REPO_GEN; #getRepositoryData fails right away while this
// holds RepositoryData.CORRUPTED_REPO_GEN.
private final AtomicLong latestKnownRepoGen = new AtomicLong(RepositoryData.UNKNOWN_REPO_GEN);
// Best effort cache of the latest known repository data, initially RepositoryData.EMPTY. It is only consulted when
// bestEffortConsistency is false and only used if its generation equals the generation that is to be loaded.
private final AtomicReference latestKnownRepositoryData = new AtomicReference<>(RepositoryData.EMPTY);
/**
 * Resolves the current {@link RepositoryData} and passes it to the given listener. The listener is failed right away if the
 * repository is not started or is marked as corrupted. If the cache is usable and matches the latest known generation it is
 * answered from the cache, otherwise the repository data is loaded on the {@code SNAPSHOT_META} pool.
 *
 * @param listener listener to resolve with the repository data
 */
@Override
public void getRepositoryData(ActionListener listener) {
    // RepositoryData is the responsibility of the elected master: we shouldn't be loading it on other nodes as we don't have good
    // consistency guarantees there, but electedness is too ephemeral to assert. We can say for sure that this node should be
    // master-eligible, which is almost as strong since all other snapshot-related activity happens on data nodes whether they be
    // master-eligible or not.
    assert clusterService.localNode().isMasterNode() : "should only load repository data on master nodes";

    if (lifecycle.started() == false) {
        listener.onFailure(notStartedException());
        return;
    }
    if (latestKnownRepoGen.get() == RepositoryData.CORRUPTED_REPO_GEN) {
        listener.onFailure(corruptedStateException(null, null));
        return;
    }

    // Fast path loading repository data directly from cache if we're in fully consistent mode and the cache matches up with
    // the latest known repository generation
    final RepositoryData cachedData = latestKnownRepositoryData.get();
    final boolean cacheIsCurrent = bestEffortConsistency == false && cachedData.getGenId() == latestKnownRepoGen.get();
    if (cacheIsCurrent) {
        listener.onResponse(cachedData);
        return;
    }

    final boolean firstLoadOfWritableRepo = metadata.generation() == RepositoryData.UNKNOWN_REPO_GEN && isReadOnly() == false;
    if (firstLoadOfWritableRepo) {
        logger.debug(
            "[{}] loading repository metadata for the first time, trying to determine correct generation and to store "
                + "it in the cluster state",
            metadata.name()
        );
        initializeRepoGenerationTracking(listener);
        return;
    }

    logger.trace(
        "[{}] loading un-cached repository data with best known repository generation [{}]",
        metadata.name(),
        latestKnownRepoGen
    );
    final Executor executor = threadPool.executor(ThreadPool.Names.SNAPSHOT_META);
    // Don't deduplicate repo data loading if we don't have strong consistency guarantees between the repo and the cluster state
    // Also, if we are not caching repository data (for tests) we assume that the contents of the repository data at a given
    // generation may change
    final boolean deduplicateLoading = bestEffortConsistency == false && cacheRepositoryData;
    if (deduplicateLoading) {
        repoDataDeduplicator.executeOnce(
            metadata,
            listener,
            (repositoryMetadata, delegate) -> executor.execute(ActionRunnable.wrap(delegate, this::doGetRepositoryData))
        );
    } else {
        executor.execute(ActionRunnable.wrap(listener, this::doGetRepositoryData));
    }
}
/**
* Creates the exception that is reported to callers using this repository while its lifecycle is not in the started state.
*
* @return a new {@link RepositoryException} carrying this repository's name
*/
private RepositoryException notStartedException() {
return new RepositoryException(metadata.name(), "repository is not in started state");
}
// Listener used to ensure that repository data is only initialized once in the cluster state by #initializeRepoGenerationTracking.
// It is null while no initialization is in flight and is read and written while holding this instance's monitor.
private ListenableActionFuture repoDataInitialized;
/**
* Method used to set the current repository generation in the cluster state's {@link RepositoryMetadata} to the latest generation that
* can be physically found in the repository before passing the latest {@link RepositoryData} to the given listener.
* This ensures that operations using {@link #executeConsistentStateUpdate} right after mounting a fresh repository will have a
* consistent view of the {@link RepositoryData} before any data has been written to the repository.
* <p>
* Only one initialization is in flight at a time: the first caller creates {@link #repoDataInitialized} and starts the work, later
* callers merely add their listener to it. The field is reset to {@code null} once the initialization has succeeded or failed.
*
* @param listener listener to resolve with new repository data
*/
private void initializeRepoGenerationTracking(ActionListener listener) {
synchronized (this) {
if (repoDataInitialized == null) {
// double check the generation since we checked it outside the mutex in the caller and it could have changed by a
// concurrent initialization of the repo metadata and just load repository normally in case we already finished the
// initialization
if (metadata.generation() != RepositoryData.UNKNOWN_REPO_GEN) {
getRepositoryData(listener);
return;
}
logger.trace("[{}] initializing repository generation in cluster state", metadata.name());
repoDataInitialized = new ListenableActionFuture<>();
// the caller's listener is completed together with the shared future
repoDataInitialized.addListener(listener);
// Shared failure path: log, clear the shared future under the mutex so that a later call can start a new initialization,
// then fail every listener that was waiting on it.
final Consumer onFailure = e -> {
logger.warn(
new ParameterizedMessage(
"[{}] Exception when initializing repository generation in cluster state",
metadata.name()
),
e
);
final ActionListener existingListener;
synchronized (BlobStoreRepository.this) {
existingListener = repoDataInitialized;
repoDataInitialized = null;
}
existingListener.onFailure(e);
};
// Load the repository data on the generic pool via #doGetRepositoryData and, once loaded, publish its generation as both
// the safe and the pending generation through a cluster state update.
threadPool.generic()
.execute(
ActionRunnable.wrap(
ActionListener.wrap(
repoData -> submitUnbatchedTask(
"set initial safe repository generation [" + metadata.name() + "][" + repoData.getGenId() + "]",
new ClusterStateUpdateTask() {
@Override
public ClusterState execute(ClusterState currentState) {
RepositoryMetadata metadata = getRepoMetadata(currentState);
// No update to the repository generation should have occurred concurrently in general except for
// extreme corner cases like failing over to an older version master node and back to the current
// node concurrently
if (metadata.generation() != RepositoryData.UNKNOWN_REPO_GEN) {
throw new RepositoryException(
metadata.name(),
"Found unexpected initialized repo metadata [" + metadata + "]"
);
}
return ClusterState.builder(currentState)
.metadata(
Metadata.builder(currentState.getMetadata())
.putCustom(
RepositoriesMetadata.TYPE,
currentState.metadata()
.custom(RepositoriesMetadata.TYPE)
.withUpdatedGeneration(
metadata.name(),
repoData.getGenId(),
repoData.getGenId()
)
)
)
.build();
}
@Override
public void onFailure(Exception e) {
onFailure.accept(e);
}
@Override
public void clusterStateProcessed(ClusterState oldState, ClusterState newState) {
logger.trace(
"[{}] initialized repository generation in cluster state to [{}]",
metadata.name(),
repoData.getGenId()
);
// Resolve listeners on generic pool since some callbacks for repository data do additional IO
threadPool.generic().execute(() -> {
final ActionListener existingListener;
synchronized (BlobStoreRepository.this) {
existingListener = repoDataInitialized;
repoDataInitialized = null;
}
existingListener.onResponse(repoData);
logger.trace(
"[{}] called listeners after initializing repository to generation [{}]",
metadata.name(),
repoData.getGenId()
);
});
}
}
),
onFailure
),
this::doGetRepositoryData
)
);
} else {
logger.trace(
"[{}] waiting for existing initialization of repository metadata generation in cluster state",
metadata.name()
);
repoDataInitialized.addListener(listener);
}
}
}
/**
* {@link RepositoryData} loading deduplicator. This may only be used with consistent generation repositories, meaning
* {@link #bestEffortConsistency} must be {@code false}, in which case we can assume that the {@link RepositoryData} loaded is
* unique for a given value of {@link #metadata} at any point in time.
* {@link #getRepositoryData(ActionListener)} passes the current {@link #metadata} as the key when it goes through this deduplicator.
*/
private final ResultDeduplicator repoDataDeduplicator;
/**
* Loads the {@link RepositoryData} for the current repository generation and passes it to the given listener.
* In best-effort consistency mode the generation to load is the larger of {@link #latestKnownRepoGen} and the highest generation
* found by {@link #latestIndexBlobId()}; otherwise only {@link #latestKnownRepoGen} is consulted.
* If loading fails with a {@link RepositoryException} while the tracked generation has moved on concurrently, the load is retried,
* but not when the generation that failed is the same one that failed on the previous attempt. In consistent mode, a failure caused
* by a {@link NoSuchFileException} makes this method mark the repository as corrupted via {@link #markRepoCorrupted}.
*
* @param listener listener to resolve with the loaded repository data or with the failure
*/
private void doGetRepositoryData(ActionListener listener) {
// Retry loading RepositoryData in a loop in case we run into concurrent modifications of the repository.
// Keep track of the most recent generation we failed to load so we can break out of the loop if we fail to load the same
// generation repeatedly.
long lastFailedGeneration = RepositoryData.UNKNOWN_REPO_GEN;
while (true) {
final long genToLoad;
if (bestEffortConsistency) {
// We're only using #latestKnownRepoGen as a hint in this mode and listing repo contents as a secondary way of trying
// to find a higher generation
final long generation;
try {
generation = latestIndexBlobId();
} catch (Exception e) {
listener.onFailure(
new RepositoryException(metadata.name(), "Could not determine repository generation from root blobs", e)
);
return;
}
// never move the tracked generation backwards: keep the larger of the tracked and the listed generation
genToLoad = latestKnownRepoGen.updateAndGet(known -> Math.max(known, generation));
if (genToLoad > generation) {
logger.info(
"Determined repository generation [{}] from repository contents but correct generation must be at " + "least [{}]",
generation,
genToLoad
);
}
} else {
// We only rely on the generation tracked in #latestKnownRepoGen which is exclusively updated from the cluster state
genToLoad = latestKnownRepoGen.get();
}
try {
final RepositoryData cached = latestKnownRepositoryData.get();
// Caching is not used with #bestEffortConsistency see docs on #cacheRepositoryData for details
if (bestEffortConsistency == false && cached.getGenId() == genToLoad) {
listener.onResponse(cached);
} else {
final RepositoryData loaded = getRepositoryData(genToLoad);
// NOTE(review): cached was already dereferenced a few lines above in consistent mode, so the null check below only
// matters in best-effort mode - confirm whether it is still needed at all
if (cached == null || cached.getGenId() < genToLoad) {
// We can cache in the most recent version here without regard to the actual repository metadata version since
// we're only caching the information that we just wrote and thus won't accidentally cache any information that
// isn't safe
cacheRepositoryData(loaded, Version.CURRENT);
}
if (loaded.getUuid().equals(metadata.uuid())) {
listener.onResponse(loaded);
} else {
// someone switched the repo contents out from under us
RepositoriesService.updateRepositoryUuidInMetadata(
clusterService,
metadata.name(),
loaded,
new ThreadedActionListener<>(logger, threadPool, ThreadPool.Names.GENERIC, listener.map(v -> loaded), false)
);
}
}
return;
} catch (RepositoryException e) {
// If the generation to load changed concurrently and we didn't just try loading the same generation before we retry
if (genToLoad != latestKnownRepoGen.get() && genToLoad != lastFailedGeneration) {
lastFailedGeneration = genToLoad;
logger.warn(
"Failed to load repository data generation ["
+ genToLoad
+ "] because a concurrent operation moved the current generation to ["
+ latestKnownRepoGen.get()
+ "]",
e
);
continue;
}
if (bestEffortConsistency == false && ExceptionsHelper.unwrap(e, NoSuchFileException.class) != null) {
// We did not find the expected index-N even though the cluster state continues to point at the missing value
// of N so we mark this repository as corrupted.
Tuple previousWriterInformation = null;
try {
previousWriterInformation = readLastWriterInfo();
} catch (Exception ex) {
// the writer info only enriches the error message, so a failure to read it is merely attached as suppressed
e.addSuppressed(ex);
}
final Tuple finalLastInfo = previousWriterInformation;
markRepoCorrupted(
genToLoad,
e,
ActionListener.wrap(v -> listener.onFailure(corruptedStateException(e, finalLastInfo)), listener::onFailure)
);
} else {
listener.onFailure(e);
}
return;
} catch (Exception e) {
listener.onFailure(new RepositoryException(metadata.name(), "Unexpected exception when loading repository data", e));
return;
}
}
}
/**
 * Updates the in-memory {@link RepositoryData} cache with the given data. Does nothing when repository data caching is disabled.
 * A cached entry with a higher generation than the given data is left in place.
 *
 * @param repositoryData repository data to cache
 * @param version        repository metadata version used when writing the data to the repository
 */
private void cacheRepositoryData(RepositoryData repositoryData, Version version) {
    if (cacheRepositoryData) {
        final RepositoryData candidate;
        if (SnapshotsService.useShardGenerations(version) == false) {
            // shard generations may be unreliable at this version, so they are dropped before caching
            candidate = repositoryData.withoutShardGenerations();
            assert repositoryData.indexMetaDataGenerations().equals(IndexMetaDataGenerations.EMPTY)
                : "repository data should not contain index generations at version ["
                    + version
                    + "] but saw ["
                    + repositoryData.indexMetaDataGenerations()
                    + "]";
        } else {
            candidate = repositoryData;
        }
        assert candidate.getGenId() >= 0 : "No need to cache abstract generations but attempted to cache [" + candidate.getGenId() + "]";
        // keep whichever of the current and the new entry has the higher generation, preferring the new one on a tie
        latestKnownRepositoryData.updateAndGet(current -> current.getGenId() > candidate.getGenId() ? current : candidate);
    }
}
/**
 * Builds the exception reported when the repository contents do not match the state this cluster expects.
 *
 * @param cause              failure that revealed the mismatch, may be {@code null}
 * @param previousWriterInfo generation and cluster UUID of the last writer to append to the message, may be {@code null}
 * @return a new {@link RepositoryException} for this repository
 */
private RepositoryException corruptedStateException(@Nullable Exception cause, @Nullable Tuple previousWriterInfo) {
    final String repositoryName = metadata.name();
    final String message = "Could not read repository data because the contents of the repository do not match its "
        + "expected state. This is likely the result of either concurrently modifying the contents of the "
        + "repository by a process other than this cluster or an issue with the repository's underlying storage. "
        + "The repository has been disabled to prevent corrupting its contents. To re-enable it "
        + "and continue using it please remove the repository from the cluster and add it again to make "
        + "the cluster recover the known state of the repository from its physical contents."
        + previousWriterMessage(previousWriterInfo);
    return new RepositoryException(repositoryName, message, cause);
}
/**
 * Renders the sentence naming the last cluster that wrote to the repository, for appending to an error message.
 *
 * @param previousWriterInfo tuple whose first value is the generation and whose second value is the writer, may be {@code null}
 * @return the sentence with a leading space, or the empty string if no writer information is available
 */
private static String previousWriterMessage(@Nullable Tuple previousWriterInfo) {
    if (previousWriterInfo == null) {
        return "";
    }
    return " The last cluster to write to this repository was ["
        + previousWriterInfo.v2()
        + "] at generation ["
        + previousWriterInfo.v1()
        + "].";
}
/**
 * Flags this repository as corrupted in the cluster state. The update keeps the tracked
 * {@link RepositoryMetadata#pendingGeneration()} as it is and sets {@link RepositoryMetadata#generation()} to
 * {@link RepositoryData#CORRUPTED_REPO_GEN}. A repository in this state can not be used any longer and has to be removed and
 * recreated after the problem that lead to it being marked as corrupted has been fixed.
 *
 * @param corruptedGeneration generation that failed to load because the index file was not found but that should have loaded
 * @param originalException   exception that lead to the failing to load the {@code index-N} blob
 * @param listener            listener to invoke once done
 */
private void markRepoCorrupted(long corruptedGeneration, Exception originalException, ActionListener listener) {
    assert corruptedGeneration != RepositoryData.UNKNOWN_REPO_GEN;
    assert bestEffortConsistency == false;
    final String taskSource = "mark repository corrupted [" + metadata.name() + "][" + corruptedGeneration + "]";
    submitUnbatchedTask(taskSource, new ClusterStateUpdateTask() {
        @Override
        public ClusterState execute(ClusterState currentState) {
            final RepositoriesMetadata repositories = currentState.metadata().custom(RepositoriesMetadata.TYPE);
            final RepositoryMetadata trackedRepo = repositories.repository(metadata.name());
            // only mark the repository as corrupted if the cluster state still points at the generation that failed to load
            if (trackedRepo.generation() != corruptedGeneration) {
                throw new IllegalStateException(
                    "Tried to mark repo generation ["
                        + corruptedGeneration
                        + "] as corrupted but its state concurrently changed to ["
                        + trackedRepo
                        + "]"
                );
            }
            final RepositoriesMetadata markedCorrupted = repositories.withUpdatedGeneration(
                metadata.name(),
                RepositoryData.CORRUPTED_REPO_GEN,
                trackedRepo.pendingGeneration()
            );
            final Metadata updatedMetadata = Metadata.builder(currentState.metadata())
                .putCustom(RepositoriesMetadata.TYPE, markedCorrupted)
                .build();
            return ClusterState.builder(currentState).metadata(updatedMetadata).build();
        }

        @Override
        public void onFailure(Exception e) {
            final RepositoryException wrapped = new RepositoryException(
                metadata.name(),
                "Failed marking repository state as corrupted",
                ExceptionsHelper.useOrSuppress(e, originalException)
            );
            listener.onFailure(wrapped);
        }

        @Override
        public void clusterStateProcessed(ClusterState oldState, ClusterState newState) {
            listener.onResponse(null);
        }
    });
}
/**
 * Reads and parses the {@code index-N} blob for the given generation.
 *
 * @param indexGen repository generation to load
 * @return {@link RepositoryData#EMPTY} for {@link RepositoryData#EMPTY_REPO_GEN}, otherwise the repository data parsed from the blob
 * @throws RepositoryException if reading or parsing the blob fails with an {@link IOException}
 */
private RepositoryData getRepositoryData(long indexGen) {
    if (indexGen == RepositoryData.EMPTY_REPO_GEN) {
        return RepositoryData.EMPTY;
    }
    final String indexBlobName = INDEX_FILE_PREFIX + Long.toString(indexGen);
    // the parser is created without any named-object registry
    try (
        InputStream blob = blobContainer().readBlob(indexBlobName);
        XContentParser parser = XContentType.JSON.xContent()
            .createParser(NamedXContentRegistry.EMPTY, LoggingDeprecationHandler.INSTANCE, blob)
    ) {
        return RepositoryData.snapshotsFromXContent(parser, indexGen, true);
    } catch (IOException ioe) {
        // In best-effort mode a failure to load the generation tracked in latestKnownRepoGen resets the tracker. This is a
        // fail-safe for the case of a user manually deleting the repository contents, after which subsequent operations must
        // start from the EMPTY_REPO_GEN again.
        final boolean trackerReset = bestEffortConsistency
            && latestKnownRepoGen.compareAndSet(indexGen, RepositoryData.EMPTY_REPO_GEN);
        if (trackerReset) {
            logger.warn("Resetting repository generation tracker because we failed to read generation [" + indexGen + "]", ioe);
        }
        throw new RepositoryException(metadata.name(), "could not read repository data from index blob", ioe);
    }
}
/**
* Builds the name prefix of the test blobs for the given seed by appending the seed to {@code TESTS_FILE}.
*
* @param seed seed to append
* @return {@code TESTS_FILE} followed by the seed
*/
private static String testBlobPrefix(String seed) {
return TESTS_FILE + seed;
}
/**
* @return the value of this repository's {@code readOnly} flag
*/
@Override
public boolean isReadOnly() {
return readOnly;
}
/**
* Writing a new index generation is a three step process.
* First, the {@link RepositoryMetadata} entry for this repository is set into a pending state by incrementing its
* pending generation {@code P} while its safe generation {@code N} remains unchanged.
* Second, the updated {@link RepositoryData} is written to generation {@code P + 1}.
* Lastly, the {@link RepositoryMetadata} entry for this repository is updated to the new generation {@code P + 1} and thus
* pending and safe generation are set to the same value marking the end of the update of the repository data.
* <p>
* The listener is failed right away, without touching the cluster state, if the generation of the given repository data does not
* equal {@code expectedGen}.
*
* @param repositoryData RepositoryData to write
* @param expectedGen expected repository generation at the start of the operation
* @param version version of the repository metadata to write
* @param stateFilter filter for the last cluster state update executed by this method
* @param listener completion listener
*/
protected void writeIndexGen(
RepositoryData repositoryData,
long expectedGen,
Version version,
Function stateFilter,
ActionListener listener
) {
logger.trace("[{}] writing repository data on top of expected generation [{}]", metadata.name(), expectedGen);
assert isReadOnly() == false; // can not write to a read only repository
final long currentGen = repositoryData.getGenId();
if (currentGen != expectedGen) {
// the index file was updated by a concurrent operation, so we were operating on stale
// repository data
listener.onFailure(
new RepositoryException(
metadata.name(),
"concurrent modification of the index-N file, expected current generation ["
+ expectedGen
+ "], actual current generation ["
+ currentGen
+ "]"
)
);
return;
}
// Step 1: Set repository generation state to the next possible pending generation
final StepListener setPendingStep = new StepListener<>();
final String setPendingGenerationSource = "set pending repository generation [" + metadata.name() + "][" + expectedGen + "]";
submitUnbatchedTask(setPendingGenerationSource, new ClusterStateUpdateTask() {
// generation chosen in #execute and handed to the next step once the cluster state update has been processed
private long newGen;
@Override
public ClusterState execute(ClusterState currentState) {
final RepositoryMetadata meta = getRepoMetadata(currentState);
final String repoName = metadata.name();
final long genInState = meta.generation();
// the metadata is treated as uninitialized when its safe generation is unknown or when running in best-effort mode
final boolean uninitializedMeta = meta.generation() == RepositoryData.UNKNOWN_REPO_GEN || bestEffortConsistency;
if (uninitializedMeta == false && meta.pendingGeneration() != genInState) {
logger.info(
"Trying to write new repository data over unfinished write, repo [{}] is at "
+ "safe generation [{}] and pending generation [{}]",
meta.name(),
genInState,
meta.pendingGeneration()
);
}
assert expectedGen == RepositoryData.EMPTY_REPO_GEN || uninitializedMeta || expectedGen == meta.generation()
: "Expected non-empty generation [" + expectedGen + "] does not match generation tracked in [" + meta + "]";
// If we run into the empty repo generation for the expected gen, the repo is assumed to have been cleared of
// all contents by an external process so we reset the safe generation to the empty generation.
final long safeGeneration = expectedGen == RepositoryData.EMPTY_REPO_GEN
? RepositoryData.EMPTY_REPO_GEN
: (uninitializedMeta ? expectedGen : genInState);
// Regardless of whether or not the safe generation has been reset, the pending generation always increments so that
// even if a repository has been manually cleared of all contents we will never reuse the same repository generation.
// This is motivated by the consistency behavior the S3 based blob repository implementation has to support which does
// not offer any consistency guarantees when it comes to overwriting the same blob name with different content.
// NOTE(review): the pending generation is read from this repository's own metadata field here rather than from the
// `meta` instance taken from currentState above - confirm that this is intended
final long nextPendingGen = metadata.pendingGeneration() + 1;
newGen = uninitializedMeta ? Math.max(expectedGen + 1, nextPendingGen) : nextPendingGen;
assert newGen > latestKnownRepoGen.get()
: "Attempted new generation ["
+ newGen
+ "] must be larger than latest known generation ["
+ latestKnownRepoGen.get()
+ "]";
return ClusterState.builder(currentState)
.metadata(
Metadata.builder(currentState.getMetadata())
.putCustom(
RepositoriesMetadata.TYPE,
currentState.metadata()
.custom(RepositoriesMetadata.TYPE)
.withUpdatedGeneration(repoName, safeGeneration, newGen)
)
.build()
)
.build();
}
@Override
public void onFailure(Exception e) {
listener.onFailure(
new RepositoryException(
metadata.name(),
"Failed to execute cluster state update [" + setPendingGenerationSource + "]",
e
)
);
}
@Override
public void clusterStateProcessed(ClusterState oldState, ClusterState newState) {
logger.trace("[{}] successfully set pending repository generation to [{}]", metadata.name(), newGen);
setPendingStep.onResponse(newGen);
}
});
final StepListener filterRepositoryDataStep = new StepListener<>();
// Step 2: Write new index-N blob to repository and update index.latest
setPendingStep.whenComplete(newGen -> threadPool().executor(ThreadPool.Names.SNAPSHOT).execute(ActionRunnable.wrap(listener, l -> {
// BwC logic: Load snapshot version information if any snapshot is missing details in RepositoryData so that the new
// RepositoryData contains full details for every snapshot
final List snapshotIdsWithMissingDetails = repositoryData.getSnapshotIds()
.stream()
.filter(repositoryData::hasMissingDetails)
.toList();
if (snapshotIdsWithMissingDetails.isEmpty() == false) {
final Map extraDetailsMap = new ConcurrentHashMap<>();
getSnapshotInfo(new GetSnapshotInfoContext(snapshotIdsWithMissingDetails, false, () -> false, (context, snapshotInfo) -> {
final String slmPolicy = slmPolicy(snapshotInfo);
extraDetailsMap.put(
snapshotInfo.snapshotId(),
new SnapshotDetails(
snapshotInfo.state(),
snapshotInfo.version(),
snapshotInfo.startTime(),
snapshotInfo.endTime(),
slmPolicy
)
);
// the runAfter wrapper below continues with whatever details were collected, whether loading them succeeded or failed
}, ActionListener.runAfter(new ActionListener<>() {
@Override
public void onResponse(Void aVoid) {
logger.info(
"Successfully loaded all snapshots' detailed information for {} from snapshot metadata",
AllocationService.firstListElementsToCommaDelimitedString(
snapshotIdsWithMissingDetails,
SnapshotId::toString,
logger.isDebugEnabled()
)
);
}
@Override
public void onFailure(Exception e) {
logger.warn("Failure when trying to load missing details from snapshot metadata", e);
}
}, () -> filterRepositoryDataStep.onResponse(repositoryData.withExtraDetails(extraDetailsMap)))));
} else {
filterRepositoryDataStep.onResponse(repositoryData);
}
})), listener::onFailure);
filterRepositoryDataStep.whenComplete(filteredRepositoryData -> {
final long newGen = setPendingStep.result();
final RepositoryData newRepositoryData = updateRepositoryData(filteredRepositoryData, version, newGen);
// refuse to write if the tracked generation has already reached the generation that is about to be written
if (latestKnownRepoGen.get() >= newGen) {
throw new IllegalArgumentException(
"Tried writing generation ["
+ newGen
+ "] but repository is at least at generation ["
+ latestKnownRepoGen.get()
+ "] already"
);
}
// write the index file, but only if the generation this write is based on still exists in the repository; when the check
// returns false it has already arranged for the failure to be passed to listener::onFailure
if (ensureSafeGenerationExists(expectedGen, listener::onFailure) == false) {
return;
}
final String indexBlob = INDEX_FILE_PREFIX + Long.toString(newGen);
logger.debug("Repository [{}] writing new index generational blob [{}]", metadata.name(), indexBlob);
writeAtomic(blobContainer(), indexBlob, out -> {
try (XContentBuilder xContentBuilder = XContentFactory.jsonBuilder(org.elasticsearch.core.Streams.noCloseStream(out))) {
newRepositoryData.snapshotsToXContent(xContentBuilder, version);
}
}, true);
maybeWriteIndexLatest(newGen);
// Step 3: Update CS to reflect new repository generation.
final String setSafeGenerationSource = "set safe repository generation [" + metadata.name() + "][" + newGen + "]";
submitUnbatchedTask(setSafeGenerationSource, new ClusterStateUpdateTask() {
@Override
public ClusterState execute(ClusterState currentState) {
final RepositoryMetadata meta = getRepoMetadata(currentState);
if (meta.generation() != expectedGen) {
throw new IllegalStateException(
"Tried to update repo generation to [" + newGen + "] but saw unexpected generation in state [" + meta + "]"
);
}
if (meta.pendingGeneration() != newGen) {
throw new IllegalStateException(
"Tried to update from unexpected pending repo generation ["
+ meta.pendingGeneration()
+ "] after write to generation ["
+ newGen
+ "]"
);
}
final RepositoriesMetadata currentMetadata = currentState.metadata().custom(RepositoriesMetadata.TYPE);
final RepositoriesMetadata withGenerations = currentMetadata.withUpdatedGeneration(metadata.name(), newGen, newGen);
// also update the repository UUID in the cluster state if the written repository data carries a different one
final RepositoriesMetadata withUuid = meta.uuid().equals(newRepositoryData.getUuid())
? withGenerations
: withGenerations.withUuid(metadata.name(), newRepositoryData.getUuid());
final ClusterState newClusterState = stateFilter.apply(
ClusterState.builder(currentState)
.metadata(Metadata.builder(currentState.getMetadata()).putCustom(RepositoriesMetadata.TYPE, withUuid))
.build()
);
return updateRepositoryGenerationsIfNecessary(newClusterState, expectedGen, newGen);
}
@Override
public void onFailure(Exception e) {
listener.onFailure(
new RepositoryException(
metadata.name(),
"Failed to execute cluster state update [" + setSafeGenerationSource + "]",
e
)
);
}
@Override
public void clusterStateProcessed(ClusterState oldState, ClusterState newState) {
logger.trace("[{}] successfully set safe repository generation to [{}]", metadata.name(), newGen);
cacheRepositoryData(newRepositoryData, version);
threadPool.executor(ThreadPool.Names.SNAPSHOT).execute(ActionRunnable.supply(listener, () -> {
// Delete all now outdated index files up to 1000 blobs back from the new generation.
// If there are more than 1000 dangling index-N cleanup functionality on repo delete will take care of them.
// Deleting one older than the current expectedGen is done for BwC reasons as older versions used to keep
// two index-N blobs around.
try {
deleteFromContainer(
blobContainer(),
LongStream.range(Math.max(Math.max(expectedGen - 1, 0), newGen - 1000), newGen)
.mapToObj(gen -> INDEX_FILE_PREFIX + gen)
.iterator()
);
} catch (IOException e) {
// cleanup is best effort: the new generation has been written, so a failed delete is only logged
logger.warn(() -> "Failed to clean up old index blobs from before [" + newGen + "]", e);
}
return newRepositoryData;
}));
}
});
}, listener::onFailure);
}
/**
 * Reads the SLM policy id stored in the user metadata of the given snapshot.
 *
 * @param snapshotInfo snapshot to inspect
 * @return the policy id, or the empty string if the snapshot has no user metadata or the stored value is not a string
 */
private static String slmPolicy(SnapshotInfo snapshotInfo) {
    if (snapshotInfo.userMetadata() == null) {
        return "";
    }
    final Object policyFound = snapshotInfo.userMetadata().get(SnapshotsService.POLICY_ID_METADATA_FIELD);
    return policyFound instanceof String ? (String) policyFound : "";
}
/**
 * Prepares repository data for being written at a new generation.
 *
 * @param repositoryData        repository data to base the result on
 * @param repositoryMetaversion repository metadata version; decides whether the cluster UUID is stamped into the result
 * @param newGen                generation to set on the result
 * @return repository data at generation {@code newGen}, carrying the current cluster's UUID if the version includes UUIDs
 */
private RepositoryData updateRepositoryData(RepositoryData repositoryData, Version repositoryMetaversion, long newGen) {
    RepositoryData result = repositoryData;
    if (SnapshotsService.includesUUIDs(repositoryMetaversion)) {
        final String currentClusterUuid = clusterService.state().metadata().clusterUUID();
        final boolean sameClusterUuid = repositoryData.getClusterUUID().equals(currentClusterUuid);
        if (sameClusterUuid == false) {
            result = repositoryData.withClusterUuid(currentClusterUuid);
        }
    }
    return result.withGenId(newGen);
}
/**
 * Writes the {@code index.latest} blob, which allows this repository to serve as the basis of a url repository. Nothing is
 * written when url repository support is turned off, and a failed write is only logged.
 *
 * @param newGen new repository generation
 */
private void maybeWriteIndexLatest(long newGen) {
    if (supportURLRepo == false) {
        return;
    }
    logger.debug("Repository [{}] updating index.latest with generation [{}]", metadata.name(), newGen);
    try {
        writeAtomic(blobContainer(), INDEX_LATEST_BLOB, out -> out.write(Numbers.longToBytes(newGen)), false);
    } catch (Exception e) {
        final String hint = "Failed to write index.latest blob. If you do not intend to use this "
            + "repository as the basis for a URL repository you may turn off attempting to write the index.latest blob by "
            + "setting repository setting [%s] to [false]";
        logger.warn(() -> format(hint, SUPPORT_URL_REPO.getKey()), e);
    }
}
/**
 * Verifies that the {@link RepositoryData} blob for the given {@code safeGeneration} physically exists in the repository.
 * {@link #writeIndexGen} relies on this to avoid writing on top of a concurrently modified repository. The check is needed because
 * {@link RepositoryData} is mostly served from the cached value in {@link #latestKnownRepositoryData}, which could be stale in the
 * broken situation of a concurrent write to the repository.
 * <p>
 * If the blob is missing, the repository is marked as corrupted and {@code onFailure} is invoked once that update has completed.
 *
 * @param safeGeneration generation to verify existence for
 * @param onFailure      callback to invoke with failure in case the repository generation is not physically found in the repository
 * @return {@code true} if the generation is {@link RepositoryData#EMPTY_REPO_GEN} or its blob exists, {@code false} otherwise
 * @throws IOException if checking for the blob fails
 */
private boolean ensureSafeGenerationExists(long safeGeneration, Consumer onFailure) throws IOException {
    logger.debug("Ensure generation [{}] that is the basis for this write exists in [{}]", safeGeneration, metadata.name());
    if (safeGeneration == RepositoryData.EMPTY_REPO_GEN || blobContainer().blobExists(INDEX_FILE_PREFIX + safeGeneration)) {
        return true;
    }
    // Try to find out who wrote to the repository last. This only enriches the error message, so a failure to find out is
    // attached to the reported exception as suppressed instead of replacing it.
    Tuple lastWriter = null;
    Exception lookupFailure = null;
    try {
        lastWriter = readLastWriterInfo();
    } catch (Exception ex) {
        lookupFailure = ex;
    }
    final Exception missingGeneration = new RepositoryException(
        metadata.name(),
        "concurrent modification of the index-N file, expected current generation ["
            + safeGeneration
            + "] but it was not found in the repository."
            + previousWriterMessage(lastWriter)
    );
    if (lookupFailure != null) {
        missingGeneration.addSuppressed(lookupFailure);
    }
    markRepoCorrupted(safeGeneration, missingGeneration, new ActionListener<>() {
        @Override
        public void onResponse(Void aVoid) {
            onFailure.accept(missingGeneration);
        }

        @Override
        public void onFailure(Exception e) {
            onFailure.accept(e);
        }
    });
    return false;
}
/**
 * Best-effort lookup of the cluster that wrote to this repository last: determines the latest repository generation via
 * {@link #latestIndexBlobId()} and reads the cluster UUID from the {@link RepositoryData} found at that generation.
 *
 * @return tuple of repository generation and cluster UUID of the last cluster to write to this repository
 * @throws IOException if determining the latest generation fails
 */
private Tuple readLastWriterInfo() throws IOException {
    assert bestEffortConsistency == false : "This should only be used for adding information to errors in consistent mode";
    final long newestGeneration = latestIndexBlobId();
    final String lastWriterClusterUuid = getRepositoryData(newestGeneration).getClusterUUID();
    return Tuple.tuple(newestGeneration, lastWriterClusterUuid);
}
/**
 * Updates the repository generation that running deletes and snapshot finalizations will be based on for this repository if any such
 * operations are found in the cluster state while setting the safe repository generation.
 * <p>
 * Only entries that belong to this repository and are still based on {@code oldGen} are moved to {@code newGen}; all other entries
 * are carried over unchanged.
 *
 * @param state  cluster state to update
 * @param oldGen previous safe repository generation
 * @param newGen new safe repository generation
 * @return updated cluster state
 */
private ClusterState updateRepositoryGenerationsIfNecessary(ClusterState state, long oldGen, long newGen) {
    final String repoName = metadata.name();

    // in-progress snapshots of this repository
    final SnapshotsInProgress snapshotsInProgress = state.custom(SnapshotsInProgress.TYPE, SnapshotsInProgress.EMPTY);
    final List<SnapshotsInProgress.Entry> snapshotEntries = new ArrayList<>();
    boolean changedSnapshots = false;
    for (SnapshotsInProgress.Entry entry : snapshotsInProgress.forRepo(repoName)) {
        if (entry.repositoryStateId() == oldGen) {
            snapshotEntries.add(entry.withRepoGen(newGen));
            changedSnapshots = true;
        } else {
            snapshotEntries.add(entry);
        }
    }
    // null signals "no change" to SnapshotsService#updateWithSnapshots
    final SnapshotsInProgress updatedSnapshotsInProgress = changedSnapshots
        ? snapshotsInProgress.withUpdatedEntriesForRepo(repoName, snapshotEntries)
        : null;

    // in-progress deletions across all repositories; only those of this repository may be updated
    final SnapshotDeletionsInProgress deletionsInProgress = state.custom(
        SnapshotDeletionsInProgress.TYPE,
        SnapshotDeletionsInProgress.EMPTY
    );
    final List<SnapshotDeletionsInProgress.Entry> deletionEntries = new ArrayList<>();
    boolean changedDeletions = false;
    for (SnapshotDeletionsInProgress.Entry entry : deletionsInProgress.getEntries()) {
        if (entry.repository().equals(repoName) && entry.repositoryStateId() == oldGen) {
            deletionEntries.add(entry.withRepoGen(newGen));
            changedDeletions = true;
        } else {
            deletionEntries.add(entry);
        }
    }
    final SnapshotDeletionsInProgress updatedDeletionsInProgress = changedDeletions
        ? SnapshotDeletionsInProgress.of(deletionEntries)
        : null;

    return SnapshotsService.updateWithSnapshots(state, updatedSnapshotsInProgress, updatedDeletionsInProgress);
}
/**
 * Looks up this repository's {@link RepositoryMetadata} in the given cluster state.
 *
 * @param state cluster state to read the repositories metadata from
 * @return the metadata registered under this repository's name; it is only expected to be {@code null} once this repository has
 *         been stopped or closed
 */
private RepositoryMetadata getRepoMetadata(ClusterState state) {
    // Assign the custom to a typed local first: custom(...) is generic, so calling a method directly on its result leaves the
    // compiler without a target type to infer RepositoriesMetadata from.
    final RepositoriesMetadata repositories = state.getMetadata().custom(RepositoriesMetadata.TYPE);
    final RepositoryMetadata repositoryMetadata = repositories.repository(metadata.name());
    assert repositoryMetadata != null || lifecycle.stoppedOrClosed()
        : "did not find metadata for repo [" + metadata.name() + "] in state [" + lifecycleState() + "]";
    return repositoryMetadata;
}
/**
 * Determines the id of the latest snapshot index blob. Snapshot index blobs are named {@code index-N}; the blob with the highest
 * {@code N} is the latest one and holds the current snapshots in the repository.
 *
 * Package private for testing
 *
 * @return the latest generation, or {@link RepositoryData#EMPTY_REPO_GEN} if blob listing is unsupported and there is no
 *         {@code index.latest} blob
 * @throws IOException if listing the blobs or reading {@code index.latest} fails
 */
long latestIndexBlobId() throws IOException {
    try {
        // Listing the index-N blobs and picking the highest N is the primary source of truth. It is preferred over reading
        // index.latest because every index-N blob is written exactly once whereas index.latest is rewritten on each generation
        // change.
        return listBlobsToGetLatestIndexId();
    } catch (UnsupportedOperationException listingUnsupported) {
        // Listing blobs by prefix may be unsupported, e.g. for a read-only URL repository. Fall back to the generation recorded in
        // the index.latest blob in that case.
        try {
            return readSnapshotIndexLatestBlob();
        } catch (NoSuchFileException missingIndexLatest) {
            return RepositoryData.EMPTY_REPO_GEN;
        }
    }
}
// package private for testing
/**
 * Reads the generation stored in the {@code INDEX_LATEST_BLOB} blob of the root blob container.
 *
 * @return the long decoded from the blob's 8 bytes
 * @throws RepositoryException if the blob does not contain exactly {@link Long#BYTES} bytes
 */
long readSnapshotIndexLatestBlob() throws IOException {
    // Allow one byte more than a long through the limit so that an over-long blob shows up as a length mismatch below
    final InputStream boundedStream = Streams.limitStream(blobContainer().readBlob(INDEX_LATEST_BLOB), Long.BYTES + 1);
    final BytesReference content = Streams.readFully(boundedStream);
    if (content.length() == Long.BYTES) {
        return Numbers.bytesToLong(content.toBytesRef());
    }
    final String observedSize = content.length() < Long.BYTES ? content.length() + " bytes" : "longer";
    throw new RepositoryException(
        metadata.name(),
        "exception reading blob [" + INDEX_LATEST_BLOB + "]: expected 8 bytes but blob was " + observedSize
    );
}
/**
 * Lists the blobs in the root container whose names start with {@code INDEX_FILE_PREFIX} and returns the highest
 * generation found among their names.
 */
private long listBlobsToGetLatestIndexId() throws IOException {
    final var indexBlobsByName = blobContainer().listBlobsByPrefix(INDEX_FILE_PREFIX);
    return latestGeneration(indexBlobsByName.keySet());
}
/**
 * Scans the given blob names for names of the form {@code INDEX_FILE_PREFIX + N} and returns the largest {@code N}.
 *
 * @param rootBlobs blob names to inspect; names without the prefix are skipped silently, names with the prefix but a
 *                  non-numeric suffix are logged at warn level and skipped
 * @return the highest generation found, or {@code RepositoryData.EMPTY_REPO_GEN} if none was found
 */
private long latestGeneration(Collection rootBlobs) {
    final int prefixLength = INDEX_FILE_PREFIX.length();
    long highestSoFar = RepositoryData.EMPTY_REPO_GEN;
    for (String candidate : rootBlobs) {
        if (candidate.startsWith(INDEX_FILE_PREFIX)) {
            final String suffix = candidate.substring(prefixLength);
            try {
                final long generation = Long.parseLong(suffix);
                if (generation > highestSoFar) {
                    highestSoFar = generation;
                }
            } catch (NumberFormatException nfe) {
                // the suffix after index- is not a number, so this is not an index-N blob;
                // whatever it is, it is not expected in the repository
                logger.warn("[{}] Unknown blob in the repository: {}", metadata.name(), candidate);
            }
        }
    }
    return highestSoFar;
}
/**
 * Logs the write at trace level and then hands {@code writer} to {@code container.writeBlob} for the given blob name.
 *
 * @param container           container to write into
 * @param blobName            name of the blob to write
 * @param writer              callback that produces the blob's contents
 * @param failIfAlreadyExists forwarded unchanged to {@code writeBlob}
 */
private void writeAtomic(
    BlobContainer container,
    final String blobName,
    CheckedConsumer writer,
    boolean failIfAlreadyExists
) throws IOException {
    logger.trace(() -> format("[%s] Writing [%s] to %s atomically", metadata.name(), blobName, container.path()));
    // NOTE(review): the literal true is presumably writeBlob's "atomic" flag - confirm against BlobContainer
    final boolean atomic = true;
    container.writeBlob(blobName, failIfAlreadyExists, atomic, writer);
}
/**
 * Snapshots one shard into this repository and reports the outcome through {@code context}.
 * <p>
 * The request is rejected up front when the repository is read-only. Otherwise the method loads the shard's existing
 * snapshot list, works out which files of the index commit are not yet in the repository, writes the shard-level metadata
 * (before the uploads when shard generations are used, after them otherwise) and uploads the missing files on the
 * {@code SNAPSHOT} thread pool. Any exception thrown while setting this up is handed to {@code context.onFailure}.
 *
 * @param context supplies the store, index commit, snapshot id, status object and completion callbacks for this shard
 */
@Override
public void snapshotShard(SnapshotShardContext context) {
if (isReadOnly()) {
context.onFailure(new RepositoryException(metadata.name(), "cannot snapshot shard on a readonly repository"));
return;
}
final Store store = context.store();
final IndexCommit snapshotIndexCommit = context.indexCommit();
final ShardId shardId = store.shardId();
final SnapshotId snapshotId = context.snapshotId();
final IndexShardSnapshotStatus snapshotStatus = context.status();
final long startTime = threadPool.absoluteTimeInMillis();
try {
final ShardGeneration generation = snapshotStatus.generation();
logger.debug("[{}] [{}] snapshot to [{}] [{}] ...", shardId, snapshotId, metadata.name(), generation);
final BlobContainer shardContainer = shardContainer(context.indexId(), shardId);
final Set blobs;
// Without a known generation the candidate index blobs are found by listing; with one, its blob name is derived directly
if (generation == null) {
try {
blobs = shardContainer.listBlobsByPrefix(INDEX_FILE_PREFIX).keySet();
} catch (IOException e) {
throw new IndexShardSnapshotFailedException(shardId, "failed to list blobs", e);
}
} else {
blobs = Collections.singleton(INDEX_FILE_PREFIX + generation);
}
Tuple tuple = buildBlobStoreIndexShardSnapshots(
blobs,
shardContainer,
generation
);
BlobStoreIndexShardSnapshots snapshots = tuple.v1();
ShardGeneration fileListGeneration = tuple.v2();
// Refuse to add a second snapshot with the same name to this shard's snapshot list
if (snapshots.snapshots().stream().anyMatch(sf -> sf.snapshot().equals(snapshotId.getName()))) {
throw new IndexShardSnapshotFailedException(
shardId,
"Duplicate snapshot name [" + snapshotId.getName() + "] detected, aborting"
);
}
// First inspect all known SegmentInfos instances to see if we already have an equivalent commit in the repository
final List filesFromSegmentInfos = Optional.ofNullable(context.stateIdentifier())
.map(id -> {
for (SnapshotFiles snapshotFileSet : snapshots.snapshots()) {
if (id.equals(snapshotFileSet.shardStateIdentifier())) {
return snapshotFileSet.indexFiles();
}
}
return null;
})
.orElse(null);
final List indexCommitPointFiles;
int indexIncrementalFileCount = 0;
int indexTotalNumberOfFiles = 0;
long indexIncrementalSize = 0;
long indexTotalFileSize = 0;
final BlockingQueue filesToSnapshot = new LinkedBlockingQueue<>();
int filesInShardMetadataCount = 0;
long filesInShardMetadataSize = 0;
// For a searchable snapshot index no commit point files are recorded and all counters stay at zero
if (store.indexSettings().getIndexMetadata().isSearchableSnapshot()) {
indexCommitPointFiles = Collections.emptyList();
} else if (filesFromSegmentInfos == null) {
// If we did not find a set of files that is equal to the current commit we determine the files to upload by comparing files
// in the commit with files already in the repository
indexCommitPointFiles = new ArrayList<>();
final Collection fileNames;
final Store.MetadataSnapshot metadataFromStore;
// Hold a store reference only while reading the commit's metadata and file names
try (Releasable ignored = incrementStoreRef(store, snapshotStatus, shardId)) {
// TODO apparently we don't use the MetadataSnapshot#.recoveryDiff(...) here but we should
try {
logger.trace("[{}] [{}] Loading store metadata using index commit [{}]", shardId, snapshotId, snapshotIndexCommit);
metadataFromStore = store.getMetadata(snapshotIndexCommit);
fileNames = snapshotIndexCommit.getFileNames();
} catch (IOException e) {
throw new IndexShardSnapshotFailedException(shardId, "Failed to get store file metadata", e);
}
}
for (String fileName : fileNames) {
if (snapshotStatus.isAborted()) {
logger.debug("[{}] [{}] Aborted on the file [{}], exiting", shardId, snapshotId, fileName);
throw new AbortedSnapshotException();
}
logger.trace("[{}] [{}] Processing [{}]", shardId, snapshotId, fileName);
final StoreFileMetadata md = metadataFromStore.get(fileName);
BlobStoreIndexShardSnapshot.FileInfo existingFileInfo = null;
List filesInfo = snapshots.findPhysicalIndexFiles(fileName);
if (filesInfo != null) {
for (BlobStoreIndexShardSnapshot.FileInfo fileInfo : filesInfo) {
if (fileInfo.isSame(md)) {
// a commit point file with the same name, size and checksum was already copied to repository
// we will reuse it for this snapshot
existingFileInfo = fileInfo;
break;
}
}
}
// We can skip writing blobs where the metadata hash is equal to the blob's contents because we store the hash/contents
// directly in the shard level metadata in this case
final boolean needsWrite = md.hashEqualsContents() == false;
indexTotalFileSize += md.length();
indexTotalNumberOfFiles++;
if (existingFileInfo == null) {
indexIncrementalFileCount++;
indexIncrementalSize += md.length();
// create a new FileInfo
BlobStoreIndexShardSnapshot.FileInfo snapshotFileInfo = new BlobStoreIndexShardSnapshot.FileInfo(
(needsWrite ? UPLOADED_DATA_BLOB_PREFIX : VIRTUAL_DATA_BLOB_PREFIX) + UUIDs.randomBase64UUID(),
md,
chunkSize()
);
indexCommitPointFiles.add(snapshotFileInfo);
if (needsWrite) {
filesToSnapshot.add(snapshotFileInfo);
} else {
// Not queued for upload; counted separately and reported as processed once the shard metadata is written
assert assertFileContentsMatchHash(snapshotStatus, snapshotFileInfo, store);
filesInShardMetadataCount += 1;
filesInShardMetadataSize += md.length();
}
} else {
indexCommitPointFiles.add(existingFileInfo);
}
}
} else {
// A known snapshot carries the same state identifier: reuse its file list and only add up the totals
for (BlobStoreIndexShardSnapshot.FileInfo fileInfo : filesFromSegmentInfos) {
indexTotalNumberOfFiles++;
indexTotalFileSize += fileInfo.length();
}
indexCommitPointFiles = filesFromSegmentInfos;
}
snapshotStatus.moveToStarted(
startTime,
indexIncrementalFileCount,
indexTotalNumberOfFiles,
indexIncrementalSize,
indexTotalFileSize
);
final ShardGeneration indexGeneration;
final boolean writeShardGens = SnapshotsService.useShardGenerations(context.getRepositoryMetaVersion());
final boolean writeFileInfoWriterUUID = SnapshotsService.includeFileInfoWriterUUID(context.getRepositoryMetaVersion());
// build a new BlobStoreIndexShardSnapshot, that includes this one and all the saved ones
List newSnapshotsList = new ArrayList<>();
newSnapshotsList.add(new SnapshotFiles(snapshotId.getName(), indexCommitPointFiles, context.stateIdentifier()));
for (SnapshotFiles point : snapshots) {
newSnapshotsList.add(point);
}
final BlobStoreIndexShardSnapshots updatedBlobStoreIndexShardSnapshots = new BlobStoreIndexShardSnapshots(newSnapshotsList);
// Work that must run after the per-snapshot commit point has been written; a no-op when shard generations are used
final Runnable afterWriteSnapBlob;
if (writeShardGens) {
// When using shard generations we can safely write the index-${uuid} blob before writing out any of the actual data
// for this shard since the uuid named blob will simply not be referenced in case of error and thus we will never
// reference a generation that has not had all its files fully upload.
indexGeneration = ShardGeneration.newGeneration();
try {
final Map serializationParams = Collections.singletonMap(
BlobStoreIndexShardSnapshot.FileInfo.SERIALIZE_WRITER_UUID,
Boolean.toString(writeFileInfoWriterUUID)
);
INDEX_SHARD_SNAPSHOTS_FORMAT.write(
updatedBlobStoreIndexShardSnapshots,
shardContainer,
indexGeneration.toBlobNamePart(),
compress,
serializationParams
);
snapshotStatus.addProcessedFiles(filesInShardMetadataCount, filesInShardMetadataSize);
} catch (IOException e) {
throw new IndexShardSnapshotFailedException(
shardId,
"Failed to write shard level snapshot metadata for ["
+ snapshotId
+ "] to ["
+ INDEX_SHARD_SNAPSHOTS_FORMAT.blobName(indexGeneration.toBlobNamePart())
+ "]",
e
);
}
afterWriteSnapBlob = () -> {};
} else {
// When not using shard generations we can only write the index-${N} blob after all other work for this shard has
// completed.
// Also, in case of numeric shard generations the data node has to take care of deleting old shard generations.
final long newGen = Long.parseLong(fileListGeneration.toBlobNamePart()) + 1;
indexGeneration = new ShardGeneration(newGen);
// Delete all previous index-N blobs
final List blobsToDelete = blobs.stream().filter(blob -> blob.startsWith(SNAPSHOT_INDEX_PREFIX)).toList();
assert blobsToDelete.stream()
.mapToLong(b -> Long.parseLong(b.replaceFirst(SNAPSHOT_INDEX_PREFIX, "")))
.max()
.orElse(-1L) < Long.parseLong(indexGeneration.toString())
: "Tried to delete an index-N blob newer than the current generation ["
+ indexGeneration
+ "] when deleting index-N blobs "
+ blobsToDelete;
// Effectively-final copies so the lambda below can capture the counters
final var finalFilesInShardMetadataCount = filesInShardMetadataCount;
final var finalFilesInShardMetadataSize = filesInShardMetadataSize;
afterWriteSnapBlob = () -> {
try {
final Map serializationParams = Collections.singletonMap(
BlobStoreIndexShardSnapshot.FileInfo.SERIALIZE_WRITER_UUID,
Boolean.toString(writeFileInfoWriterUUID)
);
writeShardIndexBlobAtomic(shardContainer, newGen, updatedBlobStoreIndexShardSnapshots, serializationParams);
} catch (IOException e) {
throw new IndexShardSnapshotFailedException(
shardId,
"Failed to finalize snapshot creation ["
+ snapshotId
+ "] with shard index ["
+ INDEX_SHARD_SNAPSHOTS_FORMAT.blobName(indexGeneration.toBlobNamePart())
+ "]",
e
);
}
snapshotStatus.addProcessedFiles(finalFilesInShardMetadataCount, finalFilesInShardMetadataSize);
// Failing to delete the old index-N blobs is only logged; it does not fail the snapshot
try {
deleteFromContainer(shardContainer, blobsToDelete.iterator());
} catch (IOException e) {
logger.warn(
() -> format("[%s][%s] failed to delete old index-N blobs during finalization", snapshotId, shardId),
e
);
}
};
}
// Completed once every upload worker has reported back; failures are routed to context.onFailure
final StepListener> allFilesUploadedListener = new StepListener<>();
allFilesUploadedListener.whenComplete(v -> {
final IndexShardSnapshotStatus.Copy lastSnapshotStatus = snapshotStatus.moveToFinalize(snapshotIndexCommit.getGeneration());
// now create and write the commit point
logger.trace("[{}] [{}] writing shard snapshot file", shardId, snapshotId);
final BlobStoreIndexShardSnapshot blobStoreIndexShardSnapshot = new BlobStoreIndexShardSnapshot(
snapshotId.getName(),
lastSnapshotStatus.getIndexVersion(),
indexCommitPointFiles,
lastSnapshotStatus.getStartTime(),
threadPool.absoluteTimeInMillis() - lastSnapshotStatus.getStartTime(),
lastSnapshotStatus.getIncrementalFileCount(),
lastSnapshotStatus.getIncrementalSize()
);
try {
final String snapshotUUID = snapshotId.getUUID();
final Map serializationParams = Collections.singletonMap(
BlobStoreIndexShardSnapshot.FileInfo.SERIALIZE_WRITER_UUID,
Boolean.toString(writeFileInfoWriterUUID)
);
INDEX_SHARD_SNAPSHOT_FORMAT.write(
blobStoreIndexShardSnapshot,
shardContainer,
snapshotUUID,
compress,
serializationParams
);
} catch (IOException e) {
throw new IndexShardSnapshotFailedException(shardId, "Failed to write commit point", e);
}
afterWriteSnapBlob.run();
final ShardSnapshotResult shardSnapshotResult = new ShardSnapshotResult(
indexGeneration,
ByteSizeValue.ofBytes(blobStoreIndexShardSnapshot.totalSize()),
getSegmentInfoFileCount(blobStoreIndexShardSnapshot.indexFiles())
);
snapshotStatus.moveToDone(threadPool.absoluteTimeInMillis(), shardSnapshotResult);
context.onResponse(shardSnapshotResult);
}, context::onFailure);
// No new files were counted: complete the upload step right away without starting any workers
if (indexIncrementalFileCount == 0) {
allFilesUploadedListener.onResponse(Collections.emptyList());
return;
}
final Executor executor = threadPool.executor(ThreadPool.Names.SNAPSHOT);
// Start as many workers as fit into the snapshot pool at once at the most
final int workers = Math.min(threadPool.info(ThreadPool.Names.SNAPSHOT).getMax(), indexIncrementalFileCount);
final ActionListener filesListener = fileQueueListener(filesToSnapshot, workers, allFilesUploadedListener);
// Each worker takes files from filesToSnapshot one at a time until the queue is empty
for (int i = 0; i < workers; ++i) {
executeOneFileSnapshot(store, snapshotId, context.indexId(), snapshotStatus, filesToSnapshot, executor, filesListener);
}
} catch (Exception e) {
context.onFailure(e);
}
}
/**
 * Takes the next file off {@code filesToSnapshot} without waiting. If the queue is empty the listener is completed with
 * {@code null}; otherwise the file is uploaded on {@code executor} and the method re-invokes itself from that task to pick
 * up the following file.
 */
private void executeOneFileSnapshot(
    Store store,
    SnapshotId snapshotId,
    IndexId indexId,
    IndexShardSnapshotStatus snapshotStatus,
    BlockingQueue filesToSnapshot,
    Executor executor,
    ActionListener listener
) throws InterruptedException {
    final ShardId shardId = store.shardId();
    final BlobStoreIndexShardSnapshot.FileInfo nextFile = filesToSnapshot.poll(0L, TimeUnit.MILLISECONDS);
    if (nextFile == null) {
        // Queue drained: this worker is done
        listener.onResponse(null);
        return;
    }
    executor.execute(ActionRunnable.wrap(listener, workerListener -> {
        // The store reference is held for the upload and for scheduling the next file
        try (Releasable ignored = incrementStoreRef(store, snapshotStatus, shardId)) {
            snapshotFile(nextFile, indexId, shardId, snapshotId, snapshotStatus, store);
            executeOneFileSnapshot(store, snapshotId, indexId, snapshotStatus, filesToSnapshot, executor, workerListener);
        }
    }));
}
/**
 * Tries to take a reference on {@code store} and returns a {@link Releasable} that gives it back via {@code decRef}.
 *
 * @throws AbortedSnapshotException          if no reference could be taken and the snapshot has been aborted
 * @throws IndexShardSnapshotFailedException if no reference could be taken although the snapshot was not aborted
 */
private static Releasable incrementStoreRef(Store store, IndexShardSnapshotStatus snapshotStatus, ShardId shardId) {
    if (store.tryIncRef()) {
        return store::decRef;
    }
    if (snapshotStatus.isAborted()) {
        throw new AbortedSnapshotException();
    }
    // Not expected to happen: trip assertions in tests, fail the shard snapshot otherwise
    assert false : "Store should not be closed concurrently unless snapshot is aborted";
    throw new IndexShardSnapshotFailedException(shardId, "Store got closed concurrently");
}
/**
 * Assertion helper: reads the whole file described by {@code fileInfo} from {@code store} and asserts that its bytes equal
 * the hash stored in the file's metadata. If no store reference can be taken it asserts that the snapshot was aborted
 * instead.
 *
 * @return always {@code true}, so the call can be used as the condition of an {@code assert} statement
 */
private static boolean assertFileContentsMatchHash(
    IndexShardSnapshotStatus snapshotStatus,
    BlobStoreIndexShardSnapshot.FileInfo fileInfo,
    Store store
) {
    if (store.tryIncRef() == false) {
        assert snapshotStatus.isAborted() : "if the store is already closed we must have been aborted";
        return true;
    }
    try (IndexInput indexInput = store.openVerifyingInput(fileInfo.physicalName(), IOContext.READONCE, fileInfo.metadata())) {
        final int fileLength = Math.toIntExact(fileInfo.metadata().length());
        final byte[] contents = new byte[fileLength];
        indexInput.readBytes(contents, 0, contents.length);
        assert fileInfo.metadata().hash().bytesEquals(new BytesRef(contents));
    } catch (IOException e) {
        throw new AssertionError(e);
    } finally {
        store.decRef();
    }
    return true;
}
/**
 * Restores the files of one shard snapshot from this repository into {@code store}.
 * <p>
 * The shard is registered in {@code ongoingRestores} while the restore runs. The request fails with an
 * {@link AlreadyClosedException} if the store is closing or the repository lifecycle is not started. The restore itself
 * runs on the {@code SNAPSHOT} thread pool, and any failure is wrapped in an {@link IndexShardRestoreFailedException}
 * before it reaches {@code listener}.
 *
 * @param store           store that receives the restored files
 * @param snapshotId      snapshot to restore from
 * @param indexId         repository index used to locate the shard's blob container
 * @param snapshotShardId shard id used to locate the shard's blob container
 * @param recoveryState   updated with the number of bytes recovered for each file
 * @param listener        notified when the restore has completed or failed
 */
@Override
public void restoreShard(
Store store,
SnapshotId snapshotId,
IndexId indexId,
ShardId snapshotShardId,
RecoveryState recoveryState,
ActionListener listener
) {
final ShardId shardId = store.shardId();
// Wrap every failure so the caller sees which shard and snapshot the restore failed for
final ActionListener restoreListener = listener.delegateResponse(
(l, e) -> l.onFailure(new IndexShardRestoreFailedException(shardId, "failed to restore snapshot [" + snapshotId + "]", e))
);
final Executor executor = threadPool.executor(ThreadPool.Names.SNAPSHOT);
final BlobContainer container = shardContainer(indexId, snapshotShardId);
// The closing/started checks and the registration happen under the same lock
synchronized (ongoingRestores) {
if (store.isClosing()) {
restoreListener.onFailure(new AlreadyClosedException("store is closing"));
return;
}
if (lifecycle.started() == false) {
restoreListener.onFailure(new AlreadyClosedException("repository [" + metadata.name() + "] closed"));
return;
}
final boolean added = ongoingRestores.add(shardId);
assert added : "add restore for [" + shardId + "] that already has an existing restore";
}
// Before the caller is notified: deregister this restore and, if it was the last one and listeners are waiting for
// ongoingRestores to become empty, take those listeners and complete them outside the lock
executor.execute(ActionRunnable.wrap(ActionListener.runBefore(restoreListener, () -> {
final List> onEmptyListeners;
synchronized (ongoingRestores) {
if (ongoingRestores.remove(shardId) && ongoingRestores.isEmpty() && emptyListeners != null) {
onEmptyListeners = emptyListeners;
emptyListeners = null;
} else {
return;
}
}
ActionListener.onResponse(onEmptyListeners, null);
}), l -> {
final BlobStoreIndexShardSnapshot snapshot = loadShardSnapshot(container, snapshotId);
final SnapshotFiles snapshotFiles = new SnapshotFiles(snapshot.snapshot(), snapshot.indexFiles(), null);
new FileRestoreContext(metadata.name(), shardId, snapshotId, recoveryState) {
@Override
protected void restoreFiles(
List filesToRecover,
Store store,
ActionListener listener
) {
if (filesToRecover.isEmpty()) {
listener.onResponse(null);
} else {
// Start as many workers as fit into the snapshot pool at once at the most
// NOTE(review): the bound uses the number of files in the whole snapshot rather than filesToRecover.size();
// a worker that finds the queue empty completes the listener immediately
final int workers = Math.min(
threadPool.info(ThreadPool.Names.SNAPSHOT).getMax(),
snapshotFiles.indexFiles().size()
);
final BlockingQueue files = new LinkedBlockingQueue<>(filesToRecover);
final ActionListener allFilesListener = fileQueueListener(files, workers, listener.map(v -> null));
// restore the files from the snapshot to the Lucene store
for (int i = 0; i < workers; ++i) {
try {
executeOneFileRestore(files, allFilesListener);
} catch (Exception e) {
allFilesListener.onFailure(e);
}
}
}
}
// Takes the next file off the queue without waiting; restores it on the executor and then re-invokes itself from that
// task, or completes the listener when the queue is empty
private void executeOneFileRestore(
BlockingQueue files,
ActionListener allFilesListener
) throws InterruptedException {
final BlobStoreIndexShardSnapshot.FileInfo fileToRecover = files.poll(0L, TimeUnit.MILLISECONDS);
if (fileToRecover == null) {
allFilesListener.onResponse(null);
} else {
executor.execute(ActionRunnable.wrap(allFilesListener, filesListener -> {
store.incRef();
try {
restoreFile(fileToRecover, store);
} finally {
store.decRef();
}
executeOneFileRestore(files, filesListener);
}));
}
}
// Writes one file into the store, verifies and syncs it; on any failure the partially written file is deleted
private void restoreFile(BlobStoreIndexShardSnapshot.FileInfo fileInfo, Store store) throws IOException {
ensureNotClosing(store);
logger.trace(() -> format("[%s] restoring [%s] to [%s]", metadata.name(), fileInfo, store));
boolean success = false;
try (
IndexOutput indexOutput = store.createVerifyingOutput(
fileInfo.physicalName(),
fileInfo.metadata(),
IOContext.DEFAULT
)
) {
if (fileInfo.name().startsWith(VIRTUAL_DATA_BLOB_PREFIX)) {
// Virtual blobs are not read from the repository: the bytes written are the hash held in the file metadata
final BytesRef hash = fileInfo.metadata().hash();
indexOutput.writeBytes(hash.bytes, hash.offset, hash.length);
recoveryState.getIndex().addRecoveredBytesToFile(fileInfo.physicalName(), hash.length);
} else {
// Present all parts of the file as one rate-limited stream; each part is opened only when it is reached
try (InputStream stream = maybeRateLimitRestores(new SlicedInputStream(fileInfo.numberOfParts()) {
@Override
protected InputStream openSlice(int slice) throws IOException {
ensureNotClosing(store);
return container.readBlob(fileInfo.partName(slice));
}
})) {
final byte[] buffer = new byte[Math.toIntExact(Math.min(bufferSize, fileInfo.length()))];
int length;
while ((length = stream.read(buffer)) > 0) {
ensureNotClosing(store);
indexOutput.writeBytes(buffer, 0, length);
recoveryState.getIndex().addRecoveredBytesToFile(fileInfo.physicalName(), length);
}
}
}
Store.verify(indexOutput);
// NOTE(review): explicit close ahead of the try-with-resources close, presumably so the output is complete before
// the sync below - confirm
indexOutput.close();
store.directory().sync(Collections.singleton(fileInfo.physicalName()));
success = true;
} catch (CorruptIndexException | IndexFormatTooOldException | IndexFormatTooNewException ex) {
// Mark the store as corrupted (best effort) and rethrow the original exception
try {
store.markStoreCorrupted(ex);
} catch (IOException e) {
logger.warn("store cannot be marked as corrupted", e);
}
throw ex;
} finally {
if (success == false) {
store.deleteQuiet(fileInfo.physicalName());
}
}
}
// Throws AlreadyClosedException when the store is closing or the repository lifecycle is no longer started
void ensureNotClosing(final Store store) throws AlreadyClosedException {
assert store.refCount() > 0;
if (store.isClosing()) {
throw new AlreadyClosedException("store is closing");
}
if (lifecycle.started() == false) {
throw new AlreadyClosedException("repository [" + metadata.name() + "] closed");
}
}
}.restore(snapshotFiles, store, l);
}));
}
/**
 * Builds the listener shared by the file workers: a {@link GroupedActionListener} created with a group size of
 * {@code workers} that forwards to {@code listener}. When a failure is reported, the remaining entries of {@code files}
 * are discarded before the failure is passed on.
 *
 * @param files    queue of files still waiting to be processed
 * @param workers  group size handed to the {@link GroupedActionListener}
 * @param listener receives the grouped outcome
 */
private static ActionListener fileQueueListener(
BlockingQueue files,
int workers,
ActionListener> listener
) {
return new GroupedActionListener<>(listener, workers).delegateResponse((l, e) -> {
files.clear(); // Stop uploading the remaining files if we run into any exception
l.onFailure(e);
});
}
/**
 * Wraps {@code stream} in a {@link RateLimitingInputStream} built from the given rate limiter supplier and throttle
 * listener.
 */
private static InputStream maybeRateLimit(
    InputStream stream,
    Supplier rateLimiterSupplier,
    RateLimitingInputStream.Listener throttleListener
) {
    final InputStream rateLimited = new RateLimitingInputStream(stream, rateLimiterSupplier, throttleListener);
    return rateLimited;
}
/**
 * Wrap the restore rate limiter (controlled by the repository setting `max_restore_bytes_per_sec` and the cluster setting
 * `indices.recovery.max_bytes_per_sec`) around the given stream. Any throttling is added to
 * {@code restoreRateLimitingTimeInNanos}, the counter behind {@link BlobStoreRepository#getRestoreThrottleTimeInNanos}.
 */
public InputStream maybeRateLimitRestores(InputStream stream) {
    final RateLimitingInputStream.Listener recordThrottleTime = restoreRateLimitingTimeInNanos::inc;
    return maybeRateLimitRestores(stream, recordThrottleTime);
}
/**
 * Wrap the restore rate limiter (controlled by the repository setting `max_restore_bytes_per_sec` and the cluster setting
 * `indices.recovery.max_bytes_per_sec`) around the given stream. Any throttling is reported to the given listener only;
 * this method does not itself add it to the value returned by {@link BlobStoreRepository#getRestoreThrottleTimeInNanos}.
 */
public InputStream maybeRateLimitRestores(InputStream stream, RateLimitingInputStream.Listener throttleListener) {
    // Inner layer: the repository's own restore limiter
    final InputStream repositoryLimited = maybeRateLimit(stream, () -> restoreRateLimiter, throttleListener);
    // Outer layer: the limiter supplied by the recovery settings
    return maybeRateLimit(repositoryLimited, recoverySettings::rateLimiter, throttleListener);
}
/**
 * Wrap the snapshot rate limiter (controlled by the repository setting `max_snapshot_bytes_per_sec`) around the given
 * stream. Any throttling is added to {@code snapshotRateLimitingTimeInNanos}, the counter behind
 * {@link BlobStoreRepository#getSnapshotThrottleTimeInNanos()}.
 */
public InputStream maybeRateLimitSnapshots(InputStream stream) {
    final RateLimitingInputStream.Listener recordThrottleTime = snapshotRateLimitingTimeInNanos::inc;
    return maybeRateLimitSnapshots(stream, recordThrottleTime);
}
/**
 * Wrap the snapshot rate limiter (controlled by the repository setting `max_snapshot_bytes_per_sec`) around the given
 * stream. Any throttling is reported to the given listener only; this method does not itself add it to the value returned
 * by {@link BlobStoreRepository#getSnapshotThrottleTimeInNanos()}.
 */
public InputStream maybeRateLimitSnapshots(InputStream stream, RateLimitingInputStream.Listener throttleListener) {
    final InputStream snapshotLimited = maybeRateLimit(stream, () -> snapshotRateLimiter, throttleListener);
    return snapshotLimited;
}
/**
 * Loads the shard-level snapshot blob for the given snapshot and converts its recorded timings and file statistics into a
 * completed {@link IndexShardSnapshotStatus}.
 */
@Override
public IndexShardSnapshotStatus getShardSnapshotStatus(SnapshotId snapshotId, IndexId indexId, ShardId shardId) {
    final BlobContainer shardBlobs = shardContainer(indexId, shardId);
    final BlobStoreIndexShardSnapshot loaded = loadShardSnapshot(shardBlobs, snapshotId);
    // The last argument is the generation: a real one is not supplied as it doesn't matter to callers
    return IndexShardSnapshotStatus.newDone(
        loaded.startTime(),
        loaded.time(),
        loaded.incrementalFileCount(),
        loaded.totalFileCount(),
        loaded.incrementalSize(),
        loaded.totalSize(),
        null
    );
}
/**
 * Verifies from the local node that the repository can be used.
 * <p>
 * For a read-only repository this only checks that the latest index blob id can be resolved. Otherwise the node writes its
 * own {@code data-<nodeId>.dat} blob containing {@code seed} into the verification container and then reads
 * {@code master.dat} from the same container, expecting it to contain {@code seed} as well.
 *
 * @param seed      verification seed; also determines the name of the verification container
 * @param localNode the node running the verification, used for the blob name and in error messages
 * @throws RepositoryVerificationException if any of the checks fails
 */
@Override
public void verify(String seed, DiscoveryNode localNode) {
    assertSnapshotOrGenericThread();
    if (isReadOnly()) {
        try {
            latestIndexBlobId();
        } catch (Exception e) {
            throw new RepositoryVerificationException(
                metadata.name(),
                "path " + basePath() + " is not accessible on node " + localNode,
                e
            );
        }
        return;
    }

    final BlobContainer verificationContainer = blobStore().blobContainer(basePath().add(testBlobPrefix(seed)));

    // Step 1: prove that this node can write to the store
    try {
        verificationContainer.writeBlob("data-" + localNode.getId() + ".dat", new BytesArray(seed), true);
    } catch (Exception writeFailure) {
        throw new RepositoryVerificationException(
            metadata.name(),
            "store location [" + blobStore() + "] is not accessible on the node [" + localNode + "]",
            writeFailure
        );
    }

    // Step 2: prove that this node can read the blob named master.dat and that it carries the expected seed
    try (InputStream masterBlob = verificationContainer.readBlob("master.dat")) {
        final String seedFromMaster = Streams.readFully(masterBlob).utf8ToString();
        if (seed.equals(seedFromMaster) == false) {
            throw new RepositoryVerificationException(
                metadata.name(),
                "Seed read from master.dat was [" + seedFromMaster + "] but expected seed [" + seed + "]"
            );
        }
    } catch (NoSuchFileException missingMasterBlob) {
        throw new RepositoryVerificationException(
            metadata.name(),
            "a file written by master to the store ["
                + blobStore()
                + "] cannot be accessed on the node ["
                + localNode
                + "]. "
                + "This might indicate that the store ["
                + blobStore()
                + "] is not shared between this node and the master node or "
                + "that permissions on the store don't allow reading files written by the master node",
            missingMasterBlob
        );
    } catch (Exception readFailure) {
        // Also reached for the seed mismatch thrown above
        throw new RepositoryVerificationException(metadata.name(), "Failed to verify repository", readFailure);
    }
}
/**
 * Renders as {@code BlobStoreRepository[[<repository name>], [<blob store>]]}.
 */
@Override
public String toString() {
    return "BlobStoreRepository[[" + metadata.name() + "], [" + blobStore.get() + "]]";
}
/**
 * Delete snapshots from the shard level metadata by rewriting it with only the surviving snapshots.
 *
 * @param survivingSnapshots snapshots that must be kept; matched against the existing entries by snapshot name
 * @param indexId            index the shard belongs to, passed through to the result
 * @param snapshotShardId    shard number, passed through to the result
 * @param snapshotIds        snapshots being deleted; only used in the failure message
 * @param shardContainer     blob container of the shard
 * @param blobs              names of the blobs currently in the shard container
 * @param snapshots          the shard level metadata as it exists before the delete
 * @param indexGeneration    generation to write the new shard level metadata to. If negative, a uuid shard generation
 *                           is used
 * @return the new shard generation together with the blobs that are no longer needed; if no snapshot survives, the
 *         generation is {@code ShardGenerations.DELETED_SHARD_GEN} and all of {@code blobs} are returned
 */
private ShardSnapshotMetaDeleteResult deleteFromShardSnapshotMeta(
    Set survivingSnapshots,
    IndexId indexId,
    int snapshotShardId,
    Collection snapshotIds,
    BlobContainer shardContainer,
    Set blobs,
    BlobStoreIndexShardSnapshots snapshots,
    long indexGeneration
) {
    // Keep only the entries whose snapshot name belongs to a surviving snapshot
    List retained = new ArrayList<>();
    final Set survivingSnapshotNames = survivingSnapshots.stream().map(SnapshotId::getName).collect(Collectors.toSet());
    for (SnapshotFiles existing : snapshots) {
        if (survivingSnapshotNames.contains(existing.snapshot())) {
            retained.add(existing);
        }
    }
    ShardGeneration writtenGeneration = null;
    try {
        if (retained.isEmpty()) {
            return new ShardSnapshotMetaDeleteResult(indexId, snapshotShardId, ShardGenerations.DELETED_SHARD_GEN, blobs);
        }
        final BlobStoreIndexShardSnapshots updatedSnapshots = new BlobStoreIndexShardSnapshots(retained);
        if (indexGeneration >= 0L) {
            // Numeric generation: written through the atomic helper
            writtenGeneration = new ShardGeneration(indexGeneration);
            writeShardIndexBlobAtomic(shardContainer, indexGeneration, updatedSnapshots, Collections.emptyMap());
        } else {
            // Negative generation: write under a freshly created generation instead
            writtenGeneration = ShardGeneration.newGeneration();
            INDEX_SHARD_SNAPSHOTS_FORMAT.write(updatedSnapshots, shardContainer, writtenGeneration.toBlobNamePart(), compress);
        }
        final Set survivingSnapshotUUIDs = survivingSnapshots.stream().map(SnapshotId::getUUID).collect(Collectors.toSet());
        return new ShardSnapshotMetaDeleteResult(
            indexId,
            snapshotShardId,
            writtenGeneration,
            unusedBlobs(blobs, survivingSnapshotUUIDs, updatedSnapshots)
        );
    } catch (IOException e) {
        throw new RepositoryException(
            metadata.name(),
            "Failed to finalize snapshot deletion "
                + snapshotIds
                + " with shard index ["
                + INDEX_SHARD_SNAPSHOTS_FORMAT.blobName(writtenGeneration.toBlobNamePart())
                + "]",
            e
        );
    }
}
/**
 * Writes shard level metadata to a numeric shard generation through {@code writeAtomic}, failing if the target blob
 * already exists. Numeric shard generations are the only case that needs this: there, an atomic write combined with the
 * fail-if-already-exists check helps to prevent repository corruption.
 */
private void writeShardIndexBlobAtomic(
    BlobContainer shardContainer,
    long indexGeneration,
    BlobStoreIndexShardSnapshots updatedSnapshots,
    Map serializationParams
) throws IOException {
    assert indexGeneration >= 0 : "Shard generation must not be negative but saw [" + indexGeneration + "]";
    logger.trace(() -> format("[%s] Writing shard index [%s] to [%s]", metadata.name(), indexGeneration, shardContainer.path()));
    final String generationBlobName = INDEX_SHARD_SNAPSHOTS_FORMAT.blobName(Long.toString(indexGeneration));
    final boolean failIfAlreadyExists = true;
    writeAtomic(
        shardContainer,
        generationBlobName,
        out -> INDEX_SHARD_SNAPSHOTS_FORMAT.serialize(updatedSnapshots, generationBlobName, compress, serializationParams, out),
        failIfAlreadyExists
    );
}
/**
 * Selects the blobs that are no longer needed once {@code updatedSnapshots} has been written: every previous index- blob,
 * every snapshot metadata blob whose UUID is not among the survivors, every uploaded data blob that the updated metadata
 * no longer references, and every temporary blob.
 */
private static List unusedBlobs(
    Set blobs,
    Set survivingSnapshotUUIDs,
    BlobStoreIndexShardSnapshots updatedSnapshots
) {
    return blobs.stream().filter(blob -> {
        if (blob.startsWith(SNAPSHOT_INDEX_PREFIX)) {
            return true;
        }
        if (blob.startsWith(SNAPSHOT_PREFIX) && blob.endsWith(".dat")) {
            // The snapshot UUID is what sits between the prefix and the .dat extension
            final String snapshotUuid = blob.substring(SNAPSHOT_PREFIX.length(), blob.length() - ".dat".length());
            if (survivingSnapshotUUIDs.contains(snapshotUuid) == false) {
                return true;
            }
        }
        if (blob.startsWith(UPLOADED_DATA_BLOB_PREFIX) && updatedSnapshots.findNameFile(canonicalName(blob)) == null) {
            return true;
        }
        return FsBlobContainer.isTempBlobName(blob);
    }).toList();
}
/**
 * Reads the shard-level snapshot blob identified by the snapshot's UUID from the given shard container.
 *
 * @throws SnapshotMissingException if the blob does not exist
 * @throws SnapshotException        if reading the blob fails for any other I/O reason
 */
public BlobStoreIndexShardSnapshot loadShardSnapshot(BlobContainer shardContainer, SnapshotId snapshotId) {
    final BlobStoreIndexShardSnapshot loaded;
    try {
        loaded = INDEX_SHARD_SNAPSHOT_FORMAT.read(metadata.name(), shardContainer, snapshotId.getUUID(), namedXContentRegistry);
    } catch (NoSuchFileException missing) {
        throw new SnapshotMissingException(metadata.name(), snapshotId, missing);
    } catch (IOException readFailure) {
        final String reason = "failed to read shard snapshot file for [" + shardContainer.path() + ']';
        throw new SnapshotException(metadata.name(), snapshotId, reason, readFailure);
    }
    return loaded;
}
/**
 * Loads all snapshots recorded for a shard using the given shard generation. When {@code shardGen} is {@code null} the
 * BwC path is taken instead: the index- blobs in the shard container are listed and used to find the metadata.
 */
public BlobStoreIndexShardSnapshots getBlobStoreIndexShardSnapshots(IndexId indexId, int shardId, @Nullable ShardGeneration shardGen)
    throws IOException {
    final BlobContainer shardContainer = shardContainer(indexId, shardId);
    final Set blobs;
    if (shardGen == null) {
        blobs = shardContainer.listBlobsByPrefix(INDEX_FILE_PREFIX).keySet();
    } else {
        // A known generation is read directly, so no listing is needed
        blobs = Collections.emptySet();
    }
    return buildBlobStoreIndexShardSnapshots(blobs, shardContainer, shardGen).v1();
}
/**
 * Loads all available snapshots for a shard, either from the blob named by {@code generation} or, when no generation is
 * given, by determining the latest one from the supplied blob names.
 *
 * @param blobs          list of blobs in repository
 * @param shardContainer blob container of the shard
 * @param generation     shard generation or {@code null} in case there was no shard generation tracked in the
 *                       {@link RepositoryData} for this shard because its snapshot was created in a version older than
 *                       {@link SnapshotsService#SHARD_GEN_IN_REPO_DATA_VERSION}.
 * @return tuple of BlobStoreIndexShardSnapshots and the last snapshot index generation
 */
private Tuple buildBlobStoreIndexShardSnapshots(
    Set blobs,
    BlobContainer shardContainer,
    @Nullable ShardGeneration generation
) throws IOException {
    if (generation == null) {
        // Legacy path: derive the numeric generation from the blob names
        final Tuple legacyIndex = buildBlobStoreIndexShardSnapshots(blobs, shardContainer);
        return new Tuple<>(legacyIndex.v1(), new ShardGeneration(legacyIndex.v2()));
    }
    if (generation.equals(ShardGenerations.NEW_SHARD_GEN)) {
        // Nothing is read from the repository for the new-shard generation
        return new Tuple<>(BlobStoreIndexShardSnapshots.EMPTY, ShardGenerations.NEW_SHARD_GEN);
    }
    final String blobNamePart = generation.toBlobNamePart();
    return new Tuple<>(
        INDEX_SHARD_SNAPSHOTS_FORMAT.read(metadata.name(), shardContainer, blobNamePart, namedXContentRegistry),
        generation
    );
}
/**
 * Loads all available snapshots in the repository by picking the highest numeric index-N generation among the given
 * blob names.
 *
 * @param blobs          list of blobs in repository
 * @param shardContainer blob container of the shard, used to read the index-N blob
 * @return tuple of BlobStoreIndexShardSnapshots and the last snapshot index generation; when no index-N blob is found the
 *         tuple holds {@code BlobStoreIndexShardSnapshots.EMPTY} and the value returned by {@code latestGeneration}
 */
private Tuple buildBlobStoreIndexShardSnapshots(Set blobs, BlobContainer shardContainer)
    throws IOException {
    final long latest = latestGeneration(blobs);
    if (latest >= 0) {
        final BlobStoreIndexShardSnapshots shardSnapshots = INDEX_SHARD_SNAPSHOTS_FORMAT.read(
            metadata.name(),
            shardContainer,
            Long.toString(latest),
            namedXContentRegistry
        );
        return new Tuple<>(shardSnapshots, latest);
    } else if (blobs.stream()
        .anyMatch(b -> b.startsWith(SNAPSHOT_PREFIX) || b.startsWith(INDEX_FILE_PREFIX) || b.startsWith(UPLOADED_DATA_BLOB_PREFIX))) {
        // The directory holds snapshot-related blobs but no index-N blob could be identified.
        // Use a placeholder instead of string concatenation, matching the other logger calls in this class.
        logger.warn("Could not find a readable index-N file in a non-empty shard snapshot directory [{}]", shardContainer.path());
    }
    return new Tuple<>(BlobStoreIndexShardSnapshots.EMPTY, latest);
}
/**
 * Uploads a single file to the shard's blob container, one blob per part, and verifies the input once all parts have been
 * written. The progress counters on {@code snapshotStatus} are updated on success and on failure.
 *
 * @param fileInfo       file to be snapshotted
 * @param indexId        index used to locate the shard's blob container
 * @param shardId        shard used to locate the shard's blob container
 * @param snapshotId     snapshot being created; only used for logging
 * @param snapshotStatus status object that is checked for aborts on every read and updated with progress
 * @param store          store to read the file from
 */
private void snapshotFile(
    BlobStoreIndexShardSnapshot.FileInfo fileInfo,
    IndexId indexId,
    ShardId shardId,
    SnapshotId snapshotId,
    IndexShardSnapshotStatus snapshotStatus,
    Store store
) throws IOException {
    final BlobContainer shardContainer = shardContainer(indexId, shardId);
    final String physicalFileName = fileInfo.physicalName();
    try (IndexInput indexInput = store.openVerifyingInput(physicalFileName, IOContext.READONCE, fileInfo.metadata())) {
        for (int part = 0; part < fileInfo.numberOfParts(); part++) {
            final long partBytes = fileInfo.partBytes(part);
            // Every read first consults snapshotStatus so that an abort interrupts the upload
            final InputStream abortableStream = new FilterInputStream(
                maybeRateLimitSnapshots(new InputStreamIndexInput(indexInput, partBytes))
            ) {
                @Override
                public int read() throws IOException {
                    failIfAborted();
                    return super.read();
                }

                @Override
                public int read(byte[] b, int off, int len) throws IOException {
                    failIfAborted();
                    return super.read(b, off, len);
                }

                private void failIfAborted() {
                    if (snapshotStatus.isAborted()) {
                        logger.debug("[{}] [{}] Aborted on the file [{}], exiting", shardId, snapshotId, fileInfo.physicalName());
                        throw new AbortedSnapshotException();
                    }
                }
            };
            final String partName = fileInfo.partName(part);
            logger.trace("[{}] Writing [{}] to [{}]", metadata.name(), partName, shardContainer.path());
            final long writeStartMillis = threadPool.relativeTimeInMillis();
            shardContainer.writeBlob(partName, abortableStream, partBytes, false);
            logger.trace(
                "[{}] Writing [{}] of size [{}b] to [{}] took [{}ms]",
                metadata.name(),
                partName,
                partBytes,
                shardContainer.path(),
                threadPool.relativeTimeInMillis() - writeStartMillis
            );
        }
        Store.verify(indexInput);
        snapshotStatus.addProcessedFile(fileInfo.length());
    } catch (Exception failure) {
        // Flag the store if the failure is a corruption; the file still counts as processed, with zero bytes
        failStoreIfCorrupted(store, failure);
        snapshotStatus.addProcessedFile(0);
        throw failure;
    }
}
/**
 * Marks the given store as corrupted if {@code Lucene.isCorruptionException} classifies the failure as a corruption.
 * A failure to mark the store is logged (with the original failure attached as suppressed) and not propagated.
 *
 * @param store store to mark as corrupted
 * @param e     failure to inspect
 */
private static void failStoreIfCorrupted(Store store, Exception e) {
    if (Lucene.isCorruptionException(e) == false) {
        return;
    }
    // NOTE(review): the corruption check is assumed to also accept failures that merely wrap a corruption exception - confirm
    // against Lucene.isCorruptionException. A blind cast would then throw a ClassCastException and mask the real failure, so
    // anything that is not itself an IOException is wrapped in one instead.
    final IOException corruption = e instanceof IOException ? (IOException) e : new IOException(e);
    try {
        store.markStoreCorrupted(corruption);
    } catch (IOException inner) {
        inner.addSuppressed(e);
        logger.warn("store cannot be marked as corrupted", inner);
    }
}
/**
 * @return the value of this repository's {@code supportURLRepo} flag
 */
public boolean supportURLRepo() {
    return this.supportURLRepo;
}
/**
 * Reports whether this repository performs overwrites atomically. This implementation always returns {@code true}.
 * <p>
 * In practice we only overwrite the {@code index.latest} blob so this is not very important, but the repository analyzer does test
 * that overwrites happen atomically. It will skip those tests if the repository overrides this method to indicate that it does not
 * support atomic overwrites.
 *
 * @return {@code true} if this repository performs overwrites atomically
 */
public boolean hasAtomicOverwrites() {
    return true;
}
/**
 * @return the read buffer size in bytes, as held in the {@code bufferSize} field
 */
public int getReadBufferSizeInBytes() {
    return this.bufferSize;
}
/**
 * The result of removing a snapshot from a shard folder in the repository.
 */
private static final class ShardSnapshotMetaDeleteResult {

    // Index that the snapshot was removed from
    private final IndexId indexId;

    // Shard id that the snapshot was removed from
    private final int shardId;

    // Id of the new index-${uuid} blob that does not include the snapshot any more
    private final ShardGeneration newGeneration;

    // Blob names in the shard directory that have become unreferenced in the new shard generation
    private final Collection<String> blobsToDelete;

    /**
     * @param indexId       index that the snapshot was removed from
     * @param shardId       shard id that the snapshot was removed from
     * @param newGeneration id of the new index-${uuid} blob that does not include the snapshot any more
     * @param blobsToDelete blob names in the shard directory that have become unreferenced in the new shard generation; the
     *                      collection is stored as passed, without copying
     */
    ShardSnapshotMetaDeleteResult(IndexId indexId, int shardId, ShardGeneration newGeneration, Collection<String> blobsToDelete) {
        this.indexId = indexId;
        this.shardId = shardId;
        this.newGeneration = newGeneration;
        this.blobsToDelete = blobsToDelete;
    }
}
}