/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Modifications Copyright OpenSearch Contributors. See
* GitHub history for details.
*/
package org.opensearch.index.store;
import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.message.ParameterizedMessage;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.index.CheckIndex;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexCommit;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.IndexFormatTooNewException;
import org.apache.lucene.index.IndexFormatTooOldException;
import org.apache.lucene.index.IndexNotFoundException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.NoMergePolicy;
import org.apache.lucene.index.SegmentCommitInfo;
import org.apache.lucene.index.SegmentInfos;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.BufferedChecksum;
import org.apache.lucene.store.BufferedChecksumIndexInput;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FilterDirectory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.Lock;
import org.apache.lucene.store.NIOFSDirectory;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.Version;
import org.opensearch.ExceptionsHelper;
import org.opensearch.common.UUIDs;
import org.opensearch.common.annotation.PublicApi;
import org.opensearch.common.io.stream.BytesStreamOutput;
import org.opensearch.common.logging.Loggers;
import org.opensearch.common.lucene.Lucene;
import org.opensearch.common.lucene.store.ByteArrayIndexInput;
import org.opensearch.common.lucene.store.InputStreamIndexInput;
import org.opensearch.common.settings.Setting;
import org.opensearch.common.settings.Setting.Property;
import org.opensearch.common.unit.TimeValue;
import org.opensearch.common.util.concurrent.AbstractRefCounted;
import org.opensearch.common.util.concurrent.RefCounted;
import org.opensearch.common.util.io.IOUtils;
import org.opensearch.common.util.iterable.Iterables;
import org.opensearch.core.common.bytes.BytesReference;
import org.opensearch.core.common.io.stream.StreamInput;
import org.opensearch.core.common.io.stream.StreamOutput;
import org.opensearch.core.common.io.stream.Writeable;
import org.opensearch.core.index.shard.ShardId;
import org.opensearch.env.NodeEnvironment;
import org.opensearch.env.ShardLock;
import org.opensearch.env.ShardLockObtainFailedException;
import org.opensearch.index.IndexSettings;
import org.opensearch.index.engine.CombinedDeletionPolicy;
import org.opensearch.index.engine.Engine;
import org.opensearch.index.seqno.SequenceNumbers;
import org.opensearch.index.shard.AbstractIndexShardComponent;
import org.opensearch.index.shard.IndexShard;
import org.opensearch.index.shard.ShardPath;
import org.opensearch.index.translog.Translog;
import java.io.Closeable;
import java.io.EOFException;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.io.UncheckedIOException;
import java.nio.file.NoSuchFileException;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.function.Consumer;
import java.util.zip.CRC32;
import java.util.zip.Checksum;
import static java.util.Collections.emptyMap;
import static java.util.Collections.unmodifiableMap;
import static org.opensearch.index.seqno.SequenceNumbers.LOCAL_CHECKPOINT_KEY;
import static org.opensearch.index.store.Store.MetadataSnapshot.loadMetadata;
/**
 * A Store provides plain access to files written by an opensearch index shard. Each shard
 * has a dedicated store that is used to access Lucene's Directory, which represents the lowest level
 * of file abstraction in Lucene used to read and write Lucene indices.
 * This class also provides access to metadata information like checksums for committed files. A committed
 * file is a file that belongs to a segment written by a Lucene commit. Files that have not been committed,
 * i.e. created during a merge or a shard refresh / NRT reopen, are not considered in the MetadataSnapshot.
 *
 * Note: If you use a store, its reference count should be increased before use by calling #incRef, and a
 * corresponding #decRef must be called in a try/finally block to release the store again, i.e.:
*
* store.incRef();
* try {
* // use the store...
*
* } finally {
* store.decRef();
* }
*
*
* @opensearch.api
*/
@PublicApi(since = "1.0.0")
public class Store extends AbstractIndexShardComponent implements Closeable, RefCounted {
/**
 * This is an escape hatch for Lucene's internal optimization that checks if the IndexInput is an instance of ByteBufferIndexInput
 * and, if that's the case, doesn't load the term dictionary into RAM but reads it off disk iff the field is not an ID-like field.
 * Since this optimization was added very late in the release process we add this setting to allow users to opt out of
 * it by exploiting Lucene internals and wrapping the IndexInput in a simple delegate.
*/
public static final Setting<Boolean> FORCE_RAM_TERM_DICT = Setting.boolSetting(
"index.force_memory_term_dictionary",
false,
Property.IndexScope,
Property.Deprecated
);
static final String CODEC = "store";
static final int CORRUPTED_MARKER_CODEC_VERSION = 2;
// public is for test purposes
public static final String CORRUPTED_MARKER_NAME_PREFIX = "corrupted_";
public static final Setting<TimeValue> INDEX_STORE_STATS_REFRESH_INTERVAL_SETTING = Setting.timeSetting(
"index.store.stats_refresh_interval",
TimeValue.timeValueSeconds(10),
Property.IndexScope
);
/**
* Specific {@link IOContext} used to verify Lucene files footer checksums.
* See {@link MetadataSnapshot#checksumFromLuceneFile(Directory, String, Map, Logger, Version, boolean)}
*/
public static final IOContext READONCE_CHECKSUM = new IOContext(IOContext.READONCE.context);
private final AtomicBoolean isClosed = new AtomicBoolean(false);
private final StoreDirectory directory;
private final ReentrantReadWriteLock metadataLock = new ReentrantReadWriteLock();
private final ShardLock shardLock;
private final OnClose onClose;
private final ShardPath shardPath;
// used to ref count files when a new Reader is opened for PIT/Scroll queries
// prevents segment files deletion until the PIT/Scroll expires or is discarded
private final AbstractRefCounted refCounter = new AbstractRefCounted("store") {
@Override
protected void closeInternal() {
// close us once we are done
Store.this.closeInternal();
}
};
public Store(ShardId shardId, IndexSettings indexSettings, Directory directory, ShardLock shardLock) {
this(shardId, indexSettings, directory, shardLock, OnClose.EMPTY, null);
}
public Store(
ShardId shardId,
IndexSettings indexSettings,
Directory directory,
ShardLock shardLock,
OnClose onClose,
ShardPath shardPath
) {
super(shardId, indexSettings);
final TimeValue refreshInterval = indexSettings.getValue(INDEX_STORE_STATS_REFRESH_INTERVAL_SETTING);
logger.debug("store stats are refreshed with refresh_interval [{}]", refreshInterval);
ByteSizeCachingDirectory sizeCachingDir = new ByteSizeCachingDirectory(directory, refreshInterval);
this.directory = new StoreDirectory(sizeCachingDir, Loggers.getLogger("index.store.deletes", shardId));
this.shardLock = shardLock;
this.onClose = onClose;
this.shardPath = shardPath;
assert onClose != null;
assert shardLock != null;
assert shardLock.getShardId().equals(shardId);
}
public Directory directory() {
ensureOpen();
return directory;
}
public ShardPath shardPath() {
return shardPath;
}
/**
* Returns the last committed segments info for this store
*
* @throws IOException if the index is corrupted or the segments file is not present
*/
public SegmentInfos readLastCommittedSegmentsInfo() throws IOException {
failIfCorrupted();
try {
if (indexSettings.isRemoteSnapshot() && indexSettings.getExtendedCompatibilitySnapshotVersion() != null) {
return readSegmentInfosExtendedCompatibility(directory(), indexSettings.getExtendedCompatibilitySnapshotVersion());
} else {
return readSegmentsInfo(null, directory());
}
} catch (CorruptIndexException | IndexFormatTooOldException | IndexFormatTooNewException ex) {
markStoreCorrupted(ex);
throw ex;
}
}
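/*
 * Illustrative usage sketch (not part of the original class; assumes a Store instance named "store").
 * The incRef/decRef pair guards against the store being closed concurrently while the commit is read:
 *
 *   store.incRef();
 *   try {
 *       SegmentInfos lastCommit = store.readLastCommittedSegmentsInfo();
 *       long generation = lastCommit.getGeneration();
 *   } finally {
 *       store.decRef();
 *   }
 */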
/**
 * Returns the segments info for the given commit or for the latest commit if the given commit is {@code null}.
 *
 * This method will throw an exception if the index is older than the standard backwards compatibility
 * policy (current major - 1). See also {@link #readSegmentInfosExtendedCompatibility(Directory, org.opensearch.Version)}.
*
* @throws IOException if the index is corrupted or the segments file is not present
*/
private static SegmentInfos readSegmentsInfo(IndexCommit commit, Directory directory) throws IOException {
assert commit == null || commit.getDirectory() == directory;
try {
return commit == null ? Lucene.readSegmentInfos(directory) : Lucene.readSegmentInfos(commit);
} catch (EOFException eof) {
// TODO this should be caught by lucene - EOF is almost certainly an index corruption
throw new CorruptIndexException("Read past EOF while reading segment infos", "commit(" + commit + ")", eof);
} catch (IOException exception) {
throw exception; // IOExceptions like too many open files are not necessarily a corruption - just bubble it up
} catch (Exception ex) {
throw new CorruptIndexException("Hit unexpected exception while reading segment infos", "commit(" + commit + ")", ex);
}
}
/**
* Returns the segments info for the latest commit in the given directory. Unlike
* {@link #readSegmentsInfo(IndexCommit, Directory)}, this method supports reading
* older Lucene indices on a best-effort basis.
*
* @throws IOException if the index is corrupted or the segments file is not present
*/
private static SegmentInfos readSegmentInfosExtendedCompatibility(Directory directory, org.opensearch.Version minimumVersion)
throws IOException {
try {
return Lucene.readSegmentInfos(directory, minimumVersion);
} catch (EOFException eof) {
// TODO this should be caught by lucene - EOF is almost certainly an index corruption
throw new CorruptIndexException("Read past EOF while reading segment infos", "", eof);
} catch (IOException exception) {
throw exception; // IOExceptions like too many open files are not necessarily a corruption - just bubble it up
} catch (Exception ex) {
throw new CorruptIndexException("Hit unexpected exception while reading segment infos", "", ex);
}
}
final void ensureOpen() {
if (this.refCounter.refCount() <= 0) {
throw new AlreadyClosedException("store is already closed");
}
}
/**
* Returns a new MetadataSnapshot for the given commit. If the given commit is null
* the latest commit point is used.
*
* Note that this method requires the caller verify it has the right to access the store and
* no concurrent file changes are happening. If in doubt, you probably want to use one of the following:
*
 * {@link #readMetadataSnapshot(Path, ShardId, NodeEnvironment.ShardLocker, Logger)} to read a metadata snapshot while locking
* {@link IndexShard#snapshotStoreMetadata()} to safely read from an existing shard
* {@link IndexShard#acquireLastIndexCommit(boolean)} to get an {@link IndexCommit} which is safe to use but has to be freed
*
* @param commit the index commit to read the snapshot from or {@code null} if the latest snapshot should be read from the
* directory
* @throws CorruptIndexException if the lucene index is corrupted. This can be caused by a checksum mismatch or an
* unexpected exception when opening the index reading the segments file.
* @throws IndexFormatTooOldException if the lucene index is too old to be opened.
* @throws IndexFormatTooNewException if the lucene index is too new to be opened.
* @throws FileNotFoundException if one or more files referenced by a commit are not present.
* @throws NoSuchFileException if one or more files referenced by a commit are not present.
* @throws IndexNotFoundException if the commit point can't be found in this store
*/
public MetadataSnapshot getMetadata(IndexCommit commit) throws IOException {
return getMetadata(commit, false);
}
/**
* Convenience wrapper around the {@link #getMetadata(IndexCommit)} method for null input.
*/
public MetadataSnapshot getMetadata() throws IOException {
return getMetadata(null, false);
}
/**
* Returns a new MetadataSnapshot for the given commit. If the given commit is null
* the latest commit point is used.
*
* Note that this method requires the caller verify it has the right to access the store and
* no concurrent file changes are happening. If in doubt, you probably want to use one of the following:
*
 * {@link #readMetadataSnapshot(Path, ShardId, NodeEnvironment.ShardLocker, Logger)} to read a metadata snapshot while locking
* {@link IndexShard#snapshotStoreMetadata()} to safely read from an existing shard
* {@link IndexShard#acquireLastIndexCommit(boolean)} to get an {@link IndexCommit} which is safe to use but has to be freed
*
 * @param commit the index commit to read the snapshot from, or {@code null} if the latest snapshot should be read from the
 *               directory
 * @param lockDirectory if {@code true} the index writer lock will be obtained before reading the snapshot. This should
 *                      only be used if there is no started shard using this store.
* @throws CorruptIndexException if the lucene index is corrupted. This can be caused by a checksum mismatch or an
* unexpected exception when opening the index reading the segments file.
* @throws IndexFormatTooOldException if the lucene index is too old to be opened.
* @throws IndexFormatTooNewException if the lucene index is too new to be opened.
* @throws FileNotFoundException if one or more files referenced by a commit are not present.
* @throws NoSuchFileException if one or more files referenced by a commit are not present.
* @throws IndexNotFoundException if the commit point can't be found in this store
*/
public MetadataSnapshot getMetadata(IndexCommit commit, boolean lockDirectory) throws IOException {
ensureOpen();
failIfCorrupted();
assert lockDirectory ? commit == null : true : "IW lock should not be obtained if there is a commit point available";
// if we lock the directory we also acquire the write lock since that makes sure that nobody else tries to lock the IW
// on this store at the same time.
java.util.concurrent.locks.Lock lock = lockDirectory ? metadataLock.writeLock() : metadataLock.readLock();
lock.lock();
try (Closeable ignored = lockDirectory ? directory.obtainLock(IndexWriter.WRITE_LOCK_NAME) : () -> {}) {
return new MetadataSnapshot(commit, directory, logger);
} catch (CorruptIndexException | IndexFormatTooOldException | IndexFormatTooNewException ex) {
markStoreCorrupted(ex);
throw ex;
} finally {
lock.unlock();
}
}
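/*
 * Illustrative sketch (not part of the original class; assumes a Store "store" for a shard that has not
 * been started). Locking the directory while snapshotting guards against a concurrent IndexWriter; for a
 * started shard, prefer IndexShard#snapshotStoreMetadata() as noted above:
 *
 *   MetadataSnapshot snapshot = store.getMetadata(null, true); // lock IW, read latest commit
 *   StoreFileMetadata segmentsFile = snapshot.getSegmentsFile();
 */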
/**
* Returns a new {@link MetadataSnapshot} for the given {@link SegmentInfos} object.
* In contrast to {@link #getMetadata(IndexCommit)}, this method is useful for scenarios
* where we need to construct a MetadataSnapshot from an in-memory SegmentInfos object that
 * may not have an IndexCommit associated with it, such as with segment replication.
*/
public MetadataSnapshot getMetadata(SegmentInfos segmentInfos) throws IOException {
return new MetadataSnapshot(segmentInfos, directory, logger);
}
/**
* Segment Replication method - Fetch a map of StoreFileMetadata for segments, ignoring Segment_N files.
* @param segmentInfos {@link SegmentInfos} from which to compute metadata.
 * @return {@link Map} mapping file name to {@link StoreFileMetadata}.
*/
public Map<String, StoreFileMetadata> getSegmentMetadataMap(SegmentInfos segmentInfos) throws IOException {
assert indexSettings.isSegRepEnabled();
failIfCorrupted();
try {
return loadMetadata(segmentInfos, directory, logger, true).fileMetadata;
} catch (NoSuchFileException | CorruptIndexException | IndexFormatTooOldException | IndexFormatTooNewException ex) {
markStoreCorrupted(ex);
throw ex;
}
}
/**
* Segment Replication method
 * Returns a diff between the maps of StoreFileMetadata that can be used to get the list of files to copy over to a replica for segment replication. The returned diff will hold a list of files that are:
 *
 * - identical: they exist in both maps and can be considered the same, i.e. they don't need to be recovered
 * - different: they exist in both maps but are not identical
 * - missing: files that exist in the source but not in the target
 *
*/
public static RecoveryDiff segmentReplicationDiff(Map<String, StoreFileMetadata> source, Map<String, StoreFileMetadata> target) {
final List<StoreFileMetadata> identical = new ArrayList<>();
final List<StoreFileMetadata> different = new ArrayList<>();
final List<StoreFileMetadata> missing = new ArrayList<>();
for (StoreFileMetadata value : source.values()) {
if (value.name().startsWith(IndexFileNames.SEGMENTS)) {
continue;
}
if (target.containsKey(value.name()) == false) {
missing.add(value);
} else {
final StoreFileMetadata fileMetadata = target.get(value.name());
// match segments using checksum
if (fileMetadata.checksum().equals(value.checksum())) {
identical.add(value);
} else {
different.add(value);
}
}
}
return new RecoveryDiff(
Collections.unmodifiableList(identical),
Collections.unmodifiableList(different),
Collections.unmodifiableList(missing)
);
}
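/*
 * Illustrative sketch (not part of the original class). Assuming "primaryMetadata" was computed on the
 * primary via getSegmentMetadataMap(SegmentInfos) and shipped to the replica, the replica can determine
 * which segment files it still needs to copy (segments_N files are intentionally ignored by this diff):
 *
 *   Map<String, StoreFileMetadata> localMetadata = store.getSegmentMetadataMap(localSegmentInfos);
 *   RecoveryDiff diff = Store.segmentReplicationDiff(primaryMetadata, localMetadata);
 *   List<StoreFileMetadata> toCopy = new ArrayList<>(diff.missing);
 *   toCopy.addAll(diff.different);
 */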
/**
* Renames all the given files from the key of the map to the
* value of the map. All successfully renamed files are removed from the map in-place.
*/
public void renameTempFilesSafe(Map<String, String> tempFileMap) throws IOException {
// this works just like a lucene commit - we rename all temp files and once we successfully
// renamed all the segments we rename the commit to ensure we don't leave half baked commits behind.
final Map.Entry<String, String>[] entries = tempFileMap.entrySet().toArray(new Map.Entry[0]);
ArrayUtil.timSort(entries, (o1, o2) -> {
String left = o1.getValue();
String right = o2.getValue();
if (left.startsWith(IndexFileNames.SEGMENTS) || right.startsWith(IndexFileNames.SEGMENTS)) {
if (left.startsWith(IndexFileNames.SEGMENTS) == false) {
return -1;
} else if (right.startsWith(IndexFileNames.SEGMENTS) == false) {
return 1;
}
}
return left.compareTo(right);
});
metadataLock.writeLock().lock();
// we make sure that nobody fetches the metadata while we do this rename operation here to ensure we don't
// get exceptions if files are still open.
try (Lock writeLock = directory().obtainLock(IndexWriter.WRITE_LOCK_NAME)) {
for (Map.Entry<String, String> entry : entries) {
String tempFile = entry.getKey();
String origFile = entry.getValue();
// first, go and delete the existing ones
try {
directory.deleteFile(origFile);
} catch (FileNotFoundException | NoSuchFileException e) {} catch (Exception ex) {
logger.debug(() -> new ParameterizedMessage("failed to delete file [{}]", origFile), ex);
}
// now, rename the files... and fail if it won't work
directory.rename(tempFile, origFile);
final String remove = tempFileMap.remove(tempFile);
assert remove != null;
}
directory.syncMetaData();
} finally {
metadataLock.writeLock().unlock();
}
}
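/*
 * Illustrative sketch (not part of the original class). A recovery target typically writes files under
 * temporary names first and then promotes them atomically; the temp names below are only hypothetical:
 *
 *   Map<String, String> tempFileMap = new HashMap<>();
 *   tempFileMap.put("recovery.tmp._0.cfs", "_0.cfs");
 *   tempFileMap.put("recovery.tmp.segments_3", "segments_3");
 *   store.renameTempFilesSafe(tempFileMap); // the segments_N file is renamed last, like a Lucene commit
 */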
/**
* Checks and returns the status of the existing index in this store.
*
* @param out where infoStream messages should go. See {@link CheckIndex#setInfoStream(PrintStream)}
*/
public CheckIndex.Status checkIndex(PrintStream out) throws IOException {
metadataLock.writeLock().lock();
try (CheckIndex checkIndex = new CheckIndex(directory)) {
checkIndex.setInfoStream(out);
return checkIndex.checkIndex();
} finally {
metadataLock.writeLock().unlock();
}
}
/**
* @param reservedBytes a prediction of how much larger the store is expected to grow, or {@link StoreStats#UNKNOWN_RESERVED_BYTES}.
*/
public StoreStats stats(long reservedBytes) throws IOException {
ensureOpen();
return new StoreStats(directory.estimateSize(), reservedBytes);
}
/**
* Increments the refCount of this Store instance. RefCounts are used to determine when a
* Store can be closed safely, i.e. as soon as there are no more references. Be sure to always call a
* corresponding {@link #decRef}, in a finally clause; otherwise the store may never be closed. Note that
* {@link #close} simply calls decRef(), which means that the Store will not really be closed until {@link
* #decRef} has been called for all outstanding references.
*
* Note: Close can safely be called multiple times.
*
* @throws AlreadyClosedException iff the reference counter can not be incremented.
* @see #decRef
* @see #tryIncRef()
*/
@Override
public final void incRef() {
refCounter.incRef();
}
/**
* Tries to increment the refCount of this Store instance. This method will return {@code true} iff the refCount was
* incremented successfully otherwise {@code false}. RefCounts are used to determine when a
* Store can be closed safely, i.e. as soon as there are no more references. Be sure to always call a
* corresponding {@link #decRef}, in a finally clause; otherwise the store may never be closed. Note that
* {@link #close} simply calls decRef(), which means that the Store will not really be closed until {@link
* #decRef} has been called for all outstanding references.
*
* Note: Close can safely be called multiple times.
*
* @see #decRef()
* @see #incRef()
*/
@Override
public final boolean tryIncRef() {
return refCounter.tryIncRef();
}
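/*
 * Illustrative sketch (not part of the original class) of the tryIncRef/decRef pattern for callers that
 * must not fail when the store is already closing:
 *
 *   if (store.tryIncRef()) {
 *       try {
 *           // use the store...
 *       } finally {
 *           store.decRef();
 *       }
 *   } else {
 *       // store is closing; skip the work
 *   }
 */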
/**
* Decreases the refCount of this Store instance. If the refCount drops to 0, then this
* store is closed.
*
* @see #incRef
*/
@Override
public final boolean decRef() {
return refCounter.decRef();
}
@Override
public void close() {
if (isClosed.compareAndSet(false, true)) {
// only do this once!
decRef();
logger.debug("store reference count on close: {}", refCounter.refCount());
}
}
/**
* @return true if the {@link Store#close()} method has been called. This indicates that the current
* store is either closed or being closed waiting for all references to it to be released.
* You might prefer to use {@link Store#ensureOpen()} instead.
*/
public boolean isClosing() {
return isClosed.get();
}
private void closeInternal() {
// Leverage try-with-resources to close the shard lock for us
try (Closeable c = shardLock) {
try {
directory.innerClose(); // this closes the distributorDirectory as well
} finally {
onClose.accept(shardLock);
}
} catch (IOException e) {
throw new UncheckedIOException(e);
}
}
/**
* Reads a MetadataSnapshot from the given index locations or returns an empty snapshot if it can't be read.
*
* @throws IOException if the index we try to read is corrupted
*/
public static MetadataSnapshot readMetadataSnapshot(
Path indexLocation,
ShardId shardId,
NodeEnvironment.ShardLocker shardLocker,
Logger logger
) throws IOException {
try (
ShardLock lock = shardLocker.lock(shardId, "read metadata snapshot", TimeUnit.SECONDS.toMillis(5));
Directory dir = new NIOFSDirectory(indexLocation)
) {
failIfCorrupted(dir);
return new MetadataSnapshot((IndexCommit) null, dir, logger);
} catch (IndexNotFoundException ex) {
// that's fine - happens all the time no need to log
} catch (FileNotFoundException | NoSuchFileException ex) {
logger.info("Failed to open / find files while reading metadata snapshot", ex);
} catch (ShardLockObtainFailedException ex) {
logger.info(() -> new ParameterizedMessage("{}: failed to obtain shard lock", shardId), ex);
}
return MetadataSnapshot.EMPTY;
}
/**
* Tries to open an index for the given location. This includes reading the
* segment infos and possible corruption markers. If the index can not
* be opened, an exception is thrown
*/
public static void tryOpenIndex(Path indexLocation, ShardId shardId, NodeEnvironment.ShardLocker shardLocker, Logger logger)
throws IOException, ShardLockObtainFailedException {
try (
ShardLock lock = shardLocker.lock(shardId, "open index", TimeUnit.SECONDS.toMillis(5));
Directory dir = new NIOFSDirectory(indexLocation)
) {
failIfCorrupted(dir);
SegmentInfos segInfo = Lucene.readSegmentInfos(dir);
logger.trace("{} loaded segment info [{}]", shardId, segInfo);
}
}
/**
 * The returned IndexOutput validates the file's checksum.
 *
 * Note: Checksums are calculated by default since Lucene version 4.8.0. This method only adds the
* verification against the checksum in the given metadata and does not add any significant overhead.
*/
public IndexOutput createVerifyingOutput(String fileName, final StoreFileMetadata metadata, final IOContext context)
throws IOException {
IndexOutput output = directory().createOutput(fileName, context);
boolean success = false;
try {
assert metadata.writtenBy() != null;
output = new LuceneVerifyingIndexOutput(metadata, output);
success = true;
} finally {
if (success == false) {
IOUtils.closeWhileHandlingException(output);
}
}
return output;
}
public static void verify(IndexOutput output) throws IOException {
if (output instanceof VerifyingIndexOutput) {
((VerifyingIndexOutput) output).verify();
}
}
public IndexInput openVerifyingInput(String filename, IOContext context, StoreFileMetadata metadata) throws IOException {
assert metadata.writtenBy() != null;
return new VerifyingIndexInput(directory().openInput(filename, context));
}
public static void verify(IndexInput input) throws IOException {
if (input instanceof VerifyingIndexInput) {
((VerifyingIndexInput) input).verify();
}
}
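/*
 * Illustrative sketch (not part of the original class) of the verified-write path used when copying a
 * file from a peer; "fileChunk" and "metadata" are assumed to hold the complete file body and the
 * expected StoreFileMetadata:
 *
 *   try (IndexOutput output = store.createVerifyingOutput(metadata.name(), metadata, IOContext.DEFAULT)) {
 *       output.writeBytes(fileChunk, 0, fileChunk.length);
 *       Store.verify(output); // throws CorruptIndexException on a length or checksum mismatch
 *   }
 */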
public boolean checkIntegrityNoException(StoreFileMetadata md) {
return checkIntegrityNoException(md, directory());
}
public static boolean checkIntegrityNoException(StoreFileMetadata md, Directory directory) {
try {
checkIntegrity(md, directory);
return true;
} catch (IOException e) {
return false;
}
}
public static void checkIntegrity(final StoreFileMetadata md, final Directory directory) throws IOException {
try (IndexInput input = directory.openInput(md.name(), IOContext.READONCE)) {
if (input.length() != md.length()) { // first check the length no matter how old this file is
throw new CorruptIndexException(
"expected length=" + md.length() + " != actual length: " + input.length() + " : file truncated?",
input
);
}
// throw exception if the file is corrupt
String checksum = Store.digestToString(CodecUtil.checksumEntireFile(input));
// throw exception if metadata is inconsistent
if (!checksum.equals(md.checksum())) {
throw new CorruptIndexException(
"inconsistent metadata: lucene checksum=" + checksum + ", metadata checksum=" + md.checksum(),
input
);
}
}
}
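/*
 * Illustrative sketch (not part of the original class). After files have been copied, their lengths and
 * Lucene footer checksums can be re-validated against the recorded metadata without propagating IOExceptions:
 *
 *   for (StoreFileMetadata md : snapshot) {
 *       if (store.checkIntegrityNoException(md) == false) {
 *           // handle corruption, e.g. fail the recovery and mark the store corrupted
 *       }
 *   }
 */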
public boolean isMarkedCorrupted() throws IOException {
ensureOpen();
/* marking a store as corrupted is basically adding a corrupted_ marker file to the directory.
 * Its presence makes subsequent calls to failIfCorrupted throw, which prevents the store from
 * being used until the marker is removed.
 */
final String[] files = directory().listAll();
for (String file : files) {
if (file.startsWith(CORRUPTED_MARKER_NAME_PREFIX)) {
return true;
}
}
return false;
}
/**
* Deletes all corruption markers from this store.
*/
public void removeCorruptionMarker() throws IOException {
ensureOpen();
final Directory directory = directory();
IOException firstException = null;
final String[] files = directory.listAll();
for (String file : files) {
if (file.startsWith(CORRUPTED_MARKER_NAME_PREFIX)) {
try {
directory.deleteFile(file);
} catch (IOException ex) {
if (firstException == null) {
firstException = ex;
} else {
firstException.addSuppressed(ex);
}
}
}
}
if (firstException != null) {
throw firstException;
}
}
public void failIfCorrupted() throws IOException {
ensureOpen();
failIfCorrupted(directory);
}
private static void failIfCorrupted(Directory directory) throws IOException {
final String[] files = directory.listAll();
List<CorruptIndexException> ex = new ArrayList<>();
for (String file : files) {
if (file.startsWith(CORRUPTED_MARKER_NAME_PREFIX)) {
try (ChecksumIndexInput input = directory.openChecksumInput(file, IOContext.READONCE)) {
CodecUtil.checkHeader(input, CODEC, CORRUPTED_MARKER_CODEC_VERSION, CORRUPTED_MARKER_CODEC_VERSION);
final int size = input.readVInt();
final byte[] buffer = new byte[size];
input.readBytes(buffer, 0, buffer.length);
StreamInput in = StreamInput.wrap(buffer);
Exception t = in.readException();
if (t instanceof CorruptIndexException) {
ex.add((CorruptIndexException) t);
} else {
ex.add(new CorruptIndexException(t.getMessage(), "preexisting_corruption", t));
}
CodecUtil.checkFooter(input);
}
}
}
if (ex.isEmpty() == false) {
ExceptionsHelper.rethrowAndSuppress(ex);
}
}
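/*
 * Illustrative sketch (not part of the original class) of the corruption-marker workflow: a detected
 * corruption is persisted via markStoreCorrupted, later callers fail fast via failIfCorrupted, and the
 * marker can be removed explicitly once the shard has been repaired or restored:
 *
 *   if (store.isMarkedCorrupted()) {
 *       // e.g. after a successful restore from snapshot
 *       store.removeCorruptionMarker();
 *   }
 *   store.failIfCorrupted(); // rethrows the recorded CorruptIndexException if a marker is still present
 */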
/**
 * This method deletes every file in this store that is not contained in the given source metadata or is a
 * legacy checksum file. After the delete, it pulls the latest metadata snapshot from the store and compares it
 * to the given snapshot. If the snapshots are inconsistent, an illegal state exception is thrown.
*
* @param reason the reason for this cleanup operation logged for each deleted file
* @param sourceMetadata the metadata used for cleanup. all files in this metadata should be kept around.
* @throws IOException if an IOException occurs
* @throws IllegalStateException if the latest snapshot in this store differs from the given one after the cleanup.
*/
public void cleanupAndVerify(String reason, MetadataSnapshot sourceMetadata) throws IOException {
metadataLock.writeLock().lock();
try (Lock writeLock = directory.obtainLock(IndexWriter.WRITE_LOCK_NAME)) {
for (String existingFile : directory.listAll()) {
if (Store.isAutogenerated(existingFile) || sourceMetadata.contains(existingFile)) {
// don't delete snapshot file, or the checksums file (note, this is extra protection since the Store won't delete
// checksum)
continue;
}
try {
directory.deleteFile(reason, existingFile);
// FNF should not happen since we hold a write lock?
} catch (IOException ex) {
if (existingFile.startsWith(IndexFileNames.SEGMENTS) || existingFile.startsWith(CORRUPTED_MARKER_NAME_PREFIX)) {
// TODO do we need to also fail this if we can't delete the pending commit file?
// if one of those files can't be deleted we better fail the cleanup otherwise we might leave an old commit
// point around?
throw new IllegalStateException("Can't delete " + existingFile + " - cleanup failed", ex);
}
logger.debug(() -> new ParameterizedMessage("failed to delete file [{}]", existingFile), ex);
// ignore, we don't really care, will get deleted later on
}
}
directory.syncMetaData();
final Store.MetadataSnapshot metadataOrEmpty = getMetadata();
verifyAfterCleanup(sourceMetadata, metadataOrEmpty);
} finally {
metadataLock.writeLock().unlock();
}
}
/**
* Segment replication method
*
 * This method takes the segment info bytes to build SegmentInfos. It increments references on the files pointed to by the
 * passed-in SegmentInfos bytes to ensure they are not deleted.
 *
 * @param infosBytes byte[] of SegmentInfos sent over by the primary, excluding the segments_N file
* @param segmentsGen segment generation number
* @throws IOException Exception while reading store and building segment infos
*/
public SegmentInfos buildSegmentInfos(byte[] infosBytes, long segmentsGen) throws IOException {
try (final ChecksumIndexInput input = toIndexInput(infosBytes)) {
return SegmentInfos.readCommit(directory, input, segmentsGen);
}
}
/**
 * This method wraps a byte[] containing the primary's SegmentInfos into lucene's {@link ChecksumIndexInput} so that it can be
 * passed to SegmentInfos.readCommit
*/
private ChecksumIndexInput toIndexInput(byte[] input) {
return new BufferedChecksumIndexInput(new ByteArrayIndexInput("Snapshot of SegmentInfos", input));
}
// pkg private for testing
final void verifyAfterCleanup(MetadataSnapshot sourceMetadata, MetadataSnapshot targetMetadata) {
final RecoveryDiff recoveryDiff = targetMetadata.recoveryDiff(sourceMetadata);
if (recoveryDiff.identical.size() != recoveryDiff.size()) {
if (recoveryDiff.missing.isEmpty()) {
for (StoreFileMetadata meta : recoveryDiff.different) {
StoreFileMetadata local = targetMetadata.get(meta.name());
StoreFileMetadata remote = sourceMetadata.get(meta.name());
// if we have different files then they must have no checksums; otherwise something went wrong during recovery.
// we hit that problem when an empty index only has a segments_1 file, so we can't tell whether it is a Lucene 4.8 file
// and therefore no checksum is included. That isn't a problem since we simply copy it over anyway, but those files
// come out as different in the diff. That's why we have to double check here whether the rest of it matches.
// if it does, all is fine: this file is just part of a commit or a segment that is different.
if (local.isSame(remote) == false) {
logger.debug("Files are different on the recovery target: {} ", recoveryDiff);
throw new IllegalStateException(
"local version: " + local + " is different from remote version after recovery: " + remote,
null
);
}
}
} else {
logger.debug("Files are missing on the recovery target: {} ", recoveryDiff);
throw new IllegalStateException(
"Files are missing on the recovery target: [different="
+ recoveryDiff.different
+ ", missing="
+ recoveryDiff.missing
+ ']',
null
);
}
}
}
/**
* Returns the current reference count.
*/
public int refCount() {
return refCounter.refCount();
}
public void beforeClose() {
shardLock.setDetails("closing shard");
}
/**
* This method should only be used with Segment Replication.
 * Perform a commit from a live {@link SegmentInfos}. Replica engines with segrep do not have an IndexWriter, and Lucene does not currently
 * have the ability to create a writer directly from a SegmentInfos object. To promote the replica as a primary and avoid reindexing, we must first commit
 * on the replica so that it can be opened with a writeable engine. Further, InternalEngine currently invokes `trimUnsafeCommits`, which reverts the engine to a previous safeCommit where the max seqNo is less than or equal
 * to the current global checkpoint. It is likely that the replica has a maxSeqNo that is higher than the global checkpoint, in which case the new commit would be wiped.
 *
 * To get around these limitations, this method first creates an IndexCommit directly from SegmentInfos; it then
 * uses an appending IndexWriter to create an IndexCommit from the commit created on SegmentInfos.
 * This ensures that 1. all files in the new commit are fsynced and 2. older commit points are deleted, so the only commit to start from is our new commit.
*
* @param latestSegmentInfos {@link SegmentInfos} The latest active infos
* @param maxSeqNo The engine's current maxSeqNo
* @param processedCheckpoint The engine's current processed checkpoint.
* @throws IOException when there is an IO error committing.
*/
public void commitSegmentInfos(SegmentInfos latestSegmentInfos, long maxSeqNo, long processedCheckpoint) throws IOException {
assert indexSettings.isSegRepEnabled();
metadataLock.writeLock().lock();
try {
final Map<String, String> userData = new HashMap<>(latestSegmentInfos.getUserData());
userData.put(LOCAL_CHECKPOINT_KEY, String.valueOf(processedCheckpoint));
userData.put(SequenceNumbers.MAX_SEQ_NO, Long.toString(maxSeqNo));
latestSegmentInfos.setUserData(userData, false);
latestSegmentInfos.commit(directory());
directory.sync(latestSegmentInfos.files(true));
directory.syncMetaData();
} finally {
metadataLock.writeLock().unlock();
}
}
public DirectoryFileTransferTracker getDirectoryFileTransferTracker() {
return directory.getDirectoryFileTransferTracker();
}
/**
* A store directory
*
* @opensearch.internal
*/
static final class StoreDirectory extends FilterDirectory {
private final Logger deletesLogger;
public final DirectoryFileTransferTracker directoryFileTransferTracker;
StoreDirectory(ByteSizeCachingDirectory delegateDirectory, Logger deletesLogger) {
super(delegateDirectory);
this.deletesLogger = deletesLogger;
this.directoryFileTransferTracker = new DirectoryFileTransferTracker();
}
/** Estimate the cumulative size of all files in this directory in bytes. */
long estimateSize() throws IOException {
return ((ByteSizeCachingDirectory) getDelegate()).estimateSizeInBytes();
}
@Override
public void close() {
assert false : "Nobody should close this directory except of the Store itself";
}
public void deleteFile(String msg, String name) throws IOException {
deletesLogger.trace("{}: delete file {}", msg, name);
super.deleteFile(name);
}
@Override
public void deleteFile(String name) throws IOException {
deleteFile("StoreDirectory.deleteFile", name);
}
private void innerClose() throws IOException {
super.close();
}
@Override
public String toString() {
return "store(" + in.toString() + ")";
}
@Override
public Set<String> getPendingDeletions() throws IOException {
// FilterDirectory.getPendingDeletions does not delegate, working around it here.
// to be removed once fixed in FilterDirectory.
return unwrap(this).getPendingDeletions();
}
public DirectoryFileTransferTracker getDirectoryFileTransferTracker() {
return directoryFileTransferTracker;
}
@Override
public void copyFrom(Directory from, String src, String dest, IOContext context) throws IOException {
long fileSize = from.fileLength(src);
beforeDownload(fileSize);
boolean success = false;
long startTime = System.currentTimeMillis();
try {
super.copyFrom(from, src, dest, context);
success = true;
afterDownload(fileSize, startTime);
} finally {
if (!success) {
downloadFailed(fileSize, startTime);
}
}
}
/**
* Updates the amount of bytes attempted for download
*/
private void beforeDownload(long fileSize) {
directoryFileTransferTracker.addTransferredBytesStarted(fileSize);
}
/**
* Updates
 * - The amount of bytes that have been successfully downloaded from the source store
* - The last successful download completion timestamp
* - The last successfully downloaded file
* - Download speed (in bytes/sec)
*/
private void afterDownload(long fileSize, long startTimeInMs) {
directoryFileTransferTracker.addTransferredBytesSucceeded(fileSize, startTimeInMs);
}
/**
* Updates the amount of bytes failed in download
*/
private void downloadFailed(long fileSize, long startTimeInMs) {
directoryFileTransferTracker.addTransferredBytesFailed(fileSize, startTimeInMs);
}
}
/**
 * Represents a snapshot of the current directory built from the latest Lucene commit.
 * Only files that are part of the last commit are considered in this datastructure.
 * For backwards compatibility the snapshot might include legacy checksums that
 * are derived from a dedicated checksum file written by older elasticsearch versions prior to 1.3
*
* Note: This class will ignore the {@code segments.gen} file since it's optional and might
* change concurrently for safety reasons.
*
* @see StoreFileMetadata
*
* @opensearch.api
*/
@PublicApi(since = "1.0.0")
public static final class MetadataSnapshot implements Iterable<StoreFileMetadata>, Writeable {
private final Map<String, StoreFileMetadata> metadata;
public static final MetadataSnapshot EMPTY = new MetadataSnapshot();
private final Map<String, String> commitUserData;
private final long numDocs;
public MetadataSnapshot(Map<String, StoreFileMetadata> metadata, Map<String, String> commitUserData, long numDocs) {
this.metadata = metadata;
this.commitUserData = commitUserData;
this.numDocs = numDocs;
}
MetadataSnapshot() {
metadata = emptyMap();
commitUserData = emptyMap();
numDocs = 0;
}
MetadataSnapshot(IndexCommit commit, Directory directory, Logger logger) throws IOException {
this(loadMetadata(commit, directory, logger));
}
MetadataSnapshot(SegmentInfos segmentInfos, Directory directory, Logger logger) throws IOException {
this(loadMetadata(segmentInfos, directory, logger));
}
private MetadataSnapshot(LoadedMetadata loadedMetadata) {
metadata = loadedMetadata.fileMetadata;
commitUserData = loadedMetadata.userData;
numDocs = loadedMetadata.numDocs;
assert metadata.isEmpty() || numSegmentFiles() == 1 : "numSegmentFiles: " + numSegmentFiles();
}
/**
* Read from a stream.
*/
public MetadataSnapshot(StreamInput in) throws IOException {
final int size = in.readVInt();
Map<String, StoreFileMetadata> metadata = new HashMap<>();
for (int i = 0; i < size; i++) {
StoreFileMetadata meta = new StoreFileMetadata(in);
metadata.put(meta.name(), meta);
}
Map<String, String> commitUserData = new HashMap<>();
int num = in.readVInt();
for (int i = num; i > 0; i--) {
commitUserData.put(in.readString(), in.readString());
}
this.metadata = unmodifiableMap(metadata);
this.commitUserData = unmodifiableMap(commitUserData);
this.numDocs = in.readLong();
assert metadata.isEmpty() || numSegmentFiles() == 1 : "numSegmentFiles: " + numSegmentFiles();
}
@Override
public void writeTo(StreamOutput out) throws IOException {
out.writeVInt(this.metadata.size());
for (StoreFileMetadata meta : this) {
meta.writeTo(out);
}
out.writeVInt(commitUserData.size());
for (Map.Entry<String, String> entry : commitUserData.entrySet()) {
out.writeString(entry.getKey());
out.writeString(entry.getValue());
}
out.writeLong(numDocs);
}
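/*
 * Illustrative sketch (not part of the original class): a MetadataSnapshot round-trips through the
 * stream protocol; BytesStreamOutput#bytes().streamInput() is assumed here as the in-memory counterpart:
 *
 *   BytesStreamOutput out = new BytesStreamOutput();
 *   snapshot.writeTo(out);
 *   MetadataSnapshot copy = new MetadataSnapshot(out.bytes().streamInput());
 *   assert copy.size() == snapshot.size();
 */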
/**
* Returns the number of documents in this store snapshot
*/
public long getNumDocs() {
return numDocs;
}
/**
* Metadata that is currently loaded
*
* @opensearch.internal
*/
static class LoadedMetadata {
final Map<String, StoreFileMetadata> fileMetadata;
final Map<String, String> userData;
final long numDocs;
LoadedMetadata(Map<String, StoreFileMetadata> fileMetadata, Map<String, String> userData, long numDocs) {
this.fileMetadata = fileMetadata;
this.userData = userData;
this.numDocs = numDocs;
}
}
static LoadedMetadata loadMetadata(IndexCommit commit, Directory directory, Logger logger) throws IOException {
try {
final SegmentInfos segmentCommitInfos = Store.readSegmentsInfo(commit, directory);
return loadMetadata(segmentCommitInfos, directory, logger);
} catch (CorruptIndexException | IndexNotFoundException | IndexFormatTooOldException | IndexFormatTooNewException ex) {
// we either know the index is corrupted or it's just not there
throw ex;
} catch (Exception ex) {
try {
// Lucene checks the checksum after it tries to lookup the codec etc.
// in that case we might get only IAE or similar exceptions while we are really corrupt...
// TODO we should check the checksum in lucene if we hit an exception
logger.warn(
() -> new ParameterizedMessage(
"failed to build store metadata. checking segment info integrity " + "(with commit [{}])",
commit == null ? "no" : "yes"
),
ex
);
Lucene.checkSegmentInfoIntegrity(directory);
} catch (CorruptIndexException | IndexFormatTooOldException | IndexFormatTooNewException cex) {
cex.addSuppressed(ex);
throw cex;
} catch (Exception inner) {
inner.addSuppressed(ex);
throw inner;
}
throw ex;
}
}
static LoadedMetadata loadMetadata(SegmentInfos segmentInfos, Directory directory, Logger logger) throws IOException {
return loadMetadata(segmentInfos, directory, logger, false);
}
static LoadedMetadata loadMetadata(SegmentInfos segmentInfos, Directory directory, Logger logger, boolean ignoreSegmentsFile)
throws IOException {
long numDocs = Lucene.getNumDocs(segmentInfos);
Map<String, String> commitUserDataBuilder = new HashMap<>();
commitUserDataBuilder.putAll(segmentInfos.getUserData());
Map<String, StoreFileMetadata> builder = new HashMap<>();
// we don't know which version was used to write so we take the max version.
Version maxVersion = segmentInfos.getMinSegmentLuceneVersion();
for (SegmentCommitInfo info : segmentInfos) {
final Version version = info.info.getVersion();
if (version == null) {
// version is written since 3.1+: we should have already hit IndexFormatTooOld.
throw new IllegalArgumentException("expected valid version value: " + info.info.toString());
}
// With segment replication enabled, we compute metadata snapshots from the latest in memory infos.
// In this case we will have SegmentInfos objects fetched from the primary's reader
// where the minSegmentLuceneVersion can be null even though there are segments.
// This is because the SegmentInfos object is not read from a commit/IndexInput, which sets
// minSegmentLuceneVersion.
if (maxVersion == null || version.onOrAfter(maxVersion)) {
maxVersion = version;
}
for (String file : info.files()) {
checksumFromLuceneFile(
directory,
file,
builder,
logger,
version,
SEGMENT_INFO_EXTENSION.equals(IndexFileNames.getExtension(file))
);
}
}
if (maxVersion == null) {
maxVersion = org.opensearch.Version.CURRENT.minimumIndexCompatibilityVersion().luceneVersion;
}
if (ignoreSegmentsFile == false) {
final String segmentsFile = segmentInfos.getSegmentsFileName();
checksumFromLuceneFile(directory, segmentsFile, builder, logger, maxVersion, true);
}
return new LoadedMetadata(unmodifiableMap(builder), unmodifiableMap(commitUserDataBuilder), numDocs);
}
private static void checksumFromLuceneFile(
Directory directory,
String file,
Map<String, StoreFileMetadata> builder,
Logger logger,
Version version,
boolean readFileAsHash
) throws IOException {
final String checksum;
final BytesRefBuilder fileHash = new BytesRefBuilder();
try (IndexInput in = directory.openInput(file, READONCE_CHECKSUM)) {
final long length;
try {
length = in.length();
if (length < CodecUtil.footerLength()) {
// truncated files trigger IAE if we seek negative... these files are really corrupted though
throw new CorruptIndexException(
"Can't retrieve checksum from file: "
+ file
+ " file length must be >= "
+ CodecUtil.footerLength()
+ " but was: "
+ in.length(),
in
);
}
if (readFileAsHash) {
// additional safety we checksum the entire file we read the hash for...
final VerifyingIndexInput verifyingIndexInput = new VerifyingIndexInput(in);
hashFile(fileHash, new InputStreamIndexInput(verifyingIndexInput, length), length);
checksum = digestToString(verifyingIndexInput.verify());
} else {
checksum = digestToString(CodecUtil.retrieveChecksum(in));
}
} catch (Exception ex) {
logger.debug(() -> new ParameterizedMessage("Can retrieve checksum from file [{}]", file), ex);
throw ex;
}
builder.put(file, new StoreFileMetadata(file, length, checksum, version, fileHash.get()));
}
}
/**
* Computes a strong hash value for small files. Note that this method should only be used for files < 1MB
*/
public static void hashFile(BytesRefBuilder fileHash, InputStream in, long size) throws IOException {
final int len = (int) Math.min(1024 * 1024, size); // for safety we limit this to 1MB
fileHash.grow(len);
fileHash.setLength(len);
final int readBytes = in.readNBytes(fileHash.bytes(), 0, len);
assert readBytes == len : Integer.toString(readBytes) + " != " + Integer.toString(len);
assert fileHash.length() == len : Integer.toString(fileHash.length()) + " != " + Integer.toString(len);
}
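/*
 * Illustrative sketch (not part of the original class): hashing a small in-memory file body the same way
 * the snapshot does for .si and segments_N files ("content" is a hypothetical byte array):
 *
 *   byte[] content = ...; // the complete file body, expected to be well under 1MB
 *   BytesRefBuilder fileHash = new BytesRefBuilder();
 *   hashFile(fileHash, new java.io.ByteArrayInputStream(content), content.length);
 *   BytesRef hash = fileHash.get();
 */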
@Override
public Iterator<StoreFileMetadata> iterator() {
return metadata.values().iterator();
}
public StoreFileMetadata get(String name) {
return metadata.get(name);
}
public Map<String, StoreFileMetadata> asMap() {
return metadata;
}
private static final String DEL_FILE_EXTENSION = "del"; // legacy delete file
private static final String LIV_FILE_EXTENSION = "liv"; // lucene 5 delete file
private static final String SEGMENT_INFO_EXTENSION = "si";
/**
* Helper method used to group store files according to segment and commit.
*
* @see MetadataSnapshot#recoveryDiff(MetadataSnapshot)
*/
private Iterable<List<StoreFileMetadata>> getGroupedFilesIterable() {
final Map<String, List<StoreFileMetadata>> perSegment = new HashMap<>();
final List<StoreFileMetadata> perCommitStoreFiles = new ArrayList<>();
for (StoreFileMetadata meta : this) {
final String segmentId = IndexFileNames.parseSegmentName(meta.name());
final String extension = IndexFileNames.getExtension(meta.name());
if (IndexFileNames.SEGMENTS.equals(segmentId)
|| DEL_FILE_EXTENSION.equals(extension)
|| LIV_FILE_EXTENSION.equals(extension)) {
// only treat del files as per-commit files; fnm files are generational but only for upgradable DV
perCommitStoreFiles.add(meta);
} else {
perSegment.computeIfAbsent(segmentId, k -> new ArrayList<>()).add(meta);
}
}
return Iterables.concat(perSegment.values(), Collections.singleton(perCommitStoreFiles));
}
/**
* Returns a diff between the two snapshots that can be used for recovery. The given snapshot is treated as the
* recovery target and this snapshot as the source. The returned diff will hold a list of files that are:
*
 * - identical: they exist in both snapshots and can be considered the same, i.e. they don't need to be recovered
 * - different: they exist in both snapshots but are not identical
 * - missing: files that exist in the source but not in the target
 *
 * This method groups files into per-segment files and per-commit files. A file is treated as
 * identical if and only if all files in its group are identical. On a per-segment level, files for a segment are treated
 * as identical iff:
 *
 * - all files in this segment have the same checksum
 * - all files in this segment have the same length
 * - the segment's {@code .si} file hashes are byte-identical. Note: this uses a perfect hash function;
 *   the metadata transfers the {@code .si} file content as its hash
 *
 *
 * The {@code .si} file contains a lot of diagnostics including a timestamp etc. In the future there might be
 * unique segment identifiers in there, hardening this method further.
 *
 * Per-commit files are handled very similarly. A commit is composed of the {@code segments_N} file as well as generational files
 * like deletes ({@code _x_y.del}) or field-info ({@code _x_y.fnm}) files. On a per-commit level, files for a commit are treated
 * as identical iff:
 *
 * - all files belonging to this commit have the same checksum
 * - all files belonging to this commit have the same length
 * - the commit's {@code segments_N} file hashes are byte-identical. Note: this uses a perfect hash function;
 *   the metadata transfers the {@code segments_N} file content as its hash
*
*
* NOTE: this diff will not contain the {@code segments.gen} file. This file is omitted on recovery.
*/
public RecoveryDiff recoveryDiff(MetadataSnapshot recoveryTargetSnapshot) {
final List<StoreFileMetadata> identical = new ArrayList<>();
final List<StoreFileMetadata> different = new ArrayList<>();
final List<StoreFileMetadata> missing = new ArrayList<>();
final ArrayList<StoreFileMetadata> identicalFiles = new ArrayList<>();
for (List<StoreFileMetadata> segmentFiles : getGroupedFilesIterable()) {
identicalFiles.clear();
boolean consistent = true;
for (StoreFileMetadata meta : segmentFiles) {
StoreFileMetadata storeFileMetadata = recoveryTargetSnapshot.get(meta.name());
if (storeFileMetadata == null) {
consistent = false;
missing.add(meta);
} else if (storeFileMetadata.isSame(meta) == false) {
consistent = false;
different.add(meta);
} else {
identicalFiles.add(meta);
}
}
if (consistent) {
identical.addAll(identicalFiles);
} else {
// make sure all files are added - this can happen if only the deletes are different
different.addAll(identicalFiles);
}
}
RecoveryDiff recoveryDiff = new RecoveryDiff(
Collections.unmodifiableList(identical),
Collections.unmodifiableList(different),
Collections.unmodifiableList(missing)
);
assert recoveryDiff.size() == this.metadata.size() : "some files are missing recoveryDiff size: ["
+ recoveryDiff.size()
+ "] metadata size: ["
+ this.metadata.size()
+ "]";
return recoveryDiff;
}
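/*
 * Illustrative sketch (not part of the original class) of using the diff during peer-recovery planning,
 * assuming "sourceSnapshot" is the primary's snapshot and "targetSnapshot" was read from the recovery target:
 *
 *   RecoveryDiff diff = sourceSnapshot.recoveryDiff(targetSnapshot);
 *   // diff.identical can be reused on the target; diff.different and diff.missing must be re-sent
 *   long bytesToRecover = 0;
 *   for (StoreFileMetadata md : Iterables.concat(diff.different, diff.missing)) {
 *       bytesToRecover += md.length();
 *   }
 */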
/**
* Returns the number of files in this snapshot
*/
public int size() {
return metadata.size();
}
public Map<String, String> getCommitUserData() {
return commitUserData;
}
/**
* returns the history uuid the store points at, or null if nonexistent.
*/
public String getHistoryUUID() {
return commitUserData.get(Engine.HISTORY_UUID_KEY);
}
/**
* Returns true iff this metadata contains the given file.
*/
public boolean contains(String existingFile) {
return metadata.containsKey(existingFile);
}
/**
* Returns the segments file that this metadata snapshot represents or null if the snapshot is empty.
*/
public StoreFileMetadata getSegmentsFile() {
for (StoreFileMetadata file : this) {
if (file.name().startsWith(IndexFileNames.SEGMENTS)) {
return file;
}
}
assert metadata.isEmpty();
return null;
}
private int numSegmentFiles() { // only for asserts
int count = 0;
for (StoreFileMetadata file : this) {
if (file.name().startsWith(IndexFileNames.SEGMENTS)) {
count++;
}
}
return count;
}
/**
* Returns the sync id of the commit point that this MetadataSnapshot represents.
*
* @return sync id if exists, else null
*/
public String getSyncId() {
return commitUserData.get(Engine.SYNC_COMMIT_ID);
}
}
/**
* A class representing the diff between a recovery source and recovery target
*
* @see MetadataSnapshot#recoveryDiff(org.opensearch.index.store.Store.MetadataSnapshot)
*
* @opensearch.api
*/
@PublicApi(since = "1.0.0")
public static final class RecoveryDiff {
/**
 * Files that exist in both snapshots and can be considered the same, i.e. they don't need to be recovered
 */
public final List<StoreFileMetadata> identical;
/**
 * Files that exist in both snapshots but are not identical
 */
public final List<StoreFileMetadata> different;
/**
 * Files that exist in the source but not in the target
 */
public final List<StoreFileMetadata> missing;
RecoveryDiff(List<StoreFileMetadata> identical, List<StoreFileMetadata> different, List<StoreFileMetadata> missing) {
this.identical = identical;
this.different = different;
this.missing = missing;
}
/**
* Returns the sum of the files in this diff.
*/
public int size() {
return identical.size() + different.size() + missing.size();
}
@Override
public String toString() {
return "RecoveryDiff{" + "identical=" + identical + ", different=" + different + ", missing=" + missing + '}';
}
}
/**
* Returns true if the file is auto-generated by the store and shouldn't be deleted during cleanup.
* This includes write lock and checksum files
*/
public static boolean isAutogenerated(String name) {
return IndexWriter.WRITE_LOCK_NAME.equals(name);
}
/**
* Produces a string representation of the given digest value.
*/
public static String digestToString(long digest) {
return Long.toString(digest, Character.MAX_RADIX);
}
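/*
 * Illustrative sketch (not part of the original class): digestToString renders a checksum as the radix-36
 * string used throughout this class (e.g. StoreFileMetadata#checksum()), so two checksums computed over
 * the same bytes compare as plain strings ("bytes" is a hypothetical buffer):
 *
 *   CRC32 crc = new CRC32();
 *   crc.update(bytes, 0, bytes.length);
 *   String checksum = Store.digestToString(crc.getValue());
 */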
/**
* Class to verify the lucene index output
*
* @opensearch.internal
*/
public static class LuceneVerifyingIndexOutput extends VerifyingIndexOutput {
private final StoreFileMetadata metadata;
private long writtenBytes;
private final long checksumPosition;
private String actualChecksum;
private final byte[] footerChecksum = new byte[8]; // this holds the actual footer checksum data written to this output
public LuceneVerifyingIndexOutput(StoreFileMetadata metadata, IndexOutput out) {
super(out);
this.metadata = metadata;
checksumPosition = metadata.length() - 8; // the last 8 bytes are the checksum - we store it in footerChecksum
}
@Override
public void verify() throws IOException {
String footerDigest = null;
if (metadata.checksum().equals(actualChecksum) && writtenBytes == metadata.length()) {
ByteArrayIndexInput indexInput = new ByteArrayIndexInput("checksum", this.footerChecksum);
footerDigest = digestToString(CodecUtil.readBELong(indexInput));
if (metadata.checksum().equals(footerDigest)) {
return;
}
}
throw new CorruptIndexException(
"verification failed (hardware problem?) : expected="
+ metadata.checksum()
+ " actual="
+ actualChecksum
+ " footer="
+ footerDigest
+ " writtenLength="
+ writtenBytes
+ " expectedLength="
+ metadata.length()
+ " (resource="
+ metadata.toString()
+ ")",
"VerifyingIndexOutput(" + metadata.name() + ")"
);
}
@Override
public void writeByte(byte b) throws IOException {
final long writtenBytes = this.writtenBytes++;
if (writtenBytes >= checksumPosition) { // we are writing parts of the checksum....
if (writtenBytes == checksumPosition) {
readAndCompareChecksum();
}
final int index = Math.toIntExact(writtenBytes - checksumPosition);
if (index < footerChecksum.length) {
footerChecksum[index] = b;
if (index == footerChecksum.length - 1) {
verify(); // we have recorded the entire checksum
}
} else {
verify(); // fail if we write more than expected
throw new AssertionError("write past EOF expected length: " + metadata.length() + " writtenBytes: " + writtenBytes);
}
}
out.writeByte(b);
}
private void readAndCompareChecksum() throws IOException {
actualChecksum = digestToString(getChecksum());
if (!metadata.checksum().equals(actualChecksum)) {
throw new CorruptIndexException(
"checksum failed (hardware problem?) : expected="
+ metadata.checksum()
+ " actual="
+ actualChecksum
+ " (resource="
+ metadata.toString()
+ ")",
"VerifyingIndexOutput(" + metadata.name() + ")"
);
}
}
@Override
public void writeBytes(byte[] b, int offset, int length) throws IOException {
if (writtenBytes + length > checksumPosition) {
for (int i = 0; i < length; i++) { // don't optimize writing the last block of bytes
writeByte(b[offset + i]);
}
} else {
out.writeBytes(b, offset, length);
writtenBytes += length;
}
}
}
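// Editor's sketch of the intended write-side usage (assumes a Store instance and a StoreFileMetadata
// "md" for the file being recovered; createVerifyingOutput and the static verify helper are declared
// elsewhere in this class):
//
//   try (IndexOutput output = store.createVerifyingOutput(md.name(), md, IOContext.DEFAULT)) {
//       // ... copy exactly md.length() bytes into "output" ...
//       Store.verify(output); // throws CorruptIndexException if the length or footer checksum mismatch
//   }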
/**
* Index input that calculates checksum as data is read from the input.
*
* This class supports random access (it is possible to seek backward and forward) in order to accommodate the retry
* mechanism used by some repository plugins (S3, for example). However, the checksum is only calculated on
* the first read of any given range; re-reads of already-verified data do not contribute to the checksum.
*
* @opensearch.internal
*/
static class VerifyingIndexInput extends ChecksumIndexInput {
private final IndexInput input;
private final Checksum digest;
private final long checksumPosition;
private final byte[] checksum = new byte[8];
private long verifiedPosition = 0;
VerifyingIndexInput(IndexInput input) {
this(input, new BufferedChecksum(new CRC32()));
}
VerifyingIndexInput(IndexInput input, Checksum digest) {
super("VerifyingIndexInput(" + input + ")");
this.input = input;
this.digest = digest;
checksumPosition = input.length() - 8;
}
@Override
public byte readByte() throws IOException {
long pos = input.getFilePointer();
final byte b = input.readByte();
pos++;
if (pos > verifiedPosition) {
if (pos <= checksumPosition) {
digest.update(b);
} else {
checksum[(int) (pos - checksumPosition - 1)] = b;
}
verifiedPosition = pos;
}
return b;
}
@Override
public void readBytes(byte[] b, int offset, int len) throws IOException {
long pos = input.getFilePointer();
input.readBytes(b, offset, len);
if (pos + len > verifiedPosition) {
// Conversion to int is safe here because (verifiedPosition - pos) can be at most len, which is an int
int alreadyVerified = (int) Math.max(0, verifiedPosition - pos);
if (pos < checksumPosition) {
if (pos + len < checksumPosition) {
digest.update(b, offset + alreadyVerified, len - alreadyVerified);
} else {
int checksumOffset = (int) (checksumPosition - pos);
if (checksumOffset - alreadyVerified > 0) {
digest.update(b, offset + alreadyVerified, checksumOffset - alreadyVerified);
}
System.arraycopy(b, offset + checksumOffset, checksum, 0, len - checksumOffset);
}
} else {
// Conversion to int is safe here because checksumPosition is (file length - 8) so
// (pos - checksumPosition) cannot be bigger than 8 unless we are reading after the end of file
assert pos - checksumPosition < 8;
System.arraycopy(b, offset, checksum, (int) (pos - checksumPosition), len);
}
verifiedPosition = pos + len;
}
}
@Override
public long getChecksum() {
return digest.getValue();
}
@Override
public void seek(long pos) throws IOException {
if (pos < verifiedPosition) {
// going within verified region - just seek there
input.seek(pos);
} else {
if (verifiedPosition > getFilePointer()) {
// part of the region being skipped is verified and part is not:
// seek directly over the verified portion
input.seek(verifiedPosition);
// and checksum the remaining, unverified bytes
super.seek(pos);
} else {
super.seek(pos);
}
}
}
@Override
public void close() throws IOException {
input.close();
}
@Override
public long getFilePointer() {
return input.getFilePointer();
}
@Override
public long length() {
return input.length();
}
@Override
public IndexInput clone() {
throw new UnsupportedOperationException();
}
@Override
public IndexInput slice(String sliceDescription, long offset, long length) throws IOException {
throw new UnsupportedOperationException();
}
public long getStoredChecksum() {
try {
return CodecUtil.readBELong(new ByteArrayDataInput(checksum));
} catch (IOException e) {
throw new UncheckedIOException(e);
}
}
public long verify() throws CorruptIndexException, IOException {
long storedChecksum = getStoredChecksum();
if (getChecksum() == storedChecksum) {
return storedChecksum;
}
throw new CorruptIndexException(
"verification failed : calculated="
+ Store.digestToString(getChecksum())
+ " stored="
+ Store.digestToString(storedChecksum),
this
);
}
}
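// Editor's sketch of the read-side counterpart (assumes openVerifyingInput and the static verify
// helper declared elsewhere in this class, plus a StoreFileMetadata "md" for the file being read):
//
//   try (IndexInput in = store.openVerifyingInput(md.name(), IOContext.READONCE, md)) {
//       // ... stream all md.length() bytes; retries may seek backwards without re-hashing verified data ...
//       Store.verify(in); // compares the computed CRC32 against the 8-byte footer value
//   }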
public void deleteQuiet(String... files) {
ensureOpen();
StoreDirectory directory = this.directory;
for (String file : files) {
try {
directory.deleteFile("Store.deleteQuiet", file);
} catch (Exception ex) {
// ignore :(
}
}
}
/**
* Marks this store as corrupted. This method writes a {@code corrupted_${uuid}} file containing the given exception
* message. If a store contains a {@code corrupted_${uuid}} file, {@link #isMarkedCorrupted()} will return true.
*/
public void markStoreCorrupted(IOException exception) throws IOException {
ensureOpen();
if (!isMarkedCorrupted()) {
final String corruptionMarkerName = CORRUPTED_MARKER_NAME_PREFIX + UUIDs.randomBase64UUID();
try (IndexOutput output = this.directory().createOutput(corruptionMarkerName, IOContext.DEFAULT)) {
CodecUtil.writeHeader(output, CODEC, CORRUPTED_MARKER_CODEC_VERSION);
BytesStreamOutput out = new BytesStreamOutput();
out.writeException(exception);
BytesReference bytes = out.bytes();
output.writeVInt(bytes.length());
BytesRef ref = bytes.toBytesRef();
output.writeBytes(ref.bytes, ref.offset, ref.length);
CodecUtil.writeFooter(output);
} catch (IOException ex) {
logger.warn("Can't mark store as corrupted", ex);
}
directory().sync(Collections.singleton(corruptionMarkerName));
}
}
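// Editor's sketch of a typical call site: persist the corruption marker when Lucene reports an index
// level problem, so the shard remains failed across restarts (surrounding failure handling assumed):
//
//   } catch (CorruptIndexException | IndexFormatTooOldException | IndexFormatTooNewException e) {
//       store.markStoreCorrupted(e); // all three exception types extend IOException
//       throw e;
//   }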
/**
* A listener that is executed once the store is closed and all references to it are released
*
* @opensearch.internal
*/
public interface OnClose extends Consumer<ShardLock> {
OnClose EMPTY = new OnClose() {
/**
* This method is called while the provided {@link org.opensearch.env.ShardLock} is held.
* This method is only called once after all resources for a store are released.
*/
@Override
public void accept(ShardLock shardLock) {}
};
}
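// Editor's note: because OnClose is a Consumer<ShardLock>, callers can pass a lambda when wiring up a
// Store (the constructor shape shown here is illustrative only):
//
//   Store.OnClose onClose = shardLock -> logger.trace("store for {} fully released", shardLock.getShardId());
//   // new Store(shardId, indexSettings, directory, shardLock, onClose);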
public void createEmpty(Version luceneVersion, String translogUUID) throws IOException {
metadataLock.writeLock().lock();
try (IndexWriter writer = newEmptyIndexWriter(directory, luceneVersion)) {
final Map<String, String> map = new HashMap<>();
if (translogUUID != null) {
map.put(Translog.TRANSLOG_UUID_KEY, translogUUID);
}
map.put(Engine.HISTORY_UUID_KEY, UUIDs.randomBase64UUID());
map.put(SequenceNumbers.LOCAL_CHECKPOINT_KEY, Long.toString(SequenceNumbers.NO_OPS_PERFORMED));
map.put(SequenceNumbers.MAX_SEQ_NO, Long.toString(SequenceNumbers.NO_OPS_PERFORMED));
map.put(Engine.MAX_UNSAFE_AUTO_ID_TIMESTAMP_COMMIT_ID, "-1");
updateCommitData(writer, map);
} finally {
metadataLock.writeLock().unlock();
}
}
/**
* creates an empty lucene index and a corresponding empty translog. Any existing data will be deleted.
*/
public void createEmpty(Version luceneVersion) throws IOException {
createEmpty(luceneVersion, null);
}
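// Editor's sketch of the usual bootstrap sequence for a brand-new shard. The translog call is shown
// with an assumed Translog.createEmptyTranslog signature and does not belong to this class:
//
//   store.createEmpty(Version.LATEST);
//   // String translogUUID = Translog.createEmptyTranslog(translogPath, SequenceNumbers.NO_OPS_PERFORMED, shardId, primaryTerm);
//   // store.associateIndexWithNewTranslog(translogUUID);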
/**
* Marks an existing lucene index with a new history uuid.
* This is used to make sure no existing shard will recover from this index using ops based recovery.
*/
public void bootstrapNewHistory() throws IOException {
metadataLock.writeLock().lock();
try {
Map<String, String> userData = readLastCommittedSegmentsInfo().getUserData();
final long maxSeqNo = Long.parseLong(userData.get(SequenceNumbers.MAX_SEQ_NO));
final long localCheckpoint = Long.parseLong(userData.get(SequenceNumbers.LOCAL_CHECKPOINT_KEY));
bootstrapNewHistory(localCheckpoint, maxSeqNo);
} finally {
metadataLock.writeLock().unlock();
}
}
/**
* Marks an existing lucene index with a new history uuid and sets the given local checkpoint
* as well as the maximum sequence number.
* This is used to make sure no existing shard will recover from this index using ops based recovery.
* @see SequenceNumbers#LOCAL_CHECKPOINT_KEY
* @see SequenceNumbers#MAX_SEQ_NO
*/
public void bootstrapNewHistory(long localCheckpoint, long maxSeqNo) throws IOException {
metadataLock.writeLock().lock();
try (IndexWriter writer = newAppendingIndexWriter(directory, null)) {
final Map<String, String> map = new HashMap<>();
map.put(Engine.HISTORY_UUID_KEY, UUIDs.randomBase64UUID());
map.put(SequenceNumbers.LOCAL_CHECKPOINT_KEY, Long.toString(localCheckpoint));
map.put(SequenceNumbers.MAX_SEQ_NO, Long.toString(maxSeqNo));
updateCommitData(writer, map);
} finally {
metadataLock.writeLock().unlock();
}
}
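// Editor's note: the no-argument overload above reads LOCAL_CHECKPOINT_KEY and MAX_SEQ_NO from the
// last commit and delegates here, so these two calls are roughly equivalent for such an index:
//
//   store.bootstrapNewHistory();
//   // store.bootstrapNewHistory(localCheckpoint, maxSeqNo); // with values read by the caller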
/**
* Force bakes the given translog UUID as the recovery information in the lucene index. This is
* used when recovering from a snapshot or a peer file-based recovery where a new empty translog is
* created and the existing lucene index needs to be changed to use it.
*/
public void associateIndexWithNewTranslog(final String translogUUID) throws IOException {
metadataLock.writeLock().lock();
try (IndexWriter writer = newAppendingIndexWriter(directory, null)) {
if (translogUUID.equals(getUserData(writer).get(Translog.TRANSLOG_UUID_KEY))) {
throw new IllegalArgumentException("a new translog uuid can't be equal to existing one. got [" + translogUUID + "]");
}
updateCommitData(writer, Collections.singletonMap(Translog.TRANSLOG_UUID_KEY, translogUUID));
} finally {
metadataLock.writeLock().unlock();
}
}
/**
* Checks that the Lucene index contains a history uuid marker. If not, a new one is generated and committed.
*/
public void ensureIndexHasHistoryUUID() throws IOException {
metadataLock.writeLock().lock();
try (IndexWriter writer = newAppendingIndexWriter(directory, null)) {
final Map<String, String> userData = getUserData(writer);
if (userData.containsKey(Engine.HISTORY_UUID_KEY) == false) {
updateCommitData(writer, Collections.singletonMap(Engine.HISTORY_UUID_KEY, UUIDs.randomBase64UUID()));
}
} finally {
metadataLock.writeLock().unlock();
}
}
/**
* Keeping existing unsafe commits when opening an engine can be problematic because these commits are not safe
* at recovery time but they can suddenly become safe in the future.
* The following issues can happen if unsafe commits are kept on init.
*
* 1. Replica can use unsafe commit in peer-recovery. This happens when a replica with a safe commit c1(max_seqno=1)
* and an unsafe commit c2(max_seqno=2) recovers from a primary with c1(max_seqno=1). If a new document(seqno=2)
* is added without flushing, the global checkpoint is advanced to 2; when the replica recovers again, it will use
* the unsafe commit c2(max_seqno=2 at most gcp=2) as the starting commit for sequence-based recovery even though
* the commit c2 contains a stale operation and the document(with seqno=2) will not be replicated to the replica.
*
* 2. Min translog gen for recovery can go backwards in peer-recovery. This happens when a replica has a safe commit
* c1(local_checkpoint=1, recovery_translog_gen=1) and an unsafe commit c2(local_checkpoint=2, recovery_translog_gen=2).
* The replica recovers from a primary, keeps c2 as the last commit, then sets last_translog_gen to 2. Flushing a new
* commit on the replica will cause an exception as the new last commit c3 will have recovery_translog_gen=1. The recovery
* translog generation of a commit is calculated based on the current local checkpoint. The local checkpoint of c3 is 1
* while the local checkpoint of c2 is 2.
*/
public void trimUnsafeCommits(final Path translogPath) throws IOException {
metadataLock.writeLock().lock();
try {
final List<IndexCommit> existingCommits = DirectoryReader.listCommits(directory);
assert existingCommits.isEmpty() == false : "No index found to trim";
final IndexCommit lastIndexCommit = existingCommits.get(existingCommits.size() - 1);
final String translogUUID = lastIndexCommit.getUserData().get(Translog.TRANSLOG_UUID_KEY);
final long lastSyncedGlobalCheckpoint = Translog.readGlobalCheckpoint(translogPath, translogUUID);
final IndexCommit startingIndexCommit = CombinedDeletionPolicy.findSafeCommitPoint(existingCommits, lastSyncedGlobalCheckpoint);
if (translogUUID.equals(startingIndexCommit.getUserData().get(Translog.TRANSLOG_UUID_KEY)) == false) {
throw new IllegalStateException(
"starting commit translog uuid ["
+ startingIndexCommit.getUserData().get(Translog.TRANSLOG_UUID_KEY)
+ "] is not equal to last commit's translog uuid ["
+ translogUUID
+ "]"
);
}
if (startingIndexCommit.equals(lastIndexCommit) == false) {
try (IndexWriter writer = newAppendingIndexWriter(directory, startingIndexCommit)) {
// this achieves two things:
// - by committing a new commit based on the starting commit, it makes sure the starting commit will be opened
// - deletes any other commit (by lucene standard deletion policy)
//
// note that we can't just use IndexCommit.delete() as we really want to make sure that those files won't be used
// even if a virus scanner causes the files not to be deleted.
// The new commit will use segment files from the starting commit but userData from the last commit by default.
// Thus, we need to manually set the userData from the starting commit to the new commit.
writer.setLiveCommitData(startingIndexCommit.getUserData().entrySet());
writer.commit();
}
}
} finally {
metadataLock.writeLock().unlock();
}
}
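// Editor's sketch of a typical invocation during local store recovery ("shardPath" and its
// resolveTranslog() accessor are assumed):
//
//   store.trimUnsafeCommits(shardPath.resolveTranslog());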
/**
* Returns a {@link org.opensearch.index.seqno.SequenceNumbers.CommitInfo} of the safe commit if one exists.
*/
public Optional<SequenceNumbers.CommitInfo> findSafeIndexCommit(long globalCheckpoint) throws IOException {
final List<IndexCommit> commits = DirectoryReader.listCommits(directory);
assert commits.isEmpty() == false : "no commit found";
final IndexCommit safeCommit = CombinedDeletionPolicy.findSafeCommitPoint(commits, globalCheckpoint);
final SequenceNumbers.CommitInfo commitInfo = SequenceNumbers.loadSeqNoInfoFromLuceneCommit(safeCommit.getUserData().entrySet());
// all operations of the safe commit must be at most the global checkpoint.
if (commitInfo.maxSeqNo <= globalCheckpoint) {
return Optional.of(commitInfo);
} else {
return Optional.empty();
}
}
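// Editor's sketch: peer recovery can use this to decide between operations-based and file-based
// recovery (the decision logic shown is illustrative, not the actual recovery code):
//
//   Optional<SequenceNumbers.CommitInfo> safeCommit = store.findSafeIndexCommit(globalCheckpoint);
//   if (safeCommit.isPresent()) {
//       long localCheckpointOfSafeCommit = safeCommit.get().localCheckpoint; // replay ops above this point
//   } else {
//       // no safe commit at or below the global checkpoint -> fall back to copying files
//   }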
private static void updateCommitData(IndexWriter writer, Map<String, String> keysToUpdate) throws IOException {
final Map<String, String> userData = getUserData(writer);
userData.putAll(keysToUpdate);
writer.setLiveCommitData(userData.entrySet());
writer.commit();
}
private static Map<String, String> getUserData(IndexWriter writer) {
final Map<String, String> userData = new HashMap<>();
writer.getLiveCommitData().forEach(e -> userData.put(e.getKey(), e.getValue()));
return userData;
}
private static IndexWriter newAppendingIndexWriter(final Directory dir, final IndexCommit commit) throws IOException {
IndexWriterConfig iwc = newIndexWriterConfig().setIndexCommit(commit).setOpenMode(IndexWriterConfig.OpenMode.APPEND);
return new IndexWriter(dir, iwc);
}
private static IndexWriter newEmptyIndexWriter(final Directory dir, final Version luceneVersion) throws IOException {
IndexWriterConfig iwc = newIndexWriterConfig().setOpenMode(IndexWriterConfig.OpenMode.CREATE)
.setIndexCreatedVersionMajor(luceneVersion.major);
return new IndexWriter(dir, iwc);
}
private static IndexWriterConfig newIndexWriterConfig() {
return new IndexWriterConfig(null).setSoftDeletesField(Lucene.SOFT_DELETES_FIELD)
.setCommitOnClose(false)
// we don't want merges to happen here - we call maybe merge on the engine
// later once we have started it up, otherwise we would need to wait for it here
// we also don't specify a codec here; merges should use the engine's codec for this index
.setMergePolicy(NoMergePolicy.INSTANCE);
}
}