All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.opensearch.index.translog.Translog Maven / Gradle / Ivy

There is a newer version: 2.18.0
Show newest version
/*
 * SPDX-License-Identifier: Apache-2.0
 *
 * The OpenSearch Contributors require contributions made to
 * this file be licensed under the Apache-2.0 license or a
 * compatible open source license.
 */

/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*
 * Modifications Copyright OpenSearch Contributors. See
 * GitHub history for details.
 */

package org.opensearch.index.translog;

import org.apache.logging.log4j.message.ParameterizedMessage;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.AlreadyClosedException;
import org.opensearch.LegacyESVersion;
import org.opensearch.common.Nullable;
import org.opensearch.common.Strings;
import org.opensearch.common.UUIDs;
import org.opensearch.common.bytes.BytesArray;
import org.opensearch.common.bytes.BytesReference;
import org.opensearch.common.io.stream.ReleasableBytesStreamOutput;
import org.opensearch.common.io.stream.StreamInput;
import org.opensearch.common.io.stream.StreamOutput;
import org.opensearch.common.lease.Releasable;
import org.opensearch.common.lease.Releasables;
import org.opensearch.common.lucene.uid.Versions;
import org.opensearch.common.util.BigArrays;
import org.opensearch.common.util.concurrent.ReleasableLock;
import org.opensearch.core.internal.io.IOUtils;
import org.opensearch.index.IndexSettings;
import org.opensearch.index.VersionType;
import org.opensearch.index.engine.Engine;
import org.opensearch.index.engine.MissingHistoryOperationsException;
import org.opensearch.index.seqno.SequenceNumbers;
import org.opensearch.index.shard.AbstractIndexShardComponent;
import org.opensearch.index.shard.IndexShardComponent;
import org.opensearch.index.shard.ShardId;

import java.io.Closeable;
import java.io.EOFException;
import java.io.IOException;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.nio.file.StandardOpenOption;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.OptionalLong;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.function.LongConsumer;
import java.util.function.LongSupplier;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import static org.opensearch.index.translog.TranslogConfig.EMPTY_TRANSLOG_BUFFER_SIZE;

/**
 * A Translog is a per index shard component that records all non-committed index operations in a durable manner.
 * In OpenSearch there is one Translog instance per {@link org.opensearch.index.engine.InternalEngine}.
 * Additionally, the engine also records a {@link #TRANSLOG_UUID_KEY} with each commit to ensure a strong
 * association between the lucene index an the transaction log file. This UUID is used to prevent accidental recovery from a transaction
 * log that belongs to a
 * different engine.
 * 

* Each Translog has only one translog file open for writes at any time referenced by a translog generation ID. This ID is written to a * {@code translog.ckp} file that is designed to fit in a single disk block such that a write of the file is atomic. The checkpoint file * is written on each fsync operation of the translog and records the number of operations written, the current translog's file generation, * its fsynced offset in bytes, and other important statistics. *

*

* When the current translog file reaches a certain size ({@link IndexSettings#INDEX_TRANSLOG_GENERATION_THRESHOLD_SIZE_SETTING}, or when * a clear separation between old and new operations (upon change in primary term), the current file is reopened for read only and a new * write only file is created. Any non-current, read only translog file always has a {@code translog-${gen}.ckp} associated with it * which is an fsynced copy of its last {@code translog.ckp} such that in disaster recovery last fsynced offsets, number of * operation etc. are still preserved. *

*/ public class Translog extends AbstractIndexShardComponent implements IndexShardComponent, Closeable { /* * TODO * - we might need something like a deletion policy to hold on to more than one translog eventually (I think sequence IDs needs this) * but we can refactor as we go * - use a simple BufferedOutputStream to write stuff and fold BufferedTranslogWriter into it's super class... the tricky bit is we * need to be able to do random access reads even from the buffer * - we need random exception on the FileSystem API tests for all this. * - we need to page align the last write before we sync, we can take advantage of ensureSynced for this since we might have already * fsynced far enough */ public static final String TRANSLOG_UUID_KEY = "translog_uuid"; public static final String TRANSLOG_FILE_PREFIX = "translog-"; public static final String TRANSLOG_FILE_SUFFIX = ".tlog"; public static final String CHECKPOINT_SUFFIX = ".ckp"; public static final String CHECKPOINT_FILE_NAME = "translog" + CHECKPOINT_SUFFIX; static final Pattern PARSE_STRICT_ID_PATTERN = Pattern.compile("^" + TRANSLOG_FILE_PREFIX + "(\\d+)(\\.tlog)$"); public static final int DEFAULT_HEADER_SIZE_IN_BYTES = TranslogHeader.headerSizeInBytes(UUIDs.randomBase64UUID()); // the list of translog readers is guaranteed to be in order of translog generation private final List readers = new ArrayList<>(); private final BigArrays bigArrays; protected final ReleasableLock readLock; protected final ReleasableLock writeLock; private final Path location; private TranslogWriter current; protected final TragicExceptionHolder tragedy = new TragicExceptionHolder(); private final AtomicBoolean closed = new AtomicBoolean(); private final TranslogConfig config; private final LongSupplier globalCheckpointSupplier; private final LongSupplier primaryTermSupplier; private final String translogUUID; private final TranslogDeletionPolicy deletionPolicy; private final LongConsumer persistedSequenceNumberConsumer; /** * Creates a new Translog instance. This method will create a new transaction log unless the given {@link TranslogGeneration} is * {@code null}. If the generation is {@code null} this method is destructive and will delete all files in the translog path given. If * the generation is not {@code null}, this method tries to open the given translog generation. The generation is treated as the last * generation referenced from already committed data. This means all operations that have not yet been committed should be in the * translog file referenced by this generation. The translog creation will fail if this generation can't be opened. * * @param config the configuration of this translog * @param translogUUID the translog uuid to open, null for a new translog * @param deletionPolicy an instance of {@link TranslogDeletionPolicy} that controls when a translog file can be safely * deleted * @param globalCheckpointSupplier a supplier for the global checkpoint * @param primaryTermSupplier a supplier for the latest value of primary term of the owning index shard. The latest term value is * examined and stored in the header whenever a new generation is rolled. It's guaranteed from outside * that a new generation is rolled when the term is increased. This guarantee allows to us to validate * and reject operation whose term is higher than the primary term stored in the translog header. * @param persistedSequenceNumberConsumer a callback that's called whenever an operation with a given sequence number is successfully * persisted. */ public Translog( final TranslogConfig config, final String translogUUID, TranslogDeletionPolicy deletionPolicy, final LongSupplier globalCheckpointSupplier, final LongSupplier primaryTermSupplier, final LongConsumer persistedSequenceNumberConsumer ) throws IOException { super(config.getShardId(), config.getIndexSettings()); this.config = config; this.globalCheckpointSupplier = globalCheckpointSupplier; this.primaryTermSupplier = primaryTermSupplier; this.persistedSequenceNumberConsumer = persistedSequenceNumberConsumer; this.deletionPolicy = deletionPolicy; this.translogUUID = translogUUID; bigArrays = config.getBigArrays(); ReadWriteLock rwl = new ReentrantReadWriteLock(); readLock = new ReleasableLock(rwl.readLock()); writeLock = new ReleasableLock(rwl.writeLock()); this.location = config.getTranslogPath(); Files.createDirectories(this.location); try { final Checkpoint checkpoint = readCheckpoint(location); final Path nextTranslogFile = location.resolve(getFilename(checkpoint.generation + 1)); final Path currentCheckpointFile = location.resolve(getCommitCheckpointFileName(checkpoint.generation)); // this is special handling for error condition when we create a new writer but we fail to bake // the newly written file (generation+1) into the checkpoint. This is still a valid state // we just need to cleanup before we continue // we hit this before and then blindly deleted the new generation even though we managed to bake it in and then hit this: // https://discuss.elastic.co/t/cannot-recover-index-because-of-missing-tanslog-files/38336 as an example // // For this to happen we must have already copied the translog.ckp file into translog-gen.ckp so we first check if that // file exists. If not we don't even try to clean it up and wait until we fail creating it assert Files.exists(nextTranslogFile) == false || Files.size(nextTranslogFile) <= TranslogHeader.headerSizeInBytes(translogUUID) : "unexpected translog file: [" + nextTranslogFile + "]"; if (Files.exists(currentCheckpointFile) // current checkpoint is already copied && Files.deleteIfExists(nextTranslogFile)) { // delete it and log a warning logger.warn( "deleted previously created, but not yet committed, next generation [{}]. This can happen due to a" + " tragic exception when creating a new generation", nextTranslogFile.getFileName() ); } this.readers.addAll(recoverFromFiles(checkpoint)); if (readers.isEmpty()) { throw new IllegalStateException("at least one reader must be recovered"); } boolean success = false; current = null; try { current = createWriter( checkpoint.generation + 1, getMinFileGeneration(), checkpoint.globalCheckpoint, persistedSequenceNumberConsumer ); success = true; } finally { // we have to close all the recovered ones otherwise we leak file handles here // for instance if we have a lot of tlog and we can't create the writer we keep on holding // on to all the uncommitted tlog files if we don't close if (success == false) { IOUtils.closeWhileHandlingException(readers); } } } catch (Exception e) { // close the opened translog files if we fail to create a new translog... IOUtils.closeWhileHandlingException(current); IOUtils.closeWhileHandlingException(readers); throw e; } } /** recover all translog files found on disk */ private ArrayList recoverFromFiles(Checkpoint checkpoint) throws IOException { boolean success = false; ArrayList foundTranslogs = new ArrayList<>(); try (ReleasableLock ignored = writeLock.acquire()) { logger.debug("open uncommitted translog checkpoint {}", checkpoint); final long minGenerationToRecoverFrom = checkpoint.minTranslogGeneration; // we open files in reverse order in order to validate the translog uuid before we start traversing the translog based on // the generation id we found in the lucene commit. This gives for better error messages if the wrong // translog was found. for (long i = checkpoint.generation; i >= minGenerationToRecoverFrom; i--) { Path committedTranslogFile = location.resolve(getFilename(i)); if (Files.exists(committedTranslogFile) == false) { throw new TranslogCorruptedException( committedTranslogFile.toString(), "translog file doesn't exist with generation: " + i + " recovering from: " + minGenerationToRecoverFrom + " checkpoint: " + checkpoint.generation + " - translog ids must be consecutive" ); } final Checkpoint readerCheckpoint = i == checkpoint.generation ? checkpoint : Checkpoint.read(location.resolve(getCommitCheckpointFileName(i))); final TranslogReader reader = openReader(committedTranslogFile, readerCheckpoint); assert reader.getPrimaryTerm() <= primaryTermSupplier.getAsLong() : "Primary terms go backwards; current term [" + primaryTermSupplier.getAsLong() + "] translog path [ " + committedTranslogFile + ", existing term [" + reader.getPrimaryTerm() + "]"; foundTranslogs.add(reader); logger.debug("recovered local translog from checkpoint {}", checkpoint); } Collections.reverse(foundTranslogs); // when we clean up files, we first update the checkpoint with a new minReferencedTranslog and then delete them; // if we crash just at the wrong moment, it may be that we leave one unreferenced file behind so we delete it if there IOUtils.deleteFilesIgnoringExceptions( location.resolve(getFilename(minGenerationToRecoverFrom - 1)), location.resolve(getCommitCheckpointFileName(minGenerationToRecoverFrom - 1)) ); Path commitCheckpoint = location.resolve(getCommitCheckpointFileName(checkpoint.generation)); if (Files.exists(commitCheckpoint)) { Checkpoint checkpointFromDisk = Checkpoint.read(commitCheckpoint); if (checkpoint.equals(checkpointFromDisk) == false) { throw new TranslogCorruptedException( commitCheckpoint.toString(), "checkpoint file " + commitCheckpoint.getFileName() + " already exists but has corrupted content: expected " + checkpoint + " but got " + checkpointFromDisk ); } } else { copyCheckpointTo(commitCheckpoint); } success = true; } finally { if (success == false) { IOUtils.closeWhileHandlingException(foundTranslogs); } } return foundTranslogs; } private void copyCheckpointTo(Path targetPath) throws IOException { // a temp file to copy checkpoint to - note it must be in on the same FS otherwise atomic move won't work final Path tempFile = Files.createTempFile(location, TRANSLOG_FILE_PREFIX, CHECKPOINT_SUFFIX); boolean tempFileRenamed = false; try { // we first copy this into the temp-file and then fsync it followed by an atomic move into the target file // that way if we hit a disk-full here we are still in an consistent state. Files.copy(location.resolve(CHECKPOINT_FILE_NAME), tempFile, StandardCopyOption.REPLACE_EXISTING); IOUtils.fsync(tempFile, false); Files.move(tempFile, targetPath, StandardCopyOption.ATOMIC_MOVE); tempFileRenamed = true; // we only fsync the directory the tempFile was already fsynced IOUtils.fsync(targetPath.getParent(), true); } finally { if (tempFileRenamed == false) { try { Files.delete(tempFile); } catch (IOException ex) { logger.warn(() -> new ParameterizedMessage("failed to delete temp file {}", tempFile), ex); } } } } TranslogReader openReader(Path path, Checkpoint checkpoint) throws IOException { FileChannel channel = FileChannel.open(path, StandardOpenOption.READ); try { assert Translog.parseIdFromFileName(path) == checkpoint.generation : "expected generation: " + Translog.parseIdFromFileName(path) + " but got: " + checkpoint.generation; TranslogReader reader = TranslogReader.open(channel, path, checkpoint, translogUUID); channel = null; return reader; } finally { IOUtils.close(channel); } } /** * Extracts the translog generation from a file name. * * @throws IllegalArgumentException if the path doesn't match the expected pattern. */ public static long parseIdFromFileName(Path translogFile) { final String fileName = translogFile.getFileName().toString(); final Matcher matcher = PARSE_STRICT_ID_PATTERN.matcher(fileName); if (matcher.matches()) { try { return Long.parseLong(matcher.group(1)); } catch (NumberFormatException e) { throw new IllegalStateException( "number formatting issue in a file that passed PARSE_STRICT_ID_PATTERN: " + fileName + "]", e ); } } throw new IllegalArgumentException("can't parse id from file: " + fileName); } /** Returns {@code true} if this {@code Translog} is still open. */ public boolean isOpen() { return closed.get() == false; } private static boolean calledFromOutsideOrViaTragedyClose() { List frames = Stream.of(Thread.currentThread().getStackTrace()).skip(3). // skip getStackTrace, current method // and close method frames limit(10). // limit depth of analysis to 10 frames, it should be enough to catch closing with, e.g. IOUtils filter(f -> { try { return Translog.class.isAssignableFrom(Class.forName(f.getClassName())); } catch (Exception ignored) { return false; } }). // find all inner callers including Translog subclasses collect(Collectors.toList()); // the list of inner callers should be either empty or should contain closeOnTragicEvent method return frames.isEmpty() || frames.stream().anyMatch(f -> f.getMethodName().equals("closeOnTragicEvent")); } @Override public void close() throws IOException { assert calledFromOutsideOrViaTragedyClose() : "Translog.close method is called from inside Translog, but not via closeOnTragicEvent method"; if (closed.compareAndSet(false, true)) { try (ReleasableLock lock = writeLock.acquire()) { try { current.sync(); } finally { closeFilesIfNoPendingRetentionLocks(); } } finally { logger.debug("translog closed"); } } } /** * Returns all translog locations as absolute paths. * These paths don't contain actual translog files they are * directories holding the transaction logs. */ public Path location() { return location; } /** * Returns the generation of the current transaction log. */ public long currentFileGeneration() { try (ReleasableLock ignored = readLock.acquire()) { return current.getGeneration(); } } /** * Returns the minimum file generation referenced by the translog */ public long getMinFileGeneration() { try (ReleasableLock ignored = readLock.acquire()) { if (readers.isEmpty()) { return current.getGeneration(); } else { assert readers.stream().map(TranslogReader::getGeneration).min(Long::compareTo).get().equals(readers.get(0).getGeneration()) : "the first translog isn't the one with the minimum generation:" + readers; return readers.get(0).getGeneration(); } } } /** * Returns the number of operations in the translog files */ public int totalOperations() { return totalOperationsByMinGen(-1); } /** * Returns the size in bytes of the v files */ public long sizeInBytes() { return sizeInBytesByMinGen(-1); } long earliestLastModifiedAge() { try (ReleasableLock ignored = readLock.acquire()) { ensureOpen(); return findEarliestLastModifiedAge(System.currentTimeMillis(), readers, current); } catch (IOException e) { throw new TranslogException(shardId, "Unable to get the earliest last modified time for the transaction log"); } } /** * Returns the age of the oldest entry in the translog files in seconds */ static long findEarliestLastModifiedAge(long currentTime, Iterable readers, TranslogWriter writer) throws IOException { long earliestTime = currentTime; for (BaseTranslogReader r : readers) { earliestTime = Math.min(r.getLastModifiedTime(), earliestTime); } return Math.max(0, currentTime - Math.min(earliestTime, writer.getLastModifiedTime())); } /** * Returns the number of operations in the translog files at least the given generation */ public int totalOperationsByMinGen(long minGeneration) { try (ReleasableLock ignored = readLock.acquire()) { ensureOpen(); return Stream.concat(readers.stream(), Stream.of(current)) .filter(r -> r.getGeneration() >= minGeneration) .mapToInt(BaseTranslogReader::totalOperations) .sum(); } } /** * Returns the number of operations in the transaction files that contain operations with seq# above the given number. */ public int estimateTotalOperationsFromMinSeq(long minSeqNo) { try (ReleasableLock ignored = readLock.acquire()) { ensureOpen(); return readersAboveMinSeqNo(minSeqNo).mapToInt(BaseTranslogReader::totalOperations).sum(); } } /** * Returns the size in bytes of the translog files at least the given generation */ public long sizeInBytesByMinGen(long minGeneration) { try (ReleasableLock ignored = readLock.acquire()) { ensureOpen(); return Stream.concat(readers.stream(), Stream.of(current)) .filter(r -> r.getGeneration() >= minGeneration) .mapToLong(BaseTranslogReader::sizeInBytes) .sum(); } } /** * Creates a new translog for the specified generation. * * @param fileGeneration the translog generation * @return a writer for the new translog * @throws IOException if creating the translog failed */ TranslogWriter createWriter(long fileGeneration) throws IOException { final TranslogWriter writer = createWriter( fileGeneration, getMinFileGeneration(), globalCheckpointSupplier.getAsLong(), persistedSequenceNumberConsumer ); assert writer.sizeInBytes() == DEFAULT_HEADER_SIZE_IN_BYTES : "Mismatch translog header size; " + "empty translog size [" + writer.sizeInBytes() + ", header size [" + DEFAULT_HEADER_SIZE_IN_BYTES + "]"; return writer; } /** * creates a new writer * * @param fileGeneration the generation of the write to be written * @param initialMinTranslogGen the minimum translog generation to be written in the first checkpoint. This is * needed to solve and initialization problem while constructing an empty translog. * With no readers and no current, a call to {@link #getMinFileGeneration()} would not work. * @param initialGlobalCheckpoint the global checkpoint to be written in the first checkpoint. */ TranslogWriter createWriter( long fileGeneration, long initialMinTranslogGen, long initialGlobalCheckpoint, LongConsumer persistedSequenceNumberConsumer ) throws IOException { final TranslogWriter newWriter; try { newWriter = TranslogWriter.create( shardId, translogUUID, fileGeneration, location.resolve(getFilename(fileGeneration)), getChannelFactory(), config.getBufferSize(), initialMinTranslogGen, initialGlobalCheckpoint, globalCheckpointSupplier, this::getMinFileGeneration, primaryTermSupplier.getAsLong(), tragedy, persistedSequenceNumberConsumer, bigArrays ); } catch (final IOException e) { throw new TranslogException(shardId, "failed to create new translog file", e); } return newWriter; } /** * Adds an operation to the transaction log. * * @param operation the operation to add * @return the location of the operation in the translog * @throws IOException if adding the operation to the translog resulted in an I/O exception */ public Location add(final Operation operation) throws IOException { final ReleasableBytesStreamOutput out = new ReleasableBytesStreamOutput(bigArrays); try { final long start = out.position(); out.skip(Integer.BYTES); writeOperationNoSize(new BufferedChecksumStreamOutput(out), operation); final long end = out.position(); final int operationSize = (int) (end - Integer.BYTES - start); out.seek(start); out.writeInt(operationSize); out.seek(end); final BytesReference bytes = out.bytes(); try (ReleasableLock ignored = readLock.acquire()) { ensureOpen(); if (operation.primaryTerm() > current.getPrimaryTerm()) { assert false : "Operation term is newer than the current term; " + "current term[" + current.getPrimaryTerm() + "], operation term[" + operation + "]"; throw new IllegalArgumentException( "Operation term is newer than the current term; " + "current term[" + current.getPrimaryTerm() + "], operation term[" + operation + "]" ); } return current.add(bytes, operation.seqNo()); } } catch (final AlreadyClosedException | IOException ex) { closeOnTragicEvent(ex); throw ex; } catch (final Exception ex) { closeOnTragicEvent(ex); throw new TranslogException(shardId, "Failed to write operation [" + operation + "]", ex); } finally { Releasables.close(out); } } /** * Tests whether or not the translog generation should be rolled to a new generation. This test * is based on the size of the current generation compared to the configured generation * threshold size. * * @return {@code true} if the current generation should be rolled to a new generation */ public boolean shouldRollGeneration() { final long threshold = this.indexSettings.getGenerationThresholdSize().getBytes(); try (ReleasableLock ignored = readLock.acquire()) { return this.current.sizeInBytes() > threshold; } } /** * The a {@linkplain Location} that will sort after the {@linkplain Location} returned by the last write but before any locations which * can be returned by the next write. */ public Location getLastWriteLocation() { try (ReleasableLock lock = readLock.acquire()) { /* * We use position = current - 1 and size = Integer.MAX_VALUE here instead of position current and size = 0 for two reasons: * 1. Translog.Location's compareTo doesn't actually pay attention to size even though it's equals method does. * 2. It feels more right to return a *position* that is before the next write's position rather than rely on the size. */ return new Location(current.generation, current.sizeInBytes() - 1, Integer.MAX_VALUE); } } /** * The last synced checkpoint for this translog. * * @return the last synced checkpoint */ public long getLastSyncedGlobalCheckpoint() { return getLastSyncedCheckpoint().globalCheckpoint; } final Checkpoint getLastSyncedCheckpoint() { try (ReleasableLock ignored = readLock.acquire()) { return current.getLastSyncedCheckpoint(); } } // for testing public Snapshot newSnapshot() throws IOException { return newSnapshot(0, Long.MAX_VALUE); } public Snapshot newSnapshot(long fromSeqNo, long toSeqNo) throws IOException { return newSnapshot(fromSeqNo, toSeqNo, false); } /** * Creates a new translog snapshot containing operations from the given range. * * @param fromSeqNo the lower bound of the range (inclusive) * @param toSeqNo the upper bound of the range (inclusive) * @return the new snapshot */ public Snapshot newSnapshot(long fromSeqNo, long toSeqNo, boolean requiredFullRange) throws IOException { assert fromSeqNo <= toSeqNo : fromSeqNo + " > " + toSeqNo; assert fromSeqNo >= 0 : "from_seq_no must be non-negative " + fromSeqNo; try (ReleasableLock ignored = readLock.acquire()) { ensureOpen(); TranslogSnapshot[] snapshots = Stream.concat(readers.stream(), Stream.of(current)) .filter(reader -> reader.getCheckpoint().minSeqNo <= toSeqNo && fromSeqNo <= reader.getCheckpoint().maxEffectiveSeqNo()) .map(BaseTranslogReader::newSnapshot) .toArray(TranslogSnapshot[]::new); final Snapshot snapshot = newMultiSnapshot(snapshots); return new SeqNoFilterSnapshot(snapshot, fromSeqNo, toSeqNo, requiredFullRange); } } /** * Reads and returns the operation from the given location if the generation it references is still available. Otherwise * this method will return null. */ public Operation readOperation(Location location) throws IOException { try (ReleasableLock ignored = readLock.acquire()) { ensureOpen(); if (location.generation < getMinFileGeneration()) { return null; } if (current.generation == location.generation) { // no need to fsync here the read operation will ensure that buffers are written to disk // if they are still in RAM and we are reading onto that position return current.read(location); } else { // read backwards - it's likely we need to read on that is recent for (int i = readers.size() - 1; i >= 0; i--) { TranslogReader translogReader = readers.get(i); if (translogReader.generation == location.generation) { return translogReader.read(location); } } } } catch (final Exception ex) { closeOnTragicEvent(ex); throw ex; } return null; } private Snapshot newMultiSnapshot(TranslogSnapshot[] snapshots) throws IOException { final Closeable onClose; if (snapshots.length == 0) { onClose = () -> {}; } else { assert Arrays.stream(snapshots).map(BaseTranslogReader::getGeneration).min(Long::compareTo).get() == snapshots[0].generation : "first reader generation of " + snapshots + " is not the smallest"; onClose = acquireTranslogGenFromDeletionPolicy(snapshots[0].generation); } boolean success = false; try { Snapshot result = new MultiSnapshot(snapshots, onClose); success = true; return result; } finally { if (success == false) { onClose.close(); } } } private Stream readersAboveMinSeqNo(long minSeqNo) { assert readLock.isHeldByCurrentThread() || writeLock.isHeldByCurrentThread() : "callers of readersAboveMinSeqNo must hold a lock: readLock [" + readLock.isHeldByCurrentThread() + "], writeLock [" + readLock.isHeldByCurrentThread() + "]"; return Stream.concat(readers.stream(), Stream.of(current)).filter(reader -> minSeqNo <= reader.getCheckpoint().maxEffectiveSeqNo()); } /** * Acquires a lock on the translog files, preventing them from being trimmed */ public Closeable acquireRetentionLock() { try (ReleasableLock lock = readLock.acquire()) { ensureOpen(); final long viewGen = getMinFileGeneration(); return acquireTranslogGenFromDeletionPolicy(viewGen); } } private Closeable acquireTranslogGenFromDeletionPolicy(long viewGen) { Releasable toClose = deletionPolicy.acquireTranslogGen(viewGen); return () -> { try { toClose.close(); } finally { trimUnreferencedReaders(); closeFilesIfNoPendingRetentionLocks(); } }; } /** * Sync's the translog. */ public void sync() throws IOException { try (ReleasableLock lock = readLock.acquire()) { if (closed.get() == false) { current.sync(); } } catch (final Exception ex) { closeOnTragicEvent(ex); throw ex; } } /** * Returns true if an fsync is required to ensure durability of the translogs operations or it's metadata. */ public boolean syncNeeded() { try (ReleasableLock lock = readLock.acquire()) { return current.syncNeeded(); } } /** package private for testing */ public static String getFilename(long generation) { return TRANSLOG_FILE_PREFIX + generation + TRANSLOG_FILE_SUFFIX; } static String getCommitCheckpointFileName(long generation) { return TRANSLOG_FILE_PREFIX + generation + CHECKPOINT_SUFFIX; } /** * Trims translog for terms of files below belowTerm and seq# above aboveSeqNo. * Effectively it moves max visible seq# {@link Checkpoint#trimmedAboveSeqNo} therefore {@link TranslogSnapshot} skips those operations. */ public void trimOperations(long belowTerm, long aboveSeqNo) throws IOException { assert aboveSeqNo >= SequenceNumbers.NO_OPS_PERFORMED : "aboveSeqNo has to a valid sequence number"; try (ReleasableLock lock = writeLock.acquire()) { ensureOpen(); if (current.getPrimaryTerm() < belowTerm) { throw new IllegalArgumentException( "Trimming the translog can only be done for terms lower than the current one. " + "Trim requested for term [ " + belowTerm + " ] , current is [ " + current.getPrimaryTerm() + " ]" ); } // we assume that the current translog generation doesn't have trimmable ops. Verify that. assert current.assertNoSeqAbove(belowTerm, aboveSeqNo); // update all existed ones (if it is necessary) as checkpoint and reader are immutable final List newReaders = new ArrayList<>(readers.size()); try { for (TranslogReader reader : readers) { final TranslogReader newReader = reader.getPrimaryTerm() < belowTerm ? reader.closeIntoTrimmedReader(aboveSeqNo, getChannelFactory()) : reader; newReaders.add(newReader); } } catch (IOException e) { IOUtils.closeWhileHandlingException(newReaders); tragedy.setTragicException(e); closeOnTragicEvent(e); throw e; } this.readers.clear(); this.readers.addAll(newReaders); } } /** * Ensures that the given location has be synced / written to the underlying storage. * * @return Returns true iff this call caused an actual sync operation otherwise false */ public boolean ensureSynced(Location location) throws IOException { try (ReleasableLock lock = readLock.acquire()) { if (location.generation == current.getGeneration()) { // if we have a new one it's already synced ensureOpen(); return current.syncUpTo(location.translogLocation + location.size); } } catch (final Exception ex) { closeOnTragicEvent(ex); throw ex; } return false; } /** * Ensures that all locations in the given stream have been synced / written to the underlying storage. * This method allows for internal optimization to minimize the amount of fsync operations if multiple * locations must be synced. * * @return Returns true iff this call caused an actual sync operation otherwise false */ public boolean ensureSynced(Stream locations) throws IOException { final Optional max = locations.max(Location::compareTo); // we only need to sync the max location since it will sync all other // locations implicitly if (max.isPresent()) { return ensureSynced(max.get()); } else { return false; } } /** * Closes the translog if the current translog writer experienced a tragic exception. * * Note that in case this thread closes the translog it must not already be holding a read lock on the translog as it will acquire a * write lock in the course of closing the translog * * @param ex if an exception occurs closing the translog, it will be suppressed into the provided exception */ protected void closeOnTragicEvent(final Exception ex) { // we can not hold a read lock here because closing will attempt to obtain a write lock and that would result in self-deadlock assert readLock.isHeldByCurrentThread() == false : Thread.currentThread().getName(); if (tragedy.get() != null) { try { close(); } catch (final AlreadyClosedException inner) { /* * Don't do anything in this case. The AlreadyClosedException comes from TranslogWriter and we should not add it as * suppressed because it will contain the provided exception as its cause. See also * https://github.com/elastic/elasticsearch/issues/15941. */ } catch (final Exception inner) { assert ex != inner.getCause(); ex.addSuppressed(inner); } } } /** * return stats */ public TranslogStats stats() { // acquire lock to make the two numbers roughly consistent (no file change half way) try (ReleasableLock lock = readLock.acquire()) { long uncommittedGen = getMinGenerationForSeqNo(deletionPolicy.getLocalCheckpointOfSafeCommit() + 1).translogFileGeneration; return new TranslogStats( totalOperations(), sizeInBytes(), totalOperationsByMinGen(uncommittedGen), sizeInBytesByMinGen(uncommittedGen), earliestLastModifiedAge() ); } } public TranslogConfig getConfig() { return config; } // public for testing public TranslogDeletionPolicy getDeletionPolicy() { return deletionPolicy; } public static class Location implements Comparable { public final long generation; public final long translogLocation; public final int size; public Location(long generation, long translogLocation, int size) { this.generation = generation; this.translogLocation = translogLocation; this.size = size; } public String toString() { return "[generation: " + generation + ", location: " + translogLocation + ", size: " + size + "]"; } @Override public int compareTo(Location o) { if (generation == o.generation) { return Long.compare(translogLocation, o.translogLocation); } return Long.compare(generation, o.generation); } @Override public boolean equals(Object o) { if (this == o) { return true; } if (o == null || getClass() != o.getClass()) { return false; } Location location = (Location) o; if (generation != location.generation) { return false; } if (translogLocation != location.translogLocation) { return false; } return size == location.size; } @Override public int hashCode() { int result = Long.hashCode(generation); result = 31 * result + Long.hashCode(translogLocation); result = 31 * result + size; return result; } } /** * A snapshot of the transaction log, allows to iterate over all the transaction log operations. */ public interface Snapshot extends Closeable { /** * The total estimated number of operations in the snapshot. */ int totalOperations(); /** * The number of operations have been skipped (overridden or trimmed) in the snapshot so far. * Unlike {@link #totalOperations()}, this value is updated each time after {@link #next()}) is called. */ default int skippedOperations() { return 0; } /** * Returns the next operation in the snapshot or null if we reached the end. */ Translog.Operation next() throws IOException; } /** * A filtered snapshot consisting of only operations whose sequence numbers are in the given range * between {@code fromSeqNo} (inclusive) and {@code toSeqNo} (inclusive). This filtered snapshot * shares the same underlying resources with the {@code delegate} snapshot, therefore we should not * use the {@code delegate} after passing it to this filtered snapshot. */ private static final class SeqNoFilterSnapshot implements Snapshot { private final Snapshot delegate; private int filteredOpsCount; private int opsCount; private boolean requiredFullRange; private final long fromSeqNo; // inclusive private final long toSeqNo; // inclusive SeqNoFilterSnapshot(Snapshot delegate, long fromSeqNo, long toSeqNo, boolean requiredFullRange) { assert fromSeqNo <= toSeqNo : "from_seq_no[" + fromSeqNo + "] > to_seq_no[" + toSeqNo + "]"; this.delegate = delegate; this.fromSeqNo = fromSeqNo; this.toSeqNo = toSeqNo; this.requiredFullRange = requiredFullRange; } @Override public int totalOperations() { return delegate.totalOperations(); } @Override public int skippedOperations() { return filteredOpsCount + delegate.skippedOperations(); } @Override public Operation next() throws IOException, MissingHistoryOperationsException { Translog.Operation op; while ((op = delegate.next()) != null) { if (fromSeqNo <= op.seqNo() && op.seqNo() <= toSeqNo) { opsCount++; return op; } else { filteredOpsCount++; } } if (requiredFullRange && (toSeqNo - fromSeqNo + 1) != opsCount) { throw new MissingHistoryOperationsException( "Not all operations between from_seqno [" + fromSeqNo + "] " + "and to_seqno [" + toSeqNo + "] found" ); } return null; } @Override public void close() throws IOException { delegate.close(); } } /** * A generic interface representing an operation performed on the transaction log. * Each is associated with a type. */ public interface Operation { enum Type { @Deprecated CREATE((byte) 1), INDEX((byte) 2), DELETE((byte) 3), NO_OP((byte) 4); private final byte id; Type(byte id) { this.id = id; } public byte id() { return this.id; } public static Type fromId(byte id) { switch (id) { case 1: return CREATE; case 2: return INDEX; case 3: return DELETE; case 4: return NO_OP; default: throw new IllegalArgumentException("no type mapped for [" + id + "]"); } } } Type opType(); long estimateSize(); Source getSource(); long seqNo(); long primaryTerm(); /** * Reads the type and the operation from the given stream. The operation must be written with * {@link Operation#writeOperation(StreamOutput, Operation)} */ static Operation readOperation(final StreamInput input) throws IOException { final Translog.Operation.Type type = Translog.Operation.Type.fromId(input.readByte()); switch (type) { case CREATE: // the de-serialization logic in Index was identical to that of Create when create was deprecated case INDEX: return new Index(input); case DELETE: return new Delete(input); case NO_OP: return new NoOp(input); default: throw new AssertionError("no case for [" + type + "]"); } } /** * Writes the type and translog operation to the given stream */ static void writeOperation(final StreamOutput output, final Operation operation) throws IOException { output.writeByte(operation.opType().id()); switch (operation.opType()) { case CREATE: // the serialization logic in Index was identical to that of Create when create was deprecated case INDEX: ((Index) operation).write(output); break; case DELETE: ((Delete) operation).write(output); break; case NO_OP: ((NoOp) operation).write(output); break; default: throw new AssertionError("no case for [" + operation.opType() + "]"); } } } public static class Source { public final BytesReference source; public final String routing; public Source(BytesReference source, String routing) { this.source = source; this.routing = routing; } } public static class Index implements Operation { public static final int FORMAT_6_0 = 8; // since 6.0.0 public static final int FORMAT_NO_PARENT = FORMAT_6_0 + 1; // since 7.0 public static final int FORMAT_NO_VERSION_TYPE = FORMAT_NO_PARENT + 1; public static final int SERIALIZATION_FORMAT = FORMAT_NO_VERSION_TYPE; private final String id; private final long autoGeneratedIdTimestamp; private final String type; private final long seqNo; private final long primaryTerm; private final long version; private final BytesReference source; private final String routing; private Index(final StreamInput in) throws IOException { final int format = in.readVInt(); // SERIALIZATION_FORMAT assert format >= FORMAT_6_0 : "format was: " + format; id = in.readString(); type = in.readString(); source = in.readBytesReference(); routing = in.readOptionalString(); if (format < FORMAT_NO_PARENT) { in.readOptionalString(); // _parent } this.version = in.readLong(); if (format < FORMAT_NO_VERSION_TYPE) { in.readByte(); // _version_type } this.autoGeneratedIdTimestamp = in.readLong(); seqNo = in.readLong(); primaryTerm = in.readLong(); } public Index(Engine.Index index, Engine.IndexResult indexResult) { this.id = index.id(); this.type = index.type(); this.source = index.source(); this.routing = index.routing(); this.seqNo = indexResult.getSeqNo(); this.primaryTerm = index.primaryTerm(); this.version = indexResult.getVersion(); this.autoGeneratedIdTimestamp = index.getAutoGeneratedIdTimestamp(); } public Index(String type, String id, long seqNo, long primaryTerm, byte[] source) { this(type, id, seqNo, primaryTerm, Versions.MATCH_ANY, source, null, -1); } public Index( String type, String id, long seqNo, long primaryTerm, long version, byte[] source, String routing, long autoGeneratedIdTimestamp ) { this.type = type; this.id = id; this.source = new BytesArray(source); this.seqNo = seqNo; this.primaryTerm = primaryTerm; this.version = version; this.routing = routing; this.autoGeneratedIdTimestamp = autoGeneratedIdTimestamp; } @Override public Type opType() { return Type.INDEX; } @Override public long estimateSize() { return (2 * id.length()) + (2 * type.length()) + source.length() + (routing != null ? 2 * routing.length() : 0) + (4 * Long.BYTES); // timestamp, seq_no, primary_term, and version } public String type() { return this.type; } public String id() { return this.id; } public String routing() { return this.routing; } public BytesReference source() { return this.source; } @Override public long seqNo() { return seqNo; } @Override public long primaryTerm() { return primaryTerm; } public long version() { return this.version; } @Override public Source getSource() { return new Source(source, routing); } private void write(final StreamOutput out) throws IOException { final int format = out.getVersion().onOrAfter(LegacyESVersion.V_7_0_0) ? SERIALIZATION_FORMAT : FORMAT_6_0; out.writeVInt(format); out.writeString(id); out.writeString(type); out.writeBytesReference(source); out.writeOptionalString(routing); if (format < FORMAT_NO_PARENT) { out.writeOptionalString(null); // _parent } out.writeLong(version); if (format < FORMAT_NO_VERSION_TYPE) { out.writeByte(VersionType.EXTERNAL.getValue()); } out.writeLong(autoGeneratedIdTimestamp); out.writeLong(seqNo); out.writeLong(primaryTerm); } @Override public boolean equals(Object o) { if (this == o) { return true; } if (o == null || getClass() != o.getClass()) { return false; } Index index = (Index) o; if (version != index.version || seqNo != index.seqNo || primaryTerm != index.primaryTerm || id.equals(index.id) == false || type.equals(index.type) == false || autoGeneratedIdTimestamp != index.autoGeneratedIdTimestamp || source.equals(index.source) == false) { return false; } if (routing != null ? !routing.equals(index.routing) : index.routing != null) { return false; } return true; } @Override public int hashCode() { int result = id.hashCode(); result = 31 * result + type.hashCode(); result = 31 * result + Long.hashCode(seqNo); result = 31 * result + Long.hashCode(primaryTerm); result = 31 * result + Long.hashCode(version); result = 31 * result + source.hashCode(); result = 31 * result + (routing != null ? routing.hashCode() : 0); result = 31 * result + Long.hashCode(autoGeneratedIdTimestamp); return result; } @Override public String toString() { return "Index{" + "id='" + id + '\'' + ", type='" + type + '\'' + ", seqNo=" + seqNo + ", primaryTerm=" + primaryTerm + ", version=" + version + ", autoGeneratedIdTimestamp=" + autoGeneratedIdTimestamp + '}'; } public long getAutoGeneratedIdTimestamp() { return autoGeneratedIdTimestamp; } } public static class Delete implements Operation { private static final int FORMAT_6_0 = 4; // 6.0 - * public static final int FORMAT_NO_PARENT = FORMAT_6_0 + 1; // since 7.0 public static final int FORMAT_NO_VERSION_TYPE = FORMAT_NO_PARENT + 1; public static final int SERIALIZATION_FORMAT = FORMAT_NO_VERSION_TYPE; private final String type, id; private final Term uid; private final long seqNo; private final long primaryTerm; private final long version; private Delete(final StreamInput in) throws IOException { final int format = in.readVInt();// SERIALIZATION_FORMAT assert format >= FORMAT_6_0 : "format was: " + format; type = in.readString(); id = in.readString(); uid = new Term(in.readString(), in.readBytesRef()); this.version = in.readLong(); if (format < FORMAT_NO_VERSION_TYPE) { in.readByte(); // versionType } seqNo = in.readLong(); primaryTerm = in.readLong(); } public Delete(Engine.Delete delete, Engine.DeleteResult deleteResult) { this(delete.type(), delete.id(), delete.uid(), deleteResult.getSeqNo(), delete.primaryTerm(), deleteResult.getVersion()); } /** utility for testing */ public Delete(String type, String id, long seqNo, long primaryTerm, Term uid) { this(type, id, uid, seqNo, primaryTerm, Versions.MATCH_ANY); } public Delete(String type, String id, Term uid, long seqNo, long primaryTerm, long version) { this.type = Objects.requireNonNull(type); this.id = Objects.requireNonNull(id); this.uid = uid; this.seqNo = seqNo; this.primaryTerm = primaryTerm; this.version = version; } @Override public Type opType() { return Type.DELETE; } @Override public long estimateSize() { return (id.length() * 2) + (type.length() * 2) + ((uid.field().length() * 2) + (uid.text().length()) * 2) + (type.length() * 2) + (3 * Long.BYTES); // seq_no, primary_term, and version; } public String type() { return type; } public String id() { return id; } public Term uid() { return this.uid; } @Override public long seqNo() { return seqNo; } @Override public long primaryTerm() { return primaryTerm; } public long version() { return this.version; } @Override public Source getSource() { throw new IllegalStateException("trying to read doc source from delete operation"); } private void write(final StreamOutput out) throws IOException { final int format = out.getVersion().onOrAfter(LegacyESVersion.V_7_0_0) ? SERIALIZATION_FORMAT : FORMAT_6_0; out.writeVInt(format); out.writeString(type); out.writeString(id); out.writeString(uid.field()); out.writeBytesRef(uid.bytes()); out.writeLong(version); if (format < FORMAT_NO_VERSION_TYPE) { out.writeByte(VersionType.EXTERNAL.getValue()); } out.writeLong(seqNo); out.writeLong(primaryTerm); } @Override public boolean equals(Object o) { if (this == o) { return true; } if (o == null || getClass() != o.getClass()) { return false; } Delete delete = (Delete) o; return version == delete.version && seqNo == delete.seqNo && primaryTerm == delete.primaryTerm && uid.equals(delete.uid); } @Override public int hashCode() { int result = uid.hashCode(); result = 31 * result + Long.hashCode(seqNo); result = 31 * result + Long.hashCode(primaryTerm); result = 31 * result + Long.hashCode(version); return result; } @Override public String toString() { return "Delete{" + "uid=" + uid + ", seqNo=" + seqNo + ", primaryTerm=" + primaryTerm + ", version=" + version + '}'; } } public static class NoOp implements Operation { private final long seqNo; private final long primaryTerm; private final String reason; @Override public long seqNo() { return seqNo; } @Override public long primaryTerm() { return primaryTerm; } public String reason() { return reason; } private NoOp(final StreamInput in) throws IOException { seqNo = in.readLong(); primaryTerm = in.readLong(); reason = in.readString(); } public NoOp(final long seqNo, final long primaryTerm, final String reason) { assert seqNo > SequenceNumbers.NO_OPS_PERFORMED; assert primaryTerm >= 0; assert reason != null; this.seqNo = seqNo; this.primaryTerm = primaryTerm; this.reason = reason; } private void write(final StreamOutput out) throws IOException { out.writeLong(seqNo); out.writeLong(primaryTerm); out.writeString(reason); } @Override public Type opType() { return Type.NO_OP; } @Override public long estimateSize() { return 2 * reason.length() + 2 * Long.BYTES; } @Override public Source getSource() { throw new UnsupportedOperationException("source does not exist for a no-op"); } @Override public boolean equals(Object obj) { if (this == obj) { return true; } if (obj == null || getClass() != obj.getClass()) { return false; } final NoOp that = (NoOp) obj; return seqNo == that.seqNo && primaryTerm == that.primaryTerm && reason.equals(that.reason); } @Override public int hashCode() { return 31 * 31 * Long.hashCode(seqNo) + 31 * Long.hashCode(primaryTerm) + reason().hashCode(); } @Override public String toString() { return "NoOp{" + "seqNo=" + seqNo + ", primaryTerm=" + primaryTerm + ", reason='" + reason + '\'' + '}'; } } public enum Durability { /** * Async durability - translogs are synced based on a time interval. */ ASYNC, /** * Request durability - translogs are synced for each high level request (bulk, index, delete) */ REQUEST } static void verifyChecksum(BufferedChecksumStreamInput in) throws IOException { // This absolutely must come first, or else reading the checksum becomes part of the checksum long expectedChecksum = in.getChecksum(); long readChecksum = Integer.toUnsignedLong(in.readInt()); if (readChecksum != expectedChecksum) { throw new TranslogCorruptedException( in.getSource(), "checksum verification failed - expected: 0x" + Long.toHexString(expectedChecksum) + ", got: 0x" + Long.toHexString(readChecksum) ); } } /** * Reads a list of operations written with {@link #writeOperations(StreamOutput, List)} */ public static List readOperations(StreamInput input, String source) throws IOException { ArrayList operations = new ArrayList<>(); int numOps = input.readInt(); final BufferedChecksumStreamInput checksumStreamInput = new BufferedChecksumStreamInput(input, source); for (int i = 0; i < numOps; i++) { operations.add(readOperation(checksumStreamInput)); } return operations; } static Translog.Operation readOperation(BufferedChecksumStreamInput in) throws IOException { final Translog.Operation operation; try { final int opSize = in.readInt(); if (opSize < 4) { // 4byte for the checksum throw new TranslogCorruptedException(in.getSource(), "operation size must be at least 4 but was: " + opSize); } in.resetDigest(); // size is not part of the checksum! if (in.markSupported()) { // if we can we validate the checksum first // we are sometimes called when mark is not supported this is the case when // we are sending translogs across the network with LZ4 compression enabled - currently there is no way s // to prevent this unfortunately. in.mark(opSize); in.skip(opSize - 4); verifyChecksum(in); in.reset(); } operation = Translog.Operation.readOperation(in); verifyChecksum(in); } catch (EOFException e) { throw new TruncatedTranslogException(in.getSource(), "reached premature end of file, translog is truncated", e); } return operation; } /** * Writes all operations in the given iterable to the given output stream including the size of the array * use {@link #readOperations(StreamInput, String)} to read it back. */ public static void writeOperations(StreamOutput outStream, List toWrite) throws IOException { final ReleasableBytesStreamOutput out = new ReleasableBytesStreamOutput(BigArrays.NON_RECYCLING_INSTANCE); try { outStream.writeInt(toWrite.size()); final BufferedChecksumStreamOutput checksumStreamOutput = new BufferedChecksumStreamOutput(out); for (Operation op : toWrite) { out.reset(); final long start = out.position(); out.skip(Integer.BYTES); writeOperationNoSize(checksumStreamOutput, op); long end = out.position(); int operationSize = (int) (out.position() - Integer.BYTES - start); out.seek(start); out.writeInt(operationSize); out.seek(end); out.bytes().writeTo(outStream); } } finally { Releasables.close(out); } } public static void writeOperationNoSize(BufferedChecksumStreamOutput out, Translog.Operation op) throws IOException { // This BufferedChecksumStreamOutput remains unclosed on purpose, // because closing it closes the underlying stream, which we don't // want to do here. out.resetDigest(); Translog.Operation.writeOperation(out, op); long checksum = out.getChecksum(); out.writeInt((int) checksum); } /** * Gets the minimum generation that could contain any sequence number after the specified sequence number, or the current generation if * there is no generation that could any such sequence number. * * @param seqNo the sequence number * @return the minimum generation for the sequence number */ public TranslogGeneration getMinGenerationForSeqNo(final long seqNo) { try (ReleasableLock ignored = readLock.acquire()) { return new TranslogGeneration(translogUUID, minGenerationForSeqNo(seqNo, current, readers)); } } private static long minGenerationForSeqNo(long seqNo, TranslogWriter writer, List readers) { long minGen = writer.generation; for (final TranslogReader reader : readers) { if (seqNo <= reader.getCheckpoint().maxEffectiveSeqNo()) { minGen = Math.min(minGen, reader.getGeneration()); } } return minGen; } /** * Roll the current translog generation into a new generation if it's not empty. This does not commit the translog. * * @throws IOException if an I/O exception occurred during any file operations */ public void rollGeneration() throws IOException { syncBeforeRollGeneration(); if (current.totalOperations() == 0 && primaryTermSupplier.getAsLong() == current.getPrimaryTerm()) { return; } try (Releasable ignored = writeLock.acquire()) { ensureOpen(); try { final TranslogReader reader = current.closeIntoReader(); readers.add(reader); assert Checkpoint.read(location.resolve(CHECKPOINT_FILE_NAME)).generation == current.getGeneration(); copyCheckpointTo(location.resolve(getCommitCheckpointFileName(current.getGeneration()))); // create a new translog file; this will sync it and update the checkpoint data; current = createWriter(current.getGeneration() + 1); logger.trace("current translog set to [{}]", current.getGeneration()); } catch (final Exception e) { tragedy.setTragicException(e); closeOnTragicEvent(e); throw e; } } } void syncBeforeRollGeneration() throws IOException { // make sure we move most of the data to disk outside of the writeLock // in order to reduce the time the lock is held since it's blocking all threads sync(); } /** * Trims unreferenced translog generations by asking {@link TranslogDeletionPolicy} for the minimum * required generation */ public void trimUnreferencedReaders() throws IOException { // first check under read lock if any readers can be trimmed try (ReleasableLock ignored = readLock.acquire()) { if (closed.get()) { // we're shutdown potentially on some tragic event, don't delete anything return; } if (getMinReferencedGen() == getMinFileGeneration()) { return; } } // move most of the data to disk to reduce the time the write lock is held sync(); try (ReleasableLock ignored = writeLock.acquire()) { if (closed.get()) { // we're shutdown potentially on some tragic event, don't delete anything return; } final long minReferencedGen = getMinReferencedGen(); for (Iterator iterator = readers.iterator(); iterator.hasNext();) { TranslogReader reader = iterator.next(); if (reader.getGeneration() >= minReferencedGen) { break; } iterator.remove(); IOUtils.closeWhileHandlingException(reader); final Path translogPath = reader.path(); logger.trace("delete translog file [{}], not referenced and not current anymore", translogPath); // The checkpoint is used when opening the translog to know which files should be recovered from. // We now update the checkpoint to ignore the file we are going to remove. // Note that there is a provision in recoverFromFiles to allow for the case where we synced the checkpoint // but crashed before we could delete the file. // sync at once to make sure that there's at most one unreferenced generation. current.sync(); deleteReaderFiles(reader); } assert readers.isEmpty() == false || current.generation == minReferencedGen : "all readers were cleaned but the minReferenceGen [" + minReferencedGen + "] is not the current writer's gen [" + current.generation + "]"; } catch (final Exception ex) { closeOnTragicEvent(ex); throw ex; } } private long getMinReferencedGen() throws IOException { assert readLock.isHeldByCurrentThread() || writeLock.isHeldByCurrentThread(); long minReferencedGen = Math.min( deletionPolicy.minTranslogGenRequired(readers, current), minGenerationForSeqNo(deletionPolicy.getLocalCheckpointOfSafeCommit() + 1, current, readers) ); assert minReferencedGen >= getMinFileGeneration() : "deletion policy requires a minReferenceGen of [" + minReferencedGen + "] but the lowest gen available is [" + getMinFileGeneration() + "]"; assert minReferencedGen <= currentFileGeneration() : "deletion policy requires a minReferenceGen of [" + minReferencedGen + "] which is higher than the current generation [" + currentFileGeneration() + "]"; return minReferencedGen; } /** * deletes all files associated with a reader. package-private to be able to simulate node failures at this point */ void deleteReaderFiles(TranslogReader reader) { IOUtils.deleteFilesIgnoringExceptions( reader.path(), reader.path().resolveSibling(getCommitCheckpointFileName(reader.getGeneration())) ); } void closeFilesIfNoPendingRetentionLocks() throws IOException { try (ReleasableLock ignored = writeLock.acquire()) { if (closed.get() && deletionPolicy.pendingTranslogRefCount() == 0) { logger.trace("closing files. translog is closed and there are no pending retention locks"); ArrayList toClose = new ArrayList<>(readers); toClose.add(current); IOUtils.close(toClose); } } } /** * References a transaction log generation */ public static final class TranslogGeneration { public final String translogUUID; public final long translogFileGeneration; public TranslogGeneration(String translogUUID, long translogFileGeneration) { this.translogUUID = translogUUID; this.translogFileGeneration = translogFileGeneration; } } /** * Returns the current generation of this translog. This corresponds to the latest uncommitted translog generation */ public TranslogGeneration getGeneration() { return new TranslogGeneration(translogUUID, currentFileGeneration()); } long getFirstOperationPosition() { // for testing return current.getFirstOperationOffset(); } private void ensureOpen() { if (closed.get()) { throw new AlreadyClosedException("translog is already closed", tragedy.get()); } } ChannelFactory getChannelFactory() { return FileChannel::open; } /** * If this {@code Translog} was closed as a side-effect of a tragic exception, * e.g. disk full while flushing a new segment, this returns the root cause exception. * Otherwise (no tragic exception has occurred) it returns null. */ public Exception getTragicException() { return tragedy.get(); } /** Reads and returns the current checkpoint */ static Checkpoint readCheckpoint(final Path location) throws IOException { return Checkpoint.read(location.resolve(CHECKPOINT_FILE_NAME)); } /** * Reads the sequence numbers global checkpoint from the translog checkpoint. * This ensures that the translogUUID from this translog matches with the provided translogUUID. * * @param location the location of the translog * @return the global checkpoint * @throws IOException if an I/O exception occurred reading the checkpoint * @throws TranslogCorruptedException if the translog is corrupted or mismatched with the given uuid */ public static long readGlobalCheckpoint(final Path location, final String expectedTranslogUUID) throws IOException { final Checkpoint checkpoint = readCheckpoint(location, expectedTranslogUUID); return checkpoint.globalCheckpoint; } private static Checkpoint readCheckpoint(Path location, String expectedTranslogUUID) throws IOException { final Checkpoint checkpoint = readCheckpoint(location); // We need to open at least one translog header to validate the translogUUID. final Path translogFile = location.resolve(getFilename(checkpoint.generation)); try (FileChannel channel = FileChannel.open(translogFile, StandardOpenOption.READ)) { TranslogHeader.read(expectedTranslogUUID, translogFile, channel); } catch (TranslogCorruptedException ex) { throw ex; // just bubble up. } catch (Exception ex) { throw new TranslogCorruptedException(location.toString(), ex); } return checkpoint; } /** * Returns the minimum translog generation retained by the translog at the given location. * This ensures that the translogUUID from this translog matches with the provided translogUUID. * * @param location the location of the translog * @return the minimum translog generation * @throws IOException if an I/O exception occurred reading the checkpoint * @throws TranslogCorruptedException if the translog is corrupted or mismatched with the given uuid */ public static long readMinTranslogGeneration(final Path location, final String expectedTranslogUUID) throws IOException { final Checkpoint checkpoint = readCheckpoint(location, expectedTranslogUUID); return checkpoint.minTranslogGeneration; } /** * Returns the translog uuid used to associate a lucene index with a translog. */ public String getTranslogUUID() { return translogUUID; } /** * Returns the max seq_no of translog operations found in this translog. Since this value is calculated based on the current * existing readers, this value is not necessary to be the max seq_no of all operations have been stored in this translog. */ public long getMaxSeqNo() { try (ReleasableLock ignored = readLock.acquire()) { ensureOpen(); final OptionalLong maxSeqNo = Stream.concat(readers.stream(), Stream.of(current)) .mapToLong(reader -> reader.getCheckpoint().maxSeqNo) .max(); assert maxSeqNo.isPresent() : "must have at least one translog generation"; return maxSeqNo.getAsLong(); } } TranslogWriter getCurrent() { return current; } List getReaders() { return readers; } public static String createEmptyTranslog( final Path location, final long initialGlobalCheckpoint, final ShardId shardId, final long primaryTerm ) throws IOException { final ChannelFactory channelFactory = FileChannel::open; return createEmptyTranslog(location, initialGlobalCheckpoint, shardId, channelFactory, primaryTerm); } static String createEmptyTranslog( Path location, long initialGlobalCheckpoint, ShardId shardId, ChannelFactory channelFactory, long primaryTerm ) throws IOException { return createEmptyTranslog(location, shardId, initialGlobalCheckpoint, primaryTerm, null, channelFactory); } /** * Creates a new empty translog within the specified {@code location} that contains the given {@code initialGlobalCheckpoint}, * {@code primaryTerm} and {@code translogUUID}. * * This method should be used directly under specific circumstances like for shards that will see no indexing. Specifying a non-unique * translog UUID could cause a lot of issues and that's why in all (but one) cases the method * {@link #createEmptyTranslog(Path, long, ShardId, long)} should be used instead. * * @param location a {@link Path} to the directory that will contains the translog files (translog + translog checkpoint) * @param shardId the {@link ShardId} * @param initialGlobalCheckpoint the global checkpoint to initialize the translog with * @param primaryTerm the shard's primary term to initialize the translog with * @param translogUUID the unique identifier to initialize the translog with * @param factory a {@link ChannelFactory} used to open translog files * @return the translog's unique identifier * @throws IOException if something went wrong during translog creation */ public static String createEmptyTranslog( final Path location, final ShardId shardId, final long initialGlobalCheckpoint, final long primaryTerm, @Nullable final String translogUUID, @Nullable final ChannelFactory factory ) throws IOException { IOUtils.rm(location); Files.createDirectories(location); final long generation = 1L; final long minTranslogGeneration = 1L; final ChannelFactory channelFactory = factory != null ? factory : FileChannel::open; final String uuid = Strings.hasLength(translogUUID) ? translogUUID : UUIDs.randomBase64UUID(); final Path checkpointFile = location.resolve(CHECKPOINT_FILE_NAME); final Path translogFile = location.resolve(getFilename(generation)); final Checkpoint checkpoint = Checkpoint.emptyTranslogCheckpoint(0, generation, initialGlobalCheckpoint, minTranslogGeneration); Checkpoint.write(channelFactory, checkpointFile, checkpoint, StandardOpenOption.WRITE, StandardOpenOption.CREATE_NEW); IOUtils.fsync(checkpointFile, false); final TranslogWriter writer = TranslogWriter.create( shardId, uuid, generation, translogFile, channelFactory, EMPTY_TRANSLOG_BUFFER_SIZE, minTranslogGeneration, initialGlobalCheckpoint, () -> { throw new UnsupportedOperationException(); }, () -> { throw new UnsupportedOperationException(); }, primaryTerm, new TragicExceptionHolder(), seqNo -> { throw new UnsupportedOperationException(); }, BigArrays.NON_RECYCLING_INSTANCE ); writer.close(); return uuid; } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy