/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.translog;

import org.apache.lucene.index.Term;
import org.apache.lucene.index.TwoPhaseCommit;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.CollectionUtil;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.RamUsageEstimator;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.bytes.BytesArray;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.bytes.ReleasablePagedBytesReference;
import org.elasticsearch.common.io.stream.ReleasableBytesStreamOutput;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.io.stream.Streamable;
import org.elasticsearch.common.lease.Releasable;
import org.elasticsearch.common.lease.Releasables;
import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.common.lucene.uid.Versions;
import org.elasticsearch.common.unit.ByteSizeValue;
import org.elasticsearch.common.util.BigArrays;
import org.elasticsearch.common.util.Callback;
import org.elasticsearch.common.util.concurrent.ConcurrentCollections;
import org.elasticsearch.common.util.concurrent.FutureUtils;
import org.elasticsearch.common.util.concurrent.ReleasableLock;
import org.elasticsearch.index.VersionType;
import org.elasticsearch.index.engine.Engine;
import org.elasticsearch.index.shard.AbstractIndexShardComponent;
import org.elasticsearch.index.shard.IndexShardComponent;
import org.elasticsearch.threadpool.ThreadPool;

import java.io.Closeable;
import java.io.EOFException;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.file.*;
import java.util.*;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * A Translog is a per index shard component that records all non-committed index operations in a durable manner.
 * In Elasticsearch there is one Translog instance per {@link org.elasticsearch.index.engine.InternalEngine}. The engine
 * records the current translog generation {@link Translog#getGeneration()} in its commit metadata using {@link #TRANSLOG_GENERATION_KEY}
 * to reference the generation that contains all operations that have not yet successfully been committed to the engine's Lucene index.
 * Additionally, since Elasticsearch 2.0 the engine also records a {@link #TRANSLOG_UUID_KEY} with each commit to ensure a strong association
 * between the Lucene index and the transaction log file. This UUID is used to prevent accidental recovery from a transaction log that
 * belongs to a different engine.
 * <p>
 * Each Translog has only one translog file open at any time, referenced by a translog generation ID. This ID is written to a
 * {@code translog.ckp} file that is designed to fit in a single disk block such that a write of the file is atomic. The checkpoint file
 * is written on each fsync operation of the translog and records the number of operations written, the current translog file generation
 * and its fsynced offset in bytes.
 * <p>
 * When a translog is opened, the checkpoint is used to retrieve the latest translog file generation and subsequently to open the last
 * written file to recover operations. The {@link org.elasticsearch.index.translog.Translog.TranslogGeneration} on
 * {@link TranslogConfig#getTranslogGeneration()} given when the translog is opened is compared against the latest generation, and all
 * consecutive translog files since the given generation up to the last generation in the checkpoint will be recovered and preserved
 * until the next generation is committed using {@link Translog#commit()}. In the common case the translog file generation in the
 * checkpoint and the generation passed to the translog on creation are the same. The only situation when they can be different is when
 * an actual translog commit fails in between {@link Translog#prepareCommit()} and {@link Translog#commit()}. In such a case the
 * translog file that was being committed will not be deleted since its commit was not successful. Yet, a new/current translog file is
 * already opened at that point such that there is more than one translog file present. Such an uncommitted translog file always has a
 * {@code translog-${gen}.ckp} associated with it, which is an fsynced copy of its last {@code translog.ckp}, such that in disaster
 * recovery the last fsynced offsets, number of operations, etc. are still preserved.
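 * <p>
 * A minimal lifecycle sketch (illustrative only; error handling is omitted, and the {@code config}, {@code type}, {@code id} and
 * {@code sourceBytes} values are assumed to be set up by the caller):
 * <pre>{@code
 * Translog translog = new Translog(config);   // open an existing translog or create a new one
 * Location loc = translog.add(new Translog.Index(type, id, sourceBytes));
 * translog.ensureSynced(loc);                 // fsync at least up to the added operation
 * translog.prepareCommit();                   // roll over to a new generation
 * translog.commit();                          // release the committed generation
 * translog.close();
 * }</pre>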
 */
public class Translog extends AbstractIndexShardComponent implements IndexShardComponent, Closeable, TwoPhaseCommit {

    /*
     * TODO
     *  - we might need something like a deletion policy to hold on to more than one translog eventually (I think sequence IDs needs this) but we can refactor as we go
     *  - use a simple BufferedOutputStream to write stuff and fold BufferedTranslogWriter into its super class... the tricky bit is we need to be able to do random access reads even from the buffer
     *  - we need random exception on the FileSystem API tests for all this.
     *  - we need to page align the last write before we sync, we can take advantage of ensureSynced for this since we might have already fsynced far enough
     */
    public static final String TRANSLOG_GENERATION_KEY = "translog_generation";
    public static final String TRANSLOG_UUID_KEY = "translog_uuid";
    public static final String TRANSLOG_FILE_PREFIX = "translog-";
    public static final String TRANSLOG_FILE_SUFFIX = ".tlog";
    public static final String CHECKPOINT_SUFFIX = ".ckp";
    public static final String CHECKPOINT_FILE_NAME = "translog" + CHECKPOINT_SUFFIX;

    static final Pattern PARSE_STRICT_ID_PATTERN = Pattern.compile("^" + TRANSLOG_FILE_PREFIX + "(\\d+)(\\.tlog)$");

    private final List<ImmutableTranslogReader> recoveredTranslogs;
    private volatile ScheduledFuture<?> syncScheduler;
    // this is a concurrent set and is not protected by any of the locks. The main reason
    // is that it is being accessed by two separate classes (additions & reading are done by FsTranslog, remove by FsView when closed)
    private final Set<View> outstandingViews = ConcurrentCollections.newConcurrentSet();
    private BigArrays bigArrays;
    protected final ReleasableLock readLock;
    protected final ReleasableLock writeLock;
    private final Path location;
    private TranslogWriter current;
    private volatile ImmutableTranslogReader currentCommittingTranslog;
    private volatile long lastCommittedTranslogFileGeneration = -1; // -1 is safe as it will not cause a translog deletion.
    private final AtomicBoolean closed = new AtomicBoolean();
    private final TranslogConfig config;
    private final String translogUUID;

    private Callback<View> onViewClose = new Callback<View>() {
        @Override
        public void handle(View view) {
            logger.trace("closing view starting at translog [{}]", view.minTranslogGeneration());
            boolean removed = outstandingViews.remove(view);
            assert removed : "View was never set but was supposed to be removed";
        }
    };

    /**
     * Creates a new Translog instance. This method will create a new transaction log unless the given {@link TranslogConfig} has
     * a non-null {@link org.elasticsearch.index.translog.Translog.TranslogGeneration}. If the generation is null this method
     * is destructive and will delete all files in the translog path given.
     *
     * @see TranslogConfig#getTranslogPath()
     */
    public Translog(TranslogConfig config) throws IOException {
        super(config.getShardId(), config.getIndexSettings());
        this.config = config;
        TranslogGeneration translogGeneration = config.getTranslogGeneration();

        if (translogGeneration == null || translogGeneration.translogUUID == null) { // legacy case
            translogUUID = Strings.randomBase64UUID();
        } else {
            translogUUID = translogGeneration.translogUUID;
        }
        bigArrays = config.getBigArrays();
        ReadWriteLock rwl = new ReentrantReadWriteLock();
        readLock = new ReleasableLock(rwl.readLock());
        writeLock = new ReleasableLock(rwl.writeLock());
        this.location = config.getTranslogPath();
        Files.createDirectories(this.location);

        try {
            if (translogGeneration != null) {
                final Checkpoint checkpoint = readCheckpoint();
                final Path nextTranslogFile = location.resolve(getFilename(checkpoint.generation + 1));
                final Path currentCheckpointFile = location.resolve(getCommitCheckpointFileName(checkpoint.generation));
                // this is special handling for error condition when we create a new writer but we fail to bake
                // the newly written file (generation+1) into the checkpoint. This is still a valid state
                // we just need to cleanup before we continue
                // we hit this before and then blindly deleted the new generation even though we managed to bake it in and then hit this:
                // https://discuss.elastic.co/t/cannot-recover-index-because-of-missing-tanslog-files/38336 as an example
                //
                // For this to happen we must have already copied the translog.ckp file into translog-gen.ckp so we first check if that file exists
                // if not we don't even try to clean it up and wait until we fail creating it
                assert Files.exists(nextTranslogFile) == false || Files.size(nextTranslogFile) <= TranslogWriter.getHeaderLength(translogUUID) : "unexpected translog file: [" + nextTranslogFile + "]";
                if (Files.exists(currentCheckpointFile) // current checkpoint is already copied
                        && Files.deleteIfExists(nextTranslogFile)) { // delete it and log a warning
                    logger.warn("deleted previously created, but not yet committed, next generation [{}]. This can happen due to a tragic exception when creating a new generation", nextTranslogFile.getFileName());
                }
                this.recoveredTranslogs = recoverFromFiles(translogGeneration, checkpoint);
                if (recoveredTranslogs.isEmpty()) {
                    throw new IllegalStateException("at least one reader must be recovered");
                }
                boolean success = false;
                try {
                    current = createWriter(checkpoint.generation + 1);
                    this.lastCommittedTranslogFileGeneration = translogGeneration.translogFileGeneration;
                    success = true;
                } finally {
                    // we have to close all the recovered ones otherwise we leak file handles here
                    // for instance if we have a lot of tlog and we can't create the writer we keep on holding
                    // on to all the uncommitted tlog files if we don't close
                    if (success == false) {
                        IOUtils.closeWhileHandlingException(recoveredTranslogs);
                    }
                }
            } else {
                this.recoveredTranslogs = Collections.EMPTY_LIST;
                IOUtils.rm(location);
                logger.debug("wipe translog location - creating new translog");
                Files.createDirectories(location);
                final long generation = 1;
                Checkpoint checkpoint = new Checkpoint(0, 0, generation);
                Checkpoint.write(location.resolve(CHECKPOINT_FILE_NAME), checkpoint, StandardOpenOption.WRITE, StandardOpenOption.CREATE_NEW);
                current = createWriter(generation);
                this.lastCommittedTranslogFileGeneration = -1; // playing safe
            }
            if (config.getSyncInterval().millis() > 0 && config.getThreadPool() != null) {
                syncScheduler = createSyncedScheduler(config);
            }
        } catch (Throwable t) {
            // close the opened translog files if we fail to create a new translog...
            IOUtils.closeWhileHandlingException(currentCommittingTranslog, current);
            throw t;
        }
    }

    private ScheduledFuture<?> createSyncedScheduler(TranslogConfig config) {
        assert (current != null);
        return config.getThreadPool().schedule(config.getSyncInterval(), ThreadPool.Names.SAME, new Sync());
    }

    /**
     * This method is used to upgrade a pre 2.0 translog structure to the new checkpoint based structure.
     * The {@link org.elasticsearch.index.translog.Translog.TranslogGeneration} in the given config is
     * used to determine the smallest file generation to upgrade. The procedure will traverse the translog
     * directory to find all files that have a generation greater than or equal to the translog generation and
     * renames the files to the new .tlog file format.
     * <p>
     * For each of the files a {@code ${filename}.ckp} file is written containing the size of the translog in bytes, its ID and the
     * number of operations. Since these files are all relying on the pre 2.0 truncation feature, where we read operations until
     * hitting an {@link EOFException}, the number of operations is recorded as -1. Later, once these files are opened for reading,
     * legacy readers will allow for an unknown number of operations and mimic the old behavior.
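     * <p>
     * An illustrative directory layout (the generation numbers are hypothetical):
     * <pre>
     * before upgrade:  translog-5.recovering, translog-6
     * after upgrade:   translog-5.tlog + translog-5.ckp   (per-generation checkpoint, op count recorded as -1)
     *                  translog-6.tlog + translog.ckp     (checkpoint of the latest generation)
     * </pre>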
     */
    public static void upgradeLegacyTranslog(ESLogger logger, TranslogConfig config) throws IOException {
        Path translogPath = config.getTranslogPath();
        TranslogGeneration translogGeneration = config.getTranslogGeneration();
        if (translogGeneration == null) {
            throw new IllegalArgumentException("TranslogGeneration must be set in order to upgrade");
        }
        if (translogGeneration.translogUUID != null) {
            throw new IllegalArgumentException("TranslogGeneration has a non-null UUID - index must have already been upgraded");
        }
        try {
            if (Checkpoint.read(translogPath.resolve(CHECKPOINT_FILE_NAME)) != null) {
                throw new IllegalStateException(CHECKPOINT_FILE_NAME + " file already present, translog is already upgraded");
            }
        } catch (NoSuchFileException | FileNotFoundException ex) {
            logger.debug("upgrading translog - no checkpoint found");
        }
        final Pattern parseLegacyIdPattern = Pattern.compile("^" + TRANSLOG_FILE_PREFIX + "(\\d+)((\\.recovering))?$"); // here we have to be lenient - nowhere else!
        try (DirectoryStream<Path> stream = Files.newDirectoryStream(translogPath, new DirectoryStream.Filter<Path>() {
            @Override
            public boolean accept(Path entry) throws IOException {
                Matcher matcher = parseLegacyIdPattern.matcher(entry.getFileName().toString());
                if (matcher.matches() == false) {
                    Matcher newIdMatcher = PARSE_STRICT_ID_PATTERN.matcher(entry.getFileName().toString());
                    return newIdMatcher.matches();
                } else {
                    return true;
                }
            }
        })) {
            long latestGeneration = -1;
            List<PathWithGeneration> filesToUpgrade = new ArrayList<>();
            for (Path path : stream) {
                Matcher matcher = parseLegacyIdPattern.matcher(path.getFileName().toString());
                if (matcher.matches()) {
                    long generation = Long.parseLong(matcher.group(1));
                    if (generation >= translogGeneration.translogFileGeneration) {
                        latestGeneration = Math.max(translogGeneration.translogFileGeneration, generation);
                    }
                    filesToUpgrade.add(new PathWithGeneration(path, generation));
                } else {
                    Matcher strict_matcher = PARSE_STRICT_ID_PATTERN.matcher(path.getFileName().toString());
                    if (strict_matcher.matches()) {
                        throw new IllegalStateException("non-legacy translog file [" + path.getFileName().toString() + "] found on a translog that wasn't upgraded yet");
                    }
                }
            }
            if (latestGeneration < translogGeneration.translogFileGeneration) {
                throw new IllegalStateException("latest found translog has a lower generation than the expected uncommitted " + translogGeneration.translogFileGeneration + " > " + latestGeneration);
            }
            CollectionUtil.timSort(filesToUpgrade, new Comparator<PathWithGeneration>() {
                @Override
                public int compare(PathWithGeneration o1, PathWithGeneration o2) {
                    long gen1 = o1.getGeneration();
                    long gen2 = o2.getGeneration();
                    return Long.compare(gen1, gen2);
                }
            });
            for (PathWithGeneration pathAndGeneration : filesToUpgrade) {
                final Path path = pathAndGeneration.getPath();
                final long generation = pathAndGeneration.getGeneration();
                final Path target = path.resolveSibling(getFilename(generation));
                logger.debug("upgrading translog copy file from {} to {}", path, target);
                Files.move(path, target, StandardCopyOption.ATOMIC_MOVE);
                logger.debug("write commit point for {}", target);
                if (generation == latestGeneration) {
                    // for the last one we only write a checkpoint not a real commit
                    Checkpoint checkpoint = new Checkpoint(Files.size(translogPath.resolve(getFilename(latestGeneration))), -1, latestGeneration);
                    Checkpoint.write(translogPath.resolve(CHECKPOINT_FILE_NAME), checkpoint, StandardOpenOption.WRITE, StandardOpenOption.CREATE_NEW);
                } else {
                    Checkpoint checkpoint = new Checkpoint(Files.size(target), -1, generation);
                    Checkpoint.write(translogPath.resolve(getCommitCheckpointFileName(generation)), checkpoint, StandardOpenOption.WRITE, StandardOpenOption.CREATE_NEW);
                }
            }
            IOUtils.fsync(translogPath, true);
        }
    }

    /** recover all translog files found on disk */
    private final ArrayList<ImmutableTranslogReader> recoverFromFiles(TranslogGeneration translogGeneration, Checkpoint checkpoint) throws IOException {
        boolean success = false;
        ArrayList<ImmutableTranslogReader> foundTranslogs = new ArrayList<>();
        final Path tempFile = Files.createTempFile(location, TRANSLOG_FILE_PREFIX, TRANSLOG_FILE_SUFFIX); // a temp file to copy checkpoint to - note it must be on the same FS otherwise atomic move won't work
        boolean tempFileRenamed = false;
        try (ReleasableLock lock = writeLock.acquire()) {
            logger.debug("open uncommitted translog checkpoint {}", checkpoint);
            final String checkpointTranslogFile = getFilename(checkpoint.generation);
            for (long i = translogGeneration.translogFileGeneration; i < checkpoint.generation; i++) {
                Path committedTranslogFile = location.resolve(getFilename(i));
                if (Files.exists(committedTranslogFile) == false) {
                    throw new IllegalStateException("translog file doesn't exist with generation: " + i + " lastCommitted: " + lastCommittedTranslogFileGeneration + " checkpoint: " + checkpoint.generation + " - translog ids must be consecutive");
                }
                final ImmutableTranslogReader reader = openReader(committedTranslogFile, Checkpoint.read(location.resolve(getCommitCheckpointFileName(i))));
                foundTranslogs.add(reader);
                logger.debug("recovered local translog from checkpoint {}", checkpoint);
            }
            foundTranslogs.add(openReader(location.resolve(checkpointTranslogFile), checkpoint));
            Path commitCheckpoint = location.resolve(getCommitCheckpointFileName(checkpoint.generation));
            if (Files.exists(commitCheckpoint)) {
                Checkpoint checkpointFromDisk = Checkpoint.read(commitCheckpoint);
                if (checkpoint.equals(checkpointFromDisk) == false) {
                    throw new IllegalStateException("Checkpoint file " + commitCheckpoint.getFileName() + " already exists but has corrupted content expected: " + checkpoint + " but got: " + checkpointFromDisk);
                }
            } else {
                // we first copy this into the temp-file and then fsync it followed by an atomic move into the target file
                // that way if we hit a disk-full here we are still in a consistent state.
                Files.copy(location.resolve(CHECKPOINT_FILE_NAME), tempFile, StandardCopyOption.REPLACE_EXISTING);
                IOUtils.fsync(tempFile, false);
                Files.move(tempFile, commitCheckpoint, StandardCopyOption.ATOMIC_MOVE);
                tempFileRenamed = true;
                // we only fsync the directory; the tempFile was already fsynced
                IOUtils.fsync(commitCheckpoint.getParent(), true);
            }
            success = true;
        } finally {
            if (success == false) {
                IOUtils.closeWhileHandlingException(foundTranslogs);
            }
            if (tempFileRenamed == false) {
                try {
                    Files.delete(tempFile);
                } catch (IOException ex) {
                    logger.warn("failed to delete temp file {}", ex, tempFile);
                }
            }
        }
        return foundTranslogs;
    }

    ImmutableTranslogReader openReader(Path path, Checkpoint checkpoint) throws IOException {
        final long generation;
        try {
            generation = parseIdFromFileName(path);
        } catch (IllegalArgumentException ex) {
            throw new TranslogException(shardId, "failed to parse generation from file name matching pattern " + path, ex);
        }
        FileChannel channel = FileChannel.open(path, StandardOpenOption.READ);
        try {
            final ChannelReference raf = new ChannelReference(path, generation, channel, new OnCloseRunnable());
            ImmutableTranslogReader reader = ImmutableTranslogReader.open(raf, checkpoint, translogUUID);
            channel = null;
            return reader;
        } finally {
            IOUtils.close(channel);
        }
    }

    /**
     * Extracts the translog generation from a file name.
     *
     * @throws IllegalArgumentException if the path doesn't match the expected pattern.
     */
    public static long parseIdFromFileName(Path translogFile) {
        final String fileName = translogFile.getFileName().toString();
        final Matcher matcher = PARSE_STRICT_ID_PATTERN.matcher(fileName);
        if (matcher.matches()) {
            try {
                return Long.parseLong(matcher.group(1));
            } catch (NumberFormatException e) {
                throw new IllegalStateException("number formatting issue in a file that passed PARSE_STRICT_ID_PATTERN: " + fileName + "]", e);
            }
        }
        throw new IllegalArgumentException("can't parse id from file: " + fileName);
    }

    public void updateBuffer(ByteSizeValue bufferSize) {
        config.setBufferSize(bufferSize.bytesAsInt());
        try (ReleasableLock lock = writeLock.acquire()) {
            current.updateBufferSize(config.getBufferSize());
        }
    }

    /** Returns {@code true} if this {@code Translog} is still open. */
    public boolean isOpen() {
        return closed.get() == false;
    }

    @Override
    public void close() throws IOException {
        if (closed.compareAndSet(false, true)) {
            try (ReleasableLock lock = writeLock.acquire()) {
                try {
                    current.sync();
                } finally {
                    try {
                        IOUtils.close(current, currentCommittingTranslog);
                    } finally {
                        IOUtils.close(recoveredTranslogs);
                        recoveredTranslogs.clear();
                    }
                }
            } finally {
                FutureUtils.cancel(syncScheduler);
                logger.debug("translog closed");
            }
        }
    }

    /**
     * Returns the location of this translog as an absolute path.
     * This path doesn't contain the actual translog files; it is
     * the directory holding the transaction logs.
     */
    public Path location() {
        return location;
    }

    /**
     * Returns the generation of the current transaction log.
     */
    public long currentFileGeneration() {
        try (ReleasableLock lock = readLock.acquire()) {
            return current.getGeneration();
        }
    }

    /**
     * Returns the number of operations in the transaction files that aren't committed to Lucene.
     * Note: may return -1 if unknown
     */
    public int totalOperations() {
        int ops = 0;
        try (ReleasableLock lock = readLock.acquire()) {
            ops += current.totalOperations();
            if (currentCommittingTranslog != null) {
                int tops = currentCommittingTranslog.totalOperations();
                assert tops != TranslogReader.UNKNOWN_OP_COUNT;
                assert tops >= 0;
                ops += tops;
            }
        }
        return ops;
    }

    /**
     * Returns the size in bytes of the translog files that aren't committed to Lucene.
     */
    public long sizeInBytes() {
        long size = 0;
        try (ReleasableLock lock = readLock.acquire()) {
            size += current.sizeInBytes();
            if (currentCommittingTranslog != null) {
                size += currentCommittingTranslog.sizeInBytes();
            }
        }
        return size;
    }

    TranslogWriter createWriter(long fileGeneration) throws IOException {
        TranslogWriter newFile;
        try {
            newFile = TranslogWriter.create(config.getType(), shardId, translogUUID, fileGeneration, location.resolve(getFilename(fileGeneration)), new OnCloseRunnable(), config.getBufferSize(), getChannelFactory());
        } catch (IOException e) {
            throw new TranslogException(shardId, "failed to create new translog file", e);
        }
        return newFile;
    }

    /**
     * Read the Operation object from the given location. This method will try to read the given location from
     * the current or from the currently committing translog file. If the location is in a file that has already
     * been closed or even removed the method will return null instead.
     */
    public Translog.Operation read(Location location) {
        try (ReleasableLock lock = readLock.acquire()) {
            final TranslogReader reader;
            final long currentGeneration = current.getGeneration();
            if (currentGeneration == location.generation) {
                reader = current;
            } else if (currentCommittingTranslog != null && currentCommittingTranslog.getGeneration() == location.generation) {
                reader = currentCommittingTranslog;
            } else if (currentGeneration < location.generation) {
                throw new IllegalStateException("location generation [" + location.generation + "] is greater than the current generation [" + currentGeneration + "]");
            } else {
                return null;
            }
            return reader.read(location);
        } catch (IOException e) {
            throw new ElasticsearchException("failed to read source from translog location " + location, e);
        }
    }

    /**
     * Adds a create / delete / index operation to the transaction log.
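     * <p>
     * A minimal sketch (illustrative only; {@code type}, {@code id} and {@code sourceBytes} are assumed to be provided by the caller):
     * <pre>{@code
     * Location loc = translog.add(new Translog.Index(type, id, sourceBytes));
     * translog.ensureSynced(loc);                 // make sure the operation is fsynced
     * Translog.Operation op = translog.read(loc); // read it back from the current generation
     * }</pre>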
     *
     * @see org.elasticsearch.index.translog.Translog.Operation
     * @see org.elasticsearch.index.translog.Translog.Create
     * @see org.elasticsearch.index.translog.Translog.Index
     * @see org.elasticsearch.index.translog.Translog.Delete
     */
    public Location add(Operation operation) throws IOException {
        final ReleasableBytesStreamOutput out = new ReleasableBytesStreamOutput(bigArrays);
        try {
            final BufferedChecksumStreamOutput checksumStreamOutput = new BufferedChecksumStreamOutput(out);
            final long start = out.position();
            out.skip(RamUsageEstimator.NUM_BYTES_INT);
            writeOperationNoSize(checksumStreamOutput, operation);
            final long end = out.position();
            final int operationSize = (int) (end - RamUsageEstimator.NUM_BYTES_INT - start);
            out.seek(start);
            out.writeInt(operationSize);
            out.seek(end);
            final ReleasablePagedBytesReference bytes = out.bytes();
            try (ReleasableLock lock = readLock.acquire()) {
                ensureOpen();
                Location location = current.add(bytes);
                if (config.isSyncOnEachOperation()) {
                    current.sync();
                }
                assert assertBytesAtLocation(location, bytes);
                return location;
            }
        } catch (AlreadyClosedException | IOException ex) {
            closeOnTragicEvent(ex);
            throw ex;
        } catch (Throwable e) {
            closeOnTragicEvent(e);
            throw new TranslogException(shardId, "Failed to write operation [" + operation + "]", e);
        } finally {
            Releasables.close(out.bytes());
        }
    }

    boolean assertBytesAtLocation(Translog.Location location, BytesReference expectedBytes) throws IOException {
        // tests can override this
        ByteBuffer buffer = ByteBuffer.allocate(location.size);
        current.readBytes(buffer, location.translogLocation);
        return new BytesArray(buffer.array()).equals(expectedBytes);
    }

    /**
     * Snapshots the current transaction log, allowing to safely iterate over the snapshot.
     * Snapshots are fixed in time and will not be updated with future operations.
     */
    public Snapshot newSnapshot() {
        ensureOpen();
        try (ReleasableLock lock = readLock.acquire()) {
            ArrayList<TranslogReader> toOpen = new ArrayList<>();
            toOpen.addAll(recoveredTranslogs);
            if (currentCommittingTranslog != null) {
                toOpen.add(currentCommittingTranslog);
            }
            toOpen.add(current);
            return createSnapshot(toOpen.toArray(new TranslogReader[toOpen.size()]));
        }
    }

    private static Snapshot createSnapshot(TranslogReader... translogs) {
        Snapshot[] snapshots = new Snapshot[translogs.length];
        boolean success = false;
        try {
            for (int i = 0; i < translogs.length; i++) {
                snapshots[i] = translogs[i].newSnapshot();
            }
            Snapshot snapshot = new MultiSnapshot(snapshots);
            success = true;
            return snapshot;
        } finally {
            if (success == false) {
                Releasables.close(snapshots);
            }
        }
    }

    /**
     * Returns a view into the current translog that is guaranteed to retain all current operations
     * while receiving future ones as well.
     */
    public Translog.View newView() {
        // we need to acquire the read lock to make sure no new translog is created
        // and will be missed by the view we're making
        try (ReleasableLock lock = readLock.acquire()) {
            ensureOpen();
            ArrayList<TranslogReader> translogs = new ArrayList<>();
            try {
                if (currentCommittingTranslog != null) {
                    translogs.add(currentCommittingTranslog.clone());
                }
                translogs.add(current.newReaderFromWriter());
                View view = new View(translogs, onViewClose);
                // this is safe as we know that no new translog is being made at the moment
                // (we hold a read lock) and the view will be notified of any future one
                outstandingViews.add(view);
                translogs.clear();
                return view;
            } finally {
                // close if anything happened and we didn't reach the clear
                IOUtils.closeWhileHandlingException(translogs);
            }
        }
    }

    /**
     * Syncs the translog.
     */
    public void sync() throws IOException {
        try (ReleasableLock lock = readLock.acquire()) {
            if (closed.get() == false) {
                current.sync();
            }
        } catch (Throwable ex) {
            closeOnTragicEvent(ex);
            throw ex;
        }
    }

    public boolean syncNeeded() {
        try (ReleasableLock lock = readLock.acquire()) {
            return current.syncNeeded();
        }
    }

    /** package private for testing */
    public static String getFilename(long generation) {
        return TRANSLOG_FILE_PREFIX + generation + TRANSLOG_FILE_SUFFIX;
    }

    static String getCommitCheckpointFileName(long generation) {
        return TRANSLOG_FILE_PREFIX + generation + CHECKPOINT_SUFFIX;
    }

    /**
     * Ensures that the given location has been synced / written to the underlying storage.
     *
     * @return Returns true iff this call caused an actual sync operation otherwise false
     */
    public boolean ensureSynced(Location location) throws IOException {
        try (ReleasableLock lock = readLock.acquire()) {
            if (location.generation == current.generation) { // if we have a new generation it's already synced
                ensureOpen();
                return current.syncUpTo(location.translogLocation + location.size);
            }
        } catch (Throwable ex) {
            closeOnTragicEvent(ex);
            throw ex;
        }
        return false;
    }

    private void closeOnTragicEvent(Throwable ex) {
        if (current.getTragicException() != null) {
            try {
                close();
            } catch (AlreadyClosedException inner) {
                // don't do anything in this case. The AlreadyClosedException comes from TranslogWriter and we should not add it as suppressed because
                // it will contain the Exception ex as cause. See also https://github.com/elastic/elasticsearch/issues/15941
            } catch (Exception inner) {
                assert (ex != inner.getCause());
                ex.addSuppressed(inner);
            }
        }
    }

    /**
     * return stats
     */
    public TranslogStats stats() {
        // acquire lock to make the two numbers roughly consistent (no file change half way)
        try (ReleasableLock lock = readLock.acquire()) {
            return new TranslogStats(totalOperations(), sizeInBytes());
        }
    }

    private boolean isReferencedGeneration(long generation) { // used to make decisions if a file can be deleted
        return generation >= lastCommittedTranslogFileGeneration;
    }

    public TranslogConfig getConfig() {
        return config;
    }

    private final class OnCloseRunnable implements Callback<ChannelReference> {
        @Override
        public void handle(ChannelReference channelReference) {
            if (isReferencedGeneration(channelReference.getGeneration()) == false) {
                Path translogPath = channelReference.getPath();
                assert channelReference.getPath().getParent().equals(location) : "translog files must be in the location folder: " + location + " but was: " + translogPath;
                // if the given translogPath is not the current we can safely delete the file since all references are released
                logger.trace("delete translog file - not referenced and not current anymore {}", translogPath);
                IOUtils.deleteFilesIgnoringExceptions(translogPath);
                IOUtils.deleteFilesIgnoringExceptions(translogPath.resolveSibling(getCommitCheckpointFileName(channelReference.getGeneration())));
            }
            try (DirectoryStream<Path> stream = Files.newDirectoryStream(location)) {
                for (Path path : stream) {
                    Matcher matcher = PARSE_STRICT_ID_PATTERN.matcher(path.getFileName().toString());
                    if (matcher.matches()) {
                        long generation = Long.parseLong(matcher.group(1));
                        if (isReferencedGeneration(generation) == false) {
                            logger.trace("delete translog file - not referenced and not current anymore {}", path);
                            IOUtils.deleteFilesIgnoringExceptions(path);
                            IOUtils.deleteFilesIgnoringExceptions(path.resolveSibling(getCommitCheckpointFileName(generation)));
                        }
                    }
                }
            } catch (IOException e) {
                logger.warn("failed to delete unreferenced translog files", e);
            }
        }
    }

    /**
     * A view into the translog, capturing all translog files at the moment of creation
     * and updated with any future translogs.
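     * <p>
     * A minimal sketch (illustrative only); the view must be closed to release the files it retains:
     * <pre>{@code
     * try (Translog.View view = translog.newView()) {
     *     try (Translog.Snapshot snapshot = view.snapshot()) {
     *         // iterate over all operations retained by the view
     *     }
     * }
     * }</pre>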
     */
    public static final class View implements Closeable {

        public static final Translog.View EMPTY_VIEW = new View(Collections.EMPTY_LIST, null);

        boolean closed;
        // last in this list is always FsTranslog.current
        final List<TranslogReader> orderedTranslogs;
        private final Callback<View> onClose;

        View(List<TranslogReader> orderedTranslogs, Callback<View> onClose) {
            // clone so we can safely mutate..
            this.orderedTranslogs = new ArrayList<>(orderedTranslogs);
            this.onClose = onClose;
        }

        /**
         * Called by the parent class whenever the current translog changes.
         *
         * @param oldCurrent a new read only reader for the old current (should replace the previous reference)
         * @param newCurrent a reader into the new current.
         */
        synchronized void onNewTranslog(TranslogReader oldCurrent, TranslogReader newCurrent) throws IOException {
            // even though the close method removes this view from outstandingViews, there is no synchronisation in place
            // between that operation and an ongoing addition of a new translog, already having an iterator.
            // As such, this method can be called despite the fact that we are closed. We need to check and ignore.
            if (closed) {
                // we have to close the new references created for us as we will not hold them
                IOUtils.close(oldCurrent, newCurrent);
                return;
            }
            orderedTranslogs.remove(orderedTranslogs.size() - 1).close();
            orderedTranslogs.add(oldCurrent);
            orderedTranslogs.add(newCurrent);
        }

        /** the smallest translog generation in this view */
        public synchronized long minTranslogGeneration() {
            ensureOpen();
            return orderedTranslogs.get(0).getGeneration();
        }

        /**
         * The total number of operations in the view.
         */
        public synchronized int totalOperations() {
            int ops = 0;
            for (TranslogReader translog : orderedTranslogs) {
                int tops = translog.totalOperations();
                if (tops == TranslogReader.UNKNOWN_OP_COUNT) {
                    return -1;
                }
                assert tops >= 0;
                ops += tops;
            }
            return ops;
        }

        /**
         * Returns the size in bytes of the files behind the view.
         */
        public synchronized long sizeInBytes() {
            long size = 0;
            for (TranslogReader translog : orderedTranslogs) {
                size += translog.sizeInBytes();
            }
            return size;
        }

        /** create a snapshot from this view */
        public synchronized Snapshot snapshot() {
            ensureOpen();
            return createSnapshot(orderedTranslogs.toArray(new TranslogReader[orderedTranslogs.size()]));
        }

        void ensureOpen() {
            if (closed) {
                throw new ElasticsearchException("View is already closed");
            }
        }

        @Override
        public void close() {
            final List<TranslogReader> toClose = new ArrayList<>();
            try {
                synchronized (this) {
                    if (closed == false) {
                        try {
                            if (onClose != null) {
                                onClose.handle(this);
                            }
                        } finally {
                            closed = true;
                            toClose.addAll(orderedTranslogs);
                            orderedTranslogs.clear();
                        }
                    }
                }
            } finally {
                try {
                    // Close out of lock to prevent deadlocks between channel close which checks for
                    // references in InternalChannelReference.closeInternal (waiting on a read lock)
                    // and other FsTranslog#newTranslog calling FsView.onNewTranslog (while having a write lock)
                    IOUtils.close(toClose);
                } catch (Exception e) {
                    throw new ElasticsearchException("failed to close view", e);
                }
            }
        }
    }

    class Sync implements Runnable {
        @Override
        public void run() {
            // don't re-schedule if it's closed..., we are done
            if (closed.get()) {
                return;
            }
            final ThreadPool threadPool = config.getThreadPool();
            if (syncNeeded()) {
                threadPool.executor(ThreadPool.Names.FLUSH).execute(new Runnable() {
                    @Override
                    public void run() {
                        try {
                            sync();
                        } catch (Exception e) {
                            logger.warn("failed to sync translog", e);
                        }
                        if (closed.get() == false) {
                            syncScheduler = threadPool.schedule(config.getSyncInterval(), ThreadPool.Names.SAME, Sync.this);
                        }
                    }
                });
            } else {
                syncScheduler = threadPool.schedule(config.getSyncInterval(), ThreadPool.Names.SAME, Sync.this);
            }
        }
    }

    public static class Location implements Accountable, Comparable<Location> {

        public final long generation;
        public final long translogLocation;
        public final int size;

        Location(long generation, long translogLocation, int size) {
            this.generation = generation;
            this.translogLocation = translogLocation;
            this.size = size;
        }

        @Override
        public long ramBytesUsed() {
            return RamUsageEstimator.NUM_BYTES_OBJECT_HEADER + 2 * RamUsageEstimator.NUM_BYTES_LONG + RamUsageEstimator.NUM_BYTES_INT;
        }

        @Override
        public Collection<Accountable> getChildResources() {
            return Collections.emptyList();
        }

        @Override
        public String toString() {
            return "[generation: " + generation + ", location: " + translogLocation + ", size: " + size + "]";
        }

        @Override
        public int compareTo(Location o) {
            if (generation == o.generation) {
                return Long.compare(translogLocation, o.translogLocation);
            }
            return Long.compare(generation, o.generation);
        }

        @Override
        public boolean equals(Object o) {
            if (this == o) {
                return true;
            }
            if (o == null || getClass() != o.getClass()) {
                return false;
            }

            Location location = (Location) o;

            if (generation != location.generation) {
                return false;
            }
            if (translogLocation != location.translogLocation) {
                return false;
            }
            return size == location.size;
        }

        @Override
        public int hashCode() {
            int result = (int) (generation ^ (generation >>> 32));
            result = 31 * result + (int) (translogLocation ^ (translogLocation >>> 32));
            result = 31 * result + size;
            return result;
        }
    }

    /**
     * A snapshot of the transaction log, allows to iterate over all the transaction log operations.
     */
    public interface Snapshot extends Releasable {

        /**
         * The total number of operations in the translog.
         */
        int estimatedTotalOperations();

        /**
         * Returns the next operation in the snapshot or null if we reached the end.
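         * <p>
         * A typical consumption loop (illustrative only):
         * <pre>{@code
         * Translog.Operation op;
         * while ((op = snapshot.next()) != null) {
         *     // replay op
         * }
         * }</pre>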
         */
        Translog.Operation next() throws IOException;
    }

    /**
     * A generic interface representing an operation performed on the transaction log.
     * Each is associated with a type.
     */
    public interface Operation extends Streamable {

        enum Type {
            CREATE((byte) 1),
            SAVE((byte) 2),
            DELETE((byte) 3),
            DELETE_BY_QUERY((byte) 4);

            private final byte id;

            private Type(byte id) {
                this.id = id;
            }

            public byte id() {
                return this.id;
            }

            public static Type fromId(byte id) {
                switch (id) {
                    case 1:
                        return CREATE;
                    case 2:
                        return SAVE;
                    case 3:
                        return DELETE;
                    case 4:
                        return DELETE_BY_QUERY;
                    default:
                        throw new IllegalArgumentException("No type mapped for [" + id + "]");
                }
            }
        }

        Type opType();

        long estimateSize();

        Source getSource();
    }

    public static class Source {
        public final BytesReference source;
        public final String routing;
        public final String parent;
        public final long timestamp;
        public final long ttl;

        public Source(BytesReference source, String routing, String parent, long timestamp, long ttl) {
            this.source = source;
            this.routing = routing;
            this.parent = parent;
            this.timestamp = timestamp;
            this.ttl = ttl;
        }
    }

    public static class Create implements Operation {
        public static final int SERIALIZATION_FORMAT = 6;

        private String id;
        private String type;
        private BytesReference source;
        private String routing;
        private String parent;
        private long timestamp;
        private long ttl;
        private long version = Versions.MATCH_ANY;
        private VersionType versionType = VersionType.INTERNAL;

        public Create() {
        }

        public Create(Engine.Create create) {
            this.id = create.id();
            this.type = create.type();
            this.source = create.source();
            this.routing = create.routing();
            this.parent = create.parent();
            this.timestamp = create.timestamp();
            this.ttl = create.ttl();
            this.version = create.version();
            this.versionType = create.versionType();
        }

        public Create(String type, String id, byte[] source) {
            this.id = id;
            this.type = type;
            this.source = new BytesArray(source);
        }

        @Override
        public Type opType() {
            return Type.CREATE;
        }

        @Override
        public long estimateSize() {
            return ((id.length() + type.length()) * 2) + source.length() + 12;
        }

        public String id() {
            return this.id;
        }

        public BytesReference source() {
            return this.source;
        }

        public String type() {
            return this.type;
        }

        public String routing() {
            return this.routing;
        }

        public String parent() {
            return this.parent;
        }

        public long timestamp() {
            return this.timestamp;
        }

        public long ttl() {
            return this.ttl;
        }

        public long version() {
            return this.version;
        }

        public VersionType versionType() {
            return versionType;
        }

        @Override
        public Source getSource() {
            return new Source(source, routing, parent, timestamp, ttl);
        }

        @Override
        public void readFrom(StreamInput in) throws IOException {
            int version = in.readVInt(); // version
            id = in.readString();
            type = in.readString();
            source = in.readBytesReference();
            if (version >= 1) {
                if (in.readBoolean()) {
                    routing = in.readString();
                }
            }
            if (version >= 2) {
                if (in.readBoolean()) {
                    parent = in.readString();
                }
            }
            if (version >= 3) {
                this.version = in.readLong();
            }
            if (version >= 4) {
                this.timestamp = in.readLong();
            }
            if (version >= 5) {
                this.ttl = in.readLong();
            }
            if (version >= 6) {
                this.versionType = VersionType.fromValue(in.readByte());
            }
            assert versionType.validateVersionForWrites(version);
        }

        @Override
        public void writeTo(StreamOutput out) throws IOException {
            out.writeVInt(SERIALIZATION_FORMAT);
            out.writeString(id);
            out.writeString(type);
            out.writeBytesReference(source);
            if (routing == null) {
                out.writeBoolean(false);
            } else {
                out.writeBoolean(true);
                out.writeString(routing);
            }
            if (parent == null) {
                out.writeBoolean(false);
            } else {
                out.writeBoolean(true);
                out.writeString(parent);
            }
            out.writeLong(version);
            out.writeLong(timestamp);
            out.writeLong(ttl);
            out.writeByte(versionType.getValue());
        }

        @Override
        public boolean equals(Object o) {
            if (this == o) {
                return true;
            }
            if (o == null || getClass() != o.getClass()) {
                return false;
            }

            Create create = (Create) o;

            if (timestamp != create.timestamp ||
                    ttl != create.ttl ||
                    version != create.version ||
                    id.equals(create.id) == false ||
                    type.equals(create.type) == false ||
                    source.equals(create.source) == false) {
                return false;
            }
            if (routing != null ? !routing.equals(create.routing) : create.routing != null) {
                return false;
            }
            if (parent != null ? !parent.equals(create.parent) : create.parent != null) {
                return false;
            }
            return versionType == create.versionType;
        }

        @Override
        public int hashCode() {
            int result = id.hashCode();
            result = 31 * result + type.hashCode();
            result = 31 * result + source.hashCode();
            result = 31 * result + (routing != null ? routing.hashCode() : 0);
            result = 31 * result + (parent != null ? parent.hashCode() : 0);
            result = 31 * result + (int) (timestamp ^ (timestamp >>> 32));
            result = 31 * result + (int) (ttl ^ (ttl >>> 32));
            result = 31 * result + (int) (version ^ (version >>> 32));
            result = 31 * result + versionType.hashCode();
            return result;
        }

        @Override
        public String toString() {
            return "Create{" +
                    "id='" + id + '\'' +
                    ", type='" + type + '\'' +
                    '}';
        }
    }

    public static class Index implements Operation {
        public static final int SERIALIZATION_FORMAT = 6;

        private String id;
        private String type;
        private long version = Versions.MATCH_ANY;
        private VersionType versionType = VersionType.INTERNAL;
        private BytesReference source;
        private String routing;
        private String parent;
        private long timestamp;
        private long ttl;

        public Index() {
        }

        public Index(Engine.Index index) {
            this.id = index.id();
            this.type = index.type();
            this.source = index.source();
            this.routing = index.routing();
            this.parent = index.parent();
            this.version = index.version();
            this.timestamp = index.timestamp();
            this.ttl = index.ttl();
            this.versionType = index.versionType();
        }

        public Index(String type, String id, byte[] source) {
            this.type = type;
            this.id = id;
            this.source = new BytesArray(source);
        }

        @Override
        public Type opType() {
            return Type.SAVE;
        }

        @Override
        public long estimateSize() {
            return ((id.length() + type.length()) * 2) + source.length() + 12;
        }

        public String type() {
            return this.type;
        }

        public String id() {
            return this.id;
        }

        public String routing() {
            return this.routing;
        }

        public String parent() {
            return this.parent;
        }

        public long timestamp() {
            return this.timestamp;
        }

        public long ttl() {
            return this.ttl;
        }

        public BytesReference source() {
            return this.source;
        }

        public long version() {
            return this.version;
        }

        public VersionType versionType() {
            return versionType;
        }

        @Override
        public Source getSource() {
            return new Source(source, routing, parent, timestamp, ttl);
        }

        @Override
        public void readFrom(StreamInput in) throws IOException {
            int version = in.readVInt(); // version
            id = in.readString();
            type = in.readString();
            source = in.readBytesReference();
            try {
                if (version >= 1) {
                    if (in.readBoolean()) {
                        routing = in.readString();
                    }
                }
                if (version >= 2) {
                    if (in.readBoolean()) {
                        parent = in.readString();
                    }
                }
                if (version >= 3) {
                    this.version = in.readLong();
                }
                if (version >= 4) {
                    this.timestamp = in.readLong();
                }
                if (version >= 5) {
                    this.ttl = in.readLong();
                }
                if (version >= 6) {
                    this.versionType = VersionType.fromValue(in.readByte());
                }
            } catch (Exception e) {
                throw new ElasticsearchException("failed to read [" + type + "][" + id + "]", e);
ElasticsearchException("failed to read [" + type + "][" + id + "]", e); } assert versionType.validateVersionForWrites(version); } @Override public void writeTo(StreamOutput out) throws IOException { out.writeVInt(SERIALIZATION_FORMAT); out.writeString(id); out.writeString(type); out.writeBytesReference(source); if (routing == null) { out.writeBoolean(false); } else { out.writeBoolean(true); out.writeString(routing); } if (parent == null) { out.writeBoolean(false); } else { out.writeBoolean(true); out.writeString(parent); } out.writeLong(version); out.writeLong(timestamp); out.writeLong(ttl); out.writeByte(versionType.getValue()); } @Override public boolean equals(Object o) { if (this == o) { return true; } if (o == null || getClass() != o.getClass()) { return false; } Index index = (Index) o; if (version != index.version || timestamp != index.timestamp || ttl != index.ttl || id.equals(index.id) == false || type.equals(index.type) == false || versionType != index.versionType || source.equals(index.source) == false) { return false; } if (routing != null ? !routing.equals(index.routing) : index.routing != null) { return false; } return !(parent != null ? !parent.equals(index.parent) : index.parent != null); } @Override public int hashCode() { int result = id.hashCode(); result = 31 * result + type.hashCode(); result = 31 * result + (int) (version ^ (version >>> 32)); result = 31 * result + versionType.hashCode(); result = 31 * result + source.hashCode(); result = 31 * result + (routing != null ? routing.hashCode() : 0); result = 31 * result + (parent != null ? parent.hashCode() : 0); result = 31 * result + (int) (timestamp ^ (timestamp >>> 32)); result = 31 * result + (int) (ttl ^ (ttl >>> 32)); return result; } @Override public String toString() { return "Index{" + "id='" + id + '\'' + ", type='" + type + '\'' + '}'; } } public static class Delete implements Operation { public static final int SERIALIZATION_FORMAT = 2; private Term uid; private long version = Versions.MATCH_ANY; private VersionType versionType = VersionType.INTERNAL; public Delete() { } public Delete(Engine.Delete delete) { this(delete.uid()); this.version = delete.version(); this.versionType = delete.versionType(); } public Delete(Term uid) { this.uid = uid; } public Delete(Term uid, long version, VersionType versionType) { this.uid = uid; this.version = version; this.versionType = versionType; } @Override public Type opType() { return Type.DELETE; } @Override public long estimateSize() { return ((uid.field().length() + uid.text().length()) * 2) + 20; } public Term uid() { return this.uid; } public long version() { return this.version; } public VersionType versionType() { return this.versionType; } @Override public Source getSource() { throw new IllegalStateException("trying to read doc source from delete operation"); } @Override public void readFrom(StreamInput in) throws IOException { int version = in.readVInt(); // version uid = new Term(in.readString(), in.readString()); if (version >= 1) { this.version = in.readLong(); } if (version >= 2) { this.versionType = VersionType.fromValue(in.readByte()); } assert versionType.validateVersionForWrites(version); } @Override public void writeTo(StreamOutput out) throws IOException { out.writeVInt(SERIALIZATION_FORMAT); out.writeString(uid.field()); out.writeString(uid.text()); out.writeLong(version); out.writeByte(versionType.getValue()); } @Override public boolean equals(Object o) { if (this == o) { return true; } if (o == null || getClass() != o.getClass()) { return false; } 
            Delete delete = (Delete) o;

            return version == delete.version &&
                    uid.equals(delete.uid) &&
                    versionType == delete.versionType;
        }

        @Override
        public int hashCode() {
            int result = uid.hashCode();
            result = 31 * result + (int) (version ^ (version >>> 32));
            result = 31 * result + versionType.hashCode();
            return result;
        }

        @Override
        public String toString() {
            return "Delete{" +
                    "uid=" + uid +
                    '}';
        }
    }

    /** @deprecated Delete-by-query is removed in 2.0, but we keep this so translog can replay on upgrade. */
    @Deprecated
    public static class DeleteByQuery implements Operation {

        public static final int SERIALIZATION_FORMAT = 2;
        private BytesReference source;
        @Nullable
        private String[] filteringAliases;
        private String[] types = Strings.EMPTY_ARRAY;

        public DeleteByQuery() {
        }

        public DeleteByQuery(Engine.DeleteByQuery deleteByQuery) {
            this(deleteByQuery.source(), deleteByQuery.filteringAliases(), deleteByQuery.types());
        }

        public DeleteByQuery(BytesReference source, String[] filteringAliases, String... types) {
            this.source = source;
            this.types = types == null ? Strings.EMPTY_ARRAY : types;
            this.filteringAliases = filteringAliases;
        }

        @Override
        public Type opType() {
            return Type.DELETE_BY_QUERY;
        }

        @Override
        public long estimateSize() {
            return source.length() + 8;
        }

        public BytesReference source() {
            return this.source;
        }

        public String[] filteringAliases() {
            return filteringAliases;
        }

        public String[] types() {
            return this.types;
        }

        @Override
        public Source getSource() {
            throw new IllegalStateException("trying to read doc source from delete_by_query operation");
        }

        @Override
        public void readFrom(StreamInput in) throws IOException {
            int version = in.readVInt(); // version
            source = in.readBytesReference();
            if (version < 2) {
                // for query_parser_name, which was removed
                if (in.readBoolean()) {
                    in.readString();
                }
            }
            int typesSize = in.readVInt();
            if (typesSize > 0) {
                types = new String[typesSize];
                for (int i = 0; i < typesSize; i++) {
                    types[i] = in.readString();
                }
            }
            if (version >= 1) {
                int aliasesSize = in.readVInt();
                if (aliasesSize > 0) {
                    filteringAliases = new String[aliasesSize];
                    for (int i = 0; i < aliasesSize; i++) {
                        filteringAliases[i] = in.readString();
                    }
                }
            }
        }

        @Override
        public void writeTo(StreamOutput out) throws IOException {
            out.writeVInt(SERIALIZATION_FORMAT);
            out.writeBytesReference(source);
            out.writeVInt(types.length);
            for (String type : types) {
                out.writeString(type);
            }
            if (filteringAliases != null) {
                out.writeVInt(filteringAliases.length);
                for (String alias : filteringAliases) {
                    out.writeString(alias);
                }
            } else {
                out.writeVInt(0);
            }
        }

        @Override
        public boolean equals(Object o) {
            if (this == o) {
                return true;
            }
            if (o == null || getClass() != o.getClass()) {
                return false;
            }

            DeleteByQuery that = (DeleteByQuery) o;

            if (!Arrays.equals(filteringAliases, that.filteringAliases)) {
                return false;
            }
            if (!Arrays.equals(types, that.types)) {
                return false;
            }
            return source.equals(that.source);
        }

        @Override
        public int hashCode() {
            int result = source.hashCode();
            result = 31 * result + (filteringAliases != null ? Arrays.hashCode(filteringAliases) : 0);
            result = 31 * result + Arrays.hashCode(types);
            return result;
        }

        @Override
        public String toString() {
            return "DeleteByQuery{" +
                    "types=" + Arrays.toString(types) +
                    '}';
        }
    }

    public enum Durabilty {
        /**
         * Async durability - translogs are synced based on a time interval.
         */
        ASYNC,
        /**
         * Request durability - translogs are synced for each high level request (bulk, index, delete)
         */
        REQUEST;
    }

    private static void verifyChecksum(BufferedChecksumStreamInput in) throws IOException {
        // This absolutely must come first, or else reading the checksum becomes part of the checksum
        long expectedChecksum = in.getChecksum();
        long readChecksum = in.readInt() & 0xFFFF_FFFFL;
        if (readChecksum != expectedChecksum) {
            throw new TranslogCorruptedException("translog stream is corrupted, expected: 0x" + Long.toHexString(expectedChecksum) + ", got: 0x" + Long.toHexString(readChecksum));
        }
    }

    /**
     * Reads a list of operations written with {@link #writeOperations(StreamOutput, List)}
     */
    public static List<Operation> readOperations(StreamInput input) throws IOException {
        ArrayList<Operation> operations = new ArrayList<>();
        int numOps = input.readInt();
        final BufferedChecksumStreamInput checksumStreamInput = new BufferedChecksumStreamInput(input);
        for (int i = 0; i < numOps; i++) {
            operations.add(readOperation(checksumStreamInput));
        }
        return operations;
    }

    static Translog.Operation readOperation(BufferedChecksumStreamInput in) throws IOException {
        Translog.Operation operation;
        try {
            final int opSize = in.readInt();
            if (opSize < 4) { // 4 bytes for the checksum
                throw new AssertionError("operation size must be at least 4 but was: " + opSize);
            }
            in.resetDigest(); // size is not part of the checksum!
            if (in.markSupported()) { // if we can we validate the checksum first
                // we are sometimes called when mark is not supported this is the case when
                // we are sending translogs across the network with LZ4 compression enabled - currently there is no way
                // to prevent this unfortunately.
                in.mark(opSize);
                in.skip(opSize - 4);
                verifyChecksum(in);
                in.reset();
            }
            Translog.Operation.Type type = Translog.Operation.Type.fromId(in.readByte());
            operation = newOperationFromType(type);
            operation.readFrom(in);
            verifyChecksum(in);
        } catch (EOFException e) {
            throw new TruncatedTranslogException("reached premature end of file, translog is truncated", e);
        } catch (AssertionError | Exception e) {
            throw new TranslogCorruptedException("translog corruption while reading from stream", e);
        }
        return operation;
    }

    /**
     * Writes all operations in the given iterable to the given output stream, including the size of the array.
     * Use {@link #readOperations(StreamInput)} to read it back.
     */
    public static void writeOperations(StreamOutput outStream, List<Operation> toWrite) throws IOException {
        final ReleasableBytesStreamOutput out = new ReleasableBytesStreamOutput(BigArrays.NON_RECYCLING_INSTANCE);
        try {
            outStream.writeInt(toWrite.size());
            final BufferedChecksumStreamOutput checksumStreamOutput = new BufferedChecksumStreamOutput(out);
            for (Operation op : toWrite) {
                out.reset();
                final long start = out.position();
                out.skip(RamUsageEstimator.NUM_BYTES_INT);
                writeOperationNoSize(checksumStreamOutput, op);
                long end = out.position();
                int operationSize = (int) (out.position() - RamUsageEstimator.NUM_BYTES_INT - start);
                out.seek(start);
                out.writeInt(operationSize);
                out.seek(end);
                ReleasablePagedBytesReference bytes = out.bytes();
                bytes.writeTo(outStream);
            }
        } finally {
            Releasables.close(out.bytes());
        }
    }

    public static void writeOperationNoSize(BufferedChecksumStreamOutput out, Translog.Operation op) throws IOException {
        // This BufferedChecksumStreamOutput remains unclosed on purpose,
        // because closing it closes the underlying stream, which we don't
        // want to do here.
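        // Record layout as consumed by readOperation: [type:byte][operation body][checksum:int].
        // The leading size prefix is written by the caller (see add(...) and writeOperations(...))
        // and is deliberately excluded from the checksum, which is reset just below.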
        out.resetDigest();
        out.writeByte(op.opType().id());
        op.writeTo(out);
        long checksum = out.getChecksum();
        out.writeInt((int) checksum);
    }

    /**
     * Returns a new empty translog operation for the given {@link Translog.Operation.Type}
     */
    static Translog.Operation newOperationFromType(Translog.Operation.Type type) throws IOException {
        switch (type) {
            case CREATE:
                return new Translog.Create();
            case DELETE:
                return new Translog.Delete();
            case DELETE_BY_QUERY:
                return new Translog.DeleteByQuery();
            case SAVE:
                return new Translog.Index();
            default:
                throw new IOException("No type for [" + type + "]");
        }
    }

    @Override
    public void prepareCommit() throws IOException {
        try (ReleasableLock lock = writeLock.acquire()) {
            ensureOpen();
            if (currentCommittingTranslog != null) {
                throw new IllegalStateException("already committing a translog with generation: " + currentCommittingTranslog.getGeneration());
            }
            final TranslogWriter oldCurrent = current;
            oldCurrent.ensureOpen();
            oldCurrent.sync();
            currentCommittingTranslog = current.immutableReader();
            Path checkpoint = location.resolve(CHECKPOINT_FILE_NAME);
            assert Checkpoint.read(checkpoint).generation == currentCommittingTranslog.getGeneration();
            Path commitCheckpoint = location.resolve(getCommitCheckpointFileName(currentCommittingTranslog.getGeneration()));
            Files.copy(checkpoint, commitCheckpoint);
            IOUtils.fsync(commitCheckpoint, false);
            IOUtils.fsync(commitCheckpoint.getParent(), true);
            // create a new translog file - this will sync it and update the checkpoint data;
            current = createWriter(current.getGeneration() + 1);
            // notify all outstanding views of the new translog (no views are created now as
            // we hold a write lock).
            for (View view : outstandingViews) {
                view.onNewTranslog(currentCommittingTranslog.clone(), current.newReaderFromWriter());
            }
            IOUtils.close(oldCurrent);
            logger.trace("current translog set to [{}]", current.getGeneration());
            assert oldCurrent.syncNeeded() == false : "old translog oldCurrent must not need a sync";
        } catch (Throwable t) {
            IOUtils.closeWhileHandlingException(this); // tragic event
            throw t;
        }
    }

    @Override
    public void commit() throws IOException {
        ImmutableTranslogReader toClose = null;
        try (ReleasableLock lock = writeLock.acquire()) {
            ensureOpen();
            if (currentCommittingTranslog == null) {
                prepareCommit();
            }
            lastCommittedTranslogFileGeneration = current.getGeneration(); // this is important - otherwise old files will not be cleaned up
            if (recoveredTranslogs.isEmpty() == false) {
                IOUtils.close(recoveredTranslogs);
                recoveredTranslogs.clear();
            }
            toClose = this.currentCommittingTranslog;
            this.currentCommittingTranslog = null;
        } finally {
            IOUtils.close(toClose);
        }
    }

    @Override
    public void rollback() throws IOException {
        ensureOpen();
        close();
    }

    /**
     * References a transaction log generation
     */
    public final static class TranslogGeneration {
        public final String translogUUID;
        public final long translogFileGeneration;

        public TranslogGeneration(String translogUUID, long translogFileGeneration) {
            this.translogUUID = translogUUID;
            this.translogFileGeneration = translogFileGeneration;
        }
    }

    /**
     * Returns the current generation of this translog. This corresponds to the latest uncommitted translog generation.
     */
    public TranslogGeneration getGeneration() {
        try (ReleasableLock lock = writeLock.acquire()) {
            return new TranslogGeneration(translogUUID, currentFileGeneration());
        }
    }

    /**
     * Returns true iff the given generation is the current generation of this translog
     */
    public boolean isCurrent(TranslogGeneration generation) {
        try (ReleasableLock lock = writeLock.acquire()) {
            if (generation != null) {
                if (generation.translogUUID.equals(translogUUID) == false) {
                    throw new IllegalArgumentException("commit belongs to a different translog: " + generation.translogUUID + " vs. " + translogUUID);
                }
                return generation.translogFileGeneration == currentFileGeneration();
            }
        }
        return false;
    }

    long getFirstOperationPosition() { // for testing
        return current.getFirstOperationOffset();
    }

    List<ImmutableTranslogReader> getRecoveredReaders() { // for testing
        return this.recoveredTranslogs;
    }

    private void ensureOpen() {
        if (closed.get()) {
            throw new AlreadyClosedException("translog is already closed", current.getTragicException());
        }
    }

    /**
     * The number of currently open views
     */
    int getNumOpenViews() {
        return outstandingViews.size();
    }

    private static class PathWithGeneration {
        private final Path path;
        private final long generation;

        public PathWithGeneration(Path path, long generation) {
            this.path = path;
            this.generation = generation;
        }

        public Path getPath() {
            return path;
        }

        public long getGeneration() {
            return generation;
        }
    }

    TranslogWriter.ChannelFactory getChannelFactory() {
        return TranslogWriter.ChannelFactory.DEFAULT;
    }

    /**
     * If this {@code Translog} was closed as a side-effect of a tragic exception,
     * e.g. disk full while flushing a new segment, this returns the root cause exception.
     * Otherwise (no tragic exception has occurred) it returns null.
     */
    public Throwable getTragicException() {
        return current.getTragicException();
    }

    /** Reads and returns the current checkpoint */
    final Checkpoint readCheckpoint() throws IOException {
        return Checkpoint.read(location.resolve(CHECKPOINT_FILE_NAME));
    }
}



