org.elasticsearch.index.store.Store Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of org.elasticsearch Show documentation
The newest version!
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.index.store;

import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Iterables;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.index.*;
import org.apache.lucene.store.*;
import org.apache.lucene.util.*;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.ExceptionsHelper;
import org.apache.lucene.util.Version;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.collect.Tuple;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.io.Streams;
import org.elasticsearch.common.io.stream.BytesStreamOutput;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.io.stream.Writeable;
import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.common.logging.Loggers;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.common.lucene.store.ByteArrayIndexInput;
import org.elasticsearch.common.lucene.store.InputStreamIndexInput;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.common.util.Callback;
import org.elasticsearch.common.util.SingleObjectCache;
import org.elasticsearch.common.util.concurrent.AbstractRefCounted;
import org.elasticsearch.common.util.concurrent.RefCounted;
import org.elasticsearch.env.ShardLock;
import org.elasticsearch.index.engine.Engine;
import org.elasticsearch.index.settings.IndexSettingsService;
import org.elasticsearch.index.shard.AbstractIndexShardComponent;
import org.elasticsearch.index.shard.ShardId;

import java.io.*;
import java.nio.file.NoSuchFileException;
import java.nio.file.Path;
import java.util.*;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.zip.Adler32;
import java.util.zip.CRC32;
import java.util.zip.Checksum;

/**
 * A Store provides plain access to files written by an elasticsearch index shard. Each shard
 * has a dedicated store that it uses to access Lucene's Directory which represents the lowest level
 * of file abstraction in Lucene used to read and write Lucene indices.
 * This class also provides access to metadata information like checksums for committed files. A committed
 * file is a file that belongs to a segment written by a Lucene commit. Files that have not been committed
 * ie. created during a merge or a shard refresh / NRT reopen are not considered in the MetadataSnapshot.
 * 
 * Note: If you use a store its reference count should be increased before using it by calling #incRef and a
 * corresponding #decRef must be called in a try/finally block to release the store again ie.:
 * 
 *      store.incRef();
 *      try {
 *        // use the store...
 *
 *      } finally {
 *          store.decRef();
 *      }
 * 
 */
public class Store extends AbstractIndexShardComponent implements Closeable, RefCounted {

    // Codec name checked against the header of corruption marker files (see failIfCorrupted).
    static final String CODEC = "store";
    static final int VERSION_WRITE_THROWABLE= 2; // we write throwable since 2.0
    static final int VERSION_STACK_TRACE = 1; // we write the stack trace too since 1.4.0
    // Oldest corruption marker format accepted by the header check.
    static final int VERSION_START = 0;
    // Newest corruption marker format accepted by the header check.
    static final int VERSION = VERSION_WRITE_THROWABLE;
    // File name prefix that identifies a corruption marker inside the store directory.
    static final String CORRUPTED = "corrupted_";
    // Index setting with the refresh interval of the cached store stats; the constructor falls back to 10 seconds.
    public static final String INDEX_STORE_STATS_REFRESH_INTERVAL = "index.store.stats_refresh_interval";

    // Flipped at most once by close() so that close() calls decRef() only a single time.
    private final AtomicBoolean isClosed = new AtomicBoolean(false);
    // Wrapper around the directory created by the DirectoryService; really closed only in closeInternal().
    private final StoreDirectory directory;
    // Read lock: taken while a metadata snapshot is built. Write lock: taken while files are renamed or cleaned up.
    private final ReentrantReadWriteLock metadataLock = new ReentrantReadWriteLock();
    // Passed to the OnClose callback and closed in closeInternal().
    private final ShardLock shardLock;
    // Callback invoked from closeInternal() after the directory has been closed.
    private final OnClose onClose;
    // NOTE(review): raw type - stats() returns the getOrRefresh() result as StoreStats, so this presumably
    // should be SingleObjectCache<StoreStats>; confirm against SingleObjectCache's declaration.
    private final SingleObjectCache statsCache;

    // Reference counter behind incRef()/tryIncRef()/decRef()/refCount(); its closeInternal() hook
    // forwards to Store#closeInternal().
    private final AbstractRefCounted refCounter = new AbstractRefCounted("store") {
        @Override
        protected void closeInternal() {
            // close us once we are done
            Store.this.closeInternal();
        }
    };

    /**
     * Creates a store that uses {@link OnClose#EMPTY} as its close callback.
     *
     * @throws IOException propagated from the delegated constructor
     */
    public Store(ShardId shardId, Settings indexSettings, DirectoryService directoryService,
                 ShardLock shardLock) throws IOException {
        this(shardId, indexSettings, directoryService, shardLock, OnClose.EMPTY);
    }

    /**
     * Injection entry point: takes the index settings from the given
     * {@link IndexSettingsService} and delegates to the settings based constructor.
     *
     * @throws IOException propagated from the delegated constructor
     */
    @Inject
    public Store(ShardId shardId, IndexSettingsService indexSettingsService, DirectoryService directoryService,
                 ShardLock shardLock, OnClose onClose) throws IOException {
        this(shardId, indexSettingsService.getSettings(), directoryService, shardLock, onClose);
    }

    /**
     * Creates a store on top of a new directory obtained from the given directory service.
     *
     * @param shardId          the shard this store belongs to; must match the id of {@code shardLock}
     * @param indexSettings    index settings, also consulted for {@link #INDEX_STORE_STATS_REFRESH_INTERVAL}
     * @param directoryService factory for the underlying directory
     * @param shardLock        lock of the shard; must not be null
     * @param onClose          callback invoked when the store is finally closed; must not be null
     * @throws IOException if the directory can not be created
     */
    public Store(ShardId shardId, Settings indexSettings, DirectoryService directoryService,
                 ShardLock shardLock, OnClose onClose) throws IOException {
        super(shardId, indexSettings);
        this.directory = new StoreDirectory(
                directoryService.newDirectory(),
                Loggers.getLogger("index.store.deletes", indexSettings, shardId));
        this.shardLock = shardLock;
        this.onClose = onClose;

        // stats are served from a cache that is refreshed at most once per interval (10s unless configured)
        final TimeValue statsRefreshInterval =
                indexSettings.getAsTime(INDEX_STORE_STATS_REFRESH_INTERVAL, TimeValue.timeValueSeconds(10));
        this.statsCache = new StoreStatsCache(statsRefreshInterval, directory, directoryService);
        logger.debug("store stats are refreshed with refresh_interval [{}]", statsRefreshInterval);

        assert onClose != null;
        assert shardLock != null;
        assert shardLock.getShardId().equals(shardId);
    }

    /**
     * Returns the directory of this store.
     *
     * @throws AlreadyClosedException if the store has no references left
     */
    public Directory directory() {
        this.ensureOpen();
        return this.directory;
    }

    /**
     * Returns the last committed segments info for this store
     *
     * @throws IOException if the index is corrupted or the segments file is not present
     */
    public SegmentInfos readLastCommittedSegmentsInfo() throws IOException {
        failIfCorrupted();
        final SegmentInfos lastCommitted;
        try {
            // a null commit selects the latest commit of the directory
            lastCommitted = readSegmentsInfo(null, directory());
        } catch (CorruptIndexException corruption) {
            // record the corruption in the store before handing the failure to the caller
            markStoreCorrupted(corruption);
            throw corruption;
        }
        return lastCommitted;
    }

    /**
     * Returns the segments info for the given commit or for the latest commit if the given commit is null
     *
     * @throws IOException if the index is corrupted or the segments file is not present
     */
    private static SegmentInfos readSegmentsInfo(IndexCommit commit, Directory directory) throws IOException {
        assert commit == null || commit.getDirectory() == directory;
        try {
            if (commit != null) {
                return Lucene.readSegmentInfos(commit);
            }
            // no explicit commit given - read the latest one from the directory
            return Lucene.readSegmentInfos(directory);
        } catch (EOFException eof) {
            // TODO this should be caught by lucene - EOF is almost certainly an index corruption
            throw new CorruptIndexException("Read past EOF while reading segment infos", "commit(" + commit + ")", eof);
        } catch (IOException exception) {
            // IOExceptions like too many open files are not necessarily a corruption - just bubble it up
            throw exception;
        } catch (Exception ex) {
            // anything that is not an IOException is reported as a corruption
            throw new CorruptIndexException("Hit unexpected exception while reading segment infos", "commit(" + commit + ")", ex);
        }
    }

    /**
     * Fails if this store has no references left.
     *
     * @throws AlreadyClosedException if the reference count is zero or below
     */
    final void ensureOpen() {
        final boolean stillReferenced = this.refCounter.refCount() > 0;
        if (stillReferenced == false) {
            throw new AlreadyClosedException("store is already closed");
        }
    }

    /**
     * Returns a new MetadataSnapshot for the latest commit in this store or
     * an empty snapshot if no index exists or can not be opened.
     *
     * @throws CorruptIndexException      if the lucene index is corrupted. This can be caused by a checksum mismatch or an
     *                                    unexpected exception when opening the index reading the segments file.
     * @throws IndexFormatTooOldException if the lucene index is too old to be opened.
     * @throws IndexFormatTooNewException if the lucene index is too new to be opened.
     */
    public MetadataSnapshot getMetadataOrEmpty() throws IOException {
        try {
            return getMetadata(null);
        } catch (IndexNotFoundException ex) {
            // that's fine - happens all the time no need to log
        } catch (FileNotFoundException | NoSuchFileException ex) {
            // pass the exception along so the log shows which file could not be opened
            logger.info("Failed to open / find files while reading metadata snapshot", ex);
        }
        // both tolerated failures end up here: report "no metadata" instead of failing
        return MetadataSnapshot.EMPTY;
    }

    /**
     * Returns a new MetadataSnapshot for the latest commit in this store.
     *
     * @throws CorruptIndexException      if the lucene index is corrupted. This can be caused by a checksum mismatch or an
     *                                    unexpected exception when opening the index reading the segments file.
     * @throws IndexFormatTooOldException if the lucene index is too old to be opened.
     * @throws IndexFormatTooNewException if the lucene index is too new to be opened.
     * @throws FileNotFoundException      if one or more files referenced by a commit are not present.
     * @throws NoSuchFileException        if one or more files referenced by a commit are not present.
     * @throws IndexNotFoundException     if no index / valid commit-point can be found in this store
     */
    public MetadataSnapshot getMetadata() throws IOException {
        // a null commit makes the overload fall back to the latest commit point
        return this.getMetadata(null);
    }

    /**
     * Returns a new MetadataSnapshot for the given commit. If the given commit is null
     * the latest commit point is used.
     *
     * @throws CorruptIndexException      if the lucene index is corrupted. This can be caused by a checksum mismatch or an
     *                                    unexpected exception when opening the index reading the segments file.
     * @throws IndexFormatTooOldException if the lucene index is too old to be opened.
     * @throws IndexFormatTooNewException if the lucene index is too new to be opened.
     * @throws FileNotFoundException      if one or more files referenced by a commit are not present.
     * @throws NoSuchFileException        if one or more files referenced by a commit are not present.
     * @throws IndexNotFoundException     if the commit point can't be found in this store
     */
    public MetadataSnapshot getMetadata(IndexCommit commit) throws IOException {
        ensureOpen();
        failIfCorrupted();
        // hold the read lock while the snapshot is built; rename / cleanup operations take the write lock
        final ReentrantReadWriteLock.ReadLock snapshotLock = metadataLock.readLock();
        snapshotLock.lock();
        try {
            return new MetadataSnapshot(commit, directory, logger);
        } catch (CorruptIndexException | IndexFormatTooOldException | IndexFormatTooNewException corruption) {
            // all three are treated as a corrupted store before being rethrown
            markStoreCorrupted(corruption);
            throw corruption;
        } finally {
            snapshotLock.unlock();
        }
    }


    /**
     * Renames all the given files from the key of the map to the
     * value of the map. All successfully renamed files are removed from the map in-place.
     */
    public void renameTempFilesSafe(Map tempFileMap) throws IOException {
        // this works just like a lucene commit - we rename all temp files and once we successfully
        // renamed all the segments we rename the commit to ensure we don't leave half baked commits behind.
        final Map.Entry[] entries = tempFileMap.entrySet().toArray(new Map.Entry[tempFileMap.size()]);
        ArrayUtil.timSort(entries, new Comparator>() {
            @Override
            public int compare(Map.Entry o1, Map.Entry o2) {
                String left = o1.getValue();
                String right = o2.getValue();
                if (left.startsWith(IndexFileNames.SEGMENTS) || right.startsWith(IndexFileNames.SEGMENTS)) {
                    if (left.startsWith(IndexFileNames.SEGMENTS) == false) {
                        return -1;
                    } else if (right.startsWith(IndexFileNames.SEGMENTS) == false) {
                        return 1;
                    }
                }
                return left.compareTo(right);
            }
        });
        metadataLock.writeLock().lock();
        // we make sure that nobody fetches the metadata while we do this rename operation here to ensure we don't
        // get exceptions if files are still open.
        try (Lock writeLock = directory().obtainLock(IndexWriter.WRITE_LOCK_NAME)) {
            for (Map.Entry entry : entries) {
                String tempFile = entry.getKey();
                String origFile = entry.getValue();
                // first, go and delete the existing ones
                try {
                    directory.deleteFile(origFile);
                } catch (FileNotFoundException | NoSuchFileException e) {
                } catch (Throwable ex) {
                    logger.debug("failed to delete file [{}]", ex, origFile);
                }
                // now, rename the files... and fail it it won't work
                this.renameFile(tempFile, origFile);
                final String remove = tempFileMap.remove(tempFile);
                assert remove != null;
            }
        } finally {
            metadataLock.writeLock().unlock();
        }

    }

    /**
     * Returns the store stats as provided by the stats cache.
     *
     * @throws AlreadyClosedException if the store has no references left
     * @throws IOException            declared for callers; see the stats cache for the concrete causes
     */
    public StoreStats stats() throws IOException {
        ensureOpen();
        // statsCache is declared with a raw type, so getOrRefresh() is typed as Object here; narrow it explicitly
        return (StoreStats) statsCache.getOrRefresh();
    }

    /**
     * Renames a file inside the store's directory.
     *
     * @param from current file name
     * @param to   new file name
     * @throws AlreadyClosedException if the store has no references left
     * @throws IOException            if the directory fails to rename the file
     */
    public void renameFile(String from, String to) throws IOException {
        this.ensureOpen();
        this.directory.renameFile(from, to);
    }

    /**
     * Increments the refCount of this Store instance.  RefCounts are used to determine when a
     * Store can be closed safely, i.e. as soon as there are no more references. Be sure to always call a
     * corresponding {@link #decRef}, in a finally clause; otherwise the store may never be closed.  Note that
     * {@link #close} simply calls decRef(), which means that the Store will not really be closed until {@link
     * #decRef} has been called for all outstanding references.
     * 
     * Note: Close can safely be called multiple times.
     *
     * @throws AlreadyClosedException iff the reference counter can not be incremented.
     * @see #decRef
     * @see #tryIncRef()
     */
    @Override
    public final void incRef() {
        // reference counting is fully delegated to the internal counter
        this.refCounter.incRef();
    }

    /**
     * Tries to increment the refCount of this Store instance. This method will return true iff the refCount was
     * incremented successfully otherwise false. RefCounts are used to determine when a
     * Store can be closed safely, i.e. as soon as there are no more references. Be sure to always call a
     * corresponding {@link #decRef}, in a finally clause; otherwise the store may never be closed.  Note that
     * {@link #close} simply calls decRef(), which means that the Store will not really be closed until {@link
     * #decRef} has been called for all outstanding references.
     * 

     * Note: Close can safely be called multiple times.
     *
     * @see #decRef()
     * @see #incRef()
     */
    @Override
    public final boolean tryIncRef() {
        // reference counting is fully delegated to the internal counter
        final boolean incremented = this.refCounter.tryIncRef();
        return incremented;
    }

    /**
     * Decreases the refCount of this Store instance. If the refCount drops to 0, then this
     * store is closed.
     *
     * @see #incRef
     */
    @Override
    public final void decRef() {
        // reference counting is fully delegated to the internal counter
        this.refCounter.decRef();
    }

    /**
     * Releases one reference on the first call; every further call is a no-op.
     */
    @Override
    public void close() {
        if (isClosed.compareAndSet(false, true) == false) {
            // only do this once!
            return;
        }
        decRef();
        logger.debug("store reference count on close: " + refCounter.refCount());
    }

    /**
     * Releases the resources held by this store. Called from the reference counter's
     * {@code closeInternal()} hook.
     */
    private void closeInternal() {
        try {
            try {
                directory.innerClose(); // this closes the distributorDirectory as well
            } finally {
                // the callback runs even if closing the directory failed
                onClose.handle(shardLock);
            }
        } catch (IOException e) {
            // best effort: a failure here is only logged
            logger.debug("failed to close directory", e);
        } finally {
            // the shard lock is always closed last, whatever happened above
            IOUtils.closeWhileHandlingException(shardLock);
        }
    }


    /**
     * Reads a MetadataSnapshot from the given index locations or returns an empty snapshot if it can't be read.
     *
     * @throws IOException if the index we try to read is corrupted
     */
    public static MetadataSnapshot readMetadataSnapshot(Path indexLocation, ESLogger logger) throws IOException {
        try (Directory dir = new SimpleFSDirectory(indexLocation)) {
            failIfCorrupted(dir, new ShardId("", 1));
            // a null commit selects the latest commit of the directory
            return new MetadataSnapshot(null, dir, logger);
        } catch (IndexNotFoundException ex) {
            // that's fine - happens all the time no need to log
        } catch (FileNotFoundException | NoSuchFileException ex) {
            // pass the exception along so the log shows which file could not be opened
            logger.info("Failed to open / find files while reading metadata snapshot", ex);
        }
        // both tolerated failures end up here: report "no metadata" instead of failing
        return MetadataSnapshot.EMPTY;
    }

    /**
     * Returns true iff the given location contains an index and the index
     * can be successfully opened. This includes reading the segment infos and possible
     * corruption markers.
     */
    public static boolean canOpenIndex(ESLogger logger, Path indexLocation) throws IOException {
        boolean openable;
        try {
            tryOpenIndex(indexLocation);
            openable = true;
        } catch (Exception ex) {
            // any failure while opening means "can't open"; the cause is only traced
            logger.trace("Can't open index for path [{}]", ex, indexLocation);
            openable = false;
        }
        return openable;
    }

    /**
     * Tries to open an index for the given location. This includes reading the
     * segment infos and possible corruption markers. If the index can not
     * be opened, an exception is thrown
     */
    public static void tryOpenIndex(Path indexLocation) throws IOException {
        try (Directory indexDirectory = new SimpleFSDirectory(indexLocation)) {
            // corruption markers are checked first, then the segment infos are read
            failIfCorrupted(indexDirectory, new ShardId("", 1));
            Lucene.readSegmentInfos(indexDirectory);
        }
    }

    /**
     * The returned IndexOutput might validate the files checksum if the file has been written with a newer lucene version
     * and the metadata holds the necessary information to detect that it has been written by Lucene 4.8 or newer. If it has only
     * a legacy checksum, returned IndexOutput will not verify the checksum.
     * 

     * Note: Checksums are calculated nevertheless since lucene does it by default since version 4.8.0. This method only adds the
     * verification against the checksum in the given metadata and does not add any significant overhead.
     */
    public IndexOutput createVerifyingOutput(String fileName, final StoreFileMetaData metadata, final IOContext context) throws IOException {
        IndexOutput output = directory().createOutput(fileName, context);
        boolean success = false;
        try {
            if (metadata.hasLegacyChecksum()) {
                logger.debug("create legacy adler32 output for {}", fileName);
                output = new LegacyVerification.Adler32VerifyingIndexOutput(output, metadata.checksum(), metadata.length());
            } else if (metadata.checksum() == null) {
                // TODO: when the file is a segments_N, we can still CRC-32 + length for more safety
                // its had that checksum forever.
                logger.debug("create legacy length-only output for {}", fileName);
                output = new LegacyVerification.LengthVerifyingIndexOutput(output, metadata.length());
            } else {
                assert metadata.writtenBy() != null;
                // same patch-qualified constant that openVerifyingInput() and checkIntegrity() compare against
                assert metadata.writtenBy().onOrAfter(Version.LUCENE_4_8_0);
                output = new LuceneVerifyingIndexOutput(metadata, output);
            }
            success = true;
        } finally {
            if (success == false) {
                // wrapping failed - don't leave the freshly created output open
                IOUtils.closeWhileHandlingException(output);
            }
        }
        return output;
    }

    /**
     * Runs the verification of the given output if it is a {@link VerifyingIndexOutput};
     * any other output is left alone.
     *
     * @throws IOException if the verification fails
     */
    public static void verify(IndexOutput output) throws IOException {
        if ((output instanceof VerifyingIndexOutput) == false) {
            return;
        }
        final VerifyingIndexOutput verifying = (VerifyingIndexOutput) output;
        verifying.verify();
    }

    /**
     * Opens the given file for reading. Files whose metadata carries a legacy checksum or no
     * checksum at all are opened as plain inputs; all other files are wrapped in a
     * {@link VerifyingIndexInput}.
     *
     * @throws IOException if the file can not be opened
     */
    public IndexInput openVerifyingInput(String filename, IOContext context, StoreFileMetaData metadata) throws IOException {
        final boolean legacy = metadata.hasLegacyChecksum() || metadata.checksum() == null;
        if (legacy) {
            logger.debug("open legacy input for {}", filename);
            return directory().openInput(filename, context);
        }
        assert metadata.writtenBy() != null;
        assert metadata.writtenBy().onOrAfter(Version.LUCENE_4_8_0);
        final IndexInput plainInput = directory().openInput(filename, context);
        return new VerifyingIndexInput(plainInput);
    }

    /**
     * Runs the verification of the given input if it is a {@link VerifyingIndexInput};
     * any other input is left alone.
     *
     * @throws IOException if the verification fails
     */
    public static void verify(IndexInput input) throws IOException {
        if ((input instanceof VerifyingIndexInput) == false) {
            return;
        }
        final VerifyingIndexInput verifying = (VerifyingIndexInput) input;
        verifying.verify();
    }

    /**
     * Checks the integrity of the given file against this store's directory.
     *
     * @return true if the check passed, false if it failed with an IOException
     */
    public boolean checkIntegrityNoException(StoreFileMetaData md) {
        final Directory storeDirectory = directory();
        return checkIntegrityNoException(md, storeDirectory);
    }

    /**
     * Same as {@link #checkIntegrity(StoreFileMetaData, Directory)} but reports an
     * IOException as {@code false} instead of throwing it.
     */
    public static boolean checkIntegrityNoException(StoreFileMetaData md, Directory directory) {
        boolean intact;
        try {
            checkIntegrity(md, directory);
            intact = true;
        } catch (IOException e) {
            intact = false;
        }
        return intact;
    }

    /**
     * Verifies the file described by the given metadata: the length is always compared, then
     * either the Lucene checksum (files written by Lucene 4.8.0 or later) or the legacy
     * Adler32 checksum (if the metadata has one) is recomputed and compared.
     *
     * @throws CorruptIndexException if the length or a checksum does not match
     * @throws IOException           if the file can not be read
     */
    public static void checkIntegrity(final StoreFileMetaData md, final Directory directory) throws IOException {
        try (IndexInput input = directory.openInput(md.name(), IOContext.READONCE)) {
            // first check the length no matter how old this file is
            if (input.length() != md.length()) {
                throw new CorruptIndexException("expected length=" + md.length() + " != actual length: " + input.length() + " : file truncated?", input);
            }
            final boolean hasLuceneChecksum = md.writtenBy() != null && md.writtenBy().onOrAfter(Version.LUCENE_4_8_0);
            if (hasLuceneChecksum) {
                // throw exception if the file is corrupt
                final String actual = Store.digestToString(CodecUtil.checksumEntireFile(input));
                // throw exception if metadata is inconsistent
                if (actual.equals(md.checksum()) == false) {
                    throw new CorruptIndexException("inconsistent metadata: lucene checksum=" + actual +
                            ", metadata checksum=" + md.checksum(), input);
                }
            } else if (md.hasLegacyChecksum()) {
                // legacy checksum verification - no footer that we need to omit in the checksum!
                final Checksum adler = new Adler32();
                final byte[] chunk = new byte[(int) Math.min(4096L, md.length())];
                final long total = input.length();
                long consumed = 0;
                while (consumed < total) {
                    final int step = (int) Math.min(total - consumed, (long) chunk.length);
                    input.readBytes(chunk, 0, step, false);
                    adler.update(chunk, 0, step);
                    consumed += step;
                }
                final String actual = Store.digestToString(adler.getValue());
                if (actual.equals(md.checksum()) == false) {
                    throw new CorruptIndexException("checksum failed (hardware problem?) : expected=" + md.checksum() +
                            " actual=" + actual, input);
                }
            }
        }
    }

    /**
     * Returns true iff the store's directory contains at least one file whose name starts
     * with the {@link #CORRUPTED} prefix.
     *
     * @throws IOException if the directory can not be listed
     */
    public boolean isMarkedCorrupted() throws IOException {
        ensureOpen();
        // a corruption marker is a file carrying the CORRUPTED prefix; a single one is enough
        for (String name : directory().listAll()) {
            if (name.startsWith(CORRUPTED)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Deletes all corruption markers from this store.
     */
    public void removeCorruptionMarker() throws IOException {
        ensureOpen();
        final Directory storeDirectory = directory();
        IOException failure = null;
        for (String name : storeDirectory.listAll()) {
            if (name.startsWith(CORRUPTED) == false) {
                continue; // not a corruption marker
            }
            try {
                storeDirectory.deleteFile(name);
            } catch (IOException ex) {
                // keep going: the first failure is thrown at the end, later ones are attached to it
                if (failure != null) {
                    failure.addSuppressed(ex);
                } else {
                    failure = ex;
                }
            }
        }
        if (failure != null) {
            throw failure;
        }
    }

    /**
     * Fails if this store's directory contains a corruption marker.
     *
     * @throws AlreadyClosedException if the store has no references left
     * @throws IOException            if a corruption marker is found or the directory can not be read
     */
    public void failIfCorrupted() throws IOException {
        this.ensureOpen();
        failIfCorrupted(this.directory, this.shardId);
    }

    /**
     * Reads every corruption marker (files starting with {@link #CORRUPTED}) in the given
     * directory and, if at least one exists, rethrows the recorded corruptions.
     *
     * @param directory the directory to scan
     * @param shardId   used in the message built for markers in the older string formats
     * @throws IOException if a marker is present or a marker can not be read
     */
    private static void failIfCorrupted(Directory directory, ShardId shardId) throws IOException {
        final String[] files = directory.listAll();
        // typed list: keeps the elements checked and lets rethrowAndSuppress infer its exception type
        final List<CorruptIndexException> ex = new ArrayList<>();
        for (String file : files) {
            if (file.startsWith(CORRUPTED) == false) {
                continue;
            }
            try (ChecksumIndexInput input = directory.openChecksumInput(file, IOContext.READONCE)) {
                final int version = CodecUtil.checkHeader(input, CODEC, VERSION_START, VERSION);

                if (version == VERSION_WRITE_THROWABLE) {
                    // newest format: a length-prefixed serialized throwable
                    final int size = input.readVInt();
                    final byte[] buffer = new byte[size];
                    input.readBytes(buffer, 0, buffer.length);
                    StreamInput in = StreamInput.wrap(buffer);
                    Throwable t = in.readThrowable();
                    if (t instanceof CorruptIndexException) {
                        ex.add((CorruptIndexException) t);
                    } else {
                        ex.add(new CorruptIndexException(t.getMessage(), "preexisting_corruption", t));
                    }
                } else {
                    // older formats: a message string, followed by a stack trace string since VERSION_STACK_TRACE
                    assert version == VERSION_START || version == VERSION_STACK_TRACE;
                    String msg = input.readString();
                    StringBuilder builder = new StringBuilder(shardId.toString());
                    builder.append(" Preexisting corrupted index [");
                    builder.append(file).append("] caused by: ");
                    builder.append(msg);
                    if (version == VERSION_STACK_TRACE) {
                        builder.append(System.lineSeparator());
                        builder.append(input.readString());
                    }
                    ex.add(new CorruptIndexException(builder.toString(), "preexisting_corruption"));
                }
                CodecUtil.checkFooter(input);
            }
        }
        if (ex.isEmpty() == false) {
            ExceptionsHelper.rethrowAndSuppress(ex);
        }
    }

    /**
     * This method deletes every file in this store that is not contained in the given source meta data or is a
     * legacy checksum file. After the delete it pulls the latest metadata snapshot from the store and compares it
     * to the given snapshot. If the snapshots are inconsistent an illegal state exception is thrown
     *
     * @param reason         the reason for this cleanup operation logged for each deleted file
     * @param sourceMetaData the metadata used for cleanup. all files in this metadata should be kept around.
     * @throws IOException           if an IOException occurs
     * @throws IllegalStateException if the latest snapshot in this store differs from the given one after the cleanup.
     */
    public void cleanupAndVerify(String reason, MetadataSnapshot sourceMetaData) throws IOException {
        metadataLock.writeLock().lock();
        try (Lock writeLock = directory.obtainLock(IndexWriter.WRITE_LOCK_NAME)) {
            for (String candidate : directory.listAll()) {
                final boolean keep = Store.isAutogenerated(candidate) || sourceMetaData.contains(candidate);
                if (keep) {
                    // don't delete snapshot file, or the checksums file (note, this is extra protection since the Store won't delete checksum)
                    continue;
                }
                try {
                    directory.deleteFile(reason, candidate);
                    // FNF should not happen since we hold a write lock?
                } catch (IOException ex) {
                    final boolean commitFile = candidate.startsWith(IndexFileNames.SEGMENTS)
                            || candidate.equals(IndexFileNames.OLD_SEGMENTS_GEN);
                    if (commitFile) {
                        // TODO do we need to also fail this if we can't delete the pending commit file?
                        // if one of those files can't be deleted we better fail the cleanup otherwise we might leave an old commit point around?
                        throw new IllegalStateException("Can't delete " + candidate + " - cleanup failed", ex);
                    }
                    logger.debug("failed to delete file [{}]", ex, candidate);
                    // ignore, we don't really care, will get deleted later on
                }
            }
            // re-read what is left in the store and compare it with what the source expects
            final Store.MetadataSnapshot targetMetaData = getMetadata();
            verifyAfterCleanup(sourceMetaData, targetMetaData);
        } finally {
            metadataLock.writeLock().unlock();
        }
    }

    // pkg private for testing
    /**
     * Compares the target snapshot with the source snapshot after a cleanup.
     *
     * @throws IllegalStateException if files are missing on the target, or if a file that
     *                               differs is neither the same nor a checksum-less file with
     *                               equal hash and length
     */
    final void verifyAfterCleanup(MetadataSnapshot sourceMetaData, MetadataSnapshot targetMetaData) {
        final RecoveryDiff diff = targetMetaData.recoveryDiff(sourceMetaData);
        if (diff.identical.size() == diff.size()) {
            return; // the diff consists of identical files only
        }
        if (diff.missing.isEmpty() == false) {
            logger.debug("Files are missing on the recovery target: {} ", diff);
            throw new IllegalStateException("Files are missing on the recovery target: [different="
                    + diff.different + ", missing=" + diff.missing + ']', null);
        }
        for (StoreFileMetaData changed : diff.different) {
            final StoreFileMetaData local = targetMetaData.get(changed.name());
            final StoreFileMetaData remote = sourceMetaData.get(changed.name());
            // if we have different files then they must have no checksums otherwise something went wrong during recovery.
            // we have that problem when we have an empty index is only a segments_1 file then we can't tell if it's a Lucene 4.8 file
            // and therefore no checksum. That isn't much of a problem since we simply copy it over anyway but those files come out as
            // different in the diff. That's why we have to double check here again if the rest of it matches.

            // all is fine this file is just part of a commit or a segment that is different
            final boolean same = local.isSame(remote);

            // this check ensures that the two files are consistent ie. if we don't have checksums only the rest needs to match we are just
            // verifying that we are consistent on both ends source and target
            final boolean hashAndLengthEqual = local.checksum() == null
                    && remote.checksum() == null
                    && local.hash().equals(remote.hash())
                    && local.length() == remote.length();
            if (hashAndLengthEqual || same) {
                continue;
            }
            logger.debug("Files are different on the recovery target: {} ", diff);
            throw new IllegalStateException("local version: " + local + " is different from remote version after recovery: " + remote, null);
        }
    }

    /**
     * Returns the current reference count, as reported by this store's {@code refCounter}.
     *
     * @return the current reference count
     */
    public int refCount() {
        return refCounter.refCount();
    }

    /**
     * Directory wrapper handed out by the store. It traces every file deletion to a dedicated logger and
     * refuses to be closed through the regular {@link #close()} call.
     */
    private static final class StoreDirectory extends FilterDirectory {

        /** Receives one trace entry per deleted file. */
        private final ESLogger deletesLogger;

        StoreDirectory(Directory delegateDirectory, ESLogger deletesLogger) throws IOException {
            super(delegateDirectory);
            this.deletesLogger = deletesLogger;
        }

        /**
         * Does not close anything; with assertions enabled it fails, because only the store itself
         * is supposed to close this directory (see {@link #innerClose()}).
         */
        @Override
        public void close() throws IOException {
            assert false : "Nobody should close this directory except of the Store itself";
        }

        /** Performs the real close by delegating to the superclass {@code close()}. */
        private void innerClose() throws IOException {
            super.close();
        }

        /** Deletes {@code name}, tagging the trace entry with a default message. */
        @Override
        public void deleteFile(String name) throws IOException {
            deleteFile("StoreDirectory.deleteFile", name);
        }

        /**
         * Traces the deletion and then deletes the file through the superclass.
         *
         * @param msg  text identifying who requested the deletion; only used for the trace entry
         * @param name the file to delete
         */
        public void deleteFile(String msg, String name) throws IOException {
            deletesLogger.trace("{}: delete file {}", msg, name);
            super.deleteFile(name);
        }

        @Override
        public String toString() {
            final String wrapped = in.toString();
            return "store(" + wrapped + ")";
        }
    }

    /**
     * Represents a snapshot of the current directory built from the latest Lucene commit.
     * Only files that are part of the last commit are considered in this data structure.
     * For backwards compatibility the snapshot might include legacy checksums that
     * are derived from a dedicated checksum file written by older Elasticsearch versions (pre 1.3).
     * <p>
     * Note: This class ignores the segments.gen file since it is optional and might
     * change concurrently.
     *
     * @see StoreFileMetaData
     */
    public final static class MetadataSnapshot implements Iterable, Writeable {
        private static final ESLogger logger = Loggers.getLogger(MetadataSnapshot.class);
        private static final Version FIRST_LUCENE_CHECKSUM_VERSION = Version.LUCENE_4_8;

        private final ImmutableMap metadata;

        public static final MetadataSnapshot EMPTY = new MetadataSnapshot();

        private final ImmutableMap commitUserData;

        private final long numDocs;

        /**
         * Creates a snapshot from already computed values. Both maps are stored as immutable copies.
         *
         * @param metadata       file metadata keyed by file name
         * @param commitUserData user data of the commit
         * @param numDocs        number of documents in the snapshot
         */
        public MetadataSnapshot(Map metadata, Map commitUserData, long numDocs) {
            this.metadata = ImmutableMap.copyOf(metadata);
            this.commitUserData = ImmutableMap.copyOf(commitUserData);
            this.numDocs = numDocs;
        }

        /**
         * Creates an empty snapshot: no file metadata, no commit user data and zero documents.
         */
        MetadataSnapshot() {
            metadata = ImmutableMap.of();
            commitUserData = ImmutableMap.of();
            numDocs = 0;
        }

        /**
         * Creates a snapshot by loading file metadata, commit user data and the document count
         * through {@link #loadMetadata(IndexCommit, Directory, ESLogger)}.
         *
         * @param commit    the commit to read, passed through to {@code loadMetadata}
         * @param directory the directory holding the files
         * @param logger    logger used while loading
         * @throws IOException if loading the metadata fails
         */
        MetadataSnapshot(IndexCommit commit, Directory directory, ESLogger logger) throws IOException {
            LoadedMetadata loadedMetadata = loadMetadata(commit, directory, logger);
            metadata = loadedMetadata.fileMetadata;
            commitUserData = loadedMetadata.userData;
            numDocs = loadedMetadata.numDocs;
            // a non-empty snapshot must contain exactly one file whose name starts with the segments prefix
            assert metadata.isEmpty() || numSegmentFiles() == 1 : "numSegmentFiles: " + numSegmentFiles();
        }

        /**
         * Reads a snapshot from a stream. The layout matches what {@code writeTo} produces: the number of
         * files followed by each file's metadata, the number of user data entries followed by key/value
         * string pairs, and finally the document count as a long.
         *
         * @param in the stream to read from
         * @throws IOException if reading from the stream fails
         */
        public MetadataSnapshot(StreamInput in) throws IOException {
            final ImmutableMap.Builder files = ImmutableMap.builder();
            final int fileCount = in.readVInt();
            for (int read = 0; read < fileCount; read++) {
                final StoreFileMetaData fileMetaData = StoreFileMetaData.readStoreFileMetaData(in);
                files.put(fileMetaData.name(), fileMetaData);
            }

            final ImmutableMap.Builder userData = ImmutableMap.builder();
            final int userDataCount = in.readVInt();
            for (int read = 0; read < userDataCount; read++) {
                // the key is written before the value
                final String key = in.readString();
                final String value = in.readString();
                userData.put(key, value);
            }

            this.commitUserData = userData.build();
            this.metadata = files.build();
            this.numDocs = in.readLong();
            assert metadata.isEmpty() || numSegmentFiles() == 1 : "numSegmentFiles: " + numSegmentFiles();
        }

        /**
         * Returns the number of documents in this store snapshot.
         *
         * @return the document count this snapshot was created with
         */
        public long getNumDocs() {
            return numDocs;
        }

        /**
         * Plain holder for the three values produced by {@code loadMetadata}.
         */
        static class LoadedMetadata {
            // file metadata keyed by file name
            final ImmutableMap fileMetadata;
            // user data of the commit
            final ImmutableMap userData;
            // number of documents in the commit
            final long numDocs;

            LoadedMetadata(ImmutableMap fileMetadata, ImmutableMap userData, long numDocs) {
                this.fileMetadata = fileMetadata;
                this.userData = userData;
                this.numDocs = numDocs;
            }
        }

        /**
         * Reads the segment infos for the given commit and builds the per-file metadata, the commit user data
         * and the document count.
         * <p>
         * Files of segments written by a version on or after {@code FIRST_LUCENE_CHECKSUM_VERSION} get their
         * checksum from the file itself (see {@code checksumFromLuceneFile}); files of older segments use the
         * legacy checksum read from the directory, which may be {@code null} if the map has no entry.
         * The segments file is handled the same way based on the highest segment version seen; in the legacy
         * case its content is additionally read as the hash.
         *
         * @param commit    the commit to read, passed to {@code Store.readSegmentsInfo}
         * @param directory the directory holding the files
         * @param logger    logger used for diagnostics
         * @return the loaded metadata
         * @throws IOException if reading fails; corruption and format exceptions are rethrown unchanged
         */
        static LoadedMetadata loadMetadata(IndexCommit commit, Directory directory, ESLogger logger) throws IOException {
            long numDocs;
            ImmutableMap.Builder builder = ImmutableMap.builder();
            Map checksumMap = readLegacyChecksums(directory).v1();
            ImmutableMap.Builder commitUserDataBuilder = ImmutableMap.builder();
            try {
                final SegmentInfos segmentCommitInfos = Store.readSegmentsInfo(commit, directory);
                numDocs = Lucene.getNumDocs(segmentCommitInfos);
                commitUserDataBuilder.putAll(segmentCommitInfos.getUserData());
                Version maxVersion = Version.LUCENE_4_0; // we don't know which version was used to write so we take the max version.
                for (SegmentCommitInfo info : segmentCommitInfos) {
                    final Version version = info.info.getVersion();
                    if (version == null) {
                        // version is written since 3.1+: we should have already hit IndexFormatTooOld.
                        throw new IllegalArgumentException("expected valid version value: " + info.info.toString());
                    }
                    if (version.onOrAfter(maxVersion)) {
                        maxVersion = version;
                    }
                    for (String file : info.files()) {
                        // only used for segments older than FIRST_LUCENE_CHECKSUM_VERSION
                        String legacyChecksum = checksumMap.get(file);
                        if (version.onOrAfter(FIRST_LUCENE_CHECKSUM_VERSION)) {
                            // .si files are additionally read in full so their content can serve as the hash
                            checksumFromLuceneFile(directory, file, builder, logger, version, SEGMENT_INFO_EXTENSION.equals(IndexFileNames.getExtension(file)));
                        } else {
                            builder.put(file, new StoreFileMetaData(file, directory.fileLength(file), legacyChecksum, version));
                        }
                    }
                }
                final String segmentsFile = segmentCommitInfos.getSegmentsFileName();
                String legacyChecksum = checksumMap.get(segmentsFile);
                if (maxVersion.onOrAfter(FIRST_LUCENE_CHECKSUM_VERSION)) {
                    checksumFromLuceneFile(directory, segmentsFile, builder, logger, maxVersion, true);
                } else {
                    // legacy segments file: no footer checksum, so read its content as the hash
                    final BytesRefBuilder fileHash = new BytesRefBuilder();
                    final long length;
                    try (final IndexInput in = directory.openInput(segmentsFile, IOContext.READONCE)) {
                        length = in.length();
                        hashFile(fileHash, new InputStreamIndexInput(in, length), length);
                    }
                    builder.put(segmentsFile, new StoreFileMetaData(segmentsFile, length, legacyChecksum, maxVersion, fileHash.get()));
                }
            } catch (CorruptIndexException | IndexNotFoundException | IndexFormatTooOldException | IndexFormatTooNewException ex) {
                // we either know the index is corrupted or it's just not there
                throw ex;
            } catch (Throwable ex) {
                try {
                    // Lucene checks the checksum after it tries to lookup the codec etc.
                    // in that case we might get only IAE or similar exceptions while we are really corrupt...
                    // TODO we should check the checksum in lucene if we hit an exception
                    logger.warn("failed to build store metadata. checking segment info integrity (with commit [{}])",
                            ex, commit == null ? "no" : "yes");
                    Lucene.checkSegmentInfoIntegrity(directory);
                } catch (CorruptIndexException | IndexFormatTooOldException | IndexFormatTooNewException cex) {
                    // the integrity check found the real problem: report it and keep the original failure as suppressed
                    cex.addSuppressed(ex);
                    throw cex;
                } catch (Throwable e) {
                    // ignore... the original exception is rethrown below
                }

                throw ex;
            }
            return new LoadedMetadata(builder.build(), commitUserDataBuilder.build(), numDocs);
        }

        /**
         * Reads legacy checksum files found in the directory.
         * 

         * Files are expected to start with _checksums- prefix
         * followed by long file version. Only file with the highest version is read, all other files are ignored.
         *
         * @param directory the directory to read checksums from
         * @return a map of file checksums and the checksum file version
         */
        static Tuple, Long> readLegacyChecksums(Directory directory) throws IOException {
            synchronized (directory) {
                long lastFound = -1;
                for (String name : directory.listAll()) {
                    if (!isChecksum(name)) {
                        continue;
                    }
                    long current = Long.parseLong(name.substring(CHECKSUMS_PREFIX.length()));
                    if (current > lastFound) {
                        lastFound = current;
                    }
                }
                if (lastFound > -1) {
                    try (IndexInput indexInput = directory.openInput(CHECKSUMS_PREFIX + lastFound, IOContext.READONCE)) {
                        indexInput.readInt(); // version
                        return new Tuple(indexInput.readStringStringMap(), lastFound);
                    }
                }
                return new Tuple(new HashMap<>(), -1l);
            }
        }

        /**
         * Deletes all versioned checksum files with a version lower than {@code newVersion}.
         * Only files starting with the {@code _checksums-} prefix are considered; legacy {@code *.cks}
         * files have no version suffix and are left untouched. A failure to delete an individual file is
         * logged at debug level and otherwise ignored.
         *
         * @param directory  the directory to clean
         * @param newVersion the latest checksum file version
         * @throws IOException if listing the directory fails
         */
        static void cleanLegacyChecksums(Directory directory, long newVersion) throws IOException {
            synchronized (directory) {
                for (String name : directory.listAll()) {
                    // isChecksum() also matches "*.cks" names, for which substring()/parseLong() would throw
                    if (name.startsWith(CHECKSUMS_PREFIX) == false) {
                        continue;
                    }
                    long current = Long.parseLong(name.substring(CHECKSUMS_PREFIX.length()));
                    if (current < newVersion) {
                        try {
                            directory.deleteFile(name);
                        } catch (IOException ex) {
                            // best effort: an old checksum file that stays behind is harmless
                            logger.debug("can't delete old checksum file [{}]", ex, name);
                        }
                    }
                }
            }
        }

        /**
         * Reads the checksum of a file that carries a Lucene footer and registers its metadata in
         * {@code builder}.
         *
         * @param directory      the directory holding the file
         * @param file           the name of the file
         * @param builder        receives the resulting metadata keyed by file name
         * @param logger         logger used to report a failure before it is rethrown
         * @param version        the version recorded in the resulting metadata
         * @param readFileAsHash if {@code true} the file content is read (see {@code hashFile}) and stored as
         *                       the hash, and the checksum is verified over the data that was read; otherwise
         *                       only the stored checksum is retrieved
         * @throws IOException if the file is shorter than a footer or cannot be read or verified
         */
        private static void checksumFromLuceneFile(Directory directory, String file, ImmutableMap.Builder builder, ESLogger logger, Version version, boolean readFileAsHash) throws IOException {
            final String checksum;
            final BytesRefBuilder fileHash = new BytesRefBuilder();
            try (final IndexInput in = directory.openInput(file, IOContext.READONCE)) {
                final long length;
                try {
                    length = in.length();
                    if (length < CodecUtil.footerLength()) {
                        // truncated files trigger IAE if we seek negative... these files are really corrupted though
                        throw new CorruptIndexException("Can't retrieve checksum from file: " + file + " file length must be >= " + CodecUtil.footerLength() + " but was: " + length, in);
                    }
                    if (readFileAsHash) {
                        final VerifyingIndexInput verifyingIndexInput = new VerifyingIndexInput(in); // additional safety we checksum the entire file we read the hash for...
                        hashFile(fileHash, new InputStreamIndexInput(verifyingIndexInput, length), length);
                        checksum = digestToString(verifyingIndexInput.verify());
                    } else {
                        checksum = digestToString(CodecUtil.retrieveChecksum(in));
                    }

                } catch (Throwable ex) {
                    // this is the failure path, so the message must say the checksum could NOT be retrieved
                    logger.debug("Can't retrieve checksum from file [{}]", ex, file);
                    throw ex;
                }
                builder.put(file, new StoreFileMetaData(file, length, checksum, version, fileHash.get()));
            }
        }

        /**
         * Reads the content of the given file as its hash value. Only the first 1MB is read, so this
         * should only be used for files smaller than 1MB.
         *
         * @param directory the directory holding the file
         * @param file      the name of the file
         * @return the bytes that were read
         * @throws IOException if the file cannot be opened or read
         */
        public static BytesRef hashFile(Directory directory, String file) throws IOException {
            final BytesRefBuilder content = new BytesRefBuilder();
            try (final IndexInput input = directory.openInput(file, IOContext.READONCE)) {
                final InputStreamIndexInput stream = new InputStreamIndexInput(input, input.length());
                hashFile(content, stream, input.length());
            }
            return content.get();
        }


        /**
         * Fills {@code fileHash} with the leading bytes of the stream. At most 1MB is read, so this
         * should only be used for files smaller than 1MB.
         *
         * @param fileHash receives the bytes; its length is set to the number of bytes requested
         * @param in       the stream to read from
         * @param size     the number of bytes available in the stream
         * @throws IOException if reading from the stream fails
         */
        public static void hashFile(BytesRefBuilder fileHash, InputStream in, long size) throws IOException {
            final long limit = 1024 * 1024; // for safety we limit this to 1MB
            final int len = (int) Math.min(limit, size);
            fileHash.grow(len);
            fileHash.setLength(len);
            final int readBytes = Streams.readFully(in, fileHash.bytes(), 0, len);
            assert readBytes == len : readBytes + " != " + len;
            assert fileHash.length() == len : fileHash.length() + " != " + len;
        }

        /**
         * Returns an iterator over the metadata values of all files in this snapshot.
         */
        @Override
        public Iterator iterator() {
            return metadata.values().iterator();
        }

        /**
         * Looks up the metadata stored under the given file name.
         *
         * @param name the file name
         * @return the value the underlying map holds for {@code name}
         */
        public StoreFileMetaData get(String name) {
            return metadata.get(name);
        }

        /**
         * Returns the file metadata of this snapshot keyed by file name. The returned object is the
         * snapshot's own immutable map, not a copy.
         */
        public Map asMap() {
            return metadata;
        }

        private static final String DEL_FILE_EXTENSION = "del"; // legacy delete file
        private static final String LIV_FILE_EXTENSION = "liv"; // lucene 5 delete file
        private static final String FIELD_INFOS_FILE_EXTENSION = "fnm";
        private static final String SEGMENT_INFO_EXTENSION = "si";

        /**
         * Returns a diff between the two snapshots that can be used for recovery. The given snapshot is treated as the
         * recovery target and this snapshot as the source. The returned diff will hold a list of files that are:
         * 

         * identical: they exist in both snapshots and they can be considered the same ie. they don't need to be recovered
         * different: they exist in both snapshots but their they are not identical
         * missing: files that exist in the source but not in the target
         * 
         * This method groups file into per-segment files and per-commit files. A file is treated as
         * identical if and on if all files in it's group are identical. On a per-segment level files for a segment are treated
         * as identical iff:
         * 
         * all files in this segment have the same checksum
         * all files in this segment have the same length
         * the segments .si files hashes are byte-identical Note: This is a using a perfect hash function, The metadata transfers the .si file content as it's hash
         * 
         * 
         * The .si file contains a lot of diagnostics including a timestamp etc. in the future there might be
         * unique segment identifiers in there hardening this method further.
         * 

         * The per-commit files handles very similar. A commit is composed of the segments_N files as well as generational files like
         * deletes (_x_y.del) or field-info (_x_y.fnm) files. On a per-commit level files for a commit are treated
         * as identical iff:
         * 

         * all files belonging to this commit have the same checksum
         * all files belonging to this commit have the same length
         * the segments file segments_N files hashes are byte-identical Note: This is a using a perfect hash function, The metadata transfers the segments_N file content as it's hash
         * 
         * 
         * NOTE: this diff will not contain the segments.gen file. This file is omitted on recovery.
         */
        public RecoveryDiff recoveryDiff(MetadataSnapshot recoveryTargetSnapshot) {
            final List identical = new ArrayList<>();
            final List different = new ArrayList<>();
            final List missing = new ArrayList<>();
            final Map> perSegment = new HashMap<>();
            final List perCommitStoreFiles = new ArrayList<>();

            for (StoreFileMetaData meta : this) {
                if (IndexFileNames.OLD_SEGMENTS_GEN.equals(meta.name())) { // legacy
                    continue; // we don't need that file at all
                }
                final String segmentId = IndexFileNames.parseSegmentName(meta.name());
                final String extension = IndexFileNames.getExtension(meta.name());
                assert FIELD_INFOS_FILE_EXTENSION.equals(extension) == false || IndexFileNames.stripExtension(IndexFileNames.stripSegmentName(meta.name())).isEmpty() : "FieldInfos are generational but updateable DV are not supported in elasticsearch";
                if (IndexFileNames.SEGMENTS.equals(segmentId) || DEL_FILE_EXTENSION.equals(extension) || LIV_FILE_EXTENSION.equals(extension)) {
                    // only treat del files as per-commit files fnm files are generational but only for upgradable DV
                    perCommitStoreFiles.add(meta);
                } else {
                    List perSegStoreFiles = perSegment.get(segmentId);
                    if (perSegStoreFiles == null) {
                        perSegStoreFiles = new ArrayList<>();
                        perSegment.put(segmentId, perSegStoreFiles);
                    }
                    perSegStoreFiles.add(meta);
                }
            }
            final ArrayList identicalFiles = new ArrayList<>();
            for (List segmentFiles : Iterables.concat(perSegment.values(), Collections.singleton(perCommitStoreFiles))) {
                identicalFiles.clear();
                boolean consistent = true;
                for (StoreFileMetaData meta : segmentFiles) {
                    StoreFileMetaData storeFileMetaData = recoveryTargetSnapshot.get(meta.name());
                    if (storeFileMetaData == null) {
                        consistent = false;
                        missing.add(meta);
                    } else if (storeFileMetaData.isSame(meta) == false) {
                        consistent = false;
                        different.add(meta);
                    } else {
                        identicalFiles.add(meta);
                    }
                }
                if (consistent) {
                    identical.addAll(identicalFiles);
                } else {
                    // make sure all files are added - this can happen if only the deletes are different
                    different.addAll(identicalFiles);
                }
            }
            RecoveryDiff recoveryDiff = new RecoveryDiff(Collections.unmodifiableList(identical), Collections.unmodifiableList(different), Collections.unmodifiableList(missing));
            assert recoveryDiff.size() == this.metadata.size() - (metadata.containsKey(IndexFileNames.OLD_SEGMENTS_GEN) ? 1 : 0)
                    : "some files are missing recoveryDiff size: [" + recoveryDiff.size() + "] metadata size: [" + this.metadata.size() + "] contains  segments.gen: [" + metadata.containsKey(IndexFileNames.OLD_SEGMENTS_GEN) + "]";
            return recoveryDiff;
        }

        /**
         * Returns the number of files in this snapshot.
         *
         * @return the number of entries in the file metadata map
         */
        public int size() {
            return metadata.size();
        }

        /**
         * Writes this snapshot to the stream: the number of files followed by each file's metadata, the
         * number of commit user data entries followed by key/value string pairs, and finally the document
         * count as a long.
         *
         * @param out the stream to write to
         * @throws IOException if writing to the stream fails
         */
        @Override
        public void writeTo(StreamOutput out) throws IOException {
            out.writeVInt(this.metadata.size());
            for (StoreFileMetaData meta : this) {
                meta.writeTo(out);
            }
            out.writeVInt(commitUserData.size());
            for (Map.Entry entry : commitUserData.entrySet()) {
                out.writeString(entry.getKey());
                out.writeString(entry.getValue());
            }
            out.writeLong(numDocs);
        }

        /**
         * Returns the commit user data held by this snapshot. The returned object is the snapshot's own
         * immutable map, not a copy.
         */
        public Map getCommitUserData() {
            return commitUserData;
        }

        /**
         * Returns true iff this metadata contains the given file.
         *
         * @param existingFile the file name to look up
         * @return {@code true} if the file metadata map has an entry for {@code existingFile}
         */
        public boolean contains(String existingFile) {
            return metadata.containsKey(existingFile);
        }

        /**
         * Returns the segments file that this metadata snapshot represents, or null if the snapshot is empty.
         * The segments file is the first file in iteration order whose name starts with the segments prefix.
         */
        public StoreFileMetaData getSegmentsFile() {
            StoreFileMetaData segmentsFile = null;
            for (StoreFileMetaData candidate : this) {
                if (candidate.name().startsWith(IndexFileNames.SEGMENTS)) {
                    segmentsFile = candidate;
                    break;
                }
            }
            // only an empty snapshot may lack a segments file
            assert segmentsFile != null || metadata.isEmpty();
            return segmentsFile;
        }

        /**
         * Counts the files whose name starts with the segments prefix. Only used in assertions.
         */
        private final int numSegmentFiles() { // only for asserts
            int segmentFiles = 0;
            for (StoreFileMetaData candidate : this) {
                final boolean isSegmentsFile = candidate.name().startsWith(IndexFileNames.SEGMENTS);
                segmentFiles += isSegmentsFile ? 1 : 0;
            }
            return segmentFiles;
        }

        /**
         * Returns the sync id of the commit point that this MetadataSnapshot represents. The id is read
         * from the commit user data under the {@code Engine.SYNC_COMMIT_ID} key.
         *
         * @return sync id if exists, else null
         */
        public String getSyncId() {
            return commitUserData.get(Engine.SYNC_COMMIT_ID);
        }

        /**
         * Reads a snapshot from the stream and returns it as a new instance; this instance is not modified.
         *
         * @param in the stream to read from
         * @return the snapshot read from {@code in}
         * @throws IOException if reading from the stream fails
         */
        @Override
        public MetadataSnapshot readFrom(StreamInput in) throws IOException {
            return new MetadataSnapshot(in);
        }
    }

    /**
     * The outcome of comparing a recovery source with a recovery target: every compared file ends up in
     * exactly one of the three lists.
     *
     * @see MetadataSnapshot#recoveryDiff(org.elasticsearch.index.store.Store.MetadataSnapshot)
     */
    public static final class RecoveryDiff {
        /**
         * Files that exist in both snapshots and can be considered the same, ie. they don't need to be recovered
         */
        public final List identical;
        /**
         * Files that exist in both snapshots but are not identical
         */
        public final List different;
        /**
         * Files that exist in the source but not in the target
         */
        public final List missing;

        RecoveryDiff(List identical, List different, List missing) {
            this.identical = identical;
            this.different = different;
            this.missing = missing;
        }

        /**
         * Returns the total number of files in this diff, ie. the sum of the sizes of all three lists.
         */
        public int size() {
            int total = identical.size();
            total += different.size();
            total += missing.size();
            return total;
        }

        @Override
        public String toString() {
            final StringBuilder text = new StringBuilder("RecoveryDiff{");
            text.append("identical=").append(identical);
            text.append(", different=").append(different);
            text.append(", missing=").append(missing);
            text.append('}');
            return text.toString();
        }
    }

    public final static class LegacyChecksums {
        private final Map legacyChecksums = new HashMap<>();

        public void add(StoreFileMetaData metaData) throws IOException {

            if (metaData.hasLegacyChecksum()) {
                synchronized (this) {
                    // we don't add checksums if they were written by LUCENE_48... now we are using the build in mechanism.
                    legacyChecksums.put(metaData.name(), metaData.checksum());
                }
            }
        }

        public synchronized void write(Store store) throws IOException {
            synchronized (store.directory) {
                Tuple, Long> tuple = MetadataSnapshot.readLegacyChecksums(store.directory);
                tuple.v1().putAll(legacyChecksums);
                if (!tuple.v1().isEmpty()) {
                    writeChecksums(store.directory, tuple.v1(), tuple.v2());
                }
            }
        }

        synchronized void writeChecksums(Directory directory, Map checksums, long lastVersion) throws IOException {
            // Make sure if clock goes backwards we still move version forwards:
            long nextVersion = Math.max(lastVersion+1, System.currentTimeMillis());
            final String checksumName = CHECKSUMS_PREFIX + nextVersion;
            try (IndexOutput output = directory.createOutput(checksumName, IOContext.DEFAULT)) {
                output.writeInt(0); // version
                output.writeStringStringMap(checksums);
            }
            directory.sync(Collections.singleton(checksumName));
            MetadataSnapshot.cleanLegacyChecksums(directory, nextVersion);
        }

        public void clear() {
            this.legacyChecksums.clear();
        }

        public void remove(String name) {
            legacyChecksums.remove(name);
        }
    }

    public static final String CHECKSUMS_PREFIX = "_checksums-";

    /**
     * Returns true if the given file name denotes a checksum file, ie. it starts with
     * {@link #CHECKSUMS_PREFIX} or ends with {@code .cks}.
     */
    public static boolean isChecksum(String name) {
        if (name.startsWith(CHECKSUMS_PREFIX)) {
            return true;
        }
        // TODO can we drop .cks
        return name.endsWith(".cks"); // bwcompat - .cks used to be a previous checksum file
    }

    /**
     * Returns true if the file is auto-generated by the store and shouldn't be deleted during cleanup.
     * This covers the index writer's write lock and checksum files.
     */
    public static boolean isAutogenerated(String name) {
        if (IndexWriter.WRITE_LOCK_NAME.equals(name)) {
            return true;
        }
        return isChecksum(name);
    }

    /**
     * Produces a string representation of the given digest value: the value rendered in the largest
     * radix {@link Long#toString(long, int)} supports.
     */
    public static String digestToString(long digest) {
        final int radix = Character.MAX_RADIX;
        return Long.toString(digest, radix);
    }


    /**
     * Index output that verifies what is written against the expected file metadata. The last 8 bytes of
     * the expected length are treated as the footer checksum: once the write position reaches them, the
     * checksum reported by {@code getChecksum()} is compared with the expected one, and the footer bytes
     * themselves are recorded and compared as well.
     */
    static class LuceneVerifyingIndexOutput extends VerifyingIndexOutput {

        // expected name, length and checksum of the file being written
        private final StoreFileMetaData metadata;
        // number of bytes passed through this output so far
        private long writtenBytes;
        // offset of the first footer checksum byte (expected length - 8)
        private final long checksumPosition;
        // digest string taken when the write position reaches checksumPosition; null before that
        private String actualChecksum;
        private final byte[] footerChecksum = new byte[8]; // this holds the actual footer checksum data written by to this output

        LuceneVerifyingIndexOutput(StoreFileMetaData metadata, IndexOutput out) {
            super(out);
            this.metadata = metadata;
            checksumPosition = metadata.length() - 8; // the last 8 bytes are the checksum - we store it in footerChecksum
        }

        /**
         * Succeeds only if the digest taken at the checksum position, the number of written bytes and the
         * recorded footer bytes all match the expected metadata.
         *
         * @throws IOException a CorruptIndexException if any of these checks fails
         */
        @Override
        public void verify() throws IOException {
            String footerDigest = null;
            if (metadata.checksum().equals(actualChecksum) && writtenBytes == metadata.length()) {
                // decode the recorded footer bytes as a long and compare its string form as well
                ByteArrayIndexInput indexInput = new ByteArrayIndexInput("checksum", this.footerChecksum);
                footerDigest = digestToString(indexInput.readLong());
                if (metadata.checksum().equals(footerDigest)) {
                    return;
                }
            }
            throw new CorruptIndexException("verification failed (hardware problem?) : expected=" + metadata.checksum() +
                    " actual=" + actualChecksum + " footer=" + footerDigest +" writtenLength=" + writtenBytes + " expectedLength=" + metadata.length() +
                    " (resource=" + metadata.toString() + ")", "VerifyingIndexOutput(" + metadata.name() + ")");
        }

        @Override
        public void writeByte(byte b) throws IOException {
            // position of this byte, ie. the count before the increment
            final long writtenBytes = this.writtenBytes++;
            if (writtenBytes >= checksumPosition) { // we are writing parts of the checksum....
                if (writtenBytes == checksumPosition) {
                    // first footer byte: everything before it has been written, so compare the digest now
                    readAndCompareChecksum();
                }
                final long indexLong = writtenBytes - checksumPosition;
                if ((int)indexLong != indexLong) {
                    throw new ArithmeticException("integer overflow");
                }
                final int index = (int)indexLong;
                if (index < footerChecksum.length) {
                    footerChecksum[index] = b;
                    if (index == footerChecksum.length-1) {
                        verify(); // we have recorded the entire checksum
                    }
                } else {
                    verify(); // fail if we write more than expected
                    throw new AssertionError("write past EOF expected length: " + metadata.length() + " writtenBytes: " + writtenBytes);
                }
            }
            out.writeByte(b);
        }

        /**
         * Captures the current checksum as a digest string and compares it with the expected checksum.
         *
         * @throws IOException a CorruptIndexException if the two differ
         */
        private void readAndCompareChecksum() throws IOException {
            actualChecksum = digestToString(getChecksum());
            if (!metadata.checksum().equals(actualChecksum)) {
                throw new CorruptIndexException("checksum failed (hardware problem?) : expected=" + metadata.checksum() +
                        " actual=" + actualChecksum +
                        " (resource=" + metadata.toString() + ")", "VerifyingIndexOutput(" + metadata.name() + ")");
            }
        }

        @Override
        public void writeBytes(byte[] b, int offset, int length) throws IOException {
            // a write that reaches into the footer goes byte by byte so writeByte can do its bookkeeping
            if (writtenBytes + length > checksumPosition) {
                for (int i = 0; i < length; i++) { // don't optimze writing the last block of bytes
                    writeByte(b[offset+i]);
                }
            } else {
                out.writeBytes(b, offset, length);
                writtenBytes += length;
            }
        }
    }

    /**
     * Index input that calculates checksum as data is read from the input.
     * 
     * This class supports random access (it is possible to seek backward and forward) in order to accommodate retry
     * mechanism that is used in some repository plugins (S3 for example). However, the checksum is only calculated on
     * the first read. All consecutive reads of the same data are not used to calculate the checksum.
     */
    static class VerifyingIndexInput extends ChecksumIndexInput {
        private final IndexInput input;
        private final Checksum digest;
        private final long checksumPosition;
        private final byte[] checksum = new byte[8];
        private long verifiedPosition = 0;

        /**
         * Wraps the given input, using a buffered CRC32 as the digest.
         *
         * @param input the input to read from
         */
        public VerifyingIndexInput(IndexInput input) {
            this(input, new BufferedChecksum(new CRC32()));
        }

        /**
         * Wraps the given input and feeds the data read from it into the given digest.
         *
         * @param input  the input to read from
         * @param digest the checksum implementation to update
         */
        public VerifyingIndexInput(IndexInput input, Checksum digest) {
            super("VerifyingIndexInput(" + input + ")");
            this.input = input;
            this.digest = digest;
            // the last 8 bytes of the input are kept in the checksum array instead of being digested
            checksumPosition = input.length() - 8;
        }

        /**
         * Reads one byte from the wrapped input. If the byte lies beyond what has been seen so far, it is
         * either added to the digest or, when it belongs to the trailing 8 bytes, stored in the checksum
         * array. Bytes at positions that were already processed are returned without further bookkeeping.
         */
        @Override
        public byte readByte() throws IOException {
            long pos = input.getFilePointer();
            final byte b = input.readByte();
            pos++; // pos is now the position just after the byte that was read
            if (pos > verifiedPosition) {
                if (pos <= checksumPosition) {
                    digest.update(b);
                } else {
                    // index of this byte within the trailing 8 bytes
                    checksum[(int) (pos - checksumPosition - 1)] = b;
                }
                verifiedPosition = pos;
            }
            return b;
        }

        @Override
        public void readBytes(byte[] b, int offset, int len)
                throws IOException {
            long pos = input.getFilePointer();
            input.readBytes(b, offset, len);
            if (pos + len > verifiedPosition) {
                // Conversion to int is safe here because (verifiedPosition - pos) can be at most len, which is integer
                int alreadyVerified = (int) Math.max(0, verifiedPosition - pos);
                if (pos < checksumPosition) {
                    if (pos + len < checksumPosition) {
                        digest.update(b, offset + alreadyVerified, len - alreadyVerified);
                    } else {
                        int checksumOffset = (int) (checksumPosition - pos);
                        if (checksumOffset - alreadyVerified > 0) {
                            digest.update(b, offset + alreadyVerified, checksumOffset - alreadyVerified);
                        }
                        System.arraycopy(b, offset + checksumOffset, checksum, 0, len - checksumOffset);
                    }
                } else {
                    // Conversion to int is safe here because checksumPosition is (file length - 8) so
                    // (pos - checksumPosition) cannot be bigger than 8 unless we are reading after the end of file
                    assert pos - checksumPosition < 8;
                    System.arraycopy(b, offset, checksum, (int) (pos - checksumPosition), len);
                }
                verifiedPosition = pos + len;
            }
        }

        @Override
        public long getChecksum() {
            return digest.getValue();
        }

        @Override
        public void seek(long pos) throws IOException {
            if (pos < verifiedPosition) {
                // going within verified region - just seek there
                input.seek(pos);
            } else {
                if (verifiedPosition > getFilePointer()) {
                    // portion of the skip region is verified and portion is not
                    // skipping the verified portion
                    input.seek(verifiedPosition);
                    // and checking unverified
                    skipBytes(pos - verifiedPosition);
                } else {
                    skipBytes(pos - getFilePointer());
                }
            }
        }

        @Override
        public void close() throws IOException {
            input.close();
        }

        @Override
        public long getFilePointer() {
            return input.getFilePointer();
        }

        @Override
        public long length() {
            return input.length();
        }

        @Override
        public IndexInput clone() {
            throw new UnsupportedOperationException();
        }

        @Override
        public IndexInput slice(String sliceDescription, long offset, long length) throws IOException {
            throw new UnsupportedOperationException();
        }

        public long getStoredChecksum() {
            return new ByteArrayDataInput(checksum).readLong();
        }

        public long verify() throws CorruptIndexException {
            long storedChecksum = getStoredChecksum();
            if (getChecksum() == storedChecksum) {
                return storedChecksum;
            }
            throw new CorruptIndexException("verification failed : calculated=" + Store.digestToString(getChecksum()) +
                    " stored=" + Store.digestToString(storedChecksum), this);
        }

    }

    /**
     * Deletes the given files from this store's directory on a best-effort basis. Whatever is thrown while
     * deleting a file is swallowed and the remaining files are still processed.
     *
     * @param files the names of the files to delete
     */
    public void deleteQuiet(String... files) {
        for (int i = 0; i < files.length; i++) {
            final String name = files[i];
            try {
                directory().deleteFile(name);
            } catch (Throwable ignored) {
                // deliberately ignored - deletion is best effort
            }
        }
    }

    /**
     * Marks this store as corrupted. This method writes a corrupted_${uuid} file containing the given exception
     * message. If a store contains a corrupted_${uuid} file {@link #isMarkedCorrupted()} will return true.
     * Nothing is written if the store is already marked as corrupted.
     *
     * An {@link IOException} raised while writing the marker is only logged; the marker file is synced
     * afterwards in either case.
     *
     * @param exception the failure to serialize into the marker file
     * @throws IOException if the store is not usable, or if syncing the marker file fails
     */
    public void markStoreCorrupted(IOException exception) throws IOException {
        ensureOpen();
        if (isMarkedCorrupted()) {
            // a marker is already in place
            return;
        }
        final String markerName = CORRUPTED + Strings.randomBase64UUID();
        try (IndexOutput markerOutput = this.directory().createOutput(markerName, IOContext.DEFAULT)) {
            CodecUtil.writeHeader(markerOutput, CODEC, VERSION);
            // serialize the exception in memory first so that its length can be written in front of it
            final BytesStreamOutput serialized = new BytesStreamOutput();
            serialized.writeThrowable(exception);
            final BytesReference payload = serialized.bytes();
            markerOutput.writeVInt(payload.length());
            markerOutput.writeBytes(payload.array(), payload.arrayOffset(), payload.length());
            CodecUtil.writeFooter(markerOutput);
        } catch (IOException ex) {
            logger.warn("Can't mark store as corrupted", ex);
        }
        directory().sync(Collections.singleton(markerName));
    }

    /**
     * Callback that is notified once the store has been closed and every reference to it has been released.
     */
    public static interface OnClose extends Callback {
        /**
         * Listener that does nothing when notified.
         */
        OnClose EMPTY = new OnClose() {
            /**
             * Invoked while the given {@link org.elasticsearch.env.ShardLock} is held, and only once, after all
             * resources of a store have been released. This implementation is a no-op.
             */
            @Override
            public void handle(ShardLock lock) {
            }
        };
    }

    /**
     * Cache for the {@link StoreStats} of a directory. The cached value is built from the summed length of all
     * files in the directory and the throttle time reported by the directory service.
     */
    private static class StoreStatsCache extends SingleObjectCache {
        private final Directory directory;
        private final DirectoryService directoryService;

        /**
         * Creates the cache and seeds it with freshly computed stats.
         *
         * @throws IOException if the directory cannot be listed or a file length cannot be read
         */
        public StoreStatsCache(TimeValue refreshInterval, Directory directory, DirectoryService directoryService) throws IOException {
            super(refreshInterval, computeStats(directory, directoryService));
            this.directory = directory;
            this.directoryService = directoryService;
        }

        /**
         * Recomputes the stats; an {@link IOException} is rethrown wrapped in an {@link ElasticsearchException}.
         */
        @Override
        protected StoreStats refresh() {
            try {
                return computeStats(directory, directoryService);
            } catch (IOException ex) {
                throw new ElasticsearchException("failed to refresh store stats", ex);
            }
        }

        /**
         * Builds a stats object from the current directory size and the current throttle time.
         */
        private static StoreStats computeStats(Directory directory, DirectoryService directoryService) throws IOException {
            return new StoreStats(estimateSize(directory), directoryService.throttleTimeInNanos());
        }

        /**
         * Sums up the length of every file listed in the directory. Files that are reported as missing when
         * their length is requested do not contribute to the sum.
         */
        private static long estimateSize(Directory directory) throws IOException {
            long total = 0;
            for (String name : directory.listAll()) {
                try {
                    total += directory.fileLength(name);
                } catch (NoSuchFileException | FileNotFoundException e) {
                    // ignore, the file is no longer there
                }
            }
            return total;
        }
    }
}