/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Modifications Copyright OpenSearch Contributors. See
* GitHub history for details.
*/
package org.opensearch.gateway;
import com.carrotsearch.hppc.cursors.ObjectCursor;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.message.ParameterizedMessage;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexNotFoundException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.SerialMergeScheduler;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.Weight;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.NIOFSDirectory;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefIterator;
import org.apache.lucene.util.SetOnce;
import org.opensearch.Version;
import org.opensearch.cluster.ClusterState;
import org.opensearch.cluster.metadata.IndexMetadata;
import org.opensearch.cluster.metadata.Metadata;
import org.opensearch.common.CheckedConsumer;
import org.opensearch.common.Nullable;
import org.opensearch.common.bytes.BytesReference;
import org.opensearch.common.bytes.RecyclingBytesStreamOutput;
import org.opensearch.common.io.Streams;
import org.opensearch.common.lease.Releasable;
import org.opensearch.common.lease.Releasables;
import org.opensearch.common.logging.Loggers;
import org.opensearch.common.lucene.Lucene;
import org.opensearch.common.settings.ClusterSettings;
import org.opensearch.common.settings.Setting;
import org.opensearch.common.unit.TimeValue;
import org.opensearch.common.util.BigArrays;
import org.opensearch.common.util.ByteArray;
import org.opensearch.common.util.PageCacheRecycler;
import org.opensearch.common.xcontent.LoggingDeprecationHandler;
import org.opensearch.common.xcontent.NamedXContentRegistry;
import org.opensearch.common.xcontent.ToXContent;
import org.opensearch.common.xcontent.XContentBuilder;
import org.opensearch.common.xcontent.XContentFactory;
import org.opensearch.common.xcontent.XContentType;
import org.opensearch.core.internal.io.IOUtils;
import org.opensearch.env.NodeEnvironment;
import org.opensearch.env.NodeMetadata;
import org.opensearch.index.Index;
import java.io.Closeable;
import java.io.IOError;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.function.IntPredicate;
import java.util.function.LongSupplier;
import java.util.function.Supplier;
/**
* Stores cluster metadata in a bare Lucene index (per data path) split across a number of documents. This is used by master-eligible nodes
* to record the last-accepted cluster state during publication. The metadata is written incrementally where possible, leaving alone any
* documents that have not changed. The index has the following fields:
*
* +------------------------------+-----------------------------+----------------------------------------------+
* | "type" (string field) | "index_uuid" (string field) | "data" (stored binary field in SMILE format) |
* +------------------------------+-----------------------------+----------------------------------------------+
* | GLOBAL_TYPE_NAME == "global" | (omitted) | Global metadata |
* | INDEX_TYPE_NAME == "index" | Index UUID | Index metadata |
* +------------------------------+-----------------------------+----------------------------------------------+
*
* Additionally each commit has the following user data:
*
* +---------------------------+-------------------------+-------------------------------------------------------------------------------+
* | Key symbol | Key literal | Value |
* +---------------------------+-------------------------+-------------------------------------------------------------------------------+
* | CURRENT_TERM_KEY | "current_term" | Node's "current" term (≥ last-accepted term and the terms of all sent joins) |
* | LAST_ACCEPTED_VERSION_KEY | "last_accepted_version" | The cluster state version corresponding with the persisted metadata |
* | NODE_ID_KEY | "node_id" | The (persistent) ID of the node that wrote this metadata |
* | NODE_VERSION_KEY | "node_version" | The (ID of the) version of the node that wrote this metadata |
* +---------------------------+-------------------------+-------------------------------------------------------------------------------+
*
* (the last-accepted term is recorded in Metadata → CoordinationMetadata so does not need repeating here)
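*
* A typical lifecycle, sketched here for illustration only (the {@code nodeEnvironment}, {@code xContentRegistry}, {@code bigArrays},
* {@code clusterSettings}, {@code currentTerm} and {@code clusterState} values are assumed to be supplied by the surrounding node
* bootstrap and coordination code):
*
* <pre>{@code
* PersistedClusterStateService service = new PersistedClusterStateService(
*     nodeEnvironment, xContentRegistry, bigArrays, clusterSettings, System::currentTimeMillis);
*
* // on startup, recover the freshest previously-persisted state (if any)
* PersistedClusterStateService.OnDiskState onDiskState = service.loadBestOnDiskState();
*
* // during publication, persist the accepted term and cluster state
* try (PersistedClusterStateService.Writer writer = service.createWriter()) {
*     writer.writeFullStateAndCommit(currentTerm, clusterState);
* }
* }</pre>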
*/
public class PersistedClusterStateService {
private static final Logger logger = LogManager.getLogger(PersistedClusterStateService.class);
private static final String CURRENT_TERM_KEY = "current_term";
private static final String LAST_ACCEPTED_VERSION_KEY = "last_accepted_version";
private static final String NODE_ID_KEY = "node_id";
private static final String NODE_VERSION_KEY = "node_version";
private static final String TYPE_FIELD_NAME = "type";
private static final String DATA_FIELD_NAME = "data";
private static final String GLOBAL_TYPE_NAME = "global";
private static final String INDEX_TYPE_NAME = "index";
private static final String INDEX_UUID_FIELD_NAME = "index_uuid";
private static final int COMMIT_DATA_SIZE = 4;
public static final String METADATA_DIRECTORY_NAME = MetadataStateFormat.STATE_DIR_NAME;
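// This threshold is dynamic, so it can also be adjusted at runtime via the cluster settings update API
// (e.g. "gateway.slow_write_logging_threshold": "30s"; the value shown is purely illustrative).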
public static final Setting<TimeValue> SLOW_WRITE_LOGGING_THRESHOLD = Setting.timeSetting(
"gateway.slow_write_logging_threshold",
TimeValue.timeValueSeconds(10),
TimeValue.ZERO,
Setting.Property.NodeScope,
Setting.Property.Dynamic
);
private final Path[] dataPaths;
private final String nodeId;
private final NamedXContentRegistry namedXContentRegistry;
private final BigArrays bigArrays;
private final LongSupplier relativeTimeMillisSupplier;
private volatile TimeValue slowWriteLoggingThreshold;
public PersistedClusterStateService(
NodeEnvironment nodeEnvironment,
NamedXContentRegistry namedXContentRegistry,
BigArrays bigArrays,
ClusterSettings clusterSettings,
LongSupplier relativeTimeMillisSupplier
) {
this(
nodeEnvironment.nodeDataPaths(),
nodeEnvironment.nodeId(),
namedXContentRegistry,
bigArrays,
clusterSettings,
relativeTimeMillisSupplier
);
}
public PersistedClusterStateService(
Path[] dataPaths,
String nodeId,
NamedXContentRegistry namedXContentRegistry,
BigArrays bigArrays,
ClusterSettings clusterSettings,
LongSupplier relativeTimeMillisSupplier
) {
this.dataPaths = dataPaths;
this.nodeId = nodeId;
this.namedXContentRegistry = namedXContentRegistry;
this.bigArrays = bigArrays;
this.relativeTimeMillisSupplier = relativeTimeMillisSupplier;
this.slowWriteLoggingThreshold = clusterSettings.get(SLOW_WRITE_LOGGING_THRESHOLD);
clusterSettings.addSettingsUpdateConsumer(SLOW_WRITE_LOGGING_THRESHOLD, this::setSlowWriteLoggingThreshold);
}
private void setSlowWriteLoggingThreshold(TimeValue slowWriteLoggingThreshold) {
this.slowWriteLoggingThreshold = slowWriteLoggingThreshold;
}
public String getNodeId() {
return nodeId;
}
/**
* Creates a new disk-based writer for cluster states
*/
public Writer createWriter() throws IOException {
final List<MetadataIndexWriter> metadataIndexWriters = new ArrayList<>();
final List<Closeable> closeables = new ArrayList<>();
boolean success = false;
try {
for (final Path path : dataPaths) {
final Directory directory = createDirectory(path.resolve(METADATA_DIRECTORY_NAME));
closeables.add(directory);
final IndexWriter indexWriter = createIndexWriter(directory, false);
closeables.add(indexWriter);
metadataIndexWriters.add(new MetadataIndexWriter(directory, indexWriter));
}
success = true;
} finally {
if (success == false) {
IOUtils.closeWhileHandlingException(closeables);
}
}
return new Writer(metadataIndexWriters, nodeId, bigArrays, relativeTimeMillisSupplier, () -> slowWriteLoggingThreshold);
}
private static IndexWriter createIndexWriter(Directory directory, boolean openExisting) throws IOException {
final IndexWriterConfig indexWriterConfig = new IndexWriterConfig(new KeywordAnalyzer());
// start empty since we re-write the whole cluster state to ensure it is all using the same format version
indexWriterConfig.setOpenMode(openExisting ? IndexWriterConfig.OpenMode.APPEND : IndexWriterConfig.OpenMode.CREATE);
// only commit when specifically instructed, we must not write any intermediate states
indexWriterConfig.setCommitOnClose(false);
// most of the data goes into stored fields which are not buffered, so we only really need a tiny buffer
indexWriterConfig.setRAMBufferSizeMB(1.0);
// merge on the write thread (e.g. while flushing)
indexWriterConfig.setMergeScheduler(new SerialMergeScheduler());
return new IndexWriter(directory, indexWriterConfig);
}
/**
* Remove all persisted cluster states from the given data paths, for use in tests. Should only be called when there is no open
* {@link Writer} on these paths.
*/
public static void deleteAll(Path[] dataPaths) throws IOException {
for (Path dataPath : dataPaths) {
Lucene.cleanLuceneIndex(new NIOFSDirectory(dataPath.resolve(METADATA_DIRECTORY_NAME)));
}
}
// exposed for tests
Directory createDirectory(Path path) throws IOException {
// it is possible to disable the use of MMapDirectory for indices, and it may be surprising to users that have done so if we still
// use a MMapDirectory here, which might happen with FSDirectory.open(path). Concurrency is of no concern here so a
// NIOFSDirectory is fine:
return new NIOFSDirectory(path);
}
public Path[] getDataPaths() {
return dataPaths;
}
public static class OnDiskState {
private static final OnDiskState NO_ON_DISK_STATE = new OnDiskState(null, null, 0L, 0L, Metadata.EMPTY_METADATA);
private final String nodeId;
private final Path dataPath;
public final long currentTerm;
public final long lastAcceptedVersion;
public final Metadata metadata;
private OnDiskState(String nodeId, Path dataPath, long currentTerm, long lastAcceptedVersion, Metadata metadata) {
this.nodeId = nodeId;
this.dataPath = dataPath;
this.currentTerm = currentTerm;
this.lastAcceptedVersion = lastAcceptedVersion;
this.metadata = metadata;
}
public boolean empty() {
return this == NO_ON_DISK_STATE;
}
}
/**
* Returns the node metadata for the given data paths, and checks if the node ids are unique
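* <p>
* Illustrative usage (the data paths are assumed to come from the node environment):
* <pre>{@code
* NodeMetadata nodeMetadata = PersistedClusterStateService.nodeMetadata(nodeEnvironment.nodeDataPaths());
* }</pre>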
* @param dataPaths the data paths to scan
*/
@Nullable
public static NodeMetadata nodeMetadata(Path... dataPaths) throws IOException {
String nodeId = null;
Version version = null;
for (final Path dataPath : dataPaths) {
final Path indexPath = dataPath.resolve(METADATA_DIRECTORY_NAME);
if (Files.exists(indexPath)) {
try (DirectoryReader reader = DirectoryReader.open(new NIOFSDirectory(dataPath.resolve(METADATA_DIRECTORY_NAME)))) {
final Map<String, String> userData = reader.getIndexCommit().getUserData();
assert userData.get(NODE_VERSION_KEY) != null;
final String thisNodeId = userData.get(NODE_ID_KEY);
assert thisNodeId != null;
if (nodeId != null && nodeId.equals(thisNodeId) == false) {
throw new IllegalStateException(
"unexpected node ID in metadata, found [" + thisNodeId + "] in [" + dataPath + "] but expected [" + nodeId + "]"
);
} else if (nodeId == null) {
nodeId = thisNodeId;
version = Version.fromId(Integer.parseInt(userData.get(NODE_VERSION_KEY)));
}
} catch (IndexNotFoundException e) {
logger.debug(new ParameterizedMessage("no on-disk state at {}", indexPath), e);
}
}
}
if (nodeId == null) {
return null;
}
return new NodeMetadata(nodeId, version);
}
/**
* Overrides the version field for the metadata in the given data path
*/
public static void overrideVersion(Version newVersion, Path... dataPaths) throws IOException {
for (final Path dataPath : dataPaths) {
final Path indexPath = dataPath.resolve(METADATA_DIRECTORY_NAME);
if (Files.exists(indexPath)) {
try (DirectoryReader reader = DirectoryReader.open(new NIOFSDirectory(dataPath.resolve(METADATA_DIRECTORY_NAME)))) {
final Map<String, String> userData = reader.getIndexCommit().getUserData();
assert userData.get(NODE_VERSION_KEY) != null;
try (IndexWriter indexWriter = createIndexWriter(new NIOFSDirectory(dataPath.resolve(METADATA_DIRECTORY_NAME)), true)) {
final Map<String, String> commitData = new HashMap<>(userData);
commitData.put(NODE_VERSION_KEY, Integer.toString(newVersion.id));
indexWriter.setLiveCommitData(commitData.entrySet());
indexWriter.commit();
}
} catch (IndexNotFoundException e) {
logger.debug(new ParameterizedMessage("no on-disk state at {}", indexPath), e);
}
}
}
}
/**
* Loads the best available on-disk cluster state. Returns {@link OnDiskState#NO_ON_DISK_STATE} if no such state was found.
*/
public OnDiskState loadBestOnDiskState() throws IOException {
String committedClusterUuid = null;
Path committedClusterUuidPath = null;
OnDiskState bestOnDiskState = OnDiskState.NO_ON_DISK_STATE;
OnDiskState maxCurrentTermOnDiskState = bestOnDiskState;
// We use a write-all-read-one strategy: metadata is written to every data path when accepting it, which means it is mostly
// sufficient to read _any_ copy. "Mostly" sufficient because the user can change the set of data paths when restarting, and may
// add a data path containing a stale copy of the metadata. We deal with this by using the freshest copy we can find.
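// Illustrative example of the ordering applied below: a copy whose last-accepted term is 5 beats one whose last-accepted term is 4;
// with equal last-accepted terms the higher last-accepted version wins, and with equal versions the higher current term wins.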
for (final Path dataPath : dataPaths) {
final Path indexPath = dataPath.resolve(METADATA_DIRECTORY_NAME);
if (Files.exists(indexPath)) {
try (Directory directory = createDirectory(indexPath); DirectoryReader directoryReader = DirectoryReader.open(directory)) {
final OnDiskState onDiskState = loadOnDiskState(dataPath, directoryReader);
if (nodeId.equals(onDiskState.nodeId) == false) {
throw new IllegalStateException(
"unexpected node ID in metadata, found ["
+ onDiskState.nodeId
+ "] in ["
+ dataPath
+ "] but expected ["
+ nodeId
+ "]"
);
}
if (onDiskState.metadata.clusterUUIDCommitted()) {
if (committedClusterUuid == null) {
committedClusterUuid = onDiskState.metadata.clusterUUID();
committedClusterUuidPath = dataPath;
} else if (committedClusterUuid.equals(onDiskState.metadata.clusterUUID()) == false) {
throw new IllegalStateException(
"mismatched cluster UUIDs in metadata, found ["
+ committedClusterUuid
+ "] in ["
+ committedClusterUuidPath
+ "] and ["
+ onDiskState.metadata.clusterUUID()
+ "] in ["
+ dataPath
+ "]"
);
}
}
if (maxCurrentTermOnDiskState.empty() || maxCurrentTermOnDiskState.currentTerm < onDiskState.currentTerm) {
maxCurrentTermOnDiskState = onDiskState;
}
long acceptedTerm = onDiskState.metadata.coordinationMetadata().term();
long maxAcceptedTerm = bestOnDiskState.metadata.coordinationMetadata().term();
if (bestOnDiskState.empty()
|| acceptedTerm > maxAcceptedTerm
|| (acceptedTerm == maxAcceptedTerm
&& (onDiskState.lastAcceptedVersion > bestOnDiskState.lastAcceptedVersion
|| (onDiskState.lastAcceptedVersion == bestOnDiskState.lastAcceptedVersion)
&& onDiskState.currentTerm > bestOnDiskState.currentTerm))) {
bestOnDiskState = onDiskState;
}
} catch (IndexNotFoundException e) {
logger.debug(new ParameterizedMessage("no on-disk state at {}", indexPath), e);
}
}
}
if (bestOnDiskState.currentTerm != maxCurrentTermOnDiskState.currentTerm) {
throw new IllegalStateException(
"inconsistent terms found: best state is from ["
+ bestOnDiskState.dataPath
+ "] in term ["
+ bestOnDiskState.currentTerm
+ "] but there is a stale state in ["
+ maxCurrentTermOnDiskState.dataPath
+ "] with greater term ["
+ maxCurrentTermOnDiskState.currentTerm
+ "]"
);
}
return bestOnDiskState;
}
private OnDiskState loadOnDiskState(Path dataPath, DirectoryReader reader) throws IOException {
final IndexSearcher searcher = new IndexSearcher(reader);
searcher.setQueryCache(null);
final SetOnce<Metadata.Builder> builderReference = new SetOnce<>();
consumeFromType(searcher, GLOBAL_TYPE_NAME, bytes -> {
final Metadata metadata = Metadata.Builder.fromXContent(
XContentFactory.xContent(XContentType.SMILE)
.createParser(namedXContentRegistry, LoggingDeprecationHandler.INSTANCE, bytes.bytes, bytes.offset, bytes.length)
);
logger.trace("found global metadata with last-accepted term [{}]", metadata.coordinationMetadata().term());
if (builderReference.get() != null) {
throw new IllegalStateException("duplicate global metadata found in [" + dataPath + "]");
}
builderReference.set(Metadata.builder(metadata));
});
final Metadata.Builder builder = builderReference.get();
if (builder == null) {
throw new IllegalStateException("no global metadata found in [" + dataPath + "]");
}
logger.trace("got global metadata, now reading index metadata");
final Set<String> indexUUIDs = new HashSet<>();
consumeFromType(searcher, INDEX_TYPE_NAME, bytes -> {
final IndexMetadata indexMetadata = IndexMetadata.fromXContent(
XContentFactory.xContent(XContentType.SMILE)
.createParser(namedXContentRegistry, LoggingDeprecationHandler.INSTANCE, bytes.bytes, bytes.offset, bytes.length)
);
logger.trace("found index metadata for {}", indexMetadata.getIndex());
if (indexUUIDs.add(indexMetadata.getIndexUUID()) == false) {
throw new IllegalStateException("duplicate metadata found for " + indexMetadata.getIndex() + " in [" + dataPath + "]");
}
builder.put(indexMetadata, false);
});
final Map<String, String> userData = reader.getIndexCommit().getUserData();
logger.trace("loaded metadata [{}] from [{}]", userData, reader.directory());
assert userData.size() == COMMIT_DATA_SIZE : userData;
assert userData.get(CURRENT_TERM_KEY) != null;
assert userData.get(LAST_ACCEPTED_VERSION_KEY) != null;
assert userData.get(NODE_ID_KEY) != null;
assert userData.get(NODE_VERSION_KEY) != null;
return new OnDiskState(
userData.get(NODE_ID_KEY),
dataPath,
Long.parseLong(userData.get(CURRENT_TERM_KEY)),
Long.parseLong(userData.get(LAST_ACCEPTED_VERSION_KEY)),
builder.build()
);
}
private static void consumeFromType(IndexSearcher indexSearcher, String type, CheckedConsumer<BytesRef, IOException> bytesRefConsumer)
throws IOException {
final Query query = new TermQuery(new Term(TYPE_FIELD_NAME, type));
final Weight weight = indexSearcher.createWeight(query, ScoreMode.COMPLETE_NO_SCORES, 0.0f);
logger.trace("running query [{}]", query);
for (LeafReaderContext leafReaderContext : indexSearcher.getIndexReader().leaves()) {
logger.trace("new leafReaderContext: {}", leafReaderContext);
final Scorer scorer = weight.scorer(leafReaderContext);
if (scorer != null) {
final Bits liveDocs = leafReaderContext.reader().getLiveDocs();
final IntPredicate isLiveDoc = liveDocs == null ? i -> true : liveDocs::get;
final DocIdSetIterator docIdSetIterator = scorer.iterator();
while (docIdSetIterator.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
if (isLiveDoc.test(docIdSetIterator.docID())) {
logger.trace("processing doc {}", docIdSetIterator.docID());
bytesRefConsumer.accept(
leafReaderContext.reader().document(docIdSetIterator.docID()).getBinaryValue(DATA_FIELD_NAME)
);
}
}
}
}
}
private static final ToXContent.Params FORMAT_PARAMS;
static {
Map<String, String> params = new HashMap<>(2);
params.put("binary", "true");
params.put(Metadata.CONTEXT_MODE_PARAM, Metadata.CONTEXT_MODE_GATEWAY);
FORMAT_PARAMS = new ToXContent.MapParams(params);
}
/**
* Encapsulates a single {@link IndexWriter} with its {@link Directory} for ease of closing, and a {@link Logger}. There is one of these
* for each data path.
*/
private static class MetadataIndexWriter implements Closeable {
private final Logger logger;
private final Directory directory;
private final IndexWriter indexWriter;
MetadataIndexWriter(Directory directory, IndexWriter indexWriter) {
this.directory = directory;
this.indexWriter = indexWriter;
this.logger = Loggers.getLogger(MetadataIndexWriter.class, directory.toString());
}
void deleteAll() throws IOException {
this.logger.trace("clearing existing metadata");
this.indexWriter.deleteAll();
}
void updateIndexMetadataDocument(Document indexMetadataDocument, Index index) throws IOException {
this.logger.trace("updating metadata for [{}]", index);
indexWriter.updateDocument(new Term(INDEX_UUID_FIELD_NAME, index.getUUID()), indexMetadataDocument);
}
void updateGlobalMetadata(Document globalMetadataDocument) throws IOException {
this.logger.trace("updating global metadata doc");
indexWriter.updateDocument(new Term(TYPE_FIELD_NAME, GLOBAL_TYPE_NAME), globalMetadataDocument);
}
void deleteIndexMetadata(String indexUUID) throws IOException {
this.logger.trace("removing metadata for [{}]", indexUUID);
indexWriter.deleteDocuments(new Term(INDEX_UUID_FIELD_NAME, indexUUID));
}
void flush() throws IOException {
this.logger.trace("flushing");
this.indexWriter.flush();
}
void prepareCommit(String nodeId, long currentTerm, long lastAcceptedVersion) throws IOException {
final Map<String, String> commitData = new HashMap<>(COMMIT_DATA_SIZE);
commitData.put(CURRENT_TERM_KEY, Long.toString(currentTerm));
commitData.put(LAST_ACCEPTED_VERSION_KEY, Long.toString(lastAcceptedVersion));
commitData.put(NODE_VERSION_KEY, Integer.toString(Version.CURRENT.id));
commitData.put(NODE_ID_KEY, nodeId);
indexWriter.setLiveCommitData(commitData.entrySet());
indexWriter.prepareCommit();
}
void commit() throws IOException {
indexWriter.commit();
}
@Override
public void close() throws IOException {
IOUtils.close(indexWriter, directory);
}
}
public static class Writer implements Closeable {
private final List<MetadataIndexWriter> metadataIndexWriters;
private final String nodeId;
private final BigArrays bigArrays;
private final LongSupplier relativeTimeMillisSupplier;
private final Supplier<TimeValue> slowWriteLoggingThresholdSupplier;
boolean fullStateWritten = false;
private final AtomicBoolean closed = new AtomicBoolean();
// The size of the document buffer that was used for the last write operation, used as a hint for allocating the buffer for the
// next one.
private int documentBufferUsed;
private Writer(
List<MetadataIndexWriter> metadataIndexWriters,
String nodeId,
BigArrays bigArrays,
LongSupplier relativeTimeMillisSupplier,
Supplier<TimeValue> slowWriteLoggingThresholdSupplier
) {
this.metadataIndexWriters = metadataIndexWriters;
this.nodeId = nodeId;
this.bigArrays = bigArrays;
this.relativeTimeMillisSupplier = relativeTimeMillisSupplier;
this.slowWriteLoggingThresholdSupplier = slowWriteLoggingThresholdSupplier;
}
private void ensureOpen() {
if (closed.get()) {
throw new AlreadyClosedException("cluster state writer is closed already");
}
}
public boolean isOpen() {
return closed.get() == false;
}
private void closeIfAnyIndexWriterHasTragedyOrIsClosed() {
if (metadataIndexWriters.stream()
.map(writer -> writer.indexWriter)
.anyMatch(iw -> iw.getTragicException() != null || iw.isOpen() == false)) {
try {
close();
} catch (Exception e) {
logger.warn("failed on closing cluster state writer", e);
}
}
}
/**
* Overrides and commits the given current term and cluster state
*/
public void writeFullStateAndCommit(long currentTerm, ClusterState clusterState) throws IOException {
ensureOpen();
try {
final long startTimeMillis = relativeTimeMillisSupplier.getAsLong();
final WriterStats stats = overwriteMetadata(clusterState.metadata());
commit(currentTerm, clusterState.version());
fullStateWritten = true;
final long durationMillis = relativeTimeMillisSupplier.getAsLong() - startTimeMillis;
final TimeValue finalSlowWriteLoggingThreshold = slowWriteLoggingThresholdSupplier.get();
if (durationMillis >= finalSlowWriteLoggingThreshold.getMillis()) {
logger.warn(
"writing cluster state took [{}ms] which is above the warn threshold of [{}]; "
+ "wrote full state with [{}] indices",
durationMillis,
finalSlowWriteLoggingThreshold,
stats.numIndicesUpdated
);
} else {
logger.debug(
"writing cluster state took [{}ms]; " + "wrote full state with [{}] indices",
durationMillis,
stats.numIndicesUpdated
);
}
} finally {
closeIfAnyIndexWriterHasTragedyOrIsClosed();
}
}
/**
* Updates and commits the given cluster state update
*/
void writeIncrementalStateAndCommit(long currentTerm, ClusterState previousClusterState, ClusterState clusterState)
throws IOException {
ensureOpen();
ensureFullStateWritten();
try {
final long startTimeMillis = relativeTimeMillisSupplier.getAsLong();
final WriterStats stats = updateMetadata(previousClusterState.metadata(), clusterState.metadata());
commit(currentTerm, clusterState.version());
final long durationMillis = relativeTimeMillisSupplier.getAsLong() - startTimeMillis;
final TimeValue finalSlowWriteLoggingThreshold = slowWriteLoggingThresholdSupplier.get();
if (durationMillis >= finalSlowWriteLoggingThreshold.getMillis()) {
logger.warn(
"writing cluster state took [{}ms] which is above the warn threshold of [{}]; "
+ "wrote global metadata [{}] and metadata for [{}] indices and skipped [{}] unchanged indices",
durationMillis,
finalSlowWriteLoggingThreshold,
stats.globalMetaUpdated,
stats.numIndicesUpdated,
stats.numIndicesUnchanged
);
} else {
logger.debug(
"writing cluster state took [{}ms]; "
+ "wrote global metadata [{}] and metadata for [{}] indices and skipped [{}] unchanged indices",
durationMillis,
stats.globalMetaUpdated,
stats.numIndicesUpdated,
stats.numIndicesUnchanged
);
}
} finally {
closeIfAnyIndexWriterHasTragedyOrIsClosed();
}
}
private void ensureFullStateWritten() {
assert fullStateWritten : "Need to write full state first before doing incremental writes";
// noinspection ConstantConditions to catch this even if assertions are disabled
if (fullStateWritten == false) {
logger.error("cannot write incremental state");
throw new IllegalStateException("cannot write incremental state");
}
}
/**
* Update the persisted metadata to match the given cluster state by removing any stale or unnecessary documents and adding any
* updated documents.
*/
private WriterStats updateMetadata(Metadata previouslyWrittenMetadata, Metadata metadata) throws IOException {
assert previouslyWrittenMetadata.coordinationMetadata().term() == metadata.coordinationMetadata().term();
logger.trace("currentTerm [{}] matches previous currentTerm, writing changes only", metadata.coordinationMetadata().term());
try (DocumentBuffer documentBuffer = allocateBuffer()) {
final boolean updateGlobalMeta = Metadata.isGlobalStateEquals(previouslyWrittenMetadata, metadata) == false;
if (updateGlobalMeta) {
final Document globalMetadataDocument = makeGlobalMetadataDocument(metadata, documentBuffer);
for (MetadataIndexWriter metadataIndexWriter : metadataIndexWriters) {
metadataIndexWriter.updateGlobalMetadata(globalMetadataDocument);
}
}
final Map<String, Long> indexMetadataVersionByUUID = new HashMap<>(previouslyWrittenMetadata.indices().size());
for (ObjectCursor<IndexMetadata> cursor : previouslyWrittenMetadata.indices().values()) {
final IndexMetadata indexMetadata = cursor.value;
final Long previousValue = indexMetadataVersionByUUID.putIfAbsent(
indexMetadata.getIndexUUID(),
indexMetadata.getVersion()
);
assert previousValue == null : indexMetadata.getIndexUUID() + " already mapped to " + previousValue;
}
int numIndicesUpdated = 0;
int numIndicesUnchanged = 0;
for (ObjectCursor<IndexMetadata> cursor : metadata.indices().values()) {
final IndexMetadata indexMetadata = cursor.value;
final Long previousVersion = indexMetadataVersionByUUID.get(indexMetadata.getIndexUUID());
if (previousVersion == null || indexMetadata.getVersion() != previousVersion) {
logger.trace(
"updating metadata for [{}], changing version from [{}] to [{}]",
indexMetadata.getIndex(),
previousVersion,
indexMetadata.getVersion()
);
numIndicesUpdated++;
final Document indexMetadataDocument = makeIndexMetadataDocument(indexMetadata, documentBuffer);
for (MetadataIndexWriter metadataIndexWriter : metadataIndexWriters) {
metadataIndexWriter.updateIndexMetadataDocument(indexMetadataDocument, indexMetadata.getIndex());
}
} else {
numIndicesUnchanged++;
logger.trace("no action required for [{}]", indexMetadata.getIndex());
}
indexMetadataVersionByUUID.remove(indexMetadata.getIndexUUID());
}
documentBufferUsed = documentBuffer.getMaxUsed();
for (String removedIndexUUID : indexMetadataVersionByUUID.keySet()) {
for (MetadataIndexWriter metadataIndexWriter : metadataIndexWriters) {
metadataIndexWriter.deleteIndexMetadata(removedIndexUUID);
}
}
// Flush, to try and expose a failure (e.g. out of disk space) before committing, because we can handle a failure here more
// gracefully than one that occurs during the commit process.
for (MetadataIndexWriter metadataIndexWriter : metadataIndexWriters) {
metadataIndexWriter.flush();
}
return new WriterStats(updateGlobalMeta, numIndicesUpdated, numIndicesUnchanged);
}
}
/**
* Update the persisted metadata to match the given cluster state by removing all existing documents and then adding new documents.
*/
private WriterStats overwriteMetadata(Metadata metadata) throws IOException {
for (MetadataIndexWriter metadataIndexWriter : metadataIndexWriters) {
metadataIndexWriter.deleteAll();
}
return addMetadata(metadata);
}
/**
* Add documents for the metadata of the given cluster state, assuming that there are currently no documents.
*/
private WriterStats addMetadata(Metadata metadata) throws IOException {
try (DocumentBuffer documentBuffer = allocateBuffer()) {
final Document globalMetadataDocument = makeGlobalMetadataDocument(metadata, documentBuffer);
for (MetadataIndexWriter metadataIndexWriter : metadataIndexWriters) {
metadataIndexWriter.updateGlobalMetadata(globalMetadataDocument);
}
for (ObjectCursor<IndexMetadata> cursor : metadata.indices().values()) {
final IndexMetadata indexMetadata = cursor.value;
final Document indexMetadataDocument = makeIndexMetadataDocument(indexMetadata, documentBuffer);
for (MetadataIndexWriter metadataIndexWriter : metadataIndexWriters) {
metadataIndexWriter.updateIndexMetadataDocument(indexMetadataDocument, indexMetadata.getIndex());
}
}
documentBufferUsed = documentBuffer.getMaxUsed();
// Flush, to try and expose a failure (e.g. out of disk space) before committing, because we can handle a failure here more
// gracefully than one that occurs during the commit process.
for (MetadataIndexWriter metadataIndexWriter : metadataIndexWriters) {
metadataIndexWriter.flush();
}
return new WriterStats(true, metadata.indices().size(), 0);
}
}
private DocumentBuffer allocateBuffer() {
// heuristics for picking the initial buffer size based on the buffer we needed last time: try and fit within a single page,
// but if we needed more than a single page last time then allow a bit more space to try and avoid needing to grow the buffer
// later on.
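// Illustration, assuming the usual 16KiB page size: a previous write that fitted within a single page keeps using one recycled page,
// whereas a previous write that needed e.g. 20KiB sizes the next buffer at 20KiB + 16KiB = 36KiB to leave room for growth.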
final int extraSpace = documentBufferUsed <= PageCacheRecycler.PAGE_SIZE_IN_BYTES ? 0 : PageCacheRecycler.PAGE_SIZE_IN_BYTES;
return new DocumentBuffer(documentBufferUsed + extraSpace, bigArrays);
}
public void writeIncrementalTermUpdateAndCommit(long currentTerm, long lastAcceptedVersion) throws IOException {
ensureOpen();
ensureFullStateWritten();
commit(currentTerm, lastAcceptedVersion);
}
void commit(long currentTerm, long lastAcceptedVersion) throws IOException {
ensureOpen();
try {
for (MetadataIndexWriter metadataIndexWriter : metadataIndexWriters) {
metadataIndexWriter.prepareCommit(nodeId, currentTerm, lastAcceptedVersion);
}
} catch (Exception e) {
try {
close();
} catch (Exception e2) {
logger.warn("failed on closing cluster state writer", e2);
e.addSuppressed(e2);
}
throw e;
} finally {
closeIfAnyIndexWriterHasTragedyOrIsClosed();
}
try {
for (MetadataIndexWriter metadataIndexWriter : metadataIndexWriters) {
metadataIndexWriter.commit();
}
} catch (IOException e) {
// The commit() call has similar semantics to a fsync(): although it's atomic, if it fails then we've no idea whether the
// data on disk is now the old version or the new version, and this is a disaster. It's safest to fail the whole node and
// retry from the beginning.
try {
close();
} catch (Exception e2) {
e.addSuppressed(e2);
}
throw new IOError(e);
} finally {
closeIfAnyIndexWriterHasTragedyOrIsClosed();
}
}
@Override
public void close() throws IOException {
logger.trace("closing PersistedClusterStateService.Writer");
if (closed.compareAndSet(false, true)) {
IOUtils.close(metadataIndexWriters);
}
}
static class WriterStats {
final boolean globalMetaUpdated;
final long numIndicesUpdated;
final long numIndicesUnchanged;
WriterStats(boolean globalMetaUpdated, long numIndicesUpdated, long numIndicesUnchanged) {
this.globalMetaUpdated = globalMetaUpdated;
this.numIndicesUpdated = numIndicesUpdated;
this.numIndicesUnchanged = numIndicesUnchanged;
}
}
private Document makeIndexMetadataDocument(IndexMetadata indexMetadata, DocumentBuffer documentBuffer) throws IOException {
final Document indexMetadataDocument = makeDocument(INDEX_TYPE_NAME, indexMetadata, documentBuffer);
final String indexUUID = indexMetadata.getIndexUUID();
assert indexUUID.equals(IndexMetadata.INDEX_UUID_NA_VALUE) == false;
indexMetadataDocument.add(new StringField(INDEX_UUID_FIELD_NAME, indexUUID, Field.Store.NO));
return indexMetadataDocument;
}
private Document makeGlobalMetadataDocument(Metadata metadata, DocumentBuffer documentBuffer) throws IOException {
return makeDocument(GLOBAL_TYPE_NAME, metadata, documentBuffer);
}
private Document makeDocument(String typeName, ToXContent metadata, DocumentBuffer documentBuffer) throws IOException {
final Document document = new Document();
document.add(new StringField(TYPE_FIELD_NAME, typeName, Field.Store.NO));
try (RecyclingBytesStreamOutput streamOutput = documentBuffer.streamOutput()) {
try (
XContentBuilder xContentBuilder = XContentFactory.contentBuilder(
XContentType.SMILE,
Streams.flushOnCloseStream(streamOutput)
)
) {
xContentBuilder.startObject();
metadata.toXContent(xContentBuilder, FORMAT_PARAMS);
xContentBuilder.endObject();
}
document.add(new StoredField(DATA_FIELD_NAME, streamOutput.toBytesRef()));
}
return document;
}
}
/**
* Holds the current buffer, keeping track of new allocations as it grows.
*/
private static class DocumentBuffer implements Releasable {
private final BigArrays bigArrays;
@Nullable // if the initial page doesn't need releasing
private final Releasable releasable;
private byte[] buffer;
private int maxUsed;
DocumentBuffer(int size, BigArrays bigArrays) {
if (size <= PageCacheRecycler.PAGE_SIZE_IN_BYTES) {
final ByteArray byteArray = bigArrays.newByteArray(PageCacheRecycler.PAGE_SIZE_IN_BYTES);
final BytesRefIterator iterator = BytesReference.fromByteArray(byteArray, Math.toIntExact(byteArray.size())).iterator();
final BytesRef firstPage;
try {
firstPage = iterator.next();
assert iterator.next() == null : "should be one page";
} catch (IOException e) {
throw new AssertionError("impossible", e);
}
// we require that we have the whole page to ourselves
assert firstPage.offset == 0 : firstPage.offset;
assert firstPage.bytes.length == PageCacheRecycler.PAGE_SIZE_IN_BYTES : firstPage.bytes.length;
buffer = firstPage.bytes;
releasable = byteArray;
} else {
buffer = new byte[size];
releasable = null;
}
this.bigArrays = bigArrays;
maxUsed = 0;
}
RecyclingBytesStreamOutput streamOutput() {
return new RecyclingBytesStreamOutput(buffer, bigArrays) {
@Override
public BytesRef toBytesRef() {
final BytesRef bytesRef = super.toBytesRef();
maxUsed = Math.max(maxUsed, bytesRef.length);
if (buffer != bytesRef.bytes) {
assert bytesRef.length > buffer.length;
logger.trace("growing document buffer from [{}] to [{}]", buffer.length, maxUsed);
buffer = bytesRef.bytes;
}
assert maxUsed <= buffer.length;
return bytesRef;
}
};
}
int getMaxUsed() {
return maxUsed;
}
@Override
public void close() {
Releasables.close(releasable);
}
}
}