/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the Server Side Public License, v 1; you may not use this file except
* in compliance with, at your election, the Elastic License 2.0 or the Server
* Side Public License, v 1.
*/
package org.elasticsearch.gateway;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.message.ParameterizedMessage;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.CheckIndex;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexNotFoundException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.MergePolicy;
import org.apache.lucene.index.NoMergePolicy;
import org.apache.lucene.index.SerialMergeScheduler;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TieredMergePolicy;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.Weight;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.NIOFSDirectory;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.SetOnce;
import org.elasticsearch.Assertions;
import org.elasticsearch.Version;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.metadata.IndexMetadata;
import org.elasticsearch.cluster.metadata.Metadata;
import org.elasticsearch.common.CheckedBiConsumer;
import org.elasticsearch.common.Randomness;
import org.elasticsearch.common.bytes.BytesArray;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.bytes.CompositeBytesReference;
import org.elasticsearch.common.compress.CompressorFactory;
import org.elasticsearch.common.io.stream.BytesStreamOutput;
import org.elasticsearch.common.logging.Loggers;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.common.settings.ClusterSettings;
import org.elasticsearch.common.settings.Setting;
import org.elasticsearch.common.unit.ByteSizeUnit;
import org.elasticsearch.common.unit.ByteSizeValue;
import org.elasticsearch.common.util.Maps;
import org.elasticsearch.common.xcontent.LoggingDeprecationHandler;
import org.elasticsearch.core.CheckedConsumer;
import org.elasticsearch.core.CheckedFunction;
import org.elasticsearch.core.IOUtils;
import org.elasticsearch.core.Nullable;
import org.elasticsearch.core.SuppressForbidden;
import org.elasticsearch.core.TimeValue;
import org.elasticsearch.env.NodeEnvironment;
import org.elasticsearch.env.NodeMetadata;
import org.elasticsearch.xcontent.NamedXContentRegistry;
import org.elasticsearch.xcontent.ToXContent;
import org.elasticsearch.xcontent.XContentBuilder;
import org.elasticsearch.xcontent.XContentFactory;
import org.elasticsearch.xcontent.XContentParser;
import org.elasticsearch.xcontent.XContentParserConfiguration;
import org.elasticsearch.xcontent.XContentType;
import java.io.Closeable;
import java.io.IOError;
import java.io.IOException;
import java.io.OutputStream;
import java.io.PrintStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.function.IntPredicate;
import java.util.function.LongSupplier;
import java.util.function.Supplier;
/**
* Stores cluster metadata in a bare Lucene index (per data path) split across a number of documents. This is used by master-eligible nodes
* to record the last-accepted cluster state during publication. The metadata is written incrementally where possible, leaving alone any
* documents that have not changed. The index has the following fields:
*
* +------------------------------+-----------------------------+----------------------------------------------+--------+-------------+
* | "type" (string field)        | "index_uuid" (string field) | "data" (stored binary field in SMILE format) | "page" | "last_page" |
* +------------------------------+-----------------------------+----------------------------------------------+--------+-------------+
* | GLOBAL_TYPE_NAME == "global" | (omitted)                   | Global metadata                              | large docs are       |
* | INDEX_TYPE_NAME == "index"   | Index UUID                  | Index metadata                               | split into pages     |
* +------------------------------+-----------------------------+----------------------------------------------+--------+-------------+
*
* Additionally each commit has the following user data:
*
* +---------------------------+-------------------------+------------------------------------------------------------------------------+
* | Key symbol                | Key literal             | Value                                                                        |
* +---------------------------+-------------------------+------------------------------------------------------------------------------+
* | CURRENT_TERM_KEY          | "current_term"          | Node's "current" term (≥ last-accepted term and the terms of all sent joins) |
* | LAST_ACCEPTED_VERSION_KEY | "last_accepted_version" | The cluster state version corresponding with the persisted metadata         |
* | NODE_ID_KEY               | "node_id"               | The (persistent) ID of the node that wrote this metadata                    |
* | NODE_VERSION_KEY          | "node_version"          | The (ID of the) version of the node that wrote this metadata                |
* +---------------------------+-------------------------+------------------------------------------------------------------------------+
*
* (the last-accepted term is recorded in Metadata → CoordinationMetadata so does not need repeating here)
*/
public class PersistedClusterStateService {
private static final Logger logger = LogManager.getLogger(PersistedClusterStateService.class);
private static final String CURRENT_TERM_KEY = "current_term";
private static final String LAST_ACCEPTED_VERSION_KEY = "last_accepted_version";
private static final String NODE_ID_KEY = "node_id";
private static final String NODE_VERSION_KEY = "node_version";
private static final String OLDEST_INDEX_VERSION_KEY = "oldest_index_version";
public static final String TYPE_FIELD_NAME = "type";
public static final String GLOBAL_TYPE_NAME = "global";
public static final String INDEX_TYPE_NAME = "index";
private static final String DATA_FIELD_NAME = "data";
private static final String INDEX_UUID_FIELD_NAME = "index_uuid";
public static final String PAGE_FIELD_NAME = "page";
public static final String LAST_PAGE_FIELD_NAME = "last_page";
public static final int IS_LAST_PAGE = 1;
public static final int IS_NOT_LAST_PAGE = 0;
private static final int COMMIT_DATA_SIZE = 5;
private static final MergePolicy NO_MERGE_POLICY = noMergePolicy();
private static final MergePolicy DEFAULT_MERGE_POLICY = defaultMergePolicy();
public static final String METADATA_DIRECTORY_NAME = MetadataStateFormat.STATE_DIR_NAME;
public static final Setting<TimeValue> SLOW_WRITE_LOGGING_THRESHOLD = Setting.timeSetting(
"gateway.slow_write_logging_threshold",
TimeValue.timeValueSeconds(10),
TimeValue.ZERO,
Setting.Property.NodeScope,
Setting.Property.Dynamic
);
public static final Setting<ByteSizeValue> DOCUMENT_PAGE_SIZE = Setting.byteSizeSetting(
"cluster_state.document_page_size",
ByteSizeValue.ofMb(1),
ByteSizeValue.ONE,
ByteSizeValue.ofGb(1),
Setting.Property.NodeScope
);
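// Illustrative note, not part of the original source: SLOW_WRITE_LOGGING_THRESHOLD is dynamic, so a value such as
// "gateway.slow_write_logging_threshold: 30s" can be set at startup or adjusted at runtime via the cluster settings
// API, whereas DOCUMENT_PAGE_SIZE ("cluster_state.document_page_size") is not dynamic and is read once in the
// constructor below, so it stays fixed for the lifetime of the node.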
private final Path[] dataPaths;
private final String nodeId;
private final XContentParserConfiguration parserConfig;
private final LongSupplier relativeTimeMillisSupplier;
private final ByteSizeValue documentPageSize;
private volatile TimeValue slowWriteLoggingThreshold;
public PersistedClusterStateService(
NodeEnvironment nodeEnvironment,
NamedXContentRegistry namedXContentRegistry,
ClusterSettings clusterSettings,
LongSupplier relativeTimeMillisSupplier
) {
this(nodeEnvironment.nodeDataPaths(), nodeEnvironment.nodeId(), namedXContentRegistry, clusterSettings, relativeTimeMillisSupplier);
}
public PersistedClusterStateService(
Path[] dataPaths,
String nodeId,
NamedXContentRegistry namedXContentRegistry,
ClusterSettings clusterSettings,
LongSupplier relativeTimeMillisSupplier
) {
this.dataPaths = dataPaths;
this.nodeId = nodeId;
this.parserConfig = XContentParserConfiguration.EMPTY.withDeprecationHandler(LoggingDeprecationHandler.INSTANCE)
.withRegistry(namedXContentRegistry);
this.relativeTimeMillisSupplier = relativeTimeMillisSupplier;
this.slowWriteLoggingThreshold = clusterSettings.get(SLOW_WRITE_LOGGING_THRESHOLD);
clusterSettings.addSettingsUpdateConsumer(SLOW_WRITE_LOGGING_THRESHOLD, this::setSlowWriteLoggingThreshold);
this.documentPageSize = clusterSettings.get(DOCUMENT_PAGE_SIZE);
}
private void setSlowWriteLoggingThreshold(TimeValue slowWriteLoggingThreshold) {
this.slowWriteLoggingThreshold = slowWriteLoggingThreshold;
}
public String getNodeId() {
return nodeId;
}
/**
* Creates a new disk-based writer for cluster states
*/
public Writer createWriter() throws IOException {
final List<MetadataIndexWriter> metadataIndexWriters = new ArrayList<>();
final List<Closeable> closeables = new ArrayList<>();
boolean success = false;
try {
for (final Path path : dataPaths) {
final Directory directory = createDirectory(path.resolve(METADATA_DIRECTORY_NAME));
closeables.add(directory);
final IndexWriter indexWriter = createIndexWriter(directory, false);
closeables.add(indexWriter);
metadataIndexWriters.add(new MetadataIndexWriter(path, directory, indexWriter));
}
success = true;
} finally {
if (success == false) {
IOUtils.closeWhileHandlingException(closeables);
}
}
return new Writer(
metadataIndexWriters,
nodeId,
documentPageSize,
relativeTimeMillisSupplier,
() -> slowWriteLoggingThreshold,
getAssertOnCommit()
);
}
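// Illustrative usage sketch, not part of the original source: a caller (e.g. the gateway metadata service) would
// typically hold a single open Writer, write the full state once, then apply incremental updates:
//
//   try (Writer writer = persistedClusterStateService.createWriter()) {
//       writer.writeFullStateAndCommit(currentTerm, clusterState);
//       writer.writeIncrementalStateAndCommit(currentTerm, clusterState, updatedClusterState);
//   }
//
// Writer#ensureFullStateWritten enforces that the full write happens before any incremental write.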
CheckedBiConsumer<Path, DirectoryReader, IOException> getAssertOnCommit() {
return Assertions.ENABLED ? this::loadOnDiskState : null;
}
private static IndexWriter createIndexWriter(Directory directory, boolean openExisting) throws IOException {
final IndexWriterConfig indexWriterConfig = new IndexWriterConfig(new KeywordAnalyzer());
// start empty since we re-write the whole cluster state to ensure it is all using the same format version
indexWriterConfig.setOpenMode(openExisting ? IndexWriterConfig.OpenMode.APPEND : IndexWriterConfig.OpenMode.CREATE);
// only commit when specifically instructed, we must not write any intermediate states
indexWriterConfig.setCommitOnClose(false);
// most of the data goes into stored fields which are not buffered, so each doc written accounts for ~500B of indexing buffer
// (see e.g. BufferedUpdates#BYTES_PER_DEL_TERM); a 1MB buffer therefore gets flushed every ~2000 docs.
indexWriterConfig.setRAMBufferSizeMB(1.0);
// merge on the write thread (e.g. while flushing)
indexWriterConfig.setMergeScheduler(new SerialMergeScheduler());
// apply the adjusted merge policy
indexWriterConfig.setMergePolicy(DEFAULT_MERGE_POLICY);
return new IndexWriter(directory, indexWriterConfig);
}
/**
* Remove all persisted cluster states from the given data paths, for use in tests. Should only be called when there is no open
* {@link Writer} on these paths.
*/
public static void deleteAll(Path[] dataPaths) throws IOException {
for (Path dataPath : dataPaths) {
Lucene.cleanLuceneIndex(new NIOFSDirectory(dataPath.resolve(METADATA_DIRECTORY_NAME)));
}
}
// exposed for tests
Directory createDirectory(Path path) throws IOException {
// it is possible to disable the use of MMapDirectory for indices, and it may be surprising to users that have done so if we still
// use a MMapDirectory here, which might happen with FSDirectory.open(path), so we force an NIOFSDirectory to be on the safe side.
return new NIOFSDirectory(path);
}
public Path[] getDataPaths() {
return dataPaths;
}
public static class OnDiskState {
private static final OnDiskState NO_ON_DISK_STATE = new OnDiskState(null, null, 0L, 0L, Metadata.EMPTY_METADATA);
private final String nodeId;
private final Path dataPath;
public final long currentTerm;
public final long lastAcceptedVersion;
public final Metadata metadata;
private OnDiskState(String nodeId, Path dataPath, long currentTerm, long lastAcceptedVersion, Metadata metadata) {
this.nodeId = nodeId;
this.dataPath = dataPath;
this.currentTerm = currentTerm;
this.lastAcceptedVersion = lastAcceptedVersion;
this.metadata = metadata;
}
public boolean empty() {
return this == NO_ON_DISK_STATE;
}
}
/**
* Returns the node metadata for the given data paths, and checks if the node ids are unique
* @param dataPaths the data paths to scan
*/
@Nullable
public static NodeMetadata nodeMetadata(Path... dataPaths) throws IOException {
String nodeId = null;
Version version = null;
Version oldestIndexVersion = Version.V_EMPTY;
for (final Path dataPath : dataPaths) {
final Path indexPath = dataPath.resolve(METADATA_DIRECTORY_NAME);
if (Files.exists(indexPath)) {
try (DirectoryReader reader = DirectoryReader.open(new NIOFSDirectory(dataPath.resolve(METADATA_DIRECTORY_NAME)))) {
final Map<String, String> userData = reader.getIndexCommit().getUserData();
assert userData.get(NODE_VERSION_KEY) != null;
final String thisNodeId = userData.get(NODE_ID_KEY);
assert thisNodeId != null;
if (nodeId != null && nodeId.equals(thisNodeId) == false) {
throw new CorruptStateException(
"unexpected node ID in metadata, found [" + thisNodeId + "] in [" + dataPath + "] but expected [" + nodeId + "]"
);
} else if (nodeId == null) {
nodeId = thisNodeId;
version = Version.fromId(Integer.parseInt(userData.get(NODE_VERSION_KEY)));
if (userData.containsKey(OLDEST_INDEX_VERSION_KEY)) {
oldestIndexVersion = Version.fromId(Integer.parseInt(userData.get(OLDEST_INDEX_VERSION_KEY)));
} else {
oldestIndexVersion = Version.V_EMPTY;
}
}
} catch (IndexNotFoundException e) {
logger.debug(new ParameterizedMessage("no on-disk state at {}", indexPath), e);
}
}
}
if (nodeId == null) {
return null;
}
return new NodeMetadata(nodeId, version, oldestIndexVersion);
}
/**
* Overrides the version field for the metadata in the given data path
*/
public static void overrideVersion(Version newVersion, Path... dataPaths) throws IOException {
for (final Path dataPath : dataPaths) {
final Path indexPath = dataPath.resolve(METADATA_DIRECTORY_NAME);
if (Files.exists(indexPath)) {
try (DirectoryReader reader = DirectoryReader.open(new NIOFSDirectory(dataPath.resolve(METADATA_DIRECTORY_NAME)))) {
final Map<String, String> userData = reader.getIndexCommit().getUserData();
assert userData.get(NODE_VERSION_KEY) != null;
try (IndexWriter indexWriter = createIndexWriter(new NIOFSDirectory(dataPath.resolve(METADATA_DIRECTORY_NAME)), true)) {
final Map<String, String> commitData = new HashMap<>(userData);
commitData.put(NODE_VERSION_KEY, Integer.toString(newVersion.id));
indexWriter.setLiveCommitData(commitData.entrySet());
indexWriter.commit();
}
} catch (IndexNotFoundException e) {
logger.debug(new ParameterizedMessage("no on-disk state at {}", indexPath), e);
}
}
}
}
/**
* Loads the best available on-disk cluster state. Returns {@link OnDiskState#NO_ON_DISK_STATE} if no such state was found.
*/
public OnDiskState loadBestOnDiskState() throws IOException {
return loadBestOnDiskState(true);
}
/**
* Loads the available on-disk cluster state. Returns {@link OnDiskState#NO_ON_DISK_STATE} if no such state was found.
* @param checkClean whether to check the index for corruption before loading, only for tests
*/
OnDiskState loadBestOnDiskState(boolean checkClean) throws IOException {
String committedClusterUuid = null;
Path committedClusterUuidPath = null;
OnDiskState bestOnDiskState = OnDiskState.NO_ON_DISK_STATE;
OnDiskState maxCurrentTermOnDiskState = bestOnDiskState;
// We use a write-all-read-one strategy: metadata is written to every data path when accepting it, which means it is mostly
// sufficient to read _any_ copy. "Mostly" sufficient because the user can change the set of data paths when restarting, and may
// add a data path containing a stale copy of the metadata. We deal with this by using the freshest copy we can find.
for (final Path dataPath : dataPaths) {
final Path indexPath = dataPath.resolve(METADATA_DIRECTORY_NAME);
if (Files.exists(indexPath)) {
try (Directory directory = createDirectory(indexPath)) {
if (checkClean) {
try (BytesStreamOutput outputStream = new BytesStreamOutput()) {
final boolean isClean;
try (
PrintStream printStream = new PrintStream(outputStream, true, StandardCharsets.UTF_8);
CheckIndex checkIndex = new CheckIndex(directory)
) {
checkIndex.setInfoStream(printStream);
checkIndex.setChecksumsOnly(true);
isClean = checkIndex.checkIndex().clean;
}
if (isClean == false) {
if (logger.isErrorEnabled()) {
outputStream.bytes().utf8ToString().lines().forEach(l -> logger.error("checkIndex: {}", l));
}
throw new CorruptStateException(
"the index containing the cluster metadata under the data path ["
+ dataPath
+ "] has been changed by an external force after it was last written by Elasticsearch and is "
+ "now unreadable"
);
}
}
}
try (DirectoryReader directoryReader = DirectoryReader.open(directory)) {
final OnDiskState onDiskState = loadOnDiskState(dataPath, directoryReader);
if (nodeId.equals(onDiskState.nodeId) == false) {
throw new CorruptStateException(
"the index containing the cluster metadata under the data path ["
+ dataPath
+ "] belongs to a node with ID ["
+ onDiskState.nodeId
+ "] but this node's ID is ["
+ nodeId
+ "]"
);
}
if (onDiskState.metadata.clusterUUIDCommitted()) {
if (committedClusterUuid == null) {
committedClusterUuid = onDiskState.metadata.clusterUUID();
committedClusterUuidPath = dataPath;
} else if (committedClusterUuid.equals(onDiskState.metadata.clusterUUID()) == false) {
throw new CorruptStateException(
"mismatched cluster UUIDs in metadata, found ["
+ committedClusterUuid
+ "] in ["
+ committedClusterUuidPath
+ "] and ["
+ onDiskState.metadata.clusterUUID()
+ "] in ["
+ dataPath
+ "]"
);
}
}
if (maxCurrentTermOnDiskState.empty() || maxCurrentTermOnDiskState.currentTerm < onDiskState.currentTerm) {
maxCurrentTermOnDiskState = onDiskState;
}
long acceptedTerm = onDiskState.metadata.coordinationMetadata().term();
long maxAcceptedTerm = bestOnDiskState.metadata.coordinationMetadata().term();
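// For illustration: a copy with (accepted term 5, last-accepted version 12, current term 6) beats a copy with
// (accepted term 5, last-accepted version 10, current term 7), because the accepted terms tie and its last-accepted
// version is greater; the current term is only compared when both other values tie.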
if (bestOnDiskState.empty()
|| acceptedTerm > maxAcceptedTerm
|| (acceptedTerm == maxAcceptedTerm
&& (onDiskState.lastAcceptedVersion > bestOnDiskState.lastAcceptedVersion
|| (onDiskState.lastAcceptedVersion == bestOnDiskState.lastAcceptedVersion)
&& onDiskState.currentTerm > bestOnDiskState.currentTerm))) {
bestOnDiskState = onDiskState;
}
}
} catch (IndexNotFoundException e) {
logger.debug(new ParameterizedMessage("no on-disk state at {}", indexPath), e);
}
}
}
if (bestOnDiskState.currentTerm != maxCurrentTermOnDiskState.currentTerm) {
throw new CorruptStateException(
"inconsistent terms found: best state is from ["
+ bestOnDiskState.dataPath
+ "] in term ["
+ bestOnDiskState.currentTerm
+ "] but there is a stale state in ["
+ maxCurrentTermOnDiskState.dataPath
+ "] with greater term ["
+ maxCurrentTermOnDiskState.currentTerm
+ "]"
);
}
return bestOnDiskState;
}
private OnDiskState loadOnDiskState(Path dataPath, DirectoryReader reader) throws IOException {
final IndexSearcher searcher = new IndexSearcher(reader);
searcher.setQueryCache(null);
final SetOnce<Metadata.Builder> builderReference = new SetOnce<>();
consumeFromType(searcher, GLOBAL_TYPE_NAME, bytes -> {
final Metadata metadata = readXContent(bytes, Metadata.Builder::fromXContent);
logger.trace("found global metadata with last-accepted term [{}]", metadata.coordinationMetadata().term());
if (builderReference.get() != null) {
throw new CorruptStateException("duplicate global metadata found in [" + dataPath + "]");
}
builderReference.set(Metadata.builder(metadata));
});
final Metadata.Builder builder = builderReference.get();
if (builder == null) {
throw new CorruptStateException("no global metadata found in [" + dataPath + "]");
}
logger.trace("got global metadata, now reading index metadata");
final Set<String> indexUUIDs = new HashSet<>();
consumeFromType(searcher, INDEX_TYPE_NAME, bytes -> {
final IndexMetadata indexMetadata = readXContent(bytes, IndexMetadata::fromXContent);
logger.trace("found index metadata for {}", indexMetadata.getIndex());
if (indexUUIDs.add(indexMetadata.getIndexUUID()) == false) {
throw new CorruptStateException("duplicate metadata found for " + indexMetadata.getIndex() + " in [" + dataPath + "]");
}
builder.put(indexMetadata, false);
});
final Map<String, String> userData = reader.getIndexCommit().getUserData();
logger.trace("loaded metadata [{}] from [{}]", userData, reader.directory());
assert userData.size() == COMMIT_DATA_SIZE : userData;
assert userData.get(CURRENT_TERM_KEY) != null;
assert userData.get(LAST_ACCEPTED_VERSION_KEY) != null;
assert userData.get(NODE_ID_KEY) != null;
assert userData.get(NODE_VERSION_KEY) != null;
return new OnDiskState(
userData.get(NODE_ID_KEY),
dataPath,
Long.parseLong(userData.get(CURRENT_TERM_KEY)),
Long.parseLong(userData.get(LAST_ACCEPTED_VERSION_KEY)),
builder.build()
);
}
private <T> T readXContent(BytesReference bytes, CheckedFunction<XContentParser, T, IOException> reader) throws IOException {
final XContentParser parser = XContentFactory.xContent(XContentType.SMILE).createParser(parserConfig, bytes.streamInput());
try {
return reader.apply(parser);
} catch (Exception e) {
throw new CorruptStateException(e);
}
}
private static void consumeFromType(
IndexSearcher indexSearcher,
String type,
CheckedConsumer<BytesReference, IOException> bytesReferenceConsumer
) throws IOException {
final Query query = new TermQuery(new Term(TYPE_FIELD_NAME, type));
final Weight weight = indexSearcher.createWeight(query, ScoreMode.COMPLETE_NO_SCORES, 0.0f);
logger.trace("running query [{}]", query);
final Map<String, PaginatedDocumentReader> documentReaders = new HashMap<>();
for (LeafReaderContext leafReaderContext : indexSearcher.getIndexReader().leaves()) {
logger.trace("new leafReaderContext: {}", leafReaderContext);
final Scorer scorer = weight.scorer(leafReaderContext);
if (scorer != null) {
final Bits liveDocs = leafReaderContext.reader().getLiveDocs();
final IntPredicate isLiveDoc = liveDocs == null ? i -> true : liveDocs::get;
final DocIdSetIterator docIdSetIterator = scorer.iterator();
while (docIdSetIterator.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
if (isLiveDoc.test(docIdSetIterator.docID())) {
logger.trace("processing doc {}", docIdSetIterator.docID());
final Document document = leafReaderContext.reader().document(docIdSetIterator.docID());
final BytesArray documentData = new BytesArray(document.getBinaryValue(DATA_FIELD_NAME));
if (document.getField(PAGE_FIELD_NAME) == null) {
// legacy format: not paginated or compressed
assert Version.CURRENT.minimumIndexCompatibilityVersion().before(Version.V_7_16_0);
bytesReferenceConsumer.accept(documentData);
continue;
}
final int pageIndex = document.getField(PAGE_FIELD_NAME).numericValue().intValue();
final boolean isLastPage = document.getField(LAST_PAGE_FIELD_NAME).numericValue().intValue() == IS_LAST_PAGE;
if (pageIndex == 0 && isLastPage) {
// common case: metadata fits in a single page
bytesReferenceConsumer.accept(uncompress(documentData));
continue;
}
// The metadata doesn't fit into a single page, so we accumulate pages until we have a complete set. Typically we
// will see pages in order since they were written in order, so the map will often have at most one entry. Also 1MB
// should be ample space for compressed index metadata so this is almost always used just for the global metadata.
// Even in pathological cases we shouldn't run out of memory here because we're doing this very early on in node
// startup, on the main thread and before most other services have started, and we will need space to serialize the
// whole cluster state in memory later on.
final String key;
if (type.equals(GLOBAL_TYPE_NAME)) {
key = GLOBAL_TYPE_NAME;
} else {
key = document.getField(INDEX_UUID_FIELD_NAME).stringValue();
}
final PaginatedDocumentReader reader = documentReaders.computeIfAbsent(key, k -> new PaginatedDocumentReader());
final BytesReference bytesReference = reader.addPage(key, documentData, pageIndex, isLastPage);
if (bytesReference != null) {
documentReaders.remove(key);
bytesReferenceConsumer.accept(uncompress(bytesReference));
}
}
}
}
}
if (documentReaders.isEmpty() == false) {
throw new CorruptStateException(
"incomplete paginated documents " + documentReaders.keySet() + " when reading cluster state index [type=" + type + "]"
);
}
}
private static BytesReference uncompress(BytesReference bytesReference) throws IOException {
try {
return CompressorFactory.COMPRESSOR.uncompress(bytesReference);
} catch (IOException e) {
// no actual IO takes place, the data is all in-memory, so an exception indicates corruption
throw new CorruptStateException(e);
}
}
private static final ToXContent.Params FORMAT_PARAMS;
static {
Map<String, String> params = Maps.newMapWithExpectedSize(2);
params.put("binary", "true");
params.put(Metadata.CONTEXT_MODE_PARAM, Metadata.CONTEXT_MODE_GATEWAY);
FORMAT_PARAMS = new ToXContent.MapParams(params);
}
@SuppressForbidden(reason = "merges are only temporarily suppressed, the merge scheduler does not need changing")
private static MergePolicy noMergePolicy() {
return NoMergePolicy.INSTANCE;
}
private static MergePolicy defaultMergePolicy() {
final TieredMergePolicy mergePolicy = new TieredMergePolicy();
// don't worry about cleaning up deletes too much, segments will often get completely deleted once they're old enough
mergePolicy.setDeletesPctAllowed(50.0);
// more/smaller segments means there's a better chance they just get deleted before needing a merge
mergePolicy.setSegmentsPerTier(100);
// ... but if we do end up merging them then do them all
mergePolicy.setMaxMergeAtOnce(100);
// always use compound segments to avoid fsync overhead
mergePolicy.setNoCFSRatio(1.0);
// segments are mostly tiny, so don't pretend they are bigger
mergePolicy.setFloorSegmentMB(0.001);
return mergePolicy;
}
/**
* Encapsulates a single {@link IndexWriter} with its {@link Directory} for ease of closing, and a {@link Logger}. There is one of these
* for each data path.
*/
private static class MetadataIndexWriter implements Closeable {
private final Logger logger;
private final Path path;
private final Directory directory;
private final IndexWriter indexWriter;
MetadataIndexWriter(Path path, Directory directory, IndexWriter indexWriter) {
this.path = path;
this.directory = directory;
this.indexWriter = indexWriter;
this.logger = Loggers.getLogger(MetadataIndexWriter.class, directory.toString());
}
void deleteAll() throws IOException {
this.logger.trace("clearing existing metadata");
indexWriter.deleteAll();
}
public void deleteGlobalMetadata() throws IOException {
this.logger.trace("deleting global metadata docs");
indexWriter.deleteDocuments(new Term(TYPE_FIELD_NAME, GLOBAL_TYPE_NAME));
}
void deleteIndexMetadata(String indexUUID) throws IOException {
this.logger.trace("removing metadata for [{}]", indexUUID);
indexWriter.deleteDocuments(new Term(INDEX_UUID_FIELD_NAME, indexUUID));
}
void flush() throws IOException {
this.logger.trace("flushing");
this.indexWriter.flush();
}
void startWrite() {
// Disable merges during indexing - many older segments will ultimately contain no live docs and simply get deleted.
indexWriter.getConfig().setMergePolicy(NO_MERGE_POLICY);
}
void prepareCommit(String nodeId, long currentTerm, long lastAcceptedVersion, Version oldestIndexVersion) throws IOException {
indexWriter.getConfig().setMergePolicy(DEFAULT_MERGE_POLICY);
indexWriter.maybeMerge();
final Map<String, String> commitData = Maps.newMapWithExpectedSize(COMMIT_DATA_SIZE);
commitData.put(CURRENT_TERM_KEY, Long.toString(currentTerm));
commitData.put(LAST_ACCEPTED_VERSION_KEY, Long.toString(lastAcceptedVersion));
commitData.put(NODE_VERSION_KEY, Integer.toString(Version.CURRENT.id));
commitData.put(OLDEST_INDEX_VERSION_KEY, Integer.toString(oldestIndexVersion.id));
commitData.put(NODE_ID_KEY, nodeId);
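// For illustration: commitData now holds exactly COMMIT_DATA_SIZE (5) entries, e.g.
// {current_term=42, last_accepted_version=1234, node_id=<persistent node id>,
//  node_version=<Version.id of this node>, oldest_index_version=<Version.id of the oldest index version>},
// which loadOnDiskState asserts when the commit is read back.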
indexWriter.setLiveCommitData(commitData.entrySet());
indexWriter.prepareCommit();
}
void commit() throws IOException {
indexWriter.commit();
}
@Override
public void close() throws IOException {
IOUtils.close(indexWriter, directory);
}
}
public static class Writer implements Closeable {
private final List<MetadataIndexWriter> metadataIndexWriters;
private final String nodeId;
private final LongSupplier relativeTimeMillisSupplier;
private final Supplier<TimeValue> slowWriteLoggingThresholdSupplier;
boolean fullStateWritten = false;
private final AtomicBoolean closed = new AtomicBoolean();
private final byte[] documentBuffer;
@Nullable // if assertions disabled or we explicitly don't want to assert on commit in a test
private final CheckedBiConsumer<Path, DirectoryReader, IOException> assertOnCommit;
private Writer(
List<MetadataIndexWriter> metadataIndexWriters,
String nodeId,
ByteSizeValue documentPageSize,
LongSupplier relativeTimeMillisSupplier,
Supplier<TimeValue> slowWriteLoggingThresholdSupplier,
@Nullable // if assertions disabled or we explicitly don't want to assert on commit in a test
CheckedBiConsumer<Path, DirectoryReader, IOException> assertOnCommit
) {
this.metadataIndexWriters = metadataIndexWriters;
this.nodeId = nodeId;
this.relativeTimeMillisSupplier = relativeTimeMillisSupplier;
this.slowWriteLoggingThresholdSupplier = slowWriteLoggingThresholdSupplier;
this.documentBuffer = new byte[ByteSizeUnit.BYTES.toIntBytes(documentPageSize.getBytes())];
this.assertOnCommit = assertOnCommit;
}
private void ensureOpen() {
if (closed.get()) {
throw new AlreadyClosedException("cluster state writer is closed already");
}
}
public boolean isOpen() {
return closed.get() == false;
}
private void closeIfAnyIndexWriterHasTragedyOrIsClosed() {
if (metadataIndexWriters.stream()
.map(writer -> writer.indexWriter)
.anyMatch(iw -> iw.getTragicException() != null || iw.isOpen() == false)) {
try {
close();
} catch (Exception e) {
logger.warn("failed on closing cluster state writer", e);
}
}
}
/**
* Overrides and commits the given current term and cluster state
*/
public void writeFullStateAndCommit(long currentTerm, ClusterState clusterState) throws IOException {
ensureOpen();
try {
final long startTimeMillis = relativeTimeMillisSupplier.getAsLong();
for (MetadataIndexWriter metadataIndexWriter : metadataIndexWriters) {
metadataIndexWriter.startWrite();
}
final WriterStats stats = overwriteMetadata(clusterState.metadata());
commit(currentTerm, clusterState.version(), clusterState.metadata().oldestIndexVersion());
fullStateWritten = true;
final long durationMillis = relativeTimeMillisSupplier.getAsLong() - startTimeMillis;
final TimeValue finalSlowWriteLoggingThreshold = slowWriteLoggingThresholdSupplier.get();
if (durationMillis >= finalSlowWriteLoggingThreshold.getMillis()) {
logger.warn(
"writing full cluster state took [{}ms] which is above the warn threshold of [{}]; {}",
durationMillis,
finalSlowWriteLoggingThreshold,
stats
);
} else {
logger.debug("writing full cluster state took [{}ms]; {}", durationMillis, stats);
}
} finally {
closeIfAnyIndexWriterHasTragedyOrIsClosed();
}
}
/**
* Updates and commits the given cluster state update
*/
void writeIncrementalStateAndCommit(long currentTerm, ClusterState previousClusterState, ClusterState clusterState)
throws IOException {
ensureOpen();
ensureFullStateWritten();
try {
final long startTimeMillis = relativeTimeMillisSupplier.getAsLong();
for (MetadataIndexWriter metadataIndexWriter : metadataIndexWriters) {
metadataIndexWriter.startWrite();
}
final WriterStats stats = updateMetadata(previousClusterState.metadata(), clusterState.metadata());
commit(currentTerm, clusterState.version(), clusterState.metadata().oldestIndexVersion());
final long durationMillis = relativeTimeMillisSupplier.getAsLong() - startTimeMillis;
final TimeValue finalSlowWriteLoggingThreshold = slowWriteLoggingThresholdSupplier.get();
if (durationMillis >= finalSlowWriteLoggingThreshold.getMillis()) {
logger.warn(
"writing cluster state took [{}ms] which is above the warn threshold of [{}]; {}",
durationMillis,
finalSlowWriteLoggingThreshold,
stats
);
} else {
logger.debug("writing cluster state took [{}ms]; {}", durationMillis, stats);
}
} finally {
closeIfAnyIndexWriterHasTragedyOrIsClosed();
}
}
private void ensureFullStateWritten() {
assert fullStateWritten : "Need to write full state first before doing incremental writes";
// noinspection ConstantConditions to catch this even if assertions are disabled
if (fullStateWritten == false) {
logger.error("cannot write incremental state");
throw new IllegalStateException("cannot write incremental state");
}
}
/**
* Update the persisted metadata to match the given cluster state by removing any stale or unnecessary documents and adding any
* updated documents.
*/
private WriterStats updateMetadata(Metadata previouslyWrittenMetadata, Metadata metadata) throws IOException {
assert previouslyWrittenMetadata.coordinationMetadata().term() == metadata.coordinationMetadata().term();
logger.trace("currentTerm [{}] matches previous currentTerm, writing changes only", metadata.coordinationMetadata().term());
final boolean updateGlobalMeta = Metadata.isGlobalStateEquals(previouslyWrittenMetadata, metadata) == false;
if (updateGlobalMeta) {
for (MetadataIndexWriter metadataIndexWriter : metadataIndexWriters) {
metadataIndexWriter.deleteGlobalMetadata();
}
addGlobalMetadataDocuments(metadata);
}
final Map<String, Long> indexMetadataVersionByUUID = Maps.newMapWithExpectedSize(previouslyWrittenMetadata.indices().size());
for (IndexMetadata indexMetadata : previouslyWrittenMetadata.indices().values()) {
final Long previousValue = indexMetadataVersionByUUID.putIfAbsent(indexMetadata.getIndexUUID(), indexMetadata.getVersion());
assert previousValue == null : indexMetadata.getIndexUUID() + " already mapped to " + previousValue;
}
int numIndicesAdded = 0;
int numIndicesUpdated = 0;
int numIndicesRemoved = 0;
int numIndicesUnchanged = 0;
for (IndexMetadata indexMetadata : metadata.indices().values()) {
final Long previousVersion = indexMetadataVersionByUUID.get(indexMetadata.getIndexUUID());
if (previousVersion == null || indexMetadata.getVersion() != previousVersion) {
logger.trace(
"updating metadata for [{}], changing version from [{}] to [{}]",
indexMetadata.getIndex(),
previousVersion,
indexMetadata.getVersion()
);
if (previousVersion == null) {
numIndicesAdded++;
} else {
numIndicesUpdated++;
}
for (MetadataIndexWriter metadataIndexWriter : metadataIndexWriters) {
metadataIndexWriter.deleteIndexMetadata(indexMetadata.getIndexUUID());
}
addIndexMetadataDocuments(indexMetadata);
} else {
numIndicesUnchanged++;
logger.trace("no action required for [{}]", indexMetadata.getIndex());
}
indexMetadataVersionByUUID.remove(indexMetadata.getIndexUUID());
}
for (String removedIndexUUID : indexMetadataVersionByUUID.keySet()) {
for (MetadataIndexWriter metadataIndexWriter : metadataIndexWriters) {
numIndicesRemoved++;
metadataIndexWriter.deleteIndexMetadata(removedIndexUUID);
}
}
// Flush, to try and expose a failure (e.g. out of disk space) before committing, because we can handle a failure here more
// gracefully than one that occurs during the commit process.
for (MetadataIndexWriter metadataIndexWriter : metadataIndexWriters) {
metadataIndexWriter.flush();
}
return new WriterStats(false, updateGlobalMeta, numIndicesUnchanged, numIndicesAdded, numIndicesUpdated, numIndicesRemoved);
}
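// For illustration: if only one index's metadata version changed between the previous and current cluster states,
// updateMetadata deletes and re-adds the documents for that single index UUID and leaves every other document in
// place, which is what makes incremental writes much cheaper than overwriteMetadata.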
private static int lastPageValue(boolean isLastPage) {
return isLastPage ? IS_LAST_PAGE : IS_NOT_LAST_PAGE;
}
private void addIndexMetadataDocuments(IndexMetadata indexMetadata) throws IOException {
final String indexUUID = indexMetadata.getIndexUUID();
assert indexUUID.equals(IndexMetadata.INDEX_UUID_NA_VALUE) == false;
logger.trace("updating metadata for [{}]", indexMetadata.getIndex());
writePages(indexMetadata, ((bytesRef, pageIndex, isLastPage) -> {
final Document document = new Document();
document.add(new StringField(TYPE_FIELD_NAME, INDEX_TYPE_NAME, Field.Store.NO));
document.add(new StringField(INDEX_UUID_FIELD_NAME, indexUUID, Field.Store.YES));
document.add(new StoredField(PAGE_FIELD_NAME, pageIndex));
document.add(new StoredField(LAST_PAGE_FIELD_NAME, lastPageValue(isLastPage)));
document.add(new StoredField(DATA_FIELD_NAME, bytesRef));
for (MetadataIndexWriter metadataIndexWriter : metadataIndexWriters) {
metadataIndexWriter.indexWriter.addDocument(document);
}
}));
}
private void addGlobalMetadataDocuments(Metadata metadata) throws IOException {
logger.trace("updating global metadata doc");
writePages(metadata, (bytesRef, pageIndex, isLastPage) -> {
final Document document = new Document();
document.add(new StringField(TYPE_FIELD_NAME, GLOBAL_TYPE_NAME, Field.Store.NO));
document.add(new StoredField(PAGE_FIELD_NAME, pageIndex));
document.add(new StoredField(LAST_PAGE_FIELD_NAME, lastPageValue(isLastPage)));
document.add(new StoredField(DATA_FIELD_NAME, bytesRef));
for (MetadataIndexWriter metadataIndexWriter : metadataIndexWriters) {
metadataIndexWriter.indexWriter.addDocument(document);
}
});
}
private void writePages(ToXContent metadata, PageWriter pageWriter) throws IOException {
try (
PageWriterOutputStream paginatedStream = new PageWriterOutputStream(documentBuffer, pageWriter);
OutputStream compressedStream = CompressorFactory.COMPRESSOR.threadLocalOutputStream(paginatedStream);
XContentBuilder xContentBuilder = XContentFactory.contentBuilder(XContentType.SMILE, compressedStream)
) {
xContentBuilder.startObject();
metadata.toXContent(xContentBuilder, FORMAT_PARAMS);
xContentBuilder.endObject();
}
}
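// For illustration: writePages streams the SMILE-encoded metadata through CompressorFactory.COMPRESSOR into a
// PageWriterOutputStream, which slices the compressed bytes into documentBuffer-sized pages and hands each page to
// the PageWriter callback; consumeFromType reverses this at read time by reassembling the pages and calling
// uncompress().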
/**
* Update the persisted metadata to match the given cluster state by removing all existing documents and then adding new documents.
*/
private WriterStats overwriteMetadata(Metadata metadata) throws IOException {
for (MetadataIndexWriter metadataIndexWriter : metadataIndexWriters) {
metadataIndexWriter.deleteAll();
}
return addMetadata(metadata);
}
/**
* Add documents for the metadata of the given cluster state, assuming that there are currently no documents.
*/
private WriterStats addMetadata(Metadata metadata) throws IOException {
addGlobalMetadataDocuments(metadata);
for (IndexMetadata indexMetadata : metadata.indices().values()) {
addIndexMetadataDocuments(indexMetadata);
}
// Flush, to try and expose a failure (e.g. out of disk space) before committing, because we can handle a failure here more
// gracefully than one that occurs during the commit process.
for (MetadataIndexWriter metadataIndexWriter : metadataIndexWriters) {
metadataIndexWriter.flush();
}
return new WriterStats(true, true, 0, 0, metadata.indices().size(), 0);
}
public void writeIncrementalTermUpdateAndCommit(long currentTerm, long lastAcceptedVersion, Version oldestIndexVersion)
throws IOException {
ensureOpen();
ensureFullStateWritten();
commit(currentTerm, lastAcceptedVersion, oldestIndexVersion);
}
void commit(long currentTerm, long lastAcceptedVersion, Version oldestIndexVersion) throws IOException {
ensureOpen();
prepareCommit(currentTerm, lastAcceptedVersion, oldestIndexVersion);
completeCommit();
assert assertOnCommit();
}
private boolean assertOnCommit() {
if (assertOnCommit != null && Randomness.get().nextInt(100) == 0) {
// only rarely run this assertion since reloading the whole state can be quite expensive
for (final var metadataIndexWriter : metadataIndexWriters) {
try (var directoryReader = DirectoryReader.open(metadataIndexWriter.indexWriter)) {
assertOnCommit.accept(metadataIndexWriter.path, directoryReader);
} catch (Exception e) {
throw new AssertionError(e);
}
}
}
return true;
}
private void prepareCommit(long currentTerm, long lastAcceptedVersion, Version oldestIndexVersion) throws IOException {
boolean prepareCommitSuccess = false;
try {
for (MetadataIndexWriter metadataIndexWriter : metadataIndexWriters) {
metadataIndexWriter.prepareCommit(nodeId, currentTerm, lastAcceptedVersion, oldestIndexVersion);
}
prepareCommitSuccess = true;
} catch (Exception e) {
try {
close();
} catch (Exception e2) {
logger.warn("failed on closing cluster state writer", e2);
e.addSuppressed(e2);
}
throw e;
} finally {
closeIfAnyIndexWriterHasTragedyOrIsClosed();
if (prepareCommitSuccess == false) {
closeAndSuppressExceptions(); // let the error propagate even if closing fails here
}
}
}
private void completeCommit() {
boolean commitSuccess = false;
try {
for (MetadataIndexWriter metadataIndexWriter : metadataIndexWriters) {
metadataIndexWriter.commit();
}
commitSuccess = true;
} catch (IOException e) {
// The commit() call has similar semantics to a fsync(): although it's atomic, if it fails then we've no idea whether the
// data on disk is now the old version or the new version, and this is a disaster. It's safest to fail the whole node and
// retry from the beginning.
try {
close();
} catch (Exception e2) {
e.addSuppressed(e2);
}
throw new IOError(e);
} finally {
closeIfAnyIndexWriterHasTragedyOrIsClosed();
if (commitSuccess == false) {
closeAndSuppressExceptions(); // let the error propagate even if closing fails here
}
}
}
private void closeAndSuppressExceptions() {
if (closed.compareAndSet(false, true)) {
logger.trace("closing PersistedClusterStateService.Writer suppressing any exceptions");
IOUtils.closeWhileHandlingException(metadataIndexWriters);
}
}
@Override
public void close() throws IOException {
logger.trace("closing PersistedClusterStateService.Writer");
if (closed.compareAndSet(false, true)) {
IOUtils.close(metadataIndexWriters);
}
}
private record WriterStats(
boolean isFullWrite,
boolean globalMetaUpdated,
int numIndicesUnchanged,
int numIndicesAdded,
int numIndicesUpdated,
int numIndicesRemoved
) {
@Override
public String toString() {
if (isFullWrite) {
return String.format(Locale.ROOT, "wrote global metadata and metadata for [%d] indices", numIndicesUpdated);
} else {
return String.format(
Locale.ROOT,
"""
[%s] global metadata, wrote metadata for [%d] new indices and [%d] existing indices, \
removed metadata for [%d] indices and skipped [%d] unchanged indices""",
globalMetaUpdated ? "wrote" : "skipped writing",
numIndicesAdded,
numIndicesUpdated,
numIndicesRemoved,
numIndicesUnchanged
);
}
}
}
}
private interface PageWriter {
void consumePage(BytesRef bytesRef, int pageIndex, boolean isLastPage) throws IOException;
}
private static class PageWriterOutputStream extends OutputStream {
private final byte[] buffer;
private final PageWriter pageWriter;
private int bufferPosition;
private int pageIndex;
private int bytesFlushed;
private boolean closed;
PageWriterOutputStream(byte[] buffer, PageWriter pageWriter) {
assert buffer.length > 0;
this.buffer = buffer;
this.pageWriter = pageWriter;
}
@Override
public void write(@SuppressWarnings("NullableProblems") byte[] b, int off, int len) throws IOException {
assert closed == false : "cannot write after close";
while (len > 0) {
if (bufferPosition == buffer.length) {
flushPage(false);
}
assert bufferPosition < buffer.length;
final int lenToBuffer = Math.min(len, buffer.length - bufferPosition);
System.arraycopy(b, off, buffer, bufferPosition, lenToBuffer);
bufferPosition += lenToBuffer;
off += lenToBuffer;
len -= lenToBuffer;
}
}
@Override
public void write(int b) throws IOException {
assert closed == false : "cannot write after close";
if (bufferPosition == buffer.length) {
flushPage(false);
}
assert bufferPosition < buffer.length;
buffer[bufferPosition++] = (byte) b;
}
@Override
public void flush() throws IOException {
assert closed == false : "must not flush after close";
// keep buffering, don't actually flush anything
}
@Override
public void close() throws IOException {
if (closed == false) {
closed = true;
flushPage(true);
}
}
private void flushPage(boolean isLastPage) throws IOException {
assert bufferPosition > 0 : "cannot flush empty page";
assert bufferPosition == buffer.length || isLastPage : "only the last page may be incomplete";
if (bytesFlushed > Integer.MAX_VALUE - bufferPosition) {
// At startup the state doc is loaded into a single BytesReference which means it must be no longer than Integer.MAX_VALUE,
// so we would not be able to read it if we carried on. Better to fail early during writing instead.
throw new IllegalArgumentException("cannot persist cluster state document larger than 2GB");
}
bytesFlushed += bufferPosition;
pageWriter.consumePage(new BytesRef(buffer, 0, bufferPosition), pageIndex, isLastPage);
pageIndex += 1;
bufferPosition = 0;
}
}
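// For illustration: with the default 1MB DOCUMENT_PAGE_SIZE, a compressed blob of ~2.5MB is written by
// PageWriterOutputStream as two full pages (indices 0 and 1) plus a final partial page (index 2) flagged as the last
// page; PaginatedDocumentReader below accepts those pages in any order and returns a single CompositeBytesReference
// once every page up to and including the last one has been received.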
private static class PaginatedDocumentReader {
private final ArrayList<BytesReference> pages = new ArrayList<>();
private int emptyPages;
private int pageCount = -1;
/**
* @return a {@link BytesReference} if all pages received, otherwise {@code null}.
*/
@Nullable
BytesReference addPage(String key, BytesReference bytesReference, int pageIndex, boolean isLastPage) throws CorruptStateException {
while (pages.size() < pageIndex) {
if (pageCount != -1) {
throw new CorruptStateException(
"found page ["
+ pageIndex
+ "] but last page was ["
+ pageCount
+ "] when reading key ["
+ key
+ "] from cluster state index"
);
}
emptyPages += 1;
pages.add(null);
}
if (pages.size() == pageIndex) {
pages.add(bytesReference);
} else {
if (pages.get(pageIndex) != null) {
throw new CorruptStateException(
"found duplicate page [" + pageIndex + "] when reading key [" + key + "] from cluster state index"
);
}
emptyPages -= 1;
pages.set(pageIndex, bytesReference);
}
if (isLastPage) {
if (pageCount != -1) {
throw new CorruptStateException(
"already read page count "
+ pageCount
+ " but page "
+ pageIndex
+ " is also marked as the last page when reading key ["
+ key
+ "] from cluster state index"
);
}
pageCount = pageIndex + 1;
if (pages.size() != pageCount) {
throw new CorruptStateException(
"already read " + pages.size() + " pages but page " + pageIndex + " is marked as the last page"
);
}
}
if (pageCount != -1 && emptyPages == 0) {
return CompositeBytesReference.of(pages.toArray(new BytesReference[0]));
} else {
return null;
}
}
}
}