
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the Server Side Public License, v 1; you may not use this file except
* in compliance with, at your election, the Elastic License 2.0 or the Server
* Side Public License, v 1.
*/
package org.elasticsearch.cluster.routing.allocation;
import com.carrotsearch.hppc.ObjectLookupContainer;
import com.carrotsearch.hppc.cursors.ObjectObjectCursor;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.message.ParameterizedMessage;
import org.elasticsearch.Version;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.support.GroupedActionListener;
import org.elasticsearch.client.Client;
import org.elasticsearch.cluster.ClusterInfo;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.DiskUsage;
import org.elasticsearch.cluster.block.ClusterBlockLevel;
import org.elasticsearch.cluster.metadata.IndexMetadata;
import org.elasticsearch.cluster.metadata.SingleNodeShutdownMetadata;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.cluster.routing.RerouteService;
import org.elasticsearch.cluster.routing.RoutingNode;
import org.elasticsearch.cluster.routing.RoutingNodes;
import org.elasticsearch.cluster.routing.ShardRouting;
import org.elasticsearch.cluster.routing.allocation.decider.DiskThresholdDecider;
import org.elasticsearch.common.Priority;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.collect.ImmutableOpenMap;
import org.elasticsearch.common.logging.DeprecationCategory;
import org.elasticsearch.common.logging.DeprecationLogger;
import org.elasticsearch.common.settings.ClusterSettings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.ByteSizeValue;
import org.elasticsearch.common.util.set.Sets;
import org.elasticsearch.gateway.GatewayService;
import org.elasticsearch.index.Index;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicLong;
import java.util.function.LongSupplier;
import java.util.function.Supplier;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
/**
* Listens for a node to go over the high watermark and kicks off an empty
* reroute if it does. Also responsible for logging about nodes that have
* passed the disk watermarks.
*/
public class DiskThresholdMonitor {
private static final Logger logger = LogManager.getLogger(DiskThresholdMonitor.class);
private static final Settings READ_ONLY_ALLOW_DELETE_SETTINGS = Settings.builder()
.put(IndexMetadata.SETTING_READ_ONLY_ALLOW_DELETE, Boolean.TRUE.toString())
.build();
private static final Settings NOT_READ_ONLY_ALLOW_DELETE_SETTINGS = Settings.builder()
.putNull(IndexMetadata.SETTING_READ_ONLY_ALLOW_DELETE)
.build();
private final DiskThresholdSettings diskThresholdSettings;
private final Client client;
private final Supplier<ClusterState> clusterStateSupplier;
private final LongSupplier currentTimeMillisSupplier;
private final RerouteService rerouteService;
private final AtomicLong lastRunTimeMillis = new AtomicLong(Long.MIN_VALUE);
private final AtomicBoolean checkInProgress = new AtomicBoolean();
private final DeprecationLogger deprecationLogger = DeprecationLogger.getLogger(logger.getName());
/**
* The IDs of the nodes that were over the low threshold in the last check (and maybe over another threshold too). Tracked so that we
* can log when such nodes are no longer over the low threshold.
*/
private final Set<String> nodesOverLowThreshold = Sets.newConcurrentHashSet();
/**
* The IDs of the nodes that were over the high threshold in the last check (and maybe over another threshold too). Tracked so that we
* can log when such nodes are no longer over the high threshold.
*/
private final Set<String> nodesOverHighThreshold = Sets.newConcurrentHashSet();
/**
* The IDs of the nodes that were over the high threshold in the last check, but which are relocating shards that will bring them
* under the high threshold again. Tracked so that we can log when such nodes are no longer in this state.
*/
private final Set<String> nodesOverHighThresholdAndRelocating = Sets.newConcurrentHashSet();
public DiskThresholdMonitor(
Settings settings,
Supplier<ClusterState> clusterStateSupplier,
ClusterSettings clusterSettings,
Client client,
LongSupplier currentTimeMillisSupplier,
RerouteService rerouteService
) {
this.clusterStateSupplier = clusterStateSupplier;
this.currentTimeMillisSupplier = currentTimeMillisSupplier;
this.rerouteService = rerouteService;
this.diskThresholdSettings = new DiskThresholdSettings(settings, clusterSettings);
this.client = client;
if (diskThresholdSettings.isAutoReleaseIndexEnabled() == false) {
deprecationLogger.critical(
DeprecationCategory.SETTINGS,
DiskThresholdSettings.AUTO_RELEASE_INDEX_ENABLED_KEY.replace(".", "_"),
"[{}] will be removed in version {}",
DiskThresholdSettings.AUTO_RELEASE_INDEX_ENABLED_KEY,
Version.V_7_4_0.major + 1
);
}
}
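// Clears the "check in progress" flag; the assertion trips if no check was actually running, which would indicate a bug
// in the concurrency handling of onNewInfo.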
private void checkFinished() {
final boolean checkFinished = checkInProgress.compareAndSet(true, false);
assert checkFinished;
logger.trace("checkFinished");
}
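/**
* Called whenever new {@link ClusterInfo} becomes available. Skips the check if the cluster state is not yet recovered or
* if a previous check is still in progress; otherwise it compares each node's disk usage against the flood-stage, high and
* low watermarks and reacts by marking indices read-only, triggering a reroute, and/or releasing read-only blocks.
*/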
public void onNewInfo(ClusterInfo info) {
final ClusterState state = clusterStateSupplier.get();
if (state.blocks().hasGlobalBlock(GatewayService.STATE_NOT_RECOVERED_BLOCK)) {
logger.debug("skipping monitor as the cluster state is not recovered yet");
return;
}
// TODO find a better way to limit concurrent updates (and potential associated reroutes) while allowing tests to ensure that
// all ClusterInfo updates are processed and never ignored
if (checkInProgress.compareAndSet(false, true) == false) {
logger.info("skipping monitor as a check is already in progress");
return;
}
final ImmutableOpenMap<String, DiskUsage> usages = info.getNodeLeastAvailableDiskUsages();
if (usages == null) {
logger.trace("skipping monitor as no disk usage information is available");
checkFinished();
return;
}
logger.trace("processing new cluster info");
boolean reroute = false;
String explanation = "";
final long currentTimeMillis = currentTimeMillisSupplier.getAsLong();
// Clean up nodes that have been removed from the cluster
final ObjectLookupContainer<String> nodes = usages.keys();
cleanUpRemovedNodes(nodes, nodesOverLowThreshold);
cleanUpRemovedNodes(nodes, nodesOverHighThreshold);
cleanUpRemovedNodes(nodes, nodesOverHighThresholdAndRelocating);
final Set<String> indicesToMarkReadOnly = new HashSet<>();
RoutingNodes routingNodes = state.getRoutingNodes();
Set<String> indicesNotToAutoRelease = new HashSet<>();
markNodesMissingUsageIneligibleForRelease(routingNodes, usages, indicesNotToAutoRelease);
final List<DiskUsage> usagesOverHighThreshold = new ArrayList<>();
for (final ObjectObjectCursor<String, DiskUsage> entry : usages) {
final String node = entry.key;
final DiskUsage usage = entry.value;
final RoutingNode routingNode = routingNodes.node(node);
if (isDedicatedFrozenNode(routingNode)) {
ByteSizeValue total = ByteSizeValue.ofBytes(usage.getTotalBytes());
long frozenFloodStageThreshold = diskThresholdSettings.getFreeBytesThresholdFrozenFloodStage(total).getBytes();
if (usage.getFreeBytes() < frozenFloodStageThreshold) {
logger.warn(
"flood stage disk watermark [{}] exceeded on {}",
diskThresholdSettings.describeFrozenFloodStageThreshold(total),
usage
);
}
// skip checking high/low watermarks for frozen nodes, since frozen shards have only insignificant local storage footprint
// and this allows us to use more of the local storage for cache.
continue;
}
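// Flood-stage watermark: the most severe threshold. Every index with a shard on this node is marked read-only-allow-delete
// and is excluded from the automatic block release below.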
if (usage.getFreeBytes() < diskThresholdSettings.getFreeBytesThresholdFloodStage().getBytes()
|| usage.getFreeDiskAsPercentage() < diskThresholdSettings.getFreeDiskThresholdFloodStage()) {
nodesOverLowThreshold.add(node);
nodesOverHighThreshold.add(node);
nodesOverHighThresholdAndRelocating.remove(node);
if (routingNode != null) { // might be temporarily null if the ClusterInfoService and the ClusterService are out of step
for (ShardRouting routing : routingNode) {
String indexName = routing.index().getName();
indicesToMarkReadOnly.add(indexName);
indicesNotToAutoRelease.add(indexName);
}
}
logger.warn(
"flood stage disk watermark [{}] exceeded on {}, all indices on this node will be marked read-only",
diskThresholdSettings.describeFloodStageThreshold(),
usage
);
continue;
}
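// Over the high watermark even before accounting for reserved space: make sure the indices on this node are not
// auto-released, regardless of how the check below (which includes reserved space) turns out.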
if (usage.getFreeBytes() < diskThresholdSettings.getFreeBytesThresholdHigh().getBytes()
|| usage.getFreeDiskAsPercentage() < diskThresholdSettings.getFreeDiskThresholdHigh()) {
if (routingNode != null) { // might be temporarily null if the ClusterInfoService and the ClusterService are out of step
for (ShardRouting routing : routingNode) {
String indexName = routing.index().getName();
indicesNotToAutoRelease.add(indexName);
}
}
}
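// Before checking the high and low watermarks, subtract the space already reserved for shards that are currently being
// relocated onto this node, so that in-flight recoveries count against the available space.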
final long reservedSpace = info.getReservedSpace(usage.getNodeId(), usage.getPath()).getTotal();
final DiskUsage usageWithReservedSpace = new DiskUsage(
usage.getNodeId(),
usage.getNodeName(),
usage.getPath(),
usage.getTotalBytes(),
Math.max(0L, usage.getFreeBytes() - reservedSpace)
);
if (usageWithReservedSpace.getFreeBytes() < diskThresholdSettings.getFreeBytesThresholdHigh().getBytes()
|| usageWithReservedSpace.getFreeDiskAsPercentage() < diskThresholdSettings.getFreeDiskThresholdHigh()) {
nodesOverLowThreshold.add(node);
nodesOverHighThreshold.add(node);
if (lastRunTimeMillis.get() <= currentTimeMillis - diskThresholdSettings.getRerouteInterval().millis()) {
reroute = true;
explanation = "high disk watermark exceeded on one or more nodes";
usagesOverHighThreshold.add(usage);
// will log about this node when the reroute completes
} else {
logger.debug(
"high disk watermark exceeded on {} but an automatic reroute has occurred " + "in the last [{}], skipping reroute",
node,
diskThresholdSettings.getRerouteInterval()
);
}
} else if (usageWithReservedSpace.getFreeBytes() < diskThresholdSettings.getFreeBytesThresholdLow().getBytes()
|| usageWithReservedSpace.getFreeDiskAsPercentage() < diskThresholdSettings.getFreeDiskThresholdLow()) {
nodesOverHighThresholdAndRelocating.remove(node);
final boolean wasUnderLowThreshold = nodesOverLowThreshold.add(node);
final boolean wasOverHighThreshold = nodesOverHighThreshold.remove(node);
assert (wasUnderLowThreshold && wasOverHighThreshold) == false;
if (wasUnderLowThreshold) {
logger.info(
"low disk watermark [{}] exceeded on {}, replicas will not be assigned to this node",
diskThresholdSettings.describeLowThreshold(),
usage
);
} else if (wasOverHighThreshold) {
logger.info(
"high disk watermark [{}] no longer exceeded on {}, but low disk watermark [{}] is still exceeded",
diskThresholdSettings.describeHighThreshold(),
usage,
diskThresholdSettings.describeLowThreshold()
);
}
} else {
nodesOverHighThresholdAndRelocating.remove(node);
if (nodesOverLowThreshold.contains(node)) {
// The node has previously been over the low watermark, but is no longer, so it may be possible to allocate more
// shards
// if we reroute now.
if (lastRunTimeMillis.get() <= currentTimeMillis - diskThresholdSettings.getRerouteInterval().millis()) {
reroute = true;
explanation = "one or more nodes has gone under the high or low watermark";
nodesOverLowThreshold.remove(node);
nodesOverHighThreshold.remove(node);
logger.info(
"low disk watermark [{}] no longer exceeded on {}",
diskThresholdSettings.describeLowThreshold(),
usage
);
} else {
logger.debug(
"{} has gone below a disk threshold, but an automatic reroute has occurred "
+ "in the last [{}], skipping reroute",
node,
diskThresholdSettings.getRerouteInterval()
);
}
}
}
}
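// This grouped listener completes once the three follow-up actions below have finished: the (possible) reroute, the
// (possible) auto-release of read-only blocks, and the (possible) marking of indices as read-only.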
final ActionListener<Void> listener = new GroupedActionListener<>(ActionListener.wrap(this::checkFinished), 3);
if (reroute) {
logger.debug("rerouting shards: [{}]", explanation);
rerouteService.reroute("disk threshold monitor", Priority.HIGH, ActionListener.wrap(reroutedClusterState -> {
for (DiskUsage diskUsage : usagesOverHighThreshold) {
final RoutingNode routingNode = reroutedClusterState.getRoutingNodes().node(diskUsage.getNodeId());
final DiskUsage usageIncludingRelocations;
final long relocatingShardsSize;
if (routingNode != null) { // might be temporarily null if the ClusterInfoService and the ClusterService are out of step
relocatingShardsSize = sizeOfRelocatingShards(routingNode, diskUsage, info, reroutedClusterState);
usageIncludingRelocations = new DiskUsage(
diskUsage.getNodeId(),
diskUsage.getNodeName(),
diskUsage.getPath(),
diskUsage.getTotalBytes(),
diskUsage.getFreeBytes() - relocatingShardsSize
);
} else {
usageIncludingRelocations = diskUsage;
relocatingShardsSize = 0L;
}
if (usageIncludingRelocations.getFreeBytes() < diskThresholdSettings.getFreeBytesThresholdHigh().getBytes()
|| usageIncludingRelocations.getFreeDiskAsPercentage() < diskThresholdSettings.getFreeDiskThresholdHigh()) {
nodesOverHighThresholdAndRelocating.remove(diskUsage.getNodeId());
logger.warn(
"high disk watermark [{}] exceeded on {}, shards will be relocated away from this node; "
+ "currently relocating away shards totalling [{}] bytes; the node is expected to continue to exceed "
+ "the high disk watermark when these relocations are complete",
diskThresholdSettings.describeHighThreshold(),
diskUsage,
-relocatingShardsSize
);
} else if (nodesOverHighThresholdAndRelocating.add(diskUsage.getNodeId())) {
logger.info(
"high disk watermark [{}] exceeded on {}, shards will be relocated away from this node; "
+ "currently relocating away shards totalling [{}] bytes; the node is expected to be below the high "
+ "disk watermark when these relocations are complete",
diskThresholdSettings.describeHighThreshold(),
diskUsage,
-relocatingShardsSize
);
} else {
logger.debug(
"high disk watermark [{}] exceeded on {}, shards will be relocated away from this node; "
+ "currently relocating away shards totalling [{}] bytes",
diskThresholdSettings.describeHighThreshold(),
diskUsage,
-relocatingShardsSize
);
}
}
setLastRunTimeMillis();
listener.onResponse(null);
}, e -> {
logger.debug("reroute failed", e);
setLastRunTimeMillis();
listener.onFailure(e);
}));
} else {
logger.trace("no reroute required");
listener.onResponse(null);
}
// Generate a map of node name to ID so we can use it to look up node replacement targets
final Map<String, String> nodeNameToId = StreamSupport.stream(state.getRoutingNodes().spliterator(), false)
.collect(Collectors.toMap(rn -> rn.node().getName(), RoutingNode::nodeId, (s1, s2) -> s2));
// Calculate both the source node id and the target node id of a "replace" type shutdown
final Set<String> nodesIdsPartOfReplacement = state.metadata()
.nodeShutdowns()
.values()
.stream()
.filter(meta -> meta.getType() == SingleNodeShutdownMetadata.Type.REPLACE)
.flatMap(meta -> Stream.of(meta.getNodeId(), nodeNameToId.get(meta.getTargetNodeName())))
.filter(Objects::nonNull) // The REPLACE target node might not still be in RoutingNodes
.collect(Collectors.toSet());
// Generate a set of all the indices that exist on either the target or source of a node replacement
final Set<String> indicesOnReplaceSourceOrTarget = nodesIdsPartOfReplacement.stream()
.flatMap(nodeId -> state.getRoutingNodes().node(nodeId).copyShards().stream().map(ShardRouting::index).map(Index::getName))
.collect(Collectors.toSet());
final Set<String> indicesToAutoRelease = state.routingTable()
.indicesRouting()
.keySet()
.stream()
.filter(index -> indicesNotToAutoRelease.contains(index) == false)
.filter(index -> state.getBlocks().hasIndexBlock(index, IndexMetadata.INDEX_READ_ONLY_ALLOW_DELETE_BLOCK))
// Do not auto release indices that are on either the source or the target of a node replacement
.filter(index -> indicesOnReplaceSourceOrTarget.contains(index) == false)
.collect(Collectors.toSet());
if (indicesToAutoRelease.isEmpty() == false) {
if (diskThresholdSettings.isAutoReleaseIndexEnabled()) {
logger.info("releasing read-only-allow-delete block on indices: [{}]", indicesToAutoRelease);
updateIndicesReadOnly(indicesToAutoRelease, listener, false);
} else {
deprecationLogger.critical(
DeprecationCategory.SETTINGS,
DiskThresholdSettings.AUTO_RELEASE_INDEX_ENABLED_KEY.replace(".", "_"),
"[{}] will be removed in version {}",
DiskThresholdSettings.AUTO_RELEASE_INDEX_ENABLED_KEY,
Version.V_7_4_0.major + 1
);
logger.debug(
"[{}] disabled, not releasing read-only-allow-delete block on indices: [{}]",
DiskThresholdSettings.AUTO_RELEASE_INDEX_ENABLED_KEY,
indicesToAutoRelease
);
listener.onResponse(null);
}
} else {
logger.trace("no auto-release required");
listener.onResponse(null);
}
indicesToMarkReadOnly.removeIf(index -> state.getBlocks().indexBlocked(ClusterBlockLevel.WRITE, index));
logger.trace("marking indices as read-only: [{}]", indicesToMarkReadOnly);
if (indicesToMarkReadOnly.isEmpty() == false) {
updateIndicesReadOnly(indicesToMarkReadOnly, listener, true);
} else {
listener.onResponse(null);
}
}
// exposed for tests to override
long sizeOfRelocatingShards(RoutingNode routingNode, DiskUsage diskUsage, ClusterInfo info, ClusterState reroutedClusterState) {
return DiskThresholdDecider.sizeOfRelocatingShards(
routingNode,
true,
diskUsage.getPath(),
info,
reroutedClusterState.metadata(),
reroutedClusterState.routingTable()
);
}
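// Without disk usage information for a node we cannot tell whether it is safe to release the read-only block, so the
// indices with shards on such nodes are excluded from auto-release.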
private void markNodesMissingUsageIneligibleForRelease(
RoutingNodes routingNodes,
ImmutableOpenMap<String, DiskUsage> usages,
Set<String> indicesToMarkIneligibleForAutoRelease
) {
for (RoutingNode routingNode : routingNodes) {
if (usages.containsKey(routingNode.nodeId()) == false) {
if (routingNode != null) {
for (ShardRouting routing : routingNode) {
String indexName = routing.index().getName();
indicesToMarkIneligibleForAutoRelease.add(indexName);
}
}
}
}
}
private void setLastRunTimeMillis() {
lastRunTimeMillis.getAndUpdate(l -> Math.max(l, currentTimeMillisSupplier.getAsLong()));
}
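/**
* Adds (readOnly == true) or removes (readOnly == false) the read-only-allow-delete block on the given indices; declared
* protected, presumably so that tests can override it.
*/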
protected void updateIndicesReadOnly(Set<String> indicesToUpdate, ActionListener<Void> listener, boolean readOnly) {
// set read-only block but don't block on the response
ActionListener<Void> wrappedListener = ActionListener.wrap(r -> {
setLastRunTimeMillis();
listener.onResponse(r);
}, e -> {
logger.debug(new ParameterizedMessage("setting indices [{}] read-only failed", readOnly), e);
setLastRunTimeMillis();
listener.onFailure(e);
});
Settings readOnlySettings = readOnly ? READ_ONLY_ALLOW_DELETE_SETTINGS : NOT_READ_ONLY_ALLOW_DELETE_SETTINGS;
client.admin()
.indices()
.prepareUpdateSettings(indicesToUpdate.toArray(Strings.EMPTY_ARRAY))
.setSettings(readOnlySettings)
.origin("disk-threshold-monitor")
.execute(wrappedListener.map(r -> null));
}
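// Drops from the tracking set any node IDs that no longer appear in the latest disk usage information, i.e. nodes that
// have left the cluster.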
private static void cleanUpRemovedNodes(ObjectLookupContainer<String> nodesToKeep, Set<String> nodesToCleanUp) {
for (String node : nodesToCleanUp) {
if (nodesToKeep.contains(node) == false) {
nodesToCleanUp.remove(node);
}
}
}
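// Dedicated frozen nodes are only checked against the frozen flood-stage watermark above; a null routing node (possible
// when the ClusterInfoService and the ClusterService are out of step) is treated as not frozen.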
private boolean isDedicatedFrozenNode(RoutingNode routingNode) {
if (routingNode == null) {
return false;
}
DiscoveryNode node = routingNode.node();
return node.isDedicatedFrozenNode();
}
}