
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the Server Side Public License, v 1; you may not use this file except
* in compliance with, at your election, the Elastic License 2.0 or the Server
* Side Public License, v 1.
*/
package org.elasticsearch.health.node;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.metadata.IndexMetadata;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.cluster.node.DiscoveryNodeRole;
import org.elasticsearch.cluster.routing.RoutingNodes;
import org.elasticsearch.cluster.service.ClusterService;
import org.elasticsearch.common.util.set.Sets;
import org.elasticsearch.features.FeatureService;
import org.elasticsearch.health.Diagnosis;
import org.elasticsearch.health.HealthFeatures;
import org.elasticsearch.health.HealthIndicatorDetails;
import org.elasticsearch.health.HealthIndicatorImpact;
import org.elasticsearch.health.HealthIndicatorResult;
import org.elasticsearch.health.HealthIndicatorService;
import org.elasticsearch.health.HealthStatus;
import org.elasticsearch.health.ImpactArea;
import org.elasticsearch.index.Index;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.EnumMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import static org.elasticsearch.cluster.node.DiscoveryNode.DISCOVERY_NODE_COMPARATOR;
import static org.elasticsearch.common.util.CollectionUtils.limitSize;
import static org.elasticsearch.health.node.HealthIndicatorDisplayValues.are;
import static org.elasticsearch.health.node.HealthIndicatorDisplayValues.getSortedUniqueValuesString;
import static org.elasticsearch.health.node.HealthIndicatorDisplayValues.getTruncatedIndices;
import static org.elasticsearch.health.node.HealthIndicatorDisplayValues.indices;
import static org.elasticsearch.health.node.HealthIndicatorDisplayValues.indicesComparatorByPriorityAndName;
import static org.elasticsearch.health.node.HealthIndicatorDisplayValues.regularNoun;
import static org.elasticsearch.health.node.HealthIndicatorDisplayValues.regularVerb;
import static org.elasticsearch.health.node.HealthIndicatorDisplayValues.these;
/**
* This indicator reports the cluster's disk health, i.e. whether the cluster has enough available disk space to function.
* The indicator reports a YELLOW status when:
* - a data node's disk usage is above the high watermark and it is not relocating any of its shards.
* - a non-data node's disk usage is above the high watermark.
* The indicator reports a RED status when:
* - an index has the INDEX_READ_ONLY_ALLOW_DELETE_BLOCK, which indicates that the index has been blocked because a node ran out of space.
* - any node's disk usage is above the flood stage watermark.
*/
public class DiskHealthIndicatorService implements HealthIndicatorService {
public static final String NAME = "disk";
private static final Logger logger = LogManager.getLogger(DiskHealthIndicatorService.class);
private static final String IMPACT_INGEST_UNAVAILABLE_ID = "ingest_capability_unavailable";
private static final String IMPACT_INGEST_AT_RISK_ID = "ingest_capability_at_risk";
private static final String IMPACT_CLUSTER_STABILITY_AT_RISK_ID = "cluster_stability_at_risk";
private static final String IMPACT_CLUSTER_FUNCTIONALITY_UNAVAILABLE_ID = "cluster_functionality_unavailable";
private final ClusterService clusterService;
private final FeatureService featureService;
public DiskHealthIndicatorService(ClusterService clusterService, FeatureService featureService) {
this.clusterService = clusterService;
this.featureService = featureService;
}
@Override
public String name() {
return NAME;
}
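/**
* Calculates the disk health of the cluster from the most recently reported per-node disk health info.
* @param verbose whether the returned result should include the details section
* @param maxAffectedResourcesCount the maximum number of affected resources to list per diagnosis
* @param healthInfo the health info reported to the health node, including disk info keyed by node id
*/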
@Override
public HealthIndicatorResult calculate(boolean verbose, int maxAffectedResourcesCount, HealthInfo healthInfo) {
ClusterState clusterState = clusterService.state();
Map<String, DiskHealthInfo> diskHealthInfoMap = healthInfo.diskInfoByNode();
if (diskHealthInfoMap == null || diskHealthInfoMap.isEmpty()) {
if (featureService.clusterHasFeature(clusterState, HealthFeatures.SUPPORTS_HEALTH) == false) {
return createIndicator(
HealthStatus.GREEN,
"No disk usage data available. The cluster currently has mixed versions (an upgrade may be in progress).",
HealthIndicatorDetails.EMPTY,
List.of(),
List.of()
);
}
/*
* If there is no disk health info, that either means that a new health node was just elected, or something is seriously
* wrong with health data collection on the health node. Either way, we immediately return UNKNOWN. If there are at least
* some health info results then we work with what we have (and log any missing ones at debug level immediately below this).
*/
return createIndicator(
HealthStatus.UNKNOWN,
"No disk usage data.",
HealthIndicatorDetails.EMPTY,
Collections.emptyList(),
Collections.emptyList()
);
}
logNodesMissingHealthInfo(diskHealthInfoMap, clusterState);
DiskHealthAnalyzer diskHealthAnalyzer = new DiskHealthAnalyzer(diskHealthInfoMap, clusterState);
return createIndicator(
diskHealthAnalyzer.getHealthStatus(),
diskHealthAnalyzer.getSymptom(),
diskHealthAnalyzer.getDetails(verbose),
diskHealthAnalyzer.getImpacts(),
diskHealthAnalyzer.getDiagnoses(maxAffectedResourcesCount)
);
}
/**
* This method logs if any nodes in the cluster state do not have health info results reported. This is logged at debug level and is
* not ordinarily important, but can be useful in tracking down problems where nodes have stopped reporting health node information.
* @param diskHealthInfoMap A map of nodeId to DiskHealthInfo
* @param clusterState The cluster state, used to determine which nodes are expected to report health info
*/
private static void logNodesMissingHealthInfo(Map<String, DiskHealthInfo> diskHealthInfoMap, ClusterState clusterState) {
if (logger.isDebugEnabled()) {
String nodesMissingHealthInfo = getSortedUniqueValuesString(
clusterState.getNodes().getAllNodes(),
node -> diskHealthInfoMap.containsKey(node.getId()) == false,
HealthIndicatorDisplayValues::getNodeName
);
if (nodesMissingHealthInfo.isBlank() == false) {
logger.debug("The following nodes are in the cluster state but not reporting health data: [{}]", nodesMissingHealthInfo);
}
}
}
/**
* The disk health analyzer takes into consideration the blocked indices and the health status of all the nodes and calculates
* the different aspects of the disk indicator such as the overall status, the symptom, the impacts and the diagnoses.
*/
static class DiskHealthAnalyzer {
public static final String INDICES_WITH_READONLY_BLOCK = "indices_with_readonly_block";
public static final String NODES_WITH_ENOUGH_DISK_SPACE = "nodes_with_enough_disk_space";
public static final String NODES_OVER_FLOOD_STAGE_WATERMARK = "nodes_over_flood_stage_watermark";
public static final String NODES_OVER_HIGH_WATERMARK = "nodes_over_high_watermark";
public static final String NODES_WITH_UNKNOWN_DISK_STATUS = "nodes_with_unknown_disk_status";
private final ClusterState clusterState;
private final Set<String> blockedIndices;
private final List<DiscoveryNode> dataNodes = new ArrayList<>();
// In this context a master node is a master-eligible node that cannot contain data.
private final Map<HealthStatus, List<DiscoveryNode>> masterNodes = new EnumMap<>(HealthStatus.class);
// In this context "other" nodes are nodes that cannot contain data and are not master-eligible.
private final Map<HealthStatus, List<DiscoveryNode>> otherNodes = new EnumMap<>(HealthStatus.class);
private final Set<DiscoveryNodeRole> affectedRoles = new HashSet<>();
private final Set<String> indicesAtRisk;
private final HealthStatus healthStatus;
private final Map<HealthStatus, Integer> healthStatusNodeCount;
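/**
* Analyzes the given per-node disk health info against the cluster state: collects the indices that carry the
* read-only/allow-delete block, groups unhealthy non-data nodes by health status, and derives the most severe
* overall status and the per-status node counts used for the details section.
*/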
DiskHealthAnalyzer(Map<String, DiskHealthInfo> diskHealthByNode, ClusterState clusterState) {
this.clusterState = clusterState;
blockedIndices = clusterState.blocks()
.indices()
.entrySet()
.stream()
.filter(entry -> entry.getValue().contains(IndexMetadata.INDEX_READ_ONLY_ALLOW_DELETE_BLOCK))
.map(Map.Entry::getKey)
.collect(Collectors.toSet());
HealthStatus mostSevereStatusSoFar = blockedIndices.isEmpty() ? HealthStatus.GREEN : HealthStatus.RED;
for (String nodeId : diskHealthByNode.keySet()) {
DiscoveryNode node = clusterState.getNodes().get(nodeId);
HealthStatus healthStatus = diskHealthByNode.get(nodeId).healthStatus();
if (node == null || healthStatus.indicatesHealthProblem() == false) {
continue;
}
if (mostSevereStatusSoFar.value() < healthStatus.value()) {
mostSevereStatusSoFar = healthStatus;
}
affectedRoles.addAll(node.getRoles());
if (node.canContainData()) {
dataNodes.add(node);
} else if (node.isMasterNode()) {
masterNodes.computeIfAbsent(healthStatus, ignored -> new ArrayList<>()).add(node);
} else {
otherNodes.computeIfAbsent(healthStatus, ignored -> new ArrayList<>()).add(node);
}
}
dataNodes.sort(DISCOVERY_NODE_COMPARATOR);
for (List<DiscoveryNode> masterNodes : masterNodes.values()) {
masterNodes.sort(DISCOVERY_NODE_COMPARATOR);
}
for (List<DiscoveryNode> nodes : otherNodes.values()) {
nodes.sort(DISCOVERY_NODE_COMPARATOR);
}
indicesAtRisk = getIndicesForNodes(dataNodes, clusterState);
healthStatus = mostSevereStatusSoFar;
healthStatusNodeCount = countNodesByHealthStatus(diskHealthByNode, clusterState);
}
public HealthStatus getHealthStatus() {
return healthStatus;
}
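// Builds the human-readable symptom from the counts of blocked indices and unhealthy nodes. An illustrative example of
// a possible symptom: "2 indices are not allowed to be updated. 1 node is out of disk or running low on disk space."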
String getSymptom() {
if (healthStatus == HealthStatus.GREEN) {
return "The cluster has enough available disk space.";
}
String symptom;
if (hasBlockedIndices()) {
symptom = String.format(
Locale.ROOT,
"%d %s %s not allowed to be updated.",
blockedIndices.size(),
indices(blockedIndices.size()),
are(blockedIndices.size())
);
if (hasUnhealthyDataNodes()) {
symptom += String.format(
Locale.ROOT,
" %d %s %s out of disk or running low on disk space.",
dataNodes.size(),
regularNoun("node", dataNodes.size()),
are(dataNodes.size())
);
} else {
// In this case the disk issue has been resolved but the index block has not been removed yet or the
// cluster is still moving shards away from data nodes that are over the high watermark.
symptom += " The cluster is recovering and ingest capabilities should be restored within a few minutes.";
}
if (hasUnhealthyMasterNodes() || hasUnhealthyOtherNodes()) {
String roles = Stream.concat(masterNodes.values().stream(), otherNodes.values().stream())
.flatMap(Collection::stream)
.flatMap(node -> node.getRoles().stream())
.map(DiscoveryNodeRole::roleName)
.distinct()
.sorted()
.collect(Collectors.joining(", "));
int unhealthyNodesCount = getUnhealthyNodeSize(masterNodes) + getUnhealthyNodeSize(otherNodes);
symptom += String.format(
Locale.ROOT,
" %d %s with roles: [%s] %s out of disk or running low on disk space.",
unhealthyNodesCount,
regularNoun("node", unhealthyNodesCount),
roles,
are(unhealthyNodesCount)
);
}
} else {
String roles = getSortedUniqueValuesString(affectedRoles, DiscoveryNodeRole::roleName);
int unhealthyNodesCount = dataNodes.size() + getUnhealthyNodeSize(masterNodes) + getUnhealthyNodeSize(otherNodes);
symptom = String.format(
Locale.ROOT,
"%d %s with roles: [%s] %s out of disk or running low on disk space.",
unhealthyNodesCount,
regularNoun("node", unhealthyNodesCount),
roles,
are(unhealthyNodesCount)
);
}
return symptom;
}
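// Translates the disk state into user-facing impacts: ingest unavailable when indices are blocked, ingest at risk when
// data nodes are low on space, cluster stability at risk when master-eligible nodes are affected, and impaired
// functionality for any other affected roles.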
List<HealthIndicatorImpact> getImpacts() {
if (healthStatus == HealthStatus.GREEN) {
return List.of();
}
List<HealthIndicatorImpact> impacts = new ArrayList<>();
if (hasBlockedIndices()) {
impacts.add(
new HealthIndicatorImpact(
NAME,
IMPACT_INGEST_UNAVAILABLE_ID,
1,
String.format(
Locale.ROOT,
"Cannot insert or update documents in the affected indices [%s].",
getTruncatedIndices(blockedIndices, clusterState.getMetadata())
),
List.of(ImpactArea.INGEST)
)
);
} else {
if (indicesAtRisk.isEmpty() == false) {
impacts.add(
new HealthIndicatorImpact(
NAME,
IMPACT_INGEST_AT_RISK_ID,
1,
String.format(
Locale.ROOT,
"The cluster is at risk of not being able to insert or update documents in the affected indices [%s].",
getTruncatedIndices(indicesAtRisk, clusterState.metadata())
),
List.of(ImpactArea.INGEST)
)
);
}
// Data nodes are out of or low on disk space, but no indices in the cluster have the write block yet.
if (hasUnhealthyDataNodes()) {
impacts.add(
new HealthIndicatorImpact(
NAME,
IMPACT_INGEST_AT_RISK_ID,
2,
String.format(
Locale.ROOT,
"%d %s %s out of disk or running low on disk space. %s %s cannot be used to store data anymore.",
dataNodes.size(),
regularNoun("node", dataNodes.size()),
are(dataNodes.size()),
these(dataNodes.size()),
regularNoun("node", dataNodes.size())
),
List.of(ImpactArea.DEPLOYMENT_MANAGEMENT)
)
);
}
}
if (affectedRoles.contains(DiscoveryNodeRole.MASTER_ROLE)) {
impacts.add(
new HealthIndicatorImpact(
NAME,
IMPACT_CLUSTER_STABILITY_AT_RISK_ID,
1,
"Cluster stability might be impaired.",
List.of(ImpactArea.DEPLOYMENT_MANAGEMENT)
)
);
}
String impactedOtherRoles = getSortedUniqueValuesString(
affectedRoles,
role -> role.canContainData() == false && role.equals(DiscoveryNodeRole.MASTER_ROLE) == false,
DiscoveryNodeRole::roleName
);
if (impactedOtherRoles.isBlank() == false) {
impacts.add(
new HealthIndicatorImpact(
NAME,
IMPACT_CLUSTER_FUNCTIONALITY_UNAVAILABLE_ID,
3,
String.format(Locale.ROOT, "The [%s] functionality might be impaired.", impactedOtherRoles),
List.of(ImpactArea.DEPLOYMENT_MANAGEMENT)
)
);
}
return impacts;
}
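// Produces one diagnosis per affected node group: data nodes together with the indices that are blocked or at risk,
// plus one per health status for master-eligible and "other" non-data nodes. The size parameter caps how many
// affected resources are listed per diagnosis.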
private List<Diagnosis> getDiagnoses(int size) {
if (healthStatus == HealthStatus.GREEN) {
return List.of();
}
List<Diagnosis> diagnosisList = new ArrayList<>();
if (hasBlockedIndices() || hasUnhealthyDataNodes()) {
Set<String> affectedIndices = Sets.union(blockedIndices, indicesAtRisk);
List<Diagnosis.Resource> affectedResources = new ArrayList<>();
if (dataNodes.size() > 0) {
Diagnosis.Resource nodeResources = new Diagnosis.Resource(limitSize(dataNodes, size));
affectedResources.add(nodeResources);
}
if (affectedIndices.size() > 0) {
Diagnosis.Resource indexResources = new Diagnosis.Resource(
Diagnosis.Resource.Type.INDEX,
affectedIndices.stream()
.sorted(indicesComparatorByPriorityAndName(clusterState.metadata()))
.limit(Math.min(affectedIndices.size(), size))
.collect(Collectors.toList())
);
affectedResources.add(indexResources);
}
diagnosisList.add(createDataNodeDiagnosis(affectedIndices.size(), affectedResources));
}
if (masterNodes.containsKey(HealthStatus.RED)) {
diagnosisList.add(createNonDataNodeDiagnosis(HealthStatus.RED, masterNodes.get(HealthStatus.RED), size, true));
}
if (masterNodes.containsKey(HealthStatus.YELLOW)) {
diagnosisList.add(createNonDataNodeDiagnosis(HealthStatus.YELLOW, masterNodes.get(HealthStatus.YELLOW), size, true));
}
if (otherNodes.containsKey(HealthStatus.RED)) {
diagnosisList.add(createNonDataNodeDiagnosis(HealthStatus.RED, otherNodes.get(HealthStatus.RED), size, false));
}
if (otherNodes.containsKey(HealthStatus.YELLOW)) {
diagnosisList.add(createNonDataNodeDiagnosis(HealthStatus.YELLOW, otherNodes.get(HealthStatus.YELLOW), size, false));
}
return diagnosisList;
}
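// When verbose, the details are rendered as an object with the blocked-index count and one node count per health
// status, e.g. (illustrative): {"indices_with_readonly_block": 0, "nodes_with_enough_disk_space": 2,
// "nodes_over_high_watermark": 1, "nodes_over_flood_stage_watermark": 0, "nodes_with_unknown_disk_status": 0}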
HealthIndicatorDetails getDetails(boolean verbose) {
if (verbose == false) {
return HealthIndicatorDetails.EMPTY;
}
return ((builder, params) -> {
builder.startObject();
builder.field(INDICES_WITH_READONLY_BLOCK, blockedIndices.size());
for (HealthStatus healthStatus : HealthStatus.values()) {
builder.field(getDetailsDisplayKey(healthStatus), healthStatusNodeCount.get(healthStatus));
}
return builder.endObject();
});
}
// Visible for testing
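/**
* Counts the nodes in the cluster state per health status; nodes that have not reported disk health info are counted
* under UNKNOWN.
*/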
static Map<HealthStatus, Integer> countNodesByHealthStatus(
Map<String, DiskHealthInfo> diskHealthInfoMap,
ClusterState clusterState
) {
Map<HealthStatus, Integer> counts = new EnumMap<>(HealthStatus.class);
for (HealthStatus healthStatus : HealthStatus.values()) {
counts.put(healthStatus, 0);
}
for (DiscoveryNode node : clusterState.getNodes()) {
HealthStatus healthStatus = diskHealthInfoMap.containsKey(node.getId())
? diskHealthInfoMap.get(node.getId()).healthStatus()
: HealthStatus.UNKNOWN;
counts.computeIfPresent(healthStatus, (ignored, count) -> count + 1);
}
return counts;
}
private static String getDetailsDisplayKey(HealthStatus status) {
return switch (status) {
case GREEN -> NODES_WITH_ENOUGH_DISK_SPACE;
case UNKNOWN -> NODES_WITH_UNKNOWN_DISK_STATUS;
case YELLOW -> NODES_OVER_HIGH_WATERMARK;
case RED -> NODES_OVER_FLOOD_STAGE_WATERMARK;
};
}
private boolean hasUnhealthyDataNodes() {
return dataNodes.isEmpty() == false;
}
private boolean hasUnhealthyMasterNodes() {
return masterNodes.isEmpty() == false;
}
private boolean hasUnhealthyOtherNodes() {
return otherNodes.isEmpty() == false;
}
private boolean hasBlockedIndices() {
return blockedIndices.isEmpty() == false;
}
// Non-private for unit testing
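// Returns the names of all indices that have at least one shard allocated on any of the given nodes.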
static Set<String> getIndicesForNodes(List<DiscoveryNode> nodes, ClusterState clusterState) {
RoutingNodes routingNodes = clusterState.getRoutingNodes();
return nodes.stream()
.map(node -> routingNodes.node(node.getId()))
.filter(Objects::nonNull)
.flatMap(routingNode -> Arrays.stream(routingNode.copyIndices()))
.map(Index::getName)
.collect(Collectors.toSet());
}
// Visible for testing
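// Builds the diagnosis for data nodes that are low on disk space; the affected resources include the nodes themselves
// and, when present, the indices that are blocked or at risk of being blocked.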
static Diagnosis createDataNodeDiagnosis(int numberOfAffectedIndices, List<Diagnosis.Resource> affectedResources) {
String message = numberOfAffectedIndices == 0
? "Disk is almost full."
: String.format(
Locale.ROOT,
"%d %s %s on nodes that have run or are likely to run out of disk space, "
+ "this can temporarily disable writing on %s %s.",
numberOfAffectedIndices,
indices(numberOfAffectedIndices),
regularVerb("reside", numberOfAffectedIndices),
these(numberOfAffectedIndices),
indices(numberOfAffectedIndices)
);
return new Diagnosis(
new Diagnosis.Definition(
NAME,
"add_disk_capacity_data_nodes",
message,
"Enable autoscaling (if applicable), add disk capacity or free up disk space to resolve "
+ "this. If you have already taken action please wait for the rebalancing to complete.",
"https://ela.st/fix-data-disk"
),
affectedResources
);
}
// Visible for testing
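// Builds the diagnosis for non-data nodes that are low on disk space; the action id and help URL differ depending on
// whether the affected nodes are master-eligible.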
static Diagnosis createNonDataNodeDiagnosis(HealthStatus healthStatus, List<DiscoveryNode> nodes, int size, boolean isMaster) {
return new Diagnosis(
new Diagnosis.Definition(
NAME,
isMaster ? "add_disk_capacity_master_nodes" : "add_disk_capacity",
healthStatus == HealthStatus.RED ? "Disk is full." : "The cluster is running low on disk space.",
"Please add capacity to the current nodes, or replace them with ones with higher capacity.",
isMaster ? "https://ela.st/fix-master-disk" : "https://ela.st/fix-disk-space"
),
List.of(new Diagnosis.Resource(limitSize(nodes, size)))
);
}
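// Sums the number of nodes reported as RED or YELLOW in the given per-status node map.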
private static int getUnhealthyNodeSize(Map<HealthStatus, List<DiscoveryNode>> nodes) {
return (nodes.containsKey(HealthStatus.RED) ? nodes.get(HealthStatus.RED).size() : 0) + (nodes.containsKey(HealthStatus.YELLOW)
? nodes.get(HealthStatus.YELLOW).size()
: 0);
}
}
}