/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.cluster.routing.allocation.decider;
import java.util.Set;
import com.carrotsearch.hppc.cursors.ObjectCursor;
import org.elasticsearch.cluster.ClusterInfo;
import org.elasticsearch.cluster.DiskUsage;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.cluster.routing.IndexShardRoutingTable;
import org.elasticsearch.cluster.routing.RecoverySource;
import org.elasticsearch.cluster.routing.RoutingNode;
import org.elasticsearch.cluster.routing.ShardRouting;
import org.elasticsearch.cluster.routing.ShardRoutingState;
import org.elasticsearch.cluster.routing.allocation.DiskThresholdSettings;
import org.elasticsearch.cluster.routing.allocation.RoutingAllocation;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.collect.ImmutableOpenMap;
import org.elasticsearch.common.settings.ClusterSettings;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.ByteSizeValue;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.shard.ShardId;
import static org.elasticsearch.cluster.routing.allocation.DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING;
import static org.elasticsearch.cluster.routing.allocation.DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING;
/**
* The {@link DiskThresholdDecider} checks that the node a shard is potentially
* being allocated to has enough disk space.
*
* It has three configurable settings, all of which can be changed dynamically:
*
* {@code cluster.routing.allocation.disk.watermark.low} is the low disk
* watermark. New shards will not be allocated to a node with usage higher than this,
* although this watermark may be passed by allocating a shard. It defaults to
* 0.85 (85.0%).
*
* {@code cluster.routing.allocation.disk.watermark.high} is the high disk
* watermark. If a node has usage higher than this, shards are not allowed to
* remain on the node. In addition, if allocating a shard to a node would cause the
* node to pass this watermark, the allocation is not allowed. It defaults to
* 0.90 (90.0%).
*
* Both watermark settings are expressed either as a used disk percentage or as an
* exact byte value for free space (like "500mb").
*
* {@code cluster.routing.allocation.disk.threshold_enabled} is used to
* enable or disable this decider. It defaults to {@code true} (enabled).
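*
* For example, all three settings can be supplied at node startup through a
* {@link Settings} instance (a minimal sketch; the values shown are the defaults
* and purely illustrative):
*
* <pre>{@code
* Settings settings = Settings.builder()
*     .put("cluster.routing.allocation.disk.threshold_enabled", true)
*     .put("cluster.routing.allocation.disk.watermark.low", "85%")
*     .put("cluster.routing.allocation.disk.watermark.high", "90%")
*     .build();
* }</pre>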
*/
public class DiskThresholdDecider extends AllocationDecider {
public static final String NAME = "disk_threshold";
private final DiskThresholdSettings diskThresholdSettings;
public DiskThresholdDecider(Settings settings, ClusterSettings clusterSettings) {
super(settings);
this.diskThresholdSettings = new DiskThresholdSettings(settings, clusterSettings);
}
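// A minimal construction sketch, e.g. for a unit test (assumes the built-in
// cluster settings registry; in a running node the allocation service creates
// and wires the deciders itself):
//
//   ClusterSettings clusterSettings =
//       new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS);
//   DiskThresholdDecider decider = new DiskThresholdDecider(Settings.EMPTY, clusterSettings);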
/**
* Returns the size of all shards that are currently being relocated to
* the node, but may not be finished transferring yet.
*
* If {@code subtractShardsMovingAway} is true then the size of shards moving away is subtracted from the total size of all shards.
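*
* For example (illustrative numbers): with one incoming INITIALIZING shard of 100mb
* and one RELOCATING shard of 40mb leaving the same data path, this returns 100mb
* when {@code subtractShardsMovingAway} is false, and 60mb when it is true.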
*/
static long sizeOfRelocatingShards(RoutingNode node, RoutingAllocation allocation,
boolean subtractShardsMovingAway, String dataPath) {
ClusterInfo clusterInfo = allocation.clusterInfo();
long totalSize = 0;
for (ShardRouting routing : node.shardsWithState(ShardRoutingState.RELOCATING, ShardRoutingState.INITIALIZING)) {
String actualPath = clusterInfo.getDataPath(routing);
if (dataPath.equals(actualPath)) {
if (routing.initializing() && routing.relocatingNodeId() != null) {
totalSize += getExpectedShardSize(routing, allocation, 0);
} else if (subtractShardsMovingAway && routing.relocating()) {
totalSize -= getExpectedShardSize(routing, allocation, 0);
}
}
}
return totalSize;
}
@Override
public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) {
ClusterInfo clusterInfo = allocation.clusterInfo();
ImmutableOpenMap<String, DiskUsage> usages = clusterInfo.getNodeMostAvailableDiskUsages();
final Decision decision = earlyTerminate(allocation, usages);
if (decision != null) {
return decision;
}
final double usedDiskThresholdLow = 100.0 - diskThresholdSettings.getFreeDiskThresholdLow();
final double usedDiskThresholdHigh = 100.0 - diskThresholdSettings.getFreeDiskThresholdHigh();
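// Illustration with the default watermarks documented above: a low watermark of
// 0.85 means getFreeDiskThresholdLow() returns 15.0, so usedDiskThresholdLow is
// 100.0 - 15.0 = 85.0 (and usedDiskThresholdHigh is likewise 90.0).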
// subtractLeavingShards is passed as false here, because they still use disk space, and therefore we should be extra careful
// and take the size into account
DiskUsage usage = getDiskUsage(node, allocation, usages, false);
// First, check whether the node is currently over the low watermark
double freeDiskPercentage = usage.getFreeDiskAsPercentage();
// Cache the used disk percentage for displaying disk percentages consistent with documentation
double usedDiskPercentage = usage.getUsedDiskAsPercentage();
long freeBytes = usage.getFreeBytes();
if (logger.isTraceEnabled()) {
logger.trace("node [{}] has {}% used disk", node.nodeId(), usedDiskPercentage);
}
// flag that determines whether the low threshold checks below can be skipped. We use this for a primary shard that is freshly
// allocated and empty.
boolean skipLowThresholdChecks = shardRouting.primary() &&
shardRouting.active() == false && shardRouting.recoverySource().getType() == RecoverySource.Type.EMPTY_STORE;
// checks for exact byte comparisons
if (freeBytes < diskThresholdSettings.getFreeBytesThresholdLow().getBytes()) {
if (skipLowThresholdChecks == false) {
if (logger.isDebugEnabled()) {
logger.debug("less than the required {} free bytes threshold ({} bytes free) on node {}, preventing allocation",
diskThresholdSettings.getFreeBytesThresholdLow(), freeBytes, node.nodeId());
}
return allocation.decision(Decision.NO, NAME,
"the node is above the low watermark cluster setting [%s=%s], having less than the minimum required [%s] free " +
"space, actual free: [%s]",
CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(),
diskThresholdSettings.getLowWatermarkRaw(),
diskThresholdSettings.getFreeBytesThresholdLow(), new ByteSizeValue(freeBytes));
} else if (freeBytes > diskThresholdSettings.getFreeBytesThresholdHigh().getBytes()) {
// Allow the shard to be allocated because it is a primary that
// has never been allocated, as long as the node is under the high watermark
if (logger.isDebugEnabled()) {
logger.debug("less than the required {} free bytes threshold ({} bytes free) on node {}, " +
"but allowing allocation because primary has never been allocated",
diskThresholdSettings.getFreeBytesThresholdLow(), freeBytes, node.nodeId());
}
return allocation.decision(Decision.YES, NAME,
"the node is above the low watermark, but less than the high watermark, and this primary shard has " +
"never been allocated before");
} else {
// Even though the primary has never been allocated, the node is
// above the high watermark, so don't allow allocating the shard
if (logger.isDebugEnabled()) {
logger.debug("less than the required {} free bytes threshold ({} bytes free) on node {}, " +
"preventing allocation even though primary has never been allocated",
diskThresholdSettings.getFreeBytesThresholdHigh(), freeBytes, node.nodeId());
}
return allocation.decision(Decision.NO, NAME,
"the node is above the high watermark cluster setting [%s=%s], having less than the minimum required [%s] free " +
"space, actual free: [%s]",
CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(),
diskThresholdSettings.getHighWatermarkRaw(),
diskThresholdSettings.getFreeBytesThresholdHigh(), new ByteSizeValue(freeBytes));
}
}
// checks for percentage comparisons
if (freeDiskPercentage < diskThresholdSettings.getFreeDiskThresholdLow()) {
// If the shard is a replica or is a non-empty primary, check the low threshold
if (skipLowThresholdChecks == false) {
if (logger.isDebugEnabled()) {
logger.debug("more than the allowed {} used disk threshold ({} used) on node [{}], preventing allocation",
Strings.format1Decimals(usedDiskThresholdLow, "%"),
Strings.format1Decimals(usedDiskPercentage, "%"), node.nodeId());
}
return allocation.decision(Decision.NO, NAME,
"the node is above the low watermark cluster setting [%s=%s], using more disk space than the maximum allowed " +
"[%s%%], actual free: [%s%%]",
CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(),
diskThresholdSettings.getLowWatermarkRaw(), usedDiskThresholdLow, freeDiskPercentage);
} else if (freeDiskPercentage > diskThresholdSettings.getFreeDiskThresholdHigh()) {
// Allow the shard to be allocated because it is a primary that
// has never been allocated, as long as the node is under the high watermark
if (logger.isDebugEnabled()) {
logger.debug("more than the allowed {} used disk threshold ({} used) on node [{}], " +
"but allowing allocation because primary has never been allocated",
Strings.format1Decimals(usedDiskThresholdLow, "%"),
Strings.format1Decimals(usedDiskPercentage, "%"), node.nodeId());
}
return allocation.decision(Decision.YES, NAME,
"the node is above the low watermark, but less than the high watermark, and this primary shard has " +
"never been allocated before");
} else {
// Even though the primary has never been allocated, the node is
// above the high watermark, so don't allow allocating the shard
if (logger.isDebugEnabled()) {
logger.debug("less than the required {} free bytes threshold ({} bytes free) on node {}, " +
"preventing allocation even though primary has never been allocated",
Strings.format1Decimals(diskThresholdSettings.getFreeDiskThresholdHigh(), "%"),
Strings.format1Decimals(freeDiskPercentage, "%"), node.nodeId());
}
return allocation.decision(Decision.NO, NAME,
"the node is above the high watermark cluster setting [%s=%s], using more disk space than the maximum allowed " +
"[%s%%], actual free: [%s%%]",
CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(),
diskThresholdSettings.getHighWatermarkRaw(), usedDiskThresholdHigh, freeDiskPercentage);
}
}
// Secondly, check that allocating the shard to this node doesn't put it above the high watermark
final long shardSize = getExpectedShardSize(shardRouting, allocation, 0);
double freeSpaceAfterShard = freeDiskPercentageAfterShardAssigned(usage, shardSize);
long freeBytesAfterShard = freeBytes - shardSize;
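// Illustrative arithmetic (assumed numbers): on a node with 100gb total and 20gb
// free, a 10gb shard leaves freeBytesAfterShard = 10gb and
// freeSpaceAfterShard = 10.0%, via freeDiskPercentageAfterShardAssigned above.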
if (freeBytesAfterShard < diskThresholdSettings.getFreeBytesThresholdHigh().getBytes()) {
logger.warn("after allocating, node [{}] would have less than the required " +
"{} free bytes threshold ({} bytes free), preventing allocation",
node.nodeId(), diskThresholdSettings.getFreeBytesThresholdHigh(), freeBytesAfterShard);
return allocation.decision(Decision.NO, NAME,
"allocating the shard to this node will bring the node above the high watermark cluster setting [%s=%s] " +
"and cause it to have less than the minimum required [%s] of free space (free bytes after shard added: [%s])",
CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(),
diskThresholdSettings.getHighWatermarkRaw(),
diskThresholdSettings.getFreeBytesThresholdHigh(), new ByteSizeValue(freeBytesAfterShard));
}
if (freeSpaceAfterShard < diskThresholdSettings.getFreeDiskThresholdHigh()) {
logger.warn("after allocating, node [{}] would have more than the allowed " +
"{} free disk threshold ({} free), preventing allocation",
node.nodeId(), Strings.format1Decimals(diskThresholdSettings.getFreeDiskThresholdHigh(), "%"),
Strings.format1Decimals(freeSpaceAfterShard, "%"));
return allocation.decision(Decision.NO, NAME,
"allocating the shard to this node will bring the node above the high watermark cluster setting [%s=%s] " +
"and cause it to use more disk space than the maximum allowed [%s%%] (free space after shard added: [%s%%])",
CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(),
diskThresholdSettings.getHighWatermarkRaw(), usedDiskThresholdHigh, freeSpaceAfterShard);
}
return allocation.decision(Decision.YES, NAME,
"enough disk for shard on node, free: [%s], shard size: [%s], free after allocating shard: [%s]",
new ByteSizeValue(freeBytes),
new ByteSizeValue(shardSize),
new ByteSizeValue(freeBytesAfterShard));
}
@Override
public Decision canRemain(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) {
if (shardRouting.currentNodeId().equals(node.nodeId()) == false) {
throw new IllegalArgumentException("Shard [" + shardRouting + "] is not allocated on node: [" + node.nodeId() + "]");
}
final ClusterInfo clusterInfo = allocation.clusterInfo();
final ImmutableOpenMap<String, DiskUsage> usages = clusterInfo.getNodeLeastAvailableDiskUsages();
final Decision decision = earlyTerminate(allocation, usages);
if (decision != null) {
return decision;
}
// subtractLeavingShards is passed as true here, since this is only for shards remaining, we will *eventually* have enough disk
// since shards are moving away. No new shards will be incoming since in canAllocate we pass false for this check.
final DiskUsage usage = getDiskUsage(node, allocation, usages, true);
final String dataPath = clusterInfo.getDataPath(shardRouting);
// If this node is already above the high threshold, the shard cannot remain (get it off!)
final double freeDiskPercentage = usage.getFreeDiskAsPercentage();
final long freeBytes = usage.getFreeBytes();
if (logger.isTraceEnabled()) {
logger.trace("node [{}] has {}% free disk ({} bytes)", node.nodeId(), freeDiskPercentage, freeBytes);
}
if (dataPath == null || usage.getPath().equals(dataPath) == false) {
return allocation.decision(Decision.YES, NAME,
"this shard is not allocated on the most utilized disk and can remain");
}
if (freeBytes < diskThresholdSettings.getFreeBytesThresholdHigh().getBytes()) {
if (logger.isDebugEnabled()) {
logger.debug("less than the required {} free bytes threshold ({} bytes free) on node {}, shard cannot remain",
diskThresholdSettings.getFreeBytesThresholdHigh(), freeBytes, node.nodeId());
}
return allocation.decision(Decision.NO, NAME,
"the shard cannot remain on this node because it is above the high watermark cluster setting [%s=%s] " +
"and there is less than the required [%s] free space on node, actual free: [%s]",
CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(),
diskThresholdSettings.getHighWatermarkRaw(),
diskThresholdSettings.getFreeBytesThresholdHigh(), new ByteSizeValue(freeBytes));
}
if (freeDiskPercentage < diskThresholdSettings.getFreeDiskThresholdHigh()) {
if (logger.isDebugEnabled()) {
logger.debug("less than the required {}% free disk threshold ({}% free) on node {}, shard cannot remain",
diskThresholdSettings.getFreeDiskThresholdHigh(), freeDiskPercentage, node.nodeId());
}
return allocation.decision(Decision.NO, NAME,
"the shard cannot remain on this node because it is above the high watermark cluster setting [%s=%s] " +
"and there is less than the required [%s%%] free disk on node, actual free: [%s%%]",
CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(),
diskThresholdSettings.getHighWatermarkRaw(),
diskThresholdSettings.getFreeDiskThresholdHigh(), freeDiskPercentage);
}
return allocation.decision(Decision.YES, NAME,
"there is enough disk on this node for the shard to remain, free: [%s]", new ByteSizeValue(freeBytes));
}
private DiskUsage getDiskUsage(RoutingNode node, RoutingAllocation allocation,
ImmutableOpenMap<String, DiskUsage> usages, boolean subtractLeavingShards) {
DiskUsage usage = usages.get(node.nodeId());
if (usage == null) {
// If there is no usage, and we have other nodes in the cluster,
// use the average usage for all nodes as the usage for this node
usage = averageUsage(node, usages);
if (logger.isDebugEnabled()) {
logger.debug("unable to determine disk usage for {}, defaulting to average across nodes [{} total] [{} free] [{}% free]",
node.nodeId(), usage.getTotalBytes(), usage.getFreeBytes(), usage.getFreeDiskAsPercentage());
}
}
if (diskThresholdSettings.includeRelocations()) {
long relocatingShardsSize = sizeOfRelocatingShards(node, allocation, subtractLeavingShards, usage.getPath());
DiskUsage usageIncludingRelocations = new DiskUsage(node.nodeId(), node.node().getName(), usage.getPath(),
usage.getTotalBytes(), usage.getFreeBytes() - relocatingShardsSize);
if (logger.isTraceEnabled()) {
logger.trace("usage without relocations: {}", usage);
logger.trace("usage with relocations: [{} bytes] {}", relocatingShardsSize, usageIncludingRelocations);
}
usage = usageIncludingRelocations;
}
return usage;
}
/**
* Returns a {@link DiskUsage} for the {@link RoutingNode} using the
* average usage of other nodes in the disk usage map.
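*
* For example (illustrative): given two known nodes with 100gb total / 50gb free
* and 100gb total / 30gb free, the unknown node is reported as having
* 100gb total and 40gb free.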
* @param node Node to return an averaged DiskUsage object for
* @param usages Map of nodeId to DiskUsage for all known nodes
* @return DiskUsage representing given node using the average disk usage
*/
DiskUsage averageUsage(RoutingNode node, ImmutableOpenMap<String, DiskUsage> usages) {
if (usages.size() == 0) {
return new DiskUsage(node.nodeId(), node.node().getName(), "_na_", 0, 0);
}
long totalBytes = 0;
long freeBytes = 0;
for (ObjectCursor<DiskUsage> du : usages.values()) {
totalBytes += du.value.getTotalBytes();
freeBytes += du.value.getFreeBytes();
}
return new DiskUsage(node.nodeId(), node.node().getName(), "_na_", totalBytes / usages.size(), freeBytes / usages.size());
}
/**
* Given the DiskUsage for a node and the size of the shard, return the
* percentage of free disk if the shard were to be allocated to the node.
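*
* For example (illustrative): a node with 100 bytes total and 30 bytes free that
* is assigned a 10-byte shard ends up with 20 bytes free, i.e. 20.0% free.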
* @param usage A DiskUsage for the node to have space computed for
* @param shardSize Size in bytes of the shard
* @return Percentage of free space after the shard is assigned to the node
*/
double freeDiskPercentageAfterShardAssigned(DiskUsage usage, Long shardSize) {
shardSize = (shardSize == null) ? 0 : shardSize;
DiskUsage newUsage = new DiskUsage(usage.getNodeId(), usage.getNodeName(), usage.getPath(),
usage.getTotalBytes(), usage.getFreeBytes() - shardSize);
return newUsage.getFreeDiskAsPercentage();
}
private Decision earlyTerminate(RoutingAllocation allocation, ImmutableOpenMap<String, DiskUsage> usages) {
// Always allow allocation if the decider is disabled
if (diskThresholdSettings.isEnabled() == false) {
return allocation.decision(Decision.YES, NAME, "the disk threshold decider is disabled");
}
// Allow allocation regardless if only a single data node is available
if (allocation.nodes().getDataNodes().size() <= 1) {
if (logger.isTraceEnabled()) {
logger.trace("only a single data node is present, allowing allocation");
}
return allocation.decision(Decision.YES, NAME, "there is only a single data node present");
}
// Fail open if there is no info available
final ClusterInfo clusterInfo = allocation.clusterInfo();
if (clusterInfo == null) {
if (logger.isTraceEnabled()) {
logger.trace("cluster info unavailable for disk threshold decider, allowing allocation.");
}
return allocation.decision(Decision.YES, NAME, "the cluster info is unavailable");
}
// Fail open if there are no disk usages available
if (usages.isEmpty()) {
if (logger.isTraceEnabled()) {
logger.trace("unable to determine disk usages for disk-aware allocation, allowing allocation");
}
return allocation.decision(Decision.YES, NAME, "disk usages are unavailable");
}
return null;
}
/**
* Returns the expected shard size for the given shard, or the default value provided if not enough information is available
* to estimate the shard's size.
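*
* For example (illustrative): for a shard of a shrink target index whose shard 0
* is built from four 1gb source primaries, the estimate is their 4gb sum rather
* than the size of the (still empty) target shard.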
*/
public static long getExpectedShardSize(ShardRouting shard, RoutingAllocation allocation, long defaultValue) {
final IndexMetaData metaData = allocation.metaData().getIndexSafe(shard.index());
final ClusterInfo info = allocation.clusterInfo();
if (metaData.getMergeSourceIndex() != null && shard.active() == false &&
shard.recoverySource().getType() == RecoverySource.Type.LOCAL_SHARDS) {
// in the shrink index case we sum up the source index shards since we basically make a copy of the shard in
// the worst case
long targetShardSize = 0;
final Index mergeSourceIndex = metaData.getMergeSourceIndex();
final IndexMetaData sourceIndexMeta = allocation.metaData().getIndexSafe(mergeSourceIndex);
final Set<ShardId> shardIds = IndexMetaData.selectShrinkShards(shard.id(), sourceIndexMeta, metaData.getNumberOfShards());
for (IndexShardRoutingTable shardRoutingTable : allocation.routingTable().index(mergeSourceIndex.getName())) {
if (shardIds.contains(shardRoutingTable.shardId())) {
targetShardSize += info.getShardSize(shardRoutingTable.primaryShard(), 0);
}
}
return targetShardSize == 0 ? defaultValue : targetShardSize;
} else {
return info.getShardSize(shard, defaultValue);
}
}
}