/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.cluster.routing.allocation.decider;
import com.google.common.collect.Sets;
import org.elasticsearch.ElasticsearchParseException;
import org.elasticsearch.client.Client;
import org.elasticsearch.cluster.ClusterInfo;
import org.elasticsearch.cluster.ClusterInfoService;
import org.elasticsearch.cluster.DiskUsage;
import org.elasticsearch.cluster.EmptyClusterInfoService;
import org.elasticsearch.cluster.routing.RoutingNode;
import org.elasticsearch.cluster.routing.ShardRouting;
import org.elasticsearch.cluster.routing.ShardRoutingState;
import org.elasticsearch.cluster.routing.allocation.RoutingAllocation;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.ByteSizeValue;
import org.elasticsearch.common.unit.RatioValue;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.node.settings.NodeSettingsService;
import java.util.Map;
import java.util.Set;
/**
 * The {@link DiskThresholdDecider} checks that the node a shard is potentially
 * being allocated to has enough disk space.
 *
 * It has three configurable settings, all of which can be changed dynamically:
 *
 * cluster.routing.allocation.disk.watermark.low is the low disk
 * watermark. New shards will not be allocated to a node with usage higher than
 * this, although this watermark may be passed by allocating a shard. It
 * defaults to 0.85 (85.0%).
 *
 * cluster.routing.allocation.disk.watermark.high is the high disk
 * watermark. If a node has usage higher than this, shards are not allowed to
 * remain on the node. In addition, if allocating a shard to a node causes the
 * node to pass this watermark, it will not be allowed. It defaults to
 * 0.90 (90.0%).
 *
 * Both watermark settings are expressed as a used disk percentage, or as an
 * exact byte value for free space (like "500mb").
 *
 * cluster.routing.allocation.disk.threshold_enabled is used to
 * enable or disable this decider. It defaults to true (enabled).
 */
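// Illustrative (values are hypothetical): all of the settings above can be
// changed at runtime through the cluster update-settings API, e.g.:
//
//   PUT /_cluster/settings
//   {
//     "transient": {
//       "cluster.routing.allocation.disk.watermark.low": "85%",
//       "cluster.routing.allocation.disk.watermark.high": "90%",
//       "cluster.routing.allocation.disk.threshold_enabled": true
//     }
//   }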
public class DiskThresholdDecider extends AllocationDecider {
public static final String NAME = "disk_threshold";
private volatile Double freeDiskThresholdLow;
private volatile Double freeDiskThresholdHigh;
private volatile ByteSizeValue freeBytesThresholdLow;
private volatile ByteSizeValue freeBytesThresholdHigh;
private volatile boolean includeRelocations;
private volatile boolean enabled;
private volatile TimeValue rerouteInterval;
public static final String CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED = "cluster.routing.allocation.disk.threshold_enabled";
public static final String CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK = "cluster.routing.allocation.disk.watermark.low";
public static final String CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK = "cluster.routing.allocation.disk.watermark.high";
public static final String CLUSTER_ROUTING_ALLOCATION_INCLUDE_RELOCATIONS = "cluster.routing.allocation.disk.include_relocations";
public static final String CLUSTER_ROUTING_ALLOCATION_REROUTE_INTERVAL = "cluster.routing.allocation.disk.reroute_interval";
class ApplySettings implements NodeSettingsService.Listener {
@Override
public void onRefreshSettings(Settings settings) {
String newLowWatermark = settings.get(CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK, null);
String newHighWatermark = settings.get(CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK, null);
Boolean newRelocationsSetting = settings.getAsBoolean(CLUSTER_ROUTING_ALLOCATION_INCLUDE_RELOCATIONS, null);
Boolean newEnableSetting = settings.getAsBoolean(CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED, null);
TimeValue newRerouteInterval = settings.getAsTime(CLUSTER_ROUTING_ALLOCATION_REROUTE_INTERVAL, null);
if (newEnableSetting != null) {
logger.info("updating [{}] from [{}] to [{}]", CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED,
DiskThresholdDecider.this.enabled, newEnableSetting);
DiskThresholdDecider.this.enabled = newEnableSetting;
}
if (newRelocationsSetting != null) {
logger.info("updating [{}] from [{}] to [{}]", CLUSTER_ROUTING_ALLOCATION_INCLUDE_RELOCATIONS,
DiskThresholdDecider.this.includeRelocations, newRelocationsSetting);
DiskThresholdDecider.this.includeRelocations = newRelocationsSetting;
}
if (newLowWatermark != null) {
if (!validWatermarkSetting(newLowWatermark, CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK)) {
throw new ElasticsearchParseException("unable to parse low watermark [{}]", newLowWatermark);
}
logger.info("updating [{}] to [{}]", CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK, newLowWatermark);
DiskThresholdDecider.this.freeDiskThresholdLow = 100.0 - thresholdPercentageFromWatermark(newLowWatermark);
DiskThresholdDecider.this.freeBytesThresholdLow = thresholdBytesFromWatermark(newLowWatermark, CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK);
}
if (newHighWatermark != null) {
if (!validWatermarkSetting(newHighWatermark, CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK)) {
throw new ElasticsearchParseException("unable to parse high watermark [{}]", newHighWatermark);
}
logger.info("updating [{}] to [{}]", CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK, newHighWatermark);
DiskThresholdDecider.this.freeDiskThresholdHigh = 100.0 - thresholdPercentageFromWatermark(newHighWatermark);
DiskThresholdDecider.this.freeBytesThresholdHigh = thresholdBytesFromWatermark(newHighWatermark, CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK);
}
if (newRerouteInterval != null) {
logger.info("updating [{}] to [{}]", CLUSTER_ROUTING_ALLOCATION_REROUTE_INTERVAL, newRerouteInterval);
DiskThresholdDecider.this.rerouteInterval = newRerouteInterval;
}
}
}
/**
* Listens for a node to go over the high watermark and kicks off an empty
* reroute if it does. Also responsible for logging about nodes that have
* passed the disk watermarks
*/
class DiskListener implements ClusterInfoService.Listener {
private final Client client;
private final Set<String> nodeHasPassedWatermark = Sets.newConcurrentHashSet();
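// nanoTime of the last reroute this listener triggered; used to throttle
// automatic reroutes to at most one per rerouteInterval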
private long lastRunNS;
DiskListener(Client client) {
this.client = client;
}
/**
* Warn about the given disk usage if the low or high watermark has been passed
*/
private void warnAboutDiskIfNeeded(DiskUsage usage) {
// Check absolute disk values
if (usage.getFreeBytes() < DiskThresholdDecider.this.freeBytesThresholdHigh.bytes()) {
logger.warn("high disk watermark [{}] exceeded on {}, shards will be relocated away from this node",
DiskThresholdDecider.this.freeBytesThresholdHigh, usage);
} else if (usage.getFreeBytes() < DiskThresholdDecider.this.freeBytesThresholdLow.bytes()) {
logger.info("low disk watermark [{}] exceeded on {}, replicas will not be assigned to this node",
DiskThresholdDecider.this.freeBytesThresholdLow, usage);
}
// Check percentage disk values
if (usage.getFreeDiskAsPercentage() < DiskThresholdDecider.this.freeDiskThresholdHigh) {
logger.warn("high disk watermark [{}] exceeded on {}, shards will be relocated away from this node",
Strings.format1Decimals(100.0 - DiskThresholdDecider.this.freeDiskThresholdHigh, "%"), usage);
} else if (usage.getFreeDiskAsPercentage() < DiskThresholdDecider.this.freeDiskThresholdLow) {
logger.info("low disk watermark [{}] exceeded on {}, replicas will not be assigned to this node",
Strings.format1Decimals(100.0 - DiskThresholdDecider.this.freeDiskThresholdLow, "%"), usage);
}
}
@Override
public void onNewInfo(ClusterInfo info) {
Map<String, DiskUsage> usages = info.getNodeLeastAvailableDiskUsages();
if (usages != null) {
boolean reroute = false;
String explanation = "";
// Garbage collect nodes that have been removed from the cluster
// from the map that tracks watermark crossing
Set<String> nodes = usages.keySet();
for (String node : nodeHasPassedWatermark) {
if (nodes.contains(node) == false) {
nodeHasPassedWatermark.remove(node);
}
}
for (Map.Entry<String, DiskUsage> entry : usages.entrySet()) {
String node = entry.getKey();
DiskUsage usage = entry.getValue();
warnAboutDiskIfNeeded(usage);
if (usage.getFreeBytes() < DiskThresholdDecider.this.freeBytesThresholdHigh.bytes() ||
usage.getFreeDiskAsPercentage() < DiskThresholdDecider.this.freeDiskThresholdHigh) {
if ((System.nanoTime() - lastRunNS) > DiskThresholdDecider.this.rerouteInterval.nanos()) {
lastRunNS = System.nanoTime();
reroute = true;
explanation = "high disk watermark exceeded on one or more nodes";
} else {
logger.debug("high disk watermark exceeded on {} but an automatic reroute has occurred in the last [{}], skipping reroute",
node, DiskThresholdDecider.this.rerouteInterval);
}
nodeHasPassedWatermark.add(node);
} else if (usage.getFreeBytes() < DiskThresholdDecider.this.freeBytesThresholdLow.bytes() ||
usage.getFreeDiskAsPercentage() < DiskThresholdDecider.this.freeDiskThresholdLow) {
nodeHasPassedWatermark.add(node);
} else {
if (nodeHasPassedWatermark.contains(node)) {
// The node has previously been over the high or
// low watermark, but is no longer, so we should
// reroute so any unassigned shards can be allocated
// if they are able to be
if ((System.nanoTime() - lastRunNS) > DiskThresholdDecider.this.rerouteInterval.nanos()) {
lastRunNS = System.nanoTime();
reroute = true;
explanation = "one or more nodes have gone under the high or low watermark";
nodeHasPassedWatermark.remove(node);
} else {
logger.debug("{} has gone below a disk threshold, but an automatic reroute has occurred in the last [{}], skipping reroute",
node, DiskThresholdDecider.this.rerouteInterval);
}
}
}
}
if (reroute) {
logger.info("rerouting shards: [{}]", explanation);
// Execute an empty reroute, but don't block on the response
client.admin().cluster().prepareReroute().execute();
}
}
}
}
public DiskThresholdDecider(Settings settings) {
// It's okay that the Client is null here, because the empty cluster info
// service will never actually call the listener where the client is
// needed. Also, this constructor is only used for tests
this(settings, new NodeSettingsService(settings), EmptyClusterInfoService.INSTANCE, null);
}
@Inject
public DiskThresholdDecider(Settings settings, NodeSettingsService nodeSettingsService, ClusterInfoService infoService, Client client) {
super(settings);
String lowWatermark = settings.get(CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK, "85%");
String highWatermark = settings.get(CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK, "90%");
if (!validWatermarkSetting(lowWatermark, CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK)) {
throw new ElasticsearchParseException("unable to parse low watermark [{}]", lowWatermark);
}
if (!validWatermarkSetting(highWatermark, CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK)) {
throw new ElasticsearchParseException("unable to parse high watermark [{}]", highWatermark);
}
// Watermark is expressed in terms of used data, but we need "free" data watermark
this.freeDiskThresholdLow = 100.0 - thresholdPercentageFromWatermark(lowWatermark);
this.freeDiskThresholdHigh = 100.0 - thresholdPercentageFromWatermark(highWatermark);
this.freeBytesThresholdLow = thresholdBytesFromWatermark(lowWatermark, CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK);
this.freeBytesThresholdHigh = thresholdBytesFromWatermark(highWatermark, CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK);
this.includeRelocations = settings.getAsBoolean(CLUSTER_ROUTING_ALLOCATION_INCLUDE_RELOCATIONS, true);
this.rerouteInterval = settings.getAsTime(CLUSTER_ROUTING_ALLOCATION_REROUTE_INTERVAL, TimeValue.timeValueSeconds(60));
this.enabled = settings.getAsBoolean(CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED, true);
nodeSettingsService.addListener(new ApplySettings());
infoService.addListener(new DiskListener(client));
}
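// Example startup configuration (values shown are the defaults above) as it
// might appear in elasticsearch.yml, picked up by this constructor:
//
//   cluster.routing.allocation.disk.threshold_enabled: true
//   cluster.routing.allocation.disk.watermark.low: 85%
//   cluster.routing.allocation.disk.watermark.high: 90%
//   cluster.routing.allocation.disk.include_relocations: true
//   cluster.routing.allocation.disk.reroute_interval: 60s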
// For Testing
ApplySettings newApplySettings() {
return new ApplySettings();
}
// For Testing
public Double getFreeDiskThresholdLow() {
return freeDiskThresholdLow;
}
// For Testing
public Double getFreeDiskThresholdHigh() {
return freeDiskThresholdHigh;
}
// For Testing
public Double getUsedDiskThresholdLow() {
return 100.0 - freeDiskThresholdLow;
}
// For Testing
public Double getUsedDiskThresholdHigh() {
return 100.0 - freeDiskThresholdHigh;
}
// For Testing
public ByteSizeValue getFreeBytesThresholdLow() {
return freeBytesThresholdLow;
}
// For Testing
public ByteSizeValue getFreeBytesThresholdHigh() {
return freeBytesThresholdHigh;
}
// For Testing
public boolean isIncludeRelocations() {
return includeRelocations;
}
// For Testing
public boolean isEnabled() {
return enabled;
}
// For Testing
public TimeValue getRerouteInterval() {
return rerouteInterval;
}
/**
* Returns the size of all shards that are currently being relocated to
* the node, but may not be finished transferring yet.
*
* If subtractShardsMovingAway is true then the size of shards moving away is subtracted from the total size of all shards
*/
static long sizeOfRelocatingShards(RoutingNode node, ClusterInfo clusterInfo,
boolean subtractShardsMovingAway, String dataPath) {
long totalSize = 0;
for (ShardRouting routing : node.shardsWithState(ShardRoutingState.RELOCATING, ShardRoutingState.INITIALIZING)) {
String actualPath = clusterInfo.getDataPath(routing);
if (dataPath.equals(actualPath)) {
if (routing.initializing() && routing.relocatingNodeId() != null) {
totalSize += getShardSize(routing, clusterInfo);
} else if (subtractShardsMovingAway && routing.relocating()) {
totalSize -= getShardSize(routing, clusterInfo);
}
}
}
return totalSize;
}
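// Worked example (hypothetical sizes): with two INITIALIZING shards of 100mb
// and 200mb relocating onto this node and one RELOCATING shard of 50mb leaving
// it, all on the given data path, this returns 300mb when
// subtractShardsMovingAway is false and 250mb when it is true.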
static long getShardSize(ShardRouting routing, ClusterInfo clusterInfo) {
Long shardSize = clusterInfo.getShardSize(routing);
return shardSize == null ? 0 : shardSize;
}
@Override
public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) {
ClusterInfo clusterInfo = allocation.clusterInfo();
Map<String, DiskUsage> usages = clusterInfo.getNodeMostAvailableDiskUsages();
final Decision decision = earlyTerminate(allocation, usages);
if (decision != null) {
return decision;
}
final double usedDiskThresholdLow = 100.0 - DiskThresholdDecider.this.freeDiskThresholdLow;
final double usedDiskThresholdHigh = 100.0 - DiskThresholdDecider.this.freeDiskThresholdHigh;
// subtractLeavingShards is passed as false here, because shards that are leaving still use disk space,
// and therefore we should be extra careful and take their size into account
DiskUsage usage = getDiskUsage(node, allocation, usages, false);
// First, check whether the node is currently over the low watermark
double freeDiskPercentage = usage.getFreeDiskAsPercentage();
// Cache the used disk percentage for displaying disk percentages consistent with documentation
double usedDiskPercentage = usage.getUsedDiskAsPercentage();
long freeBytes = usage.getFreeBytes();
if (logger.isTraceEnabled()) {
logger.trace("node [{}] has {}% used disk", node.nodeId(), usedDiskPercentage);
}
// a flag for whether the primary shard has been previously allocated
boolean primaryHasBeenAllocated = shardRouting.primary() && shardRouting.allocatedPostIndexCreate();
// checks for exact byte comparisons
if (freeBytes < freeBytesThresholdLow.bytes()) {
// If the shard is a replica or has a primary that has already been allocated before, check the low threshold
if (!shardRouting.primary() || (shardRouting.primary() && primaryHasBeenAllocated)) {
if (logger.isDebugEnabled()) {
logger.debug("less than the required {} free bytes threshold ({} bytes free) on node {}, preventing allocation",
freeBytesThresholdLow, freeBytes, node.nodeId());
}
return allocation.decision(Decision.NO, NAME, "less than required [%s] free on node, free: [%s]",
freeBytesThresholdLow, new ByteSizeValue(freeBytes));
} else if (freeBytes > freeBytesThresholdHigh.bytes()) {
// Allow the shard to be allocated because it is primary that
// has never been allocated if it's under the high watermark
if (logger.isDebugEnabled()) {
logger.debug("less than the required {} free bytes threshold ({} bytes free) on node {}, " +
"but allowing allocation because primary has never been allocated",
freeBytesThresholdLow, freeBytes, node.nodeId());
}
return allocation.decision(Decision.YES, NAME, "primary has never been allocated before");
} else {
// Even though the primary has never been allocated, the node is
// above the high watermark, so don't allow allocating the shard
if (logger.isDebugEnabled()) {
logger.debug("less than the required {} free bytes threshold ({} bytes free) on node {}, " +
"preventing allocation even though primary has never been allocated",
freeBytesThresholdHigh, freeBytes, node.nodeId());
}
return allocation.decision(Decision.NO, NAME, "less than required [%s] free on node, free: [%s]",
freeBytesThresholdHigh, new ByteSizeValue(freeBytes));
}
}
// checks for percentage comparisons
if (freeDiskPercentage < freeDiskThresholdLow) {
// If the shard is a replica or has a primary that has already been allocated before, check the low threshold
if (!shardRouting.primary() || (shardRouting.primary() && primaryHasBeenAllocated)) {
if (logger.isDebugEnabled()) {
logger.debug("more than the allowed {} used disk threshold ({} used) on node [{}], preventing allocation",
Strings.format1Decimals(usedDiskThresholdLow, "%"),
Strings.format1Decimals(usedDiskPercentage, "%"), node.nodeId());
}
return allocation.decision(Decision.NO, NAME, "more than allowed [%s%%] used disk on node, free: [%s%%]",
usedDiskThresholdLow, freeDiskPercentage);
} else if (freeDiskPercentage > freeDiskThresholdHigh) {
// Allow the shard to be allocated because it is primary that
// has never been allocated if it's under the high watermark
if (logger.isDebugEnabled()) {
logger.debug("more than the allowed {} used disk threshold ({} used) on node [{}], " +
"but allowing allocation because primary has never been allocated",
Strings.format1Decimals(usedDiskThresholdLow, "%"),
Strings.format1Decimals(usedDiskPercentage, "%"), node.nodeId());
}
return allocation.decision(Decision.YES, NAME, "primary has never been allocated before");
} else {
// Even though the primary has never been allocated, the node is
// above the high watermark, so don't allow allocating the shard
if (logger.isDebugEnabled()) {
logger.debug("less than the required {} free bytes threshold ({} bytes free) on node {}, " +
"preventing allocation even though primary has never been allocated",
Strings.format1Decimals(freeDiskThresholdHigh, "%"),
Strings.format1Decimals(freeDiskPercentage, "%"), node.nodeId());
}
return allocation.decision(Decision.NO, NAME, "more than allowed [%s%%] used disk on node, free: [%s%%]",
usedDiskThresholdHigh, freeDiskPercentage);
}
}
// Secondly, check that allocating the shard to this node doesn't put it above the high watermark
final long shardSize = getShardSize(shardRouting, allocation.clusterInfo());
double freeSpaceAfterShard = freeDiskPercentageAfterShardAssigned(usage, shardSize);
long freeBytesAfterShard = freeBytes - shardSize;
if (freeBytesAfterShard < freeBytesThresholdHigh.bytes()) {
logger.warn("after allocating, node [{}] would have less than the required {} free bytes threshold ({} bytes free), preventing allocation",
node.nodeId(), freeBytesThresholdHigh, freeBytesAfterShard);
return allocation.decision(Decision.NO, NAME, "after allocation less than required [%s] free on node, free: [%s]",
freeBytesThresholdLow, new ByteSizeValue(freeBytesAfterShard));
}
if (freeSpaceAfterShard < freeDiskThresholdHigh) {
logger.warn("after allocating, node [{}] would have more than the allowed {} free disk threshold ({} free), preventing allocation",
node.nodeId(), Strings.format1Decimals(freeDiskThresholdHigh, "%"), Strings.format1Decimals(freeSpaceAfterShard, "%"));
return allocation.decision(Decision.NO, NAME, "after allocation more than allowed [%s%%] used disk on node, free: [%s%%]",
usedDiskThresholdLow, freeSpaceAfterShard);
}
return allocation.decision(Decision.YES, NAME, "enough disk for shard on node, free: [%s]", new ByteSizeValue(freeBytes));
}
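// Worked example (hypothetical numbers): with the default 90% high watermark
// (freeDiskThresholdHigh = 10%), a node with 100gb total and 16gb free passes
// the watermark checks above, but allocating an 8gb shard would leave 8gb
// (8% free), so the "after allocation" check returns a NO decision.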
@Override
public Decision canRemain(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) {
if (shardRouting.currentNodeId().equals(node.nodeId()) == false) {
throw new IllegalArgumentException("Shard [" + shardRouting + "] is not allocated on node: [" + node.nodeId() + "]");
}
final ClusterInfo clusterInfo = allocation.clusterInfo();
final Map<String, DiskUsage> usages = clusterInfo.getNodeLeastAvailableDiskUsages();
final Decision decision = earlyTerminate(allocation, usages);
if (decision != null) {
return decision;
}
// subtractLeavingShards is passed as true here, since this is only for shards remaining, we will *eventually* have enough disk
// since shards are moving away. No new shards will be incoming since in canAllocate we pass false for this check.
final DiskUsage usage = getDiskUsage(node, allocation, usages, true);
final String dataPath = clusterInfo.getDataPath(shardRouting);
// If this node is already above the high threshold, the shard cannot remain (get it off!)
final double freeDiskPercentage = usage.getFreeDiskAsPercentage();
final long freeBytes = usage.getFreeBytes();
if (logger.isTraceEnabled()) {
logger.trace("node [{}] has {}% free disk ({} bytes)", node.nodeId(), freeDiskPercentage, freeBytes);
}
if (dataPath == null || usage.getPath().equals(dataPath) == false) {
return allocation.decision(Decision.YES, NAME, "shard is not allocated on the most utilized disk");
}
if (freeBytes < freeBytesThresholdHigh.bytes()) {
if (logger.isDebugEnabled()) {
logger.debug("less than the required {} free bytes threshold ({} bytes free) on node {}, shard cannot remain",
freeBytesThresholdHigh, freeBytes, node.nodeId());
}
return allocation.decision(Decision.NO, NAME, "after allocation less than required [%s] free on node, free: [%s]",
freeBytesThresholdHigh, new ByteSizeValue(freeBytes));
}
if (freeDiskPercentage < freeDiskThresholdHigh) {
if (logger.isDebugEnabled()) {
logger.debug("less than the required {}% free disk threshold ({}% free) on node {}, shard cannot remain",
freeDiskThresholdHigh, freeDiskPercentage, node.nodeId());
}
return allocation.decision(Decision.NO, NAME, "after allocation less than required [%s%%] free disk on node, free: [%s%%]",
freeDiskThresholdHigh, freeDiskPercentage);
}
return allocation.decision(Decision.YES, NAME, "enough disk for shard to remain on node, free: [%s]", new ByteSizeValue(freeBytes));
}
private DiskUsage getDiskUsage(RoutingNode node, RoutingAllocation allocation,
Map<String, DiskUsage> usages, boolean subtractLeavingShards) {
DiskUsage usage = usages.get(node.nodeId());
if (usage == null) {
// If there is no usage, and we have other nodes in the cluster,
// use the average usage for all nodes as the usage for this node
usage = averageUsage(node, usages);
if (logger.isDebugEnabled()) {
logger.debug("unable to determine disk usage for {}, defaulting to average across nodes [{} total] [{} free] [{}% free]",
node.nodeId(), usage.getTotalBytes(), usage.getFreeBytes(), usage.getFreeDiskAsPercentage());
}
}
if (includeRelocations) {
long relocatingShardsSize = sizeOfRelocatingShards(node, allocation.clusterInfo(), subtractLeavingShards, usage.getPath());
DiskUsage usageIncludingRelocations = new DiskUsage(node.nodeId(), node.node().name(), usage.getPath(),
usage.getTotalBytes(), usage.getFreeBytes() - relocatingShardsSize);
if (logger.isTraceEnabled()) {
logger.trace("usage without relocations: {}", usage);
logger.trace("usage with relocations: [{} bytes] {}", relocatingShardsSize, usageIncludingRelocations);
}
usage = usageIncludingRelocations;
}
return usage;
}
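// Illustrative: if a node reports 100gb total / 20gb free and has 5gb of
// incoming relocations on the same data path, the usage returned here (with
// include_relocations enabled) is 100gb total / 15gb free.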
/**
* Returns a {@link DiskUsage} for the {@link RoutingNode} using the
* average usage of other nodes in the disk usage map.
* @param node Node to return an averaged DiskUsage object for
* @param usages Map of nodeId to DiskUsage for all known nodes
* @return DiskUsage representing given node using the average disk usage
*/
public DiskUsage averageUsage(RoutingNode node, Map<String, DiskUsage> usages) {
if (usages.size() == 0) {
return new DiskUsage(node.nodeId(), node.node().name(), "_na_", 0, 0);
}
long totalBytes = 0;
long freeBytes = 0;
for (DiskUsage du : usages.values()) {
totalBytes += du.getTotalBytes();
freeBytes += du.getFreeBytes();
}
return new DiskUsage(node.nodeId(), node.node().name(), "_na_", totalBytes / usages.size(), freeBytes / usages.size());
}
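// Illustrative: for two known nodes with (total, free) of (100, 40) and
// (200, 80) bytes, the unknown node is assigned (150, 60). Note the integer
// division: averages are truncated, not rounded.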
/**
* Given the DiskUsage for a node and the size of the shard, return the
* percentage of free disk if the shard were to be allocated to the node.
* @param usage A DiskUsage for the node to have space computed for
* @param shardSize Size in bytes of the shard
* @return Percentage of free space after the shard is assigned to the node
*/
public double freeDiskPercentageAfterShardAssigned(DiskUsage usage, Long shardSize) {
shardSize = (shardSize == null) ? 0 : shardSize;
DiskUsage newUsage = new DiskUsage(usage.getNodeId(), usage.getNodeName(), usage.getPath(),
usage.getTotalBytes(), usage.getFreeBytes() - shardSize);
return newUsage.getFreeDiskAsPercentage();
}
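// Worked example: a node with 100gb total and 30gb free receiving a 10gb shard
// would be left with 20gb free, so this returns 20.0 (percent).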
/**
* Attempts to parse the watermark into a percentage, returning 100.0% if
* it cannot be parsed.
*/
public double thresholdPercentageFromWatermark(String watermark) {
try {
return RatioValue.parseRatioValue(watermark).getAsPercent();
} catch (ElasticsearchParseException ex) {
// NOTE: this is not end-user leniency, since up above we check that it's a valid byte or percentage, and then store the two cases separately
return 100.0;
}
}
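// Examples: "85%" and "0.85" both yield 85.0; a byte value such as "500mb"
// fails ratio parsing and falls back to 100.0 (the byte case is handled by
// thresholdBytesFromWatermark below).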
/**
* Attempts to parse the watermark into a {@link ByteSizeValue}, returning
* a ByteSizeValue of 0 bytes if the value cannot be parsed.
*/
public ByteSizeValue thresholdBytesFromWatermark(String watermark, String settingName) {
try {
return ByteSizeValue.parseBytesSizeValue(watermark, settingName);
} catch (ElasticsearchParseException ex) {
// NOTE: this is not end-user leniency, since up above we check that it's a valid byte or percentage, and then store the two cases separately
return ByteSizeValue.parseBytesSizeValue("0b", settingName);
}
}
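// Examples: "500mb" parses to a 500mb ByteSizeValue; a percentage such as
// "85%" fails byte-size parsing and falls back to 0 bytes (the percentage
// case is handled by thresholdPercentageFromWatermark above).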
/**
* Checks if a watermark string is a valid percentage or byte size value,
* returning true if valid, false if invalid.
*/
public boolean validWatermarkSetting(String watermark, String settingName) {
try {
RatioValue.parseRatioValue(watermark);
return true;
} catch (ElasticsearchParseException e) {
try {
ByteSizeValue.parseBytesSizeValue(watermark, settingName);
return true;
} catch (ElasticsearchParseException ex) {
return false;
}
}
}
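// Examples: "85%", "0.85" and "500mb" are all valid watermark values; a string
// like "twenty" fails both parsers and is rejected.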
private Decision earlyTerminate(RoutingAllocation allocation, final Map<String, DiskUsage> usages) {
// Always allow allocation if the decider is disabled
if (!enabled) {
return allocation.decision(Decision.YES, NAME, "disk threshold decider disabled");
}
// Allow allocation regardless if only a single data node is available
if (allocation.nodes().dataNodes().size() <= 1) {
if (logger.isTraceEnabled()) {
logger.trace("only a single data node is present, allowing allocation");
}
return allocation.decision(Decision.YES, NAME, "only a single data node is present");
}
// Fail open if there is no info available
final ClusterInfo clusterInfo = allocation.clusterInfo();
if (clusterInfo == null) {
if (logger.isTraceEnabled()) {
logger.trace("cluster info unavailable for disk threshold decider, allowing allocation.");
}
return allocation.decision(Decision.YES, NAME, "cluster info unavailable");
}
// Fail open if there are no disk usages available
if (usages.isEmpty()) {
if (logger.isTraceEnabled()) {
logger.trace("unable to determine disk usages for disk-aware allocation, allowing allocation");
}
return allocation.decision(Decision.YES, NAME, "disk usages unavailable");
}
return null;
}
}