// org.elasticsearch.cluster.routing.allocation.decider.AwarenessAllocationDecider
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the Server Side Public License, v 1; you may not use this file except
* in compliance with, at your election, the Elastic License 2.0 or the Server
* Side Public License, v 1.
*/
package org.elasticsearch.cluster.routing.allocation.decider;
import org.elasticsearch.cluster.metadata.IndexMetadata;
import org.elasticsearch.cluster.routing.RoutingNode;
import org.elasticsearch.cluster.routing.ShardRouting;
import org.elasticsearch.cluster.routing.allocation.RoutingAllocation;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.settings.ClusterSettings;
import org.elasticsearch.common.settings.Setting;
import org.elasticsearch.common.settings.Setting.Property;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.settings.SettingsException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.function.Function;
import java.util.stream.Stream;
import static java.util.Collections.emptyList;
import static java.util.stream.Collectors.toList;
import static org.elasticsearch.cluster.metadata.IndexMetadata.INDEX_AUTO_EXPAND_REPLICAS_SETTING;
/**
* This {@link AllocationDecider} controls shard allocation based on
* {@code awareness} key-value pairs defined in the node configuration.
* Awareness explicitly controls where replicas should be allocated based on
* attributes like node or physical rack locations. Awareness attributes accept
* arbitrary configuration keys like a rack data-center identifier. For example
* the setting:
* <pre>
* cluster.routing.allocation.awareness.attributes: rack_id
* </pre>
* <p>
* will cause allocations to be distributed over different racks such that,
* ideally, the copies of each shard are not all allocated to the same rack.
* To enable allocation awareness in this example nodes should contain a value
* for the {@code rack_id} key like:
* <pre>
* node.attr.rack_id: 1
* </pre>
* <p>
* Awareness can also be used to prevent over-allocation in the case of node or
* even "zone" failure. For example in cloud-computing infrastructures like
* Amazon AWS a cluster might span over multiple "zones". Awareness can be used
* to distribute replicas to individual zones by setting:
* <pre>
* cluster.routing.allocation.awareness.attributes: zone
* </pre>
* <p>
* and forcing allocation to be aware of the zones the data may reside in:
* <pre>
* cluster.routing.allocation.awareness.force.zone.values: zone1,zone2
* </pre>
* <p>
* In contrast to regular awareness this setting will prevent over-allocation on
* {@code zone1} even if {@code zone2} fails partially or becomes entirely
* unavailable. Nodes that belong to a certain zone / group should be started
* with the zone id configured on the node-level settings like:
* <pre>
* node.attr.zone: zone1
* </pre>
*/
public class AwarenessAllocationDecider extends AllocationDecider {
public static final String NAME = "awareness";
public static final Setting> CLUSTER_ROUTING_ALLOCATION_AWARENESS_ATTRIBUTE_SETTING = Setting.listSetting(
"cluster.routing.allocation.awareness.attributes",
emptyList(),
Function.identity(),
Property.Dynamic,
Property.NodeScope
);
private static final String FORCE_GROUP_SETTING_PREFIX = "cluster.routing.allocation.awareness.force.";
public static final Setting CLUSTER_ROUTING_ALLOCATION_AWARENESS_FORCE_GROUP_SETTING = Setting.groupSetting(
FORCE_GROUP_SETTING_PREFIX,
AwarenessAllocationDecider::validateForceAwarenessSettings,
Property.Dynamic,
Property.NodeScope
);
private volatile List awarenessAttributes;
private volatile Map> forcedAwarenessAttributes;
/**
 * Creates the decider, taking the initial awareness configuration from {@code settings} and registering
 * update consumers on {@code clusterSettings} so later changes to either setting are picked up.
 *
 * @param settings        settings from which the initial attribute list and forced values are read
 * @param clusterSettings registry on which the update consumers for both settings are registered
 */
public AwarenessAllocationDecider(Settings settings, ClusterSettings clusterSettings) {
    // plain awareness attributes: initial value first, then the update hook
    setAwarenessAttributes(CLUSTER_ROUTING_ALLOCATION_AWARENESS_ATTRIBUTE_SETTING.get(settings));
    clusterSettings.addSettingsUpdateConsumer(
        CLUSTER_ROUTING_ALLOCATION_AWARENESS_ATTRIBUTE_SETTING,
        this::setAwarenessAttributes
    );

    // forced awareness values: initial value first, then the update hook
    setForcedAwarenessAttributes(CLUSTER_ROUTING_ALLOCATION_AWARENESS_FORCE_GROUP_SETTING.get(settings));
    clusterSettings.addSettingsUpdateConsumer(
        CLUSTER_ROUTING_ALLOCATION_AWARENESS_FORCE_GROUP_SETTING,
        this::setForcedAwarenessAttributes
    );
}
/**
 * Rebuilds the forced-awareness lookup from the given group settings and stores it in
 * {@code forcedAwarenessAttributes}.
 *
 * @param forceSettings settings grouped by awareness attribute name; each group's {@code values} list holds
 *                      the forced values for that attribute
 */
private void setForcedAwarenessAttributes(Settings forceSettings) {
    final Map<String, List<String>> forcedValuesByAttribute = new HashMap<>();
    for (Map.Entry<String, Settings> group : forceSettings.getAsGroups().entrySet()) {
        final List<String> values = group.getValue().getAsList("values");
        // attributes with no forced values are left out of the map entirely
        if (values.isEmpty() == false) {
            forcedValuesByAttribute.put(group.getKey(), values);
        }
    }
    // the map is fully built before it is stored in the volatile field
    this.forcedAwarenessAttributes = forcedValuesByAttribute;
}
/**
 * Replaces the list of awareness attribute names used by this decider.
 *
 * @param awarenessAttributes the new attribute names
 */
private void setAwarenessAttributes(List<String> awarenessAttributes) {
    this.awarenessAttributes = awarenessAttributes;
}
/**
 * Checks the awareness constraints for allocating {@code shardRouting} to {@code node}, evaluating the
 * per-attribute-value copy count as if the shard were moved to that node.
 */
@Override
public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) {
    final boolean moveToNode = true;
    return underCapacity(shardRouting, node, allocation, moveToNode);
}
/**
 * Applies the regular awareness check even while a node is being replaced.
 * <p>
 * The awareness criteria still have to hold during a replacement, otherwise all copies of a shard could end
 * up on the same host/rack/AZ; this therefore delegates to {@link #canAllocate} so that constraint is not
 * violated.
 */
@Override
public Decision canForceAllocateDuringReplace(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) {
    final Decision awarenessDecision = canAllocate(shardRouting, node, allocation);
    return awarenessDecision;
}
/**
 * Checks the awareness constraints for leaving {@code shardRouting} on {@code node}, evaluating the
 * per-attribute-value copy count as it currently is (no extra copy is added for a move).
 */
@Override
public Decision canRemain(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) {
    final boolean moveToNode = false;
    return underCapacity(shardRouting, node, allocation, moveToNode);
}
/** Returned when no awareness attributes are configured. */
private static final Decision YES_NOT_ENABLED = Decision.single(
    Decision.Type.YES,
    NAME,
    "allocation awareness is not enabled, set cluster setting ["
        + CLUSTER_ROUTING_ALLOCATION_AWARENESS_ATTRIBUTE_SETTING.getKey()
        + "] to enable it"
);

/** Returned when the index auto-expands its replicas to all nodes. */
private static final Decision YES_AUTO_EXPAND_ALL = Decision.single(
    Decision.Type.YES,
    NAME,
    "allocation awareness is ignored, this index is set to auto-expand to all nodes"
);

/** Returned when every configured awareness attribute has been checked without exceeding its limit. */
private static final Decision YES_ALL_MET = Decision.single(
    Decision.Type.YES,
    NAME,
    "node meets all awareness attribute requirements"
);
/**
 * Decides whether the number of copies of a shard on nodes sharing {@code node}'s value of each awareness
 * attribute stays within {@code ceil(shardCount / valueCount)}.
 *
 * @param shardRouting the shard copy being considered
 * @param node         the node the shard would be allocated to, or currently resides on
 * @param allocation   the current allocation state
 * @param moveToNode   {@code true} to count the shard as if it were placed on {@code node} ({@code canAllocate}),
 *                     {@code false} to evaluate the current placement as is ({@code canRemain})
 * @return a YES decision if awareness is disabled, ignored for the index, or satisfied for every attribute;
 *         otherwise a NO decision, with an explanation only when {@code allocation.debugDecision()} is set
 */
private Decision underCapacity(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation, boolean moveToNode) {
    // Read each volatile field once so the whole decision works on a single snapshot of the settings.
    final List<String> currentAwarenessAttributes = this.awarenessAttributes;
    if (currentAwarenessAttributes.isEmpty()) {
        return YES_NOT_ENABLED;
    }
    final Map<String, List<String>> currentForcedAttributes = this.forcedAwarenessAttributes;

    final boolean debug = allocation.debugDecision();
    final IndexMetadata indexMetadata = allocation.metadata().getIndexSafe(shardRouting.index());

    if (INDEX_AUTO_EXPAND_REPLICAS_SETTING.get(indexMetadata.getSettings()).expandToAllNodes()) {
        return YES_AUTO_EXPAND_ALL;
    }

    final int shardCount = indexMetadata.getNumberOfReplicas() + 1; // 1 for primary
    final Map<String, String> nodeAttributes = node.node().getAttributes();

    for (String awarenessAttribute : currentAwarenessAttributes) {
        // the node the shard exists on must be associated with an awareness attribute
        if (nodeAttributes.containsKey(awarenessAttribute) == false) {
            return debug ? debugNoMissingAttribute(awarenessAttribute, currentAwarenessAttributes) : Decision.NO;
        }

        final Set<String> actualAttributeValues = allocation.routingNodes().getAttributeValues(awarenessAttribute);
        final String targetAttributeValue = nodeAttributes.get(awarenessAttribute);
        assert targetAttributeValue != null : "attribute [" + awarenessAttribute + "] missing on " + node.node();
        assert actualAttributeValues.contains(targetAttributeValue)
            : "attribute [" + awarenessAttribute + "] on " + node.node() + " is not in " + actualAttributeValues;

        final int shardsForTargetAttributeValue = countCopiesForAttributeValue(
            shardRouting,
            allocation,
            awarenessAttribute,
            targetAttributeValue,
            moveToNode
        );

        // Forced values count towards the number of distinct values even if no node currently carries them.
        final List<String> forcedValues = currentForcedAttributes.get(awarenessAttribute);
        final int valueCount = forcedValues == null
            ? actualAttributeValues.size()
            : Math.toIntExact(Stream.concat(actualAttributeValues.stream(), forcedValues.stream()).distinct().count());

        final int maximumShardsPerAttributeValue = (shardCount + valueCount - 1) / valueCount; // ceil(shardCount/valueCount)
        if (shardsForTargetAttributeValue > maximumShardsPerAttributeValue) {
            return debug
                ? debugNoTooManyCopies(
                    shardCount,
                    awarenessAttribute,
                    targetAttributeValue,
                    valueCount,
                    actualAttributeValues.stream().sorted().collect(toList()),
                    forcedValues == null ? null : forcedValues.stream().sorted().collect(toList()),
                    shardsForTargetAttributeValue,
                    maximumShardsPerAttributeValue
                )
                : Decision.NO;
        }
    }

    return YES_ALL_MET;
}

/**
 * Counts the copies of {@code shardRouting}'s shard on nodes whose {@code awarenessAttribute} equals
 * {@code targetAttributeValue}, optionally including the copy that would be moved to the target node.
 */
private static int countCopiesForAttributeValue(
    ShardRouting shardRouting,
    RoutingAllocation allocation,
    String awarenessAttribute,
    String targetAttributeValue,
    boolean moveToNode
) {
    int copies = 0;
    for (ShardRouting assignedShard : allocation.routingNodes().assignedShards(shardRouting.shardId())) {
        if (assignedShard.started() || assignedShard.initializing()) {
            // Note: this also counts relocation targets as that will be the new location of the shard.
            // Relocation sources should not be counted as the shard is moving away
            final RoutingNode assignedNode = allocation.routingNodes().node(assignedShard.currentNodeId());
            if (targetAttributeValue.equals(assignedNode.node().getAttributes().get(awarenessAttribute))) {
                copies += 1;
            }
        }
    }

    if (moveToNode) {
        if (shardRouting.assignedToNode()) {
            final RoutingNode currentNode = allocation.routingNodes()
                .node(shardRouting.relocating() ? shardRouting.relocatingNodeId() : shardRouting.currentNodeId());
            if (targetAttributeValue.equals(currentNode.node().getAttributes().get(awarenessAttribute)) == false) {
                copies += 1;
            } // else this shard is already on a node in the same zone as the target node, so moving it doesn't change the count
        } else {
            copies += 1;
        }
    }
    return copies;
}
/**
 * Builds the explained NO decision for the case where too many copies of a shard would share one attribute value.
 *
 * @param shardCount         total number of copies of the shard
 * @param attributeName      the awareness attribute being checked
 * @param attributeValue     the value of that attribute named in the message
 * @param numberOfAttributes number of distinct values reported for the attribute
 * @param realAttributes     the attribute values reported as coming from nodes in the cluster
 * @param forcedAttributes   the forced awareness values, or {@code null} when there are none
 * @param actualShardCount   number of copies that would be on nodes with {@code attributeValue}
 * @param maximumShardCount  maximum number of copies allowed per attribute value
 * @return a NO decision carrying the formatted explanation
 */
private static Decision debugNoTooManyCopies(
    int shardCount,
    String attributeName,
    String attributeValue,
    int numberOfAttributes,
    List<String> realAttributes,
    List<String> forcedAttributes,
    int actualShardCount,
    int maximumShardCount
) {
    final String forcedDescription = forcedAttributes == null
        ? "no forced awareness"
        : forcedAttributes + " from forced awareness";
    return Decision.single(
        Decision.Type.NO,
        NAME,
        "there are [%d] copies of this shard and [%d] values for attribute [%s] (%s from nodes in the cluster and %s) so there "
            + "may be at most [%d] copies of this shard allocated to nodes with each value, but (including this copy) there "
            + "would be [%d] copies allocated to nodes with [node.attr.%s: %s]",
        shardCount,
        numberOfAttributes,
        attributeName,
        realAttributes,
        forcedDescription,
        maximumShardCount,
        actualShardCount,
        attributeName,
        attributeValue
    );
}
/**
 * Builds the explained NO decision for a node that lacks one of the configured awareness attributes.
 *
 * @param awarenessAttribute  the attribute named in the message as missing
 * @param awarenessAttributes all configured awareness attributes, listed in the message
 * @return a NO decision naming the missing attribute and the configured setting value
 */
private static Decision debugNoMissingAttribute(String awarenessAttribute, List<String> awarenessAttributes) {
    return Decision.single(
        Decision.Type.NO,
        NAME,
        "node does not contain the awareness attribute [%s]; required attributes cluster setting [%s=%s]",
        awarenessAttribute,
        CLUSTER_ROUTING_ALLOCATION_AWARENESS_ATTRIBUTE_SETTING.getKey(),
        Strings.collectionToCommaDelimitedString(awarenessAttributes)
    );
}
/**
 * Validates the forced-awareness group settings: every group may only contain the key {@code values}.
 *
 * @param forceSettings the settings found under {@code FORCE_GROUP_SETTING_PREFIX}
 * @throws IllegalArgumentException if the settings cannot be split into groups, or if a group contains a key
 *                                  other than {@code values}
 */
private static void validateForceAwarenessSettings(Settings forceSettings) {
    final Map<String, Settings> settingGroups;
    try {
        settingGroups = forceSettings.getAsGroups();
    } catch (SettingsException e) {
        // rethrow as IllegalArgumentException, keeping the original exception as the cause
        throw new IllegalArgumentException("invalid forced awareness settings with prefix [" + FORCE_GROUP_SETTING_PREFIX + "]", e);
    }
    for (Map.Entry<String, Settings> entry : settingGroups.entrySet()) {
        // report the first key in the group that is not "values"
        final Optional<String> notValues = entry.getValue().keySet().stream().filter(s -> s.equals("values") == false).findFirst();
        if (notValues.isPresent()) {
            throw new IllegalArgumentException(
                "invalid forced awareness setting [" + FORCE_GROUP_SETTING_PREFIX + entry.getKey() + "." + notValues.get() + "]"
            );
        }
    }
}
}