org.elasticsearch.cluster.routing.allocation.decider.AwarenessAllocationDecider Maven / Gradle / Ivy

Go to download
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0 and the Server Side Public License, v 1; you may not use this file except
 * in compliance with, at your election, the Elastic License 2.0 or the Server
 * Side Public License, v 1.
 */

package org.elasticsearch.cluster.routing.allocation.decider;

import org.elasticsearch.cluster.metadata.IndexMetadata;
import org.elasticsearch.cluster.routing.RoutingNode;
import org.elasticsearch.cluster.routing.ShardRouting;
import org.elasticsearch.cluster.routing.allocation.RoutingAllocation;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.settings.ClusterSettings;
import org.elasticsearch.common.settings.Setting;
import org.elasticsearch.common.settings.Setting.Property;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.settings.SettingsException;

import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.function.Function;
import java.util.stream.Stream;

import static java.util.Collections.emptyList;
import static java.util.stream.Collectors.toList;
import static org.elasticsearch.cluster.metadata.IndexMetadata.INDEX_AUTO_EXPAND_REPLICAS_SETTING;

/**
 * This {@link AllocationDecider} controls shard allocation based on
 * {@code awareness} key-value pairs defined in the node configuration.
 * Awareness explicitly controls where replicas should be allocated based on
 * attributes like node or physical rack locations. Awareness attributes accept
 * arbitrary configuration keys like a rack data-center identifier. For example
 * the setting:
 * <pre>
 * cluster.routing.allocation.awareness.attributes: rack_id
 * </pre>
 * <p>
 * will cause allocations to be distributed over different racks such that
 * ideally at least one replica of each shard is available on the same rack.
 * To enable allocation awareness in this example nodes should contain a value
 * for the {@code rack_id} key like:
 * <pre>
 * node.attr.rack_id: 1
 * </pre>
 * <p>
 * Awareness can also be used to prevent over-allocation in the case of node or
 * even "zone" failure. For example in cloud-computing infrastructures like
 * Amazon AWS a cluster might span over multiple "zones". Awareness can be used
 * to distribute replicas to individual zones by setting:
 * <pre>
 * cluster.routing.allocation.awareness.attributes: zone
 * </pre>
 * <p>
 * and forcing allocation to be aware of the following zones the data resides in:
 * <pre>
 * cluster.routing.allocation.awareness.force.zone.values: zone1,zone2
 * </pre>
 * <p>
 * In contrast to regular awareness this setting will prevent over-allocation on
 * {@code zone1} even if {@code zone2} fails partially or becomes entirely
 * unavailable. Nodes that belong to a certain zone / group should be started
 * with the zone id configured on the node-level settings like:
 * <pre>
 * node.attr.zone: zone1
 * </pre>
 */
public class AwarenessAllocationDecider extends AllocationDecider {

    public static final String NAME = "awareness";

    public static final Setting> CLUSTER_ROUTING_ALLOCATION_AWARENESS_ATTRIBUTE_SETTING = Setting.listSetting(
        "cluster.routing.allocation.awareness.attributes",
        emptyList(),
        Function.identity(),
        Property.Dynamic,
        Property.NodeScope
    );

    private static final String FORCE_GROUP_SETTING_PREFIX = "cluster.routing.allocation.awareness.force.";

    public static final Setting CLUSTER_ROUTING_ALLOCATION_AWARENESS_FORCE_GROUP_SETTING = Setting.groupSetting(
        FORCE_GROUP_SETTING_PREFIX,
        AwarenessAllocationDecider::validateForceAwarenessSettings,
        Property.Dynamic,
        Property.NodeScope
    );

    private volatile List awarenessAttributes;

    private volatile Map> forcedAwarenessAttributes;

    public AwarenessAllocationDecider(Settings settings, ClusterSettings clusterSettings) {
        this.awarenessAttributes = CLUSTER_ROUTING_ALLOCATION_AWARENESS_ATTRIBUTE_SETTING.get(settings);
        clusterSettings.addSettingsUpdateConsumer(CLUSTER_ROUTING_ALLOCATION_AWARENESS_ATTRIBUTE_SETTING, this::setAwarenessAttributes);
        setForcedAwarenessAttributes(CLUSTER_ROUTING_ALLOCATION_AWARENESS_FORCE_GROUP_SETTING.get(settings));
        clusterSettings.addSettingsUpdateConsumer(
            CLUSTER_ROUTING_ALLOCATION_AWARENESS_FORCE_GROUP_SETTING,
            this::setForcedAwarenessAttributes
        );
    }

    private void setForcedAwarenessAttributes(Settings forceSettings) {
        Map> forcedAwarenessAttributes = new HashMap<>();
        Map forceGroups = forceSettings.getAsGroups();
        for (Map.Entry entry : forceGroups.entrySet()) {
            List aValues = entry.getValue().getAsList("values");
            if (aValues.size() > 0) {
                forcedAwarenessAttributes.put(entry.getKey(), aValues);
            }
        }
        this.forcedAwarenessAttributes = forcedAwarenessAttributes;
    }

    private void setAwarenessAttributes(List awarenessAttributes) {
        this.awarenessAttributes = awarenessAttributes;
    }

    @Override
    public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) {
        return underCapacity(shardRouting, node, allocation, true);
    }

    @Override
    public Decision canForceAllocateDuringReplace(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) {
        // We need to meet the criteria for shard awareness even during a replacement so that all
        // copies of a shard do not get allocated to the same host/rack/AZ, so this explicitly
        // checks the awareness 'canAllocate' to ensure we don't violate that constraint.
        return canAllocate(shardRouting, node, allocation);
    }

    @Override
    public Decision canRemain(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) {
        return underCapacity(shardRouting, node, allocation, false);
    }

    private static final Decision YES_NOT_ENABLED = Decision.single(
        Decision.Type.YES,
        NAME,
        "allocation awareness is not enabled, set cluster setting ["
            + CLUSTER_ROUTING_ALLOCATION_AWARENESS_ATTRIBUTE_SETTING.getKey()
            + "] to enable it"
    );

    private static final Decision YES_AUTO_EXPAND_ALL = Decision.single(
        Decision.Type.YES,
        NAME,
        "allocation awareness is ignored, this index is set to auto-expand to all nodes"
    );

    private static final Decision YES_ALL_MET = Decision.single(Decision.Type.YES, NAME, "node meets all awareness attribute requirements");

    private Decision underCapacity(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation, boolean moveToNode) {
        if (awarenessAttributes.isEmpty()) {
            return YES_NOT_ENABLED;
        }

        final boolean debug = allocation.debugDecision();
        final IndexMetadata indexMetadata = allocation.metadata().getIndexSafe(shardRouting.index());

        if (INDEX_AUTO_EXPAND_REPLICAS_SETTING.get(indexMetadata.getSettings()).expandToAllNodes()) {
            return YES_AUTO_EXPAND_ALL;
        }

        final int shardCount = indexMetadata.getNumberOfReplicas() + 1; // 1 for primary
        for (String awarenessAttribute : awarenessAttributes) {
            // the node the shard exists on must be associated with an awareness attribute
            if (node.node().getAttributes().containsKey(awarenessAttribute) == false) {
                return debug ? debugNoMissingAttribute(awarenessAttribute, awarenessAttributes) : Decision.NO;
            }

            final Set actualAttributeValues = allocation.routingNodes().getAttributeValues(awarenessAttribute);
            final String targetAttributeValue = node.node().getAttributes().get(awarenessAttribute);
            assert targetAttributeValue != null : "attribute [" + awarenessAttribute + "] missing on " + node.node();
            assert actualAttributeValues.contains(targetAttributeValue)
                : "attribute [" + awarenessAttribute + "] on " + node.node() + " is not in " + actualAttributeValues;

            int shardsForTargetAttributeValue = 0;
            // Will be the count of shards on nodes with attribute `awarenessAttribute` matching the one on `node`.

            for (ShardRouting assignedShard : allocation.routingNodes().assignedShards(shardRouting.shardId())) {
                if (assignedShard.started() || assignedShard.initializing()) {
                    // Note: this also counts relocation targets as that will be the new location of the shard.
                    // Relocation sources should not be counted as the shard is moving away
                    final RoutingNode assignedNode = allocation.routingNodes().node(assignedShard.currentNodeId());
                    if (targetAttributeValue.equals(assignedNode.node().getAttributes().get(awarenessAttribute))) {
                        shardsForTargetAttributeValue += 1;
                    }
                }
            }

            if (moveToNode) {
                if (shardRouting.assignedToNode()) {
                    final RoutingNode currentNode = allocation.routingNodes()
                        .node(shardRouting.relocating() ? shardRouting.relocatingNodeId() : shardRouting.currentNodeId());
                    if (targetAttributeValue.equals(currentNode.node().getAttributes().get(awarenessAttribute)) == false) {
                        shardsForTargetAttributeValue += 1;
                    } // else this shard is already on a node in the same zone as the target node, so moving it doesn't change the count
                } else {
                    shardsForTargetAttributeValue += 1;
                }
            }

            final List forcedValues = forcedAwarenessAttributes.get(awarenessAttribute);
            final int valueCount = forcedValues == null
                ? actualAttributeValues.size()
                : Math.toIntExact(Stream.concat(actualAttributeValues.stream(), forcedValues.stream()).distinct().count());

            final int maximumShardsPerAttributeValue = (shardCount + valueCount - 1) / valueCount; // ceil(shardCount/valueCount)
            if (shardsForTargetAttributeValue > maximumShardsPerAttributeValue) {
                return debug
                    ? debugNoTooManyCopies(
                        shardCount,
                        awarenessAttribute,
                        node.node().getAttributes().get(awarenessAttribute),
                        valueCount,
                        actualAttributeValues.stream().sorted().collect(toList()),
                        forcedValues == null ? null : forcedValues.stream().sorted().collect(toList()),
                        shardsForTargetAttributeValue,
                        maximumShardsPerAttributeValue
                    )
                    : Decision.NO;
            }
        }

        return YES_ALL_MET;
    }

    private static Decision debugNoTooManyCopies(
        int shardCount,
        String attributeName,
        String attributeValue,
        int numberOfAttributes,
        List realAttributes,
        List forcedAttributes,
        int actualShardCount,
        int maximumShardCount
    ) {
        return Decision.single(
            Decision.Type.NO,
            NAME,
            "there are [%d] copies of this shard and [%d] values for attribute [%s] (%s from nodes in the cluster and %s) so there "
                + "may be at most [%d] copies of this shard allocated to nodes with each value, but (including this copy) there "
                + "would be [%d] copies allocated to nodes with [node.attr.%s: %s]",
            shardCount,
            numberOfAttributes,
            attributeName,
            realAttributes,
            forcedAttributes == null ? "no forced awareness" : forcedAttributes + " from forced awareness",
            maximumShardCount,
            actualShardCount,
            attributeName,
            attributeValue
        );
    }

    private static Decision debugNoMissingAttribute(String awarenessAttribute, List awarenessAttributes) {
        return Decision.single(
            Decision.Type.NO,
            NAME,
            "node does not contain the awareness attribute [%s]; required attributes cluster setting [%s=%s]",
            awarenessAttribute,
            CLUSTER_ROUTING_ALLOCATION_AWARENESS_ATTRIBUTE_SETTING.getKey(),
            Strings.collectionToCommaDelimitedString(awarenessAttributes)
        );
    }

    private static void validateForceAwarenessSettings(Settings forceSettings) {
        final Map settingGroups;
        try {
            settingGroups = forceSettings.getAsGroups();
        } catch (SettingsException e) {
            throw new IllegalArgumentException("invalid forced awareness settings with prefix [" + FORCE_GROUP_SETTING_PREFIX + "]", e);
        }
        for (Map.Entry entry : settingGroups.entrySet()) {
            final Optional notValues = entry.getValue().keySet().stream().filter(s -> s.equals("values") == false).findFirst();
            if (notValues.isPresent()) {
                throw new IllegalArgumentException(
                    "invalid forced awareness setting [" + FORCE_GROUP_SETTING_PREFIX + entry.getKey() + "." + notValues.get() + "]"
                );
            }
        }
    }
}