All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.opensearch.cluster.routing.allocation.decider.NodeLoadAwareAllocationDecider Maven / Gradle / Ivy

/*
 * SPDX-License-Identifier: Apache-2.0
 *
 * The OpenSearch Contributors require contributions made to
 * this file be licensed under the Apache-2.0 license or a
 * compatible open source license.
 */

package org.opensearch.cluster.routing.allocation.decider;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.message.ParameterizedMessage;
import org.opensearch.cluster.metadata.Metadata;
import org.opensearch.cluster.routing.RoutingNode;
import org.opensearch.cluster.routing.ShardRouting;
import org.opensearch.cluster.routing.allocation.RoutingAllocation;
import org.opensearch.common.settings.ClusterSettings;
import org.opensearch.common.settings.Setting;
import org.opensearch.common.settings.Settings;
import org.opensearch.common.settings.Setting.Property;

import java.util.function.BiPredicate;

/**
 * This {@link NodeLoadAwareAllocationDecider} controls shard over-allocation
 * due to node failures or otherwise on the surviving nodes. The allocation limits
 * are decided by the user provisioned capacity, to determine if there were lost nodes.
 * The provisioned capacity as defined by the below settings needs to be updated on every
 * cluster scale up and scale down operations.
 * 
 * cluster.routing.allocation.overload_awareness.provisioned_capacity: N
 * 
*

* and prevent allocation on the surviving nodes of the under capacity cluster * based on overload factor defined as a percentage by *

 * cluster.routing.allocation.load_awareness.skew_factor: X
 * 
* The total limit per node based on skew_factor doesn't limit primaries that previously * existed on the disk as those shards are force allocated by * {@link AllocationDeciders#canForceAllocatePrimary(ShardRouting, RoutingNode, RoutingAllocation)} * however new primaries due to index creation, snapshot restore etc can be controlled via the below settings. * Setting the value to true allows newly created primaries to get assigned while preventing the replica allocation * breaching the skew factor. * Note that setting this to false can result in the primaries not get assigned and the cluster turning RED *
 * cluster.routing.allocation.load_awareness.allow_unassigned_primaries
 * 
*/ public class NodeLoadAwareAllocationDecider extends AllocationDecider { public static final String NAME = "load_awareness"; public static final Setting CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING = Setting.intSetting( "cluster.routing.allocation.load_awareness.provisioned_capacity", -1, -1, Property.Dynamic, Property.NodeScope ); public static final Setting CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_SKEW_FACTOR_SETTING = Setting.doubleSetting( "cluster.routing.allocation.load_awareness.skew_factor", 50, -1, Property.Dynamic, Property.NodeScope ); public static final Setting CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING = Setting.boolSetting( "cluster.routing.allocation.load_awareness.allow_unassigned_primaries", true, Setting.Property.Dynamic, Property.NodeScope ); private volatile int provisionedCapacity; private volatile double skewFactor; private volatile boolean allowUnassignedPrimaries; private static final Logger logger = LogManager.getLogger(NodeLoadAwareAllocationDecider.class); public NodeLoadAwareAllocationDecider(Settings settings, ClusterSettings clusterSettings) { this.skewFactor = CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_SKEW_FACTOR_SETTING.get(settings); this.provisionedCapacity = CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.get(settings); this.allowUnassignedPrimaries = CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING.get(settings); clusterSettings.addSettingsUpdateConsumer(CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_SKEW_FACTOR_SETTING, this::setSkewFactor); clusterSettings.addSettingsUpdateConsumer( CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING, this::setProvisionedCapacity ); clusterSettings.addSettingsUpdateConsumer( CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING, this::setAllowUnassignedPrimaries ); } private void setAllowUnassignedPrimaries(boolean allowUnassignedPrimaries) { this.allowUnassignedPrimaries = allowUnassignedPrimaries; } private void setSkewFactor(double skewFactor) { this.skewFactor = skewFactor; } private void setProvisionedCapacity(int provisionedCapacity) { this.provisionedCapacity = provisionedCapacity; } @Override public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) { return underCapacity(shardRouting, node, allocation, (count, limit) -> count >= limit); } @Override public Decision canRemain(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) { return underCapacity(shardRouting, node, allocation, (count, limit) -> count > limit); } private Decision underCapacity( ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation, BiPredicate decider ) { if (provisionedCapacity <= 0 || skewFactor < 0) { return allocation.decision( Decision.YES, NAME, "overload awareness allocation is not enabled, set cluster setting [%s] and cluster setting [%s] to enable it", CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_SKEW_FACTOR_SETTING.getKey(), CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_PROVISIONED_CAPACITY_SETTING.getKey() ); } if (shardRouting.unassigned() && shardRouting.primary() && allowUnassignedPrimaries) { return allocation.decision( Decision.YES, NAME, "overload allocation awareness is allowed for unassigned primaries, set cluster setting [%s] to disable it", CLUSTER_ROUTING_ALLOCATION_LOAD_AWARENESS_ALLOW_UNASSIGNED_PRIMARIES_SETTING.getKey() ); } Metadata metadata = allocation.metadata(); float expectedAvgShardsPerNode = (float) metadata.getTotalNumberOfShards() / provisionedCapacity; int nodeShardCount = node.numberOfOwningShards(); int limit = (int) Math.ceil(expectedAvgShardsPerNode * (1 + skewFactor / 100.0)); if (decider.test(nodeShardCount, limit)) { logger.debug( () -> new ParameterizedMessage( "Too many shards [{}] allocated to this node [{}]. Expected average shards" + " per node [{}], overload factor [{}], node limit [{}]", nodeShardCount, node.nodeId(), expectedAvgShardsPerNode, skewFactor, limit ) ); return allocation.decision( Decision.NO, NAME, "too many shards [%d] allocated to this node, limit per node [%d] considering" + " overload factor [%.2f] based on capacity [%d]", nodeShardCount, limit, skewFactor, provisionedCapacity ); } return allocation.decision(Decision.YES, NAME, "node meets all skew awareness attribute requirements"); } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy