org.elasticsearch.cluster.routing.allocation.allocator.BalancedShardsAllocator Maven / Gradle / Ivy
/*
* Licensed to ElasticSearch and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. ElasticSearch licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.cluster.routing.allocation.allocator;
import com.google.common.base.Predicate;
import com.google.common.collect.Iterables;
import org.apache.lucene.util.SorterTemplate;
import org.elasticsearch.ElasticSearchIllegalArgumentException;
import org.elasticsearch.cluster.metadata.MetaData;
import org.elasticsearch.cluster.routing.*;
import org.elasticsearch.cluster.routing.allocation.FailedRerouteAllocation;
import org.elasticsearch.cluster.routing.allocation.RoutingAllocation;
import org.elasticsearch.cluster.routing.allocation.StartedRerouteAllocation;
import org.elasticsearch.cluster.routing.allocation.decider.AllocationDeciders;
import org.elasticsearch.cluster.routing.allocation.decider.Decision;
import org.elasticsearch.cluster.routing.allocation.decider.Decision.Type;
import org.elasticsearch.common.component.AbstractComponent;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.node.settings.NodeSettingsService;
import java.util.*;
import static org.elasticsearch.cluster.routing.ShardRoutingState.INITIALIZING;
import static org.elasticsearch.cluster.routing.ShardRoutingState.RELOCATING;
/**
* The {@link BalancedShardsAllocator} re-balances the nodes allocations
* within an cluster based on a {@link WeightFunction}. The clusters balance is defined by four parameters which can be set
* in the cluster update API that allows changes in real-time:
*
* cluster.routing.allocation.balance.shard
- The shard balance defines the weight factor
* for shards allocated on a {@link RoutingNode}
* cluster.routing.allocation.balance.index
- The index balance defines a factor to the number
* of {@link org.elasticsearch.cluster.routing.ShardRouting}s per index allocated on a specific node
* cluster.routing.allocation.balance.primary
- the primary balance defines a weight factor for
* the number of primaries of a specific index allocated on a node
* cluster.routing.allocation.balance.threshold
- A threshold to set the minimal optimization
* value of operations that should be performed
*
*
* These parameters are combined in a {@link WeightFunction} that allows calculation of node weights which
* are used to re-balance shards based on global as well as per-index factors.
*/
public class BalancedShardsAllocator extends AbstractComponent implements ShardsAllocator {
public static final String SETTING_THRESHOLD = "cluster.routing.allocation.balance.threshold";
public static final String SETTING_INDEX_BALANCE_FACTOR = "cluster.routing.allocation.balance.index";
public static final String SETTING_SHARD_BALANCE_FACTOR = "cluster.routing.allocation.balance.shard";
public static final String SETTING_PRIMARY_BALANCE_FACTOR = "cluster.routing.allocation.balance.primary";
private static final float DEFAULT_INDEX_BALANCE_FACTOR = 0.5f;
private static final float DEFAULT_SHARD_BALANCE_FACTOR = 0.45f;
private static final float DEFAULT_PRIMARY_BALANCE_FACTOR = 0.05f;
class ApplySettings implements NodeSettingsService.Listener {
@Override
public void onRefreshSettings(Settings settings) {
final float indexBalance = settings.getAsFloat(SETTING_INDEX_BALANCE_FACTOR, weightFunction.indexBalance);
final float shardBalance = settings.getAsFloat(SETTING_SHARD_BALANCE_FACTOR, weightFunction.shardBalance);
final float primaryBalance = settings.getAsFloat(SETTING_PRIMARY_BALANCE_FACTOR, weightFunction.primaryBalance);
float threshold = settings.getAsFloat(SETTING_THRESHOLD, BalancedShardsAllocator.this.threshold);
if (threshold <= 0.0f) {
throw new ElasticSearchIllegalArgumentException("threshold must be greater than 0.0f but was: " + threshold);
}
BalancedShardsAllocator.this.threshold = threshold;
BalancedShardsAllocator.this.weightFunction = new WeightFunction(indexBalance, shardBalance, primaryBalance);
}
}
private volatile WeightFunction weightFunction = new WeightFunction(DEFAULT_INDEX_BALANCE_FACTOR, DEFAULT_SHARD_BALANCE_FACTOR, DEFAULT_PRIMARY_BALANCE_FACTOR);
private volatile float threshold = 1.0f;
public BalancedShardsAllocator(Settings settings) {
this(settings, new NodeSettingsService(settings));
}
@Inject
public BalancedShardsAllocator(Settings settings, NodeSettingsService nodeSettingsService) {
super(settings);
ApplySettings applySettings = new ApplySettings();
applySettings.onRefreshSettings(settings);
nodeSettingsService.addListener(applySettings);
}
@Override
public void applyStartedShards(StartedRerouteAllocation allocation) { /* ONLY FOR GATEWAYS */ }
@Override
public void applyFailedShards(FailedRerouteAllocation allocation) { /* ONLY FOR GATEWAYS */ }
@Override
public boolean allocateUnassigned(RoutingAllocation allocation) {
return rebalance(allocation);
}
@Override
public boolean rebalance(RoutingAllocation allocation) {
final Balancer balancer = new Balancer(logger, allocation, weightFunction, threshold);
return balancer.balance();
}
@Override
public boolean move(MutableShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) {
final Balancer balancer = new Balancer(logger, allocation, weightFunction, threshold);
return balancer.move(shardRouting, node);
}
/**
* Returns the currently configured delta threshold
*/
public float getThreshold() {
return threshold;
}
/**
* Returns the index related weight factor.
*/
public float getIndexBalance() {
return weightFunction.indexBalance;
}
/**
* Returns the primary related weight factor.
*/
public float getPrimaryBalance() {
return weightFunction.primaryBalance;
}
/**
* Returns the shard related weight factor.
*/
public float getShardBalance() {
return weightFunction.shardBalance;
}
/**
* This class is the primary weight function used to create balanced over nodes and shards in the cluster.
* Currently this function has 3 properties:
*
* index balance
- balance property over shards per index
* shard balance
- balance property over shards per cluster
* primary balance
- balance property over primaries per cluster
*
*
* Each of these properties are expressed as factor such that the properties factor defines the relative importance of the property for the
* weight function. For example if the weight function should calculate the weights only based on a global (shard) balance the index and primary balance
* can be set to 0.0 and will in turn have no effect on the distribution.
*
* The weight per index is calculated based on the following formula:
*
* -
*
weightindex(node, index) = indexBalance * (node.numShards(index) - avgShardsPerNode(index))
*
* -
*
weightnode(node, index) = shardBalance * (node.numShards() - avgShardsPerNode)
*
* -
*
weightprimary(node, index) = primaryBalance * (node.numPrimaries() - avgPrimariesPerNode)
*
*
* weight(node, index) = weightindex(node, index) + weightnode(node, index) + weightprimary(node, index)
*/
public static class WeightFunction {
private final float indexBalance;
private final float shardBalance;
private final float primaryBalance;
private final EnumMap thetaMap = new EnumMap(Operation.class);
public WeightFunction(float indexBalance, float shardBalance, float primaryBalance) {
float sum = indexBalance + shardBalance + primaryBalance;
if (sum <= 0.0f) {
throw new ElasticSearchIllegalArgumentException("Balance factors must sum to a value > 0 but was: " + sum);
}
final float[] defaultTheta = new float[] { shardBalance / sum, indexBalance / sum, primaryBalance / sum };
for(Operation operation : Operation.values()) {
switch(operation) {
case THRESHOLD_CHECK:
sum = indexBalance + shardBalance;
if (sum <= 0.0f) {
thetaMap.put(operation, defaultTheta);
}
thetaMap.put(operation, new float[] { shardBalance / sum, indexBalance / sum, 0});
break;
case BALANCE:
case ALLOCATE:
case MOVE:
thetaMap.put(operation, defaultTheta);
break;
default:
assert false;
}
}
this.indexBalance = indexBalance;
this.shardBalance = shardBalance;
this.primaryBalance = primaryBalance;
}
public float weight(Operation operation, Balancer balancer, ModelNode node, String index) {
final float weightShard = (node.numShards() - balancer.avgShardsPerNode());
final float weightIndex = (node.numShards(index) - balancer.avgShardsPerNode(index));
final float weightPrimary = (node.numPrimaries() - balancer.avgPrimariesPerNode());
final float[] theta = thetaMap.get(operation);
assert theta != null;
return theta[0] * weightShard + theta[1] * weightIndex + theta[2] * weightPrimary;
}
}
/**
* An enum that donates the actual operation the {@link WeightFunction} is
* applied to.
*/
public static enum Operation {
/**
* Provided during balance operations.
*/
BALANCE,
/**
* Provided during initial allocation operation for unassigned shards.
*/
ALLOCATE,
/**
* Provided during move operation.
*/
MOVE,
/**
* Provided when the weight delta is checked against the configured threshold.
* This can be used to ignore tie-breaking weight factors that should not
* solely trigger a relocation unless the delta is above the threshold.
*/
THRESHOLD_CHECK
}
/**
* A {@link Balancer}
*/
public static class Balancer {
private final ESLogger logger;
private final Map nodes = new HashMap();
private final HashSet indices = new HashSet();
private final RoutingAllocation allocation;
private final WeightFunction weight;
private final float threshold;
private final MetaData metaData;
private final Predicate assignedFilter = new Predicate() {
@Override
public boolean apply(MutableShardRouting input) {
return input.assignedToNode();
}
};
public Balancer(ESLogger logger, RoutingAllocation allocation, WeightFunction weight, float threshold) {
this.logger = logger;
this.allocation = allocation;
this.weight = weight;
this.threshold = threshold;
for (RoutingNode node : allocation.routingNodes()) {
nodes.put(node.nodeId(), new ModelNode(node.nodeId()));
}
metaData = allocation.routingNodes().metaData();
}
/**
* Returns an array view on the nodes in the balancer. Nodes should not be removed from this list.
*/
private ModelNode[] nodesArray() {
return nodes.values().toArray(new ModelNode[nodes.size()]);
}
/**
* Returns the average of shards per node for the given index
*/
public float avgShardsPerNode(String index) {
return ((float) metaData.index(index).totalNumberOfShards()) / nodes.size();
}
/**
* Returns the global average of shards per node
*/
public float avgShardsPerNode() {
return ((float) metaData.totalNumberOfShards()) / nodes.size();
}
/**
* Returns the global average of primaries per node
*/
public float avgPrimariesPerNode() {
return ((float) metaData.numberOfShards()) / nodes.size();
}
/**
* Returns the average of primaries per node for the given index
*/
public float avgPrimariesPerNode(String index) {
return ((float) metaData.index(index).numberOfShards()) / nodes.size();
}
/**
* Returns a new {@link NodeSorter} that sorts the nodes based on their
* current weight with respect to the index passed to the sorter. The
* returned sorter is not sorted. Use {@link NodeSorter#reset(String)}
* to sort based on an index.
*/
private NodeSorter newNodeSorter() {
final NodeSorter sorter = new NodeSorter(nodesArray(), weight, this);
return sorter;
}
private boolean initialize(RoutingNodes routing) {
Collection shards = new ArrayList();
if (logger.isTraceEnabled()) {
logger.trace("Start distributing Shards");
}
for (IndexRoutingTable index : allocation.routingTable().indicesRouting().values()) {
indices.add(index.index());
for (IndexShardRoutingTable shard : index.getShards().values()) {
shards.addAll(routing.shardsRoutingFor(index.index(), shard.shardId().id()));
}
}
buildModelFromAssigned(Iterables.filter(shards, assignedFilter));
return allocateUnassigned(allocation.routingNodes().unassigned(), allocation.routingNodes().ignoredUnassigned());
}
/**
* Balances the nodes on the cluster model according to the weight
* function. The configured threshold is the minimum delta between the
* weight of the maximum node and the minimum node according to the
* {@link WeightFunction}. This weight is calculated per index to
* distribute shards evenly per index. The balancer tries to relocate
* shards only if the delta exceeds the threshold. If the default case
* the threshold is set to 1.0 to enforce gaining relocation
* only, or in other words relocations that move the weight delta closer
* to 0.0
*
* @return true
if the current configuration has been
* changed, otherwise false
*/
public boolean balance() {
if (this.nodes.isEmpty()) {
/* with no nodes this is pointless */
return false;
}
if (logger.isTraceEnabled()) {
logger.trace("Start balancing cluster");
}
boolean changed = initialize(allocation.routingNodes());
NodeSorter sorter = newNodeSorter();
if (nodes.size() > 1) { /* skip if we only have one node */
for (String index : buildWeightOrderedIndidces(Operation.BALANCE, sorter)) {
sorter.reset(Operation.BALANCE,index);
final float[] weights = sorter.weights;
final ModelNode[] modelNodes = sorter.modelNodes;
int lowIdx = 0;
int highIdx = weights.length - 1;
while (true) {
final ModelNode minNode = modelNodes[lowIdx];
final ModelNode maxNode = modelNodes[highIdx];
if (maxNode.numShards(index) > 0) {
float delta = weights[highIdx] - weights[lowIdx];
delta = delta <= threshold ? delta : sorter.weight(Operation.THRESHOLD_CHECK, maxNode) - sorter.weight(Operation.THRESHOLD_CHECK, minNode);
if (delta <= threshold) {
if (logger.isTraceEnabled()) {
logger.trace("Stop balancing index [{}] min_node [{}] weight: [{}] max_node [{}] weight: [{}] delta: [{}]",
index, maxNode.getNodeId(), weights[highIdx], minNode.getNodeId(), weights[lowIdx], delta);
}
break;
}
if (logger.isTraceEnabled()) {
logger.trace("Balancing from node [{}] weight: [{}] to node [{}] weight: [{}] delta: [{}]",
maxNode.getNodeId(), weights[highIdx], minNode.getNodeId(), weights[lowIdx], delta);
}
/* pass the delta to the replication function to prevent relocations that only swap the weights of the two nodes.
* a relocation must bring us closer to the balance if we only achive the same delta the relocation is useless */
if (tryRelocateShard(Operation.BALANCE, minNode, maxNode, index, delta)) {
/*
* TODO we could be a bit smarter here, we don't need to fully sort necessarily
* we could just find the place to insert linearly but the win might be minor
* compared to the added complexity
*/
weights[lowIdx] = sorter.weight(Operation.BALANCE, modelNodes[lowIdx]);
weights[highIdx] = sorter.weight(Operation.BALANCE, modelNodes[highIdx]);
sorter.quickSort(0, weights.length - 1);
lowIdx = 0;
highIdx = weights.length - 1;
changed = true;
continue;
}
}
if (lowIdx < highIdx - 1) {
/* we can't move from any shard from the min node lets move on to the next node
* and see if the threshold still holds. We either don't have any shard of this
* index on this node of allocation deciders prevent any relocation.*/
lowIdx++;
} else if (lowIdx > 0) {
/* now we go max to min since obviously we can't move anything to the max node
* lets pick the next highest */
lowIdx = 0;
highIdx--;
} else {
/* we are done here, we either can't relocate anymore or we are balanced */
break;
}
}
}
}
return changed;
}
/**
* This builds a initial index ordering where the indices are returned
* in most unbalanced first. We need this in order to prevent over
* allocations on added nodes from one index when the weight parameters
* for global balance overrule the index balance at an intermediate
* state. For example this can happen if we have 3 nodes and 3 indices
* with 3 shards and 1 shard. At the first stage all three nodes hold
* 2 shard for each index. now we add another node and the first index
* is balanced moving 3 two of the nodes over to the new node since it
* has no shards yet and global balance for the node is way below
* average. To re-balance we need to move shards back eventually likely
* to the nodes we relocated them from.
*/
private String[] buildWeightOrderedIndidces(Operation operation, NodeSorter sorter) {
final String[] indices = this.indices.toArray(new String[this.indices.size()]);
final float[] deltas = new float[indices.length];
for (int i = 0; i < deltas.length; i++) {
sorter.reset(operation, indices[i]);
deltas[i] = sorter.delta();
}
new SorterTemplate() {
float pivotWeight;
@Override
protected void swap(int i, int j) {
final String tmpIdx = indices[i];
indices[i] = indices[j];
indices[j] = tmpIdx;
final float tmpDelta = deltas[i];
deltas[i] = deltas[j];
deltas[j] = tmpDelta;
}
@Override
protected int compare(int i, int j) {
return Float.compare(deltas[j], deltas[i]);
}
@Override
protected void setPivot(int i) {
pivotWeight = deltas[i];
}
@Override
protected int comparePivot(int j) {
return Float.compare(deltas[j], pivotWeight);
}
}.quickSort(0, deltas.length - 1);
return indices;
}
/**
* This function executes a move operation moving the given shard from
* the given node to the minimal eligible node with respect to the
* weight function. Iff the shard is moved the shard will be set to
* {@link ShardRoutingState#RELOCATING} and a shadow instance of this
* shard is created with an incremented version in the state
* {@link ShardRoutingState#INITIALIZING}.
*
* @return true
iff the shard has successfully been moved.
*/
public boolean move(MutableShardRouting shard, RoutingNode node) {
if (nodes.isEmpty() || !shard.started()) {
/* with no nodes or a not started shard this is pointless */
return false;
}
if (logger.isTraceEnabled()) {
logger.trace("Try moving shard [{}] from [{}]", shard, node);
}
boolean changed = initialize(allocation.routingNodes());
final ModelNode sourceNode = nodes.get(node.nodeId());
assert sourceNode != null;
final NodeSorter sorter = newNodeSorter();
sorter.reset(Operation.MOVE, shard.getIndex());
final ModelNode[] nodes = sorter.modelNodes;
assert sourceNode.containsShard(shard);
/*
* the sorter holds the minimum weight node first for the shards index.
* We now walk through the nodes until we find a node to allocate the shard.
* This is not guaranteed to be balanced after this operation we still try best effort to
* allocate on the minimal eligable node.
*/
for (ModelNode currentNode : nodes) {
if (currentNode.getNodeId().equals(node.nodeId())) {
continue;
}
RoutingNode target = allocation.routingNodes().node(currentNode.getNodeId());
Decision decision = allocation.deciders().canAllocate(shard, target, allocation);
if (decision.type() == Type.YES) { // TODO maybe we can respect throtteling here too?
sourceNode.removeShard(shard);
final MutableShardRouting initializingShard = new MutableShardRouting(shard.index(), shard.id(), currentNode.getNodeId(),
shard.currentNodeId(), shard.primary(), INITIALIZING, shard.version() + 1);
currentNode.addShard(initializingShard, decision);
target.add(initializingShard);
shard.relocate(target.nodeId()); // set the node to relocate after we added the initializing shard
if (logger.isTraceEnabled()) {
logger.trace("Moved shard [{}] to node [{}]", shard, currentNode.getNodeId());
}
return true;
}
}
return changed;
}
/**
* Builds the internal model from all shards in the given
* {@link Iterable}. All shards in the {@link Iterable} must be assigned
* to a node. This method will skip shards in the state
* {@link ShardRoutingState#RELOCATING} since each relocating shard has
* a shadow shard in the state {@link ShardRoutingState#INITIALIZING}
* on the target node which we respect during the allocation / balancing
* process. In short, this method recreates the status-quo in the cluster.
*/
private void buildModelFromAssigned(Iterable shards) {
for (MutableShardRouting shard : shards) {
assert shard.assignedToNode();
/* we skip relocating shards here since we expect an initializing shard with the same id coming in */
if (shard.state() == RELOCATING) {
continue;
}
ModelNode node = nodes.get(shard.currentNodeId());
assert node != null;
node.addShard(shard, Decision.single(Type.YES, "Already allocated on node", node.getNodeId()));
if (logger.isTraceEnabled()) {
logger.trace("Assigned shard [{}] to node [{}]", shard, node.getNodeId());
}
}
}
/**
* Allocates all given shards on the minimal eligable node for the shards index
* with respect to the weight function. All given shards must be unassigned.
*/
private boolean allocateUnassigned(List unassigned, List ignoredUnassigned) {
assert !nodes.isEmpty();
if (logger.isTraceEnabled()) {
logger.trace("Start allocating unassigned shards");
}
if (unassigned.isEmpty()) {
return false;
}
boolean changed = false;
/*
* TODO: We could be smarter here and group the shards by index and then
* use the sorter to save some iterations.
*/
final RoutingNodes routingNodes = allocation.routingNodes();
final AllocationDeciders deciders = allocation.deciders();
final Set currentRound = new TreeSet(new Comparator() {
@Override
public int compare(MutableShardRouting o1,
MutableShardRouting o2) {
final int indexCmp;
if ((indexCmp = o1.index().compareTo(o2.index())) == 0) {
if (o1.getId() - o2.getId() == 0) {
return o1.primary() ? -1 : o2.primary() ? 1 : 0;
}
return o1.getId() - o2.getId();
}
return indexCmp;
}
});
do {
Iterator iterator = unassigned.iterator();
while (iterator.hasNext()) {
/* we treat every index equally here once chunk a time such that we fill up
* nodes with all indices at the same time. Only on shard of a shard a time.
* Although there might be a primary and a shard of a shard in the set but
* primaries will be started first.*/
if (currentRound.add(iterator.next())) {
iterator.remove();
}
}
boolean iterationChanged = false;
for (MutableShardRouting shard : currentRound) {
assert !shard.assignedToNode();
/* find an node with minimal weight we can allocate on*/
float minWeight = Float.POSITIVE_INFINITY;
ModelNode minNode = null;
Decision decision = null;
for (ModelNode node : nodes.values()) {
/*
* The shard we add is removed below to simulate the
* addition for weight calculation we use Decision.ALWAYS to
* not violate the not null condition.
*/
if (!node.containsShard(shard)) {
node.addShard(shard, Decision.ALWAYS);
float currentWeight = weight.weight(Operation.ALLOCATE, this, node, shard.index());
/*
* Remove the shard from the node again this is only a
* simulation
*/
Decision removed = node.removeShard(shard);
assert removed != null;
/*
* Unless the operation is not providing any gains we
* don't check deciders
*/
if (currentWeight <= minWeight) {
Decision currentDecision = deciders.canAllocate(shard, routingNodes.node(node.getNodeId()), allocation);
NOUPDATE:
if (currentDecision.type() == Type.YES || currentDecision.type() == Type.THROTTLE) {
if (currentWeight == minWeight) {
/* we have an equal weight tie breaking:
* 1. if one decision is YES prefer it
* 2. prefer the node that holds the primary for this index with the next id in the ring ie.
* for the 3 shards 2 replica case we try to build up:
* 1 2 0
* 2 0 1
* 0 1 2
* such that if we need to tie-break we try to prefer the node holding a shard with the minimal id greater
* than the id of the shard we need to assign. This works find when new indices are created since
* primaries are added first and we only add one shard set a time in this algorithm.
*/
if (currentDecision.type() == decision.type()) {
final int repId = shard.id();
final int nodeHigh = node.highestPrimary(shard.index());
final int minNodeHigh = minNode.highestPrimary(shard.index());
if ((((nodeHigh > repId && minNodeHigh > repId) || (nodeHigh < repId && minNodeHigh < repId)) && (nodeHigh < minNodeHigh))
|| (nodeHigh > minNodeHigh && nodeHigh > repId && minNodeHigh < repId)) {
minNode = node;
minWeight = currentWeight;
decision = currentDecision;
} else {
break NOUPDATE;
}
} else if (currentDecision.type() != Type.YES) {
break NOUPDATE;
}
}
minNode = node;
minWeight = currentWeight;
decision = currentDecision;
}
}
}
}
assert decision != null && minNode != null || decision == null && minNode == null;
if (minNode != null) {
iterationChanged = true;
minNode.addShard(shard, decision);
if (decision.type() == Type.YES) {
if (logger.isTraceEnabled()) {
logger.trace("Assigned shard [{}] to [{}]", shard, minNode.getNodeId());
}
routingNodes.node(minNode.getNodeId()).add(shard);
changed = true;
continue; // don't add to ignoreUnassigned
}
} else if (logger.isTraceEnabled()) {
logger.trace("No Node found to assign shard [{}]", shard);
}
ignoredUnassigned.add(shard);
}
if (!iterationChanged && !unassigned.isEmpty()) {
ignoredUnassigned.addAll(unassigned);
unassigned.clear();
return changed;
}
currentRound.clear();
} while (!unassigned.isEmpty());
// clear everything we have either added it or moved to ingoreUnassigned
return changed;
}
/**
* Tries to find a relocation from the max node to the minimal node for an arbitrary shard of the given index on the
* balance model. Iff this method returns a true
the relocation has already been executed on the
* simulation model as well as on the cluster.
*/
private boolean tryRelocateShard(Operation operation, ModelNode minNode, ModelNode maxNode, String idx, float minCost) {
final ModelIndex index = maxNode.getIndex(idx);
if (index != null) {
if (logger.isTraceEnabled()) {
logger.trace("Try relocating shard for index index [{}] from node [{}] to node [{}]", idx, maxNode.getNodeId(),
minNode.getNodeId());
}
final RoutingNode node = allocation.routingNodes().node(minNode.getNodeId());
MutableShardRouting candidate = null;
Decision decision = null;
final AllocationDeciders deciders = allocation.deciders();
/* make a copy since we modify this list in the loop */
final ArrayList shards = new ArrayList(index.getAllShards());
for (MutableShardRouting shard : shards) {
if (shard.started()) {
// skip initializing, unassigned and relocating shards we can't relocate them anyway
Decision allocationDecision = deciders.canAllocate(shard, node, allocation);
Decision rebalanceDecission = deciders.canRebalance(shard, allocation);
if (((allocationDecision.type() == Type.YES) || (allocationDecision.type() == Type.THROTTLE))
&& ((rebalanceDecission.type() == Type.YES) || (rebalanceDecission.type() == Type.THROTTLE))) {
Decision srcDecision;
if ((srcDecision = maxNode.removeShard(shard)) != null) {
minNode.addShard(shard, srcDecision);
final float delta = weight.weight(operation, this, minNode, idx) - weight.weight(operation, this, maxNode, idx);
if (delta < minCost) {
minCost = delta;
candidate = shard;
decision = new Decision.Multi().add(allocationDecision).add(rebalanceDecission);
}
minNode.removeShard(shard);
maxNode.addShard(shard, srcDecision);
}
}
}
}
if (candidate != null) {
/* allocate on the model even if not throttled */
maxNode.removeShard(candidate);
minNode.addShard(candidate, decision);
if (decision.type() == Type.YES) { /* only allocate on the cluster if we are not throttled */
if (logger.isTraceEnabled()) {
logger.trace("Relocate shard [{}] from node [{}] to node [{}]", candidate, maxNode.getNodeId(),
minNode.getNodeId());
}
/* now allocate on the cluster - if we are started we need to relocate the shard */
if (candidate.started()) {
RoutingNode lowRoutingNode = allocation.routingNodes().node(minNode.getNodeId());
lowRoutingNode.add(new MutableShardRouting(candidate.index(), candidate.id(), lowRoutingNode.nodeId(), candidate
.currentNodeId(), candidate.primary(), INITIALIZING, candidate.version() + 1));
candidate.relocate(lowRoutingNode.nodeId());
} else {
assert candidate.unassigned();
allocation.routingNodes().node(minNode.getNodeId()).add(candidate);
}
return true;
}
}
}
if (logger.isTraceEnabled()) {
logger.trace("Couldn't find shard to relocate from node [{}] to node [{}]", maxNode.getNodeId(),
minNode.getNodeId());
}
return false;
}
}
static class ModelNode implements Iterable {
private final String id;
private final Map indices = new HashMap();
/* cached stats - invalidated on add/remove and lazily calculated */
private int numShards = -1;
private int numPrimaries = -1;
public ModelNode(String id) {
this.id = id;
}
public ModelIndex getIndex(String indexId) {
return indices.get(indexId);
}
public String getNodeId() {
return id;
}
public int numShards() {
if (numShards == -1) {
int sum = 0;
for (ModelIndex index : indices.values()) {
sum += index.numShards();
}
numShards = sum;
}
return numShards;
}
public int numShards(String idx) {
ModelIndex index = indices.get(idx);
return index == null ? 0 : index.numShards();
}
public int numPrimaries(String idx) {
ModelIndex index = indices.get(idx);
return index == null ? 0 : index.numPrimaries();
}
public int numPrimaries() {
if (numPrimaries == -1) {
int sum = 0;
for (ModelIndex index : indices.values()) {
sum += index.numPrimaries();
}
numPrimaries = sum;
}
return numPrimaries;
}
public Collection shards() {
Collection result = new ArrayList();
for (ModelIndex index : indices.values()) {
result.addAll(index.getAllShards());
}
return result;
}
public int highestPrimary(String index) {
ModelIndex idx = indices.get(index);
if (idx != null) {
return idx.highestPrimary();
}
return -1;
}
public void addShard(MutableShardRouting shard, Decision decision) {
numPrimaries = numShards = -1;
ModelIndex index = indices.get(shard.index());
if (index == null) {
index = new ModelIndex(shard.index());
indices.put(index.getIndexId(), index);
}
index.addShard(shard, decision);
}
public Decision removeShard(MutableShardRouting shard) {
numPrimaries = numShards = -1;
ModelIndex index = indices.get(shard.index());
Decision removed = null;
if (index != null) {
removed = index.removeShard(shard);
if (removed != null && index.numShards() == 0) {
indices.remove(shard.index());
}
}
return removed;
}
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append("Node(").append(id).append(")");
return sb.toString();
}
@Override
public Iterator iterator() {
return indices.values().iterator();
}
public boolean containsShard(MutableShardRouting shard) {
ModelIndex index = getIndex(shard.getIndex());
return index == null ? false : index.containsShard(shard);
}
}
static final class ModelIndex {
private final String id;
private final Map shards = new HashMap();
private int numPrimaries = -1;
private int highestPrimary = -1;
public ModelIndex(String id) {
this.id = id;
}
public int highestPrimary() {
if (highestPrimary == -1) {
int maxId = -1;
for (MutableShardRouting shard : shards.keySet()) {
if (shard.primary()) {
maxId = Math.max(maxId, shard.id());
}
}
return highestPrimary = maxId;
}
return highestPrimary;
}
public String getIndexId() {
return id;
}
public Decision getDecicion(MutableShardRouting shard) {
return shards.get(shard);
}
public int numShards() {
return shards.size();
}
public Collection getAllShards() {
return shards.keySet();
}
public int numPrimaries() {
if (numPrimaries == -1) {
int num = 0;
for (MutableShardRouting shard : shards.keySet()) {
if (shard.primary()) {
num++;
}
}
return numPrimaries = num;
}
return numPrimaries;
}
public Decision removeShard(MutableShardRouting shard) {
highestPrimary = numPrimaries = -1;
return shards.remove(shard);
}
public void addShard(MutableShardRouting shard, Decision decision) {
highestPrimary = numPrimaries = -1;
assert decision != null;
assert !shards.containsKey(shard) : "Shard already allocated on current node: " + shards.get(shard) + " " + shard;
shards.put(shard, decision);
}
public boolean containsShard(MutableShardRouting shard) {
return shards.containsKey(shard);
}
}
static final class NodeSorter extends SorterTemplate {
final ModelNode[] modelNodes;
/* the nodes weights with respect to the current weight function / index */
final float[] weights;
private final WeightFunction function;
private String index;
private final Balancer balancer;
private float pivotWeight;
public NodeSorter(ModelNode[] modelNodes, WeightFunction function, Balancer balancer) {
this.function = function;
this.balancer = balancer;
this.modelNodes = modelNodes;
weights = new float[modelNodes.length];
}
/**
* Resets the sorter, recalculates the weights per node and sorts the
* nodes by weight, with minimal weight first.
*/
public void reset(Operation operation, String index) {
this.index = index;
for (int i = 0; i < weights.length; i++) {
weights[i] = weight(operation, modelNodes[i]);
}
quickSort(0, modelNodes.length - 1);
}
public float weight(Operation operation, ModelNode node) {
return function.weight(operation, balancer, node, index);
}
@Override
protected void swap(int i, int j) {
final ModelNode tmpNode = modelNodes[i];
modelNodes[i] = modelNodes[j];
modelNodes[j] = tmpNode;
final float tmpWeight = weights[i];
weights[i] = weights[j];
weights[j] = tmpWeight;
}
@Override
protected int compare(int i, int j) {
return Float.compare(weights[i], weights[j]);
}
@Override
protected void setPivot(int i) {
pivotWeight = weights[i];
}
@Override
protected int comparePivot(int j) {
return Float.compare(pivotWeight, weights[j]);
}
public float delta() {
return weights[weights.length - 1] - weights[0];
}
}
}