org.elasticsearch.cluster.routing.allocation.allocator.BalancedShardsAllocator Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of elasticsearch
ElasticSearch - Open Source, Distributed, RESTful Search Engine
There is a newer version: 8.14.1
/*
 * Licensed to ElasticSearch and Shay Banon under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. ElasticSearch licenses this
 * file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.cluster.routing.allocation.allocator;

import com.google.common.base.Predicate;
import com.google.common.collect.Iterables;
import org.apache.lucene.util.SorterTemplate;
import org.elasticsearch.ElasticSearchIllegalArgumentException;
import org.elasticsearch.cluster.metadata.MetaData;
import org.elasticsearch.cluster.routing.*;
import org.elasticsearch.cluster.routing.allocation.FailedRerouteAllocation;
import org.elasticsearch.cluster.routing.allocation.RoutingAllocation;
import org.elasticsearch.cluster.routing.allocation.StartedRerouteAllocation;
import org.elasticsearch.cluster.routing.allocation.decider.AllocationDeciders;
import org.elasticsearch.cluster.routing.allocation.decider.Decision;
import org.elasticsearch.cluster.routing.allocation.decider.Decision.Type;
import org.elasticsearch.common.component.AbstractComponent;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.node.settings.NodeSettingsService;

import java.util.*;

import static org.elasticsearch.cluster.routing.ShardRoutingState.INITIALIZING;
import static org.elasticsearch.cluster.routing.ShardRoutingState.RELOCATING;

/**
 * The {@link BalancedShardsAllocator} re-balances the nodes allocations
 * within an cluster based on a {@link WeightFunction}. The clusters balance is defined by four parameters which can be set
 * in the cluster update API that allows changes in real-time:
 * 
 * 
cluster.routing.allocation.balance.shard - The shard balance defines the weight factor
 * for shards allocated on a {@link RoutingNode}
 * cluster.routing.allocation.balance.index - The index balance defines a factor to the number
 * of {@link org.elasticsearch.cluster.routing.ShardRouting}s per index allocated on a specific node
 * cluster.routing.allocation.balance.primary - the primary balance defines a weight factor for
 * the number of primaries of a specific index allocated on a node
 * cluster.routing.allocation.balance.threshold - A threshold to set the minimal optimization
 * value of operations that should be performed
 * 
 * 
 * These parameters are combined in a {@link WeightFunction} that allows calculation of node weights which
 * are used to re-balance shards based on global as well as per-index factors.
 */
public class BalancedShardsAllocator extends AbstractComponent implements ShardsAllocator {

    public static final String SETTING_THRESHOLD = "cluster.routing.allocation.balance.threshold";
    public static final String SETTING_INDEX_BALANCE_FACTOR = "cluster.routing.allocation.balance.index";
    public static final String SETTING_SHARD_BALANCE_FACTOR = "cluster.routing.allocation.balance.shard";
    public static final String SETTING_PRIMARY_BALANCE_FACTOR = "cluster.routing.allocation.balance.primary";
    
    private static final float DEFAULT_INDEX_BALANCE_FACTOR = 0.5f;
    private static final float DEFAULT_SHARD_BALANCE_FACTOR  = 0.45f;
    private static final float DEFAULT_PRIMARY_BALANCE_FACTOR = 0.05f;

    class ApplySettings implements NodeSettingsService.Listener {
        @Override
        public void onRefreshSettings(Settings settings) {
            final float indexBalance = settings.getAsFloat(SETTING_INDEX_BALANCE_FACTOR, weightFunction.indexBalance);
            final float shardBalance = settings.getAsFloat(SETTING_SHARD_BALANCE_FACTOR, weightFunction.shardBalance);
            final float primaryBalance = settings.getAsFloat(SETTING_PRIMARY_BALANCE_FACTOR, weightFunction.primaryBalance);
            float threshold = settings.getAsFloat(SETTING_THRESHOLD, BalancedShardsAllocator.this.threshold);
            if (threshold <= 0.0f) {
                throw new ElasticSearchIllegalArgumentException("threshold must be greater than 0.0f but was: " + threshold);
            }
            BalancedShardsAllocator.this.threshold = threshold;
            BalancedShardsAllocator.this.weightFunction = new WeightFunction(indexBalance, shardBalance, primaryBalance);
        }
    }

    private volatile WeightFunction weightFunction = new WeightFunction(DEFAULT_INDEX_BALANCE_FACTOR, DEFAULT_SHARD_BALANCE_FACTOR, DEFAULT_PRIMARY_BALANCE_FACTOR);

    private volatile float threshold = 1.0f;
    

    public BalancedShardsAllocator(Settings settings) {
        this(settings, new NodeSettingsService(settings));
    }

    @Inject
    public BalancedShardsAllocator(Settings settings, NodeSettingsService nodeSettingsService) {
        super(settings);
        ApplySettings applySettings = new ApplySettings();
        applySettings.onRefreshSettings(settings);
        nodeSettingsService.addListener(applySettings);
    }

    @Override
    public void applyStartedShards(StartedRerouteAllocation allocation) { /* ONLY FOR GATEWAYS */ }

    @Override
    public void applyFailedShards(FailedRerouteAllocation allocation) { /* ONLY FOR GATEWAYS */ }

    @Override
    public boolean allocateUnassigned(RoutingAllocation allocation) {
        return rebalance(allocation);
    }

    @Override
    public boolean rebalance(RoutingAllocation allocation) {
        final Balancer balancer = new Balancer(logger, allocation, weightFunction, threshold);
        return balancer.balance();
    }

    @Override
    public boolean move(MutableShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) {
        final Balancer balancer = new Balancer(logger, allocation, weightFunction, threshold);
        return balancer.move(shardRouting, node);
    }
    
    /**
     * Returns the currently configured delta threshold
     */
    public float getThreshold() {
        return threshold;
    }
    
    /**
     * Returns the index related weight factor.
     */
    public float getIndexBalance() {
        return weightFunction.indexBalance;
    }
    
    /**
     * Returns the primary related weight factor.
     */
    public float getPrimaryBalance() {
        return weightFunction.primaryBalance;
    }
    
    /**
     * Returns the shard related weight factor.
     */
    public float getShardBalance() {
        return weightFunction.shardBalance;
    }


    /**
     * This class is the primary weight function used to create balanced over nodes and shards in the cluster.
     * Currently this function has 3 properties:
     * 

     * index balance - balance property over shards per index
     * shard balance - balance property over shards per cluster
     * primary balance - balance property over primaries per cluster
     * 
     * 
     * Each of these properties are expressed as factor such that the properties factor defines the relative importance of the property for the
     * weight function. For example if the weight function should calculate the weights only based on a global (shard) balance the index and primary balance
     * can be set to 0.0 and will in turn have no effect on the distribution.
     * 
     * The weight per index is calculated based on the following formula:
     * 
     * 
     * weight_index(node, index) = indexBalance * (node.numShards(index) - avgShardsPerNode(index))
     * 
     * 
     * weight_node(node, index) = shardBalance * (node.numShards() - avgShardsPerNode)
     * 
     * 
     * weight_primary(node, index) = primaryBalance * (node.numPrimaries() - avgPrimariesPerNode)
     * 
     * 
     * weight(node, index) = weight_index(node, index) + weight_node(node, index) + weight_primary(node, index)
     */
    public static class WeightFunction {

        private final float indexBalance;
        private final float shardBalance;
        private final float primaryBalance;
        private final EnumMap thetaMap = new EnumMap(Operation.class);

        public WeightFunction(float indexBalance, float shardBalance, float primaryBalance) {
            float sum = indexBalance + shardBalance + primaryBalance;
            if (sum <= 0.0f) {
                throw new ElasticSearchIllegalArgumentException("Balance factors must sum to a value > 0 but was: " + sum);
            }
            final float[] defaultTheta = new float[] {  shardBalance / sum, indexBalance / sum, primaryBalance / sum };
            for(Operation operation : Operation.values()) {
                switch(operation) {
                case THRESHOLD_CHECK:
                    sum = indexBalance + shardBalance;
                    if (sum <= 0.0f) {
                        thetaMap.put(operation,  defaultTheta);
                    }
                    thetaMap.put(operation,  new float[] { shardBalance / sum,  indexBalance / sum, 0});
                    break;
                case BALANCE:
                case ALLOCATE:
                case MOVE:
                    thetaMap.put(operation,  defaultTheta);
                    break;
                default:
                    assert false;
                }
            }
            this.indexBalance = indexBalance;
            this.shardBalance = shardBalance;
            this.primaryBalance = primaryBalance;
        }

        public float weight(Operation operation, Balancer balancer, ModelNode node, String index) {
            final float weightShard = (node.numShards() - balancer.avgShardsPerNode());
            final float weightIndex = (node.numShards(index) - balancer.avgShardsPerNode(index));
            final float weightPrimary = (node.numPrimaries() - balancer.avgPrimariesPerNode());
            final float[] theta = thetaMap.get(operation);
            assert theta != null;
            return theta[0] * weightShard + theta[1] * weightIndex + theta[2] * weightPrimary;    
        }

    }
    
    /**
     * An enum that donates the actual operation the {@link WeightFunction} is
     * applied to.
     */
    public static enum Operation {
        /**
         * Provided during balance operations. 
         */
        BALANCE,
        /**
         * Provided during initial allocation operation for unassigned shards. 
         */
        ALLOCATE,
        /**
         * Provided during move operation.
         */
        MOVE, 
        /**
         * Provided when the weight delta is checked against the configured threshold.
         * This can be used to ignore tie-breaking weight factors that should not 
         * solely trigger a relocation unless the delta is above the threshold. 
         */
        THRESHOLD_CHECK
    }

    /**
     * A {@link Balancer}
     */
    public static class Balancer {

        private final ESLogger logger;
        private final Map nodes = new HashMap();
        private final HashSet indices = new HashSet();
        private final RoutingAllocation allocation;
        private final WeightFunction weight;

        private final float threshold;
        private final MetaData metaData;
        
        private final Predicate assignedFilter = new Predicate() {
            @Override
            public boolean apply(MutableShardRouting input) {
                return input.assignedToNode();
            }
        };


        public Balancer(ESLogger logger, RoutingAllocation allocation, WeightFunction weight, float threshold) {
            this.logger = logger;
            this.allocation = allocation;
            this.weight = weight;
            this.threshold = threshold;
            for (RoutingNode node : allocation.routingNodes()) {
                nodes.put(node.nodeId(), new ModelNode(node.nodeId()));
            }
            metaData = allocation.routingNodes().metaData();
        }

        /**
         * Returns an array view on the nodes in the balancer. Nodes should not be removed from this list.
         */
        private ModelNode[] nodesArray() {
            return nodes.values().toArray(new ModelNode[nodes.size()]);
        }

        /**
         * Returns the average of shards per node for the given index
         */
        public float avgShardsPerNode(String index) {
            return ((float) metaData.index(index).totalNumberOfShards()) / nodes.size();
        }

        /**
         * Returns the global average of shards per node
         */
        public float avgShardsPerNode() {
            return ((float) metaData.totalNumberOfShards()) / nodes.size();
        }

        /**
         * Returns the global average of primaries per node
         */
        public float avgPrimariesPerNode() {
            return ((float) metaData.numberOfShards()) / nodes.size();
        }

        /**
         * Returns the average of primaries per node for the given index
         */
        public float avgPrimariesPerNode(String index) {
            return ((float) metaData.index(index).numberOfShards()) / nodes.size();
        }

        /**
         * Returns a new {@link NodeSorter} that sorts the nodes based on their
         * current weight with respect to the index passed to the sorter. The
         * returned sorter is not sorted. Use {@link NodeSorter#reset(String)}
         * to sort based on an index.
         */
        private NodeSorter newNodeSorter() {
            final NodeSorter sorter = new NodeSorter(nodesArray(), weight, this);
            return sorter;
        }

        private boolean initialize(RoutingNodes routing) {
            Collection shards = new ArrayList();
            if (logger.isTraceEnabled()) {
                logger.trace("Start distributing Shards");
            }

            for (IndexRoutingTable index : allocation.routingTable().indicesRouting().values()) {
                indices.add(index.index());
                for (IndexShardRoutingTable shard : index.getShards().values()) {
                    shards.addAll(routing.shardsRoutingFor(index.index(), shard.shardId().id()));
                }
            }
            buildModelFromAssigned(Iterables.filter(shards, assignedFilter));
            return allocateUnassigned(allocation.routingNodes().unassigned(), allocation.routingNodes().ignoredUnassigned());
        }

        /**
         * Balances the nodes on the cluster model according to the weight
         * function. The configured threshold is the minimum delta between the
         * weight of the maximum node and the minimum node according to the
         * {@link WeightFunction}. This weight is calculated per index to
         * distribute shards evenly per index. The balancer tries to relocate
         * shards only if the delta exceeds the threshold. If the default case
         * the threshold is set to 1.0 to enforce gaining relocation
         * only, or in other words relocations that move the weight delta closer
         * to 0.0
         *
         * @return true if the current configuration has been
         *         changed, otherwise false
         */
        public boolean balance() {
            if (this.nodes.isEmpty()) {
                /* with no nodes this is pointless */
                return false;
            }
            if (logger.isTraceEnabled()) {
                logger.trace("Start balancing cluster");
            }

            boolean changed = initialize(allocation.routingNodes());
            NodeSorter sorter = newNodeSorter();
            if (nodes.size() > 1) { /* skip if we only have one node */
                for (String index : buildWeightOrderedIndidces(Operation.BALANCE, sorter)) {
                    sorter.reset(Operation.BALANCE,index);
                    final float[] weights = sorter.weights;
                    final ModelNode[] modelNodes = sorter.modelNodes;
                    int lowIdx = 0;
                    int highIdx = weights.length - 1;
                    while (true) {
                        final ModelNode minNode = modelNodes[lowIdx];
                        final ModelNode maxNode = modelNodes[highIdx];
                        if (maxNode.numShards(index) > 0) {
                            float delta = weights[highIdx] - weights[lowIdx];
                            delta = delta <= threshold ? delta : sorter.weight(Operation.THRESHOLD_CHECK, maxNode) - sorter.weight(Operation.THRESHOLD_CHECK, minNode);
                            if (delta <= threshold) {
                                if (logger.isTraceEnabled()) {
                                    
                                    logger.trace("Stop balancing index [{}]  min_node [{}] weight: [{}]  max_node [{}] weight: [{}]  delta: [{}]",
                                            index, maxNode.getNodeId(), weights[highIdx], minNode.getNodeId(), weights[lowIdx], delta);
                                }
                                break;
                            }
                            if (logger.isTraceEnabled()) {
                                logger.trace("Balancing from node [{}] weight: [{}] to node [{}] weight: [{}]  delta: [{}]",
                                        maxNode.getNodeId(), weights[highIdx], minNode.getNodeId(), weights[lowIdx], delta);
                            }
                            /* pass the delta to the replication function to prevent relocations that only swap the weights of the two nodes.
                             * a relocation must bring us closer to the balance if we only achive the same delta the relocation is useless */
                            if (tryRelocateShard(Operation.BALANCE, minNode, maxNode, index, delta)) {
                                /*
                                 * TODO we could be a bit smarter here, we don't need to fully sort necessarily
                                 * we could just find the place to insert linearly but the win might be minor
                                 * compared to the added complexity
                                 */
                                weights[lowIdx] = sorter.weight(Operation.BALANCE, modelNodes[lowIdx]);
                                weights[highIdx] = sorter.weight(Operation.BALANCE, modelNodes[highIdx]);
                                sorter.quickSort(0, weights.length - 1);
                                lowIdx = 0;
                                highIdx = weights.length - 1;
                                changed = true;
                                continue;
                            }
                        }
                        if (lowIdx < highIdx - 1) {
                            /* we can't move from any shard from the min node lets move on to the next node
                             * and see if the threshold still holds. We either don't have any shard of this
                             * index on this node of allocation deciders prevent any relocation.*/
                            lowIdx++;
                        } else if (lowIdx > 0) {
                            /* now we go max to min since obviously we can't move anything to the max node 
                             * lets pick the next highest */
                            lowIdx = 0;
                            highIdx--;
                        } else {
                            /* we are done here, we either can't relocate anymore or we are balanced */
                            break;
                        }
                    }
                }
            }
            return changed;
        }

        /**
         * This builds a initial index ordering where the indices are returned
         * in most unbalanced first. We need this in order to prevent over
         * allocations on added nodes from one index when the weight parameters
         * for global balance overrule the index balance at an intermediate
         * state. For example this can happen if we have 3 nodes and 3 indices
         * with 3 shards and 1 shard. At the first stage all three nodes hold
         * 2 shard for each index. now we add another node and the first index
         * is balanced moving 3 two of the nodes over to the new node since it
         * has no shards yet and global balance for the node is way below
         * average. To re-balance we need to move shards back eventually likely
         * to the nodes we relocated them from.
         */
        private String[] buildWeightOrderedIndidces(Operation operation, NodeSorter sorter) {
            final String[] indices = this.indices.toArray(new String[this.indices.size()]);
            final float[] deltas = new float[indices.length];
            for (int i = 0; i < deltas.length; i++) {
                sorter.reset(operation, indices[i]);
                deltas[i] = sorter.delta();
            }
            new SorterTemplate() {
                float pivotWeight;

                @Override
                protected void swap(int i, int j) {
                    final String tmpIdx = indices[i];
                    indices[i] = indices[j];
                    indices[j] = tmpIdx;
                    final float tmpDelta = deltas[i];
                    deltas[i] = deltas[j];
                    deltas[j] = tmpDelta;
                }

                @Override
                protected int compare(int i, int j) {
                    return Float.compare(deltas[j], deltas[i]);
                }

                @Override
                protected void setPivot(int i) {
                    pivotWeight = deltas[i];
                }

                @Override
                protected int comparePivot(int j) {
                    return Float.compare(deltas[j], pivotWeight);
                }
            }.quickSort(0, deltas.length - 1);

            return indices;
        }

        /**
         * This function executes a move operation moving the given shard from
         * the given node to the minimal eligible node with respect to the
         * weight function. Iff the shard is moved the shard will be set to
         * {@link ShardRoutingState#RELOCATING} and a shadow instance of this
         * shard is created with an incremented version in the state
         * {@link ShardRoutingState#INITIALIZING}.
         *
         * @return true iff the shard has successfully been moved.
         */
        public boolean move(MutableShardRouting shard, RoutingNode node) {
            if (nodes.isEmpty() || !shard.started()) {
                /* with no nodes or a not started shard this is pointless */
                return false;
            }
            if (logger.isTraceEnabled()) {
                logger.trace("Try moving shard [{}] from [{}]", shard, node);
            }
            boolean changed = initialize(allocation.routingNodes());

            final ModelNode sourceNode = nodes.get(node.nodeId());
            assert sourceNode != null;
            final NodeSorter sorter = newNodeSorter();
            sorter.reset(Operation.MOVE, shard.getIndex());
            final ModelNode[] nodes = sorter.modelNodes;
            assert sourceNode.containsShard(shard);
            /*
             * the sorter holds the minimum weight node first for the shards index.
             * We now walk through the nodes until we find a node to allocate the shard.
             * This is not guaranteed to be balanced after this operation we still try best effort to 
             * allocate on the minimal eligable node.
             */
            for (ModelNode currentNode : nodes) {
                if (currentNode.getNodeId().equals(node.nodeId())) {
                    continue;
                }
                RoutingNode target = allocation.routingNodes().node(currentNode.getNodeId());
                Decision decision = allocation.deciders().canAllocate(shard, target, allocation);
                if (decision.type() == Type.YES) { // TODO maybe we can respect throtteling here too?
                    sourceNode.removeShard(shard);
                    final MutableShardRouting initializingShard = new MutableShardRouting(shard.index(), shard.id(), currentNode.getNodeId(),
                            shard.currentNodeId(), shard.primary(), INITIALIZING, shard.version() + 1);
                    currentNode.addShard(initializingShard, decision);
                    target.add(initializingShard);
                    shard.relocate(target.nodeId()); // set the node to relocate after we added the initializing shard
                    if (logger.isTraceEnabled()) {
                        logger.trace("Moved shard [{}] to node [{}]", shard, currentNode.getNodeId());
                    }
                    return true;
                }
            }

            return changed;
        }

        /**
         * Builds the internal model from all shards in the given
         * {@link Iterable}. All shards in the {@link Iterable} must be assigned
         * to a node. This method will skip shards in the state
         * {@link ShardRoutingState#RELOCATING} since each relocating shard has
         * a shadow shard in the state {@link ShardRoutingState#INITIALIZING}
         * on the target node which we respect during the allocation / balancing
         * process. In short, this method recreates the status-quo in the cluster.
         */
        private void buildModelFromAssigned(Iterable shards) {
            for (MutableShardRouting shard : shards) {
                assert shard.assignedToNode();
                /* we skip relocating shards here since we expect an initializing shard with the same id coming in */
                if (shard.state() == RELOCATING) {
                    continue;
                }
                ModelNode node = nodes.get(shard.currentNodeId());
                assert node != null;
                node.addShard(shard, Decision.single(Type.YES, "Already allocated on node", node.getNodeId()));
                if (logger.isTraceEnabled()) {
                    logger.trace("Assigned shard [{}] to node [{}]", shard, node.getNodeId());
                }
            }
        }

        /**
         * Allocates all given shards on the minimal eligable node for the shards index
         * with respect to the weight function. All given shards must be unassigned.
         */
        private boolean allocateUnassigned(List unassigned, List ignoredUnassigned) {
            assert !nodes.isEmpty();
            if (logger.isTraceEnabled()) {
                logger.trace("Start allocating unassigned shards");
            }
            if (unassigned.isEmpty()) {
                return false;
            }
            boolean changed = false;
          
            /*
             * TODO: We could be smarter here and group the shards by index and then
             * use the sorter to save some iterations. 
             */
            final RoutingNodes routingNodes = allocation.routingNodes();
            final AllocationDeciders deciders = allocation.deciders();
            final Set currentRound = new TreeSet(new Comparator() {
                @Override
                public int compare(MutableShardRouting o1,
                                   MutableShardRouting o2) {
                    final int indexCmp;
                    if ((indexCmp = o1.index().compareTo(o2.index())) == 0) {
                        if (o1.getId() - o2.getId() == 0) {
                            return o1.primary() ? -1 : o2.primary() ? 1 : 0;
                        }
                        return o1.getId() - o2.getId();

                    }
                    return indexCmp;
                }
            });
            do {
                Iterator iterator = unassigned.iterator();
                while (iterator.hasNext()) {
                    /* we treat every index equally here once chunk a time such that we fill up
                     * nodes with all indices at the same time. Only on shard of a shard a time.
                	 * Although there might be a primary and a shard of a shard in the set but
                	 * primaries will be started first.*/
                    if (currentRound.add(iterator.next())) {
                        iterator.remove();
                    }
                }
                boolean iterationChanged = false;
                for (MutableShardRouting shard : currentRound) {
                    assert !shard.assignedToNode();
                    /* find an node with minimal weight we can allocate on*/
                    float minWeight = Float.POSITIVE_INFINITY;
                    ModelNode minNode = null;
                    Decision decision = null;
                    for (ModelNode node : nodes.values()) {
	                    /*
	                     * The shard we add is removed below to simulate the
	                     * addition for weight calculation we use Decision.ALWAYS to
	                     * not violate the not null condition.
	                     */
                        if (!node.containsShard(shard)) {
                            node.addShard(shard, Decision.ALWAYS);
                            float currentWeight = weight.weight(Operation.ALLOCATE, this, node, shard.index());
	                        /*
	                         * Remove the shard from the node again this is only a
	                         * simulation
	                         */
                            Decision removed = node.removeShard(shard);
                            assert removed != null;
	                        /*
	                         * Unless the operation is not providing any gains we
	                         * don't check deciders
	                         */
                            if (currentWeight <= minWeight) {
                                Decision currentDecision = deciders.canAllocate(shard, routingNodes.node(node.getNodeId()), allocation);
                                NOUPDATE:
                                if (currentDecision.type() == Type.YES || currentDecision.type() == Type.THROTTLE) {
                                    if (currentWeight == minWeight) {
	                                    /*  we have an equal weight tie breaking:
	                                     *  1. if one decision is YES prefer it
	                                     *  2. prefer the node that holds the primary for this index with the next id in the ring ie.
	                                     *  for the 3 shards 2 replica case we try to build up:
	                                     *    1 2 0
	                                     *    2 0 1
	                                     *    0 1 2
	                                     *  such that if we need to tie-break we try to prefer the node holding a shard with the minimal id greater
	                                     *  than the id of the shard we need to assign. This works find when new indices are created since 
	                                     *  primaries are added first and we only add one shard set a time in this algorithm.
	                                     */
                                        if (currentDecision.type() == decision.type()) {
                                            final int repId = shard.id();
                                            final int nodeHigh = node.highestPrimary(shard.index());
                                            final int minNodeHigh = minNode.highestPrimary(shard.index());
                                            if ((((nodeHigh > repId && minNodeHigh > repId) || (nodeHigh < repId && minNodeHigh < repId)) && (nodeHigh < minNodeHigh))
                                                    || (nodeHigh > minNodeHigh && nodeHigh > repId && minNodeHigh < repId)) {
                                                minNode = node;
                                                minWeight = currentWeight;
                                                decision = currentDecision;
                                            } else {
                                                break NOUPDATE;
                                            }
                                        } else if (currentDecision.type() != Type.YES) {
                                            break NOUPDATE;
                                        }
                                    }
                                    minNode = node;
                                    minWeight = currentWeight;
                                    decision = currentDecision;
                                }
                            }
                        }
                    }
                    assert decision != null && minNode != null || decision == null && minNode == null;
                    if (minNode != null) {
                        iterationChanged = true;
                        minNode.addShard(shard, decision);
                        if (decision.type() == Type.YES) {
                            if (logger.isTraceEnabled()) {
                                logger.trace("Assigned shard [{}] to [{}]", shard, minNode.getNodeId());
                            }
                            routingNodes.node(minNode.getNodeId()).add(shard);
                            changed = true;
                            continue; // don't add to ignoreUnassigned
                        }
                    } else if (logger.isTraceEnabled()) {
                        logger.trace("No Node found to assign shard [{}]", shard);
                    }
                    ignoredUnassigned.add(shard);
                }
                if (!iterationChanged && !unassigned.isEmpty()) {
                    ignoredUnassigned.addAll(unassigned);
                    unassigned.clear();
                    return changed;
                }
                currentRound.clear();
            } while (!unassigned.isEmpty());
            // clear everything we have either added it or moved to ingoreUnassigned
            return changed;
        }

        /**
         * Tries to find a relocation from the max node to the minimal node for an arbitrary shard of the given index on the
         * balance model. Iff this method returns a true the relocation has already been executed on the
         * simulation model as well as on the cluster.
         */
        private boolean tryRelocateShard(Operation operation, ModelNode minNode, ModelNode maxNode, String idx, float minCost) {
            final ModelIndex index = maxNode.getIndex(idx);
            if (index != null) {
                if (logger.isTraceEnabled()) {
                    logger.trace("Try relocating shard for index index [{}] from node [{}] to node [{}]", idx, maxNode.getNodeId(),
                            minNode.getNodeId());
                }
                final RoutingNode node = allocation.routingNodes().node(minNode.getNodeId());
                MutableShardRouting candidate = null;
                Decision decision = null;
                final AllocationDeciders deciders = allocation.deciders();
                /* make a copy since we modify this list in the loop */
                final ArrayList shards = new ArrayList(index.getAllShards());
                for (MutableShardRouting shard : shards) {
                    if (shard.started()) {
                        // skip initializing, unassigned and relocating shards we can't relocate them anyway
                        Decision allocationDecision = deciders.canAllocate(shard, node, allocation);
                        Decision rebalanceDecission = deciders.canRebalance(shard, allocation);

                        if (((allocationDecision.type() == Type.YES) || (allocationDecision.type() == Type.THROTTLE))
                                && ((rebalanceDecission.type() == Type.YES) || (rebalanceDecission.type() == Type.THROTTLE))) {
                            Decision srcDecision;
                            if ((srcDecision = maxNode.removeShard(shard)) != null) {
                                minNode.addShard(shard, srcDecision);
                                final float delta = weight.weight(operation, this, minNode, idx) - weight.weight(operation, this, maxNode, idx);
                                if (delta < minCost) {
                                    minCost = delta;
                                    candidate = shard;
                                    decision = new Decision.Multi().add(allocationDecision).add(rebalanceDecission);
                                }
                                minNode.removeShard(shard);
                                maxNode.addShard(shard, srcDecision);
                            }
                        }
                    }
                }

                if (candidate != null) {
                  
                    /* allocate on the model even if not throttled */
                    maxNode.removeShard(candidate);
                    minNode.addShard(candidate, decision);
                    if (decision.type() == Type.YES) { /* only allocate on the cluster if we are not throttled */
                        if (logger.isTraceEnabled()) {
                            logger.trace("Relocate shard [{}] from node [{}] to node [{}]", candidate, maxNode.getNodeId(),
                                    minNode.getNodeId());
                        }
                        /* now allocate on the cluster - if we are started we need to relocate the shard */
                        if (candidate.started()) {
                            RoutingNode lowRoutingNode = allocation.routingNodes().node(minNode.getNodeId());
                            lowRoutingNode.add(new MutableShardRouting(candidate.index(), candidate.id(), lowRoutingNode.nodeId(), candidate
                                    .currentNodeId(), candidate.primary(), INITIALIZING, candidate.version() + 1));
                            candidate.relocate(lowRoutingNode.nodeId());

                        } else {
                            assert candidate.unassigned();
                            allocation.routingNodes().node(minNode.getNodeId()).add(candidate);
                        }
                        return true;

                    }
                }
            }
            if (logger.isTraceEnabled()) {
                logger.trace("Couldn't find shard to relocate from node [{}] to node [{}]", maxNode.getNodeId(),
                        minNode.getNodeId());
            }
            return false;
        }

    }

    static class ModelNode implements Iterable {
        private final String id;
        private final Map indices = new HashMap();
        /* cached stats - invalidated on add/remove and lazily calculated */
        private int numShards = -1;
        private int numPrimaries = -1;

        public ModelNode(String id) {
            this.id = id;
        }

        public ModelIndex getIndex(String indexId) {
            return indices.get(indexId);
        }

        public String getNodeId() {
            return id;
        }

        public int numShards() {
            if (numShards == -1) {
                int sum = 0;
                for (ModelIndex index : indices.values()) {
                    sum += index.numShards();
                }
                numShards = sum;
            }
            return numShards;
        }

        public int numShards(String idx) {
            ModelIndex index = indices.get(idx);
            return index == null ? 0 : index.numShards();
        }

        public int numPrimaries(String idx) {
            ModelIndex index = indices.get(idx);
            return index == null ? 0 : index.numPrimaries();
        }

        public int numPrimaries() {
            if (numPrimaries == -1) {
                int sum = 0;
                for (ModelIndex index : indices.values()) {
                    sum += index.numPrimaries();
                }
                numPrimaries = sum;
            }
            return numPrimaries;
        }

        public Collection shards() {
            Collection result = new ArrayList();
            for (ModelIndex index : indices.values()) {
                result.addAll(index.getAllShards());
            }
            return result;
        }

        public int highestPrimary(String index) {
            ModelIndex idx = indices.get(index);
            if (idx != null) {
                return idx.highestPrimary();
            }
            return -1;
        }

        public void addShard(MutableShardRouting shard, Decision decision) {
            numPrimaries = numShards = -1;
            ModelIndex index = indices.get(shard.index());
            if (index == null) {
                index = new ModelIndex(shard.index());
                indices.put(index.getIndexId(), index);
            }
            index.addShard(shard, decision);
        }

        public Decision removeShard(MutableShardRouting shard) {
            numPrimaries = numShards = -1;
            ModelIndex index = indices.get(shard.index());
            Decision removed = null;
            if (index != null) {
                removed = index.removeShard(shard);
                if (removed != null && index.numShards() == 0) {
                    indices.remove(shard.index());
                }
            }
            return removed;
        }

        public String toString() {
            StringBuilder sb = new StringBuilder();
            sb.append("Node(").append(id).append(")");
            return sb.toString();
        }

        @Override
        public Iterator iterator() {
            return indices.values().iterator();
        }

        public boolean containsShard(MutableShardRouting shard) {
            ModelIndex index = getIndex(shard.getIndex());
            return index == null ? false : index.containsShard(shard);
        }

    }

    static final class ModelIndex {
        private final String id;
        private final Map shards = new HashMap();
        private int numPrimaries = -1;
        private int highestPrimary = -1;

        public ModelIndex(String id) {
            this.id = id;
        }

        public int highestPrimary() {
            if (highestPrimary == -1) {
                int maxId = -1;
                for (MutableShardRouting shard : shards.keySet()) {
                    if (shard.primary()) {
                        maxId = Math.max(maxId, shard.id());
                    }
                }
                return highestPrimary = maxId;
            }
            return highestPrimary;
        }

        public String getIndexId() {
            return id;
        }

        public Decision getDecicion(MutableShardRouting shard) {
            return shards.get(shard);
        }

        public int numShards() {
            return shards.size();
        }

        public Collection getAllShards() {
            return shards.keySet();
        }

        public int numPrimaries() {
            if (numPrimaries == -1) {
                int num = 0;
                for (MutableShardRouting shard : shards.keySet()) {
                    if (shard.primary()) {
                        num++;
                    }
                }
                return numPrimaries = num;
            }
            return numPrimaries;
        }

        public Decision removeShard(MutableShardRouting shard) {
            highestPrimary = numPrimaries = -1;
            return shards.remove(shard);
        }

        public void addShard(MutableShardRouting shard, Decision decision) {
            highestPrimary = numPrimaries = -1;
            assert decision != null;
            assert !shards.containsKey(shard) : "Shard already allocated on current node: " + shards.get(shard) + " " + shard;
            shards.put(shard, decision);
        }

        public boolean containsShard(MutableShardRouting shard) {
            return shards.containsKey(shard);
        }
    }

    static final class NodeSorter extends SorterTemplate {

        final ModelNode[] modelNodes;
        /* the nodes weights with respect to the current weight function / index */
        final float[] weights;
        private final WeightFunction function;
        private String index;
        private final Balancer balancer;
        private float pivotWeight;

        public NodeSorter(ModelNode[] modelNodes, WeightFunction function, Balancer balancer) {
            this.function = function;
            this.balancer = balancer;
            this.modelNodes = modelNodes;
            weights = new float[modelNodes.length];
        }

        /**
         * Resets the sorter, recalculates the weights per node and sorts the
         * nodes by weight, with minimal weight first.
         */
        public void reset(Operation operation, String index) {
            this.index = index;
            for (int i = 0; i < weights.length; i++) {
                weights[i] = weight(operation, modelNodes[i]);
            }
            quickSort(0, modelNodes.length - 1);
        }

        public float weight(Operation operation, ModelNode node) {
            return function.weight(operation, balancer, node, index);
        }

        @Override
        protected void swap(int i, int j) {
            final ModelNode tmpNode = modelNodes[i];
            modelNodes[i] = modelNodes[j];
            modelNodes[j] = tmpNode;
            final float tmpWeight = weights[i];
            weights[i] = weights[j];
            weights[j] = tmpWeight;
        }

        @Override
        protected int compare(int i, int j) {
            return Float.compare(weights[i], weights[j]);
        }

        @Override
        protected void setPivot(int i) {
            pivotWeight = weights[i];
        }

        @Override
        protected int comparePivot(int j) {
            return Float.compare(pivotWeight, weights[j]);
        }

        public float delta() {
            return weights[weights.length - 1] - weights[0];
        }
    }
}