org.apache.cassandra.dht.tokenallocator.ReplicationAwareTokenAllocator Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of cassandra-all Show documentation
A fork of the Apache Cassandra Project ready to embed Elasticsearch.
There is a newer version: 3.11.12.3
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.cassandra.dht.tokenallocator;

import java.util.*;

import com.google.common.collect.HashMultimap;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimap;

import org.apache.cassandra.dht.IPartitioner;
import org.apache.cassandra.dht.Token;

/**
 * A Replication Aware allocator for tokens, that attempts to ensure an even distribution of ownership across
 * the known cluster for the provided replication strategy.
 *
 * A unit is shorthand for a "unit of ownership" which translates roughly to a node, or a disk on the node,
 * a CPU on the node, or some other relevant unit of ownership. These units should be the lowest rung over which
 * ownership needs to be evenly distributed. At the moment only nodes as a whole are treated as units, but that
 * will change with the introduction of token ranges per disk.
 */
class ReplicationAwareTokenAllocator extends TokenAllocatorBase
{
    final Multimap unitToTokens;
    final int replicas;

    ReplicationAwareTokenAllocator(NavigableMap sortedTokens, ReplicationStrategy strategy, IPartitioner partitioner)
    {
        super(sortedTokens, strategy, partitioner);
        unitToTokens = HashMultimap.create();
        for (Map.Entry en : sortedTokens.entrySet())
            unitToTokens.put(en.getValue(), en.getKey());
        this.replicas = strategy.replicas();
    }

    public int getReplicas()
    {
        return replicas;
    }

    public Collection addUnit(Unit newUnit, int numTokens)
    {
        assert !unitToTokens.containsKey(newUnit);

        if (unitCount() < replicas)
            // Allocation does not matter; everything replicates everywhere.
            return generateRandomTokens(newUnit, numTokens);
        if (numTokens > sortedTokens.size())
            // Some of the heuristics below can't deal with this case. Use random for now, later allocations can fix any problems this may cause.
            return generateRandomTokens(newUnit, numTokens);

        // ============= construct our initial token ring state =============

        double optTokenOwnership = optimalTokenOwnership(numTokens);
        Map groups = Maps.newHashMap();
        Map> unitInfos = createUnitInfos(groups);
        if (groups.size() < replicas)
        {
            // We need at least replicas groups to do allocation correctly. If there aren't enough, 
            // use random allocation.
            // This part of the code should only be reached via the RATATest. StrategyAdapter should disallow
            // token allocation in this case as the algorithm is not able to cover the behavior of NetworkTopologyStrategy.
            return generateRandomTokens(newUnit, numTokens);
        }

        // initialise our new unit's state (with an idealised ownership)
        // strategy must already know about this unit
        UnitInfo newUnitInfo = new UnitInfo<>(newUnit, numTokens * optTokenOwnership, groups, strategy);

        // build the current token ring state
        TokenInfo tokens = createTokenInfos(unitInfos, newUnitInfo.group);
        newUnitInfo.tokenCount = numTokens;

        // ============= construct and rank our candidate token allocations =============

        // walk the token ring, constructing the set of candidates in ring order
        // as the midpoints between all existing tokens
        CandidateInfo candidates = createCandidates(tokens, newUnitInfo, optTokenOwnership);

        // Evaluate the expected improvements from all candidates and form a priority queue.
        PriorityQueue>> improvements = new PriorityQueue<>(sortedTokens.size());
        CandidateInfo candidate = candidates;
        do
        {
            double impr = evaluateImprovement(candidate, optTokenOwnership, 1.0 / numTokens);
            improvements.add(new Weighted<>(impr, candidate));
            candidate = candidate.next;
        } while (candidate != candidates);

        // ============= iteratively take the best candidate, and re-rank =============

        CandidateInfo bestToken = improvements.remove().value;
        for (int vn = 1; ; ++vn)
        {
            candidates = bestToken.removeFrom(candidates);
            confirmCandidate(bestToken);

            if (vn == numTokens)
                break;

            while (true)
            {
                // Get the next candidate in the queue. Its improvement may have changed (esp. if multiple tokens
                // were good suggestions because they could improve the same problem)-- evaluate it again to check
                // if it is still a good candidate.
                bestToken = improvements.remove().value;
                double impr = evaluateImprovement(bestToken, optTokenOwnership, (vn + 1.0) / numTokens);
                Weighted> next = improvements.peek();

                // If it is better than the next in the queue, it is good enough. This is a heuristic that doesn't
                // get the best results, but works well enough and on average cuts search time by a factor of O(vnodes).
                if (next == null || impr >= next.weight)
                    break;
                improvements.add(new Weighted<>(impr, bestToken));
            }
        }

        return ImmutableList.copyOf(unitToTokens.get(newUnit));
    }

    private Collection generateRandomTokens(Unit newUnit, int numTokens)
    {
        Set tokens = new HashSet<>(numTokens);
        while (tokens.size() < numTokens)
        {
            Token token = partitioner.getRandomToken();
            if (!sortedTokens.containsKey(token))
            {
                tokens.add(token);
                sortedTokens.put(token, newUnit);
                unitToTokens.put(newUnit, token);
            }
        }
        return tokens;
    }

    /**
     * Construct the token ring as a CircularList of TokenInfo,
     * and populate the ownership of the UnitInfo's provided
     */
    private TokenInfo createTokenInfos(Map> units, GroupInfo newUnitGroup)
    {
        // build the circular list
        TokenInfo prev = null;
        TokenInfo first = null;
        for (Map.Entry en : sortedTokens.entrySet())
        {
            Token t = en.getKey();
            UnitInfo ni = units.get(en.getValue());
            TokenInfo ti = new TokenInfo<>(t, ni);
            first = ti.insertAfter(first, prev);
            prev = ti;
        }

        TokenInfo curr = first;
        do
        {
            populateTokenInfoAndAdjustUnit(curr, newUnitGroup);
            curr = curr.next;
        } while (curr != first);

        return first;
    }

    private CandidateInfo createCandidates(TokenInfo tokens, UnitInfo newUnitInfo, double initialTokenOwnership)
    {
        TokenInfo curr = tokens;
        CandidateInfo first = null;
        CandidateInfo prev = null;
        do
        {
            CandidateInfo candidate = new CandidateInfo(partitioner.midpoint(curr.prev.token, curr.token), curr, newUnitInfo);
            first = candidate.insertAfter(first, prev);

            candidate.replicatedOwnership = initialTokenOwnership;
            populateCandidate(candidate);

            prev = candidate;
            curr = curr.next;
        } while (curr != tokens);
        prev.next = first;
        return first;
    }

    private void populateCandidate(CandidateInfo candidate)
    {
        // Only finding replication start would do.
        populateTokenInfo(candidate, candidate.owningUnit.group);
    }

    /**
     * Incorporates the selected candidate into the ring, adjusting ownership information and calculated token
     * information.
     */
    private void confirmCandidate(CandidateInfo candidate)
    {
        // This process is less efficient than it could be (loops through each vnode's replication span instead
        // of recalculating replicationStart, replicationThreshold from existing data + new token data in an O(1)
        // case analysis similar to evaluateImprovement). This is fine as the method does not dominate processing
        // time.

        // Put the accepted candidate in the token list.
        UnitInfo newUnit = candidate.owningUnit;
        Token newToken = candidate.token;
        sortedTokens.put(newToken, newUnit.unit);
        unitToTokens.put(newUnit.unit, newToken);

        TokenInfo prev = candidate.prevInRing();
        TokenInfo newTokenInfo = new TokenInfo<>(newToken, newUnit);
        newTokenInfo.replicatedOwnership = candidate.replicatedOwnership;
        newTokenInfo.insertAfter(prev, prev);   // List is not empty so this won't need to change head of list.

        // Update data for candidate.
        populateTokenInfoAndAdjustUnit(newTokenInfo, newUnit.group);

        ReplicationVisitor replicationVisitor = new ReplicationVisitor();
        assert newTokenInfo.next == candidate.split;
        for (TokenInfo curr = newTokenInfo.next; !replicationVisitor.visitedAll(); curr = curr.next)
        {
            // update the candidate between curr and next
            candidate = candidate.next;
            populateCandidate(candidate);

            if (!replicationVisitor.add(curr.owningUnit.group))
                continue;    // If we've already seen this group, the token cannot be affected.

            populateTokenInfoAndAdjustUnit(curr, newUnit.group);
        }

        replicationVisitor.clean();
    }

    /**
     * Calculates the {@code replicationStart} of a token, as well as {@code replicationThreshold} which is chosen in a way
     * that permits {@code findUpdatedReplicationStart} to quickly identify changes in ownership.
     */
    private Token populateTokenInfo(BaseTokenInfo token, GroupInfo newUnitGroup)
    {
        GroupInfo tokenGroup = token.owningUnit.group;
        PopulateVisitor visitor = new PopulateVisitor();

        // Replication start = the end of a token from the RF'th different group seen before the token.
        Token replicationStart;
        // The end of a token from the RF-1'th different group seen before the token.
        Token replicationThreshold = token.token;
        GroupInfo currGroup;
        for (TokenInfo curr = token.prevInRing(); ; curr = curr.prev)
        {
            replicationStart = curr.token;
            currGroup = curr.owningUnit.group;
            if (!visitor.add(currGroup))
                continue; // Group is already seen.
            if (visitor.visitedAll())
                break;

            replicationThreshold = replicationStart;
            // Another instance of the same group precedes us in the replication range of the ring,
            // so this is where our replication range begins
            if (currGroup == tokenGroup)
                break;
        }
        if (newUnitGroup == tokenGroup)
            // new token is always a boundary (as long as it's closer than replicationStart)
            replicationThreshold = token.token;
        else if (newUnitGroup != currGroup && visitor.seen(newUnitGroup))
            // already has new group in replication span before last seen. cannot be affected
            replicationThreshold = replicationStart;
        visitor.clean();

        token.replicationThreshold = replicationThreshold;
        token.replicationStart = replicationStart;
        return replicationStart;
    }

    private void populateTokenInfoAndAdjustUnit(TokenInfo populate, GroupInfo newUnitGroup)
    {
        Token replicationStart = populateTokenInfo(populate, newUnitGroup);
        double newOwnership = replicationStart.size(populate.token);
        double oldOwnership = populate.replicatedOwnership;
        populate.replicatedOwnership = newOwnership;
        populate.owningUnit.ownership += newOwnership - oldOwnership;
    }

    /**
     * Evaluates the improvement in variance for both units and individual tokens when candidate is inserted into the
     * ring.
     */
    private double evaluateImprovement(CandidateInfo candidate, double optTokenOwnership, double newUnitMult)
    {
        double tokenChange = 0;

        UnitInfo candidateUnit = candidate.owningUnit;
        Token candidateEnd = candidate.token;

        // Form a chain of units affected by the insertion to be able to qualify change of unit ownership.
        // A unit may be affected more than once.
        UnitAdjustmentTracker unitTracker = new UnitAdjustmentTracker<>(candidateUnit);

        // Reflect change in ownership of the splitting token (candidate).
        tokenChange += applyOwnershipAdjustment(candidate, candidateUnit, candidate.replicationStart, candidateEnd, optTokenOwnership, unitTracker);

        // Loop through all vnodes that replicate candidate or split and update their ownership.
        ReplicationVisitor replicationVisitor = new ReplicationVisitor();
        for (TokenInfo curr = candidate.split; !replicationVisitor.visitedAll(); curr = curr.next)
        {
            UnitInfo currUnit = curr.owningUnit;

            if (!replicationVisitor.add(currUnit.group))
                continue;    // If this group is already seen, the token cannot be affected.

            Token replicationEnd = curr.token;
            Token replicationStart = findUpdatedReplicationStart(curr, candidate);
            tokenChange += applyOwnershipAdjustment(curr, currUnit, replicationStart, replicationEnd, optTokenOwnership, unitTracker);
        }
        replicationVisitor.clean();

        double nodeChange = unitTracker.calculateUnitChange(newUnitMult, optTokenOwnership);
        return -(tokenChange + nodeChange);
    }

    /**
     * Returns the start of the replication span for the token {@code curr} when {@code candidate} is inserted into the
     * ring.
     */
    private Token findUpdatedReplicationStart(TokenInfo curr, CandidateInfo candidate)
    {
        return furtherStartToken(curr.replicationThreshold, candidate.token, curr.token);
    }

    /**
     * Applies the ownership adjustment for the given element, updating tracked unit ownership and returning the change
     * of variance.
     */
    private double applyOwnershipAdjustment(BaseTokenInfo curr, UnitInfo currUnit,
            Token replicationStart, Token replicationEnd,
            double optTokenOwnership, UnitAdjustmentTracker unitTracker)
    {
        double oldOwnership = curr.replicatedOwnership;
        double newOwnership = replicationStart.size(replicationEnd);
        double tokenCount = currUnit.tokenCount;
        assert tokenCount > 0;
        unitTracker.add(currUnit, newOwnership - oldOwnership);
        return (sq(newOwnership - optTokenOwnership) - sq(oldOwnership - optTokenOwnership)) / sq(tokenCount);
    }

    /**
     * Tracker for unit ownership changes. The changes are tracked by a chain of UnitInfos where the adjustedOwnership
     * field is being updated as we see changes in token ownership.
     *
     * The chain ends with an element that points to itself; this element must be specified as argument to the
     * constructor as well as be the first unit with which 'add' is called; when calculating the variance change
     * a separate multiplier is applied to it (used to permit more freedom in choosing the first tokens of a unit).
     */
    private static class UnitAdjustmentTracker
    {
        UnitInfo unitsChain;

        UnitAdjustmentTracker(UnitInfo newUnit)
        {
            unitsChain = newUnit;
        }

        void add(UnitInfo currUnit, double diff)
        {
            if (currUnit.prevUsed == null)
            {
                assert unitsChain.prevUsed != null || currUnit == unitsChain;

                currUnit.adjustedOwnership = currUnit.ownership + diff;
                currUnit.prevUsed = unitsChain;
                unitsChain = currUnit;
            }
            else
            {
                currUnit.adjustedOwnership += diff;
            }
        }

        double calculateUnitChange(double newUnitMult, double optTokenOwnership)
        {
            double unitChange = 0;
            UnitInfo unitsChain = this.unitsChain;
            // Now loop through the units chain and add the unit-level changes. Also clear the groups' seen marks.
            while (true)
            {
                double newOwnership = unitsChain.adjustedOwnership;
                double oldOwnership = unitsChain.ownership;
                double tokenCount = unitsChain.tokenCount;
                double diff = (sq(newOwnership / tokenCount - optTokenOwnership) - sq(oldOwnership / tokenCount - optTokenOwnership));
                UnitInfo prev = unitsChain.prevUsed;
                unitsChain.prevUsed = null;
                if (unitsChain != prev)
                    unitChange += diff;
                else
                {
                    unitChange += diff * newUnitMult;
                    break;
                }
                unitsChain = prev;
            }
            this.unitsChain = unitsChain;
            return unitChange;
        }
    }


    /**
     * Helper class for marking/unmarking visited a chain of groups
     */
    private abstract class GroupVisitor
    {
        GroupInfo groupChain = GroupInfo.TERMINATOR;
        int seen = 0;

        abstract GroupInfo prevSeen(GroupInfo group);
        abstract void setPrevSeen(GroupInfo group, GroupInfo prevSeen);

        // true iff this is the first time we've visited this group
        boolean add(GroupInfo group)
        {
            if (prevSeen(group) != null)
                return false;
            ++seen;
            setPrevSeen(group, groupChain);
            groupChain = group;
            return true;
        }

        boolean visitedAll()
        {
            return seen >= replicas;
        }

        boolean seen(GroupInfo group)
        {
            return prevSeen(group) != null;
        }

        // Clean group seen markers.
        void clean()
        {
            GroupInfo groupChain = this.groupChain;
            while (groupChain != GroupInfo.TERMINATOR)
            {
                GroupInfo prev = prevSeen(groupChain);
                setPrevSeen(groupChain, null);
                groupChain = prev;
            }
            this.groupChain = GroupInfo.TERMINATOR;
        }
    }

    private class ReplicationVisitor extends GroupVisitor
    {
        GroupInfo prevSeen(GroupInfo group)
        {
            return group.prevSeen;
        }

        void setPrevSeen(GroupInfo group, GroupInfo prevSeen)
        {
            group.prevSeen = prevSeen;
        }
    }

    private class PopulateVisitor extends GroupVisitor
    {
        GroupInfo prevSeen(GroupInfo group)
        {
            return group.prevPopulate;
        }

        void setPrevSeen(GroupInfo group, GroupInfo prevSeen)
        {
            group.prevPopulate = prevSeen;
        }
    }

    private double optimalTokenOwnership(int tokensToAdd)
    {
        return 1.0 * replicas / (sortedTokens.size() + tokensToAdd);
    }

    /**
     * Selects from {@code t1}, {@code t2} the token that forms a bigger range with {@code towards} as the upper bound,
     * taking into account wrapping.
     * Unlike Token.size(), equality is taken to mean "same as" rather than covering the whole range.
     */
    private static Token furtherStartToken(Token t1, Token t2, Token towards)
    {
        if (t1.equals(towards))
            return t2;
        if (t2.equals(towards))
            return t1;

        return t1.size(towards) > t2.size(towards) ? t1 : t2;
    }

    private static double sq(double d)
    {
        return d * d;
    }


    /**
     * For testing, remove the given unit preserving correct state of the allocator.
     */
    void removeUnit(Unit n)
    {
        Collection tokens = unitToTokens.removeAll(n);
        sortedTokens.keySet().removeAll(tokens);
    }

    public int unitCount()
    {
        return unitToTokens.asMap().size();
    }

    public String toString()
    {
        return getClass().getSimpleName();
    }

    /**
     * TokenInfo about candidate new tokens/vnodes.
     */
    private static class CandidateInfo extends BaseTokenInfo>
    {
        // directly preceding token in the current token ring
        final TokenInfo split;

        public CandidateInfo(Token token, TokenInfo split, UnitInfo owningUnit)
        {
            super(token, owningUnit);
            this.split = split;
        }

        TokenInfo prevInRing()
        {
            return split.prev;
        }
    }

    static void dumpTokens(String lead, BaseTokenInfo tokens)
    {
        BaseTokenInfo token = tokens;
        do
        {
            System.out.format("%s%s: rs %s rt %s size %.2e%n", lead, token, token.replicationStart, token.replicationThreshold, token.replicatedOwnership);
            token = token.next;
        } while (token != null && token != tokens);
    }
}