Please wait. This can take some minutes ...
Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance.
Project price: only $1
You can buy this project and download/modify it as often as you want.
org.apache.cassandra.dht.tokenallocator.ReplicationAwareTokenAllocator Maven / Gradle / Ivy
Go to download
A fork of the Apache Cassandra Project ready to embed Elasticsearch.
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.cassandra.dht.tokenallocator;
import java.util.*;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimap;
import org.apache.cassandra.dht.IPartitioner;
import org.apache.cassandra.dht.Token;
/**
* A Replication Aware allocator for tokens, that attempts to ensure an even distribution of ownership across
* the known cluster for the provided replication strategy.
*
* A unit is shorthand for a "unit of ownership" which translates roughly to a node, or a disk on the node,
* a CPU on the node, or some other relevant unit of ownership. These units should be the lowest rung over which
* ownership needs to be evenly distributed. At the moment only nodes as a whole are treated as units, but that
* will change with the introduction of token ranges per disk.
*/
class ReplicationAwareTokenAllocator extends TokenAllocatorBase
{
final Multimap unitToTokens;
final int replicas;
ReplicationAwareTokenAllocator(NavigableMap sortedTokens, ReplicationStrategy strategy, IPartitioner partitioner)
{
super(sortedTokens, strategy, partitioner);
unitToTokens = HashMultimap.create();
for (Map.Entry en : sortedTokens.entrySet())
unitToTokens.put(en.getValue(), en.getKey());
this.replicas = strategy.replicas();
}
public int getReplicas()
{
return replicas;
}
public Collection addUnit(Unit newUnit, int numTokens)
{
assert !unitToTokens.containsKey(newUnit);
if (unitCount() < replicas)
// Allocation does not matter; everything replicates everywhere.
return generateRandomTokens(newUnit, numTokens);
if (numTokens > sortedTokens.size())
// Some of the heuristics below can't deal with this case. Use random for now, later allocations can fix any problems this may cause.
return generateRandomTokens(newUnit, numTokens);
// ============= construct our initial token ring state =============
double optTokenOwnership = optimalTokenOwnership(numTokens);
Map groups = Maps.newHashMap();
Map> unitInfos = createUnitInfos(groups);
if (groups.size() < replicas)
{
// We need at least replicas groups to do allocation correctly. If there aren't enough,
// use random allocation.
// This part of the code should only be reached via the RATATest. StrategyAdapter should disallow
// token allocation in this case as the algorithm is not able to cover the behavior of NetworkTopologyStrategy.
return generateRandomTokens(newUnit, numTokens);
}
// initialise our new unit's state (with an idealised ownership)
// strategy must already know about this unit
UnitInfo newUnitInfo = new UnitInfo<>(newUnit, numTokens * optTokenOwnership, groups, strategy);
// build the current token ring state
TokenInfo tokens = createTokenInfos(unitInfos, newUnitInfo.group);
newUnitInfo.tokenCount = numTokens;
// ============= construct and rank our candidate token allocations =============
// walk the token ring, constructing the set of candidates in ring order
// as the midpoints between all existing tokens
CandidateInfo candidates = createCandidates(tokens, newUnitInfo, optTokenOwnership);
// Evaluate the expected improvements from all candidates and form a priority queue.
PriorityQueue>> improvements = new PriorityQueue<>(sortedTokens.size());
CandidateInfo candidate = candidates;
do
{
double impr = evaluateImprovement(candidate, optTokenOwnership, 1.0 / numTokens);
improvements.add(new Weighted<>(impr, candidate));
candidate = candidate.next;
} while (candidate != candidates);
// ============= iteratively take the best candidate, and re-rank =============
CandidateInfo bestToken = improvements.remove().value;
for (int vn = 1; ; ++vn)
{
candidates = bestToken.removeFrom(candidates);
confirmCandidate(bestToken);
if (vn == numTokens)
break;
while (true)
{
// Get the next candidate in the queue. Its improvement may have changed (esp. if multiple tokens
// were good suggestions because they could improve the same problem)-- evaluate it again to check
// if it is still a good candidate.
bestToken = improvements.remove().value;
double impr = evaluateImprovement(bestToken, optTokenOwnership, (vn + 1.0) / numTokens);
Weighted> next = improvements.peek();
// If it is better than the next in the queue, it is good enough. This is a heuristic that doesn't
// get the best results, but works well enough and on average cuts search time by a factor of O(vnodes).
if (next == null || impr >= next.weight)
break;
improvements.add(new Weighted<>(impr, bestToken));
}
}
return ImmutableList.copyOf(unitToTokens.get(newUnit));
}
private Collection generateRandomTokens(Unit newUnit, int numTokens)
{
Set tokens = new HashSet<>(numTokens);
while (tokens.size() < numTokens)
{
Token token = partitioner.getRandomToken();
if (!sortedTokens.containsKey(token))
{
tokens.add(token);
sortedTokens.put(token, newUnit);
unitToTokens.put(newUnit, token);
}
}
return tokens;
}
/**
* Construct the token ring as a CircularList of TokenInfo,
* and populate the ownership of the UnitInfo's provided
*/
private TokenInfo createTokenInfos(Map> units, GroupInfo newUnitGroup)
{
// build the circular list
TokenInfo prev = null;
TokenInfo first = null;
for (Map.Entry en : sortedTokens.entrySet())
{
Token t = en.getKey();
UnitInfo ni = units.get(en.getValue());
TokenInfo ti = new TokenInfo<>(t, ni);
first = ti.insertAfter(first, prev);
prev = ti;
}
TokenInfo curr = first;
do
{
populateTokenInfoAndAdjustUnit(curr, newUnitGroup);
curr = curr.next;
} while (curr != first);
return first;
}
private CandidateInfo createCandidates(TokenInfo tokens, UnitInfo newUnitInfo, double initialTokenOwnership)
{
TokenInfo curr = tokens;
CandidateInfo first = null;
CandidateInfo prev = null;
do
{
CandidateInfo candidate = new CandidateInfo(partitioner.midpoint(curr.prev.token, curr.token), curr, newUnitInfo);
first = candidate.insertAfter(first, prev);
candidate.replicatedOwnership = initialTokenOwnership;
populateCandidate(candidate);
prev = candidate;
curr = curr.next;
} while (curr != tokens);
prev.next = first;
return first;
}
private void populateCandidate(CandidateInfo candidate)
{
// Only finding replication start would do.
populateTokenInfo(candidate, candidate.owningUnit.group);
}
/**
* Incorporates the selected candidate into the ring, adjusting ownership information and calculated token
* information.
*/
private void confirmCandidate(CandidateInfo candidate)
{
// This process is less efficient than it could be (loops through each vnode's replication span instead
// of recalculating replicationStart, replicationThreshold from existing data + new token data in an O(1)
// case analysis similar to evaluateImprovement). This is fine as the method does not dominate processing
// time.
// Put the accepted candidate in the token list.
UnitInfo newUnit = candidate.owningUnit;
Token newToken = candidate.token;
sortedTokens.put(newToken, newUnit.unit);
unitToTokens.put(newUnit.unit, newToken);
TokenInfo prev = candidate.prevInRing();
TokenInfo newTokenInfo = new TokenInfo<>(newToken, newUnit);
newTokenInfo.replicatedOwnership = candidate.replicatedOwnership;
newTokenInfo.insertAfter(prev, prev); // List is not empty so this won't need to change head of list.
// Update data for candidate.
populateTokenInfoAndAdjustUnit(newTokenInfo, newUnit.group);
ReplicationVisitor replicationVisitor = new ReplicationVisitor();
assert newTokenInfo.next == candidate.split;
for (TokenInfo curr = newTokenInfo.next; !replicationVisitor.visitedAll(); curr = curr.next)
{
// update the candidate between curr and next
candidate = candidate.next;
populateCandidate(candidate);
if (!replicationVisitor.add(curr.owningUnit.group))
continue; // If we've already seen this group, the token cannot be affected.
populateTokenInfoAndAdjustUnit(curr, newUnit.group);
}
replicationVisitor.clean();
}
/**
* Calculates the {@code replicationStart} of a token, as well as {@code replicationThreshold} which is chosen in a way
* that permits {@code findUpdatedReplicationStart} to quickly identify changes in ownership.
*/
private Token populateTokenInfo(BaseTokenInfo token, GroupInfo newUnitGroup)
{
GroupInfo tokenGroup = token.owningUnit.group;
PopulateVisitor visitor = new PopulateVisitor();
// Replication start = the end of a token from the RF'th different group seen before the token.
Token replicationStart;
// The end of a token from the RF-1'th different group seen before the token.
Token replicationThreshold = token.token;
GroupInfo currGroup;
for (TokenInfo curr = token.prevInRing(); ; curr = curr.prev)
{
replicationStart = curr.token;
currGroup = curr.owningUnit.group;
if (!visitor.add(currGroup))
continue; // Group is already seen.
if (visitor.visitedAll())
break;
replicationThreshold = replicationStart;
// Another instance of the same group precedes us in the replication range of the ring,
// so this is where our replication range begins
if (currGroup == tokenGroup)
break;
}
if (newUnitGroup == tokenGroup)
// new token is always a boundary (as long as it's closer than replicationStart)
replicationThreshold = token.token;
else if (newUnitGroup != currGroup && visitor.seen(newUnitGroup))
// already has new group in replication span before last seen. cannot be affected
replicationThreshold = replicationStart;
visitor.clean();
token.replicationThreshold = replicationThreshold;
token.replicationStart = replicationStart;
return replicationStart;
}
private void populateTokenInfoAndAdjustUnit(TokenInfo populate, GroupInfo newUnitGroup)
{
Token replicationStart = populateTokenInfo(populate, newUnitGroup);
double newOwnership = replicationStart.size(populate.token);
double oldOwnership = populate.replicatedOwnership;
populate.replicatedOwnership = newOwnership;
populate.owningUnit.ownership += newOwnership - oldOwnership;
}
/**
* Evaluates the improvement in variance for both units and individual tokens when candidate is inserted into the
* ring.
*/
private double evaluateImprovement(CandidateInfo candidate, double optTokenOwnership, double newUnitMult)
{
double tokenChange = 0;
UnitInfo candidateUnit = candidate.owningUnit;
Token candidateEnd = candidate.token;
// Form a chain of units affected by the insertion to be able to qualify change of unit ownership.
// A unit may be affected more than once.
UnitAdjustmentTracker unitTracker = new UnitAdjustmentTracker<>(candidateUnit);
// Reflect change in ownership of the splitting token (candidate).
tokenChange += applyOwnershipAdjustment(candidate, candidateUnit, candidate.replicationStart, candidateEnd, optTokenOwnership, unitTracker);
// Loop through all vnodes that replicate candidate or split and update their ownership.
ReplicationVisitor replicationVisitor = new ReplicationVisitor();
for (TokenInfo curr = candidate.split; !replicationVisitor.visitedAll(); curr = curr.next)
{
UnitInfo currUnit = curr.owningUnit;
if (!replicationVisitor.add(currUnit.group))
continue; // If this group is already seen, the token cannot be affected.
Token replicationEnd = curr.token;
Token replicationStart = findUpdatedReplicationStart(curr, candidate);
tokenChange += applyOwnershipAdjustment(curr, currUnit, replicationStart, replicationEnd, optTokenOwnership, unitTracker);
}
replicationVisitor.clean();
double nodeChange = unitTracker.calculateUnitChange(newUnitMult, optTokenOwnership);
return -(tokenChange + nodeChange);
}
/**
* Returns the start of the replication span for the token {@code curr} when {@code candidate} is inserted into the
* ring.
*/
private Token findUpdatedReplicationStart(TokenInfo curr, CandidateInfo candidate)
{
return furtherStartToken(curr.replicationThreshold, candidate.token, curr.token);
}
/**
* Applies the ownership adjustment for the given element, updating tracked unit ownership and returning the change
* of variance.
*/
private double applyOwnershipAdjustment(BaseTokenInfo curr, UnitInfo currUnit,
Token replicationStart, Token replicationEnd,
double optTokenOwnership, UnitAdjustmentTracker unitTracker)
{
double oldOwnership = curr.replicatedOwnership;
double newOwnership = replicationStart.size(replicationEnd);
double tokenCount = currUnit.tokenCount;
assert tokenCount > 0;
unitTracker.add(currUnit, newOwnership - oldOwnership);
return (sq(newOwnership - optTokenOwnership) - sq(oldOwnership - optTokenOwnership)) / sq(tokenCount);
}
/**
* Tracker for unit ownership changes. The changes are tracked by a chain of UnitInfos where the adjustedOwnership
* field is being updated as we see changes in token ownership.
*
* The chain ends with an element that points to itself; this element must be specified as argument to the
* constructor as well as be the first unit with which 'add' is called; when calculating the variance change
* a separate multiplier is applied to it (used to permit more freedom in choosing the first tokens of a unit).
*/
private static class UnitAdjustmentTracker
{
UnitInfo unitsChain;
UnitAdjustmentTracker(UnitInfo newUnit)
{
unitsChain = newUnit;
}
void add(UnitInfo currUnit, double diff)
{
if (currUnit.prevUsed == null)
{
assert unitsChain.prevUsed != null || currUnit == unitsChain;
currUnit.adjustedOwnership = currUnit.ownership + diff;
currUnit.prevUsed = unitsChain;
unitsChain = currUnit;
}
else
{
currUnit.adjustedOwnership += diff;
}
}
double calculateUnitChange(double newUnitMult, double optTokenOwnership)
{
double unitChange = 0;
UnitInfo unitsChain = this.unitsChain;
// Now loop through the units chain and add the unit-level changes. Also clear the groups' seen marks.
while (true)
{
double newOwnership = unitsChain.adjustedOwnership;
double oldOwnership = unitsChain.ownership;
double tokenCount = unitsChain.tokenCount;
double diff = (sq(newOwnership / tokenCount - optTokenOwnership) - sq(oldOwnership / tokenCount - optTokenOwnership));
UnitInfo prev = unitsChain.prevUsed;
unitsChain.prevUsed = null;
if (unitsChain != prev)
unitChange += diff;
else
{
unitChange += diff * newUnitMult;
break;
}
unitsChain = prev;
}
this.unitsChain = unitsChain;
return unitChange;
}
}
/**
* Helper class for marking/unmarking visited a chain of groups
*/
private abstract class GroupVisitor
{
GroupInfo groupChain = GroupInfo.TERMINATOR;
int seen = 0;
abstract GroupInfo prevSeen(GroupInfo group);
abstract void setPrevSeen(GroupInfo group, GroupInfo prevSeen);
// true iff this is the first time we've visited this group
boolean add(GroupInfo group)
{
if (prevSeen(group) != null)
return false;
++seen;
setPrevSeen(group, groupChain);
groupChain = group;
return true;
}
boolean visitedAll()
{
return seen >= replicas;
}
boolean seen(GroupInfo group)
{
return prevSeen(group) != null;
}
// Clean group seen markers.
void clean()
{
GroupInfo groupChain = this.groupChain;
while (groupChain != GroupInfo.TERMINATOR)
{
GroupInfo prev = prevSeen(groupChain);
setPrevSeen(groupChain, null);
groupChain = prev;
}
this.groupChain = GroupInfo.TERMINATOR;
}
}
private class ReplicationVisitor extends GroupVisitor
{
GroupInfo prevSeen(GroupInfo group)
{
return group.prevSeen;
}
void setPrevSeen(GroupInfo group, GroupInfo prevSeen)
{
group.prevSeen = prevSeen;
}
}
private class PopulateVisitor extends GroupVisitor
{
GroupInfo prevSeen(GroupInfo group)
{
return group.prevPopulate;
}
void setPrevSeen(GroupInfo group, GroupInfo prevSeen)
{
group.prevPopulate = prevSeen;
}
}
private double optimalTokenOwnership(int tokensToAdd)
{
return 1.0 * replicas / (sortedTokens.size() + tokensToAdd);
}
/**
* Selects from {@code t1}, {@code t2} the token that forms a bigger range with {@code towards} as the upper bound,
* taking into account wrapping.
* Unlike Token.size(), equality is taken to mean "same as" rather than covering the whole range.
*/
private static Token furtherStartToken(Token t1, Token t2, Token towards)
{
if (t1.equals(towards))
return t2;
if (t2.equals(towards))
return t1;
return t1.size(towards) > t2.size(towards) ? t1 : t2;
}
private static double sq(double d)
{
return d * d;
}
/**
* For testing, remove the given unit preserving correct state of the allocator.
*/
void removeUnit(Unit n)
{
Collection tokens = unitToTokens.removeAll(n);
sortedTokens.keySet().removeAll(tokens);
}
public int unitCount()
{
return unitToTokens.asMap().size();
}
public String toString()
{
return getClass().getSimpleName();
}
/**
* TokenInfo about candidate new tokens/vnodes.
*/
private static class CandidateInfo extends BaseTokenInfo>
{
// directly preceding token in the current token ring
final TokenInfo split;
public CandidateInfo(Token token, TokenInfo split, UnitInfo owningUnit)
{
super(token, owningUnit);
this.split = split;
}
TokenInfo prevInRing()
{
return split.prev;
}
}
static void dumpTokens(String lead, BaseTokenInfo, ?> tokens)
{
BaseTokenInfo, ?> token = tokens;
do
{
System.out.format("%s%s: rs %s rt %s size %.2e%n", lead, token, token.replicationStart, token.replicationThreshold, token.replicatedOwnership);
token = token.next;
} while (token != null && token != tokens);
}
}