// com.hazelcast.internal.partition.impl.PartitionStateGeneratorImpl Maven / Gradle / Ivy
/*
* Copyright (c) 2008-2023, Hazelcast, Inc. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.hazelcast.internal.partition.impl;
import com.hazelcast.cluster.Member;
import com.hazelcast.internal.partition.InternalPartition;
import com.hazelcast.internal.partition.PartitionReplica;
import com.hazelcast.internal.partition.PartitionStateGenerator;
import com.hazelcast.logging.ILogger;
import com.hazelcast.logging.Logger;
import com.hazelcast.spi.partitiongroup.MemberGroup;
import com.hazelcast.internal.partition.membergroup.SingleMemberGroup;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.Set;
final class PartitionStateGeneratorImpl implements PartitionStateGenerator {
// Logger shared by the generator and its nested node-group classes.
// NOTE(review): registered under the PartitionStateGenerator interface rather than this class — confirm intended.
private static final ILogger LOGGER = Logger.getLogger(PartitionStateGenerator.class);
// Factor applied to the retry budgets of the partition transfer and round-robin distribution loops.
private static final int DEFAULT_RETRY_MULTIPLIER = 10;
// Tolerated relative deviation of a group's partition count from the average in areGroupsBalanced().
private static final float RANGE_CHECK_RATIO = 1.1f;
// Upper bound on the number of arrangement passes performed by arrange().
private static final int MAX_RETRY_COUNT = 3;
// Passes with a zero-based number at or above this value run in "aggressive" mode.
private static final int AGGRESSIVE_RETRY_THRESHOLD = 1;
// In aggressive mode, replicas at this index and above are cleared so they get re-assigned.
private static final int AGGRESSIVE_INDEX_THRESHOLD = 3;
// Absolute deviation from the average that areGroupsBalanced() always accepts.
private static final int MIN_AVG_OWNER_DIFF = 3;
/**
 * Arranges all partitions by delegating to the three-argument overload with a
 * {@code null} partition filter (that overload treats {@code null} as "no restriction").
 *
 * @param memberGroups member groups to distribute replicas over
 * @param currentState current partition table; its length is used as the partition count
 * @return whatever the three-argument overload returns for an unrestricted arrangement
 */
@Override
public PartitionReplica[][] arrange(Collection memberGroups, InternalPartition[] currentState) {
    // no filter: every partition takes part in the arrangement
    final Collection everyPartition = null;
    return arrange(memberGroups, currentState, everyPartition);
}
/**
 * Computes a new replica table for the given member groups.
 * <p>
 * The current state is copied into a fresh table, then up to {@code MAX_RETRY_COUNT}
 * arrangement passes are run; passes stop early as soon as the groups are balanced.
 * Passes from {@code AGGRESSIVE_RETRY_THRESHOLD} onwards run in aggressive mode.
 *
 * @param memberGroups member groups to distribute replicas over
 * @param currentState current partition table; its length is the partition count
 * @param partitions   ids of the partitions to arrange, or {@code null} to arrange all of them
 * @return the arranged replica table, or {@code null} when no non-empty member group is available
 */
@Override
public PartitionReplica[][] arrange(Collection memberGroups, InternalPartition[] currentState,
        Collection partitions) {
    final Queue groups = createNodeGroups(memberGroups);
    if (groups.isEmpty()) {
        return null;
    }

    final int partitionCount = currentState.length;
    final PartitionReplica[][] state = new PartitionReplica[partitionCount][InternalPartition.MAX_REPLICA_COUNT];
    initialize(currentState, state, partitions);

    int attempt = 0;
    do {
        tryArrange(state, groups, partitionCount, attempt >= AGGRESSIVE_RETRY_THRESHOLD, partitions);
        attempt++;
        // the first pass is not a retry, so only the second and later passes are logged
        if (attempt > 1 && LOGGER.isFineEnabled()) {
            LOGGER.fine("Re-trying partition arrangement. Count: " + attempt);
        }
    } while (attempt < MAX_RETRY_COUNT && !areGroupsBalanced(groups, partitionCount));

    return state;
}
/**
 * Copies the replicas of {@code currentState} into {@code state} and, for every
 * non-empty partition that takes part in the arrangement, fills {@code null} gaps
 * by moving the replica with the highest occupied index into the gap.
 *
 * @param currentState source partition table
 * @param state        destination table, one row per partition
 * @param partitions   ids of the partitions to arrange, or {@code null} for all;
 *                     partitions outside this set are copied but not compacted
 */
private void initialize(InternalPartition[] currentState, PartitionReplica[][] state,
        Collection<Integer> partitions) {
    int partitionCount = currentState.length;
    for (int partitionId = 0; partitionId < partitionCount; partitionId++) {
        InternalPartition partition = currentState[partitionId];
        PartitionReplica[] replicas = state[partitionId];

        boolean empty = true;
        for (int index = 0; index < InternalPartition.MAX_REPLICA_COUNT; index++) {
            replicas[index] = partition.getReplica(index);
            empty &= replicas[index] == null;
        }

        // nothing to compact for empty partitions or for partitions excluded from this arrangement
        if (empty || (partitions != null && !partitions.contains(partitionId))) {
            continue;
        }

        // auto shift-up colder replicas to hotter replicas to fill the empty gaps
        int maxReplicaIndex = InternalPartition.MAX_REPLICA_COUNT - 1;
        for (int index = 0; index < InternalPartition.MAX_REPLICA_COUNT; index++) {
            if (replicas[index] != null) {
                continue;
            }
            // search from the coldest slot downwards for a replica to pull into the gap
            for (int k = maxReplicaIndex; k > index; k--) {
                if (replicas[k] != null) {
                    replicas[index] = replicas[k];
                    replicas[k] = null;
                    // everything from k upwards is empty now, so later searches can start below it
                    maxReplicaIndex = k - 1;
                    break;
                }
            }
        }
    }
}
/**
 * Runs a single arrangement pass over {@code state}.
 * <p>
 * The group registries are rebuilt from {@code state} first. Then, for each replica
 * index, unowned partitions are handed to under-loaded groups, any leftovers are spread
 * round-robin over all groups, partitions are moved from over-loaded to under-loaded
 * groups (only when all partitions are being arranged), and finally the resulting
 * ownerships are written back into {@code state}.
 *
 * @param state                  replica table being arranged (modified in place)
 * @param groups                 node groups; must not be empty
 * @param partitionCount         total number of partitions
 * @param aggressive             whether the registries are initialized in aggressive mode
 * @param toBeArrangedPartitions ids of the partitions to arrange, or {@code null} for all
 */
private void tryArrange(PartitionReplica[][] state, Queue<NodeGroup> groups, int partitionCount,
        boolean aggressive, Collection<Integer> toBeArrangedPartitions) {
    int groupSize = groups.size();
    int replicaCount = Math.min(groupSize, InternalPartition.MAX_REPLICA_COUNT);
    int avgPartitionPerGroup = partitionCount / groupSize;

    // clear unused replica owners
    // initialize partition registry for each group
    initializeGroupPartitions(state, groups, replicaCount, aggressive, toBeArrangedPartitions);

    for (int index = 0; index < replicaCount; index++) {
        // partitions those are not bound to any node/group
        Queue<Integer> freePartitions = getUnownedPartitions(state, index);

        // retain only to-be-arranged partitions
        if (toBeArrangedPartitions != null) {
            freePartitions.retainAll(toBeArrangedPartitions);
        }

        // groups having partitions under average
        Queue<NodeGroup> underLoadedGroups = new LinkedList<>();
        // groups having partitions over average
        List<NodeGroup> overLoadedGroups = new LinkedList<>();
        // number of groups should have (average + 1) partitions
        int plusOneGroupCount = partitionCount - avgPartitionPerGroup * groupSize;

        // determine under-loaded and over-loaded groups
        // (groups holding exactly the average are put in neither collection)
        for (NodeGroup nodeGroup : groups) {
            int size = nodeGroup.getPartitionCount(index);
            if (size < avgPartitionPerGroup) {
                underLoadedGroups.add(nodeGroup);
            } else if (size > avgPartitionPerGroup) {
                overLoadedGroups.add(nodeGroup);
            }
        }

        // distribute free partitions among under-loaded groups
        plusOneGroupCount = tryToDistributeUnownedPartitions(underLoadedGroups, freePartitions,
                avgPartitionPerGroup, index, plusOneGroupCount);

        if (!freePartitions.isEmpty()) {
            // if there are still free partitions those could not be distributed
            // to under-loaded groups then one-by-one distribute them among all groups
            // until queue is empty.
            distributeUnownedPartitions(groups, freePartitions, index);
        }
        assert freePartitions.isEmpty() : "There are partitions not-owned yet: " + freePartitions;

        if (toBeArrangedPartitions == null) {
            // iterate through over-loaded groups' partitions and distribute them to under-loaded groups.
            transferPartitionsBetweenGroups(underLoadedGroups, overLoadedGroups, index, avgPartitionPerGroup,
                    plusOneGroupCount);
        }

        // post process each group's partition table (distribute partitions added to group to nodes
        // and balance load of partition ownerships in group) and save partition ownerships to
        // cluster partition state table.
        updatePartitionState(state, groups, index);
    }
}
/**
 * Moves partitions of replica {@code index} from over-loaded groups to under-loaded ones.
 * <p>
 * Each under-loaded group is polled in turn and fed from the over-loaded groups until it
 * reaches the currently expected count. The expected count is {@code avg + 1} while
 * {@code plusOneGroupCount} groups may still hold one extra partition, and {@code avg}
 * afterwards. A group that is still below the average is queued again. The loop is bounded
 * by {@code |under| * |over| * DEFAULT_RETRY_MULTIPLIER} iterations.
 *
 * @param underLoadedGroups    groups below the average (consumed and re-filled by this method)
 * @param overLoadedGroups     groups above the average; groups that drop to the expected
 *                             count are removed from this collection
 * @param index                replica index being balanced
 * @param avgPartitionPerGroup average number of partitions per group (rounded down)
 * @param plusOneGroupCount    number of groups still allowed to hold {@code avg + 1} partitions
 */
@SuppressWarnings({"checkstyle:cyclomaticcomplexity", "checkstyle:npathcomplexity"})
private void transferPartitionsBetweenGroups(Queue<NodeGroup> underLoadedGroups,
        Collection<NodeGroup> overLoadedGroups, int index, int avgPartitionPerGroup, int plusOneGroupCount) {
    int maxPartitionPerGroup = avgPartitionPerGroup + 1;
    int maxTries = underLoadedGroups.size() * overLoadedGroups.size() * DEFAULT_RETRY_MULTIPLIER;
    int tries = 0;
    int expectedPartitionCount = plusOneGroupCount > 0 ? maxPartitionPerGroup : avgPartitionPerGroup;

    while (tries++ < maxTries && !underLoadedGroups.isEmpty()) {
        NodeGroup toGroup = underLoadedGroups.poll();
        Iterator<NodeGroup> overLoadedGroupsIterator = overLoadedGroups.iterator();
        while (overLoadedGroupsIterator.hasNext()) {
            NodeGroup fromGroup = overLoadedGroupsIterator.next();
            selectToGroupPartitions(index, expectedPartitionCount, toGroup, fromGroup);

            // the donor settled on (avg + 1): it uses up one of the "plus one" slots
            int fromCount = fromGroup.getPartitionCount(index);
            if (plusOneGroupCount > 0 && fromCount == maxPartitionPerGroup) {
                if (--plusOneGroupCount == 0) {
                    expectedPartitionCount = avgPartitionPerGroup;
                }
            }
            if (fromCount <= expectedPartitionCount) {
                overLoadedGroupsIterator.remove();
            }

            // the receiver reached (avg + 1): it uses up one of the "plus one" slots as well
            int toCount = toGroup.getPartitionCount(index);
            if (plusOneGroupCount > 0 && toCount == maxPartitionPerGroup) {
                if (--plusOneGroupCount == 0) {
                    expectedPartitionCount = avgPartitionPerGroup;
                }
            }
            if (toCount >= expectedPartitionCount) {
                break;
            }
        }
        // still below the average: give the group another chance in a later round
        if (toGroup.getPartitionCount(index) < avgPartitionPerGroup) {
            underLoadedGroups.offer(toGroup);
        }
    }
}
/**
 * Moves partitions of replica {@code index} from {@code fromGroup} to {@code toGroup}
 * for as long as the donor holds more than {@code expectedPartitionCount} and the receiver
 * holds fewer. A partition the receiver rejects stays with the donor.
 *
 * @param index                  replica index being balanced
 * @param expectedPartitionCount target partition count per group
 * @param toGroup                receiving group
 * @param fromGroup              donating group
 */
private void selectToGroupPartitions(int index, int expectedPartitionCount, NodeGroup toGroup, NodeGroup fromGroup) {
    Iterator<Integer> partitionsIterator = fromGroup.getPartitionsIterator(index);
    while (partitionsIterator.hasNext()
            && fromGroup.getPartitionCount(index) > expectedPartitionCount
            && toGroup.getPartitionCount(index) < expectedPartitionCount) {
        Integer partitionId = partitionsIterator.next();
        // only take the partition away from the donor once the receiver has accepted it
        if (toGroup.addPartition(index, partitionId)) {
            partitionsIterator.remove();
        }
    }
}
/**
 * Lets every group finalize its partition table for replica {@code index} and writes
 * the resulting per-replica ownerships into {@code state}.
 *
 * @param state  replica table to update
 * @param groups node groups whose tables are written out
 * @param index  replica index being written
 */
private void updatePartitionState(PartitionReplica[][] state, Collection<NodeGroup> groups, int index) {
    for (NodeGroup group : groups) {
        group.postProcessPartitionTable(index);
        Set<PartitionReplica> groupReplicas = group.getReplicas();
        for (PartitionReplica replica : groupReplicas) {
            PartitionTable table = group.getPartitionTable(replica);
            Set<Integer> partitionIds = table.getPartitions(index);
            for (Integer partitionId : partitionIds) {
                state[partitionId][index] = replica;
            }
        }
    }
}
/**
 * Hands the remaining free partitions to the groups one by one, rotating through
 * {@code groups} so that each attempt goes to the next group in the queue.
 * <p>
 * The number of attempts is bounded by
 * {@code freePartitions.size() * groups.size() * DEFAULT_RETRY_MULTIPLIER}. If that
 * budget is exhausted, the partition currently being placed is put back into
 * {@code freePartitions} so the caller can still see that it is unowned.
 *
 * @param groups         all node groups; must not be empty (rotated in place)
 * @param freePartitions partitions without an owner for this replica index (drained)
 * @param index          replica index being assigned
 */
private void distributeUnownedPartitions(Queue<NodeGroup> groups, Queue<Integer> freePartitions, int index) {
    int groupSize = groups.size();
    int maxTries = freePartitions.size() * groupSize * DEFAULT_RETRY_MULTIPLIER;
    int tries = 0;
    Integer partitionId = freePartitions.poll();
    while (partitionId != null && tries++ < maxTries) {
        NodeGroup group = groups.poll();
        if (group.addPartition(index, partitionId)) {
            partitionId = freePartitions.poll();
        }
        // rotate: the group goes to the back of the queue whether or not it accepted the partition
        groups.offer(group);
    }
    if (partitionId != null) {
        // retry budget exhausted: do not silently lose the partition that was being placed
        freePartitions.offer(partitionId);
    }
}
/**
 * Distributes free partitions among the under-loaded groups, one partition per group per round.
 * <p>
 * A group goes back into the queue while it is below {@code avg + 1} and "plus one" slots
 * remain, or while it is below {@code avg}. Once the last "plus one" slot is taken, every
 * queued group that already holds at least {@code avg} partitions is dropped.
 *
 * @param underLoadedGroups    groups below the average (modified in place)
 * @param freePartitions       partitions without an owner for this replica index (modified in place)
 * @param avgPartitionPerGroup average number of partitions per group (rounded down)
 * @param index                replica index being assigned
 * @param plusOneGroupCount    number of groups still allowed to hold {@code avg + 1} partitions
 * @return the remaining number of "plus one" slots
 */
private int tryToDistributeUnownedPartitions(Queue<NodeGroup> underLoadedGroups, Queue<Integer> freePartitions,
        int avgPartitionPerGroup, int index, int plusOneGroupCount) {
    // distribute free partitions among under-loaded groups
    int maxPartitionPerGroup = avgPartitionPerGroup + 1;
    int maxTries = freePartitions.size() * underLoadedGroups.size();
    int tries = 0;
    while (tries++ < maxTries && !freePartitions.isEmpty() && !underLoadedGroups.isEmpty()) {
        NodeGroup group = underLoadedGroups.poll();
        assignFreePartitionsToNodeGroup(freePartitions, index, group);
        int count = group.getPartitionCount(index);
        if (plusOneGroupCount > 0 && count == maxPartitionPerGroup) {
            if (--plusOneGroupCount == 0) {
                // all (avg + 1) partitions owned groups are found
                // if there is any group has avg number of partitions in under-loaded queue
                // remove it.
                underLoadedGroups.removeIf(nodeGroup -> nodeGroup.getPartitionCount(index) >= avgPartitionPerGroup);
            }
        } else if ((plusOneGroupCount > 0 && count < maxPartitionPerGroup)
                || (count < avgPartitionPerGroup)) {
            underLoadedGroups.offer(group);
        }
    }
    return plusOneGroupCount;
}
/**
 * Gives {@code group} the first free partition it accepts. Each partition currently in
 * the queue is offered at most once; rejected partitions are appended back to the queue.
 *
 * @param freePartitions partitions without an owner for this replica index
 * @param index          replica index being assigned
 * @param group          group that should receive one partition
 */
private void assignFreePartitionsToNodeGroup(Queue<Integer> freePartitions, int index, NodeGroup group) {
    // snapshot of the size: re-offered partitions must not be tried a second time
    int size = freePartitions.size();
    for (int i = 0; i < size; i++) {
        Integer partitionId = freePartitions.poll();
        if (group.addPartition(index, partitionId)) {
            break;
        }
        freePartitions.offer(partitionId);
    }
}
/**
 * Collects the ids of all partitions that have no owner at {@code replicaIndex}.
 *
 * @param state        replica table to inspect
 * @param replicaIndex replica index to check
 * @return the unowned partition ids in shuffled order
 */
private Queue<Integer> getUnownedPartitions(PartitionReplica[][] state, int replicaIndex) {
    LinkedList<Integer> freePartitions = new LinkedList<>();
    // if owner of a partition can not be found then add partition to free partitions queue.
    for (int partitionId = 0; partitionId < state.length; partitionId++) {
        if (state[partitionId][replicaIndex] == null) {
            freePartitions.add(partitionId);
        }
    }
    Collections.shuffle(freePartitions);
    return freePartitions;
}
/**
 * Resets every group's registry and rebuilds it from {@code state}.
 * <p>
 * Replica slots at or beyond {@code replicaCount} are cleared. A remaining owner is kept
 * only if its group accepts the ownership; otherwise the slot is cleared. In aggressive
 * mode, for partitions that take part in the arrangement, a valid owner below
 * {@code AGGRESSIVE_INDEX_THRESHOLD} also clears all slots from that threshold up to
 * {@code replicaCount}, so those replicas get re-assigned.
 *
 * @param state                  replica table (modified in place)
 * @param groups                 node groups to reset and repopulate
 * @param replicaCount           number of replica indexes in use
 * @param aggressive             whether to clear the higher replica indexes as described above
 * @param toBeArrangedPartitions ids of the partitions to arrange, or {@code null} for all
 */
private void initializeGroupPartitions(PartitionReplica[][] state, Queue<NodeGroup> groups, int replicaCount,
        boolean aggressive, Collection<Integer> toBeArrangedPartitions) {
    // reset partition before reuse
    for (NodeGroup nodeGroup : groups) {
        nodeGroup.resetPartitions();
    }

    for (int partitionId = 0; partitionId < state.length; partitionId++) {
        PartitionReplica[] replicas = state[partitionId];
        for (int replicaIndex = 0; replicaIndex < InternalPartition.MAX_REPLICA_COUNT; replicaIndex++) {
            if (replicaIndex >= replicaCount) {
                replicas[replicaIndex] = null;
                continue;
            }

            PartitionReplica owner = replicas[replicaIndex];
            boolean valid = false;
            if (owner != null) {
                valid = partitionOwnerAvailable(groups, partitionId, replicaIndex, owner);
            }

            if (!valid) {
                replicas[replicaIndex] = null;
            } else if (aggressive && replicaIndex < AGGRESSIVE_INDEX_THRESHOLD
                    && (toBeArrangedPartitions == null || toBeArrangedPartitions.contains(partitionId))) {
                // these slots have not been registered with any group yet (they are visited later),
                // so clearing them here simply turns them into free partitions
                for (int i = AGGRESSIVE_INDEX_THRESHOLD; i < replicaCount; i++) {
                    replicas[i] = null;
                }
            }
        }
    }
}
/**
 * Registers an existing ownership with the group that contains {@code owner}.
 *
 * @param groups       node groups to search
 * @param partitionId  partition whose ownership is being registered
 * @param replicaIndex replica index of the ownership
 * @param owner        current owner taken from the partition state
 * @return {@code true} if a group contains {@code owner} and accepted the ownership;
 *         {@code false} if no group contains it or its group rejected the ownership
 */
private boolean partitionOwnerAvailable(Queue<NodeGroup> groups, int partitionId, int replicaIndex,
        PartitionReplica owner) {
    for (NodeGroup nodeGroup : groups) {
        if (nodeGroup.hasNode(owner)) {
            // only the first group containing the owner is consulted
            return nodeGroup.ownPartition(owner, replicaIndex, partitionId);
        }
    }
    return false;
}
/**
 * Builds one {@link NodeGroup} per non-empty member group.
 * <p>
 * A {@code SingleMemberGroup}, or any group of size one, becomes a {@code SingleNodeGroup}
 * holding its first member; every other group becomes a {@code DefaultNodeGroup} holding
 * all of its members. Empty member groups are skipped.
 *
 * @param memberGroups member groups, may be {@code null} or empty
 * @return the node groups; empty when there is nothing to build from
 */
private Queue<NodeGroup> createNodeGroups(Collection<MemberGroup> memberGroups) {
    Queue<NodeGroup> nodeGroups = new LinkedList<>();
    if (memberGroups == null || memberGroups.isEmpty()) {
        return nodeGroups;
    }

    for (MemberGroup memberGroup : memberGroups) {
        if (memberGroup.size() == 0) {
            continue;
        }

        NodeGroup nodeGroup;
        Iterator<Member> members = memberGroup.iterator();
        if (memberGroup instanceof SingleMemberGroup || memberGroup.size() == 1) {
            nodeGroup = new SingleNodeGroup();
            nodeGroup.addNode(PartitionReplica.from(members.next()));
        } else {
            nodeGroup = new DefaultNodeGroup();
            while (members.hasNext()) {
                nodeGroup.addNode(PartitionReplica.from(members.next()));
            }
        }
        nodeGroups.add(nodeGroup);
    }
    return nodeGroups;
}
/**
 * Checks whether every group's partition count is close enough to the average for every
 * replica index in use.
 * <p>
 * A count is accepted when it differs from the average by at most
 * {@code MIN_AVG_OWNER_DIFF}, or when it lies within
 * {@code [avg / RANGE_CHECK_RATIO, avg * RANGE_CHECK_RATIO]}.
 *
 * @param groups         node groups to check; must not be empty
 * @param partitionCount total number of partitions
 * @return {@code true} if all groups are within tolerance
 */
private boolean areGroupsBalanced(Collection<NodeGroup> groups, int partitionCount) {
    float ratio = RANGE_CHECK_RATIO;
    int avgPartitionPerGroup = partitionCount / groups.size();
    int replicaCount = Math.min(groups.size(), InternalPartition.MAX_REPLICA_COUNT);

    for (NodeGroup group : groups) {
        for (int i = 0; i < replicaCount; i++) {
            int partitionCountOfGroup = group.getPartitionCount(i);
            // small absolute deviations are always fine, whatever the ratio says
            if (Math.abs(partitionCountOfGroup - avgPartitionPerGroup) <= MIN_AVG_OWNER_DIFF) {
                continue;
            }
            if ((partitionCountOfGroup < avgPartitionPerGroup / ratio)
                    || (partitionCountOfGroup > avgPartitionPerGroup * ratio)) {
                if (LOGGER.isFineEnabled()) {
                    LOGGER.fine("Not well balanced! Replica: " + i + ", PartitionCount: "
                            + partitionCountOfGroup + ", AvgPartitionCount: " + avgPartitionPerGroup);
                }
                return false;
            }
        }
    }
    return true;
}
// ----- INNER CLASSES -----
/**
 * A set of replicas (members) that is treated as one unit while partitions are distributed.
 */
private interface NodeGroup {

    /** Adds a replica (member) to this group. */
    void addNode(PartitionReplica replica);

    /** Returns whether the given replica belongs to this group. */
    boolean hasNode(PartitionReplica replica);

    /** Returns the replicas that belong to this group. */
    Set<PartitionReplica> getReplicas();

    /** Returns the partition table kept for the given replica of this group. */
    PartitionTable getPartitionTable(PartitionReplica replica);

    /** Clears all partition bookkeeping of this group so it can be reused. */
    void resetPartitions();

    /** Returns the number of partitions this group holds at the given replica index. */
    int getPartitionCount(int index);

    /**
     * Registers an already existing ownership of {@code partitionId} by {@code replica}.
     *
     * @return {@code true} if the ownership was registered
     */
    boolean ownPartition(PartitionReplica replica, int index, Integer partitionId);

    /**
     * Adds a partition to this group at the given replica index.
     *
     * @return {@code true} if the partition was added
     */
    boolean addPartition(int replicaIndex, Integer partitionId);

    /** Returns an iterator over the group's partitions at the given replica index. */
    Iterator<Integer> getPartitionsIterator(int index);

    /** Finalizes the group's per-replica partition tables for the given replica index. */
    void postProcessPartitionTable(int index);
}
/**
 * Node group made of several replicas (members).
 * <p>
 * Partitions are first accepted at group level ({@link #addPartition}) and queued in
 * {@code partitionQ}; {@link #postProcessPartitionTable} later spreads the queued
 * partitions over the members and evens out the members' loads.
 */
private static class DefaultNodeGroup implements NodeGroup {
    /** Partitions held by the group as a whole, per replica index. */
    final PartitionTable groupPartitionTable = new PartitionTable();
    /** Partitions held by each individual member, per replica index. */
    final Map<PartitionReplica, PartitionTable> nodePartitionTables = new HashMap<>();
    /** Partitions accepted by the group but not assigned to a member yet. */
    final LinkedList<Integer> partitionQ = new LinkedList<>();

    @Override
    public void addNode(PartitionReplica replica) {
        nodePartitionTables.put(replica, new PartitionTable());
    }

    @Override
    public boolean hasNode(PartitionReplica replica) {
        return nodePartitionTables.containsKey(replica);
    }

    @Override
    public Set<PartitionReplica> getReplicas() {
        return nodePartitionTables.keySet();
    }

    @Override
    public PartitionTable getPartitionTable(PartitionReplica replica) {
        return nodePartitionTables.get(replica);
    }

    @Override
    public void resetPartitions() {
        groupPartitionTable.reset();
        partitionQ.clear();
        for (PartitionTable table : nodePartitionTables.values()) {
            table.reset();
        }
    }

    @Override
    public int getPartitionCount(int index) {
        return groupPartitionTable.size(index);
    }

    /** Returns whether the group holds the partition at any replica index. */
    private boolean containsPartition(Integer partitionId) {
        return groupPartitionTable.contains(partitionId);
    }

    /**
     * Registers an existing ownership. It is rejected when the replica is not a member of this
     * group, or when the group already holds the partition at some replica index.
     */
    @Override
    public boolean ownPartition(PartitionReplica replica, int index, Integer partitionId) {
        if (!hasNode(replica)) {
            String error = "PartitionReplica does not belong to this group: " + replica.toString();
            LOGGER.warning(error);
            return false;
        }
        if (containsPartition(partitionId)) {
            if (LOGGER.isFinestEnabled()) {
                LOGGER.finest("Partition[" + partitionId + "] is already owned by this group!");
            }
            return false;
        }
        groupPartitionTable.add(index, partitionId);
        return nodePartitionTables.get(replica).add(index, partitionId);
    }

    /**
     * Accepts a partition at group level and queues it for assignment to a member.
     * Rejected when the group already holds the partition at some replica index.
     */
    @Override
    public boolean addPartition(int replicaIndex, Integer partitionId) {
        if (containsPartition(partitionId)) {
            return false;
        }
        if (groupPartitionTable.add(replicaIndex, partitionId)) {
            partitionQ.add(partitionId);
            return true;
        }
        return false;
    }

    /**
     * Iterates over the group's partitions at {@code index}. Removing through the iterator
     * also removes the partition from the member table that holds it and from the queue of
     * partitions still waiting to be assigned to a member.
     */
    @Override
    public Iterator<Integer> getPartitionsIterator(final int index) {
        final Iterator<Integer> iterator = groupPartitionTable.getPartitions(index).iterator();
        return new Iterator<Integer>() {
            Integer current;

            @Override
            public boolean hasNext() {
                return iterator.hasNext();
            }

            @Override
            public Integer next() {
                current = iterator.next();
                return current;
            }

            @Override
            public void remove() {
                iterator.remove();
                doRemovePartition(index, current);
                // A partition that was accepted in this pass but not yet given to a member lives only
                // in partitionQ. Drop it there too, otherwise postProcessPartitionTable() would still
                // assign it to a member although the group no longer holds it.
                partitionQ.remove((Object) current);
            }
        };
    }

    /** Removes the partition from the first member table that holds it at {@code index}. */
    private void doRemovePartition(int index, Integer partitionId) {
        for (PartitionTable table : nodePartitionTables.values()) {
            if (table.remove(index, partitionId)) {
                break;
            }
        }
    }

    /**
     * Assigns the queued partitions to members. With a single member everything goes to it.
     * Otherwise over-loaded members are trimmed to the per-member average, under-loaded
     * members are filled up to that average, and what is left is dealt out one by one.
     */
    @Override
    public void postProcessPartitionTable(int index) {
        if (nodePartitionTables.size() == 1) {
            PartitionTable table = nodePartitionTables.values().iterator().next();
            while (!partitionQ.isEmpty()) {
                table.add(index, partitionQ.poll());
            }
            return;
        }

        List<PartitionTable> underLoadedStates = new LinkedList<>();
        int avgCount = slimDownNodesToAvgPartitionTableSize(index, underLoadedStates);

        for (PartitionTable table : underLoadedStates) {
            // stop as soon as the queue runs dry: polling an empty queue yields null,
            // which must never be stored as a partition id
            while (table.size(index) < avgCount && !partitionQ.isEmpty()) {
                table.add(index, partitionQ.poll());
            }
        }

        // deal out the remainder, one partition per member per round
        while (!partitionQ.isEmpty()) {
            for (PartitionTable table : nodePartitionTables.values()) {
                table.add(index, partitionQ.poll());
                if (partitionQ.isEmpty()) {
                    break;
                }
            }
        }
    }

    /**
     * Trims every member holding more than the per-member average (rounded down) back to that
     * average, queueing the removed partitions, and collects all other members into
     * {@code underLoadedStates}.
     *
     * @return the per-member average partition count at {@code index}
     */
    private int slimDownNodesToAvgPartitionTableSize(int index, List<PartitionTable> underLoadedStates) {
        int totalCount = getPartitionCount(index);
        int avgCount = totalCount / nodePartitionTables.values().size();
        for (PartitionTable table : nodePartitionTables.values()) {
            Set<Integer> partitions = table.getPartitions(index);
            if (partitions.size() > avgCount) {
                // snapshot in iteration order; elements are removed from the end of that order
                Integer[] partitionArray = partitions.toArray(new Integer[0]);
                while (partitions.size() > avgCount) {
                    Integer partitionId = partitionArray[partitions.size() - 1];
                    partitions.remove(partitionId);
                    partitionQ.add(partitionId);
                }
            } else {
                underLoadedStates.add(table);
            }
        }
        return avgCount;
    }

    @Override
    public String toString() {
        return "DefaultNodeGroupRegistry [nodes=" + nodePartitionTables.keySet() + "]";
    }
}
/**
 * Node group that consists of exactly one replica (member); the group's partition table
 * is that member's partition table.
 */
private static class SingleNodeGroup implements NodeGroup {
    final PartitionTable nodeTable = new PartitionTable();
    PartitionReplica replica;
    // empty until addNode() is called, so getReplicas() never hands out null
    Set<PartitionReplica> replicas = Collections.emptySet();

    /** Sets the group's only replica; further calls are ignored with a warning. */
    @Override
    public void addNode(PartitionReplica replica) {
        if (this.replica != null) {
            LOGGER.warning("Single node group already has an address => " + this.replica);
            return;
        }
        this.replica = replica;
        replicas = Collections.singleton(replica);
    }

    @Override
    public boolean hasNode(PartitionReplica replica) {
        return this.replica != null && this.replica.equals(replica);
    }

    @Override
    public Set<PartitionReplica> getReplicas() {
        return replicas;
    }

    /** Returns the node's table for the group's replica, {@code null} for any other replica. */
    @Override
    public PartitionTable getPartitionTable(PartitionReplica replica) {
        return hasNode(replica) ? nodeTable : null;
    }

    @Override
    public void resetPartitions() {
        nodeTable.reset();
    }

    @Override
    public int getPartitionCount(int index) {
        return nodeTable.size(index);
    }

    /** Returns whether the node holds the partition at any replica index. */
    private boolean containsPartition(Integer partitionId) {
        return nodeTable.contains(partitionId);
    }

    /**
     * Registers an existing ownership. It is rejected when the replica is not this group's
     * replica, or when the node already holds the partition at some replica index.
     */
    @Override
    public boolean ownPartition(PartitionReplica replica, int index, Integer partitionId) {
        if (!hasNode(replica)) {
            String error = replica + " is different from this node's " + this.replica;
            LOGGER.warning(error);
            return false;
        }
        if (containsPartition(partitionId)) {
            if (LOGGER.isFinestEnabled()) {
                LOGGER.finest("Partition[" + partitionId + "] is already owned by this node " + replica);
            }
            return false;
        }
        return nodeTable.add(index, partitionId);
    }

    /** Adds the partition unless the node already holds it at some replica index. */
    @Override
    public boolean addPartition(int replicaIndex, Integer partitionId) {
        if (containsPartition(partitionId)) {
            return false;
        }
        return nodeTable.add(replicaIndex, partitionId);
    }

    @Override
    public Iterator<Integer> getPartitionsIterator(int index) {
        Set<Integer> partitionIds = nodeTable.getPartitions(index);
        return partitionIds.iterator();
    }

    /** Nothing to do: partitions are stored directly in the single node's table. */
    @Override
    public void postProcessPartitionTable(int index) {
    }

    @Override
    public String toString() {
        return "SingleNodeGroupRegistry [address=" + replica + "]";
    }
}
/**
 * Partition ids grouped by replica index. The per-index sets are created lazily and keep
 * insertion order.
 */
private static class PartitionTable {
    // generic array creation is not possible; only getPartitions() ever stores into this array
    @SuppressWarnings("unchecked")
    final Set<Integer>[] partitions = new Set[InternalPartition.MAX_REPLICA_COUNT];

    /**
     * Returns the live, modifiable set of partition ids at {@code index}, creating it on first use.
     *
     * @throws ArrayIndexOutOfBoundsException if {@code index} is not a valid replica index
     */
    Set<Integer> getPartitions(int index) {
        check(index);
        Set<Integer> set = partitions[index];
        if (set == null) {
            set = new LinkedHashSet<>();
            partitions[index] = set;
        }
        return set;
    }

    /** Adds the partition at {@code index}; returns {@code false} if it was already there. */
    boolean add(int index, Integer partitionId) {
        return getPartitions(index).add(partitionId);
    }

    /** Returns whether the partition is present at {@code index}. */
    boolean contains(int index, Integer partitionId) {
        return getPartitions(index).contains(partitionId);
    }

    /** Returns whether the partition is present at any replica index. */
    boolean contains(Integer partitionId) {
        for (Set<Integer> set : partitions) {
            if (set != null && set.contains(partitionId)) {
                return true;
            }
        }
        return false;
    }

    /** Removes the partition from {@code index}; returns {@code false} if it was not there. */
    boolean remove(int index, Integer partitionId) {
        return getPartitions(index).remove(partitionId);
    }

    /** Returns the number of partitions at {@code index}. */
    int size(int index) {
        return getPartitions(index).size();
    }

    /** Empties the sets of all replica indexes; the set instances themselves are kept. */
    void reset() {
        for (Set<Integer> set : partitions) {
            if (set != null) {
                set.clear();
            }
        }
    }

    private void check(int index) {
        if (index < 0 || index >= InternalPartition.MAX_REPLICA_COUNT) {
            throw new ArrayIndexOutOfBoundsException(index);
        }
    }
}
}
// © 2015 - 2024 Weber Informatics LLC | Privacy Policy