// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package com.staros.shard;
import com.google.common.base.Preconditions;
import com.staros.exception.ExceptionCode;
import com.staros.exception.StarException;
import com.staros.proto.PlacementPolicy;
import com.staros.schedule.ReplicaWorkerInvertIndex;
import com.staros.schedule.ScheduleScorer;
import com.staros.schedule.Scheduler;
import com.staros.schedule.select.FirstNSelector;
import com.staros.service.ServiceManager;
import com.staros.util.AbstractServer;
import com.staros.util.Config;
import com.staros.worker.Worker;
import com.staros.worker.WorkerGroup;
import com.staros.worker.WorkerManager;
import org.apache.logging.log4j.Level;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.concurrent.atomic.AtomicLong;
import java.util.stream.Collectors;
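/**
 * Background checker that periodically verifies shard replica health (removing replicas on unknown
 * workers, adding missing replicas, trimming redundant ones) and balances shard groups according to
 * their placement policies (PACK / SPREAD / EXCLUDE). All work is driven by a single check thread,
 * see runCheckThread().
 */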
public class ShardChecker extends AbstractServer {
private static final Logger LOG = LogManager.getLogger(ShardChecker.class);
private final ServiceManager serviceManager;
private final WorkerManager workerManager;
private final Scheduler scheduler;
// TODO: use thread pool
private final Thread checkThread;
private final FirstNSelector selector;
private final long coolDownMs = 60 * 1000; // 60 seconds
private AtomicLong allows = new AtomicLong();
public ShardChecker(ServiceManager serviceManager, WorkerManager workerManager, Scheduler scheduler) {
this.serviceManager = serviceManager;
this.workerManager = workerManager;
this.scheduler = scheduler;
this.selector = new FirstNSelector();
this.checkThread = new Thread(this::runCheckThread);
}
@Override
public void doStart() {
checkThread.start();
}
@Override
public void doStop() {
try {
checkThread.interrupt();
checkThread.join();
} catch (InterruptedException e) {
LOG.warn("failed to join shard checker thread! {}", e.getMessage());
}
}
/**
* Sampled logging: avoids overly frequent logging from overwhelming the log file
*
* @param level log level
* @param message log message
*/
private void sampleLogging(Level level, String message) {
long now = System.currentTimeMillis();
long allowed = allows.get();
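// compareAndSet guarantees that at most one thread wins per cool-down window, so at most one
// message is logged every coolDownMs even under concurrent callers.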
if (now > allowed && allows.compareAndSet(allowed, now + coolDownMs)) {
LOG.log(level, message);
}
}
private void runCheckThread() {
while (isRunning()) {
if (Config.DISABLE_BACKGROUND_SHARD_SCHEDULE_CHECK) {
// reduce logging frequency: the check interval is 10s, but sampled logging emits at most one message per minute.
sampleLogging(Level.WARN,
"DISABLE_BACKGROUND_SHARD_SCHEDULE_CHECK is turned on. Shard balancing is disabled!");
} else {
LOG.debug("running shard check once.");
for (String serviceId : serviceManager.getServiceIdSet()) {
shardHealthCheckForService(serviceId);
}
// TODO: shard balancing inside a shard group does not need to run on every loop iteration
for (String serviceId : serviceManager.getServiceIdSet()) {
shardGroupBalanceCheckForService(serviceId);
}
}
try {
Thread.sleep(Config.SHARD_CHECKER_LOOP_INTERVAL_SEC * 1000L);
} catch (InterruptedException e) {
LOG.info("shard checker thread interrupted! {}", e.getMessage());
}
}
}
/**
* Check the health of all shards inside a service
*
* @param serviceId the target service id
*/
protected void shardHealthCheckForService(String serviceId) {
LOG.debug("Start shard replica health check for service: {}", serviceId);
ShardManager shardManager = serviceManager.getShardManager(serviceId);
if (shardManager == null) {
LOG.info("ShardManager not exist for service {}, skip the healthy check.", serviceId);
return;
}
// Get a snapshot of shard ids; newly added shards will be checked in the next round.
List<Long> shardIds = shardManager.getAllShardIds();
for (long shardId : shardIds) {
Shard shard = shardManager.getShard(shardId);
if (shard == null) {
continue;
}
try {
shardHealthCheck(shard);
} catch (StarException exception) {
LOG.info("Got exception during processing service:{}, shard:{} health check, skip it.",
serviceId, shardId, exception);
}
}
}
/**
* Check the given shard's replica status: remove dead replicas and add missing replicas. The task is expected to
* run with low priority.
*
* @param shard the shard to be checked
*/
private void shardHealthCheck(Shard shard) throws StarException {
// DON'T EXPECT a shard to have more than `nHealthyWeight` unhealthy replicas
final int nHealthyWeight = 10000;
List<Long> replicas = shard.getReplicaWorkerIds();
// Divide replicas into healthy and unhealthy ones, with healthy replicas grouped by workerGroupId.
// workerGroup -> numOfReplicas (nHealthyWeight * healthy + unhealthy)
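// e.g. a worker group holding 2 healthy + 1 unhealthy replicas is encoded as 2 * 10000 + 1 = 20001.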
Map<Long, Integer> healthy = new HashMap<>();
// worker ids of replicas whose worker can no longer be found (unknown workers)
List<Long> unknownWorkers = new ArrayList<>();
replicas.forEach(workerId -> {
Worker w = workerManager.getWorker(workerId);
if (w == null) {
unknownWorkers.add(workerId);
} else {
// Just a lazy way to encode numOfHealthyWorkers and numOfDeadWorkers into a single counter
final int delta = w.isAlive() ? nHealthyWeight : 1;
// A workerGroup that has unhealthy replicas but zero healthy replicas MUST still be added into the
// `healthy` map, so that shard replicas can be scheduled to the workerGroup again.
// Otherwise the following scenario is possible:
// a) a shard has healthy replicas in a worker group.
// b) the replicas become unhealthy, no healthy replica left (e.g. the worker is dead).
// c) the unhealthy replicas are removed and scheduling of new replicas is triggered.
// d) scheduling fails, no replica exists in the worker group (e.g. no available workers).
// e) new workers are added into the worker group.
// f) the shard replica check is skipped even though there are now available workers.
healthy.compute(w.getGroupId(), (key, value) -> value == null ? delta : value + delta);
}
});
// TODO: sophisticated control of removal by adding more state of worker: UP/DISCONNECTED/DOWN/CONNECTING
// * DEAD workers can be removed directly
// * DISCONNECTED/CONNECTING workers are unhealthy workers, but may be functional if giving more time.
// Check whether the remaining replicas meet the expected number of replicas.
String serviceId = shard.getServiceId();
long shardId = shard.getShardId();
if (!unknownWorkers.isEmpty()) {
List<Long> shardIds = Collections.nCopies(1, shardId);
for (long workerId : unknownWorkers) {
try {
scheduler.scheduleAsyncRemoveFromWorker(serviceId, shardIds, workerId);
} catch (StarException exception) {
// log a message and continue
LOG.debug("Fail to schedule a remove-shard request for service:{}, shard:{}, worker:{}, error:",
serviceId, shardId, workerId, exception);
}
}
}
int expectedNum = shard.getExpectedReplicaNum();
for (Map.Entry<Long, Integer> entry : healthy.entrySet()) {
long workerGroupId = entry.getKey();
int nHealthy = entry.getValue() / nHealthyWeight;
int nUnhealthy = entry.getValue() % nHealthyWeight;
// 1. If there are fewer healthy replicas than expected, request to add replicas up to the expected count.
// 2. If there are more replicas (healthy + unhealthy) than expected, request to remove replicas down to
//    the expected count.
try {
if (nHealthy < expectedNum) {
// give a try to schedule the shard again to the workerGroup, let scheduler choose a proper worker.
LOG.debug("Request schedule new replicas for service:{}, shard:{}, workerGroup:{},"
+ " expected num: {}, actual num: {}",
serviceId, shardId, workerGroupId, expectedNum, nHealthy);
scheduler.scheduleAsyncAddToGroup(serviceId, shardId, workerGroupId);
}
if (nHealthy + nUnhealthy > expectedNum) {
LOG.debug("Remove redundant replicas for service:{}, shard:{}, workerGroup:{},"
+ " expected num: {}, actual num: {}",
serviceId, shardId, workerGroupId, expectedNum, nHealthy + nUnhealthy);
scheduler.scheduleAsyncRemoveFromGroup(serviceId, shardId, workerGroupId);
}
} catch (StarException exception) {
LOG.info("Fail to schedule tasks to scheduler. error:", exception);
}
}
}
/**
* Check and balance shard group if needed
*
* @param serviceId target service id
*/
protected void shardGroupBalanceCheckForService(String serviceId) {
LOG.debug("Start shard group balance health check for service: {}", serviceId);
ShardManager shardManager = serviceManager.getShardManager(serviceId);
if (shardManager == null) {
LOG.info("ShardManager not exist for service {}, skip the shard group balance check.", serviceId);
return;
}
// Get a snapshot of shard group ids; newly created shard groups will be checked in the next round.
List<Long> shardGroupIds = shardManager.getAllShardGroupIds();
for (long groupId : shardGroupIds) {
ShardGroup group = shardManager.getShardGroup(groupId);
if (group == null) {
continue;
}
if (group.getShardIds().isEmpty()) {
LOG.debug("empty shard group {} in service {}. skip it!", groupId, serviceId);
continue;
}
try {
PlacementPolicy policy = group.getPlacementPolicy();
switch (policy) {
case PACK:
balancePackShardGroup(shardManager, group);
break;
case SPREAD:
balanceSpreadShardGroup(shardManager, group);
break;
case EXCLUDE:
balanceExcludeShardGroup(shardManager, group);
break;
case NONE:
case RANDOM:
default:
break;
}
} catch (StarException exception) {
LOG.info("Got exception during processing service:{}, shardgroup:{} balance check, skip it.",
serviceId, groupId, exception);
}
}
}
private void balanceExcludeShardGroup(ShardManager shardManager, ShardGroup group) {
// (A EXCLUDE B) && (B EXCLUDE C) => (A EXCLUDE C), NO!
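// Since EXCLUDE is not transitive, each EXCLUDE group is balanced on its own, unlike PACK groups,
// which are merged into connected components (see balancePackShardGroup()).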
ReplicaWorkerInvertIndex index = new ReplicaWorkerInvertIndex();
index.buildFrom(workerManager, shardManager, group);
for (long workerGroupId : index.getAllWorkerGroupIds()) {
WorkerGroup workerGroup = workerManager.getWorkerGroupNoException(group.getServiceId(), workerGroupId);
if (workerGroup == null) {
continue;
}
List<Long> workerIds = workerGroup.getAllWorkerIds(true);
if (workerIds.size() < 2) {
continue;
}
// Divide workers into two sets: workers that HAVE replicas v.s. workers that have NO replicas,
// then move replicas from the HAVE set to the NO set.
// DON'T try to shuffle replicas between workers that both already have replicas, e.g.:
// workerA: (X, Y, Z), workerB: (W) --> workerA: (X, Y), workerB: (Z, W)
// That breaks the EXCLUDE rule, so it makes no sense to balance this way.
List<Long> srcWorkerIds = new ArrayList<>();
List<Long> tgtWorkerIds = new ArrayList<>();
for (long workerId : workerIds) {
if (index.getReplicaShardList(workerId).isEmpty()) {
tgtWorkerIds.add(workerId);
} else {
srcWorkerIds.add(workerId);
}
}
if (tgtWorkerIds.isEmpty()) {
// no available workers to move replica
return;
}
// Sort source workers by replica count in descending order.
srcWorkerIds.sort(
(o1, o2) -> Integer.compare(index.getReplicaShardList(o2).size(), index.getReplicaShardList(o1).size()));
ScheduleScorer tgtScore = new ScheduleScorer(tgtWorkerIds);
tgtScore.apply(workerManager);
Iterator<Long> srcIt = srcWorkerIds.iterator();
while (srcIt.hasNext() && !tgtScore.isEmpty()) {
// srcWorkerId: least score, targetWorkerId: most score, move from srcWorkerId -> targetWorkerId
long srcWorkerId = srcIt.next();
List<Long> tgtIdList = tgtScore.selectHighEnd(selector, 1);
if (tgtIdList.isEmpty()) {
break;
}
long tgtWorkerId = tgtIdList.get(0);
List<Long> candidates = new ArrayList<>(index.getReplicaShardList(srcWorkerId));
if (candidates.size() <= 1) {
// No need to move any replica
break;
}
long selected = candidates.get(0);
String serviceId = group.getServiceId();
try {
LOG.debug("[ExcludeGroup] Try to balance shard:{} (service:{}) replica from worker:{} => worker:{}",
selected, serviceId, srcWorkerId, tgtWorkerId);
scheduler.scheduleAddToWorker(serviceId, selected, tgtWorkerId);
// remove tgtWorkerId from scorer
tgtScore.remove(tgtWorkerId);
// only submit the asyncRemoveFromWorker if AddToWorker success.
scheduler.scheduleAsyncRemoveFromWorker(
serviceId, Collections.nCopies(1, selected), srcWorkerId);
} catch (Exception exception) {
LOG.info("[ExcludeGroup] Fail to balance shard:{} in service:{}, form worker:{} to worker:{}. Error:",
selected, serviceId, srcWorkerId, tgtWorkerId, exception);
}
}
}
}
private void balancePackShardGroup(ShardManager shardManager, ShardGroup group) {
// (A PACK B) && (B PACK C) => (A PACK C), YES!
// TODO: mark the PACK shard group UNSTABLE?
// recursively find all PACK shard groups that are connected by shared shards.
List<ShardGroup> packGroups = new ArrayList<>();
List<Shard> packShards = new ArrayList<>();
int replicaNum = collectAllShardsAndGroupsRecursively(shardManager, group, packGroups, packShards);
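// replicaNum is the expected replica count shared by every shard in the connected PACK groups;
// collectAllShardsAndGroupsRecursively() throws if the shards disagree on it.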
Optional<Long> minGroupId = packGroups.stream().map(ShardGroup::getGroupId).min(Comparator.naturalOrder());
Preconditions.checkState(minGroupId.isPresent());
if (minGroupId.get() < group.getGroupId()) {
// this connected component is handled when processing the group with the minimal id, so skip it here.
return;
}
ReplicaWorkerInvertIndex index = new ReplicaWorkerInvertIndex();
packShards.forEach(x -> index.addReplicas(workerManager, x));
for (long workerGroupId : index.getAllWorkerGroupIds()) {
WorkerGroup workerGroup = workerManager.getWorkerGroupNoException(group.getServiceId(), workerGroupId);
if (workerGroup == null) {
continue;
}
List<Long> workerIds = workerGroup.getAllWorkerIds(true);
if (workerIds.size() < 2) {
continue;
}
if (workerIds.size() < replicaNum) {
// Can't fulfill the replica requirement in any case.
LOG.debug("Worker group:{} only has {} alive workers. Shard in PACK shard group requires {} replicas. Skip it.",
workerGroupId, workerIds.size(), replicaNum);
continue;
}
balancePackGroupInSingleWorkerGroup(workerIds, group, replicaNum, packGroups, index);
}
}
/**
* Collect all shards and PACK shard groups starting from a specific shard group recursively
*
* @param shardManager shard manager
* @param startGroup the shard group to start the visit
* @param groups result group list
* @param shards result shard list
* @return the expected replica number shared by all collected shards
* @throws StarException thrown when the replica number check fails
*/
private int collectAllShardsAndGroupsRecursively(ShardManager shardManager, ShardGroup startGroup,
List<ShardGroup> groups, List<Shard> shards) throws StarException {
int replica = -1;
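// -1 means "not determined yet"; the first shard visited sets the expected replica count.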
List<ShardGroup> todoGroup = new ArrayList<>();
todoGroup.add(startGroup);
List<Shard> blockList = new ArrayList<>();
while (!todoGroup.isEmpty()) {
List<ShardGroup> newGroups = new ArrayList<>();
for (ShardGroup grp : todoGroup) {
if (groups.contains(grp)) { // group already checked
continue;
}
groups.add(grp);
for (long shardId : grp.getShardIds()) {
Shard shard = shardManager.getShard(shardId);
if (shard == null || shards.contains(shard) || blockList.contains(shard)) {
// the shard doesn't exist or is already checked
continue;
}
if (replica == -1) {
replica = shard.getExpectedReplicaNum();
} else if (replica != shard.getExpectedReplicaNum()) {
throw new StarException(ExceptionCode.INTERNAL, String.format(
"Inconsistent shard replica number in PACK group, expected number:%d, actual number:%d",
replica, shard.getExpectedReplicaNum()));
}
boolean hasPrecedenceGroup = false;
for (long gid : shard.getGroupIds()) {
ShardGroup grp2 = shardManager.getShardGroup(gid);
if (grp2 != null) {
if (grp2.getPlacementPolicy() == PlacementPolicy.PACK) {
newGroups.add(grp2);
} else if (grp2.getPlacementPolicy().getNumber() > PlacementPolicy.PACK.getNumber() &&
grp2.getShardIds().size() > 1) {
// skip the shard; let the higher-precedence group (e.g. EXCLUDE) adjust the distribution.
hasPrecedenceGroup = true;
}
}
}
if (hasPrecedenceGroup) {
blockList.add(shard);
} else {
shards.add(shard);
}
}
}
todoGroup = newGroups;
}
return replica;
}
private void balancePackGroupInSingleWorkerGroup(List<Long> workerIds, ShardGroup currentGroup, int replicaNum,
List<ShardGroup> groups, ReplicaWorkerInvertIndex index) {
ScheduleScorer scorer = new ScheduleScorer(workerIds);
List<Long> workersWithReplicas = new ArrayList<>();
for (long workerId : workerIds) {
int numReplicas = index.getReplicaShardList(workerId).size();
if (numReplicas == 0) {
continue;
}
workersWithReplicas.add(workerId);
scorer.apply(PlacementPolicy.PACK, Collections.nCopies(numReplicas, workerId));
}
if (workersWithReplicas.size() == replicaNum) {
// ALL DONE, shard health checker will take care of the missing replicas
return;
}
scorer.apply(workerManager);
List<Long> selectedWorkers = scorer.selectHighEnd(selector, replicaNum);
if (selectedWorkers.size() != replicaNum) {
// defensive coding, right now it is impossible as long as workerIds.size() >= replicaNum
LOG.info("Failed to select {} workers from candidates while doing shard group:{} balance check, skip it!",
replicaNum, currentGroup.getGroupId());
return;
}
LOG.debug("[PackGroup] shardGroup: {}. Existing workers with replica: {}, selected tgargetWorkers: {}",
currentGroup.getGroupId(), workersWithReplicas, selectedWorkers);
List<Long> existShardIds = new ArrayList<>();
workersWithReplicas.forEach(x -> existShardIds.addAll(index.getReplicaShardList(x)));
List<Long> validTargetShardIdList = new ArrayList<>();
for (ShardGroup packGroup : groups) {
// skip the shard groups that have no replicas in this worker group at all.
if (packGroup.getShardIds().stream().anyMatch(existShardIds::contains)) {
validTargetShardIdList.addAll(packGroup.getShardIds());
}
}
for (long wid : selectedWorkers) {
Collection<Long> existingIdList = index.getReplicaShardList(wid);
List<Long> todoIdList = validTargetShardIdList.stream()
.filter(x -> !existingIdList.contains(x))
.collect(Collectors.toList());
if (todoIdList.isEmpty()) {
continue;
}
try {
// add new replica and wait for the result.
// NOTE: this is a batch adding operation without considering shard existing replica count,
// so it will get replicas back to expected count if any shard has missing replicas.
scheduler.scheduleAddToWorker(currentGroup.getServiceId(), todoIdList, wid);
} catch (StarException exception) {
LOG.info("Fail to schedule new replicas of shard:{} to worker:{}, error:", todoIdList, wid, exception);
return;
}
}
// remove selectedWorkers, replicas on the remaining workers will be cleaned
workersWithReplicas.removeIf(selectedWorkers::contains);
for (long wid : workersWithReplicas) {
Collection<Long> todoList = index.getReplicaShardList(wid);
if (todoList.isEmpty()) {
continue;
}
// schedule remove jobs asynchronously.
LOG.debug("Submit async task to remove shard replicas:{} from worker:{}", todoList, wid);
scheduler.scheduleAsyncRemoveFromWorker(currentGroup.getServiceId(), new ArrayList<>(todoList), wid);
}
}
private void balanceSpreadShardGroup(ShardManager shardManager, ShardGroup group) {
// (A SPREAD B) && (B SPREAD C) => (A SPREAD C), NO!
//
// Assume a new replica of a shard is to be added to the worker group:
// all target workers are scored from most preferred to least preferred.
// The balancer moves a few replicas from the least preferred worker to the most preferred worker.
ReplicaWorkerInvertIndex index = new ReplicaWorkerInvertIndex();
index.buildFrom(workerManager, shardManager, group);
for (long workerGroupId : index.getAllWorkerGroupIds()) {
balanceSpreadGroupInSingleWorkerGroup(shardManager, group, index, workerGroupId);
}
}
private void balanceSpreadGroupInSingleWorkerGroup(ShardManager shardManager, ShardGroup group,
ReplicaWorkerInvertIndex index, long workerGroupId) {
WorkerGroup workerGroup = workerManager.getWorkerGroupNoException(group.getServiceId(), workerGroupId);
if (workerGroup == null) {
return;
}
List<Long> workerIds = workerGroup.getAllWorkerIds(true);
if (workerIds.size() <= 1) {
return;
}
boolean done = false;
while (!done) { // repeat the loop until the group is balanced in the target worker group
if (workerIds.size() <= 1) {
return;
}
ScheduleScorer scorer = new ScheduleScorer(workerIds);
for (long workerId : workerIds) {
int numReplicas = index.getReplicaShardList(workerId).size();
if (numReplicas == 0) {
continue;
}
scorer.apply(group.getPlacementPolicy(), Collections.nCopies(numReplicas, workerId));
}
scorer.apply(workerManager);
List<Map.Entry<Long, Double>> sortedEntries = scorer.getScores().entrySet().stream()
.sorted(Map.Entry.comparingByValue(Comparator.naturalOrder()))
.collect(Collectors.toList());
// srcWorkerId: least score, targetWorkerId: most score, move from srcWorkerId -> targetWorkerId
long srcWorkerId = sortedEntries.get(0).getKey();
long targetWorkerId = sortedEntries.get(sortedEntries.size() - 1).getKey();
Collection<Long> srcList = index.getReplicaShardList(srcWorkerId);
Collection<Long> tgtList = index.getReplicaShardList(targetWorkerId);
// TODO: sortedEntries is not strictly sorted by the replica count of this shard group.
// A worker with more replicas of this shard group may be ranked ahead of
// a worker with fewer replicas of this shard group, due to its load and total replica count.
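// Example (with a hypothetical SCHEDULER_BALANCE_MAX_SKEW of 2): a 5 vs. 2 replica split
// (5 > 2 + 2) triggers a move, while 4 vs. 2 is considered balanced enough and stops the loop.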
if (srcList.size() <= tgtList.size() + Config.SCHEDULER_BALANCE_MAX_SKEW) {
done = true;
continue;
}
// Select a shard from `srcList` and migrate to `tgtList`
// `relaxCandidates` contains all the shards that are not null and don't have replica in target worker.
List<Shard> relaxCandidates = srcList.stream()
.filter(x -> !tgtList.contains(x)) // doesn't have replica in target worker
.map(shardManager::getShard)
.filter(Objects::nonNull)
.collect(Collectors.toList());
if (relaxCandidates.isEmpty()) {
done = true;
continue;
}
// `candidates` contains the shards from `relaxCandidates`, excluding shards that belong to a shard group with
// higher precedence than the current (SPREAD) group, i.e. shards that are also in a PACK/EXCLUDE meta group.
List<Shard> candidates = relaxCandidates.stream()
.filter(x -> !hasPrecedenceShardGroup(shardManager, group.getPlacementPolicy(), x.getGroupIds()))
.collect(Collectors.toList());
List<Shard> finalSelects = new ArrayList<>();
if (!candidates.isEmpty()) {
// TODO: random choose one ?
// or the shard with more replicas will be highly preferred
// or if the shard doesn't have enough expected num of replicas, will be skipped in this round.
finalSelects.add(candidates.get(0));
} else {
// use `relaxCandidates`
for (Shard current : relaxCandidates) {
// Collect all shards that satisfy the following conditions:
// 1. they are within the same PACK group as `current`
// 2. they have at least one replica in this worker group
ShardGroup startGroup = null;
for (long currentShardGroupId : current.getGroupIds()) {
ShardGroup tmpGroup = shardManager.getShardGroup(currentShardGroupId);
if (tmpGroup == null || tmpGroup.getPlacementPolicy() != PlacementPolicy.PACK) {
continue;
}
startGroup = tmpGroup;
break;
}
if (startGroup == null) {
// should not happen, but just let it go.
continue;
}
List<Shard> packedShards = new ArrayList<>();
// not used, just want to reuse collectAllShardsAndGroupsRecursively() interface
List<ShardGroup> packGroups = new ArrayList<>();
collectAllShardsAndGroupsRecursively(shardManager, startGroup, packGroups, packedShards);
// get the shard list that has replica in the worker group
List<Shard> effectiveShards = new ArrayList<>();
List<ShardGroup> effectiveShardGroups = new ArrayList<>();
for (Shard shard : packedShards) {
if (shard.getReplicaWorkerIds().stream().anyMatch(workerIds::contains)) {
effectiveShards.add(shard);
// we plan to move the shard; will all the groups the shard belongs to agree to the movement?
for (long groupId : shard.getGroupIds()) {
ShardGroup groupObj = shardManager.getShardGroup(groupId);
if (groupObj == null) {
continue;
}
if (groupObj.getPlacementPolicy() != PlacementPolicy.PACK
&& !effectiveShardGroups.contains(groupObj)) {
effectiveShardGroups.add(groupObj);
}
}
}
}
int agree = 0;
int reject = 0;
for (ShardGroup affectGroup : effectiveShardGroups) {
// measure, from the affected group's point of view, whether the movement is acceptable
ReplicaWorkerInvertIndex tempIndex = new ReplicaWorkerInvertIndex();
tempIndex.buildFrom(workerManager, shardManager, affectGroup);
double scoreBeforeMove = calculateScore(tempIndex, workerIds, affectGroup.getPlacementPolicy());
for (Shard toMove : effectiveShards) {
// assume move the shard from srcWorkerId to targetWorkerId
if (toMove.getGroupIds().contains(affectGroup.getGroupId())) {
tempIndex.removeReplica(toMove.getShardId(), srcWorkerId);
tempIndex.addReplica(toMove.getShardId(), targetWorkerId, workerGroupId);
}
}
// NOTE: this `epsilon` has a relationship with the number of nodes in the workerGroup.
// It is the threshold for detecting real changes while ignoring floating-point calculation noise.
// * Given a worker group with `n` nodes, where node i has Xi (i=0..n-1) replicas, the minimal
//   deviation change caused by moving one replica from one node to another is: 2 / n
// Choose a value of `epsilon` <= 2 / n to be able to detect even a single replica movement in a group
// of n nodes.
final double epsilon = 0.001d; // n <= 2000
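// e.g. (per the reasoning above) in a worker group of n = 100 nodes, a single replica movement
// changes the deviation by at least 2 / 100 = 0.02, well above epsilon = 0.001.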
// score again
double scoreAfterMove = calculateScore(tempIndex, workerIds, affectGroup.getPlacementPolicy());
if (Math.abs(scoreAfterMove - scoreBeforeMove) > epsilon) {
// only act when the score changes noticeably; otherwise treat the movement as neutral.
// This avoids unstable adjustments back and forth.
if (scoreAfterMove > scoreBeforeMove) {
++reject;
break;
} else if (scoreAfterMove < scoreBeforeMove) {
++agree;
}
}
}
if (agree > 0 && reject == 0) {
// someone agrees, no one objects, make the move
finalSelects.addAll(effectiveShards);
break;
}
// exclude the case that all are neutral to the movement.
}
if (finalSelects.isEmpty()) {
done = true;
continue;
}
}
// Consolidate the selection:
// `finalSelects`: remove from srcWorkerId, add to targetWorkerId
List<Long> allTodoIds = finalSelects.stream().map(Shard::getShardId).collect(Collectors.toList());
Preconditions.checkState(!allTodoIds.isEmpty());
boolean addSuccess = false;
try {
LOG.debug("Try to balance shard:{} (service:{}) replica from worker:{} => worker:{}",
allTodoIds, group.getServiceId(), srcWorkerId, targetWorkerId);
scheduler.scheduleAddToWorker(group.getServiceId(), allTodoIds, targetWorkerId);
addSuccess = true;
// update the inverted index with the new schedule info, but only for shards that belong to the `group`
finalSelects.forEach(x -> {
if (x.getGroupIds().contains(group.getGroupId())) {
index.addReplica(x.getShardId(), targetWorkerId, workerGroupId);
}
});
} catch (Exception exception) {
LOG.info("Fail to balance shard:{} in service:{}, form worker:{} to worker:{}. Error:",
allTodoIds, group.getServiceId(), srcWorkerId, targetWorkerId, exception);
// remove targetWorkerId from the list, so it will be considered as unavailable in this round of check.
workerIds.remove(targetWorkerId);
}
if (addSuccess) {
try {
if (allTodoIds.size() > 1) {
// the group removal action will affect the shard replica distribution and
// eventually affect next round's choice.
scheduler.scheduleRemoveFromWorker(group.getServiceId(), allTodoIds, srcWorkerId);
} else {
scheduler.scheduleAsyncRemoveFromWorker(group.getServiceId(), allTodoIds, srcWorkerId);
// Assume the removal succeeds; even if it is not done yet or eventually fails, it won't affect
// the balance result too much because the info has already been updated in `index`.
}
finalSelects.forEach(x -> {
if (x.getGroupIds().contains(group.getGroupId())) {
index.removeReplica(x.getShardId(), srcWorkerId);
}
});
} catch (Exception exception) {
LOG.debug("Fail to remove shard:{} replica from worker:{}", allTodoIds, srcWorkerId);
// ignore scheduleAsyncRemoveFromWorker failure
}
}
}
}
/**
* Calculate the score of the given distribution. The lower the score, the better the result.
*/
private double calculateScore(ReplicaWorkerInvertIndex index, List<Long> workerIds, PlacementPolicy policy) {
switch (policy) {
case SPREAD:
// replica spread variance, the smaller, the better
return calculateDeviation(index, workerIds);
case EXCLUDE:
// percentage of empty workers; the lower, the better.
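// e.g. 1 empty worker out of 4 => score 0.25; no empty workers => 0.0 (the best case).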
return workerIds.stream().filter(x -> index.getReplicaShardList(x).isEmpty()).count() / (double) workerIds.size();
default:
return 0;
}
}
// D(X) = E(X^2) - E^2(X)
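// e.g. replica counts [3, 1, 2] => mean = 2, E(X^2) = (9 + 1 + 4) / 3 = 14/3, variance ~= 0.667.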
private double calculateDeviation(ReplicaWorkerInvertIndex index, List<Long> workerIds) {
double mean = 0;
double sMean = 0;
for (long id : workerIds) {
int n = index.getReplicaShardList(id).size();
mean += n;
sMean += n * n;
}
mean = mean / workerIds.size();
return sMean / workerIds.size() - mean * mean;
}
/**
* Determine whether any shard group in the given shard group id list has a higher precedence than the given policy.
*
* @param shardManager shard manager, to get shard group by id
* @param policy baseline policy
* @param groupIds the list of shard group ids to be compared
* @return true if any shard group has a higher precedence than the given policy
*/
private boolean hasPrecedenceShardGroup(ShardManager shardManager, PlacementPolicy policy, List<Long> groupIds) {
for (long id : groupIds) {
ShardGroup group = shardManager.getShardGroup(id);
if (group == null) {
continue;
}
if (group.getPlacementPolicy().getNumber() > policy.getNumber() && group.getShardIds().size() > 1) {
return true;
}
}
return false;
}
}