// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package com.staros.shard;
import com.google.common.base.Preconditions;
import com.staros.exception.ExceptionCode;
import com.staros.exception.StarException;
import com.staros.proto.PlacementPolicy;
import com.staros.schedule.ReplicaWorkerInvertIndex;
import com.staros.schedule.ScheduleScorer;
import com.staros.schedule.Scheduler;
import com.staros.schedule.select.FirstNSelector;
import com.staros.service.ServiceManager;
import com.staros.util.AbstractServer;
import com.staros.util.Config;
import com.staros.worker.Worker;
import com.staros.worker.WorkerGroup;
import com.staros.worker.WorkerManager;
import org.apache.logging.log4j.Level;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.atomic.AtomicLong;
import java.util.stream.Collectors;
public class ShardChecker extends AbstractServer {
private static final Logger LOG = LogManager.getLogger(ShardChecker.class);
private final ServiceManager serviceManager;
private final WorkerManager workerManager;
private final Scheduler scheduler;
// TODO: use thread pool
private final Thread checkThread;
private final FirstNSelector selector;
private final long coolDownMs = 60 * 1000; // 60 seconds
private final AtomicLong allows = new AtomicLong();
public ShardChecker(ServiceManager serviceManager, WorkerManager workerManager, Scheduler scheduler) {
this.serviceManager = serviceManager;
this.workerManager = workerManager;
this.scheduler = scheduler;
this.selector = new FirstNSelector();
this.checkThread = new Thread(this::runCheckThread);
}
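// Illustrative wiring sketch (not part of this file; assumes AbstractServer exposes start()/stop()
// lifecycle methods that delegate to doStart()/doStop()):
//   ShardChecker checker = new ShardChecker(serviceManager, workerManager, scheduler);
//   checker.start(); // spawns the background check thread
//   ...
//   checker.stop();  // interrupts and joins the thread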
@Override
public void doStart() {
checkThread.start();
}
@Override
public void doStop() {
try {
checkThread.interrupt();
checkThread.join();
} catch (InterruptedException e) {
LOG.warn("join shard checker thread failed! {}", e.getMessage());
}
}
/**
* Sampled logging: avoids overwhelming the log file with overly frequent messages.
*
* @param level log level
* @param message log message
*/
private void sampleLogging(Level level, String message) {
long now = System.currentTimeMillis();
long allowed = allows.get();
if (now > allowed && allows.compareAndSet(allowed, now + coolDownMs)) {
LOG.log(level, message);
}
}
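// Example (illustrative): with coolDownMs = 60_000, the first call that passes the time check logs
// its message and pushes the gate 60 seconds into the future; any call arriving before that point
// is silently dropped. The compareAndSet guarantees that at most one thread wins the logging slot
// when several race past the time check concurrently.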
private void runCheckThread() {
while (isRunning()) {
if (Config.DISABLE_BACKGROUND_SHARD_SCHEDULE_CHECK) {
// Reduce logging frequency: the check interval is 10s, but the sampled logging fires at most once per minute.
sampleLogging(Level.WARN,
"DISABLE_BACKGROUND_SHARD_SCHEDULE_CHECK is turned on. Shard balancing is disabled!");
} else {
LOG.debug("running shard check once.");
for (String serviceId : serviceManager.getServiceIdSet()) {
shardHealthCheckForService(serviceId);
}
// TODO: shard balance inside a shard group, does not need to do it every loop
for (String serviceId : serviceManager.getServiceIdSet()) {
shardGroupBalanceCheckForService(serviceId);
}
}
try {
Thread.sleep(Config.SHARD_CHECKER_LOOP_INTERVAL_SEC * 1000L);
} catch (InterruptedException e) {
LOG.info("shard checker thread interrupted! {}", e.getMessage());
}
}
}
/**
* check the shards inside a service
*
* @param serviceId the target service id
*/
protected void shardHealthCheckForService(String serviceId) {
LOG.debug("Start shard replica health check for service: {}", serviceId);
ShardManager shardManager = serviceManager.getShardManager(serviceId);
if (shardManager == null) {
LOG.info("ShardManager not exist for service {}, skip the healthy check.", serviceId);
return;
}
// Get a snapshot of shard ids, new added shards will be checked in next round.
List<Long> shardIds = shardManager.getAllShardIds();
for (long shardId : shardIds) {
Shard shard = shardManager.getShard(shardId);
if (shard == null) {
LOG.info("shard not exist for service {}, shard {}, skip it.", serviceId, shardId);
continue;
}
try {
shardHealthCheck(shard);
} catch (StarException exception) {
LOG.info("Got exception during processing service:{}, shard:{} health check, skip it.",
serviceId, shardId, exception);
}
}
}
/**
* Check the given shard's replica status: remove dead replicas and add missing ones. The task is expected to
* run with low priority.
*
* @param shard the shard to be checked
*/
private void shardHealthCheck(Shard shard) throws StarException {
// A shard is NOT expected to have more than `nHealthyWeight` unhealthy replicas.
final int nHealthyWeight = 10000;
List<Long> replicas = shard.getReplicaWorkerIds();
// Divide replicas into healthy and unhealthy replicas, grouped by workerGroupId.
// workerGroup -> numOfReplicas (nHealthyWeight * healthy + unhealthy)
Map<Long, Integer> healthy = new HashMap<>();
// worker ids of replicas whose worker can no longer be found
List<Long> unknownWorkers = new ArrayList<>();
replicas.forEach(workerId -> {
Worker w = workerManager.getWorker(workerId);
if (w == null) {
unknownWorkers.add(workerId);
} else {
// Just a lazy way to encode numOfHealthyWorkers and numOfDeadWorkers into a single counter
final int delta = w.isAlive() ? nHealthyWeight : 1;
// A workerGroup that has unhealthy replicas but zero healthy replicas MUST still be added into the
// healthy map, so that shard replicas can be scheduled to that workerGroup again.
// Otherwise the following scenario is possible:
// a) a shard has healthy replicas in a worker group.
// b) the replicas become unhealthy and no healthy replica is left (e.g. the worker is dead).
// c) the unhealthy replicas are removed and scheduling of new replicas is triggered.
// d) scheduling fails, so no replica exists in the worker group (e.g. no available workers).
// e) new workers are added into the worker group.
// f) the shard replica check would skip this worker group even though workers are now available.
healthy.compute(w.getGroupId(), (key, value) -> value == null ? delta : value + delta);
}
});
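// Encoding example (illustrative): a worker group holding 2 alive replicas and 1 dead replica ends
// up with the value 2 * nHealthyWeight + 1 = 20001, which is decoded below as
// nHealthy = 20001 / 10000 = 2 and nUnhealthy = 20001 % 10000 = 1.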
// TODO: sophisticated control of removal by adding more worker states: UP/DISCONNECTED/DOWN/CONNECTING
// * DEAD workers can be removed directly
// * DISCONNECTED/CONNECTING workers are unhealthy, but may become functional again if given more time.
// Check whether the remaining replicas meet the expected replica count.
String serviceId = shard.getServiceId();
long shardId = shard.getShardId();
if (!unknownWorkers.isEmpty()) {
List<Long> shardIds = Collections.nCopies(1, shardId);
for (long workerId : unknownWorkers) {
try {
scheduler.scheduleAsyncRemoveFromWorker(serviceId, shardIds, workerId);
} catch (StarException exception) {
// log a message and continue
LOG.info("Fail to schedule a remove-shard request for service:{}, shard:{}, worker:{}, error:",
serviceId, shardId, workerId, exception);
}
}
}
int expectedNum = shard.getExpectedReplicaCount();
for (Map.Entry<Long, Integer> entry : healthy.entrySet()) {
long workerGroupId = entry.getKey();
int nHealthy = entry.getValue() / nHealthyWeight;
int nUnhealthy = entry.getValue() % nHealthyWeight;
// Compare against the expected replica count:
// 1. request to add replicas up to the expected count
// 2. request to remove redundant replicas down to the expected count
try {
if (nHealthy < expectedNum) {
// give a try to schedule the shard again to the workerGroup, let scheduler choose a proper worker.
LOG.debug("Request schedule new replicas for service:{}, shard:{}, workerGroup:{},"
+ " expected num: {}, actual num: {}",
serviceId, shardId, workerGroupId, expectedNum, nHealthy);
scheduler.scheduleAsyncAddToGroup(serviceId, shardId, workerGroupId);
}
if (nHealthy + nUnhealthy > expectedNum) {
LOG.debug("Remove redundant replicas for service:{}, shard:{}, workerGroup:{},"
+ " expected num: {}, actual num: {}",
serviceId, shardId, workerGroupId, expectedNum, nHealthy + nUnhealthy);
scheduler.scheduleAsyncRemoveFromGroup(serviceId, shardId, workerGroupId);
}
} catch (StarException exception) {
LOG.info("Fail to schedule tasks to scheduler. error:", exception);
}
}
}
/**
* Check and balance shard group if needed
*
* @param serviceId target service id
*/
protected void shardGroupBalanceCheckForService(String serviceId) {
LOG.debug("Start shard group balance health check for service: {}", serviceId);
ShardManager shardManager = serviceManager.getShardManager(serviceId);
if (shardManager == null) {
LOG.info("ShardManager not exist for service {}, skip the shard group balance check.", serviceId);
return;
}
// Get a snapshot of shard group ids, new created shard group will be checked in next round.
List<Long> shardGroupIds = shardManager.getAllShardGroupIds();
for (long groupId : shardGroupIds) {
ShardGroup group = shardManager.getShardGroup(groupId);
if (group == null) {
LOG.info("shard group {} not exist for service {}, skip it.", groupId, serviceId);
continue;
}
if (group.getShardIds().isEmpty()) {
LOG.debug("empty shard group {} in service {}. skip it!", groupId, serviceId);
continue;
}
try {
PlacementPolicy policy = group.getPlacementPolicy();
switch (policy) {
case PACK:
balancePackShardGroup(shardManager, group);
break;
case SPREAD:
balanceSpreadShardGroup(shardManager, group);
break;
case EXCLUDE:
balanceExcludeShardGroup(shardManager, group);
break;
case NONE:
case RANDOM:
default:
break;
}
} catch (StarException exception) {
LOG.info("Got exception during processing service:{}, shardgroup:{} balance check, skip it.",
serviceId, groupId, exception);
}
}
}
private void balanceExcludeShardGroup(ShardManager shardManager, ShardGroup group) {
// (A EXCLUDE B) && (B EXCLUDE C) => (A EXCLUDE C), NO!
ReplicaWorkerInvertIndex index = new ReplicaWorkerInvertIndex();
index.buildFrom(workerManager, shardManager, group);
for (long workerGroupId : index.getAllWorkerGroupIds()) {
WorkerGroup workerGroup = workerManager.getWorkerGroup(group.getServiceId(), workerGroupId);
if (workerGroup == null) {
continue;
}
List<Long> workerIds = workerGroup.getAllWorkerIds(true);
if (workerIds.size() < 2) {
LOG.info("worker group:{} only has {} alive workers. Skip the balance check.",
workerGroupId, workerIds.size());
continue;
}
// Divide workers into two sets, HAVE replicas v.s. NO replicas
// move replicas from HAVE replicas to NO replicas.
// DON'T try to move around for following case:
// workerA: (X, Y, Z), workerB: (W) --> workerA: (X, Y), workerB: (Z, W)
// It breaks the EXCLUDE rule, makes no sense to balance around.
List<Long> srcWorkerIds = new ArrayList<>();
List<Long> tgtWorkerIds = new ArrayList<>();
for (long workerId : workerIds) {
if (index.getReplicaShardList(workerId).isEmpty()) {
tgtWorkerIds.add(workerId);
} else {
srcWorkerIds.add(workerId);
}
}
if (tgtWorkerIds.isEmpty()) {
// no available workers to move replicas to
return;
}
// Sort source workers by their replica count in descending order
srcWorkerIds.sort(
(o1, o2) -> Integer.compare(index.getReplicaShardList(o2).size(), index.getReplicaShardList(o1).size()));
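// Example (illustrative): if workerA hosts replicas (X, Y, Z) of this group and workerB hosts none,
// workerA is a source and workerB a target, so one of X/Y/Z is moved to workerB. Once every source
// hosts at most one replica of the group, the EXCLUDE constraint is already satisfied and the loop
// below stops early.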
ScheduleScorer tgtScore = new ScheduleScorer(tgtWorkerIds);
tgtScore.apply(workerManager);
Iterator<Long> srcIt = srcWorkerIds.iterator();
while (srcIt.hasNext() && !tgtScore.isEmpty()) {
// srcWorkerId: least score, targetWorkerId: most score, move from srcWorkerId -> targetWorkerId
long srcWorkerId = srcIt.next();
List<Long> tgtIdList = tgtScore.selectHighEnd(selector, 1);
if (tgtIdList.isEmpty()) {
break;
}
long tgtWorkerId = tgtIdList.get(0);
List<Long> candidates = new ArrayList<>(index.getReplicaShardList(srcWorkerId));
if (candidates.size() <= 1) {
// Sources are sorted by replica count in descending order, so no remaining source has more than one replica either. Nothing left to move.
break;
}
long selected = candidates.get(0);
String serviceId = group.getServiceId();
try {
LOG.info("[ExcludeGroup] Try to balance shard:{} (service:{}) replica from worker:{} => worker:{}",
selected, serviceId, srcWorkerId, tgtWorkerId);
scheduler.scheduleAddToWorker(serviceId, selected, tgtWorkerId);
// remove tgtWorkerId from scorer
tgtScore.remove(tgtWorkerId);
// only submit the asyncRemoveFromWorker if AddToWorker succeeds.
scheduler.scheduleAsyncRemoveFromWorker(
serviceId, Collections.nCopies(1, selected), srcWorkerId);
} catch (Exception exception) {
LOG.info("[ExcludeGroup] Fail to balance shard:{} in service:{}, form worker:{} to worker:{}. Error:",
selected, serviceId, srcWorkerId, tgtWorkerId, exception);
}
}
}
}
private void balancePackShardGroup(ShardManager shardManager, ShardGroup group) {
// (A PACK B) && (B PACK C) => (A PACK C), YES!
// TODO: mark the PACK shard group UNSTABLE?
// Recursively find all PACK shard groups that are connected through shared shards.
List<ShardGroup> packGroups = new ArrayList<>();
List<Shard> packShards = new ArrayList<>();
int replicaNum = collectAllShardsAndGroupsRecursively(shardManager, group, packGroups, packShards);
LOG.info("PACK groups balance: groups:{}, shards:{}, replicaNum:{}",
packGroups.size(), packShards.size(), replicaNum);
Optional<Long> minGroupId = packGroups.stream().map(ShardGroup::getGroupId).min(Comparator.naturalOrder());
Preconditions.checkState(minGroupId.isPresent());
if (minGroupId.get() < group.getGroupId()) {
// This closure has already been handled by the pass for the group with the minimal id.
return;
}
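// Example (illustrative): if PACK groups 10, 12 and 15 are transitively connected through shared
// shards, each of the three per-group invocations collects the same closure, but only the
// invocation whose current group id equals the minimal id (10) continues past this point; the
// others return here to avoid doing the same work three times.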
ReplicaWorkerInvertIndex index = new ReplicaWorkerInvertIndex();
packShards.forEach(x -> index.addReplicas(workerManager, x));
for (long workerGroupId : index.getAllWorkerGroupIds()) {
WorkerGroup workerGroup = workerManager.getWorkerGroup(group.getServiceId(), workerGroupId);
if (workerGroup == null) {
continue;
}
List<Long> workerIds = workerGroup.getAllWorkerIds(true);
if (workerIds.size() < 2) {
LOG.info("worker group:{} only has {} alive workers. Skip the balance check.",
workerGroup.getGroupId(), workerIds.size());
continue;
}
if (workerIds.size() < replicaNum) {
// Can't fulfill the requirement anyway.
LOG.info("Worker group:{} only has {} alive workers. Shard in PACK shard group requires {} replicas. Skip it.",
workerGroupId, workerIds.size(), replicaNum);
continue;
}
balancePackGroupInSingleWorkerGroup(workerIds, group, replicaNum, packGroups, index);
}
}
/**
* Collect all shards and PACK shard groups starting from a specific shard group recursively
*
* @param shardManager shard manager
* @param startGroup the shard group to start the visit
* @param groups result group list
* @param shards result shard list
* @return the expected replica number shared by all collected shards
* @throws StarException thrown when the replica number check fails
*/
private int collectAllShardsAndGroupsRecursively(ShardManager shardManager, ShardGroup startGroup,
List<ShardGroup> groups, List<Shard> shards) throws StarException {
int replica = -1;
List<ShardGroup> todoGroup = new ArrayList<>();
todoGroup.add(startGroup);
List<Shard> blockList = new ArrayList<>();
while (!todoGroup.isEmpty()) {
List<ShardGroup> newGroups = new ArrayList<>();
for (ShardGroup grp : todoGroup) {
if (groups.contains(grp)) { // group already checked
continue;
}
groups.add(grp);
for (long shardId : grp.getShardIds()) {
Shard shard = shardManager.getShard(shardId);
if (shard == null || shards.contains(shard) || blockList.contains(shard)) {
// the shard doesn't exist or is already checked
continue;
}
if (replica == -1) {
replica = shard.getExpectedReplicaCount();
} else if (replica != shard.getExpectedReplicaCount()) {
throw new StarException(ExceptionCode.INTERNAL, String.format(
"Inconsistent shard replica number in PACK group, expected number:%d, actual number:%d",
replica, shard.getExpectedReplicaCount()));
}
boolean hasPrecedenceGroup = false;
for (long gid : shard.getGroupIds()) {
ShardGroup grp2 = shardManager.getShardGroup(gid);
if (grp2 != null) {
if (grp2.getPlacementPolicy() == PlacementPolicy.PACK) {
newGroups.add(grp2);
} else if (grp2.getPlacementPolicy().getNumber() > PlacementPolicy.PACK.getNumber() &&
grp2.getShardIds().size() > 1) {
// skip the shard and let the higher-precedence (e.g. EXCLUDE) group adjust the distribution.
hasPrecedenceGroup = true;
}
}
}
if (hasPrecedenceGroup) {
blockList.add(shard);
} else {
shards.add(shard);
}
}
}
todoGroup = newGroups;
}
return replica;
}
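// Walk-through of the collection above (illustrative): PACK group A = {s1, s2} and PACK group B =
// {s2, s3} share shard s2. Starting from A, s1 and s2 are collected and B is queued through s2, so
// s3 is collected in the next round. A shard that additionally belongs to a higher-precedence group
// (e.g. EXCLUDE) with more than one shard is placed on the block list and left for that group to
// arrange, instead of being packed here.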
private void balancePackGroupInSingleWorkerGroup(List<Long> workerIds, ShardGroup currentGroup, int replicaNum,
List<ShardGroup> groups, ReplicaWorkerInvertIndex index) {
ScheduleScorer scorer = new ScheduleScorer(workerIds);
List<Long> workersWithReplicas = new ArrayList<>();
for (long workerId : workerIds) {
int numReplicas = index.getReplicaShardList(workerId).size();
if (numReplicas == 0) {
continue;
}
workersWithReplicas.add(workerId);
scorer.apply(PlacementPolicy.PACK, Collections.nCopies(numReplicas, workerId));
}
if (workersWithReplicas.size() == replicaNum) {
// ALL DONE: replicas already live on exactly `replicaNum` workers; the shard health checker will take care of any missing replicas.
return;
}
scorer.apply(workerManager);
List<Long> selectedWorkers = scorer.selectHighEnd(selector, replicaNum);
if (selectedWorkers.size() != replicaNum) {
// defensive coding, right now it is impossible as long as workerIds.size() >= replicaNum
LOG.info("Failed to select {} workers from candidates while doing shard group:{} balance check, skip it!",
replicaNum, currentGroup.getGroupId());
return;
}
LOG.debug("[PackGroup] shardGroup: {}. Existing workers with replica: {}, selected tgargetWorkers: {}",
currentGroup.getGroupId(), workersWithReplicas, selectedWorkers);
List<Long> existShardIds = new ArrayList<>();
workersWithReplicas.forEach(x -> existShardIds.addAll(index.getReplicaShardList(x)));
List<Long> validTargetShardIdList = new ArrayList<>();
for (ShardGroup packGroup : groups) {
// skip the shard groups that have no replicas in this worker group at all.
if (packGroup.getShardIds().stream().anyMatch(existShardIds::contains)) {
validTargetShardIdList.addAll(packGroup.getShardIds());
}
}
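// Example (illustrative): with replicaNum = 3 and replicas currently spread over workers
// {w1, w2, w5, w7}, suppose the scorer selects {w1, w2, w3} as targets. Each selected worker first
// receives the group shards it is missing, then the replicas left on w5 and w7 (no longer selected)
// are removed asynchronously below.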
for (long wid : selectedWorkers) {
Collection<Long> existingIdList = index.getReplicaShardList(wid);
List<Long> todoIdList = validTargetShardIdList.stream()
.filter(x -> !existingIdList.contains(x))
.collect(Collectors.toList());
if (todoIdList.isEmpty()) {
continue;
}
try {
// Add the new replicas and wait for the result.
// NOTE: this is a batch add operation that does not check each shard's current replica count,
// so as a side effect it also brings under-replicated shards back toward the expected count.
scheduler.scheduleAddToWorker(currentGroup.getServiceId(), todoIdList, wid);
} catch (StarException exception) {
LOG.info("Fail to schedule new replicas of shard:{} to worker:{}, error:", todoIdList, wid, exception);
return;
}
}
// Remove the selected workers from the list; replicas left on the remaining workers will be cleaned up.
workersWithReplicas.removeIf(selectedWorkers::contains);
for (long wid : workersWithReplicas) {
Collection<Long> todoList = index.getReplicaShardList(wid);
if (todoList.isEmpty()) {
continue;
}
// schedule remove jobs asynchronously.
LOG.info("Submit async task to remove shard replicas:{} from worker:{}", todoList, wid);
scheduler.scheduleAsyncRemoveFromWorker(currentGroup.getServiceId(), new ArrayList<>(todoList), wid);
}
}
private void balanceSpreadShardGroup(ShardManager shardManager, ShardGroup group) {
// (A SPREAD B) && (B SPREAD C) => (A SPREAD C), NO!
//
// Assume a new replica of a shard is to be added to the worker group:
// all candidate workers are scored from most preferred to least preferred.
// The balancer moves a few replicas from the least preferred workers to the most preferred workers.
ReplicaWorkerInvertIndex index = new ReplicaWorkerInvertIndex();
index.buildFrom(workerManager, shardManager, group);
for (long workerGroupId : index.getAllWorkerGroupIds()) {
WorkerGroup workerGroup = workerManager.getWorkerGroup(group.getServiceId(), workerGroupId);
if (workerGroup == null) {
continue;
}
List<Long> workerIds = workerGroup.getAllWorkerIds(true);
if (workerIds.size() < 2) {
LOG.info("worker group:{} only has {} alive workers. Skip the balance check.",
workerGroupId, workerIds.size());
continue;
}
ScheduleScorer scorer = new ScheduleScorer(workerIds);
for (long workerId : workerIds) {
int numReplicas = index.getReplicaShardList(workerId).size();
if (numReplicas == 0) {
continue;
}
scorer.apply(group.getPlacementPolicy(), Collections.nCopies(numReplicas, workerId));
}
scorer.apply(workerManager);
List<Map.Entry<Long, Double>> sortedEntries = scorer.getScores().entrySet().stream()
.sorted(Map.Entry.comparingByValue(Comparator.naturalOrder()))
.collect(Collectors.toList());
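// Example (illustrative, assuming Config.SCHEDULER_BALANCE_MAX_SKEW = 1 and scores here ordered by
// replica count): with per-worker replica counts w1=4, w2=2, w3=1 for this group, w1 (lowest score)
// is paired with w3 (highest score); since 4 > 1 + 1, one replica that w3 does not already host is
// moved from w1 to w3.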
int head = 0;
int tail = sortedEntries.size() - 1;
while (head < tail) {
// srcWorkerId: least score, targetWorkerId: most score, move from srcWorkerId -> targetWorkerId
long srcWorkerId = sortedEntries.get(head).getKey();
long targetWorkerId = sortedEntries.get(tail).getKey();
Collection<Long> srcList = index.getReplicaShardList(srcWorkerId);
Collection<Long> tgtList = index.getReplicaShardList(targetWorkerId);
// TODO: sortedEntries is not strictly sorted by replica numbers of this shard group.
// It is possible that a worker with more replicas of this shard group is ahead of
// a worker with fewer replicas of this shard group due to its load and total replicas.
if (srcList.size() <= tgtList.size() + Config.SCHEDULER_BALANCE_MAX_SKEW) {
break;
}
// Select a shard from `srcList` and migrate it to the target worker.
List<Shard> candidates = srcList.stream()
.filter(x -> !tgtList.contains(x)) // doesn't have replica in target worker
.map(shardManager::getShard)
.filter(y -> y != null &&
!hasPrecedenceShardGroup(shardManager, group.getPlacementPolicy(), y.getGroupIds()))
.collect(Collectors.toList());
if (!candidates.isEmpty()) {
// TODO: randomly choose one?
// Or prefer the shard with more replicas,
// or skip shards that do not yet have the expected number of replicas in this round.
Shard selected = candidates.get(0);
try {
LOG.info("Try to balance shard:{} (service:{}) replica from worker:{} => worker:{}",
selected.getShardId(), selected.getServiceId(), srcWorkerId, targetWorkerId);
scheduler.scheduleAddToWorker(selected.getServiceId(), selected.getShardId(), targetWorkerId);
// only submit the asyncRemoveFromWorker if AddToWorker succeeds.
scheduler.scheduleAsyncRemoveFromWorker(
selected.getServiceId(), Collections.nCopies(1, selected.getShardId()), srcWorkerId);
} catch (Exception exception) {
LOG.info("Fail to balance shard:{} in service:{}, form worker:{} to worker:{}. Error:",
selected.getShardId(), selected.getServiceId(), srcWorkerId, targetWorkerId, exception);
}
} else {
// TODO: if there are no candidates, look into PACK groups and check whether any PACK group
// can be migrated as a whole.
}
++head;
--tail;
}
}
}
/**
* Determine whether any shard group in the given shard group id list has a higher precedence than the given policy.
*
* @param shardManager shard manager, to get shard group by id
* @param policy baseline policy
* @param groupIds the list of shard groups to be compared.
* @return true if any shard group has a higher precedence.
*/
private boolean hasPrecedenceShardGroup(ShardManager shardManager, PlacementPolicy policy, List<Long> groupIds) {
for (long id : groupIds) {
ShardGroup group = shardManager.getShardGroup(id);
if (group == null) {
continue;
}
if (group.getPlacementPolicy().getNumber() > policy.getNumber() && group.getShardIds().size() > 1) {
return true;
}
}
return false;
}
}