// com.staros.schedule.ShardSchedulerV2 Maven / Gradle / Ivy
// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package com.staros.schedule;
import com.google.common.base.Preconditions;
import com.staros.exception.ExceptionCode;
import com.staros.exception.ScheduleConflictStarException;
import com.staros.exception.StarException;
import com.staros.proto.AddShardInfo;
import com.staros.proto.AddShardRequest;
import com.staros.proto.PlacementPolicy;
import com.staros.proto.RemoveShardRequest;
import com.staros.schedule.select.FirstNSelector;
import com.staros.schedule.select.Selector;
import com.staros.service.ServiceManager;
import com.staros.shard.Shard;
import com.staros.shard.ShardGroup;
import com.staros.shard.ShardManager;
import com.staros.shard.ShardPolicyFilter;
import com.staros.util.AbstractServer;
import com.staros.util.Config;
import com.staros.util.LockCloseable;
import com.staros.util.Utils;
import com.staros.worker.Worker;
import com.staros.worker.WorkerGroup;
import com.staros.worker.WorkerManager;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import java.io.Closeable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentSkipListSet;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.FutureTask;
import java.util.concurrent.PriorityBlockingQueue;
import java.util.concurrent.RejectedExecutionException;
import java.util.concurrent.ScheduledThreadPoolExecutor;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.ReentrantLock;
import java.util.stream.Collectors;
/**
 * Designed to be a scheduler shared by multiple services.
 * Assumptions:
 *  Uniqueness:
 *    * serviceId + shardId, globally unique
 *    * shardGroupId, globally unique
 *    * workerGroupId, globally unique
 *    * workerId, globally unique
 * Two-phase scheduling:
 *  - phase 1: calculation
 *    * select workers as targets to add/remove a shard replica
 *  - phase 2: dispatch
 *    * send an RPC call to the worker to add/remove the shard replica, and update the shard info upon success.
 * Some calls can skip phase 1 and go directly to phase 2, e.g. to re-add a replica back to a worker after the
 * worker is restarted.
 * TODO: handle the following cases
 *  1. handle shard migration
 *  2. handle workerGroup balance (could be caused by a worker coming online, or triggered by a monitoring system)
 *  3. thread-safety refactor of the serviceManager/shardManager/workerManager components for multi-threaded
 *     scheduling
 */
public class ShardSchedulerV2 extends AbstractServer implements Scheduler {
private static final Logger LOG = LogManager.getLogger(ShardSchedulerV2.class);

// Dispatch priorities. DispatchTask orders tasks so that a larger value is taken from the queue first.
private static final int PRIORITY_LOW = 0;
private static final int PRIORITY_MEDIUM = 10;
private static final int PRIORITY_HIGH = 20;

// Delay before a conflicting request is retried; passed to the scheduler with TimeUnit.MICROSECONDS.
private static final int shortNap = 100; // unit: us

// locker to ensure conflicting requests are processed sequentially.
private final ExclusiveLocker requestLocker = new ExclusiveLocker();

// TODO: Configurable policy based Selector
private final Selector scoreSelector = new FirstNSelector();

// Placement policies for which only one shard of a shard group may be in scheduling at a time.
private static final List<PlacementPolicy> conflictPolicies =
        Arrays.asList(PlacementPolicy.EXCLUDE, PlacementPolicy.PACK, PlacementPolicy.SPREAD);

// Phase 1 executors, mainly focus on selecting target workers for the shard replica. CPU intensive operation.
private ScheduledThreadPoolExecutor calculateExecutors;
// Phase 2 executors, make RPC calls to worker to finalize the selection. Network IO intensive operation.
private ThreadPoolExecutor dispatchExecutors;

private final ServiceManager serviceManager;
private final WorkerManager workerManager;
/**
 * Closeable wrapper around a {@link Runnable} that runs it on {@link #close()} unless the operation
 * was cancelled (or already executed) beforehand. The runnable is executed at most once.
 */
private static class DeferOp implements Closeable {
    private final Runnable runnable;
    // true once the runnable has been executed or the operation has been cancelled
    private boolean done = false;

    public DeferOp(Runnable runnable) {
        this.runnable = runnable;
    }

    /** Marks the operation as finished so that a later {@link #close()} becomes a no-op. */
    public void cancel() {
        done = true;
    }

    @Override
    public void close() {
        if (done) {
            return;
        }
        done = true;
        runnable.run();
    }
}
/**
* Helper class to manage schedule request exclusive relationship
* 1. at any time, only one ScheduleRequestContext with the same tuple (serviceId, shardId, workerGroupId)
* can be entered into phase 2.
* 2. at any time, only one shard of the same ShardGroup (whose placement policy is in conflictPolicies) can
* be entered the phase 2 execution.
* NOTE:
* 1. the 1st conflict doesn't necessarily imply the 2nd conflict. E.g. the shard doesn't have any shardGroup or
* the shard doesn't belong to any shard groups with conflict placement policies.
* 2. the 2nd conflict doesn't necessarily imply the 1st conflict. E.g. it is conflict due to two different shards
* from the same shard group are request to schedule at the same time.
*/
private static class ExclusiveLocker {
// Lock-free Set, no need to protect by lock
protected Set exclusiveContexts = new ConcurrentSkipListSet<>();
// protect exclusiveShardGroups
protected final ReentrantLock exclusiveMapLock = new ReentrantLock();
// workerGroup -> (shardGroup, serviceId + shardId), check conflicts of a workerGroup shard scheduling in the same shardGroup
// TODO: periodically clean the Map which has empty sets due to auto scaling creating/destroying workerGroups
protected final Map> exclusiveShardGroups = new HashMap<>();
/**
* try to set exclusive marker for schedule request with given context.
* @param ctx request context
* @param manager shard manager
* @return true - if the exclusive markers are all set
* false - fail to set the exclusive marker, conflicts detected.
*/
public boolean tryLock(ScheduleRequestContext ctx, ShardManager manager) {
// exclusiveContexts ensure the same request is executed sequentially.
if (exclusiveContexts.add(ctx)) {
// exclusiveShardGroups ensure only one request in progress in the specific shardGroup
if (checkAndUpdateExclusiveShardGroups(ctx, manager)) {
// Set closeable callback
ctx.setRunnable(() -> tryUnlock(ctx));
return true;
} else {
// rollback adding operation
exclusiveContexts.remove(ctx);
}
}
return false;
}
private void tryUnlock(ScheduleRequestContext ctx) {
// reverse order of tryLock()
cleanExclusiveShardGroup(ctx);
exclusiveContexts.remove(ctx);
}
/**
* Clean exclusive shardGroup ids from exclusiveShardGroups
* @param ctx ScheduleRequestContext to be processed
*/
private void cleanExclusiveShardGroup(ScheduleRequestContext ctx) {
// clean shardGroup exclusive marker.
// can't use Shard.getShardGroupIds() from Shard since the shard's group list can be updated in between.
Collection exclusiveGroups = ctx.getExclusiveGroupIds();
if (exclusiveGroups != null && !exclusiveGroups.isEmpty()) {
try (LockCloseable ignored = new LockCloseable(exclusiveMapLock)) {
Collection groupMarker = exclusiveShardGroups.get(ctx.getWorkerGroupId());
if (groupMarker != null) {
groupMarker.removeAll(exclusiveGroups);
}
}
}
}
/**
* Check whether there is any conflict shard that belongs to the same shard group that is in processing.
* @return true: check pass, no conflict. false: check fail, has conflict.
*/
private boolean checkAndUpdateExclusiveShardGroups(ScheduleRequestContext ctx, ShardManager shardManager) {
Shard shard = shardManager.getShard(ctx.getShardId());
if (shard == null) {
return true;
}
Set exclusiveIds = shard.getGroupIds().stream()
.map(shardManager::getShardGroup)
.filter(y -> y != null && conflictPolicies.contains(y.getPlacementPolicy()))
.map(ShardGroup::getGroupId)
.collect(Collectors.toSet());
if (exclusiveIds.isEmpty()) {
return true;
}
try (LockCloseable ignored = new LockCloseable(exclusiveMapLock)) {
Set groupMarker = exclusiveShardGroups.get(ctx.getWorkerGroupId());
if (groupMarker == null) {
groupMarker = new HashSet<>();
exclusiveShardGroups.put(ctx.getWorkerGroupId(), groupMarker);
} else {
// check if there is any conflict shard group in scheduling
if (groupMarker.stream().anyMatch(exclusiveIds::contains)) {
// has conflict, need to do the schedule later.
LOG.debug("Has conflict shardgroup running, retry later. {}", ctx);
return false;
}
}
// add all groupIds into exclusiveShardGroup to prevent shards in the same group to be scheduled.
groupMarker.addAll(exclusiveIds);
// save the exclusiveGroupIds and add into exclusiveShardGroups
ctx.setExclusiveGroupIds(exclusiveIds);
}
return true;
}
}
/**
* Abstraction of phase 2 tasks.
* Wrap of FutureTask with Comparable implementation, so it can be prioritized.
*/
private static class DispatchTask extends FutureTask implements Comparable> {
private final int priority;
private final String description;
public DispatchTask(Runnable runnable, T result, int priority, String description) {
super(runnable, result);
this.priority = priority;
this.description = description;
}
public DispatchTask(Callable callable, int priority, String description) {
super(callable);
this.priority = priority;
this.description = description;
}
@Override
public int compareTo(DispatchTask o) {
// reverse order of Integer, because PriorityQueue peaks element from least to most
return Integer.compare(o.priority, this.priority);
}
@Override
public String toString() {
return description;
}
}
/**
 * Build a phase 2 task that adds the given shards to a worker. The task never throws: its result is
 * null on success, or the StarException describing the failure.
 * @param serviceId service id
 * @param shardIds shard ids to be added
 * @param workerId target worker id
 * @param priority dispatch priority of the task
 * @return the task, not yet submitted
 */
private DispatchTask<StarException> dispatchTaskForAddToWorker(
        String serviceId, List<Long> shardIds, long workerId, int priority) {
    Callable<StarException> callable = () -> {
        try {
            executeAddToWorker(serviceId, shardIds, workerId);
            return null;
        } catch (StarException e) {
            return e;
        } catch (Exception e) {
            // convert any other failure into a StarException so the waiter gets a uniform result type
            return new StarException(ExceptionCode.SCHEDULE, e.getMessage());
        }
    };
    String description = String.format("[AddToWorker Task] serviceId: %s, workerId: %s, priority: %d",
            serviceId, workerId, priority);
    return new DispatchTask<>(callable, priority, description);
}
/**
 * Build a phase 2 task that runs executeAddToGroupPhase2 for the request; the task result is always true.
 * @param ctx request context, with the target workers already selected
 * @param priority dispatch priority of the task
 * @return the task, not yet submitted
 */
private DispatchTask<Boolean> dispatchTaskForAddToGroup(
        ScheduleRequestContext ctx, int priority) {
    String description = String.format("[AddToGroup Task] %s, priority: %d", ctx, priority);
    return new DispatchTask<>(() -> executeAddToGroupPhase2(ctx), true, priority, description);
}
/**
 * Build a phase 2 task that runs executeRemoveFromGroupPhase2 for the request; the task result is always true.
 * @param ctx request context, with the worker to remove the replica from already selected
 * @param priority dispatch priority of the task
 * @return the task, not yet submitted
 */
private DispatchTask<Boolean> dispatchTaskForRemoveFromGroup(ScheduleRequestContext ctx, int priority) {
    String description = String.format("[RemoveFromGroup Task] %s, priority: %d", ctx, priority);
    return new DispatchTask<>(() -> executeRemoveFromGroupPhase2(ctx), true, priority, description);
}
/**
 * Build a phase 2 task that removes the given shards from a worker. The task never throws: its result is
 * null on success, or the StarException describing the failure.
 * @param serviceId service id
 * @param shardIds shard ids to be removed
 * @param workerId target worker id
 * @param priority dispatch priority of the task
 * @return the task, not yet submitted
 */
private DispatchTask<StarException> dispatchTaskForRemoveFromWorker(
        String serviceId, List<Long> shardIds, long workerId, int priority) {
    Callable<StarException> callable = () -> {
        try {
            executeRemoveFromWorker(serviceId, shardIds, workerId);
            return null;
        } catch (StarException e) {
            return e;
        } catch (Exception e) {
            // convert any other failure into a StarException so the waiter gets a uniform result type
            return new StarException(ExceptionCode.SCHEDULE, e.getMessage());
        }
    };
    String description = String.format("[RemoveFromWorker Task] serviceId: %s, workerId: %s, priority: %d",
            serviceId, workerId, priority);
    return new DispatchTask<>(callable, priority, description);
}
/**
 * Create a scheduler bound to the given managers. The thread pools are not created here but in doStart().
 * @param serviceManager used to look up the ShardManager of a service
 * @param workerManager used to look up workers and worker groups
 */
public ShardSchedulerV2(ServiceManager serviceManager, WorkerManager workerManager) {
    this.workerManager = workerManager;
    this.serviceManager = serviceManager;
}
/**
 * BLOCKING: schedule one shard of a service into a worker group and wait until the request completes.
 * Delegates to the list based overload with a single-element list.
 * @param serviceId service id
 * @param shardId shard id
 * @param wgId worker group id
 * @throws StarException exception if fails
 */
@Override
public void scheduleAddToGroup(String serviceId, long shardId, long wgId) throws StarException {
    List<Long> singleShard = Collections.singletonList(shardId);
    scheduleAddToGroup(serviceId, singleShard, wgId);
}
/**
 * BLOCKING interface of scheduling a list of shards to a WorkerGroup. These shards are not necessarily
 * in the same shard group. One phase 1 task is submitted per shard, then the caller blocks until all of
 * them have completed.
 * @param serviceId shard serviceId
 * @param shardIds shard id list to be scheduled
 * @param wgId workerGroup id to be scheduled
 * @throws StarException starException with ExceptionCode; thrown when a task can't be submitted, when the
 *         waiting thread is interrupted, or when any of the shard requests finished with an exception
 */
@Override
public void scheduleAddToGroup(String serviceId, List<Long> shardIds, long wgId) throws StarException {
    CountDownLatch latch = new CountDownLatch(shardIds.size());
    List<ScheduleRequestContext> ctxs = new ArrayList<>(shardIds.size());
    for (Long id : shardIds) {
        ScheduleRequestContext ctx = new ScheduleRequestContext(serviceId, id, wgId, latch);
        ctxs.add(ctx);
        submitCalcTaskInternal(() -> executeAddToGroupPhase1(ctx), 0);
    }
    try {
        latch.await();
    } catch (InterruptedException e) {
        // restore the interrupt flag so that callers further up the stack can still observe the interruption
        Thread.currentThread().interrupt();
        throw new StarException(ExceptionCode.SCHEDULE, e.getMessage());
    }
    for (ScheduleRequestContext ctx : ctxs) {
        if (ctx.getException() != null) {
            // If any shard schedule fails with exception, propagate it to the caller
            throw ctx.getException();
        }
    }
}
/**
 * Asynchronously schedule one shard of a service into a worker group. The phase 1 task is only submitted;
 * the caller does not wait for the outcome (the context is created without a latch).
 * @param serviceId service identity
 * @param shardId shard identity
 * @param wgId worker group identity
 * @throws StarException if the task can't be submitted
 */
@Override
public void scheduleAsyncAddToGroup(String serviceId, long shardId, long wgId) throws StarException {
    final ScheduleRequestContext request = new ScheduleRequestContext(serviceId, shardId, wgId, null);
    Runnable phase1 = () -> executeAddToGroupPhase1(request);
    submitCalcTaskInternal(phase1, 0);
}
/**
 * Request to schedule a list of shards in service (serviceId) into the service's default worker group.
 * Wait for the scheduling done.
 * @param serviceId service identity
 * @param shardIds list of shard identities
 * @throws StarException NOT_EXIST if the service has no default worker group, or any exception raised by
 *         the blocking scheduleAddToGroup call
 */
@Override
public void scheduleAddToDefaultGroup(String serviceId, List<Long> shardIds) throws StarException {
    WorkerGroup group = workerManager.getDefaultWorkerGroup(serviceId);
    if (group == null) {
        throw new StarException(ExceptionCode.NOT_EXIST,
                String.format("DefaultWorkerGroup not exist for service %s", serviceId));
    }
    // TODO: it is still possible that the worker group is deleted after scheduleAddToGroup() is called.
    scheduleAddToGroup(serviceId, shardIds, group.getGroupId());
}
/**
 * Ask the scheduler to drop a redundant replica of the shard from the worker group. The request runs
 *  1. with low priority
 *  2. as a background task, the caller does not wait for it
 *  3. removing one replica at most each time
 * @param serviceId target service id shard belongs to
 * @param shardId shard id
 * @param workerGroupId target worker group id
 * @throws StarException throws star exception if the task can't be added to thread pool.
 */
@Override
public void scheduleAsyncRemoveFromGroup(String serviceId, long shardId, long workerGroupId)
        throws StarException {
    final ScheduleRequestContext request = new ScheduleRequestContext(serviceId, shardId, workerGroupId, null);
    Runnable phase1 = () -> executeRemoveFromGroupPhase1(request);
    submitCalcTaskInternal(phase1, 0);
}
/**
 * Adding shard to group. Phase 1, choose workers from the worker group.
 * A schedule conflict is retried after a short delay; every other failure completes the request with
 * the corresponding exception.
 * @param ctx request context
 */
private void executeAddToGroupPhase1(ScheduleRequestContext ctx) {
    try {
        executeAddToGroupPhase1Detail(ctx);
    } catch (ScheduleConflictStarException e) {
        // A specific exception that schedule understands and retries the request
        // TODO: add RetryPolicy inside RequestCtx to achieve fine controlled retry behavior
        try {
            submitCalcTaskInternal(() -> executeAddToGroupPhase1(ctx), shortNap);
        } catch (StarException submitFailure) {
            // The retry can't be queued (e.g. the scheduler is shutting down). Complete the request with the
            // failure, otherwise nobody would ever call ctx.done() and a blocked caller would wait forever.
            ctx.done(submitFailure);
        }
    } catch (StarException exception) {
        ctx.done(exception);
    } catch (Throwable throwable) {
        ctx.done(new StarException(ExceptionCode.SCHEDULE, throwable.getMessage()));
    }
}
/**
 * Hand a phase 1 task over to the calculateExecutors thread pool, either immediately or after a delay.
 * @param run runnable object
 * @param delay delay in microseconds before execution, usually for retry; 0 means run without delay
 * @exception StarException fail to submit the task
 */
private void submitCalcTaskInternal(Runnable run, long delay) throws StarException {
    try {
        if (delay != 0) {
            calculateExecutors.schedule(run, delay, TimeUnit.MICROSECONDS);
        } else {
            calculateExecutors.execute(run);
        }
    } catch (RejectedExecutionException e) {
        // report a dedicated message when the rejection is caused by the scheduler no longer running
        String reason = isRunning() ? e.getMessage() : "Scheduling shutdown!";
        throw new StarException(ExceptionCode.SCHEDULE, reason);
    }
}
/**
 * Phase2: add a replica of a single shard to the given worker, skipping the phase 1 worker selection.
 * Delegates to the list based overload with a single-element list.
 * @param serviceId service Id
 * @param shardId shard id to be added
 * @param workerId target worker Id to be added
 * @throws StarException Schedule Error
 * TODO: refactor with Priority Queue
 */
@Override
public void scheduleAddToWorker(String serviceId, long shardId, long workerId) throws StarException {
    List<Long> singleShard = Collections.singletonList(shardId);
    scheduleAddToWorker(serviceId, singleShard, workerId);
}
/**
 * Phase2: batch add list of shards to a worker. The task is dispatched with high priority and the
 * caller waits for its result.
 * @param serviceId service Id
 * @param shardIds list of shard ids to be added
 * @param workerId target worker Id to be added
 * @throws StarException Schedule Error
 */
@Override
public void scheduleAddToWorker(String serviceId, List<Long> shardIds, long workerId) throws StarException {
    DispatchTask<StarException> task = dispatchTaskForAddToWorker(serviceId, shardIds, workerId, PRIORITY_HIGH);
    submitDispatchTask(task, true /* wait */);
}
/**
 * Phase2: Async interface to schedule a list of shards to the target worker, don't wait for the result.
 * The task is dispatched with medium priority.
 * @param serviceId service id
 * @param shardIds list of shard ids
 * @param workerId target worker id
 * @throws StarException schedule error
 */
@Override
public void scheduleAsyncAddToWorker(String serviceId, List<Long> shardIds, long workerId) throws StarException {
    DispatchTask<StarException> task = dispatchTaskForAddToWorker(serviceId, shardIds, workerId, PRIORITY_MEDIUM);
    submitDispatchTask(task, false /* wait */);
}
/**
 * Phase2: Remove a single shard replica from the target worker. The worker should belong to the service
 * (serviceId). Delegates to the list based overload with a single-element list.
 * @param serviceId shard service Id
 * @param shardId shard id to be removed
 * @param workerId worker id
 */
@Override
public void scheduleRemoveFromWorker(String serviceId, long shardId, long workerId) throws StarException {
    List<Long> singleShard = Collections.singletonList(shardId);
    scheduleRemoveFromWorker(serviceId, singleShard, workerId);
}
/**
 * Phase2: Remove list of shards from target worker, wait for the result back.
 * The task is dispatched with medium priority.
 * @param serviceId shard service Id
 * @param shardIds list of shard ids to be removed
 * @param workerId target worker id
 * @throws StarException error out if the request can't be fulfilled. ExceptionCode will represent the error category.
 */
@Override
public void scheduleRemoveFromWorker(String serviceId, List<Long> shardIds, long workerId) throws StarException {
    DispatchTask<StarException> task =
            dispatchTaskForRemoveFromWorker(serviceId, shardIds, workerId, PRIORITY_MEDIUM);
    submitDispatchTask(task, true /* wait */);
}
/**
 * Phase2: Remove list of shards from target worker, don't wait for the result.
 * The task is dispatched with low priority.
 * @param serviceId shard service Id
 * @param shardIds list of shard ids to be removed
 * @param workerId target worker id
 * @throws StarException if the task can't be submitted
 */
@Override
public void scheduleAsyncRemoveFromWorker(String serviceId, List<Long> shardIds, long workerId) throws StarException {
    DispatchTask<StarException> task = dispatchTaskForRemoveFromWorker(serviceId, shardIds, workerId, PRIORITY_LOW);
    submitDispatchTask(task, false /* wait */);
}
/**
 * Submit a phase 2 task to the dispatch thread pool and optionally wait for its result.
 * @param task task whose result is null on success or the exception describing the failure
 * @param wait true to block until the task has finished and rethrow its failure, false to return right
 *        after the submission
 * @throws StarException if the task can't be submitted, if waiting for the result fails or is interrupted,
 *         or if the task reported a failure
 */
private void submitDispatchTask(DispatchTask<? extends Exception> task, boolean wait) throws StarException {
    try {
        LOG.debug("Submit task to executor services. Task: {}, waitForResult: {}", task, wait);
        dispatchExecutors.execute(task);
    } catch (Exception e) {
        LOG.error("Fail to submit schedule task {}", task, e);
        throw new StarException(ExceptionCode.SCHEDULE, e.getMessage());
    }
    if (!wait) {
        return;
    }
    Exception exception;
    try {
        // fetch the result once; null means the task succeeded
        exception = task.get();
    } catch (InterruptedException e) {
        // restore the interrupt flag so that callers further up the stack can still observe the interruption
        Thread.currentThread().interrupt();
        LOG.error("Fail to get task result. task: {}", task, e);
        throw new StarException(ExceptionCode.SCHEDULE, e.getMessage());
    } catch (Throwable e) {
        LOG.error("Fail to get task result. task: {}", task, e);
        throw new StarException(ExceptionCode.SCHEDULE, e.getMessage());
    }
    if (exception == null) {
        return;
    }
    if (exception instanceof StarException) {
        throw (StarException) exception;
    }
    // convert to StarException
    throw new StarException(ExceptionCode.SCHEDULE, exception.getMessage());
}
/**
 * Process a single shard schedule request: validate the request, work out how many replicas are missing in
 * the worker group, select target workers and hand the request over to the dispatch thread pool.
 * @param ctx Request context
 * @throws ScheduleConflictStarException the request conflicts with one in progress, caller can retry the ctx.
 * @throws StarException other error cases
 */
private void executeAddToGroupPhase1Detail(ScheduleRequestContext ctx) {
    // TODO: to be multi-threads safe, shard/shardGroup/workerGroup accessing should be all thread-safe.
    ShardManager shardManager = serviceManager.getShardManager(ctx.getServiceId());
    if (shardManager == null) {
        LOG.info("Service {} is not available before request can be processed. Ignore the request!",
                ctx.getServiceId());
        ctx.done(new StarException(ExceptionCode.NOT_EXIST, String.format("Service %s Not Exist", ctx.getServiceId())));
        return;
    }
    Shard shard = shardManager.getShard(ctx.getShardId());
    if (shard == null) {
        LOG.info("Shard {} is not available before request can be processed. Ignore the request!",
                ctx.getShardId());
        ctx.done(new StarException(ExceptionCode.NOT_EXIST, String.format("Shard %d Not Exist", ctx.getShardId())));
        return;
    }

    // check replica numbers: only replicas on alive workers of the requested worker group count
    int replicaNum = shard.getExpectedReplicaNum();
    List<Long> existReplicas = shard.getReplicaWorkerIds().stream()
            .map(workerManager::getWorker)
            .filter(y -> y != null && y.isAlive() && y.getGroupId() == ctx.getWorkerGroupId())
            .map(Worker::getWorkerId)
            .collect(Collectors.toList());
    if (existReplicas.size() >= replicaNum) {
        // All good, nothing to do. Let ShardChecker takes care of the removal.
        ctx.done();
        return;
    }

    int desired = replicaNum - existReplicas.size();
    int priority = PRIORITY_MEDIUM;
    if (ctx.isWaited()) {
        // someone is waiting for the request result
        priority = PRIORITY_HIGH;
    } else if (desired <= replicaNum / 2) {
        // already has more than half of replicas, set its priority to LOW
        priority = PRIORITY_LOW;
    }

    // get WorkerList by groupId
    WorkerGroup wg = workerManager.getWorkerGroupNoException(shard.getServiceId(), ctx.getWorkerGroupId());
    if (wg == null) {
        LOG.warn("WorkerGroup {} is not available before request can be processed. Ignore the request! {}",
                ctx.getWorkerGroupId(), ctx);
        ctx.done(new StarException(ExceptionCode.NOT_EXIST,
                String.format("WorkerGroup %d doesn't exist!", ctx.getWorkerGroupId())));
        return;
    }

    // get all alive worker Ids in the workerGroup
    Set<Long> wIds = new HashSet<>(wg.getAllWorkerIds(true /*onlyAlive*/));
    // exclude workers that already has a copy of the shard
    existReplicas.forEach(wIds::remove);

    // IMPORTANT: add exclusiveShardGroup marker so there is only one shard
    //  from a shardGroup can be scheduled. This should be put before
    //  pulling any shard replicas who co-exists in the same shard group as the processing one.
    if (!requestLocker.tryLock(ctx, shardManager)) {
        ctx.reset();
        throw new ScheduleConflictStarException();
    }

    // The DeferOp releases the exclusive markers on every exit path unless it is cancelled below.
    try (DeferOp cleanOp = new DeferOp(ctx.getRunnable())) { // enters critical area
        // get shard's all 1st-degree shards categorized by shard group type
        Map<PlacementPolicy, List<Long>> ppMap = new HashMap<>();
        for (Long gid : shard.getGroupIds()) {
            ShardGroup g = shardManager.getShardGroup(gid);
            if (g == null) {
                // the group is no longer known to the shard manager, nothing to learn from it
                continue;
            }
            PlacementPolicy policy = g.getPlacementPolicy();
            List<Long> neighborWorkerIds = new ArrayList<>();
            for (long id : g.getShardIds()) {
                if (id == shard.getShardId()) {
                    continue;
                }
                Shard firstDegreeShard = shardManager.getShard(id);
                if (firstDegreeShard == null) {
                    continue;
                }
                // only workers that are still candidates are of interest
                neighborWorkerIds.addAll(firstDegreeShard.getReplicaWorkerIds()
                        .stream()
                        .filter(wIds::contains)
                        .collect(Collectors.toList()));
            }
            if (neighborWorkerIds.isEmpty()) {
                continue;
            }
            ppMap.computeIfAbsent(policy, k -> new ArrayList<>()).addAll(neighborWorkerIds);
            LOG.debug("Add policy {} workers: {} for request ctx: {}", policy, neighborWorkerIds, ctx);
        }
        ShardPolicyFilter.filter(ppMap, wIds);

        // Final: Select valid workerIds
        if (wIds.isEmpty()) {
            LOG.warn("Can't find any worker for schedule request: {}", ctx);
            ctx.done(new StarException(ExceptionCode.SCHEDULE,
                    String.format("Can't find worker for request: %s", ctx)));
            return;
        }
        List<Long> workerIds;
        if (wIds.size() < desired) {
            LOG.warn("Schedule requests {} workers, but only {} available. {}", desired, wIds.size(), ctx);
            workerIds = new ArrayList<>(wIds);
        } else {
            ScheduleScorer scorer = new ScheduleScorer(wIds);
            // apply shardgroup policy scoring
            ppMap.forEach(scorer::apply);
            // apply worker status scoring
            scorer.apply(workerManager);
            LOG.debug("final scores for selection: {}, for request {}", scorer.getScores(), ctx);
            // Select the worker with the HIGHEST score
            workerIds = scorer.selectHighEnd(scoreSelector, desired);
        }
        ctx.setWorkerIds(workerIds);

        // submit to next thread pool to do the execution.
        LOG.debug("Schedule request {}, pending schedule to workerList: {}", ctx, ctx.getWorkerIds());
        DispatchTask<?> task = dispatchTaskForAddToGroup(ctx, priority);
        try {
            dispatchExecutors.execute(task);
            // submit success, don't clean the exclusive group marker; phase 2 releases it
            cleanOp.cancel();
        } catch (Throwable e) {
            LOG.error("Fail to add task {} into dispatchWorkerExecutors", task, e);
            // error out
            ctx.done(new StarException(ExceptionCode.SCHEDULE, e.getMessage()));
        }
    }
}
/**
 * Phase2: Process a task submitted by phase1, talk to the selected workers to assign the shard.
 * NOTE: the runnable registered on the ctx is run when this method exits, whichever branch is taken.
 * @param ctx ScheduleRequestContext to be processed
 */
private void executeAddToGroupPhase2(ScheduleRequestContext ctx) {
    try (DeferOp ignored = new DeferOp(ctx.getRunnable())) {
        if (!isRunning()) {
            ctx.done(new StarException(ExceptionCode.SCHEDULE, "Schedule shutdown in progress"));
            return;
        }
        executeAddToWorker(ctx);
    }
}
/**
 * Add the request's shard to each of the workers selected for the request, then complete the request.
 * A failure on one worker does not stop the remaining workers from being tried; the request is
 * completed with the last failure seen, or with null when every worker succeeded.
 * @param ctx request context carrying the service id, shard id and selected worker ids
 */
private void executeAddToWorker(ScheduleRequestContext ctx) {
    StarException lastFailure = null;
    for (long targetWorkerId : ctx.getWorkerIds()) {
        try {
            List<Long> singleShard = Collections.singletonList(ctx.getShardId());
            executeAddToWorker(ctx.getServiceId(), singleShard, targetWorkerId);
        } catch (StarException starException) {
            lastFailure = starException;
        } catch (Exception other) {
            lastFailure = new StarException(ExceptionCode.SCHEDULE, other.getMessage());
        }
    }
    ctx.done(lastFailure);
}
/**
 * Do execution of adding shards to worker. The shards are sent to the worker in batches of at most
 * Config.SCHEDULER_MAX_BATCH_ADD_SHARD_SIZE; a replica is recorded in the shard manager only for the
 * shards of a batch that the worker accepted.
 * @param serviceId service Id to be operated
 * @param shardIds list of shards to be added
 * @param workerId target worker id
 * @throws StarException NOT_EXIST if the service or the worker is unknown, or if the only requested shard
 *         is unknown; INVALID_ARGUMENT if the worker belongs to another service; SCHEDULE if the worker
 *         rejected at least one batch
 */
private void executeAddToWorker(String serviceId, List<Long> shardIds, long workerId) {
    ShardManager shardManager = serviceManager.getShardManager(serviceId);
    if (shardManager == null) {
        LOG.warn("service {} not exist for schedule request.", serviceId);
        throw new StarException(ExceptionCode.NOT_EXIST, String.format("service %s not exists", serviceId));
    }
    Worker worker = workerManager.getWorker(workerId);
    if (worker == null) {
        LOG.warn("worker {} not exist when execute add shard task for service {}.", workerId, serviceId);
        throw new StarException(ExceptionCode.NOT_EXIST, String.format("worker %d not exists", workerId));
    }
    if (!worker.getServiceId().equals(serviceId)) {
        throw new StarException(ExceptionCode.INVALID_ARGUMENT,
                String.format("worker %d doesn't belong to service: %s", workerId, serviceId));
    }

    // batches.get(i) holds the request payload, batchShardIds.get(i) the ids of exactly those shards
    List<List<AddShardInfo>> batches = new ArrayList<>();
    List<List<Long>> batchShardIds = new ArrayList<>();
    int remainShardSize = shardIds.size();
    int capacity = Integer.min(remainShardSize, Config.SCHEDULER_MAX_BATCH_ADD_SHARD_SIZE);
    List<AddShardInfo> miniBatch = new ArrayList<>(capacity);
    List<Long> miniBatchIds = new ArrayList<>(capacity);
    for (Long id : shardIds) {
        Shard shard = shardManager.getShard(id);
        --remainShardSize;
        if (shard == null) {
            LOG.warn("shard {} not exist when execute add shard task for service {}.", id, serviceId);
            if (shardIds.size() == 1) {
                throw new StarException(ExceptionCode.NOT_EXIST, String.format("shard %d not exists", id));
            }
        } else {
            miniBatch.add(shard.getAddShardInfo());
            miniBatchIds.add(id);
            if (miniBatch.size() >= Config.SCHEDULER_MAX_BATCH_ADD_SHARD_SIZE) {
                batches.add(miniBatch);
                batchShardIds.add(miniBatchIds);
                capacity = Integer.min(remainShardSize, Config.SCHEDULER_MAX_BATCH_ADD_SHARD_SIZE);
                miniBatch = new ArrayList<>(capacity);
                miniBatchIds = new ArrayList<>(capacity);
            }
        }
    }
    if (!miniBatch.isEmpty()) {
        batches.add(miniBatch);
        batchShardIds.add(miniBatchIds);
    }

    List<Long> failedShardIds = new ArrayList<>();
    for (int i = 0; i < batches.size(); ++i) {
        AddShardRequest request = AddShardRequest.newBuilder()
                .setServiceId(serviceId)
                .setWorkerId(workerId)
                .addAllShardInfo(batches.get(i))
                .build();
        if (worker.addShard(request)) {
            // record replicas only for the shards that were part of the accepted batch
            shardManager.addShardReplicas(batchShardIds.get(i), workerId);
        } else {
            failedShardIds.addAll(batchShardIds.get(i));
        }
    }
    if (!failedShardIds.isEmpty()) {
        throw new StarException(ExceptionCode.SCHEDULE,
                String.format("Schedule add shard task execution failed serviceId: %s, workerId: %d, shardIds: %s",
                        serviceId, workerId, failedShardIds));
    }
}
/**
 * Do execution of removing shards. The remove request is only sent when the worker exists, is alive and
 * belongs to the service; in the other cases no request is sent and the replica records are dropped
 * directly. When the request is sent, the records are dropped only if the worker reports success.
 * @param serviceId service Id to be operated
 * @param shardIds list of shards to be removed
 * @param workerId target worker id
 * @throws StarException NOT_EXIST if the service is unknown
 */
private void executeRemoveFromWorker(String serviceId, List<Long> shardIds, long workerId) {
    ShardManager shardManager = serviceManager.getShardManager(serviceId);
    if (shardManager == null) {
        LOG.warn("service {} not exist when execute remove-shard.", serviceId);
        throw new StarException(ExceptionCode.NOT_EXIST, String.format("service %s not exist!", serviceId));
    }

    boolean doClean = true;
    Worker worker = workerManager.getWorker(workerId);
    if (worker == null) {
        LOG.warn("worker {} not exist when execute remove shards for service {}!", workerId, serviceId);
    } else if (!worker.isAlive()) {
        LOG.warn("worker {} dead when execute remove shards for service {}.", workerId, serviceId);
    } else if (!worker.getServiceId().equals(serviceId)) {
        LOG.warn("worker {} doesn't belong to service {}", workerId, serviceId);
    } else {
        RemoveShardRequest request = RemoveShardRequest.newBuilder()
                .setServiceId(serviceId)
                .setWorkerId(workerId)
                .addAllShardIds(shardIds).build();
        doClean = worker.removeShard(request);
    }
    if (doClean) {
        shardManager.removeShardReplicas(shardIds, workerId);
    }
}
/**
 * Try to select a replica to remove for given shard from given worker Group.
 * A schedule conflict is retried after a short delay; every other failure completes the request with
 * the corresponding exception.
 * @param ctx request context
 */
private void executeRemoveFromGroupPhase1(ScheduleRequestContext ctx) {
    if (!isRunning()) {
        ctx.done(new StarException(ExceptionCode.SCHEDULE, "Schedule in shutdown progress"));
        return;
    }
    try {
        executeRemoveFromGroupPhase1Detail(ctx);
    } catch (ScheduleConflictStarException exception) {
        // TODO: add RetryPolicy inside RequestCtx to achieve fine controlled retry behavior
        try {
            submitCalcTaskInternal(() -> executeRemoveFromGroupPhase1(ctx), shortNap);
        } catch (StarException submitFailure) {
            // The retry can't be queued (e.g. the scheduler is shutting down). Complete the request with the
            // failure instead of letting the exception escape and leaving the request unfinished.
            ctx.done(submitFailure);
        }
    } catch (StarException exception) {
        ctx.done(exception);
    } catch (Throwable throwable) {
        ctx.done(new StarException(ExceptionCode.SCHEDULE, throwable.getMessage()));
    }
}
/**
 * execute the task of selecting a replica of the shard from the worker group to remove.
 * A replica on a dead worker is preferred; otherwise the healthy replica with the lowest score is chosen.
 *
 * @param ctx Request context
 * @throws ScheduleConflictStarException - the request conflicts with one in progress, caller can retry the ctx.
 * @throws StarException - other error cases
 */
private void executeRemoveFromGroupPhase1Detail(ScheduleRequestContext ctx) {
    ShardManager shardManager = serviceManager.getShardManager(ctx.getServiceId());
    if (shardManager == null) {
        LOG.info("Service {} is not available before request can be processed. Ignore the request!",
                ctx.getServiceId());
        ctx.done(new StarException(ExceptionCode.NOT_EXIST, String.format("Service %s Not Exist", ctx.getServiceId())));
        return;
    }
    Shard shard = shardManager.getShard(ctx.getShardId());
    if (shard == null) {
        LOG.info("Shard {} is not available before request can be processed. Ignore the request!",
                ctx.getShardId());
        ctx.done(new StarException(ExceptionCode.NOT_EXIST, String.format("Shard %d Not Exist", ctx.getShardId())));
        return;
    }

    // check replica numbers
    int replicaNum = shard.getExpectedReplicaNum();
    List<Long> healthyWorkerIds = new ArrayList<>();
    List<Worker> deadWorkers = new ArrayList<>();
    shard.getReplicaWorkerIds().forEach(id -> {
        Worker worker = workerManager.getWorker(id);
        if (worker == null) {
            // Ask schedule to remove the replica directly.
            scheduleAsyncRemoveFromWorker(
                    ctx.getServiceId(), Collections.nCopies(1, ctx.getShardId()), id);
        } else if (worker.getGroupId() == ctx.getWorkerGroupId()) {
            // Our interested worker group
            if (!worker.isAlive()) {
                deadWorkers.add(worker);
            } else {
                healthyWorkerIds.add(worker.getWorkerId());
            }
        }
    });

    if (healthyWorkerIds.size() + deadWorkers.size() <= replicaNum) {
        // Don't do anything, because right now the total replicas don't exceed the expected number. Wish dead
        // replicas can be back some time in the future. Finger crossed.
        LOG.debug("{}, Number of replicas (include dead ones) are less than expected replica. Skip it.", ctx);
        ctx.done();
        return;
    }

    List<Long> workerIds;
    if (!requestLocker.tryLock(ctx, shardManager)) {
        throw new ScheduleConflictStarException();
    }
    // tryLock registers a runnable on the ctx (ctx.setRunnable()) that cleans the exclusive markers; the
    // DeferOp runs it on every exit path unless it is cancelled below.
    try (DeferOp cleanOp = new DeferOp(ctx.getRunnable())) {
        if (!deadWorkers.isEmpty()) { // easy case
            workerIds = Arrays.asList(selectOldestWorkerLastSeen(deadWorkers).getWorkerId());
        } else {
            // scoring all the replicas
            ScheduleScorer scorer = new ScheduleScorer(healthyWorkerIds);
            for (long groupId : shard.getGroupIds()) {
                ShardGroup group = shardManager.getShardGroup(groupId);
                if (group == null) {
                    continue;
                }
                List<Long> allReplicaWorkerIds = new ArrayList<>();
                for (long sid : group.getShardIds()) {
                    if (sid == shard.getShardId()) {
                        continue;
                    }
                    Shard firstDegreeShard = shardManager.getShard(sid);
                    if (firstDegreeShard == null) {
                        continue;
                    }
                    allReplicaWorkerIds.addAll(firstDegreeShard.getReplicaWorkerIds());
                }
                scorer.apply(group.getPlacementPolicy(), allReplicaWorkerIds);
            }
            scorer.apply(workerManager);
            LOG.debug("final scores for selection: {}, for request {}", scorer.getScores(), ctx);
            // select replica with the LOWEST score to remove
            workerIds = scorer.selectLowEnd(scoreSelector, 1);
            LOG.debug("Final selection for remove-healthy shard, request:{} selection: {}", ctx, workerIds);
        }

        Preconditions.checkState(workerIds.size() == 1L, "Should only have one replica to remove!");
        ctx.setWorkerIds(workerIds);

        LOG.info("Schedule request {}, pending schedule to workerList: {}", ctx, ctx.getWorkerIds());
        DispatchTask<?> task = dispatchTaskForRemoveFromGroup(ctx, ctx.isWaited() ? PRIORITY_MEDIUM : PRIORITY_LOW);
        try {
            dispatchExecutors.execute(task);
            // submit success, don't clean the exclusive group marker; phase 2 releases it
            cleanOp.cancel();
        } catch (Throwable e) {
            LOG.error("Fail to add task {} into dispatchWorkerExecutors", task, e);
            // error out
            ctx.done(new StarException(ExceptionCode.SCHEDULE, e.getMessage()));
        }
    }
}
/**
 * Select the worker whose lastSeenTime is oldest. When several workers share the smallest value, the
 * first of them in list order is returned.
 * @param workers workers to choose from, must not be empty
 * @return the worker with smallest lastSeenTime
 */
private Worker selectOldestWorkerLastSeen(List<Worker> workers) {
    Worker targetWorker = workers.get(0);
    for (Worker worker : workers) {
        if (worker.getLastSeenTime() < targetWorker.getLastSeenTime()) {
            targetWorker = worker;
        }
    }
    return targetWorker;
}
/**
 * Phase2: Process a task submitted by phase1, remove the shard replica from each selected worker and
 * complete the request. A failure on one worker does not stop the remaining workers from being
 * processed; the request is completed with the last failure seen, or with null on success.
 * NOTE: the runnable registered on the ctx is run when this method exits, whichever branch is taken.
 * @param ctx ScheduleRequestContext to be processed
 */
private void executeRemoveFromGroupPhase2(ScheduleRequestContext ctx) {
    try (DeferOp ignored = new DeferOp(ctx.getRunnable())) {
        if (!isRunning()) {
            ctx.done(new StarException(ExceptionCode.SCHEDULE, "Schedule shutdown in progress"));
            return;
        }
        StarException exception = null;
        for (long workerId : ctx.getWorkerIds()) {
            try {
                executeRemoveFromWorker(ctx.getServiceId(), Collections.nCopies(1, ctx.getShardId()), workerId);
            } catch (StarException e) {
                exception = e;
            } catch (Exception e) {
                exception = new StarException(ExceptionCode.SCHEDULE, e.getMessage());
            }
        }
        // always complete the request, mirroring the add path, so that the outcome is recorded on the ctx
        ctx.done(exception);
    }
}
/**
 * Tell whether the scheduler has no outstanding work, based on the task counters of both thread pools.
 * @return true if the scheduler is not running, or if for both pools the completed task count equals
 *         the task count
 */
@Override
public boolean isIdle() {
    if (isRunning()) {
        boolean calcDrained = calculateExecutors.getCompletedTaskCount() == calculateExecutors.getTaskCount();
        return calcDrained
                && dispatchExecutors.getCompletedTaskCount() == dispatchExecutors.getTaskCount();
    }
    return true;
}
/**
 * Create the two thread pools: a scheduled pool for phase 1 and a fixed-size pool backed by a
 * priority queue for phase 2.
 */
@Override
public void doStart() {
    calculateExecutors = new ScheduledThreadPoolExecutor(Config.SCHEDULER_CORE_THREAD_POOL_SIZE);

    // core size == max size, so the dispatch pool has a fixed number of threads
    final int dispatchThreads = Config.SCHEDULER_DISPATCH_WORKER_THREAD_POOL_SIZE;
    final long keepAliveMs = 0L;
    dispatchExecutors = new ThreadPoolExecutor(
            dispatchThreads, dispatchThreads, keepAliveMs, TimeUnit.MILLISECONDS, new PriorityBlockingQueue<>());
}
/**
 * Shut down both thread pools through Utils.shutdownExecutorService: the phase 1 (calculation) pool
 * first, then the phase 2 (dispatch) pool.
 */
@Override
public void doStop() {
Utils.shutdownExecutorService(calculateExecutors);
Utils.shutdownExecutorService(dispatchExecutors);
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy