// Copyright 2021-present StarRocks, Inc. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.


package com.staros.schedule;

import com.google.common.base.Preconditions;
import com.staros.exception.ExceptionCode;
import com.staros.exception.ScheduleConflictException;
import com.staros.exception.StarException;
import com.staros.proto.AddShardInfo;
import com.staros.proto.AddShardRequest;
import com.staros.proto.PlacementPolicy;
import com.staros.proto.RemoveShardRequest;
import com.staros.schedule.select.FirstNSelector;
import com.staros.schedule.select.Selector;
import com.staros.service.ServiceManager;
import com.staros.shard.Shard;
import com.staros.shard.ShardGroup;
import com.staros.shard.ShardManager;
import com.staros.shard.ShardPolicyFilter;
import com.staros.util.AbstractServer;
import com.staros.util.Config;
import com.staros.util.LockCloseable;
import com.staros.worker.Worker;
import com.staros.worker.WorkerGroup;
import com.staros.worker.WorkerManager;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import java.io.Closeable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentSkipListSet;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.FutureTask;
import java.util.concurrent.PriorityBlockingQueue;
import java.util.concurrent.RejectedExecutionException;
import java.util.concurrent.ScheduledThreadPoolExecutor;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.ReentrantLock;
import java.util.stream.Collectors;

/**
 * Designed to be a multi-service shared scheduler.
 * Assumptions:
 *    Uniqueness:
 *      * serviceId + shardId, globally unique
 *      * shardGroupId, globally unique
 *      * workerGroupId, globally unique
 *      * workerId, globally unique
 *  Two-phase scheduling:
 *   - phase 1: calculation
 *     * select workers as targets to add/remove the shard replica
 *   - phase 2: dispatch
 *     * send an RPC call to the worker to add/remove the shard replica, and update the shard info upon success.
 *   Some calls can skip phase 1 and go directly to phase 2, e.g. re-adding a replica back to a worker after the
 *   worker is restarted.
 *  TODO: handle the following cases
 *     1. handle shard migration
 *     2. handle workerGroup balancing (triggered by a worker coming online, or by the monitoring system)
 *     3. thread-safety refactor of the serviceManager/shardManager/workerManager components for multi-threaded scheduling
 */
public class ShardSchedulerV2 extends AbstractServer implements Scheduler {

    private static final Logger LOG = LogManager.getLogger(ShardSchedulerV2.class);
    private static final int PRIORITY_LOW = 0;
    private static final int PRIORITY_MEDIUM = 10;
    private static final int PRIORITY_HIGH = 20;

    private static final int shortNap = 100; // unit: us

    // locker to ensure conflicting requests are processed sequentially.
    private final ExclusiveLocker requestLocker = new ExclusiveLocker();
    // TODO: Configurable policy based Selector
    private final Selector scoreSelector = new FirstNSelector();

    private static final List<PlacementPolicy> conflictPolicies =
            Arrays.asList(PlacementPolicy.EXCLUDE, PlacementPolicy.PACK, PlacementPolicy.SPREAD);

    // Phase 1 executors, mainly focus on selecting target workers for the shard replica. CPU intensive operation.
    private ScheduledThreadPoolExecutor calculateExecutors;
    // Phase 2 executors, make RPC calls to worker to finalize the selection. Network IO intensive operation.
    private ThreadPoolExecutor dispatchExecutors;

    private final ServiceManager serviceManager;
    private final WorkerManager workerManager;

    /**
     * Cancellable close object. If cancel() is called before close(), the runnable will not be run.
     */
    private static class DeferOp implements Closeable {
        private final Runnable runnable;
        private boolean done;

        public DeferOp(Runnable runnable) {
            this.runnable = runnable;
            done = false;
        }

        public void cancel() {
            this.done = true;
        }

        @Override
        public void close() {
            if (!done) {
                done = true;
                runnable.run();
            }
        }
    }
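
    // Illustrative DeferOp usage, mirroring the pattern in the phase-1/phase-2 methods below: register a
    // cleanup runnable that fires automatically on close(), unless the happy path cancels it first.
    //
    //     try (DeferOp cleanup = new DeferOp(() -> releaseMarkers())) {
    //         doWork();
    //         cleanup.cancel(); // success: keep the markers, skip the cleanup
    //     } // on any failure path, close() runs releaseMarkers()
    //
    // releaseMarkers() and doWork() are hypothetical placeholders for this sketch.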

    /**
     * Helper class to manage the exclusive relationship between schedule requests.
     * 1. at any time, only one ScheduleRequestContext with the same tuple (serviceId, shardId, workerGroupId)
     *  can enter phase 2.
     * 2. at any time, only one shard of the same ShardGroup (whose placement policy is in conflictPolicies) can
     *  enter phase 2 execution.
     * NOTE:
     *  1. the 1st conflict doesn't necessarily imply the 2nd conflict. E.g. the shard doesn't have any shardGroup, or
     *   the shard doesn't belong to any shard group with a conflicting placement policy.
     *  2. the 2nd conflict doesn't necessarily imply the 1st conflict. E.g. the conflict arises because two different
     *   shards from the same shard group are requested to be scheduled at the same time.
     */
    private static class ExclusiveLocker {
        // Lock-free Set, no need to protect by lock
        protected Set<ScheduleRequestContext> exclusiveContexts = new ConcurrentSkipListSet<>();
        // protect exclusiveShardGroups
        protected final ReentrantLock exclusiveMapLock = new ReentrantLock();
        // workerGroup -> (shardGroup, serviceId + shardId), check conflicts of a workerGroup shard scheduling in the same shardGroup
        // TODO: periodically clean the Map which has empty sets due to auto scaling creating/destroying workerGroups
        protected final Map<Long, Set<Long>> exclusiveShardGroups = new HashMap<>();

        /**
         * Try to set exclusive markers for a schedule request with the given context.
         * @param ctx request context
         * @param manager shard manager
         * @return true - if the exclusive markers are all set
         *         false - fail to set the exclusive marker, conflicts detected.
         */
        public boolean tryLock(ScheduleRequestContext ctx, ShardManager manager) {
            // exclusiveContexts ensure the same request is executed sequentially.
            if (exclusiveContexts.add(ctx)) {
                // exclusiveShardGroups ensure only one request in progress in the specific shardGroup
                if (checkAndUpdateExclusiveShardGroups(ctx, manager)) {
                    // Set closeable callback
                    ctx.setRunnable(() -> tryUnlock(ctx));
                    return true;
                } else {
                    // rollback adding operation
                    exclusiveContexts.remove(ctx);
                }
            }
            return false;
        }

        private void tryUnlock(ScheduleRequestContext ctx) {
            // reverse order of tryLock()
            cleanExclusiveShardGroup(ctx);
            exclusiveContexts.remove(ctx);
        }

        /**
         * Clean exclusive shardGroup ids from exclusiveShardGroups
         * @param ctx ScheduleRequestContext to be processed
         */
        private void cleanExclusiveShardGroup(ScheduleRequestContext ctx) {
            // clean shardGroup exclusive marker.
            // can't use Shard.getShardGroupIds() from Shard since the shard's group list can be updated in between.
            Collection<Long> exclusiveGroups = ctx.getExclusiveGroupIds();
            if (exclusiveGroups != null && !exclusiveGroups.isEmpty()) {
                try (LockCloseable ignored = new LockCloseable(exclusiveMapLock)) {
                    Collection<Long> groupMarker = exclusiveShardGroups.get(ctx.getWorkerGroupId());
                    if (groupMarker != null) {
                        groupMarker.removeAll(exclusiveGroups);
                    }
                }
            }
        }

        /**
         * Check whether there is any conflict shard that belongs to the same shard group that is in processing.
         * @return true: check pass, no conflict. false: check fail, has conflict.
         */
        private boolean checkAndUpdateExclusiveShardGroups(ScheduleRequestContext ctx, ShardManager shardManager) {
            Shard shard = shardManager.getShard(ctx.getShardId());
            if (shard == null) {
                return true;
            }
            Set<Long> exclusiveIds = shard.getGroupIds().stream()
                    .map(shardManager::getShardGroup)
                    .filter(y -> y != null && conflictPolicies.contains(y.getPlacementPolicy()))
                    .map(ShardGroup::getGroupId)
                    .collect(Collectors.toSet());
            if (exclusiveIds.isEmpty()) {
                return true;
            }
            try (LockCloseable ignored = new LockCloseable(exclusiveMapLock)) {
                Set<Long> groupMarker = exclusiveShardGroups.get(ctx.getWorkerGroupId());
                if (groupMarker == null) {
                    groupMarker = new HashSet<>();
                    exclusiveShardGroups.put(ctx.getWorkerGroupId(), groupMarker);
                } else {
                    // check if there is any conflict shard group in scheduling
                    if (groupMarker.stream().anyMatch(exclusiveIds::contains)) {
                        // has conflict, need to do the schedule later.
                        LOG.info("Has conflict shardgroup running, retry later. {}", ctx);
                        return false;
                    }
                }
                // add all groupIds into exclusiveShardGroup to prevent shards in the same group from being scheduled.
                groupMarker.addAll(exclusiveIds);
                // save the exclusiveGroupIds and add into exclusiveShardGroups
                ctx.setExclusiveGroupIds(exclusiveIds);
            }
            return true;
        }
    }
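
    // Conflict example for ExclusiveLocker (illustrative ids): suppose shards 101 and 102 both belong to
    // shardGroup 7 with a PACK policy and are both being scheduled into workerGroup 1.
    //   - tryLock(ctx{shard=101, wg=1}) succeeds and records shardGroup 7 under workerGroup 1.
    //   - tryLock(ctx{shard=102, wg=1}) sees shardGroup 7 already marked and returns false, so phase 1
    //     throws ScheduleConflictException and the request is retried after a short nap.
    //   - when shard 101 finishes phase 2, its DeferOp runs tryUnlock(), clearing shardGroup 7 so that
    //     shard 102 can proceed.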

    /**
     * Abstraction of phase 2 tasks.
     * Wraps FutureTask with a Comparable implementation, so tasks can be prioritized.
     */
    private static class DispatchTask<T> extends FutureTask<T> implements Comparable<DispatchTask<T>> {
        private final int priority;
        private final String description;

        public DispatchTask(Runnable runnable, T result, int priority, String description) {
            super(runnable, result);
            this.priority = priority;
            this.description = description;
        }

        public DispatchTask(Callable<T> callable, int priority, String description) {
            super(callable);
            this.priority = priority;
            this.description = description;
        }

        @Override
        public int compareTo(DispatchTask<T> o) {
            // reverse Integer order, because PriorityQueue retrieves elements from least to greatest
            return Integer.compare(o.priority, this.priority);
        }

        @Override
        public String toString() {
            return description;
        }
    }
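
    // Ordering example (illustrative): with PRIORITY_HIGH = 20 and PRIORITY_LOW = 0, compareTo() makes the
    // HIGH task compare "less than" the LOW one, so the PriorityBlockingQueue backing dispatchExecutors
    // hands the HIGH task to a dispatch thread first.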

    private DispatchTask<StarException> dispatchTaskForAddToWorker(
            String serviceId, List<Long> shardIds, long workerId, int priority) {
        Callable<StarException> callable = () -> {
            try {
                executeAddToWorker(serviceId, shardIds, workerId);
                return null;
            } catch (StarException e) {
                return e;
            } catch (Exception e) {
                return new StarException(ExceptionCode.SCHEDULE, e.getMessage());
            }
        };
        String description = String.format("[AddToWorker Task] serviceId: %s, workerId: %s, priority: %d",
                serviceId, workerId, priority);
        return new DispatchTask<>(callable, priority, description);
    }

    private DispatchTask<Boolean> dispatchTaskForAddToGroup(
            ScheduleRequestContext ctx, int priority) {
        String description = String.format("[AddToGroup Task] %s, priority: %d", ctx, priority);
        return new DispatchTask<>(() -> executeAddToGroupPhase2(ctx), true, priority, description);
    }

    private DispatchTask<Boolean> dispatchTaskForRemoveFromGroup(ScheduleRequestContext ctx, int priority) {
        String description = String.format("[RemoveFromGroup Task] %s, priority: %d", ctx, priority);
        return new DispatchTask<>(() -> executeRemoveFromGroupPhase2(ctx), true, priority, description);
    }

    private DispatchTask<StarException> dispatchTaskForRemoveFromWorker(
            String serviceId, List<Long> shardIds, long workerId, int priority) {
        Callable<StarException> callable = () -> {
            try {
                executeRemoveFromWorker(serviceId, shardIds, workerId);
                return null;
            } catch (StarException e) {
                return e;
            } catch (Exception e) {
                return new StarException(ExceptionCode.SCHEDULE, e.getMessage());
            }
        };
        String description = String.format("[RemoveFromWorker Task] serviceId: %s, workerId: %s, priority: %d",
                serviceId, workerId, priority);
        return new DispatchTask<>(callable, priority, description);
    }

    public ShardSchedulerV2(ServiceManager serviceManager, WorkerManager workerManager) {
        this.serviceManager = serviceManager;
        this.workerManager = workerManager;
    }
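
    // Illustrative lifecycle sketch (assuming AbstractServer exposes start()/stop() entry points that delegate
    // to doStart()/doStop(); those wrappers are not shown in this file):
    //
    //     ShardSchedulerV2 scheduler = new ShardSchedulerV2(serviceManager, workerManager);
    //     scheduler.start();                                        // creates the two thread pools
    //     scheduler.scheduleAddToDefaultGroup(serviceId, shardIds); // blocking phase-1 + phase-2 scheduling
    //     scheduler.stop();                                         // drains and shuts down both pools
    //
    // serviceManager, workerManager, serviceId and shardIds are placeholders supplied by the caller.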

    /**
     * BLOCKING interface to schedule a single shard (shardId) in service (serviceId) in workerGroup (wgId).
     * @param serviceId service id
     * @param shardId shard id
     * @param wgId worker group id
     * @throws StarException exception if fails
     */
    @Override
    public void scheduleAddToGroup(String serviceId, long shardId, long wgId) throws StarException {
        scheduleAddToGroup(serviceId, Collections.nCopies(1, shardId), wgId);
    }

    /**
     * BLOCKING interface for scheduling a list of shards to a WorkerGroup. These shards do not necessarily
     * belong to the same shard group.
     * @param serviceId  shard serviceId
     * @param shardIds  shard id list to be scheduled
     * @param wgId  workerGroup id to be scheduled
     * @throws StarException starException with ExceptionCode
     */
    @Override
    public void scheduleAddToGroup(String serviceId, List<Long> shardIds, long wgId) throws StarException {
        CountDownLatch latch = new CountDownLatch(shardIds.size());
        List<ScheduleRequestContext> ctxs = new ArrayList<>();
        for (Long id : shardIds) {
            ScheduleRequestContext ctx = new ScheduleRequestContext(serviceId, id, wgId, latch);
            ctxs.add(ctx);
            submitCalcTaskInternal(() -> executeAddToGroupPhase1(ctx), 0);
        }
        try {
            latch.await();
        } catch (InterruptedException e) {
            throw new StarException(ExceptionCode.SCHEDULE, e.getMessage());
        }
        for (ScheduleRequestContext ctx : ctxs) {
            if (ctx.getException() != null) {
                // If any shard schedule fails with exception, propagate it to the caller
                throw ctx.getException();
            }
        }
    }

    /**
     * Async schedule a single shard (shardId) in service (serviceId) in workerGroup (wgId); don't wait for the result.
     * @param serviceId service identity
     * @param shardId shard identity
     * @param wgId worker group identity
     * @throws StarException exception if fails
     */
    @Override
    public void scheduleAsyncAddToGroup(String serviceId, long shardId, long wgId) throws StarException {
        ScheduleRequestContext ctx = new ScheduleRequestContext(serviceId, shardId, wgId, null);
        submitCalcTaskInternal(() -> executeAddToGroupPhase1(ctx), 0);
    }

    /**
     * Request to schedule a list of shards in service (serviceId), letting the scheduler choose the default worker group.
     *   Wait for the scheduling to finish.
     * @param serviceId service identity
     * @param shardIds list of shard identities
     * @throws StarException exception if fails
     */
    @Override
    public void scheduleAddToDefaultGroup(String serviceId, List<Long> shardIds) throws StarException {
        WorkerGroup group = workerManager.getDefaultWorkerGroup(serviceId);
        if (group == null) {
            throw new StarException(ExceptionCode.NOT_EXIST,
                    String.format("DefaultWorkerGroup not exist for service %s", serviceId));
        }
        // TODO: it is still possible that the worker group is deleted after this scheduleShards() called.
        scheduleAddToGroup(serviceId, shardIds, group.getGroupId());
    }

    /**
     * Remove redundant replicas for the shard, running:
     *   1. with low priority
     *   2. as a background task
     *   3. removing at most one replica each time
     * @param serviceId target service id shard belongs to
     * @param shardId shard id
     * @param workerGroupId target worker group id
     * @throws StarException throws star exception if the task can't be added to thread pool.
     */
    @Override
    public void scheduleAsyncRemoveFromGroup(String serviceId, long shardId, long workerGroupId)
            throws StarException {
        ScheduleRequestContext ctx = new ScheduleRequestContext(serviceId, shardId, workerGroupId, null);
        submitCalcTaskInternal(() -> executeRemoveFromGroupPhase1(ctx), 0);
    }

    /**
     * Add a shard to a group. Phase 1: choose workers from the worker group.
     */
    private void executeAddToGroupPhase1(ScheduleRequestContext ctx) {
        try {
            executeAddToGroupPhase1Detail(ctx);
        } catch (ScheduleConflictException e) {
            // A specific exception that schedule understands and retries the request
            // TODO: add RetryPolicy inside RequestCtx to achieve fine controlled retry behavior
            submitCalcTaskInternal(() -> executeAddToGroupPhase1(ctx), shortNap);
        } catch (StarException exception) {
            ctx.done(exception);
        } catch (Throwable throwable) {
            ctx.done(new StarException(ExceptionCode.SCHEDULE, throwable.getMessage()));
        }
    }

    /**
     * Submit a phase 1 task to the calculateExecutors thread pool, optionally with a delay.
     * @param run runnable object
     * @param delay delay before execution, usually used for retries
     * @exception StarException fail to submit the task
     */
    private void submitCalcTaskInternal(Runnable run, long delay) throws StarException {
        try {
            if (delay == 0) {
                calculateExecutors.execute(run);
            } else {
                calculateExecutors.schedule(run, delay, TimeUnit.MICROSECONDS);
            }
        } catch (RejectedExecutionException e) {
            if (!isRunning()) {
                throw new StarException(ExceptionCode.SCHEDULE, "Scheduling shutdown!");
            } else {
                throw new StarException(ExceptionCode.SCHEDULE, e.getMessage());
            }
        }
    }

    /**
     * Phase2: add a replica of a shard to a worker directly, without going through phase 1 worker selection.
     * @param serviceId service Id
     * @param shardId shard id to be added
     * @param workerId target worker Id to add the replica to
     * @throws StarException Schedule Error
     * TODO: refactor with Priority Queue
     */
    @Override
    public void scheduleAddToWorker(String serviceId, long shardId, long workerId) throws StarException {
        scheduleAddToWorker(serviceId, Collections.nCopies(1, shardId), workerId);
    }

    /**
     * Phase2: batch add list of shards to a worker
     * @param serviceId service Id
     * @param shardIds list of shard ids to be added
     * @param workerId target worker Id to be added
     * @throws StarException Schedule Error
     */
    @Override
    public void scheduleAddToWorker(String serviceId, List<Long> shardIds, long workerId) throws StarException {
        DispatchTask<StarException> task = dispatchTaskForAddToWorker(serviceId, shardIds, workerId, PRIORITY_HIGH);
        submitDispatchTask(task, true /* wait */);
    }

    /**
     * Phase2: Async interface to schedule a list of shards to the target worker, don't wait for the result.
     * @param serviceId service id
     * @param shardIds list of shard ids
     * @param workerId target worker id
     * @throws StarException schedule error
     */
    @Override
    public void scheduleAsyncAddToWorker(String serviceId, List<Long> shardIds, long workerId) throws StarException {
        DispatchTask<StarException> task = dispatchTaskForAddToWorker(serviceId, shardIds, workerId, PRIORITY_MEDIUM);
        submitDispatchTask(task, false /* wait */);
    }

    /**
     * Phase2: Remove a single shard replica from the target worker. The worker should belong to the service (serviceId)
     * @param serviceId  shard service Id
     * @param shardId  shard id to be removed
     * @param workerId  worker id
     */
    @Override
    public void scheduleRemoveFromWorker(String serviceId, long shardId, long workerId) throws StarException {
        scheduleRemoveFromWorker(serviceId, Collections.nCopies(1, shardId), workerId);
    }

    /**
     * Phase2: Remove list of shards from target worker, wait for the result back.
     * @param serviceId  shard service Id
     * @param shardIds  list of shard ids to be removed
     * @param workerId  target worker id
     * @throws StarException error out if the request can't be fulfilled. ExceptionCode will represent the error category.
     */
    @Override
    public void scheduleRemoveFromWorker(String serviceId, List<Long> shardIds, long workerId) throws StarException {
        DispatchTask<StarException> task = dispatchTaskForRemoveFromWorker(serviceId, shardIds, workerId, PRIORITY_MEDIUM);
        submitDispatchTask(task, true /* wait */);
    }

    /**
     * Phase2: Remove list of shards from target worker, don't wait for the result.
     * @param serviceId  shard service Id
     * @param shardIds  list of shard ids to be removed
     * @param workerId  target worker id
     */
    @Override
    public void scheduleAsyncRemoveFromWorker(String serviceId, List<Long> shardIds, long workerId) throws StarException {
        DispatchTask<StarException> task = dispatchTaskForRemoveFromWorker(serviceId, shardIds, workerId, PRIORITY_LOW);
        submitDispatchTask(task, false /* wait */);
    }

    private <T extends Exception> void submitDispatchTask(DispatchTask<T> task, boolean wait) throws StarException {
        try {
            LOG.info("Submit task to executor services. Task: {}, waitForResult: {}", task, wait);
            dispatchExecutors.execute(task);
        } catch (Exception e) {
            LOG.error("Fail to submit schedule task {}", task, e);
            throw new StarException(ExceptionCode.SCHEDULE, e.getMessage());
        }
        if (wait) {
            Exception exception = null;
            try {
                if (task.get() != null) {
                    exception = task.get();
                }
            } catch (Throwable e) {
                LOG.error("Fail to get task result. task: {}", task, e);
                throw new StarException(ExceptionCode.SCHEDULE, e.getMessage());
            }
            if (exception != null) {
                if (exception instanceof StarException) {
                    throw (StarException) exception;
                } else {
                    // convert to StarException
                    throw new StarException(ExceptionCode.SCHEDULE, exception.getMessage());
                }
            }
        }
    }

    /**
     * Process a single shard schedule request.
     * @param ctx Request context
     * @throws ScheduleConflictException if the schedule conflicts; the caller can retry the ctx.
     *         StarException for other error cases
     */
    private void executeAddToGroupPhase1Detail(ScheduleRequestContext ctx) {
        // TODO: to be multi-threads safe, shard/shardGroup/workerGroup accessing should be all thread-safe.
        ShardManager shardManager = serviceManager.getShardManager(ctx.getServiceId());
        if (shardManager == null) {
            LOG.info("Service {} is not available before request can be processed. Ignore the request!",
                    ctx.getServiceId());
            ctx.done(new StarException(ExceptionCode.NOT_EXIST, String.format("Service %s Not Exist", ctx.getServiceId())));
            return;
        }
        Shard shard = shardManager.getShard(ctx.getShardId());
        if (shard == null) {
            LOG.info("Shard {} is not available before request can be processed. Ignore the request!",
                    ctx.getShardId());
            ctx.done(new StarException(ExceptionCode.NOT_EXIST, String.format("Shard %d Not Exist", ctx.getShardId())));
            return;
        }

        // check replica numbers
        int replicaNum = shard.getExpectedReplicaCount();
        List<Long> existReplicas = shard.getReplicaWorkerIds().stream()
                .map(workerManager::getWorker)
                .filter(y -> y != null && y.isAlive() && y.getGroupId() == ctx.getWorkerGroupId())
                .map(Worker::getWorkerId)
                .collect(Collectors.toList());
        if (existReplicas.size() >= replicaNum) {
            // All good, nothing to do. Let ShardChecker take care of the removal.
            ctx.done();
            return;
        }

        int desired = replicaNum - existReplicas.size();
        int priority = PRIORITY_MEDIUM;
        if (ctx.isWaited()) {
            // someone is waiting for the request result
            priority = PRIORITY_HIGH;
        } else {
            if (desired <= replicaNum / 2) {
                // already has at least half of the replicas, set its priority to LOW
                priority = PRIORITY_LOW;
            }
        }

        // get WorkerList by groupId
        WorkerGroup wg = workerManager.getWorkerGroup(shard.getServiceId(), ctx.getWorkerGroupId());
        if (wg == null) {
            LOG.warn("WorkerGroup {} is not available before request can be processed. Ignore the request! {}",
                    ctx.getWorkerGroupId(), ctx);
            ctx.done(new StarException(ExceptionCode.NOT_EXIST,
                    String.format("WorkerGroup %d doesn't exist!", ctx.getWorkerGroupId())));
            return;
        }
        // get all alive worker Ids in the workerGroup
        Set<Long> wIds = wg.getWorkerList().stream()
                .filter(Worker::isAlive)
                .map(Worker::getWorkerId)
                .collect(Collectors.toSet());
        // exclude workers that already have a copy of the shard
        existReplicas.forEach(wIds::remove);

        // IMPORTANT: add the exclusiveShardGroup marker so that only one shard
        // from a shardGroup can be scheduled at a time. This must be done before
        // pulling any shard replicas that co-exist in the same shard group as the one being processed.
        if (!requestLocker.tryLock(ctx, shardManager)) {
            ctx.reset();
            throw new ScheduleConflictException();
        }

        try (DeferOp cleanOp = new DeferOp(ctx.getRunnable())) { // enters critical area
            // get all of the shard's 1st-degree shards, categorized by shard group placement policy
            Map<PlacementPolicy, List<Long>> ppMap = new HashMap<>();
            for (Long gid : shard.getGroupIds()) {
                ShardGroup g = shardManager.getShardGroup(gid);
                PlacementPolicy policy = g.getPlacementPolicy();
                List<Long> workerIds = new ArrayList<>();
                for (Long id : g.getShardIds()) {
                    if (id == shard.getShardId()) {
                        continue;
                    }
                    Shard firstDegreeShard = shardManager.getShard(id);
                    if (firstDegreeShard == null) {
                        continue;
                    }
                    workerIds.addAll(firstDegreeShard.getReplicaWorkerIds()
                            .stream()
                            .filter(wIds::contains)
                            .collect(Collectors.toList()));
                }
                if (workerIds.isEmpty()) {
                    continue;
                }
                if (ppMap.containsKey(policy)) {
                    ppMap.get(policy).addAll(workerIds);
                } else {
                    ppMap.put(policy, workerIds);
                }
                LOG.debug("Add policy {} workers: {} for request ctx: {}", policy, workerIds, ctx);
            }

            ShardPolicyFilter.filter(ppMap, wIds);

            // Final: Select valid workerIds
            if (wIds.isEmpty()) {
                LOG.warn("Can't find any worker for schedule request: {}", ctx);
                ctx.done(new StarException(ExceptionCode.SCHEDULE,
                        String.format("Can't find worker for request: %s", ctx)));
                return;
            }

            List<Long> workerIds;
            if (wIds.size() < desired) {
                LOG.warn("Schedule requests {} workers, but only {} available. {}", desired, wIds.size(), ctx);
                workerIds = new ArrayList<>(wIds);
            } else {
                ScheduleScorer scorer = new ScheduleScorer(wIds);
                // apply shardgroup policy scoring
                ppMap.forEach(scorer::apply);
                // apply worker status scoring
                scorer.apply(workerManager);
                LOG.debug("final scores for selection: {}, for request {}", scorer.getScores(), ctx);
                // Select the worker with the HIGHEST score
                workerIds = scorer.selectHighEnd(scoreSelector, desired);
            }
            ctx.setWorkerIds(workerIds);

            // submit to next thread pool to do the execution.
            LOG.info("Schedule request {}, pending schedule to workerList: {}", ctx, ctx.getWorkerIds());
            DispatchTask<Boolean> task = dispatchTaskForAddToGroup(ctx, priority);
            try {
                LOG.info("submit dispatch task: {}", task);
                dispatchExecutors.execute(task);
                // submit success, don't clean the exclusive group marker
                cleanOp.cancel();
            } catch (Throwable e) {
                LOG.error("Fail to add task {} into dispatchWorkerExecutors", task, e);
                // error out
                ctx.done(new StarException(ExceptionCode.SCHEDULE, e.getMessage()));
            }
        }
    }
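
    // Worked example for the phase-1 selection above (illustrative numbers): a shard expects 3 replicas and
    // currently has 1 alive replica in the workerGroup, so desired = 2. Every alive worker in the group, minus
    // the one already holding a replica, is a candidate; replicas of sibling shards in EXCLUDE/PACK/SPREAD
    // groups prune candidates via ShardPolicyFilter and adjust scores via ScheduleScorer, and the 2
    // highest-scoring workers are handed to a phase-2 DispatchTask.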

    /**
     * Phase2: Process tasks submitted by phase1, talk to workers to assign shards.
     *  NOTE: the ctx will not be removed from pendingFinishRequests until its execution is done.
     * @param ctx ScheduleRequestContext to be processed
     */
    private void executeAddToGroupPhase2(ScheduleRequestContext ctx) {
        try (DeferOp ignored = new DeferOp(ctx.getRunnable())) {
            if (isRunning()) {
                executeAddToWorker(ctx);
            } else {
                ctx.done(new StarException(ExceptionCode.SCHEDULE, "Schedule shutdown in progress"));
            }
        }
    }

    private void executeAddToWorker(ScheduleRequestContext ctx) {
        StarException exception = null;
        for (long workerId : ctx.getWorkerIds()) {
            try {
                executeAddToWorker(ctx.getServiceId(), Collections.nCopies(1, ctx.getShardId()), workerId);
            } catch (StarException e) {
                exception = e;
            } catch (Exception e) {
                exception = new StarException(ExceptionCode.SCHEDULE, e.getMessage());
            }
        }
        ctx.done(exception);
    }

    /**
     * Do execution of adding shards to worker
     * @param serviceId service Id to be operated
     * @param shardIds  list of shards to be added
     * @param workerId  target worker id
     */
    private void executeAddToWorker(String serviceId, List<Long> shardIds, long workerId) {
        ShardManager shardManager = serviceManager.getShardManager(serviceId);
        if (shardManager == null) {
            LOG.warn("service {} not exist for schedule request.", serviceId);
            throw new StarException(ExceptionCode.NOT_EXIST, String.format("service %s not exists", serviceId));
        }

        Worker worker = workerManager.getWorker(workerId);
        if (worker == null) {
            LOG.warn("worker {} not exist when execute add shard task for service {}.", workerId, serviceId);
            throw new StarException(ExceptionCode.NOT_EXIST, String.format("worker %d not exists", workerId));
        }

        if (!worker.getServiceId().equals(serviceId)) {
            throw new StarException(ExceptionCode.INVALID_ARGUMENT,
                    String.format("worker %d doesn't belong to service: %s", workerId, serviceId));
        }

        List<AddShardInfo> info = new ArrayList<>(shardIds.size());
        for (Long id : shardIds) {
            Shard shard = shardManager.getShard(id);
            if (shard == null) {
                LOG.warn("shard {} not exist when execute add shard task for service {}.", id, serviceId);
                if (shardIds.size() == 1) {
                    throw new StarException(ExceptionCode.NOT_EXIST, String.format("shard %d not exists", id));
                }
            } else {
                info.add(shard.getAddShardInfo());
            }
        }

        AddShardRequest request = AddShardRequest.newBuilder()
                .setServiceId(serviceId)
                .setWorkerId(workerId)
                .addAllShardInfo(info)
                .build();

        if (worker.addShard(request)) {
            shardManager.addShardReplicas(shardIds, workerId);
        } else {
            throw new StarException(ExceptionCode.SCHEDULE,
                    String.format("Schedule add shard task execution failed serviceId: %s, workerId: %d, shardIds: %s",
                            serviceId, workerId, shardIds));
        }
    }

    /**
     * Do execution of removing shards
     * @param serviceId service Id to be operated
     * @param shardIds  list of shards to be removed
     * @param workerId  target worker id
     */
    private void executeRemoveFromWorker(String serviceId, List<Long> shardIds, long workerId) {
        ShardManager shardManager = serviceManager.getShardManager(serviceId);
        if (shardManager == null) {
            LOG.warn("service {} not exist when execute remove-shard.", serviceId);
            throw new StarException(ExceptionCode.NOT_EXIST, String.format("service %s not exist!", serviceId));
        }

        boolean doClean = true;
        Worker worker = workerManager.getWorker(workerId);
        if (worker == null) {
            LOG.warn("worker {} not exist when execute remove shards for service {}!", workerId, serviceId);
        } else if (!worker.isAlive()) {
            LOG.warn("worker {} dead when execute remove shards for service {}.", workerId, serviceId);
        } else if (!worker.getServiceId().equals(serviceId)) {
            LOG.warn("worker {} doesn't belong to service {}", workerId, serviceId);
        } else {
            RemoveShardRequest request = RemoveShardRequest.newBuilder()
                    .setServiceId(serviceId)
                    .setWorkerId(workerId)
                    .addAllShardIds(shardIds).build();
            doClean = worker.removeShard(request);
        }
        if (doClean) {
            shardManager.removeShardReplicas(shardIds, workerId);
        }
    }

    /**
     * Try to select a replica of the given shard to remove from the given worker group.
     * @param ctx request context
     */
    private void executeRemoveFromGroupPhase1(ScheduleRequestContext ctx) {
        if (!isRunning()) {
            ctx.done(new StarException(ExceptionCode.SCHEDULE, "Schedule in shutdown progress"));
            return;
        }
        try {
            executeRemoveFromGroupPhase1Detail(ctx);
        } catch (ScheduleConflictException exception) {
            // TODO: add RetryPolicy inside RequestCtx to achieve fine controlled retry behavior
            submitCalcTaskInternal(() -> executeRemoveFromGroupPhase1(ctx), shortNap);
        } catch (StarException exception) {
            ctx.done(exception);
        } catch (Throwable throwable) {
            ctx.done(new StarException(ExceptionCode.SCHEDULE, throwable.getMessage()));
        }
    }

    /**
     * execute the task of selecting a replica of the shard from the worker group to remove.
     *
     * @param ctx Request context
     * @throws ScheduleConflictException - if the schedule conflicts; the caller can retry the ctx.
     *         StarException - other error cases
     */
    private void executeRemoveFromGroupPhase1Detail(ScheduleRequestContext ctx) {
        ShardManager shardManager = serviceManager.getShardManager(ctx.getServiceId());
        if (shardManager == null) {
            LOG.info("Service {} is not available before request can be processed. Ignore the request!",
                    ctx.getServiceId());
            ctx.done(new StarException(ExceptionCode.NOT_EXIST, String.format("Service %s Not Exist", ctx.getServiceId())));
            return;
        }

        Shard shard = shardManager.getShard(ctx.getShardId());
        if (shard == null) {
            LOG.info("Shard {} is not available before request can be processed. Ignore the request!",
                    ctx.getShardId());
            ctx.done(new StarException(ExceptionCode.NOT_EXIST, String.format("Shard %d Not Exist", ctx.getShardId())));
            return;
        }

        // check replica numbers
        int replicaNum = shard.getExpectedReplicaCount();
        List<Long> healthyWorkerIds = new ArrayList<>();
        List<Worker> deadWorkers = new ArrayList<>();
        shard.getReplicaWorkerIds().forEach(id -> {
            Worker worker = workerManager.getWorker(id);
            if (worker == null) {
                // Ask schedule to remove the replica directly.
                scheduleAsyncRemoveFromWorker(
                        ctx.getServiceId(), Collections.nCopies(1, ctx.getShardId()), id);
            } else if (worker.getGroupId() == ctx.getWorkerGroupId()) {
                // Our interested worker group
                if (!worker.isAlive()) {
                    deadWorkers.add(worker);
                } else {
                    healthyWorkerIds.add(worker.getWorkerId());
                }
            }
        });

        if (healthyWorkerIds.size() + deadWorkers.size() <= replicaNum) {
            // Don't do anything, because right now the total number of replicas is not more than expected. Hopefully the
            // dead replicas will come back some time in the future. Fingers crossed.
            LOG.debug("{}, Number of replicas (include dead ones) are less than expected replica. Skip it.", ctx);
            ctx.done();
            return;
        }

        List<Long> workerIds;
        if (!requestLocker.tryLock(ctx, shardManager)) {
            throw new ScheduleConflictException();
        }
        // tryLock registers a runnable via ctx.setRunnable() to clean the exclusive markers
        try (DeferOp cleanOp = new DeferOp(ctx.getRunnable())) {
            if (!deadWorkers.isEmpty()) { // easy case
                workerIds = Arrays.asList(selectOldestWorkerLastSeen(deadWorkers).getWorkerId());
            } else {
                // scoring all the replicas
                ScheduleScorer scorer = new ScheduleScorer(healthyWorkerIds);
                for (long groupId : shard.getGroupIds()) {
                    ShardGroup group = shardManager.getShardGroup(groupId);
                    if (group == null) {
                        continue;
                    }

                    List<Long> allReplicaWorkerIds = new ArrayList<>();
                    for (long sid : group.getShardIds()) {
                        if (sid == shard.getShardId()) {
                            continue;
                        }
                        Shard firstDegreeShard = shardManager.getShard(sid);
                        if (firstDegreeShard == null) {
                            continue;
                        }
                        allReplicaWorkerIds.addAll(firstDegreeShard.getReplicaWorkerIds());
                    }
                    scorer.apply(group.getPlacementPolicy(), allReplicaWorkerIds);
                }
                scorer.apply(workerManager);
                LOG.debug("final scores for selection: {}, for request {}", scorer.getScores(), ctx);
                // select replica with the LOWEST score to remove
                workerIds = scorer.selectLowEnd(scoreSelector, 1);
                LOG.debug("Final selection for remove-healthy shard, request:{} selection: {}", ctx, workerIds);
            }

            Preconditions.checkState(workerIds.size() == 1L, "Should only have one replica to remove!");

            ctx.setWorkerIds(workerIds);
            LOG.info("Schedule request {}, pending schedule to workerList: {}", ctx, ctx.getWorkerIds());
            DispatchTask<Boolean> task = dispatchTaskForRemoveFromGroup(ctx, ctx.isWaited() ? PRIORITY_MEDIUM : PRIORITY_LOW);
            try {
                dispatchExecutors.execute(task);
                // submit success, don't clean the exclusive group marker
                cleanOp.cancel();
            } catch (Throwable e) {
                LOG.error("Fail to add task {} into dispatchWorkerExecutors", task, e);
                // error out
                ctx.done(new StarException(ExceptionCode.SCHEDULE, e.getMessage()));
            }
        }
    }
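
    // Worked example for the removal selection above (illustrative): a shard expects 3 replicas but has 4 in
    // the workerGroup. If any replica sits on a dead worker, the dead worker with the oldest lastSeenTime is
    // chosen; otherwise the healthy replica with the LOWEST score is removed via a phase-2 DispatchTask.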

    /**
     * Select the worker whose lastSeenTime is oldest.
     * @param workers workers to choose
     * @return the worker with smallest lastSeenTime
     */
    private Worker selectOldestWorkerLastSeen(List<Worker> workers) {
        Worker targetWorker = workers.get(0);
        for (Worker worker : workers) {
            if (worker.getLastSeenTime() < targetWorker.getLastSeenTime()) {
                targetWorker = worker;
            }
        }
        return targetWorker;
    }

    private void executeRemoveFromGroupPhase2(ScheduleRequestContext ctx) {
        try (DeferOp ignored = new DeferOp(ctx.getRunnable())) {
            if (isRunning()) {
                for (long workerId : ctx.getWorkerIds()) {
                    executeRemoveFromWorker(ctx.getServiceId(), Collections.nCopies(1, ctx.getShardId()), workerId);
                }
            } else {
                ctx.done(new StarException(ExceptionCode.SCHEDULE, "Schedule shutdown in progress"));
            }
        }
    }

    @Override
    public boolean isIdle() {
        if (!isRunning()) {
            return true;
        }
        return calculateExecutors.getCompletedTaskCount() == calculateExecutors.getTaskCount() &&
                dispatchExecutors.getCompletedTaskCount() == dispatchExecutors.getTaskCount();
    }

    @Override
    public void doStart() {
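        // Phase 1 pool: CPU-bound worker selection plus delayed retries (hence a ScheduledThreadPoolExecutor).
        // Phase 2 pool: IO-bound RPC dispatch; the PriorityBlockingQueue orders queued DispatchTasks by priority.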
        calculateExecutors = new ScheduledThreadPoolExecutor(Config.SCHEDULER_CORE_THREAD_POOL_SIZE);
        dispatchExecutors = new ThreadPoolExecutor(Config.SCHEDULER_DISPATCH_WORKER_THREAD_POOL_SIZE,
                Config.SCHEDULER_DISPATCH_WORKER_THREAD_POOL_SIZE,
                0L,
                TimeUnit.MILLISECONDS,
                new PriorityBlockingQueue<>());
    }

    @Override
    public void doStop() {
        shutdownAndAwaitTermination(calculateExecutors);
        shutdownAndAwaitTermination(dispatchExecutors);
    }

    private void shutdownAndAwaitTermination(ExecutorService pool) {
        pool.shutdown(); // Disable new tasks from being submitted
        try {
            // Wait a while for existing tasks to terminate
            if (!pool.awaitTermination(10, TimeUnit.SECONDS)) {
                pool.shutdownNow(); // Cancel currently executing tasks
                // Wait a while for tasks to respond to being cancelled
                if (!pool.awaitTermination(10, TimeUnit.SECONDS)) {
                    LOG.error("Pool did not terminate");
                }
            }
        } catch (InterruptedException ie) {
            // (Re-)Cancel if current thread also interrupted
            pool.shutdownNow();
            // Preserve interrupt status
            Thread.currentThread().interrupt();
        }
    }
}