All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.alibaba.schedulerx.worker.master.MapTaskMaster Maven / Gradle / Ivy

There is a newer version: 1.12.2
Show newest version
package com.alibaba.schedulerx.worker.master;

import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.Socket;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Random;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.math.NumberUtils;
import org.springframework.util.CollectionUtils;

import com.alibaba.schedulerx.common.domain.InstanceStatus;
import com.alibaba.schedulerx.common.domain.JobInstanceInfo;
import com.alibaba.schedulerx.common.domain.MapTaskProgress;
import com.alibaba.schedulerx.common.domain.MapTaskXAttrs;
import com.alibaba.schedulerx.common.domain.Metrics;
import com.alibaba.schedulerx.common.domain.TaskDispatchMode;
import com.alibaba.schedulerx.common.domain.TaskProgressCounter;
import com.alibaba.schedulerx.common.domain.TaskStatus;
import com.alibaba.schedulerx.common.domain.TimeType;
import com.alibaba.schedulerx.common.domain.WorkerProgressCounter;
import com.alibaba.schedulerx.common.domain.enums.RouteStrategyEnum;
import com.alibaba.schedulerx.common.monitor.MetricsCollector;
import com.alibaba.schedulerx.common.util.ConfigUtil;
import com.alibaba.schedulerx.common.util.ExceptionUtil;
import com.alibaba.schedulerx.common.util.HessianUtil;
import com.alibaba.schedulerx.common.util.IdUtil;
import com.alibaba.schedulerx.common.util.IdUtil.IdType;
import com.alibaba.schedulerx.common.util.JobUtil;
import com.alibaba.schedulerx.common.util.JsonUtil;
import com.alibaba.schedulerx.protocol.Worker;
import com.alibaba.schedulerx.protocol.Worker.ContainerReportTaskStatusRequest;
import com.alibaba.schedulerx.protocol.Worker.MasterBatchStartContainersRequest;
import com.alibaba.schedulerx.protocol.Worker.MasterBatchStartContainersResponse;
import com.alibaba.schedulerx.protocol.Worker.MasterCheckWorkerAliveRequest;
import com.alibaba.schedulerx.protocol.Worker.MasterCheckWorkerAliveResponse;
import com.alibaba.schedulerx.protocol.Worker.MasterDestroyContainerPoolRequest;
import com.alibaba.schedulerx.protocol.Worker.MasterKillContainerRequest;
import com.alibaba.schedulerx.protocol.Worker.MasterNotifyWorkerPullRequest;
import com.alibaba.schedulerx.protocol.Worker.MasterNotifyWorkerPullResponse;
import com.alibaba.schedulerx.protocol.Worker.MasterStartContainerRequest;
import com.alibaba.schedulerx.protocol.Worker.WorkerReportJobInstanceProgressRequest;
import com.alibaba.schedulerx.protocol.utils.FutureUtils;
import com.alibaba.schedulerx.worker.SchedulerxWorker;
import com.alibaba.schedulerx.worker.actor.FutureExecutorPool;
import com.alibaba.schedulerx.worker.batch.ReqQueue;
import com.alibaba.schedulerx.worker.batch.TMStatusReqHandler;
import com.alibaba.schedulerx.worker.batch.TaskDispatchReqHandler;
import com.alibaba.schedulerx.worker.domain.JavaProcessorProfile;
import com.alibaba.schedulerx.worker.domain.JobContext;
import com.alibaba.schedulerx.worker.domain.TaskInfo;
import com.alibaba.schedulerx.worker.domain.WorkerConstants;
import com.alibaba.schedulerx.worker.log.LogFactory;
import com.alibaba.schedulerx.worker.log.Logger;
import com.alibaba.schedulerx.worker.logcollector.ClientLoggerMessage;
import com.alibaba.schedulerx.worker.logcollector.LogCollector;
import com.alibaba.schedulerx.worker.logcollector.LogCollectorFactory;
import com.alibaba.schedulerx.worker.master.persistence.TaskPersistence;
import com.alibaba.schedulerx.worker.metrics.WorkerLoadRegister;
import com.alibaba.schedulerx.worker.processor.JobProcessor;
import com.alibaba.schedulerx.worker.processor.JobProcessorEx;
import com.alibaba.schedulerx.worker.processor.MapJobProcessor;
import com.alibaba.schedulerx.worker.processor.MapReduceJobProcessor;
import com.alibaba.schedulerx.worker.processor.ProcessResult;
import com.alibaba.schedulerx.worker.route.Router;
import com.alibaba.schedulerx.worker.route.RouterFactory;
import com.alibaba.schedulerx.worker.util.ActorPathUtil;
import com.alibaba.schedulerx.worker.util.JobProcessorUtil;
import com.alibaba.schedulerx.worker.util.WorkerConfigUtil;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.protobuf.ByteString;

import akka.actor.ActorContext;
import akka.actor.ActorSelection;
import akka.dispatch.OnFailure;
import akka.dispatch.OnSuccess;
import akka.pattern.Patterns;
import akka.util.Timeout;
import scala.concurrent.ExecutionContext;
import scala.concurrent.Future;
import scala.concurrent.duration.Duration;

/**
 * @author xiaomeng.hxm
 */
public abstract class MapTaskMaster extends TaskMaster {

    private static final Logger LOGGER = LogFactory.getLogger(MapTaskMaster.class);
    private volatile int index = 0;
    protected volatile int pageSize = ConfigUtil.getWorkerConfig().getInt(WorkerConstants.MAP_MASTER_PAGE_SIZE,
        WorkerConstants.MAP_MASTER_PAGE_SIZE_DEFAULT);
    protected volatile int queueSize = ConfigUtil.getWorkerConfig().getInt(WorkerConstants.MAP_MASTER_QUEUE_SIZE,
        WorkerConstants.MAP_MASTER_QUEUE_SIZE_DEFAULT);
    private volatile int dispatcherSize = ConfigUtil.getWorkerConfig().getInt(WorkerConstants.MAP_MASTER_DISPATCHER_SIZE,
        WorkerConstants.MAP_MASTER_DISPATCHER_SIZE_DEFAULT);

    //task批量汇报队列
    protected ReqQueue taskStatusReqQueue;
    protected TMStatusReqHandler taskStatusReqBatchHandler;

    //子任务内存缓存队列,推模型通过TaskDispatchReqHandler进行拉取主动推送,拉模型通过PullThread进行拉取。
    protected ReqQueue taskBlockingQueue;

    protected TaskDispatchReqHandler taskDispatchReqHandler;

    // 跟任务失败单独处理
    private volatile String rootTaskResult;

    protected TaskPersistence taskPersistence;
    /**
     * taskName -> TaskProgressCounter(total, init, pulled, running, success, failed)
     */
    protected Map taskProgressMap = Maps.newConcurrentMap();
    /**
     * workerAddr -> WorkerProgressCounter(total, running, success, failed)
     */
    protected Map workerProgressMap = Maps.newConcurrentMap();

    private Map taskResultMap = Maps.newHashMap();

    private Map taskStatusMap = Maps.newHashMap();

    protected MapTaskXAttrs xAttrs = null;

    protected volatile AtomicInteger taskCounter = new AtomicInteger(0);
    
    protected ExecutionContext futureExecutor;

    private LogCollector logCollector = LogCollectorFactory.get();
    
    protected volatile boolean startStatusCheck = true;

    protected volatile boolean needReduce;

    protected Router router;

    public MapTaskMaster(JobInstanceInfo jobInstanceInfo, ActorContext actorContext) throws Exception {
        super(jobInstanceInfo, actorContext);
        this.futureExecutor = FutureExecutorPool.INSTANCE.get("MapTaskMaster");
        // 开启子任务分发随机打散
        boolean random = ConfigUtil.getWorkerConfig().getBoolean(WorkerConstants.MAP_MASTER_DISPATCH_RANDOM,
                false);
        List allWorkers = jobInstanceInfo.getAllWorkers();
        if (CollectionUtils.isEmpty(allWorkers)) {
            throw new IllegalArgumentException("workers can't be empty! ");
        }
        if (random && !CollectionUtils.isEmpty(allWorkers)) {
            index = new Random().nextInt(allWorkers.size());
        }
    }

    @Override
    protected void init() {
    	startStatusCheck = true;
    	
        if (INITED){
            return;
        }
        super.init();

        if(isWorkerLoadRouter()) {
            // 全新基于worker负载的路由模式
            router = RouterFactory.getRouter(jobInstanceInfo.getAppGroupId(), jobInstanceInfo.getJobId(),
                    RouteStrategyEnum.WORKER_LOAD.getValue(), jobInstanceInfo.getRouteStrategyContent());
            // 清理路由配置
            if (this.router != null && this.router instanceof WorkerLoadRegister) {
                ((WorkerLoadRegister) this.router).clear();
            }
        }

        final String jobIdAndInstanceId = jobInstanceInfo.getJobId() + "_" + jobInstanceInfo.getJobInstanceId();
        LOGGER.info("jobInstanceId={}, map master config, pageSize:{}, queueSize:{}, dispatcherSize:{}, workerSize:{}",
            jobIdAndInstanceId, pageSize, queueSize, dispatcherSize, jobInstanceInfo.getAllWorkers().size());

        // pull thread
        new Thread(new Runnable() {
            @Override
            public void run() {
                while (!isFinished()) {
                    try {
                        List taskInfos;
                        long startTime = System.currentTimeMillis();
                        taskInfos = taskPersistence.pull(jobInstanceInfo.getJobInstanceId(), pageSize);
                        LOGGER.debug("jobInstanceId={}, pull cost={}ms", jobInstanceInfo.getJobInstanceId(),
                            (System.currentTimeMillis() - startTime));
                        if (taskInfos.isEmpty()) {
                            LOGGER.debug("pull task empty of jobInstanceId={}, sleep 10000 ms ...",
                                jobInstanceInfo.getJobInstanceId());
                            Thread.sleep(10 * 1000);
                        } else {
                            LOGGER.info("jobInstanceId={}, failover retry dispatch taskList, size:{} , cost={}ms",
                                    jobInstanceInfo.getJobInstanceId(), taskInfos.size(), System.currentTimeMillis() - startTime);
                            for (TaskInfo taskInfo : taskInfos) {
                                ByteString taskBody = null;
                                if (taskInfo.getTaskBody() != null) {
                                    taskBody = ByteString.copyFrom(taskInfo.getTaskBody());
                                }
                                MasterStartContainerRequest.Builder builder = convert2StartContainerRequestBuilder(jobInstanceInfo, 
                                        taskInfo.getTaskId(), taskInfo.getTaskName(), taskBody, true);
                                taskBlockingQueue.submitRequest(builder.build());
                            }
                        }
                    } catch (TimeoutException te) {
                        LOGGER.error("pull task timeout, uniqueId:{}", jobIdAndInstanceId, te);
                        logCollector.collect(jobInstanceInfo.getAppGroupId(), jobIdAndInstanceId, ClientLoggerMessage.MAP_INSTANCE_PULL_JOB_FAIL, 
                            te, jobInstanceInfo.getGroupId());
                        try {
                            Thread.sleep(10 * 1000);
                        } catch (InterruptedException e) {
                            //do nothing
                        }
                    } catch (Throwable e) {
                        updateNewInstanceStatus(getSerialNum(), InstanceStatus.FAILED, ExceptionUtil.getMessage(e));
                        logCollector.collect(jobInstanceInfo.getAppGroupId(), jobIdAndInstanceId, ClientLoggerMessage.MAP_INSTANCE_PULL_JOB_FAIL, 
                            e, jobInstanceInfo.getGroupId());
                        LOGGER.error("pull task error, uniqueId:{}", jobIdAndInstanceId, e);
                    }
                }
            }
        }, "Schedulerx-MapTaskMaster-pull-thread-" + jobIdAndInstanceId).start();

        // status check thread
        new Thread(new Runnable() {
            @Override
            public void run() {
                int checkInterval = ConfigUtil.getWorkerConfig().getInt(WorkerConstants.Map_MASTER_STATUS_CHECK_INTERVAL,
                        WorkerConstants.Map_MASTER_STATUS_CHECK_INTERVAL_DEFAULT);
                while (!isFinished()) {
                    try {
                        Thread.sleep(checkInterval);
                        //秒级别任务,每次跑完会停止任务状态检查,否则会触发多次迭代
                        if (!isStartStatusCheck()) {
                        	continue;
                        }
                        InstanceStatus newStatus = taskPersistence.checkInstanceStatus(jobInstanceInfo.getJobInstanceId());
                        if (newStatus.isFinish() && taskDispatchReqHandler.isActive()) {
                            // avoid wrong early finish instance in condition root task was success but sub tasks are still creating.
                            Thread.sleep(checkInterval);
                            continue;
                        }
                        String result = getRootTaskResult();
                        if (newStatus.equals(InstanceStatus.SUCCESS)) {
                            // if return finish status, we need check counter;
                            int failCnt = 0;
                            int successCnt = 0;
                            int totalCnt = 0;
                            for (TaskProgressCounter taskProgressCounter : taskProgressMap.values()) {
                                failCnt += taskProgressCounter.getFailed();
                                successCnt += taskProgressCounter.getSuccess();
                                totalCnt += taskProgressCounter.getTotal();
                            }
                            if (successCnt + failCnt < totalCnt) {
                                newStatus = InstanceStatus.FAILED;
                                LOGGER.warn("jobInstanceId={} turn into finish status,"
                                    + " but count isn't correct, successCnt:{}, failCnt:{}, totalCnt:{}",
                                    jobInstanceInfo.getJobInstanceId(), successCnt, failCnt, totalCnt);
                                result = "Turn into finish status, but count is wrong, sucCnt:" + successCnt + ", failCnt:" + failCnt +
                                    ", totalCnt:" + totalCnt + "; Basically, the reason is that some workers are shutdown.";
                            } else {
                                newStatus = failCnt > 0 ? InstanceStatus.FAILED : InstanceStatus.SUCCESS;
                            }
                        }
                        updateNewInstanceStatus(getSerialNum(), jobInstanceInfo.getJobInstanceId(), newStatus, result);
                    } catch (Throwable e) {
                        LOGGER.error("status check error, uniqueId:{}", jobIdAndInstanceId, e);
                    }
                }
            }
        }, "Schedulerx-MapTaskMaster-status-check-thread-" + jobIdAndInstanceId).start();

        // job instance progress report thread
        if(!JobUtil.isSecondTypeJob(TimeType.parseValue(jobInstanceInfo.getTimeType()))) {
            new Thread(new Runnable() {
                @Override
                public void run() {
                    while (!isFinished()) {
                        WorkerReportJobInstanceProgressRequest request = WorkerReportJobInstanceProgressRequest.newBuilder().setJobId(jobInstanceInfo.getJobId()).setJobInstanceId(
                            jobInstanceInfo.getJobInstanceId()).setProgress(getJobInstanceProgress()).build();
                        SERVER_DISCOVERY.getMapMasterRouter().tell(request, null);
                        try {
                            Thread.sleep(5000);
                        } catch (InterruptedException e) {
                            LOGGER.error("report status error, uniqueId={}", jobIdAndInstanceId, e);
                            break;
                        }
                    }
                }
            }, "Schedulerx-MapTaskMaster-report-progress-thread-" + jobIdAndInstanceId).start();
        }

        //worker alive check thread
        new Thread(new Runnable() {
            @Override
            public void run() {
                while (!isFinished()) {
//                    aliveCheckWorkerSet.addAll(jobInstanceInfo.getAllWorkers());
//                    if (aliveCheckWorkerSet.isEmpty()) {
//                        LOGGER.warn("worker list is empty, jobInstanceId={}", jobInstanceInfo.getJobInstanceId());
//                        taskPersistence.batchUpdateTaskStatus(jobInstanceInfo.getJobInstanceId(), TaskStatus.FAILED,
//                            null, null);
//                        break;
//                    } else {
                        try {
                            for (String workerIdAddr : aliveCheckWorkerSet) {
                                try {
                                    String workerAddr = workerIdAddr.split("@")[1];
                                    String tokens[] = workerAddr.split(":");
                                    String host = tokens[0];
                                    int port = Integer.valueOf(tokens[1]);
                                    int times = 0;
                                    while (times < 3) {
                                        Socket socket = new Socket();
                                        try {
                                            socket.connect(new InetSocketAddress(host, port), 5000);
                                            LOGGER.info("socket to {}:{} is reachable, times={}", host, port, times);
                                            break;
                                        } catch (Exception e) {
                                            LOGGER.info("socket to {}:{} is not reachable, times={}", host, port, times);
                                            Thread.sleep(5000);
                                            times++;
                                        } finally {
                                            if (socket != null) {
                                                socket.close();
                                            }
                                        }
                                    }
                                    if (times >= 3) {
                                        LOGGER.warn("worker[{}] is down, start to remove this worker and failover tasks, jobInstanceId={}",
                                                workerIdAddr, jobInstanceInfo.getJobInstanceId());
                                        handleWorkerShutdown(workerIdAddr, true);
                                        continue;
                                    }
                                    final long startTime = System.currentTimeMillis();
                                    ActorSelection selection = getActorContext().actorSelection(
                                            ActorPathUtil.getWorkerHeartbeatRouterPath(workerIdAddr));
                                    MasterCheckWorkerAliveRequest request = MasterCheckWorkerAliveRequest.newBuilder()
                                        .setJobInstanceId(jobInstanceInfo.getJobInstanceId())
                                        .setDispatchMode(xAttrs.getTaskDispatchMode())
                                        .build();
                                    MasterCheckWorkerAliveResponse response = (MasterCheckWorkerAliveResponse)
                                            FutureUtils.awaitResult(selection, request, 10);
                                    if (!response.getSuccess()) {
                                        LOGGER.warn("jobInstanceId={} of worker={} is not alive", jobInstanceInfo.getJobInstanceId(),
                                                workerIdAddr, response.getMessage());
                                        handleWorkerShutdown(workerIdAddr, true);

                                        // destroy containers of worker of PullModel
                                        MasterDestroyContainerPoolRequest destroyContainerPoolRequest = MasterDestroyContainerPoolRequest.newBuilder()
                                                .setJobInstanceId(jobInstanceInfo.getJobInstanceId())
                                                .setJobId(jobInstanceInfo.getJobId())
                                                .setWorkerIdAddr(workerIdAddr)
                                                .setSerialNum(getSerialNum())
                                                .build();
                                       SchedulerxWorker.AtLeastDeliveryRoutingActor.tell(destroyContainerPoolRequest, null);
                                    } else {
                                        // 设置当前worker load
                                        setWorkerLoad(workerIdAddr, response.getMetricsJson(), System.currentTimeMillis()-startTime);
                                    }
                                } catch (Exception e) {
                                    //TODO 确认下是否需要shutdown
                                    // 一旦出现异常,且telnet正常情况下会导致任务卡死
                                    LOGGER.error("Alive worker check failed.", e);
                                    handleWorkerShutdown(workerIdAddr, true);
                                    // destroy containers of worker of PullModel
                                    MasterDestroyContainerPoolRequest destroyContainerPoolRequest = MasterDestroyContainerPoolRequest.newBuilder()
                                            .setJobInstanceId(jobInstanceInfo.getJobInstanceId())
                                            .setJobId(jobInstanceInfo.getJobId())
                                            .setWorkerIdAddr(workerIdAddr)
                                            .setSerialNum(getSerialNum())
                                            .build();
                                    SchedulerxWorker.AtLeastDeliveryRoutingActor.tell(destroyContainerPoolRequest, null);
                                }
                            }

                            //每隔10秒进行一次worker探活
                            Thread.sleep(10000);
                        } catch (Throwable e) {
                            LOGGER.error("check worker error, jobInstanceId={}", jobInstanceInfo.getJobInstanceId(), e);
                        }
//                    }
                }
            }
        }, "Schedulerx-MapTaskMaster-check-worker-alive-thread-" + jobIdAndInstanceId).start();

        // PULL_MODEL specially
        if (xAttrs.getTaskDispatchMode().equals(TaskDispatchMode.PULL.getValue())) {
            new Thread(new Runnable() {
                @Override
                public void run() {
                    while (!isFinished()) {
                        for (String workerIdAddr : jobInstanceInfo.getAllWorkers()) {
                            try {
                                ActorSelection selection = getActorContext().actorSelection(
                                        ActorPathUtil.getWorkerJobInstancePath(workerIdAddr));
                                MasterNotifyWorkerPullRequest request = MasterNotifyWorkerPullRequest.newBuilder()
                                        .setJobInstanceId(jobInstanceInfo.getJobInstanceId())
                                        .setPageSize(xAttrs.getPageSize())
                                        .setQueueSize(xAttrs.getQueueSize())
                                        .setTaskMasterAkkaPath(getLocalTaskRouterPath())
                                        .setConsumerSize(xAttrs.getConsumerSize())
                                        .setSerialNum(getSerialNum())
                                        .build();
                                MasterNotifyWorkerPullResponse response = (MasterNotifyWorkerPullResponse) FutureUtils.awaitResult(
                                        selection, request, 5);
                                if (!response.getSuccess()) {
                                    String errorMsg = response.getMessage();
                                    LOGGER.error("notify worker pull failed, jobInstanceId={}", jobInstanceInfo.getJobInstanceId(), errorMsg);
                                    updateNewInstanceStatus(getSerialNum(), jobInstanceInfo.getJobInstanceId(),
                                            InstanceStatus.FAILED, errorMsg);
                                    // TODO 一但出现异常后继续分发Worker会出现致命问题
                                }
                            } catch (Throwable e) {
                                LOGGER.error("notify worker pull error, jobInstanceId={}, worker={}", jobIdAndInstanceId, workerIdAddr, e);
                            }
                        }
                        try {
                            Thread.sleep(5000);
                        } catch (InterruptedException e) {
                            LOGGER.error("", e);
                        }
                    }
                }
            }, "Schedulerx-PullTaskMaster-notify-workers-pull-thread-" + jobIdAndInstanceId).start();
        }
    }

    @Override
    public synchronized void submitInstance(JobInstanceInfo jobInstanceInfo) throws Exception {
        try {
            long startTime = System.currentTimeMillis();
            if (dispatcherSize > WorkerConstants.MAP_MASTER_DISPATCHER_SIZE_MAX) {
                dispatcherSize = WorkerConstants.MAP_MASTER_DISPATCHER_SIZE_MAX;
            }
            startBatchHandler();
            createRootTask();
            LOGGER.info("jobInstanceId={} create root task, cost={}ms", jobInstanceInfo.getJobInstanceId(),
                (System.currentTimeMillis() - startTime));
            init();
        } catch (Throwable e) {
            String jobIdAndInstanceId = jobInstanceInfo.getJobId() + "_" + jobInstanceInfo.getJobInstanceId();
            LOGGER.error("", e);
            updateNewInstanceStatus(getSerialNum(), InstanceStatus.FAILED, ExceptionUtil.getMessage(e));
            logCollector.collect(jobInstanceInfo.getAppGroupId(), jobIdAndInstanceId, ClientLoggerMessage.INSTANCE_INIT_FAIL, 
                e, jobInstanceInfo.getGroupId());
        }
    }

    @Override
    public void batchUpdateTaskStatus(Worker.ContainerBatchReportTaskStatuesRequest request)throws Exception {
        String workerIdAddr = request.getWorkerId()+"@"+request.getWorkerAddr();
        this.setWorkerLoad(workerIdAddr, request.getMetricsJson(), null);
        super.batchUpdateTaskStatus(request);
    }

    @Override
    public void updateTaskStatus(ContainerReportTaskStatusRequest request) {
        try {
            taskStatusReqQueue.submitRequest(request);
        } catch (Throwable e) {
            LOGGER.error("", e);
        }
    }

    @Override
    public void batchUpdateTaskStatues(List requests) {
        Map finalTaskStatus = Maps.newHashMap();
        try {
            for (ContainerReportTaskStatusRequest request : requests) {
                TaskStatus taskStatus = TaskStatus.parseValue(request.getStatus());

                // 过滤中间状态
                if(!finalTaskStatus.containsKey(request.getTaskId()) || taskStatus.isFinish()){
                    finalTaskStatus.put(request.getTaskId(), request);
                }

                String workerAddr = request.getWorkerAddr();
                String taskName = request.getTaskName();
                LOGGER.debug("report task status:{} from worker:{}, uniqueId:{}", taskStatus.getDescription(),
                    workerAddr, IdUtil.getUniqueId(request.getJobId(), request.getJobInstanceId(), request.getTaskId()));
                // update progress
                if (!taskProgressMap.containsKey(taskName)) {
                    synchronized (this) {
                        if (!taskProgressMap.containsKey(taskName)) {
                            TaskProgressCounter taskProgressCounter = new TaskProgressCounter(taskName);
                            taskProgressMap.put(taskName, taskProgressCounter);
                        }
                    }
                }
                if (workerAddr != null && !workerProgressMap.containsKey(workerAddr)) {
                    synchronized (this) {
                        if (!workerProgressMap.containsKey(workerAddr)) {
                            WorkerProgressCounter workerProgressCounter = new WorkerProgressCounter(workerAddr);
                            workerProgressMap.put(workerAddr, workerProgressCounter);
                            if (StringUtils.isNotBlank(request.getTraceId())) {
                                workerProgressCounter.setTraceId(request.getTraceId());
                            }
                        }
                    }
                }
                if (taskStatus.equals(TaskStatus.RUNNING)) {
                    taskProgressMap.get(taskName).incrementRunning();
                    if (workerAddr != null) {
                        workerProgressMap.get(workerAddr).incrementRunning();
                    }
                } else if (taskStatus.equals(TaskStatus.SUCCESS)) {
                    taskProgressMap.get(taskName).incrementSuccess();
                    if (workerAddr != null) {
                        workerProgressMap.get(workerAddr).incrementSuccess();
                    }
                } else if (taskStatus.equals(TaskStatus.FAILED)) {
                    // 为了方便秒级别Map任务定位业务执行失败的机器和原因
                    LOGGER.error("Report task status:{} result:{} from worker:{}, uniqueId:{}", taskStatus.getDescription(),
                            request.getResult(), workerAddr, IdUtil.getUniqueId(request.getJobId(), request.getJobInstanceId(), request.getTaskId()));
                    taskProgressMap.get(taskName).incrementFailed();
                    if (workerAddr != null) {
                        workerProgressMap.get(workerAddr).incrementFailed();
                        if (StringUtils.isNotBlank(request.getTraceId())) {
                            workerProgressMap.get(workerAddr).setTraceId(request.getTraceId());
                        }
                    }
                }

                //update taskResultMap and taskStatusMap
                if (this.needReduce) {
                    taskResultMap.put(request.getTaskId(), request.getResult());
                    taskStatusMap.put(request.getTaskId(), taskStatus);
                }
            }
        } catch (Throwable e) {
            LOGGER.error("jobInstanceId={}, update progressMap error.", jobInstanceInfo.getJobInstanceId(), e);
        }
            
        try {
            long startTime = System.currentTimeMillis();

            // 跟任务节点失败原因回传,有可能同时返回两个根任务request,分别是running和failed状态,取最后一个
            int index = requests.size() - 1;
            if (index >=0 && (TaskStatus.FAILED.getValue() == requests.get(index).getStatus())
                && WorkerConstants.MAP_TASK_ROOT_NAME.equals(requests.get(index).getTaskName())) {
                setRootTaskResult(requests.get(index).getResult());
            }

            boolean updateSuccess = false;
            for (int i=0 ; i<3; i++) {
                // try 3 times
                try {
                    taskPersistence.updateTaskStatues(Lists.newArrayList(finalTaskStatus.values()));
                    updateSuccess = true;
                    break;
                } catch (Throwable t) {
                    LOGGER.error("jobInstanceId={}, persistent batch updateTaskStatus error.", t);
                }
            }

            // 如果没有更新成功
            if (!updateSuccess) {
                updateNewInstanceStatus(getSerialNum(), InstanceStatus.FAILED, "persistent batch update TaskStatus error up to 3 times");
            }

            LOGGER.debug("{} batch update status db cost:{}", jobInstanceInfo.getJobInstanceId(),
                System.currentTimeMillis() - startTime);
        } catch (Throwable e) {
            LOGGER.error("jobInstanceId={}, batch updateTaskStatus error.", jobInstanceInfo.getJobInstanceId(), e);
        }
    }

    public boolean map(List taskList, String taskName) throws Exception {
        LOGGER.debug("map taskName:{}, size:{}", taskName, taskList.size());
        initTaskProgress(taskName, taskList.size());
        for (ByteString taskBody : taskList) {
            MasterStartContainerRequest startContainerRequest = convert2StartContainerRequest(jobInstanceInfo, 
                    aquireTaskId(), taskName, taskBody);
            taskBlockingQueue.submitRequest(startContainerRequest);
        }
        // TODO overload clientinfo日志
        return machineOverload();
    }

    protected void clearTasks(long jobInstanceId) {
        try {
            taskPersistence.clearTasks(jobInstanceId);
            LOGGER.info("jobInstanceId={} clearTasks success.", jobInstanceId);
        } catch (Throwable ex){
            LOGGER.error("jobInstanceId={} clearTasks error", jobInstanceId, ex);
        }
    }

    protected void createRootTask() throws Exception {
        String taskName = WorkerConstants.MAP_TASK_ROOT_NAME;
        ByteString taskBody = ByteString.copyFrom(HessianUtil.toBytes(WorkerConstants.MAP_TASK_ROOT_NAME));
        initTaskProgress(taskName, 1);
        MasterStartContainerRequest startContainerRequest = convert2StartContainerRequest(jobInstanceInfo, aquireTaskId(),
                taskName, taskBody);
        batchDispatchTasks(Lists.newArrayList(startContainerRequest), getLocalWorkerIdAddr());
    }

    private void initTaskFailover(final List reqs, final String workerIdAddr) {
        LOGGER.warn("jobInstanceId={}, worker[{}] is down, try another worker, size:{}",
                jobInstanceInfo.getJobInstanceId(), workerIdAddr, reqs.size());
        final String workerId = workerIdAddr.split("@")[0];
        final String workerAddr = workerIdAddr.split("@")[1];
        List taskIds = Lists.newArrayList();
        for (MasterStartContainerRequest req : reqs) {
            taskIds.add(req.getTaskId());
        }
        try {
            int affectCnt = taskPersistence.updateTaskStatus(jobInstanceInfo.getJobInstanceId(), taskIds,
                    TaskStatus.INIT, workerId, workerAddr);
            LOGGER.warn("jobInstanceId={}, worker[{}] is down, reset task status, size:{}",
                    jobInstanceInfo.getJobInstanceId(), workerIdAddr, affectCnt);
            // 恢复当前机器子任务量
            workerProgressMap.get(workerAddr).decPulledAndTotal(affectCnt);

        } catch (Exception e1) {
            LOGGER.error("jobInstanceId={}, timeout return init error", jobInstanceInfo.getJobInstanceId());
            updateNewInstanceStatus(getSerialNum(), InstanceStatus.FAILED, "timeout dispatch return init error");
        }
    }

    private void processDispatchException(final String workerIdAddr, final List reqs, Throwable e) {
        final String workerId = workerIdAddr.split("@")[0];
        final String workerAddr = workerIdAddr.split("@")[1];
        boolean failover = (xAttrs != null && xAttrs.isFailover());
        if (failover && (e instanceof TimeoutException)) {
            // 失败重试处理
            initTaskFailover(reqs, workerIdAddr);
        } else {
            //如果是其他异常(比如序列化失败,找不到worker),直接把tasks置为失败
            String uniqueIdWithoutTask = IdUtil.getUniqueIdWithoutTask(jobInstanceInfo.getJobId(), jobInstanceInfo.getJobInstanceId());
            LOGGER.error("jobInstanceId:{}, batch dispatch Tasks error worker={}, size:{}",
                    jobInstanceInfo.getJobInstanceId(), workerIdAddr, reqs.size(), e);
            logCollector.collect(jobInstanceInfo.getAppGroupId(), uniqueIdWithoutTask, ClientLoggerMessage.MAP_INSTANCE_DISPATCH_JOB_FAIL, 
                e, jobInstanceInfo.getGroupId());
            for (MasterStartContainerRequest req : reqs) {
                ContainerReportTaskStatusRequest faileReq = ContainerReportTaskStatusRequest.newBuilder()
                        .setJobId(jobInstanceInfo.getJobId())
                        .setJobInstanceId(jobInstanceInfo.getJobInstanceId())
                        .setTaskId(req.getTaskId())
                        .setStatus(TaskStatus.FAILED.getValue())
                        .setResult("Dispatch tasks error. Cause by "+e.getMessage())
                        .setWorkerId(workerId)
                        .setTaskName(req.getTaskName())
                        .setWorkerAddr(workerAddr)
                        .setTaskName(req.getTaskName())
                        .build();
                updateTaskStatus(faileReq);
            }
        }
        // 设置当前worker不可用
        setWorkerInvalid(workerIdAddr);
    }

    private void processDispatchResponse(final String workerIdAddr, final List reqs,
                                         MasterBatchStartContainersResponse response, long startTime) {
        final String workerId = workerIdAddr.split("@")[0];
        final String workerAddr = workerIdAddr.split("@")[1];
        if (response.getSuccess()) {
            LOGGER.info("jobInstanceId={}, batch start containers successfully, size:{} , worker={}, cost={}ms",
                    jobInstanceInfo.getJobInstanceId(), reqs.size(), workerIdAddr,
                    System.currentTimeMillis() - startTime);
            aliveCheckWorkerSet.add(workerIdAddr);
            String metricsJson = response.getMetricsJson();
            setWorkerLoad(workerIdAddr, metricsJson, System.currentTimeMillis() - startTime);
        } else {
            boolean failover = (xAttrs != null && xAttrs.isFailover());
            if (failover && (response.getMessage() != null && response.getMessage().contains(WorkerConstants.WORKER_NOT_RUNNING_MESSAGE))) {
                initTaskFailover(reqs, workerIdAddr);
            } else {
                LOGGER.error("jobInstanceId={}, batch start containers failed, worker={}, response={}, size:{}",
                        jobInstanceInfo.getJobInstanceId(), workerIdAddr, response.getMessage(), reqs.size());
                //当前是直接置为失败
                for (MasterStartContainerRequest req : reqs) {
                    ContainerReportTaskStatusRequest faileStatusRequest = ContainerReportTaskStatusRequest
                            .newBuilder()
                            .setJobId(jobInstanceInfo.getJobId())
                            .setJobInstanceId(jobInstanceInfo.getJobInstanceId())
                            .setTaskId(req.getTaskId())
                            .setStatus(TaskStatus.FAILED.getValue())
                            .setResult(response.getMessage())
                            .setWorkerId(workerId)
                            .setTaskName(req.getTaskName())
                            .setWorkerAddr(workerAddr)
                            .setTaskName(req.getTaskName())
                            .build();
                    updateTaskStatus(faileStatusRequest);
                }
            }
            // 设置当前worker不可用
            setWorkerInvalid(workerIdAddr);
        }
    }

    private void batchHandleContainers(final String workerIdAddr, final List reqs, boolean isFailover,
            TaskDispatchMode dispatchMode) {
        final String workerId = workerIdAddr.split("@")[0];
        final String workerAddr = workerIdAddr.split("@")[1];
        LOGGER.debug("jobInstanceId={}, batch dispatch, worker:{}, size:{}", jobInstanceInfo.getJobInstanceId(), workerIdAddr, reqs.size());

        try {
            batchHandlePersistence(workerId, workerAddr, reqs, isFailover);

            if (dispatchMode.equals(TaskDispatchMode.PUSH)) {
                final long startTime = System.currentTimeMillis();
                ActorSelection selection = getActorContext().actorSelection(
                        ActorPathUtil.getContainerRouterPath(workerIdAddr));
                MasterBatchStartContainersRequest request = MasterBatchStartContainersRequest.newBuilder()
                    .setJobInstanceId(jobInstanceInfo.getJobInstanceId())
                    .setJobId(jobInstanceInfo.getJobId())
                    .addAllStartReqs(reqs)
                    .build();
                Timeout timeout = new Timeout(Duration.create(3, TimeUnit.SECONDS));
                if(isWorkerLoadRouter()) {
                    // 基于负载路由策略,采用同步分发
                    try {
                        MasterBatchStartContainersResponse response = (MasterBatchStartContainersResponse) FutureUtils.awaitResult(selection, request, 3L);
                        processDispatchResponse(workerIdAddr, reqs, response, startTime);
                    } catch (Throwable e) {
                        processDispatchException(workerIdAddr, reqs, e);
                    }
                } else {
                    // 异步分发
                    Future future = Patterns.ask(selection, request, timeout);
                    // 触发成功回调
                    future.onSuccess(new OnSuccess() {
                        @Override
                        public void onSuccess(Object obj) throws Throwable {
                            MasterBatchStartContainersResponse response = (MasterBatchStartContainersResponse) obj;
                            processDispatchResponse(workerIdAddr, reqs, response, startTime);
                        }
                    }, futureExecutor);
                    //触发超时或失败回调
                    future.onFailure(new OnFailure() {
                        @Override
                        public void onFailure(Throwable e) throws Throwable {
                            processDispatchException(workerIdAddr, reqs, e);
                        }
                    }, futureExecutor);
                }
            } else {
                aliveCheckWorkerSet.add(workerIdAddr);
            }
        } catch (Throwable exception) {
            //如果是其他异常(比如序列化失败,找不到worker),直接把tasks置为失败
            String uniqueIdWithoutTask = IdUtil.getUniqueIdWithoutTask(jobInstanceInfo.getJobId(), jobInstanceInfo.getJobInstanceId());
            LOGGER.error("jobInstanceId:{}, batch dispatch Tasks error worker={}, size:{}",
                jobInstanceInfo.getJobInstanceId(), workerIdAddr, reqs.size(), exception);
            logCollector.collect(jobInstanceInfo.getAppGroupId(), uniqueIdWithoutTask, ClientLoggerMessage.MAP_INSTANCE_DISPATCH_JOB_FAIL, 
                exception, jobInstanceInfo.getGroupId());
            for (MasterStartContainerRequest req : reqs) {
                ContainerReportTaskStatusRequest faileReq = ContainerReportTaskStatusRequest.newBuilder()
                    .setJobId(jobInstanceInfo.getJobId())
                    .setJobInstanceId(jobInstanceInfo.getJobInstanceId())
                    .setTaskId(req.getTaskId())
                    .setStatus(TaskStatus.FAILED.getValue())
                    .setWorkerId(workerId)
                    .setTaskName(req.getTaskName())
                    .setWorkerAddr(workerAddr)
                    .setTaskName(req.getTaskName())
                    .build();
                updateTaskStatus(faileReq);
            }
        }
    }

    /**
     * 设置worker load
     * @param workerIdAddr
     * @param metricsJson
     * @param cost
     */
    private void setWorkerLoad(String workerIdAddr, String metricsJson, Long cost){
        try {
            if (router != null) {
                if (router instanceof WorkerLoadRegister && StringUtils.isNotEmpty(metricsJson)) {
                    Metrics metrics = JsonUtil.fromJson(metricsJson, Metrics.class);
                    if (metrics != null) {
                        LOGGER.info("update worker load, worker={}, sharePoolAvailableSize={}, cost={}", workerIdAddr, metrics.getSharePoolAvailableSize(), cost);
                        ((WorkerLoadRegister) router).setAvailableSize(workerIdAddr, metrics.getSharePoolAvailableSize());
                        ((WorkerLoadRegister) router).setRemainCpu(workerIdAddr, (int) (metrics.getCpuProcessors() - metrics.getCpuLoad1()));
                        ((WorkerLoadRegister) router).setRemainMemory(workerIdAddr, (long) (100 - metrics.getHeap1Usage() * 100));
                        if (cost != null) {
                            ((WorkerLoadRegister) router).setCost(workerIdAddr, cost);
                        }
                    }
                    synchronized (router) {
                        router.notifyAll();
                    }
                }
            }
        }catch (Exception e) {
            LOGGER.warn("Set worker load failed.", e);
        }
    }

    /**
     * 设置worker不可用
     * @param workerIdAddr
     */
    private void setWorkerInvalid(String workerIdAddr){
        try {
            invalidWorkerSet.add(workerIdAddr);
            if (router != null) {
                if (router instanceof WorkerLoadRegister) {
                    ((WorkerLoadRegister) router).setAvailableSize(workerIdAddr, 0);
                }
            }
        }catch (Exception e) {
            LOGGER.warn("Set worker load failed.", e);
        }
    }

    private void batchHandlePersistence(String workerId, String workerAddr, List reqs, boolean isFailover) throws Exception {
        long startTime = System.currentTimeMillis();
        if (!isFailover) {
            // first dispatch
            taskPersistence.createTasks(reqs, workerId, workerAddr);
            if (this.needReduce) {
                for (MasterStartContainerRequest req : reqs) {
                    this.taskStatusMap.put(req.getTaskId(), TaskStatus.INIT);
                }
            }
        } else {
            // failover, not first dispatch
            List taskIds = Lists.newArrayList();
            for (MasterStartContainerRequest req : reqs) {
                taskIds.add(req.getTaskId());
            }
            taskPersistence.updateTaskStatus(jobInstanceInfo.getJobInstanceId(), taskIds, TaskStatus.RUNNING, workerId, workerAddr);
        }
        LOGGER.debug("jobInstance={}, batch dispatch db cost:{} ms, size:{}",
            jobInstanceInfo.getJobInstanceId(), System.currentTimeMillis() - startTime, reqs.size());
    }

    protected void batchHandlePulledProgress(List masterStartContainerRequests,
            Map> worker2ReqsWithNormal,
            Map> worker2ReqsWithFailover,
            String remoteWorker) {
        for (MasterStartContainerRequest request : masterStartContainerRequests) {
            String workerIdAddr = ((remoteWorker != null) ? remoteWorker : selectWorker(request.getFailover()));
            if (workerIdAddr == null) {
                updateNewInstanceStatus(getSerialNum(), InstanceStatus.FAILED, "all worker is down!");
                break;
            }
            String workerAddr = workerIdAddr.split("@")[1];
            if (request.getFailover()) {
                if (!worker2ReqsWithFailover.containsKey(workerIdAddr)) {
                    worker2ReqsWithFailover.put(workerIdAddr, Lists.newArrayList(request));
                } else {
                    worker2ReqsWithFailover.get(workerIdAddr).add(request);
                }
            } else {
                if (!worker2ReqsWithNormal.containsKey(workerIdAddr)) {
                    worker2ReqsWithNormal.put(workerIdAddr, Lists.newArrayList(request));
                } else {
                    worker2ReqsWithNormal.get(workerIdAddr).add(request);
                }
                // failover的子任务无需再计数
                taskProgressMap.get(request.getTaskName()).incrementPulled();
            }

            if (workerAddr != null && !workerProgressMap.containsKey(workerAddr)) {
                synchronized (this) {
                    if (!workerProgressMap.containsKey(workerAddr)) {
                        WorkerProgressCounter workerProgressCounter = new WorkerProgressCounter(workerAddr);
                        workerProgressMap.put(workerAddr, workerProgressCounter);
                    }
                }
            }
            workerProgressMap.get(workerAddr).incrementTotal();
            workerProgressMap.get(workerAddr).incrementPulled();
        }
    }

    public void batchDispatchTasks(List masterStartContainerRequests) {
        batchDispatchTasks(masterStartContainerRequests, null);
    }
    
    public void batchDispatchTasks(List masterStartContainerRequests, String remoteWorker) {
        Map> worker2ReqsWithNormal = Maps.newHashMap();
        Map> worker2ReqsWithFailover = Maps.newHashMap();
        batchHandlePulledProgress(masterStartContainerRequests, worker2ReqsWithNormal, worker2ReqsWithFailover, remoteWorker);

        //推模型正常启动子任务
        for (Entry> entry : worker2ReqsWithNormal.entrySet()) {
            batchHandleContainers(entry.getKey(), entry.getValue(), false, TaskDispatchMode.PUSH);
        }

        //推模型worker挂了,failover子任务到其他worker
        for (Entry> entry : worker2ReqsWithFailover.entrySet()) {
            batchHandleContainers(entry.getKey(), entry.getValue(), true, TaskDispatchMode.PUSH);
        }
    }

    public void batchPullTasks(List masterStartContainerRequests, String workerIdAddr) {
        Map> worker2ReqsWithNormal = Maps.newHashMap();
        Map> worker2ReqsWithFailover = Maps.newHashMap();
        batchHandlePulledProgress(masterStartContainerRequests, worker2ReqsWithNormal, worker2ReqsWithFailover, workerIdAddr);

        //拉模型持久化tasks
        for (Entry> entry : worker2ReqsWithNormal.entrySet()) {
            batchHandleContainers(entry.getKey(), entry.getValue(), false, TaskDispatchMode.PULL);
        }

        //拉模型更新tasks
        for (Entry> entry : worker2ReqsWithFailover.entrySet()) {
            batchHandleContainers(entry.getKey(), entry.getValue(), true, TaskDispatchMode.PULL);
        }
    }

    //TODO 可以重构成不抢锁的方式?
    protected synchronized String selectWorker(Boolean failover) {
        List allWorkers = jobInstanceInfo.getAllWorkers();
        if (failover && !CollectionUtils.isEmpty(getAliveCheckWorkerSet())) {
            allWorkers = new ArrayList<>(getAliveCheckWorkerSet());
            allWorkers.removeAll(invalidWorkerSet);
        }
        String worker;
        if(router != null ) {
            worker = router.route(jobInstanceInfo.getAppGroupId(), jobInstanceInfo.getJobId(),
                    allWorkers, jobInstanceInfo.getTargetWorkerAddrsMap(), getSerialNum(), getLocalWorkerIdAddr());
        } else {
            int size = allWorkers.size();
            if (size == 0) {
                return null;
            }

            boolean doNext;
            int count = 0;
            do {
                doNext = false;
                if (index >= size) {
                    index = index % size;
                }
                worker = allWorkers.get(index++);
                if (xAttrs != null && allWorkers.size() > 1 && !xAttrs.isExecOnMaster()) {
                    // 仅当存在多个节点时,可配置主节点不参与执行有效
                    if (worker.equals(getLocalWorkerIdAddr())) {
                        doNext = true;
                    }
                }
                if (invalidWorkerSet.contains(worker)){
                    LOGGER.warn("Failover={}, Worker={} is invalid skip. Invalid worker set={}, All workers={}", failover, worker, invalidWorkerSet, allWorkers);
                    doNext = true;
                }
            } while (doNext && ++count < size);
        }
        return worker;
    }

    @Override
    public void killInstance(boolean mayInterruptIfRunning, String reason) {
        super.killInstance(mayInterruptIfRunning, reason);
        //ip:port format
        List allWorkers = jobInstanceInfo.getAllWorkers();
        this.sendKillContainerRequest(mayInterruptIfRunning, allWorkers);
        //update instance status, stop on-going process
        updateNewInstanceStatus(getSerialNum(), jobInstanceInfo.getJobInstanceId(), InstanceStatus.FAILED, reason);
    }

    @Override
    public void destroyContainerPool() {
        List allWorkers = jobInstanceInfo.getAllWorkers();
        for (String workerIdAddr : allWorkers) {
            MasterDestroyContainerPoolRequest request = MasterDestroyContainerPoolRequest.newBuilder()
                .setJobInstanceId(jobInstanceInfo.getJobInstanceId())
                .setSerialNum(getSerialNum())
                .setJobId(jobInstanceInfo.getJobId())
                .setWorkerIdAddr(workerIdAddr)
                .build();
            SchedulerxWorker.AtLeastDeliveryRoutingActor.tell(request, null);
        }
    }

    @Override
    public void killTask(String uniqueId, String workerId, String workerAddr) {
        String workerIdAddr = workerId + "@" + workerAddr;
        try {
            ActorSelection selection = getActorContext().actorSelection(
                ActorPathUtil.getContainerRouterPath(workerIdAddr));
            MasterKillContainerRequest request = MasterKillContainerRequest.newBuilder()
                .setJobId(IdUtil.parse(uniqueId, IdType.JOB_ID))
                .setJobInstanceId(IdUtil.parse(uniqueId, IdType.JOB_INSTANCE_ID))
                .setTaskId(IdUtil.parse(uniqueId, IdType.TASK_ID))
                .setMayInterruptIfRunning(true)
                .setAppGroupId(jobInstanceInfo.getAppGroupId())
                .setGroupId(jobInstanceInfo.getGroupId())
                .build();
            selection.tell(request, null);
        } catch (Throwable e) {
            LOGGER.error("send kill request exception, worker:" + workerIdAddr);
        }
    }

    @Override
    public String getJobInstanceProgress() {
        MapTaskProgress detail = new MapTaskProgress();
        detail.setTaskProgress(taskProgressMap.values());
        detail.setWorkerProgress(workerProgressMap.values());
        return JsonUtil.toJson(detail);
    }

    @SuppressWarnings("resource")
    @Override
    public ProcessResult postFinish(long jobInstanceId) {
        ProcessResult reduceResult = null;
        try {
            JobContext context = JobContext.newBuilder()
                    .setJobId(jobInstanceInfo.getJobId())
                    .setJobInstanceId(jobInstanceId)
                    .setJobType(jobInstanceInfo.getJobType())
                    .setContent(jobInstanceInfo.getContent())
                    .setScheduleTime(jobInstanceInfo.getScheduleTime())
                    .setDataTime(jobInstanceInfo.getDataTime())
                    .setJobParameters(jobInstanceInfo.getParameters())
                    .setInstanceParameters(jobInstanceInfo.getInstanceParameters())
                    .setUser(jobInstanceInfo.getUser())
                    .setTaskResults(taskResultMap)
                    .setTaskStatuses(taskStatusMap)
                    .setSerialNum(this.getSerialNum())
                    .build();
            JobProcessor jobProcessor = JobProcessorUtil.getJavaProcessor(context.getContent());
            if (needReduce) {
                if (jobProcessor instanceof MapReduceJobProcessor) {
                    boolean runReduceIfFail = ((MapReduceJobProcessor) jobProcessor).runReduceIfFail(context);
                    if (getInstanceStatus().equals(InstanceStatus.FAILED) && !runReduceIfFail) {
                        LOGGER.warn("jobInstanceId={} is failed, skip reduce", jobInstanceId);
                        return null;
                    }
                } else {
                    reduceResult = new ProcessResult(false);
                    reduceResult.setResult(String.format("JobProcessor[%s] can not cast to com.alibaba.schedulerx.worker.processor.MapReduceJobProcessor, " +
                            "you can make CGLIB enabled (e.g. @EnableAspectJAutoProxy(proxyTargetClass = true)) to support reduce operations if spring aop is used.",jobProcessor.getClass().getName()));
                    return reduceResult;
                }

                String reduceTaskName = WorkerConstants.REDUCE_TASK_NAME;
                if (!taskProgressMap.containsKey(reduceTaskName)) {
                    TaskProgressCounter taskProgressCounter = new TaskProgressCounter(reduceTaskName);
                    taskProgressMap.put(reduceTaskName, taskProgressCounter);
                }
                taskProgressMap.get(reduceTaskName).incrementTotal();
                taskProgressMap.get(reduceTaskName).incrementRunning();

                String workerAddr = getActorContext().provider().getDefaultAddress().host().get() + ":" +
                        getActorContext().provider().getDefaultAddress().port().get();
                if (!workerProgressMap.containsKey(workerAddr)) {
                    WorkerProgressCounter workerProgressCounter = new WorkerProgressCounter(workerAddr);
                    workerProgressMap.put(workerAddr, workerProgressCounter);
                }
                workerProgressMap.get(workerAddr).incrementTotal();
                workerProgressMap.get(workerAddr).incrementRunning();

                try {
                    reduceResult = ((MapReduceJobProcessor)jobProcessor).reduce(context);
                } catch (Exception e) {
                    LOGGER.error("do reduce process failed.", e);
                    reduceResult = new ProcessResult(false);
                    reduceResult.setResult("reduce exception: " + ExceptionUtil.getMessage(e));
                }

                if (reduceResult.getStatus().equals(InstanceStatus.SUCCESS)) {
                    taskProgressMap.get(reduceTaskName).incrementSuccess();
                    workerProgressMap.get(workerAddr).incrementSuccess();
                } else {
                    taskProgressMap.get(reduceTaskName).incrementFailed();
                    workerProgressMap.get(workerAddr).incrementFailed();
                }
            } else {
                if (jobProcessor instanceof JobProcessorEx) {
                    ((JobProcessorEx)jobProcessor).postProcess(context);
                }
            }
        } catch (Throwable e) {
            LOGGER.error("Map task postFinish failed.", e);
        }
        return reduceResult;
    }

    @Override
    public void stop() {
        if (taskDispatchReqHandler != null) {
            taskDispatchReqHandler.stop();
        }
        if (taskStatusReqBatchHandler != null) {
            taskStatusReqBatchHandler.stop();
        }
        LOGGER.info("jobInstanceId:{}, instance master successfully stop.", jobInstanceInfo.getJobInstanceId());
    }

    @Override
    protected void doTerminate(){
        if (taskDispatchReqHandler != null) {
            taskDispatchReqHandler.stop();
        }
    }

    protected void startBatchHandler() {
        if (INITED) {
            return;
        }
        // start batch handlers
        taskStatusReqQueue.init();
        taskStatusReqBatchHandler.start();

        taskBlockingQueue.setCapacity(queueSize);
        taskBlockingQueue.init();

        if (xAttrs.getTaskDispatchMode().equals(TaskDispatchMode.PUSH.getValue())) {
            if(isWorkerLoadRouter()) {
                // 采用单线程分发,确保能进行Master限流
                taskDispatchReqHandler.start();
            } else {
                taskDispatchReqHandler.setWorkThreadNum(dispatcherSize);
                taskDispatchReqHandler.setDispatchSize(pageSize * jobInstanceInfo.getAllWorkers().size());
                taskDispatchReqHandler.start();
            }
        }
    }

    private int getTotalPulledAndRunning() {
        int total = 0;
        List taskCounters = Lists.newArrayList(taskProgressMap.values());
        for (TaskProgressCounter taskProgressCounter : taskCounters) {
            total += taskProgressCounter.getPulled();
            total += taskProgressCounter.getRunning();
        }
        return total;
    }

    private boolean machineOverload() {
        boolean memOverload = false;
        boolean loadOverload = false;
        boolean taskQueueOverload = false;
        Metrics vmDetail = MetricsCollector.getMetrics();
        if (vmDetail != null) {
            memOverload = vmDetail.getHeap1Usage() >= WorkerConstants.USER_MEMORY_PERCENT_DEFAULT;
            loadOverload = vmDetail.getCpuLoad1() >= vmDetail.getCpuProcessors();
        }
        return memOverload || loadOverload || taskQueueOverload;
    }

    public String getRootTaskResult() {
        return rootTaskResult;
    }

    public void setRootTaskResult(String rootTaskResult) {
        this.rootTaskResult = rootTaskResult;
    }

    private void initTaskProgress(String taskName, int delta) {
        if (!taskProgressMap.containsKey(taskName)) {
            synchronized (this) {
                if (!taskProgressMap.containsKey(taskName)) {
                    TaskProgressCounter taskProgressCounter = new TaskProgressCounter(taskName);
                    taskProgressMap.put(taskName, taskProgressCounter);
                }
            }
        }
        taskProgressMap.get(taskName).incrementTotal(delta);
    }

    @Override
    public void clear() {
        super.clear();
        if (taskStatusReqQueue != null) {
            taskStatusReqQueue.clear();
        }
        if (taskBlockingQueue != null) {
            taskBlockingQueue.clear();
        }
        if (taskDispatchReqHandler != null) {
            taskDispatchReqHandler.clear();
        }
        if (taskStatusReqBatchHandler != null) { 
            taskStatusReqBatchHandler.clear();
        }
        if (taskProgressMap != null) {
            taskProgressMap.clear();
        }
        if (workerProgressMap != null) {
            workerProgressMap.clear(); 
        }
        if (taskResultMap != null) {
            taskResultMap.clear();
        }
        if (taskStatusMap != null) {
            taskStatusMap.clear();
        }
        clearTasks(jobInstanceInfo.getJobInstanceId());
        taskCounter.set(0);
        setStartStatusCheck(false);
    }

    /**
     * Getter method for property taskProgressMap.
     *
     * @return property value of taskProgressMap
     */
    public Map getTaskProgressMap() {
        return taskProgressMap;
    }

    public synchronized List syncPullTasks(long serialNum, int pageSize, String workerIdAddr) {
        if (getTotalPulledAndRunning() >= xAttrs.getGlobalConsumerSize()) {
            return Lists.newArrayList();
        } else {
            if (this.getSerialNum() == serialNum) {
                return taskDispatchReqHandler.syncHandleReqs(pageSize, workerIdAddr);
            } else {
                return Lists.newArrayList();
            }
        }
    }

    @Override
    protected void checkProcessor() throws Exception {
        if ("java".equalsIgnoreCase(jobInstanceInfo.getJobType())) {
            JavaProcessorProfile profile = JsonUtil.fromJson(jobInstanceInfo.getContent(), JavaProcessorProfile.class);
            if (!JobProcessorUtil.checkJavaProcessor(profile.getClassName(), MapJobProcessor.class)) {
                throw new IOException(profile.getClassName() + " must extends MapJobProcessor or MapReduceJobProcessor");
            }
            if (JobProcessorUtil.checkJavaProcessor(profile.getClassName(), MapReduceJobProcessor.class)){
                this.needReduce = true;
            }
        }
    }

    @Override
    public synchronized void handleWorkerShutdown(String workerIdAddr, boolean withFailover) {
        this.existInvalidWorker = true;
        this.invalidWorkerSet.add(workerIdAddr);
        if (!aliveCheckWorkerSet.contains(workerIdAddr)) {
            return;
        }
        String[] workerInfo = workerIdAddr.split("@");
        String workerAddr = workerInfo[1];
        String workerId = workerInfo[0];
        aliveCheckWorkerSet.remove(workerIdAddr);
        jobInstanceInfo.getAllWorkers().remove(workerIdAddr);
        // adjust dispatch batch size
        taskDispatchReqHandler.setDispatchSize(aliveCheckWorkerSet.size() * pageSize);

//        boolean isSecondTypeJob = JobUtil.isSecondTypeJob(TimeType.parseValue(jobInstanceInfo.getTimeType()));
        if (withFailover && (xAttrs != null && xAttrs.isFailover())) {
            // 如果启用failover,置为init状态等待重新拉取
            int affectCnt = taskPersistence.batchUpdateTaskStatus(jobInstanceInfo.getJobInstanceId(), TaskStatus.INIT, workerId, workerAddr);
            LOGGER.warn("jobInstanceId={}, failover task number:{}, workerId:{}, workerAddr:{}",
                jobInstanceInfo.getJobInstanceId(), affectCnt, workerId, workerAddr);
            if (affectCnt > 0) {
                // recover counter
                workerProgressMap.get(workerAddr).decRunningAndTotal(affectCnt);
            }
        } else {
            // 如果不启用failover,直接把这台worker上的子任务置为失败
            int affectCnt = taskPersistence.batchUpdateTaskStatus(jobInstanceInfo.getJobInstanceId(), TaskStatus.FAILED, workerId, workerAddr);
            LOGGER.warn("jobInstanceId={}, worker shutdown, failed task number:{}, workerId:{}, workerAddr:{}",
                jobInstanceInfo.getJobInstanceId(), affectCnt, workerId, workerAddr);
            if (affectCnt > 0) {
                workerProgressMap.get(workerAddr).incrementFailed(affectCnt);
                // taskProgress数据会不准确
            }
        }
    }

	public boolean isStartStatusCheck() {
		return startStatusCheck;
	}

	public void setStartStatusCheck(boolean startStatusCheck) {
		this.startStatusCheck = startStatusCheck;
	}

    /**
     * 判断是否采用Worker负载最优路由
     * @return
     */
    protected boolean isWorkerLoadRouter(){
        Integer routerStrategy = ConfigUtil.getWorkerConfig().getInteger(WorkerConstants.MAP_MASTER_ROUTER_STRATEGY, null);
        boolean enableShareContainerPool = WorkerConfigUtil.isEnableShareContainerPool();
        return enableShareContainerPool && ((xAttrs != null && RouteStrategyEnum.WORKER_LOAD.getValue().equals(xAttrs.getRouteType()))
                || RouteStrategyEnum.WORKER_LOAD.getValue().equals(routerStrategy));
    }

    /**
     * 解析分发速率
     * @return
     */
    protected Long parseDispatchSpeed() {
        // 解析分发速率
        String dispatchSpeed = xAttrs.getDispatchSpeed();
        Long dispatchDelay = null;
        if (StringUtils.isNotEmpty(dispatchSpeed)) {
            Integer speed = null;
            TimeUnit timeUnit = TimeUnit.MILLISECONDS;
            if (NumberUtils.isNumber(dispatchSpeed)) {
                speed = NumberUtils.toInt(dispatchSpeed);
            } else {
                String[] arr = dispatchSpeed.split("/");
                if (arr != null && arr.length == 2) {
                    if (NumberUtils.isNumber(arr[0])) {
                        speed = NumberUtils.toInt(arr[0]);
                    }
                    if (StringUtils.isNotEmpty(arr[1])) {
                        switch (StringUtils.upperCase(arr[1])) {
                            case "S":
                                timeUnit = TimeUnit.SECONDS;
                                break;
                            case "M":
                                timeUnit = TimeUnit.MINUTES;
                                break;
                            case "H":
                                timeUnit = TimeUnit.HOURS;
                                break;
                        }
                    }
                }
            }
            if (speed != null && speed > 0) {
                dispatchDelay = TimeUnit.MILLISECONDS.convert(1, timeUnit)/speed;
            }
        }
        return dispatchDelay;
    }
}