All downloads are free. The search and download functionalities use the official Maven repository.

com.alibaba.schedulerx.worker.master.StreamTaskMaster Maven / Gradle / Ivy

There is a newer version: 1.12.2
Show newest version
package com.alibaba.schedulerx.worker.master;

import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.Socket;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.collections.MapUtils;
import org.apache.commons.lang.StringUtils;
import org.joda.time.DateTime;

import com.alibaba.schedulerx.common.domain.InstanceStatus;
import com.alibaba.schedulerx.common.domain.JobInstanceInfo;
import com.alibaba.schedulerx.common.domain.LimitedQueue;
import com.alibaba.schedulerx.common.domain.MapTaskXAttrs;
import com.alibaba.schedulerx.common.domain.Metrics;
import com.alibaba.schedulerx.common.domain.StreamJobProgress;
import com.alibaba.schedulerx.common.domain.StreamJobProgressDetail;
import com.alibaba.schedulerx.common.domain.TaskDispatchMode;
import com.alibaba.schedulerx.common.domain.TaskProgressCounter;
import com.alibaba.schedulerx.common.domain.TaskStatus;
import com.alibaba.schedulerx.common.domain.TimeType;
import com.alibaba.schedulerx.common.domain.WorkerProgressCounter;
import com.alibaba.schedulerx.common.domain.enums.RouteStrategyEnum;
import com.alibaba.schedulerx.common.util.ConfigUtil;
import com.alibaba.schedulerx.common.util.ExceptionUtil;
import com.alibaba.schedulerx.common.util.HessianUtil;
import com.alibaba.schedulerx.common.util.IdUtil;
import com.alibaba.schedulerx.common.util.JobUtil;
import com.alibaba.schedulerx.common.util.JsonUtil;
import com.alibaba.schedulerx.protocol.Worker;
import com.alibaba.schedulerx.protocol.Worker.ContainerReportTaskStatusRequest;
import com.alibaba.schedulerx.protocol.Worker.MasterStartContainerRequest;
import com.alibaba.schedulerx.protocol.utils.FutureUtils;
import com.alibaba.schedulerx.worker.SchedulerxWorker;
import com.alibaba.schedulerx.worker.batch.ReqQueue;
import com.alibaba.schedulerx.worker.batch.StreamTaskPushReqHandler;
import com.alibaba.schedulerx.worker.batch.TMStatusReqHandler;
import com.alibaba.schedulerx.worker.domain.JavaProcessorProfile;
import com.alibaba.schedulerx.worker.domain.JobContext;
import com.alibaba.schedulerx.worker.domain.TaskInfo;
import com.alibaba.schedulerx.worker.domain.WorkerConstants;
import com.alibaba.schedulerx.worker.log.LogFactory;
import com.alibaba.schedulerx.worker.log.Logger;
import com.alibaba.schedulerx.worker.logcollector.ClientLoggerMessage;
import com.alibaba.schedulerx.worker.logcollector.LogCollector;
import com.alibaba.schedulerx.worker.logcollector.LogCollectorFactory;
import com.alibaba.schedulerx.worker.master.persistence.H2FilePersistence;
import com.alibaba.schedulerx.worker.master.persistence.TaskPersistence;
import com.alibaba.schedulerx.worker.metrics.WorkerLoadRegister;
import com.alibaba.schedulerx.worker.processor.JobProcessor;
import com.alibaba.schedulerx.worker.processor.ProcessResult;
import com.alibaba.schedulerx.worker.processor.StreamJobProcessor;
import com.alibaba.schedulerx.worker.route.Router;
import com.alibaba.schedulerx.worker.route.RouterFactory;
import com.alibaba.schedulerx.worker.util.ActorPathUtil;
import com.alibaba.schedulerx.worker.util.ContanerUtil;
import com.alibaba.schedulerx.worker.util.JobProcessorUtil;
import com.alibaba.schedulerx.worker.util.WorkerConfigUtil;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.protobuf.ByteString;

import akka.actor.ActorContext;
import akka.actor.ActorSelection;

/**
 * StreamTaskMaster: the TaskMaster implementation for streaming jobs. A producer
 * thread continuously emits batches of sub-tasks; each batch is dispatched to
 * workers, tracked in {@code streamJobProgressMap}, and (optionally) reduced
 * once the batch finishes.
 *
 * NOTE(review): several generic type parameters in the declarations below
 * (e.g. on Map, ReqQueue, LimitedQueue) appear to have been stripped during
 * source extraction — confirm against the original file.
 *
 * @author yaohui
 * @create 2023/5/18 11:19 AM
 **/
public class StreamTaskMaster extends TaskMaster {

    private static final Logger LOGGER = LogFactory.getLogger(StreamTaskMaster.class);

    // Rolling index; not referenced in this chunk — presumably used for routing
    // in code outside this view. TODO confirm.
    private volatile int index = 0;

    /**
     * Buffer queue of child tasks awaiting dispatch to workers.
     */
    protected ReqQueue taskBlockingQueue;

    // Pushes buffered tasks to workers, honoring the global consumer limit.
    protected StreamTaskPushReqHandler taskDispatchReqHandler;

    // Producer thread created by createProduceTask(); its liveness is checked
    // by the status-check thread in submitInstance().
    protected Thread streamProduceThread;

    private LogCollector logCollector = LogCollectorFactory.get();

    // Retains the 10 most recently finished batch progress details.
    private LimitedQueue streamJobProgressHistory = new LimitedQueue<>(10);

    // batchNo -> StreamJobProgressDetail for in-flight batches; entries are
    // removed by the status-check thread once a batch completes.
    private Map streamJobProgressMap = Maps.newConcurrentMap();

    protected Router router;

    // Buffer queue for task status reports coming back from workers.
    protected ReqQueue taskStatusReqQueue;

    // Drains taskStatusReqQueue and feeds batchUpdateTaskStatues().
    protected TMStatusReqHandler taskStatusReqBatchHandler;

    // Local (H2 file) persistence of task state, used for failover re-dispatch.
    protected TaskPersistence taskPersistence;

    // Parsed job xattrs (queue size, consumer size, route type, dispatch mode).
    protected MapTaskXAttrs xAttrs = null;

    protected StreamJobProcessor streamJobProcessor;

    // batchNo -> (taskId -> result); populated only when the processor needs reduce.
    private Map> taskResultMap = Maps.newHashMap();

    // batchNo -> (taskId -> final TaskStatus); populated only when the processor needs reduce.
    private Map> taskStatusMap = Maps.newHashMap();

    // Aggregated progress counters across all batches (root task excluded).
    private TaskProgressCounter totalCounter = new TaskProgressCounter("TotalCounter");


    /**
     * Creates a StreamTaskMaster for one job instance.
     *
     * @param jobInstanceInfo the job instance to execute (content, xattrs, workers, ...)
     * @param actorContext    akka actor context used for worker communication
     * @throws Exception if persistence initialization, processor loading or
     *                   xattrs JSON parsing fails
     */
    public StreamTaskMaster(JobInstanceInfo jobInstanceInfo, ActorContext actorContext) throws Exception {
        super(jobInstanceInfo, actorContext);
        // Task state is persisted in a local H2 file database (used by the
        // failover pull thread started in init()).
        this.taskPersistence = H2FilePersistence.getInstance();
        this.taskPersistence.initTable();
        this.streamJobProcessor = (StreamJobProcessor)JobProcessorUtil.getJavaProcessor(jobInstanceInfo.getContent());
        if (jobInstanceInfo.getXattrs() != null) {
            this.xAttrs = JsonUtil.fromJson(jobInstanceInfo.getXattrs(), MapTaskXAttrs.class);
        }
    }

    /**
     * Adds {@code delta} to the total counter of the given batch, lazily creating
     * the per-batch progress detail on first use. Tasks other than the root task
     * are also counted into the instance-wide {@code totalCounter}.
     *
     * Fixes two races in the original double-checked logic:
     * 1) when another thread created the entry between the two checks, the inner
     *    check silently returned and this call's {@code delta} was lost;
     * 2) the unsynchronized containsKey/get pair could NPE when the status-check
     *    thread removed a finished batch between the two calls.
     *
     * @param batchNo  batch serial number
     * @param taskName task name (root task name excluded from totalCounter)
     * @param delta    number of tasks to add to the batch total
     */
    private void initTaskProgress(Long batchNo, String taskName, int delta) {
        StreamJobProgressDetail detail = streamJobProgressMap.get(batchNo);
        if (detail == null) {
            synchronized (this) {
                detail = streamJobProgressMap.get(batchNo);
                if (detail == null) {
                    // First task of this batch: create the counter pre-charged with delta.
                    TaskProgressCounter taskProgressCounter = new TaskProgressCounter(taskName);
                    taskProgressCounter.incrementTotal(delta);
                    streamJobProgressMap.put(batchNo,
                            new StreamJobProgressDetail(batchNo, DateTime.now().getMillis(), taskProgressCounter));
                }
            }
        }
        if (detail != null) {
            // Existing batch (or lost the creation race): count delta exactly once.
            detail.getTaskProgressCounter().incrementTotal(delta);
        }
        if (!WorkerConstants.MAP_TASK_ROOT_NAME.equals(taskName)) {
            totalCounter.incrementTotal(delta);
        }
    }


    /**
     * Handles a batched status report from a worker: records the reporting
     * worker's load metrics (keyed by "workerId@workerAddr"), then delegates
     * the actual status processing to the parent implementation.
     *
     * @param request batched task status report from one worker
     * @throws Exception propagated from the parent handler
     */
    @Override
    public void batchUpdateTaskStatus(Worker.ContainerBatchReportTaskStatuesRequest request) throws Exception {
        final String workerIdAddr = String.format("%s@%s", request.getWorkerId(), request.getWorkerAddr());
        this.setWorkerLoad(workerIdAddr, request.getMetricsJson(), null);
        super.batchUpdateTaskStatus(request);
    }

    /**
     * Enqueues a single task status report into the batch-report queue.
     * Submission failures are logged and swallowed so one bad report cannot
     * break the reporting path.
     *
     * @param request the task status report to buffer
     */
    @Override
    public void updateTaskStatus(ContainerReportTaskStatusRequest request) {
        try {
            this.taskStatusReqQueue.submitRequest(request);
        } catch (Throwable t) {
            LOGGER.error("", t);
        }
    }

    /**
     * Batch-updates the status of child tasks reported by workers: collapses
     * intermediate reports per task id, updates per-batch / per-worker / global
     * progress counters, collects results for the batch-level reduce (when the
     * processor needs one), and persists the final statuses with up to 3 retries.
     * The whole instance is failed if persistence never succeeds.
     *
     * @param requests status reports drained from the status queue
     */
    @Override
    public void batchUpdateTaskStatues(List requests) {
        Map finalTaskStatus = Maps.newHashMap();
        for (ContainerReportTaskStatusRequest request : requests) {
            try {
                TaskStatus taskStatus = TaskStatus.parseValue(request.getStatus());

                // Filter out intermediate statuses: keep the first report per task,
                // but a finished status always overwrites it.
                if(!finalTaskStatus.containsKey(request.getTaskId()) || taskStatus.isFinish()){
                    finalTaskStatus.put(request.getTaskId(), request);
                }

                String workerAddr = request.getWorkerAddr();
                LOGGER.debug("report task status:{} from worker:{}, uniqueId:{}", taskStatus.getDescription(),
                        workerAddr, IdUtil.getUniqueId(request.getJobId(), request.getJobInstanceId(), request.getTaskId()));

                // NOTE(review): this NPEs (and fails the instance via the catch below)
                // if the batch has already been completed and removed from the map —
                // confirm late reports cannot arrive after batch cleanup.
                StreamJobProgressDetail streamJobProgressDetail = this.streamJobProgressMap.get(request.getSerialNum());
                TaskProgressCounter taskProgressCounter = streamJobProgressDetail.getTaskProgressCounter();
                streamJobProgressDetail.setStatus(TaskStatus.RUNNING.getValue());

                Map workerProgressMap = streamJobProgressDetail.getWorkerProgressMap();
                if (taskStatus.equals(TaskStatus.RUNNING)) {
                    taskProgressCounter.incrementRunning();
                    totalCounter.incrementRunning();
                    if (workerAddr != null) {
                        workerProgressMap.get(workerAddr).incrementRunning();
                    }
                } else if (taskStatus.equals(TaskStatus.SUCCESS)) {
                    taskProgressCounter.incrementSuccess();
                    totalCounter.incrementSuccess();
                    // Free the dispatch slot held by this task.
                    this.taskDispatchReqHandler.release();
                    if (workerAddr != null) {
                        workerProgressMap.get(workerAddr).incrementSuccess();
                    }
                } else if (taskStatus.equals(TaskStatus.FAILED)) {
                    taskProgressCounter.incrementFailed();
                    totalCounter.incrementFailed();
                    this.taskDispatchReqHandler.release();
                    if (workerAddr != null) {
                        workerProgressMap.get(workerAddr).incrementFailed();
                        if (StringUtils.isNotBlank(request.getTraceId())) {
                            workerProgressMap.get(workerAddr).setTraceId(request.getTraceId());
                        }
                    }
                }

                if(TaskStatus.FAILED.equals(taskStatus)) {
                    String uniqueIdWithoutTask = IdUtil.getUniqueIdWithoutTask(jobInstanceInfo.getJobId(), jobInstanceInfo.getJobInstanceId());
                    LOGGER.info("jobInstanceId={}, taskId={}, report status failed. result:{}", jobInstanceInfo.getJobInstanceId(), request.getTaskId(), request.getResult());
                    logCollector.collect(jobInstanceInfo.getAppGroupId(), uniqueIdWithoutTask,
                        ClientLoggerMessage.JOB_PROCESSOR_EXEC_FAIL + request.getTaskId()+", "+request.getResult(), jobInstanceInfo.getGroupId());
                }

                // Record the task result/status for the batch-level reduce, if needed.
                if (this.streamJobProcessor.needReduce() && taskStatus.isFinish()) {
                    Map batchTaskResultMap = taskResultMap.get(request.getSerialNum());
                    if (batchTaskResultMap == null) {
                        synchronized (taskResultMap) {
                            batchTaskResultMap = taskResultMap.get(request.getSerialNum());
                            if (batchTaskResultMap == null) {
                                batchTaskResultMap = new HashMap<>();
                                taskResultMap.put(request.getSerialNum(), batchTaskResultMap);
                            }
                        }
                    }
                    batchTaskResultMap.put(request.getTaskId(), request.getResult());
                    Map batchTaskStatusMap = taskStatusMap.get(request.getSerialNum());
                    if (batchTaskStatusMap == null) {
                        synchronized (taskStatusMap) {
                            batchTaskStatusMap = taskStatusMap.get(request.getSerialNum());
                            if (batchTaskStatusMap == null) {
                                batchTaskStatusMap = new HashMap<>();
                                taskStatusMap.put(request.getSerialNum(), batchTaskStatusMap);
                            }
                        }
                    }
                    batchTaskStatusMap.put(request.getTaskId(), taskStatus);
                }

            } catch (Throwable e) {
                LOGGER.error("jobInstanceId={}, batchNo={}, taskId={}, update progressMap error.", jobInstanceInfo.getJobInstanceId(),
                        request.getSerialNum(), request.getTaskId(), e);
                updateNewInstanceStatus(getSerialNum(), InstanceStatus.FAILED, "update progressMap error."+e.getMessage());
            }
        }

        try {
            long startTime = System.currentTimeMillis();
            boolean updateSuccess = false;
            for (int i=0 ; i<3; i++) {
                // try 3 times
                try {
                    taskPersistence.updateTaskStatues(Lists.newArrayList(finalTaskStatus.values()));
                    updateSuccess = true;
                    break;
                } catch (Throwable t) {
                    // FIX: the jobInstanceId argument was missing here, so the throwable
                    // was consumed by the {} placeholder and its stack trace was lost.
                    LOGGER.error("jobInstanceId={}, persistent batch updateTaskStatus error.", jobInstanceInfo.getJobInstanceId(), t);
                }
            }
            // Persistence failed after all retries: fail the whole instance.
            if (!updateSuccess) {
                updateNewInstanceStatus(getSerialNum(), InstanceStatus.FAILED, "persistent batch update TaskStatus error up to 3 times");
            }
            LOGGER.debug("{} batch update status db cost:{}", jobInstanceInfo.getJobInstanceId(),
                    System.currentTimeMillis() - startTime);
        } catch (Throwable e) {
            LOGGER.error("jobInstanceId={}, batch updateTaskStatus error.", jobInstanceInfo.getJobInstanceId(), e);
        }
    }

    /**
     * One-time initialization: builds the dispatch/status queues and handlers,
     * loads the routing strategy, and starts two daemon-style background threads
     * (worker alive checking and failover task pulling). Idempotent via INITED.
     */
    @Override
    protected void init() {
        // NOTE(review): INITED is checked-then-set without synchronization;
        // presumably init() is only invoked from a single actor thread — confirm.
        if (INITED) {
            return;
        }
        INITED = true;
        // Initialize the task dispatch buffer queue (capacity from job xattrs).
        int queueSize = xAttrs.getQueueSize();
        taskBlockingQueue = new ReqQueue<>(jobInstanceInfo.getJobInstanceId(), queueSize);
        taskBlockingQueue.init();

        // Queue + batch handler for task status reports coming back from workers.
        taskStatusReqQueue = new ReqQueue<>(jobInstanceInfo.getJobInstanceId(), 10 * 10000);
        taskStatusReqQueue.init();
        taskStatusReqBatchHandler = new TMStatusReqHandler<>(jobInstanceInfo.getJobInstanceId(), 1,
                1, 3000, taskStatusReqQueue);

        // Configure the global consumer concurrency for task dispatch.
        int globalConsumerSize = xAttrs.getGlobalConsumerSize();
        taskDispatchReqHandler = new StreamTaskPushReqHandler<>(jobInstanceInfo.getJobInstanceId(), globalConsumerSize,
                jobInstanceInfo.getAllWorkers().size(), taskBlockingQueue);

        boolean enableShareContainerPool = WorkerConfigUtil.isEnableShareContainerPool();
        // Routing strategy: the configured route type only applies in shared
        // container pool mode; otherwise round-robin is forced.
        if (enableShareContainerPool) {
            router = RouterFactory.getRouter(jobInstanceInfo.getAppGroupId(), jobInstanceInfo.getJobId(),
                    xAttrs.getRouteType(), jobInstanceInfo.getRouteStrategyContent());
        } else {
            router = RouterFactory.getRouter(jobInstanceInfo.getAppGroupId(), jobInstanceInfo.getJobId(),
                    RouteStrategyEnum.ROUND_ROBIN.getValue(), jobInstanceInfo.getRouteStrategyContent());
        }
        // Clear any worker-load state the router may have kept from a previous run.
        if (this.router != null && this.router instanceof WorkerLoadRegister) {
            ((WorkerLoadRegister) this.router).clear();
        }

        final String jobIdAndInstanceId = jobInstanceInfo.getJobId() + "_" + jobInstanceInfo.getJobInstanceId();
        // Worker alive check thread: first probes each worker's socket, then asks
        // the worker whether this job instance is still alive; unreachable or
        // dead workers are removed and their containers destroyed.
        new Thread(new Runnable() {
            @Override
            public void run() {
                while (!instanceStatus.isFinish()) {
                    try {
                        for (String workerIdAddr : aliveCheckWorkerSet) {
                            try {
                                // workerIdAddr format: "workerId@host:port".
                                String workerAddr = workerIdAddr.split("@")[1];
                                String tokens[] = workerAddr.split(":");
                                String host = tokens[0];
                                int port = Integer.valueOf(tokens[1]);
                                int times = 0;
                                // Up to 3 TCP connect attempts, 5s timeout each, 5s backoff.
                                while (times < 3) {
                                    Socket socket = new Socket();
                                    try {
                                        socket.connect(new InetSocketAddress(host, port), 5000);
                                        LOGGER.info("socket to {}:{} is reachable, times={}", host, port, times);
                                        break;
                                    } catch (Exception e) {
                                        LOGGER.info("socket to {}:{} is not reachable, times={}", host, port, times);
                                        Thread.sleep(5000);
                                        times++;
                                    } finally {
                                        if (socket != null) {
                                            socket.close();
                                        }
                                    }
                                }
                                if (times >= 3) {
                                    // Socket never connected: consider the worker dead.
                                    LOGGER.warn("worker[{}] is down, start to remove this worker and failover tasks, jobInstanceId={}",
                                            workerIdAddr, jobInstanceInfo.getJobInstanceId());
                                    handleWorkerShutdown(workerIdAddr, true);
                                    continue;
                                }
                                final long startTime = System.currentTimeMillis();
                                // Application-level liveness check via the worker's heartbeat actor.
                                ActorSelection selection = getActorContext().actorSelection(
                                        ActorPathUtil.getWorkerHeartbeatRouterPath(workerIdAddr));
                                Worker.MasterCheckWorkerAliveRequest request = Worker.MasterCheckWorkerAliveRequest.newBuilder()
                                        .setJobInstanceId(jobInstanceInfo.getJobInstanceId())
                                        .setDispatchMode(xAttrs.getTaskDispatchMode())
                                        .build();
                                Worker.MasterCheckWorkerAliveResponse response = (Worker.MasterCheckWorkerAliveResponse)
                                        FutureUtils.awaitResult(selection, request, 10);
                                if (!response.getSuccess()) {
                                    LOGGER.warn("jobInstanceId={} of worker={} is not alive", jobInstanceInfo.getJobInstanceId(),
                                            workerIdAddr, response.getMessage());
                                    handleWorkerShutdown(workerIdAddr, true);

                                    // destroy containers of worker of PullModel
                                    Worker.MasterDestroyContainerPoolRequest destroyContainerPoolRequest = Worker.MasterDestroyContainerPoolRequest.newBuilder()
                                            .setJobInstanceId(jobInstanceInfo.getJobInstanceId())
                                            .setJobId(jobInstanceInfo.getJobId())
                                            .setWorkerIdAddr(workerIdAddr)
                                            .setSerialNum(getSerialNum())
                                            .build();
                                    SchedulerxWorker.AtLeastDeliveryRoutingActor.tell(destroyContainerPoolRequest, null);
                                } else {
                                    // Record the worker's current load and round-trip latency.
                                    setWorkerLoad(workerIdAddr, response.getMetricsJson(), System.currentTimeMillis()-startTime);
                                }
                            } catch (Exception e) {
                                // An exception here (even while the socket probe succeeds)
                                // could otherwise leave the job stuck, so the worker is
                                // removed and its tasks failed over.
                                LOGGER.error("Alive worker check failed.", e);
                                handleWorkerShutdown(workerIdAddr, true);
                                // destroy containers of worker of PullModel
                                Worker.MasterDestroyContainerPoolRequest destroyContainerPoolRequest = Worker.MasterDestroyContainerPoolRequest.newBuilder()
                                        .setJobInstanceId(jobInstanceInfo.getJobInstanceId())
                                        .setJobId(jobInstanceInfo.getJobId())
                                        .setWorkerIdAddr(workerIdAddr)
                                        .setSerialNum(getSerialNum())
                                        .build();
                                SchedulerxWorker.AtLeastDeliveryRoutingActor.tell(destroyContainerPoolRequest, null);
                            }
                        }
                        // Probe worker liveness every 10 seconds.
                        Thread.sleep(10000);
                    } catch (Throwable e) {
                        LOGGER.error("check worker error, jobInstanceId={}", jobInstanceInfo.getJobInstanceId(), e);
                    }
                }
            }
        }, "Schedulerx-StreamTaskMaster-check-worker-alive-thread-" + jobIdAndInstanceId).start();


        // Failover pull thread: re-dispatches tasks still present in the local
        // persistence store (presumably tasks returned by worker-shutdown
        // handling — confirm) back into the dispatch queue.
        new Thread(new Runnable() {
            @Override
            public void run() {
                int pageSize = ConfigUtil.getWorkerConfig().getInt(WorkerConstants.MAP_MASTER_PAGE_SIZE, WorkerConstants.MAP_MASTER_PAGE_SIZE_DEFAULT);
                while (!instanceStatus.isFinish()) {
                    try {
                        List taskInfos;
                        long startTime = System.currentTimeMillis();
                        taskInfos = taskPersistence.pull(jobInstanceInfo.getJobInstanceId(), pageSize);
                        LOGGER.debug("jobInstanceId={}, pull cost={}ms", jobInstanceInfo.getJobInstanceId(),
                                (System.currentTimeMillis() - startTime));
                        if (taskInfos.isEmpty()) {
                            LOGGER.debug("pull task empty of jobInstanceId={}, sleep 10000 ms ...",
                                    jobInstanceInfo.getJobInstanceId());
                            Thread.sleep(10 * 1000);
                        } else {
                            LOGGER.info("jobInstanceId={}, failover retry dispatch taskList, size:{} , cost={}ms",
                                    jobInstanceInfo.getJobInstanceId(), taskInfos.size(), System.currentTimeMillis() - startTime);
                            for (TaskInfo taskInfo : taskInfos) {
                                ByteString taskBody = null;
                                if (taskInfo.getTaskBody() != null) {
                                    taskBody = ByteString.copyFrom(taskInfo.getTaskBody());
                                }
                                // Rebuild the start-container request and re-enqueue it
                                // under its original batch number.
                                MasterStartContainerRequest.Builder builder = convert2StartContainerRequestBuilder(jobInstanceInfo,
                                        taskInfo.getTaskId(), taskInfo.getTaskName(), taskBody, true);
                                builder.setSerialNum(taskInfo.getBatchNo());
                                taskBlockingQueue.submitRequest(builder.build());
                            }
                        }
                    } catch (TimeoutException te) {
                        LOGGER.error("pull task timeout, uniqueId:{}", jobIdAndInstanceId, te);
                        logCollector.collect(jobInstanceInfo.getAppGroupId(), jobIdAndInstanceId, ClientLoggerMessage.MAP_INSTANCE_PULL_JOB_FAIL, 
                            te, jobInstanceInfo.getGroupId());
                        try {
                            Thread.sleep(10 * 1000);
                        } catch (InterruptedException e) {
                        }
                    } catch (Throwable e) {
                        // Any other pull failure fails the whole instance.
                        updateNewInstanceStatus(getSerialNum(), InstanceStatus.FAILED, ExceptionUtil.getMessage(e));
                        logCollector.collect(jobInstanceInfo.getAppGroupId(), jobIdAndInstanceId, ClientLoggerMessage.MAP_INSTANCE_PULL_JOB_FAIL, 
                            e, jobInstanceInfo.getGroupId());
                        LOGGER.error("pull task error, uniqueId:{}", jobIdAndInstanceId, e);
                    }
                }
            }
        }, "Schedulerx-StreamTaskMaster-pull-thread-" + jobIdAndInstanceId).start();
    }

    /**
     * Entry point for running the job instance: initializes the master, starts
     * the dispatch/status handlers, creates the root produce task, and spawns
     * the status-check thread (batch completion + reduce) and, for non-second
     * jobs, a progress-report thread. Any failure fails the instance.
     *
     * @param jobInstanceInfo the job instance to run
     * @throws Exception declared by the interface; failures are handled internally
     */
    @Override
    public void submitInstance(final JobInstanceInfo jobInstanceInfo) throws Exception {
        try {
            // Initialize queues, handlers, routing and background threads.
            init();

            // Start the task dispatch and status batch handlers.
            taskDispatchReqHandler.start();
            taskStatusReqBatchHandler.start();

            // Create the root (produce) task, which streams out task batches.
            createProduceTask();

            final String jobIdAndInstanceId = jobInstanceInfo.getJobId() + "_" + jobInstanceInfo.getJobInstanceId();

            // Status check thread: detects finished batches, runs reduce when the
            // processor requires it, and archives per-batch progress.
            new Thread(new Runnable() {
                @Override
                public void run() {
                    int checkInterval = ConfigUtil.getWorkerConfig().getInt(WorkerConstants.Map_MASTER_STATUS_CHECK_INTERVAL,
                            WorkerConstants.Map_MASTER_STATUS_CHECK_INTERVAL_DEFAULT);
                    while (!instanceStatus.isFinish()) {
                        try {
                            Thread.sleep(checkInterval);
                            if (MapUtils.isNotEmpty(streamJobProgressMap)) {
                                // Check each in-flight batch for completion.
                                for (Map.Entry entry : streamJobProgressMap.entrySet()) {
                                    boolean allTasksPushed = taskDispatchReqHandler.allTasksPushed(entry.getKey());
                                    StreamJobProgressDetail streamJobProgressDetail = entry.getValue();
                                    TaskProgressCounter taskProgressCounter = streamJobProgressDetail.getTaskProgressCounter();
                                    // A batch is a candidate for completion when all its tasks
                                    // were pushed or every task reached a terminal count.
                                    if (allTasksPushed || taskProgressCounter.getTotal() <= (taskProgressCounter.getFailed() + taskProgressCounter.getSuccess())) {
                                        InstanceStatus newStatus = taskPersistence.checkInstanceStatus(jobInstanceInfo.getJobInstanceId(), entry.getKey());
                                        if (newStatus.isFinish()) {
                                            ProcessResult processResult = new ProcessResult(true);
                                            // The root task's batch never runs reduce.
                                            if (entry.getKey() > 0 && streamJobProcessor.needReduce()) {
                                                try {
                                                    JobContext context = JobContext.newBuilder()
                                                            .setJobId(jobInstanceInfo.getJobId())
                                                            .setJobInstanceId(jobInstanceInfo.getJobInstanceId())
                                                            .setJobType(jobInstanceInfo.getJobType())
                                                            .setContent(jobInstanceInfo.getContent())
                                                            .setScheduleTime(jobInstanceInfo.getScheduleTime())
                                                            .setDataTime(jobInstanceInfo.getDataTime())
                                                            .setJobParameters(jobInstanceInfo.getParameters())
                                                            .setInstanceParameters(jobInstanceInfo.getInstanceParameters())
                                                            .setUser(jobInstanceInfo.getUser())
                                                            .setTaskResults(taskResultMap.get(entry.getKey()))
                                                            .setTaskStatuses(taskStatusMap.get(entry.getKey()))
                                                            .setSerialNum(entry.getKey())
                                                            .build();
                                                    processResult = streamJobProcessor.reduce(context);
                                                    if (processResult == null) {
                                                        processResult = new ProcessResult(false, "Reduce can not return NULL.");
                                                    }
                                                } catch (Throwable t) {
                                                    LOGGER.error("Stream job jobId={} jobInstanceId={} batchNo={} reduce exception.", jobInstanceInfo.getJobId(),
                                                            jobInstanceInfo.getJobInstanceId(), entry.getKey(), t);
                                                    processResult = new ProcessResult(false, t.getMessage());
                                                }
                                            }

                                            if (InstanceStatus.FAILED.equals(processResult.getStatus())) {
                                                LOGGER.error("Stream job jobId={} jobInstanceId={} batchNo={} reduce failed. Result:{}", jobInstanceInfo.getJobId(),
                                                        jobInstanceInfo.getJobInstanceId(), entry.getKey(), processResult.getResult());
                                            }

                                            // The batch is done: mark it FAILED if any task failed
                                            // or reduce failed, otherwise SUCCESS.
                                            long failedCount = streamJobProgressDetail.getTaskProgressCounter().getFailed();
                                            if (failedCount > 0 || InstanceStatus.FAILED.equals(processResult.getStatus())) {
                                                streamJobProgressDetail.setStatus(InstanceStatus.FAILED.getValue());
                                            } else {
                                                streamJobProgressDetail.setStatus(InstanceStatus.SUCCESS.getValue());
                                            }
                                            streamJobProgressDetail.setEndTime(DateTime.now().getMillis());
                                            streamJobProgressHistory.add(streamJobProgressDetail);
                                            streamJobProgressMap.remove(entry.getKey());

                                            // Clean up intermediate reduce inputs for this batch.
                                            taskStatusMap.remove(entry.getKey());
                                            taskResultMap.remove(entry.getKey());
                                        }
                                    }
                                }
                            } else {
                                // No in-flight batches: if the producer thread has died,
                                // the root task is over — fail/close the instance.
                                if (!streamProduceThread.isAlive()) {
                                    String result = SchedulerxWorker.INITED?"Produce task is stopped.":"Worker master shutdown.";
                                    updateNewInstanceStatus(getSerialNum(), jobInstanceInfo.getJobInstanceId(), InstanceStatus.FAILED, result);
                                }
                            }
                        } catch (Throwable e) {
                            LOGGER.error("status check error, uniqueId:{}", jobIdAndInstanceId, e);
                        }
                    }
                }
            }, "Schedulerx-StreamTaskMaster-status-check-thread-" + jobIdAndInstanceId).start();

            // Job instance progress report thread (skipped for second-type jobs,
            // which report through a different path — see JobUtil.isSecondTypeJob).
            if(!JobUtil.isSecondTypeJob(TimeType.parseValue(jobInstanceInfo.getTimeType()))) {
                new Thread(new Runnable() {
                    @Override
                    public void run() {
                        while (!instanceStatus.isFinish()) {
                            Worker.WorkerReportJobInstanceProgressRequest request = Worker.WorkerReportJobInstanceProgressRequest.newBuilder().setJobId(jobInstanceInfo.getJobId()).setJobInstanceId(
                                    jobInstanceInfo.getJobInstanceId()).setProgress(getJobInstanceProgress()).build();
                            SERVER_DISCOVERY.getMapMasterRouter().tell(request, null);
                            try {
                                // Report progress every 5 seconds.
                                Thread.sleep(5000);
                            } catch (InterruptedException e) {
                                LOGGER.error("report status error, uniqueId={}", jobIdAndInstanceId, e);
                                break;
                            }
                        }
                    }
                }, "Schedulerx-StreamTaskMaster-report-progress-thread-" + jobIdAndInstanceId).start();
            }
        } catch (Throwable t) {
            // Any submission failure fails the instance and is pushed to the log collector.
            String jobIdAndInstanceId = jobInstanceInfo.getJobId() + "_" + jobInstanceInfo.getJobInstanceId();
            LOGGER.error("submit instance failed.", t);
            updateNewInstanceStatus(getSerialNum(), InstanceStatus.FAILED, ExceptionUtil.getMessage(t));
            logCollector.collect(jobInstanceInfo.getAppGroupId(), jobIdAndInstanceId, ClientLoggerMessage.INSTANCE_INIT_FAIL, t, jobInstanceInfo.getGroupId());
        }
    }

    /**
     * Creates the stream root ("produce") task on the local worker and starts a
     * dedicated background thread that repeatedly calls
     * {@code StreamJobProcessor#produce} to generate batches of sub-tasks until the
     * instance finishes or the worker shuts down.
     *
     * @throws Exception if persisting or accounting the root task fails
     */
    protected void createProduceTask() throws Exception {

        // Register progress tracking for the root task (one task in the current batch).
        initTaskProgress(this.getSerialNum(), WorkerConstants.MAP_TASK_ROOT_NAME, 1);

        final MasterStartContainerRequest startContainerRequest = convert2StartContainerRequest(jobInstanceInfo, aquireTaskId(), WorkerConstants.MAP_TASK_ROOT_NAME, null);
        final String workerIdAddr = getLocalWorkerIdAddr();
        final String workerId = workerIdAddr.split("@")[0];
        final String workerAddr = workerIdAddr.split("@")[1];
        // Persist the root task and account it as pulled on the local worker.
        batchHandlePersistence(workerId, workerAddr, Lists.newArrayList(startContainerRequest), false);
        batchHandlePulledProgress(Lists.newArrayList(startContainerRequest), Maps.>newHashMap(),
                Maps.>newHashMap(), workerIdAddr);

        // Produce loop: runs on its own thread for the lifetime of the instance.
        streamProduceThread = new Thread(new Runnable() {
            @Override
            public void run() {
                try {
                    // Mark the root task RUNNING in both batch-level and per-worker counters.
                    streamJobProgressMap.get(getSerialNum()).setStatus(TaskStatus.RUNNING.getValue());
                    streamJobProgressMap.get(getSerialNum()).getTaskProgressCounter().incrementRunning();
                    streamJobProgressMap.get(getSerialNum()).getWorkerProgressMap().get(workerAddr).incrementRunning();

                    JobContext context = ContanerUtil.convert2JobContext(startContainerRequest);

                    List tasks;
                    long produceInterval = xAttrs.getProduceInterval();
                    while (!instanceStatus.isFinish() && SchedulerxWorker.INITED) {
                        //TODO throttle proactively when too many batches are still in flight
                        if (streamJobProgressMap.size() < 15) {
                            // Record the load start time of the current batch.
                            // NOTE(review): key 0L appears to address the root/produce batch entry,
                            // not the batch whose serial number is acquired just below — confirm
                            // this is intentional.
                            streamJobProgressMap.get(0L).setStartTime(DateTime.now().getMillis());
                            context.setSerialNum(aquireSerialNum());

                            tasks = streamJobProcessor.produce(context);
                            if (!CollectionUtils.isEmpty(tasks)) {
                                initTaskProgress(getSerialNum(), "SubTask", tasks.size());
                                for (Object task : tasks) {
                                    // Hessian-serialize each produced payload and queue it for dispatch.
                                    byte[] taskBody = HessianUtil.toBytes(task);
                                    MasterStartContainerRequest taskContainerRequest = convert2StartContainerRequest(jobInstanceInfo,
                                            aquireTaskId(), "SubTask", ByteString.copyFrom(taskBody));
                                    taskBlockingQueue.submitRequest(taskContainerRequest);
                                }
                            }
                        }
                        // Back-pressure: sleep longer when more than 5 batches are in flight.
                        int plus = (streamJobProgressMap.size() / 5) + 1;
                        TimeUnit.SECONDS.sleep(plus * produceInterval);
                    }

                    // Produce loop exited normally: report the root task as SUCCESS.
                    ContainerReportTaskStatusRequest rootTaskStatusRequest = ContainerReportTaskStatusRequest.newBuilder()
                            .setJobId(jobInstanceInfo.getJobId())
                            .setJobInstanceId(jobInstanceInfo.getJobInstanceId())
                            .setTaskId(startContainerRequest.getTaskId())
                            .setStatus(TaskStatus.SUCCESS.getValue())
                            .setWorkerId(workerId)
                            .setTaskName(startContainerRequest.getTaskName())
                            .setWorkerAddr(workerAddr)
                            .setSerialNum(startContainerRequest.getSerialNum())
                            .build();
                    updateTaskStatus(rootTaskStatusRequest);
                } catch (Throwable e) {
                    LOGGER.error("stream job produce running failed.", e);
                    String workerIdAddr = getLocalWorkerIdAddr();
                    final String workerId = workerIdAddr.split("@")[0];
                    final String workerAddr = workerIdAddr.split("@")[1];
                    if (startContainerRequest != null) {
                        // Root request exists: report the root task as FAILED so the instance settles.
                        ContainerReportTaskStatusRequest faileReq = ContainerReportTaskStatusRequest.newBuilder()
                                .setJobId(jobInstanceInfo.getJobId())
                                .setJobInstanceId(jobInstanceInfo.getJobInstanceId())
                                .setTaskId(startContainerRequest.getTaskId())
                                .setStatus(TaskStatus.FAILED.getValue())
                                .setWorkerId(workerId)
                                .setTaskName(startContainerRequest.getTaskName())
                                .setWorkerAddr(workerAddr)
                                .setSerialNum(startContainerRequest.getSerialNum())
                                .build();
                        updateTaskStatus(faileReq);
                    } else {
                        // No root request was built: fail the whole instance directly.
                        String jobIdAndInstanceId = jobInstanceInfo.getJobId() + "_" + jobInstanceInfo.getJobInstanceId();
                        updateNewInstanceStatus(getSerialNum(), InstanceStatus.FAILED, ExceptionUtil.getMessage(e));
                        logCollector.collect(jobInstanceInfo.getAppGroupId(), jobIdAndInstanceId, ClientLoggerMessage.INSTANCE_INIT_FAIL, 
                            e, jobInstanceInfo.getGroupId());
                    }
                }
            }
        }, "Schedulerx-stream-produce-thread-" + this.jobInstanceInfo.getJobInstanceId());
        streamProduceThread.start();
    }

    @Override
    protected void checkProcessor() throws Exception {
        // Only java-type jobs declare a processor class we can validate here.
        if (!"java".equalsIgnoreCase(jobInstanceInfo.getJobType())) {
            return;
        }
        JavaProcessorProfile profile = JsonUtil.fromJson(jobInstanceInfo.getContent(), JavaProcessorProfile.class);
        boolean validProcessor = JobProcessorUtil.checkJavaProcessor(profile.getClassName(), StreamJobProcessor.class);
        if (!validProcessor) {
            throw new IOException(profile.getClassName() + " must extends StreamJobProcessor");
        }
    }

    @Override
    public void destroyContainerPool() {
        // Ask every worker that participated in this instance to tear down its container pool.
        for (String workerIdAddr : jobInstanceInfo.getAllWorkers()) {
            Worker.MasterDestroyContainerPoolRequest request = Worker.MasterDestroyContainerPoolRequest.newBuilder()
                    .setJobInstanceId(jobInstanceInfo.getJobInstanceId())
                    .setSerialNum(getSerialNum())
                    .setJobId(jobInstanceInfo.getJobId())
                    .setWorkerIdAddr(workerIdAddr)
                    .build();
            // Delivered at-least-once so a slow worker still receives the teardown.
            SchedulerxWorker.AtLeastDeliveryRoutingActor.tell(request, null);
        }
    }

    @Override
    public void clear() {
        // Release all master-side buffers/handlers, then purge this instance's persisted tasks.
        super.clear();
        if (taskStatusReqQueue != null) {
            taskStatusReqQueue.clear();
        }
        if (taskBlockingQueue != null) {
            taskBlockingQueue.clear();
        }
        if (taskDispatchReqHandler != null) {
            taskDispatchReqHandler.clear();
        }
        if (taskStatusReqBatchHandler != null) {
            taskStatusReqBatchHandler.clear();
        }
        clearTasks(jobInstanceInfo.getJobInstanceId());
    }

    /**
     * Purges all persisted tasks of the given instance. Best effort: a failure
     * here is logged but must not break master teardown.
     *
     * @param jobInstanceId instance whose persisted tasks are removed
     */
    private void clearTasks(long jobInstanceId) {
        try {
            taskPersistence.clearTasks(jobInstanceId);
            LOGGER.info("jobInstanceId={} clearTasks success.", jobInstanceId);
        } catch (Throwable t) {
            LOGGER.error("jobInstanceId={} clearTasks error", jobInstanceId, t);
        }
    }

    @Override
    public void stop() {
        // Stop the batch handlers that dispatch sub-tasks and report task status.
        if (taskDispatchReqHandler != null) {
            taskDispatchReqHandler.stop();
        }
        if (taskStatusReqBatchHandler != null) {
            taskStatusReqBatchHandler.stop();
        }
        LOGGER.info("jobInstanceId:{}, instance master successfully stop.", jobInstanceInfo.getJobInstanceId());
    }

    @Override
    protected void doTerminate(){
        // On termination only the dispatcher is stopped here; other resources are
        // released via stop()/clear().
        if (taskDispatchReqHandler != null) {
            taskDispatchReqHandler.stop();
        }
    }

    @Override
    public String getJobInstanceProgress() {
        // Merge live batch details with the finished-batch history, preserving insertion order.
        Map<Long, StreamJobProgressDetail> detailMap = new LinkedHashMap<>();
        detailMap.putAll(streamJobProgressMap);
        Iterator iterator = this.streamJobProgressHistory.iterator();
        while (iterator.hasNext()) {
            StreamJobProgressDetail historyDetail = (StreamJobProgressDetail) iterator.next();
            detailMap.put(historyDetail.getBatchNum(), historyDetail);
        }
        // Batch 0 presumably tracks the produce (root) task; its status heads the report.
        int queueSize = xAttrs.getQueueSize();
        StreamJobProgressDetail produceDetail = detailMap.get(0L);
        return JsonUtil.toJson(new StreamJobProgress(produceDetail.getStatus(), this.totalCounter,
                queueSize, this.taskBlockingQueue.size(), detailMap));
    }

    /**
     * Marks a worker as unavailable: records it in the invalid set and, for
     * load-aware routers, zeroes its advertised capacity so routing skips it.
     *
     * @param workerIdAddr worker in "workerId@addr" form
     */
    private void setWorkerInvalid(String workerIdAddr){
        try {
            invalidWorkerSet.add(workerIdAddr);
            // instanceof is null-safe, so a null router simply falls through.
            if (router instanceof WorkerLoadRegister) {
                ((WorkerLoadRegister) router).setAvailableSize(workerIdAddr, 0);
            }
        } catch (Exception e) {
            LOGGER.warn("Set worker load failed.", e);
        }
    }

    /**
     * Fails over a batch of sub-tasks whose target worker is down: resets them to
     * INIT in the task store so they can be re-dispatched, and rolls back this
     * worker's pulled/total progress counters per batch serial number.
     *
     * @param reqs         the start requests that could not be delivered
     * @param workerIdAddr the dead worker in "workerId@addr" form
     */
    private void initTaskFailover(final List<MasterStartContainerRequest> reqs, final String workerIdAddr) {
        LOGGER.warn("jobInstanceId={}, worker[{}] is down, try another worker, size:{}",
                jobInstanceInfo.getJobInstanceId(), workerIdAddr, reqs.size());
        final String workerId = workerIdAddr.split("@")[0];
        final String workerAddr = workerIdAddr.split("@")[1];
        List<Long> taskIds = Lists.newArrayList();
        // Affected task count per batch serial number, used to roll back counters.
        Map<Long, Integer> affectCntMap = new HashMap<>();
        for (MasterStartContainerRequest req : reqs) {
            taskIds.add(req.getTaskId());
            Integer count = affectCntMap.get(req.getSerialNum());
            affectCntMap.put(req.getSerialNum(), count == null ? 1 : count + 1);
        }
        try {
            int affectCnt = taskPersistence.updateTaskStatus(jobInstanceInfo.getJobInstanceId(), taskIds,
                    TaskStatus.INIT, workerId, workerAddr);
            LOGGER.warn("jobInstanceId={}, worker[{}] is down, reset task status, size:{}",
                    jobInstanceInfo.getJobInstanceId(), workerIdAddr, affectCnt);

            // Restore this worker's sub-task counters.
            // BUGFIX: decrement by the per-serial count (entry.getValue()); the previous
            // code subtracted the overall affectCnt from every batch, over-decrementing.
            for (Map.Entry<Long, Integer> entry : affectCntMap.entrySet()) {
                StreamJobProgressDetail detail = streamJobProgressMap.get(entry.getKey());
                if (detail == null) {
                    continue;
                }
                WorkerProgressCounter counter = detail.getWorkerProgressMap().get(workerAddr);
                if (counter != null) {
                    counter.decPulledAndTotal(entry.getValue());
                }
            }
        } catch (Exception e1) {
            // Include the cause in the log instead of swallowing it.
            LOGGER.error("jobInstanceId={}, timeout return init error", jobInstanceInfo.getJobInstanceId(), e1);
            updateNewInstanceStatus(getSerialNum(), InstanceStatus.FAILED, "timeout dispatch return init error");
        }
    }

    /**
     * Handles an exception raised while dispatching a batch of sub-tasks to one worker.
     * With failover enabled and a timeout, the tasks are reset for re-dispatch;
     * otherwise (serialization failure, unknown worker, ...) they are reported FAILED.
     * In both cases the worker is marked unavailable afterwards.
     *
     * @param workerIdAddr target worker in "workerId@addr" form
     * @param reqs         the start requests that failed to dispatch
     * @param e            the dispatch error
     */
    private void processDispatchException(final String workerIdAddr, final List<MasterStartContainerRequest> reqs, Throwable e) {
        final String workerId = workerIdAddr.split("@")[0];
        final String workerAddr = workerIdAddr.split("@")[1];
        boolean failover = (xAttrs != null && xAttrs.isFailover());
        if (failover && (e instanceof TimeoutException)) {
            // Dispatch timed out: reset the tasks to INIT so another worker picks them up.
            initTaskFailover(reqs, workerIdAddr);
        } else {
            // Any other error: fail the tasks directly.
            String uniqueIdWithoutTask = IdUtil.getUniqueIdWithoutTask(jobInstanceInfo.getJobId(), jobInstanceInfo.getJobInstanceId());
            LOGGER.error("jobInstanceId:{}, batch dispatch Tasks error worker={}, size:{}",
                    jobInstanceInfo.getJobInstanceId(), workerIdAddr, reqs.size(), e);
            logCollector.collect(jobInstanceInfo.getAppGroupId(), uniqueIdWithoutTask, ClientLoggerMessage.MAP_INSTANCE_DISPATCH_JOB_FAIL, 
                e, jobInstanceInfo.getGroupId());
            for (MasterStartContainerRequest req : reqs) {
                // BUGFIX: taskName was set twice on this builder; it is now set once.
                ContainerReportTaskStatusRequest failedReq = ContainerReportTaskStatusRequest.newBuilder()
                        .setJobId(jobInstanceInfo.getJobId())
                        .setJobInstanceId(jobInstanceInfo.getJobInstanceId())
                        .setTaskId(req.getTaskId())
                        .setStatus(TaskStatus.FAILED.getValue())
                        .setResult("Dispatch tasks error. Cause by "+e.getMessage())
                        .setWorkerId(workerId)
                        .setTaskName(req.getTaskName())
                        .setWorkerAddr(workerAddr)
                        .setSerialNum(req.getSerialNum())
                        .build();
                updateTaskStatus(failedReq);
            }
        }
        // Stop routing new tasks to this worker.
        setWorkerInvalid(workerIdAddr);
    }

    /**
     * Handles the synchronous response of a batch container-start request.
     * On success the worker is marked alive and its load metrics refreshed; on
     * failure the tasks are either failed over (worker-not-running + failover
     * enabled) or reported FAILED, and the worker is marked unavailable.
     *
     * @param workerIdAddr target worker in "workerId@addr" form
     * @param reqs         the start requests that were sent
     * @param response     the worker's batch-start response
     * @param startTime    dispatch start timestamp (ms) for cost accounting
     */
    private void processDispatchResponse(final String workerIdAddr, final List<MasterStartContainerRequest> reqs,
                                         Worker.MasterBatchStartContainersResponse response, long startTime) {
        final String workerId = workerIdAddr.split("@")[0];
        final String workerAddr = workerIdAddr.split("@")[1];
        if (response.getSuccess()) {
            LOGGER.info("jobInstanceId={}, batch start containers successfully, size:{} , worker={}, cost={}ms",
                    jobInstanceInfo.getJobInstanceId(), reqs.size(), workerIdAddr,
                    System.currentTimeMillis() - startTime);
            aliveCheckWorkerSet.add(workerIdAddr);
            String metricsJson = response.getMetricsJson();
            setWorkerLoad(workerIdAddr, metricsJson, System.currentTimeMillis() - startTime);
        } else {
            boolean failover = (xAttrs != null && xAttrs.isFailover());
            if (failover && (response.getMessage() != null && response.getMessage().contains(WorkerConstants.WORKER_NOT_RUNNING_MESSAGE))) {
                initTaskFailover(reqs, workerIdAddr);
            } else {
                LOGGER.error("jobInstanceId={}, batch start containers failed, worker={}, response={}, size:{}",
                        jobInstanceInfo.getJobInstanceId(), workerIdAddr, response.getMessage(), reqs.size());
                // Fail the tasks directly.
                for (MasterStartContainerRequest req : reqs) {
                    // BUGFIX: taskName was set twice on this builder; it is now set once.
                    ContainerReportTaskStatusRequest failedStatusRequest = ContainerReportTaskStatusRequest.newBuilder()
                            .setJobId(jobInstanceInfo.getJobId())
                            .setJobInstanceId(jobInstanceInfo.getJobInstanceId())
                            .setTaskId(req.getTaskId())
                            .setStatus(TaskStatus.FAILED.getValue())
                            .setResult(response.getMessage())
                            .setWorkerId(workerId)
                            .setTaskName(req.getTaskName())
                            .setWorkerAddr(workerAddr)
                            .setSerialNum(req.getSerialNum())
                            .build();
                    updateTaskStatus(failedStatusRequest);
                }
            }
            // Stop routing new tasks to this worker.
            setWorkerInvalid(workerIdAddr);
        }
    }

    /**
     * Refreshes a load-aware router's view of one worker from its reported metrics
     * (available pool size, remaining CPU/memory, dispatch cost), then wakes any
     * thread waiting on the router.
     *
     * @param workerIdAddr worker in "workerId@addr" form
     * @param metricsJson  JSON-serialized Metrics reported by the worker; may be empty
     * @param cost         dispatch round-trip cost in ms, or null if unknown
     */
    private void setWorkerLoad(String workerIdAddr, String metricsJson, Long cost){
        try {
            // instanceof is null-safe; bail out unless the router tracks load and we have metrics.
            if (!(router instanceof WorkerLoadRegister) || !StringUtils.isNotEmpty(metricsJson)) {
                return;
            }
            WorkerLoadRegister loadRegister = (WorkerLoadRegister) router;
            Metrics metrics = JsonUtil.fromJson(metricsJson, Metrics.class);
            if (metrics != null) {
                LOGGER.info("update worker load, worker={}, sharePoolAvailableSize={}, cost={}", workerIdAddr, metrics.getSharePoolAvailableSize(), cost);
                loadRegister.setAvailableSize(workerIdAddr, metrics.getSharePoolAvailableSize());
                loadRegister.setRemainCpu(workerIdAddr, (int) (metrics.getCpuProcessors() - metrics.getCpuLoad1()));
                loadRegister.setRemainMemory(workerIdAddr, (long) (100 - metrics.getHeap1Usage() * 100));
                if (cost != null) {
                    loadRegister.setCost(workerIdAddr, cost);
                }
            }
            // Wake up anyone blocked waiting for worker capacity.
            synchronized (router) {
                router.notifyAll();
            }
        } catch (Exception e) {
            LOGGER.warn("Set worker load failed.", e);
        }
    }

    /**
     * Persists a batch of sub-tasks and synchronously dispatches them to one worker.
     * The synchronous round-trip is required by load-based routing so the worker's
     * metrics come back with the response. On any persistence/build failure the
     * tasks are reported FAILED immediately.
     *
     * @param workerIdAddr target worker in "workerId@addr" form
     * @param reqs         start requests for this worker
     * @param isFailover   true when these tasks are being re-dispatched after a worker failure
     * @param dispatchMode push/pull dispatch mode of this batch
     */
    private void batchHandleContainers(final String workerIdAddr, final List<MasterStartContainerRequest> reqs, boolean isFailover, TaskDispatchMode dispatchMode) {
        final String workerId = workerIdAddr.split("@")[0];
        final String workerAddr = workerIdAddr.split("@")[1];
        LOGGER.debug("jobInstanceId={}, batch dispatch, worker:{}, size:{}", jobInstanceInfo.getJobInstanceId(), workerIdAddr, reqs.size());
        try {
            batchHandlePersistence(workerId, workerAddr, reqs, isFailover);

            final long startTime = System.currentTimeMillis();
            ActorSelection selection = getActorContext().actorSelection(
                    ActorPathUtil.getContainerRouterPath(workerIdAddr));
            Worker.MasterBatchStartContainersRequest request = Worker.MasterBatchStartContainersRequest.newBuilder()
                    .setJobInstanceId(jobInstanceInfo.getJobInstanceId())
                    .setJobId(jobInstanceInfo.getJobId())
                    .addAllStartReqs(reqs)
                    .build();
            // Synchronous dispatch so the load-based routing strategy sees fresh metrics.
            try {
                Worker.MasterBatchStartContainersResponse response = (Worker.MasterBatchStartContainersResponse) FutureUtils.awaitResult(selection, request, 3L);
                processDispatchResponse(workerIdAddr, reqs, response, startTime);
            } catch (Throwable e) {
                processDispatchException(workerIdAddr, reqs, e);
            }
        } catch (Throwable exception) {
            // Persistence or request-build failure (e.g. serialization, unknown worker):
            // fail the tasks directly.
            String uniqueIdWithoutTask = IdUtil.getUniqueIdWithoutTask(jobInstanceInfo.getJobId(), jobInstanceInfo.getJobInstanceId());
            LOGGER.error("jobInstanceId:{}, batch dispatch Tasks error worker={}, size:{}",
                    jobInstanceInfo.getJobInstanceId(), workerIdAddr, reqs.size(), exception);
            logCollector.collect(jobInstanceInfo.getAppGroupId(), uniqueIdWithoutTask, ClientLoggerMessage.MAP_INSTANCE_DISPATCH_JOB_FAIL, 
                exception, jobInstanceInfo.getGroupId());
            for (MasterStartContainerRequest req : reqs) {
                // BUGFIX: taskName was set twice on this builder; it is now set once.
                ContainerReportTaskStatusRequest failedReq = ContainerReportTaskStatusRequest.newBuilder()
                        .setJobId(jobInstanceInfo.getJobId())
                        .setJobInstanceId(jobInstanceInfo.getJobInstanceId())
                        .setTaskId(req.getTaskId())
                        .setStatus(TaskStatus.FAILED.getValue())
                        .setWorkerId(workerId)
                        .setTaskName(req.getTaskName())
                        .setWorkerAddr(workerAddr)
                        .setSerialNum(req.getSerialNum())
                        .build();
                updateTaskStatus(failedReq);
            }
        }
    }


    /**
     * Persists the dispatch of a batch: creates the tasks on first dispatch, or
     * moves already-existing tasks back to RUNNING on a failover re-dispatch.
     *
     * @param workerId   target worker id
     * @param workerAddr target worker address
     * @param reqs       start requests being dispatched
     * @param isFailover true when the tasks already exist and are being re-dispatched
     * @throws Exception if the persistence layer fails
     */
    private void batchHandlePersistence(String workerId, String workerAddr, List reqs, boolean isFailover) throws Exception {
        long persistStart = System.currentTimeMillis();
        if (isFailover) {
            // Re-dispatch: the tasks already exist, just reassign them to the new worker.
            List taskIds = Lists.newArrayList();
            for (MasterStartContainerRequest req : reqs) {
                taskIds.add(req.getTaskId());
            }
            taskPersistence.updateTaskStatus(jobInstanceInfo.getJobInstanceId(), taskIds, TaskStatus.RUNNING, workerId, workerAddr);
        } else {
            // First dispatch: create the task records.
            taskPersistence.createTasks(reqs, workerId, workerAddr);
        }
        LOGGER.debug("jobInstance={}, batch dispatch db cost:{} ms, size:{}",
                jobInstanceInfo.getJobInstanceId(), System.currentTimeMillis() - persistStart, reqs.size());
    }

    /**
     * 批量分发子任务
     * @param masterStartContainerRequests
     */
    public void batchDispatchTasks(List masterStartContainerRequests) {
        Map> worker2ReqsWithNormal = Maps.newHashMap();
        Map> worker2ReqsWithFailover = Maps.newHashMap();
        batchHandlePulledProgress(masterStartContainerRequests, worker2ReqsWithNormal, worker2ReqsWithFailover, null);

        //推模型正常启动子任务
        for (Entry> entry : worker2ReqsWithNormal.entrySet()) {
            batchHandleContainers(entry.getKey(), entry.getValue(), false, TaskDispatchMode.PUSH);
        }

        //推模型worker挂了,failover子任务到其他worker
        for (Entry> entry : worker2ReqsWithFailover.entrySet()) {
            batchHandleContainers(entry.getKey(), entry.getValue(), true, TaskDispatchMode.PUSH);
        }
    }

    /**
     * Routes each start request to a worker and updates pulled-progress counters.
     * Requests are bucketed into the normal and failover output maps by their
     * failover flag; batch-level counters are only advanced for normal requests
     * (failover tasks were already counted on first dispatch).
     *
     * NOTE(review): assumes streamJobProgressMap already contains an entry for
     * every request's serial number, and that split("@") yields two parts —
     * neither is guarded here; confirm callers guarantee both.
     *
     * @param masterStartContainerRequests requests to route
     * @param worker2ReqsWithNormal   out-param: normal requests grouped by worker
     * @param worker2ReqsWithFailover out-param: failover requests grouped by worker
     * @param worker fixed target worker ("workerId@addr"), or null to route each request
     */
    protected void batchHandlePulledProgress(List masterStartContainerRequests,
                                             Map> worker2ReqsWithNormal,
                                             Map> worker2ReqsWithFailover,
                                             String worker) {
        for (MasterStartContainerRequest request : masterStartContainerRequests) {
            // Either use the caller-pinned worker or pick one via the router.
            String workerIdAddr = worker==null?selectWorker():worker;
            if (workerIdAddr == null) {
                updateNewInstanceStatus(getSerialNum(), InstanceStatus.FAILED, "all worker is down!");
                break;
            }
            String workerAddr = workerIdAddr.split("@")[1];
            StreamJobProgressDetail streamJobProgressDetail = this.streamJobProgressMap.get(request.getSerialNum());
            if (request.getFailover()) {
                if (!worker2ReqsWithFailover.containsKey(workerIdAddr)) {
                    worker2ReqsWithFailover.put(workerIdAddr, Lists.newArrayList(request));
                } else {
                    worker2ReqsWithFailover.get(workerIdAddr).add(request);
                }
            } else {
                if (!worker2ReqsWithNormal.containsKey(workerIdAddr)) {
                    worker2ReqsWithNormal.put(workerIdAddr, Lists.newArrayList(request));
                } else {
                    worker2ReqsWithNormal.get(workerIdAddr).add(request);
                }
                streamJobProgressDetail.getTaskProgressCounter().incrementPulled();
                if (request.getSerialNum() > 0) {
                    // Aggregate non-root (serialNum > 0) tasks into the instance-wide total.
                    totalCounter.incrementPulled();
                }
            }
            streamJobProgressDetail.setStatus(TaskStatus.PULLED.getValue());
            // Lazily create the per-worker counter; double-checked under the master lock.
            Map workerProgressMap = streamJobProgressDetail.getWorkerProgressMap();
            if (workerAddr != null && !workerProgressMap.containsKey(workerAddr)) {
                synchronized (this) {
                    if (!workerProgressMap.containsKey(workerAddr)) {
                        WorkerProgressCounter workerProgressCounter = new WorkerProgressCounter(workerAddr);
                        workerProgressMap.put(workerAddr, workerProgressCounter);
                    }
                }
            }
            workerProgressMap.get(workerAddr).incrementTotal();
            workerProgressMap.get(workerAddr).incrementPulled();
        }
    }

    /**
     * Picks the next worker via the configured router using a round-robin cursor.
     *
     * @return the chosen worker ("workerId@addr"), or whatever the router returns
     *         when no worker is available
     */
    private synchronized String selectWorker() {
        // Reset the cursor before index++ can overflow.
        if (index < 0 || index >= Integer.MAX_VALUE) {
            index = 0;
        }
        return router.route(jobInstanceInfo.getAppGroupId(), jobInstanceInfo.getJobId(), jobInstanceInfo.getAllWorkers(),
                jobInstanceInfo.getTargetWorkerAddrsMap(), index++, getLocalWorkerIdAddr());
    }

    /**
     * Reacts to a worker leaving the instance: removes it from the alive/routing
     * sets and either fails over its tasks (reset to INIT) or marks them FAILED,
     * adjusting the per-worker progress counters accordingly.
     *
     * NOTE(review): getWorkerProgressMap().get(workerAddr) may be null for batches
     * that never dispatched to this worker — confirm callers/state guarantee an
     * entry before the counter math below.
     *
     * @param workerIdAddr the departed worker in "workerId@addr" form
     * @param withFailover whether the caller allows failover for this shutdown
     */
    @Override
    public synchronized void handleWorkerShutdown(String workerIdAddr, boolean withFailover) {
        this.existInvalidWorker = true;
        this.invalidWorkerSet.add(workerIdAddr);
        // Ignore workers we never confirmed alive (also makes repeat notifications idempotent).
        if (!aliveCheckWorkerSet.contains(workerIdAddr)) {
            return;
        }
        String[] workerInfo = workerIdAddr.split("@");
        String workerAddr = workerInfo[1];
        String workerId = workerInfo[0];
        aliveCheckWorkerSet.remove(workerIdAddr);
        jobInstanceInfo.getAllWorkers().remove(workerIdAddr);

        if (withFailover && (xAttrs != null && xAttrs.isFailover())) {
            // Failover enabled: reset this worker's tasks to INIT so they are re-pulled.
            int affectCnt = taskPersistence.batchUpdateTaskStatus(jobInstanceInfo.getJobInstanceId(), TaskStatus.INIT, workerId, workerAddr);
            LOGGER.warn("jobInstanceId={}, failover task number:{}, workerId:{}, workerAddr:{}",
                    jobInstanceInfo.getJobInstanceId(), affectCnt, workerId, workerAddr);
            if (affectCnt > 0) {
                for(StreamJobProgressDetail progressDetail:streamJobProgressMap.values()) {
                    WorkerProgressCounter workerProgressCounter = progressDetail.getWorkerProgressMap().get(workerAddr);
                    int count = workerProgressCounter.getRunning() + workerProgressCounter.getPulled();
                    workerProgressCounter.decRunningAndTotal(count);
                }
                this.taskDispatchReqHandler.release(affectCnt);
            }
        } else {
            // Failover disabled: mark this worker's in-flight tasks as FAILED.
            int affectCnt = taskPersistence.batchUpdateTaskStatus(jobInstanceInfo.getJobInstanceId(), TaskStatus.FAILED, workerId, workerAddr);
            LOGGER.warn("jobInstanceId={}, worker shutdown, failed task number:{}, workerId:{}, workerAddr:{}",
                    jobInstanceInfo.getJobInstanceId(), affectCnt, workerId, workerAddr);
            if (affectCnt > 0) {
                for(StreamJobProgressDetail progressDetail:streamJobProgressMap.values()) {
                    WorkerProgressCounter workerProgressCounter = progressDetail.getWorkerProgressMap().get(workerAddr);
                    int count = workerProgressCounter.getRunning() + workerProgressCounter.getPulled();
                    workerProgressCounter.incrementFailed(count);
                }
                // taskProgress data may become inaccurate here.
                this.taskDispatchReqHandler.release(affectCnt);
            }
        }
    }

    /**
     * Invokes the user's {@code StreamJobProcessor#postProcess} hook once the
     * instance has finished, passing a context built from the instance metadata.
     *
     * @param jobInstanceId id of the finished job instance
     * @return the hook's result; null when the processor is not a StreamJobProcessor;
     *         a failed ProcessResult (with a truncated error message) if the hook throws
     */
    @Override
    public ProcessResult postFinish(long jobInstanceId) {
        ProcessResult reduceResult = null;
        try {
            JobContext context = JobContext.newBuilder()
                    .setJobId(jobInstanceInfo.getJobId())
                    .setJobInstanceId(jobInstanceId)
                    .setJobType(jobInstanceInfo.getJobType())
                    .setContent(jobInstanceInfo.getContent())
                    .setScheduleTime(jobInstanceInfo.getScheduleTime())
                    .setDataTime(jobInstanceInfo.getDataTime())
                    .setJobParameters(jobInstanceInfo.getParameters())
                    .setInstanceParameters(jobInstanceInfo.getInstanceParameters())
                    .setUser(jobInstanceInfo.getUser())
                    .setSerialNum(this.getSerialNum())
                    .build();
            JobProcessor jobProcessor = JobProcessorUtil.getJavaProcessor(context.getContent());
            if (jobProcessor instanceof StreamJobProcessor) {
                reduceResult = ((StreamJobProcessor) jobProcessor).postProcess(context);
            }
        } catch (Throwable e) {
            // BUGFIX: a failed post-process was logged at INFO; log it at ERROR.
            LOGGER.error("Stream job post finish failed.", e);
            String fixedErrMsg = ExceptionUtil.getFixedErrMsgByThrowable(e, 800);
            return new ProcessResult(false, "Stream job post finish failed:" + fixedErrMsg);
        }
        return reduceResult;
    }

    @Override
    public void killInstance(boolean mayInterruptIfRunning, String reason) {
        super.killInstance(mayInterruptIfRunning, reason);
        // Broadcast the kill to every worker (ip:port format) of this instance.
        this.sendKillContainerRequest(mayInterruptIfRunning, jobInstanceInfo.getAllWorkers());
        // Mark the instance FAILED so on-going processing stops.
        updateNewInstanceStatus(getSerialNum(), jobInstanceInfo.getJobInstanceId(), InstanceStatus.FAILED, reason);
    }
}