All downloads are free. Search and download functionality uses the official Maven repository.

com.alibaba.schedulerx.worker.master.BroadcastTaskMaster Maven / Gradle / Ivy

There is a newer version: 1.12.2
Show newest version
package com.alibaba.schedulerx.worker.master;

import java.io.IOException;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.commons.lang.StringUtils;

import com.alibaba.schedulerx.common.domain.InstanceStatus;
import com.alibaba.schedulerx.common.domain.JobInstanceInfo;
import com.alibaba.schedulerx.common.domain.MapTaskProgress;
import com.alibaba.schedulerx.common.domain.TaskStatus;
import com.alibaba.schedulerx.common.domain.TimeType;
import com.alibaba.schedulerx.common.domain.WorkerProgressCounter;
import com.alibaba.schedulerx.common.util.ConfigUtil;
import com.alibaba.schedulerx.common.util.ExceptionUtil;
import com.alibaba.schedulerx.common.util.IdUtil;
import com.alibaba.schedulerx.common.util.JobUtil;
import com.alibaba.schedulerx.common.util.JsonUtil;
import com.alibaba.schedulerx.protocol.Worker.ContainerReportTaskStatusRequest;
import com.alibaba.schedulerx.protocol.Worker.MasterCheckWorkerAliveRequest;
import com.alibaba.schedulerx.protocol.Worker.MasterDestroyContainerPoolRequest;
import com.alibaba.schedulerx.protocol.Worker.MasterStartContainerRequest;
import com.alibaba.schedulerx.protocol.Worker.MasterStartContainerResponse;
import com.alibaba.schedulerx.protocol.Worker.WorkerReportJobInstanceProgressRequest;
import com.alibaba.schedulerx.protocol.utils.FutureUtils;
import com.alibaba.schedulerx.worker.SchedulerxWorker;
import com.alibaba.schedulerx.worker.domain.JavaProcessorProfile;
import com.alibaba.schedulerx.worker.domain.JobContext;
import com.alibaba.schedulerx.worker.domain.WorkerConstants;
import com.alibaba.schedulerx.worker.log.LogFactory;
import com.alibaba.schedulerx.worker.log.Logger;
import com.alibaba.schedulerx.worker.logcollector.ClientLoggerMessage;
import com.alibaba.schedulerx.worker.logcollector.LogCollector;
import com.alibaba.schedulerx.worker.logcollector.LogCollectorFactory;
import com.alibaba.schedulerx.worker.processor.JobProcessor;
import com.alibaba.schedulerx.worker.processor.JobProcessorEx;
import com.alibaba.schedulerx.worker.processor.MapJobProcessor;
import com.alibaba.schedulerx.worker.processor.ProcessResult;
import com.alibaba.schedulerx.worker.util.ActorPathUtil;
import com.alibaba.schedulerx.worker.util.JobProcessorUtil;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;

import akka.actor.ActorContext;
import akka.actor.ActorSelection;

/**
 * @author xiaomeng.hxm
 */
/**
 * Task master that broadcasts one task to every worker of a job instance and
 * tracks per-worker execution status/progress until all workers finish.
 *
 * <p>Thread-safety: status mutation goes through the {@code synchronized}
 * {@link #updateTaskStatus} / {@link #updateNewInstanceStatus} methods; the
 * monitor threads synchronize on this instance before checking completion.
 *
 * @author xiaomeng.hxm
 */
public class BroadcastTaskMaster extends TaskMaster {

    private static final Logger LOGGER = LogFactory.getLogger(BroadcastTaskMaster.class);

    /** workerIdAddr ("workerId@workerAddr") -> uniqueId of the task dispatched to that worker. */
    private Map<String, String> worker2uniqueIdMap = Maps.newConcurrentMap();

    /** workerAddr -> progress counter, rendered by {@link #getJobInstanceProgress()}. */
    private Map<String, WorkerProgressCounter> workerProgressMap = Maps.newConcurrentMap();

    private LogCollector logCollector = LogCollectorFactory.get();

    /** Whether the monitor threads have already been started (they start at most once). */
    private volatile boolean running = false;

    /** Gate for the monitor threads; kept closed while a broadcast round is being dispatched. */
    private volatile boolean monitor = false;

    /** taskId -> result string reported by the container; consumed by {@link #postFinish(long)}. */
    private Map<Long, String> taskIdResultMap = Maps.newHashMap();

    /** taskId -> last reported status; consumed by {@link #postFinish(long)}. */
    private Map<Long, TaskStatus> taskIdStatusMap = Maps.newHashMap();

    /**
     * Shared dispatch pool, lazily created once per JVM.
     * Must be volatile: the constructor initializes it with double-checked
     * locking, which is only safe when the field has volatile semantics.
     */
    private static volatile ThreadPoolExecutor dispatchThreadPool;

    /** All workers ("workerId@workerAddr") participating in the current broadcast round. */
    private List<String> allWorkers = Lists.newArrayList();

    /**
     * Creates the task master and, when enabled by configuration, lazily
     * initializes the JVM-wide dispatch thread pool.
     *
     * @param jobInstanceInfo the job instance to broadcast
     * @param actorContext    akka context used to address remote containers
     * @throws Exception propagated from the superclass constructor
     */
    public BroadcastTaskMaster(JobInstanceInfo jobInstanceInfo, ActorContext actorContext) throws Exception {
        super(jobInstanceInfo, actorContext);
        Boolean enableDispatchThreadPool = ConfigUtil.getWorkerConfig().getBoolean(WorkerConstants.BROADCAST_DISPATCH_THREAD_ENABLE, false);
        if (enableDispatchThreadPool) {
            // Double-checked locking; safe because dispatchThreadPool is volatile.
            if (dispatchThreadPool == null) {
                synchronized (BroadcastTaskMaster.class) {
                    if (dispatchThreadPool == null) {
                        int threadNum = ConfigUtil.getWorkerConfig().getInt(WorkerConstants.BROADCAST_DISPATCH_THREAD_NUM, 4);
                        dispatchThreadPool = new ThreadPoolExecutor(threadNum, threadNum * 2, 30, TimeUnit.SECONDS,
                                new LinkedBlockingQueue<Runnable>(1024), new ThreadFactory() {
                            private final AtomicInteger nextId = new AtomicInteger(1);
                            private final String namePrefix = "Schedulerx-BroadcastTaskMaster-dispatch-thread-";

                            @Override
                            public Thread newThread(Runnable r) {
                                return new Thread(r, namePrefix + nextId.getAndIncrement());
                            }
                        }, new ThreadPoolExecutor.CallerRunsPolicy());
                    }
                }
            }
        }
    }

    /**
     * Dispatches one task to every worker of the instance. For java jobs the
     * user's {@code preProcess} hook runs first; a failure there fails the
     * whole instance. Task-status entries are pre-built for all workers before
     * any dispatch so the status-check thread never sees a partial map.
     */
    @Override
    public synchronized void submitInstance(final JobInstanceInfo info) {
        if ("java".equalsIgnoreCase(info.getJobType())) {
            try {
                preProcess(info);
            } catch (Exception e) {
                LOGGER.error("BroadcastTaskMaster.preProcess failed, jobInstanceId={}", info.getJobInstanceId(), e);
                String uniqueId = IdUtil.getUniqueId(info.getJobId(), info.getJobInstanceId(), 0);
                logCollector.collect(info.getAppGroupId(), uniqueId, ClientLoggerMessage.appendMessage(ClientLoggerMessage.BROADCAST_INSTANCE_INIT_FAIL,
                        SchedulerxWorker.WORKER_ADDR, ExceptionUtil.getMessage(e)), info.getGroupId());
                updateNewInstanceStatus(getSerialNum(), jobInstanceInfo.getJobInstanceId(), InstanceStatus.FAILED, "Preprocess failed. " + e.getMessage());
                return;
            }
        }

        // Mark the instance initialized BEFORE dispatching, so a second-delay
        // job is not force-stopped during its first broadcast round just
        // because a single worker failed.
        super.init();

        this.allWorkers = info.getAllWorkers();
        // Optionally exclude the master node itself from the broadcast.
        boolean enable = ConfigUtil.getWorkerConfig().getBoolean(WorkerConstants.BROADCAST_MASTER_EXEC_ENABLE, true);
        if (!enable) {
            this.allWorkers.remove(getLocalWorkerIdAddr());
        }

        // Build the complete status map up front, so the status-check thread
        // cannot observe an (almost) empty map mid-dispatch and wrongly
        // conclude the instance already finished.
        final Map<String, Long> taskIdMap = new HashMap<>();
        Collections.sort(allWorkers);
        for (String workerIdAddr : allWorkers) {
            String[] workerInfo = workerIdAddr.split("@");
            String workerAddr = workerInfo[1];
            long taskId = aquireTaskId();
            String uniqueId = IdUtil.getUniqueId(info.getJobId(), info.getJobInstanceId(), taskId);
            taskStatusMap.put(uniqueId, TaskStatus.INIT);
            if (!workerProgressMap.containsKey(workerAddr)) {
                WorkerProgressCounter workerProgressCounter = new WorkerProgressCounter(workerAddr);
                workerProgressMap.put(workerAddr, workerProgressCounter);
            }
            workerProgressMap.get(workerAddr).incrementTotal();
            taskIdMap.put(workerIdAddr, taskId);
        }

        for (final String workerIdAddr : allWorkers) {
            if (dispatchThreadPool != null) {
                dispatchThreadPool.execute(new Runnable() {
                    @Override
                    public void run() {
                        dispatchTask(info, workerIdAddr, taskIdMap);
                    }
                });
            } else {
                dispatchTask(info, workerIdAddr, taskIdMap);
            }
        }

        // Start the monitor threads only after dispatching completes, to avoid
        // per-worker status checks misfiring while the broadcast is in flight.
        this.startMonitorThreads();
    }

    /**
     * Dispatches one broadcast task to a single worker, retrying up to the
     * configured number of times. On exhausted retries the task is reported
     * FAILED to {@link #updateTaskStatus} and the worker is marked invalid.
     *
     * @param info         job instance being broadcast
     * @param workerIdAddr target worker, "workerId@workerAddr"
     * @param taskIdMap    workerIdAddr -> pre-assigned taskId
     */
    private void dispatchTask(JobInstanceInfo info, String workerIdAddr, Map<String, Long> taskIdMap) {
        String[] workerInfo = workerIdAddr.split("@");
        String workerAddr = workerInfo[1];
        String workerId = workerInfo[0];
        ActorSelection selection = getActorContext().actorSelection(ActorPathUtil.getContainerRouterPath(workerIdAddr));
        long taskId = taskIdMap.get(workerIdAddr);
        String uniqueId = IdUtil.getUniqueId(info.getJobId(), info.getJobInstanceId(), taskId);
        MasterStartContainerRequest.Builder builder = convert2StartContainerRequestBuilder(info, taskId);
        builder.setShardingNum(allWorkers.size());
        MasterStartContainerRequest request = builder.build();
        taskIdStatusMap.put(taskId, TaskStatus.INIT);
        int retryTimes = 0;
        String result = "";
        int maxRetryTimes = ConfigUtil.getWorkerConfig().getInt(WorkerConstants.BROADCAST_DISPATCH_RETRY_TIMES, 1);
        while (retryTimes++ < maxRetryTimes) {
            try {
                MasterStartContainerResponse response = (MasterStartContainerResponse) FutureUtils.awaitResult(selection, request, 5);
                if (response.getSuccess()) {
                    worker2uniqueIdMap.put(workerIdAddr, uniqueId);
                    logCollector.collect(info.getAppGroupId(), uniqueId, ClientLoggerMessage.appendMessage(ClientLoggerMessage.BROADCAST_INSTANCE_INIT_SUCCESS, workerAddr),
                        info.getGroupId());
                    return;
                } else {
                    result = response.getMessage();
                    // 4 arguments need 4 placeholders, otherwise the response message is dropped.
                    LOGGER.error("submitTask[{}] serialNum={} to worker {} error, {}", uniqueId, this.getSerialNum(),
                            workerAddr, response.getMessage());
                    logCollector.collect(info.getAppGroupId(), uniqueId, ClientLoggerMessage.appendMessage(ClientLoggerMessage.BROADCAST_INSTANCE_INIT_FAIL,
                            workerAddr, response.getMessage()), info.getGroupId());
                    TimeUnit.SECONDS.sleep(2L);
                }
            } catch (Throwable e) {
                result = e.getMessage();
                // Argument order must match the placeholder order: worker, uniqueId, serialNum.
                LOGGER.error("start container failed, worker:{}, uniqueId:{}, serialNum={}", workerAddr, uniqueId,
                        this.getSerialNum(), e);
                logCollector.collect(info.getAppGroupId(), uniqueId, ClientLoggerMessage.appendMessage(ClientLoggerMessage.BROADCAST_INSTANCE_INIT_FAIL, workerAddr), 
                    e, info.getGroupId());
            }
        }
        // All retries exhausted: report this worker's task as FAILED.
        this.existInvalidWorker = true;
        ContainerReportTaskStatusRequest failedRequest = ContainerReportTaskStatusRequest.newBuilder()
                .setJobId(info.getJobId())
                .setJobInstanceId(info.getJobInstanceId())
                .setTaskId(taskId)
                .setStatus(TaskStatus.FAILED.getValue())
                .setResult(result)
                .setWorkerId(workerId)
                .setWorkerAddr(workerAddr)
                .setSerialNum(getSerialNum())
                .build();
        updateTaskStatus(failedRequest);
    }

    /**
     * Kills the whole instance: propagates the kill to every worker, marks the
     * instance FAILED and drops all pending task statuses.
     */
    @Override
    public void killInstance(boolean mayInterruptIfRunning, String reason) {
        super.killInstance(mayInterruptIfRunning, reason);
        this.sendKillContainerRequest(mayInterruptIfRunning, allWorkers);
        updateNewInstanceStatus(getSerialNum(), jobInstanceInfo.getJobInstanceId(), InstanceStatus.FAILED, reason);
        // Clearing taskStatusMap lets the instance be treated as finished immediately.
        taskStatusMap.clear();
    }

    /**
     * Asks every worker (with at-least-once delivery) to tear down the
     * container pool of this job instance.
     */
    @Override
    public void destroyContainerPool() {
        for (String workerIdAddr : this.allWorkers) {
            MasterDestroyContainerPoolRequest request = MasterDestroyContainerPoolRequest.newBuilder()
                .setJobInstanceId(jobInstanceInfo.getJobInstanceId())
                .setJobId(jobInstanceInfo.getJobId())
                .setWorkerIdAddr(workerIdAddr)
                .setSerialNum(getSerialNum())
                .build();
            SchedulerxWorker.AtLeastDeliveryRoutingActor.tell(request, null);
        }
    }

    /**
     * Applies a container status report: updates the per-task status map, the
     * per-worker progress counters and, finally, the derived instance status.
     * Reports from a stale serialNum or duplicated reports are ignored.
     */
    @Override
    public synchronized void updateTaskStatus(ContainerReportTaskStatusRequest request) {
        if (request.getSerialNum() != this.getSerialNum()) {
            LOGGER.warn("ignore ContainerReportTaskStatusRequest, current serialNum={}, but request serialNum={}.",
                this.getSerialNum(), request.getSerialNum());
            return;
        }
        long jobId = request.getJobId();
        long jobInstanceId = request.getJobInstanceId();
        long taskId = request.getTaskId();
        String workerAddr = request.getWorkerAddr();
        TaskStatus taskStatus = TaskStatus.parseValue(request.getStatus());
        String uniqueId = IdUtil.getUniqueId(jobId, jobInstanceId, taskId);
        LOGGER.info("update task status serialNum={}, uniqueId={}, status={}, workerAddr={}", request.getSerialNum(), uniqueId,
                taskStatus.getDescription(), workerAddr);
        
        if (taskStatusMap.containsKey(uniqueId)) {
            if (taskStatusMap.get(uniqueId).equals(taskStatus)) {
                LOGGER.warn("duplicated ContainerReportTaskStatusRequest, uniqueId={}, taskStatus={}", uniqueId, taskStatus);
            } else {
                if (taskStatus.equals(TaskStatus.SUCCESS)) {
                    // A finished worker is removed so an empty map means "all done".
                    taskStatusMap.remove(uniqueId);
                } else {
                    // Record the new (e.g. RUNNING/FAILED) status.
                    taskStatusMap.put(uniqueId, taskStatus);
                }
        
                if (!workerProgressMap.containsKey(workerAddr)) {
                    WorkerProgressCounter workerProgressCounter = new WorkerProgressCounter(workerAddr);
                    workerProgressMap.put(workerAddr, workerProgressCounter);
                }
                if (taskStatus.equals(TaskStatus.RUNNING)) {
                    workerProgressMap.get(workerAddr).incrementRunning();
                } else if (taskStatus.equals(TaskStatus.SUCCESS)) {
                    workerProgressMap.get(workerAddr).incrementSuccess();
                } else if (taskStatus.equals(TaskStatus.FAILED)) {
                    workerProgressMap.get(workerAddr).incrementFailed();
                }
                if (StringUtils.isNotBlank(request.getTraceId())) {
                    workerProgressMap.get(workerAddr).setTraceId(request.getTraceId());
                }
                
                // Keep the per-taskId views (used by postFinish) in sync.
                taskIdResultMap.put(request.getTaskId(), request.getResult());
                taskIdStatusMap.put(request.getTaskId(), taskStatus);
        
                updateNewInstanceStatus(request.getSerialNum(), jobInstanceId, request.getResult());
            }
        }
    }

    /**
     * Derives the instance status from the remaining task statuses and pushes
     * it upstream: RUNNING while tasks remain unfinished, FAILED if any task
     * failed (or the instance was killed), SUCCESS otherwise.
     */
    private synchronized void updateNewInstanceStatus(long serialNum, long jobInstanceId, String result) {
        InstanceStatus newStatus = killed ? InstanceStatus.FAILED : InstanceStatus.SUCCESS;
        if (taskStatusMap.size() > 0) {
            if (!isJobInstanceFinished()) {
                newStatus = InstanceStatus.RUNNING;
            } else {
                newStatus = InstanceStatus.SUCCESS;
                // A single FAILED sub-task fails the whole instance.
                for (TaskStatus status : taskStatusMap.values()) {
                    if (status.equals(TaskStatus.FAILED)) {
                        newStatus = InstanceStatus.FAILED;
                        break;
                    }
                }
            }
        }
        LOGGER.info("update serialNum={}, jobInstanceId={} status={}", serialNum, jobInstanceId, newStatus.getDescription());
        updateNewInstanceStatus(serialNum, jobInstanceId, newStatus, result);
    }

    /**
     * @return JSON-serialized per-worker progress of this instance
     */
    @Override
    public String getJobInstanceProgress() {
        MapTaskProgress detail = new MapTaskProgress();
        detail.setWorkerProgress(workerProgressMap.values());
        return JsonUtil.toJson(detail);
    }

    /**
     * Starts the three background threads (worker-alive check, progress
     * report, status check) at most once; subsequent calls only re-open the
     * {@code monitor} gate for the next broadcast round.
     */
    private synchronized void startMonitorThreads() {
        // Re-open the gate: for second-delay jobs the monitor is only allowed
        // to evaluate statuses after each broadcast round is fully dispatched.
        monitor = true;
        if (running) {
            return;
        }
        final String jobIdAndInstanceId = jobInstanceInfo.getJobId() + "_" + jobInstanceInfo.getJobInstanceId();
        final BroadcastTaskMaster taskMaster = this;
        new Thread(new Runnable() {
            @Override
            public void run() {
                while (!isFinished()) {
                    if (monitor) {
                        aliveCheckWorkerSet.addAll(worker2uniqueIdMap.keySet());
                        for (String workerIdAddr : aliveCheckWorkerSet) {
                            try {
                                ActorSelection selection = getActorContext().actorSelection(ActorPathUtil.getWorkerHeartbeatRouterPath(workerIdAddr));
                                MasterCheckWorkerAliveRequest request = MasterCheckWorkerAliveRequest.newBuilder()
                                        .setJobInstanceId(jobInstanceInfo.getJobInstanceId())
                                        .build();
                                FutureUtils.awaitResult(selection, request, 5);
                            } catch (TimeoutException e) {
                                // No heartbeat reply within the timeout: treat the worker as down.
                                handleWorkerShutdown(workerIdAddr, true);
                            } catch (Throwable e) {
                                LOGGER.error("check worker alive failed.", e);
                            }
                        }
                    }
                    try {
                        Thread.sleep(5000);
                    } catch (InterruptedException e) {
                        LOGGER.error("", e);
                        break;
                    }
                }
            }
        }, "Schedulerx-BroadcastTaskMaster-check-worker-alive-thread-" + jobInstanceInfo.getJobId() + "_"
            + jobInstanceInfo.getJobInstanceId()).start();
        
        // Job instance progress report thread (not needed for second-delay jobs).
        if (!JobUtil.isSecondTypeJob(TimeType.parseValue(jobInstanceInfo.getTimeType()))) {
            new Thread(new Runnable() {
                @Override
                public void run() {
                    while (!isFinished()) {
                        WorkerReportJobInstanceProgressRequest request = WorkerReportJobInstanceProgressRequest.newBuilder().setJobId(jobInstanceInfo.getJobId()).setJobInstanceId(
                            jobInstanceInfo.getJobInstanceId()).setProgress(getJobInstanceProgress()).build();
                        SERVER_DISCOVERY.getMapMasterRouter().tell(request, null);
                        try {
                            Thread.sleep(5000);
                        } catch (InterruptedException e) {
                            LOGGER.error("report status error, uniqueId={}", jobIdAndInstanceId, e);
                            break;
                        }
                    }
                }
            }, "Schedulerx-BroadcastTaskMaster-report-progress-thread-" + jobIdAndInstanceId).start();
        }
        
        // Status check thread: periodically re-derives the instance status.
        new Thread(new Runnable() {
            @Override
            public void run() {
            while (!isFinished()) {
                try {
                    Thread.sleep(5000);
                    synchronized (taskMaster) {
                        if (!monitor) {
                            continue;
                        }
                        if (taskStatusMap.size() < 10) {
                            LOGGER.info("taskStatusMap=" + taskStatusMap);
                        }
                        updateNewInstanceStatus(getSerialNum(), jobInstanceInfo.getJobInstanceId(), "");
                    }
                } catch (Throwable e) {
                    LOGGER.error("status check error, uniqueId:{}", jobIdAndInstanceId, e);
                }
            }
            }
        }, "Schedulerx-BroadcastTaskMaster-status-check-thread-" + jobIdAndInstanceId).start();

        running = true;
    }

    /**
     * Getter method for property workerProgressMap.
     *
     * @return property value of workerProgressMap
     */
    public Map<String, WorkerProgressCounter> getWorkerProgressMap() {
        return workerProgressMap;
    }

    /**
     * Rejects java processors that extend MapJobProcessor/MapReduceJobProcessor,
     * which are incompatible with broadcast execution.
     *
     * @throws Exception if the configured processor class is a map processor
     */
    @Override
    protected void checkProcessor() throws Exception {
        if ("java".equalsIgnoreCase(jobInstanceInfo.getJobType())) {
            JavaProcessorProfile profile = JsonUtil.fromJson(jobInstanceInfo.getContent(), JavaProcessorProfile.class);
            if (JobProcessorUtil.checkJavaProcessor(profile.getClassName(), MapJobProcessor.class)) {
                throw new IOException(profile.getClassName() + " shouldn't extends MapJobProcessor or MapReduceJobProcessor");
            }
        }
    }

    /**
     * Invokes the user's {@code postProcess} hook (java jobs only) with the
     * collected per-task results and statuses after all workers have finished.
     *
     * @param jobInstanceId job instance that finished
     * @return the hook's result, or a successful result when no hook applies
     */
    @SuppressWarnings("resource")
    @Override
    public ProcessResult postFinish(long jobInstanceId) {
        ProcessResult postResult = new ProcessResult(true);
        if ("java".equalsIgnoreCase(jobInstanceInfo.getJobType())) {
            try {
                JobContext context = JobContext.newBuilder()
                        .setJobId(jobInstanceInfo.getJobId())
                        .setJobInstanceId(jobInstanceId)
                        .setJobType(jobInstanceInfo.getJobType())
                        .setContent(jobInstanceInfo.getContent())
                        .setScheduleTime(jobInstanceInfo.getScheduleTime())
                        .setDataTime(jobInstanceInfo.getDataTime())
                        .setJobParameters(jobInstanceInfo.getParameters())
                        .setInstanceParameters(jobInstanceInfo.getInstanceParameters())
                        .setUser(jobInstanceInfo.getUser())
                        .setTaskResults(taskIdResultMap)
                        .setTaskStatuses(taskIdStatusMap)
                        .setSerialNum(this.getSerialNum())
                        .build();
                JobProcessor jobProcessor = JobProcessorUtil.getJavaProcessor(context.getContent());
                if (jobProcessor instanceof JobProcessorEx) {
                    postResult = ((JobProcessorEx) jobProcessor).postProcess(context);
                }
            } catch (Throwable e) {
                LOGGER.error("", e);
            }
        }
        return postResult;
    }

    /**
     * Invokes the user's {@code preProcess} hook before the broadcast starts.
     *
     * @throws Exception any failure from the user hook; propagated so the
     *                   instance is failed before dispatching
     */
    private void preProcess(JobInstanceInfo jobInstanceInfo) throws Exception {
        JobContext context = JobContext.newBuilder()
                .setJobId(jobInstanceInfo.getJobId())
                .setJobInstanceId(jobInstanceInfo.getJobInstanceId())
                .setJobType(jobInstanceInfo.getJobType())
                .setContent(jobInstanceInfo.getContent())
                .setScheduleTime(jobInstanceInfo.getScheduleTime())
                .setDataTime(jobInstanceInfo.getDataTime())
                .setJobParameters(jobInstanceInfo.getParameters())
                .setInstanceParameters(jobInstanceInfo.getInstanceParameters())
                .setUser(jobInstanceInfo.getUser())
                .setSerialNum(this.getSerialNum())
                .build();
        JobProcessor jobProcessor = JobProcessorUtil.getJavaProcessor(context.getContent());
        if (jobProcessor instanceof JobProcessorEx) {
            ((JobProcessorEx) jobProcessor).preProcess(context);
        }
    }

    /**
     * Resets per-round state so the master can run the next broadcast round.
     */
    @Override
    public void clear() {
        super.clear();
        worker2uniqueIdMap.clear();
        workerProgressMap.clear();
        monitor = false;
        if (taskIdResultMap != null) {
            taskIdResultMap.clear();
        }
        
        if (taskIdStatusMap != null) {
            taskIdStatusMap.clear();
        }
    }

    /**
     * Marks the task running on a dead worker as FAILED. If the worker never
     * successfully received a task, there is nothing to fail over.
     */
    @Override
    public void handleWorkerShutdown(String workerIdAddr, boolean withFailover) {
        this.existInvalidWorker = true;
        String uniqueId = worker2uniqueIdMap.get(workerIdAddr);
        if (uniqueId != null) {
            String[] workerInfo = workerIdAddr.split("@");
            String workerAddr = workerInfo[1];
            String workerId = workerInfo[0];
            String[] tokens = uniqueId.split(IdUtil.SPLITTER_TOKEN);
            ContainerReportTaskStatusRequest request = ContainerReportTaskStatusRequest.newBuilder()
                    .setJobId(Long.valueOf(tokens[0]))
                    .setJobInstanceId(Long.valueOf(tokens[1]))
                    .setTaskId(Long.valueOf(tokens[2]))
                    .setStatus(TaskStatus.FAILED.getValue())
                    .setWorkerAddr(workerAddr)
                    .setWorkerId(workerId)
                    .setSerialNum(this.getSerialNum())
                    .build();
            updateTaskStatus(request);
            LOGGER.warn("worker[{}] is down, set {} to failed", workerAddr, uniqueId);
        } else {
            // uniqueId is null on this branch; log the worker address instead.
            LOGGER.error("can't find uniqueId of workerIdAddr={}", workerIdAddr);
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy