All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.alibaba.schedulerx.worker.master.TaskMaster Maven / Gradle / Ivy

There is a newer version: 1.12.2
Show newest version
package com.alibaba.schedulerx.worker.master;

import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;

import org.apache.commons.collections.CollectionUtils;

import com.alibaba.schedulerx.common.domain.InstanceStatus;
import com.alibaba.schedulerx.common.domain.JobInstanceData;
import com.alibaba.schedulerx.common.domain.JobInstanceInfo;
import com.alibaba.schedulerx.common.domain.MapTaskXAttrs;
import com.alibaba.schedulerx.common.domain.Pair;
import com.alibaba.schedulerx.common.domain.TaskStatus;
import com.alibaba.schedulerx.common.util.IdUtil;
import com.alibaba.schedulerx.common.util.JsonUtil;
import com.alibaba.schedulerx.common.util.StringUtils;
import com.alibaba.schedulerx.protocol.Common.UpstreamData;
import com.alibaba.schedulerx.protocol.Server.RetryTaskEntity;
import com.alibaba.schedulerx.protocol.Worker;
import com.alibaba.schedulerx.protocol.Worker.ContainerBatchReportTaskStatuesRequest;
import com.alibaba.schedulerx.protocol.Worker.ContainerReportTaskStatusRequest;
import com.alibaba.schedulerx.protocol.Worker.MasterStartContainerRequest;
import com.alibaba.schedulerx.protocol.Worker.TaskStatusInfo;
import com.alibaba.schedulerx.worker.container.ShutdownMode;
import com.alibaba.schedulerx.worker.discovery.ServerDiscovery;
import com.alibaba.schedulerx.worker.discovery.ServerDiscoveryFactory;
import com.alibaba.schedulerx.worker.domain.WorkerConstants;
import com.alibaba.schedulerx.worker.log.LogFactory;
import com.alibaba.schedulerx.worker.log.Logger;
import com.alibaba.schedulerx.worker.master.handler.UpdateInstanceStatusHandler;
import com.alibaba.schedulerx.worker.master.handler.UpdateInstanceStatusHandlerFactory;
import com.alibaba.schedulerx.worker.master.scheduler.TimeScheduler;
import com.alibaba.schedulerx.worker.processor.ProcessResult;
import com.alibaba.schedulerx.worker.util.ActorPathUtil;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.google.protobuf.ByteString;

import akka.actor.ActorContext;
import akka.actor.ActorSelection;
import akka.pattern.Patterns;
import akka.util.Timeout;
import scala.concurrent.Await;
import scala.concurrent.Future;
import scala.concurrent.duration.Duration;

/**
 * @author xiaomeng.hxm
 */
public abstract class TaskMaster {
    /** Akka actor context used to resolve actor selections and the local default address. */
    private final ActorContext actorContext;
    /** Status of the job instance; starts as RUNNING and is set back to RUNNING by clear(). */
    protected volatile InstanceStatus instanceStatus = InstanceStatus.RUNNING;
    /**
     * Last reported status of each task, keyed by the id produced by
     * IdUtil.getUniqueId(jobId, jobInstanceId, taskId).
     */
    protected volatile Map<String, TaskStatus> taskStatusMap = Maps.newHashMap();
    /** Counter behind aquireTaskId(); starts at 0 and is reset to 0 by clear(). */
    protected AtomicLong taskIdGenerator = new AtomicLong(0);
    //private final String localWorkerAkkaPath;
    /** hostPort() of the local actor system's default address. */
    private final String localWorkIdAddr;
    /** Local default address followed by WorkerConstants.WORKER_AKKA_CONTAINER_ROUTING_PATH. */
    private final String localContainerRouterPath;
    /** Local default address followed by WorkerConstants.WORKER_AKKA_TASK_ROUTING_PATH. */
    private final String localTaskRouterPath;
    /** Local default address followed by WorkerConstants.WORKER_AKKA_JOB_INSTANCE_ROUTING_PATH. */
    private final String localInstanceRouterPath;
    /** Description of the job instance this master is responsible for. */
    protected final JobInstanceInfo jobInstanceInfo;
    /** Progress string taken from the most recent task status report. */
    protected String jobInstanceProgress;
    /** Handler that receives every instance-status update computed by this master. */
    protected final UpdateInstanceStatusHandler statusHandler;
    /** Set to true by killInstance(...). */
    protected volatile boolean killed = false;
    /** Set to true by the first call to init(). */
    protected volatile boolean INITED = false;
    /** Workers subject to alive checks (presumably worker id/address strings - confirm against subclasses). */
    protected volatile Set<String> aliveCheckWorkerSet = Sets.newConcurrentHashSet();
    /** Worker id/address strings that handleWorkerOffline(...) has marked as offline. */
    protected volatile Set<String> invalidWorkerSet = Sets.newConcurrentHashSet();
    /** Server discovery obtained for the job instance's group id. */
    protected final ServerDiscovery SERVER_DISCOVERY;
    // Used by second-level (per-second) jobs: the current loop iteration number.
    protected AtomicLong serialNum = new AtomicLong(0);
    private static final Logger LOGGER = LogFactory.getLogger(TaskMaster.class);

    /** Latch that terminate(...) waits on; released by latchRelease(). */
    protected final CountDownLatch latch = new CountDownLatch(1);

    /**
     * Whether an invalid (offline) worker exists.
     */
    protected volatile boolean existInvalidWorker = false;

    public TaskMaster(JobInstanceInfo jobInstanceInfo, ActorContext actorContext) throws Exception {
        this.jobInstanceInfo = jobInstanceInfo;
        this.actorContext = actorContext;
        this.localWorkIdAddr = actorContext.provider().getDefaultAddress().hostPort();
        this.localInstanceRouterPath = actorContext.provider().getDefaultAddress().toString()
                + WorkerConstants.WORKER_AKKA_JOB_INSTANCE_ROUTING_PATH;
        this.localContainerRouterPath = actorContext.provider().getDefaultAddress().toString()
            + WorkerConstants.WORKER_AKKA_CONTAINER_ROUTING_PATH;
        this.localTaskRouterPath = actorContext.provider().getDefaultAddress().toString()
            + WorkerConstants.WORKER_AKKA_TASK_ROUTING_PATH;
        this.SERVER_DISCOVERY = ServerDiscoveryFactory.getDiscovery(jobInstanceInfo.getGroupId());
//        this.aliveCheckWorkerSet.addAll(jobInstanceInfo.getAllWorkers());
        checkProcessor();
        this.statusHandler = UpdateInstanceStatusHandlerFactory.create(this, jobInstanceInfo);
    }

    /**
     * Returns the actor context supplied to the constructor.
     *
     * @return the actor context of this master
     */
    public ActorContext getActorContext() {
        return actorContext;
    }
    
    /**
     * Returns the {@code hostPort()} of the local actor system's default address, as captured
     * in the constructor.
     *
     * @return the local worker address string
     */
    public String getLocalWorkerIdAddr() {
        return localWorkIdAddr;
    }

    /**
     * Returns the local default address followed by
     * {@code WorkerConstants.WORKER_AKKA_JOB_INSTANCE_ROUTING_PATH}.
     *
     * @return the local job-instance router path
     */
    public String getLocalJobInstanceRouterPath() {
        return localInstanceRouterPath;
    }

    /**
     * Returns the local default address followed by
     * {@code WorkerConstants.WORKER_AKKA_CONTAINER_ROUTING_PATH}.
     *
     * @return the local container router path
     */
    public String getLocalContainerRouterPath() {
        return localContainerRouterPath;
    }

    /**
     * Returns the local default address followed by
     * {@code WorkerConstants.WORKER_AKKA_TASK_ROUTING_PATH}. This value is also sent to containers
     * as the instance master path when a start-container request is built.
     *
     * @return the local task router path
     */
    public String getLocalTaskRouterPath() {
        return localTaskRouterPath;
    }

    /**
     * Reports whether every task recorded in {@code taskStatusMap} has reached a finished status.
     * An empty map yields {@code true}.
     *
     * @return {@code false} as soon as one unfinished task is found, {@code true} otherwise
     */
    public boolean isJobInstanceFinished() {
        // TODO needs refactoring: scanning every task status on each call is inefficient
        for (TaskStatus current : taskStatusMap.values()) {
            if (!current.isFinish()) {
                return false;
            }
        }
        return true;
    }

    /**
     * Records the status reported for one task and recomputes the status of the whole instance.
     *
     * <p>The aggregated status is RUNNING while any task is unfinished; once all tasks are
     * finished it is FAILED if at least one task is FAILED, otherwise SUCCESS. The request's
     * trace id and progress are stored, and the result is forwarded to
     * {@code updateNewInstanceStatus}.
     *
     * @param request status report of a single task
     */
    public void updateTaskStatus(ContainerReportTaskStatusRequest request) {
        final long jobId = request.getJobId();
        final long jobInstanceId = request.getJobInstanceId();
        final long taskId = request.getTaskId();
        final TaskStatus reported = TaskStatus.parseValue(request.getStatus());

        // TODO this code is poorly written and extremely inefficient; it needs refactoring
        taskStatusMap.put(IdUtil.getUniqueId(jobId, jobInstanceId, taskId), reported);

        InstanceStatus aggregated;
        if (taskStatusMap.size() <= 0) {
            aggregated = InstanceStatus.UNKNOWN;
        } else if (!isJobInstanceFinished()) {
            aggregated = InstanceStatus.RUNNING;
        } else {
            // A single FAILED subtask makes the whole instance FAILED.
            aggregated = InstanceStatus.SUCCESS;
            for (TaskStatus current : taskStatusMap.values()) {
                if (current.equals(TaskStatus.FAILED)) {
                    aggregated = InstanceStatus.FAILED;
                    break;
                }
            }
        }

        this.jobInstanceInfo.setTraceId(request.getTraceId());
        this.jobInstanceProgress = request.getProgress();
        updateNewInstanceStatus(request.getSerialNum(), jobInstanceId, aggregated, request.getResult());
    }

    /**
     * Applies a batch status report by turning every entry into an individual
     * {@code ContainerReportTaskStatusRequest} and passing it to
     * {@link #updateTaskStatus(ContainerReportTaskStatusRequest)}. Optional fields are copied
     * only when present on the source message.
     *
     * <p>TODO: MapTaskMaster may override this method do really batch process
     *
     * @param request batch report containing one status entry per task
     * @throws Exception declared for overriding implementations
     */
    public void batchUpdateTaskStatus(ContainerBatchReportTaskStatuesRequest request) throws Exception {
        for (TaskStatusInfo entry : request.getTaskStatuesList()) {
            ContainerReportTaskStatusRequest.Builder single = ContainerReportTaskStatusRequest.newBuilder();
            single.setJobId(request.getJobId());
            single.setJobInstanceId(request.getJobInstanceId());
            single.setTaskId(entry.getTaskId());
            single.setWorkerAddr(request.getWorkerAddr());
            single.setWorkerId(request.getWorkerId());
            single.setStatus(entry.getStatus());

            if (entry.hasResult()) {
                single.setResult(entry.getResult());
            }
            if (entry.hasTaskName()) {
                single.setTaskName(entry.getTaskName());
            }
            if (entry.hasProgress()) {
                single.setProgress(entry.getProgress());
            }
            // The serial number lives on the batch request, not on the individual entry.
            if (request.hasSerialNum()) {
                single.setSerialNum(request.getSerialNum());
            }
            if (entry.hasTraceId()) {
                single.setTraceId(entry.getTraceId());
            }

            updateTaskStatus(single.build());
        }
    }

    /**
     * Hook for processing several status reports at once; does nothing in this base class.
     *
     * @param requests the reports to process (NOTE(review): the element type appears to have been
     *                 lost; presumably ContainerReportTaskStatusRequest - confirm against subclasses)
     */
    public void batchUpdateTaskStatues(List requests){}

    /**
     * Marks this master as killed and removes the job instance from {@code TimeScheduler.INSTANCE}.
     * Neither parameter is used by this base implementation.
     *
     * @param mayInterruptIfRunning unused here
     * @param reason                unused here
     */
    public void killInstance(boolean mayInterruptIfRunning, String reason) {
        this.killed = true;
        TimeScheduler.INSTANCE.remove(jobInstanceInfo.getJobInstanceId());
    }

    /**
     * Sends a kill-container request for this job instance to every worker in {@code allWorkers}.
     *
     * <p>When {@code mayInterruptIfRunning} is {@code true} the request is sent fire-and-forget.
     * Otherwise each request is sent with an ask using a 5 second timeout, and after all requests
     * have been dispatched this method waits for each reply in turn. A failure for one worker is
     * logged and does not stop processing of the remaining workers.
     *
     * @param mayInterruptIfRunning flag copied into the request; also selects tell versus ask
     * @param allWorkers            worker id/address strings to notify
     */
    protected void sendKillContainerRequest(boolean mayInterruptIfRunning, List<String> allWorkers) {
        String uniqueId = IdUtil.getUniqueIdWithoutTask(jobInstanceInfo.getJobId(), jobInstanceInfo.getJobInstanceId());
        Set<Pair<String, Future<Object>>> futures = Sets.newHashSet();
        Timeout timeout = new Timeout(Duration.create(5, TimeUnit.SECONDS));
        for (String workerIdAddr : allWorkers) {
            try {
                ActorSelection selection = getActorContext().actorSelection(ActorPathUtil.getContainerRouterPath(workerIdAddr));
                Worker.MasterKillContainerRequest request = Worker.MasterKillContainerRequest.newBuilder()
                        .setJobId(jobInstanceInfo.getJobId())
                        .setJobInstanceId(jobInstanceInfo.getJobInstanceId())
                        .setMayInterruptIfRunning(mayInterruptIfRunning)
                        .setAppGroupId(jobInstanceInfo.getAppGroupId())
                        .setGroupId(jobInstanceInfo.getGroupId())
                        .build();
                if (mayInterruptIfRunning) {
                    // Forced termination: dispatch asynchronously without waiting for a reply.
                    selection.tell(request, null);
                } else {
                    // Dispatch asynchronously; the reply is awaited after the loop.
                    Future<Object> future = Patterns.ask(selection, request, timeout);
                    futures.add(new Pair<String, Future<Object>>(workerIdAddr, future));
                }
            } catch (Throwable e) {
                // Best effort per worker: keep going, but record the cause.
                LOGGER.warn("send kill instance request exception, worker:{}, uninqueId:{}", workerIdAddr, uniqueId, e);
            }
        }

        // Wait for all replies to come back.
        for (Pair<String, Future<Object>> pair : futures) {
            try {
                Await.result(pair.getSecond(), timeout.duration());
            } catch (Exception e) {
                LOGGER.warn("send kill instance request timeout, worker:{}, uninqueId:{}", pair.getFirst(), uniqueId, e);
            }
        }
    }

    /**
     * Releases the container pool of this master. Invoked as the last step of {@link #clear()}.
     */
    public abstract void destroyContainerPool();

    /**
     * Placeholder for killing a single task; not implemented in this base class.
     *
     * @param uniqueId   id of the task to kill
     * @param workerId   id of the worker running the task
     * @param workerAddr address of the worker running the task
     */
    public void killTask(String uniqueId, String workerId, String workerAddr) {
        //TODO
    }

    /**
     * Marks this master as initialised. Calls made after the flag is set have no effect.
     */
    protected void init() {
        if (!INITED) {
            INITED = true;
        }
    }

    /**
     * Hook for retrying tasks; does nothing in this base class.
     *
     * @param taskEntities the tasks to retry (NOTE(review): the element type appears to have been
     *                     lost; presumably RetryTaskEntity, which this file imports - confirm)
     */
    public void retryTasks(List taskEntities) {}

    /**
     * Submits the given job instance for execution; the strategy is defined by each subclass.
     *
     * @param jobInstanceInfo the job instance to submit
     * @throws Exception if submission fails
     */
    public abstract void submitInstance(JobInstanceInfo jobInstanceInfo) throws Exception;

    /**
     * Hands out the next task id.
     *
     * @return the current value of {@code taskIdGenerator}, which is then incremented; the first
     *         value after construction or {@link #clear()} is 0
     */
    protected long aquireTaskId() {
        return taskIdGenerator.getAndIncrement();
    }

    /**
     * Returns the progress string stored by the most recent
     * {@link #updateTaskStatus(ContainerReportTaskStatusRequest)} call.
     *
     * @return the last stored progress, or {@code null} if none has been stored yet
     */
    public String getJobInstanceProgress() {
        return jobInstanceProgress;
    }

    /**
     * Reports a new instance status for this master's own job instance by delegating to the
     * four-argument overload with {@code jobInstanceInfo.getJobInstanceId()}.
     *
     * @param serialNum serial number passed through to the status handler
     * @param newStatus status to report
     * @param result    result text passed through to the status handler
     */
    public void updateNewInstanceStatus(long serialNum, InstanceStatus newStatus, String result) {
        updateNewInstanceStatus(serialNum, jobInstanceInfo.getJobInstanceId(), newStatus, result);
    }

    /**
     * Passes a status update to {@code statusHandler} while holding this object's monitor.
     * Any exception thrown by the handler is logged and not rethrown.
     *
     * @param serialNum     serial number passed to the handler
     * @param jobInstanceId used only in the error log message
     * @param newStatus     status passed to the handler
     * @param result        result text passed to the handler
     */
    protected synchronized void updateNewInstanceStatus(long serialNum, long jobInstanceId, InstanceStatus newStatus,
        String result) {
        try {
            this.statusHandler.handle(serialNum, newStatus, result);
        } catch (Exception e) {
            LOGGER.error("update jobInstanceId={} serialNum={}, status={} failed", jobInstanceId, serialNum, newStatus.getValue(), e);
        }
    }

    /**
     * Hook for stopping this master; does nothing in this base class.
     */
    public void stop() {}

    /**
     * Resets per-run state: empties the task status map and both worker sets, restarts task ids
     * at 0, sets the instance status back to RUNNING and finally destroys the container pool.
     * The killed flag, the inited flag, the serial number and the invalid-worker flag are not
     * touched here.
     */
    public void clear() {
        taskStatusMap.clear();
        taskIdGenerator.set(0);
        instanceStatus = InstanceStatus.RUNNING;
        aliveCheckWorkerSet.clear();
        invalidWorkerSet.clear();
        destroyContainerPool();
    }

    /**
     * Hook invoked with the id of a job instance for post-finish processing; this base
     * implementation does nothing.
     *
     * @param jobInstanceId id of the job instance
     * @return always {@code null} in this base class
     */
    public ProcessResult postFinish(long jobInstanceId) {
        return null;
    }

    /**
     * Builds a start-container request builder without a task name or task body and with
     * failover disabled.
     *
     * @param jobInstanceInfo job instance whose attributes are copied into the request
     * @param taskId          id of the task to start
     * @return a populated, not yet built, request builder
     */
    protected MasterStartContainerRequest.Builder convert2StartContainerRequestBuilder(JobInstanceInfo jobInstanceInfo, long taskId) {
        return convert2StartContainerRequestBuilder(jobInstanceInfo, taskId, null, null, false);
    }
    
    /**
     * Builds a complete start-container request with failover disabled.
     *
     * @param jobInstanceInfo job instance whose attributes are copied into the request
     * @param taskId          id of the task to start
     * @param taskName        task name, or {@code null} to leave it unset
     * @param taskBody        serialized task, or {@code null} to leave it unset
     * @return the built request
     */
    protected MasterStartContainerRequest convert2StartContainerRequest(JobInstanceInfo jobInstanceInfo, long taskId,
            String taskName, ByteString taskBody) {
        return convert2StartContainerRequestBuilder(jobInstanceInfo, taskId, taskName, taskBody, false).build();
    }
    
    /**
     * Copies the attributes of a job instance into a start-container request builder.
     *
     * <p>Mandatory attributes are always set. Upstream data, extended attributes (xattrs), task
     * name, task body, failover flag, workflow instance id, job name, namespace, template and
     * workflow id are set only when present. When xattrs are present they are also parsed as
     * {@code MapTaskXAttrs} to fill the consumer count and the task retry settings. The serial
     * number is taken from {@link #getSerialNum()} and the master path from
     * {@link #getLocalTaskRouterPath()}.
     *
     * @param jobInstanceInfo job instance whose attributes are copied
     * @param taskId          id of the task to start
     * @param taskName        task name, or {@code null} to leave it unset
     * @param taskBody        serialized task, or {@code null} to leave it unset
     * @param failover        when {@code true} the request is flagged as a failover start
     * @return the populated, not yet built, request builder
     */
    protected MasterStartContainerRequest.Builder convert2StartContainerRequestBuilder(JobInstanceInfo jobInstanceInfo, long taskId,
            String taskName, ByteString taskBody, boolean failover) {
        // Attributes that are always present.
        MasterStartContainerRequest.Builder request = MasterStartContainerRequest.newBuilder()
                .setJobId(jobInstanceInfo.getJobId())
                .setJobInstanceId(jobInstanceInfo.getJobInstanceId())
                .setTaskId(taskId)
                .setAppGroupId(jobInstanceInfo.getAppGroupId())
                .setUser(jobInstanceInfo.getUser())
                .setJobType(jobInstanceInfo.getJobType())
                .setContent(jobInstanceInfo.getContent())
                .setScheduleTime(jobInstanceInfo.getScheduleTime().getMillis())
                .setDataTime(jobInstanceInfo.getDataTime().getMillis())
                .setParameters(jobInstanceInfo.getParameters())
                .setInstanceParameters(jobInstanceInfo.getInstanceParameters())
                .setInstanceMasterAkkaPath(getLocalTaskRouterPath())
                .setGroupId(jobInstanceInfo.getGroupId())
                .setMaxAttempt(jobInstanceInfo.getMaxAttempt())
                .setAttempt(jobInstanceInfo.getAttempt())
                .setTriggerType(jobInstanceInfo.getTriggerType());

        // Data handed over from upstream jobs, if any.
        if (jobInstanceInfo.getUpstreamData() != null && !jobInstanceInfo.getUpstreamData().isEmpty()) {
            for (JobInstanceData upstream : jobInstanceInfo.getUpstreamData()) {
                request.addUpstreamData(UpstreamData.newBuilder()
                        .setJobName(upstream.getJobName())
                        .setData(upstream.getData())
                        .build());
            }
        }

        // Extended attributes are forwarded verbatim and also unpacked into dedicated fields.
        if (StringUtils.isNotEmpty(jobInstanceInfo.getXattrs())) {
            request.setXattrs(jobInstanceInfo.getXattrs());
            MapTaskXAttrs mapTaskAttrs = JsonUtil.fromJson(jobInstanceInfo.getXattrs(), MapTaskXAttrs.class);
            request.setConsumerNum(mapTaskAttrs.getConsumerSize());
            request.setTaskMaxAttempt(mapTaskAttrs.getTaskMaxAttempt());
            request.setTaskAttemptInterval(mapTaskAttrs.getTaskAttemptInterval());
        }

        if (taskName != null) {
            request.setTaskName(taskName);
        }
        if (taskBody != null) {
            request.setTask(taskBody);
        }
        if (failover) {
            request.setFailover(true);
        }
        if (jobInstanceInfo.getWfInstanceId() != null) {
            request.setWfInstanceId(jobInstanceInfo.getWfInstanceId());
        }

        request.setSerialNum(getSerialNum());
        request.setExecuteMode(jobInstanceInfo.getExecuteMode());
        if (jobInstanceInfo.getJobName() != null) {
            request.setJobName(jobInstanceInfo.getJobName());
        }

        request.setTimeType(jobInstanceInfo.getTimeType());
        request.setTimeExpression(jobInstanceInfo.getTimeExpression());
        if (StringUtils.isNotEmpty(jobInstanceInfo.getNamespace())) {
            request.setNamespace(jobInstanceInfo.getNamespace());
        }
        if (StringUtils.isNotEmpty(jobInstanceInfo.getTemplate())) {
            request.setTemplate(jobInstanceInfo.getTemplate());
        }
        if (jobInstanceInfo.getWorkflowId() != null) {
            request.setWorkflowId(jobInstanceInfo.getWorkflowId());
        }

        return request;
    }

    /**
     * Returns the current status of the job instance held by this master.
     *
     * @return the value of the volatile {@code instanceStatus} field
     */
    public InstanceStatus getInstanceStatus() {
        return instanceStatus;
    }

    /**
     * Replaces the current status of the job instance held by this master.
     *
     * @param instanceStatus the new status to store
     */
    public void setInstanceStatus(InstanceStatus instanceStatus) {
        this.instanceStatus = instanceStatus;
    }

    /**
     * Reports whether {@link #killInstance(boolean, String)} has been called on this master.
     *
     * @return the value of the {@code killed} flag
     */
    public boolean isKilled() {
        return killed;
    }

    /**
     * Returns the job instance description supplied to the constructor.
     *
     * @return the job instance managed by this master
     */
    public JobInstanceInfo getJobInstanceInfo() {
        return jobInstanceInfo;
    }

    /**
     * Returns the set of workers subject to alive checks. The internal set itself is returned,
     * not a copy, so changes made by the caller are visible to this master.
     *
     * @return the live {@code aliveCheckWorkerSet}
     */
    public Set getAliveCheckWorkerSet() {
        return aliveCheckWorkerSet;
    }

    /**
     * Reports whether {@link #init()} has been called.
     *
     * @return the value of the {@code INITED} flag
     */
    public boolean isInited(){
        return INITED;
    }

    /**
     * Returns the current serial number without changing it.
     *
     * @return the current value of {@code serialNum}
     */
    public long getSerialNum() {
        return serialNum.get();
    }

    /**
     * Advances the serial number by one.
     *
     * @return the serial number after the increment
     */
    public long aquireSerialNum() {
        return serialNum.incrementAndGet();
    }

    /**
     * Validation hook called from the constructor before the status handler is created; does
     * nothing in this base class. An exception thrown here propagates out of the constructor.
     *
     * @throws Exception if an overriding implementation rejects the processor
     */
    protected void checkProcessor() throws Exception {}

    /**
     * Reports whether a worker has been marked offline since the worker list was last reset.
     * The flag is set by {@link #handleWorkerOffline(String)} and cleared by a successful
     * {@code restJobInstanceWorkerList} call.
     *
     * @return the value of the {@code existInvalidWorker} flag
     */
    public boolean existInvalidWorker() {
        return existInvalidWorker;
    }

    /**
     * Resets the list of workers available to the job instance.
     *
     * <p>A {@code null} or empty set is ignored apart from a warning. Otherwise the job
     * instance's worker list is replaced with a copy-on-write copy of {@code freeWorkers} and the
     * invalid-worker flag is cleared.
     *
     * @param freeWorkers the workers that are currently available
     */
    public void restJobInstanceWorkerList(Set freeWorkers){
        if (CollectionUtils.isEmpty(freeWorkers)) {
            LOGGER.warn("restJobInstanceWorkerList update appGroupId={} instanceId={} workers=0.", jobInstanceInfo.getAppGroupId(),
                    jobInstanceInfo.getJobInstanceId());
            return;
        }

        this.jobInstanceInfo.setAllWorkers(Lists.newCopyOnWriteArrayList(freeWorkers));
        this.existInvalidWorker = false;
        LOGGER.info("restJobInstanceWorkerList appGroupId={} instanceId={} workerSize={}.", jobInstanceInfo.getAppGroupId(),
                jobInstanceInfo.getJobInstanceId(), freeWorkers.size());
    }

    /**
     * Hook for handling the shutdown of the specified worker; does nothing in this base class.
     * Declared synchronized on this master.
     *
     * @param workerIdAddr id/address of the worker that shut down
     * @param withFailover unused here
     */
    public synchronized void handleWorkerShutdown(String workerIdAddr, boolean withFailover) {}

    /**
     * Marks the specified worker as offline: sets the invalid-worker flag, removes the worker
     * from the job instance's worker list and adds it to {@code invalidWorkerSet}.
     * Declared synchronized on this master.
     *
     * @param workerIdAddr id/address of the worker to mark offline
     */
    public synchronized void handleWorkerOffline(String workerIdAddr) {
        //TODO This only marks the node as offline; whether it is really offline is still decided by the periodic check (so there may be a delay)
        //1. Once a worker has finally gone offline, the master still needs to be notified to perform the shutdown handling
        //2. The worker itself must wrap up its remaining subtasks and report their status
        this.existInvalidWorker = true;
        this.getJobInstanceInfo().getAllWorkers().remove(workerIdAddr);
        this.invalidWorkerSet.add(workerIdAddr);
        LOGGER.info("handle worker={} offline.", workerIdAddr);
    }

    /**
     * Terminates this master as part of a worker shutdown and then blocks until
     * {@link #latchRelease()} is called.
     *
     * <p>For {@code IMMEDIATE} and {@code WAIT_RUNNING} the master runs {@link #doTerminate()}
     * and kills the instance; only {@code IMMEDIATE} asks for running work to be interrupted.
     * Any other mode goes straight to waiting.
     *
     * @param shutdownMode how the worker is shutting down
     * @throws InterruptedException if the calling thread is interrupted while waiting
     */
    public void terminate(ShutdownMode shutdownMode) throws InterruptedException {
        // IMMEDIATE: force stop.
        // WAIT_RUNNING: stop dispatching, let running tasks finish, drop queued ones.
        final boolean forceStop = ShutdownMode.IMMEDIATE.equals(shutdownMode);
        if (forceStop || ShutdownMode.WAIT_RUNNING.equals(shutdownMode)) {
            this.doTerminate();
            this.killInstance(forceStop, "Worker master shutdown.");
        }
        // Graceful master shutdown: wait until all subtasks have completed.
        latch.await();
    }

    /**
     * Hook run by {@link #terminate(ShutdownMode)} before the instance is killed; does nothing
     * in this base class.
     */
    protected void doTerminate(){}

    /**
     * Counts down the shutdown latch, which releases any thread blocked in
     * {@link #terminate(ShutdownMode)}.
     */
    public void latchRelease() {
        latch.countDown();
    }

    /**
     * Determines whether this master is finished.
     *
     * @return {@code true} if the instance status is a finished one, or if the job instance is
     *         no longer contained in {@code TaskMasterPool.INSTANCE}
     */
    protected boolean isFinished() {
        if (this.instanceStatus.isFinish()) {
            return true;
        }
        final boolean stillRegistered = TaskMasterPool.INSTANCE.contains(this.jobInstanceInfo.getJobInstanceId());
        return !stillRegistered;
    }
}