All downloads are free. Search and download functionality uses the official Maven repository.

com.alibaba.schedulerx.worker.master.handler.SecondJobUpdateInstanceStatusHandler Maven / Gradle / Ivy

There is a newer version: 1.12.2
Show newest version
package com.alibaba.schedulerx.worker.master.handler;

import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;

import com.alibaba.schedulerx.common.constants.CommonConstants;
import com.alibaba.schedulerx.common.domain.ExecuteMode;
import com.alibaba.schedulerx.common.domain.InstanceStatus;
import com.alibaba.schedulerx.common.domain.JobInstanceInfo;
import com.alibaba.schedulerx.common.domain.LimitedQueue;
import com.alibaba.schedulerx.common.domain.ProgressHistory;
import com.alibaba.schedulerx.common.domain.SecondProgressDetail;
import com.alibaba.schedulerx.common.domain.TaskProgressCounter;
import com.alibaba.schedulerx.common.domain.WorkerProgressCounter;
import com.alibaba.schedulerx.common.util.ConfigUtil;
import com.alibaba.schedulerx.common.util.IdUtil;
import com.alibaba.schedulerx.common.util.JsonUtil;
import com.alibaba.schedulerx.common.util.UnirestUtil;
import com.alibaba.schedulerx.protocol.Worker.WorkerReportJobInstanceProgressRequest;
import com.alibaba.schedulerx.protocol.Worker.WorkerReportJobInstanceStatusRequest;
import com.alibaba.schedulerx.worker.SchedulerxWorker;
import com.alibaba.schedulerx.worker.discovery.ServerDiscovery;
import com.alibaba.schedulerx.worker.discovery.ServerDiscoveryFactory;
import com.alibaba.schedulerx.worker.domain.WorkerConstants;
import com.alibaba.schedulerx.worker.ha.HealthTimeHolder;
import com.alibaba.schedulerx.worker.log.LogFactory;
import com.alibaba.schedulerx.worker.log.Logger;
import com.alibaba.schedulerx.worker.master.BroadcastTaskMaster;
import com.alibaba.schedulerx.worker.master.MapTaskMaster;
import com.alibaba.schedulerx.worker.master.StandaloneTaskMaster;
import com.alibaba.schedulerx.worker.master.TaskMaster;
import com.alibaba.schedulerx.worker.master.scheduler.TimePlanEntry;
import com.alibaba.schedulerx.worker.master.scheduler.TimeScheduler;
import com.alibaba.schedulerx.worker.processor.ProcessResult;
import com.alibaba.schedulerx.worker.util.ActorPathUtil;

import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.collections.MapUtils;
import org.apache.commons.lang.StringUtils;
import org.joda.time.DateTime;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;

/**
 * Instance-status handler for job instances that run as a sequence of cycles.
 *
 * <p>When a cycle finishes, {@link #handle} records it in the progress history and registers the
 * next cycle with {@link TimeScheduler}, delayed by the job's time expression. A background thread
 * started from {@link #init()} periodically reports progress to the server and kills the instance
 * when the server heartbeat times out or no workers are left.
 *
 * <p>{@code taskMaster}, {@code jobInstanceInfo} and {@code masterPool} are used here but declared
 * outside this file (presumably in {@link UpdateInstanceStatusHandler}).
 *
 * @author zhaibian
 * @version $Id: SecondJobUpdateInstanceStatusHandler.java, v 0.1 2019-02-28 19:42 zhaibian Exp $
 */
public class SecondJobUpdateInstanceStatusHandler extends UpdateInstanceStatusHandler {

    private static final Logger LOGGER = LogFactory.getLogger(SecondJobUpdateInstanceStatusHandler.class);

    /** Timeout handed to the server-heartbeat check; 30 seconds according to the original author's note. */
    private static final int MISS_SERVER_KILL_TIME = 30;

    /** Formatter for the "today begin time" stamp; Joda formatters are immutable, so one instance is shared. */
    private static final DateTimeFormatter DATE_TIME_FORMATTER =
            DateTimeFormat.forPattern(CommonConstants.DATE_TIME_PATTERN);

    /** Aggregated progress (running progress, daily counters, recent history) that is serialized for reports. */
    SecondProgressDetail secondProgressDetail;

    /** History of the most recently finished cycles; constructed with a limit of 10. */
    LimitedQueue<ProgressHistory> recentProgressHistory = new LimitedQueue<>(10);

    /** Start time (epoch millis) of the current cycle, or the planned start of the next one once scheduled. */
    long cycleStartTime = System.currentTimeMillis();

    /**
     * Number of cycles submitted since the last progress report. Incremented by
     * {@link #triggerNewCycle()} and drained by the report thread, hence atomic.
     */
    private final AtomicInteger triggerTimes = new AtomicInteger();

    /**
     * Scheduling "cu" counter: +1 per standalone cycle, +worker count otherwise. It is reset on each
     * report but not read anywhere in this class.
     */
    private final AtomicInteger triggerCus = new AtomicInteger();

    /** When true the time expression is interpreted as milliseconds, otherwise as seconds. */
    private final boolean enableCycleIntervalMs = ConfigUtil.getWorkerConfig().getBoolean(
            WorkerConstants.SECOND_DELAY_INTERVAL_MS_ENABLE,
            WorkerConstants.SECOND_DELAY_INTERVAL_MS_ENABLE_DEFAULT);

    /**
     * Creates the handler and immediately starts the progress-report thread via {@link #init()}.
     *
     * @param taskMaster      master driving this job instance
     * @param jobInstanceInfo description of the job instance being handled
     */
    SecondJobUpdateInstanceStatusHandler(TaskMaster taskMaster, JobInstanceInfo jobInstanceInfo) {
        super(taskMaster, jobInstanceInfo);
        secondProgressDetail = new SecondProgressDetail();
        init();
    }

    /**
     * Processes a status update for one cycle.
     *
     * <ul>
     *   <li>If the master never finished initializing, the status is final and the master is not yet
     *       killed, the instance kills itself.</li>
     *   <li>If the master is killed and {@code result} carries a kill/shutdown message, the master is
     *       stopped, removed from the pool and (unless the kill came from the server) the status is
     *       reported.</li>
     *   <li>Otherwise, a finished status triggers the next cycle.</li>
     * </ul>
     *
     * @param serialNum      serial number of the cycle the status belongs to
     * @param instanceStatus status being reported
     * @param result         free-text result; may be {@code null}
     * @throws Exception if finishing the current cycle or scheduling the next one fails
     */
    @Override
    public void handle(long serialNum, InstanceStatus instanceStatus, String result) throws Exception {
        String cycleId = IdUtil.getUniqueId(jobInstanceInfo.getJobId(), jobInstanceInfo.getJobInstanceId(),
                taskMaster.getSerialNum());
        LOGGER.info("cycleId: {} instanceStatus={} result={} cycle update status.", cycleId, instanceStatus, result);

        // Init failed, the status is final and the master has not been killed yet: kill self.
        if (!taskMaster.isInited() && instanceStatus.isFinish() && !taskMaster.isKilled()) {
            taskMaster.killInstance(true, "killed, because of worker init failed.");
            LOGGER.warn("Init failed need to kill self, cycleId={}", cycleId);
            return;
        }

        // A killed instance has to be reported to the server.
        // Logically checking taskMaster.isKilled() alone should be enough, without inspecting the
        // result text, but this is how it has always been written and it is kept for safety.
        boolean killMessage = StringUtils.contains(result, "killed")
                || StringUtils.contains(result, "Worker master shutdown");
        if (taskMaster.isKilled() && killMessage) {
            taskMaster.setInstanceStatus(InstanceStatus.FAILED);
            taskMaster.stop();
            masterPool.remove(jobInstanceInfo.getJobInstanceId());

            // A forced stop issued by the server gets no status feedback.
            if (!"killed from server".equals(result)) {
                reportKilledStatus(cycleId, instanceStatus, result);
            }
            // The instance is terminated; nothing more to do.
            return;
        }

        // The cycle finished normally: wrap it up and schedule the next one.
        if (instanceStatus.isFinish()) {
            triggerNextCycle(cycleId, serialNum, instanceStatus);
        }
    }

    /** Sends the final status (with result and progress when available) of a killed instance to the server. */
    private void reportKilledStatus(String cycleId, InstanceStatus instanceStatus, String result) {
        WorkerReportJobInstanceStatusRequest.Builder builder = WorkerReportJobInstanceStatusRequest
                .newBuilder()
                .setJobId(jobInstanceInfo.getJobId())
                .setJobInstanceId(jobInstanceInfo.getJobInstanceId())
                .setStatus(instanceStatus.getValue())
                .setGroupId(jobInstanceInfo.getGroupId());

        if (result != null) {
            builder.setResult(result);
        }

        String progress = getJobInstanceProgress();
        if (progress != null) {
            builder.setProgress(progress);
        }
        SchedulerxWorker.AtLeastDeliveryRoutingActor.tell(builder.build(), null);
        LOGGER.info("report cycleId={}, status={} to AtLeastDeliveryRoutingActor", cycleId,
                instanceStatus);
    }

    /**
     * Finishes the current cycle and, unless the master has been killed, schedules the next one.
     *
     * <p>Updates for a serial number other than the master's current one are ignored.
     *
     * @param cycleId        identifier of the cycle, used for logging only
     * @param serialNum      serial number the finish event belongs to
     * @param instanceStatus final status of the cycle
     * @throws Exception if post-processing or the time expression parsing fails
     */
    private synchronized void triggerNextCycle(String cycleId, long serialNum, InstanceStatus instanceStatus)
            throws Exception {
        if (serialNum != taskMaster.getSerialNum()) {
            LOGGER.info("triggerNextCycle={} ignore, current serialNum={}, but trigger serialNum={}, status={}, killed={}.",
                    cycleId, taskMaster.getSerialNum(), serialNum, instanceStatus, taskMaster.isKilled());
            return;
        }

        ProcessResult postResult = taskMaster.postFinish(jobInstanceInfo.getJobInstanceId());
        if (postResult != null) {
            // The format previously had one placeholder too few, so the result was never printed.
            LOGGER.info("cycleId: {} cycle post status={}, result={}.", cycleId, postResult.getStatus(),
                    postResult.getResult());
        }

        LOGGER.info("cycleId: {} cycle end.", cycleId);

        setHistory(taskMaster.getSerialNum(), cycleStartTime, instanceStatus);

        if (taskMaster.isKilled()) {
            taskMaster.aquireSerialNum();
            return;
        }

        // TODO: the resources of this iteration are cleared every time; this could be optimized later.
        taskMaster.clear();

        // The current node has gone offline.
        if (!SchedulerxWorker.INITED) {
            LOGGER.info("Current worker is not running. To shutdown this master JobInstanceId={}",
                    jobInstanceInfo.getJobInstanceId());
            taskMaster.killInstance(true, "Worker master shutdown.");
            return;
        }

        // Compute the next trigger time and hand it to the time scheduler.
        long interval = Long.parseLong(jobInstanceInfo.getTimeExpression());
        long delayTime = enableCycleIntervalMs ? interval : interval * 1000;
        cycleStartTime = System.currentTimeMillis() + delayTime;
        TimeScheduler.INSTANCE.add(new TimePlanEntry(jobInstanceInfo.getJobInstanceId(), cycleStartTime, this));
    }

    /**
     * Fetches the latest list of usable workers from the server.
     *
     * <p>The element type of the returned set is not visible in this file, so it is left raw.
     *
     * @param appGroupId application group id
     * @param jobId      job id
     * @return the worker set, or {@code null} when the request fails (the failure is logged)
     * @throws Exception declared for callers; failures of the HTTP call itself are caught here
     */
    private Set getAllWorkers(Long appGroupId, Long jobId) throws Exception {
        String url = "http://{0}/app/getAllUsefulWorkerList.json?appGroupId={1}&jobId={2}";
        try {
            String domain = ConfigUtil.getWorkerConfig().getString(WorkerConstants.WORKER_DOMAIN_NAME);
            return UnirestUtil.getSetData(url, domain, appGroupId, jobId);
        } catch (Exception ex) {
            LOGGER.error("getAllWorkers failed.", ex);
        }
        return null;
    }

    /**
     * Initializes the time scheduler and starts the progress-report thread.
     *
     * <p>The thread wakes up every second until the master is killed. Once its tick counter exceeds
     * 10 it sends a progress report, and on every tick it checks whether the instance must kill
     * itself. Any error inside one tick is logged and the loop continues.
     */
    void init() {
        TimeScheduler.INSTANCE.init();

        final String jobIdAndInstanceId = IdUtil.getUniqueIdWithoutTask(jobInstanceInfo.getJobId(),
                jobInstanceInfo.getJobInstanceId());
        // Job instance progress report thread.
        new Thread(new Runnable() {
            @Override
            public void run() {
                int intervalTimes = 0;
                while (!taskMaster.isKilled()) {
                    try {
                        Thread.sleep(1000);
                        if (intervalTimes++ > 10) {
                            reportProgress();
                            // Only reset after a successful report so a failed one is retried next tick.
                            intervalTimes = 0;
                        }

                        need2KillSelf();
                    } catch (Throwable e) {
                        LOGGER.error("report status error, jobIdAndInstanceId={}.", jobIdAndInstanceId, e);
                    }
                }
            }
        }, "Schedulerx-SecondTaskMaster-report-progress-thread-" + jobIdAndInstanceId).start();
    }

    /** Sends the current progress and trigger count to the server, then clears the reported counters. */
    private void reportProgress() {
        String progress = getJobInstanceProgress();
        int reportedTimes = triggerTimes.get();
        WorkerReportJobInstanceProgressRequest request = WorkerReportJobInstanceProgressRequest
                .newBuilder().setJobId(jobInstanceInfo.getJobId())
                .setJobInstanceId(jobInstanceInfo.getJobInstanceId())
                .setProgress(progress)
                .setTriggerTimes(reportedTimes)
                .build();
        ServerDiscovery serverDiscovery = ServerDiscoveryFactory.getDiscovery(jobInstanceInfo.getGroupId());
        if (serverDiscovery != null) {
            serverDiscovery.getMapMasterRouter().tell(request, null);
        } else {
            LOGGER.error("failed to get serverDiscovery, groupId={}", jobInstanceInfo.getGroupId());
        }
        // Subtract exactly what was reported so cycles triggered in the meantime are not lost.
        triggerTimes.addAndGet(-reportedTimes);
        triggerCus.set(0);
    }

    /**
     * Kills the instance when either of the following holds (only checked once the master is inited):
     * <ol>
     *   <li>the server heartbeat has timed out (contact lost for more than 30 seconds per the
     *       original note);</li>
     *   <li>both the alive-check worker set and the instance's worker list are empty.</li>
     * </ol>
     */
    private void need2KillSelf() {
        if (!taskMaster.isInited()) {
            return;
        }

        final String jobIdAndInstanceId = IdUtil.getUniqueIdWithoutTask(jobInstanceInfo.getJobId(),
                jobInstanceInfo.getJobInstanceId());

        if (HealthTimeHolder.INSTANCE.isServerHeartbeatHealthTimeout(MISS_SERVER_KILL_TIME)) {
            taskMaster.killInstance(true, "killed, because of worker missed active server.");
            LOGGER.warn("Missed server timeout={}ms, kill jobIdAndInstanceId={}.",
                    HealthTimeHolder.INSTANCE.getServerHeartbeatMsInterval(), jobIdAndInstanceId);
            return;
        }

        boolean noAliveWorkers = CollectionUtils.isEmpty(taskMaster.getAliveCheckWorkerSet());
        if (noAliveWorkers && CollectionUtils.isEmpty(taskMaster.getJobInstanceInfo().getAllWorkers())) {
            LOGGER.warn("Missed useful worker list, kill jobIdAndInstanceId={}.",
                    jobIdAndInstanceId);
            taskMaster.killInstance(true, "killed, because of missed useful worker list.");
        }
    }

    /**
     * Serializes the current progress detail to JSON.
     *
     * <p>The running progress, running start time and recent history are refreshed on the shared
     * {@link #secondProgressDetail} before serialization; the running progress is cleared again
     * afterwards.
     *
     * @return JSON representation of the progress detail
     */
    protected String getJobInstanceProgress() {
        secondProgressDetail.setRunningProgress(taskMaster.getJobInstanceProgress());
        secondProgressDetail.setRunningStartTime(cycleStartTime);
        secondProgressDetail.setRecentProgressHistory(Lists.newArrayList(recentProgressHistory));
        String progressJson = JsonUtil.toJson(secondProgressDetail);
        secondProgressDetail.setRunningProgress(null);
        return progressJson;
    }

    /**
     * Updates the daily counters for a finished cycle and appends it to the recent history.
     *
     * <p>No history entry is written when no per-task/per-worker progress is available.
     *
     * @param serialNum     serial number of the finished cycle
     * @param loopStartTime start time of the cycle in epoch millis
     * @param status        final status of the cycle
     */
    private void setHistory(long serialNum, long loopStartTime, InstanceStatus status) {
        boolean succeeded = status == InstanceStatus.SUCCESS;
        if (succeeded) {
            secondProgressDetail.getTodayProgressCounter().incrementSuccess();
        } else {
            secondProgressDetail.getTodayProgressCounter().incrementFailed();
        }

        if (!taskMaster.isKilled()) {
            secondProgressDetail.getTodayProgressCounter().incrementRunning();
            secondProgressDetail.getTodayProgressCounter().incrementTotal();
        }

        // Roll the counters over when the day of month differs from the one "today" started on.
        int beginDay = DateTime.parse(secondProgressDetail.getTodayBeginTime(), DATE_TIME_FORMATTER)
                .dayOfMonth().get();
        if (DateTime.now().dayOfMonth().get() != beginDay) {
            secondProgressDetail.setYesterdayProgressCounter(secondProgressDetail.getTodayProgressCounter());
            secondProgressDetail.setTodayBeginTime(DateTime.now().toString(CommonConstants.DATE_TIME_PATTERN));
            secondProgressDetail.setTodayProgressCounter(
                    new TaskProgressCounter(secondProgressDetail.getTodayBeginTime()));
        }

        Map<String, TaskProgressCounter> taskProgressMap = collectTaskProgress(succeeded);
        if (MapUtils.isEmpty(taskProgressMap)) {
            return;
        }

        ProgressHistory history = new ProgressHistory();
        history.setSerialNum(serialNum);
        history.setStartTime(loopStartTime);
        history.setEndTime(System.currentTimeMillis());
        history.setCostTime(history.getEndTime() - history.getStartTime());
        history.setTaskProgressMap(taskProgressMap);
        history.setSuccess(succeeded);
        recentProgressHistory.offer(history);
    }

    /**
     * Builds the progress map of the finished cycle according to the concrete master type.
     *
     * @param succeeded whether the cycle ended with {@code SUCCESS}
     * @return the progress map, or {@code null} when the master type is unknown or has no progress
     */
    private Map<String, TaskProgressCounter> collectTaskProgress(boolean succeeded) {
        if (taskMaster instanceof MapTaskMaster) {
            // Copy so that later changes in the master do not alter the stored history.
            return Maps.newHashMap(((MapTaskMaster) taskMaster).getTaskProgressMap());
        }

        if (taskMaster instanceof BroadcastTaskMaster) {
            Map<String, WorkerProgressCounter> workerProgressCounterMap =
                    ((BroadcastTaskMaster) taskMaster).getWorkerProgressMap();
            if (MapUtils.isEmpty(workerProgressCounterMap)) {
                return null;
            }
            // Convert each worker counter into a task counter named after the worker address.
            Map<String, TaskProgressCounter> perWorker = Maps.newHashMap();
            for (WorkerProgressCounter worker : workerProgressCounterMap.values()) {
                TaskProgressCounter counter = new TaskProgressCounter(worker.getWorkerAddr());
                counter.incrementSuccess(worker.getSuccess());
                counter.incrementFailed(worker.getFailed());
                counter.incrementTotal(worker.getTotal());
                perWorker.put(counter.getName(), counter);
            }
            return perWorker;
        }

        if (taskMaster instanceof StandaloneTaskMaster) {
            // A standalone cycle runs on one worker: a single counter keyed by its ip:port.
            String ipAndPort = ActorPathUtil.getIpAndPortFromAkkaPath(
                    ((StandaloneTaskMaster) taskMaster).getCurrentSelection().toSerializationFormat());
            TaskProgressCounter counter = new TaskProgressCounter(ipAndPort);
            Map<String, TaskProgressCounter> single = Maps.newHashMap();
            single.put(ipAndPort, counter);
            counter.incrementTotal();
            if (succeeded) {
                counter.incrementSuccess();
            } else {
                counter.incrementFailed();
            }
            return single;
        }

        return null;
    }

    /**
     * Starts a new cycle: acquires a new serial number, refreshes the worker list if the master
     * reports invalid workers, and submits the instance. Any failure kills the instance.
     */
    public void triggerNewCycle() {
        String cycleId = IdUtil.getUniqueId(jobInstanceInfo.getJobId(),
                jobInstanceInfo.getJobInstanceId(), taskMaster.aquireSerialNum());
        LOGGER.info("cycleId: {} cycle begin.", cycleId);
        cycleStartTime = System.currentTimeMillis();

        // NOTE(review): the builder result is discarded, so the schedule time is never applied to
        // jobInstanceInfo. Presumably the built object was meant to replace it - confirm before changing.
        JobInstanceInfo.newBuilder(jobInstanceInfo).setScheduleTime(DateTime.now());
        try {
            // If any worker became invalid, fetch the latest worker list again.
            if (taskMaster.existInvalidWorker()) {
                // NOTE(review): getAllWorkers returns null when the request fails; verify that
                // restJobInstanceWorkerList tolerates a null set.
                Set freeWorkers = getAllWorkers(jobInstanceInfo.getAppGroupId(), jobInstanceInfo.getJobId());
                taskMaster.restJobInstanceWorkerList(freeWorkers);
            }
            taskMaster.submitInstance(jobInstanceInfo);

            triggerTimes.incrementAndGet();
            // Standalone job: scheduling cu + 1.
            // Distributed job: scheduling cu + number of workers.
            if (jobInstanceInfo.getExecuteMode().equals(ExecuteMode.STANDALONE.getKey())) {
                triggerCus.incrementAndGet();
            } else {
                triggerCus.addAndGet(jobInstanceInfo.getAllWorkers().size());
            }
        } catch (Exception e) {
            taskMaster.killInstance(true, "killed, because of cycle submit failed.");
            LOGGER.error("cycleId: {} cycle submit failed, need to kill.", cycleId, e);
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy