Please wait. This can take a few minutes ...
Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance.
Project price only 1 $
You can buy this project and download/modify it as often as you want.
com.alibaba.schedulerx.worker.master.handler.SecondJobUpdateInstanceStatusHandler Maven / Gradle / Ivy
package com.alibaba.schedulerx.worker.master.handler;
import java.util.Map;
import java.util.Set;
import com.alibaba.schedulerx.common.constants.CommonConstants;
import com.alibaba.schedulerx.common.domain.ExecuteMode;
import com.alibaba.schedulerx.common.domain.InstanceStatus;
import com.alibaba.schedulerx.common.domain.JobInstanceInfo;
import com.alibaba.schedulerx.common.domain.LimitedQueue;
import com.alibaba.schedulerx.common.domain.ProgressHistory;
import com.alibaba.schedulerx.common.domain.SecondProgressDetail;
import com.alibaba.schedulerx.common.domain.TaskProgressCounter;
import com.alibaba.schedulerx.common.domain.WorkerProgressCounter;
import com.alibaba.schedulerx.common.util.ConfigUtil;
import com.alibaba.schedulerx.common.util.IdUtil;
import com.alibaba.schedulerx.common.util.JsonUtil;
import com.alibaba.schedulerx.common.util.UnirestUtil;
import com.alibaba.schedulerx.protocol.Worker.WorkerReportJobInstanceProgressRequest;
import com.alibaba.schedulerx.protocol.Worker.WorkerReportJobInstanceStatusRequest;
import com.alibaba.schedulerx.worker.SchedulerxWorker;
import com.alibaba.schedulerx.worker.discovery.ServerDiscovery;
import com.alibaba.schedulerx.worker.discovery.ServerDiscoveryFactory;
import com.alibaba.schedulerx.worker.domain.WorkerConstants;
import com.alibaba.schedulerx.worker.ha.HealthTimeHolder;
import com.alibaba.schedulerx.worker.log.LogFactory;
import com.alibaba.schedulerx.worker.log.Logger;
import com.alibaba.schedulerx.worker.master.BroadcastTaskMaster;
import com.alibaba.schedulerx.worker.master.MapTaskMaster;
import com.alibaba.schedulerx.worker.master.StandaloneTaskMaster;
import com.alibaba.schedulerx.worker.master.TaskMaster;
import com.alibaba.schedulerx.worker.master.scheduler.TimePlanEntry;
import com.alibaba.schedulerx.worker.master.scheduler.TimeScheduler;
import com.alibaba.schedulerx.worker.processor.ProcessResult;
import com.alibaba.schedulerx.worker.util.ActorPathUtil;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.collections.MapUtils;
import org.apache.commons.lang.StringUtils;
import org.joda.time.DateTime;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
/**
*
* @author zhaibian
* @version $Id: SecondJobUpdateInstanceStatusHandler.java, v 0.1 2019年02月28日 19:42 zhaibian Exp $
*/
public class SecondJobUpdateInstanceStatusHandler extends UpdateInstanceStatusHandler{
// private LogCollector logCollector = LogCollectorFactory.get();
private static final Logger LOGGER = LogFactory.getLogger(SecondJobUpdateInstanceStatusHandler.class);
/** Timeout passed to the server-heartbeat health check before this master kills itself (30 seconds per need2KillSelf's contract). */
private static final int MISS_SERVER_KILL_TIME = 30;
/** Progress detail that is serialized to JSON by getJobInstanceProgress(); assigned in the constructor. */
SecondProgressDetail secondProgressDetail;
/** History of recently finished cycles; the queue is constructed with a limit of 10. */
LimitedQueue<ProgressHistory> recentProgressHistory = new LimitedQueue<>(10);
/** Start time (epoch millis) of the current cycle, or the planned start of the next one once it has been scheduled. */
long cycleStartTime = System.currentTimeMillis();
/**
 * Number of cycles submitted since the last progress report. Incremented by triggerNewCycle() and reset by the
 * reporting thread; volatile gives visibility only, the increment itself is not atomic.
 */
private volatile int triggerTimes = 0;
/** Trigger "cu" counter; within this class it is only incremented and reset, never read. */
private volatile int triggerCus = 0;
/** When true the job's time expression is interpreted as milliseconds, otherwise as seconds. */
private final boolean enableCycleIntervalMs = ConfigUtil.getWorkerConfig().getBoolean(
    WorkerConstants.SECOND_DELAY_INTERVAL_MS_ENABLE,
    WorkerConstants.SECOND_DELAY_INTERVAL_MS_ENABLE_DEFAULT);
/**
 * Creates the handler for a second-level job instance and starts its background progress reporter.
 *
 * @param taskMaster      master driving the job instance
 * @param jobInstanceInfo description of the job instance being run
 */
SecondJobUpdateInstanceStatusHandler(TaskMaster taskMaster, JobInstanceInfo jobInstanceInfo) {
    super(taskMaster, jobInstanceInfo);
    // The reporter thread started by init() serializes this object, so create it first.
    this.secondProgressDetail = new SecondProgressDetail();
    this.init();
}
/**
 * Handles a status update of the current cycle.
 * <ul>
 *   <li>If the master never finished initializing, the status is final and the master is not yet killed,
 *       the instance kills itself.</li>
 *   <li>If the master is killed and the result text says so, the master is stopped and removed from the
 *       pool, and the status is reported to the server unless the kill came from the server.</li>
 *   <li>Otherwise a finished status triggers the next cycle.</li>
 * </ul>
 *
 * @param serialNum      serial number of the cycle this update belongs to
 * @param instanceStatus status reported for the cycle
 * @param result         result text of the cycle, may be null
 * @throws Exception if triggering the next cycle fails
 */
@Override
public void handle(long serialNum, InstanceStatus instanceStatus, String result) throws Exception {
    final String cycleId = IdUtil.getUniqueId(jobInstanceInfo.getJobId(), jobInstanceInfo.getJobInstanceId(),
        taskMaster.getSerialNum());
    LOGGER.info("cycleId: {} instanceStatus={} result={} cycle update status.", cycleId, instanceStatus, result);

    // Init failed, the status is final and the master has not been killed yet: the second job kills itself.
    final boolean initFailed = !taskMaster.isInited() && instanceStatus.isFinish() && !taskMaster.isKilled();
    if (initFailed) {
        taskMaster.killInstance(true,"killed, because of worker init failed.");
        LOGGER.warn("Init failed need to kill self, cycleId={}", cycleId);
        return;
    }

    // Logically, checking isKilled() alone should be enough; the extra inspection of the result text is
    // historical and has been kept on purpose.
    final boolean killedWithReason = taskMaster.isKilled()
        && (StringUtils.contains(result, "killed") || StringUtils.contains(result, "Worker master shutdown"));
    if (!killedWithReason) {
        // Normal path: a finished cycle leads to the next one.
        if (instanceStatus.isFinish()) {
            triggerNextCycle(cycleId, serialNum, instanceStatus);
        }
        return;
    }

    // The instance was killed: stop the master and drop it from the pool; nothing else follows.
    taskMaster.setInstanceStatus(InstanceStatus.FAILED);
    taskMaster.stop();
    masterPool.remove(jobInstanceInfo.getJobInstanceId());
    if ("killed from server".equals(result)) {
        // A forced stop issued by the server gets no status feedback.
        return;
    }

    final WorkerReportJobInstanceStatusRequest.Builder statusReport = WorkerReportJobInstanceStatusRequest
        .newBuilder()
        .setJobId(jobInstanceInfo.getJobId())
        .setJobInstanceId(jobInstanceInfo.getJobInstanceId())
        .setStatus(instanceStatus.getValue())
        .setGroupId(jobInstanceInfo.getGroupId());
    if (result != null) {
        statusReport.setResult(result);
    }
    final String progressJson = getJobInstanceProgress();
    if (progressJson != null) {
        statusReport.setProgress(progressJson);
    }
    SchedulerxWorker.AtLeastDeliveryRoutingActor.tell(statusReport.build(), null);
    LOGGER.info("report cycleId={}, status={} to AtLeastDeliveryRoutingActor", cycleId,
        instanceStatus);
}
/**
 * Finishes the current cycle and, unless the master has been killed, schedules the next one.
 * <p>
 * Updates whose serial number differs from the master's current one are ignored. Otherwise the master's
 * post-finish hook runs, the cycle is recorded in the history, and the next cycle is added to the
 * {@link TimeScheduler} after the delay given by the job's time expression (milliseconds when
 * {@code enableCycleIntervalMs} is set, seconds otherwise).
 *
 * @param cycleId        identifier of the finished cycle, used for logging
 * @param serialNum      serial number the status update belongs to
 * @param instanceStatus final status of the finished cycle
 * @throws Exception if post-processing or parsing the time expression fails
 */
private synchronized void triggerNextCycle(String cycleId, long serialNum, InstanceStatus instanceStatus) throws Exception {
    if (serialNum != taskMaster.getSerialNum()) {
        LOGGER.info("triggerNextCycle={} ignore, current serialNum={}, but trigger serialNum={}, status={}, killed={}.",cycleId,
            taskMaster.getSerialNum(), serialNum, instanceStatus, taskMaster.isKilled());
        return;
    }
    ProcessResult postResult = taskMaster.postFinish(jobInstanceInfo.getJobInstanceId());
    if (postResult != null) {
        // One placeholder per argument: previously the status was printed as "result" and the result was dropped.
        LOGGER.info("cycleId: {} cycle post status={}, result={}.", cycleId, postResult.getStatus(),
            postResult.getResult());
    }
    // logCollector.collect(cycleId, ClientLoggerMessage.appendMessage(ClientLoggerMessage.INSTANCE_FINISH,
    // instanceStatus.getDescription()));
    LOGGER.info("cycleId: {} cycle end.", cycleId);
    setHistory(taskMaster.getSerialNum(), cycleStartTime, instanceStatus);

    if (taskMaster.isKilled()) {
        taskMaster.aquireSerialNum();
        return;
    }

    // TODO: resources of this iteration are cleared every time; this could be optimized away later.
    taskMaster.clear();
    // The current worker node has gone offline.
    if (!SchedulerxWorker.INITED) {
        LOGGER.info("Current worker is not running. To shutdown this master JobInstanceId={}", jobInstanceInfo.getJobInstanceId());
        taskMaster.killInstance(true,"Worker master shutdown.");
        return;
    }

    // Compute the next trigger time and hand it to the time scheduler.
    final long interval = Long.parseLong(jobInstanceInfo.getTimeExpression());
    final long delayTime = enableCycleIntervalMs ? interval : interval * 1000;
    cycleStartTime = System.currentTimeMillis() + delayTime;
    TimePlanEntry entry = new TimePlanEntry(jobInstanceInfo.getJobInstanceId(), cycleStartTime, this);
    TimeScheduler.INSTANCE.add(entry);
}
/**
 * Fetches the latest list of usable workers for a job from the configured worker domain.
 *
 * @param appGroupId application group id sent as the {@code appGroupId} query parameter
 * @param jobId      job id sent as the {@code jobId} query parameter
 * @return the set returned by the HTTP call, or {@code null} when the call fails (the failure is logged)
 * @throws Exception declared for callers; failures of the HTTP call itself are caught and logged here
 */
private Set getAllWorkers(Long appGroupId, Long jobId) throws Exception {
    final String urlTemplate = "http://{0}/app/getAllUsefulWorkerList.json?appGroupId={1}&jobId={2}";
    Set workers = null;
    try {
        final String domain = ConfigUtil.getWorkerConfig().getString(WorkerConstants.WORKER_DOMAIN_NAME);
        workers = UnirestUtil.getSetData(urlTemplate, domain, appGroupId, jobId);
    } catch (Exception ex) {
        LOGGER.error("getAllWorkers failed.", ex);
    }
    return workers;
}
/**
 * Initializes the time scheduler and starts the background thread that, until the master is killed,
 * wakes up every second, periodically reports the instance progress to the server and checks whether
 * the master has to kill itself.
 */
void init(){
    TimeScheduler.INSTANCE.init();
    final String jobIdAndInstanceId = IdUtil.getUniqueIdWithoutTask(jobInstanceInfo.getJobId(), jobInstanceInfo.getJobInstanceId());

    // Job instance progress report loop.
    final Runnable progressReporter = new Runnable() {
        @Override
        public void run() {
            int ticks = 0;
            while (!taskMaster.isKilled()) {
                try {
                    Thread.sleep(1000);
                    // Compare first, then count the tick (same as a post-increment in the condition).
                    final boolean reportDue = ticks > 10;
                    ticks++;
                    if (reportDue) {
                        WorkerReportJobInstanceProgressRequest progressRequest = WorkerReportJobInstanceProgressRequest
                            .newBuilder().setJobId(jobInstanceInfo.getJobId())
                            .setJobInstanceId(jobInstanceInfo.getJobInstanceId())
                            .setProgress(getJobInstanceProgress())
                            .setTriggerTimes(triggerTimes)
                            .build();
                        ServerDiscovery discovery = ServerDiscoveryFactory.getDiscovery(jobInstanceInfo.getGroupId());
                        if (discovery == null) {
                            LOGGER.error("failed to get serverDiscovery, groupId={}", jobInstanceInfo.getGroupId());
                        } else {
                            discovery.getMapMasterRouter().tell(progressRequest, null);
                        }
                        // Start a new reporting window.
                        ticks = 0;
                        triggerTimes = 0;
                        triggerCus = 0;
                    }
                    need2KillSelf();
                } catch (Throwable e) {
                    LOGGER.error("report status error, jobIdAndInstanceId={}.", jobIdAndInstanceId, e);
                }
            }
        }
    };
    final Thread reporterThread = new Thread(progressReporter,
        "Schedulerx-SecondTaskMaster-report-progress-thread-" + jobIdAndInstanceId);
    reporterThread.start();
}
/**
 * Kills this instance when, after the master has been initialized, either of the following holds:
 * <ol>
 *   <li>contact with the server has been lost for more than 30 seconds, or</li>
 *   <li>there is no usable worker list (both the alive-check worker set and the instance's worker
 *       list are empty).</li>
 * </ol>
 */
private void need2KillSelf() {
    if (taskMaster.isInited()) {
        final String instanceKey = IdUtil.getUniqueIdWithoutTask(jobInstanceInfo.getJobId(), jobInstanceInfo.getJobInstanceId());
        if (HealthTimeHolder.INSTANCE.isServerHeartbeatHealthTimeout(MISS_SERVER_KILL_TIME)) {
            taskMaster.killInstance(true,"killed, because of worker missed active server.");
            LOGGER.warn("Missed server timeout={}ms, kill jobIdAndInstanceId={}.", HealthTimeHolder.INSTANCE.getServerHeartbeatMsInterval(), instanceKey);
        } else if (CollectionUtils.isEmpty(taskMaster.getAliveCheckWorkerSet())
            && CollectionUtils.isEmpty(taskMaster.getJobInstanceInfo().getAllWorkers())) {
            LOGGER.warn("Missed useful worker list, kill jobIdAndInstanceId={}.",
                instanceKey);
            taskMaster.killInstance(true,"killed, because of missed useful worker list.");
        }
    }
}
/**
 * Serializes the current progress of the instance to JSON.
 * <p>
 * The running progress of the master, the start time of the current cycle and a copy of the recent
 * cycle history are written into the shared progress detail before serialization; the running
 * progress is cleared again afterwards.
 *
 * @return the progress detail as a JSON string
 */
protected String getJobInstanceProgress() {
    final SecondProgressDetail detail = secondProgressDetail;
    detail.setRunningProgress(taskMaster.getJobInstanceProgress());
    detail.setRunningStartTime(cycleStartTime);
    detail.setRecentProgressHistory(Lists.newArrayList(recentProgressHistory));

    final String serialized = JsonUtil.toJson(detail);
    // The running progress is only attached for the duration of the serialization.
    detail.setRunningProgress(null);
    return serialized;
}
/**
 * Records the outcome of a finished cycle.
 * <p>
 * Updates today's success/failed counters (and running/total when the master is not killed), rolls
 * today's counter over to "yesterday" when the calendar date has changed since it was started, and
 * appends a {@link ProgressHistory} entry with per-task or per-worker counters to the recent history.
 * No history entry is added when no progress counters are available for the cycle.
 *
 * @param serialNum     serial number of the finished cycle
 * @param loopStartTime start time of the cycle in epoch milliseconds
 * @param status        final status of the cycle
 */
private void setHistory(long serialNum, long loopStartTime, InstanceStatus status){
    final boolean success = (status == InstanceStatus.SUCCESS);
    if (success) {
        secondProgressDetail.getTodayProgressCounter().incrementSuccess();
    } else {
        secondProgressDetail.getTodayProgressCounter().incrementFailed();
    }
    if (!taskMaster.isKilled()) {
        secondProgressDetail.getTodayProgressCounter().incrementRunning();
        secondProgressDetail.getTodayProgressCounter().incrementTotal();
    }

    // Reset today's progress counter once the calendar date has changed. Whole dates are compared
    // (not just the day-of-month) so a change to the same day number of another month is detected,
    // and a single "now" snapshot keeps the check and the new begin time consistent.
    DateTimeFormatter formatter = DateTimeFormat.forPattern(CommonConstants.DATE_TIME_PATTERN);
    final DateTime now = DateTime.now();
    final DateTime todayBegin = DateTime.parse(secondProgressDetail.getTodayBeginTime(), formatter);
    if (!now.toLocalDate().equals(todayBegin.toLocalDate())) {
        secondProgressDetail.setYesterdayProgressCounter(secondProgressDetail.getTodayProgressCounter());
        secondProgressDetail.setTodayBeginTime(now.toString(CommonConstants.DATE_TIME_PATTERN));
        secondProgressDetail.setTodayProgressCounter(new TaskProgressCounter(secondProgressDetail.getTodayBeginTime()));
    }

    Map<String, TaskProgressCounter> taskProgressMap = null;
    if (taskMaster instanceof MapTaskMaster) {
        // Copy so later updates by the master do not change the recorded history.
        taskProgressMap = Maps.newHashMap(((MapTaskMaster) taskMaster).getTaskProgressMap());
    } else if (taskMaster instanceof BroadcastTaskMaster) {
        // Key type of the worker map is not needed here, only its values.
        Map<?, WorkerProgressCounter> workerProgressCounterMap = ((BroadcastTaskMaster) taskMaster).getWorkerProgressMap();
        if (MapUtils.isEmpty(workerProgressCounterMap)) {
            return;
        }
        taskProgressMap = Maps.newHashMap();
        // Convert each worker counter into a task counter named after the worker address.
        for (WorkerProgressCounter worker : workerProgressCounterMap.values()) {
            TaskProgressCounter counter = new TaskProgressCounter(worker.getWorkerAddr());
            counter.incrementSuccess(worker.getSuccess());
            counter.incrementFailed(worker.getFailed());
            counter.incrementTotal(worker.getTotal());
            taskProgressMap.put(counter.getName(), counter);
        }
    } else if (taskMaster instanceof StandaloneTaskMaster) {
        // A standalone cycle runs a single task on the currently selected worker.
        taskProgressMap = Maps.newHashMap();
        String ipAndPort = ActorPathUtil.getIpAndPortFromAkkaPath(
            ((StandaloneTaskMaster) taskMaster).getCurrentSelection().toSerializationFormat());
        TaskProgressCounter counter = new TaskProgressCounter(ipAndPort);
        taskProgressMap.put(ipAndPort, counter);
        counter.incrementTotal();
        if (success) {
            counter.incrementSuccess();
        } else {
            counter.incrementFailed();
        }
    }
    if (MapUtils.isEmpty(taskProgressMap)) {
        return;
    }

    ProgressHistory history = new ProgressHistory();
    history.setSerialNum(serialNum);
    history.setStartTime(loopStartTime);
    history.setEndTime(System.currentTimeMillis());
    history.setCostTime(history.getEndTime() - history.getStartTime());
    history.setTaskProgressMap(taskProgressMap);
    history.setSuccess(success);
    recentProgressHistory.offer(history);
}
/**
 * Schedules a new cycle (iteration) of the second-level job.
 * <p>
 * Acquires a new serial number from the master, refreshes the worker list when the master reports
 * invalid workers, submits the instance and updates the trigger counters. If refreshing or submitting
 * throws, the instance is killed.
 */
public void triggerNewCycle() {
    String cycleId = IdUtil.getUniqueId(jobInstanceInfo.getJobId(),
        jobInstanceInfo.getJobInstanceId(), taskMaster.aquireSerialNum());
    LOGGER.info("cycleId: {} cycle begin.", cycleId);
    cycleStartTime = System.currentTimeMillis();
    // NOTE(review): the builder created here is discarded, so this statement does not change
    // jobInstanceInfo. It looks like the schedule time was meant to be refreshed for the new cycle —
    // confirm the intent and the builder API before changing it.
    JobInstanceInfo.newBuilder(jobInstanceInfo).setScheduleTime(DateTime.now());
    try {
        // If invalid worker nodes exist, fetch the latest worker list again.
        if (taskMaster.existInvalidWorker()) {
            // getAllWorkers returns null when the lookup fails; the value is passed on unchanged.
            Set freeWorkers = getAllWorkers(jobInstanceInfo.getAppGroupId(), jobInstanceInfo.getJobId());
            taskMaster.restJobInstanceWorkerList(freeWorkers);
        }
        taskMaster.submitInstance(jobInstanceInfo);
        triggerTimes++;
        // Standalone job: trigger cu + 1. Distributed job: trigger cu + number of workers.
        // Both checks are null-safe so that this bookkeeping cannot throw after a successful submit
        // and be mistaken for a submit failure by the catch block below.
        if (ExecuteMode.STANDALONE.getKey().equals(jobInstanceInfo.getExecuteMode())) {
            triggerCus++;
        } else if (jobInstanceInfo.getAllWorkers() != null) {
            triggerCus += jobInstanceInfo.getAllWorkers().size();
        }
    } catch (Exception e) {
        taskMaster.killInstance(true,"killed, because of cycle submit failed.");
        LOGGER.error("cycleId: {} cycle submit failed, need to kill.", cycleId, e);
    }
}
}