Please wait. This can take a few minutes ...
Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance.
Project price only 1 $
You can buy this project and download/modify it as often as you want.
com.alibaba.schedulerx.worker.master.TaskMaster Maven / Gradle / Ivy
package com.alibaba.schedulerx.worker.master;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.commons.collections.CollectionUtils;
import com.alibaba.schedulerx.common.domain.InstanceStatus;
import com.alibaba.schedulerx.common.domain.JobInstanceData;
import com.alibaba.schedulerx.common.domain.JobInstanceInfo;
import com.alibaba.schedulerx.common.domain.MapTaskXAttrs;
import com.alibaba.schedulerx.common.domain.Pair;
import com.alibaba.schedulerx.common.domain.TaskStatus;
import com.alibaba.schedulerx.common.util.IdUtil;
import com.alibaba.schedulerx.common.util.JsonUtil;
import com.alibaba.schedulerx.common.util.StringUtils;
import com.alibaba.schedulerx.protocol.Common.UpstreamData;
import com.alibaba.schedulerx.protocol.Server.RetryTaskEntity;
import com.alibaba.schedulerx.protocol.Worker;
import com.alibaba.schedulerx.protocol.Worker.ContainerBatchReportTaskStatuesRequest;
import com.alibaba.schedulerx.protocol.Worker.ContainerReportTaskStatusRequest;
import com.alibaba.schedulerx.protocol.Worker.MasterStartContainerRequest;
import com.alibaba.schedulerx.protocol.Worker.TaskStatusInfo;
import com.alibaba.schedulerx.worker.container.ShutdownMode;
import com.alibaba.schedulerx.worker.discovery.ServerDiscovery;
import com.alibaba.schedulerx.worker.discovery.ServerDiscoveryFactory;
import com.alibaba.schedulerx.worker.domain.WorkerConstants;
import com.alibaba.schedulerx.worker.log.LogFactory;
import com.alibaba.schedulerx.worker.log.Logger;
import com.alibaba.schedulerx.worker.master.handler.UpdateInstanceStatusHandler;
import com.alibaba.schedulerx.worker.master.handler.UpdateInstanceStatusHandlerFactory;
import com.alibaba.schedulerx.worker.master.scheduler.TimeScheduler;
import com.alibaba.schedulerx.worker.processor.ProcessResult;
import com.alibaba.schedulerx.worker.util.ActorPathUtil;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.google.protobuf.ByteString;
import akka.actor.ActorContext;
import akka.actor.ActorSelection;
import akka.pattern.Patterns;
import akka.util.Timeout;
import scala.concurrent.Await;
import scala.concurrent.Future;
import scala.concurrent.duration.Duration;
/**
 * Abstract base for a job-instance master.
 *
 * <p>It records the latest status reported for each task, derives an instance status from
 * those task statuses, forwards it to an {@link UpdateInstanceStatusHandler}, and builds the
 * {@link MasterStartContainerRequest} messages sent to containers. Subclasses supply
 * {@link #submitInstance(JobInstanceInfo)} and {@link #destroyContainerPool()} and may override
 * the empty hook methods declared here.
 *
 * @author xiaomeng.hxm
 */
public abstract class TaskMaster {
    private static final Logger LOGGER = LogFactory.getLogger(TaskMaster.class);

    private final ActorContext actorContext;
    protected volatile InstanceStatus instanceStatus = InstanceStatus.RUNNING;
    /** Latest reported status per task, keyed by {@code IdUtil.getUniqueId(jobId, jobInstanceId, taskId)}. */
    protected volatile Map<String, TaskStatus> taskStatusMap = Maps.newHashMap();
    protected AtomicLong taskIdGenerator = new AtomicLong(0);
    //private final String localWorkerAkkaPath;
    private final String localWorkIdAddr;
    private final String localContainerRouterPath;
    private final String localTaskRouterPath;
    private final String localInstanceRouterPath;
    protected final JobInstanceInfo jobInstanceInfo;
    protected String jobInstanceProgress;
    protected final UpdateInstanceStatusHandler statusHandler;
    protected volatile boolean killed = false;
    protected volatile boolean INITED = false;
    protected volatile Set<String> aliveCheckWorkerSet = Sets.newConcurrentHashSet();
    /** Workers that have been marked offline via {@link #handleWorkerOffline(String)}. */
    protected volatile Set<String> invalidWorkerSet = Sets.newConcurrentHashSet();
    protected final ServerDiscovery SERVER_DISCOVERY;
    /** Used by second-level (seconds-granularity) jobs: the current loop iteration number. */
    protected AtomicLong serialNum = new AtomicLong(0);
    /** Released by {@link #latchRelease()}; {@link #terminate(ShutdownMode)} blocks on it. */
    protected final CountDownLatch latch = new CountDownLatch(1);
    /**
     * Whether an invalid (offline) worker exists.
     */
    protected volatile boolean existInvalidWorker = false;

    /**
     * Creates the master and resolves the local Akka routing paths from the actor context.
     *
     * <p>Note that the overridable {@link #checkProcessor()} hook is invoked from this constructor,
     * i.e. before any subclass field initializers have run.
     *
     * @param jobInstanceInfo the job instance this master is responsible for
     * @param actorContext    the Akka actor context used to resolve addresses and select actors
     * @throws Exception if {@link #checkProcessor()} or handler creation fails
     */
    public TaskMaster(JobInstanceInfo jobInstanceInfo, ActorContext actorContext) throws Exception {
        this.jobInstanceInfo = jobInstanceInfo;
        this.actorContext = actorContext;
        this.localWorkIdAddr = actorContext.provider().getDefaultAddress().hostPort();
        // All three router paths share the same address prefix; resolve it once.
        final String localAddress = actorContext.provider().getDefaultAddress().toString();
        this.localInstanceRouterPath = localAddress + WorkerConstants.WORKER_AKKA_JOB_INSTANCE_ROUTING_PATH;
        this.localContainerRouterPath = localAddress + WorkerConstants.WORKER_AKKA_CONTAINER_ROUTING_PATH;
        this.localTaskRouterPath = localAddress + WorkerConstants.WORKER_AKKA_TASK_ROUTING_PATH;
        this.SERVER_DISCOVERY = ServerDiscoveryFactory.getDiscovery(jobInstanceInfo.getGroupId());
        // this.aliveCheckWorkerSet.addAll(jobInstanceInfo.getAllWorkers());
        checkProcessor();
        this.statusHandler = UpdateInstanceStatusHandlerFactory.create(this, jobInstanceInfo);
    }

    /** @return the actor context passed to the constructor */
    public ActorContext getActorContext() {
        return actorContext;
    }

    /** @return the {@code host:port} of the local actor system's default address */
    public String getLocalWorkerIdAddr() {
        return localWorkIdAddr;
    }

    /** @return the local default address followed by the job-instance routing path */
    public String getLocalJobInstanceRouterPath() {
        return localInstanceRouterPath;
    }

    /** @return the local default address followed by the container routing path */
    public String getLocalContainerRouterPath() {
        return localContainerRouterPath;
    }

    /** @return the local default address followed by the task routing path */
    public String getLocalTaskRouterPath() {
        return localTaskRouterPath;
    }

    /**
     * Reports whether every task recorded in {@link #taskStatusMap} is in a finished state.
     *
     * @return {@code true} when no recorded task is unfinished (also {@code true} for an empty map)
     */
    public boolean isJobInstanceFinished() {
        //TODO needs refactoring: this is a linear scan on every call
        for (TaskStatus status : taskStatusMap.values()) {
            if (!status.isFinish()) {
                return false;
            }
        }
        return true;
    }

    /**
     * Records the status reported for a single task, recomputes the instance status from all
     * recorded tasks and hands it to the status handler.
     *
     * @param request the task status report from a container
     */
    public void updateTaskStatus(ContainerReportTaskStatusRequest request) {
        long jobId = request.getJobId();
        long jobInstanceId = request.getJobInstanceId();
        long taskId = request.getTaskId();
        TaskStatus taskStatus = TaskStatus.parseValue(request.getStatus());
        //TODO this code is inefficient (full scans per report) and needs refactoring
        String uniqueId = IdUtil.getUniqueId(jobId, jobInstanceId, taskId);
        taskStatusMap.put(uniqueId, taskStatus);

        InstanceStatus newStatus = deriveInstanceStatus();

        this.jobInstanceInfo.setTraceId(request.getTraceId());
        this.jobInstanceProgress = request.getProgress();
        updateNewInstanceStatus(request.getSerialNum(), jobInstanceId, newStatus, request.getResult());
    }

    /**
     * Derives the instance status from the recorded task statuses: {@code UNKNOWN} when nothing is
     * recorded, {@code RUNNING} while any task is unfinished, otherwise {@code FAILED} if at least
     * one task failed and {@code SUCCESS} if none did.
     */
    private InstanceStatus deriveInstanceStatus() {
        if (taskStatusMap.isEmpty()) {
            return InstanceStatus.UNKNOWN;
        }
        if (!isJobInstanceFinished()) {
            return InstanceStatus.RUNNING;
        }
        // A single FAILED sub-task is enough to fail the whole instance.
        for (TaskStatus status : taskStatusMap.values()) {
            if (status.equals(TaskStatus.FAILED)) {
                return InstanceStatus.FAILED;
            }
        }
        return InstanceStatus.SUCCESS;
    }

    /**
     * Unpacks a batch report into one {@link ContainerReportTaskStatusRequest} per task and feeds
     * each to {@link #updateTaskStatus(ContainerReportTaskStatusRequest)}.
     *
     * @param request the batch report
     * @throws Exception declared for overriding implementations
     */
    //TODO: MapTaskMaster may override this method do really batch process
    public void batchUpdateTaskStatus(ContainerBatchReportTaskStatuesRequest request) throws Exception {
        for (TaskStatusInfo taskStatusInfo : request.getTaskStatuesList()) {
            ContainerReportTaskStatusRequest.Builder builder = ContainerReportTaskStatusRequest.newBuilder()
                .setJobId(request.getJobId())
                .setJobInstanceId(request.getJobInstanceId())
                .setTaskId(taskStatusInfo.getTaskId())
                .setWorkerAddr(request.getWorkerAddr())
                .setWorkerId(request.getWorkerId())
                .setStatus(taskStatusInfo.getStatus());
            // Optional fields are copied only when present on the incoming message.
            if (taskStatusInfo.hasResult()) {
                builder.setResult(taskStatusInfo.getResult());
            }
            if (taskStatusInfo.hasTaskName()) {
                builder.setTaskName(taskStatusInfo.getTaskName());
            }
            if (taskStatusInfo.hasProgress()) {
                builder.setProgress(taskStatusInfo.getProgress());
            }
            if (request.hasSerialNum()) {
                builder.setSerialNum(request.getSerialNum());
            }
            if (taskStatusInfo.hasTraceId()) {
                builder.setTraceId(taskStatusInfo.getTraceId());
            }
            updateTaskStatus(builder.build());
        }
    }

    /**
     * Hook for processing a list of status reports at once; does nothing in this base class.
     *
     * @param requests the status reports
     */
    // NOTE(review): element type restored by inference (the scraped source lost its generics) --
    // confirm against the overriding subclasses.
    public void batchUpdateTaskStatues(List<ContainerReportTaskStatusRequest> requests) {}

    /**
     * Marks this master as killed and removes the job instance from the {@link TimeScheduler}.
     * Neither argument is used by this base implementation.
     *
     * @param mayInterruptIfRunning unused here
     * @param reason                unused here
     */
    public void killInstance(boolean mayInterruptIfRunning, String reason) {
        this.killed = true;
        TimeScheduler.INSTANCE.remove(jobInstanceInfo.getJobInstanceId());
    }

    /**
     * Sends a {@code MasterKillContainerRequest} to the container router of each given worker.
     *
     * <p>With {@code mayInterruptIfRunning} the request is sent fire-and-forget. Otherwise the
     * request is sent with {@code ask} and this method then waits for each reply in turn, up to
     * five seconds per worker. Failures are logged and never propagated.
     *
     * @param mayInterruptIfRunning whether to force termination
     * @param allWorkers            worker id/address strings to notify
     */
    protected void sendKillContainerRequest(boolean mayInterruptIfRunning, List<String> allWorkers) {
        String uniqueId = IdUtil.getUniqueIdWithoutTask(jobInstanceInfo.getJobId(), jobInstanceInfo.getJobInstanceId());
        Set<Pair<String, Future<Object>>> futures = Sets.newHashSet();
        Timeout timeout = new Timeout(Duration.create(5, TimeUnit.SECONDS));
        for (String workerIdAddr : allWorkers) {
            try {
                ActorSelection selection = getActorContext().actorSelection(ActorPathUtil.getContainerRouterPath(workerIdAddr));
                Worker.MasterKillContainerRequest request = Worker.MasterKillContainerRequest.newBuilder()
                    .setJobId(jobInstanceInfo.getJobId())
                    .setJobInstanceId(jobInstanceInfo.getJobInstanceId())
                    .setMayInterruptIfRunning(mayInterruptIfRunning)
                    .setAppGroupId(jobInstanceInfo.getAppGroupId())
                    .setGroupId(jobInstanceInfo.getGroupId())
                    .build();
                if (mayInterruptIfRunning) {
                    // Forced termination: dispatch asynchronously without waiting for a reply.
                    selection.tell(request, null);
                } else {
                    // Asynchronous dispatch; the reply is awaited below.
                    Future<Object> future = Patterns.ask(selection, request, timeout);
                    futures.add(new Pair<String, Future<Object>>(workerIdAddr, future));
                }
            } catch (Throwable e) {
                // Best effort: keep notifying the remaining workers, but keep the cause in the log.
                LOGGER.warn("send kill instance request exception, worker:{}, uninqueId:{}", workerIdAddr, uniqueId, e);
            }
        }
        // Wait for all replies to come back.
        for (Pair<String, Future<Object>> pair : futures) {
            try {
                Await.result(pair.getSecond(), timeout.duration());
            } catch (Exception e) {
                LOGGER.warn("send kill instance request timeout, worker:{}, uninqueId:{}", pair.getFirst(), uniqueId, e);
            }
        }
    }

    /** Releases the container pool owned by the concrete master; invoked from {@link #clear()}. */
    public abstract void destroyContainerPool();

    /**
     * Hook for killing a single task; not implemented in this base class.
     *
     * @param uniqueId   unused here
     * @param workerId   unused here
     * @param workerAddr unused here
     */
    public void killTask(String uniqueId, String workerId, String workerAddr) {
        //TODO
    }

    /** Sets the {@link #INITED} flag if it is not already set. */
    protected void init() {
        if (INITED) {
            return;
        }
        INITED = true;
    }

    /**
     * Hook for retrying tasks; does nothing in this base class.
     *
     * @param taskEntities the tasks to retry
     */
    public void retryTasks(List<RetryTaskEntity> taskEntities) {}

    /**
     * Submits the job instance for execution.
     *
     * @param jobInstanceInfo the job instance to submit
     * @throws Exception if submission fails
     */
    public abstract void submitInstance(JobInstanceInfo jobInstanceInfo) throws Exception;

    /** @return the next task id; ids start at 0 and increase by one per call */
    protected long aquireTaskId() {
        return taskIdGenerator.getAndIncrement();
    }

    /** @return the progress string taken from the most recent task status report, possibly {@code null} */
    public String getJobInstanceProgress() {
        return jobInstanceProgress;
    }

    /**
     * Forwards a new instance status for this master's own job instance to the status handler.
     *
     * @param serialNum the loop serial number the status belongs to
     * @param newStatus the new instance status
     * @param result    the result text passed to the handler
     */
    public void updateNewInstanceStatus(long serialNum, InstanceStatus newStatus, String result) {
        updateNewInstanceStatus(serialNum, jobInstanceInfo.getJobInstanceId(), newStatus, result);
    }

    /**
     * Forwards a new instance status to the status handler. Handler exceptions are logged and not
     * rethrown. {@code jobInstanceId} is used only for that log line.
     *
     * @param serialNum     the loop serial number the status belongs to
     * @param jobInstanceId the job instance id, used for logging
     * @param newStatus     the new instance status
     * @param result        the result text passed to the handler
     */
    protected synchronized void updateNewInstanceStatus(long serialNum, long jobInstanceId, InstanceStatus newStatus,
                                                        String result) {
        try {
            this.statusHandler.handle(serialNum, newStatus, result);
        } catch (Exception e) {
            LOGGER.error("update jobInstanceId={} serialNum={}, status={} failed", jobInstanceId, serialNum, newStatus.getValue(), e);
        }
    }

    /** Hook invoked to stop the master; does nothing in this base class. */
    public void stop() {}

    /**
     * Resets task statuses, the task id generator, the instance status and both worker sets, then
     * destroys the container pool.
     */
    public void clear() {
        taskStatusMap.clear();
        taskIdGenerator.set(0);
        instanceStatus = InstanceStatus.RUNNING;
        aliveCheckWorkerSet.clear();
        invalidWorkerSet.clear();
        destroyContainerPool();
    }

    /**
     * Hook invoked after the instance finishes.
     *
     * @param jobInstanceId the finished job instance id
     * @return {@code null} in this base class
     */
    public ProcessResult postFinish(long jobInstanceId) {
        return null;
    }

    /**
     * Builds a start-container request builder with no task name, no task body and no failover flag.
     *
     * @param jobInstanceInfo source of the job-level fields
     * @param taskId          the task id to set
     * @return the populated builder
     */
    protected MasterStartContainerRequest.Builder convert2StartContainerRequestBuilder(JobInstanceInfo jobInstanceInfo, long taskId) {
        return convert2StartContainerRequestBuilder(jobInstanceInfo, taskId, null, null, false);
    }

    /**
     * Builds a complete (non-failover) start-container request.
     *
     * @param jobInstanceInfo source of the job-level fields
     * @param taskId          the task id to set
     * @param taskName        task name, or {@code null} to leave it unset
     * @param taskBody        task payload, or {@code null} to leave it unset
     * @return the built request
     */
    protected MasterStartContainerRequest convert2StartContainerRequest(JobInstanceInfo jobInstanceInfo, long taskId,
                                                                        String taskName, ByteString taskBody) {
        MasterStartContainerRequest.Builder builder = convert2StartContainerRequestBuilder(jobInstanceInfo, taskId, taskName, taskBody, false);
        return builder.build();
    }

    /**
     * Populates a start-container request builder from the job instance info.
     *
     * <p>Optional values (upstream data, xattrs, task name/body, failover, workflow ids, job name,
     * namespace, template) are set only when present. The master path is this master's local task
     * router path, and the serial number is the current value of {@link #getSerialNum()}.
     *
     * @param jobInstanceInfo source of the job-level fields
     * @param taskId          the task id to set
     * @param taskName        task name, or {@code null} to leave it unset
     * @param taskBody        task payload, or {@code null} to leave it unset
     * @param failover        whether to mark the request as a failover
     * @return the populated builder
     */
    protected MasterStartContainerRequest.Builder convert2StartContainerRequestBuilder(JobInstanceInfo jobInstanceInfo, long taskId,
                                                                                       String taskName, ByteString taskBody, boolean failover) {
        MasterStartContainerRequest.Builder builder = MasterStartContainerRequest.newBuilder();
        builder.setJobId(jobInstanceInfo.getJobId());
        builder.setJobInstanceId(jobInstanceInfo.getJobInstanceId());
        builder.setTaskId(taskId);
        builder.setAppGroupId(jobInstanceInfo.getAppGroupId());
        builder.setUser(jobInstanceInfo.getUser());
        builder.setJobType(jobInstanceInfo.getJobType());
        builder.setContent(jobInstanceInfo.getContent());
        builder.setScheduleTime(jobInstanceInfo.getScheduleTime().getMillis());
        builder.setDataTime(jobInstanceInfo.getDataTime().getMillis());
        builder.setParameters(jobInstanceInfo.getParameters());
        builder.setInstanceParameters(jobInstanceInfo.getInstanceParameters());
        builder.setInstanceMasterAkkaPath(getLocalTaskRouterPath());
        builder.setGroupId(jobInstanceInfo.getGroupId());
        builder.setMaxAttempt(jobInstanceInfo.getMaxAttempt());
        builder.setAttempt(jobInstanceInfo.getAttempt());
        builder.setTriggerType(jobInstanceInfo.getTriggerType());
        if (jobInstanceInfo.getUpstreamData() != null && !jobInstanceInfo.getUpstreamData().isEmpty()) {
            for (JobInstanceData jobInstanceData : jobInstanceInfo.getUpstreamData()) {
                UpstreamData upstreamData = UpstreamData.newBuilder()
                    .setJobName(jobInstanceData.getJobName())
                    .setData(jobInstanceData.getData())
                    .build();
                builder.addUpstreamData(upstreamData);
            }
        }
        if (StringUtils.isNotEmpty(jobInstanceInfo.getXattrs())) {
            builder.setXattrs(jobInstanceInfo.getXattrs());
            // The xattrs JSON also carries the map-task tuning values copied below.
            MapTaskXAttrs xAttrs = JsonUtil.fromJson(jobInstanceInfo.getXattrs(), MapTaskXAttrs.class);
            builder.setConsumerNum(xAttrs.getConsumerSize());
            builder.setTaskMaxAttempt(xAttrs.getTaskMaxAttempt());
            builder.setTaskAttemptInterval(xAttrs.getTaskAttemptInterval());
        }
        if (taskName != null) {
            builder.setTaskName(taskName);
        }
        if (taskBody != null) {
            builder.setTask(taskBody);
        }
        if (failover) {
            builder.setFailover(true);
        }
        if (jobInstanceInfo.getWfInstanceId() != null) {
            builder.setWfInstanceId(jobInstanceInfo.getWfInstanceId());
        }
        builder.setSerialNum(getSerialNum());
        builder.setExecuteMode(jobInstanceInfo.getExecuteMode());
        if (jobInstanceInfo.getJobName() != null) {
            builder.setJobName(jobInstanceInfo.getJobName());
        }
        builder.setTimeType(jobInstanceInfo.getTimeType());
        builder.setTimeExpression(jobInstanceInfo.getTimeExpression());
        if (StringUtils.isNotEmpty(jobInstanceInfo.getNamespace())) {
            builder.setNamespace(jobInstanceInfo.getNamespace());
        }
        if (StringUtils.isNotEmpty(jobInstanceInfo.getTemplate())) {
            builder.setTemplate(jobInstanceInfo.getTemplate());
        }
        if (jobInstanceInfo.getWorkflowId() != null) {
            builder.setWorkflowId(jobInstanceInfo.getWorkflowId());
        }
        return builder;
    }

    /**
     * Getter method for property instanceStatus.
     *
     * @return property value of instanceStatus
     */
    public InstanceStatus getInstanceStatus() {
        return instanceStatus;
    }

    /**
     * Setter method for property instanceStatus.
     *
     * @param instanceStatus value to be assigned to property instanceStatus
     */
    public void setInstanceStatus(InstanceStatus instanceStatus) {
        this.instanceStatus = instanceStatus;
    }

    /** @return {@code true} once {@link #killInstance(boolean, String)} has been called */
    public boolean isKilled() {
        return killed;
    }

    /**
     * Getter method for property jobInstanceInfo.
     *
     * @return property value of jobInstanceInfo
     */
    public JobInstanceInfo getJobInstanceInfo() {
        return jobInstanceInfo;
    }

    /**
     * Getter method for property aliveCheckWorkerSet.
     *
     * @return property value of aliveCheckWorkerSet (the live set, not a copy)
     */
    public Set<String> getAliveCheckWorkerSet() {
        return aliveCheckWorkerSet;
    }

    /** @return whether {@link #init()} has set the initialized flag */
    public boolean isInited() {
        return INITED;
    }

    /** @return the current loop serial number */
    public long getSerialNum() {
        return serialNum.get();
    }

    /** @return the loop serial number after incrementing it by one */
    public long aquireSerialNum() {
        return serialNum.incrementAndGet();
    }

    /**
     * Hook called from the constructor; does nothing in this base class.
     *
     * @throws Exception declared for overriding implementations
     */
    protected void checkProcessor() throws Exception {}

    /** @return whether a worker has been marked offline and the worker list not yet reset */
    public boolean existInvalidWorker() {
        return existInvalidWorker;
    }

    /**
     * Resets the list of workers available to the job instance.
     *
     * <p>A non-empty set replaces the instance's worker list (copied into a copy-on-write list) and
     * clears the invalid-worker flag. A {@code null} or empty set only logs a warning and leaves
     * the current list untouched.
     *
     * @param freeWorkers the workers that are currently available
     */
    public void restJobInstanceWorkerList(Set<String> freeWorkers) {
        if (CollectionUtils.isNotEmpty(freeWorkers)) {
            this.jobInstanceInfo.setAllWorkers(Lists.newCopyOnWriteArrayList(freeWorkers));
            this.existInvalidWorker = false;
            LOGGER.info("restJobInstanceWorkerList appGroupId={} instanceId={} workerSize={}.", jobInstanceInfo.getAppGroupId(),
                jobInstanceInfo.getJobInstanceId(), freeWorkers.size());
        } else {
            LOGGER.warn("restJobInstanceWorkerList update appGroupId={} instanceId={} workers=0.", jobInstanceInfo.getAppGroupId(),
                jobInstanceInfo.getJobInstanceId());
        }
    }

    /**
     * Handles the shutdown of the given worker; does nothing in this base class.
     *
     * @param workerIdAddr the worker that shut down
     * @param withFailover unused here
     */
    public synchronized void handleWorkerShutdown(String workerIdAddr, boolean withFailover) {}

    /**
     * Marks the given worker as offline: sets the invalid-worker flag, removes the worker from the
     * instance's worker list and records it in {@link #invalidWorkerSet}.
     *
     * @param workerIdAddr the worker to mark offline
     */
    public synchronized void handleWorkerOffline(String workerIdAddr) {
        //TODO This only marks the node as offline; whether it is really offline is still decided by
        // the periodic check, so there may be a delay.
        // 1. Once the worker is finally offline, the master still needs to be told to run shutdown.
        // 2. The worker itself must wrap up leftover sub-tasks and report their status.
        this.existInvalidWorker = true;
        this.getJobInstanceInfo().getAllWorkers().remove(workerIdAddr);
        this.invalidWorkerSet.add(workerIdAddr);
        LOGGER.info("handle worker={} offline.", workerIdAddr);
    }

    /**
     * Worker terminate.
     *
     * <p>For {@code IMMEDIATE} and {@code WAIT_RUNNING} this calls {@link #doTerminate()} followed
     * by {@link #killInstance(boolean, String)} (forced only for {@code IMMEDIATE}). For every mode,
     * including {@code null}, it then blocks until {@link #latchRelease()} is called.
     *
     * @param shutdownMode how the worker is shutting down
     * @throws InterruptedException if interrupted while waiting on the latch
     */
    public void terminate(ShutdownMode shutdownMode) throws InterruptedException {
        // IMMEDIATE: force stop. WAIT_RUNNING: stop dispatching, let running tasks finish and
        // drop the queued ones.
        final boolean immediate = ShutdownMode.IMMEDIATE.equals(shutdownMode);
        if (immediate || ShutdownMode.WAIT_RUNNING.equals(shutdownMode)) {
            this.doTerminate();
            this.killInstance(immediate, "Worker master shutdown.");
        }
        // Graceful master shutdown: wait until all sub-tasks have completed.
        latch.await();
    }

    /** Hook called by {@link #terminate(ShutdownMode)} before the instance is killed; no-op here. */
    protected void doTerminate() {}

    /** Releases the latch that {@link #terminate(ShutdownMode)} is waiting on. */
    public void latchRelease() {
        latch.countDown();
    }

    /**
     * Determines whether this master is finished.
     *
     * @return {@code true} if the instance status is a finished one, or the job instance is no
     *         longer registered in {@link TaskMasterPool}
     */
    protected boolean isFinished() {
        return this.instanceStatus.isFinish() || !TaskMasterPool.INSTANCE.contains(this.jobInstanceInfo.getJobInstanceId());
    }
}