// NOTE(review): the following non-Java boilerplate was prepended to this file by a
// code-download website ("Please wait...", pricing text, "Maven / Gradle / Ivy");
// it is not part of the source and would not compile, so it has been commented out.
package com.alibaba.schedulerx.worker.master;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.Socket;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.collections.MapUtils;
import org.apache.commons.lang.StringUtils;
import org.joda.time.DateTime;
import com.alibaba.schedulerx.common.domain.InstanceStatus;
import com.alibaba.schedulerx.common.domain.JobInstanceInfo;
import com.alibaba.schedulerx.common.domain.LimitedQueue;
import com.alibaba.schedulerx.common.domain.MapTaskXAttrs;
import com.alibaba.schedulerx.common.domain.Metrics;
import com.alibaba.schedulerx.common.domain.StreamJobProgress;
import com.alibaba.schedulerx.common.domain.StreamJobProgressDetail;
import com.alibaba.schedulerx.common.domain.TaskDispatchMode;
import com.alibaba.schedulerx.common.domain.TaskProgressCounter;
import com.alibaba.schedulerx.common.domain.TaskStatus;
import com.alibaba.schedulerx.common.domain.TimeType;
import com.alibaba.schedulerx.common.domain.WorkerProgressCounter;
import com.alibaba.schedulerx.common.domain.enums.RouteStrategyEnum;
import com.alibaba.schedulerx.common.util.ConfigUtil;
import com.alibaba.schedulerx.common.util.ExceptionUtil;
import com.alibaba.schedulerx.common.util.HessianUtil;
import com.alibaba.schedulerx.common.util.IdUtil;
import com.alibaba.schedulerx.common.util.JobUtil;
import com.alibaba.schedulerx.common.util.JsonUtil;
import com.alibaba.schedulerx.protocol.Worker;
import com.alibaba.schedulerx.protocol.Worker.ContainerReportTaskStatusRequest;
import com.alibaba.schedulerx.protocol.Worker.MasterStartContainerRequest;
import com.alibaba.schedulerx.protocol.utils.FutureUtils;
import com.alibaba.schedulerx.worker.SchedulerxWorker;
import com.alibaba.schedulerx.worker.batch.ReqQueue;
import com.alibaba.schedulerx.worker.batch.StreamTaskPushReqHandler;
import com.alibaba.schedulerx.worker.batch.TMStatusReqHandler;
import com.alibaba.schedulerx.worker.domain.JavaProcessorProfile;
import com.alibaba.schedulerx.worker.domain.JobContext;
import com.alibaba.schedulerx.worker.domain.TaskInfo;
import com.alibaba.schedulerx.worker.domain.WorkerConstants;
import com.alibaba.schedulerx.worker.log.LogFactory;
import com.alibaba.schedulerx.worker.log.Logger;
import com.alibaba.schedulerx.worker.logcollector.ClientLoggerMessage;
import com.alibaba.schedulerx.worker.logcollector.LogCollector;
import com.alibaba.schedulerx.worker.logcollector.LogCollectorFactory;
import com.alibaba.schedulerx.worker.master.persistence.H2FilePersistence;
import com.alibaba.schedulerx.worker.master.persistence.TaskPersistence;
import com.alibaba.schedulerx.worker.metrics.WorkerLoadRegister;
import com.alibaba.schedulerx.worker.processor.JobProcessor;
import com.alibaba.schedulerx.worker.processor.ProcessResult;
import com.alibaba.schedulerx.worker.processor.StreamJobProcessor;
import com.alibaba.schedulerx.worker.route.Router;
import com.alibaba.schedulerx.worker.route.RouterFactory;
import com.alibaba.schedulerx.worker.util.ActorPathUtil;
import com.alibaba.schedulerx.worker.util.ContanerUtil;
import com.alibaba.schedulerx.worker.util.JobProcessorUtil;
import com.alibaba.schedulerx.worker.util.WorkerConfigUtil;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.protobuf.ByteString;
import akka.actor.ActorContext;
import akka.actor.ActorSelection;
/**
* StreamTaskMaster
* @author yaohui
* @create 2023/5/18 11:19 AM
**/
public class StreamTaskMaster extends TaskMaster {
private static final Logger LOGGER = LogFactory.getLogger(StreamTaskMaster.class);
private volatile int index = 0;
/**
* 子任务队列缓冲
*/
protected ReqQueue taskBlockingQueue;
protected StreamTaskPushReqHandler taskDispatchReqHandler;
protected Thread streamProduceThread;
private LogCollector logCollector = LogCollectorFactory.get();
private LimitedQueue streamJobProgressHistory = new LimitedQueue<>(10);
private Map streamJobProgressMap = Maps.newConcurrentMap();
protected Router router;
//task批量汇报队列
protected ReqQueue taskStatusReqQueue;
protected TMStatusReqHandler taskStatusReqBatchHandler;
protected TaskPersistence taskPersistence;
protected MapTaskXAttrs xAttrs = null;
protected StreamJobProcessor streamJobProcessor;
private Map> taskResultMap = Maps.newHashMap();
private Map> taskStatusMap = Maps.newHashMap();
private TaskProgressCounter totalCounter = new TaskProgressCounter("TotalCounter");
/**
 * Creates a stream task master for one job instance.
 *
 * @param jobInstanceInfo instance metadata (processor content, xattrs, worker list, ...)
 * @param actorContext    akka actor context used to reach workers
 * @throws Exception if persistence initialization or processor loading fails
 */
public StreamTaskMaster(JobInstanceInfo jobInstanceInfo, ActorContext actorContext) throws Exception {
    super(jobInstanceInfo, actorContext);
    // H2 file-backed local store; used for task state and the failover pull loop.
    this.taskPersistence = H2FilePersistence.getInstance();
    this.taskPersistence.initTable();
    // Load the user's stream processor from the job content.
    this.streamJobProcessor = (StreamJobProcessor) JobProcessorUtil.getJavaProcessor(jobInstanceInfo.getContent());
    // Optional map-task extended attributes (queue size, consumer size, failover flag, ...).
    if (jobInstanceInfo.getXattrs() != null) {
        this.xAttrs = JsonUtil.fromJson(jobInstanceInfo.getXattrs(), MapTaskXAttrs.class);
    }
}
/**
 * Ensure a progress counter exists for the given batch and add {@code delta} to its total.
 * Also accumulates into the overall counter, except for the root task.
 *
 * @param batchNo  produce batch number (serialNum)
 * @param taskName counter name (root task name or "SubTask")
 * @param delta    number of tasks to add to the batch total
 */
private void initTaskProgress(Long batchNo, String taskName, int delta) {
    // BUGFIX: the original used containsKey()+get(); the status-check thread removes
    // finished batches concurrently, so the entry could vanish between the two calls
    // and the unsynchronized else-branch would NPE. A single get() per check avoids that.
    StreamJobProgressDetail detail = streamJobProgressMap.get(batchNo);
    if (detail == null) {
        synchronized (this) {
            detail = streamJobProgressMap.get(batchNo);
            if (detail == null) {
                TaskProgressCounter taskProgressCounter = new TaskProgressCounter(taskName);
                taskProgressCounter.incrementTotal(delta);
                streamJobProgressMap.put(batchNo,
                    new StreamJobProgressDetail(batchNo, DateTime.now().getMillis(), taskProgressCounter));
            } else {
                detail.getTaskProgressCounter().incrementTotal(delta);
            }
        }
    } else {
        detail.getTaskProgressCounter().incrementTotal(delta);
    }
    // The root task does not count toward the overall subtask totals.
    if (!WorkerConstants.MAP_TASK_ROOT_NAME.equals(taskName)) {
        totalCounter.incrementTotal(delta);
    }
}
/**
 * Entry point for a worker's batched status report: refreshes that worker's load
 * metrics, then delegates the per-task handling to the superclass.
 *
 * @param request batched status report including the worker's metrics JSON
 * @throws Exception propagated from the superclass handling
 */
@Override
public void batchUpdateTaskStatus(Worker.ContainerBatchReportTaskStatuesRequest request) throws Exception {
    // Worker identity is "workerId@workerAddr" throughout this class.
    String workerIdAddr = request.getWorkerId() + "@" + request.getWorkerAddr();
    this.setWorkerLoad(workerIdAddr, request.getMetricsJson(), null);
    super.batchUpdateTaskStatus(request);
}
/**
 * Enqueue a single task status report; the batch handler drains and applies
 * the queue asynchronously. Failures to enqueue are logged and swallowed.
 */
@Override
public void updateTaskStatus(ContainerReportTaskStatusRequest request) {
    try {
        taskStatusReqQueue.submitRequest(request);
    } catch (Throwable t) {
        LOGGER.error("", t);
    }
}
/**
 * Apply a drained batch of subtask status reports: update per-batch and overall
 * progress counters, capture results for reduce(), then persist the final status
 * of each task (deduplicated by taskId) with up to 3 retries.
 *
 * @param requests status reports drained from {@code taskStatusReqQueue}
 */
@Override
public void batchUpdateTaskStatues(List<ContainerReportTaskStatusRequest> requests) {
    // Deduplicate by taskId: a later report only replaces an earlier one if it is final.
    Map<Long, ContainerReportTaskStatusRequest> finalTaskStatus = Maps.newHashMap();
    for (ContainerReportTaskStatusRequest request : requests) {
        try {
            TaskStatus taskStatus = TaskStatus.parseValue(request.getStatus());
            // Filter out intermediate statuses once a final one is seen.
            if (!finalTaskStatus.containsKey(request.getTaskId()) || taskStatus.isFinish()) {
                finalTaskStatus.put(request.getTaskId(), request);
            }
            String workerAddr = request.getWorkerAddr();
            LOGGER.debug("report task status:{} from worker:{}, uniqueId:{}", taskStatus.getDescription(),
                workerAddr, IdUtil.getUniqueId(request.getJobId(), request.getJobInstanceId(), request.getTaskId()));
            // serialNum identifies the produce batch this task belongs to.
            // NOTE(review): if the batch was already pruned this NPEs into the catch below
            // and fails the instance — confirm that is intended.
            StreamJobProgressDetail streamJobProgressDetail = this.streamJobProgressMap.get(request.getSerialNum());
            TaskProgressCounter taskProgressCounter = streamJobProgressDetail.getTaskProgressCounter();
            streamJobProgressDetail.setStatus(TaskStatus.RUNNING.getValue());
            Map<String, WorkerProgressCounter> workerProgressMap = streamJobProgressDetail.getWorkerProgressMap();
            if (taskStatus.equals(TaskStatus.RUNNING)) {
                taskProgressCounter.incrementRunning();
                totalCounter.incrementRunning();
                if (workerAddr != null) {
                    workerProgressMap.get(workerAddr).incrementRunning();
                }
            } else if (taskStatus.equals(TaskStatus.SUCCESS)) {
                taskProgressCounter.incrementSuccess();
                totalCounter.incrementSuccess();
                // A finished task frees one slot in the global consumer limiter.
                this.taskDispatchReqHandler.release();
                if (workerAddr != null) {
                    workerProgressMap.get(workerAddr).incrementSuccess();
                }
            } else if (taskStatus.equals(TaskStatus.FAILED)) {
                taskProgressCounter.incrementFailed();
                totalCounter.incrementFailed();
                this.taskDispatchReqHandler.release();
                if (workerAddr != null) {
                    workerProgressMap.get(workerAddr).incrementFailed();
                    if (StringUtils.isNotBlank(request.getTraceId())) {
                        workerProgressMap.get(workerAddr).setTraceId(request.getTraceId());
                    }
                }
            }
            if (TaskStatus.FAILED.equals(taskStatus)) {
                String uniqueIdWithoutTask = IdUtil.getUniqueIdWithoutTask(jobInstanceInfo.getJobId(), jobInstanceInfo.getJobInstanceId());
                LOGGER.info("jobInstanceId={}, taskId={}, report status failed. result:{}", jobInstanceInfo.getJobInstanceId(), request.getTaskId(), request.getResult());
                logCollector.collect(jobInstanceInfo.getAppGroupId(), uniqueIdWithoutTask,
                    ClientLoggerMessage.JOB_PROCESSOR_EXEC_FAIL + request.getTaskId() + ", " + request.getResult(), jobInstanceInfo.getGroupId());
            }
            // Record each finished task's result/status for the later reduce() call.
            if (this.streamJobProcessor.needReduce() && taskStatus.isFinish()) {
                Map<Long, String> batchTaskResultMap = taskResultMap.get(request.getSerialNum());
                if (batchTaskResultMap == null) {
                    // double-checked lazy init of the per-batch result map
                    synchronized (taskResultMap) {
                        batchTaskResultMap = taskResultMap.get(request.getSerialNum());
                        if (batchTaskResultMap == null) {
                            batchTaskResultMap = new HashMap<>();
                            taskResultMap.put(request.getSerialNum(), batchTaskResultMap);
                        }
                    }
                }
                batchTaskResultMap.put(request.getTaskId(), request.getResult());
                Map<Long, TaskStatus> batchTaskStatusMap = taskStatusMap.get(request.getSerialNum());
                if (batchTaskStatusMap == null) {
                    synchronized (taskStatusMap) {
                        batchTaskStatusMap = taskStatusMap.get(request.getSerialNum());
                        if (batchTaskStatusMap == null) {
                            batchTaskStatusMap = new HashMap<>();
                            taskStatusMap.put(request.getSerialNum(), batchTaskStatusMap);
                        }
                    }
                }
                batchTaskStatusMap.put(request.getTaskId(), taskStatus);
            }
        } catch (Throwable e) {
            LOGGER.error("jobInstanceId={}, batchNo={}, taskId={}, update progressMap error.", jobInstanceInfo.getJobInstanceId(),
                request.getSerialNum(), request.getTaskId(), e);
            updateNewInstanceStatus(getSerialNum(), InstanceStatus.FAILED, "update progressMap error." + e.getMessage());
        }
    }
    try {
        long startTime = System.currentTimeMillis();
        boolean updateSuccess = false;
        for (int i = 0; i < 3; i++) {
            // try 3 times
            try {
                taskPersistence.updateTaskStatues(Lists.newArrayList(finalTaskStatus.values()));
                updateSuccess = true;
                break;
            } catch (Throwable t) {
                // BUGFIX: the jobInstanceId argument was missing, leaving the {} placeholder unfilled.
                LOGGER.error("jobInstanceId={}, persistent batch updateTaskStatus error.", jobInstanceInfo.getJobInstanceId(), t);
            }
        }
        // Persistence failed all 3 attempts: fail the instance.
        if (!updateSuccess) {
            updateNewInstanceStatus(getSerialNum(), InstanceStatus.FAILED, "persistent batch update TaskStatus error up to 3 times");
        }
        LOGGER.debug("{} batch update status db cost:{}", jobInstanceInfo.getJobInstanceId(),
            System.currentTimeMillis() - startTime);
    } catch (Throwable e) {
        LOGGER.error("jobInstanceId={}, batch updateTaskStatus error.", jobInstanceInfo.getJobInstanceId(), e);
    }
}
/**
 * One-time initialization: builds the dispatch/status queues and handlers, selects the
 * routing strategy, then starts two daemon-style loops — a worker-alive checker and a
 * failover pull loop that re-dispatches tasks reset to INIT.
 * NOTE(review): generic type parameters in this method (e.g. the raw {@code List taskInfos})
 * appear to have been stripped by extraction — confirm against the upstream source.
 */
@Override
protected void init() {
    // Idempotent: guard against double initialization.
    if (INITED) {
        return;
    }
    INITED = true;
    // Set up the subtask dispatch buffer queue.
    int queueSize = xAttrs.getQueueSize();
    taskBlockingQueue = new ReqQueue<>(jobInstanceInfo.getJobInstanceId(), queueSize);
    taskBlockingQueue.init();
    // Status report queue, generously sized (100k).
    taskStatusReqQueue = new ReqQueue<>(jobInstanceInfo.getJobInstanceId(), 10 * 10000);
    taskStatusReqQueue.init();
    taskStatusReqBatchHandler = new TMStatusReqHandler<>(jobInstanceInfo.getJobInstanceId(), 1,
        1, 3000, taskStatusReqQueue);
    // Global consumer concurrency limit for dispatching.
    int globalConsumerSize = xAttrs.getGlobalConsumerSize();
    taskDispatchReqHandler = new StreamTaskPushReqHandler<>(jobInstanceInfo.getJobInstanceId(), globalConsumerSize,
        jobInstanceInfo.getAllWorkers().size(), taskBlockingQueue);
    boolean enableShareContainerPool = WorkerConfigUtil.isEnableShareContainerPool();
    // The configured route strategy is honored only with the shared container pool;
    // otherwise fall back to round-robin.
    if (enableShareContainerPool) {
        router = RouterFactory.getRouter(jobInstanceInfo.getAppGroupId(), jobInstanceInfo.getJobId(),
            xAttrs.getRouteType(), jobInstanceInfo.getRouteStrategyContent());
    } else {
        router = RouterFactory.getRouter(jobInstanceInfo.getAppGroupId(), jobInstanceInfo.getJobId(),
            RouteStrategyEnum.ROUND_ROBIN.getValue(), jobInstanceInfo.getRouteStrategyContent());
    }
    // Reset any stale load statistics kept by a load-aware router.
    if (this.router != null && this.router instanceof WorkerLoadRegister) {
        ((WorkerLoadRegister) this.router).clear();
    }
    final String jobIdAndInstanceId = jobInstanceInfo.getJobId() + "_" + jobInstanceInfo.getJobInstanceId();
    // Worker alive check thread: TCP-probes each active worker, then asks it over akka
    // whether this instance is still running there; dead workers are failed over.
    new Thread(new Runnable() {
        @Override
        public void run() {
            while (!instanceStatus.isFinish()) {
                try {
                    for (String workerIdAddr : aliveCheckWorkerSet) {
                        try {
                            String workerAddr = workerIdAddr.split("@")[1];
                            String tokens[] = workerAddr.split(":");
                            String host = tokens[0];
                            int port = Integer.valueOf(tokens[1]);
                            int times = 0;
                            // Reachability probe: up to 3 attempts, 5s connect timeout, 5s pause between.
                            while (times < 3) {
                                Socket socket = new Socket();
                                try {
                                    socket.connect(new InetSocketAddress(host, port), 5000);
                                    LOGGER.info("socket to {}:{} is reachable, times={}", host, port, times);
                                    break;
                                } catch (Exception e) {
                                    LOGGER.info("socket to {}:{} is not reachable, times={}", host, port, times);
                                    Thread.sleep(5000);
                                    times++;
                                } finally {
                                    if (socket != null) {
                                        socket.close();
                                    }
                                }
                            }
                            // 3 failed probes: treat the worker as down and fail over its tasks.
                            if (times >= 3) {
                                LOGGER.warn("worker[{}] is down, start to remove this worker and failover tasks, jobInstanceId={}",
                                    workerIdAddr, jobInstanceInfo.getJobInstanceId());
                                handleWorkerShutdown(workerIdAddr, true);
                                continue;
                            }
                            final long startTime = System.currentTimeMillis();
                            ActorSelection selection = getActorContext().actorSelection(
                                ActorPathUtil.getWorkerHeartbeatRouterPath(workerIdAddr));
                            Worker.MasterCheckWorkerAliveRequest request = Worker.MasterCheckWorkerAliveRequest.newBuilder()
                                .setJobInstanceId(jobInstanceInfo.getJobInstanceId())
                                .setDispatchMode(xAttrs.getTaskDispatchMode())
                                .build();
                            Worker.MasterCheckWorkerAliveResponse response = (Worker.MasterCheckWorkerAliveResponse)
                                FutureUtils.awaitResult(selection, request, 10);
                            if (!response.getSuccess()) {
                                LOGGER.warn("jobInstanceId={} of worker={} is not alive", jobInstanceInfo.getJobInstanceId(),
                                    workerIdAddr, response.getMessage());
                                handleWorkerShutdown(workerIdAddr, true);
                                // destroy containers of worker of PullModel
                                Worker.MasterDestroyContainerPoolRequest destroyContainerPoolRequest = Worker.MasterDestroyContainerPoolRequest.newBuilder()
                                    .setJobInstanceId(jobInstanceInfo.getJobInstanceId())
                                    .setJobId(jobInstanceInfo.getJobId())
                                    .setWorkerIdAddr(workerIdAddr)
                                    .setSerialNum(getSerialNum())
                                    .build();
                                SchedulerxWorker.AtLeastDeliveryRoutingActor.tell(destroyContainerPoolRequest, null);
                            } else {
                                // Record the worker's current load plus the round-trip time.
                                setWorkerLoad(workerIdAddr, response.getMetricsJson(), System.currentTimeMillis() - startTime);
                            }
                        } catch (Exception e) {
                            // Any exception here (even when the TCP probe succeeded) would otherwise
                            // hang the job, so the worker is treated as down and failed over.
                            LOGGER.error("Alive worker check failed.", e);
                            handleWorkerShutdown(workerIdAddr, true);
                            // destroy containers of worker of PullModel
                            Worker.MasterDestroyContainerPoolRequest destroyContainerPoolRequest = Worker.MasterDestroyContainerPoolRequest.newBuilder()
                                .setJobInstanceId(jobInstanceInfo.getJobInstanceId())
                                .setJobId(jobInstanceInfo.getJobId())
                                .setWorkerIdAddr(workerIdAddr)
                                .setSerialNum(getSerialNum())
                                .build();
                            SchedulerxWorker.AtLeastDeliveryRoutingActor.tell(destroyContainerPoolRequest, null);
                        }
                    }
                    // Probe workers every 10 seconds.
                    Thread.sleep(10000);
                } catch (Throwable e) {
                    LOGGER.error("check worker error, jobInstanceId={}", jobInstanceInfo.getJobInstanceId(), e);
                }
            }
        }
    }, "Schedulerx-StreamTaskMaster-check-worker-alive-thread-" + jobIdAndInstanceId).start();
    // Failover pull thread: pages tasks reset to INIT out of persistence and re-enqueues them.
    new Thread(new Runnable() {
        @Override
        public void run() {
            int pageSize = ConfigUtil.getWorkerConfig().getInt(WorkerConstants.MAP_MASTER_PAGE_SIZE, WorkerConstants.MAP_MASTER_PAGE_SIZE_DEFAULT);
            while (!instanceStatus.isFinish()) {
                try {
                    List taskInfos;
                    long startTime = System.currentTimeMillis();
                    taskInfos = taskPersistence.pull(jobInstanceInfo.getJobInstanceId(), pageSize);
                    LOGGER.debug("jobInstanceId={}, pull cost={}ms", jobInstanceInfo.getJobInstanceId(),
                        (System.currentTimeMillis() - startTime));
                    if (taskInfos.isEmpty()) {
                        LOGGER.debug("pull task empty of jobInstanceId={}, sleep 10000 ms ...",
                            jobInstanceInfo.getJobInstanceId());
                        Thread.sleep(10 * 1000);
                    } else {
                        LOGGER.info("jobInstanceId={}, failover retry dispatch taskList, size:{} , cost={}ms",
                            jobInstanceInfo.getJobInstanceId(), taskInfos.size(), System.currentTimeMillis() - startTime);
                        for (TaskInfo taskInfo : taskInfos) {
                            ByteString taskBody = null;
                            if (taskInfo.getTaskBody() != null) {
                                taskBody = ByteString.copyFrom(taskInfo.getTaskBody());
                            }
                            // Rebuild the start request, preserving the task's original batch number.
                            MasterStartContainerRequest.Builder builder = convert2StartContainerRequestBuilder(jobInstanceInfo,
                                taskInfo.getTaskId(), taskInfo.getTaskName(), taskBody, true);
                            builder.setSerialNum(taskInfo.getBatchNo());
                            taskBlockingQueue.submitRequest(builder.build());
                        }
                    }
                } catch (TimeoutException te) {
                    LOGGER.error("pull task timeout, uniqueId:{}", jobIdAndInstanceId, te);
                    logCollector.collect(jobInstanceInfo.getAppGroupId(), jobIdAndInstanceId, ClientLoggerMessage.MAP_INSTANCE_PULL_JOB_FAIL,
                        te, jobInstanceInfo.getGroupId());
                    try {
                        Thread.sleep(10 * 1000);
                    } catch (InterruptedException e) {
                    }
                } catch (Throwable e) {
                    // Non-timeout pull failure is fatal for the instance.
                    updateNewInstanceStatus(getSerialNum(), InstanceStatus.FAILED, ExceptionUtil.getMessage(e));
                    logCollector.collect(jobInstanceInfo.getAppGroupId(), jobIdAndInstanceId, ClientLoggerMessage.MAP_INSTANCE_PULL_JOB_FAIL,
                        e, jobInstanceInfo.getGroupId());
                    LOGGER.error("pull task error, uniqueId:{}", jobIdAndInstanceId, e);
                }
            }
        }
    }, "Schedulerx-StreamTaskMaster-pull-thread-" + jobIdAndInstanceId).start();
}
/**
 * Start the job instance: initializes queues/handlers, creates the root produce task,
 * then starts a batch status-check thread (which runs reduce() per finished batch) and,
 * for non-second-level jobs, a 5s progress report thread.
 *
 * @param jobInstanceInfo the instance to run
 * @throws Exception never thrown directly; failures are caught, logged and fail the instance
 */
@Override
public void submitInstance(final JobInstanceInfo jobInstanceInfo) throws Exception {
    try {
        // Initialize queues, handlers and router (idempotent).
        init();
        // Start the dispatch and status batch handlers.
        taskDispatchReqHandler.start();
        taskStatusReqBatchHandler.start();
        // Create the root produce task and start the produce loop.
        createProduceTask();
        final String jobIdAndInstanceId = jobInstanceInfo.getJobId() + "_" + jobInstanceInfo.getJobInstanceId();
        // Status check thread: detects finished batches, runs reduce(), archives them.
        new Thread(new Runnable() {
            @Override
            public void run() {
                int checkInterval = ConfigUtil.getWorkerConfig().getInt(WorkerConstants.Map_MASTER_STATUS_CHECK_INTERVAL,
                    WorkerConstants.Map_MASTER_STATUS_CHECK_INTERVAL_DEFAULT);
                while (!instanceStatus.isFinish()) {
                    try {
                        Thread.sleep(checkInterval);
                        if (MapUtils.isNotEmpty(streamJobProgressMap)) {
                            // Check each live batch for completion.
                            for (Map.Entry entry : streamJobProgressMap.entrySet()) {
                                boolean allTasksPushed = taskDispatchReqHandler.allTasksPushed(entry.getKey());
                                StreamJobProgressDetail streamJobProgressDetail = entry.getValue();
                                TaskProgressCounter taskProgressCounter = streamJobProgressDetail.getTaskProgressCounter();
                                // Batch may be done when everything was pushed, or when
                                // success+failed already covers the recorded total.
                                if (allTasksPushed || taskProgressCounter.getTotal() <= (taskProgressCounter.getFailed() + taskProgressCounter.getSuccess())) {
                                    InstanceStatus newStatus = taskPersistence.checkInstanceStatus(jobInstanceInfo.getJobInstanceId(), entry.getKey());
                                    if (newStatus.isFinish()) {
                                        ProcessResult processResult = new ProcessResult(true);
                                        // The root batch (key 0) is never reduced.
                                        if (entry.getKey() > 0 && streamJobProcessor.needReduce()) {
                                            try {
                                                // Build the reduce context with this batch's results/statuses.
                                                JobContext context = JobContext.newBuilder()
                                                    .setJobId(jobInstanceInfo.getJobId())
                                                    .setJobInstanceId(jobInstanceInfo.getJobInstanceId())
                                                    .setJobType(jobInstanceInfo.getJobType())
                                                    .setContent(jobInstanceInfo.getContent())
                                                    .setScheduleTime(jobInstanceInfo.getScheduleTime())
                                                    .setDataTime(jobInstanceInfo.getDataTime())
                                                    .setJobParameters(jobInstanceInfo.getParameters())
                                                    .setInstanceParameters(jobInstanceInfo.getInstanceParameters())
                                                    .setUser(jobInstanceInfo.getUser())
                                                    .setTaskResults(taskResultMap.get(entry.getKey()))
                                                    .setTaskStatuses(taskStatusMap.get(entry.getKey()))
                                                    .setSerialNum(entry.getKey())
                                                    .build();
                                                processResult = streamJobProcessor.reduce(context);
                                                if (processResult == null) {
                                                    processResult = new ProcessResult(false, "Reduce can not return NULL.");
                                                }
                                            } catch (Throwable t) {
                                                LOGGER.error("Stream job jobId={} jobInstanceId={} batchNo={} reduce exception.", jobInstanceInfo.getJobId(),
                                                    jobInstanceInfo.getJobInstanceId(), entry.getKey(), t);
                                                processResult = new ProcessResult(false, t.getMessage());
                                            }
                                        }
                                        if (InstanceStatus.FAILED.equals(processResult.getStatus())) {
                                            LOGGER.error("Stream job jobId={} jobInstanceId={} batchNo={} reduce failed. Result:{}", jobInstanceInfo.getJobId(),
                                                jobInstanceInfo.getJobInstanceId(), entry.getKey(), processResult.getResult());
                                        }
                                        // Batch finished: any task failure or a failed reduce fails the batch.
                                        long failedCount = streamJobProgressDetail.getTaskProgressCounter().getFailed();
                                        if (failedCount > 0 || InstanceStatus.FAILED.equals(processResult.getStatus())) {
                                            streamJobProgressDetail.setStatus(InstanceStatus.FAILED.getValue());
                                        } else {
                                            streamJobProgressDetail.setStatus(InstanceStatus.SUCCESS.getValue());
                                        }
                                        streamJobProgressDetail.setEndTime(DateTime.now().getMillis());
                                        // Archive the batch and drop all intermediate state for it.
                                        streamJobProgressHistory.add(streamJobProgressDetail);
                                        streamJobProgressMap.remove(entry.getKey());
                                        taskStatusMap.remove(entry.getKey());
                                        taskResultMap.remove(entry.getKey());
                                    }
                                }
                            }
                        } else {
                            // No live batches: if the produce loop died, fail the whole instance.
                            if (!streamProduceThread.isAlive()) {
                                String result = SchedulerxWorker.INITED ? "Produce task is stopped." : "Worker master shutdown.";
                                updateNewInstanceStatus(getSerialNum(), jobInstanceInfo.getJobInstanceId(), InstanceStatus.FAILED, result);
                            }
                        }
                    } catch (Throwable e) {
                        LOGGER.error("status check error, uniqueId:{}", jobIdAndInstanceId, e);
                    }
                }
            }
        }, "Schedulerx-StreamTaskMaster-status-check-thread-" + jobIdAndInstanceId).start();
        // Progress report thread (skipped for second-level jobs): reports every 5s.
        if (!JobUtil.isSecondTypeJob(TimeType.parseValue(jobInstanceInfo.getTimeType()))) {
            new Thread(new Runnable() {
                @Override
                public void run() {
                    while (!instanceStatus.isFinish()) {
                        Worker.WorkerReportJobInstanceProgressRequest request = Worker.WorkerReportJobInstanceProgressRequest.newBuilder().setJobId(jobInstanceInfo.getJobId()).setJobInstanceId(
                            jobInstanceInfo.getJobInstanceId()).setProgress(getJobInstanceProgress()).build();
                        SERVER_DISCOVERY.getMapMasterRouter().tell(request, null);
                        try {
                            Thread.sleep(5000);
                        } catch (InterruptedException e) {
                            LOGGER.error("report status error, uniqueId={}", jobIdAndInstanceId, e);
                            break;
                        }
                    }
                }
            }, "Schedulerx-StreamTaskMaster-report-progress-thread-" + jobIdAndInstanceId).start();
        }
    } catch (Throwable t) {
        // Anything thrown during startup fails the instance.
        String jobIdAndInstanceId = jobInstanceInfo.getJobId() + "_" + jobInstanceInfo.getJobInstanceId();
        LOGGER.error("submit instance failed.", t);
        updateNewInstanceStatus(getSerialNum(), InstanceStatus.FAILED, ExceptionUtil.getMessage(t));
        logCollector.collect(jobInstanceInfo.getAppGroupId(), jobIdAndInstanceId, ClientLoggerMessage.INSTANCE_INIT_FAIL, t, jobInstanceInfo.getGroupId());
    }
}
/**
 * Create the stream root task and start the produce loop: repeatedly calls the user's
 * {@code StreamJobProcessor.produce()}, registering each returned subtask batch and
 * enqueueing it for dispatch, with simple backpressure based on live batch count.
 * NOTE(review): the {@code Maps.>newHashMap()} arguments below are corrupted — generic
 * type parameters were stripped by extraction; restore from the upstream source.
 *
 * @throws Exception if persisting or registering the root task fails
 */
protected void createProduceTask() throws Exception {
    // Register the root task in the current batch's progress counter.
    initTaskProgress(this.getSerialNum(), WorkerConstants.MAP_TASK_ROOT_NAME, 1);
    final MasterStartContainerRequest startContainerRequest = convert2StartContainerRequest(jobInstanceInfo, aquireTaskId(), WorkerConstants.MAP_TASK_ROOT_NAME, null);
    final String workerIdAddr = getLocalWorkerIdAddr();
    final String workerId = workerIdAddr.split("@")[0];
    final String workerAddr = workerIdAddr.split("@")[1];
    // Persist the root task and record its pulled progress locally.
    batchHandlePersistence(workerId, workerAddr, Lists.newArrayList(startContainerRequest), false);
    batchHandlePulledProgress(Lists.newArrayList(startContainerRequest), Maps.>newHashMap(),
        Maps.>newHashMap(), workerIdAddr);
    // Produce loop runs on its own thread until the instance finishes or the worker shuts down.
    streamProduceThread = new Thread(new Runnable() {
        @Override
        public void run() {
            try {
                // Mark the root task as running on the local worker.
                streamJobProgressMap.get(getSerialNum()).setStatus(TaskStatus.RUNNING.getValue());
                streamJobProgressMap.get(getSerialNum()).getTaskProgressCounter().incrementRunning();
                streamJobProgressMap.get(getSerialNum()).getWorkerProgressMap().get(workerAddr).incrementRunning();
                JobContext context = ContanerUtil.convert2JobContext(startContainerRequest);
                List tasks;
                long produceInterval = xAttrs.getProduceInterval();
                while (!instanceStatus.isFinish() && SchedulerxWorker.INITED) {
                    // TODO 防止执行中批次过多,进行主动限流 -> throttle when too many batches are in flight
                    if (streamJobProgressMap.size() < 15) {
                        // NOTE(review): this records the start time on the key-0 (root) detail,
                        // not the new batch's serialNum — confirm this is intentional.
                        streamJobProgressMap.get(0L).setStartTime(DateTime.now().getMillis());
                        context.setSerialNum(aquireSerialNum());
                        tasks = streamJobProcessor.produce(context);
                        if (!CollectionUtils.isEmpty(tasks)) {
                            // Register the batch, then serialize and enqueue each produced subtask.
                            initTaskProgress(getSerialNum(), "SubTask", tasks.size());
                            for (Object task : tasks) {
                                byte[] taskBody = HessianUtil.toBytes(task);
                                MasterStartContainerRequest taskContainerRequest = convert2StartContainerRequest(jobInstanceInfo,
                                    aquireTaskId(), "SubTask", ByteString.copyFrom(taskBody));
                                taskBlockingQueue.submitRequest(taskContainerRequest);
                            }
                        }
                    }
                    // With more than 5 live batches, sleep multiples of the produce interval.
                    int plus = (streamJobProgressMap.size() / 5) + 1;
                    TimeUnit.SECONDS.sleep(plus * produceInterval);
                }
                // Produce loop exited normally: report the root task as SUCCESS.
                ContainerReportTaskStatusRequest rootTaskStatusRequest = ContainerReportTaskStatusRequest.newBuilder()
                    .setJobId(jobInstanceInfo.getJobId())
                    .setJobInstanceId(jobInstanceInfo.getJobInstanceId())
                    .setTaskId(startContainerRequest.getTaskId())
                    .setStatus(TaskStatus.SUCCESS.getValue())
                    .setWorkerId(workerId)
                    .setTaskName(startContainerRequest.getTaskName())
                    .setWorkerAddr(workerAddr)
                    .setSerialNum(startContainerRequest.getSerialNum())
                    .build();
                updateTaskStatus(rootTaskStatusRequest);
            } catch (Throwable e) {
                // Produce loop crashed: report the root task FAILED (or fail the instance
                // outright if the root request was never built).
                LOGGER.error("stream job produce running failed.", e);
                String workerIdAddr = getLocalWorkerIdAddr();
                final String workerId = workerIdAddr.split("@")[0];
                final String workerAddr = workerIdAddr.split("@")[1];
                if (startContainerRequest != null) {
                    ContainerReportTaskStatusRequest faileReq = ContainerReportTaskStatusRequest.newBuilder()
                        .setJobId(jobInstanceInfo.getJobId())
                        .setJobInstanceId(jobInstanceInfo.getJobInstanceId())
                        .setTaskId(startContainerRequest.getTaskId())
                        .setStatus(TaskStatus.FAILED.getValue())
                        .setWorkerId(workerId)
                        .setTaskName(startContainerRequest.getTaskName())
                        .setWorkerAddr(workerAddr)
                        .setSerialNum(startContainerRequest.getSerialNum())
                        .build();
                    updateTaskStatus(faileReq);
                } else {
                    String jobIdAndInstanceId = jobInstanceInfo.getJobId() + "_" + jobInstanceInfo.getJobInstanceId();
                    updateNewInstanceStatus(getSerialNum(), InstanceStatus.FAILED, ExceptionUtil.getMessage(e));
                    logCollector.collect(jobInstanceInfo.getAppGroupId(), jobIdAndInstanceId, ClientLoggerMessage.INSTANCE_INIT_FAIL,
                        e, jobInstanceInfo.getGroupId());
                }
            }
        }
    }, "Schedulerx-stream-produce-thread-" + this.jobInstanceInfo.getJobInstanceId());
    streamProduceThread.start();
}
/**
 * Validate that a java-type job's processor class extends {@link StreamJobProcessor}.
 * Non-java job types are accepted without inspection.
 *
 * @throws Exception if the configured class does not extend StreamJobProcessor
 */
@Override
protected void checkProcessor() throws Exception {
    // Only java-type jobs carry a class name we can validate up front.
    if (!"java".equalsIgnoreCase(jobInstanceInfo.getJobType())) {
        return;
    }
    JavaProcessorProfile profile = JsonUtil.fromJson(jobInstanceInfo.getContent(), JavaProcessorProfile.class);
    if (!JobProcessorUtil.checkJavaProcessor(profile.getClassName(), StreamJobProcessor.class)) {
        throw new IOException(profile.getClassName() + " must extends StreamJobProcessor");
    }
}
/**
 * Ask every worker of this instance to tear down its container pool for the
 * current serial number. Delivery uses the at-least-once routing actor.
 */
@Override
public void destroyContainerPool() {
    for (String idAddr : jobInstanceInfo.getAllWorkers()) {
        Worker.MasterDestroyContainerPoolRequest destroyRequest = Worker.MasterDestroyContainerPoolRequest.newBuilder()
            .setJobInstanceId(jobInstanceInfo.getJobInstanceId())
            .setSerialNum(getSerialNum())
            .setJobId(jobInstanceInfo.getJobId())
            .setWorkerIdAddr(idAddr)
            .build();
        SchedulerxWorker.AtLeastDeliveryRoutingActor.tell(destroyRequest, null);
    }
}
/**
 * Release all queues/handlers and purge this instance's persisted tasks.
 * Null checks guard against clear() being called before init() completed.
 */
@Override
public void clear() {
    super.clear();
    if (taskStatusReqQueue != null) {
        taskStatusReqQueue.clear();
    }
    if (taskBlockingQueue != null) {
        taskBlockingQueue.clear();
    }
    if (taskDispatchReqHandler != null) {
        taskDispatchReqHandler.clear();
    }
    if (taskStatusReqBatchHandler != null) {
        taskStatusReqBatchHandler.clear();
    }
    // Remove persisted task rows for this instance (best-effort).
    clearTasks(jobInstanceInfo.getJobInstanceId());
}
/**
 * Best-effort removal of this instance's persisted tasks; failures are
 * logged and never propagated to the caller.
 *
 * @param jobInstanceId instance whose persisted tasks are purged
 */
private void clearTasks(long jobInstanceId) {
    try {
        taskPersistence.clearTasks(jobInstanceId);
        LOGGER.info("jobInstanceId={} clearTasks success.", jobInstanceId);
    } catch (Throwable t) {
        LOGGER.error("jobInstanceId={} clearTasks error", jobInstanceId, t);
    }
}
/**
 * Stop both batch handlers. Queues and persisted state are released
 * separately in {@link #clear()}.
 */
@Override
public void stop() {
    if (taskDispatchReqHandler != null) {
        taskDispatchReqHandler.stop();
    }
    if (taskStatusReqBatchHandler != null) {
        taskStatusReqBatchHandler.stop();
    }
    LOGGER.info("jobInstanceId:{}, instance master successfully stop.", jobInstanceInfo.getJobInstanceId());
}
/**
 * Termination hook: only the dispatch handler is stopped here, so queued
 * status reports can still drain.
 */
@Override
protected void doTerminate() {
    if (taskDispatchReqHandler != null) {
        taskDispatchReqHandler.stop();
    }
}
/**
 * Render overall stream progress as JSON: all live batches plus the recently
 * finished batches kept in the bounded history queue.
 * (Restores generic types stripped by extraction; the raw Iterator form could not compile.)
 *
 * @return JSON-serialized {@link StreamJobProgress}
 */
@Override
public String getJobInstanceProgress() {
    Map<Long, StreamJobProgressDetail> detailMap = new LinkedHashMap<>();
    detailMap.putAll(streamJobProgressMap);
    // Append archived batches from the history queue.
    Iterator<?> iterator = this.streamJobProgressHistory.iterator();
    while (iterator.hasNext()) {
        StreamJobProgressDetail streamJobProgressDetail = (StreamJobProgressDetail) iterator.next();
        detailMap.put(streamJobProgressDetail.getBatchNum(), streamJobProgressDetail);
    }
    int queueSize = xAttrs.getQueueSize();
    // Batch 0 is the produce (root) batch. NOTE(review): assumes it is always present
    // in either the live map or the history — confirm, otherwise this NPEs.
    StreamJobProgressDetail produceProcessDetail = detailMap.get(0L);
    return JsonUtil.toJson(new StreamJobProgress(produceProcessDetail.getStatus(), this.totalCounter,
        queueSize, this.taskBlockingQueue.size(), detailMap));
}
/**
 * Mark a worker as unavailable: add it to the invalid set and, for a
 * load-aware router, zero out its available capacity. Never throws.
 *
 * @param workerIdAddr worker identity as "workerId@workerAddr"
 */
private void setWorkerInvalid(String workerIdAddr) {
    try {
        invalidWorkerSet.add(workerIdAddr);
        // instanceof is false for null, so this also covers a missing router.
        if (router instanceof WorkerLoadRegister) {
            ((WorkerLoadRegister) router).setAvailableSize(workerIdAddr, 0);
        }
    } catch (Exception e) {
        LOGGER.warn("Set worker load failed.", e);
    }
}
/**
 * Fail over tasks that were dispatched to a now-dead worker: reset them to INIT in
 * persistence (so the pull thread re-dispatches them) and roll back that worker's
 * pulled/total counters per batch.
 *
 * @param reqs         start requests that had been sent to the dead worker
 * @param workerIdAddr dead worker identity as "workerId@workerAddr"
 */
private void initTaskFailover(final List<MasterStartContainerRequest> reqs, final String workerIdAddr) {
    LOGGER.warn("jobInstanceId={}, worker[{}] is down, try another worker, size:{}",
        jobInstanceInfo.getJobInstanceId(), workerIdAddr, reqs.size());
    final String workerId = workerIdAddr.split("@")[0];
    final String workerAddr = workerIdAddr.split("@")[1];
    List<Long> taskIds = Lists.newArrayList();
    // Per-batch (serialNum) count of tasks being failed over on this worker.
    Map<Long, Integer> affectCntMap = new HashMap<>();
    for (MasterStartContainerRequest req : reqs) {
        taskIds.add(req.getTaskId());
        affectCntMap.merge(req.getSerialNum(), 1, Integer::sum);
    }
    try {
        int affectCnt = taskPersistence.updateTaskStatus(jobInstanceInfo.getJobInstanceId(), taskIds,
            TaskStatus.INIT, workerId, workerAddr);
        LOGGER.warn("jobInstanceId={}, worker[{}] is down, reset task status, size:{}",
            jobInstanceInfo.getJobInstanceId(), workerIdAddr, affectCnt);
        // Roll back this worker's counters batch by batch.
        // BUGFIX: decrement by the per-batch count (entry.getValue()); the original used the
        // overall affected row count for every batch, over-decrementing when reqs span batches.
        for (Map.Entry<Long, Integer> entry : affectCntMap.entrySet()) {
            streamJobProgressMap.get(entry.getKey()).getWorkerProgressMap().get(workerAddr).decPulledAndTotal(entry.getValue());
        }
    } catch (Exception e1) {
        // BUGFIX: include the exception in the log instead of dropping it.
        LOGGER.error("jobInstanceId={}, timeout return init error", jobInstanceInfo.getJobInstanceId(), e1);
        updateNewInstanceStatus(getSerialNum(), InstanceStatus.FAILED, "timeout dispatch return init error");
    }
}
/**
 * Handle a batch-dispatch failure to one worker. Timeouts (with failover enabled)
 * reset the tasks to INIT for re-dispatch; any other error fails the tasks directly.
 * In both cases the worker is marked unavailable afterwards.
 *
 * @param workerIdAddr target worker identity as "workerId@workerAddr"
 * @param reqs         the start requests that failed to dispatch
 * @param e            the dispatch error
 */
private void processDispatchException(final String workerIdAddr, final List<MasterStartContainerRequest> reqs, Throwable e) {
    final String workerId = workerIdAddr.split("@")[0];
    final String workerAddr = workerIdAddr.split("@")[1];
    boolean failover = (xAttrs != null && xAttrs.isFailover());
    if (failover && (e instanceof TimeoutException)) {
        // Timeout: reset tasks to INIT so the failover pull thread re-dispatches them.
        initTaskFailover(reqs, workerIdAddr);
    } else {
        // Other errors (e.g. serialization failure, worker not found): fail the tasks directly.
        String uniqueIdWithoutTask = IdUtil.getUniqueIdWithoutTask(jobInstanceInfo.getJobId(), jobInstanceInfo.getJobInstanceId());
        LOGGER.error("jobInstanceId:{}, batch dispatch Tasks error worker={}, size:{}",
            jobInstanceInfo.getJobInstanceId(), workerIdAddr, reqs.size(), e);
        logCollector.collect(jobInstanceInfo.getAppGroupId(), uniqueIdWithoutTask, ClientLoggerMessage.MAP_INSTANCE_DISPATCH_JOB_FAIL,
            e, jobInstanceInfo.getGroupId());
        for (MasterStartContainerRequest req : reqs) {
            // (duplicate setTaskName builder call removed)
            ContainerReportTaskStatusRequest failedStatusRequest = ContainerReportTaskStatusRequest.newBuilder()
                .setJobId(jobInstanceInfo.getJobId())
                .setJobInstanceId(jobInstanceInfo.getJobInstanceId())
                .setTaskId(req.getTaskId())
                .setStatus(TaskStatus.FAILED.getValue())
                .setResult("Dispatch tasks error. Cause by " + e.getMessage())
                .setWorkerId(workerId)
                .setTaskName(req.getTaskName())
                .setWorkerAddr(workerAddr)
                .setSerialNum(req.getSerialNum())
                .build();
            updateTaskStatus(failedStatusRequest);
        }
    }
    // Mark this worker unavailable for subsequent routing.
    setWorkerInvalid(workerIdAddr);
}
/**
 * Handle the synchronous response of a batch start-containers request.
 * <p>
 * On success the worker is recorded as alive and its load metrics are updated.
 * On failure: if failover is enabled and the worker reports it is not running,
 * the tasks are returned to INIT for re-dispatch; otherwise every task in the
 * batch is marked FAILED. A failing worker is flagged invalid either way.
 *
 * @param workerIdAddr worker identity in "workerId@workerAddr" format
 * @param reqs         the batch of start-container requests that was dispatched
 * @param response     the worker's batch-start response
 * @param startTime    dispatch start timestamp (ms), used to compute cost
 */
private void processDispatchResponse(final String workerIdAddr, final List<MasterStartContainerRequest> reqs,
    Worker.MasterBatchStartContainersResponse response, long startTime) {
    final String workerId = workerIdAddr.split("@")[0];
    final String workerAddr = workerIdAddr.split("@")[1];
    if (response.getSuccess()) {
        LOGGER.info("jobInstanceId={}, batch start containers successfully, size:{} , worker={}, cost={}ms",
                jobInstanceInfo.getJobInstanceId(), reqs.size(), workerIdAddr,
                System.currentTimeMillis() - startTime);
        aliveCheckWorkerSet.add(workerIdAddr);
        String metricsJson = response.getMetricsJson();
        setWorkerLoad(workerIdAddr, metricsJson, System.currentTimeMillis() - startTime);
    } else {
        boolean failover = (xAttrs != null && xAttrs.isFailover());
        if (failover && (response.getMessage() != null && response.getMessage().contains(WorkerConstants.WORKER_NOT_RUNNING_MESSAGE))) {
            // Worker reports it is not running and failover is enabled: reset tasks to INIT.
            initTaskFailover(reqs, workerIdAddr);
        } else {
            LOGGER.error("jobInstanceId={}, batch start containers failed, worker={}, response={}, size:{}",
                    jobInstanceInfo.getJobInstanceId(), workerIdAddr, response.getMessage(), reqs.size());
            // Without failover, mark the batch as FAILED directly.
            for (MasterStartContainerRequest req : reqs) {
                ContainerReportTaskStatusRequest failedStatusRequest = ContainerReportTaskStatusRequest.newBuilder()
                        .setJobId(jobInstanceInfo.getJobId())
                        .setJobInstanceId(jobInstanceInfo.getJobInstanceId())
                        .setTaskId(req.getTaskId())
                        .setStatus(TaskStatus.FAILED.getValue())
                        .setResult(response.getMessage())
                        .setWorkerId(workerId)
                        .setTaskName(req.getTaskName())
                        .setWorkerAddr(workerAddr)
                        .setSerialNum(req.getSerialNum())
                        .build();
                updateTaskStatus(failedStatusRequest);
            }
        }
        // Mark this worker as unavailable so subsequent routing skips it.
        setWorkerInvalid(workerIdAddr);
    }
}
/**
 * Publish a worker's load metrics to the router when it supports load-based routing.
 * <p>
 * Parses the metrics JSON, feeds available pool size, remaining CPU, remaining
 * memory and dispatch cost into the {@link WorkerLoadRegister}, then wakes up any
 * thread waiting on the router. Errors are logged and swallowed: load reporting
 * is best-effort and must never break dispatching.
 *
 * @param workerIdAddr worker identity in "workerId@workerAddr" format
 * @param metricsJson  metrics payload reported by the worker (may be empty)
 * @param cost         dispatch round-trip cost in milliseconds (may be null)
 */
private void setWorkerLoad(String workerIdAddr, String metricsJson, Long cost){
    try {
        // Only routers implementing WorkerLoadRegister consume load data.
        if (!(router instanceof WorkerLoadRegister) || StringUtils.isEmpty(metricsJson)) {
            return;
        }
        WorkerLoadRegister loadRegister = (WorkerLoadRegister) router;
        Metrics metrics = JsonUtil.fromJson(metricsJson, Metrics.class);
        if (metrics != null) {
            LOGGER.info("update worker load, worker={}, sharePoolAvailableSize={}, cost={}", workerIdAddr, metrics.getSharePoolAvailableSize(), cost);
            loadRegister.setAvailableSize(workerIdAddr, metrics.getSharePoolAvailableSize());
            loadRegister.setRemainCpu(workerIdAddr, (int) (metrics.getCpuProcessors() - metrics.getCpuLoad1()));
            loadRegister.setRemainMemory(workerIdAddr, (long) (100 - metrics.getHeap1Usage() * 100));
            if (cost != null) {
                loadRegister.setCost(workerIdAddr, cost);
            }
        }
        // Wake up any dispatcher thread blocked waiting for fresh load data.
        synchronized (router) {
            router.notifyAll();
        }
    } catch (Exception e) {
        LOGGER.warn("Set worker load failed.", e);
    }
}
/**
 * Persist a batch of tasks and synchronously dispatch it to one worker.
 * <p>
 * First records the tasks (create on first dispatch, status update on failover),
 * then sends a batch start request and waits up to 3 units for the response.
 * Response handling and dispatch-timeout handling are delegated; any other
 * failure (e.g. persistence or serialization) marks the whole batch FAILED.
 *
 * @param workerIdAddr target worker in "workerId@workerAddr" format
 * @param reqs         start-container requests for this worker
 * @param isFailover   true when these tasks are being re-dispatched after a worker failure
 * @param dispatchMode dispatch mode of this batch (not consulted in this method)
 */
private void batchHandleContainers(final String workerIdAddr, final List<MasterStartContainerRequest> reqs, boolean isFailover, TaskDispatchMode dispatchMode) {
    final String workerId = workerIdAddr.split("@")[0];
    final String workerAddr = workerIdAddr.split("@")[1];
    LOGGER.debug("jobInstanceId={}, batch dispatch, worker:{}, size:{}", jobInstanceInfo.getJobInstanceId(), workerIdAddr, reqs.size());
    try {
        batchHandlePersistence(workerId, workerAddr, reqs, isFailover);
        final long startTime = System.currentTimeMillis();
        ActorSelection selection = getActorContext().actorSelection(
                ActorPathUtil.getContainerRouterPath(workerIdAddr));
        Worker.MasterBatchStartContainersRequest request = Worker.MasterBatchStartContainersRequest.newBuilder()
                .setJobInstanceId(jobInstanceInfo.getJobInstanceId())
                .setJobId(jobInstanceInfo.getJobId())
                .addAllStartReqs(reqs)
                .build();
        // Load-based routing requires synchronous dispatch so cost/metrics can be fed back.
        try {
            Worker.MasterBatchStartContainersResponse response = (Worker.MasterBatchStartContainersResponse) FutureUtils.awaitResult(selection, request, 3L);
            processDispatchResponse(workerIdAddr, reqs, response, startTime);
        } catch (Throwable e) {
            processDispatchException(workerIdAddr, reqs, e);
        }
    } catch (Throwable exception) {
        // Any other failure (e.g. serialization failure, worker not found):
        // mark the whole batch as FAILED directly.
        String uniqueIdWithoutTask = IdUtil.getUniqueIdWithoutTask(jobInstanceInfo.getJobId(), jobInstanceInfo.getJobInstanceId());
        LOGGER.error("jobInstanceId:{}, batch dispatch Tasks error worker={}, size:{}",
                jobInstanceInfo.getJobInstanceId(), workerIdAddr, reqs.size(), exception);
        logCollector.collect(jobInstanceInfo.getAppGroupId(), uniqueIdWithoutTask, ClientLoggerMessage.MAP_INSTANCE_DISPATCH_JOB_FAIL,
                exception, jobInstanceInfo.getGroupId());
        for (MasterStartContainerRequest req : reqs) {
            ContainerReportTaskStatusRequest failedReq = ContainerReportTaskStatusRequest.newBuilder()
                    .setJobId(jobInstanceInfo.getJobId())
                    .setJobInstanceId(jobInstanceInfo.getJobInstanceId())
                    .setTaskId(req.getTaskId())
                    .setStatus(TaskStatus.FAILED.getValue())
                    .setWorkerId(workerId)
                    .setTaskName(req.getTaskName())
                    .setWorkerAddr(workerAddr)
                    .setSerialNum(req.getSerialNum())
                    .build();
            updateTaskStatus(failedReq);
        }
    }
}
/**
 * Persist the dispatch state of a batch of tasks before sending them to a worker.
 * <p>
 * On the first dispatch the tasks are created; on failover re-dispatch their
 * status is flipped back to RUNNING on the new worker instead.
 *
 * @param workerId   target worker id
 * @param workerAddr target worker address
 * @param reqs       the batch of start-container requests being dispatched
 * @param isFailover true when this is a failover re-dispatch rather than the first dispatch
 * @throws Exception propagated from the persistence layer
 */
private void batchHandlePersistence(String workerId, String workerAddr, List<MasterStartContainerRequest> reqs, boolean isFailover) throws Exception {
    long startTime = System.currentTimeMillis();
    if (!isFailover) {
        // first dispatch
        taskPersistence.createTasks(reqs, workerId, workerAddr);
    } else {
        // failover, not first dispatch: move existing tasks to RUNNING on the new worker
        List<Long> taskIds = Lists.newArrayList();
        for (MasterStartContainerRequest req : reqs) {
            taskIds.add(req.getTaskId());
        }
        taskPersistence.updateTaskStatus(jobInstanceInfo.getJobInstanceId(), taskIds, TaskStatus.RUNNING, workerId, workerAddr);
    }
    LOGGER.debug("jobInstance={}, batch dispatch db cost:{} ms, size:{}",
            jobInstanceInfo.getJobInstanceId(), System.currentTimeMillis() - startTime, reqs.size());
}
/**
* 批量分发子任务
* @param masterStartContainerRequests
*/
public void batchDispatchTasks(List masterStartContainerRequests) {
Map> worker2ReqsWithNormal = Maps.newHashMap();
Map> worker2ReqsWithFailover = Maps.newHashMap();
batchHandlePulledProgress(masterStartContainerRequests, worker2ReqsWithNormal, worker2ReqsWithFailover, null);
//推模型正常启动子任务
for (Entry> entry : worker2ReqsWithNormal.entrySet()) {
batchHandleContainers(entry.getKey(), entry.getValue(), false, TaskDispatchMode.PUSH);
}
//推模型worker挂了,failover子任务到其他worker
for (Entry> entry : worker2ReqsWithFailover.entrySet()) {
batchHandleContainers(entry.getKey(), entry.getValue(), true, TaskDispatchMode.PUSH);
}
}
protected void batchHandlePulledProgress(List masterStartContainerRequests,
Map> worker2ReqsWithNormal,
Map> worker2ReqsWithFailover,
String worker) {
for (MasterStartContainerRequest request : masterStartContainerRequests) {
String workerIdAddr = worker==null?selectWorker():worker;
if (workerIdAddr == null) {
updateNewInstanceStatus(getSerialNum(), InstanceStatus.FAILED, "all worker is down!");
break;
}
String workerAddr = workerIdAddr.split("@")[1];
StreamJobProgressDetail streamJobProgressDetail = this.streamJobProgressMap.get(request.getSerialNum());
if (request.getFailover()) {
if (!worker2ReqsWithFailover.containsKey(workerIdAddr)) {
worker2ReqsWithFailover.put(workerIdAddr, Lists.newArrayList(request));
} else {
worker2ReqsWithFailover.get(workerIdAddr).add(request);
}
} else {
if (!worker2ReqsWithNormal.containsKey(workerIdAddr)) {
worker2ReqsWithNormal.put(workerIdAddr, Lists.newArrayList(request));
} else {
worker2ReqsWithNormal.get(workerIdAddr).add(request);
}
streamJobProgressDetail.getTaskProgressCounter().incrementPulled();
if (request.getSerialNum() > 0) {
// 合计非Root数据
totalCounter.incrementPulled();
}
}
streamJobProgressDetail.setStatus(TaskStatus.PULLED.getValue());
Map workerProgressMap = streamJobProgressDetail.getWorkerProgressMap();
if (workerAddr != null && !workerProgressMap.containsKey(workerAddr)) {
synchronized (this) {
if (!workerProgressMap.containsKey(workerAddr)) {
WorkerProgressCounter workerProgressCounter = new WorkerProgressCounter(workerAddr);
workerProgressMap.put(workerAddr, workerProgressCounter);
}
}
}
workerProgressMap.get(workerAddr).incrementTotal();
workerProgressMap.get(workerAddr).incrementPulled();
}
}
/**
 * Pick the next worker through the configured routing strategy.
 * <p>
 * Maintains a monotonically increasing round-robin cursor; the cursor is reset
 * to zero when it wraps negative or reaches {@code Integer.MAX_VALUE}.
 *
 * @return the selected worker as "workerId@workerAddr", as produced by the router
 */
private synchronized String selectWorker() {
    // Reset the round-robin cursor on overflow / wrap-around.
    if (index < 0 || index >= Integer.MAX_VALUE) {
        index = 0;
    }
    return router.route(jobInstanceInfo.getAppGroupId(), jobInstanceInfo.getJobId(), jobInstanceInfo.getAllWorkers(),
            jobInstanceInfo.getTargetWorkerAddrsMap(), index++, getLocalWorkerIdAddr());
}
/**
 * React to a worker going down.
 * <p>
 * Marks the worker invalid, removes it from the alive set and the instance's
 * worker list, then either resets its tasks to INIT for re-pull (failover
 * enabled) or marks them FAILED, adjusting the per-worker progress counters
 * and releasing dispatch capacity accordingly.
 *
 * @param workerIdAddr the downed worker in "workerId@workerAddr" format
 * @param withFailover whether failover was requested for this shutdown
 */
@Override
public synchronized void handleWorkerShutdown(String workerIdAddr, boolean withFailover) {
this.existInvalidWorker = true;
this.invalidWorkerSet.add(workerIdAddr);
// Already processed (or never seen alive): nothing more to do.
if (!aliveCheckWorkerSet.contains(workerIdAddr)) {
return;
}
String[] workerInfo = workerIdAddr.split("@");
String workerAddr = workerInfo[1];
String workerId = workerInfo[0];
aliveCheckWorkerSet.remove(workerIdAddr);
jobInstanceInfo.getAllWorkers().remove(workerIdAddr);
if (withFailover && (xAttrs != null && xAttrs.isFailover())) {
// Failover enabled: reset tasks to INIT so they will be pulled again.
int affectCnt = taskPersistence.batchUpdateTaskStatus(jobInstanceInfo.getJobInstanceId(), TaskStatus.INIT, workerId, workerAddr);
LOGGER.warn("jobInstanceId={}, failover task number:{}, workerId:{}, workerAddr:{}",
jobInstanceInfo.getJobInstanceId(), affectCnt, workerId, workerAddr);
if (affectCnt > 0) {
// Roll the downed worker's running/pulled counts back out of every stream progress entry.
// NOTE(review): getWorkerProgressMap().get(workerAddr) could be null if this
// worker never received tasks for a given serialNum — confirm upstream guarantees.
for(StreamJobProgressDetail progressDetail:streamJobProgressMap.values()) {
WorkerProgressCounter workerProgressCounter = progressDetail.getWorkerProgressMap().get(workerAddr);
int count = workerProgressCounter.getRunning() + workerProgressCounter.getPulled();
workerProgressCounter.decRunningAndTotal(count);
}
this.taskDispatchReqHandler.release(affectCnt);
}
} else {
// Failover disabled: mark all tasks on this worker as FAILED directly.
int affectCnt = taskPersistence.batchUpdateTaskStatus(jobInstanceInfo.getJobInstanceId(), TaskStatus.FAILED, workerId, workerAddr);
LOGGER.warn("jobInstanceId={}, worker shutdown, failed task number:{}, workerId:{}, workerAddr:{}",
jobInstanceInfo.getJobInstanceId(), affectCnt, workerId, workerAddr);
if (affectCnt > 0) {
for(StreamJobProgressDetail progressDetail:streamJobProgressMap.values()) {
WorkerProgressCounter workerProgressCounter = progressDetail.getWorkerProgressMap().get(workerAddr);
int count = workerProgressCounter.getRunning() + workerProgressCounter.getPulled();
workerProgressCounter.incrementFailed(count);
}
// taskProgress data may become inaccurate here.
this.taskDispatchReqHandler.release(affectCnt);
}
}
}
/**
 * Run the stream job's post-processing hook after the instance finishes.
 * <p>
 * Builds a {@link JobContext} from the instance info and, when the configured
 * processor is a {@link StreamJobProcessor}, invokes its postProcess hook.
 *
 * @param jobInstanceId the finished job instance id
 * @return the processor's result, null when the processor is not a
 *         StreamJobProcessor, or a failed ProcessResult when the hook throws
 */
@Override
public ProcessResult postFinish(long jobInstanceId) {
    ProcessResult reduceResult = null;
    try {
        JobContext context = JobContext.newBuilder()
                .setJobId(jobInstanceInfo.getJobId())
                .setJobInstanceId(jobInstanceId)
                .setJobType(jobInstanceInfo.getJobType())
                .setContent(jobInstanceInfo.getContent())
                .setScheduleTime(jobInstanceInfo.getScheduleTime())
                .setDataTime(jobInstanceInfo.getDataTime())
                .setJobParameters(jobInstanceInfo.getParameters())
                .setInstanceParameters(jobInstanceInfo.getInstanceParameters())
                .setUser(jobInstanceInfo.getUser())
                .setSerialNum(this.getSerialNum())
                .build();
        JobProcessor jobProcessor = JobProcessorUtil.getJavaProcessor(context.getContent());
        if (jobProcessor instanceof StreamJobProcessor) {
            reduceResult = ((StreamJobProcessor) jobProcessor).postProcess(context);
        }
    } catch (Throwable e) {
        // A failed post-process hook is an error, not routine info.
        LOGGER.error("Stream job post finish failed.", e);
        String fixedErrMsg = ExceptionUtil.getFixedErrMsgByThrowable(e, 800);
        return new ProcessResult(false, "Stream job post finish failed:" + fixedErrMsg);
    }
    return reduceResult;
}
/**
 * Kill this job instance: stop the base master, ask every worker to kill its
 * containers, and mark the instance FAILED with the supplied reason.
 *
 * @param mayInterruptIfRunning whether running containers may be interrupted
 * @param reason                human-readable kill reason recorded on the instance
 */
@Override
public void killInstance(boolean mayInterruptIfRunning, String reason) {
    super.killInstance(mayInterruptIfRunning, reason);
    // Worker entries are in ip:port format.
    List<String> allWorkers = jobInstanceInfo.getAllWorkers();
    this.sendKillContainerRequest(mayInterruptIfRunning, allWorkers);
    // Update instance status to stop the on-going process.
    updateNewInstanceStatus(getSerialNum(), jobInstanceInfo.getJobInstanceId(), InstanceStatus.FAILED, reason);
}
}