package com.alibaba.schedulerx.worker.master;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.Socket;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Random;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.math.NumberUtils;
import org.springframework.util.CollectionUtils;
import com.alibaba.schedulerx.common.domain.InstanceStatus;
import com.alibaba.schedulerx.common.domain.JobInstanceInfo;
import com.alibaba.schedulerx.common.domain.MapTaskProgress;
import com.alibaba.schedulerx.common.domain.MapTaskXAttrs;
import com.alibaba.schedulerx.common.domain.Metrics;
import com.alibaba.schedulerx.common.domain.TaskDispatchMode;
import com.alibaba.schedulerx.common.domain.TaskProgressCounter;
import com.alibaba.schedulerx.common.domain.TaskStatus;
import com.alibaba.schedulerx.common.domain.TimeType;
import com.alibaba.schedulerx.common.domain.WorkerProgressCounter;
import com.alibaba.schedulerx.common.domain.enums.RouteStrategyEnum;
import com.alibaba.schedulerx.common.monitor.MetricsCollector;
import com.alibaba.schedulerx.common.util.ConfigUtil;
import com.alibaba.schedulerx.common.util.ExceptionUtil;
import com.alibaba.schedulerx.common.util.HessianUtil;
import com.alibaba.schedulerx.common.util.IdUtil;
import com.alibaba.schedulerx.common.util.IdUtil.IdType;
import com.alibaba.schedulerx.common.util.JobUtil;
import com.alibaba.schedulerx.common.util.JsonUtil;
import com.alibaba.schedulerx.protocol.Worker;
import com.alibaba.schedulerx.protocol.Worker.ContainerReportTaskStatusRequest;
import com.alibaba.schedulerx.protocol.Worker.MasterBatchStartContainersRequest;
import com.alibaba.schedulerx.protocol.Worker.MasterBatchStartContainersResponse;
import com.alibaba.schedulerx.protocol.Worker.MasterCheckWorkerAliveRequest;
import com.alibaba.schedulerx.protocol.Worker.MasterCheckWorkerAliveResponse;
import com.alibaba.schedulerx.protocol.Worker.MasterDestroyContainerPoolRequest;
import com.alibaba.schedulerx.protocol.Worker.MasterKillContainerRequest;
import com.alibaba.schedulerx.protocol.Worker.MasterNotifyWorkerPullRequest;
import com.alibaba.schedulerx.protocol.Worker.MasterNotifyWorkerPullResponse;
import com.alibaba.schedulerx.protocol.Worker.MasterStartContainerRequest;
import com.alibaba.schedulerx.protocol.Worker.WorkerReportJobInstanceProgressRequest;
import com.alibaba.schedulerx.protocol.utils.FutureUtils;
import com.alibaba.schedulerx.worker.SchedulerxWorker;
import com.alibaba.schedulerx.worker.actor.FutureExecutorPool;
import com.alibaba.schedulerx.worker.batch.ReqQueue;
import com.alibaba.schedulerx.worker.batch.TMStatusReqHandler;
import com.alibaba.schedulerx.worker.batch.TaskDispatchReqHandler;
import com.alibaba.schedulerx.worker.domain.JavaProcessorProfile;
import com.alibaba.schedulerx.worker.domain.JobContext;
import com.alibaba.schedulerx.worker.domain.TaskInfo;
import com.alibaba.schedulerx.worker.domain.WorkerConstants;
import com.alibaba.schedulerx.worker.log.LogFactory;
import com.alibaba.schedulerx.worker.log.Logger;
import com.alibaba.schedulerx.worker.logcollector.ClientLoggerMessage;
import com.alibaba.schedulerx.worker.logcollector.LogCollector;
import com.alibaba.schedulerx.worker.logcollector.LogCollectorFactory;
import com.alibaba.schedulerx.worker.master.persistence.TaskPersistence;
import com.alibaba.schedulerx.worker.metrics.WorkerLoadRegister;
import com.alibaba.schedulerx.worker.processor.JobProcessor;
import com.alibaba.schedulerx.worker.processor.JobProcessorEx;
import com.alibaba.schedulerx.worker.processor.MapJobProcessor;
import com.alibaba.schedulerx.worker.processor.MapReduceJobProcessor;
import com.alibaba.schedulerx.worker.processor.ProcessResult;
import com.alibaba.schedulerx.worker.route.Router;
import com.alibaba.schedulerx.worker.route.RouterFactory;
import com.alibaba.schedulerx.worker.util.ActorPathUtil;
import com.alibaba.schedulerx.worker.util.JobProcessorUtil;
import com.alibaba.schedulerx.worker.util.WorkerConfigUtil;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.protobuf.ByteString;
import akka.actor.ActorContext;
import akka.actor.ActorSelection;
import akka.dispatch.OnFailure;
import akka.dispatch.OnSuccess;
import akka.pattern.Patterns;
import akka.util.Timeout;
import scala.concurrent.ExecutionContext;
import scala.concurrent.Future;
import scala.concurrent.duration.Duration;
/**
 * TaskMaster for map (fan-out) jobs: splits a job instance into sub-tasks,
 * dispatches them to workers in push or pull mode, tracks per-task and per-worker
 * progress, fails over tasks from dead workers, and optionally runs a reduce step
 * when the processor is a MapReduceJobProcessor.
 *
 * @author xiaomeng.hxm
 */
public abstract class MapTaskMaster extends TaskMaster {
private static final Logger LOGGER = LogFactory.getLogger(MapTaskMaster.class);
private volatile int index = 0;
protected volatile int pageSize = ConfigUtil.getWorkerConfig().getInt(WorkerConstants.MAP_MASTER_PAGE_SIZE,
WorkerConstants.MAP_MASTER_PAGE_SIZE_DEFAULT);
protected volatile int queueSize = ConfigUtil.getWorkerConfig().getInt(WorkerConstants.MAP_MASTER_QUEUE_SIZE,
WorkerConstants.MAP_MASTER_QUEUE_SIZE_DEFAULT);
private volatile int dispatcherSize = ConfigUtil.getWorkerConfig().getInt(WorkerConstants.MAP_MASTER_DISPATCHER_SIZE,
WorkerConstants.MAP_MASTER_DISPATCHER_SIZE_DEFAULT);
// Batch queue for task status reports.
protected ReqQueue taskStatusReqQueue;
protected TMStatusReqHandler taskStatusReqBatchHandler;
// In-memory queue of sub-tasks: in the push model TaskDispatchReqHandler drains it and pushes tasks to workers; in the pull model a pull thread fetches from it.
protected ReqQueue taskBlockingQueue;
protected TaskDispatchReqHandler taskDispatchReqHandler;
// The root task's failure result is handled separately.
private volatile String rootTaskResult;
protected TaskPersistence taskPersistence;
/**
* taskName -> TaskProgressCounter(total, init, pulled, running, success, failed)
*/
protected Map<String, TaskProgressCounter> taskProgressMap = Maps.newConcurrentMap();
/**
* workerAddr -> WorkerProgressCounter(total, running, success, failed)
*/
protected Map<String, WorkerProgressCounter> workerProgressMap = Maps.newConcurrentMap();
private Map<Long, String> taskResultMap = Maps.newHashMap();
private Map<Long, TaskStatus> taskStatusMap = Maps.newHashMap();
protected MapTaskXAttrs xAttrs = null;
protected volatile AtomicInteger taskCounter = new AtomicInteger(0);
protected ExecutionContext futureExecutor;
private LogCollector logCollector = LogCollectorFactory.get();
protected volatile boolean startStatusCheck = true;
protected volatile boolean needReduce;
protected Router router;
public MapTaskMaster(JobInstanceInfo jobInstanceInfo, ActorContext actorContext) throws Exception {
super(jobInstanceInfo, actorContext);
this.futureExecutor = FutureExecutorPool.INSTANCE.get("MapTaskMaster");
// Randomize the starting index so sub-task dispatch is spread across workers.
boolean random = ConfigUtil.getWorkerConfig().getBoolean(WorkerConstants.MAP_MASTER_DISPATCH_RANDOM,
false);
List<String> allWorkers = jobInstanceInfo.getAllWorkers();
if (CollectionUtils.isEmpty(allWorkers)) {
throw new IllegalArgumentException("workers can't be empty!");
}
if (random) {
index = new Random().nextInt(allWorkers.size());
}
}
@Override
protected void init() {
startStatusCheck = true;
if (INITED){
return;
}
super.init();
if(isWorkerLoadRouter()) {
// New routing mode based on worker load.
router = RouterFactory.getRouter(jobInstanceInfo.getAppGroupId(), jobInstanceInfo.getJobId(),
RouteStrategyEnum.WORKER_LOAD.getValue(), jobInstanceInfo.getRouteStrategyContent());
// Clear any previous routing state.
if (this.router != null && this.router instanceof WorkerLoadRegister) {
((WorkerLoadRegister) this.router).clear();
}
}
final String jobIdAndInstanceId = jobInstanceInfo.getJobId() + "_" + jobInstanceInfo.getJobInstanceId();
LOGGER.info("jobInstanceId={}, map master config, pageSize:{}, queueSize:{}, dispatcherSize:{}, workerSize:{}",
jobIdAndInstanceId, pageSize, queueSize, dispatcherSize, jobInstanceInfo.getAllWorkers().size());
// pull thread
new Thread(new Runnable() {
@Override
public void run() {
while (!isFinished()) {
try {
List<TaskInfo> taskInfos;
long startTime = System.currentTimeMillis();
taskInfos = taskPersistence.pull(jobInstanceInfo.getJobInstanceId(), pageSize);
LOGGER.debug("jobInstanceId={}, pull cost={}ms", jobInstanceInfo.getJobInstanceId(),
(System.currentTimeMillis() - startTime));
if (taskInfos.isEmpty()) {
LOGGER.debug("pull task empty of jobInstanceId={}, sleep 10000 ms ...",
jobInstanceInfo.getJobInstanceId());
Thread.sleep(10 * 1000);
} else {
LOGGER.info("jobInstanceId={}, failover retry dispatch taskList, size:{} , cost={}ms",
jobInstanceInfo.getJobInstanceId(), taskInfos.size(), System.currentTimeMillis() - startTime);
for (TaskInfo taskInfo : taskInfos) {
ByteString taskBody = null;
if (taskInfo.getTaskBody() != null) {
taskBody = ByteString.copyFrom(taskInfo.getTaskBody());
}
MasterStartContainerRequest.Builder builder = convert2StartContainerRequestBuilder(jobInstanceInfo,
taskInfo.getTaskId(), taskInfo.getTaskName(), taskBody, true);
taskBlockingQueue.submitRequest(builder.build());
}
}
} catch (TimeoutException te) {
LOGGER.error("pull task timeout, uniqueId:{}", jobIdAndInstanceId, te);
logCollector.collect(jobInstanceInfo.getAppGroupId(), jobIdAndInstanceId, ClientLoggerMessage.MAP_INSTANCE_PULL_JOB_FAIL,
te, jobInstanceInfo.getGroupId());
try {
Thread.sleep(10 * 1000);
} catch (InterruptedException e) {
//do nothing
}
} catch (Throwable e) {
updateNewInstanceStatus(getSerialNum(), InstanceStatus.FAILED, ExceptionUtil.getMessage(e));
logCollector.collect(jobInstanceInfo.getAppGroupId(), jobIdAndInstanceId, ClientLoggerMessage.MAP_INSTANCE_PULL_JOB_FAIL,
e, jobInstanceInfo.getGroupId());
LOGGER.error("pull task error, uniqueId:{}", jobIdAndInstanceId, e);
}
}
}
}, "Schedulerx-MapTaskMaster-pull-thread-" + jobIdAndInstanceId).start();
// status check thread
new Thread(new Runnable() {
@Override
public void run() {
int checkInterval = ConfigUtil.getWorkerConfig().getInt(WorkerConstants.Map_MASTER_STATUS_CHECK_INTERVAL,
WorkerConstants.Map_MASTER_STATUS_CHECK_INTERVAL_DEFAULT);
while (!isFinished()) {
try {
Thread.sleep(checkInterval);
// Second-level jobs stop the status check after each run completes; otherwise extra iterations would be triggered.
if (!isStartStatusCheck()) {
continue;
}
InstanceStatus newStatus = taskPersistence.checkInstanceStatus(jobInstanceInfo.getJobInstanceId());
if (newStatus.isFinish() && taskDispatchReqHandler.isActive()) {
// Avoid finishing the instance too early when the root task succeeded but sub-tasks are still being created.
Thread.sleep(checkInterval);
continue;
}
String result = getRootTaskResult();
if (newStatus.equals(InstanceStatus.SUCCESS)) {
// The persisted status says SUCCESS; cross-check against the in-memory counters.
int failCnt = 0;
int successCnt = 0;
int totalCnt = 0;
for (TaskProgressCounter taskProgressCounter : taskProgressMap.values()) {
failCnt += taskProgressCounter.getFailed();
successCnt += taskProgressCounter.getSuccess();
totalCnt += taskProgressCounter.getTotal();
}
if (successCnt + failCnt < totalCnt) {
newStatus = InstanceStatus.FAILED;
LOGGER.warn("jobInstanceId={} turn into finish status,"
+ " but count isn't correct, successCnt:{}, failCnt:{}, totalCnt:{}",
jobInstanceInfo.getJobInstanceId(), successCnt, failCnt, totalCnt);
result = "Turn into finish status, but count is wrong, sucCnt:" + successCnt + ", failCnt:" + failCnt +
", totalCnt:" + totalCnt + "; Basically, the reason is that some workers are shutdown.";
} else {
newStatus = failCnt > 0 ? InstanceStatus.FAILED : InstanceStatus.SUCCESS;
}
}
updateNewInstanceStatus(getSerialNum(), jobInstanceInfo.getJobInstanceId(), newStatus, result);
} catch (Throwable e) {
LOGGER.error("status check error, uniqueId:{}", jobIdAndInstanceId, e);
}
}
}
}, "Schedulerx-MapTaskMaster-status-check-thread-" + jobIdAndInstanceId).start();
// job instance progress report thread
if(!JobUtil.isSecondTypeJob(TimeType.parseValue(jobInstanceInfo.getTimeType()))) {
new Thread(new Runnable() {
@Override
public void run() {
while (!isFinished()) {
WorkerReportJobInstanceProgressRequest request = WorkerReportJobInstanceProgressRequest.newBuilder().setJobId(jobInstanceInfo.getJobId()).setJobInstanceId(
jobInstanceInfo.getJobInstanceId()).setProgress(getJobInstanceProgress()).build();
SERVER_DISCOVERY.getMapMasterRouter().tell(request, null);
try {
Thread.sleep(5000);
} catch (InterruptedException e) {
LOGGER.error("report status error, uniqueId={}", jobIdAndInstanceId, e);
break;
}
}
}
}, "Schedulerx-MapTaskMaster-report-progress-thread-" + jobIdAndInstanceId).start();
}
//worker alive check thread
new Thread(new Runnable() {
@Override
public void run() {
while (!isFinished()) {
// aliveCheckWorkerSet.addAll(jobInstanceInfo.getAllWorkers());
// if (aliveCheckWorkerSet.isEmpty()) {
// LOGGER.warn("worker list is empty, jobInstanceId={}", jobInstanceInfo.getJobInstanceId());
// taskPersistence.batchUpdateTaskStatus(jobInstanceInfo.getJobInstanceId(), TaskStatus.FAILED,
// null, null);
// break;
// } else {
try {
for (String workerIdAddr : aliveCheckWorkerSet) {
try {
String workerAddr = workerIdAddr.split("@")[1];
String[] tokens = workerAddr.split(":");
String host = tokens[0];
int port = Integer.parseInt(tokens[1]);
int times = 0;
while (times < 3) {
Socket socket = new Socket();
try {
socket.connect(new InetSocketAddress(host, port), 5000);
LOGGER.info("socket to {}:{} is reachable, times={}", host, port, times);
break;
} catch (Exception e) {
LOGGER.info("socket to {}:{} is not reachable, times={}", host, port, times);
Thread.sleep(5000);
times++;
} finally {
if (socket != null) {
socket.close();
}
}
}
if (times >= 3) {
LOGGER.warn("worker[{}] is down, start to remove this worker and failover tasks, jobInstanceId={}",
workerIdAddr, jobInstanceInfo.getJobInstanceId());
handleWorkerShutdown(workerIdAddr, true);
continue;
}
final long startTime = System.currentTimeMillis();
ActorSelection selection = getActorContext().actorSelection(
ActorPathUtil.getWorkerHeartbeatRouterPath(workerIdAddr));
MasterCheckWorkerAliveRequest request = MasterCheckWorkerAliveRequest.newBuilder()
.setJobInstanceId(jobInstanceInfo.getJobInstanceId())
.setDispatchMode(xAttrs.getTaskDispatchMode())
.build();
MasterCheckWorkerAliveResponse response = (MasterCheckWorkerAliveResponse)
FutureUtils.awaitResult(selection, request, 10);
if (!response.getSuccess()) {
LOGGER.warn("jobInstanceId={} of worker={} is not alive", jobInstanceInfo.getJobInstanceId(),
workerIdAddr, response.getMessage());
handleWorkerShutdown(workerIdAddr, true);
// destroy containers of worker of PullModel
MasterDestroyContainerPoolRequest destroyContainerPoolRequest = MasterDestroyContainerPoolRequest.newBuilder()
.setJobInstanceId(jobInstanceInfo.getJobInstanceId())
.setJobId(jobInstanceInfo.getJobId())
.setWorkerIdAddr(workerIdAddr)
.setSerialNum(getSerialNum())
.build();
SchedulerxWorker.AtLeastDeliveryRoutingActor.tell(destroyContainerPoolRequest, null);
} else {
// Record this worker's current load.
setWorkerLoad(workerIdAddr, response.getMetricsJson(), System.currentTimeMillis()-startTime);
}
} catch (Exception e) {
//TODO confirm whether shutdown is really needed here
// Once an exception occurs while the port is still reachable, tasks can get stuck.
LOGGER.error("Alive worker check failed.", e);
handleWorkerShutdown(workerIdAddr, true);
// destroy containers of worker of PullModel
MasterDestroyContainerPoolRequest destroyContainerPoolRequest = MasterDestroyContainerPoolRequest.newBuilder()
.setJobInstanceId(jobInstanceInfo.getJobInstanceId())
.setJobId(jobInstanceInfo.getJobId())
.setWorkerIdAddr(workerIdAddr)
.setSerialNum(getSerialNum())
.build();
SchedulerxWorker.AtLeastDeliveryRoutingActor.tell(destroyContainerPoolRequest, null);
}
}
// Probe worker liveness every 10 seconds.
Thread.sleep(10000);
} catch (Throwable e) {
LOGGER.error("check worker error, jobInstanceId={}", jobInstanceInfo.getJobInstanceId(), e);
}
// }
}
}
}, "Schedulerx-MapTaskMaster-check-worker-alive-thread-" + jobIdAndInstanceId).start();
// PULL model only
if (xAttrs.getTaskDispatchMode().equals(TaskDispatchMode.PULL.getValue())) {
new Thread(new Runnable() {
@Override
public void run() {
while (!isFinished()) {
for (String workerIdAddr : jobInstanceInfo.getAllWorkers()) {
try {
ActorSelection selection = getActorContext().actorSelection(
ActorPathUtil.getWorkerJobInstancePath(workerIdAddr));
MasterNotifyWorkerPullRequest request = MasterNotifyWorkerPullRequest.newBuilder()
.setJobInstanceId(jobInstanceInfo.getJobInstanceId())
.setPageSize(xAttrs.getPageSize())
.setQueueSize(xAttrs.getQueueSize())
.setTaskMasterAkkaPath(getLocalTaskRouterPath())
.setConsumerSize(xAttrs.getConsumerSize())
.setSerialNum(getSerialNum())
.build();
MasterNotifyWorkerPullResponse response = (MasterNotifyWorkerPullResponse) FutureUtils.awaitResult(
selection, request, 5);
if (!response.getSuccess()) {
String errorMsg = response.getMessage();
LOGGER.error("notify worker pull failed, jobInstanceId={}", jobInstanceInfo.getJobInstanceId(), errorMsg);
updateNewInstanceStatus(getSerialNum(), jobInstanceInfo.getJobInstanceId(),
InstanceStatus.FAILED, errorMsg);
// TODO Continuing to dispatch to workers after an exception here can cause fatal problems.
}
} catch (Throwable e) {
LOGGER.error("notify worker pull error, jobInstanceId={}, worker={}", jobIdAndInstanceId, workerIdAddr, e);
}
}
try {
Thread.sleep(5000);
} catch (InterruptedException e) {
LOGGER.error("", e);
}
}
}
}, "Schedulerx-PullTaskMaster-notify-workers-pull-thread-" + jobIdAndInstanceId).start();
}
}
@Override
public synchronized void submitInstance(JobInstanceInfo jobInstanceInfo) throws Exception {
try {
long startTime = System.currentTimeMillis();
if (dispatcherSize > WorkerConstants.MAP_MASTER_DISPATCHER_SIZE_MAX) {
dispatcherSize = WorkerConstants.MAP_MASTER_DISPATCHER_SIZE_MAX;
}
startBatchHandler();
createRootTask();
LOGGER.info("jobInstanceId={} create root task, cost={}ms", jobInstanceInfo.getJobInstanceId(),
(System.currentTimeMillis() - startTime));
init();
} catch (Throwable e) {
String jobIdAndInstanceId = jobInstanceInfo.getJobId() + "_" + jobInstanceInfo.getJobInstanceId();
LOGGER.error("", e);
updateNewInstanceStatus(getSerialNum(), InstanceStatus.FAILED, ExceptionUtil.getMessage(e));
logCollector.collect(jobInstanceInfo.getAppGroupId(), jobIdAndInstanceId, ClientLoggerMessage.INSTANCE_INIT_FAIL,
e, jobInstanceInfo.getGroupId());
}
}
@Override
public void batchUpdateTaskStatus(Worker.ContainerBatchReportTaskStatuesRequest request) throws Exception {
String workerIdAddr = request.getWorkerId() + "@" + request.getWorkerAddr();
this.setWorkerLoad(workerIdAddr, request.getMetricsJson(), null);
super.batchUpdateTaskStatus(request);
}
@Override
public void updateTaskStatus(ContainerReportTaskStatusRequest request) {
try {
taskStatusReqQueue.submitRequest(request);
} catch (Throwable e) {
LOGGER.error("", e);
}
}
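/**
 * Batch entry point for task status reports: intermediate statuses are collapsed so
 * only each task's final status is persisted, progress counters are updated per task
 * name and per worker, and the batch is then written to task persistence (3 retries).
 */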
@Override
public void batchUpdateTaskStatues(List<ContainerReportTaskStatusRequest> requests) {
Map<Long, ContainerReportTaskStatusRequest> finalTaskStatus = Maps.newHashMap();
try {
for (ContainerReportTaskStatusRequest request : requests) {
TaskStatus taskStatus = TaskStatus.parseValue(request.getStatus());
// Filter out intermediate statuses; keep only the final status per task.
if(!finalTaskStatus.containsKey(request.getTaskId()) || taskStatus.isFinish()){
finalTaskStatus.put(request.getTaskId(), request);
}
String workerAddr = request.getWorkerAddr();
String taskName = request.getTaskName();
LOGGER.debug("report task status:{} from worker:{}, uniqueId:{}", taskStatus.getDescription(),
workerAddr, IdUtil.getUniqueId(request.getJobId(), request.getJobInstanceId(), request.getTaskId()));
// update progress
if (!taskProgressMap.containsKey(taskName)) {
synchronized (this) {
if (!taskProgressMap.containsKey(taskName)) {
TaskProgressCounter taskProgressCounter = new TaskProgressCounter(taskName);
taskProgressMap.put(taskName, taskProgressCounter);
}
}
}
if (workerAddr != null && !workerProgressMap.containsKey(workerAddr)) {
synchronized (this) {
if (!workerProgressMap.containsKey(workerAddr)) {
WorkerProgressCounter workerProgressCounter = new WorkerProgressCounter(workerAddr);
workerProgressMap.put(workerAddr, workerProgressCounter);
if (StringUtils.isNotBlank(request.getTraceId())) {
workerProgressCounter.setTraceId(request.getTraceId());
}
}
}
}
if (taskStatus.equals(TaskStatus.RUNNING)) {
taskProgressMap.get(taskName).incrementRunning();
if (workerAddr != null) {
workerProgressMap.get(workerAddr).incrementRunning();
}
} else if (taskStatus.equals(TaskStatus.SUCCESS)) {
taskProgressMap.get(taskName).incrementSuccess();
if (workerAddr != null) {
workerProgressMap.get(workerAddr).incrementSuccess();
}
} else if (taskStatus.equals(TaskStatus.FAILED)) {
// Logged so that, for second-level map jobs, the failing machine and cause are easy to locate.
LOGGER.error("Report task status:{} result:{} from worker:{}, uniqueId:{}", taskStatus.getDescription(),
request.getResult(), workerAddr, IdUtil.getUniqueId(request.getJobId(), request.getJobInstanceId(), request.getTaskId()));
taskProgressMap.get(taskName).incrementFailed();
if (workerAddr != null) {
workerProgressMap.get(workerAddr).incrementFailed();
if (StringUtils.isNotBlank(request.getTraceId())) {
workerProgressMap.get(workerAddr).setTraceId(request.getTraceId());
}
}
}
//update taskResultMap and taskStatusMap
if (this.needReduce) {
taskResultMap.put(request.getTaskId(), request.getResult());
taskStatusMap.put(request.getTaskId(), taskStatus);
}
}
} catch (Throwable e) {
LOGGER.error("jobInstanceId={}, update progressMap error.", jobInstanceInfo.getJobInstanceId(), e);
}
try {
long startTime = System.currentTimeMillis();
// Pass back the root task's failure reason. A batch may contain both a RUNNING and
// a FAILED request for the root task; take the last one.
int index = requests.size() - 1;
if (index >= 0 && (TaskStatus.FAILED.getValue() == requests.get(index).getStatus())
&& WorkerConstants.MAP_TASK_ROOT_NAME.equals(requests.get(index).getTaskName())) {
setRootTaskResult(requests.get(index).getResult());
}
boolean updateSuccess = false;
for (int i = 0; i < 3; i++) {
// try 3 times
try {
taskPersistence.updateTaskStatues(Lists.newArrayList(finalTaskStatus.values()));
updateSuccess = true;
break;
} catch (Throwable t) {
LOGGER.error("jobInstanceId={}, persistent batch updateTaskStatus error.", t);
}
}
// Persistence failed after all retries.
if (!updateSuccess) {
updateNewInstanceStatus(getSerialNum(), InstanceStatus.FAILED, "persistent batch update TaskStatus error up to 3 times");
}
LOGGER.debug("{} batch update status db cost:{}", jobInstanceInfo.getJobInstanceId(),
System.currentTimeMillis() - startTime);
} catch (Throwable e) {
LOGGER.error("jobInstanceId={}, batch updateTaskStatus error.", jobInstanceInfo.getJobInstanceId(), e);
}
}
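/**
 * Fan-out entry point: counts the sub-tasks into the progress map and queues one
 * MasterStartContainerRequest per task body for dispatch. Returns whether this master
 * machine is currently overloaded.
 *
 * A minimal caller sketch (hypothetical names; assumes task objects are
 * Hessian-serializable, matching how createRootTask builds its body):
 *
 *   List<ByteString> bodies = new ArrayList<>();
 *   for (Object t : mySplitJob()) {
 *       bodies.add(ByteString.copyFrom(HessianUtil.toBytes(t)));
 *   }
 *   boolean overload = map(bodies, "MySubTask");
 */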
public boolean map(List<ByteString> taskList, String taskName) throws Exception {
LOGGER.debug("map taskName:{}, size:{}", taskName, taskList.size());
initTaskProgress(taskName, taskList.size());
for (ByteString taskBody : taskList) {
MasterStartContainerRequest startContainerRequest = convert2StartContainerRequest(jobInstanceInfo,
aquireTaskId(), taskName, taskBody);
taskBlockingQueue.submitRequest(startContainerRequest);
}
// TODO log client info when overloaded
return machineOverload();
}
protected void clearTasks(long jobInstanceId) {
try {
taskPersistence.clearTasks(jobInstanceId);
LOGGER.info("jobInstanceId={} clearTasks success.", jobInstanceId);
} catch (Throwable ex){
LOGGER.error("jobInstanceId={} clearTasks error", jobInstanceId, ex);
}
}
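/**
 * Creates the root task (MAP_TASK_ROOT_NAME) and dispatches it to the local worker;
 * user code then calls map() from the root task to spawn sub-tasks.
 */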
protected void createRootTask() throws Exception {
String taskName = WorkerConstants.MAP_TASK_ROOT_NAME;
ByteString taskBody = ByteString.copyFrom(HessianUtil.toBytes(WorkerConstants.MAP_TASK_ROOT_NAME));
initTaskProgress(taskName, 1);
MasterStartContainerRequest startContainerRequest = convert2StartContainerRequest(jobInstanceInfo, aquireTaskId(),
taskName, taskBody);
batchDispatchTasks(Lists.newArrayList(startContainerRequest), getLocalWorkerIdAddr());
}
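/**
 * Failover for a dead worker: resets the given tasks back to INIT in task persistence
 * so the pull thread re-dispatches them, and rolls back the worker's pulled/total counters.
 */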
private void initTaskFailover(final List<MasterStartContainerRequest> reqs, final String workerIdAddr) {
LOGGER.warn("jobInstanceId={}, worker[{}] is down, try another worker, size:{}",
jobInstanceInfo.getJobInstanceId(), workerIdAddr, reqs.size());
final String workerId = workerIdAddr.split("@")[0];
final String workerAddr = workerIdAddr.split("@")[1];
List<Long> taskIds = Lists.newArrayList();
for (MasterStartContainerRequest req : reqs) {
taskIds.add(req.getTaskId());
}
try {
int affectCnt = taskPersistence.updateTaskStatus(jobInstanceInfo.getJobInstanceId(), taskIds,
TaskStatus.INIT, workerId, workerAddr);
LOGGER.warn("jobInstanceId={}, worker[{}] is down, reset task status, size:{}",
jobInstanceInfo.getJobInstanceId(), workerIdAddr, affectCnt);
// Restore this worker's sub-task counters.
workerProgressMap.get(workerAddr).decPulledAndTotal(affectCnt);
} catch (Exception e1) {
LOGGER.error("jobInstanceId={}, timeout return init error", jobInstanceInfo.getJobInstanceId());
updateNewInstanceStatus(getSerialNum(), InstanceStatus.FAILED, "timeout dispatch return init error");
}
}
private void processDispatchException(final String workerIdAddr, final List<MasterStartContainerRequest> reqs, Throwable e) {
final String workerId = workerIdAddr.split("@")[0];
final String workerAddr = workerIdAddr.split("@")[1];
boolean failover = (xAttrs != null && xAttrs.isFailover());
if (failover && (e instanceof TimeoutException)) {
// Failover: reset the tasks for re-dispatch.
initTaskFailover(reqs, workerIdAddr);
} else {
// Other exceptions (e.g. serialization failure, worker not found): mark the tasks failed directly.
String uniqueIdWithoutTask = IdUtil.getUniqueIdWithoutTask(jobInstanceInfo.getJobId(), jobInstanceInfo.getJobInstanceId());
LOGGER.error("jobInstanceId:{}, batch dispatch Tasks error worker={}, size:{}",
jobInstanceInfo.getJobInstanceId(), workerIdAddr, reqs.size(), e);
logCollector.collect(jobInstanceInfo.getAppGroupId(), uniqueIdWithoutTask, ClientLoggerMessage.MAP_INSTANCE_DISPATCH_JOB_FAIL,
e, jobInstanceInfo.getGroupId());
for (MasterStartContainerRequest req : reqs) {
ContainerReportTaskStatusRequest failedReq = ContainerReportTaskStatusRequest.newBuilder()
.setJobId(jobInstanceInfo.getJobId())
.setJobInstanceId(jobInstanceInfo.getJobInstanceId())
.setTaskId(req.getTaskId())
.setStatus(TaskStatus.FAILED.getValue())
.setResult("Dispatch tasks error. Caused by " + e.getMessage())
.setWorkerId(workerId)
.setTaskName(req.getTaskName())
.setWorkerAddr(workerAddr)
.build();
updateTaskStatus(failedReq);
}
}
// Mark this worker as unavailable.
setWorkerInvalid(workerIdAddr);
}
private void processDispatchResponse(final String workerIdAddr, final List<MasterStartContainerRequest> reqs,
MasterBatchStartContainersResponse response, long startTime) {
final String workerId = workerIdAddr.split("@")[0];
final String workerAddr = workerIdAddr.split("@")[1];
if (response.getSuccess()) {
LOGGER.info("jobInstanceId={}, batch start containers successfully, size:{} , worker={}, cost={}ms",
jobInstanceInfo.getJobInstanceId(), reqs.size(), workerIdAddr,
System.currentTimeMillis() - startTime);
aliveCheckWorkerSet.add(workerIdAddr);
String metricsJson = response.getMetricsJson();
setWorkerLoad(workerIdAddr, metricsJson, System.currentTimeMillis() - startTime);
} else {
boolean failover = (xAttrs != null && xAttrs.isFailover());
if (failover && (response.getMessage() != null && response.getMessage().contains(WorkerConstants.WORKER_NOT_RUNNING_MESSAGE))) {
initTaskFailover(reqs, workerIdAddr);
} else {
LOGGER.error("jobInstanceId={}, batch start containers failed, worker={}, response={}, size:{}",
jobInstanceInfo.getJobInstanceId(), workerIdAddr, response.getMessage(), reqs.size());
// Currently these tasks are marked failed directly.
for (MasterStartContainerRequest req : reqs) {
ContainerReportTaskStatusRequest failedStatusRequest = ContainerReportTaskStatusRequest
.newBuilder()
.setJobId(jobInstanceInfo.getJobId())
.setJobInstanceId(jobInstanceInfo.getJobInstanceId())
.setTaskId(req.getTaskId())
.setStatus(TaskStatus.FAILED.getValue())
.setResult(response.getMessage())
.setWorkerId(workerId)
.setTaskName(req.getTaskName())
.setWorkerAddr(workerAddr)
.build();
updateTaskStatus(failedStatusRequest);
}
}
// Mark this worker as unavailable.
setWorkerInvalid(workerIdAddr);
}
}
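/**
 * Dispatches a batch of start-container requests to one worker: the batch is first
 * persisted, then in PUSH mode sent over Akka either synchronously (load-based routing,
 * so the master can throttle) or asynchronously with success/failure callbacks. In PULL
 * mode workers fetch tasks themselves, so only the alive-check set is updated here.
 */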
private void batchHandleContainers(final String workerIdAddr, final List<MasterStartContainerRequest> reqs, boolean isFailover,
TaskDispatchMode dispatchMode) {
final String workerId = workerIdAddr.split("@")[0];
final String workerAddr = workerIdAddr.split("@")[1];
LOGGER.debug("jobInstanceId={}, batch dispatch, worker:{}, size:{}", jobInstanceInfo.getJobInstanceId(), workerIdAddr, reqs.size());
try {
batchHandlePersistence(workerId, workerAddr, reqs, isFailover);
if (dispatchMode.equals(TaskDispatchMode.PUSH)) {
final long startTime = System.currentTimeMillis();
ActorSelection selection = getActorContext().actorSelection(
ActorPathUtil.getContainerRouterPath(workerIdAddr));
MasterBatchStartContainersRequest request = MasterBatchStartContainersRequest.newBuilder()
.setJobInstanceId(jobInstanceInfo.getJobInstanceId())
.setJobId(jobInstanceInfo.getJobId())
.addAllStartReqs(reqs)
.build();
Timeout timeout = new Timeout(Duration.create(3, TimeUnit.SECONDS));
if (isWorkerLoadRouter()) {
// Load-based routing dispatches synchronously.
try {
MasterBatchStartContainersResponse response = (MasterBatchStartContainersResponse) FutureUtils.awaitResult(selection, request, 3L);
processDispatchResponse(workerIdAddr, reqs, response, startTime);
} catch (Throwable e) {
processDispatchException(workerIdAddr, reqs, e);
}
} else {
// Asynchronous dispatch.
Future<Object> future = Patterns.ask(selection, request, timeout);
// Success callback.
future.onSuccess(new OnSuccess<Object>() {
@Override
public void onSuccess(Object obj) throws Throwable {
MasterBatchStartContainersResponse response = (MasterBatchStartContainersResponse) obj;
processDispatchResponse(workerIdAddr, reqs, response, startTime);
}
}, futureExecutor);
// Timeout or failure callback.
future.onFailure(new OnFailure() {
@Override
public void onFailure(Throwable e) throws Throwable {
processDispatchException(workerIdAddr, reqs, e);
}
}, futureExecutor);
}
} else {
aliveCheckWorkerSet.add(workerIdAddr);
}
} catch (Throwable exception) {
// Other exceptions (e.g. serialization failure, worker not found): mark the tasks failed directly.
String uniqueIdWithoutTask = IdUtil.getUniqueIdWithoutTask(jobInstanceInfo.getJobId(), jobInstanceInfo.getJobInstanceId());
LOGGER.error("jobInstanceId:{}, batch dispatch Tasks error worker={}, size:{}",
jobInstanceInfo.getJobInstanceId(), workerIdAddr, reqs.size(), exception);
logCollector.collect(jobInstanceInfo.getAppGroupId(), uniqueIdWithoutTask, ClientLoggerMessage.MAP_INSTANCE_DISPATCH_JOB_FAIL,
exception, jobInstanceInfo.getGroupId());
for (MasterStartContainerRequest req : reqs) {
ContainerReportTaskStatusRequest failedReq = ContainerReportTaskStatusRequest.newBuilder()
.setJobId(jobInstanceInfo.getJobId())
.setJobInstanceId(jobInstanceInfo.getJobInstanceId())
.setTaskId(req.getTaskId())
.setStatus(TaskStatus.FAILED.getValue())
.setWorkerId(workerId)
.setTaskName(req.getTaskName())
.setWorkerAddr(workerAddr)
.build();
updateTaskStatus(failedReq);
}
}
}
/**
 * Record a worker's load metrics for load-based routing.
 * @param workerIdAddr worker id and address ("workerId@host:port")
 * @param metricsJson worker metrics as JSON
 * @param cost request round-trip cost in milliseconds, or null if unknown
 */
private void setWorkerLoad(String workerIdAddr, String metricsJson, Long cost) {
try {
if (router != null) {
if (router instanceof WorkerLoadRegister && StringUtils.isNotEmpty(metricsJson)) {
Metrics metrics = JsonUtil.fromJson(metricsJson, Metrics.class);
if (metrics != null) {
LOGGER.info("update worker load, worker={}, sharePoolAvailableSize={}, cost={}", workerIdAddr, metrics.getSharePoolAvailableSize(), cost);
((WorkerLoadRegister) router).setAvailableSize(workerIdAddr, metrics.getSharePoolAvailableSize());
((WorkerLoadRegister) router).setRemainCpu(workerIdAddr, (int) (metrics.getCpuProcessors() - metrics.getCpuLoad1()));
((WorkerLoadRegister) router).setRemainMemory(workerIdAddr, (long) (100 - metrics.getHeap1Usage() * 100));
if (cost != null) {
((WorkerLoadRegister) router).setCost(workerIdAddr, cost);
}
}
synchronized (router) {
router.notifyAll();
}
}
}
} catch (Exception e) {
LOGGER.warn("Set worker load failed.", e);
}
}
/**
 * Mark a worker as unavailable for routing.
 * @param workerIdAddr worker id and address ("workerId@host:port")
 */
private void setWorkerInvalid(String workerIdAddr) {
try {
invalidWorkerSet.add(workerIdAddr);
if (router != null) {
if (router instanceof WorkerLoadRegister) {
((WorkerLoadRegister) router).setAvailableSize(workerIdAddr, 0);
}
}
} catch (Exception e) {
LOGGER.warn("Set worker invalid failed.", e);
}
}
private void batchHandlePersistence(String workerId, String workerAddr, List<MasterStartContainerRequest> reqs, boolean isFailover) throws Exception {
long startTime = System.currentTimeMillis();
if (!isFailover) {
// first dispatch
taskPersistence.createTasks(reqs, workerId, workerAddr);
if (this.needReduce) {
for (MasterStartContainerRequest req : reqs) {
this.taskStatusMap.put(req.getTaskId(), TaskStatus.INIT);
}
}
} else {
// failover, not first dispatch
List<Long> taskIds = Lists.newArrayList();
for (MasterStartContainerRequest req : reqs) {
taskIds.add(req.getTaskId());
}
taskPersistence.updateTaskStatus(jobInstanceInfo.getJobInstanceId(), taskIds, TaskStatus.RUNNING, workerId, workerAddr);
}
LOGGER.debug("jobInstance={}, batch dispatch db cost:{} ms, size:{}",
jobInstanceInfo.getJobInstanceId(), System.currentTimeMillis() - startTime, reqs.size());
}
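/**
 * Routes each request to a target worker and groups requests into per-worker normal
 * and failover buckets, updating pulled/total progress counters along the way. If no
 * worker can be selected, the whole instance is marked FAILED.
 */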
protected void batchHandlePulledProgress(List<MasterStartContainerRequest> masterStartContainerRequests,
Map<String, List<MasterStartContainerRequest>> worker2ReqsWithNormal,
Map<String, List<MasterStartContainerRequest>> worker2ReqsWithFailover,
String remoteWorker) {
for (MasterStartContainerRequest request : masterStartContainerRequests) {
String workerIdAddr = ((remoteWorker != null) ? remoteWorker : selectWorker(request.getFailover()));
if (workerIdAddr == null) {
updateNewInstanceStatus(getSerialNum(), InstanceStatus.FAILED, "all workers are down!");
break;
}
String workerAddr = workerIdAddr.split("@")[1];
if (request.getFailover()) {
if (!worker2ReqsWithFailover.containsKey(workerIdAddr)) {
worker2ReqsWithFailover.put(workerIdAddr, Lists.newArrayList(request));
} else {
worker2ReqsWithFailover.get(workerIdAddr).add(request);
}
} else {
if (!worker2ReqsWithNormal.containsKey(workerIdAddr)) {
worker2ReqsWithNormal.put(workerIdAddr, Lists.newArrayList(request));
} else {
worker2ReqsWithNormal.get(workerIdAddr).add(request);
}
// Failover sub-tasks must not be counted again.
taskProgressMap.get(request.getTaskName()).incrementPulled();
}
if (workerAddr != null && !workerProgressMap.containsKey(workerAddr)) {
synchronized (this) {
if (!workerProgressMap.containsKey(workerAddr)) {
WorkerProgressCounter workerProgressCounter = new WorkerProgressCounter(workerAddr);
workerProgressMap.put(workerAddr, workerProgressCounter);
}
}
}
workerProgressMap.get(workerAddr).incrementTotal();
workerProgressMap.get(workerAddr).incrementPulled();
}
}
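/**
 * Push-model dispatch: group the requests per worker, then start containers, handling
 * normal and failover batches separately.
 */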
public void batchDispatchTasks(List<MasterStartContainerRequest> masterStartContainerRequests) {
batchDispatchTasks(masterStartContainerRequests, null);
}
public void batchDispatchTasks(List<MasterStartContainerRequest> masterStartContainerRequests, String remoteWorker) {
Map<String, List<MasterStartContainerRequest>> worker2ReqsWithNormal = Maps.newHashMap();
Map<String, List<MasterStartContainerRequest>> worker2ReqsWithFailover = Maps.newHashMap();
batchHandlePulledProgress(masterStartContainerRequests, worker2ReqsWithNormal, worker2ReqsWithFailover, remoteWorker);
// Push model: start sub-tasks normally.
for (Entry<String, List<MasterStartContainerRequest>> entry : worker2ReqsWithNormal.entrySet()) {
batchHandleContainers(entry.getKey(), entry.getValue(), false, TaskDispatchMode.PUSH);
}
// Push model: a worker died, failover its sub-tasks to other workers.
for (Entry<String, List<MasterStartContainerRequest>> entry : worker2ReqsWithFailover.entrySet()) {
batchHandleContainers(entry.getKey(), entry.getValue(), true, TaskDispatchMode.PUSH);
}
}
public void batchPullTasks(List<MasterStartContainerRequest> masterStartContainerRequests, String workerIdAddr) {
Map<String, List<MasterStartContainerRequest>> worker2ReqsWithNormal = Maps.newHashMap();
Map<String, List<MasterStartContainerRequest>> worker2ReqsWithFailover = Maps.newHashMap();
batchHandlePulledProgress(masterStartContainerRequests, worker2ReqsWithNormal, worker2ReqsWithFailover, workerIdAddr);
// Pull model: persist the tasks.
for (Entry<String, List<MasterStartContainerRequest>> entry : worker2ReqsWithNormal.entrySet()) {
batchHandleContainers(entry.getKey(), entry.getValue(), false, TaskDispatchMode.PULL);
}
// Pull model: update the tasks.
for (Entry<String, List<MasterStartContainerRequest>> entry : worker2ReqsWithFailover.entrySet()) {
batchHandleContainers(entry.getKey(), entry.getValue(), true, TaskDispatchMode.PULL);
}
}
//TODO could this be refactored to avoid lock contention?
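/**
 * Selects a target worker. With a configured router (e.g. load-based) the router decides;
 * otherwise a round-robin index walks the worker list, skipping invalid workers and, when
 * so configured, the master itself (only meaningful with more than one worker). Returns
 * null if no worker is available.
 */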
protected synchronized String selectWorker(Boolean failover) {
List<String> allWorkers = jobInstanceInfo.getAllWorkers();
if (failover && !CollectionUtils.isEmpty(getAliveCheckWorkerSet())) {
allWorkers = new ArrayList<>(getAliveCheckWorkerSet());
allWorkers.removeAll(invalidWorkerSet);
}
String worker;
if (router != null) {
worker = router.route(jobInstanceInfo.getAppGroupId(), jobInstanceInfo.getJobId(),
allWorkers, jobInstanceInfo.getTargetWorkerAddrsMap(), getSerialNum(), getLocalWorkerIdAddr());
} else {
int size = allWorkers.size();
if (size == 0) {
return null;
}
boolean doNext;
int count = 0;
do {
doNext = false;
if (index >= size) {
index = index % size;
}
worker = allWorkers.get(index++);
if (xAttrs != null && allWorkers.size() > 1 && !xAttrs.isExecOnMaster()) {
// Only effective with multiple workers: the master can be configured not to execute sub-tasks.
if (worker.equals(getLocalWorkerIdAddr())) {
doNext = true;
}
}
if (invalidWorkerSet.contains(worker)){
LOGGER.warn("Failover={}, Worker={} is invalid skip. Invalid worker set={}, All workers={}", failover, worker, invalidWorkerSet, allWorkers);
doNext = true;
}
} while (doNext && ++count < size);
}
return worker;
}
@Override
public void killInstance(boolean mayInterruptIfRunning, String reason) {
super.killInstance(mayInterruptIfRunning, reason);
//ip:port format
List<String> allWorkers = jobInstanceInfo.getAllWorkers();
this.sendKillContainerRequest(mayInterruptIfRunning, allWorkers);
//update instance status, stop on-going process
updateNewInstanceStatus(getSerialNum(), jobInstanceInfo.getJobInstanceId(), InstanceStatus.FAILED, reason);
}
@Override
public void destroyContainerPool() {
List<String> allWorkers = jobInstanceInfo.getAllWorkers();
for (String workerIdAddr : allWorkers) {
MasterDestroyContainerPoolRequest request = MasterDestroyContainerPoolRequest.newBuilder()
.setJobInstanceId(jobInstanceInfo.getJobInstanceId())
.setSerialNum(getSerialNum())
.setJobId(jobInstanceInfo.getJobId())
.setWorkerIdAddr(workerIdAddr)
.build();
SchedulerxWorker.AtLeastDeliveryRoutingActor.tell(request, null);
}
}
@Override
public void killTask(String uniqueId, String workerId, String workerAddr) {
String workerIdAddr = workerId + "@" + workerAddr;
try {
ActorSelection selection = getActorContext().actorSelection(
ActorPathUtil.getContainerRouterPath(workerIdAddr));
MasterKillContainerRequest request = MasterKillContainerRequest.newBuilder()
.setJobId(IdUtil.parse(uniqueId, IdType.JOB_ID))
.setJobInstanceId(IdUtil.parse(uniqueId, IdType.JOB_INSTANCE_ID))
.setTaskId(IdUtil.parse(uniqueId, IdType.TASK_ID))
.setMayInterruptIfRunning(true)
.setAppGroupId(jobInstanceInfo.getAppGroupId())
.setGroupId(jobInstanceInfo.getGroupId())
.build();
selection.tell(request, null);
} catch (Throwable e) {
LOGGER.error("send kill request exception, worker:" + workerIdAddr);
}
}
@Override
public String getJobInstanceProgress() {
MapTaskProgress detail = new MapTaskProgress();
detail.setTaskProgress(taskProgressMap.values());
detail.setWorkerProgress(workerProgressMap.values());
return JsonUtil.toJson(detail);
}
@SuppressWarnings("resource")
@Override
public ProcessResult postFinish(long jobInstanceId) {
ProcessResult reduceResult = null;
try {
JobContext context = JobContext.newBuilder()
.setJobId(jobInstanceInfo.getJobId())
.setJobInstanceId(jobInstanceId)
.setJobType(jobInstanceInfo.getJobType())
.setContent(jobInstanceInfo.getContent())
.setScheduleTime(jobInstanceInfo.getScheduleTime())
.setDataTime(jobInstanceInfo.getDataTime())
.setJobParameters(jobInstanceInfo.getParameters())
.setInstanceParameters(jobInstanceInfo.getInstanceParameters())
.setUser(jobInstanceInfo.getUser())
.setTaskResults(taskResultMap)
.setTaskStatuses(taskStatusMap)
.setSerialNum(this.getSerialNum())
.build();
JobProcessor jobProcessor = JobProcessorUtil.getJavaProcessor(context.getContent());
if (needReduce) {
if (jobProcessor instanceof MapReduceJobProcessor) {
boolean runReduceIfFail = ((MapReduceJobProcessor) jobProcessor).runReduceIfFail(context);
if (getInstanceStatus().equals(InstanceStatus.FAILED) && !runReduceIfFail) {
LOGGER.warn("jobInstanceId={} is failed, skip reduce", jobInstanceId);
return null;
}
} else {
reduceResult = new ProcessResult(false);
reduceResult.setResult(String.format("JobProcessor[%s] can not cast to com.alibaba.schedulerx.worker.processor.MapReduceJobProcessor, " +
"you can make CGLIB enabled (e.g. @EnableAspectJAutoProxy(proxyTargetClass = true)) to support reduce operations if spring aop is used.",jobProcessor.getClass().getName()));
return reduceResult;
}
String reduceTaskName = WorkerConstants.REDUCE_TASK_NAME;
if (!taskProgressMap.containsKey(reduceTaskName)) {
TaskProgressCounter taskProgressCounter = new TaskProgressCounter(reduceTaskName);
taskProgressMap.put(reduceTaskName, taskProgressCounter);
}
taskProgressMap.get(reduceTaskName).incrementTotal();
taskProgressMap.get(reduceTaskName).incrementRunning();
String workerAddr = getActorContext().provider().getDefaultAddress().host().get() + ":" +
getActorContext().provider().getDefaultAddress().port().get();
if (!workerProgressMap.containsKey(workerAddr)) {
WorkerProgressCounter workerProgressCounter = new WorkerProgressCounter(workerAddr);
workerProgressMap.put(workerAddr, workerProgressCounter);
}
workerProgressMap.get(workerAddr).incrementTotal();
workerProgressMap.get(workerAddr).incrementRunning();
try {
reduceResult = ((MapReduceJobProcessor)jobProcessor).reduce(context);
} catch (Exception e) {
LOGGER.error("do reduce process failed.", e);
reduceResult = new ProcessResult(false);
reduceResult.setResult("reduce exception: " + ExceptionUtil.getMessage(e));
}
if (reduceResult.getStatus().equals(InstanceStatus.SUCCESS)) {
taskProgressMap.get(reduceTaskName).incrementSuccess();
workerProgressMap.get(workerAddr).incrementSuccess();
} else {
taskProgressMap.get(reduceTaskName).incrementFailed();
workerProgressMap.get(workerAddr).incrementFailed();
}
} else {
if (jobProcessor instanceof JobProcessorEx) {
((JobProcessorEx)jobProcessor).postProcess(context);
}
}
} catch (Throwable e) {
LOGGER.error("Map task postFinish failed.", e);
}
return reduceResult;
}
@Override
public void stop() {
if (taskDispatchReqHandler != null) {
taskDispatchReqHandler.stop();
}
if (taskStatusReqBatchHandler != null) {
taskStatusReqBatchHandler.stop();
}
LOGGER.info("jobInstanceId:{}, instance master successfully stop.", jobInstanceInfo.getJobInstanceId());
}
@Override
protected void doTerminate(){
if (taskDispatchReqHandler != null) {
taskDispatchReqHandler.stop();
}
}
protected void startBatchHandler() {
if (INITED) {
return;
}
// start batch handlers
taskStatusReqQueue.init();
taskStatusReqBatchHandler.start();
taskBlockingQueue.setCapacity(queueSize);
taskBlockingQueue.init();
if (xAttrs.getTaskDispatchMode().equals(TaskDispatchMode.PUSH.getValue())) {
if (isWorkerLoadRouter()) {
// Single-threaded dispatch so the master can rate-limit.
taskDispatchReqHandler.start();
} else {
taskDispatchReqHandler.setWorkThreadNum(dispatcherSize);
taskDispatchReqHandler.setDispatchSize(pageSize * jobInstanceInfo.getAllWorkers().size());
taskDispatchReqHandler.start();
}
}
}
private int getTotalPulledAndRunning() {
int total = 0;
List<TaskProgressCounter> taskCounters = Lists.newArrayList(taskProgressMap.values());
for (TaskProgressCounter taskProgressCounter : taskCounters) {
total += taskProgressCounter.getPulled();
total += taskProgressCounter.getRunning();
}
return total;
}
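// Overload heuristic: heap usage at or above the configured memory threshold, or
// 1-minute CPU load at or above the number of processors.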
private boolean machineOverload() {
boolean memOverload = false;
boolean loadOverload = false;
boolean taskQueueOverload = false;
Metrics vmDetail = MetricsCollector.getMetrics();
if (vmDetail != null) {
memOverload = vmDetail.getHeap1Usage() >= WorkerConstants.USER_MEMORY_PERCENT_DEFAULT;
loadOverload = vmDetail.getCpuLoad1() >= vmDetail.getCpuProcessors();
}
return memOverload || loadOverload || taskQueueOverload;
}
public String getRootTaskResult() {
return rootTaskResult;
}
public void setRootTaskResult(String rootTaskResult) {
this.rootTaskResult = rootTaskResult;
}
private void initTaskProgress(String taskName, int delta) {
if (!taskProgressMap.containsKey(taskName)) {
synchronized (this) {
if (!taskProgressMap.containsKey(taskName)) {
TaskProgressCounter taskProgressCounter = new TaskProgressCounter(taskName);
taskProgressMap.put(taskName, taskProgressCounter);
}
}
}
taskProgressMap.get(taskName).incrementTotal(delta);
}
@Override
public void clear() {
super.clear();
if (taskStatusReqQueue != null) {
taskStatusReqQueue.clear();
}
if (taskBlockingQueue != null) {
taskBlockingQueue.clear();
}
if (taskDispatchReqHandler != null) {
taskDispatchReqHandler.clear();
}
if (taskStatusReqBatchHandler != null) {
taskStatusReqBatchHandler.clear();
}
if (taskProgressMap != null) {
taskProgressMap.clear();
}
if (workerProgressMap != null) {
workerProgressMap.clear();
}
if (taskResultMap != null) {
taskResultMap.clear();
}
if (taskStatusMap != null) {
taskStatusMap.clear();
}
clearTasks(jobInstanceInfo.getJobInstanceId());
taskCounter.set(0);
setStartStatusCheck(false);
}
/**
 * Getter method for property taskProgressMap.
 *
 * @return property value of taskProgressMap
 */
public Map<String, TaskProgressCounter> getTaskProgressMap() {
return taskProgressMap;
}
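/**
 * Pull-model fetch: a worker asks for up to pageSize tasks. Returns an empty list when
 * the global consumer limit is reached or when serialNum does not match the current
 * execution round.
 */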
public synchronized List<MasterStartContainerRequest> syncPullTasks(long serialNum, int pageSize, String workerIdAddr) {
if (getTotalPulledAndRunning() >= xAttrs.getGlobalConsumerSize()) {
return Lists.newArrayList();
} else {
if (this.getSerialNum() == serialNum) {
return taskDispatchReqHandler.syncHandleReqs(pageSize, workerIdAddr);
} else {
return Lists.newArrayList();
}
}
}
@Override
protected void checkProcessor() throws Exception {
if ("java".equalsIgnoreCase(jobInstanceInfo.getJobType())) {
JavaProcessorProfile profile = JsonUtil.fromJson(jobInstanceInfo.getContent(), JavaProcessorProfile.class);
if (!JobProcessorUtil.checkJavaProcessor(profile.getClassName(), MapJobProcessor.class)) {
throw new IOException(profile.getClassName() + " must extend MapJobProcessor or MapReduceJobProcessor");
}
if (JobProcessorUtil.checkJavaProcessor(profile.getClassName(), MapReduceJobProcessor.class)){
this.needReduce = true;
}
}
}
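/**
 * Removes a dead worker from the alive set and the worker list, shrinks the dispatch
 * batch size, then either resets that worker's tasks to INIT (failover enabled) or
 * marks them FAILED.
 */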
@Override
public synchronized void handleWorkerShutdown(String workerIdAddr, boolean withFailover) {
this.existInvalidWorker = true;
this.invalidWorkerSet.add(workerIdAddr);
if (!aliveCheckWorkerSet.contains(workerIdAddr)) {
return;
}
String[] workerInfo = workerIdAddr.split("@");
String workerAddr = workerInfo[1];
String workerId = workerInfo[0];
aliveCheckWorkerSet.remove(workerIdAddr);
jobInstanceInfo.getAllWorkers().remove(workerIdAddr);
// adjust dispatch batch size
taskDispatchReqHandler.setDispatchSize(aliveCheckWorkerSet.size() * pageSize);
// boolean isSecondTypeJob = JobUtil.isSecondTypeJob(TimeType.parseValue(jobInstanceInfo.getTimeType()));
if (withFailover && (xAttrs != null && xAttrs.isFailover())) {
// With failover enabled, reset tasks to INIT so they will be pulled again.
int affectCnt = taskPersistence.batchUpdateTaskStatus(jobInstanceInfo.getJobInstanceId(), TaskStatus.INIT, workerId, workerAddr);
LOGGER.warn("jobInstanceId={}, failover task number:{}, workerId:{}, workerAddr:{}",
jobInstanceInfo.getJobInstanceId(), affectCnt, workerId, workerAddr);
if (affectCnt > 0) {
// recover counter
workerProgressMap.get(workerAddr).decRunningAndTotal(affectCnt);
}
} else {
// Without failover, mark this worker's sub-tasks as failed directly.
int affectCnt = taskPersistence.batchUpdateTaskStatus(jobInstanceInfo.getJobInstanceId(), TaskStatus.FAILED, workerId, workerAddr);
LOGGER.warn("jobInstanceId={}, worker shutdown, failed task number:{}, workerId:{}, workerAddr:{}",
jobInstanceInfo.getJobInstanceId(), affectCnt, workerId, workerAddr);
if (affectCnt > 0) {
workerProgressMap.get(workerAddr).incrementFailed(affectCnt);
// taskProgress counters may become inaccurate here.
}
}
}
public boolean isStartStatusCheck() {
return startStatusCheck;
}
public void setStartStatusCheck(boolean startStatusCheck) {
this.startStatusCheck = startStatusCheck;
}
/**
 * Whether the worker-load-based routing strategy should be used.
 * @return true if load-based routing is enabled
 */
protected boolean isWorkerLoadRouter() {
Integer routerStrategy = ConfigUtil.getWorkerConfig().getInteger(WorkerConstants.MAP_MASTER_ROUTER_STRATEGY, null);
boolean enableShareContainerPool = WorkerConfigUtil.isEnableShareContainerPool();
return enableShareContainerPool && ((xAttrs != null && RouteStrategyEnum.WORKER_LOAD.getValue().equals(xAttrs.getRouteType()))
|| RouteStrategyEnum.WORKER_LOAD.getValue().equals(routerStrategy));
}
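// Dispatch speed examples (derived from the parsing below; illustrative only): "100/S"
// yields 1000 / 100 = 10 ms between dispatches, "60/M" yields 60000 / 60 = 1000 ms; a
// bare number N is interpreted per millisecond, so any N > 1 truncates to a 0 ms delay.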
/**
 * Parse the configured dispatch speed into a delay between task dispatches.
 * @return delay in milliseconds, or null if the speed is unlimited
 */
protected Long parseDispatchSpeed() {
String dispatchSpeed = xAttrs.getDispatchSpeed();
Long dispatchDelay = null;
if (StringUtils.isNotEmpty(dispatchSpeed)) {
Integer speed = null;
TimeUnit timeUnit = TimeUnit.MILLISECONDS;
if (NumberUtils.isNumber(dispatchSpeed)) {
speed = NumberUtils.toInt(dispatchSpeed);
} else {
String[] arr = dispatchSpeed.split("/");
if (arr.length == 2) {
if (NumberUtils.isNumber(arr[0])) {
speed = NumberUtils.toInt(arr[0]);
}
if (StringUtils.isNotEmpty(arr[1])) {
switch (StringUtils.upperCase(arr[1])) {
case "S":
timeUnit = TimeUnit.SECONDS;
break;
case "M":
timeUnit = TimeUnit.MINUTES;
break;
case "H":
timeUnit = TimeUnit.HOURS;
break;
}
}
}
}
if (speed != null && speed > 0) {
dispatchDelay = TimeUnit.MILLISECONDS.convert(1, timeUnit)/speed;
}
}
return dispatchDelay;
}
}