![JAR search and dependency download from the Maven repository](/logo.png)
com.github.ltsopensource.jobtracker.support.checker.ExecutingDeadJobChecker Maven / Gradle / Ivy
package com.github.ltsopensource.jobtracker.support.checker;
import com.github.ltsopensource.biz.logger.domain.JobLogPo;
import com.github.ltsopensource.biz.logger.domain.LogType;
import com.github.ltsopensource.core.cluster.NodeType;
import com.github.ltsopensource.core.commons.utils.CollectionUtils;
import com.github.ltsopensource.core.commons.utils.QuietUtils;
import com.github.ltsopensource.core.constant.Constants;
import com.github.ltsopensource.core.constant.ExtConfig;
import com.github.ltsopensource.core.constant.Level;
import com.github.ltsopensource.core.exception.RemotingSendException;
import com.github.ltsopensource.core.factory.NamedThreadFactory;
import com.github.ltsopensource.core.json.JSON;
import com.github.ltsopensource.core.logger.Logger;
import com.github.ltsopensource.core.logger.LoggerFactory;
import com.github.ltsopensource.core.protocol.JobProtos;
import com.github.ltsopensource.core.protocol.command.JobAskRequest;
import com.github.ltsopensource.core.protocol.command.JobAskResponse;
import com.github.ltsopensource.core.remoting.RemotingServerDelegate;
import com.github.ltsopensource.core.support.JobDomainConverter;
import com.github.ltsopensource.core.support.SystemClock;
import com.github.ltsopensource.jobtracker.channel.ChannelWrapper;
import com.github.ltsopensource.jobtracker.domain.JobTrackerAppContext;
import com.github.ltsopensource.jobtracker.monitor.JobTrackerMStatReporter;
import com.github.ltsopensource.queue.domain.JobPo;
import com.github.ltsopensource.remoting.AsyncCallback;
import com.github.ltsopensource.remoting.Channel;
import com.github.ltsopensource.remoting.ResponseFuture;
import com.github.ltsopensource.remoting.protocol.RemotingCommand;
import com.github.ltsopensource.remoting.protocol.RemotingProtos;
import com.github.ltsopensource.store.jdbc.exception.DupEntryException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
/**
* @author Robert HG ([email protected]) on 8/19/14.
* 死掉的任务
* 1. 分发出去的,并且执行节点不存在的任务
* 2. 分发出去,执行节点还在, 但是没有在执行的任务
*/
public class ExecutingDeadJobChecker {
private static final Logger LOGGER = LoggerFactory.getLogger(ExecutingDeadJobChecker.class);
private final ScheduledExecutorService FIXED_EXECUTOR_SERVICE = Executors.newScheduledThreadPool(1, new NamedThreadFactory("LTS-ExecutingJobQueue-Fix-Executor", true));
private JobTrackerAppContext appContext;
private JobTrackerMStatReporter stat;
public ExecutingDeadJobChecker(JobTrackerAppContext appContext) {
this.appContext = appContext;
this.stat = (JobTrackerMStatReporter) appContext.getMStatReporter();
}
private AtomicBoolean start = new AtomicBoolean(false);
private ScheduledFuture> scheduledFuture;
public void start() {
try {
if (start.compareAndSet(false, true)) {
int fixCheckPeriodSeconds = appContext.getConfig().getParameter(ExtConfig.JOB_TRACKER_EXECUTING_JOB_FIX_CHECK_INTERVAL_SECONDS, 30);
if (fixCheckPeriodSeconds < 5) {
fixCheckPeriodSeconds = 5;
} else if (fixCheckPeriodSeconds > 5 * 60) {
fixCheckPeriodSeconds = 5 * 60;
}
scheduledFuture = FIXED_EXECUTOR_SERVICE.scheduleWithFixedDelay(new Runnable() {
@Override
public void run() {
try {
// 判断注册中心是否可用,如果不可用,那么直接返回,不进行处理
if (!appContext.getRegistryStatMonitor().isAvailable()) {
return;
}
checkAndFix();
} catch (Throwable t) {
LOGGER.error("Check executing dead job error ", t);
}
}
}, fixCheckPeriodSeconds, fixCheckPeriodSeconds, TimeUnit.SECONDS);
}
LOGGER.info("Executing dead job checker started!");
} catch (Throwable e) {
LOGGER.error("Executing dead job checker start failed!", e);
}
}
private void checkAndFix() throws RemotingSendException {
// 30s没有收到反馈信息,需要去检查这个任务是否还在执行
int maxDeadCheckTime = appContext.getConfig().getParameter(ExtConfig.JOB_TRACKER_EXECUTING_JOB_FIX_DEADLINE_SECONDS, 20);
if (maxDeadCheckTime < 10) {
maxDeadCheckTime = 10;
} else if (maxDeadCheckTime > 5 * 60) {
maxDeadCheckTime = 5 * 60;
}
// 查询出所有死掉的任务 (其实可以直接在数据库中fix的, 查询出来主要是为了日志打印)
// 一般来说这个是没有多大的,我就不分页去查询了
List maybeDeadJobPos = appContext.getExecutingJobQueue().getDeadJobs(
SystemClock.now() - maxDeadCheckTime * 1000);
if (CollectionUtils.isNotEmpty(maybeDeadJobPos)) {
Map> jobMap = new HashMap>();
for (JobPo jobPo : maybeDeadJobPos) {
List jobPos = jobMap.get(jobPo.getTaskTrackerIdentity());
if (jobPos == null) {
jobPos = new ArrayList();
jobMap.put(jobPo.getTaskTrackerIdentity(), jobPos);
}
jobPos.add(jobPo);
}
for (Map.Entry> entry : jobMap.entrySet()) {
String taskTrackerNodeGroup = entry.getValue().get(0).getTaskTrackerNodeGroup();
String taskTrackerIdentity = entry.getKey();
// 去查看这个TaskTrackerIdentity是否存活
ChannelWrapper channelWrapper = appContext.getChannelManager().getChannel(taskTrackerNodeGroup, NodeType.TASK_TRACKER, taskTrackerIdentity);
if (channelWrapper == null && taskTrackerIdentity != null) {
Long offlineTimestamp = appContext.getChannelManager().getOfflineTimestamp(taskTrackerIdentity);
// 已经离线太久,直接修复
if (offlineTimestamp == null || SystemClock.now() - offlineTimestamp > Constants.DEFAULT_TASK_TRACKER_OFFLINE_LIMIT_MILLIS) {
// fixDeadJob
fixDeadJob(entry.getValue());
}
} else {
// 去询问是否在执行该任务
if (channelWrapper != null && channelWrapper.getChannel() != null && channelWrapper.isOpen()) {
askTimeoutJob(channelWrapper.getChannel(), entry.getValue());
}
}
}
}
}
/**
* 向taskTracker询问执行中的任务
*/
private void askTimeoutJob(Channel channel, final List jobPos) {
try {
RemotingServerDelegate remotingServer = appContext.getRemotingServer();
List jobIds = new ArrayList(jobPos.size());
for (JobPo jobPo : jobPos) {
jobIds.add(jobPo.getJobId());
}
JobAskRequest requestBody = appContext.getCommandBodyWrapper().wrapper(new JobAskRequest());
requestBody.setJobIds(jobIds);
RemotingCommand request = RemotingCommand.createRequestCommand(JobProtos.RequestCode.JOB_ASK.code(), requestBody);
remotingServer.invokeAsync(channel, request, new AsyncCallback() {
@Override
public void operationComplete(ResponseFuture responseFuture) {
RemotingCommand response = responseFuture.getResponseCommand();
if (response != null && RemotingProtos.ResponseCode.SUCCESS.code() == response.getCode()) {
JobAskResponse responseBody = response.getBody();
List deadJobIds = responseBody.getJobIds();
if (CollectionUtils.isNotEmpty(deadJobIds)) {
// 睡了1秒再修复, 防止任务刚好执行完正在传输中. 1s可以让完成的正常完成
QuietUtils.sleep(appContext.getConfig().getParameter(ExtConfig.JOB_TRACKER_FIX_EXECUTING_JOB_WAITING_MILLS, 1000L));
for (JobPo jobPo : jobPos) {
if (deadJobIds.contains(jobPo.getJobId())) {
fixDeadJob(jobPo);
}
}
}
}
}
});
} catch (RemotingSendException e) {
LOGGER.error("Ask timeout Job error, ", e);
}
}
private void fixDeadJob(List jobPos) {
for (JobPo jobPo : jobPos) {
fixDeadJob(jobPo);
}
}
private void fixDeadJob(JobPo jobPo) {
try {
// 已经被移除了
if (appContext.getExecutingJobQueue().getJob(jobPo.getJobId()) == null) {
return;
}
jobPo.setGmtModified(SystemClock.now());
jobPo.setTaskTrackerIdentity(null);
jobPo.setIsRunning(false);
// 1. add to executable queue
try {
appContext.getExecutableJobQueue().add(jobPo);
} catch (DupEntryException e) {
LOGGER.warn("ExecutableJobQueue already exist:" + JSON.toJSONString(jobPo));
}
// 2. remove from executing queue
appContext.getExecutingJobQueue().remove(jobPo.getJobId());
JobLogPo jobLogPo = JobDomainConverter.convertJobLog(jobPo);
jobLogPo.setLogTime(SystemClock.now());
jobLogPo.setSuccess(true);
jobLogPo.setLevel(Level.WARN);
jobLogPo.setLogType(LogType.FIXED_DEAD);
appContext.getJobLogger().log(jobLogPo);
stat.incFixExecutingJobNum();
} catch (Throwable t) {
LOGGER.error(t.getMessage(), t);
}
LOGGER.info("checkAndFix dead job ! {}", JSON.toJSONString(jobPo));
}
public void stop() {
try {
if (start.compareAndSet(true, false)) {
scheduledFuture.cancel(true);
FIXED_EXECUTOR_SERVICE.shutdown();
}
LOGGER.info("Executing dead job checker stopped!");
} catch (Throwable t) {
LOGGER.error("Executing dead job checker stop failed!", t);
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy