All Downloads are FREE. Search and download functionalities are using the official Maven repository.

cn.ponfee.disjob.supervisor.component.JobManager Maven / Gradle / Ivy

The newest version!
/*
 * Copyright 2022-2024 Ponfee (http://www.ponfee.cn/)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cn.ponfee.disjob.supervisor.component;

import cn.ponfee.disjob.common.base.IdGenerator;
import cn.ponfee.disjob.common.base.Symbol;
import cn.ponfee.disjob.common.collect.Collects;
import cn.ponfee.disjob.common.dag.DAGEdge;
import cn.ponfee.disjob.common.dag.DAGNode;
import cn.ponfee.disjob.common.date.Dates;
import cn.ponfee.disjob.common.exception.Throwables.ThrowingRunnable;
import cn.ponfee.disjob.common.exception.Throwables.ThrowingSupplier;
import cn.ponfee.disjob.common.model.BaseEntity;
import cn.ponfee.disjob.common.tuple.Tuple2;
import cn.ponfee.disjob.common.tuple.Tuple3;
import cn.ponfee.disjob.common.util.Strings;
import cn.ponfee.disjob.core.base.CoreUtils;
import cn.ponfee.disjob.core.base.JobConstants;
import cn.ponfee.disjob.core.base.Worker;
import cn.ponfee.disjob.core.dag.PredecessorInstance;
import cn.ponfee.disjob.core.dto.supervisor.StartTaskParam;
import cn.ponfee.disjob.core.dto.supervisor.StartTaskResult;
import cn.ponfee.disjob.core.dto.supervisor.StopTaskParam;
import cn.ponfee.disjob.core.dto.worker.SplitJobParam;
import cn.ponfee.disjob.core.enums.*;
import cn.ponfee.disjob.core.exception.JobException;
import cn.ponfee.disjob.dispatch.ExecuteTaskParam;
import cn.ponfee.disjob.dispatch.event.TaskDispatchFailedEvent;
import cn.ponfee.disjob.supervisor.base.ExecuteTaskParamBuilder;
import cn.ponfee.disjob.supervisor.base.ModelConverter;
import cn.ponfee.disjob.supervisor.base.TriggerTimes;
import cn.ponfee.disjob.supervisor.configuration.SupervisorProperties;
import cn.ponfee.disjob.supervisor.dag.WorkflowGraph;
import cn.ponfee.disjob.supervisor.dao.mapper.*;
import cn.ponfee.disjob.supervisor.exception.KeyExistsException;
import cn.ponfee.disjob.supervisor.instance.TriggerInstance;
import cn.ponfee.disjob.supervisor.model.*;
import com.google.common.base.Joiner;
import com.google.common.collect.Lists;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.collections4.MapUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.exception.ExceptionUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Qualifier;
import org.springframework.context.event.EventListener;
import org.springframework.stereotype.Component;
import org.springframework.transaction.annotation.Transactional;
import org.springframework.transaction.support.TransactionTemplate;
import org.springframework.util.Assert;

import java.util.*;
import java.util.function.Consumer;
import java.util.function.LongFunction;
import java.util.function.Predicate;
import java.util.stream.Collectors;

import static cn.ponfee.disjob.common.spring.TransactionUtils.*;
import static cn.ponfee.disjob.common.util.Functions.convert;
import static cn.ponfee.disjob.common.util.Functions.doIfTrue;
import static cn.ponfee.disjob.core.base.JobConstants.PROCESS_BATCH_SIZE;
import static cn.ponfee.disjob.supervisor.dao.SupervisorDataSourceConfig.SPRING_BEAN_NAME_TX_MANAGER;
import static cn.ponfee.disjob.supervisor.dao.SupervisorDataSourceConfig.SPRING_BEAN_NAME_TX_TEMPLATE;
import static com.google.common.collect.ImmutableList.of;

/**
 * Job manager
 *
 * @author Ponfee
 */
@Component
public class JobManager {

    private static final Logger LOG = LoggerFactory.getLogger(JobManager.class);
    private static final Comparator> WORKLOAD_COMPARATOR = Comparator.comparingLong(e -> e.b);
    private static final List RS_TERMINABLE = of(RunState.WAITING.value(), RunState.RUNNING.value(), RunState.PAUSED.value());
    private static final List RS_RUNNABLE   = of(RunState.WAITING.value(), RunState.PAUSED.value());
    private static final List RS_PAUSABLE   = of(RunState.WAITING.value(), RunState.RUNNING.value());
    private static final List RS_WAITING    = of(RunState.WAITING.value());
    private static final List RS_RUNNING    = of(RunState.RUNNING.value());
    private static final List RS_PAUSED     = of(RunState.PAUSED.value());
    private static final List ES_EXECUTABLE = of(ExecuteState.WAITING.value(), ExecuteState.PAUSED.value());
    private static final List ES_PAUSABLE   = of(ExecuteState.WAITING.value(), ExecuteState.EXECUTING.value());
    private static final List ES_WAITING    = of(ExecuteState.WAITING.value());
    private static final List ES_EXECUTING  = of(ExecuteState.EXECUTING.value());
    private static final List ES_PAUSED     = of(ExecuteState.PAUSED.value());
    private static final List ES_COMPLETED  = of(ExecuteState.COMPLETED.value());

    // Collaborators injected via the constructor; all final and assigned exactly once.
    private final SupervisorProperties conf;
    private final IdGenerator idGenerator;
    private final SchedJobMapper jobMapper;
    private final SchedDependMapper dependMapper;
    private final SchedInstanceMapper instanceMapper;
    private final SchedWorkflowMapper workflowMapper;
    private final SchedTaskMapper taskMapper;
    private final WorkerClient workerClient;
    // Programmatic transactions for code paths that cannot use @Transactional (e.g. synchronized sections).
    private final TransactionTemplate transactionTemplate;

    /**
     * Constructs the job manager.
     *
     * @param conf           supervisor properties; {@code conf.check()} validates the configuration up front
     * @param idGenerator    generator for job/instance/task ids
     * @param jobMapper      sched_job DAO
     * @param dependMapper   sched_depend DAO
     * @param instanceMapper sched_instance DAO
     * @param workflowMapper sched_workflow DAO
     * @param taskMapper     sched_task DAO
     * @param workerClient   client for worker discovery, verification and task dispatch
     * @param txTemplate     transaction template bound to the supervisor datasource
     */
    public JobManager(SupervisorProperties conf,
                      IdGenerator idGenerator,
                      SchedJobMapper jobMapper,
                      SchedDependMapper dependMapper,
                      SchedInstanceMapper instanceMapper,
                      SchedWorkflowMapper workflowMapper,
                      SchedTaskMapper taskMapper,
                      WorkerClient workerClient,
                      @Qualifier(SPRING_BEAN_NAME_TX_TEMPLATE) TransactionTemplate txTemplate) {
        // fail fast on invalid supervisor configuration
        conf.check();
        this.conf = conf;
        this.idGenerator = idGenerator;
        this.jobMapper = jobMapper;
        this.dependMapper = dependMapper;
        this.instanceMapper = instanceMapper;
        this.workflowMapper = workflowMapper;
        this.taskMapper = taskMapper;
        this.workerClient = workerClient;
        this.transactionTemplate = txTemplate;
    }

    // ------------------------------------------------------------------non-database operation methods

    /** Returns a newly generated globally unique id. */
    public long generateId() {
        final long newId = idGenerator.generateId();
        return newId;
    }

    /**
     * Splits a job execution into tasks via the worker client.
     *
     * @param group      the job group
     * @param instanceId the instance id the tasks belong to
     * @param param      the split parameter
     * @return the list of split tasks
     * @throws JobException if the worker-side split fails
     */
    public List<SchedTask> splitJob(String group, long instanceId, SplitJobParam param) throws JobException {
        return workerClient.splitJob(group, instanceId, param, this::generateId, conf.getMaximumSplitTaskSize());
    }

    /**
     * Dispatches the given tasks for first-time execution.
     *
     * @param job      the sched job
     * @param instance the sched instance
     * @param tasks    the tasks to dispatch
     * @return {@code true} if the dispatch succeeded
     */
    public boolean dispatch(SchedJob job, SchedInstance instance, List<SchedTask> tasks) {
        return dispatch(false, job, instance, tasks);
    }

    /**
     * Re-dispatches tasks that were already dispatched once (e.g. after shutdown resume).
     *
     * @param job      the sched job
     * @param instance the sched instance
     * @param tasks    the tasks to re-dispatch
     * @return {@code true} if the dispatch succeeded
     */
    public boolean redispatch(SchedJob job, SchedInstance instance, List<SchedTask> tasks) {
        return dispatch(true, job, instance, tasks);
    }

    // ------------------------------------------------------------------database single operation without spring transactional

    /** Disables the given job (single-statement update, no Spring transaction required). */
    public void disableJob(SchedJob job) {
        jobMapper.disable(job);
    }

    /** Updates the job's next trigger time; returns {@code true} when exactly one row was affected. */
    public boolean updateJobNextTriggerTime(SchedJob job) {
        int affectedRows = jobMapper.updateNextTriggerTime(job);
        return isOneAffectedRow(affectedRows);
    }

    /** Updates the job's next scan time (single-statement update, no Spring transaction required). */
    public void updateJobNextScanTime(SchedJob job) {
        jobMapper.updateNextScanTime(job);
    }

    /**
     * Updates the instance's next scan time using its version for optimistic concurrency.
     *
     * @param inst         the sched instance (instanceId and version are read)
     * @param nextScanTime the new next scan time; must not be null
     * @return {@code true} when exactly one row was affected
     */
    public boolean updateInstanceNextScanTime(SchedInstance inst, Date nextScanTime) {
        Assert.notNull(nextScanTime, "Instance next scan time cannot be null.");
        int affectedRows = instanceMapper.updateNextScanTime(inst.getInstanceId(), nextScanTime, inst.getVersion());
        return isOneAffectedRow(affectedRows);
    }

    /**
     * Persists a task's execution snapshot (savepoint) for later resume.
     *
     * @param taskId          the task id
     * @param worker          the worker currently executing the task
     * @param executeSnapshot the snapshot payload; length-checked against the CLOB limit
     * @return {@code true} when exactly one row was affected
     */
    public boolean savepoint(long taskId, String worker, String executeSnapshot) {
        CoreUtils.checkClobMaximumLength(executeSnapshot, "Execute snapshot");
        int affectedRows = taskMapper.savepoint(taskId, worker, executeSnapshot);
        return isOneAffectedRow(affectedRows);
    }

    // ------------------------------------------------------------------must in transaction active(propagation Mandatory)

    /**
     * Saves a workflow lead instance and its workflow edge rows.
     * Must be called inside an active transaction (asserted).
     *
     * @param instance  the lead instance (must satisfy {@code isWorkflowLead()})
     * @param workflows the workflow edge rows to batch-insert
     */
    public void saveLeadInstanceAndWorkflows(SchedInstance instance, List<SchedWorkflow> workflows) {
        Assert.isTrue(instance.isWorkflowLead(), () -> "Must be workflow lead instance: " + instance);
        assertDoInTransaction();
        instanceMapper.insert(instance.fillUniqueFlag());
        Collects.batchProcess(workflows, workflowMapper::batchInsert, PROCESS_BATCH_SIZE);
    }

    /**
     * Saves an instance and its tasks.
     * Must be called inside an active transaction (asserted).
     *
     * @param instance the sched instance
     * @param tasks    the tasks to batch-insert
     */
    public void saveInstanceAndTasks(SchedInstance instance, List<SchedTask> tasks) {
        assertDoInTransaction();
        instanceMapper.insert(instance.fillUniqueFlag());
        Collects.batchProcess(tasks, taskMapper::batchInsert, PROCESS_BATCH_SIZE);
    }

    // ------------------------------------------------------------------database operation within spring @transactional

    /**
     * Adds a new sched job: validates it, checks name uniqueness within the group,
     * verifies it against a worker, assigns a generated id, parses the trigger config
     * and inserts the row.
     *
     * @param job the job to add; its createdBy is copied to updatedBy
     * @return the generated job id
     * @throws JobException       if worker-side verification fails
     * @throws KeyExistsException if a job with the same name exists in the group
     */
    @Transactional(transactionManager = SPRING_BEAN_NAME_TX_MANAGER, rollbackFor = Exception.class)
    public Long addJob(SchedJob job) throws JobException {
        job.setUpdatedBy(job.getCreatedBy());
        job.verifyForAdd(conf.getMaximumJobRetryCount());
        // job name must be unique within its group
        if (jobMapper.getJobId(job.getGroup(), job.getJobName()) != null) {
            throw new KeyExistsException("Exists job name: " + job.getJobName());
        }
        workerClient.verifyJob(job);
        job.setJobId(generateId());
        // parses trigger value and computes next trigger time (or dependency edges for DEPEND type)
        parseTriggerConfig(job);

        jobMapper.insert(job);
        return job.getJobId();
    }

    /**
     * Updates an existing sched job: validates it, re-verifies with a worker when the
     * executor changed, enforces name uniqueness and group immutability, and re-parses
     * the trigger config when the trigger type/value changed.
     *
     * @param job the job with updated fields; jobId identifies the row
     * @throws JobException if worker-side verification fails
     */
    @Transactional(transactionManager = SPRING_BEAN_NAME_TX_MANAGER, rollbackFor = Exception.class)
    public void updateJob(SchedJob job) throws JobException {
        job.verifyForUpdate(conf.getMaximumJobRetryCount());
        if (job.requiredUpdateExecutor()) {
            workerClient.verifyJob(job);
        }
        // name uniqueness check must exclude the job being updated
        Long jobId0 = jobMapper.getJobId(job.getGroup(), job.getJobName());
        if (jobId0 != null && !jobId0.equals(job.getJobId())) {
            throw new IllegalArgumentException("Exists job name: " + job.getJobName());
        }

        SchedJob dbJob = jobMapper.get(job.getJobId());
        Assert.notNull(dbJob, () -> "Sched job id not found " + job.getJobId());
        Assert.isTrue(dbJob.getGroup().equals(job.getGroup()), "Job group cannot be modify.");
        if (job.requiredUpdateTrigger(dbJob.getTriggerType(), dbJob.getTriggerValue())) {
            // trigger changed: drop old dependency edges and rebuild trigger config
            dependMapper.deleteByChildJobId(job.getJobId());
            parseTriggerConfig(job);
        }

        assertOneAffectedRow(jobMapper.update(job), "Update sched job fail or conflict.");
    }

    /**
     * Soft-deletes a job and removes its dependency edges in both directions.
     * The job must already be disabled.
     *
     * @param jobId the job id to delete
     */
    @Transactional(transactionManager = SPRING_BEAN_NAME_TX_MANAGER, rollbackFor = Exception.class)
    public void deleteJob(long jobId) {
        SchedJob job = jobMapper.get(jobId);
        Assert.notNull(job, () -> "Job id not found: " + jobId);
        Assert.state(!job.isEnabled(), "Please disable job before delete this job.");
        assertOneAffectedRow(jobMapper.softDelete(jobId), "Delete sched job fail or conflict.");
        // remove dependency edges where this job is the parent or the child
        dependMapper.deleteByParentJobId(jobId);
        dependMapper.deleteByChildJobId(jobId);
    }

    /**
     * Toggles a job between ENABLED and DISABLED; recomputes the next trigger time
     * when the job becomes enabled.
     *
     * @param jobId   the job id
     * @param toState the target state
     * @return {@code true} if the state was changed
     */
    @Transactional(transactionManager = SPRING_BEAN_NAME_TX_MANAGER, rollbackFor = Exception.class)
    public boolean changeJobState(long jobId, JobState toState) {
        // `1 ^ value` flips between 0 and 1 to derive the expected current state
        // (assumes JobState values are exactly 0/1 — TODO confirm against JobState enum)
        boolean updated = isOneAffectedRow(jobMapper.updateState(jobId, toState.value(), 1 ^ toState.value()));
        if (updated && toState == JobState.ENABLED) {
            updateNextTriggerTime(jobMapper.get(jobId));
        }
        return updated;
    }

    /**
     * Triggers a job immediately with MANUAL run type.
     *
     * @param jobId the job id
     * @throws JobException if creating the trigger instance fails
     */
    @Transactional(transactionManager = SPRING_BEAN_NAME_TX_MANAGER, rollbackFor = Exception.class)
    public void manualTriggerJob(long jobId) throws JobException {
        triggerJob(getRequiredJob(jobId), RunType.MANUAL, System.currentTimeMillis());
    }

    /**
     * Triggers a job on schedule: only fires when this node successfully advances
     * the job's next trigger time (exactly one row affected).
     *
     * @param job         the sched job
     * @param triggerTime the scheduled trigger time
     * @throws JobException if creating the trigger instance fails
     */
    @Transactional(transactionManager = SPRING_BEAN_NAME_TX_MANAGER, rollbackFor = Exception.class)
    public void scheduleTriggerJob(SchedJob job, long triggerTime) throws JobException {
        if (!isOneAffectedRow(jobMapper.updateNextTriggerTime(job))) {
            // the next-trigger-time update did not take effect: skip triggering
            return;
        }
        triggerJob(job, RunType.SCHEDULE, triggerTime);
    }

    /**
     * Set or clear task worker
     *
     * @param worker  the worker
     * @param taskIds the task id list
     */
    @Transactional(transactionManager = SPRING_BEAN_NAME_TX_MANAGER, rollbackFor = Exception.class)
    public void updateTaskWorker(String worker, List<Long> taskIds) {
        if (CollectionUtils.isNotEmpty(taskIds)) {
            // Sort for prevent sql deadlock: Deadlock found when trying to get lock; try restarting transaction
            Collections.sort(taskIds);
            Lists.partition(taskIds, PROCESS_BATCH_SIZE).forEach(ids -> taskMapper.batchUpdateWorker(worker, ids));
        }
    }

    // ------------------------------------------------------------------database operation within spring TransactionTemplate

    /**
     * Listen task dispatch failed event
     * 

{@code `@Async`}需要标注{@code `@EnableAsync`}来启用,默认使用的是`SimpleAsyncTaskExecutor`线程池,会为每个任务创建一个新线程(慎用默认的线程池) * * @param event the TaskDispatchFailedEvent */ @EventListener public void processTaskDispatchFailedEvent(TaskDispatchFailedEvent event) { transactionTemplate.executeWithoutResult(status -> { long taskId = event.getTaskId(); if (!shouldTerminateDispatchFailedTask(taskId)) { return; } if (!taskMapper.terminate(taskId, null, ExecuteState.DISPATCH_FAILED, ExecuteState.WAITING, null, null)) { LOG.warn("Terminate dispatch failed task unsuccessful: {}", taskId); } }); } /** * Starts the task * * @param param the start task param * @return start result */ public StartTaskResult startTask(StartTaskParam param) { param.check(); return doInSynchronizedTransaction0(param.getInstanceId(), param.getWnstanceId(), lockInstanceId -> { String startRequestId = param.getStartRequestId(); LOG.info("Task trace [{}] starting: {}, {}", param.getTaskId(), param.getWorker(), startRequestId); Date now = new Date(); // 如果先`get`查一次然后`start`,最后再`get`查的话数据可能会被缓存,返回`runState=10` // 若先`instanceMapper.lock(lockInstanceId)`,不会出现以上问题 if (isNotAffectedRow(taskMapper.start(param.getTaskId(), param.getWorker(), startRequestId, now))) { if (!taskMapper.checkStartIdempotent(param.getTaskId(), param.getWorker(), startRequestId)) { return StartTaskResult.failure("Start task failure."); } LOG.info("Start task idempotent: {}, {}, {}", param.getTaskId(), param.getWorker(), startRequestId); } if (isNotAffectedRow(instanceMapper.start(param.getInstanceId(), now))) { SchedInstance instance = instanceMapper.get(param.getInstanceId()); Assert.state(instance != null && instance.isRunning(), () -> "Start instance failure: " + instance); } return ModelConverter.toStartTaskResult(taskMapper.get(param.getTaskId())); }); } /** * Stops the task * * @param param the stop task param * @return {@code true} if stopped task successful */ public boolean stopTask(StopTaskParam param) { param.check(); Operation ops = 
param.getOperation(); LOG.info("Task trace [{}] stopping: {}, {}, {}", param.getTaskId(), ops, param.getToState(), param.getWorker()); return doInSynchronizedTransaction(param.getInstanceId(), param.getWnstanceId(), instance -> { Assert.isTrue(!instance.isWorkflowLead(), () -> "Stop task instance cannot be workflow lead: " + instance); if (instance.isTerminal()) { return false; } ExecuteState toState = param.getToState(); Date executeEndTime = toState.isTerminal() ? new Date() : null; String errMsg = param.getErrorMsg(); if (!taskMapper.terminate(param.getTaskId(), param.getWorker(), toState, ExecuteState.EXECUTING, executeEndTime, errMsg)) { // usual is worker invoke http timeout, then retry LOG.warn("Conflict stop executing task: {}, {}", param.getTaskId(), toState); return false; } List tasks = taskMapper.findBaseByInstanceId(param.getInstanceId()); if (toState == ExecuteState.WAITING) { Assert.isTrue(ops == Operation.SHUTDOWN_RESUME, () -> "Operation must be SHUTDOWN_RESUME, but actual: " + ops); if (tasks.stream().allMatch(SchedTask::isWaiting)) { boolean updated = instanceMapper.updateState(param.getInstanceId(), RunState.WAITING, RunState.RUNNING); Assert.isTrue(updated, () -> "Shutdown resume instance state to WAITING failed: " + param.getInstanceId()); } if (!updateInstanceNextScanTime(instance, new Date(System.currentTimeMillis() + conf.getShutdownTaskDelayResumeMs()))) { LOG.warn("Resume task renew instance update time failed: {}", param.getTaskId()); } return true; } Tuple2 tuple = obtainRunState(tasks); if (tuple == null) { // If the instance has (WAITING or EXECUTING) task return true; } if (!tuple.a.isTerminal()) { Assert.isTrue(tuple.a == RunState.PAUSED, () -> "Run state must be PAUSED, but actual: " + tuple.a); pauseInstance(instance.isWorkflow() ? 
instanceMapper.get(instance.getWnstanceId()) : instance); return true; } boolean updated = instanceMapper.terminate(param.getInstanceId(), tuple.a, RS_TERMINABLE, tuple.b); Assert.state(updated, () -> "Stop task instance failed: " + param.getInstanceId() + ", " + tuple.a); // the last executing task of this sched instance instance.markTerminated(tuple.a, tuple.b); if (ops.isTrigger()) { // trigger operation afterTerminateTask(instance); } else if (instance.isWorkflowNode()) { Assert.isTrue(tuple.a == RunState.CANCELED, () -> "Invalid workflow non-trigger stop state: " + tuple.a); updateWorkflowNodeState(instance, tuple.a, RS_TERMINABLE); updateWorkflowLeadState(instanceMapper.get(instance.getWnstanceId()), tuple.a, RS_RUNNABLE); } else { Assert.isTrue(tuple.a == RunState.CANCELED, () -> "Invalid general non-trigger stop state: " + tuple.a); renewFixedNextTriggerTime(instance); } return true; }); } /** * Force change instance state, for example: CANCELED -> WAITING * * @param instanceId the instance id, unsupported workflow * @param toExecuteState the target execute state */ public void changeInstanceState(long instanceId, ExecuteState toExecuteState) { Assert.isTrue(toExecuteState != ExecuteState.EXECUTING, () -> "Force change state invalid target: " + toExecuteState); doInSynchronizedTransaction(instanceId, null, instance -> { Assert.isTrue(!instance.isWorkflow(), () -> "Force change state unsupported workflow: " + instanceId); RunState fromRunState = RunState.of(instance.getRunState()); RunState toRunState = toExecuteState.runState(); Assert.isTrue(fromRunState != RunState.RUNNING, "Force change state current cannot be RUNNING."); Assert.isTrue(fromRunState != toRunState, () -> "Force change state current cannot equals target " + toRunState); boolean updated = instanceMapper.updateState(instanceId, toRunState, fromRunState); Assert.state(updated, () -> "Force change state failed: " + instanceId); int changedTaskRows = taskMapper.forceChangeState(instanceId, 
toExecuteState.value()); if (toExecuteState == ExecuteState.WAITING) { Tuple3> tuple = buildDispatchParam(instanceId, changedTaskRows); doAfterTransactionCommit(() -> dispatch(tuple.a, tuple.b, tuple.c)); } LOG.info("Force change state success {}, {}", instanceId, toExecuteState); }); } public void deleteInstance(long instanceId) { doInSynchronizedTransaction(instanceId, requireWnstanceIdIfWorkflow(instanceId), instance -> { Assert.isTrue(instance.isTerminal(), () -> "Deleting instance must be terminal: " + instance); if (instance.isWorkflow()) { Assert.isTrue(instance.isWorkflowLead(), () -> "Delete workflow instance must be lead: " + instanceId); List nodeInstances = instanceMapper.findWorkflowNode(instanceId); assertHasAffectedRow(instanceMapper.deleteByWnstanceId(instanceId), () -> "Delete workflow instance failed: " + instanceId); assertHasAffectedRow(workflowMapper.deleteByWnstanceId(instanceId), () -> "Delete workflow config failed: " + instanceId); for (SchedInstance nodeInstance : nodeInstances) { int row = taskMapper.deleteByInstanceId(nodeInstance.getInstanceId()); assertHasAffectedRow(row, () -> "Delete workflow task failed: " + nodeInstance); } } else { Assert.isTrue(!instance.getRetrying(), "Cannot delete retrying original instance."); Assert.isTrue(!instance.isRunRetry(), "Cannot delete run retry sub instance."); Set instanceIds = instanceMapper.findRunRetry(instanceId) .stream().map(SchedInstance::getInstanceId).collect(Collectors.toSet()); instanceIds.add(instanceId); for (Long id : instanceIds) { assertOneAffectedRow(instanceMapper.deleteByInstanceId(id), () -> "Delete instance failed: " + id); assertHasAffectedRow(taskMapper.deleteByInstanceId(id), () -> "Delete task failed: " + id); } } LOG.info("Delete sched instance success {}", instanceId); }); } /** * Purge the zombie instance which maybe dead * * @param inst the sched instance * @return {@code true} if purged successfully */ public boolean purgeInstance(SchedInstance inst) { Long instanceId 
= inst.getInstanceId(); LOG.info("Purge instance: {}", instanceId); return doInSynchronizedTransaction(instanceId, inst.getWnstanceId(), instance -> { Assert.isTrue(!instance.isWorkflowLead(), () -> "Purge instance cannot be workflow lead: " + instance); // instance run state must in (10, 20) if (!instance.isPausable()) { return false; } List tasks = taskMapper.findBaseByInstanceId(instanceId); if (tasks.stream().anyMatch(SchedTask::isWaiting) || workerClient.hasAliveExecutingTasks(tasks)) { LOG.warn("Purge instance failed, has waiting or alive executing task: {}", tasks); return false; } Tuple2 tuple = obtainRunState(tasks); if (tuple == null) { tuple = Tuple2.of(RunState.CANCELED, new Date()); } Assert.isTrue(tuple.a.isTerminal(), () -> "Purge instance state must be terminal state: " + instance); if (!instanceMapper.terminate(instanceId, tuple.a, RS_TERMINABLE, tuple.b)) { throw new IllegalStateException("Purge instance failed: " + instance + ", " + tuple.a); } tasks.stream().filter(SchedTask::isPausable).forEach(e -> { String worker = e.isExecuting() ? 
Strings.requireNonBlank(e.getWorker()) : null; ExecuteState fromState = ExecuteState.of(e.getExecuteState()); taskMapper.terminate(e.getTaskId(), worker, ExecuteState.EXECUTE_ABORTED, fromState, new Date(), null); }); instance.markTerminated(tuple.a, tuple.b); afterTerminateTask(instance); LOG.warn("Purge instance {} to state {}", instanceId, tuple.a); return true; }); } /** * Pause instance * * @param instanceId the instance id, if workflow must be lead instance id * @return {@code true} if paused successfully */ public boolean pauseInstance(long instanceId) { LOG.info("Pause instance: {}", instanceId); return doInSynchronizedTransaction(instanceId, requireWnstanceIdIfWorkflow(instanceId), instance -> { return doIfTrue(instance.isPausable(), () -> pauseInstance(instance)); }); } /** * Cancel instance * * @param instanceId the instance id, if workflow must be lead instance id * @param ops the operation * @return {@code true} if canceled successfully */ public boolean cancelInstance(long instanceId, Operation ops) { LOG.info("Cancel instance: {}, {}", instanceId, ops); Assert.isTrue(ops.toState().isFailure(), () -> "Cancel instance operation invalid: " + ops); return doInSynchronizedTransaction(instanceId, requireWnstanceIdIfWorkflow(instanceId), instance -> { return doIfTrue(!instance.isTerminal(), () -> cancelInstance(instance, ops)); }); } /** * Resume the instance from paused to waiting state * * @param instanceId the instance id, if workflow must be lead instance id * @return {@code true} if resumed successfully */ public boolean resumeInstance(long instanceId) { LOG.info("Resume instance: {}", instanceId); return doInSynchronizedTransaction(instanceId, requireWnstanceIdIfWorkflow(instanceId), instance -> { return doIfTrue(instance.isPaused(), () -> resumeInstance(instance)); }); } // ------------------------------------------------------------------private methods private SchedJob getRequiredJob(long jobId) { return Objects.requireNonNull(jobMapper.get(jobId), 
() -> "Job not found: " + jobId); } private void triggerJob(SchedJob job, RunType runType, long triggerTime) throws JobException { TriggerInstance triggerInstance = TriggerInstance.of(this, job, null, runType, triggerTime); triggerInstance.save(); doAfterTransactionCommit(triggerInstance::dispatch); } private boolean shouldTerminateDispatchFailedTask(long taskId) { SchedTask task = taskMapper.get(taskId); if (!task.isWaiting()) { return false; } int currentDispatchFailedCount = task.getDispatchFailedCount(); if (currentDispatchFailedCount >= conf.getTaskDispatchFailedCountThreshold()) { return true; } return isOneAffectedRow(taskMapper.incrementDispatchFailedCount(taskId, currentDispatchFailedCount)) && (currentDispatchFailedCount + 1) == conf.getTaskDispatchFailedCountThreshold(); } private void updateNextTriggerTime(SchedJob job) { if (TriggerType.of(job.getTriggerType()) == TriggerType.DEPEND) { return; } Long nextTriggerTime = TriggerTimes.updateNextTriggerTime(job); if (!nextTriggerTime.equals(job.getNextTriggerTime())) { job.setNextTriggerTime(nextTriggerTime); assertOneAffectedRow(jobMapper.updateNextTriggerTime(job), () -> "Update next trigger time failed: " + job); } } private void parseTriggerConfig(SchedJob job) { String triggerValue = CoreUtils.trimRequired(job.getTriggerValue(), 255, "Trigger value"); job.setTriggerValue(triggerValue); Long jobId = job.getJobId(); if (TriggerType.of(job.getTriggerType()) == TriggerType.DEPEND) { List parentJobIds = Collects.split(triggerValue, Long::parseLong); Assert.notEmpty(parentJobIds, () -> "Invalid dependency parent job id config: " + triggerValue); Assert.isTrue(!parentJobIds.contains(jobId), () -> "Cannot depends self: " + jobId + ", " + parentJobIds); Map parentJobMap = Collects.toMap(jobMapper.findByJobIds(parentJobIds), SchedJob::getJobId); for (Long parentJobId : parentJobIds) { SchedJob parentJob = parentJobMap.get(parentJobId); Assert.notNull(parentJob, () -> "Parent job id not found: " + parentJobId); 
String cGroup = job.getGroup(), pGroup = parentJob.getGroup(); Assert.isTrue(cGroup.equals(pGroup), () -> "Inconsistent depend group: " + cGroup + ", " + pGroup); } // 校验是否有循环依赖 以及 依赖层级是否太深 checkCircularDepends(jobId, new HashSet<>(parentJobIds)); List list = Collects.convert(parentJobIds, pid -> SchedDepend.of(pid, jobId)); Collects.batchProcess(list, dependMapper::batchInsert, JobConstants.PROCESS_BATCH_SIZE); job.setTriggerValue(Joiner.on(Symbol.Str.COMMA).join(parentJobIds)); job.setNextTriggerTime(null); } else { job.setNextTriggerTime(TriggerTimes.updateNextTriggerTime(job)); } } private void checkCircularDepends(Long jobId, Set parentJobIds) { Set outerDepends = parentJobIds; for (int i = 1; ; i++) { Map map = Collects.toMap(dependMapper.findByChildJobIds(parentJobIds), SchedDepend::getParentJobId); if (MapUtils.isEmpty(map)) { return; } Assert.isTrue(!map.containsKey(jobId), () -> "Circular depends job: " + map.get(jobId)); Assert.isTrue(i < conf.getMaximumJobDependsDepth(), () -> "Exceed depends depth: " + outerDepends); parentJobIds = map.keySet(); } } private boolean dispatch(boolean isRedispatch, SchedJob job, SchedInstance instance, List tasks) { ExecuteTaskParamBuilder builder = new ExecuteTaskParamBuilder(job, instance); RouteStrategy routeStrategy = RouteStrategy.of(job.getRouteStrategy()); List list = new ArrayList<>(tasks.size()); List> workload; if (routeStrategy.isBroadcast()) { for (SchedTask task : tasks) { Worker worker = task.worker(); if (!workerClient.isAliveWorker(worker)) { // 上游调用方有些处于事务中,有些不在事务中。因为此处的update操作非必须要求原子性,所以未加Spring事务。 taskMapper.terminate(task.getTaskId(), null, ExecuteState.BROADCAST_ABORTED, ExecuteState.WAITING, null, null); } else { list.add(builder.build(Operation.TRIGGER, task.getTaskId(), instance.getTriggerTime(), worker)); } } } else if (!isRedispatch || routeStrategy.isNotRoundRobin() || (workload = calculateWorkload(job, instance)).isEmpty()) { for (SchedTask task : tasks) { 
list.add(builder.build(Operation.TRIGGER, task.getTaskId(), instance.getTriggerTime(), null)); } } else { // 轮询算法:选择分配到task最少的worker for (SchedTask task : tasks) { workload.sort(WORKLOAD_COMPARATOR); Tuple2 first = workload.get(0); list.add(builder.build(Operation.TRIGGER, task.getTaskId(), instance.getTriggerTime(), first.a)); first.b += 1; } } return workerClient.dispatch(job.getGroup(), list); } private List> calculateWorkload(SchedJob job, SchedInstance instance) { List workers = workerClient.getDiscoveredWorkers(job.getGroup()); if (CollectionUtils.isEmpty(workers)) { LOG.error("Not found available worker for calculate workload: {}", job.getGroup()); return Collections.emptyList(); } List pausableTasks = taskMapper.findBaseByInstanceIdAndStates(instance.getInstanceId(), ES_PAUSABLE); Map workerScoreMapping = pausableTasks.stream() .filter(e -> StringUtils.isNotBlank(e.getWorker())) .collect(Collectors.groupingBy(SchedTask::getWorker, Collectors.counting())); return Collects.convert(workers, e -> Tuple2.of(e, workerScoreMapping.getOrDefault(e.serialize(), 0L))); } private Long requireWnstanceIdIfWorkflow(long instanceId) { Long wnstanceId = instanceMapper.getWnstanceId(instanceId); if (wnstanceId != null && instanceId != wnstanceId) { throw new IllegalArgumentException("Must be workflow wnstance id: " + wnstanceId + ", " + instanceId); } return wnstanceId; } private void doInSynchronizedTransaction(long instanceId, Long wnstanceId, Consumer action) { doInSynchronizedTransaction(instanceId, wnstanceId, convert(action, true)); } /** * 加JVM锁是为了尽量避免单节点内对数据库锁的等待及数据连接超时 * * @param instanceId the instance id * @param wnstanceId the workflow instance id * @param action the action * @return boolean value of action result */ private boolean doInSynchronizedTransaction(long instanceId, Long wnstanceId, Predicate action) { return doInSynchronizedTransaction0(instanceId, wnstanceId, lockInstanceId -> { SchedInstance lockedInstance = instanceMapper.lock(lockInstanceId); 
// ---------------------------------------------------------------- tail of doInSynchronizedTransaction(..)
        // NOTE(review): the enclosing method signature begins above this chunk. This is the body of the
        // lambda executed under the per-instance lock + transaction: it validates that the locked
        // instance and the target instance exist and belong to the same workflow, then runs the action.
        Assert.notNull(lockedInstance, () -> "Locked instance not found: " + lockInstanceId);
        SchedInstance instance = (instanceId == lockInstanceId) ? lockedInstance : instanceMapper.get(instanceId);
        Assert.notNull(instance, () -> "Instance not found: " + instanceId);
        if (!Objects.equals(instance.getWnstanceId(), wnstanceId)) {
            throw new IllegalStateException("Inconsistent workflow instance id: " + wnstanceId + ", " + instance);
        }
        return action.test(instance);
    });
}

/**
 * Executes the given action inside a database transaction, serialized on a per-instance lock.
 * The lock key is the workflow lead instance id when present, otherwise the instance id itself,
 * so all nodes of one workflow share a single lock.
 *
 * @param instanceId the sched instance id
 * @param wnstanceId the workflow lead instance id, may be null for non-workflow instances
 * @param action     the action to run with the lock instance id
 * @return the action result
 */
private <T> T doInSynchronizedTransaction0(long instanceId, Long wnstanceId, LongFunction<T> action) {
    Long lockInstanceId = wnstanceId != null ? wnstanceId : (Long) instanceId;
    // intern() guarantees a single shared lock object per instance id within this JVM
    synchronized (CoreUtils.INSTANCE_LOCK_POOL.intern(lockInstanceId)) {
        return transactionTemplate.execute(status -> action.apply(lockInstanceId));
    }
}

/**
 * Derives the instance run state from the execute states of its tasks.
 *
 * @param tasks the tasks of one instance
 * @return (toState, runEndTime) when all tasks are terminal, (PAUSED, null) when the remaining
 *         non-terminal tasks are all paused, or {@code null} when any task is still WAITING/EXECUTING
 */
private Tuple2<RunState, Date> obtainRunState(List<SchedTask> tasks) {
    List<ExecuteState> states = tasks.stream().map(e -> ExecuteState.of(e.getExecuteState())).collect(Collectors.toList());
    if (states.stream().allMatch(ExecuteState::isTerminal)) {
        // executeEndTime is null: canceled task maybe never not started
        RunState toState = states.stream().anyMatch(ExecuteState::isFailure) ? RunState.CANCELED : RunState.COMPLETED;
        Date maxExecuteEndTime = tasks.stream()
            .map(SchedTask::getExecuteEndTime)
            .filter(Objects::nonNull)
            .max(Comparator.naturalOrder())
            .orElseGet(Date::new);
        return Tuple2.of(toState, maxExecuteEndTime);
    }
    // if task has WAITING or EXECUTING state, then return null
    return states.stream().anyMatch(ExecuteState::isPausable) ? null : Tuple2.of(RunState.PAUSED, null);
}

/**
 * Pauses an instance. For a workflow lead instance, pauses every pausable node and then
 * the lead itself; otherwise pauses the single instance directly.
 */
private void pauseInstance(SchedInstance instance) {
    if (instance.isWorkflow()) {
        long instanceId = instance.getInstanceId();
        Assert.isTrue(instance.isWorkflowLead(), () -> "Pause instance must be workflow lead: " + instanceId);
        // pause sched_workflow running node
        instanceMapper.findWorkflowNode(instanceId).stream().filter(SchedInstance::isPausable).forEach(this::pauseInstance0);
        updateWorkflowLeadState(instance, RunState.PAUSED, RS_WAITING);
    } else {
        pauseInstance0(instance);
    }
}

/**
 * Pauses one non-lead instance: waiting tasks are flipped to PAUSE directly; alive executing
 * tasks get a pause operation dispatched to their workers after the transaction commits.
 */
private void pauseInstance0(SchedInstance instance) {
    Assert.isTrue(instance.isPausable(), () -> "Invalid pause instance state: " + instance);
    long instanceId = instance.getInstanceId();
    Operation ops = Operation.PAUSE;
    // update task state: (WAITING) -> (PAUSE)
    taskMapper.updateStateByInstanceId(instanceId, ops.toState().value(), ES_WAITING, null);
    // load the alive executing tasks
    List<ExecuteTaskParam> executingTasks = loadExecutingTasks(instance, ops);
    if (executingTasks.isEmpty()) {
        // has non executing task, update sched instance state
        Tuple2<RunState, Date> tuple = obtainRunState(taskMapper.findBaseByInstanceId(instanceId));
        // must be paused or terminated
        Assert.notNull(tuple, () -> "Pause instance failed: " + instanceId);
        boolean updated = instanceMapper.terminate(instanceId, tuple.a, RS_PAUSABLE, tuple.b);
        Assert.state(updated, () -> "Pause instance failed: " + instance + ", " + tuple.a);
        if (instance.isWorkflowNode()) {
            updateWorkflowNodeState(instance, tuple.a, RS_PAUSABLE);
        } else if (tuple.a.isTerminal()) {
            instance.markTerminated(tuple.a, tuple.b);
            renewFixedNextTriggerTime(instance);
        }
    } else {
        // has alive executing tasks: dispatch and pause executing tasks
        doAfterTransactionCommit(() -> workerClient.dispatch(executingTasks));
    }
}

/**
 * Cancels an instance. For a workflow lead instance, cancels every non-terminal node and then
 * the lead itself; otherwise cancels the single instance directly.
 */
private void cancelInstance(SchedInstance instance, Operation ops) {
    if (instance.isWorkflow()) {
        long instanceId = instance.getInstanceId();
        Assert.isTrue(instance.isWorkflowLead(), () -> "Cancel instance must be workflow lead: " + instanceId);
        instanceMapper.findWorkflowNode(instanceId).stream().filter(e -> !e.isTerminal()).forEach(e -> cancelInstance0(e, ops));
        updateWorkflowLeadState(instance, RunState.CANCELED, RS_RUNNABLE);
    } else {
        cancelInstance0(instance, ops);
    }
}

/**
 * Cancels one non-lead instance: executable (waiting/paused) tasks are terminated directly;
 * alive executing tasks get the cancel operation dispatched to their workers after commit.
 */
private void cancelInstance0(SchedInstance instance, Operation ops) {
    long instanceId = instance.getInstanceId();
    // update: (WAITING or PAUSED) -> (CANCELED)
    taskMapper.updateStateByInstanceId(instanceId, ops.toState().value(), ES_EXECUTABLE, new Date());
    // load the alive executing tasks
    List<ExecuteTaskParam> executingTasks = loadExecutingTasks(instance, ops);
    if (executingTasks.isEmpty()) {
        // has non executing execute_state
        Tuple2<RunState, Date> tuple = obtainRunState(taskMapper.findBaseByInstanceId(instanceId));
        Assert.notNull(tuple, () -> "Cancel instance obtain run state failed: " + instanceId);
        // if all task paused, should update to canceled state
        if (tuple.a == RunState.PAUSED) {
            tuple = Tuple2.of(RunState.CANCELED, new Date());
        }
        if (!instanceMapper.terminate(instanceId, tuple.a, RS_TERMINABLE, tuple.b)) {
            throw new IllegalStateException("Cancel instance failed: " + instance + ", " + tuple.a);
        }
        instance.markTerminated(tuple.a, tuple.b);
        if (instance.isWorkflowNode()) {
            updateWorkflowNodeState(instance, tuple.a, RS_TERMINABLE);
        } else {
            renewFixedNextTriggerTime(instance);
        }
    } else {
        // dispatch and cancel executing tasks
        doAfterTransactionCommit(() -> workerClient.dispatch(executingTasks));
    }
}

/**
 * Resumes a paused instance. For a workflow lead: flips the lead back to RUNNING, resumes
 * every paused node, then re-evaluates the workflow graph to start any now-runnable nodes.
 */
private void resumeInstance(SchedInstance instance) {
    if (instance.isWorkflow()) {
        long instanceId = instance.getInstanceId();
        Assert.isTrue(instance.isWorkflowLead(), () -> "Resume instance must be workflow lead: " + instanceId);
        // update sched_instance paused lead to running state
        boolean updated = instanceMapper.updateState(instanceId, RunState.RUNNING, RunState.PAUSED);
        Assert.state(updated, () -> "Resume workflow lead instance failed: " + instanceId);
        workflowMapper.resumeWaiting(instanceId);
        for (SchedInstance nodeInstance : instanceMapper.findWorkflowNode(instanceId)) {
            if (nodeInstance.isPaused()) {
                resumeInstance0(nodeInstance);
                updateWorkflowNodeState(nodeInstance, RunState.RUNNING, RS_PAUSED);
            }
        }
        WorkflowGraph graph = WorkflowGraph.of(workflowMapper.findByWnstanceId(instanceId));
        try {
            List<Runnable> dispatchActions = processWorkflowGraph(instance, graph, graph.map());
            doAfterTransactionCommit(dispatchActions);
        } catch (JobException e) {
            ExceptionUtils.rethrow(e);
        }
    } else {
        resumeInstance0(instance);
    }
}

/**
 * Resumes one non-lead instance: PAUSED -> WAITING for the instance and its paused tasks,
 * then dispatches the waiting tasks after the transaction commits.
 */
private void resumeInstance0(SchedInstance instance) {
    long instanceId = instance.getInstanceId();
    boolean updated = instanceMapper.updateState(instanceId, RunState.WAITING, RunState.PAUSED);
    Assert.state(updated, () -> "Resume sched instance failed: " + instanceId);
    int row = taskMapper.updateStateByInstanceId(instanceId, ExecuteState.WAITING.value(), ES_PAUSED, null);
    assertHasAffectedRow(row, "Resume sched task failed.");
    // dispatch task
    Tuple3<SchedJob, SchedInstance, List<SchedTask>> param = buildDispatchParam(instanceId, row);
    doAfterTransactionCommit(() -> dispatch(param.a, param.b, param.c));
}

/**
 * Post-processing after an instance reaches a terminal run state:
 * CANCELED triggers a retry attempt, COMPLETED advances workflow/dependent jobs.
 */
private void afterTerminateTask(SchedInstance instance) {
    Assert.isTrue(!instance.isWorkflowLead(), () -> "After terminate task cannot be workflow lead: " + instance);
    RunState runState = RunState.of(instance.getRunState());
    if (runState == RunState.CANCELED) {
        retryJob(instance);
    } else if (runState == RunState.COMPLETED) {
        if (!instance.isWorkflowNode()) {
            renewFixedNextTriggerTime(instance);
        }
        processWorkflowInstance(instance);
        dependJob(instance);
    } else {
        throw new IllegalStateException("Unknown terminate run state " + runState);
    }
}

/**
 * Attempts to create a retry instance for a failed one; when no retry is created,
 * finalizes the workflow node state or renews the fixed-trigger schedule instead.
 */
private void retryJob(SchedInstance failed) {
    Long retryingInstanceId = ThrowingSupplier.doCaught(() -> retryJob0(failed));
    if (retryingInstanceId != null) {
        startRetrying(failed);
        return;
    }
    if (failed.isWorkflowNode()) {
        // If workflow without retry, then require update workflow graph state
        updateWorkflowNodeState(failed, RunState.CANCELED, RS_TERMINABLE);
        updateWorkflowLeadState(instanceMapper.get(failed.getWnstanceId()), RunState.CANCELED, RS_RUNNABLE);
    } else {
        renewFixedNextTriggerTime(failed);
    }
}

/**
 * Builds and persists a retry instance plus its tasks.
 *
 * @return the new retry instance id, or {@code null} when the job is not retryable
 *         or the retry split produced no tasks
 * @throws JobException if splitting the retry tasks fails
 */
private Long retryJob0(SchedInstance failed) throws JobException {
    SchedJob job = getRequiredJob(failed.getJobId());
    int retriedCount = failed.obtainRetriedCount();
    if (!job.retryable(RunState.of(failed.getRunState()), retriedCount)) {
        return null;
    }
    // build retry instance
    long retryInstanceId = generateId();
    long triggerTime = job.computeRetryTriggerTime(++retriedCount);
    SchedInstance retryInstance = SchedInstance.of(failed, retryInstanceId, job.getJobId(), RunType.RETRY, triggerTime, retriedCount);
    retryInstance.setWorkflowCurNode(failed.getWorkflowCurNode());
    // build retry tasks
    List<SchedTask> tasks = splitRetryTask(job, failed, retryInstance);
    if (tasks.isEmpty()) {
        // the broadcast failed task worker all dead then terminate retry
        LOG.warn("Retry instance split tasks is empty: {}, {}", job, failed);
        return null;
    }
    ThrowingSupplier<Runnable, Throwable> persistenceAction = () -> {
        if (failed.isWorkflowNode()) {
            // for a workflow node, sched_workflow.instance_id must be re-pointed at the retry instance
            String curNode = failed.getWorkflowCurNode();
            int row = workflowMapper.update(failed.getWnstanceId(), curNode, null, retryInstanceId, RS_RUNNING, failed.getInstanceId());
            assertHasAffectedRow(row, () -> "Retry instance, workflow node update failed.");
        }
        saveInstanceAndTasks(retryInstance, tasks);
        return () -> dispatch(job, retryInstance, tasks);
    };
    Consumer<Throwable> errorHandler = t -> {
        throw new IllegalStateException("Create retry instance failed: " + failed, t);
    };
    // Nested transaction: keeps the `workflow & instance & tasks` writes atomic;
    // an error rolls back the inner transaction without affecting the outer one.
    Runnable dispatchAction = doInNestedTransaction(transactionTemplate, persistenceAction, errorHandler);
    doAfterTransactionCommit(dispatchAction);
    return retryInstanceId;
}

/**
 * Splits the tasks for a retry instance: RetryType.ALL re-splits the whole job,
 * RetryType.FAILED clones only the failed tasks (broadcast tasks keep their original worker).
 */
private List<SchedTask> splitRetryTask(SchedJob job, SchedInstance failed, SchedInstance retry) throws JobException {
    RetryType retryType = RetryType.of(job.getRetryType());
    if (retryType == RetryType.ALL) {
        // re-split job
        SplitJobParam splitJobParam;
        if (failed.isWorkflow()) {
            List<PredecessorInstance> list = loadWorkflowPredecessorInstances(job, failed.getWnstanceId(), failed.getInstanceId());
            splitJobParam = ModelConverter.toSplitJobParam(job, retry, list);
        } else {
            splitJobParam = ModelConverter.toSplitJobParam(job, retry);
        }
        return splitJob(job.getGroup(), retry.getInstanceId(), splitJobParam);
    } else if (retryType == RetryType.FAILED) {
        return taskMapper.findLargeByInstanceId(failed.getInstanceId())
            .stream()
            .filter(SchedTask::isFailure)
            // Broadcast task must be retried with the same worker
            .filter(e -> !job.isBroadcast() || workerClient.isAliveWorker(e.worker()))
            .map(e -> SchedTask.of(e.getTaskParam(), generateId(), retry.getInstanceId(), e.getTaskNo(), e.getTaskCount(), e.getWorker()))
            .collect(Collectors.toList());
    } else {
        throw new UnsupportedOperationException("Retry instance, unknown retry type: " + job.getJobId() + ", " + retryType);
    }
}

/**
 * Fires all dependent child jobs of a completed, non-workflow-node parent instance.
 * Failures for individual children are caught and logged, not propagated.
 */
private void dependJob(SchedInstance parent) {
    if (parent.isWorkflowNode() || !parent.isCompleted()) {
        return;
    }
    for (SchedDepend depend : dependMapper.findByParentJobId(parent.getJobId())) {
        ThrowingRunnable.doCaught(() -> dependJob(parent, depend), () -> "Depend job error: " + parent + ", " + depend);
    }
}

/**
 * Creates and dispatches one dependent child-job instance, skipping disabled child jobs.
 */
private void dependJob(SchedInstance parent, SchedDepend depend) throws JobException {
    SchedJob childJob = getRequiredJob(depend.getChildJobId());
    if (childJob.isDisabled()) {
        LOG.warn("Depend child job disabled: {}", childJob);
        return;
    }
    // Nested transaction: keeps the `save` writes atomic; an error rolls back the inner
    // transaction without affecting the outer one.
    TriggerInstance dependInstance = TriggerInstance.of(this, childJob, parent, RunType.DEPEND, System.currentTimeMillis());
    Consumer<Throwable> errorHandler = t -> LOG.error("Create depend instance failed: {}, {}", childJob, parent, t);
    ThrowingSupplier<Runnable, Throwable> persistenceAction = () -> {
        dependInstance.save();
        return dependInstance::dispatch;
    };
    Runnable dispatchAction = doInNestedTransaction(transactionTemplate, persistenceAction, errorHandler);
    doAfterTransactionCommit(dispatchAction);
}

/**
 * Collects dispatchable operation params for the instance's executing tasks whose workers
 * are still alive; tasks on dead workers are terminated in place instead.
 */
private List<ExecuteTaskParam> loadExecutingTasks(SchedInstance instance, Operation ops) {
    List<ExecuteTaskParam> executingTasks = new ArrayList<>();
    ExecuteTaskParamBuilder builder = null;
    long triggerTime = System.currentTimeMillis();
    for (SchedTask task : taskMapper.findBaseByInstanceIdAndStates(instance.getInstanceId(), ES_EXECUTING)) {
        Worker worker = task.worker();
        if (workerClient.isAliveWorker(worker)) {
            if (builder == null) {
                // lazily created: only needed when at least one alive worker exists
                builder = new ExecuteTaskParamBuilder(getRequiredJob(instance.getJobId()), instance);
            }
            executingTasks.add(builder.build(ops, task.getTaskId(), triggerTime, worker));
        } else {
            // update dead task
            Date executeEndTime = ops.toState().isTerminal() ? new Date() : null;
            ExecuteState toState = ops.toState().isTerminal() ? ExecuteState.EXECUTE_ABORTED : ops.toState();
            ExecuteState fromState = ExecuteState.EXECUTING;
            if (taskMapper.terminate(task.getTaskId(), task.getWorker(), toState, fromState, executeEndTime, null)) {
                LOG.info("Terminate dead worker executing task success: {}", task);
            } else {
                LOG.error("Terminate dead worker executing task failed: {}", task);
            }
        }
    }
    return executingTasks;
}

/**
 * Loads the (job, instance, waiting tasks) triple for dispatching, asserting the waiting
 * task count matches the number of rows just transitioned.
 */
private Tuple3<SchedJob, SchedInstance, List<SchedTask>> buildDispatchParam(long instanceId, int expectTaskSize) {
    SchedInstance instance = instanceMapper.get(instanceId);
    SchedJob job = getRequiredJob(instance.getJobId());
    List<SchedTask> waitingTasks = taskMapper.findLargeByInstanceIdAndStates(instanceId, ES_WAITING);
    int size = waitingTasks.size();
    Assert.state(size == expectTaskSize, () -> "Invalid dispatch tasks size: " + size + ", " + expectTaskSize);
    return Tuple3.of(job, instance, waitingTasks);
}

/**
 * Marks the original instance as retrying (idempotent: skipped when the instance
 * itself is already a retry run).
 */
private void startRetrying(SchedInstance instance) {
    if (!instance.isRunRetry()) {
        RunState state = RunState.CANCELED;
        boolean updated = instanceMapper.updateRetrying(instance.getInstanceId(), true, state, state);
        Assert.state(updated, () -> "Start retrying failed: " + instance);
    }
}

/**
 * Clears the retrying flag on the retry-origin instance and moves it to the given state.
 */
private void stopRetrying(SchedInstance instance, RunState toState) {
    if (instance.isRunRetry()) {
        long id = instance.obtainRetryOriginalInstanceId();
        boolean updated = instanceMapper.updateRetrying(id, false, toState, RunState.CANCELED);
        Assert.state(updated, () -> "Stop retrying failed: " + toState + ", " + instance);
    }
}

/**
 * For FIXED_RATE/FIXED_DELAY jobs, computes and stores the next trigger time after the
 * original scheduled instance terminates. No-op for non-SCHEDULE runs, disabled jobs,
 * or non-fixed trigger types.
 */
private void renewFixedNextTriggerTime(SchedInstance instance) {
    Assert.isTrue(instance.isTerminal(), () -> "Renew fixed instance must be terminal state: " + instance);
    Assert.isTrue(!instance.isWorkflowNode(), () -> "Renew fixed instance cannot be workflow node: " + instance);
    if (instance.isRunRetry()) {
        stopRetrying(instance, RunState.of(instance.getRunState()));
    }
    long instanceId = instance.obtainRetryOriginalInstanceId();
    SchedInstance original = (instanceId == instance.getInstanceId()) ? instance : instanceMapper.get(instanceId);
    if (!original.getJobId().equals(instance.getJobId()) || !RunType.SCHEDULE.equalsValue(original.getRunType())) {
        return;
    }
    SchedJob job = jobMapper.get(original.getJobId());
    TriggerType triggerType;
    if (job == null || job.isDisabled() || !(triggerType = TriggerType.of(job.getTriggerType())).isFixedTriggerType()) {
        return;
    }
    long lastTriggerTime = original.getTriggerTime(), nextTriggerTime;
    if (triggerType == TriggerType.FIXED_RATE) {
        // fixed rate: next time is relative to the scheduled trigger time, never before run end
        Date time = triggerType.computeNextTriggerTime(job.getTriggerValue(), new Date(original.getTriggerTime()));
        nextTriggerTime = Dates.max(time, original.getRunEndTime()).getTime();
    } else {
        // TriggerType.FIXED_DELAY: next time is relative to the actual run end time
        nextTriggerTime = triggerType.computeNextTriggerTime(job.getTriggerValue(), original.getRunEndTime()).getTime();
    }
    boolean updated = isOneAffectedRow(jobMapper.updateFixedNextTriggerTime(job.getJobId(), lastTriggerTime, nextTriggerTime));
    LOG.info("Renew fixed next trigger time: {}, {}, {}, {}", job.getJobId(), lastTriggerTime, nextTriggerTime, updated);
}

// ------------------------------------------------------------------private workflow methods

/**
 * Updates the sched_workflow row of one workflow node instance.
 * NOTE(review): generic parameter of {@code fromStates} restored as Integer (run-state
 * values) after HTML extraction stripped it — confirm against the mapper signature.
 */
private void updateWorkflowNodeState(SchedInstance node, RunState toState, List<Integer> fromStates) {
    Assert.isTrue(node.isWorkflowNode(), () -> "Update workflow cur node state must be node: " + node);
    String curNode = node.getWorkflowCurNode();
    int row = workflowMapper.update(node.getWnstanceId(), curNode, toState.value(), null, fromStates, node.getInstanceId());
    assertHasAffectedRow(row, () -> "Update workflow state failed: " + node + ", " + toState);
    if (toState.isTerminal()) {
        stopRetrying(node, toState);
    }
}

/**
 * Updates all matching sched_workflow rows of a workflow lead instance, then re-evaluates
 * whether the whole graph can be stopped.
 */
private void updateWorkflowLeadState(SchedInstance lead, RunState toState, List<Integer> fromStates) {
    Assert.isTrue(lead.isWorkflowLead(), () -> "Update workflow free node state must be lead: " + lead);
    long wnstanceId = lead.getWnstanceId();
    workflowMapper.update(wnstanceId, null, toState.value(), null, fromStates, null);
    stopWorkflowGraph(wnstanceId, WorkflowGraph.of(workflowMapper.findByWnstanceId(wnstanceId)));
}

/**
 * After a workflow node completes: marks its graph node COMPLETED, stops the workflow
 * if everything is terminal, otherwise starts the node's ready successors.
 */
private void processWorkflowInstance(SchedInstance node) {
    if (!node.isWorkflowNode()) {
        return;
    }
    // update current node state
    updateWorkflowNodeState(node, RunState.COMPLETED, RS_TERMINABLE);
    // if terminal all, then update workflow nodes
    long wnstanceId = node.getWnstanceId();
    WorkflowGraph graph = WorkflowGraph.of(workflowMapper.findByWnstanceId(wnstanceId));
    if (stopWorkflowGraph(wnstanceId, graph)) {
        return;
    }
    // process next workflow node
    Map<DAGEdge, SchedWorkflow> map = graph.successors(node.parseWorkflowCurNode());
    SchedInstance lead = instanceMapper.get(wnstanceId);
    Consumer<Throwable> errorHandler = t -> {
        LOG.error("Process workflow node error: {}", node, t);
        updateWorkflowLeadState(lead, RunState.CANCELED, RS_RUNNABLE);
    };
    // Nested transaction: keeps the `processWorkflowGraph` writes atomic; an error rolls
    // back the inner transaction without affecting the outer one.
    ThrowingSupplier<List<Runnable>, Throwable> persistenceAction = () -> processWorkflowGraph(lead, graph, map);
    List<Runnable> dispatchActions = doInNestedTransaction(transactionTemplate, persistenceAction, errorHandler);
    doAfterTransactionCommit(dispatchActions);
}

/**
 * Processes each candidate edge of the workflow graph, collecting the dispatch actions
 * to execute after the enclosing transaction commits.
 */
private List<Runnable> processWorkflowGraph(SchedInstance lead, WorkflowGraph graph,
                                            Map<DAGEdge, SchedWorkflow> map) throws JobException {
    Assert.isTrue(lead.isWorkflowLead(), () -> "Process workflow node must be lead: " + lead);
    List<Runnable> dispatchActions = new ArrayList<>();
    if (!map.isEmpty()) {
        SchedJob job = getRequiredJob(lead.getJobId());
        Set<DAGNode> duplicates = new HashSet<>();
        for (Map.Entry<DAGEdge, SchedWorkflow> edge : map.entrySet()) {
            processWorkflowGraph(dispatchActions, job, lead, graph, duplicates, edge);
        }
    }
    return dispatchActions;
}

/**
 * Processes a single workflow edge: when its target node is waiting and all predecessors
 * have completed successfully, creates and persists the node's instance and tasks and
 * queues their dispatch; cancels the node when any predecessor failed.
 */
private void processWorkflowGraph(List<Runnable> dispatchActions, SchedJob job, SchedInstance lead, WorkflowGraph graph,
                                  Set<DAGNode> duplicates, Map.Entry<DAGEdge, SchedWorkflow> edge) throws JobException {
    long wnstanceId = lead.getWnstanceId();
    DAGNode target = edge.getKey().getTarget();
    SchedWorkflow workflow = edge.getValue();
    if (target.isEnd() || !workflow.isWaiting() || !duplicates.add(target)) {
        // skip: target is the end node, or is not waiting, or was already handled in this pass
        return;
    }
    Collection<SchedWorkflow> predecessors = graph.predecessors(target).values();
    if (predecessors.stream().anyMatch(e -> !e.isTerminal())) {
        // skip: some predecessor node has not finished yet
        return;
    }
    if (predecessors.stream().anyMatch(SchedWorkflow::isFailure)) {
        // a predecessor failed: cancel this node instead of starting it
        RunState state = RunState.CANCELED;
        int row = workflowMapper.update(wnstanceId, workflow.getCurNode(), state.value(), null, RS_TERMINABLE, null);
        assertHasAffectedRow(row, () -> "Update workflow cur node state failed: " + workflow + ", " + state);
        return;
    }
    long nextInstanceId = generateId();
    RunType runType = RunType.of(lead.getRunType());
    SchedWorkflow lastPredecessor = predecessors.stream().max(BaseEntity.UPDATED_AT_COMPARATOR).orElse(null);
    SchedInstance parent = (lastPredecessor == null) ? lead : instanceMapper.get(lastPredecessor.getInstanceId());
    SchedInstance nextInstance = SchedInstance.of(parent, nextInstanceId, job.getJobId(), runType, System.currentTimeMillis(), 0);
    nextInstance.setWorkflowCurNode(workflow.getCurNode());
    int row = workflowMapper.update(wnstanceId, workflow.getCurNode(), RunState.RUNNING.value(), nextInstanceId, RS_WAITING, null);
    assertHasAffectedRow(row, () -> "Start workflow node failed: " + workflow);
    List<PredecessorInstance> list = predecessors.isEmpty() ? null : loadWorkflowPredecessorInstances(job, wnstanceId, nextInstanceId);
    SplitJobParam splitJobParam = ModelConverter.toSplitJobParam(job, nextInstance, list);
    List<SchedTask> tasks = splitJob(job.getGroup(), nextInstanceId, splitJobParam);
    // save to db
    saveInstanceAndTasks(nextInstance, tasks);
    dispatchActions.add(() -> dispatch(job, nextInstance, tasks));
}

/**
 * Finalizes the workflow when possible: settles the END node's state, terminates the lead
 * instance when every node is terminal, or pauses the lead when the rest are paused.
 *
 * @return true when the workflow was terminated or paused (no further node processing needed)
 */
private boolean stopWorkflowGraph(long wnstanceId, WorkflowGraph graph) {
    if (graph.anyMatch(e -> e.getKey().getTarget().isEnd() && !e.getValue().isTerminal())) {
        // if end node is not terminal state, then process the end node run state
        Map<DAGEdge, SchedWorkflow> ends = graph.predecessors(DAGNode.END);
        if (ends.values().stream().allMatch(SchedWorkflow::isTerminal)) {
            RunState endState = ends.values().stream().anyMatch(SchedWorkflow::isFailure) ? RunState.CANCELED : RunState.COMPLETED;
            int row = workflowMapper.update(wnstanceId, DAGNode.END.toString(), endState.value(), null, RS_TERMINABLE, null);
            assertHasAffectedRow(row, () -> "Update workflow end node failed: " + wnstanceId + ", " + endState);
            // keep the in-memory graph consistent with the database update
            ends.forEach((k, v) -> graph.get(k.getTarget(), DAGNode.END).setRunState(endState.value()));
        }
    }
    if (graph.allMatch(e -> e.getValue().isTerminal())) {
        // terminate lead instance
        RunState state = graph.anyMatch(e -> e.getValue().isFailure()) ? RunState.CANCELED : RunState.COMPLETED;
        boolean updated = instanceMapper.terminate(wnstanceId, state, RS_TERMINABLE, new Date());
        Assert.state(updated, () -> "Stop workflow instance failed: " + wnstanceId + ", " + state);
        SchedInstance lead = instanceMapper.get(wnstanceId);
        dependJob(lead);
        renewFixedNextTriggerTime(lead);
        return true;
    }
    if (graph.allMatch(e -> e.getValue().isTerminal() || e.getValue().isPaused())) {
        // at least one node paused and all the others are terminal
        boolean updated = instanceMapper.updateState(wnstanceId, RunState.PAUSED, RunState.RUNNING);
        Assert.state(updated, () -> "Update workflow pause state failed: " + wnstanceId);
        return true;
    }
    return false;
}

/**
 * Loads the predecessor-instance views (with their tasks) for a workflow node, used to
 * feed predecessor results into the node's job split. For FAILED-retry predecessors,
 * also merges the completed tasks of the whole retry chain.
 *
 * @return the predecessor instances, or {@code null} when the node has no predecessors
 */
private List<PredecessorInstance> loadWorkflowPredecessorInstances(SchedJob job, long wnstanceId, Long instanceId) {
    List<SchedWorkflow> workflows = workflowMapper.findByWnstanceId(wnstanceId);
    SchedWorkflow curWorkflow = workflows.stream().filter(e -> instanceId.equals(e.getInstanceId())).findAny().orElse(null);
    Assert.state(curWorkflow != null, () -> "Not found current workflow node: " + wnstanceId + ", " + instanceId);
    Map<DAGEdge, SchedWorkflow> predecessors = WorkflowGraph.of(workflows).predecessors(curWorkflow.parseCurNode());
    if (predecessors.isEmpty()) {
        return null;
    }
    RetryType retryType = RetryType.of(job.getRetryType());
    return Collects.convert(predecessors.values(), e -> {
        // tasks under the predecessor instance have all executed successfully
        List<SchedTask> tasks = taskMapper.findLargeByInstanceId(e.getInstanceId());
        SchedInstance prev;
        if (retryType == RetryType.FAILED && (prev = instanceMapper.get(e.getInstanceId())).isRunRetry()) {
            Set<Long> instanceIds = instanceMapper.findChildren(prev.getPnstanceId(), RunType.RETRY.value())
                .stream()
                .map(SchedInstance::getInstanceId)
                .filter(t -> !Objects.equals(t, e.getInstanceId()))
                .collect(Collectors.toSet());
            instanceIds.add(prev.getPnstanceId());
            instanceIds.forEach(t -> tasks.addAll(taskMapper.findLargeByInstanceIdAndStates(t, ES_COMPLETED)));
        }
        tasks.sort(SchedTask.TASK_NO_COMPARATOR);
        return ModelConverter.toPredecessorInstance(e, tasks);
    });
}

}





// © 2015 - 2025 Weber Informatics LLC | Privacy Policy (HTML footer residue from source extraction)