/*
* Copyright 2022-2024 Ponfee (http://www.ponfee.cn/)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package cn.ponfee.disjob.supervisor.component;
import cn.ponfee.disjob.common.base.IdGenerator;
import cn.ponfee.disjob.common.base.Symbol;
import cn.ponfee.disjob.common.collect.Collects;
import cn.ponfee.disjob.common.dag.DAGEdge;
import cn.ponfee.disjob.common.dag.DAGNode;
import cn.ponfee.disjob.common.date.Dates;
import cn.ponfee.disjob.common.exception.Throwables.ThrowingRunnable;
import cn.ponfee.disjob.common.exception.Throwables.ThrowingSupplier;
import cn.ponfee.disjob.common.model.BaseEntity;
import cn.ponfee.disjob.common.tuple.Tuple2;
import cn.ponfee.disjob.common.tuple.Tuple3;
import cn.ponfee.disjob.common.util.Strings;
import cn.ponfee.disjob.core.base.CoreUtils;
import cn.ponfee.disjob.core.base.JobConstants;
import cn.ponfee.disjob.core.base.Worker;
import cn.ponfee.disjob.core.dag.PredecessorInstance;
import cn.ponfee.disjob.core.dto.supervisor.StartTaskParam;
import cn.ponfee.disjob.core.dto.supervisor.StartTaskResult;
import cn.ponfee.disjob.core.dto.supervisor.StopTaskParam;
import cn.ponfee.disjob.core.dto.worker.SplitJobParam;
import cn.ponfee.disjob.core.enums.*;
import cn.ponfee.disjob.core.exception.JobException;
import cn.ponfee.disjob.dispatch.ExecuteTaskParam;
import cn.ponfee.disjob.dispatch.event.TaskDispatchFailedEvent;
import cn.ponfee.disjob.supervisor.base.ExecuteTaskParamBuilder;
import cn.ponfee.disjob.supervisor.base.ModelConverter;
import cn.ponfee.disjob.supervisor.base.TriggerTimes;
import cn.ponfee.disjob.supervisor.configuration.SupervisorProperties;
import cn.ponfee.disjob.supervisor.dag.WorkflowGraph;
import cn.ponfee.disjob.supervisor.dao.mapper.*;
import cn.ponfee.disjob.supervisor.exception.KeyExistsException;
import cn.ponfee.disjob.supervisor.instance.TriggerInstance;
import cn.ponfee.disjob.supervisor.model.*;
import com.google.common.base.Joiner;
import com.google.common.collect.Lists;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.collections4.MapUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.exception.ExceptionUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Qualifier;
import org.springframework.context.event.EventListener;
import org.springframework.stereotype.Component;
import org.springframework.transaction.annotation.Transactional;
import org.springframework.transaction.support.TransactionTemplate;
import org.springframework.util.Assert;
import java.util.*;
import java.util.function.Consumer;
import java.util.function.LongFunction;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import static cn.ponfee.disjob.common.spring.TransactionUtils.*;
import static cn.ponfee.disjob.common.util.Functions.convert;
import static cn.ponfee.disjob.common.util.Functions.doIfTrue;
import static cn.ponfee.disjob.core.base.JobConstants.PROCESS_BATCH_SIZE;
import static cn.ponfee.disjob.supervisor.dao.SupervisorDataSourceConfig.SPRING_BEAN_NAME_TX_MANAGER;
import static cn.ponfee.disjob.supervisor.dao.SupervisorDataSourceConfig.SPRING_BEAN_NAME_TX_TEMPLATE;
import static com.google.common.collect.ImmutableList.of;
/**
* Job manager
*
* @author Ponfee
*/
@Component
public class JobManager {
private static final Logger LOG = LoggerFactory.getLogger(JobManager.class);
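// Orders (worker, assigned-task-count) tuples by ascending load; used by the round-robin redispatch below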
private static final Comparator<Tuple2<Worker, Long>> WORKLOAD_COMPARATOR = Comparator.comparingLong(e -> e.b);
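// Pre-built state-value lists passed as `IN (...)` conditions to the mapper SQL: RS_* are sched_instance run states, ES_* are sched_task execute states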
private static final List<Integer> RS_TERMINABLE = of(RunState.WAITING.value(), RunState.RUNNING.value(), RunState.PAUSED.value());
private static final List<Integer> RS_RUNNABLE = of(RunState.WAITING.value(), RunState.PAUSED.value());
private static final List<Integer> RS_PAUSABLE = of(RunState.WAITING.value(), RunState.RUNNING.value());
private static final List<Integer> RS_WAITING = of(RunState.WAITING.value());
private static final List<Integer> RS_RUNNING = of(RunState.RUNNING.value());
private static final List<Integer> RS_PAUSED = of(RunState.PAUSED.value());
private static final List<Integer> ES_EXECUTABLE = of(ExecuteState.WAITING.value(), ExecuteState.PAUSED.value());
private static final List<Integer> ES_PAUSABLE = of(ExecuteState.WAITING.value(), ExecuteState.EXECUTING.value());
private static final List<Integer> ES_WAITING = of(ExecuteState.WAITING.value());
private static final List<Integer> ES_EXECUTING = of(ExecuteState.EXECUTING.value());
private static final List<Integer> ES_PAUSED = of(ExecuteState.PAUSED.value());
private static final List<Integer> ES_COMPLETED = of(ExecuteState.COMPLETED.value());
private final SupervisorProperties conf;
private final IdGenerator idGenerator;
private final SchedJobMapper jobMapper;
private final SchedDependMapper dependMapper;
private final SchedInstanceMapper instanceMapper;
private final SchedWorkflowMapper workflowMapper;
private final SchedTaskMapper taskMapper;
private final WorkerClient workerClient;
private final TransactionTemplate transactionTemplate;
public JobManager(SupervisorProperties conf,
IdGenerator idGenerator,
SchedJobMapper jobMapper,
SchedDependMapper dependMapper,
SchedInstanceMapper instanceMapper,
SchedWorkflowMapper workflowMapper,
SchedTaskMapper taskMapper,
WorkerClient workerClient,
@Qualifier(SPRING_BEAN_NAME_TX_TEMPLATE) TransactionTemplate txTemplate) {
conf.check();
this.conf = conf;
this.idGenerator = idGenerator;
this.jobMapper = jobMapper;
this.dependMapper = dependMapper;
this.instanceMapper = instanceMapper;
this.workflowMapper = workflowMapper;
this.taskMapper = taskMapper;
this.workerClient = workerClient;
this.transactionTemplate = txTemplate;
}
// ------------------------------------------------------------------non-database operation methods
public long generateId() {
return idGenerator.generateId();
}
public List<SchedTask> splitJob(String group, long instanceId, SplitJobParam param) throws JobException {
return workerClient.splitJob(group, instanceId, param, this::generateId, conf.getMaximumSplitTaskSize());
}
public boolean dispatch(SchedJob job, SchedInstance instance, List<SchedTask> tasks) {
return dispatch(false, job, instance, tasks);
}
public boolean redispatch(SchedJob job, SchedInstance instance, List<SchedTask> tasks) {
return dispatch(true, job, instance, tasks);
}
// ------------------------------------------------------------------database single operation without spring transactional
public void disableJob(SchedJob job) {
jobMapper.disable(job);
}
public boolean updateJobNextTriggerTime(SchedJob job) {
return isOneAffectedRow(jobMapper.updateNextTriggerTime(job));
}
public void updateJobNextScanTime(SchedJob job) {
jobMapper.updateNextScanTime(job);
}
public boolean updateInstanceNextScanTime(SchedInstance inst, Date nextScanTime) {
Assert.notNull(nextScanTime, "Instance next scan time cannot be null.");
return isOneAffectedRow(instanceMapper.updateNextScanTime(inst.getInstanceId(), nextScanTime, inst.getVersion()));
}
public boolean savepoint(long taskId, String worker, String executeSnapshot) {
CoreUtils.checkClobMaximumLength(executeSnapshot, "Execute snapshot");
return isOneAffectedRow(taskMapper.savepoint(taskId, worker, executeSnapshot));
}
// ------------------------------------------------------------------must be called within an active transaction (propagation mandatory)
public void saveLeadInstanceAndWorkflows(SchedInstance instance, List<SchedWorkflow> workflows) {
Assert.isTrue(instance.isWorkflowLead(), () -> "Must be workflow lead instance: " + instance);
assertDoInTransaction();
instanceMapper.insert(instance.fillUniqueFlag());
Collects.batchProcess(workflows, workflowMapper::batchInsert, PROCESS_BATCH_SIZE);
}
public void saveInstanceAndTasks(SchedInstance instance, List<SchedTask> tasks) {
assertDoInTransaction();
instanceMapper.insert(instance.fillUniqueFlag());
Collects.batchProcess(tasks, taskMapper::batchInsert, PROCESS_BATCH_SIZE);
}
// ------------------------------------------------------------------database operation within spring @transactional
@Transactional(transactionManager = SPRING_BEAN_NAME_TX_MANAGER, rollbackFor = Exception.class)
public Long addJob(SchedJob job) throws JobException {
job.setUpdatedBy(job.getCreatedBy());
job.verifyForAdd(conf.getMaximumJobRetryCount());
if (jobMapper.getJobId(job.getGroup(), job.getJobName()) != null) {
throw new KeyExistsException("Exists job name: " + job.getJobName());
}
workerClient.verifyJob(job);
job.setJobId(generateId());
parseTriggerConfig(job);
jobMapper.insert(job);
return job.getJobId();
}
@Transactional(transactionManager = SPRING_BEAN_NAME_TX_MANAGER, rollbackFor = Exception.class)
public void updateJob(SchedJob job) throws JobException {
job.verifyForUpdate(conf.getMaximumJobRetryCount());
if (job.requiredUpdateExecutor()) {
workerClient.verifyJob(job);
}
Long jobId0 = jobMapper.getJobId(job.getGroup(), job.getJobName());
if (jobId0 != null && !jobId0.equals(job.getJobId())) {
throw new IllegalArgumentException("Exists job name: " + job.getJobName());
}
SchedJob dbJob = jobMapper.get(job.getJobId());
Assert.notNull(dbJob, () -> "Sched job id not found " + job.getJobId());
Assert.isTrue(dbJob.getGroup().equals(job.getGroup()), "Job group cannot be modified.");
if (job.requiredUpdateTrigger(dbJob.getTriggerType(), dbJob.getTriggerValue())) {
dependMapper.deleteByChildJobId(job.getJobId());
parseTriggerConfig(job);
}
assertOneAffectedRow(jobMapper.update(job), "Update sched job fail or conflict.");
}
@Transactional(transactionManager = SPRING_BEAN_NAME_TX_MANAGER, rollbackFor = Exception.class)
public void deleteJob(long jobId) {
SchedJob job = jobMapper.get(jobId);
Assert.notNull(job, () -> "Job id not found: " + jobId);
Assert.state(!job.isEnabled(), "Please disable the job before deleting it.");
assertOneAffectedRow(jobMapper.softDelete(jobId), "Delete sched job fail or conflict.");
dependMapper.deleteByParentJobId(jobId);
dependMapper.deleteByChildJobId(jobId);
}
@Transactional(transactionManager = SPRING_BEAN_NAME_TX_MANAGER, rollbackFor = Exception.class)
public boolean changeJobState(long jobId, JobState toState) {
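// JobState is presumably a 0/1 flag (DISABLED/ENABLED), so `1 ^ toState.value()` yields the opposite state as the expected current value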
boolean updated = isOneAffectedRow(jobMapper.updateState(jobId, toState.value(), 1 ^ toState.value()));
if (updated && toState == JobState.ENABLED) {
updateNextTriggerTime(jobMapper.get(jobId));
}
return updated;
}
@Transactional(transactionManager = SPRING_BEAN_NAME_TX_MANAGER, rollbackFor = Exception.class)
public void manualTriggerJob(long jobId) throws JobException {
triggerJob(getRequiredJob(jobId), RunType.MANUAL, System.currentTimeMillis());
}
@Transactional(transactionManager = SPRING_BEAN_NAME_TX_MANAGER, rollbackFor = Exception.class)
public void scheduleTriggerJob(SchedJob job, long triggerTime) throws JobException {
if (isOneAffectedRow(jobMapper.updateNextTriggerTime(job))) {
triggerJob(job, RunType.SCHEDULE, triggerTime);
}
}
/**
* Set or clear task worker
*
* @param worker the worker
* @param taskIds the task id list
*/
@Transactional(transactionManager = SPRING_BEAN_NAME_TX_MANAGER, rollbackFor = Exception.class)
public void updateTaskWorker(String worker, List<Long> taskIds) {
if (CollectionUtils.isNotEmpty(taskIds)) {
// Sort to prevent SQL deadlock: "Deadlock found when trying to get lock; try restarting transaction"
Collections.sort(taskIds);
Lists.partition(taskIds, PROCESS_BATCH_SIZE).forEach(ids -> taskMapper.batchUpdateWorker(worker, ids));
}
}
// ------------------------------------------------------------------database operation within spring TransactionTemplate
/**
* Listens for the task dispatch failed event.
* Note: {@code @Async} requires {@code @EnableAsync} to take effect; by default it uses the {@code SimpleAsyncTaskExecutor}, which creates a new thread per task (use the default executor with caution).
*
* @param event the TaskDispatchFailedEvent
*/
@EventListener
public void processTaskDispatchFailedEvent(TaskDispatchFailedEvent event) {
transactionTemplate.executeWithoutResult(status -> {
long taskId = event.getTaskId();
if (!shouldTerminateDispatchFailedTask(taskId)) {
return;
}
if (!taskMapper.terminate(taskId, null, ExecuteState.DISPATCH_FAILED, ExecuteState.WAITING, null, null)) {
LOG.warn("Terminate dispatch failed task unsuccessful: {}", taskId);
}
});
}
/**
* Starts the task
*
* @param param the start task param
* @return start result
*/
public StartTaskResult startTask(StartTaskParam param) {
param.check();
return doInSynchronizedTransaction0(param.getInstanceId(), param.getWnstanceId(), lockInstanceId -> {
String startRequestId = param.getStartRequestId();
LOG.info("Task trace [{}] starting: {}, {}", param.getTaskId(), param.getWorker(), startRequestId);
Date now = new Date();
// If we first `get`, then `start`, and finally `get` again, the queried data may be cached and return `runState=10`;
// locking first via `instanceMapper.lock(lockInstanceId)` avoids this problem.
if (isNotAffectedRow(taskMapper.start(param.getTaskId(), param.getWorker(), startRequestId, now))) {
if (!taskMapper.checkStartIdempotent(param.getTaskId(), param.getWorker(), startRequestId)) {
return StartTaskResult.failure("Start task failure.");
}
LOG.info("Start task idempotent: {}, {}, {}", param.getTaskId(), param.getWorker(), startRequestId);
}
if (isNotAffectedRow(instanceMapper.start(param.getInstanceId(), now))) {
SchedInstance instance = instanceMapper.get(param.getInstanceId());
Assert.state(instance != null && instance.isRunning(), () -> "Start instance failure: " + instance);
}
return ModelConverter.toStartTaskResult(taskMapper.get(param.getTaskId()));
});
}
/**
* Stops the task
*
* @param param the stop task param
* @return {@code true} if stopped task successful
*/
public boolean stopTask(StopTaskParam param) {
param.check();
Operation ops = param.getOperation();
LOG.info("Task trace [{}] stopping: {}, {}, {}", param.getTaskId(), ops, param.getToState(), param.getWorker());
return doInSynchronizedTransaction(param.getInstanceId(), param.getWnstanceId(), instance -> {
Assert.isTrue(!instance.isWorkflowLead(), () -> "Stop task instance cannot be workflow lead: " + instance);
if (instance.isTerminal()) {
return false;
}
ExecuteState toState = param.getToState();
Date executeEndTime = toState.isTerminal() ? new Date() : null;
String errMsg = param.getErrorMsg();
if (!taskMapper.terminate(param.getTaskId(), param.getWorker(), toState, ExecuteState.EXECUTING, executeEndTime, errMsg)) {
// usual is worker invoke http timeout, then retry
LOG.warn("Conflict stop executing task: {}, {}", param.getTaskId(), toState);
return false;
}
List<SchedTask> tasks = taskMapper.findBaseByInstanceId(param.getInstanceId());
if (toState == ExecuteState.WAITING) {
Assert.isTrue(ops == Operation.SHUTDOWN_RESUME, () -> "Operation must be SHUTDOWN_RESUME, but actual: " + ops);
if (tasks.stream().allMatch(SchedTask::isWaiting)) {
boolean updated = instanceMapper.updateState(param.getInstanceId(), RunState.WAITING, RunState.RUNNING);
Assert.isTrue(updated, () -> "Shutdown resume instance state to WAITING failed: " + param.getInstanceId());
}
if (!updateInstanceNextScanTime(instance, new Date(System.currentTimeMillis() + conf.getShutdownTaskDelayResumeMs()))) {
LOG.warn("Resume task renew instance update time failed: {}", param.getTaskId());
}
return true;
}
Tuple2<RunState, Date> tuple = obtainRunState(tasks);
if (tuple == null) {
// the instance still has a WAITING or EXECUTING task
return true;
}
if (!tuple.a.isTerminal()) {
Assert.isTrue(tuple.a == RunState.PAUSED, () -> "Run state must be PAUSED, but actual: " + tuple.a);
pauseInstance(instance.isWorkflow() ? instanceMapper.get(instance.getWnstanceId()) : instance);
return true;
}
boolean updated = instanceMapper.terminate(param.getInstanceId(), tuple.a, RS_TERMINABLE, tuple.b);
Assert.state(updated, () -> "Stop task instance failed: " + param.getInstanceId() + ", " + tuple.a);
// the last executing task of this sched instance
instance.markTerminated(tuple.a, tuple.b);
if (ops.isTrigger()) {
// trigger operation
afterTerminateTask(instance);
} else if (instance.isWorkflowNode()) {
Assert.isTrue(tuple.a == RunState.CANCELED, () -> "Invalid workflow non-trigger stop state: " + tuple.a);
updateWorkflowNodeState(instance, tuple.a, RS_TERMINABLE);
updateWorkflowLeadState(instanceMapper.get(instance.getWnstanceId()), tuple.a, RS_RUNNABLE);
} else {
Assert.isTrue(tuple.a == RunState.CANCELED, () -> "Invalid general non-trigger stop state: " + tuple.a);
renewFixedNextTriggerTime(instance);
}
return true;
});
}
/**
* Force change instance state, for example: CANCELED -> WAITING
*
* @param instanceId the instance id, unsupported workflow
* @param toExecuteState the target execute state
*/
public void changeInstanceState(long instanceId, ExecuteState toExecuteState) {
Assert.isTrue(toExecuteState != ExecuteState.EXECUTING, () -> "Force change state invalid target: " + toExecuteState);
doInSynchronizedTransaction(instanceId, null, instance -> {
Assert.isTrue(!instance.isWorkflow(), () -> "Force change state unsupported workflow: " + instanceId);
RunState fromRunState = RunState.of(instance.getRunState());
RunState toRunState = toExecuteState.runState();
Assert.isTrue(fromRunState != RunState.RUNNING, "Force change state current cannot be RUNNING.");
Assert.isTrue(fromRunState != toRunState, () -> "Force change state current cannot equal target " + toRunState);
boolean updated = instanceMapper.updateState(instanceId, toRunState, fromRunState);
Assert.state(updated, () -> "Force change state failed: " + instanceId);
int changedTaskRows = taskMapper.forceChangeState(instanceId, toExecuteState.value());
if (toExecuteState == ExecuteState.WAITING) {
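// tasks forced back to WAITING must be redispatched after the transaction commits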
Tuple3<SchedJob, SchedInstance, List<SchedTask>> tuple = buildDispatchParam(instanceId, changedTaskRows);
doAfterTransactionCommit(() -> dispatch(tuple.a, tuple.b, tuple.c));
}
LOG.info("Force change state success {}, {}", instanceId, toExecuteState);
});
}
public void deleteInstance(long instanceId) {
doInSynchronizedTransaction(instanceId, requireWnstanceIdIfWorkflow(instanceId), instance -> {
Assert.isTrue(instance.isTerminal(), () -> "Deleting instance must be terminal: " + instance);
if (instance.isWorkflow()) {
Assert.isTrue(instance.isWorkflowLead(), () -> "Delete workflow instance must be lead: " + instanceId);
List<SchedInstance> nodeInstances = instanceMapper.findWorkflowNode(instanceId);
assertHasAffectedRow(instanceMapper.deleteByWnstanceId(instanceId), () -> "Delete workflow instance failed: " + instanceId);
assertHasAffectedRow(workflowMapper.deleteByWnstanceId(instanceId), () -> "Delete workflow config failed: " + instanceId);
for (SchedInstance nodeInstance : nodeInstances) {
int row = taskMapper.deleteByInstanceId(nodeInstance.getInstanceId());
assertHasAffectedRow(row, () -> "Delete workflow task failed: " + nodeInstance);
}
} else {
Assert.isTrue(!instance.getRetrying(), "Cannot delete retrying original instance.");
Assert.isTrue(!instance.isRunRetry(), "Cannot delete run retry sub instance.");
Set<Long> instanceIds = instanceMapper.findRunRetry(instanceId)
.stream().map(SchedInstance::getInstanceId).collect(Collectors.toSet());
instanceIds.add(instanceId);
for (Long id : instanceIds) {
assertOneAffectedRow(instanceMapper.deleteByInstanceId(id), () -> "Delete instance failed: " + id);
assertHasAffectedRow(taskMapper.deleteByInstanceId(id), () -> "Delete task failed: " + id);
}
}
LOG.info("Delete sched instance success {}", instanceId);
});
}
/**
* Purge the zombie instance which maybe dead
*
* @param inst the sched instance
* @return {@code true} if purged successfully
*/
public boolean purgeInstance(SchedInstance inst) {
Long instanceId = inst.getInstanceId();
LOG.info("Purge instance: {}", instanceId);
return doInSynchronizedTransaction(instanceId, inst.getWnstanceId(), instance -> {
Assert.isTrue(!instance.isWorkflowLead(), () -> "Purge instance cannot be workflow lead: " + instance);
// instance run state must be in (10, 20)
if (!instance.isPausable()) {
return false;
}
List<SchedTask> tasks = taskMapper.findBaseByInstanceId(instanceId);
if (tasks.stream().anyMatch(SchedTask::isWaiting) || workerClient.hasAliveExecutingTasks(tasks)) {
LOG.warn("Purge instance failed, has waiting or alive executing task: {}", tasks);
return false;
}
Tuple2<RunState, Date> tuple = obtainRunState(tasks);
if (tuple == null) {
tuple = Tuple2.of(RunState.CANCELED, new Date());
}
Assert.isTrue(tuple.a.isTerminal(), () -> "Purge instance state must be terminal state: " + instance);
if (!instanceMapper.terminate(instanceId, tuple.a, RS_TERMINABLE, tuple.b)) {
throw new IllegalStateException("Purge instance failed: " + instance + ", " + tuple.a);
}
tasks.stream().filter(SchedTask::isPausable).forEach(e -> {
String worker = e.isExecuting() ? Strings.requireNonBlank(e.getWorker()) : null;
ExecuteState fromState = ExecuteState.of(e.getExecuteState());
taskMapper.terminate(e.getTaskId(), worker, ExecuteState.EXECUTE_ABORTED, fromState, new Date(), null);
});
instance.markTerminated(tuple.a, tuple.b);
afterTerminateTask(instance);
LOG.warn("Purge instance {} to state {}", instanceId, tuple.a);
return true;
});
}
/**
* Pause instance
*
* @param instanceId the instance id, if workflow must be lead instance id
* @return {@code true} if paused successfully
*/
public boolean pauseInstance(long instanceId) {
LOG.info("Pause instance: {}", instanceId);
return doInSynchronizedTransaction(instanceId, requireWnstanceIdIfWorkflow(instanceId), instance -> {
return doIfTrue(instance.isPausable(), () -> pauseInstance(instance));
});
}
/**
* Cancel instance
*
* @param instanceId the instance id, if workflow must be lead instance id
* @param ops the operation
* @return {@code true} if canceled successfully
*/
public boolean cancelInstance(long instanceId, Operation ops) {
LOG.info("Cancel instance: {}, {}", instanceId, ops);
Assert.isTrue(ops.toState().isFailure(), () -> "Cancel instance operation invalid: " + ops);
return doInSynchronizedTransaction(instanceId, requireWnstanceIdIfWorkflow(instanceId), instance -> {
return doIfTrue(!instance.isTerminal(), () -> cancelInstance(instance, ops));
});
}
/**
* Resume the instance from paused to waiting state
*
* @param instanceId the instance id, if workflow must be lead instance id
* @return {@code true} if resumed successfully
*/
public boolean resumeInstance(long instanceId) {
LOG.info("Resume instance: {}", instanceId);
return doInSynchronizedTransaction(instanceId, requireWnstanceIdIfWorkflow(instanceId), instance -> {
return doIfTrue(instance.isPaused(), () -> resumeInstance(instance));
});
}
// ------------------------------------------------------------------private methods
private SchedJob getRequiredJob(long jobId) {
return Objects.requireNonNull(jobMapper.get(jobId), () -> "Job not found: " + jobId);
}
private void triggerJob(SchedJob job, RunType runType, long triggerTime) throws JobException {
TriggerInstance triggerInstance = TriggerInstance.of(this, job, null, runType, triggerTime);
triggerInstance.save();
doAfterTransactionCommit(triggerInstance::dispatch);
}
private boolean shouldTerminateDispatchFailedTask(long taskId) {
SchedTask task = taskMapper.get(taskId);
if (!task.isWaiting()) {
return false;
}
int currentDispatchFailedCount = task.getDispatchFailedCount();
if (currentDispatchFailedCount >= conf.getTaskDispatchFailedCountThreshold()) {
return true;
}
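// Optimistically increment the failure counter (guarded by the current value); return true only when this increment is the one that reaches the threshold, so the task is terminated exactly once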
return isOneAffectedRow(taskMapper.incrementDispatchFailedCount(taskId, currentDispatchFailedCount))
&& (currentDispatchFailedCount + 1) == conf.getTaskDispatchFailedCountThreshold();
}
private void updateNextTriggerTime(SchedJob job) {
if (TriggerType.of(job.getTriggerType()) == TriggerType.DEPEND) {
return;
}
Long nextTriggerTime = TriggerTimes.updateNextTriggerTime(job);
if (!nextTriggerTime.equals(job.getNextTriggerTime())) {
job.setNextTriggerTime(nextTriggerTime);
assertOneAffectedRow(jobMapper.updateNextTriggerTime(job), () -> "Update next trigger time failed: " + job);
}
}
private void parseTriggerConfig(SchedJob job) {
String triggerValue = CoreUtils.trimRequired(job.getTriggerValue(), 255, "Trigger value");
job.setTriggerValue(triggerValue);
Long jobId = job.getJobId();
if (TriggerType.of(job.getTriggerType()) == TriggerType.DEPEND) {
List<Long> parentJobIds = Collects.split(triggerValue, Long::parseLong);
Assert.notEmpty(parentJobIds, () -> "Invalid dependency parent job id config: " + triggerValue);
Assert.isTrue(!parentJobIds.contains(jobId), () -> "Cannot depends self: " + jobId + ", " + parentJobIds);
Map<Long, SchedJob> parentJobMap = Collects.toMap(jobMapper.findByJobIds(parentJobIds), SchedJob::getJobId);
for (Long parentJobId : parentJobIds) {
SchedJob parentJob = parentJobMap.get(parentJobId);
Assert.notNull(parentJob, () -> "Parent job id not found: " + parentJobId);
String cGroup = job.getGroup(), pGroup = parentJob.getGroup();
Assert.isTrue(cGroup.equals(pGroup), () -> "Inconsistent depend group: " + cGroup + ", " + pGroup);
}
// Check for circular dependencies and an overly deep dependency hierarchy
checkCircularDepends(jobId, new HashSet<>(parentJobIds));
List<SchedDepend> list = Collects.convert(parentJobIds, pid -> SchedDepend.of(pid, jobId));
Collects.batchProcess(list, dependMapper::batchInsert, JobConstants.PROCESS_BATCH_SIZE);
job.setTriggerValue(Joiner.on(Symbol.Str.COMMA).join(parentJobIds));
job.setNextTriggerTime(null);
} else {
job.setNextTriggerTime(TriggerTimes.updateNextTriggerTime(job));
}
}
private void checkCircularDepends(Long jobId, Set<Long> parentJobIds) {
Set<Long> outerDepends = parentJobIds;
for (int i = 1; ; i++) {
Map<Long, SchedDepend> map = Collects.toMap(dependMapper.findByChildJobIds(parentJobIds), SchedDepend::getParentJobId);
if (MapUtils.isEmpty(map)) {
return;
}
Assert.isTrue(!map.containsKey(jobId), () -> "Circular depends job: " + map.get(jobId));
Assert.isTrue(i < conf.getMaximumJobDependsDepth(), () -> "Exceed depends depth: " + outerDepends);
parentJobIds = map.keySet();
}
}
private boolean dispatch(boolean isRedispatch, SchedJob job, SchedInstance instance, List<SchedTask> tasks) {
ExecuteTaskParamBuilder builder = new ExecuteTaskParamBuilder(job, instance);
RouteStrategy routeStrategy = RouteStrategy.of(job.getRouteStrategy());
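// Three dispatch paths: broadcast routes each task to its bound worker (aborting tasks whose worker is dead);
// non-round-robin routing (or a first dispatch) leaves worker selection to the dispatcher;
// round-robin redispatch pre-assigns the currently least-loaded worker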
List<ExecuteTaskParam> list = new ArrayList<>(tasks.size());
List<Tuple2<Worker, Long>> workload;
if (routeStrategy.isBroadcast()) {
for (SchedTask task : tasks) {
Worker worker = task.worker();
if (!workerClient.isAliveWorker(worker)) {
// Some upstream callers run inside a transaction and some do not. Since this update does not strictly require atomicity, no Spring transaction is applied here.
taskMapper.terminate(task.getTaskId(), null, ExecuteState.BROADCAST_ABORTED, ExecuteState.WAITING, null, null);
} else {
list.add(builder.build(Operation.TRIGGER, task.getTaskId(), instance.getTriggerTime(), worker));
}
}
} else if (!isRedispatch || routeStrategy.isNotRoundRobin() || (workload = calculateWorkload(job, instance)).isEmpty()) {
for (SchedTask task : tasks) {
list.add(builder.build(Operation.TRIGGER, task.getTaskId(), instance.getTriggerTime(), null));
}
} else {
// Round-robin: pick the worker with the fewest assigned tasks
for (SchedTask task : tasks) {
workload.sort(WORKLOAD_COMPARATOR);
Tuple2<Worker, Long> first = workload.get(0);
list.add(builder.build(Operation.TRIGGER, task.getTaskId(), instance.getTriggerTime(), first.a));
first.b += 1;
}
}
return workerClient.dispatch(job.getGroup(), list);
}
private List<Tuple2<Worker, Long>> calculateWorkload(SchedJob job, SchedInstance instance) {
List<Worker> workers = workerClient.getDiscoveredWorkers(job.getGroup());
if (CollectionUtils.isEmpty(workers)) {
LOG.error("No available worker found for workload calculation: {}", job.getGroup());
return Collections.emptyList();
}
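// Score each discovered worker by its count of WAITING/EXECUTING tasks in this instance; fewer tasks means lighter load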
List<SchedTask> pausableTasks = taskMapper.findBaseByInstanceIdAndStates(instance.getInstanceId(), ES_PAUSABLE);
Map<String, Long> workerScoreMapping = pausableTasks.stream()
.filter(e -> StringUtils.isNotBlank(e.getWorker()))
.collect(Collectors.groupingBy(SchedTask::getWorker, Collectors.counting()));
return Collects.convert(workers, e -> Tuple2.of(e, workerScoreMapping.getOrDefault(e.serialize(), 0L)));
}
private Long requireWnstanceIdIfWorkflow(long instanceId) {
Long wnstanceId = instanceMapper.getWnstanceId(instanceId);
if (wnstanceId != null && instanceId != wnstanceId) {
throw new IllegalArgumentException("Must be workflow wnstance id: " + wnstanceId + ", " + instanceId);
}
return wnstanceId;
}
private void doInSynchronizedTransaction(long instanceId, Long wnstanceId, Consumer<SchedInstance> action) {
doInSynchronizedTransaction(instanceId, wnstanceId, convert(action, true));
}
/**
* The JVM lock is taken to minimize, within a single node, waiting on database locks and connection timeouts.
*
* @param instanceId the instance id
* @param wnstanceId the workflow instance id
* @param action the action
* @return boolean value of action result
*/
private boolean doInSynchronizedTransaction(long instanceId, Long wnstanceId, Predicate<SchedInstance> action) {
return doInSynchronizedTransaction0(instanceId, wnstanceId, lockInstanceId -> {
SchedInstance lockedInstance = instanceMapper.lock(lockInstanceId);
Assert.notNull(lockedInstance, () -> "Locked instance not found: " + lockInstanceId);
SchedInstance instance = (instanceId == lockInstanceId) ? lockedInstance : instanceMapper.get(instanceId);
Assert.notNull(instance, () -> "Instance not found: " + instanceId);
if (!Objects.equals(instance.getWnstanceId(), wnstanceId)) {
throw new IllegalStateException("Inconsistent workflow instance id: " + wnstanceId + ", " + instance);
}
return action.test(instance);
});
}
private <T> T doInSynchronizedTransaction0(long instanceId, Long wnstanceId, LongFunction<T> action) {
Long lockInstanceId = wnstanceId != null ? wnstanceId : (Long) instanceId;
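// Intern the instance id so all threads of this JVM synchronize on the same monitor object per instance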
synchronized (CoreUtils.INSTANCE_LOCK_POOL.intern(lockInstanceId)) {
return transactionTemplate.execute(status -> action.apply(lockInstanceId));
}
}
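/**
 * Derives the instance run state from its task execute states: all tasks terminal -> CANCELED if any
 * failed else COMPLETED; no WAITING or EXECUTING task -> PAUSED; otherwise null (instance still active).
 */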
private Tuple2<RunState, Date> obtainRunState(List<SchedTask> tasks) {
List<ExecuteState> states = tasks.stream().map(e -> ExecuteState.of(e.getExecuteState())).collect(Collectors.toList());
if (states.stream().allMatch(ExecuteState::isTerminal)) {
// executeEndTime may be null: a canceled task may never have started
RunState toState = states.stream().anyMatch(ExecuteState::isFailure) ? RunState.CANCELED : RunState.COMPLETED;
Date maxExecuteEndTime = tasks.stream()
.map(SchedTask::getExecuteEndTime).filter(Objects::nonNull).max(Comparator.naturalOrder()).orElseGet(Date::new);
return Tuple2.of(toState, maxExecuteEndTime);
}
// if any task is in WAITING or EXECUTING state, return null
return states.stream().anyMatch(ExecuteState::isPausable) ? null : Tuple2.of(RunState.PAUSED, null);
}
private void pauseInstance(SchedInstance instance) {
if (instance.isWorkflow()) {
long instanceId = instance.getInstanceId();
Assert.isTrue(instance.isWorkflowLead(), () -> "Pause instance must be workflow lead: " + instanceId);
// pause sched_workflow running node
instanceMapper.findWorkflowNode(instanceId).stream().filter(SchedInstance::isPausable).forEach(this::pauseInstance0);
updateWorkflowLeadState(instance, RunState.PAUSED, RS_WAITING);
} else {
pauseInstance0(instance);
}
}
private void pauseInstance0(SchedInstance instance) {
Assert.isTrue(instance.isPausable(), () -> "Invalid pause instance state: " + instance);
long instanceId = instance.getInstanceId();
Operation ops = Operation.PAUSE;
// update task state: (WAITING) -> (PAUSE)
taskMapper.updateStateByInstanceId(instanceId, ops.toState().value(), ES_WAITING, null);
// load the alive executing tasks
List<ExecuteTaskParam> executingTasks = loadExecutingTasks(instance, ops);
if (executingTasks.isEmpty()) {
// no executing tasks remain, update the sched instance state
Tuple2<RunState, Date> tuple = obtainRunState(taskMapper.findBaseByInstanceId(instanceId));
// must be paused or terminated
Assert.notNull(tuple, () -> "Pause instance failed: " + instanceId);
boolean updated = instanceMapper.terminate(instanceId, tuple.a, RS_PAUSABLE, tuple.b);
Assert.state(updated, () -> "Pause instance failed: " + instance + ", " + tuple.a);
if (instance.isWorkflowNode()) {
updateWorkflowNodeState(instance, tuple.a, RS_PAUSABLE);
} else if (tuple.a.isTerminal()) {
instance.markTerminated(tuple.a, tuple.b);
renewFixedNextTriggerTime(instance);
}
} else {
// has alive executing tasks: dispatch and pause executing tasks
doAfterTransactionCommit(() -> workerClient.dispatch(executingTasks));
}
}
private void cancelInstance(SchedInstance instance, Operation ops) {
if (instance.isWorkflow()) {
long instanceId = instance.getInstanceId();
Assert.isTrue(instance.isWorkflowLead(), () -> "Cancel instance must be workflow lead: " + instanceId);
instanceMapper.findWorkflowNode(instanceId).stream().filter(e -> !e.isTerminal()).forEach(e -> cancelInstance0(e, ops));
updateWorkflowLeadState(instance, RunState.CANCELED, RS_RUNNABLE);
} else {
cancelInstance0(instance, ops);
}
}
private void cancelInstance0(SchedInstance instance, Operation ops) {
long instanceId = instance.getInstanceId();
// update: (WAITING or PAUSED) -> (CANCELED)
taskMapper.updateStateByInstanceId(instanceId, ops.toState().value(), ES_EXECUTABLE, new Date());
// load the alive executing tasks
List<ExecuteTaskParam> executingTasks = loadExecutingTasks(instance, ops);
if (executingTasks.isEmpty()) {
// no executing tasks remain
Tuple2<RunState, Date> tuple = obtainRunState(taskMapper.findBaseByInstanceId(instanceId));
Assert.notNull(tuple, () -> "Cancel instance obtain run state failed: " + instanceId);
// if all task paused, should update to canceled state
if (tuple.a == RunState.PAUSED) {
tuple = Tuple2.of(RunState.CANCELED, new Date());
}
if (!instanceMapper.terminate(instanceId, tuple.a, RS_TERMINABLE, tuple.b)) {
throw new IllegalStateException("Cancel instance failed: " + instance + ", " + tuple.a);
}
instance.markTerminated(tuple.a, tuple.b);
if (instance.isWorkflowNode()) {
updateWorkflowNodeState(instance, tuple.a, RS_TERMINABLE);
} else {
renewFixedNextTriggerTime(instance);
}
} else {
// dispatch and cancel executing tasks
doAfterTransactionCommit(() -> workerClient.dispatch(executingTasks));
}
}
private void resumeInstance(SchedInstance instance) {
if (instance.isWorkflow()) {
long instanceId = instance.getInstanceId();
Assert.isTrue(instance.isWorkflowLead(), () -> "Resume instance must be workflow lead: " + instanceId);
// update sched_instance paused lead to running state
boolean updated = instanceMapper.updateState(instanceId, RunState.RUNNING, RunState.PAUSED);
Assert.state(updated, () -> "Resume workflow lead instance failed: " + instanceId);
workflowMapper.resumeWaiting(instanceId);
for (SchedInstance nodeInstance : instanceMapper.findWorkflowNode(instanceId)) {
if (nodeInstance.isPaused()) {
resumeInstance0(nodeInstance);
updateWorkflowNodeState(nodeInstance, RunState.RUNNING, RS_PAUSED);
}
}
WorkflowGraph graph = WorkflowGraph.of(workflowMapper.findByWnstanceId(instanceId));
try {
List<Runnable> dispatchActions = processWorkflowGraph(instance, graph, graph.map());
doAfterTransactionCommit(dispatchActions);
} catch (JobException e) {
ExceptionUtils.rethrow(e);
}
} else {
resumeInstance0(instance);
}
}
private void resumeInstance0(SchedInstance instance) {
long instanceId = instance.getInstanceId();
boolean updated = instanceMapper.updateState(instanceId, RunState.WAITING, RunState.PAUSED);
Assert.state(updated, () -> "Resume sched instance failed: " + instanceId);
int row = taskMapper.updateStateByInstanceId(instanceId, ExecuteState.WAITING.value(), ES_PAUSED, null);
assertHasAffectedRow(row, "Resume sched task failed.");
// dispatch task
Tuple3<SchedJob, SchedInstance, List<SchedTask>> param = buildDispatchParam(instanceId, row);
doAfterTransactionCommit(() -> dispatch(param.a, param.b, param.c));
}
private void afterTerminateTask(SchedInstance instance) {
Assert.isTrue(!instance.isWorkflowLead(), () -> "After terminate task cannot be workflow lead: " + instance);
RunState runState = RunState.of(instance.getRunState());
if (runState == RunState.CANCELED) {
retryJob(instance);
} else if (runState == RunState.COMPLETED) {
if (!instance.isWorkflowNode()) {
renewFixedNextTriggerTime(instance);
}
processWorkflowInstance(instance);
dependJob(instance);
} else {
throw new IllegalStateException("Unknown terminate run state " + runState);
}
}
private void retryJob(SchedInstance failed) {
Long retryingInstanceId = ThrowingSupplier.doCaught(() -> retryJob0(failed));
if (retryingInstanceId != null) {
startRetrying(failed);
return;
}
if (failed.isWorkflowNode()) {
// If the workflow node is not retried, the workflow graph state must be updated
updateWorkflowNodeState(failed, RunState.CANCELED, RS_TERMINABLE);
updateWorkflowLeadState(instanceMapper.get(failed.getWnstanceId()), RunState.CANCELED, RS_RUNNABLE);
} else {
renewFixedNextTriggerTime(failed);
}
}
private Long retryJob0(SchedInstance failed) throws JobException {
SchedJob job = getRequiredJob(failed.getJobId());
int retriedCount = failed.obtainRetriedCount();
if (!job.retryable(RunState.of(failed.getRunState()), retriedCount)) {
return null;
}
// build retry instance
long retryInstanceId = generateId();
long triggerTime = job.computeRetryTriggerTime(++retriedCount);
SchedInstance retryInstance = SchedInstance.of(failed, retryInstanceId, job.getJobId(), RunType.RETRY, triggerTime, retriedCount);
retryInstance.setWorkflowCurNode(failed.getWorkflowCurNode());
// build retry tasks
List<SchedTask> tasks = splitRetryTask(job, failed, retryInstance);
if (tasks.isEmpty()) {
// all workers of the failed broadcast tasks are dead, so terminate the retry
LOG.warn("Retry instance split tasks is empty: {}, {}", job, failed);
return null;
}
ThrowingSupplier<Runnable, Throwable> persistenceAction = () -> {
if (failed.isWorkflowNode()) {
// for a workflow node, sched_workflow.instance_id must be updated to the retry instance id
String curNode = failed.getWorkflowCurNode();
int row = workflowMapper.update(failed.getWnstanceId(), curNode, null, retryInstanceId, RS_RUNNING, failed.getInstanceId());
assertHasAffectedRow(row, () -> "Retry instance, workflow node update failed.");
}
saveInstanceAndTasks(retryInstance, tasks);
return () -> dispatch(job, retryInstance, tasks);
};
Consumer<Throwable> errorHandler = t -> { throw new IllegalStateException("Create retry instance failed: " + failed, t); };
// Use a nested transaction to guarantee atomicity of the `workflow & instance & tasks` operations; on exception it rolls back without affecting the outer transaction
Runnable dispatchAction = doInNestedTransaction(transactionTemplate, persistenceAction, errorHandler);
doAfterTransactionCommit(dispatchAction);
return retryInstanceId;
}
private List<SchedTask> splitRetryTask(SchedJob job, SchedInstance failed, SchedInstance retry) throws JobException {
RetryType retryType = RetryType.of(job.getRetryType());
if (retryType == RetryType.ALL) {
// re-split job
SplitJobParam splitJobParam;
if (failed.isWorkflow()) {
List<PredecessorInstance> list = loadWorkflowPredecessorInstances(job, failed.getWnstanceId(), failed.getInstanceId());
splitJobParam = ModelConverter.toSplitJobParam(job, retry, list);
} else {
splitJobParam = ModelConverter.toSplitJobParam(job, retry);
}
return splitJob(job.getGroup(), retry.getInstanceId(), splitJobParam);
} else if (retryType == RetryType.FAILED) {
return taskMapper.findLargeByInstanceId(failed.getInstanceId())
.stream()
.filter(SchedTask::isFailure)
// Broadcast task must be retried with the same worker
.filter(e -> !job.isBroadcast() || workerClient.isAliveWorker(e.worker()))
.map(e -> SchedTask.of(e.getTaskParam(), generateId(), retry.getInstanceId(), e.getTaskNo(), e.getTaskCount(), e.getWorker()))
.collect(Collectors.toList());
} else {
throw new UnsupportedOperationException("Retry instance, unknown retry type: " + job.getJobId() + ", " + retryType);
}
}
private void dependJob(SchedInstance parent) {
if (parent.isWorkflowNode() || !parent.isCompleted()) {
return;
}
for (SchedDepend depend : dependMapper.findByParentJobId(parent.getJobId())) {
ThrowingRunnable.doCaught(() -> dependJob(parent, depend), () -> "Depend job error: " + parent + ", " + depend);
}
}
private void dependJob(SchedInstance parent, SchedDepend depend) throws JobException {
SchedJob childJob = getRequiredJob(depend.getChildJobId());
if (childJob.isDisabled()) {
LOG.warn("Depend child job disabled: {}", childJob);
return;
}
// Use a nested transaction to guarantee atomicity of the data operations inside `save`; on exception it rolls back without affecting the outer transaction
TriggerInstance dependInstance = TriggerInstance.of(this, childJob, parent, RunType.DEPEND, System.currentTimeMillis());
Consumer<Throwable> errorHandler = t -> LOG.error("Create depend instance failed: {}, {}", childJob, parent, t);
ThrowingSupplier<Runnable, Throwable> persistenceAction = () -> { dependInstance.save(); return dependInstance::dispatch; };
Runnable dispatchAction = doInNestedTransaction(transactionTemplate, persistenceAction, errorHandler);
doAfterTransactionCommit(dispatchAction);
}
private List<ExecuteTaskParam> loadExecutingTasks(SchedInstance instance, Operation ops) {
List<ExecuteTaskParam> executingTasks = new ArrayList<>();
ExecuteTaskParamBuilder builder = null;
long triggerTime = System.currentTimeMillis();
for (SchedTask task : taskMapper.findBaseByInstanceIdAndStates(instance.getInstanceId(), ES_EXECUTING)) {
Worker worker = task.worker();
if (workerClient.isAliveWorker(worker)) {
if (builder == null) {
builder = new ExecuteTaskParamBuilder(getRequiredJob(instance.getJobId()), instance);
}
executingTasks.add(builder.build(ops, task.getTaskId(), triggerTime, worker));
} else {
// update dead task
Date executeEndTime = ops.toState().isTerminal() ? new Date() : null;
ExecuteState toState = ops.toState().isTerminal() ? ExecuteState.EXECUTE_ABORTED : ops.toState();
ExecuteState fromState = ExecuteState.EXECUTING;
if (taskMapper.terminate(task.getTaskId(), task.getWorker(), toState, fromState, executeEndTime, null)) {
LOG.info("Terminate dead worker executing task success: {}", task);
} else {
LOG.error("Terminate dead worker executing task failed: {}", task);
}
}
}
return executingTasks;
}
private Tuple3<SchedJob, SchedInstance, List<SchedTask>> buildDispatchParam(long instanceId, int expectTaskSize) {
SchedInstance instance = instanceMapper.get(instanceId);
SchedJob job = getRequiredJob(instance.getJobId());
List<SchedTask> waitingTasks = taskMapper.findLargeByInstanceIdAndStates(instanceId, ES_WAITING);
int size = waitingTasks.size();
Assert.state(size == expectTaskSize, () -> "Invalid dispatch tasks size: " + size + ", " + expectTaskSize);
return Tuple3.of(job, instance, waitingTasks);
}
private void startRetrying(SchedInstance instance) {
if (!instance.isRunRetry()) {
RunState state = RunState.CANCELED;
boolean updated = instanceMapper.updateRetrying(instance.getInstanceId(), true, state, state);
Assert.state(updated, () -> "Start retrying failed: " + instance);
}
}
private void stopRetrying(SchedInstance instance, RunState toState) {
if (instance.isRunRetry()) {
long id = instance.obtainRetryOriginalInstanceId();
boolean updated = instanceMapper.updateRetrying(id, false, toState, RunState.CANCELED);
Assert.state(updated, () -> "Stop retrying failed: " + toState + ", " + instance);
}
}
private void renewFixedNextTriggerTime(SchedInstance instance) {
Assert.isTrue(instance.isTerminal(), () -> "Renew fixed instance must be terminal state: " + instance);
Assert.isTrue(!instance.isWorkflowNode(), () -> "Renew fixed instance cannot be workflow node: " + instance);
if (instance.isRunRetry()) {
stopRetrying(instance, RunState.of(instance.getRunState()));
}
long instanceId = instance.obtainRetryOriginalInstanceId();
SchedInstance original = (instanceId == instance.getInstanceId()) ? instance : instanceMapper.get(instanceId);
if (!original.getJobId().equals(instance.getJobId()) || !RunType.SCHEDULE.equalsValue(original.getRunType())) {
return;
}
SchedJob job = jobMapper.get(original.getJobId());
TriggerType triggerType;
if (job == null || job.isDisabled() || !(triggerType = TriggerType.of(job.getTriggerType())).isFixedTriggerType()) {
return;
}
long lastTriggerTime = original.getTriggerTime(), nextTriggerTime;
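// FIXED_RATE computes from the original trigger time (bounded below by the run end time); FIXED_DELAY computes from the run end time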
if (triggerType == TriggerType.FIXED_RATE) {
Date time = triggerType.computeNextTriggerTime(job.getTriggerValue(), new Date(original.getTriggerTime()));
nextTriggerTime = Dates.max(time, original.getRunEndTime()).getTime();
} else {
// TriggerType.FIXED_DELAY
nextTriggerTime = triggerType.computeNextTriggerTime(job.getTriggerValue(), original.getRunEndTime()).getTime();
}
boolean updated = isOneAffectedRow(jobMapper.updateFixedNextTriggerTime(job.getJobId(), lastTriggerTime, nextTriggerTime));
LOG.info("Renew fixed next trigger time: {}, {}, {}, {}", job.getJobId(), lastTriggerTime, nextTriggerTime, updated);
}
// ------------------------------------------------------------------private workflow methods
private void updateWorkflowNodeState(SchedInstance node, RunState toState, List<Integer> fromStates) {
Assert.isTrue(node.isWorkflowNode(), () -> "Update workflow cur node state must be node: " + node);
String curNode = node.getWorkflowCurNode();
int row = workflowMapper.update(node.getWnstanceId(), curNode, toState.value(), null, fromStates, node.getInstanceId());
assertHasAffectedRow(row, () -> "Update workflow state failed: " + node + ", " + toState);
if (toState.isTerminal()) {
stopRetrying(node, toState);
}
}
private void updateWorkflowLeadState(SchedInstance lead, RunState toState, List<Integer> fromStates) {
Assert.isTrue(lead.isWorkflowLead(), () -> "Update workflow lead state must be lead: " + lead);
long wnstanceId = lead.getWnstanceId();
workflowMapper.update(wnstanceId, null, toState.value(), null, fromStates, null);
stopWorkflowGraph(wnstanceId, WorkflowGraph.of(workflowMapper.findByWnstanceId(wnstanceId)));
}
private void processWorkflowInstance(SchedInstance node) {
if (!node.isWorkflowNode()) {
return;
}
// update current node state
updateWorkflowNodeState(node, RunState.COMPLETED, RS_TERMINABLE);
// if terminal all, then update workflow nodes
long wnstanceId = node.getWnstanceId();
WorkflowGraph graph = WorkflowGraph.of(workflowMapper.findByWnstanceId(wnstanceId));
if (stopWorkflowGraph(wnstanceId, graph)) {
return;
}
// process next workflow node
Map<DAGEdge, SchedWorkflow> map = graph.successors(node.parseWorkflowCurNode());
SchedInstance lead = instanceMapper.get(wnstanceId);
Consumer<Throwable> errorHandler = t -> {
LOG.error("Process workflow node error: {}", node, t);
updateWorkflowLeadState(lead, RunState.CANCELED, RS_RUNNABLE);
};
// Use a nested transaction to guarantee atomicity of the data operations inside `processWorkflowGraph`; on exception it rolls back without affecting the outer transaction
ThrowingSupplier<List<Runnable>, Throwable> persistenceAction = () -> processWorkflowGraph(lead, graph, map);
List<Runnable> dispatchActions = doInNestedTransaction(transactionTemplate, persistenceAction, errorHandler);
doAfterTransactionCommit(dispatchActions);
}
private List<Runnable> processWorkflowGraph(SchedInstance lead, WorkflowGraph graph,
Map<DAGEdge, SchedWorkflow> map) throws JobException {
Assert.isTrue(lead.isWorkflowLead(), () -> "Process workflow node must be lead: " + lead);
List<Runnable> dispatchActions = new ArrayList<>();
if (!map.isEmpty()) {
SchedJob job = getRequiredJob(lead.getJobId());
Set<DAGNode> duplicates = new HashSet<>();
for (Map.Entry<DAGEdge, SchedWorkflow> edge : map.entrySet()) {
processWorkflowGraph(dispatchActions, job, lead, graph, duplicates, edge);
}
}
return dispatchActions;
}
private void processWorkflowGraph(List<Runnable> dispatchActions, SchedJob job, SchedInstance lead, WorkflowGraph graph,
Set<DAGNode> duplicates, Map.Entry<DAGEdge, SchedWorkflow> edge) throws JobException {
long wnstanceId = lead.getWnstanceId();
DAGNode target = edge.getKey().getTarget();
SchedWorkflow workflow = edge.getValue();
if (target.isEnd() || !workflow.isWaiting() || !duplicates.add(target)) {
// skip if the current node is the end node, is not in waiting state, or was already processed
return;
}
Collection<SchedWorkflow> predecessors = graph.predecessors(target).values();
if (predecessors.stream().anyMatch(e -> !e.isTerminal())) {
// skip if any predecessor node has not yet finished
return;
}
if (predecessors.stream().anyMatch(SchedWorkflow::isFailure)) {
RunState state = RunState.CANCELED;
int row = workflowMapper.update(wnstanceId, workflow.getCurNode(), state.value(), null, RS_TERMINABLE, null);
assertHasAffectedRow(row, () -> "Update workflow cur node state failed: " + workflow + ", " + state);
return;
}
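// all predecessors completed successfully: create the next node instance, split its tasks and mark the workflow node running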
long nextInstanceId = generateId();
RunType runType = RunType.of(lead.getRunType());
SchedWorkflow lastPredecessor = predecessors.stream().max(BaseEntity.UPDATED_AT_COMPARATOR).orElse(null);
SchedInstance parent = (lastPredecessor == null) ? lead : instanceMapper.get(lastPredecessor.getInstanceId());
SchedInstance nextInstance = SchedInstance.of(parent, nextInstanceId, job.getJobId(), runType, System.currentTimeMillis(), 0);
nextInstance.setWorkflowCurNode(workflow.getCurNode());
int row = workflowMapper.update(wnstanceId, workflow.getCurNode(), RunState.RUNNING.value(), nextInstanceId, RS_WAITING, null);
assertHasAffectedRow(row, () -> "Start workflow node failed: " + workflow);
List<PredecessorInstance> list = predecessors.isEmpty() ? null : loadWorkflowPredecessorInstances(job, wnstanceId, nextInstanceId);
SplitJobParam splitJobParam = ModelConverter.toSplitJobParam(job, nextInstance, list);
List<SchedTask> tasks = splitJob(job.getGroup(), nextInstanceId, splitJobParam);
// save to db
saveInstanceAndTasks(nextInstance, tasks);
dispatchActions.add(() -> dispatch(job, nextInstance, tasks));
}
private boolean stopWorkflowGraph(long wnstanceId, WorkflowGraph graph) {
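// Settles the workflow graph: completes the END node once all its predecessors are terminal, then terminates or pauses the lead instance when every node has settled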
if (graph.anyMatch(e -> e.getKey().getTarget().isEnd() && !e.getValue().isTerminal())) {
// if end node is not terminal state, then process the end node run state
Map<DAGEdge, SchedWorkflow> ends = graph.predecessors(DAGNode.END);
if (ends.values().stream().allMatch(SchedWorkflow::isTerminal)) {
RunState endState = ends.values().stream().anyMatch(SchedWorkflow::isFailure) ? RunState.CANCELED : RunState.COMPLETED;
int row = workflowMapper.update(wnstanceId, DAGNode.END.toString(), endState.value(), null, RS_TERMINABLE, null);
assertHasAffectedRow(row, () -> "Update workflow end node failed: " + wnstanceId + ", " + endState);
ends.forEach((k, v) -> graph.get(k.getTarget(), DAGNode.END).setRunState(endState.value()));
}
}
if (graph.allMatch(e -> e.getValue().isTerminal())) {
// terminate lead instance
RunState state = graph.anyMatch(e -> e.getValue().isFailure()) ? RunState.CANCELED : RunState.COMPLETED;
boolean updated = instanceMapper.terminate(wnstanceId, state, RS_TERMINABLE, new Date());
Assert.state(updated, () -> "Stop workflow instance failed: " + wnstanceId + ", " + state);
SchedInstance lead = instanceMapper.get(wnstanceId);
dependJob(lead);
renewFixedNextTriggerTime(lead);
return true;
}
if (graph.allMatch(e -> e.getValue().isTerminal() || e.getValue().isPaused())) {
// at least one node is paused and the others are terminal
boolean updated = instanceMapper.updateState(wnstanceId, RunState.PAUSED, RunState.RUNNING);
Assert.state(updated, () -> "Update workflow pause state failed: " + wnstanceId);
return true;
}
return false;
}
private List<PredecessorInstance> loadWorkflowPredecessorInstances(SchedJob job, long wnstanceId, Long instanceId) {
List<SchedWorkflow> workflows = workflowMapper.findByWnstanceId(wnstanceId);
SchedWorkflow curWorkflow = workflows.stream().filter(e -> instanceId.equals(e.getInstanceId())).findAny().orElse(null);
Assert.state(curWorkflow != null, () -> "Not found current workflow node: " + wnstanceId + ", " + instanceId);
Map<DAGEdge, SchedWorkflow> predecessors = WorkflowGraph.of(workflows).predecessors(curWorkflow.parseCurNode());
if (predecessors.isEmpty()) {
return null;
}
RetryType retryType = RetryType.of(job.getRetryType());
return Collects.convert(predecessors.values(), e -> {
// all tasks under a predecessor instance have executed successfully
List<SchedTask> tasks = taskMapper.findLargeByInstanceId(e.getInstanceId());
SchedInstance prev;
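// For RetryType.FAILED, a retried predecessor spreads its tasks over the original instance and its retry children; merge the COMPLETED tasks of all of them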
if (retryType == RetryType.FAILED && (prev = instanceMapper.get(e.getInstanceId())).isRunRetry()) {
Set<Long> instanceIds = instanceMapper.findChildren(prev.getPnstanceId(), RunType.RETRY.value())
.stream()
.map(SchedInstance::getInstanceId)
.filter(t -> !Objects.equals(t, e.getInstanceId()))
.collect(Collectors.toSet());
instanceIds.add(prev.getPnstanceId());
instanceIds.forEach(t -> tasks.addAll(taskMapper.findLargeByInstanceIdAndStates(t, ES_COMPLETED)));
}
tasks.sort(SchedTask.TASK_NO_COMPARATOR);
return ModelConverter.toPredecessorInstance(e, tasks);
});
}
}