All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.distelli.monitor.impl.TaskManagerImpl Maven / Gradle / Ivy

There is a newer version: 10.0.4
Show newest version
package com.distelli.monitor.impl;

import com.distelli.jackson.transform.TransformModule;
import com.distelli.monitor.Monitor;
import com.distelli.monitor.MonitorInfo;
import com.distelli.monitor.Sequence;
import com.distelli.monitor.TaskBuilder;
import com.distelli.monitor.TaskContext;
import com.distelli.monitor.TaskFunction;
import com.distelli.monitor.TaskInfo;
import com.distelli.monitor.TaskManager;
import com.distelli.monitor.TaskState;
import com.distelli.persistence.AttrType;
import com.distelli.persistence.Index;
import com.distelli.persistence.FilterCondExpr;
import com.distelli.persistence.PageIterator;
import com.distelli.persistence.TableDescription;
import com.distelli.persistence.UpdateItemBuilder;
import com.distelli.utils.CompactUUID;
import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Future;
import java.util.concurrent.Semaphore;
import java.util.concurrent.ScheduledThreadPoolExecutor;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;
import javax.inject.Inject;
import javax.inject.Singleton;
import javax.persistence.RollbackException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.distelli.persistence.AttrType;
import java.util.function.Predicate;
import java.util.function.Consumer;
import static com.distelli.utils.LongSortKey.*;

/**
 * If we lose the DB connection or the JVM is `kill -9`ed, then tasks in these states will
 * move out of these states by these mechanisms:
 *
 *  QUEUED: startRunnableTasks()
 *  RUNNING: releaseLocksForMonitorId() when broken monitor is found.
 *  WAITING_FOR_PREREQUISITE: releaseLocks("_TASK"+longToSortKey(prerequisiteId))
 *  WAITING_FOR_INTERVAL: releaseLocksForMonitorId() when broken monitor is found.
 *  WAITING_FOR_LOCK: releaseLocks(lockId)
 */
@Singleton
public class TaskManagerImpl implements TaskManager {
    // NOTE(review): generic type arguments appear to have been stripped from
    // this listing (e.g. "Set>" and "new HashSet>()" below); restore them from
    // the original source before compiling.

    // Period, in milliseconds, at which startRunnableTasks() is scheduled; also
    // used as the upper bound for a single delayed-task wait.
    private static final int POLL_INTERVAL_MS = 10000;
    // runNextTask() spaces task starts at least
    // POLL_INTERVAL_MS/MAX_TASKS_IN_INTERVAL milliseconds apart.
    private static final int MAX_TASKS_IN_INTERVAL = 10;
    // Every POLL_INTERVAL_MS*CLEANUP_INTERVALS we perform a scan
    // to see if we need to do some cleanup:
    private static final int CLEANUP_INTERVALS = 30;
    // Range key ("tid") used by entries in the locks table that are "actual" locks:
    private static final String TASK_ID_NONE = "#";
    // Value of "mid" for tasks that are runnable:
    private static final String MONITOR_ID_QUEUED = "#";
    // Value of "mid" for tasks that are waiting on a lock/prerequisite:
    private static final String MONITOR_ID_WAITING = "$";
    private static final Logger LOG = LoggerFactory.getLogger(TaskManagerImpl.class);

    @Inject
    private Monitor _monitor;
    @Inject
    private ScheduledExecutorService _executor;
    // Looked up by task.getEntityType() in addTask():
    @Inject
    private Map _taskFunctions;
    // Source of new task ids (see createTask()):
    @Inject
    private Sequence _sequence;

    // constants (no locking needed):
    private Index _locks;
    private Index _locksForMonitor;
    private Index _tasks;
    private Index _tasksForMonitor;
    private Index _tasksForEntity;
    private Index _nonTerminalTasksForEntity;
    private final ObjectMapper _om = new ObjectMapper();
    // Keyed by task id; one entry per delayed task currently being monitored:
    private final Map _delayedTasks =
        new ConcurrentHashMap<>();
    // Permits bounding concurrent runNextTask() executions; created with
    // _maxCapacity permits in init():
    private Semaphore _capacity;
    private int _maxCapacity;

    // synchronized(_taskQueue):
    private Set _taskQueue = new LinkedHashSet<>();
    // synchronized(this):
    private ScheduledFuture _monitorTasks;
    // synchronized(this):
    private Set> _spawnedFutures = new HashSet<>();
    // synchronized(this):
    private long _lastRunTask = 0;
    // synchronized(this):
    private Future _runNextTaskFuture = null;

    // Callbacks invoked by onTerminalState(); iteration must synchronize on the set:
    private Set> _onTerminalState =
        Collections.synchronizedSet(new HashSet>());

    // Filter applied to queued tasks in startRunnableTasks(); set by monitorTaskQueueFor():
    private Predicate _taskMatches;

    // Starts at a random offset in [0, CLEANUP_INTERVALS-1); presumably so that
    // multiple instances do not all run cleanup on the same poll -- TODO confirm.
    private AtomicInteger _pollCount = new AtomicInteger(
        ThreadLocalRandom.current().nextInt(0, CLEANUP_INTERVALS-1));

    /**
     * Bookkeeping for a task that is waiting out a delay: the delay that was
     * remaining when the record was created, and the {@code milliTime()}
     * reading taken at that moment.
     */
    private static class DelayedTask {
        public long millisTimeBegin;
        public long millisRemaining;

        private DelayedTask(long millisRemaining) {
            this.millisRemaining = millisRemaining;
            this.millisTimeBegin = milliTime();
        }
    }

    /** Describes the {@code monitor-tasks} table and the indexes defined on it. */
    public static class TasksTable {
        /**
         * @return a freshly built description of the {@code monitor-tasks} table.
         */
        public static TableDescription getTableDescription() {
            return TableDescription.builder()
                .tableName("monitor-tasks")
                // Lookup by task id.
                .index(byId -> byId
                    .readCapacity(2L)
                    .writeCapacity(12L)
                    .hashKey("id", AttrType.NUM))
                // Lookup by monitor id, where two "special" values are used:
                //    '#' - runnable
                //    '$' - waiting on a lock/prerequisite
                .index(byMonitor -> byMonitor
                    .writeCapacity(6L)
                    .indexName("mid-id-index")
                    .hashKey("mid", AttrType.STR)
                    .rangeKey("id", AttrType.NUM))
                // Lookup by entity.
                .index(byEntity -> byEntity
                    .writeCapacity(6L)
                    .indexName("ety-eid-index")
                    .hashKey("ety", AttrType.STR)
                    .rangeKey("eid", AttrType.STR))
                // Lookup by entity, restricted to non-terminal tasks.
                .index(byLiveEntity -> byLiveEntity
                    .writeCapacity(6L)
                    .indexName("ntty-ntid-index")
                    .hashKey("ntty", AttrType.STR)
                    .rangeKey("ntid", AttrType.STR))
                .build();
        }
    }

    /**
     * Describes the {@code monitor-locks} table. Each entry is either a lock
     * or a task that is waiting on a lock.
     */
    public static class LocksTable {
        /**
         * @return a freshly built description of the {@code monitor-locks} table.
         */
        public static TableDescription getTableDescription() {
            return TableDescription.builder()
                .tableName("monitor-locks")
                // This table sees a LOT of writes. Entries that are "actual"
                // locks use tid=TASK_ID_NONE as their range key.
                .index(byLock -> byLock
                    .writeCapacity(8L)
                    .hashKey("lid", AttrType.STR)
                    .rangeKey("tid", AttrType.STR))
                // Lookup by monitor id.
                .index(byMonitor -> byMonitor
                    .writeCapacity(3L)
                    .indexName("mid-index")
                    .hashKey("mid", AttrType.STR))
                .build();
        }
    }

    /**
     * Lists tasks of the given entity type without filtering on entity id.
     * Delegates to the three-argument overload with a null prefix.
     *
     * @param entityType the entity type to query.
     * @param iter the page iterator passed through to the query.
     * @return the tasks returned by the query.
     */
    @Override
    public List getTasksByEntityType(String entityType, PageIterator iter) {
        return getTasksByEntityType(entityType, null, iter);
    }

    /**
     * Lists tasks of the given entity type, optionally restricted to those
     * whose range key in the entity index begins with a prefix.
     *
     * @param entityType the entity type to query.
     * @param entityIdBeginsWith prefix to match; null or empty means no prefix filter.
     * @param iter the page iterator passed through to the query.
     * @return the tasks returned by the query.
     */
    @Override
    public List getTasksByEntityType(
        String entityType, String entityIdBeginsWith, PageIterator iter)
    {
        boolean hasPrefix = null != entityIdBeginsWith && ! entityIdBeginsWith.isEmpty();
        if ( hasPrefix ) {
            return _tasksForEntity.queryItems(entityType, iter)
                .beginsWith(entityIdBeginsWith)
                .list();
        }
        return _tasksForEntity.queryItems(entityType, iter).list();
    }

    /**
     * Same query as {@code getTasksByEntityType}, but issued against the
     * non-terminal entity index.
     *
     * @param entityType the entity type to query.
     * @param taskIdBeginsWith prefix to match; null or empty means no prefix filter.
     * @param iter the page iterator passed through to the query.
     * @return the tasks returned by the query.
     */
    @Override
    public List getNonTerminalTasksByEntityIdBeginsWith(
        String entityType, String taskIdBeginsWith, PageIterator iter)
    {
        boolean hasPrefix = null != taskIdBeginsWith && ! taskIdBeginsWith.isEmpty();
        if ( hasPrefix ) {
            return _nonTerminalTasksForEntity.queryItems(entityType, iter)
                .beginsWith(taskIdBeginsWith)
                .list();
        }
        return _nonTerminalTasksForEntity.queryItems(entityType, iter)
            .list();
    }

    /**
     * Scans the monitor-id index ("mid-id-index") of the tasks table.
     * NOTE(review): presumably only non-terminal tasks carry a "mid"
     * attribute, which is what makes this a non-terminal listing -- confirm.
     *
     * @param iter the page iterator passed through to the scan.
     * @return the tasks returned by the scan.
     */
    @Override
    public List getNonTerminalTasks(PageIterator iter) {
        return _tasksForMonitor.scanItems(iter);
    }

    /**
     * Scans the primary index of the tasks table.
     *
     * @param iter the page iterator passed through to the scan.
     * @return the tasks returned by the scan.
     */
    @Override
    public List getAllTasks(PageIterator iter) {
        return _tasks.scanItems(iter);
    }

    /**
     * Fetches a single task by id from the primary index of the tasks table.
     *
     * @param taskId the id (hash key) of the task.
     * @return whatever the index returns for that key.
     */
    @Override
    public TaskInfo getTask(Long taskId) {
        // The primary index has no range key, hence the null second argument.
        return _tasks.getItem(taskId, null);
    }

    /**
     * Creates a builder whose {@code build()} stamps the resulting task with
     * the next id drawn from the sequence named after the tasks table.
     *
     * @return a new task builder.
     */
    @Override
    public TaskBuilder createTask() {
        return new TaskBuilderImpl() {
            @Override
            public TaskInfo build() {
                final Task built = (Task)super.build();
                // Ids are allocated from the sequence keyed by the table name:
                built.taskId = _sequence.next(_tasks.getTableName());
                return built;
            }
        };
    }

    /**
     * Validates, resets, persists and dispatches a task.
     *
     * @param taskInfo a task obtained from {@link #createTask()}.
     * @throws IllegalArgumentException if the task is not a {@code Task}, is
     *     missing its entity type or entity id, or has an entity type with
     *     no registered task function.
     * @throws IllegalStateException if the task has no task id.
     */
    @Override
    public void addTask(TaskInfo taskInfo) {
        if ( ! (taskInfo instanceof Task) ) {
            throw new IllegalArgumentException("TaskInfo must be created from TaskManager.createTask()");
        }
        final Task task = (Task)taskInfo;
        final String entityType = task.getEntityType();
        if ( null == entityType ) {
            throw new IllegalArgumentException("missing task.entityType");
        }
        if ( null == task.getEntityId() ) {
            throw new IllegalArgumentException("missing task.entityId");
        }
        if ( null == task.getTaskId() ) {
            throw new IllegalStateException("_sequence.next("+_tasks.getTableName()+") returned null!");
        }
        if ( null == _taskFunctions.get(entityType) ) {
            throw new IllegalArgumentException(
                "missing TaskFunction for task.entityType="+entityType);
        }

        // A task with a remaining interval starts out waiting for that
        // interval; otherwise it is simply queued.
        task.taskState = ( null == task.getMillisecondsRemaining() )
            ? TaskState.QUEUED
            : TaskState.WAITING_FOR_INTERVAL;
        task.monitorId = MONITOR_ID_QUEUED;

        // Clear any leftovers from a previous run (just in case):
        task.runCount = 0L;
        task.startTime = null;
        task.endTime = null;
        task.canceledBy = null;
        task.errorId = null;
        task.errorMessage = null;
        task.errorMessageStackTrace = null;

        // Persist, then hand the task to the local run queue:
        _tasks.putItemOrThrow(task);
        submitRunTask(task.getTaskId());
        submit(this::runNextTask);
    }

    /**
     * Deletes a task record, provided no monitor currently owns it.
     *
     * The delete is conditional: it succeeds when the item does not exist,
     * when the item has no "mid" attribute, or when its "mid" is one of the
     * two special values (queued / waiting).
     *
     * @param taskId the id of the task to delete.
     * @throws IllegalStateException if the conditional delete is rolled back,
     *     i.e. the task's "mid" is some other value; the underlying
     *     {@code RollbackException} is attached as the cause.
     */
    @Override
    public void deleteTask(long taskId) throws IllegalStateException {
        try {
            _tasks.deleteItem(
                taskId, null, (expr) ->
                expr.or(
                    expr.not(expr.exists("id")),
                    expr.or(
                        expr.not(expr.exists("mid")),
                        expr.in(
                            "mid",
                            Arrays.asList(MONITOR_ID_QUEUED, MONITOR_ID_WAITING)))));
        } catch ( RollbackException ex ) {
            // Keep the original exception as the cause so the failed
            // condition can still be diagnosed from the stack trace.
            throw new IllegalStateException(
                "Attempt to deleteTask("+taskId+") which is currently locked", ex);
        }
    }

    /**
     * Marks a task as "to be canceled" and, if the task was parked in the
     * waiting state, moves it back to the queued state so it gets executed.
     *
     * @param canceledBy recorded in the task's "cancel" attribute; must not be null.
     * @param taskId the id of the task to cancel.
     * @throws IllegalArgumentException if {@code canceledBy} is null.
     */
    @Override
    public void cancelTask(String canceledBy, long taskId) {
        if ( null == canceledBy ) {
            throw new IllegalArgumentException("canceledBy may not be null");
        }

        // Step 1: record who canceled, but only while the task still has a "mid".
        try {
            _tasks.updateItem(taskId, null)
                .set("cancel", AttrType.STR, canceledBy)
                .when((expr) -> expr.exists("mid"));
        } catch ( RollbackException ex ) {
            LOG.debug("Attempt to cancel taskId="+taskId+" that is in a final state, ignoring");
            return;
        }

        // Step 2: if the task is waiting, flip it to queued. A rollback here
        // just means it was not waiting, so there is nothing more to do.
        final String queuedState = toString(TaskState.QUEUED);
        try {
            _tasks.updateItem(taskId, null)
                .set("mid", AttrType.STR, MONITOR_ID_QUEUED)
                .set("stat", AttrType.STR, queuedState)
                .when((expr) -> expr.beginsWith("mid", MONITOR_ID_WAITING));
        } catch ( RollbackException ex ) {
            return;
        }

        // Step 3: the task left the waiting state, so run it.
        submitRunTask(taskId);
        submit(this::runNextTask);
    }

    /**
     * Starts monitoring the task queue with no task filter; equivalent to
     * {@code monitorTaskQueueFor(null)}.
     */
    @Override
    public synchronized void monitorTaskQueue() {
        monitorTaskQueueFor(null);
    }

    /**
     * Installs the task filter and, if polling is not already running,
     * schedules {@code startRunnableTasks()} every POLL_INTERVAL_MS
     * milliseconds after a random initial delay below that interval.
     *
     * The filter is replaced even when polling is already active.
     *
     * @param taskMatches filter for queued tasks; null accepts every task.
     */
    @Override
    public synchronized void monitorTaskQueueFor(
        Predicate taskMatches)
    {
        if ( null == taskMatches ) {
            _taskMatches = (info) -> true;
        } else {
            _taskMatches = taskMatches;
        }
        // Already polling, or nothing to schedule on:
        if ( null != _monitorTasks || null == _executor ) {
            return;
        }
        final long initialDelayMs = ThreadLocalRandom.current().nextLong(POLL_INTERVAL_MS);
        _monitorTasks = _executor.scheduleAtFixedRate(
            this::startRunnableTasks,
            initialDelayMs,
            POLL_INTERVAL_MS,
            TimeUnit.MILLISECONDS);
    }

    /**
     * Registers a callback in the set that {@code onTerminalState(TaskInfo)}
     * iterates over.
     *
     * @param onTerminalState the callback to add.
     */
    @Override
    public void addOnTerminalState(Consumer onTerminalState) {
        _onTerminalState.add(onTerminalState);
    }

    /**
     * Removes a callback previously registered with
     * {@code addOnTerminalState}.
     *
     * @param onTerminalState the callback to remove.
     */
    @Override
    public void removeOnTerminalState(Consumer onTerminalState) {
        _onTerminalState.remove(onTerminalState);
    }

    /**
     * Stops the queue poller, cancels all futures spawned by this manager and
     * waits for running tasks to give back their capacity permits.
     *
     * Does nothing if the poller is not currently scheduled.
     *
     * NOTE(review): generic type arguments appear to have been stripped from
     * this listing (see "Set>" below); restore them from the original source.
     *
     * @param mayInterruptIfRunning passed to {@code Future.cancel} for the
     *     spawned futures on the first cancellation attempt.
     */
    @Override
    public void stopTaskQueueMonitor(boolean mayInterruptIfRunning) {
        Set> allFutures = new HashSet<>();
        synchronized ( this ) {
            // Not monitoring: nothing to stop.
            if ( null == _monitorTasks ) return;
            // Stop the poller (without interrupting a poll in progress) and
            // mark monitoring as stopped so submit()/schedule() become no-ops.
            _monitorTasks.cancel(false);
            allFutures.add(_monitorTasks);
            _monitorTasks = null;
        }
        // updateDelayedTask is defined elsewhere in this class; it is called
        // here with a null second argument for every delayed task.
        for ( Long taskId : _delayedTasks.keySet() ) {
            updateDelayedTask(taskId, null);
        }
        synchronized ( this ) {
            // Monitoring was restarted while the lock was released: leave the
            // new run alone.
            if ( null != _monitorTasks ) return;
            for ( Future future : _spawnedFutures ) {
                future.cancel(mayInterruptIfRunning);
                allFutures.add(future);
            }
            _spawnedFutures.clear();
        }
        try {
            // Ensure cancelation eventually occurs: try to take every
            // capacity permit, waiting 60, 30, 15, 7, 3, 1 and finally 0
            // seconds. After each failed wait all futures are canceled again,
            // this time with interruption.
            for ( long seconds = 60;
                  ! _capacity.tryAcquire(_maxCapacity, seconds, TimeUnit.SECONDS);
                  seconds = seconds / 2)
            {
                for ( Future future : allFutures ) {
                    // Force!
                    future.cancel(true);
                }
                // The zero-second attempt also failed: give up.
                if ( seconds <= 0 ) {
                    LOG.error("Failed to cancel task threads!");
                    return;
                }
            }
            // All permits were obtained, so no task is running; hand them back.
            _capacity.release(_maxCapacity);
        } catch ( InterruptedException ex ) {
            // Preserve the interrupt for the caller.
            Thread.currentThread().interrupt();
            return;
        }
    }

    /**
     * Stores update data on a task and, if the task was parked in the waiting
     * state, moves it back to the queued state so it gets executed.
     *
     * @param updateData written to the task's "upd" attribute; must not be null.
     * @param taskId the id of the task to update.
     * @throws IllegalArgumentException if {@code updateData} is null.
     */
    @Override
    public void updateTask(byte[] updateData, long taskId) {
        if ( null == updateData ) {
            throw new IllegalArgumentException("updateData may not be null");
        }

        // Step 1: store the data, but only while the task still has a "mid".
        try {
            _tasks.updateItem(taskId, null)
                .set("upd", AttrType.BIN, updateData)
                .when((expr) -> expr.exists("mid"));
        } catch ( RollbackException ex ) {
            LOG.debug("Attempt to set updateData on taskId="+taskId+" that is in a final state, ignoring");
            return;
        }

        // Step 2: if the task is waiting, flip it to queued. A rollback here
        // just means it was not waiting, so there is nothing more to do.
        final String queuedState = toString(TaskState.QUEUED);
        try {
            _tasks.updateItem(taskId, null)
                .set("mid", AttrType.STR, MONITOR_ID_QUEUED)
                .set("stat", AttrType.STR, queuedState)
                .when((expr) -> expr.beginsWith("mid", MONITOR_ID_WAITING));
        } catch ( RollbackException ex ) {
            return;
        }

        // Step 3: the task left the waiting state, so run it.
        submitRunTask(taskId);
        submit(this::runNextTask);
    }

    /**
     * Releases everything owned by a monitor: first the entries it holds in
     * the locks table, then the tasks whose "mid" equals the monitor id,
     * which are put back into the queued state and submitted to run.
     *
     * @param monitorId the id of the monitor whose locks and tasks are released.
     * @throws InterruptedException declared by this method; not caught here.
     */
    public void releaseLocksForMonitorId(String monitorId) throws InterruptedException {
        LOG.debug("Releasing locks for monitorId="+monitorId);

        // Phase 1: entries in the "locks" table.
        List taskIdsToRun = new ArrayList<>();
        for ( PageIterator page : new PageIterator() ) {
            for ( Lock held : _locksForMonitor.queryItems(monitorId, page).list() ) {
                // Let the next waiter become runnable before dropping the lock:
                unblockWaitingTasks(held.lockId, monitorId, taskIdsToRun, false);
                try {
                    _locks.deleteItem(held.lockId, TASK_ID_NONE,
                                      (expr) -> expr.eq("mid", monitorId));
                } catch ( RollbackException ex ) {
                    LOG.debug("LostLockException: releaseLock="+held.lockId+" for monitorId="+monitorId+" taskId="+
                              held.runningTaskId);
                }
                // The unblocked task ids are deliberately NOT dispatched
                // here, so that phase 2 gets to resume the task that WAS
                // running previously; the collected ids are simply dropped.
                taskIdsToRun.clear();
            }
        }

        // Phase 2: tasks owned by the monitor go back to the queued state.
        boolean anyRequeued = false;
        for ( PageIterator page : new PageIterator() ) {
            for ( Task owned : _tasksForMonitor.queryItems(monitorId, page).list() ) {
                try {
                    _tasks.updateItem(owned.getTaskId(), null)
                        .set("mid", AttrType.STR, MONITOR_ID_QUEUED)
                        .set("stat", AttrType.STR, toString(TaskState.QUEUED))
                        .when((expr) -> expr.eq("mid", monitorId));
                    // Only reached when the conditional update went through:
                    submitRunTask(owned.getTaskId());
                    anyRequeued = true;
                } catch ( RollbackException ex ) {
                    LOG.debug("LostLockException: releaseLocksForMonitorId="+monitorId+" taskId="+
                              owned.getTaskId());
                }
            }
        }
        if ( anyRequeued ) {
            submit(this::runNextTask);
        }
    }

    /**
     * Injection constructor; does no work itself. The indexes and the
     * capacity semaphore are set up in {@code init}.
     */
    @Inject
    protected TaskManagerImpl() {}

    /**
     * Injected initializer: sizes the capacity semaphore from the executor,
     * registers the attribute transforms and builds the task and lock indexes.
     *
     * @param indexFactory factory used to build every index.
     * @param executorService inspected only to derive the capacity.
     */
    @Inject
    protected void init(Index.Factory indexFactory, ScheduledExecutorService executorService) {
        // TODO: Make this tunable with a guice configuration:
        int capacity = 5;
        if ( executorService instanceof ScheduledThreadPoolExecutor ) {
            final int corePoolSize =
                ((ScheduledThreadPoolExecutor)executorService).getCorePoolSize();
            // Keep one thread back, and a second one for pools whose
            // remaining size is still above 10.
            capacity = corePoolSize - 1;
            if ( capacity <= 0 ) {
                capacity = 1;
                LOG.error("Detected ScheduledThreadPoolExecutor with corePoolSize <=1, thread starvation is likely.");
            } else if ( capacity > 10 ) {
                capacity--;
            }
        }
        _maxCapacity = capacity;
        _capacity = new Semaphore(_maxCapacity);

        _om.registerModule(createTransforms(new TransformModule()));

        // Task attributes passed to withNoEncrypt():
        final String[] taskNoEncrypt = {"cnt", "agn", "tic", "upd"};

        // Primary index of the tasks table:
        _tasks = indexFactory.create(Task.class)
            .withTableDescription(TasksTable.getTableDescription())
            .withConvertValue(_om::convertValue)
            .withNoEncrypt(taskNoEncrypt)
            .build();

        // Tasks by monitor id:
        _tasksForMonitor = indexFactory.create(Task.class)
            .withTableDescription(TasksTable.getTableDescription(), "mid-id-index")
            .withConvertValue(_om::convertValue)
            .withNoEncrypt(taskNoEncrypt)
            .build();

        // Tasks by entity:
        _tasksForEntity = indexFactory.create(Task.class)
            .withTableDescription(TasksTable.getTableDescription(), "ety-eid-index")
            .withConvertValue(_om::convertValue)
            .withNoEncrypt(taskNoEncrypt)
            .build();

        // Non-terminal tasks by entity:
        _nonTerminalTasksForEntity = indexFactory.create(Task.class)
            .withTableDescription(TasksTable.getTableDescription(), "ntty-ntid-index")
            .withConvertValue(_om::convertValue)
            .withNoEncrypt(taskNoEncrypt)
            .build();

        // Lock attributes passed to withNoEncrypt():
        final String[] lockNoEncrypt = {"mid", "agn", "rtid"};

        // Primary index of the locks table:
        _locks = indexFactory.create(Lock.class)
            .withNoEncrypt(lockNoEncrypt)
            .withTableDescription(LocksTable.getTableDescription())
            .withConvertValue(_om::convertValue)
            .build();

        // Locks by monitor id:
        _locksForMonitor = indexFactory.create(Lock.class)
            .withNoEncrypt(lockNoEncrypt)
            .withTableDescription(LocksTable.getTableDescription(), "mid-index")
            .withConvertValue(_om::convertValue)
            .build();
    }

    /**
     * Registers the mapping between stored attribute names and the fields of
     * {@code Task} and {@code Lock} on the given module.
     *
     * NOTE(review): generic type arguments appear to have been stripped from
     * this listing (see the two "new TypeReference>(){}" expressions below);
     * restore them from the original source.
     *
     * @param module the module to register the transforms on.
     * @return the same module, for chaining.
     */
    private TransformModule createTransforms(TransformModule module) {
        module.createTransform(Task.class)
            .put("id", Long.class, "taskId")
            .put("ety", String.class, "entityType")
            // "eid" is the entity id with the task id appended (see toEid/fromEid):
            .put("eid", String.class, TaskManagerImpl::toEid, TaskManagerImpl::fromEid)
            // "ntty"/"ntid" are write-only and null for terminal tasks (see toNtty/toNtid):
            .put("ntty", String.class, TaskManagerImpl::toNtty)
            .put("ntid", String.class, TaskManagerImpl::toNtid) // non-terminal state?
            // "stat" holds the one-letter state code (see toString(TaskState)/toTaskState):
            .put("stat", String.class, TaskManagerImpl::toState, TaskManagerImpl::fromState)
            .put("lids", new TypeReference>(){}, "lockIds")
            .put("preq", new TypeReference>(){}, "prerequisiteTaskIds")
            .put("any", Boolean.class, "anyPrerequisiteTaskId")
            .put("mid", String.class, "monitorId")
            .put("upd", byte[].class, "updateData")
            .put("st8", byte[].class, "checkpointData")
            .put("err", String.class, "errorMessage")
            .put("errT", String.class, "errorMessageStackTrace")
            .put("errId", String.class, "errorId")
            .put("ts", Long.class, "startTime")
            .put("tf", Long.class, "endTime")
            .put("cnt", Long.class, "runCount")
            .put("agn", Long.class, "requeues")
            .put("tic", Long.class, "millisecondsRemaining")
            .put("cancel", String.class, "canceledBy");
        module.createTransform(Lock.class)
            .put("lid", String.class, "lockId")
            .put("tid", String.class, "taskId")
            .put("rtid", Long.class, "runningTaskId")
            .put("mid", String.class, "monitorId")
            .put("agn", Long.class, "tasksQueued");
        return module;
    }

    /**
     * @param task the task whose state is encoded.
     * @return the one-letter code for the task's state, or null if it has none.
     */
    private static String toState(Task task) {
        final TaskState current = task.getTaskState();
        return toString(current);
    }

    /**
     * Decodes a one-letter state code and stores the result on the task.
     *
     * @param task the task to update.
     * @param state the stored code; null or unknown codes yield a null state.
     */
    private static void fromState(Task task, String state) {
        final TaskState decoded = toTaskState(state);
        task.taskState = decoded;
    }

    /**
     * Notifies every registered terminal-state callback about a task.
     *
     * The callbacks are copied while holding the set's lock and then invoked
     * without it. Iterating the live set while calling out meant that a
     * callback which unregistered itself (or registered another) modified the
     * set mid-iteration and triggered a ConcurrentModificationException, and
     * that arbitrary callback code ran while the lock was held. A callback
     * removed concurrently may therefore still receive one last notification.
     *
     * A failing callback is logged and does not prevent the remaining
     * callbacks from running.
     *
     * @param terminalTask the task passed to each callback.
     */
    private void onTerminalState(TaskInfo terminalTask) {
        final List<Consumer<TaskInfo>> listeners;
        synchronized ( _onTerminalState ) {
            // Manual synchronization is required to iterate/copy a
            // Collections.synchronizedSet safely.
            listeners = new ArrayList<>(_onTerminalState);
        }
        for ( Consumer<TaskInfo> consumer : listeners ) {
            try {
                consumer.accept(terminalTask);
            } catch ( Throwable ex ) {
                LOG.error(
                    "OnTerminalState["+consumer+"]("+terminalTask+") FAILED: "+ex.getMessage(),
                    ex);
            }
        }
    }

    /**
     * Encodes a task state as the one-letter code stored in the "stat" attribute.
     *
     * @param taskState the state to encode; may be null.
     * @return the code, or null when {@code taskState} is null.
     * @throws UnsupportedOperationException for a state without a code.
     */
    private static String toString(TaskState taskState) {
        if ( null == taskState ) {
            return null;
        }
        final String code;
        switch ( taskState ) {
        case QUEUED:                   code = "Q"; break;
        case RUNNING:                  code = "R"; break;
        case WAITING_FOR_INTERVAL:     code = "T"; break;
        case WAITING_FOR_PREREQUISITE: code = "N"; break;
        case WAITING_FOR_LOCK:         code = "L"; break;
        case FAILED:                   code = "F"; break;
        case SUCCESS:                  code = "S"; break;
        case CANCELED:                 code = "C"; break;
        default:
            throw new UnsupportedOperationException(
                "taskState="+taskState+" is not supported in TaskManagerImpl");
        }
        return code;
    }

    /**
     * Decodes the one-letter code stored in the "stat" attribute.
     *
     * @param state the stored code; may be null.
     * @return the matching state, or null when {@code state} is null or not
     *     recognized (an unrecognized code is logged at info level).
     */
    private static TaskState toTaskState(String state) {
        if ( null == state ) {
            return null;
        }
        final TaskState decoded;
        switch ( state ) {
        case "Q": decoded = TaskState.QUEUED; break;
        case "R": decoded = TaskState.RUNNING; break;
        case "T": decoded = TaskState.WAITING_FOR_INTERVAL; break;
        case "N": decoded = TaskState.WAITING_FOR_PREREQUISITE; break;
        case "L": decoded = TaskState.WAITING_FOR_LOCK; break;
        case "F": decoded = TaskState.FAILED; break;
        case "S": decoded = TaskState.SUCCESS; break;
        case "C": decoded = TaskState.CANCELED; break;
        default:
            LOG.info("Unknown TaskState="+state);
            decoded = null;
        }
        return decoded;
    }

    /**
     * Builds the "eid" attribute: the entity id, an "@" separator, and the
     * task id as a sort key. The task id suffix forces the sort order.
     *
     * @param task the task to encode.
     * @return the encoded value, or null when the task has no entity id.
     */
    private static String toEid(Task task) {
        return ( null == task.entityId )
            ? null
            : task.entityId + "@" + longToSortKey(task.taskId);
    }

    /**
     * Recovers the entity id from an "eid" attribute by dropping the trailing
     * "@" separator and sort-key suffix appended by {@code toEid}.
     *
     * Values too short to contain both the separator and the suffix are
     * ignored and leave the task untouched. The guard used to accept a value
     * of exactly LONG_SORT_KEY_LENGTH characters, which then reached
     * {@code substring(0, -1)} and threw StringIndexOutOfBoundsException.
     *
     * NOTE(review): this assumes longToSortKey() always yields exactly
     * LONG_SORT_KEY_LENGTH characters, as the original code did -- confirm.
     *
     * @param task the task to update.
     * @param eid the stored value; may be null.
     */
    private static void fromEid(Task task, String eid) {
        // Need at least the separator plus the sort key.
        if ( null == eid || eid.length() <= LONG_SORT_KEY_LENGTH ) return;
        task.entityId = eid.substring(0, eid.length()-LONG_SORT_KEY_LENGTH-1);
    }

    /**
     * Builds the "ntty" attribute, which is only populated for tasks that are
     * not in a terminal state.
     *
     * @param task the task to encode.
     * @return the entity type, or null when the task's state is terminal.
     */
    private static String toNtty(Task task) {
        final boolean terminal =
            null != task.taskState && task.taskState.isTerminal();
        return terminal ? null : task.entityType;
    }

    /**
     * Builds the "ntid" attribute, which is only populated for tasks that are
     * not in a terminal state.
     *
     * @param task the task to encode.
     * @return the same value as {@code toEid(task)}, or null when the task's
     *     state is terminal.
     */
    private static String toNtid(Task task) {
        final boolean terminal =
            null != task.taskState && task.taskState.isTerminal();
        return terminal ? null : toEid(task);
    }

    /**
     * @return {@code System.nanoTime()} expressed in whole milliseconds. Like
     *     nanoTime itself, the value is only meaningful when compared with
     *     another reading from this method.
     */
    private static long milliTime() {
        return TimeUnit.NANOSECONDS.toMillis(System.nanoTime());
    }

    /**
     * Takes one capacity permit, applies the start-rate limit, pops task ids
     * off the local queue until one is found whose stored "mid" is still the
     * queued marker, and runs it. The permit is always released, and another
     * run is submitted whenever the queue is still non-empty afterwards.
     *
     * NOTE(review): the finally block also executes when the rate limit
     * defers the run (the early return inside the synchronized block), so a
     * non-empty queue causes an immediate resubmit instead of waiting for the
     * scheduled retry -- confirm this is intended.
     */
    private void runNextTask() {
        // We are at capacity, don't actually run a task:
        if ( ! _capacity.tryAcquire(1) ) {
            // This is pretty common place, so do not log it:
            //LOG.warn("Insufficient capacity to run the next task, try increasing thread pool size");
            return;
        }
        Long taskId = null;
        try {
            // Spread-out running tasks to avoid slamming the DB with writes:
            synchronized ( this ) {
                long now = milliTime();
                // Milliseconds until the minimum spacing since the last start has elapsed:
                long remaining = _lastRunTask + POLL_INTERVAL_MS/MAX_TASKS_IN_INTERVAL - now;
                if ( remaining > 0 ) {
                    // Delay the next task run (at most one delayed run is scheduled at a time):
                    if ( null == _runNextTaskFuture ) {
                        _runNextTaskFuture = schedule(this::runNextTask, remaining);
                    }
                    return;
                } else {
                    _runNextTaskFuture = null;
                    _lastRunTask = now;
                }
            }

            // Read tasks from the queue until we find one that is in a queued state and
            // therefore is likely to acquire the lock:
            while ( true ) {
                synchronized ( _taskQueue ) {
                    if ( _taskQueue.isEmpty() ) return;
                    // Pop the first id in iteration order:
                    taskId = _taskQueue.iterator().next();
                    _taskQueue.remove(taskId);
                }

                // Re-read the task; skip ids that no longer exist or are no longer queued:
                Task task = _tasks.getItem(taskId);
                if ( null != task && MONITOR_ID_QUEUED.equals(task.getMonitorId()) ) {
                    break;
                }
            }
            LOG.debug("runTask("+taskId+")");

            runTask(taskId);
        } catch ( Throwable ex ) {
            LOG.error("runTask("+taskId+"): "+ex.getMessage(), ex);
        } finally {
            _capacity.release();

            synchronized ( _taskQueue ) {
                if ( _taskQueue.isEmpty() ) return;
            }

            // It doesn't hurt to run this too much, but it does hurt to not
            // run it enough:
            submit(this::runNextTask);
        }
    }

    /**
     * Submits work to the executor and tracks the resulting future in
     * {@code _spawnedFutures} until the work finishes. Does nothing when the
     * queue monitor is not running or there is no executor.
     *
     * NOTE(review): generic type arguments appear to have been stripped from
     * this listing (see "AtomicReference>" below); restore them from the
     * original source.
     *
     * @param run the work to execute; anything it throws is logged.
     */
    private synchronized void submit(Runnable run) {
        if ( null == _monitorTasks ) return;
        if ( null == _executor ) return;
        // Holder that lets the wrapped runnable refer to its own future:
        AtomicReference> futureRef = new AtomicReference<>();
        futureRef.set(_executor.submit(() -> {
                    try {
                        run.run();
                    } catch ( Throwable ex ) {
                        LOG.error(ex.getMessage(), ex);
                    } finally {
                        // This block needs the lock held by the enclosing
                        // synchronized method, so it cannot run before the
                        // add() below has completed.
                        synchronized ( this ) {
                            _spawnedFutures.remove(futureRef.get());
                        }
                    }
                }));
        _spawnedFutures.add(futureRef.get());
    }

    /**
     * Schedules work on the executor after a delay and tracks the resulting
     * future in {@code _spawnedFutures} until the work finishes.
     *
     * NOTE(review): generic type arguments appear to have been stripped from
     * this listing (see "AtomicReference>" below); restore them from the
     * original source.
     *
     * @param run the work to execute; anything it throws is logged.
     * @param intervalMS the delay in milliseconds.
     * @return the scheduled future, or null when the queue monitor is not
     *     running or there is no executor.
     */
    private synchronized Future schedule(Runnable run, long intervalMS) {
        if ( null == _monitorTasks ) return null;
        if ( null == _executor ) return null;
        // Holder that lets the wrapped runnable refer to its own future:
        AtomicReference> futureRef = new AtomicReference<>();
        futureRef.set(_executor.schedule(() -> {
                    try {
                        run.run();
                    } catch ( Throwable ex ) {
                        LOG.error(ex.getMessage(), ex);
                    } finally {
                        // This block needs the lock held by the enclosing
                        // synchronized method, so it cannot run before the
                        // add() below has completed.
                        synchronized ( this ) {
                            _spawnedFutures.remove(futureRef.get());
                        }
                    }
                }, intervalMS, TimeUnit.MILLISECONDS));
        _spawnedFutures.add(futureRef.get());
        return futureRef.get();
    }

    /**
     * Adds a task id to the local run queue. The queue is a set, so an id
     * that is already queued is not added twice. This only enqueues; callers
     * separately submit {@code runNextTask} to drain the queue.
     *
     * @param taskId the id of the task to enqueue.
     */
    private void submitRunTask(long taskId) {
        synchronized ( _taskQueue ) {
            _taskQueue.add(taskId);
        }
    }

    /**
     * Schedules a call to {@code updateDelayedTask} for the task, made from
     * within {@code _monitor.monitor(...)}, after the given delay.
     *
     * @param taskId the id of the delayed task.
     * @param intervalMS the delay in milliseconds.
     */
    private synchronized void scheduleDelayedTask(long taskId, long intervalMS) {
        final Runnable tick = () ->
            _monitor.monitor((mon) -> updateDelayedTask(taskId, mon));
        schedule(tick, intervalMS);
    }

    /**
     * Starts tracking a delayed task unless it is already tracked, and
     * schedules its first check after the remaining delay, capped at
     * POLL_INTERVAL_MS and reduced by the time already spent in this method.
     *
     * @param task the delayed task; its remaining milliseconds must be set.
     */
    private void monitorDelayedTask(Task task) {
        final long taskId = task.getTaskId();
        final DelayedTask pending = new DelayedTask(task.getMillisecondsRemaining());
        synchronized ( pending ) {
            final boolean alreadyMonitored =
                null != _delayedTasks.putIfAbsent(taskId, pending);
            if ( alreadyMonitored ) {
                LOG.debug("Already monitoring delayed taskId="+taskId);
                return;
            }
            final long cappedDelay = Math.min(POLL_INTERVAL_MS, pending.millisRemaining);
            final long elapsed = milliTime() - pending.millisTimeBegin;
            scheduleDelayedTask(taskId, cappedDelay - elapsed);
            LOG.debug("Monitoring delayed taskId="+taskId);
        }
    }

    /**
     * Body of the periodic poll: enqueues every stored task marked as queued
     * that passes the task filter, kicks off a run if anything was enqueued,
     * and runs the cleanup once every CLEANUP_INTERVALS polls. Any failure is
     * logged rather than propagated.
     */
    private void startRunnableTasks() {
        try {
            boolean anyEnqueued = false;
            for ( PageIterator page : new PageIterator().pageSize(100) ) {
                for ( Task candidate : _tasksForMonitor.queryItems(MONITOR_ID_QUEUED, page).list() ) {
                    if ( _taskMatches.test(candidate) ) {
                        submitRunTask(candidate.getTaskId());
                        anyEnqueued = true;
                    }
                }
            }
            if ( anyEnqueued ) {
                submit(this::runNextTask);
            }

            final int polls = _pollCount.incrementAndGet();
            if ( 1 == polls % CLEANUP_INTERVALS ) {
                doCleanup();
            }

            // TODO: Occasionally find all MONITOR_ID_WAITING tasks to check if
            // cleanup needs to happen?

            // TODO: Poll for canceled tasks that we are running and interrupt them...
        } catch ( Throwable ex ) {
            LOG.error(ex.getMessage(), ex);
        }
    }

    /**
     * Runs the cleanup pass by handing {@code doCleanup(MonitorInfo)} to
     * {@code _monitor.monitor(...)}.
     */
    private void doCleanup() {
        _monitor.monitor(this::doCleanup);
    }
    /**
     * Repairs lock and task records that regular processing left behind.
     *
     * <p>Pass 1 scans {@code _locksForMonitor} for lock-holder entries
     * (those whose {@code taskId} equals {@code TASK_ID_NONE}) whose running
     * task is in a terminal state and conditionally deletes them. Pass 2
     * queries every task whose monitor id is {@code MONITOR_ID_WAITING} and
     * re-enqueues the ones whose prerequisites are satisfied and whose locks
     * are all free.
     *
     * @param monitorInfo the monitor this pass runs under; not read by this method.
     */
    private void doCleanup(MonitorInfo monitorInfo) {
        // Delete all lid=... tid=TASK_ID_NONE, rtid=terminalStateTaskId,
        // mid=... entries (this "force" breaks locks):

        // lockId -> TRUE when the lock is still considered held, FALSE when free.
        Map<String, Boolean> lockedIds = new HashMap<>();
        // taskId -> cached result of isTerminalTaskId(), shared by both passes.
        Map<Long, Boolean> taskInTerminalState = new HashMap<>();
        for ( PageIterator it : new PageIterator().pageSize(100) ) {
            for ( Lock lock :  _locksForMonitor.scanItems(it) ) {
                if ( null == lock.runningTaskId ) continue;
                if ( ! TASK_ID_NONE.equals(lock.taskId) ) continue;
                if ( Boolean.FALSE.equals(
                         taskInTerminalState.computeIfAbsent(
                             lock.runningTaskId, this::isTerminalTaskId)) )
                {
                    // The owning task has not finished: the lock is legitimately held.
                    lockedIds.put(lock.lockId, Boolean.TRUE);
                    continue;
                }
                // Simply delete the lock. Note that this will cause
                // waiting tasks to never run, however our next cleanup
                // task is to find tasks that are waiting and
                // clean them up, so we should be "okay".
                try {
                    // Conditional delete: only succeeds if the entry is unchanged
                    // since it was scanned (same mid, rtid and agn).
                    _locks.deleteItem(
                        lock.lockId,
                        lock.taskId,
                        (expr) -> expr.and(
                            expr.eq("mid", lock.monitorId),
                            expr.and(
                                expr.eq("rtid", lock.runningTaskId),
                                expr.eq("agn", lock.tasksQueued))));
                    lockedIds.put(lock.lockId, Boolean.FALSE);
                    LOG.error("Found lockId="+lock.lockId+" was NOT removed, even though taskId="+
                              lock.runningTaskId+" is in a terminal state!");
                } catch ( RollbackException ex ) {
                    // Someone else modified the entry; treat the lock as held.
                    lockedIds.put(lock.lockId, Boolean.TRUE);
                    LOG.debug("Found lockId="+lock.lockId+" was already updated");
                }
            }
        }

        // Find tasks with mid=MONITOR_ID_WAITING and see if they should be MONITOR_ID_QUEUED.
        for ( PageIterator it : new PageIterator().pageSize(100) ) {
            for ( Task task : _tasksForMonitor.queryItems(MONITOR_ID_WAITING, it).list(
                      Arrays.asList(
                          "id", "any", "preq", "lids", "stat")) )
            {
                boolean anyPrereq = task.isAnyPrerequisiteTaskId();
                // "any" mode starts blocked until one prerequisite is terminal;
                // "all" mode starts unblocked until one prerequisite is not.
                boolean unblock = ! anyPrereq;
                for ( Long prereqTaskId : task.getPrerequisiteTaskIds() ) {
                    // Same guard as checkPrerequisites(): ignore null ids
                    // rather than failing on unboxing.
                    if ( null == prereqTaskId ) continue;
                    Boolean isTerminal = taskInTerminalState.computeIfAbsent(
                        prereqTaskId, this::isTerminalTaskId);
                    if ( anyPrereq ) {
                        if ( Boolean.TRUE.equals(isTerminal) ) {
                            unblock = true;
                            break;
                        }
                    } else if ( Boolean.FALSE.equals(isTerminal) ) {
                        unblock = false;
                        break;
                    }
                }
                // Still waiting for prerequisites:
                if ( ! unblock ) continue;
                // Check locks are free:
                for ( String lockId : task.getLockIds() ) {
                    if ( Boolean.TRUE.equals(
                             lockedIds.computeIfAbsent(lockId, this::isLocked)) )
                    {
                        unblock = false;
                        break;
                    }
                }
                // Still waiting to obtain lock:
                if ( ! unblock ) continue;
                // set mid="#"!
                // NOTE(review): "#" and "$" are presumably the literal values of
                // MONITOR_ID_QUEUED and MONITOR_ID_WAITING - confirm against
                // the constant declarations.
                try {
                    _tasks.updateItem(task.getTaskId(), null)
                        .set("mid", "#")
                        .when((expr) -> expr.eq("mid", "$"));
                    LOG.error(
                        "Found taskId="+task.getTaskId()+
                        " was NOT enqueued, even though all prerequisite tasks ("+
                        task.getPrerequisiteTaskIds()+") are satisfied and all locks ("+
                        task.getLockIds()+") are available!");
                } catch ( RollbackException ex ) {
                    LOG.debug("Found taskId="+task.getTaskId()+" was already enqueued");
                }
            }
        }
    }

    /**
     * Reads only the {@code stat} attribute of the given task and reports
     * whether the resulting state is terminal. A task that cannot be found is
     * mapped to {@code TaskState.FAILED} by {@code getTaskState}.
     *
     * @param taskId id of the task to look up.
     * @return the value of {@code isTerminal()} for the resolved state.
     */
    private boolean isTerminalTaskId(long taskId) {
        Task statOnly = _tasks.getItem(taskId, null, Arrays.asList("stat"), Task.class);
        TaskState resolved = getTaskState(statOnly);
        return resolved.isTerminal();
    }

    /**
     * Reports whether a lock-holder entry (sort key {@code TASK_ID_NONE})
     * exists for the given lock id.
     *
     * @param lockId the lock to probe.
     * @return {@code true} if such an entry was found, {@code false} otherwise.
     */
    private Boolean isLocked(String lockId) {
        Lock holder = _locks.getItem(lockId, TASK_ID_NONE, Arrays.asList("lid"), Lock.class);
        if ( null == holder ) {
            return false;
        }
        return true;
    }

    /**
     * Persists the remaining delay of a task tracked in {@code _delayedTasks}
     * and either schedules the next poll or hands the task over to be run.
     *
     * @param taskId id of the delayed task; nothing happens if it is not tracked.
     * @param monitorInfo the monitor holding the task, or {@code null} to stop
     *     monitoring the delay and mark the task as queued right away.
     */
    private void updateDelayedTask(final long taskId, MonitorInfo monitorInfo) {
        DelayedTask delayedTask = _delayedTasks.get(taskId);
        if ( null == delayedTask ) return;
        synchronized ( delayedTask ) {
            UpdateItemBuilder update = _tasks.updateItem(taskId, null);
            long now = milliTime();
            // Subtract the time elapsed since the last bookkeeping point.
            long newRemaining = delayedTask.millisRemaining - (now - delayedTask.millisTimeBegin);
            if ( newRemaining <= 0 || null == monitorInfo ) {
                // Delay is over (or no monitor): clear the countdown and queue the task.
                update.remove("tic")
                    .set("mid", AttrType.STR, MONITOR_ID_QUEUED)
                    .set("stat", AttrType.STR, toString(TaskState.QUEUED));
            } else {
                // Still counting down: store the new remainder and keep the monitor's claim.
                update.set("tic", AttrType.NUM, newRemaining)
                    .set("mid", AttrType.STR, monitorInfo.getMonitorId());
            }
            try {
                // Only apply if the stored "tic" still matches what we last saw.
                update.when((expr) -> expr.eq("tic", delayedTask.millisRemaining));
            } catch ( RollbackException ex ) {
                LOG.debug("Failed to update taskId="+taskId+" due to tic != "+delayedTask.millisRemaining);
                // Reuse the "no monitor" path below to stop monitoring this task.
                monitorInfo = null;
            }
            if ( newRemaining <= 0 || null == monitorInfo ) { // stop monitoring:
                _delayedTasks.remove(taskId, delayedTask);
                submitRunTask(taskId);
                submit(this::runNextTask);
                return;
            }
            delayedTask.millisRemaining = newRemaining;
            delayedTask.millisTimeBegin = now;
            // Next poll: at most POLL_INTERVAL_MS away, less the time spent in this call.
            long interval = Math.min(POLL_INTERVAL_MS, delayedTask.millisRemaining)
                - (milliTime() - now);
            scheduleDelayedTask(taskId, interval);
        }
    }

    /**
     * Runs {@code lockAndRunTask} for the given task under the monitor.
     * A {@code ShuttingDownException} clears {@code _executor}; every other
     * throwable is logged and swallowed.
     *
     * @param taskId id of the task to attempt.
     */
    private void runTask(long taskId) {
        try {
            _monitor.monitor((info) -> lockAndRunTask(taskId, info));
        } catch ( ShuttingDownException shutdownEx ) {
            synchronized ( this ) {
                _executor = null;
                LOG.error(shutdownEx.getMessage(), shutdownEx);
            }
        } catch ( Throwable failure ) {
            String message = "runTask("+taskId+") FAILED: "+failure.getMessage();
            LOG.error(message, failure);
        }
    }

    /**
     * Builds the id of the per-task lock: the prefix {@code "_TASK:"}
     * followed by the sort-key form of the task id.
     *
     * @param taskId the task the lock belongs to.
     * @return the lock id string.
     */
    private String getLockForTaskId(long taskId) {
        String sortKey = longToSortKey(taskId);
        return "_TASK:" + sortKey;
    }

    /**
     * Null-safe accessor for a task's state.
     *
     * @param task the task record, possibly {@code null}.
     * @return the task's state, or {@code TaskState.FAILED} when the task or
     *     its state is {@code null} (the latter case is logged as an error).
     */
    private TaskState getTaskState(Task task) {
        TaskState state = ( null == task ) ? null : task.getTaskState();
        if ( null != state ) {
            return state;
        }
        if ( null != task ) {
            LOG.error("Unexpected taskState=null for taskId="+task.getTaskId());
        }
        return TaskState.FAILED;
    }

    /**
     * Checks whether the prerequisites of {@code task} allow it to run, and
     * registers the task as a waiter on each prerequisite that is not yet
     * terminal.
     *
     * <p>When {@code task.isAnyPrerequisiteTaskId()} is true a single terminal
     * prerequisite is enough; otherwise the first non-terminal prerequisite
     * blocks the task. When the task has to wait, {@code task.taskState} is set
     * to {@code WAITING_FOR_PREREQUISITE}.
     *
     * @param task the task being evaluated; its {@code taskState} may be modified.
     * @return {@code true} if the task may proceed, {@code false} if it must wait.
     */
    private boolean checkPrerequisites(Task task) {
        boolean anyPrerequisiteComplete = false;
        // Only tracked in "any" mode; null doubles as the "all" mode marker.
        List<Long> incompletePrerequisites = ( task.isAnyPrerequisiteTaskId() ) ?
            new ArrayList<>() : null;
        // First check the prerequisites:
        for ( Long prerequisiteId : task.getPrerequisiteTaskIds() ) {
            if ( null == prerequisiteId ) continue;
            String lockId = getLockForTaskId(prerequisiteId);
            String taskIdSortKey = longToSortKey(task.getTaskId());
            if ( anyPrerequisiteComplete ) {
                // Avoid cruft buildup:
                _locks.deleteItem(lockId, taskIdSortKey);
                continue;
            }
            TaskState state = getTaskState(_tasks.getItem(prerequisiteId));
            if ( state.isTerminal() ) {
                // Task completed, avoid leaving cruft:
                _locks.deleteItem(lockId, taskIdSortKey);
                if ( null != incompletePrerequisites ) {
                    anyPrerequisiteComplete = true;
                }
                continue;
            }
            // Enqueue:
            _locks.putItem(new Lock(lockId, taskIdSortKey));
            LOG.debug("enqueue prerequisite="+prerequisiteId+" for taskId="+task.getTaskId());
            try {
                // Force unblockWaitingTasks() to see our entry:
                _locks.updateItem(lockId, TASK_ID_NONE)
                    .increment("agn", 1)
                    .when((expr) -> expr.exists("mid"));
            } catch ( RollbackException ex ) {
                LOG.debug("Unable to increment agn field of lockId="+lockId+", checking if task is now terminal");
                // TODO: Make this a consistent read!
                state = getTaskState(_tasks.getItem(prerequisiteId));
                if ( state.isTerminal() ) {
                    // Task completed (and removed the lockId, TASK_ID_NONE field):
                    _locks.deleteItem(lockId, taskIdSortKey);
                    if ( null != incompletePrerequisites ) {
                        anyPrerequisiteComplete = true;
                    }
                    continue;
                }
                // Task isn't running, we have already enqueued, so we are good.
            }
            if ( null != incompletePrerequisites ) {
                // "any" mode: remember it and keep looking for a finished one.
                incompletePrerequisites.add(prerequisiteId);
            } else {
                // "all" mode: a single unfinished prerequisite blocks the task.
                LOG.debug("Waiting on prerequisiteTaskId="+prerequisiteId+" for taskId="+task.getTaskId());
                task.taskState = TaskState.WAITING_FOR_PREREQUISITE;
                return false;
            }
        }
        if ( null != incompletePrerequisites ) {
            if ( ! anyPrerequisiteComplete ) {
                LOG.debug("Waiting on one of the prerequisiteTaskIds="+incompletePrerequisites+
                          " for taskId="+task.getTaskId());
                task.taskState = TaskState.WAITING_FOR_PREREQUISITE;
                return false;
            }
            // Avoid cruft buildup:
            for ( Long prerequisiteId : incompletePrerequisites ) {
                String lockId = getLockForTaskId(prerequisiteId);
                String taskIdSortKey = longToSortKey(task.getTaskId());
                _locks.deleteItem(lockId, taskIdSortKey);
            }
        }
        return true;
    }

    /**
     * Tries to acquire every lock the task needs: its declared lock ids plus
     * its own per-task lock, taken in sorted order.
     *
     * <p>Each acquired lock id is appended to {@code locksAcquired}. If a lock
     * is held by another task, this task is enqueued as a waiter on that lock,
     * {@code task.taskState} is set to {@code WAITING_FOR_LOCK} and the method
     * returns {@code false}; locks already acquired stay in
     * {@code locksAcquired} for the caller to release.
     *
     * @param task the task that needs the locks; its {@code taskState} may be modified.
     * @param locksAcquired output list receiving the ids of locks obtained.
     * @param monitorId id of the monitor recorded as the lock holder.
     * @return {@code true} if all locks were acquired, {@code false} if the task must wait.
     * @throws InterruptedException if interrupted during the randomized retry sleep.
     */
    private boolean acquireLocks(Task task, List<String> locksAcquired, String monitorId)
        throws InterruptedException
    {
        List<String> lockIds = new ArrayList<>(task.getLockIds());
        lockIds.add(getLockForTaskId(task.getTaskId()));
        Collections.sort(lockIds);
      NEXT_LOCK:
        for ( String lockId : lockIds ) {
          RETRY:
            for ( int retry=0;; retry++ ) {
                // If we retry, do a random sleep:
                if ( retry > 0 ) Thread.sleep(ThreadLocalRandom.current().nextLong(500));

                // Try to acquire the lock:
                try {
                    // Succeeds if we already hold it (rtid matches) or nobody does (no mid).
                    _locks.updateItem(lockId, TASK_ID_NONE)
                        .set("mid", AttrType.STR, monitorId)
                        .set("rtid", AttrType.NUM, task.getTaskId())
                        .increment("agn", 1)
                        .returnAllNew()
                        .when((expr) -> expr.or(
                                  expr.eq("rtid", task.getTaskId()),
                                  expr.not(expr.exists("mid"))));
                    locksAcquired.add(lockId);
                } catch ( RollbackException ex1 ) {
                    LOG.debug("Unable to acquire lockId="+lockId+" for taskId="+task.getTaskId());

                    // Enqueue:
                    String taskIdStr = longToSortKey(task.getTaskId());
                    _locks.putItem(new Lock(lockId, taskIdStr));

                    // Force unblockWaitingTasks() to see our entry:
                    try {
                        _locks.updateItem(lockId, TASK_ID_NONE)
                            .increment("agn", 1)
                            .when((expr) -> expr.exists("mid"));
                    } catch ( RollbackException ex2 ) {
                        // The holder entry no longer has a mid: try to take the lock again.
                        LOG.debug("Unable to increment agn field of lockId="+lockId+", retrying");
                        continue RETRY;
                    }

                    task.taskState = TaskState.WAITING_FOR_LOCK;
                    return false;
                }
                continue NEXT_LOCK;
            }
        }
        return true;
    }

    /**
     * {@link TaskContext} handed to a {@link TaskFunction}: exposes the task
     * and monitor it runs under and lets the function persist checkpoint data.
     */
    private class TaskContextImpl implements TaskContext {
        // Both references are fixed at construction time.
        private final Task _task;
        private final MonitorInfo _monitorInfo;
        private TaskContextImpl(Task task, MonitorInfo monitorInfo) {
            _task = task;
            _monitorInfo = monitorInfo;
        }
        @Override
        public TaskInfo getTaskInfo() {
            return _task;
        }
        @Override
        public MonitorInfo getMonitorInfo() {
            return _monitorInfo;
        }
        @Override
        public byte[] getUpdateData() {
            return (null == _task) ? null : _task.updateData;
        }
        /**
         * Stores {@code checkpointData} in the task's {@code st8} attribute,
         * provided the task's {@code mid} still equals this monitor's id.
         *
         * @throws LostLockException if that condition no longer holds.
         */
        @Override
        public void commitCheckpointData(byte[] checkpointData) {
            try {
                _tasks.updateItem(_task.getTaskId(), null)
                    .set("st8", AttrType.BIN, checkpointData)
                    .when((expr) -> expr.eq("mid", _monitorInfo.getMonitorId()));
            } catch ( RollbackException ex ) {
                throw new LostLockException("taskId="+_task.getTaskId());
            }
        }
    }

    /**
     * Derives the thread name used while a task runs: {@code "TASK:0x"}
     * followed by the task id formatted in hexadecimal.
     *
     * @param task the task whose id is embedded in the name.
     * @return the formatted thread name.
     */
    private String getThreadName(TaskInfo task) {
        final String pattern = "TASK:0x%x";
        return String.format(pattern, task.getTaskId());
    }

    /**
     * Claims a queued task for this monitor and drives one execution attempt.
     *
     * <p>The claim is a conditional update that only succeeds while the task's
     * {@code mid} equals {@code MONITOR_ID_QUEUED}. Once claimed, the task is
     * checked for cancellation, a pending delay, prerequisites and locks, then
     * the {@link TaskFunction} registered for its entity type is run. The
     * outcome is persisted through {@code updateTaskState} in the
     * {@code finally} block unless the monitor's heartbeat has failed.
     *
     * @param taskId id of the task to claim and run.
     * @param monitorInfo the monitor under which the task is claimed.
     */
    private void lockAndRunTask(long taskId, MonitorInfo monitorInfo) {
        // Lock the task:
        Task originalTask;
        List<String> locksAcquired = new ArrayList<>();
        String threadName = null;
        try {
            originalTask = _tasks.updateItem(taskId, null)
                .set("mid", AttrType.STR, monitorInfo.getMonitorId())
                .set("stat", AttrType.STR, toString(TaskState.RUNNING))
                .set("ts", AttrType.NUM, System.currentTimeMillis())
                .increment("cnt", 1)
                .returnAllNew()
                .when((expr) -> expr.eq("mid", MONITOR_ID_QUEUED));
        } catch ( RollbackException ex ) {
            // Someone else already locked this task:
            LOG.debug("Something else is running taskId="+taskId);
            return;
        }
        boolean submitQueuedTask = true;
        boolean interrupted = false;
        // finalTask carries the state to persist; QUEUED is the default outcome.
        Task finalTask = new Task(originalTask);
        finalTask.taskState = TaskState.QUEUED;
        try {
            // Rename the thread for the duration of the run; restored in finally.
            threadName = Thread.currentThread().getName();
            Thread.currentThread().setName(getThreadName(finalTask));
            if ( null != finalTask.getCanceledBy() ) {
                finalTask.taskState = TaskState.CANCELED;
                return;
            }
            // Delay, prerequisite and lock handling is skipped when update data is present.
            if ( null == originalTask.updateData ) {
                if ( null != finalTask.getMillisecondsRemaining() ) {
                    monitorDelayedTask(finalTask);
                    // NOTE: we MUST keep the mid locked in this scenario!
                    finalTask.taskState = TaskState.WAITING_FOR_INTERVAL;
                    return;
                }

                if ( ! checkPrerequisites(finalTask) ) return;
                if ( ! acquireLocks(finalTask, locksAcquired, monitorInfo.getMonitorId()) ) {
                    return;
                }
            }
            TaskFunction taskFunction = _taskFunctions.get(finalTask.getEntityType());
            if ( null == taskFunction ) {
                LOG.info("Unsupported entityType="+finalTask.getEntityType()+" taskId="+taskId);
                // Wait for a time period:
                // (SUCCESS with a remaining delay is persisted as a re-queue
                // with "tic" set by buildUpdateTaskState.)
                finalTask.millisecondsRemaining = 60000L;
                finalTask.taskState = TaskState.SUCCESS;
                submitQueuedTask = false;
                return;
            }

            Throwable err = null;
            try {
                LOG.debug("Running taskId="+taskId+" entityType="+finalTask.getEntityType()+" entityId="+finalTask.getEntityId());
                TaskInfo taskInfo = taskFunction.run(new TaskContextImpl(originalTask, monitorInfo));
                if ( null != taskInfo ) {
                    finalTask = new Task(taskInfo);
                }
                finalTask.taskState = TaskState.SUCCESS;
            } catch ( Throwable ex ) {
                if ( ! Thread.interrupted() && ! isInterruptedException(ex) ) {
                    finalTask.errorId = CompactUUID.randomUUID().toString();
                    LOG.debug("Failed taskId="+taskId+" errorId="+finalTask.errorId+": "+ex.getMessage(), ex);

                    StringWriter stackTrace = new StringWriter();
                    stackTrace.append("on nodeName="+monitorInfo.getNodeName()+" ");
                    ex.printStackTrace(new PrintWriter(stackTrace));
                    finalTask.errorMessage = ex.getMessage();
                    finalTask.errorMessageStackTrace = stackTrace.toString();
                    finalTask.taskState = TaskState.FAILED;
                } else {
                    // Interrupted runs are not failures: put the task back in the queue.
                    interrupted = true;
                    LOG.debug("Ignoring interrupted thread exception "+ex.getMessage(), ex);
                    finalTask.taskState = TaskState.QUEUED;
                    return;
                }
            }
        } catch ( LostLockException|InterruptedException ex ) {
            LOG.error("Failing heartbeat "+monitorInfo.getMonitorId()+" due to taskId="+
                      taskId+": "+ex.getMessage(), ex);
            monitorInfo.forceHeartbeatFailure();
            // Catching InterruptedException clears the thread's interrupt status;
            // record it so the finally block restores it.
            if ( ex instanceof InterruptedException ) {
                interrupted = true;
            }
        } finally {
            try {
                if ( monitorInfo.hasFailedHeartbeat() ) {
                    // Heartbeat failed: do not write any task state.
                    if ( interrupted ) Thread.currentThread().interrupt();
                    return;
                }
                try {
                    interrupted |= updateTaskState(
                        originalTask, finalTask, locksAcquired, monitorInfo, submitQueuedTask);
                } catch ( Throwable ex ) {
                    LOG.error("Failing heartbeat "+monitorInfo.getMonitorId()+
                              " in updateTaskState due to taskId="+taskId+": "+
                              ex.getMessage(), ex);
                    monitorInfo.forceHeartbeatFailure();
                }
                if ( interrupted ) {
                    Thread.currentThread().interrupt();
                }
                if ( finalTask.getTaskState().isTerminal() ) {
                    onTerminalState(finalTask);
                }
            } finally {
                if ( null != threadName ) {
                    Thread.currentThread().setName(threadName);
                }
            }
        }
    }

    /**
     * Persists the outcome of a task run, releases the locks it held and
     * submits any tasks that became runnable as a result.
     *
     * <p>The whole sequence is attempted up to three times; an attempt is
     * repeated when it was interrupted or when the task's {@code agn} counter
     * changed while the task was about to enter a waiting state. If every
     * attempt is used up, the monitor's heartbeat is forced to fail.
     *
     * @param originalTask the task as it was when claimed.
     * @param finalTask the state to persist; may be modified (e.g. re-queued).
     * @param locksAcquired ids of the locks to release; emptied by {@code releaseLocks}.
     * @param monitorInfo the monitor that owns the task.
     * @param submitQueuedTask whether a task left in QUEUED state should be resubmitted locally.
     * @return {@code true} if the thread was interrupted during the update.
     * @throws LostLockException if the task's {@code mid} no longer matches this monitor.
     */
    private boolean updateTaskState(Task originalTask, Task finalTask, List<String> locksAcquired, MonitorInfo monitorInfo, boolean submitQueuedTask) {
        boolean interrupted = false;
        long taskId = originalTask.getTaskId();
        List<Long> taskIdsToRun = new ArrayList<>();
        for ( int retry=0; retry < 3; retry++ ) {
            try {
                // First update all DB state:
                UpdateItemBuilder update = buildUpdateTaskState(originalTask, finalTask);

                // We MUST commit any terminal states of the task BEFORE the
                // prereqs lock is processed, otherwise this log line:
                // "Unable to increment agn field of lockId="... in
                // checkPrerequisites() will do a double-check of the task state
                // only to find the task still is not in a terminal state and
                // therefore the code will assume the enqueue of the prereq was
                // successful.

                // When moving to a waiting state, also require that "agn" is
                // unchanged since the task was claimed.
                boolean checkForRequeue = MONITOR_ID_WAITING.equals(finalTask.monitorId);
                try {
                    update.when((expr) -> {
                            FilterCondExpr result = expr.eq("mid", monitorInfo.getMonitorId());
                            if ( checkForRequeue ) {
                                result = expr.and(result,
                                                  ( null == originalTask.requeues ) ?
                                                  expr.not(expr.exists("agn")) :
                                                  expr.eq("agn", originalTask.requeues));
                            }
                            return result;
                        });
                } catch ( RollbackException rollbackEx ) {
                    if ( checkForRequeue ) {
                        Task task = _tasks.getItem(taskId);
                        if ( null != task && monitorInfo.getMonitorId().equals(task.getMonitorId()) ) {
                            // We still own the task, so the failure was the "agn"
                            // condition: re-queue instead of waiting.
                            LOG.debug("'agn'="+originalTask.requeues+" of taskId="+taskId+
                                      " changed to="+task.requeues+" during run, retrying");
                            finalTask.taskState = TaskState.QUEUED;
                            if ( submitQueuedTask ) taskIdsToRun.add(taskId);
                            continue;
                        }
                    }
                    throw new LostLockException("taskId="+taskId);
                }
                if ( null != originalTask.updateData ) {
                    // Remove the update data, but only if has remained unchanged (which
                    // is why we have to do a separate update):
                    try {
                        UpdateItemBuilder removeUpd = _tasks.updateItem(originalTask.getTaskId(), null)
                            .remove("upd");
                        if ( finalTask.getTaskState().isTerminal() ) {
                            removeUpd.always();
                        } else {
                            removeUpd.when((expr) -> expr.eq("upd", originalTask.updateData));
                        }
                    } catch ( RollbackException ex ) {
                        LOG.debug("'upd' of taskId="+taskId+" changed during run, not clearing 'upd'");
                    }
                }

                releaseLocks(locksAcquired,
                             taskId,
                             monitorInfo.getMonitorId(),
                             taskIdsToRun,
                             finalTask.getTaskState().isTerminal());

                // Get the tasks to run immediately:
                if ( _monitor.isActiveMonitor(monitorInfo) ) {
                    boolean taskSubmitted = false;
                    for ( Long taskIdToRun : taskIdsToRun ) {
                        submitRunTask(taskIdToRun);
                        taskSubmitted = true;
                    }
                    taskIdsToRun.clear();

                    if ( submitQueuedTask && TaskState.QUEUED == finalTask.getTaskState() ) {
                        submitRunTask(taskId);
                        taskSubmitted = true;
                    }
                    if ( taskSubmitted ) {
                        submit(this::runNextTask);
                    }
                }
                return interrupted;
            } catch ( RuntimeException|InterruptedException ex ) {
                // Thread.interrupted() also clears the flag so the retry can proceed;
                // the caller restores it based on the returned value.
                if ( Thread.interrupted() || isInterruptedException(ex) ) {
                    interrupted = true;
                    LOG.debug("Interrupted in attempt to updateTaskState("+taskId+"): "+ex.getMessage(), ex);
                    continue;
                }
                throw (RuntimeException)ex;
            }
        }
        monitorInfo.forceHeartbeatFailure();
        LOG.error("Interrupted to many times, giving up on updateTaskState("+taskId+"), failing the monitor!");
        return interrupted;
    }

    /**
     * Builds (but does not execute) the update that records the outcome of a
     * task run, and decides which monitor id the task ends up with.
     *
     * <p>Side effects on {@code finalTask}: its {@code monitorId} is reset and
     * then set according to the resulting state, and a SUCCESS state is
     * downgraded to QUEUED when the lock ids or prerequisites changed or a
     * remaining delay is set.
     *
     * @param originalTask the task as it was when claimed; used for the key
     *     and for change detection.
     * @param finalTask the desired final state; modified as described above.
     * @return the update builder, still awaiting its condition.
     * @throws IllegalStateException if {@code finalTask} is in state RUNNING.
     */
    private UpdateItemBuilder buildUpdateTaskState(Task originalTask, Task finalTask) {
        UpdateItemBuilder update = _tasks.updateItem(originalTask.getTaskId(), null);
        // null means "release the monitor": the mid attribute is removed below.
        finalTask.monitorId = null;
        StringBuilder logMsg = new StringBuilder();
        switch ( finalTask.getTaskState() ) {
        case WAITING_FOR_INTERVAL:
            // Special case since we keep the mid locked:
            update.set("stat", AttrType.STR, toString(TaskState.WAITING_FOR_INTERVAL));
            LOG.debug("taskId="+originalTask.getTaskId()+" state="+finalTask.getTaskState());
            return update;
        case FAILED:
            update.set("err", AttrType.STR, finalTask.getErrorMessage())
                .set("errId", AttrType.STR, finalTask.getErrorId())
                .set("errT", AttrType.STR, finalTask.getErrorStackTrace());
            logMsg.append(" update err, errId, errT");
            break;
        case SUCCESS:
            // Persist checkpoint data only when the run changed it.
            if ( ! Arrays.equals(originalTask.getCheckpointData(), finalTask.getCheckpointData()) ) {
                update.set("st8", AttrType.BIN, finalTask.getCheckpointData());
                logMsg.append(" update st8");
            }
            // Changed lock ids: store them and re-queue the task.
            if ( ! originalTask.getLockIds().equals(finalTask.getLockIds()) ) {
                finalTask.monitorId = MONITOR_ID_QUEUED;
                finalTask.taskState = TaskState.QUEUED;
                if ( finalTask.getLockIds().isEmpty() ) {
                    update.remove("lids");
                    logMsg.append(" delete lids");
                } else {
                    update.set("lids", AttrType.STR_SET, finalTask.getLockIds());
                    logMsg.append(" set lids="+finalTask.getLockIds());
                }
            }
            // Changed prerequisites: store them and re-queue the task.
            if ( ! originalTask.getPrerequisiteTaskIds().equals(finalTask.getPrerequisiteTaskIds()) ) {
                finalTask.monitorId = MONITOR_ID_QUEUED;
                finalTask.taskState = TaskState.QUEUED;
                if ( finalTask.getPrerequisiteTaskIds().isEmpty() ) {
                    update.remove("preq")
                        .remove("any");
                    logMsg.append(" remove preq");
                } else {
                    update.set("preq", AttrType.STR_SET, finalTask.getPrerequisiteTaskIds())
                        .set("any", AttrType.BOOL, finalTask.isAnyPrerequisiteTaskId());
                    logMsg.append(" set preq="+finalTask.getPrerequisiteTaskIds());
                }
            }
            // A remaining delay: store it as "tic" and re-queue the task.
            if ( null != finalTask.getMillisecondsRemaining() ) {
                finalTask.monitorId = MONITOR_ID_QUEUED;
                finalTask.taskState = TaskState.QUEUED;
                update.set("tic", AttrType.NUM, finalTask.getMillisecondsRemaining());
                logMsg.append(" set tic="+finalTask.getMillisecondsRemaining());
            }
            break;
        case WAITING_FOR_PREREQUISITE:
        case WAITING_FOR_LOCK:
            finalTask.monitorId = MONITOR_ID_WAITING;
            break;
        case QUEUED:
            finalTask.monitorId = MONITOR_ID_QUEUED;
            break;
        case RUNNING:
            throw new IllegalStateException("finalTask should NEVER be marked as RUNNING!");
        case CANCELED:
            // release the monitor and set the task state as such...
            break;
        }
        // Uses the state as possibly rewritten by the SUCCESS branch above.
        update.set("stat", AttrType.STR, toString(finalTask.getTaskState()))
            .set("tf", AttrType.NUM, System.currentTimeMillis());
        if ( null == finalTask.monitorId ) {
            logMsg.append(" delete mid, ntty, ntid");
            update.remove("mid")
                .remove("ntty")
                .remove("ntid");
        } else {
            logMsg.append(" set mid="+finalTask.monitorId);
            update.set("mid", AttrType.STR, finalTask.monitorId);
        }
        LOG.debug(
            "taskId="+originalTask.getTaskId()+" state="+finalTask.getTaskState()+logMsg.toString());
        return update;
    }

    /**
     * Tells whether the throwable's exact class is one of the two exception
     * types treated as an interruption here. The match is on the class name,
     * so subclasses are not recognized.
     *
     * @param ex the throwable to classify; must not be {@code null}.
     * @return {@code true} for {@code com.amazonaws.AbortedException} and
     *     {@code java.lang.InterruptedException}, {@code false} otherwise.
     */
    private boolean isInterruptedException(Throwable ex) {
        String className = ex.getClass().getName();
        return "com.amazonaws.AbortedException".equals(className)
            || "java.lang.InterruptedException".equals(className);
    }

    /**
     * Releases the given locks from last to first, unblocking tasks waiting
     * on each one and removing this task's own waiter entry for it.
     *
     * @param locks ids of the locks held; emptied as each lock is released.
     * @param taskId id of the task releasing the locks.
     * @param monitorId id of the monitor recorded as the lock holder.
     * @param taskIdsToRun output list receiving ids of tasks that were unblocked.
     * @param isTerminal whether the releasing task ended in a terminal state;
     *     only then are waiters on its per-task lock unblocked.
     * @throws InterruptedException if interrupted while unblocking waiters.
     */
    private void releaseLocks(List<String> locks, Long taskId, String monitorId, List<Long> taskIdsToRun, boolean isTerminal)
        throws InterruptedException
    {
        // NOTE(review): this call unboxes taskId, so the null check further
        // down can never observe null - confirm whether null is a valid input.
        String prereqsLock = getLockForTaskId(taskId);
        while ( ! locks.isEmpty() ) {
            String lockId = locks.get(locks.size()-1);

            // Mark next task as runnable:
            unblockWaitingTasks(lockId, monitorId, taskIdsToRun,
                                isTerminal && lockId.equals(prereqsLock));
            // Remove our queued mark:
            if ( null != taskId ) {
                _locks.deleteItem(lockId, longToSortKey(taskId));
            }
            // Only drop the lock from the list once it has been released.
            locks.remove(locks.size()-1);
        }
    }

    /**
     * Moves tasks waiting on {@code lockId} back to the queued state and then
     * deletes the lock-holder entry, retrying if new waiters arrived meanwhile.
     *
     * <p>For a per-task lock (id starting with {@code "_TASK:"}) waiters are
     * only unblocked when {@code processPrereqs} is true; otherwise the holder
     * entry is simply deleted. For any other lock, waiters are unblocked
     * regardless of {@code processPrereqs}.
     *
     * @param lockId the lock being released.
     * @param monitorId id of the monitor expected to hold the lock.
     * @param taskIdsToRun output list receiving ids of tasks that were unblocked.
     * @param processPrereqs whether to unblock all waiters of a per-task lock.
     * @throws InterruptedException if interrupted during the randomized retry sleep.
     * @throws LostLockException if the lock is no longer held by {@code monitorId}.
     */
    private void unblockWaitingTasks(String lockId, String monitorId, List<Long> taskIdsToRun, boolean processPrereqs)
        throws InterruptedException
    {
        if ( lockId.startsWith("_TASK:") ) {
            if ( ! processPrereqs ) {
                // We still need to release the lock:
                try {
                    _locks.deleteItem(lockId, TASK_ID_NONE, (expr) -> expr.eq("mid", monitorId));
                    LOG.debug("released lockId="+lockId);
                } catch ( RollbackException ex ) {
                    throw new LostLockException("lockId="+lockId);
                }
                return;
            }
            LOG.debug("unblocking all prerequisites lockId="+lockId);
        } else {
            processPrereqs = false;
        }
        for ( int retry=0;; retry++ ) {
            // If we retry, do a random sleep:
            if ( retry > 0 ) Thread.sleep(ThreadLocalRandom.current().nextLong(500));
            // Value of "agn" on the holder entry, used below to detect new waiters.
            Long tasksQueued = null;
            for ( PageIterator iter : new PageIterator().pageSize(processPrereqs ? 100 : 2) ) {
              NEXT_LOCK:
                for ( Lock lock : _locks.queryItems(lockId, iter).list() ) {
                    if ( TASK_ID_NONE.equals(lock.taskId) ) {
                        tasksQueued = lock.tasksQueued;
                        continue;
                    }
                    Long taskId = sortKeyToLong(lock.taskId);
                    boolean first = true;
                    while ( true ) {
                        try {
                            _tasks.updateItem(taskId, null)
                                .set("mid", AttrType.STR, MONITOR_ID_QUEUED)
                                .set("stat", AttrType.STR, toString(TaskState.QUEUED))
                                .when((expr) -> expr.eq("mid", MONITOR_ID_WAITING));
                            LOG.debug("unblocked taskId="+taskId+" that was waiting for "+lockId);
                            taskIdsToRun.add(taskId);
                            break;
                        } catch ( RollbackException ex ) {
                            if ( first ) {
                                // Bump the task's "agn" and try once more.
                                first = false;
                                LOG.debug("taskId="+taskId+" was not in a waiting state, incrementing 'agn'");
                                _tasks.updateItem(taskId, null)
                                    .increment("agn", 1)
                                    .always();
                            } else {
                                LOG.debug("taskId="+taskId+" was not in a waiting state");
                                continue NEXT_LOCK;
                            }
                        }
                    }
                    // Just unblocking a single task
                    // NOTE(review): this break only leaves the current page; the
                    // outer page loop keeps going, so one task per page may be
                    // unblocked - confirm whether that is intended.
                    if ( ! processPrereqs ) break;
                }
            }
            if ( null == tasksQueued ) {
                LOG.error("Expected lockId="+lockId+" to exist!");
                return;
            }
            try {
                Long finalTasksQueued = tasksQueued;
                // Only delete if we still hold the lock and no waiter bumped "agn".
                _locks.deleteItem(lockId, TASK_ID_NONE, (expr) -> expr.and(
                                      expr.eq("mid", monitorId),
                                      expr.eq("agn", finalTasksQueued)));
                LOG.debug("released lockId="+lockId);
            } catch ( RollbackException ex ) {
                Lock lock = _locks.getItem(lockId, TASK_ID_NONE);
                if ( null == lock || ! monitorId.equals(lock.monitorId) ) {
                    throw new LostLockException("lockId="+lockId);
                }
                // Still ours, so "agn" changed: rescan for the new waiters.
                LOG.debug("Retrying unblockWaitingTask("+lockId+")");
                continue;
            }
            return;
        }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy