All downloads are free. Search and download functionality uses the official Maven repository.

org.projectnessie.nessie.tasks.service.impl.TasksServiceImpl Maven / Gradle / Ivy

The newest version!
/*
 * Copyright (C) 2024 Dremio
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.projectnessie.nessie.tasks.service.impl;

import static com.google.common.base.Preconditions.checkState;
import static java.lang.String.format;
import static java.util.Objects.requireNonNull;
import static java.util.concurrent.CompletableFuture.completedStage;
import static java.util.concurrent.CompletableFuture.failedStage;

import jakarta.inject.Inject;
import jakarta.inject.Singleton;
import java.time.Instant;
import java.time.temporal.ChronoUnit;
import java.util.ConcurrentModificationException;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.CompletionStage;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
import org.immutables.value.Value;
import org.projectnessie.nessie.tasks.api.TaskBehavior;
import org.projectnessie.nessie.tasks.api.TaskObj;
import org.projectnessie.nessie.tasks.api.TaskRequest;
import org.projectnessie.nessie.tasks.api.TaskState;
import org.projectnessie.nessie.tasks.api.TaskStatus;
import org.projectnessie.nessie.tasks.api.Tasks;
import org.projectnessie.nessie.tasks.api.TasksService;
import org.projectnessie.nessie.tasks.async.TasksAsync;
import org.projectnessie.nessie.tasks.service.TasksServiceConfig;
import org.projectnessie.versioned.storage.common.exceptions.ObjNotFoundException;
import org.projectnessie.versioned.storage.common.exceptions.ObjTooLargeException;
import org.projectnessie.versioned.storage.common.persist.Obj;
import org.projectnessie.versioned.storage.common.persist.ObjId;
import org.projectnessie.versioned.storage.common.persist.Persist;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

@Singleton
public class TasksServiceImpl implements TasksService {
  private static final Logger LOGGER = LoggerFactory.getLogger(TasksServiceImpl.class);

  private final String name;
  private final TasksAsync async;
  private final TaskServiceMetrics metrics;

  private final long raceWaitMillisMin;
  private final long raceWaitMillisMax;

  private final ConcurrentMap> currentTasks =
      new ConcurrentHashMap<>();

  private volatile boolean shutdown;

  public TasksServiceImpl() {
    this(null, null, null);
  }

  @Inject
  public TasksServiceImpl(
      @TasksServiceExecutor TasksAsync async,
      TaskServiceMetrics metrics,
      TasksServiceConfig config) {
    this.async = async;
    this.metrics = metrics;
    this.name = config.name();
    this.raceWaitMillisMin = config.raceWaitMillisMin();
    this.raceWaitMillisMax = config.raceWaitMillisMax();
  }

  @Override
  public CompletionStage shutdown() {
    shutdown = true;
    return CompletableFuture.allOf(
            currentTasks.values().stream()
                .map(CompletionStage::toCompletableFuture)
                .toArray(CompletableFuture[]::new))
        .thenApply(x -> null);
  }

  @Override
  public Tasks forPersist(Persist persist) {
    return new TasksImpl(persist);
  }

   CompletionStage submit(
      Persist persist, TaskRequest taskRequest) {
    ObjId objId = taskRequest.objId();

    // Try to get the object and immediately return if it has a final state. We expect to hit final
    // states way more often, so preventing the concurrent-hash-map interactions and especially the
    // asynchronous task handling improves the implementation.
    // TODO using `fetchObj()` would be wrong here, because it is *synchronous* and can block.
    //  Options:
    //  a) remove this "optimization"
    //  b) add a `getObjIfCached(ObjId)` --> chosen for now
    //  c) add a `fetchObjAsync()`, but adding async variants to all database implementations will
    //     be tricky
    Obj obj = persist.getImmediate(taskRequest.objId());
    if (obj != null) {
      T taskObj = castObj(taskRequest, obj);
      TaskStatus status = taskObj.taskState().status();
      switch (status) {
        case FAILURE:
          metrics.taskHasFinalFailure();
          return failedStage(taskRequest.behavior().stateAsException(taskObj));
        case SUCCESS:
          metrics.taskHasFinalSuccess();
          return completedStage(taskObj);
        default:
          // task object exists but has a non-final state, handle it asynchronously
          checkState(!status.isFinal(), "Expect non-final task status");
          break;
      }
    }

    // Ensure that only one "base" completable future exists for each obj-id.
    TaskKey taskKey = TaskKey.taskKey(persist.config().repositoryId(), objId);

    // The shutdown-check can be racy, if shutdown() is called after the `if` but before the access
    // to `currentTasks`. But since `shutdown()` is usually only relevant for tests, that trade-off
    // is acceptable.
    if (shutdown) {
      return CompletableFuture.failedStage(
          new IllegalStateException("Tasks service already shutdown"));
    }

    @SuppressWarnings("unchecked")
    CompletionStage r =
        (CompletionStage)
            currentTasks.computeIfAbsent(
                taskKey,
                id -> {
                  metrics.startNewTaskController();
                  ExecParams execParams = new ExecParams(persist, taskRequest);
                  LOGGER.trace("{}: Starting new local task controller for {}", name, execParams);
                  async.call(() -> tryLocal(execParams));
                  return execParams.resultFuture;
                });
    return r;
  }

  private void finalResult(ExecParams params, TaskObj result) {
    try {
      params.resultFuture.complete(result);
    } finally {
      removeFromCurrentTasks(params);
    }
  }

  private void finalFailure(ExecParams params, Throwable t) {
    try {
      params.resultFuture.completeExceptionally(t);
    } finally {
      removeFromCurrentTasks(params);
    }
  }

  private void removeFromCurrentTasks(ExecParams params) {
    TaskKey taskKey = TaskKey.taskKey(params.persist.config().repositoryId(), params.objId());
    currentTasks.remove(taskKey);
  }

  private void tryLocal(ExecParams params) {
    // Called from a thread pool, need to lock.
    params.lock.lock();
    try {
      metrics.taskAttempt();
      LOGGER.trace("{}: Task evaluation attempt for {}", name, params);

      TaskObj obj = castObj(params.taskRequest, params.persist.fetchObj(params.objId()));
      // keep in mind: `obj` might be a locally cached instance that is not in sync w/ the
      // database!

      TaskState state = obj.taskState();
      LOGGER.trace("{}: Evaluating task for {} with state {}", name, params, state);

      switch (state.status()) {
        case SUCCESS:
          metrics.taskAttemptFinalSuccess();
          finalResult(params, obj);
          break;
        case FAILURE:
          metrics.taskAttemptFinalFailure();
          finalFailure(params, params.taskRequest.behavior().stateAsException(obj));
          break;
        case RUNNING:
          metrics.taskAttemptRunning();
          checkRunningTask(params, state, obj);
          break;
        case ERROR_RETRY:
          metrics.taskAttemptErrorRetry();
          maybeAttemptErrorRetry(params, state, obj);
          break;
        default:
          throw new IllegalStateException("Unknown task status " + state.status());
      }

    } catch (ObjNotFoundException e) {
      LOGGER.trace("{}: Task for {} does not yet exist, creating", name, params);

      try {
        metrics.taskCreation();
        TaskBehavior behavior = params.taskRequest.behavior();
        TaskObj obj =
            withNewVersionToken(
                params
                    .taskRequest
                    .applyRequestToObjBuilder(behavior.newObjBuilder())
                    .id(params.taskRequest.objId())
                    .type(params.taskRequest.objType())
                    .taskState(behavior.runningTaskState(async.clock(), null)));

        if (params.persist.storeObj(obj)) {
          LOGGER.trace("{}: Task creation for {} succeeded", name, params);
          issueLocalTaskExecution(params, obj);
        } else {
          LOGGER.trace("{}: Task creation for {} failed, retrying", name, params);

          // Another process stored the task-obj for the task-request, reschedule but do not loop to
          // be "nice" and give other requests the ability to run.
          metrics.taskCreationRace();
          reattemptAfterRace(params);
        }
      } catch (Throwable t) {
        // Unhandled failure
        LOGGER.error(
            "{}: Unhandled state while storing initial task execution state for {}",
            name,
            params,
            t);
        metrics.taskCreationUnhandled();
        finalFailure(params, t);
      }
    } catch (Throwable t) {
      // Unhandled failure
      LOGGER.error("{}: Unhandled state during local task attempt for {}", name, params, t);
      metrics.taskAttemptUnhandled();
      finalFailure(params, t);
    } finally {
      params.lock.unlock();
    }
  }

  // Called while ExecParams is locked from tryLocal()
  private void checkRunningTask(ExecParams params, TaskState state, TaskObj obj)
      throws ObjTooLargeException {
    Instant now = async.clock().instant();
    if (now.compareTo(requireNonNull(state.lostNotBefore())) >= 0) {
      metrics.taskLossDetected();
      LOGGER.warn("{}: Detected lost task for {}", name, params);
      TaskBehavior behavior = params.taskRequest.behavior();
      TaskObj retryState =
          withNewVersionToken(
              behavior
                  .newObjBuilder()
                  .from(obj)
                  .taskState(behavior.runningTaskState(async.clock(), obj)));

      if (params.persist.updateConditional(obj, retryState)) {
        metrics.taskLostReassigned();
        issueLocalTaskExecution(params, retryState);
      } else {
        metrics.taskLostReassignRace();
        reattemptAfterRace(params);
      }
    } else {
      async.schedule(() -> tryLocal(params), state.retryNotBefore());
    }
  }

  // Called while ExecParams is locked from tryLocal()
  private void maybeAttemptErrorRetry(ExecParams params, TaskState state, TaskObj obj)
      throws ObjTooLargeException {
    Instant now = async.clock().instant();
    if (now.compareTo(requireNonNull(state.retryNotBefore())) >= 0) {
      TaskBehavior behavior = params.taskRequest.behavior();
      TaskObj retryState =
          withNewVersionToken(
              behavior
                  .newObjBuilder()
                  .from(obj)
                  .taskState(behavior.runningTaskState(async.clock(), obj)));

      if (params.persist.updateConditional(obj, retryState)) {
        metrics.taskRetryStateChangeSucceeded();
        issueLocalTaskExecution(params, retryState);
      } else {
        metrics.taskRetryStateChangeRace();
        reattemptAfterRace(params);
      }
    } else {
      async.schedule(() -> tryLocal(params), state.retryNotBefore());
    }
  }

  private void reattemptAfterRace(ExecParams params) {
    long raceWaitMillis =
        ThreadLocalRandom.current().nextLong(raceWaitMillisMin, raceWaitMillisMax);
    async.schedule(
        () -> tryLocal(params), async.clock().instant().plus(raceWaitMillis, ChronoUnit.MILLIS));
  }

  // Called while ExecParams is locked from tryLocal()
  private void issueLocalTaskExecution(ExecParams params, TaskObj obj) {
    LOGGER.debug("{}: Starting local task execution for {}", name, params);
    metrics.taskExecution();

    params.runningObj = obj;
    scheduleTaskRunningUpdate(params, obj);

    params
        .taskRequest
        .submitExecution()
        .whenComplete(
            (resultBuilder, failure) -> localTaskFinished(params, resultBuilder, failure));
  }

  private void localTaskFinished(
      ExecParams params, TaskObj.Builder resultBuilder, Throwable failure) {
    // Called from a thread pool, need to lock.
    params.lock.lock();
    try {
      TaskObj expected = params.runningObj;

      params.cancelRunningStateUpdate();

      metrics.taskExecutionFinished();

      if (expected == null) {
        unexpectedNullExpectedState(params, resultBuilder, failure);
        return;
      }

      if (resultBuilder != null) {
        TaskObj r = withNewVersionToken(resultBuilder);

        LOGGER.trace("{}, Task execution for {} succeeded, updating database", name, params);

        // Task execution succeeded with a final result
        if (params.persist.updateConditional(expected, r)) {
          metrics.taskExecutionResult();
          // Database updated with final result
          LOGGER.debug(
              "{}: Task execution success result for {} updated in database, returning final result",
              name,
              params);
          finalResult(params, r);
        } else {
          metrics.taskExecutionResultRace();
          // Another process updated the database state in the meantime.
          String msg =
              format(
                  "Failed to update successful task execution result for %s in database (race condition), exposing as a failure",
                  params);
          LOGGER.warn("{}: {}", name, msg);
          finalFailure(params, new ConcurrentModificationException(msg, failure));
        }
      } else if (failure == null) {
        failure =
            new NullPointerException("Local task execution return a null object, which is illegal");
      }
      if (failure != null) {
        LOGGER.trace("{}: Task execution for {} failed, updating database", name, params);

        TaskBehavior behavior = params.taskRequest.behavior();
        TaskState newState = behavior.asErrorTaskState(async.clock(), expected, failure);
        checkState(newState.status().isError());
        TaskObj updatedObj =
            withNewVersionToken(behavior.newObjBuilder().from(expected).taskState(newState));
        if (params.persist.updateConditional(expected, updatedObj)) {
          // Database updated with final result
          if (newState.status().isRetryable()) {
            metrics.taskExecutionRetryableError();
            LOGGER.debug(
                "{}: Task execution raised retryable error for {} updated in database, retrying",
                name,
                params);
            reattemptAfterRetryableError(params, newState.retryNotBefore());
          } else {
            metrics.taskExecutionFailure();
            LOGGER.debug(
                "{}: Task execution ended in final failure for {} updated in database, returning final result",
                name,
                params);
            finalFailure(params, failure);
          }
        } else {
          metrics.taskExecutionFailureRace();
          String msg =
              format(
                  "Failed to update failure task execution result for %s in database (race condition)",
                  params);
          LOGGER.warn("{}: {}", name, msg);
          finalFailure(params, new ConcurrentModificationException(msg, failure));
        }
      }

    } catch (Throwable t2) {
      // Unhandled failure
      LOGGER.error(
          "{}: Unhandled state while evaluating task execution result for {}", name, params, t2);
      metrics.taskExecutionUnhandled();
      finalFailure(params, t2);
    } finally {
      params.lock.unlock();
    }
  }

  private void unexpectedNullExpectedState(
      ExecParams params, TaskObj.Builder resultBuilder, Throwable failure) {
    // Oops ... no clue how that might have happened, but handle it just in case.
    String res;
    if (failure != null) {
      res = "exceptionally";
    } else if (resultBuilder != null) {
      res = "successfully";
    } else {
      res = "with an illegal null result";
    }
    String msg =
        format(
            "Task execution for %s finished %s, but the expected task obj state is null. Cannot persist the task execution result.",
            params, res);
    LOGGER.error("{}, {}", name, msg);
    Exception ex = new IllegalStateException(msg);
    if (failure != null) {
      ex.addSuppressed(failure);
    }
    finalFailure(params, ex);
  }

  private void scheduleTaskRunningUpdate(ExecParams params, TaskObj current) {
    // Called while holding the ExecParams.lock
    Instant scheduleNotBefore =
        params.taskRequest.behavior().performRunningStateUpdateAt(async.clock(), current);
    params.runningUpdateScheduled =
        async.schedule(() -> updateRunningState(params), scheduleNotBefore);
  }

  private void updateRunningState(ExecParams params) {
    // Called from a thread pool, need to lock.
    params.lock.lock();
    try {

      TaskObj current = params.runningObj;
      if (current == null) {
        // Local task execution finished, do nothing.
        LOGGER.trace(
            "{}: Local task execution has finished, no need to update running state for {}",
            name,
            params);
        return;
      }

      metrics.taskUpdateRunningState();
      TaskState state = current.taskState();
      if (state.status() == TaskStatus.RUNNING) {
        TaskBehavior behavior = params.taskRequest.behavior();
        TaskObj updated =
            withNewVersionToken(
                behavior
                    .newObjBuilder()
                    .from(current)
                    .taskState(behavior.runningTaskState(async.clock(), null)));
        if (updated.taskState().status() != TaskStatus.RUNNING) {
          throw new IllegalStateException(
              format(
                  "TaskBehavior.runningTaskState() implementation %s returned illegal status %s, must return RUNNING",
                  behavior.getClass().getName(), updated.taskState().status()));
        }

        try {
          if (params.persist.updateConditional(current, updated)) {
            params.runningObj = updated;
            metrics.taskRunningStateUpdated();
            // Current state successfully updated in database, reschedule running task update
            LOGGER.trace(
                "{}: Successfully updated state for locally running task for {}", name, params);
            scheduleTaskRunningUpdate(params, updated);
          } else {
            metrics.taskRunningStateUpdateRace();
            // Ran into a (remote) race, retry running-update
            LOGGER.warn(
                "{}: Race on database update while updating running state for {}. The result of the local task "
                    + "execution might be lost. When the local task execution finishes, it may also run into an "
                    + "update-race, indicating that the task-result is lost.",
                name,
                params);
            return; // don't re-schedule, there's no chance that another update will succeed.
          }
        } catch (Throwable t) {
          LOGGER.error("{}: Unexpected exception updating task state for {}", name, params, t);
          // re-schedule ... and pray
          scheduleTaskRunningUpdate(params, current);
        }
      } else {
        metrics.taskRunningStateUpdateNoLongerRunning();
        LOGGER.trace(
            "{}: Task for {} no longer running, skipping further local running state updates",
            name,
            params);
      }
    } finally {
      params.lock.unlock();
    }
  }

  private void reattemptAfterRetryableError(ExecParams params, Instant retryNotBefore) {
    async.schedule(() -> tryLocal(params), retryNotBefore);
  }

  private static final class ExecParams {
    final Persist persist;
    final CompletableFuture resultFuture;
    final TaskRequest taskRequest;

    final Lock lock = new ReentrantLock();

    TaskObj runningObj;
    CompletionStage runningUpdateScheduled;

    @SuppressWarnings("unchecked")
    ExecParams(Persist persist, TaskRequest taskRequest) {
      this.persist = persist;
      this.resultFuture = new CompletableFuture<>();
      this.taskRequest = (TaskRequest) taskRequest;
    }

    ObjId objId() {
      return taskRequest.objId();
    }

    void cancelRunningStateUpdate() {
      // Cancel scheduled running-state update
      CompletionStage handle = runningUpdateScheduled;
      if (handle != null) {
        runningObj = null;
        runningUpdateScheduled = null;
        // Don't interrupt, not all implementations support that (Vert.X won't, see
        // https://github.com/eclipse-vertx/vert.x/issues/3334)
        handle.toCompletableFuture().cancel(false);
      }
    }

    @Override
    public String toString() {
      return taskRequest.objType().name() + ':' + taskRequest.objId();
    }
  }

  private TaskObj withNewVersionToken(TaskObj.Builder builder) {
    return builder.versionToken(ObjId.randomObjId().toString()).build();
  }

  private static  T castObj(
      TaskRequest taskRequest, Obj obj) {
    Class clazz = taskRequest.behavior().objType().targetClass();
    try {
      @SuppressWarnings("unchecked")
      T taskObj = (T) clazz.cast(obj);
      return taskObj;
    } catch (ClassCastException e) {
      throw new ClassCastException(
          "Failed to cast obj of type "
              + obj.type().name()
              + " to the task request's expected type "
              + clazz.getName());
    }
  }

  final class TasksImpl implements Tasks {
    final Persist persist;

    public TasksImpl(Persist persist) {
      this.persist = persist;
    }

    @Override
    public  CompletionStage submit(
        TaskRequest taskRequest) {
      return TasksServiceImpl.this.submit(persist, taskRequest);
    }
  }

  @Value.Immutable
  interface TaskKey {
    @Value.Parameter(order = 1)
    String repositoryId();

    @Value.Parameter(order = 2)
    ObjId objId();

    static TaskKey taskKey(String repositoryId, ObjId objId) {
      return ImmutableTaskKey.of(repositoryId, objId);
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy