package com.hubspot.singularity.mesos;

import com.codahale.metrics.Histogram;
import com.codahale.metrics.Meter;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Multiset;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.google.inject.name.Named;
import com.hubspot.mesos.JavaUtils;
import com.hubspot.mesos.protos.MesosTaskState;
import com.hubspot.singularity.ExtendedTaskState;
import com.hubspot.singularity.InvalidSingularityTaskIdException;
import com.hubspot.singularity.LoadBalancerRequestType;
import com.hubspot.singularity.RequestType;
import com.hubspot.singularity.Singularity;
import com.hubspot.singularity.SingularityCreateResult;
import com.hubspot.singularity.SingularityLoadBalancerUpdate;
import com.hubspot.singularity.SingularityMainModule;
import com.hubspot.singularity.SingularityManagedThreadPoolFactory;
import com.hubspot.singularity.SingularityPendingDeploy;
import com.hubspot.singularity.SingularityPendingRequest;
import com.hubspot.singularity.SingularityPendingRequest.PendingType;
import com.hubspot.singularity.SingularityPendingRequestBuilder;
import com.hubspot.singularity.SingularityPendingTask;
import com.hubspot.singularity.SingularityRequestWithState;
import com.hubspot.singularity.SingularityTask;
import com.hubspot.singularity.SingularityTaskCleanup;
import com.hubspot.singularity.SingularityTaskHistory;
import com.hubspot.singularity.SingularityTaskHistoryUpdate;
import com.hubspot.singularity.SingularityTaskId;
import com.hubspot.singularity.SingularityTaskStatusHolder;
import com.hubspot.singularity.TaskCleanupType;
import com.hubspot.singularity.async.ExecutorAndQueue;
import com.hubspot.singularity.config.SingularityConfiguration;
import com.hubspot.singularity.data.DeployManager;
import com.hubspot.singularity.data.RequestManager;
import com.hubspot.singularity.data.TaskManager;
import com.hubspot.singularity.data.history.HistoryManager;
import com.hubspot.singularity.data.history.TaskHistoryHelper;
import com.hubspot.singularity.data.transcoders.IdTranscoder;
import com.hubspot.singularity.data.transcoders.SingularityTranscoderException;
import com.hubspot.singularity.helpers.MesosProtosUtils;
import com.hubspot.singularity.helpers.MesosUtils;
import com.hubspot.singularity.hooks.LoadBalancerClient;
import com.hubspot.singularity.scheduler.SingularityHealthchecker;
import com.hubspot.singularity.scheduler.SingularityLeaderCache;
import com.hubspot.singularity.scheduler.SingularityNewTaskChecker;
import com.hubspot.singularity.scheduler.SingularityScheduler;
import com.hubspot.singularity.sentry.SingularityExceptionNotifier;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.CompletableFuture;
import org.apache.mesos.v1.Protos;
import org.apache.mesos.v1.Protos.TaskState;
import org.apache.mesos.v1.Protos.TaskStatus;
import org.apache.mesos.v1.Protos.TaskStatus.Reason;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

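/**
 * Handles task status updates from the Mesos scheduler API: deduplicates updates, persists task
 * history, attempts to recover or kill tasks with missing data, and triggers follow-up scheduling
 * for completed or lost tasks.
 */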
@Singleton
public class SingularityMesosStatusUpdateHandler {
  private static final Logger LOG = LoggerFactory.getLogger(
    SingularityMesosStatusUpdateHandler.class
  );

  private static final Set<MesosTaskState> ACTIVE_STATES = ImmutableSet.of(
    MesosTaskState.TASK_STAGING,
    MesosTaskState.TASK_STARTING,
    MesosTaskState.TASK_RUNNING
  );
  private static final String RESOURCE_MISMATCH_ERR =
    "required by task and its executor is more than available";

  private final TaskManager taskManager;
  private final DeployManager deployManager;
  private final RequestManager requestManager;
  private final IdTranscoder<SingularityTaskId> taskIdTranscoder;
  private final SingularityExceptionNotifier exceptionNotifier;
  private final SingularityHealthchecker healthchecker;
  private final SingularityNewTaskChecker newTaskChecker;
  private final SingularityAgentAndRackManager agentAndRackManager;
  private final SingularityMesosExecutorInfoSupport logSupport;
  private final SingularityScheduler scheduler;
  private final SingularityLeaderCache leaderCache;
  private final MesosProtosUtils mesosProtosUtils;
  private final String serverId;
  private final SingularitySchedulerLock schedulerLock;
  private final SingularityConfiguration configuration;
  private final Multiset<Reason> taskLostReasons;
  private final Meter lostTasksMeter;
  private final Histogram statusUpdateDeltas;
  private final LoadBalancerClient lbClient;
  private final HistoryManager historyManager;

  private final ExecutorAndQueue statusUpdatesExecutor;

  @Inject
  public SingularityMesosStatusUpdateHandler(
    TaskManager taskManager,
    DeployManager deployManager,
    RequestManager requestManager,
    IdTranscoder<SingularityTaskId> taskIdTranscoder,
    SingularityExceptionNotifier exceptionNotifier,
    SingularityHealthchecker healthchecker,
    SingularityNewTaskChecker newTaskChecker,
    SingularityAgentAndRackManager agentAndRackManager,
    SingularityMesosExecutorInfoSupport logSupport,
    SingularityScheduler scheduler,
    @Named(SingularityMainModule.SERVER_ID_PROPERTY) String serverId,
    SingularitySchedulerLock schedulerLock,
    SingularityConfiguration configuration,
    SingularityLeaderCache leaderCache,
    MesosProtosUtils mesosProtosUtils,
    LoadBalancerClient lbClient,
    HistoryManager historyManager,
    SingularityManagedThreadPoolFactory threadPoolFactory,
    @Named(
      SingularityMesosModule.TASK_LOST_REASONS_COUNTER
    ) Multiset<Reason> taskLostReasons,
    @Named(SingularityMainModule.LOST_TASKS_METER) Meter lostTasksMeter,
    @Named(SingularityMainModule.STATUS_UPDATE_DELTAS) Histogram statusUpdateDeltas
  ) {
    this.taskManager = taskManager;
    this.deployManager = deployManager;
    this.requestManager = requestManager;
    this.taskIdTranscoder = taskIdTranscoder;
    this.exceptionNotifier = exceptionNotifier;
    this.healthchecker = healthchecker;
    this.newTaskChecker = newTaskChecker;
    this.agentAndRackManager = agentAndRackManager;
    this.logSupport = logSupport;
    this.scheduler = scheduler;
    this.leaderCache = leaderCache;
    this.mesosProtosUtils = mesosProtosUtils;
    this.serverId = serverId;
    this.schedulerLock = schedulerLock;
    this.configuration = configuration;
    this.lbClient = lbClient;
    this.historyManager = historyManager;
    this.taskLostReasons = taskLostReasons;
    this.lostTasksMeter = lostTasksMeter;
    this.statusUpdateDeltas = statusUpdateDeltas;
    this.statusUpdatesExecutor =
      threadPoolFactory.get(
        "status-updates",
        configuration.getMesosConfiguration().getStatusUpdateConcurrencyLimit(),
        configuration.getMesosConfiguration().getMaxStatusUpdateQueueSize(),
        true
      );
  }

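  /**
   * A recovery update is a non-terminal status for a task that has already been removed from the
   * active list (for example, an agent re-registering after being marked unreachable): no previous
   * status holder exists, but the new status reports an active Mesos state.
   */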
  private boolean isRecoveryStatusUpdate(
    Optional<SingularityTaskStatusHolder> previousTaskStatusHolder,
    Reason reason,
    ExtendedTaskState taskState,
    final SingularityTaskStatusHolder newTaskStatusHolder
  ) {
    if (
      !previousTaskStatusHolder.isPresent() && // Task was already removed from the active list
      !taskState.isDone() &&
      newTaskStatusHolder.getTaskStatus().isPresent() &&
      ACTIVE_STATES.contains(newTaskStatusHolder.getTaskStatus().get().getState())
    ) {
      LOG.warn(
        "Task {} recovered but may have already been replaced",
        newTaskStatusHolder.getTaskId()
      );
      return true;
    }
    return false;
  }

  /**
   * 1 - we have a previous update, and this is a duplicate of it (ignore)
   * 2 - we don't have a previous update, 2 cases:
   *   a - this task has already been destroyed (we can ignore it then)
   *   b - we've never heard of this task (very unlikely since we first write a status into zk
   *       before we launch a task)
   */
  private boolean isDuplicateOrIgnorableStatusUpdate(
    Optional<SingularityTaskStatusHolder> previousTaskStatusHolder,
    final SingularityTaskStatusHolder newTaskStatusHolder
  ) {
    if (!previousTaskStatusHolder.isPresent()) {
      return true;
    }

    if (!previousTaskStatusHolder.get().getTaskStatus().isPresent()) { // this is our launch state
      return false;
    }

    return (
      previousTaskStatusHolder.get().getTaskStatus().get().getState() ==
      newTaskStatusHolder.getTaskStatus().get().getState()
    );
  }

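  /**
   * Persists the latest status for an active task, or deletes the stored status once the task
   * has reached a terminal state.
   */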
  private void saveNewTaskStatusHolder(
    SingularityTaskId taskIdObj,
    SingularityTaskStatusHolder newTaskStatusHolder,
    ExtendedTaskState taskState
  ) {
    if (taskState.isDone()) {
      taskManager.deleteLastActiveTaskStatus(taskIdObj);
    } else {
      taskManager.saveLastActiveTaskStatus(newTaskStatusHolder);
    }
  }

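  /** Parses the Mesos task id string into a {@link SingularityTaskId}, notifying on failure. */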
  private Optional<SingularityTaskId> getTaskId(String taskId) {
    try {
      return Optional.of(taskIdTranscoder.fromString(taskId));
    } catch (InvalidSingularityTaskIdException | SingularityTranscoderException e) {
      exceptionNotifier.notify(String.format("Unexpected taskId %s", taskId), e);
      LOG.error("Unexpected taskId {} ", taskId, e);
      return Optional.empty();
    }
  }

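  /**
   * Prefers the message supplied by Mesos; otherwise synthesizes a human-readable message for
   * memory or disk limit violations using the task's configured resources when available.
   */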
  private Optional<String> getStatusMessage(
    Protos.TaskStatus status,
    Optional<SingularityTask> task
  ) {
    if (status.hasMessage() && !Strings.isNullOrEmpty(status.getMessage())) {
      return Optional.of(status.getMessage());
    } else if (
      status.hasReason() &&
      status.getReason() == Reason.REASON_CONTAINER_LIMITATION_MEMORY
    ) {
      if (
        task.isPresent() &&
        task.get().getTaskRequest().getDeploy().getResources().isPresent()
      ) {
        if (
          task.get().getTaskRequest().getDeploy().getResources().get().getDiskMb() > 0
        ) {
          return Optional.of(
            String.format(
              "Task exceeded one or more memory limits (%s MB mem, %s MB disk).",
              task.get().getTaskRequest().getDeploy().getResources().get().getMemoryMb(),
              task.get().getTaskRequest().getDeploy().getResources().get().getDiskMb()
            )
          );
        } else {
          return Optional.of(
            String.format(
              "Task exceeded memory limit (%s MB mem).",
              task.get().getTaskRequest().getDeploy().getResources().get().getMemoryMb()
            )
          );
        }
      }
      return Optional.of("Task exceeded memory limit.");
    } else if (
      status.hasReason() && status.getReason() == Reason.REASON_CONTAINER_LIMITATION_DISK
    ) {
      if (
        task.isPresent() &&
        task.get().getTaskRequest().getDeploy().getResources().isPresent()
      ) {
        return Optional.of(
          String.format(
            "Task exceeded disk limit (%s MB disk).",
            task.get().getTaskRequest().getDeploy().getResources().get().getDiskMb()
          )
        );
      } else {
        return Optional.of("Task exceeded disk limit.");
      }
    }

    return Optional.empty();
  }

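  /**
   * Re-enqueues a lost task as a RETRY pending request, carrying over the original pending task's
   * run id, command line args, overrides, and scheduling metadata.
   */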
  private void relaunchTask(SingularityTask task) {
    SingularityPendingTask pendingTask = task.getTaskRequest().getPendingTask();

    SingularityPendingRequest pendingRequest = new SingularityPendingRequestBuilder()
      .setRequestId(task.getTaskRequest().getRequest().getId())
      .setDeployId(task.getTaskRequest().getDeploy().getId())
      .setPendingType(PendingType.RETRY)
      .setUser(pendingTask.getUser())
      .setRunId(pendingTask.getRunId())
      .setCmdLineArgsList(pendingTask.getCmdLineArgsList())
      .setSkipHealthchecks(pendingTask.getSkipHealthchecks())
      .setMessage(pendingTask.getMessage())
      .setResources(pendingTask.getResources())
      .setS3UploaderAdditionalFiles(pendingTask.getS3UploaderAdditionalFiles())
      .setRunAsUserOverride(pendingTask.getRunAsUserOverride())
      .setEnvOverrides(pendingTask.getEnvOverrides())
      .setExtraArtifacts(pendingTask.getExtraArtifacts())
      .setActionId(pendingTask.getActionId())
      .setRunAt(pendingTask.getPendingTaskId().getNextRunAt())
      .setTimestamp(System.currentTimeMillis())
      .build();

    requestManager.addToPendingQueue(pendingRequest);
  }

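  /**
   * Core status update processing. Assumes the caller already holds the scheduler lock for the
   * task's request (see {@link #processStatusUpdateAsync}).
   */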
  private StatusUpdateResult unsafeProcessStatusUpdate(
    Protos.TaskStatus status,
    SingularityTaskId taskIdObj
  ) {
    final String taskId = status.getTaskId().getValue();

    long timestamp = System.currentTimeMillis();

    if (status.hasTimestamp()) {
      timestamp = (long) (status.getTimestamp() * 1000);
    }

    long now = System.currentTimeMillis();
    long delta = now - timestamp;

    LOG.info(
      "Update: task {} is now {} ({}) at {} (delta: {})",
      taskId,
      status.getState(),
      status.getMessage(),
      timestamp,
      JavaUtils.durationFromMillis(delta)
    );
    statusUpdateDeltas.update(delta);

    final SingularityTaskStatusHolder newTaskStatusHolder = new SingularityTaskStatusHolder(
      taskIdObj,
      Optional.of(mesosProtosUtils.taskStatusFromProtos(status)),
      System.currentTimeMillis(),
      serverId,
      Optional.empty()
    );
    final Optional<SingularityTaskStatusHolder> previousTaskStatusHolder = taskManager.getLastActiveTaskStatus(
      taskIdObj
    );
    final ExtendedTaskState taskState = MesosUtils.fromTaskState(status.getState());

    if (
      taskState == ExtendedTaskState.TASK_ERROR &&
      status.getMessage() != null &&
      status.getMessage().contains(RESOURCE_MISMATCH_ERR)
    ) {
      LOG.error(
        "Possible duplicate resource allocation",
        new IllegalStateException(
          String.format(
            "Duplicate resource allocation for %s: %s",
            taskId,
            status.getMessage()
          )
        )
      );
    }

    if (
      isRecoveryStatusUpdate(
        previousTaskStatusHolder,
        status.getReason(),
        taskState,
        newTaskStatusHolder
      )
    ) {
      return tryRecoverTask(
        status,
        taskIdObj,
        taskId,
        newTaskStatusHolder,
        taskState,
        now
      );
    }

    // If a task is missing data in Singularity there is not much we can do to recover it
    Optional<SingularityTask> maybeTask = taskManager.getTask(taskIdObj);
    if (!maybeTask.isPresent()) {
      maybeTask = tryFindMissingTaskData(taskIdObj, taskId, taskState);
    }
    if (!maybeTask.isPresent()) {
      return handledMissingTaskData(
        taskIdObj,
        taskId,
        newTaskStatusHolder,
        taskState,
        now
      );
    }

    SingularityTask task = maybeTask.get();

    if (
      isDuplicateOrIgnorableStatusUpdate(previousTaskStatusHolder, newTaskStatusHolder)
    ) {
      LOG.trace("Ignoring status update {} to {}", taskState, taskIdObj);
      saveNewTaskStatusHolder(taskIdObj, newTaskStatusHolder, taskState);
      return StatusUpdateResult.IGNORED;
    }

    if (status.getState() == TaskState.TASK_LOST) {
      boolean isMesosFailure =
        status.getReason() == Reason.REASON_INVALID_OFFERS ||
        status.getReason() == Reason.REASON_AGENT_REMOVED ||
        status.getReason() == Reason.REASON_AGENT_RESTARTED ||
        status.getReason() == Reason.REASON_AGENT_UNKNOWN ||
        status.getReason() == Reason.REASON_MASTER_DISCONNECTED ||
        status.getReason() == Reason.REASON_AGENT_DISCONNECTED;

      RequestType requestType = task.getTaskRequest().getRequest().getRequestType();
      boolean isRelaunchable = requestType != null && !requestType.isLongRunning();

      if (isMesosFailure && isRelaunchable) {
        LOG.info("Relaunching lost task {}", task);
        relaunchTask(task);
      }
      lostTasksMeter.mark();
      if (configuration.getDisasterDetection().isEnabled()) {
        taskLostReasons.add(status.getReason());
      }
    }

    if (!taskState.isDone()) {
      final Optional<SingularityPendingDeploy> pendingDeploy = deployManager.getPendingDeploy(
        taskIdObj.getRequestId()
      );

      Optional<SingularityRequestWithState> requestWithState = Optional.empty();

      if (taskState == ExtendedTaskState.TASK_RUNNING) {
        requestWithState = requestManager.getRequest(taskIdObj.getRequestId());
        healthchecker.enqueueHealthcheck(task, pendingDeploy, requestWithState);
      }

      if (
        !pendingDeploy.isPresent() ||
        !pendingDeploy
          .get()
          .getDeployMarker()
          .getDeployId()
          .equals(taskIdObj.getDeployId())
      ) {
        if (!requestWithState.isPresent()) {
          requestWithState = requestManager.getRequest(taskIdObj.getRequestId());
        }
        newTaskChecker.enqueueNewTaskCheck(task, requestWithState, healthchecker);
      }
    }

    final Optional<String> statusMessage = getStatusMessage(status, Optional.of(task));

    final SingularityTaskHistoryUpdate taskUpdate = new SingularityTaskHistoryUpdate(
      taskIdObj,
      timestamp,
      taskState,
      statusMessage,
      status.hasReason() ? Optional.of(status.getReason().name()) : Optional.empty()
    );
    final SingularityCreateResult taskHistoryUpdateCreateResult = taskManager.saveTaskHistoryUpdate(
      taskUpdate
    );

    logSupport.checkDirectoryAndContainerId(taskIdObj);

    if (taskState.isDone()) {
      healthchecker.cancelHealthcheck(taskId);
      newTaskChecker.cancelNewTaskCheck(taskId);

      taskManager.deleteKilledRecord(taskIdObj);

      handleCompletedTaskState(
        status,
        taskIdObj,
        taskState,
        taskHistoryUpdateCreateResult,
        Optional.of(task),
        timestamp
      );
    }

    saveNewTaskStatusHolder(taskIdObj, newTaskStatusHolder, taskState);
    return StatusUpdateResult.DONE;
  }

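  /**
   * Attempts to reactivate a task that sent a non-terminal update after being removed from the
   * active list. Tasks whose history is gone, whose load balancer removal cannot be verified, or
   * whose deploy is no longer active are killed or cleaned up instead.
   */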
  private StatusUpdateResult tryRecoverTask(
    Protos.TaskStatus status,
    SingularityTaskId taskIdObj,
    String taskId,
    SingularityTaskStatusHolder newTaskStatusHolder,
    ExtendedTaskState taskState,
    long now
  ) {
    LOG.info(
      "Found recovery status update with reason {} for task {}",
      status.getReason(),
      taskId
    );
    final Optional<SingularityTaskHistory> maybeTaskHistory = taskManager.getTaskHistory(
      taskIdObj
    );
    if (
      !maybeTaskHistory.isPresent() ||
      !maybeTaskHistory.get().getLastTaskUpdate().isPresent()
    ) {
      LOG.warn(
        "Task {} not found to recover, it may have already been persisted. Triggering a kill via mesos",
        taskIdObj
      );

      return StatusUpdateResult.KILL_TASK;
    } else if (status.getReason() == Reason.REASON_AGENT_REREGISTERED) {
      Optional<SingularityLoadBalancerUpdate> maybeLbUpdate = taskManager.getLoadBalancerState(
        taskIdObj,
        LoadBalancerRequestType.REMOVE
      );
      if (maybeLbUpdate.isPresent()) {
        LOG.info(
          "LB removal for recovered task {} was already started. Attempting to clear and start as new task",
          taskId
        );
        boolean canRecoverLbState = true;
        if (maybeLbUpdate.get().getLoadBalancerState().isInProgress()) {
          try {
            if (
              lbClient
                .getState(maybeLbUpdate.get().getLoadBalancerRequestId())
                .getLoadBalancerState()
                .isInProgress()
            ) {
              // We don't want to block here and wait for LB removal to finish in case it is stuck. Mark this task for cleaning
              canRecoverLbState = false;
            }
          } catch (Exception e) {
            LOG.warn("Could not verify LB state for {}", taskId, e);
            canRecoverLbState = false;
          }
        }
        if (
          canRecoverLbState &&
          deployManager
            .getActiveDeployId(taskIdObj.getRequestId())
            .map(d -> d.equals(taskIdObj.getDeployId()))
            .orElse(false) &&
          taskManager.reactivateTask(
            taskIdObj,
            taskState,
            newTaskStatusHolder,
            Optional.ofNullable(status.getMessage()),
            status.hasReason() ? Optional.of(status.getReason().name()) : Optional.empty()
          )
        ) {
          Optional<SingularityTask> maybeTask = taskManager.getTask(taskIdObj);
          Optional<SingularityRequestWithState> maybeRequest = requestManager.getRequest(
            taskIdObj.getRequestId()
          );
          if (
            maybeTask.isPresent() &&
            maybeRequest.isPresent() &&
            maybeRequest.get().getState().isRunnable()
          ) {
            LOG.info(
              "Task {} can be recovered. Clearing LB state and enqueuing check as new task",
              taskId
            );
            taskManager.clearLoadBalancerHistory(taskIdObj);
            newTaskChecker.enqueueCheckWithDelay(maybeTask.get(), 0, healthchecker);
            requestManager.addToPendingQueue(
              new SingularityPendingRequest(
                taskIdObj.getRequestId(),
                taskIdObj.getDeployId(),
                now,
                Optional.empty(),
                PendingType.TASK_RECOVERED,
                Optional.empty(),
                Optional.of(
                  String.format("Agent %s recovered", status.getAgentId().getValue())
                )
              )
            );
            return StatusUpdateResult.DONE;
          }
        } else {
          LOG.info("Could not recover task {}, will clean up", taskId);
          taskManager.createTaskCleanup(
            new SingularityTaskCleanup(
              Optional.empty(),
              TaskCleanupType.DECOMISSIONING,
              System.currentTimeMillis(),
              taskIdObj,
              Optional.of(
                "Agent re-registered after load balancer removal started. Task cannot be reactivated."
              ),
              Optional.empty(),
              Optional.empty()
            )
          );
          requestManager.addToPendingQueue(
            new SingularityPendingRequest(
              taskIdObj.getRequestId(),
              taskIdObj.getDeployId(),
              now,
              Optional.empty(),
              PendingType.TASK_RECOVERED,
              Optional.empty(),
              Optional.of(
                String.format("Agent %s recovered", status.getAgentId().getValue())
              )
            )
          );
          return StatusUpdateResult.DONE;
        }
      }
    }

    // Check tasks with no lb component or not yet removed from LB
    boolean reactivated =
      deployManager
        .getActiveDeployId(taskIdObj.getRequestId())
        .map(d -> d.equals(taskIdObj.getDeployId()))
        .orElse(false) &&
      taskManager.reactivateTask(
        taskIdObj,
        taskState,
        newTaskStatusHolder,
        Optional.ofNullable(status.getMessage()),
        status.hasReason() ? Optional.of(status.getReason().name()) : Optional.empty()
      );
    requestManager.addToPendingQueue(
      new SingularityPendingRequest(
        taskIdObj.getRequestId(),
        taskIdObj.getDeployId(),
        now,
        Optional.empty(),
        PendingType.TASK_RECOVERED,
        Optional.empty(),
        Optional.of(String.format("Agent %s recovered", status.getAgentId().getValue()))
      )
    );
    if (reactivated) {
      return StatusUpdateResult.DONE;
    } else {
      return StatusUpdateResult.KILL_TASK;
    }
  }

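  /**
   * Attempts to repair missing task data in zk, first via the task manager and then by falling
   * back to persisted task history. Returns empty if the data cannot be recovered or the task was
   * already recorded as done.
   */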
  private Optional<SingularityTask> tryFindMissingTaskData(
    SingularityTaskId taskIdObj,
    String taskId,
    ExtendedTaskState taskState
  ) {
    LOG.warn("Missing task data for {}, trying to recover", taskId);
    // If found in this first step, it was a bad zk write and everything should just work
    Optional<SingularityTask> maybeTask = taskManager.tryRepairTask(taskIdObj);
    if (!maybeTask.isPresent()) {
      // Ensure history manager calls cannot interrupt the status update path
      try {
        Optional<SingularityTaskHistory> maybeTaskHistory = historyManager.getTaskHistory(
          taskId
        );
        if (maybeTaskHistory.isPresent()) {
          maybeTask = maybeTaskHistory.map(SingularityTaskHistory::getTask);
          if (
            maybeTaskHistory
              .get()
              .getLastTaskUpdate()
              .map(SingularityTaskHistoryUpdate::getTaskState)
              .orElse(taskState)
              .isDone() &&
            !taskState.isDone()
          ) {
            // Don't bother with LB state/etc recovery, let the task get killed and replaced as a cleaner replacement
            LOG.info(
              "Recovered task {} was previously marked as done. Will not reactivate fully",
              taskId
            );
            taskManager.repairFoundTask(maybeTask.get());
            return Optional.empty();
          }
        }
      } catch (Exception e) {
        LOG.error("Could not fetch {} from history", taskId, e);
      }
      if (maybeTask.isPresent() && taskManager.repairFoundTask(maybeTask.get())) {
        LOG.info("Successfully repaired task data in zk for {}", taskId);
      }
    }
    // TODO - could we also try to fetch this from mesos agent somehow?
    return maybeTask;
  }

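  /**
   * Handles updates for tasks with no recoverable data: terminal updates are recorded and
   * ignored, while active updates trigger a kill so the active task list stays consistent.
   */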
  private StatusUpdateResult handledMissingTaskData(
    SingularityTaskId taskIdObj,
    String taskId,
    SingularityTaskStatusHolder newTaskStatusHolder,
    ExtendedTaskState taskState,
    long now
  ) {
    if (taskState.isDone()) {
      LOG.info("No task data present for {} but task has finished, ignoring", taskId);
      saveNewTaskStatusHolder(taskIdObj, newTaskStatusHolder, taskState);
      requestManager.addToPendingQueue(
        new SingularityPendingRequest(
          taskIdObj.getRequestId(),
          taskIdObj.getDeployId(),
          now,
          Optional.empty(),
          PendingType.TASK_DONE,
          Optional.empty(),
          Optional.of(String.format("Unable to recover task %s", taskId))
        )
      );
      return StatusUpdateResult.DONE;
    } else {
      final String message = String.format(
        "Task %s is active but is missing task data, killing task",
        taskId
      );
      exceptionNotifier.notify(message);
      LOG.error(message);
      saveNewTaskStatusHolder(taskIdObj, newTaskStatusHolder, taskState);
      // Also save a task killed event to clean up active task list
      saveNewTaskStatusHolder(
        taskIdObj,
        newTaskStatusHolder,
        ExtendedTaskState.TASK_KILLED
      );
      requestManager.addToPendingQueue(
        new SingularityPendingRequest(
          taskIdObj.getRequestId(),
          taskIdObj.getDeployId(),
          now,
          Optional.empty(),
          PendingType.TASK_DONE,
          Optional.empty(),
          Optional.of(String.format("Unable to recover task %s", taskId))
        )
      );
      return StatusUpdateResult.KILL_TASK;
    }
  }

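  /**
   * Runs post-completion bookkeeping: updates agent/rack state via the leader cache and hands the
   * finished task to the scheduler.
   */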
  private synchronized void handleCompletedTaskState(
    TaskStatus status,
    SingularityTaskId taskIdObj,
    ExtendedTaskState taskState,
    SingularityCreateResult taskHistoryUpdateCreateResult,
    Optional<SingularityTask> task,
    long timestamp
  ) {
    // Method synchronized to prevent race condition where two tasks complete at the same time but the leader cache holding the state
    // doesn't get updated between each task completion. If this were to happen, then agents would never transition from DECOMMISSIONING to
    // DECOMMISSIONED because each task state check thinks the other task is still running.
    agentAndRackManager.checkStateAfterFinishedTask(
      taskIdObj,
      status.getAgentId().getValue(),
      leaderCache
    );
    scheduler.handleCompletedTask(
      task,
      taskIdObj,
      timestamp,
      taskState,
      taskHistoryUpdateCreateResult,
      status
    );
  }

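  /** Whether the status update queue is below its configured size limit. */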
  public boolean hasRoomForMoreUpdates() {
    return (
      statusUpdatesExecutor.getQueue().size() < statusUpdatesExecutor.getQueueLimit()
    );
  }

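  /**
   * Entry point for status updates from the Mesos scheduler client. Each update is processed on
   * the status-updates executor while holding the scheduler lock for the task's request.
   */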
  public CompletableFuture<StatusUpdateResult> processStatusUpdateAsync(
    Protos.TaskStatus status
  ) {
    return CompletableFuture.supplyAsync(
      () -> {
        final String taskId = status.getTaskId().getValue();
        final Optional<SingularityTaskId> maybeTaskId = getTaskId(taskId);

        if (!maybeTaskId.isPresent()) {
          return StatusUpdateResult.INVALID_TASK_ID;
        }

        return schedulerLock.runWithRequestLockAndReturn(
          () -> unsafeProcessStatusUpdate(status, maybeTaskId.get()),
          maybeTaskId.get().getRequestId(),
          getClass().getSimpleName()
        );
      },
      statusUpdatesExecutor.getExecutorService()
    );
  }

  public int getQueueSize() {
    return statusUpdatesExecutor.getQueue().size();
  }

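  /** Ratio of queued status updates to the configured queue limit. */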
  public double getQueueFullness() {
    LOG.info(
      "Queue size: {}, queue limit: {}, queue fullness: {}",
      statusUpdatesExecutor.getQueue().size(),
      statusUpdatesExecutor.getQueueLimit(),
      (double) statusUpdatesExecutor.getQueue().size() /
      statusUpdatesExecutor.getQueueLimit()
    );
    return (
      (double) statusUpdatesExecutor.getQueue().size() /
      statusUpdatesExecutor.getQueueLimit()
    );
  }
}