All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.hubspot.singularity.scheduler.SingularityDeployHealthHelper Maven / Gradle / Ivy

package com.hubspot.singularity.scheduler;

import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.inject.Inject;
import com.hubspot.deploy.HealthcheckOptions;
import com.hubspot.mesos.JavaUtils;
import com.hubspot.singularity.ExtendedTaskState;
import com.hubspot.singularity.SingularityAction;
import com.hubspot.singularity.SingularityDeploy;
import com.hubspot.singularity.SingularityDeployFailure;
import com.hubspot.singularity.SingularityDeployFailureReason;
import com.hubspot.singularity.SingularityRequest;
import com.hubspot.singularity.SingularityRequestHistory;
import com.hubspot.singularity.SingularityTask;
import com.hubspot.singularity.SingularityTaskHealthcheckResult;
import com.hubspot.singularity.SingularityTaskHistoryUpdate;
import com.hubspot.singularity.SingularityTaskHistoryUpdate.SimplifiedTaskState;
import com.hubspot.singularity.SingularityTaskId;
import com.hubspot.singularity.config.SingularityConfiguration;
import com.hubspot.singularity.data.DisasterManager;
import com.hubspot.singularity.data.RequestManager;
import com.hubspot.singularity.data.TaskManager;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.TimeUnit;
import javax.inject.Singleton;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

@Singleton
public class SingularityDeployHealthHelper {
  private static final Logger LOG = LoggerFactory.getLogger(
    SingularityDeployHealthHelper.class
  );

  private final TaskManager taskManager;
  private final SingularityConfiguration configuration;
  private final RequestManager requestManager;
  private final DisasterManager disasterManager;

  @Inject
  public SingularityDeployHealthHelper(
    TaskManager taskManager,
    SingularityConfiguration configuration,
    RequestManager requestManager,
    DisasterManager disasterManager
  ) {
    this.taskManager = taskManager;
    this.configuration = configuration;
    this.requestManager = requestManager;
    this.disasterManager = disasterManager;
  }

  public enum DeployHealth {
    WAITING,
    UNHEALTHY,
    HEALTHY
  }

  private boolean shouldCheckHealthchecks(
    final SingularityRequest request,
    final Optional deploy,
    final Collection activeTasks,
    final boolean isDeployPending
  ) {
    if (disasterManager.isDisabled(SingularityAction.RUN_HEALTH_CHECKS)) {
      return false;
    }
    if (!deploy.isPresent()) {
      return false;
    }

    if (!deploy.get().getHealthcheck().isPresent()) {
      return false;
    }

    if (isDeployPending && deploy.get().getSkipHealthchecksOnDeploy().orElse(false)) {
      return false;
    }

    if (request.getSkipHealthchecks().orElse(Boolean.FALSE)) {
      return false;
    }

    for (SingularityTask task : taskManager.getTasks(activeTasks).values()) {
      if (
        task.getTaskRequest().getPendingTask().getSkipHealthchecks().orElse(Boolean.FALSE)
      ) {
        return false;
      }
    }

    return true;
  }

  public DeployHealth getDeployHealth(
    final SingularityRequest request,
    final Optional deploy,
    final Collection activeTasks,
    final boolean isDeployPending
  ) {
    if (shouldCheckHealthchecks(request, deploy, activeTasks, isDeployPending)) {
      return getHealthcheckDeployState(deploy.get(), activeTasks, isDeployPending);
    } else {
      return getNoHealthcheckDeployHealth(deploy, activeTasks);
    }
  }

  public List getHealthyTasks(
    final SingularityRequest request,
    final Optional deploy,
    final Collection activeTasks,
    final boolean isDeployPending
  ) {
    if (shouldCheckHealthchecks(request, deploy, activeTasks, isDeployPending)) {
      return getHealthcheckedHealthyTasks(deploy.get(), activeTasks, isDeployPending);
    } else {
      return getNoHealthcheckHealthyTasks(deploy, activeTasks);
    }
  }

  private DeployHealth getNoHealthcheckDeployHealth(
    final Optional deploy,
    final Collection matchingActiveTasks
  ) {
    final Map> taskUpdates = taskManager.getTaskHistoryUpdates(
      matchingActiveTasks
    );

    for (SingularityTaskId taskId : matchingActiveTasks) {
      Collection updates = taskUpdates.get(taskId);

      SimplifiedTaskState currentState = SingularityTaskHistoryUpdate.getCurrentState(
        updates
      );

      switch (currentState) {
        case UNKNOWN:
        case WAITING:
          return DeployHealth.WAITING;
        case DONE:
          LOG.warn(
            "Unexpectedly found an active task ({}) in done state: {}}",
            taskId,
            updates
          );
          return DeployHealth.UNHEALTHY;
        case RUNNING:
          if (!isRunningTaskHealthy(deploy, updates, taskId)) {
            return DeployHealth.WAITING;
          }
      }
    }

    return DeployHealth.HEALTHY;
  }

  private List getNoHealthcheckHealthyTasks(
    final Optional deploy,
    final Collection matchingActiveTasks
  ) {
    final Map> taskUpdates = taskManager.getTaskHistoryUpdates(
      matchingActiveTasks
    );
    final List healthyTaskIds = Lists.newArrayListWithCapacity(
      matchingActiveTasks.size()
    );

    for (SingularityTaskId taskId : matchingActiveTasks) {
      Collection updates = taskUpdates.get(taskId);
      SimplifiedTaskState currentState = SingularityTaskHistoryUpdate.getCurrentState(
        updates
      );
      if (
        currentState == SimplifiedTaskState.RUNNING &&
        isRunningTaskHealthy(deploy, updates, taskId)
      ) {
        healthyTaskIds.add(taskId);
      }
    }

    return healthyTaskIds;
  }

  private boolean isRunningTaskHealthy(
    final Optional deploy,
    Collection updates,
    SingularityTaskId taskId
  ) {
    long runningThreshold = configuration.getConsiderTaskHealthyAfterRunningForSeconds();
    if (deploy.isPresent()) {
      runningThreshold =
        deploy.get().getConsiderHealthyAfterRunningForSeconds().orElse(runningThreshold);
    }

    if (runningThreshold < 1) {
      return true;
    }

    Optional runningUpdate = SingularityTaskHistoryUpdate.getUpdate(
      updates,
      ExtendedTaskState.TASK_RUNNING
    );
    long taskDuration = System.currentTimeMillis() - runningUpdate.get().getTimestamp();

    long runningThresholdMillis = TimeUnit.SECONDS.toMillis(runningThreshold);
    if (taskDuration < runningThresholdMillis) {
      LOG.debug(
        "Task {} has been running for {}, has not yet reached running threshold of {}",
        taskId,
        JavaUtils.durationFromMillis(taskDuration),
        JavaUtils.durationFromMillis(runningThresholdMillis)
      );
      return false;
    }
    return true;
  }

  private DeployHealth getHealthcheckDeployState(
    final SingularityDeploy deploy,
    final Collection matchingActiveTasks,
    final boolean isDeployPending
  ) {
    Map healthcheckResults = taskManager.getLastHealthcheck(
      matchingActiveTasks
    );
    List requestHistories = requestManager.getRequestHistory(
      deploy.getRequestId()
    );

    for (SingularityTaskId taskId : matchingActiveTasks) {
      DeployHealth individualTaskHealth;
      if (healthchecksSkipped(taskId, requestHistories, deploy)) {
        LOG.trace("Detected skipped healthchecks for {}", taskId);
        individualTaskHealth = DeployHealth.HEALTHY;
      } else {
        individualTaskHealth =
          getTaskHealth(
            deploy,
            isDeployPending,
            Optional.ofNullable(healthcheckResults.get(taskId)),
            taskId
          );
      }
      if (individualTaskHealth != DeployHealth.HEALTHY) {
        return individualTaskHealth;
      }
    }
    return DeployHealth.HEALTHY;
  }

  private List getHealthcheckedHealthyTasks(
    final SingularityDeploy deploy,
    final Collection matchingActiveTasks,
    final boolean isDeployPending
  ) {
    final Map healthcheckResults = taskManager.getLastHealthcheck(
      matchingActiveTasks
    );
    final List healthyTaskIds = Lists.newArrayListWithCapacity(
      matchingActiveTasks.size()
    );
    List requestHistories = requestManager.getRequestHistory(
      deploy.getRequestId()
    );

    for (SingularityTaskId taskId : matchingActiveTasks) {
      DeployHealth individualTaskHealth;
      if (healthchecksSkipped(taskId, requestHistories, deploy)) {
        LOG.trace("Detected skipped healthchecks for {}", taskId);
        individualTaskHealth = DeployHealth.HEALTHY;
      } else {
        individualTaskHealth =
          getTaskHealth(
            deploy,
            isDeployPending,
            Optional.ofNullable(healthcheckResults.get(taskId)),
            taskId
          );
      }
      if (individualTaskHealth == DeployHealth.HEALTHY) {
        healthyTaskIds.add(taskId);
      }
    }

    return healthyTaskIds;
  }

  private boolean healthchecksSkipped(
    SingularityTaskId taskId,
    List requestHistories,
    SingularityDeploy deploy
  ) {
    if (deploy.getSkipHealthchecksOnDeploy().orElse(false)) {
      return true;
    }

    Optional maybeTask = taskManager.getTask(taskId);
    if (maybeTask.isPresent()) {
      if (
        maybeTask
          .get()
          .getTaskRequest()
          .getPendingTask()
          .getSkipHealthchecks()
          .orElse(false)
      ) {
        return true;
      }

      Optional runningStartTime = getRunningAt(
        taskManager.getTaskHistoryUpdates(taskId)
      );
      if (runningStartTime.isPresent()) {
        Optional previousHistory = Optional.empty();
        for (SingularityRequestHistory history : requestHistories) {
          if (
            history.getCreatedAt() < runningStartTime.get() &&
            (
              !previousHistory.isPresent() ||
              previousHistory.get().getCreatedAt() < history.getCreatedAt()
            )
          ) {
            previousHistory = Optional.of(history);
          }
        }

        if (
          previousHistory.isPresent() &&
          previousHistory.get().getRequest().getSkipHealthchecks().orElse(false)
        ) {
          return true;
        }
      }
    }

    return false;
  }

  public DeployHealth getTaskHealth(
    SingularityDeploy deploy,
    boolean isDeployPending,
    Optional healthcheckResult,
    SingularityTaskId taskId
  ) {
    Optional task = taskManager.getTask(taskId);
    if (task.isPresent()) {
      if (task.get().getTaskRequest().getRequest().getSkipHealthchecks().orElse(false)) {
        LOG.debug("Healthcheck skipped for {}", taskId);
        return DeployHealth.HEALTHY;
      }
    }
    if (
      deploy.getHealthcheck().isPresent() &&
      deploy.getHealthcheck().get().getHealthcheckResultFilePath().isPresent()
    ) {
      if (
        taskManager
          .getTaskHistoryUpdate(taskId, ExtendedTaskState.TASK_RUNNING)
          .isPresent()
      ) {
        LOG.debug(
          "Task {} has non-web healthcheck and is in running state, marking healthy.",
          taskId
        );
        return DeployHealth.HEALTHY;
      }
    }
    if (!healthcheckResult.isPresent()) {
      LOG.debug("No healthcheck present for {}", taskId);
      return DeployHealth.WAITING;
    } else if (healthcheckResult.get().isFailed()) {
      LOG.debug("Found a failed healthcheck: {}", healthcheckResult);

      if (
        deploy.getHealthcheck().isPresent() &&
        healthcheckResult.get().getStatusCode().isPresent() &&
        deploy
          .getHealthcheck()
          .get()
          .getFailureStatusCodes()
          .orElse(configuration.getHealthcheckFailureStatusCodes())
          .contains(healthcheckResult.get().getStatusCode().get())
      ) {
        LOG.debug(
          "Failed healthcheck had bad status code: {}",
          healthcheckResult.get().getStatusCode().get()
        );
        return DeployHealth.UNHEALTHY;
      }

      final int startupTimeout = deploy.getHealthcheck().isPresent()
        ? deploy
          .getHealthcheck()
          .get()
          .getStartupTimeoutSeconds()
          .orElse(configuration.getStartupTimeoutSeconds())
        : configuration.getStartupTimeoutSeconds();
      Collection updates = taskManager.getTaskHistoryUpdates(
        taskId
      );
      Optional runningAt = getRunningAt(updates);
      if (runningAt.isPresent()) {
        final long durationSinceRunning = System.currentTimeMillis() - runningAt.get();
        if (
          healthcheckResult.get().isStartup() &&
          durationSinceRunning > TimeUnit.SECONDS.toMillis(startupTimeout)
        ) {
          LOG.debug(
            "{} has not responded to healthchecks in {}s",
            taskId,
            startupTimeout
          );
          return DeployHealth.UNHEALTHY;
        }
      }

      final Optional healthcheckMaxRetries = deploy.getHealthcheck().isPresent()
        ? (
          deploy.getHealthcheck().get().getMaxRetries().isPresent()
            ? deploy.getHealthcheck().get().getMaxRetries()
            : configuration.getHealthcheckMaxRetries()
        )
        : Optional.empty();

      if (
        healthcheckMaxRetries.isPresent() &&
        taskManager.getNumNonstartupHealthchecks(taskId) > healthcheckMaxRetries.get()
      ) {
        LOG.debug(
          "{} failed {} healthchecks, the max for the deploy",
          taskId,
          healthcheckMaxRetries.get()
        );
        return DeployHealth.UNHEALTHY;
      }

      final Optional healthcheckMaxTotalTimeoutSeconds = deploy
          .getHealthcheck()
          .isPresent()
        ? Optional.of(getMaxHealthcheckTimeoutSeconds(deploy.getHealthcheck().get()))
        : Optional.empty();

      if (isDeployPending && healthcheckMaxTotalTimeoutSeconds.isPresent()) {
        if (runningAt.isPresent()) {
          final long durationSinceRunning = System.currentTimeMillis() - runningAt.get();
          if (
            durationSinceRunning >
            TimeUnit.SECONDS.toMillis(healthcheckMaxTotalTimeoutSeconds.get())
          ) {
            LOG.debug(
              "{} has been running for {} and has yet to pass healthchecks, failing deploy",
              taskId,
              JavaUtils.durationFromMillis(durationSinceRunning)
            );

            return DeployHealth.UNHEALTHY;
          }
        }
      }

      return DeployHealth.WAITING;
    }
    return DeployHealth.HEALTHY;
  }

  public int getMaxHealthcheckTimeoutSeconds(HealthcheckOptions options) {
    int intervalSeconds = options
      .getIntervalSeconds()
      .orElse(configuration.getHealthcheckIntervalSeconds());
    int responseTimeSeconds = options
      .getResponseTimeoutSeconds()
      .orElse(configuration.getHealthcheckTimeoutSeconds());
    int startupTime = options
      .getStartupTimeoutSeconds()
      .orElse(configuration.getStartupTimeoutSeconds());
    int attempts =
      options.getMaxRetries().orElse(configuration.getHealthcheckMaxRetries().orElse(0)) +
      1;
    return startupTime + ((intervalSeconds + responseTimeSeconds) * attempts);
  }

  public List getTaskFailures(
    final Optional deploy,
    final Collection activeTasks
  ) {
    List failures = new ArrayList<>();
    Map> taskUpdates = taskManager.getTaskHistoryUpdates(
      activeTasks
    );
    Map healthcheckResults = taskManager.getLastHealthcheck(
      activeTasks
    );

    for (SingularityTaskId taskId : activeTasks) {
      Optional maybeFailure = getTaskFailure(
        deploy.get(),
        taskUpdates,
        healthcheckResults,
        taskId
      );
      if (maybeFailure.isPresent()) {
        failures.add(maybeFailure.get());
      }
    }
    return failures;
  }

  private Optional getTaskFailure(
    SingularityDeploy deploy,
    Map> taskUpdates,
    Map healthcheckResults,
    SingularityTaskId taskId
  ) {
    SingularityTaskHealthcheckResult healthcheckResult = healthcheckResults.get(taskId);
    Optional maybeFailure;
    if (healthcheckResult == null) {
      maybeFailure = getNonHealthcheckedTaskFailure(taskUpdates, taskId);
    } else {
      maybeFailure =
        getHealthcheckedTaskFailure(deploy, taskUpdates, healthcheckResult, taskId);
    }
    return maybeFailure;
  }

  private Optional getHealthcheckedTaskFailure(
    SingularityDeploy deploy,
    Map> taskUpdates,
    SingularityTaskHealthcheckResult healthcheckResult,
    SingularityTaskId taskId
  ) {
    Collection updates = taskUpdates.get(taskId);

    if (!healthcheckResult.isFailed()) {
      return Optional.empty();
    }

    SingularityTaskHistoryUpdate lastUpdate = Iterables.getLast(updates);
    if (lastUpdate.getTaskState().isDone()) {
      if (lastUpdate.getTaskState().isSuccess()) {
        return Optional.of(
          new SingularityDeployFailure(
            SingularityDeployFailureReason.TASK_EXPECTED_RUNNING_FINISHED,
            Optional.of(taskId),
            Optional.of(
              String.format(
                "Task was expected to maintain TASK_RUNNING state but finished. (%s)",
                lastUpdate.getStatusMessage().orElse("")
              )
            )
          )
        );
      } else {
        return Optional.of(
          new SingularityDeployFailure(
            SingularityDeployFailureReason.TASK_FAILED_ON_STARTUP,
            Optional.of(taskId),
            lastUpdate.getStatusMessage()
          )
        );
      }
    }

    final Optional healthcheckMaxRetries = deploy.getHealthcheck().isPresent()
      ? (
        deploy.getHealthcheck().get().getMaxRetries().isPresent()
          ? deploy.getHealthcheck().get().getMaxRetries()
          : configuration.getHealthcheckMaxRetries()
      )
      : configuration.getHealthcheckMaxRetries();
    if (
      healthcheckMaxRetries.isPresent() &&
      taskManager.getNumNonstartupHealthchecks(taskId) > healthcheckMaxRetries.get()
    ) {
      String message = String.format(
        "Instance %s failed %s healthchecks, the max for the deploy.",
        taskId.getInstanceNo(),
        healthcheckMaxRetries.get() + 1
      );
      if (healthcheckResult.getStatusCode().isPresent()) {
        message =
          String.format(
            "%s Last check returned with status code %s",
            message,
            healthcheckResult.getStatusCode().get()
          );
      }
      return Optional.of(
        new SingularityDeployFailure(
          SingularityDeployFailureReason.TASK_FAILED_HEALTH_CHECKS,
          Optional.of(taskId),
          Optional.of(message)
        )
      );
    }

    Optional runningAt = getRunningAt(updates);
    if (runningAt.isPresent()) {
      final long durationSinceRunning = System.currentTimeMillis() - runningAt.get();
      if (
        healthcheckResult.isStartup() &&
        deploy.getHealthcheck().isPresent() &&
        durationSinceRunning >
        deploy
          .getHealthcheck()
          .get()
          .getStartupTimeoutSeconds()
          .orElse(configuration.getStartupTimeoutSeconds())
      ) {
        String message = String.format(
          "Instance %s has not responded to healthchecks after running for %s",
          taskId.getInstanceNo(),
          JavaUtils.durationFromMillis(durationSinceRunning)
        );
        return Optional.of(
          new SingularityDeployFailure(
            SingularityDeployFailureReason.TASK_FAILED_HEALTH_CHECKS,
            Optional.of(taskId),
            Optional.of(message)
          )
        );
      }
      if (isRunningLongerThanThreshold(deploy, durationSinceRunning)) {
        String message = String.format(
          "Instance %s has been running for %s and has yet to pass healthchecks.",
          taskId.getInstanceNo(),
          JavaUtils.durationFromMillis(durationSinceRunning)
        );
        if (healthcheckResult.getStatusCode().isPresent()) {
          message =
            String.format(
              "%s Last check returned with status code %s",
              message,
              healthcheckResult.getStatusCode().get()
            );
        }
        return Optional.of(
          new SingularityDeployFailure(
            SingularityDeployFailureReason.TASK_FAILED_HEALTH_CHECKS,
            Optional.of(taskId),
            Optional.of(message)
          )
        );
      }
    }

    return Optional.empty();
  }

  private boolean isRunningLongerThanThreshold(
    SingularityDeploy deploy,
    long durationSinceRunning
  ) {
    long relevantTimeoutSeconds = deploy.getHealthcheck().isPresent()
      ? getMaxHealthcheckTimeoutSeconds(deploy.getHealthcheck().get())
      : deploy
        .getDeployHealthTimeoutSeconds()
        .orElse(configuration.getDeployHealthyBySeconds());
    return durationSinceRunning > TimeUnit.SECONDS.toMillis(relevantTimeoutSeconds);
  }

  private Optional getRunningAt(Collection updates) {
    for (SingularityTaskHistoryUpdate update : updates) {
      if (update.getTaskState() == ExtendedTaskState.TASK_RUNNING) {
        return Optional.of(update.getTimestamp());
      }
    }

    return Optional.empty();
  }

  private Optional getNonHealthcheckedTaskFailure(
    Map> taskUpdates,
    SingularityTaskId taskId
  ) {
    List updates = taskUpdates.get(taskId);
    SingularityTaskHistoryUpdate lastUpdate = Iterables.getLast(updates);

    if (lastUpdate.getTaskState().isSuccess()) {
      return Optional.of(
        new SingularityDeployFailure(
          SingularityDeployFailureReason.TASK_EXPECTED_RUNNING_FINISHED,
          Optional.of(taskId),
          Optional.of(
            String.format(
              "Task was expected to maintain TASK_RUNNING state but finished. (%s)",
              lastUpdate.getStatusMessage().orElse("")
            )
          )
        )
      );
    } else if (lastUpdate.getTaskState().isDone()) {
      return Optional.of(
        new SingularityDeployFailure(
          SingularityDeployFailureReason.TASK_FAILED_ON_STARTUP,
          Optional.of(taskId),
          lastUpdate.getStatusMessage()
        )
      );
    } else if (
      SingularityTaskHistoryUpdate.getCurrentState(updates) == SimplifiedTaskState.WAITING
    ) {
      return Optional.of(
        new SingularityDeployFailure(
          SingularityDeployFailureReason.TASK_NEVER_ENTERED_RUNNING,
          Optional.of(taskId),
          Optional.of(
            String.format(
              "Task never entered running state, last state was %s (%s)",
              lastUpdate.getTaskState().getDisplayName(),
              lastUpdate.getStatusMessage().orElse("")
            )
          )
        )
      );
    }
    return Optional.empty();
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy