All downloads are free. Search and download functionality uses the official Maven repository.

com.hubspot.singularity.scheduler.SingularityDeployHealthHelper Maven / Gradle / Ivy

package com.hubspot.singularity.scheduler;

import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;

import javax.inject.Singleton;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Optional;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.inject.Inject;
import com.hubspot.deploy.HealthcheckOptions;
import com.hubspot.mesos.JavaUtils;
import com.hubspot.singularity.ExtendedTaskState;
import com.hubspot.singularity.SingularityAction;
import com.hubspot.singularity.SingularityDeploy;
import com.hubspot.singularity.SingularityDeployFailure;
import com.hubspot.singularity.SingularityDeployFailureReason;
import com.hubspot.singularity.SingularityRequest;
import com.hubspot.singularity.SingularityRequestHistory;
import com.hubspot.singularity.SingularityTask;
import com.hubspot.singularity.SingularityTaskHealthcheckResult;
import com.hubspot.singularity.SingularityTaskHistoryUpdate;
import com.hubspot.singularity.SingularityTaskHistoryUpdate.SimplifiedTaskState;
import com.hubspot.singularity.SingularityTaskId;
import com.hubspot.singularity.config.SingularityConfiguration;
import com.hubspot.singularity.data.DisasterManager;
import com.hubspot.singularity.data.RequestManager;
import com.hubspot.singularity.data.TaskManager;

@Singleton
public class SingularityDeployHealthHelper {

  private static final Logger LOG = LoggerFactory.getLogger(SingularityDeployHealthHelper.class);

  private final TaskManager taskManager;
  private final SingularityConfiguration configuration;
  private final RequestManager requestManager;
  private final DisasterManager disasterManager;

  @Inject
  public SingularityDeployHealthHelper(TaskManager taskManager, SingularityConfiguration configuration, RequestManager requestManager, DisasterManager disasterManager) {
    this.taskManager = taskManager;
    this.configuration = configuration;
    this.requestManager = requestManager;
    this.disasterManager = disasterManager;
  }

  public enum DeployHealth {
    WAITING, UNHEALTHY, HEALTHY;
  }

  private boolean shouldCheckHealthchecks(final SingularityRequest request, final Optional deploy, final Collection activeTasks, final boolean isDeployPending) {
    if (disasterManager.isDisabled(SingularityAction.RUN_HEALTH_CHECKS)) {
      return false;
    }
    if (!deploy.isPresent()) {
      return false;
    }

    if (!deploy.get().getHealthcheck().isPresent()) {
      return false;
    }

    if (isDeployPending && deploy.get().getSkipHealthchecksOnDeploy().or(false)) {
      return false;
    }

    if (request.getSkipHealthchecks().or(Boolean.FALSE)) {
      return false;
    }

    for (SingularityTask task : taskManager.getTasks(activeTasks).values()) {
      if (task.getTaskRequest().getPendingTask().getSkipHealthchecks().or(Boolean.FALSE)) {
        return false;
      }
    }

    return true;
  }

  public DeployHealth getDeployHealth(final SingularityRequest request, final Optional deploy, final Collection activeTasks, final boolean isDeployPending) {
    if (shouldCheckHealthchecks(request, deploy, activeTasks, isDeployPending)) {
      return getHealthcheckDeployState(deploy.get(), activeTasks, isDeployPending);
    } else {
      return getNoHealthcheckDeployHealth(deploy, activeTasks);
    }
  }

  public List getHealthyTasks(final SingularityRequest request, final Optional deploy, final Collection activeTasks, final boolean isDeployPending) {
    if (shouldCheckHealthchecks(request, deploy, activeTasks, isDeployPending)) {
      return getHealthcheckedHealthyTasks(deploy.get(), activeTasks, isDeployPending);
    } else {
      return getNoHealthcheckHealthyTasks(deploy, activeTasks);
    }
  }

  private DeployHealth getNoHealthcheckDeployHealth(final Optional deploy, final Collection matchingActiveTasks) {
    final Map> taskUpdates = taskManager.getTaskHistoryUpdates(matchingActiveTasks);

    for (SingularityTaskId taskId : matchingActiveTasks) {
      Collection updates = taskUpdates.get(taskId);

      SimplifiedTaskState currentState = SingularityTaskHistoryUpdate.getCurrentState(updates);

      switch (currentState) {
        case UNKNOWN:
        case WAITING:
          return DeployHealth.WAITING;
        case DONE:
          LOG.warn("Unexpectedly found an active task ({}) in done state: {}}", taskId, updates);
          return DeployHealth.UNHEALTHY;
        case RUNNING:
          if (!isRunningTaskHealthy(deploy, updates, taskId)) {
            return DeployHealth.WAITING;
          }
      }
    }

    return DeployHealth.HEALTHY;
  }

  private List getNoHealthcheckHealthyTasks(final Optional deploy, final Collection matchingActiveTasks) {
    final Map> taskUpdates = taskManager.getTaskHistoryUpdates(matchingActiveTasks);
    final List healthyTaskIds = Lists.newArrayListWithCapacity(matchingActiveTasks.size());

    for (SingularityTaskId taskId : matchingActiveTasks) {
      Collection updates = taskUpdates.get(taskId);
      SimplifiedTaskState currentState = SingularityTaskHistoryUpdate.getCurrentState(updates);
      if (currentState == SimplifiedTaskState.RUNNING && isRunningTaskHealthy(deploy, updates, taskId)) {
        healthyTaskIds.add(taskId);
      }
    }

    return healthyTaskIds;
  }

  private boolean isRunningTaskHealthy(final Optional deploy, Collection updates, SingularityTaskId taskId) {
    long runningThreshold = configuration.getConsiderTaskHealthyAfterRunningForSeconds();
    if (deploy.isPresent()) {
      runningThreshold = deploy.get().getConsiderHealthyAfterRunningForSeconds().or(runningThreshold);
    }

    if (runningThreshold < 1) {
      return true;
    }

    Optional runningUpdate = SingularityTaskHistoryUpdate.getUpdate(updates, ExtendedTaskState.TASK_RUNNING);
    long taskDuration = System.currentTimeMillis() - runningUpdate.get().getTimestamp();

    long runningThresholdMillis = TimeUnit.SECONDS.toMillis(runningThreshold);
    if (taskDuration < runningThresholdMillis) {
      LOG.debug("Task {} has been running for {}, has not yet reached running threshold of {}", taskId, JavaUtils.durationFromMillis(taskDuration), JavaUtils.durationFromMillis(runningThresholdMillis));
      return false;
    }
    return true;
  }

  private DeployHealth getHealthcheckDeployState(final SingularityDeploy deploy, final Collection matchingActiveTasks, final boolean isDeployPending) {
    Map healthcheckResults = taskManager.getLastHealthcheck(matchingActiveTasks);
    List requestHistories = requestManager.getRequestHistory(deploy.getRequestId());

    for (SingularityTaskId taskId : matchingActiveTasks) {
      DeployHealth individualTaskHealth;
      if (healthchecksSkipped(taskId, requestHistories, deploy)) {
        LOG.trace("Detected skipped healthchecks for {}", taskId);
        individualTaskHealth = DeployHealth.HEALTHY;
      } else {
        individualTaskHealth = getTaskHealth(deploy, isDeployPending, Optional.fromNullable(healthcheckResults.get(taskId)), taskId);
      }
      if (individualTaskHealth != DeployHealth.HEALTHY) {
        return individualTaskHealth;
      }
    }
    return DeployHealth.HEALTHY;
  }

  private List getHealthcheckedHealthyTasks(final SingularityDeploy deploy, final Collection matchingActiveTasks, final boolean isDeployPending) {
    final Map healthcheckResults = taskManager.getLastHealthcheck(matchingActiveTasks);
    final List healthyTaskIds = Lists.newArrayListWithCapacity(matchingActiveTasks.size());
    List requestHistories = requestManager.getRequestHistory(deploy.getRequestId());

    for (SingularityTaskId taskId : matchingActiveTasks) {
      DeployHealth individualTaskHealth;
      if (healthchecksSkipped(taskId, requestHistories, deploy)) {
        LOG.trace("Detected skipped healthchecks for {}", taskId);
        individualTaskHealth = DeployHealth.HEALTHY;
      } else {
        individualTaskHealth = getTaskHealth(deploy, isDeployPending, Optional.fromNullable(healthcheckResults.get(taskId)), taskId);
      }
      if (individualTaskHealth == DeployHealth.HEALTHY) {
        healthyTaskIds.add(taskId);
      }
    }

    return healthyTaskIds;
  }

  private boolean healthchecksSkipped(SingularityTaskId taskId, List requestHistories, SingularityDeploy deploy) {
    if (deploy.getSkipHealthchecksOnDeploy().or(false)) {
      return true;
    }

    Optional maybeTask = taskManager.getTask(taskId);
    if (maybeTask.isPresent()) {
      if (maybeTask.get().getTaskRequest().getPendingTask().getSkipHealthchecks().or(false)) {
        return true;
      }

      Optional runningStartTime = getRunningAt(taskManager.getTaskHistoryUpdates(taskId));
      if (runningStartTime.isPresent()) {
        Optional previousHistory = Optional.absent();
        for (SingularityRequestHistory history : requestHistories) {
          if (history.getCreatedAt() < runningStartTime.get() && (!previousHistory.isPresent() || previousHistory.get().getCreatedAt() < history.getCreatedAt())) {
            previousHistory = Optional.of(history);
          }
        }

        if (previousHistory.isPresent() && previousHistory.get().getRequest().getSkipHealthchecks().or(false)) {
          return true;
        }
      }
    }


    return false;
  }

  public DeployHealth getTaskHealth(SingularityDeploy deploy, boolean isDeployPending, Optional healthcheckResult, SingularityTaskId taskId) {
    if (!healthcheckResult.isPresent()) {
      LOG.debug("No healthcheck present for {}", taskId);
      return DeployHealth.WAITING;
    } else if (healthcheckResult.get().isFailed()) {
      LOG.debug("Found a failed healthcheck: {}", healthcheckResult);

      if (deploy.getHealthcheck().isPresent() && healthcheckResult.get().getStatusCode().isPresent()
        && deploy.getHealthcheck().get().getFailureStatusCodes().or(configuration.getHealthcheckFailureStatusCodes()).contains(healthcheckResult.get().getStatusCode().get())) {
        LOG.debug("Failed healthcheck had bad status code: {}", healthcheckResult.get().getStatusCode().get());
        return DeployHealth.UNHEALTHY;
      }

      final int startupTimeout = deploy.getHealthcheck().isPresent() ? deploy.getHealthcheck().get().getStartupTimeoutSeconds().or(configuration.getStartupTimeoutSeconds()) : configuration.getStartupTimeoutSeconds();
      Collection updates = taskManager.getTaskHistoryUpdates(taskId);
      Optional runningAt = getRunningAt(updates);
      if (runningAt.isPresent()) {
        final long durationSinceRunning = System.currentTimeMillis() - runningAt.get();
        if (healthcheckResult.get().isStartup() && durationSinceRunning > TimeUnit.SECONDS.toMillis(startupTimeout)) {
          LOG.debug("{} has not responded to healthchecks in {}s", taskId, startupTimeout);
          return DeployHealth.UNHEALTHY;
        }
      }

      final Optional healthcheckMaxRetries = deploy.getHealthcheck().isPresent() ? deploy.getHealthcheck().get().getMaxRetries().or(configuration.getHealthcheckMaxRetries()) : Optional.absent();

      if (healthcheckMaxRetries.isPresent() && taskManager.getNumNonstartupHealthchecks(taskId) > healthcheckMaxRetries.get()) {
        LOG.debug("{} failed {} healthchecks, the max for the deploy", taskId, healthcheckMaxRetries.get());
        return DeployHealth.UNHEALTHY;
      }

      final Optional healthcheckMaxTotalTimeoutSeconds = deploy.getHealthcheck().isPresent() ? Optional.of(getMaxHealthcheckTimeoutSeconds(deploy.getHealthcheck().get())) : Optional.absent();

      if (isDeployPending && healthcheckMaxTotalTimeoutSeconds.isPresent()) {
        if (runningAt.isPresent()) {
          final long durationSinceRunning = System.currentTimeMillis() - runningAt.get();
          if (durationSinceRunning > TimeUnit.SECONDS.toMillis(healthcheckMaxTotalTimeoutSeconds.get())) {
            LOG.debug("{} has been running for {} and has yet to pass healthchecks, failing deploy", taskId, JavaUtils.durationFromMillis(durationSinceRunning));

            return DeployHealth.UNHEALTHY;
          }
        }
      }

      return DeployHealth.WAITING;
    }
    return DeployHealth.HEALTHY;
  }

  public int getMaxHealthcheckTimeoutSeconds(HealthcheckOptions options) {
    int intervalSeconds = options.getIntervalSeconds().or(configuration.getHealthcheckIntervalSeconds());
    int responseTimeSeconds = options.getResponseTimeoutSeconds().or(configuration.getHealthcheckTimeoutSeconds());
    int startupTime = options.getStartupTimeoutSeconds().or(configuration.getStartupTimeoutSeconds());
    int attempts = options.getMaxRetries().or(configuration.getHealthcheckMaxRetries()).or(0) + 1;
    return startupTime + ((intervalSeconds + responseTimeSeconds) * attempts);
  }

  public List getTaskFailures(final Optional deploy, final Collection activeTasks) {
    List failures = new ArrayList<>();
    Map> taskUpdates = taskManager.getTaskHistoryUpdates(activeTasks);
    Map healthcheckResults = taskManager.getLastHealthcheck(activeTasks);

    for (SingularityTaskId taskId : activeTasks) {
      Optional maybeFailure = getTaskFailure(deploy.get(), taskUpdates, healthcheckResults, taskId);
      if (maybeFailure.isPresent()) {
        failures.add(maybeFailure.get());
      }
    }
    return failures;
  }

  private Optional getTaskFailure(SingularityDeploy deploy, Map> taskUpdates,
    Map healthcheckResults, SingularityTaskId taskId) {
    SingularityTaskHealthcheckResult healthcheckResult = healthcheckResults.get(taskId);
    Optional maybeFailure;
    if (healthcheckResult == null) {
      maybeFailure = getNonHealthcheckedTaskFailure(taskUpdates, taskId);
    } else {
      maybeFailure = getHealthcheckedTaskFailure(deploy, taskUpdates, healthcheckResult, taskId);
    }
    return maybeFailure;
  }

  private Optional getHealthcheckedTaskFailure(SingularityDeploy deploy, Map> taskUpdates,
    SingularityTaskHealthcheckResult healthcheckResult, SingularityTaskId taskId) {
    Collection updates = taskUpdates.get(taskId);

    if (!healthcheckResult.isFailed()) {
      return Optional.absent();
    }

    SingularityTaskHistoryUpdate lastUpdate = Iterables.getLast(updates);
    if (lastUpdate.getTaskState().isDone()) {
      if (lastUpdate.getTaskState().isSuccess()) {
        return Optional.of(new SingularityDeployFailure(SingularityDeployFailureReason.TASK_EXPECTED_RUNNING_FINISHED, Optional.of(taskId),
          Optional.of(String.format("Task was expected to maintain TASK_RUNNING state but finished. (%s)", lastUpdate.getStatusMessage().or("")))));
      } else {
        return Optional.of(new SingularityDeployFailure(SingularityDeployFailureReason.TASK_FAILED_ON_STARTUP, Optional.of(taskId), lastUpdate.getStatusMessage()));
      }
    }

    final Optional healthcheckMaxRetries = deploy.getHealthcheck().isPresent() ?
      deploy.getHealthcheck().get().getMaxRetries().or(configuration.getHealthcheckMaxRetries()) : configuration.getHealthcheckMaxRetries();
    if (healthcheckMaxRetries.isPresent() && taskManager.getNumNonstartupHealthchecks(taskId) > healthcheckMaxRetries.get()) {
      String message = String.format("Instance %s failed %s healthchecks, the max for the deploy.", taskId.getInstanceNo(), healthcheckMaxRetries.get() + 1);
      if (healthcheckResult.getStatusCode().isPresent()) {
        message = String.format("%s Last check returned with status code %s", message, healthcheckResult.getStatusCode().get());
      }
      return Optional.of(new SingularityDeployFailure(SingularityDeployFailureReason.TASK_FAILED_HEALTH_CHECKS, Optional.of(taskId), Optional.of(message)));
    }

    Optional runningAt = getRunningAt(updates);
    if (runningAt.isPresent()) {
      final long durationSinceRunning = System.currentTimeMillis() - runningAt.get();
      if (healthcheckResult.isStartup() && deploy.getHealthcheck().isPresent() && durationSinceRunning > deploy.getHealthcheck().get().getStartupTimeoutSeconds()
        .or(configuration.getStartupTimeoutSeconds())) {
        String message = String.format("Instance %s has not responded to healthchecks after running for %s", taskId.getInstanceNo(), JavaUtils.durationFromMillis(durationSinceRunning));
        return Optional.of(new SingularityDeployFailure(SingularityDeployFailureReason.TASK_FAILED_HEALTH_CHECKS, Optional.of(taskId), Optional.of(message)));
      }
      if (isRunningLongerThanThreshold(deploy, durationSinceRunning)) {
        String message = String.format("Instance %s has been running for %s and has yet to pass healthchecks.", taskId.getInstanceNo(), JavaUtils.durationFromMillis(durationSinceRunning));
        if (healthcheckResult.getStatusCode().isPresent()) {
          message = String.format("%s Last check returned with status code %s", message, healthcheckResult.getStatusCode().get());
        }
        return Optional.of(new SingularityDeployFailure(SingularityDeployFailureReason.TASK_FAILED_HEALTH_CHECKS, Optional.of(taskId), Optional.of(message)));
      }
    }

    return Optional.absent();
  }

  private boolean isRunningLongerThanThreshold(SingularityDeploy deploy, long durationSinceRunning) {
    long relevantTimeoutSeconds = deploy.getHealthcheck().isPresent() ?
      getMaxHealthcheckTimeoutSeconds(deploy.getHealthcheck().get()) : deploy.getDeployHealthTimeoutSeconds().or(configuration.getDeployHealthyBySeconds());
    return durationSinceRunning > TimeUnit.SECONDS.toMillis(relevantTimeoutSeconds);
  }

  private Optional getRunningAt(Collection updates) {
    for (SingularityTaskHistoryUpdate update : updates) {
      if (update.getTaskState() == ExtendedTaskState.TASK_RUNNING) {
        return  Optional.of(update.getTimestamp());
      }
    }

    return Optional.absent();
  }

  private Optional getNonHealthcheckedTaskFailure(Map> taskUpdates, SingularityTaskId taskId) {
    List updates = taskUpdates.get(taskId);
    SingularityTaskHistoryUpdate lastUpdate = Iterables.getLast(updates);

    if (lastUpdate.getTaskState().isSuccess()) {
      return Optional.of(new SingularityDeployFailure(SingularityDeployFailureReason.TASK_EXPECTED_RUNNING_FINISHED, Optional.of(taskId),
        Optional.of(String.format("Task was expected to maintain TASK_RUNNING state but finished. (%s)", lastUpdate.getStatusMessage().or("")))));
    } else if (lastUpdate.getTaskState().isDone()) {
      return Optional.of(new SingularityDeployFailure(SingularityDeployFailureReason.TASK_FAILED_ON_STARTUP, Optional.of(taskId), lastUpdate.getStatusMessage()));
    } else if (SingularityTaskHistoryUpdate.getCurrentState(updates) == SimplifiedTaskState.WAITING) {
      return Optional.of(new SingularityDeployFailure(SingularityDeployFailureReason.TASK_NEVER_ENTERED_RUNNING, Optional.of(taskId),
        Optional.of(String.format("Task never entered running state, last state was %s (%s)", lastUpdate.getTaskState().getDisplayName(), lastUpdate.getStatusMessage().or("")))));
    }
    return Optional.absent();
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy