Please wait. This can take some minutes ...
Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance.
Project price: only $1
You can buy this project and download/modify it as often as you want.
com.hubspot.singularity.scheduler.SingularityDeployHealthHelper Maven / Gradle / Ivy
package com.hubspot.singularity.scheduler;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import javax.inject.Singleton;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.base.Optional;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.inject.Inject;
import com.hubspot.deploy.HealthcheckOptions;
import com.hubspot.mesos.JavaUtils;
import com.hubspot.singularity.ExtendedTaskState;
import com.hubspot.singularity.SingularityAction;
import com.hubspot.singularity.SingularityDeploy;
import com.hubspot.singularity.SingularityDeployFailure;
import com.hubspot.singularity.SingularityDeployFailureReason;
import com.hubspot.singularity.SingularityRequest;
import com.hubspot.singularity.SingularityRequestHistory;
import com.hubspot.singularity.SingularityTask;
import com.hubspot.singularity.SingularityTaskHealthcheckResult;
import com.hubspot.singularity.SingularityTaskHistoryUpdate;
import com.hubspot.singularity.SingularityTaskHistoryUpdate.SimplifiedTaskState;
import com.hubspot.singularity.SingularityTaskId;
import com.hubspot.singularity.config.SingularityConfiguration;
import com.hubspot.singularity.data.DisasterManager;
import com.hubspot.singularity.data.RequestManager;
import com.hubspot.singularity.data.TaskManager;
@Singleton
public class SingularityDeployHealthHelper {
private static final Logger LOG = LoggerFactory.getLogger(SingularityDeployHealthHelper.class);
private final TaskManager taskManager;
private final SingularityConfiguration configuration;
private final RequestManager requestManager;
private final DisasterManager disasterManager;
/**
 * Creates the helper with the collaborators it consults when evaluating deploy health.
 *
 * @param taskManager source of tasks, task history updates and healthcheck results
 * @param configuration supplies the default thresholds used when a deploy does not override them
 * @param requestManager source of request history
 * @param disasterManager consulted to see whether running health checks is currently disabled
 */
@Inject
public SingularityDeployHealthHelper(
    TaskManager taskManager,
    SingularityConfiguration configuration,
    RequestManager requestManager,
    DisasterManager disasterManager) {
  this.disasterManager = disasterManager;
  this.requestManager = requestManager;
  this.configuration = configuration;
  this.taskManager = taskManager;
}
/** Health verdict produced by this helper for a single task or for a set of tasks. */
public enum DeployHealth {
  /** No verdict yet: a task is not running long enough or has not passed its healthchecks so far. */
  WAITING,
  /** A task finished unexpectedly or exceeded one of the healthcheck failure limits. */
  UNHEALTHY,
  /** Every inspected task is considered healthy. */
  HEALTHY
}
/**
 * Decides whether healthcheck results should be used to judge the given tasks.
 *
 * <p>Healthchecks are not used when any of the following holds: running health checks is
 * disabled through the disaster manager, no deploy is present, the deploy defines no
 * healthcheck, the deploy is pending and asks to skip healthchecks on deploy, the request
 * asks to skip healthchecks, or any of the active tasks was launched from a pending task
 * that asks to skip healthchecks.
 *
 * @param request the request owning the tasks
 * @param deploy the deploy the tasks belong to, if known
 * @param activeTasks ids of the tasks being evaluated
 * @param isDeployPending whether the deploy is still pending
 * @return {@code true} if healthcheck results should drive the health verdict
 */
private boolean shouldCheckHealthchecks(final SingularityRequest request, final Optional<SingularityDeploy> deploy, final Collection<SingularityTaskId> activeTasks, final boolean isDeployPending) {
  if (disasterManager.isDisabled(SingularityAction.RUN_HEALTH_CHECKS)) {
    return false;
  }

  if (!deploy.isPresent() || !deploy.get().getHealthcheck().isPresent()) {
    return false;
  }

  if (isDeployPending && deploy.get().getSkipHealthchecksOnDeploy().or(false)) {
    return false;
  }

  if (request.getSkipHealthchecks().or(Boolean.FALSE)) {
    return false;
  }

  // A single task that opted out of healthchecks disables them for the whole evaluation.
  for (SingularityTask task : taskManager.getTasks(activeTasks).values()) {
    if (task.getTaskRequest().getPendingTask().getSkipHealthchecks().or(Boolean.FALSE)) {
      return false;
    }
  }

  return true;
}
/**
 * Computes the overall health of the given active tasks.
 *
 * <p>When healthchecks apply (see {@code shouldCheckHealthchecks}) the verdict is based on
 * healthcheck results; otherwise it is based on task state and time spent running.
 *
 * @param request the request owning the tasks
 * @param deploy the deploy the tasks belong to, if known
 * @param activeTasks ids of the tasks being evaluated
 * @param isDeployPending whether the deploy is still pending
 * @return the combined health verdict for the tasks
 */
public DeployHealth getDeployHealth(final SingularityRequest request, final Optional<SingularityDeploy> deploy, final Collection<SingularityTaskId> activeTasks, final boolean isDeployPending) {
  if (!shouldCheckHealthchecks(request, deploy, activeTasks, isDeployPending)) {
    return getNoHealthcheckDeployHealth(deploy, activeTasks);
  }

  // shouldCheckHealthchecks only returns true when the deploy is present.
  return getHealthcheckDeployState(deploy.get(), activeTasks, isDeployPending);
}
/**
 * Returns the subset of the given active tasks that are currently considered healthy.
 *
 * <p>When healthchecks apply (see {@code shouldCheckHealthchecks}) a task is judged by its
 * healthcheck results; otherwise by its task state and time spent running.
 *
 * @param request the request owning the tasks
 * @param deploy the deploy the tasks belong to, if known
 * @param activeTasks ids of the tasks being evaluated
 * @param isDeployPending whether the deploy is still pending
 * @return ids of the healthy tasks, in the iteration order of {@code activeTasks}
 */
public List<SingularityTaskId> getHealthyTasks(final SingularityRequest request, final Optional<SingularityDeploy> deploy, final Collection<SingularityTaskId> activeTasks, final boolean isDeployPending) {
  if (!shouldCheckHealthchecks(request, deploy, activeTasks, isDeployPending)) {
    return getNoHealthcheckHealthyTasks(deploy, activeTasks);
  }

  // shouldCheckHealthchecks only returns true when the deploy is present.
  return getHealthcheckedHealthyTasks(deploy.get(), activeTasks, isDeployPending);
}
private DeployHealth getNoHealthcheckDeployHealth(final Optional deploy, final Collection matchingActiveTasks) {
final Map> taskUpdates = taskManager.getTaskHistoryUpdates(matchingActiveTasks);
for (SingularityTaskId taskId : matchingActiveTasks) {
Collection updates = taskUpdates.get(taskId);
SimplifiedTaskState currentState = SingularityTaskHistoryUpdate.getCurrentState(updates);
switch (currentState) {
case UNKNOWN:
case WAITING:
return DeployHealth.WAITING;
case DONE:
LOG.warn("Unexpectedly found an active task ({}) in done state: {}}", taskId, updates);
return DeployHealth.UNHEALTHY;
case RUNNING:
if (!isRunningTaskHealthy(deploy, updates, taskId)) {
return DeployHealth.WAITING;
}
}
}
return DeployHealth.HEALTHY;
}
private List getNoHealthcheckHealthyTasks(final Optional deploy, final Collection matchingActiveTasks) {
final Map> taskUpdates = taskManager.getTaskHistoryUpdates(matchingActiveTasks);
final List healthyTaskIds = Lists.newArrayListWithCapacity(matchingActiveTasks.size());
for (SingularityTaskId taskId : matchingActiveTasks) {
Collection updates = taskUpdates.get(taskId);
SimplifiedTaskState currentState = SingularityTaskHistoryUpdate.getCurrentState(updates);
if (currentState == SimplifiedTaskState.RUNNING && isRunningTaskHealthy(deploy, updates, taskId)) {
healthyTaskIds.add(taskId);
}
}
return healthyTaskIds;
}
/**
 * Checks whether a running task has been running long enough to be considered healthy.
 *
 * <p>The threshold (in seconds) comes from the deploy when it specifies one, otherwise from
 * the configuration. A threshold below one second means the task is healthy immediately.
 *
 * @param deploy the deploy the task belongs to, if known
 * @param updates history updates of the task
 * @param taskId id of the task, used for logging
 * @return {@code true} if the task has been running for at least the threshold
 */
private boolean isRunningTaskHealthy(final Optional<SingularityDeploy> deploy, Collection<SingularityTaskHistoryUpdate> updates, SingularityTaskId taskId) {
  long runningThreshold = configuration.getConsiderTaskHealthyAfterRunningForSeconds();
  if (deploy.isPresent()) {
    runningThreshold = deploy.get().getConsiderHealthyAfterRunningForSeconds().or(runningThreshold);
  }

  if (runningThreshold < 1) {
    return true;
  }

  Optional<SingularityTaskHistoryUpdate> runningUpdate = SingularityTaskHistoryUpdate.getUpdate(updates, ExtendedTaskState.TASK_RUNNING);
  if (!runningUpdate.isPresent()) {
    // Defensive: without a TASK_RUNNING update there is no start time to measure from,
    // so treat the task as not yet healthy instead of failing on Optional.get().
    LOG.debug("Task {} has no TASK_RUNNING update, not considering it healthy yet", taskId);
    return false;
  }

  long taskDuration = System.currentTimeMillis() - runningUpdate.get().getTimestamp();
  long runningThresholdMillis = TimeUnit.SECONDS.toMillis(runningThreshold);

  if (taskDuration < runningThresholdMillis) {
    LOG.debug("Task {} has been running for {}, has not yet reached running threshold of {}", taskId, JavaUtils.durationFromMillis(taskDuration), JavaUtils.durationFromMillis(runningThresholdMillis));
    return false;
  }

  return true;
}
/**
 * Computes the health of the given tasks from their latest healthcheck results.
 *
 * <p>Tasks whose healthchecks were skipped count as healthy. The verdict of the first task
 * that is not healthy is returned immediately.
 *
 * @param deploy the deploy the tasks belong to
 * @param matchingActiveTasks ids of the tasks being evaluated
 * @param isDeployPending whether the deploy is still pending
 * @return {@link DeployHealth#HEALTHY} if every task is healthy, otherwise the verdict of
 *     the first task that is not
 */
private DeployHealth getHealthcheckDeployState(final SingularityDeploy deploy, final Collection<SingularityTaskId> matchingActiveTasks, final boolean isDeployPending) {
  Map<SingularityTaskId, SingularityTaskHealthcheckResult> healthcheckResults = taskManager.getLastHealthcheck(matchingActiveTasks);
  List<SingularityRequestHistory> requestHistories = requestManager.getRequestHistory(deploy.getRequestId());

  for (SingularityTaskId taskId : matchingActiveTasks) {
    if (healthchecksSkipped(taskId, requestHistories, deploy)) {
      LOG.trace("Detected skipped healthchecks for {}", taskId);
      continue; // skipped healthchecks count as healthy
    }

    DeployHealth individualTaskHealth = getTaskHealth(deploy, isDeployPending, Optional.fromNullable(healthcheckResults.get(taskId)), taskId);
    if (individualTaskHealth != DeployHealth.HEALTHY) {
      return individualTaskHealth;
    }
  }

  return DeployHealth.HEALTHY;
}
/**
 * Collects the tasks that are healthy according to their latest healthcheck results.
 * Tasks whose healthchecks were skipped count as healthy.
 *
 * @param deploy the deploy the tasks belong to
 * @param matchingActiveTasks ids of the tasks being evaluated
 * @param isDeployPending whether the deploy is still pending
 * @return ids of the healthy tasks, in the iteration order of {@code matchingActiveTasks}
 */
private List<SingularityTaskId> getHealthcheckedHealthyTasks(final SingularityDeploy deploy, final Collection<SingularityTaskId> matchingActiveTasks, final boolean isDeployPending) {
  final Map<SingularityTaskId, SingularityTaskHealthcheckResult> healthcheckResults = taskManager.getLastHealthcheck(matchingActiveTasks);
  final List<SingularityTaskId> healthyTaskIds = Lists.newArrayListWithCapacity(matchingActiveTasks.size());
  List<SingularityRequestHistory> requestHistories = requestManager.getRequestHistory(deploy.getRequestId());

  for (SingularityTaskId taskId : matchingActiveTasks) {
    final boolean healthy;
    if (healthchecksSkipped(taskId, requestHistories, deploy)) {
      LOG.trace("Detected skipped healthchecks for {}", taskId);
      healthy = true;
    } else {
      healthy = getTaskHealth(deploy, isDeployPending, Optional.fromNullable(healthcheckResults.get(taskId)), taskId) == DeployHealth.HEALTHY;
    }

    if (healthy) {
      healthyTaskIds.add(taskId);
    }
  }

  return healthyTaskIds;
}
/**
 * Determines whether healthchecks were skipped for the given task.
 *
 * <p>They count as skipped when the deploy skips healthchecks on deploy, when the task was
 * launched from a pending task that skips healthchecks, or when the most recent request
 * history entry created before the task entered TASK_RUNNING had healthchecks skipped.
 *
 * @param taskId id of the task to inspect
 * @param requestHistories history entries of the task's request
 * @param deploy the deploy the task belongs to
 * @return {@code true} if healthchecks were skipped for the task
 */
private boolean healthchecksSkipped(SingularityTaskId taskId, List<SingularityRequestHistory> requestHistories, SingularityDeploy deploy) {
  if (deploy.getSkipHealthchecksOnDeploy().or(false)) {
    return true;
  }

  Optional<SingularityTask> maybeTask = taskManager.getTask(taskId);
  if (!maybeTask.isPresent()) {
    return false;
  }

  if (maybeTask.get().getTaskRequest().getPendingTask().getSkipHealthchecks().or(false)) {
    return true;
  }

  Optional<Long> runningStartTime = getRunningAt(taskManager.getTaskHistoryUpdates(taskId));
  if (!runningStartTime.isPresent()) {
    return false;
  }

  // Find the latest history entry that was created before the task started running.
  Optional<SingularityRequestHistory> previousHistory = Optional.absent();
  for (SingularityRequestHistory history : requestHistories) {
    boolean createdBeforeRunning = history.getCreatedAt() < runningStartTime.get();
    boolean newerThanCandidate = !previousHistory.isPresent() || previousHistory.get().getCreatedAt() < history.getCreatedAt();
    if (createdBeforeRunning && newerThanCandidate) {
      previousHistory = Optional.of(history);
    }
  }

  return previousHistory.isPresent() && previousHistory.get().getRequest().getSkipHealthchecks().or(false);
}
/**
 * Evaluates the health of a single task from its latest healthcheck result.
 *
 * <ul>
 *   <li>No result yet: {@link DeployHealth#WAITING}.</li>
 *   <li>Latest result not failed: {@link DeployHealth#HEALTHY}.</li>
 *   <li>Latest result failed: {@link DeployHealth#UNHEALTHY} if the status code is one of
 *       the failure status codes, if a startup healthcheck is still failing after the
 *       startup timeout, if the number of non-startup healthchecks exceeds the maximum
 *       retries, or (pending deploys only) if the task has been running longer than the
 *       maximum total healthcheck time; otherwise {@link DeployHealth#WAITING}.</li>
 * </ul>
 *
 * @param deploy the deploy the task belongs to
 * @param isDeployPending whether the deploy is still pending
 * @param healthcheckResult latest healthcheck result of the task, if any
 * @param taskId id of the task
 * @return the health verdict for the task
 */
public DeployHealth getTaskHealth(SingularityDeploy deploy, boolean isDeployPending, Optional<SingularityTaskHealthcheckResult> healthcheckResult, SingularityTaskId taskId) {
  if (!healthcheckResult.isPresent()) {
    LOG.debug("No healthcheck present for {}", taskId);
    return DeployHealth.WAITING;
  }

  if (!healthcheckResult.get().isFailed()) {
    return DeployHealth.HEALTHY;
  }

  LOG.debug("Found a failed healthcheck: {}", healthcheckResult);

  if (deploy.getHealthcheck().isPresent() && healthcheckResult.get().getStatusCode().isPresent()
      && deploy.getHealthcheck().get().getFailureStatusCodes().or(configuration.getHealthcheckFailureStatusCodes()).contains(healthcheckResult.get().getStatusCode().get())) {
    LOG.debug("Failed healthcheck had bad status code: {}", healthcheckResult.get().getStatusCode().get());
    return DeployHealth.UNHEALTHY;
  }

  final int startupTimeout = deploy.getHealthcheck().isPresent()
      ? deploy.getHealthcheck().get().getStartupTimeoutSeconds().or(configuration.getStartupTimeoutSeconds())
      : configuration.getStartupTimeoutSeconds();

  Collection<SingularityTaskHistoryUpdate> updates = taskManager.getTaskHistoryUpdates(taskId);
  Optional<Long> runningAt = getRunningAt(updates);

  if (runningAt.isPresent()) {
    final long durationSinceRunning = System.currentTimeMillis() - runningAt.get();
    if (healthcheckResult.get().isStartup() && durationSinceRunning > TimeUnit.SECONDS.toMillis(startupTimeout)) {
      LOG.debug("{} has not responded to healthchecks in {}s", taskId, startupTimeout);
      return DeployHealth.UNHEALTHY;
    }
  }

  final Optional<Integer> healthcheckMaxRetries = deploy.getHealthcheck().isPresent()
      ? deploy.getHealthcheck().get().getMaxRetries().or(configuration.getHealthcheckMaxRetries())
      : Optional.<Integer>absent();

  if (healthcheckMaxRetries.isPresent() && taskManager.getNumNonstartupHealthchecks(taskId) > healthcheckMaxRetries.get()) {
    LOG.debug("{} failed {} healthchecks, the max for the deploy", taskId, healthcheckMaxRetries.get());
    return DeployHealth.UNHEALTHY;
  }

  final Optional<Integer> healthcheckMaxTotalTimeoutSeconds = deploy.getHealthcheck().isPresent()
      ? Optional.of(getMaxHealthcheckTimeoutSeconds(deploy.getHealthcheck().get()))
      : Optional.<Integer>absent();

  // The overall time limit is only enforced while the deploy is still pending.
  if (isDeployPending && healthcheckMaxTotalTimeoutSeconds.isPresent() && runningAt.isPresent()) {
    final long durationSinceRunning = System.currentTimeMillis() - runningAt.get();
    if (durationSinceRunning > TimeUnit.SECONDS.toMillis(healthcheckMaxTotalTimeoutSeconds.get())) {
      LOG.debug("{} has been running for {} and has yet to pass healthchecks, failing deploy", taskId, JavaUtils.durationFromMillis(durationSinceRunning));
      return DeployHealth.UNHEALTHY;
    }
  }

  return DeployHealth.WAITING;
}
/**
 * Computes the maximum total time, in seconds, allowed for a task to pass healthchecks:
 * the startup timeout plus one (interval + response timeout) period per attempt, where the
 * number of attempts is the maximum retries (0 if unset) plus one. Values missing from
 * {@code options} fall back to the configuration.
 *
 * @param options healthcheck options of the deploy
 * @return the maximum total healthcheck time in seconds
 */
public int getMaxHealthcheckTimeoutSeconds(HealthcheckOptions options) {
  final int interval = options.getIntervalSeconds().or(configuration.getHealthcheckIntervalSeconds());
  final int responseTimeout = options.getResponseTimeoutSeconds().or(configuration.getHealthcheckTimeoutSeconds());
  final int startup = options.getStartupTimeoutSeconds().or(configuration.getStartupTimeoutSeconds());
  final int maxRetries = options.getMaxRetries().or(configuration.getHealthcheckMaxRetries()).or(0);

  final int perAttemptSeconds = interval + responseTimeout;
  final int attemptCount = maxRetries + 1;
  return startup + (perAttemptSeconds * attemptCount);
}
/**
 * Builds the list of deploy failures attributable to the given active tasks.
 *
 * @param deploy the deploy the tasks belong to; must be present whenever
 *     {@code activeTasks} is non-empty, because {@code deploy.get()} is called per task
 * @param activeTasks ids of the tasks to inspect
 * @return one failure per task for which a failure could be determined; possibly empty
 * @throws IllegalStateException if {@code deploy} is absent and {@code activeTasks} is
 *     non-empty (thrown by Guava's {@code Optional.get()})
 */
public List<SingularityDeployFailure> getTaskFailures(final Optional<SingularityDeploy> deploy, final Collection<SingularityTaskId> activeTasks) {
  List<SingularityDeployFailure> failures = new ArrayList<>();
  Map<SingularityTaskId, List<SingularityTaskHistoryUpdate>> taskUpdates = taskManager.getTaskHistoryUpdates(activeTasks);
  Map<SingularityTaskId, SingularityTaskHealthcheckResult> healthcheckResults = taskManager.getLastHealthcheck(activeTasks);

  for (SingularityTaskId taskId : activeTasks) {
    Optional<SingularityDeployFailure> maybeFailure = getTaskFailure(deploy.get(), taskUpdates, healthcheckResults, taskId);
    if (maybeFailure.isPresent()) {
      failures.add(maybeFailure.get());
    }
  }

  return failures;
}
private Optional getTaskFailure(SingularityDeploy deploy, Map> taskUpdates,
Map healthcheckResults, SingularityTaskId taskId) {
SingularityTaskHealthcheckResult healthcheckResult = healthcheckResults.get(taskId);
Optional maybeFailure;
if (healthcheckResult == null) {
maybeFailure = getNonHealthcheckedTaskFailure(taskUpdates, taskId);
} else {
maybeFailure = getHealthcheckedTaskFailure(deploy, taskUpdates, healthcheckResult, taskId);
}
return maybeFailure;
}
private Optional getHealthcheckedTaskFailure(SingularityDeploy deploy, Map> taskUpdates,
SingularityTaskHealthcheckResult healthcheckResult, SingularityTaskId taskId) {
Collection updates = taskUpdates.get(taskId);
if (!healthcheckResult.isFailed()) {
return Optional.absent();
}
SingularityTaskHistoryUpdate lastUpdate = Iterables.getLast(updates);
if (lastUpdate.getTaskState().isDone()) {
if (lastUpdate.getTaskState().isSuccess()) {
return Optional.of(new SingularityDeployFailure(SingularityDeployFailureReason.TASK_EXPECTED_RUNNING_FINISHED, Optional.of(taskId),
Optional.of(String.format("Task was expected to maintain TASK_RUNNING state but finished. (%s)", lastUpdate.getStatusMessage().or("")))));
} else {
return Optional.of(new SingularityDeployFailure(SingularityDeployFailureReason.TASK_FAILED_ON_STARTUP, Optional.of(taskId), lastUpdate.getStatusMessage()));
}
}
final Optional healthcheckMaxRetries = deploy.getHealthcheck().isPresent() ?
deploy.getHealthcheck().get().getMaxRetries().or(configuration.getHealthcheckMaxRetries()) : configuration.getHealthcheckMaxRetries();
if (healthcheckMaxRetries.isPresent() && taskManager.getNumNonstartupHealthchecks(taskId) > healthcheckMaxRetries.get()) {
String message = String.format("Instance %s failed %s healthchecks, the max for the deploy.", taskId.getInstanceNo(), healthcheckMaxRetries.get() + 1);
if (healthcheckResult.getStatusCode().isPresent()) {
message = String.format("%s Last check returned with status code %s", message, healthcheckResult.getStatusCode().get());
}
return Optional.of(new SingularityDeployFailure(SingularityDeployFailureReason.TASK_FAILED_HEALTH_CHECKS, Optional.of(taskId), Optional.of(message)));
}
Optional runningAt = getRunningAt(updates);
if (runningAt.isPresent()) {
final long durationSinceRunning = System.currentTimeMillis() - runningAt.get();
if (healthcheckResult.isStartup() && deploy.getHealthcheck().isPresent() && durationSinceRunning > deploy.getHealthcheck().get().getStartupTimeoutSeconds()
.or(configuration.getStartupTimeoutSeconds())) {
String message = String.format("Instance %s has not responded to healthchecks after running for %s", taskId.getInstanceNo(), JavaUtils.durationFromMillis(durationSinceRunning));
return Optional.of(new SingularityDeployFailure(SingularityDeployFailureReason.TASK_FAILED_HEALTH_CHECKS, Optional.of(taskId), Optional.of(message)));
}
if (isRunningLongerThanThreshold(deploy, durationSinceRunning)) {
String message = String.format("Instance %s has been running for %s and has yet to pass healthchecks.", taskId.getInstanceNo(), JavaUtils.durationFromMillis(durationSinceRunning));
if (healthcheckResult.getStatusCode().isPresent()) {
message = String.format("%s Last check returned with status code %s", message, healthcheckResult.getStatusCode().get());
}
return Optional.of(new SingularityDeployFailure(SingularityDeployFailureReason.TASK_FAILED_HEALTH_CHECKS, Optional.of(taskId), Optional.of(message)));
}
}
return Optional.absent();
}
/**
 * Tells whether a task has been running longer than the relevant timeout: the maximum
 * total healthcheck time when the deploy defines a healthcheck, otherwise the deploy's
 * health timeout (falling back to the configured default).
 *
 * @param deploy the deploy the task belongs to
 * @param durationSinceRunning time since the task entered TASK_RUNNING, in milliseconds
 * @return {@code true} if the duration exceeds the timeout
 */
private boolean isRunningLongerThanThreshold(SingularityDeploy deploy, long durationSinceRunning) {
  final long timeoutSeconds;
  if (deploy.getHealthcheck().isPresent()) {
    timeoutSeconds = getMaxHealthcheckTimeoutSeconds(deploy.getHealthcheck().get());
  } else {
    timeoutSeconds = deploy.getDeployHealthTimeoutSeconds().or(configuration.getDeployHealthyBySeconds());
  }

  final long timeoutMillis = TimeUnit.SECONDS.toMillis(timeoutSeconds);
  return durationSinceRunning > timeoutMillis;
}
/**
 * Finds the timestamp of the first TASK_RUNNING update in the given history.
 *
 * @param updates history updates of a task, in iteration order
 * @return the timestamp of the first update whose state is TASK_RUNNING, or absent if
 *     there is none
 */
private Optional<Long> getRunningAt(Collection<SingularityTaskHistoryUpdate> updates) {
  for (SingularityTaskHistoryUpdate update : updates) {
    if (update.getTaskState() == ExtendedTaskState.TASK_RUNNING) {
      return Optional.of(update.getTimestamp());
    }
  }

  return Optional.absent();
}
private Optional getNonHealthcheckedTaskFailure(Map> taskUpdates, SingularityTaskId taskId) {
List updates = taskUpdates.get(taskId);
SingularityTaskHistoryUpdate lastUpdate = Iterables.getLast(updates);
if (lastUpdate.getTaskState().isSuccess()) {
return Optional.of(new SingularityDeployFailure(SingularityDeployFailureReason.TASK_EXPECTED_RUNNING_FINISHED, Optional.of(taskId),
Optional.of(String.format("Task was expected to maintain TASK_RUNNING state but finished. (%s)", lastUpdate.getStatusMessage().or("")))));
} else if (lastUpdate.getTaskState().isDone()) {
return Optional.of(new SingularityDeployFailure(SingularityDeployFailureReason.TASK_FAILED_ON_STARTUP, Optional.of(taskId), lastUpdate.getStatusMessage()));
} else if (SingularityTaskHistoryUpdate.getCurrentState(updates) == SimplifiedTaskState.WAITING) {
return Optional.of(new SingularityDeployFailure(SingularityDeployFailureReason.TASK_NEVER_ENTERED_RUNNING, Optional.of(taskId),
Optional.of(String.format("Task never entered running state, last state was %s (%s)", lastUpdate.getTaskState().getDisplayName(), lastUpdate.getStatusMessage().or("")))));
}
return Optional.absent();
}
}