Please wait. This can take a few minutes...
Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance.
Project price: only $1
You can buy this project and download or modify it as often as you want.
com.hubspot.singularity.scheduler.SingularityHealthchecker Maven / Gradle / Ivy
package com.hubspot.singularity.scheduler;
import com.codahale.metrics.annotation.Timed;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Maps;
import com.google.inject.Inject;
import com.hubspot.deploy.HealthcheckOptions;
import com.hubspot.singularity.ExtendedTaskState;
import com.hubspot.singularity.HealthcheckMethod;
import com.hubspot.singularity.HealthcheckProtocol;
import com.hubspot.singularity.SingularityAbort;
import com.hubspot.singularity.SingularityAction;
import com.hubspot.singularity.SingularityManagedScheduledExecutorServiceFactory;
import com.hubspot.singularity.SingularityPendingDeploy;
import com.hubspot.singularity.SingularityRequestWithState;
import com.hubspot.singularity.SingularityTask;
import com.hubspot.singularity.SingularityTaskHealthcheckResult;
import com.hubspot.singularity.SingularityTaskHistoryUpdate;
import com.hubspot.singularity.SingularityTaskId;
import com.hubspot.singularity.config.SingularityConfiguration;
import com.hubspot.singularity.data.DisasterManager;
import com.hubspot.singularity.data.TaskManager;
import com.hubspot.singularity.helpers.MesosProtosUtils;
import com.hubspot.singularity.helpers.MesosUtils;
import com.hubspot.singularity.sentry.SingularityExceptionNotifier;
import com.ning.http.client.AsyncCompletionHandler;
import com.ning.http.client.AsyncHttpClient;
import com.ning.http.client.RequestBuilder;
import java.io.IOException;
import java.util.Collection;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.RejectedExecutionException;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.TimeUnit;
import javax.inject.Singleton;
import okhttp3.Call;
import okhttp3.Callback;
import okhttp3.OkHttpClient;
import org.apache.commons.lang3.time.DurationFormatUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@Singleton
public class SingularityHealthchecker {
private static final HealthcheckProtocol DEFAULT_HEALTH_CHECK_SCHEME =
HealthcheckProtocol.HTTP;
private static final Logger LOG = LoggerFactory.getLogger(
SingularityHealthchecker.class
);
private final AsyncHttpClient http;
private final OkHttpClient http2;
private final SingularityConfiguration configuration;
private final TaskManager taskManager;
private final SingularityAbort abort;
private final SingularityNewTaskChecker newTaskChecker;
private final Map> taskIdToHealthcheck;
private final ScheduledExecutorService executorService;
private final SingularityExceptionNotifier exceptionNotifier;
private final DisasterManager disasterManager;
private final MesosProtosUtils mesosProtosUtils;
@Inject
public SingularityHealthchecker(
SingularityManagedScheduledExecutorServiceFactory executorServiceFactory,
AsyncHttpClient http,
OkHttpClient http2,
SingularityConfiguration configuration,
SingularityNewTaskChecker newTaskChecker,
TaskManager taskManager,
SingularityAbort abort,
SingularityExceptionNotifier exceptionNotifier,
DisasterManager disasterManager,
MesosProtosUtils mesosProtosUtils
) {
this.http = http;
this.http2 = http2;
this.configuration = configuration;
this.newTaskChecker = newTaskChecker;
this.taskManager = taskManager;
this.abort = abort;
this.exceptionNotifier = exceptionNotifier;
this.taskIdToHealthcheck = Maps.newConcurrentMap();
this.executorService =
executorServiceFactory.get(
"health-checker",
configuration.getHealthcheckStartThreads()
);
this.disasterManager = disasterManager;
this.mesosProtosUtils = mesosProtosUtils;
}
/**
 * Schedules the next healthcheck for {@code task} and registers its future under the task id,
 * cancelling any future that was already registered for that task.
 *
 * <p>Nothing is scheduled when either of these holds:
 * <ul>
 *   <li>{@code inStartup} is true and the time since the task's TASK_RUNNING update exceeds the
 *       startup timeout (deploy option, falling back to the configuration value);</li>
 *   <li>a max-retries value is set (deploy option, falling back to the configuration value) and
 *       the number of non-startup healthchecks already exceeds it.</li>
 * </ul>
 *
 * <p>The task's deploy must carry healthcheck options; they are read with an unchecked
 * {@code get()}.
 *
 * @param task task to healthcheck
 * @param ignoreExisting when false, a warning is logged if an existing future was replaced
 * @param inStartup whether the task is still in its startup phase
 * @param isFirstCheck whether this is the first healthcheck for the task (affects the delay)
 */
public void enqueueHealthcheck(
  SingularityTask task,
  boolean ignoreExisting,
  boolean inStartup,
  boolean isFirstCheck
) {
  HealthcheckOptions options = task.getTaskRequest().getDeploy().getHealthcheck().get();
  final Optional<Integer> healthcheckMaxRetries = options.getMaxRetries().isPresent()
    ? options.getMaxRetries()
    : configuration.getHealthcheckMaxRetries();

  Optional<Long> maybeRunningAt = getRunningAt(
    taskManager.getTaskHistoryUpdates(task.getTaskId())
  );

  if (maybeRunningAt.isPresent()) {
    final long durationSinceRunning = System.currentTimeMillis() - maybeRunningAt.get();
    final int startupTimeout = options
      .getStartupTimeoutSeconds()
      .orElse(configuration.getStartupTimeoutSeconds());
    if (inStartup && durationSinceRunning > TimeUnit.SECONDS.toMillis(startupTimeout)) {
      LOG.debug("{} since running", durationSinceRunning);
      LOG.info(
        "Not enqueuing new healthcheck for {}, has not responded to healthchecks before startup timeout of {}s",
        task.getTaskId(),
        startupTimeout
      );
      return;
    }
  }

  if (
    healthcheckMaxRetries.isPresent() &&
    taskManager.getNumNonstartupHealthchecks(task.getTaskId()) >
    healthcheckMaxRetries.get()
  ) {
    LOG.info(
      "Not enqueuing new healthcheck for {}, it has already attempted {} times",
      task.getTaskId(),
      healthcheckMaxRetries.get()
    );
    return;
  }

  ScheduledFuture<?> existing = null;
  try {
    ScheduledFuture<?> future = enqueueHealthcheckWithDelay(
      task,
      getDelaySeconds(task.getTaskId(), options, inStartup, isFirstCheck),
      inStartup
    );
    // put() returns the previously registered future (if any) so it can be cancelled below.
    existing = taskIdToHealthcheck.put(task.getTaskId().getId(), future);
  } catch (RejectedExecutionException ree) {
    LOG.warn(
      "Executor rejected execution, Singularity is shutting down, short circuiting"
    );
  }

  if (existing != null) {
    boolean canceledExisting = existing.cancel(false);
    if (!ignoreExisting) {
      LOG.warn(
        "Found existing overlapping healthcheck for task {} - cancel success: {}",
        task.getTaskId(),
        canceledExisting
      );
    }
  }
}
/**
 * Finds the timestamp of the first update in {@code updates} whose state is
 * {@link ExtendedTaskState#TASK_RUNNING}.
 *
 * @param updates history updates of a task, examined in iteration order
 * @return the timestamp of the first TASK_RUNNING update, or empty if there is none
 */
private Optional<Long> getRunningAt(Collection<SingularityTaskHistoryUpdate> updates) {
  for (SingularityTaskHistoryUpdate update : updates) {
    if (update.getTaskState() == ExtendedTaskState.TASK_RUNNING) {
      return Optional.of(update.getTimestamp());
    }
  }
  return Optional.empty();
}
/**
 * Chooses how many seconds to wait before the next healthcheck.
 *
 * <ul>
 *   <li>First check with a startup delay set (deploy option, else configuration): that delay.</li>
 *   <li>Otherwise, while in startup: the startup interval (deploy option, else configuration).</li>
 *   <li>Otherwise: the regular interval (deploy option, else configuration).</li>
 * </ul>
 *
 * @param taskId task the check is for; used only for logging
 * @param options healthcheck options of the task's deploy
 * @param inStartup whether the task is still in its startup phase
 * @param isFirstCheck whether no healthcheck has run for the task yet
 * @return delay in seconds
 */
private int getDelaySeconds(
  SingularityTaskId taskId,
  HealthcheckOptions options,
  boolean inStartup,
  boolean isFirstCheck
) {
  if (
    isFirstCheck &&
    (
      options.getStartupDelaySeconds().isPresent() ||
      configuration.getStartupDelaySeconds().isPresent()
    )
  ) {
    int delaySeconds = options
      .getStartupDelaySeconds()
      .orElseGet(() -> configuration.getStartupDelaySeconds().get());
    // SLF4J only substitutes "{}" placeholders; a "%s" would be printed literally.
    LOG.trace(
      "Delaying first healthcheck {} seconds for task {}",
      delaySeconds,
      taskId
    );
    return delaySeconds;
  } else if (inStartup) {
    return options
      .getStartupIntervalSeconds()
      .orElse(configuration.getStartupIntervalSeconds());
  } else {
    return options
      .getIntervalSeconds()
      .orElse(configuration.getHealthcheckIntervalSeconds());
  }
}
/**
 * Enqueues a startup healthcheck for {@code task} if {@code shouldHealthcheck} allows it.
 * The check is treated as the first one when the task has no recorded healthcheck yet.
 *
 * @param task task to healthcheck
 * @param pendingDeploy pending deploy of the task's request, if any
 * @param request the task's request with its state, if known
 * @return {@code true} if a healthcheck was enqueued, {@code false} if it was skipped
 */
@Timed
public boolean enqueueHealthcheck(
  SingularityTask task,
  Optional<SingularityPendingDeploy> pendingDeploy,
  Optional<SingularityRequestWithState> request
) {
  if (!shouldHealthcheck(task, request, pendingDeploy)) {
    return false;
  }

  Optional<SingularityTaskHealthcheckResult> lastHealthcheck = taskManager.getLastHealthcheck(
    task.getTaskId()
  );

  enqueueHealthcheck(task, true, true, !lastHealthcheck.isPresent());
  return true;
}
/**
 * Ensures a healthcheck is scheduled for {@code task}: if no future is registered under the
 * task's id, a startup healthcheck is enqueued (flagged as the first check when the task has
 * no recorded healthcheck result).
 *
 * @param task task expected to have a healthcheck in flight
 */
public void checkHealthcheck(SingularityTask task) {
  final boolean alreadyScheduled = taskIdToHealthcheck.containsKey(
    task.getTaskId().getId()
  );
  if (alreadyScheduled) {
    return;
  }

  LOG.info("Enqueueing expected healthcheck for task {}", task.getTaskId());
  final boolean noPreviousCheck = !taskManager
    .getLastHealthcheck(task.getTaskId())
    .isPresent();
  enqueueHealthcheck(task, false, true, noPreviousCheck);
}
@VisibleForTesting
Collection> getHealthCheckFutures() {
return taskIdToHealthcheck.values();
}
/**
 * Drops the registered healthcheck future for {@code taskId} without cancelling it.
 *
 * @param taskId task id string used as the key in {@code taskIdToHealthcheck}
 */
public void markHealthcheckFinished(String taskId) {
taskIdToHealthcheck.remove(taskId);
}
/**
 * Unregisters and cancels the pending healthcheck for {@code taskId}. A check that is already
 * running is not interrupted ({@code cancel(false)}).
 *
 * @param taskId task id string used as the key in {@code taskIdToHealthcheck}
 * @return {@code false} if nothing was registered for the task, otherwise the result of
 *     {@link ScheduledFuture#cancel(boolean)}
 */
public boolean cancelHealthcheck(String taskId) {
  ScheduledFuture<?> future = taskIdToHealthcheck.remove(taskId);
  if (future == null) {
    return false;
  }

  boolean canceled = future.cancel(false);
  LOG.trace("Canceling healthcheck ({}) for task {}", canceled, taskId);
  return canceled;
}
private ScheduledFuture> enqueueHealthcheckWithDelay(
final SingularityTask task,
long delaySeconds,
final boolean inStartup
) {
LOG.trace(
"Enqueuing a healthcheck for task {} with delay {}",
task.getTaskId(),
DurationFormatUtils.formatDurationHMS(TimeUnit.SECONDS.toMillis(delaySeconds))
);
return executorService.schedule(
new Runnable() {
@Override
public void run() {
try {
asyncHealthcheck(task);
} catch (Throwable t) {
LOG.error("Uncaught throwable in async healthcheck", t);
exceptionNotifier.notify(
String.format(
"Uncaught throwable in async healthcheck (%s)",
t.getMessage()
),
t,
ImmutableMap.of("taskId", task.getTaskId().toString())
);
reEnqueueOrAbort(task, inStartup);
}
}
},
delaySeconds,
TimeUnit.SECONDS
);
}
/**
 * Tries to enqueue another (non-first) healthcheck for {@code task}. If enqueueing throws,
 * the failure is logged, reported to the exception notifier, and the abort handler is invoked
 * with {@code UNRECOVERABLE_ERROR}.
 *
 * @param task task whose healthcheck should be retried
 * @param inStartup whether the task is still in its startup phase
 */
public void reEnqueueOrAbort(SingularityTask task, boolean inStartup) {
  try {
    enqueueHealthcheck(task, true, inStartup, false);
    return;
  } catch (Throwable failure) {
    LOG.error(
      "Caught throwable while re-enqueuing health check for {}, aborting",
      task.getTaskId(),
      failure
    );

    final String notification = String.format(
      "Caught throwable while re-enqueuing health check (%s)",
      failure.getMessage()
    );
    final ImmutableMap<String, String> context = ImmutableMap.of(
      "taskId",
      task.getTaskId().toString()
    );
    exceptionNotifier.notify(notification, failure, context);

    abort.abort(SingularityAbort.AbortReason.UNRECOVERABLE_ERROR, Optional.of(failure));
  }
}
/**
 * Builds the healthcheck URL {@code protocol://hostname:port/uri} for {@code task}.
 *
 * <p>The port is the explicit port number from the healthcheck options if set, otherwise the
 * port at the configured index (default 0) among the task's Mesos resources. A leading
 * {@code /} on the configured URI is dropped because the format string supplies one.
 *
 * @param task task to build the URL for
 * @return the URL, or empty when the deploy has no healthcheck options, no usable port
 *     (absent or below 1), or no URI configured
 */
private Optional<String> getHealthcheckUri(SingularityTask task) {
  if (!task.getTaskRequest().getDeploy().getHealthcheck().isPresent()) {
    return Optional.empty();
  }

  final HealthcheckOptions options = task
    .getTaskRequest()
    .getDeploy()
    .getHealthcheck()
    .get();

  final String hostname = task.getHostname();

  final Optional<Long> healthcheckPort = options.getPortNumber().isPresent()
    ? options.getPortNumber()
    : MesosUtils.getPortByIndex(
      mesosProtosUtils.toResourceList(task.getMesosTask().getResources()),
      options.getPortIndex().orElse(0)
    );

  if (!healthcheckPort.isPresent() || healthcheckPort.get() < 1L) {
    return Optional.empty();
  }

  if (!options.getUri().isPresent()) {
    return Optional.empty();
  }

  String uri = options.getUri().get();
  if (uri.startsWith("/")) {
    uri = uri.substring(1);
  }

  HealthcheckProtocol protocol = options
    .getProtocol()
    .orElse(DEFAULT_HEALTH_CHECK_SCHEME);

  return Optional.of(
    String.format(
      "%s://%s:%d/%s",
      protocol.getProtocol(),
      hostname,
      healthcheckPort.get(),
      uri
    )
  );
}
/**
 * Records a failed healthcheck through {@code handler}, supplying only {@code message}; the
 * other three {@code saveResult} arguments are passed as empty.
 *
 * @param handler handler that persists the result
 * @param message text passed as the third {@code saveResult} argument
 */
private void saveFailure(SingularityHealthcheckAsyncHandler handler, String message) {
// NOTE(review): argument order is presumably (status code, response body, error message,
// throwable) - confirm against SingularityHealthcheckAsyncHandler#saveResult.
handler.saveResult(
Optional.empty(),
Optional.empty(),
Optional.of(message),
Optional.empty()
);
}
/**
 * Decides whether a healthcheck should be submitted for {@code task}.
 *
 * <p>Returns {@code false} when any of the following holds:
 * <ul>
 *   <li>the RUN_HEALTH_CHECKS action is disabled in the disaster manager;</li>
 *   <li>the request is not long running, the deploy has no healthcheck options, or the options
 *       specify a healthcheck result file path;</li>
 *   <li>the pending task asks to skip healthchecks;</li>
 *   <li>the task belongs to the pending deploy and that deploy skips healthchecks on deploy;</li>
 *   <li>the request asks to skip healthchecks;</li>
 *   <li>the task's last recorded healthcheck did not fail.</li>
 * </ul>
 *
 * @param task task under consideration
 * @param request the task's request with its state, if known
 * @param pendingDeploy pending deploy of the request, if any
 * @return {@code true} if a healthcheck should be submitted
 */
private boolean shouldHealthcheck(
  final SingularityTask task,
  final Optional<SingularityRequestWithState> request,
  Optional<SingularityPendingDeploy> pendingDeploy
) {
  if (disasterManager.isDisabled(SingularityAction.RUN_HEALTH_CHECKS)) {
    return false;
  }

  if (
    !task.getTaskRequest().getRequest().isLongRunning() ||
    !task.getTaskRequest().getDeploy().getHealthcheck().isPresent() ||
    task
      .getTaskRequest()
      .getDeploy()
      .getHealthcheck()
      .get()
      .getHealthcheckResultFilePath()
      .isPresent()
  ) {
    return false;
  }

  if (task.getTaskRequest().getPendingTask().getSkipHealthchecks().orElse(false)) {
    return false;
  }

  if (
    pendingDeploy.isPresent() &&
    pendingDeploy
      .get()
      .getDeployMarker()
      .getDeployId()
      .equals(task.getTaskId().getDeployId()) &&
    task.getTaskRequest().getDeploy().getSkipHealthchecksOnDeploy().orElse(false)
  ) {
    return false;
  }

  if (
    request.isPresent() &&
    request.get().getRequest().getSkipHealthchecks().orElse(false)
  ) {
    return false;
  }

  Optional<SingularityTaskHealthcheckResult> lastHealthcheck = taskManager.getLastHealthcheck(
    task.getTaskId()
  );

  if (lastHealthcheck.isPresent() && !lastHealthcheck.get().isFailed()) {
    LOG.debug(
      "Not submitting a new healthcheck for {} because it already passed a healthcheck",
      task.getTaskId()
    );
    return false;
  }

  return true;
}
/**
 * Adapts {@code handler} to OkHttp's {@link Callback}.
 *
 * <p>On a response, up to {@code configuration.getMaxHealthcheckResponseBodyBytes()} bytes of
 * the body are read as the excerpt (empty excerpt is reported as absent) and passed to
 * {@code handler.onCompleted} with the status code. A transport failure, or an I/O error while
 * reading the excerpt, is passed to {@code handler.onFailed}.
 *
 * @param handler healthcheck handler to notify
 * @return callback to pass to {@code Call.enqueue}
 */
private Callback wrappedHttp2Handler(final SingularityHealthcheckAsyncHandler handler) {
  return new Callback() {
    @Override
    public void onFailure(Call call, IOException e) {
      handler.onFailed(e);
    }

    @Override
    public void onResponse(Call call, okhttp3.Response response) throws IOException {
      try {
        final String responseExcerpt;
        try {
          responseExcerpt =
            response
              .peekBody(configuration.getMaxHealthcheckResponseBodyBytes())
              .string();
        } catch (IOException e) {
          // OkHttp only logs exceptions thrown from onResponse, so report the failure here
          // to make sure the healthcheck gets a result.
          handler.onFailed(e);
          return;
        }

        Optional<String> maybeResponseExcerpt = Optional.empty();
        if (responseExcerpt.length() > 0) {
          maybeResponseExcerpt = Optional.of(responseExcerpt);
        }

        handler.onCompleted(Optional.of(response.code()), maybeResponseExcerpt);
      } finally {
        // peekBody() leaves the real body unconsumed; close it to release the connection.
        response.close();
      }
    }
  };
}
/**
 * Adapts {@code handler} to the Ning async-http-client completion handler.
 *
 * <p>On completion, the status code and (when the response has a body) an excerpt of at most
 * {@code configuration.getMaxHealthcheckResponseBodyBytes()} are passed to
 * {@code handler.onCompleted}; a throwable is passed to {@code handler.onFailed}.
 *
 * @param handler healthcheck handler to notify
 * @return completion handler to pass to {@code execute}
 */
private AsyncCompletionHandler<com.ning.http.client.Response> wrappedHttp1Handler(
  final SingularityHealthcheckAsyncHandler handler
) {
  return new AsyncCompletionHandler<com.ning.http.client.Response>() {
    @Override
    public void onThrowable(Throwable t) {
      handler.onFailed(t);
    }

    @Override
    public com.ning.http.client.Response onCompleted(
      com.ning.http.client.Response response
    )
      throws Exception {
      Optional<String> maybeResponseExcerpt = Optional.empty();

      if (response.hasResponseBody()) {
        maybeResponseExcerpt =
          Optional.of(
            response.getResponseBodyExcerpt(
              configuration.getMaxHealthcheckResponseBodyBytes()
            )
          );
      }

      handler.onCompleted(Optional.of(response.getStatusCode()), maybeResponseExcerpt);
      return response;
    }
  };
}
/**
 * Issues one asynchronous healthcheck request for {@code task}; the result is delivered to a
 * new {@code SingularityHealthcheckAsyncHandler}.
 *
 * <p>If no healthcheck URI can be built, a failure is saved immediately. Otherwise the request
 * is sent with the configured HTTP method (default GET) and response timeout (deploy option,
 * else configuration), through the OkHttp client for HTTP2/HTTPS2 and through the Ning client
 * for every other protocol. Any throwable raised while preparing or dispatching the request is
 * reported to the exception notifier and saved as a failed healthcheck.
 *
 * @param task task to healthcheck
 */
@VisibleForTesting
void asyncHealthcheck(final SingularityTask task) {
  final Optional<String> uri = getHealthcheckUri(task);
  final SingularityHealthcheckAsyncHandler handler = new SingularityHealthcheckAsyncHandler(
    exceptionNotifier,
    configuration,
    this,
    newTaskChecker,
    taskManager,
    task
  );

  if (!uri.isPresent()) {
    saveFailure(handler, "Invalid healthcheck uri or ports not present");
    return;
  }

  handler.setHealthcheckUri(uri.get());

  // getHealthcheckUri() returns empty when the deploy has no healthcheck options, so a
  // present URI means the options exist.
  final HealthcheckOptions options = task
    .getTaskRequest()
    .getDeploy()
    .getHealthcheck()
    .get();
  final String method = options.getMethod().orElse(HealthcheckMethod.GET).getMethod();
  final int timeoutSeconds = options
    .getResponseTimeoutSeconds()
    .orElse(configuration.getHealthcheckTimeoutSeconds());
  final HealthcheckProtocol protocol = options
    .getProtocol()
    .orElse(HealthcheckProtocol.HTTP);

  try {
    LOG.trace(
      "Issuing a healthcheck ({}) for task {} with timeout {}s",
      uri.get(),
      task.getTaskId(),
      timeoutSeconds
    );

    if (
      protocol == HealthcheckProtocol.HTTP2 || protocol == HealthcheckProtocol.HTTPS2
    ) {
      // OkHttp rejects a null body for methods that require one (e.g. POST) and rejects a
      // non-null body for GET/HEAD, so send an empty body for everything except GET/HEAD.
      final okhttp3.RequestBody body = ("GET".equals(method) || "HEAD".equals(method))
        ? null
        : okhttp3.RequestBody.create(null, new byte[0]);

      // Creates a lightweight new client which shares the underlying resource pools of the original instance.
      http2
        .newBuilder()
        .retryOnConnectionFailure(false)
        .followRedirects(true)
        .connectTimeout(timeoutSeconds, TimeUnit.SECONDS)
        .readTimeout(timeoutSeconds, TimeUnit.SECONDS)
        .cache(null)
        .build()
        .newCall(
          new okhttp3.Request.Builder().method(method, body).url(uri.get()).build()
        )
        .enqueue(wrappedHttp2Handler(handler));
    } else {
      // Use the configured method here as well, not a hard-coded GET.
      RequestBuilder builder = new RequestBuilder(method);
      builder.setFollowRedirects(true);
      builder.setUrl(uri.get());
      builder.setRequestTimeout((int) TimeUnit.SECONDS.toMillis(timeoutSeconds));

      http.prepareRequest(builder.build()).execute(wrappedHttp1Handler(handler));
    }
  } catch (Throwable t) {
    LOG.debug(
      "Exception while preparing healthcheck ({}) for task ({})",
      uri.get(),
      task.getTaskId(),
      t
    );
    exceptionNotifier.notify(
      String.format("Error preparing healthcheck (%s)", t.getMessage()),
      t,
      ImmutableMap.of("taskId", task.getTaskId().toString())
    );
    saveFailure(
      handler,
      String.format(
        "Healthcheck (%s) failed due to exception: %s",
        uri.get(),
        t.getMessage()
      )
    );
  }
}
}