
// com.hubspot.singularity.mesos.SingularityMesosStatusUpdateHandler (Maven / Gradle / Ivy)
package com.hubspot.singularity.mesos;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.locks.Lock;
import org.apache.mesos.Protos;
import org.apache.mesos.Protos.TaskState;
import org.apache.mesos.Protos.TaskStatus.Reason;
import org.apache.mesos.SchedulerDriver;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.codahale.metrics.Meter;
import com.codahale.metrics.annotation.Timed;
import com.google.common.base.Optional;
import com.google.common.base.Strings;
import com.google.common.collect.Multiset;
import com.google.inject.Inject;
import com.google.inject.Provider;
import com.google.inject.Singleton;
import com.google.inject.name.Named;
import com.hubspot.singularity.ExtendedTaskState;
import com.hubspot.singularity.InvalidSingularityTaskIdException;
import com.hubspot.singularity.SingularityAbort;
import com.hubspot.singularity.SingularityAbort.AbortReason;
import com.hubspot.singularity.SingularityCreateResult;
import com.hubspot.singularity.SingularityMainModule;
import com.hubspot.singularity.SingularityPendingDeploy;
import com.hubspot.singularity.SingularityRequestWithState;
import com.hubspot.singularity.SingularityTask;
import com.hubspot.singularity.SingularityTaskHistoryUpdate;
import com.hubspot.singularity.SingularityTaskId;
import com.hubspot.singularity.SingularityTaskStatusHolder;
import com.hubspot.singularity.config.SingularityConfiguration;
import com.hubspot.singularity.data.DeployManager;
import com.hubspot.singularity.data.RequestManager;
import com.hubspot.singularity.data.TaskManager;
import com.hubspot.singularity.data.transcoders.IdTranscoder;
import com.hubspot.singularity.data.transcoders.SingularityTranscoderException;
import com.hubspot.singularity.scheduler.SingularityHealthchecker;
import com.hubspot.singularity.scheduler.SingularityNewTaskChecker;
import com.hubspot.singularity.scheduler.SingularityScheduler;
import com.hubspot.singularity.scheduler.SingularitySchedulerStateCache;
import com.hubspot.singularity.sentry.SingularityExceptionNotifier;
import io.dropwizard.lifecycle.Managed;
@Singleton
public class SingularityMesosStatusUpdateHandler implements Managed {
private static final Logger LOG = LoggerFactory.getLogger(SingularityMesosStatusUpdateHandler.class);
private final TaskManager taskManager;
private final DeployManager deployManager;
private final RequestManager requestManager;
private final IdTranscoder taskIdTranscoder;
private final SingularityExceptionNotifier exceptionNotifier;
private final SingularityHealthchecker healthchecker;
private final SingularityNewTaskChecker newTaskChecker;
private final SingularitySlaveAndRackManager slaveAndRackManager;
private final SingularityMesosExecutorInfoSupport logSupport;
private final SingularityScheduler scheduler;
private final Provider stateCacheProvider;
private final String serverId;
private final BlockingQueue statusUpdateQueue;
private final ExecutorService executorService;
private final SchedulerDriverSupplier schedulerDriverSupplier;
private final AtomicBoolean handlerStarted;
private final Lock schedulerLock;
private final boolean processStatusUpdatesInSeparateThread;
private final SingularityAbort singularityAbort;
private final SingularityConfiguration configuration;
private final Multiset taskLostReasons;
private final Meter lostTasksMeter;
private Future statusUpdateFuture;
@Inject
public SingularityMesosStatusUpdateHandler(TaskManager taskManager, DeployManager deployManager, RequestManager requestManager,
IdTranscoder taskIdTranscoder, SingularityExceptionNotifier exceptionNotifier, SingularityHealthchecker healthchecker,
SingularityNewTaskChecker newTaskChecker, SingularitySlaveAndRackManager slaveAndRackManager, SingularityMesosExecutorInfoSupport logSupport, SingularityScheduler scheduler,
Provider stateCacheProvider, @Named(SingularityMainModule.SERVER_ID_PROPERTY) String serverId,
SchedulerDriverSupplier schedulerDriverSupplier,
@Named(SingularityMesosModule.SCHEDULER_LOCK_NAME) final Lock schedulerLock,
@Named(SingularityMainModule.STATUS_UPDATE_THREADPOOL_NAME) ScheduledExecutorService executorService,
SingularityConfiguration configuration,
SingularityAbort singularityAbort,
@Named(SingularityMesosModule.TASK_LOST_REASONS_COUNTER) Multiset taskLostReasons,
@Named(SingularityMainModule.LOST_TASKS_METER) Meter lostTasksMeter) {
this.taskManager = taskManager;
this.deployManager = deployManager;
this.requestManager = requestManager;
this.taskIdTranscoder = taskIdTranscoder;
this.exceptionNotifier = exceptionNotifier;
this.healthchecker = healthchecker;
this.newTaskChecker = newTaskChecker;
this.slaveAndRackManager = slaveAndRackManager;
this.logSupport = logSupport;
this.scheduler = scheduler;
this.stateCacheProvider = stateCacheProvider;
this.serverId = serverId;
this.schedulerDriverSupplier = schedulerDriverSupplier;
this.schedulerLock = schedulerLock;
this.singularityAbort = singularityAbort;
this.configuration = configuration;
this.taskLostReasons = taskLostReasons;
this.lostTasksMeter = lostTasksMeter;
this.handlerStarted = new AtomicBoolean();
this.statusUpdateQueue = new ArrayBlockingQueue<>(configuration.getStatusUpdateQueueCapacity());
this.executorService = executorService;
this.processStatusUpdatesInSeparateThread = configuration.isProcessStatusUpdatesInSeparateThread();
}
/**
* 1- we have a previous update, and this is a duplicate of it (ignore) 2- we don't have a
* previous update, 2 cases: a - this task has already been destroyed (we can ignore it then) b -
* we've never heard of this task (very unlikely since we first write a status into zk before we
* launch a task)
*/
private boolean isDuplicateOrIgnorableStatusUpdate(Optional previousTaskStatusHolder, final SingularityTaskStatusHolder newTaskStatusHolder) {
if (!previousTaskStatusHolder.isPresent()) {
return true;
}
if (!previousTaskStatusHolder.get().getTaskStatus().isPresent()) { // this is our launch state
return false;
}
return previousTaskStatusHolder.get().getTaskStatus().get().getState() == newTaskStatusHolder.getTaskStatus().get().getState();
}
private void saveNewTaskStatusHolder(SingularityTaskId taskIdObj, SingularityTaskStatusHolder newTaskStatusHolder, ExtendedTaskState taskState) {
if (taskState.isDone()) {
taskManager.deleteLastActiveTaskStatus(taskIdObj);
} else {
taskManager.saveLastActiveTaskStatus(newTaskStatusHolder);
}
}
private Optional getTaskId(String taskId) {
try {
return Optional.of(taskIdTranscoder.fromString(taskId));
} catch (InvalidSingularityTaskIdException | SingularityTranscoderException e) {
exceptionNotifier.notify(String.format("Unexpected taskId %s", taskId), e);
LOG.error("Unexpected taskId {} ", taskId, e);
return Optional.absent();
}
}
private Optional getStatusMessage(Protos.TaskStatus status, Optional task) {
if (status.hasMessage() && !Strings.isNullOrEmpty(status.getMessage())) {
return Optional.of(status.getMessage());
} else if (status.hasReason() && status.getReason() == Reason.REASON_CONTAINER_LIMITATION_MEMORY) {
if (task.isPresent() && task.get().getTaskRequest().getDeploy().getResources().isPresent()) {
if (task.get().getTaskRequest().getDeploy().getResources().get().getDiskMb() > 0) {
return Optional.of(String.format("Task exceeded one or more memory limits (%s MB mem, %s MB disk).", task.get().getTaskRequest().getDeploy().getResources().get().getMemoryMb(), task.get().getTaskRequest().getDeploy().getResources().get().getDiskMb()));
} else {
return Optional.of(String.format("Task exceeded memory limit (%s MB mem).", task.get().getTaskRequest().getDeploy().getResources().get().getMemoryMb()));
}
}
return Optional.of("Task exceeded memory limit");
}
return Optional.absent();
}
private void updateDisasterStats(Protos.TaskStatus status) {
}
private SchedulerDriver getSchedulerDriver() {
final Optional maybeSchedulerDriver = schedulerDriverSupplier.get();
if (!maybeSchedulerDriver.isPresent()) {
throw new RuntimeException("scheduler driver not present!");
// TODO: how best to handle?
}
return maybeSchedulerDriver.get();
}
private void unsafeProcessStatusUpdate(Protos.TaskStatus status) {
final String taskId = status.getTaskId().getValue();
long timestamp = System.currentTimeMillis();
if (status.hasTimestamp()) {
timestamp = (long) (status.getTimestamp() * 1000);
}
LOG.debug("Task {} is now {} ({}) at {} ", taskId, status.getState(), status.getMessage(), timestamp);
final Optional maybeTaskId = getTaskId(taskId);
if (!maybeTaskId.isPresent()) {
getSchedulerDriver().acknowledgeStatusUpdate(status);
return;
}
final SingularityTaskId taskIdObj = maybeTaskId.get();
final SingularityTaskStatusHolder newTaskStatusHolder = new SingularityTaskStatusHolder(taskIdObj, Optional.of(status), System.currentTimeMillis(), serverId, Optional.absent());
final Optional previousTaskStatusHolder = taskManager.getLastActiveTaskStatus(taskIdObj);
final ExtendedTaskState taskState = ExtendedTaskState.fromTaskState(status.getState());
if (isDuplicateOrIgnorableStatusUpdate(previousTaskStatusHolder, newTaskStatusHolder)) {
LOG.trace("Ignoring status update {} to {}", taskState, taskIdObj);
saveNewTaskStatusHolder(taskIdObj, newTaskStatusHolder, taskState);
getSchedulerDriver().acknowledgeStatusUpdate(status);
return;
}
if (status.getState() == TaskState.TASK_LOST) {
lostTasksMeter.mark();
if (configuration.getDisasterDetection().isEnabled()) {
taskLostReasons.add(status.getReason());
}
}
final Optional task = taskManager.getTask(taskIdObj);
final boolean isActiveTask = taskManager.isActiveTask(taskId);
if (isActiveTask && !taskState.isDone()) {
if (task.isPresent()) {
final Optional pendingDeploy = deployManager.getPendingDeploy(taskIdObj.getRequestId());
Optional requestWithState = Optional.absent();
if (taskState == ExtendedTaskState.TASK_RUNNING) {
requestWithState = requestManager.getRequest(taskIdObj.getRequestId());
healthchecker.enqueueHealthcheck(task.get(), pendingDeploy, requestWithState);
}
if (!pendingDeploy.isPresent() || !pendingDeploy.get().getDeployMarker().getDeployId().equals(taskIdObj.getDeployId())) {
if (!requestWithState.isPresent()) {
requestWithState = requestManager.getRequest(taskIdObj.getRequestId());
}
newTaskChecker.enqueueNewTaskCheck(task.get(), requestWithState, healthchecker);
}
} else {
final String message = String.format("Task %s is active but is missing task data", taskId);
exceptionNotifier.notify(message);
LOG.error(message);
}
}
final Optional statusMessage = getStatusMessage(status, task);
final SingularityTaskHistoryUpdate taskUpdate =
new SingularityTaskHistoryUpdate(taskIdObj, timestamp, taskState, statusMessage, status.hasReason() ? Optional.of(status.getReason().name()) : Optional.absent());
final SingularityCreateResult taskHistoryUpdateCreateResult = taskManager.saveTaskHistoryUpdate(taskUpdate);
logSupport.checkDirectoryAndContainerId(taskIdObj);
if (taskState.isDone()) {
healthchecker.cancelHealthcheck(taskId);
newTaskChecker.cancelNewTaskCheck(taskId);
taskManager.deleteKilledRecord(taskIdObj);
SingularitySchedulerStateCache stateCache = stateCacheProvider.get();
slaveAndRackManager.checkStateAfterFinishedTask(taskIdObj, status.getSlaveId().getValue(), stateCache);
scheduler.handleCompletedTask(task, taskIdObj, isActiveTask, timestamp, taskState, taskHistoryUpdateCreateResult, stateCache, status);
}
saveNewTaskStatusHolder(taskIdObj, newTaskStatusHolder, taskState);
getSchedulerDriver().acknowledgeStatusUpdate(status);
}
@Timed
public void processStatusUpdate(Protos.TaskStatus status) {
schedulerLock.lock();
try {
unsafeProcessStatusUpdate(status);
} finally {
schedulerLock.unlock();
}
}
public void enqueueStatusUpdate(Protos.TaskStatus status) {
if (processStatusUpdatesInSeparateThread) {
try {
if (statusUpdateFuture == null || statusUpdateFuture.isDone()) {
singularityAbort.abort(AbortReason.NO_RUNNING_STATUS_UPDATE_THREAD, Optional.absent());
}
statusUpdateQueue.put(status);
} catch (InterruptedException ie) {
// If we do not ack the status update it will be resent, can log this and move on
LOG.error("Interrupted while adding status update to queue", ie);
}
} else {
processStatusUpdate(status);
}
}
@Override
public void start() {
if (!processStatusUpdatesInSeparateThread) {
return;
}
if (handlerStarted.getAndSet(true)) {
LOG.warn("StatusUpdateHandler already started!");
return;
}
statusUpdateFuture = executorService.submit(new Runnable() {
@Override
public void run() {
LOG.info("Status update handler thread started");
while (!Thread.currentThread().isInterrupted()) {
try {
final Protos.TaskStatus status = statusUpdateQueue.take();
LOG.info("Handling status update for {} {}", status.getTaskId().getValue(), status.getState());
processStatusUpdate(status);
} catch (InterruptedException ie) {
Thread.currentThread().interrupt();
break;
} catch (Throwable t) {
LOG.error("Caught exception in status update handler thread", t);
singularityAbort.abort(SingularityAbort.AbortReason.UNRECOVERABLE_ERROR, Optional.of(t));
}
}
}
});
}
@Override
public void stop() throws Exception {
// noop
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy