package com.hubspot.singularity.mesos;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import org.apache.mesos.v1.Protos;
import org.apache.mesos.v1.Protos.TaskState;
import org.apache.mesos.v1.Protos.TaskStatus;
import org.apache.mesos.v1.Protos.TaskStatus.Reason;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.codahale.metrics.Meter;
import com.google.common.base.Optional;
import com.google.common.base.Strings;
import com.google.common.collect.Multiset;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.google.inject.name.Named;
import com.hubspot.mesos.JavaUtils;
import com.hubspot.singularity.ExtendedTaskState;
import com.hubspot.singularity.InvalidSingularityTaskIdException;
import com.hubspot.singularity.RequestType;
import com.hubspot.singularity.SingularityCreateResult;
import com.hubspot.singularity.SingularityMainModule;
import com.hubspot.singularity.SingularityPendingDeploy;
import com.hubspot.singularity.SingularityPendingRequest;
import com.hubspot.singularity.SingularityPendingRequest.PendingType;
import com.hubspot.singularity.SingularityPendingRequestBuilder;
import com.hubspot.singularity.SingularityPendingTask;
import com.hubspot.singularity.SingularityRequestWithState;
import com.hubspot.singularity.SingularityTask;
import com.hubspot.singularity.SingularityTaskHistoryUpdate;
import com.hubspot.singularity.SingularityTaskId;
import com.hubspot.singularity.SingularityTaskStatusHolder;
import com.hubspot.singularity.async.AsyncSemaphore;
import com.hubspot.singularity.config.SingularityConfiguration;
import com.hubspot.singularity.data.DeployManager;
import com.hubspot.singularity.data.RequestManager;
import com.hubspot.singularity.data.TaskManager;
import com.hubspot.singularity.data.transcoders.IdTranscoder;
import com.hubspot.singularity.data.transcoders.SingularityTranscoderException;
import com.hubspot.singularity.helpers.MesosProtosUtils;
import com.hubspot.singularity.helpers.MesosUtils;
import com.hubspot.singularity.scheduler.SingularityHealthchecker;
import com.hubspot.singularity.scheduler.SingularityLeaderCache;
import com.hubspot.singularity.scheduler.SingularityNewTaskChecker;
import com.hubspot.singularity.scheduler.SingularityScheduler;
import com.hubspot.singularity.sentry.SingularityExceptionNotifier;
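/**
 * Processes task status updates received from Mesos: deduplicates updates, persists task history,
 * enqueues healthchecks and new-task checks for running tasks, meters and optionally relaunches
 * lost tasks, and hands completed tasks off to the scheduler.
 */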
@Singleton
public class SingularityMesosStatusUpdateHandler {
private static final Logger LOG = LoggerFactory.getLogger(SingularityMesosStatusUpdateHandler.class);
private final TaskManager taskManager;
private final DeployManager deployManager;
private final RequestManager requestManager;
private final IdTranscoder<SingularityTaskId> taskIdTranscoder;
private final SingularityExceptionNotifier exceptionNotifier;
private final SingularityHealthchecker healthchecker;
private final SingularityNewTaskChecker newTaskChecker;
private final SingularitySlaveAndRackManager slaveAndRackManager;
private final SingularityMesosExecutorInfoSupport logSupport;
private final SingularityScheduler scheduler;
private final SingularityLeaderCache leaderCache;
private final MesosProtosUtils mesosProtosUtils;
private final String serverId;
private final SingularitySchedulerLock schedulerLock;
private final SingularityConfiguration configuration;
private final Multiset<Reason> taskLostReasons;
private final Meter lostTasksMeter;
private final ConcurrentHashMap<Long, Long> statusUpdateDeltas;
private final ExecutorService statusUpdatesExecutor = Executors.newCachedThreadPool(new ThreadFactoryBuilder().setNameFormat("status-updates-%d").build());
private final AsyncSemaphore<Boolean> statusUpdatesSemaphore;
@Inject
public SingularityMesosStatusUpdateHandler(TaskManager taskManager,
DeployManager deployManager,
RequestManager requestManager,
IdTranscoder<SingularityTaskId> taskIdTranscoder,
SingularityExceptionNotifier exceptionNotifier,
SingularityHealthchecker healthchecker,
SingularityNewTaskChecker newTaskChecker,
SingularitySlaveAndRackManager slaveAndRackManager,
SingularityMesosExecutorInfoSupport logSupport,
SingularityScheduler scheduler,
@Named(SingularityMainModule.SERVER_ID_PROPERTY) String serverId,
SingularitySchedulerLock schedulerLock,
SingularityConfiguration configuration,
SingularityLeaderCache leaderCache,
MesosProtosUtils mesosProtosUtils,
@Named(SingularityMesosModule.TASK_LOST_REASONS_COUNTER) Multiset<Reason> taskLostReasons,
@Named(SingularityMainModule.LOST_TASKS_METER) Meter lostTasksMeter,
@Named(SingularityMainModule.STATUS_UPDATE_DELTAS) ConcurrentHashMap<Long, Long> statusUpdateDeltas) {
this.taskManager = taskManager;
this.deployManager = deployManager;
this.requestManager = requestManager;
this.taskIdTranscoder = taskIdTranscoder;
this.exceptionNotifier = exceptionNotifier;
this.healthchecker = healthchecker;
this.newTaskChecker = newTaskChecker;
this.slaveAndRackManager = slaveAndRackManager;
this.logSupport = logSupport;
this.scheduler = scheduler;
this.leaderCache = leaderCache;
this.mesosProtosUtils = mesosProtosUtils;
this.serverId = serverId;
this.schedulerLock = schedulerLock;
this.configuration = configuration;
this.taskLostReasons = taskLostReasons;
this.lostTasksMeter = lostTasksMeter;
this.statusUpdateDeltas = statusUpdateDeltas;
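// Bound the number of in-flight status updates; both the concurrency limit and the
// queue size come from the Mesos configuration.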
this.statusUpdatesSemaphore = AsyncSemaphore
.<Boolean>newBuilder(() -> configuration.getMesosConfiguration().getStatusUpdateConcurrencyLimit())
.withQueueSize(configuration.getMesosConfiguration().getMaxStatusUpdateQueueSize())
.build();
}
/**
 * 1 - we have a previous update, and this is a duplicate of it (ignore).
 * 2 - we don't have a previous update; two cases:
 *     a - this task has already been destroyed (we can ignore it then)
 *     b - we've never heard of this task (very unlikely, since we first write a status into zk
 *         before we launch a task)
 */
private boolean isDuplicateOrIgnorableStatusUpdate(Optional<SingularityTaskStatusHolder> previousTaskStatusHolder, final SingularityTaskStatusHolder newTaskStatusHolder) {
if (!previousTaskStatusHolder.isPresent()) {
return true;
}
if (!previousTaskStatusHolder.get().getTaskStatus().isPresent()) { // this is our launch state
return false;
}
return previousTaskStatusHolder.get().getTaskStatus().get().getState() == newTaskStatusHolder.getTaskStatus().get().getState();
}
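// Keep the last active task status in sync: terminal states remove the entry, all others overwrite it.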
private void saveNewTaskStatusHolder(SingularityTaskId taskIdObj, SingularityTaskStatusHolder newTaskStatusHolder, ExtendedTaskState taskState) {
if (taskState.isDone()) {
taskManager.deleteLastActiveTaskStatus(taskIdObj);
} else {
taskManager.saveLastActiveTaskStatus(newTaskStatusHolder);
}
}
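// Parse the Mesos task id into a SingularityTaskId; unparseable ids are reported and dropped.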
private Optional<SingularityTaskId> getTaskId(String taskId) {
try {
return Optional.of(taskIdTranscoder.fromString(taskId));
} catch (InvalidSingularityTaskIdException | SingularityTranscoderException e) {
exceptionNotifier.notify(String.format("Unexpected taskId %s", taskId), e);
LOG.error("Unexpected taskId {} ", taskId, e);
return Optional.absent();
}
}
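// Prefer the message supplied by Mesos; otherwise synthesize one for memory/disk limit violations,
// including the deploy's configured resources when they are available.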
private Optional<String> getStatusMessage(Protos.TaskStatus status, Optional<SingularityTask> task) {
if (status.hasMessage() && !Strings.isNullOrEmpty(status.getMessage())) {
return Optional.of(status.getMessage());
} else if (status.hasReason() && status.getReason() == Reason.REASON_CONTAINER_LIMITATION_MEMORY) {
if (task.isPresent() && task.get().getTaskRequest().getDeploy().getResources().isPresent()) {
if (task.get().getTaskRequest().getDeploy().getResources().get().getDiskMb() > 0) {
return Optional.of(String.format("Task exceeded one or more memory limits (%s MB mem, %s MB disk).", task.get().getTaskRequest().getDeploy().getResources().get().getMemoryMb(),
task.get().getTaskRequest().getDeploy().getResources().get().getDiskMb()));
} else {
return Optional.of(String.format("Task exceeded memory limit (%s MB mem).", task.get().getTaskRequest().getDeploy().getResources().get().getMemoryMb()));
}
}
return Optional.of("Task exceeded memory limit.");
} else if (status.hasReason() && status.getReason() == Reason.REASON_CONTAINER_LIMITATION_DISK) {
if (task.isPresent() && task.get().getTaskRequest().getDeploy().getResources().isPresent()) {
return Optional.of(String.format("Task exceeded disk limit (%s MB disk).", task.get().getTaskRequest().getDeploy().getResources().get().getDiskMb()));
} else {
return Optional.of("Task exceeded disk limit.");
}
}
return Optional.absent();
}
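// Re-enqueue a lost task as a RETRY pending request, carrying over the original pending task's settings.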
private void relaunchTask(SingularityTask task) {
SingularityPendingTask pendingTask = task.getTaskRequest().getPendingTask();
SingularityPendingRequest pendingRequest = new SingularityPendingRequestBuilder()
.setRequestId(task.getTaskRequest().getRequest().getId())
.setDeployId(task.getTaskRequest().getDeploy().getId())
.setPendingType(PendingType.RETRY)
.setUser(pendingTask.getUser())
.setRunId(pendingTask.getRunId())
.setCmdLineArgsList(pendingTask.getCmdLineArgsList())
.setSkipHealthchecks(pendingTask.getSkipHealthchecks())
.setMessage(pendingTask.getMessage())
.setResources(pendingTask.getResources())
.setS3UploaderAdditionalFiles(pendingTask.getS3UploaderAdditionalFiles())
.setRunAsUserOverride(pendingTask.getRunAsUserOverride())
.setEnvOverrides(pendingTask.getEnvOverrides())
.setExtraArtifacts(pendingTask.getExtraArtifacts())
.setActionId(pendingTask.getActionId())
.setRunAt(pendingTask.getPendingTaskId().getNextRunAt())
.setTimestamp(System.currentTimeMillis())
.build();
requestManager.addToPendingQueue(pendingRequest);
}
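// Core update handling. "Unsafe" because it assumes the caller already holds the request-level
// scheduler lock (see processStatusUpdateAsync).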
private void unsafeProcessStatusUpdate(Protos.TaskStatus status, SingularityTaskId taskIdObj) {
final String taskId = status.getTaskId().getValue();
long timestamp = System.currentTimeMillis();
if (status.hasTimestamp()) {
timestamp = (long) (status.getTimestamp() * 1000);
}
long now = System.currentTimeMillis();
long delta = now - timestamp;
LOG.debug("Update: task {} is now {} ({}) at {} (delta: {})", taskId, status.getState(), status.getMessage(), timestamp, JavaUtils.durationFromMillis(delta));
statusUpdateDeltas.put(now, delta);
final SingularityTaskStatusHolder newTaskStatusHolder = new SingularityTaskStatusHolder(taskIdObj, Optional.of(mesosProtosUtils.taskStatusFromProtos(status)), System.currentTimeMillis(), serverId, Optional.absent());
final Optional<SingularityTaskStatusHolder> previousTaskStatusHolder = taskManager.getLastActiveTaskStatus(taskIdObj);
final ExtendedTaskState taskState = MesosUtils.fromTaskState(status.getState());
if (isDuplicateOrIgnorableStatusUpdate(previousTaskStatusHolder, newTaskStatusHolder)) {
LOG.trace("Ignoring status update {} to {}", taskState, taskIdObj);
saveNewTaskStatusHolder(taskIdObj, newTaskStatusHolder, taskState);
return;
}
final Optional<SingularityTask> task = taskManager.getTask(taskIdObj);
if (status.getState() == TaskState.TASK_LOST) {
boolean isMesosFailure =
status.getReason() == Reason.REASON_INVALID_OFFERS
|| status.getReason() == Reason.REASON_AGENT_REMOVED
|| status.getReason() == Reason.REASON_AGENT_RESTARTED
|| status.getReason() == Reason.REASON_AGENT_UNKNOWN
|| status.getReason() == Reason.REASON_MASTER_DISCONNECTED
|| status.getReason() == Reason.REASON_AGENT_DISCONNECTED;
RequestType requestType = task.isPresent() ? task.get().getTaskRequest().getRequest().getRequestType() : null;
boolean isRelaunchable = requestType != null && !requestType.isLongRunning();
if (isMesosFailure && isRelaunchable) {
LOG.info("Relaunching lost task {}", task);
relaunchTask(task.get());
}
lostTasksMeter.mark();
if (configuration.getDisasterDetection().isEnabled()) {
taskLostReasons.add(status.getReason());
}
}
final boolean isActiveTask = taskManager.isActiveTask(taskId);
if (isActiveTask && !taskState.isDone()) {
if (task.isPresent()) {
final Optional<SingularityPendingDeploy> pendingDeploy = deployManager.getPendingDeploy(taskIdObj.getRequestId());
Optional<SingularityRequestWithState> requestWithState = Optional.absent();
if (taskState == ExtendedTaskState.TASK_RUNNING) {
requestWithState = requestManager.getRequest(taskIdObj.getRequestId());
healthchecker.enqueueHealthcheck(task.get(), pendingDeploy, requestWithState);
}
if (!pendingDeploy.isPresent() || !pendingDeploy.get().getDeployMarker().getDeployId().equals(taskIdObj.getDeployId())) {
if (!requestWithState.isPresent()) {
requestWithState = requestManager.getRequest(taskIdObj.getRequestId());
}
newTaskChecker.enqueueNewTaskCheck(task.get(), requestWithState, healthchecker);
}
} else {
final String message = String.format("Task %s is active but is missing task data", taskId);
exceptionNotifier.notify(message);
LOG.error(message);
}
}
final Optional<String> statusMessage = getStatusMessage(status, task);
final SingularityTaskHistoryUpdate taskUpdate =
new SingularityTaskHistoryUpdate(taskIdObj, timestamp, taskState, statusMessage, status.hasReason() ? Optional.of(status.getReason().name()) : Optional.absent());
final SingularityCreateResult taskHistoryUpdateCreateResult = taskManager.saveTaskHistoryUpdate(taskUpdate);
logSupport.checkDirectoryAndContainerId(taskIdObj);
if (taskState.isDone()) {
healthchecker.cancelHealthcheck(taskId);
newTaskChecker.cancelNewTaskCheck(taskId);
taskManager.deleteKilledRecord(taskIdObj);
handleCompletedTaskState(status, taskIdObj, taskState, taskHistoryUpdateCreateResult, task, timestamp, isActiveTask);
}
saveNewTaskStatusHolder(taskIdObj, newTaskStatusHolder, taskState);
}
private synchronized void handleCompletedTaskState(TaskStatus status, SingularityTaskId taskIdObj, ExtendedTaskState taskState,
SingularityCreateResult taskHistoryUpdateCreateResult, Optional<SingularityTask> task, long timestamp, boolean isActiveTask) {
// Method synchronized to prevent race condition where two tasks complete at the same time but the leader cache holding the state
// doesn't get updated between each task completion. If this were to happen, then slaves would never transition from DECOMMISSIONING to
// DECOMMISSIONED because each task state check thinks the other task is still running.
slaveAndRackManager.checkStateAfterFinishedTask(taskIdObj, status.getAgentId().getValue(), leaderCache);
scheduler.handleCompletedTask(task, taskIdObj, isActiveTask, timestamp, taskState, taskHistoryUpdateCreateResult, status);
}
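/**
 * Entry point for status updates. Processing is bounded by the status-update semaphore, runs on the
 * status-updates executor, and is serialized per request via the scheduler lock. The returned future
 * completes with false if the task id could not be parsed, true otherwise.
 */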
public CompletableFuture<Boolean> processStatusUpdateAsync(Protos.TaskStatus status) {
return statusUpdatesSemaphore.call(() -> CompletableFuture.supplyAsync(() -> {
final String taskId = status.getTaskId().getValue();
final Optional<SingularityTaskId> maybeTaskId = getTaskId(taskId);
if (!maybeTaskId.isPresent()) {
return false;
}
schedulerLock.runWithRequestLock(
() -> unsafeProcessStatusUpdate(status, maybeTaskId.get()),
maybeTaskId.get().getRequestId(),
getClass().getSimpleName()
);
return true;
}, statusUpdatesExecutor)
);
}
}