package com.hubspot.singularity.mesos;
import com.codahale.metrics.Histogram;
import com.codahale.metrics.Meter;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Multiset;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.google.inject.name.Named;
import com.hubspot.mesos.JavaUtils;
import com.hubspot.mesos.protos.MesosTaskState;
import com.hubspot.singularity.ExtendedTaskState;
import com.hubspot.singularity.InvalidSingularityTaskIdException;
import com.hubspot.singularity.LoadBalancerRequestType;
import com.hubspot.singularity.RequestType;
import com.hubspot.singularity.SingularityCreateResult;
import com.hubspot.singularity.SingularityMainModule;
import com.hubspot.singularity.SingularityManagedThreadPoolFactory;
import com.hubspot.singularity.SingularityPendingDeploy;
import com.hubspot.singularity.SingularityPendingRequest;
import com.hubspot.singularity.SingularityPendingRequest.PendingType;
import com.hubspot.singularity.SingularityPendingRequestBuilder;
import com.hubspot.singularity.SingularityPendingTask;
import com.hubspot.singularity.SingularityRequestWithState;
import com.hubspot.singularity.SingularityTask;
import com.hubspot.singularity.SingularityTaskCleanup;
import com.hubspot.singularity.SingularityTaskHistory;
import com.hubspot.singularity.SingularityTaskHistoryUpdate;
import com.hubspot.singularity.SingularityTaskId;
import com.hubspot.singularity.SingularityTaskStatusHolder;
import com.hubspot.singularity.TaskCleanupType;
import com.hubspot.singularity.async.ExecutorAndQueue;
import com.hubspot.singularity.config.SingularityConfiguration;
import com.hubspot.singularity.data.DeployManager;
import com.hubspot.singularity.data.RequestManager;
import com.hubspot.singularity.data.TaskManager;
import com.hubspot.singularity.data.transcoders.IdTranscoder;
import com.hubspot.singularity.data.transcoders.SingularityTranscoderException;
import com.hubspot.singularity.helpers.MesosProtosUtils;
import com.hubspot.singularity.helpers.MesosUtils;
import com.hubspot.singularity.scheduler.SingularityHealthchecker;
import com.hubspot.singularity.scheduler.SingularityLeaderCache;
import com.hubspot.singularity.scheduler.SingularityNewTaskChecker;
import com.hubspot.singularity.scheduler.SingularityScheduler;
import com.hubspot.singularity.sentry.SingularityExceptionNotifier;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.CompletableFuture;
import org.apache.mesos.v1.Protos;
import org.apache.mesos.v1.Protos.TaskState;
import org.apache.mesos.v1.Protos.TaskStatus;
import org.apache.mesos.v1.Protos.TaskStatus.Reason;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
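/**
 * Handles Mesos task status updates: deduplicates and persists them, triggers healthchecks
 * and new-task checks, relaunches eligible lost tasks, and hands completed tasks off to the
 * scheduler. Updates are processed asynchronously on a bounded executor, serialized per
 * request via the scheduler lock.
 */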
@Singleton
public class SingularityMesosStatusUpdateHandler {
private static final Logger LOG = LoggerFactory.getLogger(
SingularityMesosStatusUpdateHandler.class
);
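// Mesos states in which a task is considered live; used to detect recovery updates for
// tasks that are no longer in the active list.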
private static final Set<MesosTaskState> ACTIVE_STATES = ImmutableSet.of(
MesosTaskState.TASK_STAGING,
MesosTaskState.TASK_STARTING,
MesosTaskState.TASK_RUNNING
);
private final TaskManager taskManager;
private final DeployManager deployManager;
private final RequestManager requestManager;
private final IdTranscoder<SingularityTaskId> taskIdTranscoder;
private final SingularityExceptionNotifier exceptionNotifier;
private final SingularityHealthchecker healthchecker;
private final SingularityNewTaskChecker newTaskChecker;
private final SingularityAgentAndRackManager agentAndRackManager;
private final SingularityMesosExecutorInfoSupport logSupport;
private final SingularityScheduler scheduler;
private final SingularityLeaderCache leaderCache;
private final MesosProtosUtils mesosProtosUtils;
private final String serverId;
private final SingularitySchedulerLock schedulerLock;
private final SingularityConfiguration configuration;
private final Multiset<Reason> taskLostReasons;
private final Meter lostTasksMeter;
private final Histogram statusUpdateDeltas;
private final ExecutorAndQueue statusUpdatesExecutor;
@Inject
public SingularityMesosStatusUpdateHandler(
TaskManager taskManager,
DeployManager deployManager,
RequestManager requestManager,
IdTranscoder<SingularityTaskId> taskIdTranscoder,
SingularityExceptionNotifier exceptionNotifier,
SingularityHealthchecker healthchecker,
SingularityNewTaskChecker newTaskChecker,
SingularityAgentAndRackManager agentAndRackManager,
SingularityMesosExecutorInfoSupport logSupport,
SingularityScheduler scheduler,
@Named(SingularityMainModule.SERVER_ID_PROPERTY) String serverId,
SingularitySchedulerLock schedulerLock,
SingularityConfiguration configuration,
SingularityLeaderCache leaderCache,
MesosProtosUtils mesosProtosUtils,
SingularityManagedThreadPoolFactory threadPoolFactory,
@Named(
SingularityMesosModule.TASK_LOST_REASONS_COUNTER
) Multiset<Reason> taskLostReasons,
@Named(SingularityMainModule.LOST_TASKS_METER) Meter lostTasksMeter,
@Named(SingularityMainModule.STATUS_UPDATE_DELTAS) Histogram statusUpdateDeltas
) {
this.taskManager = taskManager;
this.deployManager = deployManager;
this.requestManager = requestManager;
this.taskIdTranscoder = taskIdTranscoder;
this.exceptionNotifier = exceptionNotifier;
this.healthchecker = healthchecker;
this.newTaskChecker = newTaskChecker;
this.agentAndRackManager = agentAndRackManager;
this.logSupport = logSupport;
this.scheduler = scheduler;
this.leaderCache = leaderCache;
this.mesosProtosUtils = mesosProtosUtils;
this.serverId = serverId;
this.schedulerLock = schedulerLock;
this.configuration = configuration;
this.taskLostReasons = taskLostReasons;
this.lostTasksMeter = lostTasksMeter;
this.statusUpdateDeltas = statusUpdateDeltas;
this.statusUpdatesExecutor =
threadPoolFactory.get(
"status-updates",
configuration.getMesosConfiguration().getStatusUpdateConcurrencyLimit(),
configuration.getMesosConfiguration().getMaxStatusUpdateQueueSize()
);
}
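/**
 * A recovery update: we no longer have an active status for this task (it was already
 * removed from the active list), but Mesos reports it in a live (staging/starting/running)
 * state.
 */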
private boolean isRecoveryStatusUpdate(
Optional<SingularityTaskStatusHolder> previousTaskStatusHolder,
Reason reason,
ExtendedTaskState taskState,
final SingularityTaskStatusHolder newTaskStatusHolder
) {
if (
!previousTaskStatusHolder.isPresent() && // Task was already removed from the active list
!taskState.isDone() &&
newTaskStatusHolder.getTaskStatus().isPresent() &&
ACTIVE_STATES.contains(newTaskStatusHolder.getTaskStatus().get().getState())
) {
LOG.warn(
"Task {} recovered but may have already been replaced",
newTaskStatusHolder.getTaskId()
);
return true;
}
return false;
}
/**
 * 1 - we have a previous update, and this is a duplicate of it (ignore)
 * 2 - we don't have a previous update, 2 cases:
 *     a - this task has already been destroyed (we can ignore it then)
 *     b - we've never heard of this task (very unlikely since we first write a status into zk
 *         before we launch a task)
 */
private boolean isDuplicateOrIgnorableStatusUpdate(
Optional<SingularityTaskStatusHolder> previousTaskStatusHolder,
final SingularityTaskStatusHolder newTaskStatusHolder
) {
if (!previousTaskStatusHolder.isPresent()) {
return true;
}
if (!previousTaskStatusHolder.get().getTaskStatus().isPresent()) { // this is our launch state
return false;
}
return (
previousTaskStatusHolder.get().getTaskStatus().get().getState() ==
newTaskStatusHolder.getTaskStatus().get().getState()
);
}
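// Persist the latest status for tasks that are still live; terminal states delete the
// last-active status entry instead.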
private void saveNewTaskStatusHolder(
SingularityTaskId taskIdObj,
SingularityTaskStatusHolder newTaskStatusHolder,
ExtendedTaskState taskState
) {
if (taskState.isDone()) {
taskManager.deleteLastActiveTaskStatus(taskIdObj);
} else {
taskManager.saveLastActiveTaskStatus(newTaskStatusHolder);
}
}
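// Parse the Mesos task id string into a SingularityTaskId, notifying and returning empty
// for ids that cannot be parsed.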
private Optional<SingularityTaskId> getTaskId(String taskId) {
try {
return Optional.of(taskIdTranscoder.fromString(taskId));
} catch (InvalidSingularityTaskIdException | SingularityTranscoderException e) {
exceptionNotifier.notify(String.format("Unexpected taskId %s", taskId), e);
LOG.error("Unexpected taskId {} ", taskId, e);
return Optional.empty();
}
}
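// Prefer the message provided by Mesos; otherwise build a human-readable message for
// container memory/disk limit violations, including configured limits when available.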
private Optional<String> getStatusMessage(
Protos.TaskStatus status,
Optional<SingularityTask> task
) {
if (status.hasMessage() && !Strings.isNullOrEmpty(status.getMessage())) {
return Optional.of(status.getMessage());
} else if (
status.hasReason() &&
status.getReason() == Reason.REASON_CONTAINER_LIMITATION_MEMORY
) {
if (
task.isPresent() &&
task.get().getTaskRequest().getDeploy().getResources().isPresent()
) {
if (
task.get().getTaskRequest().getDeploy().getResources().get().getDiskMb() > 0
) {
return Optional.of(
String.format(
"Task exceeded one or more memory limits (%s MB mem, %s MB disk).",
task.get().getTaskRequest().getDeploy().getResources().get().getMemoryMb(),
task.get().getTaskRequest().getDeploy().getResources().get().getDiskMb()
)
);
} else {
return Optional.of(
String.format(
"Task exceeded memory limit (%s MB mem).",
task.get().getTaskRequest().getDeploy().getResources().get().getMemoryMb()
)
);
}
}
return Optional.of("Task exceeded memory limit.");
} else if (
status.hasReason() && status.getReason() == Reason.REASON_CONTAINER_LIMITATION_DISK
) {
if (
task.isPresent() &&
task.get().getTaskRequest().getDeploy().getResources().isPresent()
) {
return Optional.of(
String.format(
"Task exceeded disk limit (%s MB disk).",
task.get().getTaskRequest().getDeploy().getResources().get().getDiskMb()
)
);
} else {
return Optional.of("Task exceeded disk limit.");
}
}
return Optional.empty();
}
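// Re-enqueue the task's request as a RETRY, carrying over the original pending task's
// run id, command line args, overrides, and scheduling details.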
private void relaunchTask(SingularityTask task) {
SingularityPendingTask pendingTask = task.getTaskRequest().getPendingTask();
SingularityPendingRequest pendingRequest = new SingularityPendingRequestBuilder()
.setRequestId(task.getTaskRequest().getRequest().getId())
.setDeployId(task.getTaskRequest().getDeploy().getId())
.setPendingType(PendingType.RETRY)
.setUser(pendingTask.getUser())
.setRunId(pendingTask.getRunId())
.setCmdLineArgsList(pendingTask.getCmdLineArgsList())
.setSkipHealthchecks(pendingTask.getSkipHealthchecks())
.setMessage(pendingTask.getMessage())
.setResources(pendingTask.getResources())
.setS3UploaderAdditionalFiles(pendingTask.getS3UploaderAdditionalFiles())
.setRunAsUserOverride(pendingTask.getRunAsUserOverride())
.setEnvOverrides(pendingTask.getEnvOverrides())
.setExtraArtifacts(pendingTask.getExtraArtifacts())
.setActionId(pendingTask.getActionId())
.setRunAt(pendingTask.getPendingTaskId().getNextRunAt())
.setTimestamp(System.currentTimeMillis())
.build();
requestManager.addToPendingQueue(pendingRequest);
}
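/**
 * Core status update processing. Not safe to call concurrently for the same request;
 * callers must hold the per-request scheduler lock (see processStatusUpdateAsync).
 */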
private StatusUpdateResult unsafeProcessStatusUpdate(
Protos.TaskStatus status,
SingularityTaskId taskIdObj
) {
final String taskId = status.getTaskId().getValue();
long timestamp = System.currentTimeMillis();
if (status.hasTimestamp()) {
timestamp = (long) (status.getTimestamp() * 1000);
}
long now = System.currentTimeMillis();
long delta = now - timestamp;
LOG.debug(
"Update: task {} is now {} ({}) at {} (delta: {})",
taskId,
status.getState(),
status.getMessage(),
timestamp,
JavaUtils.durationFromMillis(delta)
);
statusUpdateDeltas.update(delta);
final SingularityTaskStatusHolder newTaskStatusHolder = new SingularityTaskStatusHolder(
taskIdObj,
Optional.of(mesosProtosUtils.taskStatusFromProtos(status)),
System.currentTimeMillis(),
serverId,
Optional.empty()
);
final Optional<SingularityTaskStatusHolder> previousTaskStatusHolder = taskManager.getLastActiveTaskStatus(
taskIdObj
);
final ExtendedTaskState taskState = MesosUtils.fromTaskState(status.getState());
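// Recovery path: attempt to reactivate the task. If its history is gone we ask Mesos to
// kill it, and if load balancer removal has already started we schedule a cleanup instead.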
if (
isRecoveryStatusUpdate(
previousTaskStatusHolder,
status.getReason(),
taskState,
newTaskStatusHolder
)
) {
LOG.info(
"Found recovery status update with reason {} for task {}",
status.getReason(),
taskId
);
final Optional<SingularityTaskHistory> maybeTaskHistory = taskManager.getTaskHistory(
taskIdObj
);
if (
!maybeTaskHistory.isPresent() ||
!maybeTaskHistory.get().getLastTaskUpdate().isPresent()
) {
LOG.warn(
"Task {} not found to recover, it may have already been persisted. Triggering a kill via mesos",
taskIdObj
);
return StatusUpdateResult.KILL_TASK;
} else if (
maybeTaskHistory.isPresent() &&
status.getReason() == Reason.REASON_AGENT_REREGISTERED
) {
boolean lbRemovalStarted = taskManager
.getLoadBalancerState(taskIdObj, LoadBalancerRequestType.REMOVE)
.isPresent();
if (lbRemovalStarted) {
taskManager.createTaskCleanup(
new SingularityTaskCleanup(
Optional.empty(),
TaskCleanupType.DECOMISSIONING,
System.currentTimeMillis(),
taskIdObj,
Optional.of(
"Agent re-registered after load balancer removal started. Task cannot be reactivated."
),
Optional.empty(),
Optional.empty()
)
);
return StatusUpdateResult.DONE;
}
}
boolean reactivated = taskManager.reactivateTask(
taskIdObj,
taskState,
newTaskStatusHolder,
Optional.ofNullable(status.getMessage()),
status.hasReason()
? Optional.of(status.getReason().name())
: Optional.empty()
);
if (reactivated) {
requestManager.addToPendingQueue(
new SingularityPendingRequest(
taskIdObj.getRequestId(),
taskIdObj.getDeployId(),
now,
Optional.empty(),
PendingType.TASK_RECOVERED,
Optional.empty(),
Optional.of(
String.format("Agent %s recovered", status.getAgentId().getValue())
)
)
);
return StatusUpdateResult.DONE;
} else {
return StatusUpdateResult.KILL_TASK;
}
} else if (
isDuplicateOrIgnorableStatusUpdate(previousTaskStatusHolder, newTaskStatusHolder)
) {
LOG.trace("Ignoring status update {} to {}", taskState, taskIdObj);
saveNewTaskStatusHolder(taskIdObj, newTaskStatusHolder, taskState);
return StatusUpdateResult.IGNORED;
}
final Optional<SingularityTask> task = taskManager.getTask(taskIdObj);
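// TASK_LOST updates caused by Mesos-side failures are relaunched for non-long-running
// request types, and lost-task reasons feed disaster detection when enabled.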
if (status.getState() == TaskState.TASK_LOST) {
boolean isMesosFailure =
status.getReason() == Reason.REASON_INVALID_OFFERS ||
status.getReason() == Reason.REASON_AGENT_REMOVED ||
status.getReason() == Reason.REASON_AGENT_RESTARTED ||
status.getReason() == Reason.REASON_AGENT_UNKNOWN ||
status.getReason() == Reason.REASON_MASTER_DISCONNECTED ||
status.getReason() == Reason.REASON_AGENT_DISCONNECTED;
RequestType requestType = task.isPresent()
? task.get().getTaskRequest().getRequest().getRequestType()
: null;
boolean isRelaunchable = requestType != null && !requestType.isLongRunning();
if (isMesosFailure && isRelaunchable) {
LOG.info("Relaunching lost task {}", task);
relaunchTask(task.get());
}
lostTasksMeter.mark();
if (configuration.getDisasterDetection().isEnabled()) {
taskLostReasons.add(status.getReason());
}
}
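// Task is still live: enqueue a healthcheck on TASK_RUNNING, and a new-task check unless
// the task belongs to the currently pending deploy.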
if (!taskState.isDone()) {
if (task.isPresent()) {
final Optional<SingularityPendingDeploy> pendingDeploy = deployManager.getPendingDeploy(
taskIdObj.getRequestId()
);
Optional<SingularityRequestWithState> requestWithState = Optional.empty();
if (taskState == ExtendedTaskState.TASK_RUNNING) {
requestWithState = requestManager.getRequest(taskIdObj.getRequestId());
healthchecker.enqueueHealthcheck(task.get(), pendingDeploy, requestWithState);
}
if (
!pendingDeploy.isPresent() ||
!pendingDeploy
.get()
.getDeployMarker()
.getDeployId()
.equals(taskIdObj.getDeployId())
) {
if (!requestWithState.isPresent()) {
requestWithState = requestManager.getRequest(taskIdObj.getRequestId());
}
newTaskChecker.enqueueNewTaskCheck(task.get(), requestWithState, healthchecker);
}
} else {
final String message = String.format(
"Task %s is active but is missing task data",
taskId
);
exceptionNotifier.notify(message);
LOG.error(message);
}
}
final Optional<String> statusMessage = getStatusMessage(status, task);
final SingularityTaskHistoryUpdate taskUpdate = new SingularityTaskHistoryUpdate(
taskIdObj,
timestamp,
taskState,
statusMessage,
status.hasReason()
? Optional.of(status.getReason().name())
: Optional.empty()
);
final SingularityCreateResult taskHistoryUpdateCreateResult = taskManager.saveTaskHistoryUpdate(
taskUpdate
);
logSupport.checkDirectoryAndContainerId(taskIdObj);
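// Terminal state: cancel outstanding checks, clear any killed record, and hand the
// completed task off to the scheduler.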
if (taskState.isDone()) {
healthchecker.cancelHealthcheck(taskId);
newTaskChecker.cancelNewTaskCheck(taskId);
taskManager.deleteKilledRecord(taskIdObj);
handleCompletedTaskState(
status,
taskIdObj,
taskState,
taskHistoryUpdateCreateResult,
task,
timestamp
);
}
saveNewTaskStatusHolder(taskIdObj, newTaskStatusHolder, taskState);
return StatusUpdateResult.DONE;
}
private synchronized void handleCompletedTaskState(
TaskStatus status,
SingularityTaskId taskIdObj,
ExtendedTaskState taskState,
SingularityCreateResult taskHistoryUpdateCreateResult,
Optional<SingularityTask> task,
long timestamp
) {
// Method synchronized to prevent race condition where two tasks complete at the same time but the leader cache holding the state
// doesn't get updated between each task completion. If this were to happen, then agents would never transition from DECOMMISSIONING to
// DECOMMISSIONED because each task state check thinks the other task is still running.
agentAndRackManager.checkStateAfterFinishedTask(
taskIdObj,
status.getAgentId().getValue(),
leaderCache
);
scheduler.handleCompletedTask(
task,
taskIdObj,
timestamp,
taskState,
taskHistoryUpdateCreateResult,
status
);
}
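// Whether the bounded status update queue still has capacity for another update.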
public boolean hasRoomForMoreUpdates() {
return (
statusUpdatesExecutor.getQueue().size() < statusUpdatesExecutor.getQueueLimit()
);
}
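// Entry point: parse the task id, then process the update on the status-updates executor
// while holding the per-request scheduler lock.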
public CompletableFuture<StatusUpdateResult> processStatusUpdateAsync(
Protos.TaskStatus status
) {
return CompletableFuture.supplyAsync(
() -> {
final String taskId = status.getTaskId().getValue();
final Optional<SingularityTaskId> maybeTaskId = getTaskId(taskId);
if (!maybeTaskId.isPresent()) {
return StatusUpdateResult.INVALID_TASK_ID;
}
return schedulerLock.runWithRequestLockAndReturn(
() -> unsafeProcessStatusUpdate(status, maybeTaskId.get()),
maybeTaskId.get().getRequestId(),
getClass().getSimpleName()
);
},
statusUpdatesExecutor.getExecutorService()
);
}
}