com.hubspot.singularity.executor.SingularityExecutorMonitor Maven / Gradle / Ivy
The newest version!
package com.hubspot.singularity.executor;
import com.google.common.base.Preconditions;
import com.google.common.collect.Maps;
import com.google.common.util.concurrent.FutureCallback;
import com.google.common.util.concurrent.Futures;
import com.google.common.util.concurrent.ListenableFuture;
import com.google.common.util.concurrent.ListeningExecutorService;
import com.google.common.util.concurrent.MoreExecutors;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.google.inject.name.Named;
import com.hubspot.mesos.JavaUtils;
import com.hubspot.singularity.SingularityTaskExecutorData;
import com.hubspot.singularity.executor.config.SingularityExecutorConfiguration;
import com.hubspot.singularity.executor.config.SingularityExecutorLogging;
import com.hubspot.singularity.executor.config.SingularityExecutorModule;
import com.hubspot.singularity.executor.task.ArtifactVerificationException;
import com.hubspot.singularity.executor.task.SingularityExecutorTask;
import com.hubspot.singularity.executor.task.SingularityExecutorTaskProcessCallable;
import com.hubspot.singularity.executor.utils.ExecutorUtils;
import java.util.Collection;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Optional;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
import org.apache.mesos.ExecutorDriver;
import org.apache.mesos.Protos;
import org.apache.mesos.Protos.Status;
import org.apache.mesos.Protos.TaskID;
import org.apache.mesos.Protos.TaskState;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@Singleton
public class SingularityExecutorMonitor {
private static final Logger LOG = LoggerFactory.getLogger(
SingularityExecutorMonitor.class
);
private final ListeningExecutorService processBuilderPool;
private final ListeningExecutorService runningProcessPool;
private final ScheduledExecutorService exitChecker;
private final ExecutorService cgroupCfsWatcherService;
private final Lock exitLock;
private final AtomicBoolean alreadyShutDown;
private final CountDownLatch latch;
@SuppressWarnings("rawtypes")
private volatile Optional exitCheckerFuture;
private volatile RunState runState;
private final SingularityExecutorConfiguration configuration;
private final SingularityExecutorLogging logging;
private final ExecutorUtils executorUtils;
private final SingularityExecutorProcessKiller processKiller;
private final SingularityExecutorThreadChecker threadChecker;
private final Map tasks;
private final Map> processBuildingTasks;
private final Map processRunningTasks;
private final Map taskToShellCommandPool;
private final Map cgroupCheckers;
@Inject
public SingularityExecutorMonitor(
@Named(SingularityExecutorModule.ALREADY_SHUT_DOWN) AtomicBoolean alreadyShutDown,
SingularityExecutorLogging logging,
ExecutorUtils executorUtils,
SingularityExecutorProcessKiller processKiller,
SingularityExecutorThreadChecker threadChecker,
SingularityExecutorConfiguration configuration
) {
this.logging = logging;
this.configuration = configuration;
this.executorUtils = executorUtils;
this.processKiller = processKiller;
this.exitChecker =
Executors.newSingleThreadScheduledExecutor(
new ThreadFactoryBuilder()
.setNameFormat("SingularityExecutorExitChecker-%d")
.build()
);
this.threadChecker = threadChecker;
this.threadChecker.start(this);
this.tasks = Maps.newConcurrentMap();
this.processBuildingTasks = Maps.newConcurrentMap();
this.processRunningTasks = Maps.newConcurrentMap();
this.taskToShellCommandPool = Maps.newConcurrentMap();
this.cgroupCheckers = Maps.newConcurrentMap();
this.processBuilderPool =
MoreExecutors.listeningDecorator(
Executors.newCachedThreadPool(
new ThreadFactoryBuilder()
.setNameFormat("SingularityExecutorProcessBuilder-%d")
.build()
)
);
this.runningProcessPool =
MoreExecutors.listeningDecorator(
Executors.newCachedThreadPool(
new ThreadFactoryBuilder()
.setNameFormat("SingularityExecutorProcessRunner-%d")
.build()
)
);
this.cgroupCfsWatcherService =
Executors.newSingleThreadExecutor(
new ThreadFactoryBuilder().setNameFormat("cgroup-cfs-watcher-%d").build()
);
this.runState = RunState.STARTING;
this.exitLock = new ReentrantLock();
this.alreadyShutDown = alreadyShutDown;
this.latch = new CountDownLatch(4);
}
public void start(ExecutorDriver driver) {
Preconditions.checkState(runState == RunState.STARTING);
this.runState = RunState.RUNNING;
this.exitCheckerFuture =
Optional.of(
startExitChecker(driver, configuration.getInitialIdleExecutorShutdownWaitMillis())
);
}
public enum RunState {
STARTING,
RUNNING,
SHUTDOWN
}
public enum SubmitState {
SUBMITTED,
REJECTED,
TASK_ALREADY_EXISTED
}
public void shutdown(ExecutorDriver driver) {
if (!alreadyShutDown.compareAndSet(false, true)) {
LOG.info("Already ran shut down process");
return;
}
LOG.info("Shutdown requested with driver {}", driver);
threadChecker.getExecutorService().shutdown();
processBuilderPool.shutdown();
runningProcessPool.shutdown();
cgroupCfsWatcherService.shutdown();
for (SingularityExecutorTask task : tasks.values()) {
if (!task.wasKilled()) {
task
.getLog()
.info(
"Executor shutting down - requested task kill with state: {}",
requestKill(task.getTaskId())
);
}
}
processKiller.getExecutorService().shutdown();
for (Entry taskIdToShellCommandPool : taskToShellCommandPool.entrySet()) { // in case
LOG.warn("Shutting down abandoned pool for {}", taskIdToShellCommandPool.getKey());
taskIdToShellCommandPool.getValue().shutdown();
}
cgroupCheckers.values().forEach(SingularityExecutorCgroupCfsChecker::close);
exitChecker.shutdown();
final long start = System.currentTimeMillis();
JavaUtils.awaitTerminationWithLatch(
latch,
"threadChecker",
threadChecker.getExecutorService(),
configuration.getShutdownTimeoutWaitMillis()
);
JavaUtils.awaitTerminationWithLatch(
latch,
"processBuilder",
processBuilderPool,
configuration.getShutdownTimeoutWaitMillis()
);
JavaUtils.awaitTerminationWithLatch(
latch,
"runningProcess",
runningProcessPool,
configuration.getShutdownTimeoutWaitMillis()
);
JavaUtils.awaitTerminationWithLatch(
latch,
"processKiller",
processKiller.getExecutorService(),
configuration.getShutdownTimeoutWaitMillis()
);
LOG.info(
"Awaiting shutdown of all thread pools for a max of {}",
JavaUtils.durationFromMillis(configuration.getShutdownTimeoutWaitMillis())
);
try {
latch.await();
} catch (InterruptedException e) {
LOG.warn("While awaiting shutdown of executor services", e);
}
LOG.info(
"Waited {} for shutdown of thread pools, now waiting {} before exiting...",
JavaUtils.duration(start),
JavaUtils.durationFromMillis(configuration.getStopDriverAfterMillis())
);
try {
Thread.sleep(configuration.getStopDriverAfterMillis());
} catch (Throwable t) {
LOG.warn("While waiting to exit", t);
}
LOG.info("Stopping driver {}", driver);
Status status = driver.stop();
LOG.info("Driver stopped with status {}", status);
}
private void checkForExit(final ExecutorDriver driver, final long waitMillis) {
try {
exitLock.lockInterruptibly();
} catch (InterruptedException e) {
LOG.warn("Interrupted acquiring exit lock", e);
return;
}
boolean shuttingDown = false;
try {
if (tasks.isEmpty()) {
LOG.info(
"Shutting down executor due to no tasks being submitted within {}",
JavaUtils.durationFromMillis(waitMillis)
);
runState = RunState.SHUTDOWN;
shuttingDown = true;
}
} finally {
exitLock.unlock();
}
if (shuttingDown) {
shutdown(driver);
} else if (runState == RunState.SHUTDOWN) {
LOG.info("Already shutting down...");
} else {
LOG.info("Tasks wasn't empty, exit checker doing nothing...");
}
}
@SuppressWarnings("rawtypes")
private Future startExitChecker(
final ExecutorDriver driver,
final long waitTimeMillis
) {
LOG.info(
"Starting an exit checker that will run in {}",
JavaUtils.durationFromMillis(waitTimeMillis)
);
return exitChecker.schedule(
new Runnable() {
@Override
public void run() {
LOG.info("Exit checker running...");
try {
checkForExit(driver, waitTimeMillis);
} catch (Throwable t) {
logAndExit(2, "While shutting down", t);
}
}
},
waitTimeMillis,
TimeUnit.MILLISECONDS
);
}
private void clearExitCheckerUnsafe() {
if (exitCheckerFuture.isPresent()) {
LOG.info("Canceling an exit checker");
exitCheckerFuture.get().cancel(true);
exitCheckerFuture = Optional.empty();
}
}
public SubmitState submit(final SingularityExecutorTask task) {
exitLock.lock();
try {
final Lock taskLock = task.getLock();
taskLock.lock();
try {
if (runState == RunState.SHUTDOWN) {
finishTask(
task,
TaskState.TASK_LOST,
"Task couldn't start because executor is shutting down",
Optional.empty()
);
return SubmitState.REJECTED;
}
if (tasks.containsKey(task.getTaskId())) {
return SubmitState.TASK_ALREADY_EXISTED;
}
tasks.put(task.getTaskId(), task);
clearExitCheckerUnsafe();
final ListenableFuture processBuildFuture = processBuilderPool.submit(
task.getProcessBuilder()
);
processBuildingTasks.put(task.getTaskId(), processBuildFuture);
watchProcessBuilder(task, processBuildFuture);
} finally {
taskLock.unlock();
}
} finally {
exitLock.unlock();
}
return SubmitState.SUBMITTED;
}
private void logAndExit(int statusCode, String format, Object... args) {
try {
LOG.error(format, args);
} finally {
System.exit(statusCode);
}
}
public Collection getRunningTasks() {
return processRunningTasks.values();
}
public Optional getTaskProcess(String taskId) {
return Optional.ofNullable(processRunningTasks.get(taskId));
}
public Optional getTask(String taskId) {
return Optional.ofNullable(tasks.get(taskId));
}
public ListeningExecutorService getShellCommandExecutorServiceForTask(String taskId) {
if (!taskToShellCommandPool.containsKey(taskId)) {
ListeningExecutorService executorService = MoreExecutors.listeningDecorator(
Executors.newCachedThreadPool(
new ThreadFactoryBuilder()
.setNameFormat(taskId + "-shellCommandPool-%d")
.build()
)
);
taskToShellCommandPool.put(taskId, executorService);
}
return taskToShellCommandPool.get(taskId);
}
public void finishTask(
final SingularityExecutorTask task,
Protos.TaskState taskState,
String message,
Optional errorMsg,
Object... errorObjects
) {
try {
if (errorMsg.isPresent()) {
task.getLog().error(errorMsg.get(), errorObjects);
}
} finally {
try {
sendStatusUpdate(task, taskState, message);
onFinish(task, taskState);
} catch (Throwable t) {
logAndExit(
3,
"Failed while finishing task {} (state {})",
task.getTaskId(),
taskState,
t
);
}
}
}
private void watchProcessBuilder(
final SingularityExecutorTask task,
final ListenableFuture processBuildFuture
) {
Futures.addCallback(
processBuildFuture,
new FutureCallback() {
private void onSuccessThrows(ProcessBuilder processBuilder) {
task.getLog().debug("Process builder finished succesfully... ");
boolean wasKilled = false;
final Lock taskLock = task.getLock();
taskLock.lock();
try {
processBuildingTasks.remove(task.getTaskId());
wasKilled = task.wasKilled();
if (!wasKilled) {
processRunningTasks.put(
task.getTaskId(),
submitProcessMonitor(task, processBuilder)
);
startCgroupWatcher(task);
}
} finally {
taskLock.unlock();
}
if (wasKilled) {
finishTask(
task,
TaskState.TASK_KILLED,
"Task killed before service process started",
Optional.empty()
);
}
}
// these code blocks must not throw exceptions since they are executed inside an executor. (or must be caught)
@Override
public void onSuccess(ProcessBuilder processBuilder) {
try {
onSuccessThrows(processBuilder);
} catch (Throwable t) {
TaskState state = t instanceof ArtifactVerificationException
? TaskState.TASK_FAILED
: TaskState.TASK_LOST;
finishTask(
task,
state,
String.format(
"%s while transitioning due to: %s",
state,
t.getClass().getSimpleName()
),
Optional.of("While submitting process task"),
t
);
}
}
@Override
public void onFailure(Throwable t) {
TaskState state = TaskState.TASK_LOST;
String message = String.format(
"%s while initializing task: %s",
t.getClass().getSimpleName(),
t.getMessage()
);
try {
if (task.wasKilled()) {
state = TaskState.TASK_KILLED;
message =
String.format(
"Task killed, caught expected %s",
t.getClass().getSimpleName()
);
}
} finally {
finishTask(
task,
state,
message,
Optional.of("Task {} failed before starting process"),
task,
t
);
}
}
},
getShellCommandExecutorServiceForTask(task.getTaskId())
);
}
private void startCgroupWatcher(final SingularityExecutorTask task) {
SingularityTaskExecutorData taskExecutorData = (SingularityTaskExecutorData) task.getExecutorData();
if (taskExecutorData.getCpuHardLimit().isPresent()) {
cgroupCfsWatcherService.submit(
() -> {
try {
SingularityExecutorCgroupCfsChecker cfsChecker = new SingularityExecutorCgroupCfsChecker(
task,
taskExecutorData.getCpuHardLimit().get(),
configuration.getDefaultCfsPeriod()
);
cfsChecker.watch();
cgroupCheckers.put(task.getTaskId(), cfsChecker);
} catch (Throwable t) {
LOG.error("Could not start cgroup checker for task {}", task.getTaskId(), t);
}
}
);
}
}
private void sendStatusUpdate(
SingularityExecutorTask task,
Protos.TaskState taskState,
String message
) {
executorUtils.sendStatusUpdate(
task.getDriver(),
TaskID.newBuilder().setValue(task.getTaskId()).build(),
taskState,
message,
task.getLog()
);
}
private void onFinish(SingularityExecutorTask task, Protos.TaskState taskState) {
processKiller.cancelDestroyFuture(task.getTaskId());
tasks.remove(task.getTaskId());
processRunningTasks.remove(task.getTaskId());
processBuildingTasks.remove(task.getTaskId());
task.cleanup(taskState);
ListeningExecutorService executorService = taskToShellCommandPool.remove(
task.getTaskId()
);
if (executorService != null) {
executorService.shutdownNow();
try {
executorService.awaitTermination(5, TimeUnit.MILLISECONDS);
} catch (InterruptedException e) {
LOG.warn("Awaiting shutdown of shell executor service", e);
}
}
logging.stopTaskLogger(task.getTaskId(), task.getLogbackLog());
checkIdleExecutorShutdown(task.getDriver());
}
private void checkIdleExecutorShutdown(ExecutorDriver driver) {
exitLock.lock();
try {
clearExitCheckerUnsafe();
if (tasks.isEmpty() && runState == RunState.RUNNING) {
exitCheckerFuture =
Optional.of(
startExitChecker(driver, configuration.getIdleExecutorShutdownWaitMillis())
);
}
} finally {
exitLock.unlock();
}
}
public enum KillState {
DIDNT_EXIST,
INTERRUPTING_PRE_PROCESS,
KILLING_PROCESS,
DESTROYING_PROCESS,
INCONSISTENT_STATE
}
public KillState requestKill(String taskId) {
return requestKill(taskId, Optional.empty(), false);
}
public KillState requestKill(String taskId, Optional user, boolean destroy) {
final Optional maybeTask = Optional.ofNullable(
tasks.get(taskId)
);
if (!maybeTask.isPresent()) {
return KillState.DIDNT_EXIST;
}
final SingularityExecutorTask task = maybeTask.get();
if (!destroy && task.wasForceDestroyed()) {
task.getLog().debug("Already force destroyed, will not issue additional kill");
return KillState.DESTROYING_PROCESS;
}
task.getLog().info("Executor asked to kill {}", taskId);
ListenableFuture processBuilderFuture = null;
SingularityExecutorTaskProcessCallable runningProcess = null;
task.getLock().lock();
boolean wasKilled = task.wasKilled();
try {
if (!wasKilled) {
task.markKilled(user);
}
processBuilderFuture = processBuildingTasks.get(task.getTaskId());
runningProcess = processRunningTasks.get(task.getTaskId());
} finally {
task.getLock().unlock();
}
if (processBuilderFuture != null) {
task.getLog().info("Canceling process builder future for {}", taskId);
CancelThread cancelThread = new CancelThread(processBuilderFuture, task);
cancelThread.start();
return KillState.INTERRUPTING_PRE_PROCESS;
}
if (runningProcess != null) {
if (destroy) {
if (user.isPresent()) {
task
.getLog()
.info(
"Destroying process with pid {} for task {} by request from user {}",
runningProcess.getCurrentPid(),
taskId,
user.get()
);
} else {
task
.getLog()
.info(
"Destroying process with pid {} for task {}",
runningProcess.getCurrentPid(),
taskId
);
}
task.markForceDestroyed(user);
runningProcess.signalKillToProcessIfActive();
return KillState.DESTROYING_PROCESS;
}
if (processKiller.isKillInProgress(taskId)) {
task.getLog().info("Kill already in progress for task {}", taskId);
return KillState.KILLING_PROCESS;
}
if (user.isPresent()) {
task
.getLog()
.info("Killing process for task {} by request from {}", taskId, user.get());
} else {
task.getLog().info("Killing process for task {}", taskId);
}
processKiller.submitKillRequest(runningProcess);
return KillState.KILLING_PROCESS;
}
return KillState.INCONSISTENT_STATE;
}
private static class CancelThread extends Thread {
private final ListenableFuture processBuilderFuture;
private final SingularityExecutorTask task;
public CancelThread(
ListenableFuture processBuilderFuture,
SingularityExecutorTask task
) {
super("SingularityExecutorMonitor-cancel-thread");
this.processBuilderFuture = processBuilderFuture;
this.task = task;
}
@Override
public void run() {
processBuilderFuture.cancel(true);
task.getProcessBuilder().cancel();
}
}
private SingularityExecutorTaskProcessCallable buildProcessCallable(
final SingularityExecutorTask task,
ProcessBuilder processBuilder
) {
return new SingularityExecutorTaskProcessCallable(
configuration,
task,
processBuilder,
executorUtils
);
}
private SingularityExecutorTaskProcessCallable submitProcessMonitor(
final SingularityExecutorTask task,
ProcessBuilder processBuilder
) {
SingularityExecutorTaskProcessCallable processCallable = buildProcessCallable(
task,
processBuilder
);
final ListenableFuture processExitFuture = runningProcessPool.submit(
processCallable
);
watchProcessExitFuture(task, processExitFuture);
return processCallable;
}
private void watchProcessExitFuture(
final SingularityExecutorTask task,
final ListenableFuture processExitFuture
) {
Futures.addCallback(
processExitFuture,
new FutureCallback() {
// these code blocks must not throw exceptions since they are executed inside an executor. (or must be caught)
@Override
public void onSuccess(Integer exitCode) {
TaskState taskState = null;
String message = null;
Optional maybeKilledBy = task.getKilledBy();
if (task.wasKilledDueToThreads()) {
taskState = TaskState.TASK_FAILED;
message =
String.format(
"Task used %s threads and was killed (max %s)",
task.getThreadCountAtOverageTime(),
task.getExecutorData().getMaxTaskThreads().get()
);
} else if (task.wasKilled()) {
long shutdownTimeMs = System.currentTimeMillis() - task.getKilledAt();
taskState = TaskState.TASK_KILLED;
if (task.wasDestroyedAfterWaiting()) {
final long millisWaited = task
.getExecutorData()
.getSigKillProcessesAfterMillis()
.orElse(configuration.getHardKillAfterMillis());
message =
String.format(
"Task killed forcibly after waiting at least %s (shutdownTime: %dms)",
JavaUtils.durationFromMillis(millisWaited),
shutdownTimeMs
);
} else if (task.wasForceDestroyed()) {
if (maybeKilledBy.isPresent()) {
message =
String.format(
"Task killed forcibly by %s (shutdownTime: %dms)",
maybeKilledBy.get(),
shutdownTimeMs
);
} else {
message =
String.format(
"Task killed forcibly after multiple kill requests from framework, (shutdownTime: %dms)",
shutdownTimeMs
);
}
} else {
message =
String.format(
"Task killed. Process exited gracefully with code %d (shutdownTime: %dms)",
exitCode,
shutdownTimeMs
);
}
} else if (task.isSuccessExitCode(exitCode)) {
taskState = TaskState.TASK_FINISHED;
message = "Process exited normally with code " + exitCode;
} else {
taskState = TaskState.TASK_FAILED;
message = "Process failed with code " + exitCode;
}
sendStatusUpdate(task, taskState, message);
onFinish(task, taskState);
}
@Override
public void onFailure(Throwable t) {
task.getLog().error("Task {} failed while running process", task, t);
TaskState taskState = null;
String message = null;
if (task.wasKilled()) {
taskState = TaskState.TASK_KILLED;
message =
String.format("Task killed, caught %s", t.getClass().getSimpleName());
} else {
taskState = TaskState.TASK_LOST;
message =
String.format(
"%s while running process %s",
t.getClass().getSimpleName(),
t.getMessage()
);
}
sendStatusUpdate(task, taskState, message);
onFinish(task, taskState);
}
},
getShellCommandExecutorServiceForTask(task.getTaskId())
);
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy