// com.spotify.helios.agent.TaskMonitor Maven / Gradle / Ivy
/*
* Copyright (c) 2014 Spotify AB.
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package com.spotify.helios.agent;
import com.google.common.util.concurrent.MoreExecutors;
import com.spotify.docker.client.ImageNotFoundException;
import com.spotify.docker.client.ImagePullFailedException;
import com.spotify.helios.common.descriptors.JobId;
import com.spotify.helios.common.descriptors.TaskStatus;
import com.spotify.helios.common.descriptors.ThrottleState;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.Closeable;
import java.util.Objects;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.ScheduledThreadPoolExecutor;
import static com.spotify.helios.common.descriptors.TaskStatus.State.CREATING;
import static com.spotify.helios.common.descriptors.TaskStatus.State.EXITED;
import static com.spotify.helios.common.descriptors.TaskStatus.State.FAILED;
import static com.spotify.helios.common.descriptors.TaskStatus.State.HEALTHCHECKING;
import static com.spotify.helios.common.descriptors.TaskStatus.State.PULLING_IMAGE;
import static com.spotify.helios.common.descriptors.TaskStatus.State.RUNNING;
import static com.spotify.helios.common.descriptors.TaskStatus.State.STARTING;
import static com.spotify.helios.common.descriptors.ThrottleState.FLAPPING;
import static com.spotify.helios.common.descriptors.ThrottleState.IMAGE_MISSING;
import static com.spotify.helios.common.descriptors.ThrottleState.IMAGE_PULL_FAILED;
import static com.spotify.helios.common.descriptors.ThrottleState.NO;
import static java.util.concurrent.TimeUnit.MILLISECONDS;
import static java.util.concurrent.TimeUnit.SECONDS;
/**
 * A monitor for {@link TaskRunner}, processing events into observations about the health of a
 * task, e.g. whether it's flapping etc. This information is published using the
 * {@link StatusUpdater} and a {@link ThrottleState} is made available for supervisors to act on.
 *
 * <p>Listener callbacks run on whichever thread invokes them, while the delayed flapping re-check
 * runs on this monitor's own single-threaded scheduler. The throttle-related fields are therefore
 * {@code volatile} so a value written on one of those threads is visible on the other and to
 * callers of {@link #throttle()}. Compound updates are not otherwise synchronized.
 */
public class TaskMonitor implements TaskRunner.Listener, Closeable {

  private static final Logger log = LoggerFactory.getLogger(TaskMonitor.class);

  private final JobId jobId;
  private final ScheduledExecutorService scheduler;
  private final FlapController flapController;
  private final StatusUpdater statusUpdater;

  /** The pending flapping re-check, or {@code null} if none has been scheduled yet. */
  private volatile ScheduledFuture<?> flapTimeout;

  /** The current image related throttle, or {@code null} when there is no image failure. */
  private volatile ThrottleState imageFailure;

  /** The most recently derived throttle state, as returned by {@link #throttle()}. */
  private volatile ThrottleState throttle = NO;

  /**
   * Create a new task monitor.
   *
   * @param jobId          The id of the monitored job, used in log messages.
   * @param flapController The controller queried for (and informed about) flapping.
   * @param statusUpdater  The updater that state and throttle changes are published through.
   */
  public TaskMonitor(final JobId jobId, final FlapController flapController,
                     final StatusUpdater statusUpdater) {
    this.jobId = jobId;
    this.flapController = flapController;
    this.statusUpdater = statusUpdater;

    final ScheduledThreadPoolExecutor executor = new ScheduledThreadPoolExecutor(1);
    // Let core threads time out to avoid unnecessarily keeping a flapping state check thread alive
    // for the majority of tasks that do not flap.
    executor.setKeepAliveTime(5, SECONDS);
    executor.allowCoreThreadTimeOut(true);
    this.scheduler = MoreExecutors.getExitingScheduledExecutorService(executor, 0, SECONDS);
  }

  /**
   * Get the current task throttle as derived from task runner events.
   *
   * @return The throttle state.
   */
  public ThrottleState throttle() {
    return throttle;
  }

  /**
   * Shut down the internal scheduler, discarding any pending flapping re-check.
   */
  @Override
  public void close() {
    scheduler.shutdownNow();
  }

  /**
   * Log an error if this monitor is collected without {@link #close()} having shut down the
   * scheduler.
   */
  @Override
  protected void finalize() throws Throwable {
    try {
      if (!scheduler.isShutdown()) {
        log.error("task monitor not properly closed: {}", jobId);
      }
    } finally {
      // Always give the superclass its chance to finalize, even if the check above throws.
      super.finalize();
    }
  }

  /**
   * Record a runner failure. Image related failures are turned into the matching throttle state
   * before the {@code FAILED} task state is published.
   *
   * @param t The failure reported by the runner.
   */
  @Override
  public void failed(final Throwable t) {
    if (t instanceof InterruptedException) {
      // Ignore failures due to interruptions as they're used when tearing down the agent and do
      // not indicate actual runner failures.
      return;
    }
    if (t instanceof ImageNotFoundException) {
      imageFailure(IMAGE_MISSING);
    } else if (t instanceof ImagePullFailedException) {
      imageFailure(IMAGE_PULL_FAILED);
    }
    updateState(FAILED);
  }

  /** Publish the {@code PULLING_IMAGE} task state. */
  @Override
  public void pulling() {
    updateState(PULLING_IMAGE);
  }

  /** No status is published for this event. */
  @Override
  public void pulled() {
  }

  /** No status is published for this event. */
  @Override
  public void pullFailed() {
  }

  /** Publish the {@code CREATING} task state. */
  @Override
  public void creating() {
    updateState(CREATING);
  }

  /**
   * Clear any image failure and hand the container id to the status updater. No status commit is
   * issued from this method.
   *
   * @param containerId The id of the created container.
   */
  @Override
  public void created(final String containerId) {
    // If we managed to create a container, any previous image failure has been resolved
    resetImageFailure();
    statusUpdater.setContainerId(containerId);
  }

  /** Clear any image failure and publish the {@code STARTING} task state. */
  @Override
  public void starting() {
    // If we are about to start a container, any previous image failure has been resolved
    resetImageFailure();
    updateState(STARTING);
  }

  /** Clear any image failure. No task state is published for this event. */
  @Override
  public void started() {
    // If we managed to start a container, any previous image failure has been resolved
    resetImageFailure();
  }

  /** Clear any image failure and publish the {@code HEALTHCHECKING} task state. */
  @Override
  public void healthChecking() {
    // If the container is running a health check, any previous image failure has been resolved
    resetImageFailure();
    updateState(HEALTHCHECKING);
  }

  /**
   * Tell the flap controller the task started, clear any image failure and publish the
   * {@code RUNNING} task state.
   */
  @Override
  public void running() {
    flapController.started();
    // If the container is running, any previous image failure has been resolved
    resetImageFailure();
    updateState(RUNNING);
  }

  /**
   * Tell the flap controller the task exited, re-derive the throttle and publish the
   * {@code EXITED} task state.
   *
   * @param code The container exit code. Not used by this monitor.
   */
  @Override
  public void exited(final int code) {
    flapController.exited();
    updateThrottle();
    updateState(EXITED);
  }

  /**
   * Record an image failure and re-derive the throttle state.
   *
   * @param imageFailure The image related throttle state to apply.
   */
  private void imageFailure(final ThrottleState imageFailure) {
    this.imageFailure = imageFailure;
    updateThrottle();
  }

  /**
   * Forget any recorded image failure and re-derive the throttle state.
   */
  private void resetImageFailure() {
    imageFailure = null;
    updateThrottle();
  }

  /**
   * Derive a new throttle state and propagate it if needed. If flapping, schedule a future check
   * to see if we're still flapping and potentially reset the flapping state.
   *
   * @return {@code true} if the throttle state changed and the caller needs to commit the status
   *         update, {@code false} otherwise.
   */
  private boolean updateThrottle() {
    // Derive new throttle state
    final boolean flapping = flapController.isFlapping();
    // Read the volatile field once so the null check and the use see the same value.
    final ThrottleState currentImageFailure = imageFailure;
    final ThrottleState newThrottle;
    if (currentImageFailure != null) {
      // Image failures take precedence
      newThrottle = currentImageFailure;
    } else {
      newThrottle = flapping ? FLAPPING : NO;
    }

    // If the throttle state changed, propagate it
    final ThrottleState oldThrottle = throttle;
    final boolean updated = !Objects.equals(oldThrottle, newThrottle);
    if (updated) {
      log.info("throttle state change: {}: {} -> {}", jobId, oldThrottle, newThrottle);
      throttle = newThrottle;
      statusUpdater.setThrottleState(newThrottle);
    }

    // If we're flapping, schedule a future check to potentially reset the flapping state
    if (flapping) {
      final ScheduledFuture<?> pending = flapTimeout;
      if (pending != null) {
        // Replace the previous check rather than piling up re-checks; don't interrupt one that
        // is already running.
        pending.cancel(false);
      }
      flapTimeout = scheduler.schedule(new UpdateThrottle(),
                                       flapController.millisLeftToUnflap(), MILLISECONDS);
    }

    // Let the caller know if they need to commit the state change
    return updated;
  }

  /**
   * Propagate a new task state by setting it and committing the status update.
   *
   * @param state The task state to publish.
   */
  private void updateState(final TaskStatus.State state) {
    statusUpdater.setState(state);
    // Commit and push a new status
    try {
      statusUpdater.update();
    } catch (InterruptedException e) {
      // TODO: propagate interrupt instead?
      Thread.currentThread().interrupt();
    }
  }

  /**
   * Used to schedule flapping state updates while task is running.
   *
   * @see #updateThrottle()
   */
  private class UpdateThrottle implements Runnable {

    @Override
    public void run() {
      if (updateThrottle()) {
        try {
          statusUpdater.update();
        } catch (InterruptedException e) {
          // Restore the interrupt flag instead of swallowing it, consistent with updateState().
          Thread.currentThread().interrupt();
        }
      }
    }
  }
}