All downloads are free. Search and download functionality uses the official Maven repository.

com.hubspot.singularity.scheduler.SingularityCleaner Maven / Gradle / Ivy

package com.hubspot.singularity.scheduler;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;

import javax.inject.Singleton;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Optional;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Iterables;
import com.google.common.collect.Multiset;
import com.google.inject.Inject;
import com.hubspot.baragon.models.BaragonRequestState;
import com.hubspot.mesos.JavaUtils;
import com.hubspot.singularity.LoadBalancerRequestType;
import com.hubspot.singularity.LoadBalancerRequestType.LoadBalancerRequestId;
import com.hubspot.singularity.RequestCleanupType;
import com.hubspot.singularity.RequestState;
import com.hubspot.singularity.SingularityDeleteResult;
import com.hubspot.singularity.SingularityDeploy;
import com.hubspot.singularity.SingularityDeployKey;
import com.hubspot.singularity.SingularityKilledTaskIdRecord;
import com.hubspot.singularity.SingularityLoadBalancerUpdate;
import com.hubspot.singularity.SingularityPendingRequest;
import com.hubspot.singularity.SingularityPendingRequest.PendingType;
import com.hubspot.singularity.SingularityPendingTask;
import com.hubspot.singularity.SingularityRequest;
import com.hubspot.singularity.SingularityRequestCleanup;
import com.hubspot.singularity.SingularityRequestDeployState;
import com.hubspot.singularity.SingularityRequestHistory;
import com.hubspot.singularity.SingularityRequestLbCleanup;
import com.hubspot.singularity.SingularityRequestWithState;
import com.hubspot.singularity.SingularityTask;
import com.hubspot.singularity.SingularityTaskCleanup;
import com.hubspot.singularity.SingularityTaskId;
import com.hubspot.singularity.SingularityTaskShellCommandRequest;
import com.hubspot.singularity.SingularityTaskShellCommandRequestId;
import com.hubspot.singularity.SingularityTaskShellCommandUpdate;
import com.hubspot.singularity.TaskCleanupType;
import com.hubspot.singularity.config.SingularityConfiguration;
import com.hubspot.singularity.data.DeployManager;
import com.hubspot.singularity.data.RequestManager;
import com.hubspot.singularity.data.TaskManager;
import com.hubspot.singularity.data.UsageManager;
import com.hubspot.singularity.data.history.RequestHistoryHelper;
import com.hubspot.singularity.expiring.SingularityExpiringBounce;
import com.hubspot.singularity.hooks.LoadBalancerClient;
import com.hubspot.singularity.mesos.SingularityMesosScheduler;
import com.hubspot.singularity.mesos.SingularitySchedulerLock;
import com.hubspot.singularity.scheduler.SingularityDeployHealthHelper.DeployHealth;
import com.hubspot.singularity.sentry.SingularityExceptionNotifier;

/**
 * Drains Singularity's cleanup queues: request cleanups, task cleanups, load balancer cleanups and
 * killed-task records (see {@code drainCleanupQueue()}), and decides when a task that is being
 * cleaned up may actually be killed.
 */
@Singleton
public class SingularityCleaner {

  private static final Logger LOG = LoggerFactory.getLogger(SingularityCleaner.class);

  // Collaborators; all of them are supplied through the @Inject constructor.
  private final TaskManager taskManager;
  private final DeployManager deployManager;
  private final RequestManager requestManager;
  private final SingularityDeployHealthHelper deployHealthHelper;
  private final LoadBalancerClient lbClient;
  private final SingularityExceptionNotifier exceptionNotifier;
  private final RequestHistoryHelper requestHistoryHelper;
  private final SingularityMesosScheduler scheduler;
  // Used to serialize work per request id (runWithRequestLock) while queues are drained in parallel.
  private final SingularitySchedulerLock lock;
  private final UsageManager usageManager;

  private final SingularityConfiguration configuration;
  // configuration.getKillNonLongRunningTasksInCleanupAfterSeconds(), converted to milliseconds once at construction.
  private final long killNonLongRunningTasksInCleanupAfterMillis;

  @Inject
  public SingularityCleaner(TaskManager taskManager, SingularityDeployHealthHelper deployHealthHelper, DeployManager deployManager, RequestManager requestManager,
                            SingularityConfiguration configuration, LoadBalancerClient lbClient, SingularityExceptionNotifier exceptionNotifier,
                            RequestHistoryHelper requestHistoryHelper, SingularityMesosScheduler scheduler, SingularitySchedulerLock lock, UsageManager usageManager) {
    this.taskManager = taskManager;
    this.lbClient = lbClient;
    this.deployHealthHelper = deployHealthHelper;
    this.deployManager = deployManager;
    this.requestManager = requestManager;
    this.exceptionNotifier = exceptionNotifier;
    this.requestHistoryHelper = requestHistoryHelper;
    this.scheduler = scheduler;
    this.lock = lock;
    this.usageManager = usageManager;

    this.configuration = configuration;

    this.killNonLongRunningTasksInCleanupAfterMillis = TimeUnit.SECONDS.toMillis(configuration.getKillNonLongRunningTasksInCleanupAfterSeconds());
  }

  /**
   * Decides whether the task referenced by {@code taskCleanup} may be killed in this cleanup pass.
   *
   * <p>The checks run in this order: a missing request kills immediately; an unfinished pre-kill shell
   * command defers the kill; cleanup types that kill instantly kill; a paused request kills (unless the
   * cleanup is PAUSING for a non-long-running request); non-long-running tasks are killed only after
   * their max wait time; a DECOMISSIONING task that belongs to the pending deploy is held for a
   * configured duration; a missing active deploy kills; otherwise the decision depends on whether
   * enough healthy replacement tasks exist.
   *
   * @param taskCleanup the cleanup entry being evaluated
   * @param activeTaskIds currently active task ids to search for replacement tasks
   * @param cleaningTasks ids of tasks that are themselves being cleaned; never counted as replacements
   * @param incrementalCleaningTasks per-deploy counts of incremental cleanups still outstanding; an entry
   *        is removed by the incremental helpers when they approve a kill
   * @return {@code true} if the task should be killed now, {@code false} to re-check on a later pass
   */
  private boolean shouldKillTask(SingularityTaskCleanup taskCleanup, List<SingularityTaskId> activeTaskIds, Set<SingularityTaskId> cleaningTasks, Multiset<SingularityDeployKey> incrementalCleaningTasks) {
    final Optional<SingularityRequestWithState> requestWithState = requestManager.getRequest(taskCleanup.getTaskId().getRequestId());

    if (!requestWithState.isPresent()) {
      LOG.debug("Killing a task {} immediately because the request was missing", taskCleanup);
      return true;
    }

    final SingularityRequest request = requestWithState.get().getRequest();

    if (taskCleanup.getRunBeforeKillId().isPresent()) {
      List<SingularityTaskShellCommandUpdate> shellCommandUpdates = taskManager.getTaskShellCommandUpdates(taskCleanup.getRunBeforeKillId().get());
      boolean finished = false;
      for (SingularityTaskShellCommandUpdate update : shellCommandUpdates) {
        if (update.getUpdateType().isFinished()) {
          finished = true;
          break;
        }
      }
      if (!finished) {
        LOG.debug("Waiting for pre-kill shell command {} to finish before killing task", taskCleanup.getRunBeforeKillId());
        return false;
      }
    }

    if (taskCleanup.getCleanupType().shouldKillTaskInstantly(request)) {
      LOG.debug("Killing a task {} immediately because of its cleanup type", taskCleanup);
      return true;
    }

    // If pausing, must be a long-running task to kill here
    if (requestWithState.get().getState() == RequestState.PAUSED &&
      (taskCleanup.getCleanupType() != TaskCleanupType.PAUSING || request.isLongRunning())) {
      LOG.debug("Killing a task {} immediately because the request was paused", taskCleanup);
      return true;
    }

    if (!request.isLongRunning()) {
      final long timeSinceCleanup = System.currentTimeMillis() - taskCleanup.getTimestamp();
      final long maxWaitTime = request.getKillOldNonLongRunningTasksAfterMillis().or(killNonLongRunningTasksInCleanupAfterMillis);
      // A max wait time below 1 means "do not wait at all".
      final boolean tooOld = (maxWaitTime < 1) || (timeSinceCleanup > maxWaitTime);

      if (!tooOld) {
        LOG.trace("Not killing a non-longRunning task {}, running time since cleanup {} (max wait time is {})", taskCleanup, timeSinceCleanup, maxWaitTime);
      } else {
        LOG.debug("Killing a non-longRunning task {} - running time since cleanup {} exceeded max wait time {}", taskCleanup, timeSinceCleanup, maxWaitTime);
      }

      return tooOld;
    }

    final String requestId = request.getId();

    final Optional<SingularityRequestDeployState> deployState = deployManager.getRequestDeployState(requestId);

    // deployState may be absent; check presence before unwrapping so that case falls through to the
    // "no active deploy state" branch below instead of throwing from Optional.get().
    if (taskCleanup.getCleanupType() == TaskCleanupType.DECOMISSIONING
        && deployState.isPresent()
        && deployState.get().getPendingDeploy().isPresent()
        && deployState.get().getPendingDeploy().get().getDeployId().equals(taskCleanup.getTaskId().getDeployId())) {
      final long timeSinceCleanup = System.currentTimeMillis() - taskCleanup.getTimestamp();
      final long maxWaitTime = configuration.getPendingDeployHoldTaskDuringDecommissionMillis();
      final boolean tooOld = (maxWaitTime < 1) || (timeSinceCleanup > maxWaitTime);

      if (!tooOld) {
        LOG.trace("Not killing {} - part of pending deploy - running time since cleanup {} (max wait time is {})", taskCleanup, timeSinceCleanup, maxWaitTime);
        return false;
      } else {
        LOG.debug("Killing {} - part of pending deploy but running time since cleanup {} exceeded max wait time {}", taskCleanup, timeSinceCleanup, maxWaitTime);
        return true;
      }
    }

    if (!deployState.isPresent() || !deployState.get().getActiveDeploy().isPresent()) {
      LOG.debug("Killing a task {} immediately because there is no active deploy state {}", taskCleanup, deployState);
      return true;
    }

    final String activeDeployId = deployState.get().getActiveDeploy().get().getDeployId();
    // Failed/cancelled incremental deploys are replaced by tasks of the active deploy; every other
    // cleanup type is replaced by tasks of the cleaned task's own deploy.
    final String matchingTasksDeployId = taskCleanup.getCleanupType() == TaskCleanupType.INCREMENTAL_DEPLOY_CANCELLED || taskCleanup.getCleanupType() == TaskCleanupType.INCREMENTAL_DEPLOY_FAILED ? activeDeployId : taskCleanup.getTaskId().getDeployId();

    // check to see if there are enough active tasks out there that have been active for long enough that we can safely shut this task down.
    final List<SingularityTaskId> matchingTasks = new ArrayList<>();
    for (SingularityTaskId taskId : activeTaskIds) {
      if (!taskId.getRequestId().equals(requestId) || !taskId.getDeployId().equals(matchingTasksDeployId)) {
        continue;
      }
      if (cleaningTasks.contains(taskId)) {
        continue;
      }
      matchingTasks.add(taskId);
    }

    // For an incremental bounce or incremental deploy cleanup, shut down old tasks as new ones are started
    final SingularityDeployKey key = SingularityDeployKey.fromTaskId(taskCleanup.getTaskId());
    if (taskCleanup.getCleanupType() == TaskCleanupType.INCREMENTAL_BOUNCE) {
      return shouldKillIncrementalBounceTask(request, taskCleanup, matchingTasksDeployId, matchingTasks, key, incrementalCleaningTasks);
    } else if (isIncrementalDeployCleanup(taskCleanup)) {
      return shouldKillIncrementalDeployCleanupTask(request, taskCleanup, matchingTasksDeployId, matchingTasks, key, incrementalCleaningTasks);
    } else {
      if (matchingTasks.size() < request.getInstancesSafe()) {
        LOG.trace("Not killing a task {} yet, only {} matching out of a required {}", taskCleanup, matchingTasks.size(), request.getInstancesSafe());
        return false;
      }
    }

    final Optional<SingularityDeploy> deploy = deployManager.getDeploy(requestId, activeDeployId);

    final DeployHealth deployHealth = deployHealthHelper.getDeployHealth(request, deploy, matchingTasks, false);

    switch (deployHealth) {
      case HEALTHY:
        // Replacements are healthy; additionally require each of them to be healthy in the load balancer.
        for (SingularityTaskId taskId : matchingTasks) {
          DeployHealth lbHealth = getLbHealth(request, taskId);

          if (lbHealth != DeployHealth.HEALTHY) {
            LOG.trace("Not killing a task {}, waiting for new replacement tasks to be added to LB (current state: {})", taskCleanup, lbHealth);
            return false;
          }
        }

        LOG.debug("Killing a task {}, at least {} replacement tasks are healthy [{}]", taskCleanup, request.getInstancesSafe(), matchingTasks);
        return true;
      case WAITING:
      case UNHEALTHY:
      default:
        LOG.trace("Not killing a task {}, waiting for new replacement tasks to be healthy (current state: {})", taskCleanup, deployHealth);
        return false;
    }
  }

  /**
   * Tells whether the cleanup was created for an incremental deploy that failed or was cancelled.
   *
   * @param taskCleanup the cleanup entry to inspect
   * @return {@code true} for INCREMENTAL_DEPLOY_FAILED or INCREMENTAL_DEPLOY_CANCELLED
   */
  private boolean isIncrementalDeployCleanup(SingularityTaskCleanup taskCleanup) {
    final TaskCleanupType type = taskCleanup.getCleanupType();
    if (type == TaskCleanupType.INCREMENTAL_DEPLOY_FAILED) {
      return true;
    }
    return type == TaskCleanupType.INCREMENTAL_DEPLOY_CANCELLED;
  }

  /**
   * Kill decision for a task being cleaned by an incremental bounce: the task may go only once the
   * healthy replacements plus the incremental cleanups still outstanding for the deploy exceed the
   * request's instance count.
   *
   * @param request the request owning the task
   * @param taskCleanup the cleanup entry being evaluated (used for logging)
   * @param matchingTasksDeployId deploy id whose tasks count as replacements
   * @param matchingTasks candidate replacement tasks
   * @param key deploy key of the task being cleaned
   * @param incrementalCleaningTasks outstanding incremental cleanups per deploy key; one occurrence of
   *        {@code key} is removed when the kill is approved
   * @return {@code true} if the task should be killed now
   */
  private boolean shouldKillIncrementalBounceTask(SingularityRequest request, SingularityTaskCleanup taskCleanup, String matchingTasksDeployId, List<SingularityTaskId> matchingTasks,
    SingularityDeployKey key, Multiset<SingularityDeployKey> incrementalCleaningTasks) {
    int healthyReplacementTasks = getNumHealthyTasks(request, matchingTasksDeployId, matchingTasks);
    if (healthyReplacementTasks + incrementalCleaningTasks.count(key) <= request.getInstancesSafe()) {
      LOG.trace("Not killing a task {} yet, only {} matching out of a required {}", taskCleanup, matchingTasks.size(), request.getInstancesSafe() - incrementalCleaningTasks.count(key));
      return false;
    }

    LOG.debug("Killing a task {}, {} replacement tasks are healthy", taskCleanup, healthyReplacementTasks);
    // One fewer old task remains to be cleaned for this deploy.
    incrementalCleaningTasks.remove(key);
    return true;
  }

  /**
   * Kill decision for a task left over from a failed or cancelled incremental deploy: the task may go
   * once the number of healthy tasks of {@code matchingTasksDeployId} reaches the request's instance count.
   *
   * @param request the request owning the task
   * @param taskCleanup the cleanup entry being evaluated (used for logging)
   * @param matchingTasksDeployId deploy id whose tasks must be healthy
   * @param matchingTasks candidate tasks of that deploy
   * @param key deploy key of the task being cleaned
   * @param incrementalCleaningTasks outstanding incremental cleanups per deploy key; one occurrence of
   *        {@code key} is removed when the kill is approved
   * @return {@code true} if the task should be killed now
   */
  private boolean shouldKillIncrementalDeployCleanupTask(SingularityRequest request, SingularityTaskCleanup taskCleanup, String matchingTasksDeployId, List<SingularityTaskId> matchingTasks,
    SingularityDeployKey key, Multiset<SingularityDeployKey> incrementalCleaningTasks) {
    int healthyActiveDeployTasks = getNumHealthyTasks(request, matchingTasksDeployId, matchingTasks);
    if (healthyActiveDeployTasks < request.getInstancesSafe()) {
      LOG.trace("Not killing a task {} yet, only {} matching out of a required {}", taskCleanup, matchingTasks.size(), request.getInstancesSafe() - incrementalCleaningTasks.count(key));
      return false;
    }

    LOG.debug("Killing a task {}, {} active deploy tasks are healthy", taskCleanup, healthyActiveDeployTasks);
    incrementalCleaningTasks.remove(key);
    return true;
  }

  /**
   * Counts the tasks in {@code matchingTasks} that the deploy health helper reports as healthy and
   * that are also healthy from the load balancer's point of view (see {@code getLbHealth}).
   *
   * @param request the request owning the tasks
   * @param deployId deploy whose definition is passed to the health helper
   * @param matchingTasks tasks to evaluate
   * @return number of tasks that pass both checks
   */
  private int getNumHealthyTasks(SingularityRequest request, String deployId, List<SingularityTaskId> matchingTasks) {
    Optional<SingularityDeploy> deploy = deployManager.getDeploy(request.getId(), deployId);

    List<SingularityTaskId> healthyTasks = deployHealthHelper.getHealthyTasks(request, deploy, matchingTasks, false);

    int numHealthyTasks = 0;

    for (SingularityTaskId taskId : healthyTasks) {
      DeployHealth lbHealth = getLbHealth(request, taskId);

      if (lbHealth == DeployHealth.HEALTHY) {
        numHealthyTasks++;
      }
    }

    return numHealthyTasks;
  }

  /**
   * Maps the state of a task's load balancer ADD request onto a {@link DeployHealth}.
   *
   * @param request the request owning the task; requests that are not load balanced are always HEALTHY
   * @param taskId the task whose load balancer state is looked up
   * @return HEALTHY when the ADD succeeded (or no load balancer is involved), WAITING when there is no
   *         update yet or it is still in progress, UNHEALTHY for cancelled, unknown, no-op or failed states
   */
  private DeployHealth getLbHealth(SingularityRequest request, SingularityTaskId taskId) {
    if (!request.isLoadBalanced()) {
      return DeployHealth.HEALTHY;
    }

    Optional<SingularityLoadBalancerUpdate> update = taskManager.getLoadBalancerState(taskId, LoadBalancerRequestType.ADD);

    if (!update.isPresent()) {
      return DeployHealth.WAITING;
    }

    switch (update.get().getLoadBalancerState()) {
      case SUCCESS:
        return DeployHealth.HEALTHY;
      case CANCELED:
      case CANCELING:
      case UNKNOWN:
      case INVALID_REQUEST_NOOP:
      case FAILED:
        return DeployHealth.UNHEALTHY;
      case WAITING:
        return DeployHealth.WAITING;
    }

    // Any state not listed above is treated as still in progress.
    return DeployHealth.WAITING;
  }

  /**
   * Tells whether a cleanup request is older than the obsolete expiration time.
   *
   * @param start timestamp (millis) at which the current drain began
   * @param cleanupRequest timestamp (millis) carried by the cleanup request
   * @return {@code true} when {@code start - cleanupRequest} exceeds {@code getObsoleteExpirationTime()}
   */
  private boolean isObsolete(long start, long cleanupRequest) {
    return (start - cleanupRequest) > getObsoleteExpirationTime();
  }

  /**
   * @return three times the configured cleanup interval ({@code getCleanupEverySeconds()}), in milliseconds
   */
  private long getObsoleteExpirationTime() {
    final long cleanupIntervalMillis = TimeUnit.SECONDS.toMillis(configuration.getCleanupEverySeconds());
    return cleanupIntervalMillis * 3;
  }

  /**
   * Processes every queued request cleanup. Cleanups are handled in parallel, each one while holding
   * the lock for its request id, and the totals of killed tasks and removed scheduled tasks are logged.
   */
  private void drainRequestCleanupQueue() {
    final long start = System.currentTimeMillis();

    final List<SingularityRequestCleanup> cleanupRequests = requestManager.getCleanupRequests();

    if (cleanupRequests.isEmpty()) {
      LOG.trace("Request cleanup queue is empty");
      return;
    }

    LOG.info("Cleaning up {} requests", cleanupRequests.size());

    // Atomic counters because they are incremented from the parallel stream below.
    AtomicInteger numTasksKilled = new AtomicInteger(0);
    AtomicInteger numScheduledTasksRemoved = new AtomicInteger(0);

    cleanupRequests.parallelStream().forEach((requestCleanup) -> {
      lock.runWithRequestLock(() -> {
        processRequestCleanup(start, numTasksKilled, numScheduledTasksRemoved, requestCleanup);
      }, requestCleanup.getRequestId(), String.format("%s#%s", getClass().getSimpleName(), "drainRequestCleanupQueue"));
    });

    LOG.info("Killed {} tasks (removed {} scheduled) in {}", numTasksKilled.get(), numScheduledTasksRemoved.get(), JavaUtils.duration(start));
  }

  /**
   * Handles a single request cleanup according to its cleanup type, then removes it from the queue.
   *
   * <p>PAUSING waits while the request is still active (until the cleanup becomes obsolete) and
   * otherwise creates task cleanups via {@code pause}. DELETING creates task cleanups while tasks are
   * still active and otherwise marks the request deleted and removes its data. BOUNCE and
   * INCREMENTAL_BOUNCE delegate to {@code bounce}. Afterwards active tasks are killed and pending tasks
   * deleted unless the branch above disabled that.
   *
   * @param start timestamp (millis) at which the drain began
   * @param numTasksKilled counter incremented for every active task killed here
   * @param numScheduledTasksRemoved counter incremented for every pending task deleted here
   * @param requestCleanup the cleanup to process
   */
  private void processRequestCleanup(long start, AtomicInteger numTasksKilled, AtomicInteger numScheduledTasksRemoved, SingularityRequestCleanup requestCleanup) {
    final String requestId = requestCleanup.getRequestId();
    final List<SingularityTaskId> activeTaskIds = taskManager.getActiveTaskIdsForRequest(requestId);
    final List<SingularityPendingTask> pendingTasks = taskManager.getPendingTasksForRequest(requestId);
    final Optional<SingularityRequestWithState> requestWithState = requestManager.getRequest(requestId);

    boolean killActiveTasks = requestCleanup.getKillTasks().or(configuration.isDefaultValueForKillTasksOfPausedRequests());
    boolean killScheduledTasks = true;

    switch (requestCleanup.getCleanupType()) {
      case PAUSING:
        if (SingularityRequestWithState.isActive(requestWithState)) {
          if (isObsolete(start, requestCleanup.getTimestamp())) {
            killScheduledTasks = false;
            killActiveTasks = false;
            LOG.info("Ignoring {}, because {} is {}", requestCleanup, requestCleanup.getRequestId(), requestWithState.get().getState());
          } else {
            // Leave the cleanup in the queue; it is re-examined on the next drain.
            LOG.debug("Waiting on {} (it will expire after {}), because {} is {}", requestCleanup, JavaUtils.durationFromMillis(getObsoleteExpirationTime()), requestCleanup.getRequestId(), requestWithState.get().getState());
            return;
          }
        } else {
          if (pause(requestCleanup, activeTaskIds) == TaskCleanupType.PAUSING) {
            killActiveTasks = false;
          }
        }
        break;
      case DELETING:
        if (!Iterables.isEmpty(activeTaskIds)) {
          killActiveTasks = false;
          killScheduledTasks = false;

          delete(requestCleanup, activeTaskIds);
        } else {
          Optional<SingularityRequestHistory> maybeHistory = requestHistoryHelper.getLastHistory(requestId);
          if (maybeHistory.isPresent()) {
            if (maybeHistory.get().getRequest().isLoadBalanced()
                && configuration.isDeleteRemovedRequestsFromLoadBalancer()
                && requestCleanup.getRemoveFromLoadBalancer().or(true)) {
              createLbCleanupRequest(requestId, activeTaskIds);
            }
            requestManager.markDeleted(maybeHistory.get().getRequest(), start, requestCleanup.getUser(), requestCleanup.getMessage());
          }
          cleanupRequestData(requestCleanup);
        }
        break;
      case BOUNCE:
      case INCREMENTAL_BOUNCE:
        killActiveTasks = false;
        killScheduledTasks = false;

        bounce(requestCleanup, activeTaskIds);
        break;
    }

    if (killActiveTasks) {
      for (SingularityTaskId matchingTaskId : activeTaskIds) {
        LOG.debug("Killing task {} due to {}", matchingTaskId, requestCleanup);
        scheduler.killAndRecord(matchingTaskId, requestCleanup.getCleanupType(), Optional.absent());
        numTasksKilled.getAndIncrement();
      }
    } else {
      LOG.info("Active tasks for {} not killed", requestCleanup);
    }

    if (killScheduledTasks) {
      for (SingularityPendingTask matchingTask : Iterables.filter(pendingTasks, SingularityPendingTask.matchingRequest(requestId))) {
        LOG.debug("Deleting scheduled task {} due to {}", matchingTask, requestCleanup);
        taskManager.deletePendingTask(matchingTask.getPendingTaskId());
        numScheduledTasksRemoved.getAndIncrement();
      }
    }

    requestManager.deleteCleanRequest(requestId, requestCleanup.getCleanupType());
  }

  /**
   * Saves a load balancer cleanup request for {@code requestId}, built from the request's in-use deploy.
   *
   * <p>If the in-use deploy id, the deploy itself, its load balancer groups or its service base path is
   * missing, nothing is saved and the exception notifier is told that there was insufficient data.
   *
   * @param requestId id of the request being removed from the load balancer
   * @param matchingActiveTaskIds active tasks whose ids are recorded on the cleanup request
   */
  private void createLbCleanupRequest(String requestId, Iterable<SingularityTaskId> matchingActiveTaskIds) {
    Optional<String> maybeCurrentDeployId = deployManager.getInUseDeployId(requestId);
    Optional<SingularityDeploy> maybeDeploy = Optional.absent();
    if (maybeCurrentDeployId.isPresent()) {
      maybeDeploy = deployManager.getDeploy(requestId, maybeCurrentDeployId.get());
      // Both LB fields are unwrapped below; if either is absent, report insufficient data instead of
      // failing inside Optional.get().
      if (maybeDeploy.isPresent()
          && maybeDeploy.get().getLoadBalancerGroups().isPresent()
          && maybeDeploy.get().getServiceBasePath().isPresent()) {
        List<String> taskIds = new ArrayList<>();
        for (SingularityTaskId taskId : matchingActiveTaskIds) {
          taskIds.add(taskId.getId());
        }
        requestManager.saveLbCleanupRequest(new SingularityRequestLbCleanup(requestId, maybeDeploy.get().getLoadBalancerGroups().get(), maybeDeploy.get().getServiceBasePath().get(), taskIds, Optional.absent()));
        return;
      }
    }
    exceptionNotifier.notify("Insufficient data to create LB request cleanup", ImmutableMap.of("requestId", requestId, "deployId", maybeCurrentDeployId.toString(), "deploy", maybeDeploy.toString()));
  }

  /**
   * Starts a bounce: creates a task cleanup for every active task of the bounced deploy (optionally
   * queueing a pre-kill shell command for each) and adds a BOUNCE pending request so new tasks start.
   * When no task matches, the bounce is marked complete and a matching expiring bounce is deleted.
   *
   * @param requestCleanup the bounce cleanup; its deploy id must be present
   * @param activeTaskIds active tasks to search for tasks of the bounced deploy
   * @throws IllegalStateException if the cleanup carries no deploy id (thrown by {@code Optional.get()})
   */
  private void bounce(SingularityRequestCleanup requestCleanup, final List<SingularityTaskId> activeTaskIds) {
    final long start = System.currentTimeMillis();

    // Unwrapped once up front. Every path through this method needs the deploy id, so a missing id
    // fails here before anything is written rather than part-way through.
    final String deployId = requestCleanup.getDeployId().get();

    final List<SingularityTaskId> matchingTaskIds = new ArrayList<>();

    for (SingularityTaskId activeTaskId : activeTaskIds) {
      if (activeTaskId.getRequestId().equals(requestCleanup.getRequestId()) && activeTaskId.getDeployId().equals(deployId)) {
        matchingTaskIds.add(activeTaskId);
      }
    }

    for (SingularityTaskId matchingTaskId : matchingTaskIds) {
      LOG.debug("Adding task {} to cleanup (bounce)", matchingTaskId.getId());

      Optional<SingularityTaskShellCommandRequestId> runBeforeKillId = Optional.absent();

      if (requestCleanup.getRunShellCommandBeforeKill().isPresent()) {
        SingularityTaskShellCommandRequest shellRequest = new SingularityTaskShellCommandRequest(matchingTaskId, requestCleanup.getUser(), System.currentTimeMillis(), requestCleanup.getRunShellCommandBeforeKill().get());
        taskManager.saveTaskShellCommandRequestToQueue(shellRequest);
        runBeforeKillId = Optional.of(shellRequest.getId());
      }

      taskManager.createTaskCleanup(new SingularityTaskCleanup(requestCleanup.getUser(), requestCleanup.getCleanupType().getTaskCleanupType().get(), start, matchingTaskId, requestCleanup.getMessage(), requestCleanup.getActionId(), runBeforeKillId));
    }

    if (matchingTaskIds.isEmpty()) {
      Optional<SingularityExpiringBounce> expiringBounce = requestManager.getExpiringBounce(requestCleanup.getRequestId());
      if (expiringBounce.isPresent() && expiringBounce.get().getDeployId().equalsIgnoreCase(deployId)) {
        LOG.info("No running tasks for request {}. Marking bounce {} complete and starting new tasks", expiringBounce.get().getRequestId(), expiringBounce.get());

        requestManager.deleteExpiringObject(SingularityExpiringBounce.class, requestCleanup.getRequestId());
      }
      requestManager.markBounceComplete(requestCleanup.getRequestId());
    }

    requestManager.addToPendingQueue(new SingularityPendingRequest(requestCleanup.getRequestId(), deployId, requestCleanup.getTimestamp(),
        requestCleanup.getUser(), PendingType.BOUNCE, Optional.absent(), Optional.absent(), requestCleanup.getSkipHealthchecks(), requestCleanup.getMessage(), requestCleanup.getActionId()));

    LOG.info("Added {} tasks for request {} to cleanup bounce queue in {}", matchingTaskIds.size(), requestCleanup.getRequestId(), JavaUtils.duration(start));
  }

  /**
   * Creates a task cleanup for every active task of a request that is being paused, optionally
   * queueing a pre-kill shell command for each task.
   *
   * @param requestCleanup the pause cleanup
   * @param activeTaskIds active tasks of the request
   * @return the cleanup type that was used: PAUSE when tasks are to be killed, PAUSING otherwise
   *         (always PAUSING when a pre-kill shell command is configured)
   */
  private TaskCleanupType pause(SingularityRequestCleanup requestCleanup, Iterable<SingularityTaskId> activeTaskIds) {
    final long start = System.currentTimeMillis();
    boolean killTasks = requestCleanup.getKillTasks().or(configuration.isDefaultValueForKillTasksOfPausedRequests());
    if (requestCleanup.getRunShellCommandBeforeKill().isPresent()) {
      // A shell command has to run first, so the tasks cannot be killed outright.
      killTasks = false;
    }

    TaskCleanupType cleanupType = killTasks ? TaskCleanupType.PAUSE : TaskCleanupType.PAUSING;

    for (SingularityTaskId taskId : activeTaskIds) {
      LOG.debug("Adding task {} to cleanup (pause)", taskId.getId());

      Optional<SingularityTaskShellCommandRequestId> runBeforeKillId = Optional.absent();

      if (requestCleanup.getRunShellCommandBeforeKill().isPresent()) {
        SingularityTaskShellCommandRequest shellRequest = new SingularityTaskShellCommandRequest(taskId, requestCleanup.getUser(), System.currentTimeMillis(), requestCleanup.getRunShellCommandBeforeKill().get());
        taskManager.saveTaskShellCommandRequestToQueue(shellRequest);
        runBeforeKillId = Optional.of(shellRequest.getId());
      }

      taskManager.createTaskCleanup(new SingularityTaskCleanup(requestCleanup.getUser(), cleanupType, start, taskId, requestCleanup.getMessage(), requestCleanup.getActionId(), runBeforeKillId));
    }

    return cleanupType;
  }

  /**
   * Saves a REQUEST_DELETING task cleanup for every active task of a request that is being deleted,
   * optionally queueing a pre-kill shell command for each task.
   *
   * @param requestCleanup the delete cleanup; its user, message, action id and remove-from-load-balancer
   *        flag are copied onto each task cleanup
   * @param activeTaskIds active tasks of the request
   */
  private void delete(SingularityRequestCleanup requestCleanup, Iterable<SingularityTaskId> activeTaskIds) {
    final long start = System.currentTimeMillis();

    for (SingularityTaskId taskId : activeTaskIds) {
      LOG.debug("Adding task {} to cleanup (delete)", taskId.getId());

      Optional<SingularityTaskShellCommandRequestId> runBeforeKillId = Optional.absent();

      if (requestCleanup.getRunShellCommandBeforeKill().isPresent()) {
        SingularityTaskShellCommandRequest shellRequest = new SingularityTaskShellCommandRequest(taskId, requestCleanup.getUser(), System.currentTimeMillis(), requestCleanup.getRunShellCommandBeforeKill().get());
        taskManager.saveTaskShellCommandRequestToQueue(shellRequest);
        runBeforeKillId = Optional.of(shellRequest.getId());
      }

      taskManager.saveTaskCleanup(new SingularityTaskCleanup(requestCleanup.getUser(), TaskCleanupType.REQUEST_DELETING, start, taskId, requestCleanup.getMessage(), requestCleanup.getActionId(), runBeforeKillId, requestCleanup.getRemoveFromLoadBalancer()));
    }
  }

  /**
   * Removes the data stored for the request named by {@code requestCleanup}: pending deploy, request
   * deploy state, the request id entries held by the task and deploy managers, and request utilization.
   *
   * @param requestCleanup the cleanup whose request id identifies the data to delete
   */
  private void cleanupRequestData(SingularityRequestCleanup requestCleanup) {
    final String requestId = requestCleanup.getRequestId();

    final SingularityDeleteResult pendingDeployDeletion = deployManager.deletePendingDeploy(requestId);
    final SingularityDeleteResult deployStateDeletion = deployManager.deleteRequestDeployState(requestId);
    LOG.trace("Deleted pendingDeploy ({}) and requestDeployState ({}) due to {}", pendingDeployDeletion, deployStateDeletion, requestCleanup);

    taskManager.deleteRequestId(requestId);
    deployManager.deleteRequestId(requestId);
    LOG.trace("Deleted stale request data for {}", requestId);

    usageManager.deleteRequestUtilization(requestId);
  }

  /**
   * Runs one full cleanup pass: request cleanups, task cleanups, load balancer task and request
   * cleanups (both fed from the same snapshot of LB cleanup tasks), then killed-task record checks.
   *
   * @return the value returned by {@code drainTaskCleanupQueue()}
   */
  public int drainCleanupQueue() {
    drainRequestCleanupQueue();
    int cleanupTasks = drainTaskCleanupQueue();

    final List<SingularityTaskId> lbCleanupTasks = taskManager.getLBCleanupTasks();
    drainLBTaskCleanupQueue(lbCleanupTasks);
    drainLBRequestCleanupQueue(lbCleanupTasks);

    checkKilledTaskIdRecords();
    return cleanupTasks;
  }

  /**
   * @param cleanupTask the cleanup entry to check
   * @return whether the task manager still reports the cleanup's task as active
   */
  private boolean isValidTask(SingularityTaskCleanup cleanupTask) {
    final String taskIdString = cleanupTask.getTaskId().getId();
    return taskManager.isActiveTask(taskIdString);
  }

  /**
   * Reviews the records of tasks that were already asked to die. Records are grouped by request id
   * and each group is processed under that request's lock. Records whose task is no longer active are
   * deleted; tasks still active after {@code askDriverToKillTasksAgainAfterMillis} are killed again;
   * the rest are left to wait.
   */
  private void checkKilledTaskIdRecords() {
    final long start = System.currentTimeMillis();
    final List<SingularityKilledTaskIdRecord> killedTaskIdRecords = taskManager.getKilledTaskIdRecords();

    if (killedTaskIdRecords.isEmpty()) {
      LOG.trace("No killed taskId records");
      return;
    }

    // Atomic counters because they are incremented from the parallel stream below.
    AtomicInteger obsolete = new AtomicInteger(0);
    AtomicInteger waiting = new AtomicInteger(0);
    AtomicInteger rekilled = new AtomicInteger(0);

    killedTaskIdRecords.stream()
        .collect(Collectors.groupingBy((record) -> record.getTaskId().getRequestId()))
        .entrySet().parallelStream()
        .forEach((killedTaskIdRecordsForRequest) -> {
          lock.runWithRequestLock(() -> {
            for (SingularityKilledTaskIdRecord killedTaskIdRecord : killedTaskIdRecordsForRequest.getValue()) {
              if (!taskManager.isActiveTask(killedTaskIdRecord.getTaskId().getId())) {
                SingularityDeleteResult deleteResult = taskManager.deleteKilledRecord(killedTaskIdRecord.getTaskId());

                LOG.debug("Deleting obsolete {} - {}", killedTaskIdRecord, deleteResult);

                obsolete.getAndIncrement();

                continue;
              }

              long duration = start - killedTaskIdRecord.getTimestamp();

              if (duration > configuration.getAskDriverToKillTasksAgainAfterMillis()) {
                LOG.info("{} is still active, and time since last kill {} is greater than configured (askDriverToKillTasksAgainAfterMillis) {} - asking driver to kill again",
                    killedTaskIdRecord, JavaUtils.durationFromMillis(duration), JavaUtils.durationFromMillis(configuration.getAskDriverToKillTasksAgainAfterMillis()));

                scheduler.killAndRecord(killedTaskIdRecord.getTaskId(), killedTaskIdRecord.getRequestCleanupType(),
                    killedTaskIdRecord.getTaskCleanupType(), Optional.of(killedTaskIdRecord.getOriginalTimestamp()), Optional.of(killedTaskIdRecord.getRetries()), Optional.absent());

                rekilled.getAndIncrement();
              } else {
                LOG.trace("Ignoring {}, because duration {} is less than configured (askDriverToKillTasksAgainAfterMillis) {}", killedTaskIdRecord, JavaUtils.durationFromMillis(duration),
                    JavaUtils.durationFromMillis(configuration.getAskDriverToKillTasksAgainAfterMillis()));

                waiting.getAndIncrement();
              }
            }
          }, killedTaskIdRecordsForRequest.getKey(), String.format("%s#%s", getClass().getSimpleName(), "checkKilledTaskIdRecords"));
        });

    LOG.info("{} obsolete, {} waiting, {} rekilled tasks based on {} killedTaskIdRecords", obsolete, waiting, rekilled, killedTaskIdRecords.size());
  }

  private int drainTaskCleanupQueue() {
    final long start = System.currentTimeMillis();

    final Map> cleanupTasks = taskManager.getCleanupTasks()
        .stream()
        .collect(Collectors.groupingBy((taskCleanup) -> taskCleanup.getTaskId().getRequestId()));

    if (cleanupTasks.isEmpty()) {
      LOG.trace("Task cleanup queue is empty");
      return 0;
    }

    AtomicInteger killedTasks = new AtomicInteger(0);
    cleanupTasks.entrySet()
        .parallelStream()
        .forEach((taskCleanupsForRequest) -> {
          lock.runWithRequestLock(() -> {
            processTaskCleanupsForRequest(taskCleanupsForRequest.getKey(), taskCleanupsForRequest.getValue(), killedTasks);
          }, taskCleanupsForRequest.getKey(), String.format("%s#%s", getClass().getSimpleName(), "drainTaskCleanupQueue"));
        });

    LOG.info("Killed {} tasks in {}", killedTasks, JavaUtils.duration(start));
    return cleanupTasks.size();
  }

  /**
   * Processes all task cleanups queued for one request. Cleanups whose task is no longer active are
   * deleted; tasks approved by both {@code shouldKillTask} and {@code checkLBStateAndShouldKillTask}
   * are killed and their cleanup deleted. After each cleanup,
   * {@code cleanupRequestIfNoRemainingTasks} is given the chance to re-enqueue the request deletion.
   *
   * @param requestId the request the cleanups belong to (used for logging)
   * @param cleanupTasks the task cleanups queued for that request
   * @param killedTasks counter incremented for every task killed here
   */
  private void processTaskCleanupsForRequest(String requestId, List<SingularityTaskCleanup> cleanupTasks, AtomicInteger killedTasks) {
    final Multiset<SingularityDeployKey> incrementalCleaningTasks = HashMultiset.create(cleanupTasks.size());
    final List<String> taskIdsForDeletedRequest = new ArrayList<>();
    boolean isRequestDeleting = false;

    // TODO - Better check for deleting request state
    final Set<SingularityTaskId> cleaningTasks = new HashSet<>(cleanupTasks.size());
    for (SingularityTaskCleanup cleanupTask : cleanupTasks) {
      cleaningTasks.add(cleanupTask.getTaskId());
      if (isIncrementalDeployCleanup(cleanupTask) || cleanupTask.getCleanupType() == TaskCleanupType.INCREMENTAL_BOUNCE) {
        incrementalCleaningTasks.add(SingularityDeployKey.fromTaskId(cleanupTask.getTaskId()));
      }
      if (cleanupTask.getCleanupType() == TaskCleanupType.REQUEST_DELETING) {
        taskIdsForDeletedRequest.add(cleanupTask.getTaskId().getId());
        isRequestDeleting = true;
      }
    }

    LOG.info("Cleaning up {} tasks for request {}", cleanupTasks.size(), requestId);

    final List<SingularityTaskId> activeTaskIds = taskManager.getActiveTaskIds();

    for (SingularityTaskCleanup cleanupTask : cleanupTasks) {
      SingularityTaskId taskId = cleanupTask.getTaskId();

      if (!isValidTask(cleanupTask)) {
        LOG.info("Couldn't find a matching active task for cleanup task {}, deleting..", cleanupTask);
        taskManager.deleteCleanupTask(taskId.getId());
      } else if (shouldKillTask(cleanupTask, activeTaskIds, cleaningTasks, incrementalCleaningTasks) && checkLBStateAndShouldKillTask(cleanupTask)) {
        scheduler.killAndRecord(taskId, cleanupTask.getCleanupType(), cleanupTask.getUser());
        taskManager.deleteCleanupTask(taskId.getId());

        killedTasks.getAndIncrement();
      }

      cleanupRequestIfNoRemainingTasks(cleanupTask, taskIdsForDeletedRequest, isRequestDeleting);
    }
  }

  /**
   * Removes the cleanup's task id from {@code taskIdsForDeletedRequest} and, once that list is empty
   * for a request that is being deleted, enqueues a new DELETING request cleanup.
   *
   * @param cleanupTask the task cleanup that was just processed
   * @param taskIdsForDeletedRequest ids of REQUEST_DELETING tasks not yet processed; mutated by this call
   * @param isRequestDeleting whether any cleanup for the request had type REQUEST_DELETING
   */
  private void cleanupRequestIfNoRemainingTasks(SingularityTaskCleanup cleanupTask, List<String> taskIdsForDeletedRequest, boolean isRequestDeleting) {
    String requestId = cleanupTask.getTaskId().getRequestId();

    // NOTE(review): the id is removed whether or not the task was actually killed by the caller -
    // confirm that re-enqueueing the request cleanup in that situation is intended.
    taskIdsForDeletedRequest.remove(cleanupTask.getTaskId().getId());
    if (taskIdsForDeletedRequest.isEmpty() && isRequestDeleting) {
      LOG.warn("All tasks for requestId {} are now killed, re-enqueueing request cleanup", requestId);
      requestManager.createCleanupRequest(
          new SingularityRequestCleanup(
              cleanupTask.getUser(), RequestCleanupType.DELETING, System.currentTimeMillis(),
              Optional.of(Boolean.TRUE), cleanupTask.getRemoveFromLoadBalancer(), requestId, Optional.absent(),
              Optional.absent(), cleanupTask.getMessage(), Optional.absent(), Optional.absent()));
    }
  }

  /**
   * Runs the load balancer check for the cleanup's task and translates the result into a
   * kill / don't-kill decision.
   *
   * @param cleanupTask the cleanup whose task should be checked
   * @return {@code true} for {@code DONE}, {@code NOT_LOAD_BALANCED}, {@code MISSING_TASK} and
   *     {@code LOAD_BALANCE_FAILED}; {@code false} for {@code RETRY} and {@code WAITING}
   */
  private boolean checkLBStateAndShouldKillTask(SingularityTaskCleanup cleanupTask) {
    final long startedAt = System.currentTimeMillis();
    final CheckLBState lbState = checkLbState(cleanupTask.getTaskId());

    LOG.debug("TaskCleanup {} had LB state {} after {}", cleanupTask, lbState, JavaUtils.duration(startedAt));

    switch (lbState) {
      case RETRY:
      case WAITING:
        // Load balancer work is still outstanding: keep the task alive for now.
        return false;
      case DONE:
      case NOT_LOAD_BALANCED:
      case MISSING_TASK:
      case LOAD_BALANCE_FAILED:
        return true;
      default:
        return false;
    }
  }

  /**
   * Result of checking (and, where necessary, advancing) a load balancer cleanup for a task or a
   * request. The declaration order of the constants is unchanged.
   */
  private enum CheckLBState {
    /** No load balancer ADD update is recorded for the task. */
    NOT_LOAD_BALANCED,
    /** The ADD update does not call for a removal, or the removal/delete ended in {@code INVALID_REQUEST_NOOP}. */
    LOAD_BALANCE_FAILED,
    /** The task could not be loaded when a removal had to be enqueued. */
    MISSING_TASK,
    /** The removal/delete is still in flight, or the post-removal grace period has not elapsed. */
    WAITING,
    /** The removal/delete succeeded. */
    DONE,
    /** The removal/delete failed or was canceled, or a request-level cleanup cannot run yet. */
    RETRY
  }

  /**
   * Decides from a task's load balancer update whether a removal should be pursued for it.
   *
   * @param taskId the task the update belongs to; only used for trace logging
   * @param loadBalancerUpdate the update whose state is inspected
   * @return {@code true} for {@code UNKNOWN}, {@code WAITING} and {@code SUCCESS}; {@code false}
   *     for {@code INVALID_REQUEST_NOOP} and, after a trace log, for every other state
   */
  private boolean shouldRemoveLbState(SingularityTaskId taskId, SingularityLoadBalancerUpdate loadBalancerUpdate) {
    final boolean shouldRemove;

    switch (loadBalancerUpdate.getLoadBalancerState()) {
      case SUCCESS:
      case WAITING:
      case UNKNOWN:
        shouldRemove = true;
        break;
      case INVALID_REQUEST_NOOP:
        // Nothing to remove: Baragon doesn't know about this task.
        shouldRemove = false;
        break;
      default:
        LOG.trace("Task {} had abnormal LB state {}", taskId, loadBalancerUpdate);
        shouldRemove = false;
        break;
    }

    return shouldRemove;
  }

  /**
   * Picks the load balancer request id to use for removing a task.
   *
   * @param taskId the task to be removed from the load balancer
   * @param lbRemoveUpdate the previously stored REMOVE update for the task, if there is one
   * @return a fresh REMOVE id without attempt number when no update exists; a REMOVE id with the
   *     previous attempt number plus one when the previous update is {@code FAILED} or
   *     {@code CANCELED}; otherwise the id carried by the previous update
   */
  private LoadBalancerRequestId getLoadBalancerRequestId(SingularityTaskId taskId, Optional lbRemoveUpdate) {
    if (lbRemoveUpdate.isPresent()) {
      switch (lbRemoveUpdate.get().getLoadBalancerState()) {
        case FAILED:
        case CANCELED:
          // The last attempt is finished and unsuccessful: move on to the next attempt number.
          return new LoadBalancerRequestId(
              taskId.getId(),
              LoadBalancerRequestType.REMOVE,
              Optional.of(lbRemoveUpdate.get().getLoadBalancerRequestId().getAttemptNumber() + 1));
        default:
          return lbRemoveUpdate.get().getLoadBalancerRequestId();
      }
    }

    return new LoadBalancerRequestId(taskId.getId(), LoadBalancerRequestType.REMOVE, Optional.absent());
  }

  /**
   * Tells whether a new load balancer request has to be sent.
   *
   * @param maybeLbUpdate the previously stored update for the operation, if there is one
   * @return {@code true} when there is no update yet or its state is {@code UNKNOWN},
   *     {@code FAILED} or {@code CANCELED}; {@code false} for every other state
   */
  private boolean shouldEnqueueLbRequest(Optional maybeLbUpdate) {
    if (maybeLbUpdate.isPresent()) {
      switch (maybeLbUpdate.get().getLoadBalancerState()) {
        case UNKNOWN:
        case FAILED:
        case CANCELED:
          return true;
        case CANCELING:
        case SUCCESS:
        case WAITING:
        case INVALID_REQUEST_NOOP:
          return false;
        default:
          return false;
      }
    }

    return true;
  }

  /**
   * Checks, and if needed advances, the load balancer removal of a single task.
   *
   * <p>Steps, in order: look up the task's ADD update (none means the task is not load balanced);
   * ask {@code shouldRemoveLbState} whether that update warrants a removal; look up the stored
   * REMOVE update and derive the request id to use; then either enqueue a new removal, poll the
   * state of one that is {@code WAITING}/{@code CANCELING}, or reuse the stored update. Updates
   * obtained from the load balancer client are saved through the task manager. Finally the
   * resulting state is mapped to a {@code CheckLBState}.
   *
   * @param taskId the task whose load balancer removal is checked
   * @return the outcome of the check
   */
  private CheckLBState checkLbState(SingularityTaskId taskId) {
    final Optional addUpdate = taskManager.getLoadBalancerState(taskId, LoadBalancerRequestType.ADD);

    if (!addUpdate.isPresent()) {
      return CheckLBState.NOT_LOAD_BALANCED;
    }
    if (!shouldRemoveLbState(taskId, addUpdate.get())) {
      return CheckLBState.LOAD_BALANCE_FAILED;
    }

    final Optional previousRemoveUpdate = taskManager.getLoadBalancerState(taskId, LoadBalancerRequestType.REMOVE);
    final LoadBalancerRequestId loadBalancerRequestId = getLoadBalancerRequestId(taskId, previousRemoveUpdate);

    final SingularityLoadBalancerUpdate lbRemoveUpdate;

    if (shouldEnqueueLbRequest(previousRemoveUpdate)) {
      final Optional task = taskManager.getTask(taskId);

      if (!task.isPresent()) {
        LOG.error("Missing task {}", taskId);
        return CheckLBState.MISSING_TASK;
      }

      lbRemoveUpdate = lbClient.enqueue(
          loadBalancerRequestId,
          task.get().getTaskRequest().getRequest(),
          task.get().getTaskRequest().getDeploy(),
          Collections.emptyList(),
          Collections.singletonList(task.get()));

      taskManager.saveLoadBalancerState(taskId, LoadBalancerRequestType.REMOVE, lbRemoveUpdate);
    } else {
      final BaragonRequestState previousState = previousRemoveUpdate.get().getLoadBalancerState();

      if (previousState == BaragonRequestState.WAITING || previousState == BaragonRequestState.CANCELING) {
        // A removal is already in flight: ask the load balancer client for its current state.
        lbRemoveUpdate = lbClient.getState(loadBalancerRequestId);

        taskManager.saveLoadBalancerState(taskId, LoadBalancerRequestType.REMOVE, lbRemoveUpdate);
      } else {
        lbRemoveUpdate = previousRemoveUpdate.get();
      }
    }

    final BaragonRequestState removeState = lbRemoveUpdate.getLoadBalancerState();

    switch (removeState) {
      case SUCCESS: {
        final long gracePeriodMillis = configuration.getLoadBalancerRemovalGracePeriodMillis();

        if (gracePeriodMillis > 0) {
          final long elapsedMillis = System.currentTimeMillis() - lbRemoveUpdate.getTimestamp();

          if (elapsedMillis < gracePeriodMillis) {
            LOG.trace("LB removal for {} succeeded - waiting at least {} to kill task (current duration {})", taskId,
                JavaUtils.durationFromMillis(gracePeriodMillis), JavaUtils.durationFromMillis(elapsedMillis));
            return CheckLBState.WAITING;
          }
        }

        return CheckLBState.DONE;
      }
      case FAILED:
      case CANCELED:
        LOG.error("LB removal request {} ({}) got unexpected response {}", lbRemoveUpdate, loadBalancerRequestId, removeState);
        // NOTE(review): the "addUpdate" entry actually carries the REMOVE update - confirm the key is intended.
        exceptionNotifier.notify("LB removal failed", ImmutableMap.of(
            "state", removeState.name(),
            "loadBalancerRequestId", loadBalancerRequestId.toString(),
            "addUpdate", lbRemoveUpdate.toString()));
        return CheckLBState.RETRY;
      case UNKNOWN:
      case CANCELING:
      case WAITING:
        LOG.trace("Waiting on LB cleanup request {} in state {}", loadBalancerRequestId, removeState);
        return CheckLBState.WAITING;
      case INVALID_REQUEST_NOOP:
        exceptionNotifier.notify("LB removal failed", ImmutableMap.of(
            "state", removeState.name(),
            "loadBalancerRequestId", loadBalancerRequestId.toString(),
            "addUpdate", lbRemoveUpdate.toString()));
        return CheckLBState.LOAD_BALANCE_FAILED;
      default:
        return CheckLBState.WAITING;
    }
  }

  /**
   * Processes the queue of tasks awaiting load balancer cleanup.
   *
   * <p>The task ids are grouped by request id and the groups are handed to a parallel stream; each
   * group is processed inside {@code lock.runWithRequestLock} for its request id. A task is
   * removed from the queue unless its check result is {@code WAITING} or {@code RETRY}.
   *
   * @param lbCleanupTasks the queued task ids, iterated as {@code SingularityTaskId}
   */
  private void drainLBTaskCleanupQueue(List lbCleanupTasks) {
    final long start = System.currentTimeMillis();

    if (lbCleanupTasks.isEmpty()) {
      LOG.trace("LB task cleanup queue is empty");
      return;
    }

    LOG.info("LB task cleanup queue had {} tasks", lbCleanupTasks.size());

    final AtomicInteger cleanedTasks = new AtomicInteger(0);
    final AtomicInteger ignoredTasks = new AtomicInteger(0);
    final String lockName = String.format("%s#%s", getClass().getSimpleName(), "drainLBTaskCleanupQueue");

    lbCleanupTasks.stream()
        .collect(Collectors.groupingBy(SingularityTaskId::getRequestId))
        .entrySet().parallelStream()
        .forEach(requestGroup -> lock.runWithRequestLock(() -> {
          for (SingularityTaskId queuedTaskId : requestGroup.getValue()) {
            final long checkStart = System.currentTimeMillis();
            final CheckLBState outcome = checkLbState(queuedTaskId);

            LOG.debug("LB cleanup for task {} had state {} after {}", queuedTaskId, outcome, JavaUtils.duration(checkStart));

            switch (outcome) {
              case DONE:
              case MISSING_TASK:
                cleanedTasks.incrementAndGet();
                break;
              case NOT_LOAD_BALANCED:
              case LOAD_BALANCE_FAILED:
                ignoredTasks.incrementAndGet();
                break;
              case WAITING:
              case RETRY:
                // Not finished yet: leave the task in the queue.
                continue;
            }

            taskManager.deleteLBCleanupTask(queuedTaskId);
          }
        }, requestGroup.getKey(), lockName));

    final int remaining = lbCleanupTasks.size() - (ignoredTasks.get() + cleanedTasks.get());
    LOG.info("LB cleaned {} tasks ({} left, {} obsolete) in {}", cleanedTasks, remaining, ignoredTasks, JavaUtils.duration(start));
  }

  /**
   * Processes the queue of requests awaiting load balancer cleanup.
   *
   * <p>The queued cleanups are fetched from the request manager and handed to a parallel stream;
   * each one is processed inside {@code lock.runWithRequestLock} for its request id. A cleanup is
   * removed from the queue unless its check result is {@code WAITING} or {@code RETRY}.
   *
   * @param lbCleanupTasks the task ids still queued for load balancer cleanup; passed on to
   *     {@code checkRequestLbState}
   */
  private void drainLBRequestCleanupQueue(List lbCleanupTasks) {
    final long start = System.currentTimeMillis();

    final List lbCleanupRequests = requestManager.getLbCleanupRequests();

    if (lbCleanupRequests.isEmpty()) {
      LOG.trace("LB request cleanup queue is empty");
      return;
    }

    LOG.info("LB request cleanup queue had {} requests", lbCleanupRequests.size());

    final AtomicInteger cleanedRequests = new AtomicInteger(0);
    final AtomicInteger ignoredRequests = new AtomicInteger(0);
    final String lockName = String.format("%s#%s", getClass().getSimpleName(), "drainLBRequestCleanupQueue");

    lbCleanupRequests.parallelStream().forEach(cleanup -> lock.runWithRequestLock(() -> {
      final long checkStart = System.currentTimeMillis();
      final CheckLBState outcome = checkRequestLbState(cleanup, lbCleanupTasks);

      LOG.debug("LB cleanup for request {} had state {} after {}", cleanup.getRequestId(), outcome, JavaUtils.duration(checkStart));

      switch (outcome) {
        case DONE:
        case MISSING_TASK:
          cleanedRequests.incrementAndGet();
          break;
        case NOT_LOAD_BALANCED:
        case LOAD_BALANCE_FAILED:
          ignoredRequests.incrementAndGet();
          break;
        case WAITING:
        case RETRY:
          // Not finished yet: leave the cleanup in the queue.
          return;
      }

      requestManager.deleteLbCleanupRequest(cleanup.getRequestId());
    }, cleanup.getRequestId(), lockName));

    final int remaining = lbCleanupRequests.size() - (ignoredRequests.get() + cleanedRequests.get());
    LOG.info("LB cleaned {} requests ({} left, {} obsolete) in {}", cleanedRequests, remaining, ignoredRequests, JavaUtils.duration(start));
  }

  /**
   * Checks the preconditions for running the load balancer cleanup of a request.
   *
   * @param cleanup the request-level load balancer cleanup under consideration
   * @param lbCleanupTasks task ids still queued for load balancer cleanup, iterated as
   *     {@code SingularityTaskId}
   * @return {@code false} when the request exists and is active, when any task id recorded on the
   *     cleanup is still an active task, or when a queued task cleanup belongs to the same
   *     request; {@code true} otherwise
   */
  private boolean canRunRequestLbCleanup(SingularityRequestLbCleanup cleanup, List lbCleanupTasks) {
    final String requestId = cleanup.getRequestId();

    final Optional existingRequest = requestManager.getRequest(requestId);
    if (existingRequest.isPresent() && SingularityRequestWithState.isActive(existingRequest)) {
      LOG.trace("Request is still active, will wait for request lb cleanup");
      return false;
    }

    for (String recordedTaskId : cleanup.getActiveTaskIds()) {
      if (taskManager.isActiveTask(recordedTaskId)) {
        LOG.trace("Request still has active tasks, will wait for lb request cleanup");
        return false;
      }
    }

    // Task-level cleanups of the same request have to drain first.
    for (SingularityTaskId queuedTaskId : lbCleanupTasks) {
      if (queuedTaskId.getRequestId().equals(requestId)) {
        LOG.trace("Waiting for task lb cleanup to finish before trying request lb cleanup for request {}", requestId);
        return false;
      }
    }

    return true;
  }

  /**
   * Checks, and if needed advances, the load balancer delete of a request.
   *
   * <p>When {@code canRunRequestLbCleanup} rejects the cleanup the result is {@code RETRY}.
   * Otherwise a delete is enqueued, or the state of one that is {@code WAITING}/{@code CANCELING}
   * is polled, or the update stored on the cleanup is reused. An update obtained from the load
   * balancer client is written back onto the cleanup and saved through the request manager. The
   * resulting state is then mapped to a {@code CheckLBState}.
   *
   * @param cleanup the request-level load balancer cleanup to check; may be mutated and re-saved
   * @param lbCleanupTasks task ids still queued for load balancer cleanup
   * @return the outcome of the check
   */
  private CheckLBState checkRequestLbState(SingularityRequestLbCleanup cleanup, List lbCleanupTasks) {
    if (!canRunRequestLbCleanup(cleanup, lbCleanupTasks)) {
      return CheckLBState.RETRY;
    }

    final Optional maybeDeleteUpdate = cleanup.getLoadBalancerUpdate();
    final LoadBalancerRequestId loadBalancerRequestId = getLoadBalancerRequestId(cleanup.getRequestId(), maybeDeleteUpdate);

    final boolean enqueueNew = shouldEnqueueLbRequest(maybeDeleteUpdate);
    // Short-circuits so the stored update is only dereferenced when nothing new is enqueued.
    final boolean pollExisting = !enqueueNew
        && (maybeDeleteUpdate.get().getLoadBalancerState() == BaragonRequestState.WAITING
            || maybeDeleteUpdate.get().getLoadBalancerState() == BaragonRequestState.CANCELING);

    final SingularityLoadBalancerUpdate lbDeleteUpdate;
    if (enqueueNew || pollExisting) {
      lbDeleteUpdate = enqueueNew
          ? lbClient.delete(loadBalancerRequestId, cleanup.getRequestId(), cleanup.getLoadBalancerGroups(), cleanup.getServiceBasePath())
          : lbClient.getState(loadBalancerRequestId);

      cleanup.setLoadBalancerUpdate(Optional.of(lbDeleteUpdate));
      requestManager.saveLbCleanupRequest(cleanup);
    } else {
      lbDeleteUpdate = maybeDeleteUpdate.get();
    }

    final BaragonRequestState deleteState = lbDeleteUpdate.getLoadBalancerState();

    switch (deleteState) {
      case SUCCESS:
        return CheckLBState.DONE;
      case FAILED:
      case CANCELED:
        LOG.error("LB delete request {} ({}) got unexpected response {}", lbDeleteUpdate, loadBalancerRequestId, deleteState);
        // NOTE(review): the "addUpdate" entry actually carries the DELETE update - confirm the key is intended.
        exceptionNotifier.notify(
            String.format("LB delete failed for %s", lbDeleteUpdate.getLoadBalancerRequestId().toString()),
            ImmutableMap.of(
                "state", deleteState.name(),
                "loadBalancerRequestId", loadBalancerRequestId.toString(),
                "addUpdate", lbDeleteUpdate.toString()));
        return CheckLBState.RETRY;
      case UNKNOWN:
      case CANCELING:
      case WAITING:
        LOG.trace("Waiting on LB delete request {} in state {}", loadBalancerRequestId, deleteState);
        return CheckLBState.WAITING;
      case INVALID_REQUEST_NOOP:
        exceptionNotifier.notify(
            String.format("LB delete failed for %s", lbDeleteUpdate.getLoadBalancerRequestId().toString()),
            ImmutableMap.of(
                "state", deleteState.name(),
                "loadBalancerRequestId", loadBalancerRequestId.toString(),
                "addUpdate", lbDeleteUpdate.toString()));
        return CheckLBState.LOAD_BALANCE_FAILED;
      default:
        return CheckLBState.WAITING;
    }
  }

  /**
   * Picks the load balancer request id to use for deleting a request from the load balancer.
   *
   * @param requestId the request being deleted; combined with the current time in millis to
   *     form newly generated ids
   * @param lbDeleteUpdate the previously stored DELETE update, if there is one
   * @return a fresh DELETE id without attempt number when no update exists; a fresh DELETE id
   *     with the previous attempt number plus one when the previous update is {@code FAILED} or
   *     {@code CANCELED}; otherwise the id carried by the previous update
   */
  private LoadBalancerRequestId getLoadBalancerRequestId(String requestId, Optional lbDeleteUpdate) {
    if (lbDeleteUpdate.isPresent()) {
      switch (lbDeleteUpdate.get().getLoadBalancerState()) {
        case FAILED:
        case CANCELED: {
          // The last attempt is finished and unsuccessful: new id, next attempt number.
          final String retryId = String.format("%s-%s", requestId, System.currentTimeMillis());
          return new LoadBalancerRequestId(
              retryId,
              LoadBalancerRequestType.DELETE,
              Optional.of(lbDeleteUpdate.get().getLoadBalancerRequestId().getAttemptNumber() + 1));
        }
        default:
          return lbDeleteUpdate.get().getLoadBalancerRequestId();
      }
    }

    final String firstId = String.format("%s-%s", requestId, System.currentTimeMillis());
    return new LoadBalancerRequestId(firstId, LoadBalancerRequestType.DELETE, Optional.absent());
  }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy