// All downloads are free. Search and download functionality uses the official Maven repository.

// com.hubspot.singularity.scheduler.SingularityUsagePoller — Maven / Gradle / Ivy

package com.hubspot.singularity.scheduler;

import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.locks.ReentrantLock;
import java.util.stream.Collectors;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Optional;
import com.google.common.util.concurrent.AtomicDouble;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
import com.google.inject.Inject;
import com.hubspot.mesos.Resources;
import com.hubspot.mesos.client.MesosClient;
import com.hubspot.mesos.json.MesosSlaveMetricsSnapshotObject;
import com.hubspot.mesos.json.MesosTaskMonitorObject;
import com.hubspot.singularity.ExtendedTaskState;
import com.hubspot.singularity.InvalidSingularityTaskIdException;
import com.hubspot.singularity.RequestUtilization;
import com.hubspot.singularity.SingularityClusterUtilization;
import com.hubspot.singularity.SingularityDeleteResult;
import com.hubspot.singularity.SingularityDeploy;
import com.hubspot.singularity.SingularityPendingRequest;
import com.hubspot.singularity.SingularityPendingRequest.PendingType;
import com.hubspot.singularity.SingularityRequestWithState;
import com.hubspot.singularity.SingularitySlave;
import com.hubspot.singularity.SingularitySlaveUsage;
import com.hubspot.singularity.SingularityTask;
import com.hubspot.singularity.SingularityTaskCleanup;
import com.hubspot.singularity.SingularityTaskCurrentUsage;
import com.hubspot.singularity.SingularityTaskHistoryUpdate;
import com.hubspot.singularity.SingularityTaskId;
import com.hubspot.singularity.SingularityTaskUsage;
import com.hubspot.singularity.TaskCleanupType;
import com.hubspot.singularity.async.AsyncSemaphore;
import com.hubspot.singularity.async.CompletableFutures;
import com.hubspot.singularity.config.SingularityConfiguration;
import com.hubspot.singularity.data.DeployManager;
import com.hubspot.singularity.data.RequestManager;
import com.hubspot.singularity.data.TaskManager;
import com.hubspot.singularity.data.UsageManager;
import com.hubspot.singularity.sentry.SingularityExceptionNotifier;

public class SingularityUsagePoller extends SingularityLeaderOnlyPoller {

  private static final Logger LOG = LoggerFactory.getLogger(SingularityUsagePoller.class);
  // Number of seconds in one day.
  private static final long DAY_IN_SECONDS = TimeUnit.DAYS.toSeconds(1);

  private final SingularityConfiguration configuration;
  private final MesosClient mesosClient;
  private final UsageManager usageManager;
  private final SingularityUsageHelper usageHelper;
  private final SingularityExceptionNotifier exceptionNotifier;
  private final RequestManager requestManager;
  private final DeployManager deployManager;
  private final TaskManager taskManager;

  // NOTE(review): AsyncSemaphore is probably generic and may have lost a type argument here - confirm against its declaration.
  private final AsyncSemaphore usageCollectionSemaphore;
  // Executor on which the per-slave usage collection runs.
  private final ExecutorService usageExecutor;
  // One lock per request id, created on first use by runWithRequestLock.
  private final ConcurrentHashMap<String, ReentrantLock> requestLocks;

  /**
   * Creates the poller, scheduled at the interval given by
   * {@code configuration.getCheckUsageEveryMillis()}.
   *
   * <p>Besides storing its collaborators, the constructor builds the cached thread pool used for
   * usage collection (threads named {@code usage-collection-N}), the semaphore that limits how many
   * collections run at once, and the empty per-request lock map.
   */
  @Inject
  SingularityUsagePoller(SingularityConfiguration configuration,
                         SingularityUsageHelper usageHelper,
                         UsageManager usageManager,
                         MesosClient mesosClient,
                         SingularityExceptionNotifier exceptionNotifier,
                         RequestManager requestManager,
                         DeployManager deployManager,
                         TaskManager taskManager) {
    super(configuration.getCheckUsageEveryMillis(), TimeUnit.MILLISECONDS);

    // Injected collaborators.
    this.configuration = configuration;
    this.mesosClient = mesosClient;
    this.exceptionNotifier = exceptionNotifier;
    this.usageManager = usageManager;
    this.usageHelper = usageHelper;
    this.taskManager = taskManager;
    this.deployManager = deployManager;
    this.requestManager = requestManager;

    // Concurrency primitives owned by this poller.
    this.requestLocks = new ConcurrentHashMap<>();
    this.usageExecutor = Executors.newCachedThreadPool(
        new ThreadFactoryBuilder()
            .setNameFormat("usage-collection-%d")
            .build());
    this.usageCollectionSemaphore = AsyncSemaphore
        .newBuilder(configuration::getMaxConcurrentUsageCollections)
        .build();
  }

  /**
   * Runs one usage poll: collects usage for every tracked slave in parallel, waits for all
   * collections to finish, then saves the cluster-wide and per-request utilization. When
   * configured, tasks on overloaded hosts are shuffled afterwards.
   */
  @Override
  public void runActionOnPoll() {
    // Shared across the collection threads, hence the concurrent map and atomic accumulators.
    Map<String, RequestUtilization> utilizationPerRequestId = new ConcurrentHashMap<>();
    Map<String, RequestUtilization> previousUtilizations = usageManager.getRequestUtilizations();
    final long now = System.currentTimeMillis();

    AtomicLong totalMemBytesUsed = new AtomicLong(0);
    AtomicLong totalMemBytesAvailable = new AtomicLong(0);
    AtomicDouble totalCpuUsed = new AtomicDouble(0.00);
    AtomicDouble totalCpuAvailable = new AtomicDouble(0.00);
    AtomicLong totalDiskBytesUsed = new AtomicLong(0);
    AtomicLong totalDiskBytesAvailable = new AtomicLong(0);

    Map<SingularitySlaveUsage, List<TaskIdWithUsage>> overLoadedHosts = new ConcurrentHashMap<>();

    List<CompletableFuture<Void>> usageFutures = new ArrayList<>();

    usageHelper.getSlavesToTrackUsageFor().forEach((slave) -> {
      // The semaphore bounds how many slave collections are in flight at the same time.
      usageFutures.add(usageCollectionSemaphore.call(() ->
          CompletableFuture.runAsync(() -> {
            collectSlaveUage(slave, now, utilizationPerRequestId, previousUtilizations, overLoadedHosts, totalMemBytesUsed, totalMemBytesAvailable,
                totalCpuUsed, totalCpuAvailable, totalDiskBytesUsed, totalDiskBytesAvailable);
          }, usageExecutor)
      ));
    });

    // Block until every slave has been processed so the totals below are complete.
    CompletableFutures.allOf(usageFutures).join();

    usageManager.saveClusterUtilization(
        getClusterUtilization(utilizationPerRequestId, totalMemBytesUsed.get(), totalMemBytesAvailable.get(), totalCpuUsed.get(), totalCpuAvailable.get(), totalDiskBytesUsed.get(), totalDiskBytesAvailable
            .get(), now));
    utilizationPerRequestId.values().forEach(usageManager::saveRequestUtilization);

    if (configuration.isShuffleTasksForOverloadedSlaves()) {
      shuffleTasksOnOverloadedHosts(overLoadedHosts);
    }
  }

  /**
   * Runs {@code function} while holding the lock that belongs to {@code requestId}.
   *
   * <p>The lock is created the first time a request id is seen and is always released, even when
   * {@code function} throws.
   *
   * @param function work to execute under the lock
   * @param requestId id of the request whose lock guards the work
   */
  public void runWithRequestLock(Runnable function, String requestId) {
    final ReentrantLock requestLock = requestLocks.computeIfAbsent(requestId, (id) -> new ReentrantLock());

    requestLock.lock();
    try {
      function.run();
    } finally {
      requestLock.unlock();
    }
  }

  private void collectSlaveUage(SingularitySlave slave,
                                long now,
                                Map utilizationPerRequestId,
                                Map previousUtilizations,
                                Map> overLoadedHosts,
                                AtomicLong totalMemBytesUsed,
                                AtomicLong totalMemBytesAvailable,
                                AtomicDouble totalCpuUsed,
                                AtomicDouble totalCpuAvailable,
                                AtomicLong totalDiskBytesUsed,
                                AtomicLong totalDiskBytesAvailable) {
    Optional memoryMbTotal = Optional.absent();
    Optional cpusTotal = Optional.absent();
    Optional diskMbTotal = Optional.absent();

    long memoryMbReservedOnSlave = 0;
    double cpuReservedOnSlave = 0;
    long diskMbReservedOnSlave = 0;

    long memoryBytesUsedOnSlave = 0;
    double cpusUsedOnSlave = 0;
    long diskMbUsedOnSlave = 0;

    try {
      List allTaskUsage = mesosClient.getSlaveResourceUsage(slave.getHost());
      MesosSlaveMetricsSnapshotObject slaveMetricsSnapshot = mesosClient.getSlaveMetricsSnapshot(slave.getHost());
      double systemMemTotalBytes = 0;
      double systemMemFreeBytes = 0;
      double systemLoad1Min = 0;
      double systemLoad5Min = 0;
      double systemLoad15Min = 0;
      double slaveDiskUsed = 0;
      double slaveDiskTotal = 0;
      double systemCpusTotal = 0;
      if (slaveMetricsSnapshot != null) {
        systemMemTotalBytes = slaveMetricsSnapshot.getSystemMemTotalBytes();
        systemMemFreeBytes = slaveMetricsSnapshot.getSystemMemFreeBytes();
        systemLoad1Min = slaveMetricsSnapshot.getSystemLoad1Min();
        systemLoad5Min = slaveMetricsSnapshot.getSystemLoad5Min();
        systemLoad15Min = slaveMetricsSnapshot.getSystemLoad15Min();
        slaveDiskUsed = slaveMetricsSnapshot.getSlaveDiskUsed();
        slaveDiskTotal = slaveMetricsSnapshot.getSlaveDiskTotal();
        systemCpusTotal = slaveMetricsSnapshot.getSystemCpusTotal();
      }

      double systemLoad;
      switch (configuration.getMesosConfiguration().getScoreUsingSystemLoad()) {
        case LOAD_1:
          systemLoad = systemLoad1Min;
          break;
        case LOAD_15:
          systemLoad = systemLoad15Min;
          break;
        case LOAD_5:
        default:
          systemLoad = systemLoad5Min;
          break;
      }

      boolean slaveOverloaded = systemCpusTotal > 0 && systemLoad / systemCpusTotal > 1.0;
      List possibleTasksToShuffle = new ArrayList<>();

      for (MesosTaskMonitorObject taskUsage : allTaskUsage) {
        String taskId = taskUsage.getSource();
        SingularityTaskId task;
        try {
          task = SingularityTaskId.valueOf(taskId);
        } catch (InvalidSingularityTaskIdException e) {
          LOG.error("Couldn't get SingularityTaskId for {}", taskUsage);
          continue;
        }

        SingularityTaskUsage latestUsage = getUsage(taskUsage);
        List pastTaskUsages = usageManager.getTaskUsage(taskId);


        clearOldUsage(taskId);
        usageManager.saveSpecificTaskUsage(taskId, latestUsage);

        Optional maybeTask = taskManager.getTask(task);
        Optional maybeResources = Optional.absent();
        if (maybeTask.isPresent()) {
          maybeResources = maybeTask.get().getTaskRequest().getPendingTask().getResources().or(maybeTask.get().getTaskRequest().getDeploy().getResources());
          if (maybeResources.isPresent()) {
            Resources taskResources = maybeResources.get();
            double memoryMbReservedForTask = taskResources.getMemoryMb();
            double cpuReservedForTask = taskResources.getCpus();
            double diskMbReservedForTask = taskResources.getDiskMb();

            memoryMbReservedOnSlave += memoryMbReservedForTask;
            cpuReservedOnSlave += cpuReservedForTask;
            diskMbReservedOnSlave += diskMbReservedForTask;

            runWithRequestLock(() -> updateRequestUtilization(utilizationPerRequestId, previousUtilizations, pastTaskUsages, latestUsage, task, memoryMbReservedForTask, cpuReservedForTask, diskMbReservedForTask), task.getRequestId());
          }
        }
        memoryBytesUsedOnSlave += latestUsage.getMemoryTotalBytes();
        diskMbUsedOnSlave += latestUsage.getDiskTotalBytes();

        SingularityTaskCurrentUsage currentUsage = null;
        if (pastTaskUsages.isEmpty()) {
          Optional maybeStartingUpdate = taskManager.getTaskHistoryUpdate(task, ExtendedTaskState.TASK_STARTING);
          if (maybeStartingUpdate.isPresent()) {
            long startTimestampSeconds = TimeUnit.MILLISECONDS.toSeconds(maybeStartingUpdate.get().getTimestamp());
            double usedCpusSinceStart = latestUsage.getCpuSeconds() / (latestUsage.getTimestamp() - startTimestampSeconds);
            currentUsage = new SingularityTaskCurrentUsage(latestUsage.getMemoryTotalBytes(), now, usedCpusSinceStart, latestUsage.getDiskTotalBytes());
            usageManager.saveCurrentTaskUsage(taskId, currentUsage);

            cpusUsedOnSlave += usedCpusSinceStart;
          }
        } else {
          SingularityTaskUsage lastUsage = pastTaskUsages.get(pastTaskUsages.size() - 1);

          double taskCpusUsed = ((latestUsage.getCpuSeconds() - lastUsage.getCpuSeconds()) / (latestUsage.getTimestamp() - lastUsage.getTimestamp()));

          currentUsage = new SingularityTaskCurrentUsage(latestUsage.getMemoryTotalBytes(), now, taskCpusUsed, latestUsage.getDiskTotalBytes());
          usageManager.saveCurrentTaskUsage(taskId, currentUsage);
          cpusUsedOnSlave += taskCpusUsed;
        }

        if (configuration.isShuffleTasksForOverloadedSlaves() && currentUsage != null && currentUsage.getCpusUsed() > 0) {
          if (isEligibleForShuffle(task)) {
            Optional maybeCleanupUpdate = taskManager.getTaskHistoryUpdate(task, ExtendedTaskState.TASK_CLEANING);
            if (maybeCleanupUpdate.isPresent() && isTaskAlreadyCleanedUpForShuffle(maybeCleanupUpdate.get())) {
              LOG.trace("Task {} already being cleaned up to spread cpu usage, skipping", taskId);
            } else {
              if (maybeResources.isPresent()) {
                possibleTasksToShuffle.add(new TaskIdWithUsage(task, maybeResources.get(), currentUsage));
              }
            }
          }
        }
      }

      if (!slave.getResources().isPresent() ||
          !slave.getResources().get().getMemoryMegaBytes().isPresent() ||
          !slave.getResources().get().getNumCpus().isPresent()) {
        LOG.debug("Could not find slave or resources for slave {}", slave.getId());
      } else {
        memoryMbTotal = Optional.of(slave.getResources().get().getMemoryMegaBytes().get().longValue());
        cpusTotal = Optional.of(slave.getResources().get().getNumCpus().get().doubleValue());
        diskMbTotal = Optional.of(slave.getResources().get().getDiskSpace().get());
      }

      SingularitySlaveUsage slaveUsage = new SingularitySlaveUsage(cpusUsedOnSlave, cpuReservedOnSlave, cpusTotal, memoryBytesUsedOnSlave, memoryMbReservedOnSlave,
          memoryMbTotal, diskMbUsedOnSlave, diskMbReservedOnSlave, diskMbTotal, allTaskUsage.size(), now,
          systemMemTotalBytes, systemMemFreeBytes, systemCpusTotal, systemLoad1Min, systemLoad5Min, systemLoad15Min, slaveDiskUsed, slaveDiskTotal);

      if (slaveOverloaded) {
        overLoadedHosts.put(slaveUsage, possibleTasksToShuffle);
      }

      List slaveTimestamps = usageManager.getSlaveUsageTimestamps(slave.getId());
      if (slaveTimestamps.size() + 1 > configuration.getNumUsageToKeep()) {
        usageManager.deleteSpecificSlaveUsage(slave.getId(), slaveTimestamps.get(0));
      }

      if (slaveUsage.getMemoryBytesTotal().isPresent() && slaveUsage.getCpusTotal().isPresent()) {
        totalMemBytesUsed.getAndAdd(slaveUsage.getMemoryBytesUsed());
        totalCpuUsed.getAndAdd(slaveUsage.getCpusUsed());
        totalDiskBytesUsed.getAndAdd(slaveUsage.getDiskBytesUsed());

        totalMemBytesAvailable.getAndAdd(slaveUsage.getMemoryBytesTotal().get());
        totalCpuAvailable.getAndAdd(slaveUsage.getCpusTotal().get());
        totalDiskBytesAvailable.getAndAdd(slaveUsage.getDiskBytesTotal().get());
      }

      LOG.debug("Saving slave {} usage {}", slave.getHost(), slaveUsage);
      usageManager.saveSpecificSlaveUsageAndSetCurrent(slave.getId(), slaveUsage);
    } catch (Throwable t) {
      String message = String.format("Could not get slave usage for host %s", slave.getHost());
      LOG.error(message, t);
      exceptionNotifier.notify(message, t);
    }
  }

  /**
   * Decides whether a task may be shuffled away from an overloaded host.
   *
   * <p>A task is eligible when its request is not on the do-not-shuffle list, the request is long
   * running, and either the shuffle delay is disabled (configured as 0 minutes) or the task has
   * been running for at least the configured number of minutes.
   *
   * @param task id of the task to check
   * @return {@code true} when the task may be shuffled
   */
  private boolean isEligibleForShuffle(SingularityTaskId task) {
    if (configuration.getDoNotShuffleRequests().contains(task.getRequestId()) || !isLongRunning(task)) {
      return false;
    }

    if (configuration.getMinutesBeforeNewTaskEligibleForShuffle() == 0) {
      // Shuffle delay is disabled entirely
      return true;
    }

    // Only look up the history once the cheaper checks have passed.
    Optional<SingularityTaskHistoryUpdate> taskRunning = taskManager.getTaskHistoryUpdate(task, ExtendedTaskState.TASK_RUNNING);
    if (!taskRunning.isPresent()) {
      return false;
    }

    long minutesRunning = TimeUnit.MILLISECONDS.toMinutes(System.currentTimeMillis() - taskRunning.get().getTimestamp());
    return minutesRunning >= configuration.getMinutesBeforeNewTaskEligibleForShuffle();
  }

  private void shuffleTasksOnOverloadedHosts(Map> overLoadedHosts) {
    List shuffleCleanups = taskManager.getCleanupTasks()
        .stream()
        .filter((taskCleanup) -> taskCleanup.getCleanupType() == TaskCleanupType.REBALANCE_CPU_USAGE)
        .collect(Collectors.toList());
    long currentShuffleCleanupsTotal = shuffleCleanups.size();
    Set requestsWithShuffledTasks = shuffleCleanups
        .stream()
        .map((taskCleanup) -> taskCleanup.getTaskId().getRequestId())
        .collect(Collectors.toSet());

    List overLoadedSlavesByUsage = overLoadedHosts.keySet().stream()
        .sorted((usage1, usage2) -> Double.compare(
            getSystemLoadForShuffle(usage2),
            getSystemLoadForShuffle(usage1)
        ))
        .collect(Collectors.toList());
    for (SingularitySlaveUsage overloadedSlave : overLoadedSlavesByUsage) {
      if (currentShuffleCleanupsTotal >= configuration.getMaxTasksToShuffleTotal()) {
        LOG.debug("Not shuffling any more tasks (totalShuffleCleanups: {})", currentShuffleCleanupsTotal);
        break;
      }
      int shuffledTasksOnSlave = 0;
      List possibleTasksToShuffle = overLoadedHosts.get(overloadedSlave);
      possibleTasksToShuffle.sort((u1, u2) ->
          Double.compare(
              u2.getUsage().getCpusUsed() / u2.getRequestedResources().getCpus(),
              u1.getUsage().getCpusUsed() / u1.getRequestedResources().getCpus()
          ));

      double systemLoad = getSystemLoadForShuffle(overloadedSlave);
      double cpuOverage = systemLoad - overloadedSlave.getSystemCpusTotal();

      for (TaskIdWithUsage taskIdWithUsage : possibleTasksToShuffle) {
        if (requestsWithShuffledTasks.contains(taskIdWithUsage.getTaskId().getRequestId())) {
          LOG.debug("Request {} already has a shuffling task, skipping", taskIdWithUsage.getTaskId().getRequestId());
          continue;
        }
        if (cpuOverage <= 0 || shuffledTasksOnSlave > configuration.getMaxTasksToShufflePerHost() || currentShuffleCleanupsTotal >= configuration.getMaxTasksToShuffleTotal()) {
          LOG.debug("Not shuffling any more tasks (overage: {}, shuffledOnHost: {}, totalShuffleCleanups: {})", cpuOverage, shuffledTasksOnSlave, currentShuffleCleanupsTotal);
          break;
        }
        LOG.debug("Cleaning up task {} to free up cpu on overloaded host (remaining cpu overage: {})", taskIdWithUsage.getTaskId(), cpuOverage);
        Optional message = Optional.of(String.format(
            "Load on slave is %s / %s, shuffling task using %s / %s to less busy host",
            systemLoad,
            overloadedSlave.getSystemCpusTotal(),
            taskIdWithUsage.getUsage().getCpusUsed(),
            taskIdWithUsage.getRequestedResources().getCpus()));
        taskManager.createTaskCleanup(
            new SingularityTaskCleanup(
                Optional.absent(),
                TaskCleanupType.REBALANCE_CPU_USAGE,
                System.currentTimeMillis(),
                taskIdWithUsage.getTaskId(),
                message,
                Optional.of(UUID.randomUUID().toString()),
                Optional.absent(), Optional.absent()));
        requestManager.addToPendingQueue(new SingularityPendingRequest(taskIdWithUsage.getTaskId().getRequestId(), taskIdWithUsage.getTaskId()
            .getDeployId(), System.currentTimeMillis(), Optional.absent(),
            PendingType.TASK_BOUNCE, Optional.absent(), Optional.absent(), Optional.absent(), message, Optional.of(UUID.randomUUID().toString())));
        cpuOverage -= taskIdWithUsage.getUsage().getCpusUsed();
        shuffledTasksOnSlave++;
        currentShuffleCleanupsTotal++;
        requestsWithShuffledTasks.add(taskIdWithUsage.getTaskId().getRequestId());
      }
    }
  }

  /**
   * Returns the load average of {@code usage} that matches the configured scoring window
   * (1, 5 or 15 minutes); the 5 minute average is the fallback.
   *
   * @param usage slave usage holding the recorded load averages
   * @return the load average selected by {@code getScoreUsingSystemLoad()}
   */
  private double getSystemLoadForShuffle(SingularitySlaveUsage usage) {
    switch (configuration.getMesosConfiguration().getScoreUsingSystemLoad()) {
      case LOAD_1:
        // Previously returned the 15 minute average here, which ignored the LOAD_1 setting.
        return usage.getSystemLoad1Min();
      case LOAD_15:
        return usage.getSystemLoad15Min();
      case LOAD_5:
      default:
        return usage.getSystemLoad5Min();
    }
  }

  /**
   * Reports whether the given history update, or any of its previous updates, carries a status
   * message containing the name of the {@code REBALANCE_CPU_USAGE} cleanup type.
   *
   * @param taskHistoryUpdate update to inspect; an absent status message is treated as empty
   * @return {@code true} when such a message is found
   */
  private boolean isTaskAlreadyCleanedUpForShuffle(SingularityTaskHistoryUpdate taskHistoryUpdate) {
    boolean mentionsRebalance = taskHistoryUpdate.getStatusMessage().or("").contains(TaskCleanupType.REBALANCE_CPU_USAGE.name());

    if (!mentionsRebalance) {
      // Fall back to the earlier updates attached to this one.
      for (SingularityTaskHistoryUpdate earlierUpdate : taskHistoryUpdate.getPrevious()) {
        if (earlierUpdate.getStatusMessage().or("").contains(TaskCleanupType.REBALANCE_CPU_USAGE.name())) {
          mentionsRebalance = true;
          break;
        }
      }
    }

    return mentionsRebalance;
  }

  /**
   * Converts a Mesos task monitor entry into a {@link SingularityTaskUsage}.
   *
   * <p>The cpu-seconds argument is the sum of the system and user cpu time reported in the
   * statistics; every other argument is passed through from the statistics unchanged.
   *
   * @param taskUsage monitor entry whose statistics are read
   * @return a new usage sample built from those statistics
   */
  private SingularityTaskUsage getUsage(MesosTaskMonitorObject taskUsage) {
    return new SingularityTaskUsage(
        taskUsage.getStatistics().getMemTotalBytes(),
        taskUsage.getStatistics().getTimestamp(),
        // total cpu time = system time + user time
        taskUsage.getStatistics().getCpusSystemTimeSecs() + taskUsage.getStatistics().getCpusUserTimeSecs(),
        taskUsage.getStatistics().getDiskUsedBytes(),
        taskUsage.getStatistics().getCpusNrPeriods(),
        taskUsage.getStatistics().getCpusNrThrottled(),
        taskUsage.getStatistics().getCpusThrottledTimeSecs());
  }

  /**
   * Reports whether the request that owns {@code task} has a long-running request type.
   *
   * @param task id of the task whose request is looked up
   * @return the request type's long-running flag, or {@code false} (with a warning logged) when the
   *     request cannot be found
   */
  private boolean isLongRunning(SingularityTaskId task) {
    Optional<SingularityRequestWithState> request = requestManager.getRequest(task.getRequestId());
    if (!request.isPresent()) {
      LOG.warn("Couldn't find request id {} for task {}", task.getRequestId(), task.getId());
      return false;
    }

    return request.get().getRequest().getRequestType().isLongRunning();
  }

  /**
   * Folds one task's usage history into the utilization of its request and stores the result in
   * {@code utilizationPerRequestId}.
   *
   * <p>Usage is evaluated per interval between consecutive samples; the first interval starts at a
   * synthetic zero-usage sample placed at the task's start time. Max/min values recorded by an
   * earlier poll are carried forward while they are less than a day old.
   *
   * @param utilizationPerRequestId utilization being accumulated during the current poll
   * @param previousUtilizations utilization saved by an earlier poll, keyed by request id
   * @param pastTaskUsages usage samples already stored for the task
   * @param latestUsage sample taken during the current poll
   * @param task id of the task the samples belong to
   * @param memoryMbReservedForTask memory reserved for the task, in MB
   * @param cpuReservedForTask cpus reserved for the task
   * @param diskMbReservedForTask disk reserved for the task, in MB
   */
  private void updateRequestUtilization(Map<String, RequestUtilization> utilizationPerRequestId,
                                        Map<String, RequestUtilization> previousUtilizations,
                                        List<SingularityTaskUsage> pastTaskUsages,
                                        SingularityTaskUsage latestUsage,
                                        SingularityTaskId task,
                                        double memoryMbReservedForTask,
                                        double cpuReservedForTask,
                                        double diskMbReservedForTask) {
    String requestId = task.getRequestId();
    RequestUtilization newRequestUtilization = utilizationPerRequestId.getOrDefault(requestId, new RequestUtilization(requestId, task.getDeployId()));
    RequestUtilization previous = previousUtilizations.get(requestId);
    // Take the previous request utilization into account to better measure 24 hour max/min values.
    // A previous value is only adopted when it is still within the last day AND more extreme than
    // what this poll has already recorded, so handling a second task of the same request cannot
    // overwrite a max/min set by the first one.
    if (previous != null) {
      long nowSeconds = TimeUnit.MILLISECONDS.toSeconds(System.currentTimeMillis());

      if (isWithinLastDay(previous.getMaxMemTimestamp(), nowSeconds) && previous.getMaxMemBytesUsed() > newRequestUtilization.getMaxMemBytesUsed()) {
        newRequestUtilization.setMaxMemBytesUsed(previous.getMaxMemBytesUsed());
        newRequestUtilization.setMaxMemTimestamp(previous.getMaxMemTimestamp());
      }
      if (isWithinLastDay(previous.getMinMemTimestamp(), nowSeconds) && previous.getMinMemBytesUsed() < newRequestUtilization.getMinMemBytesUsed()) {
        newRequestUtilization.setMinMemBytesUsed(previous.getMinMemBytesUsed());
        newRequestUtilization.setMinMemTimestamp(previous.getMinMemTimestamp());
      }
      if (isWithinLastDay(previous.getMaxCpusTimestamp(), nowSeconds) && previous.getMaxCpuUsed() > newRequestUtilization.getMaxCpuUsed()) {
        newRequestUtilization.setMaxCpuUsed(previous.getMaxCpuUsed());
        newRequestUtilization.setMaxCpusTimestamp(previous.getMaxCpusTimestamp());
      }
      if (isWithinLastDay(previous.getMinCpusTimestamp(), nowSeconds) && previous.getMinCpuUsed() < newRequestUtilization.getMinCpuUsed()) {
        newRequestUtilization.setMinCpuUsed(previous.getMinCpuUsed());
        newRequestUtilization.setMinCpusTimestamp(previous.getMinCpusTimestamp());
      }
      if (isWithinLastDay(previous.getMaxDiskTimestamp(), nowSeconds) && previous.getMaxDiskBytesUsed() > newRequestUtilization.getMaxDiskBytesUsed()) {
        newRequestUtilization.setMaxDiskBytesUsed(previous.getMaxDiskBytesUsed());
        newRequestUtilization.setMaxDiskTimestamp(previous.getMaxDiskTimestamp());
      }
      if (isWithinLastDay(previous.getMinDiskTimestamp(), nowSeconds) && previous.getMinDiskBytesUsed() < newRequestUtilization.getMinDiskBytesUsed()) {
        newRequestUtilization.setMinDiskBytesUsed(previous.getMinDiskBytesUsed());
        newRequestUtilization.setMinDiskTimestamp(previous.getMinDiskTimestamp());
      }
      if (isWithinLastDay(previous.getMaxCpuThrottledTimestamp(), nowSeconds) && previous.getMaxPercentCpuTimeThrottled() > newRequestUtilization.getMaxPercentCpuTimeThrottled()) {
        newRequestUtilization.setMaxPercentCpuTimeThrottled(previous.getMaxPercentCpuTimeThrottled());
        newRequestUtilization.setMaxCpuThrottledTimestamp(previous.getMaxCpuThrottledTimestamp());
      }
      if (isWithinLastDay(previous.getMinCpuThrottledTimestamp(), nowSeconds) && previous.getMinPercentCpuTimeThrottled() < newRequestUtilization.getMinPercentCpuTimeThrottled()) {
        newRequestUtilization.setMinPercentCpuTimeThrottled(previous.getMinPercentCpuTimeThrottled());
        newRequestUtilization.setMinCpuThrottledTimestamp(previous.getMinCpuThrottledTimestamp());
      }
    }

    List<SingularityTaskUsage> pastTaskUsagesCopy = getFullListOfTaskUsages(pastTaskUsages, latestUsage, task);
    pastTaskUsagesCopy.sort(Comparator.comparingDouble(SingularityTaskUsage::getTimestamp));
    int numTasks = pastTaskUsagesCopy.size() - 1; // One usage is a fake 0 usage to calculate first cpu times

    int numCpuOverages = 0;

    for (int i = 0; i < numTasks; i++) {
      SingularityTaskUsage olderUsage = pastTaskUsagesCopy.get(i);
      SingularityTaskUsage newerUsage = pastTaskUsagesCopy.get(i + 1);
      double cpusUsed = (newerUsage.getCpuSeconds() - olderUsage.getCpuSeconds()) / (newerUsage.getTimestamp() - olderUsage.getTimestamp());
      double percentCpuTimeThrottled = (newerUsage.getCpusThrottledTimeSecs() - olderUsage.getCpusThrottledTimeSecs()) / (newerUsage.getTimestamp() - olderUsage.getTimestamp());

      if (cpusUsed > newRequestUtilization.getMaxCpuUsed()) {
        newRequestUtilization.setMaxCpuUsed(cpusUsed);
        newRequestUtilization.setMaxCpusTimestamp(newerUsage.getTimestamp());
      }
      if (cpusUsed < newRequestUtilization.getMinCpuUsed()) {
        newRequestUtilization.setMinCpuUsed(cpusUsed);
        newRequestUtilization.setMinCpusTimestamp(newerUsage.getTimestamp());
      }
      if (newerUsage.getMemoryTotalBytes() > newRequestUtilization.getMaxMemBytesUsed()) {
        newRequestUtilization.setMaxMemBytesUsed(newerUsage.getMemoryTotalBytes());
        newRequestUtilization.setMaxMemTimestamp(newerUsage.getTimestamp());
      }
      if (newerUsage.getMemoryTotalBytes() < newRequestUtilization.getMinMemBytesUsed()) {
        newRequestUtilization.setMinMemBytesUsed(newerUsage.getMemoryTotalBytes());
        newRequestUtilization.setMinMemTimestamp(newerUsage.getTimestamp());
      }
      if (newerUsage.getDiskTotalBytes() > newRequestUtilization.getMaxDiskBytesUsed()) {
        newRequestUtilization.setMaxDiskBytesUsed(newerUsage.getDiskTotalBytes());
        newRequestUtilization.setMaxDiskTimestamp(newerUsage.getTimestamp());
      }
      if (newerUsage.getDiskTotalBytes() < newRequestUtilization.getMinDiskBytesUsed()) {
        newRequestUtilization.setMinDiskBytesUsed(newerUsage.getDiskTotalBytes());
        // The min timestamp belongs with the min value; this used to overwrite the max timestamp.
        newRequestUtilization.setMinDiskTimestamp(newerUsage.getTimestamp());
      }
      if (percentCpuTimeThrottled > newRequestUtilization.getMaxPercentCpuTimeThrottled()) {
        newRequestUtilization.setMaxPercentCpuTimeThrottled(percentCpuTimeThrottled);
        newRequestUtilization.setMaxCpuThrottledTimestamp(newerUsage.getTimestamp());
      }
      if (percentCpuTimeThrottled < newRequestUtilization.getMinPercentCpuTimeThrottled()) {
        newRequestUtilization.setMinPercentCpuTimeThrottled(percentCpuTimeThrottled);
        newRequestUtilization.setMinCpuThrottledTimestamp(newerUsage.getTimestamp());
      }

      if (cpusUsed > cpuReservedForTask) {
        numCpuOverages++;
      }

      newRequestUtilization
          .addCpuUsed(cpusUsed)
          .addMemBytesUsed(newerUsage.getMemoryTotalBytes())
          .addPercentCpuTimeThrottled(percentCpuTimeThrottled)
          .addDiskBytesUsed(newerUsage.getDiskTotalBytes())
          .incrementTaskCount();
    }

    // Share of samples in which the task used more cpu than it reserved.
    double cpuBurstRating = pastTaskUsagesCopy.size() > 0 ? numCpuOverages / (double) pastTaskUsagesCopy.size() : 1;

    // Cast after multiplying (as done for memory) so fractional megabytes are not truncated first.
    newRequestUtilization
        .addMemBytesReserved((long) (memoryMbReservedForTask * SingularitySlaveUsage.BYTES_PER_MEGABYTE * numTasks))
        .addCpuReserved(cpuReservedForTask * numTasks)
        .addDiskBytesReserved((long) (diskMbReservedForTask * SingularitySlaveUsage.BYTES_PER_MEGABYTE * numTasks))
        .setCpuBurstRating(cpuBurstRating);

    utilizationPerRequestId.put(requestId, newRequestUtilization);
  }

  /**
   * Reports whether a usage timestamp lies less than one day before {@code nowSeconds}.
   * Both values are in seconds.
   */
  private static boolean isWithinLastDay(double timestampSeconds, long nowSeconds) {
    return nowSeconds - timestampSeconds < DAY_IN_SECONDS;
  }

  /**
   * Builds a new mutable list holding a synthetic zero-usage sample stamped with the task's start
   * time (in seconds), followed by the stored samples and finally the latest sample.
   *
   * @param pastTaskUsages samples already stored for the task; not modified
   * @param latestUsage sample taken during the current poll
   * @param task id of the task, used for its start time
   * @return the combined list
   */
  private List<SingularityTaskUsage> getFullListOfTaskUsages(List<SingularityTaskUsage> pastTaskUsages, SingularityTaskUsage latestUsage, SingularityTaskId task) {
    List<SingularityTaskUsage> pastTaskUsagesCopy = new ArrayList<>(pastTaskUsages.size() + 2);
    pastTaskUsagesCopy.add(new SingularityTaskUsage(0, TimeUnit.MILLISECONDS.toSeconds(task.getStartedAt()), 0, 0, 0 , 0, 0)); // to calculate oldest cpu usage
    pastTaskUsagesCopy.addAll(pastTaskUsages);
    pastTaskUsagesCopy.add(latestUsage);

    return pastTaskUsagesCopy;
  }

  /**
   * Summarizes the per-request utilization into cluster-wide statistics: how many requests under-
   * or over-use their reserved cpu, memory and disk, together with the total, average, max and
   * min of those differences.
   *
   * <p>Requests whose deploy, or whose deploy resources, cannot be found are skipped. Minimums
   * that were never updated are reported as 0.
   *
   * @param utilizationPerRequestId utilization collected during the current poll
   * @param totalMemBytesUsed memory bytes used across the cluster
   * @param totalMemBytesAvailable memory bytes available across the cluster
   * @param totalCpuUsed cpus used across the cluster
   * @param totalCpuAvailable cpus available across the cluster
   * @param totalDiskBytesUsed disk bytes used across the cluster
   * @param totalDiskBytesAvailable disk bytes available across the cluster
   * @param now timestamp of the current poll
   * @return the aggregated cluster utilization
   */
  private SingularityClusterUtilization getClusterUtilization(Map<String, RequestUtilization> utilizationPerRequestId,
                                                              long totalMemBytesUsed,
                                                              long totalMemBytesAvailable,
                                                              double totalCpuUsed,
                                                              double totalCpuAvailable,
                                                              long totalDiskBytesUsed,
                                                              long totalDiskBytesAvailable,
                                                              long now) {
    int numRequestsWithUnderUtilizedCpu = 0;
    int numRequestsWithOverUtilizedCpu = 0;
    int numRequestsWithUnderUtilizedMemBytes = 0;
    int numRequestsWithUnderUtilizedDiskBytes = 0;

    double totalUnderUtilizedCpu = 0;
    double totalOverUtilizedCpu = 0;
    long totalUnderUtilizedMemBytes = 0;
    long totalUnderUtilizedDiskBytes = 0;

    double maxUnderUtilizedCpu = 0;
    double maxOverUtilizedCpu = 0;
    long maxUnderUtilizedMemBytes = 0;
    long maxUnderUtilizedDiskBytes = 0;

    String maxUnderUtilizedCpuRequestId = null;
    String maxOverUtilizedCpuRequestId = null;
    String maxUnderUtilizedMemBytesRequestId = null;
    String maxUnderUtilizedDiskBytesRequestId = null;

    // MAX_VALUE marks "nothing seen yet"; getMin turns it into 0 when building the result.
    double minUnderUtilizedCpu = Double.MAX_VALUE;
    double minOverUtilizedCpu = Double.MAX_VALUE;
    long minUnderUtilizedMemBytes = Long.MAX_VALUE;
    long minUnderUtilizedDiskBytes = Long.MAX_VALUE;

    for (RequestUtilization utilization : utilizationPerRequestId.values()) {
      Optional<SingularityDeploy> maybeDeploy = deployManager.getDeploy(utilization.getRequestId(), utilization.getDeployId());

      if (maybeDeploy.isPresent() && maybeDeploy.get().getResources().isPresent()) {
        String requestId = utilization.getRequestId();
        Resources deployResources = maybeDeploy.get().getResources().get();
        long memoryBytesReserved = (long) (deployResources.getMemoryMb() * SingularitySlaveUsage.BYTES_PER_MEGABYTE);
        double cpuReserved = deployResources.getCpus();
        // Cast after multiplying (as done for memory) so fractional megabytes are not truncated first.
        long diskBytesReserved = (long) (deployResources.getDiskMb() * SingularitySlaveUsage.BYTES_PER_MEGABYTE);

        double unusedCpu = cpuReserved - utilization.getAvgCpuUsed();
        long unusedMemBytes = (long) (memoryBytesReserved - utilization.getAvgMemBytesUsed());
        long unusedDiskBytes = (long) (diskBytesReserved - utilization.getAvgDiskBytesUsed());

        if (unusedCpu > 0) {
          numRequestsWithUnderUtilizedCpu++;
          totalUnderUtilizedCpu += unusedCpu;
          if (unusedCpu > maxUnderUtilizedCpu) {
            maxUnderUtilizedCpu = unusedCpu;
            maxUnderUtilizedCpuRequestId = requestId;
          }
          minUnderUtilizedCpu = Math.min(unusedCpu, minUnderUtilizedCpu);
        } else if (unusedCpu < 0) {
          double overusedCpu = Math.abs(unusedCpu);

          numRequestsWithOverUtilizedCpu++;
          totalOverUtilizedCpu += overusedCpu;
          if (overusedCpu > maxOverUtilizedCpu) {
            maxOverUtilizedCpu = overusedCpu;
            maxOverUtilizedCpuRequestId = requestId;
          }
          minOverUtilizedCpu = Math.min(overusedCpu, minOverUtilizedCpu);
        }

        if (unusedMemBytes > 0) {
          numRequestsWithUnderUtilizedMemBytes++;
          totalUnderUtilizedMemBytes += unusedMemBytes;
          if (unusedMemBytes > maxUnderUtilizedMemBytes) {
            maxUnderUtilizedMemBytes = unusedMemBytes;
            maxUnderUtilizedMemBytesRequestId = requestId;
          }
          minUnderUtilizedMemBytes = Math.min(unusedMemBytes, minUnderUtilizedMemBytes);
        }

        if (unusedDiskBytes > 0) {
          numRequestsWithUnderUtilizedDiskBytes++;
          totalUnderUtilizedDiskBytes += unusedDiskBytes;
          if (unusedDiskBytes > maxUnderUtilizedDiskBytes) {
            maxUnderUtilizedDiskBytes = unusedDiskBytes;
            maxUnderUtilizedDiskBytesRequestId = requestId;
          }
          // Compare against the running disk minimum; this used to read the memory minimum.
          minUnderUtilizedDiskBytes = Math.min(unusedDiskBytes, minUnderUtilizedDiskBytes);
        }
      }
    }

    double avgUnderUtilizedCpu = numRequestsWithUnderUtilizedCpu != 0 ? totalUnderUtilizedCpu / numRequestsWithUnderUtilizedCpu : 0;
    double avgOverUtilizedCpu = numRequestsWithOverUtilizedCpu != 0 ? totalOverUtilizedCpu / numRequestsWithOverUtilizedCpu : 0;
    long avgUnderUtilizedMemBytes = numRequestsWithUnderUtilizedMemBytes != 0 ? totalUnderUtilizedMemBytes / numRequestsWithUnderUtilizedMemBytes : 0;
    long avgUnderUtilizedDiskBytes = numRequestsWithUnderUtilizedDiskBytes != 0 ? totalUnderUtilizedDiskBytes / numRequestsWithUnderUtilizedDiskBytes : 0;

    return new SingularityClusterUtilization(numRequestsWithUnderUtilizedCpu, numRequestsWithOverUtilizedCpu,
        numRequestsWithUnderUtilizedMemBytes, numRequestsWithUnderUtilizedDiskBytes, totalUnderUtilizedCpu, totalOverUtilizedCpu, totalUnderUtilizedMemBytes, totalUnderUtilizedDiskBytes, avgUnderUtilizedCpu, avgOverUtilizedCpu,
        avgUnderUtilizedMemBytes, avgUnderUtilizedDiskBytes, maxUnderUtilizedCpu, maxOverUtilizedCpu, maxUnderUtilizedMemBytes, maxUnderUtilizedDiskBytes, maxUnderUtilizedCpuRequestId, maxOverUtilizedCpuRequestId,
        maxUnderUtilizedMemBytesRequestId, maxUnderUtilizedDiskBytesRequestId, getMin(minUnderUtilizedCpu), getMin(minOverUtilizedCpu), getMin(minUnderUtilizedMemBytes), getMin(minUnderUtilizedDiskBytes), totalMemBytesUsed,
        totalMemBytesAvailable, totalDiskBytesUsed, totalDiskBytesAvailable, totalCpuUsed, totalCpuAvailable, now);
  }

  /**
   * Returns 0 when {@code value} equals {@code Double.MAX_VALUE}; any other value is returned
   * unchanged.
   */
  private double getMin(double value) {
    if (value == Double.MAX_VALUE) {
      return 0;
    }
    return value;
  }

  /**
   * Returns 0 when {@code value} equals {@code Long.MAX_VALUE}; any other value is returned
   * unchanged.
   */
  private long getMin(long value) {
    if (value == Long.MAX_VALUE) {
      return 0;
    }
    return value;
  }

  /**
   * Deletes stored usage entries of a task beyond the first {@code getNumUsageToKeep()} paths
   * returned by the usage manager, logging a warning for each entry that did not exist.
   *
   * <p>NOTE(review): which entries count as "old" depends on the order in which
   * {@code getTaskUsagePaths} returns them - presumably newest first; confirm against UsageManager.
   *
   * @param taskId id of the task whose stored usage is trimmed
   */
  @VisibleForTesting
  void clearOldUsage(String taskId) {
    final long usagesToKeep = configuration.getNumUsageToKeep();

    usageManager.getTaskUsagePaths(taskId)
        .stream()
        .map(Double::parseDouble)
        .skip(usagesToKeep)
        .forEach((usagePath) -> deleteTaskUsageEntry(taskId, usagePath));
  }

  /** Deletes a single stored usage entry and warns when there was nothing to delete. */
  private void deleteTaskUsageEntry(String taskId, Double usagePath) {
    SingularityDeleteResult outcome = usageManager.deleteSpecificTaskUsage(taskId, usagePath);
    if (outcome.equals(SingularityDeleteResult.DIDNT_EXIST)) {
      LOG.warn("Didn't delete taskUsage {} for taskId {}", usagePath.toString(), taskId);
    }
  }

  /**
   * Immutable holder pairing a task id with the resources requested for the task and its current
   * usage.
   */
  private static class TaskIdWithUsage {
    private final SingularityTaskId taskId;
    private final Resources requestedResources;
    private final SingularityTaskCurrentUsage usage;

    TaskIdWithUsage(SingularityTaskId id, Resources requested, SingularityTaskCurrentUsage currentUsage) {
      this.usage = currentUsage;
      this.requestedResources = requested;
      this.taskId = id;
    }

    /** @return the current usage recorded for the task */
    public SingularityTaskCurrentUsage getUsage() {
      return this.usage;
    }

    /** @return the resources requested for the task */
    public Resources getRequestedResources() {
      return this.requestedResources;
    }

    /** @return the id of the task */
    public SingularityTaskId getTaskId() {
      return this.taskId;
    }
  }
}




// © 2015 - 2025 Weber Informatics LLC | Privacy Policy