All downloads are FREE. The search and download functions use the official Maven repository.
Please wait. This can take a few minutes ...
Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance.
Project price: only $1
You can buy this project and download/modify it as often as you want.
com.hubspot.singularity.scheduler.SingularityDisasterDetectionPoller Maven / Gradle / Ivy
package com.hubspot.singularity.scheduler;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.mesos.v1.Protos.TaskStatus.Reason;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.base.Optional;
import com.google.common.collect.Multiset;
import com.google.inject.Inject;
import com.google.inject.name.Named;
import com.hubspot.singularity.MachineState;
import com.hubspot.singularity.SingularityDisabledAction;
import com.hubspot.singularity.SingularityDisasterDataPoint;
import com.hubspot.singularity.SingularityDisasterDataPoints;
import com.hubspot.singularity.SingularityDisasterType;
import com.hubspot.singularity.SingularityDisastersData;
import com.hubspot.singularity.SingularityPendingTaskId;
import com.hubspot.singularity.SingularitySlave;
import com.hubspot.singularity.config.DisasterDetectionConfiguration;
import com.hubspot.singularity.config.SingularityConfiguration;
import com.hubspot.singularity.data.DisasterManager;
import com.hubspot.singularity.data.SlaveManager;
import com.hubspot.singularity.data.TaskManager;
import com.hubspot.singularity.mesos.SingularityMesosModule;
import com.hubspot.singularity.smtp.SingularityMailer;
/**
 * A {@link SingularityLeaderOnlyPoller} that periodically samples task and slave statistics,
 * compares the recent history of those samples against the thresholds in
 * {@link DisasterDetectionConfiguration}, and reports the resulting set of active disasters to
 * the {@link DisasterManager}.
 *
 * <p>On every poll a new {@link SingularityDisasterDataPoint} is inserted at the front of the
 * stored history, so the history is ordered newest-first. All threshold checks in this class
 * rely on that ordering.
 */
public class SingularityDisasterDetectionPoller extends SingularityLeaderOnlyPoller {
  private static final Logger LOG = LoggerFactory.getLogger(SingularityDisasterDetectionPoller.class);

  private final SingularityConfiguration configuration;
  private final DisasterDetectionConfiguration disasterConfiguration;
  private final TaskManager taskManager;
  private final SlaveManager slaveManager;
  private final DisasterManager disasterManager;
  private final SingularityMailer mailer;
  // Shared counters filled elsewhere; this poller reads them and resets them on every poll.
  private final Multiset<Reason> taskLostReasons;
  private final AtomicInteger activeSlavesLost;

  /**
   * Creates the poller. The poll interval is taken from
   * {@code configuration.getDisasterDetection().getRunEveryMillis()} (milliseconds).
   *
   * @param configuration    top-level configuration; also the source of the disaster detection settings
   * @param taskManager      used to read active and pending task counts
   * @param slaveManager     used to read the known slaves and their current state
   * @param disasterManager  used to read and persist disaster statistics, active disasters and disabled actions
   * @param mailer           used to send the disaster notification mail
   * @param taskLostReasons  shared multiset of lost-task reasons; counted and cleared on each poll
   * @param activeSlavesLost shared counter of lost slaves; read and reset to zero on each poll
   */
  @Inject
  public SingularityDisasterDetectionPoller(SingularityConfiguration configuration, TaskManager taskManager, SlaveManager slaveManager, DisasterManager disasterManager, SingularityMailer mailer,
                                            @Named(SingularityMesosModule.TASK_LOST_REASONS_COUNTER) Multiset<Reason> taskLostReasons, @Named(SingularityMesosModule.ACTIVE_SLAVES_LOST_COUNTER) AtomicInteger activeSlavesLost) {
    super(configuration.getDisasterDetection().getRunEveryMillis(), TimeUnit.MILLISECONDS);
    this.configuration = configuration;
    this.disasterConfiguration = configuration.getDisasterDetection();
    this.taskManager = taskManager;
    this.slaveManager = slaveManager;
    this.disasterManager = disasterManager;
    this.mailer = mailer;
    this.taskLostReasons = taskLostReasons;
    this.activeSlavesLost = activeSlavesLost;
  }

  /** Enabled exactly when disaster detection is enabled in the configuration. */
  @Override
  protected boolean isEnabled() {
    return disasterConfiguration.isEnabled();
  }

  /** Always {@code false}. */
  @Override
  protected boolean abortsOnError() {
    return false;
  }

  /**
   * Runs one detection cycle: re-enables expired disabled actions, records a new data point,
   * evaluates the history, persists the results and, when new disasters appear, applies the
   * automated disabled actions and queues a notification mail.
   */
  @Override
  public void runActionOnPoll() {
    LOG.trace("Starting disaster detection");
    clearExpiredDisabledActions();

    List<SingularityDisasterType> previouslyActiveDisasters = disasterManager.getActiveDisasters();
    List<SingularityDisasterDataPoint> dataPoints = disasterManager.getDisasterStats().getDataPoints();
    SingularityDisasterDataPoint newStats = collectDisasterStats();

    // Newest data point goes first; the oldest ones fall off the end.
    dataPoints.add(0, newStats);
    // Trim in a loop rather than once: a single removal per poll would only offset the element
    // just added, so a history that is already longer than the configured size would never shrink.
    while (!dataPoints.isEmpty() && dataPoints.size() > disasterConfiguration.getStatsHistorySize()) {
      dataPoints.remove(dataPoints.size() - 1);
    }

    LOG.debug("Collected new disaster detection dataPoints: {}", newStats);

    List<SingularityDisasterType> newActiveDisasters = checkDataPoints(dataPoints);
    if (!newActiveDisasters.isEmpty()) {
      LOG.warn("Detected new active disasters: {}", newActiveDisasters);
    }

    disasterManager.updateActiveDisasters(previouslyActiveDisasters, newActiveDisasters);
    disasterManager.saveDisasterStats(new SingularityDisasterDataPoints(dataPoints));

    if (!newActiveDisasters.isEmpty()) {
      if (!disasterManager.isAutomatedDisabledActionsDisabled()) {
        disasterManager.addDisabledActionsForDisasters(newActiveDisasters);
      }
      // Only mail when at least one disaster was not already active on the previous poll.
      if (!previouslyActiveDisasters.containsAll(newActiveDisasters)) {
        queueDisasterEmail(dataPoints, newActiveDisasters);
      }
    } else {
      disasterManager.clearSystemGeneratedDisabledActions();
    }
  }

  /** Re-enables every disabled action whose expiry timestamp is present and already in the past. */
  private void clearExpiredDisabledActions() {
    long now = System.currentTimeMillis();
    for (SingularityDisabledAction disabledAction : disasterManager.getDisabledActions()) {
      if (disabledAction.getExpiresAt().isPresent() && now > disabledAction.getExpiresAt().get()) {
        disasterManager.enable(disabledAction.getType());
      }
    }
  }

  /**
   * Builds a data point from the current task and slave state.
   *
   * <p>Side effects: resets {@code activeSlavesLost} to zero and clears {@code taskLostReasons}.
   *
   * @return the statistics sampled at the current time
   */
  private SingularityDisasterDataPoint collectDisasterStats() {
    long now = System.currentTimeMillis();

    int numActiveTasks = taskManager.getNumActiveTasks();
    List<SingularityPendingTaskId> pendingTasks = taskManager.getPendingTaskIds();
    int numPendingTasks = pendingTasks.size();

    int numLateTasks = 0;
    long totalTaskLagMillis = 0;
    int numPastDueTasks = 0;
    for (SingularityPendingTaskId pendingTask : pendingTasks) {
      long taskLagMillis = now - pendingTask.getNextRunAt();
      if (taskLagMillis > 0) {
        numPastDueTasks++;
        totalTaskLagMillis += taskLagMillis;
        if (taskLagMillis > configuration.getDeltaAfterWhichTasksAreLateMillis()) {
          numLateTasks++;
        }
      }
    }
    // The average is taken over past-due tasks only; max(.., 1) avoids dividing by zero.
    long avgTaskLagMillis = totalTaskLagMillis / Math.max(numPastDueTasks, 1);

    List<SingularitySlave> slaves = slaveManager.getObjects();
    int numRunningSlaves = 0;
    for (SingularitySlave slave : slaves) {
      MachineState state = slave.getCurrentState().getState();
      if (state != MachineState.DEAD && state != MachineState.MISSING_ON_STARTUP) {
        numRunningSlaves++;
      }
    }

    int numLostSlaves = activeSlavesLost.getAndSet(0);

    // Only the configured reasons count towards lost tasks; the whole multiset is cleared afterwards.
    int numLostTasks = 0;
    for (Reason lostTaskReason : disasterConfiguration.getLostTaskReasons()) {
      numLostTasks += taskLostReasons.count(lostTaskReason);
    }
    taskLostReasons.clear();

    return new SingularityDisasterDataPoint(now, numActiveTasks, numPendingTasks, numLateTasks, avgTaskLagMillis, numLostTasks, numRunningSlaves, numLostSlaves);
  }

  /**
   * Evaluates each enabled check against the history.
   *
   * @param dataPoints history of data points, newest first
   * @return the disaster types whose check is enabled and currently triggered; empty when
   *         {@code dataPoints} is empty
   */
  private List<SingularityDisasterType> checkDataPoints(List<SingularityDisasterDataPoint> dataPoints) {
    List<SingularityDisasterType> activeDisasters = new ArrayList<>();

    if (!dataPoints.isEmpty()) {
      long now = System.currentTimeMillis();

      if (disasterConfiguration.isCheckLateTasks() && tooMuchTaskLag(now, dataPoints)) {
        activeDisasters.add(SingularityDisasterType.EXCESSIVE_TASK_LAG);
      }
      if (disasterConfiguration.isCheckLostSlaves() && tooManyLostSlaves(now, dataPoints)) {
        activeDisasters.add(SingularityDisasterType.LOST_SLAVES);
      }
      if (disasterConfiguration.isCheckLostTasks() && tooManyLostTasks(now, dataPoints)) {
        activeDisasters.add(SingularityDisasterType.LOST_TASKS);
      }
    }

    return activeDisasters;
  }

  /**
   * Decides whether task lag has been over threshold for longer than
   * {@code getTriggerAfterMillisOverTaskLagThreshold()}.
   *
   * <p>The history is walked from the newest data point towards older ones. For each of the four
   * conditions (critical/warning average lag, critical/warning overdue portion) the timestamp of
   * the oldest visited data point satisfying it is remembered. The walk stops at the first data
   * point that satisfies none of the four conditions.
   *
   * <p>NOTE(review): the walk continues while <em>any</em> condition holds, so a remembered
   * timestamp can reach back across data points where that particular condition did not hold.
   *
   * @param now        current time in milliseconds
   * @param dataPoints history of data points, newest first
   * @return {@code true} if either critical condition, or both warning conditions, have been
   *         triggered for longer than the configured duration
   */
  private boolean tooMuchTaskLag(long now, List<SingularityDisasterDataPoint> dataPoints) {
    Optional<Long> criticalAvgLagTriggeredSince = Optional.absent();
    Optional<Long> warningAvgLagTriggeredSince = Optional.absent();
    Optional<Long> criticalPortionTriggeredSince = Optional.absent();
    Optional<Long> warningPortionTriggeredSince = Optional.absent();

    for (SingularityDisasterDataPoint dataPoint : dataPoints) {
      // Late tasks as a fraction of all active + pending tasks; max(.., 1) avoids dividing by zero.
      double overdueTaskPortion = dataPoint.getNumLateTasks() / (double) Math.max((dataPoint.getNumActiveTasks() + dataPoint.getNumPendingTasks()), 1);
      boolean criticalOverdueTasksPortion = overdueTaskPortion > disasterConfiguration.getCriticalOverdueTaskPortion();
      boolean warningOverdueTasksPortion = overdueTaskPortion > disasterConfiguration.getWarningOverdueTaskPortion();
      // Critical average lag only counts when the overdue portion is at least at warning level.
      boolean criticalAvgTaskLag = dataPoint.getAvgTaskLagMillis() > disasterConfiguration.getCriticalAvgTaskLagMillis() && warningOverdueTasksPortion;
      boolean warningAvgTaskLag = dataPoint.getAvgTaskLagMillis() > disasterConfiguration.getWarningAvgTaskLagMillis();

      if (criticalOverdueTasksPortion) {
        criticalPortionTriggeredSince = Optional.of(dataPoint.getTimestamp());
      }
      if (warningOverdueTasksPortion) {
        warningPortionTriggeredSince = Optional.of(dataPoint.getTimestamp());
      }
      if (criticalAvgTaskLag) {
        criticalAvgLagTriggeredSince = Optional.of(dataPoint.getTimestamp());
      }
      if (warningAvgTaskLag) {
        warningAvgLagTriggeredSince = Optional.of(dataPoint.getTimestamp());
      }

      if (!criticalOverdueTasksPortion && !warningOverdueTasksPortion && !criticalAvgTaskLag && !warningAvgTaskLag) {
        break;
      }
    }

    long triggerAfterMillis = disasterConfiguration.getTriggerAfterMillisOverTaskLagThreshold();
    boolean criticalAvgLagTooLong = criticalAvgLagTriggeredSince.isPresent() && now - criticalAvgLagTriggeredSince.get() > triggerAfterMillis;
    boolean criticalPortionTooLong = criticalPortionTriggeredSince.isPresent() && now - criticalPortionTriggeredSince.get() > triggerAfterMillis;
    boolean warningAvgLagTooLong = warningAvgLagTriggeredSince.isPresent() && now - warningAvgLagTriggeredSince.get() > triggerAfterMillis;
    boolean warningPortionTooLong = warningPortionTriggeredSince.isPresent() && now - warningPortionTriggeredSince.get() > triggerAfterMillis;

    // 'true' if either critical condition is met
    if (criticalAvgLagTooLong || criticalPortionTooLong) {
      return true;
    }

    // 'true' if both warning conditions are met
    return warningAvgLagTooLong && warningPortionTooLong;
  }

  /**
   * Sums the slaves lost in data points newer than {@code getIncludeLostSlavesInLastMillis()} and
   * compares that total, as a fraction of (active + lost slaves) in the newest data point,
   * against {@code getCriticalLostSlavePortion()}.
   *
   * @param now        current time in milliseconds
   * @param dataPoints non-empty history of data points, newest first
   * @return {@code true} if the lost-slave fraction exceeds the critical portion
   */
  private boolean tooManyLostSlaves(long now, List<SingularityDisasterDataPoint> dataPoints) {
    int totalLostSlaves = 0;
    for (SingularityDisasterDataPoint dataPoint : dataPoints) {
      if (now - dataPoint.getTimestamp() < disasterConfiguration.getIncludeLostSlavesInLastMillis()) {
        totalLostSlaves += dataPoint.getNumLostSlaves();
      }
    }

    SingularityDisasterDataPoint newest = dataPoints.get(0);
    double lostSlavesPortion = totalLostSlaves / (double) (Math.max(newest.getNumActiveSlaves() + newest.getNumLostSlaves(), 1));
    return lostSlavesPortion > disasterConfiguration.getCriticalLostSlavePortion();
  }

  /**
   * Sums the tasks lost in data points newer than {@code getIncludeLostTasksInLastMillis()} and
   * compares that total, as a fraction of the active tasks in the newest data point, against
   * {@code getCriticalLostTaskPortion()}.
   *
   * @param now        current time in milliseconds
   * @param dataPoints non-empty history of data points, newest first
   * @return {@code true} if the lost-task fraction exceeds the critical portion
   */
  private boolean tooManyLostTasks(long now, List<SingularityDisasterDataPoint> dataPoints) {
    int totalLostTasks = 0;
    for (SingularityDisasterDataPoint dataPoint : dataPoints) {
      if (now - dataPoint.getTimestamp() < disasterConfiguration.getIncludeLostTasksInLastMillis()) {
        totalLostTasks += dataPoint.getNumLostTasks();
      }
    }

    double lostTasksPortion = totalLostTasks / (double) Math.max(dataPoints.get(0).getNumActiveTasks(), 1);
    return lostTasksPortion > disasterConfiguration.getCriticalLostTaskPortion();
  }

  /**
   * Sends the disaster mail containing the data point history, the state of the given disasters
   * and whether automated disabled actions are currently turned off.
   *
   * @param dataPoints history of data points to include in the mail
   * @param disasters  the currently active disasters
   */
  private void queueDisasterEmail(List<SingularityDisasterDataPoint> dataPoints, List<SingularityDisasterType> disasters) {
    SingularityDisastersData data = new SingularityDisastersData(
        dataPoints,
        disasterManager.getAllDisasterStates(disasters),
        disasterManager.isAutomatedDisabledActionsDisabled()
    );

    mailer.sendDisasterMail(data);
  }
}