// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.provision.maintenance;
import ai.vespa.metrics.ConfigServerMetrics;
import com.yahoo.concurrent.UncheckedTimeoutException;
import com.yahoo.config.provision.Deployer;
import com.yahoo.config.provision.Deployment;
import com.yahoo.config.provision.NodeType;
import com.yahoo.config.provision.TransientException;
import com.yahoo.jdisc.Metric;
import com.yahoo.slime.SlimeUtils;
import com.yahoo.transaction.Mutex;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeList;
import com.yahoo.vespa.hosted.provision.NodeMutex;
import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.node.Agent;
import com.yahoo.vespa.hosted.provision.node.Allocation;
import com.yahoo.vespa.hosted.provision.node.History;
import com.yahoo.vespa.orchestrator.ApplicationIdNotFoundException;
import com.yahoo.vespa.orchestrator.status.ApplicationInstanceStatus;
import com.yahoo.yolean.Exceptions;
import java.time.Duration;
import java.time.Instant;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.stream.Collectors;
import java.util.stream.Stream;
/**
* Maintains information in the node repo about when each node last responded to ping,
* and fails nodes which have not responded within the given time limit.
*
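* <p>Construction sketch (the argument values below are illustrative only, assuming a {@code Deployer},
* {@code NodeRepository} and {@code Metric} instance are already available):</p>
* <pre>{@code
* NodeFailer failer = new NodeFailer(deployer, nodeRepository,
*                                    Duration.ofMinutes(60),           // down time limit
*                                    Duration.ofMinutes(15),           // maintenance interval
*                                    NodeFailer.ThrottlePolicy.hosted,
*                                    metric);
* }</pre>
*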
* @author bratseth
* @author mpolden
*/
public class NodeFailer extends NodeRepositoryMaintainer {
private static final Logger log = Logger.getLogger(NodeFailer.class.getName());
/** Metric for number of hosts that we want to fail, but cannot due to throttling */
static final String throttledHostFailuresMetric = ConfigServerMetrics.THROTTLED_HOST_FAILURES.baseName();
/** Metric for number of nodes that we want to fail, but cannot due to throttling */
static final String throttledNodeFailuresMetric = ConfigServerMetrics.THROTTLED_NODE_FAILURES.baseName();
/** Metric that indicates whether throttling is active, where 1 means active and 0 means inactive */
static final String throttlingActiveMetric = "nodeFailThrottling";
private final Deployer deployer;
private final Duration downTimeLimit;
private final Duration suspendedDownTimeLimit;
private final ThrottlePolicy throttlePolicy;
private final Metric metric;
public NodeFailer(Deployer deployer, NodeRepository nodeRepository,
Duration downTimeLimit, Duration interval, ThrottlePolicy throttlePolicy, Metric metric) {
// check ping status every interval, but at least twice as often as the down time limit
super(nodeRepository, min(downTimeLimit.dividedBy(2), interval), metric);
this.deployer = deployer;
this.downTimeLimit = downTimeLimit;
this.suspendedDownTimeLimit = downTimeLimit.multipliedBy(4); // Allow more downtime when a node is suspended
this.throttlePolicy = throttlePolicy;
this.metric = metric;
}
@Override
protected double maintain() {
if ( ! nodeRepository().nodes().isWorking()) return 0.0;
int attempts = 0;
int failures = 0;
int throttledHostFailures = 0;
int throttledNodeFailures = 0;
for (FailingNode failing : findActiveFailingNodes()) {
attempts++;
if (!failAllowedFor(failing.node().type())) continue;
if (throttle(failing.node())) {
failures++;
if (failing.node().type().isHost())
throttledHostFailures++;
else
throttledNodeFailures++;
continue;
}
failActive(failing);
}
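// Report throttling as a 0/1 gauge: 1 if at least one failure was held back by throttling in this run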
int throttlingActive = Math.min(1, throttledHostFailures + throttledNodeFailures);
metric.set(throttlingActiveMetric, throttlingActive, null);
metric.set(throttledHostFailuresMetric, throttledHostFailures, null);
metric.set(throttledNodeFailuresMetric, throttledNodeFailures, null);
return asSuccessFactorDeviation(attempts, failures);
}
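/** Returns the active nodes that currently look like they should be failed, each with a human-readable reason */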
private Collection<FailingNode> findActiveFailingNodes() {
Set<FailingNode> failingNodes = new HashSet<>();
NodeList activeNodes = nodeRepository().nodes().list(Node.State.active);
for (Node host : activeNodes.hosts().failing())
failingNodes.add(new FailingNode(host, "Host should be failed and have no tenant nodes"));
for (Node node : activeNodes) {
downSince(node).ifPresent(instant -> failingNodes.add(new FailingNode(node, "Node has been down since " + instant)));
}
for (Node node : activeNodes) {
if (allSuspended(node, activeNodes)) {
Node host = node.parentHostname().flatMap(activeNodes::node).orElse(node);
if (host.type().isHost()) {
List<String> failureReports = reasonsToFailHost(host);
if ( ! failureReports.isEmpty()) {
failingNodes.add(new FailingNode(node, host.equals(node) ?
"Host has failure reports: " + failureReports :
"Parent " + host + " has failure reports: " + failureReports));
}
}
}
}
return failingNodes;
}
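/** Returns a description of each report on this host whose type indicates that the host should be failed */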
public static List<String> reasonsToFailHost(Node host) {
return host.reports().getReports().stream()
.filter(report -> report.getType().hostShouldBeFailed())
// The generated string is built from the report's ID, created time, and description only.
.map(report -> report.getReportId() + " reported " + report.getCreatedTime() + ": " + report.getDescription())
.toList();
}
/** Returns whether the node has any kind of hardware issue */
static boolean hasHardwareIssue(Node node, NodeList allNodes) {
Node host = node.parentHostname().flatMap(allNodes::node).orElse(node);
return !reasonsToFailHost(host).isEmpty();
}
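/**
* Returns the instant from which this node has been continuously down, if it has exceeded its grace period
* and failing it is not blocked by application suspension or an ongoing maintenance event.
*/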
private Optional<Instant> downSince(Node node) {
Optional<Instant> downInstant = node.history().downSince();
if (downInstant.isEmpty()) return Optional.empty();
Instant downSince = Stream.of(downInstant,
node.history().resumedSince(),
node.history().event(History.Event.Type.activated).map(History.Event::at))
.filter(Optional::isPresent)
.map(Optional::get)
.max(Comparator.naturalOrder())
.orElseThrow();
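// Suspended nodes are expected to be down, so give them a longer grace period before failing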
Duration graceDuration = node.history().isSuspended() ? suspendedDownTimeLimit : downTimeLimit;
if (clock().instant().isBefore(downSince.plus(graceDuration))) return Optional.empty();
if (applicationSuspended(node)) return Optional.empty();
if (affectedByMaintenance(node)) return Optional.empty();
return Optional.of(downSince);
}
private boolean applicationSuspended(Node node) {
try {
return nodeRepository().orchestrator().getApplicationInstanceStatus(node.allocation().get().owner())
== ApplicationInstanceStatus.ALLOWED_TO_BE_DOWN;
} catch (ApplicationIdNotFoundException e) {
// Treat it as not suspended and allow failing the node anyway
return false;
}
}
/** Is a maintenance event affecting this node? */
private boolean affectedByMaintenance(Node node) {
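// The "vcmr" report, when present, lists upcoming change management requests with planned start/end times in epoch seconds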
return node.reports().getReport("vcmr")
.map(report ->
SlimeUtils.entriesStream(report.getInspector().field("upcoming"))
.anyMatch(cmr -> {
var startTime = cmr.field("plannedStartTime").asLong();
var endTime = cmr.field("plannedEndTime").asLong();
var now = clock().instant().getEpochSecond();
return now > startTime && now < endTime;
})
).orElse(false);
}
/** Returns whether the node and all of its active children are suspended */
private boolean allSuspended(Node node, NodeList activeNodes) {
if (!nodeRepository().nodes().suspended(node)) return false;
if (node.parentHostname().isPresent()) return true; // optimization
return activeNodes.childrenOf(node.hostname()).stream().allMatch(nodeRepository().nodes()::suspended);
}
/**
* We can attempt to fail any number of *tenant* and *host* nodes because the operation will not take effect
* unless the node is replaced.
* We can also attempt to fail a single proxy or proxy host at a time, as there should be enough redundancy to handle that.
* But we refuse to fail out config and controller nodes, and their hosts.
*/
private boolean failAllowedFor(NodeType nodeType) {
return switch (nodeType) {
case tenant, host -> true;
case proxy, proxyhost -> nodeRepository().nodes().list(Node.State.failed).nodeType(nodeType).isEmpty();
default -> false;
};
}
/**
* Called when a node should be moved to the failed state: Do that if it seems safe,
* which is when the node repo has available capacity to replace the node (and all its tenant nodes if host).
* Otherwise, not replacing the node ensures (by Orchestrator check) that no further action will be taken.
*/
private void failActive(FailingNode failing) {
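// Failing an active node only takes effect through a redeployment of its application, so we need a deployment first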
Optional<Deployment> deployment =
deployer.deployFromLocalActive(failing.node().allocation().get().owner(), Duration.ofMinutes(5));
if (deployment.isEmpty()) return;
boolean redeploy = false;
// If the active node that we are trying to fail is of type host, we need to successfully fail all
// the children nodes running on it before we fail the host. Failing a child node in a dynamically
// provisioned zone may require provisioning new hosts that require the host application lock to be held,
// so we must release ours before failing the children.
if (failing.node.type().isHost()) {
List<FailingNode> activeChildrenToFail = new ArrayList<>();
try (var lock = nodeRepository().nodes().lockAndGetRecursively(failing.node.hostname(), Optional.empty())) {
failing = shouldFail(lock.parent().node(), failing);
if (failing == null) return;
String reasonForChildFailure = "Failing due to parent host " + failing.node().hostname() + " failure: " + failing.reason();
for (var failingTenantNode : lock.children()) {
if (failingTenantNode.node().state() == Node.State.active) {
activeChildrenToFail.add(new FailingNode(failingTenantNode.node(), reasonForChildFailure));
} else if (failingTenantNode.node().state() != Node.State.failed) {
nodeRepository().nodes().fail(failingTenantNode.node().hostname(), Agent.NodeFailer, reasonForChildFailure);
}
}
if (activeChildrenToFail.isEmpty()) {
log.log(Level.INFO, "Failing out " + failing.node + ": " + failing.reason);
markWantToFail(failing.node(), true, lock.parent());
redeploy = true;
}
}
// In a dynamically provisioned zone the failing of the first child may require a new host to be provisioned,
// so failActive() may take a long time to complete, but the remaining children should be fast.
activeChildrenToFail.forEach(this::failActive);
}
else {
try (var lock = nodeRepository().nodes().lockAndGetRequired(failing.node)) {
failing = shouldFail(lock.node(), failing);
if (failing == null) return;
log.log(Level.INFO, "Failing out " + failing.node + ": " + failing.reason);
markWantToFail(failing.node(), true, lock);
redeploy = true;
}
}
// Redeploy to replace failing node
if (redeploy) {
redeploy(deployment.get(), failing);
}
}
// Returns an updated FailingNode if we should still fail the node, otherwise null
private static FailingNode shouldFail(Node fresh, FailingNode stale) {
// Now that we have gotten the node object under the proper lock, sanity-check it still makes sense to fail
if (!Objects.equals(stale.node.allocation().map(Allocation::owner), fresh.allocation().map(Allocation::owner)))
return null;
if (fresh.state() == Node.State.failed)
return null;
if (!Objects.equals(stale.node.state(), fresh.state()))
return null;
return new FailingNode(fresh, stale.reason);
}
private void redeploy(Deployment deployment, FailingNode failing) {
try {
deployment.activate();
} catch (TransientException | UncheckedTimeoutException e) {
log.log(Level.INFO, "Failed to redeploy " + failing.node().allocation().get().owner() +
" with a transient error, will be retried by application maintainer: " +
Exceptions.toMessageString(e));
} catch (RuntimeException e) {
// Reset want to fail: We'll retry failing unless it heals in the meantime
Optional<NodeMutex> optionalNodeMutex = nodeRepository().nodes().lockAndGet(failing.node());
if (optionalNodeMutex.isEmpty()) return;
try (var nodeMutex = optionalNodeMutex.get()) {
markWantToFail(nodeMutex.node(), false, nodeMutex);
log.log(Level.WARNING, "Could not fail " + failing.node() + " for " + failing.node().allocation().get().owner() +
" for " + failing.reason() + ": " + Exceptions.toMessageString(e));
}
}
}
private void markWantToFail(Node node, boolean wantToFail, Mutex lock) {
if (node.status().wantToFail() != wantToFail) {
nodeRepository().nodes().write(node.withWantToFail(wantToFail, Agent.NodeFailer, clock().instant()), lock);
}
}
/** Returns true if node failing should be throttled */
private boolean throttle(Node node) {
if (throttlePolicy == ThrottlePolicy.disabled) return false;
Instant startOfThrottleWindow = clock().instant().minus(throttlePolicy.throttleWindow);
NodeList allNodes = nodeRepository().nodes().list();
NodeList recentlyFailedNodes = allNodes
.matching(n -> n.status().wantToFail() ||
(n.state() == Node.State.failed &&
n.history().hasEventAfter(History.Event.Type.failed, startOfThrottleWindow)));
// Allow failing any node within policy
if (recentlyFailedNodes.size() < throttlePolicy.allowedToFailOf(allNodes.size())) return false;
// Always allow failing a minimum number of hosts
if (node.parentHostname().isEmpty()) {
Set<String> parentsOfRecentlyFailedNodes = recentlyFailedNodes.stream()
.map(n -> n.parentHostname().orElse(n.hostname()))
.collect(Collectors.toSet());
long potentiallyFailed = parentsOfRecentlyFailedNodes.contains(node.hostname()) ?
parentsOfRecentlyFailedNodes.size() :
parentsOfRecentlyFailedNodes.size() + 1;
if (potentiallyFailed <= throttlePolicy.minimumAllowedToFail) return false;
}
// Always allow failing children of a failed host
if (recentlyFailedNodes.parentOf(node).isPresent()) return false;
log.info(String.format("Want to fail node %s, but throttling is in effect: %s", node.hostname(),
throttlePolicy.toHumanReadableString(allNodes.size())));
return true;
}
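/**
* Throttle policies for node failing: at most a fraction of all nodes, and never fewer than a minimum count,
* may be failed within the throttle window. For example, with the hosted policy and 500 known nodes,
* allowedToFailOf(500) = max(500 * 0.04, 2) = 20 failures are allowed per 24-hour window.
*/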
public enum ThrottlePolicy {
hosted(Duration.ofDays(1), 0.04, 2),
disabled(Duration.ZERO, 0, 0);
private final Duration throttleWindow;
private final double fractionAllowedToFail;
private final int minimumAllowedToFail;
ThrottlePolicy(Duration throttleWindow, double fractionAllowedToFail, int minimumAllowedToFail) {
this.throttleWindow = throttleWindow;
this.fractionAllowedToFail = fractionAllowedToFail;
this.minimumAllowedToFail = minimumAllowedToFail;
}
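/** Returns the number of nodes that may be failed within the throttle window, given the total node count */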
public int allowedToFailOf(int totalNodes) {
return (int) Math.max(totalNodes * fractionAllowedToFail, minimumAllowedToFail);
}
public String toHumanReadableString(int totalNodes) {
return String.format("Max %.0f%% (%d) or %d nodes can fail over a period of %s", fractionAllowedToFail*100,
allowedToFailOf(totalNodes),
minimumAllowedToFail, throttleWindow);
}
}
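/** An active node that should be failed, together with the reason for failing it */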
private record FailingNode(Node node, String reason) { }
}