com.yahoo.vespa.hosted.provision.maintenance.NodeFailer Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of node-repository Show documentation
Keeps track of node assignment in a multi-application setup.
The newest version!
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.provision.maintenance;

import ai.vespa.metrics.ConfigServerMetrics;
import com.yahoo.concurrent.UncheckedTimeoutException;
import com.yahoo.config.provision.Deployer;
import com.yahoo.config.provision.Deployment;
import com.yahoo.config.provision.NodeType;
import com.yahoo.config.provision.TransientException;
import com.yahoo.jdisc.Metric;
import com.yahoo.slime.SlimeUtils;
import com.yahoo.transaction.Mutex;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeList;
import com.yahoo.vespa.hosted.provision.NodeMutex;
import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.node.Agent;
import com.yahoo.vespa.hosted.provision.node.Allocation;
import com.yahoo.vespa.hosted.provision.node.History;
import com.yahoo.vespa.orchestrator.ApplicationIdNotFoundException;
import com.yahoo.vespa.orchestrator.status.ApplicationInstanceStatus;
import com.yahoo.yolean.Exceptions;

import java.time.Duration;
import java.time.Instant;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.stream.Collectors;
import java.util.stream.Stream;

/**
 * Maintains information in the node repo about when this node last responded to ping
 * and fails nodes which have not responded within the given time limit.
 *
 * @author bratseth
 * @author mpolden
 */
public class NodeFailer extends NodeRepositoryMaintainer {

    // Logger for decisions about failing nodes, throttling and redeployment problems
    private static final Logger log = Logger.getLogger(NodeFailer.class.getName());

    /** Metric for number of hosts that we want to fail, but cannot due to throttling */
    static final String throttledHostFailuresMetric = ConfigServerMetrics.THROTTLED_HOST_FAILURES.baseName();

    /** Metric for number of nodes that we want to fail, but cannot due to throttling */
    static final String throttledNodeFailuresMetric = ConfigServerMetrics.THROTTLED_NODE_FAILURES.baseName();

    /** Metric that indicates whether throttling is active where 1 means active and 0 means inactive */
    static final String throttlingActiveMetric = "nodeFailThrottling";

    // Used to obtain a deployment of the owning application before a node is marked as wanted to fail
    private final Deployer deployer;
    // How long a non-suspended node may have been down before it is considered failing
    private final Duration downTimeLimit;
    // Down time allowed when the node's history says it is suspended (four times downTimeLimit)
    private final Duration suspendedDownTimeLimit;
    // Decides whether failing another node must be held back
    private final ThrottlePolicy throttlePolicy;
    // Receives the throttling metrics set at the end of each maintenance run
    private final Metric metric;

    /**
     * Creates a node failer.
     *
     * @param deployer       used to obtain deployments of the applications owning failing nodes
     * @param nodeRepository the node repository this maintains
     * @param downTimeLimit  how long a node may be down before it is considered failing;
     *                       suspended nodes are allowed four times this
     * @param interval       the requested interval between maintenance runs
     * @param throttlePolicy the policy limiting how many nodes may be failed
     * @param metric         receiver of the throttling metrics
     */
    public NodeFailer(Deployer deployer, NodeRepository nodeRepository,
                      Duration downTimeLimit, Duration interval, ThrottlePolicy throttlePolicy, Metric metric) {
        // Run every interval, but at least twice as often as the down time limit
        super(nodeRepository, min(downTimeLimit.dividedBy(2), interval), metric);
        this.metric = metric;
        this.throttlePolicy = throttlePolicy;
        this.deployer = deployer;
        // Suspended nodes are given a longer grace period before being failed
        this.suspendedDownTimeLimit = downTimeLimit.multipliedBy(4);
        this.downTimeLimit = downTimeLimit;
    }

    /**
     * Runs one maintenance pass: every active failing node that is of a type allowed to fail is either
     * failed, or counted as throttled when the throttle policy holds it back. The throttling metrics
     * are updated after the pass.
     *
     * @return the result of asSuccessFactorDeviation given the number of failing nodes considered
     *         and the number of those that were throttled, or 0.0 if the node repository is not working
     */
    @Override
    protected double maintain() {
        if ( ! nodeRepository().nodes().isWorking()) return 0.0;

        int considered = 0;
        int throttled = 0;
        int throttledHosts = 0;

        for (FailingNode candidate : findActiveFailingNodes()) {
            considered++;
            Node node = candidate.node();
            if ( ! failAllowedFor(node.type())) continue;

            if ( ! throttle(node)) {
                failActive(candidate);
                continue;
            }

            // Held back by the throttle policy: only count it
            throttled++;
            if (node.type().isHost()) throttledHosts++;
        }

        int throttledNodes = throttled - throttledHosts;
        int throttlingActive = throttled > 0 ? 1 : 0;
        metric.set(throttlingActiveMetric, throttlingActive, null);
        metric.set(throttledHostFailuresMetric, throttledHosts, null);
        metric.set(throttledNodeFailuresMetric, throttledNodes, null);
        return asSuccessFactorDeviation(considered, throttled);
    }

    /**
     * Collects the active nodes that should be considered for failing, each with the reason why.
     * A node is included when
     * <ul>
     *   <li>it is among {@code activeNodes.hosts().failing()},</li>
     *   <li>{@link #downSince(Node)} reports that it has been down for too long, or</li>
     *   <li>it (and, for a host, all its active children) is suspended and its host has reports
     *       saying the host should be failed.</li>
     * </ul>
     * The same node may occur more than once, with different reasons.
     *
     * @return the failing nodes found, possibly empty
     */
    private Collection<FailingNode> findActiveFailingNodes() {
        Set<FailingNode> failingNodes = new HashSet<>();
        NodeList activeNodes = nodeRepository().nodes().list(Node.State.active);

        for (Node host : activeNodes.hosts().failing())
            failingNodes.add(new FailingNode(host, "Host should be failed and have no tenant nodes"));

        for (Node node : activeNodes) {
            downSince(node).ifPresent(instant -> failingNodes.add(new FailingNode(node, "Node has been down since " + instant)));
        }

        for (Node node : activeNodes) {
            if (allSuspended(node, activeNodes)) {
                // The failure reports are looked up on the parent host when there is an active one, otherwise on the node itself
                Node host = node.parentHostname().flatMap(activeNodes::node).orElse(node);
                if (host.type().isHost()) {
                    List<String> failureReports = reasonsToFailHost(host);
                    if ( ! failureReports.isEmpty()) {
                        failingNodes.add(new FailingNode(node, host.equals(node) ?
                                                               "Host has failure reports: " + failureReports :
                                                               "Parent " + host + " has failure reports: " + failureReports));
                    }
                }
            }
        }

        return failingNodes;
    }

    /**
     * Returns a description of each report on the given host whose type says the host should be failed.
     *
     * @param host the node whose reports are inspected
     * @return one string per matching report, empty if there are no such reports
     */
    public static List<String> reasonsToFailHost(Node host) {
        return host.reports().getReports().stream()
                .filter(report -> report.getType().hostShouldBeFailed())
                // The generated string is built from the report's ID, created time, and description only.
                .map(report -> report.getReportId() + " reported " + report.getCreatedTime() + ": " + report.getDescription())
                .toList();
    }

    /**
     * Returns whether there is any reason to fail the host of the given node.
     * The host is the node's parent if it is present in {@code allNodes}, otherwise the node itself.
     */
    static boolean hasHardwareIssue(Node node, NodeList allNodes) {
        Optional<Node> parent = node.parentHostname().flatMap(allNodes::node);
        Node reportHolder = parent.orElse(node);
        boolean nothingReported = reasonsToFailHost(reportHolder).isEmpty();
        return ! nothingReported;
    }

    /**
     * Returns the instant from which the given node is counted as down, if it has been down longer than
     * its grace period and nothing excuses the down time.
     *
     * The down time is counted from the latest of the node's down, resumed and activated instants.
     * The grace period is {@code suspendedDownTimeLimit} when the node's history says it is suspended,
     * otherwise {@code downTimeLimit}. Nodes whose application is suspended, or which are affected by
     * an ongoing maintenance event, are not reported as down.
     *
     * @return the instant the node is counted as down from, or empty if the node should not be failed for being down
     */
    private Optional<Instant> downSince(Node node) {
        Optional<Instant> downInstant = node.history().downSince();
        if (downInstant.isEmpty()) return Optional.empty();

        // downInstant is present, so the stream below is never empty and orElseThrow cannot throw
        Instant downSince = Stream.of(downInstant,
                                      node.history().resumedSince(),
                                      node.history().event(History.Event.Type.activated).map(History.Event::at))
                                  .flatMap(Optional::stream)
                                  .max(Comparator.naturalOrder())
                                  .orElseThrow();
        Duration graceDuration = node.history().isSuspended() ? suspendedDownTimeLimit : downTimeLimit;
        if (clock().instant().isBefore(downSince.plus(graceDuration))) return Optional.empty();

        if (applicationSuspended(node)) return Optional.empty();
        if (affectedByMaintenance(node)) return Optional.empty();

        return Optional.of(downSince);
    }

    /**
     * Returns whether the orchestrator reports the application owning the given node as allowed to be down.
     * The node must have an allocation.
     */
    private boolean applicationSuspended(Node node) {
        try {
            var owner = node.allocation().get().owner();
            var status = nodeRepository().orchestrator().getApplicationInstanceStatus(owner);
            return ApplicationInstanceStatus.ALLOWED_TO_BE_DOWN == status;
        } catch (ApplicationIdNotFoundException e) {
            // The application is unknown to the orchestrator: treat it as not suspended, so the node may still be failed
            return false;
        }
    }

    /**
     * Returns whether the node's "vcmr" report has an "upcoming" entry whose planned start time is before,
     * and planned end time is after, the current time in epoch seconds. Returns false when there is no such report.
     */
    private boolean affectedByMaintenance(Node node) {
        var vcmrReport = node.reports().getReport("vcmr");
        if (vcmrReport.isEmpty()) return false;

        var upcoming = vcmrReport.get().getInspector().field("upcoming");
        return SlimeUtils.entriesStream(upcoming).anyMatch(cmr -> {
            long plannedStart = cmr.field("plannedStartTime").asLong();
            long plannedEnd = cmr.field("plannedEndTime").asLong();
            long nowSeconds = clock().instant().getEpochSecond();
            // Both bounds are exclusive
            return plannedStart < nowSeconds && nowSeconds < plannedEnd;
        });
    }

    /**
     * Returns whether the given node is suspended and, if it has no parent, whether all its children
     * among the given active nodes are suspended as well.
     */
    private boolean allSuspended(Node node, NodeList activeNodes) {
        var nodes = nodeRepository().nodes();
        if ( ! nodes.suspended(node)) return false;

        // A node with a parent is not checked for children of its own
        boolean isChild = node.parentHostname().isPresent();
        return isChild || activeNodes.childrenOf(node.hostname()).stream().allMatch(child -> nodes.suspended(child));
    }

    /**
     * Returns whether this may attempt to fail a node of the given type.
     *
     * Any number of tenant and host nodes may be attempted failed, since the operation does not take
     * effect unless the node is replaced. A proxy or proxy host may be failed only when no node of that
     * same type is already in the failed state, as there should be enough redundancy to lose one.
     * All other node types, such as config and controller (hosts), are never failed.
     */
    private boolean failAllowedFor(NodeType nodeType) {
        return switch (nodeType) {
            case tenant, host -> true;
            case proxy, proxyhost -> {
                NodeList failedOfSameType = nodeRepository().nodes().list(Node.State.failed).nodeType(nodeType);
                yield failedOfSameType.isEmpty();
            }
            default -> false;
        };
    }

    /**
     * Called when a node should be moved to the failed state: Do that if it seems safe,
     * which is when the node repo has available capacity to replace the node (and all its tenant nodes if host).
     * Otherwise, not replacing the node ensures (by Orchestrator check) that no further action will be taken.
     *
     * A deployment of the owning application is obtained first; if there is none, nothing is done.
     * The node is then re-read under lock and checked with {@link #shouldFail(Node, FailingNode)} before it is
     * marked as wanted to fail and the application is redeployed. For a host, children which are not active
     * or failed are failed directly, and active children are failed through this method after the lock is
     * released; the host itself is only marked when it has no active children left.
     *
     * @param failing the node to fail and the reason; the node must have an allocation
     */
    private void failActive(FailingNode failing) {
        Optional<Deployment> deployment =
            deployer.deployFromLocalActive(failing.node().allocation().get().owner(), Duration.ofMinutes(5));
        if (deployment.isEmpty()) return;

        boolean redeploy = false;
        // If the active node that we are trying to fail is of type host, we need to successfully fail all
        // the children nodes running on it before we fail the host.  Failing a child node in a dynamically
        // provisioned zone may require provisioning new hosts that require the host application lock to be held,
        // so we must release ours before failing the children.
        if (failing.node().type().isHost()) {
            List<FailingNode> activeChildrenToFail = new ArrayList<>();
            try (var lock = nodeRepository().nodes().lockAndGetRecursively(failing.node().hostname(), Optional.empty())) {
                failing = shouldFail(lock.parent().node(), failing);
                if (failing == null) return;

                String reasonForChildFailure = "Failing due to parent host " + failing.node().hostname() + " failure: " + failing.reason();
                for (var failingTenantNode : lock.children()) {
                    if (failingTenantNode.node().state() == Node.State.active) {
                        activeChildrenToFail.add(new FailingNode(failingTenantNode.node(), reasonForChildFailure));
                    } else if (failingTenantNode.node().state() != Node.State.failed) {
                        nodeRepository().nodes().fail(failingTenantNode.node().hostname(), Agent.NodeFailer, reasonForChildFailure);
                    }
                }

                // The host is only marked once there are no active children left to fail
                if (activeChildrenToFail.isEmpty()) {
                    log.log(Level.INFO, "Failing out " + failing.node() + ": " + failing.reason());
                    markWantToFail(failing.node(), true, lock.parent());
                    redeploy = true;
                }
            }
            // In a dynamically provisioned zone the failing of the first child may require a new host to be provisioned,
            // so failActive() may take a long time to complete, but the remaining children should be fast.
            activeChildrenToFail.forEach(this::failActive);
        }
        else {
            try (var lock = nodeRepository().nodes().lockAndGetRequired(failing.node())) {
                failing = shouldFail(lock.node(), failing);
                if (failing == null) return;

                log.log(Level.INFO, "Failing out " + failing.node() + ": " + failing.reason());
                markWantToFail(failing.node(), true, lock);
                redeploy = true;
            }
        }

        // Redeploy to replace failing node
        if (redeploy) {
            redeploy(deployment.get(), failing);
        }
    }

    /**
     * Re-checks, with the node read under the proper lock, that it still makes sense to fail it.
     *
     * @param fresh the node as read under lock
     * @param stale the failing node found before the lock was taken
     * @return a failing node holding the fresh node and the original reason, or null if the owner or state
     *         has changed in the meantime, or the node is already failed
     */
    private static FailingNode shouldFail(Node fresh, FailingNode stale) {
        boolean sameOwner = Objects.equals(stale.node().allocation().map(Allocation::owner),
                                           fresh.allocation().map(Allocation::owner));
        boolean alreadyFailed = fresh.state() == Node.State.failed;
        boolean sameState = Objects.equals(stale.node().state(), fresh.state());

        if ( ! sameOwner || alreadyFailed || ! sameState) return null;
        return new FailingNode(fresh, stale.reason());
    }

    /**
     * Activates the given deployment in order to replace the failing node.
     *
     * Transient errors and timeouts are only logged. On any other runtime exception the want-to-fail flag
     * is cleared again, if the node can still be locked, so that failing is retried on a later run.
     *
     * @param deployment the deployment to activate
     * @param failing    the node which has been marked as wanted to fail; must have an allocation
     */
    private void redeploy(Deployment deployment, FailingNode failing) {
        try {
            deployment.activate();
        } catch (TransientException | UncheckedTimeoutException e) {
            log.log(Level.INFO, "Failed to redeploy " + failing.node().allocation().get().owner() +
                                " with a transient error, will be retried by application maintainer: " +
                                Exceptions.toMessageString(e));
        } catch (RuntimeException e) {
            // Reset want to fail: We'll retry failing unless it heals in the meantime
            Optional<NodeMutex> optionalNodeMutex = nodeRepository().nodes().lockAndGet(failing.node());
            if (optionalNodeMutex.isEmpty()) return;
            try (var nodeMutex = optionalNodeMutex.get()) {
                markWantToFail(nodeMutex.node(), false, nodeMutex);
                log.log(Level.WARNING, "Could not fail " + failing.node() + " for " + failing.node().allocation().get().owner() +
                                       " for " + failing.reason() + ": " + Exceptions.toMessageString(e));
            }
        }
    }

    /**
     * Writes the given want-to-fail value to the node, under the given lock, unless the node already has that value.
     */
    private void markWantToFail(Node node, boolean wantToFail, Mutex lock) {
        boolean alreadySet = node.status().wantToFail() == wantToFail;
        if (alreadySet) return;

        Node updated = node.withWantToFail(wantToFail, Agent.NodeFailer, clock().instant());
        nodeRepository().nodes().write(updated, lock);
    }

    /**
     * Returns true if node failing should be throttled.
     *
     * Nodes count as recently failed when they are wanted to fail, or are in the failed state with a
     * failed event after the start of the throttle window. Failing is not throttled when
     * <ul>
     *   <li>throttling is disabled,</li>
     *   <li>fewer nodes than the policy allows are recently failed,</li>
     *   <li>the node has no parent and failing it keeps the number of affected hosts within the policy's minimum, or</li>
     *   <li>the parent of the node is itself recently failed.</li>
     * </ul>
     *
     * @param node the node we want to fail
     */
    private boolean throttle(Node node) {
        if (throttlePolicy == ThrottlePolicy.disabled) return false;
        Instant startOfThrottleWindow = clock().instant().minus(throttlePolicy.throttleWindow);
        NodeList allNodes = nodeRepository().nodes().list();
        NodeList recentlyFailedNodes = allNodes
                .matching(n -> n.status().wantToFail() ||
                               (n.state() == Node.State.failed &&
                                n.history().hasEventAfter(History.Event.Type.failed, startOfThrottleWindow)));

        // Allow failing any node within policy
        if (recentlyFailedNodes.size() < throttlePolicy.allowedToFailOf(allNodes.size())) return false;

        // Always allow failing a minimum number of hosts
        if (node.parentHostname().isEmpty()) {
            // Each recently failed node is represented by its parent, or by itself if it has none
            Set<String> parentsOfRecentlyFailedNodes = recentlyFailedNodes.stream()
                                                                          .map(n -> n.parentHostname().orElse(n.hostname()))
                                                                          .collect(Collectors.toSet());
            long potentiallyFailed = parentsOfRecentlyFailedNodes.contains(node.hostname()) ?
                                     parentsOfRecentlyFailedNodes.size() :
                                     parentsOfRecentlyFailedNodes.size() + 1;
            if (potentiallyFailed <= throttlePolicy.minimumAllowedToFail) return false;
        }

        // Always allow failing children of a failed host
        if (recentlyFailedNodes.parentOf(node).isPresent()) return false;

        log.info(String.format("Want to fail node %s, but throttling is in effect: %s", node.hostname(),
                               throttlePolicy.toHumanReadableString(allNodes.size())));

        return true;
    }

    /** Limits on how many nodes may be failed within a time window. */
    public enum ThrottlePolicy {

        hosted(Duration.ofDays(1), 0.04, 2),
        disabled(Duration.ZERO, 0, 0);

        /** The period over which failed nodes are counted */
        private final Duration throttleWindow;
        /** The fraction of all nodes which may fail within the window */
        private final double fractionAllowedToFail;
        /** The number of nodes which may always fail within the window */
        private final int minimumAllowedToFail;

        ThrottlePolicy(Duration window, double fraction, int minimum) {
            throttleWindow = window;
            fractionAllowedToFail = fraction;
            minimumAllowedToFail = minimum;
        }

        /**
         * Returns the number of nodes allowed to fail out of the given total: the larger of the allowed
         * fraction of the total and the minimum, with any fractional part dropped.
         */
        public int allowedToFailOf(int totalNodes) {
            double byFraction = totalNodes * fractionAllowedToFail;
            double limit = Math.max(byFraction, minimumAllowedToFail);
            return (int) limit;
        }

        /** Returns a description of this policy as applied to the given total number of nodes. */
        public String toHumanReadableString(int totalNodes) {
            double percentAllowed = fractionAllowedToFail*100;
            int nodesAllowed = allowedToFailOf(totalNodes);
            return String.format("Max %.0f%% (%d) or %d nodes can fail over a period of %s",
                                 percentAllowed, nodesAllowed, minimumAllowedToFail, throttleWindow);
        }

    }

    /** A node which is a candidate for being failed, and the reason, which is used in log messages and when failing its children. */
    private record FailingNode(Node node, String reason) { }

}