All downloads are free. Search and download functionality uses the official Maven repository.

com.yahoo.vespa.hosted.provision.maintenance.NodeRetirer Maven / Gradle / Ivy

// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.provision.maintenance;

import com.yahoo.config.provision.ApplicationId;
import com.yahoo.config.provision.ClusterSpec;
import com.yahoo.config.provision.Deployer;
import com.yahoo.config.provision.Deployment;
import com.yahoo.config.provision.Flavor;
import com.yahoo.config.provision.NodeType;
import com.yahoo.log.LogLevel;
import com.yahoo.transaction.Mutex;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.maintenance.retire.RetirementPolicy;
import com.yahoo.vespa.hosted.provision.node.Agent;
import com.yahoo.vespa.hosted.provision.provisioning.FlavorSpareChecker;

import java.time.Duration;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.logging.Logger;
import java.util.stream.Collectors;
import java.util.stream.Stream;

/**
 * Automatically retires ready and active nodes if they meet a certain criteria given by the {@link RetirementPolicy}
 * and if there are enough remaining nodes to both replace the retiring node as well as to keep enough in spare.
 *
 * @author freva
 */
public class NodeRetirer extends Maintainer {
    public static final FlavorSpareChecker.SpareNodesPolicy SPARE_NODES_POLICY = flavorSpareCount ->
            flavorSpareCount.getNumReadyAmongReplacees() > 2;

    private static final long MAX_SIMULTANEOUS_RETIRES_PER_CLUSTER = 1;
    private static final Logger log = Logger.getLogger(NodeRetirer.class.getName());

    private final Deployer deployer;
    private final FlavorSpareChecker flavorSpareChecker;
    private final RetirementPolicy retirementPolicy;

    public NodeRetirer(NodeRepository nodeRepository, FlavorSpareChecker flavorSpareChecker, Duration interval,
                       Deployer deployer, JobControl jobControl, RetirementPolicy retirementPolicy) {
        super(nodeRepository, interval, jobControl);
        this.deployer = deployer;
        this.retirementPolicy = retirementPolicy;
        this.flavorSpareChecker = flavorSpareChecker;
    }

    @Override
    protected void maintain() {
        if (! retirementPolicy.isActive()) return;

        if (retireUnallocated()) {
            retireAllocated();
        }
    }

    /**
     * Retires unallocated nodes by moving them directly to parked.
     * Returns true iff all there are no unallocated nodes that match the retirement policy
     */
    boolean retireUnallocated() {
        try (Mutex lock = nodeRepository().lockUnallocated()) {
            List allNodes = nodeRepository().getNodes(NodeType.tenant);
            Map> numSpareNodesByFlavorByState = getNumberOfNodesByFlavorByNodeState(allNodes);
            flavorSpareChecker.updateReadyAndActiveCountsByFlavor(numSpareNodesByFlavorByState);

            long numFlavorsWithUnsuccessfullyRetiredNodes = allNodes.stream()
                    .filter(node -> node.state() == Node.State.ready)
                    .filter(node -> retirementPolicy.shouldRetire(node).isPresent())
                    .collect(Collectors.groupingBy(
                            Node::flavor,
                            Collectors.toSet()))
                    .entrySet().stream()
                    .filter(entry -> {
                        Set nodesThatShouldBeRetiredForFlavor = entry.getValue();
                        for (Iterator iter = nodesThatShouldBeRetiredForFlavor.iterator(); iter.hasNext(); ) {
                            Node nodeToRetire = iter.next();
                            if (! flavorSpareChecker.canRetireUnallocatedNodeWithFlavor(nodeToRetire.flavor())) break;

                            retirementPolicy.shouldRetire(nodeToRetire).ifPresent(reason -> {
                                nodeRepository().write(nodeToRetire.with(nodeToRetire.status().withWantToDeprovision(true)));
                                nodeRepository().park(nodeToRetire.hostname(), Agent.NodeRetirer, reason);
                                iter.remove();
                            });
                        }

                        if (! nodesThatShouldBeRetiredForFlavor.isEmpty()) {
                            String commaSeparatedHostnames = nodesThatShouldBeRetiredForFlavor.stream().map(Node::hostname)
                                    .collect(Collectors.joining(", "));
                            log.info(String.format("Failed to retire %s, wanted to retire %d nodes (%s), but there are no spare nodes left.",
                                    entry.getKey(), nodesThatShouldBeRetiredForFlavor.size(), commaSeparatedHostnames));
                        }
                        return ! nodesThatShouldBeRetiredForFlavor.isEmpty();
                    }).count();

            return numFlavorsWithUnsuccessfullyRetiredNodes == 0;
        }
    }

    void retireAllocated() {
        List allNodes = nodeRepository().getNodes(NodeType.tenant);
        List activeApplications = getActiveApplicationIds(allNodes);
        Map> numSpareNodesByFlavorByState = getNumberOfNodesByFlavorByNodeState(allNodes);
        flavorSpareChecker.updateReadyAndActiveCountsByFlavor(numSpareNodesByFlavorByState);

        // Get all the nodes that we could retire along with their deployments
        Map> nodesToRetireByDeployment = new HashMap<>();
        for (ApplicationId applicationId : activeApplications) {
            Map> nodesByCluster = getNodesBelongingToApplication(allNodes, applicationId).stream()
                    .collect(Collectors.groupingBy(
                            node -> node.allocation().get().membership().cluster().id(),
                            Collectors.toSet()));
            Map> retireableNodesByCluster = nodesByCluster.entrySet().stream()
                    .collect(Collectors.toMap(
                            Map.Entry::getKey,
                            entry -> filterRetireableNodes(entry.getValue())));
            if (retireableNodesByCluster.values().stream().mapToInt(Set::size).sum() == 0) continue;

            Optional deployment = deployer.deployFromLocalActive(applicationId);
            if ( ! deployment.isPresent()) continue; // this will be done at another config server

            Set replaceableNodes = retireableNodesByCluster.entrySet().stream()
                    .flatMap(entry -> entry.getValue().stream()
                            .filter(node -> flavorSpareChecker.canRetireAllocatedNodeWithFlavor(node.flavor()))
                            .limit(getNumberNodesAllowToRetireForCluster(nodesByCluster.get(entry.getKey()), MAX_SIMULTANEOUS_RETIRES_PER_CLUSTER)))
                    .collect(Collectors.toSet());
            if (! replaceableNodes.isEmpty()) nodesToRetireByDeployment.put(deployment.get(), replaceableNodes);
        }

        nodesToRetireByDeployment.forEach(((deployment, nodes) -> {
            ApplicationId app = nodes.iterator().next().allocation().get().owner();
            Set nodesToRetire;

            // While under application lock, get up-to-date node, and make sure that the state and the owner of the
            // node has not changed in the meantime, mutate the up-to-date node (so to not overwrite other fields
            // that may have changed) with wantToRetire and wantToDeprovision.
            try (Mutex lock = nodeRepository().lock(app)) {
                nodesToRetire = nodes.stream()
                        .map(node ->
                                nodeRepository().getNode(node.hostname())
                                        .filter(upToDateNode -> node.state() == Node.State.active)
                                        .filter(upToDateNode -> node.allocation().get().owner().equals(upToDateNode.allocation().get().owner())))
                        .flatMap(node -> node.map(Stream::of).orElseGet(Stream::empty))
                        .collect(Collectors.toSet());

                nodesToRetire.forEach(node ->
                        retirementPolicy.shouldRetire(node).ifPresent(reason -> {
                            log.info("Setting wantToRetire and wantToDeprovision for host " + node.hostname() +
                                    " with flavor " + node.flavor().name() +
                                    " allocated to " + node.allocation().get().owner() + ". Reason: " + reason);

                            Node updatedNode = node.with(node.status()
                                    .withWantToRetire(true)
                                    .withWantToDeprovision(true));
                            nodeRepository().write(updatedNode);
                        }));
            }

            // This takes a while, so do it outside of the application lock
            if (! nodesToRetire.isEmpty()) {
                try {
                    deployment.activate();
                } catch (Exception e) {
                    log.log(LogLevel.INFO, "Failed to redeploy " + app.serializedForm() + ", will be redeployed later by application maintainer", e);
                }
            }
        }));
    }

    private List getNodesBelongingToApplication(Collection allNodes, ApplicationId applicationId) {
        return allNodes.stream()
                .filter(node -> node.allocation().isPresent())
                .filter(node -> node.allocation().get().owner().equals(applicationId))
                .collect(Collectors.toList());
    }

    /**
     * Returns a list of ApplicationIds sorted by number of active nodes the application has allocated to it
     */
    List getActiveApplicationIds(Collection nodes) {
        return nodes.stream()
                .filter(node -> node.state() == Node.State.active)
                .collect(Collectors.groupingBy(
                        node -> node.allocation().get().owner(),
                        Collectors.counting()))
                .entrySet().stream()
                .sorted((c1, c2) -> c2.getValue().compareTo(c1.getValue()))
                .map(Map.Entry::getKey)
                .collect(Collectors.toList());
    }

    /**
     * @param nodes Collection of nodes that are considered for retirement
     * @return Set of nodes that all should eventually be retired
     */
    Set filterRetireableNodes(Collection nodes) {
        return nodes.stream()
                .filter(node -> node.state() == Node.State.active)
                .filter(node -> !node.status().wantToRetire())
                .filter(node -> retirementPolicy.shouldRetire(node).isPresent())
                .collect(Collectors.toSet());
    }

    /**
     * @param clusterNodes All the nodes allocated to an application belonging to a single cluster
     * @return number of nodes we can safely start retiring
     */
    long getNumberNodesAllowToRetireForCluster(Collection clusterNodes, long maxSimultaneousRetires) {
        long numNodesInWantToRetire = clusterNodes.stream()
                .filter(node -> node.status().wantToRetire())
                .filter(node -> node.state() != Node.State.parked)
                .count();
        return Math.max(0, maxSimultaneousRetires - numNodesInWantToRetire);
    }

    private Map> getNumberOfNodesByFlavorByNodeState(Collection allNodes) {
        return allNodes.stream()
                .collect(Collectors.groupingBy(
                        Node::flavor,
                        Collectors.groupingBy(Node::state, Collectors.counting())));
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy