com.yahoo.vespa.hosted.provision.autoscale.ClusterModel Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of node-repository Show documentation
Keeps track of node assignment in a multi-application setup.
There is a newer version: 8.441.21
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.provision.autoscale;

import com.yahoo.config.provision.CloudAccount;
import com.yahoo.config.provision.ClusterSpec;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeList;
import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.applications.Application;
import com.yahoo.vespa.hosted.provision.applications.Cluster;
import com.yahoo.config.provision.CapacityPolicies;

import java.time.Clock;
import java.time.Duration;
import java.time.Instant;
import java.util.Objects;
import java.util.OptionalDouble;

/**
 * A cluster with its associated metrics which allows prediction about its future behavior.
 * For single-threaded, short-term usage.
 *
 * Query statistics (average query rate, max growth rate and fraction of max) are computed lazily
 * and then cached for the lifetime of this instance: once a value is present it is returned
 * regardless of the instant passed to later calls.
 *
 * @author bratseth
 */
public final class ClusterModel {

    /** Containers typically use more cpu right after generation change, so discard those metrics */
    public static final Duration warmupDuration = Duration.ofMinutes(7);

    /** If we have less than this query rate, we cannot be fully confident in our load data, which influences some decisions. */
    public static final double queryRateGivingFullConfidence = 100.0;

    static final double idealQueryCpuLoad = 0.75;
    static final double idealWriteCpuLoad = 0.95;

    static final double idealContainerMemoryLoad = 0.9;
    static final double idealContentMemoryLoad = 0.65;

    static final double idealContainerDiskLoad = 0.95;
    static final double idealContentDiskLoad = 0.6;

    // Memory for other processes running on the node (config-proxy, metrics-proxy).
    // Keep in sync with config-model/NodeResourcesTuning.
    static final double nodeMemoryOverheadGb = 0.7;

    // When a query is issued on a node the cost is the sum of a fixed cost component and a cost component
    // proportional to document count. We must account for this when comparing configurations with more or fewer nodes.
    // TODO: Measure this, and only take it into account with queries
    private static final double fixedCpuCostFraction = 0.1;

    private final NodeRepository nodeRepository;
    private final CapacityPolicies capacityPolicies;
    private final Application application;
    private final ClusterSpec clusterSpec;
    private final Cluster cluster;
    private final AllocatableResources current;

    private final MemoryModel memory = new MemoryModel();
    private final DiskModel disk = new DiskModel();

    /**
     * The current active nodes of this cluster, including retired,
     * or empty if this models a new cluster not yet deployed.
     */
    private final NodeList nodes;

    private final Clock clock;
    private final Duration scalingDuration;
    private final Duration allocationDuration;
    private final ClusterTimeseries clusterTimeseries;
    private final ClusterNodesTimeseries nodeTimeseries;
    private final Instant at;

    // Lazily initialized members. Once set these are never recomputed, see the class comment.
    private Double queryFractionOfMax = null;
    private Double maxQueryGrowthRate = null;
    private OptionalDouble averageQueryRate = OptionalDouble.empty();

    /**
     * Creates a model of the given cluster where the scaling and allocation durations are taken from
     * the cluster and the timeseries are read from the given metrics db.
     *
     * @param clusterNodes the nodes of the cluster, which may be empty if the cluster is not yet deployed
     */
    public ClusterModel(NodeRepository nodeRepository,
                        Application application,
                        ClusterSpec clusterSpec,
                        Cluster cluster,
                        NodeList clusterNodes,
                        AllocatableResources current,
                        MetricsDb metricsDb,
                        Clock clock) {
        this.nodeRepository = nodeRepository;
        this.capacityPolicies = nodeRepository.capacityPoliciesFor(application.id());
        this.application = application;
        this.clusterSpec = clusterSpec;
        this.cluster = cluster;
        this.nodes = clusterNodes;
        this.current = current;
        this.clock = clock;
        this.scalingDuration = cluster.scalingDuration();
        this.allocationDuration = cluster.allocationDuration(clusterSpec);
        this.clusterTimeseries = metricsDb.getClusterTimeseries(application.id(), cluster.id());
        this.nodeTimeseries = new ClusterNodesTimeseries(scalingDuration(), cluster, nodes, metricsDb);
        this.at = clock.instant();
    }

    /**
     * Creates a model with explicitly supplied durations and timeseries.
     * A model created this way has an empty node list, so node and group counts
     * are taken from the min resources of the cluster.
     */
    ClusterModel(NodeRepository nodeRepository,
                 Application application,
                 ClusterSpec clusterSpec,
                 Cluster cluster,
                 AllocatableResources current,
                 Clock clock,
                 Duration scalingDuration,
                 Duration allocationDuration,
                 ClusterTimeseries clusterTimeseries,
                 ClusterNodesTimeseries nodeTimeseries) {
        this.nodeRepository = nodeRepository;
        this.capacityPolicies = nodeRepository.capacityPoliciesFor(application.id());
        this.application = application;
        this.clusterSpec = clusterSpec;
        this.cluster = cluster;
        this.nodes = NodeList.of();
        this.current = current;
        this.clock = clock;

        this.scalingDuration = scalingDuration;
        this.allocationDuration = allocationDuration;
        this.clusterTimeseries = clusterTimeseries;
        this.nodeTimeseries = nodeTimeseries;
        this.at = clock.instant();
    }


    /**
     * The central decision made in autoscaling.
     *
     * @return the relative load adjustment that should be made to this cluster given available measurements.
     *         For example, a load adjustment of 2 means we should allocate twice the amount of that resource.
     */
    public Load loadAdjustment() {
        if (nodeTimeseries().measurementsPerNode() < 0.5) return Load.one(); // Don't change based on very little data
        Load adjustment = peakLoad().divide(idealLoad());
        if (! safeToScaleDown())
            adjustment = adjustment.map(v -> v < 1 ? 1 : v); // Clamp: only adjustments upwards are kept
        return adjustment;
    }

    public Application application() { return application; }
    public ClusterSpec clusterSpec() { return clusterSpec; }

    /** Returns the cloud account of the cluster, or the empty account if it has none. */
    public CloudAccount cloudAccount() { return cluster.cloudAccount().orElse(CloudAccount.empty); }
    public AllocatableResources current() { return current; }
    private ClusterNodesTimeseries nodeTimeseries() { return nodeTimeseries; }
    private ClusterTimeseries clusterTimeseries() { return clusterTimeseries; }

    /** Returns the instant this model was created. */
    public Instant at() { return at; }

    /** Returns the predicted duration of a rescaling of this cluster */
    public Duration scalingDuration() { return scalingDuration; }

    /**
     * Returns the predicted duration of a resource change in this cluster,
     * until we, or the application, will change it again.
     */
    public Duration allocationDuration() { return allocationDuration; }

    /** Returns whether the node timeseries of this is empty. */
    public boolean isEmpty() { return nodeTimeseries().isEmpty(); }

    /** Returns whether the cluster type of this is a content type. */
    public boolean isContent() { return clusterSpec.type().isContent(); }

    /** Returns the predicted duration of data redistribution in this cluster. */
    public Duration redistributionDuration() {
        if (! isContent()) return Duration.ofMinutes(0);
        return scalingDuration(); // TODO: Estimate separately
    }

    /** Returns the predicted duration of replacing all the nodes in this cluster. */
    public Duration nodeReplacementDuration() {
        return Duration.ofMinutes(5); // TODO: Estimate?
    }

    /** Returns the average of the peak load measurement in each dimension, from each node. */
    public Load peakLoad() {
        return nodeTimeseries().peakLoad();
    }

    /** Returns the relative load adjustment accounting for redundancy in this. */
    private Load redundancyAdjustment(Instant now) {
        return loadWith(nodeCount(), groupCount(), now);
    }

    public boolean isExclusive() {
        return nodeRepository.exclusivity().allocation(clusterSpec);
    }

    /**
     * Returns whether this cluster is stable: No node is retiring, retired or removable,
     * and the application owns no reserved or provisioned nodes.
     *
     * @param nodeRepository the node repository to look up the nodes of the application in.
     *                       Note that this parameter shadows the field of the same name.
     */
    public boolean isStable(NodeRepository nodeRepository) {
        // The cluster is processing recent changes
        if (nodes.stream().anyMatch(node -> node.status().wantToRetire() ||
                                            node.allocation().get().membership().retired() ||
                                            node.allocation().get().removable()))
            return false;

        // A deployment is ongoing
        if ( ! nodeRepository.nodes().list(Node.State.reserved, Node.State.provisioned).owner(application.id()).isEmpty())
            return false;

        return true;
    }

    /**
     * Are we in a position to make decisions to scale down at this point?
     * We are not if a scaling event is ongoing or completed within the last 3 scaling durations,
     * or if we do not have measurements from exactly as many nodes as this cluster has.
     */
    public boolean safeToScaleDown() {
        if (hasScaledIn(scalingDuration().multipliedBy(3))) return false;
        if (nodeTimeseries().nodesMeasured() != nodeCount()) return false;
        return true;
    }

    public static Duration minScalingDuration() {
        return Duration.ofMinutes(5);
    }

    /** Transforms the given load adjustment to an equivalent adjustment given a target number of nodes and groups. */
    public Load loadAdjustmentWith(int nodes, int groups, Load loadAdjustment, Instant now) {
        return loadAdjustment // redundancy adjusted target relative to current load
               .multiply(loadWith(nodes, groups, now)) // redundancy aware adjustment with these counts
               .divide(redundancyAdjustment(now));   // correct for double redundancy adjustment
    }

    /**
     * Returns the relative load adjustment accounting for redundancy given these nodes+groups
     * relative to the nodes+groups in this.
     */
    Load loadWith(int givenNodes, int givenGroups, Instant now) {
        // Note: These locals are the redundancy adjusted *target* counts and shadow the nodes field.
        // The current counts of this are read through nodeCount(), groupCount() and groupSize().
        int nodes = nodesAdjustedForRedundancy(givenNodes, givenGroups);
        int groups = groupsAdjustedForRedundancy(givenNodes, givenGroups);
        if (clusterSpec().type() == ClusterSpec.Type.content) { // load scales with node share of content
            int groupSize = nodes / groups;

            // Cpu: Query cpu scales with cluster size, write cpu scales with group size
            // Memory and disk: Scales with group size

            // The fixed cost portion of cpu does not scale with changes to the node count
            double queryCpuPerGroup = fixedCpuCostFraction + (1 - fixedCpuCostFraction) * groupSize() / groupSize;

            double queryCpu = queryCpuPerGroup * groupCount() / groups;
            double writeCpu = (double)groupSize() / groupSize;
            CpuModel cpu = cpu(now);
            return new Load(cpu.queryFraction() * queryCpu + (1 - cpu.queryFraction()) * writeCpu,
                            (1 - memory.fixedFraction()) * (double) groupSize() / groupSize + memory.fixedFraction() * 1,
                            (double)groupSize() / groupSize,
                            1,
                            1);
        }
        else {
            // Only the first load dimension scales with the node count for other cluster types
            return new Load((double) nodeCount() / nodes, 1, 1, 1, 1);
        }
    }

    /**
     * Returns the ideal load across the nodes of this such that each node will be at ideal load
     * if one of the nodes go down.
     */
    public Load idealLoad() {
        CpuModel cpu = cpu(clock.instant());
        // Compute each ideal once: The cpu ideal reads timeseries data and is used for two dimensions
        double idealCpu = cpu.idealLoad();
        double idealMemory = memory.idealLoad();
        // NOTE(review): the last two dimensions reuse the cpu and memory ideals,
        // presumably these are the gpu dimensions of Load - confirm against Load
        var ideal = new Load(idealCpu, idealMemory, disk.idealLoad(), idealCpu, idealMemory).divide(redundancyAdjustment(cpu.at()));
        if ( !cluster.bcpGroupInfo().isEmpty() && cluster.bcpGroupInfo().queryRate() > 0) {
            // Since we have little local information, use information about query cost in other groups
            Load bcpGroupIdeal = adjustQueryDependentIdealLoadByBcpGroupInfo(ideal, cpu);

            // Do a weighted sum of the ideal "vote" based on local and bcp group info.
            // This avoids any discontinuities with a near-zero local query rate.
            double localInformationWeight = Math.min(1, averageQueryRate(cpu.at()).orElse(0) /
                                                        Math.min(queryRateGivingFullConfidence, cluster.bcpGroupInfo().queryRate()));
            ideal = ideal.multiply(localInformationWeight).add(bcpGroupIdeal.multiply(1 - localInformationWeight));
        }
        return ideal;
    }

    /** Returns a new cpu model of this cluster at the given instant. */
    public CpuModel cpu(Instant now) {
        return new CpuModel(now);
    }

    /** Returns whether the scaling duration of this is strictly shorter than the bcp deadline of the cluster. */
    private boolean canRescaleWithinBcpDeadline() {
        return scalingDuration().minus(cluster.clusterInfo().bcpDeadline()).isNegative();
    }

    /** Returns the query rate, growth rate headroom and cpu cost per query of this at the current time, using 0 for missing values. */
    public Autoscaling.Metrics metrics() {
        Instant now = clock.instant();
        return new Autoscaling.Metrics(averageQueryRate(now).orElse(0),
                                       growthRateHeadroom(now),
                                       cpu(now).costPerQuery().orElse(0));
    }

    /**
     * Returns the vcpu of the first non-retired node of this cluster,
     * or empty if there is no such node, which is the case when this models a cluster not yet deployed.
     * NOTE(review): callers use this as the vcpu of every node, i.e. assume homogeneous nodes - confirm.
     */
    private OptionalDouble vcpuPerActiveNode() {
        var firstActive = nodes.not().retired().first();
        if (firstActive.isEmpty()) return OptionalDouble.empty();
        return OptionalDouble.of(firstActive.get().resources().vcpu());
    }

    /**
     * Returns the given ideal load with the cpu component lowered if the query cost information from
     * the bcp group indicates that we need more cpu than we currently have.
     * The ideal is returned unchanged if this has no active nodes to compare against.
     */
    private Load adjustQueryDependentIdealLoadByBcpGroupInfo(Load ideal, CpuModel cpu) {
        OptionalDouble vcpuPerNode = vcpuPerActiveNode();
        if (vcpuPerNode.isEmpty()) return ideal; // No current allocation to relate the bcp group info to

        double currentClusterTotalVcpuPerGroup = vcpuPerNode.getAsDouble() * groupSize();
        double targetQueryRateToHandle = ( canRescaleWithinBcpDeadline() ? averageQueryRate(cpu.at()).orElse(0)
                                                                         : cluster.bcpGroupInfo().queryRate() )
                                         * cluster.bcpGroupInfo().growthRateHeadroom() * trafficShiftHeadroom(cpu.at());
        double neededTotalVcpuPerGroup = cluster.bcpGroupInfo().cpuCostPerQuery() * targetQueryRateToHandle / groupCount() +
                                        ( 1 - cpu.queryFraction()) * cpu.idealLoad() *
                                        (clusterSpec.type().isContainer() ? 1 : groupSize());
        // Max 1: Only use bcp group info if it indicates that we need to scale *up*
        double cpuAdjustment = Math.max(1.0, neededTotalVcpuPerGroup / currentClusterTotalVcpuPerGroup);
        return ideal.withCpu(ideal.cpu() / cpuAdjustment);
    }

    /** Returns whether a scaling event of this cluster is ongoing, or completed within the given period before now. */
    private boolean hasScaledIn(Duration period) {
        if (cluster.lastScalingEvent().isEmpty()) return false;
        var lastCompletion = cluster.lastScalingEvent().get().completion();
        if (lastCompletion.isEmpty()) return true; // Ongoing
        return lastCompletion.get().isAfter(clock.instant().minus(period));
    }

    /**
     * Returns the predicted max query growth rate per minute as a fraction of the average traffic
     * in the scaling window.
     */
    private double maxQueryGrowthRate(Instant now) {
        if (maxQueryGrowthRate != null) return maxQueryGrowthRate;
        return maxQueryGrowthRate = clusterTimeseries().maxQueryGrowthRate(scalingDuration(), now);
    }

    /** Returns the average query rate in the scaling window as a fraction of the max observed query rate. */
    private double queryFractionOfMax(Instant now) {
        if (queryFractionOfMax != null) return queryFractionOfMax;
        return queryFractionOfMax = clusterTimeseries().queryFractionOfMax(scalingDuration(), now);
    }

    /** Returns the average query rate in the scaling window. An empty result is not cached. */
    private OptionalDouble averageQueryRate(Instant now) {
        if (averageQueryRate.isPresent()) return averageQueryRate;
        return averageQueryRate = clusterTimeseries().queryRate(scalingDuration(), now);
    }

    /** The number of nodes this cluster has, or will have if not deployed yet. */
    // TODO: Make this the deployed, not current count
    public int nodeCount() {
        if ( ! nodes.isEmpty()) return (int)nodes.not().retired().stream().count();
        return cluster.minResources().nodes();
    }

    /** The number of groups this cluster has, or will have if not deployed yet. */
    // TODO: Make this the deployed, not current count
    private int groupCount() {
        if ( ! nodes.isEmpty()) {
            // Count the distinct group indexes among the non-retired nodes
            return (int)nodes.not().retired().stream()
                             .mapToInt(node -> node.allocation().get().membership().cluster().group().get().index())
                             .distinct()
                             .count();
        }
        return cluster.minResources().groups();
    }

    private int groupSize() {
        // ceil: If the division does not produce a whole number we assume some node is missing
        return (int)Math.ceil((double)nodeCount() / groupCount());
    }

    /**
     * Returns the number of nodes remaining if we lose one node (with a single group)
     * or one whole group (with multiple groups). A single node is returned unchanged.
     */
    private static int nodesAdjustedForRedundancy(int nodes, int groups) {
        int groupSize = (int)Math.ceil((double)nodes / groups);
        return nodes > 1 ? (groups == 1 ? nodes - 1 : nodes - groupSize) : nodes;
    }

    /** Returns the number of groups remaining if we lose one group, but never less than 1 when there are multiple nodes. */
    private static int groupsAdjustedForRedundancy(int nodes, int groups) {
        return nodes > 1 ? (groups == 1 ? 1 : groups - 1) : groups;
    }

    /** Returns the headroom for growth during organic traffic growth as a multiple of current resources. */
    private double growthRateHeadroom(Instant now) {
        if ( ! nodeRepository.zone().environment().isProduction()) return 1;
        double growthRateHeadroom = 1 + maxQueryGrowthRate(now) * scalingDuration().toMinutes();
        // Cap headroom at 10% above the historical observed peak
        if (queryFractionOfMax(now) != 0)
            growthRateHeadroom = Math.min(growthRateHeadroom, 1 / queryFractionOfMax(now) + 0.1);

        return adjustByConfidence(growthRateHeadroom, now);
    }

    /**
     * Returns the headroom needed to handle sudden arrival of additional traffic due to another zone going down
     * as a multiple of current resources.
     */
    private double trafficShiftHeadroom(Instant now) {
        if ( ! nodeRepository.zone().environment().isProduction()) return 1;
        if (canRescaleWithinBcpDeadline()) return 1;
        double trafficShiftHeadroom;
        if (application.status().maxReadShare() == 0) // No traffic fraction data
            trafficShiftHeadroom = 2.0; // assume we currently get half of the max possible share of traffic
        else if (application.status().currentReadShare() == 0)
            trafficShiftHeadroom = 1/application.status().maxReadShare();
        else
            trafficShiftHeadroom = application.status().maxReadShare() / application.status().currentReadShare();
        // NOTE(review): with a max read share of 0 the cap below is a division by zero, which only works
        // if maxReadShare() is a floating point value (yielding infinity, so the min keeps 2.0) - confirm
        return adjustByConfidence(Math.min(trafficShiftHeadroom, 1/application.status().maxReadShare()), now);
    }

    /**
     * Headroom values are a multiplier of the current query rate.
     * Adjust this value closer to 1 if the query rate is too low to derive statistical conclusions
     * with high confidence to avoid large adjustments caused by random noise due to low traffic numbers.
     */
    private double adjustByConfidence(double headroom, Instant now) {
        return ( (headroom -1 ) * Math.min(1, averageQueryRate(now).orElse(0) / queryRateGivingFullConfidence) ) + 1;
    }

    /** A model of the cpu usage of the enclosing cluster model at a given instant. */
    public class CpuModel {

        private final Instant at;

        public CpuModel(Instant at) {
            this.at = Objects.requireNonNull(at);
        }

        Instant at() {
            return at;
        }

        /** Ideal cpu load must take the application traffic fraction into account. */
        double idealLoad() {
            double queryCpuFraction = queryFraction();
            // Assumptions: 1) Write load is not organic so we should not increase to handle potential future growth.
            //                 (TODO: But allow applications to set their target write rate and size for that)
            //              2) Write load does not change in BCP scenarios.
            return queryCpuFraction * 1/growthRateHeadroom(at) * 1 / trafficShiftHeadroom(at) * idealQueryCpuLoad +
                   (1 - queryCpuFraction) * idealWriteCpuLoad;
        }

        /**
         * Returns the estimated cpu cost of a query, or empty if it cannot be estimated because
         * there is no (non-zero) query rate or this cluster has no active nodes.
         */
        OptionalDouble costPerQuery() {
            OptionalDouble queryRate = averageQueryRate(at);
            if (queryRate.isEmpty() || queryRate.getAsDouble() == 0.0) return OptionalDouble.empty();
            OptionalDouble vcpuPerNode = vcpuPerActiveNode();
            if (vcpuPerNode.isEmpty()) return OptionalDouble.empty(); // No node to read the cpu resources from
            // TODO: Query rate should generally be sampled at the time where we see the peak resource usage
            int fanOut = clusterSpec.type().isContainer() ? 1 : groupSize();
            return OptionalDouble.of(peakLoad().cpu() * queryFraction() * fanOut * vcpuPerNode.getAsDouble()
                                     / queryRate.getAsDouble() / groupCount());
        }

        /** The estimated fraction of cpu usage which goes to processing queries vs. writes */
        double queryFraction() {
            OptionalDouble writeRate = clusterTimeseries().writeRate(scalingDuration(), at);
            // With no traffic information at all, assume an even split between query and write rate
            if (averageQueryRate(at).orElse(0) == 0 && writeRate.orElse(0) == 0) return queryFraction(0.5);
            return queryFraction(averageQueryRate(at).orElse(0) / (averageQueryRate(at).orElse(0) + writeRate.orElse(0)));
        }

        /**
         * Returns the fraction of cpu usage going to queries given the fraction of the total
         * operation rate which are queries, weighting each query as more costly than a write.
         */
        double queryFraction(double queryRateFraction) {
            double relativeQueryCost = 9; // How much more expensive are queries than writes? TODO: Measure
            double writeFraction = 1 - queryRateFraction;
            return queryRateFraction * relativeQueryCost / (queryRateFraction * relativeQueryCost + writeFraction);
        }

        @Override
        public String toString() {
            return "cpu model idealLoad: " + idealLoad() + ", queryFraction: " + queryFraction() +
                   ", growthRateHeadroom: " + growthRateHeadroom(at) + ", trafficShiftHeadroom: " + trafficShiftHeadroom(at);
        }

    }

    /** A model of the memory usage of the enclosing cluster model. */
    private class MemoryModel {

        double idealLoad() {
            if (clusterSpec.type().isContainer()) return idealContainerMemoryLoad;
            if (clusterSpec.type() == ClusterSpec.Type.admin) return idealContainerMemoryLoad; // Not autoscaled, but ideal shown in console
            return idealContentMemoryLoad;
        }

        /**
         * Returns the fraction of memory of the current allocation which is currently consumed by
         * fixed data structures which take the same amount of space regardless of document volume.
         */
        double fixedFraction() {
            if (clusterSpec().type().isContainer()) return 1.0;
            double averageReal = averageReal(); // Computed once as this may average over all nodes
            double fixedMemory = nodeMemoryOverheadGb +
                                 (averageReal - nodeMemoryOverheadGb) * 0.05; // TODO: Measure actual content node usage
            return fixedMemory / averageReal;
        }

        /**
         * Returns the average real memory in GiB of the nodes of this cluster, or the real memory
         * resulting from the min resources of the cluster if it has no nodes.
         */
        double averageReal() {
            if (nodes.isEmpty()) { // we're estimating
                var initialResources = capacityPolicies.specifyFully(cluster.minResources().nodeResources(), clusterSpec);
                return nodeRepository.resourcesCalculator().requestToReal(initialResources,
                                                                          cloudAccount(),
                                                                          nodeRepository.exclusivity().allocation(clusterSpec),
                                                                          false).memoryGiB();
            }
            else {
                return nodes.stream()
                            .mapToDouble(node -> nodeRepository.resourcesCalculator().realResourcesOf(node, nodeRepository).memoryGiB())
                            .average()
                            .getAsDouble();
            }
        }

    }

    /** A model of the disk usage of the enclosing cluster model. */
    private class DiskModel {

        double idealLoad() {
            // Stateless clusters are not expected to consume more disk over time -
            // if they do it is due to logs which will be rotated away right before the disk is full
            return clusterSpec.isStateful() ? idealContentDiskLoad : idealContainerDiskLoad;
        }

    }

}