All downloads are free. Search and download functionality uses the official Maven repository.

com.yahoo.vespa.hosted.provision.maintenance.MaintenanceDeployment Maven / Gradle / Ivy

// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.provision.maintenance;

import ai.vespa.metrics.ConfigServerMetrics;
import com.yahoo.concurrent.UncheckedTimeoutException;
import com.yahoo.config.provision.ApplicationId;
import com.yahoo.config.provision.Deployer;
import com.yahoo.config.provision.Deployment;
import com.yahoo.config.provision.TransientException;
import com.yahoo.config.provision.exception.ActivationConflictException;
import com.yahoo.jdisc.Metric;
import com.yahoo.transaction.Mutex;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeMutex;
import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.node.Agent;
import com.yahoo.yolean.Exceptions;

import java.io.Closeable;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.function.Supplier;
import java.util.logging.Level;
import java.util.logging.Logger;

/**
 * A wrapper of a deployment suitable for maintenance.
 * This is a single-use, single-thread object.
 *
 * @author bratseth
 */
class MaintenanceDeployment implements Closeable {

    private static final Logger log = Logger.getLogger(MaintenanceDeployment.class.getName());

    private final ApplicationId application;
    private final Metric metric;

    /** The maintenance lock held for the application, or empty if it could not be acquired */
    private final Optional<Mutex> lock;

    /** The deployment to carry out, or empty if this is not valid (see {@link #isValid()}) */
    private final Optional<Deployment> deployment;

    /** Whether non-transient failures in a deployment step are rethrown rather than logged */
    private final boolean throwOnFailure;

    private boolean closed = false;

    /**
     * Creates a maintenance deployment which logs, rather than throws, on non-transient failures.
     *
     * @param application the application to redeploy
     * @param deployer the deployer used to create the deployment
     * @param metric the metric receiver that deployment failures are counted on
     * @param nodeRepository the node repository used to take the maintenance lock and look up active nodes
     */
    public MaintenanceDeployment(ApplicationId application,
                                 Deployer deployer,
                                 Metric metric,
                                 NodeRepository nodeRepository) {
        this(application, deployer, metric, nodeRepository, false);
    }

    /**
     * Creates a maintenance deployment, attempting to take the maintenance lock of the application
     * and to create a deployment of it. If either fails, the resulting instance is not valid.
     *
     * @param application the application to redeploy
     * @param deployer the deployer used to create the deployment
     * @param metric the metric receiver that deployment failures are counted on
     * @param nodeRepository the node repository used to take the maintenance lock and look up active nodes
     * @param throwOnFailure true to rethrow non-transient runtime exceptions from prepare/activate,
     *                       false to log them and report the step as unsuccessful
     */
    public MaintenanceDeployment(ApplicationId application,
                                 Deployer deployer,
                                 Metric metric,
                                 NodeRepository nodeRepository,
                                 boolean throwOnFailure) {
        this.application = application;
        this.metric = metric;
        this.throwOnFailure = throwOnFailure;

        Optional<Mutex> acquiredLock = tryLock(application, nodeRepository);
        try {
            deployment = tryDeployment(acquiredLock, application, deployer, nodeRepository);
            this.lock = acquiredLock;
            // Ownership of the lock is now transferred to this instance: nothing to release in finally
            acquiredLock = Optional.empty();
        } finally {
            // Only non-empty here if creating the deployment threw: release the lock so it is not leaked
            acquiredLock.ifPresent(Mutex::close);
        }
    }

    /** Return whether this is - as yet - functional and can be used to carry out the deployment */
    public boolean isValid() {
        return deployment.isPresent();
    }

    /** Prepare this deployment. Returns whether prepare was successful */
    public boolean prepare() {
        return doStep(() -> {
            deployment.get().prepare();
            return 0L; // Dummy value: only presence of a result matters for prepare
        }).isPresent();
    }

    /**
     * Attempts to activate this deployment
     *
     * @return the application config generation resulting from this deployment, or empty if it was not successful
     */
    public Optional<Long> activate() {
        return doStep(() -> deployment.get().activate());
    }

    /**
     * Runs a deployment step, translating failures into metrics, log messages and an empty result.
     *
     * @param step the step to run
     * @return the value produced by the step, or empty if this is not valid or the step failed
     * @throws IllegalStateException if this is already closed
     * @throws RuntimeException the non-transient failure of the step, if this was created with throwOnFailure
     */
    private Optional<Long> doStep(Supplier<Long> step) {
        if (closed) throw new IllegalStateException("'" + this + "' is closed");
        if ( ! isValid()) return Optional.empty();
        try {
            return Optional.of(step.get());
        } catch (TransientException | ActivationConflictException e) {
            metric.add(ConfigServerMetrics.MAINTENANCE_DEPLOYMENT_TRANSIENT_FAILURE.baseName(), 1, metric.createContext(Map.of()));
            log.log(Level.INFO, "Failed to maintenance deploy " + application + " with a transient error: " +
                                   Exceptions.toMessageString(e));
            return Optional.empty();
        } catch (RuntimeException e) {
            metric.add(ConfigServerMetrics.MAINTENANCE_DEPLOYMENT_FAILURE.baseName(), 1, metric.createContext(Map.of()));
            if (throwOnFailure) {
                throw e;
            }
            log.log(Level.WARNING, "Exception on maintenance deploy of " + application, e);
            return Optional.empty();
        }
    }

    /** Returns the maintenance lock of the given application, or empty if it could not be taken within the timeout */
    private Optional<Mutex> tryLock(ApplicationId application, NodeRepository nodeRepository) {
        try {
            return Optional.of(nodeRepository.applications().lockMaintenance(application));
        } catch (UncheckedTimeoutException e) {
            log.log(Level.INFO, () -> "Could not lock " + application + " for maintenance deployment within timeout");
            return Optional.empty();
        }
    }

    /**
     * Returns a deployment of the given application from the deployer, or empty if the lock is not held,
     * the application owns no active nodes, or the deployer returns no deployment.
     */
    private Optional<Deployment> tryDeployment(Optional<Mutex> lock,
                                               ApplicationId application,
                                               Deployer deployer,
                                               NodeRepository nodeRepository) {
        if (lock.isEmpty()) return Optional.empty();
        if (nodeRepository.nodes().list(Node.State.active).owner(application).isEmpty()) return Optional.empty();
        return deployer.deployFromLocalActive(application);
    }

    /** Releases the maintenance lock, if held. Calling this more than once has no further effect. */
    @Override
    public void close() {
        if (closed) return; // Closeable.close must be idempotent: do not close the lock twice
        lock.ifPresent(Mutex::close);
        closed = true;
    }

    @Override
    public String toString() {
        return "deployment of " + application;
    }

    /** A move of a node from one host to another, carried out by redeploying the application owning the node */
    public static class Move {

        private final Node node;
        private final Node fromHost, toHost;

        Move(Node node, Node fromHost, Node toHost) {
            this.node = node;
            this.fromHost = fromHost;
            this.toHost = toHost;
        }

        public Node node() { return node; }
        public Node fromHost() { return fromHost; }
        public Node toHost() { return toHost; }

        /**
         * Try to deploy to make this move.
         *
         * @param verifyTarget true to only make this move if the node ends up at the expected target host,
         *                     false if we should perform it as long as it moves from the source host
         * @param agent the agent recorded on the node changes made by this
         * @param deployer the deployer used to redeploy the application owning the node
         * @param metric the metric receiver that deployment failures are counted on
         * @param nodeRepository the node repository holding the node
         * @return true if the move was done, false if it couldn't be
         */
        public boolean execute(boolean verifyTarget,
                               Agent agent, Deployer deployer, Metric metric, NodeRepository nodeRepository) {
            if (isEmpty()) return false;
            ApplicationId application = node.allocation().get().owner();
            try (MaintenanceDeployment deployment = new MaintenanceDeployment(application, deployer, metric, nodeRepository)) {
                if ( ! deployment.isValid()) return false;

                boolean couldMarkRetiredNow = markPreferToRetire(node, true, agent, nodeRepository);
                if ( ! couldMarkRetiredNow) return false;

                Optional<Node> expectedNewNode = Optional.empty();
                try {
                    if ( ! deployment.prepare()) return false;
                    if (verifyTarget) {
                        expectedNewNode =
                                nodeRepository.nodes().list(Node.State.reserved)
                                              .owner(application)
                                              .cluster(node.allocation().get().membership().cluster().id())
                                              .except(node)
                                              .first();
                        if (expectedNewNode.isEmpty()) return false;
                        if ( ! expectedNewNode.get().hasParent(toHost.hostname())) return false;
                    }
                    if (deployment.activate().isEmpty()) return false;

                    log.info(agent + " redeployed " + application + " to " +
                             ( verifyTarget ? this : "move " + (node + " from " + fromHost.hostname())));
                    return true;
                }
                finally {
                    markPreferToRetire(node, false, agent, nodeRepository); // Necessary if this failed, no-op otherwise

                    // Immediately clean up if we reserved the node but could not activate or reserved a node on the wrong host
                    expectedNewNode.flatMap(expected -> nodeRepository.nodes().node(expected.hostname()))
                                   .filter(current -> current.state() == Node.State.reserved)
                                   .ifPresent(reserved -> nodeRepository.nodes().deallocate(reserved, agent, "Expired by " + agent));
                }
            }
        }

        /**
         * Sets the preferToRetire flag of the given node, under the node lock.
         * Nothing is written if the node cannot be locked, is not active, already has the wanted flag value,
         * or is retired while the flag is being cleared.
         *
         * @return true only if this operation changes the state of the preferToRetire flag
         */
        private boolean markPreferToRetire(Node node, boolean preferToRetire, Agent agent, NodeRepository nodeRepository) {
            Optional<NodeMutex> nodeMutex = nodeRepository.nodes().lockAndGet(node);
            if (nodeMutex.isEmpty()) return false;

            try (var nodeLock = nodeMutex.get()) {
                if (nodeLock.node().state() != Node.State.active) return false;

                if (nodeLock.node().status().preferToRetire() == preferToRetire) return false;

                // Node is retiring, keep preferToRetire
                if (nodeLock.node().allocation().get().membership().retired() && !preferToRetire) return false;

                nodeRepository.nodes().write(nodeLock.node().withPreferToRetire(preferToRetire, agent, nodeRepository.clock().instant()), nodeLock);
                return true;
            }
        }

        /** Returns whether this is the empty move, i.e. has no node to move */
        public boolean isEmpty() { return node == null; }

        @Override
        public int hashCode() {
            return Objects.hash(node, fromHost, toHost);
        }

        @Override
        public boolean equals(Object o) {
            if (o == this) return true;
            if (o == null || o.getClass() != this.getClass()) return false;

            Move other = (Move)o;
            if ( ! Objects.equals(other.node, this.node)) return false;
            if ( ! Objects.equals(other.fromHost, this.fromHost)) return false;
            if ( ! Objects.equals(other.toHost, this.toHost)) return false;
            return true;
        }

        @Override
        public String toString() {
            return "move " +
                   ( isEmpty() ? "none" : (node + " from " + fromHost.hostname() + " to " + toHost.hostname()));
        }

        /** Returns the empty move, which has no node or hosts and is never executed */
        public static Move empty() { return new Move(null, null, null); }

    }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy