// com.yahoo.vespa.hosted.provision.maintenance.MaintenanceDeployment Maven / Gradle / Ivy
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.provision.maintenance;
import ai.vespa.metrics.ConfigServerMetrics;
import com.yahoo.concurrent.UncheckedTimeoutException;
import com.yahoo.config.provision.ApplicationId;
import com.yahoo.config.provision.Deployer;
import com.yahoo.config.provision.Deployment;
import com.yahoo.config.provision.TransientException;
import com.yahoo.config.provision.exception.ActivationConflictException;
import com.yahoo.jdisc.Metric;
import com.yahoo.transaction.Mutex;
import com.yahoo.vespa.hosted.provision.Node;
import com.yahoo.vespa.hosted.provision.NodeMutex;
import com.yahoo.vespa.hosted.provision.NodeRepository;
import com.yahoo.vespa.hosted.provision.node.Agent;
import com.yahoo.yolean.Exceptions;
import java.io.Closeable;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.function.Supplier;
import java.util.logging.Level;
import java.util.logging.Logger;
/**
 * A wrapper of a deployment suitable for maintenance.
 * This is a single-use, single-thread object.
 *
 * <p>Construction tries to take the maintenance lock of the application and, if that succeeds and the
 * application owns at least one active node, asks the deployer for a deployment. If no deployment was
 * obtained this is not {@link #isValid() valid}, and {@link #prepare()} and {@link #activate()} do nothing.
 * A lock that was acquired is held until {@link #close()} is called.
 *
 * @author bratseth
 */
class MaintenanceDeployment implements Closeable {

    private static final Logger log = Logger.getLogger(MaintenanceDeployment.class.getName());

    private final ApplicationId application;
    private final Metric metric;
    /** The maintenance lock of the application, or empty if it could not be acquired */
    private final Optional<Mutex> lock;
    /** The deployment to carry out, or empty if this is not valid */
    private final Optional<Deployment> deployment;
    /** Whether non-transient runtime exceptions from a step are rethrown rather than just logged */
    private final boolean throwOnFailure;
    private boolean closed = false;

    /** Creates a maintenance deployment which logs, rather than throws, on non-transient failures. */
    public MaintenanceDeployment(ApplicationId application,
                                 Deployer deployer,
                                 Metric metric,
                                 NodeRepository nodeRepository) {
        this(application, deployer, metric, nodeRepository, false);
    }

    /**
     * Creates a maintenance deployment.
     *
     * @param application the application to deploy
     * @param deployer the deployer used to create the deployment
     * @param metric where deployment failures are counted
     * @param nodeRepository used to lock the application and to look up its active nodes
     * @param throwOnFailure true to rethrow non-transient runtime exceptions thrown by prepare/activate,
     *                       false to log them and report the step as unsuccessful
     */
    public MaintenanceDeployment(ApplicationId application,
                                 Deployer deployer,
                                 Metric metric,
                                 NodeRepository nodeRepository,
                                 boolean throwOnFailure) {
        this.application = application;
        this.metric = metric;
        this.throwOnFailure = throwOnFailure;
        Optional<Mutex> lock = tryLock(application, nodeRepository);
        try {
            deployment = tryDeployment(lock, application, deployer, nodeRepository);
            this.lock = lock;
            // Ownership is now with this.lock (released by close()): leave nothing for the finally block
            lock = Optional.empty();
        } finally {
            // Only non-empty if creating the deployment threw: release the lock so it does not leak
            lock.ifPresent(Mutex::close);
        }
    }

    /** Return whether this is - as yet - functional and can be used to carry out the deployment */
    public boolean isValid() {
        return deployment.isPresent();
    }

    /** Prepare this deployment. Returns whether prepare was successful */
    public boolean prepare() {
        return doStep(() -> {
            deployment.get().prepare();
            return 0L; // Dummy value: only presence of a result matters here
        }).isPresent();
    }

    /**
     * Attempts to activate this deployment
     *
     * @return the application config generation resulting from this deployment, or empty if it was not successful
     */
    public Optional<Long> activate() {
        return doStep(() -> deployment.get().activate());
    }

    /**
     * Runs a deployment step, translating failures into metrics, logging and an empty result.
     *
     * <p>Transient exceptions and activation conflicts are counted and logged at info level.
     * Other runtime exceptions are counted and then either rethrown (if {@code throwOnFailure})
     * or logged at warning level.
     *
     * @param step the step to run
     * @return the value returned by the step, or empty if this is not valid or the step failed
     * @throws IllegalStateException if this is closed
     */
    private Optional<Long> doStep(Supplier<Long> step) {
        if (closed) throw new IllegalStateException("'" + this + "' is closed");
        if ( ! isValid()) return Optional.empty();
        try {
            return Optional.of(step.get());
        } catch (TransientException | ActivationConflictException e) {
            metric.add(ConfigServerMetrics.MAINTENANCE_DEPLOYMENT_TRANSIENT_FAILURE.baseName(), 1, metric.createContext(Map.of()));
            log.log(Level.INFO, "Failed to maintenance deploy " + application + " with a transient error: " +
                                Exceptions.toMessageString(e));
            return Optional.empty();
        } catch (RuntimeException e) {
            metric.add(ConfigServerMetrics.MAINTENANCE_DEPLOYMENT_FAILURE.baseName(), 1, metric.createContext(Map.of()));
            if (throwOnFailure) {
                throw e;
            }
            log.log(Level.WARNING, "Exception on maintenance deploy of " + application, e);
            return Optional.empty();
        }
    }

    /** Returns the maintenance lock of the given application, or empty if it could not be taken within the timeout */
    private Optional<Mutex> tryLock(ApplicationId application, NodeRepository nodeRepository) {
        try {
            return Optional.of(nodeRepository.applications().lockMaintenance(application));
        } catch (UncheckedTimeoutException e) {
            log.log(Level.INFO, () -> "Could not lock " + application + " for maintenance deployment within timeout");
            return Optional.empty();
        }
    }

    /**
     * Returns a deployment of the given application, or empty if the lock is not held,
     * the application owns no active nodes, or the deployer returns no deployment.
     */
    private Optional<Deployment> tryDeployment(Optional<Mutex> lock,
                                               ApplicationId application,
                                               Deployer deployer,
                                               NodeRepository nodeRepository) {
        if (lock.isEmpty()) return Optional.empty();
        if (nodeRepository.nodes().list(Node.State.active).owner(application).isEmpty()) return Optional.empty();
        return deployer.deployFromLocalActive(application);
    }

    /**
     * Releases the maintenance lock, if held, and marks this as closed.
     * Calling this more than once has no further effect.
     */
    @Override
    public void close() {
        if (closed) return; // Closeable.close() must be idempotent: never release the lock twice
        closed = true;
        lock.ifPresent(Mutex::close);
    }

    @Override
    public String toString() {
        return "deployment of " + application;
    }

    /** A move of a node from one host to another, or the empty move if {@link #isEmpty()} */
    public static class Move {

        private final Node node;
        private final Node fromHost, toHost;

        Move(Node node, Node fromHost, Node toHost) {
            this.node = node;
            this.fromHost = fromHost;
            this.toHost = toHost;
        }

        public Node node() { return node; }
        public Node fromHost() { return fromHost; }
        public Node toHost() { return toHost; }

        /**
         * Try to deploy to make this move.
         *
         * @param verifyTarget true to only make this move if the node ends up at the expected target host,
         *                     false if we should perform it as long as it moves from the source host
         * @param agent the agent recorded on the node changes made by this
         * @param deployer the deployer used to redeploy the application owning the node
         * @param metric where deployment failures are counted
         * @param nodeRepository the node repository holding the node
         * @return true if the move was done, false if it couldn't be
         */
        public boolean execute(boolean verifyTarget,
                               Agent agent, Deployer deployer, Metric metric, NodeRepository nodeRepository) {
            if (isEmpty()) return false;
            ApplicationId application = node.allocation().get().owner();
            try (MaintenanceDeployment deployment = new MaintenanceDeployment(application, deployer, metric, nodeRepository)) {
                if ( ! deployment.isValid()) return false;

                boolean couldMarkRetiredNow = markPreferToRetire(node, true, agent, nodeRepository);
                if ( ! couldMarkRetiredNow) return false;

                Optional<Node> expectedNewNode = Optional.empty();
                try {
                    if ( ! deployment.prepare()) return false;
                    if (verifyTarget) {
                        // The replacement is the reserved node in the same cluster which is not the node being moved
                        expectedNewNode =
                                nodeRepository.nodes().list(Node.State.reserved)
                                              .owner(application)
                                              .cluster(node.allocation().get().membership().cluster().id())
                                              .except(node)
                                              .first();
                        if (expectedNewNode.isEmpty()) return false;
                        if ( ! expectedNewNode.get().hasParent(toHost.hostname())) return false;
                    }
                    if (deployment.activate().isEmpty()) return false;

                    log.info(agent + " redeployed " + application + " to " +
                             ( verifyTarget ? this : "move " + (node + " from " + fromHost.hostname())));
                    return true;
                }
                finally {
                    markPreferToRetire(node, false, agent, nodeRepository); // Necessary if this failed, no-op otherwise

                    // Immediately clean up if we reserved the node but could not activate or reserved a node on the wrong host
                    expectedNewNode.flatMap(expected -> nodeRepository.nodes().node(expected.hostname()))
                                   .filter(current -> current.state() == Node.State.reserved)
                                   .ifPresent(current -> nodeRepository.nodes().deallocate(current, agent, "Expired by " + agent));
                }
            }
        }

        /** Returns true only if this operation changes the state of the preferToRetire flag */
        private boolean markPreferToRetire(Node node, boolean preferToRetire, Agent agent, NodeRepository nodeRepository) {
            Optional<NodeMutex> nodeMutex = nodeRepository.nodes().lockAndGet(node);
            if (nodeMutex.isEmpty()) return false;

            try (var nodeLock = nodeMutex.get()) {
                if (nodeLock.node().state() != Node.State.active) return false;

                if (nodeLock.node().status().preferToRetire() == preferToRetire) return false;

                // Node is retiring, keep preferToRetire
                if (nodeLock.node().allocation().get().membership().retired() && !preferToRetire) return false;

                nodeRepository.nodes().write(nodeLock.node().withPreferToRetire(preferToRetire, agent, nodeRepository.clock().instant()), nodeLock);
                return true;
            }
        }

        /** Returns whether this is the empty move, i.e. has no node */
        public boolean isEmpty() { return node == null; }

        @Override
        public int hashCode() {
            return Objects.hash(node, fromHost, toHost);
        }

        @Override
        public boolean equals(Object o) {
            if (o == this) return true;
            if (o == null || o.getClass() != this.getClass()) return false;

            Move other = (Move)o;
            if ( ! Objects.equals(other.node, this.node)) return false;
            if ( ! Objects.equals(other.fromHost, this.fromHost)) return false;
            if ( ! Objects.equals(other.toHost, this.toHost)) return false;
            return true;
        }

        @Override
        public String toString() {
            return "move " +
                   ( isEmpty() ? "none" : (node + " from " + fromHost.hostname() + " to " + toHost.hostname()));
        }

        /** Returns the empty move */
        public static Move empty() { return new Move(null, null, null); }

    }

}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy