com.yahoo.vespa.hosted.controller.deployment.DeploymentTrigger Maven / Gradle / Ivy
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.controller.deployment;
import com.yahoo.config.application.api.DeploymentInstanceSpec;
import com.yahoo.config.application.api.DeploymentSpec;
import com.yahoo.config.provision.ApplicationId;
import com.yahoo.config.provision.InstanceName;
import com.yahoo.text.Text;
import com.yahoo.vespa.curator.Lock;
import com.yahoo.vespa.hosted.controller.Application;
import com.yahoo.vespa.hosted.controller.ApplicationController;
import com.yahoo.vespa.hosted.controller.Controller;
import com.yahoo.vespa.hosted.controller.Instance;
import com.yahoo.vespa.hosted.controller.api.identifiers.DeploymentId;
import com.yahoo.vespa.hosted.controller.api.integration.deployment.ApplicationVersion;
import com.yahoo.vespa.hosted.controller.api.integration.deployment.JobId;
import com.yahoo.vespa.hosted.controller.api.integration.deployment.JobType;
import com.yahoo.vespa.hosted.controller.application.ApplicationList;
import com.yahoo.vespa.hosted.controller.application.Change;
import com.yahoo.vespa.hosted.controller.application.Deployment;
import com.yahoo.vespa.hosted.controller.application.TenantAndApplicationId;
import java.time.Clock;
import java.time.Duration;
import java.time.Instant;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.OptionalLong;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.stream.Collectors;
import static java.util.Comparator.comparing;
import static java.util.stream.Collectors.groupingBy;
import static java.util.stream.Collectors.toList;
/**
* Responsible for scheduling deployment jobs in a build system and keeping
* {@link Instance#change()} in sync with what is scheduled.
*
* This class is multi-thread safe.
*
* @author bratseth
* @author mpolden
* @author jonmv
*/
public class DeploymentTrigger {
public static final Duration maxPause = Duration.ofDays(3);
private final static Logger log = Logger.getLogger(DeploymentTrigger.class.getName());
private final Controller controller;
private final Clock clock;
private final JobController jobs;
public DeploymentTrigger(Controller controller, Clock clock) {
this.controller = Objects.requireNonNull(controller, "controller cannot be null");
this.clock = Objects.requireNonNull(clock, "clock cannot be null");
this.jobs = controller.jobController();
}
public DeploymentSteps steps(DeploymentInstanceSpec spec) {
return new DeploymentSteps(spec, controller::system);
}
public void notifyOfSubmission(TenantAndApplicationId id, ApplicationVersion version, long projectId) {
if (applications().getApplication(id).isEmpty()) {
log.log(Level.WARNING, "Ignoring submission from project '" + projectId +
"': Unknown application '" + id + "'");
return;
}
applications().lockApplicationOrThrow(id, application -> {
application = application.withProjectId(OptionalLong.of(projectId));
application = application.withNewSubmission(version);
applications().store(application);
});
triggerNewRevision(id);
}
/**
* Propagates the latest revision to ready instances.
* Ready instances are those whose dependencies are complete, and which aren't blocked, and, additionally,
* which aren't upgrading, or are already deploying an application change, or failing upgrade.
*/
public void triggerNewRevision(TenantAndApplicationId id) {
applications().lockApplicationIfPresent(id, application -> {
DeploymentStatus status = jobs.deploymentStatus(application.get());
for (InstanceName instanceName : application.get().deploymentSpec().instanceNames()) {
Change outstanding = status.outstandingChange(instanceName);
if ( outstanding.hasTargets()
&& status.instanceSteps().get(instanceName)
.readyAt(outstanding)
.map(readyAt -> ! readyAt.isAfter(clock.instant())).orElse(false)
&& acceptNewApplicationVersion(status, instanceName)) {
application = application.with(instanceName,
instance -> {
instance = instance.withChange(instance.change().with(outstanding.application().get()));
return instance.withChange(remainingChange(instance, status));
});
}
}
applications().store(application);
});
}
/**
* Records information when a job completes (successfully or not). This information is used when deciding what to
* trigger next.
*/
public void notifyOfCompletion(ApplicationId id) {
if (applications().getInstance(id).isEmpty()) {
log.log(Level.WARNING, "Ignoring completion of job of unknown application '" + id + "'");
return;
}
applications().lockApplicationOrThrow(TenantAndApplicationId.from(id), application ->
applications().store(application.with(id.instance(),
instance -> instance.withChange(remainingChange(instance, jobs.deploymentStatus(application.get()))))));
}
/**
* Finds and triggers jobs that can and should run but are currently not, and returns the number of triggered jobs.
*
* Only one job per type is triggered each run for test jobs, since their environments have limited capacity.
*/
public long triggerReadyJobs() {
List readyJobs = computeReadyJobs();
var prodJobs = new ArrayList();
var testJobs = new ArrayList();
for (Job job : readyJobs) {
if (job.jobType.isTest()) testJobs.add(job);
else prodJobs.add(job);
}
// Flat list of prod jobs, grouped by application id, retaining the step order
List sortedProdJobs = prodJobs.stream()
.collect(groupingBy(Job::applicationId))
.values().stream()
.flatMap(List::stream)
.collect(Collectors.toUnmodifiableList());
// Map of test jobs, a list for each job type. Jobs in each list are sorted by priority.
Map> sortedTestJobsByType = testJobs.stream()
.sorted(comparing(Job::isRetry)
.thenComparing(Job::applicationUpgrade)
.reversed()
.thenComparing(Job::availableSince))
.collect(groupingBy(Job::jobType));
// Trigger all prod jobs
sortedProdJobs.forEach(this::trigger);
long triggeredJobs = sortedProdJobs.size();
// Trigger max one test job per type
for (var jobs : sortedTestJobsByType.values()) {
if (jobs.size() > 0) {
trigger(jobs.get(0));
triggeredJobs++;
}
}
return triggeredJobs;
}
/** Attempts to trigger the given job. */
public void trigger(Job job) {
log.log(Level.FINE, () -> "Triggering " + job);
applications().lockApplicationOrThrow(TenantAndApplicationId.from(job.applicationId()), application -> {
jobs.start(job.applicationId(), job.jobType, job.versions);
applications().store(application.with(job.applicationId().instance(), instance ->
instance.withJobPause(job.jobType, OptionalLong.empty())));
});
}
/** Force triggering of a job for given instance, with same versions as last run. */
public JobId reTrigger(ApplicationId applicationId, JobType jobType) {
Application application = applications().requireApplication(TenantAndApplicationId.from(applicationId));
Instance instance = application.require(applicationId.instance());
JobId job = new JobId(instance.id(), jobType);
JobStatus jobStatus = jobs.jobStatus(new JobId(applicationId, jobType));
Versions versions = jobStatus.lastTriggered()
.orElseThrow(() -> new IllegalArgumentException(job + " has never been triggered"))
.versions();
trigger(deploymentJob(instance, versions, jobType, jobStatus, clock.instant()));
return job;
}
/** Force triggering of a job for given instance. */
public List forceTrigger(ApplicationId applicationId, JobType jobType, String user, boolean requireTests) {
Application application = applications().requireApplication(TenantAndApplicationId.from(applicationId));
Instance instance = application.require(applicationId.instance());
JobId job = new JobId(instance.id(), jobType);
if (job.type().environment().isManuallyDeployed())
return forceTriggerManualJob(job);
DeploymentStatus status = jobs.deploymentStatus(application);
Versions versions = Versions.from(instance.change(), application, status.deploymentFor(job), controller.readSystemVersion());
Map> jobs = status.testJobs(Map.of(job, versions));
if (jobs.isEmpty() || ! requireTests)
jobs = Map.of(job, List.of(versions));
jobs.forEach((jobId, versionsList) -> {
trigger(deploymentJob(instance, versionsList.get(0), jobId.type(), status.jobs().get(jobId).get(), clock.instant()));
});
return List.copyOf(jobs.keySet());
}
private List forceTriggerManualJob(JobId job) {
Run last = jobs.last(job).orElseThrow(() -> new IllegalArgumentException(job + " has never been run"));
Versions target = new Versions(controller.readSystemVersion(),
last.versions().targetApplication(),
Optional.of(last.versions().targetPlatform()),
Optional.of(last.versions().targetApplication()));
jobs.start(job.application(), job.type(), target, true);
return List.of(job);
}
/** Retrigger job. If the job is already running, it will be canceled, and retrigger enqueued. */
public Optional reTriggerOrAddToQueue(DeploymentId deployment) {
JobType jobType = JobType.from(controller.system(), deployment.zoneId())
.orElseThrow(() -> new IllegalArgumentException(Text.format("No job to trigger for (system/zone): %s/%s", controller.system().value(), deployment.zoneId().value())));
Optional existingRun = controller.jobController().active(deployment.applicationId()).stream()
.filter(run -> run.id().type().equals(jobType))
.findFirst();
if (existingRun.isPresent()) {
Run run = existingRun.get();
try (Lock lock = controller.curator().lockDeploymentRetriggerQueue()) {
List retriggerEntries = controller.curator().readRetriggerEntries();
List newList = new ArrayList<>(retriggerEntries);
RetriggerEntry requiredEntry = new RetriggerEntry(new JobId(deployment.applicationId(), jobType), run.id().number() + 1);
if(newList.stream().noneMatch(entry -> entry.jobId().equals(requiredEntry.jobId()) && entry.requiredRun()>=requiredEntry.requiredRun())) {
newList.add(requiredEntry);
}
newList = newList.stream()
.filter(entry -> !(entry.jobId().equals(requiredEntry.jobId()) && entry.requiredRun() < requiredEntry.requiredRun()))
.collect(toList());
controller.curator().writeRetriggerEntries(newList);
}
controller.jobController().abort(run.id());
return Optional.empty();
} else {
return Optional.of(reTrigger(deployment.applicationId(), jobType));
}
}
/** Prevents jobs of the given type from starting, until the given time. */
public void pauseJob(ApplicationId id, JobType jobType, Instant until) {
if (until.isAfter(clock.instant().plus(maxPause)))
throw new IllegalArgumentException("Pause only allowed for up to " + maxPause);
applications().lockApplicationOrThrow(TenantAndApplicationId.from(id), application ->
applications().store(application.with(id.instance(),
instance -> instance.withJobPause(jobType, OptionalLong.of(until.toEpochMilli())))));
}
/** Resumes a previously paused job, letting it be triggered normally. */
public void resumeJob(ApplicationId id, JobType jobType) {
applications().lockApplicationOrThrow(TenantAndApplicationId.from(id), application ->
applications().store(application.with(id.instance(),
instance -> instance.withJobPause(jobType, OptionalLong.empty()))));
}
/** Triggers a change of this application, unless it already has a change. */
public void triggerChange(ApplicationId instanceId, Change change) {
applications().lockApplicationOrThrow(TenantAndApplicationId.from(instanceId), application -> {
if ( ! application.get().require(instanceId.instance()).change().hasTargets())
forceChange(instanceId, change);
});
}
/** Overrides the given instance's platform and application changes with any contained in the given change. */
public void forceChange(ApplicationId instanceId, Change change) {
applications().lockApplicationOrThrow(TenantAndApplicationId.from(instanceId), application -> {
Change newChange = change.onTopOf(application.get().require(instanceId.instance()).change());
application = application.with(instanceId.instance(),
instance -> instance.withChange(newChange));
DeploymentStatus newStatus = jobs.deploymentStatus(application.get());
application = application.with(instanceId.instance(),
instance -> instance.withChange(remainingChange(instance, newStatus)));
applications().store(application);
});
}
/** Cancels the indicated part of the given application's change. */
public void cancelChange(ApplicationId instanceId, ChangesToCancel cancellation) {
applications().lockApplicationOrThrow(TenantAndApplicationId.from(instanceId), application -> {
Change change;
switch (cancellation) {
case ALL: change = Change.empty(); break;
case VERSIONS: change = Change.empty().withPin(); break;
case PLATFORM: change = application.get().require(instanceId.instance()).change().withoutPlatform(); break;
case APPLICATION: change = application.get().require(instanceId.instance()).change().withoutApplication(); break;
case PIN: change = application.get().require(instanceId.instance()).change().withoutPin(); break;
default: throw new IllegalArgumentException("Unknown cancellation choice '" + cancellation + "'!");
}
applications().store(application.with(instanceId.instance(),
instance -> instance.withChange(change)));
});
}
public enum ChangesToCancel { ALL, PLATFORM, APPLICATION, VERSIONS, PIN }
// ---------- Conveniences ----------
private ApplicationController applications() {
return controller.applications();
}
// ---------- Ready job computation ----------
/** Returns the set of all jobs which have changes to propagate from the upstream steps. */
private List computeReadyJobs() {
return jobs.deploymentStatuses(ApplicationList.from(applications().readable())
.withProjectId() // Need to keep this, as we have applications with deployment spec that shouldn't be orchestrated.
.withDeploymentSpec())
.withChanges()
.asList().stream()
.map(this::computeReadyJobs)
.flatMap(Collection::stream)
.collect(toList());
}
/** Finds the next step to trigger for the given application, if any, and returns these as a list. */
private List computeReadyJobs(DeploymentStatus status) {
List jobs = new ArrayList<>();
status.jobsToRun().forEach((job, versionsList) -> {
for (Versions versions : versionsList)
status.jobSteps().get(job).readyAt(status.application().require(job.application().instance()).change())
.filter(readyAt -> ! clock.instant().isBefore(readyAt))
.filter(__ -> ! (job.type().isProduction() && isUnhealthyInAnotherZone(status.application(), job)))
.filter(__ -> abortIfRunning(versionsList, status.jobs().get(job).get())) // Abort and trigger this later if running with outdated parameters.
.ifPresent(readyAt -> {
jobs.add(deploymentJob(status.application().require(job.application().instance()),
versions,
job.type(),
status.instanceJobs(job.application().instance()).get(job.type()),
readyAt));
});
});
return Collections.unmodifiableList(jobs);
}
/** Returns whether the application is healthy in all other production zones. */
private boolean isUnhealthyInAnotherZone(Application application, JobId job) {
for (Deployment deployment : application.require(job.application().instance()).productionDeployments().values()) {
if ( ! deployment.zone().equals(job.type().zone(controller.system()))
&& ! controller.applications().isHealthy(new DeploymentId(job.application(), deployment.zone())))
return true;
}
return false;
}
/** Returns whether the job is not running, and also aborts it if it's running with outdated versions. */
private boolean abortIfRunning(List versionsList, JobStatus status) {
if ( ! status.isRunning())
return true;
Run last = status.lastTriggered().get();
if (versionsList.stream().noneMatch(versions -> versions.targetsMatch(last.versions())
&& versions.sourcesMatchIfPresent(last.versions())))
controller.jobController().abort(last.id());
return false;
}
// ---------- Change management o_O ----------
private boolean acceptNewApplicationVersion(DeploymentStatus status, InstanceName instance) {
if (status.application().require(instance).change().application().isPresent()) return true; // Replacing a previous application change is ok.
if (status.hasFailures()) return true; // Allow changes to fix upgrade problems.
if (status.application().deploymentSpec().instance(instance) // Leading upgrade allows app change to join in.
.map(spec -> spec.upgradeRollout() == DeploymentSpec.UpgradeRollout.leading).orElse(false)) return true;
return status.application().require(instance).change().platform().isEmpty();
}
private Change remainingChange(Instance instance, DeploymentStatus status) {
Change change = instance.change();
if (status.jobsToRun(Map.of(instance.name(), instance.change().withoutApplication())).isEmpty())
change = change.withoutPlatform();
if (status.jobsToRun(Map.of(instance.name(), instance.change().withoutPlatform())).isEmpty())
change = change.withoutApplication();
return change;
}
// ---------- Version and job helpers ----------
private Job deploymentJob(Instance instance, Versions versions, JobType jobType, JobStatus jobStatus, Instant availableSince) {
return new Job(instance, versions, jobType, availableSince, jobStatus.isOutOfCapacity(), instance.change().application().isPresent());
}
// ---------- Data containers ----------
private static class Job {
private final ApplicationId instanceId;
private final JobType jobType;
private final Versions versions;
private final Instant availableSince;
private final boolean isRetry;
private final boolean isApplicationUpgrade;
private Job(Instance instance, Versions versions, JobType jobType, Instant availableSince,
boolean isRetry, boolean isApplicationUpgrade) {
this.instanceId = instance.id();
this.jobType = jobType;
this.versions = versions;
this.availableSince = availableSince;
this.isRetry = isRetry;
this.isApplicationUpgrade = isApplicationUpgrade;
}
ApplicationId applicationId() { return instanceId; }
JobType jobType() { return jobType; }
Instant availableSince() { return availableSince; } // TODO jvenstad: This is 95% broken now. Change.at() can restore it.
boolean isRetry() { return isRetry; }
boolean applicationUpgrade() { return isApplicationUpgrade; }
@Override
public String toString() {
return jobType + " for " + instanceId +
" on (" + versions.targetPlatform() + versions.sourcePlatform().map(version -> " <-- " + version).orElse("") +
", " + versions.targetApplication().id() + versions.sourceApplication().map(version -> " <-- " + version.id()).orElse("") +
"), ready since " + availableSince;
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy