com.yahoo.concurrent.maintenance.Maintainer Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of vespajlib Show documentation
Show all versions of vespajlib Show documentation
Library for use in Java components of Vespa. Shared code which does
not fit anywhere else.
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.concurrent.maintenance;
import com.yahoo.concurrent.UncheckedTimeoutException;
import com.yahoo.net.HostName;
import java.math.BigDecimal;
import java.math.RoundingMode;
import java.time.Clock;
import java.time.Duration;
import java.time.Instant;
import java.util.List;
import java.util.Objects;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ScheduledThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.logging.Level;
import java.util.logging.Logger;
/**
 * The base class for maintainers. A maintainer is some job which runs at a fixed rate to perform maintenance tasks.
 *
 * <p>Each instance owns a single-threaded scheduled executor which invokes {@link #run()} every
 * {@link #interval()}, after an initial delay staggered across the hosts of the cluster.
 * Subclasses implement {@link #maintain()}; locking, failure handling and metrics reporting are handled by
 * {@link #doMaintain(boolean)}.</p>
 *
 * @author bratseth
 * @author mpolden
 * @author jonmv
 */
public abstract class Maintainer implements Runnable {

    protected final Logger log = Logger.getLogger(this.getClass().getName());

    private final String name;
    private final JobControl jobControl;
    private final JobMetrics jobMetrics;
    private final Duration interval;
    private final ScheduledExecutorService service;
    private final AtomicBoolean shutDown = new AtomicBoolean();
    private final boolean ignoreCollision;
    private final Clock clock;
    private final double successFactorBaseline;
    private final boolean acquireLock;

    /**
     * Creates a maintainer, schedules it on its own worker thread and registers it with the given job control.
     *
     * @param name                  the name of this job, or null to use the simple class name, see {@link #name()}
     * @param interval              the interval between runs; must be non-null and strictly positive
     * @param clock                 the clock used to time runs and to compute the staggered initial delay
     * @param jobControl            decides whether the job is active, and provides the job lock; must be non-null
     * @param jobMetrics            receives the outcome and duration of each run; must be non-null
     * @param clusterHostnames      the hostnames of the cluster, used to stagger the first run; must be non-null
     * @param ignoreCollision       whether timing out on the job lock should be reported as a complete success
     *                              rather than logged as a warning
     * @param successFactorBaseline the success factor which {@link #asSuccessFactorDeviation(int, int)}
     *                              measures against
     * @param acquireLock           whether {@link #maintain()} should run while holding the job lock
     * @throws IllegalArgumentException if the interval is not positive, or if ignoreCollision is set
     *                                  without acquireLock
     * @throws NullPointerException     if interval, jobControl, jobMetrics or clusterHostnames is null
     */
    public Maintainer(String name, Duration interval, Clock clock, JobControl jobControl,
                      JobMetrics jobMetrics, List<String> clusterHostnames, boolean ignoreCollision,
                      double successFactorBaseline, boolean acquireLock) {
        this.name = name;
        this.interval = requireInterval(interval);
        this.jobControl = Objects.requireNonNull(jobControl);
        this.jobMetrics = Objects.requireNonNull(jobMetrics);
        this.ignoreCollision = ignoreCollision;
        this.clock = clock;
        this.successFactorBaseline = successFactorBaseline;
        this.acquireLock = acquireLock;

        Duration initialDelay = staggeredDelay(interval, HostName.getLocalhost(), clusterHostnames)
                .plus(Duration.ofSeconds(30)); // Let the system stabilize before maintenance

        // Validate before starting the executor and registering the job, so that an invalid
        // argument combination does not leave a running worker thread and a registered job behind.
        if (ignoreCollision && !acquireLock) {
            throw new IllegalArgumentException("ignoreCollision=" + ignoreCollision + ", but collisions cannot happen when acquireLock=" + acquireLock);
        }

        service = new ScheduledThreadPoolExecutor(1, r -> new Thread(r, name() + "-worker"));
        service.scheduleAtFixedRate(this, initialDelay.toMillis(), interval.toMillis(), TimeUnit.MILLISECONDS);
        jobControl.started(name(), this);
    }

    /**
     * Creates a maintainer with a success factor baseline of 1.0, which acquires the job lock on each run.
     * See the full constructor for a description of the parameters.
     */
    public Maintainer(String name, Duration interval, Clock clock, JobControl jobControl,
                      JobMetrics jobMetrics, List<String> clusterHostnames, boolean ignoreCollision) {
        this(name, interval, clock, jobControl, jobMetrics, clusterHostnames, ignoreCollision, 1.0, true);
    }

    /** Runs this job once, unless job control reports it as inactive. Invoked by the scheduler. */
    @Override
    public void run() {
        doMaintain(false);
    }

    /** Starts shutdown of this, typically by shutting down executors. {@link #awaitShutdown()} waits for shutdown to complete. */
    public void shutdown() {
        // getAndSet makes sure the executor is only shut down by the first caller
        if ( ! shutDown.getAndSet(true)) {
            service.shutdown();
        }
    }

    /**
     * Waits for shutdown to complete, calling {@link #shutdown} if this hasn't been done already.
     * Logs a warning if the executor has not terminated within 30 seconds.
     *
     * @throws RuntimeException if interrupted while waiting; the interrupt status of the calling thread
     *                          is restored before throwing
     */
    public void awaitShutdown() {
        shutdown();
        var timeout = Duration.ofSeconds(30);
        try {
            if (!service.awaitTermination(timeout.toMillis(), TimeUnit.MILLISECONDS)) {
                log.log(Level.WARNING, "Maintainer " + name() + " failed to shutdown " +
                                       "within " + timeout);
            }
        } catch (InterruptedException e) {
            // Restore the interrupt status, which is cleared when InterruptedException is thrown,
            // so that code further up the stack can still observe the interruption.
            Thread.currentThread().interrupt();
            throw new RuntimeException(e);
        }
    }

    /** Returns whether this is being shut down */
    public final boolean shuttingDown() {
        return shutDown.get();
    }

    @Override
    public final String toString() { return name(); }

    /**
     * Called once each time this maintenance job should run.
     *
     * @return the degree to which the run successFactor deviated from the successFactorBaseline
     *         - a number between -1 (no success), to 0 (complete success) measured against the
     *         successFactorBaseline, or higher if the success factor is higher than the successFactorBaseline.
     *         The default successFactorBaseline is 1.0.
     *         If a maintainer is expected to fail sometimes, the successFactorBaseline should be set to a lower value.
     *
     *         Note that this indicates whether something is wrong, so e.g. if the call did nothing because it should do
     *         nothing, 0.0 should be returned.
     */
    protected abstract double maintain();

    /**
     * Convenience method to convert attempts and failures into a success factor deviation from the baseline.
     *
     * @param attempts the number of attempts made; zero attempts counts as a success factor of 1.0
     * @param failures the number of attempts which failed
     * @return the success factor minus the successFactorBaseline, rounded half-up to 5 decimals
     */
    protected final double asSuccessFactorDeviation(int attempts, int failures) {
        double factor = attempts == 0 ? 1.0 : 1 - (double) failures / attempts;
        return new BigDecimal(factor - successFactorBaseline).setScale(5, RoundingMode.HALF_UP).doubleValue();
    }

    /** Returns the interval at which this job is set to run */
    protected Duration interval() { return interval; }

    /**
     * Runs this job once, while holding the job lock if this maintainer is configured to acquire it,
     * and reports the outcome and duration to the job metrics.
     *
     * <p>Nothing thrown by {@link #maintain()} propagates out of this method: failures are logged and
     * reported as a success factor deviation of -1. A timeout when acquiring the job lock is reported as
     * a deviation of 0 if collisions are ignored, and otherwise logged and reported as -1.</p>
     *
     * @param force whether to run even if job control reports this job as inactive
     */
    public final void doMaintain(boolean force) {
        if (!force && !jobControl.isActive(name())) return;
        log.log(Level.FINE, () -> "Running " + this.getClass().getSimpleName());

        double successFactorDeviation = -1; // Reported as-is unless maintain() completes, or a collision is ignored
        long startTime = clock.millis();
        try {
            if (acquireLock) {
                try (var lock = jobControl.lockJob(name())) {
                    successFactorDeviation = maintain();
                } catch (UncheckedTimeoutException e) {
                    if (ignoreCollision)
                        successFactorDeviation = 0;
                    else
                        log.log(Level.WARNING, this + " collided with another run. Will retry in " + interval);
                }
            } else {
                successFactorDeviation = maintain();
            }
        } catch (Throwable e) {
            log.log(Level.WARNING, this + " failed. Will retry in " + interval, e);
        } finally {
            long endTime = clock.millis();
            jobMetrics.completed(name(), successFactorDeviation, endTime - startTime);
        }
        log.log(Level.FINE, () -> "Finished " + this.getClass().getSimpleName());
    }

    /** Returns the name given at construction, or the simple class name of this job if no name was given */
    public final String name() {
        return name == null ? this.getClass().getSimpleName() : name;
    }

    /**
     * Returns the initial delay of this, calculated from the index of this node's hostname in the cluster
     * and the hash of the class name of this maintainer.
     *
     * @param interval         the interval between runs
     * @param hostname         the hostname of this node
     * @param clusterHostnames the hostnames of the cluster; must be non-null
     * @return the full interval if the hostname is not among the cluster hostnames, otherwise the time from
     *         now until the next point in time which matches the combined node and maintainer offset within
     *         the interval; this is non-negative and less than the interval
     */
    Duration staggeredDelay(Duration interval, String hostname, List<String> clusterHostnames) {
        Objects.requireNonNull(clusterHostnames);
        if ( ! clusterHostnames.contains(hostname))
            return interval;

        Instant now = clock.instant();
        // Spread the nodes of the cluster evenly across the interval
        long nodeOffset = clusterHostnames.indexOf(hostname) * interval.toMillis() / clusterHostnames.size();
        // May be negative, since hashCode() may be; the floorMod below normalizes the sum
        long maintainerOffset = getClass().getName().hashCode() % interval.toMillis();
        long totalOffset = nodeOffset + maintainerOffset;
        return Duration.ofMillis(Math.floorMod(totalOffset - now.toEpochMilli(), interval.toMillis()));
    }

    /** Returns the given interval if it is non-null and strictly positive, and throws otherwise */
    private static Duration requireInterval(Duration interval) {
        Objects.requireNonNull(interval);
        if (interval.isNegative() || interval.isZero())
            throw new IllegalArgumentException("Interval must be positive, but was " + interval);
        return interval;
    }

}