
// com.yahoo.vespa.hosted.controller.maintenance.MetricsReporter Maven / Gradle / Ivy
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.controller.maintenance;
import ai.vespa.metrics.ConfigServerMetrics;
import ai.vespa.metrics.ControllerMetrics;
import com.yahoo.component.Version;
import com.yahoo.config.application.api.DeploymentInstanceSpec;
import com.yahoo.config.provision.ApplicationId;
import com.yahoo.config.provision.HostName;
import com.yahoo.config.provision.zone.ZoneId;
import com.yahoo.jdisc.Metric;
import com.yahoo.vespa.athenz.client.zms.ZmsClient;
import com.yahoo.vespa.hosted.controller.Application;
import com.yahoo.vespa.hosted.controller.Controller;
import com.yahoo.vespa.hosted.controller.Instance;
import com.yahoo.vespa.hosted.controller.api.integration.deployment.ApplicationVersion;
import com.yahoo.vespa.hosted.controller.application.ApplicationList;
import com.yahoo.vespa.hosted.controller.application.Deployment;
import com.yahoo.vespa.hosted.controller.application.DeploymentMetrics;
import com.yahoo.vespa.hosted.controller.auditlog.AuditLog;
import com.yahoo.vespa.hosted.controller.deployment.DeploymentStatusList;
import com.yahoo.vespa.hosted.controller.deployment.JobList;
import com.yahoo.vespa.hosted.controller.routing.rotation.RotationLock;
import com.yahoo.vespa.hosted.controller.versions.NodeVersion;
import com.yahoo.vespa.hosted.controller.versions.VersionStatus;
import com.yahoo.vespa.hosted.controller.versions.VespaVersion;
import java.time.Clock;
import java.time.Duration;
import java.time.Instant;
import java.time.temporal.ChronoUnit;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.ConcurrentHashMap;
import java.util.function.Function;
import java.util.stream.Collectors;
/**
* This calculates and reports system-wide metrics based on data from a {@link Controller}.
*
* @author mortent
* @author mpolden
*/
public class MetricsReporter extends ControllerMaintainer {
// Names of the metrics reported by this maintainer, taken from the base names declared in ControllerMetrics.
public static final String TENANT_METRIC = ControllerMetrics.BILLING_TENANTS.baseName();
public static final String DEPLOYMENT_FAIL_METRIC = ControllerMetrics.DEPLOYMENT_FAILURE_PERCENTAGE.baseName();
public static final String DEPLOYMENT_AVERAGE_DURATION = ControllerMetrics.DEPLOYMENT_AVERAGE_DURATION.baseName();
public static final String DEPLOYMENT_FAILING_UPGRADES = ControllerMetrics.DEPLOYMENT_FAILING_UPGRADES.baseName();
public static final String DEPLOYMENT_BUILD_AGE_SECONDS = ControllerMetrics.DEPLOYMENT_BUILD_AGE_SECONDS.baseName();
public static final String DEPLOYMENT_WARNINGS = ControllerMetrics.DEPLOYMENT_WARNINGS.baseName();
public static final String DEPLOYMENT_OVERDUE_UPGRADE = ControllerMetrics.DEPLOYMENT_OVERDUE_UPGRADE_SECONDS.baseName();
public static final String OS_CHANGE_DURATION = ControllerMetrics.DEPLOYMENT_OS_CHANGE_DURATION.baseName();
public static final String PLATFORM_CHANGE_DURATION = ControllerMetrics.DEPLOYMENT_PLATFORM_CHANGE_DURATION.baseName();
public static final String OS_NODE_COUNT = ControllerMetrics.DEPLOYMENT_NODE_COUNT_BY_OS_VERSION.baseName();
public static final String PLATFORM_NODE_COUNT = ControllerMetrics.DEPLOYMENT_NODE_COUNT_BY_PLATFORM_VERSION.baseName();
public static final String BROKEN_SYSTEM_VERSION = ControllerMetrics.DEPLOYMENT_BROKEN_SYSTEM_VERSION.baseName();
public static final String REMAINING_ROTATIONS = ControllerMetrics.REMAINING_ROTATIONS.baseName();
public static final String NAME_SERVICE_REQUESTS_QUEUED = ControllerMetrics.DNS_QUEUED_REQUESTS.baseName();
// Prefix of the per-API operation metrics derived from the audit log
public static final String OPERATION_PREFIX = "operation.";
public static final String ZMS_QUOTA_USAGE = ControllerMetrics.ZMS_QUOTA_USAGE.baseName();
// Sink that all metric values are reported to
private final Metric metric;
// Clock obtained from the controller; time source for the duration metrics
private final Clock clock;
// Client used to read ZMS quota usage
private final ZmsClient zmsClient;
// Keep track of reported node counts for each version, keyed by metric name, version and zone.
// Entries are retained so that a version which is no longer present can be reported as zero.
private final ConcurrentHashMap<NodeCountKey, Long> nodeCounts = new ConcurrentHashMap<>();
/**
 * Creates a metrics reporter, passing a one-minute interval to the superclass.
 *
 * @param controller the controller whose state is reported; also the source of the clock
 * @param metric     the sink receiving all reported values
 * @param zmsClient  the client used for reading ZMS quota usage
 */
public MetricsReporter(Controller controller, Metric metric, ZmsClient zmsClient) {
    super(controller, Duration.ofMinutes(1)); // use fixed rate for metrics
    this.zmsClient = zmsClient;
    this.clock = controller.clock();
    this.metric = metric;
}
/**
 * Reports every metric group handled by this maintainer, in a fixed order.
 *
 * @return always {@code 0.0}
 */
@Override
public double maintain() {
    reportDeploymentMetrics();
    reportRemainingRotations();
    reportQueuedNameServiceRequests();

    // The version status is read once and shared by the two reports that need it
    final VersionStatus currentStatus = controller().readVersionStatus();
    reportInfrastructureUpgradeMetrics(currentStatus);
    reportAuditLog();
    reportBrokenSystemVersion(currentStatus);

    reportTenantMetrics();
    reportZmsQuotaMetrics();
    return 0.0;
}
/** Reports 1 if the confidence of the current system version is broken, and 0 otherwise. */
private void reportBrokenSystemVersion(VersionStatus versionStatus) {
    Version current = controller().systemVersion(versionStatus);
    boolean broken = versionStatus.version(current).confidence() == VespaVersion.Confidence.broken;
    metric.set(BROKEN_SYSTEM_VERSION, broken ? 1 : 0, metric.createContext(Map.of()));
}
/**
 * Reports the number of audit log entries per API and principal.
 *
 * The resource of each entry is split on "/", and the second element names the API. One metric is
 * reported per API, named {@link #OPERATION_PREFIX} followed by the API, with the principal as the
 * "operator" dimension. Entries whose resource yields fewer than two elements are skipped.
 */
private void reportAuditLog() {
    AuditLog log = controller().auditLogger().readLog();
    Map<String, Map<String, Integer>> countsByMetric = new HashMap<>();
    for (AuditLog.Entry entry : log.entries()) {
        String[] resource = entry.resource().split("/");
        if (resource.length < 2) continue; // No API element in this resource
        countsByMetric.computeIfAbsent(OPERATION_PREFIX + resource[1], key -> new HashMap<>())
                      .merge(entry.principal(), 1, Integer::sum);
    }
    countsByMetric.forEach((operationMetric, countsByPrincipal) ->
            countsByPrincipal.forEach((principal, count) ->
                    metric.set(operationMetric, count, metric.createContext(Map.of("operator", principal)))));
}
/**
 * Reports change durations and node counts for OS and platform versions.
 *
 * @param versionStatus the version status to derive platform node versions from
 */
private void reportInfrastructureUpgradeMetrics(VersionStatus versionStatus) {
    Map<NodeVersion, Duration> osChangeDurations = osChangeDurations();
    Map<NodeVersion, Duration> platformChangeDurations = platformChangeDurations(versionStatus);
    reportChangeDurations(osChangeDurations, OS_CHANGE_DURATION);
    reportChangeDurations(platformChangeDurations, PLATFORM_CHANGE_DURATION);
    // The keys of the duration maps are the node versions to count
    reportNodeCount(osChangeDurations.keySet(), OS_NODE_COUNT);
    reportNodeCount(platformChangeDurations.keySet(), PLATFORM_NODE_COUNT);
}
/** Reports the number of available rotations, read while holding the rotation lock. */
private void reportRemainingRotations() {
    try (RotationLock lock = controller().routing().rotations().lock()) {
        var available = controller().routing().rotations().availableRotations(lock);
        metric.set(REMAINING_ROTATIONS, available.size(), metric.createContext(Map.of()));
    }
}
/**
 * Reports deployment metrics for all readable applications with a production deployment: the overall
 * failure percentage, then per-instance average duration, failing upgrades, warnings and overdue upgrade
 * time, and finally the age of the last revision's build for each application.
 */
private void reportDeploymentMetrics() {
    ApplicationList productionApplications = ApplicationList.from(controller().applications().readable())
                                                            .withProductionDeployment();
    DeploymentStatusList statuses = controller().jobController().deploymentStatuses(productionApplications);

    metric.set(DEPLOYMENT_FAIL_METRIC, deploymentFailRatio(statuses) * 100, metric.createContext(Map.of()));

    averageDeploymentDurations(statuses, clock.instant()).forEach((instance, duration) ->
            metric.set(DEPLOYMENT_AVERAGE_DURATION, duration.toSeconds(), metric.createContext(dimensions(instance))));

    deploymentsFailingUpgrade(statuses).forEach((instance, failingJobs) ->
            metric.set(DEPLOYMENT_FAILING_UPGRADES, failingJobs, metric.createContext(dimensions(instance))));

    deploymentWarnings(statuses).forEach((instance, warnings) ->
            metric.set(DEPLOYMENT_WARNINGS, warnings, metric.createContext(dimensions(instance))));

    overdueUpgradeDurationByInstance(statuses).forEach((instance, overduePeriod) ->
            metric.set(DEPLOYMENT_OVERDUE_UPGRADE, overduePeriod.toSeconds(), metric.createContext(dimensions(instance))));

    // Build age is reported against the default instance of each application
    for (Application application : productionApplications.asList()) {
        application.revisions().last()
                   .flatMap(ApplicationVersion::buildTime)
                   .ifPresent(builtAt -> {
                       long ageSeconds = controller().clock().instant().getEpochSecond() - builtAt.getEpochSecond();
                       metric.set(DEPLOYMENT_BUILD_AGE_SECONDS,
                                  ageSeconds,
                                  metric.createContext(dimensions(application.id().defaultInstance())));
                   });
    }
}
/**
 * Returns, for each instance in the given deployment statuses, how long its upgrade is overdue.
 * The duration is zero for instances with no upgrading production jobs.
 *
 * @param deployments the deployment statuses to inspect
 * @return an unmodifiable map from instance to overdue duration
 */
private Map<ApplicationId, Duration> overdueUpgradeDurationByInstance(DeploymentStatusList deployments) {
    Instant now = clock.instant();
    Map<ApplicationId, Duration> overdueUpgrades = new HashMap<>();
    for (var deploymentStatus : deployments) {
        for (var kv : deploymentStatus.instanceJobs().entrySet()) {
            ApplicationId instance = kv.getKey();
            JobList jobs = kv.getValue();
            boolean upgradeRunning = !jobs.production().upgrading().isEmpty();
            // The spec is looked up for every instance, also those that are not upgrading
            DeploymentInstanceSpec instanceSpec = deploymentStatus.application().deploymentSpec().requireInstance(instance.instance());
            Duration overdueDuration = upgradeRunning ? overdueUpgradeDuration(now, instanceSpec) : Duration.ZERO;
            overdueUpgrades.put(instance, overdueDuration);
        }
    }
    return Collections.unmodifiableMap(overdueUpgrades);
}
/**
 * Returns how long an upgrade has been running inside a block window.
 *
 * Starting at the hour of {@code upgradingAt}, this steps backwards one hour at a time for as long as
 * the spec does not allow upgrading, and measures from the earliest such hour to {@code upgradingAt}.
 *
 * @param upgradingAt  the instant at which the upgrade is running
 * @param instanceSpec the spec deciding whether upgrades are allowed at a given instant
 * @return the time spent in the block window, or zero if upgrading is allowed at the starting hour or
 *         the search reaches one week back without finding an hour where upgrading is allowed
 */
static Duration overdueUpgradeDuration(Instant upgradingAt, DeploymentInstanceSpec instanceSpec) {
    Optional<Instant> blockedSince = Optional.empty(); // Earliest hour of the unbroken blocked period found so far
    Instant oneWeekAgo = upgradingAt.minus(Duration.ofDays(7));
    Duration step = Duration.ofHours(1);
    for (Instant instant = upgradingAt.truncatedTo(ChronoUnit.HOURS); !instanceSpec.canUpgradeAt(instant); instant = instant.minus(step)) {
        if (!instant.isAfter(oneWeekAgo)) { // Wrapped around, the entire week is being blocked
            blockedSince = Optional.empty();
            break;
        }
        blockedSince = Optional.of(instant);
    }
    return blockedSince.map(since -> Duration.between(since, upgradingAt))
                       .orElse(Duration.ZERO);
}
/** Reports the number of requests currently in the name service queue. */
private void reportQueuedNameServiceRequests() {
    int queued = controller().curator().readNameServiceQueue().requests().size();
    metric.set(NAME_SERVICE_REQUESTS_QUEUED, queued, metric.createContext(Map.of()));
}
/**
 * Reports the number of nodes per current version and zone, under the given metric name.
 *
 * Counts are remembered across invocations, so that a version and zone combination which was reported
 * earlier under this metric name, but is absent now, is reported as zero.
 *
 * @param nodeVersions the node versions to count
 * @param metricName   the name of the metric to report counts under
 */
private void reportNodeCount(Set<NodeVersion> nodeVersions, String metricName) {
    Map<NodeCountKey, Long> newNodeCounts = nodeVersions.stream()
            .collect(Collectors.groupingBy(nodeVersion -> new NodeCountKey(metricName,
                                                                           nodeVersion.currentVersion(),
                                                                           nodeVersion.zone()),
                                           Collectors.counting()));
    nodeCounts.putAll(newNodeCounts);
    nodeCounts.forEach((key, count) -> {
        if (newNodeCounts.containsKey(key)) {
            // Version is still present: Update the metric.
            metric.set(metricName, count, metric.createContext(dimensions(key.zone, key.version)));
        } else if (key.metricName.equals(metricName)) {
            // Version is no longer present, but has been previously reported: Set it to zero.
            metric.set(metricName, 0, metric.createContext(dimensions(key.zone, key.version)));
        }
    });
}
/**
 * Reports each change duration in seconds, with the host name and zone of the node version as dimensions.
 *
 * @param changeDurations the duration to report for each node version
 * @param metricName      the name of the metric to report durations under
 */
private void reportChangeDurations(Map<NodeVersion, Duration> changeDurations, String metricName) {
    changeDurations.forEach((nodeVersion, duration) ->
            metric.set(metricName,
                       duration.toSeconds(),
                       metric.createContext(dimensions(nodeVersion.hostname(), nodeVersion.zone()))));
}
/**
 * Reports the number of tenants on each billing plan, with the plan as the "plan" dimension.
 * Does nothing unless the system is public.
 */
private void reportTenantMetrics() {
    if (! controller().system().isPublic()) return;

    var planCounter = new TreeMap<String, Integer>();
    controller().tenants().asList().forEach(tenant -> {
        var planId = controller().serviceRegistry().billingController().getPlan(tenant.name());
        planCounter.merge(planId.value(), 1, Integer::sum);
    });

    planCounter.forEach((planId, count) -> {
        var context = metric.createContext(Map.of("plan", planId));
        metric.set(TENANT_METRIC, count, context);
    });
}
/** Reads quota usage from ZMS and reports it for subdomains, services, policies, roles and groups. */
private void reportZmsQuotaMetrics() {
    var usage = zmsClient.getQuotaUsage();
    reportZmsQuota("subdomains", usage.getSubdomainUsage());
    reportZmsQuota("services", usage.getServiceUsage());
    reportZmsQuota("policies", usage.getPolicyUsage());
    reportZmsQuota("roles", usage.getRoleUsage());
    reportZmsQuota("groups", usage.getGroupUsage());
}
/**
 * Reports ZMS quota usage for a single resource type.
 *
 * @param resourceType the value of the "resourceType" dimension
 * @param usage        the usage to report
 */
private void reportZmsQuota(String resourceType, double usage) {
    metric.set(ZMS_QUOTA_USAGE, usage, metric.createContext(Map.of("resourceType", resourceType)));
}
/** Returns the change duration of each node version found in the versions of the given version status. */
private Map<NodeVersion, Duration> platformChangeDurations(VersionStatus versionStatus) {
    return changeDurations(versionStatus.versions(), VespaVersion::nodeVersions);
}
/** Returns the change duration of each node version found in the controller's OS version status. */
private Map<NodeVersion, Duration> osChangeDurations() {
    return changeDurations(controller().os().status().versions().values(), Function.identity());
}
private Map changeDurations(Collection versions, Function> versionsGetter) {
var now = clock.instant();
var durations = new HashMap();
for (var version : versions) {
for (var nodeVersion : versionsGetter.apply(version)) {
durations.put(nodeVersion, nodeVersion.changeDuration(now));
}
}
return durations;
}
/** Returns the fraction of the given deployment statuses which have failures, or 0 when there are no statuses. */
private static double deploymentFailRatio(DeploymentStatusList statuses) {
    var all = statuses.asList();
    if (all.isEmpty()) return 0;

    long failing = all.stream().filter(status -> status.hasFailures()).count();
    return (double) failing / all.size();
}
/**
 * Returns the average deployment duration of each instance in the given statuses.
 *
 * @param statuses the deployment statuses to read instance jobs from
 * @param now      the end time to assume for runs which have not ended
 * @return an unmodifiable map from instance to average duration
 */
private static Map<ApplicationId, Duration> averageDeploymentDurations(DeploymentStatusList statuses, Instant now) {
    return statuses.asList().stream()
                   .flatMap(status -> status.instanceJobs().entrySet().stream())
                   .collect(Collectors.toUnmodifiableMap(Map.Entry::getKey,
                                                         entry -> averageDeploymentDuration(entry.getValue(), now)));
}
/**
 * Returns the number of jobs failing an upgrade for each instance in the given statuses.
 *
 * @param statuses the deployment statuses to read instance jobs from
 * @return an unmodifiable map from instance to number of failing jobs
 */
private static Map<ApplicationId, Integer> deploymentsFailingUpgrade(DeploymentStatusList statuses) {
    return statuses.asList().stream()
                   .flatMap(status -> status.instanceJobs().entrySet().stream())
                   .collect(Collectors.toUnmodifiableMap(Map.Entry::getKey,
                                                         entry -> deploymentsFailingUpgrade(entry.getValue())));
}
/**
 * Returns the number of jobs selected by {@code failingHard()} followed by {@code not().failingApplicationChange()}.
 * NOTE(review): not() presumably negates the filter that follows it; confirm against JobList.
 */
private static int deploymentsFailingUpgrade(JobList jobs) {
    JobList selected = jobs.failingHard().not().failingApplicationChange();
    return selected.size();
}
/**
 * Returns the average duration of the last triggered run of each of the given jobs.
 *
 * @param jobs the jobs whose last triggered runs are measured
 * @param now  the end time to assume for runs which have not ended
 * @return the average duration, or zero if there are no such runs
 */
private static Duration averageDeploymentDuration(JobList jobs, Instant now) {
    List<Duration> jobDurations = jobs.lastTriggered()
                                      .mapToList(run -> Duration.between(run.start(), run.end().orElse(now)));
    if (jobDurations.isEmpty()) return Duration.ZERO; // Avoid dividing by zero

    return jobDurations.stream()
                       .reduce(Duration.ZERO, Duration::plus)
                       .dividedBy(jobDurations.size());
}
/**
 * Returns the highest warning count among the deployments of each instance in the given statuses.
 *
 * @param statuses the deployment statuses whose application instances are inspected
 * @return a map from instance ID to its maximum warning count
 */
private static Map<ApplicationId, Integer> deploymentWarnings(DeploymentStatusList statuses) {
    return statuses.asList().stream()
                   .flatMap(status -> status.application().instances().values().stream())
                   .collect(Collectors.toMap(Instance::id,
                                             instance -> maxWarningCountOf(instance.deployments().values())));
}
/**
 * Returns the largest single warning count found in the metrics of the given deployments.
 *
 * @param deployments the deployments to inspect
 * @return the maximum warning count, or 0 if there are no warnings at all
 */
private static int maxWarningCountOf(Collection<Deployment> deployments) {
    return deployments.stream()
                      .map(Deployment::metrics)
                      .map(DeploymentMetrics::warnings)
                      .flatMap(warnings -> warnings.values().stream())
                      .max(Integer::compareTo)
                      .orElse(0);
}
/**
 * Returns the metric dimensions identifying the given application instance: "tenantName", "app"
 * (application and instance name joined by a dot) and "applicationId" (the full ID string).
 */
private static Map<String, String> dimensions(ApplicationId application) {
    return Map.of("tenantName", application.tenant().value(),
                  "app", application.application().value() + "." + application.instance().value(),
                  "applicationId", application.toFullString());
}
/** Returns the metric dimensions identifying the given host in the given zone: "host" and "zone". */
private static Map<String, String> dimensions(HostName hostname, ZoneId zone) {
    return Map.of("host", hostname.value(),
                  "zone", zone.value());
}
/** Returns the metric dimensions identifying the given version in the given zone: "zone" and "currentVersion". */
private static Map<String, String> dimensions(ZoneId zone, Version currentVersion) {
    return Map.of("zone", zone.value(),
                  "currentVersion", currentVersion.toFullString());
}
/** Identifies a reported node count by metric name, version and zone. Equality and hash code cover all three. */
private static class NodeCountKey {

    private final String metricName;
    private final Version version;
    private final ZoneId zone;

    public NodeCountKey(String metricName, Version version, ZoneId zone) {
        this.metricName = metricName;
        this.version = version;
        this.zone = zone;
    }

    @Override
    public boolean equals(Object o) {
        if (this == o) return true;
        if (o == null || getClass() != o.getClass()) return false;

        NodeCountKey other = (NodeCountKey) o;
        if (!metricName.equals(other.metricName)) return false;
        if (!version.equals(other.version)) return false;
        return zone.equals(other.zone);
    }

    @Override
    public int hashCode() {
        return Objects.hash(metricName, version, zone);
    }

}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy