// com.yahoo.vespa.hosted.routing.nginx.NginxMetricsReporter Maven / Gradle / Ivy
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.routing.nginx;
import com.google.common.collect.ImmutableMap;
import com.yahoo.cloud.config.ApplicationIdConfig;
import com.yahoo.component.AbstractComponent;
import com.yahoo.component.annotation.Inject;
import com.yahoo.concurrent.DaemonThreadFactory;
import com.yahoo.config.provision.ApplicationId;
import com.yahoo.jdisc.Metric;
import com.yahoo.vespa.hosted.routing.RoutingGenerator;
import com.yahoo.vespa.hosted.routing.RoutingTable;
import com.yahoo.vespa.hosted.routing.status.HealthStatus;
import com.yahoo.vespa.hosted.routing.status.ServerGroup;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.file.FileSystem;
import java.nio.file.FileSystems;
import java.nio.file.Files;
import java.nio.file.NoSuchFileException;
import java.nio.file.Path;
import java.time.Duration;
import java.time.Instant;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.function.Supplier;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.stream.Collectors;
/**
* Report Nginx metrics periodically.
*
* @author mortent
* @author mpolden
*/
public final class NginxMetricsReporter extends AbstractComponent implements Runnable {
private static final Duration interval = Duration.ofSeconds(20);
static final String UPSTREAM_UP_METRIC = "nginx.upstreams.up";
static final String UPSTREAM_DOWN_METRIC = "nginx.upstreams.down";
static final String UPSTREAM_UNKNOWN_METRIC = "nginx.upstreams.unknown";
static final String CONFIG_AGE_METRIC = "upstreams_configuration_age";
static final String CONFIG_RELOAD_TIME = "upstreams_configuration_reload_time";
private final Metric metric;
private final HealthStatus healthStatus;
private final ApplicationId routingApplication;
private final FileSystem fileSystem;
private final ScheduledExecutorService service;
private final Supplier> tableSupplier;
@Inject
public NginxMetricsReporter(ApplicationIdConfig applicationId, Metric metric, HealthStatus healthStatus, RoutingGenerator routingGenerator) {
this(ApplicationId.from(applicationId), metric, healthStatus, FileSystems.getDefault(), interval, routingGenerator::routingTable);
}
NginxMetricsReporter(ApplicationId application, Metric metric, HealthStatus healthStatus, FileSystem fileSystem, Duration interval,
Supplier> tableSupplier) {
this.metric = Objects.requireNonNull(metric);
this.healthStatus = Objects.requireNonNull(healthStatus);
this.routingApplication = Objects.requireNonNull(application);
this.fileSystem = Objects.requireNonNull(fileSystem);
this.tableSupplier = Objects.requireNonNull(tableSupplier);
this.service = Executors.newSingleThreadScheduledExecutor(new DaemonThreadFactory("nginx-metrics-reporter"));
this.service.scheduleAtFixedRate(this, interval.toMillis(), interval.toMillis(), TimeUnit.MILLISECONDS);
}
@Override
public void run() {
Optional table = tableSupplier.get();
table.ifPresent(this::reportHealth);
reportConfigAge();
}
private void reportConfigAge() {
Path temporaryNginxConfiguration = NginxPath.temporaryConfig.in(fileSystem);
Path nginxConfiguration = NginxPath.config.in(fileSystem);
Optional temporaryConfigModified = lastModified(temporaryNginxConfiguration);
if (temporaryConfigModified.isEmpty()) {
metric.set(CONFIG_AGE_METRIC, 0, metric.createContext(Map.of()));
metric.set(CONFIG_RELOAD_TIME, 0, metric.createContext(Map.of()));
return;
}
Instant configModified = lastModified(nginxConfiguration).orElse(Instant.EPOCH);
long secondsDiff = Math.abs(Duration.between(configModified, temporaryConfigModified.get()).toSeconds());
long reloadTime = Math.abs(Duration.between(temporaryConfigModified.get(), Instant.now()).toSeconds());
metric.set(CONFIG_AGE_METRIC, secondsDiff, metric.createContext(Map.of()));
metric.set(CONFIG_RELOAD_TIME, reloadTime, metric.createContext(Map.of()));
}
private void reportHealth(RoutingTable table) {
Collection targets = table.asMap().values();
Map> status = healthStatus.servers().asMap();
targets.forEach(service -> {
List serversOfUpstream = status.get(service.id());
if (serversOfUpstream != null) {
reportMetrics(service, serversOfUpstream);
} else {
reportMetricsUnknown(service);
}
});
Set knownUpstreams = targets.stream().map(RoutingTable.Target::id).collect(Collectors.toSet());
long unknownUpstreamCount = status.keySet().stream()
.filter(upstreamName -> !knownUpstreams.contains(upstreamName))
.count();
reportMetricsUnknown(unknownUpstreamCount);
}
// We report a target as unknown if there is no trace of it in the health check yet. This might not be an issue
// (the health check status is a cache), but if it lasts for a long time it might be an error.
private void reportMetricsUnknown(RoutingTable.Target target) {
var dimensions = metricsDimensionsForService(target);
var context = metric.createContext(dimensions);
metric.set(UPSTREAM_UP_METRIC, 0L, context);
metric.set(UPSTREAM_DOWN_METRIC, 0L, context);
metric.set(UPSTREAM_UNKNOWN_METRIC, 1L, context);
}
// This happens if an application is mentioned in the health check cache, but is not present
// in the routing table. We report this to the routing application, as we don't have anywhere
// else to put the data.
private void reportMetricsUnknown(long count) {
var dimensions = ImmutableMap.of(
"tenantName", routingApplication.tenant().value(),
"app", String.format("%s.%s", routingApplication.application().value(), routingApplication.instance().value()),
"applicationId", routingApplication.toFullString(),
"clusterid", "routing"
);
var context = metric.createContext(dimensions);
metric.set(UPSTREAM_UNKNOWN_METRIC, count, context);
}
private void reportMetrics(RoutingTable.Target target, List servers) {
long up = countStatus(servers, true);
long down = countStatus(servers, false);
var dimensions = metricsDimensionsForService(target);
var context = metric.createContext(dimensions);
metric.set(UPSTREAM_UP_METRIC, up, context);
metric.set(UPSTREAM_DOWN_METRIC, down, context);
metric.set(UPSTREAM_UNKNOWN_METRIC, 0L, context);
}
private Map metricsDimensionsForService(RoutingTable.Target target) {
String applicationId = target.tenant().value() + "." + target.application().value();
String app = target.application().value();
if (target.instance().isPresent()) {
app += "." + target.instance().get().value();
applicationId += "." + target.instance().get().value();
}
return ImmutableMap.of(
"tenantName", target.tenant().value(),
"app", app,
"applicationId", applicationId,
"clusterid", target.cluster().value()
);
}
private long countStatus(List upstreams, boolean up) {
return upstreams.stream().filter(nginxServer -> up == nginxServer.up()).count();
}
private static Optional lastModified(Path path) {
try {
return Optional.ofNullable(Files.getLastModifiedTime(path).toInstant());
} catch (NoSuchFileException e) {
return Optional.empty();
} catch (IOException e) {
throw new UncheckedIOException(e);
}
}
@Override
public void deconstruct() {
Duration timeout = Duration.ofSeconds(10);
service.shutdown();
try {
if (!service.awaitTermination(timeout.toMillis(), TimeUnit.MILLISECONDS)) {
throw new RuntimeException("Failed to shutdown executor within " + timeout);
}
} catch (InterruptedException e) {
throw new RuntimeException(e);
}
}
}
// © 2015 - 2024 Weber Informatics LLC | Privacy Policy