All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.yahoo.vespa.hosted.routing.nginx.NginxMetricsReporter Maven / Gradle / Ivy

// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.vespa.hosted.routing.nginx;

import com.google.common.collect.ImmutableMap;
import com.yahoo.cloud.config.ApplicationIdConfig;
import com.yahoo.component.AbstractComponent;
import com.yahoo.component.annotation.Inject;
import com.yahoo.concurrent.DaemonThreadFactory;
import com.yahoo.config.provision.ApplicationId;
import com.yahoo.jdisc.Metric;
import com.yahoo.vespa.hosted.routing.RoutingGenerator;
import com.yahoo.vespa.hosted.routing.RoutingTable;
import com.yahoo.vespa.hosted.routing.status.HealthStatus;
import com.yahoo.vespa.hosted.routing.status.ServerGroup;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.file.FileSystem;
import java.nio.file.FileSystems;
import java.nio.file.Files;
import java.nio.file.NoSuchFileException;
import java.nio.file.Path;
import java.time.Duration;
import java.time.Instant;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.function.Supplier;
import java.util.stream.Collectors;

/**
 * Report Nginx metrics periodically.
 *
 * @author mortent
 * @author mpolden
 */
public final class NginxMetricsReporter extends AbstractComponent implements Runnable {

    private static final Duration interval = Duration.ofSeconds(20);

    static final String UPSTREAM_UP_METRIC = "nginx.upstreams.up";
    static final String UPSTREAM_DOWN_METRIC = "nginx.upstreams.down";
    static final String UPSTREAM_UNKNOWN_METRIC = "nginx.upstreams.unknown";
    static final String CONFIG_AGE_METRIC = "upstreams_configuration_age";
    static final String CONFIG_RELOAD_TIME = "upstreams_configuration_reload_time";

    private final Metric metric;
    private final HealthStatus healthStatus;
    private final ApplicationId routingApplication;
    private final FileSystem fileSystem;
    private final ScheduledExecutorService service;
    private final Supplier> tableSupplier;

    @Inject
    public NginxMetricsReporter(ApplicationIdConfig applicationId, Metric metric, HealthStatus healthStatus, RoutingGenerator routingGenerator) {
        this(ApplicationId.from(applicationId), metric, healthStatus, FileSystems.getDefault(), interval, routingGenerator::routingTable);
    }

    NginxMetricsReporter(ApplicationId application, Metric metric, HealthStatus healthStatus, FileSystem fileSystem, Duration interval,
                         Supplier> tableSupplier) {
        this.metric = Objects.requireNonNull(metric);
        this.healthStatus = Objects.requireNonNull(healthStatus);
        this.routingApplication = Objects.requireNonNull(application);
        this.fileSystem = Objects.requireNonNull(fileSystem);
        this.tableSupplier = Objects.requireNonNull(tableSupplier);
        this.service = Executors.newSingleThreadScheduledExecutor(new DaemonThreadFactory("nginx-metrics-reporter"));
        this.service.scheduleAtFixedRate(this, interval.toMillis(), interval.toMillis(), TimeUnit.MILLISECONDS);
    }

    @Override
    public void run() {
        Optional table = tableSupplier.get();
        table.ifPresent(this::reportHealth);
        reportConfigAge();
    }

    private void reportConfigAge() {
        Path temporaryNginxConfiguration = NginxPath.temporaryConfig.in(fileSystem);
        Path nginxConfiguration = NginxPath.config.in(fileSystem);
        Optional temporaryConfigModified = lastModified(temporaryNginxConfiguration);
        if (temporaryConfigModified.isEmpty()) {
            metric.set(CONFIG_AGE_METRIC, 0, metric.createContext(Map.of()));
            metric.set(CONFIG_RELOAD_TIME, 0, metric.createContext(Map.of()));
            return;
        }
        Instant configModified = lastModified(nginxConfiguration).orElse(Instant.EPOCH);
        long secondsDiff = Math.abs(Duration.between(configModified, temporaryConfigModified.get()).toSeconds());
        long reloadTime = Math.abs(Duration.between(temporaryConfigModified.get(), Instant.now()).toSeconds());
        metric.set(CONFIG_AGE_METRIC, secondsDiff, metric.createContext(Map.of()));
        metric.set(CONFIG_RELOAD_TIME, reloadTime, metric.createContext(Map.of()));
    }

    private void reportHealth(RoutingTable table) {
        Collection targets = table.asMap().values();
        Map> status = healthStatus.servers().asMap();
        targets.forEach(service -> {
            List serversOfUpstream = status.get(service.id());
            if (serversOfUpstream != null) {
                reportMetrics(service, serversOfUpstream);
            } else {
                reportMetricsUnknown(service);
            }
        });

        Set knownUpstreams = targets.stream().map(RoutingTable.Target::id).collect(Collectors.toSet());
        long unknownUpstreamCount = status.keySet().stream()
                                          .filter(upstreamName -> !knownUpstreams.contains(upstreamName))
                                          .count();
        reportMetricsUnknown(unknownUpstreamCount);
    }

    // We report a target as unknown if there is no trace of it in the health check yet.  This might not be an issue
    // (the health check status is a cache), but if it lasts for a long time it might be an error.
    private void reportMetricsUnknown(RoutingTable.Target target) {
        var dimensions = metricsDimensionsForService(target);
        var context = metric.createContext(dimensions);
        metric.set(UPSTREAM_UP_METRIC, 0L, context);
        metric.set(UPSTREAM_DOWN_METRIC, 0L, context);
        metric.set(UPSTREAM_UNKNOWN_METRIC, 1L, context);
    }

    // This happens if an application is mentioned in the health check cache, but is not present
    // in the routing table. We report this to the routing application, as we don't have anywhere
    // else to put the data.
    private void reportMetricsUnknown(long count) {
        var dimensions = ImmutableMap.of(
                "tenantName", routingApplication.tenant().value(),
                "app", String.format("%s.%s", routingApplication.application().value(), routingApplication.instance().value()),
                "applicationId", routingApplication.toFullString(),
                "clusterid", "routing"
        );
        var context = metric.createContext(dimensions);
        metric.set(UPSTREAM_UNKNOWN_METRIC, count, context);
    }

    private void reportMetrics(RoutingTable.Target target, List servers) {
        long up = countStatus(servers, true);
        long down = countStatus(servers, false);

        var dimensions = metricsDimensionsForService(target);
        var context = metric.createContext(dimensions);
        metric.set(UPSTREAM_UP_METRIC, up, context);
        metric.set(UPSTREAM_DOWN_METRIC, down, context);
        metric.set(UPSTREAM_UNKNOWN_METRIC, 0L, context);
    }

    private Map metricsDimensionsForService(RoutingTable.Target target) {
        String applicationId = target.tenant().value() + "." + target.application().value();
        String app = target.application().value();
        if (target.instance().isPresent()) {
            app += "." + target.instance().get().value();
            applicationId += "." + target.instance().get().value();
        }
        return ImmutableMap.of(
                "tenantName", target.tenant().value(),
                "app", app,
                "applicationId", applicationId,
                "clusterid", target.cluster().value()
        );
    }

    private long countStatus(List upstreams, boolean up) {
        return upstreams.stream().filter(nginxServer -> up == nginxServer.up()).count();
    }

    private static Optional lastModified(Path path) {
        try {
            return Optional.ofNullable(Files.getLastModifiedTime(path).toInstant());
        } catch (NoSuchFileException e) {
            return Optional.empty();
        } catch (IOException e) {
            throw new UncheckedIOException(e);
        }
    }

    @Override
    public void deconstruct() {
        Duration timeout = Duration.ofSeconds(10);
        service.shutdown();
        try {
            if (!service.awaitTermination(timeout.toMillis(), TimeUnit.MILLISECONDS)) {
                throw new RuntimeException("Failed to shutdown executor within " + timeout);
            }
        } catch (InterruptedException e) {
            throw new RuntimeException(e);
        }
    }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy