All Downloads are FREE. Search and download functionalities are using the official Maven repository.

ai.vespa.metricsproxy.http.application.ApplicationMetricsRetriever Maven / Gradle / Ivy

There is a newer version: 8.458.13
Show newest version
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package ai.vespa.metricsproxy.http.application;

import ai.vespa.metricsproxy.metric.model.ConsumerId;
import ai.vespa.metricsproxy.metric.model.MetricsPacket;
import ai.vespa.util.http.hc5.VespaAsyncHttpClientBuilder;
import com.yahoo.component.annotation.Inject;
import com.yahoo.component.AbstractComponent;
import org.apache.hc.client5.http.ConnectTimeoutException;
import org.apache.hc.client5.http.config.RequestConfig;
import org.apache.hc.client5.http.impl.async.CloseableHttpAsyncClient;
import org.apache.hc.core5.reactor.IOReactorConfig;
import org.apache.hc.core5.util.Timeout;

import java.io.IOException;
import java.net.SocketException;
import java.time.Clock;
import java.time.Duration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.CancellationException;
import java.util.logging.Level;
import java.util.logging.Logger;

import static ai.vespa.metricsproxy.http.ValuesFetcher.defaultMetricsConsumerId;

/**
 * This class retrieves metrics from all nodes in the given config, usually all
 * nodes in a Vespa application.
 *
 * @author gjoranv
 */
public final class ApplicationMetricsRetriever extends AbstractComponent implements Runnable {

    private static final Logger log = Logger.getLogger(ApplicationMetricsRetriever.class.getName());

    static final Duration MIN_TIMEOUT = Duration.ofSeconds(60);
    static final Duration MAX_TIMEOUT = Duration.ofSeconds(240);

    private static final int HTTP_CONNECT_TIMEOUT = 5000;
    private static final int HTTP_SOCKET_TIMEOUT = 30000;
    private static final Duration METRICS_TTL = Duration.ofSeconds(30);

    private final CloseableHttpAsyncClient httpClient = createHttpClient();
    private final List clients;
    private final Thread pollThread;
    private final Set consumerSet;
    private long pollCount = 0;
    private volatile boolean stopped;
    private volatile Duration taskTimeout;

    @Inject
    public ApplicationMetricsRetriever(MetricsNodesConfig nodesConfig) {
        clients = createNodeClients(nodesConfig);
        taskTimeout = timeout(clients.size());
        stopped = false;
        consumerSet = new HashSet<>();
        httpClient.start();
        pollThread = new Thread(this, "metrics-poller");
        pollThread.setDaemon(true);
    }

    @Override
    public void run() {
        while (true) {
            try {
                ConsumerId [] consumers;
                synchronized (pollThread) {
                    consumers = consumerSet.toArray(new ConsumerId[0]);
                }
                for (ConsumerId consumer : consumers) {
                    int numFailed = fetchMetricsAsync(consumer);
                    if (numFailed > 0 ) {
                        log.log(Level.INFO, "Updated metrics for consumer '" + consumer +"' failed for " + numFailed + " services");
                    } else {
                        log.log(Level.FINE, "Updated metrics for consumer '" + consumer +"'.");
                    }
                }
                Duration timeUntilNextPoll = Duration.ofMillis(1000);
                synchronized (pollThread) {
                    pollCount++;
                    pollThread.notifyAll();
                    pollThread.wait(timeUntilNextPoll.toMillis());
                    if (stopped) return;
                }
            } catch (InterruptedException e) {
            } catch (Exception e) {
                log.log(Level.WARNING, "Got unknown exception:", e);
            }
        }

    }

    @Override
    public void deconstruct() {
        try {
            httpClient.close();
        } catch (IOException e) {
            log.warning("Failed closing httpclient: " + e);
        }
        synchronized (pollThread) {
            stopped = true;
            pollThread.notifyAll();
        }
        try {
            pollThread.join(Duration.ofSeconds(3).toMillis());
            if (pollThread.isAlive()) {
                log.log(Level.WARNING, "metrics poller thread still not stopped, using Thread.interrupt():");
                pollThread.interrupt();
                pollThread.join();
            }
        } catch (InterruptedException e) {}
        super.deconstruct();
    }

    Map> getMetrics() {
        return getMetrics(defaultMetricsConsumerId);
    }

    public Map> getMetrics(ConsumerId consumer) {
        log.log(Level.FINE, () -> "Retrieving metrics from " + clients.size() + " nodes.");
        synchronized (pollThread) {
            if (consumerSet.add(consumer)) {
                // Wakeup poll thread first time we see a new consumer
                pollThread.notifyAll();
            }
        }
        Map> metrics = new HashMap<>();
        for (NodeMetricsClient client : clients) {
            metrics.put(client.node, client.getMetrics(consumer));
        }
        return metrics;
    }

    void startPollAndWait() {
        try {
            synchronized (pollThread) {
                if ( ! pollThread.isAlive()) {
                    pollThread.start();
                }
                long before = pollCount;
                pollThread.notifyAll();
                while (pollCount <= before + 1) {
                    pollThread.notifyAll();
                    pollThread.wait();
                }
            }
        } catch (InterruptedException e) {}
    }

    private int fetchMetricsAsync(ConsumerId consumer) {
        Map> futures = new HashMap<>();
        for (NodeMetricsClient client : clients) {
            client.startSnapshotUpdate(consumer, METRICS_TTL).ifPresent(future -> futures.put(client.node, future));
        }
        int numOk = 0;
        int numTried = futures.size();
        for (Map.Entry> entry : futures.entrySet()) {
            if (stopped) break;
            try {
                entry.getValue().get(taskTimeout.toMillis(), TimeUnit.MILLISECONDS);
                numOk++;
            } catch (InterruptedException | ExecutionException | TimeoutException | CancellationException e) {
                Throwable cause = e.getCause();
                if (stopped || e instanceof ExecutionException && ((cause instanceof SocketException) || cause instanceof ConnectTimeoutException)) {
                    log.log(Level.FINE, "Failed retrieving metrics for '" + entry.getKey() + "' : " + cause.getMessage());
                } else {
                    log.log(Level.WARNING, "Failed retrieving metrics for '" + entry.getKey() + "' : ", e);
                }
            }
        }
        log.log(Level.FINE, () -> "Finished retrieving metrics from " + clients.size() + " nodes.");
        return numTried - numOk;
    }

    private List createNodeClients(MetricsNodesConfig nodesConfig) {
        return nodesConfig.node().stream()
                .map(Node::new)
                .map(node-> new NodeMetricsClient(httpClient, node, Clock.systemUTC()))
                .toList();
    }

    @SuppressWarnings("deprecation")
    static CloseableHttpAsyncClient createHttpClient() {
        return VespaAsyncHttpClientBuilder.create()
                .setIOReactorConfig(IOReactorConfig.custom()
                        .setSoTimeout(Timeout.ofMilliseconds(HTTP_SOCKET_TIMEOUT))
                        .setIoThreadCount(2)
                        .build())
                .setUserAgent("application-metrics-retriever")
                .setDefaultRequestConfig(RequestConfig.custom()
                        .setConnectTimeout(Timeout.ofMilliseconds(HTTP_CONNECT_TIMEOUT))
                        .setResponseTimeout(Timeout.ofMilliseconds(HTTP_SOCKET_TIMEOUT))
                        .build())
                .build();
    }

    static Duration timeout(int clients) {
        Duration timeout = Duration.ofSeconds(Long.max(MIN_TIMEOUT.toSeconds(), clients));
        return timeout.compareTo(MAX_TIMEOUT) > 0 ? MAX_TIMEOUT : timeout;
    }

    // For testing only!
    void setTaskTimeout(Duration taskTimeout) {
        this.taskTimeout = taskTimeout;
    }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy