All Downloads are FREE. Search and download functionalities are using the official Maven repository.

ai.vespa.metricsproxy.service.SystemPoller Maven / Gradle / Ivy

There is a newer version: 8.458.13
Show newest version
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package ai.vespa.metricsproxy.service;

import ai.vespa.metricsproxy.metric.Metric;
import ai.vespa.metricsproxy.metric.Metrics;
import ai.vespa.metricsproxy.metric.model.MetricId;

import java.io.Reader;
import java.time.Duration;
import java.time.Instant;
import java.util.HashMap;
import java.util.logging.Level;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Timer;
import java.util.TimerTask;
import java.util.concurrent.ConcurrentHashMap;
import java.util.logging.Logger;

/**
 * Polls the operating system for per-service memory and CPU usage and stores the result
 * as system metrics on each {@link VespaService}.
 *
 * <p>Memory is read from {@code /proc/<pid>/smaps}; CPU time is read from {@code /proc/<pid>/stat}
 * and {@code /proc/stat}. Every call to {@link #poll()} schedules the next poll on an internal
 * daemon {@link Timer}.
 *
 * TODO: rewrite to use ScheduledExecutorService or just call poll() directly.
 *
 * @author Eirik Nygaard
 */
public class SystemPoller {

    private static final Logger log = Logger.getLogger(SystemPoller.class.getName());

    /* Indexes into the array returned by getMemoryUsage */
    private static final int memoryTypeVirtual = 0;
    private static final int memoryTypeResident = 1;

    private static final MetricId CPU = MetricId.toMetricId("cpu");
    private static final MetricId CPU_UTIL = MetricId.toMetricId("cpu_util");
    private static final MetricId MEMORY_VIRT = MetricId.toMetricId("memory_virt");
    private static final MetricId MEMORY_RSS = MetricId.toMetricId("memory_rss");

    /* Lower bound for the delay used by reschedule() */
    private static final Duration MIN_RESCHEDULE_DELAY = Duration.ofMinutes(1);

    /* Positions of utime and stime in a /proc/<pid>/stat line split on single spaces */
    private static final int UTIME_FIELD = 13;
    private static final int STIME_FIELD = 14;
    /* Number of fields (pid and comm) that precede the text following the ')' that ends comm */
    private static final int FIELDS_BEFORE_STATE = 2;

    private final Duration interval;
    private final List<VespaService> services;
    private final Map<VespaService, Long> lastCpuJiffiesMetrics = new ConcurrentHashMap<>();
    private final Timer systemPollTimer;
    private final GetJiffies jiffiesInterface;

    private JiffiesAndCpus lastTotalCpuJiffies;

    /** A jiffies count together with the number of CPUs it covers; the CPU count is always at least 1. */
    static class JiffiesAndCpus {
        final long jiffies;
        final int cpus;

        /** Creates the "no data" value: 0 jiffies on 1 CPU. */
        JiffiesAndCpus() { this(0,1); }

        JiffiesAndCpus(long jiffies, int cpus) {
            this.jiffies = jiffies;
            this.cpus = Math.max(1, cpus);
        }

        /** 1.0 = 1 busy core. Range = [0.0, #cores] */
        double ratioSingleCoreJiffies(long partJiffies) {
            return (double)(partJiffies * cpus) / Math.max(1.0, jiffies);
        }

        /** Range = [0.0, 1.0] */
        double ratioJiffies(long partJiffies) {
            return (double)(partJiffies) / Math.max(1.0, jiffies);
        }

        /**
         * Returns the jiffies elapsed since {@code prev}. If the CPU count differs between the two
         * samples, the "no data" value (0 jiffies, 1 CPU) is returned instead.
         */
        JiffiesAndCpus diff(JiffiesAndCpus prev) {
            return (cpus == prev.cpus)
                    ? new JiffiesAndCpus(jiffies - prev.jiffies, cpus)
                    : new JiffiesAndCpus();
        }
    }

    /** Source of jiffies samples; lets updateMetrics be driven by something other than /proc. */
    interface GetJiffies {
        JiffiesAndCpus getTotalSystemJiffies();
        long getJiffies(VespaService service);
    }

    /**
     * Creates a poller and takes the initial CPU samples that the first poll is compared against.
     * No poll is scheduled by the constructor.
     *
     * @param services the services to collect system metrics for; the list is kept by reference
     * @param interval the delay between polls
     */
    public SystemPoller(List<VespaService> services, Duration interval) {
        this.services = services;
        this.interval = interval;
        systemPollTimer = new Timer("systemPollTimer", true);
        jiffiesInterface = new GetJiffies() {
            @Override
            public JiffiesAndCpus getTotalSystemJiffies() {
                return SystemPoller.getTotalSystemJiffies();
            }

            @Override
            public long getJiffies(VespaService service) {
                return SystemPoller.getPidJiffies(service);
            }
        };
        lastTotalCpuJiffies = jiffiesInterface.getTotalSystemJiffies();
        for (VespaService s : services) {
            lastCpuJiffiesMetrics.put(s, jiffiesInterface.getJiffies(s));
        }
    }

    /** Cancels the timer; later attempts to schedule a poll are logged and ignored. */
    void stop() {
        systemPollTimer.cancel();
    }

    /**
     * Return memory usage for a given process, both virtual and resident is
     * returned. The service is marked as not alive if its smaps file cannot be opened.
     *
     * @param service The instance to get memory usage for
     * @return array[0] = memoryVirtual, array[1] = memoryResident (bytes); all zeros on failure
     */
    static long[] getMemoryUsage(VespaService service) {
        BufferedReader br;
        int pid = service.getPid();

        try {
            br = new BufferedReader(new FileReader("/proc/" + pid + "/smaps"));
        } catch (FileNotFoundException ex) {
            service.setAlive(false);
            return new long[2];
        }
        try {
            return getMemoryUsage(br);
        } catch (IOException ex) {
            log.log(Level.FINE, "Unable to read line from smaps file", ex);
            return new long[2];
        } finally {
            try {
                br.close();
            } catch (IOException ex) {
                log.log(Level.FINE, "Closing of smaps file failed", ex);
            }
        }
    }

    /**
     * Sums all {@code Size:} and {@code Rss:} entries of an smaps listing.
     *
     * @param br reader positioned at the start of the smaps content; not closed by this method
     * @return array[0] = memoryVirtual, array[1] = memoryResident (bytes)
     * @throws IOException if reading fails
     * @throws NumberFormatException if a matching line does not carry a numeric value
     */
    static long[] getMemoryUsage(BufferedReader br) throws IOException{
        String line;
        long[] size = new long[2];
        while ((line = br.readLine()) != null) {
            if (line.startsWith("Rss:")) {
                size[memoryTypeResident] += smapsValueInBytes(line, 4);
            } else if (line.startsWith("Size:")) {
                size[memoryTypeVirtual] += smapsValueInBytes(line, 5);
            }
        }

        return size;
    }

    /** Parses the number following the label of an smaps line and converts it from kB to bytes. */
    private static long smapsValueInBytes(String line, int labelLength) {
        String remain = line.substring(labelLength).trim();
        // The value is normally followed by " kB"; accept a bare number as well
        int unitSeparator = remain.indexOf(' ');
        String number = (unitSeparator < 0) ? remain : remain.substring(0, unitSeparator);
        /* Memory size is given in kB - convert to bytes by multiply with 1024*/
        return Long.parseLong(number) * 1024;
    }

    /**
     * Poll services for system metrics, then schedule the next poll.
     */
    void poll() {
        Instant startTime = Instant.now();

        /* Don't do any work if there are no known services */
        if (services.isEmpty()) {
            schedule();
            return;
        }

        log.log(Level.FINE, () -> "Monitoring system metrics for " + services.size() + " services");

        // Sampled before updating, since the update itself may mark services as not alive
        boolean someAlive = services.stream().anyMatch(VespaService::isAlive);
        lastTotalCpuJiffies = updateMetrics(lastTotalCpuJiffies, startTime, jiffiesInterface, services, lastCpuJiffiesMetrics);

        // If none of the services were alive, subtract the time spent polling from the next delay
        if (!someAlive) {
            reschedule(Duration.between(startTime, Instant.now()));
        } else {
            schedule();
        }
    }

    /**
     * Computes memory and CPU metrics for every service and stores them on the service.
     *
     * @param prevTotalJiffies total system jiffies from the previous round
     * @param timeStamp timestamp given to every metric created
     * @param getJiffies source of the current jiffies samples
     * @param services the services to update
     * @param lastCpuJiffiesMetrics per-service jiffies from the previous round; updated in place
     * @return the total system jiffies sampled in this round, to be passed as
     *         {@code prevTotalJiffies} in the next one
     */
    static JiffiesAndCpus updateMetrics(JiffiesAndCpus prevTotalJiffies, Instant timeStamp, GetJiffies getJiffies,
                                        List<VespaService> services, Map<VespaService, Long> lastCpuJiffiesMetrics) {
        // Sample all processes before the system total
        Map<VespaService, Long> currentServiceJiffies = new HashMap<>();
        for (VespaService s : services) {
            currentServiceJiffies.put(s, getJiffies.getJiffies(s));
        }
        JiffiesAndCpus sysJiffies = getJiffies.getTotalSystemJiffies();
        JiffiesAndCpus sysJiffiesDiff = sysJiffies.diff(prevTotalJiffies);
        log.log(Level.FINE, () -> "Total jiffies: " + sysJiffies.jiffies + " - " + prevTotalJiffies.jiffies + " = " + sysJiffiesDiff.jiffies);
        for (VespaService s : services) {
            Metrics metrics = new Metrics();

            long[] size = getMemoryUsage(s);
            log.log(Level.FINE, () -> "Updating memory metric for service " + s);

            metrics.add(new Metric(MEMORY_VIRT, size[memoryTypeVirtual], timeStamp));
            metrics.add(new Metric(MEMORY_RSS, size[memoryTypeResident], timeStamp));

            long procJiffies = currentServiceJiffies.get(s);
            Long last = lastCpuJiffiesMetrics.get(s);
            if (last == null) {
                // No earlier sample to compare with (service not seen before): only record this one
                log.log(Level.FINE, () -> "Service " + s + " has no previous jiffies sample, skipping cpu metrics");
            } else {
                long diff = procJiffies - last;

                log.log(Level.FINE, () -> "Service " + s + " jiffies: " + procJiffies + " - " + last + " = " + diff);
                // A negative diff means the counter went backwards; no cpu metrics are reported then
                if (diff >= 0) {
                    metrics.add(new Metric(CPU, 100 * sysJiffiesDiff.ratioSingleCoreJiffies(diff), timeStamp));
                    metrics.add(new Metric(CPU_UTIL, 100 * sysJiffiesDiff.ratioJiffies(diff), timeStamp));
                }
            }
            lastCpuJiffiesMetrics.put(s, procJiffies);
            s.setSystemMetrics(metrics);
            log.log(Level.FINE, () -> "Current size of system metrics for service  " + s + " is " + metrics.size());
        }
        return sysJiffies;
    }

    /**
     * Returns the jiffies used by the process of the given service, read from {@code /proc/<pid>/stat}.
     * The service is marked as not alive if the file cannot be opened.
     *
     * @param service the service to sample
     * @return user mode + kernel mode jiffies, or 0 if they could not be read
     */
    static long getPidJiffies(VespaService service) {
        int pid = service.getPid();
        BufferedReader in;
        try {
            in = new BufferedReader(new FileReader("/proc/" + pid + "/stat"));
        } catch (FileNotFoundException ex) {
            log.log(Level.FINE, () -> "Unable to find pid " + pid + " in proc directory, for service " + service.getInstanceName());
            service.setAlive(false);
            return 0;
        }
        // The callee closes the reader on every path
        return getPidJiffies(in);
    }

    /**
     * Parses the first line of a process stat file and returns its CPU time.
     *
     * @param in reader positioned at the start of the stat content; always closed by this method
     * @return user mode + kernel mode jiffies, or 0 if the content is missing or malformed
     */
    static long getPidJiffies(BufferedReader in) {
        String line;

        try (BufferedReader reader = in) {
            line = reader.readLine();
        } catch (IOException ex) {
            log.log(Level.FINE, "Unable to read line from process stat file", ex);
            return 0;
        }
        if (line == null) {
            log.log(Level.FINE, "Process stat file was empty");
            return 0;
        }

        // The second field is the executable name in parentheses and may itself contain spaces,
        // so the fixed positions are counted from the last ')' rather than from the line start.
        String[] elems;
        int skipped;
        int commEnd = line.lastIndexOf(')');
        if (commEnd >= 0) {
            elems = line.substring(commEnd + 1).trim().split(" ");
            skipped = FIELDS_BEFORE_STATE;
        } else {
            elems = line.split(" ");
            skipped = 0;
        }

        if (elems.length <= STIME_FIELD - skipped) {
            log.log(Level.FINE, () -> "Too few fields in process stat line: " + line);
            return 0;
        }

        try {
            /* Add user mode and kernel mode jiffies for the given process */
            return Long.parseLong(elems[UTIME_FIELD - skipped]) + Long.parseLong(elems[STIME_FIELD - skipped]);
        } catch (NumberFormatException ex) {
            log.log(Level.FINE, "Unable to parse jiffies from process stat line: " + line, ex);
            return 0;
        }
    }

    /** Reads the total system jiffies from {@code /proc/stat}; returns the "no data" value on failure. */
    private static JiffiesAndCpus getTotalSystemJiffies() {
        BufferedReader in;
        try {
            in = new BufferedReader(new FileReader("/proc/stat"));
        } catch (FileNotFoundException ex) {
            log.log(Level.SEVERE, "Unable to open stat file", ex);
            return new JiffiesAndCpus();
        }
        // The callee closes the reader on every path
        return getTotalSystemJiffies(in);
    }

    /**
     * Parses system stat content: the line starting with {@code "cpu "} gives the total, and every
     * other line starting with {@code "cpu"} is counted as one CPU.
     *
     * @param in reader positioned at the start of the stat content; always closed by this method
     * @return the total jiffies and CPU count, or the "no data" value if reading fails or no total line exists
     */
    static JiffiesAndCpus getTotalSystemJiffies(BufferedReader in) {
        CpuJiffies total = null;
        int cpuCount = 0;

        try (BufferedReader reader = in) {
            String line;
            while ((line = reader.readLine()) != null) {
                if (line.startsWith("cpu ")) {
                    total = new CpuJiffies(line);
                } else if (line.startsWith("cpu")) {
                    // Only the number of per-CPU lines is needed, not their content
                    cpuCount++;
                }
            }
        } catch (IOException ex) {
            log.log(Level.SEVERE, "Unable to read line from stat file", ex);
            return new JiffiesAndCpus();
        }

        /* Normalize so that a process that uses an entire CPU core will get 100% util */
        return (total != null)
                ? new JiffiesAndCpus(total.getTotalJiffies(), cpuCount)
                : new JiffiesAndCpus();
    }

    /**
     * Schedules a single poll after the given delay. If the timer has been cancelled this is
     * logged and otherwise ignored.
     */
    void schedule(Duration time) {
        try {
            systemPollTimer.schedule(new PollTask(this), time.toMillis());
        } catch(IllegalStateException e){
            log.info("Tried to schedule task, but timer was already shut down.");
        }
    }

    /** Schedules a single poll after the configured interval. */
    void schedule() {
        schedule(interval);
    }

    /** Schedules a poll after the interval minus the given skew, but never sooner than one minute. */
    private void reschedule(Duration skew) {
        Duration sleep = interval.minus(skew);

        // Don't sleep less than 1 min
        if (sleep.compareTo(MIN_RESCHEDULE_DELAY) < 0) {
            schedule(MIN_RESCHEDULE_DELAY);
        } else {
            schedule(sleep);
        }
    }


    /** Timer task that runs one poll on the poller it was created for. */
    private static class PollTask extends TimerTask {
        private final SystemPoller poller;

        PollTask(SystemPoller poller) {
            this.poller = poller;
        }

        @Override
        public void run() {
            poller.poll();
        }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy