All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.elasticsearch.monitor.fs.FsHealthService Maven / Gradle / Ivy

There is a newer version: 8.16.0
Show newest version
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0 and the Server Side Public License, v 1; you may not use this file except
 * in compliance with, at your election, the Elastic License 2.0 or the Server
 * Side Public License, v 1.
 */

package org.elasticsearch.monitor.fs;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.message.ParameterizedMessage;
import org.elasticsearch.common.UUIDs;
import org.elasticsearch.common.component.AbstractLifecycleComponent;
import org.elasticsearch.common.settings.ClusterSettings;
import org.elasticsearch.common.settings.Setting;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.core.Nullable;
import org.elasticsearch.core.TimeValue;
import org.elasticsearch.core.internal.io.IOUtils;
import org.elasticsearch.env.NodeEnvironment;
import org.elasticsearch.monitor.NodeHealthService;
import org.elasticsearch.monitor.StatusInfo;
import org.elasticsearch.threadpool.Scheduler;
import org.elasticsearch.threadpool.ThreadPool;

import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.HashSet;
import java.util.Set;
import java.util.function.LongSupplier;
import java.util.stream.Collectors;

import static org.elasticsearch.monitor.StatusInfo.Status.HEALTHY;
import static org.elasticsearch.monitor.StatusInfo.Status.UNHEALTHY;

/**
 * Runs periodically and attempts to create a temp file to see if the filesystem is writable. If not then it marks the path as unhealthy.
 */
public class FsHealthService extends AbstractLifecycleComponent implements NodeHealthService {

    private static final Logger logger = LogManager.getLogger(FsHealthService.class);
    private final ThreadPool threadPool;
    private volatile boolean enabled;
    private volatile boolean brokenLock;
    private final TimeValue refreshInterval;
    private volatile TimeValue slowPathLoggingThreshold;
    private final NodeEnvironment nodeEnv;
    private final LongSupplier currentTimeMillisSupplier;
    private volatile Scheduler.Cancellable scheduledFuture;

    @Nullable
    private volatile Set unhealthyPaths;

    public static final Setting ENABLED_SETTING = Setting.boolSetting(
        "monitor.fs.health.enabled",
        true,
        Setting.Property.NodeScope,
        Setting.Property.Dynamic
    );
    public static final Setting REFRESH_INTERVAL_SETTING = Setting.timeSetting(
        "monitor.fs.health.refresh_interval",
        TimeValue.timeValueSeconds(120),
        TimeValue.timeValueMillis(1),
        Setting.Property.NodeScope
    );
    public static final Setting SLOW_PATH_LOGGING_THRESHOLD_SETTING = Setting.timeSetting(
        "monitor.fs.health.slow_path_logging_threshold",
        TimeValue.timeValueSeconds(5),
        TimeValue.timeValueMillis(1),
        Setting.Property.NodeScope,
        Setting.Property.Dynamic
    );

    public FsHealthService(Settings settings, ClusterSettings clusterSettings, ThreadPool threadPool, NodeEnvironment nodeEnv) {
        this.threadPool = threadPool;
        this.enabled = ENABLED_SETTING.get(settings);
        this.refreshInterval = REFRESH_INTERVAL_SETTING.get(settings);
        this.slowPathLoggingThreshold = SLOW_PATH_LOGGING_THRESHOLD_SETTING.get(settings);
        this.currentTimeMillisSupplier = threadPool::relativeTimeInMillis;
        this.nodeEnv = nodeEnv;
        clusterSettings.addSettingsUpdateConsumer(SLOW_PATH_LOGGING_THRESHOLD_SETTING, this::setSlowPathLoggingThreshold);
        clusterSettings.addSettingsUpdateConsumer(ENABLED_SETTING, this::setEnabled);
    }

    @Override
    protected void doStart() {
        scheduledFuture = threadPool.scheduleWithFixedDelay(new FsHealthMonitor(), refreshInterval, ThreadPool.Names.GENERIC);
    }

    @Override
    protected void doStop() {
        scheduledFuture.cancel();
    }

    @Override
    protected void doClose() {}

    public void setEnabled(boolean enabled) {
        this.enabled = enabled;
    }

    public void setSlowPathLoggingThreshold(TimeValue slowPathLoggingThreshold) {
        this.slowPathLoggingThreshold = slowPathLoggingThreshold;
    }

    @Override
    public StatusInfo getHealth() {
        StatusInfo statusInfo;
        Set unhealthyPaths = this.unhealthyPaths;
        if (enabled == false) {
            statusInfo = new StatusInfo(HEALTHY, "health check disabled");
        } else if (brokenLock) {
            statusInfo = new StatusInfo(UNHEALTHY, "health check failed due to broken node lock");
        } else if (unhealthyPaths == null) {
            statusInfo = new StatusInfo(HEALTHY, "health check passed");
        } else {
            String info = "health check failed on ["
                + unhealthyPaths.stream().map(k -> k.toString()).collect(Collectors.joining(","))
                + "]";
            statusInfo = new StatusInfo(UNHEALTHY, info);
        }

        return statusInfo;
    }

    class FsHealthMonitor implements Runnable {

        static final String TEMP_FILE_NAME = ".es_temp_file";
        private byte[] bytesToWrite;

        FsHealthMonitor() {
            this.bytesToWrite = UUIDs.randomBase64UUID().getBytes(StandardCharsets.UTF_8);
        }

        @Override
        public void run() {
            try {
                if (enabled) {
                    monitorFSHealth();
                    logger.debug("health check succeeded");
                }
            } catch (Exception e) {
                logger.error("health check failed", e);
            }
        }

        private void monitorFSHealth() {
            Set currentUnhealthyPaths = null;
            final Path[] paths;
            try {
                paths = nodeEnv.nodeDataPaths();
            } catch (IllegalStateException e) {
                logger.error("health check failed", e);
                brokenLock = true;
                return;
            }

            for (Path path : paths) {
                final long executionStartTime = currentTimeMillisSupplier.getAsLong();
                try {
                    if (Files.exists(path)) {
                        final Path tempDataPath = path.resolve(TEMP_FILE_NAME);
                        Files.deleteIfExists(tempDataPath);
                        try (OutputStream os = Files.newOutputStream(tempDataPath, StandardOpenOption.CREATE_NEW)) {
                            os.write(bytesToWrite);
                            IOUtils.fsync(tempDataPath, false);
                        }
                        Files.delete(tempDataPath);
                        final long elapsedTime = currentTimeMillisSupplier.getAsLong() - executionStartTime;
                        if (elapsedTime > slowPathLoggingThreshold.millis()) {
                            logger.warn(
                                "health check of [{}] took [{}ms] which is above the warn threshold of [{}]",
                                path,
                                elapsedTime,
                                slowPathLoggingThreshold
                            );
                        }
                    }
                } catch (Exception ex) {
                    logger.error(new ParameterizedMessage("health check of [{}] failed", path), ex);
                    if (currentUnhealthyPaths == null) {
                        currentUnhealthyPaths = new HashSet<>(1);
                    }
                    currentUnhealthyPaths.add(path);
                }
            }
            unhealthyPaths = currentUnhealthyPaths;
            brokenLock = false;
        }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy