All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.graylog2.periodical.IndexerClusterCheckerThread Maven / Gradle / Ivy

There is a newer version: 6.1.4
Show newest version
/*
 * Copyright (C) 2020 Graylog, Inc.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the Server Side Public License, version 1,
 * as published by MongoDB, Inc.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * Server Side Public License for more details.
 *
 * You should have received a copy of the Server Side Public License
 * along with this program. If not, see
 * .
 */
package org.graylog2.periodical;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ImmutableList;
import org.graylog2.indexer.cluster.Cluster;
import org.graylog2.indexer.cluster.health.AbsoluteValueWatermarkSettings;
import org.graylog2.indexer.cluster.health.ClusterAllocationDiskSettings;
import org.graylog2.indexer.cluster.health.NodeDiskUsageStats;
import org.graylog2.indexer.cluster.health.NodeFileDescriptorStats;
import org.graylog2.indexer.cluster.health.NodeRole;
import org.graylog2.indexer.cluster.health.PercentageWatermarkSettings;
import org.graylog2.indexer.cluster.health.WatermarkSettings;
import org.graylog2.notifications.Notification;
import org.graylog2.notifications.NotificationService;
import org.graylog2.plugin.periodical.Periodical;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.inject.Inject;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

import static com.google.common.base.MoreObjects.firstNonNull;

public class IndexerClusterCheckerThread extends Periodical {
    private static final Logger LOG = LoggerFactory.getLogger(IndexerClusterCheckerThread.class);
    private static final int MINIMUM_OPEN_FILES_LIMIT = 64000;

    private final NotificationService notificationService;
    private final Cluster cluster;

    @Inject
    public IndexerClusterCheckerThread(NotificationService notificationService,
                                       Cluster cluster) {
        this.notificationService = notificationService;
        this.cluster = cluster;
    }

    @Override
    public void doRun() {
        if (cluster.health().isEmpty()) {
            LOG.info("Indexer not fully initialized yet. Skipping periodic cluster check.");
            return;
        }
        checkOpenFiles();
        checkDiskUsage();
    }

    @VisibleForTesting
    void checkOpenFiles() {
        if (notificationExists(Notification.Type.ES_OPEN_FILES)) {
            return;
        }

        boolean allHigher = true;
        final Set fileDescriptorStats = cluster.getFileDescriptorStats();
        for (NodeFileDescriptorStats nodeFileDescriptorStats : fileDescriptorStats) {
            final String name = nodeFileDescriptorStats.name();
            final String ip = nodeFileDescriptorStats.ip();
            final String host = nodeFileDescriptorStats.host();
            final long maxFileDescriptors = nodeFileDescriptorStats.fileDescriptorMax().orElse(-1L);

            if (maxFileDescriptors != -1L && maxFileDescriptors < MINIMUM_OPEN_FILES_LIMIT) {
                // Write notification.
                final String ipOrHostName = firstNonNull(host, ip);
                final Notification notification = notificationService.buildNow()
                        .addType(Notification.Type.ES_OPEN_FILES)
                        .addSeverity(Notification.Severity.URGENT)
                        .addDetail("hostname", ipOrHostName)
                        .addDetail("max_file_descriptors", maxFileDescriptors);

                if (notificationService.publishIfFirst(notification)) {
                    LOG.warn("Indexer node <{}> ({}) open file limit is too low: [{}]. Set it to at least {}.",
                            name,
                            ipOrHostName,
                            maxFileDescriptors,
                            MINIMUM_OPEN_FILES_LIMIT);
                }
                allHigher = false;
            }
        }

        if (allHigher) {
            Notification notification = notificationService.build().addType(Notification.Type.ES_OPEN_FILES);
            notificationService.fixed(notification);
        }
    }

    @VisibleForTesting()
    void checkDiskUsage() {
        final Map> notificationTypePerNodeIdentifier = new HashMap<>();
        try {
            ClusterAllocationDiskSettings settings = cluster.getClusterAllocationDiskSettings();
            if (settings.ThresholdEnabled()) {
                final Set diskUsageStats = cluster.getDiskUsageStats();

                for (NodeDiskUsageStats nodeDiskUsageStats : diskUsageStats) {
                    if (!nodeHoldsData(nodeDiskUsageStats)) {
                        LOG.debug("Ignoring non-data ES node <{}/{}> with roles <{}> for disk usage check.",
                                nodeDiskUsageStats.host(), nodeDiskUsageStats.ip(), nodeDiskUsageStats.roles());
                        continue;
                    }
                    Notification.Type currentNodeNotificationType = null;
                    WatermarkSettings watermarkSettings = settings.watermarkSettings();
                    if (watermarkSettings instanceof PercentageWatermarkSettings) {
                        currentNodeNotificationType = getDiskUsageNotificationTypeByPercentage((PercentageWatermarkSettings) watermarkSettings, nodeDiskUsageStats);
                    } else if (watermarkSettings instanceof AbsoluteValueWatermarkSettings) {
                        currentNodeNotificationType = getDiskUsageNotificationTypeByAbsoluteValues((AbsoluteValueWatermarkSettings) watermarkSettings, nodeDiskUsageStats);
                    }
                    if (currentNodeNotificationType != null) {
                        String nodeIdentifier = firstNonNull(nodeDiskUsageStats.host(), nodeDiskUsageStats.ip());
                        notificationTypePerNodeIdentifier.merge(currentNodeNotificationType, Collections.singletonList(nodeIdentifier), (prev, cur) -> ImmutableList.builder()
                                .addAll(prev)
                                .addAll(cur)
                                .build());
                    }
                }

                if (notificationTypePerNodeIdentifier.isEmpty()) {
                    fixAllDiskUsageNotifications();
                } else {
                    publishDiskUsageNotifications(notificationTypePerNodeIdentifier);
                }
            }
        } catch (Exception e) {
            LOG.error("Error while trying to check Elasticsearch disk usage.Details: " + e.getMessage());
        }
    }

    private boolean nodeHoldsData(NodeDiskUsageStats nodeDiskUsageStats) {
        return nodeDiskUsageStats.roles().stream().anyMatch(NodeRole::holdsData);
    }

    private Notification.Type getDiskUsageNotificationTypeByPercentage(PercentageWatermarkSettings settings, NodeDiskUsageStats nodeDiskUsageStats) {
        if (settings.floodStage() != null && nodeDiskUsageStats.diskUsedPercent() >= settings.floodStage()) {
            return Notification.Type.ES_NODE_DISK_WATERMARK_FLOOD_STAGE;
        } else if (nodeDiskUsageStats.diskUsedPercent() >= settings.high()) {
            return Notification.Type.ES_NODE_DISK_WATERMARK_HIGH;
        } else if (nodeDiskUsageStats.diskUsedPercent() >= settings.low()) {
            return Notification.Type.ES_NODE_DISK_WATERMARK_LOW;
        }
        return null;
    }

    private Notification.Type getDiskUsageNotificationTypeByAbsoluteValues(AbsoluteValueWatermarkSettings settings, NodeDiskUsageStats nodeDiskUsageStats) {
        if (settings.floodStage() != null && nodeDiskUsageStats.diskAvailable().getBytes() <= settings.floodStage().getBytes()) {
            return Notification.Type.ES_NODE_DISK_WATERMARK_FLOOD_STAGE;
        } else if (nodeDiskUsageStats.diskAvailable().getBytes() <= settings.high().getBytes()) {
            return Notification.Type.ES_NODE_DISK_WATERMARK_HIGH;
        } else if (nodeDiskUsageStats.diskAvailable().getBytes() <= settings.low().getBytes()) {
            return Notification.Type.ES_NODE_DISK_WATERMARK_LOW;
        }
        return null;
    }

    private void fixAllDiskUsageNotifications() {
        notificationService.fixed(Notification.Type.ES_NODE_DISK_WATERMARK_FLOOD_STAGE);
        notificationService.fixed(Notification.Type.ES_NODE_DISK_WATERMARK_HIGH);
        notificationService.fixed(Notification.Type.ES_NODE_DISK_WATERMARK_LOW);
    }

    private void publishDiskUsageNotifications(Map> notificationTypePerNodeIdentifier) {
        for (Map.Entry> entry : notificationTypePerNodeIdentifier.entrySet()) {
            if (!notificationExists(entry.getKey())) {
                Notification notification = notificationService.buildNow()
                        .addType(entry.getKey())
                        .addSeverity(Notification.Severity.URGENT)
                        .addDetail("nodes", String.join(", ", entry.getValue()));
                notificationService.publishIfFirst(notification);
                for (String node: entry.getValue()) {
                    LOG.warn("Elasticsearch node [{}] triggered [{}] due to low free disk space",
                            node,
                            entry.getKey());
                }
            }
        }
    }

    private boolean notificationExists(Notification.Type type) {
        return !notificationService.isFirst(type);
    }

    @Override
    protected Logger getLogger() {
        return LOG;
    }

    @Override
    public boolean runsForever() {
        return false;
    }

    @Override
    public boolean stopOnGracefulShutdown() {
        return true;
    }

    @Override
    public boolean leaderOnly() {
        return true;
    }

    @Override
    public boolean startOnThisNode() {
        return true;
    }

    @Override
    public boolean isDaemon() {
        return true;
    }

    @Override
    public int getInitialDelaySeconds() {
        return 5;
    }

    @Override
    public int getPeriodSeconds() {
        return 30;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy