/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/*
* Modifications Copyright OpenSearch Contributors. See
* GitHub history for details.
*/
package org.opensearch.cluster;
import org.apache.logging.log4j.Level;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.message.ParameterizedMessage;
import org.opensearch.action.LatchedActionListener;
import org.opensearch.action.admin.cluster.node.stats.NodeStats;
import org.opensearch.action.admin.cluster.node.stats.NodesStatsRequest;
import org.opensearch.action.admin.cluster.node.stats.NodesStatsResponse;
import org.opensearch.action.admin.indices.stats.IndicesStatsRequest;
import org.opensearch.action.admin.indices.stats.IndicesStatsResponse;
import org.opensearch.action.admin.indices.stats.ShardStats;
import org.opensearch.action.support.IndicesOptions;
import org.opensearch.client.Client;
import org.opensearch.cluster.block.ClusterBlockException;
import org.opensearch.cluster.node.DiscoveryNode;
import org.opensearch.cluster.routing.ShardRouting;
import org.opensearch.cluster.routing.allocation.DiskThresholdSettings;
import org.opensearch.cluster.service.ClusterService;
import org.opensearch.common.settings.ClusterSettings;
import org.opensearch.common.settings.Setting;
import org.opensearch.common.settings.Setting.Property;
import org.opensearch.common.settings.Settings;
import org.opensearch.common.unit.TimeValue;
import org.opensearch.common.util.concurrent.AbstractRunnable;
import org.opensearch.core.action.ActionListener;
import org.opensearch.core.concurrency.OpenSearchRejectedExecutionException;
import org.opensearch.index.store.StoreStats;
import org.opensearch.index.store.remote.filecache.FileCacheStats;
import org.opensearch.monitor.fs.FsInfo;
import org.opensearch.threadpool.ThreadPool;
import org.opensearch.transport.ReceiveTimeoutTransportException;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Consumer;
import java.util.stream.Collectors;
/**
* InternalClusterInfoService provides the ClusterInfoService interface,
* routinely updated on a timer. The timer can be dynamically changed by
 * setting the <code>cluster.info.update.interval</code> setting (defaulting
* to 30 seconds). The InternalClusterInfoService only runs on the cluster-manager node.
* Listens for changes in the number of data nodes and immediately submits a
* ClusterInfoUpdateJob if a node has been added.
 * <p>
* Every time the timer runs, gathers information about the disk usage and
* shard sizes across the cluster.
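 * <p>
 * A minimal sketch of changing the interval at runtime; the {@code client}
 * reference is an assumption for illustration, not part of this class:
 * <pre>{@code
 * client.admin().cluster().prepareUpdateSettings()
 *     .setTransientSettings(Settings.builder().put("cluster.info.update.interval", "1m"))
 *     .get();
 * }</pre>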
*
* @opensearch.internal
*/
public class InternalClusterInfoService implements ClusterInfoService, ClusterStateListener {
private static final Logger logger = LogManager.getLogger(InternalClusterInfoService.class);
private static final String REFRESH_EXECUTOR = ThreadPool.Names.MANAGEMENT;
public static final Setting<TimeValue> INTERNAL_CLUSTER_INFO_UPDATE_INTERVAL_SETTING = Setting.timeSetting(
"cluster.info.update.interval",
TimeValue.timeValueSeconds(30),
TimeValue.timeValueSeconds(10),
Property.Dynamic,
Property.NodeScope
);
public static final Setting<TimeValue> INTERNAL_CLUSTER_INFO_TIMEOUT_SETTING = Setting.positiveTimeSetting(
"cluster.info.update.timeout",
TimeValue.timeValueSeconds(15),
Property.Dynamic,
Property.NodeScope
);
private volatile TimeValue updateFrequency;
private volatile Map<String, DiskUsage> leastAvailableSpaceUsages;
private volatile Map<String, DiskUsage> mostAvailableSpaceUsages;
private volatile Map<String, FileCacheStats> nodeFileCacheStats;
private volatile IndicesStatsSummary indicesStatsSummary;
// null if this node is not currently the cluster-manager
private final AtomicReference<RefreshAndRescheduleRunnable> refreshAndRescheduleRunnable = new AtomicReference<>();
private volatile boolean enabled;
private volatile TimeValue fetchTimeout;
private final ThreadPool threadPool;
private final Client client;
private final List<Consumer<ClusterInfo>> listeners = new CopyOnWriteArrayList<>();
public InternalClusterInfoService(Settings settings, ClusterService clusterService, ThreadPool threadPool, Client client) {
this.leastAvailableSpaceUsages = Map.of();
this.mostAvailableSpaceUsages = Map.of();
this.nodeFileCacheStats = Map.of();
this.indicesStatsSummary = IndicesStatsSummary.EMPTY;
this.threadPool = threadPool;
this.client = client;
this.updateFrequency = INTERNAL_CLUSTER_INFO_UPDATE_INTERVAL_SETTING.get(settings);
this.fetchTimeout = INTERNAL_CLUSTER_INFO_TIMEOUT_SETTING.get(settings);
this.enabled = DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING.get(settings);
ClusterSettings clusterSettings = clusterService.getClusterSettings();
clusterSettings.addSettingsUpdateConsumer(INTERNAL_CLUSTER_INFO_TIMEOUT_SETTING, this::setFetchTimeout);
clusterSettings.addSettingsUpdateConsumer(INTERNAL_CLUSTER_INFO_UPDATE_INTERVAL_SETTING, this::setUpdateFrequency);
clusterSettings.addSettingsUpdateConsumer(
DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING,
this::setEnabled
);
}
private void setEnabled(boolean enabled) {
this.enabled = enabled;
}
private void setFetchTimeout(TimeValue fetchTimeout) {
this.fetchTimeout = fetchTimeout;
}
void setUpdateFrequency(TimeValue updateFrequency) {
this.updateFrequency = updateFrequency;
}
@Override
public void clusterChanged(ClusterChangedEvent event) {
if (event.localNodeClusterManager() && refreshAndRescheduleRunnable.get() == null) {
logger.trace("elected as cluster-manager, scheduling cluster info update tasks");
executeRefresh(event.state(), "became cluster-manager");
final RefreshAndRescheduleRunnable newRunnable = new RefreshAndRescheduleRunnable();
refreshAndRescheduleRunnable.set(newRunnable);
threadPool.scheduleUnlessShuttingDown(updateFrequency, REFRESH_EXECUTOR, newRunnable);
} else if (event.localNodeClusterManager() == false) {
refreshAndRescheduleRunnable.set(null);
return;
}
if (enabled == false) {
return;
}
// Refresh if a data node was added
for (DiscoveryNode addedNode : event.nodesDelta().addedNodes()) {
if (addedNode.isDataNode()) {
executeRefresh(event.state(), "data node added");
break;
}
}
// Clean up info for any removed nodes
for (DiscoveryNode removedNode : event.nodesDelta().removedNodes()) {
if (removedNode.isDataNode()) {
logger.trace("Removing node from cluster info: {}", removedNode.getId());
if (leastAvailableSpaceUsages.containsKey(removedNode.getId())) {
Map<String, DiskUsage> newMaxUsages = new HashMap<>(leastAvailableSpaceUsages);
newMaxUsages.remove(removedNode.getId());
leastAvailableSpaceUsages = Collections.unmodifiableMap(newMaxUsages);
}
if (mostAvailableSpaceUsages.containsKey(removedNode.getId())) {
Map<String, DiskUsage> newMinUsages = new HashMap<>(mostAvailableSpaceUsages);
newMinUsages.remove(removedNode.getId());
mostAvailableSpaceUsages = Collections.unmodifiableMap(newMinUsages);
}
}
}
}
private void executeRefresh(ClusterState clusterState, String reason) {
if (clusterState.nodes().getDataNodes().size() > 1) {
logger.trace("refreshing cluster info in background [{}]", reason);
threadPool.executor(REFRESH_EXECUTOR).execute(new RefreshRunnable(reason));
}
}
@Override
public ClusterInfo getClusterInfo() {
final IndicesStatsSummary indicesStatsSummary = this.indicesStatsSummary; // single volatile read
return new ClusterInfo(
leastAvailableSpaceUsages,
mostAvailableSpaceUsages,
indicesStatsSummary.shardSizes,
indicesStatsSummary.shardRoutingToDataPath,
indicesStatsSummary.reservedSpace,
nodeFileCacheStats
);
}
/**
* Retrieve the latest nodes stats, calling the listener when complete
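 * <p>
 * A minimal sketch of a caller blocking on the returned latch; the listener
 * body and the timeout value are assumptions for illustration:
 * <pre>{@code
 * CountDownLatch latch = updateNodeStats(ActionListener.wrap(
 *     response -> logger.trace("node stats refreshed"),
 *     e -> logger.warn("node stats refresh failed", e)
 * ));
 * if (latch.await(15, TimeUnit.SECONDS) == false) {
 *     logger.warn("node stats did not arrive in time");
 * }
 * }</pre>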
* @return a latch that can be used to wait for the nodes stats to complete if desired
*/
protected CountDownLatch updateNodeStats(final ActionListener<NodesStatsResponse> listener) {
final CountDownLatch latch = new CountDownLatch(1);
final NodesStatsRequest nodesStatsRequest = new NodesStatsRequest("data:true");
nodesStatsRequest.clear();
nodesStatsRequest.addMetric(NodesStatsRequest.Metric.FS.metricName());
nodesStatsRequest.addMetric(NodesStatsRequest.Metric.FILE_CACHE_STATS.metricName());
nodesStatsRequest.timeout(fetchTimeout);
client.admin().cluster().nodesStats(nodesStatsRequest, new LatchedActionListener<>(listener, latch));
return latch;
}
/**
* Retrieve the latest indices stats, calling the listener when complete
* @return a latch that can be used to wait for the indices stats to complete if desired
*/
protected CountDownLatch updateIndicesStats(final ActionListener<IndicesStatsResponse> listener) {
final CountDownLatch latch = new CountDownLatch(1);
final IndicesStatsRequest indicesStatsRequest = new IndicesStatsRequest();
indicesStatsRequest.clear();
indicesStatsRequest.store(true);
indicesStatsRequest.indicesOptions(IndicesOptions.STRICT_EXPAND_OPEN_CLOSED_HIDDEN);
client.admin().indices().stats(indicesStatsRequest, new LatchedActionListener<>(listener, latch));
return latch;
}
// allow tests to adjust the node stats on receipt
List<NodeStats> adjustNodesStats(List<NodeStats> nodeStats) {
return nodeStats;
}
/**
* Refreshes the ClusterInfo in a blocking fashion
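 * <p>
 * A hedged usage sketch; how the {@code clusterInfoService} reference is
 * obtained (e.g. injected at node construction) is assumed:
 * <pre>{@code
 * ClusterInfo info = clusterInfoService.refresh();
 * info.getNodeLeastAvailableDiskUsages()
 *     .forEach((nodeId, usage) -> logger.trace("{} -> {}", nodeId, usage));
 * }</pre>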
*/
public final ClusterInfo refresh() {
logger.trace("refreshing cluster info");
final CountDownLatch nodeLatch = updateNodeStats(new ActionListener<NodesStatsResponse>() {
@Override
public void onResponse(NodesStatsResponse nodesStatsResponse) {
final Map<String, DiskUsage> leastAvailableUsagesBuilder = new HashMap<>();
final Map<String, DiskUsage> mostAvailableUsagesBuilder = new HashMap<>();
fillDiskUsagePerNode(
logger,
adjustNodesStats(nodesStatsResponse.getNodes()),
leastAvailableUsagesBuilder,
mostAvailableUsagesBuilder
);
leastAvailableSpaceUsages = Collections.unmodifiableMap(leastAvailableUsagesBuilder);
mostAvailableSpaceUsages = Collections.unmodifiableMap(mostAvailableUsagesBuilder);
nodeFileCacheStats = Collections.unmodifiableMap(
nodesStatsResponse.getNodes()
.stream()
.filter(nodeStats -> nodeStats.getNode().isSearchNode())
.collect(Collectors.toMap(nodeStats -> nodeStats.getNode().getId(), NodeStats::getFileCacheStats))
);
}
@Override
public void onFailure(Exception e) {
if (e instanceof ReceiveTimeoutTransportException) {
logger.error("NodeStatsAction timed out for ClusterInfoUpdateJob", e);
} else {
if (e instanceof ClusterBlockException) {
if (logger.isTraceEnabled()) {
logger.trace("Failed to execute NodeStatsAction for ClusterInfoUpdateJob", e);
}
} else {
logger.warn("Failed to execute NodeStatsAction for ClusterInfoUpdateJob", e);
}
// we empty the usages list, to be safe - we don't know what's going on.
leastAvailableSpaceUsages = Map.of();
mostAvailableSpaceUsages = Map.of();
}
}
});
final CountDownLatch indicesLatch = updateIndicesStats(new ActionListener<>() {
@Override
public void onResponse(IndicesStatsResponse indicesStatsResponse) {
final ShardStats[] stats = indicesStatsResponse.getShards();
final Map<String, Long> shardSizeByIdentifierBuilder = new HashMap<>();
final Map<ShardRouting, String> dataPathByShardRoutingBuilder = new HashMap<>();
final Map<ClusterInfo.NodeAndPath, ClusterInfo.ReservedSpace.Builder> reservedSpaceBuilders = new HashMap<>();
buildShardLevelInfo(logger, stats, shardSizeByIdentifierBuilder, dataPathByShardRoutingBuilder, reservedSpaceBuilders);
final Map<ClusterInfo.NodeAndPath, ClusterInfo.ReservedSpace> rsrvdSpace = new HashMap<>();
reservedSpaceBuilders.forEach((nodeAndPath, builder) -> rsrvdSpace.put(nodeAndPath, builder.build()));
indicesStatsSummary = new IndicesStatsSummary(shardSizeByIdentifierBuilder, dataPathByShardRoutingBuilder, rsrvdSpace);
}
@Override
public void onFailure(Exception e) {
if (e instanceof ReceiveTimeoutTransportException) {
logger.error("IndicesStatsAction timed out for ClusterInfoUpdateJob", e);
} else {
if (e instanceof ClusterBlockException) {
if (logger.isTraceEnabled()) {
logger.trace("Failed to execute IndicesStatsAction for ClusterInfoUpdateJob", e);
}
} else {
logger.warn("Failed to execute IndicesStatsAction for ClusterInfoUpdateJob", e);
}
// we empty the usages list, to be safe - we don't know what's going on.
indicesStatsSummary = IndicesStatsSummary.EMPTY;
}
}
});
try {
if (nodeLatch.await(fetchTimeout.getMillis(), TimeUnit.MILLISECONDS) == false) {
logger.warn("Failed to update node information for ClusterInfoUpdateJob within {} timeout", fetchTimeout);
}
} catch (InterruptedException e) {
Thread.currentThread().interrupt(); // restore interrupt status
}
try {
if (indicesLatch.await(fetchTimeout.getMillis(), TimeUnit.MILLISECONDS) == false) {
logger.warn("Failed to update shard information for ClusterInfoUpdateJob within {} timeout", fetchTimeout);
}
} catch (InterruptedException e) {
Thread.currentThread().interrupt(); // restore interrupt status
}
ClusterInfo clusterInfo = getClusterInfo();
boolean anyListeners = false;
for (final Consumer<ClusterInfo> listener : listeners) {
anyListeners = true;
try {
logger.trace("notifying [{}] of new cluster info", listener);
listener.accept(clusterInfo);
} catch (Exception e) {
logger.info(new ParameterizedMessage("failed to notify [{}] of new cluster info", listener), e);
}
}
assert anyListeners : "expected to notify at least one listener";
return clusterInfo;
}
@Override
public void addListener(Consumer<ClusterInfo> clusterInfoConsumer) {
listeners.add(clusterInfoConsumer);
}
static void buildShardLevelInfo(
Logger logger,
ShardStats[] stats,
final Map<String, Long> shardSizes,
final Map<ShardRouting, String> newShardRoutingToDataPath,
final Map<ClusterInfo.NodeAndPath, ClusterInfo.ReservedSpace.Builder> reservedSpaceByShard
) {
for (ShardStats s : stats) {
final ShardRouting shardRouting = s.getShardRouting();
newShardRoutingToDataPath.put(shardRouting, s.getDataPath());
final StoreStats storeStats = s.getStats().getStore();
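// store stats may be absent (e.g. if the shard has not fully started yet); skip such shards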
if (storeStats == null) {
continue;
}
final long size = storeStats.sizeInBytes();
final long reserved = storeStats.getReservedSize().getBytes();
final String shardIdentifier = ClusterInfo.shardIdentifierFromRouting(shardRouting);
logger.trace("shard: {} size: {} reserved: {}", shardIdentifier, size, reserved);
shardSizes.put(shardIdentifier, size);
if (reserved != StoreStats.UNKNOWN_RESERVED_BYTES) {
final ClusterInfo.ReservedSpace.Builder reservedSpaceBuilder = reservedSpaceByShard.computeIfAbsent(
new ClusterInfo.NodeAndPath(shardRouting.currentNodeId(), s.getDataPath()),
t -> new ClusterInfo.ReservedSpace.Builder()
);
reservedSpaceBuilder.add(shardRouting.shardId(), reserved);
}
}
}
static void fillDiskUsagePerNode(
Logger logger,
List<NodeStats> nodeStatsArray,
final Map<String, DiskUsage> newLeastAvailableUsages,
final Map<String, DiskUsage> newMostAvailableUsages
) {
for (NodeStats nodeStats : nodeStatsArray) {
if (nodeStats.getFs() == null) {
logger.warn("Unable to retrieve node FS stats for {}", nodeStats.getNode().getName());
} else {
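// scan every data path on the node, tracking the paths with the least and the most available bytes;
// these two extremes are what disk-based shard allocation decisions consume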
FsInfo.Path leastAvailablePath = null;
FsInfo.Path mostAvailablePath = null;
for (FsInfo.Path info : nodeStats.getFs()) {
if (leastAvailablePath == null) {
assert mostAvailablePath == null;
mostAvailablePath = leastAvailablePath = info;
} else if (leastAvailablePath.getAvailable().getBytes() > info.getAvailable().getBytes()) {
leastAvailablePath = info;
} else if (mostAvailablePath.getAvailable().getBytes() < info.getAvailable().getBytes()) {
mostAvailablePath = info;
}
}
String nodeId = nodeStats.getNode().getId();
String nodeName = nodeStats.getNode().getName();
if (logger.isTraceEnabled()) {
logger.trace(
"node: [{}], most available: total disk: {},"
+ " available disk: {} / least available: total disk: {}, available disk: {}",
nodeId,
mostAvailablePath.getTotal(),
mostAvailablePath.getAvailable(),
leastAvailablePath.getTotal(),
leastAvailablePath.getAvailable()
);
}
if (leastAvailablePath.getTotal().getBytes() < 0) {
if (logger.isTraceEnabled()) {
logger.trace(
"node: [{}] least available path has less than 0 total bytes of disk [{}], skipping",
nodeId,
leastAvailablePath.getTotal().getBytes()
);
}
} else {
newLeastAvailableUsages.put(
nodeId,
new DiskUsage(
nodeId,
nodeName,
leastAvailablePath.getPath(),
leastAvailablePath.getTotal().getBytes(),
leastAvailablePath.getAvailable().getBytes()
)
);
}
if (mostAvailablePath.getTotal().getBytes() < 0) {
if (logger.isTraceEnabled()) {
logger.trace(
"node: [{}] most available path has less than 0 total bytes of disk [{}], skipping",
nodeId,
mostAvailablePath.getTotal().getBytes()
);
}
} else {
newMostAvailableUsages.put(
nodeId,
new DiskUsage(
nodeId,
nodeName,
mostAvailablePath.getPath(),
mostAvailablePath.getTotal().getBytes(),
mostAvailablePath.getAvailable().getBytes()
)
);
}
}
}
}
/**
* Indices statistics summary.
*
* @opensearch.internal
*/
private static class IndicesStatsSummary {
static final IndicesStatsSummary EMPTY = new IndicesStatsSummary(Map.of(), Map.of(), Map.of());
final Map<String, Long> shardSizes;
final Map<ShardRouting, String> shardRoutingToDataPath;
final Map<ClusterInfo.NodeAndPath, ClusterInfo.ReservedSpace> reservedSpace;
IndicesStatsSummary(
final Map<String, Long> shardSizes,
final Map<ShardRouting, String> shardRoutingToDataPath,
final Map<ClusterInfo.NodeAndPath, ClusterInfo.ReservedSpace> reservedSpace
) {
this.shardSizes = shardSizes;
this.shardRoutingToDataPath = shardRoutingToDataPath;
this.reservedSpace = reservedSpace;
}
}
/**
* Runs {@link InternalClusterInfoService#refresh()}, logging failures/rejections appropriately.
*
* @opensearch.internal
*/
private class RefreshRunnable extends AbstractRunnable {
private final String reason;
RefreshRunnable(String reason) {
this.reason = reason;
}
@Override
protected void doRun() {
if (enabled) {
logger.trace("refreshing cluster info [{}]", reason);
refresh();
} else {
logger.trace("skipping cluster info refresh [{}] since it is disabled", reason);
}
}
@Override
public void onFailure(Exception e) {
logger.warn(new ParameterizedMessage("refreshing cluster info failed [{}]", reason), e);
}
@Override
public void onRejection(Exception e) {
final boolean shutDown = e instanceof OpenSearchRejectedExecutionException
&& ((OpenSearchRejectedExecutionException) e).isExecutorShutdown();
logger.log(shutDown ? Level.DEBUG : Level.WARN, "refreshing cluster info rejected [{}]", reason, e);
}
}
/**
* Runs {@link InternalClusterInfoService#refresh()}, logging failures/rejections appropriately, and reschedules itself on completion.
*
* @opensearch.internal
*/
private class RefreshAndRescheduleRunnable extends RefreshRunnable {
RefreshAndRescheduleRunnable() {
super("scheduled");
}
@Override
protected void doRun() {
if (this == refreshAndRescheduleRunnable.get()) {
super.doRun();
} else {
logger.trace("cluster-manager changed, scheduled refresh job is stale");
}
}
@Override
public void onAfter() {
if (this == refreshAndRescheduleRunnable.get()) {
logger.trace("scheduling next cluster info refresh in [{}]", updateFrequency);
threadPool.scheduleUnlessShuttingDown(updateFrequency, REFRESH_EXECUTOR, this);
}
}
}
}