/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.cluster;

import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.LatchedActionListener;
import org.elasticsearch.action.admin.cluster.node.stats.NodeStats;
import org.elasticsearch.action.admin.cluster.node.stats.NodesStatsRequest;
import org.elasticsearch.action.admin.cluster.node.stats.NodesStatsResponse;
import org.elasticsearch.action.admin.cluster.node.stats.TransportNodesStatsAction;
import org.elasticsearch.action.admin.indices.stats.IndicesStatsRequest;
import org.elasticsearch.action.admin.indices.stats.IndicesStatsResponse;
import org.elasticsearch.action.admin.indices.stats.ShardStats;
import org.elasticsearch.action.admin.indices.stats.TransportIndicesStatsAction;
import org.elasticsearch.cluster.block.ClusterBlockException;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.cluster.metadata.MetaData;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.cluster.routing.ShardRouting;
import org.elasticsearch.cluster.routing.allocation.decider.DiskThresholdDecider;
import org.elasticsearch.common.component.AbstractComponent;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.logging.ESLogger;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.common.util.concurrent.EsRejectedExecutionException;
import org.elasticsearch.monitor.fs.FsInfo;
import org.elasticsearch.node.settings.NodeSettingsService;
import org.elasticsearch.threadpool.ThreadPool;
import org.elasticsearch.transport.ReceiveTimeoutTransportException;

import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;

/**
 * InternalClusterInfoService provides the {@link ClusterInfoService} interface,
 * updated routinely on a timer. The timer interval can be changed dynamically
 * via the {@code cluster.info.update.interval} setting (defaulting to 30
 * seconds). The service only runs on the elected master node. It listens for
 * changes in the number of data nodes and immediately submits a
 * ClusterInfoUpdateJob when a data node has been added.
 *
 * Every time the timer fires, the service gathers information about disk
 * usage and shard sizes across the cluster.
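 *
 * A minimal sketch of tuning the interval at runtime through the cluster
 * update settings API (values below 10 seconds are rejected):
 *
 * <pre>
 * PUT /_cluster/settings
 * {
 *   "transient": { "cluster.info.update.interval": "60s" }
 * }
 * </pre>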
 */
public class InternalClusterInfoService extends AbstractComponent implements ClusterInfoService, LocalNodeMasterListener, ClusterStateListener {

    public static final String INTERNAL_CLUSTER_INFO_UPDATE_INTERVAL = "cluster.info.update.interval";
    public static final String INTERNAL_CLUSTER_INFO_TIMEOUT = "cluster.info.update.timeout";

    private volatile TimeValue updateFrequency;

    private volatile Map<String, DiskUsage> leastAvailableSpaceUsages;
    private volatile Map<String, DiskUsage> mostAvailableSpaceUsages;
    private volatile Map<ShardRouting, String> shardRoutingToDataPath;
    private volatile Map<String, Long> shardSizes;
    private volatile boolean isMaster = false;
    private volatile boolean enabled;
    private volatile TimeValue fetchTimeout;
    private final TransportNodesStatsAction transportNodesStatsAction;
    private final TransportIndicesStatsAction transportIndicesStatsAction;
    private final ClusterService clusterService;
    private final ThreadPool threadPool;
    private final List<Listener> listeners = new CopyOnWriteArrayList<>();

    @Inject
    public InternalClusterInfoService(Settings settings, NodeSettingsService nodeSettingsService,
                                      TransportNodesStatsAction transportNodesStatsAction,
                                      TransportIndicesStatsAction transportIndicesStatsAction, ClusterService clusterService,
                                      ThreadPool threadPool) {
        super(settings);
        this.leastAvailableSpaceUsages = Collections.emptyMap();
        this.mostAvailableSpaceUsages = Collections.emptyMap();
        this.shardRoutingToDataPath = Collections.emptyMap();
        this.shardSizes = Collections.emptyMap();
        this.transportNodesStatsAction = transportNodesStatsAction;
        this.transportIndicesStatsAction = transportIndicesStatsAction;
        this.clusterService = clusterService;
        this.threadPool = threadPool;
        this.updateFrequency = settings.getAsTime(INTERNAL_CLUSTER_INFO_UPDATE_INTERVAL, TimeValue.timeValueSeconds(30));
        this.fetchTimeout = settings.getAsTime(INTERNAL_CLUSTER_INFO_TIMEOUT, TimeValue.timeValueSeconds(15));
        this.enabled = settings.getAsBoolean(DiskThresholdDecider.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED, true);
        nodeSettingsService.addListener(new ApplySettings());

        // Add InternalClusterInfoService to listen for Master changes
        this.clusterService.add((LocalNodeMasterListener)this);
        // Add to listen for state changes (when nodes are added)
        this.clusterService.add((ClusterStateListener)this);
    }

    class ApplySettings implements NodeSettingsService.Listener {
        @Override
        public void onRefreshSettings(Settings settings) {
            TimeValue newUpdateFrequency = settings.getAsTime(INTERNAL_CLUSTER_INFO_UPDATE_INTERVAL, null);
            // ClusterInfoService is only enabled if the DiskThresholdDecider is enabled
            Boolean newEnabled = settings.getAsBoolean(DiskThresholdDecider.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED, null);

            if (newUpdateFrequency != null) {
                if (newUpdateFrequency.getMillis() < TimeValue.timeValueSeconds(10).getMillis()) {
                    logger.warn("[{}] set too low [{}] (< 10s)", INTERNAL_CLUSTER_INFO_UPDATE_INTERVAL, newUpdateFrequency);
                    throw new IllegalStateException("Unable to set " + INTERNAL_CLUSTER_INFO_UPDATE_INTERVAL + " less than 10 seconds");
                } else {
                    logger.info("updating [{}] from [{}] to [{}]", INTERNAL_CLUSTER_INFO_UPDATE_INTERVAL, updateFrequency, newUpdateFrequency);
                    InternalClusterInfoService.this.updateFrequency = newUpdateFrequency;
                }
            }

            TimeValue newFetchTimeout = settings.getAsTime(INTERNAL_CLUSTER_INFO_TIMEOUT, null);
            if (newFetchTimeout != null) {
                logger.info("updating fetch timeout [{}] from [{}] to [{}]", INTERNAL_CLUSTER_INFO_TIMEOUT, fetchTimeout, newFetchTimeout);
                InternalClusterInfoService.this.fetchTimeout = newFetchTimeout;
            }


            // We don't log about enabling it here, because the DiskThresholdDecider will already be logging about enable/disable
            if (newEnabled != null) {
                InternalClusterInfoService.this.enabled = newEnabled;
            }
        }
    }

    @Override
    public void onMaster() {
        this.isMaster = true;
        if (logger.isTraceEnabled()) {
            logger.trace("I have been elected master, scheduling a ClusterInfoUpdateJob");
        }
        try {
            // Submit a job that will start after updateFrequency, and reschedule itself after running
            threadPool.schedule(updateFrequency, executorName(), new SubmitReschedulingClusterInfoUpdatedJob());
            if (clusterService.state().getNodes().getDataNodes().size() > 1) {
                // Submit an info update job to be run immediately
                threadPool.executor(executorName()).execute(new Runnable() {
                    @Override
                    public void run() {
                        maybeRefresh();
                    }
                });
            }
        } catch (EsRejectedExecutionException ex) {
            if (logger.isDebugEnabled()) {
                logger.debug("Couldn't schedule cluster info update task - node might be shutting down", ex);
            }
        }
    }

    @Override
    public void offMaster() {
        this.isMaster = false;
    }

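    // The MANAGEMENT pool (a scaling pool for cluster-management work) hosts both the
    // periodic reschedule and the immediate refreshes, keeping these potentially slow
    // stats calls off time-critical thread pools.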
    @Override
    public String executorName() {
        return ThreadPool.Names.MANAGEMENT;
    }

    @Override
    public void clusterChanged(ClusterChangedEvent event) {
        if (!this.enabled) {
            return;
        }

        // Check whether it was a data node that was added
        boolean dataNodeAdded = false;
        for (DiscoveryNode addedNode : event.nodesDelta().addedNodes()) {
            if (addedNode.dataNode()) {
                dataNodeAdded = true;
                break;
            }
        }

        if (this.isMaster && dataNodeAdded && clusterService.state().getNodes().getDataNodes().size() > 1) {
            if (logger.isDebugEnabled()) {
                logger.debug("data node was added, retrieving new cluster info");
            }
            threadPool.executor(executorName()).execute(new Runnable() {
                @Override
                public void run() {
                    maybeRefresh();
                }
            });
        }

        if (this.isMaster && event.nodesRemoved()) {
            for (DiscoveryNode removedNode : event.nodesDelta().removedNodes()) {
                if (removedNode.dataNode()) {
                    if (logger.isTraceEnabled()) {
                        logger.trace("Removing node from cluster info: {}", removedNode.getId());
                    }
                    if (leastAvailableSpaceUsages.containsKey(removedNode.getId())) {
                        Map newMaxUsages = new HashMap<>(leastAvailableSpaceUsages);
                        newMaxUsages.remove(removedNode.getId());
                        leastAvailableSpaceUsages = Collections.unmodifiableMap(newMaxUsages);
                    }
                    if (mostAvailableSpaceUsages.containsKey(removedNode.getId())) {
                        Map newMinUsages = new HashMap<>(mostAvailableSpaceUsages);
                        newMinUsages.remove(removedNode.getId());
                        mostAvailableSpaceUsages = Collections.unmodifiableMap(newMinUsages);
                    }
                }
            }
        }
    }

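    // Returns an immutable point-in-time snapshot: the volatile maps may be swapped
    // wholesale by a concurrent refresh, but they are never mutated in place.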
    @Override
    public ClusterInfo getClusterInfo() {
        return new ClusterInfo(leastAvailableSpaceUsages, mostAvailableSpaceUsages, shardSizes, shardRoutingToDataPath);
    }

    @Override
    public void addListener(Listener listener) {
        this.listeners.add(listener);
    }

    /**
     * Runnable used to submit {@link #maybeRefresh()} to the
     * {@link InternalClusterInfoService} threadpool; these jobs
     * reschedule themselves by placing a new instance of this class onto the
     * scheduled threadpool.
     */
    public class SubmitReschedulingClusterInfoUpdatedJob implements Runnable {
        @Override
        public void run() {
            if (logger.isTraceEnabled()) {
                logger.trace("Submitting new rescheduling cluster info update job");
            }
            try {
                threadPool.executor(executorName()).execute(new Runnable() {
                    @Override
                    public void run() {
                        try {
                            maybeRefresh();
                        } finally { //schedule again after we refreshed
                            if (isMaster) {
                                if (logger.isTraceEnabled()) {
                                    logger.trace("Scheduling next run for updating cluster info in: {}", updateFrequency.toString());
                                }
                                try {
                                    threadPool.schedule(updateFrequency, executorName(), this);
                                } catch (EsRejectedExecutionException ex) {
                                    logger.debug("Reschedule cluster info service was rejected", ex);
                                }
                            }
                        }
                    }
                });
            } catch (EsRejectedExecutionException ex) {
                if (logger.isDebugEnabled()) {
                    logger.debug("Couldn't re-schedule cluster info update task - node might be shutting down", ex);
                }
            }
        }
    }

    /**
     * Retrieve the latest nodes stats, calling the listener when complete
     * @return a latch that can be used to wait for the nodes stats to complete if desired
     */
    protected CountDownLatch updateNodeStats(final ActionListener<NodesStatsResponse> listener) {
        final CountDownLatch latch = new CountDownLatch(1);
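        // "data:true" restricts the request to data nodes, the only nodes whose
        // file-system usage matters for shard allocation decisions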
        final NodesStatsRequest nodesStatsRequest = new NodesStatsRequest("data:true");
        nodesStatsRequest.clear();
        nodesStatsRequest.fs(true);
        nodesStatsRequest.timeout(fetchTimeout);

        transportNodesStatsAction.execute(nodesStatsRequest, new LatchedActionListener<>(listener, latch));
        return latch;
    }

    /**
     * Retrieve the latest indices stats, calling the listener when complete
     * @return a latch that can be used to wait for the indices stats to complete if desired
     */
    protected CountDownLatch updateIndicesStats(final ActionListener<IndicesStatsResponse> listener) {
        final CountDownLatch latch = new CountDownLatch(1);
        final IndicesStatsRequest indicesStatsRequest = new IndicesStatsRequest();
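        // clear() drops the default stats flags; only store stats (on-disk size) are needed here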
        indicesStatsRequest.clear();
        indicesStatsRequest.store(true);

        transportIndicesStatsAction.execute(indicesStatsRequest, new LatchedActionListener<>(listener, latch));
        return latch;
    }

    private void maybeRefresh() {
        // Short-circuit if not enabled
        if (enabled) {
            refresh();
        } else {
            if (logger.isTraceEnabled()) {
                logger.trace("Skipping ClusterInfoUpdatedJob since it is disabled");
            }
        }
    }

    /**
     * Refreshes the ClusterInfo in a blocking fashion, waiting up to
     * {@code fetchTimeout} for each of the node-stats and indices-stats
     * responses before notifying listeners and returning the result.
     */
    public final ClusterInfo refresh() {
        if (logger.isTraceEnabled()) {
            logger.trace("Performing ClusterInfoUpdateJob");
        }
        final CountDownLatch nodeLatch = updateNodeStats(new ActionListener<NodesStatsResponse>() {
            @Override
            public void onResponse(NodesStatsResponse nodesStatsResponse) {
                Map<String, DiskUsage> newLeastAvailableUsages = new HashMap<>();
                Map<String, DiskUsage> newMostAvailableUsages = new HashMap<>();
                fillDiskUsagePerNode(logger, nodesStatsResponse.getNodes(), newLeastAvailableUsages, newMostAvailableUsages);
                leastAvailableSpaceUsages = Collections.unmodifiableMap(newLeastAvailableUsages);
                mostAvailableSpaceUsages = Collections.unmodifiableMap(newMostAvailableUsages);
            }

            @Override
            public void onFailure(Throwable e) {
                if (e instanceof ReceiveTimeoutTransportException) {
                    logger.error("NodeStatsAction timed out for ClusterInfoUpdateJob (reason [{}])", e.getMessage());
                } else {
                    if (e instanceof ClusterBlockException) {
                        if (logger.isTraceEnabled()) {
                            logger.trace("Failed to execute NodeStatsAction for ClusterInfoUpdateJob", e);
                        }
                    } else {
                        logger.warn("Failed to execute NodeStatsAction for ClusterInfoUpdateJob", e);
                    }
                    // we empty the usage maps, to be safe - we don't know what's going on.
                    leastAvailableSpaceUsages = Collections.emptyMap();
                    mostAvailableSpaceUsages = Collections.emptyMap();
                }
            }
        });

        final CountDownLatch indicesLatch = updateIndicesStats(new ActionListener<IndicesStatsResponse>() {
            @Override
            public void onResponse(IndicesStatsResponse indicesStatsResponse) {
                ShardStats[] stats = indicesStatsResponse.getShards();
                final HashMap<String, Long> newShardSizes = new HashMap<>();
                final HashMap<ShardRouting, String> newShardRoutingToDataPath = new HashMap<>();
                buildShardLevelInfo(logger, stats, newShardSizes, newShardRoutingToDataPath, clusterService.state());
                shardSizes = Collections.unmodifiableMap(newShardSizes);
                shardRoutingToDataPath = Collections.unmodifiableMap(newShardRoutingToDataPath);
            }

            @Override
            public void onFailure(Throwable e) {
                if (e instanceof ReceiveTimeoutTransportException) {
                    logger.error("IndicesStatsAction timed out for ClusterInfoUpdateJob (reason [{}])", e.getMessage());
                } else {
                    if (e instanceof ClusterBlockException) {
                        if (logger.isTraceEnabled()) {
                            logger.trace("Failed to execute IndicesStatsAction for ClusterInfoUpdateJob", e);
                        }
                    } else {
                        logger.warn("Failed to execute IndicesStatsAction for ClusterInfoUpdateJob", e);
                    }
                    // we clear the shard size and data path maps, to be safe - we don't know what's going on.
                    shardSizes = Collections.emptyMap();
                    shardRoutingToDataPath = Collections.emptyMap();
                }
            }
        });

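        // Block (bounded by fetchTimeout) until both responses arrive. A timeout does
        // not cancel the in-flight requests; a late response may still swap the
        // volatile maps after this method returns.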
        try {
            nodeLatch.await(fetchTimeout.getMillis(), TimeUnit.MILLISECONDS);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt(); // restore interrupt status
            logger.warn("Failed to update node information for ClusterInfoUpdateJob within {} timeout", fetchTimeout);
        }

        try {
            indicesLatch.await(fetchTimeout.getMillis(), TimeUnit.MILLISECONDS);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt(); // restore interrupt status
            logger.warn("Failed to update shard information for ClusterInfoUpdateJob within {} timeout", fetchTimeout);
        }
        ClusterInfo clusterInfo = getClusterInfo();
        for (Listener l : listeners) {
            try {
                l.onNewInfo(clusterInfo);
            } catch (Exception e) {
                logger.info("Failed executing ClusterInfoService listener", e);
            }
        }
        return clusterInfo;
    }

    static void buildShardLevelInfo(ESLogger logger, ShardStats[] stats, HashMap<String, Long> newShardSizes, HashMap<ShardRouting, String> newShardRoutingToDataPath, ClusterState state) {
        MetaData meta = state.getMetaData();
        for (ShardStats s : stats) {
            IndexMetaData indexMeta = meta.index(s.getShardRouting().index());
            Settings indexSettings = indexMeta == null ? null : indexMeta.getSettings();
            newShardRoutingToDataPath.put(s.getShardRouting(), s.getDataPath());
            long size = s.getStats().getStore().sizeInBytes();
            String sid = ClusterInfo.shardIdentifierFromRouting(s.getShardRouting());
            if (logger.isTraceEnabled()) {
                logger.trace("shard: {} size: {}", sid, size);
            }
            if (indexSettings != null && IndexMetaData.isIndexUsingShadowReplicas(indexSettings)) {
                // Shards on a shared filesystem should be considered of size 0
                if (logger.isTraceEnabled()) {
                    logger.trace("shard: {} is using shadow replicas and will be treated as size 0", sid);
                }
                size = 0;
            }
            newShardSizes.put(sid, size);
        }
    }

    static void fillDiskUsagePerNode(ESLogger logger, NodeStats[] nodeStatsArray, Map<String, DiskUsage> newLeastAvailableUsages, Map<String, DiskUsage> newMostAvailableUsages) {
        for (NodeStats nodeStats : nodeStatsArray) {
            if (nodeStats.getFs() == null) {
                logger.warn("Unable to retrieve node FS stats for {}", nodeStats.getNode().name());
            } else {
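                // Track the data paths with the least and the most available space on this
                // node; the DiskThresholdDecider uses the least-available figure to decide
                // whether existing shards can remain and the most-available figure to decide
                // whether new shards can be allocated.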
                FsInfo.Path leastAvailablePath = null;
                FsInfo.Path mostAvailablePath = null;
                for (FsInfo.Path info : nodeStats.getFs()) {
                    if (leastAvailablePath == null) {
                        assert mostAvailablePath == null;
                        mostAvailablePath = leastAvailablePath = info;
                    } else if (leastAvailablePath.getAvailable().bytes() > info.getAvailable().bytes()) {
                        leastAvailablePath = info;
                    } else if (mostAvailablePath.getAvailable().bytes() < info.getAvailable().bytes()) {
                        mostAvailablePath = info;
                    }
                }
                if (leastAvailablePath == null) {
                    // defensive guard: a node reporting an empty set of file-system paths
                    // would otherwise trigger a NullPointerException below
                    logger.warn("node [{}] reported no file-system paths, skipping", nodeStats.getNode().name());
                    continue;
                }
                String nodeId = nodeStats.getNode().id();
                String nodeName = nodeStats.getNode().getName();
                if (logger.isTraceEnabled()) {
                    logger.trace("node: [{}], most available: total disk: {}, available disk: {} / least available: total disk: {}, available disk: {}",
                            nodeId, mostAvailablePath.getTotal(), mostAvailablePath.getAvailable(),
                            leastAvailablePath.getTotal(), leastAvailablePath.getAvailable());
                }
                if (leastAvailablePath.getTotal().bytes() < 0) {
                    if (logger.isTraceEnabled()) {
                        logger.trace("node: [{}] least available path has less than 0 total bytes of disk [{}], skipping",
                                nodeId, leastAvailablePath.getTotal().bytes());
                    }
                } else {
                    newLeastAvailableUsages.put(nodeId, new DiskUsage(nodeId, nodeName, leastAvailablePath.getPath(), leastAvailablePath.getTotal().bytes(), leastAvailablePath.getAvailable().bytes()));
                }
                if (mostAvailablePath.getTotal().bytes() < 0) {
                    if (logger.isTraceEnabled()) {
                        logger.trace("node: [{}] most available path has less than 0 total bytes of disk [{}], skipping",
                                nodeId, mostAvailablePath.getTotal().bytes());
                    }
                } else {
                    newMostAvailableUsages.put(nodeId, new DiskUsage(nodeId, nodeName, mostAvailablePath.getPath(), mostAvailablePath.getTotal().bytes(), mostAvailablePath.getAvailable().bytes()));
                }

            }
        }
    }


}