/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the Server Side Public License, v 1; you may not use this file except
* in compliance with, at your election, the Elastic License 2.0 or the Server
* Side Public License, v 1.
*/
package org.elasticsearch.gateway;
import org.apache.logging.log4j.Logger;
import org.elasticsearch.ElasticsearchTimeoutException;
import org.elasticsearch.ExceptionsHelper;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.FailedNodeException;
import org.elasticsearch.action.support.nodes.BaseNodeResponse;
import org.elasticsearch.action.support.nodes.BaseNodesResponse;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.cluster.node.DiscoveryNodes;
import org.elasticsearch.cluster.routing.allocation.RoutingAllocation;
import org.elasticsearch.common.util.Maps;
import org.elasticsearch.common.util.concurrent.EsRejectedExecutionException;
import org.elasticsearch.core.Nullable;
import org.elasticsearch.core.Releasable;
import org.elasticsearch.index.shard.ShardId;
import org.elasticsearch.transport.ReceiveTimeoutTransportException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.concurrent.atomic.AtomicLong;
import java.util.function.Function;
import java.util.stream.Collectors;
import static java.util.Collections.emptySet;
import static org.elasticsearch.core.Strings.format;
/**
* Allows asynchronously fetching shard-related data from other nodes for allocation, without blocking
* the cluster update thread.
*
* The async fetch logic maintains a map of which nodes are being fetched from in an async manner,
* and once the results are back, it makes sure to schedule a reroute so that those results will
* be taken into account.
*/
public abstract class AsyncShardFetch<T extends BaseNodeResponse> implements Releasable {
protected final Logger logger;
protected final String type;
protected final ShardId shardId;
protected final String customDataPath;
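// per-node fetch state for this shard, keyed by node id; entries are created lazily for the current
// data nodes and removed again when a node leaves the cluster or its fetch attempt failed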
private final Map<String, NodeEntry<T>> cache;
private final Set<String> nodesToIgnore = new HashSet<>();
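// monotonically increasing fetch round id; responses and failures carrying an older round id than the
// one recorded on the node entry are treated as stale and ignored in processAsyncFetch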
private final AtomicLong round = new AtomicLong();
private boolean closed;
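// number of node entries currently marked as fetching; volatile so that getNumberOfInFlightFetches()
// can read it without taking the instance lock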
private volatile int fetchingCount;
@SuppressWarnings("unchecked")
protected AsyncShardFetch(Logger logger, String type, ShardId shardId, String customDataPath, int expectedSize) {
this.logger = logger;
this.type = type;
this.shardId = Objects.requireNonNull(shardId);
this.customDataPath = Objects.requireNonNull(customDataPath);
this.fetchingCount = 0;
this.cache = Maps.newHashMapWithExpectedSize(expectedSize);
}
@Override
public synchronized void close() {
this.closed = true;
}
/**
* Returns the number of async fetches that are currently ongoing.
*/
public int getNumberOfInFlightFetches() {
return fetchingCount;
}
/**
* Fetches the data for the relevant shard. If there are any ongoing async fetches, or new ones have
* been initiated by this call, the result will have no data.
*
* The ignoreNodes are nodes that should be ignored for this round. Since fetching is async, we need
* to keep them around and make sure we add them back when all the responses are fetched and returned.
*/
public synchronized FetchResult<T> fetchData(DiscoveryNodes nodes, Set<String> ignoreNodes) {
if (closed) {
throw new IllegalStateException(shardId + ": can't fetch data on closed async fetch");
}
nodesToIgnore.addAll(ignoreNodes);
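// make sure the cache covers exactly the current set of data nodes before deciding what to fetch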
fillShardCacheWithDataNodes(cache, nodes);
List<NodeEntry<T>> nodesToFetch = findNodesToFetch(cache);
if (nodesToFetch.isEmpty() == false) {
// mark all nodes as fetching and go ahead and async fetch them
// use a unique round id to detect stale responses in processAsyncFetch
final long fetchingRound = round.incrementAndGet();
for (NodeEntry<T> nodeEntry : nodesToFetch) {
nodeEntry.markAsFetching(fetchingRound);
fetchingCount++;
}
DiscoveryNode[] discoNodesToFetch = nodesToFetch.stream()
.map(NodeEntry::getNodeId)
.map(nodes::get)
.toArray(DiscoveryNode[]::new);
asyncFetch(discoNodesToFetch, fetchingRound);
}
assert assertFetchingCountConsistent();
// if we are still fetching, return null to indicate it
if (hasAnyNodeFetching()) {
return new FetchResult<>(shardId, null, emptySet());
} else {
// nothing to fetch, yay, build the return value
Map<DiscoveryNode, T> fetchData = new HashMap<>();
Set<String> failedNodes = new HashSet<>();
for (Iterator<Map.Entry<String, NodeEntry<T>>> it = cache.entrySet().iterator(); it.hasNext();) {
Map.Entry<String, NodeEntry<T>> entry = it.next();
String nodeId = entry.getKey();
NodeEntry<T> nodeEntry = entry.getValue();
DiscoveryNode node = nodes.get(nodeId);
if (node != null) {
if (nodeEntry.isFailed()) {
// if it failed, remove it from the list of nodes, so if this run doesn't work
// we will try to fetch it again in the next round
it.remove();
failedNodes.add(nodeEntry.getNodeId());
} else {
if (nodeEntry.getValue() != null) {
fetchData.put(node, nodeEntry.getValue());
}
}
}
}
Set<String> allIgnoreNodes = Set.copyOf(nodesToIgnore);
// clear the nodes to ignore, we had a successful run fetching everything we can;
// we need to try them again if another full run is needed
nodesToIgnore.clear();
// if at least one node failed, make sure to have a protective reroute
// here, just in case this round won't find anything, and we need to retry fetching data
if (failedNodes.isEmpty() == false || allIgnoreNodes.isEmpty() == false) {
reroute(shardId, "nodes failed [" + failedNodes.size() + "], ignored [" + allIgnoreNodes.size() + "]");
}
return new FetchResult<>(shardId, fetchData, allIgnoreNodes);
}
}
/**
* Called by the response handler of the async action to fetch data. Verifies that it is still working
* on the same cache generation, otherwise the results are discarded. It then fills in the relevant data for
* the shard (response + failures), issuing a reroute at the end of it to make sure there will be another round
* of allocations taking this new data into account.
*/
protected synchronized void processAsyncFetch(List<T> responses, List<FailedNodeException> failures, long fetchingRound) {
if (closed) {
// we are closed, no need to process this async fetch at all
logger.trace("{} ignoring fetched [{}] results, already closed", shardId, type);
return;
}
logger.trace("{} processing fetched [{}] results", shardId, type);
if (responses != null) {
for (T response : responses) {
NodeEntry<T> nodeEntry = cache.get(response.getNode().getId());
if (nodeEntry != null) {
if (nodeEntry.getFetchingRound() != fetchingRound) {
assert nodeEntry.getFetchingRound() > fetchingRound : "node entries only replaced by newer rounds";
logger.trace(
"{} received response for [{}] from node {} for an older fetching round (expected: {} but was: {})",
shardId,
nodeEntry.getNodeId(),
type,
nodeEntry.getFetchingRound(),
fetchingRound
);
} else if (nodeEntry.isFailed()) {
logger.trace(
"{} node {} has failed for [{}] (failure [{}])",
shardId,
nodeEntry.getNodeId(),
type,
nodeEntry.getFailure()
);
} else {
// if the entry is there, for the right fetching round and not marked as failed already, process it
logger.trace("{} marking {} as done for [{}], result is [{}]", shardId, nodeEntry.getNodeId(), type, response);
nodeEntry.doneFetching(response);
fetchingCount--;
}
}
}
}
if (failures != null) {
for (FailedNodeException failure : failures) {
logger.trace("{} processing failure {} for [{}]", shardId, failure, type);
NodeEntry<T> nodeEntry = cache.get(failure.nodeId());
if (nodeEntry != null) {
if (nodeEntry.getFetchingRound() != fetchingRound) {
assert nodeEntry.getFetchingRound() > fetchingRound : "node entries only replaced by newer rounds";
logger.trace(
"{} received failure for [{}] from node {} for an older fetching round (expected: {} but was: {})",
shardId,
nodeEntry.getNodeId(),
type,
nodeEntry.getFetchingRound(),
fetchingRound
);
} else if (nodeEntry.isFailed() == false) {
// if the entry is there, for the right fetching round and not marked as failed already, process it
Throwable unwrappedCause = ExceptionsHelper.unwrapCause(failure.getCause());
// if the request got rejected or timed out, we need to try it again next time...
if (unwrappedCause instanceof EsRejectedExecutionException
|| unwrappedCause instanceof ReceiveTimeoutTransportException
|| unwrappedCause instanceof ElasticsearchTimeoutException) {
nodeEntry.restartFetching();
fetchingCount--;
} else {
logger.warn(
() -> format("%s: failed to list shard for %s on node [%s]", shardId, type, failure.nodeId()),
failure
);
nodeEntry.doneFetching(failure.getCause());
fetchingCount--;
}
}
}
}
}
reroute(shardId, "post_response");
assert assertFetchingCountConsistent();
}
/**
* Implement this in order to schedule another round that causes a call to fetch data.
*/
protected abstract void reroute(ShardId shardId, String reason);
/**
* Clear cache for node, ensuring next fetch will fetch a fresh copy.
*/
synchronized void clearCacheForNode(String nodeId) {
NodeEntry<T> nodeEntry = cache.remove(nodeId);
if (nodeEntry != null && nodeEntry.fetching) {
fetchingCount--;
}
assert assertFetchingCountConsistent();
}
/**
* Fills the shard fetched data with new (data) nodes and a fresh NodeEntry, and removes from
* it nodes that are no longer part of the state.
*/
private void fillShardCacheWithDataNodes(Map<String, NodeEntry<T>> shardCache, DiscoveryNodes nodes) {
// verify that all current data nodes are there
for (Map.Entry<String, DiscoveryNode> cursor : nodes.getDataNodes().entrySet()) {
DiscoveryNode node = cursor.getValue();
if (shardCache.containsKey(node.getId()) == false) {
shardCache.put(node.getId(), new NodeEntry<>(node.getId()));
}
}
// remove nodes that are no longer part of the data nodes set
for (Iterator<Map.Entry<String, NodeEntry<T>>> iterator = shardCache.entrySet().iterator(); iterator.hasNext();) {
Map.Entry<String, NodeEntry<T>> entry = iterator.next();
String nodeId = entry.getKey();
NodeEntry<T> nodeEntry = entry.getValue();
if (nodes.nodeExists(nodeId) == false) {
if (nodeEntry.fetching) {
fetchingCount--;
}
iterator.remove();
}
}
assert assertFetchingCountConsistent();
}
/**
* Finds all the nodes that need to be fetched. Those are nodes that have no
* data, and are not in fetch mode.
*/
private List<NodeEntry<T>> findNodesToFetch(Map<String, NodeEntry<T>> shardCache) {
List<NodeEntry<T>> nodesToFetch = new ArrayList<>();
for (NodeEntry<T> nodeEntry : shardCache.values()) {
if (nodeEntry.hasData() == false && nodeEntry.isFetching() == false) {
nodesToFetch.add(nodeEntry);
}
}
return nodesToFetch;
}
/**
* Are there any nodes that are fetching data?
*/
private boolean hasAnyNodeFetching() {
return fetchingCount != 0;
}
/**
* Async fetches data for the provided shard with the set of nodes that need to be fetched from.
*/
// visible for testing
void asyncFetch(final DiscoveryNode[] nodes, long fetchingRound) {
logger.trace("{} fetching [{}] from {}", shardId, type, nodes);
list(shardId, customDataPath, nodes, new ActionListener<>() {
@Override
public void onResponse(BaseNodesResponse<T> response) {
assert assertSameNodes(response);
processAsyncFetch(response.getNodes(), response.failures(), fetchingRound);
}
@Override
public void onFailure(Exception e) {
List<FailedNodeException> failures = new ArrayList<>(nodes.length);
for (final DiscoveryNode node : nodes) {
failures.add(new FailedNodeException(node.getId(), "total failure in fetching", e));
}
processAsyncFetch(null, failures, fetchingRound);
}
private boolean assertSameNodes(BaseNodesResponse<T> response) {
final Map<String, DiscoveryNode> nodesById = Arrays.stream(nodes)
.collect(Collectors.toMap(DiscoveryNode::getEphemeralId, Function.identity()));
for (T nodeResponse : response.getNodes()) {
final DiscoveryNode responseNode = nodeResponse.getNode();
final DiscoveryNode localNode = nodesById.get(responseNode.getEphemeralId());
assert localNode == responseNode : "not reference equal: " + localNode + " vs " + responseNode;
}
return true;
}
});
}
protected abstract void list(
ShardId shardId,
@Nullable String customDataPath,
DiscoveryNode[] nodes,
ActionListener<BaseNodesResponse<T>> listener
);
/**
* The result of a fetch operation. Make sure to first check {@link #hasData()} before
* fetching the actual data.
*/
public static class FetchResult<T extends BaseNodeResponse> {
private final ShardId shardId;
private final Map<DiscoveryNode, T> data;
private final Set<String> ignoreNodes;
public FetchResult(ShardId shardId, Map<DiscoveryNode, T> data, Set<String> ignoreNodes) {
this.shardId = shardId;
this.data = data;
this.ignoreNodes = ignoreNodes;
}
/**
* Does the result actually contain data? If not, then there are ongoing fetch
* operations still in flight, and the caller should wait for them.
*/
public boolean hasData() {
return data != null;
}
/**
* Returns the actual data. Note, make sure to check {@link #hasData()} first and
* only use this when there is actual data.
*/
public Map<DiscoveryNode, T> getData() {
assert data != null : "getData should only be called if there is data to be fetched, please check hasData first";
return this.data;
}
/**
* Process any changes needed to the allocation based on this fetch result.
*/
public void processAllocation(RoutingAllocation allocation) {
for (String ignoreNode : ignoreNodes) {
allocation.addIgnoreShardForNode(shardId, ignoreNode);
}
}
}
/**
* A node entry, holding the state of the fetched data for a specific shard
* for a given node.
*/
static class NodeEntry<T> {
private final String nodeId;
private boolean fetching;
@Nullable
private T value;
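// valueSet distinguishes a fetched null value from "not fetched yet", since value itself is @Nullable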
private boolean valueSet;
private Throwable failure;
private long fetchingRound;
NodeEntry(String nodeId) {
this.nodeId = nodeId;
}
String getNodeId() {
return this.nodeId;
}
boolean isFetching() {
return fetching;
}
void markAsFetching(long fetchingRound) {
assert fetching == false : "double marking a node as fetching";
this.fetching = true;
this.fetchingRound = fetchingRound;
}
void doneFetching(T value) {
assert fetching : "setting value but not in fetching mode";
assert failure == null : "setting value when failure already set";
this.valueSet = true;
this.value = value;
this.fetching = false;
}
void doneFetching(Throwable failure) {
assert fetching : "setting value but not in fetching mode";
assert valueSet == false : "setting failure when already set value";
assert failure != null : "setting failure can't be null";
this.failure = failure;
this.fetching = false;
}
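// returns the entry to the not-fetching state without recording a value or failure, so the node is
// retried on the next fetchData round (used when the request was rejected or timed out)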
void restartFetching() {
assert fetching : "restarting fetching, but not in fetching mode";
assert valueSet == false : "value can't be set when restarting fetching";
assert failure == null : "failure can't be set when restarting fetching";
this.fetching = false;
}
boolean isFailed() {
return failure != null;
}
boolean hasData() {
return valueSet || failure != null;
}
Throwable getFailure() {
assert hasData() : "getting failure when data has not been fetched";
return failure;
}
@Nullable
T getValue() {
assert failure == null : "trying to fetch value, but its marked as failed, check isFailed";
assert valueSet : "value is not set, hasn't been fetched yet";
return value;
}
long getFetchingRound() {
return fetchingRound;
}
}
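// fetchingCount is a derived counter; verify (under the instance lock) that it matches the number of
// cache entries currently marked as fetching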
private boolean assertFetchingCountConsistent() {
assert Thread.holdsLock(this);
assert fetchingCount == cache.values().stream().filter(NodeEntry::isFetching).count();
return true;
}
}