/*
 * Licensed to Elastic Search and Shay Banon under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. Elastic Search licenses this
 * file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.gateway.local;

import org.elasticsearch.ExceptionsHelper;
import org.elasticsearch.action.FailedNodeException;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.cluster.node.DiscoveryNodes;
import org.elasticsearch.cluster.routing.*;
import org.elasticsearch.cluster.routing.allocation.*;
import org.elasticsearch.common.collect.Maps;
import org.elasticsearch.common.collect.Sets;
import org.elasticsearch.common.collect.Tuple;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.trove.ExtTObjectIntHasMap;
import org.elasticsearch.common.trove.TObjectIntHashMap;
import org.elasticsearch.common.trove.TObjectIntIterator;
import org.elasticsearch.common.unit.ByteSizeValue;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.common.util.concurrent.ConcurrentCollections;
import org.elasticsearch.gateway.GatewayService;
import org.elasticsearch.index.shard.ShardId;
import org.elasticsearch.index.store.StoreFileMetaData;
import org.elasticsearch.indices.store.TransportNodesListShardStoreMetaData;
import org.elasticsearch.transport.ConnectTransportException;

import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentMap;

import static org.elasticsearch.cluster.routing.ShardRoutingState.*;

/**
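 * An allocation module used when recovering from the local gateway: primaries are allocated to the
 * nodes that already hold the most recent local copy of each shard, and replicas are preferentially
 * placed on nodes whose existing on-disk store files can be reused.
 *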
 * @author kimchy (shay.banon)
 */
public class LocalGatewayNodeAllocation extends NodeAllocation {

    private final TransportNodesListGatewayStartedShards listGatewayStartedShards;

    private final TransportNodesListShardStoreMetaData listShardStoreMetaData;

    private final ConcurrentMap<ShardId, ConcurrentMap<DiscoveryNode, TransportNodesListShardStoreMetaData.StoreFilesMetaData>> cachedStores = ConcurrentCollections.newConcurrentMap();

    private final TimeValue listTimeout;

    private final String initialShards;

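    /**
     * A minimal sketch of the settings this allocator reads (assuming the component settings prefix
     * resolves to {@code gateway.local}, derived from this class's package):
     *
     *   gateway.local.list_timeout:   30s      # timeout when listing shard stores on the nodes
     *   gateway.local.initial_shards: quorum   # quorum | full | full-1 | an explicit number
     */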
    @Inject public LocalGatewayNodeAllocation(Settings settings,
                                              TransportNodesListGatewayStartedShards listGatewayStartedShards, TransportNodesListShardStoreMetaData listShardStoreMetaData) {
        super(settings);
        this.listGatewayStartedShards = listGatewayStartedShards;
        this.listShardStoreMetaData = listShardStoreMetaData;

        this.listTimeout = componentSettings.getAsTime("list_timeout", TimeValue.timeValueSeconds(30));
        this.initialShards = componentSettings.get("initial_shards", "quorum");
    }

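    /**
     * Once a shard has started there is no need to keep its store listing around, so drop it from the cache.
     */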
    @Override public void applyStartedShards(NodeAllocations nodeAllocations, StartedRerouteAllocation allocation) {
        for (ShardRouting shardRouting : allocation.startedShards()) {
            cachedStores.remove(shardRouting.shardId());
        }
    }

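    /**
     * Drops the cached store listings of failed shards. In addition, while the index is still blocked by
     * {@link GatewayService#INDEX_NOT_RECOVERED_BLOCK} (i.e. during the initial recovery from the local
     * gateway), tries to move a failed shard to another node that holds the most recent local copy of it.
     */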
    @Override public void applyFailedShards(NodeAllocations nodeAllocations, FailedRerouteAllocation allocation) {
        for (ShardRouting shardRouting : allocation.failedShards()) {
            cachedStores.remove(shardRouting.shardId());
        }
        for (ShardRouting failedShard : allocation.failedShards()) {
            IndexRoutingTable indexRoutingTable = allocation.routingTable().index(failedShard.index());
            if (!allocation.routingNodes().blocks().hasIndexBlock(indexRoutingTable.index(), GatewayService.INDEX_NOT_RECOVERED_BLOCK)) {
                continue;
            }

            // we are still in the initial allocation; ask all data nodes for their local gateway state
            // and try to find another node that holds an existing copy of the failed shard
            Set<String> nodesIds = Sets.newHashSet();
            nodesIds.addAll(allocation.nodes().dataNodes().keySet());
            TransportNodesListGatewayStartedShards.NodesLocalGatewayStartedShards nodesState = listGatewayStartedShards.list(nodesIds, null).actionGet();

            // find the node holding the latest version of the failed shard
            Tuple<DiscoveryNode, Long> t = null;
            for (TransportNodesListGatewayStartedShards.NodeLocalGatewayStartedShards nodeState : nodesState) {
                if (nodeState.state() == null) {
                    continue;
                }
                // we don't want to reallocate to the node we failed on
                if (nodeState.node().id().equals(failedShard.currentNodeId())) {
                    continue;
                }
                // look for the failed shard in this node's local gateway state
                for (Map.Entry<ShardId, Long> entry : nodeState.state().shards().entrySet()) {
                    if (entry.getKey().equals(failedShard.shardId())) {
                        if (t == null || entry.getValue() > t.v2().longValue()) {
                            t = new Tuple<DiscoveryNode, Long>(nodeState.node(), entry.getValue());
                        }
                    }
                }
            }
            if (t != null) {
                // we found a node to allocate to, do it
                RoutingNode currentRoutingNode = allocation.routingNodes().nodesToShards().get(failedShard.currentNodeId());
                if (currentRoutingNode == null) {
                    // already failed (might be called several times for the same shard)
                    continue;
                }

                // find the shard and cancel relocation
                Iterator<MutableShardRouting> shards = currentRoutingNode.iterator();
                while (shards.hasNext()) {
                    MutableShardRouting shard = shards.next();
                    if (shard.shardId().equals(failedShard.shardId())) {
                        shard.deassignNode();
                        shards.remove();
                        break;
                    }
                }

                RoutingNode targetNode = allocation.routingNodes().nodesToShards().get(t.v1().id());
                targetNode.add(new MutableShardRouting(failedShard.index(), failedShard.id(),
                        targetNode.nodeId(), failedShard.relocatingNodeId(),
                        failedShard.primary(), INITIALIZING));
            }
        }
    }

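    /**
     * Allocates unassigned shards in two phases. First, for indices still blocked by
     * {@link GatewayService#INDEX_NOT_RECOVERED_BLOCK}, primaries are only allocated once enough nodes
     * report an existing local copy of each shard (per the {@code initial_shards} setting), each primary
     * going to the node with the latest version; otherwise the index's shards are moved to the ignored
     * list. Second, each remaining unassigned replica is matched against the store files of its active
     * primary and placed (or throttled) on the node that already holds the most reusable bytes.
     */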
    @Override public boolean allocateUnassigned(NodeAllocations nodeAllocations, RoutingAllocation allocation) {
        boolean changed = false;
        DiscoveryNodes nodes = allocation.nodes();
        RoutingNodes routingNodes = allocation.routingNodes();

        for (IndexRoutingTable indexRoutingTable : routingNodes.routingTable()) {
            // only do the allocation if there is a local "INDEX NOT RECOVERED" block
            // we check this here since it helps distinguish between index creation through an API, where the below logic
            // should not apply, and when recovering from the gateway, where we should apply this logic
            if (!routingNodes.blocks().hasIndexBlock(indexRoutingTable.index(), GatewayService.INDEX_NOT_RECOVERED_BLOCK)) {
                continue;
            }

            // the index might be created, but shards not instantiated yet, ignore this state
            if (indexRoutingTable.shards().isEmpty()) {
                continue;
            }

            if (indexRoutingTable.allPrimaryShardsUnassigned()) {
                // all primaries are unassigned for the index, see if we can allocate them on existing nodes; if not, don't assign
                Set<String> nodesIds = Sets.newHashSet();
                nodesIds.addAll(nodes.dataNodes().keySet());
                TransportNodesListGatewayStartedShards.NodesLocalGatewayStartedShards nodesState = listGatewayStartedShards.list(nodesIds, null).actionGet();

                if (nodesState.failures().length > 0) {
                    for (FailedNodeException failedNodeException : nodesState.failures()) {
                        logger.warn("failed to fetch shards state from node", failedNodeException);
                    }
                }

                // map each ShardId to the node holding the latest version of that shard
                Map<ShardId, Tuple<DiscoveryNode, Long>> shards = Maps.newHashMap();
                // and count how many copies of each shard were found across the nodes
                TObjectIntHashMap<ShardId> shardsCounts = new ExtTObjectIntHasMap<ShardId>().defaultReturnValue(-1);
                for (TransportNodesListGatewayStartedShards.NodeLocalGatewayStartedShards nodeState : nodesState) {
                    if (nodeState.state() == null) {
                        continue;
                    }
                    for (Map.Entry<ShardId, Long> entry : nodeState.state().shards().entrySet()) {
                        ShardId shardId = entry.getKey();
                        if (shardId.index().name().equals(indexRoutingTable.index())) {
                            shardsCounts.adjustOrPutValue(shardId, 1, 1);

                            Tuple<DiscoveryNode, Long> t = shards.get(shardId);
                            if (t == null || entry.getValue() > t.v2().longValue()) {
                                t = new Tuple<DiscoveryNode, Long>(nodeState.node(), entry.getValue());
                                shards.put(shardId, t);
                            }
                        }
                    }
                }

                // check if we found existing copies for all of the index's shards; if not, move them all to ignored
                if (shards.size() < indexRoutingTable.shards().size()) {
                    moveIndexToIgnoreUnassigned(routingNodes, indexRoutingTable);
                } else {
                    // check if the copy counts meet the minimum required by the initial_shards setting
                    int requiredNumber = 1;
                    IndexMetaData indexMetaData = routingNodes.metaData().index(indexRoutingTable.index());
                    if ("quorum".equals(initialShards)) {
                        if (indexMetaData.numberOfReplicas() > 1) {
                            requiredNumber = ((1 + indexMetaData.numberOfReplicas()) / 2) + 1;
                        }
                    } else if ("full".equals(initialShards)) {
                        requiredNumber = indexMetaData.numberOfReplicas() + 1;
                    } else if ("full-1".equals(initialShards)) {
                        if (indexMetaData.numberOfReplicas() > 1) {
                            requiredNumber = indexMetaData.numberOfReplicas();
                        }
                    } else {
                        requiredNumber = Integer.parseInt(initialShards);
                    }

                    boolean allocate = true;
                    for (TObjectIntIterator<ShardId> it = shardsCounts.iterator(); it.hasNext();) {
                        it.advance();
                        if (it.value() < requiredNumber) {
                            allocate = false;
                        }
                    }

                    if (allocate) {
                        changed = true;
                        // we found all nodes to allocate to, do the allocation (but only for the index we are working on)
                        for (Iterator<MutableShardRouting> it = routingNodes.unassigned().iterator(); it.hasNext();) {
                            MutableShardRouting shardRouting = it.next();
                            if (shardRouting.index().equals(indexRoutingTable.index())) {
                                if (shardRouting.primary()) {
                                    DiscoveryNode node = shards.get(shardRouting.shardId()).v1();
                                    logger.debug("[{}][{}] initial allocation to [{}]", shardRouting.index(), shardRouting.id(), node);
                                    RoutingNode routingNode = routingNodes.node(node.id());
                                    routingNode.add(shardRouting);
                                    it.remove();
                                }
                            }
                        }
                    } else {
                        moveIndexToIgnoreUnassigned(routingNodes, indexRoutingTable);
                    }
                }
            }
        }

        if (!routingNodes.hasUnassigned()) {
            return changed;
        }

        Iterator<MutableShardRouting> unassignedIterator = routingNodes.unassigned().iterator();
        while (unassignedIterator.hasNext()) {
            MutableShardRouting shard = unassignedIterator.next();

            // pre-check if it can be allocated to any node that currently exists, so we won't list the store for it for nothing
            boolean canBeAllocatedToAtLeastOneNode = false;
            for (DiscoveryNode discoNode : nodes.dataNodes().values()) {
                RoutingNode node = routingNodes.node(discoNode.id());
                if (node == null) {
                    continue;
                }
                // if it's THROTTLING, we are not going to allocate it to this node, so ignore it as well
                if (nodeAllocations.canAllocate(shard, node, allocation).allocate()) {
                    canBeAllocatedToAtLeastOneNode = true;
                    break;
                }
            }

            if (!canBeAllocatedToAtLeastOneNode) {
                continue;
            }

            ConcurrentMap<DiscoveryNode, TransportNodesListShardStoreMetaData.StoreFilesMetaData> shardStores = buildShardStores(nodes, shard);

            long lastSizeMatched = 0;
            DiscoveryNode lastDiscoNodeMatched = null;
            RoutingNode lastNodeMatched = null;

            for (Map.Entry<DiscoveryNode, TransportNodesListShardStoreMetaData.StoreFilesMetaData> nodeStoreEntry : shardStores.entrySet()) {
                DiscoveryNode discoNode = nodeStoreEntry.getKey();
                TransportNodesListShardStoreMetaData.StoreFilesMetaData storeFilesMetaData = nodeStoreEntry.getValue();
                logger.trace("{}: checking node [{}]", shard, discoNode);

                if (storeFilesMetaData == null) {
                    // already allocated on that node...
                    continue;
                }

                RoutingNode node = routingNodes.node(discoNode.id());
                if (node == null) {
                    continue;
                }

                // check if we can allocate on that node...
                // we only check for NO, since if this node is THROTTLING and it has enough "same data"
                // then we will try and assign it next time
                if (nodeAllocations.canAllocate(shard, node, allocation) == Decision.NO) {
                    continue;
                }

                // if it is already allocated, we can't assign to it...
                if (storeFilesMetaData.allocated()) {
                    continue;
                }

                if (!shard.primary()) {
                    MutableShardRouting primaryShard = routingNodes.findPrimaryForReplica(shard);
                    if (primaryShard != null && primaryShard.active()) {
                        DiscoveryNode primaryNode = nodes.get(primaryShard.currentNodeId());
                        if (primaryNode != null) {
                            TransportNodesListShardStoreMetaData.StoreFilesMetaData primaryNodeStore = shardStores.get(primaryNode);
                            if (primaryNodeStore != null && primaryNodeStore.allocated()) {
                                long sizeMatched = 0;

                                for (StoreFileMetaData storeFileMetaData : storeFilesMetaData) {
                                    if (primaryNodeStore.fileExists(storeFileMetaData.name()) && primaryNodeStore.file(storeFileMetaData.name()).isSame(storeFileMetaData)) {
                                        sizeMatched += storeFileMetaData.length();
                                    }
                                }
                                if (sizeMatched > lastSizeMatched) {
                                    lastSizeMatched = sizeMatched;
                                    lastDiscoNodeMatched = discoNode;
                                    lastNodeMatched = node;
                                }
                            }
                        }
                    }
                }
            }

            if (lastNodeMatched != null) {
                if (nodeAllocations.canAllocate(shard, lastNodeMatched, allocation) == NodeAllocation.Decision.THROTTLE) {
                    if (logger.isDebugEnabled()) {
                        logger.debug("[{}][{}]: throttling allocation [{}] to [{}] in order to reuse its unallocated persistent store with total_size [{}]", shard.index(), shard.id(), shard, lastDiscoNodeMatched, new ByteSizeValue(lastSizeMatched));
                    }
                    // we are throttling this, but we have enough to allocate to this node, ignore it for now
                    unassignedIterator.remove();
                    routingNodes.ignoredUnassigned().add(shard);
                } else {
                    if (logger.isDebugEnabled()) {
                        logger.debug("[{}][{}]: allocating [{}] to [{}] in order to reuse its unallocated persistent store with total_size [{}]", shard.index(), shard.id(), shard, lastDiscoNodeMatched, new ByteSizeValue(lastSizeMatched));
                    }
                    // we found a match
                    changed = true;
                    lastNodeMatched.add(shard);
                    unassignedIterator.remove();
                }
            }
        }

        return changed;
    }

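    /**
     * Moves all unassigned shards of the given index to the ignored-unassigned list so that they are not
     * allocated in this round.
     */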
    private void moveIndexToIgnoreUnassigned(RoutingNodes routingNodes, IndexRoutingTable indexRoutingTable) {
        for (Iterator<MutableShardRouting> it = routingNodes.unassigned().iterator(); it.hasNext();) {
            MutableShardRouting shardRouting = it.next();
            if (shardRouting.index().equals(indexRoutingTable.index())) {
                it.remove();
                routingNodes.ignoredUnassigned().add(shardRouting);
            }
        }
    }

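    /**
     * Builds (and caches per shard id) the map from data node to store files metadata for the given shard.
     * On a cache hit, entries for nodes that left the cluster are removed and only newly joined nodes are
     * listed again.
     */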
    private ConcurrentMap<DiscoveryNode, TransportNodesListShardStoreMetaData.StoreFilesMetaData> buildShardStores(DiscoveryNodes nodes, MutableShardRouting shard) {
        ConcurrentMap<DiscoveryNode, TransportNodesListShardStoreMetaData.StoreFilesMetaData> shardStores = cachedStores.get(shard.shardId());
        if (shardStores == null) {
            shardStores = ConcurrentCollections.newConcurrentMap();
            TransportNodesListShardStoreMetaData.NodesStoreFilesMetaData nodesStoreFilesMetaData = listShardStoreMetaData.list(shard.shardId(), false, nodes.dataNodes().keySet(), listTimeout).actionGet();
            if (logger.isDebugEnabled()) {
                if (nodesStoreFilesMetaData.failures().length > 0) {
                    StringBuilder sb = new StringBuilder(shard + ": failures when trying to list stores on nodes:");
                    for (int i = 0; i < nodesStoreFilesMetaData.failures().length; i++) {
                        Throwable cause = ExceptionsHelper.unwrapCause(nodesStoreFilesMetaData.failures()[i]);
                        if (cause instanceof ConnectTransportException) {
                            continue;
                        }
                        sb.append("\n    -> ").append(nodesStoreFilesMetaData.failures()[i].getDetailedMessage());
                    }
                    logger.debug(sb.toString());
                }
            }

            for (TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData nodeStoreFilesMetaData : nodesStoreFilesMetaData) {
                if (nodeStoreFilesMetaData.storeFilesMetaData() != null) {
                    shardStores.put(nodeStoreFilesMetaData.node(), nodeStoreFilesMetaData.storeFilesMetaData());
                }
            }
            cachedStores.put(shard.shardId(), shardStores);
        } else {
            // clean cached entries for nodes that are no longer part of the cluster
            for (DiscoveryNode node : shardStores.keySet()) {
                if (!nodes.nodeExists(node.id())) {
                    shardStores.remove(node);
                }
            }

            // we have stores cached from before; see if the set of nodes changed, and if it did, fetch from the new nodes
            Set<String> fetchedNodes = Sets.newHashSet();
            for (DiscoveryNode node : nodes.dataNodes().values()) {
                if (!shardStores.containsKey(node)) {
                    fetchedNodes.add(node.id());
                }
            }

            if (!fetchedNodes.isEmpty()) {
                TransportNodesListShardStoreMetaData.NodesStoreFilesMetaData nodesStoreFilesMetaData = listShardStoreMetaData.list(shard.shardId(), false, fetchedNodes, listTimeout).actionGet();
                if (logger.isTraceEnabled()) {
                    if (nodesStoreFilesMetaData.failures().length > 0) {
                        StringBuilder sb = new StringBuilder(shard + ": failures when trying to list stores on nodes:");
                        for (int i = 0; i < nodesStoreFilesMetaData.failures().length; i++) {
                            Throwable cause = ExceptionsHelper.unwrapCause(nodesStoreFilesMetaData.failures()[i]);
                            if (cause instanceof ConnectTransportException) {
                                continue;
                            }
                            sb.append("\n    -> ").append(nodesStoreFilesMetaData.failures()[i].getDetailedMessage());
                        }
                        logger.trace(sb.toString());
                    }
                }

                for (TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData nodeStoreFilesMetaData : nodesStoreFilesMetaData) {
                    if (nodeStoreFilesMetaData.storeFilesMetaData() != null) {
                        shardStores.put(nodeStoreFilesMetaData.node(), nodeStoreFilesMetaData.storeFilesMetaData());
                    }
                }
            }
        }
        return shardStores;
    }
}