All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.elasticsearch.cluster.routing.RoutingNodes Maven / Gradle / Ivy

There is a newer version: 8.13.2
Show newest version
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.cluster.routing;

import com.carrotsearch.hppc.ObjectIntHashMap;
import com.carrotsearch.hppc.cursors.ObjectCursor;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.util.CollectionUtil;
import org.elasticsearch.Assertions;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.cluster.metadata.MetaData;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.cluster.routing.UnassignedInfo.AllocationStatus;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.Randomness;
import org.elasticsearch.common.collect.Tuple;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.shard.ShardId;

import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Queue;
import java.util.Set;
import java.util.function.Predicate;

/**
 * {@link RoutingNodes} represents a copy the routing information contained in the {@link ClusterState cluster state}.
 * It can be either initialized as mutable or immutable (see {@link #RoutingNodes(ClusterState, boolean)}), allowing
 * or disallowing changes to its elements.
 *
 * The main methods used to update routing entries are:
 * 
    *
  • {@link #initializeShard} initializes an unassigned shard. *
  • {@link #startShard} starts an initializing shard / completes relocation of a shard. *
  • {@link #relocateShard} starts relocation of a started shard. *
  • {@link #failShard} fails/cancels an assigned shard. *
*/ public class RoutingNodes implements Iterable { private final Map nodesToShards = new HashMap<>(); private final UnassignedShards unassignedShards = new UnassignedShards(this); private final Map> assignedShards = new HashMap<>(); private final boolean readOnly; private int inactivePrimaryCount = 0; private int inactiveShardCount = 0; private int relocatingShards = 0; private final Map> nodesPerAttributeNames = new HashMap<>(); private final Map recoveriesPerNode = new HashMap<>(); public RoutingNodes(ClusterState clusterState) { this(clusterState, true); } public RoutingNodes(ClusterState clusterState, boolean readOnly) { this.readOnly = readOnly; final RoutingTable routingTable = clusterState.routingTable(); Map> nodesToShards = new HashMap<>(); // fill in the nodeToShards with the "live" nodes for (ObjectCursor cursor : clusterState.nodes().getDataNodes().values()) { nodesToShards.put(cursor.value.getId(), new LinkedHashMap<>()); // LinkedHashMap to preserve order } // fill in the inverse of node -> shards allocated // also fill replicaSet information for (ObjectCursor indexRoutingTable : routingTable.indicesRouting().values()) { for (IndexShardRoutingTable indexShard : indexRoutingTable.value) { assert indexShard.primary != null; for (ShardRouting shard : indexShard) { // to get all the shards belonging to an index, including the replicas, // we define a replica set and keep track of it. A replica set is identified // by the ShardId, as this is common for primary and replicas. // A replica Set might have one (and not more) replicas with the state of RELOCATING. if (shard.assignedToNode()) { Map entries = nodesToShards.computeIfAbsent(shard.currentNodeId(), k -> new LinkedHashMap<>()); // LinkedHashMap to preserve order ShardRouting previousValue = entries.put(shard.shardId(), shard); if (previousValue != null) { throw new IllegalArgumentException("Cannot have two different shards with same shard id on same node"); } assignedShardsAdd(shard); if (shard.relocating()) { relocatingShards++; // LinkedHashMap to preserve order. // Add the counterpart shard with relocatingNodeId reflecting the source from which // it's relocating from. entries = nodesToShards.computeIfAbsent(shard.relocatingNodeId(), k -> new LinkedHashMap<>()); ShardRouting targetShardRouting = shard.getTargetRelocatingShard(); addInitialRecovery(targetShardRouting, indexShard.primary); previousValue = entries.put(targetShardRouting.shardId(), targetShardRouting); if (previousValue != null) { throw new IllegalArgumentException("Cannot have two different shards with same shard id on same node"); } assignedShardsAdd(targetShardRouting); } else if (shard.initializing()) { if (shard.primary()) { inactivePrimaryCount++; } inactiveShardCount++; addInitialRecovery(shard, indexShard.primary); } } else { unassignedShards.add(shard); } } } } for (Map.Entry> entry : nodesToShards.entrySet()) { String nodeId = entry.getKey(); this.nodesToShards.put(nodeId, new RoutingNode(nodeId, clusterState.nodes().get(nodeId), entry.getValue())); } } private void addRecovery(ShardRouting routing) { updateRecoveryCounts(routing, true, findAssignedPrimaryIfPeerRecovery(routing)); } private void removeRecovery(ShardRouting routing) { updateRecoveryCounts(routing, false, findAssignedPrimaryIfPeerRecovery(routing)); } private void addInitialRecovery(ShardRouting routing, ShardRouting initialPrimaryShard) { updateRecoveryCounts(routing, true, initialPrimaryShard); } private void updateRecoveryCounts(final ShardRouting routing, final boolean increment, @Nullable final ShardRouting primary) { final int howMany = increment ? 1 : -1; assert routing.initializing() : "routing must be initializing: " + routing; // TODO: check primary == null || primary.active() after all tests properly add ReplicaAfterPrimaryActiveAllocationDecider assert primary == null || primary.assignedToNode() : "shard is initializing but its primary is not assigned to a node"; Recoveries.getOrAdd(recoveriesPerNode, routing.currentNodeId()).addIncoming(howMany); if (routing.recoverySource().getType() == RecoverySource.Type.PEER) { // add/remove corresponding outgoing recovery on node with primary shard if (primary == null) { throw new IllegalStateException("shard is peer recovering but primary is unassigned"); } Recoveries.getOrAdd(recoveriesPerNode, primary.currentNodeId()).addOutgoing(howMany); if (increment == false && routing.primary() && routing.relocatingNodeId() != null) { // primary is done relocating, move non-primary recoveries from old primary to new primary int numRecoveringReplicas = 0; for (ShardRouting assigned : assignedShards(routing.shardId())) { if (assigned.primary() == false && assigned.initializing() && assigned.recoverySource().getType() == RecoverySource.Type.PEER) { numRecoveringReplicas++; } } recoveriesPerNode.get(routing.relocatingNodeId()).addOutgoing(-numRecoveringReplicas); recoveriesPerNode.get(routing.currentNodeId()).addOutgoing(numRecoveringReplicas); } } } public int getIncomingRecoveries(String nodeId) { return recoveriesPerNode.getOrDefault(nodeId, Recoveries.EMPTY).getIncoming(); } public int getOutgoingRecoveries(String nodeId) { return recoveriesPerNode.getOrDefault(nodeId, Recoveries.EMPTY).getOutgoing(); } @Nullable private ShardRouting findAssignedPrimaryIfPeerRecovery(ShardRouting routing) { ShardRouting primary = null; if (routing.recoverySource() != null && routing.recoverySource().getType() == RecoverySource.Type.PEER) { List shardRoutings = assignedShards.get(routing.shardId()); if (shardRoutings != null) { for (ShardRouting shardRouting : shardRoutings) { if (shardRouting.primary()) { if (shardRouting.active()) { return shardRouting; } else if (primary == null) { primary = shardRouting; } else if (primary.relocatingNodeId() != null) { primary = shardRouting; } } } } } return primary; } @Override public Iterator iterator() { return Collections.unmodifiableCollection(nodesToShards.values()).iterator(); } public Iterator mutableIterator() { ensureMutable(); return nodesToShards.values().iterator(); } public UnassignedShards unassigned() { return this.unassignedShards; } public RoutingNode node(String nodeId) { return nodesToShards.get(nodeId); } public ObjectIntHashMap nodesPerAttributesCounts(String attributeName) { ObjectIntHashMap nodesPerAttributesCounts = nodesPerAttributeNames.get(attributeName); if (nodesPerAttributesCounts != null) { return nodesPerAttributesCounts; } nodesPerAttributesCounts = new ObjectIntHashMap<>(); for (RoutingNode routingNode : this) { String attrValue = routingNode.node().getAttributes().get(attributeName); nodesPerAttributesCounts.addTo(attrValue, 1); } nodesPerAttributeNames.put(attributeName, nodesPerAttributesCounts); return nodesPerAttributesCounts; } /** * Returns true iff this {@link RoutingNodes} instance has any unassigned primaries even if the * primaries are marked as temporarily ignored. */ public boolean hasUnassignedPrimaries() { return unassignedShards.getNumPrimaries() + unassignedShards.getNumIgnoredPrimaries() > 0; } /** * Returns true iff this {@link RoutingNodes} instance has any unassigned shards even if the * shards are marked as temporarily ignored. * @see UnassignedShards#isEmpty() * @see UnassignedShards#isIgnoredEmpty() */ public boolean hasUnassignedShards() { return unassignedShards.isEmpty() == false || unassignedShards.isIgnoredEmpty() == false; } public boolean hasInactivePrimaries() { return inactivePrimaryCount > 0; } public boolean hasInactiveShards() { return inactiveShardCount > 0; } public int getRelocatingShardCount() { return relocatingShards; } /** * Returns all shards that are not in the state UNASSIGNED with the same shard * ID as the given shard. */ public List assignedShards(ShardId shardId) { final List replicaSet = assignedShards.get(shardId); return replicaSet == null ? EMPTY : Collections.unmodifiableList(replicaSet); } @Nullable public ShardRouting getByAllocationId(ShardId shardId, String allocationId) { final List replicaSet = assignedShards.get(shardId); if (replicaSet == null) { return null; } for (ShardRouting shardRouting : replicaSet) { if (shardRouting.allocationId().getId().equals(allocationId)) { return shardRouting; } } return null; } /** * Returns the active primary shard for the given shard id or null if * no primary is found or the primary is not active. */ public ShardRouting activePrimary(ShardId shardId) { for (ShardRouting shardRouting : assignedShards(shardId)) { if (shardRouting.primary() && shardRouting.active()) { return shardRouting; } } return null; } /** * Returns one active replica shard for the given shard id or null if * no active replica is found. * * Since replicas could possibly be on nodes with a older version of ES than * the primary is, this will return replicas on the highest version of ES. * */ public ShardRouting activeReplicaWithHighestVersion(ShardId shardId) { // It's possible for replicaNodeVersion to be null, when disassociating dead nodes // that have been removed, the shards are failed, and part of the shard failing // calls this method with an out-of-date RoutingNodes, where the version might not // be accessible. Therefore, we need to protect against the version being null // (meaning the node will be going away). return assignedShards(shardId).stream() .filter(shr -> !shr.primary() && shr.active()) .filter(shr -> node(shr.currentNodeId()) != null) .max(Comparator.comparing(shr -> node(shr.currentNodeId()).node(), Comparator.nullsFirst(Comparator.comparing(DiscoveryNode::getVersion)))) .orElse(null); } /** * Returns true iff all replicas are active for the given shard routing. Otherwise false */ public boolean allReplicasActive(ShardId shardId, MetaData metaData) { final List shards = assignedShards(shardId); if (shards.isEmpty() || shards.size() < metaData.getIndexSafe(shardId.getIndex()).getNumberOfReplicas() + 1) { return false; // if we are empty nothing is active if we have less than total at least one is unassigned } for (ShardRouting shard : shards) { if (!shard.active()) { return false; } } return true; } public List shards(Predicate predicate) { List shards = new ArrayList<>(); for (RoutingNode routingNode : this) { for (ShardRouting shardRouting : routingNode) { if (predicate.test(shardRouting)) { shards.add(shardRouting); } } } return shards; } public List shardsWithState(ShardRoutingState... state) { // TODO these are used on tests only - move into utils class List shards = new ArrayList<>(); for (RoutingNode routingNode : this) { shards.addAll(routingNode.shardsWithState(state)); } for (ShardRoutingState s : state) { if (s == ShardRoutingState.UNASSIGNED) { unassigned().forEach(shards::add); break; } } return shards; } public List shardsWithState(String index, ShardRoutingState... state) { // TODO these are used on tests only - move into utils class List shards = new ArrayList<>(); for (RoutingNode routingNode : this) { shards.addAll(routingNode.shardsWithState(index, state)); } for (ShardRoutingState s : state) { if (s == ShardRoutingState.UNASSIGNED) { for (ShardRouting unassignedShard : unassignedShards) { if (unassignedShard.index().getName().equals(index)) { shards.add(unassignedShard); } } break; } } return shards; } @Override public String toString() { StringBuilder sb = new StringBuilder("routing_nodes:\n"); for (RoutingNode routingNode : this) { sb.append(routingNode.prettyPrint()); } sb.append("---- unassigned\n"); for (ShardRouting shardEntry : unassignedShards) { sb.append("--------").append(shardEntry.shortSummary()).append('\n'); } return sb.toString(); } /** * Moves a shard from unassigned to initialize state * * @param existingAllocationId allocation id to use. If null, a fresh allocation id is generated. * @return the initialized shard */ public ShardRouting initializeShard(ShardRouting unassignedShard, String nodeId, @Nullable String existingAllocationId, long expectedSize, RoutingChangesObserver routingChangesObserver) { ensureMutable(); assert unassignedShard.unassigned() : "expected an unassigned shard " + unassignedShard; ShardRouting initializedShard = unassignedShard.initialize(nodeId, existingAllocationId, expectedSize); node(nodeId).add(initializedShard); inactiveShardCount++; if (initializedShard.primary()) { inactivePrimaryCount++; } addRecovery(initializedShard); assignedShardsAdd(initializedShard); routingChangesObserver.shardInitialized(unassignedShard, initializedShard); return initializedShard; } /** * Relocate a shard to another node, adding the target initializing * shard as well as assigning it. * * @return pair of source relocating and target initializing shards. */ public Tuple relocateShard(ShardRouting startedShard, String nodeId, long expectedShardSize, RoutingChangesObserver changes) { ensureMutable(); relocatingShards++; ShardRouting source = startedShard.relocate(nodeId, expectedShardSize); ShardRouting target = source.getTargetRelocatingShard(); updateAssigned(startedShard, source); node(target.currentNodeId()).add(target); assignedShardsAdd(target); addRecovery(target); changes.relocationStarted(startedShard, target); return Tuple.tuple(source, target); } /** * Applies the relevant logic to start an initializing shard. * * Moves the initializing shard to started. If the shard is a relocation target, also removes the relocation source. * * If the started shard is a primary relocation target, this also reinitializes currently initializing replicas as their * recovery source changes * * @return the started shard */ public ShardRouting startShard(Logger logger, ShardRouting initializingShard, RoutingChangesObserver routingChangesObserver) { ensureMutable(); ShardRouting startedShard = started(initializingShard); logger.trace("{} marked shard as started (routing: {})", initializingShard.shardId(), initializingShard); routingChangesObserver.shardStarted(initializingShard, startedShard); if (initializingShard.relocatingNodeId() != null) { // relocation target has been started, remove relocation source RoutingNode relocationSourceNode = node(initializingShard.relocatingNodeId()); ShardRouting relocationSourceShard = relocationSourceNode.getByShardId(initializingShard.shardId()); assert relocationSourceShard.isRelocationSourceOf(initializingShard); assert relocationSourceShard.getTargetRelocatingShard() == initializingShard : "relocation target mismatch, expected: " + initializingShard + " but was: " + relocationSourceShard.getTargetRelocatingShard(); remove(relocationSourceShard); routingChangesObserver.relocationCompleted(relocationSourceShard); // if this is a primary shard with ongoing replica recoveries, reinitialize them as their recovery source changed if (startedShard.primary()) { List assignedShards = assignedShards(startedShard.shardId()); // copy list to prevent ConcurrentModificationException for (ShardRouting routing : new ArrayList<>(assignedShards)) { if (routing.initializing() && routing.primary() == false) { if (routing.isRelocationTarget()) { // find the relocation source ShardRouting sourceShard = getByAllocationId(routing.shardId(), routing.allocationId().getRelocationId()); // cancel relocation and start relocation to same node again ShardRouting startedReplica = cancelRelocation(sourceShard); remove(routing); routingChangesObserver.shardFailed(routing, new UnassignedInfo(UnassignedInfo.Reason.REINITIALIZED, "primary changed")); relocateShard(startedReplica, sourceShard.relocatingNodeId(), sourceShard.getExpectedShardSize(), routingChangesObserver); } else { ShardRouting reinitializedReplica = reinitReplica(routing); routingChangesObserver.initializedReplicaReinitialized(routing, reinitializedReplica); } } } } } return startedShard; } /** * Applies the relevant logic to handle a cancelled or failed shard. * * Moves the shard to unassigned or completely removes the shard (if relocation target). * * - If shard is a primary, this also fails initializing replicas. * - If shard is an active primary, this also promotes an active replica to primary (if such a replica exists). * - If shard is a relocating primary, this also removes the primary relocation target shard. * - If shard is a relocating replica, this promotes the replica relocation target to a full initializing replica, removing the * relocation source information. This is possible as peer recovery is always done from the primary. * - If shard is a (primary or replica) relocation target, this also clears the relocation information on the source shard. * */ public void failShard(Logger logger, ShardRouting failedShard, UnassignedInfo unassignedInfo, IndexMetaData indexMetaData, RoutingChangesObserver routingChangesObserver) { ensureMutable(); assert failedShard.assignedToNode() : "only assigned shards can be failed"; assert indexMetaData.getIndex().equals(failedShard.index()) : "shard failed for unknown index (shard entry: " + failedShard + ")"; assert getByAllocationId(failedShard.shardId(), failedShard.allocationId().getId()) == failedShard : "shard routing to fail does not exist in routing table, expected: " + failedShard + " but was: " + getByAllocationId(failedShard.shardId(), failedShard.allocationId().getId()); logger.debug("{} failing shard {} with unassigned info ({})", failedShard.shardId(), failedShard, unassignedInfo.shortSummary()); // if this is a primary, fail initializing replicas first (otherwise we move RoutingNodes into an inconsistent state) if (failedShard.primary()) { List assignedShards = assignedShards(failedShard.shardId()); if (assignedShards.isEmpty() == false) { // copy list to prevent ConcurrentModificationException for (ShardRouting routing : new ArrayList<>(assignedShards)) { if (!routing.primary() && routing.initializing()) { // re-resolve replica as earlier iteration could have changed source/target of replica relocation ShardRouting replicaShard = getByAllocationId(routing.shardId(), routing.allocationId().getId()); assert replicaShard != null : "failed to re-resolve " + routing + " when failing replicas"; UnassignedInfo primaryFailedUnassignedInfo = new UnassignedInfo(UnassignedInfo.Reason.PRIMARY_FAILED, "primary failed while replica initializing", null, 0, unassignedInfo.getUnassignedTimeInNanos(), unassignedInfo.getUnassignedTimeInMillis(), false, AllocationStatus.NO_ATTEMPT, Collections.emptySet()); failShard(logger, replicaShard, primaryFailedUnassignedInfo, indexMetaData, routingChangesObserver); } } } } if (failedShard.relocating()) { // find the shard that is initializing on the target node ShardRouting targetShard = getByAllocationId(failedShard.shardId(), failedShard.allocationId().getRelocationId()); assert targetShard.isRelocationTargetOf(failedShard); if (failedShard.primary()) { logger.trace("{} is removed due to the failure/cancellation of the source shard", targetShard); // cancel and remove target shard remove(targetShard); routingChangesObserver.shardFailed(targetShard, unassignedInfo); } else { logger.trace("{}, relocation source failed / cancelled, mark as initializing without relocation source", targetShard); // promote to initializing shard without relocation source and ensure that removed relocation source // is not added back as unassigned shard removeRelocationSource(targetShard); routingChangesObserver.relocationSourceRemoved(targetShard); } } // fail actual shard if (failedShard.initializing()) { if (failedShard.relocatingNodeId() == null) { if (failedShard.primary()) { // promote active replica to primary if active replica exists (only the case for shadow replicas) unassignPrimaryAndPromoteActiveReplicaIfExists(failedShard, unassignedInfo, routingChangesObserver); } else { // initializing shard that is not relocation target, just move to unassigned moveToUnassigned(failedShard, unassignedInfo); } } else { // The shard is a target of a relocating shard. In that case we only need to remove the target shard and cancel the source // relocation. No shard is left unassigned logger.trace("{} is a relocation target, resolving source to cancel relocation ({})", failedShard, unassignedInfo.shortSummary()); ShardRouting sourceShard = getByAllocationId(failedShard.shardId(), failedShard.allocationId().getRelocationId()); assert sourceShard.isRelocationSourceOf(failedShard); logger.trace("{}, resolved source to [{}]. canceling relocation ... ({})", failedShard.shardId(), sourceShard, unassignedInfo.shortSummary()); cancelRelocation(sourceShard); remove(failedShard); } } else { assert failedShard.active(); if (failedShard.primary()) { // promote active replica to primary if active replica exists unassignPrimaryAndPromoteActiveReplicaIfExists(failedShard, unassignedInfo, routingChangesObserver); } else { if (failedShard.relocating()) { remove(failedShard); } else { moveToUnassigned(failedShard, unassignedInfo); } } } routingChangesObserver.shardFailed(failedShard, unassignedInfo); assert node(failedShard.currentNodeId()).getByShardId(failedShard.shardId()) == null : "failedShard " + failedShard + " was matched but wasn't removed"; } private void unassignPrimaryAndPromoteActiveReplicaIfExists(ShardRouting failedShard, UnassignedInfo unassignedInfo, RoutingChangesObserver routingChangesObserver) { assert failedShard.primary(); ShardRouting activeReplica = activeReplicaWithHighestVersion(failedShard.shardId()); if (activeReplica == null) { moveToUnassigned(failedShard, unassignedInfo); } else { movePrimaryToUnassignedAndDemoteToReplica(failedShard, unassignedInfo); promoteReplicaToPrimary(activeReplica, routingChangesObserver); } } private void promoteReplicaToPrimary(ShardRouting activeReplica, RoutingChangesObserver routingChangesObserver) { // if the activeReplica was relocating before this call to failShard, its relocation was cancelled earlier when we // failed initializing replica shards (and moved replica relocation source back to started) assert activeReplica.started() : "replica relocation should have been cancelled: " + activeReplica; promoteActiveReplicaShardToPrimary(activeReplica); routingChangesObserver.replicaPromoted(activeReplica); } /** * Mark a shard as started and adjusts internal statistics. * * @return the started shard */ private ShardRouting started(ShardRouting shard) { assert shard.initializing() : "expected an initializing shard " + shard; if (shard.relocatingNodeId() == null) { // if this is not a target shard for relocation, we need to update statistics inactiveShardCount--; if (shard.primary()) { inactivePrimaryCount--; } } removeRecovery(shard); ShardRouting startedShard = shard.moveToStarted(); updateAssigned(shard, startedShard); return startedShard; } /** * Cancels a relocation of a shard that shard must relocating. * * @return the shard after cancelling relocation */ private ShardRouting cancelRelocation(ShardRouting shard) { relocatingShards--; ShardRouting cancelledShard = shard.cancelRelocation(); updateAssigned(shard, cancelledShard); return cancelledShard; } /** * moves the assigned replica shard to primary. * * @param replicaShard the replica shard to be promoted to primary * @return the resulting primary shard */ private ShardRouting promoteActiveReplicaShardToPrimary(ShardRouting replicaShard) { assert replicaShard.active() : "non-active shard cannot be promoted to primary: " + replicaShard; assert replicaShard.primary() == false : "primary shard cannot be promoted to primary: " + replicaShard; ShardRouting primaryShard = replicaShard.moveActiveReplicaToPrimary(); updateAssigned(replicaShard, primaryShard); return primaryShard; } private static final List EMPTY = Collections.emptyList(); /** * Cancels the give shard from the Routing nodes internal statistics and cancels * the relocation if the shard is relocating. */ private void remove(ShardRouting shard) { assert shard.unassigned() == false : "only assigned shards can be removed here (" + shard + ")"; node(shard.currentNodeId()).remove(shard); if (shard.initializing() && shard.relocatingNodeId() == null) { inactiveShardCount--; assert inactiveShardCount >= 0; if (shard.primary()) { inactivePrimaryCount--; } } else if (shard.relocating()) { shard = cancelRelocation(shard); } assignedShardsRemove(shard); if (shard.initializing()) { removeRecovery(shard); } } /** * Removes relocation source of an initializing non-primary shard. This allows the replica shard to continue recovery from * the primary even though its non-primary relocation source has failed. */ private ShardRouting removeRelocationSource(ShardRouting shard) { assert shard.isRelocationTarget() : "only relocation target shards can have their relocation source removed (" + shard + ")"; ShardRouting relocationMarkerRemoved = shard.removeRelocationSource(); updateAssigned(shard, relocationMarkerRemoved); inactiveShardCount++; // relocation targets are not counted as inactive shards whereas initializing shards are return relocationMarkerRemoved; } private void assignedShardsAdd(ShardRouting shard) { assert shard.unassigned() == false : "unassigned shard " + shard + " cannot be added to list of assigned shards"; List shards = assignedShards.computeIfAbsent(shard.shardId(), k -> new ArrayList<>()); assert assertInstanceNotInList(shard, shards) : "shard " + shard + " cannot appear twice in list of assigned shards"; shards.add(shard); } private boolean assertInstanceNotInList(ShardRouting shard, List shards) { for (ShardRouting s : shards) { assert s != shard; } return true; } private void assignedShardsRemove(ShardRouting shard) { final List replicaSet = assignedShards.get(shard.shardId()); if (replicaSet != null) { final Iterator iterator = replicaSet.iterator(); while(iterator.hasNext()) { // yes we check identity here if (shard == iterator.next()) { iterator.remove(); return; } } } assert false : "No shard found to remove"; } private ShardRouting reinitReplica(ShardRouting shard) { assert shard.primary() == false : "shard must be a replica: " + shard; assert shard.initializing() : "can only reinitialize an initializing replica: " + shard; assert shard.isRelocationTarget() == false : "replication target cannot be reinitialized: " + shard; ShardRouting reinitializedShard = shard.reinitializeReplicaShard(); updateAssigned(shard, reinitializedShard); return reinitializedShard; } private void updateAssigned(ShardRouting oldShard, ShardRouting newShard) { assert oldShard.shardId().equals(newShard.shardId()) : "can only update " + oldShard + " by shard with same shard id but was " + newShard; assert oldShard.unassigned() == false && newShard.unassigned() == false : "only assigned shards can be updated in list of assigned shards (prev: " + oldShard + ", new: " + newShard + ")"; assert oldShard.currentNodeId().equals(newShard.currentNodeId()) : "shard to update " + oldShard + " can only update " + oldShard + " by shard assigned to same node but was " + newShard; node(oldShard.currentNodeId()).update(oldShard, newShard); List shardsWithMatchingShardId = assignedShards.computeIfAbsent(oldShard.shardId(), k -> new ArrayList<>()); int previousShardIndex = shardsWithMatchingShardId.indexOf(oldShard); assert previousShardIndex >= 0 : "shard to update " + oldShard + " does not exist in list of assigned shards"; shardsWithMatchingShardId.set(previousShardIndex, newShard); } private ShardRouting moveToUnassigned(ShardRouting shard, UnassignedInfo unassignedInfo) { assert shard.unassigned() == false : "only assigned shards can be moved to unassigned (" + shard + ")"; remove(shard); ShardRouting unassigned = shard.moveToUnassigned(unassignedInfo); unassignedShards.add(unassigned); return unassigned; } /** * Moves assigned primary to unassigned and demotes it to a replica. * Used in conjunction with {@link #promoteActiveReplicaShardToPrimary} when an active replica is promoted to primary. */ private ShardRouting movePrimaryToUnassignedAndDemoteToReplica(ShardRouting shard, UnassignedInfo unassignedInfo) { assert shard.unassigned() == false : "only assigned shards can be moved to unassigned (" + shard + ")"; assert shard.primary() : "only primary can be demoted to replica (" + shard + ")"; remove(shard); ShardRouting unassigned = shard.moveToUnassigned(unassignedInfo).moveUnassignedFromPrimary(); unassignedShards.add(unassigned); return unassigned; } /** * Returns the number of routing nodes */ public int size() { return nodesToShards.size(); } public static final class UnassignedShards implements Iterable { private final RoutingNodes nodes; private final List unassigned; private final List ignored; private int primaries = 0; private int ignoredPrimaries = 0; public UnassignedShards(RoutingNodes nodes) { this.nodes = nodes; unassigned = new ArrayList<>(); ignored = new ArrayList<>(); } public void add(ShardRouting shardRouting) { if(shardRouting.primary()) { primaries++; } unassigned.add(shardRouting); } public void sort(Comparator comparator) { nodes.ensureMutable(); CollectionUtil.timSort(unassigned, comparator); } /** * Returns the size of the non-ignored unassigned shards */ public int size() { return unassigned.size(); } /** * Returns the number of non-ignored unassigned primaries */ public int getNumPrimaries() { return primaries; } /** * Returns the number of temporarily marked as ignored unassigned primaries */ public int getNumIgnoredPrimaries() { return ignoredPrimaries; } @Override public UnassignedIterator iterator() { return new UnassignedIterator(); } /** * The list of ignored unassigned shards (read only). The ignored unassigned shards * are not part of the formal unassigned list, but are kept around and used to build * back the list of unassigned shards as part of the routing table. */ public List ignored() { return Collections.unmodifiableList(ignored); } /** * Marks a shard as temporarily ignored and adds it to the ignore unassigned list. * Should be used with caution, typically, * the correct usage is to removeAndIgnore from the iterator. * @see #ignored() * @see UnassignedIterator#removeAndIgnore(AllocationStatus, RoutingChangesObserver) * @see #isIgnoredEmpty() */ public void ignoreShard(ShardRouting shard, AllocationStatus allocationStatus, RoutingChangesObserver changes) { nodes.ensureMutable(); if (shard.primary()) { ignoredPrimaries++; UnassignedInfo currInfo = shard.unassignedInfo(); assert currInfo != null; if (allocationStatus.equals(currInfo.getLastAllocationStatus()) == false) { UnassignedInfo newInfo = new UnassignedInfo(currInfo.getReason(), currInfo.getMessage(), currInfo.getFailure(), currInfo.getNumFailedAllocations(), currInfo.getUnassignedTimeInNanos(), currInfo.getUnassignedTimeInMillis(), currInfo.isDelayed(), allocationStatus, currInfo.getFailedNodeIds()); ShardRouting updatedShard = shard.updateUnassigned(newInfo, shard.recoverySource()); changes.unassignedInfoUpdated(shard, newInfo); shard = updatedShard; } } ignored.add(shard); } public class UnassignedIterator implements Iterator { private final ListIterator iterator; private ShardRouting current; public UnassignedIterator() { this.iterator = unassigned.listIterator(); } @Override public boolean hasNext() { return iterator.hasNext(); } @Override public ShardRouting next() { return current = iterator.next(); } /** * Initializes the current unassigned shard and moves it from the unassigned list. * * @param existingAllocationId allocation id to use. If null, a fresh allocation id is generated. */ public ShardRouting initialize(String nodeId, @Nullable String existingAllocationId, long expectedShardSize, RoutingChangesObserver routingChangesObserver) { nodes.ensureMutable(); innerRemove(); return nodes.initializeShard(current, nodeId, existingAllocationId, expectedShardSize, routingChangesObserver); } /** * Removes and ignores the unassigned shard (will be ignored for this run, but * will be added back to unassigned once the metadata is constructed again). * Typically this is used when an allocation decision prevents a shard from being allocated such * that subsequent consumers of this API won't try to allocate this shard again. * * @param attempt the result of the allocation attempt */ public void removeAndIgnore(AllocationStatus attempt, RoutingChangesObserver changes) { nodes.ensureMutable(); innerRemove(); ignoreShard(current, attempt, changes); } private void updateShardRouting(ShardRouting shardRouting) { current = shardRouting; iterator.set(shardRouting); } /** * updates the unassigned info and recovery source on the current unassigned shard * * @param unassignedInfo the new unassigned info to use * @param recoverySource the new recovery source to use * @return the shard with unassigned info updated */ public ShardRouting updateUnassigned(UnassignedInfo unassignedInfo, RecoverySource recoverySource, RoutingChangesObserver changes) { nodes.ensureMutable(); ShardRouting updatedShardRouting = current.updateUnassigned(unassignedInfo, recoverySource); changes.unassignedInfoUpdated(current, unassignedInfo); updateShardRouting(updatedShardRouting); return updatedShardRouting; } /** * Unsupported operation, just there for the interface. Use * {@link #removeAndIgnore(AllocationStatus, RoutingChangesObserver)} or * {@link #initialize(String, String, long, RoutingChangesObserver)}. */ @Override public void remove() { throw new UnsupportedOperationException("remove is not supported in unassigned iterator," + " use removeAndIgnore or initialize"); } private void innerRemove() { iterator.remove(); if (current.primary()) { primaries--; } } } /** * Returns true iff this collection contains one or more non-ignored unassigned shards. */ public boolean isEmpty() { return unassigned.isEmpty(); } /** * Returns true iff any unassigned shards are marked as temporarily ignored. * @see UnassignedShards#ignoreShard(ShardRouting, AllocationStatus, RoutingChangesObserver) * @see UnassignedIterator#removeAndIgnore(AllocationStatus, RoutingChangesObserver) */ public boolean isIgnoredEmpty() { return ignored.isEmpty(); } public void shuffle() { nodes.ensureMutable(); Randomness.shuffle(unassigned); } /** * Drains all unassigned shards and returns it. * This method will not drain ignored shards. */ public ShardRouting[] drain() { nodes.ensureMutable(); ShardRouting[] mutableShardRoutings = unassigned.toArray(new ShardRouting[unassigned.size()]); unassigned.clear(); primaries = 0; return mutableShardRoutings; } } /** * Calculates RoutingNodes statistics by iterating over all {@link ShardRouting}s * in the cluster to ensure the book-keeping is correct. * For performance reasons, this should only be called from asserts * * @return this method always returns true or throws an assertion error. If assertion are not enabled * this method does nothing. */ public static boolean assertShardStats(RoutingNodes routingNodes) { if (!Assertions.ENABLED) { return true; } int unassignedPrimaryCount = 0; int unassignedIgnoredPrimaryCount = 0; int inactivePrimaryCount = 0; int inactiveShardCount = 0; int relocating = 0; Map indicesAndShards = new HashMap<>(); for (RoutingNode node : routingNodes) { for (ShardRouting shard : node) { if (shard.initializing() && shard.relocatingNodeId() == null) { inactiveShardCount++; if (shard.primary()) { inactivePrimaryCount++; } } if (shard.relocating()) { relocating++; } Integer i = indicesAndShards.get(shard.index()); if (i == null) { i = shard.id(); } indicesAndShards.put(shard.index(), Math.max(i, shard.id())); } } // Assert that the active shard routing are identical. Set> entries = indicesAndShards.entrySet(); final Map> shardsByShardId = new HashMap<>(); for (final RoutingNode routingNode: routingNodes) { for (final ShardRouting shardRouting : routingNode) { final HashSet shards = shardsByShardId.computeIfAbsent(new ShardId(shardRouting.index(), shardRouting.id()), k -> new HashSet<>()); shards.add(shardRouting); } } for (final Map.Entry e : entries) { final Index index = e.getKey(); for (int i = 0; i < e.getValue(); i++) { final ShardId shardId = new ShardId(index, i); final HashSet shards = shardsByShardId.get(shardId); final List mutableShardRoutings = routingNodes.assignedShards(shardId); assert (shards == null && mutableShardRoutings.size() == 0) || (shards != null && shards.size() == mutableShardRoutings.size() && shards.containsAll(mutableShardRoutings)); } } for (ShardRouting shard : routingNodes.unassigned()) { if (shard.primary()) { unassignedPrimaryCount++; } } for (ShardRouting shard : routingNodes.unassigned().ignored()) { if (shard.primary()) { unassignedIgnoredPrimaryCount++; } } for (Map.Entry recoveries : routingNodes.recoveriesPerNode.entrySet()) { String node = recoveries.getKey(); final Recoveries value = recoveries.getValue(); int incoming = 0; int outgoing = 0; RoutingNode routingNode = routingNodes.nodesToShards.get(node); if (routingNode != null) { // node might have dropped out of the cluster for (ShardRouting routing : routingNode) { if (routing.initializing()) { incoming++; } if (routing.primary() && routing.isRelocationTarget() == false) { for (ShardRouting assigned : routingNodes.assignedShards.get(routing.shardId())) { if (assigned.initializing() && assigned.recoverySource().getType() == RecoverySource.Type.PEER) { outgoing++; } } } } } assert incoming == value.incoming : incoming + " != " + value.incoming + " node: " + routingNode; assert outgoing == value.outgoing : outgoing + " != " + value.outgoing + " node: " + routingNode; } assert unassignedPrimaryCount == routingNodes.unassignedShards.getNumPrimaries() : "Unassigned primaries is [" + unassignedPrimaryCount + "] but RoutingNodes returned unassigned primaries [" + routingNodes.unassigned().getNumPrimaries() + "]"; assert unassignedIgnoredPrimaryCount == routingNodes.unassignedShards.getNumIgnoredPrimaries() : "Unassigned ignored primaries is [" + unassignedIgnoredPrimaryCount + "] but RoutingNodes returned unassigned ignored primaries [" + routingNodes.unassigned().getNumIgnoredPrimaries() + "]"; assert inactivePrimaryCount == routingNodes.inactivePrimaryCount : "Inactive Primary count [" + inactivePrimaryCount + "] but RoutingNodes returned inactive primaries [" + routingNodes.inactivePrimaryCount + "]"; assert inactiveShardCount == routingNodes.inactiveShardCount : "Inactive Shard count [" + inactiveShardCount + "] but RoutingNodes returned inactive shards [" + routingNodes.inactiveShardCount + "]"; assert routingNodes.getRelocatingShardCount() == relocating : "Relocating shards mismatch [" + routingNodes.getRelocatingShardCount() + "] but expected [" + relocating + "]"; return true; } private void ensureMutable() { if (readOnly) { throw new IllegalStateException("can't modify RoutingNodes - readonly"); } } /** * Creates an iterator over shards interleaving between nodes: The iterator returns the first shard from * the first node, then the first shard of the second node, etc. until one shard from each node has been returned. * The iterator then resumes on the first node by returning the second shard and continues until all shards from * all the nodes have been returned. */ public Iterator nodeInterleavedShardIterator() { final Queue> queue = new ArrayDeque<>(); for (Map.Entry entry : nodesToShards.entrySet()) { queue.add(entry.getValue().copyShards().iterator()); } return new Iterator() { public boolean hasNext() { while (!queue.isEmpty()) { if (queue.peek().hasNext()) { return true; } queue.poll(); } return false; } public ShardRouting next() { if (hasNext() == false) { throw new NoSuchElementException(); } Iterator iter = queue.poll(); ShardRouting result = iter.next(); queue.offer(iter); return result; } public void remove() { throw new UnsupportedOperationException(); } }; } private static final class Recoveries { private static final Recoveries EMPTY = new Recoveries(); private int incoming = 0; private int outgoing = 0; void addOutgoing(int howMany) { assert outgoing + howMany >= 0 : outgoing + howMany+ " must be >= 0"; outgoing += howMany; } void addIncoming(int howMany) { assert incoming + howMany >= 0 : incoming + howMany+ " must be >= 0"; incoming += howMany; } int getOutgoing() { return outgoing; } int getIncoming() { return incoming; } public static Recoveries getOrAdd(Map map, String key) { Recoveries recoveries = map.get(key); if (recoveries == null) { recoveries = new Recoveries(); map.put(key, recoveries); } return recoveries; } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy