All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.opensearch.cluster.routing.allocation.IndexMetadataUpdater Maven / Gradle / Ivy

There is a newer version: 2.18.0
Show newest version
/*
 * SPDX-License-Identifier: Apache-2.0
 *
 * The OpenSearch Contributors require contributions made to
 * this file be licensed under the Apache-2.0 license or a
 * compatible open source license.
 */

/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*
 * Modifications Copyright OpenSearch Contributors. See
 * GitHub history for details.
 */

package org.opensearch.cluster.routing.allocation;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.opensearch.cluster.ClusterState;
import org.opensearch.cluster.metadata.IndexMetadata;
import org.opensearch.cluster.metadata.Metadata;
import org.opensearch.cluster.node.DiscoveryNodes;
import org.opensearch.cluster.routing.IndexShardRoutingTable;
import org.opensearch.cluster.routing.RecoverySource;
import org.opensearch.cluster.routing.RoutingChangesObserver;
import org.opensearch.cluster.routing.RoutingTable;
import org.opensearch.cluster.routing.ShardRouting;
import org.opensearch.cluster.routing.UnassignedInfo;
import org.opensearch.common.util.set.Sets;
import org.opensearch.core.index.Index;
import org.opensearch.core.index.shard.ShardId;
import org.opensearch.index.remote.RemoteMigrationIndexMetadataUpdater;

import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;

/**
 * Observer that tracks changes made to RoutingNodes in order to update the primary terms and in-sync allocation ids in
 * {@link IndexMetadata} once the allocation round has completed.
 * 

* Primary terms are updated on primary initialization or when an active primary fails. *

* Allocation ids are added for shards that become active and removed for shards that stop being active. * * @opensearch.internal */ public class IndexMetadataUpdater extends RoutingChangesObserver.AbstractRoutingChangesObserver { private final Logger logger = LogManager.getLogger(IndexMetadataUpdater.class); private final Map shardChanges = new HashMap<>(); private boolean ongoingRemoteStoreMigration = false; @Override public void shardInitialized(ShardRouting unassignedShard, ShardRouting initializedShard) { assert initializedShard.isRelocationTarget() == false : "shardInitialized is not called on relocation target: " + initializedShard; if (initializedShard.primary()) { increasePrimaryTerm(initializedShard.shardId()); Updates updates = changes(initializedShard.shardId()); assert updates.initializedPrimary == null : "Primary cannot be initialized more than once in same allocation round: " + "(previous: " + updates.initializedPrimary + ", next: " + initializedShard + ")"; updates.initializedPrimary = initializedShard; } } @Override public void shardStarted(ShardRouting initializingShard, ShardRouting startedShard) { assert Objects.equals(initializingShard.allocationId().getId(), startedShard.allocationId().getId()) : "initializingShard.allocationId [" + initializingShard.allocationId().getId() + "] and startedShard.allocationId [" + startedShard.allocationId().getId() + "] have to have the same"; Updates updates = changes(startedShard.shardId()); updates.addedAllocationIds.add(startedShard.allocationId().getId()); if (startedShard.primary() // started shard has to have null recoverySource; have to pick up recoverySource from its initializing state && (initializingShard.recoverySource() == RecoverySource.ExistingStoreRecoverySource.FORCE_STALE_PRIMARY_INSTANCE)) { updates.removedAllocationIds.add(RecoverySource.ExistingStoreRecoverySource.FORCED_ALLOCATION_ID); } } @Override public void shardFailed(ShardRouting failedShard, UnassignedInfo unassignedInfo) { if (failedShard.active() && failedShard.primary()) { Updates updates = changes(failedShard.shardId()); if (updates.firstFailedPrimary == null) { // more than one primary can be failed (because of batching, primary can be failed, replica promoted and then failed...) updates.firstFailedPrimary = failedShard; } increasePrimaryTerm(failedShard.shardId()); } // Track change through shardChanges Map regardless of above-mentioned conditions // To be used to update index metadata while computing new cluster state if (ongoingRemoteStoreMigration) { changes(failedShard.shardId()); } } @Override public void relocationCompleted(ShardRouting removedRelocationSource) { removeAllocationId(removedRelocationSource); } /** * Adds the target {@link ShardRouting} to the tracking updates set. * Used to track started relocations while applying changes to the new {@link ClusterState} */ @Override public void relocationStarted(ShardRouting startedShard, ShardRouting targetRelocatingShard) { // Store change in shardChanges Map regardless of above-mentioned conditions // To be used to update index metadata while computing new cluster state if (ongoingRemoteStoreMigration) { changes(targetRelocatingShard.shardId()); } } /** * Updates the current {@link Metadata} based on the changes of this RoutingChangesObserver. Specifically * we update {@link IndexMetadata#getInSyncAllocationIds()} and {@link IndexMetadata#primaryTerm(int)} based on * the changes made during this allocation. *
* Manipulates index settings or index metadata during an ongoing remote store migration * * @param oldMetadata {@link Metadata} object from before the routing nodes was changed. * @param newRoutingTable {@link RoutingTable} object after routing changes were applied. * @return adapted {@link Metadata}, potentially the original one if no change was needed. */ public Metadata applyChanges(Metadata oldMetadata, RoutingTable newRoutingTable, DiscoveryNodes discoveryNodes) { Map>> changesGroupedByIndex = shardChanges.entrySet() .stream() .collect(Collectors.groupingBy(e -> e.getKey().getIndex())); Metadata.Builder metadataBuilder = null; for (Map.Entry>> indexChanges : changesGroupedByIndex.entrySet()) { Index index = indexChanges.getKey(); final IndexMetadata oldIndexMetadata = oldMetadata.getIndexSafe(index); IndexMetadata.Builder indexMetadataBuilder = null; for (Map.Entry shardEntry : indexChanges.getValue()) { ShardId shardId = shardEntry.getKey(); Updates updates = shardEntry.getValue(); indexMetadataBuilder = updateInSyncAllocations(newRoutingTable, oldIndexMetadata, indexMetadataBuilder, shardId, updates); indexMetadataBuilder = updatePrimaryTerm(oldIndexMetadata, indexMetadataBuilder, shardId, updates); if (ongoingRemoteStoreMigration) { RemoteMigrationIndexMetadataUpdater migrationImdUpdater = new RemoteMigrationIndexMetadataUpdater( discoveryNodes, newRoutingTable, oldIndexMetadata, oldMetadata.settings(), logger ); migrationImdUpdater.maybeUpdateRemoteStoreCustomMetadata(indexMetadataBuilder, index.getName()); migrationImdUpdater.maybeAddRemoteIndexSettings(indexMetadataBuilder, index.getName()); } } if (indexMetadataBuilder != null) { if (metadataBuilder == null) { metadataBuilder = Metadata.builder(oldMetadata); } metadataBuilder.put(indexMetadataBuilder); } } if (metadataBuilder != null) { return metadataBuilder.build(); } else { return oldMetadata; } } /** * Updates in-sync allocations with routing changes that were made to the routing table. */ private IndexMetadata.Builder updateInSyncAllocations( RoutingTable newRoutingTable, IndexMetadata oldIndexMetadata, IndexMetadata.Builder indexMetadataBuilder, ShardId shardId, Updates updates ) { assert Sets.haveEmptyIntersection(updates.addedAllocationIds, updates.removedAllocationIds) : "allocation ids cannot be both added and removed in the same allocation round, added ids: " + updates.addedAllocationIds + ", removed ids: " + updates.removedAllocationIds; Set oldInSyncAllocationIds = oldIndexMetadata.inSyncAllocationIds(shardId.id()); // check if we have been force-initializing an empty primary or a stale primary if (updates.initializedPrimary != null && oldInSyncAllocationIds.isEmpty() == false && oldInSyncAllocationIds.contains(updates.initializedPrimary.allocationId().getId()) == false) { // we're not reusing an existing in-sync allocation id to initialize a primary, which means that we're either force-allocating // an empty or a stale primary (see AllocateEmptyPrimaryAllocationCommand or AllocateStalePrimaryAllocationCommand). RecoverySource recoverySource = updates.initializedPrimary.recoverySource(); RecoverySource.Type recoverySourceType = recoverySource.getType(); boolean emptyPrimary = recoverySourceType == RecoverySource.Type.EMPTY_STORE; assert updates.addedAllocationIds.isEmpty() : (emptyPrimary ? "empty" : "stale") + " primary is not force-initialized in same allocation round where shards are started"; if (indexMetadataBuilder == null) { indexMetadataBuilder = IndexMetadata.builder(oldIndexMetadata); } if (emptyPrimary) { // forcing an empty primary resets the in-sync allocations to the empty set (ShardRouting.allocatedPostIndexCreate) indexMetadataBuilder.putInSyncAllocationIds(shardId.id(), Collections.emptySet()); } else { final String allocationId; if (recoverySource == RecoverySource.ExistingStoreRecoverySource.FORCE_STALE_PRIMARY_INSTANCE) { allocationId = RecoverySource.ExistingStoreRecoverySource.FORCED_ALLOCATION_ID; } else { assert (recoverySource instanceof RecoverySource.SnapshotRecoverySource || recoverySource instanceof RecoverySource.RemoteStoreRecoverySource) : recoverySource; allocationId = updates.initializedPrimary.allocationId().getId(); } // forcing a stale primary resets the in-sync allocations to the singleton set with the stale id indexMetadataBuilder.putInSyncAllocationIds(shardId.id(), Collections.singleton(allocationId)); } } else { // standard path for updating in-sync ids Set inSyncAllocationIds = new HashSet<>(oldInSyncAllocationIds); inSyncAllocationIds.addAll(updates.addedAllocationIds); inSyncAllocationIds.removeAll(updates.removedAllocationIds); assert oldInSyncAllocationIds.contains(RecoverySource.ExistingStoreRecoverySource.FORCED_ALLOCATION_ID) == false || inSyncAllocationIds.contains(RecoverySource.ExistingStoreRecoverySource.FORCED_ALLOCATION_ID) == false : "fake allocation id has to be removed, inSyncAllocationIds:" + inSyncAllocationIds; // Prevent set of inSyncAllocationIds to grow unboundedly. This can happen for example if we don't write to a primary // but repeatedly shut down nodes that have active replicas. // We use number_of_replicas + 1 (= possible active shard copies) to bound the inSyncAllocationIds set // Only trim the set of allocation ids when it grows, otherwise we might trim too eagerly when the number // of replicas was decreased while shards were unassigned. int maxActiveShards = oldIndexMetadata.getNumberOfReplicas() + 1; // +1 for the primary IndexShardRoutingTable newShardRoutingTable = newRoutingTable.shardRoutingTable(shardId); assert newShardRoutingTable.assignedShards() .stream() .filter(ShardRouting::isRelocationTarget) .map(s -> s.allocationId().getId()) .noneMatch(inSyncAllocationIds::contains) : newShardRoutingTable.assignedShards() + " vs " + inSyncAllocationIds; if (inSyncAllocationIds.size() > oldInSyncAllocationIds.size() && inSyncAllocationIds.size() > maxActiveShards) { // trim entries that have no corresponding shard routing in the cluster state (i.e. trim unavailable copies) List assignedShards = newShardRoutingTable.assignedShards() .stream() .filter(s -> s.isRelocationTarget() == false) .collect(Collectors.toList()); assert assignedShards.size() <= maxActiveShards : "cannot have more assigned shards " + assignedShards + " than maximum possible active shards " + maxActiveShards; Set assignedAllocations = assignedShards.stream().map(s -> s.allocationId().getId()).collect(Collectors.toSet()); inSyncAllocationIds = inSyncAllocationIds.stream() .sorted(Comparator.comparing(assignedAllocations::contains).reversed()) // values with routing entries first .limit(maxActiveShards) .collect(Collectors.toSet()); } // only remove allocation id of failed active primary if there is at least one active shard remaining. Assume for example that // the primary fails but there is no new primary to fail over to. If we were to remove the allocation id of the primary from the // in-sync set, this could create an empty primary on the next allocation. if (newShardRoutingTable.activeShards().isEmpty() && updates.firstFailedPrimary != null) { // add back allocation id of failed primary inSyncAllocationIds.add(updates.firstFailedPrimary.allocationId().getId()); } assert inSyncAllocationIds.isEmpty() == false || oldInSyncAllocationIds.isEmpty() : "in-sync allocations cannot become empty after they have been non-empty: " + oldInSyncAllocationIds; // be extra safe here and only update in-sync set if it is non-empty if (inSyncAllocationIds.isEmpty() == false) { if (indexMetadataBuilder == null) { indexMetadataBuilder = IndexMetadata.builder(oldIndexMetadata); } indexMetadataBuilder.putInSyncAllocationIds(shardId.id(), inSyncAllocationIds); } } return indexMetadataBuilder; } /** * Removes allocation ids from the in-sync set for shard copies for which there is no routing entries in the routing table. * This method is called in AllocationService before any changes to the routing table are made. */ public static ClusterState removeStaleIdsWithoutRoutings(ClusterState clusterState, List staleShards, Logger logger) { Metadata oldMetadata = clusterState.metadata(); RoutingTable oldRoutingTable = clusterState.routingTable(); Metadata.Builder metadataBuilder = null; // group staleShards entries by index for (Map.Entry> indexEntry : staleShards.stream() .collect(Collectors.groupingBy(fs -> fs.getShardId().getIndex())) .entrySet()) { final IndexMetadata oldIndexMetadata = oldMetadata.getIndexSafe(indexEntry.getKey()); IndexMetadata.Builder indexMetadataBuilder = null; // group staleShards entries by shard id for (Map.Entry> shardEntry : indexEntry.getValue() .stream() .collect(Collectors.groupingBy(staleShard -> staleShard.getShardId())) .entrySet()) { int shardNumber = shardEntry.getKey().getId(); Set oldInSyncAllocations = oldIndexMetadata.inSyncAllocationIds(shardNumber); Set idsToRemove = shardEntry.getValue().stream().map(e -> e.getAllocationId()).collect(Collectors.toSet()); assert idsToRemove.stream().allMatch(id -> oldRoutingTable.getByAllocationId(shardEntry.getKey(), id) == null) : "removing stale ids: " + idsToRemove + ", some of which have still a routing entry: " + oldRoutingTable; Set remainingInSyncAllocations = Sets.difference(oldInSyncAllocations, idsToRemove); assert remainingInSyncAllocations.isEmpty() == false : "Set of in-sync ids cannot become empty for shard " + shardEntry.getKey() + " (before: " + oldInSyncAllocations + ", ids to remove: " + idsToRemove + ")"; // be extra safe here: if the in-sync set were to become empty, this would create an empty primary on the next allocation // (see ShardRouting#allocatedPostIndexCreate) if (remainingInSyncAllocations.isEmpty() == false) { if (indexMetadataBuilder == null) { indexMetadataBuilder = IndexMetadata.builder(oldIndexMetadata); } indexMetadataBuilder.putInSyncAllocationIds(shardNumber, remainingInSyncAllocations); } logger.warn("{} marking unavailable shards as stale: {}", shardEntry.getKey(), idsToRemove); } if (indexMetadataBuilder != null) { if (metadataBuilder == null) { metadataBuilder = Metadata.builder(oldMetadata); } metadataBuilder.put(indexMetadataBuilder); } } if (metadataBuilder != null) { return ClusterState.builder(clusterState).metadata(metadataBuilder).build(); } else { return clusterState; } } /** * Increases the primary term if {@link #increasePrimaryTerm} was called for this shard id. */ private IndexMetadata.Builder updatePrimaryTerm( IndexMetadata oldIndexMetadata, IndexMetadata.Builder indexMetadataBuilder, ShardId shardId, Updates updates ) { if (updates.increaseTerm) { if (indexMetadataBuilder == null) { indexMetadataBuilder = IndexMetadata.builder(oldIndexMetadata); } indexMetadataBuilder.primaryTerm(shardId.id(), oldIndexMetadata.primaryTerm(shardId.id()) + 1); } return indexMetadataBuilder; } /** * Helper method that creates update entry for the given shard id if such an entry does not exist yet. */ private Updates changes(ShardId shardId) { return shardChanges.computeIfAbsent(shardId, k -> new Updates()); } /** * Remove allocation id of this shard from the set of in-sync shard copies */ void removeAllocationId(ShardRouting shardRouting) { if (shardRouting.active()) { changes(shardRouting.shardId()).removedAllocationIds.add(shardRouting.allocationId().getId()); } } /** * Increase primary term for this shard id */ private void increasePrimaryTerm(ShardId shardId) { changes(shardId).increaseTerm = true; } public void setOngoingRemoteStoreMigration(boolean ongoingRemoteStoreMigration) { this.ongoingRemoteStoreMigration = ongoingRemoteStoreMigration; } private static class Updates { private boolean increaseTerm; // whether primary term should be increased private Set addedAllocationIds = new HashSet<>(); // allocation ids that should be added to the in-sync set private Set removedAllocationIds = new HashSet<>(); // allocation ids that should be removed from the in-sync set private ShardRouting initializedPrimary = null; // primary that was initialized from unassigned private ShardRouting firstFailedPrimary = null; // first active primary that was failed } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy