All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.elasticsearch.action.support.replication.ReplicationOperation Maven / Gradle / Ivy

/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0 and the Server Side Public License, v 1; you may not use this file except
 * in compliance with, at your election, the Elastic License 2.0 or the Server
 * Side Public License, v 1.
 */
package org.elasticsearch.action.support.replication;

import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.message.ParameterizedMessage;
import org.apache.lucene.store.AlreadyClosedException;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.ExceptionsHelper;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.UnavailableShardsException;
import org.elasticsearch.action.support.ActiveShardCount;
import org.elasticsearch.action.support.RetryableAction;
import org.elasticsearch.action.support.TransportActions;
import org.elasticsearch.cluster.action.shard.ShardStateAction;
import org.elasticsearch.cluster.routing.IndexShardRoutingTable;
import org.elasticsearch.cluster.routing.ShardRouting;
import org.elasticsearch.common.breaker.CircuitBreakingException;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.util.concurrent.AbstractRunnable;
import org.elasticsearch.common.util.concurrent.EsRejectedExecutionException;
import org.elasticsearch.core.Nullable;
import org.elasticsearch.core.TimeValue;
import org.elasticsearch.index.seqno.SequenceNumbers;
import org.elasticsearch.index.shard.ReplicationGroup;
import org.elasticsearch.index.shard.ShardId;
import org.elasticsearch.node.NodeClosedException;
import org.elasticsearch.rest.RestStatus;
import org.elasticsearch.threadpool.ThreadPool;
import org.elasticsearch.transport.ConnectTransportException;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.LongSupplier;

public class ReplicationOperation<
    Request extends ReplicationRequest,
    ReplicaRequest extends ReplicationRequest,
    PrimaryResultT extends ReplicationOperation.PrimaryResult> {
    private final Logger logger;
    private final ThreadPool threadPool;
    private final Request request;
    private final String opType;
    private final AtomicInteger totalShards = new AtomicInteger();
    /**
     * The number of pending sub-operations in this operation. This is incremented when the following operations start and decremented when
     * they complete:
     * 
    *
  • The operation on the primary
  • *
  • The operation on each replica
  • *
  • Coordination of the operation as a whole. This prevents the operation from terminating early if we haven't started any replica * operations and the primary finishes.
  • *
*/ private final AtomicInteger pendingActions = new AtomicInteger(); private final AtomicInteger successfulShards = new AtomicInteger(); private final Primary primary; private final Replicas replicasProxy; private final AtomicBoolean finished = new AtomicBoolean(); private final TimeValue initialRetryBackoffBound; private final TimeValue retryTimeout; private final long primaryTerm; // exposed for tests private final ActionListener resultListener; private volatile PrimaryResultT primaryResult = null; private final List shardReplicaFailures = Collections.synchronizedList(new ArrayList<>()); public ReplicationOperation( Request request, Primary primary, ActionListener listener, Replicas replicas, Logger logger, ThreadPool threadPool, String opType, long primaryTerm, TimeValue initialRetryBackoffBound, TimeValue retryTimeout ) { this.replicasProxy = replicas; this.primary = primary; this.resultListener = listener; this.logger = logger; this.threadPool = threadPool; this.request = request; this.opType = opType; this.primaryTerm = primaryTerm; this.initialRetryBackoffBound = initialRetryBackoffBound; this.retryTimeout = retryTimeout; } public void execute() throws Exception { final String activeShardCountFailure = checkActiveShardCount(); final ShardRouting primaryRouting = primary.routingEntry(); final ShardId primaryId = primaryRouting.shardId(); if (activeShardCountFailure != null) { finishAsFailed( new UnavailableShardsException( primaryId, "{} Timeout: [{}], request: [{}]", activeShardCountFailure, request.timeout(), request ) ); return; } totalShards.incrementAndGet(); pendingActions.incrementAndGet(); // increase by 1 until we finish all primary coordination primary.perform(request, ActionListener.wrap(this::handlePrimaryResult, this::finishAsFailed)); } private void handlePrimaryResult(final PrimaryResultT primaryResult) { this.primaryResult = primaryResult; final ReplicaRequest replicaRequest = primaryResult.replicaRequest(); if (replicaRequest != null) { if (logger.isTraceEnabled()) { logger.trace("[{}] op [{}] completed on primary for request [{}]", primary.routingEntry().shardId(), opType, request); } // we have to get the replication group after successfully indexing into the primary in order to honour recovery semantics. // we have to make sure that every operation indexed into the primary after recovery start will also be replicated // to the recovery target. If we used an old replication group, we may miss a recovery that has started since then. // we also have to make sure to get the global checkpoint before the replication group, to ensure that the global checkpoint // is valid for this replication group. If we would sample in the reverse, the global checkpoint might be based on a subset // of the sampled replication group, and advanced further than what the given replication group would allow it to. // This would entail that some shards could learn about a global checkpoint that would be higher than its local checkpoint. final long globalCheckpoint = primary.computedGlobalCheckpoint(); // we have to capture the max_seq_no_of_updates after this request was completed on the primary to make sure the value of // max_seq_no_of_updates on replica when this request is executed is at least the value on the primary when it was executed // on. final long maxSeqNoOfUpdatesOrDeletes = primary.maxSeqNoOfUpdatesOrDeletes(); assert maxSeqNoOfUpdatesOrDeletes != SequenceNumbers.UNASSIGNED_SEQ_NO : "seqno_of_updates still uninitialized"; final ReplicationGroup replicationGroup = primary.getReplicationGroup(); final PendingReplicationActions pendingReplicationActions = primary.getPendingReplicationActions(); markUnavailableShardsAsStale(replicaRequest, replicationGroup); performOnReplicas(replicaRequest, globalCheckpoint, maxSeqNoOfUpdatesOrDeletes, replicationGroup, pendingReplicationActions); } primaryResult.runPostReplicationActions(new ActionListener() { @Override public void onResponse(Void aVoid) { successfulShards.incrementAndGet(); updateCheckPoints( primary.routingEntry(), primary::localCheckpoint, primary::globalCheckpoint, () -> decPendingAndFinishIfNeeded() ); } @Override public void onFailure(Exception e) { logger.trace("[{}] op [{}] post replication actions failed for [{}]", primary.routingEntry().shardId(), opType, request); // TODO: fail shard? This will otherwise have the local / global checkpoint info lagging, or possibly have replicas // go out of sync with the primary finishAsFailed(e); } }); } private void markUnavailableShardsAsStale(ReplicaRequest replicaRequest, ReplicationGroup replicationGroup) { // if inSyncAllocationIds contains allocation ids of shards that don't exist in RoutingTable, mark copies as stale for (String allocationId : replicationGroup.getUnavailableInSyncShards()) { pendingActions.incrementAndGet(); replicasProxy.markShardCopyAsStaleIfNeeded( replicaRequest.shardId(), allocationId, primaryTerm, ActionListener.wrap(r -> decPendingAndFinishIfNeeded(), ReplicationOperation.this::onNoLongerPrimary) ); } } private void performOnReplicas( final ReplicaRequest replicaRequest, final long globalCheckpoint, final long maxSeqNoOfUpdatesOrDeletes, final ReplicationGroup replicationGroup, final PendingReplicationActions pendingReplicationActions ) { // for total stats, add number of unassigned shards and // number of initializing shards that are not ready yet to receive operations (recovery has not opened engine yet on the target) totalShards.addAndGet(replicationGroup.getSkippedShards().size()); final ShardRouting primaryRouting = primary.routingEntry(); for (final ShardRouting shard : replicationGroup.getReplicationTargets()) { if (shard.isSameAllocation(primaryRouting) == false) { performOnReplica(shard, replicaRequest, globalCheckpoint, maxSeqNoOfUpdatesOrDeletes, pendingReplicationActions); } } } private void performOnReplica( final ShardRouting shard, final ReplicaRequest replicaRequest, final long globalCheckpoint, final long maxSeqNoOfUpdatesOrDeletes, final PendingReplicationActions pendingReplicationActions ) { if (logger.isTraceEnabled()) { logger.trace("[{}] sending op [{}] to replica {} for request [{}]", shard.shardId(), opType, shard, replicaRequest); } totalShards.incrementAndGet(); pendingActions.incrementAndGet(); final ActionListener replicationListener = new ActionListener() { @Override public void onResponse(ReplicaResponse response) { successfulShards.incrementAndGet(); updateCheckPoints(shard, response::localCheckpoint, response::globalCheckpoint, () -> decPendingAndFinishIfNeeded()); } @Override public void onFailure(Exception replicaException) { logger.trace( () -> new ParameterizedMessage( "[{}] failure while performing [{}] on replica {}, request [{}]", shard.shardId(), opType, shard, replicaRequest ), replicaException ); // Only report "critical" exceptions - TODO: Reach out to the master node to get the latest shard state then report. if (TransportActions.isShardNotAvailableException(replicaException) == false) { RestStatus restStatus = ExceptionsHelper.status(replicaException); shardReplicaFailures.add( new ReplicationResponse.ShardInfo.Failure( shard.shardId(), shard.currentNodeId(), replicaException, restStatus, false ) ); } String message = String.format(Locale.ROOT, "failed to perform %s on replica %s", opType, shard); replicasProxy.failShardIfNeeded( shard, primaryTerm, message, replicaException, ActionListener.wrap(r -> decPendingAndFinishIfNeeded(), ReplicationOperation.this::onNoLongerPrimary) ); } @Override public String toString() { return "[" + replicaRequest + "][" + shard + "]"; } }; final String allocationId = shard.allocationId().getId(); final RetryableAction replicationAction = new RetryableAction( logger, threadPool, initialRetryBackoffBound, retryTimeout, replicationListener ) { @Override public void tryAction(ActionListener listener) { replicasProxy.performOn(shard, replicaRequest, primaryTerm, globalCheckpoint, maxSeqNoOfUpdatesOrDeletes, listener); } @Override public void onFinished() { super.onFinished(); pendingReplicationActions.removeReplicationAction(allocationId, this); } @Override public boolean shouldRetry(Exception e) { final Throwable cause = ExceptionsHelper.unwrapCause(e); return cause instanceof CircuitBreakingException || cause instanceof EsRejectedExecutionException || cause instanceof ConnectTransportException; } }; pendingReplicationActions.addPendingAction(allocationId, replicationAction); replicationAction.run(); } private void updateCheckPoints( ShardRouting shard, LongSupplier localCheckpointSupplier, LongSupplier globalCheckpointSupplier, Runnable onCompletion ) { boolean forked = false; try { primary.updateLocalCheckpointForShard(shard.allocationId().getId(), localCheckpointSupplier.getAsLong()); primary.updateGlobalCheckpointForShard(shard.allocationId().getId(), globalCheckpointSupplier.getAsLong()); } catch (final AlreadyClosedException e) { // the index was deleted or this shard was never activated after a relocation; fall through and finish normally } catch (final Exception e) { threadPool.executor(ThreadPool.Names.WRITE).execute(new AbstractRunnable() { @Override public void onFailure(Exception e) { assert false : e; } @Override public boolean isForceExecution() { return true; } @Override protected void doRun() { // fail the primary but fall through and let the rest of operation processing complete primary.failShard(String.format(Locale.ROOT, "primary failed updating local checkpoint for replica %s", shard), e); } @Override public void onAfter() { onCompletion.run(); } }); forked = true; } finally { if (forked == false) { onCompletion.run(); } } } private void onNoLongerPrimary(Exception failure) { final Throwable cause = ExceptionsHelper.unwrapCause(failure); final boolean nodeIsClosing = cause instanceof NodeClosedException; if (nodeIsClosing) { // We prefer not to fail the primary to avoid unnecessary warning log // when the node with the primary shard is gracefully shutting down. finishAsFailed( new RetryOnPrimaryException( primary.routingEntry().shardId(), String.format( Locale.ROOT, "node with primary [%s] is shutting down while failing replica shard", primary.routingEntry() ), failure ) ); } else { assert failure instanceof ShardStateAction.NoLongerPrimaryShardException : failure; threadPool.executor(ThreadPool.Names.WRITE).execute(new AbstractRunnable() { @Override protected void doRun() { // we are no longer the primary, fail ourselves and start over final String message = String.format( Locale.ROOT, "primary shard [%s] was demoted while failing replica shard", primary.routingEntry() ); primary.failShard(message, failure); finishAsFailed(new RetryOnPrimaryException(primary.routingEntry().shardId(), message, failure)); } @Override public boolean isForceExecution() { return true; } @Override public void onFailure(Exception e) { e.addSuppressed(failure); assert false : e; logger.error(new ParameterizedMessage("unexpected failure while failing primary [{}]", primary.routingEntry()), e); finishAsFailed( new RetryOnPrimaryException( primary.routingEntry().shardId(), String.format(Locale.ROOT, "unexpected failure while failing primary [%s]", primary.routingEntry()), e ) ); } }); } } /** * Checks whether we can perform a write based on the required active shard count setting. * Returns **null* if OK to proceed, or a string describing the reason to stop */ protected String checkActiveShardCount() { final ShardId shardId = primary.routingEntry().shardId(); final ActiveShardCount waitForActiveShards = request.waitForActiveShards(); if (waitForActiveShards == ActiveShardCount.NONE) { return null; // not waiting for any shards } final IndexShardRoutingTable shardRoutingTable = primary.getReplicationGroup().getRoutingTable(); if (waitForActiveShards.enoughShardsActive(shardRoutingTable)) { return null; } else { final String resolvedShards = waitForActiveShards == ActiveShardCount.ALL ? Integer.toString(shardRoutingTable.shards().size()) : waitForActiveShards.toString(); logger.trace( "[{}] not enough active copies to meet shard count of [{}] (have {}, needed {}), scheduling a retry. op [{}], " + "request [{}]", shardId, waitForActiveShards, shardRoutingTable.activeShards().size(), resolvedShards, opType, request ); return "Not enough active copies to meet shard count of [" + waitForActiveShards + "] (have " + shardRoutingTable.activeShards().size() + ", needed " + resolvedShards + ")."; } } private void decPendingAndFinishIfNeeded() { assert pendingActions.get() > 0 : "pending action count goes below 0 for request [" + request + "]"; if (pendingActions.decrementAndGet() == 0) { finish(); } } private void finish() { if (finished.compareAndSet(false, true)) { final ReplicationResponse.ShardInfo.Failure[] failuresArray; if (shardReplicaFailures.isEmpty()) { failuresArray = ReplicationResponse.EMPTY; } else { failuresArray = new ReplicationResponse.ShardInfo.Failure[shardReplicaFailures.size()]; shardReplicaFailures.toArray(failuresArray); } primaryResult.setShardInfo(new ReplicationResponse.ShardInfo(totalShards.get(), successfulShards.get(), failuresArray)); resultListener.onResponse(primaryResult); } } private void finishAsFailed(Exception exception) { if (finished.compareAndSet(false, true)) { resultListener.onFailure(exception); } } /** * An encapsulation of an operation that is to be performed on the primary shard */ public interface Primary< RequestT extends ReplicationRequest, ReplicaRequestT extends ReplicationRequest, PrimaryResultT extends PrimaryResult> { /** * routing entry for this primary */ ShardRouting routingEntry(); /** * Fail the primary shard. * * @param message the failure message * @param exception the exception that triggered the failure */ void failShard(String message, Exception exception); /** * Performs the given request on this primary. Yes, this returns as soon as it can with the request for the replicas and calls a * listener when the primary request is completed. Yes, the primary request might complete before the method returns. Yes, it might * also complete after. Deal with it. * * @param request the request to perform * @param listener result listener */ void perform(RequestT request, ActionListener listener); /** * Notifies the primary of a local checkpoint for the given allocation. * * Note: The primary will use this information to advance the global checkpoint if possible. * * @param allocationId allocation ID of the shard corresponding to the supplied local checkpoint * @param checkpoint the *local* checkpoint for the shard */ void updateLocalCheckpointForShard(String allocationId, long checkpoint); /** * Update the local knowledge of the global checkpoint for the specified allocation ID. * * @param allocationId the allocation ID to update the global checkpoint for * @param globalCheckpoint the global checkpoint */ void updateGlobalCheckpointForShard(String allocationId, long globalCheckpoint); /** * Returns the persisted local checkpoint on the primary shard. * * @return the local checkpoint */ long localCheckpoint(); /** * Returns the global checkpoint computed on the primary shard. * * @return the computed global checkpoint */ long computedGlobalCheckpoint(); /** * Returns the persisted global checkpoint on the primary shard. * * @return the persisted global checkpoint */ long globalCheckpoint(); /** * Returns the maximum seq_no of updates (index operations overwrite Lucene) or deletes on the primary. * This value must be captured after the execution of a replication request on the primary is completed. */ long maxSeqNoOfUpdatesOrDeletes(); /** * Returns the current replication group on the primary shard * * @return the replication group */ ReplicationGroup getReplicationGroup(); /** * Returns the pending replication actions on the primary shard * * @return the pending replication actions */ PendingReplicationActions getPendingReplicationActions(); } /** * An encapsulation of an operation that will be executed on the replica shards, if present. */ public interface Replicas> { /** * Performs the specified request on the specified replica. * * @param replica the shard this request should be executed on * @param replicaRequest the operation to perform * @param primaryTerm the primary term * @param globalCheckpoint the global checkpoint on the primary * @param maxSeqNoOfUpdatesOrDeletes the max seq_no of updates (index operations overwriting Lucene) or deletes on primary * after this replication was executed on it. * @param listener callback for handling the response or failure */ void performOn( ShardRouting replica, RequestT replicaRequest, long primaryTerm, long globalCheckpoint, long maxSeqNoOfUpdatesOrDeletes, ActionListener listener ); /** * Fail the specified shard if needed, removing it from the current set * of active shards. Whether a failure is needed is left up to the * implementation. * * @param replica shard to fail * @param primaryTerm the primary term * @param message a (short) description of the reason * @param exception the original exception which caused the ReplicationOperation to request the shard to be failed * @param listener a listener that will be notified when the failing shard has been removed from the in-sync set */ void failShardIfNeeded(ShardRouting replica, long primaryTerm, String message, Exception exception, ActionListener listener); /** * Marks shard copy as stale if needed, removing its allocation id from * the set of in-sync allocation ids. Whether marking as stale is needed * is left up to the implementation. * * @param shardId shard id * @param allocationId allocation id to remove from the set of in-sync allocation ids * @param primaryTerm the primary term * @param listener a listener that will be notified when the failing shard has been removed from the in-sync set */ void markShardCopyAsStaleIfNeeded(ShardId shardId, String allocationId, long primaryTerm, ActionListener listener); } /** * An interface to encapsulate the metadata needed from replica shards when they respond to operations performed on them. */ public interface ReplicaResponse { /** * The persisted local checkpoint for the shard. * * @return the persisted local checkpoint **/ long localCheckpoint(); /** * The persisted global checkpoint for the shard. * * @return the persisted global checkpoint **/ long globalCheckpoint(); } public static class RetryOnPrimaryException extends ElasticsearchException { RetryOnPrimaryException(ShardId shardId, String msg) { this(shardId, msg, null); } RetryOnPrimaryException(ShardId shardId, String msg, Throwable cause) { super(msg, cause); setShard(shardId); } public RetryOnPrimaryException(StreamInput in) throws IOException { super(in); } } public interface PrimaryResult> { /** * @return null if no operation needs to be sent to a replica * (for example when the operation failed on the primary due to a parsing exception) */ @Nullable RequestT replicaRequest(); void setShardInfo(ReplicationResponse.ShardInfo shardInfo); /** * Run actions to be triggered post replication * @param listener calllback that is invoked after post replication actions have completed * */ void runPostReplicationActions(ActionListener listener); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy