org.elasticsearch.action.support.replication.ReplicationOperation Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of elasticsearch Show documentation
Elasticsearch - Open Source, Distributed, RESTful Search Engine
There is a newer version: 8.16.0
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.elasticsearch.action.support.replication;

import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.message.ParameterizedMessage;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.ExceptionsHelper;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.UnavailableShardsException;
import org.elasticsearch.action.support.ActiveShardCount;
import org.elasticsearch.action.support.TransportActions;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.cluster.routing.AllocationId;
import org.elasticsearch.cluster.routing.IndexRoutingTable;
import org.elasticsearch.cluster.routing.IndexShardRoutingTable;
import org.elasticsearch.cluster.routing.ShardRouting;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.util.set.Sets;
import org.elasticsearch.index.shard.ShardId;
import org.elasticsearch.rest.RestStatus;
import org.elasticsearch.transport.TransportResponse;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.Objects;
import java.util.Set;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.Consumer;
import java.util.function.Supplier;
import java.util.stream.Collectors;

public class ReplicationOperation<
            Request extends ReplicationRequest,
            ReplicaRequest extends ReplicationRequest,
            PrimaryResultT extends ReplicationOperation.PrimaryResult
        > {
    private final Logger logger;
    private final Request request;
    private final Supplier clusterStateSupplier;
    private final String opType;
    private final AtomicInteger totalShards = new AtomicInteger();
    /**
     * The number of pending sub-operations in this operation. This is incremented when the following operations start and decremented when
     * they complete:
     * 
     * The operation on the primary
     * The operation on each replica
     * Coordination of the operation as a whole. This prevents the operation from terminating early if we haven't started any replica
     * operations and the primary finishes.
     * 
     */
    private final AtomicInteger pendingActions = new AtomicInteger();
    private final AtomicInteger successfulShards = new AtomicInteger();
    private final boolean executeOnReplicas;
    private final Primary primary;
    private final Replicas replicasProxy;
    private final AtomicBoolean finished = new AtomicBoolean();
    protected final ActionListener resultListener;

    private volatile PrimaryResultT primaryResult = null;

    private final List shardReplicaFailures = Collections.synchronizedList(new ArrayList<>());

    public ReplicationOperation(Request request, Primary primary,
                                ActionListener listener,
                                boolean executeOnReplicas, Replicas replicas,
                                Supplier clusterStateSupplier, Logger logger, String opType) {
        this.executeOnReplicas = executeOnReplicas;
        this.replicasProxy = replicas;
        this.primary = primary;
        this.resultListener = listener;
        this.logger = logger;
        this.request = request;
        this.clusterStateSupplier = clusterStateSupplier;
        this.opType = opType;
    }

    public void execute() throws Exception {
        final String activeShardCountFailure = checkActiveShardCount();
        final ShardRouting primaryRouting = primary.routingEntry();
        final ShardId primaryId = primaryRouting.shardId();
        if (activeShardCountFailure != null) {
            finishAsFailed(new UnavailableShardsException(primaryId,
                "{} Timeout: [{}], request: [{}]", activeShardCountFailure, request.timeout(), request));
            return;
        }

        totalShards.incrementAndGet();
        pendingActions.incrementAndGet();
        primaryResult = primary.perform(request);
        final ReplicaRequest replicaRequest = primaryResult.replicaRequest();
        if (replicaRequest != null) {
            assert replicaRequest.primaryTerm() > 0 : "replicaRequest doesn't have a primary term";
            if (logger.isTraceEnabled()) {
                logger.trace("[{}] op [{}] completed on primary for request [{}]", primaryId, opType, request);
            }

            // we have to get a new state after successfully indexing into the primary in order to honour recovery semantics.
            // we have to make sure that every operation indexed into the primary after recovery start will also be replicated
            // to the recovery target. If we use an old cluster state, we may miss a relocation that has started since then.
            ClusterState clusterState = clusterStateSupplier.get();
            final List shards = getShards(primaryId, clusterState);
            Set inSyncAllocationIds = getInSyncAllocationIds(primaryId, clusterState);

            markUnavailableShardsAsStale(replicaRequest, inSyncAllocationIds, shards);

            performOnReplicas(replicaRequest, shards);
        }

        successfulShards.incrementAndGet();
        decPendingAndFinishIfNeeded();
    }

    private void markUnavailableShardsAsStale(ReplicaRequest replicaRequest, Set inSyncAllocationIds, List shards) {
        if (inSyncAllocationIds.isEmpty() == false && shards.isEmpty() == false) {
            Set availableAllocationIds = shards.stream()
                .map(ShardRouting::allocationId)
                .filter(Objects::nonNull)
                .map(AllocationId::getId)
                .collect(Collectors.toSet());

            // if inSyncAllocationIds contains allocation ids of shards that don't exist in RoutingTable, mark copies as stale
            for (String allocationId : Sets.difference(inSyncAllocationIds, availableAllocationIds)) {
                // mark copy as stale
                pendingActions.incrementAndGet();
                replicasProxy.markShardCopyAsStale(replicaRequest.shardId(), allocationId, replicaRequest.primaryTerm(),
                    ReplicationOperation.this::decPendingAndFinishIfNeeded,
                    ReplicationOperation.this::onPrimaryDemoted,
                    throwable -> decPendingAndFinishIfNeeded()
                );
            }
        }
    }

    private void performOnReplicas(ReplicaRequest replicaRequest, List shards) {
        final String localNodeId = primary.routingEntry().currentNodeId();
        // If the index gets deleted after primary operation, we skip replication
        for (final ShardRouting shard : shards) {
            if (executeOnReplicas == false || shard.unassigned()) {
                if (shard.primary() == false) {
                    totalShards.incrementAndGet();
                }
                continue;
            }

            if (shard.currentNodeId().equals(localNodeId) == false) {
                performOnReplica(shard, replicaRequest);
            }

            if (shard.relocating() && shard.relocatingNodeId().equals(localNodeId) == false) {
                performOnReplica(shard.getTargetRelocatingShard(), replicaRequest);
            }
        }
    }

    private void performOnReplica(final ShardRouting shard, final ReplicaRequest replicaRequest) {
        if (logger.isTraceEnabled()) {
            logger.trace("[{}] sending op [{}] to replica {} for request [{}]", shard.shardId(), opType, shard, replicaRequest);
        }

        totalShards.incrementAndGet();
        pendingActions.incrementAndGet();
        replicasProxy.performOn(shard, replicaRequest, new ActionListener() {
            @Override
            public void onResponse(TransportResponse.Empty empty) {
                successfulShards.incrementAndGet();
                decPendingAndFinishIfNeeded();
            }

            @Override
            public void onFailure(Exception replicaException) {
                logger.trace(
                    (org.apache.logging.log4j.util.Supplier) () -> new ParameterizedMessage(
                        "[{}] failure while performing [{}] on replica {}, request [{}]",
                        shard.shardId(),
                        opType,
                        shard,
                        replicaRequest),
                    replicaException);
                if (TransportActions.isShardNotAvailableException(replicaException)) {
                    decPendingAndFinishIfNeeded();
                } else {
                    RestStatus restStatus = ExceptionsHelper.status(replicaException);
                    shardReplicaFailures.add(new ReplicationResponse.ShardInfo.Failure(
                        shard.shardId(), shard.currentNodeId(), replicaException, restStatus, false));
                    String message = String.format(Locale.ROOT, "failed to perform %s on replica %s", opType, shard);
                    logger.warn(
                        (org.apache.logging.log4j.util.Supplier)
                            () -> new ParameterizedMessage("[{}] {}", shard.shardId(), message), replicaException);
                    replicasProxy.failShard(shard, replicaRequest.primaryTerm(), message, replicaException,
                        ReplicationOperation.this::decPendingAndFinishIfNeeded,
                        ReplicationOperation.this::onPrimaryDemoted,
                        throwable -> decPendingAndFinishIfNeeded()
                    );
                }
            }
        });
    }

    private void onPrimaryDemoted(Exception demotionFailure) {
        String primaryFail = String.format(Locale.ROOT,
            "primary shard [%s] was demoted while failing replica shard",
            primary.routingEntry());
        // we are no longer the primary, fail ourselves and start over
        primary.failShard(primaryFail, demotionFailure);
        finishAsFailed(new RetryOnPrimaryException(primary.routingEntry().shardId(), primaryFail, demotionFailure));
    }

    /**
     * Checks whether we can perform a write based on the required active shard count setting.
     * Returns **null* if OK to proceed, or a string describing the reason to stop
     */
    protected String checkActiveShardCount() {
        final ShardId shardId = primary.routingEntry().shardId();
        final String indexName = shardId.getIndexName();
        final ClusterState state = clusterStateSupplier.get();
        assert state != null : "replication operation must have access to the cluster state";
        final ActiveShardCount waitForActiveShards = request.waitForActiveShards();
        if (waitForActiveShards == ActiveShardCount.NONE) {
            return null;  // not waiting for any shards
        }
        IndexRoutingTable indexRoutingTable = state.getRoutingTable().index(indexName);
        if (indexRoutingTable == null) {
            logger.trace("[{}] index not found in the routing table", shardId);
            return "Index " + indexName + " not found in the routing table";
        }
        IndexShardRoutingTable shardRoutingTable = indexRoutingTable.shard(shardId.getId());
        if (shardRoutingTable == null) {
            logger.trace("[{}] shard not found in the routing table", shardId);
            return "Shard " + shardId + " not found in the routing table";
        }
        if (waitForActiveShards.enoughShardsActive(shardRoutingTable)) {
            return null;
        } else {
            final String resolvedShards = waitForActiveShards == ActiveShardCount.ALL ? Integer.toString(shardRoutingTable.shards().size())
                                              : waitForActiveShards.toString();
            logger.trace("[{}] not enough active copies to meet shard count of [{}] (have {}, needed {}), scheduling a retry. op [{}], " +
                         "request [{}]", shardId, waitForActiveShards, shardRoutingTable.activeShards().size(),
                         resolvedShards, opType, request);
            return "Not enough active copies to meet shard count of [" + waitForActiveShards + "] (have " +
                       shardRoutingTable.activeShards().size() + ", needed " + resolvedShards + ").";
        }
    }

    protected Set getInSyncAllocationIds(ShardId shardId, ClusterState clusterState) {
        IndexMetaData indexMetaData = clusterState.metaData().index(shardId.getIndex());
        if (indexMetaData != null) {
            return indexMetaData.inSyncAllocationIds(shardId.id());
        }
        return Collections.emptySet();
    }

    protected List getShards(ShardId shardId, ClusterState state) {
        // can be null if the index is deleted / closed on us..
        final IndexShardRoutingTable shardRoutingTable = state.getRoutingTable().shardRoutingTableOrNull(shardId);
        List shards = shardRoutingTable == null ? Collections.emptyList() : shardRoutingTable.shards();
        return shards;
    }

    private void decPendingAndFinishIfNeeded() {
        assert pendingActions.get() > 0;
        if (pendingActions.decrementAndGet() == 0) {
            finish();
        }
    }

    private void finish() {
        if (finished.compareAndSet(false, true)) {
            final ReplicationResponse.ShardInfo.Failure[] failuresArray;
            if (shardReplicaFailures.isEmpty()) {
                failuresArray = ReplicationResponse.EMPTY;
            } else {
                failuresArray = new ReplicationResponse.ShardInfo.Failure[shardReplicaFailures.size()];
                shardReplicaFailures.toArray(failuresArray);
            }
            primaryResult.setShardInfo(new ReplicationResponse.ShardInfo(
                    totalShards.get(),
                    successfulShards.get(),
                    failuresArray
                )
            );
            resultListener.onResponse(primaryResult);
        }
    }

    private void finishAsFailed(Exception exception) {
        if (finished.compareAndSet(false, true)) {
            resultListener.onFailure(exception);
        }
    }

    public interface Primary<
                Request extends ReplicationRequest,
                ReplicaRequest extends ReplicationRequest,
                PrimaryResultT extends PrimaryResult
            > {

        /**
         * routing entry for this primary
         */
        ShardRouting routingEntry();

        /**
         * fail the primary, typically due to the fact that the operation has learned the primary has been demoted by the master
         */
        void failShard(String message, Exception exception);

        /**
         * Performs the given request on this primary. Yes, this returns as soon as it can with the request for the replicas and calls a
         * listener when the primary request is completed. Yes, the primary request might complete before the method returns. Yes, it might
         * also complete after. Deal with it.
         *
         * @param request the request to perform
         * @return the request to send to the repicas
         */
        PrimaryResultT perform(Request request) throws Exception;

    }

    public interface Replicas> {

        /**
         * performs the the given request on the specified replica
         *
         * @param replica        {@link ShardRouting} of the shard this request should be executed on
         * @param replicaRequest operation to peform
         * @param listener       a callback to call once the operation has been complicated, either successfully or with an error.
         */
        void performOn(ShardRouting replica, ReplicaRequest replicaRequest, ActionListener listener);

        /**
         * Fail the specified shard, removing it from the current set of active shards
         * @param replica          shard to fail
         * @param primaryTerm      the primary term of the primary shard when requesting the failure
         * @param message          a (short) description of the reason
         * @param exception        the original exception which caused the ReplicationOperation to request the shard to be failed
         * @param onSuccess        a callback to call when the shard has been successfully removed from the active set.
         * @param onPrimaryDemoted a callback to call when the shard can not be failed because the current primary has been demoted
*                         by the master.
         * @param onIgnoredFailure a callback to call when failing a shard has failed, but it that failure can be safely ignored and the
         */
        void failShard(ShardRouting replica, long primaryTerm, String message, Exception exception, Runnable onSuccess,
                       Consumer onPrimaryDemoted, Consumer onIgnoredFailure);

        /**
         * Marks shard copy as stale, removing its allocation id from the set of in-sync allocation ids.
         *
         * @param shardId          shard id
         * @param allocationId     allocation id to remove from the set of in-sync allocation ids
         * @param primaryTerm      the primary term of the primary shard when requesting the failure
         * @param onSuccess        a callback to call when the allocation id has been successfully removed from the in-sync set.
         * @param onPrimaryDemoted a callback to call when the request failed because the current primary was already demoted
         *                         by the master.
         * @param onIgnoredFailure a callback to call when the request failed, but the failure can be safely ignored.
         */
        void markShardCopyAsStale(ShardId shardId, String allocationId, long primaryTerm, Runnable onSuccess,
                                  Consumer onPrimaryDemoted, Consumer onIgnoredFailure);
    }

    public static class RetryOnPrimaryException extends ElasticsearchException {
        public RetryOnPrimaryException(ShardId shardId, String msg) {
            this(shardId, msg, null);
        }

        public RetryOnPrimaryException(ShardId shardId, String msg, Throwable cause) {
            super(msg, cause);
            setShard(shardId);
        }

        public RetryOnPrimaryException(StreamInput in) throws IOException {
            super(in);
        }
    }

    public interface PrimaryResult> {

        /**
         * @return null if no operation needs to be sent to a replica
         * (for example when the operation failed on the primary due to a parsing exception)
         */
        @Nullable R replicaRequest();

        void setShardInfo(ReplicationResponse.ShardInfo shardInfo);
    }
}