org.opensearch.action.support.replication.TransportWriteAction Maven / Gradle / Ivy

/*
 * SPDX-License-Identifier: Apache-2.0
 *
 * The OpenSearch Contributors require contributions made to
 * this file be licensed under the Apache-2.0 license or a
 * compatible open source license.
 */

/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*
 * Modifications Copyright OpenSearch Contributors. See
 * GitHub history for details.
 */

package org.opensearch.action.support.replication;

import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.message.ParameterizedMessage;
import org.opensearch.action.ActionListener;
import org.opensearch.action.ActionRunnable;
import org.opensearch.action.support.ActionFilters;
import org.opensearch.action.support.TransportActions;
import org.opensearch.action.support.WriteRequest;
import org.opensearch.action.support.WriteResponse;
import org.opensearch.cluster.action.shard.ShardStateAction;
import org.opensearch.cluster.block.ClusterBlockLevel;
import org.opensearch.cluster.metadata.IndexAbstraction;
import org.opensearch.cluster.routing.ShardRouting;
import org.opensearch.cluster.service.ClusterService;
import org.opensearch.common.Nullable;
import org.opensearch.common.io.stream.Writeable;
import org.opensearch.common.lease.Releasable;
import org.opensearch.common.settings.Settings;
import org.opensearch.index.IndexingPressureService;
import org.opensearch.index.engine.Engine;
import org.opensearch.index.mapper.MapperParsingException;
import org.opensearch.index.shard.IndexShard;
import org.opensearch.index.shard.ShardId;
import org.opensearch.index.translog.Translog;
import org.opensearch.index.translog.Translog.Location;
import org.opensearch.indices.IndicesService;
import org.opensearch.indices.SystemIndices;
import org.opensearch.threadpool.ThreadPool;
import org.opensearch.transport.TransportService;

import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Function;

/**
 * Base class for transport actions that modify data in some shard like index, delete, and shardBulk.
 * Allows performing async actions (e.g. refresh) after performing write operations on primary and replica shards
 */
public abstract class TransportWriteAction<
    Request extends ReplicatedWriteRequest,
    ReplicaRequest extends ReplicatedWriteRequest,
    Response extends ReplicationResponse & WriteResponse> extends TransportReplicationAction {

    protected final IndexingPressureService indexingPressureService;
    protected final SystemIndices systemIndices;

    private final Function executorFunction;

    protected TransportWriteAction(
        Settings settings,
        String actionName,
        TransportService transportService,
        ClusterService clusterService,
        IndicesService indicesService,
        ThreadPool threadPool,
        ShardStateAction shardStateAction,
        ActionFilters actionFilters,
        Writeable.Reader request,
        Writeable.Reader replicaRequest,
        Function executorFunction,
        boolean forceExecutionOnPrimary,
        IndexingPressureService indexingPressureService,
        SystemIndices systemIndices
    ) {
        // We pass ThreadPool.Names.SAME to the super class as we control the dispatching to the
        // ThreadPool.Names.WRITE/ThreadPool.Names.SYSTEM_WRITE thread pools in this class.
        super(
            settings,
            actionName,
            transportService,
            clusterService,
            indicesService,
            threadPool,
            shardStateAction,
            actionFilters,
            request,
            replicaRequest,
            ThreadPool.Names.SAME,
            true,
            forceExecutionOnPrimary
        );
        this.executorFunction = executorFunction;
        this.indexingPressureService = indexingPressureService;
        this.systemIndices = systemIndices;
    }

    protected String executor(IndexShard shard) {
        return executorFunction.apply(shard);
    }

    @Override
    protected Releasable checkOperationLimits(Request request) {
        return indexingPressureService.markPrimaryOperationStarted(request.shardId, primaryOperationSize(request), force(request));
    }

    protected boolean force(ReplicatedWriteRequest request) {
        return forceExecutionOnPrimary || isSystemShard(request.shardId);
    }

    protected boolean isSystemShard(ShardId shardId) {
        final IndexAbstraction abstraction = clusterService.state().metadata().getIndicesLookup().get(shardId.getIndexName());
        return abstraction != null ? abstraction.isSystem() : systemIndices.isSystemIndex(shardId.getIndexName());
    }

    @Override
    protected Releasable checkPrimaryLimits(Request request, boolean rerouteWasLocal, boolean localRerouteInitiatedByNodeClient) {
        if (rerouteWasLocal) {
            // If this primary request was received from a local reroute initiated by the node client, we
            // must mark a new primary operation local to the coordinating node.
            if (localRerouteInitiatedByNodeClient) {
                return indexingPressureService.markPrimaryOperationLocalToCoordinatingNodeStarted(
                    request.shardId,
                    primaryOperationSize(request)
                );
            } else {
                return () -> {};
            }
        } else {
            // If this primary request was received directly from the network, we must mark a new primary
            // operation. This happens if the write action skips the reroute step (ex: rsync) or during
            // primary delegation, after the primary relocation hand-off.
            return indexingPressureService.markPrimaryOperationStarted(request.shardId, primaryOperationSize(request), force(request));
        }
    }

    protected long primaryOperationSize(Request request) {
        return 0;
    }

    @Override
    protected Releasable checkReplicaLimits(ReplicaRequest request) {
        return indexingPressureService.markReplicaOperationStarted(request.shardId, replicaOperationSize(request), force(request));
    }

    protected long replicaOperationSize(ReplicaRequest request) {
        return 0;
    }

    /** Syncs operation result to the translog or throws a shard not available failure */
    protected static Location syncOperationResultOrThrow(final Engine.Result operationResult, final Location currentLocation)
        throws Exception {
        final Location location;
        if (operationResult.getFailure() != null) {
            // check if any transient write operation failures should be bubbled up
            Exception failure = operationResult.getFailure();
            assert failure instanceof MapperParsingException : "expected mapper parsing failures. got " + failure;
            throw failure;
        } else {
            location = locationToSync(currentLocation, operationResult.getTranslogLocation());
        }
        return location;
    }

    public static Location locationToSync(Location current, Location next) {
        /* here we are moving forward in the translog with each operation. Under the hood this might
         * cross translog files which is ok since from the user perspective the translog is like a
         * tape where only the highest location needs to be fsynced in order to sync all previous
         * locations even though they are not in the same file. When the translog rolls over files
         * the previous file is fsynced on after closing if needed.*/
        assert next != null : "next operation can't be null";
        assert current == null || current.compareTo(next) < 0 : "translog locations are not increasing";
        return next;
    }

    @Override
    protected ReplicationOperation.Replicas newReplicasProxy() {
        return new WriteActionReplicasProxy();
    }

    /**
     * Called on the primary with a reference to the primary {@linkplain IndexShard} to modify.
     *
     * @param listener listener for the result of the operation on primary, including current translog location and operation response
     * and failure async refresh is performed on the primary shard according to the Request refresh policy
     */
    @Override
    protected void shardOperationOnPrimary(
        Request request,
        IndexShard primary,
        ActionListener> listener
    ) {
        final String executor = executorFunction.apply(primary);
        threadPool.executor(executor).execute(new ActionRunnable>(listener) {
            @Override
            protected void doRun() {
                dispatchedShardOperationOnPrimary(request, primary, listener);
            }

            @Override
            public boolean isForceExecution() {
                return force(request);
            }
        });
    }

    protected abstract void dispatchedShardOperationOnPrimary(
        Request request,
        IndexShard primary,
        ActionListener> listener
    );

    /**
     * Called once per replica with a reference to the replica {@linkplain IndexShard} to modify.
     *
     * @param listener listener for the result of the operation on replica, including current translog location and operation
     * response and failure async refresh is performed on the replica shard according to the ReplicaRequest
     * refresh policy
     */
    @Override
    protected void shardOperationOnReplica(ReplicaRequest request, IndexShard replica, ActionListener listener) {
        threadPool.executor(executorFunction.apply(replica)).execute(new ActionRunnable(listener) {
            @Override
            protected void doRun() {
                dispatchedShardOperationOnReplica(request, replica, listener);
            }

            @Override
            public boolean isForceExecution() {
                return true;
            }
        });
    }

    protected abstract void dispatchedShardOperationOnReplica(
        ReplicaRequest request,
        IndexShard replica,
        ActionListener listener
    );

    /**
     * Result of taking the action on the primary.
     *
     * NOTE: public for testing
     */
    public static class WritePrimaryResult<
        ReplicaRequest extends ReplicatedWriteRequest,
        Response extends ReplicationResponse & WriteResponse> extends PrimaryResult {
        public final Location location;
        public final IndexShard primary;
        private final Logger logger;

        public WritePrimaryResult(
            ReplicaRequest request,
            @Nullable Response finalResponse,
            @Nullable Location location,
            @Nullable Exception operationFailure,
            IndexShard primary,
            Logger logger
        ) {
            super(request, finalResponse, operationFailure);
            this.location = location;
            this.primary = primary;
            this.logger = logger;
            assert location == null || operationFailure == null : "expected either failure to be null or translog location to be null, "
                + "but found: ["
                + location
                + "] translog location and ["
                + operationFailure
                + "] failure";
        }

        @Override
        public void runPostReplicationActions(ActionListener listener) {
            if (finalFailure != null) {
                listener.onFailure(finalFailure);
            } else {
                /*
                 * We call this after replication because this might wait for a refresh and that can take a while.
                 * This way we wait for the refresh in parallel on the primary and on the replica.
                 */
                new AsyncAfterWriteAction(primary, replicaRequest, location, new RespondingWriteResult() {
                    @Override
                    public void onSuccess(boolean forcedRefresh) {
                        finalResponseIfSuccessful.setForcedRefresh(forcedRefresh);
                        listener.onResponse(null);
                    }

                    @Override
                    public void onFailure(Exception ex) {
                        listener.onFailure(ex);
                    }
                }, logger).run();
            }
        }
    }

    /**
     * Result of taking the action on the replica.
     */
    public static class WriteReplicaResult> extends ReplicaResult {
        public final Location location;
        private final ReplicaRequest request;
        private final IndexShard replica;
        private final Logger logger;

        public WriteReplicaResult(
            ReplicaRequest request,
            @Nullable Location location,
            @Nullable Exception operationFailure,
            IndexShard replica,
            Logger logger
        ) {
            super(operationFailure);
            this.location = location;
            this.request = request;
            this.replica = replica;
            this.logger = logger;
        }

        @Override
        public void runPostReplicaActions(ActionListener listener) {
            if (finalFailure != null) {
                listener.onFailure(finalFailure);
            } else {
                new AsyncAfterWriteAction(replica, request, location, new RespondingWriteResult() {
                    @Override
                    public void onSuccess(boolean forcedRefresh) {
                        listener.onResponse(null);
                    }

                    @Override
                    public void onFailure(Exception ex) {
                        listener.onFailure(ex);
                    }
                }, logger).run();
            }
        }
    }

    @Override
    protected ClusterBlockLevel globalBlockLevel() {
        return ClusterBlockLevel.WRITE;
    }

    @Override
    public ClusterBlockLevel indexBlockLevel() {
        return ClusterBlockLevel.WRITE;
    }

    /**
     * callback used by {@link AsyncAfterWriteAction} to notify that all post
     * process actions have been executed
     */
    interface RespondingWriteResult {
        /**
         * Called on successful processing of all post write actions
         * @param forcedRefresh true iff this write has caused a refresh
         */
        void onSuccess(boolean forcedRefresh);

        /**
         * Called on failure if a post action failed.
         */
        void onFailure(Exception ex);
    }

    /**
     * This class encapsulates post write actions like async waits for
     * translog syncs or waiting for a refresh to happen making the write operation
     * visible.
     */
    static final class AsyncAfterWriteAction {
        private final Location location;
        private final boolean waitUntilRefresh;
        private final boolean sync;
        private final AtomicInteger pendingOps = new AtomicInteger(1);
        private final AtomicBoolean refreshed = new AtomicBoolean(false);
        private final AtomicReference syncFailure = new AtomicReference<>(null);
        private final RespondingWriteResult respond;
        private final IndexShard indexShard;
        private final WriteRequest request;
        private final Logger logger;

        AsyncAfterWriteAction(
            final IndexShard indexShard,
            final WriteRequest request,
            @Nullable final Translog.Location location,
            final RespondingWriteResult respond,
            final Logger logger
        ) {
            this.indexShard = indexShard;
            this.request = request;
            boolean waitUntilRefresh = false;
            switch (request.getRefreshPolicy()) {
                case IMMEDIATE:
                    indexShard.refresh("refresh_flag_index");
                    refreshed.set(true);
                    break;
                case WAIT_UNTIL:
                    if (location != null) {
                        waitUntilRefresh = true;
                        pendingOps.incrementAndGet();
                    }
                    break;
                case NONE:
                    break;
                default:
                    throw new IllegalArgumentException("unknown refresh policy: " + request.getRefreshPolicy());
            }
            this.waitUntilRefresh = waitUntilRefresh;
            this.respond = respond;
            this.location = location;
            if ((sync = indexShard.getTranslogDurability() == Translog.Durability.REQUEST && location != null)) {
                pendingOps.incrementAndGet();
            }
            this.logger = logger;
            assert pendingOps.get() >= 0 && pendingOps.get() <= 3 : "pendingOpts was: " + pendingOps.get();
        }

        /** calls the response listener if all pending operations have returned otherwise it just decrements the pending opts counter.*/
        private void maybeFinish() {
            final int numPending = pendingOps.decrementAndGet();
            if (numPending == 0) {
                if (syncFailure.get() != null) {
                    respond.onFailure(syncFailure.get());
                } else {
                    respond.onSuccess(refreshed.get());
                }
            }
            assert numPending >= 0 && numPending <= 2 : "numPending must either 2, 1 or 0 but was " + numPending;
        }

        void run() {
            /*
             * We either respond immediately (i.e., if we do not fsync per request or wait for
             * refresh), or we there are past async operations and we wait for them to return to
             * respond.
             */
            indexShard.afterWriteOperation();
            // decrement pending by one, if there is nothing else to do we just respond with success
            maybeFinish();
            if (waitUntilRefresh) {
                assert pendingOps.get() > 0;
                indexShard.addRefreshListener(location, forcedRefresh -> {
                    if (forcedRefresh) {
                        logger.warn("block until refresh ran out of slots and forced a refresh: [{}]", request);
                    }
                    refreshed.set(forcedRefresh);
                    maybeFinish();
                });
            }
            if (sync) {
                assert pendingOps.get() > 0;
                indexShard.sync(location, (ex) -> {
                    syncFailure.set(ex);
                    maybeFinish();
                });
            }
        }
    }

    /**
     * A proxy for write operations that need to be performed on the
     * replicas, where a failure to execute the operation should fail
     * the replica shard and/or mark the replica as stale.
     *
     * This extends {@code TransportReplicationAction.ReplicasProxy} to do the
     * failing and stale-ing.
     */
    class WriteActionReplicasProxy extends ReplicasProxy {

        @Override
        public void failShardIfNeeded(
            ShardRouting replica,
            long primaryTerm,
            String message,
            Exception exception,
            ActionListener listener
        ) {
            if (TransportActions.isShardNotAvailableException(exception) == false) {
                logger.warn(new ParameterizedMessage("[{}] {}", replica.shardId(), message), exception);
            }
            shardStateAction.remoteShardFailed(
                replica.shardId(),
                replica.allocationId().getId(),
                primaryTerm,
                true,
                message,
                exception,
                listener
            );
        }

        @Override
        public void markShardCopyAsStaleIfNeeded(ShardId shardId, String allocationId, long primaryTerm, ActionListener listener) {
            shardStateAction.remoteShardFailed(shardId, allocationId, primaryTerm, true, "mark copy as stale", null, listener);
        }
    }
}