org.elasticsearch.action.support.replication.ReplicationOperation Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of elasticsearch Show documentation
Show all versions of elasticsearch Show documentation
Elasticsearch - Open Source, Distributed, RESTful Search Engine
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.action.support.replication;
import org.apache.logging.log4j.Logger;
import org.apache.logging.log4j.message.ParameterizedMessage;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.ExceptionsHelper;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.UnavailableShardsException;
import org.elasticsearch.action.support.ActiveShardCount;
import org.elasticsearch.action.support.TransportActions;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.cluster.routing.AllocationId;
import org.elasticsearch.cluster.routing.IndexRoutingTable;
import org.elasticsearch.cluster.routing.IndexShardRoutingTable;
import org.elasticsearch.cluster.routing.ShardRouting;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.util.set.Sets;
import org.elasticsearch.index.shard.ShardId;
import org.elasticsearch.rest.RestStatus;
import org.elasticsearch.transport.TransportResponse;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.Objects;
import java.util.Set;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.Consumer;
import java.util.function.Supplier;
import java.util.stream.Collectors;
public class ReplicationOperation<
Request extends ReplicationRequest,
ReplicaRequest extends ReplicationRequest,
PrimaryResultT extends ReplicationOperation.PrimaryResult
> {
private final Logger logger;
private final Request request;
private final Supplier clusterStateSupplier;
private final String opType;
private final AtomicInteger totalShards = new AtomicInteger();
/**
* The number of pending sub-operations in this operation. This is incremented when the following operations start and decremented when
* they complete:
*
* - The operation on the primary
* - The operation on each replica
* - Coordination of the operation as a whole. This prevents the operation from terminating early if we haven't started any replica
* operations and the primary finishes.
*
*/
private final AtomicInteger pendingActions = new AtomicInteger();
private final AtomicInteger successfulShards = new AtomicInteger();
private final boolean executeOnReplicas;
private final Primary primary;
private final Replicas replicasProxy;
private final AtomicBoolean finished = new AtomicBoolean();
protected final ActionListener resultListener;
private volatile PrimaryResultT primaryResult = null;
private final List shardReplicaFailures = Collections.synchronizedList(new ArrayList<>());
public ReplicationOperation(Request request, Primary primary,
ActionListener listener,
boolean executeOnReplicas, Replicas replicas,
Supplier clusterStateSupplier, Logger logger, String opType) {
this.executeOnReplicas = executeOnReplicas;
this.replicasProxy = replicas;
this.primary = primary;
this.resultListener = listener;
this.logger = logger;
this.request = request;
this.clusterStateSupplier = clusterStateSupplier;
this.opType = opType;
}
public void execute() throws Exception {
final String activeShardCountFailure = checkActiveShardCount();
final ShardRouting primaryRouting = primary.routingEntry();
final ShardId primaryId = primaryRouting.shardId();
if (activeShardCountFailure != null) {
finishAsFailed(new UnavailableShardsException(primaryId,
"{} Timeout: [{}], request: [{}]", activeShardCountFailure, request.timeout(), request));
return;
}
totalShards.incrementAndGet();
pendingActions.incrementAndGet();
primaryResult = primary.perform(request);
final ReplicaRequest replicaRequest = primaryResult.replicaRequest();
if (replicaRequest != null) {
assert replicaRequest.primaryTerm() > 0 : "replicaRequest doesn't have a primary term";
if (logger.isTraceEnabled()) {
logger.trace("[{}] op [{}] completed on primary for request [{}]", primaryId, opType, request);
}
// we have to get a new state after successfully indexing into the primary in order to honour recovery semantics.
// we have to make sure that every operation indexed into the primary after recovery start will also be replicated
// to the recovery target. If we use an old cluster state, we may miss a relocation that has started since then.
ClusterState clusterState = clusterStateSupplier.get();
final List shards = getShards(primaryId, clusterState);
Set inSyncAllocationIds = getInSyncAllocationIds(primaryId, clusterState);
markUnavailableShardsAsStale(replicaRequest, inSyncAllocationIds, shards);
performOnReplicas(replicaRequest, shards);
}
successfulShards.incrementAndGet();
decPendingAndFinishIfNeeded();
}
private void markUnavailableShardsAsStale(ReplicaRequest replicaRequest, Set inSyncAllocationIds, List shards) {
if (inSyncAllocationIds.isEmpty() == false && shards.isEmpty() == false) {
Set availableAllocationIds = shards.stream()
.map(ShardRouting::allocationId)
.filter(Objects::nonNull)
.map(AllocationId::getId)
.collect(Collectors.toSet());
// if inSyncAllocationIds contains allocation ids of shards that don't exist in RoutingTable, mark copies as stale
for (String allocationId : Sets.difference(inSyncAllocationIds, availableAllocationIds)) {
// mark copy as stale
pendingActions.incrementAndGet();
replicasProxy.markShardCopyAsStale(replicaRequest.shardId(), allocationId, replicaRequest.primaryTerm(),
ReplicationOperation.this::decPendingAndFinishIfNeeded,
ReplicationOperation.this::onPrimaryDemoted,
throwable -> decPendingAndFinishIfNeeded()
);
}
}
}
private void performOnReplicas(ReplicaRequest replicaRequest, List shards) {
final String localNodeId = primary.routingEntry().currentNodeId();
// If the index gets deleted after primary operation, we skip replication
for (final ShardRouting shard : shards) {
if (executeOnReplicas == false || shard.unassigned()) {
if (shard.primary() == false) {
totalShards.incrementAndGet();
}
continue;
}
if (shard.currentNodeId().equals(localNodeId) == false) {
performOnReplica(shard, replicaRequest);
}
if (shard.relocating() && shard.relocatingNodeId().equals(localNodeId) == false) {
performOnReplica(shard.getTargetRelocatingShard(), replicaRequest);
}
}
}
private void performOnReplica(final ShardRouting shard, final ReplicaRequest replicaRequest) {
if (logger.isTraceEnabled()) {
logger.trace("[{}] sending op [{}] to replica {} for request [{}]", shard.shardId(), opType, shard, replicaRequest);
}
totalShards.incrementAndGet();
pendingActions.incrementAndGet();
replicasProxy.performOn(shard, replicaRequest, new ActionListener() {
@Override
public void onResponse(TransportResponse.Empty empty) {
successfulShards.incrementAndGet();
decPendingAndFinishIfNeeded();
}
@Override
public void onFailure(Exception replicaException) {
logger.trace(
(org.apache.logging.log4j.util.Supplier>) () -> new ParameterizedMessage(
"[{}] failure while performing [{}] on replica {}, request [{}]",
shard.shardId(),
opType,
shard,
replicaRequest),
replicaException);
if (TransportActions.isShardNotAvailableException(replicaException)) {
decPendingAndFinishIfNeeded();
} else {
RestStatus restStatus = ExceptionsHelper.status(replicaException);
shardReplicaFailures.add(new ReplicationResponse.ShardInfo.Failure(
shard.shardId(), shard.currentNodeId(), replicaException, restStatus, false));
String message = String.format(Locale.ROOT, "failed to perform %s on replica %s", opType, shard);
logger.warn(
(org.apache.logging.log4j.util.Supplier>)
() -> new ParameterizedMessage("[{}] {}", shard.shardId(), message), replicaException);
replicasProxy.failShard(shard, replicaRequest.primaryTerm(), message, replicaException,
ReplicationOperation.this::decPendingAndFinishIfNeeded,
ReplicationOperation.this::onPrimaryDemoted,
throwable -> decPendingAndFinishIfNeeded()
);
}
}
});
}
private void onPrimaryDemoted(Exception demotionFailure) {
String primaryFail = String.format(Locale.ROOT,
"primary shard [%s] was demoted while failing replica shard",
primary.routingEntry());
// we are no longer the primary, fail ourselves and start over
primary.failShard(primaryFail, demotionFailure);
finishAsFailed(new RetryOnPrimaryException(primary.routingEntry().shardId(), primaryFail, demotionFailure));
}
/**
* Checks whether we can perform a write based on the required active shard count setting.
* Returns **null* if OK to proceed, or a string describing the reason to stop
*/
protected String checkActiveShardCount() {
final ShardId shardId = primary.routingEntry().shardId();
final String indexName = shardId.getIndexName();
final ClusterState state = clusterStateSupplier.get();
assert state != null : "replication operation must have access to the cluster state";
final ActiveShardCount waitForActiveShards = request.waitForActiveShards();
if (waitForActiveShards == ActiveShardCount.NONE) {
return null; // not waiting for any shards
}
IndexRoutingTable indexRoutingTable = state.getRoutingTable().index(indexName);
if (indexRoutingTable == null) {
logger.trace("[{}] index not found in the routing table", shardId);
return "Index " + indexName + " not found in the routing table";
}
IndexShardRoutingTable shardRoutingTable = indexRoutingTable.shard(shardId.getId());
if (shardRoutingTable == null) {
logger.trace("[{}] shard not found in the routing table", shardId);
return "Shard " + shardId + " not found in the routing table";
}
if (waitForActiveShards.enoughShardsActive(shardRoutingTable)) {
return null;
} else {
final String resolvedShards = waitForActiveShards == ActiveShardCount.ALL ? Integer.toString(shardRoutingTable.shards().size())
: waitForActiveShards.toString();
logger.trace("[{}] not enough active copies to meet shard count of [{}] (have {}, needed {}), scheduling a retry. op [{}], " +
"request [{}]", shardId, waitForActiveShards, shardRoutingTable.activeShards().size(),
resolvedShards, opType, request);
return "Not enough active copies to meet shard count of [" + waitForActiveShards + "] (have " +
shardRoutingTable.activeShards().size() + ", needed " + resolvedShards + ").";
}
}
protected Set getInSyncAllocationIds(ShardId shardId, ClusterState clusterState) {
IndexMetaData indexMetaData = clusterState.metaData().index(shardId.getIndex());
if (indexMetaData != null) {
return indexMetaData.inSyncAllocationIds(shardId.id());
}
return Collections.emptySet();
}
protected List getShards(ShardId shardId, ClusterState state) {
// can be null if the index is deleted / closed on us..
final IndexShardRoutingTable shardRoutingTable = state.getRoutingTable().shardRoutingTableOrNull(shardId);
List shards = shardRoutingTable == null ? Collections.emptyList() : shardRoutingTable.shards();
return shards;
}
private void decPendingAndFinishIfNeeded() {
assert pendingActions.get() > 0;
if (pendingActions.decrementAndGet() == 0) {
finish();
}
}
private void finish() {
if (finished.compareAndSet(false, true)) {
final ReplicationResponse.ShardInfo.Failure[] failuresArray;
if (shardReplicaFailures.isEmpty()) {
failuresArray = ReplicationResponse.EMPTY;
} else {
failuresArray = new ReplicationResponse.ShardInfo.Failure[shardReplicaFailures.size()];
shardReplicaFailures.toArray(failuresArray);
}
primaryResult.setShardInfo(new ReplicationResponse.ShardInfo(
totalShards.get(),
successfulShards.get(),
failuresArray
)
);
resultListener.onResponse(primaryResult);
}
}
private void finishAsFailed(Exception exception) {
if (finished.compareAndSet(false, true)) {
resultListener.onFailure(exception);
}
}
public interface Primary<
Request extends ReplicationRequest,
ReplicaRequest extends ReplicationRequest,
PrimaryResultT extends PrimaryResult
> {
/**
* routing entry for this primary
*/
ShardRouting routingEntry();
/**
* fail the primary, typically due to the fact that the operation has learned the primary has been demoted by the master
*/
void failShard(String message, Exception exception);
/**
* Performs the given request on this primary. Yes, this returns as soon as it can with the request for the replicas and calls a
* listener when the primary request is completed. Yes, the primary request might complete before the method returns. Yes, it might
* also complete after. Deal with it.
*
* @param request the request to perform
* @return the request to send to the repicas
*/
PrimaryResultT perform(Request request) throws Exception;
}
public interface Replicas> {
/**
* performs the the given request on the specified replica
*
* @param replica {@link ShardRouting} of the shard this request should be executed on
* @param replicaRequest operation to peform
* @param listener a callback to call once the operation has been complicated, either successfully or with an error.
*/
void performOn(ShardRouting replica, ReplicaRequest replicaRequest, ActionListener listener);
/**
* Fail the specified shard, removing it from the current set of active shards
* @param replica shard to fail
* @param primaryTerm the primary term of the primary shard when requesting the failure
* @param message a (short) description of the reason
* @param exception the original exception which caused the ReplicationOperation to request the shard to be failed
* @param onSuccess a callback to call when the shard has been successfully removed from the active set.
* @param onPrimaryDemoted a callback to call when the shard can not be failed because the current primary has been demoted
* by the master.
* @param onIgnoredFailure a callback to call when failing a shard has failed, but it that failure can be safely ignored and the
*/
void failShard(ShardRouting replica, long primaryTerm, String message, Exception exception, Runnable onSuccess,
Consumer onPrimaryDemoted, Consumer onIgnoredFailure);
/**
* Marks shard copy as stale, removing its allocation id from the set of in-sync allocation ids.
*
* @param shardId shard id
* @param allocationId allocation id to remove from the set of in-sync allocation ids
* @param primaryTerm the primary term of the primary shard when requesting the failure
* @param onSuccess a callback to call when the allocation id has been successfully removed from the in-sync set.
* @param onPrimaryDemoted a callback to call when the request failed because the current primary was already demoted
* by the master.
* @param onIgnoredFailure a callback to call when the request failed, but the failure can be safely ignored.
*/
void markShardCopyAsStale(ShardId shardId, String allocationId, long primaryTerm, Runnable onSuccess,
Consumer onPrimaryDemoted, Consumer onIgnoredFailure);
}
public static class RetryOnPrimaryException extends ElasticsearchException {
public RetryOnPrimaryException(ShardId shardId, String msg) {
this(shardId, msg, null);
}
public RetryOnPrimaryException(ShardId shardId, String msg, Throwable cause) {
super(msg, cause);
setShard(shardId);
}
public RetryOnPrimaryException(StreamInput in) throws IOException {
super(in);
}
}
public interface PrimaryResult> {
/**
* @return null if no operation needs to be sent to a replica
* (for example when the operation failed on the primary due to a parsing exception)
*/
@Nullable R replicaRequest();
void setShardInfo(ReplicationResponse.ShardInfo shardInfo);
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy