package org.elasticsearch.indices.recovery;

import com.google.common.collect.Sets;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.IndexOutput;
import org.elasticsearch.ExceptionsHelper;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.StopWatch;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.component.AbstractComponent;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.ByteSizeValue;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.common.util.concurrent.ConcurrentCollections;
import org.elasticsearch.common.util.concurrent.ConcurrentMapLong;
import org.elasticsearch.index.IndexShardMissingException;
import org.elasticsearch.index.engine.RecoveryEngineException;
import org.elasticsearch.index.shard.*;
import org.elasticsearch.index.shard.service.IndexShard;
import org.elasticsearch.index.shard.service.InternalIndexShard;
import org.elasticsearch.index.store.Store;
import org.elasticsearch.index.translog.Translog;
import org.elasticsearch.indices.IndexMissingException;
import org.elasticsearch.indices.IndicesLifecycle;
import org.elasticsearch.indices.IndicesService;
import org.elasticsearch.threadpool.ThreadPool;
import org.elasticsearch.transport.*;

import java.io.IOException;
import java.util.Collections;
import java.util.Map;
import java.util.Set;

import static org.elasticsearch.common.unit.TimeValue.timeValueMillis;

 * The recovery target handles recoveries of peer shards of the shard+node to recover to.


Note, it can be safely assumed that there will only be a single recovery per shard (index+id) and * not several of them (since we don't allocate several shard replicas to the same node). */ public class RecoveryTarget extends AbstractComponent { public static class Actions { public static final String FILES_INFO = "index/shard/recovery/filesInfo"; public static final String FILE_CHUNK = "index/shard/recovery/fileChunk"; public static final String CLEAN_FILES = "index/shard/recovery/cleanFiles"; public static final String TRANSLOG_OPS = "index/shard/recovery/translogOps"; public static final String PREPARE_TRANSLOG = "index/shard/recovery/prepareTranslog"; public static final String FINALIZE = "index/shard/recovery/finalize"; } private final ThreadPool threadPool; private final TransportService transportService; private final IndicesService indicesService; private final RecoverySettings recoverySettings; private final ConcurrentMapLong onGoingRecoveries = ConcurrentCollections.newConcurrentMapLong(); @Inject public RecoveryTarget(Settings settings, ThreadPool threadPool, TransportService transportService, IndicesService indicesService, IndicesLifecycle indicesLifecycle, RecoverySettings recoverySettings) { super(settings); this.threadPool = threadPool; this.transportService = transportService; this.indicesService = indicesService; this.recoverySettings = recoverySettings; transportService.registerHandler(Actions.FILES_INFO, new FilesInfoRequestHandler()); transportService.registerHandler(Actions.FILE_CHUNK, new FileChunkTransportRequestHandler()); transportService.registerHandler(Actions.CLEAN_FILES, new CleanFilesRequestHandler()); transportService.registerHandler(Actions.PREPARE_TRANSLOG, new PrepareForTranslogOperationsRequestHandler()); transportService.registerHandler(Actions.TRANSLOG_OPS, new TranslogOperationsRequestHandler()); transportService.registerHandler(Actions.FINALIZE, new FinalizeRecoveryRequestHandler()); indicesLifecycle.addListener(new 
IndicesLifecycle.Listener() { @Override public void beforeIndexShardClosed(ShardId shardId, @Nullable IndexShard indexShard) { if (indexShard != null) { removeAndCleanOnGoingRecovery(findRecoveryByShard(indexShard)); } } }); } public RecoveryStatus peerRecoveryStatus(ShardId shardId) { RecoveryStatus peerRecoveryStatus = findRecoveryByShardId(shardId); if (peerRecoveryStatus == null) { return null; } // update how long it takes if we are still recovering... if (peerRecoveryStatus.startTime > 0 && peerRecoveryStatus.stage != RecoveryStatus.Stage.DONE) { peerRecoveryStatus.time = System.currentTimeMillis() - peerRecoveryStatus.startTime; } return peerRecoveryStatus; } public void cancelRecovery(IndexShard indexShard) { RecoveryStatus recoveryStatus = findRecoveryByShard(indexShard); // it might be if the recovery source got canceled first if (recoveryStatus == null) { return; } if (recoveryStatus.sentCanceledToSource) { return; } recoveryStatus.canceled = true; if (recoveryStatus.recoveryThread != null) { recoveryStatus.recoveryThread.interrupt(); } long time = System.currentTimeMillis(); // give it a grace period of actually getting the sent ack part while (!recoveryStatus.sentCanceledToSource) { try { Thread.sleep(100); } catch (InterruptedException e) { // ignore } if (System.currentTimeMillis() - time > 10000) { break; } } removeAndCleanOnGoingRecovery(recoveryStatus); } public void startRecovery(final StartRecoveryRequest request, final InternalIndexShard indexShard, final RecoveryListener listener) { try { indexShard.recovering("from " + request.sourceNode()); } catch (IllegalIndexShardStateException e) { // that's fine, since we might be called concurrently, just ignore this, we are already recovering listener.onIgnoreRecovery(false, "already in recovering process, " + e.getMessage()); return; } threadPool.generic().execute(new Runnable() { @Override public void run() { // create a new recovery status, and process... 
RecoveryStatus recoveryStatus = new RecoveryStatus(request.recoveryId(), indexShard); onGoingRecoveries.put(recoveryStatus.recoveryId, recoveryStatus); doRecovery(request, recoveryStatus, listener); } }); } public void retryRecovery(final StartRecoveryRequest request, final RecoveryStatus status, final RecoveryListener listener) { threadPool.generic().execute(new Runnable() { @Override public void run() { doRecovery(request, status, listener); } }); } private void doRecovery(final StartRecoveryRequest request, final RecoveryStatus recoveryStatus, final RecoveryListener listener) { if (request.sourceNode() == null) { listener.onIgnoreRecovery(false, "No node to recover from, retry on next cluster state update"); return; } final InternalIndexShard shard = recoveryStatus.indexShard; if (shard == null) { listener.onIgnoreRecovery(false, "shard missing locally, stop recovery"); return; } if (shard.state() == IndexShardState.CLOSED) { listener.onIgnoreRecovery(false, "local shard closed, stop recovery"); return; } if (recoveryStatus.canceled) { // don't remove it, the cancellation code will remove it... 
listener.onIgnoreRecovery(false, "canceled recovery"); return; } recoveryStatus.recoveryThread = Thread.currentThread(); try { logger.trace("[{}][{}] starting recovery from {}", request.shardId().index().name(), request.shardId().id(), request.sourceNode()); StopWatch stopWatch = new StopWatch().start(); RecoveryResponse recoveryResponse = transportService.submitRequest(request.sourceNode(), RecoverySource.Actions.START_RECOVERY, request, new FutureTransportResponseHandler() { @Override public RecoveryResponse newInstance() { return new RecoveryResponse(); } }).txGet(); if (shard.state() == IndexShardState.CLOSED) { removeAndCleanOnGoingRecovery(recoveryStatus); listener.onIgnoreRecovery(false, "local shard closed, stop recovery"); return; } stopWatch.stop(); if (logger.isDebugEnabled()) { StringBuilder sb = new StringBuilder(); sb.append('[').append(request.shardId().index().name()).append(']').append('[').append(request.shardId().id()).append("] "); sb.append("recovery completed from ").append(request.sourceNode()).append(", took[").append(stopWatch.totalTime()).append("]\n"); sb.append(" phase1: recovered_files [").append(recoveryResponse.phase1FileNames.size()).append("]").append(" with total_size of [").append(new ByteSizeValue(recoveryResponse.phase1TotalSize)).append("]") .append(", took [").append(timeValueMillis(recoveryResponse.phase1Time)).append("], throttling_wait [").append(timeValueMillis(recoveryResponse.phase1ThrottlingWaitTime)).append(']') .append("\n"); sb.append(" : reusing_files [").append(recoveryResponse.phase1ExistingFileNames.size()).append("] with total_size of [").append(new ByteSizeValue(recoveryResponse.phase1ExistingTotalSize)).append("]\n"); sb.append(" phase2: start took [").append(timeValueMillis(recoveryResponse.startTime)).append("]\n"); sb.append(" : recovered [").append(recoveryResponse.phase2Operations).append("]").append(" transaction log operations") .append(", took 
[").append(timeValueMillis(recoveryResponse.phase2Time)).append("]") .append("\n"); sb.append(" phase3: recovered [").append(recoveryResponse.phase3Operations).append("]").append(" transaction log operations") .append(", took [").append(timeValueMillis(recoveryResponse.phase3Time)).append("]"); logger.debug(sb.toString()); } removeAndCleanOnGoingRecovery(recoveryStatus); listener.onRecoveryDone(); } catch (Exception e) { // logger.trace("[{}][{}] Got exception on recovery", e, request.shardId().index().name(), request.shardId().id()); if (recoveryStatus.canceled) { // don't remove it, the cancellation code will remove it... listener.onIgnoreRecovery(false, "canceled recovery"); return; } if (shard.state() == IndexShardState.CLOSED) { removeAndCleanOnGoingRecovery(recoveryStatus); listener.onIgnoreRecovery(false, "local shard closed, stop recovery"); return; } Throwable cause = ExceptionsHelper.unwrapCause(e); if (cause instanceof RecoveryEngineException) { // unwrap an exception that was thrown as part of the recovery cause = cause.getCause(); } // do it twice, in case we have double transport exception cause = ExceptionsHelper.unwrapCause(cause); if (cause instanceof RecoveryEngineException) { // unwrap an exception that was thrown as part of the recovery cause = cause.getCause(); } // here, we would add checks against exception that need to be retried (and not removeAndClean in this case) if (cause instanceof IndexShardNotStartedException || cause instanceof IndexMissingException || cause instanceof IndexShardMissingException) { // if the target is not ready yet, retry listener.onRetryRecovery(TimeValue.timeValueMillis(500), recoveryStatus); return; } if (cause instanceof DelayRecoveryException) { listener.onRetryRecovery(TimeValue.timeValueMillis(500), recoveryStatus); return; } // here, we check against ignore recovery options // in general, no need to clean the shard on ignored recovery, since we want to try and reuse it later // it will get deleted in the 
IndicesStore if all are allocated and no shard exists on this node... removeAndCleanOnGoingRecovery(recoveryStatus); if (cause instanceof ConnectTransportException) { listener.onIgnoreRecovery(true, "source node disconnected (" + request.sourceNode() + ")"); return; } if (cause instanceof IndexShardClosedException) { listener.onIgnoreRecovery(true, "source shard is closed (" + request.sourceNode() + ")"); return; } if (cause instanceof AlreadyClosedException) { listener.onIgnoreRecovery(true, "source shard is closed (" + request.sourceNode() + ")"); return; } logger.trace("[{}][{}] recovery from [{}] failed", e, request.shardId().index().name(), request.shardId().id(), request.sourceNode()); listener.onRecoveryFailure(new RecoveryFailedException(request, e), true); } } public static interface RecoveryListener { void onRecoveryDone(); void onRetryRecovery(TimeValue retryAfter, RecoveryStatus status); void onIgnoreRecovery(boolean removeShard, String reason); void onRecoveryFailure(RecoveryFailedException e, boolean sendShardFailure); } @Nullable private RecoveryStatus findRecoveryByShardId(ShardId shardId) { for (RecoveryStatus recoveryStatus : onGoingRecoveries.values()) { if (recoveryStatus.shardId.equals(shardId)) { return recoveryStatus; } } return null; } @Nullable private RecoveryStatus findRecoveryByShard(IndexShard indexShard) { for (RecoveryStatus recoveryStatus : onGoingRecoveries.values()) { if (recoveryStatus.indexShard == indexShard) { return recoveryStatus; } } return null; } private void removeAndCleanOnGoingRecovery(@Nullable RecoveryStatus status) { if (status == null) { return; } // clean it from the on going recoveries since it is being closed status = onGoingRecoveries.remove(status.recoveryId); if (status == null) { return; } // just mark it as canceled as well, just in case there are in flight requests // coming from the recovery target status.canceled = true; // clean open index outputs for (Map.Entry entry : 
status.openIndexOutputs.entrySet()) { synchronized (entry.getValue()) { try { entry.getValue().close(); } catch (Exception e) { // ignore } } } status.openIndexOutputs = null; status.checksums = null; } class PrepareForTranslogOperationsRequestHandler extends BaseTransportRequestHandler { @Override public RecoveryPrepareForTranslogOperationsRequest newInstance() { return new RecoveryPrepareForTranslogOperationsRequest(); } @Override public String executor() { return ThreadPool.Names.GENERIC; } @Override public void messageReceived(RecoveryPrepareForTranslogOperationsRequest request, TransportChannel channel) throws Exception { RecoveryStatus onGoingRecovery = onGoingRecoveries.get(request.recoveryId()); if (onGoingRecovery == null) { // shard is getting closed on us throw new IndexShardClosedException(request.shardId()); } if (onGoingRecovery.canceled) { onGoingRecovery.sentCanceledToSource = true; throw new IndexShardClosedException(request.shardId()); } onGoingRecovery.stage = RecoveryStatus.Stage.TRANSLOG; onGoingRecovery.indexShard.performRecoveryPrepareForTranslog(); channel.sendResponse(TransportResponse.Empty.INSTANCE); } } class FinalizeRecoveryRequestHandler extends BaseTransportRequestHandler { @Override public RecoveryFinalizeRecoveryRequest newInstance() { return new RecoveryFinalizeRecoveryRequest(); } @Override public String executor() { return ThreadPool.Names.GENERIC; } @Override public void messageReceived(RecoveryFinalizeRecoveryRequest request, TransportChannel channel) throws Exception { RecoveryStatus onGoingRecovery = onGoingRecoveries.get(request.recoveryId()); if (onGoingRecovery == null) { // shard is getting closed on us throw new IndexShardClosedException(request.shardId()); } if (onGoingRecovery.canceled) { onGoingRecovery.sentCanceledToSource = true; throw new IndexShardClosedException(request.shardId()); } onGoingRecovery.stage = RecoveryStatus.Stage.FINALIZE; onGoingRecovery.indexShard.performRecoveryFinalization(false, 
onGoingRecovery); onGoingRecovery.time = System.currentTimeMillis() - onGoingRecovery.startTime; onGoingRecovery.stage = RecoveryStatus.Stage.DONE; channel.sendResponse(TransportResponse.Empty.INSTANCE); } } class TranslogOperationsRequestHandler extends BaseTransportRequestHandler { @Override public RecoveryTranslogOperationsRequest newInstance() { return new RecoveryTranslogOperationsRequest(); } @Override public String executor() { return ThreadPool.Names.GENERIC; } @Override public void messageReceived(RecoveryTranslogOperationsRequest request, TransportChannel channel) throws Exception { RecoveryStatus onGoingRecovery = onGoingRecoveries.get(request.recoveryId()); if (onGoingRecovery == null) { // shard is getting closed on us throw new IndexShardClosedException(request.shardId()); } if (onGoingRecovery.canceled) { onGoingRecovery.sentCanceledToSource = true; throw new IndexShardClosedException(request.shardId()); } InternalIndexShard shard = (InternalIndexShard) indicesService.indexServiceSafe(request.shardId().index().name()).shardSafe(request.shardId().id()); for (Translog.Operation operation : request.operations()) { if (onGoingRecovery.canceled) { onGoingRecovery.sentCanceledToSource = true; throw new IndexShardClosedException(request.shardId()); } shard.performRecoveryOperation(operation); onGoingRecovery.currentTranslogOperations++; } channel.sendResponse(TransportResponse.Empty.INSTANCE); } } class FilesInfoRequestHandler extends BaseTransportRequestHandler { @Override public RecoveryFilesInfoRequest newInstance() { return new RecoveryFilesInfoRequest(); } @Override public String executor() { return ThreadPool.Names.GENERIC; } @Override public void messageReceived(RecoveryFilesInfoRequest request, TransportChannel channel) throws Exception { RecoveryStatus onGoingRecovery = onGoingRecoveries.get(request.recoveryId()); if (onGoingRecovery == null) { // shard is getting closed on us throw new IndexShardClosedException(request.shardId()); } if 
(onGoingRecovery.canceled) { onGoingRecovery.sentCanceledToSource = true; throw new IndexShardClosedException(request.shardId()); } onGoingRecovery.phase1FileNames = request.phase1FileNames; onGoingRecovery.phase1FileSizes = request.phase1FileSizes; onGoingRecovery.phase1ExistingFileNames = request.phase1ExistingFileNames; onGoingRecovery.phase1ExistingFileSizes = request.phase1ExistingFileSizes; onGoingRecovery.phase1TotalSize = request.phase1TotalSize; onGoingRecovery.phase1ExistingTotalSize = request.phase1ExistingTotalSize; onGoingRecovery.stage = RecoveryStatus.Stage.INDEX; channel.sendResponse(TransportResponse.Empty.INSTANCE); } } class CleanFilesRequestHandler extends BaseTransportRequestHandler { @Override public RecoveryCleanFilesRequest newInstance() { return new RecoveryCleanFilesRequest(); } @Override public String executor() { return ThreadPool.Names.GENERIC; } @Override public void messageReceived(RecoveryCleanFilesRequest request, TransportChannel channel) throws Exception { RecoveryStatus onGoingRecovery = onGoingRecoveries.get(request.recoveryId()); if (onGoingRecovery == null) { // shard is getting closed on us throw new IndexShardClosedException(request.shardId()); } if (onGoingRecovery.canceled) { onGoingRecovery.sentCanceledToSource = true; throw new IndexShardClosedException(request.shardId()); } Store store =; // first, we go and move files that were created with the recovery id suffix to // the actual names, its ok if we have a corrupted index here, since we have replicas // to recover from in case of a full cluster shutdown just when this code executes... String prefix = "recovery." 
+ onGoingRecovery.startTime + "."; Set filesToRename = Sets.newHashSet(); for (String existingFile : { if (existingFile.startsWith(prefix)) { filesToRename.add(existingFile.substring(prefix.length(), existingFile.length())); } } Exception failureToRename = null; if (!filesToRename.isEmpty()) { // first, go and delete the existing ones for (String fileToRename : filesToRename) {; } for (String fileToRename : filesToRename) { // now, rename the files... try { store.renameFile(prefix + fileToRename, fileToRename); } catch (Exception e) { failureToRename = e; break; } } } if (failureToRename != null) { throw failureToRename; } // now write checksums store.writeChecksums(onGoingRecovery.checksums); for (String existingFile : { // don't delete snapshot file, or the checksums file (note, this is extra protection since the Store won't delete checksum) if (!request.snapshotFiles().contains(existingFile) && !Store.isChecksum(existingFile)) { try {; } catch (Exception e) { // ignore, we don't really care, will get deleted later on } } } channel.sendResponse(TransportResponse.Empty.INSTANCE); } } class FileChunkTransportRequestHandler extends BaseTransportRequestHandler { @Override public RecoveryFileChunkRequest newInstance() { return new RecoveryFileChunkRequest(); } @Override public String executor() { return ThreadPool.Names.GENERIC; } @Override public void messageReceived(final RecoveryFileChunkRequest request, TransportChannel channel) throws Exception { RecoveryStatus onGoingRecovery = onGoingRecoveries.get(request.recoveryId()); if (onGoingRecovery == null) { // shard is getting closed on us throw new IndexShardClosedException(request.shardId()); } if (onGoingRecovery.canceled) { onGoingRecovery.sentCanceledToSource = true; throw new IndexShardClosedException(request.shardId()); } Store store =; IndexOutput indexOutput; if (request.position() == 0) { // first request onGoingRecovery.checksums.remove(; indexOutput = onGoingRecovery.openIndexOutputs.remove(; if 
(indexOutput != null) { try { indexOutput.close(); } catch (IOException e) { // ignore } } // we create an output with no checksum, this is because the pure binary data of the file is not // the checksum (because of seek). We will create the checksum file once copying is done // also, we check if the file already exists, if it does, we create a file name based // on the current recovery "id" and later we make the switch, the reason for that is that // we only want to overwrite the index files once we copied all over, and not create a // case where the index is half moved String name =; if ( { name = "recovery." + onGoingRecovery.startTime + "." + name; } indexOutput = store.createOutputRaw(name); onGoingRecovery.openIndexOutputs.put(, indexOutput); } else { indexOutput = onGoingRecovery.openIndexOutputs.get(; } if (indexOutput == null) { // shard is getting closed on us throw new IndexShardClosedException(request.shardId()); } synchronized (indexOutput) { try { if (recoverySettings.rateLimiter() != null) { recoverySettings.rateLimiter().pause(request.content().length()); } BytesReference content = request.content(); if (!content.hasArray()) { content = content.toBytesArray(); } indexOutput.writeBytes(content.array(), content.arrayOffset(), content.length()); onGoingRecovery.currentFilesSize.addAndGet(request.length()); if (indexOutput.getFilePointer() == request.length()) { // we are done indexOutput.close(); // write the checksum if (request.checksum() != null) { onGoingRecovery.checksums.put(, request.checksum()); }; onGoingRecovery.openIndexOutputs.remove(; } } catch (IOException e) { onGoingRecovery.openIndexOutputs.remove(; try { indexOutput.close(); } catch (IOException e1) { // ignore } throw e; } } channel.sendResponse(TransportResponse.Empty.INSTANCE); } } }

