All downloads are free. The search and download features use the official Maven repository.

org.apache.flink.runtime.executiongraph.failover.RestartIndividualStrategy Maven / Gradle / Ivy

There is a newer version: 1.5.1
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.runtime.executiongraph.failover;

import org.apache.flink.metrics.MetricGroup;
import org.apache.flink.runtime.execution.ExecutionState;
import org.apache.flink.runtime.executiongraph.Execution;
import org.apache.flink.runtime.executiongraph.ExecutionGraph;
import org.apache.flink.runtime.executiongraph.ExecutionJobVertex;
import org.apache.flink.runtime.executiongraph.ExecutionVertex;
import org.apache.flink.runtime.executiongraph.GlobalModVersionMismatch;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayDeque;
import java.util.Collections;
import java.util.List;
import java.util.Queue;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.Executor;

import static org.apache.flink.util.Preconditions.checkNotNull;

/**
 * Failover strategy that restarts each task individually.
 */
public class RestartIndividualStrategy extends FailoverStrategy {

	private static final Logger LOG = LoggerFactory.getLogger(RestartIndividualStrategy.class);

	// ------------------------------------------------------------------------

	/** The execution graph to recover */
	private final ExecutionGraph executionGraph;

	/** The executor that executes restart callbacks */
	private final Executor callbackExecutor;

	private final Queue taskFailuresTimestamps;

	/** The time-span(in seconds) over which the task failures will be detected. */
	private final int taskFailuresDetectTimeSpan;

	/** The max number task failures can happen during last {@link #taskFailuresDetectTimeSpan} seconds
	 * before fallback to fail global. */
	private final int taskFailuresLimit;

	/**
	 * Creates a new failover strategy that recovers from failures by restarting only the failed task
	 * of the execution graph.
	 *
	 * @param executionGraph The execution graph to handle.
	 * @param callbackExecutor The executor that executes restart callbacks
	 */
	public RestartIndividualStrategy(ExecutionGraph executionGraph, Executor callbackExecutor) {
		this(executionGraph, callbackExecutor, 60, Integer.MAX_VALUE);
	}

	/**
	 * Creates a new failover strategy that recovers from failures by restarting only the failed task
	 * of the execution graph.
	 *
	 * 

The strategy will use the ExecutionGraph's future executor for callbacks. * * @param executionGraph The execution graph to handle. * @param callbackExecutor The executor that executes restart callbacks * @param taskFailuresDetectTimeSpan The time-span(int seconds) over which the task failures will be detected * @param taskFailuresLimit The max number task failures can happen in a late period before fallback to fail global */ public RestartIndividualStrategy(ExecutionGraph executionGraph, Executor callbackExecutor, int taskFailuresDetectTimeSpan, int taskFailuresLimit) { this.executionGraph = checkNotNull(executionGraph); this.callbackExecutor = checkNotNull(callbackExecutor); this.taskFailuresDetectTimeSpan = taskFailuresDetectTimeSpan; this.taskFailuresLimit = taskFailuresLimit; this.taskFailuresTimestamps = new ArrayDeque<>(); } // ------------------------------------------------------------------------ @Override public void onTaskFailure(Execution taskExecution, Throwable cause) { long currentTimestamp = System.currentTimeMillis(); taskFailuresTimestamps.add(currentTimestamp); while (taskFailuresTimestamps.peek() != null) { if (taskFailuresTimestamps.peek() < currentTimestamp / 1000 - taskFailuresDetectTimeSpan) { taskFailuresTimestamps.poll(); continue; } else { break; } } // fail global if the failures limit is exceeded int latestFailuresCount = taskFailuresTimestamps.size(); if (latestFailuresCount > taskFailuresLimit) { LOG.info("Task failures count {} in last {} seconds exceeds failures limit {}. 
Will fail globally.", latestFailuresCount, taskFailuresDetectTimeSpan, taskFailuresLimit); executionGraph.failGlobal(cause); return; } LOG.info("Recovering task failure for {} (#{}) via individual restart.", taskExecution.getVertex().getTaskNameWithSubtaskIndex(), taskExecution.getAttemptNumber()); // trigger the restart once the task has reached its terminal state // Note: currently all tasks passed here are already in their terminal state, // so we could actually avoid the future. We use it anyways because it is cheap and // it helps to support better testing final CompletableFuture terminationFuture = taskExecution.getTerminalStateFuture(); final ExecutionVertex vertexToRecover = taskExecution.getVertex(); final long globalModVersion = taskExecution.getGlobalModVersion(); terminationFuture.thenAcceptAsync( (ExecutionState value) -> { try { executionGraph.resetExecutionVertices(globalModVersion, Collections.singletonList(vertexToRecover)); // Mark the execution required to update partitions to consumers once it is running vertexToRecover.getCurrentExecutionAttempt().setUpdatePartitionToConsumersRequired(true); executionGraph.notifyExecutionVertexFailover(Collections.singletonList(vertexToRecover)); } catch (GlobalModVersionMismatch e) { // this happens if a concurrent global recovery happens. simply do nothing. } catch (Exception e) { executionGraph.failGlobal( new Exception("Error during fine grained recovery - triggering full recovery", e)); } }, callbackExecutor); } @Override public void notifyNewVertices(List newJobVerticesTopological) { } @Override public String getStrategyName() { return "Individual Task Restart"; } @Override public void registerMetrics(MetricGroup metricGroup) { } // ------------------------------------------------------------------------ // factory // ------------------------------------------------------------------------ /** * Factory that instantiates the RestartAllStrategy. 
*/ public static class Factory implements FailoverStrategy.Factory { private final int taskFailuresDetectTimeSpan; private final int taskFailuresLimit; public Factory(int taskFailuresDetectTimeSpan, int taskFailuresLimit) { this.taskFailuresDetectTimeSpan = taskFailuresDetectTimeSpan; this.taskFailuresLimit = taskFailuresLimit; } @Override public RestartIndividualStrategy create(ExecutionGraph executionGraph) { return new RestartIndividualStrategy(executionGraph, executionGraph.getFutureExecutor(), taskFailuresDetectTimeSpan, taskFailuresLimit); } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy