All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.flink.runtime.executiongraph.failover.RestartIndividualStrategy Maven / Gradle / Ivy

There is a newer version: 1.13.6
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.runtime.executiongraph.failover;

import org.apache.flink.metrics.MetricGroup;
import org.apache.flink.metrics.SimpleCounter;
import org.apache.flink.runtime.execution.ExecutionState;
import org.apache.flink.runtime.executiongraph.Execution;
import org.apache.flink.runtime.executiongraph.ExecutionGraph;
import org.apache.flink.runtime.executiongraph.ExecutionJobVertex;
import org.apache.flink.runtime.executiongraph.ExecutionVertex;
import org.apache.flink.runtime.executiongraph.GlobalModVersionMismatch;
import org.apache.flink.runtime.executiongraph.IntermediateResult;
import org.apache.flink.runtime.jobmanager.scheduler.NoResourceAvailableException;
import org.apache.flink.util.FlinkRuntimeException;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.List;
import java.util.concurrent.CompletableFuture;

import static org.apache.flink.util.Preconditions.checkNotNull;

/**
 * Simple failover strategy that restarts each task individually.
 * This strategy is only applicable if the entire job consists of unconnected
 * tasks, meaning each task is its own component.
 *
 * <p>Failures caused by missing resources ({@link NoResourceAvailableException}) are not
 * handled individually; they are escalated to a global failure of the execution graph.
 */
public class RestartIndividualStrategy extends FailoverStrategy {

	private static final Logger LOG = LoggerFactory.getLogger(RestartIndividualStrategy.class);

	// ------------------------------------------------------------------------

	/** The execution graph to recover. */
	private final ExecutionGraph executionGraph;

	/** Counts the task failures that were handled via an individual restart. */
	private final SimpleCounter numTaskFailures;

	/**
	 * Creates a new failover strategy that recovers from failures by restarting only the failed task
	 * of the execution graph.
	 *
	 * @param executionGraph The execution graph to handle; must not be null.
	 */
	public RestartIndividualStrategy(ExecutionGraph executionGraph) {
		this.executionGraph = checkNotNull(executionGraph);
		this.numTaskFailures = new SimpleCounter();
	}

	// ------------------------------------------------------------------------

	/**
	 * Handles the failure of a single task execution.
	 *
	 * <p>A failure caused by a {@link NoResourceAvailableException} is turned into a global
	 * failure. Any other failure increments the task failure counter and triggers a restart of
	 * the failed execution's vertex once the execution's terminal state future completes.
	 *
	 * <p>This method asserts that it is running in the job master's main thread.
	 *
	 * @param taskExecution The execution that failed.
	 * @param cause The cause of the failure.
	 */
	@Override
	public void onTaskFailure(Execution taskExecution, Throwable cause) {

		executionGraph.getJobMasterMainThreadExecutor().assertRunningInMainThread();

		// to better handle the lack of resources (potentially by a scale-in), we
		// make failures due to missing resources global failures
		if (cause instanceof NoResourceAvailableException) {
			LOG.info("Not enough resources to schedule {} - triggering full recovery.", taskExecution);
			executionGraph.failGlobal(cause);
			return;
		}

		LOG.info("Recovering task failure for {} (#{}) via individual restart.",
				taskExecution.getVertex().getTaskNameWithSubtaskIndex(), taskExecution.getAttemptNumber());

		numTaskFailures.inc();

		// trigger the restart once the task has reached its terminal state
		// Note: currently all tasks passed here are already in their terminal state,
		//       so we could actually avoid the future. We use it anyways because it is cheap and
		//       it helps to support better testing
		final CompletableFuture<ExecutionState> terminationFuture = taskExecution.getTerminalStateFuture();
		terminationFuture.thenRun(
			() -> performExecutionVertexRestart(taskExecution.getVertex(), taskExecution.getGlobalModVersion()));
	}

	/**
	 * Resets the given vertex for a new execution attempt and schedules that attempt.
	 *
	 * <p>A {@link GlobalModVersionMismatch} is ignored on purpose. Any other exception
	 * is escalated to a global failure of the execution graph.
	 *
	 * @param vertexToRecover The vertex whose execution should be restarted.
	 * @param globalModVersion The global modification version that the failed execution belonged to.
	 */
	protected void performExecutionVertexRestart(
		ExecutionVertex vertexToRecover,
		long globalModVersion) {
		try {
			long createTimestamp = System.currentTimeMillis();
			Execution newExecution = vertexToRecover.resetForNewExecution(createTimestamp, globalModVersion);
			newExecution.scheduleForExecution();
		} catch (GlobalModVersionMismatch ignored) {
			// this happens if a concurrent global recovery happens. simply do nothing.
		} catch (Exception e) {
			executionGraph.failGlobal(
				new Exception("Error during fine grained recovery - triggering full recovery", e));
		}
	}

	/**
	 * Validates that none of the newly added job vertices is connected to another vertex.
	 *
	 * @param newJobVerticesTopological The newly added job vertices.
	 * @throws FlinkRuntimeException If any of the vertices has inputs or produces data sets.
	 */
	@Override
	public void notifyNewVertices(List<ExecutionJobVertex> newJobVerticesTopological) {
		// we validate here that the vertices are in fact not connected to
		// any other vertices
		for (ExecutionJobVertex ejv : newJobVerticesTopological) {
			List<IntermediateResult> inputs = ejv.getInputs();
			IntermediateResult[] outputs = ejv.getProducedDataSets();

			if ((inputs != null && !inputs.isEmpty()) || (outputs != null && outputs.length > 0)) {
				throw new FlinkRuntimeException("Incompatible failover strategy - strategy '" +
						getStrategyName() + "' can only handle jobs with only disconnected tasks.");
			}
		}
	}

	/**
	 * Gets the name of this failover strategy.
	 *
	 * @return The strategy name, {@code "Individual Task Restart"}.
	 */
	@Override
	public String getStrategyName() {
		return "Individual Task Restart";
	}

	/**
	 * Registers the task failure counter under the name {@code "task_failures"}.
	 *
	 * @param metricGroup The metric group to register the counter with.
	 */
	@Override
	public void registerMetrics(MetricGroup metricGroup) {
		metricGroup.counter("task_failures", numTaskFailures);
	}

	// ------------------------------------------------------------------------
	//  factory
	// ------------------------------------------------------------------------

	/**
	 * Factory that instantiates the RestartIndividualStrategy.
	 */
	public static class Factory implements FailoverStrategy.Factory {

		/**
		 * Creates a new {@link RestartIndividualStrategy} for the given execution graph.
		 *
		 * @param executionGraph The execution graph the strategy should handle.
		 * @return The newly created strategy.
		 */
		@Override
		public RestartIndividualStrategy create(ExecutionGraph executionGraph) {
			return new RestartIndividualStrategy(executionGraph);
		}
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy