All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.flink.runtime.executiongraph.failover.StrictRestartPipelinedRegionStrategy Maven / Gradle / Ivy

There is a newer version: 1.5.1
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.runtime.executiongraph.failover;

import org.apache.flink.annotation.VisibleForTesting;
import org.apache.flink.runtime.executiongraph.ExecutionGraph;
import org.apache.flink.runtime.executiongraph.ExecutionVertex;
import org.apache.flink.runtime.executiongraph.IntermediateResultPartition;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayDeque;
import java.util.IdentityHashMap;
import java.util.Optional;
import java.util.Queue;
import java.util.Set;
import java.util.concurrent.Executor;

/**
 * A strict version of {@link RestartPipelinedRegionStrategy} which ensures data consistency
 * even if there are non-deterministic shuffles (rebalance, rescale, ...).
 *
 * <p>Compared to the parent strategy, this one overrides {@link #getRegionsToRestart} so that
 * every consumer region of a restarted region is restarted as well.
 */
public class StrictRestartPipelinedRegionStrategy extends RestartPipelinedRegionStrategy {

	/** The log object used for debugging. */
	private static final Logger LOG = LoggerFactory.getLogger(StrictRestartPipelinedRegionStrategy.class);

	/**
	 * Creates a new failover strategy to restart pipelined regions that works on the given
	 * execution graph and uses the execution graph's future executor to call restart actions.
	 *
	 * @param executionGraph The execution graph on which this FailoverStrategy will work
	 * @param regionFailLimit The max number a region can fail
	 */
	public StrictRestartPipelinedRegionStrategy(ExecutionGraph executionGraph, int regionFailLimit) {
		this(executionGraph, executionGraph.getFutureExecutor(), regionFailLimit);
	}

	/**
	 * Creates a new failover strategy to restart pipelined regions that works on the given
	 * execution graph and uses the given executor to call restart actions. The result partition
	 * failure checker is taken from the execution graph.
	 *
	 * @param executionGraph The execution graph on which this FailoverStrategy will work
	 * @param executor  The executor used for future actions
	 * @param regionFailLimit The max number a region can fail
	 */
	public StrictRestartPipelinedRegionStrategy(ExecutionGraph executionGraph, Executor executor, int regionFailLimit) {
		this(executionGraph, executor, executionGraph.getResultPartitionFailureChecker(), regionFailLimit);
	}

	/**
	 * Creates a new failover strategy with an explicitly supplied failure checker. All arguments
	 * are forwarded unchanged to the parent strategy.
	 *
	 * @param executionGraph The execution graph on which this FailoverStrategy will work
	 * @param executor  The executor used for future actions
	 * @param resultPartitionFailureChecker The checker queried to decide whether a consumed
	 *                                      result partition has failed
	 * @param regionFailLimit The max number a region can fail
	 */
	@VisibleForTesting
	public StrictRestartPipelinedRegionStrategy(
		ExecutionGraph executionGraph,
		Executor executor,
		ResultPartitionFailureChecker resultPartitionFailureChecker,
		int regionFailLimit) {

		super(executionGraph, executor, resultPartitionFailureChecker, regionFailLimit);
	}

	// ------------------------------------------------------------------------
	//  failover implementation
	// ------------------------------------------------------------------------

	/**
	 * All 'involved' regions are proposed to be restarted.
	 * The 'involved' regions are calculated with rules below:
	 * <ol>
	 *   <li>The region containing the failed task is always involved; so is the root failed
	 *       region reported by {@code getRootFailedRegion}, if present and different</li>
	 *   <li>If an input result partition of an involved region is not available, i.e. Missing or
	 *       Corrupted, the region containing the partition producer task is involved</li>
	 *   <li>If a region is involved, all of its consumer regions are involved</li>
	 * </ol>
	 *
	 * <p>The regions are discovered by a breadth-first traversal starting from the failed region.
	 *
	 * @param failedVertex The execution vertex whose failure triggered the failover
	 * @param cause The failure cause, passed on to the root failed region lookup
	 * @return The involved regions, as the key set of an identity-based map (regions are
	 *         distinguished by reference, not by {@code equals})
	 */
	@Override
	@VisibleForTesting
	public Set<FailoverRegion> getRegionsToRestart(ExecutionVertex failedVertex, Throwable cause) {
		// Both maps are used as identity sets; the values are never read.
		final IdentityHashMap<FailoverRegion, Object> regionsToRestart = new IdentityHashMap<>();
		final IdentityHashMap<FailoverRegion, Object> visitedRegions = new IdentityHashMap<>();
		final Queue<FailoverRegion> regionsToVisit = new ArrayDeque<>();

		final FailoverRegion failedRegion = getFailoverRegion(failedVertex);
		regionsToVisit.add(failedRegion);
		visitedRegions.put(failedRegion, null);

		// get root failed region.
		final Optional<FailoverRegion> rootFailedRegion = getRootFailedRegion(failedVertex, cause);
		if (rootFailedRegion.isPresent() && rootFailedRegion.get() != failedRegion) {
			regionsToVisit.add(rootFailedRegion.get());
			visitedRegions.put(rootFailedRegion.get(), null);
		}

		// start from the failed region to visit all involved regions
		while (!regionsToVisit.isEmpty()) {
			final FailoverRegion regionToRestart = regionsToVisit.poll();

			// an involved region should be restarted
			regionsToRestart.put(regionToRestart, null);

			// if a needed input result partition is not available, its producer region is involved
			for (ExecutionVertex vertex : regionToRestart.getAllExecutionVertices()) {
				for (int i = 0; i < vertex.getNumberOfInputs(); ++i) {
					for (IntermediateResultPartition consumedPartition : vertex.getConsumedPartitions(i)) {
						if (resultPartitionFailureChecker.isFailed(consumedPartition.getPartitionId())) {
							enqueueIfUnvisited(
								getFailoverRegion(consumedPartition.getProducer()),
								visitedRegions,
								regionsToVisit);
						}
					}
				}
			}

			// all consumer regions of an involved region should be involved
			for (ExecutionVertex vertex : regionToRestart.getAllExecutionVertices()) {
				for (IntermediateResultPartition resultPartition : vertex.getProducedPartitions().values()) {
					for (ExecutionVertex consumerVertex : resultPartition.getConsumers()) {
						enqueueIfUnvisited(
							getFailoverRegion(consumerVertex),
							visitedRegions,
							regionsToVisit);
					}
				}
			}
		}

		return regionsToRestart.keySet();
	}

	/**
	 * Marks the given region as visited and schedules it for traversal, unless it has been
	 * marked as visited before.
	 *
	 * @param region The region to schedule
	 * @param visitedRegions Identity set of regions that were already scheduled
	 * @param regionsToVisit The traversal queue
	 */
	private static void enqueueIfUnvisited(
		FailoverRegion region,
		IdentityHashMap<FailoverRegion, Object> visitedRegions,
		Queue<FailoverRegion> regionsToVisit) {

		// containsKey (not the return value of put) is required because all values are null
		if (!visitedRegions.containsKey(region)) {
			visitedRegions.put(region, null);
			regionsToVisit.add(region);
		}
	}

	@Override
	public String getStrategyName() {
		return "Strict Pipelined Region Failover";
	}

	// ------------------------------------------------------------------------
	//  factory
	// ------------------------------------------------------------------------

	/**
	 * Factory that instantiates the StrictRestartPipelinedRegionStrategy.
	 */
	public static class Factory implements FailoverStrategy.Factory {

		/** The max number a region can fail; defaults to 100 unless overridden via the setter. */
		private int regionFailLimit = 100;

		@Override
		public FailoverStrategy create(ExecutionGraph executionGraph) {
			return new StrictRestartPipelinedRegionStrategy(executionGraph, regionFailLimit);
		}

		/**
		 * Sets the fail limit handed to every strategy created by this factory afterwards.
		 *
		 * @param regionFailLimit The max number a region can fail
		 */
		public void setRegionFailLimit(int regionFailLimit) {
			this.regionFailLimit = regionFailLimit;
		}
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy