All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.flink.runtime.executiongraph.failover.FailoverRegion Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.runtime.executiongraph.failover;

import org.apache.flink.runtime.concurrent.FutureUtils;
import org.apache.flink.runtime.executiongraph.ExecutionGraph;
import org.apache.flink.runtime.executiongraph.ExecutionVertex;
import org.apache.flink.runtime.executiongraph.GlobalModVersionMismatch;
import org.apache.flink.runtime.jobgraph.JobStatus;
import org.apache.flink.runtime.jobmanager.scheduler.CoLocationGroup;
import org.apache.flink.util.AbstractID;
import org.apache.flink.util.FlinkException;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.Executor;
import java.util.concurrent.atomic.AtomicLongFieldUpdater;
import java.util.concurrent.atomic.AtomicReferenceFieldUpdater;

import static org.apache.flink.util.Preconditions.checkNotNull;

/**
 * FailoverRegion manages the failover of a minimal pipeline connected sub graph.
 * It will change from CREATED to CANCELING and then to CANCELLED and at last to RUNNING,
 */
public class FailoverRegion {

	private static final AtomicReferenceFieldUpdater STATE_UPDATER =
			AtomicReferenceFieldUpdater.newUpdater(FailoverRegion.class, JobStatus.class, "state");

	/** In place updater for the execution graph's current global recovery version.
	 * Avoids having to use an AtomicLong and thus makes the frequent read access a bit faster */
	private static final AtomicLongFieldUpdater REGION_VERSION_UPDATER =
		AtomicLongFieldUpdater.newUpdater(FailoverRegion.class, "regionModVersion");

	/** The log object used for debugging. */
	private static final Logger LOG = LoggerFactory.getLogger(FailoverRegion.class);

	// ------------------------------------------------------------------------

	/** a unique id for debugging. */
	private final AbstractID id = new AbstractID();

	private final ExecutionGraph executionGraph;

	private final List connectedExecutionVertices;

	/** The executor that executes the recovery action after all vertices are in terminal state. */
	private final Executor executor;

	/** Current status of the job execution. */
	private volatile JobStatus state = JobStatus.RUNNING;

	/** On each region failover, this version is incremented. */
	private volatile long regionModVersion;

	/** The max number the region can fail. */
	private final int regionFailLimit;

	/** The number the region has failed and restarted. */
	private volatile int regionFailCount = 0;

	public FailoverRegion(ExecutionGraph executionGraph, Executor executor, List connectedExecutions,
						  int regionFailLimit) {
		this.executionGraph = checkNotNull(executionGraph);
		this.executor = checkNotNull(executor);
		this.connectedExecutionVertices = checkNotNull(connectedExecutions);
		this.regionFailLimit = regionFailLimit;

		LOG.debug("Created failover region {} with vertices: {}", id, connectedExecutions);
	}

	public void onExecutionFail(long globalModVersionOfFailover, Throwable cause) {
		incrementRegionModVersion();
		failover(globalModVersionOfFailover, regionModVersion, cause);
	}

	private void allVerticesInTerminalState(long globalModVersionOfFailover) {
		while (true) {
			JobStatus curStatus = this.state;
			if (curStatus.equals(JobStatus.CANCELLING)) {
				if (transitionState(curStatus, JobStatus.CANCELED)) {
					reset(globalModVersionOfFailover, regionModVersion);
					break;
				}
			}
			else {
				LOG.info("FailoverRegion {} is {} when allVerticesInTerminalState.", id, state);
				break;
			}
		}
	}

	public JobStatus getState() {
		return state;
	}

	/**
	 * get all execution vertices contained in this region.
	 */
	public List getAllExecutionVertices() {
		return connectedExecutionVertices;
	}

	/**
	 * Notify the region to failover.
 	 */
	private void failover(long globalModVersionOfFailover, long regionModVersion, Throwable cause) {
		if (regionModVersion < this.regionModVersion) {
			LOG.info("Ignoring concurrent failover of region {}.", id);
			return;
		}

		LOG.info("Try to fail and restart region due to error: ", cause);

		regionFailCount++;

		if (!executionGraph.getRestartStrategy().canRestart()) {
			executionGraph.failGlobal(new FlinkException("RestartStrategy validate fail", cause));
		}
		else if (regionFailCount > regionFailLimit) {
			executionGraph.failGlobal(new FlinkException("FailoverRegion " + id + " exceeds max region restart limit", cause));
		}
		else {
			JobStatus curStatus = this.state;
			if (curStatus.equals(JobStatus.RUNNING)) {
				cancel(globalModVersionOfFailover, regionModVersion);
			}
			else if (curStatus.equals(JobStatus.CANCELED)) {
				reset(globalModVersionOfFailover, regionModVersion);
			}
			else {
				LOG.info("FailoverRegion {} is {} when notified to failover.", id, state);
			}
		}
	}

	// cancel all executions in this sub graph
	private void cancel(final long globalModVersionOfFailover, final long regionModVersion) {
		while (true) {
			JobStatus curStatus = this.state;
			if (curStatus.equals(JobStatus.RUNNING)) {
				if (transitionState(curStatus, JobStatus.CANCELLING)) {

					// we build a future that is complete once all vertices have reached a terminal state
					final ArrayList> futures = new ArrayList<>(connectedExecutionVertices.size());

					// cancel all tasks (that still need cancelling)
					for (ExecutionVertex vertex : connectedExecutionVertices) {
						futures.add(vertex.cancel());
					}

					final FutureUtils.ConjunctFuture allTerminal = FutureUtils.waitForAll(futures);
					allTerminal.whenCompleteAsync(
						(Void ignored, Throwable throwable) -> {
							if (throwable != null) {
								failover(globalModVersionOfFailover, regionModVersion,
									new FlinkException("Could not cancel all execution job vertices properly.", throwable));
							} else {
								allVerticesInTerminalState(globalModVersionOfFailover);
							}
						},
						executor);

					break;
				}
			}
			else {
				LOG.info("FailoverRegion {} is {} when cancel.", id, state);
				break;
			}
		}
	}

	// reset all executions in this sub graph
	private void reset(long globalModVersionOfFailover, long regionModVersion) {
		if (transitionState(JobStatus.CANCELED, JobStatus.CREATED)) {
			// reset all connected ExecutionVertexes
			final Collection colGroups = new HashSet<>();

			for (ExecutionVertex ev : connectedExecutionVertices) {
				CoLocationGroup cgroup = ev.getJobVertex().getCoLocationGroup();
				if (cgroup != null && !colGroups.contains(cgroup)){
					cgroup.resetConstraints();
					colGroups.add(cgroup);
				}
			}

			restart(globalModVersionOfFailover, regionModVersion);
		}
		else {
			failover(globalModVersionOfFailover, regionModVersion,
					new FlinkException("FailoverRegion " + id + " switch from CANCELLED to CREATED fail."));
		}
	}

	/**
	 * Restart the region by notify the schedule plugin.
	 */
	private void restart(long globalModVersionOfFailover, long regionModVersion) {
		try {
			if (transitionState(JobStatus.CREATED, JobStatus.RUNNING)) {
				// Let the scheduler event to reschedule connected ExecutionVertices
				executionGraph.resetExecutionVerticesAndNotify(globalModVersionOfFailover, connectedExecutionVertices);
			}
			else {
				failover(globalModVersionOfFailover, regionModVersion,
						new FlinkException("FailoverRegion " + id + " witch from CREATED to RUNNING fail."));
			}
		} catch (GlobalModVersionMismatch e) {
			// happens when a global recovery happens concurrently to the regional recovery
			// should do nothing
		} catch (Exception e) {
			failover(globalModVersionOfFailover, regionModVersion,
					new FlinkException("FailoverRegion " + id + " restart failed.", e));
		}
	}

	private boolean transitionState(JobStatus current, JobStatus newState) {
		if (STATE_UPDATER.compareAndSet(this, current, newState)) {
			LOG.info("FailoverRegion {} switched from state {} to {}.", id, current, newState);
			return true;
		}
		else {
			return false;
		}
	}

	private long incrementRegionModVersion() {
		return REGION_VERSION_UPDATER.incrementAndGet(this);
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy