/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.runtime.executiongraph;

import org.apache.flink.annotation.VisibleForTesting;
import org.apache.flink.api.common.ArchivedExecutionConfig;
import org.apache.flink.api.common.ExecutionConfig;
import org.apache.flink.api.common.JobID;
import org.apache.flink.api.common.accumulators.Accumulator;
import org.apache.flink.api.common.accumulators.AccumulatorHelper;
import org.apache.flink.api.common.time.Time;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.configuration.CoreOptions;
import org.apache.flink.configuration.JobManagerOptions;
import org.apache.flink.metrics.Histogram;
import org.apache.flink.metrics.Meter;
import org.apache.flink.metrics.MeterView;
import org.apache.flink.metrics.MetricGroup;
import org.apache.flink.metrics.groups.UnregisteredMetricsGroup;
import org.apache.flink.runtime.JobException;
import org.apache.flink.runtime.StoppingException;
import org.apache.flink.runtime.accumulators.AccumulatorSnapshot;
import org.apache.flink.runtime.accumulators.StringifiedAccumulatorResult;
import org.apache.flink.runtime.blob.BlobWriter;
import org.apache.flink.runtime.blob.PermanentBlobKey;
import org.apache.flink.runtime.checkpoint.CheckpointCoordinator;
import org.apache.flink.runtime.checkpoint.CheckpointIDCounter;
import org.apache.flink.runtime.checkpoint.CheckpointRetentionPolicy;
import org.apache.flink.runtime.checkpoint.CheckpointStatsSnapshot;
import org.apache.flink.runtime.checkpoint.CheckpointStatsTracker;
import org.apache.flink.runtime.checkpoint.CompletedCheckpointStore;
import org.apache.flink.runtime.checkpoint.MasterTriggerRestoreHook;
import org.apache.flink.runtime.clusterframework.types.ResourceProfile;
import org.apache.flink.runtime.clusterframework.types.SlotProfile;
import org.apache.flink.runtime.concurrent.FutureUtils;
import org.apache.flink.runtime.concurrent.FutureUtils.ConjunctFuture;
import org.apache.flink.runtime.concurrent.ScheduledExecutorServiceAdapter;
import org.apache.flink.runtime.deployment.ResultPartitionLocationTrackerProxy;
import org.apache.flink.runtime.execution.ExecutionState;
import org.apache.flink.runtime.execution.SuppressRestartsException;
import org.apache.flink.runtime.executiongraph.failover.FailoverStrategy;
import org.apache.flink.runtime.executiongraph.restart.ExecutionGraphRestartCallback;
import org.apache.flink.runtime.executiongraph.restart.RestartCallback;
import org.apache.flink.runtime.executiongraph.restart.RestartStrategy;
import org.apache.flink.runtime.io.network.partition.ResultPartitionID;
import org.apache.flink.runtime.jobgraph.ExecutionVertexID;
import org.apache.flink.runtime.jobgraph.IntermediateDataSetID;
import org.apache.flink.runtime.jobgraph.JobStatus;
import org.apache.flink.runtime.jobgraph.JobVertex;
import org.apache.flink.runtime.jobgraph.JobVertexID;
import org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration;
import org.apache.flink.runtime.jobmanager.scheduler.CoLocationGroup;
import org.apache.flink.runtime.jobmanager.scheduler.ScheduledUnit;
import org.apache.flink.runtime.jobmanager.scheduler.SlotSharingGroup;
import org.apache.flink.runtime.jobmaster.GraphManager;
import org.apache.flink.runtime.jobmaster.LogicalSlot;
import org.apache.flink.runtime.jobmaster.SlotRequestId;
import org.apache.flink.runtime.jobmaster.slotpool.SlotProvider;
import org.apache.flink.runtime.metrics.SimpleHistogram;
import org.apache.flink.runtime.query.KvStateLocationRegistry;
import org.apache.flink.runtime.schedule.SlotSharingResourceCalculator;
import org.apache.flink.runtime.schedule.SummationSlotSharingResourceCalculator;
import org.apache.flink.runtime.state.SharedStateRegistry;
import org.apache.flink.runtime.state.StateBackend;
import org.apache.flink.runtime.taskmanager.TaskExecutionState;
import org.apache.flink.types.Either;
import org.apache.flink.util.ExceptionUtils;
import org.apache.flink.util.FlinkException;
import org.apache.flink.util.OptionalFailure;
import org.apache.flink.util.Preconditions;
import org.apache.flink.util.SerializedThrowable;
import org.apache.flink.util.SerializedValue;
import org.apache.flink.util.StringUtils;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nullable;

import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.concurrent.CancellationException;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.CompletionException;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Executor;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLongFieldUpdater;
import java.util.concurrent.atomic.AtomicReferenceFieldUpdater;
import java.util.stream.Collectors;

import static org.apache.flink.util.Preconditions.checkArgument;
import static org.apache.flink.util.Preconditions.checkNotNull;
import static org.apache.flink.util.Preconditions.checkState;

/**
 * The execution graph is the central data structure that coordinates the distributed
 * execution of a data flow. It keeps representations of each parallel task, each
 * intermediate stream, and the communication between them.
 *
 * <p>The execution graph consists of the following constructs:
 * <ul>
 *     <li>The {@link ExecutionJobVertex} represents one vertex from the JobGraph (usually one operation like
 *         "map" or "join") during execution. It holds the aggregated state of all parallel subtasks.
 *         The ExecutionJobVertex is identified inside the graph by the {@link JobVertexID}, which it takes
 *         from the JobGraph's corresponding JobVertex.</li>
 *     <li>The {@link ExecutionVertex} represents one parallel subtask. For each ExecutionJobVertex, there are
 *         as many ExecutionVertices as the parallelism. The ExecutionVertex is identified by
 *         the ExecutionJobVertex and the index of the parallel subtask.</li>
 *     <li>The {@link Execution} is one attempt to execute an ExecutionVertex. There may be multiple Executions
 *         for the ExecutionVertex, in case of a failure, or in the case where some data needs to be recomputed
 *         because it is no longer available when requested by later operations. An Execution is always
 *         identified by an {@link ExecutionAttemptID}. All messages between the JobManager and the TaskManager
 *         about deployment of tasks and updates in the task status always use the ExecutionAttemptID to
 *         address the message receiver.</li>
 * </ul>
 *
 * <h2>Global and local failover</h2>
 *
 * <p>The Execution Graph has two failover modes: <i>global failover</i> and <i>local failover</i>.
 *
 * <p>A <b>global failover</b> aborts the task executions for all vertices and restarts the whole
 * data flow graph from the last completed checkpoint. Global failover is considered the
 * "fallback strategy" that is used when a local failover is unsuccessful, or when an issue is
 * found in the state of the ExecutionGraph that could mark it as inconsistent (caused by a bug).
 *
 * <p>A <b>local failover</b> is triggered when an individual vertex execution (a task) fails.
 * The local failover is coordinated by the {@link FailoverStrategy}. A local failover typically
 * attempts to restart as little as possible, but as much as necessary.
 *
 * <p>Between local and global failover, the global failover always takes precedence, because it
 * is the core mechanism that the ExecutionGraph relies on to bring back consistency. To guard
 * this, the ExecutionGraph maintains a <i>global modification version</i>, which is incremented
 * with every global failover (and other global actions, like job cancellation or terminal
 * failure). Local failover is always scoped by the modification version that the execution graph
 * had when the failover was triggered. If a new global modification version is reached during
 * local failover (meaning there is a concurrent global failover), the failover strategy has to
 * yield before the global failover.
 */
public class ExecutionGraph implements AccessExecutionGraph {

	/** In place updater for the execution graph's current state. Avoids having to use an
	 * AtomicReference and thus makes the frequent read access a bit faster. */
	private static final AtomicReferenceFieldUpdater<ExecutionGraph, JobStatus> STATE_UPDATER =
			AtomicReferenceFieldUpdater.newUpdater(ExecutionGraph.class, JobStatus.class, "state");

	/** In place updater for the execution graph's current global recovery version.
	 * Avoids having to use an AtomicLong and thus makes the frequent read access a bit faster. */
	private static final AtomicLongFieldUpdater<ExecutionGraph> GLOBAL_VERSION_UPDATER =
			AtomicLongFieldUpdater.newUpdater(ExecutionGraph.class, "globalModVersion");

	/** The log object used for debugging. */
	static final Logger LOG = LoggerFactory.getLogger(ExecutionGraph.class);

	// --------------------------------------------------------------------------------------------

	/** The lock used to secure all access to mutable fields, especially the tracking of progress
	 * within the job. */
	private final Object progressLock = new Object();

	/** Job specific information like the job id, job name, job configuration, etc. */
	private final JobInformation jobInformation;

	/** Serialized job information or a blob key pointing to the offloaded job information. */
	private final Either<SerializedValue<JobInformation>, PermanentBlobKey> jobInformationOrBlobKey;

	/** The executor which is used to execute futures. */
	private final ScheduledExecutorService futureExecutor;

	/** The executor which is used to execute blocking io operations. */
	private final Executor ioExecutor;

	/** {@code true} if all source tasks are stoppable. */
	private boolean isStoppable = true;

	/** All job vertices that are part of this graph. */
	private final ConcurrentHashMap<JobVertexID, ExecutionJobVertex> tasks;

	/** All vertices, in the order in which they were created. */
	private final List<ExecutionJobVertex> verticesInCreationOrder;

	/** All intermediate results that are part of this graph. */
	private final ConcurrentHashMap<IntermediateDataSetID, IntermediateResult> intermediateResults;

	/** The currently executed tasks, for callbacks. */
	private final ConcurrentHashMap<ExecutionAttemptID, Execution> currentExecutions;

	/** Listeners that receive messages when the entire job switches its status
	 * (such as from RUNNING to FINISHED). */
	private final List<JobStatusListener> jobStatusListeners;

	/** Listeners that receive messages whenever a single task execution changes its status. */
	private final List<ExecutionStatusListener> executionListeners;

	/** The implementation that decides how to recover the failures of tasks. */
	private final FailoverStrategy failoverStrategy;

	/** Timestamps (in milliseconds as returned by {@code System.currentTimeMillis()}) when
	 * the execution graph transitioned into a certain state. The index into this array is the
	 * ordinal of the enum value, i.e. the timestamp when the graph went into state "RUNNING" is
	 * at {@code stateTimestamps[RUNNING.ordinal()]}.
	 */
	private final long[] stateTimestamps;

	/** The timeout for all messages that require a response/acknowledgement. */
	private final Time rpcTimeout;

	/** The timeout for slot allocations. */
	private final Time allocationTimeout;

	/** Strategy to use for restarts. */
	private final RestartStrategy restartStrategy;

	/** The slot provider to use for allocating slots for tasks as they are needed. */
	private final SlotProvider slotProvider;

	/** The classloader for the user code. Needed for calls into user code classes. */
	private final ClassLoader userClassLoader;

	/** Registered KvState instances reported by the TaskManagers. */
	private final KvStateLocationRegistry kvStateLocationRegistry;

	/** Blob writer used to offload RPC messages. */
	private final BlobWriter blobWriter;

	/** The result partition location tracker proxy to get result partition locations from. */
	private final ResultPartitionLocationTrackerProxy resultPartitionLocationTrackerProxy;

	/** The resource calculator for the slot sharing groups. */
	private final SlotSharingResourceCalculator slotSharingResourceCalculator;

	/** The total number of vertices currently in the execution graph. */
	private int numVerticesTotal;

	/** The manager for the graph. */
	private GraphManager graphManager;

	private final Configuration jobManagerConfiguration;

	// ------ Configuration of the Execution -------

	/** Flag to indicate whether the scheduler may queue tasks for execution, or needs to be able
	 * to deploy them immediately. */
	private boolean allowQueuedScheduling = false;

	/**
	 * Indicates whether execution vertices can be scheduled before knowing all of their upstream vertex locations.
	 */
	private boolean allowLazyDeployment = true;

	/**
	 * This limits the number of input splits of a certain operator that can be retrieved by one task.
	 * The value is a multiplier of the average input split count one task can retrieve.
	 * Less than zero means there is no limit.
	 */
	private double perTaskInputSplitsLimitAsAverageMultiplier =
			ExecutionConfig.DEFAULT_PER_TASK_INPUT_SPLITS_LIMMIT_AS_AVERAGE_MULTIPLIER;

	// ------ Execution status and progress. These values are volatile, and accessed under the lock -------

	private final AtomicInteger verticesFinished;

	/** Current status of the job execution. */
	private volatile JobStatus state = JobStatus.CREATED;

	/** A future that completes once the job has reached a terminal state. */
	private volatile CompletableFuture<JobStatus> terminationFuture;

	/** A future that completes once all the execution vertices finish reconciling. */
	private volatile CompletableFuture<Collection<ExecutionAttemptID>> reconcileFuture;

	/** On each global recovery, this version is incremented. The version breaks conflicts
	 * between concurrent restart attempts by local failover strategies. */
	private volatile long globalModVersion;

	/** The exception that caused the job to fail. This is set to the first root exception
	 * that was not recoverable and triggered job failure. */
	private volatile Throwable failureCause;

	/** The extended failure cause information for the job. This exists in addition to 'failureCause',
	 * to let 'failureCause' be a strong reference to the exception, while this info holds no
	 * strong reference to any user-defined classes. */
	private volatile ErrorInfo failureInfo;

	/**
	 * Future for an ongoing or completed scheduling action. Use ConcurrentHashMap as a concurrent hash set.
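	 * Each pending scheduling future is stored as both key and value; it is removed again once it
	 * completes (see {@code scheduleVertices}) and is cancelled by {@code cancelOngoingSchedulings()}
	 * during global actions such as cancel, suspend, and global failover.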
*/ private volatile ConcurrentHashMap, Object> schedulingFutures; // ------ Fields that are relevant to the execution and need to be cleared before archiving ------- /** The coordinator for checkpoints, if snapshot checkpoints are enabled. */ private CheckpointCoordinator checkpointCoordinator; /** Checkpoint stats tracker separate from the coordinator in order to be * available after archiving. */ private CheckpointStatsTracker checkpointStatsTracker; private long updatePartitionInfoSendInterval; // ------ Fields that are only relevant for archived execution graphs ------------ private String jsonPlan; private Meter failOverMetrics; private Histogram taskDeployMetrics; // -------------------------------------------------------------------------------------------- // Constructors // -------------------------------------------------------------------------------------------- public ExecutionGraph( JobInformation jobInformation, ScheduledExecutorService futureExecutor, Executor ioExecutor, Time rpcTimeout, RestartStrategy restartStrategy, FailoverStrategy.Factory failoverStrategyFactory, SlotProvider slotProvider, ClassLoader userClassLoader, BlobWriter blobWriter, ResultPartitionLocationTrackerProxy resultPartitionLocationTrackerProxy, Time allocationTimeout, Configuration jobManagerConfiguration) throws IOException { this( jobInformation, futureExecutor, ioExecutor, rpcTimeout, restartStrategy, failoverStrategyFactory, slotProvider, userClassLoader, blobWriter, resultPartitionLocationTrackerProxy, allocationTimeout, new UnregisteredMetricsGroup(), jobManagerConfiguration); } public ExecutionGraph( JobInformation jobInformation, ScheduledExecutorService futureExecutor, Executor ioExecutor, Time rpcTimeout, RestartStrategy restartStrategy, FailoverStrategy.Factory failoverStrategyFactory, SlotProvider slotProvider, ClassLoader userClassLoader, BlobWriter blobWriter, ResultPartitionLocationTrackerProxy resultPartitionLocationTrackerProxy, Time allocationTimeout, MetricGroup metricGroup, Configuration jobManagerConfiguration) throws IOException { checkNotNull(futureExecutor); this.jobInformation = Preconditions.checkNotNull(jobInformation); this.blobWriter = Preconditions.checkNotNull(blobWriter); this.jobInformationOrBlobKey = BlobWriter.serializeAndTryOffload(jobInformation, jobInformation.getJobId(), blobWriter); this.futureExecutor = Preconditions.checkNotNull(futureExecutor); this.ioExecutor = Preconditions.checkNotNull(ioExecutor); this.slotProvider = Preconditions.checkNotNull(slotProvider, "scheduler"); this.userClassLoader = Preconditions.checkNotNull(userClassLoader, "userClassLoader"); this.resultPartitionLocationTrackerProxy = Preconditions.checkNotNull(resultPartitionLocationTrackerProxy); this.tasks = new ConcurrentHashMap<>(16); this.intermediateResults = new ConcurrentHashMap<>(16); this.verticesInCreationOrder = new ArrayList<>(16); this.currentExecutions = new ConcurrentHashMap<>(16); this.jobStatusListeners = new CopyOnWriteArrayList<>(); this.executionListeners = new CopyOnWriteArrayList<>(); this.stateTimestamps = new long[JobStatus.values().length]; this.stateTimestamps[JobStatus.CREATED.ordinal()] = System.currentTimeMillis(); this.rpcTimeout = checkNotNull(rpcTimeout); this.allocationTimeout = checkNotNull(allocationTimeout); this.restartStrategy = restartStrategy; this.kvStateLocationRegistry = new KvStateLocationRegistry(jobInformation.getJobId(), getAllVertices()); this.verticesFinished = new AtomicInteger(); this.globalModVersion = 1L; // the failover strategy must 
be instantiated last, so that the execution graph // is ready by the time the failover strategy sees it this.failoverStrategy = checkNotNull(failoverStrategyFactory.create(this), "null failover strategy"); LOG.info("Job recovers via failover strategy: {}", failoverStrategy.getStrategyName()); this.schedulingFutures = new ConcurrentHashMap<>(); this.failOverMetrics = metricGroup.meter("task_failover", new MeterView(60)); this.taskDeployMetrics = metricGroup.histogram("task_deploy_cost", new SimpleHistogram()); this.jobManagerConfiguration = checkNotNull(jobManagerConfiguration); final boolean enableSharedSlot = jobManagerConfiguration.getBoolean(JobManagerOptions.SLOT_ENABLE_SHARED_SLOT); if (enableSharedSlot) { this.slotSharingResourceCalculator = new SummationSlotSharingResourceCalculator(); } else { this.slotSharingResourceCalculator = null; } } // -------------------------------------------------------------------------------------------- // Configuration of Data-flow wide execution settings // -------------------------------------------------------------------------------------------- /** * Gets the number of job vertices currently held by this execution graph. * @return The current number of job vertices. */ public int getNumberOfExecutionJobVertices() { return this.verticesInCreationOrder.size(); } public boolean isQueuedSchedulingAllowed() { return this.allowQueuedScheduling; } public void setQueuedSchedulingAllowed(boolean allowed) { this.allowQueuedScheduling = allowed; } public GraphManager getGraphManager() { return graphManager; } public boolean isLazyDeploymentAllowed() { return allowLazyDeployment; } public double getPerTaskInputSplitsLimitAsAverageMultiplier() { return perTaskInputSplitsLimitAsAverageMultiplier; } public void setPerTaskInputSplitsLimitAsAverageMultiplier(double multiplier) { this.perTaskInputSplitsLimitAsAverageMultiplier = multiplier; } public void setGraphManager(GraphManager graphManager) { this.graphManager = checkNotNull(graphManager); this.allowLazyDeployment = graphManager.allowLazyDeployment(); } public ResultPartitionLocationTrackerProxy getResultPartitionLocationTrackerProxy() { return resultPartitionLocationTrackerProxy; } public Time getAllocationTimeout() { return allocationTimeout; } @Override public boolean isArchived() { return false; } public void enableCheckpointing( long interval, long checkpointTimeout, long minPauseBetweenCheckpoints, int maxConcurrentCheckpoints, CheckpointRetentionPolicy retentionPolicy, List verticesToTrigger, List verticesToWaitFor, List verticesToCommitTo, List> masterHooks, CheckpointIDCounter checkpointIDCounter, CompletedCheckpointStore checkpointStore, StateBackend checkpointStateBackend, CheckpointStatsTracker statsTracker) { // simple sanity checks checkArgument(interval >= 10, "checkpoint interval must not be below 10ms"); checkArgument(checkpointTimeout >= 10, "checkpoint timeout must not be below 10ms"); checkState(state == JobStatus.CREATED, "Job must be in CREATED state"); checkState(checkpointCoordinator == null, "checkpointing already enabled"); ExecutionVertex[] tasksToTrigger = collectExecutionVertices(verticesToTrigger); ExecutionVertex[] tasksToWaitFor = collectExecutionVertices(verticesToWaitFor); ExecutionVertex[] tasksToCommitTo = collectExecutionVertices(verticesToCommitTo); checkpointStatsTracker = checkNotNull(statsTracker, "CheckpointStatsTracker"); // create the coordinator that triggers and commits checkpoints and holds the state checkpointCoordinator = new CheckpointCoordinator( 
jobInformation.getJobId(), interval, checkpointTimeout, minPauseBetweenCheckpoints, maxConcurrentCheckpoints, retentionPolicy, tasksToTrigger, tasksToWaitFor, tasksToCommitTo, checkpointIDCounter, checkpointStore, checkpointStateBackend, ioExecutor, SharedStateRegistry.DEFAULT_FACTORY); // register the master hooks on the checkpoint coordinator for (MasterTriggerRestoreHook hook : masterHooks) { if (!checkpointCoordinator.addMasterHook(hook)) { LOG.warn("Trying to register multiple checkpoint hooks with the name: {}", hook.getIdentifier()); } } checkpointCoordinator.setCheckpointStatsTracker(checkpointStatsTracker); // interval of max long value indicates disable periodic checkpoint, // the CheckpointActivatorDeactivator should be created only if the interval is not max value if (interval != Long.MAX_VALUE) { // the periodic checkpoint scheduler is activated and deactivated as a result of // job status changes (running -> on, all other states -> off) registerJobStatusListener(checkpointCoordinator.createActivatorDeactivator()); } } @Nullable public CheckpointCoordinator getCheckpointCoordinator() { return checkpointCoordinator; } public KvStateLocationRegistry getKvStateLocationRegistry() { return kvStateLocationRegistry; } public RestartStrategy getRestartStrategy() { return restartStrategy; } @Override public CheckpointCoordinatorConfiguration getCheckpointCoordinatorConfiguration() { if (checkpointStatsTracker != null) { return checkpointStatsTracker.getJobCheckpointingConfiguration(); } else { return null; } } @Override public CheckpointStatsSnapshot getCheckpointStatsSnapshot() { if (checkpointStatsTracker != null) { return checkpointStatsTracker.createSnapshot(); } else { return null; } } private ExecutionVertex[] collectExecutionVertices(List jobVertices) { if (jobVertices.size() == 1) { ExecutionJobVertex jv = jobVertices.get(0); if (jv.getGraph() != this) { throw new IllegalArgumentException("Can only use ExecutionJobVertices of this ExecutionGraph"); } return jv.getTaskVertices(); } else { ArrayList all = new ArrayList<>(); for (ExecutionJobVertex jv : jobVertices) { if (jv.getGraph() != this) { throw new IllegalArgumentException("Can only use ExecutionJobVertices of this ExecutionGraph"); } all.addAll(Arrays.asList(jv.getTaskVertices())); } return all.toArray(new ExecutionVertex[all.size()]); } } // -------------------------------------------------------------------------------------------- // Properties and Status of the Execution Graph // -------------------------------------------------------------------------------------------- /** * Returns a list of BLOB keys referring to the JAR files required to run this job. * * @return list of BLOB keys referring to the JAR files required to run this job */ public Collection getRequiredJarFiles() { return jobInformation.getRequiredJarFileBlobKeys(); } /** * Returns a list of classpaths referring to the directories/JAR files required to run this job. 
* * @return list of classpaths referring to the directories/JAR files required to run this job */ public Collection getRequiredClasspaths() { return jobInformation.getRequiredClasspathURLs(); } // -------------------------------------------------------------------------------------------- public void setJsonPlan(String jsonPlan) { this.jsonPlan = jsonPlan; } @Override public String getJsonPlan() { return jsonPlan; } public SlotProvider getSlotProvider() { return slotProvider; } public Either, PermanentBlobKey> getJobInformationOrBlobKey() { return jobInformationOrBlobKey; } @Override public JobID getJobID() { return jobInformation.getJobId(); } @Override public long getJobVersion() { return jobInformation.getJobVersion(); } @Override public String getJobName() { return jobInformation.getJobName(); } @Override public boolean isStoppable() { return this.isStoppable; } public Configuration getJobConfiguration() { return jobInformation.getJobConfiguration(); } public ClassLoader getUserClassLoader() { return this.userClassLoader; } @Override public JobStatus getState() { return state; } public Throwable getFailureCause() { return failureCause; } public ErrorInfo getFailureInfo() { return failureInfo; } /** * Gets the number of full restarts that the execution graph went through. * If a full restart recovery is currently pending, this recovery is included in the * count. * * @return The number of full restarts so far */ public long getNumberOfFullRestarts() { // subtract one, because the version starts at one return globalModVersion - 1; } @Override public ExecutionJobVertex getJobVertex(JobVertexID id) { return this.tasks.get(id); } @Override public Map getAllVertices() { return Collections.unmodifiableMap(this.tasks); } @Override public Iterable getVerticesTopologically() { // we return a specific iterator that does not fail with concurrent modifications // the list is append only, so it is safe for that final int numElements = this.verticesInCreationOrder.size(); return new Iterable() { @Override public Iterator iterator() { return new Iterator() { private int pos = 0; @Override public boolean hasNext() { return pos < numElements; } @Override public ExecutionJobVertex next() { if (hasNext()) { return verticesInCreationOrder.get(pos++); } else { throw new NoSuchElementException(); } } @Override public void remove() { throw new UnsupportedOperationException(); } }; } }; } public int getTotalNumberOfVertices() { return numVerticesTotal; } public Map getAllIntermediateResults() { return Collections.unmodifiableMap(this.intermediateResults); } @Override public Iterable getAllExecutionVertices() { return new Iterable() { @Override public Iterator iterator() { return new AllVerticesIterator(getVerticesTopologically().iterator()); } }; } @Override public long getStatusTimestamp(JobStatus status) { return this.stateTimestamps[status.ordinal()]; } public final BlobWriter getBlobWriter() { return blobWriter; } /** * Returns the ExecutionContext associated with this ExecutionGraph. * * @return ExecutionContext associated with this ExecutionGraph */ public Executor getFutureExecutor() { return futureExecutor; } /** * Returns the ExecutionContext associated with this ExecutionGraph. * * @return ExecutionContext associated with this ExecutionGraph */ public ScheduledExecutorService getFutureExecutorService() { return futureExecutor; } /** * Merges all accumulator results from the tasks previously executed in the Executions. 
* @return The accumulator map */ public Map>> aggregateUserAccumulators() { Map>> userAccumulators = new HashMap<>(); for (ExecutionVertex vertex : getAllExecutionVertices()) { Map> next = vertex.getCurrentExecutionAttempt().getUserAccumulators(); if (next != null) { AccumulatorHelper.mergeInto(userAccumulators, next); } } return userAccumulators; } /** * Gets a serialized accumulator map. * @return The accumulator map with serialized accumulator values. */ @Override public Map>> getAccumulatorsSerialized() { return aggregateUserAccumulators() .entrySet() .stream() .collect(Collectors.toMap( Map.Entry::getKey, entry -> serializeAccumulator(entry.getKey(), entry.getValue()))); } private static SerializedValue> serializeAccumulator(String name, OptionalFailure> accumulator) { try { if (accumulator.isFailure()) { return new SerializedValue<>(OptionalFailure.ofFailure(accumulator.getFailureCause())); } return new SerializedValue<>(OptionalFailure.of(accumulator.getUnchecked().getLocalValue())); } catch (IOException ioe) { LOG.error("Could not serialize accumulator " + name + '.', ioe); try { return new SerializedValue<>(OptionalFailure.ofFailure(ioe)); } catch (IOException e) { throw new RuntimeException("It should never happen that we cannot serialize the accumulator serialization exception.", e); } } } /** * Returns the a stringified version of the user-defined accumulators. * @return an Array containing the StringifiedAccumulatorResult objects */ @Override public StringifiedAccumulatorResult[] getAccumulatorResultsStringified() { Map>> accumulatorMap = aggregateUserAccumulators(); return StringifiedAccumulatorResult.stringifyAccumulatorResults(accumulatorMap); } public long getUpdatePartitionInfoSendInterval() { return updatePartitionInfoSendInterval; } public void setUpdatePartitionInfoSendInterval(long updatePartitionInfoSendInterval) { this.updatePartitionInfoSendInterval = updatePartitionInfoSendInterval; } Configuration getJobManagerConfiguration() { return jobManagerConfiguration; } // -------------------------------------------------------------------------------------------- // Actions // -------------------------------------------------------------------------------------------- private void createExecutionJobVertex(List topologiallySorted) throws JobException { final List> futures = new LinkedList<>(); final long createTimestamp = System.currentTimeMillis(); for (JobVertex jobVertex: topologiallySorted) { futures.add(CompletableFuture.supplyAsync(() -> { try { ExecutionJobVertex ejv = new ExecutionJobVertex( this, jobVertex, getJobConfiguration().getInteger(CoreOptions.DEFAULT_PARALLELISM), rpcTimeout, globalModVersion, createTimestamp); ExecutionJobVertex previousTask = tasks.putIfAbsent(jobVertex.getID(), ejv); if (previousTask != null) { throw new JobException( String.format( "Encountered two job vertices with ID %s : previous=[%s] / new=[%s]", jobVertex.getID(), ejv, previousTask)); } return null; } catch (JobException e) { return e; } }, futureExecutor)); } try { // wait for all futures done Collection exceptions = FutureUtils.combineAll(futures).get(); // suppress all optional exceptions Exception suppressedException = null; for (Exception exception : exceptions) { if (exception != null) { suppressedException = ExceptionUtils.firstOrSuppressed(exception, suppressedException); } } if (suppressedException != null) { throw suppressedException; } } catch (Exception e) { throw new JobException("Could not create execution job vertex.", e); } } public void attachJobGraph(List 
topologicallySorted) throws JobException { LOG.debug("Attaching {} topologically sorted vertices to existing job graph with {} " + "vertices and {} intermediate results.", topologicallySorted.size(), tasks.size(), intermediateResults.size()); final ArrayList newExecJobVertices = new ArrayList<>(topologicallySorted.size()); createExecutionJobVertex(topologicallySorted); for (JobVertex jobVertex : topologicallySorted) { if (jobVertex.isInputVertex() && !jobVertex.isStoppable()) { this.isStoppable = false; } ExecutionJobVertex ejv = tasks.get(jobVertex.getID()); ejv.connectToPredecessors(this.intermediateResults); for (IntermediateResult res : ejv.getProducedDataSets()) { IntermediateResult previousDataSet = this.intermediateResults.putIfAbsent(res.getId(), res); if (previousDataSet != null) { throw new JobException(String.format("Encountered two intermediate data set with ID %s : previous=[%s] / new=[%s]", res.getId(), res, previousDataSet)); } } this.verticesInCreationOrder.add(ejv); this.numVerticesTotal += ejv.getParallelism(); newExecJobVertices.add(ejv); } terminationFuture = new CompletableFuture<>(); failoverStrategy.notifyNewVertices(newExecJobVertices); } public void scheduleForExecution() throws JobException { try { for (ExecutionJobVertex ejv : getAllVertices().values()) { ejv.setUpInputSplits(null); } } catch (JobException e) { failGlobal(new SuppressRestartsException(e)); return; } final boolean enableSharedSlot = jobManagerConfiguration.getBoolean(JobManagerOptions.SLOT_ENABLE_SHARED_SLOT); if (enableSharedSlot) { updateSharedSlotResources(); } if (transitionState(JobStatus.CREATED, JobStatus.RUNNING)) { graphManager.startScheduling(); } else { throw new IllegalStateException("Job may only be scheduled from state " + JobStatus.CREATED); } } private void updateSharedSlotResources() { for (ExecutionJobVertex jobVertex : getVerticesTopologically()) { SlotSharingGroup sharingGroup = jobVertex.getJobVertex().getSlotSharingGroup(); if (sharingGroup != null && sharingGroup.getResourceProfile() == null) { ResourceProfile sharingGroupResources = slotSharingResourceCalculator.calculateSharedGroupResource(sharingGroup, this); sharingGroup.setResourceProfile(sharingGroupResources); } } } /** * Schedule execution vertices. These vertices get to deploy only when they have all allocated resources. * * @param vertices execution vertices to schedule */ private CompletableFuture schedule(Collection vertices) { checkState(state == JobStatus.RUNNING, "job is not running currently"); // Important: reserve all the space we need up front. // that way we do not have any operation that can fail between allocating the slots // and adding them to the list. 
If we had a failure in between there, that would // cause the slots to get lost final boolean queued = allowQueuedScheduling; List slotRequestIds = new ArrayList<>(vertices.size()); List scheduledUnits = new ArrayList<>(vertices.size()); List slotProfiles = new ArrayList<>(vertices.size()); List scheduledExecutions = new ArrayList<>(vertices.size()); for (ExecutionVertex ev : vertices) { final Execution exec = ev.getCurrentExecutionAttempt(); try { Tuple2 scheduleUnitAndSlotProfile = exec.enterScheduledAndPrepareSchedulingResources(); slotRequestIds.add(new SlotRequestId()); scheduledUnits.add(scheduleUnitAndSlotProfile.f0); slotProfiles.add(scheduleUnitAndSlotProfile.f1); scheduledExecutions.add(exec); } catch (IllegalExecutionStateException e) { LOG.info("The execution {} may be already scheduled by other thread.", ev.getTaskNameWithSubtaskIndex(), e); } } if (slotRequestIds.isEmpty()) { return CompletableFuture.completedFuture(null); } List> allocationFutures = slotProvider.allocateSlots(slotRequestIds, scheduledUnits, queued, slotProfiles, allocationTimeout); List> assignFutures = new ArrayList<>(slotRequestIds.size()); for (int i = 0; i < allocationFutures.size(); i++) { final int index = i; allocationFutures.get(i).whenComplete( (ignore, throwable) -> { if (throwable != null) { slotProvider.cancelSlotRequest( slotRequestIds.get(index), scheduledUnits.get(index).getSlotSharingGroupId(), scheduledUnits.get(index).getCoLocationConstraint(), throwable); } } ); assignFutures.add(allocationFutures.get(i).thenAccept( (LogicalSlot logicalSlot) -> { if (!scheduledExecutions.get(index).tryAssignResource(logicalSlot)) { // release the slot Exception e = new FlinkException("Could not assign logical slot to execution " + scheduledExecutions.get(index) + '.'); logicalSlot.releaseSlot(e); throw new CompletionException(e); } }) ); } // this future is complete once all slot futures are complete. // the future fails once one slot future fails. final ConjunctFuture> allAssignFutures = FutureUtils.combineAll(assignFutures); CompletableFuture currentSchedulingFuture = allAssignFutures .exceptionally( throwable -> { LOG.info("Batch request {} slots, but only {} are fulfilled.", allAssignFutures.getNumFuturesTotal(), allAssignFutures.getNumFuturesCompleted()); // Complete all futures first, or else the execution fail may cause global failover, // and other executions may fail first without clear the pending request. 
for (int i = 0; i < allocationFutures.size(); i++) { allocationFutures.get(i).completeExceptionally(throwable); } for (Execution execution : scheduledExecutions) { execution.markFailed(ExceptionUtils.stripCompletionException(throwable)); } throw new CompletionException(throwable); } ) .handleAsync( (Collection ignored, Throwable throwable) -> { if (throwable != null) { throw new CompletionException(throwable); } else { boolean hasFailure = false; for (int i = 0; i < scheduledExecutions.size(); i++) { try { scheduledExecutions.get(i).deploy(); } catch (Exception e) { hasFailure = true; LOG.info("Fail to deploy execution {}", scheduledExecutions.get(i), e); scheduledExecutions.get(i).markFailed(e); } } if (hasFailure) { throw new CompletionException( new FlinkException("Fail to deploy some executions.")); } } return null; }, futureExecutor); currentSchedulingFuture.whenComplete( (Void ignored, Throwable throwable) -> { final Throwable strippedThrowable = ExceptionUtils.stripCompletionException(throwable); if (strippedThrowable instanceof CancellationException) { // cancel the individual allocation futures allAssignFutures.cancel(false); } }); return currentSchedulingFuture; } public void cancel() { while (true) { JobStatus current = state; if (current == JobStatus.RUNNING || current == JobStatus.CREATED) { if (transitionState(current, JobStatus.CANCELLING)) { // make sure no concurrent local actions interfere with the cancellation final long globalVersionForRestart = incrementGlobalModVersion(); // cancel ongoing scheduling actions cancelOngoingSchedulings(); final ArrayList> futures = new ArrayList<>(verticesInCreationOrder.size()); // cancel all tasks (that still need cancelling) for (ExecutionJobVertex ejv : verticesInCreationOrder) { futures.add(ejv.cancelWithFuture()); } // we build a future that is complete once all vertices have reached a terminal state final ConjunctFuture allTerminal = FutureUtils.waitForAll(futures); allTerminal.whenComplete( (Void value, Throwable throwable) -> { if (throwable != null) { transitionState( JobStatus.CANCELLING, JobStatus.FAILED, new FlinkException( "Could not cancel job " + getJobName() + " because not all execution job vertices could be cancelled.", throwable)); } else { // cancellations may currently be overridden by failures which trigger // restarts, so we need to pass a proper restart global version here allVerticesInTerminalState(globalVersionForRestart); } } ); return; } } // Executions are being canceled. Go into cancelling and wait for // all vertices to be in their final state. else if (current == JobStatus.FAILING) { if (transitionState(current, JobStatus.CANCELLING)) { return; } } // All vertices have been cancelled and it's safe to directly go // into the canceled state. else if (current == JobStatus.RESTARTING) { synchronized (progressLock) { if (transitionState(current, JobStatus.CANCELED)) { onTerminalState(JobStatus.CANCELED); LOG.info("Canceled during restart."); return; } } } else { // no need to treat other states return; } } } public void stop() throws StoppingException { if (isStoppable) { for (ExecutionVertex ev : this.getAllExecutionVertices()) { if (ev.getNumberOfInputs() == 0) { // send signal to sources only ev.stop(); } } } else { throw new StoppingException("This job is not stoppable."); } } /** * Suspends the current ExecutionGraph. * *

The JobStatus will be directly set to SUSPENDING iff the current state is not a terminal * state. All ExecutionJobVertices will be canceled and the onTerminalState() is executed. * *

The SUSPENDING state is a local terminal state which stops the execution of the job but does * not remove the job from the HA job store so that it can be recovered by another JobManager. * * @param suspensionCause Cause of the suspension */ public void suspend(Throwable suspensionCause) { while (true) { JobStatus currentState = state; if (currentState.isTerminalState() || currentState == JobStatus.SUSPENDING) { // stay in a terminal state return; } else if (transitionState(currentState, JobStatus.SUSPENDING, suspensionCause)) { initFailureCause(suspensionCause); // make sure no concurrent local actions interfere with the cancellation incrementGlobalModVersion(); // cancel ongoing scheduling actions cancelOngoingSchedulings(); final ArrayList> executionJobVertexTerminationFutures = new ArrayList<>(verticesInCreationOrder.size()); for (ExecutionJobVertex ejv: verticesInCreationOrder) { executionJobVertexTerminationFutures.add(ejv.cancelWithFuture()); } final ConjunctFuture jobVerticesTerminationFuture = FutureUtils.waitForAll(executionJobVertexTerminationFutures); jobVerticesTerminationFuture.whenComplete( (Void ignored, Throwable throwable) -> { if (throwable != null) { LOG.debug("Flink could not properly clean up resource after suspension.", throwable); } // the globalModVersion does not play a role because there is no way // currently to leave the SUSPENDING state allVerticesInTerminalState(-1L); LOG.info("Job {} has been suspended.", getJobID()); }); return; } } } /** * Fails the execution graph globally. This failure will not be recovered by a specific * failover strategy, but results in a full restart of all tasks. * *

This global failure is meant to be triggered in cases where the consistency of the * execution graph' state cannot be guaranteed any more (for example when catching unexpected * exceptions that indicate a bug or an unexpected call race), and where a full restart is the * safe way to get consistency back. * * @param t The exception that caused the failure. */ public void failGlobal(Throwable t) { while (true) { JobStatus current = state; // stay in these states if (current == JobStatus.FAILING || current == JobStatus.SUSPENDING || current == JobStatus.SUSPENDED || current == JobStatus.RECONCILING || current.isGloballyTerminalState()) { return; } else if (current == JobStatus.RESTARTING) { // we handle 'failGlobal()' while in 'RESTARTING' as a safety net in case something // has gone wrong in 'RESTARTING' and we need to re-attempt the restarts initFailureCause(t); final long globalVersionForRestart = incrementGlobalModVersion(); if (tryRestartOrFail(globalVersionForRestart)) { return; } } else if (transitionState(current, JobStatus.FAILING, t)) { initFailureCause(t); // make sure no concurrent local or global actions interfere with the failover final long globalVersionForRestart = incrementGlobalModVersion(); // cancel ongoing scheduling actions cancelOngoingSchedulings(); // we build a future that is complete once all vertices have reached a terminal state final ArrayList> futures = new ArrayList<>(verticesInCreationOrder.size()); // cancel all tasks (that still need cancelling) for (ExecutionJobVertex ejv : verticesInCreationOrder) { futures.add(ejv.cancelWithFuture()); } final ConjunctFuture allTerminal = FutureUtils.waitForAll(futures); allTerminal.whenComplete( (Void ignored, Throwable throwable) -> { if (throwable != null) { transitionState( JobStatus.FAILING, JobStatus.FAILED, new FlinkException("Could not cancel all execution job vertices properly.", throwable)); } else { allVerticesInTerminalState(globalVersionForRestart); } }); return; } // else: concurrent change to execution state, retry } } public CompletableFuture> reconcile() { JobStatus curStatus = state; checkState(JobStatus.CREATED.equals(curStatus), "Not allow reconcile in state " + curStatus); if (transitionState(curStatus, JobStatus.RECONCILING)) { List> executionReconcileFutures = new ArrayList<>(currentExecutions.size()); for (ExecutionVertex ev : getAllExecutionVertices()) { executionReconcileFutures.add(ev.getCurrentExecutionAttempt().reconcile()); } reconcileFuture = FutureUtils.combineAll(executionReconcileFutures).whenComplete( (ignore, throwable) -> { boolean allCreated = true; for (Execution execution : getRegisteredExecutions().values()) { if (execution.getState() != ExecutionState.CREATED) { allCreated = false; break; } } if (allCreated && !graphManager.hasToBeScheduledVertices()) { if (!transitionState(JobStatus.RECONCILING, JobStatus.CREATED)) { LOG.error("When reconcile finish, the job is in {}, this is a logical error.", state); } } else if (!transitionState(JobStatus.RECONCILING, JobStatus.RUNNING)) { LOG.error("When reconcile finish, the job is in {}, this is a logical error.", state); } } ); return reconcileFuture; } else { throw new IllegalStateException("ExecutionGraph reconcile fail while in state " + curStatus); } } private void cancelOngoingSchedulings() { for (CompletableFuture ongoingSchedulingFuture : schedulingFutures.keySet()) { ongoingSchedulingFuture.cancel(false); } } public void restart(long expectedGlobalVersion) { try { synchronized (progressLock) { // check the global version to see 
whether this recovery attempt is still valid if (globalModVersion != expectedGlobalVersion) { LOG.info("Concurrent full restart subsumed this restart."); return; } final JobStatus current = state; if (current == JobStatus.CANCELED) { LOG.info("Canceled job during restart. Aborting restart."); return; } else if (current == JobStatus.FAILED) { LOG.info("Failed job during restart. Aborting restart."); return; } else if (current == JobStatus.SUSPENDING || current == JobStatus.SUSPENDED) { LOG.info("Suspended job during restart. Aborting restart."); return; } else if (current != JobStatus.RESTARTING) { throw new IllegalStateException("Can only restart job from state restarting."); } this.currentExecutions.clear(); final Collection colGroups = new HashSet<>(); final long resetTimestamp = System.currentTimeMillis(); for (ExecutionJobVertex jv : this.verticesInCreationOrder) { CoLocationGroup cgroup = jv.getCoLocationGroup(); if (cgroup != null && !colGroups.contains(cgroup)){ cgroup.resetConstraints(); colGroups.add(cgroup); } jv.resetForNewExecution(resetTimestamp, globalModVersion); } for (int i = 0; i < stateTimestamps.length; i++) { if (i != JobStatus.RESTARTING.ordinal()) { // Only clear the non restarting state in order to preserve when the job was // restarted. This is needed for the restarting time gauge stateTimestamps[i] = 0; } } transitionState(JobStatus.RESTARTING, JobStatus.CREATED); // if we have checkpointed state, reload it into the executions if (checkpointCoordinator != null) { checkpointCoordinator.restoreLatestCheckpointedState(getAllVertices(), false, false); } } // Reset the graph manager to initial status graphManager.reset(); if (transitionState(JobStatus.CREATED, JobStatus.RUNNING)) { graphManager.startScheduling(); } else { throw new IllegalStateException("Job may only be scheduled from state " + JobStatus.CREATED); } } catch (Throwable t) { LOG.warn("Failed to restart the job.", t); failGlobal(t); } } public void resetExecutionVerticesAndNotify(long modVersion, List executionVertices) throws Exception { final long resetTimestamp = System.currentTimeMillis(); List evIds = new ArrayList<>(executionVertices.size()); for (ExecutionVertex ev : executionVertices) { ev.resetForNewExecution(resetTimestamp, modVersion); evIds.add(ev.getExecutionVertexID()); } // if we have checkpointed state, reload it into the executions // we restart scheduler to ensure EXACTLY_ONCE mechanism and // to trigger new checkpoint without having to wait for old checkpoint expired if (checkpointCoordinator != null) { checkpointCoordinator.stopCheckpointScheduler(); checkpointCoordinator.restoreLatestCheckpointedState(executionVertices, false, true); checkpointCoordinator.startCheckpointScheduler(); } graphManager.notifyExecutionVertexFailover(evIds); } /** * Restores the latest checkpointed state. * *

The recovery of checkpoints might block. Make sure that calls to this method don't * block the job manager actor and run asynchronously. * * @param errorIfNoCheckpoint Fail if there is no checkpoint available * @param allowNonRestoredState Allow to skip checkpoint state that cannot be mapped * to the ExecutionGraph vertices (if the checkpoint contains state for a * job vertex that is not part of this ExecutionGraph). */ public void restoreLatestCheckpointedState(boolean errorIfNoCheckpoint, boolean allowNonRestoredState) throws Exception { synchronized (progressLock) { if (checkpointCoordinator != null) { checkpointCoordinator.restoreLatestCheckpointedState(getAllVertices(), errorIfNoCheckpoint, allowNonRestoredState); } } } /** * Returns the serializable {@link ArchivedExecutionConfig}. * * @return ArchivedExecutionConfig which may be null in case of errors */ @Override public ArchivedExecutionConfig getArchivedExecutionConfig() { // create a summary of all relevant data accessed in the web interface's JobConfigHandler try { ExecutionConfig executionConfig = jobInformation.getSerializedExecutionConfig().deserializeValue(userClassLoader); if (executionConfig != null) { return executionConfig.archive(); } } catch (IOException | ClassNotFoundException e) { LOG.error("Couldn't create ArchivedExecutionConfig for job {} ", getJobID(), e); } return null; } /** * Returns the termination future of this {@link ExecutionGraph}. The termination future * is completed with the terminal {@link JobStatus} once the ExecutionGraph reaches this * terminal state and all {@link Execution} have been terminated. * * @return Termination future of this {@link ExecutionGraph}. */ public CompletableFuture getTerminationFuture() { return terminationFuture; } /** * Returns the reconcile future of this {@link ExecutionGraph}. The reconcile future * is completed with the execution attempt ids once all the execution vertices finish reconciling. * * @return Reconcile future of this {@link ExecutionGraph}. */ public CompletableFuture> getReconcileFuture() { return reconcileFuture; } @VisibleForTesting public JobStatus waitUntilTerminal() throws InterruptedException { try { return terminationFuture.get(); } catch (ExecutionException e) { // this should never happen // it would be a bug, so we don't expect this to be handled and throw // an unchecked exception here throw new RuntimeException(e); } } /** * Gets the failover strategy used by the execution graph to recover from failures of tasks. */ public FailoverStrategy getFailoverStrategy() { return this.failoverStrategy; } /** * Gets the current global modification version of the ExecutionGraph. * The global modification version is incremented with each global action (cancel/fail/restart) * and is used to disambiguate concurrent modifications between local and global * failover actions. 
*/ public long getGlobalModVersion() { return globalModVersion; } public void scheduleVertices(Collection verticesToSchedule) { try { long currentGlobalModVersion = globalModVersion; if (verticesToSchedule == null || verticesToSchedule.isEmpty()) { return; } int alreadyScheduledCount = 0; int unhealthyCount = 0; final List vertices = new ArrayList<>(verticesToSchedule.size()); for (ExecutionVertexID executionVertexID : verticesToSchedule) { ExecutionVertex ev = getJobVertex(executionVertexID.getJobVertexID()).getTaskVertices()[executionVertexID.getSubTaskIndex()]; if (ev.getExecutionState() == ExecutionState.CREATED) { vertices.add(ev); } else if (ev.getExecutionState() == ExecutionState.SCHEDULED || ev.getExecutionState() == ExecutionState.DEPLOYING || ev.getExecutionState() == ExecutionState.RUNNING || ev.getExecutionState() == ExecutionState.FINISHED) { alreadyScheduledCount++; } else { unhealthyCount++; } } if (unhealthyCount > 0) { throw new IllegalStateException("Not all submitted vertices can be scheduled. " + // alreadyScheduledCount + " vertices are already scheduled and " + unhealthyCount + " vertices are in unhealthy state. " + "Please check the schedule logic in " + graphManager.getClass().getCanonicalName()); } LOG.info("Schedule {} vertices, already scheduled {}", vertices.size(), alreadyScheduledCount); if (vertices.isEmpty()) { return; } final CompletableFuture schedulingFuture = schedule(vertices); if (state == JobStatus.RUNNING && currentGlobalModVersion == globalModVersion) { schedulingFutures.put(schedulingFuture, schedulingFuture); schedulingFuture.whenCompleteAsync( (Void ignored, Throwable throwable) -> { schedulingFutures.remove(schedulingFuture); }, futureExecutor); } else { schedulingFuture.cancel(false); } } catch (Throwable t) { LOG.info("scheduleVertices meet exception, need to fail global execution graph", t); failGlobal(t); } } // ------------------------------------------------------------------------ // State Transitions // ------------------------------------------------------------------------ private boolean transitionState(JobStatus current, JobStatus newState) { return transitionState(current, newState, null); } private boolean transitionState(JobStatus current, JobStatus newState, Throwable error) { // consistency check if (current.isTerminalState()) { String message = "Job is trying to leave terminal state " + current; LOG.error(message); throw new IllegalStateException(message); } // now do the actual state transition if (STATE_UPDATER.compareAndSet(this, current, newState)) { LOG.info("Job {} ({}) switched from state {} to {}.", getJobName(), getJobID(), current, newState, error); stateTimestamps[newState.ordinal()] = System.currentTimeMillis(); notifyJobStatusChange(newState, error); return true; } else { return false; } } private long incrementGlobalModVersion() { return GLOBAL_VERSION_UPDATER.incrementAndGet(this); } private void initFailureCause(Throwable t) { this.failureCause = t; this.failureInfo = new ErrorInfo(t, System.currentTimeMillis()); } // ------------------------------------------------------------------------ // Job Status Progress // ------------------------------------------------------------------------ /** * Called whenever a vertex reaches state FINISHED (completed successfully). * Once all vertices are in the FINISHED state, the program is successfully done. 
*/ void vertexFinished() { final int numFinished = verticesFinished.incrementAndGet(); if (numFinished == numVerticesTotal) { // done :-) // check whether we are still in "RUNNING" and trigger the final cleanup if (state == JobStatus.RUNNING) { // we do the final cleanup in the I/O executor, because it may involve // some heavier work try { for (ExecutionJobVertex ejv : verticesInCreationOrder) { ejv.getJobVertex().finalizeOnMaster(getUserClassLoader()); } } catch (Throwable t) { ExceptionUtils.rethrowIfFatalError(t); failGlobal(new Exception("Failed to finalize execution on master", t)); return; } // if we do not make this state transition, then a concurrent // cancellation or failure happened if (transitionState(JobStatus.RUNNING, JobStatus.FINISHED)) { onTerminalState(JobStatus.FINISHED); } } } } void vertexUnFinished() { verticesFinished.getAndDecrement(); } /** * This method is a callback during cancellation/failover and called when all tasks * have reached a terminal state (cancelled/failed/finished). */ private void allVerticesInTerminalState(long expectedGlobalVersionForRestart) { // we are done, transition to the final state JobStatus current; while (true) { current = this.state; if (current == JobStatus.RUNNING) { failGlobal(new Exception("ExecutionGraph went into allVerticesInTerminalState() from RUNNING")); } else if (current == JobStatus.CANCELLING) { if (transitionState(current, JobStatus.CANCELED)) { onTerminalState(JobStatus.CANCELED); break; } } else if (current == JobStatus.FAILING) { if (tryRestartOrFail(expectedGlobalVersionForRestart)) { break; } // concurrent job status change, let's check again } else if (current == JobStatus.SUSPENDING) { if (transitionState(current, JobStatus.SUSPENDED)) { onTerminalState(JobStatus.SUSPENDED); break; } } else if (current.isGloballyTerminalState()) { LOG.warn("Job has entered globally terminal state without waiting for all " + "job vertices to reach final state."); break; } else { failGlobal(new Exception("ExecutionGraph went into final state from state " + current)); break; } } // done transitioning the state } /** * Try to restart the job. If we cannot restart the job (e.g. no more restarts allowed), then * try to fail the job. This operation is only permitted if the current state is FAILING or * RESTARTING. 
	/**
	 * Try to restart the job. If we cannot restart the job (e.g. no more restarts allowed), then
	 * try to fail the job. This operation is only permitted if the current state is FAILING or
	 * RESTARTING.
	 *
	 * @return true if the operation could be executed; false if a concurrent job status change occurred
	 */
	private boolean tryRestartOrFail(long globalModVersionForRestart) {
		JobStatus currentState = state;

		if (currentState == JobStatus.FAILING || currentState == JobStatus.RESTARTING) {
			final Throwable failureCause = this.failureCause;

			synchronized (progressLock) {
				if (LOG.isDebugEnabled()) {
					LOG.debug("Try to restart or fail the job {} ({}) if no longer possible.",
						getJobName(), getJobID(), failureCause);
				} else {
					LOG.info("Try to restart or fail the job {} ({}) if no longer possible.",
						getJobName(), getJobID());
				}

				final boolean isFailureCauseAllowingRestart = !(failureCause instanceof SuppressRestartsException);
				final boolean isRestartStrategyAllowingRestart = restartStrategy.canRestart();
				boolean isRestartable = isFailureCauseAllowingRestart && isRestartStrategyAllowingRestart;

				if (isRestartable && transitionState(currentState, JobStatus.RESTARTING)) {
					LOG.info("Restarting the job {} ({}).", getJobName(), getJobID());

					RestartCallback restarter = new ExecutionGraphRestartCallback(this, globalModVersionForRestart);
					restartStrategy.restart(restarter, new ScheduledExecutorServiceAdapter(futureExecutor));

					return true;
				} else if (!isRestartable && transitionState(currentState, JobStatus.FAILED, failureCause)) {
					final String cause1 = isFailureCauseAllowingRestart ? null :
						"a type of SuppressRestartsException was thrown";
					final String cause2 = isRestartStrategyAllowingRestart ? null :
						"the restart strategy prevented it";

					LOG.info("Could not restart the job {} ({}) because {}.", getJobName(), getJobID(),
						StringUtils.concatenateWithAnd(cause1, cause2), failureCause);
					onTerminalState(JobStatus.FAILED);

					return true;
				} else {
					// we must have changed the state concurrently, thus we cannot complete this operation
					return false;
				}
			}
		} else {
			// this operation is only allowed in the state FAILING or RESTARTING
			return false;
		}
	}

	private void onTerminalState(JobStatus status) {
		try {
			CheckpointCoordinator coord = this.checkpointCoordinator;
			this.checkpointCoordinator = null;
			if (coord != null) {
				coord.shutdown(status);
			}
		} catch (Exception e) {
			LOG.error("Error while cleaning up after execution", e);
		} finally {
			terminationFuture.complete(status);
		}
	}
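	/*
	 * onTerminalState() completes the graph's terminationFuture exactly once with the final
	 * JobStatus, so interested components can react asynchronously instead of polling the state
	 * field. A hypothetical sketch of consuming such a future (the class name is invented and the
	 * JobStatus import is assumed from stock Flink):
	 *
	 *   import java.util.concurrent.CompletableFuture;
	 *   import org.apache.flink.runtime.jobgraph.JobStatus;
	 *
	 *   public class TerminationFutureDemo {
	 *       public static void main(String[] args) {
	 *           CompletableFuture<JobStatus> terminationFuture = new CompletableFuture<>();
	 *
	 *           // register interest before the job reaches a terminal state
	 *           terminationFuture.thenAccept(status ->
	 *               System.out.println("job reached terminal status " + status));
	 *
	 *           // what onTerminalState(...) effectively does at the end of the job
	 *           terminationFuture.complete(JobStatus.FAILED);
	 *       }
	 *   }
	 */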
	// --------------------------------------------------------------------------------------------
	//  Callbacks and Callback Utilities
	// --------------------------------------------------------------------------------------------

	/**
	 * Updates the state of one of the ExecutionVertex's Execution attempts.
	 * If the new status is "FINISHED", this also updates the accumulators.
	 *
	 * @param state The state update.
	 * @return true if the task update was properly applied, false if the execution attempt was not found.
	 */
	public boolean updateState(TaskExecutionState state) {
		final Execution attempt = currentExecutions.get(state.getID());

		if (attempt != null) {
			try {
				Map<String, Accumulator<?, ?>> accumulators;

				switch (state.getExecutionState()) {
					case RUNNING:
						boolean result = attempt.switchToRunning();
						if (result) {
							taskDeployMetrics.update(
								attempt.getStateTimestamp(ExecutionState.RUNNING) -
									attempt.getStateTimestamp(ExecutionState.DEPLOYING));
						}
						return result;

					case FINISHED:
						// this deserialization is exception-free
						accumulators = deserializeAccumulators(state);
						attempt.markFinished(accumulators, state.getIOMetrics());
						return true;

					case CANCELED:
						// this deserialization is exception-free
						accumulators = deserializeAccumulators(state);
						attempt.cancelingComplete(accumulators, state.getIOMetrics());
						return true;

					case FAILED:
						// this deserialization is exception-free
						accumulators = deserializeAccumulators(state);
						attempt.markFailed(state.getError(userClassLoader), accumulators, state.getIOMetrics());
						return true;

					default:
						// we mark as failed and return false, which triggers the TaskManager
						// to remove the task
						attempt.fail(new Exception("TaskManager sent illegal state update: " + state.getExecutionState()));
						return false;
				}
			} catch (Throwable t) {
				ExceptionUtils.rethrowIfFatalErrorOrOOM(t);

				// failures during updates leave the ExecutionGraph inconsistent
				failGlobal(t);
				return false;
			}
		} else {
			return false;
		}
	}
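	/*
	 * A hypothetical caller-side sketch for updateState(): the reporting side wraps a task's new
	 * state in a TaskExecutionState and applies it to the graph; a false return value means the
	 * attempt is unknown and the task should be cancelled on the TaskManager. The StatusReporter
	 * class is invented, and the three-argument constructor of
	 * org.apache.flink.runtime.taskmanager.TaskExecutionState is assumed from stock Flink:
	 *
	 *   import org.apache.flink.api.common.JobID;
	 *   import org.apache.flink.runtime.execution.ExecutionState;
	 *   import org.apache.flink.runtime.executiongraph.ExecutionAttemptID;
	 *   import org.apache.flink.runtime.executiongraph.ExecutionGraph;
	 *   import org.apache.flink.runtime.taskmanager.TaskExecutionState;
	 *
	 *   class StatusReporter {
	 *       static void reportRunning(ExecutionGraph graph, JobID jobId, ExecutionAttemptID attemptId) {
	 *           boolean applied = graph.updateState(new TaskExecutionState(jobId, attemptId, ExecutionState.RUNNING));
	 *           if (!applied) {
	 *               // unknown attempt: the caller would typically cancel the task on the TaskManager
	 *           }
	 *       }
	 *   }
	 */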

	/**
	 * Deserializes accumulators from a task state update.
	 *
	 * <p>This method never throws an exception!
	 *
	 * @param state The task execution state from which to deserialize the accumulators.
	 * @return The deserialized accumulators, or null, if there are no accumulators or an error occurred.
	 */
	private Map<String, Accumulator<?, ?>> deserializeAccumulators(TaskExecutionState state) {
		AccumulatorSnapshot serializedAccumulators = state.getAccumulators();

		if (serializedAccumulators != null) {
			try {
				return serializedAccumulators.deserializeUserAccumulators(userClassLoader);
			} catch (Throwable t) {
				// we catch Throwable here to include all forms of linking errors that may
				// occur if user classes are missing in the classpath
				LOG.error("Failed to deserialize final accumulator results.", t);
			}
		}
		return null;
	}

	/**
	 * Schedules or updates consumers of the given result partition.
	 *
	 * @param partitionId specifying the result partition whose consumer shall be scheduled or updated
	 * @throws ExecutionGraphException if the schedule or update consumers operation could not be executed
	 */
	public void scheduleOrUpdateConsumers(ResultPartitionID partitionId) throws ExecutionGraphException {

		final Execution execution = currentExecutions.get(partitionId.getProducerId());

		if (execution == null) {
			throw new ExecutionGraphException("Cannot find execution for execution Id " +
				partitionId.getPartitionId() + '.');
		} else if (execution.getVertex() == null) {
			throw new ExecutionGraphException("Execution with execution Id " +
				partitionId.getPartitionId() + " has no vertex assigned.");
		} else {
			execution.getVertex().scheduleOrUpdateConsumers(partitionId);
		}
	}

	public Map<ExecutionAttemptID, Execution> getRegisteredExecutions() {
		return Collections.unmodifiableMap(currentExecutions);
	}

	void registerExecution(Execution exec) {
		Execution previous = currentExecutions.putIfAbsent(exec.getAttemptId(), exec);
		if (previous != null) {
			failGlobal(new Exception("Trying to register execution " + exec + " for already used ID " + exec.getAttemptId()));
		}
	}

	void deregisterExecution(Execution exec) {
		Execution contained = currentExecutions.remove(exec.getAttemptId());

		if (contained != null && contained != exec) {
			failGlobal(new Exception("De-registering execution " + exec + " failed. Found for same ID execution " + contained));
		}
	}
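	/*
	 * registerExecution() detects attempt-ID collisions atomically via ConcurrentMap.putIfAbsent,
	 * which returns null when the key was free and the previously registered value otherwise.
	 * A self-contained, JDK-only sketch of that idiom (RegistryDemo is an invented name):
	 *
	 *   import java.util.concurrent.ConcurrentHashMap;
	 *
	 *   public class RegistryDemo {
	 *       public static void main(String[] args) {
	 *           ConcurrentHashMap<String, String> registry = new ConcurrentHashMap<>();
	 *
	 *           System.out.println(registry.putIfAbsent("attempt-1", "executionA")); // null -> registered
	 *           System.out.println(registry.putIfAbsent("attempt-1", "executionB")); // "executionA" -> collision
	 *       }
	 *   }
	 */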
	/**
	 * Updates the accumulators during the runtime of a job. Final accumulator results are transferred
	 * through the UpdateTaskExecutionState message.
	 *
	 * @param accumulatorSnapshot The serialized Flink and user-defined accumulators
	 */
	public void updateAccumulators(AccumulatorSnapshot accumulatorSnapshot) {
		Map<String, Accumulator<?, ?>> userAccumulators;
		try {
			userAccumulators = accumulatorSnapshot.deserializeUserAccumulators(userClassLoader);

			ExecutionAttemptID execID = accumulatorSnapshot.getExecutionAttemptID();
			Execution execution = currentExecutions.get(execID);
			if (execution != null) {
				execution.setAccumulators(userAccumulators);
			} else {
				LOG.debug("Received accumulator result for unknown execution {}.", execID);
			}
		} catch (Exception e) {
			LOG.error("Cannot update accumulators for job {}.", getJobID(), e);
		}
	}

	// --------------------------------------------------------------------------------------------
	//  Listeners & Observers
	// --------------------------------------------------------------------------------------------

	public void registerJobStatusListener(JobStatusListener listener) {
		if (listener != null) {
			jobStatusListeners.add(listener);
		}
	}

	public void registerExecutionListener(ExecutionStatusListener listener) {
		if (listener != null) {
			executionListeners.add(listener);
		}
	}

	private void notifyJobStatusChange(JobStatus newState, Throwable error) {
		if (jobStatusListeners.size() > 0) {
			final long timestamp = System.currentTimeMillis();
			final Throwable serializedError = error == null ? null : new SerializedThrowable(error);

			for (JobStatusListener listener : jobStatusListeners) {
				try {
					listener.jobStatusChanges(getJobID(), newState, timestamp, serializedError);
				} catch (Throwable t) {
					LOG.warn("Error while notifying JobStatusListener", t);
				}
			}
		}
	}

	void notifyExecutionChange(
			final Execution execution,
			final ExecutionState newExecutionState,
			final Throwable error) {

		if (executionListeners.size() > 0) {
			final ExecutionJobVertex vertex = execution.getVertex().getJobVertex();
			final String message = error == null ? null : ExceptionUtils.stringifyException(error);
			final long timestamp = System.currentTimeMillis();

			for (ExecutionStatusListener listener : executionListeners) {
				try {
					listener.executionStatusChanged(
						getJobID(), vertex.getJobVertexId(), vertex.getJobVertex().getName(),
						vertex.getParallelism(), execution.getParallelSubtaskIndex(),
						execution.getAttemptId(), newExecutionState, timestamp, message);
				} catch (Throwable t) {
					LOG.warn("Error while notifying ExecutionStatusListener", t);
				}
			}
		}

		// see what this means for us. currently, the first FAILED state means -> FAILED
		if (newExecutionState == ExecutionState.FAILED) {
			final Throwable ex = error != null ? error : new FlinkException("Unknown Error (missing cause)");
			long timestamp = execution.getStateTimestamp(ExecutionState.FAILED);

			// by filtering out late failure calls, we can save some work in
			// avoiding redundant local failover
			if (execution.getGlobalModVersion() == globalModVersion) {
				try {
					if (this.failOverMetrics != null) {
						this.failOverMetrics.markEvent();
					}

					// fail all checkpoints which the failed task has not yet acknowledged
					if (checkpointCoordinator != null) {
						checkpointCoordinator.failUnacknowledgedPendingCheckpointsFor(execution.getAttemptId(), ex);
					}

					failoverStrategy.onTaskFailure(execution, ex);
				} catch (Throwable t) {
					// bug in the failover strategy - fall back to global failover
					LOG.warn("Error in failover strategy - falling back to global restart", t);
					failGlobal(ex);
				}
			}
		}
	}
}
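/*
 * A hypothetical sketch of a JobStatusListener matching the callback signature invoked by
 * notifyJobStatusChange(...) above; it would be attached via registerJobStatusListener(listener).
 * The LoggingJobStatusListener name is invented and the JobStatus import is assumed from stock Flink:
 *
 *   import org.apache.flink.api.common.JobID;
 *   import org.apache.flink.runtime.jobgraph.JobStatus;
 *
 *   class LoggingJobStatusListener implements JobStatusListener {
 *       @Override
 *       public void jobStatusChanges(JobID jobId, JobStatus newJobStatus, long timestamp, Throwable error) {
 *           // called from notifyJobStatusChange(); exceptions thrown here are caught and only logged,
 *           // so keep the callback cheap and non-blocking
 *           System.out.println(jobId + " switched to " + newJobStatus + " at " + timestamp);
 *       }
 *   }
 */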