/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.runtime.executiongraph;
import org.apache.flink.annotation.VisibleForTesting;
import org.apache.flink.api.common.ArchivedExecutionConfig;
import org.apache.flink.api.common.ExecutionConfig;
import org.apache.flink.api.common.JobID;
import org.apache.flink.api.common.JobStatus;
import org.apache.flink.api.common.accumulators.Accumulator;
import org.apache.flink.api.common.accumulators.AccumulatorHelper;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.core.execution.DefaultJobExecutionStatusEvent;
import org.apache.flink.core.execution.JobStatusChangedListener;
import org.apache.flink.core.execution.JobStatusHook;
import org.apache.flink.metrics.Counter;
import org.apache.flink.metrics.SimpleCounter;
import org.apache.flink.runtime.JobException;
import org.apache.flink.runtime.accumulators.AccumulatorSnapshot;
import org.apache.flink.runtime.accumulators.StringifiedAccumulatorResult;
import org.apache.flink.runtime.blob.BlobWriter;
import org.apache.flink.runtime.blob.PermanentBlobKey;
import org.apache.flink.runtime.checkpoint.CheckpointCoordinator;
import org.apache.flink.runtime.checkpoint.CheckpointFailureManager;
import org.apache.flink.runtime.checkpoint.CheckpointIDCounter;
import org.apache.flink.runtime.checkpoint.CheckpointPlanCalculator;
import org.apache.flink.runtime.checkpoint.CheckpointStatsSnapshot;
import org.apache.flink.runtime.checkpoint.CheckpointStatsTracker;
import org.apache.flink.runtime.checkpoint.CheckpointsCleaner;
import org.apache.flink.runtime.checkpoint.CompletedCheckpointStore;
import org.apache.flink.runtime.checkpoint.DefaultCheckpointPlanCalculator;
import org.apache.flink.runtime.checkpoint.MasterTriggerRestoreHook;
import org.apache.flink.runtime.checkpoint.OperatorCoordinatorCheckpointContext;
import org.apache.flink.runtime.concurrent.ComponentMainThreadExecutor;
import org.apache.flink.runtime.deployment.TaskDeploymentDescriptorFactory;
import org.apache.flink.runtime.entrypoint.ClusterEntryPointExceptionUtils;
import org.apache.flink.runtime.execution.ExecutionState;
import org.apache.flink.runtime.executiongraph.failover.ResultPartitionAvailabilityChecker;
import org.apache.flink.runtime.executiongraph.failover.partitionrelease.PartitionGroupReleaseStrategy;
import org.apache.flink.runtime.io.network.partition.JobMasterPartitionTracker;
import org.apache.flink.runtime.io.network.partition.ResultPartitionID;
import org.apache.flink.runtime.jobgraph.IntermediateDataSetID;
import org.apache.flink.runtime.jobgraph.IntermediateResultPartitionID;
import org.apache.flink.runtime.jobgraph.JobType;
import org.apache.flink.runtime.jobgraph.JobVertex;
import org.apache.flink.runtime.jobgraph.JobVertex.FinalizeOnMasterContext;
import org.apache.flink.runtime.jobgraph.JobVertexID;
import org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration;
import org.apache.flink.runtime.jobmanager.scheduler.SlotSharingGroup;
import org.apache.flink.runtime.metrics.groups.JobManagerJobMetricGroup;
import org.apache.flink.runtime.operators.coordination.CoordinatorStore;
import org.apache.flink.runtime.operators.coordination.CoordinatorStoreImpl;
import org.apache.flink.runtime.query.KvStateLocationRegistry;
import org.apache.flink.runtime.scheduler.InternalFailuresListener;
import org.apache.flink.runtime.scheduler.SsgNetworkMemoryCalculationUtils;
import org.apache.flink.runtime.scheduler.VertexParallelismInformation;
import org.apache.flink.runtime.scheduler.VertexParallelismStore;
import org.apache.flink.runtime.scheduler.adapter.DefaultExecutionTopology;
import org.apache.flink.runtime.scheduler.strategy.ConsumedPartitionGroup;
import org.apache.flink.runtime.scheduler.strategy.ExecutionVertexID;
import org.apache.flink.runtime.scheduler.strategy.SchedulingExecutionVertex;
import org.apache.flink.runtime.scheduler.strategy.SchedulingResultPartition;
import org.apache.flink.runtime.scheduler.strategy.SchedulingTopology;
import org.apache.flink.runtime.shuffle.ShuffleDescriptor;
import org.apache.flink.runtime.shuffle.ShuffleMaster;
import org.apache.flink.runtime.state.CheckpointStorage;
import org.apache.flink.runtime.state.StateBackend;
import org.apache.flink.runtime.state.StateBackendLoader;
import org.apache.flink.runtime.taskmanager.DispatcherThreadFactory;
import org.apache.flink.util.CollectionUtil;
import org.apache.flink.util.ExceptionUtils;
import org.apache.flink.util.FlinkException;
import org.apache.flink.util.IterableUtils;
import org.apache.flink.util.MdcUtils;
import org.apache.flink.util.OptionalFailure;
import org.apache.flink.util.SerializedValue;
import org.apache.flink.util.TernaryBoolean;
import org.apache.flink.util.concurrent.FutureUtils;
import org.apache.flink.util.concurrent.ScheduledExecutorServiceAdapter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import java.io.IOException;
import java.time.Duration;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Optional;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Executor;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.function.Function;
import java.util.stream.Collectors;
import static org.apache.flink.runtime.executiongraph.ExecutionGraphUtils.isAnyOutputBlocking;
import static org.apache.flink.util.Preconditions.checkNotNull;
import static org.apache.flink.util.Preconditions.checkState;
/** Default implementation of the {@link ExecutionGraph}. */
public class DefaultExecutionGraph implements ExecutionGraph, InternalExecutionGraphAccessor {
/** The log object used for debugging. */
static final Logger LOG = LoggerFactory.getLogger(ExecutionGraph.class);
// --------------------------------------------------------------------------------------------
/**
* The unique id of an execution graph. It is different from JobID, because there can be
* multiple execution graphs created from one job graph, in cases like job re-submission, job
* master failover and job rescaling.
*/
private final ExecutionGraphID executionGraphId;
/** Job specific information like the job id, job name, job configuration, etc. */
private final JobInformation jobInformation;
/** The executor which is used to execute futures. */
private final ScheduledExecutorService futureExecutor;
/** The executor which is used to execute blocking io operations. */
private final Executor ioExecutor;
/** {@link CoordinatorStore} shared across all operator coordinators within this execution. */
private final CoordinatorStore coordinatorStore = new CoordinatorStoreImpl();
/** Executor that runs tasks in the job manager's main thread. */
@Nonnull private ComponentMainThreadExecutor jobMasterMainThreadExecutor;
/** {@code true} if all source tasks are stoppable. */
private boolean isStoppable = true;
/** All job vertices that are part of this graph. */
private final Map<JobVertexID, ExecutionJobVertex> tasks;
/** All vertices, in the order in which they were created. */
private final List<ExecutionJobVertex> verticesInCreationOrder;
/** All intermediate results that are part of this graph. */
private final Map<IntermediateDataSetID, IntermediateResult> intermediateResults;
/** The currently executed tasks, for callbacks. */
private final Map<ExecutionAttemptID, Execution> currentExecutions;
/**
* Listeners that receive messages when the entire job switches its status (such as from RUNNING
* to FINISHED).
*/
private final List<JobStatusListener> jobStatusListeners;
/**
* Timestamps (in milliseconds as returned by {@code System.currentTimeMillis()}) when the
* execution graph transitioned into a certain state. The index into this array is the ordinal
* of the enum value, i.e. the timestamp when the graph went into state "RUNNING" is at {@code
* stateTimestamps[RUNNING.ordinal()]}.
*/
private final long[] stateTimestamps;
/** The timeout for all messages that require a response/acknowledgement. */
private final Duration rpcTimeout;
/** The classloader for the user code. Needed for calls into user code classes. */
private final ClassLoader userClassLoader;
/** Registered KvState instances reported by the TaskManagers. */
private final KvStateLocationRegistry kvStateLocationRegistry;
/** Blob writer used to offload RPC messages. */
private final BlobWriter blobWriter;
/** Number of total job vertices. */
private int numJobVerticesTotal;
private final PartitionGroupReleaseStrategy.Factory partitionGroupReleaseStrategyFactory;
private PartitionGroupReleaseStrategy partitionGroupReleaseStrategy;
private DefaultExecutionTopology executionTopology;
@Nullable private InternalFailuresListener internalTaskFailuresListener;
/** Counts all restarts. Used by other Gauges/Meters; not itself registered with the metric group. */
private final Counter numberOfRestartsCounter = new SimpleCounter();
// ------ Configuration of the Execution -------
/** The maximum number of historical execution attempts kept in history. */
private final int executionHistorySizeLimit;
// ------ Execution status and progress. These values are volatile, and accessed under the lock
// -------
/** Number of finished job vertices. */
private int numFinishedJobVertices;
/** Current status of the job execution. */
private volatile JobStatus state = JobStatus.CREATED;
/** The job type of the job execution. */
private final JobType jobType;
/** A future that completes once the job has reached a terminal state. */
private final CompletableFuture<JobStatus> terminationFuture = new CompletableFuture<>();
/**
* The exception that caused the job to fail. This is set to the first root exception that was
* not recoverable and triggered job failure.
*/
private Throwable failureCause;
/**
* The extended failure cause information for the job. This exists in addition to
* 'failureCause', to let 'failureCause' be a strong reference to the exception, while this info
* holds no strong reference to any user-defined classes.
*/
private ErrorInfo failureInfo;
private final JobMasterPartitionTracker partitionTracker;
private final ResultPartitionAvailabilityChecker resultPartitionAvailabilityChecker;
/** Future for an ongoing or completed scheduling action. */
@Nullable private CompletableFuture<Void> schedulingFuture;
private final VertexAttemptNumberStore initialAttemptCounts;
private final VertexParallelismStore parallelismStore;
// ------ Fields that are relevant to the execution and need to be cleared before archiving
// -------
@Nullable private CheckpointCoordinatorConfiguration checkpointCoordinatorConfiguration;
/** The coordinator for checkpoints, if snapshot checkpoints are enabled. */
@Nullable private CheckpointCoordinator checkpointCoordinator;
/** TODO, replace it with main thread executor. */
@Nullable private ScheduledExecutorService checkpointCoordinatorTimer;
/**
* Checkpoint stats tracker separate from the coordinator in order to be available after
* archiving.
*/
@Nullable private CheckpointStatsTracker checkpointStatsTracker;
// ------ Fields that are only relevant for archived execution graphs ------------
@Nullable private String stateBackendName;
@Nullable private String checkpointStorageName;
@Nullable private String changelogStorageName;
@Nullable private TernaryBoolean stateChangelogEnabled;
private String jsonPlan;
/** Shuffle master to register partitions for task deployment. */
private final ShuffleMaster<?> shuffleMaster;
private final ExecutionDeploymentListener executionDeploymentListener;
private final ExecutionStateUpdateListener executionStateUpdateListener;
private final EdgeManager edgeManager;
private final Map<ExecutionVertexID, ExecutionVertex> executionVerticesById;
private final Map<IntermediateResultPartitionID, IntermediateResultPartition>
resultPartitionsById;
private final VertexInputInfoStore vertexInputInfoStore;
private final boolean isDynamic;
private final ExecutionJobVertex.Factory executionJobVertexFactory;
private final List<JobStatusHook> jobStatusHooks;
private final MarkPartitionFinishedStrategy markPartitionFinishedStrategy;
private final TaskDeploymentDescriptorFactory taskDeploymentDescriptorFactory;
private final List<JobStatusChangedListener> jobStatusChangedListeners;
// --------------------------------------------------------------------------------------------
// Constructors
// --------------------------------------------------------------------------------------------
public DefaultExecutionGraph(
JobType jobType,
JobInformation jobInformation,
ScheduledExecutorService futureExecutor,
Executor ioExecutor,
Duration rpcTimeout,
int executionHistorySizeLimit,
ClassLoader userClassLoader,
BlobWriter blobWriter,
PartitionGroupReleaseStrategy.Factory partitionGroupReleaseStrategyFactory,
ShuffleMaster<?> shuffleMaster,
JobMasterPartitionTracker partitionTracker,
ExecutionDeploymentListener executionDeploymentListener,
ExecutionStateUpdateListener executionStateUpdateListener,
long initializationTimestamp,
VertexAttemptNumberStore initialAttemptCounts,
VertexParallelismStore vertexParallelismStore,
boolean isDynamic,
ExecutionJobVertex.Factory executionJobVertexFactory,
List<JobStatusHook> jobStatusHooks,
MarkPartitionFinishedStrategy markPartitionFinishedStrategy,
TaskDeploymentDescriptorFactory taskDeploymentDescriptorFactory,
List<JobStatusChangedListener> jobStatusChangedListeners) {
this.jobType = jobType;
this.executionGraphId = new ExecutionGraphID();
this.jobInformation = checkNotNull(jobInformation);
this.blobWriter = checkNotNull(blobWriter);
this.futureExecutor = checkNotNull(futureExecutor);
this.ioExecutor = checkNotNull(ioExecutor);
this.userClassLoader = checkNotNull(userClassLoader, "userClassLoader");
this.tasks = CollectionUtil.newHashMapWithExpectedSize(16);
this.intermediateResults = CollectionUtil.newHashMapWithExpectedSize(16);
this.verticesInCreationOrder = new ArrayList<>(16);
this.currentExecutions = CollectionUtil.newHashMapWithExpectedSize(16);
this.jobStatusListeners = new ArrayList<>();
this.stateTimestamps = new long[JobStatus.values().length];
this.stateTimestamps[JobStatus.INITIALIZING.ordinal()] = initializationTimestamp;
this.stateTimestamps[JobStatus.CREATED.ordinal()] = System.currentTimeMillis();
this.rpcTimeout = checkNotNull(rpcTimeout);
this.partitionGroupReleaseStrategyFactory =
checkNotNull(partitionGroupReleaseStrategyFactory);
this.kvStateLocationRegistry =
new KvStateLocationRegistry(jobInformation.getJobId(), getAllVertices());
this.executionHistorySizeLimit = executionHistorySizeLimit;
this.schedulingFuture = null;
this.jobMasterMainThreadExecutor =
new ComponentMainThreadExecutor.DummyComponentMainThreadExecutor(
"ExecutionGraph is not initialized with proper main thread executor. "
+ "Call to ExecutionGraph.start(...) required.");
this.shuffleMaster = checkNotNull(shuffleMaster);
this.partitionTracker = checkNotNull(partitionTracker);
this.resultPartitionAvailabilityChecker =
new ExecutionGraphResultPartitionAvailabilityChecker(
this::createResultPartitionId, partitionTracker);
this.executionDeploymentListener = executionDeploymentListener;
this.executionStateUpdateListener = executionStateUpdateListener;
this.initialAttemptCounts = initialAttemptCounts;
this.parallelismStore = vertexParallelismStore;
this.edgeManager = new EdgeManager();
this.executionVerticesById = new HashMap<>();
this.resultPartitionsById = new HashMap<>();
this.vertexInputInfoStore = new VertexInputInfoStore();
this.isDynamic = isDynamic;
this.executionJobVertexFactory = checkNotNull(executionJobVertexFactory);
this.jobStatusHooks = checkNotNull(jobStatusHooks);
this.markPartitionFinishedStrategy = markPartitionFinishedStrategy;
this.taskDeploymentDescriptorFactory = checkNotNull(taskDeploymentDescriptorFactory);
this.jobStatusChangedListeners = checkNotNull(jobStatusChangedListeners);
LOG.info(
"Created execution graph {} for job {}.",
executionGraphId,
jobInformation.getJobId());
// Trigger hook onCreated
notifyJobStatusHooks(state, null);
}
@Override
public void start(@Nonnull ComponentMainThreadExecutor jobMasterMainThreadExecutor) {
this.jobMasterMainThreadExecutor = jobMasterMainThreadExecutor;
}
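// A minimal wiring sketch (hypothetical scheduler-side code, not part of this
// class): the graph must be given the JobMaster's main thread executor before any
// state transitions or task updates happen, because the placeholder executor
// installed in the constructor fails on first use.
//
//   DefaultExecutionGraph executionGraph = ...;  // built by the scheduler
//   executionGraph.start(mainThreadExecutor);    // a ComponentMainThreadExecutor
//   executionGraph.transitionToRunning();        // only legal from CREATED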
// --------------------------------------------------------------------------------------------
// Configuration of Data-flow wide execution settings
// --------------------------------------------------------------------------------------------
@Override
public SchedulingTopology getSchedulingTopology() {
return executionTopology;
}
@Override
@Nonnull
public ComponentMainThreadExecutor getJobMasterMainThreadExecutor() {
return jobMasterMainThreadExecutor;
}
@Override
public TernaryBoolean isChangelogStateBackendEnabled() {
return stateChangelogEnabled;
}
@Override
public Optional<String> getStateBackendName() {
return Optional.ofNullable(stateBackendName);
}
@Override
public Optional<String> getCheckpointStorageName() {
return Optional.ofNullable(checkpointStorageName);
}
@Override
public Optional<String> getChangelogStorageName() {
return Optional.ofNullable(changelogStorageName);
}
@Override
public void enableCheckpointing(
CheckpointCoordinatorConfiguration chkConfig,
List<MasterTriggerRestoreHook<?>> masterHooks,
CheckpointIDCounter checkpointIDCounter,
CompletedCheckpointStore checkpointStore,
StateBackend checkpointStateBackend,
CheckpointStorage checkpointStorage,
CheckpointStatsTracker statsTracker,
CheckpointsCleaner checkpointsCleaner,
String changelogStorageName) {
checkState(state == JobStatus.CREATED, "Job must be in CREATED state");
checkState(checkpointCoordinator == null, "checkpointing already enabled");
final Collection<OperatorCoordinatorCheckpointContext> operatorCoordinators =
buildOpCoordinatorCheckpointContexts();
checkpointStatsTracker = checkNotNull(statsTracker, "CheckpointStatsTracker");
checkpointCoordinatorConfiguration =
checkNotNull(chkConfig, "CheckpointCoordinatorConfiguration");
CheckpointFailureManager failureManager =
new CheckpointFailureManager(
chkConfig.getTolerableCheckpointFailureNumber(),
new CheckpointFailureManager.FailJobCallback() {
@Override
public void failJob(Throwable cause) {
getJobMasterMainThreadExecutor().execute(() -> failGlobal(cause));
}
@Override
public void failJobDueToTaskFailure(
Throwable cause, ExecutionAttemptID failingTask) {
getJobMasterMainThreadExecutor()
.execute(
() ->
failGlobalIfExecutionIsStillRunning(
cause, failingTask));
}
});
checkState(checkpointCoordinatorTimer == null);
checkpointCoordinatorTimer =
MdcUtils.scopeToJob(
getJobID(),
Executors.newSingleThreadScheduledExecutor(
new DispatcherThreadFactory(
Thread.currentThread().getThreadGroup(),
"Checkpoint Timer")));
// create the coordinator that triggers and commits checkpoints and holds the state
checkpointCoordinator =
new CheckpointCoordinator(
jobInformation.getJobId(),
chkConfig,
operatorCoordinators,
checkpointIDCounter,
checkpointStore,
checkpointStorage,
ioExecutor,
checkpointsCleaner,
new ScheduledExecutorServiceAdapter(checkpointCoordinatorTimer),
failureManager,
createCheckpointPlanCalculator(
chkConfig.isEnableCheckpointsAfterTasksFinish()),
checkpointStatsTracker);
// register the master hooks on the checkpoint coordinator
for (MasterTriggerRestoreHook<?> hook : masterHooks) {
if (!checkpointCoordinator.addMasterHook(hook)) {
LOG.warn(
"Trying to register multiple checkpoint hooks with the name: {}",
hook.getIdentifier());
}
}
if (checkpointCoordinator.isPeriodicCheckpointingConfigured()) {
// the periodic checkpoint scheduler is activated and deactivated as a result of
// job status and topology changes (running & all edges non-blocking -> on, all
// other states -> off)
boolean allTasksOutputNonBlocking =
tasks.values().stream()
.noneMatch(vertex -> vertex.getJobVertex().isAnyOutputBlocking());
registerJobStatusListener(
checkpointCoordinator.createActivatorDeactivator(allTasksOutputNonBlocking));
}
this.stateBackendName = checkpointStateBackend.getName();
this.stateChangelogEnabled =
TernaryBoolean.fromBoolean(
StateBackendLoader.isChangelogStateBackend(checkpointStateBackend));
this.checkpointStorageName = checkpointStorage.getClass().getSimpleName();
this.changelogStorageName = changelogStorageName;
}
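// A condensed call sketch (hypothetical values; the real wiring lives in the
// scheduler / DefaultExecutionGraphBuilder): checkpointing can only be enabled
// while the job is still in CREATED state, i.e. before transitionToRunning().
//
//   executionGraph.enableCheckpointing(
//           chkConfig,                    // CheckpointCoordinatorConfiguration
//           Collections.emptyList(),      // no master hooks in this sketch
//           checkpointIDCounter,
//           completedCheckpointStore,
//           stateBackend,
//           checkpointStorage,
//           statsTracker,
//           checkpointsCleaner,
//           "memory");                    // changelog storage name (assumed value)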
private CheckpointPlanCalculator createCheckpointPlanCalculator(
boolean enableCheckpointsAfterTasksFinish) {
return new DefaultCheckpointPlanCalculator(
getJobID(),
new ExecutionGraphCheckpointPlanCalculatorContext(this),
getVerticesTopologically(),
enableCheckpointsAfterTasksFinish);
}
@Override
@Nullable
public CheckpointCoordinator getCheckpointCoordinator() {
return checkpointCoordinator;
}
@Override
public KvStateLocationRegistry getKvStateLocationRegistry() {
return kvStateLocationRegistry;
}
@Override
public CheckpointCoordinatorConfiguration getCheckpointCoordinatorConfiguration() {
if (checkpointCoordinatorConfiguration != null) {
return checkpointCoordinatorConfiguration;
} else {
return null;
}
}
@Override
public CheckpointStatsSnapshot getCheckpointStatsSnapshot() {
if (checkpointStatsTracker != null) {
return checkpointStatsTracker.createSnapshot();
} else {
return null;
}
}
private Collection<OperatorCoordinatorCheckpointContext>
buildOpCoordinatorCheckpointContexts() {
final ArrayList<OperatorCoordinatorCheckpointContext> contexts = new ArrayList<>();
for (final ExecutionJobVertex vertex : verticesInCreationOrder) {
contexts.addAll(vertex.getOperatorCoordinators());
}
contexts.trimToSize();
return contexts;
}
// --------------------------------------------------------------------------------------------
// Properties and Status of the Execution Graph
// --------------------------------------------------------------------------------------------
@Override
public void setJsonPlan(String jsonPlan) {
this.jsonPlan = jsonPlan;
}
@Override
public String getJsonPlan() {
return jsonPlan;
}
@Override
public JobID getJobID() {
return jobInformation.getJobId();
}
@Override
public String getJobName() {
return jobInformation.getJobName();
}
@Override
public boolean isStoppable() {
return this.isStoppable;
}
@Override
public Configuration getJobConfiguration() {
return jobInformation.getJobConfiguration();
}
@Override
public ClassLoader getUserClassLoader() {
return this.userClassLoader;
}
@Override
public JobStatus getState() {
return state;
}
@Override
public JobType getJobType() {
return jobType;
}
@Override
public Throwable getFailureCause() {
return failureCause;
}
public ErrorInfo getFailureInfo() {
return failureInfo;
}
@Override
public long getNumberOfRestarts() {
return numberOfRestartsCounter.getCount();
}
@Override
public int getNumFinishedVertices() {
return IterableUtils.toStream(getVerticesTopologically())
.map(ExecutionJobVertex::getNumExecutionVertexFinished)
.mapToInt(Integer::intValue)
.sum();
}
@Override
public ExecutionJobVertex getJobVertex(JobVertexID id) {
return this.tasks.get(id);
}
@Override
public Map<JobVertexID, ExecutionJobVertex> getAllVertices() {
return Collections.unmodifiableMap(this.tasks);
}
@Override
public Iterable<ExecutionJobVertex> getVerticesTopologically() {
// we return a specific iterator that does not fail with concurrent modifications
// the list is append only, so it is safe for that
final int numElements = this.verticesInCreationOrder.size();
return new Iterable<ExecutionJobVertex>() {
@Override
public Iterator<ExecutionJobVertex> iterator() {
return new Iterator<ExecutionJobVertex>() {
private int pos = 0;
@Override
public boolean hasNext() {
return pos < numElements;
}
@Override
public ExecutionJobVertex next() {
if (hasNext()) {
return verticesInCreationOrder.get(pos++);
} else {
throw new NoSuchElementException();
}
}
@Override
public void remove() {
throw new UnsupportedOperationException();
}
};
}
};
}
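// Illustrative use (sketch): because the iterator snapshots the list size up front
// and the list is append-only, callers may iterate while new vertices are being
// attached without risking a ConcurrentModificationException.
//
//   for (ExecutionJobVertex ejv : executionGraph.getVerticesTopologically()) {
//       LOG.debug("vertex {} with parallelism {}", ejv.getName(), ejv.getParallelism());
//   }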
@Override
public Map<IntermediateDataSetID, IntermediateResult> getAllIntermediateResults() {
return Collections.unmodifiableMap(this.intermediateResults);
}
@Override
public Iterable<ExecutionVertex> getAllExecutionVertices() {
return () -> new AllVerticesIterator<>(getVerticesTopologically().iterator());
}
@Override
public EdgeManager getEdgeManager() {
return edgeManager;
}
@Override
public ExecutionVertex getExecutionVertexOrThrow(ExecutionVertexID id) {
return checkNotNull(executionVerticesById.get(id));
}
@Override
public IntermediateResultPartition getResultPartitionOrThrow(
final IntermediateResultPartitionID id) {
return checkNotNull(resultPartitionsById.get(id));
}
@Override
public long getStatusTimestamp(JobStatus status) {
return this.stateTimestamps[status.ordinal()];
}
@Override
public final BlobWriter getBlobWriter() {
return blobWriter;
}
@Override
public Executor getFutureExecutor() {
return futureExecutor;
}
@Override
public Map<String, OptionalFailure<Accumulator<?, ?>>> aggregateUserAccumulators() {
Map<String, OptionalFailure<Accumulator<?, ?>>> userAccumulators = new HashMap<>();
for (ExecutionVertex vertex : getAllExecutionVertices()) {
Map<String, Accumulator<?, ?>> next =
vertex.getCurrentExecutionAttempt().getUserAccumulators();
if (next != null) {
AccumulatorHelper.mergeInto(userAccumulators, next);
}
}
return userAccumulators;
}
/**
* Gets a serialized accumulator map.
*
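* <p>A minimal usage sketch (hypothetical caller code, not part of this class):
*
* <pre>{@code
* Map<String, SerializedValue<OptionalFailure<Object>>> serialized =
*         executionGraph.getAccumulatorsSerialized();
* serialized.forEach((name, value) -> LOG.debug("accumulator {}: {}", name, value));
* }</pre>
*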
* @return The accumulator map with serialized accumulator values.
*/
@Override
public Map<String, SerializedValue<OptionalFailure<Object>>> getAccumulatorsSerialized() {
return aggregateUserAccumulators().entrySet().stream()
.collect(
Collectors.toMap(
Map.Entry::getKey,
entry -> serializeAccumulator(entry.getKey(), entry.getValue())));
}
private static SerializedValue<OptionalFailure<Object>> serializeAccumulator(
String name, OptionalFailure<Accumulator<?, ?>> accumulator) {
try {
if (accumulator.isFailure()) {
return new SerializedValue<>(
OptionalFailure.ofFailure(accumulator.getFailureCause()));
}
return new SerializedValue<>(
OptionalFailure.of(accumulator.getUnchecked().getLocalValue()));
} catch (IOException ioe) {
LOG.error("Could not serialize accumulator " + name + '.', ioe);
try {
return new SerializedValue<>(OptionalFailure.ofFailure(ioe));
} catch (IOException e) {
throw new RuntimeException(
"It should never happen that we cannot serialize the accumulator serialization exception.",
e);
}
}
}
/**
* Returns a stringified version of the user-defined accumulators.
*
* @return an array containing the StringifiedAccumulatorResult objects
*/
@Override
public StringifiedAccumulatorResult[] getAccumulatorResultsStringified() {
Map<String, OptionalFailure<Accumulator<?, ?>>> accumulatorMap =
aggregateUserAccumulators();
return StringifiedAccumulatorResult.stringifyAccumulatorResults(accumulatorMap);
}
@Override
public void setInternalTaskFailuresListener(
final InternalFailuresListener internalTaskFailuresListener) {
checkNotNull(internalTaskFailuresListener);
checkState(
this.internalTaskFailuresListener == null,
"internalTaskFailuresListener can be only set once");
this.internalTaskFailuresListener = internalTaskFailuresListener;
}
// --------------------------------------------------------------------------------------------
// Actions
// --------------------------------------------------------------------------------------------
@Override
public void notifyNewlyInitializedJobVertices(List<ExecutionJobVertex> vertices) {
executionTopology.notifyExecutionGraphUpdated(this, vertices);
}
@Override
public void attachJobGraph(
List<JobVertex> verticesToAttach, JobManagerJobMetricGroup jobManagerJobMetricGroup)
throws JobException {
assertRunningInJobMasterMainThread();
LOG.debug(
"Attaching {} topologically sorted vertices to existing job graph with {} "
+ "vertices and {} intermediate results.",
verticesToAttach.size(),
tasks.size(),
intermediateResults.size());
attachJobVertices(verticesToAttach, jobManagerJobMetricGroup);
if (!isDynamic) {
initializeJobVertices(verticesToAttach);
}
// the topology assigning should happen before notifying new vertices to failoverStrategy
executionTopology = DefaultExecutionTopology.fromExecutionGraph(this);
partitionGroupReleaseStrategy =
partitionGroupReleaseStrategyFactory.createInstance(getSchedulingTopology());
}
/** Attach job vertices without initializing them. */
private void attachJobVertices(
List<JobVertex> topologicallySorted, JobManagerJobMetricGroup jobManagerJobMetricGroup)
throws JobException {
for (JobVertex jobVertex : topologicallySorted) {
if (jobVertex.isInputVertex() && !jobVertex.isStoppable()) {
this.isStoppable = false;
}
VertexParallelismInformation parallelismInfo =
parallelismStore.getParallelismInfo(jobVertex.getID());
// create the execution job vertex and attach it to the graph
ExecutionJobVertex ejv =
executionJobVertexFactory.createExecutionJobVertex(
this,
jobVertex,
parallelismInfo,
coordinatorStore,
jobManagerJobMetricGroup);
ExecutionJobVertex previousTask = this.tasks.putIfAbsent(jobVertex.getID(), ejv);
if (previousTask != null) {
throw new JobException(
String.format(
"Encountered two job vertices with ID %s : previous=[%s] / new=[%s]",
jobVertex.getID(), ejv, previousTask));
}
this.verticesInCreationOrder.add(ejv);
this.numJobVerticesTotal++;
}
}
private void initializeJobVertices(List<JobVertex> topologicallySorted) throws JobException {
final long createTimestamp = System.currentTimeMillis();
for (JobVertex jobVertex : topologicallySorted) {
final ExecutionJobVertex ejv = tasks.get(jobVertex.getID());
initializeJobVertex(ejv, createTimestamp);
}
}
@Override
public void initializeJobVertex(
ExecutionJobVertex ejv,
long createTimestamp,
Map<IntermediateDataSetID, JobVertexInputInfo> jobVertexInputInfos)
throws JobException {
checkNotNull(ejv);
checkNotNull(jobVertexInputInfos);
jobVertexInputInfos.forEach(
(resultId, info) ->
this.vertexInputInfoStore.put(ejv.getJobVertexId(), resultId, info));
ejv.initialize(
executionHistorySizeLimit,
rpcTimeout,
createTimestamp,
this.initialAttemptCounts.getAttemptCounts(ejv.getJobVertexId()));
ejv.connectToPredecessors(this.intermediateResults);
for (IntermediateResult res : ejv.getProducedDataSets()) {
IntermediateResult previousDataSet =
this.intermediateResults.putIfAbsent(res.getId(), res);
if (previousDataSet != null) {
throw new JobException(
String.format(
"Encountered two intermediate data set with ID %s : previous=[%s] / new=[%s]",
res.getId(), res, previousDataSet));
}
}
registerExecutionVerticesAndResultPartitionsFor(ejv);
// enrich network memory.
SlotSharingGroup slotSharingGroup = ejv.getSlotSharingGroup();
if (areJobVerticesAllInitialized(slotSharingGroup)) {
SsgNetworkMemoryCalculationUtils.enrichNetworkMemory(
slotSharingGroup, this::getJobVertex, shuffleMaster);
}
}
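// Sketch of the lazy-initialization path used for dynamic (adaptive batch) graphs,
// where vertices are attached eagerly but initialized only once their parallelism
// is decided (hypothetical scheduler-side code):
//
//   executionGraph.initializeJobVertex(ejv, System.currentTimeMillis(), inputInfos);
//   executionGraph.notifyNewlyInitializedJobVertices(Collections.singletonList(ejv));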
private boolean areJobVerticesAllInitialized(final SlotSharingGroup group) {
for (JobVertexID jobVertexId : group.getJobVertexIds()) {
final ExecutionJobVertex jobVertex = getJobVertex(jobVertexId);
checkNotNull(jobVertex, "Unknown job vertex %s", jobVertexId);
if (!jobVertex.isInitialized()) {
return false;
}
}
return true;
}
@Override
public void transitionToRunning() {
if (!transitionState(JobStatus.CREATED, JobStatus.RUNNING)) {
throw new IllegalStateException(
"Job may only be scheduled from state " + JobStatus.CREATED);
}
}
@Override
public void cancel() {
assertRunningInJobMasterMainThread();
while (true) {
JobStatus current = state;
if (current == JobStatus.RUNNING
|| current == JobStatus.CREATED
|| current == JobStatus.RESTARTING) {
if (transitionState(current, JobStatus.CANCELLING)) {
resetExecutionGraph(ExecutionJobVertex::cancelWithFuture)
.whenComplete(
(Void value, Throwable throwable) -> {
if (throwable != null) {
transitionState(
JobStatus.CANCELLING,
JobStatus.FAILED,
new FlinkException(
"Could not cancel job "
+ getJobName()
+ " because not all execution job vertices could be cancelled.",
throwable));
} else {
// cancellations may currently be overridden by failures
// which trigger restarts, so we need to pass a proper
// restart global version here
allVerticesInTerminalState();
}
});
return;
}
}
// Executions are being canceled. Go into cancelling and wait for
// all vertices to be in their final state.
else if (current == JobStatus.FAILING) {
if (transitionState(current, JobStatus.CANCELLING)) {
return;
}
} else {
// no need to treat other states
return;
}
}
}
private CompletableFuture<Void> resetExecutionGraph(
Function<ExecutionJobVertex, CompletableFuture<Void>> perVertexOperationAsync) {
assertRunningInJobMasterMainThread();
incrementRestarts();
// cancel ongoing scheduling action
if (schedulingFuture != null) {
schedulingFuture.cancel(false);
}
return applyToVertexAsync(perVertexOperationAsync);
}
private CompletableFuture<Void> applyToVertexAsync(
Function<ExecutionJobVertex, CompletableFuture<Void>> perVertexOperationAsync) {
return FutureUtils.waitForAll(
verticesInCreationOrder.stream()
.map(perVertexOperationAsync)
.collect(Collectors.toList()));
}
@Override
public void suspend(Throwable suspensionCause) {
assertRunningInJobMasterMainThread();
if (state.isTerminalState()) {
// stay in a terminal state
return;
} else if (transitionState(state, JobStatus.SUSPENDED, suspensionCause)) {
initFailureCause(suspensionCause, System.currentTimeMillis());
final CompletableFuture<Void> jobVerticesTerminationFuture =
resetExecutionGraph(ExecutionJobVertex::suspend);
checkState(jobVerticesTerminationFuture.isDone(), "Suspend needs to happen atomically");
jobVerticesTerminationFuture.whenComplete(
(Void ignored, Throwable throwable) -> {
if (throwable != null) {
LOG.debug("Could not properly suspend the execution graph.", throwable);
}
onTerminalState(state);
LOG.info("Job {} has been suspended.", getJobID());
});
} else {
throw new IllegalStateException(
String.format(
"Could not suspend because transition from %s to %s failed.",
state, JobStatus.SUSPENDED));
}
}
void failGlobalIfExecutionIsStillRunning(Throwable cause, ExecutionAttemptID failingAttempt) {
final Execution failedExecution = currentExecutions.get(failingAttempt);
if (failedExecution != null
&& (failedExecution.getState() == ExecutionState.RUNNING
|| failedExecution.getState() == ExecutionState.INITIALIZING)) {
failGlobal(cause);
} else {
LOG.debug(
"The failing attempt {} belongs to an already not"
+ " running task thus won't fail the job",
failingAttempt);
}
}
@Override
public void failGlobal(Throwable t) {
checkState(internalTaskFailuresListener != null);
internalTaskFailuresListener.notifyGlobalFailure(t);
}
/**
* Returns the serializable {@link ArchivedExecutionConfig}.
*
* @return ArchivedExecutionConfig which may be null in case of errors
*/
@Override
public ArchivedExecutionConfig getArchivedExecutionConfig() {
// create a summary of all relevant data accessed in the web interface's JobConfigHandler
try {
ExecutionConfig executionConfig =
jobInformation.getSerializedExecutionConfig().deserializeValue(userClassLoader);
if (executionConfig != null) {
return executionConfig.archive();
}
} catch (IOException | ClassNotFoundException e) {
LOG.error("Couldn't create ArchivedExecutionConfig for job {} ", getJobID(), e);
}
return null;
}
@Override
public CompletableFuture<JobStatus> getTerminationFuture() {
return terminationFuture;
}
@Override
@VisibleForTesting
public JobStatus waitUntilTerminal() throws InterruptedException {
try {
return terminationFuture.get();
} catch (ExecutionException e) {
// this should never happen
// it would be a bug, so we don't expect this to be handled and throw
// an unchecked exception here
throw new RuntimeException(e);
}
}
// ------------------------------------------------------------------------
// State Transitions
// ------------------------------------------------------------------------
@Override
public boolean transitionState(JobStatus current, JobStatus newState) {
return transitionState(current, newState, null);
}
private void transitionState(JobStatus newState, Throwable error) {
transitionState(state, newState, error);
}
private boolean transitionState(JobStatus current, JobStatus newState, Throwable error) {
assertRunningInJobMasterMainThread();
// consistency check
if (current.isTerminalState()) {
String message = "Job is trying to leave terminal state " + current;
LOG.error(message);
throw new IllegalStateException(message);
}
// now do the actual state transition
if (state == current) {
state = newState;
LOG.info(
"Job {} ({}) switched from state {} to {}.",
getJobName(),
getJobID(),
current,
newState,
error);
stateTimestamps[newState.ordinal()] = System.currentTimeMillis();
notifyJobStatusChange(current, newState, error);
notifyJobStatusHooks(newState, error);
return true;
} else {
return false;
}
}
@Override
public void incrementRestarts() {
numberOfRestartsCounter.inc();
}
@Override
public void initFailureCause(Throwable t, long timestamp) {
this.failureCause = t;
this.failureInfo = new ErrorInfo(t, timestamp);
}
// ------------------------------------------------------------------------
// Job Status Progress
// ------------------------------------------------------------------------
/**
* Called whenever a job vertex reaches state FINISHED (completed successfully). Once all job
* vertices are in the FINISHED state, the program is successfully done.
*/
@Override
public void jobVertexFinished() {
assertRunningInJobMasterMainThread();
final int numFinished = ++numFinishedJobVertices;
if (numFinished == numJobVerticesTotal) {
FutureUtils.assertNoException(
waitForAllExecutionsTermination().thenAccept(ignored -> jobFinished()));
}
}
private CompletableFuture<?> waitForAllExecutionsTermination() {
final List<CompletableFuture<?>> terminationFutures =
verticesInCreationOrder.stream()
.flatMap(ejv -> Arrays.stream(ejv.getTaskVertices()))
.map(ExecutionVertex::getTerminationFuture)
.collect(Collectors.toList());
return FutureUtils.waitForAll(terminationFutures);
}
private void jobFinished() {
assertRunningInJobMasterMainThread();
// check whether we are still in "RUNNING" and trigger the final cleanup
if (state == JobStatus.RUNNING) {
// we do the final cleanup in the I/O executor, because it may involve
// some heavier work
try {
for (ExecutionJobVertex ejv : verticesInCreationOrder) {
final Map<Integer, Integer> subtaskToFinishedAttempt =
Arrays.stream(ejv.getTaskVertices())
.map(ExecutionVertex::getCurrentExecutionAttempt)
.collect(
Collectors.toMap(
Execution::getParallelSubtaskIndex,
Execution::getAttemptNumber));
ejv.getJobVertex()
.finalizeOnMaster(
new FinalizeOnMasterContext() {
@Override
public ClassLoader getClassLoader() {
return getUserClassLoader();
}
@Override
public int getExecutionParallelism() {
return ejv.getParallelism();
}
@Override
public int getFinishedAttempt(int subtaskIndex) {
final Integer attemptNumber =
subtaskToFinishedAttempt.get(subtaskIndex);
if (attemptNumber == null) {
throw new IllegalArgumentException(
"Invalid subtaskIndex "
+ subtaskIndex
+ " provided");
}
return attemptNumber;
}
});
}
} catch (Throwable t) {
ExceptionUtils.rethrowIfFatalError(t);
ClusterEntryPointExceptionUtils.tryEnrichClusterEntryPointError(t);
failGlobal(new Exception("Failed to finalize execution on master", t));
return;
}
// if we do not make this state transition, then a concurrent
// cancellation or failure happened
if (transitionState(JobStatus.RUNNING, JobStatus.FINISHED)) {
onTerminalState(JobStatus.FINISHED);
}
}
}
@Override
public void jobVertexUnFinished() {
assertRunningInJobMasterMainThread();
numFinishedJobVertices--;
}
/**
* This method is a callback during cancellation/failover and called when all tasks have reached
* a terminal state (cancelled/failed/finished).
*/
private void allVerticesInTerminalState() {
assertRunningInJobMasterMainThread();
// we are done, transition to the final state
JobStatus current;
while (true) {
current = this.state;
if (current == JobStatus.RUNNING) {
failGlobal(
new Exception(
"ExecutionGraph went into allVerticesInTerminalState() from RUNNING"));
} else if (current == JobStatus.CANCELLING) {
if (transitionState(current, JobStatus.CANCELED)) {
onTerminalState(JobStatus.CANCELED);
break;
}
} else if (current == JobStatus.FAILING) {
break;
} else if (current.isGloballyTerminalState()) {
LOG.warn(
"Job has entered globally terminal state without waiting for all "
+ "job vertices to reach final state.");
break;
} else {
failGlobal(
new Exception(
"ExecutionGraph went into final state from state " + current));
break;
}
}
// done transitioning the state
}
@Override
public void failJob(Throwable cause, long timestamp) {
if (state == JobStatus.FAILING || state.isTerminalState()) {
return;
}
transitionState(JobStatus.FAILING, cause);
initFailureCause(cause, timestamp);
FutureUtils.assertNoException(
applyToVertexAsync(ExecutionJobVertex::cancelWithFuture)
.whenComplete(
(aVoid, throwable) -> {
if (transitionState(
JobStatus.FAILING, JobStatus.FAILED, cause)) {
onTerminalState(JobStatus.FAILED);
} else if (state == JobStatus.CANCELLING) {
transitionState(JobStatus.CANCELLING, JobStatus.CANCELED);
onTerminalState(JobStatus.CANCELED);
} else if (!state.isTerminalState()) {
throw new IllegalStateException(
"Cannot complete job failing from an unexpected state: "
+ state);
}
}));
}
private void onTerminalState(JobStatus status) {
LOG.debug("ExecutionGraph {} reached terminal state {}.", getJobID(), status);
try {
CheckpointCoordinator coord = this.checkpointCoordinator;
this.checkpointCoordinator = null;
if (coord != null) {
coord.shutdown();
}
if (checkpointCoordinatorTimer != null) {
checkpointCoordinatorTimer.shutdownNow();
checkpointCoordinatorTimer = null;
}
} catch (Exception e) {
LOG.error("Error while cleaning up after execution", e);
} finally {
terminationFuture.complete(status);
}
}
// --------------------------------------------------------------------------------------------
// Callbacks and Callback Utilities
// --------------------------------------------------------------------------------------------
@Override
public boolean updateState(TaskExecutionStateTransition state) {
assertRunningInJobMasterMainThread();
final Execution attempt = currentExecutions.get(state.getID());
if (attempt != null) {
try {
final boolean stateUpdated = updateStateInternal(state, attempt);
maybeReleasePartitionGroupsFor(attempt);
return stateUpdated;
} catch (Throwable t) {
ExceptionUtils.rethrowIfFatalErrorOrOOM(t);
// failures during updates leave the ExecutionGraph inconsistent
failGlobal(t);
return false;
}
} else {
return false;
}
}
private boolean updateStateInternal(
final TaskExecutionStateTransition state, final Execution attempt) {
Map<String, Accumulator<?, ?>> accumulators;
switch (state.getExecutionState()) {
case INITIALIZING:
return attempt.switchToInitializing();
case RUNNING:
if (!isAnyOutputBlocking(this)
&& checkpointCoordinator != null
&& checkpointCoordinator.isPeriodicCheckpointingConfigured()
&& !checkpointCoordinator.isPeriodicCheckpointingStarted()) {
checkpointCoordinator.startCheckpointScheduler();
}
return attempt.switchToRunning();
case FINISHED:
// this deserialization is exception-free
accumulators = deserializeAccumulators(state);
attempt.markFinished(accumulators, state.getIOMetrics());
return true;
case CANCELED:
// this deserialization is exception-free
accumulators = deserializeAccumulators(state);
attempt.completeCancelling(accumulators, state.getIOMetrics(), false);
return true;
case FAILED:
// this deserialization is exception-free
accumulators = deserializeAccumulators(state);
attempt.markFailed(
state.getError(userClassLoader),
state.getCancelTask(),
accumulators,
state.getIOMetrics(),
state.getReleasePartitions(),
true);
return true;
default:
// we mark as failed and return false, which triggers the TaskManager
// to remove the task
attempt.fail(
new Exception(
"TaskManager sent illegal state update: "
+ state.getExecutionState()));
return false;
}
}
private void maybeReleasePartitionGroupsFor(final Execution attempt) {
final ExecutionVertexID finishedExecutionVertex = attempt.getVertex().getID();
if (attempt.getState() == ExecutionState.FINISHED) {
final List<ConsumedPartitionGroup> releasablePartitionGroups =
partitionGroupReleaseStrategy.vertexFinished(finishedExecutionVertex);
releasePartitionGroups(releasablePartitionGroups);
} else {
partitionGroupReleaseStrategy.vertexUnfinished(finishedExecutionVertex);
}
}
private void releasePartitionGroups(
final List<ConsumedPartitionGroup> releasablePartitionGroups) {
if (releasablePartitionGroups.size() > 0) {
final List<ResultPartitionID> releasablePartitionIds = new ArrayList<>();
// Remove the cache of ShuffleDescriptors when ConsumedPartitionGroups are released
for (ConsumedPartitionGroup releasablePartitionGroup : releasablePartitionGroups) {
IntermediateResult totalResult =
checkNotNull(
intermediateResults.get(
releasablePartitionGroup.getIntermediateDataSetID()));
for (IntermediateResultPartitionID partitionId : releasablePartitionGroup) {
IntermediateResultPartition partition =
totalResult.getPartitionById(partitionId);
partition.markPartitionGroupReleasable(releasablePartitionGroup);
if (partition.canBeReleased()) {
releasablePartitionIds.add(createResultPartitionId(partitionId));
}
}
totalResult.clearCachedInformationForPartitionGroup(releasablePartitionGroup);
}
partitionTracker.stopTrackingAndReleasePartitions(releasablePartitionIds);
}
}
@VisibleForTesting
public ResultPartitionID createResultPartitionId(
final IntermediateResultPartitionID resultPartitionId) {
final SchedulingResultPartition schedulingResultPartition =
getSchedulingTopology().getResultPartition(resultPartitionId);
final SchedulingExecutionVertex producer = schedulingResultPartition.getProducer();
final ExecutionVertexID producerId = producer.getId();
final JobVertexID jobVertexId = producerId.getJobVertexId();
final ExecutionJobVertex jobVertex = getJobVertex(jobVertexId);
checkNotNull(jobVertex, "Unknown job vertex %s", jobVertexId);
final ExecutionVertex[] taskVertices = jobVertex.getTaskVertices();
final int subtaskIndex = producerId.getSubtaskIndex();
checkState(
subtaskIndex < taskVertices.length,
"Invalid subtask index %d for job vertex %s",
subtaskIndex,
jobVertexId);
final ExecutionVertex taskVertex = taskVertices[subtaskIndex];
final Execution execution = taskVertex.getCurrentExecutionAttempt();
return new ResultPartitionID(resultPartitionId, execution.getAttemptId());
}
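// Sketch of the mapping above (hypothetical ids): a partition produced by subtask 2
// of some job vertex resolves against that subtask's *current* execution attempt,
// so the resulting ResultPartitionID changes after a failover.
//
//   ResultPartitionID rpId = createResultPartitionId(partitionId);
//   // rpId == new ResultPartitionID(partitionId,
//   //         taskVertices[2].getCurrentExecutionAttempt().getAttemptId())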
/**
* Deserializes accumulators from a task state update.
*
* <p>This method never throws an exception!
*
* @param state The task execution state from which to deserialize the accumulators.
* @return The deserialized accumulators, or null if there are no accumulators or an error
* occurred.
*/
private Map<String, Accumulator<?, ?>> deserializeAccumulators(
TaskExecutionStateTransition state) {
AccumulatorSnapshot serializedAccumulators = state.getAccumulators();
if (serializedAccumulators != null) {
try {
return serializedAccumulators.deserializeUserAccumulators(userClassLoader);
} catch (Throwable t) {
// we catch Throwable here to include all form of linking errors that may
// occur if user classes are missing in the classpath
LOG.error("Failed to deserialize final accumulator results.", t);
}
}
return null;
}
@Override
public Map<ExecutionAttemptID, Execution> getRegisteredExecutions() {
return Collections.unmodifiableMap(currentExecutions);
}
@Override
public void registerExecution(Execution exec) {
assertRunningInJobMasterMainThread();
Execution previous = currentExecutions.putIfAbsent(exec.getAttemptId(), exec);
if (previous != null) {
failGlobal(
new Exception(
"Trying to register execution "
+ exec
+ " for already used ID "
+ exec.getAttemptId()));
}
}
@Override
public void deregisterExecution(Execution exec) {
assertRunningInJobMasterMainThread();
Execution contained = currentExecutions.remove(exec.getAttemptId());
if (contained != null && contained != exec) {
failGlobal(
new Exception(
"De-registering execution "
+ exec
+ " failed. Found for same ID execution "
+ contained));
}
}
private void registerExecutionVerticesAndResultPartitionsFor(
ExecutionJobVertex executionJobVertex) {
for (ExecutionVertex executionVertex : executionJobVertex.getTaskVertices()) {
executionVerticesById.put(executionVertex.getID(), executionVertex);
resultPartitionsById.putAll(executionVertex.getProducedPartitions());
}
}
@Override
public void updateAccumulators(AccumulatorSnapshot accumulatorSnapshot) {
Map<String, Accumulator<?, ?>> userAccumulators;
try {
userAccumulators = accumulatorSnapshot.deserializeUserAccumulators(userClassLoader);
ExecutionAttemptID execID = accumulatorSnapshot.getExecutionAttemptID();
Execution execution = currentExecutions.get(execID);
if (execution != null) {
execution.setAccumulators(userAccumulators);
} else {
LOG.debug("Received accumulator result for unknown execution {}.", execID);
}
} catch (Exception e) {
LOG.error("Cannot update accumulators for job {}.", getJobID(), e);
}
}
// --------------------------------------------------------------------------------------------
// Listeners & Observers
// --------------------------------------------------------------------------------------------
@Override
public void registerJobStatusListener(JobStatusListener listener) {
if (listener != null) {
jobStatusListeners.add(listener);
}
}
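// A minimal listener sketch (hypothetical): listeners are invoked on the JobMaster
// main thread during state transitions, so they should return quickly and never block.
//
//   executionGraph.registerJobStatusListener(
//           (jobId, newJobStatus, timestamp) ->
//                   LOG.info("job {} switched to {} at {}", jobId, newJobStatus, timestamp));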
private void notifyJobStatusChange(
JobStatus oldState, JobStatus newState, @Nullable Throwable cause) {
if (jobStatusListeners.size() > 0) {
final long timestamp = System.currentTimeMillis();
for (JobStatusListener listener : jobStatusListeners) {
try {
listener.jobStatusChanges(getJobID(), newState, timestamp);
} catch (Throwable t) {
LOG.warn("Error while notifying JobStatusListener", t);
}
}
}
if (jobStatusChangedListeners.size() > 0) {
jobStatusChangedListeners.forEach(
listener ->
listener.onEvent(
new DefaultJobExecutionStatusEvent(
getJobID(), getJobName(), oldState, newState, cause)));
}
}
private void notifyJobStatusHooks(JobStatus newState, @Nullable Throwable cause) {
JobID jobID = jobInformation.getJobId();
for (JobStatusHook hook : jobStatusHooks) {
try {
switch (newState) {
case CREATED:
hook.onCreated(jobID);
break;
case CANCELED:
hook.onCanceled(jobID);
break;
case FAILED:
hook.onFailed(jobID, cause);
break;
case FINISHED:
hook.onFinished(jobID);
break;
}
} catch (Throwable e) {
throw new RuntimeException(
"Error while notifying JobStatusHook[" + hook.getClass() + "]", e);
}
}
}
@Override
public void notifyExecutionChange(
final Execution execution,
ExecutionState previousState,
final ExecutionState newExecutionState) {
executionStateUpdateListener.onStateUpdate(
execution.getAttemptId(), previousState, newExecutionState);
}
private void assertRunningInJobMasterMainThread() {
if (!(jobMasterMainThreadExecutor
instanceof ComponentMainThreadExecutor.DummyComponentMainThreadExecutor)) {
jobMasterMainThreadExecutor.assertRunningInMainThread();
}
}
@Override
public void notifySchedulerNgAboutInternalTaskFailure(
final ExecutionAttemptID attemptId,
final Throwable t,
final boolean cancelTask,
final boolean releasePartitions) {
checkState(internalTaskFailuresListener != null);
internalTaskFailuresListener.notifyTaskFailure(attemptId, t, cancelTask, releasePartitions);
}
@Override
public void deleteBlobs(List<PermanentBlobKey> blobKeys) {
CompletableFuture.runAsync(
() -> {
for (PermanentBlobKey blobKey : blobKeys) {
blobWriter.deletePermanent(getJobID(), blobKey);
}
},
ioExecutor);
}
@Override
public ShuffleMaster<?> getShuffleMaster() {
return shuffleMaster;
}
@Override
public JobMasterPartitionTracker getPartitionTracker() {
return partitionTracker;
}
@Override
public ResultPartitionAvailabilityChecker getResultPartitionAvailabilityChecker() {
return resultPartitionAvailabilityChecker;
}
@Override
public PartitionGroupReleaseStrategy getPartitionGroupReleaseStrategy() {
return partitionGroupReleaseStrategy;
}
@Override
public ExecutionDeploymentListener getExecutionDeploymentListener() {
return executionDeploymentListener;
}
@Override
public boolean isDynamic() {
return isDynamic;
}
@Override
public Optional<String> findVertexWithAttempt(ExecutionAttemptID attemptId) {
return Optional.ofNullable(currentExecutions.get(attemptId))
.map(Execution::getVertexWithAttempt);
}
@Override
public Optional<Execution> findExecution(ExecutionAttemptID attemptId) {
return Optional.ofNullable(currentExecutions.get(attemptId));
}
@Override
public ExecutionGraphID getExecutionGraphID() {
return executionGraphId;
}
@Override
public List<ShuffleDescriptor> getClusterPartitionShuffleDescriptors(
IntermediateDataSetID intermediateDataSetID) {
return partitionTracker.getClusterPartitionShuffleDescriptors(intermediateDataSetID);
}
@Override
public MarkPartitionFinishedStrategy getMarkPartitionFinishedStrategy() {
return markPartitionFinishedStrategy;
}
@Override
public JobVertexInputInfo getJobVertexInputInfo(
JobVertexID jobVertexId, IntermediateDataSetID resultId) {
return vertexInputInfoStore.get(jobVertexId, resultId);
}
@Override
public TaskDeploymentDescriptorFactory getTaskDeploymentDescriptorFactory() {
return taskDeploymentDescriptorFactory;
}
}