// org.apache.flink.runtime.executiongraph.Execution — Maven / Gradle / Ivy artifact listing header
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.runtime.executiongraph;
import org.apache.flink.annotation.VisibleForTesting;
import org.apache.flink.api.common.Archiveable;
import org.apache.flink.api.common.accumulators.Accumulator;
import org.apache.flink.api.common.time.Time;
import org.apache.flink.core.io.InputSplit;
import org.apache.flink.runtime.JobException;
import org.apache.flink.runtime.accumulators.StringifiedAccumulatorResult;
import org.apache.flink.runtime.checkpoint.CheckpointOptions;
import org.apache.flink.runtime.checkpoint.CheckpointType;
import org.apache.flink.runtime.checkpoint.CheckpointType.PostCheckpointAction;
import org.apache.flink.runtime.checkpoint.JobManagerTaskRestore;
import org.apache.flink.runtime.clusterframework.types.AllocationID;
import org.apache.flink.runtime.clusterframework.types.ResourceID;
import org.apache.flink.runtime.concurrent.ComponentMainThreadExecutor;
import org.apache.flink.runtime.concurrent.FutureUtils;
import org.apache.flink.runtime.deployment.ResultPartitionDeploymentDescriptor;
import org.apache.flink.runtime.deployment.TaskDeploymentDescriptor;
import org.apache.flink.runtime.deployment.TaskDeploymentDescriptorFactory;
import org.apache.flink.runtime.execution.ExecutionState;
import org.apache.flink.runtime.io.network.partition.JobMasterPartitionTracker;
import org.apache.flink.runtime.io.network.partition.ResultPartitionID;
import org.apache.flink.runtime.jobgraph.IntermediateDataSetID;
import org.apache.flink.runtime.jobgraph.IntermediateResultPartitionID;
import org.apache.flink.runtime.jobgraph.OperatorID;
import org.apache.flink.runtime.jobmanager.slots.TaskManagerGateway;
import org.apache.flink.runtime.jobmaster.LogicalSlot;
import org.apache.flink.runtime.messages.Acknowledge;
import org.apache.flink.runtime.operators.coordination.OperatorEvent;
import org.apache.flink.runtime.operators.coordination.TaskNotRunningException;
import org.apache.flink.runtime.scheduler.strategy.ConsumerVertexGroup;
import org.apache.flink.runtime.scheduler.strategy.ExecutionVertexID;
import org.apache.flink.runtime.shuffle.NettyShuffleMaster;
import org.apache.flink.runtime.shuffle.PartitionDescriptor;
import org.apache.flink.runtime.shuffle.ProducerDescriptor;
import org.apache.flink.runtime.shuffle.ShuffleDescriptor;
import org.apache.flink.runtime.shuffle.ShuffleMaster;
import org.apache.flink.runtime.taskexecutor.TaskExecutorOperatorEventGateway;
import org.apache.flink.runtime.taskmanager.TaskManagerLocation;
import org.apache.flink.util.FlinkException;
import org.apache.flink.util.OptionalFailure;
import org.apache.flink.util.Preconditions;
import org.apache.flink.util.SerializedValue;
import org.slf4j.Logger;
import javax.annotation.Nullable;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.Executor;
import java.util.concurrent.TimeoutException;
import java.util.function.Function;
import java.util.stream.Collectors;
import static org.apache.flink.runtime.deployment.TaskDeploymentDescriptorFactory.getConsumedPartitionShuffleDescriptor;
import static org.apache.flink.runtime.execution.ExecutionState.CANCELED;
import static org.apache.flink.runtime.execution.ExecutionState.CANCELING;
import static org.apache.flink.runtime.execution.ExecutionState.CREATED;
import static org.apache.flink.runtime.execution.ExecutionState.DEPLOYING;
import static org.apache.flink.runtime.execution.ExecutionState.FAILED;
import static org.apache.flink.runtime.execution.ExecutionState.FINISHED;
import static org.apache.flink.runtime.execution.ExecutionState.INITIALIZING;
import static org.apache.flink.runtime.execution.ExecutionState.RUNNING;
import static org.apache.flink.runtime.execution.ExecutionState.SCHEDULED;
import static org.apache.flink.util.Preconditions.checkNotNull;
import static org.apache.flink.util.Preconditions.checkState;
/**
* A single execution of a vertex. While an {@link ExecutionVertex} can be executed multiple times
* (for recovery, re-computation, re-configuration), this class tracks the state of a single
* execution of that vertex and the resources.
*
* Lock free state transitions
*
* In several points of the code, we need to deal with possible concurrent state changes and
* actions. For example, while the call to deploy a task (send it to the TaskManager) happens, the
* task gets cancelled.
*
*
 * <p>We could lock the entire portion of the code (decision to deploy, deploy, set state to
* running) such that it is guaranteed that any "cancel command" will only pick up after deployment
* is done and that the "cancel command" call will never overtake the deploying call.
*
*
 * <p>This blocks the threads big time, because the remote calls may take long. Depending on their
* locking behavior, it may even result in distributed deadlocks (unless carefully avoided). We
* therefore use atomic state updates and occasional double-checking to ensure that the state after
* a completed call is as expected, and trigger correcting actions if it is not. Many actions are
* also idempotent (like canceling).
*/
public class Execution
implements AccessExecution, Archiveable, LogicalSlot.Payload {
    private static final Logger LOG = DefaultExecutionGraph.LOG;

    /** Number of attempts for the cancel RPC call before giving up (see {@link #cancel()}). */
    private static final int NUM_CANCEL_CALL_TRIES = 3;

    // --------------------------------------------------------------------------------------------

    /** The executor which is used to execute futures. */
    private final Executor executor;

    /** The execution vertex whose task this execution executes. */
    private final ExecutionVertex vertex;

    /** The unique ID marking the specific execution instant of the task. */
    private final ExecutionAttemptID attemptId;

    /**
     * The timestamps when state transitions occurred, indexed by {@link ExecutionState#ordinal()}.
     */
    private final long[] stateTimestamps;

    /** Which attempt (re-execution) of the vertex this execution represents. */
    private final int attemptNumber;

    /** Timeout for RPC calls such as task submission (see {@link #deploy()}). */
    private final Time rpcTimeout;

    // NOTE(review): consumers of this collection are not visible in this excerpt;
    // presumably caches PartitionInfo updates — confirm against the rest of the class.
    private final Collection partitionInfos;

    /** A future that completes once the Execution reaches a terminal ExecutionState. */
    private final CompletableFuture terminalStateFuture;

    /** Completed once the execution is terminal AND the assigned resource has been released. */
    private final CompletableFuture> releaseFuture;

    /** Completed with the TaskManager location once a slot is assigned in tryAssignResource. */
    private final CompletableFuture taskManagerLocationFuture;

    /**
     * Gets completed successfully when the task switched to {@link ExecutionState#INITIALIZING} or
     * {@link ExecutionState#RUNNING}. If the task never switches to those state, but fails
     * immediately, then this future never completes.
     */
    private final CompletableFuture> initializingOrRunningFuture;

    /** Current state; volatile because it is read with double-checking outside locks (see class docs). */
    private volatile ExecutionState state = CREATED;

    /** The slot this execution runs in; null until assigned via {@link #tryAssignResource}. */
    private LogicalSlot assignedResource;

    private Optional failureCause =
            Optional.empty(); // once an ErrorInfo is set, never changes

    /**
     * Information to restore the task on recovery, such as checkpoint id and task state snapshot.
     */
    @Nullable private JobManagerTaskRestore taskRestore;

    /** This field holds the allocation id once it was assigned successfully. */
    @Nullable private AllocationID assignedAllocationID;

    // ------------------------ Accumulators & Metrics ------------------------

    /**
     * Lock for updating the accumulators atomically. Prevents final accumulators to be overwritten
     * by partial accumulators on a late heartbeat.
     */
    private final Object accumulatorLock = new Object();

    /* Continuously updated map of user-defined accumulators */
    private Map> userAccumulators;

    private IOMetrics ioMetrics;

    /** Deployment descriptors of produced partitions, keyed by partition id; empty until registered. */
    private Map
            producedPartitions;
// --------------------------------------------------------------------------------------------
    /**
     * Creates a new Execution attempt.
     *
     * @param executor The executor used to dispatch callbacks from futures and asynchronous RPC
     *     calls.
     * @param vertex The execution vertex to which this Execution belongs
     * @param attemptNumber The execution attempt number.
     * @param startTimestamp The timestamp that marks the creation of this Execution
     * @param rpcTimeout The rpcTimeout for RPC calls like deploy/cancel/stop.
     */
    public Execution(
            Executor executor,
            ExecutionVertex vertex,
            int attemptNumber,
            long startTimestamp,
            Time rpcTimeout) {
        this.executor = checkNotNull(executor);
        this.vertex = checkNotNull(vertex);
        this.attemptId = new ExecutionAttemptID();
        this.rpcTimeout = checkNotNull(rpcTimeout);
        this.attemptNumber = attemptNumber;

        this.stateTimestamps = new long[ExecutionState.values().length];
        // record the externally supplied creation time as the CREATED transition timestamp
        markTimestamp(CREATED, startTimestamp);

        this.partitionInfos = new ArrayList<>(16);
        this.producedPartitions = Collections.emptyMap();
        this.terminalStateFuture = new CompletableFuture<>();
        this.releaseFuture = new CompletableFuture<>();
        this.taskManagerLocationFuture = new CompletableFuture<>();
        this.initializingOrRunningFuture = new CompletableFuture<>();
        this.assignedResource = null;
    }
    // --------------------------------------------------------------------------------------------
    // Properties
    // --------------------------------------------------------------------------------------------

    /** Returns the execution vertex this attempt belongs to. */
    public ExecutionVertex getVertex() {
        return vertex;
    }

    /** Returns the unique id of this execution attempt. */
    @Override
    public ExecutionAttemptID getAttemptId() {
        return attemptId;
    }

    /** Returns this attempt's number. */
    @Override
    public int getAttemptNumber() {
        return attemptNumber;
    }

    /** Returns the current execution state (volatile read; may change concurrently). */
    @Override
    public ExecutionState getState() {
        return state;
    }

    /** Returns the allocation id of the assigned slot, or null if no slot was assigned yet. */
    @Nullable
    public AllocationID getAssignedAllocationID() {
        return assignedAllocationID;
    }

    /** Returns the future completed with the TaskManager location once a slot is assigned. */
    public CompletableFuture getTaskManagerLocationFuture() {
        return taskManagerLocationFuture;
    }

    /** Returns the currently assigned slot, or null if none is assigned. */
    public LogicalSlot getAssignedResource() {
        return assignedResource;
    }

    /** Returns the deployment descriptor of the given produced partition, if registered. */
    public Optional getResultPartitionDeploymentDescriptor(
            IntermediateResultPartitionID id) {
        return Optional.ofNullable(producedPartitions.get(id));
    }
/**
* Tries to assign the given slot to the execution. The assignment works only if the Execution
* is in state SCHEDULED. Returns true, if the resource could be assigned.
*
* @param logicalSlot to assign to this execution
* @return true if the slot could be assigned to the execution, otherwise false
*/
public boolean tryAssignResource(final LogicalSlot logicalSlot) {
assertRunningInJobMasterMainThread();
checkNotNull(logicalSlot);
// only allow to set the assigned resource in state SCHEDULED or CREATED
// note: we also accept resource assignment when being in state CREATED for testing purposes
if (state == SCHEDULED || state == CREATED) {
if (assignedResource == null) {
assignedResource = logicalSlot;
if (logicalSlot.tryAssignPayload(this)) {
// check for concurrent modification (e.g. cancelling call)
if ((state == SCHEDULED || state == CREATED)
&& !taskManagerLocationFuture.isDone()) {
taskManagerLocationFuture.complete(logicalSlot.getTaskManagerLocation());
assignedAllocationID = logicalSlot.getAllocationId();
return true;
} else {
// free assigned resource and return false
assignedResource = null;
return false;
}
} else {
assignedResource = null;
return false;
}
} else {
// the slot already has another slot assigned
return false;
}
} else {
// do not allow resource assignment if we are not in state SCHEDULED
return false;
}
}
    /** Hands out the vertex's next input split, preferring locality on the assigned TM host. */
    public InputSplit getNextInputSplit() {
        final LogicalSlot slot = this.getAssignedResource();
        // host name is used for locality-aware split assignment; null if no slot assigned yet
        final String host = slot != null ? slot.getTaskManagerLocation().getHostname() : null;
        return this.vertex.getNextInputSplit(host);
    }

    @Override
    public TaskManagerLocation getAssignedResourceLocation() {
        // returns non-null only when a location is already assigned
        final LogicalSlot currentAssignedResource = assignedResource;
        return currentAssignedResource != null
                ? currentAssignedResource.getTaskManagerLocation()
                : null;
    }
    /** Returns the failure info, or an empty Optional if this execution has not failed. */
    @Override
    public Optional getFailureInfo() {
        return failureCause;
    }

    /** Returns the state-transition timestamps, indexed by {@link ExecutionState#ordinal()}. */
    @Override
    public long[] getStateTimestamps() {
        return stateTimestamps;
    }

    /** Returns the timestamp of the given state's transition (array default 0 if never reached). */
    @Override
    public long getStateTimestamp(ExecutionState state) {
        return this.stateTimestamps[state.ordinal()];
    }

    /** Returns whether this execution has reached a terminal state. */
    public boolean isFinished() {
        return state.isTerminal();
    }

    /** Returns the restore information set via {@link #setInitialState}, or null. */
    @Nullable
    public JobManagerTaskRestore getTaskRestore() {
        return taskRestore;
    }
    /**
     * Sets the initial state for the execution. The serialized state is then shipped via the {@link
     * TaskDeploymentDescriptor} to the TaskManagers.
     *
     * @param taskRestore information to restore the state
     */
    public void setInitialState(@Nullable JobManagerTaskRestore taskRestore) {
        this.taskRestore = taskRestore;
    }

    /**
     * Gets a future that completes once the task execution reaches one of the states {@link
     * ExecutionState#INITIALIZING} or {@link ExecutionState#RUNNING}. If this task never reaches
     * these states (for example because the task is cancelled before it was properly deployed and
     * restored), then this future will never complete.
     *
     * <p>The future is completed already in the {@link ExecutionState#INITIALIZING} state, because
     * various running actions are already possible in that state (the task already accepts and
     * sends events and network data for task recovery). (Note that in earlier versions, the
     * INITIALIZING state was not separate but part of the RUNNING state).
     *
     * <p>This future is always completed from the job master's main thread.
     */
    public CompletableFuture> getInitializingOrRunningFuture() {
        return initializingOrRunningFuture;
    }

    /**
     * Gets a future that completes once the task execution reaches a terminal state. The future
     * will be completed with specific state that the execution reached. This future is always
     * completed from the job master's main thread.
     *
     * @return A future which is completed once the execution reaches a terminal state
     */
    @Override
    public CompletableFuture getTerminalStateFuture() {
        return terminalStateFuture;
    }

    /**
     * Gets the release future which is completed once the execution reaches a terminal state and
     * the assigned resource has been released. This future is always completed from the job
     * master's main thread.
     *
     * @return A future which is completed once the assigned resource has been released
     */
    public CompletableFuture> getReleaseFuture() {
        return releaseFuture;
    }
    // --------------------------------------------------------------------------------------------
    // Actions
    // --------------------------------------------------------------------------------------------

    /**
     * Registers this execution's produced partitions with the shuffle master, caches the resulting
     * deployment descriptors and starts tracking the partitions. The returned future completes on
     * the job master main thread with this Execution.
     *
     * @param location the TaskManager location the partitions are produced on
     * @param notifyPartitionDataAvailable whether consumers should be notified of available data
     */
    public CompletableFuture registerProducedPartitions(
            TaskManagerLocation location, boolean notifyPartitionDataAvailable) {

        assertRunningInJobMasterMainThread();

        return FutureUtils.thenApplyAsyncIfNotDone(
                registerProducedPartitions(
                        vertex, location, attemptId, notifyPartitionDataAvailable),
                vertex.getExecutionGraphAccessor().getJobMasterMainThreadExecutor(),
                producedPartitionsCache -> {
                    // cache descriptors for later use in deploy() and track for cleanup
                    producedPartitions = producedPartitionsCache;
                    startTrackingPartitions(
                            location.getResourceID(), producedPartitionsCache.values());
                    return this;
                });
    }
    /**
     * Register producedPartitions to {@link ShuffleMaster}
     *
     * <p>HACK: Please notice that this method simulates asynchronous registration in a synchronous
     * way by making sure the returned {@link CompletableFuture} from {@link
     * ShuffleMaster#registerPartitionWithProducer} is completed immediately.
     *
     * <p>{@link Execution#producedPartitions} are registered through an asynchronous interface
     * {@link ShuffleMaster#registerPartitionWithProducer} to {@link ShuffleMaster}, however they
     * are not always accessed through callbacks. So, it is possible that {@link
     * Execution#producedPartitions} have not been available yet when accessed (in {@link
     * Execution#deploy} for example).
     *
     * <p>Since the only implementation of {@link ShuffleMaster} is {@link NettyShuffleMaster},
     * which indeed registers producedPartition in a synchronous way, this method enforces
     * synchronous registration under an asynchronous interface for now.
     *
     * <p>TODO: If asynchronous registration is needed in the future, use callbacks to access {@link
     * Execution#producedPartitions}.
     *
     * @return completed future of partition deployment descriptors.
     */
    @VisibleForTesting
    static CompletableFuture<
                    Map>
            registerProducedPartitions(
                    ExecutionVertex vertex,
                    TaskManagerLocation location,
                    ExecutionAttemptID attemptId,
                    boolean notifyPartitionDataAvailable) {

        ProducerDescriptor producerDescriptor = ProducerDescriptor.create(location, attemptId);

        Collection partitions =
                vertex.getProducedPartitions().values();
        Collection> partitionRegistrations =
                new ArrayList<>(partitions.size());

        for (IntermediateResultPartition partition : partitions) {
            PartitionDescriptor partitionDescriptor = PartitionDescriptor.from(partition);
            // the consumer's max parallelism is needed for the deployment descriptor
            int maxParallelism =
                    getPartitionMaxParallelism(
                            partition,
                            vertex.getExecutionGraphAccessor()::getExecutionVertexOrThrow);
            CompletableFuture extends ShuffleDescriptor> shuffleDescriptorFuture =
                    vertex.getExecutionGraphAccessor()
                            .getShuffleMaster()
                            .registerPartitionWithProducer(partitionDescriptor, producerDescriptor);

            // temporary hack; the scheduler does not handle incomplete futures properly
            Preconditions.checkState(
                    shuffleDescriptorFuture.isDone(), "ShuffleDescriptor future is incomplete.");

            CompletableFuture partitionRegistration =
                    shuffleDescriptorFuture.thenApply(
                            shuffleDescriptor ->
                                    new ResultPartitionDeploymentDescriptor(
                                            partitionDescriptor,
                                            shuffleDescriptor,
                                            maxParallelism,
                                            notifyPartitionDataAvailable));
            partitionRegistrations.add(partitionRegistration);
        }

        return FutureUtils.combineAll(partitionRegistrations)
                .thenApply(
                        rpdds -> {
                            // LinkedHashMap preserves the registration order of the partitions
                            Map
                                    producedPartitions = new LinkedHashMap<>(partitions.size());
                            rpdds.forEach(
                                    rpdd -> producedPartitions.put(rpdd.getPartitionId(), rpdd));
                            return producedPartitions;
                        });
    }
    /**
     * Returns the max parallelism of the job vertex consuming the given partition.
     *
     * <p>Exactly one consumer vertex group per partition is required (checked below).
     */
    private static int getPartitionMaxParallelism(
            IntermediateResultPartition partition,
            Function getVertexById) {
        final List consumerVertexGroups = partition.getConsumerVertexGroups();
        Preconditions.checkArgument(
                consumerVertexGroups.size() == 1,
                "Currently there has to be exactly one consumer in real jobs");
        final ConsumerVertexGroup consumerVertexGroup = consumerVertexGroups.get(0);
        // NOTE(review): only the group's first vertex is consulted — presumably all members of a
        // group share the same job vertex; confirm against ConsumerVertexGroup's contract.
        return getVertexById
                .apply(consumerVertexGroup.getFirst())
                .getJobVertex()
                .getMaxParallelism();
    }
    /**
     * Deploys the execution to the previously assigned resource.
     *
     * <p>Must run in the job master main thread. Transitions CREATED/SCHEDULED -> DEPLOYING,
     * builds the task deployment descriptor and submits it to the TaskManager asynchronously.
     * Submission failures (including RPC timeouts) are funneled into {@link
     * #markFailed(Throwable)}.
     *
     * @throws JobException if the execution cannot be deployed to the assigned resource
     */
    public void deploy() throws JobException {
        assertRunningInJobMasterMainThread();

        final LogicalSlot slot = assignedResource;

        checkNotNull(
                slot,
                "In order to deploy the execution we first have to assign a resource via tryAssignResource.");

        // Check if the TaskManager died in the meantime
        // This only speeds up the response to TaskManagers failing concurrently to deployments.
        // The more general check is the rpcTimeout of the deployment call
        if (!slot.isAlive()) {
            throw new JobException("Target slot (TaskManager) for deployment is no longer alive.");
        }

        // make sure exactly one deployment call happens from the correct state
        // note: the transition from CREATED to DEPLOYING is for testing purposes only
        ExecutionState previous = this.state;
        if (previous == SCHEDULED || previous == CREATED) {
            if (!transitionState(previous, DEPLOYING)) {
                // race condition, someone else beat us to the deploying call.
                // this should actually not happen and indicates a race somewhere else
                throw new IllegalStateException(
                        "Cannot deploy task: Concurrent deployment call race.");
            }
        } else {
            // vertex may have been cancelled, or it was already scheduled
            throw new IllegalStateException(
                    "The vertex must be in CREATED or SCHEDULED state to be deployed. Found state "
                            + previous);
        }

        if (this != slot.getPayload()) {
            throw new IllegalStateException(
                    String.format(
                            "The execution %s has not been assigned to the assigned slot.", this));
        }

        try {

            // race double check, did we fail/cancel and do we need to release the slot?
            if (this.state != DEPLOYING) {
                slot.releaseSlot(
                        new FlinkException(
                                "Actual state of execution "
                                        + this
                                        + " ("
                                        + state
                                        + ") does not match expected state DEPLOYING."));
                return;
            }

            LOG.info(
                    "Deploying {} (attempt #{}) with attempt id {} to {} with allocation id {}",
                    vertex.getTaskNameWithSubtaskIndex(),
                    attemptNumber,
                    vertex.getCurrentExecutionAttempt().getAttemptId(),
                    getAssignedResourceLocation(),
                    slot.getAllocationId());

            final TaskDeploymentDescriptor deployment =
                    TaskDeploymentDescriptorFactory.fromExecutionVertex(vertex, attemptNumber)
                            .createDeploymentDescriptor(
                                    slot.getAllocationId(),
                                    taskRestore,
                                    producedPartitions.values());

            // null taskRestore to let it be GC'ed
            taskRestore = null;

            final TaskManagerGateway taskManagerGateway = slot.getTaskManagerGateway();

            final ComponentMainThreadExecutor jobMasterMainThreadExecutor =
                    vertex.getExecutionGraphAccessor().getJobMasterMainThreadExecutor();

            getVertex().notifyPendingDeployment(this);
            // We run the submission in the future executor so that the serialization of large TDDs
            // does not block
            // the main thread and sync back to the main thread once submission is completed.
            CompletableFuture.supplyAsync(
                            () -> taskManagerGateway.submitTask(deployment, rpcTimeout), executor)
                    .thenCompose(Function.identity())
                    .whenCompleteAsync(
                            (ack, failure) -> {
                                if (failure == null) {
                                    vertex.notifyCompletedDeployment(this);
                                } else {
                                    // a timeout means the TM did not respond; everything else is
                                    // forwarded as-is to the failure handling
                                    if (failure instanceof TimeoutException) {
                                        String taskname =
                                                vertex.getTaskNameWithSubtaskIndex()
                                                        + " ("
                                                        + attemptId
                                                        + ')';

                                        markFailed(
                                                new Exception(
                                                        "Cannot deploy task "
                                                                + taskname
                                                                + " - TaskManager ("
                                                                + getAssignedResourceLocation()
                                                                + ") not responding after a rpcTimeout of "
                                                                + rpcTimeout,
                                                        failure));
                                    } else {
                                        markFailed(failure);
                                    }
                                }
                            },
                            jobMasterMainThreadExecutor);
        } catch (Throwable t) {
            markFailed(t);
        }
    }
    /**
     * Cancels this execution. Depending on the current state this either switches atomically to
     * CANCELED (nothing deployed yet), or to CANCELING with a cancel RPC to the TaskManager.
     * Idempotent: calling it on an already canceling/canceled execution is a no-op.
     */
    public void cancel() {
        // depending on the previous state, we go directly to cancelled (no cancel call necessary)
        // -- or to canceling (cancel call needs to be sent to the task manager)

        // because of several possibly previous states, we need to again loop until we make a
        // successful atomic state transition
        assertRunningInJobMasterMainThread();
        while (true) {

            ExecutionState current = this.state;

            if (current == CANCELING || current == CANCELED) {
                // already taken care of, no need to cancel again
                return;
            }

            // these two are the common cases where we need to send a cancel call
            else if (current == INITIALIZING || current == RUNNING || current == DEPLOYING) {
                // try to transition to canceling, if successful, send the cancel call
                if (startCancelling(NUM_CANCEL_CALL_TRIES)) {
                    return;
                }
                // else: fall through the loop
            } else if (current == FINISHED) {
                // finished before it could be cancelled.
                // in any case, the task is removed from the TaskManager already

                // a pipelined partition whose consumer has never been deployed could still be
                // buffered on the TM
                // release it here since pipelined partitions for FINISHED executions aren't handled
                // elsewhere
                // covers the following cases:
                // a) restarts of this vertex
                // b) a global failure (which may result in a FAILED job state)
                sendReleaseIntermediateResultPartitionsRpcCall();

                return;
            } else if (current == FAILED) {
                // failed before it could be cancelled.
                // in any case, the task is removed from the TaskManager already

                return;
            } else if (current == CREATED || current == SCHEDULED) {
                // from here, we can directly switch to cancelled, because no task has been deployed
                if (cancelAtomically()) {
                    return;
                }
                // else: fall through the loop
            } else {
                throw new IllegalStateException(current.name());
            }
        }
    }
    /**
     * Suspends the execution: non-terminal states are switched atomically to CANCELED, an
     * in-flight cancellation is completed, and FINISHED executions release any leftover pipelined
     * partitions on the TaskManager.
     *
     * @return the release future, completed once the assigned resource has been released
     */
    public CompletableFuture> suspend() {
        switch (state) {
            case RUNNING:
            case INITIALIZING:
            case DEPLOYING:
            case CREATED:
            case SCHEDULED:
                if (!cancelAtomically()) {
                    throw new IllegalStateException(
                            String.format(
                                    "Could not directly go to %s from %s.",
                                    CANCELED.name(), state.name()));
                }
                break;
            case CANCELING:
                completeCancelling();
                break;
            case FINISHED:
                // a pipelined partition whose consumer has never been deployed could still be
                // buffered on the TM
                // release it here since pipelined partitions for FINISHED executions aren't handled
                // elsewhere
                // most notably, the TaskExecutor does not release pipelined partitions when
                // disconnecting from the JM
                sendReleaseIntermediateResultPartitionsRpcCall();
                break;
            case FAILED:
            case CANCELED:
                break;
            default:
                throw new IllegalStateException(state.name());
        }

        return releaseFuture;
    }
    /**
     * Propagates the given partition's info to its consumers: INITIALIZING/RUNNING consumers get
     * an update RPC immediately, DEPLOYING consumers get the info cached on their vertex for
     * delivery after deployment. Fails the execution if more than one consumer group exists.
     */
    private void updatePartitionConsumers(final IntermediateResultPartition partition) {
        final List consumerVertexGroups = partition.getConsumerVertexGroups();
        if (consumerVertexGroups.size() == 0) {
            return;
        }
        if (consumerVertexGroups.size() > 1) {
            fail(
                    new IllegalStateException(
                            "Currently, only a single consumer group per partition is supported."));
            return;
        }

        for (ExecutionVertexID consumerVertexId : consumerVertexGroups.get(0)) {
            final ExecutionVertex consumerVertex =
                    vertex.getExecutionGraphAccessor().getExecutionVertexOrThrow(consumerVertexId);
            final Execution consumer = consumerVertex.getCurrentExecutionAttempt();
            final ExecutionState consumerState = consumer.getState();

            // ----------------------------------------------------------------
            // Consumer is recovering or running => send update message now
            // Consumer is deploying => cache the partition info which would be
            // sent after switching to running
            // ----------------------------------------------------------------
            if (consumerState == DEPLOYING
                    || consumerState == RUNNING
                    || consumerState == INITIALIZING) {
                final PartitionInfo partitionInfo = createPartitionInfo(partition);

                if (consumerState == DEPLOYING) {
                    consumerVertex.cachePartitionInfo(partitionInfo);
                } else {
                    consumer.sendUpdatePartitionInfoRpcCall(Collections.singleton(partitionInfo));
                }
            }
        }
    }
private static PartitionInfo createPartitionInfo(
IntermediateResultPartition consumedPartition) {
IntermediateDataSetID intermediateDataSetID =
consumedPartition.getIntermediateResult().getId();
ShuffleDescriptor shuffleDescriptor =
getConsumedPartitionShuffleDescriptor(
consumedPartition,
TaskDeploymentDescriptorFactory.PartitionLocationConstraint.MUST_BE_KNOWN);
return new PartitionInfo(intermediateDataSetID, shuffleDescriptor);
}
    /**
     * This method fails the vertex due to an external condition. The task will move to state
     * FAILED. If the task was in state RUNNING or DEPLOYING before, it will send a cancel call to
     * the TaskManager.
     *
     * @param t The exception that caused the task to fail.
     */
    @Override
    public void fail(Throwable t) {
        // cancelTask = true: an active task must also be torn down on the TaskManager
        processFail(t, true);
    }
/**
* Notify the task of this execution about a completed checkpoint.
*
* @param checkpointId of the completed checkpoint
* @param timestamp of the completed checkpoint
*/
public void notifyCheckpointComplete(long checkpointId, long timestamp) {
final LogicalSlot slot = assignedResource;
if (slot != null) {
final TaskManagerGateway taskManagerGateway = slot.getTaskManagerGateway();
taskManagerGateway.notifyCheckpointComplete(
attemptId, getVertex().getJobId(), checkpointId, timestamp);
} else {
LOG.debug(
"The execution has no slot assigned. This indicates that the execution is "
+ "no longer running.");
}
}
/**
* Notify the task of this execution about a aborted checkpoint.
*
* @param abortCheckpointId of the subsumed checkpoint
* @param timestamp of the subsumed checkpoint
*/
public void notifyCheckpointAborted(long abortCheckpointId, long timestamp) {
final LogicalSlot slot = assignedResource;
if (slot != null) {
final TaskManagerGateway taskManagerGateway = slot.getTaskManagerGateway();
taskManagerGateway.notifyCheckpointAborted(
attemptId, getVertex().getJobId(), abortCheckpointId, timestamp);
} else {
LOG.debug(
"The execution has no slot assigned. This indicates that the execution is "
+ "no longer running.");
}
}
    /**
     * Trigger a new checkpoint on the task of this execution.
     *
     * @param checkpointId of the checkpoint to trigger
     * @param timestamp of the checkpoint to trigger
     * @param checkpointOptions of the checkpoint to trigger
     */
    public void triggerCheckpoint(
            long checkpointId, long timestamp, CheckpointOptions checkpointOptions) {
        triggerCheckpointHelper(checkpointId, timestamp, checkpointOptions);
    }
    /**
     * Trigger a synchronous savepoint on the task of this execution.
     *
     * @param checkpointId of the checkpoint to trigger
     * @param timestamp of the checkpoint to trigger
     * @param checkpointOptions of the checkpoint to trigger
     */
    public void triggerSynchronousSavepoint(
            long checkpointId, long timestamp, CheckpointOptions checkpointOptions) {
        triggerCheckpointHelper(checkpointId, timestamp, checkpointOptions);
    }
    /**
     * Common implementation for triggering a checkpoint/savepoint on the task via the TaskManager
     * gateway. Validates that a TERMINATE post-checkpoint action only occurs on synchronous
     * savepoints; drops the trigger with a debug log if no slot is assigned.
     */
    private void triggerCheckpointHelper(
            long checkpointId, long timestamp, CheckpointOptions checkpointOptions) {

        final CheckpointType checkpointType = checkpointOptions.getCheckpointType();
        if (checkpointType.getPostCheckpointAction() == PostCheckpointAction.TERMINATE
                && !(checkpointType.isSynchronous() && checkpointType.isSavepoint())) {
            throw new IllegalArgumentException(
                    "Only synchronous savepoints are allowed to advance the watermark to MAX.");
        }

        final LogicalSlot slot = assignedResource;

        if (slot != null) {
            final TaskManagerGateway taskManagerGateway = slot.getTaskManagerGateway();

            taskManagerGateway.triggerCheckpoint(
                    attemptId, getVertex().getJobId(), checkpointId, timestamp, checkpointOptions);
        } else {
            LOG.debug(
                    "The execution has no slot assigned. This indicates that the execution is no longer running.");
        }
    }
    /**
     * Sends the operator event to the Task on the Task Executor.
     *
     * @return the acknowledge future of the RPC if the task is INITIALIZING or RUNNING; otherwise
     *     a future completed exceptionally with a {@link TaskNotRunningException}
     */
    public CompletableFuture sendOperatorEvent(
            OperatorID operatorId, SerializedValue event) {
        assertRunningInJobMasterMainThread();
        final LogicalSlot slot = assignedResource;

        if (slot != null && (getState() == RUNNING || getState() == INITIALIZING)) {
            final TaskExecutorOperatorEventGateway eventGateway = slot.getTaskManagerGateway();
            return eventGateway.sendOperatorEventToTask(getAttemptId(), operatorId, event);
        } else {
            return FutureUtils.completedExceptionally(
                    new TaskNotRunningException(
                            '"'
                                    + vertex.getTaskNameWithSubtaskIndex()
                                    + "\" is not running, but in state "
                                    + getState()));
        }
    }
    // --------------------------------------------------------------------------------------------
    // Callbacks
    // --------------------------------------------------------------------------------------------

    /**
     * This method marks the task as failed, but will make no attempt to remove task execution from
     * the task manager. It is intended for cases where the task is known not to be running, or then
     * the TaskManager reports failure (in which case it has already removed the task).
     *
     * @param t The exception that caused the task to fail.
     */
    void markFailed(Throwable t) {
        processFail(t, false);
    }

    /**
     * Marks the task as failed with full bookkeeping information, forwarded to processFail.
     *
     * @param t the failure cause
     * @param cancelTask whether a cancel call should additionally be sent to the task manager
     * @param userAccumulators final user accumulators reported by the task, may be null
     * @param metrics final IO metrics reported by the task, may be null
     * @param releasePartitions whether produced partitions should be released
     * @param fromSchedulerNg per the flag name, presumably whether the call originates from the
     *     SchedulerNG — confirm against processFail (not visible in this excerpt)
     */
    void markFailed(
            Throwable t,
            boolean cancelTask,
            Map> userAccumulators,
            IOMetrics metrics,
            boolean releasePartitions,
            boolean fromSchedulerNg) {
        processFail(t, cancelTask, userAccumulators, metrics, releasePartitions, fromSchedulerNg);
    }
    /** Test-only convenience that marks the execution finished without accumulators or metrics. */
    @VisibleForTesting
    public void markFinished() {
        markFinished(null, null);
    }
    /**
     * Transitions this execution to FINISHED, records final accumulators/metrics, finishes
     * produced partitions, releases the slot and deregisters the execution. Legal races with
     * cancellation/failure are handled in the loop below.
     */
    void markFinished(Map> userAccumulators, IOMetrics metrics) {

        assertRunningInJobMasterMainThread();

        // this call usually comes during RUNNING, but may also come while still in deploying (very
        // fast tasks!)
        while (true) {
            ExecutionState current = this.state;

            if (current == INITIALIZING || current == RUNNING || current == DEPLOYING) {

                if (transitionState(current, FINISHED)) {
                    try {
                        finishPartitionsAndUpdateConsumers();
                        updateAccumulatorsAndMetrics(userAccumulators, metrics);
                        releaseAssignedResource(null);
                        vertex.getExecutionGraphAccessor().deregisterExecution(this);
                    } finally {
                        // notify the vertex even if the cleanup above threw
                        vertex.executionFinished(this);
                    }
                    return;
                }
            } else if (current == CANCELING) {
                // we sent a cancel call, and the task manager finished before it arrived. We
                // will never get a CANCELED call back from the job manager
                completeCancelling(userAccumulators, metrics, true);
                return;
            } else if (current == CANCELED || current == FAILED) {
                if (LOG.isDebugEnabled()) {
                    LOG.debug("Task FINISHED, but concurrently went to state " + state);
                }
                return;
            } else {
                // this should not happen, we need to fail this
                markFailed(
                        new Exception(
                                "Vertex received FINISHED message while being in state " + state));
                return;
            }
        }
    }
/**
 * Marks all blocking result partitions of this execution as finished and, for every result that
 * newly finished, notifies the consumers of all of that result's partitions.
 */
private void finishPartitionsAndUpdateConsumers() {
    final List<IntermediateResultPartition> newlyFinishedResults =
            getVertex().finishAllBlockingPartitions();
    if (newlyFinishedResults.isEmpty()) {
        return;
    }

    for (IntermediateResultPartition finishedPartition : newlyFinishedResults) {
        final IntermediateResultPartition[] allPartitionsOfNewlyFinishedResults =
                finishedPartition.getIntermediateResult().getPartitions();

        for (IntermediateResultPartition partition : allPartitionsOfNewlyFinishedResults) {
            updatePartitionConsumers(partition);
        }
    }
}
/**
 * Starts cancelling with zero cancel-RPC retries and, if the cancelling was actually started,
 * immediately completes it.
 *
 * @return true iff the cancelling was started (and completed)
 */
private boolean cancelAtomically() {
    final boolean cancellingStarted = startCancelling(0);
    if (cancellingStarted) {
        completeCancelling();
    }
    return cancellingStarted;
}
/**
 * Moves this execution into CANCELING and sends the cancel RPC to the TaskManager.
 *
 * @param numberCancelRetries how often the cancel RPC is retried on failure
 * @return true iff the transition to CANCELING succeeded
 */
private boolean startCancelling(int numberCancelRetries) {
    if (!transitionState(state, CANCELING)) {
        return false;
    }
    taskManagerLocationFuture.cancel(false);
    sendCancelRpcCall(numberCancelRetries);
    return true;
}
/** Completes the cancellation without reported accumulators/metrics, releasing all partitions. */
void completeCancelling() {
completeCancelling(null, null, true);
}
void completeCancelling(
Map> userAccumulators,
IOMetrics metrics,
boolean releasePartitions) {
// the taskmanagers can themselves cancel tasks without an external trigger, if they find
// that the
// network stack is canceled (for example by a failing / canceling receiver or sender
// this is an artifact of the old network runtime, but for now we need to support task
// transitions
// from running directly to canceled
while (true) {
ExecutionState current = this.state;
if (current == CANCELED) {
return;
} else if (current == CANCELING
|| current == RUNNING
|| current == INITIALIZING
|| current == DEPLOYING) {
updateAccumulatorsAndMetrics(userAccumulators, metrics);
if (transitionState(current, CANCELED)) {
finishCancellation(releasePartitions);
return;
}
// else fall through the loop
} else {
// failing in the meantime may happen and is no problem.
// anything else is a serious problem !!!
if (current != FAILED) {
String message =
String.format(
"Asynchronous race: Found %s in state %s after successful cancel call.",
vertex.getTaskNameWithSubtaskIndex(), state);
LOG.error(message);
vertex.getExecutionGraphAccessor().failGlobal(new Exception(message));
}
return;
}
}
}
/**
 * Finalizes a completed cancellation: releases the slot, deregisters this execution from the
 * graph and cleans up produced partitions.
 */
private void finishCancellation(boolean releasePartitions) {
releaseAssignedResource(new FlinkException("Execution " + this + " was cancelled."));
vertex.getExecutionGraphAccessor().deregisterExecution(this);
// the same flag controls both pipelined and blocking partition release on cancellation
handlePartitionCleanup(releasePartitions, releasePartitions);
}
/** Caches a partition info to be forwarded to the task later via sendPartitionInfos(). */
void cachePartitionInfo(PartitionInfo partitionInfo) {
partitionInfos.add(partitionInfo);
}
/** Forwards all cached partition infos to the task in a single RPC and clears the cache. */
private void sendPartitionInfos() {
    if (partitionInfos.isEmpty()) {
        return;
    }
    // snapshot the cache before clearing it, the RPC call may be asynchronous
    sendUpdatePartitionInfoRpcCall(new ArrayList<>(partitionInfos));
    partitionInfos.clear();
}
// --------------------------------------------------------------------------------------------
// Internal Actions
// --------------------------------------------------------------------------------------------
/** Fails this execution with no reported accumulators/metrics, releasing produced partitions. */
private void processFail(Throwable t, boolean cancelTask) {
processFail(t, cancelTask, null, null, true, false);
}
/**
* Process a execution failure. The failure can be fired by JobManager or reported by
* TaskManager. If it is fired by JobManager and the execution is already deployed, it needs to
* send a PRC call to remove the task from TaskManager. It also needs to release the produced
* partitions if it fails before deployed (because the partitions are possibly already created
* in external shuffle service) or JobManager proactively fails it (in case that it finishes in
* TaskManager when JobManager tries to fail it). The failure will be notified to SchedulerNG if
* it is from within the ExecutionGraph. This is to trigger the failure handling of SchedulerNG
* to recover this failed execution.
*
* @param t Failure cause
* @param cancelTask Indicating whether to send a PRC call to remove task from TaskManager. True
* if the failure is fired by JobManager and the execution is already deployed. Otherwise it
* should be false.
* @param userAccumulators User accumulators
* @param metrics IO metrics
* @param releasePartitions Indicating whether to release result partitions produced by this
* execution. False if the task is FAILED in TaskManager, otherwise true.
* @param fromSchedulerNg Indicating whether the failure is from the SchedulerNg. It should be
* false if it is from within the ExecutionGraph.
*/
private void processFail(
Throwable t,
boolean cancelTask,
Map> userAccumulators,
IOMetrics metrics,
boolean releasePartitions,
boolean fromSchedulerNg) {
assertRunningInJobMasterMainThread();
ExecutionState current = this.state;
if (current == FAILED) {
// already failed. It is enough to remember once that we failed (its sad enough)
return;
}
if (current == CANCELED || current == FINISHED) {
// we are already aborting or are already aborted or we are already finished
if (LOG.isDebugEnabled()) {
LOG.debug(
"Ignoring transition of vertex {} to {} while being {}.",
getVertexWithAttempt(),
FAILED,
current);
}
return;
}
if (current == CANCELING) {
completeCancelling(userAccumulators, metrics, true);
return;
}
if (!fromSchedulerNg) {
vertex.getExecutionGraphAccessor()
.notifySchedulerNgAboutInternalTaskFailure(
attemptId, t, cancelTask, releasePartitions);
return;
}
checkState(transitionState(current, FAILED, t));
// success (in a manner of speaking)
this.failureCause =
Optional.of(
ErrorInfo.createErrorInfoWithNullableCause(t, getStateTimestamp(FAILED)));
updateAccumulatorsAndMetrics(userAccumulators, metrics);
releaseAssignedResource(t);
vertex.getExecutionGraphAccessor().deregisterExecution(this);
maybeReleasePartitionsAndSendCancelRpcCall(current, cancelTask, releasePartitions);
}
/**
 * Cleans up produced partitions and, if the task may still be present on a TaskManager (it was
 * deploying, initializing or running before the failure), sends a cancel RPC to remove it.
 */
private void maybeReleasePartitionsAndSendCancelRpcCall(
        final ExecutionState stateBeforeFailed,
        final boolean cancelTask,
        final boolean releasePartitions) {

    handlePartitionCleanup(releasePartitions, releasePartitions);

    final boolean possiblyStillDeployed =
            stateBeforeFailed == RUNNING
                    || stateBeforeFailed == INITIALIZING
                    || stateBeforeFailed == DEPLOYING;

    if (!cancelTask || !possiblyStillDeployed) {
        return;
    }

    LOG.debug("Sending out cancel request, to remove task execution from TaskManager.");

    try {
        if (assignedResource != null) {
            sendCancelRpcCall(NUM_CANCEL_CALL_TRIES);
        }
    } catch (Throwable tt) {
        // no reason this should ever happen, but log it to be safe
        LOG.error(
                "Error triggering cancel call while marking task {} as failed.",
                getVertex().getTaskNameWithSubtaskIndex(),
                tt);
    }
}
/**
 * Tries the transition DEPLOYING -> INITIALIZING; on success, forwards any cached partition
 * infos to the task.
 *
 * @return true iff the transition succeeded
 */
boolean switchToRecovering() {
    if (!switchTo(DEPLOYING, INITIALIZING)) {
        return false;
    }
    sendPartitionInfos();
    return true;
}
/** Tries the transition INITIALIZING -> RUNNING; returns whether it succeeded. */
boolean switchToRunning() {
return switchTo(INITIALIZING, RUNNING);
}
/**
 * Attempts the transition {@code from -> to} and reacts if a concurrent state change (cancel,
 * finish, or failure) overtook the deployment answer: terminal benign states are ignored,
 * canceling/failed trigger a cancel RPC, and any other state is treated as unexpected (the
 * deployment is undone and the execution is marked failed).
 *
 * @param from the expected current state
 * @param to the target state
 * @return true iff the transition was performed
 */
private boolean switchTo(ExecutionState from, ExecutionState to) {
if (transitionState(from, to)) {
return true;
} else {
// something happened while the call was in progress.
// it can mean:
// - canceling, while deployment was in progress. state is now canceling, or canceled,
// if the response overtook
// - finishing (execution and finished call overtook the deployment answer, which is
// possible and happens for fast tasks)
// - failed (execution, failure, and failure message overtook the deployment answer)
ExecutionState currentState = this.state;
if (currentState == FINISHED || currentState == CANCELED) {
// do nothing, the task was really fast (nice)
// or it was canceled really fast
} else if (currentState == CANCELING || currentState == FAILED) {
if (LOG.isDebugEnabled()) {
// this log statement is guarded because the 'getVertexWithAttempt()' method
// performs string concatenations
LOG.debug(
"Concurrent canceling/failing of {} while deployment was in progress.",
getVertexWithAttempt());
}
sendCancelRpcCall(NUM_CANCEL_CALL_TRIES);
} else {
String message =
String.format(
"Concurrent unexpected state transition of task %s to %s while deployment was in progress.",
getVertexWithAttempt(), currentState);
LOG.debug(message);
// undo the deployment
sendCancelRpcCall(NUM_CANCEL_CALL_TRIES);
// record the failure
markFailed(new Exception(message));
}
return false;
}
}
/**
 * This method sends a CancelTask message to the instance of the assigned slot.
 *
 * <p>The sending is retried up to {@code numberRetries} times on failure.
 *
 * @param numberRetries number of times the cancel RPC is retried
 */
private void sendCancelRpcCall(int numberRetries) {
    final LogicalSlot slot = assignedResource;

    if (slot != null) {
        final TaskManagerGateway taskManagerGateway = slot.getTaskManagerGateway();
        final ComponentMainThreadExecutor jobMasterMainThreadExecutor =
                getVertex().getExecutionGraphAccessor().getJobMasterMainThreadExecutor();

        CompletableFuture<Acknowledge> cancelResultFuture =
                FutureUtils.retry(
                        () -> taskManagerGateway.cancelTask(attemptId, rpcTimeout),
                        numberRetries,
                        jobMasterMainThreadExecutor);

        cancelResultFuture.whenComplete(
                (ack, failure) -> {
                    if (failure != null) {
                        // the task could not be removed from the TaskManager; escalate
                        fail(new Exception("Task could not be canceled.", failure));
                    }
                });
    }
}
/**
 * Registers the given produced partitions with the partition tracker under the task executor
 * that hosts them.
 *
 * @param taskExecutorId the task executor hosting the partitions
 * @param partitions the deployment descriptors of the produced partitions
 */
private void startTrackingPartitions(
        final ResourceID taskExecutorId,
        final Collection<ResultPartitionDeploymentDescriptor> partitions) {
    JobMasterPartitionTracker partitionTracker =
            vertex.getExecutionGraphAccessor().getPartitionTracker();
    for (ResultPartitionDeploymentDescriptor partition : partitions) {
        partitionTracker.startTrackingPartition(taskExecutorId, partition);
    }
}
/**
 * Cleans up the partitions produced by this execution: optionally triggers the release RPC for
 * pipelined partitions, then stops tracking (and optionally also releases) tracked partitions.
 *
 * @param releasePipelinedPartitions whether pipelined partitions should be released
 * @param releaseBlockingPartitions whether tracked partitions should also be released, not just
 *     untracked
 */
void handlePartitionCleanup(
        boolean releasePipelinedPartitions, boolean releaseBlockingPartitions) {
    if (releasePipelinedPartitions) {
        sendReleaseIntermediateResultPartitionsRpcCall();
    }

    final Collection<ResultPartitionID> partitionIds = getPartitionIds();
    final JobMasterPartitionTracker partitionTracker =
            getVertex().getExecutionGraphAccessor().getPartitionTracker();

    if (!partitionIds.isEmpty()) {
        if (releaseBlockingPartitions) {
            LOG.info("Discarding the results produced by task execution {}.", attemptId);
            partitionTracker.stopTrackingAndReleasePartitions(partitionIds);
        } else {
            partitionTracker.stopTrackingPartitions(partitionIds);
        }
    }
}
/** Returns the result partition IDs of all partitions produced by this execution. */
private Collection<ResultPartitionID> getPartitionIds() {
    return producedPartitions.values().stream()
            .map(ResultPartitionDeploymentDescriptor::getShuffleDescriptor)
            .map(ShuffleDescriptor::getResultPartitionID)
            .collect(Collectors.toList());
}
private void sendReleaseIntermediateResultPartitionsRpcCall() {
LOG.info("Discarding the results produced by task execution {}.", attemptId);
final LogicalSlot slot = assignedResource;
if (slot != null) {
final TaskManagerGateway taskManagerGateway = slot.getTaskManagerGateway();
final ShuffleMaster> shuffleMaster =
getVertex().getExecutionGraphAccessor().getShuffleMaster();
Set partitionIds =
producedPartitions.values().stream()
.filter(
resultPartitionDeploymentDescriptor ->
resultPartitionDeploymentDescriptor
.getPartitionType()
.isPipelined())
.map(ResultPartitionDeploymentDescriptor::getShuffleDescriptor)
.peek(shuffleMaster::releasePartitionExternally)
.map(ShuffleDescriptor::getResultPartitionID)
.collect(Collectors.toSet());
if (!partitionIds.isEmpty()) {
// TODO For some tests this could be a problem when querying too early if all
// resources were released
taskManagerGateway.releasePartitions(getVertex().getJobId(), partitionIds);
}
}
}
/**
 * Update the partition infos on the assigned resource.
 *
 * @param partitionInfos for the remote task
 */
private void sendUpdatePartitionInfoRpcCall(final Iterable<PartitionInfo> partitionInfos) {
    final LogicalSlot slot = assignedResource;

    if (slot != null) {
        final TaskManagerGateway taskManagerGateway = slot.getTaskManagerGateway();
        final TaskManagerLocation taskManagerLocation = slot.getTaskManagerLocation();

        CompletableFuture<Acknowledge> updatePartitionsResultFuture =
                taskManagerGateway.updatePartitions(attemptId, partitionInfos, rpcTimeout);

        updatePartitionsResultFuture.whenCompleteAsync(
                (ack, failure) -> {
                    // fail if there was a failure
                    if (failure != null) {
                        fail(
                                new IllegalStateException(
                                        "Update to task ["
                                                + getVertexWithAttempt()
                                                + "] on TaskManager "
                                                + taskManagerLocation
                                                + " failed",
                                        failure));
                    }
                },
                // the failure handling must run in the job master main thread
                getVertex().getExecutionGraphAccessor().getJobMasterMainThreadExecutor());
    }
}
/**
 * Releases the assigned resource and completes the release future once the assigned resource
 * has been successfully released.
 *
 * @param cause for the resource release, null if none
 */
private void releaseAssignedResource(@Nullable Throwable cause) {
assertRunningInJobMasterMainThread();
final LogicalSlot slot = assignedResource;
if (slot != null) {
ComponentMainThreadExecutor jobMasterMainThreadExecutor =
getVertex().getExecutionGraphAccessor().getJobMasterMainThreadExecutor();
slot.releaseSlot(cause)
.whenComplete(
(Object ignored, Throwable throwable) -> {
// the completion callback must also run in the main thread
jobMasterMainThreadExecutor.assertRunningInMainThread();
if (throwable != null) {
releaseFuture.completeExceptionally(throwable);
} else {
releaseFuture.complete(null);
}
});
} else {
// no assigned resource --> we can directly complete the release future
releaseFuture.complete(null);
}
}
// --------------------------------------------------------------------------------------------
// Miscellaneous
// --------------------------------------------------------------------------------------------
/** Transitions from the current state to the given target state (transition result ignored). */
public void transitionState(ExecutionState targetState) {
transitionState(state, targetState);
}
/** Attempts the transition without an error cause; returns whether it took effect. */
private boolean transitionState(ExecutionState currentState, ExecutionState targetState) {
return transitionState(currentState, targetState, null);
}
/**
 * Performs the transition from {@code currentState} to {@code targetState} if the execution is
 * still in {@code currentState}: records the timestamp, logs the switch, completes the
 * initializing/running or terminal-state future, and notifies the vertex of the transition.
 *
 * @param currentState the expected current state; the transition only happens if it matches
 * @param targetState the state to transition to
 * @param error the error causing the transition, or null
 * @return true iff the transition took effect
 * @throws IllegalStateException if {@code currentState} is already terminal
 */
private boolean transitionState(
ExecutionState currentState, ExecutionState targetState, Throwable error) {
// sanity check
if (currentState.isTerminal()) {
throw new IllegalStateException(
"Cannot leave terminal state "
+ currentState
+ " to transition to "
+ targetState
+ '.');
}
// only transition if the state has not changed in the meantime
if (state == currentState) {
state = targetState;
markTimestamp(targetState);
if (error == null) {
LOG.info(
"{} ({}) switched from {} to {}.",
getVertex().getTaskNameWithSubtaskIndex(),
getAttemptId(),
currentState,
targetState);
} else {
if (LOG.isInfoEnabled()) {
LOG.info(
"{} ({}) switched from {} to {} on {}.",
getVertex().getTaskNameWithSubtaskIndex(),
getAttemptId(),
currentState,
targetState,
getLocationInformation(),
error);
}
}
if (targetState == INITIALIZING || targetState == RUNNING) {
initializingOrRunningFuture.complete(null);
} else if (targetState.isTerminal()) {
// complete the terminal state future
terminalStateFuture.complete(targetState);
}
// make sure that the state transition completes normally.
// potential errors (in listeners may not affect the main logic)
try {
vertex.notifyStateTransition(this, targetState);
} catch (Throwable t) {
LOG.error(
"Error while notifying execution graph of execution state transition.", t);
}
return true;
} else {
return false;
}
}
/** Returns a printable description of the assigned task manager location, if any. */
private String getLocationInformation() {
    // snapshot the slot reference once before dereferencing it
    final LogicalSlot slot = assignedResource;
    return slot == null ? "[unassigned resource]" : slot.getTaskManagerLocation().toString();
}
/** Records the current wall-clock time as the timestamp for entering the given state. */
private void markTimestamp(ExecutionState state) {
markTimestamp(state, System.currentTimeMillis());
}
/** Stores the given timestamp for the given state, indexed by the state's ordinal. */
private void markTimestamp(ExecutionState state, long timestamp) {
this.stateTimestamps[state.ordinal()] = timestamp;
}
/** Returns the task name with subtask index followed by the execution attempt number. */
public String getVertexWithAttempt() {
    return String.format(
            "%s - execution #%d", vertex.getTaskNameWithSubtaskIndex(), attemptNumber);
}
// ------------------------------------------------------------------------
// Accumulators
// ------------------------------------------------------------------------
/**
* Update accumulators (discarded when the Execution has already been terminated).
*
* @param userAccumulators the user accumulators
*/
public void setAccumulators(Map> userAccumulators) {
synchronized (accumulatorLock) {
if (!state.isTerminal()) {
this.userAccumulators = userAccumulators;
}
}
}
public Map> getUserAccumulators() {
return userAccumulators;
}
/** Returns the user accumulators as stringified results for reporting. */
@Override
public StringifiedAccumulatorResult[] getUserAccumulatorsStringified() {
    Map<String, OptionalFailure<Accumulator<?, ?>>> accumulators =
            userAccumulators == null
                    ? null
                    : userAccumulators.entrySet().stream()
                            .collect(
                                    Collectors.toMap(
                                            Map.Entry::getKey,
                                            entry -> OptionalFailure.of(entry.getValue())));
    return StringifiedAccumulatorResult.stringifyAccumulatorResults(accumulators);
}
/** Returns the subtask index of the vertex this execution attempt belongs to. */
@Override
public int getParallelSubtaskIndex() {
return getVertex().getParallelSubtaskIndex();
}
/** Returns the IO metrics last stored for this execution attempt. */
@Override
public IOMetrics getIOMetrics() {
return ioMetrics;
}
private void updateAccumulatorsAndMetrics(
Map> userAccumulators, IOMetrics metrics) {
if (userAccumulators != null) {
synchronized (accumulatorLock) {
this.userAccumulators = userAccumulators;
}
}
if (metrics != null) {
this.ioMetrics = metrics;
}
}
// ------------------------------------------------------------------------
// Standard utilities
// ------------------------------------------------------------------------
/** Returns "Attempt #n (taskName) @ slot - [state]" for logging and debugging. */
@Override
public String toString() {
// snapshot the slot reference once; it may be unassigned
final LogicalSlot slot = assignedResource;
return String.format(
"Attempt #%d (%s) @ %s - [%s]",
attemptNumber,
vertex.getTaskNameWithSubtaskIndex(),
(slot == null ? "(unassigned)" : slot),
state);
}
/** Creates an {@code ArchivedExecution} snapshot of this execution. */
@Override
public ArchivedExecution archive() {
return new ArchivedExecution(this);
}
/** Asserts that the current thread is the job master's main thread. */
private void assertRunningInJobMasterMainThread() {
vertex.getExecutionGraphAccessor()
.getJobMasterMainThreadExecutor()
.assertRunningInMainThread();
}
}