/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.runtime.executiongraph;
import org.apache.flink.annotation.VisibleForTesting;
import org.apache.flink.api.common.Archiveable;
import org.apache.flink.api.common.InputDependencyConstraint;
import org.apache.flink.api.common.accumulators.Accumulator;
import org.apache.flink.api.common.time.Time;
import org.apache.flink.core.io.InputSplit;
import org.apache.flink.runtime.JobException;
import org.apache.flink.runtime.accumulators.StringifiedAccumulatorResult;
import org.apache.flink.runtime.checkpoint.CheckpointOptions;
import org.apache.flink.runtime.checkpoint.CheckpointType;
import org.apache.flink.runtime.checkpoint.JobManagerTaskRestore;
import org.apache.flink.runtime.clusterframework.types.AllocationID;
import org.apache.flink.runtime.clusterframework.types.ResourceID;
import org.apache.flink.runtime.clusterframework.types.SlotProfile;
import org.apache.flink.runtime.concurrent.ComponentMainThreadExecutor;
import org.apache.flink.runtime.concurrent.FutureUtils;
import org.apache.flink.runtime.deployment.ResultPartitionDeploymentDescriptor;
import org.apache.flink.runtime.deployment.TaskDeploymentDescriptor;
import org.apache.flink.runtime.deployment.TaskDeploymentDescriptorFactory;
import org.apache.flink.runtime.execution.ExecutionState;
import org.apache.flink.runtime.instance.SlotSharingGroupId;
import org.apache.flink.runtime.io.network.partition.JobMasterPartitionTracker;
import org.apache.flink.runtime.io.network.partition.ResultPartitionID;
import org.apache.flink.runtime.jobgraph.IntermediateDataSetID;
import org.apache.flink.runtime.jobgraph.IntermediateResultPartitionID;
import org.apache.flink.runtime.jobgraph.OperatorID;
import org.apache.flink.runtime.jobmanager.scheduler.CoLocationConstraint;
import org.apache.flink.runtime.jobmanager.scheduler.LocationPreferenceConstraint;
import org.apache.flink.runtime.jobmanager.scheduler.NoResourceAvailableException;
import org.apache.flink.runtime.jobmanager.scheduler.ScheduledUnit;
import org.apache.flink.runtime.jobmanager.scheduler.SlotSharingGroup;
import org.apache.flink.runtime.jobmanager.slots.TaskManagerGateway;
import org.apache.flink.runtime.jobmaster.LogicalSlot;
import org.apache.flink.runtime.jobmaster.SlotRequestId;
import org.apache.flink.runtime.messages.Acknowledge;
import org.apache.flink.runtime.messages.TaskBackPressureResponse;
import org.apache.flink.runtime.operators.coordination.OperatorEvent;
import org.apache.flink.runtime.operators.coordination.TaskNotRunningException;
import org.apache.flink.runtime.shuffle.NettyShuffleMaster;
import org.apache.flink.runtime.shuffle.PartitionDescriptor;
import org.apache.flink.runtime.shuffle.ProducerDescriptor;
import org.apache.flink.runtime.shuffle.ShuffleDescriptor;
import org.apache.flink.runtime.shuffle.ShuffleMaster;
import org.apache.flink.runtime.taskexecutor.TaskExecutorOperatorEventGateway;
import org.apache.flink.runtime.taskmanager.TaskManagerLocation;
import org.apache.flink.util.ExceptionUtils;
import org.apache.flink.util.FlinkException;
import org.apache.flink.util.FlinkRuntimeException;
import org.apache.flink.util.OptionalFailure;
import org.apache.flink.util.Preconditions;
import org.apache.flink.util.SerializedValue;
import org.apache.flink.util.function.ThrowingRunnable;
import org.slf4j.Logger;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.CompletionException;
import java.util.concurrent.Executor;
import java.util.concurrent.TimeoutException;
import java.util.function.Function;
import java.util.stream.Collectors;
import static org.apache.flink.runtime.deployment.TaskDeploymentDescriptorFactory.getConsumedPartitionShuffleDescriptor;
import static org.apache.flink.runtime.execution.ExecutionState.CANCELED;
import static org.apache.flink.runtime.execution.ExecutionState.CANCELING;
import static org.apache.flink.runtime.execution.ExecutionState.CREATED;
import static org.apache.flink.runtime.execution.ExecutionState.DEPLOYING;
import static org.apache.flink.runtime.execution.ExecutionState.FAILED;
import static org.apache.flink.runtime.execution.ExecutionState.FINISHED;
import static org.apache.flink.runtime.execution.ExecutionState.RUNNING;
import static org.apache.flink.runtime.execution.ExecutionState.SCHEDULED;
import static org.apache.flink.runtime.scheduler.ExecutionVertexSchedulingRequirementsMapper.getPhysicalSlotResourceProfile;
import static org.apache.flink.util.Preconditions.checkNotNull;
/**
* A single execution of a vertex. While an {@link ExecutionVertex} can be executed multiple times
* (for recovery, re-computation, re-configuration), this class tracks the state of a single execution
* of that vertex and the resources.
*
* <h2>Lock free state transitions</h2>
*
* <p>In several points of the code, we need to deal with possible concurrent state changes and actions.
* For example, while the call to deploy a task (send it to the TaskManager) happens, the task gets cancelled.
*
* <p>We could lock the entire portion of the code (decision to deploy, deploy, set state to running) such that
* it is guaranteed that any "cancel command" will only pick up after deployment is done and that the "cancel
* command" call will never overtake the deploying call.
*
* <p>However, such locking would block the threads for a long time, because the remote calls may take long.
* Depending on their locking behavior, it may even result in distributed deadlocks (unless carefully avoided).
* We therefore use atomic state updates and occasional double-checking to ensure that the state after a
* completed call is as expected, and trigger correcting actions if it is not. Many actions are also
* idempotent (like canceling).
*/
public class Execution implements AccessExecution, Archiveable, LogicalSlot.Payload {
/** Logger of this class; it is the logger instance exposed by {@link ExecutionGraph}. */
private static final Logger LOG = ExecutionGraph.LOG;
// NOTE(review): presumably the number of attempts made for the cancel RPC; its use is outside this view.
private static final int NUM_CANCEL_CALL_TRIES = 3;
// --------------------------------------------------------------------------------------------
/** The executor which is used to execute futures. */
private final Executor executor;
/** The execution vertex whose task this execution executes. */
private final ExecutionVertex vertex;
/** The unique ID marking the specific execution instant of the task. */
private final ExecutionAttemptID attemptId;
/** Gets the global modification version of the execution graph when this execution was created.
* This version is bumped in the ExecutionGraph whenever a global failover happens. It is used
* to resolve conflicts between concurrent modification by global and local failover actions. */
private final long globalModVersion;
/** The timestamps when state transitions occurred, indexed by {@link ExecutionState#ordinal()}. */
private final long[] stateTimestamps;
/** The execution attempt number, as handed to the constructor. */
private final int attemptNumber;
/** The timeout for RPC calls like deploy/cancel/stop. */
private final Time rpcTimeout;
// Starts out as an empty list (see constructor).
// NOTE(review): element type and usage are not visible in this part of the file — confirm.
private final Collection partitionInfos;
/** A future that completes once the Execution reaches a terminal ExecutionState. */
private final CompletableFuture terminalStateFuture;
/** A future that completes once the execution reached a terminal state and the assigned resource has been released. */
private final CompletableFuture> releaseFuture;
/** Completed with the task manager location of the slot once a slot was successfully assigned. */
private final CompletableFuture taskManagerLocationFuture;
/** The current state of this execution; starts as CREATED and is declared volatile. */
private volatile ExecutionState state = CREATED;
/** The slot assigned to this execution, or null while no slot is assigned. */
private LogicalSlot assignedResource;
private Throwable failureCause; // once assigned, never changes
/** Information to restore the task on recovery, such as checkpoint id and task state snapshot. */
@Nullable
private JobManagerTaskRestore taskRestore;
/** This field holds the allocation id once it was assigned successfully. */
@Nullable
private AllocationID assignedAllocationID;
// ------------------------ Accumulators & Metrics ------------------------
/** Lock for updating the accumulators atomically.
* Prevents final accumulators to be overwritten by partial accumulators on a late heartbeat. */
private final Object accumulatorLock = new Object();
/** Continuously updated map of user-defined accumulators. */
private Map> userAccumulators;
// NOTE(review): presumably the I/O metrics reported for this execution; where it is written is outside this view.
private IOMetrics ioMetrics;
/** The produced partitions, looked up by partition id; an empty map until the registration result has been stored. */
private Map producedPartitions;
// --------------------------------------------------------------------------------------------
/**
 * Creates a new Execution attempt in state CREATED.
 *
 * @param executor
 *             The executor used to dispatch callbacks from futures and asynchronous RPC calls.
 * @param vertex
 *             The execution vertex to which this Execution belongs
 * @param attemptNumber
 *             The execution attempt number.
 * @param globalModVersion
 *             The global modification version of the execution graph when this execution was created
 * @param startTimestamp
 *             The timestamp that marks the creation of this Execution
 * @param rpcTimeout
 *             The rpcTimeout for RPC calls like deploy/cancel/stop.
 */
public Execution(
        Executor executor,
        ExecutionVertex vertex,
        int attemptNumber,
        long globalModVersion,
        long startTimestamp,
        Time rpcTimeout) {

    // mandatory collaborators (null checks happen in declaration order of the arguments)
    this.executor = checkNotNull(executor);
    this.vertex = checkNotNull(vertex);
    this.attemptId = new ExecutionAttemptID();
    this.rpcTimeout = checkNotNull(rpcTimeout);

    // identity of this attempt
    this.attemptNumber = attemptNumber;
    this.globalModVersion = globalModVersion;

    // futures observed by the outside world; none of them is completed yet
    this.terminalStateFuture = new CompletableFuture<>();
    this.releaseFuture = new CompletableFuture<>();
    this.taskManagerLocationFuture = new CompletableFuture<>();

    // no slot and no partitions are known at creation time
    this.assignedResource = null;
    this.partitionInfos = new ArrayList<>(16);
    this.producedPartitions = Collections.emptyMap();

    // one timestamp entry per execution state; the creation time is recorded right away
    this.stateTimestamps = new long[ExecutionState.values().length];
    markTimestamp(CREATED, startTimestamp);
}
// --------------------------------------------------------------------------------------------
// Properties
// --------------------------------------------------------------------------------------------
/**
 * Returns the execution vertex whose task this execution executes.
 *
 * @return the vertex handed to the constructor
 */
public ExecutionVertex getVertex() {
    return this.vertex;
}
/**
 * Returns the unique ID of this execution attempt.
 *
 * @return the attempt ID created together with this execution
 */
@Override
public ExecutionAttemptID getAttemptId() {
    return this.attemptId;
}
/**
 * Returns the attempt number of this execution.
 *
 * @return the attempt number handed to the constructor
 */
@Override
public int getAttemptNumber() {
    return this.attemptNumber;
}
/**
 * Returns the current state of this execution.
 *
 * @return the value of the volatile state field at the time of the call
 */
@Override
public ExecutionState getState() {
    return this.state;
}
/**
 * Returns the allocation id of the slot that was assigned to this execution.
 *
 * @return the allocation id, or null if no slot has been assigned successfully so far
 */
@Nullable
public AllocationID getAssignedAllocationID() {
    return this.assignedAllocationID;
}
/**
 * Gets the global modification version of the execution graph when this execution was created.
 *
 * <p>This version is bumped in the ExecutionGraph whenever a global failover happens. It is used
 * to resolve conflicts between concurrent modification by global and local failover actions.
 *
 * @return the global modification version handed to the constructor
 */
public long getGlobalModVersion() {
    return this.globalModVersion;
}
/**
 * Returns the future of the task manager location this execution runs on.
 *
 * @return a future that is completed with the slot's location once a slot was assigned successfully
 */
public CompletableFuture getTaskManagerLocationFuture() {
    return this.taskManagerLocationFuture;
}
/**
 * Returns the slot that is currently assigned to this execution.
 *
 * @return the assigned slot, or null if no slot is assigned
 */
public LogicalSlot getAssignedResource() {
    return this.assignedResource;
}
/**
 * Looks up the entry stored for the given partition id in the produced partitions of this execution.
 *
 * @param id id of the intermediate result partition to look up
 * @return the stored entry, or an empty Optional if nothing is stored for the id
 */
public Optional getResultPartitionDeploymentDescriptor(
        IntermediateResultPartitionID id) {
    // a missing mapping yields null, which is turned into an empty Optional
    return Optional.ofNullable(this.producedPartitions.get(id));
}
/**
 * Tries to assign the given slot to the execution. The assignment works only if the
 * Execution is in state SCHEDULED or CREATED (the latter is accepted for testing purposes),
 * no slot is assigned yet and the slot accepts this execution as its payload.
 *
 * <p>On success the task manager location future is completed and the allocation id of the
 * slot is remembered. On any failure after the slot was tentatively stored, the stored slot
 * is cleared again.
 *
 * @param logicalSlot to assign to this execution
 * @return true if the slot could be assigned to the execution, otherwise false
 */
public boolean tryAssignResource(final LogicalSlot logicalSlot) {
    assertRunningInJobMasterMainThread();
    checkNotNull(logicalSlot);

    // only allow to set the assigned resource in state SCHEDULED or CREATED
    // note: resource assignment in state CREATED is accepted for testing purposes
    if (state != SCHEDULED && state != CREATED) {
        return false;
    }

    // the execution already has another slot assigned
    if (assignedResource != null) {
        return false;
    }

    // tentatively store the slot before offering this execution as its payload
    assignedResource = logicalSlot;

    if (!logicalSlot.tryAssignPayload(this)) {
        // the slot rejected the payload: undo the tentative assignment
        assignedResource = null;
        return false;
    }

    // check for concurrent modification (e.g. cancelling call)
    final boolean stillAssignable =
        (state == SCHEDULED || state == CREATED) && !taskManagerLocationFuture.isDone();
    if (!stillAssignable) {
        // free assigned resource and report the failure
        assignedResource = null;
        return false;
    }

    taskManagerLocationFuture.complete(logicalSlot.getTaskManagerLocation());
    assignedAllocationID = logicalSlot.getAllocationId();
    return true;
}
/**
 * Requests the next input split from the vertex, passing the host name of the assigned slot
 * if a slot is assigned.
 *
 * @return whatever the vertex returns for the (possibly null) host name
 */
public InputSplit getNextInputSplit() {
    final LogicalSlot currentSlot = getAssignedResource();

    // without an assigned slot there is no host to report
    String hostname = null;
    if (currentSlot != null) {
        hostname = currentSlot.getTaskManagerLocation().getHostname();
    }

    return vertex.getNextInputSplit(hostname);
}
/**
 * Returns the task manager location of the assigned slot.
 *
 * @return the location of the assigned slot, or null if no slot is assigned
 */
@Override
public TaskManagerLocation getAssignedResourceLocation() {
    // read the field once so that the null check and the call see the same slot
    final LogicalSlot slotSnapshot = assignedResource;
    if (slotSnapshot == null) {
        return null;
    }
    return slotSnapshot.getTaskManagerLocation();
}
/**
 * Returns the failure cause stored for this execution.
 *
 * @return the stored failure cause, or null if none has been stored
 */
public Throwable getFailureCause() {
    return this.failureCause;
}
/**
 * Returns the failure cause of this execution rendered as a string.
 *
 * @return the result of stringifying {@link #getFailureCause()}
 */
@Override
public String getFailureCauseAsString() {
    final Throwable cause = getFailureCause();
    return ExceptionUtils.stringifyException(cause);
}
/**
 * Returns the timestamps of the state transitions, indexed by {@link ExecutionState#ordinal()}.
 *
 * @return the internal timestamp array itself (no copy is made)
 */
@Override
public long[] getStateTimestamps() {
    return this.stateTimestamps;
}
/**
 * Returns the timestamp recorded for the given state.
 *
 * @param state the state whose timestamp is requested
 * @return the entry of the timestamp array at the state's ordinal
 */
@Override
public long getStateTimestamp(ExecutionState state) {
    final int timestampIndex = state.ordinal();
    return this.stateTimestamps[timestampIndex];
}
/**
 * Checks whether this execution has reached a terminal state.
 *
 * @return true if the current state reports itself as terminal
 */
public boolean isFinished() {
    final ExecutionState currentState = state;
    return currentState.isTerminal();
}
/**
 * Returns the information to restore the task on recovery.
 *
 * @return the restore information, or null if none has been set
 */
@Nullable
public JobManagerTaskRestore getTaskRestore() {
    return this.taskRestore;
}
/**
* Sets the initial state for the execution. The serialized state is then shipped via the
* {@link TaskDeploymentDescriptor} to the TaskManagers.
*
* Any previously set restore information is overwritten; passing null clears it.
*
* @param taskRestore information to restore the state, may be null
*/
public void setInitialState(@Nullable JobManagerTaskRestore taskRestore) {
this.taskRestore = taskRestore;
}
/**
 * Gets a future that completes once the task execution reaches a terminal state.
 * The future will be completed with specific state that the execution reached.
 * This future is always completed from the job master's main thread.
 *
 * @return A future which is completed once the execution reaches a terminal state
 */
@Override
public CompletableFuture getTerminalStateFuture() {
    return this.terminalStateFuture;
}
/**
* Gets the release future which is completed once the execution reaches a terminal
* state and the assigned resource has been released.
* This future is always completed from the job master's main thread.
*
* The same future instance is returned on every call.
*
* @return A future which is completed once the assigned resource has been released
*/
public CompletableFuture> getReleaseFuture() {
return releaseFuture;
}
// --------------------------------------------------------------------------------------------
// Actions
// --------------------------------------------------------------------------------------------
/**
 * Schedules this execution with the slot provider strategy of the execution graph, without any
 * location preference constraint ({@link LocationPreferenceConstraint#ANY}) and without
 * previous allocation ids.
 *
 * @return Future which is completed once the Execution has been deployed
 */
public CompletableFuture scheduleForExecution() {
    final SlotProviderStrategy graphSlotProviderStrategy =
        getVertex().getExecutionGraph().getSlotProviderStrategy();

    return scheduleForExecution(
        graphSlotProviderStrategy,
        LocationPreferenceConstraint.ANY,
        Collections.emptySet());
}
/**
 * Allocates the resources for this execution and deploys it afterwards.
 *
 * <p>NOTE: An {@link IllegalExecutionStateException} raised while setting up the allocation is
 * not thrown but reported through an exceptionally completed future. Once the allocation has
 * been set up, any failure of the allocation or the deployment marks this execution as failed;
 * a {@link TimeoutException} is reported as a {@link NoResourceAvailableException}.
 *
 * @param slotProviderStrategy The slot provider strategy to use to allocate slot for this execution attempt.
 * @param locationPreferenceConstraint constraint for the location preferences
 * @param allPreviousExecutionGraphAllocationIds set with all previous allocation ids in the job graph.
 *                                                 Can be empty if the allocation ids are not required for scheduling.
 * @return Future which is completed once the Execution has been deployed
 */
public CompletableFuture scheduleForExecution(
        SlotProviderStrategy slotProviderStrategy,
        LocationPreferenceConstraint locationPreferenceConstraint,
        @Nonnull Set allPreviousExecutionGraphAllocationIds) {
    assertRunningInJobMasterMainThread();

    try {
        // deployment is chained directly onto the resource allocation
        final CompletableFuture deploymentFuture =
            allocateResourcesForExecution(
                    slotProviderStrategy,
                    locationPreferenceConstraint,
                    allPreviousExecutionGraphAllocationIds)
                .thenRun(ThrowingRunnable.unchecked(this::deploy));

        deploymentFuture.whenComplete(
            (Void ignored, Throwable failure) -> {
                if (failure == null) {
                    // deployed successfully, nothing to report
                    return;
                }

                final Throwable strippedFailure = ExceptionUtils.stripCompletionException(failure);

                // a timeout means the slot request could not be fulfilled in time
                final Throwable schedulingFailureCause = strippedFailure instanceof TimeoutException
                    ? new NoResourceAvailableException(
                        "Could not allocate enough slots to run the job. " +
                            "Please make sure that the cluster has enough resources.")
                    : strippedFailure;

                markFailed(schedulingFailureCause);
            });

        return deploymentFuture;
    } catch (IllegalExecutionStateException e) {
        return FutureUtils.completedExceptionally(e);
    }
}
/**
 * Allocates resources for the execution.
 *
 * <p>Allocates following resources:
 * <ol>
 *   <li>slot obtained from the slot provider</li>
 *   <li>registers produced partitions with the {@link org.apache.flink.runtime.shuffle.ShuffleMaster}</li>
 * </ol>
 *
 * @param slotProviderStrategy to obtain a new slot from
 * @param locationPreferenceConstraint constraint for the location preferences
 * @param allPreviousExecutionGraphAllocationIds set with all previous allocation ids in the job graph.
 *                                                 Can be empty if the allocation ids are not required for scheduling.
 * @return Future which is completed with this execution once the slot has been assigned
 *             or with an exception if an error occurred.
 */
CompletableFuture allocateResourcesForExecution(
        SlotProviderStrategy slotProviderStrategy,
        LocationPreferenceConstraint locationPreferenceConstraint,
        @Nonnull Set allPreviousExecutionGraphAllocationIds) {

    // step 1: obtain a slot and assign it to this execution
    final CompletableFuture<LogicalSlot> assignedSlotFuture = allocateAndAssignSlotForExecution(
        slotProviderStrategy,
        locationPreferenceConstraint,
        allPreviousExecutionGraphAllocationIds);

    // step 2: register the produced partitions for the location of the assigned slot
    return assignedSlotFuture.thenCompose(
        assignedSlot -> registerProducedPartitions(assignedSlot.getTaskManagerLocation()));
}
/**
* Allocates and assigns a slot obtained from the slot provider to the execution.
*
* The execution is first moved from CREATED to SCHEDULED. The slot is then requested with a
* slot profile built from the vertex' resource profiles, the preferred locations and the
* prior allocations. A callback on the release future cancels a slot request that is still
* pending when the execution is released.
*
* @param slotProviderStrategy to obtain a new slot from
* @param locationPreferenceConstraint constraint for the location preferences
* @param allPreviousExecutionGraphAllocationIds set with all previous allocation ids in the job graph.
* Can be empty if the allocation ids are not required for scheduling.
* @return Future which is completed with the allocated slot once it has been assigned
* or with an exception if an error occurred.
* @throws IllegalStateException if the vertex has a co-location constraint but its job vertex has no slot sharing group
* @throws IllegalExecutionStateException if the execution is not in state CREATED
*/
private CompletableFuture allocateAndAssignSlotForExecution(
SlotProviderStrategy slotProviderStrategy,
LocationPreferenceConstraint locationPreferenceConstraint,
@Nonnull Set allPreviousExecutionGraphAllocationIds) {
checkNotNull(slotProviderStrategy);
assertRunningInJobMasterMainThread();
final SlotSharingGroup sharingGroup = vertex.getJobVertex().getSlotSharingGroup();
final CoLocationConstraint locationConstraint = vertex.getLocationConstraint();
// sanity check
if (locationConstraint != null && sharingGroup == null) {
throw new IllegalStateException(
"Trying to schedule with co-location constraint but without slot sharing allowed.");
}
// this method only works if the execution is in the state 'CREATED'
if (transitionState(CREATED, SCHEDULED)) {
// the slot sharing group id is optional; null means no slot sharing group is set
final SlotSharingGroupId slotSharingGroupId = sharingGroup != null ? sharingGroup.getSlotSharingGroupId() : null;
// the co-location constraint is only passed on when the vertex has one
ScheduledUnit toSchedule = locationConstraint == null ?
new ScheduledUnit(this, slotSharingGroupId) :
new ScheduledUnit(this, slotSharingGroupId, locationConstraint);
// try to extract previous allocation ids, if applicable, so that we can reschedule to the same slot
ExecutionVertex executionVertex = getVertex();
AllocationID lastAllocation = executionVertex.getLatestPriorAllocation();
Collection previousAllocationIDs =
lastAllocation != null ? Collections.singletonList(lastAllocation) : Collections.emptyList();
// calculate the preferred locations
final CompletableFuture> preferredLocationsFuture =
calculatePreferredLocations(locationPreferenceConstraint);
// the request id is created up front so that the pending request can be cancelled later on
final SlotRequestId slotRequestId = new SlotRequestId();
// the slot is only requested once the preferred locations are known
final CompletableFuture logicalSlotFuture =
preferredLocationsFuture.thenCompose(
(Collection preferredLocations) ->
slotProviderStrategy.allocateSlot(
slotRequestId,
toSchedule,
SlotProfile.priorAllocation(
vertex.getResourceProfile(),
getPhysicalSlotResourceProfile(vertex),
preferredLocations,
previousAllocationIDs,
allPreviousExecutionGraphAllocationIds)));
// register call back to cancel slot request in case that the execution gets canceled
releaseFuture.whenComplete(
(Object ignored, Throwable throwable) -> {
// cancel(false) only returns true if the slot future is cancelled by this call;
// only then the request is also cancelled at the slot provider
if (logicalSlotFuture.cancel(false)) {
slotProviderStrategy.cancelSlotRequest(
slotRequestId,
slotSharingGroupId,
new FlinkException("Execution " + this + " was released."));
}
});
// This forces calls to the slot pool back into the main thread, for normal and exceptional completion
return logicalSlotFuture.handle(
(LogicalSlot logicalSlot, Throwable failure) -> {
// a failed slot request is passed on to the returned future
if (failure != null) {
throw new CompletionException(failure);
}
if (tryAssignResource(logicalSlot)) {
return logicalSlot;
} else {
// release the slot
logicalSlot.releaseSlot(new FlinkException("Could not assign logical slot to execution " + this + '.'));
throw new CompletionException(
new FlinkException(
"Could not assign slot " + logicalSlot + " to execution " + this + " because it has already been assigned "));
}
});
} else {
// call race, already deployed, or already done
throw new IllegalExecutionStateException(this, CREATED, state);
}
}
/**
 * Registers the produced partitions of this execution for the given location. Whether the
 * schedule-or-update-consumers message is requested is taken from the schedule mode of the
 * execution graph (its lazy deployment setting).
 *
 * <p>This variant first runs a state check on {@code isLegacyScheduling()}.
 *
 * @param location location of the task manager the execution is assigned to
 * @return future of the two-argument registration variant
 */
public CompletableFuture registerProducedPartitions(TaskManagerLocation location) {
    Preconditions.checkState(isLegacyScheduling());

    final boolean lazyDeploymentAllowed =
        vertex.getExecutionGraph().getScheduleMode().allowLazyDeployment();

    return registerProducedPartitions(location, lazyDeploymentAllowed);
}
/**
 * Registers the produced partitions of this execution for the given location and, once the
 * registration result is available, stores it in this execution and starts tracking the
 * registered partitions for the location's resource id.
 *
 * <p>The follow-up step is handed to the job master main thread executor of the execution graph
 * via {@code FutureUtils.thenApplyAsyncIfNotDone}.
 *
 * @param location location of the task manager the execution is assigned to
 * @param sendScheduleOrUpdateConsumersMessage flag that is passed on to the partition registration
 * @return future which is completed with this execution after the result has been stored
 */
public CompletableFuture registerProducedPartitions(
        TaskManagerLocation location,
        boolean sendScheduleOrUpdateConsumersMessage) {
    assertRunningInJobMasterMainThread();

    return FutureUtils.thenApplyAsyncIfNotDone(
        registerProducedPartitions(this.vertex, location, this.attemptId, sendScheduleOrUpdateConsumersMessage),
        this.vertex.getExecutionGraph().getJobMasterMainThreadExecutor(),
        registeredPartitions -> {
            // remember the registration result before tracking it
            this.producedPartitions = registeredPartitions;
            startTrackingPartitions(location.getResourceID(), registeredPartitions.values());
            return this;
        });
}
/**
* Registers the produced partitions with the {@link ShuffleMaster}.
*
* <p>HACK: Please notice that this method simulates asynchronous registration in a synchronous way
* by making sure the returned {@link CompletableFuture} from {@link ShuffleMaster#registerPartitionWithProducer}
* is completed immediately.
*
* <p>{@link Execution#producedPartitions} are registered through an asynchronous interface
* {@link ShuffleMaster#registerPartitionWithProducer} to {@link ShuffleMaster}, however they are not always
* accessed through callbacks. So, it is possible that {@link Execution#producedPartitions}
* are not yet available when accessed (in {@link Execution#deploy} for example).
*
* <p>Since the only implementation of {@link ShuffleMaster} is {@link NettyShuffleMaster},
* which indeed registers producedPartition in a synchronous way, this method enforces
* synchronous registration under an asynchronous interface for now.
*
* <p>TODO: If asynchronous registration is needed in the future, use callbacks to access {@link Execution#producedPartitions}.
*
* @return completed future of partition deployment descriptors.
*/
@VisibleForTesting
static CompletableFuture