All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.flink.runtime.executiongraph.Execution Maven / Gradle / Ivy

There is a newer version: 1.5.1
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.runtime.executiongraph;

import org.apache.flink.annotation.VisibleForTesting;
import org.apache.flink.api.common.Archiveable;
import org.apache.flink.api.common.accumulators.Accumulator;
import org.apache.flink.api.common.time.Time;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.JobManagerOptions;
import org.apache.flink.runtime.JobException;
import org.apache.flink.runtime.accumulators.StringifiedAccumulatorResult;
import org.apache.flink.runtime.checkpoint.CheckpointOptions;
import org.apache.flink.runtime.checkpoint.JobManagerTaskRestore;
import org.apache.flink.runtime.clusterframework.types.AllocationID;
import org.apache.flink.runtime.clusterframework.types.ResourceProfile;
import org.apache.flink.runtime.clusterframework.types.SlotProfile;
import org.apache.flink.runtime.concurrent.FutureUtils;
import org.apache.flink.runtime.deployment.PartialInputChannelDeploymentDescriptor;
import org.apache.flink.runtime.deployment.TaskDeploymentDescriptor;
import org.apache.flink.runtime.execution.ExecutionState;
import org.apache.flink.runtime.instance.SlotSharingGroupId;
import org.apache.flink.runtime.jobmanager.scheduler.CoLocationConstraint;
import org.apache.flink.runtime.jobmanager.scheduler.LocationPreferenceConstraint;
import org.apache.flink.runtime.jobmanager.scheduler.ScheduledUnit;
import org.apache.flink.runtime.jobmanager.scheduler.SlotSharingGroup;
import org.apache.flink.runtime.jobmanager.slots.TaskManagerGateway;
import org.apache.flink.runtime.jobmaster.LogicalSlot;
import org.apache.flink.runtime.jobmaster.SlotRequestId;
import org.apache.flink.runtime.jobmaster.slotpool.SlotProvider;
import org.apache.flink.runtime.messages.Acknowledge;
import org.apache.flink.runtime.messages.StackTraceSampleResponse;
import org.apache.flink.runtime.taskmanager.TaskManagerLocation;
import org.apache.flink.util.ExceptionUtils;
import org.apache.flink.util.FlinkException;
import org.apache.flink.util.FlinkRuntimeException;
import org.apache.flink.util.OptionalFailure;

import org.slf4j.Logger;

import javax.annotation.Nullable;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.CompletionException;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.Executor;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicReferenceFieldUpdater;
import java.util.stream.Collectors;

import static org.apache.flink.runtime.execution.ExecutionState.CANCELED;
import static org.apache.flink.runtime.execution.ExecutionState.CANCELING;
import static org.apache.flink.runtime.execution.ExecutionState.CREATED;
import static org.apache.flink.runtime.execution.ExecutionState.DEPLOYING;
import static org.apache.flink.runtime.execution.ExecutionState.FAILED;
import static org.apache.flink.runtime.execution.ExecutionState.FINISHED;
import static org.apache.flink.runtime.execution.ExecutionState.RUNNING;
import static org.apache.flink.runtime.execution.ExecutionState.SCHEDULED;
import static org.apache.flink.util.Preconditions.checkNotNull;
import static org.apache.flink.util.Preconditions.checkState;

/**
 * A single execution of a vertex. While an {@link ExecutionVertex} can be executed multiple times
 * (for recovery, re-computation, re-configuration), this class tracks the state of a single execution
 * of that vertex and the resources.
 *
 * 

Lock free state transitions

* *

In several points of the code, we need to deal with possible concurrent state changes and actions. * For example, while the call to deploy a task (send it to the TaskManager) happens, the task gets cancelled. * *

We could lock the entire portion of the code (decision to deploy, deploy, set state to running) such that * it is guaranteed that any "cancel command" will only pick up after deployment is done and that the "cancel * command" call will never overtake the deploying call. * *

This blocks the threads big time, because the remote calls may take long. Depending of their locking behavior, it * may even result in distributed deadlocks (unless carefully avoided). We therefore use atomic state updates and * occasional double-checking to ensure that the state after a completed call is as expected, and trigger correcting * actions if it is not. Many actions are also idempotent (like canceling). */ public class Execution implements AccessExecution, Archiveable, LogicalSlot.Payload { private static final AtomicReferenceFieldUpdater STATE_UPDATER = AtomicReferenceFieldUpdater.newUpdater(Execution.class, ExecutionState.class, "state"); private static final AtomicReferenceFieldUpdater ASSIGNED_SLOT_UPDATER = AtomicReferenceFieldUpdater.newUpdater( Execution.class, LogicalSlot.class, "assignedResource"); private static final Logger LOG = ExecutionGraph.LOG; private static final int NUM_CANCEL_CALL_TRIES = 3; private static final int NUM_STOP_CALL_TRIES = 3; // -------------------------------------------------------------------------------------------- /** The executor which is used to execute futures. */ private final Executor executor; /** The execution vertex whose task this execution executes. */ private final ExecutionVertex vertex; /** The unique ID marking the specific execution instant of the task. */ private ExecutionAttemptID attemptId; /** Gets the global modification version of the execution graph when this execution was created. * This version is bumped in the ExecutionGraph whenever a global failover happens. It is used * to resolve conflicts between concurrent modification by global and local failover actions. */ private final long globalModVersion; /** The timestamps when state transitions occurred, indexed by {@link ExecutionState#ordinal()}. */ private final long[] stateTimestamps; private int attemptNumber; private final Time rpcTimeout; private final ConcurrentLinkedQueue partialInputChannelDeploymentDescriptors; /** A future that completes once the Execution reaches a terminal ExecutionState. */ private final CompletableFuture terminalStateFuture; private final CompletableFuture releaseFuture; private final CompletableFuture taskManagerLocationFuture; private volatile ExecutionState state = CREATED; private volatile LogicalSlot assignedResource; private volatile Throwable failureCause; // once assigned, never changes /** Information to restore the task on recovery, such as checkpoint id and task state snapshot. */ @Nullable private volatile JobManagerTaskRestore taskRestore; /** This field holds the allocation id once it was assigned successfully. */ @Nullable private volatile AllocationID assignedAllocationID; // ------------------------ Accumulators & Metrics ------------------------ /** Lock for updating the accumulators atomically. * Prevents final accumulators to be overwritten by partial accumulators on a late heartbeat. */ private final Object accumulatorLock = new Object(); /* Continuously updated map of user-defined accumulators */ private volatile Map> userAccumulators; private volatile IOMetrics ioMetrics; // -------------------------------------------------------------------------------------------- private final Object updatePartitionLock = new Object(); private ScheduledFuture updatePartitionFuture; /** A future that completes once the Execution reconcile finish. */ private CompletableFuture reconcileFuture; /** * Creates a new Execution attempt. * * @param executor * The executor used to dispatch callbacks from futures and asynchronous RPC calls. * @param vertex * The execution vertex to which this Execution belongs * @param attemptNumber * The execution attempt number. * @param globalModVersion * The global modification version of the execution graph when this execution was created * @param startTimestamp * The timestamp that marks the creation of this Execution * @param rpcTimeout * The rpcTimeout for RPC calls like deploy/cancel/stop. */ public Execution( Executor executor, ExecutionVertex vertex, int attemptNumber, long globalModVersion, long startTimestamp, Time rpcTimeout) { this.executor = checkNotNull(executor); this.vertex = checkNotNull(vertex); this.attemptId = new ExecutionAttemptID(); this.rpcTimeout = checkNotNull(rpcTimeout); this.globalModVersion = globalModVersion; this.attemptNumber = attemptNumber; this.stateTimestamps = new long[ExecutionState.values().length]; markTimestamp(ExecutionState.CREATED, startTimestamp); this.partialInputChannelDeploymentDescriptors = new ConcurrentLinkedQueue<>(); this.terminalStateFuture = new CompletableFuture<>(); this.releaseFuture = new CompletableFuture<>(); this.taskManagerLocationFuture = new CompletableFuture<>(); this.assignedResource = null; } // -------------------------------------------------------------------------------------------- // Properties // -------------------------------------------------------------------------------------------- public ExecutionVertex getVertex() { return vertex; } @Override public ExecutionAttemptID getAttemptId() { return attemptId; } @Override public int getAttemptNumber() { return attemptNumber; } @Override public ExecutionState getState() { return state; } @Nullable public AllocationID getAssignedAllocationID() { return assignedAllocationID; } /** * Gets the global modification version of the execution graph when this execution was created. * *

This version is bumped in the ExecutionGraph whenever a global failover happens. It is used * to resolve conflicts between concurrent modification by global and local failover actions. */ public long getGlobalModVersion() { return globalModVersion; } public CompletableFuture getTaskManagerLocationFuture() { return taskManagerLocationFuture; } public LogicalSlot getAssignedResource() { return assignedResource; } /** * Tries to assign the given slot to the execution. The assignment works only if the * Execution is in state SCHEDULED. Returns true, if the resource could be assigned. * * @param logicalSlot to assign to this execution * @return true if the slot could be assigned to the execution, otherwise false */ @VisibleForTesting boolean tryAssignResource(final LogicalSlot logicalSlot) { checkNotNull(logicalSlot); // only allow to set the assigned resource in state SCHEDULED or CREATED // note: we also accept resource assignment when being in state CREATED for testing purposes if (state == SCHEDULED || state == CREATED) { if (ASSIGNED_SLOT_UPDATER.compareAndSet(this, null, logicalSlot) && logicalSlot.tryAssignPayload(this)) { // check for concurrent modification (e.g. cancelling call) if (state == SCHEDULED || state == CREATED) { checkState(!taskManagerLocationFuture.isDone(), "The TaskManagerLocationFuture should not be set if we haven't assigned a resource yet."); taskManagerLocationFuture.complete(logicalSlot.getTaskManagerLocation()); assignedAllocationID = logicalSlot.getAllocationId(); LOG.info("{} is assigned resource {}_{} with {}", getVertexWithAttempt(), logicalSlot.getTaskManagerLocation().getResourceID(), logicalSlot.getPhysicalSlotNumber(), assignedAllocationID); return true; } else { // free assigned resource and return false ASSIGNED_SLOT_UPDATER.set(this, null); return false; } } else { // the slot already has another slot assigned return false; } } else { // do not allow resource assignment if we are not in state SCHEDULED return false; } } @Override public TaskManagerLocation getAssignedResourceLocation() { // returns non-null only when a location is already assigned final LogicalSlot currentAssignedResource = assignedResource; try { return currentAssignedResource != null ? currentAssignedResource.getTaskManagerLocation() : (taskManagerLocationFuture.isDone() ? taskManagerLocationFuture.get() : null); } catch (Exception e) { return null; } } public Throwable getFailureCause() { return failureCause; } @Override public String getFailureCauseAsString() { return ExceptionUtils.stringifyException(getFailureCause()); } @Override public long[] getStateTimestamps() { return stateTimestamps; } @Override public long getStateTimestamp(ExecutionState state) { return this.stateTimestamps[state.ordinal()]; } public boolean isFinished() { return state.isTerminal(); } @Nullable public JobManagerTaskRestore getTaskRestore() { return taskRestore; } /** * Sets the initial state for the execution. The serialized state is then shipped via the * {@link TaskDeploymentDescriptor} to the TaskManagers. * * @param taskRestore information to restore the state */ public void setInitialState(@Nullable JobManagerTaskRestore taskRestore) { checkState(state == CREATED, "Can only assign operator state when execution attempt is in CREATED"); this.taskRestore = taskRestore; } /** * Gets a future that completes once the task execution reaches a terminal state. * The future will be completed with specific state that the execution reached. * * @return A future which is completed once the execution reaches a terminal state */ @Override public CompletableFuture getTerminalStateFuture() { return terminalStateFuture; } /** * Gets the release future which is completed once the execution reaches a terminal * state and the assigned resource has been released. * * @return A future which is completed once the assigned resource has been released */ public CompletableFuture getReleaseFuture() { return releaseFuture; } /** * Gets the reconcile future which is completed once the task executor report status or timeout. */ public CompletableFuture getReconcileFuture() { return reconcileFuture; } // -------------------------------------------------------------------------------------------- // Actions // -------------------------------------------------------------------------------------------- public CompletableFuture scheduleForExecution() { final ExecutionGraph executionGraph = getVertex().getExecutionGraph(); final SlotProvider resourceProvider = executionGraph.getSlotProvider(); final boolean allowQueued = executionGraph.isQueuedSchedulingAllowed(); return scheduleForExecution( resourceProvider, allowQueued, LocationPreferenceConstraint.ANY); } /** * NOTE: This method only throws exceptions if it is in an illegal state to be scheduled, or if the tasks needs * to be scheduled immediately and no resource is available. If the task is accepted by the schedule, any * error sets the vertex state to failed and triggers the recovery logic. * * @param slotProvider The slot provider to use to allocate slot for this execution attempt. * @param queued Flag to indicate whether the scheduler may queue this task if it cannot * immediately deploy it. * @param locationPreferenceConstraint constraint for the location preferences * @return Future which is completed once the Execution has been deployed */ public CompletableFuture scheduleForExecution( SlotProvider slotProvider, boolean queued, LocationPreferenceConstraint locationPreferenceConstraint) { final Time allocationTimeout = vertex.getExecutionGraph().getAllocationTimeout(); try { final CompletableFuture allocationFuture = allocateAndAssignSlotForExecution( slotProvider, queued, locationPreferenceConstraint, allocationTimeout); // IMPORTANT: We have to use the synchronous handle operation (direct executor) here so // that we directly deploy the tasks if the slot allocation future is completed. This is // necessary for immediate deployment. final CompletableFuture deploymentFuture = allocationFuture.handle( (Execution ignored, Throwable throwable) -> { if (throwable != null) { markFailed(ExceptionUtils.stripCompletionException(throwable)); } else { try { deploy(); } catch (Throwable t) { markFailed(ExceptionUtils.stripCompletionException(t)); } } return null; } ); // if tasks have to scheduled immediately check that the task has been deployed if (!queued && !deploymentFuture.isDone()) { allocationFuture.completeExceptionally(new IllegalArgumentException("The slot allocation future has not been completed yet.")); } return deploymentFuture; } catch (IllegalExecutionStateException e) { return FutureUtils.completedExceptionally(e); } } /** * Enter SCHEDULED and return schedule unit and slot profile for scheduling this execution. This is for batch allocating slots. * * @throws IllegalExecutionStateException if this method has been called while not being in the CREATED state */ public Tuple2 enterScheduledAndPrepareSchedulingResources() throws IllegalStateException { final SlotSharingGroup sharingGroup = vertex.getJobVertex().getSlotSharingGroup(); final CoLocationConstraint locationConstraint = vertex.getLocationConstraint(); // sanity check if (locationConstraint != null && sharingGroup == null) { throw new IllegalStateException( "Trying to schedule with co-location constraint but without slot sharing allowed."); } // this method only works if the execution is in the state 'CREATED' if (transitionState(CREATED, SCHEDULED)) { final SlotSharingGroupId slotSharingGroupId = sharingGroup != null ? sharingGroup.getSlotSharingGroupId() : null; ScheduledUnit toSchedule = locationConstraint == null ? new ScheduledUnit(this, slotSharingGroupId) : new ScheduledUnit(this, slotSharingGroupId, locationConstraint); // try to extract previous allocation ids, if applicable, so that we can reschedule to the same slot ExecutionVertex executionVertex = getVertex(); AllocationID lastAllocation = executionVertex.getLatestPriorAllocation(); Collection previousAllocationIDs = lastAllocation != null ? Collections.singletonList(lastAllocation) : Collections.emptyList(); // calculate the preferred locations only based on state. Collection> locationFuture = getVertex().getPreferredLocationsBasedOnState(); final Collection preferredLocations = locationFuture == null ? Collections.EMPTY_LIST : FutureUtils.combineAll(locationFuture).join(); return new Tuple2(toSchedule, new SlotProfile( computeResource(sharingGroup), preferredLocations, previousAllocationIDs, executionVertex.getJobVertex().getJobVertex().getTags())); } else { // call race, already deployed, or already done throw new IllegalExecutionStateException(this, CREATED, state); } } /** * Allocates and assigns a slot obtained from the slot provider to the execution. * * @param slotProvider to obtain a new slot from * @param queued if the allocation can be queued * @param locationPreferenceConstraint constraint for the location preferences * @param allocationTimeout rpcTimeout for allocating a new slot * @return Future which is completed with this execution once the slot has been assigned * or with an exception if an error occurred. * @throws IllegalExecutionStateException if this method has been called while not being in the CREATED state */ public CompletableFuture allocateAndAssignSlotForExecution( SlotProvider slotProvider, boolean queued, LocationPreferenceConstraint locationPreferenceConstraint, Time allocationTimeout) throws IllegalExecutionStateException { checkNotNull(slotProvider); final SlotSharingGroup sharingGroup = vertex.getJobVertex().getSlotSharingGroup(); final CoLocationConstraint locationConstraint = vertex.getLocationConstraint(); // sanity check if (locationConstraint != null && sharingGroup == null) { throw new IllegalStateException( "Trying to schedule with co-location constraint but without slot sharing allowed."); } // this method only works if the execution is in the state 'CREATED' if (transitionState(CREATED, SCHEDULED)) { final SlotSharingGroupId slotSharingGroupId = sharingGroup != null ? sharingGroup.getSlotSharingGroupId() : null; ScheduledUnit toSchedule = locationConstraint == null ? new ScheduledUnit(this, slotSharingGroupId) : new ScheduledUnit(this, slotSharingGroupId, locationConstraint); // try to extract previous allocation ids, if applicable, so that we can reschedule to the same slot ExecutionVertex executionVertex = getVertex(); AllocationID lastAllocation = executionVertex.getLatestPriorAllocation(); Collection previousAllocationIDs = lastAllocation != null ? Collections.singletonList(lastAllocation) : Collections.emptyList(); // calculate the preferred locations final CompletableFuture> preferredLocationsFuture = calculatePreferredLocations(locationPreferenceConstraint); final SlotRequestId slotRequestId = new SlotRequestId(); final CompletableFuture logicalSlotFuture = preferredLocationsFuture .thenCompose( (Collection preferredLocations) -> slotProvider.allocateSlot( slotRequestId, toSchedule, queued, new SlotProfile( computeResource(sharingGroup), preferredLocations, previousAllocationIDs, getVertex().getJobVertex().getJobVertex().getTags()), allocationTimeout)); // register call back to cancel slot request in case that the execution gets canceled releaseFuture.whenComplete( (Object ignored, Throwable throwable) -> { if (logicalSlotFuture.cancel(false)) { slotProvider.cancelSlotRequest( slotRequestId, slotSharingGroupId, locationConstraint, new FlinkException("Execution " + this + " was released.")); } }); return logicalSlotFuture.thenApply( (LogicalSlot logicalSlot) -> { if (tryAssignResource(logicalSlot)) { return this; } else { // release the slot logicalSlot.releaseSlot(new FlinkException("Could not assign logical slot to execution " + this + '.')); throw new CompletionException(new FlinkException("Could not assign slot " + logicalSlot + " to execution " + this + " because it has already been assigned ")); } }); } else { // call race, already deployed, or already done throw new IllegalExecutionStateException(this, CREATED, state); } } private ResourceProfile computeResource(SlotSharingGroup slotSharingGroup) { if (slotSharingGroup != null && slotSharingGroup.getResourceProfile() != null) { return slotSharingGroup.getResourceProfile(); } else { return getVertex().calculateResourceProfile(); } } /** * Deploys the execution to the previously assigned resource. * * @throws JobException if the execution cannot be deployed to the assigned resource */ public void deploy() throws JobException { final LogicalSlot slot = assignedResource; checkNotNull(slot, "In order to deploy the execution we first have to assign a resource via tryAssignResource."); // Check if the TaskManager died in the meantime // This only speeds up the response to TaskManagers failing concurrently to deployments. // The more general check is the rpcTimeout of the deployment call if (!slot.isAlive()) { throw new JobException("Target slot (TaskManager) for deployment is no longer alive."); } // make sure exactly one deployment call happens from the correct state // note: the transition from CREATED to DEPLOYING is for testing purposes only ExecutionState previous = this.state; if (previous == SCHEDULED || previous == CREATED) { if (!transitionState(previous, DEPLOYING)) { // race condition, someone else beat us to the deploying call. // this should actually not happen and indicates a race somewhere else throw new IllegalStateException("Cannot deploy task: Concurrent deployment call race."); } } else { // vertex may have been cancelled, or it was already scheduled throw new IllegalStateException("The vertex must be in CREATED or SCHEDULED state to be deployed. Found state " + previous); } if (this != slot.getPayload()) { throw new IllegalStateException( String.format("The execution %s has not been assigned to the assigned slot.", this)); } // race double check, did we fail/cancel and do we need to release the slot? if (this.state != DEPLOYING) { slot.releaseSlot(new FlinkException("Actual state of execution " + this + " (" + state + ") does not match expected state DEPLOYING.")); return; } if (LOG.isInfoEnabled()) { LOG.info(String.format("Deploying %s (attempt #%d) to slot %s_%s on %s", vertex.getTaskNameWithSubtaskIndex(), attemptNumber, slot.getTaskManagerLocation().getResourceID(), slot.getPhysicalSlotNumber(), getAssignedResourceLocation().getHostname())); } executor.execute( () -> { try { final TaskDeploymentDescriptor deployment = vertex.createDeploymentDescriptor( attemptId, slot, taskRestore, attemptNumber); // null taskRestore to let it be GC'ed taskRestore = null; final TaskManagerGateway taskManagerGateway = slot.getTaskManagerGateway(); final CompletableFuture submitResultFuture = taskManagerGateway.submitTask(deployment, rpcTimeout); submitResultFuture.whenCompleteAsync( (ack, failure) -> { // only respond to the failure case if (failure != null) { if (failure instanceof TimeoutException) { String taskname = vertex.getTaskNameWithSubtaskIndex() + " (" + attemptId + ')'; markFailed(new Exception( "Cannot deploy task " + taskname + " - TaskManager (" + getAssignedResourceLocation() + ") not responding after a rpcTimeout of " + rpcTimeout, failure)); } else { markFailed(failure); } } }, executor); } catch (Throwable t) { markFailed(t); } } ); } /** * Sends stop RPC call. */ public void stop() { final LogicalSlot slot = assignedResource; if (slot != null) { final TaskManagerGateway taskManagerGateway = slot.getTaskManagerGateway(); CompletableFuture stopResultFuture = FutureUtils.retry( () -> taskManagerGateway.stopTask(attemptId, rpcTimeout), NUM_STOP_CALL_TRIES, executor); stopResultFuture.exceptionally( failure -> { LOG.info("Stopping task was not successful.", failure); return null; }); } } public void cancel() { // depending on the previous state, we go directly to cancelled (no cancel call necessary) // -- or to canceling (cancel call needs to be sent to the task manager) // because of several possibly previous states, we need to again loop until we make a // successful atomic state transition while (true) { ExecutionState current = this.state; if (current == CANCELING || current == CANCELED) { // already taken care of, no need to cancel again return; } // these two are the common cases where we need to send a cancel call else if (current == RUNNING || current == DEPLOYING) { // try to transition to canceling, if successful, send the cancel call if (transitionState(current, CANCELING)) { sendCancelRpcCall(); return; } // else: fall through the loop } else if (current == FINISHED || current == FAILED) { // nothing to do any more. finished failed before it could be cancelled. // in any case, the task is removed from the TaskManager already sendFailIntermediateResultPartitionsRpcCall(); return; } else if (current == CREATED || current == SCHEDULED) { // from here, we can directly switch to cancelled, because no task has been deployed if (transitionState(current, CANCELED)) { // we skip the canceling state. set the timestamp, for a consistent appearance markTimestamp(CANCELING, getStateTimestamp(CANCELED)); // cancel the future in order to fail depending scheduling operations taskManagerLocationFuture.cancel(false); try { vertex.getExecutionGraph().deregisterExecution(this); releaseAssignedResource(new FlinkException("Execution " + this + " was cancelled.")); } finally { vertex.executionCanceled(this); } return; } // else: fall through the loop } else { throw new IllegalStateException(current.name()); } } } protected void updateConsumers(List> allConsumers) { final int numConsumers = allConsumers.size(); if (numConsumers > 1) { fail(new IllegalStateException("Currently, only a single consumer group per partition is supported.")); } else if (numConsumers == 0) { return; } // Update partition for vertices which are already running // Cache partition info for vertices which are scheduled but not running yet for (ExecutionEdge edge : allConsumers.get(0)) { final ExecutionVertex consumerVertex = edge.getTarget(); final Execution consumer = consumerVertex.getCurrentExecutionAttempt(); final ExecutionState consumerState = consumer.getState(); final IntermediateResultPartition partition = edge.getSource(); if (consumerState == RUNNING) { // cache the partition info and trigger a timer to group them and send in batch final Execution partitionExecution = partition.getProducer() .getCurrentExecutionAttempt(); consumerVertex.cachePartitionInfo(PartialInputChannelDeploymentDescriptor.fromEdge(partition, partitionExecution)); consumerVertex.getCurrentExecutionAttempt().sendPartitionInfoAsync(); } // ---------------------------------------------------------------- // Consumer is created, scheduled or deploying => cache input channel // deployment descriptors and send update message later // ---------------------------------------------------------------- else if (consumerState == CREATED || consumerState == SCHEDULED || consumerState == DEPLOYING) { final Execution partitionExecution = partition.getProducer() .getCurrentExecutionAttempt(); consumerVertex.cachePartitionInfo(PartialInputChannelDeploymentDescriptor .fromEdge(partition, partitionExecution)); // double check to resolve race conditions if (consumerVertex.getExecutionState() == RUNNING) { consumerVertex.getCurrentExecutionAttempt().sendPartitionInfoAsync(); } } } } /** * This method fails the vertex due to an external condition. The task will move to state FAILED. * If the task was in state RUNNING or DEPLOYING before, it will send a cancel call to the TaskManager. * * @param t The exception that caused the task to fail. */ @Override public void fail(Throwable t) { processFail(t, false); } /** * Request a stack trace sample from the task of this execution. * * @param sampleId of the stack trace sample * @param numSamples the sample should contain * @param delayBetweenSamples to wait * @param maxStackTraceDepth of the samples * @param timeout until the request times out * @return Future stack trace sample response */ public CompletableFuture requestStackTraceSample( int sampleId, int numSamples, Time delayBetweenSamples, int maxStackTraceDepth, Time timeout) { final LogicalSlot slot = assignedResource; if (slot != null) { final TaskManagerGateway taskManagerGateway = slot.getTaskManagerGateway(); return taskManagerGateway.requestStackTraceSample( attemptId, sampleId, numSamples, delayBetweenSamples, maxStackTraceDepth, timeout); } else { return FutureUtils.completedExceptionally(new Exception("The execution has no slot assigned.")); } } /** * Notify the task of this execution about a completed checkpoint. * * @param checkpointId of the completed checkpoint * @param timestamp of the completed checkpoint */ public void notifyCheckpointComplete(long checkpointId, long timestamp) { final LogicalSlot slot = assignedResource; if (slot != null) { final TaskManagerGateway taskManagerGateway = slot.getTaskManagerGateway(); taskManagerGateway.notifyCheckpointComplete(attemptId, getVertex().getJobId(), checkpointId, timestamp); } else { LOG.debug("The execution has no slot assigned. This indicates that the execution is " + "no longer running."); } } /** * Trigger a new checkpoint on the task of this execution. * * @param checkpointId of th checkpoint to trigger * @param timestamp of the checkpoint to trigger * @param checkpointOptions of the checkpoint to trigger */ public void triggerCheckpoint(long checkpointId, long timestamp, CheckpointOptions checkpointOptions) { final LogicalSlot slot = assignedResource; if (slot != null) { final TaskManagerGateway taskManagerGateway = slot.getTaskManagerGateway(); taskManagerGateway.triggerCheckpoint(attemptId, getVertex().getJobId(), checkpointId, timestamp, checkpointOptions); } else { LOG.debug("The execution has no slot assigned. This indicates that the execution is " + "no longer running."); } } /** * Let the execution begins to reconcile. */ public CompletableFuture reconcile() { checkState(reconcileFuture == null); reconcileFuture = new CompletableFuture<>(); reconcileFuture.whenComplete( (value, throwable) -> { if (throwable == null && value == null) { sendPartitionInfoAsync(); } }); if (ExecutionState.CREATED.equals(state) || ExecutionState.FINISHED.equals(state)) { reconcileFuture.complete(null); } else { getVertex().getExecutionGraph().getFutureExecutorService().schedule( () -> { if (!reconcileFuture.isDone()) { reconcileFuture.complete(getAttemptId()); } }, getVertex().getExecutionGraph().getJobConfiguration().getLong(JobManagerOptions.JOB_RECONCILE_TIMEOUT), TimeUnit.SECONDS); } return reconcileFuture; } /** * Reconcile status with the info reported by task executor. * * @param reportedState The state reported by task executor. * @param executionId The execution attempt id reported. * @param attemptNumber The attempt number reported. * @param startTimestamp The start time reported. * @param slot The logic slot reported. * @return */ public boolean reconcileStatus( ExecutionState reportedState, ExecutionAttemptID executionId, int attemptNumber, long startTimestamp, LogicalSlot slot ) { if (!reportedState.equals(state)) { LOG.info("Reconcile {} fail as expected status {} with actural {}.", vertex.getTaskNameWithSubtaskIndex(), state, reportedState); return false; } else if (reconcileFuture.isDone()) { LOG.info("Reconcile {} fail as reconcile has finished.", vertex.getTaskNameWithSubtaskIndex()); return false; } if (ASSIGNED_SLOT_UPDATER.compareAndSet(this, null, slot) && slot.tryAssignPayload(this)) { if (taskManagerLocationFuture.isDone() && !slot.getTaskManagerLocation().equals(taskManagerLocationFuture.getNow(null))) { ASSIGNED_SLOT_UPDATER.compareAndSet(this, slot, null); reconcileFuture.complete(attemptId); LOG.info("Reconcile {} fail as has already has a different location.", vertex.getTaskNameWithSubtaskIndex()); return false; } else if (!taskManagerLocationFuture.isDone() && !taskManagerLocationFuture.complete(slot.getTaskManagerLocation())) { ASSIGNED_SLOT_UPDATER.compareAndSet(this, slot, null); reconcileFuture.complete(attemptId); LOG.info("Reconcile {} fail as has already has a location.", vertex.getTaskNameWithSubtaskIndex()); return false; } else { if (reconcileFuture.complete(null)) { assignedAllocationID = slot.getAllocationId(); this.attemptId = executionId; this.attemptNumber = attemptNumber; markTimestamp(ExecutionState.CREATED, startTimestamp); LOG.info("Reconcile {} success, the state is {}.", vertex.getTaskNameWithSubtaskIndex(), reportedState); return true; } else { ASSIGNED_SLOT_UPDATER.compareAndSet(this, slot, null); LOG.info("Reconcile {} fail as it has reconciled finished.", vertex.getTaskNameWithSubtaskIndex()); return false; } } } else { LOG.info("Reconcile {} fail as has already assigned a slot {}.", vertex.getTaskNameWithSubtaskIndex(), assignedResource); return false; } } /** * Set the state to the state recovered when job master failover. */ public void recoverState(ExecutionState recoveredState) { transitionState(state, recoveredState); } // -------------------------------------------------------------------------------------------- // Callbacks // -------------------------------------------------------------------------------------------- /** * This method marks the task as failed, but will make no attempt to remove task execution from the task manager. * It is intended for cases where the task is known not to be running, or then the TaskManager reports failure * (in which case it has already removed the task). * * @param t The exception that caused the task to fail. */ void markFailed(Throwable t) { processFail(t, true); } void markFailed(Throwable t, Map> userAccumulators, IOMetrics metrics) { processFail(t, true, userAccumulators, metrics); } void markFinished() { markFinished(null, null); } void markFinished(Map> userAccumulators, IOMetrics metrics) { // this call usually comes during RUNNING, but may also come while still in deploying (very fast tasks!) while (true) { ExecutionState current = this.state; if (current == RUNNING || current == DEPLOYING) { if (transitionState(current, FINISHED)) { try { getVertex().finishPartitionsAndNotify(); updateAccumulatorsAndMetrics(userAccumulators, metrics); releaseAssignedResource(null); vertex.getExecutionGraph().deregisterExecution(this); } finally { vertex.executionFinished(this); } return; } } else if (current == CANCELING) { // we sent a cancel call, and the task manager finished before it arrived. We // will never get a CANCELED call back from the job manager cancelingComplete(userAccumulators, metrics); return; } else if (current == CANCELED || current == FAILED) { if (LOG.isDebugEnabled()) { LOG.debug("Task FINISHED, but concurrently went to state " + state); } return; } else { // this should not happen, we need to fail this markFailed(new Exception("Vertex received FINISHED message while being in state " + state)); return; } } } void cancelingComplete() { cancelingComplete(null, null); } void cancelingComplete(Map> userAccumulators, IOMetrics metrics) { // the taskmanagers can themselves cancel tasks without an external trigger, if they find that the // network stack is canceled (for example by a failing / canceling receiver or sender // this is an artifact of the old network runtime, but for now we need to support task transitions // from running directly to canceled while (true) { ExecutionState current = this.state; if (current == CANCELED) { return; } else if (current == CANCELING || current == RUNNING || current == DEPLOYING) { updateAccumulatorsAndMetrics(userAccumulators, metrics); if (transitionState(current, CANCELED)) { try { releaseAssignedResource(new FlinkException("Execution " + this + " was cancelled.")); vertex.getExecutionGraph().deregisterExecution(this); } finally { vertex.executionCanceled(this); } return; } // else fall through the loop } else { // failing in the meantime may happen and is no problem. // anything else is a serious problem !!! if (current != FAILED) { String message = String.format("Asynchronous race: Found %s in state %s after successful cancel call.", vertex.getTaskNameWithSubtaskIndex(), state); LOG.error(message); vertex.getExecutionGraph().failGlobal(new Exception(message)); } return; } } } void cachePartitionInfo(PartialInputChannelDeploymentDescriptor partitionInfo) { partialInputChannelDeploymentDescriptors.add(partitionInfo); } void sendPartitionInfos() { if (vertex.getExecutionGraph().getGraphManager().isReplaying()) { return; } synchronized (updatePartitionLock) { updatePartitionFuture = null; } // check if the ExecutionVertex has already been archived and thus cleared the // partial partition infos queue if (partialInputChannelDeploymentDescriptors != null && !partialInputChannelDeploymentDescriptors.isEmpty()) { PartialInputChannelDeploymentDescriptor partialInputChannelDeploymentDescriptor; List partitionInfos = new ArrayList<>(partialInputChannelDeploymentDescriptors.size()); while ((partialInputChannelDeploymentDescriptor = partialInputChannelDeploymentDescriptors.poll()) != null) { partitionInfos.add( new PartitionInfo( partialInputChannelDeploymentDescriptor.getResultId(), partialInputChannelDeploymentDescriptor.createInputChannelDeploymentDescriptor( this, getVertex().getExecutionGraph().getResultPartitionLocationTrackerProxy()))); } sendUpdatePartitionInfoRpcCall(partitionInfos); } } void sendPartitionInfoAsync() { if (reconcileFuture != null && !reconcileFuture.isDone()) { return; } synchronized (updatePartitionLock) { if (updatePartitionFuture == null) { updatePartitionFuture = getVertex().getExecutionGraph().getFutureExecutorService().schedule( () -> { sendPartitionInfos(); }, vertex.getExecutionGraph().getUpdatePartitionInfoSendInterval(), TimeUnit.MILLISECONDS); } } } // -------------------------------------------------------------------------------------------- // Internal Actions // -------------------------------------------------------------------------------------------- private boolean processFail(Throwable t, boolean isCallback) { return processFail(t, isCallback, null, null); } private boolean processFail(Throwable t, boolean isCallback, Map> userAccumulators, IOMetrics metrics) { // damn, we failed. This means only that we keep our books and notify our parent JobExecutionVertex // the actual computation on the task manager is cleaned up by the TaskManager that noticed the failure // we may need to loop multiple times (in the presence of concurrent calls) in order to // atomically switch to failed while (true) { ExecutionState current = this.state; if (current == FAILED) { // already failed. It is enough to remember once that we failed (its sad enough) return false; } if (current == CANCELED || current == FINISHED) { // we are already aborting or are already aborted or we are already finished if (LOG.isDebugEnabled()) { LOG.debug("Ignoring transition of vertex {} to {} while being {}.", getVertexWithAttempt(), FAILED, current); } return false; } if (current == CANCELING) { cancelingComplete(userAccumulators, metrics); return false; } if (transitionState(current, FAILED, t)) { // success (in a manner of speaking) this.failureCause = t; updateAccumulatorsAndMetrics(userAccumulators, metrics); try { releaseAssignedResource(t); vertex.getExecutionGraph().deregisterExecution(this); } finally { vertex.executionFailed(this, t); } if (!isCallback && (current == RUNNING || current == DEPLOYING)) { if (LOG.isDebugEnabled()) { LOG.debug("Sending out cancel request, to remove task execution from TaskManager."); } try { if (assignedResource != null) { sendCancelRpcCall(); } } catch (Throwable tt) { // no reason this should ever happen, but log it to be safe LOG.error("Error triggering cancel call while marking task {} as failed.", getVertex().getTaskNameWithSubtaskIndex(), tt); } } // leave the loop return true; } } } boolean switchToRunning() { if (transitionState(DEPLOYING, RUNNING)) { sendPartitionInfoAsync(); return true; } else { // something happened while the call was in progress. // it can mean: // - canceling, while deployment was in progress. state is now canceling, or canceled, if the response overtook // - finishing (execution and finished call overtook the deployment answer, which is possible and happens for fast tasks) // - failed (execution, failure, and failure message overtook the deployment answer) ExecutionState currentState = this.state; if (currentState == FINISHED || currentState == CANCELED) { // do nothing, the task was really fast (nice) // or it was canceled really fast } else if (currentState == CANCELING || currentState == FAILED) { if (LOG.isDebugEnabled()) { // this log statement is guarded because the 'getVertexWithAttempt()' method // performs string concatenations LOG.debug("Concurrent canceling/failing of {} while deployment was in progress.", getVertexWithAttempt()); } sendCancelRpcCall(); } else { String message = String.format("Concurrent unexpected state transition of task %s to %s while deployment was in progress.", getVertexWithAttempt(), currentState); if (LOG.isDebugEnabled()) { LOG.debug(message); } // undo the deployment sendCancelRpcCall(); // record the failure markFailed(new Exception(message)); } return false; } } /** * This method sends a CancelTask message to the instance of the assigned slot. * * The sending is tried up to NUM_CANCEL_CALL_TRIES times. */ private void sendCancelRpcCall() { final LogicalSlot slot = assignedResource; if (slot != null) { final TaskManagerGateway taskManagerGateway = slot.getTaskManagerGateway(); CompletableFuture cancelResultFuture = FutureUtils.retry( () -> taskManagerGateway.cancelTask(attemptId, rpcTimeout), NUM_CANCEL_CALL_TRIES, executor); cancelResultFuture.whenCompleteAsync( (ack, failure) -> { if (failure != null) { fail(new Exception("Task could not be canceled.", failure)); } }, executor); } } private void sendFailIntermediateResultPartitionsRpcCall() { final LogicalSlot slot = assignedResource; if (slot != null) { final TaskManagerGateway taskManagerGateway = slot.getTaskManagerGateway(); // TODO For some tests this could be a problem when querying too early if all resources were released taskManagerGateway.failPartition(attemptId); } } /** * Update the partition infos on the assigned resource. * * @param partitionInfos for the remote task */ private void sendUpdatePartitionInfoRpcCall( final Iterable partitionInfos) { final LogicalSlot slot = assignedResource; if (slot != null) { final TaskManagerGateway taskManagerGateway = slot.getTaskManagerGateway(); final TaskManagerLocation taskManagerLocation = slot.getTaskManagerLocation(); CompletableFuture updatePartitionsResultFuture = taskManagerGateway.updatePartitions( attemptId, partitionInfos, rpcTimeout); updatePartitionsResultFuture.whenCompleteAsync( (ack, failure) -> { // fail if there was a failure if (failure != null) { fail(new IllegalStateException("Update task on TaskManager " + taskManagerLocation + " failed due to:", failure)); } }, executor); } } /** * Releases the assigned resource and completes the release future * once the assigned resource has been successfully released. * * @param cause for the resource release, null if none */ private void releaseAssignedResource(@Nullable Throwable cause) { final LogicalSlot slot = assignedResource; if (slot != null) { slot.releaseSlot(cause).whenComplete( (Object ignored, Throwable throwable) -> { if (throwable != null) { releaseFuture.completeExceptionally(throwable); } else { releaseFuture.complete(null); } }); } else { // no assigned resource --> we can directly complete the release future releaseFuture.complete(null); } } // -------------------------------------------------------------------------------------------- // Miscellaneous // -------------------------------------------------------------------------------------------- /** * Calculates the preferred locations based on the location preference constraint. * * @param locationPreferenceConstraint constraint for the location preference * @return Future containing the collection of preferred locations. This might not be completed if not all inputs * have been a resource assigned. */ @VisibleForTesting public CompletableFuture> calculatePreferredLocations(LocationPreferenceConstraint locationPreferenceConstraint) { final Collection> preferredLocationFutures = getVertex().getPreferredLocations(); final CompletableFuture> preferredLocationsFuture; switch(locationPreferenceConstraint) { case ALL: preferredLocationsFuture = FutureUtils.combineAll(preferredLocationFutures); break; case ANY: final ArrayList completedTaskManagerLocations = new ArrayList<>(preferredLocationFutures.size()); for (CompletableFuture preferredLocationFuture : preferredLocationFutures) { if (preferredLocationFuture.isDone() && !preferredLocationFuture.isCompletedExceptionally()) { final TaskManagerLocation taskManagerLocation = preferredLocationFuture.getNow(null); if (taskManagerLocation == null) { throw new FlinkRuntimeException("TaskManagerLocationFuture was completed with null. This indicates a programming bug."); } completedTaskManagerLocations.add(taskManagerLocation); } } preferredLocationsFuture = CompletableFuture.completedFuture(completedTaskManagerLocations); break; default: throw new RuntimeException("Unknown LocationPreferenceConstraint " + locationPreferenceConstraint + '.'); } return preferredLocationsFuture; } private boolean transitionState(ExecutionState currentState, ExecutionState targetState) { return transitionState(currentState, targetState, null); } private boolean transitionState(ExecutionState currentState, ExecutionState targetState, Throwable error) { // sanity check if (currentState.isTerminal()) { throw new IllegalStateException("Cannot leave terminal state " + currentState + " to transition to " + targetState + '.'); } if (STATE_UPDATER.compareAndSet(this, currentState, targetState)) { markTimestamp(targetState); if (error == null) { LOG.info("{} ({}) switched from {} to {}.", getVertex().getTaskNameWithSubtaskIndex(), getAttemptId(), currentState, targetState); } else { LOG.info("{} ({}) switched from {} to {}.", getVertex().getTaskNameWithSubtaskIndex(), getAttemptId(), currentState, targetState, error); } if (targetState.isTerminal()) { // complete the terminal state future terminalStateFuture.complete(targetState); } // make sure that the state transition completes normally. // potential errors (in listeners may not affect the main logic) try { vertex.notifyStateTransition(this, targetState, error); } catch (Throwable t) { LOG.error("Error while notifying execution graph of execution state transition.", t); } return true; } else { return false; } } private void markTimestamp(ExecutionState state) { markTimestamp(state, System.currentTimeMillis()); } private void markTimestamp(ExecutionState state, long timestamp) { this.stateTimestamps[state.ordinal()] = timestamp; } public String getVertexWithAttempt() { return vertex.getTaskNameWithSubtaskIndex() + " - execution #" + attemptNumber; } // ------------------------------------------------------------------------ // Accumulators // ------------------------------------------------------------------------ /** * Update accumulators (discarded when the Execution has already been terminated). * @param userAccumulators the user accumulators */ public void setAccumulators(Map> userAccumulators) { synchronized (accumulatorLock) { if (!state.isTerminal()) { this.userAccumulators = userAccumulators; } } } public Map> getUserAccumulators() { return userAccumulators; } @Override public StringifiedAccumulatorResult[] getUserAccumulatorsStringified() { Map>> accumulators = userAccumulators == null ? null : userAccumulators.entrySet() .stream() .collect(Collectors.toMap(Map.Entry::getKey, entry -> OptionalFailure.of(entry.getValue()))); return StringifiedAccumulatorResult.stringifyAccumulatorResults(accumulators); } @Override public int getParallelSubtaskIndex() { return getVertex().getParallelSubtaskIndex(); } @Override public IOMetrics getIOMetrics() { return ioMetrics; } private void updateAccumulatorsAndMetrics(Map> userAccumulators, IOMetrics metrics) { if (userAccumulators != null) { synchronized (accumulatorLock) { this.userAccumulators = userAccumulators; } } if (metrics != null) { this.ioMetrics = metrics; } } // ------------------------------------------------------------------------ // Standard utilities // ------------------------------------------------------------------------ @Override public String toString() { final LogicalSlot slot = assignedResource; return String.format("Attempt #%d (%s) @ %s - [%s]", attemptNumber, vertex.getTaskNameWithSubtaskIndex(), (slot == null ? "(unassigned)" : slot), state); } @Override public ArchivedExecution archive() { return new ArchivedExecution(this); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy