All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.flink.runtime.taskmanager.Task Maven / Gradle / Ivy

There is a newer version: 1.3.3
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.runtime.taskmanager;

import org.apache.flink.api.common.ExecutionConfig;
import org.apache.flink.api.common.JobID;
import org.apache.flink.api.common.TaskInfo;
import org.apache.flink.api.common.cache.DistributedCache;
import org.apache.flink.configuration.ConfigConstants;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.core.fs.Path;
import org.apache.flink.runtime.metrics.groups.TaskMetricGroup;
import org.apache.flink.runtime.accumulators.AccumulatorRegistry;
import org.apache.flink.runtime.blob.BlobKey;
import org.apache.flink.runtime.broadcast.BroadcastVariableManager;
import org.apache.flink.runtime.deployment.InputGateDeploymentDescriptor;
import org.apache.flink.runtime.deployment.ResultPartitionDeploymentDescriptor;
import org.apache.flink.runtime.deployment.TaskDeploymentDescriptor;
import org.apache.flink.runtime.execution.CancelTaskException;
import org.apache.flink.runtime.execution.Environment;
import org.apache.flink.runtime.execution.ExecutionState;
import org.apache.flink.runtime.execution.librarycache.LibraryCacheManager;
import org.apache.flink.runtime.executiongraph.ExecutionAttemptID;
import org.apache.flink.runtime.filecache.FileCache;
import org.apache.flink.runtime.instance.ActorGateway;
import org.apache.flink.runtime.io.disk.iomanager.IOManager;
import org.apache.flink.runtime.io.network.NetworkEnvironment;
import org.apache.flink.runtime.io.network.api.writer.ResultPartitionWriter;
import org.apache.flink.runtime.io.network.partition.ResultPartition;
import org.apache.flink.runtime.io.network.partition.ResultPartitionID;
import org.apache.flink.runtime.io.network.partition.consumer.SingleInputGate;
import org.apache.flink.runtime.jobgraph.IntermediateDataSetID;
import org.apache.flink.runtime.jobgraph.IntermediateResultPartitionID;
import org.apache.flink.runtime.jobgraph.JobVertexID;
import org.apache.flink.runtime.jobgraph.tasks.AbstractInvokable;
import org.apache.flink.runtime.jobgraph.tasks.StatefulTask;
import org.apache.flink.runtime.jobgraph.tasks.StoppableTask;
import org.apache.flink.runtime.memory.MemoryManager;
import org.apache.flink.runtime.messages.TaskManagerMessages.FatalError;
import org.apache.flink.runtime.messages.TaskMessages.FailTask;
import org.apache.flink.runtime.messages.TaskMessages.TaskInFinalState;
import org.apache.flink.runtime.messages.TaskMessages.UpdateTaskExecutionState;
import org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint;
import org.apache.flink.runtime.state.StateHandle;
import org.apache.flink.runtime.state.StateUtils;
import org.apache.flink.util.SerializedValue;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import scala.concurrent.duration.FiniteDuration;

import java.io.IOException;
import java.net.URL;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.RejectedExecutionException;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicReferenceFieldUpdater;

import static org.apache.flink.util.Preconditions.checkNotNull;

/**
 * The Task represents one execution of a parallel subtask on a TaskManager.
 * A Task wraps a Flink operator (which may be a user function) and
 * runs it, providing all services necessary for example to consume input data,
 * produce its results (intermediate result partitions) and communicate
 * with the JobManager.
 *
 * 

 * <p>The Flink operators (implemented as subclasses of
 * {@link org.apache.flink.runtime.jobgraph.tasks.AbstractInvokable}) have only data
 * readers, -writers, and certain event callbacks. The task connects those to the
 * network stack and actor messages, and tracks the state of the execution and
 * handles exceptions.
 *

 * <p>Tasks have no knowledge about how they relate to other tasks, or whether they
 * are the first attempt to execute the task, or a repeated attempt. All of that
 * is only known to the JobManager. All the task knows are its own runnable code,
 * the task's configuration, and the IDs of the intermediate results to consume and
 * produce (if any).
 *

Each Task is run by one dedicated thread. */ public class Task implements Runnable { /** The class logger. */ private static final Logger LOG = LoggerFactory.getLogger(Task.class); /** The tread group that contains all task threads */ private static final ThreadGroup TASK_THREADS_GROUP = new ThreadGroup("Flink Task Threads"); /** For atomic state updates */ private static final AtomicReferenceFieldUpdater STATE_UPDATER = AtomicReferenceFieldUpdater.newUpdater(Task.class, ExecutionState.class, "executionState"); // ------------------------------------------------------------------------ // Constant fields that are part of the initial Task construction // ------------------------------------------------------------------------ /** The job that the task belongs to */ private final JobID jobId; /** The vertex in the JobGraph whose code the task executes */ private final JobVertexID vertexId; /** The execution attempt of the parallel subtask */ private final ExecutionAttemptID executionId; /** TaskInfo object for this task */ private final TaskInfo taskInfo; /** The name of the task, including subtask indexes */ private final String taskNameWithSubtask; /** The job-wide configuration object */ private final Configuration jobConfiguration; /** The task-specific configuration */ private final Configuration taskConfiguration; /** The jar files used by this task */ private final List requiredJarFiles; /** The classpaths used by this task */ private final List requiredClasspaths; /** The name of the class that holds the invokable code */ private final String nameOfInvokableClass; /** Access to task manager configuration and host names*/ private final TaskManagerRuntimeInfo taskManagerConfig; /** The memory manager to be used by this task */ private final MemoryManager memoryManager; /** The I/O manager to be used by this task */ private final IOManager ioManager; /** The BroadcastVariableManager to be used by this task */ private final BroadcastVariableManager 
broadcastVariableManager; /** Serialized version of the job specific execution configuration (see {@link ExecutionConfig}). */ private final SerializedValue serializedExecutionConfig; private final ResultPartition[] producedPartitions; private final ResultPartitionWriter[] writers; private final SingleInputGate[] inputGates; private final Map inputGatesById; /** Gateway to the TaskManager that spawned this task */ private final ActorGateway taskManager; /** Gateway to the JobManager */ private final ActorGateway jobManager; /** All actors that want to be notified about changes in the task's execution state */ private final List executionListenerActors; /** The timeout for all ask operations on actors */ private final FiniteDuration actorAskTimeout; /** The library cache, from which the task can request its required JAR files */ private final LibraryCacheManager libraryCache; /** The cache for user-defined files that the invokable requires */ private final FileCache fileCache; /** The gateway to the network stack, which handles inputs and produced results */ private final NetworkEnvironment network; /** The registry of this task which enables live reporting of accumulators */ private final AccumulatorRegistry accumulatorRegistry; /** The thread that executes the task */ private final Thread executingThread; /** Parent group for all metrics of this task */ private final TaskMetricGroup metrics; // ------------------------------------------------------------------------ // Fields that control the task execution. 
All these fields are volatile // (which means that they introduce memory barriers), to establish // proper happens-before semantics on parallel modification // ------------------------------------------------------------------------ /** atomic flag that makes sure the invokable is canceled exactly once upon error */ private final AtomicBoolean invokableHasBeenCanceled; /** The invokable of this task, if initialized */ private volatile AbstractInvokable invokable; /** The current execution state of the task */ private volatile ExecutionState executionState = ExecutionState.CREATED; /** The observed exception, in case the task execution failed */ private volatile Throwable failureCause; /** Serial executor for asynchronous calls (checkpoints, etc), lazily initialized */ private volatile ExecutorService asyncCallDispatcher; /** The handle to the state that the operator was initialized with. Will be set to null after the * initialization, to be memory friendly */ private volatile SerializedValue> operatorState; /** Initialized from the Flink configuration. May also be set at the ExecutionConfig */ private long taskCancellationInterval; /** *

     * <p><b>IMPORTANT:</b> This constructor may not start any work that would need to
     * be undone in the case of a failing task deployment.

*/ public Task(TaskDeploymentDescriptor tdd, MemoryManager memManager, IOManager ioManager, NetworkEnvironment networkEnvironment, BroadcastVariableManager bcVarManager, ActorGateway taskManagerActor, ActorGateway jobManagerActor, FiniteDuration actorAskTimeout, LibraryCacheManager libraryCache, FileCache fileCache, TaskManagerRuntimeInfo taskManagerConfig, TaskMetricGroup metricGroup) { this.taskInfo = checkNotNull(tdd.getTaskInfo()); this.jobId = checkNotNull(tdd.getJobID()); this.vertexId = checkNotNull(tdd.getVertexID()); this.executionId = checkNotNull(tdd.getExecutionId()); this.taskNameWithSubtask = taskInfo.getTaskNameWithSubtasks(); this.jobConfiguration = checkNotNull(tdd.getJobConfiguration()); this.taskConfiguration = checkNotNull(tdd.getTaskConfiguration()); this.requiredJarFiles = checkNotNull(tdd.getRequiredJarFiles()); this.requiredClasspaths = checkNotNull(tdd.getRequiredClasspaths()); this.nameOfInvokableClass = checkNotNull(tdd.getInvokableClassName()); this.operatorState = tdd.getOperatorState(); this.serializedExecutionConfig = checkNotNull(tdd.getSerializedExecutionConfig()); this.taskCancellationInterval = jobConfiguration.getLong( ConfigConstants.TASK_CANCELLATION_INTERVAL_MILLIS, ConfigConstants.DEFAULT_TASK_CANCELLATION_INTERVAL_MILLIS); this.memoryManager = checkNotNull(memManager); this.ioManager = checkNotNull(ioManager); this.broadcastVariableManager = checkNotNull(bcVarManager); this.accumulatorRegistry = new AccumulatorRegistry(jobId, executionId); this.jobManager = checkNotNull(jobManagerActor); this.taskManager = checkNotNull(taskManagerActor); this.actorAskTimeout = checkNotNull(actorAskTimeout); this.libraryCache = checkNotNull(libraryCache); this.fileCache = checkNotNull(fileCache); this.network = checkNotNull(networkEnvironment); this.taskManagerConfig = checkNotNull(taskManagerConfig); this.executionListenerActors = new CopyOnWriteArrayList(); this.metrics = metricGroup; // create the reader and writer structures final String 
taskNameWithSubtaskAndId = taskNameWithSubtask + " (" + executionId + ')'; List partitions = tdd.getProducedPartitions(); List consumedPartitions = tdd.getInputGates(); // Produced intermediate result partitions this.producedPartitions = new ResultPartition[partitions.size()]; this.writers = new ResultPartitionWriter[partitions.size()]; for (int i = 0; i < this.producedPartitions.length; i++) { ResultPartitionDeploymentDescriptor desc = partitions.get(i); ResultPartitionID partitionId = new ResultPartitionID(desc.getPartitionId(), executionId); this.producedPartitions[i] = new ResultPartition( taskNameWithSubtaskAndId, jobId, partitionId, desc.getPartitionType(), desc.getEagerlyDeployConsumers(), desc.getNumberOfSubpartitions(), networkEnvironment.getPartitionManager(), networkEnvironment.getPartitionConsumableNotifier(), ioManager, networkEnvironment.getDefaultIOMode()); this.writers[i] = new ResultPartitionWriter(this.producedPartitions[i]); } // Consumed intermediate result partitions this.inputGates = new SingleInputGate[consumedPartitions.size()]; this.inputGatesById = new HashMap(); for (int i = 0; i < this.inputGates.length; i++) { SingleInputGate gate = SingleInputGate.create( taskNameWithSubtaskAndId, jobId, executionId, consumedPartitions.get(i), networkEnvironment, metricGroup.getIOMetricGroup()); this.inputGates[i] = gate; inputGatesById.put(gate.getConsumedResultId(), gate); } invokableHasBeenCanceled = new AtomicBoolean(false); // finally, create the executing thread, but do not start it executingThread = new Thread(TASK_THREADS_GROUP, this, taskNameWithSubtask); } // ------------------------------------------------------------------------ // Accessors // ------------------------------------------------------------------------ public JobID getJobID() { return jobId; } public JobVertexID getJobVertexId() { return vertexId; } public ExecutionAttemptID getExecutionId() { return executionId; } public TaskInfo getTaskInfo() { return taskInfo; } public 
Configuration getJobConfiguration() { return jobConfiguration; } public Configuration getTaskConfiguration() { return this.taskConfiguration; } public ResultPartitionWriter[] getAllWriters() { return writers; } public SingleInputGate[] getAllInputGates() { return inputGates; } public ResultPartition[] getProducedPartitions() { return producedPartitions; } public SingleInputGate getInputGateById(IntermediateDataSetID id) { return inputGatesById.get(id); } public AccumulatorRegistry getAccumulatorRegistry() { return accumulatorRegistry; } public Thread getExecutingThread() { return executingThread; } // ------------------------------------------------------------------------ // Task Execution // ------------------------------------------------------------------------ /** * Returns the current execution state of the task. * @return The current execution state of the task. */ public ExecutionState getExecutionState() { return this.executionState; } /** * Checks whether the task has failed, is canceled, or is being canceled at the moment. * @return True is the task in state FAILED, CANCELING, or CANCELED, false otherwise. */ public boolean isCanceledOrFailed() { return executionState == ExecutionState.CANCELING || executionState == ExecutionState.CANCELED || executionState == ExecutionState.FAILED; } /** * If the task has failed, this method gets the exception that caused this task to fail. * Otherwise this method returns null. * * @return The exception that caused the task to fail, or null, if the task has not failed. */ public Throwable getFailureCause() { return failureCause; } /** * Starts the task's thread. 
*/ public void startTaskThread() { executingThread.start(); } /** * The core work method that bootstraps the task and executes it code */ @Override public void run() { // ---------------------------- // Initial State transition // ---------------------------- while (true) { ExecutionState current = this.executionState; if (current == ExecutionState.CREATED) { if (STATE_UPDATER.compareAndSet(this, ExecutionState.CREATED, ExecutionState.DEPLOYING)) { // success, we can start our work break; } } else if (current == ExecutionState.FAILED) { // we were immediately failed. tell the TaskManager that we reached our final state notifyFinalState(); return; } else if (current == ExecutionState.CANCELING) { if (STATE_UPDATER.compareAndSet(this, ExecutionState.CANCELING, ExecutionState.CANCELED)) { // we were immediately canceled. tell the TaskManager that we reached our final state notifyFinalState(); return; } } else { throw new IllegalStateException("Invalid state for beginning of task operation"); } } // all resource acquisitions and registrations from here on // need to be undone in the end Map> distributedCacheEntries = new HashMap>(); AbstractInvokable invokable = null; ClassLoader userCodeClassLoader = null; try { // ---------------------------- // Task Bootstrap - We periodically // check for canceling as a shortcut // ---------------------------- // first of all, get a user-code classloader // this may involve downloading the job's JAR files and/or classes LOG.info("Loading JAR files for task " + taskNameWithSubtask); userCodeClassLoader = createUserCodeClassloader(libraryCache); final ExecutionConfig executionConfig = serializedExecutionConfig.deserializeValue(userCodeClassLoader); if (executionConfig.getTaskCancellationInterval() >= 0) { // override task cancellation interval from Flink config if set in ExecutionConfig taskCancellationInterval = executionConfig.getTaskCancellationInterval(); } // now load the task's invokable code invokable = 
loadAndInstantiateInvokable(userCodeClassLoader, nameOfInvokableClass); if (isCanceledOrFailed()) { throw new CancelTaskException(); } // ---------------------------------------------------------------- // register the task with the network stack // this operation may fail if the system does not have enough // memory to run the necessary data exchanges // the registration must also strictly be undone // ---------------------------------------------------------------- LOG.info("Registering task at network: " + this); network.registerTask(this); // next, kick off the background copying of files for the distributed cache try { for (Map.Entry entry : DistributedCache.readFileInfoFromConfig(jobConfiguration)) { LOG.info("Obtaining local cache file for '" + entry.getKey() + '\''); Future cp = fileCache.createTmpFile(entry.getKey(), entry.getValue(), jobId); distributedCacheEntries.put(entry.getKey(), cp); } } catch (Exception e) { throw new Exception("Exception while adding files to distributed cache.", e); } if (isCanceledOrFailed()) { throw new CancelTaskException(); } // ---------------------------------------------------------------- // call the user code initialization methods // ---------------------------------------------------------------- TaskInputSplitProvider splitProvider = new TaskInputSplitProvider(jobManager, jobId, vertexId, executionId, userCodeClassLoader, actorAskTimeout); Environment env = new RuntimeEnvironment(jobId, vertexId, executionId, executionConfig, taskInfo, jobConfiguration, taskConfiguration, userCodeClassLoader, memoryManager, ioManager, broadcastVariableManager, accumulatorRegistry, splitProvider, distributedCacheEntries, writers, inputGates, jobManager, taskManagerConfig, metrics, this); // let the task code create its readers and writers invokable.setEnvironment(env); // the very last thing before the actual execution starts running is to inject // the state into the task. 
the state is non-empty if this is an execution // of a task that failed but had backuped state from a checkpoint // get our private reference onto the stack (be safe against concurrent changes) SerializedValue> operatorState = this.operatorState; if (operatorState != null) { if (invokable instanceof StatefulTask) { try { StateHandle state = operatorState.deserializeValue(userCodeClassLoader); StatefulTask op = (StatefulTask) invokable; StateUtils.setOperatorState(op, state); } catch (Exception e) { throw new RuntimeException("Failed to deserialize state handle and setup initial operator state.", e); } } else { throw new IllegalStateException("Found operator state for a non-stateful task invokable"); } } // be memory and GC friendly - since the code stays in invoke() for a potentially long time, // we clear the reference to the state handle //noinspection UnusedAssignment operatorState = null; this.operatorState = null; // ---------------------------------------------------------------- // actual task core work // ---------------------------------------------------------------- // we must make strictly sure that the invokable is accessible to the cancel() call // by the time we switched to running. this.invokable = invokable; // switch to the RUNNING state, if that fails, we have been canceled/failed in the meantime if (!STATE_UPDATER.compareAndSet(this, ExecutionState.DEPLOYING, ExecutionState.RUNNING)) { throw new CancelTaskException(); } // notify everyone that we switched to running. especially the TaskManager needs // to know this! 
notifyObservers(ExecutionState.RUNNING, null); taskManager.tell(new UpdateTaskExecutionState( new TaskExecutionState(jobId, executionId, ExecutionState.RUNNING))); // make sure the user code classloader is accessible thread-locally executingThread.setContextClassLoader(userCodeClassLoader); // run the invokable invokable.invoke(); // make sure, we enter the catch block if the task leaves the invoke() method due // to the fact that it has been canceled if (isCanceledOrFailed()) { throw new CancelTaskException(); } // ---------------------------------------------------------------- // finalization of a successful execution // ---------------------------------------------------------------- // finish the produced partitions. if this fails, we consider the execution failed. for (ResultPartition partition : producedPartitions) { if (partition != null) { partition.finish(); } } // try to mark the task as finished // if that fails, the task was canceled/failed in the meantime if (STATE_UPDATER.compareAndSet(this, ExecutionState.RUNNING, ExecutionState.FINISHED)) { notifyObservers(ExecutionState.FINISHED, null); } else { throw new CancelTaskException(); } } catch (Throwable t) { // ---------------------------------------------------------------- // the execution failed. either the invokable code properly failed, or // an exception was thrown as a side effect of cancelling // ---------------------------------------------------------------- try { // transition into our final state. 
we should be either in DEPLOYING, RUNNING, CANCELING, or FAILED // loop for multiple retries during concurrent state changes via calls to cancel() or // to failExternally() while (true) { ExecutionState current = this.executionState; if (current == ExecutionState.RUNNING || current == ExecutionState.DEPLOYING) { if (t instanceof CancelTaskException) { if (STATE_UPDATER.compareAndSet(this, current, ExecutionState.CANCELED)) { cancelInvokable(); notifyObservers(ExecutionState.CANCELED, null); break; } } else { if (STATE_UPDATER.compareAndSet(this, current, ExecutionState.FAILED)) { // proper failure of the task. record the exception as the root cause LOG.error("Task execution failed. ", t); failureCause = t; cancelInvokable(); notifyObservers(ExecutionState.FAILED, t); break; } } } else if (current == ExecutionState.CANCELING) { if (STATE_UPDATER.compareAndSet(this, current, ExecutionState.CANCELED)) { notifyObservers(ExecutionState.CANCELED, null); break; } } else if (current == ExecutionState.FAILED) { // in state failed already, no transition necessary any more break; } // unexpected state, go to failed else if (STATE_UPDATER.compareAndSet(this, current, ExecutionState.FAILED)) { LOG.error("Unexpected state in Task during an exception: " + current); break; } // else fall through the loop and } } catch (Throwable tt) { String message = "FATAL - exception in task exception handler"; LOG.error(message, tt); notifyFatalError(message, tt); } } finally { try { LOG.info("Freeing task resources for " + taskNameWithSubtask); // stop the async dispatcher. 
// copy dispatcher reference to stack, against concurrent release ExecutorService dispatcher = this.asyncCallDispatcher; if (dispatcher != null && !dispatcher.isShutdown()) { dispatcher.shutdownNow(); } // free the network resources network.unregisterTask(this); // free memory resources if (invokable != null) { memoryManager.releaseAll(invokable); } // remove all of the tasks library resources libraryCache.unregisterTask(jobId, executionId); // remove all files in the distributed cache removeCachedFiles(distributedCacheEntries, fileCache); notifyFinalState(); } catch (Throwable t) { // an error in the resource cleanup is fatal String message = "FATAL - exception in task resource cleanup"; LOG.error(message, t); notifyFatalError(message, t); } // un-register the metrics at the end so that the task may already be // counted as finished when this happens // errors here will only be logged try { metrics.close(); } catch (Throwable t) { LOG.error("Error during metrics de-registration", t); } } } private ClassLoader createUserCodeClassloader(LibraryCacheManager libraryCache) throws Exception { long startDownloadTime = System.currentTimeMillis(); // triggers the download of all missing jar files from the job manager libraryCache.registerTask(jobId, executionId, requiredJarFiles, requiredClasspaths); LOG.debug("Register task {} at library cache manager took {} milliseconds", executionId, System.currentTimeMillis() - startDownloadTime); ClassLoader userCodeClassLoader = libraryCache.getClassLoader(jobId); if (userCodeClassLoader == null) { throw new Exception("No user code classloader available."); } return userCodeClassLoader; } private AbstractInvokable loadAndInstantiateInvokable(ClassLoader classLoader, String className) throws Exception { Class invokableClass; try { invokableClass = Class.forName(className, true, classLoader) .asSubclass(AbstractInvokable.class); } catch (Throwable t) { throw new Exception("Could not load the task's invokable class.", t); } try { 
return invokableClass.newInstance(); } catch (Throwable t) { throw new Exception("Could not instantiate the task's invokable class.", t); } } private void removeCachedFiles(Map> entries, FileCache fileCache) { // cancel and release all distributed cache files try { for (Map.Entry> entry : entries.entrySet()) { String name = entry.getKey(); try { fileCache.deleteTmpFile(name, jobId); } catch (Exception e) { // unpleasant, but we continue LOG.error("Distributed Cache could not remove cached file registered under '" + name + "'.", e); } } } catch (Throwable t) { LOG.error("Error while removing cached local files from distributed cache."); } } private void notifyFinalState() { taskManager.tell(new TaskInFinalState(executionId)); } private void notifyFatalError(String message, Throwable cause) { taskManager.tell(new FatalError(message, cause)); } // ---------------------------------------------------------------------------------------------------------------- // Stopping / Canceling / Failing the task from the outside // ---------------------------------------------------------------------------------------------------------------- /** * Stops the executing task by calling {@link StoppableTask#stop()}. *

* <p>This method never blocks: the stop call itself is handed to
     * {@code executeAsyncCallRunnable}.
     *
     * @throws UnsupportedOperationException
     *             if the {@link AbstractInvokable} does not implement {@link StoppableTask}
     */
    public void stopExecution() throws UnsupportedOperationException {
        LOG.info("Attempting to stop task " + taskNameWithSubtask);

        // guard clause: only stoppable invokables can be stopped
        if (!(this.invokable instanceof StoppableTask)) {
            throw new UnsupportedOperationException("Stopping not supported by this task.");
        }

        final Runnable stopCall = new Runnable() {
            @Override
            public void run() {
                try {
                    final StoppableTask stoppable = (StoppableTask) Task.this.invokable;
                    stoppable.stop();
                }
                catch (RuntimeException e) {
                    // a failing stop() call is reported to the TaskManager as a task failure
                    LOG.error("Stopping task " + taskNameWithSubtask + " failed.", e);
                    taskManager.tell(new FailTask(executionId, e));
                }
            }
        };

        executeAsyncCallRunnable(stopCall, "Stopping source task " + this.taskNameWithSubtask);
    }

    /**
     * Cancels the task execution. If the task is already in a terminal state
     * (such as FINISHED, CANCELED, FAILED), or if the task is already canceling this does nothing.
     * Otherwise it sets the state to CANCELING, and, if the invokable code is running,
     * starts an asynchronous thread that aborts that code.
     *
     * <p>This method never blocks.

*/
    public void cancelExecution() {
        LOG.info("Attempting to cancel task " + taskNameWithSubtask);
        cancelOrFailAndCancelInvokable(ExecutionState.CANCELING, null);
    }

    /**
     * Marks task execution failed for an external reason (a reason other than the task code itself
     * throwing an exception). If the task is already in a terminal state
     * (such as FINISHED, CANCELED, FAILED), or if the task is already canceling this does nothing.
     * Otherwise it sets the state to FAILED, and, if the invokable code is running,
     * starts an asynchronous thread that aborts that code.
     *
     * <p>This method never blocks.
     *
     * @param cause The reason for the failure; it is recorded as the task's failure cause.
     */
    public void failExternally(Throwable cause) {
        LOG.info("Attempting to fail task externally " + taskNameWithSubtask);
        cancelOrFailAndCancelInvokable(ExecutionState.FAILED, cause);
    }

    /**
     * Shared implementation of {@link #cancelExecution()} and {@link #failExternally(Throwable)}.
     * Moves the task into the target state via compare-and-set, retrying when a concurrent
     * state change wins the race. When leaving the RUNNING state, it starts a daemon thread
     * that runs a {@code TaskCanceler} for the invokable.
     *
     * @param targetState The state to switch to (CANCELING or FAILED in the visible callers).
     * @param cause The failure cause to record and report, may be null.
     */
    private void cancelOrFailAndCancelInvokable(ExecutionState targetState, Throwable cause) {
        while (true) {
            ExecutionState current = this.executionState;

            // if the task is already canceled (or canceling) or finished or failed,
            // then we need not do anything
            if (current.isTerminal() || current == ExecutionState.CANCELING) {
                LOG.info("Task " + taskNameWithSubtask + " is already in state " + current);
                return;
            }

            if (current == ExecutionState.DEPLOYING || current == ExecutionState.CREATED) {
                if (STATE_UPDATER.compareAndSet(this, current, targetState)) {
                    // if we manage this state transition, then the invokable gets never called
                    // we need not call cancel on it
                    this.failureCause = cause;
                    notifyObservers(targetState, cause);
                    return;
                }
            }
            else if (current == ExecutionState.RUNNING) {
                if (STATE_UPDATER.compareAndSet(this, ExecutionState.RUNNING, targetState)) {
                    // we are canceling / failing out of the running state
                    // we need to cancel the invokable
                    if (invokable != null && invokableHasBeenCanceled.compareAndSet(false, true)) {
                        this.failureCause = cause;
                        notifyObservers(targetState, cause);
                        LOG.info("Triggering cancellation of task code {} ({}).", taskNameWithSubtask, executionId);

                        // because the canceling may block on user code, we cancel from a separate thread
                        // we do not reuse the async call handler, because that one may be blocked, in which
                        // case the canceling could not continue
                        Runnable canceler = new TaskCanceler(
                                LOG,
                                invokable,
                                executingThread,
                                taskNameWithSubtask,
                                taskCancellationInterval,
                                producedPartitions,
                                inputGates);
                        Thread cancelThread = new Thread(executingThread.getThreadGroup(), canceler,
                                "Canceler for " + taskNameWithSubtask);
                        cancelThread.setDaemon(true);
                        cancelThread.start();
                    }
                    return;
                }
            }
            else {
                throw new IllegalStateException("Unexpected task state: " + current);
            }
        }
    }
// ------------------------------------------------------------------------ // State Listeners // ------------------------------------------------------------------------ public void registerExecutionListener(ActorGateway listener) { executionListenerActors.add(listener); } private void notifyObservers(ExecutionState newState, Throwable error) { if (error == null) { LOG.info(taskNameWithSubtask + " switched to " + newState); } else { LOG.info(taskNameWithSubtask + " switched to " + newState + " with exception.", error); } TaskExecutionState stateUpdate = new TaskExecutionState(jobId, executionId, newState, error); UpdateTaskExecutionState actorMessage = new UpdateTaskExecutionState(stateUpdate); for (ActorGateway listener : executionListenerActors) { listener.tell(actorMessage); } } // ------------------------------------------------------------------------ // Notifications on the invokable // ------------------------------------------------------------------------ /** * Calls the invokable to trigger a checkpoint, if the invokable implements the interface * {@link org.apache.flink.runtime.jobgraph.tasks.StatefulTask}. * * @param checkpointID The ID identifying the checkpoint. * @param checkpointTimestamp The timestamp associated with the checkpoint. 
*/ public void triggerCheckpointBarrier(final long checkpointID, final long checkpointTimestamp) { AbstractInvokable invokable = this.invokable; if (executionState == ExecutionState.RUNNING && invokable != null) { if (invokable instanceof StatefulTask) { // build a local closure final StatefulTask statefulTask = (StatefulTask) invokable; final String taskName = taskNameWithSubtask; Runnable runnable = new Runnable() { @Override public void run() { try { boolean success = statefulTask.triggerCheckpoint(checkpointID, checkpointTimestamp); if (!success) { DeclineCheckpoint decline = new DeclineCheckpoint(jobId, getExecutionId(), checkpointID, checkpointTimestamp); jobManager.tell(decline); } } catch (Throwable t) { if (getExecutionState() == ExecutionState.RUNNING) { failExternally(new RuntimeException( "Error while triggering checkpoint for " + taskName, t)); } } } }; executeAsyncCallRunnable(runnable, "Checkpoint Trigger for " + taskName); } else { LOG.error("Task received a checkpoint request, but is not a checkpointing task - " + taskNameWithSubtask); } } else { LOG.debug("Ignoring request to trigger a checkpoint for non-running task."); } } public void notifyCheckpointComplete(final long checkpointID) { AbstractInvokable invokable = this.invokable; if (executionState == ExecutionState.RUNNING && invokable != null) { if (invokable instanceof StatefulTask) { // build a local closure final StatefulTask statefulTask = (StatefulTask) invokable; final String taskName = taskNameWithSubtask; Runnable runnable = new Runnable() { @Override public void run() { try { statefulTask.notifyCheckpointComplete(checkpointID); } catch (Throwable t) { if (getExecutionState() == ExecutionState.RUNNING) { // fail task if checkpoint confirmation failed. 
failExternally(new RuntimeException( "Error while confirming checkpoint", t)); } } } }; executeAsyncCallRunnable(runnable, "Checkpoint Confirmation for " + taskName); } else { LOG.error("Task received a checkpoint commit notification, but is not a checkpoint committing task - " + taskNameWithSubtask); } } else { LOG.debug("Ignoring checkpoint commit notification for non-running task."); } } // ------------------------------------------------------------------------ /** * Answer to a partition state check issued after a failed partition request. */ public void onPartitionStateUpdate( IntermediateDataSetID resultId, IntermediateResultPartitionID partitionId, ExecutionState partitionState) throws IOException, InterruptedException { if (executionState == ExecutionState.RUNNING) { final SingleInputGate inputGate = inputGatesById.get(resultId); if (inputGate != null) { if (partitionState == ExecutionState.RUNNING) { // Retrigger the partition request inputGate.retriggerPartitionRequest(partitionId); } else if (partitionState == ExecutionState.CANCELED || partitionState == ExecutionState.CANCELING || partitionState == ExecutionState.FAILED) { cancelExecution(); } else { failExternally(new IllegalStateException("Received unexpected partition state " + partitionState + " for partition request. This is a bug.")); } } else { failExternally(new IllegalStateException("Received partition state for " + "unknown input gate " + resultId + ". This is a bug.")); } } else { LOG.debug("Ignoring partition state notification for not running task."); } } /** * Utility method to dispatch an asynchronous call on the invokable. * * @param runnable The async call runnable. * @param callName The name of the call, for logging purposes. */ private void executeAsyncCallRunnable(Runnable runnable, String callName) { // make sure the executor is initialized. 
lock against concurrent calls to this function synchronized (this) { if (executionState != ExecutionState.RUNNING) { return; } // get ourselves a reference on the stack that cannot be concurrently modified ExecutorService executor = this.asyncCallDispatcher; if (executor == null) { // first time use, initialize executor = Executors.newSingleThreadExecutor( new DispatcherThreadFactory(TASK_THREADS_GROUP, "Async calls on " + taskNameWithSubtask)); this.asyncCallDispatcher = executor; // double-check for execution state, and make sure we clean up after ourselves // if we created the dispatcher while the task was concurrently canceled if (executionState != ExecutionState.RUNNING) { executor.shutdown(); asyncCallDispatcher = null; return; } } LOG.debug("Invoking async call {} on task {}", callName, taskNameWithSubtask); try { executor.submit(runnable); } catch (RejectedExecutionException e) { // may be that we are concurrently finished or canceled. // if not, report that something is fishy if (executionState == ExecutionState.RUNNING) { throw new RuntimeException("Async call was rejected, even though the task is running.", e); } } } } // ------------------------------------------------------------------------ // Utilities // ------------------------------------------------------------------------ private void cancelInvokable() { // in case of an exception during execution, we still call "cancel()" on the task if (invokable != null && this.invokable != null && invokableHasBeenCanceled.compareAndSet(false, true)) { try { invokable.cancel(); } catch (Throwable t) { LOG.error("Error while canceling task " + taskNameWithSubtask, t); } } } @Override public String toString() { return taskNameWithSubtask + " [" + executionState + ']'; } /** * This runner calls cancel() on the invokable and periodically interrupts the * thread until it has terminated. 
*/
    private static class TaskCanceler implements Runnable {

        /** Logger that all messages of this canceler are written to. */
        private final Logger logger;

        /** The invokable whose cancel() method is called. */
        private final AbstractInvokable invokable;

        /** The thread that is interrupted and joined until it has terminated. */
        private final Thread executer;

        /** Task name used in the log messages. */
        private final String taskName;

        /** Maximum time, in milliseconds, to wait in each join on the executing thread. */
        private final long taskCancellationIntervalMillis;

        /** Result partitions whose buffer pools are destroyed after the invokable was canceled. */
        private final ResultPartition[] producedPartitions;

        /** Input gates whose resources are released after the invokable was canceled. */
        private final SingleInputGate[] inputGates;

        public TaskCanceler(
                Logger logger,
                AbstractInvokable invokable,
                Thread executer,
                String taskName,
                long cancelationInterval,
                ResultPartition[] producedPartitions,
                SingleInputGate[] inputGates) {

            this.logger = logger;
            this.invokable = invokable;
            this.executer = executer;
            this.taskName = taskName;
            this.taskCancellationIntervalMillis = cancelationInterval;
            this.producedPartitions = producedPartitions;
            this.inputGates = inputGates;
        }

        /**
         * Cancels the invokable, releases the buffer pools of the produced partitions and
         * the resources of the input gates, and then interrupts the executing thread again
         * and again until it is no longer alive. No error leaves this method; all of them
         * are logged.
         */
        @Override
        public void run() {
            try {
                // the user-defined cancel method may throw errors.
                // we need to continue despite that
                try {
                    invokable.cancel();
                }
                catch (Throwable t) {
                    logger.error("Error while canceling the task", t);
                }

                // Early release of input and output buffer pools. We do this
                // in order to unblock async Threads, which produce/consume the
                // intermediate streams outside of the main Task Thread.
                //
                // Don't do this before cancelling the invokable. Otherwise we
                // will get misleading errors in the logs.
                for (ResultPartition partition : producedPartitions) {
                    try {
                        partition.destroyBufferPool();
                    }
                    catch (Throwable t) {
                        logger.error("Failed to release result partition buffer pool.", t);
                    }
                }

                for (SingleInputGate inputGate : inputGates) {
                    try {
                        inputGate.releaseAllResources();
                    }
                    catch (Throwable t) {
                        logger.error("Failed to release input gate.", t);
                    }
                }

                // interrupt the running thread initially
                interruptAndJoin();

                // it is possible that the user code does not react immediately. for that
                // reason, this canceler keeps interrupting the executing thread until it exits
                while (executer.isAlive()) {
                    // build the stack trace of where the thread is stuck, for the log
                    StringBuilder bld = new StringBuilder();
                    StackTraceElement[] stack = executer.getStackTrace();
                    for (StackTraceElement e : stack) {
                        bld.append(e).append('\n');
                    }

                    logger.warn("Task '{}' did not react to cancelling signal, but is stuck in method:\n {}",
                            taskName, bld.toString());

                    interruptAndJoin();
                }
            }
            catch (Throwable t) {
                logger.error("Error in the task canceler", t);
            }
        }

        /**
         * Interrupts the executing thread and waits up to the cancellation interval for it
         * to terminate.
         */
        private void interruptAndJoin() {
            executer.interrupt();
            try {
                executer.join(taskCancellationIntervalMillis);
            }
            catch (InterruptedException ignored) {
                // we can ignore this. the interrupt flag is deliberately not restored:
                // with the flag set, every following join() would throw right away and
                // the surrounding loop would spin without waiting
            }
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy