Many resources are needed to download a project. Please understand that we have to cover our server costs. Thank you in advance. Project price: only $1.
You can buy this project and download/modify it as often as you want.
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.runtime.executiongraph;
import akka.dispatch.OnComplete;
import akka.dispatch.OnFailure;
import org.apache.flink.api.common.accumulators.Accumulator;
import org.apache.flink.runtime.JobException;
import org.apache.flink.runtime.accumulators.AccumulatorRegistry;
import org.apache.flink.runtime.accumulators.StringifiedAccumulatorResult;
import org.apache.flink.runtime.deployment.InputChannelDeploymentDescriptor;
import org.apache.flink.runtime.deployment.PartialInputChannelDeploymentDescriptor;
import org.apache.flink.runtime.deployment.ResultPartitionLocation;
import org.apache.flink.runtime.deployment.TaskDeploymentDescriptor;
import org.apache.flink.runtime.execution.ExecutionState;
import org.apache.flink.runtime.instance.Instance;
import org.apache.flink.runtime.instance.InstanceConnectionInfo;
import org.apache.flink.runtime.instance.ActorGateway;
import org.apache.flink.runtime.instance.SimpleSlot;
import org.apache.flink.runtime.io.network.ConnectionID;
import org.apache.flink.runtime.io.network.partition.ResultPartitionID;
import org.apache.flink.runtime.jobgraph.IntermediateDataSetID;
import org.apache.flink.runtime.jobmanager.scheduler.CoLocationConstraint;
import org.apache.flink.runtime.jobmanager.scheduler.NoResourceAvailableException;
import org.apache.flink.runtime.jobmanager.scheduler.ScheduledUnit;
import org.apache.flink.runtime.jobmanager.scheduler.Scheduler;
import org.apache.flink.runtime.jobmanager.scheduler.SlotAllocationFuture;
import org.apache.flink.runtime.jobmanager.scheduler.SlotAllocationFutureAction;
import org.apache.flink.runtime.jobmanager.scheduler.SlotSharingGroup;
import org.apache.flink.runtime.messages.Messages;
import org.apache.flink.runtime.messages.TaskMessages.TaskOperationResult;
import org.apache.flink.runtime.state.StateHandle;
import org.apache.flink.runtime.util.SerializableObject;
import org.apache.flink.util.SerializedValue;
import org.apache.flink.util.ExceptionUtils;
import org.slf4j.Logger;
import scala.concurrent.ExecutionContext;
import scala.concurrent.Future;
import scala.concurrent.duration.FiniteDuration;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicReferenceFieldUpdater;
import static akka.dispatch.Futures.future;
import static org.apache.flink.runtime.execution.ExecutionState.CANCELED;
import static org.apache.flink.runtime.execution.ExecutionState.CANCELING;
import static org.apache.flink.runtime.execution.ExecutionState.CREATED;
import static org.apache.flink.runtime.execution.ExecutionState.DEPLOYING;
import static org.apache.flink.runtime.execution.ExecutionState.FAILED;
import static org.apache.flink.runtime.execution.ExecutionState.FINISHED;
import static org.apache.flink.runtime.execution.ExecutionState.RUNNING;
import static org.apache.flink.runtime.execution.ExecutionState.SCHEDULED;
import static org.apache.flink.runtime.messages.TaskMessages.CancelTask;
import static org.apache.flink.runtime.messages.TaskMessages.FailIntermediateResultPartitions;
import static org.apache.flink.runtime.messages.TaskMessages.StopTask;
import static org.apache.flink.runtime.messages.TaskMessages.SubmitTask;
import static org.apache.flink.runtime.messages.TaskMessages.UpdatePartitionInfo;
import static org.apache.flink.runtime.messages.TaskMessages.UpdateTaskSinglePartitionInfo;
import static org.apache.flink.runtime.messages.TaskMessages.createUpdateTaskMultiplePartitionInfos;
import static org.apache.flink.util.Preconditions.checkNotNull;
/**
* A single execution of a vertex. While an {@link ExecutionVertex} can be executed multiple times (for recovery,
* or other re-computation), this class tracks the state of a single execution of that vertex and the resources.
*
* NOTE ABOUT THE DESIGN RATIONALE:
*
* In several points of the code, we need to deal with possible concurrent state changes and actions.
* For example, while the call to deploy a task (send it to the TaskManager) happens, the task gets cancelled.
*
* We could lock the entire portion of the code (decision to deploy, deploy, set state to running) such that
* it is guaranteed that any "cancel command" will only pick up after deployment is done and that the "cancel
* command" call will never overtake the deploying call.
*
* This would block the threads for a long time, because the remote calls may take long. Depending on their locking behavior, it
* may even result in distributed deadlocks (unless carefully avoided). We therefore use atomic state updates and
* occasional double-checking to ensure that the state after a completed call is as expected, and trigger correcting
* actions if it is not. Many actions are also idempotent (like canceling).
*/
public class Execution implements Serializable {
private static final long serialVersionUID = 42L;

/**
 * Atomic updater for the volatile {@code state} field. The type arguments mirror the class
 * tokens handed to {@code newUpdater}, so compare-and-set calls are checked at compile time
 * instead of going through a raw type.
 */
private static final AtomicReferenceFieldUpdater<Execution, ExecutionState> STATE_UPDATER =
        AtomicReferenceFieldUpdater.newUpdater(Execution.class, ExecutionState.class, "state");

/** Logger; this class logs through the logger of the {@link ExecutionGraph}. */
private static final Logger LOG = ExecutionGraph.LOG;

/** Number of tries for a cancel call (the code using it is outside this section of the file). */
private static final int NUM_CANCEL_CALL_TRIES = 3;

/** Number of tries for a stop call (the code using it is outside this section of the file). */
private static final int NUM_STOP_CALL_TRIES = 3;
// --------------------------------------------------------------------------------------------
/** The vertex that this object is one execution attempt of; checked non-null in the constructor. */
private final ExecutionVertex vertex;
/** Identifier of this attempt; a new ID is created in the constructor. */
private final ExecutionAttemptID attemptId;
/** One timestamp per {@link ExecutionState}, indexed by the state's ordinal. */
private final long[] stateTimestamps;
/** The attempt number handed to the constructor. */
private final int attemptNumber;
/** Timeout handed to the constructor (its use is outside this section of the file). */
private final FiniteDuration timeout;
/**
 * Queue of partial input channel deployment descriptors; created in the constructor, cleared and
 * set to null by prepareForArchiving().
 * NOTE(review): declared as a raw type here - the element type argument looks like it was lost; confirm.
 */
private ConcurrentLinkedQueue partialInputChannelDeploymentDescriptors;
/** Current state of this attempt; starts as CREATED. STATE_UPDATER is bound to this field by name. */
private volatile ExecutionState state = CREATED;
private volatile SimpleSlot assignedResource; // once assigned, never changes until the execution is archived
private volatile Throwable failureCause; // once assigned, never changes
private volatile InstanceConnectionInfo assignedResourceLocation; // for the archived execution
// NOTE(review): the two declarations below carry stray '>' characters - their generic type
// arguments appear to have been stripped from this copy of the file. Restore them from the
// upstream source before compiling; they are intentionally left untouched here.
/** Initial operator state, assigned by setInitialState() and passed into the deployment descriptor. */
private SerializedValue> operatorState;
/** Initial key/value operator state, assigned by setInitialState() and passed into the deployment descriptor. */
private Map>> operatorKvState;
/** The execution context which is used to execute futures. Set to null by prepareForArchiving(). */
@SuppressWarnings("NonSerializableFieldInSerializableClass")
private ExecutionContext executionContext;
/* Lock for updating the accumulators atomically. */
private final SerializableObject accumulatorLock = new SerializableObject();
// NOTE(review): the generic arguments of the two accumulator maps below also look truncated
// (stray '>'); confirm key/value types against the upstream source.
/* Continuously updated map of user-defined accumulators */
private volatile Map> userAccumulators;
/* Continuously updated map of internal accumulators */
private volatile Map> flinkAccumulators;
// --------------------------------------------------------------------------------------------
/**
 * Creates a new execution attempt in the {@code CREATED} state with a new
 * {@link ExecutionAttemptID}.
 *
 * @param executionContext The context used to execute futures; must not be null.
 * @param vertex The vertex that this attempt executes; must not be null.
 * @param attemptNumber The number of this attempt.
 * @param startTimestamp The timestamp to record for entering the {@code CREATED} state.
 * @param timeout The timeout stored with this attempt (not null-checked here).
 */
public Execution(
        ExecutionContext executionContext,
        ExecutionVertex vertex,
        int attemptNumber,
        long startTimestamp,
        FiniteDuration timeout) {

    // mandatory collaborators are validated first, in parameter order
    this.executionContext = checkNotNull(executionContext);
    this.vertex = checkNotNull(vertex);

    // plain value assignments
    this.attemptNumber = attemptNumber;
    this.timeout = timeout;
    this.attemptId = new ExecutionAttemptID();
    this.partialInputChannelDeploymentDescriptors = new ConcurrentLinkedQueue();

    // the timestamp array needs one entry per state and must exist before the first mark
    this.stateTimestamps = new long[ExecutionState.values().length];
    markTimestamp(CREATED, startTimestamp);
}
// --------------------------------------------------------------------------------------------
// Properties
// --------------------------------------------------------------------------------------------
/**
 * Returns the vertex that this object is an execution attempt of.
 *
 * @return The vertex given to the constructor; never null.
 */
public ExecutionVertex getVertex() {
    return this.vertex;
}
/**
 * Returns the identifier of this execution attempt.
 *
 * @return The attempt ID that was created in the constructor.
 */
public ExecutionAttemptID getAttemptId() {
    return this.attemptId;
}
/**
 * Returns the number of this execution attempt.
 *
 * @return The attempt number given to the constructor.
 */
public int getAttemptNumber() {
    return this.attemptNumber;
}
/**
 * Returns the current state of this execution attempt.
 *
 * @return The value of the volatile state field at the time of the call.
 */
public ExecutionState getState() {
    return this.state;
}
/**
 * Returns the slot assigned to this execution attempt.
 *
 * @return The assigned slot, or null if none is assigned or the execution has been
 *         prepared for archiving.
 */
public SimpleSlot getAssignedResource() {
    return this.assignedResource;
}
/**
 * Returns the connection info of the instance that this attempt was assigned to. Unlike the
 * assigned slot, this value is not cleared by {@link #prepareForArchiving()}.
 *
 * @return The connection info of the assigned instance, or null if no slot was assigned yet.
 */
public InstanceConnectionInfo getAssignedResourceLocation() {
    return this.assignedResourceLocation;
}
/**
 * Returns the failure cause recorded for this execution attempt.
 *
 * @return The recorded failure cause, or null if none has been set.
 */
public Throwable getFailureCause() {
    return this.failureCause;
}
/**
 * Returns the timestamps of all execution states, indexed by the ordinal of the
 * {@link ExecutionState}.
 *
 * <p>Note that this hands out the internal array itself, not a copy.
 *
 * @return The internal array of state timestamps.
 */
public long[] getStateTimestamps() {
    return this.stateTimestamps;
}
/**
 * Returns the timestamp recorded for the given execution state.
 *
 * @param state The state whose timestamp is requested; must not be null.
 * @return The timestamp stored at the position of the state's ordinal.
 */
public long getStateTimestamp(ExecutionState state) {
    final int position = state.ordinal();
    return stateTimestamps[position];
}
/**
 * Checks whether this execution attempt is in one of the states FINISHED, FAILED, or CANCELED.
 *
 * @return True if the current state is FINISHED, FAILED, or CANCELED; false otherwise.
 */
public boolean isFinished() {
    if (state == FINISHED) {
        return true;
    }
    if (state == FAILED) {
        return true;
    }
    return state == CANCELED;
}
/**
 * Cleans fields that are irrelevant for the archived execution attempt: the assigned slot,
 * the execution context, and the queue of partial input channel deployment descriptors.
 *
 * <p>Calling this method again after the fields were already cleaned is a no-op.
 *
 * @throws IllegalStateException Thrown, if the assigned slot is still alive.
 */
public void prepareForArchiving() {
    // read the volatile field once, so that the null check and isAlive() use the same slot
    final SimpleSlot slot = assignedResource;
    if (slot != null && slot.isAlive()) {
        throw new IllegalStateException("Cannot archive Execution while the assigned resource is still running.");
    }

    assignedResource = null;
    executionContext = null;

    // the queue reference is null after an earlier archiving call; guard against that
    // instead of failing with a NullPointerException on repeated invocation
    final ConcurrentLinkedQueue<?> descriptors = partialInputChannelDeploymentDescriptors;
    if (descriptors != null) {
        descriptors.clear();
        partialInputChannelDeploymentDescriptors = null;
    }
}
/**
 * Assigns the initial operator state and key/value state that this attempt is deployed with.
 * This is only permitted while the attempt is still in the CREATED state.
 *
 * NOTE(review): the parameter types below contain stray '>' characters - their generic type
 * arguments appear to have been stripped from this copy of the file; restore them from the
 * upstream source.
 *
 * @param initialState The initial operator state; stored as given.
 * @param initialKvState The initial key/value operator state; stored as given.
 *
 * @throws IllegalArgumentException Thrown, if the execution attempt is not in the CREATED state.
 */
public void setInitialState(
SerializedValue> initialState,
Map>> initialKvState) {
// state may only be assigned before the attempt has been scheduled or deployed
if (state != ExecutionState.CREATED) {
throw new IllegalArgumentException("Can only assign operator state when execution attempt is in CREATED");
}
this.operatorState = initialState;
this.operatorKvState = initialKvState;
}
// --------------------------------------------------------------------------------------------
// Actions
// --------------------------------------------------------------------------------------------
/**
 * Schedules this execution attempt: switches it from CREATED to SCHEDULED, requests a slot
 * from the given scheduler, and deploys the attempt to that slot.
 *
 * <p>NOTE: This method only throws exceptions if the arguments or the scheduling setup are
 * invalid, or if the task needs to be scheduled immediately and no resource is available. Once
 * a slot has been handed over, any error during deployment releases the slot and marks the
 * execution as failed rather than being thrown to the caller.
 *
 * @param scheduler The scheduler to use to schedule this execution attempt.
 * @param queued Flag to indicate whether the scheduler may queue this task if it cannot
 *               immediately deploy it.
 *
 * @return True, if the attempt was switched from CREATED to SCHEDULED by this call; false, if
 *         it was not in the CREATED state (call race, already deployed, or already done).
 *
 * @throws IllegalArgumentException Thrown, if the scheduler is null.
 * @throws RuntimeException Thrown, if a co-location constraint is set but no slot sharing group.
 * @throws NoResourceAvailableException Thrown if no queued scheduling is allowed and no resources are currently available.
 */
public boolean scheduleForExecution(Scheduler scheduler, boolean queued) throws NoResourceAvailableException {
    if (scheduler == null) {
        throw new IllegalArgumentException("Cannot send null Scheduler when scheduling execution.");
    }

    final SlotSharingGroup sharingGroup = vertex.getJobVertex().getSlotSharingGroup();
    final CoLocationConstraint locationConstraint = vertex.getLocationConstraint();

    // sanity check
    if (locationConstraint != null && sharingGroup == null) {
        throw new RuntimeException("Trying to schedule with co-location constraint but without slot sharing allowed.");
    }

    if (!transitionState(CREATED, SCHEDULED)) {
        // call race, already deployed, or already done
        return false;
    }

    final ScheduledUnit toSchedule = locationConstraint == null ?
            new ScheduledUnit(this, sharingGroup) :
            new ScheduledUnit(this, sharingGroup, locationConstraint);

    // IMPORTANT: To prevent leaks of cluster resources, slots must be returned in all cases
    // where the deployment failed. Both paths below go through the same release-and-fail helper.
    if (queued) {
        final SlotAllocationFuture slotFuture = scheduler.scheduleQueued(toSchedule);
        slotFuture.setFutureAction(new SlotAllocationFutureAction() {
            @Override
            public void slotAllocated(SimpleSlot slot) {
                deployToSlotOrMarkFailed(slot);
            }
        });
    }
    else {
        // may throw NoResourceAvailableException, which is deliberately passed on to the caller
        final SimpleSlot slot = scheduler.scheduleImmediately(toSchedule);
        deployToSlotOrMarkFailed(slot);
    }
    return true;
}

/**
 * Deploys this execution to the given slot. On any error, releases the slot (if there is one)
 * and marks the execution as failed with that error.
 *
 * @param slot The slot to deploy to; a null slot results in the execution being marked failed.
 */
private void deployToSlotOrMarkFailed(SimpleSlot slot) {
    try {
        deployToSlot(slot);
    }
    catch (Throwable t) {
        try {
            // deployToSlot rejects a null slot with an exception; do not dereference it again
            // here, otherwise a second exception would escape from this handler
            if (slot != null) {
                slot.releaseSlot();
            }
        }
        finally {
            markFailed(t);
        }
    }
}
public void deployToSlot(final SimpleSlot slot) throws JobException {
// sanity checks
if (slot == null) {
throw new NullPointerException();
}
if (!slot.isAlive()) {
throw new JobException("Target slot for deployment is not alive.");
}
// make sure exactly one deployment call happens from the correct state
// note: the transition from CREATED to DEPLOYING is for testing purposes only
ExecutionState previous = this.state;
if (previous == SCHEDULED || previous == CREATED) {
if (!transitionState(previous, DEPLOYING)) {
// race condition, someone else beat us to the deploying call.
// this should actually not happen and indicates a race somewhere else
throw new IllegalStateException("Cannot deploy task: Concurrent deployment call race.");
}
}
else {
// vertex may have been cancelled, or it was already scheduled
throw new IllegalStateException("The vertex must be in CREATED or SCHEDULED state to be deployed. Found state " + previous);
}
try {
// good, we are allowed to deploy
if (!slot.setExecutedVertex(this)) {
throw new JobException("Could not assign the ExecutionVertex to the slot " + slot);
}
this.assignedResource = slot;
this.assignedResourceLocation = slot.getInstance().getInstanceConnectionInfo();
// race double check, did we fail/cancel and do we need to release the slot?
if (this.state != DEPLOYING) {
slot.releaseSlot();
return;
}
if (LOG.isInfoEnabled()) {
LOG.info(String.format("Deploying %s (attempt #%d) to %s", vertex.getSimpleName(),
attemptNumber, slot.getInstance().getInstanceConnectionInfo().getHostname()));
}
final TaskDeploymentDescriptor deployment = vertex.createDeploymentDescriptor(
attemptId,
slot,
operatorState,
operatorKvState,
attemptNumber);
// register this execution at the execution graph, to receive call backs
vertex.getExecutionGraph().registerExecution(this);
final Instance instance = slot.getInstance();
final ActorGateway gateway = instance.getActorGateway();
final Future