/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.runtime.executiongraph;
import org.apache.flink.annotation.VisibleForTesting;
import org.apache.flink.api.common.Archiveable;
import org.apache.flink.api.common.JobID;
import org.apache.flink.api.common.operators.ResourceSpec;
import org.apache.flink.api.common.resources.CommonExtendedResource;
import org.apache.flink.api.common.time.Time;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.configuration.JobManagerOptions;
import org.apache.flink.configuration.TaskManagerOptions;
import org.apache.flink.core.io.InputSplit;
import org.apache.flink.runtime.JobException;
import org.apache.flink.runtime.blob.PermanentBlobKey;
import org.apache.flink.runtime.checkpoint.JobManagerTaskRestore;
import org.apache.flink.runtime.clusterframework.types.AllocationID;
import org.apache.flink.runtime.clusterframework.types.ResourceProfile;
import org.apache.flink.runtime.deployment.InputChannelDeploymentDescriptor;
import org.apache.flink.runtime.deployment.InputGateDeploymentDescriptor;
import org.apache.flink.runtime.deployment.PartialInputChannelDeploymentDescriptor;
import org.apache.flink.runtime.deployment.ResultPartitionDeploymentDescriptor;
import org.apache.flink.runtime.deployment.ResultPartitionLocationTrackerProxy;
import org.apache.flink.runtime.deployment.TaskDeploymentDescriptor;
import org.apache.flink.runtime.execution.ExecutionState;
import org.apache.flink.runtime.instance.SimpleSlot;
import org.apache.flink.runtime.io.network.partition.BlockingShuffleType;
import org.apache.flink.runtime.io.network.partition.ResultPartitionID;
import org.apache.flink.runtime.io.network.partition.ResultPartitionType;
import org.apache.flink.runtime.jobgraph.ExecutionVertexID;
import org.apache.flink.runtime.jobgraph.IntermediateDataSetID;
import org.apache.flink.runtime.jobgraph.IntermediateResultPartitionID;
import org.apache.flink.runtime.jobgraph.JobVertexID;
import org.apache.flink.runtime.jobgraph.OperatorID;
import org.apache.flink.runtime.jobmanager.scheduler.CoLocationConstraint;
import org.apache.flink.runtime.jobmanager.scheduler.CoLocationGroup;
import org.apache.flink.runtime.jobmanager.scheduler.LocationPreferenceConstraint;
import org.apache.flink.runtime.jobmaster.LogicalSlot;
import org.apache.flink.runtime.jobmaster.TaskNetworkMemoryUtil;
import org.apache.flink.runtime.jobmaster.failover.ResultDescriptor;
import org.apache.flink.runtime.jobmaster.slotpool.SlotProvider;
import org.apache.flink.runtime.schedule.ExecutionVertexStatus;
import org.apache.flink.runtime.state.KeyGroupRangeAssignment;
import org.apache.flink.runtime.taskmanager.TaskManagerLocation;
import org.apache.flink.runtime.util.EvictingBoundedList;
import org.apache.flink.types.Either;
import org.apache.flink.util.ExceptionUtils;
import org.apache.flink.util.FlinkRuntimeException;
import org.apache.flink.util.Preconditions;
import org.apache.flink.util.SerializedValue;
import org.slf4j.Logger;
import javax.annotation.Nullable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.CompletableFuture;
import static org.apache.flink.runtime.execution.ExecutionState.FINISHED;
import static org.apache.flink.util.Preconditions.checkState;
/**
* The ExecutionVertex is a parallel subtask of the execution. It may be executed once, or several
* times, each time spawning a new {@link Execution}.
*/
public class ExecutionVertex implements AccessExecutionVertex, Archiveable<ArchivedExecutionVertex> {
private static final Logger LOG = ExecutionGraph.LOG;
private static final int MAX_DISTINCT_LOCATIONS_TO_CONSIDER = 8;
// --------------------------------------------------------------------------------------------
private final ExecutionJobVertex jobVertex;
private Map<IntermediateResultPartitionID, IntermediateResultPartition> resultPartitions;
private final ExecutionEdge[][] inputEdges;
private final int subTaskIndex;
private final ExecutionVertexID executionVertexID;
private final EvictingBoundedList<Execution> priorExecutions;
private final Time timeout;
/** The name in the format "myTask (2/7)", cached to avoid frequent string concatenations. */
private final String taskNameWithSubtask;
private volatile CoLocationConstraint locationConstraint;
/** The current or latest execution attempt of this vertex's task. */
private volatile Execution currentExecution; // this field must never be null
/** The creation timestamp of the execution. */
private long createTimestamp;
/** The location of the last execution. */
private TaskManagerLocation latestPriorLocation = null;
private final Map<OperatorID, List<InputSplit>> assignedInputSplitsMap = new HashMap<>();
private final Map<OperatorID, Integer> inputSplitIndexMap = new HashMap<>();
// --------------------------------------------------------------------------------------------
/**
* Convenience constructor for tests. Sets various fields to default values.
*/
@VisibleForTesting
ExecutionVertex(
ExecutionJobVertex jobVertex,
int subTaskIndex,
IntermediateResult[] producedDataSets,
Time timeout) {
this(
jobVertex,
subTaskIndex,
producedDataSets,
timeout,
1L,
System.currentTimeMillis(),
JobManagerOptions.MAX_ATTEMPTS_HISTORY_SIZE.defaultValue());
}
/**
* Creates an ExecutionVertex.
*
* @param timeout
* The RPC timeout to use for deploy / cancel calls
* @param initialGlobalModVersion
* The global modification version to initialize the first Execution with.
* @param createTimestamp
* The timestamp for the vertex creation, used to initialize the first Execution with.
* @param maxPriorExecutionHistoryLength
* The number of prior Executions (= execution attempts) to keep.
*/
public ExecutionVertex(
ExecutionJobVertex jobVertex,
int subTaskIndex,
IntermediateResult[] producedDataSets,
Time timeout,
long initialGlobalModVersion,
long createTimestamp,
int maxPriorExecutionHistoryLength) {
this.jobVertex = jobVertex;
this.subTaskIndex = subTaskIndex;
this.executionVertexID = new ExecutionVertexID(jobVertex.getJobVertexId(), subTaskIndex);
this.taskNameWithSubtask = String.format("%s (%d/%d)",
jobVertex.getJobVertex().getName(), subTaskIndex + 1, jobVertex.getParallelism());
this.resultPartitions = new LinkedHashMap<>(producedDataSets.length, 1);
for (IntermediateResult result : producedDataSets) {
IntermediateResultPartition irp = new IntermediateResultPartition(result, this, subTaskIndex);
result.setPartition(subTaskIndex, irp);
resultPartitions.put(irp.getPartitionId(), irp);
}
this.inputEdges = new ExecutionEdge[jobVertex.getJobVertex().getInputs().size()][];
this.priorExecutions = new EvictingBoundedList<>(maxPriorExecutionHistoryLength);
this.createTimestamp = createTimestamp;
this.currentExecution = new Execution(
getExecutionGraph().getFutureExecutor(),
this,
0,
initialGlobalModVersion,
createTimestamp,
timeout);
// create a co-location scheduling hint, if necessary
final CoLocationGroup clg = jobVertex.getCoLocationGroup();
if (clg != null) {
synchronized (clg) {
this.locationConstraint = clg.getLocationConstraint(subTaskIndex);
}
}
else {
this.locationConstraint = null;
}
getExecutionGraph().registerExecution(currentExecution);
this.timeout = timeout;
}
// --------------------------------------------------------------------------------------------
// Properties
// --------------------------------------------------------------------------------------------
public JobID getJobId() {
return this.jobVertex.getJobId();
}
public ExecutionJobVertex getJobVertex() {
return jobVertex;
}
public JobVertexID getJobvertexId() {
return this.jobVertex.getJobVertexId();
}
public ExecutionVertexID getExecutionVertexID() {
return executionVertexID;
}
public String getTaskName() {
return this.jobVertex.getJobVertex().getName();
}
/**
* Creates a simple name representation in the style 'taskname (x/y)', where
* 'taskname' is the name as returned by {@link #getTaskName()}, 'x' is the parallel
* subtask index as returned by {@link #getParallelSubtaskIndex()}{@code + 1}, and 'y' is the total
* number of tasks, as returned by {@link #getTotalNumberOfParallelSubtasks()}.
*
* @return A simple name representation in the form 'myTask (2/7)'
*/
@Override
public String getTaskNameWithSubtaskIndex() {
return this.taskNameWithSubtask;
}
public int getTotalNumberOfParallelSubtasks() {
return this.jobVertex.getParallelism();
}
public int getMaxParallelism() {
return this.jobVertex.getMaxParallelism();
}
@Override
public int getParallelSubtaskIndex() {
return this.subTaskIndex;
}
public int getNumberOfInputs() {
return this.inputEdges.length;
}
public ExecutionEdge[] getInputEdges(int input) {
if (input < 0 || input >= this.inputEdges.length) {
throw new IllegalArgumentException(String.format("Input %d is out of range [0..%d)", input, this.inputEdges.length));
}
return inputEdges[input];
}
public CoLocationConstraint getLocationConstraint() {
return locationConstraint;
}
@Override
public Execution getCurrentExecutionAttempt() {
return currentExecution;
}
@Override
public ExecutionState getExecutionState() {
return currentExecution.getState();
}
@Override
public long getStateTimestamp(ExecutionState state) {
return currentExecution.getStateTimestamp(state);
}
@Override
public String getFailureCauseAsString() {
return ExceptionUtils.stringifyException(getFailureCause());
}
public Throwable getFailureCause() {
return currentExecution.getFailureCause();
}
public CompletableFuture<TaskManagerLocation> getCurrentTaskManagerLocationFuture() {
return currentExecution.getTaskManagerLocationFuture();
}
public LogicalSlot getCurrentAssignedResource() {
return currentExecution.getAssignedResource();
}
@Override
public TaskManagerLocation getCurrentAssignedResourceLocation() {
return currentExecution.getAssignedResourceLocation();
}
@Override
public Execution getPriorExecutionAttempt(int attemptNumber) {
synchronized (priorExecutions) {
if (attemptNumber >= 0 && attemptNumber < priorExecutions.size()) {
return priorExecutions.get(attemptNumber);
} else {
throw new IllegalArgumentException("attempt does not exist");
}
}
}
public Execution getLatestPriorExecution() {
synchronized (priorExecutions) {
final int size = priorExecutions.size();
if (size > 0) {
return priorExecutions.get(size - 1);
}
else {
return null;
}
}
}
/**
* Gets the location where the latest completed/canceled/failed execution of the vertex's
* task happened.
*
* @return The latest prior execution location, or null, if there is none, yet.
*/
public TaskManagerLocation getLatestPriorLocation() {
return latestPriorLocation;
}
public void setLatestPriorLocation(TaskManagerLocation location) {
this.latestPriorLocation = location;
}
public AllocationID getLatestPriorAllocation() {
Execution latestPriorExecution = getLatestPriorExecution();
return latestPriorExecution != null ? latestPriorExecution.getAssignedAllocationID() : null;
}
EvictingBoundedList<Execution> getCopyOfPriorExecutionsList() {
synchronized (priorExecutions) {
return new EvictingBoundedList<>(priorExecutions);
}
}
public ExecutionGraph getExecutionGraph() {
return this.jobVertex.getGraph();
}
public Map<IntermediateResultPartitionID, IntermediateResultPartition> getProducedPartitions() {
return resultPartitions;
}
public ExecutionVertexStatus getCurrentStatus() {
return new ExecutionVertexStatus(executionVertexID, getExecutionState());
}
// --------------------------------------------------------------------------------------------
// Graph building
// --------------------------------------------------------------------------------------------
public void setInputExecutionEdges(ExecutionEdge[] edges, int inputNumber) {
this.inputEdges[inputNumber] = edges;
}
/**
* Gets the overall preferred execution location for this vertex's current execution.
* The preference is determined as follows:
*
* <ol>
*     <li>If the task execution has state to load (from a checkpoint), then the location preference
*         is the location of the previous execution (if there is a previous execution attempt).
*     <li>If the task execution has no state or no previous location, then the location preference
*         is based on the task's inputs.
* </ol>
*
* <p>These rules should result in the following behavior:
*
* <ul>
*     <li>Stateless tasks are always scheduled based on co-location with inputs.
*     <li>Stateful tasks are on their initial attempt executed based on co-location with inputs.
*     <li>Repeated executions of stateful tasks try to co-locate the execution with its state.
* </ul>
*
* @return The preferred execution locations for the execution attempt.
*
* @see #getPreferredLocationsBasedOnState()
* @see #getPreferredLocationsBasedOnInputs()
*/
public Collection<CompletableFuture<TaskManagerLocation>> getPreferredLocations() {
Collection<CompletableFuture<TaskManagerLocation>> basedOnState = getPreferredLocationsBasedOnState();
return basedOnState != null ? basedOnState : getPreferredLocationsBasedOnInputs();
}
/**
* Gets the preferred location to execute the current task execution attempt, based on the state
* that the execution attempt will resume.
*
* @return A size-one collection with the location preference, or null, if there is no
* location preference based on the state.
*/
public Collection<CompletableFuture<TaskManagerLocation>> getPreferredLocationsBasedOnState() {
TaskManagerLocation priorLocation;
if (currentExecution.getTaskRestore() != null && (priorLocation = getLatestPriorLocation()) != null) {
return Collections.singleton(CompletableFuture.completedFuture(priorLocation));
}
else {
return null;
}
}
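// Illustrative note (not from the original source): if the new attempt has state to restore and the
// previous attempt ran on a TaskManager "tm-1" (hypothetical name), the collection returned above
// contains exactly that one location, so rescheduling prefers the node that still holds local state.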
/**
* Gets the location preferences of the vertex's current task execution, as determined by the locations
* of the predecessors from which it receives input data.
* Inputs with more than MAX_DISTINCT_LOCATIONS_TO_CONSIDER distinct source locations are ignored
* when determining the preference.
*
* @return The preferred locations based on input streams, or an empty collection,
*         if there is no input-based preference.
*/
public Collection<CompletableFuture<TaskManagerLocation>> getPreferredLocationsBasedOnInputs() {
// otherwise, base the preferred locations on the input connections
if (inputEdges == null) {
return Collections.emptySet();
}
else {
Set<CompletableFuture<TaskManagerLocation>> locations = new HashSet<>(getTotalNumberOfParallelSubtasks());
Set<CompletableFuture<TaskManagerLocation>> inputLocations = new HashSet<>(getTotalNumberOfParallelSubtasks());
// go over all inputs
for (int i = 0; i < inputEdges.length; i++) {
inputLocations.clear();
ExecutionEdge[] sources = inputEdges[i];
if (sources != null) {
// go over all input sources
for (int k = 0; k < sources.length; k++) {
// look-up assigned slot of input source
CompletableFuture<TaskManagerLocation> locationFuture = sources[k].getSource().getProducer().getCurrentTaskManagerLocationFuture();
// add input location
inputLocations.add(locationFuture);
// inputs which have too many distinct sources are not considered
if (inputLocations.size() > MAX_DISTINCT_LOCATIONS_TO_CONSIDER) {
inputLocations.clear();
break;
}
}
}
// keep the locations of the input with the least preferred locations
if (locations.isEmpty() || // nothing assigned yet
(!inputLocations.isEmpty() && inputLocations.size() < locations.size())) {
// current input has fewer preferred locations
locations.clear();
locations.addAll(inputLocations);
}
}
return locations.isEmpty() ? Collections.emptyList() : locations;
}
}
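// Illustrative example (hypothetical numbers, not part of the original source): assume input 0 is an
// all-to-all connection whose producers run on 3 distinct TaskManagers and input 1 is a pointwise
// connection with a single producer location. The loop above keeps the location set of input 1,
// because it is the smallest non-empty set, so this subtask prefers co-location with its pointwise input.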
// --------------------------------------------------------------------------------------------
// Resources
// --------------------------------------------------------------------------------------------
public ResourceProfile calculateResourceProfile() {
if (jobVertex.getJobVertex().getMinResources().equals(ResourceSpec.DEFAULT)) {
return ResourceProfile.UNKNOWN;
} else {
int networkMemory = calculateTaskNetworkMemory();
int additionalManagedMemory = calculateTaskExtraManagedMemory();
ResourceSpec additionalResourceSpec = ResourceSpec.newBuilder().addExtendedResource(
new CommonExtendedResource(ResourceSpec.MANAGED_MEMORY_NAME, additionalManagedMemory))
.build();
return ResourceProfile.fromResourceSpec(
getJobVertex().getJobVertex().getMinResources()
.merge(additionalResourceSpec), networkMemory);
}
}
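// Sketch of the composition performed above (hypothetical figures, not from the original source):
// if the vertex declares non-default min resources of 1 core / 1024 MB heap, needs 200 MB of extra
// managed memory for external result partitions and 64 MB of network memory, the resulting profile is
// ResourceProfile.fromResourceSpec(minResources merged with 200 MB managed memory, 64 MB network memory).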
@VisibleForTesting
int calculateTaskNetworkMemory() {
Configuration config = jobVertex.getGraph().getJobManagerConfiguration();
BlockingShuffleType shuffleType =
BlockingShuffleType.getBlockingShuffleTypeFromConfiguration(config, LOG);
int numInternalSubpartitions = 0;
int numInternalResultPartitions = 0;
for (IntermediateResultPartition irp : getProducedPartitions().values()) {
if (!(shuffleType == BlockingShuffleType.YARN && irp.getIntermediateResult().getResultType().isBlocking())) {
for (List<ExecutionEdge> consumer : irp.getConsumers()) {
numInternalSubpartitions += consumer.size();
}
++numInternalResultPartitions;
}
}
final int maxBlockingRequestsInFlight = config.getInteger(TaskManagerOptions.TASK_EXTERNAL_SHUFFLE_MAX_CONCURRENT_REQUESTS);
int numPipelineChannels = 0;
int numPipelineGates = 0;
int numExternalBlockingChannels = 0;
int numExternalBlockingGates = 0;
for (int j = 0; j < getNumberOfInputs(); ++j) {
ExecutionEdge[] edges = getInputEdges(j);
checkState(edges.length > 0, "There should be at least one edge for each input");
// Check the result type by viewing the first edge
boolean isExternalBlocking = edges[0].getSource().getIntermediateResult().getResultType().isBlocking()
&& shuffleType == BlockingShuffleType.YARN;
if (isExternalBlocking) {
numExternalBlockingChannels += edges.length;
numExternalBlockingGates++;
} else {
numPipelineChannels += edges.length;
numPipelineGates++;
}
}
if (maxBlockingRequestsInFlight > 0) {
numExternalBlockingChannels = Math.min(numExternalBlockingChannels, maxBlockingRequestsInFlight);
// each blocking input gate should monopolize at least one piece of resource to
// support input selection by operator
numExternalBlockingChannels = Math.max(numExternalBlockingChannels, numExternalBlockingGates);
}
return TaskNetworkMemoryUtil.calculateTaskNetworkMemory(config,
numInternalSubpartitions, numInternalResultPartitions, numPipelineChannels,
numPipelineGates, numExternalBlockingChannels, numExternalBlockingGates);
}
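// Worked example with hypothetical numbers (not from the original source): a vertex with one
// pipelined input of 100 channels (1 gate), one external blocking (YARN shuffle) input of 500 channels
// (1 gate), and a concurrent-request limit of 50 contributes 100 pipelined channels / 1 gate and
// min(500, 50) = 50 external blocking channels / 1 gate to the network memory calculation above.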
private int calculateTaskExtraManagedMemory() {
Configuration config = getJobVertex().getGraph().getJobManagerConfiguration();
// Calculates managed memory for external result partition.
BlockingShuffleType shuffleType =
BlockingShuffleType.getBlockingShuffleTypeFromConfiguration(config, LOG);
int numExternalResultPartitions = 0;
for (IntermediateResultPartition irp : getProducedPartitions().values()) {
if (shuffleType == BlockingShuffleType.YARN && irp.getIntermediateResult().getResultType().isBlocking()) {
numExternalResultPartitions++;
}
}
int mapOutputMemoryInMB = config.getInteger(TaskManagerOptions.TASK_MANAGER_OUTPUT_MEMORY_MB);
return mapOutputMemoryInMB * numExternalResultPartitions;
}
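// Worked example with hypothetical numbers (not from the original source): with an output memory
// setting of 100 MB per external result partition and 2 produced partitions that are external
// blocking (YARN shuffle), the extra managed memory requested for this task is 2 * 100 = 200 MB.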
// --------------------------------------------------------------------------------------------
// Actions
// --------------------------------------------------------------------------------------------
/**
* Archives the current Execution and creates a new Execution for this vertex.
*
* <p>This method atomically checks if the ExecutionGraph is still of an expected
* global mod. version and replaces the execution if that is the case. If the ExecutionGraph
* has increased its global mod. version in the meantime, this operation fails.
*
* <p>This mechanism can be used to prevent conflicts between various concurrent recovery and
* reconfiguration actions in a similar way as "optimistic concurrency control".
*
* @param timestamp
*            The creation timestamp for the new Execution
* @param originatingGlobalModVersion
*            The global modification version in effect when the action that triggered this reset
*            was started; the reset fails if the graph's global modification version has advanced past it.
* @return The newly created Execution.
*
* @throws GlobalModVersionMismatch Thrown, if the execution graph has a newer global mod.
*                                  version than the one passed to this method.
*/
public Execution resetForNewExecution(final long timestamp, final long originatingGlobalModVersion)
throws GlobalModVersionMismatch {
LOG.debug("Resetting execution vertex {} for new execution.", getTaskNameWithSubtaskIndex());
synchronized (priorExecutions) {
// check if another global modification has been triggered since the
// action that originally caused this reset/restart happened
final long actualModVersion = getExecutionGraph().getGlobalModVersion();
if (actualModVersion > originatingGlobalModVersion) {
// global change happened since, reject this action
throw new GlobalModVersionMismatch(originatingGlobalModVersion, actualModVersion);
}
final Execution oldExecution = currentExecution;
final ExecutionState oldState = oldExecution.getState();
if (oldState.isTerminal() || getExecutionGraph().getGraphManager().isReplaying()) {
priorExecutions.add(oldExecution);
latestPriorLocation = oldExecution.getAssignedResourceLocation();
final Execution newExecution = new Execution(
getExecutionGraph().getFutureExecutor(),
this,
oldExecution.getAttemptNumber() + 1,
originatingGlobalModVersion,
timestamp,
timeout);
this.currentExecution = newExecution;
CoLocationGroup grp = jobVertex.getCoLocationGroup();
if (grp != null) {
this.locationConstraint = grp.getLocationConstraint(subTaskIndex);
}
// register this execution at the execution graph, to receive call backs
getExecutionGraph().registerExecution(newExecution);
// if the execution was 'FINISHED' before, tell the ExecutionGraph that
// we take one step back on the road to reaching global FINISHED
if (oldState == FINISHED) {
getExecutionGraph().vertexUnFinished();
}
//TODO: set this index according to checkpoint when batch support checkpoint.
inputSplitIndexMap.clear();
// Reset intermediate results
for (IntermediateResultPartition resultPartition : resultPartitions.values()) {
resultPartition.resetForNewExecution();
}
return newExecution;
}
else {
throw new IllegalStateException("Cannot reset a vertex that is in non-terminal state " + oldState);
}
}
}
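// Illustrative scenario for the optimistic concurrency check above (not from the original source):
// a regional failover computed against global mod version 3 calls resetForNewExecution(..., 3);
// if a full restart has meanwhile bumped the graph to version 4, the call throws
// GlobalModVersionMismatch and the stale recovery action is discarded instead of racing the new one.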
/**
* Schedules the current execution of this ExecutionVertex.
*
* @param slotProvider to allocate the slots from
* @param queued if the allocation can be queued
* @param locationPreferenceConstraint constraint for the location preferences
* @return Future which is completed once the execution is deployed. The future
*         can also be completed exceptionally.
*/
public CompletableFuture<Void> scheduleForExecution(
SlotProvider slotProvider,
boolean queued,
LocationPreferenceConstraint locationPreferenceConstraint) {
return this.currentExecution.scheduleForExecution(
slotProvider,
queued,
locationPreferenceConstraint);
}
@VisibleForTesting
public void deployToSlot(SimpleSlot slot) throws JobException {
if (this.currentExecution.tryAssignResource(slot)) {
this.currentExecution.deploy();
} else {
throw new IllegalStateException("Could not assign resource " + slot + " to current execution " +
currentExecution + '.');
}
}
/**
* Cancels the current execution of this ExecutionVertex.
*
* @return A future that completes once the execution has reached its final state.
*/
public CompletableFuture<?> cancel() {
// to avoid any case of mixup in the presence of concurrent calls,
// we copy a reference to the stack to make sure both calls go to the same Execution.
final Execution exec = this.currentExecution;
exec.cancel();
return exec.getReleaseFuture();
}
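// Illustrative usage (hypothetical caller, not part of this class): recovery code typically chains
// on the returned future, e.g. vertex.cancel().thenRun(() -> resetAndRestart()), so that a new
// attempt is only created after the previous attempt's slot and partitions have been released.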
public void stop() {
this.currentExecution.stop();
}
public void fail(Throwable t) {
this.currentExecution.fail(t);
}
/**
* Schedules or updates the consumer tasks of the result partition with the given ID.
*/
void scheduleOrUpdateConsumers(ResultPartitionID partitionId) {
final Execution execution = currentExecution;
// Abort this request if there was a concurrent reset
if (!partitionId.getProducerId().equals(execution.getAttemptId())) {
return;
}
final IntermediateResultPartition partition = resultPartitions.get(partitionId.getPartitionId());
if (partition == null) {
throw new IllegalStateException("Unknown partition " + partitionId + ".");
}
if (partition.getIntermediateResult().getResultType().isPipelined()) {
// Schedule or update receivers of this partition
partition.markDataProduced();
// Notify the scheduler to handle the consumable partition
notifyAndUpdateConsumers(partition);
}
else {
throw new IllegalArgumentException("ScheduleOrUpdateConsumers msg is only valid for " +
"pipelined partitions.");
}
}
protected void notifyAndUpdateConsumers(IntermediateResultPartition partition) {
getExecutionGraph().getGraphManager().notifyResultPartitionConsumable(
getExecutionVertexID(),
partition.getIntermediateResult().getId(),
partition.getPartitionNumber(),
getCurrentAssignedResourceLocation());
getExecutionGraph().getFutureExecutor().execute(() -> {
currentExecution.updateConsumers(partition.getConsumers());
});
}
public void cachePartitionInfo(PartialInputChannelDeploymentDescriptor partitionInfo){
getCurrentExecutionAttempt().cachePartitionInfo(partitionInfo);
}
void clearAssignedInputSplits() {
assignedInputSplitsMap.clear();
inputSplitIndexMap.clear();
}
/**
* Finish all blocking result partitions whose receivers can be scheduled/updated and notify.
*/
void finishPartitionsAndNotify() {
for (IntermediateResultPartition partition : resultPartitions.values()) {
partition.markFinished();
// Blocking partitions are consumable on finished
if (partition.getResultType().isBlocking()) {
notifyAndUpdateConsumers(partition);
}
}
}
void resetResultPartitionID(ResultPartitionID[] partitionIds) {
Map<IntermediateResultPartitionID, IntermediateResultPartition> newResultPartitions =
new LinkedHashMap<>(resultPartitions.size());
Iterator<IntermediateResultPartition> iterator = resultPartitions.values().iterator();
for (int i = 0; i < resultPartitions.size(); i++) {
IntermediateResultPartition resultPartition = iterator.next();
IntermediateResultPartitionID originId = resultPartition.getPartitionId();
resultPartition.setPartitionId(partitionIds[i].getPartitionId());
resultPartition.getIntermediateResult().resetLookupHelper(originId, partitionIds[i].getPartitionId());
newResultPartitions.put(resultPartition.getPartitionId(), resultPartition);
}
this.resultPartitions = newResultPartitions;
}
// The following two methods are added for region failover.
// record the input split assigned to this task
public void inputSplitAssigned(OperatorID operatorID, InputSplit inputSplit) {
assignedInputSplitsMap.putIfAbsent(operatorID, new LinkedList<>());
assignedInputSplitsMap.get(operatorID).add(inputSplit);
inputSplitIndexMap.put(operatorID, inputSplitIndexMap.getOrDefault(operatorID, 0) + 1);
Preconditions.checkArgument(inputSplitIndexMap.get(operatorID) == assignedInputSplitsMap.get(operatorID).size());
}
public InputSplit getNextInputSplitFromAssgined(OperatorID operatorID) {
List<InputSplit> assignedInputSplits = assignedInputSplitsMap.getOrDefault(operatorID, Collections.emptyList());
Integer inputSplitIndex = inputSplitIndexMap.getOrDefault(operatorID, 0);
if (assignedInputSplits.isEmpty() || inputSplitIndex >= assignedInputSplits.size()) {
return null;
}
InputSplit split = assignedInputSplits.get(inputSplitIndex++);
inputSplitIndexMap.put(operatorID, inputSplitIndex);
return split;
}
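// Illustrative replay sequence (hypothetical splits, not from the original source): during normal
// execution inputSplitAssigned(op, split0) and inputSplitAssigned(op, split1) record the assignment
// order; after a regional failover, inputSplitIndexMap is cleared and the restarted attempt calls
// getNextInputSplitFromAssgined(op), receiving split0, then split1, then null, so it re-reads
// exactly the data of the failed attempt.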
public Map<OperatorID, List<InputSplit>> getAssignedInputSplits() {
return Collections.unmodifiableMap(assignedInputSplitsMap);
}
/**
* Recover the execution vertex status after job master failover.
*
* @param state The state in the log.
* @param assignedInputSplits The assigned input splits of a finished execution.
* @param resultDescriptor The result information of a finished execution.
*/
public void recoverStatus(
ExecutionState state,
Map<OperatorID, List<InputSplit>> assignedInputSplits,
ResultDescriptor resultDescriptor) {
if (!ExecutionState.FINISHED.equals(state) &&
(assignedInputSplits != null && resultDescriptor != null)) {
throw new FlinkRuntimeException("Can not assign input split or result partition when execution is " + state);
}
switch (state) {
case FINISHED:
currentExecution.getTaskManagerLocationFuture().complete(resultDescriptor.getTaskManagerLocation());
resetResultPartitionID(resultDescriptor.getResultPartitionIds());
currentExecution.markFinished();
if (assignedInputSplits != null) {
assignedInputSplitsMap.clear();
assignedInputSplitsMap.putAll(assignedInputSplits);
for (Map.Entry<OperatorID, List<InputSplit>> opToInputs : assignedInputSplits.entrySet()) {
getJobVertex().getSplitAssigner(opToInputs.getKey()).inputSplitsAssigned(subTaskIndex, opToInputs.getValue());
}
}
break;
case RUNNING:
currentExecution.switchToRunning();
break;
case DEPLOYING:
currentExecution.recoverState(state);
break;
default:
throw new FlinkRuntimeException("Replaying the state " + state + " is not supported");
}
}
/**
* Recover the pipelined result partition consumable status after job master failover.
*
* @param resultId The intermediate data set id in the log.
* @param location The task manager location where the producing execution runs.
*/
public void recoverResultPartitionStatus(
IntermediateDataSetID resultId,
TaskManagerLocation location) {
IntermediateResultPartition partitionToRecover = null;
for (IntermediateResultPartition irp : getProducedPartitions().values()) {
if (irp.getIntermediateResult().getId().equals(resultId)) {
partitionToRecover = irp;
}
}
if (partitionToRecover == null) {
throw new FlinkRuntimeException("Can not find the intermediate result " + resultId + " on " + getTaskNameWithSubtaskIndex());
}
if (!(ExecutionState.RUNNING.equals(currentExecution.getState()) &&
partitionToRecover.getResultType().isPipelined())) {
throw new FlinkRuntimeException("Invalid state " + currentExecution.getState() + " for " + getTaskNameWithSubtaskIndex());
}
currentExecution.getTaskManagerLocationFuture().complete(location);
scheduleOrUpdateConsumers(new ResultPartitionID(partitionToRecover.getPartitionId(), currentExecution.getAttemptId()));
}
/**
* Reconcile the execution with the running info reported by the task executor.
*
* @param state The execution state reported by the task executor.
* @param executionId The attempt id of the reported execution.
* @param attemptNumber The attempt number of the reported execution.
* @param startTimestamp The start timestamp of the reported execution.
* @param partitionIds The result partition ids of the reported execution.
* @param partitionsConsumable Whether each reported pipelined partition already has consumable data.
* @param assignedInputSplits The input splits that were already assigned to the reported execution.
* @param slot The slot in which the reported execution is running.
* @return true if the reconciliation succeeded, false otherwise.
*/
public boolean reconcileExecution(
ExecutionState state,
ExecutionAttemptID executionId,
int attemptNumber,
long startTimestamp,
ResultPartitionID[] partitionIds,
boolean[] partitionsConsumable,
Map<OperatorID, List<InputSplit>> assignedInputSplits,
LogicalSlot slot) {
LOG.debug("Reconcile execution vertex {} for current execution.", getTaskNameWithSubtaskIndex());
if (resultPartitions.size() != partitionIds.length) {
LOG.info("Reconcile execution failed due to partition number with actual {}, expect {}.",
partitionIds.length, resultPartitions.size());
return false;
}
// first, update the IntermediateResultPartition and reset the map
resetResultPartitionID(partitionIds);
// second, update the pipelined partition info to its consumers.
for (int i = 0; i < partitionIds.length; i++) {
IntermediateResultPartition partition = resultPartitions.get(partitionIds[i].getPartitionId());
if (partition.getResultType().isPipelined()) {
if (partition.hasDataProduced() != partitionsConsumable[i]) {
LOG.info("Reconcile execution {} failed due to partition {} consumable not equals to {}.",
getTaskNameWithSubtaskIndex(), partition.getPartitionId(), partition.hasDataProduced());
currentExecution.getReconcileFuture().complete(currentExecution.getAttemptId());
return false;
}
}
}
// third, reset execution basic information
getExecutionGraph().deregisterExecution(currentExecution);
if (currentExecution.reconcileStatus(state, executionId, attemptNumber, startTimestamp, slot)) {
getExecutionGraph().registerExecution(currentExecution);
// fourth, build the input split map
inputSplitIndexMap.clear();
assignedInputSplitsMap.clear();
for (Map.Entry<OperatorID, List<InputSplit>> opToInputs : assignedInputSplits.entrySet()) {
for (InputSplit inputSplit : opToInputs.getValue()) {
inputSplitAssigned(opToInputs.getKey(), inputSplit);
}
getJobVertex().getSplitAssigner(opToInputs.getKey()).inputSplitsAssigned(subTaskIndex, opToInputs.getValue());
}
return true;
}
else {
getExecutionGraph().registerExecution(currentExecution);
return false;
}
}
// --------------------------------------------------------------------------------------------
// Notifications from the Execution Attempt
// --------------------------------------------------------------------------------------------
void executionFinished(Execution execution) {
getExecutionGraph().vertexFinished();
}
void executionCanceled(Execution execution) {
// nothing to do
}
void executionFailed(Execution execution, Throwable cause) {
// nothing to do
}
// --------------------------------------------------------------------------------------------
// Miscellaneous
// --------------------------------------------------------------------------------------------
/**
* Simply forward this notification.
*/
void notifyStateTransition(Execution execution, ExecutionState newState, Throwable error) {
// only forward this notification if the execution is still the current execution
// otherwise we have an outdated execution
if (currentExecution == execution) {
getExecutionGraph().notifyExecutionChange(execution, newState, error);
}
}
/**
* Creates a task deployment descriptor to deploy a subtask to the given target slot.
*
* TODO: This should actually be in the EXECUTION
*/
TaskDeploymentDescriptor createDeploymentDescriptor(
ExecutionAttemptID executionId,
LogicalSlot targetSlot,
@Nullable JobManagerTaskRestore taskRestore,
int attemptNumber) throws ExecutionGraphException {
// Produced intermediate results
List<ResultPartitionDeploymentDescriptor> producedPartitions = new ArrayList<>(resultPartitions.size());
// Consumed intermediate results
List<InputGateDeploymentDescriptor> consumedPartitions = new ArrayList<>(inputEdges.length);
boolean lazyScheduling = getExecutionGraph().isLazyDeploymentAllowed();
for (IntermediateResultPartition partition : resultPartitions.values()) {
List<List<ExecutionEdge>> consumers = partition.getConsumers();
if (consumers.isEmpty()) {
//TODO this case only exists for test, currently there has to be exactly one consumer in real jobs!
producedPartitions.add(ResultPartitionDeploymentDescriptor.from(
partition,
KeyGroupRangeAssignment.UPPER_BOUND_MAX_PARALLELISM,
lazyScheduling));
} else {
Preconditions.checkState(1 == consumers.size(),
"Only one consumer supported in the current implementation! Found: " + consumers.size());
List<ExecutionEdge> consumer = consumers.get(0);
ExecutionJobVertex vertex = consumer.get(0).getTarget().getJobVertex();
int maxParallelism = vertex.getMaxParallelism();
producedPartitions.add(ResultPartitionDeploymentDescriptor.from(partition, maxParallelism, lazyScheduling));
}
}
ResultPartitionLocationTrackerProxy resultPartitionLocationTrackerProxy =
currentExecution.getVertex().getExecutionGraph().getResultPartitionLocationTrackerProxy();
for (ExecutionEdge[] edges : inputEdges) {
InputChannelDeploymentDescriptor[] partitions = InputChannelDeploymentDescriptor.fromEdges(
resultPartitionLocationTrackerProxy,
edges,
targetSlot.getTaskManagerLocation(),
lazyScheduling);
// If the produced partition has multiple consumers registered, we
// need to request the one matching our sub task index.
// TODO Refactor after removing the consumers from the intermediate result partitions
int numConsumerEdges = edges[0].getSource().getConsumers().get(0).size();
int queueToRequest = subTaskIndex % numConsumerEdges;
IntermediateResult consumedIntermediateResult = edges[0].getSource().getIntermediateResult();
final IntermediateDataSetID resultId = consumedIntermediateResult.getId();
final ResultPartitionType partitionType = consumedIntermediateResult.getResultType();
consumedPartitions.add(new InputGateDeploymentDescriptor(resultId, partitionType, queueToRequest, partitions));
}
final Either<SerializedValue<JobInformation>, PermanentBlobKey> jobInformationOrBlobKey = getExecutionGraph().getJobInformationOrBlobKey();
final TaskDeploymentDescriptor.MaybeOffloaded<JobInformation> serializedJobInformation;
if (jobInformationOrBlobKey.isLeft()) {
serializedJobInformation = new TaskDeploymentDescriptor.NonOffloaded<>(jobInformationOrBlobKey.left());
} else {
serializedJobInformation = new TaskDeploymentDescriptor.Offloaded<>(jobInformationOrBlobKey.right());
}
final Either<SerializedValue<TaskInformation>, PermanentBlobKey> taskInformationOrBlobKey;
try {
taskInformationOrBlobKey = jobVertex.getTaskInformationOrBlobKey();
} catch (IOException e) {
throw new ExecutionGraphException(
"Could not create a serialized JobVertexInformation for " +
jobVertex.getJobVertexId(), e);
}
final TaskDeploymentDescriptor.MaybeOffloaded<TaskInformation> serializedTaskInformation;
if (taskInformationOrBlobKey.isLeft()) {
serializedTaskInformation = new TaskDeploymentDescriptor.NonOffloaded<>(taskInformationOrBlobKey.left());
} else {
serializedTaskInformation = new TaskDeploymentDescriptor.Offloaded<>(taskInformationOrBlobKey.right());
}
return new TaskDeploymentDescriptor(
getJobId(),
serializedJobInformation,
serializedTaskInformation,
executionId,
targetSlot.getAllocationId(),
subTaskIndex,
attemptNumber,
targetSlot.getPhysicalSlotNumber(),
createTimestamp,
taskRestore,
producedPartitions,
consumedPartitions);
}
// --------------------------------------------------------------------------------------------
// Utilities
// --------------------------------------------------------------------------------------------
@Override
public String toString() {
return getTaskNameWithSubtaskIndex();
}
@Override
public ArchivedExecutionVertex archive() {
return new ArchivedExecutionVertex(this);
}
}