// Source: org.apache.flink.runtime.scheduler.adaptive.Executing
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.runtime.scheduler.adaptive;
import org.apache.flink.api.common.JobStatus;
import org.apache.flink.core.execution.SavepointFormatType;
import org.apache.flink.runtime.JobException;
import org.apache.flink.runtime.checkpoint.CheckpointScheduling;
import org.apache.flink.runtime.checkpoint.CheckpointStatsListener;
import org.apache.flink.runtime.checkpoint.CompletedCheckpoint;
import org.apache.flink.runtime.execution.ExecutionState;
import org.apache.flink.runtime.executiongraph.AccessExecutionGraph;
import org.apache.flink.runtime.executiongraph.AccessExecutionJobVertex;
import org.apache.flink.runtime.executiongraph.ArchivedExecutionGraph;
import org.apache.flink.runtime.executiongraph.ExecutionGraph;
import org.apache.flink.runtime.executiongraph.ExecutionJobVertex;
import org.apache.flink.runtime.executiongraph.ExecutionVertex;
import org.apache.flink.runtime.scheduler.ExecutionGraphHandler;
import org.apache.flink.runtime.scheduler.OperatorCoordinatorHandler;
import org.apache.flink.runtime.scheduler.adaptive.allocator.VertexParallelism;
import org.apache.flink.runtime.scheduler.exceptionhistory.ExceptionHistoryEntry;
import org.apache.flink.runtime.scheduler.stopwithsavepoint.StopWithSavepointTerminationManager;
import org.apache.flink.util.Preconditions;
import org.slf4j.Logger;
import javax.annotation.Nullable;
import java.time.Duration;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.function.Function;
import java.util.stream.Collectors;
/** State which represents a running job with an {@link ExecutionGraph} and assigned slots. */
class Executing extends StateWithExecutionGraph
implements ResourceListener, StateTransitionManager.Context, CheckpointStatsListener {
private final Context context;
private final StateTransitionManager stateTransitionManager;
private final int rescaleOnFailedCheckpointCount;
// null indicates that there was no change event observed, yet
@Nullable private AtomicInteger failedCheckpointCountdown;
/**
 * Creates the {@code Executing} state for a job whose execution graph is already RUNNING.
 *
 * <p>Restored the generic type parameters ({@code List<ExceptionHistoryEntry>},
 * {@code Function<StateTransitionManager.Context, StateTransitionManager>}) that were lost
 * in the raw-typed version; raw {@code Function#apply} returns {@code Object} and would not
 * assign to the {@code StateTransitionManager} field.
 *
 * @param executionGraph the (already running) execution graph
 * @param executionGraphHandler handler for execution-graph related operations
 * @param operatorCoordinatorHandler handler for operator coordinators
 * @param logger logger of the owning scheduler
 * @param context side-effect context used to transition into other states
 * @param userCodeClassLoader class loader for user code
 * @param failureCollection failures collected so far for the exception history
 * @param stateTransitionManagerFactory factory creating the manager that decides when to rescale
 * @param rescaleOnFailedCheckpointCount number of consecutive failed checkpoints after which a
 *     pending rescale is forced; must be &gt; 0
 */
Executing(
        ExecutionGraph executionGraph,
        ExecutionGraphHandler executionGraphHandler,
        OperatorCoordinatorHandler operatorCoordinatorHandler,
        Logger logger,
        Context context,
        ClassLoader userCodeClassLoader,
        List<ExceptionHistoryEntry> failureCollection,
        Function<StateTransitionManager.Context, StateTransitionManager>
                stateTransitionManagerFactory,
        int rescaleOnFailedCheckpointCount) {
    super(
            context,
            executionGraph,
            executionGraphHandler,
            operatorCoordinatorHandler,
            logger,
            userCodeClassLoader,
            failureCollection);
    this.context = context;
    Preconditions.checkState(
            executionGraph.getState() == JobStatus.RUNNING, "Assuming running execution graph");
    this.stateTransitionManager = stateTransitionManagerFactory.apply(this);
    Preconditions.checkArgument(
            rescaleOnFailedCheckpointCount > 0,
            "The rescaleOnFailedCheckpointCount should be larger than 0.");
    this.rescaleOnFailedCheckpointCount = rescaleOnFailedCheckpointCount;
    // null indicates no failed-checkpoint countdown is active yet
    this.failedCheckpointCountdown = null;
    deploy();
    // check if new resources have come available in the meantime
    context.runIfState(
            this,
            () -> {
                stateTransitionManager.onChange();
                stateTransitionManager.onTrigger();
            },
            Duration.ZERO);
}
@Override
public boolean hasSufficientResources() {
    // Resources only matter if acting on them would actually change some vertex's parallelism;
    // otherwise a rescale would be a pointless restart.
    if (!parallelismChanged()) {
        return false;
    }
    return context.hasSufficientResources();
}
@Override
public boolean hasDesiredResources() {
    // Same guard as hasSufficientResources(): without a parallelism change there is
    // nothing to gain from the desired resources being available.
    if (!parallelismChanged()) {
        return false;
    }
    return context.hasDesiredResources();
}
/**
 * Returns {@code true} iff the parallelism that could be achieved with the currently
 * available slots differs from the running graph's parallelism for at least one vertex.
 * Returns {@code false} when no availability information has been reported yet.
 */
private boolean parallelismChanged() {
    final VertexParallelism runningParallelism =
            extractCurrentVertexParallelism(getExecutionGraph());
    return context.getAvailableVertexParallelism()
            .map(available -> differsForAnyVertex(runningParallelism, available))
            .orElse(false);
}

/** Checks whether any vertex of {@code available} has a parallelism differing from {@code running}. */
private static boolean differsForAnyVertex(
        VertexParallelism running, VertexParallelism available) {
    return available.getVertices().stream()
            .anyMatch(
                    vertex ->
                            running.getParallelism(vertex) != available.getParallelism(vertex));
}
/**
 * Snapshots the parallelism of every job vertex in the given graph as a
 * {@link VertexParallelism} (job vertex id -> current parallelism).
 */
private static VertexParallelism extractCurrentVertexParallelism(
        AccessExecutionGraph executionGraph) {
    return new VertexParallelism(
            executionGraph.getAllVertices().values().stream()
                    .collect(
                            Collectors.toMap(
                                    vertex -> vertex.getJobVertexId(),
                                    vertex -> vertex.getParallelism())));
}
@Override
public ScheduledFuture> scheduleOperation(Runnable callback, Duration delay) {
return context.runIfState(this, callback, delay);
}
/**
 * Leaves this state by restarting the job immediately; the restart picks up the changed
 * resource situation (rescaling is implemented as an immediate restart).
 */
@Override
public void transitionToSubsequentState() {
    context.goToRestarting(
            getExecutionGraph(),
            getExecutionGraphHandler(),
            getOperatorCoordinatorHandler(),
            // no backoff: restart right away (Duration.ofMillis(0L) is the same ZERO instance)
            Duration.ZERO,
            true,
            getFailures());
}
/** While the scheduler is in this state, the job is by definition RUNNING. */
@Override
public JobStatus getJobStatus() {
    return JobStatus.RUNNING;
}
/** Transitions into the Canceling state, handing over the current graph and handlers. */
@Override
public void cancel() {
    context.goToCanceling(
            getExecutionGraph(),
            getExecutionGraphHandler(),
            getOperatorCoordinatorHandler(),
            getFailures());
}
@Override
void onFailure(Throwable cause, CompletableFuture
// NOTE: source truncated here by the page scraper; the remainder of onFailure(...) and the rest of the class are missing.