// org.apache.flink.runtime.dispatcher.cleanup.CheckpointResourcesCleanupRunner Maven / Gradle / Ivy
// The newest version!
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.runtime.dispatcher.cleanup;
import org.apache.flink.api.common.JobID;
import org.apache.flink.api.common.JobStatus;
import org.apache.flink.configuration.CheckpointingOptions;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.core.execution.RecoveryClaimMode;
import org.apache.flink.runtime.checkpoint.CheckpointIDCounter;
import org.apache.flink.runtime.checkpoint.CheckpointRecoveryFactory;
import org.apache.flink.runtime.checkpoint.CheckpointsCleaner;
import org.apache.flink.runtime.checkpoint.CompletedCheckpointStore;
import org.apache.flink.runtime.checkpoint.DefaultCompletedCheckpointStoreUtils;
import org.apache.flink.runtime.dispatcher.JobCancellationFailedException;
import org.apache.flink.runtime.dispatcher.UnavailableDispatcherOperationException;
import org.apache.flink.runtime.executiongraph.ArchivedExecutionGraph;
import org.apache.flink.runtime.jobmaster.JobManagerRunner;
import org.apache.flink.runtime.jobmaster.JobManagerRunnerResult;
import org.apache.flink.runtime.jobmaster.JobMaster;
import org.apache.flink.runtime.jobmaster.JobMasterGateway;
import org.apache.flink.runtime.jobmaster.JobResult;
import org.apache.flink.runtime.messages.Acknowledge;
import org.apache.flink.runtime.messages.webmonitor.JobDetails;
import org.apache.flink.runtime.scheduler.ExecutionGraphInfo;
import org.apache.flink.runtime.state.SharedStateRegistryFactory;
import org.apache.flink.util.ExceptionUtils;
import org.apache.flink.util.Preconditions;
import org.apache.flink.util.concurrent.FutureUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.time.Duration;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.CompletionException;
import java.util.concurrent.Executor;
/**
 * {@code CheckpointResourcesCleanupRunner} implements {@link JobManagerRunner} in such a way that
 * only the checkpoint-related resources are instantiated. It triggers the job-specific cleanup
 * that is usually performed by the {@link JobMaster}, without rebuilding the corresponding {@link
 * org.apache.flink.runtime.executiongraph.ExecutionGraph}.
 *
 * <p>The cleanup is chained onto the result future, which this class completes in {@link
 * #start()}. {@link #closeAsync()} only exposes the future of that cleanup; it does not trigger
 * it.
 */
public class CheckpointResourcesCleanupRunner implements JobManagerRunner {

    private static final Logger LOG =
            LoggerFactory.getLogger(CheckpointResourcesCleanupRunner.class);

    private final JobResult jobResult;
    private final CheckpointRecoveryFactory checkpointRecoveryFactory;
    private final CheckpointsCleaner checkpointsCleaner;
    private final SharedStateRegistryFactory sharedStateRegistryFactory;
    private final Configuration jobManagerConfiguration;
    private final Executor cleanupExecutor;

    private final long initializationTimestamp;

    /** Completes once the checkpoint cleanup and the closing of the cleaner have finished. */
    private final CompletableFuture<Void> cleanupFuture;

    /** Completed in {@link #start()} with a result derived from {@link #jobResult}. */
    private final CompletableFuture<JobManagerRunnerResult> resultFuture;

    /**
     * Creates a runner that cleans up the checkpoint-related resources of an already finished job.
     *
     * @param jobResult result of the job whose checkpoint resources are cleaned up
     * @param checkpointRecoveryFactory factory used to access the job's completed checkpoint store
     *     and checkpoint ID counter
     * @param sharedStateRegistryFactory factory handed to the recovered completed checkpoint store
     * @param jobManagerConfiguration configuration providing the checkpointing options
     * @param cleanupExecutor executor on which the cleanup is run
     * @param initializationTimestamp timestamp put into the generated {@link ExecutionGraphInfo}
     * @throws NullPointerException if any of the reference-typed arguments is {@code null}
     */
    public CheckpointResourcesCleanupRunner(
            JobResult jobResult,
            CheckpointRecoveryFactory checkpointRecoveryFactory,
            SharedStateRegistryFactory sharedStateRegistryFactory,
            Configuration jobManagerConfiguration,
            Executor cleanupExecutor,
            long initializationTimestamp) {
        this.jobResult = Preconditions.checkNotNull(jobResult);
        this.checkpointRecoveryFactory = Preconditions.checkNotNull(checkpointRecoveryFactory);
        this.sharedStateRegistryFactory = Preconditions.checkNotNull(sharedStateRegistryFactory);
        this.jobManagerConfiguration = Preconditions.checkNotNull(jobManagerConfiguration);
        this.cleanupExecutor = Preconditions.checkNotNull(cleanupExecutor);
        this.initializationTimestamp = initializationTimestamp;

        this.checkpointsCleaner =
                new CheckpointsCleaner(
                        jobManagerConfiguration.get(CheckpointingOptions.CLEANER_PARALLEL_MODE));

        this.resultFuture = new CompletableFuture<>();
        // The cleanup is only kicked off once the result future has been completed.
        this.cleanupFuture = resultFuture.thenCompose(ignored -> runCleanupAsync());
    }

    /**
     * Runs {@link #cleanupCheckpoints()} on the cleanup executor and closes the checkpoints
     * cleaner afterwards.
     *
     * @return future that completes when the cleaner has been closed, or exceptionally if the
     *     cleanup failed
     */
    private CompletableFuture<Void> runCleanupAsync() {
        return CompletableFuture.runAsync(
                        () -> {
                            try {
                                cleanupCheckpoints();
                            } catch (Exception e) {
                                // Runnable cannot throw checked exceptions; surface the failure
                                // through the future instead.
                                throw new CompletionException(e);
                            }
                        },
                        cleanupExecutor)
                .thenCompose(ignore -> checkpointsCleaner.closeAsync());
    }

    /**
     * Returns the future of the cleanup that is chained onto the result future.
     *
     * @return the cleanup future
     */
    @Override
    public CompletableFuture<Void> closeAsync() {
        return cleanupFuture;
    }

    /**
     * Completes the result future successfully with an {@link ExecutionGraphInfo} derived from
     * the job result, which in turn triggers the chained cleanup.
     */
    @Override
    public void start() throws Exception {
        resultFuture.complete(
                JobManagerRunnerResult.forSuccess(createExecutionGraphInfoFromJobResult()));
    }

    /**
     * Shuts down the completed checkpoint store and the checkpoint ID counter of the job.
     *
     * <p>Both shutdowns are attempted even if the first one fails; the failures are combined via
     * {@link ExceptionUtils#firstOrSuppressed(Throwable, Throwable)} and rethrown at the end.
     *
     * @throws Exception if creating one of the components or shutting it down failed
     */
    private void cleanupCheckpoints() throws Exception {
        final CompletedCheckpointStore completedCheckpointStore = createCompletedCheckpointStore();
        final CheckpointIDCounter checkpointIDCounter = createCheckpointIDCounter();

        Exception exception = null;
        try {
            completedCheckpointStore.shutdown(getJobStatus(), checkpointsCleaner);
        } catch (Exception e) {
            exception = e;
        }

        try {
            checkpointIDCounter.shutdown(getJobStatus()).get();
        } catch (Exception e) {
            if (e instanceof InterruptedException) {
                // Do not swallow the interruption: restore the flag for the executing thread.
                Thread.currentThread().interrupt();
            }
            exception = ExceptionUtils.firstOrSuppressed(e, exception);
        }

        if (exception != null) {
            throw exception;
        }
    }

    /** Creates the recovered {@link CompletedCheckpointStore} for the job being cleaned up. */
    private CompletedCheckpointStore createCompletedCheckpointStore() throws Exception {
        return checkpointRecoveryFactory.createRecoveredCompletedCheckpointStore(
                getJobID(),
                DefaultCompletedCheckpointStoreUtils.getMaximumNumberOfRetainedCheckpoints(
                        jobManagerConfiguration, LOG),
                sharedStateRegistryFactory,
                cleanupExecutor,
                // Using RecoveryClaimMode.CLAIM to be able to discard shared state, if any.
                // Note that it also means that the original shared state might be discarded as well
                // because the initial checkpoint might be subsumed.
                RecoveryClaimMode.CLAIM);
    }

    /** Creates the {@link CheckpointIDCounter} for the job being cleaned up. */
    private CheckpointIDCounter createCheckpointIDCounter() throws Exception {
        return checkpointRecoveryFactory.createCheckpointIDCounter(getJobID());
    }

    /**
     * Not supported by this runner.
     *
     * @return a future failed with an {@link UnavailableDispatcherOperationException}
     */
    @Override
    public CompletableFuture<JobMasterGateway> getJobMasterGateway() {
        return FutureUtils.completedExceptionally(
                new UnavailableDispatcherOperationException(
                        "Unable to get JobMasterGateway for job in cleanup phase. The requested operation is not available in that stage."));
    }

    /**
     * Returns the result future, which is completed in {@link #start()}.
     *
     * @return the result future
     */
    @Override
    public CompletableFuture<JobManagerRunnerResult> getResultFuture() {
        return resultFuture;
    }

    /**
     * Returns the ID of the job taken from the job result.
     *
     * @return the job ID
     */
    @Override
    public JobID getJobID() {
        return jobResult.getJobId();
    }

    /**
     * Not supported by this runner.
     *
     * @param timeout ignored
     * @return a future failed with a {@link JobCancellationFailedException}
     */
    @Override
    public CompletableFuture<Acknowledge> cancel(Duration timeout) {
        return FutureUtils.completedExceptionally(
                new JobCancellationFailedException("Cleanup tasks are not meant to be cancelled."));
    }

    /**
     * Returns the job status derived from the job result.
     *
     * @param timeout ignored; the status is available immediately
     * @return an already completed future holding the job status
     */
    @Override
    public CompletableFuture<JobStatus> requestJobStatus(Duration timeout) {
        return CompletableFuture.completedFuture(getJobStatus());
    }

    /**
     * Returns the job details built from the archived execution graph of {@link
     * #requestJob(Duration)}.
     *
     * @param timeout forwarded to {@link #requestJob(Duration)}
     * @return future holding the job details
     */
    @Override
    public CompletableFuture<JobDetails> requestJobDetails(Duration timeout) {
        return requestJob(timeout)
                .thenApply(
                        executionGraphInfo ->
                                JobDetails.createDetailsForJob(
                                        executionGraphInfo.getArchivedExecutionGraph()));
    }

    /**
     * Returns an {@link ExecutionGraphInfo} generated from the job result.
     *
     * @param timeout ignored; the information is available immediately
     * @return an already completed future holding the execution graph information
     */
    @Override
    public CompletableFuture<ExecutionGraphInfo> requestJob(Duration timeout) {
        return CompletableFuture.completedFuture(createExecutionGraphInfoFromJobResult());
    }

    /**
     * This runner always reports itself as initialized.
     *
     * @return {@code true}
     */
    @Override
    public boolean isInitialized() {
        return true;
    }

    private ExecutionGraphInfo createExecutionGraphInfoFromJobResult() {
        return generateExecutionGraphInfo(jobResult, initializationTimestamp);
    }

    private JobStatus getJobStatus() {
        return getJobStatus(jobResult);
    }

    /** Derives the {@link JobStatus} from the application status stored in the job result. */
    private static JobStatus getJobStatus(JobResult jobResult) {
        return jobResult.getApplicationStatus().deriveJobStatus();
    }

    /**
     * Builds an {@link ExecutionGraphInfo} around a sparse archived execution graph that only
     * carries what the job result provides: job ID, derived job status and the serialized
     * throwable (if any), together with the given initialization timestamp.
     */
    private static ExecutionGraphInfo generateExecutionGraphInfo(
            JobResult jobResult, long initializationTimestamp) {
        return new ExecutionGraphInfo(
                ArchivedExecutionGraph.createSparseArchivedExecutionGraph(
                        jobResult.getJobId(),
                        "unknown",
                        getJobStatus(jobResult),
                        null,
                        jobResult.getSerializedThrowable().orElse(null),
                        null,
                        initializationTimestamp));
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy