
// org.apache.flink.client.deployment.application.ApplicationDispatcherBootstrap (Maven / Gradle / Ivy)
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.client.deployment.application;
import org.apache.flink.annotation.Internal;
import org.apache.flink.annotation.VisibleForTesting;
import org.apache.flink.api.common.JobID;
import org.apache.flink.client.ClientUtils;
import org.apache.flink.client.cli.ClientOptions;
import org.apache.flink.client.deployment.application.executors.EmbeddedExecutor;
import org.apache.flink.client.deployment.application.executors.EmbeddedExecutorServiceLoader;
import org.apache.flink.client.program.PackagedProgram;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.configuration.DeploymentOptions;
import org.apache.flink.configuration.HighAvailabilityOptions;
import org.apache.flink.configuration.PipelineOptionsInternal;
import org.apache.flink.core.execution.PipelineExecutorServiceLoader;
import org.apache.flink.runtime.client.DuplicateJobSubmissionException;
import org.apache.flink.runtime.clusterframework.ApplicationStatus;
import org.apache.flink.runtime.dispatcher.DispatcherBootstrap;
import org.apache.flink.runtime.dispatcher.DispatcherGateway;
import org.apache.flink.runtime.jobgraph.JobGraph;
import org.apache.flink.runtime.jobmanager.HighAvailabilityMode;
import org.apache.flink.runtime.jobmaster.JobResult;
import org.apache.flink.runtime.messages.Acknowledge;
import org.apache.flink.runtime.messages.FlinkJobNotFoundException;
import org.apache.flink.runtime.rpc.FatalErrorHandler;
import org.apache.flink.util.ExceptionUtils;
import org.apache.flink.util.Preconditions;
import org.apache.flink.util.concurrent.FutureUtils;
import org.apache.flink.util.concurrent.ScheduledExecutor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.time.Duration;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.CancellationException;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.CompletionException;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.TimeUnit;
import java.util.function.Function;
import java.util.stream.Collectors;
import static org.apache.flink.util.Preconditions.checkNotNull;
/**
* A {@link DispatcherBootstrap} used for running the user's {@code main()} in "Application Mode"
* (see FLIP-85).
*
* This dispatcher bootstrap submits the recovered {@link JobGraph job graphs} for re-execution
* (in case of recovery from a failure), and then submits the remaining jobs of the application for
* execution.
*
*
To achieve this, it works in conjunction with the {@link EmbeddedExecutor EmbeddedExecutor}
* which decides if it should submit a job for execution (in case of a new job) or the job was
* already recovered and is running.
*/
@Internal
public class ApplicationDispatcherBootstrap implements DispatcherBootstrap {
@VisibleForTesting static final String FAILED_JOB_NAME = "(application driver)";
private static final Logger LOG = LoggerFactory.getLogger(ApplicationDispatcherBootstrap.class);
private static boolean isCanceledOrFailed(ApplicationStatus applicationStatus) {
return applicationStatus == ApplicationStatus.CANCELED
|| applicationStatus == ApplicationStatus.FAILED;
}
private final PackagedProgram application;
private final Collection recoveredJobIds;
private final Configuration configuration;
private final FatalErrorHandler errorHandler;
private final CompletableFuture applicationCompletionFuture;
private final CompletableFuture bootstrapCompletionFuture;
private ScheduledFuture> applicationExecutionTask;
public ApplicationDispatcherBootstrap(
final PackagedProgram application,
final Collection recoveredJobIds,
final Configuration configuration,
final DispatcherGateway dispatcherGateway,
final ScheduledExecutor scheduledExecutor,
final FatalErrorHandler errorHandler) {
this.configuration = checkNotNull(configuration);
this.recoveredJobIds = checkNotNull(recoveredJobIds);
this.application = checkNotNull(application);
this.errorHandler = checkNotNull(errorHandler);
this.applicationCompletionFuture =
fixJobIdAndRunApplicationAsync(dispatcherGateway, scheduledExecutor);
this.bootstrapCompletionFuture = finishBootstrapTasks(dispatcherGateway);
}
@Override
public void stop() {
if (applicationExecutionTask != null) {
applicationExecutionTask.cancel(true);
}
if (applicationCompletionFuture != null) {
applicationCompletionFuture.cancel(true);
}
}
@VisibleForTesting
ScheduledFuture> getApplicationExecutionFuture() {
return applicationExecutionTask;
}
@VisibleForTesting
CompletableFuture getApplicationCompletionFuture() {
return applicationCompletionFuture;
}
@VisibleForTesting
CompletableFuture getBootstrapCompletionFuture() {
return bootstrapCompletionFuture;
}
/**
* Logs final application status and invokes error handler in case of unexpected failures.
* Optionally shuts down the given dispatcherGateway when the application completes (either
* successfully or in case of failure), depending on the corresponding config option.
*/
private CompletableFuture finishBootstrapTasks(
final DispatcherGateway dispatcherGateway) {
final CompletableFuture shutdownFuture =
applicationCompletionFuture
.handle(
(ignored, t) -> {
if (t == null) {
LOG.info("Application completed SUCCESSFULLY");
return finish(
dispatcherGateway, ApplicationStatus.SUCCEEDED);
}
final Optional maybeApplicationStatus =
extractApplicationStatus(t);
if (maybeApplicationStatus.isPresent()
&& isCanceledOrFailed(maybeApplicationStatus.get())) {
final ApplicationStatus applicationStatus =
maybeApplicationStatus.get();
LOG.info("Application {}: ", applicationStatus, t);
return finish(dispatcherGateway, applicationStatus);
}
if (t instanceof CancellationException) {
LOG.warn(
"Application has been cancelled because the {} is being stopped.",
ApplicationDispatcherBootstrap.class
.getSimpleName());
return CompletableFuture.completedFuture(Acknowledge.get());
}
LOG.warn("Application failed unexpectedly: ", t);
return FutureUtils.completedExceptionally(t);
})
.thenCompose(Function.identity());
FutureUtils.handleUncaughtException(shutdownFuture, (t, e) -> errorHandler.onFatalError(e));
return shutdownFuture;
}
private CompletableFuture finish(
DispatcherGateway dispatcherGateway, ApplicationStatus applicationStatus) {
boolean shouldShutDownOnFinish =
configuration.get(DeploymentOptions.SHUTDOWN_ON_APPLICATION_FINISH);
return shouldShutDownOnFinish
? dispatcherGateway.shutDownCluster(applicationStatus)
: CompletableFuture.completedFuture(Acknowledge.get());
}
private Optional extractApplicationStatus(Throwable t) {
final Optional maybeException =
ExceptionUtils.findThrowable(t, UnsuccessfulExecutionException.class);
return maybeException.map(UnsuccessfulExecutionException::getStatus);
}
private CompletableFuture fixJobIdAndRunApplicationAsync(
final DispatcherGateway dispatcherGateway, final ScheduledExecutor scheduledExecutor) {
final Optional configuredJobId =
configuration.getOptional(PipelineOptionsInternal.PIPELINE_FIXED_JOB_ID);
final boolean submitFailedJobOnApplicationError =
configuration.get(DeploymentOptions.SUBMIT_FAILED_JOB_ON_APPLICATION_ERROR);
if (!HighAvailabilityMode.isHighAvailabilityModeActivated(configuration)
&& !configuredJobId.isPresent()) {
return runApplicationAsync(
dispatcherGateway, scheduledExecutor, false, submitFailedJobOnApplicationError);
}
if (!configuredJobId.isPresent()) {
// In HA mode, we only support single-execute jobs at the moment. Here, we manually
// generate the job id, if not configured, from the cluster id to keep it consistent
// across failover.
configuration.set(
PipelineOptionsInternal.PIPELINE_FIXED_JOB_ID,
new JobID(
Preconditions.checkNotNull(
configuration.get(
HighAvailabilityOptions.HA_CLUSTER_ID))
.hashCode(),
0)
.toHexString());
}
return runApplicationAsync(
dispatcherGateway, scheduledExecutor, true, submitFailedJobOnApplicationError);
}
/**
* Runs the user program entrypoint by scheduling a task on the given {@code scheduledExecutor}.
* The returned {@link CompletableFuture} completes when all jobs of the user application
* succeeded. if any of them fails, or if job submission fails.
*/
private CompletableFuture runApplicationAsync(
final DispatcherGateway dispatcherGateway,
final ScheduledExecutor scheduledExecutor,
final boolean enforceSingleJobExecution,
final boolean submitFailedJobOnApplicationError) {
final CompletableFuture> applicationExecutionFuture = new CompletableFuture<>();
final Set tolerateMissingResult = Collections.synchronizedSet(new HashSet<>());
// we need to hand in a future as return value because we need to get those JobIs out
// from the scheduled task that executes the user program
applicationExecutionTask =
scheduledExecutor.schedule(
() ->
runApplicationEntryPoint(
applicationExecutionFuture,
tolerateMissingResult,
dispatcherGateway,
scheduledExecutor,
enforceSingleJobExecution,
submitFailedJobOnApplicationError),
0L,
TimeUnit.MILLISECONDS);
return applicationExecutionFuture.thenCompose(
jobIds ->
getApplicationResult(
dispatcherGateway,
jobIds,
tolerateMissingResult,
scheduledExecutor));
}
/**
* Runs the user program entrypoint and completes the given {@code jobIdsFuture} with the {@link
* JobID JobIDs} of the submitted jobs.
*
* This should be executed in a separate thread (or task).
*/
private void runApplicationEntryPoint(
final CompletableFuture> jobIdsFuture,
final Set tolerateMissingResult,
final DispatcherGateway dispatcherGateway,
final ScheduledExecutor scheduledExecutor,
final boolean enforceSingleJobExecution,
final boolean submitFailedJobOnApplicationError) {
if (submitFailedJobOnApplicationError && !enforceSingleJobExecution) {
jobIdsFuture.completeExceptionally(
new ApplicationExecutionException(
String.format(
"Submission of failed job in case of an application error ('%s') is not supported in non-HA setups.",
DeploymentOptions.SUBMIT_FAILED_JOB_ON_APPLICATION_ERROR
.key())));
return;
}
final List applicationJobIds = new ArrayList<>(recoveredJobIds);
try {
final PipelineExecutorServiceLoader executorServiceLoader =
new EmbeddedExecutorServiceLoader(
applicationJobIds, dispatcherGateway, scheduledExecutor);
ClientUtils.executeProgram(
executorServiceLoader,
configuration,
application,
enforceSingleJobExecution,
true /* suppress sysout */);
if (applicationJobIds.isEmpty()) {
jobIdsFuture.completeExceptionally(
new ApplicationExecutionException(
"The application contains no execute() calls."));
} else {
jobIdsFuture.complete(applicationJobIds);
}
} catch (Throwable t) {
// If we're running in a single job execution mode, it's safe to consider re-submission
// of an already finished a success.
final Optional maybeDuplicate =
ExceptionUtils.findThrowable(t, DuplicateJobSubmissionException.class);
if (enforceSingleJobExecution
&& maybeDuplicate.isPresent()
&& maybeDuplicate.get().isGloballyTerminated()) {
final JobID jobId = maybeDuplicate.get().getJobID();
tolerateMissingResult.add(jobId);
jobIdsFuture.complete(Collections.singletonList(jobId));
} else if (submitFailedJobOnApplicationError && applicationJobIds.isEmpty()) {
final JobID failedJobId =
JobID.fromHexString(
configuration.get(PipelineOptionsInternal.PIPELINE_FIXED_JOB_ID));
dispatcherGateway
.submitFailedJob(failedJobId, FAILED_JOB_NAME, t)
.thenAccept(
ignored ->
jobIdsFuture.complete(
Collections.singletonList(failedJobId)));
} else {
jobIdsFuture.completeExceptionally(
new ApplicationExecutionException("Could not execute application.", t));
}
}
}
private CompletableFuture getApplicationResult(
final DispatcherGateway dispatcherGateway,
final Collection applicationJobIds,
final Set tolerateMissingResult,
final ScheduledExecutor executor) {
final List> jobResultFutures =
applicationJobIds.stream()
.map(
jobId ->
unwrapJobResultException(
getJobResult(
dispatcherGateway,
jobId,
executor,
tolerateMissingResult.contains(jobId))))
.collect(Collectors.toList());
return FutureUtils.waitForAll(jobResultFutures);
}
private CompletableFuture getJobResult(
final DispatcherGateway dispatcherGateway,
final JobID jobId,
final ScheduledExecutor scheduledExecutor,
final boolean tolerateMissingResult) {
final Duration timeout = configuration.get(ClientOptions.CLIENT_TIMEOUT);
final Duration retryPeriod = configuration.get(ClientOptions.CLIENT_RETRY_PERIOD);
final CompletableFuture jobResultFuture =
JobStatusPollingUtils.getJobResult(
dispatcherGateway, jobId, scheduledExecutor, timeout, retryPeriod);
if (tolerateMissingResult) {
// Return "unknown" job result if dispatcher no longer knows the actual result.
return FutureUtils.handleException(
jobResultFuture,
FlinkJobNotFoundException.class,
exception ->
new JobResult.Builder()
.jobId(jobId)
.applicationStatus(ApplicationStatus.UNKNOWN)
.netRuntime(Long.MAX_VALUE)
.build());
}
return jobResultFuture;
}
/**
* If the given {@link JobResult} indicates success, this passes through the {@link JobResult}.
* Otherwise, this returns a future that is finished exceptionally (potentially with an
* exception from the {@link JobResult}).
*/
private CompletableFuture unwrapJobResultException(
final CompletableFuture jobResult) {
return jobResult.thenApply(
result -> {
if (result.isSuccess()) {
return result;
}
throw new CompletionException(
UnsuccessfulExecutionException.fromJobResult(
result, application.getUserCodeClassLoader()));
});
}
}