All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.flink.client.deployment.application.ApplicationDispatcherBootstrap Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.client.deployment.application;

import org.apache.flink.annotation.Internal;
import org.apache.flink.annotation.VisibleForTesting;
import org.apache.flink.api.common.JobID;
import org.apache.flink.client.ClientUtils;
import org.apache.flink.client.cli.ClientOptions;
import org.apache.flink.client.deployment.application.executors.EmbeddedExecutor;
import org.apache.flink.client.deployment.application.executors.EmbeddedExecutorServiceLoader;
import org.apache.flink.client.program.PackagedProgram;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.configuration.DeploymentOptions;
import org.apache.flink.configuration.HighAvailabilityOptions;
import org.apache.flink.configuration.PipelineOptionsInternal;
import org.apache.flink.core.execution.PipelineExecutorServiceLoader;
import org.apache.flink.runtime.client.DuplicateJobSubmissionException;
import org.apache.flink.runtime.clusterframework.ApplicationStatus;
import org.apache.flink.runtime.dispatcher.DispatcherBootstrap;
import org.apache.flink.runtime.dispatcher.DispatcherGateway;
import org.apache.flink.runtime.jobgraph.JobGraph;
import org.apache.flink.runtime.jobmanager.HighAvailabilityMode;
import org.apache.flink.runtime.jobmaster.JobResult;
import org.apache.flink.runtime.messages.Acknowledge;
import org.apache.flink.runtime.messages.FlinkJobNotFoundException;
import org.apache.flink.runtime.rpc.FatalErrorHandler;
import org.apache.flink.util.ExceptionUtils;
import org.apache.flink.util.Preconditions;
import org.apache.flink.util.concurrent.FutureUtils;
import org.apache.flink.util.concurrent.ScheduledExecutor;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.time.Duration;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.CancellationException;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.CompletionException;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.TimeUnit;
import java.util.function.Function;
import java.util.stream.Collectors;

import static org.apache.flink.util.Preconditions.checkNotNull;

/**
 * A {@link DispatcherBootstrap} used for running the user's {@code main()} in "Application Mode"
 * (see FLIP-85).
 *
 * 

This dispatcher bootstrap submits the recovered {@link JobGraph job graphs} for re-execution * (in case of recovery from a failure), and then submits the remaining jobs of the application for * execution. * *

To achieve this, it works in conjunction with the {@link EmbeddedExecutor EmbeddedExecutor} * which decides if it should submit a job for execution (in case of a new job) or the job was * already recovered and is running. */ @Internal public class ApplicationDispatcherBootstrap implements DispatcherBootstrap { @VisibleForTesting static final String FAILED_JOB_NAME = "(application driver)"; private static final Logger LOG = LoggerFactory.getLogger(ApplicationDispatcherBootstrap.class); private static boolean isCanceledOrFailed(ApplicationStatus applicationStatus) { return applicationStatus == ApplicationStatus.CANCELED || applicationStatus == ApplicationStatus.FAILED; } private final PackagedProgram application; private final Collection recoveredJobIds; private final Configuration configuration; private final FatalErrorHandler errorHandler; private final CompletableFuture applicationCompletionFuture; private final CompletableFuture bootstrapCompletionFuture; private ScheduledFuture applicationExecutionTask; public ApplicationDispatcherBootstrap( final PackagedProgram application, final Collection recoveredJobIds, final Configuration configuration, final DispatcherGateway dispatcherGateway, final ScheduledExecutor scheduledExecutor, final FatalErrorHandler errorHandler) { this.configuration = checkNotNull(configuration); this.recoveredJobIds = checkNotNull(recoveredJobIds); this.application = checkNotNull(application); this.errorHandler = checkNotNull(errorHandler); this.applicationCompletionFuture = fixJobIdAndRunApplicationAsync(dispatcherGateway, scheduledExecutor); this.bootstrapCompletionFuture = finishBootstrapTasks(dispatcherGateway); } @Override public void stop() { if (applicationExecutionTask != null) { applicationExecutionTask.cancel(true); } if (applicationCompletionFuture != null) { applicationCompletionFuture.cancel(true); } } @VisibleForTesting ScheduledFuture getApplicationExecutionFuture() { return applicationExecutionTask; } @VisibleForTesting CompletableFuture getApplicationCompletionFuture() { return applicationCompletionFuture; } @VisibleForTesting CompletableFuture getBootstrapCompletionFuture() { return bootstrapCompletionFuture; } /** * Logs final application status and invokes error handler in case of unexpected failures. * Optionally shuts down the given dispatcherGateway when the application completes (either * successfully or in case of failure), depending on the corresponding config option. */ private CompletableFuture finishBootstrapTasks( final DispatcherGateway dispatcherGateway) { final CompletableFuture shutdownFuture = applicationCompletionFuture .handle( (ignored, t) -> { if (t == null) { LOG.info("Application completed SUCCESSFULLY"); return finish( dispatcherGateway, ApplicationStatus.SUCCEEDED); } final Optional maybeApplicationStatus = extractApplicationStatus(t); if (maybeApplicationStatus.isPresent() && isCanceledOrFailed(maybeApplicationStatus.get())) { final ApplicationStatus applicationStatus = maybeApplicationStatus.get(); LOG.info("Application {}: ", applicationStatus, t); return finish(dispatcherGateway, applicationStatus); } if (t instanceof CancellationException) { LOG.warn( "Application has been cancelled because the {} is being stopped.", ApplicationDispatcherBootstrap.class .getSimpleName()); return CompletableFuture.completedFuture(Acknowledge.get()); } LOG.warn("Application failed unexpectedly: ", t); return FutureUtils.completedExceptionally(t); }) .thenCompose(Function.identity()); FutureUtils.handleUncaughtException(shutdownFuture, (t, e) -> errorHandler.onFatalError(e)); return shutdownFuture; } private CompletableFuture finish( DispatcherGateway dispatcherGateway, ApplicationStatus applicationStatus) { boolean shouldShutDownOnFinish = configuration.get(DeploymentOptions.SHUTDOWN_ON_APPLICATION_FINISH); return shouldShutDownOnFinish ? dispatcherGateway.shutDownCluster(applicationStatus) : CompletableFuture.completedFuture(Acknowledge.get()); } private Optional extractApplicationStatus(Throwable t) { final Optional maybeException = ExceptionUtils.findThrowable(t, UnsuccessfulExecutionException.class); return maybeException.map(UnsuccessfulExecutionException::getStatus); } private CompletableFuture fixJobIdAndRunApplicationAsync( final DispatcherGateway dispatcherGateway, final ScheduledExecutor scheduledExecutor) { final Optional configuredJobId = configuration.getOptional(PipelineOptionsInternal.PIPELINE_FIXED_JOB_ID); final boolean submitFailedJobOnApplicationError = configuration.get(DeploymentOptions.SUBMIT_FAILED_JOB_ON_APPLICATION_ERROR); if (!HighAvailabilityMode.isHighAvailabilityModeActivated(configuration) && !configuredJobId.isPresent()) { return runApplicationAsync( dispatcherGateway, scheduledExecutor, false, submitFailedJobOnApplicationError); } if (!configuredJobId.isPresent()) { // In HA mode, we only support single-execute jobs at the moment. Here, we manually // generate the job id, if not configured, from the cluster id to keep it consistent // across failover. configuration.set( PipelineOptionsInternal.PIPELINE_FIXED_JOB_ID, new JobID( Preconditions.checkNotNull( configuration.get( HighAvailabilityOptions.HA_CLUSTER_ID)) .hashCode(), 0) .toHexString()); } return runApplicationAsync( dispatcherGateway, scheduledExecutor, true, submitFailedJobOnApplicationError); } /** * Runs the user program entrypoint by scheduling a task on the given {@code scheduledExecutor}. * The returned {@link CompletableFuture} completes when all jobs of the user application * succeeded. if any of them fails, or if job submission fails. */ private CompletableFuture runApplicationAsync( final DispatcherGateway dispatcherGateway, final ScheduledExecutor scheduledExecutor, final boolean enforceSingleJobExecution, final boolean submitFailedJobOnApplicationError) { final CompletableFuture> applicationExecutionFuture = new CompletableFuture<>(); final Set tolerateMissingResult = Collections.synchronizedSet(new HashSet<>()); // we need to hand in a future as return value because we need to get those JobIs out // from the scheduled task that executes the user program applicationExecutionTask = scheduledExecutor.schedule( () -> runApplicationEntryPoint( applicationExecutionFuture, tolerateMissingResult, dispatcherGateway, scheduledExecutor, enforceSingleJobExecution, submitFailedJobOnApplicationError), 0L, TimeUnit.MILLISECONDS); return applicationExecutionFuture.thenCompose( jobIds -> getApplicationResult( dispatcherGateway, jobIds, tolerateMissingResult, scheduledExecutor)); } /** * Runs the user program entrypoint and completes the given {@code jobIdsFuture} with the {@link * JobID JobIDs} of the submitted jobs. * *

This should be executed in a separate thread (or task). */ private void runApplicationEntryPoint( final CompletableFuture> jobIdsFuture, final Set tolerateMissingResult, final DispatcherGateway dispatcherGateway, final ScheduledExecutor scheduledExecutor, final boolean enforceSingleJobExecution, final boolean submitFailedJobOnApplicationError) { if (submitFailedJobOnApplicationError && !enforceSingleJobExecution) { jobIdsFuture.completeExceptionally( new ApplicationExecutionException( String.format( "Submission of failed job in case of an application error ('%s') is not supported in non-HA setups.", DeploymentOptions.SUBMIT_FAILED_JOB_ON_APPLICATION_ERROR .key()))); return; } final List applicationJobIds = new ArrayList<>(recoveredJobIds); try { final PipelineExecutorServiceLoader executorServiceLoader = new EmbeddedExecutorServiceLoader( applicationJobIds, dispatcherGateway, scheduledExecutor); ClientUtils.executeProgram( executorServiceLoader, configuration, application, enforceSingleJobExecution, true /* suppress sysout */); if (applicationJobIds.isEmpty()) { jobIdsFuture.completeExceptionally( new ApplicationExecutionException( "The application contains no execute() calls.")); } else { jobIdsFuture.complete(applicationJobIds); } } catch (Throwable t) { // If we're running in a single job execution mode, it's safe to consider re-submission // of an already finished a success. final Optional maybeDuplicate = ExceptionUtils.findThrowable(t, DuplicateJobSubmissionException.class); if (enforceSingleJobExecution && maybeDuplicate.isPresent() && maybeDuplicate.get().isGloballyTerminated()) { final JobID jobId = maybeDuplicate.get().getJobID(); tolerateMissingResult.add(jobId); jobIdsFuture.complete(Collections.singletonList(jobId)); } else if (submitFailedJobOnApplicationError && applicationJobIds.isEmpty()) { final JobID failedJobId = JobID.fromHexString( configuration.get(PipelineOptionsInternal.PIPELINE_FIXED_JOB_ID)); dispatcherGateway .submitFailedJob(failedJobId, FAILED_JOB_NAME, t) .thenAccept( ignored -> jobIdsFuture.complete( Collections.singletonList(failedJobId))); } else { jobIdsFuture.completeExceptionally( new ApplicationExecutionException("Could not execute application.", t)); } } } private CompletableFuture getApplicationResult( final DispatcherGateway dispatcherGateway, final Collection applicationJobIds, final Set tolerateMissingResult, final ScheduledExecutor executor) { final List> jobResultFutures = applicationJobIds.stream() .map( jobId -> unwrapJobResultException( getJobResult( dispatcherGateway, jobId, executor, tolerateMissingResult.contains(jobId)))) .collect(Collectors.toList()); return FutureUtils.waitForAll(jobResultFutures); } private CompletableFuture getJobResult( final DispatcherGateway dispatcherGateway, final JobID jobId, final ScheduledExecutor scheduledExecutor, final boolean tolerateMissingResult) { final Duration timeout = configuration.get(ClientOptions.CLIENT_TIMEOUT); final Duration retryPeriod = configuration.get(ClientOptions.CLIENT_RETRY_PERIOD); final CompletableFuture jobResultFuture = JobStatusPollingUtils.getJobResult( dispatcherGateway, jobId, scheduledExecutor, timeout, retryPeriod); if (tolerateMissingResult) { // Return "unknown" job result if dispatcher no longer knows the actual result. return FutureUtils.handleException( jobResultFuture, FlinkJobNotFoundException.class, exception -> new JobResult.Builder() .jobId(jobId) .applicationStatus(ApplicationStatus.UNKNOWN) .netRuntime(Long.MAX_VALUE) .build()); } return jobResultFuture; } /** * If the given {@link JobResult} indicates success, this passes through the {@link JobResult}. * Otherwise, this returns a future that is finished exceptionally (potentially with an * exception from the {@link JobResult}). */ private CompletableFuture unwrapJobResultException( final CompletableFuture jobResult) { return jobResult.thenApply( result -> { if (result.isSuccess()) { return result; } throw new CompletionException( UnsuccessfulExecutionException.fromJobResult( result, application.getUserCodeClassLoader())); }); } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy