All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.flink.runtime.jobmaster.JobMasterServiceLeadershipRunner Maven / Gradle / Ivy

There is a newer version: 1.13.6
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.runtime.jobmaster;

import org.apache.flink.api.common.JobID;
import org.apache.flink.api.common.JobStatus;
import org.apache.flink.api.common.time.Time;
import org.apache.flink.runtime.concurrent.FutureUtils;
import org.apache.flink.runtime.execution.librarycache.LibraryCacheManager;
import org.apache.flink.runtime.highavailability.RunningJobsRegistry;
import org.apache.flink.runtime.jobmaster.factories.JobMasterServiceProcessFactory;
import org.apache.flink.runtime.leaderelection.LeaderContender;
import org.apache.flink.runtime.leaderelection.LeaderElectionService;
import org.apache.flink.runtime.messages.Acknowledge;
import org.apache.flink.runtime.messages.webmonitor.JobDetails;
import org.apache.flink.runtime.rpc.FatalErrorHandler;
import org.apache.flink.runtime.scheduler.ExecutionGraphInfo;
import org.apache.flink.util.ExceptionUtils;
import org.apache.flink.util.FlinkException;
import org.apache.flink.util.Preconditions;
import org.apache.flink.util.function.ThrowingRunnable;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.concurrent.GuardedBy;

import java.io.IOException;
import java.util.Optional;
import java.util.UUID;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.CompletionException;
import java.util.function.Supplier;

/**
 * Leadership runner for the {@link JobMasterServiceProcess}.
 *
 * 

The responsibility of this component is to manage the leadership of the {@link * JobMasterServiceProcess}. This means that the runner will create an instance of the process when * it obtains the leadership. The process is stopped once the leadership is revoked. * *

This component only accepts signals (job result completion, initialization failure) as long as * it is running and as long as the signals are coming from the current leader process. This ensures * that only the current leader can affect this component. * *

All leadership operations are serialized. This means that granting the leadership has to * complete before the leadership can be revoked and vice versa. * *

The {@link #resultFuture} can be completed with the following values: * * * *

    *
  • {@link JobManagerRunnerResult} to signal an initialization failure of the {@link * JobMasterService} or the completion of a job *
  • {@link Exception} to signal an unexpected failure *
*/ public class JobMasterServiceLeadershipRunner implements JobManagerRunner, LeaderContender { private static final Logger LOG = LoggerFactory.getLogger(JobMasterServiceLeadershipRunner.class); private final Object lock = new Object(); private final JobMasterServiceProcessFactory jobMasterServiceProcessFactory; private final LeaderElectionService leaderElectionService; private final RunningJobsRegistry runningJobsRegistry; private final LibraryCacheManager.ClassLoaderLease classLoaderLease; private final FatalErrorHandler fatalErrorHandler; private final CompletableFuture terminationFuture = new CompletableFuture<>(); private final CompletableFuture resultFuture = new CompletableFuture<>(); @GuardedBy("lock") private State state = State.RUNNING; @GuardedBy("lock") private CompletableFuture sequentialOperation = FutureUtils.completedVoidFuture(); @GuardedBy("lock") private JobMasterServiceProcess jobMasterServiceProcess = JobMasterServiceProcess.waitingForLeadership(); @GuardedBy("lock") private CompletableFuture jobMasterGatewayFuture = new CompletableFuture<>(); @GuardedBy("lock") private boolean hasCurrentLeaderBeenCancelled = false; public JobMasterServiceLeadershipRunner( JobMasterServiceProcessFactory jobMasterServiceProcessFactory, LeaderElectionService leaderElectionService, RunningJobsRegistry runningJobsRegistry, LibraryCacheManager.ClassLoaderLease classLoaderLease, FatalErrorHandler fatalErrorHandler) { this.jobMasterServiceProcessFactory = jobMasterServiceProcessFactory; this.leaderElectionService = leaderElectionService; this.runningJobsRegistry = runningJobsRegistry; this.classLoaderLease = classLoaderLease; this.fatalErrorHandler = fatalErrorHandler; } @Override public CompletableFuture closeAsync() { synchronized (lock) { if (state != State.STOPPED) { state = State.STOPPED; LOG.debug("Terminating the leadership runner for job {}.", getJobID()); jobMasterGatewayFuture.completeExceptionally( new FlinkException( "JobMasterServiceLeadershipRunner is 
closed. Therefore, the corresponding JobMaster will never acquire the leadership.")); resultFuture.complete( JobManagerRunnerResult.forSuccess( createExecutionGraphInfoWithJobStatus(JobStatus.SUSPENDED))); final CompletableFuture processTerminationFuture = jobMasterServiceProcess.closeAsync(); final CompletableFuture serviceTerminationFuture = FutureUtils.runAfterwards( processTerminationFuture, () -> { classLoaderLease.release(); leaderElectionService.stop(); }); FutureUtils.forward(serviceTerminationFuture, terminationFuture); terminationFuture.whenComplete( (unused, throwable) -> LOG.debug( "Leadership runner for job {} has been terminated.", getJobID())); } } return terminationFuture; } @Override public void start() throws Exception { LOG.debug("Start leadership runner for job {}.", getJobID()); leaderElectionService.start(this); } @Override public CompletableFuture getJobMasterGateway() { synchronized (lock) { return jobMasterGatewayFuture; } } @Override public CompletableFuture getResultFuture() { return resultFuture; } @Override public JobID getJobID() { return jobMasterServiceProcessFactory.getJobId(); } @Override public CompletableFuture cancel(Time timeout) { synchronized (lock) { hasCurrentLeaderBeenCancelled = true; return getJobMasterGateway() .thenCompose(jobMasterGateway -> jobMasterGateway.cancel(timeout)) .exceptionally( e -> { throw new CompletionException( new FlinkException( "Cancellation failed.", ExceptionUtils.stripCompletionException(e))); }); } } @Override public CompletableFuture requestJobStatus(Time timeout) { return requestJob(timeout) .thenApply( executionGraphInfo -> executionGraphInfo.getArchivedExecutionGraph().getState()); } @Override public CompletableFuture requestJobDetails(Time timeout) { return requestJob(timeout) .thenApply( executionGraphInfo -> JobDetails.createDetailsForJob( executionGraphInfo.getArchivedExecutionGraph())); } @Override public CompletableFuture requestJob(Time timeout) { synchronized (lock) { if (state == 
State.RUNNING) { if (jobMasterServiceProcess.isInitializedAndRunning()) { return getJobMasterGateway() .thenCompose(jobMasterGateway -> jobMasterGateway.requestJob(timeout)); } else { return CompletableFuture.completedFuture( createExecutionGraphInfoWithJobStatus( hasCurrentLeaderBeenCancelled ? JobStatus.CANCELLING : JobStatus.INITIALIZING)); } } else { return resultFuture.thenApply(JobManagerRunnerResult::getExecutionGraphInfo); } } } @Override public boolean isInitialized() { synchronized (lock) { return jobMasterServiceProcess.isInitializedAndRunning(); } } @Override public void grantLeadership(UUID leaderSessionID) { runIfStateRunning( () -> startJobMasterServiceProcessAsync(leaderSessionID), "starting a new JobMasterServiceProcess"); } @GuardedBy("lock") private void startJobMasterServiceProcessAsync(UUID leaderSessionId) { sequentialOperation = sequentialOperation.thenRun( () -> runIfValidLeader( leaderSessionId, ThrowingRunnable.unchecked( () -> verifyJobSchedulingStatusAndCreateJobMasterServiceProcess( leaderSessionId)), "verify job scheduling status and create JobMasterServiceProcess")); handleAsyncOperationError(sequentialOperation, "Could not start the job manager."); } @GuardedBy("lock") private void verifyJobSchedulingStatusAndCreateJobMasterServiceProcess(UUID leaderSessionId) throws FlinkException { final RunningJobsRegistry.JobSchedulingStatus jobSchedulingStatus = getJobSchedulingStatus(); if (jobSchedulingStatus == RunningJobsRegistry.JobSchedulingStatus.DONE) { jobAlreadyDone(); } else { createNewJobMasterServiceProcess(leaderSessionId); } } private ExecutionGraphInfo createExecutionGraphInfoWithJobStatus(JobStatus jobStatus) { return new ExecutionGraphInfo( jobMasterServiceProcessFactory.createArchivedExecutionGraph(jobStatus, null)); } private void jobAlreadyDone() { resultFuture.complete( JobManagerRunnerResult.forSuccess( new ExecutionGraphInfo( jobMasterServiceProcessFactory.createArchivedExecutionGraph( JobStatus.FAILED, new 
JobAlreadyDoneException(getJobID()))))); } private RunningJobsRegistry.JobSchedulingStatus getJobSchedulingStatus() throws FlinkException { try { return runningJobsRegistry.getJobSchedulingStatus(getJobID()); } catch (IOException e) { throw new FlinkException( String.format( "Could not retrieve the job scheduling status for job %s.", getJobID()), e); } } @GuardedBy("lock") private void createNewJobMasterServiceProcess(UUID leaderSessionId) throws FlinkException { Preconditions.checkState(jobMasterServiceProcess.closeAsync().isDone()); LOG.debug( "Create new JobMasterServiceProcess because we were granted leadership under {}.", leaderSessionId); try { runningJobsRegistry.setJobRunning(getJobID()); } catch (IOException e) { throw new FlinkException( String.format( "Failed to set the job %s to running in the running jobs registry.", getJobID()), e); } jobMasterServiceProcess = jobMasterServiceProcessFactory.create(leaderSessionId); forwardIfValidLeader( leaderSessionId, jobMasterServiceProcess.getJobMasterGatewayFuture(), jobMasterGatewayFuture, "JobMasterGatewayFuture from JobMasterServiceProcess"); forwardResultFuture(leaderSessionId, jobMasterServiceProcess.getResultFuture()); confirmLeadership(leaderSessionId, jobMasterServiceProcess.getLeaderAddressFuture()); } private void confirmLeadership( UUID leaderSessionId, CompletableFuture leaderAddressFuture) { FutureUtils.assertNoException( leaderAddressFuture.thenAccept( address -> { synchronized (lock) { if (isValidLeader(leaderSessionId)) { LOG.debug("Confirm leadership {}.", leaderSessionId); leaderElectionService.confirmLeadership( leaderSessionId, address); } else { LOG.trace( "Ignore confirming leadership because the leader {} is no longer valid.", leaderSessionId); } } })); } private void forwardResultFuture( UUID leaderSessionId, CompletableFuture resultFuture) { resultFuture.whenComplete( (jobManagerRunnerResult, throwable) -> { synchronized (lock) { if (isValidLeader(leaderSessionId)) { 
onJobCompletion(jobManagerRunnerResult, throwable); } else { LOG.trace( "Ignore result future forwarding because the leader {} is no longer valid.", leaderSessionId); } } }); } @GuardedBy("lock") private void onJobCompletion( JobManagerRunnerResult jobManagerRunnerResult, Throwable throwable) { state = State.JOB_COMPLETED; LOG.debug("Completing the result for job {}.", getJobID()); if (throwable != null) { resultFuture.completeExceptionally(throwable); jobMasterGatewayFuture.completeExceptionally( new FlinkException( "Could not retrieve JobMasterGateway because the JobMaster failed.", throwable)); } else { if (jobManagerRunnerResult.isSuccess()) { try { runningJobsRegistry.setJobFinished(getJobID()); } catch (IOException e) { LOG.error( "Could not un-register from high-availability services job {}." + "Other JobManager's may attempt to recover it and re-execute it.", getJobID(), e); } } else { jobMasterGatewayFuture.completeExceptionally( new FlinkException( "Could not retrieve JobMasterGateway because the JobMaster initialization failed.", jobManagerRunnerResult.getInitializationFailure())); } resultFuture.complete(jobManagerRunnerResult); } } @Override public void revokeLeadership() { runIfStateRunning( this::stopJobMasterServiceProcessAsync, "revoke leadership from JobMasterServiceProcess"); } @GuardedBy("lock") private void stopJobMasterServiceProcessAsync() { sequentialOperation = sequentialOperation.thenCompose( ignored -> callIfRunning( this::stopJobMasterServiceProcess, "stop leading JobMasterServiceProcess") .orElse(FutureUtils.completedVoidFuture())); handleAsyncOperationError(sequentialOperation, "Could not suspend the job manager."); } @GuardedBy("lock") private CompletableFuture stopJobMasterServiceProcess() { LOG.debug("Stop current JobMasterServiceProcess because the leadership has been revoked."); jobMasterGatewayFuture.completeExceptionally( new FlinkException( "Cannot obtain JobMasterGateway because the JobMaster lost leadership.")); 
jobMasterGatewayFuture = new CompletableFuture<>(); hasCurrentLeaderBeenCancelled = false; return jobMasterServiceProcess.closeAsync(); } @Override public void handleError(Exception exception) { fatalErrorHandler.onFatalError(exception); } private void handleAsyncOperationError(CompletableFuture operation, String message) { operation.whenComplete( (unused, throwable) -> { if (throwable != null) { runIfStateRunning( () -> handleJobMasterServiceLeadershipRunnerError( new FlinkException(message, throwable)), "handle JobMasterServiceLeadershipRunner error"); } }); } private void handleJobMasterServiceLeadershipRunnerError(Throwable cause) { if (ExceptionUtils.isJvmFatalError(cause)) { fatalErrorHandler.onFatalError(cause); } else { resultFuture.completeExceptionally(cause); } } private void runIfStateRunning(Runnable action, String actionDescription) { synchronized (lock) { if (isRunning()) { action.run(); } else { LOG.trace( "Ignore '{}' because the leadership runner is no longer running.", actionDescription); } } } private Optional callIfRunning( Supplier supplier, String supplierDescription) { synchronized (lock) { if (isRunning()) { return Optional.of(supplier.get()); } else { LOG.trace( "Ignore '{}' because the leadership runner is no longer running.", supplierDescription); return Optional.empty(); } } } @GuardedBy("lock") private boolean isRunning() { return state == State.RUNNING; } private void runIfValidLeader( UUID expectedLeaderId, Runnable action, String actionDescription) { synchronized (lock) { if (isValidLeader(expectedLeaderId)) { action.run(); } else { LOG.trace( "Ignore leader action '{}' because the leadership runner is no longer the valid leader for {}.", actionDescription, expectedLeaderId); } } } @GuardedBy("lock") private boolean isValidLeader(UUID expectedLeaderId) { return isRunning() && leaderElectionService.hasLeadership(expectedLeaderId); } private void forwardIfValidLeader( UUID expectedLeaderId, CompletableFuture source, CompletableFuture 
target, String forwardDescription) { source.whenComplete( (t, throwable) -> { synchronized (lock) { if (isValidLeader(expectedLeaderId)) { if (throwable != null) { target.completeExceptionally(throwable); } else { target.complete(t); } } else { LOG.trace( "Ignore forwarding '{}' because the leadership runner is no longer the valid leader for {}.", forwardDescription, expectedLeaderId); } } }); } enum State { RUNNING, STOPPED, JOB_COMPLETED, } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy