org.apache.flink.runtime.resourcemanager.DefaultJobLeaderIdService Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.runtime.resourcemanager;
import org.apache.flink.api.common.JobID;
import org.apache.flink.runtime.highavailability.HighAvailabilityServices;
import org.apache.flink.runtime.jobmaster.JobMasterId;
import org.apache.flink.runtime.leaderretrieval.LeaderRetrievalListener;
import org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService;
import org.apache.flink.util.CollectionUtil;
import org.apache.flink.util.ExceptionUtils;
import org.apache.flink.util.Preconditions;
import org.apache.flink.util.concurrent.ScheduledExecutor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.annotation.Nullable;
import java.time.Duration;
import java.util.Map;
import java.util.Objects;
import java.util.UUID;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.CompletionException;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.TimeUnit;
/**
* Service which retrieves for a registered job the current job leader id (the leader id of the job
* manager responsible for the job). The leader id will be exposed as a future via the {@link
* #getLeaderId(JobID)}. The future will only be completed with an exception in case the service
* will be stopped.
*/
public class DefaultJobLeaderIdService implements JobLeaderIdService {
private static final Logger LOG = LoggerFactory.getLogger(DefaultJobLeaderIdService.class);
/** High availability services to use by this service. */
private final HighAvailabilityServices highAvailabilityServices;
private final ScheduledExecutor scheduledExecutor;
private final Duration jobTimeout;
/** Map of currently monitored jobs. */
private final Map jobLeaderIdListeners;
/** Actions to call when the job leader changes. */
private JobLeaderIdActions jobLeaderIdActions;
public DefaultJobLeaderIdService(
HighAvailabilityServices highAvailabilityServices,
ScheduledExecutor scheduledExecutor,
Duration jobTimeout) {
this.highAvailabilityServices =
Preconditions.checkNotNull(highAvailabilityServices, "highAvailabilityServices");
this.scheduledExecutor = Preconditions.checkNotNull(scheduledExecutor, "scheduledExecutor");
this.jobTimeout = Preconditions.checkNotNull(jobTimeout, "jobTimeout");
jobLeaderIdListeners = CollectionUtil.newHashMapWithExpectedSize(4);
jobLeaderIdActions = null;
}
@Override
public void start(JobLeaderIdActions initialJobLeaderIdActions) throws Exception {
if (isStarted()) {
clear();
}
this.jobLeaderIdActions = Preconditions.checkNotNull(initialJobLeaderIdActions);
}
@Override
public void stop() throws Exception {
clear();
this.jobLeaderIdActions = null;
}
/**
* Checks whether the service has been started.
*
* @return True if the service has been started; otherwise false
*/
public boolean isStarted() {
return jobLeaderIdActions != null;
}
@Override
public void clear() throws Exception {
Exception exception = null;
for (JobLeaderIdListener listener : jobLeaderIdListeners.values()) {
try {
listener.stop();
} catch (Exception e) {
exception = ExceptionUtils.firstOrSuppressed(e, exception);
}
}
if (exception != null) {
ExceptionUtils.rethrowException(
exception,
"Could not properly stop the "
+ DefaultJobLeaderIdService.class.getSimpleName()
+ '.');
}
jobLeaderIdListeners.clear();
}
@Override
public void addJob(JobID jobId) throws Exception {
Preconditions.checkNotNull(jobLeaderIdActions);
LOG.debug("Add job {} to job leader id monitoring.", jobId);
if (!jobLeaderIdListeners.containsKey(jobId)) {
LeaderRetrievalService leaderRetrievalService =
highAvailabilityServices.getJobManagerLeaderRetriever(jobId);
JobLeaderIdListener jobIdListener =
new JobLeaderIdListener(jobId, jobLeaderIdActions, leaderRetrievalService);
jobLeaderIdListeners.put(jobId, jobIdListener);
}
}
@Override
public void removeJob(JobID jobId) throws Exception {
LOG.debug("Remove job {} from job leader id monitoring.", jobId);
JobLeaderIdListener listener = jobLeaderIdListeners.remove(jobId);
if (listener != null) {
listener.stop();
}
}
@Override
public boolean containsJob(JobID jobId) {
return jobLeaderIdListeners.containsKey(jobId);
}
@Override
public CompletableFuture getLeaderId(JobID jobId) throws Exception {
if (!jobLeaderIdListeners.containsKey(jobId)) {
addJob(jobId);
}
JobLeaderIdListener listener = jobLeaderIdListeners.get(jobId);
return listener.getLeaderIdFuture().thenApply(JobMasterId::fromUuidOrNull);
}
@Override
public boolean isValidTimeout(JobID jobId, UUID timeoutId) {
JobLeaderIdListener jobLeaderIdListener = jobLeaderIdListeners.get(jobId);
if (null != jobLeaderIdListener) {
return Objects.equals(timeoutId, jobLeaderIdListener.getTimeoutId());
} else {
return false;
}
}
// --------------------------------------------------------------------------------
// Static utility classes
// --------------------------------------------------------------------------------
/**
* Listener which stores the current leader id and exposes them as a future value when
* requested. The returned future will always be completed properly except when stopping the
* listener.
*/
private final class JobLeaderIdListener implements LeaderRetrievalListener {
private final Object timeoutLock = new Object();
private final JobID jobId;
private final JobLeaderIdActions listenerJobLeaderIdActions;
private final LeaderRetrievalService leaderRetrievalService;
private volatile CompletableFuture leaderIdFuture;
private volatile boolean running = true;
/** Null if no timeout has been scheduled; otherwise non null. */
@Nullable private volatile ScheduledFuture> timeoutFuture;
/** Null if no timeout has been scheduled; otherwise non null. */
@Nullable private volatile UUID timeoutId;
private JobLeaderIdListener(
JobID jobId,
JobLeaderIdActions listenerJobLeaderIdActions,
LeaderRetrievalService leaderRetrievalService)
throws Exception {
this.jobId = Preconditions.checkNotNull(jobId);
this.listenerJobLeaderIdActions =
Preconditions.checkNotNull(listenerJobLeaderIdActions);
this.leaderRetrievalService = Preconditions.checkNotNull(leaderRetrievalService);
leaderIdFuture = new CompletableFuture<>();
activateTimeout();
// start the leader service we're listening to
leaderRetrievalService.start(this);
}
public CompletableFuture getLeaderIdFuture() {
return leaderIdFuture;
}
@Nullable
public UUID getTimeoutId() {
return timeoutId;
}
public void stop() throws Exception {
running = false;
leaderRetrievalService.stop();
cancelTimeout();
leaderIdFuture.completeExceptionally(
new Exception("Job leader id service has been stopped."));
}
@Override
public void notifyLeaderAddress(
@Nullable String leaderAddress, @Nullable UUID leaderSessionId) {
if (running) {
UUID previousJobLeaderId = null;
if (leaderIdFuture.isDone()) {
try {
previousJobLeaderId = leaderIdFuture.getNow(null);
} catch (CompletionException e) {
// this should never happen since we complete this future always properly
handleError(e);
}
if (leaderSessionId == null) {
// there was a leader, but we no longer have one
LOG.debug("Job {} no longer has a job leader.", jobId);
leaderIdFuture = new CompletableFuture<>();
} else {
// there was an active leader, but we now have a new leader
LOG.debug(
"Job {} has a new job leader {}@{}.",
jobId,
leaderSessionId,
leaderAddress);
leaderIdFuture = CompletableFuture.completedFuture(leaderSessionId);
}
} else {
if (leaderSessionId != null) {
// there was no active leader, but we now have a new leader
LOG.debug(
"Job {} has a new job leader {}@{}.",
jobId,
leaderSessionId,
leaderAddress);
leaderIdFuture.complete(leaderSessionId);
}
}
if (previousJobLeaderId != null && !previousJobLeaderId.equals(leaderSessionId)) {
// we had a previous job leader, so notify about his lost leadership
listenerJobLeaderIdActions.jobLeaderLostLeadership(
jobId, new JobMasterId(previousJobLeaderId));
if (null == leaderSessionId) {
// No current leader active ==> Set a timeout for the job
activateTimeout();
// check if we got stopped asynchronously
if (!running) {
cancelTimeout();
}
}
} else if (null != leaderSessionId) {
// Cancel timeout because we've found an active leader for it
cancelTimeout();
}
} else {
LOG.debug(
"A leader id change {}@{} has been detected after the listener has been stopped.",
leaderSessionId,
leaderAddress);
}
}
@Override
public void handleError(Exception exception) {
if (running) {
listenerJobLeaderIdActions.handleError(exception);
} else {
LOG.debug(
"An error occurred in the {} after the listener has been stopped.",
JobLeaderIdListener.class.getSimpleName(),
exception);
}
}
private void activateTimeout() {
synchronized (timeoutLock) {
cancelTimeout();
final UUID newTimeoutId = UUID.randomUUID();
timeoutId = newTimeoutId;
timeoutFuture =
scheduledExecutor.schedule(
new Runnable() {
@Override
public void run() {
listenerJobLeaderIdActions.notifyJobTimeout(
jobId, newTimeoutId);
}
},
jobTimeout.toMillis(),
TimeUnit.MILLISECONDS);
}
}
private void cancelTimeout() {
synchronized (timeoutLock) {
if (timeoutFuture != null) {
timeoutFuture.cancel(true);
}
timeoutFuture = null;
timeoutId = null;
}
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy