All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.flink.runtime.resourcemanager.JobLeaderIdService Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.runtime.resourcemanager;

import org.apache.flink.api.common.JobID;
import org.apache.flink.api.common.time.Time;
import org.apache.flink.runtime.concurrent.CompletableFuture;
import org.apache.flink.runtime.concurrent.Future;
import org.apache.flink.runtime.concurrent.ScheduledExecutor;
import org.apache.flink.runtime.concurrent.impl.FlinkCompletableFuture;
import org.apache.flink.runtime.highavailability.HighAvailabilityServices;
import org.apache.flink.runtime.leaderretrieval.LeaderRetrievalListener;
import org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService;
import org.apache.flink.util.ExceptionUtils;
import org.apache.flink.util.Preconditions;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nullable;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;
import java.util.UUID;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.TimeUnit;

/**
 * Service which retrieves for a registered job the current job leader id (the leader id of the
 * job manager responsible for the job). The leader id will be exposed as a future via the
 * {@link #getLeaderId(JobID)}. The future will only be completed with an exception in case
 * the service will be stopped.
 */
public class JobLeaderIdService {

	private static final Logger LOG = LoggerFactory.getLogger(JobLeaderIdService.class);

	/** High availability services to use by this service */
	private final HighAvailabilityServices highAvailabilityServices;

	private final ScheduledExecutor scheduledExecutor;

	private final Time jobTimeout;

	/** Map of currently monitored jobs */
	private final Map jobLeaderIdListeners;

	/** Actions to call when the job leader changes */
	private JobLeaderIdActions jobLeaderIdActions;

	public JobLeaderIdService(
			HighAvailabilityServices highAvailabilityServices,
			ScheduledExecutor scheduledExecutor,
			Time jobTimeout) throws Exception {
		this.highAvailabilityServices = Preconditions.checkNotNull(highAvailabilityServices, "highAvailabilityServices");
		this.scheduledExecutor = Preconditions.checkNotNull(scheduledExecutor, "scheduledExecutor");
		this.jobTimeout = Preconditions.checkNotNull(jobTimeout, "jobTimeout");

		jobLeaderIdListeners = new HashMap<>(4);

		jobLeaderIdActions = null;
	}

	/**
	 * Start the service with the given job leader actions.
	 *
	 * @param initialJobLeaderIdActions to use for job leader id actions
	 * @throws Exception which is thrown when clearing up old state
	 */
	public void start(JobLeaderIdActions initialJobLeaderIdActions) throws Exception {
		if (isStarted()) {
			clear();
		}

		this.jobLeaderIdActions = Preconditions.checkNotNull(initialJobLeaderIdActions);
	}

	/**
	 * Stop the service.
	 *
	 * @throws Exception which is thrown in case a retrieval service cannot be stopped properly
	 */
	public void stop() throws Exception {
		clear();

		this.jobLeaderIdActions = null;
	}

	/**
	 * Checks whether the service has been started.
	 *
	 * @return True if the service has been started; otherwise false
	 */
	public boolean isStarted() {
		return jobLeaderIdActions == null;
	}

	/**
	 * Stop and clear the currently registered job leader id listeners.
	 *
	 * @throws Exception which is thrown in case a retrieval service cannot be stopped properly
	 */
	public void clear() throws Exception {
		Exception exception = null;

		for (JobLeaderIdListener listener: jobLeaderIdListeners.values()) {
			try {
				listener.stop();
			} catch (Exception e) {
				exception = ExceptionUtils.firstOrSuppressed(e, exception);
			}
		}

		if (exception != null) {
			ExceptionUtils.rethrowException(exception, "Could not properly stop the " +
				JobLeaderIdService.class.getSimpleName() + '.');
		}

		jobLeaderIdListeners.clear();
	}

	/**
	 * Add a job to be monitored to retrieve the job leader id.
	 *
	 * @param jobId identifying the job to monitor
	 * @throws Exception if the job could not be added to the service
	 */
	public void addJob(JobID jobId) throws Exception {
		Preconditions.checkNotNull(jobLeaderIdActions);

		LOG.debug("Add job {} to job leader id monitoring.", jobId);

		if (!jobLeaderIdListeners.containsKey(jobId)) {
			LeaderRetrievalService leaderRetrievalService = highAvailabilityServices.getJobManagerLeaderRetriever(jobId);

			JobLeaderIdListener jobIdListener = new JobLeaderIdListener(jobId, jobLeaderIdActions, leaderRetrievalService);
			jobLeaderIdListeners.put(jobId, jobIdListener);
		}
	}

	/**
	 * Remove the given job from being monitored by the service.
	 *
	 * @param jobId identifying the job to remove from monitor
	 * @throws Exception if removing the job fails
	 */
	public void removeJob(JobID jobId) throws Exception {
		LOG.debug("Remove job {} from job leader id monitoring.", jobId);

		JobLeaderIdListener listener = jobLeaderIdListeners.remove(jobId);

		if (listener != null) {
			listener.stop();
		}
	}

	/**
	 * Check whether the given job is being monitored or not.
	 *
	 * @param jobId identifying the job
	 * @return True if the job is being monitored; otherwise false
	 */
	public boolean containsJob(JobID jobId) {
		return jobLeaderIdListeners.containsKey(jobId);
	}

	public Future getLeaderId(JobID jobId) throws Exception {
		if (!jobLeaderIdListeners.containsKey(jobId)) {
			addJob(jobId);
		}

		JobLeaderIdListener listener = jobLeaderIdListeners.get(jobId);

		return listener.getLeaderIdFuture();
	}

	public boolean isValidTimeout(JobID jobId, UUID timeoutId) {
		JobLeaderIdListener jobLeaderIdListener = jobLeaderIdListeners.get(jobId);

		if (null != jobLeaderIdListener) {
			return Objects.equals(timeoutId, jobLeaderIdListener.getTimeoutId());
		} else {
			return false;
		}
	}

	// --------------------------------------------------------------------------------
	// Static utility classes
	// --------------------------------------------------------------------------------

	/**
	 * Listener which stores the current leader id and exposes them as a future value when
	 * requested. The returned future will always be completed properly except when stopping the
	 * listener.
	 */
	private final class JobLeaderIdListener implements LeaderRetrievalListener {
		private final Object timeoutLock = new Object();
		private final JobID jobId;
		private final JobLeaderIdActions listenerJobLeaderIdActions;
		private final LeaderRetrievalService leaderRetrievalService;

		private volatile CompletableFuture leaderIdFuture;
		private volatile boolean running = true;

		/** Null if no timeout has been scheduled; otherwise non null */
		@Nullable
		private  volatile ScheduledFuture timeoutFuture;

		/** Null if no timeout has been scheduled; otherwise non null */
		@Nullable
		private volatile UUID timeoutId;


		private JobLeaderIdListener(
				JobID jobId,
				JobLeaderIdActions listenerJobLeaderIdActions,
				LeaderRetrievalService leaderRetrievalService) throws Exception {
			this.jobId = Preconditions.checkNotNull(jobId);
			this.listenerJobLeaderIdActions = Preconditions.checkNotNull(listenerJobLeaderIdActions);
			this.leaderRetrievalService = Preconditions.checkNotNull(leaderRetrievalService);

			leaderIdFuture = new FlinkCompletableFuture<>();

			activateTimeout();

			// start the leader service we're listening to
			leaderRetrievalService.start(this);
		}

		public Future getLeaderIdFuture() {
			return leaderIdFuture;
		}

		@Nullable
		public UUID getTimeoutId() {
			return timeoutId;
		}

		public void stop() throws Exception {
			running = false;
			leaderRetrievalService.stop();
			cancelTimeout();
			leaderIdFuture.completeExceptionally(new Exception("Job leader id service has been stopped."));
		}

		@Override
		public void notifyLeaderAddress(String leaderAddress, UUID leaderSessionId) {
			if (running) {
				LOG.debug("Found a new job leader {}@{}.", leaderSessionId, leaderAddress);

				UUID previousJobLeaderId = null;

				if (leaderIdFuture.isDone()) {
					try {
						previousJobLeaderId = leaderIdFuture.getNow(null);
					} catch (ExecutionException e) {
						// this should never happen since we complete this future always properly
						handleError(e);
					}

					leaderIdFuture = FlinkCompletableFuture.completed(leaderSessionId);
				} else {
					leaderIdFuture.complete(leaderSessionId);
				}

				if (previousJobLeaderId != null && !previousJobLeaderId.equals(leaderSessionId)) {
					// we had a previous job leader, so notify about his lost leadership
					listenerJobLeaderIdActions.jobLeaderLostLeadership(jobId, previousJobLeaderId);

					if (null == leaderSessionId) {
						// No current leader active ==> Set a timeout for the job
						activateTimeout();

						// check if we got stopped asynchronously
						if (!running) {
							cancelTimeout();
						}
					}
				} else if (null != leaderSessionId) {
					// Cancel timeout because we've found an active leader for it
					cancelTimeout();
				}
			} else {
				LOG.debug("A leader id change {}@{} has been detected after the listener has been stopped.",
					leaderSessionId, leaderAddress);
			}
		}

		@Override
		public void handleError(Exception exception) {
			if (running) {
				listenerJobLeaderIdActions.handleError(exception);
			} else {
				LOG.debug("An error occurred in the {} after the listener has been stopped.",
					JobLeaderIdListener.class.getSimpleName(), exception);
			}
		}

		private void activateTimeout() {
			synchronized (timeoutLock) {
				cancelTimeout();

				final UUID newTimeoutId = UUID.randomUUID();

				timeoutId = newTimeoutId;
				timeoutFuture = scheduledExecutor.schedule(new Runnable() {
					@Override
					public void run() {
						listenerJobLeaderIdActions.notifyJobTimeout(jobId, newTimeoutId);
					}
				}, jobTimeout.toMilliseconds(), TimeUnit.MILLISECONDS);
			}
		}

		private void cancelTimeout() {
			synchronized (timeoutLock) {
				if (timeoutFuture != null) {
					timeoutFuture.cancel(true);
				}

				timeoutFuture = null;
				timeoutId = null;
			}
		}
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy