Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance. Project price only 1 $
You can buy this project and download/modify it how often you want.
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.runtime.resourcemanager;
import org.apache.flink.annotation.VisibleForTesting;
import org.apache.flink.api.common.JobID;
import org.apache.flink.runtime.clusterframework.ApplicationStatus;
import org.apache.flink.runtime.clusterframework.messages.InfoMessage;
import org.apache.flink.runtime.clusterframework.types.AllocationID;
import org.apache.flink.runtime.clusterframework.types.ResourceID;
import org.apache.flink.runtime.clusterframework.types.ResourceProfile;
import org.apache.flink.runtime.clusterframework.types.SlotID;
import org.apache.flink.runtime.concurrent.AcceptFunction;
import org.apache.flink.runtime.concurrent.ApplyFunction;
import org.apache.flink.runtime.concurrent.BiFunction;
import org.apache.flink.runtime.concurrent.Future;
import org.apache.flink.runtime.concurrent.impl.FlinkCompletableFuture;
import org.apache.flink.runtime.heartbeat.HeartbeatListener;
import org.apache.flink.runtime.heartbeat.HeartbeatManager;
import org.apache.flink.runtime.heartbeat.HeartbeatServices;
import org.apache.flink.runtime.heartbeat.HeartbeatTarget;
import org.apache.flink.runtime.highavailability.HighAvailabilityServices;
import org.apache.flink.runtime.highavailability.LeaderIdMismatchException;
import org.apache.flink.runtime.instance.InstanceID;
import org.apache.flink.runtime.jobmaster.JobMasterRegistrationSuccess;
import org.apache.flink.runtime.leaderelection.LeaderContender;
import org.apache.flink.runtime.leaderelection.LeaderElectionService;
import org.apache.flink.runtime.messages.Acknowledge;
import org.apache.flink.runtime.metrics.MetricRegistry;
import org.apache.flink.runtime.resourcemanager.exceptions.ResourceManagerException;
import org.apache.flink.runtime.resourcemanager.registration.JobManagerRegistration;
import org.apache.flink.runtime.resourcemanager.registration.WorkerRegistration;
import org.apache.flink.runtime.resourcemanager.slotmanager.SlotManager;
import org.apache.flink.runtime.resourcemanager.slotmanager.ResourceManagerActions;
import org.apache.flink.runtime.rpc.FatalErrorHandler;
import org.apache.flink.runtime.rpc.RpcEndpoint;
import org.apache.flink.runtime.rpc.RpcMethod;
import org.apache.flink.runtime.rpc.RpcService;
import org.apache.flink.runtime.jobmaster.JobMaster;
import org.apache.flink.runtime.jobmaster.JobMasterGateway;
import org.apache.flink.runtime.registration.RegistrationResponse;
import org.apache.flink.runtime.rpc.exceptions.LeaderSessionIDException;
import org.apache.flink.runtime.taskexecutor.SlotReport;
import org.apache.flink.runtime.taskexecutor.TaskExecutorGateway;
import org.apache.flink.runtime.taskexecutor.TaskExecutorRegistrationSuccess;
import org.apache.flink.util.ExceptionUtils;
import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;
import java.util.UUID;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.TimeoutException;
import static org.apache.flink.util.Preconditions.checkNotNull;
/**
* ResourceManager implementation. The resource manager is responsible for resource de-/allocation
* and bookkeeping.
*
* It offers the following methods as part of its rpc interface to interact with him remotely:
*
*
{@link #registerJobManager(UUID, UUID, String, JobID, ResourceID)} registers a {@link JobMaster} at the resource manager
*
{@link #requestSlot(UUID, UUID, SlotRequest)} requests a slot from the resource manager
*
*/
public abstract class ResourceManager
extends RpcEndpoint
implements LeaderContender {
public static final String RESOURCE_MANAGER_NAME = "resourcemanager";
/** Unique id of the resource manager */
private final ResourceID resourceId;
/** Configuration of the resource manager */
private final ResourceManagerConfiguration resourceManagerConfiguration;
/** All currently registered JobMasterGateways scoped by JobID. */
private final Map jobManagerRegistrations;
/** All currently registered JobMasterGateways scoped by ResourceID. */
private final Map jmResourceIdRegistrations;
/** Service to retrieve the job leader ids */
private final JobLeaderIdService jobLeaderIdService;
/** All currently registered TaskExecutors with there framework specific worker information. */
private final Map> taskExecutors;
/** High availability services for leader retrieval and election. */
private final HighAvailabilityServices highAvailabilityServices;
/** The heartbeat manager with task managers. */
private final HeartbeatManager taskManagerHeartbeatManager;
/** The heartbeat manager with job managers. */
private final HeartbeatManager jobManagerHeartbeatManager;
/** Registry to use for metrics */
private final MetricRegistry metricRegistry;
/** Fatal error handler */
private final FatalErrorHandler fatalErrorHandler;
/** The slot manager maintains the available slots */
private final SlotManager slotManager;
/** The service to elect a ResourceManager leader. */
private LeaderElectionService leaderElectionService;
/** ResourceManager's leader session id which is updated on leader election. */
private volatile UUID leaderSessionId;
/** All registered listeners for status updates of the ResourceManager. */
private ConcurrentMap infoMessageListeners;
public ResourceManager(
RpcService rpcService,
String resourceManagerEndpointId,
ResourceID resourceId,
ResourceManagerConfiguration resourceManagerConfiguration,
HighAvailabilityServices highAvailabilityServices,
HeartbeatServices heartbeatServices,
SlotManager slotManager,
MetricRegistry metricRegistry,
JobLeaderIdService jobLeaderIdService,
FatalErrorHandler fatalErrorHandler) {
super(rpcService, resourceManagerEndpointId);
this.resourceId = checkNotNull(resourceId);
this.resourceManagerConfiguration = checkNotNull(resourceManagerConfiguration);
this.highAvailabilityServices = checkNotNull(highAvailabilityServices);
this.slotManager = checkNotNull(slotManager);
this.metricRegistry = checkNotNull(metricRegistry);
this.jobLeaderIdService = checkNotNull(jobLeaderIdService);
this.fatalErrorHandler = checkNotNull(fatalErrorHandler);
this.taskManagerHeartbeatManager = heartbeatServices.createHeartbeatManagerSender(
resourceId,
new TaskManagerHeartbeatListener(),
rpcService.getScheduledExecutor(),
log);
this.jobManagerHeartbeatManager = heartbeatServices.createHeartbeatManagerSender(
resourceId,
new JobManagerHeartbeatListener(),
rpcService.getScheduledExecutor(),
log);
this.jobManagerRegistrations = new HashMap<>(4);
this.jmResourceIdRegistrations = new HashMap<>(4);
this.taskExecutors = new HashMap<>(8);
this.leaderSessionId = null;
infoMessageListeners = new ConcurrentHashMap<>(8);
}
// ------------------------------------------------------------------------
// RPC lifecycle methods
// ------------------------------------------------------------------------
@Override
public void start() throws Exception {
// start a leader
super.start();
leaderElectionService = highAvailabilityServices.getResourceManagerLeaderElectionService();
try {
leaderElectionService.start(this);
} catch (Exception e) {
throw new ResourceManagerException("Could not start the leader election service.", e);
}
try {
jobLeaderIdService.start(new JobLeaderIdActionsImpl());
} catch (Exception e) {
throw new ResourceManagerException("Could not start the job leader id service.", e);
}
initialize();
}
@Override
public void shutDown() throws Exception {
Exception exception = null;
taskManagerHeartbeatManager.stop();
jobManagerHeartbeatManager.stop();
try {
slotManager.close();
} catch (Exception e) {
exception = ExceptionUtils.firstOrSuppressed(e, exception);
}
try {
leaderElectionService.stop();
} catch (Exception e) {
exception = ExceptionUtils.firstOrSuppressed(e, exception);
}
try {
super.shutDown();
} catch (Exception e) {
exception = ExceptionUtils.firstOrSuppressed(e, exception);
}
clearState();
if (exception != null) {
ExceptionUtils.rethrowException(exception, "Error while shutting the ResourceManager down.");
}
}
// ------------------------------------------------------------------------
// RPC methods
// ------------------------------------------------------------------------
@RpcMethod
public Future registerJobManager(
final UUID resourceManagerLeaderId,
final UUID jobManagerLeaderId,
final ResourceID jobManagerResourceId,
final String jobManagerAddress,
final JobID jobId) {
checkNotNull(resourceManagerLeaderId);
checkNotNull(jobManagerLeaderId);
checkNotNull(jobManagerResourceId);
checkNotNull(jobManagerAddress);
checkNotNull(jobId);
if (isValid(resourceManagerLeaderId)) {
if (!jobLeaderIdService.containsJob(jobId)) {
try {
jobLeaderIdService.addJob(jobId);
} catch (Exception e) {
ResourceManagerException exception = new ResourceManagerException("Could not add the job " +
jobId + " to the job id leader service.", e);
onFatalErrorAsync(exception);
log.error("Could not add job {} to job leader id service.", jobId, e);
return FlinkCompletableFuture.completedExceptionally(exception);
}
}
log.info("Registering job manager {}@{} for job {}.", jobManagerLeaderId, jobManagerAddress, jobId);
Future jobLeaderIdFuture;
try {
jobLeaderIdFuture = jobLeaderIdService.getLeaderId(jobId);
} catch (Exception e) {
// we cannot check the job leader id so let's fail
// TODO: Maybe it's also ok to skip this check in case that we cannot check the leader id
ResourceManagerException exception = new ResourceManagerException("Cannot obtain the " +
"job leader id future to verify the correct job leader.", e);
onFatalErrorAsync(exception);
log.debug("Could not obtain the job leader id future to verify the correct job leader.");
return FlinkCompletableFuture.completedExceptionally(exception);
}
Future jobMasterGatewayFuture = getRpcService().connect(jobManagerAddress, JobMasterGateway.class);
Future registrationResponseFuture = jobMasterGatewayFuture.thenCombineAsync(jobLeaderIdFuture, new BiFunction() {
@Override
public RegistrationResponse apply(final JobMasterGateway jobMasterGateway, UUID jobLeaderId) {
if (isValid(resourceManagerLeaderId)) {
if (jobLeaderId.equals(jobManagerLeaderId)) {
if (jobManagerRegistrations.containsKey(jobId)) {
JobManagerRegistration oldJobManagerRegistration = jobManagerRegistrations.get(jobId);
if (oldJobManagerRegistration.getLeaderID().equals(jobLeaderId)) {
// same registration
log.debug("Job manager {}@{} was already registered.", jobManagerLeaderId, jobManagerAddress);
} else {
// tell old job manager that he is no longer the job leader
disconnectJobManager(
oldJobManagerRegistration.getJobID(),
new Exception("New job leader for job " + jobId + " found."));
JobManagerRegistration jobManagerRegistration = new JobManagerRegistration(
jobId,
jobManagerResourceId,
jobLeaderId,
jobMasterGateway);
jobManagerRegistrations.put(jobId, jobManagerRegistration);
jmResourceIdRegistrations.put(jobManagerResourceId, jobManagerRegistration);
}
} else {
// new registration for the job
JobManagerRegistration jobManagerRegistration = new JobManagerRegistration(
jobId,
jobManagerResourceId,
jobLeaderId,
jobMasterGateway);
jobManagerRegistrations.put(jobId, jobManagerRegistration);
jmResourceIdRegistrations.put(jobManagerResourceId, jobManagerRegistration);
}
log.info("Registered job manager {}@{} for job {}.", jobManagerLeaderId, jobManagerAddress, jobId);
jobManagerHeartbeatManager.monitorTarget(jobManagerResourceId, new HeartbeatTarget() {
@Override
public void receiveHeartbeat(ResourceID resourceID, Void payload) {
// the ResourceManager will always send heartbeat requests to the JobManager
}
@Override
public void requestHeartbeat(ResourceID resourceID, Void payload) {
jobMasterGateway.heartbeatFromResourceManager(resourceID);
}
});
return new JobMasterRegistrationSuccess(
resourceManagerConfiguration.getHeartbeatInterval().toMilliseconds(),
getLeaderSessionId(),
resourceId);
} else {
log.debug("The job manager leader id {} did not match the job " +
"leader id {}.", jobManagerLeaderId, jobLeaderId);
return new RegistrationResponse.Decline("Job manager leader id did not match.");
}
} else {
log.debug("The resource manager leader id changed {}. Discarding job " +
"manager registration from {}.", getLeaderSessionId(), jobManagerAddress);
return new RegistrationResponse.Decline("Resource manager leader id changed.");
}
}
}, getMainThreadExecutor());
// handle exceptions which might have occurred in one of the futures inputs of combine
return registrationResponseFuture.handleAsync(new BiFunction() {
@Override
public RegistrationResponse apply(RegistrationResponse registrationResponse, Throwable throwable) {
if (throwable != null) {
if (log.isDebugEnabled()) {
log.debug("Registration of job manager {}@{} failed.", jobManagerLeaderId, jobManagerAddress, throwable);
} else {
log.info("Registration of job manager {}@{} failed.", jobManagerLeaderId, jobManagerAddress);
}
return new RegistrationResponse.Decline(throwable.getMessage());
} else {
return registrationResponse;
}
}
}, getRpcService().getExecutor());
} else {
log.debug("Discard register job manager message from {}, because the leader id " +
"{} did not match the expected leader id {}.", jobManagerAddress,
resourceManagerLeaderId, leaderSessionId);
return FlinkCompletableFuture.completed(
new RegistrationResponse.Decline("Resource manager leader id did not match."));
}
}
/**
* Register a {@link org.apache.flink.runtime.taskexecutor.TaskExecutor} at the resource manager
*
* @param resourceManagerLeaderId The fencing token for the ResourceManager leader
* @param taskExecutorAddress The address of the TaskExecutor that registers
* @param taskExecutorResourceId The resource ID of the TaskExecutor that registers
*
* @return The response by the ResourceManager.
*/
@RpcMethod
public Future registerTaskExecutor(
final UUID resourceManagerLeaderId,
final String taskExecutorAddress,
final ResourceID taskExecutorResourceId,
final SlotReport slotReport) {
if (Objects.equals(leaderSessionId, resourceManagerLeaderId)) {
Future taskExecutorGatewayFuture = getRpcService().connect(taskExecutorAddress, TaskExecutorGateway.class);
return taskExecutorGatewayFuture.handleAsync(new BiFunction() {
@Override
public RegistrationResponse apply(final TaskExecutorGateway taskExecutorGateway, Throwable throwable) {
if (throwable != null) {
return new RegistrationResponse.Decline(throwable.getMessage());
} else {
WorkerRegistration oldRegistration = taskExecutors.remove(taskExecutorResourceId);
if (oldRegistration != null) {
// TODO :: suggest old taskExecutor to stop itself
log.info("Replacing old instance of worker for ResourceID {}", taskExecutorResourceId);
// remove old task manager registration from slot manager
slotManager.unregisterTaskManager(oldRegistration.getInstanceID());
}
WorkerType newWorker = workerStarted(taskExecutorResourceId);
WorkerRegistration registration =
new WorkerRegistration<>(taskExecutorGateway, newWorker);
taskExecutors.put(taskExecutorResourceId, registration);
slotManager.registerTaskManager(registration, slotReport);
taskManagerHeartbeatManager.monitorTarget(taskExecutorResourceId, new HeartbeatTarget() {
@Override
public void receiveHeartbeat(ResourceID resourceID, Void payload) {
// the ResourceManager will always send heartbeat requests to the
// TaskManager
}
@Override
public void requestHeartbeat(ResourceID resourceID, Void payload) {
taskExecutorGateway.heartbeatFromResourceManager(resourceID);
}
});
return new TaskExecutorRegistrationSuccess(
registration.getInstanceID(),
resourceId,
resourceManagerConfiguration.getHeartbeatInterval().toMilliseconds());
}
}
}, getMainThreadExecutor());
} else {
log.warn("Discard registration from TaskExecutor {} at ({}) because the expected leader session ID {} did " +
"not equal the received leader session ID {}",
taskExecutorResourceId, taskExecutorAddress, leaderSessionId, resourceManagerLeaderId);
return FlinkCompletableFuture.completed(
new RegistrationResponse.Decline("Discard registration because the leader id " +
resourceManagerLeaderId + " does not match the expected leader id " +
leaderSessionId + '.'));
}
}
@RpcMethod
public void heartbeatFromTaskManager(final ResourceID resourceID) {
taskManagerHeartbeatManager.receiveHeartbeat(resourceID, null);
}
@RpcMethod
public void heartbeatFromJobManager(final ResourceID resourceID) {
jobManagerHeartbeatManager.receiveHeartbeat(resourceID, null);
}
@RpcMethod
public void disconnectTaskManager(final ResourceID resourceId, final Exception cause) {
closeTaskManagerConnection(resourceId, cause);
}
@RpcMethod
public void disconnectJobManager(final JobID jobId, final Exception cause) {
closeJobManagerConnection(jobId, cause);
}
/**
* Requests a slot from the resource manager.
*
* @param slotRequest Slot request
* @return Slot assignment
*/
@RpcMethod
public Acknowledge requestSlot(
UUID jobMasterLeaderID,
UUID resourceManagerLeaderID,
SlotRequest slotRequest) throws ResourceManagerException, LeaderSessionIDException {
if (!Objects.equals(resourceManagerLeaderID, leaderSessionId)) {
throw new LeaderSessionIDException(resourceManagerLeaderID, leaderSessionId);
}
JobID jobId = slotRequest.getJobId();
JobManagerRegistration jobManagerRegistration = jobManagerRegistrations.get(jobId);
if (null != jobManagerRegistration) {
if (Objects.equals(jobMasterLeaderID, jobManagerRegistration.getLeaderID())) {
log.info("Request slot with profile {} for job {} with allocation id {}.",
slotRequest.getResourceProfile(),
slotRequest.getJobId(),
slotRequest.getAllocationId());
slotManager.registerSlotRequest(slotRequest);
return Acknowledge.get();
} else {
throw new LeaderSessionIDException(jobMasterLeaderID, jobManagerRegistration.getLeaderID());
}
} else {
throw new ResourceManagerException("Could not find registered job manager for job " + jobId + '.');
}
}
/**
* Notification from a TaskExecutor that a slot has become available
* @param resourceManagerLeaderId TaskExecutor's resource manager leader id
* @param instanceID TaskExecutor's instance id
* @param slotId The slot id of the available slot
*/
@RpcMethod
public void notifySlotAvailable(
final UUID resourceManagerLeaderId,
final InstanceID instanceID,
final SlotID slotId,
final AllocationID allocationId) {
if (Objects.equals(resourceManagerLeaderId, leaderSessionId)) {
final ResourceID resourceId = slotId.getResourceID();
WorkerRegistration registration = taskExecutors.get(resourceId);
if (registration != null) {
InstanceID registrationId = registration.getInstanceID();
if (Objects.equals(registrationId, instanceID)) {
slotManager.freeSlot(slotId, allocationId);
} else {
log.debug("Invalid registration id for slot available message. This indicates an" +
" outdated request.");
}
} else {
log.debug("Could not find registration for resource id {}. Discarding the slot available" +
"message {}.", resourceId, slotId);
}
} else {
log.debug("Discarding notify slot available message for slot {}, because the " +
"leader id {} did not match the expected leader id {}.", slotId,
resourceManagerLeaderId, leaderSessionId);
}
}
/**
* Registers an info message listener
*
* @param address address of infoMessage listener to register to this resource manager
*/
@RpcMethod
public void registerInfoMessageListener(final String address) {
if(infoMessageListeners.containsKey(address)) {
log.warn("Receive a duplicate registration from info message listener on ({})", address);
} else {
Future infoMessageListenerRpcGatewayFuture = getRpcService().connect(address, InfoMessageListenerRpcGateway.class);
Future infoMessageListenerAcceptFuture = infoMessageListenerRpcGatewayFuture.thenAcceptAsync(new AcceptFunction() {
@Override
public void accept(InfoMessageListenerRpcGateway gateway) {
log.info("Receive a registration from info message listener on ({})", address);
infoMessageListeners.put(address, gateway);
}
}, getMainThreadExecutor());
infoMessageListenerAcceptFuture.exceptionallyAsync(new ApplyFunction() {
@Override
public Void apply(Throwable failure) {
log.warn("Receive a registration from unreachable info message listener on ({})", address);
return null;
}
}, getRpcService().getExecutor());
}
}
/**
* Unregisters an info message listener
*
* @param address of the info message listener to unregister from this resource manager
*
*/
@RpcMethod
public void unRegisterInfoMessageListener(final String address) {
infoMessageListeners.remove(address);
}
/**
* Cleanup application and shut down cluster
*
* @param finalStatus of the Flink application
* @param optionalDiagnostics for the Flink application
*/
@RpcMethod
public void shutDownCluster(final ApplicationStatus finalStatus, final String optionalDiagnostics) {
log.info("shut down cluster because application is in {}, diagnostics {}", finalStatus, optionalDiagnostics);
shutDownApplication(finalStatus, optionalDiagnostics);
}
@RpcMethod
public Integer getNumberOfRegisteredTaskManagers(UUID requestLeaderSessionId) throws LeaderIdMismatchException {
if (Objects.equals(leaderSessionId, requestLeaderSessionId)) {
return taskExecutors.size();
}
else {
throw new LeaderIdMismatchException(leaderSessionId, requestLeaderSessionId);
}
}
// ------------------------------------------------------------------------
// Testing methods
// ------------------------------------------------------------------------
/**
* Gets the leader session id of current resourceManager.
*
* @return return the leaderSessionId of current resourceManager, this returns null until the current resourceManager is granted leadership.
*/
@VisibleForTesting
UUID getLeaderSessionId() {
return leaderSessionId;
}
// ------------------------------------------------------------------------
// Internal methods
// ------------------------------------------------------------------------
private void clearState() {
jobManagerRegistrations.clear();
jmResourceIdRegistrations.clear();
taskExecutors.clear();
try {
jobLeaderIdService.clear();
} catch (Exception e) {
onFatalError(new ResourceManagerException("Could not properly clear the job leader id service.", e));
}
leaderSessionId = null;
}
/**
* This method should be called by the framework once it detects that a currently registered
* job manager has failed.
*
* @param jobId identifying the job whose leader shall be disconnected.
* @param cause The exception which cause the JobManager failed.
*/
protected void closeJobManagerConnection(JobID jobId, Exception cause) {
JobManagerRegistration jobManagerRegistration = jobManagerRegistrations.remove(jobId);
if (jobManagerRegistration != null) {
final ResourceID jobManagerResourceId = jobManagerRegistration.getJobManagerResourceID();
final JobMasterGateway jobMasterGateway = jobManagerRegistration.getJobManagerGateway();
final UUID jobManagerLeaderId = jobManagerRegistration.getLeaderID();
log.info("Disconnect job manager {}@{} for job {} from the resource manager.",
jobManagerLeaderId,
jobMasterGateway.getAddress(),
jobId);
jobManagerHeartbeatManager.unmonitorTarget(jobManagerResourceId);
jmResourceIdRegistrations.remove(jobManagerResourceId);
// tell the job manager about the disconnect
jobMasterGateway.disconnectResourceManager(jobManagerLeaderId, getLeaderSessionId(), cause);
} else {
log.debug("There was no registered job manager for job {}.", jobId);
}
}
/**
* This method should be called by the framework once it detects that a currently registered
* task executor has failed.
*
* @param resourceID Id of the TaskManager that has failed.
* @param cause The exception which cause the TaskManager failed.
*/
protected void closeTaskManagerConnection(final ResourceID resourceID, final Exception cause) {
taskManagerHeartbeatManager.unmonitorTarget(resourceID);
WorkerRegistration workerRegistration = taskExecutors.remove(resourceID);
if (workerRegistration != null) {
log.info("Task manager {} failed because {}.", resourceID, cause);
// TODO :: suggest failed task executor to stop itself
slotManager.unregisterTaskManager(workerRegistration.getInstanceID());
workerRegistration.getTaskExecutorGateway().disconnectResourceManager(cause);
} else {
log.debug("Could not find a registered task manager with the process id {}.", resourceID);
}
}
/**
* Checks whether the given resource manager leader id is matching the current leader id and
* not null.
*
* @param resourceManagerLeaderId to check
* @return True if the given leader id matches the actual leader id and is not null; otherwise false
*/
protected boolean isValid(UUID resourceManagerLeaderId) {
return Objects.equals(resourceManagerLeaderId, leaderSessionId);
}
protected void removeJob(JobID jobId) {
try {
jobLeaderIdService.removeJob(jobId);
} catch (Exception e) {
log.warn("Could not properly remove the job {} from the job leader id service.", jobId, e);
}
if (jobManagerRegistrations.containsKey(jobId)) {
disconnectJobManager(jobId, new Exception("Job " + jobId + "was removed"));
}
}
protected void jobLeaderLostLeadership(JobID jobId, UUID oldJobLeaderId) {
if (jobManagerRegistrations.containsKey(jobId)) {
JobManagerRegistration jobManagerRegistration = jobManagerRegistrations.get(jobId);
if (Objects.equals(jobManagerRegistration.getLeaderID(), oldJobLeaderId)) {
disconnectJobManager(jobId, new Exception("Job leader lost leadership."));
} else {
log.debug("Discarding job leader lost leadership, because a new job leader was found for job {}. ", jobId);
}
} else {
log.debug("Discard job leader lost leadership for outdated leader {} for job {}.", oldJobLeaderId, jobId);
}
}
// ------------------------------------------------------------------------
// Info messaging
// ------------------------------------------------------------------------
public void sendInfoMessage(final String message) {
getRpcService().execute(new Runnable() {
@Override
public void run() {
InfoMessage infoMessage = new InfoMessage(message);
for (InfoMessageListenerRpcGateway listenerRpcGateway : infoMessageListeners.values()) {
listenerRpcGateway
.notifyInfoMessage(infoMessage);
}
}
});
}
// ------------------------------------------------------------------------
// Error Handling
// ------------------------------------------------------------------------
/**
* Notifies the ResourceManager that a fatal error has occurred and it cannot proceed.
* This method should be used when asynchronous threads want to notify the
* ResourceManager of a fatal error.
*
* @param t The exception describing the fatal error
*/
protected void onFatalErrorAsync(final Throwable t) {
runAsync(new Runnable() {
@Override
public void run() {
onFatalError(t);
}
});
}
/**
* Notifies the ResourceManager that a fatal error has occurred and it cannot proceed.
* This method must only be called from within the ResourceManager's main thread.
*
* @param t The exception describing the fatal error
*/
void onFatalError(Throwable t) {
log.error("Fatal error occurred.", t);
fatalErrorHandler.onFatalError(t);
}
// ------------------------------------------------------------------------
// Leader Contender
// ------------------------------------------------------------------------
/**
* Callback method when current resourceManager is granted leadership
*
* @param newLeaderSessionID unique leadershipID
*/
@Override
public void grantLeadership(final UUID newLeaderSessionID) {
runAsync(new Runnable() {
@Override
public void run() {
log.info("ResourceManager {} was granted leadership with leader session ID {}", getAddress(), newLeaderSessionID);
// clear the state if we've been the leader before
if (leaderSessionId != null) {
clearState();
}
leaderSessionId = newLeaderSessionID;
slotManager.start(leaderSessionId, getMainThreadExecutor(), new ResourceManagerActionsImpl());
getRpcService().execute(new Runnable() {
@Override
public void run() {
// confirming the leader session ID might be blocking,
leaderElectionService.confirmLeaderSessionID(newLeaderSessionID);
}
});
}
});
}
/**
* Callback method when current resourceManager loses leadership.
*/
@Override
public void revokeLeadership() {
runAsync(new Runnable() {
@Override
public void run() {
log.info("ResourceManager {} was revoked leadership.", getAddress());
clearState();
slotManager.suspend();
leaderSessionId = null;
}
});
}
/**
* Handles error occurring in the leader election service
*
* @param exception Exception being thrown in the leader election service
*/
@Override
public void handleError(final Exception exception) {
onFatalErrorAsync(new ResourceManagerException("Received an error from the LeaderElectionService.", exception));
}
// ------------------------------------------------------------------------
// Framework specific behavior
// ------------------------------------------------------------------------
/**
* Initializes the framework specific components.
*
* @throws ResourceManagerException which occurs during initialization and causes the resource manager to fail.
*/
protected abstract void initialize() throws ResourceManagerException;
/**
* The framework specific code for shutting down the application. This should report the
* application's final status and shut down the resource manager cleanly.
*
* This method also needs to make sure all pending containers that are not registered
* yet are returned.
*
* @param finalStatus The application status to report.
* @param optionalDiagnostics An optional diagnostics message.
*/
protected abstract void shutDownApplication(ApplicationStatus finalStatus, String optionalDiagnostics);
/**
* Allocates a resource using the resource profile.
* @param resourceProfile The resource description
*/
@VisibleForTesting
public abstract void startNewWorker(ResourceProfile resourceProfile);
public abstract void stopWorker(InstanceID instanceId);
/**
* Callback when a worker was started.
* @param resourceID The worker resource id
*/
protected abstract WorkerType workerStarted(ResourceID resourceID);
// ------------------------------------------------------------------------
// Static utility classes
// ------------------------------------------------------------------------
private class ResourceManagerActionsImpl implements ResourceManagerActions {
@Override
public void releaseResource(InstanceID instanceId) {
stopWorker(instanceId);
}
@Override
public void allocateResource(ResourceProfile resourceProfile) throws ResourceManagerException {
startNewWorker(resourceProfile);
}
@Override
public void notifyAllocationFailure(JobID jobId, AllocationID allocationId, Exception cause) {
log.info("Slot request with allocation id {} for job {} failed.", allocationId, jobId, cause);
}
}
private class JobLeaderIdActionsImpl implements JobLeaderIdActions {
@Override
public void jobLeaderLostLeadership(final JobID jobId, final UUID oldJobLeaderId) {
ResourceManager.this.jobLeaderLostLeadership(jobId, oldJobLeaderId);
}
@Override
public void notifyJobTimeout(final JobID jobId, final UUID timeoutId) {
runAsync(new Runnable() {
@Override
public void run() {
if (jobLeaderIdService.isValidTimeout(jobId, timeoutId)) {
removeJob(jobId);
}
}
});
}
@Override
public void handleError(Throwable error) {
onFatalErrorAsync(error);
}
}
private class TaskManagerHeartbeatListener implements HeartbeatListener {
@Override
public void notifyHeartbeatTimeout(final ResourceID resourceID) {
runAsync(new Runnable() {
@Override
public void run() {
log.info("The heartbeat of TaskManager with id {} timed out.", resourceID);
closeTaskManagerConnection(
resourceID,
new TimeoutException("The heartbeat of TaskManager with id " + resourceID + " timed out."));
}
});
}
@Override
public void reportPayload(ResourceID resourceID, Void payload) {
// nothing to do since there is no payload
}
@Override
public Future retrievePayload() {
return FlinkCompletableFuture.completed(null);
}
}
private class JobManagerHeartbeatListener implements HeartbeatListener {
@Override
public void notifyHeartbeatTimeout(final ResourceID resourceID) {
runAsync(new Runnable() {
@Override
public void run() {
log.info("The heartbeat of JobManager with id {} timed out.", resourceID);
if (jmResourceIdRegistrations.containsKey(resourceID)) {
JobManagerRegistration jobManagerRegistration = jmResourceIdRegistrations.get(resourceID);
if (jobManagerRegistration != null) {
closeJobManagerConnection(
jobManagerRegistration.getJobID(),
new TimeoutException("The heartbeat of JobManager with id " + resourceID + " timed out."));
}
}
}
});
}
@Override
public void reportPayload(ResourceID resourceID, Void payload) {
// nothing to do since there is no payload
}
@Override
public Future retrievePayload() {
return FlinkCompletableFuture.completed(null);
}
}
}