Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance. Project price only 1 $
You can buy this project and download/modify it how often you want.
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.runtime.clusterframework;
import akka.actor.ActorRef;
import akka.actor.ActorSelection;
import akka.actor.ActorSystem;
import akka.actor.Props;
import akka.dispatch.OnComplete;
import akka.pattern.Patterns;
import akka.util.Timeout;
import org.apache.flink.configuration.ConfigConstants;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.runtime.akka.AkkaUtils;
import org.apache.flink.runtime.akka.FlinkUntypedActor;
import org.apache.flink.runtime.clusterframework.messages.CheckAndAllocateContainers;
import org.apache.flink.runtime.clusterframework.messages.FatalErrorOccurred;
import org.apache.flink.runtime.clusterframework.messages.InfoMessage;
import org.apache.flink.runtime.clusterframework.messages.RegisterInfoMessageListenerSuccessful;
import org.apache.flink.runtime.clusterframework.messages.NotifyResourceStarted;
import org.apache.flink.runtime.clusterframework.messages.RegisterResourceManagerSuccessful;
import org.apache.flink.runtime.clusterframework.messages.NewLeaderAvailable;
import org.apache.flink.runtime.clusterframework.messages.RegisterInfoMessageListener;
import org.apache.flink.runtime.clusterframework.messages.RegisterResourceManager;
import org.apache.flink.runtime.clusterframework.messages.RemoveResource;
import org.apache.flink.runtime.clusterframework.messages.ResourceRemoved;
import org.apache.flink.runtime.clusterframework.messages.SetWorkerPoolSize;
import org.apache.flink.runtime.clusterframework.messages.StopCluster;
import org.apache.flink.runtime.clusterframework.messages.TriggerRegistrationAtJobManager;
import org.apache.flink.runtime.clusterframework.messages.UnRegisterInfoMessageListener;
import org.apache.flink.runtime.clusterframework.types.ResourceID;
import org.apache.flink.runtime.clusterframework.types.ResourceIDRetrievable;
import org.apache.flink.runtime.leaderretrieval.LeaderRetrievalListener;
import org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService;
import org.apache.flink.runtime.messages.Acknowledge;
import org.apache.flink.runtime.messages.JobManagerMessages.LeaderSessionMessage;
import org.apache.flink.util.Preconditions;
import scala.concurrent.Future;
import scala.concurrent.duration.Duration;
import scala.concurrent.duration.FiniteDuration;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.UUID;
import java.util.concurrent.TimeUnit;
import static java.util.Objects.requireNonNull;
/**
 * Abstract base actor for resource managers that maintain a pool of workers (TaskManagers)
 * on behalf of the leading JobManager.
 *
 * <h1>Worker allocation steps</h1>
 *
 * <ol>
 *     <li>The resource manager decides to request more workers. This can happen in order
 *         to fill the initial pool, or as a result of the JobManager requesting more workers.</li>
 *
 *     <li>The resource master calls {@link #requestNewWorkers(int)}, which triggers requests
 *         for more containers. After that, the {@link #getNumWorkerRequestsPending()}
 *         should reflect the pending requests.</li>
 *
 *     <li>The concrete framework may acquire containers and then trigger to start TaskManagers
 *         in those containers. That should be reflected in
 *         {@link #getNumWorkersPendingRegistration()}.</li>
 *
 *     <li>At some point, the TaskManager processes will have started and send a registration
 *         message to the JobManager. The JobManager will perform
 *         a lookup with the ResourceManager to check if it really started this TaskManager.
 *         The method {@link #workerStarted(ResourceID)} will be called
 *         to inform about a registered worker.</li>
 * </ol>
 */
public abstract class FlinkResourceManager extends FlinkUntypedActor {
/** The exit code with which the process is stopped in case of a fatal error. */
protected static final int EXIT_CODE_FATAL_ERROR = -13;
/** The default name of the resource manager actor. */
public static final String RESOURCE_MANAGER_NAME = "resourcemanager";
// ------------------------------------------------------------------------
/** The Flink configuration object. Checked for null in the constructor. */
protected final Configuration config;
/**
 * The timeout for actor messages sent to the JobManager / TaskManagers. Set in the constructor
 * from the Akka lookup timeout of the configuration, or from the default lookup timeout if
 * reading the configured value fails.
 */
private final FiniteDuration messageTimeout;
/** The service to find the right leader JobManager (to support high availability). */
private final LeaderRetrievalService leaderRetriever;
/**
 * Map which contains the workers from which we know that they have been successfully started
 * in a container, keyed by their resource ID. This notification is sent by the JM when a TM
 * tries to register at it.
 * NOTE(review): declared as a raw Map although its values are assigned to WorkerType variables
 * elsewhere in this class - the type arguments look like they were lost; confirm and restore.
 */
private final Map startedWorkers;
/**
 * Set of listeners for info messages. Entries are the senders of RegisterInfoMessageListener
 * messages.
 * NOTE(review): declared as a raw Set although only actor references are added - confirm
 * whether this should be a Set of ActorRef.
 */
private final Set infoMessageListeners;
/**
 * The JobManager that the framework master manages resources for. May be null; the message
 * handling checks for that before answering on the JobManager's behalf.
 */
private ActorRef jobManager;
/** Our JobManager's leader session. Not assigned in the constructor, so initially null. */
private UUID leaderSessionID;
/**
 * The size of the worker pool that the resource master strives to maintain. Starts out as the
 * number of initial TaskManagers passed to the constructor.
 */
private int designatedPoolSize;
// ------------------------------------------------------------------------
/**
 * Creates the base state of a resource manager actor.
 *
 * <p>The message timeout is read from the Akka lookup timeout of the given configuration. If
 * that value cannot be read, the default lookup timeout
 * ({@link ConfigConstants#DEFAULT_AKKA_LOOKUP_TIMEOUT}) is used instead and a warning is logged.
 *
 * @param numInitialTaskManagers The initial designated size of the worker pool.
 * @param flinkConfig The Flink configuration object. Must not be null.
 * @param leaderRetriever The service used to find the leading JobManager. Must not be null.
 * @throws NullPointerException If {@code flinkConfig} or {@code leaderRetriever} is null.
 */
protected FlinkResourceManager(
        int numInitialTaskManagers,
        Configuration flinkConfig,
        LeaderRetrievalService leaderRetriever) {

    this.config = requireNonNull(flinkConfig);
    this.leaderRetriever = requireNonNull(leaderRetriever);
    this.startedWorkers = new HashMap<>();
    this.infoMessageListeners = new HashSet<>();
    this.designatedPoolSize = numInitialTaskManagers;

    FiniteDuration lookupTimeout;
    try {
        lookupTimeout = AkkaUtils.getLookupTimeout(config);
    }
    catch (Exception e) {
        // Best effort: an unreadable timeout setting must not prevent the actor from being
        // created, but the fallback should be visible in the logs instead of being silent.
        LOG.warn("Could not read the Akka lookup timeout from the configuration, "
                + "falling back to the default of " + ConfigConstants.DEFAULT_AKKA_LOOKUP_TIMEOUT, e);
        lookupTimeout = new FiniteDuration(
                Duration.apply(ConfigConstants.DEFAULT_AKKA_LOOKUP_TIMEOUT).toMillis(),
                TimeUnit.MILLISECONDS);
    }
    this.messageTimeout = lookupTimeout;
}
// ------------------------------------------------------------------------
// Actor Behavior
// ------------------------------------------------------------------------
/**
 * Starts the leader retrieval service and afterwards runs the framework specific
 * initialization. Leader changes and leader retrieval errors are forwarded to this actor as
 * messages. Anything thrown during startup is sent to this actor as a
 * {@link FatalErrorOccurred} message.
 */
@Override
public void preStart() {
    try {
        // the listener turns leader retrieval callbacks into messages for this actor, so we
        // get informed about JobManager leader changes
        final LeaderRetrievalListener leaderListener = new LeaderRetrievalListener() {

            @Override
            public void notifyLeaderAddress(String address, UUID sessionId) {
                final NewLeaderAvailable leaderChange = new NewLeaderAvailable(address, sessionId);
                self().tell(leaderChange, ActorRef.noSender());
            }

            @Override
            public void handleError(Exception cause) {
                final FatalErrorOccurred failure =
                        new FatalErrorOccurred("Leader retrieval service failed", cause);
                self().tell(failure, ActorRef.noSender());
            }
        };
        leaderRetriever.start(leaderListener);

        // framework specific initialization
        initialize();
    }
    catch (Throwable startupError) {
        final FatalErrorOccurred failure = new FatalErrorOccurred(
                "Error during startup of ResourceManager actor", startupError);
        self().tell(failure, ActorRef.noSender());
    }
}
/**
 * Stops the leader retrieval service when the actor stops. A failure while stopping the
 * service is logged and not propagated.
 */
@Override
public void postStop() {
    try {
        leaderRetriever.stop();
    }
    catch (Throwable shutdownError) {
        LOG.error("Could not cleanly shut down leader retrieval service", shutdownError);
    }
}
/**
 * Dispatches an incoming actor message to the matching handler.
 *
 * <p>This method receives the actor messages after they have been filtered for
 * a match with the leader session. Messages of an unknown type are logged and discarded.
 * Anything thrown while a message is handled is reported via {@code fatalError}.
 *
 * @param message The incoming actor message.
 */
@Override
protected void handleMessage(Object message) {
    try {
        // ---- worker allocation and pool sizes
        if (message instanceof CheckAndAllocateContainers) {
            checkWorkersPool();
        }
        else if (message instanceof SetWorkerPoolSize) {
            adjustDesignatedNumberOfWorkers(((SetWorkerPoolSize) message).numberOfWorkers());
        }
        else if (message instanceof RemoveResource) {
            removeRegisteredResource(((RemoveResource) message).resourceId());
        }
        // ---- lookup of registered resources
        else if (message instanceof NotifyResourceStarted) {
            handleResourceStarted(sender(), ((NotifyResourceStarted) message).getResourceID());
        }
        // ---- JobManager leader status and registration
        else if (message instanceof NewLeaderAvailable) {
            final NewLeaderAvailable leaderChange = (NewLeaderAvailable) message;
            newJobManagerLeaderAvailable(leaderChange.leaderAddress(), leaderChange.leaderSessionId());
        }
        else if (message instanceof TriggerRegistrationAtJobManager) {
            triggerConnectingToJobManager(
                    ((TriggerRegistrationAtJobManager) message).jobManagerAddress());
        }
        else if (message instanceof RegisterResourceManagerSuccessful) {
            final RegisterResourceManagerSuccessful registration =
                    (RegisterResourceManagerSuccessful) message;
            jobManagerLeaderConnected(
                    registration.jobManager(), registration.currentlyRegisteredTaskManagers());
        }
        // ---- end of application
        else if (message instanceof StopCluster) {
            final StopCluster stopRequest = (StopCluster) message;
            shutdownCluster(stopRequest.finalStatus(), stopRequest.message());
        }
        // ---- miscellaneous messages
        else if (message instanceof RegisterInfoMessageListener) {
            // listeners are only accepted (and answered) while a JobManager is known
            if (jobManager != null) {
                final ActorRef listener = sender();
                infoMessageListeners.add(listener);
                // the confirmation is sent with the JobManager as the sender
                listener.tell(
                        decorateMessage(RegisterInfoMessageListenerSuccessful.get()),
                        jobManager);
            }
        }
        else if (message instanceof UnRegisterInfoMessageListener) {
            infoMessageListeners.remove(sender());
        }
        else if (message instanceof FatalErrorOccurred) {
            final FatalErrorOccurred failure = (FatalErrorOccurred) message;
            fatalError(failure.message(), failure.error());
        }
        // ---- everything else
        else {
            LOG.error("Discarding unknown message: {}", message);
        }
    }
    catch (Throwable processingError) {
        // fatal error, needs master recovery
        fatalError("Error processing actor message", processingError);
    }
}
/**
 * Returns the leader session ID currently stored by this resource manager.
 *
 * @return The stored leader session ID, or null if none has been set.
 */
@Override
protected final UUID getLeaderSessionID() {
    return this.leaderSessionID;
}
// ------------------------------------------------------------------------
// Status
// ------------------------------------------------------------------------
/**
 * Returns the designated size of the worker pool, i.e. the number of workers that the
 * resource master strives to maintain. The number of workers that actually exist can be
 * lower (worker requests still pending) or higher (workers not yet released).
 *
 * @return The designated worker pool size.
 */
public int getDesignatedWorkerPoolSize() {
    return this.designatedPoolSize;
}
/**
 * Returns how many TaskManagers are currently recorded as started.
 *
 * @return The number of currently started TaskManagers.
 */
public int getNumberOfStartedTaskManagers() {
    return this.startedWorkers.size();
}
/**
 * Gets the currently registered resources.
 *
 * @return The values view of the started workers map. This is not a copy, so it reflects
 *         later changes to the map.
 */
public Collection getStartedTaskManagers() {
    return this.startedWorkers.values();
}
/**
 * Checks whether a started worker is recorded for the given resource ID.
 *
 * @param resourceId The resource ID for the worker.
 * @return True if already registered, otherwise false
 */
public boolean isStarted(ResourceID resourceId) {
    return this.startedWorkers.containsKey(resourceId);
}
/**
 * Gets a collection of all currently started TaskManagers.
 *
 * @return All currently started TaskManagers, as the values view of the started workers map.
 */
public Collection allStartedWorkers() {
    return this.startedWorkers.values();
}
/**
 * Handles the notification that a TaskManager has been started in a container with the given
 * resource id. An unknown resource id is passed to {@code workerStarted}; if that yields a
 * worker, the worker is recorded as started. The notification is acknowledged to the sender in
 * every case, including a null resource id.
 *
 * @param jobManager The sender (JobManager) of the message
 * @param resourceID The resource id of the started TaskManager
 */
private void handleResourceStarted(ActorRef jobManager, ResourceID resourceID) {
    if (resourceID != null) {
        // the same notification may arrive more than once (TaskManager may send duplicate
        // register messages)
        final boolean alreadyRecorded = startedWorkers.get(resourceID) != null;
        if (alreadyRecorded) {
            LOG.debug("Notification that TaskManager {} had been started was sent before.", resourceID);
        }
        else {
            final WorkerType startedWorker = workerStarted(resourceID);
            if (startedWorker == null) {
                LOG.info("TaskManager {} has not been started by this resource manager.", resourceID);
            }
            else {
                startedWorkers.put(resourceID, startedWorker);
                LOG.info("TaskManager {} has started.", resourceID);
            }
        }
    }

    // Acknowledge the resource registration
    final Object acknowledgement = decorateMessage(Acknowledge.get());
    jobManager.tell(acknowledgement, self());
}
/**
 * Removes the worker with the given resource id from the started workers and releases it.
 * If no worker is recorded under that id, a warning is logged instead. Note that this does
 * not automatically shrink the designated worker pool size.
 *
 * @param resourceId The TaskManager's resource id.
 */
private void removeRegisteredResource(ResourceID resourceId) {
    final WorkerType removedWorker = startedWorkers.remove(resourceId);
    if (removedWorker == null) {
        LOG.warn("Resource {} could not be released", resourceId);
        return;
    }
    releaseStartedWorker(removedWorker);
}
// ------------------------------------------------------------------------
// Registration and consolidation with JobManager Leader
// ------------------------------------------------------------------------
/**
 * Called as soon as we discover (via leader election) that a JobManager lost leadership
 * or a different one gained leadership. The connection to the current leader is always
 * dropped first; a connection attempt is only triggered if a new leader address is given.
 *
 * @param leaderAddress The address (Akka URL) of the new leader. Null if there is currently no leader.
 * @param leaderSessionID The unique session ID marking the leadership session.
 */
private void newJobManagerLeaderAvailable(String leaderAddress, UUID leaderSessionID) {
    LOG.debug("Received new leading JobManager {}. Connecting.", leaderAddress);

    // disconnect from the current leader (no-op if no leader yet)
    jobManagerLostLeadership();

    if (leaderAddress == null) {
        // only a leader disconnect happened, there is no new leader yet
        return;
    }

    // the leaderSessionID implicitly filters out success and failure messages
    // that come after leadership changed again
    this.leaderSessionID = leaderSessionID;
    triggerConnectingToJobManager(leaderAddress);
}
/**
* Causes the resource manager to announce itself at the new leader JobManager and
* obtains its connection information and currently known TaskManagers.
*
* @param leaderAddress The akka actor URL of the new leader JobManager.
*/
protected void triggerConnectingToJobManager(String leaderAddress) {
LOG.info("Trying to associate with JobManager leader " + leaderAddress);
final Object registerMessage = decorateMessage(new RegisterResourceManager(self()));
final Object retryMessage = decorateMessage(new TriggerRegistrationAtJobManager(leaderAddress));
// send the registration message to the JobManager
ActorSelection jobManagerSel = context().actorSelection(leaderAddress);
Future