All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.flink.runtime.jobmanager.JobManager.scala Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.runtime.jobmanager

import java.io.IOException
import java.net._
import java.util.UUID
import java.util.concurrent.{TimeUnit, Future => _, TimeoutException => _, _}

import akka.actor.Status.{Failure, Success}
import akka.actor._
import akka.pattern.ask
import grizzled.slf4j.Logger
import org.apache.flink.api.common.JobID
import org.apache.flink.api.common.time.Time
import org.apache.flink.configuration._
import org.apache.flink.core.fs.{FileSystem, Path}
import org.apache.flink.core.io.InputSplitAssigner
import org.apache.flink.metrics.groups.UnregisteredMetricsGroup
import org.apache.flink.metrics.{Gauge, MetricGroup}
import org.apache.flink.runtime.accumulators.AccumulatorSnapshot
import org.apache.flink.runtime.akka.{AkkaUtils, ListeningBehaviour}
import org.apache.flink.runtime.blob.{BlobServer, BlobStore}
import org.apache.flink.runtime.checkpoint._
import org.apache.flink.runtime.checkpoint.savepoint.{SavepointLoader, SavepointStore}
import org.apache.flink.runtime.client._
import org.apache.flink.runtime.clusterframework.FlinkResourceManager
import org.apache.flink.runtime.clusterframework.messages._
import org.apache.flink.runtime.clusterframework.standalone.StandaloneResourceManager
import org.apache.flink.runtime.clusterframework.types.ResourceID
import org.apache.flink.runtime.concurrent.{AcceptFunction, ApplyFunction, BiFunction, Executors => FlinkExecutors}
import org.apache.flink.runtime.execution.SuppressRestartsException
import org.apache.flink.runtime.execution.librarycache.{BlobLibraryCacheManager, LibraryCacheManager}
import org.apache.flink.runtime.executiongraph.restart.RestartStrategyFactory
import org.apache.flink.runtime.executiongraph._
import org.apache.flink.runtime.highavailability.{HighAvailabilityServices, HighAvailabilityServicesUtils}
import org.apache.flink.runtime.highavailability.HighAvailabilityServicesUtils.AddressResolution
import org.apache.flink.runtime.instance.{AkkaActorGateway, InstanceID, InstanceManager}
import org.apache.flink.runtime.jobgraph.{JobGraph, JobStatus}
import org.apache.flink.runtime.jobmanager.SubmittedJobGraphStore.SubmittedJobGraphListener
import org.apache.flink.runtime.jobmanager.scheduler.{Scheduler => FlinkScheduler}
import org.apache.flink.runtime.jobmanager.slots.ActorTaskManagerGateway
import org.apache.flink.runtime.jobmaster.JobMaster
import org.apache.flink.runtime.leaderelection.{LeaderContender, LeaderElectionService}
import org.apache.flink.runtime.jobmaster.JobMaster.{ARCHIVE_NAME, JOB_MANAGER_NAME}
import org.apache.flink.runtime.messages.ArchiveMessages.ArchiveExecutionGraph
import org.apache.flink.runtime.messages.ExecutionGraphMessages.JobStatusChanged
import org.apache.flink.runtime.messages.JobManagerMessages._
import org.apache.flink.runtime.messages.Messages.Disconnect
import org.apache.flink.runtime.messages.RegistrationMessages._
import org.apache.flink.runtime.messages.{Acknowledge, StackTrace}
import org.apache.flink.runtime.messages.TaskManagerMessages.Heartbeat
import org.apache.flink.runtime.messages.TaskMessages.UpdateTaskExecutionState
import org.apache.flink.runtime.messages.accumulators._
import org.apache.flink.runtime.messages.checkpoint.{AbstractCheckpointMessage, AcknowledgeCheckpoint, DeclineCheckpoint}
import org.apache.flink.runtime.messages.webmonitor.{InfoMessage, _}
import org.apache.flink.runtime.metrics.groups.JobManagerMetricGroup
import org.apache.flink.runtime.metrics.{MetricRegistryConfiguration, MetricRegistry => FlinkMetricRegistry}
import org.apache.flink.runtime.metrics.util.MetricUtils
import org.apache.flink.runtime.process.ProcessReaper
import org.apache.flink.runtime.query.KvStateMessage.{LookupKvStateLocation, NotifyKvStateRegistered, NotifyKvStateUnregistered}
import org.apache.flink.runtime.query.{KvStateMessage, UnknownKvStateLocation}
import org.apache.flink.runtime.rpc.akka.AkkaRpcServiceUtils
import org.apache.flink.runtime.security.SecurityUtils
import org.apache.flink.runtime.security.SecurityUtils.SecurityConfiguration
import org.apache.flink.runtime.taskexecutor.TaskExecutor
import org.apache.flink.runtime.taskexecutor.TaskExecutor.TASK_MANAGER_NAME
import org.apache.flink.runtime.taskmanager.TaskManager
import org.apache.flink.runtime.util._
import org.apache.flink.runtime.webmonitor.{WebMonitor, WebMonitorUtils}
import org.apache.flink.runtime.{FlinkActor, LeaderSessionMessageFilter, LogMessages}
import org.apache.flink.util.{ConfigurationUtil, InstantiationUtil, NetUtils}
import org.jboss.netty.channel.ChannelException

import scala.annotation.tailrec
import scala.collection.JavaConverters._
import scala.collection.mutable
import scala.concurrent._
import scala.concurrent.duration._
import scala.language.postfixOps

/**
 * The job manager is responsible for receiving Flink jobs, scheduling the tasks, gathering the
 * job status and managing the task managers. It is realized as an actor and receives amongst others
 * the following messages:
 *
 *  - [[RegisterTaskManager]] is sent by a TaskManager which wants to register at the job manager.
 *  A successful registration at the instance manager is acknowledged by [[AcknowledgeRegistration]]
 *
 *  - [[SubmitJob]] is sent by a client which wants to submit a job to the system. The submit
 *  message contains the job description in the form of the JobGraph. The JobGraph is appended to
 *  the ExecutionGraph and the corresponding ExecutionJobVertices are scheduled for execution on
 *  the TaskManagers.
 *
 *  - [[CancelJob]] requests to cancel the job with the specified jobID. A successful cancellation
 *  is indicated by [[CancellationSuccess]] and a failure by [[CancellationFailure]]
 *
 * - [[UpdateTaskExecutionState]] is sent by a TaskManager to update the state of an
 * ExecutionVertex contained in the [[ExecutionGraph]].
 * A successful update is acknowledged by true and otherwise false.
 *
 * - [[RequestNextInputSplit]] requests the next input split for a running task on a
 * [[TaskManager]]. The assigned input split or null is sent to the sender in the form of the
 * message [[NextInputSplit]].
 *
 * - [[JobStatusChanged]] indicates that the status of job (RUNNING, CANCELING, FINISHED, etc.) has
 * changed. This message is sent by the ExecutionGraph.
 */
class JobManager(
    protected val flinkConfiguration: Configuration,
    protected val futureExecutor: ScheduledExecutorService,
    protected val ioExecutor: Executor,
    protected val instanceManager: InstanceManager,
    protected val scheduler: FlinkScheduler,
    protected val libraryCacheManager: BlobLibraryCacheManager,
    protected val archive: ActorRef,
    protected val restartStrategyFactory: RestartStrategyFactory,
    protected val timeout: FiniteDuration,
    protected val leaderElectionService: LeaderElectionService,
    protected val submittedJobGraphs : SubmittedJobGraphStore,
    protected val checkpointRecoveryFactory : CheckpointRecoveryFactory,
    protected val jobRecoveryTimeout: FiniteDuration,
    protected val metricsRegistry: Option[FlinkMetricRegistry])
  extends FlinkActor
  with LeaderSessionMessageFilter // mixin oder is important, we want filtering after logging
  with LogMessages // mixin order is important, we want first logging
  with LeaderContender
  with SubmittedJobGraphListener {

  override val log = Logger(getClass)

  /** Either running or not yet archived jobs (session hasn't been ended). */
  protected val currentJobs = scala.collection.mutable.HashMap[JobID, (ExecutionGraph, JobInfo)]()

  protected val haMode = HighAvailabilityMode.fromConfig(flinkConfiguration)

  var leaderSessionID: Option[UUID] = None

  protected val jobManagerMetricGroup : Option[JobManagerMetricGroup] = metricsRegistry match {
    case Some(registry) =>
      val host = flinkConfiguration.getString(ConfigConstants.JOB_MANAGER_IPC_ADDRESS_KEY, null)
      Option(new JobManagerMetricGroup(
        registry, NetUtils.unresolvedHostToNormalizedString(host)))
    case None =>
      log.warn("Could not instantiate JobManager metrics.")
      None
  }

  /** Futures which have to be completed before terminating the job manager */
  var futuresToComplete: Option[Seq[Future[Unit]]] = None

  /**
   * The port of the web monitor as configured. Make sure that it is actually configured before
   * starting the JobManager. This tightly couples the web monitor with the job manager. It is a
   * temporary workaround until all execution graph components are properly serializable and all
   * web monitors can transparently interact with each job manager. Currently each web server has
   * to run in the actor system of the associated job manager.
   */
  val webMonitorPort : Int = flinkConfiguration.getInteger(
    ConfigConstants.JOB_MANAGER_WEB_PORT_KEY, -1)

  /** The default directory for savepoints. */
  val defaultSavepointDir: String = ConfigurationUtil.getStringWithDeprecatedKeys(
    flinkConfiguration,
    ConfigConstants.SAVEPOINT_DIRECTORY_KEY,
    null,
    ConfigConstants.SAVEPOINT_FS_DIRECTORY_KEY)

  /** The resource manager actor responsible for allocating and managing task manager resources. */
  var currentResourceManager: Option[ActorRef] = None

  var currentResourceManagerConnectionId: Long = 0

  val taskManagerMap = mutable.Map[ActorRef, InstanceID]()

  val triggerResourceManagerReconnectInterval = new FiniteDuration(
    flinkConfiguration.getLong(JobManagerOptions.RESOURCE_MANAGER_RECONNECT_INTERVAL),
    TimeUnit.MILLISECONDS)

  /**
   * Run when the job manager is started. Simply logs an informational message.
   * The method also starts the leader election service.
   */
  override def preStart(): Unit = {
    log.info(s"Starting JobManager at $getAddress.")

    try {
      leaderElectionService.start(this)
    } catch {
      case e: Exception =>
        log.error("Could not start the JobManager because the leader election service did not " +
          "start.", e)
        throw new RuntimeException("Could not start the leader election service.", e)
    }

    try {
      submittedJobGraphs.start(this)
    } catch {
      case e: Exception =>
        log.error("Could not start the submitted job graphs service.", e)
        throw new RuntimeException("Could not start the submitted job graphs service.", e)
    }

    try {
      checkpointRecoveryFactory.start()
    } catch {
      case e: Exception =>
        log.error("Could not start the checkpoint recovery service.", e)
        throw new RuntimeException("Could not start the checkpoint recovery service.", e)
    }

    jobManagerMetricGroup match {
      case Some(group) =>
        instantiateMetrics(group)
      case None =>
        log.warn("Could not instantiate JobManager metric group.")
    }
  }

  override def postStop(): Unit = {
    log.info(s"Stopping JobManager $getAddress.")

    val newFuturesToComplete = cancelAndClearEverything(
      new Exception("The JobManager is shutting down."))

    implicit val executionContext = context.dispatcher

    val futureToComplete = Future.sequence(
      futuresToComplete.getOrElse(Seq()) ++ newFuturesToComplete)

    Await.ready(futureToComplete, timeout)

    // disconnect the registered task managers
    instanceManager.getAllRegisteredInstances.asScala.foreach {
      instance => instance.getTaskManagerGateway().disconnectFromJobManager(
        instance.getId,
        new Exception("JobManager is shuttind down."))
    }

    try {
      // revoke leadership and stop leader election service
      leaderElectionService.stop()
    } catch {
      case e: Exception => log.error("Could not properly shutdown the leader election service.")
    }

    try {
      submittedJobGraphs.stop()
    } catch {
      case e: Exception => log.error("Could not properly stop the submitted job graphs service.")
    }

    try {
      checkpointRecoveryFactory.stop()
    } catch {
      case e: Exception => log.error("Could not properly stop the checkpoint recovery service.")
    }

    if (archive != ActorRef.noSender) {
      archive ! decorateMessage(PoisonPill)
    }

    instanceManager.shutdown()
    scheduler.shutdown()

    try {
      libraryCacheManager.shutdown()
    } catch {
      case e: IOException => log.error("Could not properly shutdown the library cache manager.", e)
    }

    // failsafe shutdown of the metrics registry
    try {
      metricsRegistry.foreach(_.shutdown())
    } catch {
      case t: Exception => log.error("MetricRegistry did not shutdown properly.", t)
    }

    log.debug(s"Job manager ${self.path} is completely stopped.")
  }

  /**
   * Central work method of the JobManager actor. Receives messages and reacts to them.
   *
   * @return
   */
  override def handleMessage: Receive = {

    case GrantLeadership(newLeaderSessionID) =>
      log.info(s"JobManager $getAddress was granted leadership with leader session ID " +
        s"$newLeaderSessionID.")

      leaderSessionID = newLeaderSessionID

      // confirming the leader session ID might be blocking, thus do it in a future
      future {
        leaderElectionService.confirmLeaderSessionID(newLeaderSessionID.orNull)

        // TODO (critical next step) This needs to be more flexible and robust (e.g. wait for task
        // managers etc.)
        if (haMode != HighAvailabilityMode.NONE) {
          log.info(s"Delaying recovery of all jobs by $jobRecoveryTimeout.")

          context.system.scheduler.scheduleOnce(
            jobRecoveryTimeout,
            self,
            decorateMessage(RecoverAllJobs))(
            context.dispatcher)
        }
      }(context.dispatcher)

    case RevokeLeadership =>
      log.info(s"JobManager ${self.path.toSerializationFormat} was revoked leadership.")

      val newFuturesToComplete = cancelAndClearEverything(
        new Exception("JobManager is no longer the leader."))

      futuresToComplete = Some(futuresToComplete.getOrElse(Seq()) ++ newFuturesToComplete)

      // disconnect the registered task managers
      instanceManager.getAllRegisteredInstances.asScala.foreach {
        instance => instance.getTaskManagerGateway().disconnectFromJobManager(
          instance.getId(),
          new Exception("JobManager is no longer the leader"))
      }

      instanceManager.unregisterAllTaskManagers()
      taskManagerMap.clear()

      leaderSessionID = None

    case msg: RegisterResourceManager =>
      log.debug(s"Resource manager registration: $msg")

      // ditch current resource manager (if any)
      currentResourceManager = Option(msg.resourceManager())
      currentResourceManagerConnectionId += 1

      val taskManagerResources = instanceManager.getAllRegisteredInstances.asScala.map(
        instance => instance.getTaskManagerID).toList.asJava

      // confirm registration and send known task managers with their resource ids
      sender ! decorateMessage(new RegisterResourceManagerSuccessful(self, taskManagerResources))

    case msg: ReconnectResourceManager =>
      log.debug(s"Resource manager reconnect: $msg")

      /**
        * In most cases, the ResourceManager handles the reconnect itself (due to leader change)
        * but in case it doesn't we're sending a TriggerRegistrationAtJobManager message until we
        * receive a registration of this or another ResourceManager.
        */
      def reconnectRepeatedly(): Unit = {
        msg.resourceManager() ! decorateMessage(new TriggerRegistrationAtJobManager(self))
        // try again after some delay
        context.system.scheduler.scheduleOnce(triggerResourceManagerReconnectInterval) {
          self ! decorateMessage(msg)
        }(context.dispatcher)
      }

      currentResourceManager match {
        case Some(rm) if rm.equals(msg.resourceManager()) &&
          currentResourceManagerConnectionId == msg.getConnectionId =>
          // we should ditch the current resource manager
          log.debug(s"Disconnecting resource manager $rm and forcing a reconnect.")
          currentResourceManager = None
          reconnectRepeatedly()
        case None =>
          log.warn(s"No resource manager ${msg.resourceManager()} connected. " +
            s"Telling old ResourceManager to register again.")
          reconnectRepeatedly()
        case _ =>
        // we have established a new connection to a ResourceManager in the meantime, stop sending
        // TriggerRegistrationAtJobManager messages to the old ResourceManager
      }

    case msg @ RegisterTaskManager(
          resourceId,
          connectionInfo,
          hardwareInformation,
          numberOfSlots) =>
      // we are being informed by the ResourceManager that a new task manager is available
      log.debug(s"RegisterTaskManager: $msg")

      val taskManager = sender()

      currentResourceManager match {
        case Some(rm) =>
          val future = (rm ? decorateMessage(new NotifyResourceStarted(msg.resourceId)))(timeout)
          future.onFailure {
            case t: Throwable =>
              t match {
                case _: TimeoutException =>
                  log.info("Attempt to register resource at ResourceManager timed out. Retrying")
                case _ =>
                  log.warn("Failure while asking ResourceManager for RegisterResource. Retrying", t)
              }
              self ! decorateMessage(
                new ReconnectResourceManager(
                  rm,
                  currentResourceManagerConnectionId))
          }(context.dispatcher)

        case None =>
          log.info("Task Manager Registration but not connected to ResourceManager")
      }

      // ResourceManager is told about the resource, now let's try to register TaskManager
      if (instanceManager.isRegistered(resourceId)) {
        val instanceID = instanceManager.getRegisteredInstance(resourceId).getId

        taskManager ! decorateMessage(
          AlreadyRegistered(
            instanceID,
            libraryCacheManager.getBlobServerPort))
      } else {
        try {
          val actorGateway = new AkkaActorGateway(taskManager, leaderSessionID.orNull)
          val taskManagerGateway = new ActorTaskManagerGateway(actorGateway)

          val instanceID = instanceManager.registerTaskManager(
            taskManagerGateway,
            connectionInfo,
            hardwareInformation,
            numberOfSlots)

          taskManagerMap.put(taskManager, instanceID)

          taskManager ! decorateMessage(
            AcknowledgeRegistration(instanceID, libraryCacheManager.getBlobServerPort))

          // to be notified when the taskManager is no longer reachable
          context.watch(taskManager)
        } catch {
          // registerTaskManager throws an IllegalStateException if it is already shut down
          // let the actor crash and restart itself in this case
          case e: Exception =>
            log.error("Failed to register TaskManager at instance manager", e)

            taskManager ! decorateMessage(
              RefuseRegistration(e))
        }
      }

    case msg: ResourceRemoved =>
      // we're being informed by the resource manager that a resource has become unavailable
      // note: a Terminated event may already have removed the instance.
      val resourceID = msg.resourceId()
      log.debug(s"Resource has been removed: $resourceID")

      Option(instanceManager.getRegisteredInstance(resourceID)) match {
        case Some(instance) =>
          // trigger removal of task manager
          val taskManagerGateway = instance.getTaskManagerGateway

          taskManagerGateway match {
            case x: ActorTaskManagerGateway =>
              handleTaskManagerTerminated(x.getActorGateway().actor(), instance.getId)
            case _ => log.debug(s"Cannot remove reosurce ${resourceID}, because there is " +
                                  s"no ActorRef registered.")
          }

        case None =>
          log.debug(s"Resource $resourceID has not been registered at job manager.")
      }

    case RequestNumberRegisteredTaskManager =>
      sender ! decorateMessage(instanceManager.getNumberOfRegisteredTaskManagers)

    case RequestTotalNumberOfSlots =>
      sender ! decorateMessage(instanceManager.getTotalNumberOfSlots)

    case SubmitJob(jobGraph, listeningBehaviour) =>
      val client = sender()

      val jobInfo = new JobInfo(client, listeningBehaviour, System.currentTimeMillis(),
        jobGraph.getSessionTimeout)

      submitJob(jobGraph, jobInfo)

    case RegisterJobClient(jobID, listeningBehaviour) =>
      val client = sender()
      currentJobs.get(jobID) match {
        case Some((executionGraph, jobInfo)) =>
          log.info(s"Registering client for job $jobID")
          jobInfo.clients += ((client, listeningBehaviour))
          val listener = new StatusListenerMessenger(client, leaderSessionID.orNull)
          executionGraph.registerJobStatusListener(listener)
          if (listeningBehaviour == ListeningBehaviour.EXECUTION_RESULT_AND_STATE_CHANGES) {
            executionGraph.registerExecutionListener(listener)
          }
          client ! decorateMessage(RegisterJobClientSuccess(jobID))
        case None =>
          client ! decorateMessage(JobNotFound(jobID))
      }

    case RecoverSubmittedJob(submittedJobGraph) =>
      if (!currentJobs.contains(submittedJobGraph.getJobId)) {
        log.info(s"Submitting recovered job ${submittedJobGraph.getJobId}.")

        submitJob(
          submittedJobGraph.getJobGraph(),
          submittedJobGraph.getJobInfo(),
          isRecovery = true)
      }
      else {
        log.info(s"Ignoring job recovery for ${submittedJobGraph.getJobId}, " +
          s"because it is already submitted.")
      }

    case RecoverJob(jobId) =>
      future {
        try {
          // The ActorRef, which is part of the submitted job graph can only be
          // de-serialized in the scope of an actor system.
          akka.serialization.JavaSerializer.currentSystem.withValue(
            context.system.asInstanceOf[ExtendedActorSystem]) {

            log.info(s"Attempting to recover job $jobId.")
            val submittedJobGraphOption = submittedJobGraphs.recoverJobGraph(jobId)

            Option(submittedJobGraphOption) match {
              case Some(submittedJobGraph) =>
                if (!leaderElectionService.hasLeadership()) {
                  // we've lost leadership. mission: abort.
                  log.warn(s"Lost leadership during recovery. Aborting recovery of $jobId.")
                } else {
                  self ! decorateMessage(RecoverSubmittedJob(submittedJobGraph))
                }

              case None => log.info(s"Attempted to recover job $jobId, but no job graph found.")
            }
          }
        } catch {
          case t: Throwable => log.warn(s"Failed to recover job $jobId.", t)
        }
      }(context.dispatcher)

    case RecoverAllJobs =>
      future {
        log.info("Attempting to recover all jobs.")

        try {
          val jobIdsToRecover = submittedJobGraphs.getJobIds().asScala

          if (jobIdsToRecover.isEmpty) {
            log.info("There are no jobs to recover.")
          } else {
            log.info(s"There are ${jobIdsToRecover.size} jobs to recover. Starting the job " +
                       s"recovery.")

            jobIdsToRecover foreach {
              jobId => self ! decorateMessage(RecoverJob(jobId))
            }
          }
        } catch {
          case e: Exception =>
            log.warn("Failed to recover job ids from submitted job graph store. Aborting " +
                       "recovery.", e)
        }
      }(context.dispatcher)

    case CancelJob(jobID) =>
      log.info(s"Trying to cancel job with ID $jobID.")

      currentJobs.get(jobID) match {
        case Some((executionGraph, _)) =>
          // execute the cancellation asynchronously
          Future {
            executionGraph.cancel()
          }(context.dispatcher)

          sender ! decorateMessage(CancellationSuccess(jobID))
        case None =>
          log.info(s"No job found with ID $jobID.")
          sender ! decorateMessage(
            CancellationFailure(
              jobID,
              new IllegalArgumentException(s"No job found with ID $jobID."))
          )
      }

    case CancelJobWithSavepoint(jobId, savepointDirectory) =>
      try {
        val targetDirectory = if (savepointDirectory != null) {
          savepointDirectory
        } else {
          defaultSavepointDir
        }

        if (targetDirectory == null) {
          log.info(s"Trying to cancel job $jobId with savepoint, but no " +
            "savepoint directory configured.")

          sender ! decorateMessage(CancellationFailure(jobId, new IllegalStateException(
            "No savepoint directory configured. You can either specify a directory " +
              "while cancelling via -s :targetDirectory or configure a cluster-wide " +
              "default via key '" + ConfigConstants.SAVEPOINT_DIRECTORY_KEY + "'.")))
        } else {
          log.info(s"Trying to cancel job $jobId with savepoint to $targetDirectory")

          currentJobs.get(jobId) match {
            case Some((executionGraph, _)) =>
              // We don't want any checkpoint between the savepoint and cancellation
              val coord = executionGraph.getCheckpointCoordinator
              coord.stopCheckpointScheduler()

              // Trigger the savepoint
              val future = coord.triggerSavepoint(System.currentTimeMillis(), targetDirectory)

              val senderRef = sender()
              future.handleAsync[Void](
                new BiFunction[CompletedCheckpoint, Throwable, Void] {
                  override def apply(success: CompletedCheckpoint, cause: Throwable): Void = {
                    if (success != null) {
                      val path = success.getExternalPointer()
                      log.info(s"Savepoint stored in $path. Now cancelling $jobId.")
                      executionGraph.cancel()
                      senderRef ! decorateMessage(CancellationSuccess(jobId, path))
                    } else {
                      val msg = CancellationFailure(
                        jobId,
                        new Exception("Failed to trigger savepoint.", cause))
                      senderRef ! decorateMessage(msg)
                    }
                    null
                  }
                },
                context.dispatcher)

            case None =>
              log.info(s"No job found with ID $jobId.")
              sender ! decorateMessage(
                CancellationFailure(
                  jobId,
                  new IllegalArgumentException(s"No job found with ID $jobId."))
              )
          }
        }
      } catch {
        case t: Throwable =>
          log.info(s"Failure during cancellation of job $jobId with savepoint.", t)
          sender ! decorateMessage(
            CancellationFailure(
              jobId,
              new Exception(s"Failed to cancel job $jobId with savepoint.", t)))
      }

    case StopJob(jobID) =>
      log.info(s"Trying to stop job with ID $jobID.")

      currentJobs.get(jobID) match {
        case Some((executionGraph, _)) =>
          try {
            if (!executionGraph.isStoppable()) {
              sender ! decorateMessage(
                StoppingFailure(
                  jobID,
                  new IllegalStateException(s"Job with ID $jobID is not stoppable."))
              )
            } else if (executionGraph.getState() != JobStatus.RUNNING) {
              sender ! decorateMessage(
                StoppingFailure(
                  jobID,
                  new IllegalStateException(s"Job with ID $jobID is in state " +
                    executionGraph.getState().name() + " but stopping is only allowed in state " +
                    "RUNNING."))
              )
            } else {
              executionGraph.stop()
              sender ! decorateMessage(StoppingSuccess(jobID))
            }
          } catch {
            case t: Throwable =>  sender ! decorateMessage(StoppingFailure(jobID, t))
          }
        case None =>
          log.info(s"No job found with ID $jobID.")
          sender ! decorateMessage(
            StoppingFailure(
              jobID,
              new IllegalArgumentException(s"No job found with ID $jobID."))
          )
      }

    case UpdateTaskExecutionState(taskExecutionState) =>
      if (taskExecutionState == null) {
        sender ! decorateMessage(false)
      } else {
        currentJobs.get(taskExecutionState.getJobID) match {
          case Some((executionGraph, _)) =>
            val originalSender = sender()

            Future {
              val result = executionGraph.updateState(taskExecutionState)
              originalSender ! decorateMessage(result)
            }(context.dispatcher)

          case None => log.error("Cannot find execution graph for ID " +
            s"${taskExecutionState.getJobID} to change state to " +
            s"${taskExecutionState.getExecutionState}.")
            sender ! decorateMessage(false)
        }
      }

    case RequestNextInputSplit(jobID, vertexID, executionAttempt) =>
      val serializedInputSplit = currentJobs.get(jobID) match {
        case Some((executionGraph,_)) =>
          val execution = executionGraph.getRegisteredExecutions.get(executionAttempt)

          if (execution == null) {
            log.error(s"Can not find Execution for attempt $executionAttempt.")
            null
          } else {
            val slot = execution.getAssignedResource
            val taskId = execution.getVertex.getParallelSubtaskIndex

            val host = if (slot != null) {
              slot.getTaskManagerLocation().getHostname()
            } else {
              null
            }

            executionGraph.getJobVertex(vertexID) match {
              case vertex: ExecutionJobVertex => vertex.getSplitAssigner match {
                case splitAssigner: InputSplitAssigner =>
                  val nextInputSplit = splitAssigner.getNextInputSplit(host, taskId)

                  log.debug(s"Send next input split $nextInputSplit.")

                  try {
                    InstantiationUtil.serializeObject(nextInputSplit)
                  } catch {
                    case ex: Exception =>
                      log.error(s"Could not serialize the next input split of " +
                        s"class ${nextInputSplit.getClass}.", ex)
                      vertex.fail(new RuntimeException("Could not serialize the next input split " +
                        "of class " + nextInputSplit.getClass + ".", ex))
                      null
                  }

                case _ =>
                  log.error(s"No InputSplitAssigner for vertex ID $vertexID.")
                  null
              }
              case _ =>
                log.error(s"Cannot find execution vertex for vertex ID $vertexID.")
                null
          }
        }
        case None =>
          log.error(s"Cannot find execution graph for job ID $jobID.")
          null
      }

      sender ! decorateMessage(NextInputSplit(serializedInputSplit))

    case checkpointMessage : AbstractCheckpointMessage =>
      handleCheckpointMessage(checkpointMessage)

    case kvStateMsg : KvStateMessage =>
      handleKvStateMessage(kvStateMsg)

    case TriggerSavepoint(jobId, savepointDirectory) =>
      currentJobs.get(jobId) match {
        case Some((graph, _)) =>
          val checkpointCoordinator = graph.getCheckpointCoordinator()

          if (checkpointCoordinator != null) {
            // Immutable copy for the future
            val senderRef = sender()
            try {
              val targetDirectory : String = savepointDirectory.getOrElse(
                flinkConfiguration.getString(ConfigConstants.SAVEPOINT_DIRECTORY_KEY, null))

              if (targetDirectory == null) {
                throw new IllegalStateException("No savepoint directory configured. " +
                  "You can either specify a directory when triggering this savepoint or " +
                  "configure a cluster-wide default via key '" +
                  ConfigConstants.SAVEPOINT_DIRECTORY_KEY + "'.")
              }

              // Do this async, because checkpoint coordinator operations can
              // contain blocking calls to the state backend or ZooKeeper.
              val savepointFuture = checkpointCoordinator.triggerSavepoint(
                System.currentTimeMillis(),
                targetDirectory)

              savepointFuture.handleAsync[Void](
                new BiFunction[CompletedCheckpoint, Throwable, Void] {
                  override def apply(success: CompletedCheckpoint, cause: Throwable): Void = {
                    if (success != null) {
                      if (success.getExternalPointer != null) {
                        senderRef ! TriggerSavepointSuccess(
                          jobId,
                          success.getCheckpointID,
                          success.getExternalPointer,
                          success.getTimestamp
                        )
                      } else {
                        senderRef ! TriggerSavepointFailure(
                          jobId, new Exception("Savepoint has not been persisted."))
                      }
                    } else {
                      senderRef ! TriggerSavepointFailure(
                        jobId, new Exception("Failed to complete savepoint", cause))
                    }
                    null
                  }
                },
                context.dispatcher)
            } catch {
              case e: Exception =>
                senderRef ! TriggerSavepointFailure(jobId, new Exception(
                  "Failed to trigger savepoint", e))
            }
          } else {
            sender() ! TriggerSavepointFailure(jobId, new IllegalStateException(
              "Checkpointing disabled. You can enable it via the execution environment of " +
                "your job."))
          }

        case None =>
          sender() ! TriggerSavepointFailure(jobId, new IllegalArgumentException("Unknown job."))
      }

    case DisposeSavepoint(savepointPath) =>
      val senderRef = sender()
      future {
        try {
          log.info(s"Disposing savepoint at '$savepointPath'.")
          //TODO user code class loader ?
          val savepoint = SavepointStore.loadSavepoint(
            savepointPath,
            Thread.currentThread().getContextClassLoader)

          log.debug(s"$savepoint")

          // Dispose checkpoint state
          savepoint.dispose()

          // Remove the header file
          SavepointStore.removeSavepointFile(savepointPath)

          senderRef ! DisposeSavepointSuccess
        } catch {
          case t: Throwable =>
            log.error(s"Failed to dispose savepoint at '$savepointPath'.", t)

            senderRef ! DisposeSavepointFailure(t)
        }
      }(context.dispatcher)

    case msg @ JobStatusChanged(jobID, newJobStatus, timeStamp, error) =>
      currentJobs.get(jobID) match {
        case Some((executionGraph, jobInfo)) => executionGraph.getJobName

          if (newJobStatus.isGloballyTerminalState()) {
            jobInfo.end = timeStamp

            future{
              // TODO If removing the JobGraph from the SubmittedJobGraphsStore fails, the job will
              // linger around and potentially be recovered at a later time. There is nothing we
              // can do about that, but it should be communicated with the Client.
              if (jobInfo.sessionAlive) {
                jobInfo.setLastActive()
                val lastActivity = jobInfo.lastActive
                context.system.scheduler.scheduleOnce(jobInfo.sessionTimeout seconds) {
                  // remove only if no activity occurred in the meantime
                  if (lastActivity == jobInfo.lastActive) {
                    self ! decorateMessage(RemoveJob(jobID, removeJobFromStateBackend = true))
                  }
                }(context.dispatcher)
              } else {
                self ! decorateMessage(RemoveJob(jobID, removeJobFromStateBackend = true))
              }

              // is the client waiting for the job result?
              newJobStatus match {
                case JobStatus.FINISHED =>
                try {
                  val accumulatorResults = executionGraph.getAccumulatorsSerialized()
                  val result = new SerializedJobExecutionResult(
                    jobID,
                    jobInfo.duration,
                    accumulatorResults)

                  jobInfo.notifyNonDetachedClients(
                    decorateMessage(JobResultSuccess(result)))
                } catch {
                  case e: Exception =>
                    log.error(s"Cannot fetch final accumulators for job $jobID", e)
                    val exception = new JobExecutionException(jobID,
                      "Failed to retrieve accumulator results.", e)

                    jobInfo.notifyNonDetachedClients(
                      decorateMessage(JobResultFailure(
                        new SerializedThrowable(exception))))
                }

                case JobStatus.CANCELED =>
                  // the error may be packed as a serialized throwable
                  val unpackedError = SerializedThrowable.get(
                    error, executionGraph.getUserClassLoader())

                  jobInfo.notifyNonDetachedClients(
                    decorateMessage(JobResultFailure(
                      new SerializedThrowable(
                        new JobCancellationException(jobID, "Job was cancelled.", unpackedError)))))

                case JobStatus.FAILED =>
                  val unpackedError = SerializedThrowable.get(
                    error, executionGraph.getUserClassLoader())

                  jobInfo.notifyNonDetachedClients(
                    decorateMessage(JobResultFailure(
                      new SerializedThrowable(
                        new JobExecutionException(jobID, "Job execution failed.", unpackedError)))))

                case x =>
                  val exception = new JobExecutionException(jobID, s"$x is not a terminal state.")
                  jobInfo.notifyNonDetachedClients(
                    decorateMessage(JobResultFailure(
                      new SerializedThrowable(exception))))
                  throw exception
              }
            }(context.dispatcher)
          }
        case None => log.debug(s"Received $msg for nonexistent job $jobID.")
      }

    case ScheduleOrUpdateConsumers(jobId, partitionId) =>
      currentJobs.get(jobId) match {
        case Some((executionGraph, _)) =>
          try {
            executionGraph.scheduleOrUpdateConsumers(partitionId)
            sender ! decorateMessage(Acknowledge.get())
          } catch {
            case e: Exception =>
              sender ! decorateMessage(
                Failure(new Exception("Could not schedule or update consumers.", e))
              )
          }
        case None =>
          log.error(s"Cannot find execution graph for job ID $jobId to schedule or update " +
            s"consumers.")
          sender ! decorateMessage(
            Failure(
              new IllegalStateException("Cannot find execution graph for job ID " +
                s"$jobId to schedule or update consumers.")
            )
          )
      }

    case RequestPartitionProducerState(jobId, intermediateDataSetId, resultPartitionId) =>
      currentJobs.get(jobId) match {
        case Some((executionGraph, _)) =>
          try {
            // Find the execution attempt producing the intermediate result partition.
            val execution = executionGraph
              .getRegisteredExecutions
              .get(resultPartitionId.getProducerId)

            if (execution != null) {
              // Common case for pipelined exchanges => producing execution is
              // still active.
              sender ! decorateMessage(execution.getState)
            } else {
              // The producing execution might have terminated and been
              // unregistered. We now look for the producing execution via the
              // intermediate result itself.
              val intermediateResult = executionGraph
                .getAllIntermediateResults.get(intermediateDataSetId)

              if (intermediateResult != null) {
                // Try to find the producing execution
                val producerExecution = intermediateResult
                  .getPartitionById(resultPartitionId.getPartitionId)
                  .getProducer
                  .getCurrentExecutionAttempt

                if (producerExecution.getAttemptId() == resultPartitionId.getProducerId()) {
                  sender ! decorateMessage(producerExecution.getState)
                } else {
                  val cause = new PartitionProducerDisposedException(resultPartitionId)
                  sender ! decorateMessage(Status.Failure(cause))
                }
              } else {
                val cause = new IllegalArgumentException(
                  s"Intermediate data set with ID $intermediateDataSetId not found.")
                sender ! decorateMessage(Status.Failure(cause))
              }
            }
          } catch {
            case e: Exception =>
              sender ! decorateMessage(
                Status.Failure(new RuntimeException("Failed to look up execution state of " +
                  s"producer with ID ${resultPartitionId.getProducerId}.", e)))
          }

        case None =>
          sender ! decorateMessage(
            Status.Failure(new IllegalArgumentException(s"Job with ID $jobId not found.")))

      }

    case RequestJobStatus(jobID) =>
      currentJobs.get(jobID) match {
        case Some((executionGraph,_)) =>
          sender ! decorateMessage(CurrentJobStatus(jobID, executionGraph.getState))
        case None =>
          // check the archive
          archive forward decorateMessage(RequestJobStatus(jobID))
      }

    case RequestRunningJobs =>
      val executionGraphs = currentJobs map {
        case (_, (eg, jobInfo)) => eg
      }

      sender ! decorateMessage(RunningJobs(executionGraphs))

    case RequestRunningJobsStatus =>
      try {
        val jobs = currentJobs map {
          case (_, (eg, _)) =>
            new JobStatusMessage(
              eg.getJobID,
              eg.getJobName,
              eg.getState,
              eg.getStatusTimestamp(JobStatus.CREATED)
            )
        }

        sender ! decorateMessage(RunningJobsStatus(jobs))
      }
      catch {
        case t: Throwable => log.error("Exception while responding to RequestRunningJobsStatus", t)
      }

    case RequestJob(jobID) =>
      currentJobs.get(jobID) match {
        case Some((eg, _)) => sender ! decorateMessage(JobFound(jobID, eg))
        case None =>
          // check the archive
          archive forward decorateMessage(RequestJob(jobID))
      }

    case RequestClassloadingProps(jobID) =>
      currentJobs.get(jobID) match {
        case Some((graph, jobInfo)) =>
          sender() ! decorateMessage(
            ClassloadingProps(
              libraryCacheManager.getBlobServerPort,
              graph.getRequiredJarFiles,
              graph.getRequiredClasspaths))
        case None =>
          sender() ! decorateMessage(JobNotFound(jobID))
      }

    case RequestBlobManagerPort =>
      sender ! decorateMessage(libraryCacheManager.getBlobServerPort)

    case RequestArchive =>
      sender ! decorateMessage(ResponseArchive(archive))

    case RequestRegisteredTaskManagers =>
      sender ! decorateMessage(
        RegisteredTaskManagers(
          instanceManager.getAllRegisteredInstances.asScala
        )
      )

    case RequestTaskManagerInstance(instanceID) =>
      sender ! decorateMessage(
        TaskManagerInstance(Option(instanceManager.getRegisteredInstanceById(instanceID)))
      )

    case Heartbeat(instanceID, accumulators) =>
      log.trace(s"Received heartbeat message from $instanceID.")

      updateAccumulators(accumulators)

      instanceManager.reportHeartBeat(instanceID)

    case message: AccumulatorMessage => handleAccumulatorMessage(message)

    case message: InfoMessage => handleInfoRequestMessage(message, sender())

    case RequestStackTrace(instanceID) =>
      val taskManagerGateway = instanceManager
        .getRegisteredInstanceById(instanceID)
        .getTaskManagerGateway

      val stackTraceFuture = taskManagerGateway
        .requestStackTrace(Time.milliseconds(timeout.toMillis))

      val originalSender = new AkkaActorGateway(sender(), leaderSessionID.orNull)

      val sendingFuture = stackTraceFuture.thenAccept(new AcceptFunction[StackTrace] {
        override def accept(value: StackTrace): Unit = {
          originalSender.tell(value)
        }
      })

      sendingFuture.exceptionally(new ApplyFunction[Throwable, Void] {
        override def apply(value: Throwable): Void = {
          log.info("Could not send requested stack trace.", value)

          return null
        }
      })

    case Terminated(taskManagerActorRef) =>
      taskManagerMap.get(taskManagerActorRef) match {
        case Some(instanceId) => handleTaskManagerTerminated(taskManagerActorRef, instanceId)
        case None =>  log.debug("Received terminated message for task manager " +
                                  s"$taskManagerActorRef which is not " +
                                  "connected to this job manager.")
      }

    case RequestJobManagerStatus =>
      sender() ! decorateMessage(JobManagerStatusAlive)

    case RemoveJob(jobID, clearPersistedJob) =>
      currentJobs.get(jobID) match {
        case Some((graph, info)) =>
            removeJob(graph.getJobID, clearPersistedJob) match {
              case Some(futureToComplete) =>
                futuresToComplete = Some(futuresToComplete.getOrElse(Seq()) :+ futureToComplete)
              case None =>
            }
        case None => log.debug(s"Tried to remove nonexistent job $jobID.")
      }

    case RemoveCachedJob(jobID) =>
      currentJobs.get(jobID) match {
        case Some((graph, info)) =>
          if (graph.getState.isGloballyTerminalState) {
            removeJob(graph.getJobID, removeJobFromStateBackend = true) match {
              case Some(futureToComplete) =>
                futuresToComplete = Some(futuresToComplete.getOrElse(Seq()) :+ futureToComplete)
              case None =>
            }
          } else {
            // triggers removal upon completion of job
            info.sessionAlive = false
          }
        case None =>
      }

    case Disconnect(instanceId, cause) =>
      val taskManager = sender()

      if (instanceManager.isRegistered(instanceId)) {
        log.info(s"Task manager ${taskManager.path} wants to disconnect, " +
                   s"because ${cause.getMessage}.")

        instanceManager.unregisterTaskManager(instanceId, false)
        taskManagerMap.remove(taskManager)
        context.unwatch(taskManager)
      }

    case msg: StopCluster =>

      log.info(s"Stopping JobManager with final application status ${msg.finalStatus()} " +
        s"and diagnostics: ${msg.message()}")

      // stop all task managers
      instanceManager.getAllRegisteredInstances.asScala foreach {
        instance =>
          instance.getTaskManagerGateway.stopCluster(msg.finalStatus(), msg.message())
      }

      // send resource manager the ok
      currentResourceManager match {
        case Some(rm) =>
          try {
            // inform rm and wait for it to confirm
            val waitTime = FiniteDuration(5, TimeUnit.SECONDS)
            val answer = (rm ? decorateMessage(msg))(waitTime)
            Await.ready(answer, waitTime)
          } catch {
            case e: TimeoutException =>
            case e: InterruptedException =>
          }
        case None =>
          // ResourceManager not available
          // we choose not to wait here because it might block the shutdown forever
      }

      sender() ! decorateMessage(StopClusterSuccessful.getInstance())
      shutdown()

    case RequestLeaderSessionID =>
      sender() ! ResponseLeaderSessionID(leaderSessionID.orNull)

    case RequestWebMonitorPort =>
      sender() ! ResponseWebMonitorPort(webMonitorPort)
  }

  /**
    * Handler to be executed when a task manager terminates.
    * (Akka Deathwatch or notification from ResourceManager)
    *
    * @param taskManager The ActorRef of the task manager
    * @param instanceId identifying the dead task manager
    */
  private def handleTaskManagerTerminated(taskManager: ActorRef, instanceId: InstanceID): Unit = {
    if (instanceManager.isRegistered(instanceId)) {
      log.info(s"Task manager ${taskManager.path} terminated.")

      instanceManager.unregisterTaskManager(instanceId, true)
      taskManagerMap.remove(taskManager)
      context.unwatch(taskManager)
    }
  }

  /**
   * Submits a job to the job manager. The job is registered at the libraryCacheManager which
   * creates the job's class loader. The job graph is appended to the corresponding execution
   * graph and the execution vertices are queued for scheduling.
   *
   * @param jobGraph representing the Flink job
   * @param jobInfo the job info
   * @param isRecovery Flag indicating whether this is a recovery or initial submission
   */
  private def submitJob(jobGraph: JobGraph, jobInfo: JobInfo, isRecovery: Boolean = false): Unit = {
    if (jobGraph == null) {
      jobInfo.notifyClients(
        decorateMessage(JobResultFailure(
          new SerializedThrowable(
            new JobSubmissionException(null, "JobGraph must not be null.")))))
    }
    else {
      val jobId = jobGraph.getJobID
      val jobName = jobGraph.getName
      var executionGraph: ExecutionGraph = null

      log.info(s"Submitting job $jobId ($jobName)" + (if (isRecovery) " (Recovery)" else "") + ".")

      try {
        // Important: We need to make sure that the library registration is the first action,
        // because this makes sure that the uploaded jar files are removed in case of
        // unsuccessful
        try {
          libraryCacheManager.registerJob(jobGraph.getJobID, jobGraph.getUserJarBlobKeys,
            jobGraph.getClasspaths)
        }
        catch {
          case t: Throwable =>
            throw new JobSubmissionException(jobId,
              "Cannot set up the user code libraries: " + t.getMessage, t)
        }

        val userCodeLoader = libraryCacheManager.getClassLoader(jobGraph.getJobID)
        if (userCodeLoader == null) {
          throw new JobSubmissionException(jobId,
            "The user code class loader could not be initialized.")
        }

        if (jobGraph.getNumberOfVertices == 0) {
          throw new JobSubmissionException(jobId, "The given job is empty")
        }

        val restartStrategy =
          Option(jobGraph.getSerializedExecutionConfig()
            .deserializeValue(userCodeLoader)
            .getRestartStrategy())
            .map(RestartStrategyFactory.createRestartStrategy)
            .filter(p => p != null) match {
            case Some(strategy) => strategy
            case None => restartStrategyFactory.createRestartStrategy()
          }

        log.info(s"Using restart strategy $restartStrategy for $jobId.")

        val jobMetrics = jobManagerMetricGroup match {
          case Some(group) =>
            group.addJob(jobGraph) match {
              case (jobGroup:Any) => jobGroup
              case null => new UnregisteredMetricsGroup()
            }
          case None =>
            new UnregisteredMetricsGroup()
        }

        val numSlots = scheduler.getTotalNumberOfSlots()

        // see if there already exists an ExecutionGraph for the corresponding job ID
        val registerNewGraph = currentJobs.get(jobGraph.getJobID) match {
          case Some((graph, currentJobInfo)) =>
            executionGraph = graph
            currentJobInfo.setLastActive()
            false
          case None =>
            true
        }

        executionGraph = ExecutionGraphBuilder.buildGraph(
          executionGraph,
          jobGraph,
          flinkConfiguration,
          futureExecutor,
          ioExecutor,
          scheduler,
          userCodeLoader,
          checkpointRecoveryFactory,
          Time.of(timeout.length, timeout.unit),
          restartStrategy,
          jobMetrics,
          numSlots,
          log.logger)
        
        if (registerNewGraph) {
          currentJobs.put(jobGraph.getJobID, (executionGraph, jobInfo))
        }

        // get notified about job status changes
        executionGraph.registerJobStatusListener(
          new StatusListenerMessenger(self, leaderSessionID.orNull))

        jobInfo.clients foreach {
          // the sender wants to be notified about state changes
          case (client, ListeningBehaviour.EXECUTION_RESULT_AND_STATE_CHANGES) =>
            val listener  = new StatusListenerMessenger(client, leaderSessionID.orNull)
            executionGraph.registerExecutionListener(listener)
            executionGraph.registerJobStatusListener(listener)
          case _ => // do nothing
        }

      } catch {
        case t: Throwable =>
          log.error(s"Failed to submit job $jobId ($jobName)", t)

          libraryCacheManager.unregisterJob(jobId)
          currentJobs.remove(jobId)

          if (executionGraph != null) {
            executionGraph.failGlobal(t)
          }

          val rt: Throwable = if (t.isInstanceOf[JobExecutionException]) {
            t
          } else {
            new JobExecutionException(jobId, s"Failed to submit job $jobId ($jobName)", t)
          }

          jobInfo.notifyClients(
            decorateMessage(JobResultFailure(new SerializedThrowable(rt))))
          return
      }

      // execute the recovery/writing the jobGraph into the SubmittedJobGraphStore asynchronously
      // because it is a blocking operation
      future {
        try {
          if (isRecovery) {
            // this is a recovery of a master failure (this master takes over)
            executionGraph.restoreLatestCheckpointedState(false, false)
          }
          else {
            // load a savepoint only if this is not starting from a newer checkpoint
            // as part of an master failure recovery
            val savepointSettings = jobGraph.getSavepointRestoreSettings
            if (savepointSettings.restoreSavepoint()) {
              try {
                val savepointPath = savepointSettings.getRestorePath()
                val allowNonRestored = savepointSettings.allowNonRestoredState()

                executionGraph.getCheckpointCoordinator.restoreSavepoint(
                  savepointPath, 
                  allowNonRestored,
                  executionGraph.getAllVertices,
                  executionGraph.getUserClassLoader
                )
              } catch {
                case e: Exception =>
                  jobInfo.notifyClients(
                    decorateMessage(JobResultFailure(new SerializedThrowable(e))))
                  throw new SuppressRestartsException(e)
              }
            }

            try {
              submittedJobGraphs.putJobGraph(new SubmittedJobGraph(jobGraph, jobInfo))
            } catch {
              case t: Throwable =>
                // Don't restart the execution if this fails. Otherwise, the
                // job graph will skip ZooKeeper in case of HA.
                jobInfo.notifyClients(
                  decorateMessage(JobResultFailure(new SerializedThrowable(t))))
                throw new SuppressRestartsException(t)
            }
          }

          jobInfo.notifyClients(
            decorateMessage(JobSubmitSuccess(jobGraph.getJobID)))

          if (leaderElectionService.hasLeadership) {
            // There is a small chance that multiple job managers schedule the same job after if
            // they try to recover at the same time. This will eventually be noticed, but can not be
            // ruled out from the beginning.

            // NOTE: Scheduling the job for execution is a separate action from the job submission.
            // The success of submitting the job must be independent from the success of scheduling
            // the job.
            log.info(s"Scheduling job $jobId ($jobName).")

            executionGraph.scheduleForExecution()
          } else {
            // Remove the job graph. Otherwise it will be lingering around and possibly removed from
            // ZooKeeper by this JM.
            self ! decorateMessage(RemoveJob(jobId, removeJobFromStateBackend = false))

            log.warn(s"Submitted job $jobId, but not leader. The other leader needs to recover " +
              "this. I am not scheduling the job for execution.")
          }
        } catch {
          case t: Throwable => try {
            executionGraph.failGlobal(t)
          } catch {
            case tt: Throwable =>
              log.error("Error while marking ExecutionGraph as failed.", tt)
          }
        }
      }(context.dispatcher)
    }
  }

  /**
   * Dedicated handler for checkpoint messages.
   *
   * @param actorMessage The checkpoint actor message.
   */
  private def handleCheckpointMessage(actorMessage: AbstractCheckpointMessage): Unit = {
    actorMessage match {
      case ackMessage: AcknowledgeCheckpoint =>
        val jid = ackMessage.getJob()
        currentJobs.get(jid) match {
          case Some((graph, _)) =>
            val checkpointCoordinator = graph.getCheckpointCoordinator()

            if (checkpointCoordinator != null) {
              future {
                try {
                  if (!checkpointCoordinator.receiveAcknowledgeMessage(ackMessage)) {
                    log.info("Received message for non-existing checkpoint " +
                      ackMessage.getCheckpointId)
                  }
                }
                catch {
                  case t: Throwable =>
                    log.error(s"Error in CheckpointCoordinator while processing $ackMessage", t)
                }
              }(context.dispatcher)
            }
            else {
              log.error(
                s"Received AcknowledgeCheckpoint message for job $jid with no " +
                  s"CheckpointCoordinator")
            }

          case None => log.error(s"Received AcknowledgeCheckpoint for unavailable job $jid")
        }

      case declineMessage: DeclineCheckpoint =>
        val jid = declineMessage.getJob()
        currentJobs.get(jid) match {
          case Some((graph, _)) =>
            val checkpointCoordinator = graph.getCheckpointCoordinator()

            if (checkpointCoordinator != null) {
              future {
                try {
                 checkpointCoordinator.receiveDeclineMessage(declineMessage)
                }
                catch {
                  case t: Throwable =>
                    log.error(s"Error in CheckpointCoordinator while processing $declineMessage", t)
                }
              }(context.dispatcher)
            }
            else {
              log.error(
                s"Received DeclineCheckpoint message for job $jid with no CheckpointCoordinator")
            }

          case None => log.error(s"Received DeclineCheckpoint for unavailable job $jid")
        }


      // unknown checkpoint message
      case _ => unhandled(actorMessage)
    }
  }

  /**
    * Handle all [KvStateMessage] instances for KvState location lookups and
    * registration.
    *
    * @param actorMsg The KvState actor message.
    */
  private def handleKvStateMessage(actorMsg: KvStateMessage): Unit = {
    actorMsg match {
      // Client KvStateLocation lookup
      case msg: LookupKvStateLocation =>
        currentJobs.get(msg.getJobId) match {
          case Some((graph, _)) =>
            try {
              log.debug(s"Lookup key-value state for job ${msg.getJobId} with registration " +
                         s"name ${msg.getRegistrationName}.")

              val registry = graph.getKvStateLocationRegistry
              val location = registry.getKvStateLocation(msg.getRegistrationName)
              if (location == null) {
                sender() ! Failure(new UnknownKvStateLocation(msg.getRegistrationName))
              } else {
                sender() ! Success(location)
              }
            } catch {
              case t: Throwable =>
                sender() ! Failure(t)
            }

          case None =>
            sender() ! Status.Failure(new IllegalStateException(s"Job ${msg.getJobId} not found"))
        }

      // TaskManager KvState registration
      case msg: NotifyKvStateRegistered =>
        currentJobs.get(msg.getJobId) match {
          case Some((graph, _)) =>
            try {
              log.debug(s"Key value state registered for job ${msg.getJobId} under " +
                         s"name ${msg.getRegistrationName}.")

              graph.getKvStateLocationRegistry.notifyKvStateRegistered(
                msg.getJobVertexId,
                msg.getKeyGroupRange,
                msg.getRegistrationName,
                msg.getKvStateId,
                msg.getKvStateServerAddress)
            } catch {
              case t: Throwable =>
                log.error(s"Failed to notify KvStateRegistry about registration $msg.")
            }

          case None => log.error(s"Received $msg for unavailable job.")
        }

      // TaskManager KvState unregistration
      case msg: NotifyKvStateUnregistered =>
        currentJobs.get(msg.getJobId) match {
          case Some((graph, _)) =>
            try {
              graph.getKvStateLocationRegistry.notifyKvStateUnregistered(
                msg.getJobVertexId,
                msg.getKeyGroupRange,
                msg.getRegistrationName)
            } catch {
              case t: Throwable =>
                log.error(s"Failed to notify KvStateRegistry about registration $msg.")
            }

          case None => log.error(s"Received $msg for unavailable job.")
        }

      case _ => unhandled(actorMsg)
    }
  }

  /**
   * Handle unmatched messages with an exception.
   */
  override def unhandled(message: Any): Unit = {
    // let the actor crash
    throw new RuntimeException("Received unknown message " + message)
  }

  /**
   * Handle messages that request or report accumulators.
   *
   * @param message The accumulator message.
   */
  private def handleAccumulatorMessage(message: AccumulatorMessage): Unit = {
    message match {
      case RequestAccumulatorResults(jobID) =>
        try {
          currentJobs.get(jobID) match {
            case Some((graph, jobInfo)) =>
              val accumulatorValues = graph.getAccumulatorsSerialized()
              sender() ! decorateMessage(AccumulatorResultsFound(jobID, accumulatorValues))
            case None =>
              archive.forward(message)
          }
        } catch {
        case e: Exception =>
          log.error("Cannot serialize accumulator result.", e)
          sender() ! decorateMessage(AccumulatorResultsErroneous(jobID, e))
        }

      case RequestAccumulatorResultsStringified(jobId) =>
        currentJobs.get(jobId) match {
          case Some((graph, jobInfo)) =>
            val stringifiedAccumulators = graph.getAccumulatorResultsStringified()
            sender() ! decorateMessage(
              AccumulatorResultStringsFound(jobId, stringifiedAccumulators)
            )
          case None =>
            archive.forward(message)
        }

      case unknown =>
        log.warn(s"Received unknown AccumulatorMessage: $unknown")
    }
  }

  /**
   * Dedicated handler for monitor info request messages.
   * 
   * Note that this handler does not fail. Errors while responding to info messages are logged,
   * but will not cause the actor to crash.
   *
   * @param actorMessage The info request message.
   */
  private def handleInfoRequestMessage(actorMessage: InfoMessage, theSender: ActorRef): Unit = {
    try {
      actorMessage match {

        case _ : RequestJobsOverview =>
          // get our own overview
          val ourJobs = createJobStatusOverview()

          // get the overview from the archive
          val future = (archive ? RequestJobsOverview.getInstance())(timeout)

          future.onSuccess {
            case archiveOverview: JobsOverview =>
              theSender ! new JobsOverview(ourJobs, archiveOverview)
          }(context.dispatcher)

        case _ : RequestJobsWithIDsOverview =>
          // get our own overview
          val ourJobs = createJobStatusWithIDsOverview()

          // get the overview from the archive
          val future = (archive ? RequestJobsWithIDsOverview.getInstance())(timeout)

          future.onSuccess {
            case archiveOverview: JobsWithIDsOverview =>
              theSender ! new JobsWithIDsOverview(ourJobs, archiveOverview)
          }(context.dispatcher)

        case _ : RequestStatusOverview =>

          val ourJobs = createJobStatusOverview()

          val numTMs = instanceManager.getNumberOfRegisteredTaskManagers()
          val numSlotsTotal = instanceManager.getTotalNumberOfSlots()
          val numSlotsAvailable = instanceManager.getNumberOfAvailableSlots()

          // add to that the jobs from the archive
          val future = (archive ? RequestJobsOverview.getInstance())(timeout)
          future.onSuccess {
            case archiveOverview: JobsOverview =>
              theSender ! new StatusOverview(numTMs, numSlotsTotal, numSlotsAvailable,
                ourJobs, archiveOverview)
          }(context.dispatcher)

        case msg : RequestJobDetails => 
          
          val ourDetails: Array[JobDetails] = if (msg.shouldIncludeRunning()) {
            currentJobs.values.map {
              v => WebMonitorUtils.createDetailsForJob(v._1)
            }.toArray[JobDetails]
          } else {
            null
          }
          
          if (msg.shouldIncludeFinished()) {
            val future = (archive ? msg)(timeout)
            future.onSuccess {
              case archiveDetails: MultipleJobsDetails =>
                theSender ! new MultipleJobsDetails(ourDetails, archiveDetails.getFinishedJobs())
            }(context.dispatcher)
          } else {
            theSender ! new MultipleJobsDetails(ourDetails, null)
          }
          
        case _ => log.error("Unrecognized info message " + actorMessage)
      }
    }
    catch {
      case e: Throwable => log.error(s"Error responding to message $actorMessage", e)
    }
  }

  private def createJobStatusOverview() : JobsOverview = {
    var runningOrPending = 0
    var finished = 0
    var canceled = 0
    var failed = 0

    currentJobs.values.foreach {
      _._1.getState() match {
        case JobStatus.FINISHED => finished += 1
        case JobStatus.CANCELED => canceled += 1
        case JobStatus.FAILED => failed += 1
        case _ => runningOrPending += 1
      }
    }

    new JobsOverview(runningOrPending, finished, canceled, failed)
  }

  private def createJobStatusWithIDsOverview() : JobsWithIDsOverview = {
    val runningOrPending = new java.util.ArrayList[JobID]()
    val finished = new java.util.ArrayList[JobID]()
    val canceled = new java.util.ArrayList[JobID]()
    val failed = new java.util.ArrayList[JobID]()
    
    currentJobs.values.foreach { case (graph, _) =>
      graph.getState() match {
        case JobStatus.FINISHED => finished.add(graph.getJobID)
        case JobStatus.CANCELED => canceled.add(graph.getJobID)
        case JobStatus.FAILED => failed.add(graph.getJobID)
        case _ => runningOrPending.add(graph.getJobID)
      }
    }

    new JobsWithIDsOverview(runningOrPending, finished, canceled, failed)
  }

  /**
   * Removes the job and sends it to the MemoryArchivist.
   *
   * This should be called asynchronously. Removing the job from the [[SubmittedJobGraphStore]]
   * might block. Therefore be careful not to block the actor thread.
   *
   * @param jobID ID of the job to remove and archive
   * @param removeJobFromStateBackend true if the job shall be archived and removed from the state
   *                            backend
   */
  private def removeJob(jobID: JobID, removeJobFromStateBackend: Boolean): Option[Future[Unit]] = {
    // Don't remove the job yet...
    val futureOption = currentJobs.get(jobID) match {
      case Some((eg, _)) =>
        val result = if (removeJobFromStateBackend) {
          val futureOption = Some(future {
            try {
              // ...otherwise, we can have lingering resources when there is a  concurrent shutdown
              // and the ZooKeeper client is closed. Not removing the job immediately allow the
              // shutdown to release all resources.
              submittedJobGraphs.removeJobGraph(jobID)
            } catch {
              case t: Throwable => log.warn(s"Could not remove submitted job graph $jobID.", t)
            }
          }(context.dispatcher))

          try {
            archive ! decorateMessage(ArchiveExecutionGraph(jobID, eg.archive()))
          } catch {
            case t: Throwable => log.warn(s"Could not archive the execution graph $eg.", t)
          }

          futureOption
        } else {
          None
        }

        currentJobs.remove(jobID)

        result
      case None => None
    }

    try {
      libraryCacheManager.unregisterJob(jobID)
    } catch {
      case t: Throwable =>
        log.error(s"Could not properly unregister job $jobID form the library cache.", t)
    }
    jobManagerMetricGroup.foreach(_.removeJob(jobID))

    futureOption
  }

  /** Fails all currently running jobs and empties the list of currently running jobs. If the
    * [[JobClientActor]] waits for a result, then a [[JobExecutionException]] is sent.
    *
    * @param cause Cause for the cancelling.
    */
  private def cancelAndClearEverything(cause: Throwable)
    : Seq[Future[Unit]] = {
    val futures = for ((jobID, (eg, jobInfo)) <- currentJobs) yield {
      future {
        eg.suspend(cause)

        jobInfo.notifyNonDetachedClients(
          decorateMessage(
            Failure(
              new JobExecutionException(jobID, "All jobs are cancelled and cleared.", cause))))
      }(context.dispatcher)
    }

    currentJobs.clear()

    futures.toSeq
  }

  override def grantLeadership(newLeaderSessionID: UUID): Unit = {
    self ! decorateMessage(GrantLeadership(Option(newLeaderSessionID)))
  }

  override def revokeLeadership(): Unit = {
    self ! decorateMessage(RevokeLeadership)
  }

  override def onAddedJobGraph(jobId: JobID): Unit = {
    if (leaderSessionID.isDefined && !currentJobs.contains(jobId)) {
      self ! decorateMessage(RecoverJob(jobId))
    }
  }

  override def onRemovedJobGraph(jobId: JobID): Unit = {
    if (leaderSessionID.isDefined) {
      currentJobs.get(jobId).foreach(
        job =>
          future {
            // Fail the execution graph
            job._1.failGlobal(new IllegalStateException("Another JobManager removed the job from " +
              "ZooKeeper."))
          }(context.dispatcher)
      )
    }
  }

  override def getAddress: String = {
    AkkaUtils.getAkkaURL(context.system, self)
  }

  /** Handles error occurring in the leader election service
    *
    * @param exception Exception being thrown in the leader election service
    */
  override def handleError(exception: Exception): Unit = {
    log.error("Received an error from the LeaderElectionService.", exception)

    // terminate JobManager in case of an error
    self ! decorateMessage(PoisonPill)
  }
  
  /**
   * Updates the accumulators reported from a task manager via the Heartbeat message.
   *
   * @param accumulators list of accumulator snapshots
   */
  private def updateAccumulators(accumulators : Seq[AccumulatorSnapshot]): Unit = {
    accumulators.foreach( snapshot => {
        if (snapshot != null) {
          currentJobs.get(snapshot.getJobID) match {
            case Some((jobGraph, jobInfo)) =>
              future {
                jobGraph.updateAccumulators(snapshot)
              }(context.dispatcher)
            case None =>
              // ignore accumulator values for old job
          }
        }
    })
  }

  /**
    * Shutdown method which may be overridden for testing.
    */
  protected def shutdown() : Unit = {
    // Await actor system termination and shut down JVM
    new ProcessShutDownThread(
      log.logger,
      context.system,
      FiniteDuration(10, SECONDS)).start()

    // Shutdown and discard all queued messages
    context.system.shutdown()
  }

  private def instantiateMetrics(jobManagerMetricGroup: MetricGroup) : Unit = {
    jobManagerMetricGroup.gauge[Long, Gauge[Long]]("taskSlotsAvailable", new Gauge[Long] {
      override def getValue: Long = JobManager.this.instanceManager.getNumberOfAvailableSlots
    })
    jobManagerMetricGroup.gauge[Long, Gauge[Long]]("taskSlotsTotal", new Gauge[Long] {
      override def getValue: Long = JobManager.this.instanceManager.getTotalNumberOfSlots
    })
    jobManagerMetricGroup.gauge[Long, Gauge[Long]]("numRegisteredTaskManagers", new Gauge[Long] {
      override def getValue: Long
      = JobManager.this.instanceManager.getNumberOfRegisteredTaskManagers
    })
    jobManagerMetricGroup.gauge[Long, Gauge[Long]]("numRunningJobs", new Gauge[Long] {
      override def getValue: Long = JobManager.this.currentJobs.size
    })
    MetricUtils.instantiateStatusMetrics(jobManagerMetricGroup)
  }
}

/**
 * Job Manager companion object. Contains the entry point (main method) to run the JobManager in a
 * standalone fashion. Also contains various utility methods to start the JobManager and to
 * look up the JobManager actor reference.
 */
object JobManager {

  val LOG = Logger(classOf[JobManager])

  val STARTUP_FAILURE_RETURN_CODE = 1
  val RUNTIME_FAILURE_RETURN_CODE = 2


  /**
   * Entry point (main method) to run the JobManager in a standalone fashion.
   *
   * @param args The command line arguments.
   */
  def main(args: Array[String]): Unit = {
    // startup checks and logging
    EnvironmentInformation.logEnvironmentInfo(LOG.logger, "JobManager", args)
    SignalHandler.register(LOG.logger)
    JvmShutdownSafeguard.installAsShutdownHook(LOG.logger)

    // parsing the command line arguments
    val (configuration: Configuration,
         executionMode: JobManagerMode,
         externalHostName: String,
         portRange: java.util.Iterator[Integer]) =
    try {
      parseArgs(args)
    }
    catch {
      case t: Throwable =>
        LOG.error(t.getMessage(), t)
        t.printStackTrace()
        System.exit(STARTUP_FAILURE_RETURN_CODE)
        null
    }

    // we want to check that the JobManager hostname is in the config
    // if it is not in there, the actor system will bind to the loopback interface's
    // address and will not be reachable from anyone remote
    if (externalHostName == null) {
      val message = "Config parameter '" + ConfigConstants.JOB_MANAGER_IPC_ADDRESS_KEY +
        "' is missing (hostname/address to bind JobManager to)."
      LOG.error(message)
      System.exit(STARTUP_FAILURE_RETURN_CODE)
    }

    if (!portRange.hasNext) {
      if (ZooKeeperUtils.isZooKeeperRecoveryMode(configuration)) {
        val message = "Config parameter '" + ConfigConstants.HA_JOB_MANAGER_PORT +
          "' does not specify a valid port range."
        LOG.error(message)
        System.exit(STARTUP_FAILURE_RETURN_CODE)
      }
      else {
        val message = s"Config parameter '" + ConfigConstants.JOB_MANAGER_IPC_ADDRESS_KEY +
          "' does not specify a valid port."
        LOG.error(message)
        System.exit(STARTUP_FAILURE_RETURN_CODE)
      }
    }

    // run the job manager
    SecurityUtils.install(new SecurityConfiguration(configuration))

    try {
      SecurityUtils.getInstalledContext.runSecured(new Callable[Unit] {
        override def call(): Unit = {
          runJobManager(
            configuration,
            executionMode,
            externalHostName,
            portRange)
        }
      })
    } catch {
      case t: Throwable =>
        LOG.error("Failed to run JobManager.", t)
        t.printStackTrace()
        System.exit(STARTUP_FAILURE_RETURN_CODE)
    }
  }

  /**
   * Starts and runs the JobManager with all its components. First, this method starts a
   * dedicated actor system for the JobManager. Second, its starts all components of the
   * JobManager (including library cache, instance manager, scheduler). Finally, it starts
   * the JobManager actor itself.
   *
   * This method blocks indefinitely (or until the JobManager's actor system is shut down).
   *
   * @param configuration The configuration object for the JobManager.
   * @param executionMode The execution mode in which to run. Execution mode LOCAL will spawn an
   *                      an additional TaskManager in the same process.
   * @param listeningAddress The hostname where the JobManager should listen for messages.
   * @param listeningPort The port where the JobManager should listen for messages.
   */
  def runJobManager(
      configuration: Configuration,
      executionMode: JobManagerMode,
      listeningAddress: String,
      listeningPort: Int)
    : Unit = {

    val numberProcessors = Hardware.getNumberCPUCores()

    val futureExecutor = Executors.newScheduledThreadPool(
      numberProcessors,
      new ExecutorThreadFactory("jobmanager-future"))

    val ioExecutor = Executors.newFixedThreadPool(
      numberProcessors,
      new ExecutorThreadFactory("jobmanager-io"))

    val timeout = AkkaUtils.getTimeout(configuration)

    // we have to first start the JobManager ActorSystem because this determines the port if 0
    // was chosen before. The method startActorSystem will update the configuration correspondingly.
    val jobManagerSystem = startActorSystem(
      configuration,
      listeningAddress,
      listeningPort)

    val highAvailabilityServices = HighAvailabilityServicesUtils.createHighAvailabilityServices(
      configuration,
      ioExecutor,
      AddressResolution.NO_ADDRESS_RESOLUTION);

    val (_, _, webMonitorOption, _) = try {
      startJobManagerActors(
        jobManagerSystem,
        configuration,
        executionMode,
        listeningAddress,
        futureExecutor,
        ioExecutor,
        highAvailabilityServices,
        classOf[JobManager],
        classOf[MemoryArchivist],
        Option(classOf[StandaloneResourceManager])
      )
    } catch {
      case t: Throwable =>
        futureExecutor.shutdownNow()
        ioExecutor.shutdownNow()

        throw t
    }

    // block until everything is shut down
    jobManagerSystem.awaitTermination()

    webMonitorOption.foreach{
      webMonitor =>
        try {
          webMonitor.stop()
        } catch {
          case t: Throwable =>
            LOG.warn("Could not properly stop the web monitor.", t)
        }
    }

    try {
      highAvailabilityServices.close()
    } catch {
      case t: Throwable =>
        LOG.warn("Could not properly stop the high availability services.", t)
    }

    FlinkExecutors.gracefulShutdown(
      timeout.toMillis,
      TimeUnit.MILLISECONDS,
      futureExecutor,
      ioExecutor)
  }

  /**
    * Starts and runs the JobManager with all its components trying to bind to
    * a port in the specified range.
    *
    * @param configuration The configuration object for the JobManager.
    * @param executionMode The execution mode in which to run. Execution mode LOCAL will spawn an
    *                      an additional TaskManager in the same process.
    * @param listeningAddress The hostname where the JobManager should listen for messages.
    * @param listeningPortRange The port range where the JobManager should listen for messages.
    */
  def runJobManager(
      configuration: Configuration,
      executionMode: JobManagerMode,
      listeningAddress: String,
      listeningPortRange: java.util.Iterator[Integer])
    : Unit = {

    val result = retryOnBindException({
      // Try all ports in the range until successful
      val socket = NetUtils.createSocketFromPorts(
        listeningPortRange,
        new NetUtils.SocketFactory {
          override def createSocket(port: Int): ServerSocket = new ServerSocket(
            // Use the correct listening address, bound ports will only be
            // detected later by Akka.
            port, 0, InetAddress.getByName(NetUtils.getWildcardIPAddress))
        })

      val port =
        if (socket == null) {
          throw new BindException(s"Unable to allocate port for JobManager.")
        } else {
          try {
            socket.getLocalPort()
          } finally {
            socket.close()
          }
        }

      runJobManager(configuration, executionMode, listeningAddress, port)
    }, { !listeningPortRange.hasNext }, 5000)

    result match {
      case scala.util.Failure(f) => throw f
      case _ =>
    }
  }

  /**
    * Retries a function if it fails because of a [[java.net.BindException]].
    *
    * @param fn The function to retry
    * @param stopCond Flag to signal termination
    * @param maxSleepBetweenRetries Max random sleep time between retries
    * @tparam T Return type of the the function to retry
    * @return Return value of the the function to retry
    */
  @tailrec
  def retryOnBindException[T](
      fn: => T,
      stopCond: => Boolean,
      maxSleepBetweenRetries : Long = 0 )
    : scala.util.Try[T] = {

    def sleepBeforeRetry() : Unit = {
      if (maxSleepBetweenRetries > 0) {
        val sleepTime = (Math.random() * maxSleepBetweenRetries).asInstanceOf[Long]
        LOG.info(s"Retrying after bind exception. Sleeping for $sleepTime ms.")
        Thread.sleep(sleepTime)
      }
    }

    scala.util.Try {
      fn
    } match {
      case scala.util.Failure(x: BindException) =>
        if (stopCond) {
          scala.util.Failure(new RuntimeException(
            "Unable to do further retries starting the actor system"))
        } else {
          sleepBeforeRetry()
          retryOnBindException(fn, stopCond)
        }
      case scala.util.Failure(x: Exception) => x.getCause match {
        case c: ChannelException =>
          if (stopCond) {
            scala.util.Failure(new RuntimeException(
              "Unable to do further retries starting the actor system"))
          } else {
            sleepBeforeRetry()
            retryOnBindException(fn, stopCond)
          }
        case _ => scala.util.Failure(x)
      }
      case f => f
    }
  }

  /**
    * Starts the JobManager actor system.
    *
    * @param configuration Configuration to use for the job manager actor system
    * @param externalHostname External hostname to bind to
    * @param port Port to bind to
    * @return Actor system for the JobManager and its components
    */
  def startActorSystem(
      configuration: Configuration,
      externalHostname: String,
      port: Int): ActorSystem = {

    val hostPort = NetUtils.unresolvedHostAndPortToNormalizedString(externalHostname, port)

    // Bring up the job manager actor system first, bind it to the given address.
    LOG.info(s"Starting JobManager actor system reachable at $hostPort")

    val jobManagerSystem = try {
      val akkaConfig = AkkaUtils.getAkkaConfig(
        configuration,
        Some((externalHostname, port))
      )
      if (LOG.isDebugEnabled) {
        LOG.debug("Using akka configuration\n " + akkaConfig)
      }
      AkkaUtils.createActorSystem(akkaConfig)
    }
    catch {
      case t: Throwable =>
        if (t.isInstanceOf[org.jboss.netty.channel.ChannelException]) {
          val cause = t.getCause()
          if (cause != null && t.getCause().isInstanceOf[java.net.BindException]) {
            throw new Exception("Unable to create JobManager at address " + hostPort +
                                  " - " + cause.getMessage(), t)
          }
        }
        throw new Exception("Could not create JobManager actor system", t)
    }

    val address = AkkaUtils.getAddress(jobManagerSystem)

    configuration.setString(ConfigConstants.JOB_MANAGER_IPC_ADDRESS_KEY, address.host.get)
    configuration.setInteger(ConfigConstants.JOB_MANAGER_IPC_PORT_KEY, address.port.get)

    jobManagerSystem
  }

  /** Starts the JobManager and all its components including the WebMonitor.
    *
    * @param configuration The configuration object for the JobManager
    * @param executionMode The execution mode in which to run. Execution mode LOCAL with spawn an
    *                      additional TaskManager in the same process.
    * @param externalHostname The hostname where the JobManager is reachable for rpc communication
    * @param futureExecutor to run the JobManager's futures
    * @param ioExecutor to run blocking io operations
    * @param highAvailabilityServices to instantiate high availability services
    * @param jobManagerClass The class of the JobManager to be started
    * @param archiveClass The class of the Archivist to be started
    * @param resourceManagerClass Optional class of resource manager if one should be started
    * @return A tuple containing the started ActorSystem, ActorRefs to the JobManager and the
    *         Archivist and an Option containing a possibly started WebMonitor
    */
  def startJobManagerActors(
      jobManagerSystem: ActorSystem,
      configuration: Configuration,
      executionMode: JobManagerMode,
      externalHostname: String,
      futureExecutor: ScheduledExecutorService,
      ioExecutor: Executor,
      highAvailabilityServices: HighAvailabilityServices,
      jobManagerClass: Class[_ <: JobManager],
      archiveClass: Class[_ <: MemoryArchivist],
      resourceManagerClass: Option[Class[_ <: FlinkResourceManager[_]]])
    : (ActorRef, ActorRef, Option[WebMonitor], Option[ActorRef]) = {

    val webMonitor: Option[WebMonitor] =
      if (configuration.getInteger(ConfigConstants.JOB_MANAGER_WEB_PORT_KEY, 0) >= 0) {
        LOG.info("Starting JobManager web frontend")

        // start the web frontend. we need to load this dynamically
        // because it is not in the same project/dependencies
        val webServer = WebMonitorUtils.startWebRuntimeMonitor(
          configuration,
          highAvailabilityServices,
          jobManagerSystem)

        Option(webServer)
      }
      else {
        None
      }

    // Reset the port (necessary in case of automatic port selection)
    webMonitor.foreach{ monitor => configuration.setInteger(
      ConfigConstants.JOB_MANAGER_WEB_PORT_KEY, monitor.getServerPort) }

    try {
      // bring up the job manager actor
      LOG.info("Starting JobManager actor")
      val (jobManager, archive) = startJobManagerActors(
        configuration,
        jobManagerSystem,
        futureExecutor,
        ioExecutor,
        highAvailabilityServices,
        jobManagerClass,
        archiveClass)

      // start a process reaper that watches the JobManager. If the JobManager actor dies,
      // the process reaper will kill the JVM process (to ensure easy failure detection)
      LOG.debug("Starting JobManager process reaper")
      jobManagerSystem.actorOf(
        Props(
          classOf[ProcessReaper],
          jobManager,
          LOG.logger,
          RUNTIME_FAILURE_RETURN_CODE),
        "JobManager_Process_Reaper")

      // bring up a local task manager, if needed
      if (executionMode == JobManagerMode.LOCAL) {
        LOG.info("Starting embedded TaskManager for JobManager's LOCAL execution mode")

        val taskManagerActor = TaskManager.startTaskManagerComponentsAndActor(
          configuration,
          ResourceID.generate(),
          jobManagerSystem,
          highAvailabilityServices,
          externalHostname,
          Some(TaskExecutor.TASK_MANAGER_NAME),
          localTaskManagerCommunication = true,
          classOf[TaskManager])

        LOG.debug("Starting TaskManager process reaper")
        jobManagerSystem.actorOf(
          Props(
            classOf[ProcessReaper],
            taskManagerActor,
            LOG.logger,
            RUNTIME_FAILURE_RETURN_CODE),
          "TaskManager_Process_Reaper")
      }

      // start web monitor
      webMonitor.foreach {
        monitor =>
          val hostnamePort = HighAvailabilityServicesUtils.getJobManagerAddress(configuration)
          val jobManagerAkkaUrl = AkkaRpcServiceUtils.getRpcUrl(
            hostnamePort.f0,
            hostnamePort.f1,
            JobMaster.JOB_MANAGER_NAME,
            AddressResolution.NO_ADDRESS_RESOLUTION,
            configuration)
          monitor.start(jobManagerAkkaUrl)
      }

      val resourceManager =
        resourceManagerClass match {
          case Some(rmClass) =>
            LOG.debug("Starting Resource manager actor")
            Option(
              FlinkResourceManager.startResourceManagerActors(
                configuration,
                jobManagerSystem,
                highAvailabilityServices.getJobManagerLeaderRetriever(
                  HighAvailabilityServices.DEFAULT_JOB_ID),
                rmClass))
          case None =>
            LOG.info("Resource Manager class not provided. No resource manager will be started.")
            None
        }

      (jobManager, archive, webMonitor, resourceManager)
    }
    catch {
      case t: Throwable =>
        LOG.error("Error while starting up JobManager", t)
        try {
          jobManagerSystem.shutdown()
        } catch {
          case tt: Throwable => LOG.warn("Could not cleanly shut down actor system", tt)
        }
        throw t
    }
  }

  /**
   * Loads the configuration, execution mode and the listening address from the provided command
   * line arguments.
   *
   * @param args command line arguments
   * @return Quadruple of configuration, execution mode and an optional listening address
   */
  def parseArgs(args: Array[String])
    : (Configuration, JobManagerMode, String, java.util.Iterator[Integer]) = {
    val parser = new scopt.OptionParser[JobManagerCliOptions]("JobManager") {
      head("Flink JobManager")

      opt[String]("configDir") action { (arg, conf) => 
        conf.setConfigDir(arg)
        conf
      } text {
        "The configuration directory."
      }

      opt[String]("executionMode") action { (arg, conf) =>
        conf.setJobManagerMode(arg)
        conf
      } text {
        "The execution mode of the JobManager (CLUSTER / LOCAL)"
      }

      opt[String]("host").optional().action { (arg, conf) =>
        conf.setHost(arg)
        conf
      } text {
        "Network address for communication with the job manager"
      }

      opt[Int]("webui-port").optional().action { (arg, conf) =>
        conf.setWebUIPort(arg)
        conf
      } text {
        "Port for the UI web server"
      }
    }

    val cliOptions = parser.parse(args, new JobManagerCliOptions()).getOrElse {
      throw new Exception(
        s"Invalid command line arguments: ${args.mkString(" ")}. Usage: ${parser.usage}")
    }
    
    val configDir = cliOptions.getConfigDir()
    
    if (configDir == null) {
      throw new Exception("Missing parameter '--configDir'")
    }
    if (cliOptions.getJobManagerMode() == null) {
      throw new Exception("Missing parameter '--executionMode'")
    }

    LOG.info("Loading configuration from " + configDir)
    val configuration = GlobalConfiguration.loadConfiguration(configDir)

    try {
      FileSystem.setDefaultScheme(configuration)
    }
    catch {
      case e: IOException => {
        throw new Exception("Error while setting the default " +
          "filesystem scheme from configuration.", e)
      }
    }

    if (cliOptions.getWebUIPort() >= 0) {
      configuration.setInteger(ConfigConstants.JOB_MANAGER_WEB_PORT_KEY, cliOptions.getWebUIPort())
    }

    if (cliOptions.getHost() != null) {
      configuration.setString(ConfigConstants.JOB_MANAGER_IPC_ADDRESS_KEY, cliOptions.getHost())
    }

    val host = configuration.getString(ConfigConstants.JOB_MANAGER_IPC_ADDRESS_KEY, null)

    val portRange =
      // high availability mode
      if (ZooKeeperUtils.isZooKeeperRecoveryMode(configuration)) {
        LOG.info("Starting JobManager with high-availability")

        configuration.setInteger(ConfigConstants.JOB_MANAGER_IPC_PORT_KEY, 0)

        // The port range of allowed job manager ports or 0 for random
        configuration.getValue(HighAvailabilityOptions.HA_JOB_MANAGER_PORT_RANGE)
      }
      else {
        LOG.info("Starting JobManager without high-availability")

        // In standalone mode, we don't allow port ranges
        val listeningPort = configuration.getInteger(
          ConfigConstants.JOB_MANAGER_IPC_PORT_KEY,
          ConfigConstants.DEFAULT_JOB_MANAGER_IPC_PORT)

        if (listeningPort <= 0 || listeningPort >= 65536) {
          val message = "Config parameter '" + ConfigConstants.JOB_MANAGER_IPC_PORT_KEY +
            "' is invalid, it must be greater than 0 and less than 65536."
          LOG.error(message)
          System.exit(STARTUP_FAILURE_RETURN_CODE)
        }

        String.valueOf(listeningPort)
      }

    val executionMode = cliOptions.getJobManagerMode

    LOG.info(s"Starting JobManager on $host:$portRange with execution mode $executionMode")

    val portRangeIterator = NetUtils.getPortRangeFromString(portRange)

    (configuration, executionMode, host, portRangeIterator)
  }

  /**
   * Create the job manager components as (instanceManager, scheduler, libraryCacheManager,
   *              archiverProps, defaultExecutionRetries,
   *              delayBetweenRetries, timeout)
   *
   * @param configuration The configuration from which to parse the config values.
   * @param futureExecutor to run JobManager's futures
   * @param ioExecutor to run blocking io operations
   * @param blobStore to store blobs persistently
   * @return The members for a default JobManager.
   */
  def createJobManagerComponents(
      configuration: Configuration,
      futureExecutor: ScheduledExecutorService,
      ioExecutor: Executor,
      blobStore: BlobStore) :
    (InstanceManager,
    FlinkScheduler,
    BlobLibraryCacheManager,
    RestartStrategyFactory,
    FiniteDuration, // timeout
    Int, // number of archived jobs
    Option[Path], // archive path
    FiniteDuration, // timeout for job recovery
    Option[FlinkMetricRegistry]
   ) = {

    val timeout: FiniteDuration = AkkaUtils.getTimeout(configuration)

    val cleanupInterval = configuration.getLong(
      ConfigConstants.LIBRARY_CACHE_MANAGER_CLEANUP_INTERVAL,
      ConfigConstants.DEFAULT_LIBRARY_CACHE_MANAGER_CLEANUP_INTERVAL) * 1000

    val restartStrategy = RestartStrategyFactory.createRestartStrategyFactory(configuration)

    val archiveCount = configuration.getInteger(ConfigConstants.JOB_MANAGER_WEB_ARCHIVE_COUNT,
      ConfigConstants.DEFAULT_JOB_MANAGER_WEB_ARCHIVE_COUNT)

    val archiveDir = configuration.getString(JobManagerOptions.ARCHIVE_DIR)

    val archivePath = if (archiveDir != null) {
      try {
        Option.apply(
          WebMonitorUtils.validateAndNormalizeUri(new Path(archiveDir).toUri))
      } catch {
        case e: Exception =>
          LOG.warn(s"Failed to validate specified archive directory in '$archiveDir'. " +
            "Jobs will not be archived for the HistoryServer.", e)
          Option.empty
      }
    } else {
      LOG.debug("No archive directory was configured. Jobs will not be archived.")
      Option.empty
    }

    var blobServer: BlobServer = null
    var instanceManager: InstanceManager = null
    var scheduler: FlinkScheduler = null
    var libraryCacheManager: BlobLibraryCacheManager = null

    try {
      blobServer = new BlobServer(configuration, blobStore)
      instanceManager = new InstanceManager()
      scheduler = new FlinkScheduler(ExecutionContext.fromExecutor(futureExecutor))
      libraryCacheManager = new BlobLibraryCacheManager(blobServer, cleanupInterval)

      instanceManager.addInstanceListener(scheduler)
    }
    catch {
      case t: Throwable =>
        if (libraryCacheManager != null) {
          libraryCacheManager.shutdown()
        }
        if (scheduler != null) {
          scheduler.shutdown()
        }
        if (instanceManager != null) {
          instanceManager.shutdown()
        }
        if (blobServer != null) {
          blobServer.close()
        }
        
        throw t
    }

    val jobRecoveryTimeoutStr = configuration.getValue(HighAvailabilityOptions.HA_JOB_DELAY)

    val jobRecoveryTimeout = if (jobRecoveryTimeoutStr == null || jobRecoveryTimeoutStr.isEmpty) {
      timeout
    } else {
      try {
        FiniteDuration(Duration(jobRecoveryTimeoutStr).toMillis, TimeUnit.MILLISECONDS)
      } catch {
        case n: NumberFormatException =>
          throw new Exception(
            s"Invalid config value for ${HighAvailabilityOptions.HA_JOB_DELAY.key()}: " +
              s"$jobRecoveryTimeoutStr. Value must be a valid duration (such as '10 s' or '1 min')")
      }
    }

    val metricRegistry = try {
      Option(new FlinkMetricRegistry(MetricRegistryConfiguration.fromConfiguration(configuration)))
    } catch {
      case _: Exception =>
        None
    }

    (instanceManager,
      scheduler,
      libraryCacheManager,
      restartStrategy,
      timeout,
      archiveCount,
      archivePath,
      jobRecoveryTimeout,
      metricRegistry)
  }

  /**
   * Starts the JobManager and job archiver based on the given configuration, in
   * the given actor system.
   *
   * @param configuration The configuration for the JobManager
   * @param actorSystem The actor system running the JobManager
   * @param futureExecutor to run JobManager's futures
   * @param ioExecutor to run blocking io operations
   * @param jobManagerClass The class of the JobManager to be started
   * @param archiveClass The class of the MemoryArchivist to be started
   * @return A tuple of references (JobManager Ref, Archiver Ref)
   */
  def startJobManagerActors(
      configuration: Configuration,
      actorSystem: ActorSystem,
      futureExecutor: ScheduledExecutorService,
      ioExecutor: Executor,
      highAvailabilityServices: HighAvailabilityServices,
      jobManagerClass: Class[_ <: JobManager],
      archiveClass: Class[_ <: MemoryArchivist])
    : (ActorRef, ActorRef) = {

    startJobManagerActors(
      configuration,
      actorSystem,
      futureExecutor,
      ioExecutor,
      highAvailabilityServices,
      Some(JobMaster.JOB_MANAGER_NAME),
      Some(JobMaster.ARCHIVE_NAME),
      jobManagerClass,
      archiveClass)
  }

  /**
   * Starts the JobManager and job archiver based on the given configuration, in the
   * given actor system.
   *
   * @param configuration The configuration for the JobManager
   * @param actorSystem The actor system running the JobManager
   * @param futureExecutor to run JobManager's futures
   * @param ioExecutor to run blocking io operations
   * @param jobManagerActorName Optionally the name of the JobManager actor. If none is given,
   *                          the actor will have the name generated by the actor system.
   * @param archiveActorName Optionally the name of the archive actor. If none is given,
   *                          the actor will have the name generated by the actor system.
   * @param jobManagerClass The class of the JobManager to be started
   * @param archiveClass The class of the MemoryArchivist to be started
    * @return A tuple of references (JobManager Ref, Archiver Ref)
   */
  def startJobManagerActors(
      configuration: Configuration,
      actorSystem: ActorSystem,
      futureExecutor: ScheduledExecutorService,
      ioExecutor: Executor,
      highAvailabilityServices: HighAvailabilityServices,
      jobManagerActorName: Option[String],
      archiveActorName: Option[String],
      jobManagerClass: Class[_ <: JobManager],
      archiveClass: Class[_ <: MemoryArchivist])
    : (ActorRef, ActorRef) = {

    val (instanceManager,
    scheduler,
    libraryCacheManager,
    restartStrategy,
    timeout,
    archiveCount,
    archivePath,
    jobRecoveryTimeout,
    metricsRegistry) = createJobManagerComponents(
      configuration,
      futureExecutor,
      ioExecutor,
      highAvailabilityServices.createBlobStore())

    val archiveProps = getArchiveProps(archiveClass, archiveCount, archivePath)

    // start the archiver with the given name, or without (avoid name conflicts)
    val archive: ActorRef = archiveActorName match {
      case Some(actorName) => actorSystem.actorOf(archiveProps, actorName)
      case None => actorSystem.actorOf(archiveProps)
    }

    val jobManagerProps = getJobManagerProps(
      jobManagerClass,
      configuration,
      futureExecutor,
      ioExecutor,
      instanceManager,
      scheduler,
      libraryCacheManager,
      archive,
      restartStrategy,
      timeout,
      highAvailabilityServices.getJobManagerLeaderElectionService(
        HighAvailabilityServices.DEFAULT_JOB_ID),
      highAvailabilityServices.getSubmittedJobGraphStore(),
      highAvailabilityServices.getCheckpointRecoveryFactory(),
      jobRecoveryTimeout,
      metricsRegistry)

    val jobManager: ActorRef = jobManagerActorName match {
      case Some(actorName) => actorSystem.actorOf(jobManagerProps, actorName)
      case None => actorSystem.actorOf(jobManagerProps)
    }

    metricsRegistry match {
      case Some(registry) =>
        registry.startQueryService(actorSystem, null)
      case None =>
    }

    (jobManager, archive)
  }

  def getArchiveProps(
      archiveClass: Class[_ <: MemoryArchivist],
      archiveCount: Int,
      archivePath: Option[Path]): Props = {
    Props(archiveClass, archiveCount, archivePath)
  }

  def getJobManagerProps(
    jobManagerClass: Class[_ <: JobManager],
    configuration: Configuration,
    futureExecutor: ScheduledExecutorService,
    ioExecutor: Executor,
    instanceManager: InstanceManager,
    scheduler: FlinkScheduler,
    libraryCacheManager: LibraryCacheManager,
    archive: ActorRef,
    restartStrategyFactory: RestartStrategyFactory,
    timeout: FiniteDuration,
    leaderElectionService: LeaderElectionService,
    submittedJobGraphStore: SubmittedJobGraphStore,
    checkpointRecoveryFactory: CheckpointRecoveryFactory,
    jobRecoveryTimeout: FiniteDuration,
    metricsRegistry: Option[FlinkMetricRegistry]): Props = {

    Props(
      jobManagerClass,
      configuration,
      futureExecutor,
      ioExecutor,
      instanceManager,
      scheduler,
      libraryCacheManager,
      archive,
      restartStrategyFactory,
      timeout,
      leaderElectionService,
      submittedJobGraphStore,
      checkpointRecoveryFactory,
      jobRecoveryTimeout,
      metricsRegistry)
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy