org.apache.flink.runtime.minicluster.LocalFlinkMiniCluster.scala

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flink.runtime.minicluster

import java.net.InetAddress
import java.util.concurrent.{Executor, ScheduledExecutorService}

import akka.actor.{ActorRef, ActorSystem, Props}
import org.apache.flink.api.common.JobID
import org.apache.flink.api.common.io.FileOutputFormat
import org.apache.flink.configuration.{ConfigConstants, Configuration, QueryableStateOptions, TaskManagerOptions}
import org.apache.flink.core.fs.Path
import org.apache.flink.runtime.checkpoint.CheckpointRecoveryFactory
import org.apache.flink.runtime.clusterframework.FlinkResourceManager
import org.apache.flink.runtime.clusterframework.standalone.StandaloneResourceManager
import org.apache.flink.runtime.clusterframework.types.{ResourceID, ResourceIDRetrievable}
import org.apache.flink.runtime.execution.librarycache.BlobLibraryCacheManager
import org.apache.flink.runtime.executiongraph.restart.RestartStrategyFactory
import org.apache.flink.runtime.highavailability.{HighAvailabilityServices, HighAvailabilityServicesUtils}
import org.apache.flink.runtime.instance.InstanceManager
import org.apache.flink.runtime.io.disk.iomanager.IOManager
import org.apache.flink.runtime.io.network.NetworkEnvironment
import org.apache.flink.runtime.io.network.netty.NettyConfig
import org.apache.flink.runtime.jobmanager.scheduler.Scheduler
import org.apache.flink.runtime.jobmanager.{JobManager, MemoryArchivist, SubmittedJobGraphStore}
import org.apache.flink.runtime.jobmaster.JobMaster
import org.apache.flink.runtime.leaderelection.LeaderElectionService
import org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService
import org.apache.flink.runtime.memory.MemoryManager
import org.apache.flink.runtime.messages.JobManagerMessages
import org.apache.flink.runtime.messages.JobManagerMessages.{RunningJobsStatus, StoppingFailure, StoppingResponse}
import org.apache.flink.runtime.metrics.MetricRegistry
import org.apache.flink.runtime.taskexecutor.{TaskExecutor, TaskManagerConfiguration, TaskManagerServices, TaskManagerServicesConfiguration}
import org.apache.flink.runtime.taskmanager.{TaskManager, TaskManagerLocation}
import org.apache.flink.runtime.util.EnvironmentInformation

import scala.concurrent.{Await, ExecutionContext}
import scala.concurrent.duration.FiniteDuration

/**
 * Local Flink mini cluster which executes all [[TaskManager]]s and the [[JobManager]] in the same
 * JVM. It extends [[FlinkMiniCluster]] with convenience functions to set up Flink's
 * configuration and with factory methods that create the [[JobManager]] and [[TaskManager]]
 * actors.
 *
 * @param userConfiguration Configuration object with the user-provided configuration values
 * @param highAvailabilityServices High availability services (leader election, submitted job
 *                                 graph store, checkpoint recovery) shared by the mini cluster's
 *                                 components
 * @param singleActorSystem true if all actors (JobManager and TaskManager) shall run in the same
 *                          [[ActorSystem]], otherwise false
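 *
 * A minimal usage sketch (assuming the [[FlinkMiniCluster]] base-class API for starting,
 * submitting and stopping; building the `jobGraph` itself is omitted here):
 *
 * {{{
 *   val config = new Configuration()
 *   config.setInteger(ConfigConstants.LOCAL_NUMBER_TASK_MANAGER, 2)
 *
 *   val miniCluster = new LocalFlinkMiniCluster(config, useSingleActorSystem = true)
 *   miniCluster.start()
 *   try {
 *     miniCluster.submitJobAndWait(jobGraph, printUpdates = false)
 *   } finally {
 *     miniCluster.stop()
 *   }
 * }}}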
 */
class LocalFlinkMiniCluster(
    userConfiguration: Configuration,
    highAvailabilityServices: HighAvailabilityServices,
    singleActorSystem: Boolean) extends FlinkMiniCluster(
  userConfiguration,
  highAvailabilityServices,
  singleActorSystem) {

  def this(userConfiguration: Configuration, useSingleActorSystem: Boolean) = {
    this(
      userConfiguration,
      HighAvailabilityServicesUtils.createAvailableOrEmbeddedServices(
        userConfiguration,
        ExecutionContext.global),
      useSingleActorSystem)
  }

  def this(userConfiguration: Configuration) = this(userConfiguration, true)

  // --------------------------------------------------------------------------

  override def generateConfiguration(userConfiguration: Configuration): Configuration = {
    val config = getDefaultConfig

    setDefaultCiConfig(config)

    config.addAll(userConfiguration)
    setMemory(config)
    initializeIOFormatClasses(config)

    // Disable the queryable state server if it has not been configured explicitly
    if (!config.containsKey(QueryableStateOptions.SERVER_ENABLE.key())) {
      LOG.info("Disabled queryable state server")
      config.setBoolean(QueryableStateOptions.SERVER_ENABLE, false)
    }

    config
  }

  //------------------------------------------------------------------------------------------------
  // Actor classes
  //------------------------------------------------------------------------------------------------

  val jobManagerClass: Class[_ <: JobManager] = classOf[JobManager]

  val taskManagerClass: Class[_ <: TaskManager] = classOf[TaskManager]

  val memoryArchivistClass: Class[_ <: MemoryArchivist] = classOf[MemoryArchivist]

  val resourceManagerClass: Class[_ <: FlinkResourceManager[_ <: ResourceIDRetrievable]] =
    classOf[StandaloneResourceManager]

  //------------------------------------------------------------------------------------------------
  // Start methods for the distributed components
  //------------------------------------------------------------------------------------------------

  override def startJobManager(index: Int, system: ActorSystem): ActorRef = {
    val config = originalConfiguration.clone()

    val jobManagerName = getJobManagerName(index)
    val archiveName = getArchiveName(index)

    val jobManagerPort = config.getInteger(
      ConfigConstants.JOB_MANAGER_IPC_PORT_KEY,
      ConfigConstants.DEFAULT_JOB_MANAGER_IPC_PORT)

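    // Shift an explicitly configured IPC port by the JobManager index so that several
    // JobManagers started in the same JVM do not collide; a port of 0 (pick any free
    // port) is left untouched.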
    if (jobManagerPort > 0) {
      config.setInteger(ConfigConstants.JOB_MANAGER_IPC_PORT_KEY, jobManagerPort + index)
    }

    val (instanceManager,
    scheduler,
    libraryCacheManager,
    restartStrategyFactory,
    timeout,
    archiveCount,
    archivePath,
    jobRecoveryTimeout,
    metricsRegistry) = JobManager.createJobManagerComponents(
      config,
      futureExecutor,
      ioExecutor,
      highAvailabilityServices.createBlobStore())

    if (config.getBoolean(ConfigConstants.LOCAL_START_WEBSERVER, false)) {
      metricsRegistry.get.startQueryService(system, null)
    }

    val archive = system.actorOf(
      getArchiveProps(
        memoryArchivistClass,
        archiveCount,
        archivePath),
      archiveName)

    system.actorOf(
      getJobManagerProps(
        jobManagerClass,
        config,
        futureExecutor,
        ioExecutor,
        instanceManager,
        scheduler,
        libraryCacheManager,
        archive,
        restartStrategyFactory,
        timeout,
        highAvailabilityServices.getJobManagerLeaderElectionService(
          HighAvailabilityServices.DEFAULT_JOB_ID),
        highAvailabilityServices.getSubmittedJobGraphStore(),
        highAvailabilityServices.getCheckpointRecoveryFactory(),
        jobRecoveryTimeout,
        metricsRegistry),
      jobManagerName)
  }

  override def startResourceManager(index: Int, system: ActorSystem): ActorRef = {
    val config = originalConfiguration.clone()

    val resourceManagerName = getResourceManagerName(index)

    val resourceManagerPort = config.getInteger(
      ConfigConstants.RESOURCE_MANAGER_IPC_PORT_KEY,
      ConfigConstants.DEFAULT_RESOURCE_MANAGER_IPC_PORT)

    if (resourceManagerPort > 0) {
      config.setInteger(ConfigConstants.RESOURCE_MANAGER_IPC_PORT_KEY, resourceManagerPort + index)
    }

    val resourceManagerProps = getResourceManagerProps(
      resourceManagerClass,
      config,
      highAvailabilityServices.getJobManagerLeaderRetriever(
        HighAvailabilityServices.DEFAULT_JOB_ID))

    system.actorOf(resourceManagerProps, resourceManagerName)
  }

  override def startTaskManager(index: Int, system: ActorSystem): ActorRef = {
    val config = originalConfiguration.clone()

    val rpcPort = config.getInteger(
      ConfigConstants.TASK_MANAGER_IPC_PORT_KEY,
      ConfigConstants.DEFAULT_TASK_MANAGER_IPC_PORT)

    val dataPort = config.getInteger(
      ConfigConstants.TASK_MANAGER_DATA_PORT_KEY,
      ConfigConstants.DEFAULT_TASK_MANAGER_DATA_PORT)

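    // As with the JobManager port above, shift explicitly configured RPC and data ports
    // by the TaskManager index so that several TaskManagers can run in one JVM; a port
    // of 0 lets the operating system pick a free port.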
    if (rpcPort > 0) {
      config.setInteger(ConfigConstants.TASK_MANAGER_IPC_PORT_KEY, rpcPort + index)
    }
    if (dataPort > 0) {
      config.setInteger(ConfigConstants.TASK_MANAGER_DATA_PORT_KEY, dataPort + index)
    }

    val localExecution = numTaskManagers == 1

    val taskManagerActorName = if (singleActorSystem) {
      TaskExecutor.TASK_MANAGER_NAME + "_" + (index + 1)
    } else {
      TaskExecutor.TASK_MANAGER_NAME
    }

    val resourceID = ResourceID.generate() // generate random resource id

    val taskManagerAddress = InetAddress.getByName(hostname)

    val taskManagerConfiguration = TaskManagerConfiguration.fromConfiguration(config)
    val taskManagerServicesConfiguration = TaskManagerServicesConfiguration.fromConfiguration(
      config,
      taskManagerAddress,
      localExecution)

    val taskManagerServices = TaskManagerServices.fromConfiguration(
      taskManagerServicesConfiguration,
      resourceID)

    val metricRegistry = taskManagerServices.getMetricRegistry()

    val props = getTaskManagerProps(
      taskManagerClass,
      taskManagerConfiguration,
      resourceID,
      taskManagerServices.getTaskManagerLocation(),
      taskManagerServices.getMemoryManager(),
      taskManagerServices.getIOManager(),
      taskManagerServices.getNetworkEnvironment,
      metricRegistry)

    if (config.getBoolean(ConfigConstants.LOCAL_START_WEBSERVER, false)) {
      metricRegistry.startQueryService(system, resourceID)
    }

    system.actorOf(props, taskManagerActorName)
  }

  //------------------------------------------------------------------------------------------------
  // Props for the distributed components
  //------------------------------------------------------------------------------------------------

  def getArchiveProps(
      archiveClass: Class[_ <: MemoryArchivist],
      archiveCount: Int,
      archivePath: Option[Path]): Props = {
    JobManager.getArchiveProps(archiveClass, archiveCount, archivePath)
  }

  def getJobManagerProps(
      jobManagerClass: Class[_ <: JobManager],
      configuration: Configuration,
      futureExecutor: ScheduledExecutorService,
      ioExecutor: Executor,
      instanceManager: InstanceManager,
      scheduler: Scheduler,
      libraryCacheManager: BlobLibraryCacheManager,
      archive: ActorRef,
      restartStrategyFactory: RestartStrategyFactory,
      timeout: FiniteDuration,
      leaderElectionService: LeaderElectionService,
      submittedJobGraphStore: SubmittedJobGraphStore,
      checkpointRecoveryFactory: CheckpointRecoveryFactory,
      jobRecoveryTimeout: FiniteDuration,
      metricsRegistry: Option[MetricRegistry])
    : Props = {

    JobManager.getJobManagerProps(
      jobManagerClass,
      configuration,
      futureExecutor,
      ioExecutor,
      instanceManager,
      scheduler,
      libraryCacheManager,
      archive,
      restartStrategyFactory,
      timeout,
      leaderElectionService,
      submittedJobGraphStore,
      checkpointRecoveryFactory,
      jobRecoveryTimeout,
      metricsRegistry)
  }

  def getTaskManagerProps(
      taskManagerClass: Class[_ <: TaskManager],
      taskManagerConfig: TaskManagerConfiguration,
      resourceID: ResourceID,
      taskManagerLocation: TaskManagerLocation,
      memoryManager: MemoryManager,
      ioManager: IOManager,
      networkEnvironment: NetworkEnvironment,
      metricsRegistry: MetricRegistry): Props = {

    TaskManager.getTaskManagerProps(
      taskManagerClass,
      taskManagerConfig,
      resourceID,
      taskManagerLocation,
      memoryManager,
      ioManager,
      networkEnvironment,
      highAvailabilityServices,
      metricsRegistry)
  }

  def getResourceManagerProps(
      resourceManagerClass: Class[_ <: FlinkResourceManager[_ <: ResourceIDRetrievable]],
      configuration: Configuration,
      leaderRetrievalService: LeaderRetrievalService): Props = {

    FlinkResourceManager.getResourceManagerProps(
      resourceManagerClass,
      configuration,
      leaderRetrievalService)
  }

  //------------------------------------------------------------------------------------------------
  // Helper methods
  //------------------------------------------------------------------------------------------------

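  // FileOutputFormat.initDefaultsFromConfiguration is not publicly accessible (hence the
  // setAccessible(true) below), so reflection is used to seed the static output format
  // defaults from the mini cluster's configuration.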
  def initializeIOFormatClasses(configuration: Configuration): Unit = {
    try {
      val om = classOf[FileOutputFormat[_]].getDeclaredMethod("initDefaultsFromConfiguration",
        classOf[Configuration])
      om.setAccessible(true)
      om.invoke(null, configuration)
    } catch {
      case e: Exception =>
        LOG.error("Cannot (re)initialize the globally loaded defaults. Some classes might not " +
          "follow the specified default behaviour.", e)
    }
  }

  def setMemory(config: Configuration): Unit = {
    // set this only if no memory was pre-configured
    if (config.getLong(TaskManagerOptions.MANAGED_MEMORY_SIZE) ==
        TaskManagerOptions.MANAGED_MEMORY_SIZE.defaultValue()) {

      val numTaskManager = config.getInteger(
        ConfigConstants.LOCAL_NUMBER_TASK_MANAGER,
        ConfigConstants.DEFAULT_LOCAL_NUMBER_TASK_MANAGER)

      val memoryFraction = config.getFloat(TaskManagerOptions.MANAGED_MEMORY_FRACTION)

      // full memory size
      var memorySize: Long = EnvironmentInformation.getSizeOfFreeHeapMemoryWithDefrag

      // compute the memory size per task manager. we assume an equal share of
      // memory for each TaskManager and for the JobManager
      memorySize /= numTaskManager + 1 // the +1 accounts for the JobManager

      // for each TaskManager, subtract the memory needed for network memory buffers
      memorySize -= TaskManagerServices.calculateNetworkBufferMemory(memorySize, config)
      memorySize = (memorySize * memoryFraction).toLong
      memorySize >>= 20 // bytes to megabytes
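      // Worked example (illustrative numbers only): with roughly 4 GiB of free heap and
      // the default of one local TaskManager, each of the two actors (1 TaskManager +
      // 1 JobManager) gets about 2 GiB; after subtracting the network buffer memory and
      // applying the default managed memory fraction of 0.7, roughly 1.3 GiB (expressed
      // in megabytes) is set as managed memory below.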
      config.setLong(TaskManagerOptions.MANAGED_MEMORY_SIZE, memorySize)
    }
  }

  def getDefaultConfig: Configuration = {
    val config: Configuration = new Configuration()

    config.setString(ConfigConstants.JOB_MANAGER_IPC_ADDRESS_KEY, hostname)
    config.setInteger(ConfigConstants.JOB_MANAGER_IPC_PORT_KEY, 0)

    config.setInteger(ConfigConstants.LOCAL_NUMBER_TASK_MANAGER,
      ConfigConstants.DEFAULT_LOCAL_NUMBER_TASK_MANAGER)

    // Reduce number of threads for local execution
    config.setInteger(NettyConfig.NUM_THREADS_CLIENT, 1)
    config.setInteger(NettyConfig.NUM_THREADS_SERVER, 2)

    config
  }

  protected def getJobManagerName(index: Int): String = {
    if (singleActorSystem) {
      JobMaster.JOB_MANAGER_NAME + "_" + (index + 1)
    } else {
      JobMaster.JOB_MANAGER_NAME
    }
  }

  protected def getResourceManagerName(index: Int): String = {
    if (singleActorSystem) {
      FlinkResourceManager.RESOURCE_MANAGER_NAME + "_" + (index + 1)
    } else {
      FlinkResourceManager.RESOURCE_MANAGER_NAME
    }
  }

  protected def getArchiveName(index: Int): String = {
    if (singleActorSystem) {
      JobMaster.ARCHIVE_NAME + "_" + (index + 1)
    } else {
      JobMaster.ARCHIVE_NAME
    }
  }

  // --------------------------------------------------------------------------
  //  Actions on running jobs
  // --------------------------------------------------------------------------
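  // Sketch of typical use from a test (assumes a started mini cluster with at least one
  // submitted job; stopJob only succeeds for jobs whose sources are stoppable, otherwise
  // it throws):
  //
  //   for (id <- miniCluster.currentlyRunningJobs) {
  //     miniCluster.stopJob(id)
  //   }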

  def currentlyRunningJobs: Iterable[JobID] = {
    val leader = getLeaderGateway(timeout)
    val future = leader.ask(JobManagerMessages.RequestRunningJobsStatus, timeout)
                       .mapTo[RunningJobsStatus]
    Await.result(future, timeout).runningJobs.map(_.getJobId)
  }

  def getCurrentlyRunningJobsJava(): java.util.List[JobID] = {
    val list = new java.util.ArrayList[JobID]()
    currentlyRunningJobs.foreach(list.add)
    list
  }

  def stopJob(id: JobID): Unit = {
    val leader = getLeaderGateway(timeout)
    val response = leader.ask(new JobManagerMessages.StopJob(id), timeout)
                         .mapTo[StoppingResponse]
    val rc = Await.result(response, timeout)

    rc match {
      case failure: StoppingFailure =>
        throw new Exception(s"Stopping the job with ID $id failed.", failure.cause)
      case _ =>
    }
  }
}