com.intel.analytics.bigdl.utils.Engine.scala

/*
 * Copyright 2016 The BigDL Authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.intel.analytics.bigdl.utils

import java.io.{FileOutputStream, InputStream, PrintWriter}
import java.util.Locale
import java.util.concurrent.atomic.AtomicBoolean

import org.apache.log4j.Logger
import org.apache.spark._
import com.intel.analytics.bigdl.mkl.MKL
import com.intel.analytics.bigdl.mkl.hardware.{Affinity, CpuInfo}
import org.apache.spark.utils.SparkUtils
import py4j.GatewayServer

import scala.util.control.{ControlThrowable, NonFatal}

/**
 * define engine type trait
 */
sealed trait EngineType

case object MklBlas extends EngineType
case object MklDnn extends EngineType

/**
 * define optimizer version trait
 */
sealed trait OptimizerVersion

case object OptimizerV1 extends OptimizerVersion
case object OptimizerV2 extends OptimizerVersion


object Engine {

  // Initialize some properties for the mkldnn engine. This must run at the very beginning
  // (during class initialization); otherwise some properties will have no effect.
  if (System.getProperty("bigdl.engineType") == "mkldnn" &&
    System.getProperty("bigdl.multiModels", "false") == "false") {
    setMklDnnEnvironments()
  }

  @deprecated(
    "See https://bigdl-project.github.io/master/#APIGuide/Engine/",
    "0.1.0")
  def init(nExecutor: Int,
           executorCores: Int,
           onSpark: Boolean): Option[SparkConf] = {
    logger.warn("Engine.init(nExecutor, executorCores, onSpark) is deprecated. " +
      "Please refer to " +
      "https://bigdl-project.github.io/master/#APIGuide/Engine/")
    setNodeAndCore(nExecutor, executorCores)
    val res = if (onSpark) {
      require(!localMode,
        s"Engine.init: bigdl.localMode should not be set when onSpark is " +
          s"true. Please set the java property correctly.")
      Some(createSparkConf())
    } else {
      require(localMode,
        s"Engine.init: bigdl.localMode should be set when onSpark is " +
          s"false. Please set the java property correctly.")
      None
    }
    res
  }

  /**
   * BigDL needs some Spark conf values to be set correctly to achieve good performance.
   *
   * This method creates a SparkConf, or uses the existing one if you provide it, and
   * populates it with the correct values.
   *
   * We recommend using this method instead of setting Spark conf values directly, so that
   * changes to those values stay transparent to you. However, if you use spark-shell or a
   * Jupyter notebook, the Spark context is created before your code runs, so you have to
   * set them directly (through command line options or a properties file)
   *
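   * A typical driver-side sketch (the app name and variable names are illustrative):
   * {{{
   * val conf = Engine.createSparkConf().setAppName("bigdl-app")
   * val sc = new SparkContext(conf)
   * Engine.init
   * }}}
   *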
   * @return
   */
  def createSparkConf(existingConf : SparkConf = null) : SparkConf = {
    var _conf = existingConf
    if (_conf == null) {
      _conf = new SparkConf()
    }
    readConf.foreach(c => _conf.set(c._1, c._2))
    _conf
  }

  /**
   * This method should be called before any BigDL procedure and after the Spark context
   * is created.
   *
   * BigDL needs some Spark conf values to be set correctly to achieve good performance.
   * There are also multi-threaded engines, so the executor number and the core number
   * per executor need to be known to configure these engines correctly.
   *
   * This method sets the parameters of the multi-threaded engines and verifies the
   * Spark conf values of an existing Spark context.
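   *
   * In pure local mode (a sketch; the properties must be set before this class loads,
   * e.g. on the JVM command line):
   * {{{
   * // java -Dbigdl.localMode=true -Dbigdl.coreNumber=4 ... MyApp
   * Engine.init // no SparkContext required
   * }}}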
   */
  def init: Unit = this.synchronized {
    if (localMode) {
      logger.info("Detected that bigdl.localMode is set. Running workload without Spark")
      // The physical core number should have been initialized
      // by java property -Dbigdl.coreNumber=xx
      setNodeAndCore(1, getCoreNumberFromProperty)
    } else {
      logger.info("Auto detect executor number and executor cores number")
      val (nExecutor, executorCores) = sparkExecutorAndCore().get
      logger.info(s"Executor number is $nExecutor and executor cores number is $executorCores")
      setNodeAndCore(nExecutor, executorCores)
      checkSparkContext
    }
  }

  private val logger = Logger.getLogger(getClass)
  private val singletonCounter = new AtomicBoolean()
  private var physicalCoreNumber = -1
  private var nodeNum: Int = -1

  @volatile
  private var gatewayServer: py4j.GatewayServer = null

  private def createGatewayPortFile(port: Int): Unit = {
    val file = new java.io.File(SparkFiles.getRootDirectory(), "gateway_port")
    logger.debug(s"Creating JavaGatewayServer port file" +
      s" on executor-${SparkEnv.get.executorId}:${file.getAbsolutePath}")
    if (file.exists()) {
      file.delete()
    }
    file.createNewFile()
    val out = new PrintWriter(file)
    try {
      out.print(port)
      out.flush()
    } finally {
      out.close()
    }
  }

  private[bigdl] def createJavaGateway(driverPort: Int): Unit = {
    if (gatewayServer != null) return
    this.synchronized {
      if (gatewayServer != null) return
      gatewayServer = new py4j.GatewayServer(null, 0)
    }

    logger.info(s"Initializing JavaGatewayServer on executor-${SparkEnv.get.executorId}")
    GatewayServer.turnLoggingOn()
    val thread = new Thread(new Runnable() {
      override def run(): Unit = try {
        gatewayServer.start()
      } catch {
        case ct: ControlThrowable =>
          throw ct
        case t: Throwable =>
          throw new Exception(s"Uncaught exception " +
            s"in thread ${Thread.currentThread().getName} when starting JavaGatewayServer", t)
      }
    })
    thread.setName("py4j-executor-gateway-init")
    thread.setDaemon(true)
    thread.start()

    thread.join()

    logger.info(s"JavaGatewayServer initialized")

    Runtime.getRuntime().addShutdownHook(new Thread {
      override def run(): Unit = {
        gatewayServer.shutdown()
      }
    })

    try {
      createGatewayPortFile(gatewayServer.getListeningPort)
    } catch {
      case NonFatal(e) =>
        throw new Exception("Could not create java gateway port file", e)
    }
  }

  private[bigdl] def localMode: Boolean = {
    System.getProperty("bigdl.localMode", "false").toLowerCase(Locale.ROOT) match {
      case "true" => true
      case "false" => false
      case option => throw new IllegalArgumentException(s"Unknown bigdl.localMode $option")
    }
  }

  private val NOT_INIT_ERROR =
    "Do you call Engine.init? See more at " +
      "https://bigdl-project.github.io/master/#APIGuide/Engine/"

  private val SPARK_CONF_ERROR = "For details please check " +
    "https://bigdl-project.github.io/master/#APIGuide/Engine/"

  /**
   * Notice: Please use the property bigdl.engineType to set the engine type.
   * The default engine is mklblas.
   */
  private var engineType: EngineType = {
    System.getProperty("bigdl.engineType", "mklblas").toLowerCase(Locale.ROOT) match {
      case "mklblas" => MklBlas
      case "mkldnn" => MklDnn
      case engineType => throw new IllegalArgumentException(s"Unknown engine type $engineType")
    }
  }

  /**
   * Notice: Please use the property bigdl.optimizerVersion to set the optimizer version.
   * The default version is OptimizerV1.
   */
  private var optimizerVersion: OptimizerVersion = {
    System.getProperty("bigdl.optimizerVersion", "optimizerv1").toLowerCase(Locale.ROOT) match {
      case "optimizerv1" => OptimizerV1
      case "optimizerv2" => OptimizerV2
      case optimizerVersion => throw new IllegalArgumentException(s"Unknown type $optimizerVersion")
    }
  }

  // Thread pool for default use
  @volatile private var _default: ThreadPool = null

  // Thread pool for layer use
  @volatile private var _model: ThreadPool = new ThreadPool(1)

  // Thread pool for blas wrapper layer
  private[bigdl] var wrapperComputing: ThreadPool = null

  // This thread pool is mainly for the mkldnn library, because using the parent
  // thread directly triggers two bugs:
  //   1. Child threads forked from the parent thread will be bound to core 0
  //      because of the affinity settings.
  //   2. The native thread holds some unknown thread-local variables, so if the
  //      parent thread exits and is recreated (such as a thread from
  //      Executors.newFixedThreadPool), the whole app will crash with a
  //      segmentation fault.
  // "Parent thread" means the main thread (local mode) or the worker thread of
  // `mapPartition` (distributed mode).
  // --------------------------------------------------------------------------
  // We will only use the `threadPool` in ThreadPool, which is an ExecutorService.
  // The `context` in ThreadPool is the calling thread when poolSize is 1. That
  // thread has so many usages that we will not change it for now.
  val dnnComputing: ThreadPool = new ThreadPool(1)
  // We need to init the dnn thread in case users directly call model operations in Java local mode
  initDnnThread()

  /**
   * If the user does not define the property bigdl.coreNumber, this returns the number of
   * physical cores the system has, which is also the largest value it supports.
   *
   * Currently it cannot detect the true physical core count; the value is estimated as
   * Runtime.getRuntime().availableProcessors() / 2.
   */
  private def getCoreNumberFromProperty() = {
    System.getProperty("bigdl.coreNumber", getNumMachineCores.toString).toInt
  }

  private def getNumMachineCores: Int = {
    val coreNum = Runtime.getRuntime().availableProcessors()
    require(coreNum > 0, "Get a non-positive core number")
    // We assume the HT is enabled
    // TODO: check the Hyper threading
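    // e.g. availableProcessors() == 8 => assume 4 physical cores (illustrative; HT assumed)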
    if (coreNum > 1) coreNum / 2 else 1
  }

  /**
   * @return true if current execution is a singleton on the JVM
   */
  private[bigdl] def checkSingleton(): Boolean = singletonCounter.compareAndSet(false, true)

  /**
   * Reset the singleton flag
   */
  private[bigdl] def resetSingletonFlag(): Unit = singletonCounter.set(false)

  /**
   * Return the number of cores. Engine.init must be called before using this method,
   * or an exception will be thrown.
   *
   * @return
   */
  private[bigdl] def coreNumber(): Int = {
    require(physicalCoreNumber != -1, s"Engine.init: Core number is " +
      s"not initialized. $NOT_INIT_ERROR")
    physicalCoreNumber
  }

  /**
   * This method should only be used for test purposes.
   *
   * @param n
   */
  private[bigdl] def setCoreNumber(n: Int): Unit = {
    require(n > 0, "Engine.init: core number must be positive")
    physicalCoreNumber = n
    initThreadPool(n)
  }

  /**
   * Return the node number. Engine.init must be called before using this method,
   * or an exception will be thrown.
   *
   * @return
   */
  private[bigdl] def nodeNumber(): Int = {
    require(nodeNum != -1, s"Engine.init: Node number is not initialized. $NOT_INIT_ERROR")
    nodeNum
  }

  /**
   * This method should only be used for test purposes.
   *
   * @param n
   */
  private[bigdl] def setNodeNumber(n : Int): Unit = {
    require(n > 0)
    nodeNum = n
  }

  /**
   * This method should only be used for test purposes.
   *
   * @param optimizerVersion
   */
  private[bigdl] def setOptimizerVersion(optimizerVersion : OptimizerVersion): Unit = {
    this.optimizerVersion = optimizerVersion
  }

  private[bigdl] def getOptimizerVersion(): OptimizerVersion = {
    this.optimizerVersion
  }

  /**
   * This method should only be used for test purposes.
   *
   * @param engineType
   */
  private[bigdl] def setEngineType(engineType: EngineType): Unit = {
    this.engineType = engineType
  }

  private[bigdl] def getEngineType(): EngineType = {
    this.engineType
  }

  private[bigdl] def isMultiModels: Boolean = {
    getEngineType() match {
      case MklBlas => true
      case MklDnn => System.getProperty("bigdl.multiModels", "false").toBoolean
    }
  }

  private[bigdl] def model: ThreadPool = {
    _model
  }

  private[bigdl] def default: ThreadPool = {
    if (_default == null) {
      throw new IllegalStateException(s"Engine.init: Thread engine is not " +
        s"initialized. $NOT_INIT_ERROR")
    }
    _default
  }

  private def initThreadPool(core : Int) : Unit = {
    val defaultPoolSize: Int = System.getProperty("bigdl.utils.Engine.defaultPoolSize",
      (core * 50).toString).toInt
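    // e.g. core = 4 => defaultPoolSize = 200, unless overridden through the
    // bigdl.utils.Engine.defaultPoolSize property (illustrative values)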
    if (_default == null || _default.getPoolSize != defaultPoolSize) {
      _default = new ThreadPool(defaultPoolSize)
    }
    if (wrapperComputing == null || wrapperComputing.getPoolSize != defaultPoolSize) {
      wrapperComputing = new ThreadPool(defaultPoolSize)
    }

    // For dnn models we should also set the pool size to 1; otherwise it will
    // degrade the performance and (FIXME) can make the loss become NaN.
    val modelPoolSize = 1

    if (_model == null || _model.getPoolSize != modelPoolSize) {
      _model = new ThreadPool(modelPoolSize)
    }
    _model.setMKLThread(MKL.getMklNumThreads)

    // Do two things: set the number of threads for the omp thread pool, and set the
    // affinity. This only affects the `threadPool`; `computing.invoke/invokeAndWait`
    // will not be affected. The affinity will not affect threads other than this
    // thread and the omp threads forked from computing.
    if (engineType == MklDnn) {
      dnnComputing.setMKLThreadOfMklDnnBackend(MKL.getMklNumThreads)
      _model.setMKLThreadOfMklDnnBackend(MKL.getMklNumThreads)
    }
    if (System.getProperty("multiThread", "false").toBoolean) {
      wrapperComputing.setMKLThread(1)
    }
  }

  /**
   * Read conf values from config file
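   *
   * Expected line format of the bundled /spark-bigdl.conf (illustrative entries; the
   * real file ships inside the BigDL jar):
   * {{{
   *   spark.shuffle.reduceLocality.enabled      false
   *   spark.shuffle.blockTransferService        nio
   * }}}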
   * @return
   */
  private[utils] def readConf : Seq[(String, String)] = {
    val stream : InputStream = getClass.getResourceAsStream("/spark-bigdl.conf")
    val lines = scala.io.Source.fromInputStream(stream)
      .getLines.filter(_.startsWith("spark")).toArray

    // For Spark 1.5, we observe that the nio block manager has better performance than
    // the netty block manager, so we force-set the block manager to nio. If users don't
    // want this, they can set bigdl.network.nio to false to customize it. This block
    // manager setting won't take effect on newer Spark versions, as the nio block
    // manager has been removed.
    lines.map(_.split("\\s+")).map(d => (d(0), d(1))).toSeq
      .filter(_._1 != "spark.shuffle.blockTransferService" ||
        System.getProperty("bigdl.network.nio", "true").toBoolean)
  }

  /**
   * Check the Spark conf of the Spark context if there's an existing one
   */
  private def checkSparkContext : Unit = {
    val tmpContext = SparkContext.getOrCreate(new SparkConf()
      .set("bigdl.temp.context", "true").setAppName("tmp context for Engine check"))
    // If there's already a spark context, it should not include the property
    val existingSparkContext = !tmpContext.getConf.contains("bigdl.temp.context")
    if (!existingSparkContext) {
      tmpContext.stop()
      throw new IllegalArgumentException("Engine.init: Cannot find an existing"
        + " spark context. Do you call this method after creating a spark context?")
    }
    logger.info("Found existing spark context. Checking the spark conf...")
    val sparkConf = tmpContext.getConf

    def verify(key: String, value: String): Unit = {
      val v = sparkConf.getOption(key)
      require(v.isDefined,
        s"Engine.init: Can not find $key. " + SPARK_CONF_ERROR)
      require(v.get == value,
        s"Engine.init: $key should be $value, " +
          s"but it is ${v.get}. " + SPARK_CONF_ERROR
      )
    }

    readConf.foreach(c => verify(c._1, c._2))
  }

  /**
   * Set executor number and cores per executor
   *
   * @param nodeNum
   * @param coreNum
   */
  private[bigdl] def setNodeAndCore(nodeNum: Int, coreNum: Int): Unit = {
    setNodeNumber(nodeNum)
    setCoreNumber(coreNum)
  }

  /**
   * Reset the engine environment. For test purposes only.
   */
  private[bigdl] def reset : Unit = {
    nodeNum = 1
    physicalCoreNumber = 1
  }

  private def dynamicAllocationExecutor(conf: SparkConf): Option[Int] = {
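    // Sketch: with, illustratively, spark.dynamicAllocation.enabled=true,
    // spark.dynamicAllocation.minExecutors=4 and spark.dynamicAllocation.maxExecutors=4,
    // this returns Some(4); BigDL requires min == max so the executor count is fixed.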
    if (conf.get("spark.dynamicAllocation.enabled", null) == "true") {
      val maxExecutors = conf.get("spark.dynamicAllocation.maxExecutors", "1").toInt
      val minExecutors = conf.get("spark.dynamicAllocation.minExecutors", "1").toInt
      require(maxExecutors == minExecutors, "Engine.init: " +
        "spark.dynamicAllocation.maxExecutors and " +
        "spark.dynamicAllocation.minExecutors must be identical " +
        "in dynamic allocation for BigDL")
      Some(minExecutors)
    } else {
      None
    }
  }

  /**
   * Extract spark executor number and executor cores from environment.
   * @return (nExecutor, executorCore)
   */
  private[utils] def sparkExecutorAndCore(): Option[(Int, Int)] = {
    try {
      parseExecutorAndCore(SparkContext.getOrCreate().getConf)
    } catch {
      case s: SparkException =>
        if (s.getMessage.contains("A master URL must be set in your configuration")) {
          throw new IllegalArgumentException("A master URL must be set in your configuration." +
            " Or if you want to run BigDL in a local JVM environment, you should set Java " +
            "property bigdl.localMode=true")
        }
        throw s
    }
  }

  /**
   * Extract the Spark executor number and executor cores from the given conf.
   * Exposed for testing.
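   *
   * Sketch of the mapping (illustrative values):
   * {{{
   *   local[4]                                           => Some((1, 4))
   *   spark://host:7077, executor.cores=4, cores.max=16  => Some((4, 4))
   *   yarn, executor.cores=8, executor.instances=4       => Some((4, 8))
   * }}}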
   * @return (nExecutor, executorCore)
   */
  private[utils] def parseExecutorAndCore(conf: SparkConf): Option[(Int, Int)] = {
    val master = conf.get("spark.master", null)
    require(master != null, "Engine.init: Can't find spark.master, " +
      "do you start your application with spark-submit?")
    if (master.toLowerCase.startsWith("local")) {
      // Spark local mode
      val patternLocalN = "local\\[(\\d+)\\]".r
      val patternLocalStar = "local\\[\\*\\]".r
      master match {
        case patternLocalN(n) => Some((1, n.toInt))
        case patternLocalStar(_*) => Some((1, getNumMachineCores))
        case _ => throw new IllegalArgumentException(s"Can't parse master $master")
      }
    } else if (master.toLowerCase.startsWith("spark")) {
      // Spark standalone mode
      val coreString = conf.get("spark.executor.cores", null)
      val maxString = conf.get("spark.cores.max", null)
      require(coreString != null, "Engine.init: Can't find executor core number" +
        ", do you submit with --executor-cores option")
      require(maxString != null, "Engine.init: Can't find total core number" +
        ". Do you submit with --total-executor-cores")
      val core = coreString.toInt
      val nodeNum = dynamicAllocationExecutor(conf).getOrElse {
        val total = maxString.toInt
        require(total >= core && total % core == 0, s"Engine.init: total core " +
          s"number($total) can't be divided " +
          s"by single core number($core) provided to spark-submit")
        total / core
      }
      Some((nodeNum, core))
    } else if (master.toLowerCase.startsWith("yarn")) {
      // yarn mode
      val coreString = conf.get("spark.executor.cores", null)
      require(coreString != null, "Engine.init: Can't find executor core number" +
        ", do you submit with " +
        "--executor-cores option")
      val core = coreString.toInt
      val node = dynamicAllocationExecutor(conf).getOrElse {
        val numExecutorString = conf.get("spark.executor.instances", null)
        require(numExecutorString != null, "Engine.init: Can't find executor number" +
          ", do you submit with " +
          "--num-executors option")
        numExecutorString.toInt
      }
      Some((node, core))
    } else if (master.toLowerCase.startsWith("mesos")) {
      // mesos mode
      require(conf.get("spark.mesos.coarse", null) != "false", "Engine.init: " +
        "Don't support mesos fine-grained mode")
      val coreString = conf.get("spark.executor.cores", null)
      require(coreString != null, "Engine.init: Can't find executor core number" +
        ", do you submit with --executor-cores option")
      val core = coreString.toInt
      val nodeNum = dynamicAllocationExecutor(conf).getOrElse {
        val maxString = conf.get("spark.cores.max", null)
        require(maxString != null, "Engine.init: Can't find total core number" +
          ". Do you submit with --total-executor-cores")
        val total = maxString.toInt
        require(total >= core && total % core == 0, s"Engine.init: total core " +
          s"number($total) can't be divided " +
          s"by single core number($core) provided to spark-submit")
        total / core
      }
      Some((nodeNum, core))
    } else if (master.toLowerCase.startsWith("k8s")) {
      // Spark-on-kubernetes mode
      val coreString = conf.get("spark.executor.cores", null)
      val maxString = conf.get("spark.cores.max", null)
      require(coreString != null, "Engine.init: Can't find executor core number" +
        ", do you submit with --conf spark.executor.cores option")
      require(maxString != null, "Engine.init: Can't find total core number" +
        ". Do you submit with --conf spark.cores.max option")
      val core = coreString.toInt
      val nodeNum = dynamicAllocationExecutor(conf).getOrElse {
        val total = maxString.toInt
        require(total >= core && total % core == 0, s"Engine.init: total core " +
          s"number($total) can't be divided " +
          s"by single core number($core) provided to spark-submit")
        total / core
      }
      Some((nodeNum, core))
    } else {
      throw new IllegalArgumentException(s"Engine.init: Unsupported master format $master")
    }
  }

  private def setMklDnnEnvironments(): Unit = {
    val affinityCores = Affinity.getAffinity
    val physicalCoreNum = CpuInfo.getPhysicalProcessorCount
    val affinityCoreNum = affinityCores.length

    // 1. If this library runs in a docker/cgroup env, cpu affinity is set first, so we
    //    can't use resources exceeding the limits.
    // 2. If this library runs in a hyper-threading env, we should set the mkl num threads
    //    to the physical core number for performance.

    // Compare against the locally detected physical core count; the physicalCoreNumber
    // field is not yet initialized (-1) when this runs during class initialization.
    val default = if (affinityCores.min > 0 && affinityCores.max >= physicalCoreNum) {
      affinityCoreNum
    } else if (physicalCoreNum > affinityCoreNum) {
      affinityCoreNum
    } else {
      physicalCoreNum
    }
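    // Worked example (illustrative): 16 physical cores with the JVM pinned by a cgroup
    // to cores 0-7 => affinityCoreNum = 8, physicalCoreNum = 16, so default = 8.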

    val threadsNumber = System.getProperty("bigdl.mklNumThreads", default.toString)
    System.setProperty("bigdl.mklNumThreads", s"$threadsNumber")

    System.setProperty("bigdl.disable.mklBlockTime", "true")
    System.setProperty("bigdl.coreNumber", "1")
  }

  private def initDnnThread(): Unit = {
    if (engineType == MklDnn) {
      dnnComputing.setMKLThreadOfMklDnnBackend(MKL.getMklNumThreads)
    }
  }
}