ai.catboost.spark.SparkHelpers.scala Maven / Gradle / Ivy

Go to download
package ai.catboost.spark;

import util.control.Breaks._

import org.apache.spark.{SparkContext,SparkJobInfo,SparkStageInfo}
import org.apache.spark.sql.SparkSession

import ai.catboost.CatBoostError


private[spark] object SparkHelpers {
  def parseMemoryOverHeadOption(memoryOverhead: String) : Long = {
    val withUnitRE = "^([\\d]+)(k|m|g|t|K|M|G|T)$".r
    val withoutUnitRE = "^([\\d]+)$".r
    memoryOverhead match {
      case withUnitRE(value, unit) => {
        value.toLong << (
          unit match {
            case "k" | "K" => 10
            case "m" | "M" => 20
            case "g" | "G" => 30
            case "t" | "T" => 40
            case _ => throw new java.lang.RuntimeException("Internal error: Incorrect regex matching")
          }
        )
      }
      case withoutUnitRE(value) => { value.toLong << 20 } // default unit is Megabytes
      case _ => throw new CatBoostError(s"bad format for memory overhead string: $memoryOverhead")
    }
  }

  def getThreadCountForDriver(spark : SparkSession) : Int = {
    val taskCpusConfig = spark.sparkContext.getConf.getOption("spark.driver.cores")
    taskCpusConfig.getOrElse("1").toInt
  }

  def getThreadCountForTask(spark : SparkSession) : Int = {
    val taskCpusConfig = spark.sparkContext.getConf.getOption("spark.task.cpus")
    taskCpusConfig.getOrElse("1").toInt
  }

  // Technically Spark can run several executors on one worker but this is not a recommended case
  def getWorkerCount(spark: SparkSession) : Int = {
    if (spark.sparkContext.isLocal) {
      1
    } else {
      /* There's a period at the start of the Spark application when executors have not been started yet
       *  http://apache-spark-user-list.1001560.n3.nabble.com/Getting-the-number-of-slaves-tp10604p10816.html
       *  retry with sleep until the number stabilizes
       */
      val NumRetriesForExecutorsStartWait = 60 // wait for 1 second each time

      var currentExecutorCount = 0
      var retryCount = 0

      breakable {
        while (true) {
          if (retryCount == NumRetriesForExecutorsStartWait) {
            throw new java.lang.RuntimeException(
              s"Unable to get the number of Spark executors in ${NumRetriesForExecutorsStartWait} seconds"
            )
          }
          retryCount = 0

          val executorCount = spark.sparkContext.statusTracker.getExecutorInfos.length
          if (executorCount > 1) {
            if (executorCount == currentExecutorCount) {
              // heuristic: number is stable for 1 second, so it should be ok
              break
            } else {
              currentExecutorCount = executorCount
            }
          }
          Thread.sleep(1000) // 1 second
          retryCount = retryCount + 1
        }
      }
      currentExecutorCount - 1 // one is the driver
    }
  }

  def getDriverHost(spark: SparkSession): String = {
    spark.sparkContext.getConf.getOption("spark.driver.host").get
  }

  def getDriverNativeMemoryLimit(spark: SparkSession): Option[Long]  = {
    val optionalValue = spark.sparkContext.getConf.getOption("spark.driver.memoryOverhead")
    if (optionalValue.isDefined) { Some(parseMemoryOverHeadOption(optionalValue.get)) } else { None }
  }

  def getExecutorNativeMemoryLimit(spark: SparkSession): Option[Long] = {
    val optionalValue = spark.sparkContext.getConf.getOption("spark.executor.memoryOverhead")
    if (optionalValue.isDefined) { Some(parseMemoryOverHeadOption(optionalValue.get)) } else { None }
  }
}

private[spark] class SparkJobGroupLastStagesFailedStats(
  val failedAttempts : Int,
  val failedTasksInLastAttempt: Int
)

private[spark] object SparkJobGroupLastStagesFailedStats {
  def apply(sparkContext: SparkContext, jobGroup: String) : SparkJobGroupLastStagesFailedStats = {
    val statusTracker = sparkContext.statusTracker

    var failedAttempts = 0
    var failedTasksInLastAttempt = 0

    for (jobId <- statusTracker.getJobIdsForGroup(jobGroup)) {
      statusTracker.getJobInfo(jobId).foreach{
        case jobInfo : SparkJobInfo => {
          if (jobInfo.stageIds.nonEmpty) {
            statusTracker.getStageInfo(jobInfo.stageIds.max).foreach{
              case stageInfo : SparkStageInfo => {
                failedAttempts += stageInfo.currentAttemptId
                failedTasksInLastAttempt += stageInfo.numFailedTasks
              }
            }
          }
        }
      }
    }
    new SparkJobGroupLastStagesFailedStats(failedAttempts, failedTasksInLastAttempt)
  }
}