spark.jobserver.util.SparkJobUtils.scala
package spark.jobserver.util
import java.util.concurrent.TimeUnit
import com.typesafe.config.Config
import org.apache.spark.SparkConf
import scala.util.Try
/**
* Holds a few functions common to Job Server SparkJobs and SparkContexts.
*/
object SparkJobUtils {
import collection.JavaConverters._
/**
* Creates a SparkConf for initializing a SparkContext based on various configs.
* Note that anything in contextConfig whose key begins with "spark." gets
* put directly into the SparkConf.
*
* @param config the overall Job Server configuration (Typesafe Config)
* @param contextConfig the Typesafe Config specific to initializing this context
* (typically based on particular context/job)
* @param contextName the context name
* @return a SparkConf with everything properly configured
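*
* Illustrative usage sketch (not part of the original source); the job server
* config below is a hypothetical minimal one, assuming the default
* SparkMasterProvider, which reads "spark.master":
* {{{
*   import com.typesafe.config.ConfigFactory
*   val jobServerConfig = ConfigFactory.parseString("spark.master = \"local[4]\"")
*   val contextConfig = ConfigFactory.parseString("""
*     num-cpu-cores = 2
*     memory-per-node = "512m"
*     spark.serializer = "org.apache.spark.serializer.KryoSerializer"
*     passthrough.es.nodes = "localhost:9200"
*   """)
*   val conf = SparkJobUtils.configToSparkConf(jobServerConfig, contextConfig, "demo")
*   // conf.get("spark.cores.max")       == "2"
*   // conf.get("spark.executor.memory") == "512m"
*   // conf.get("spark.serializer")      == "org.apache.spark.serializer.KryoSerializer"
*   // conf.get("es.nodes")              == "localhost:9200"   ("passthrough." stripped)
* }}}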
*/
def configToSparkConf(config: Config, contextConfig: Config,
contextName: String): SparkConf = {
val sparkMaster = SparkMasterProvider.fromConfig(config).getSparkMaster(config)
val conf = new SparkConf()
conf
.setMaster(sparkMaster)
.setAppName(contextName)
for (cores <- Try(contextConfig.getInt("num-cpu-cores"))) {
conf.set("spark.cores.max", cores.toString)
}
// Should be a -Xmx-style string, e.g. "512m" or "1G"
for (nodeMemStr <- Try(contextConfig.getString("memory-per-node"))) {
conf.set("spark.executor.memory", nodeMemStr)
}
Try(config.getString("spark.home")).foreach { home => conf.setSparkHome(home) }
// Set the Jetty port to 0 to find a random port
conf.set("spark.ui.port", "0")
// Set spark broadcast factory in yarn-client mode
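// (the configured value is a factory class name, e.g. Spark 1.x's
// org.apache.spark.broadcast.TorrentBroadcastFactory)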
if (sparkMaster == "yarn-client") {
conf.set("spark.broadcast.factory", config.getString("spark.jobserver.yarn-broadcast-factory"))
}
// Set number of akka threads
// TODO: need to figure out how many extra threads spark needs, besides the job threads
conf.set("spark.akka.threads", (getMaxRunningJobs(config) + 4).toString)
// Set any other settings in context config that start with "spark"
for (e <- contextConfig.entrySet().asScala if e.getKey.startsWith("spark.")) {
conf.set(e.getKey, e.getValue.unwrapped.toString)
}
// Set any other settings in context config that start with "passthrough"
// These settings are set directly in the SparkConf, with the "passthrough."
// prefix stripped. This is useful for passing configuration to Hadoop
// connectors such as elasticsearch, cassandra, etc.
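// e.g. "passthrough.es.nodes = localhost:9200" ends up as es.nodes = localhost:9200 in the SparkConf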
for (e <- Try(contextConfig.getConfig("passthrough"))) {
e.entrySet().asScala.foreach { s =>
conf.set(s.getKey, s.getValue.unwrapped.toString)
}
}
conf
}
/**
* Returns the maximum number of jobs that can run at the same time in a context
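*
* Illustrative sketch (not part of the original source):
* {{{
*   import com.typesafe.config.ConfigFactory
*   val cfg = ConfigFactory.parseString("spark.jobserver.max-jobs-per-context = 8")
*   SparkJobUtils.getMaxRunningJobs(cfg)                    // => 8
*   SparkJobUtils.getMaxRunningJobs(ConfigFactory.empty())  // => number of available CPU cores
* }}}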
*/
def getMaxRunningJobs(config: Config): Int = {
val cpuCores = Runtime.getRuntime.availableProcessors
Try(config.getInt("spark.jobserver.max-jobs-per-context")).getOrElse(cpuCores)
}
/**
* Based on the value of "spark.master", returns the timeout in seconds for
* creating a SparkContext
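*
* Illustrative sketch (not part of the original source): durations use Typesafe
* Config's getDuration syntax, and missing keys fall back to 40 s (yarn-client)
* or 15 s (any other master):
* {{{
*   import com.typesafe.config.ConfigFactory
*   val cfg = ConfigFactory.parseString("""
*     spark.master = "yarn-client"
*     spark.jobserver.yarn-context-creation-timeout = 60 s
*   """)
*   SparkJobUtils.getContextTimeout(cfg)  // => 60
* }}}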
*/
def getContextTimeout(config: Config): Int = {
config.getString("spark.master") match {
case "yarn-client" =>
Try(config.getDuration("spark.jobserver.yarn-context-creation-timeout",
TimeUnit.MILLISECONDS).toInt / 1000).getOrElse(40)
case _ =>
Try(config.getDuration("spark.jobserver.context-creation-timeout",
TimeUnit.MILLISECONDS).toInt / 1000).getOrElse(15)
}
}
}