
/*
* Tencent is pleased to support the open source community by making Angel available.
*
* Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License at
*
* https://opensource.org/licenses/Apache-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License
* is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing permissions and limitations under
* the License.
*
*/
package com.tencent.angel.spark.context
import java.util.UUID
import java.util.concurrent.ConcurrentHashMap
import com.tencent.angel.client.AngelContext
import com.tencent.angel.common.location.Location
import com.tencent.angel.conf.AngelConf
import com.tencent.angel.conf.AngelConf._
import com.tencent.angel.exception.AngelException
import com.tencent.angel.ml.matrix.{MatrixContext, MatrixMeta, RowType}
import com.tencent.angel.model.{ModelLoadContext, ModelSaveContext}
import com.tencent.angel.ps.ParameterServer
import com.tencent.angel.psagent.PSAgent
import com.tencent.angel.psagent.matrix.{MatrixClient, MatrixClientFactory}
import com.tencent.angel.spark.models.{PSMatrix, PSVector}
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.util.ShutdownHookManager
import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.{SparkConf, SparkEnv, TaskContext}
import scala.collection.JavaConverters._
import scala.collection.mutable.ArrayBuffer
import scala.collection.{Map, mutable}
import com.tencent.angel.ps.storage.partitioner.Partitioner
/**
* AngelPSContext for the driver and executors; it is an implementation of `PSContext`.
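*
* A minimal usage sketch (illustrative values; assumes the usual `PSContext.getOrCreate`
* entry point returns an `AngelPSContext` when running on Angel, and `sc` is a SparkContext):
* {{{
* val ps = PSContext.getOrCreate(sc).asInstanceOf[AngelPSContext]
* val vec = ps.createVector(dimension = 100L, range = 100L)
* ps.destroyVector(vec)
* }}}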
*/
class AngelPSContext(contextId: Int, angelCtx: AngelContext) extends PSContext {
import AngelPSContext._
private val matrixMetaMap = mutable.HashMap.empty[Int, MatrixMeta]
private val psVectorPools = new ConcurrentHashMap[Int, PSVectorPool]()
// Angel configuration that is delivered to executors by the Spark driver
private val angelConf = {
val map = mutable.HashMap.empty[String, String]
map.put(MASTER_IP, angelCtx.getMasterLocation.getIp)
map.put(MASTER_PORT, angelCtx.getMasterLocation.getPort.toString)
val hadoopConf = angelCtx.getConf
val keys = ArrayBuffer.empty[String]
for (entry <- hadoopConf.asScala) {
map.put(entry.getKey, entry.getValue)
keys += entry.getKey
}
map.put(CONF_KEYS, keys.mkString(";"))
map
}
private val psAgent = {
val masterIp = angelCtx.getMasterLocation.getIp
val masterPort = angelCtx.getMasterLocation.getPort
new PSAgent(angelCtx.getConf, masterIp, masterPort, contextId, false, null)
}
private var matrixCounter = 0
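/**
* Create a distributed PSVector of the given dimension and row type. The vector is
* allocated from a `PSVectorPool`, which is backed by a dense or sparse PS matrix
* depending on the row type. Driver-side only.
*/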
def createVector(dimension: Long,
t: RowType = RowType.T_DOUBLE_DENSE,
poolCapacity: Int = PSVectorPool.DEFAULT_POOL_CAPACITY,
range: Long,
additionalConfiguration:Map[String, String] = Map()): PSVector = {
assertCallByDriver("The operation of creating a vector can only be called on the driver side.")
createVectorPool(dimension, poolCapacity, t, range, additionalConfiguration).allocate()
}
private[spark] def createVectorPool(dimension: Long,
capacity: Int,
t: RowType,
range: Long,
additionalConfiguration:Map[String, String]): PSVectorPool = {
val thisCapacity = if (capacity > 0) capacity else PSVectorPool.DEFAULT_POOL_CAPACITY
val matrixMeta = if (t.isDense)
PSMatrix.dense(thisCapacity, dimension, -1, -1, t, additionalConfiguration)
else
PSMatrix.sparse(thisCapacity, dimension, range, t, additionalConfiguration)
val pool = new PSVectorPool(matrixMeta.id, dimension, thisCapacity, t)
psVectorPools.put(pool.id, pool)
pool
}
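// Create a dense matrix on the PS. If block sizes are not given, each block spans all rows
// and the column split is chosen so a block holds at most about 5,000,000 elements.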
def createDenseMatrix(rows: Int,
cols: Long,
rowInBlock: Int = -1,
colInBlock: Long = -1,
rowType: RowType = RowType.T_DOUBLE_DENSE,
additionalConfiguration: Map[String, String] = Map()): MatrixMeta = {
val maxRowNumInBlock = if (rowInBlock == -1) rows else rowInBlock
val maxBlockSize = 5000000
val maxColNumInBlock = if (colInBlock == -1) {
math.max(maxBlockSize / maxRowNumInBlock, 1L)
} else {
colInBlock
}
createMatrix(rows, cols, -1, maxRowNumInBlock, maxColNumInBlock, rowType, additionalConfiguration)
}
def createMatrix(rows: Int,
cols: Long,
validIndexNum: Long,
rowInBlock: Int,
colInBlock: Long,
rowType: RowType,
additionalConfiguration: Map[String, String] = Map()): MatrixMeta = {
assertCallByDriver("The operation of creating a matrix can only be called on the driver side.")
val mc = new MatrixContext(s"spark-$matrixCounter", rows, cols, validIndexNum,
rowInBlock, colInBlock, rowType)
additionalConfiguration.foreach { case (key, value) =>
mc.getAttributes.put(key, value)}
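// A custom partitioner can be passed through the attributes map; resolve the class
// reflectively, set it on the MatrixContext and drop the raw attribute.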
if (mc.getAttributes.containsKey(AngelConf.Angel_PS_PARTITION_CLASS)) {
val partitionClassName = mc.getAttributes.get(AngelConf.Angel_PS_PARTITION_CLASS)
mc.setPartitionerClass(Class.forName(partitionClassName).asInstanceOf[Class[Partitioner]])
mc.getAttributes.remove(AngelConf.Angel_PS_PARTITION_CLASS)
}
psAgent.createMatrix(mc, 5000L)
val meta = psAgent.getMatrix(mc.getName)
matrixCounter += 1
matrixMetaMap(meta.getId) = meta
meta
}
def getMatrixMeta(matrixId: Int): Option[MatrixMeta] = {
matrixMetaMap.get(matrixId)
}
/**
* Create a sparse long-key matrix.
*
* @param rows matrix row num
* @param cols index range of each row; if cols = -1 the range is
* (Long.MinValue, Long.MaxValue), otherwise it is [0, cols)
* @param validIndexNum estimated number of valid (non-zero) indexes in each row; -1 if unknown
* @param rowInBlock row number for each block
* @param colInBlock col number for each block
* @return the meta of the created matrix
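*
* A minimal sketch (illustrative sizes, assuming `ps` is a driver-side `AngelPSContext`):
* {{{
* val meta = ps.createSparseMatrix(rows = 10, cols = -1, validIndexNum = 1000000)
* }}}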
*/
def createSparseMatrix(rows: Int,
cols: Long,
validIndexNum: Long,
rowInBlock: Int = -1,
colInBlock: Long = -1,
rowType: RowType = RowType.T_DOUBLE_SPARSE,
additionalConfiguration: Map[String, String] = Map()): MatrixMeta = {
createMatrix(rows, cols, validIndexNum, rowInBlock, colInBlock, rowType, additionalConfiguration)
}
def duplicateVector(original: PSVector): PSVector = {
assertCallByDriver("The operation of duplicate a vector can only be called on the driver side.")
getPool(original.poolId).allocate().reset
}
def destroyVector(vector: PSVector): Unit = {
assertCallByDriver("The operation of destroying a vector can only be called on the driver side.")
getPool(vector.poolId).delete(vector)
}
private[spark] def getPool(id: Int): PSVectorPool = {
psVectorPools.get(id)
}
def destroyVectorPool(vector: PSVector): Unit = {
assertCallByDriver("The operation of destroying a matrix can only be called on the driver side.")
destroyVectorPool(vector.poolId)
}
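// Remove the pool from the local cache, release its vectors and drop the backing PS matrix.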
def destroyVectorPool(poolId: Int): Unit = {
val pool = psVectorPools.remove(poolId)
if (pool != null) {
pool.destroy()
}
// destroy the backing matrix
destroyMatrix(poolId)
}
def destroyMatrix(matrixId: Int): Unit = {
matrixMetaMap.remove(matrixId).foreach { x =>
psAgent.releaseMatrix(x.getId)
}
}
/**
* Refresh the matrix meta information for this PSAgent.
* This method pulls matrix information from the Angel master.
*/
override def refreshMatrix(): Unit = {
psAgent.refreshMatrixInfo()
}
def TOTAL_PS_CORES: Int = angelCtx.getConf.getInt(TOTAL_CORES, 2)
def getMatrixClient(matrixId: Int): MatrixClient = {
MatrixClientFactory.get(matrixId, getTaskId)
}
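// Start the PS agent for this JVM and register a shutdown hook (at a higher priority than
// the Hadoop FileSystem hook, so it runs first) that stops the agent on exit.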
psAgent.initAndStart()
ShutdownHookManager.get().addShutdownHook(
new Runnable {
def run(): Unit = psAgent.stop()
},
FileSystem.SHUTDOWN_HOOK_PRIORITY + 20
)
//========Init Part Finish======//
private def getTaskId: Int = {
val tc = TaskContext.get()
if (tc == null) {
-1
} else {
tc.partitionId()
}
}
protected def stop(): Unit = {
// snapshot the keys first, since destroyMatrix removes entries from matrixMetaMap
matrixMetaMap.keys.toArray.foreach(destroyMatrix)
if (psAgent != null) {
psAgent.stop()
}
}
private[spark] def conf: Map[String, String] = angelConf
override def createMatrix(mc: MatrixContext): MatrixMeta = {
//assertCallByDriver("The operation of creating a matrix can only be called on the driver side.")
psAgent.createMatrix(mc, 5000L)
val meta = psAgent.getMatrix(mc.getName)
matrixCounter += 1
matrixMetaMap(meta.getId) = meta
meta
}
override def save(ctx: ModelSaveContext): Unit = AngelPSContext.save(ctx)
override def checkpoint(checkpointId:Int, ctx: ModelSaveContext): Unit = AngelPSContext.checkpoint(checkpointId, ctx)
override def load(ctx: ModelLoadContext): Unit = AngelPSContext.load(ctx)
override def recover(checkpointId:Int, ctx: ModelLoadContext): Unit = AngelPSContext.recover(checkpointId, ctx)
}
object AngelPSContext {
private val MASTER_IP = "angel.master.ip"
private val MASTER_PORT = "angel.master.port"
private val TOTAL_CORES = "angel.ps.total.cores"
private val CONF_KEYS = "angel.conf.keys"
private var angelClient: com.tencent.angel.client.AngelPSClient = _
private var hookTask: Runnable = _
def apply(executorId: String, sparkConf: SparkConf): AngelPSContext = {
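// On the driver, launch the Angel PS cluster; on executors, only connect to the already
// running master using the location and conf keys delivered via task-local properties.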
if (executorId == "driver") {
println("from driver start Angel PS! ")
val angelContext = launchAngel(sparkConf)
new AngelPSContext(-1, angelContext)
} else {
val taskConf = SparkHadoopUtil.get.newConfiguration(sparkConf)
println("from executor to connect Angel PS! ")
val taskContext = TaskContext.get()
val ip = taskContext.getLocalProperty(MASTER_IP)
val port = taskContext.getLocalProperty(MASTER_PORT).toInt
val masterAddr = new Location(ip, port)
val keys = taskContext.getLocalProperty(CONF_KEYS).split(";")
for (key <- keys) {
taskConf.set(key, taskContext.getLocalProperty(key))
}
val angelContext = new AngelContext(masterAddr, taskConf)
new AngelPSContext(executorId.toInt - 1, angelContext)
}
}
def save(context: ModelSaveContext): Unit = {
angelClient.save(context)
}
def checkpoint(checkpointId:Int, context: ModelSaveContext): Unit = {
angelClient.checkpoint(checkpointId, context)
}
def load(context: ModelLoadContext): Unit = {
angelClient.load(context)
}
def recover(checkpointId:Int, context: ModelLoadContext): Unit = {
angelClient.recover(checkpointId, context)
}
//Start Angel
private def launchAngel(conf: SparkConf): AngelContext = {
angelClient = new com.tencent.angel.client.AngelPSClient(convertToHadoop(conf))
hookTask = new Runnable {
def run(): Unit = doStop()
}
ShutdownHookManager.get().addShutdownHook(hookTask,
FileSystem.SHUTDOWN_HOOK_PRIORITY + 10)
angelClient.startPS()
}
/**
* Convert SparkConf to Angel Configuration.
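*
* A minimal sketch (illustrative values; `spark.app.id` is normally set by a running SparkContext):
* {{{
* val conf = new SparkConf().setAppName("angel-demo").set("spark.app.id", "local-demo")
* conf.set("spark.ps.instances", "2").set("spark.ps.cores", "2").set("spark.ps.memory", "4g")
* val hadoopConf = AngelPSContext.convertToHadoop(conf)
* }}}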
*/
def convertToHadoop(conf: SparkConf): Configuration = {
val appName = conf.get("spark.app.name") + "-ps"
val queue = conf.get("spark.yarn.queue", "root.default")
/** mode: YARN, KUBERNETES or LOCAL */
val master = conf.getOption("spark.master")
val isLocal = master.isEmpty || master.get.toLowerCase.startsWith("local")
val deployMode = if (isLocal) "LOCAL" else conf.get("spark.ps.mode", DEFAULT_ANGEL_DEPLOY_MODE)
val masterMem = conf.getSizeAsGb("spark.angel.master.memory", "2g").toInt
val psNum = conf.getInt("spark.ps.instances", 1)
val psCores = conf.getInt("spark.ps.cores", 1)
val psMem = conf.getSizeAsGb("spark.ps.memory", "4g").toInt
val psOpts = conf.getOption("spark.ps.extraJavaOptions")
val psJars = conf.get("spark.ps.jars", "")
val psLogLevel = conf.get("spark.ps.log.level", "INFO")
val psClass = conf.get("spark.ps.class", classOf[ParameterServer].getName)
val defaultFS = conf.get("spark.hadoop.fs.defaultFS", "file://")
val tempPath = defaultFS + "/tmp/spark-on-angel/" + UUID.randomUUID()
val psOutOverwrite = conf.getBoolean("spark.ps.out.overwrite", defaultValue = true)
val psOutTmpOption = conf.getOption("spark.ps.out.tmp.path.prefix")
import com.tencent.angel.conf.AngelConf._
val hadoopConf = SparkHadoopUtil.get.newConfiguration(conf)
// setting running mode, app name, queue and deploy mode
hadoopConf.set(ANGEL_RUNNING_MODE, "ANGEL_PS")
hadoopConf.set(ANGEL_JOB_NAME, appName)
hadoopConf.set(ANGEL_QUEUE, queue)
hadoopConf.set(ANGEL_DEPLOY_MODE, deployMode)
// For local mode, set the heartbeat interval to a small value for fast debugging
if (deployMode == "LOCAL")
hadoopConf.set(ANGEL_PS_HEARTBEAT_INTERVAL_MS, "200")
if (psOpts.isDefined)
hadoopConf.set(ANGEL_PS_JAVA_OPTS, psOpts.get)
// Set the temp path as angel.save.model.path to satisfy the angel-ps system;
// the action type below is likewise a placeholder.
hadoopConf.set(ANGEL_ACTION_TYPE, "train")
hadoopConf.set(ANGEL_SAVE_MODEL_PATH, tempPath)
if (deployMode == "KUBERNETES") {
hadoopConf.set(ANGEL_KUBERNETES_MASTER, master.get.substring("k8s://".length))
}
// Setting resource
hadoopConf.setInt(ANGEL_AM_MEMORY_GB, masterMem)
hadoopConf.setInt(ANGEL_PS_NUMBER, psNum)
hadoopConf.setInt(ANGEL_PS_CPU_VCORES, psCores)
hadoopConf.setInt(ANGEL_PS_MEMORY_GB, psMem)
hadoopConf.setInt(TOTAL_CORES, psNum * psCores)
hadoopConf.set(ANGEL_AM_LOG_LEVEL, psLogLevel)
hadoopConf.set(ANGEL_PS_LOG_LEVEL, psLogLevel)
hadoopConf.set(ANGEL_PS_CLASS, psClass)
hadoopConf.set(ANGEL_JOB_LIBJARS, psJars)
hadoopConf.setBoolean(ANGEL_JOB_OUTPUT_PATH_DELETEONEXIST, psOutOverwrite)
psOutTmpOption.foreach(hadoopConf.set(ANGEL_JOB_TMP_OUTPUT_PATH_PREFIX, _))
// No need to sync clock values; SSP is not used in Spark.
hadoopConf.setInt(ANGEL_PSAGENT_CACHE_SYNC_TIMEINTERVAL_MS, 100000000)
hadoopConf.set(ANGEL_LOG_PATH, tempPath)
if (deployMode != "KUBERNETES") {
// add user resource files
addUserResourceFiles(conf, hadoopConf)
}
// Pass through any remaining angel.* and spark.angel.* settings
conf.getAllWithPrefix("angel").foreach {
case (key, value) => hadoopConf.set(s"angel$key", value)
}
conf.getAllWithPrefix("spark.angel").foreach {
case (key, value) => hadoopConf.set(s"angel$key", value)
}
hadoopConf
}
private def addUserResourceFiles(sparkConf: SparkConf, conf: Configuration): Unit = {
val appStagingBaseDir = sparkConf.getOption("spark.yarn.stagingDir")
.fold(FileSystem.get(conf).getHomeDirectory)(new Path(_))
val appStagingDir = new Path(appStagingBaseDir, s".sparkStaging/${sparkConf.getAppId}")
val resourceFiles = new ArrayBuffer[String]()
sparkConf.getOption("spark.yarn.dist.files").foreach { fileList =>
fileList.split(",").foreach { file =>
val fileName = file.trim.split("/").last
if (fileName.nonEmpty) {
resourceFiles.append(new Path(appStagingDir, fileName).toString)
}
}
conf.set(AngelConf.ANGEL_APP_USER_RESOURCE_FILES, resourceFiles.mkString(","))
}
}
private def assertCallByDriver(message: String): Unit = {
val executorId = SparkEnv.get.executorId
if (executorId != "driver")
throw new AngelException(s"executorId = $executorId, $message")
}
def stopAngel(): Unit = {
if (hookTask != null) {
ShutdownHookManager.get().removeShutdownHook(hookTask)
hookTask = null
}
doStop()
}
def doStop(): Unit = {
if (angelClient != null) {
angelClient.stopPS()
angelClient = null
}
}
/**
* Return whether the AngelPSClient is alive.
*/
def isAlive: Boolean = angelClient != null && hookTask != null
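/**
* Adjust the executor JVM options for PS workloads when `spark.ps.executor.extraJavaOptions`
* is not set: reserve 512MB of overhead, carve out 30% (20% without direct buffers) of the
* remainder for direct memory, shrink `spark.executor.memory` to the remaining heap, and
* append CMS GC and diagnostic flags.
*/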
def adjustExecutorJVM(conf: SparkConf): SparkConf = {
val extraOps = conf.getOption("spark.ps.executor.extraJavaOptions")
val defaultOps = conf.get("spark.executor.extraJavaOptions", "")
val extraOpsStr = if (extraOps.isDefined) {
extraOps.get
} else {
var executorMemSizeInMB = conf.getSizeAsMb("spark.executor.memory", "2048M")
if (executorMemSizeInMB < 2048) executorMemSizeInMB = 2048
val isUseDirect: Boolean = conf.getBoolean("spark.ps.usedirectbuffer", defaultValue = true)
val maxUse = executorMemSizeInMB - 512
val directRegionSize: Int =
if (isUseDirect) (maxUse * 0.3).toInt
else (maxUse * 0.2).toInt
val heapMax = maxUse - directRegionSize
val youngRegionSize = (heapMax * 0.3).toInt
val survivorRatio = 4
conf.set("spark.executor.memory", heapMax + "M")
val executorOps = new StringBuilder()
.append(" -Xmn").append(youngRegionSize).append("M")
.append(" -XX:MaxDirectMemorySize=").append(directRegionSize).append("M")
.append(" -XX:SurvivorRatio=").append(survivorRatio)
.append(" -XX:+AggressiveOpts")
.append(" -XX:+UseLargePages")
.append(" -XX:+UseConcMarkSweepGC")
.append(" -XX:CMSInitiatingOccupancyFraction=50")
.append(" -XX:+UseCMSInitiatingOccupancyOnly")
.append(" -XX:+CMSScavengeBeforeRemark")
.append(" -XX:+UseCMSCompactAtFullCollection")
.append(" -verbose:gc")
.append(" -XX:+PrintGCDateStamps")
.append(" -XX:+PrintGCDetails")
.append(" -XX:+PrintCommandLineFlags")
.append(" -XX:+PrintTenuringDistribution")
.append(" -XX:+PrintAdaptiveSizePolicy")
.toString()
executorOps
}
conf.set("spark.executor.extraJavaOptions", defaultOps + " " + extraOpsStr)
}
}