
/*
 * Tencent is pleased to support the open source community by making Angel available.
 *
 * Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 *
 * Licensed under the BSD 3-Clause License (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 *
 * https://opensource.org/licenses/BSD-3-Clause
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 *
 */
package com.tencent.angel.spark.examples.util

import scala.collection.mutable.ArrayBuffer
import scala.util.Random

import breeze.linalg.{DenseVector => BDV}
import breeze.optimize.DiffFunction
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.ml.linalg.{DenseVector, Vector}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

import com.tencent.angel.spark.models.vector.{BreezePSVector, RemotePSVector}
/**
 * Logistic Regression data generator and its DiffFunction implementations.
 * Three styles of DiffFunction are provided:
 *   1. pure Spark style, without PS
 *   2. PS mode, updating the PSVector through RemotePSVector.increment
 *   3. PS mode, updating the PSVector through RDDPSFunctions.psAggregate
 */
object Logistic {

  def generateLRData(sampleNum: Int, dim: Int, partitionNum: Int): RDD[(Vector, Double)] = {
    val spark = SparkSession.builder().getOrCreate()

    val rand = new Random(42)
    val initWeight = new DenseVector((0 until dim).map(_ => rand.nextGaussian()).toArray)
    val bcWeight = spark.sparkContext.broadcast(initWeight)

    spark.sparkContext.parallelize(0 until partitionNum, partitionNum)
      .flatMap { pid =>
        val rand = new Random(42 + pid)
        (0 until (sampleNum / partitionNum)).map { instanceId =>
          val featArray = (0 until dim).toArray.map(_ => rand.nextGaussian())
          val feat = new DenseVector(featArray)
          val score = (0 until dim).map(i => feat(i) * bcWeight.value(i)).sum
          val prob = 1.0 / (1.0 + math.exp(-1 * score))
          // Sample the label from Bernoulli(prob) with a uniform [0, 1) draw
          val label = if (rand.nextDouble() < prob) 1.0 else 0.0
          (feat, label)
        }
      }
  }
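
  // A minimal usage sketch (not part of the original example): generate a
  // small data set and inspect the label balance. The sizes here are
  // illustrative, not settings from the original code.
  def dataGenerationDemo(): Unit = {
    val data = generateLRData(sampleNum = 10000, dim = 100, partitionNum = 4)
    val positives = data.filter(_._2 > 0.5).count()
    println(s"positive ratio: ${positives.toDouble / data.count()}")
  }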
  case class Cost(trainData: RDD[(Vector, Double)]) extends DiffFunction[BDV[Double]] {

    def calculate(x: BDV[Double]): (Double, BDV[Double]) = {
      val sampleNum = trainData.count()

      val (cumGradient, cumLoss) = {
        val seqOp = (c: (BDV[Double], Double), point: (Vector, Double)) => {
          val (feat, label) = point
          val (combGrad, combLoss) = c
          val brzData = new BDV[Double](feat.toArray)
          val margin: Double = -1.0 * x.dot(brzData)
          val gradientMultiplier = (1.0 / (1.0 + math.exp(margin))) - label
          val gradient = brzData * gradientMultiplier
          val loss =
            if (label > 0) {
              // log1p(x) is log(1 + x), but more accurate for small x
              math.log1p(math.exp(margin))
            } else {
              math.log1p(math.exp(margin)) - margin
            }
          combGrad += gradient
          (combGrad, combLoss + loss)
        }
        val combOp = (c1: (BDV[Double], Double), c2: (BDV[Double], Double)) => {
          (c1._1 + c2._1, c1._2 + c2._2)
        }
        trainData.treeAggregate((new BDV[Double](x.length), 0.0))(seqOp, combOp)
      }

      val resGradient = new BDV[Double](cumGradient.toArray.map(_ / sampleNum))
      (cumLoss / sampleNum, resGradient)
    }
  }
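
  // A hedged sketch of how Cost can be minimized with breeze's LBFGS
  // (breeze.optimize.LBFGS); maxIter, m, and tolerance are illustrative
  // values, not settings from the original example.
  def runSparkLBFGS(trainData: RDD[(Vector, Double)], dim: Int): BDV[Double] = {
    val lbfgs = new breeze.optimize.LBFGS[BDV[Double]](maxIter = 100, m = 10, tolerance = 1e-6)
    lbfgs.minimize(Cost(trainData), BDV.zeros[Double](dim))
  }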
  case class PSCost(trainData: RDD[(Vector, Double)]) extends DiffFunction[BreezePSVector] {

    override def calculate(x: BreezePSVector): (Double, BreezePSVector) = {
      val localX = new BDV[Double](x.toRemote.pull())
      val bcX = trainData.sparkContext.broadcast(localX)
      val cumGradient = x.proxy.getPool().createZero().mkBreeze()
      val sampleNum = trainData.count()

      val cumLoss = trainData.mapPartitions { iter =>
        val lossArray = new ArrayBuffer[Double]()
        // Sum the per-sample gradients within this partition, then push the
        // partial sum to the PS with a single increment-and-flush
        val gradientSum = iter.map { case (feat, label) =>
          val brzData = new BDV[Double](feat.toArray)
          val margin: Double = -1.0 * bcX.value.dot(brzData)
          val gradientMultiplier = (1.0 / (1.0 + math.exp(margin))) - label
          val gradient = brzData * gradientMultiplier
          val loss =
            if (label > 0) {
              // log1p(x) is log(1 + x), but more accurate for small x
              math.log1p(math.exp(margin))
            } else {
              math.log1p(math.exp(margin)) - margin
            }
          lossArray += loss
          gradient
        }.reduce(_ + _)
        cumGradient.toRemote.incrementAndFlush(gradientSum.toArray)
        lossArray.toIterator
      }.sum()

      BreezePSVector.blas.scal(1.0 / sampleNum, cumGradient)
      (cumLoss / sampleNum, cumGradient)
    }
  }
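
  // A hedged sketch of driving a PS-backed cost with breeze's LBFGS. It
  // assumes BreezePSVector supplies the implicit algebra breeze optimizers
  // require (the reason these costs are typed DiffFunction[BreezePSVector]);
  // `initial` would typically come from the same PSVector pool as the model.
  def runPSLBFGS(
      cost: DiffFunction[BreezePSVector],
      initial: BreezePSVector): BreezePSVector = {
    val lbfgs = new breeze.optimize.LBFGS[BreezePSVector](maxIter = 100, m = 10, tolerance = 1e-6)
    lbfgs.minimize(cost, initial)
  }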
  case class PSAggregateCost(trainData: RDD[(Vector, Double)])
    extends DiffFunction[BreezePSVector] {

    case class Aggregator(bcX: Broadcast[BDV[Double]], remoteGradient: RemotePSVector)
      extends Serializable {

      private var lossSum = 0.0
      private var count = 0L

      def add(point: (Vector, Double)): Aggregator = {
        val brzFeat = new BDV[Double](point._1.toArray)
        val label = point._2
        val margin = -1.0 * bcX.value.dot(brzFeat)
        val multiplier = (1.0 / (1.0 + math.exp(margin))) - label
        val gradient = brzFeat * multiplier
        val loss = if (label > 0) {
          math.log1p(math.exp(margin))
        } else {
          math.log1p(math.exp(margin)) - margin
        }
        remoteGradient.increment(gradient.toArray)
        lossSum += loss
        count += 1
        this
      }

      def merge(other: Aggregator): Aggregator = {
        this.lossSum += other.lossSum
        this.count += other.count
        this
      }

      def loss: Double = lossSum / count

      def sampleNum: Long = count
    }

    override def calculate(x: BreezePSVector): (Double, BreezePSVector) = {
      import com.tencent.angel.spark.rdd.RDDPSFunctions._
      val pool = x.proxy.getPool()
      val localX = new BDV(x.toRemote.pull())
      val bcX = trainData.sparkContext.broadcast(localX)
      val cumGradient = pool.createZero().mkRemote()

      val aggregator = {
        val seqOp = (c: Aggregator, point: (Vector, Double)) => c.add(point)
        val combOp = (c1: Aggregator, c2: Aggregator) => c1.merge(c2)
        trainData.psAggregate(Aggregator(bcX, cumGradient))(seqOp, combOp)
      }

      // Scale the accumulated gradient by 1 / sampleNum so that it matches
      // the averaged loss, as PSCost does above
      val gradient = cumGradient.toBreeze
      BreezePSVector.blas.scal(1.0 / aggregator.sampleNum, gradient)
      (aggregator.loss, gradient)
    }
  }
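
  // PSAggregateCost plugs into the same hedged runner sketched after PSCost,
  // e.g. runPSLBFGS(PSAggregateCost(trainData), initial)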
}