
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.flink.ml.recommendation
import java.{util, lang}
import org.apache.flink.api.common.operators.base.JoinOperatorBase.JoinHint
import org.apache.flink.api.scala._
import org.apache.flink.api.common.operators.Order
import org.apache.flink.core.memory.{DataOutputView, DataInputView}
import org.apache.flink.ml.common._
import org.apache.flink.ml.pipeline.{FitOperation, PredictDataSetOperation, Predictor}
import org.apache.flink.types.Value
import org.apache.flink.util.Collector
import org.apache.flink.api.common.functions.{Partitioner => FlinkPartitioner, GroupReduceFunction, CoGroupFunction}
import com.github.fommil.netlib.BLAS.{ getInstance => blas }
import com.github.fommil.netlib.LAPACK.{ getInstance => lapack }
import org.netlib.util.intW
import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer
import scala.util.Random
/** Alternating least squares algorithm to calculate a matrix factorization.
*
* Given a matrix `R`, ALS calculates two matrices `U` and `V` such that `R ~~ U^TV`. The
* unknown row dimension is given by the number of latent factors. Since matrix factorization
* is often used in the context of recommendation, we'll call the first matrix the user and the
* second matrix the item matrix. The `i`th column of the user matrix is `u_i` and the `i`th
* column of the item matrix is `v_i`. The matrix `R` is called the ratings matrix and
* `(R)_{i,j} = r_{i,j}`.
*
* In order to find the user and item matrix, the following problem is solved:
*
* `argmin_{U,V} sum_(i,j\ with\ r_{i,j} != 0) (r_{i,j} - u_{i}^Tv_{j})^2 +
* lambda (sum_(i) n_{u_i} ||u_i||^2 + sum_(j) n_{v_j} ||v_j||^2)`
*
* with `\lambda` being the regularization factor, `n_{u_i}` being the number of items the user `i`
* has rated and `n_{v_j}` being the number of times the item `j` has been rated. This
* regularization scheme, which avoids overfitting, is called weighted-lambda-regularization. Details
* can be found in the work of [[http://dx.doi.org/10.1007/978-3-540-68880-8_32 Zhou et al.]].
*
* By fixing one of the matrices `U` or `V` one obtains a quadratic form which can be solved. The
* solution of the modified problem is guaranteed to decrease the overall cost function. By
* applying this step alternately to the matrices `U` and `V`, we can iteratively improve the
* matrix factorization.
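*
* Fixing the item matrix `V`, each user vector `u_i` is obtained by solving the normal
* equations
*
* `(sum_(j\ with\ r_{i,j} != 0) v_j v_j^T + lambda n_{u_i} I) u_i =
* sum_(j\ with\ r_{i,j} != 0) r_{i,j} v_j`
*
* and symmetrically for the item vectors when `U` is fixed. The implementation assembles
* these per-vector systems blockwise and solves them with LAPACK's `dposv`.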
*
* The matrix `R` is given in its sparse representation as a tuple of `(i, j, r)` where `i` is the
* row index, `j` is the column index and `r` is the matrix value at position `(i,j)`.
*
* @example
* {{{
* val inputDS: DataSet[(Int, Int, Double)] = env.readCsvFile[(Int, Int, Double)](
* pathToTrainingFile)
*
* val als = ALS()
* .setIterations(10)
* .setNumFactors(10)
*
* als.fit(inputDS)
*
* val data2Predict: DataSet[(Int, Int)] = env.readCsvFile[(Int, Int)](pathToData)
*
* als.predict(data2Predict)
* }}}
*
* =Parameters=
*
* - [[org.apache.flink.ml.recommendation.ALS.NumFactors]]:
* The number of latent factors. It is the dimension of the calculated user and item vectors.
* (Default value: '''10''')
*
* - [[org.apache.flink.ml.recommendation.ALS.Lambda]]:
* Regularization factor. Tune this value in order to avoid overfitting or poor generalization.
* (Default value: '''1''')
*
* - [[org.apache.flink.ml.recommendation.ALS.Iterations]]:
* The number of iterations to perform. (Default value: '''10''')
*
* - [[org.apache.flink.ml.recommendation.ALS.Blocks]]:
* The number of blocks into which the user and item matrix are grouped. The fewer
* blocks one uses, the less data is sent redundantly. However, bigger blocks entail bigger
* update messages which have to be stored on the heap. If the algorithm fails because of
* an OutOfMemoryException, then try to increase the number of blocks. (Default value: '''None''')
*
* - [[org.apache.flink.ml.recommendation.ALS.Seed]]:
* Random seed used to generate the initial item matrix for the algorithm.
* (Default value: '''0''')
*
* - [[org.apache.flink.ml.recommendation.ALS.TemporaryPath]]:
* Path to a temporary directory into which intermediate results are stored. If
* this value is set, then the algorithm is split into two preprocessing steps, the ALS iteration,
* and a post-processing step which calculates a last ALS half-step. The preprocessing steps
* calculate the [[org.apache.flink.ml.recommendation.ALS.OutBlockInformation]] and
* [[org.apache.flink.ml.recommendation.ALS.InBlockInformation]] for the given rating matrix.
* The result of the individual steps are stored in the specified directory. By splitting the
* algorithm into multiple smaller steps, Flink does not have to split the available memory
* amongst too many operators. This allows the system to process bigger individual messages and
* improves the overall performance. (Default value: '''None''')
*
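* A configuration sketch combining these parameters (the concrete values are illustrative
* and not tuned for any particular data set):
*
* {{{
* val als = ALS()
* .setNumFactors(40)
* .setIterations(20)
* .setLambda(0.01)
* .setBlocks(100)
* .setSeed(42L)
* .setTemporaryPath("/tmp/als/")
* }}}
*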
* The ALS implementation is based on Spark's MLlib implementation of ALS which you can find
* [[https://github.com/apache/spark/blob/master/mllib/src/main/scala/org/apache/spark/mllib/
* recommendation/ALS.scala here]].
*/
class ALS extends Predictor[ALS] {
import ALS._
// Stores the matrix factorization after the fitting phase
var factorsOption: Option[(DataSet[Factors], DataSet[Factors])] = None
/** Sets the number of latent factors/row dimension of the latent model
*
* @param numFactors
* @return
*/
def setNumFactors(numFactors: Int): ALS = {
parameters.add(NumFactors, numFactors)
this
}
/** Sets the regularization coefficient lambda
*
* @param lambda
* @return
*/
def setLambda(lambda: Double): ALS = {
parameters.add(Lambda, lambda)
this
}
/** Sets the number of iterations of the ALS algorithm
*
* @param iterations
* @return
*/
def setIterations(iterations: Int): ALS = {
parameters.add(Iterations, iterations)
this
}
/** Sets the number of blocks into which the user and item matrix shall be partitioned
*
* @param blocks
* @return
*/
def setBlocks(blocks: Int): ALS = {
parameters.add(Blocks, blocks)
this
}
/** Sets the random seed for the initial item matrix initialization
*
* @param seed
* @return
*/
def setSeed(seed: Long): ALS = {
parameters.add(Seed, seed)
this
}
/** Sets the temporary path into which intermediate results are written in order to increase
* performance.
*
* @param temporaryPath
* @return
*/
def setTemporaryPath(temporaryPath: String): ALS = {
parameters.add(TemporaryPath, temporaryPath)
this
}
/** Empirical risk of the trained model (matrix factorization).
*
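* A usage sketch, assuming a fitted `als` instance and an `env: ExecutionEnvironment`
* (the file path is hypothetical):
* {{{
* val testDS: DataSet[(Int, Int, Double)] = env.readCsvFile[(Int, Int, Double)](pathToTestFile)
* val risk: DataSet[Double] = als.empiricalRisk(testDS)
* }}}
*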
* @param labeledData Reference data
* @param riskParameters Additional parameters for the empirical risk calculation
* @return
*/
def empiricalRisk(
labeledData: DataSet[(Int, Int, Double)],
riskParameters: ParameterMap = ParameterMap.Empty)
: DataSet[Double] = {
val resultingParameters = parameters ++ riskParameters
val lambda = resultingParameters(Lambda)
val data = labeledData map {
x => (x._1, x._2)
}
factorsOption match {
case Some((userFactors, itemFactors)) => {
val predictions = data.join(userFactors, JoinHint.REPARTITION_HASH_SECOND).where(0)
.equalTo(0).join(itemFactors, JoinHint.REPARTITION_HASH_SECOND).where("_1._2")
.equalTo(0).map {
triple => {
val (((uID, iID), uFactors), iFactors) = triple
val uFactorsVector = uFactors.factors
val iFactorsVector = iFactors.factors
val squaredUNorm2 = blas.ddot(
uFactorsVector.length,
uFactorsVector,
1,
uFactorsVector,
1)
val squaredINorm2 = blas.ddot(
iFactorsVector.length,
iFactorsVector,
1,
iFactorsVector,
1)
val prediction = blas.ddot(uFactorsVector.length, uFactorsVector, 1, iFactorsVector, 1)
(uID, iID, prediction, squaredUNorm2, squaredINorm2)
}
}
labeledData.join(predictions).where(0,1).equalTo(0,1) {
(left, right) => {
val (_, _, expected) = left
val (_, _, predicted, squaredUNorm2, squaredINorm2) = right
val residual = expected - predicted
residual * residual + lambda * (squaredUNorm2 + squaredINorm2)
}
} reduce {
_ + _
}
}
case None => throw new RuntimeException("The ALS model has not been fitted to data. " +
"Prior to predicting values, it has to be trained on data.")
}
}
}
object ALS {
val USER_FACTORS_FILE = "userFactorsFile"
val ITEM_FACTORS_FILE = "itemFactorsFile"
// ========================================= Parameters ==========================================
case object NumFactors extends Parameter[Int] {
val defaultValue: Option[Int] = Some(10)
}
case object Lambda extends Parameter[Double] {
val defaultValue: Option[Double] = Some(1.0)
}
case object Iterations extends Parameter[Int] {
val defaultValue: Option[Int] = Some(10)
}
case object Blocks extends Parameter[Int] {
val defaultValue: Option[Int] = None
}
case object Seed extends Parameter[Long] {
val defaultValue: Option[Long] = Some(0L)
}
case object TemporaryPath extends Parameter[String] {
val defaultValue: Option[String] = None
}
// ==================================== ALS type definitions =====================================
/** Representation of a user-item rating
*
* @param user User ID of the rating user
* @param item Item ID of the rated item
* @param rating Rating value
*/
case class Rating(user: Int, item: Int, rating: Double)
/** Latent factor model vector
*
* @param id
* @param factors
*/
case class Factors(id: Int, factors: Array[Double]) {
override def toString = s"($id, ${factors.mkString(",")})"
}
case class Factorization(userFactors: DataSet[Factors], itemFactors: DataSet[Factors])
case class OutBlockInformation(elementIDs: Array[Int], outLinks: OutLinks) {
override def toString: String = {
s"OutBlockInformation:((${elementIDs.mkString(",")}), ($outLinks))"
}
}
class OutLinks(var links: Array[scala.collection.mutable.BitSet]) extends Value {
def this() = this(null)
override def toString: String = {
s"${links.mkString("\n")}"
}
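// Serialization format: the number of bitsets, followed by each bitset encoded as the
// length of its long[] bit mask and the mask words themselves (see write/read below)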
override def write(out: DataOutputView): Unit = {
out.writeInt(links.length)
links foreach {
link => {
val bitMask = link.toBitMask
out.writeInt(bitMask.length)
for (element <- bitMask) {
out.writeLong(element)
}
}
}
}
override def read(in: DataInputView): Unit = {
val length = in.readInt()
links = new Array[scala.collection.mutable.BitSet](length)
for (i <- 0 until length) {
val bitMaskLength = in.readInt()
val bitMask = new Array[Long](bitMaskLength)
for (j <- 0 until bitMaskLength) {
bitMask(j) = in.readLong()
}
links(i) = mutable.BitSet.fromBitMask(bitMask)
}
}
def apply(idx: Int) = links(idx)
}
case class InBlockInformation(elementIDs: Array[Int], ratingsForBlock: Array[BlockRating]) {
override def toString: String = {
s"InBlockInformation:((${elementIDs.mkString(",")}), (${ratingsForBlock.mkString("\n")}))"
}
}
case class BlockRating(var ratings: Array[(Array[Int], Array[Double])]) {
def apply(idx: Int) = ratings(idx)
override def toString: String = {
ratings.map {
case (left, right) => s"((${left.mkString(",")}),(${right.mkString(",")}))"
}.mkString(",")
}
}
case class BlockedFactorization(userFactors: DataSet[(Int, Array[Array[Double]])],
itemFactors: DataSet[(Int, Array[Array[Double]])])
class BlockIDPartitioner extends FlinkPartitioner[Int] {
override def partition(blockID: Int, numberOfPartitions: Int): Int = {
blockID % numberOfPartitions
}
}
class BlockIDGenerator(blocks: Int) extends Serializable {
def apply(id: Int): Int = {
id % blocks
}
}
// ================================= Factory methods =============================================
def apply(): ALS = {
new ALS()
}
// ===================================== Operations ==============================================
/** Predict operation which calculates the matrix entry for the given indices */
implicit val predictRating = new PredictDataSetOperation[ALS, (Int, Int), (Int, Int, Double)] {
override def predictDataSet(
instance: ALS,
predictParameters: ParameterMap,
input: DataSet[(Int, Int)])
: DataSet[(Int, Int, Double)] = {
instance.factorsOption match {
case Some((userFactors, itemFactors)) => {
input.join(userFactors, JoinHint.REPARTITION_HASH_SECOND).where(0).equalTo(0)
.join(itemFactors, JoinHint.REPARTITION_HASH_SECOND).where("_1._2").equalTo(0).map {
triple => {
val (((uID, iID), uFactors), iFactors) = triple
val uFactorsVector = uFactors.factors
val iFactorsVector = iFactors.factors
val prediction = blas.ddot(
uFactorsVector.length,
uFactorsVector,
1,
iFactorsVector,
1)
(uID, iID, prediction)
}
}
}
case None => throw new RuntimeException("The ALS model has not been fitted to data. " +
"Prior to predicting values, it has to be trained on data.")
}
}
}
/** Calculates the matrix factorization for the given ratings. A rating is defined as
* a tuple of user ID, item ID and the corresponding rating.
*
* @return Factorization containing the user and item matrix
*/
implicit val fitALS = new FitOperation[ALS, (Int, Int, Double)] {
override def fit(
instance: ALS,
fitParameters: ParameterMap,
input: DataSet[(Int, Int, Double)])
: Unit = {
val resultParameters = instance.parameters ++ fitParameters
val userBlocks = resultParameters.get(Blocks).getOrElse(input.count.toInt)
val itemBlocks = userBlocks
val persistencePath = resultParameters.get(TemporaryPath)
val seed = resultParameters(Seed)
val factors = resultParameters(NumFactors)
val iterations = resultParameters(Iterations)
val lambda = resultParameters(Lambda)
val ratings = input.map {
entry => {
val (userID, itemID, rating) = entry
Rating(userID, itemID, rating)
}
}
val blockIDPartitioner = new BlockIDPartitioner()
val ratingsByUserBlock = ratings.map{
rating =>
val blockID = rating.user % userBlocks
(blockID, rating)
} partitionCustom(blockIDPartitioner, 0)
val ratingsByItemBlock = ratings map {
rating =>
val blockID = rating.item % itemBlocks
(blockID, new Rating(rating.item, rating.user, rating.rating))
} partitionCustom(blockIDPartitioner, 0)
val (uIn, uOut) = createBlockInformation(userBlocks, itemBlocks, ratingsByUserBlock,
blockIDPartitioner)
val (iIn, iOut) = createBlockInformation(itemBlocks, userBlocks, ratingsByItemBlock,
blockIDPartitioner)
val (userIn, userOut) = persistencePath match {
case Some(path) => FlinkMLTools.persist(uIn, uOut, path + "userIn", path + "userOut")
case None => (uIn, uOut)
}
val (itemIn, itemOut) = persistencePath match {
case Some(path) => FlinkMLTools.persist(iIn, iOut, path + "itemIn", path + "itemOut")
case None => (iIn, iOut)
}
val initialItems = itemOut.partitionCustom(blockIDPartitioner, 0).map{
outInfos =>
val blockID = outInfos._1
val infos = outInfos._2
(blockID, infos.elementIDs.map{
id =>
val random = new Random(id ^ seed)
randomFactors(factors, random)
})
}.withForwardedFields("0")
// iteration to calculate the item matrix
val items = initialItems.iterate(iterations) {
items => {
val users = updateFactors(userBlocks, items, itemOut, userIn, factors, lambda,
blockIDPartitioner)
updateFactors(itemBlocks, users, userOut, itemIn, factors, lambda, blockIDPartitioner)
}
}
val pItems = persistencePath match {
case Some(path) => FlinkMLTools.persist(items, path + "items")
case None => items
}
// perform last half-step to calculate the user matrix
val users = updateFactors(userBlocks, pItems, itemOut, userIn, factors, lambda,
blockIDPartitioner)
instance.factorsOption = Some((
unblock(users, userOut, blockIDPartitioner),
unblock(pItems, itemOut, blockIDPartitioner)))
}
}
/** Calculates a single half step of the ALS optimization. The result is the new value for
* either the user or item matrix, depending on which matrix the method was called with.
*
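* For every vector of the updated block, the incoming fixed vectors `x` are accumulated
* into the normal equations `X^TX + lambda * n * I` and `X^Ty`, which are then solved with
* LAPACK's `dposv`.
*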
* @param numUserBlocks Number of blocks in the respective dimension
* @param items Fixed matrix value for the half step
* @param itemOut Out information to know where to send the vectors
* @param userIn In information for the cogroup step
* @param factors Number of latent factors
* @param lambda Regularization constant
* @param blockIDPartitioner Custom Flink partitioner
* @return New value for the optimized matrix (either user or item)
*/
def updateFactors(numUserBlocks: Int,
items: DataSet[(Int, Array[Array[Double]])],
itemOut: DataSet[(Int, OutBlockInformation)],
userIn: DataSet[(Int, InBlockInformation)],
factors: Int,
lambda: Double, blockIDPartitioner: FlinkPartitioner[Int]):
DataSet[(Int, Array[Array[Double]])] = {
// send the item vectors to the blocks whose users have rated the items
val partialBlockMsgs = itemOut.join(items).where(0).equalTo(0).
withPartitioner(blockIDPartitioner).apply {
(left, right, col: Collector[(Int, Int, Array[Array[Double]])]) => {
val blockID = left._1
val outInfo = left._2
val factors = right._2
var userBlock = 0
var itemIdx = 0
while(userBlock < numUserBlocks){
itemIdx = 0
val buffer = new ArrayBuffer[Array[Double]]
while(itemIdx < outInfo.elementIDs.length){
if(outInfo.outLinks(userBlock)(itemIdx)){
buffer += factors(itemIdx)
}
itemIdx += 1
}
if(buffer.nonEmpty){
// send update message to userBlock
col.collect((userBlock, blockID, buffer.toArray))
}
userBlock += 1
}
}
}
// collect the partial update messages and calculate for each user block the new user vectors
partialBlockMsgs.coGroup(userIn).where(0).equalTo(0).sortFirstGroup(1, Order.ASCENDING).
withPartitioner(blockIDPartitioner).apply{
new CoGroupFunction[(Int, Int, Array[Array[Double]]), (Int,
InBlockInformation), (Int, Array[Array[Double]])](){
// in order to save space, store only the upper triangle of the XtX matrix
val triangleSize = (factors*factors - factors)/2 + factors
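// e.g. for factors = 3 the packed triangle holds the entries
// (0,0), (1,0), (1,1), (2,0), (2,1), (2,2) in this order (cf. outerProduct)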
val matrix = Array.fill(triangleSize)(0.0)
val fullMatrix = Array.fill(factors * factors)(0.0)
val userXtX = new ArrayBuffer[Array[Double]]()
val userXy = new ArrayBuffer[Array[Double]]()
val numRatings = new ArrayBuffer[Int]()
override def coGroup(left: lang.Iterable[(Int, Int, Array[Array[Double]])],
right: lang.Iterable[(Int, InBlockInformation)],
collector: Collector[(Int, Array[Array[Double]])]): Unit = {
// there is only one InBlockInformation per user block
val inInfo = right.iterator().next()._2
val updates = left.iterator()
val numUsers = inInfo.elementIDs.length
var blockID = -1
var i = 0
// clear old matrices and allocate new ones
val matricesToClear = if (numUsers > userXtX.length) {
val oldLength = userXtX.length
while(i < (numUsers - oldLength)) {
userXtX += Array.fill(triangleSize)(0.0)
userXy += Array.fill(factors)(0.0)
numRatings.+=(0)
i += 1
}
oldLength
} else {
numUsers
}
i = 0
while(i < matricesToClear){
numRatings(i) = 0
util.Arrays.fill(userXtX(i), 0.0)
util.Arrays.fill(userXy(i), 0.0)
i += 1
}
var itemBlock = 0
// build XtX matrices and Xy vector
while(updates.hasNext){
val update = updates.next()
val blockFactors = update._3
blockID = update._1
var p = 0
while(p < blockFactors.length){
val vector = blockFactors(p)
outerProduct(vector, matrix, factors)
val (users, ratings) = inInfo.ratingsForBlock(itemBlock)(p)
var i = 0
while (i < users.length) {
numRatings(users(i)) += 1
blas.daxpy(matrix.length, 1, matrix, 1, userXtX(users(i)), 1)
blas.daxpy(vector.length, ratings(i), vector, 1, userXy(users(i)), 1)
i += 1
}
p += 1
}
itemBlock += 1
}
val array = new Array[Array[Double]](numUsers)
i = 0
while(i < numUsers){
generateFullMatrix(userXtX(i), fullMatrix, factors)
var f = 0
// add regularization constant
while(f < factors){
fullMatrix(f*factors + f) += lambda * numRatings(i)
f += 1
}
// calculate new user vector
val result = new intW(0)
lapack.dposv("U", factors, 1, fullMatrix, factors, userXy(i), factors, result)
array(i) = userXy(i)
i += 1
}
collector.collect((blockID, array))
}
}
}.withForwardedFieldsFirst("0").withForwardedFieldsSecond("0")
}
/** Creates the meta information needed to route the item and user vectors to the respective user
* and item blocks.
*
* @param userBlocks
* @param itemBlocks
* @param ratings
* @param blockIDPartitioner
* @return
*/
def createBlockInformation(userBlocks: Int, itemBlocks: Int, ratings: DataSet[(Int, Rating)],
blockIDPartitioner: BlockIDPartitioner):
(DataSet[(Int, InBlockInformation)], DataSet[(Int, OutBlockInformation)]) = {
val blockIDGenerator = new BlockIDGenerator(itemBlocks)
val usersPerBlock = createUsersPerBlock(ratings)
val outBlockInfos = createOutBlockInformation(ratings, usersPerBlock, itemBlocks,
blockIDGenerator)
val inBlockInfos = createInBlockInformation(ratings, usersPerBlock, blockIDGenerator)
(inBlockInfos, outBlockInfos)
}
/** Collects the distinct user IDs of each user block in ascending order
*
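* For example, the block-partitioned ratings `(0, Rating(4, 1, 1.0))`, `(0, Rating(2, 5, 2.0))`
* and `(0, Rating(4, 3, 3.0))` yield the entry `(0, Array(2, 4))`.
*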
* @param ratings
* @return
*/
def createUsersPerBlock(ratings: DataSet[(Int, Rating)]): DataSet[(Int, Array[Int])] = {
ratings.map{ x => (x._1, x._2.user)}.withForwardedFields("0").groupBy(0).
sortGroup(1, Order.ASCENDING).reduceGroup {
users => {
val result = ArrayBuffer[Int]()
var id = -1
var oldUser = -1
while(users.hasNext) {
val user = users.next()
id = user._1
if (user._2 != oldUser) {
result.+=(user._2)
oldUser = user._2
}
}
val userIDs = result.toArray
(id, userIDs)
}
}.withForwardedFields("0")
}
/** Creates the outgoing block information
*
* Creates for every user block the outgoing block information. The out block information
* contains for every item block a [[scala.collection.mutable.BitSet]] which indicates which
* user vector has to be sent to this block. If a vector v has to be sent to a block b, then
* bitsets(b)'s bit v is set to 1, otherwise 0. Additionally, the user IDs are replaced by
* the user vector's index value.
*
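* For example, if a user block contains the users `Array(2, 4)` (positions 0 and 1) and only
* user 4 has rated items belonging to item block 1, then `bitsets(1)` contains exactly the
* bit 1.
*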
* @param ratings
* @param usersPerBlock
* @param itemBlocks
* @param blockIDGenerator
* @return
*/
def createOutBlockInformation(ratings: DataSet[(Int, Rating)],
usersPerBlock: DataSet[(Int, Array[Int])],
itemBlocks: Int, blockIDGenerator: BlockIDGenerator):
DataSet[(Int, OutBlockInformation)] = {
ratings.coGroup(usersPerBlock).where(0).equalTo(0).apply {
(ratings, users) =>
val userIDs = users.next()._2
val numUsers = userIDs.length
val userIDToPos = userIDs.zipWithIndex.toMap
val shouldDataSend = Array.fill(itemBlocks)(new scala.collection.mutable.BitSet(numUsers))
var blockID = -1
while (ratings.hasNext) {
val r = ratings.next()
val pos =
try {
userIDToPos(r._2.user)
}catch{
case e: NoSuchElementException =>
throw new RuntimeException(s"Key ${r._2.user} not found. BlockID $blockID. " +
s"Elements in block ${userIDs.take(5).mkString(", ")}. " +
s"UserIDList contains ${userIDs.contains(r._2.user)}.", e)
}
blockID = r._1
shouldDataSend(blockIDGenerator(r._2.item))(pos) = true
}
(blockID, OutBlockInformation(userIDs, new OutLinks(shouldDataSend)))
}.withForwardedFieldsFirst("0").withForwardedFieldsSecond("0")
}
/** Creates the incoming block information
*
* Creates for every user block the incoming block information. The incoming block information
* contains the userIDs of the users in the respective block and for every item block a
* BlockRating instance. The BlockRating instance describes for every incoming set of item
* vectors of an item block, which users rated these items and what the ratings were. For that
* purpose it contains for every incoming item vector a tuple of an ID array `us` and a rating
* array `rs`. The array `us` contains the indices of the users having rated the respective
* item vector with the ratings in `rs`.
*
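* For example, if the users at positions 1 and 3 of a user block rated an incoming item
* vector with 2.0 and 4.0 respectively, the corresponding tuple is
* `(Array(1, 3), Array(2.0, 4.0))`.
*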
* @param ratings
* @param usersPerBlock
* @param blockIDGenerator
* @return
*/
def createInBlockInformation(ratings: DataSet[(Int, Rating)],
usersPerBlock: DataSet[(Int, Array[Int])],
blockIDGenerator: BlockIDGenerator):
DataSet[(Int, InBlockInformation)] = {
// Group for every user block the users which have rated the same item and collect their ratings
val partialInInfos = ratings.map { x => (x._1, x._2.item, x._2.user, x._2.rating)}
.withForwardedFields("0").groupBy(0, 1).reduceGroup {
x =>
var userBlockID = -1
var itemID = -1
val userIDs = ArrayBuffer[Int]()
val ratings = ArrayBuffer[Double]()
while (x.hasNext) {
val (uBlockID, item, user, rating) = x.next
userBlockID = uBlockID
itemID = item
userIDs += user
ratings += rating
}
(userBlockID, blockIDGenerator(itemID), itemID, (userIDs.toArray, ratings.toArray))
}.withForwardedFields("0")
// Aggregate all ratings for items belonging to the same item block. Sort ascending with
// respect to the itemID, because later the item vectors of the update message are sorted
// accordingly.
val collectedPartialInfos = partialInInfos.groupBy(0, 1).sortGroup(2, Order.ASCENDING).
reduceGroup {
new GroupReduceFunction[(Int, Int, Int, (Array[Int], Array[Double])), (Int,
Int, Array[(Array[Int], Array[Double])])](){
val buffer = new ArrayBuffer[(Array[Int], Array[Double])]
override def reduce(iterable: lang.Iterable[(Int, Int, Int, (Array[Int],
Array[Double]))], collector: Collector[(Int, Int, Array[(Array[Int],
Array[Double])])]): Unit = {
val infos = iterable.iterator()
var counter = 0
var blockID = -1
var itemBlockID = -1
while (infos.hasNext && counter < buffer.length) {
val info = infos.next()
blockID = info._1
itemBlockID = info._2
buffer(counter) = info._4
counter += 1
}
while (infos.hasNext) {
val info = infos.next()
blockID = info._1
itemBlockID = info._2
buffer += info._4
counter += 1
}
val array = new Array[(Array[Int], Array[Double])](counter)
buffer.copyToArray(array)
collector.collect((blockID, itemBlockID, array))
}
}
}.withForwardedFields("0", "1")
// Aggregate all item block ratings with respect to their user block ID. Sort the blocks with
// respect to their itemBlockID, because the block update messages are sorted the same way
collectedPartialInfos.coGroup(usersPerBlock).where(0).equalTo(0).
sortFirstGroup(1, Order.ASCENDING).apply{
new CoGroupFunction[(Int, Int, Array[(Array[Int], Array[Double])]),
(Int, Array[Int]), (Int, InBlockInformation)] {
val buffer = ArrayBuffer[BlockRating]()
override def coGroup(partialInfosIterable:
lang.Iterable[(Int, Int, Array[(Array[Int], Array[Double])])],
userIterable: lang.Iterable[(Int, Array[Int])],
collector: Collector[(Int, InBlockInformation)]): Unit = {
val users = userIterable.iterator()
val partialInfos = partialInfosIterable.iterator()
val userWrapper = users.next()
val id = userWrapper._1
val userIDs = userWrapper._2
val userIDToPos = userIDs.zipWithIndex.toMap
var counter = 0
while (partialInfos.hasNext && counter < buffer.length) {
val partialInfo = partialInfos.next()
// entry contains the ratings and userIDs of a complete item block
val entry = partialInfo._3
// transform userIDs to positional indices
for (row <- 0 until entry.length; col <- 0 until entry(row)._1.length) {
entry(row)._1(col) = userIDToPos(entry(row)._1(col))
}
buffer(counter).ratings = entry
counter += 1
}
while (partialInfos.hasNext) {
val partialInfo = partialInfos.next()
// entry contains the ratings and userIDs of a complete item block
val entry = partialInfo._3
// transform userIDs to positional indices
for (row <- 0 until entry.length; col <- 0 until entry(row)._1.length) {
entry(row)._1(col) = userIDToPos(entry(row)._1(col))
}
buffer += new BlockRating(entry)
counter += 1
}
val array = new Array[BlockRating](counter)
buffer.copyToArray(array)
collector.collect((id, InBlockInformation(userIDs, array)))
}
}
}.withForwardedFieldsFirst("0").withForwardedFieldsSecond("0")
}
/** Unblocks the blocked user and item matrix representation so that it is a DataSet of
* column vectors.
*
* @param users
* @param outInfo
* @param blockIDPartitioner
* @return
*/
def unblock(users: DataSet[(Int, Array[Array[Double]])],
outInfo: DataSet[(Int, OutBlockInformation)],
blockIDPartitioner: BlockIDPartitioner): DataSet[Factors] = {
users.join(outInfo).where(0).equalTo(0).withPartitioner(blockIDPartitioner).apply {
(left, right, col: Collector[Factors]) => {
val outInfo = right._2
val factors = left._2
for(i <- 0 until outInfo.elementIDs.length){
val id = outInfo.elementIDs(i)
val factorVector = factors(i)
col.collect(Factors(id, factorVector))
}
}
}
}
// ================================ Math helper functions ========================================
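/** Computes the outer product `vector * vector^T` and stores its lower triangle, packed
* row-wise, into `matrix`. For example, for `factors = 2` and `vector = Array(a, b)` the
* packed result is `Array(a*a, b*a, b*b)`.
*/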
def outerProduct(vector: Array[Double], matrix: Array[Double], factors: Int): Unit = {
var row = 0
var pos = 0
while(row < factors){
var col = 0
while(col <= row){
matrix(pos) = vector(row) * vector(col)
col += 1
pos += 1
}
row += 1
}
}
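/** Expands a triangular matrix in the packed layout produced by [[outerProduct]] into the
* full symmetric `factors x factors` matrix in row-major order. For example, the packed
* input `Array(a*a, b*a, b*b)` is expanded to `Array(a*a, b*a, b*a, b*b)`.
*/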
def generateFullMatrix(triangularMatrix: Array[Double], fullMatrix: Array[Double],
factors: Int): Unit = {
var row = 0
var pos = 0
while(row < factors){
var col = 0
while(col < row){
fullMatrix(row*factors + col) = triangularMatrix(pos)
fullMatrix(col*factors + row) = triangularMatrix(pos)
pos += 1
col += 1
}
fullMatrix(row*factors + row) = triangularMatrix(pos)
pos += 1
row += 1
}
}
def generateRandomMatrix(users: DataSet[Int], factors: Int, seed: Long): DataSet[Factors] = {
users map {
id =>{
val random = new Random(id ^ seed)
Factors(id, randomFactors(factors, random))
}
}
}
def randomFactors(factors: Int, random: Random): Array[Double] = {
Array.fill(factors)(random.nextDouble())
}
}