All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.twitter.scalding.examples.WeightedPageRankFromMatrix.scala Maven / Gradle / Ivy

package com.twitter.scalding.examples

import com.twitter.scalding._
import com.twitter.scalding.mathematics.{ Matrix, ColVector }
import com.twitter.scalding.mathematics.Matrix._

/**
 * A weighted PageRank implementation using the Scalding Matrix API. This
 * assumes that all rows and columns are of type [[scala.Int]] and that values
 * (edge weights) are [[scala.Double]]. If you want an unweighted PageRank,
 * simply set the weights on the edges to 1.
 *
 * Each instance of this job performs a single iteration: it loads the vector
 * of iteration `currentIteration`, writes the vector of iteration
 * `currentIteration + 1`, and stores the sum of absolute differences between
 * the two. `next` then decides whether another iteration is scheduled.
 *
 * Input arguments:
 *
 *  d -- damping factor
 *  n -- number of nodes in the graph
 *  currentIteration -- start with 0 probably
 *  maxIterations -- upper bound on the number of iterations to run
 *  convergenceThreshold -- using the sum of the absolute difference between
 *                          iteration solutions, iterating stops once we reach
 *                          this threshold
 *  rootDir -- the root directory holding all starting, intermediate and final
 *             data/output
 *
 * The expected structure of the rootDir is:
 *
 *   rootDir
 *     |- iterations
 *     |  |- 0       <-- a TSV of (row, value) of size n, value can be 1/n (generate this)
 *     |  |- 1, 2, … <-- written by this job, one directory per iteration
 *     |- edges      <-- a TSV of (row, column, value) for edges in the graph
 *     |- onesVector <-- a TSV of (row, 1) of size n (generate this)
 *     |- diff       <-- a single line representing the difference between the last iterations
 *     |- constants  <-- built at iteration 0, these are constant for any given matrix/graph
 *        |- M_hat
 *        |- priorVector
 *
 * Don't forget to set the number of reducers for this job:
 * -D mapred.reduce.tasks=n
 */
class WeightedPageRankFromMatrix(args: Args) extends Job(args) {

  val d: Double = args("d").toDouble // aka damping factor
  val n: Int = args("n").toInt // number of nodes in the graph

  val currentIteration: Int = args("currentIteration").toInt
  val maxIterations: Int = args("maxIterations").toInt
  val convergenceThreshold: Double = args("convergenceThreshold").toDouble

  val rootDir: String = args("rootDir")
  val edgesLoc: String = s"$rootDir/edges"
  val onesVectorLoc: String = s"$rootDir/onesVector"

  val iterationsDir: String = s"$rootDir/iterations"
  val previousVectorLoc: String = s"$iterationsDir/$currentIteration"
  val nextVectorLoc: String = s"$iterationsDir/${currentIteration + 1}"

  val diffLoc: String = s"$rootDir/diff"

  // Locations of the per-graph constants. These MUST be initialised before
  // `nextVector` below, because `M_hat` and `priorVector` read them while the
  // constructor body is still running.
  private val mHatLoc: String = s"$rootDir/constants/M_hat"
  private val priorVectorLoc: String = s"$rootDir/constants/priorVector"

  // load the previous iteration
  val previousVector = colVectorFromTsv(previousVectorLoc)

  // iterate, write results
  // R(t + 1) = d * M * R(t) + ((1 - d) / n) * _1_
  val nextVector = M_hat * previousVector + priorVector
  nextVector.write(Tsv(nextVectorLoc))

  measureConvergenceAndStore()

  /**
   * Recurse and iterate again iff we are under the max number of iterations and
   * the vector has not converged.
   *
   * @return a clone of this job with `currentIteration` incremented, or `None`
   *         when iterating should stop
   */
  override def next: Option[Job] = {
    val diffs = TypedTsv[Double](diffLoc).toIterator
    // An empty diff file used to crash here with NoSuchElementException.
    // NOTE(review): presumably the Matrix API drops exact-zero entries, so an
    // empty diff means the two vectors were identical; it is therefore treated
    // as a difference of 0.0 -- confirm against the Scalding version in use.
    val diff = if (diffs.hasNext) diffs.next() else 0.0

    if (currentIteration + 1 < maxIterations && diff > convergenceThreshold) {
      val newArgs = args + ("currentIteration", Some((currentIteration + 1).toString))
      Some(clone(newArgs))
    } else {
      None
    }
  }

  /**
   * Measure convergence by calculating the total of the absolute difference
   * between the previous and next vectors. The resulting scalar is written to
   * `diffLoc`, where `next` reads it back.
   */
  def measureConvergenceAndStore(): Unit = {
    (previousVector - nextVector)
      .mapWithIndex { case (value, _) => math.abs(value) }
      .sum
      .write(TypedTsv[Double](diffLoc))
  }

  /**
   * Load or generate on first iteration the matrix M^ given A.
   *
   * At iteration 0 the edge matrix is row-L1-normalised, transposed, scaled by
   * the damping factor `d` and written under `constants/M_hat`; on every later
   * iteration the stored matrix is read back instead.
   */
  def M_hat: Matrix[Int, Int, Double] = {
    if (currentIteration == 0) {
      val adjacency = matrixFromTsv(edgesLoc)
      val transition = adjacency.rowL1Normalize.transpose
      val damped = d * transition

      damped.write(Tsv(mHatLoc))
    } else {
      matrixFromTsv(mHatLoc)
    }
  }

  /**
   * Load or generate on first iteration the prior vector given d and n.
   *
   * At iteration 0 the ones vector is scaled by `(1 - d) / n` and written under
   * `constants/priorVector`; on every later iteration the stored vector is
   * read back instead.
   */
  def priorVector: ColVector[Int, Double] = {
    if (currentIteration == 0) {
      val onesVector = colVectorFromTsv(onesVectorLoc)
      val scaledOnes = ((1 - d) / n) * onesVector.toMatrix(0)

      scaledOnes.getCol(0).write(Tsv(priorVectorLoc))
    } else {
      colVectorFromTsv(priorVectorLoc)
    }
  }

  /** Reads a TSV of (row, column, value) triples as a matrix. */
  def matrixFromTsv(input: String): Matrix[Int, Int, Double] =
    TypedTsv[(Int, Int, Double)](input).toMatrix

  /** Reads a TSV of (row, value) pairs as a column vector. */
  def colVectorFromTsv(input: String): ColVector[Int, Double] =
    TypedTsv[(Int, Double)](input).toCol
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy