package com.twitter.scalding.examples

import com.twitter.scalding._

/**
 * Weighted PageRank for the given graph: start from the given pagerank,
 * perform one iteration and test for convergence; if not yet converged, clone
 * this job and start the next pagerank job with the updated pagerank as input.
 *
 * This class is very similar to the PageRank class; the main differences are:
 * 1. it supports weighted pagerank
 * 2. the reset (prior) pagerank is pregenerated, possibly by a previous job
 * 3. dead pagerank is evenly distributed
 *
 * Options:
 * --pwd: working directory; the job reads/generates the following files there:
 *   numnodes: total number of nodes
 *   nodes: nodes file <'src_id, 'dst_ids, 'weights, 'mass_prior>
 *   pagerank: the pagerank files, e.g. pagerank_0, pagerank_1, etc.
 *   totaldiff: the summed pagerank delta of the current iteration
 * Optional arguments:
 * --weighted: do weighted pagerank, default false
 * --curiteration: the current iteration, default 0
 * --maxiterations: how many iterations to run, default 20
 * --jumpprob: probability of a random jump, default 0.1
 * --threshold: total difference below which the job finishes early, default 0.001
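 *
 * A sketch of one way to launch the job (the assembly jar name is an
 * assumption; scalding jobs are driven by com.twitter.scalding.Tool):
 *   hadoop jar your-scalding-assembly.jar com.twitter.scalding.Tool \
 *     com.twitter.scalding.examples.WeightedPageRank --hdfs \
 *     --pwd /tmp/pagerank --weighted true --maxiterations 10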
 */
class WeightedPageRank(args: Args) extends Job(args) {
  val ROW_TYPE_1 = 1
  val ROW_TYPE_2 = 2

  val PWD = args("pwd")
  val ALPHA = args.getOrElse("jumpprob", "0.1").toDouble
  val WEIGHTED = args.getOrElse("weighted", "false").toBoolean
  val THRESHOLD = args.getOrElse("threshold", "0.001").toDouble
  val MAXITERATIONS = args.getOrElse("maxiterations", "20").toInt
  val CURITERATION = args.getOrElse("curiteration", "0").toInt
  // 'size
  val numNodes = getNumNodes(PWD + "/numnodes")
  // 'src_id, 'dst_ids, 'weights, 'mass_prior
  val nodes = getNodes(Mode.mode, PWD + "/nodes")
  // 'src_id_input, 'mass_input
  val inputPagerank = getInputPagerank(PWD + "/pagerank_" + CURITERATION)

  // one iteration of pagerank
  val outputPagerank = doPageRank(nodes, inputPagerank)
  val outputFileName = PWD + "/pagerank_" + (CURITERATION + 1)
  outputPagerank
    .project('src_id, 'mass_n)
    .write(Tsv(outputFileName))

  // detect convergence
  val totalDiff = outputPagerank
    .mapTo(('mass_input, 'mass_n) -> 'mass_diff) { args: (Double, Double) =>
      scala.math.abs(args._1 - args._2)
    }
    .groupAll { _.sum('mass_diff) }
    .write(Tsv(PWD + "/totaldiff"))
  /**
   * Test for convergence; if not yet converged, kick off the next iteration.
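   * E.g. with the default --maxiterations 20, this chain runs at most 20 jobs
   * and writes pagerank_1 .. pagerank_20, stopping early once the summed
   * per-node pagerank delta drops below --threshold.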
   */
  override def next = {
    // the total diff generated above
    val totalDiff = Tsv(PWD + "/totaldiff").readAtSubmitter[Double].head

    if (CURITERATION < MAXITERATIONS - 1 && totalDiff > THRESHOLD) {
      val newArgs = args + ("curiteration", Some((CURITERATION + 1).toString))
      Some(clone(newArgs))
    } else {
      None
    }
  }
  def getInputPagerank(fileName: String) = {
    Tsv(fileName).read
      .mapTo((0, 1) -> ('src_id_input, 'mass_input)) {
        input: (Int, Double) => input
      }
  }

  /**
   * Read the pregenerated nodes file <'src_id, 'dst_ids, 'weights, 'mass_prior>.
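   * E.g. in local mode, a TSV row for node 1 pointing at nodes 2 and 3 with
   * weights 0.5 and 1.5 and a prior mass of 0.25 looks like:
   *   1<TAB>2,3<TAB>0.5,1.5<TAB>0.25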
   */
  def getNodes(mode: Mode, fileName: String) = {
    mode match {
      case Hdfs(_, conf) => {
        SequenceFile(fileName).read
          .mapTo((0, 1, 2, 3) -> ('src_id, 'dst_ids, 'weights, 'mass_prior)) {
            input: (Int, Array[Int], Array[Float], Double) => input
          }
      }
      case _ => {
        Tsv(fileName).read
          .mapTo((0, 1, 2, 3) -> ('src_id, 'dst_ids, 'weights, 'mass_prior)) {
            input: (Int, String, String, Double) => {
              (
                input._1,
                // convert string to int array
                if (input._2 != null && input._2.length > 0) {
                  input._2.split(",").map { _.toInt }
                } else {
                  Array[Int]()
                },
                // convert string to float array
                if (input._3 != null && input._3.length > 0) {
                  input._3.split(",").map { _.toFloat }
                } else {
                  Array[Float]()
                },
                input._4)
            }
          }
      }
    }
  }

  /**
   * the total number of nodes, read from a single-line file
   */
  def getNumNodes(fileName: String) = {
    Tsv(fileName).read
      .mapTo(0 -> 'size) { input: Int => input }
  }

  /**
   * One iteration of pagerank.
   * inputPagerank: <'src_id_input, 'mass_input>
   * returns <'src_id, 'mass_n, 'mass_input>
   *
   * Here is a high-level view of the unweighted algorithm:
   * let
   *   N: number of nodes
   *   inputPagerank(N_i): prob of walking to node i
   *   d(N_j): N_j's out degree
   * then
   *   pagerankNext(N_i) = \sum_{j points to i} inputPagerank(N_j) / d(N_j)
   *   deadPagerank = (1 - \sum_{i} pagerankNext(N_i)) / N
   *   randomPagerank(N_i) = userMass(N_i) * ALPHA + deadPagerank * (1 - ALPHA)
   *   pagerankOutput(N_i) = randomPagerank(N_i) + pagerankNext(N_i) * (1 - ALPHA)
   * where userMass(N_i) is the pregenerated 'mass_prior of node i.
   *
   * For the weighted algorithm:
   * let
   *   w(N_j, N_i): weight of the edge from N_j to N_i
   *   tw(N_j): N_j's total out weight
   * then
   *   pagerankNext(N_i) = \sum_{j points to i} inputPagerank(N_j) * w(N_j, N_i) / tw(N_j)
   *
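   * A small worked example of the unweighted case: take N = 3 with edges
   * 1 -> {2, 3}, 2 -> {3}, 3 -> {} (dangling), a uniform userMass of 1/3,
   * inputPagerank = (1/3, 1/3, 1/3) and ALPHA = 0.1. Then
   *   pagerankNext = (0, 1/6, 1/2), which sums to 2/3,
   *   deadPagerank = (1 - 2/3) / 3 = 1/9,
   *   randomPagerank(N_i) = (1/3) * 0.1 + (1/9) * 0.9 = 0.1333 for every node,
   *   pagerankOutput = (0.1333, 0.1333 + 0.9/6, 0.1333 + 0.9/2)
   *                  = (0.1333, 0.2833, 0.5833), which sums to 1.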
   */
  def doPageRank(nodeRows: RichPipe, inputPagerank: RichPipe): RichPipe = {
    // 'src_id, 'dst_ids, 'weights, 'mass_prior, 'mass_input
    val nodeJoined = nodeRows
      .joinWithSmaller('src_id -> 'src_id_input, inputPagerank)
      .discard('src_id_input)

    // 'src_id, 'mass_n
    val pagerankNext = nodeJoined
      .flatMapTo(('dst_ids, 'weights, 'mass_input) -> ('src_id, 'mass_n)) {
        args: (Array[Int], Array[Float], Double) =>
          if (args._1.length > 0) {
            if (WEIGHTED) {
              // weighted distribution
              val total: Double = args._2.sum
              (args._1 zip args._2).map { idWeight: (Int, Float) =>
                (idWeight._1, args._3 * idWeight._2 / total)
              }
            } else {
              // equal distribution
              val dist: Double = args._3 / args._1.length
              args._1.map { id: Int => (id, dist) }
            }
          } else {
            // here is a node that points to no other nodes (dangling)
            Nil
          }
      }
      .groupBy('src_id) {
        _.sum('mass_n)
      }

    // 'sum_mass
    val sumPagerankNext = pagerankNext.groupAll { _.sum('mass_n -> 'sum_mass) }

    // 'deadMass
    // a single-row pipe: the dead pagerank, equally distributed to every node
    val deadPagerank = sumPagerankNext
      .crossWithTiny(numNodes)
      .map(('sum_mass, 'size) -> 'deadMass) { input: (Double, Int) =>
        (1.0 - input._1) / input._2
      }
      .discard('size, 'sum_mass)

    // 'src_id, 'mass_n, 'mass_input
    // random jump probability plus dead pagerank
    val randomPagerank = nodeJoined.crossWithTiny(deadPagerank)
      .mapTo(('src_id, 'mass_prior, 'deadMass, 'mass_input) -> ('src_id, 'mass_n, 'mass_input)) {
        ranks: (Int, Double, Double, Double) =>
          (ranks._1, ranks._2 * ALPHA + ranks._3 * (1 - ALPHA), ranks._4)
      }

    // 'src_id, 'mass_n
    // scale the next pagerank by (1 - ALPHA)
    val pagerankNextScaled = pagerankNext
      .map('mass_n -> ('mass_n, 'mass_input)) { m: Double => ((1 - ALPHA) * m, 0.0) }

    // 'src_id, 'mass_n, 'mass_input
    // random probability plus next probability
    (randomPagerank ++ pagerankNextScaled)
      .groupBy('src_id) {
        _.sum('mass_input) // keep the input pagerank
          .sum('mass_n) // take the sum of the two
      }
  }
}
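
/**
 * A minimal, self-contained sketch (not part of the original job) of the
 * per-node mass distribution that doPageRank's flatMapTo performs: weighted
 * mode splits a node's mass proportionally to its out-edge weights, unweighted
 * mode splits it evenly, and a dangling node emits nothing, so its mass later
 * becomes dead mass. Object and method names here are illustrative only.
 */
object WeightedPageRankDistributionSketch {
  def distribute(
    dstIds: Array[Int],
    weights: Array[Float],
    mass: Double,
    weighted: Boolean): Seq[(Int, Double)] =
    if (dstIds.isEmpty) {
      // dangling node: nothing is emitted; the mass is redistributed as dead mass
      Nil
    } else if (weighted) {
      // proportional split, mirroring the WEIGHTED branch above
      val total: Double = weights.map { _.toDouble }.sum
      (dstIds zip weights).map { case (id, w) => (id, mass * w / total) }.toSeq
    } else {
      // equal split, mirroring the unweighted branch above
      val share = mass / dstIds.length
      dstIds.map { id => (id, share) }.toSeq
    }

  def main(cmdArgs: Array[String]): Unit = {
    // a node with mass 0.4 and out-edges to 2 and 3, weighted 1.0 and 3.0:
    // prints the pairs (2,0.1) and (3,0.3), up to floating point rounding
    println(distribute(Array(2, 3), Array(1.0f, 3.0f), 0.4, weighted = true))
  }
}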