All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.twitter.scalding.examples.MergeTest.scala Maven / Gradle / Ivy

package com.twitter.scalding.examples

import scala.annotation.tailrec

import com.twitter.scalding._

/**
 * Counts the words in the text file named by the `input` argument and writes
 * the 10 most frequent (word, count) pairs as TSV to the `output` argument.
 *
 * This example job does not yet work. It is a test for Kryo serialization.
 */
class MergeTest(args: Args) extends Job(args) {
  TextLine(args("input")).flatMapTo('word) { _.split("""\s+""") }
    .groupBy('word) { _.size }
    //Now, let's get the top 10 words:
    .groupAll {
      // map1 wraps each (word, size) pair in a one-element list, reduce merges
      // two lists while keeping at most 10 entries, map2 is the identity.
      _.mapReduceMap(('word, 'size) -> 'list) /* map1 */ { tup: (String, Long) => List(tup) } /* reduce */ { (l1: List[(String, Long)], l2: List[(String, Long)]) =>
        mergeSort2(l1, l2, 10, cmpTup)
      } /* map2 */ {
        lout: List[(String, Long)] => lout
      }
    }
    //Now expand out the list.
    .flatMap('list -> ('word, 'cnt)) { list: List[(String, Long)] => list }
    .project('word, 'cnt)
    .write(Tsv(args("output")))

  /**
   * Compares two (word, count) pairs by count in reverse order, so that the
   * pair with the larger count sorts first.
   *
   * @param t1 first (word, count) pair
   * @param t2 second (word, count) pair
   * @return a negative value when `t1` has the larger count, a positive value
   *         when `t2` has the larger count, and zero when the counts are equal
   */
  def cmpTup(t1: (String, Long), t2: (String, Long)): Int = t2._2.compareTo(t1._2)

  /**
   * Merges two lists and returns at most the first `k` elements of the merge.
   *
   * At every step the head of `v1` is taken when `cmp(head1, head2) < 0`;
   * otherwise (including ties) the head of `v2` is taken. The result is only
   * meaningfully ordered if both inputs are already ordered according to `cmp`.
   *
   * @param v1  first input list
   * @param v2  second input list
   * @param k   maximum number of elements to return; a value of zero or less
   *            yields an empty list
   * @param cmp comparison function; a negative result means its first argument
   *            goes before its second
   * @tparam T element type
   * @return the first `k` elements of the merge, in merge order
   */
  def mergeSort2[T](v1: List[T], v2: List[T], k: Int, cmp: Function2[T, T, Int]): List[T] = {
    // acc collects the chosen elements in reverse order (cheap prepend);
    // remaining is how many more elements may still be taken.
    @tailrec
    def mergeSortR(acc: List[T], list1: List[T], list2: List[T], remaining: Int): List[T] = {
      // Stop on "<= 0" rather than "== 0" so that a negative limit cannot
      // skip past the stop condition and return the whole merged list.
      if (remaining <= 0) {
        acc
      } else {
        (list1, list2) match {
          case (x1 :: t1, x2 :: t2) =>
            if (cmp(x1, x2) < 0) {
              mergeSortR(x1 :: acc, t1, list2, remaining - 1)
            } else {
              mergeSortR(x2 :: acc, list1, t2, remaining - 1)
            }
          case (x1 :: t1, Nil) => mergeSortR(x1 :: acc, t1, Nil, remaining - 1)
          case (Nil, x2 :: t2) => mergeSortR(x2 :: acc, Nil, t2, remaining - 1)
          case (Nil, Nil) => acc
        }
      }
    }
    mergeSortR(Nil, v1, v2, k).reverse
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy