All Downloads are FREE. Search and download functionality uses the official Maven repository.

com.twitter.scalding.examples.MergeTest.scala Maven / Gradle / Ivy

The newest version!
package com.twitter.scalding.examples

import scala.annotation.tailrec

import com.twitter.scalding._

/**
 * Word-count job that writes the ten most frequent words of the input.
 *
 * This example job does not yet work.  It is a test for Kryo serialization.
 *
 * Arguments read: "input" (text lines) and "output" (TSV destination).
 */
class MergeTest(args : Args) extends Job(args) {
  TextLine(args("input"))
    .flatMapTo('word) { line : String =>
      // String.split yields an empty first token for blank lines and for lines
      // that start with whitespace; drop those so "" is never counted as a word.
      line.split("""\s+""").filter(_.nonEmpty)
    }
    // One row per distinct word, with its occurrence count in 'size.
    .groupBy('word) { _.size }
    //Now, let's get the top 10 words:
    .groupAll {
      _.mapReduceMap(('word,'size)->'list) /* map1 */ { tup : (String,Long) => List(tup) }
        /* reduce */ { (l1 : List[(String,Long)], l2 : List[(String,Long)]) =>
          // Each side is at most 10 entries, ordered by descending count.
          mergeSort2(l1, l2, 10, cmpTup)
        } /* map2 */ {
          lout : List[(String,Long)] => lout
        }
    }
    //Now expand out the list.
    .flatMap('list -> ('word, 'cnt)) { list : List[(String,Long)] => list }
    .project('word, 'cnt)
    .write(Tsv(args("output")))

  /**
   * Reverse comparison on the count (second element), so that larger counts
   * sort first.
   *
   * @return a negative value when t1 has the larger count, zero when the counts
   *         are equal, a positive value when t2 has the larger count
   */
  def cmpTup( t1 : (String,Long), t2 : (String,Long) ) : Int = t2._2.compareTo(t1._2)

  /**
   * Merges two lists and keeps at most the first k elements of the merge.
   *
   * Only the heads of the two lists are compared at each step, so the result is
   * ordered only if v1 and v2 are each already ordered according to cmp.
   *
   * @param v1  first input list
   * @param v2  second input list
   * @param k   maximum number of elements to return; merging stops when the
   *            countdown reaches exactly 0, so a negative k merges everything
   * @param cmp comparison function; a negative result means the first argument
   *            is emitted before the second. On a non-negative result (including
   *            ties) the element from v2 is emitted first.
   * @return the merged prefix, in merge order
   */
  def mergeSort2[T](v1 : List[T], v2 : List[T], k : Int, cmp : Function2[T,T,Int]) : List[T] = {
    // The accumulator is built by prepending, hence the final reverse.
    @tailrec
    def mergeSortR(acc : List[T], list1 : List[T], list2 : List[T], remaining : Int) : List[T] =
      (list1, list2, remaining) match {
        case (_, _, 0) => acc
        case (x1 :: t1, x2 :: t2, _) =>
          if (cmp(x1, x2) < 0) mergeSortR(x1 :: acc, t1, list2, remaining - 1)
          else mergeSortR(x2 :: acc, list1, t2, remaining - 1)
        case (x1 :: t1, Nil, _) => mergeSortR(x1 :: acc, t1, Nil, remaining - 1)
        case (Nil, x2 :: t2, _) => mergeSortR(x2 :: acc, Nil, t2, remaining - 1)
        case (Nil, Nil, _) => acc
      }

    mergeSortR(Nil, v1, v2, k).reverse
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy