/*
Copyright 2012 Twitter, Inc.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package com.twitter.scalding

import cascading.pipe.Pipe
import cascading.pipe.Every
import cascading.pipe.GroupBy
import cascading.pipe.assembly._
import cascading.operation._
import cascading.operation.aggregator._
import cascading.operation.filter._
import cascading.tuple.Fields

import scala.collection.JavaConverters._
import scala.annotation.tailrec
import scala.math.Ordering

// This controls the sequence of reductions that happen inside a
// particular grouping operation.  Not all operations can be combined
// map-side; for instance, scanLeft/foldLeft generally require a sort,
// but such sorts are (at least for now) incompatible with doing a combine
// which includes some map-side reductions.
class GroupBuilder(val groupFields : Fields) extends FieldConversions
  with TupleConversions with java.io.Serializable {

  /**
  * Holds the "reducers/combiners", the things that we can do partially map-side.
  */
  private var reds : Option[List[AggregateBy]] = Some(Nil)

  /**
  * This is the description of this Grouping in terms of a sequence of Every operations
  */
  private var evs : List[Pipe => Every] = Nil
  private var isReversed : Boolean = false
  private var sortBy : Option[Fields] = None
  /*
  * maxMF is the maximum index of a "middle field" allocated for mapReduceMap operations
  */
  private var maxMF : Int = 0

  private def getNextMiddlefield : String = {
    val out = "__middlefield__" + maxMF.toString
    maxMF += 1
    return out
  }

  //Put any pure reduce functions into the below object
  import CommonReduceFunctions._

  def aggregate(args : Fields)(a : Aggregator[_]) : GroupBuilder = {
    every(pipe => new Every(pipe, args, a))
  }

  private def tryAggregateBy(ab : AggregateBy, ev : Pipe => Every) : Boolean = {
    // Prepend to the list if reds is not None (i.e. map-side aggregation is still possible)
    reds = reds.map(rl => ab::rl)
    evs = ev :: evs
    return !reds.isEmpty
  }

  /**
  * Holds the number of reducers to use in the reduce stage of the groupBy/aggregateBy.
  * By default uses whatever value is set in the jobConf.
  */
  private var numReducers : Option[Int] = None
  /**
   * Override the number of reducers used in the groupBy.
   */
  def reducers(r : Int) = {
    if(r > 0) {
      numReducers = Some(r)
    }
    this
  }
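
  // Illustrative usage (a sketch; 'user and 'cnt are hypothetical field names, not
  // defined in this file):
  //   pipe.groupBy('user) { _.reducers(20).size('cnt) }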

  private def overrideReducers(p : Pipe) : Pipe = {
    numReducers.map { r => RichPipe.setReducers(p, r) }.getOrElse(p)
  }

  // When combining averages, if the two counts are too close in size we should use a
  // different algorithm.  This constant defines how small the ratio of the smaller
  // count to the total count must be for the incremental update to be used:
  private val STABILITY_CONSTANT = 0.1
  /**
   * Computes the average, using a more stable online algorithm which should
   * be suitable for large numbers of records.
   * Similar to:
   * http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
   */
  def average(f : (Fields, Fields)) : GroupBuilder = {
    mapReduceMap(f){(x:Double) =>
      (1L, x)
    } {(cntAve1, cntAve2) =>
      val (big, small) = if (cntAve1._1 >= cntAve2._1) (cntAve1, cntAve2) else (cntAve2, cntAve1)
      val n = big._1
      val k = small._1
      val an = big._2
      val ak = small._2
      val newCnt = n+k
      val scaling = k.toDouble/newCnt
      // a_n + (a_k - a_n)*(k/(n+k)) is only stable if n is not approximately k
      val newAve = if (scaling < STABILITY_CONSTANT) (an + (ak - an)*scaling) else (n*an + k*ak)/newCnt
      (newCnt, newAve)
    } { res => res._2 }
  }
  def average(f : Symbol) : GroupBuilder = average(f->f)
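
  // Illustrative usage and a numeric sanity check (field names are hypothetical, not
  // from this file):
  //   pipe.groupBy('user) { _.average('score -> 'aveScore) }
  // Merging (n=1000, an=2.0) with (k=10, ak=5.0): scaling = 10/1010 < 0.1, so the
  // stable branch gives 2.0 + (5.0 - 2.0)*(10.0/1010.0) ≈ 2.0297, matching the
  // weighted average (1000*2.0 + 10*5.0)/1010 ≈ 2.0297.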

  // WARNING! This may significantly reduce performance of your job.
  // It kills the ability to do map-side aggregation.
  def buffer(args : Fields)(b : Buffer[_]) : GroupBuilder = {
    every(pipe => new Every(pipe, args, b))
  }

  // By default adds a column with name "count" counting the number in
  // this group. deprecated, use size.
  @deprecated("Use size instead to match the scala.collections.Iterable API")
  def count(f : Symbol = 'count) : GroupBuilder = size(f)

  // This is count with a predicate: only counts the tuples for which fn(tuple) is true
  def count[T:TupleConverter](fieldDef : (Fields, Fields))(fn : T => Boolean) : GroupBuilder = {
    mapReduceMap[T,Long,Long](fieldDef)(arg => if(fn(arg)) 1L else 0L)((s1 : Long, s2 : Long) => s1+s2)(s => s)
  }
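
  // Illustrative usage (a sketch; 'user, 'status, and 'numErrors are hypothetical fields):
  //   pipe.groupBy('user) { _.count('status -> 'numErrors) { s : String => s == "error" } }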

  /**
  * Opposite of RichPipe.unpivot.  See SQL/Excel pivot tables for more on this function;
  * it converts a row-wise representation into a column-wise one.
  * example: pivot(('feature, 'value) -> ('clicks, 'impressions, 'requests))
  * it will find the feature named "clicks", and put the value in the column with the field named
  * clicks.
  * Absent fields result in null. Unnamed output fields are ignored.
  * NOTE: Duplicated fields will result in an error.
  *
  * Hint: if you want more precision, first do a
  * map('value -> 'value) { x : AnyRef => Option(x) }
  * and you will have non-nulls for all present values, and Nones for values that were present
  * but previously null.  All nulls in the final output will be those truly missing.
  * Similarly, if you want to check if there are any items present that shouldn't be:
  * map('feature -> 'feature) { fname : String =>
  *   if (!goodFeatures(fname)) { throw new Exception("ohnoes") }
  *   else fname
  * }
  */
  def pivot(fieldDef : (Fields, Fields)) : GroupBuilder = {
    // Make sure the fields are strings:
    mapReduceMap(fieldDef) { pair : (String, AnyRef) =>
      List(pair)
    } { (prev, next) => next ++ prev } // concat into the bigger one
    { outputList =>
      val asMap = outputList.toMap
      assert(asMap.size == outputList.size, "Repeated pivot key fields: " + outputList.toString)
      val values = fieldDef._2
        .iterator.asScala
        // Look up this key:
        .map { fname => asMap.getOrElse(fname.asInstanceOf[String], null) }
      // Create the cascading tuple (only place this is used, so no import
      // to avoid confusion with scala tuples:
      new cascading.tuple.Tuple(values.toSeq : _*)
    }
  }
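
  // Worked example (hypothetical data): within one group, the input rows
  //   ('feature, 'value) = ("clicks", "10"), ("impressions", "100")
  // under pivot(('feature, 'value) -> ('clicks, 'impressions, 'requests)) become one row:
  //   ('clicks, 'impressions, 'requests) = ("10", "100", null)
  // since no row in the group had feature == "requests".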

  /**
   * Convert a subset of fields into a list of Tuples. Need to provide the types of the tuple fields.
   * Note that the order of the tuples is not preserved: EVEN IF YOU USE GroupBuilder.sortBy!
   * If you need ordering use sortedTake or sortBy + scanLeft
   */
  def toList[T](fieldDef : (Fields, Fields))(implicit conv : TupleConverter[T]) : GroupBuilder = {
    val (fromFields, toFields) = fieldDef
    conv.assertArityMatches(fromFields)
    val out_arity = toFields.size
    assert(out_arity == 1, "toList: can only add a single element to the GroupBuilder")
    mapReduceMap[T, List[T], List[T]](fieldDef) { //Map
      // TODO this is questionable, how do you get a list including nulls?
      x => if (null != x) List(x) else Nil
    } { //Reduce, note the bigger list is likely on the left, so concat into it:
      (prev, current) => current ++ prev
    } { //Map
      t => t
    }
  }
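
  // Illustrative usage (a sketch; fields are hypothetical):
  //   pipe.groupBy('user) { _.toList[String]('word -> 'words) }
  // 'words then holds a List[String]; as noted above, its order is not defined.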

  /**
   * Compute the count, average, and standard deviation in one pass
   * example: g.sizeAveStdev('x -> ('cntx, 'avex, 'stdevx))
   * uses: http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
   */
  def sizeAveStdev(fieldDef : (Fields,Fields)) = {
    val (fromFields, toFields) = fieldDef
    val in_arity = fromFields.size
    val out_arity = toFields.size
    assert(in_arity == 1, "sizeAveStdev: Can only take the moment of a single arg")
    assert(out_arity == 3, "sizeAveStdev: Need names for cnt, ave and stdev moments")
    // unbiased estimator: sqrt((1/(N-1)) * sum_i (x_i - ave)^2)
    // sum_i (x_i - ave)^2 = sum_i (x_i^2 - 2 x_i ave + ave^2)
    //                     = (sum_i x_i^2) - 2N ave^2 + N ave^2
    //                     = (sum_i x_i^2) - N ave^2
    mapReduceMap[Double, (Long,Double,Double), (Long,Double,Double)](fieldDef) { //Map
      (x : Double) => (1L,x,0.0)
    } {(cntAve1, cntAve2) =>
      val (big, small) = if (cntAve1._1 >= cntAve2._1) (cntAve1, cntAve2) else (cntAve2, cntAve1)
      val n = big._1
      val k = small._1
      val an = big._2
      val ak = small._2
      val delta = (ak - an)
      val mn = big._3
      val mk = small._3
      val newCnt = n+k
      val scaling = k.toDouble/newCnt
      // a_n + (a_k - a_n)*(k/(n+k)) is only stable if n is not approximately k
      val newAve = if (scaling < STABILITY_CONSTANT) (an + delta*scaling) else (n*an + k*ak)/newCnt
      val newStdMom = mn + mk + delta*delta*(n*scaling)
      (newCnt, newAve, newStdMom)
    } { //Map
      moms =>
        val cnt = moms._1
        (cnt, moms._2, scala.math.sqrt(moms._3/(cnt - 1)))
    }
  }
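
  // Sanity check of the merge with hypothetical numbers: merging (n=2, ave=1.0, m=0.0)
  // (the values {1,1}) with (k=2, ave=3.0, m=0.0) (the values {3,3}) gives newCnt = 4,
  // newAve = (2*1.0 + 2*3.0)/4 = 2.0, newStdMom = 0 + 0 + (3-1)^2 * (2 * 2/4) = 4.0,
  // and stdev = sqrt(4.0/(4-1)) ≈ 1.155, the unbiased stdev of {1,1,3,3}.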

  //Remove the first cnt elements
  def drop(cnt : Int) : GroupBuilder = {
    val b = new DropBuffer(cnt)
    every(pipe => new Every(pipe, Fields.VALUES, b, Fields.REPLACE))
  }
  //Drop elements while the predicate is true; starting at the first false, output everything
  def dropWhile[T](f : Fields)(fn : T => Boolean)(implicit conv : TupleConverter[T]) : GroupBuilder = {
    conv.assertArityMatches(f)
    every(pipe => new Every(pipe, f, new DropWhileBuffer[T](fn, conv), Fields.REPLACE))
  }

  //Prefer aggregateBy operations!
  def every(ev : Pipe => Every) : GroupBuilder = {
    reds = None
    evs = ev :: evs
    this
  }

  /*
   *  prefer reduce or mapReduceMap. foldLeft will force all work to be
   *  done on the reducers.  If your function is not associative and
   *  commutative, foldLeft may be required.
   */
  def foldLeft[X,T](fieldDef : (Fields,Fields))(init : X)(fn : (X,T) => X)
                 (implicit setter : TupleSetter[X], conv : TupleConverter[T]) : GroupBuilder = {
      val (inFields, outFields) = fieldDef
      conv.assertArityMatches(inFields)
      setter.assertArityMatches(outFields)
      val ag = new FoldAggregator[T,X](fn, init, outFields, conv, setter)
      every(pipe => new Every(pipe, inFields, ag))
  }
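
  // Illustrative usage (a sketch; fields are hypothetical).  Here the accumulator type
  // (Set[Long]) differs from the element type (Long), so reduce cannot be used directly:
  //   pipe.groupBy('user) {
  //     _.foldLeft('id -> 'idSet)(Set[Long]()) { (s : Set[Long], id : Long) => s + id }
  //   }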

  /*
   * Check if a predicate is satisfied for all the values for this key
   */
  def forall[T:TupleConverter](fieldDef : (Fields,Fields))(fn : (T) => Boolean) : GroupBuilder = {
    mapReduceMap(fieldDef)(fn)({(x : Boolean, y : Boolean) => x && y})({ x => x })
  }
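
  // Illustrative usage (a sketch; fields are hypothetical):
  //   pipe.groupBy('user) { _.forall('score -> 'allPositive) { x : Double => x > 0.0 } }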

  // Return the first (resp. last) element; probably only useful in the sorted case.
  def head(f : Fields) = aggregate(f)(new First())
  def last(f : Fields) = aggregate(f)(new Last())

  private def extremum(max : Boolean, fieldDef : (Fields,Fields)) : GroupBuilder = {
    val (fromFields, toFields) = fieldDef
    val in_arity = fromFields.size
    val out_arity = toFields.size
    assert(in_arity == out_arity, "Number of field names must match for rename")
    //Now do the work:
    val ag = new ExtremumAggregator(max, toFields)
    val ev = (pipe => new Every(pipe, fromFields, ag)) : Pipe => Every
    tryAggregateBy(new ExtremumBy(max, fromFields, toFields), ev)
    this
  }

  /**
  * Type T is the type of the input field (input to map, T => X)
  * Type X is the intermediate type, which your reduce function operates on
  * (reduce is (X,X) => X)
  * Type U is the final result type, (final map is: X => U)
  *
  * The previous output goes into the reduce function on the left, like foldLeft,
  * so if your operation is faster for the accumulator to be on one side, be aware.
  */
  def mapReduceMap[T,X,U](fieldDef : (Fields, Fields))(mapfn : T => X )(redfn : (X, X) => X)
      (mapfn2 : X => U)(implicit startConv : TupleConverter[T],
                        middleSetter : TupleSetter[X],
                        middleConv : TupleConverter[X],
                        endSetter : TupleSetter[U]) : GroupBuilder = {
    val (fromFields, toFields) = fieldDef
    //Check for arity safety:
    startConv.assertArityMatches(fromFields)
    endSetter.assertArityMatches(toFields)

    val ag = new MRMAggregator[T,X,U](mapfn, redfn, mapfn2, toFields, startConv, endSetter)
    val ev = (pipe => new Every(pipe, fromFields, ag)) : Pipe => Every

    // Create the required number of middlefields based on the arity of middleSetter
    val middleFields = strFields( Range(0, middleSetter.arity).map{i => getNextMiddlefield} )
    val mrmBy = new MRMBy[T,X,U](fromFields, middleFields, toFields,
      mapfn, redfn, mapfn2, startConv, middleSetter, middleConv, endSetter)
    tryAggregateBy(mrmBy, ev)
    this
  }
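
  // Illustrative usage (a sketch; fields are hypothetical): a count of distinct values
  // via a Set, showing the three phases:
  //   pipe.groupBy('user) {
  //     _.mapReduceMap[String, Set[String], Int]('word -> 'uniqueWords)
  //       { w => Set(w) }           // map: T => X, lift each value
  //       { (s1, s2) => s1 ++ s2 }  // reduce: (X,X) => X, associative and commutative
  //       { s => s.size }           // map2: X => U, finalize
  //   }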

  def max(fieldDef : (Fields, Fields)) = extremum(true, fieldDef)
  def max(fieldDef : Symbol*) = {
    val f : Fields = fieldDef
    extremum(true, (f,f))
  }
  def min(fieldDef : (Fields, Fields)) = extremum(false, fieldDef)
  def min(fieldDef : Symbol*) = {
    val f : Fields = fieldDef
    extremum(false, (f,f))
  }
  /*
   * Similar to scala.collection.Iterable.mkString.
   * Takes the source and destination fieldname, each of which should be a single
   * field.
   * The result will be start, then each item.toString separated by sep, followed
   * by end.
   * For convenience there are several common variants below.
   */
  def mkString(fieldDef : (Fields,Fields), start : String, sep : String, end : String) : GroupBuilder = {
    val (inFields, outFields) = fieldDef
    val in_arity = inFields.size
    val out_arity = outFields.size
    assert(in_arity == 1, "mkString works on single column, concat in a map before, if you need.")
    assert(out_arity == 1, "output field count must also be 1")
    /*
     * Logically a mapReduceMap works here, but it does O(N) string
     * concats; if each is of order m in length, that costs \sum_{i=1}^N i \times m,
     * which is about m N^2 / 2.  We can do O(N m) if we allocate once a buffer long
     * enough for all N items, and then copy, which is what
     * Iterable.mkString does.
     */
    val mkag = new MkStringAggregator(start, sep, end, outFields)
    val ev = (pipe => new Every(pipe, inFields, mkag)) : Pipe => Every
    tryAggregateBy(new MkStringBy(start, sep, end, inFields, outFields), ev)
    this
  }
  def mkString(fieldDef : (Fields,Fields), sep : String) : GroupBuilder = mkString(fieldDef,"",sep,"")
  def mkString(fieldDef : (Fields,Fields)) : GroupBuilder = mkString(fieldDef,"","","")
  /**
  * These overloads are only called when a single Symbol (just one
  * column) is passed rather than a pair.
  */
  def mkString(fieldDef : Symbol, start : String, sep : String, end : String) : GroupBuilder = {
    val f : Fields = fieldDef
    mkString((f,f),start,sep,end)
  }
  def mkString(fieldDef : Symbol, sep : String) : GroupBuilder = mkString(fieldDef,"",sep,"")
  def mkString(fieldDef : Symbol) : GroupBuilder = mkString(fieldDef,"","","")
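
  // Illustrative usage (a sketch; fields are hypothetical):
  //   pipe.groupBy('user) { _.mkString('word -> 'words, "[", ",", "]") }
  // emits e.g. "[foo,bar,baz]" for a group with values foo, bar, baz.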

  /**
   * apply an associative/commutative operation on the left field.
   * Example: reduce(('mass,'allids)->('totalMass, 'idset)) { (left:(Double,Set[Long]),right:(Double,Set[Long])) =>
   *   (left._1 + right._1, left._2 ++ right._2)
   * }
   * Equivalent to a mapReduceMap with trivial (identity) map functions.
   *
   * The previous output goes into the reduce function on the left, like foldLeft,
   * so if your operation is faster for the accumulator to be on one side, be aware.
   */
  def reduce[T](fieldDef : (Fields, Fields))(fn : (T,T)=>T)
               (implicit setter : TupleSetter[T], conv : TupleConverter[T]) : GroupBuilder = {
    mapReduceMap[T,T,T](fieldDef)({ t => t })(fn)({t => t})(conv,setter,conv,setter)
  }
  //Same as reduce(f->f)
  def reduce[T](fieldDef : Symbol*)(fn : (T,T)=>T)(implicit setter : TupleSetter[T],
                                 conv : TupleConverter[T]) : GroupBuilder = {
    reduce(fieldDef -> fieldDef)(fn)(setter,conv)
  }

  def reverse : GroupBuilder = {
    assert(reds.isEmpty, "Cannot sort when reducing")
    assert(!isReversed, "Reverse called a second time! Only one allowed")
    isReversed = true
    this
  }

  //This invalidates map-side aggregation, forces all data to be transferred
  //to reducers.  Use only if you REALLY have to.
  def scanLeft[X,T](fieldDef : (Fields,Fields))(init : X)(fn : (X,T) => X)
                 (implicit setter : TupleSetter[X], conv : TupleConverter[T]) : GroupBuilder = {
    val (inFields, outFields) = fieldDef
    //Check arity
    conv.assertArityMatches(inFields)
    setter.assertArityMatches(outFields)

    val b = new ScanBuffer[T,X](fn, init, outFields, conv, setter)
    every(pipe => new Every(pipe, inFields, b))
  }
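
  // Illustrative usage (a sketch; fields are hypothetical): a running total within each
  // key, one output row per input row (unlike foldLeft, which emits one row per group):
  //   pipe.groupBy('user) {
  //     _.sortBy('time)
  //      .scanLeft('amount -> 'runningTotal)(0.0) { (acc : Double, x : Double) => acc + x }
  //   }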

  def schedule(name : String, allpipes : Pipe*) : Pipe = {
    val mpipes : Array[Pipe] = allpipes.toArray
    reds match {
      case None => {
        //We cannot aggregate, so group:
        val startPipe : Pipe = sortBy match {
          case None => new GroupBy(name, mpipes, groupFields)
          case Some(sf) => new GroupBy(name, mpipes, groupFields, sf, isReversed)
        }
        overrideReducers(startPipe)

        // Time to schedule the addEverys:
        evs.foldRight(startPipe)( (op : Pipe => Every, p) => op(p) )
      }
      //This is the case where the group function is identity: { g => g }
      case Some(Nil) => {
        val gb = new GroupBy(name, mpipes, groupFields)
        overrideReducers(gb)
        gb
      }
      //There is some non-empty AggregateBy to do:
      case Some(redlist) => {
        val THRESHOLD = 100000 //tune this, default is 10k
        val ag = new AggregateBy(name, mpipes, groupFields,
          THRESHOLD, redlist.reverse.toArray : _*)
        overrideReducers(ag.getGroupBy())
        ag
      }
    }
  }

  //This invalidates aggregateBy!
  def sortBy(f : Fields) : GroupBuilder = {
    reds = None
    sortBy = sortBy match {
      case None => Some(f)
      // Fields.append returns a new Fields instance rather than mutating, so keep its result:
      case Some(sf) => Some(sf.append(f))
    }
    this
  }

  //How many values are there for this key
  def size : GroupBuilder = size('size)
  def size(thisF : Fields) : GroupBuilder = {
      assert(thisF.size == 1, "size only gives a single column output")
      //Count doesn't need inputs, but if you use Fields.ALL it will
      //fail if it comes after any other Every.
      val ev = (pipe => new Every(pipe, Fields.VALUES, new Count(thisF))) : Pipe => Every
      tryAggregateBy(new CountBy(thisF), ev)
      this
  }

  def sum(f : (Fields, Fields)) : GroupBuilder = {
    val (input, output) = f
    val in_arity = input.size
    val out_arity = output.size
    assert(in_arity == 1, "sum can only apply to a single column")
    assert(out_arity == 1, "output field size must also be 1")
    val ag = new Sum(output)
    val ev = (pipe => new Every(pipe, input, ag)) : Pipe => Every
    tryAggregateBy(new SumBy(input, output, java.lang.Double.TYPE), ev)
    this
  }
  def sum(f : Symbol) : GroupBuilder = {
    //Implicitly convert to a pair of fields:
    val field : Fields = f
    sum(field -> field)
  }
  //Only keep the first cnt elements
  def take(cnt : Int) : GroupBuilder = {
    val b = new TakeBuffer(cnt)
    every(pipe => new Every(pipe, Fields.VALUES, b, Fields.REPLACE))
  }
  //Take elements while the predicate is true, stopping at the first false
  def takeWhile[T](f : Fields)(fn : (T) => Boolean)(implicit conv : TupleConverter[T]) : GroupBuilder = {
    conv.assertArityMatches(f)
    every(pipe => new Every(pipe, f, new TakeWhileBuffer[T](fn, conv), Fields.REPLACE))
  }

  // This is convenience method to allow plugging in blocks of group operations
  // similar to RichPipe.then
  def then(fn : (GroupBuilder) => GroupBuilder) = fn(this)

  // Equivalent to sorting by a comparison function
  // then take-ing k items.  This is MUCH more efficient than doing a total sort followed by a take,
  // since these bounded sorts are done on the mapper, so only a sort of size k is needed.
  // example:
  // sortWithTake(('clicks, 'tweet) -> 'topClicks, 5) { (t0 : (Long,Long), t1 : (Long,Long)) => t0._1 < t1._1 }
  // topClicks will be a List[(Long,Long)]
  def sortWithTake[T:TupleConverter](f : (Fields, Fields), k : Int)(lt : (T,T) => Boolean) : GroupBuilder = {
    assert(f._2.size == 1, "output field size must be 1")
    mapReduceMap(f) /* map1 */ { (tup : T) => List(tup) }
    /* reduce */ { (l1 : List[T], l2 : List[T]) =>
      mergeSorted(l1, l2, lt, k)
    } /* map2 */ {
      (lout : List[T]) => lout
    }
  }

  // Reverse of above when the implicit ordering makes sense.
  def sortedReverseTake[T](f : (Fields, Fields), k : Int)
    (implicit conv : TupleConverter[T], ord : Ordering[T]) : GroupBuilder = {
    sortWithTake(f,k) { (t0:T,t1:T) => ord.gt(t0,t1) }
  }

  // Same as above but useful when the implicit ordering makes sense.
  def sortedTake[T](f : (Fields, Fields), k : Int)
    (implicit conv : TupleConverter[T], ord : Ordering[T]) : GroupBuilder = {
    sortWithTake(f,k) { (t0:T,t1:T) => ord.lt(t0,t1) }
  }
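
  // Illustrative usage (a sketch; fields are hypothetical): the top 5 (clicks, tweet)
  // pairs per user, largest clicks first:
  //   pipe.groupBy('user) { _.sortedReverseTake[(Long,Long)](('clicks, 'tweet) -> 'top, 5) }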
}

/*
 * These functions need to be serializable.  GroupBuilder has state and is a larger
 * object, so it is better to keep these pure reduce functions in a small separate object.
 */
object CommonReduceFunctions extends java.io.Serializable {
  /*
   * Merge two sorted lists, keeping at most k elements (k = -1 means keep all).
   */
  final def mergeSorted[T](v1 : List[T], v2 : List[T], lt : (T,T) => Boolean, k : Int = -1) : List[T] = {
    @tailrec
    //This is the internal loop that does one comparison:
    def mergeSortR(acc : List[T], list1 : List[T], list2 : List[T], k : Int) : List[T] = {
      (list1, list2, k) match {
        case (_,_,0) => acc
        case (x1 :: t1, x2 :: t2, _) => {
          if( lt(x1,x2) ) {
            mergeSortR(x1 :: acc, t1, list2, k-1)
          }
          else {
            mergeSortR(x2 :: acc, list1, t2, k-1)
          }
        }
        case (x1 :: t1, Nil, _) => mergeSortR(x1 :: acc, t1, Nil, k-1)
        case (Nil, x2 :: t2, _) => mergeSortR(x2 :: acc, Nil, t2, k-1)
        case (Nil, Nil, _) => acc
      }
    }
    mergeSortR(Nil, v1, v2, k).reverse
  }
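
  // Worked example: mergeSorted(List(1,3,5), List(2,4), (a : Int, b : Int) => a < b, 3)
  // returns List(1,2,3): the merge proceeds in sorted order and stops after k = 3 elements.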
}



