/*
Copyright 2012 Twitter, Inc.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package com.twitter.scalding {

import cascading.operation._
import cascading.tuple._
import cascading.flow._
import cascading.pipe.assembly.AggregateBy
import cascading.pipe._

import scala.collection.JavaConverters._

import org.apache.hadoop.conf.Configuration

import com.esotericsoftware.kryo.Kryo
object CascadingUtils {
  def flowProcessToConfiguration(fp : FlowProcess[_]) : Configuration = {
    val confCopy = fp.asInstanceOf[FlowProcess[AnyRef]].getConfigCopy
    if (confCopy.isInstanceOf[Configuration]) {
      confCopy.asInstanceOf[Configuration]
    }
    else {
      // For local mode, we don't have a hadoop configuration
      val conf = new Configuration()
      fp.getPropertyKeys.asScala.foreach { key =>
        conf.set(key, fp.getStringProperty(key))
      }
      conf
    }
  }

  def kryoFor(fp : FlowProcess[_]) : Kryo = {
    (new cascading.kryo.KryoSerialization(flowProcessToConfiguration(fp)))
      .populatedKryo
  }
}
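// A minimal sketch (not part of the original source) of how these helpers are
// used from inside an operation: the populated Kryo gives a private deep copy of
// a shared seed value, so mutable state is never shared across calls. The
// FlowProcess value `fp` is assumed to come from a surrounding operate() call.
//
//   val seed = scala.collection.mutable.Map[String, Int]()
//   val privateCopy = CascadingUtils.kryoFor(fp).copy(seed)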
import CascadingUtils.kryoFor
class FlatMapFunction[S,T](fn : S => Iterable[T], fields : Fields,
  conv : TupleConverter[S], set : TupleSetter[T])
  extends BaseOperation[Any](fields) with Function[Any] {
  def operate(flowProcess : FlowProcess[_], functionCall : FunctionCall[Any]) {
    fn(conv(functionCall.getArguments)).foreach { arg : T =>
      val this_tup = set(arg)
      functionCall.getOutputCollector.add(this_tup)
    }
  }
}
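// A usage sketch (not in the original source): wiring FlatMapFunction into a raw
// Cascading Each pipe to split lines into words. The pipe `lines`, the field
// names, and the `singleStringConverter` / `singleStringSetter` values are
// illustrative assumptions; in scalding these converters and setters are normally
// supplied implicitly via RichPipe.flatMap rather than written by hand.
//
//   val splitter = new FlatMapFunction[String, String](
//     { line => line.split("\\s+").toSeq },  // one output tuple per word
//     new Fields("word"),
//     singleStringConverter,                 // assumed TupleConverter[String]
//     singleStringSetter)                    // assumed TupleSetter[String]
//   val words : Pipe = new Each(lines, new Fields("line"), splitter)
//
// MapFunction and FilterFunction below plug into an Each pipe in the same way.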
class MapFunction[S,T](fn : S => T, fields : Fields,
  conv : TupleConverter[S], set : TupleSetter[T])
  extends BaseOperation[Any](fields) with Function[Any] {
  def operate(flowProcess : FlowProcess[_], functionCall : FunctionCall[Any]) {
    val res = fn(conv(functionCall.getArguments))
    functionCall.getOutputCollector.add(set(res))
  }
}
class FilterFunction[T](fn : T => Boolean, conv : TupleConverter[T])
  extends BaseOperation[Any] with Filter[Any] {
  def isRemove(flowProcess : FlowProcess[_], filterCall : FilterCall[Any]) = {
    !fn(conv(filterCall.getArguments))
  }
}
// All the following are operations for use in GroupBuilder
class FoldAggregator[T,X](fn : (X,T) => X, init : X, fields : Fields,
  conv : TupleConverter[T], set : TupleSetter[X])
  extends BaseOperation[X](fields) with Aggregator[X] {

  def start(flowProcess : FlowProcess[_], call : AggregatorCall[X]) {
    val deepCopyInit = kryoFor(flowProcess).copy(init)
    call.setContext(deepCopyInit)
  }

  def aggregate(flowProcess : FlowProcess[_], call : AggregatorCall[X]) {
    val left = call.getContext
    val right = conv(call.getArguments)
    call.setContext(fn(left, right))
  }

  def complete(flowProcess : FlowProcess[_], call : AggregatorCall[X]) {
    emit(flowProcess, call)
  }

  def emit(flowProcess : FlowProcess[_], call : AggregatorCall[X]) {
    call.getOutputCollector.add(set(call.getContext))
  }
}
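// A usage sketch (not in the original source): applying FoldAggregator after a
// Cascading GroupBy, which is roughly how GroupBuilder-style foldLeft is wired.
// The pipe `events`, the field names, and the `intConverter` / `longSetter`
// values are illustrative assumptions. Because start() deep-copies `init` with
// Kryo, even a mutable seed is safe to reuse across groups.
//
//   val sumFold = new FoldAggregator[Int, Long](
//     { (acc, x) => acc + x },  // fold function: (X, T) => X
//     0L,                       // init, copied per group
//     new Fields("total"),
//     intConverter,             // assumed TupleConverter[Int]
//     longSetter)               // assumed TupleSetter[Long]
//   val grouped = new GroupBy(events, new Fields("user"))
//   val totals = new Every(grouped, new Fields("value"), sumFold)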
/*
 * fields are the declared fields of this aggregator
 */
class MRMAggregator[T,X,U](fsmf : T => X, rfn : (X,X) => X, mrfn : X => U, fields : Fields,
  conv : TupleConverter[T], set : TupleSetter[U])
  extends BaseOperation[Tuple](fields) with Aggregator[Tuple] {

  // The context is a singleton Tuple, which is mutable so
  // we don't have to allocate at every step of the loop:
  def start(flowProcess : FlowProcess[_], call : AggregatorCall[Tuple]) {
    call.setContext(null)
  }

  def extractArgument(call : AggregatorCall[Tuple]) : X = fsmf(conv(call.getArguments))

  def aggregate(flowProcess : FlowProcess[_], call : AggregatorCall[Tuple]) {
    val arg = extractArgument(call)
    val ctx = call.getContext
    if (null == ctx) {
      // Initialize the context, this is the only allocation done by this loop.
      val newCtx = Tuple.size(1)
      newCtx.set(0, arg.asInstanceOf[AnyRef])
      call.setContext(newCtx)
    }
    else {
      // Mutate the context:
      val oldValue = ctx.getObject(0).asInstanceOf[X]
      val newValue = rfn(oldValue, arg)
      ctx.set(0, newValue.asInstanceOf[AnyRef])
    }
  }

  def complete(flowProcess : FlowProcess[_], call : AggregatorCall[Tuple]) {
    val ctx = call.getContext
    if (null != ctx) {
      val lastValue = ctx.getObject(0).asInstanceOf[X]
      call.getOutputCollector.add(set(mrfn(lastValue)))
    }
    else {
      throw new Exception("MRMAggregator completed without any args")
    }
  }
}
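// A worked sketch (not in the original source) of the map-reduce-map shape this
// aggregator implements, using a mean as the example: fsmf lifts each value into
// a (sum, count) pair, rfn combines pairs associatively, and mrfn finishes by
// dividing. The field name and the `doubleConverter` / `doubleSetter` values are
// illustrative assumptions.
//
//   val mean = new MRMAggregator[Double, (Double, Long), Double](
//     { x => (x, 1L) },                          // fsmf : T => X
//     { (a, b) => (a._1 + b._1, a._2 + b._2) },  // rfn : (X, X) => X, associative
//     { case (sum, n) => sum / n },              // mrfn : X => U
//     new Fields("mean"),
//     doubleConverter,                           // assumed TupleConverter[Double]
//     doubleSetter)                              // assumed TupleSetter[Double]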
/**
 * This handles the mapReduceMap work on the map-side of the operation. The code below
 * attempts to be optimal with respect to memory allocations and performance, not functional
 * style purity.
 */
abstract class FoldFunctor[X](fields : Fields) extends AggregateBy.Functor {

  // Extend these three methods:
  def first(args : TupleEntry) : X
  def subsequent(oldValue : X, newArgs : TupleEntry) : X
  def finish(lastValue : X) : Tuple

  override final def getDeclaredFields = fields

  /*
   * It's important to keep all state in the context, as Cascading seems to
   * reuse these objects, so any per-instance state might give unexpected
   * results.
   */
  override final def aggregate(flowProcess : FlowProcess[_], args : TupleEntry, context : Tuple) = {
    var nextContext : Tuple = null
    val newContextObj = if (null == context) {
      // First call, make a new mutable tuple to reduce allocations:
      nextContext = Tuple.size(1)
      first(args)
    }
    else {
      // We are updating
      val oldValue = context.getObject(0).asInstanceOf[X]
      nextContext = context
      subsequent(oldValue, args)
    }
    nextContext.set(0, newContextObj.asInstanceOf[AnyRef])
    // Return the context for reuse next time:
    nextContext
  }

  override final def complete(flowProcess : FlowProcess[_], context : Tuple) = {
    if (null == context) {
      throw new Exception("FoldFunctor completed without any aggregate calls")
    }
    else {
      finish(context.getObject(0).asInstanceOf[X])
    }
  }
}
/**
 * This handles the mapReduceMap work on the map-side of the operation. The code below
 * attempts to be optimal with respect to memory allocations and performance, not functional
 * style purity.
 */
class MRMFunctor[T,X](mrfn : T => X, rfn : (X, X) => X, fields : Fields,
  conv : TupleConverter[T], set : TupleSetter[X])
  extends FoldFunctor[X](fields) {

  override def first(args : TupleEntry) : X = mrfn(conv(args))

  override def subsequent(oldValue : X, newArgs : TupleEntry) = {
    val right = mrfn(conv(newArgs))
    rfn(oldValue, right)
  }

  override def finish(lastValue : X) = set(lastValue)
}
/**
 * MapReduceMapBy Class
 */
class MRMBy[T,X,U](arguments : Fields,
  middleFields : Fields,
  declaredFields : Fields,
  mfn : T => X,
  rfn : (X,X) => X,
  mfn2 : X => U,
  startConv : TupleConverter[T],
  midSet : TupleSetter[X],
  midConv : TupleConverter[X],
  endSet : TupleSetter[U]) extends AggregateBy(
    arguments,
    new MRMFunctor[T,X](mfn, rfn, middleFields, startConv, midSet),
    new MRMAggregator[X,X,U](args => args, rfn, mfn2, declaredFields, midConv, endSet))
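// A usage sketch (not in the original source): MRMBy is a Cascading AggregateBy,
// so its MRMFunctor runs map-side as a partial aggregation and its MRMAggregator
// finishes on the reduce side. Continuing the mean example above; the pipe
// `events`, the field names, and the converter/setter values (the intermediate
// field holds the (sum, count) pair as a single object) are illustrative
// assumptions.
//
//   val meanBy = new MRMBy[Double, (Double, Long), Double](
//     new Fields("value"),        // arguments read from each input tuple
//     new Fields("sumAndCount"),  // intermediate, map-side field
//     new Fields("mean"),         // final declared field
//     { x => (x, 1L) },
//     { (a, b) => (a._1 + b._1, a._2 + b._2) },
//     { case (sum, n) => sum / n },
//     doubleConverter, pairSetter, pairConverter, doubleSetter)  // assumed instances
//   val meanPerUser = new AggregateBy(events, new Fields("user"), meanBy)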
class BufferOp[I,T,X](init : I, iterfn : (I, Iterator[T]) => TraversableOnce[X], fields : Fields,
  conv : TupleConverter[T], set : TupleSetter[X])
  extends BaseOperation[Any](fields) with Buffer[Any] {
  def operate(flowProcess : FlowProcess[_], call : BufferCall[Any]) {
    val deepCopyInit = kryoFor(flowProcess).copy(init)
    val oc = call.getOutputCollector
    val in = call.getArgumentsIterator.asScala.map { entry => conv(entry) }
    iterfn(deepCopyInit, in).foreach { x => oc.add(set(x)) }
  }
}
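// A usage sketch (not in the original source): BufferOp streams an entire group
// through `iterfn`, so it suits operations like "take the first N values per
// group". The pipe `grouped` (a GroupBy), the field names, and the
// `intConverter` / `intSetter` values are illustrative assumptions. Note that
// `init` is deep-copied with Kryo per call, just like in FoldAggregator.
//
//   val takeN = new BufferOp[Int, Int, Int](
//     2,                                 // init: how many values to keep
//     { (n, it) => it.take(n).toList },  // iterfn over the group's iterator
//     new Fields("kept"),
//     intConverter,                      // assumed TupleConverter[Int]
//     intSetter)                         // assumed TupleSetter[Int]
//   val firstTwo = new Every(grouped, new Fields("value"), takeN)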
/*
 * fields are the declared fields of this aggregator
 */
class ExtremumAggregator(choose_max : Boolean, fields : Fields)
  extends BaseOperation[Tuple](fields) with Aggregator[Tuple] {

  def start(flowProcess : FlowProcess[_], call : AggregatorCall[Tuple]) {
    call.setContext(null)
  }

  private def getArgs(call : AggregatorCall[Tuple]) = call.getArguments.getTuple

  def aggregate(flowProcess : FlowProcess[_], call : AggregatorCall[Tuple]) = {
    val arg = getArgs(call)
    val ctx = call.getContext
    if (null == ctx) {
      call.setContext(arg)
    }
    else {
      val (max, min) = if (ctx.compareTo(arg) < 0) {
        (arg, ctx)
      } else { (ctx, arg) }
      call.setContext(if (choose_max) max else min)
    }
  }

  def complete(flowProcess : FlowProcess[_], call : AggregatorCall[Tuple]) {
    val ctx = call.getContext
    if (null != ctx) {
      call.getOutputCollector.add(ctx)
    }
    else {
      throw new Exception("ExtremumAggregator completed without any args")
    }
  }
}
class ExtremumFunctor(choose_max : Boolean, fields : Fields) extends AggregateBy.Functor {
  override def getDeclaredFields = fields

  def aggregate(flowProcess : FlowProcess[_], args : TupleEntry, context : Tuple) = {
    val this_tup = args.getTuple
    if (context == null) { this_tup }
    else {
      val (max, min) = if (context.compareTo(this_tup) < 0) {
        (this_tup, context)
      } else { (context, this_tup) }
      if (choose_max) max else min
    }
  }

  def complete(flowProcess : FlowProcess[_], context : Tuple) = context
}

class ExtremumBy(choosemax : Boolean, arguments : Fields, result : Fields) extends AggregateBy(
  arguments,
  new ExtremumFunctor(choosemax, result),
  new ExtremumAggregator(choosemax, result))
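// A usage sketch (not in the original source): ExtremumBy composes the map-side
// ExtremumFunctor with the reduce-side ExtremumAggregator to keep a single
// extreme value per group. The pipe `events` and the field names are
// illustrative assumptions.
//
//   val maxScore = new ExtremumBy(true, new Fields("score"), new Fields("maxScore"))
//   val maxPerUser = new AggregateBy(events, new Fields("user"), maxScore)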
}