/*
 * com.twitter.algebird.Aggregator.scala Maven / Gradle / Ivy
 *
 * Go to download
 * Show more of this group Show more artifacts with this name
 * Show all versions of algebird-core_2.10 Show documentation
 * algebird-core
 * There is a newer version: 0.12.2
 */
package com.twitter.algebird

import java.util.PriorityQueue
import scala.collection.generic.CanBuildFrom

/**
 * Factory methods for commonly used [[Aggregator]] instances.
 *
 * Aggregators compose well.
 *
 * To create a parallel aggregator that operates on a single
 * input in parallel, use:
 * GeneratedTupleAggregator.from2((agg1, agg2))
 */
object Aggregator extends java.io.Serializable {
  /**
   * Applicative instance for aggregators over a fixed input type `I`.
   * The middle (reduction) type is hidden behind an existential.
   */
  implicit def applicative[I]: Applicative[({ type L[O] = Aggregator[I, _, O] })#L] = new AggregatorApplicative[I]

  /** Seed used by the sampling aggregators when the caller does not supply one */
  private val DefaultSeed = 471312384

  /**
   * This is a trivial aggregator that always returns a single value
   */
  def const[T](t: T): MonoidAggregator[Any, Unit, T] =
    prepareMonoid { _: Any => () }
      .andThenPresent(_ => t)

  /**
   * Build an aggregator from a reduce function. Use composePrepare and
   * andThenPresent on the result to add preparation or presentation steps.
   */
  def fromReduce[T](red: (T, T) => T): Aggregator[T, T, T] = fromSemigroup(Semigroup.from(red))

  /**
   * An aggregator that combines its inputs with the given [[Semigroup]];
   * prepare and present are both the identity.
   */
  def fromSemigroup[T](implicit sg: Semigroup[T]): Aggregator[T, T, T] = new Aggregator[T, T, T] {
    def prepare(input: T) = input
    def semigroup = sg
    def present(reduction: T) = reduction
  }

  /**
   * An aggregator that combines its inputs with the given [[Monoid]];
   * prepare and present are both the identity.
   */
  def fromMonoid[T](implicit mon: Monoid[T]): MonoidAggregator[T, T, T] = prepareMonoid(identity[T])

  // Uses the product from the ring
  def fromRing[T](implicit rng: Ring[T]): RingAggregator[T, T, T] = fromRing[T, T](rng, identity[T])

  /**
   * Like [[prepareMonoid]], but takes both the [[Monoid]] and the preparation
   * function from a single (implicit) parameter list.
   */
  def fromMonoid[F, T](implicit mon: Monoid[T], prep: F => T): MonoidAggregator[F, T, T] =
    prepareMonoid(prep)(mon)

  /**
   * Map each input with `prep`, then combine the results with the [[Semigroup]].
   * The reduced value is presented unchanged.
   */
  def prepareSemigroup[F, T](prep: F => T)(implicit sg: Semigroup[T]): Aggregator[F, T, T] = new Aggregator[F, T, T] {
    def prepare(input: F) = prep(input)
    def semigroup = sg
    def present(reduction: T) = reduction
  }

  /**
   * Map each input with `prep`, then combine the results with the [[Monoid]].
   * The reduced value is presented unchanged.
   */
  def prepareMonoid[F, T](prep: F => T)(implicit m: Monoid[T]): MonoidAggregator[F, T, T] = new MonoidAggregator[F, T, T] {
    def prepare(input: F) = prep(input)
    def monoid = m
    def present(reduction: T) = reduction
  }

  // Uses the product from the ring
  def fromRing[F, T](implicit rng: Ring[T], prep: F => T): RingAggregator[F, T, T] = new RingAggregator[F, T, T] {
    def prepare(input: F) = prep(input)
    def ring = rng
    def present(reduction: T) = reduction
  }

  /**
   * Obtain an [[Aggregator]] that uses an efficient append operation for faster aggregation.
   * Equivalent to {{{ appendSemigroup(prep, appnd, identity[T]_)(sg) }}}
   */
  def appendSemigroup[F, T](prep: F => T, appnd: (T, F) => T)(implicit sg: Semigroup[T]): Aggregator[F, T, T] =
    appendSemigroup(prep, appnd, identity[T]_)(sg)

  /**
   * Obtain an [[Aggregator]] that uses an efficient append operation for faster aggregation
   * @tparam F Data input type
   * @tparam T Aggregating [[Semigroup]] type
   * @tparam P Presentation (output) type
   * @param prep The preparation function.  Expected to construct an instance of type T from a single data element.
   * @param appnd Function that appends the [[Semigroup]].  Defines the [[append]] method for this aggregator.
   * Analogous to the 'seqop' function in Scala's sequence 'aggregate' method
   * @param pres The presentation function
   * @param sg The [[Semigroup]] type class
   * @note The functions 'appnd' and 'prep' are expected to obey the law: {{{ appnd(t, f) == sg.plus(t, prep(f)) }}}
   */
  def appendSemigroup[F, T, P](prep: F => T, appnd: (T, F) => T, pres: T => P)(implicit sg: Semigroup[T]): Aggregator[F, T, P] =
    new Aggregator[F, T, P] {
      def semigroup: Semigroup[T] = sg
      def prepare(input: F): T = prep(input)
      def present(reduction: T): P = pres(reduction)

      // Like the default apply, this fails on empty input (there is no zero to fall back on).
      override def apply(inputs: TraversableOnce[F]): P = applyOption(inputs).get

      override def applyOption(inputs: TraversableOnce[F]): Option[P] = agg(inputs).map(pres)

      override def append(l: T, r: F): T = appnd(l, r)

      override def appendAll(old: T, items: TraversableOnce[F]): T =
        if (items.isEmpty) old else reduce(old, agg(items).get)

      // Prepare only the first element, then fold the remaining ones in with appnd.
      private def agg(inputs: TraversableOnce[F]): Option[T] =
        if (inputs.isEmpty) None else {
          val itr = inputs.toIterator
          val t = prepare(itr.next)
          Some(itr.foldLeft(t)(appnd))
        }
    }

  /**
   * Obtain a [[MonoidAggregator]] that uses an efficient append operation for faster aggregation.
   * Equivalent to {{{ appendMonoid(appnd, identity[T]_)(m) }}}
   */
  def appendMonoid[F, T](appnd: (T, F) => T)(implicit m: Monoid[T]): MonoidAggregator[F, T, T] =
    appendMonoid(appnd, identity[T]_)(m)

  /**
   * Obtain a [[MonoidAggregator]] that uses an efficient append operation for faster aggregation
   * @tparam F Data input type
   * @tparam T Aggregating [[Monoid]] type
   * @tparam P Presentation (output) type
   * @param appnd Function that appends the [[Monoid]].  Defines the [[append]] method for this aggregator.
   * Analogous to the 'seqop' function in Scala's sequence 'aggregate' method
   * @param pres The presentation function
   * @param m The [[Monoid]] type class
   * @note The function 'appnd' is expected to obey the law: {{{ appnd(t, f) == m.plus(t, appnd(m.zero, f)) }}}
   */
  def appendMonoid[F, T, P](appnd: (T, F) => T, pres: T => P)(implicit m: Monoid[T]): MonoidAggregator[F, T, P] =
    new MonoidAggregator[F, T, P] {
      def monoid: Monoid[T] = m
      // A single element is prepared by appending it to the monoid's zero.
      def prepare(input: F): T = appnd(m.zero, input)
      def present(reduction: T): P = pres(reduction)

      override def apply(inputs: TraversableOnce[F]): P = present(agg(inputs))

      override def applyOption(inputs: TraversableOnce[F]): Option[P] =
        if (inputs.isEmpty) None else Some(apply(inputs))

      override def append(l: T, r: F): T = appnd(l, r)

      override def appendAll(old: T, items: TraversableOnce[F]): T = reduce(old, agg(items))

      override def appendAll(items: TraversableOnce[F]): T = agg(items)

      private def agg(inputs: TraversableOnce[F]): T = inputs.aggregate(m.zero)(appnd, m.plus)
    }

  /**
   * How many items satisfy a predicate
   */
  def count[T](pred: T => Boolean): MonoidAggregator[T, Long, Long] =
    prepareMonoid { t: T => if (pred(t)) 1L else 0L }

  /**
   * Do any items satisfy some predicate
   */
  def exists[T](pred: T => Boolean): MonoidAggregator[T, Boolean, Boolean] =
    prepareMonoid(pred)(OrVal.unboxedMonoid)

  /**
   * Do all items satisfy a predicate
   */
  def forall[T](pred: T => Boolean): MonoidAggregator[T, Boolean, Boolean] =
    prepareMonoid(pred)(AndVal.unboxedMonoid)

  /**
   * Take the first (left most in reduce order) item found
   */
  def head[T]: Aggregator[T, T, T] = fromReduce[T] { (l, r) => l }

  /**
   * Take the last (right most in reduce order) item found
   */
  def last[T]: Aggregator[T, T, T] = fromReduce[T] { (l, r) => r }

  /**
   * Get the maximum item
   */
  def max[T: Ordering]: Aggregator[T, T, T] = new MaxAggregator[T]

  /**
   * Get the item for which `fn` yields the maximum value
   */
  def maxBy[U, T: Ordering](fn: U => T): Aggregator[U, U, U] = {
    // Explicit type: implicit definitions should always declare their type.
    implicit val ordU: Ordering[U] = Ordering.by(fn)
    max[U]
  }

  /**
   * Get the minimum item
   */
  def min[T: Ordering]: Aggregator[T, T, T] = new MinAggregator[T]

  /**
   * Get the item for which `fn` yields the minimum value
   */
  def minBy[U, T: Ordering](fn: U => T): Aggregator[U, U, U] = {
    // Explicit type: implicit definitions should always declare their type.
    implicit val ordU: Ordering[U] = Ordering.by(fn)
    min[U]
  }

  /**
   * This returns the number of items we find
   */
  def size: MonoidAggregator[Any, Long, Long] =
    prepareMonoid { (_: Any) => 1L }

  /**
   * Take the smallest `count` items using a heap
   */
  def sortedTake[T: Ordering](count: Int): MonoidAggregator[T, PriorityQueue[T], Seq[T]] =
    new mutable.PriorityQueueToListAggregator[T](count)

  /**
   * Same as sortedTake, but using a function that returns a value that has an Ordering.
   *
   * This function is like writing list.sortBy(fn).take(count).
   */
  def sortByTake[T, U: Ordering](count: Int)(fn: T => U): MonoidAggregator[T, PriorityQueue[T], Seq[T]] =
    Aggregator.sortedTake(count)(Ordering.by(fn))

  /**
   * Take the largest `count` items using a heap
   */
  def sortedReverseTake[T: Ordering](count: Int): MonoidAggregator[T, PriorityQueue[T], Seq[T]] =
    new mutable.PriorityQueueToListAggregator[T](count)(implicitly[Ordering[T]].reverse)

  /**
   * Same as sortedReverseTake, but using a function that returns a value that has an Ordering.
   *
   * This function is like writing list.sortBy(fn).reverse.take(count).
   */
  def sortByReverseTake[T, U: Ordering](count: Int)(fn: T => U): MonoidAggregator[T, PriorityQueue[T], Seq[T]] =
    Aggregator.sortedReverseTake(count)(Ordering.by(fn))

  /**
   * Immutable version of sortedTake, for frameworks that check immutability of reduce functions.
   */
  def immutableSortedTake[T: Ordering](count: Int): MonoidAggregator[T, TopK[T], Seq[T]] =
    new TopKToListAggregator[T](count)

  /**
   * Immutable version of sortedReverseTake, for frameworks that check immutability of reduce functions.
   */
  def immutableSortedReverseTake[T: Ordering](count: Int): MonoidAggregator[T, TopK[T], Seq[T]] =
    new TopKToListAggregator[T](count)(implicitly[Ordering[T]].reverse)

  /**
   * Randomly selects input items where each item has an independent probability 'prob' of being
   * selected. This assumes that all sampled records can fit in memory, so use this only when the
   * expected number of sampled values is small.
   */
  def randomSample[T](prob: Double, seed: Int = DefaultSeed): MonoidAggregator[T, Option[Batched[T]], List[T]] = {
    assert(prob >= 0 && prob <= 1, "randomSample.prob must lie in [0, 1]")
    val rng = new java.util.Random(seed)
    Preparer[T]
      // nextDouble() is uniform on [0, 1), so the comparison must be strict:
      // prob == 0 then never selects and prob == 1 always selects.
      .filter(_ => rng.nextDouble() < prob)
      .monoidAggregate(toList)
  }

  /**
   * Selects exactly 'count' of the input records randomly (or all of the records if there are fewer
   * than 'count' total records). This assumes that all 'count' of the records can fit in memory,
   * so use this only for small values of 'count'.
   */
  def reservoirSample[T](count: Int, seed: Int = DefaultSeed): MonoidAggregator[T, PriorityQueue[(Double, T)], Seq[T]] = {
    val rng = new java.util.Random(seed)
    Preparer[T]
      // Tag every record with a random key, keep the `count` smallest keys, then drop the keys.
      .map(rng.nextDouble() -> _)
      .monoidAggregate(sortByTake(count)(_._1))
      .andThenPresent(_.map(_._2))
  }

  /**
   * Put everything in a List. Note, this could fill the memory if the List is very large.
   */
  def toList[T]: MonoidAggregator[T, Option[Batched[T]], List[T]] =
    new MonoidAggregator[T, Option[Batched[T]], List[T]] {
      def prepare(t: T): Option[Batched[T]] = Some(Batched(t))
      def monoid: Monoid[Option[Batched[T]]] = Monoid.optionMonoid(Batched.semigroup)
      // No input at all (None) is presented as the empty list.
      def present(o: Option[Batched[T]]): List[T] = o.map(_.toList).getOrElse(Nil)
    }

  /**
   * Put everything in a Set. Note, this could fill the memory if the Set is very large.
   */
  def toSet[T]: MonoidAggregator[T, Set[T], Set[T]] =
    prepareMonoid { t: T => Set(t) }

  /**
   * This builds an in-memory Set, and then finally gets the size of that set.
   * This may not be scalable if the Uniques are very large. You might check the
   * approximateUniqueCount or HyperLogLog Aggregator to get an approximate version
   * of this that is scalable.
   */
  def uniqueCount[T]: MonoidAggregator[T, Set[T], Int] =
    toSet[T].andThenPresent(_.size)

  /**
   * Using a constant amount of memory, give an approximate unique count (~ 1% error).
   * This uses an exact set for up to 100 items,
   * then HyperLogLog (HLL) with an 1.2% standard error which uses at most 8192 bytes
   * for each HLL. For more control, see HyperLogLogAggregator.
   */
  def approximateUniqueCount[T: Hash128]: MonoidAggregator[T, Either[HLL, Set[T]], Long] =
    SetSizeHashAggregator[T](hllBits = 13, maxSetSize = 100)

  /**
   * Returns the lower bound of a given percentile where the percentile is between (0,1]
   * The items that are iterated over cannot be negative.
   */
  def approximatePercentile[T](percentile: Double, k: Int = QTreeAggregator.DefaultK)(implicit num: Numeric[T]): QTreeAggregatorLowerBound[T] =
    QTreeAggregatorLowerBound[T](percentile, k)

  /**
   * Returns the intersection of a bounded percentile where the percentile is between (0,1]
   * The items that are iterated over cannot be negative.
   */
  def approximatePercentileBounds[T](percentile: Double, k: Int = QTreeAggregator.DefaultK)(implicit num: Numeric[T]): QTreeAggregator[T] =
    QTreeAggregator[T](percentile, k)
}

/**
 * This is a type that models map/reduce(map). First each item is mapped,
 * then we reduce with a semigroup, then finally we present the results.
 *
 * Unlike Fold, Aggregator keeps its middle aggregation type externally visible.
 * This is because Aggregators are useful in parallel map/reduce systems where
 * there may be some additional types needed to cross the map/reduce boundary
 * (such as serialization and intermediate storage). If you don't care about the
 * middle type, an _ may be used and the main utility of the instance is still
 * preserved (e.g. def operate[T](ag: Aggregator[T, _, Int]): Int)
 *
 * Note, join is very useful to combine multiple aggregations with one pass.
 * Also GeneratedTupleAggregator.fromN((agg1, agg2, ... aggN)) can glue these
 * together well.
 *
 * This type is the Fold.M from Haskell's fold package:
 * https://hackage.haskell.org/package/folds-0.6.2/docs/Data-Fold-M.html
 */
trait Aggregator[-A, B, +C] extends java.io.Serializable { self =>
  /** Map a single input into the intermediate (reduction) type */
  def prepare(input: A): B
  /** The semigroup used to combine intermediate values */
  def semigroup: Semigroup[B]
  /** Map a reduced intermediate value into the output type */
  def present(reduction: B): C

  /* *****
   * Everything below is derived from the three members above
   */

  /**
   * Combine two intermediate values
   */
  def reduce(l: B, r: B): B = semigroup.plus(l, r)

  /**
   * Combine all the given intermediate values.
   * This may error if items is empty; use reduceOption when the items
   * are not known to be non-empty.
   */
  def reduce(items: TraversableOnce[B]): B = {
    val summed = semigroup.sumOption(items)
    summed.get
  }

  /**
   * The safe version of reduce: None if the input is empty,
   * otherwise the reduction of the items
   */
  def reduceOption(items: TraversableOnce[B]): Option[B] = semigroup.sumOption(items)

  /**
   * Prepare, reduce and present the inputs.
   * This may error if inputs are empty (for Monoid Aggregators it never will, instead
   * you see present(Monoid.zero[B])
   */
  def apply(inputs: TraversableOnce[A]): C = {
    val prepared = inputs.map(prepare)
    present(reduce(prepared))
  }

  /**
   * Like apply, but returns None if the inputs are empty
   */
  def applyOption(inputs: TraversableOnce[A]): Option[C] =
    for (reduced <- reduceOption(inputs.map(prepare))) yield present(reduced)

  /**
   * This returns the cumulative sum of its inputs, in the same order.
   * If the inputs are empty, the result will be empty too.
   */
  def cumulativeIterator(inputs: Iterator[A]): Iterator[C] = {
    // The running state starts out empty; the first input is prepared and
    // every later input is appended to the state accumulated so far.
    val running = inputs.scanLeft(Option.empty[B]) { (acc, a) =>
      acc match {
        case Some(b) => Some(append(b, a))
        case None => Some(prepare(a))
      }
    }
    // Skips the initial empty state, so no output is produced for empty input.
    running.collect { case Some(b) => present(b) }
  }

  /**
   * This returns the cumulative sum of its inputs, in the same order.
   * If the inputs are empty, the result will be empty too.
   */
  def applyCumulatively[In <: TraversableOnce[A], Out](inputs: In)(implicit bf: CanBuildFrom[In, C, Out]): Out =
    bf().++=(cumulativeIterator(inputs.toIterator)).result()

  /** Add one more input to an existing intermediate value */
  def append(l: B, r: A): B = reduce(l, prepare(r))

  /** Add all the given inputs to an existing intermediate value */
  def appendAll(old: B, items: TraversableOnce[A]): B =
    if (items.isEmpty) old
    else {
      val added = reduce(items.map(prepare))
      reduce(old, added)
    }

  /** Like calling andThen on the present function */
  def andThenPresent[D](present2: C => D): Aggregator[A, B, D] =
    new Aggregator[A, B, D] {
      def prepare(a: A): B = self.prepare(a)
      def semigroup: Semigroup[B] = self.semigroup
      def present(b: B): D = present2(self.present(b))
    }

  /** Like calling compose on the prepare function */
  def composePrepare[A1](prepare2: A1 => A): Aggregator[A1, B, C] =
    new Aggregator[A1, B, C] {
      def prepare(a1: A1): B = self.prepare(prepare2(a1))
      def semigroup: Semigroup[B] = self.semigroup
      def present(b: B): C = self.present(b)
    }

  /**
   * This allows you to run two aggregators on the same data with a single pass
   */
  def join[A2 <: A, B2, C2](that: Aggregator[A2, B2, C2]): Aggregator[A2, (B, B2), (C, C2)] =
    GeneratedTupleAggregator.from2((this, that))

  /**
   * This allows you to join two aggregators into one that takes a tuple input,
   * which in turn allows you to chain .composePrepare onto the result if you have
   * an initial input that has to be prepared differently for each of the joined aggregators.
   *
   * The law here is: ag1.zip(ag2).apply(as.zip(bs)) == (ag1(as), ag2(bs))
   */
  def zip[A2, B2, C2](ag2: Aggregator[A2, B2, C2]): Aggregator[(A, A2), (B, B2), (C, C2)] =
    new Aggregator[(A, A2), (B, B2), (C, C2)] {
      def prepare(pair: (A, A2)): (B, B2) = (self.prepare(pair._1), ag2.prepare(pair._2))
      val semigroup = new Tuple2Semigroup[B, B2]()(self.semigroup, ag2.semigroup)
      def present(reduced: (B, B2)): (C, C2) = (self.present(reduced._1), ag2.present(reduced._2))
    }

  /**
   * An Aggregator can be converted to a Fold, but not vice-versa
   * Note, a Fold is more constrained so only do this if you require
   * joining a Fold with an Aggregator to produce a Fold
   */
  def toFold: Fold[A, Option[C]] =
    Fold.fold[Option[B], A, Option[C]](
      { (acc, a) =>
        acc match {
          case Some(b) => Some(self.append(b, a))
          case None => Some(self.prepare(a))
        }
      },
      None,
      { reduced => reduced.map(self.present(_)) })

  /**
   * Wrap the intermediate and output types in Option, which yields a
   * [[MonoidAggregator]] built on the OptionMonoid of this semigroup
   */
  def lift: MonoidAggregator[A, Option[B], Option[C]] =
    new MonoidAggregator[A, Option[B], Option[C]] {
      def monoid = new OptionMonoid[B]()(self.semigroup)
      def prepare(a: A): Option[B] = Some(self.prepare(a))
      def present(reduced: Option[B]): Option[C] = reduced.map(self.present)
    }
}

/**
 * Aggregators are Applicatives, but this hides the middle type. If you need a join that
 * does not hide the middle type use join on the trait, or GeneratedTupleAggregator.fromN
 *
 * @tparam I the input type shared by all the aggregators being combined
 */
class AggregatorApplicative[I] extends Applicative[({ type L[O] = Aggregator[I, _, O] })#L] {
  /** Transform the presented value of `mt` with `fn` */
  override def map[T, U](mt: Aggregator[I, _, T])(fn: T => U): Aggregator[I, _, U] =
    mt.andThenPresent(fn)

  /** An aggregator that ignores its input and always presents `v` */
  override def apply[T](v: T): Aggregator[I, _, T] =
    Aggregator.const(v)

  /** Run both aggregators over the same input and present the pair of results */
  override def join[T, U](mt: Aggregator[I, _, T], mu: Aggregator[I, _, U]): Aggregator[I, _, (T, U)] =
    mt.join(mu)

  // The fromN methods take a single tuple argument. The tuple is written out
  // explicitly rather than relying on the compiler's argument auto-tupling.
  override def join[T1, T2, T3](m1: Aggregator[I, _, T1],
    m2: Aggregator[I, _, T2],
    m3: Aggregator[I, _, T3]): Aggregator[I, _, (T1, T2, T3)] =
    GeneratedTupleAggregator.from3((m1, m2, m3))

  override def join[T1, T2, T3, T4](m1: Aggregator[I, _, T1],
    m2: Aggregator[I, _, T2],
    m3: Aggregator[I, _, T3],
    m4: Aggregator[I, _, T4]): Aggregator[I, _, (T1, T2, T3, T4)] =
    GeneratedTupleAggregator.from4((m1, m2, m3, m4))

  override def join[T1, T2, T3, T4, T5](m1: Aggregator[I, _, T1],
    m2: Aggregator[I, _, T2],
    m3: Aggregator[I, _, T3],
    m4: Aggregator[I, _, T4],
    m5: Aggregator[I, _, T5]): Aggregator[I, _, (T1, T2, T3, T4, T5)] =
    GeneratedTupleAggregator.from5((m1, m2, m3, m4, m5))
}

/**
 * An [[Aggregator]] whose intermediate type has a [[Monoid]]. The monoid is
 * also used as the aggregator's semigroup, and reducing a collection of
 * intermediate values goes through Monoid.sum.
 */
trait MonoidAggregator[-A, B, +C] extends Aggregator[A, B, C] { self =>
  /** The monoid used to combine intermediate values */
  def monoid: Monoid[B]

  def semigroup: Monoid[B] = monoid

  final override def reduce(items: TraversableOnce[B]): B = monoid.sum(items)

  /** Prepare and reduce all the given inputs into one intermediate value */
  def appendAll(items: TraversableOnce[A]): B = {
    val prepared = items.map(prepare)
    reduce(prepared)
  }

  /** Like calling andThen on the present function; the result is still a MonoidAggregator */
  override def andThenPresent[D](present2: C => D): MonoidAggregator[A, B, D] =
    new MonoidAggregator[A, B, D] {
      def monoid: Monoid[B] = self.monoid
      def prepare(input: A): B = self.prepare(input)
      def present(reduction: B): D = present2(self.present(reduction))
    }

  /** Like calling compose on the prepare function; the result is still a MonoidAggregator */
  override def composePrepare[A2](prepare2: A2 => A): MonoidAggregator[A2, B, C] =
    new MonoidAggregator[A2, B, C] {
      def monoid: Monoid[B] = self.monoid
      def prepare(input: A2): B = self.prepare(prepare2(input))
      def present(reduction: B): C = self.present(reduction)
    }

  /**
   * Build a MonoidAggregator that either takes left or right input
   * and outputs the pair from both
   */
  def either[A2, B2, C2](that: MonoidAggregator[A2, B2, C2]): MonoidAggregator[Either[A, A2], (B, B2), (C, C2)] =
    new MonoidAggregator[Either[A, A2], (B, B2), (C, C2)] {
      val monoid = new Tuple2Monoid[B, B2]()(self.monoid, that.monoid)
      // The side that received no input contributes its monoid's zero.
      def prepare(e: Either[A, A2]): (B, B2) =
        e.fold(
          a => (self.prepare(a), that.monoid.zero),
          a2 => (self.monoid.zero, that.prepare(a2)))
      def present(bs: (B, B2)): (C, C2) = (self.present(bs._1), that.present(bs._2))
    }

  /**
   * Only aggregate items that match a predicate
   */
  def filterBefore[A1 <: A](pred: A1 => Boolean): MonoidAggregator[A1, B, C] =
    new MonoidAggregator[A1, B, C] {
      def monoid: Monoid[B] = self.monoid
      // Items rejected by the predicate are prepared as the monoid's zero.
      def prepare(a: A1): B =
        if (pred(a)) self.prepare(a)
        else self.monoid.zero
      def present(b: B): C = self.present(b)
    }

  /**
   * This maps the inputs to Bs, then sums them, effectively flattening
   * the inputs to the MonoidAggregator
   */
  def sumBefore: MonoidAggregator[TraversableOnce[A], B, C] =
    new MonoidAggregator[TraversableOnce[A], B, C] {
      def monoid: Monoid[B] = self.monoid
      def prepare(input: TraversableOnce[A]): B = {
        val prepared = input.map(self.prepare)
        monoid.sum(prepared)
      }
      def present(reduction: B): C = self.present(reduction)
    }

  /**
   * This allows you to join two aggregators into one that takes a tuple input,
   * which in turn allows you to chain .composePrepare onto the result if you have
   * an initial input that has to be prepared differently for each of the joined aggregators.
   *
   * The law here is: ag1.zip(ag2).apply(as.zip(bs)) == (ag1(as), ag2(bs))
   */
  def zip[A2, B2, C2](ag2: MonoidAggregator[A2, B2, C2]): MonoidAggregator[(A, A2), (B, B2), (C, C2)] =
    new MonoidAggregator[(A, A2), (B, B2), (C, C2)] {
      val monoid = new Tuple2Monoid[B, B2]()(self.monoid, ag2.monoid)
      def prepare(pair: (A, A2)): (B, B2) = (self.prepare(pair._1), ag2.prepare(pair._2))
      def present(reduced: (B, B2)): (C, C2) = (self.present(reduced._1), ag2.present(reduced._2))
    }
}

/**
 * A [[MonoidAggregator]] whose monoid is obtained from a [[Ring]] via
 * Ring.asTimesMonoid.
 */
trait RingAggregator[-A, B, +C] extends MonoidAggregator[A, B, C] {
  /** The ring from which the aggregating monoid is derived */
  def ring: Ring[B]

  def monoid: Monoid[B] = Ring.asTimesMonoid[B](ring)
}