// All Downloads are FREE. Search and download functionalities are using the official Maven repository.
//
// com.twitter.algebird.MomentsGroup.scala Maven / Gradle / Ivy

/*
Copyright 2012 Twitter, Inc.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package com.twitter.algebird

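/**
 * A class to calculate the first five moments over a sequence of Doubles.
 * Given these moments, we can then calculate metrics like skewness
 * and kurtosis.
 *
 * m0 is the count and m1 is the mean. For i >= 2, m{i} is the unnormalized
 * ith central moment, i.e. the sum of (x - mean)^i over the sequence, so
 * that, for example, the population variance is m2 / m0.
 */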
case class Moments(m0: Long, m1: Double, m2: Double, m3: Double, m4: Double) {

  /** Number of values observed; an alias for `m0`. */
  def count: Long = m0

  /** Mean of the observed values; an alias for `m1`. */
  def mean: Double = m1

  /** Population (not sample) variance: `m2` divided by the count. */
  def variance: Double = m2 / count

  /** Population (not sample) standard deviation: the square root of `variance`. */
  def stddev: Double = math.sqrt(variance)

  /** Skewness, computed as sqrt(count) * m3 / m2^1.5. */
  def skewness: Double = {
    val scaledThird = math.sqrt(count) * m3
    scaledThird / math.pow(m2, 1.5)
  }

  /** Kurtosis with 3 subtracted, computed as count * m4 / m2^2 - 3. */
  def kurtosis: Double = {
    val scaledFourth = count * m4
    scaledFourth / math.pow(m2, 2) - 3
  }
}

object Moments {
  // Implicit members carry explicit types. The singleton types below are the
  // same types the compiler previously inferred, so callers see no change.
  implicit val group: MomentsGroup.type = MomentsGroup
  implicit val aggregator: MomentsAggregator.type = MomentsAggregator

  /**
   * Builds an aggregator over any numeric type `N`: each input is converted to
   * a Double via `num` and wrapped in a single-value Moments before being
   * combined.
   */
  def numericAggregator[N](implicit num: Numeric[N]): MonoidAggregator[N, Moments, Moments] =
    Aggregator.prepareMonoid { n: N => Moments(num.toDouble(n)) }

  /**
   * Create a Moments object given a single value. This is useful for
   * initializing moment calculations at the start of a stream.
   *
   * The result has count 1, mean `value`, and m2..m4 equal to 0.
   *
   * The implicit conversion parameter is the explicit spelling of the former
   * view bound `V <% Double`, which is deprecated.
   */
  def apply[V](value: V)(implicit toDouble: V => Double): Moments =
    new Moments(1L, toDouble(value), 0.0, 0.0, 0.0)

  /**
   * Create a Moments object from a count and four moment values of any type
   * `V` that can be viewed as a Double.
   */
  def apply[V](m0: Long, m1: V, m2: V, m3: V, m4: V)(implicit toDouble: V => Double): Moments =
    new Moments(m0, toDouble(m1), toDouble(m2), toDouble(m3), toDouble(m4))
}

/**
 * A group (a monoid that also supports negation) to perform moment calculations.
 */
object MomentsGroup extends Group[Moments] {

  // When combining averages, if the count sizes are too close we should use a different
  // algorithm. This constant is the threshold on |smaller count / combined count| below
  // which the incremental-update formula is used in getCombinedMean.
  private val STABILITY_CONSTANT = 0.1

  /** The identity element: zero count and all moments 0.0. */
  val zero: Moments = Moments(0L, 0.0, 0.0, 0.0, 0.0)

  /**
   * Negates the count and m2..m4 while keeping the mean, so that
   * plus(a, negate(a)) has a combined count of 0 and therefore yields `zero`.
   */
  override def negate(a: Moments): Moments =
    Moments(-a.count, a.m1, -a.m2, -a.m3, -a.m4)

  /**
   * Combines the moment calculations from two streams.
   * See http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Higher-order_statistics
   * for more information on the formulas used to update the moments.
   *
   * @param a moments of the first stream
   * @param b moments of the second stream
   * @return moments of the combined stream, or `zero` when the combined count is 0
   */
  def plus(a: Moments, b: Moments): Moments = {
    val countCombined = a.count + b.count
    if (countCombined == 0) {
      // Covers both "two empty streams" and a stream cancelled by its negation;
      // the formulas below would otherwise divide by zero.
      zero
    } else {
      val delta = b.mean - a.mean
      val meanCombined = getCombinedMean(a.count, a.mean, b.count, b.mean)

      val m2 = a.m2 + b.m2 +
        math.pow(delta, 2) * a.count * b.count / countCombined

      val m3 = a.m3 + b.m3 +
        math.pow(delta, 3) * a.count * b.count * (a.count - b.count) / math.pow(countCombined, 2) +
        3 * delta * (a.count * b.m2 - b.count * a.m2) / countCombined

      // Computed in Double: as a Long * Long product this would silently wrap
      // around once the two counts multiply past Long.MaxValue.
      val countProduct = a.count.toDouble * b.count

      val m4 = a.m4 + b.m4 +
        math.pow(delta, 4) * a.count * b.count * (math.pow(a.count, 2) -
          countProduct + math.pow(b.count, 2)) / math.pow(countCombined, 3) +
        6 * math.pow(delta, 2) * (math.pow(a.count, 2) * b.m2 +
          math.pow(b.count, 2) * a.m2) / math.pow(countCombined, 2) +
        4 * delta * (a.count * b.m3 - b.count * a.m3) / countCombined

      Moments(countCombined, meanCombined, m2, m3, m4)
    }
  }

  /**
   * Given two streams of doubles A and B, with the specified counts and means,
   * calculate the mean of the combined stream.
   *
   * If the stream with the smaller absolute count has count 0, the other
   * stream's mean is returned unchanged.
   *
   * NOTE: this method does not special-case countA + countB == 0 with non-zero
   * counts; `plus` checks for a combined count of 0 before calling it.
   */
  def getCombinedMean(countA: Long, meanA: Double, countB: Long, meanB: Double): Double = {
    // Order the two streams by absolute count (counts may be negative after negate).
    val ((countBig, meanBig), (countSmall, meanSmall)) =
      if (math.abs(countA) >= math.abs(countB))
        ((countA, meanA), (countB, meanB))
      else
        ((countB, meanB), (countA, meanA))

    if (countSmall == 0) {
      meanBig
    } else {
      val countCombined = countSmall + countBig
      val scaling = countSmall.toDouble / countCombined
      if (math.abs(scaling) < STABILITY_CONSTANT)
        // This formula for the combined mean is only stable if
        // countA is not approximately countB.
        meanBig + (meanSmall - meanBig) * scaling
      else
        // Use this more stable formulation if the sizes of the two streams
        // are close.
        (countBig * meanBig + countSmall * meanSmall) / countCombined
    }
  }
}

object MomentsAggregator extends MonoidAggregator[Double, Moments, Moments] {
  // Intermediate Moments values are combined with MomentsGroup.
  val monoid: MomentsGroup.type = MomentsGroup

  /** Wraps a single Double in a Moments using the single-value constructor. */
  def prepare(input: Double): Moments = Moments.apply(input)

  /** Returns the accumulated Moments unchanged. */
  def present(m: Moments): Moments = m
}




// © 2015 - 2025 Weber Informatics LLC | Privacy Policy