com.twitter.finagle.stats.BucketedHistogram.scala Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of finagle-stats-core_2.11 Show documentation
finagle-stats-core
There is a newer version: 21.2.0
Show newest version
package com.twitter.finagle.stats

import java.util.concurrent.atomic.{AtomicLong, AtomicIntegerArray}

private[twitter] object BucketedHistogram {

  /** Default quantiles, expressed as fractions between 0 and 1. */
  private[stats] val DefaultQuantiles = IndexedSeq(
    0.5, 0.9, 0.95, 0.99, 0.999, 0.9999
  )

  /**
   * Builds the ascending array of bucket limits for the given relative error.
   *
   * Starting at 1.0, a running value is multiplied by `1 + 2 * error` over and
   * over for as long as it stays below `Int.MaxValue`. Each of those powers is
   * truncated to an `Int` and incremented by one; a limit is kept only when it
   * differs from the one kept before it. The array always begins with `1`, and
   * the last bucket tracks up to `Int.MaxValue`.
   *
   * Since percentiles are later reported as the midpoint between two adjacent
   * limits, a reported value can be at most `1 + error` percent off of the
   * actual percentile.
   *
   * @param error the allowed error as a fraction; must satisfy `0.0 < error <= 1.0`
   * @return the strictly increasing limits, the first of which is `1`
   * @throws IllegalArgumentException if `error` is outside of `(0.0, 1.0]`
   */
  private[stats] def makeLimitsFor(error: Double): Array[Int] = {
    require(error > 0.0 && error <= 1.0, error)

    val growth = 1.0 + (error * 2)
    val ceiling = Int.MaxValue.toDouble

    // The limits come from an exponential series whose terms are truncated to
    // integers. While the series grows by less than one per step, neighbouring
    // terms may truncate to the same integer and must be collapsed; past that
    // inflection point every step yields a new limit. An equivalent
    // construction would be
    //
    // val inflectionPoint = (1.0 / (error * 2)).toInt
    // val prefix = 0.to(inflectionPoint).toSeq
    // val unadjusted = prefix ++ build(Int.MaxValue.toDouble, 1.0 + (error * 2), inflectionPoint)
    // unadjusted.map(_ + 1)
    //
    // and the bucket-finding algorithm of the histogram relies on that shape.
    val kept = Array.newBuilder[Int]
    kept += 1
    var lastKept = 1
    var power = 1.0 * growth
    while (power < ceiling) {
      // truncate-and-increment guarantees the smallest generated limit is 2
      val bound = power.toInt + 1
      // `power` only ever grows, so equal bounds are always adjacent and
      // comparing against the last kept bound is enough to drop repeats
      if (bound != lastKept) {
        kept += bound
        lastKept = bound
      }
      power = power * growth
    }
    kept.result()
  }

  // 0.5% error => 1797 buckets, 7188 bytes, max 11 compares on binary search
  private[stats] val DefaultErrorPercent = 0.005

  /**
   * Limits computed once for [[DefaultErrorPercent]].
   * Exposed for testing; the array must not be mutated.
   */
  private[stats] val DefaultLimits: Array[Int] =
    makeLimitsFor(DefaultErrorPercent)

  /**
   * Verifies that `limits` is non-empty, non-negative and strictly increasing.
   *
   * @throws IllegalArgumentException if the array is empty, or (with the
   *         offending index as the message) at the first element that is
   *         negative or not greater than its predecessor
   */
  private def assertLimits(limits: Array[Int]): Unit = {
    require(limits.length > 0)
    var previous = -1L
    for (idx <- limits.indices) {
      val current = limits(idx)
      require(current >= 0 && current > previous, idx)
      previous = current
    }
  }

  /**
   * Creates a histogram whose limits are derived from [[DefaultErrorPercent]].
   */
  def apply(): BucketedHistogram =
    new BucketedHistogram(DefaultErrorPercent)

  /**
   * A mutable holder for the most recently computed snapshot values.
   * Reusing one instance per Stat avoids allocating objects with medium
   * length lifetimes that would otherwise have to survive from one stat
   * collection to the next.
   *
   * @param percentiles the quantiles to compute; each should be
   *        between 0 and 1. `quantiles` holds one slot per entry.
   */
  private[stats] final class MutableSnapshot(val percentiles: IndexedSeq[Double]) {
    @volatile var count: Long = 0L
    @volatile var sum: Long = 0L
    @volatile var max: Long = 0L
    @volatile var min: Long = 0L
    @volatile var avg: Double = 0.0
    @volatile var quantiles: Array[Long] = new Array[Long](percentiles.length)

    /** Resets every field to zero; `quantiles` is replaced by a fresh zeroed array. */
    def clear(): Unit = {
      count = 0L
      sum = 0L
      max = 0L
      min = 0L
      avg = 0.0
      // a newly allocated Array[Long] is all zeroes
      quantiles = new Array[Long](percentiles.length)
    }
  }
}

/**
 * Allows for computing approximate percentiles from a stream of
 * data points.
 *
 * The precision is relative to the size of the data points that are
 * collected (not the number of points, but how big each value is).
 *
 * For instances created using the defaults via [[BucketedHistogram.apply()]],
 * the memory footprint should be around 7.2 KB.
 *
 * [[add]], [[clear]], [[recompute]] and [[bucketAndCounts]] coordinate through
 * an internal lock. The individual statistics accessors (`percentile`,
 * `minimum`, `maximum`, `sum`, `count`, `average`) do not take that lock and
 * are exposed for testing.
 *
 * ''Note:'' while the interface for [[add(Long)]] takes a `Long`,
 * internally the maximum value we will observe is `Int.MaxValue`. This is subject
 * to change and should be considered the minimum upper bound. Also, the smallest
 * value we will record is `0`: anything at or below zero is counted in the
 * first bucket.
 *
 * ''Note:'' this code borrows heavily from
 * [[https://github.com/twitter/ostrich/blob/master/src/main/scala/com/twitter/ostrich/stats/Histogram.scala Ostrich's Histogram]].
 * A few of the differences include:
 *  - bucket limits are configurable instead of fixed
 *  - counts per bucket are stored in int's
 *  - no tracking of min, max, sum
 *    (NOTE(review): this bullet looks stale, since the sum is tracked and
 *    min/max are approximated from the buckets; confirm and reword.)
 *
 * Bucket limits are derived from `error` via `BucketedHistogram.makeLimitsFor`.
 * The value at each index of the limits is the upper bound, exclusive, of the
 * values for that bucket. As an example, given limits of `Array(1, 3, MaxValue)`,
 * index=0 counts values `[0..1)`,
 * index=1 counts values `[1..3)`, and
 * index=2 counts values `[3..MaxValue)`.
 * An Int per bucket should suffice, as standard usage only gives
 * 20 seconds before rolling to the next BucketedHistogram.
 * This gives you up to Int.MaxValue / 20 = ~107MM add()s per second
 * to a ''single'' bucket.
 *
 * @param error the allowed error as a fraction, strictly between 0.0 and 1.0;
 *              it determines both the bucket limits and the bucket lookup.
 *
 * @see [[BucketedHistogram.apply()]] for creation.
 */
private[stats] final class BucketedHistogram(error: Double) {
  assert(0 < error && error < 1, "Error must be in the range (0.0, 1.0)")

  // this is the point at which we stopped seeing duplicates when we multiplied
  // the previous number by the factor.
  // where x is the inflection point where we no longer see duplicates and y
  // is the factor
  // x + 1 > x(1 + y) => x + 1 > x + xy => 1 > xy => 1/x > y => x < 1/y
  private[this] val inflectionPoint = (1.0 / (error * 2))
  private[this] val factor = 1.0 + (error * 2)

  // we need to multiply by this to convert to the right base.
  // log_factor(x) == log10(x) / log10(factor) == log10(x) * 1 / log10(factor)
  private[this] val logFactor = 1.0 / math.log10(factor)

  /** Logarithm of `num` in base `factor`. */
  private[this] def logarithm(num: Double): Double = {
    math.log10(num) * logFactor
  }

  // this is different from the floor of the inflection point because we
  // increment the floors of the powers when making the bucket limits
  private[this] val inflectionBucket = inflectionPoint.toInt + 1

  // we need to subtract the offset because our bucket making algorithm removes duplicates
  // we can find the number of buckets we would have if we hadn't removed duplicates, and
  // then subtract the number of buckets we actually have.
  private[this] val offset: Int = logarithm(inflectionBucket).toInt - inflectionBucket

  // the default limits are computed once in the companion and shared
  private[this] val limits =
    if (error == BucketedHistogram.DefaultErrorPercent) BucketedHistogram.DefaultLimits
    else BucketedHistogram.makeLimitsFor(error)

  /**
   * Given a number that you want to insert into a bucket, find the bucket that it should
   * go into.  If it's below the point where we're still removing duplicates, index directly
   * into the bucket.  If it's above, take the logarithm.
   *
   * @param num the value to place; anything negative maps to bucket `0`
   * @return the index into `counts` for `num`
   */
  private[stats] def findBucket(num: Int): Int = {
    if (num <= inflectionBucket) {
      math.max(0, num)
    } else {
      logarithm(num).toInt - offset
    }
  }

  // we acquire the "read" lock when we add an element to the histogram
  // we acquire the "write" lock when we need to rezero the histogram, or when
  // we need to read the histogram.
  private[this] val sync = new NonReentrantReadWriteLock

  /** One slot per limit, plus a final slot for values `>= Int.MaxValue`. */
  protected def countsLength: Int = limits.length + 1

  /** total number of samples seen */
  private[this] val num = new AtomicLong(0)

  /** total value of all samples seen */
  private[this] val total = new AtomicLong(0)

  /**
   * Number of samples seen per corresponding bucket in `limits`
   */
  private[this] val counts = new AtomicIntegerArray(countsLength)

  /**
   * Records one sample.
   *
   * Note: only values between `0` and `Int.MaxValue`, inclusive, are recorded.
   * Values of `Int.MaxValue` or more are counted in the last bucket and
   * contribute `Int.MaxValue` to the sum. Values of zero or less are counted
   * in the first bucket; the running sum still receives them unmodified.
   *
   * @param value the sample to record
   */
  def add(value: Long): Unit = {
    sync.acquireShared(1)
    try {
      val index = if (value >= Int.MaxValue) {
        total.getAndAdd(Int.MaxValue)
        countsLength - 1
      } else {
        total.getAndAdd(value)
        // Decide on the `Long` before narrowing: `toInt` keeps only the low
        // 32 bits, so a value below `Int.MinValue` would otherwise wrap around
        // and could be counted in an arbitrary positive bucket.
        if (value <= 0L) 0
        else findBucket(value.toInt)
      }
      counts.getAndIncrement(index)
      num.getAndIncrement()
    } finally {
      sync.releaseShared(1)
    }
  }

  /** Zeroes every bucket as well as the sample count and the running sum. */
  def clear(): Unit = {
    sync.acquire(1)
    try {
      var i = 0
      while (i < countsLength) {
        counts.set(i, 0)
        i += 1
      }
      num.set(0)
      total.set(0)
    } finally {
      sync.release(1)
    }
  }

  /**
   * Calculate the value of the percentile rank, `p`, for the added data points
   * such that `p * 100`-percent of the data points are the same or less than it.
   *
   * Not threadsafe, exposed for testing.
   *
   * @param p must be within 0.0 to 1.0, inclusive.
   * @return the approximate value for the requested percentile, reported as
   *         the midpoint of the bucket that contains it. The accuracy is
   *         governed by the `error` this instance was constructed with
   *         ([[BucketedHistogram.DefaultErrorPercent]] when built via `apply()`).
   * @throws AssertionError if `p` is outside of `[0.0, 1.0]`
   */
  def percentile(p: Double): Long = {
    if (p < 0.0 || p > 1.0)
      throw new AssertionError(s"percentile must be within 0.0 to 1.0 inclusive: $p")

    val target = Math.round(p * num.get)
    // walk the buckets until the cumulative count reaches the target rank
    var seen = 0L
    var i = 0
    while (i < countsLength && seen < target) {
      seen += counts.get(i)
      i += 1
    }
    i match {
      case 0 => 0
      case _ if i == countsLength => maximum
      case _ => limitMidpoint(i - 1)
    }
  }

  /**
   * The maximum value seen by calls to [[add]].
   *
   * Not threadsafe, exposed for testing.
   *
   * @return 0 if no values have been added, `Int.MaxValue` if the last bucket
   *         is populated, otherwise the midpoint of the highest populated
   *         bucket. The accuracy is governed by the `error` this instance was
   *         constructed with.
   */
  def maximum: Long = {
    if (num.get == 0) {
      0L
    } else if (counts.get(countsLength - 1) > 0) {
      Int.MaxValue
    } else {
      var i = countsLength - 2 // already checked the last, start 1 before
      while (i >= 0 && counts.get(i) == 0) {
        i -= 1
      }
      // `i` is -1 when no populated bucket was found, which can be observed
      // when this is called without the lock while the histogram is being
      // cleared; report 0 instead of indexing `limits` out of bounds.
      if (i <= 0) 0L
      else limitMidpoint(i)
    }
  }

  /**
   * The minimum value seen by calls to [[add]].
   *
   * Not threadsafe, exposed for testing.
   *
   * @return 0 if no values have been added, otherwise the midpoint of the
   *         lowest populated bucket. The accuracy is governed by the `error`
   *         this instance was constructed with.
   */
  def minimum: Long =
    if (num.get == 0) {
      0L
    } else {
      var i = 0
      while (i < countsLength && counts.get(i) == 0) {
        i += 1
      }
      limitMidpoint(i)
    }

  /**
   * Get the midpoint of bucket `i`: `0` for the first bucket, `Int.MaxValue`
   * for any index at or past the end of `limits`, and otherwise the average of
   * the bucket's lower and upper limit.
   */
  private[this] def limitMidpoint(i: Int): Long = {
    i match {
      case 0 => 0
      case _ if i >= limits.length => Int.MaxValue
      // widen to Long before adding so the sum cannot overflow an Int
      case _ => (limits(i - 1).toLong + limits(i)) / 2
    }
  }

  /** Alias for [[percentile]]. */
  def getQuantile(quantile: Double): Long =
    percentile(quantile)

  /**
   * Computes [[percentile]] for every entry of `quantiles`.
   *
   * @param quantiles the requested quantiles, each within 0.0 to 1.0
   * @return a new array holding the results in the same order as `quantiles`
   */
  def getQuantiles(quantiles: IndexedSeq[Double]): Array[Long] = {
    val ps = new Array[Long](quantiles.length)
    var i = 0
    while (i < ps.length) {
      // Note: we could speed this up via just one pass over `counts` instead
      // of a pass per quantile.
      // We could speed up calls to `percentile` by tracking the maximum
      // bucket used during `add()`s to minimize how much of `counts` to scan.
      ps(i) = percentile(quantiles(i))
      i += 1
    }
    ps
  }

  /**
   * Recomputes all of the metrics and stores them in `snap`.  Only recompute
   * should be used in a concurrent context, since the underlying statistics
   * methods are not threadsafe and only exposed for testing.
   *
   * @param snap the snapshot to overwrite; its `percentiles` select which
   *             quantiles are computed
   */
  def recompute(snap: BucketedHistogram.MutableSnapshot): Unit = {
    sync.acquire(1)
    try {
      snap.count = count
      snap.sum = sum
      snap.max = maximum
      snap.min = minimum
      snap.avg = average
      snap.quantiles = getQuantiles(snap.percentiles)
    } finally {
      sync.release(1)
    }
  }

  /**
   * The total of all the values seen by calls to [[add]].
   *
   * Not threadsafe, exposed for testing.
   */
  def sum: Long = total.get

  /**
   * The number of values [[add added]].
   *
   * Not threadsafe, exposed for testing.
   */
  def count: Long = num.get

  /**
   * The average, or arithmetic mean, of all values seen
   * by calls to [[add]].
   *
   * Not threadsafe, exposed for testing.
   *
   * @return 0.0 if no values have been [[add added]].
   */
  def average: Double = {
    val count = num.get
    if (count == 0) 0.0 else total.get / count.toDouble
  }

  /**
   * Returns a seq containing nonzero values of the histogram.
   * The sequence contains instances of BucketAndCount which are
   * the bucket's upper and lower limits and a count of the number
   * of times a value in range of the limits was added.
   */
  def bucketAndCounts: Seq[BucketAndCount] = {
    sync.acquire(1)

    // note that this method is optimized for reducing allocations, but has not
    // been benchmarked.
    try {
      // first iterate over to find the exact length of the eventual array
      var idx = 0
      var arrayLength = 0
      while (idx < countsLength) {
        if (counts.get(idx) > 0) {
          arrayLength += 1
        }
        idx += 1
      }

      // iterate again after making the array
      idx = 0
      var arrayIdx = 0
      val out = new Array[BucketAndCount](arrayLength)
      while (idx < countsLength) {
        val count = counts.get(idx)
        if (count > 0) {
          // counts is 1 bucket longer than limits
          // The last bucket of counts tracks added
          // values greater than or equal to Int.MaxValue
          val upperLimit = if (idx != limits.length) {
            limits(idx)
          } else Int.MaxValue
          val lowerLimit = if (idx != 0) {
            limits(idx - 1)
          } else 0
          out(arrayIdx) = BucketAndCount(lowerLimit, upperLimit, count)
          arrayIdx += 1
        }
        idx += 1
      }
      out
    } finally {
      sync.release(1)
    }
  }

}