All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.pulsar.shade.com.yahoo.sketches.BinomialBoundsN Maven / Gradle / Ivy

There is a newer version: 4.0.0.4
Show newest version
/*
 * Copyright 2015-16, Yahoo! Inc.
 * Licensed under the terms of the Apache License 2.0. See LICENSE file at the project root for terms.
 */

package org.apache.pulsar.shade.com.yahoo.sketches;

/**
 * This class enables the estimation of error bounds given a sample set size, the sampling
 * probability theta, the number of standard deviations and a simple noDataSeen flag.  This can
 * be used to estimate error bounds for fixed threshold sampling as well as the error bounds
 * calculations for sketches.
 *
 * @author Kevin Lang
 */
// BTW, the suffixes "NStar", "NPrimeB", and "NPrimeF" correspond to variables in the formal
// writeup of this scheme.
@SuppressWarnings({"cast"})
public final class BinomialBoundsN {

  private BinomialBoundsN() {}

  private static double[] deltaOfNumSDev =
  {
    0.5000000000000000000, // not actually using this value
    0.1586553191586026479,
    0.0227502618904135701,
    0.0013498126861731796
  };

  // our "classic" bounds, but now with continuity correction

  private static double contClassicLB(final double numSamplesF, final double theta,
      final double numSDev) {
    final double nHat = (numSamplesF - 0.5) / theta;
    final double b = numSDev * Math.sqrt((1.0 - theta) / theta);
    final double d  = 0.5 * b * Math.sqrt(b * b + 4.0 * nHat);
    final double center = nHat + 0.5 * (b * b);
    return (center - d);
  }

  private static double contClassicUB(final double numSamplesF, final double theta,
      final double numSDev) {
    final double nHat = (numSamplesF + 0.5) / theta;
    final double b   = numSDev * Math.sqrt((1.0 - theta) / theta);
    final double d  = 0.5 * b * Math.sqrt(b * b + 4.0 * nHat);
    final double center = nHat + 0.5 * (b * b);
    return (center + d);
  }

  // This is a special purpose calculator for NStar, using a computational
  // strategy inspired by its Bayesian definition. It is only appropriate
  // for a very limited set of inputs. However, the procedure computeApproxBinoLB ()
  // below does in fact only call it for suitably limited inputs.
  // Outside of this limited range, two different bad things will happen.
  // First, because we are not using logarithms, the values of intermediate
  // quantities will exceed the dynamic range of doubles. Second, even if that
  // problem were fixed, the running time of this procedure is essentially linear
  // in est = (numSamples / p), and that can be Very, Very Big.

  private static long specialNStar(final long numSamplesI, final double p, final double delta) {
    final double q, numSamplesF;
    double tot, curTerm;
    long m;
    assertTrue(numSamplesI >= 1);
    assertTrue(0.0 < p && p < 1.0);
    assertTrue(0.0 < delta && delta < 1.0);
    q = 1.0 - p;
    numSamplesF = (double) numSamplesI;
    // Use a different algorithm if the following isn't true; this one will be too slow, or worse.
    assertTrue((numSamplesF / p) < 500.0);
    curTerm = Math.pow(p, numSamplesF);  // curTerm = posteriorProbability (k, k, p)
    assertTrue(curTerm > 1e-100); // sanity check for non-use of logarithms
    tot = curTerm;
    m = numSamplesI;
    while (tot <= delta) { // this test can fail even the first time
      curTerm = curTerm * q * ((double) m) / ((double) (m + 1 - numSamplesI));
      tot += curTerm;
      m += 1;
    }
    // we have reached a state where tot > delta, so back up one
    return (m - 1);
  }

  //  The following procedure has very limited applicability.
  //  The above remarks about specialNStar() also apply here.
  private static long specialNPrimeB(final long numSamplesI, final double p, final double delta) {
    final double q, numSamplesF, oneMinusDelta;
    double tot, curTerm;
    long m;
    assertTrue(numSamplesI >= 1);
    assertTrue(0.0 < p && p < 1.0);
    assertTrue(0.0 < delta && delta < 1.0);
    q = 1.0 - p;
    oneMinusDelta = 1.0 - delta;
    numSamplesF = (double) numSamplesI;
    curTerm = Math.pow(p, numSamplesF);  // curTerm = posteriorProbability (k, k, p)
    assertTrue(curTerm > 1e-100); // sanity check for non-use of logarithms
    tot = curTerm;
    m = numSamplesI;
    while (tot < oneMinusDelta) {
      curTerm = curTerm * q * ((double) m) / ((double) (m + 1 - numSamplesI));
      tot += curTerm;
      m += 1;
    }
    return (m); // don't need to back up
  }

  private static long specialNPrimeF(final long numSamplesI, final double p, final double delta) {
    // Use a different algorithm if the following isn't true; this one will be too slow, or worse.
    assertTrue((((double) numSamplesI) / p) < 500.0); //A super-small delta could also make it slow.
    return (specialNPrimeB(numSamplesI + 1, p, delta));
  }

  // The following computes an approximation to the lower bound of
  // a Frequentist confidence interval based on the tails of the Binomial distribtuion.
  private static double computeApproxBinoLB(final long numSamplesI, final double theta,
      final int numSDev) {
    if (theta == 1.0) {
      return ((double) numSamplesI);
    }

    else if (numSamplesI == 0) {
      return (0.0);
    }

    else if (numSamplesI == 1) {
      final double delta = deltaOfNumSDev[numSDev];
      final double rawLB = (Math.log(1.0 - delta)) / (Math.log(1.0 - theta));
      return (Math.floor(rawLB)); // round down
    }

    else if (numSamplesI > 120) {
      // plenty of samples, so gaussian approximation to binomial distribution isn't too bad
      final double rawLB = contClassicLB( (double)numSamplesI, theta, (double)numSDev);
      return (rawLB - 0.5); // fake round down
    }

    // at this point we know 2 <= numSamplesI <= 120

    else if (theta > (1.0 - 1e-5)) {  // empirically-determined threshold
      return ((double) numSamplesI);
    }

    else if (theta < ((double) numSamplesI) / 360.0) {  // empirically-determined threshold
      // here we use the gaussian approximation, but with a modified "numSDev"
      final int index;
      final double rawLB;
      index = 3 * ((int) numSamplesI) + (numSDev - 1);
      rawLB = contClassicLB((double) numSamplesI, theta, EquivTables.getLB(index));
      return (rawLB - 0.5); // fake round down
    }

    else { // This is the most difficult range to approximate; we will compute an "exact" LB.
      // We know that est <= 360, so specialNStar() shouldn't be ridiculously slow.
      final double delta = deltaOfNumSDev[numSDev];
      final long nstar = specialNStar(numSamplesI, theta, delta);
      return ((double) nstar); // don't need to round
    }
  }

  // The following computes an approximation to the upper bound of
  // a Frequentist confidence interval based on the tails of the Binomial distribution.
  private static double computeApproxBinoUB(final long numSamplesI, final double theta,
      final int numSDev) {
    if (theta == 1.0) {
      return ((double)numSamplesI);
    }

    else if (numSamplesI == 0) {
      final double delta = deltaOfNumSDev[numSDev];
      final double rawUB = (Math.log(delta)) / (Math.log(1.0 - theta));
      return (Math.ceil(rawUB)); // round up
    }

    else if (numSamplesI > 120) {
      // plenty of samples, so gaussian approximation to binomial distribution isn't too bad
      final double rawUB = contClassicUB((double) numSamplesI, theta, (double) numSDev);
      return (rawUB + 0.5); // fake round up
    }

    // at this point we know 1 <= numSamplesI <= 120

    else if (theta > (1.0 - 1e-5)) { // empirically-determined threshold
      return ((double) (numSamplesI + 1));
    }

    else if (theta < ((double) numSamplesI) / 360.0) {  // empirically-determined threshold
      // here we use the gaussian approximation, but with a modified "numSDev"
      final int index;
      final double rawUB;
      index = 3 * ((int) numSamplesI) + (numSDev - 1);
      rawUB = contClassicUB((double) numSamplesI, theta, EquivTables.getUB(index));
      return (rawUB + 0.5); // fake round up
    }

    else { // This is the most difficult range to approximate; we will compute an "exact" UB.
      // We know that est <= 360, so specialNPrimeF() shouldn't be ridiculously slow.
      final double delta = deltaOfNumSDev[numSDev];
      final long nprimef = specialNPrimeF(numSamplesI, theta, delta);
      return ((double) nprimef); // don't need to round
    }
  }

  // The following two procedures enforce some extra rules that help
  // to prevent the return of bounds that might be confusing to users.
  /**
   * Returns the approximate lower bound value
   * @param numSamples the number of samples in the sample set
   * @param theta the sampling probability
   * @param numSDev the number of "standard deviations" from the mean for the tail bounds.  This
   * must be an integer value of 1, 2 or 3.
   * @param noDataSeen this is normally false. However, in the case where you have zero samples and
   * a a theta < 1.0, this flag enables the distinction between a virgin case when no actual
   * data has been seen and the case where the estimate may be zero but an upper error bound may
   * still exist.
   * @return the approximate upper bound value
   */
  public static double getLowerBound(final long numSamples, final double theta, final int numSDev,
      final boolean noDataSeen) {
    //in earlier code numSamples was called numSamplesI
    if (noDataSeen) { return 0.0; }
    checkArgs(numSamples, theta, numSDev);
    final double lb = computeApproxBinoLB(numSamples, theta, numSDev);
    final double numSamplesF = (double) numSamples;
    final double est = numSamplesF / theta;
    return (Math.min(est, Math.max(numSamplesF, lb)));
  }

  /**
   * Returns the approximate upper bound value
   * @param numSamples the number of samples in the sample set
   * @param theta the sampling probability
   * @param numSDev the number of "standard deviations" from the mean used to compute thetail
   * bounds. This must be an integer value of 1, 2 or 3.
   * @param noDataSeen this is normally false. However, in the case where you have zero samples and
   * a a theta < 1.0, this flag enables the distinction between a virgin case when no actual
   * data has been seen and the case where the estimate may be zero but an upper error bound may
   * still exist.
   * @return the approximate upper bound value
   */
  public static double getUpperBound(final long numSamples, final double theta, final int numSDev,
      final boolean noDataSeen) {
    //in earlier code numSamples was called numSamplesI
    if (noDataSeen) { return 0.0; }
    checkArgs(numSamples, theta, numSDev);
    final double ub = computeApproxBinoUB(numSamples, theta, numSDev);
    final double numSamplesF = (double) numSamples;
    final double est = numSamplesF / theta;
    return (Math.max(est, ub));
  }

  //exposed only for test
  static final void checkArgs(final long numSamples, final double theta, final int numSDev) {
    if ((numSDev | (numSDev - 1) | (3 - numSDev) | numSamples) < 0) {
      throw new SketchesArgumentException(
          "numSDev must only be 1,2, or 3 and numSamples must >= 0: numSDev="
              + numSDev + ", numSamples=" + numSamples);
    }
    if ((theta < 0.0) || (theta > 1.0)) {
      throw new SketchesArgumentException("0.0 < theta <= 1.0: " + theta);
    }
  }

  private static void assertTrue(final boolean truth) {
    assert (truth);
  }

} // end of class "BinomialBoundsN"




© 2015 - 2024 Weber Informatics LLC | Privacy Policy