org.apache.datasketches.thetacommon.BinomialBoundsN Maven / Gradle / Ivy

Core sketch algorithms used alone and by other Java repositories in the DataSketches library.
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.datasketches.thetacommon;

import org.apache.datasketches.common.SketchesArgumentException;

/**
 * This class enables the estimation of error bounds given a sample set size, the sampling
 * probability theta, the number of standard deviations and a simple noDataSeen flag.  This can
 * be used to estimate error bounds for fixed threshold sampling as well as the error bounds
 * calculations for sketches.
 *
 * @author Kevin Lang
 */
// BTW, the suffixes "NStar", "NPrimeB", and "NPrimeF" correspond to variables in the formal
// writeup of this scheme.
public final class BinomialBoundsN {

  private BinomialBoundsN() {}

  private static final double[] deltaOfNumSDev =
  {
    0.5000000000000000000, // = 0.5 (1 + erf(0))
    0.1586553191586026479, // = 0.5 (1 + erf((-1/sqrt(2))))
    0.0227502618904135701, // = 0.5 (1 + erf((-2/sqrt(2))))
    0.0013498126861731796  // = 0.5 (1 + erf((-3/sqrt(2))))
  };
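  // These are the one-sided Gaussian tail probabilities Phi(-numSDev) for numSDev = 0, 1, 2, 3;
  // the array index is the numSDev value, so deltaOfNumSDev[2] is the tail mass beyond 2 standard deviations.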

  // our "classic" bounds, but now with continuity correction
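  // Each bound inverts the Gaussian approximation to Binomial(n, theta): it solves
  //   numSamples = n * theta +/- numSDev * sqrt(n * theta * (1 - theta))
  // for n in closed form ('+' gives the lower bound, '-' the upper bound), after shifting
  // numSamples by -/+ 0.5 as a continuity correction.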

  private static double contClassicLB(final double numSamplesF, final double theta,
      final double numSDev) {
    final double nHat = (numSamplesF - 0.5) / theta;
    final double b = numSDev * Math.sqrt((1.0 - theta) / theta);
    final double d  = 0.5 * b * Math.sqrt((b * b) + (4.0 * nHat));
    final double center = nHat + (0.5 * (b * b));
    return (center - d);
  }

  private static double contClassicUB(final double numSamplesF, final double theta,
      final double numSDev) {
    final double nHat = (numSamplesF + 0.5) / theta;
    final double b   = numSDev * Math.sqrt((1.0 - theta) / theta);
    final double d  = 0.5 * b * Math.sqrt((b * b) + (4.0 * nHat));
    final double center = nHat + (0.5 * (b * b));
    return (center + d);
  }

  // This is a special purpose calculator for NStar, using a computational
  // strategy inspired by its Bayesian definition. It is only appropriate
  // for a very limited set of inputs. However, the procedure computeApproxBinoLB ()
  // below does in fact only call it for suitably limited inputs.
  // Outside of this limited range, two different bad things will happen.
  // First, because we are not using logarithms, the values of intermediate
  // quantities will exceed the dynamic range of doubles. Second, even if that
  // problem were fixed, the running time of this procedure is essentially linear
  // in est = (numSamples / p), and that can be Very, Very Big.
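  // Concretely, curTerm steps through the negative binomial terms
  //   C(m - 1, numSamples - 1) * p^numSamples * q^(m - numSamples)
  // using the recurrence term(m + 1) = term(m) * q * m / (m + 1 - numSamples); NStar is the
  // largest n whose accumulated probability is still <= delta.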

  private static long specialNStar(final long numSamplesI, final double p, final double delta) {
    final double q, numSamplesF;
    double tot, curTerm;
    long m;
    assertTrue(numSamplesI >= 1);
    assertTrue((0.0 < p) && (p < 1.0));
    assertTrue((0.0 < delta) && (delta < 1.0));
    q = 1.0 - p;
    numSamplesF = numSamplesI;
    // Use a different algorithm if the following isn't true; this one will be too slow, or worse.
    assertTrue((numSamplesF / p) < 500.0);
    curTerm = Math.pow(p, numSamplesF);  // curTerm = posteriorProbability (k, k, p)
    assertTrue(curTerm > 1e-100); // sanity check for non-use of logarithms
    tot = curTerm;
    m = numSamplesI;
    while (tot <= delta) { // this test can fail even the first time
      curTerm = (curTerm * q * (m)) / ((m + 1) - numSamplesI);
      tot += curTerm;
      m += 1;
    }
    // we have reached a state where tot > delta, so back up one
    return (m - 1);
  }

  //  The following procedure has very limited applicability.
  //  The above remarks about specialNStar() also apply here.
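  //  NPrimeB is the smallest n whose accumulated probability reaches 1 - delta; it steps
  //  through the same negative binomial terms as specialNStar().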
  private static long specialNPrimeB(final long numSamplesI, final double p, final double delta) {
    final double q, numSamplesF, oneMinusDelta;
    double tot, curTerm;
    long m;
    assertTrue(numSamplesI >= 1);
    assertTrue((0.0 < p) && (p < 1.0));
    assertTrue((0.0 < delta) && (delta < 1.0));
    q = 1.0 - p;
    oneMinusDelta = 1.0 - delta;
    numSamplesF = numSamplesI;
    curTerm = Math.pow(p, numSamplesF);  // curTerm = posteriorProbability (k, k, p)
    assertTrue(curTerm > 1e-100); // sanity check for non-use of logarithms
    tot = curTerm;
    m = numSamplesI;
    while (tot < oneMinusDelta) {
      curTerm = (curTerm * q * (m)) / ((m + 1) - numSamplesI);
      tot += curTerm;
      m += 1;
    }
    return (m); // don't need to back up
  }

  private static long specialNPrimeF(final long numSamplesI, final double p, final double delta) {
    // Use a different algorithm if the following isn't true; this one will be too slow, or worse.
    assertTrue(((numSamplesI) / p) < 500.0); // A super-small delta could also make it slow.
    return (specialNPrimeB(numSamplesI + 1, p, delta));
  }

  // The following computes an approximation to the lower bound of
  // a Frequentist confidence interval based on the tails of the Binomial distribution.
  private static double computeApproxBinoLB(final long numSamplesI, final double theta,
      final int numSDev) {
    if (theta == 1.0) {
      return (numSamplesI);
    }

    else if (numSamplesI == 0) {
      return (0.0);
    }

    else if (numSamplesI == 1) {
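      // with exactly one retained sample, invert P(at least one retained | n) = 1 - (1 - theta)^n = delta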
      final double delta = deltaOfNumSDev[numSDev];
      final double rawLB = (Math.log(1.0 - delta)) / (Math.log(1.0 - theta));
      return (Math.floor(rawLB)); // round down
    }

    else if (numSamplesI > 120) {
      // plenty of samples, so gaussian approximation to binomial distribution isn't too bad
      final double rawLB = contClassicLB( numSamplesI, theta, numSDev);
      return (rawLB - 0.5); // fake round down
    }

    // at this point we know 2 <= numSamplesI <= 120

    else if (theta > (1.0 - 1e-5)) {  // empirically-determined threshold
      return (numSamplesI);
    }

    else if (theta < ((numSamplesI) / 360.0)) {  // empirically-determined threshold
      // here we use the gaussian approximation, but with a modified "numSDev"
      final int index;
      final double rawLB;
      index = (3 * ((int) numSamplesI)) + (numSDev - 1);
      rawLB = contClassicLB(numSamplesI, theta, EquivTables.getLB(index));
      return (rawLB - 0.5); // fake round down
    }

    else { // This is the most difficult range to approximate; we will compute an "exact" LB.
      // We know that est <= 360, so specialNStar() shouldn't be ridiculously slow.
      final double delta = deltaOfNumSDev[numSDev];
      final long nstar = specialNStar(numSamplesI, theta, delta);
      return (nstar); // don't need to round
    }
  }

  // The following computes an approximation to the upper bound of
  // a Frequentist confidence interval based on the tails of the Binomial distribution.
  private static double computeApproxBinoUB(final long numSamplesI, final double theta,
      final int numSDev) {
    if (theta == 1.0) {
      return (numSamplesI);
    }

    else if (numSamplesI == 0) {
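      // with zero retained samples, invert P(none retained | n) = (1 - theta)^n = delta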
      final double delta = deltaOfNumSDev[numSDev];
      final double rawUB = (Math.log(delta)) / (Math.log(1.0 - theta));
      return (Math.ceil(rawUB)); // round up
    }

    else if (numSamplesI > 120) {
      // plenty of samples, so gaussian approximation to binomial distribution isn't too bad
      final double rawUB = contClassicUB(numSamplesI, theta, numSDev);
      return (rawUB + 0.5); // fake round up
    }

    // at this point we know 1 <= numSamplesI <= 120

    else if (theta > (1.0 - 1e-5)) { // empirically-determined threshold
      return (numSamplesI + 1);
    }

    else if (theta < ((numSamplesI) / 360.0)) {  // empirically-determined threshold
      // here we use the gaussian approximation, but with a modified "numSDev"
      final int index;
      final double rawUB;
      index = (3 * ((int) numSamplesI)) + (numSDev - 1);
      rawUB = contClassicUB(numSamplesI, theta, EquivTables.getUB(index));
      return (rawUB + 0.5); // fake round up
    }

    else { // This is the most difficult range to approximate; we will compute an "exact" UB.
      // We know that est <= 360, so specialNPrimeF() shouldn't be ridiculously slow.
      final double delta = deltaOfNumSDev[numSDev];
      final long nprimef = specialNPrimeF(numSamplesI, theta, delta);
      return (nprimef); // don't need to round
    }
  }

  // The following two procedures enforce some extra rules that help
  // to prevent the return of bounds that might be confusing to users.
  /**
   * Returns the approximate lower bound value
   * @param numSamples the number of samples in the sample set
   * @param theta the sampling probability
   * @param numSDev the number of "standard deviations" from the mean for the tail bounds.
   * This must be an integer value of 1, 2 or 3.
   * @param noDataSeen this is normally false. However, in the case where you have zero samples
   * and a theta < 1.0, this flag enables the distinction between a virgin case when no actual
   * data has been seen and the case where the estimate may be zero but an upper error bound may
   * still exist.
   * @return the approximate lower bound value
   */
  public static double getLowerBound(final long numSamples, final double theta, final int numSDev,
      final boolean noDataSeen) {
    //in earlier code numSamples was called numSamplesI
    if (noDataSeen) { return 0.0; }
    checkArgs(numSamples, theta, numSDev);
    final double lb = computeApproxBinoLB(numSamples, theta, numSDev);
    final double numSamplesF = numSamples;
    final double est = numSamplesF / theta;
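    // clamp: the returned lower bound is never below the raw sample count and never above the estimate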
    return (Math.min(est, Math.max(numSamplesF, lb)));
  }

  /**
   * Returns the approximate upper bound value
   * @param numSamples the number of samples in the sample set
   * @param theta the sampling probability
   * @param numSDev the number of "standard deviations" from the mean for the tail bounds.
   * This must be an integer value of 1, 2 or 3.
   * @param noDataSeen this is normally false. However, in the case where you have zero samples
   * and a theta < 1.0, this flag enables the distinction between a virgin case when no actual
   * data has been seen and the case where the estimate may be zero but an upper error bound may
   * still exist.
   * @return the approximate upper bound value
   */
  public static double getUpperBound(final long numSamples, final double theta, final int numSDev,
      final boolean noDataSeen) {
    //in earlier code numSamples was called numSamplesI
    if (noDataSeen) { return 0.0; }
    checkArgs(numSamples, theta, numSDev);
    final double ub = computeApproxBinoUB(numSamples, theta, numSDev);
    final double numSamplesF = numSamples;
    final double est = numSamplesF / theta;
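    // the returned upper bound is never below the estimate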
    return (Math.max(est, ub));
  }

  //exposed only for test
  static void checkArgs(final long numSamples, final double theta, final int numSDev) {
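    // the bitwise OR is negative exactly when numSDev < 1, numSDev > 3, or numSamples < 0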
    if ((numSDev | (numSDev - 1) | (3 - numSDev) | numSamples) < 0) {
      throw new SketchesArgumentException(
          "numSDev must only be 1,2, or 3 and numSamples must >= 0: numSDev="
              + numSDev + ", numSamples=" + numSamples);
    }
    if ((theta < 0.0) || (theta > 1.0)) {
      throw new SketchesArgumentException("0.0 < theta <= 1.0: " + theta);
    }
  }

  private static void assertTrue(final boolean truth) {
    assert (truth);
  }

} // end of class "BinomialBoundsN"
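
A minimal usage sketch (the driver class and the numeric values below are illustrative, not part of the library): a sketch that has retained 200 entries at sampling probability theta = 0.02 estimates 200 / 0.02 = 10,000 distinct items, and the two public methods bracket that estimate at the requested number of standard deviations. It assumes this artifact is on the classpath.

import org.apache.datasketches.thetacommon.BinomialBoundsN;

public class BinomialBoundsNExample {
  public static void main(final String[] args) {
    final long numSamples = 200;  // illustrative: entries retained by a sketch
    final double theta = 0.02;    // illustrative: effective sampling probability
    final int numSDev = 2;        // must be 1, 2, or 3

    final double estimate = numSamples / theta;  // 10,000.0
    final double lb = BinomialBoundsN.getLowerBound(numSamples, theta, numSDev, false);
    final double ub = BinomialBoundsN.getUpperBound(numSamples, theta, numSDev, false);

    // lb <= estimate <= ub
    System.out.println("estimate = " + estimate + "  bounds = [" + lb + ", " + ub + "]");
  }
}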



