com.yahoo.sketches.BoundsOnBinomialProportions Maven / Gradle / Ivy
/*
* Copyright 2015-16, Yahoo! Inc.
* Licensed under the terms of the Apache License 2.0. See LICENSE file at the project root for terms.
*/
package com.yahoo.sketches;
/**
* Confidence intervals for binomial proportions.
*
* This class computes an approximation to the Clopper-Pearson confidence interval
* for a binomial proportion. Exact Clopper-Pearson intervals are strictly
* conservative, but these approximations are not.
*
* The main inputs are numbers n and k which are not the same as other things
* that are called n and k in our sketching library. There is also a third
* parameter, numStdDev, that specifies the desired confidence level.
*
* - n is the number of independent randomized trials. It is given and therefore known.
*
* - p is the probability of a trial being a success. It is unknown.
* - k is the number of trials (out of n) that turn out to be successes. It is
* a random variable governed by a binomial distribution. After any given
* batch of n independent trials, the random variable k has a specific
* value which is observed and is therefore known.
* - pHat = k / n is an unbiased estimate of the unknown success
* probability p.
*
*
* Alternatively, consider a coin with unknown heads probability p. Where
* n is the number of independent flips of that coin, and k is the number
* of times that the coin comes up heads during a given batch of n flips.
* This class computes a frequentist confidence interval [lowerBoundOnP, upperBoundOnP] for the
* unknown p.
*
* Conceptually, the desired confidence level is specified by a tail probability delta.
*
* Ideally, over a large ensemble of independent batches of trials,
* the fraction of batches in which the true p lies below lowerBoundOnP would be at most
* delta, and the fraction of batches in which the true p lies above upperBoundOnP
* would also be at most delta.
*
*
Setting aside the philophical difficulties attaching to that statement, it isn't quite
* true because we are approximating the Clopper-Pearson interval.
*
* Finally, we point out that in this class's interface, the confidence parameter delta is
* not specified directly, but rather through a "number of standard deviations" numStdDev.
* The library effectively converts that to a delta via delta = normalCDF (-1.0 * numStdDev).
*
* It is perhaps worth emphasizing that the library is NOT merely adding and subtracting
* numStdDev standard deviations to the estimate. It is doing something better, that to some
* extent accounts for the fact that the binomial distribution has a non-gaussian shape.
*
* In particular, it is using an approximation to the inverse of the incomplete beta function
* that appears as formula 26.5.22 on page 945 of the "Handbook of Mathematical Functions"
* by Abramowitz and Stegun.
*
* @author Kevin Lang
*/
public final class BoundsOnBinomialProportions { // confidence intervals for binomial proportions
private BoundsOnBinomialProportions() {}
/**
* Computes lower bound of approximate Clopper-Pearson confidence interval for a binomial
* proportion.
*
* Implementation Notes:
* The approximateLowerBoundOnP is defined with respect to the right tail of the binomial
* distribution.
*
* - We want to solve for the p for which sumj,k,nbino(j;n,p)
* = delta.
* - We now restate that in terms of the left tail.
* - We want to solve for the p for which sumj,0,(k-1)bino(j;n,p)
* = 1 - delta.
* - Define x = 1-p.
* - We want to solve for the x for which Ix(n-k+1,k) = 1 - delta.
* - We specify 1-delta via numStdDevs through the right tail of the standard normal
* distribution.
* - Smaller values of numStdDevs correspond to bigger values of 1-delta and hence to smaller
* values of delta. In fact, usefully small values of delta correspond to negative values of
* numStdDevs.
* - return p = 1-x.
*
*
* @param n is the number of trials. Must be non-negative.
* @param k is the number of successes. Must be non-negative, and cannot exceed n.
* @param numStdDevs the number of standard deviations defining the confidence interval
* @return the lower bound of the approximate Clopper-Pearson confidence interval for the
* unknown success probability.
*/
public static double approximateLowerBoundOnP(long n, long k, double numStdDevs) {
checkInputs(n, k);
if (n == 0) { return 0.0; } // the coin was never flipped, so we know nothing
else if (k == 0) { return 0.0; }
else if (k == 1) { return (exactLowerBoundOnPForKequalsOne(n, deltaOfNumStdevs(numStdDevs))); }
else if (k == n) { return (exactLowerBoundOnPForKequalsN(n, deltaOfNumStdevs(numStdDevs))); }
else {
double x = abramowitzStegunFormula26p5p22(n - k + 1, k, (-1.0 * numStdDevs));
return (1.0 - x); // which is p
}
}
/**
* Computes upper bound of approximate Clopper-Pearson confidence interval for a binomial
* proportion.
*
* Implementation Notes:
* The approximateUpperBoundOnP is defined with respect to the left tail of the binomial
* distribution.
*
* - We want to solve for the p for which sumj,0,kbino(j;n,p)
* = delta.
* - Define x = 1-p.
* - We want to solve for the x for which Ix(n-k,k+1) = delta.
* - We specify delta via numStdDevs through the right tail of the standard normal
* distribution.
* - Bigger values of numStdDevs correspond to smaller values of delta.
* - return p = 1-x.
*
* @param n is the number of trials. Must be non-negative.
* @param k is the number of successes. Must be non-negative, and cannot exceed n.
* @param numStdDevs the number of standard deviations defining the confidence interval
* @return the upper bound of the approximate Clopper-Pearson confidence interval for the
* unknown success probability.
*/
public static double approximateUpperBoundOnP(long n, long k, double numStdDevs) {
checkInputs(n, k);
if (n == 0) { return 1.0; } // the coin was never flipped, so we know nothing
else if (k == n) { return 1.0; }
else if (k == n - 1) {
return (exactUpperBoundOnPForKequalsNminusOne(n, deltaOfNumStdevs(numStdDevs)));
}
else if (k == 0) {
return (exactUpperBoundOnPForKequalsZero(n, deltaOfNumStdevs(numStdDevs)));
}
else {
double x = abramowitzStegunFormula26p5p22(n - k, k + 1, numStdDevs);
return (1.0 - x); // which is p
}
}
/**
* Computes an estimate of an unknown binomial proportion.
* @param n is the number of trials. Must be non-negative.
* @param k is the number of successes. Must be non-negative, and cannot exceed n.
* @return the estimate of the unknown binomial proportion.
*/
public static double estimateUnknownP(long n, long k) {
checkInputs(n, k);
if (n == 0) { return 0.5; } // the coin was never flipped, so we know nothing
else { return ((double) k / (double) n); }
}
private static void checkInputs(long n, long k) {
if (n < 0) { throw new SketchesArgumentException("N must be non-negative"); }
if (k < 0) { throw new SketchesArgumentException("K must be non-negative"); }
if (k > n) { throw new SketchesArgumentException("K cannot exceed N"); }
}
/**
* Computes an approximation to the erf() function.
* @param x is the input to the erf function
* @return returns erf(x), accurate to roughly 7 decimal digits.
*/
public static double erf(double x) {
if (x < 0.0) { return (-1.0 * (erf_of_nonneg(-1.0 * x))); }
else { return (erf_of_nonneg(x)); }
}
/**
* Computes an approximation to normalCDF(x).
* @param x is the input to the normalCDF function
* @return returns the approximation to normalCDF(x).
*/
public static double normalCDF(double x) {
return (0.5 * (1.0 + (erf(x / (Math.sqrt(2.0))))));
}
//@formatter:off
// Abramowitz and Stegun formula 7.1.28, p. 88; Claims accuracy of about 7 decimal digits */
private static double erf_of_nonneg(double x) {
// The constants that appear below, formatted for easy checking against the book.
// a1 = 0.07052 30784
// a3 = 0.00927 05272
// a5 = 0.00027 65672
// a2 = 0.04228 20123
// a4 = 0.00015 20143
// a6 = 0.00004 30638
double a1 = 0.0705230784;
double a3 = 0.0092705272;
double a5 = 0.0002765672;
double a2 = 0.0422820123;
double a4 = 0.0001520143;
double a6 = 0.0000430638;
double x2 = x * x; // x squared, x cubed, etc.
double x3 = x2 * x;
double x4 = x2 * x2;
double x5 = x2 * x3;
double x6 = x3 * x3;
double sum = ( 1.0
+ a1 * x
+ a2 * x2
+ a3 * x3
+ a4 * x4
+ a5 * x5
+ a6 * x6 );
double sum2 = sum * sum; // raise the sum to the 16th power
double sum4 = sum2 * sum2;
double sum8 = sum4 * sum4;
double sum16 = sum8 * sum8;
return (1.0 - (1.0 / sum16));
}
private static double deltaOfNumStdevs(double kappa) {
return (normalCDF(-1.0 * kappa));
}
//@formatter:on
// Formula 26.5.22 on page 945 of Abramowitz & Stegun, which is an approximation
// of the inverse of the incomplete beta function I_x(a,b) = delta
// viewed as a scalar function of x.
// In other words, we specify delta, and it gives us x (with a and b held constant).
// However, delta is specified in an indirect way through yp which
// is the number of stdevs that leaves delta probability in the right
// tail of a standard gaussian distribution.
// We point out that the variable names correspond to those in the book,
// and it is worth keeping it that way so that it will always be easy to verify
// that the formula was typed in correctly.
private static double abramowitzStegunFormula26p5p22(double a, double b, double yp) {
double b2m1 = 2.0 * b - 1.0;
double a2m1 = 2.0 * a - 1.0;
double lambda = ((yp * yp) - 3.0) / 6.0;
double htmp = (1.0 / a2m1) + (1.0 / b2m1);
double h = 2.0 / htmp;
double term1 = yp * (Math.sqrt(h + lambda)) / h;
double term2 = (1.0 / b2m1) - (1.0 / a2m1);
double term3 = lambda + (5.0 / 6.0) - (2.0 / (3.0 * h));
double w = term1 - (term2 * term3);
double xp = a / (a + (b * (Math.exp(2.0 * w))));
return xp;
}
// Formulas for some special cases.
private static double exactUpperBoundOnPForKequalsZero(double n, double delta) {
return (1.0 - Math.pow(delta, (1.0 / n)));
}
private static double exactLowerBoundOnPForKequalsN(double n, double delta) {
return (Math.pow(delta, (1.0 / n)));
}
private static double exactLowerBoundOnPForKequalsOne(double n, double delta) {
return (1.0 - Math.pow((1.0 - delta), (1.0 / n)));
}
private static double exactUpperBoundOnPForKequalsNminusOne(double n, double delta) {
return (Math.pow((1.0 - delta), (1.0 / n)));
}
}