org.apache.commons.statistics.inference.Hypergeom Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.statistics.inference;
import org.apache.commons.statistics.distribution.HypergeometricDistribution;
/**
* Provide a wrapper around the {@link HypergeometricDistribution} that caches
* all probability mass values.
*
* This class extracts the logic from the HypergeometricDistribution implementation
* used for the cumulative probability functions. It allows fast computation of
* the CDF and SF for the entire supported domain.
*
* @since 1.1
*/
class Hypergeom {
/** 1/2. */
private static final double HALF = 0.5;
/** The lower bound of the support (inclusive). */
private final int lowerBound;
/** The upper bound of the support (inclusive). */
private final int upperBound;
/** Cached probability values. This holds values from x=0 even though the supported
* lower bound may be above x=0. This allows x to be used as an index without offsetting
* using the lower bound. */
private final double[] prob;
/** Cached midpoint, m, of the CDF/SF. This is not the true median. It is the value where
* the CDF is closest to 0.5; as such the CDF(m) may be below 0.5 if the next value
* CDF(m+1) is further from 0.5. Used for the cumulative probability functions. */
private final int m;
/** Cached CDF of the midpoint.
* Used for the cumulative probability functions. */
private final double midCDF;
/** Lower mode. */
private final int m1;
/** Upper mode. */
private final int m2;
/**
* @param populationSize Population size.
* @param numberOfSuccesses Number of successes in the population.
* @param sampleSize Sample size.
*/
Hypergeom(int populationSize,
int numberOfSuccesses,
int sampleSize) {
final HypergeometricDistribution dist =
HypergeometricDistribution.of(populationSize, numberOfSuccesses, sampleSize);
// Cache all values required to compute the cumulative probability functions
// Bounds
lowerBound = dist.getSupportLowerBound();
upperBound = dist.getSupportUpperBound();
// PMF values
prob = new double[upperBound + 1];
for (int x = lowerBound; x <= upperBound; x++) {
prob[x] = dist.probability(x);
}
// Compute mid-point for CDF/SF computation
// Find the closest sum(PDF) to 0.5.
int x = lowerBound;
double p0 = 0;
double p1 = prob[x];
// No check of the upper bound required here as the CDF should sum to 1 and 0.5
// is exceeded before a bounds error.
while (p1 < HALF) {
x++;
p0 = p1;
p1 += prob[x];
}
// p1 >= 0.5 > p0
// Pick closet
if (p1 - HALF >= HALF - p0) {
x--;
p1 = p0;
}
m = x;
midCDF = p1;
// Compute the mode (lower != upper in the case where v is integer).
// This value is used by the UnconditionedExactTest and is cached here for convenience.
final double v = ((double) numberOfSuccesses + 1) * ((double) sampleSize + 1) / (populationSize + 2.0);
m1 = (int) Math.ceil(v) - 1;
m2 = (int) Math.floor(v);
}
/**
* Get the lower bound of the support.
*
* @return lower bound
*/
int getSupportLowerBound() {
return lowerBound;
}
/**
* Get the upper bound of the support.
*
* @return upper bound
*/
int getSupportUpperBound() {
return upperBound;
}
/**
* Get the lower mode of the distribution.
*
* @return lower mode
*/
int getLowerMode() {
return m1;
}
/**
* Get the upper mode of the distribution.
*
* @return upper mode
*/
int getUpperMode() {
return m2;
}
/**
* Compute the probability mass function (PMF) at the specified value.
*
* @param x Value.
* @return P(X = x)
* @throws IndexOutOfBoundsException if the value {@code x} is not in the supported domain.
*/
double pmf(int x) {
return prob[x];
}
/**
* Compute the cumulative distribution function (CDF) at the specified value.
*
* @param x Value.
* @return P(X <= x)
*/
double cdf(int x) {
if (x < lowerBound) {
return 0.0;
} else if (x >= upperBound) {
return 1.0;
}
if (x < m) {
return innerCumulativeProbability(lowerBound, x);
} else if (x > m) {
return 1 - innerCumulativeProbability(upperBound, x + 1);
}
// cdf(x)
return midCDF;
}
/**
* Compute the survival function (SF) at the specified value. This is the complementary
* cumulative distribution function.
*
* @param x Value.
* @return P(X > x)
*/
double sf(int x) {
if (x < lowerBound) {
return 1.0;
} else if (x >= upperBound) {
return 0.0;
}
if (x < m) {
return 1 - innerCumulativeProbability(lowerBound, x);
} else if (x > m) {
return innerCumulativeProbability(upperBound, x + 1);
}
// 1 - cdf(x)
return 1 - midCDF;
}
/**
* For this distribution, {@code X}, this method returns
* {@code P(x0 <= X <= x1)}.
* This probability is computed by summing the point probabilities for the
* values {@code x0, x0 + dx, x0 + 2 * dx, ..., x1}; the direction {@code dx} is determined
* using a comparison of the input bounds.
* This should be called by using {@code x0} as the domain limit and {@code x1}
* as the internal value. This will result in a sum of increasingly larger magnitudes.
*
* @param x0 Inclusive domain bound.
* @param x1 Inclusive internal bound.
* @return {@code P(x0 <= X <= x1)}.
*/
private double innerCumulativeProbability(int x0, int x1) {
// Assume the range is within the domain.
int x = x0;
double ret = prob[x];
if (x0 < x1) {
while (x != x1) {
x++;
ret += prob[x];
}
} else {
while (x != x1) {
x--;
ret += prob[x];
}
}
return ret;
}
}