All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.commons.statistics.distribution.HypergeometricDistribution Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.commons.statistics.distribution;

import java.util.function.DoublePredicate;

/**
 * Implementation of the hypergeometric distribution.
 *
 * 

The probability mass function of \( X \) is: * *

\[ f(k; N, K, n) = \frac{\binom{K}{k} \binom{N - K}{n-k}}{\binom{N}{n}} \] * *

for \( N \in \{0, 1, 2, \dots\} \) the population size, * \( K \in \{0, 1, \dots, N\} \) the number of success states, * \( n \in \{0, 1, \dots, N\} \) the number of samples, * \( k \in \{\max(0, n+K-N), \dots, \min(n, K)\} \) the number of successes, and * *

\[ \binom{a}{b} = \frac{a!}{b! \, (a-b)!} \] * *

is the binomial coefficient. * * @see Hypergeometric distribution (Wikipedia) * @see Hypergeometric distribution (MathWorld) */ public final class HypergeometricDistribution extends AbstractDiscreteDistribution { /** 1/2. */ private static final double HALF = 0.5; /** The number of successes in the population. */ private final int numberOfSuccesses; /** The population size. */ private final int populationSize; /** The sample size. */ private final int sampleSize; /** The lower bound of the support (inclusive). */ private final int lowerBound; /** The upper bound of the support (inclusive). */ private final int upperBound; /** Binomial probability of success (sampleSize / populationSize). */ private final double bp; /** Binomial probability of failure ((populationSize - sampleSize) / populationSize). */ private final double bq; /** Cached midpoint of the CDF/SF. The array holds [x, cdf(x)] for the midpoint x. * Used for the cumulative probability functions. */ private double[] midpoint; /** * @param populationSize Population size. * @param numberOfSuccesses Number of successes in the population. * @param sampleSize Sample size. */ private HypergeometricDistribution(int populationSize, int numberOfSuccesses, int sampleSize) { this.numberOfSuccesses = numberOfSuccesses; this.populationSize = populationSize; this.sampleSize = sampleSize; lowerBound = getLowerDomain(populationSize, numberOfSuccesses, sampleSize); upperBound = getUpperDomain(numberOfSuccesses, sampleSize); bp = (double) sampleSize / populationSize; bq = (double) (populationSize - sampleSize) / populationSize; } /** * Creates a hypergeometric distribution. * * @param populationSize Population size. * @param numberOfSuccesses Number of successes in the population. * @param sampleSize Sample size. * @return the distribution * @throws IllegalArgumentException if {@code numberOfSuccesses < 0}, or * {@code populationSize <= 0} or {@code numberOfSuccesses > populationSize}, or * {@code sampleSize > populationSize}. */ public static HypergeometricDistribution of(int populationSize, int numberOfSuccesses, int sampleSize) { if (populationSize <= 0) { throw new DistributionException(DistributionException.NOT_STRICTLY_POSITIVE, populationSize); } if (numberOfSuccesses < 0) { throw new DistributionException(DistributionException.NEGATIVE, numberOfSuccesses); } if (sampleSize < 0) { throw new DistributionException(DistributionException.NEGATIVE, sampleSize); } if (numberOfSuccesses > populationSize) { throw new DistributionException(DistributionException.TOO_LARGE, numberOfSuccesses, populationSize); } if (sampleSize > populationSize) { throw new DistributionException(DistributionException.TOO_LARGE, sampleSize, populationSize); } return new HypergeometricDistribution(populationSize, numberOfSuccesses, sampleSize); } /** * Return the lowest domain value for the given hypergeometric distribution * parameters. * * @param nn Population size. * @param k Number of successes in the population. * @param n Sample size. * @return the lowest domain value of the hypergeometric distribution. */ private static int getLowerDomain(int nn, int k, int n) { // Avoid overflow given N > n: // n + K - N == K - (N - n) return Math.max(0, k - (nn - n)); } /** * Return the highest domain value for the given hypergeometric distribution * parameters. * * @param k Number of successes in the population. * @param n Sample size. * @return the highest domain value of the hypergeometric distribution. */ private static int getUpperDomain(int k, int n) { return Math.min(n, k); } /** * Gets the population size parameter of this distribution. * * @return the population size. */ public int getPopulationSize() { return populationSize; } /** * Gets the number of successes parameter of this distribution. * * @return the number of successes. */ public int getNumberOfSuccesses() { return numberOfSuccesses; } /** * Gets the sample size parameter of this distribution. * * @return the sample size. */ public int getSampleSize() { return sampleSize; } /** {@inheritDoc} */ @Override public double probability(int x) { return Math.exp(logProbability(x)); } /** {@inheritDoc} */ @Override public double probability(int x0, int x1) { if (x0 > x1) { throw new DistributionException(DistributionException.INVALID_RANGE_LOW_GT_HIGH, x0, x1); } if (x0 == x1 || x1 < lowerBound) { return 0; } // If the range is outside the bounds use the appropriate cumulative probability if (x0 < lowerBound) { return cumulativeProbability(x1); } if (x1 >= upperBound) { // 1 - cdf(x0) return survivalProbability(x0); } // Here: lower <= x0 < x1 < upper: // sum(pdf(x)) for x in (x0, x1] final int lo = x0 + 1; // Sum small values first by starting at the point the greatest distance from the mode. final int mode = (int) Math.floor((sampleSize + 1.0) * (numberOfSuccesses + 1.0) / (populationSize + 2.0)); return Math.abs(mode - lo) > Math.abs(mode - x1) ? innerCumulativeProbability(lo, x1) : innerCumulativeProbability(x1, lo); } /** {@inheritDoc} */ @Override public double logProbability(int x) { if (x < lowerBound || x > upperBound) { return Double.NEGATIVE_INFINITY; } return computeLogProbability(x); } /** * Compute the log probability. * * @param x Value. * @return log(P(X = x)) */ private double computeLogProbability(int x) { final double p1 = SaddlePointExpansionUtils.logBinomialProbability(x, numberOfSuccesses, bp, bq); final double p2 = SaddlePointExpansionUtils.logBinomialProbability(sampleSize - x, populationSize - numberOfSuccesses, bp, bq); final double p3 = SaddlePointExpansionUtils.logBinomialProbability(sampleSize, populationSize, bp, bq); return p1 + p2 - p3; } /** {@inheritDoc} */ @Override public double cumulativeProbability(int x) { if (x < lowerBound) { return 0.0; } else if (x >= upperBound) { return 1.0; } final double[] mid = getMidPoint(); final int m = (int) mid[0]; if (x < m) { return innerCumulativeProbability(lowerBound, x); } else if (x > m) { return 1 - innerCumulativeProbability(upperBound, x + 1); } // cdf(x) return mid[1]; } /** {@inheritDoc} */ @Override public double survivalProbability(int x) { if (x < lowerBound) { return 1.0; } else if (x >= upperBound) { return 0.0; } final double[] mid = getMidPoint(); final int m = (int) mid[0]; if (x < m) { return 1 - innerCumulativeProbability(lowerBound, x); } else if (x > m) { return innerCumulativeProbability(upperBound, x + 1); } // 1 - cdf(x) return 1 - mid[1]; } /** * For this distribution, {@code X}, this method returns * {@code P(x0 <= X <= x1)}. * This probability is computed by summing the point probabilities for the * values {@code x0, x0 + dx, x0 + 2 * dx, ..., x1}; the direction {@code dx} is determined * using a comparison of the input bounds. * This should be called by using {@code x0} as the domain limit and {@code x1} * as the internal value. This will result in an initial sum of increasing larger magnitudes. * * @param x0 Inclusive domain bound. * @param x1 Inclusive internal bound. * @return {@code P(x0 <= X <= x1)}. */ private double innerCumulativeProbability(int x0, int x1) { // Assume the range is within the domain. // Reuse the computation for probability(x) but avoid checking the domain for each call. int x = x0; double ret = Math.exp(computeLogProbability(x)); if (x0 < x1) { while (x != x1) { x++; ret += Math.exp(computeLogProbability(x)); } } else { while (x != x1) { x--; ret += Math.exp(computeLogProbability(x)); } } return ret; } @Override public int inverseCumulativeProbability(double p) { ArgumentUtils.checkProbability(p); return computeInverseProbability(p, 1 - p, false); } @Override public int inverseSurvivalProbability(double p) { ArgumentUtils.checkProbability(p); return computeInverseProbability(1 - p, p, true); } /** * Implementation for the inverse cumulative or survival probability. * * @param p Cumulative probability. * @param q Survival probability. * @param complement Set to true to compute the inverse survival probability. * @return the value */ private int computeInverseProbability(double p, double q, boolean complement) { if (p == 0) { return lowerBound; } if (q == 0) { return upperBound; } // Sum the PDF(x) until the appropriate p-value is obtained // CDF: require smallest x where P(X<=x) >= p // SF: require smallest x where P(X>x) <= q // The choice of summation uses the mid-point. // The test on the CDF or SF is based on the appropriate input p-value. final double[] mid = getMidPoint(); final int m = (int) mid[0]; final double mp = mid[1]; final int midPointComparison = complement ? Double.compare(1 - mp, q) : Double.compare(p, mp); if (midPointComparison < 0) { return inverseLower(p, q, complement); } else if (midPointComparison > 0) { // Avoid floating-point summation error when the mid-point computed using the // lower sum is different to the midpoint computed using the upper sum. // Here we know the result must be above the midpoint so we can clip the result. return Math.max(m + 1, inverseUpper(p, q, complement)); } // Exact mid-point return m; } /** * Compute the inverse cumulative or survival probability using the lower sum. * * @param p Cumulative probability. * @param q Survival probability. * @param complement Set to true to compute the inverse survival probability. * @return the value */ private int inverseLower(double p, double q, boolean complement) { // Sum from the lower bound (computing the cdf) int x = lowerBound; final DoublePredicate test = complement ? i -> 1 - i > q : i -> i < p; double cdf = Math.exp(computeLogProbability(x)); while (test.test(cdf)) { x++; cdf += Math.exp(computeLogProbability(x)); } return x; } /** * Compute the inverse cumulative or survival probability using the upper sum. * * @param p Cumulative probability. * @param q Survival probability. * @param complement Set to true to compute the inverse survival probability. * @return the value */ private int inverseUpper(double p, double q, boolean complement) { // Sum from the upper bound (computing the sf) int x = upperBound; final DoublePredicate test = complement ? i -> i < q : i -> 1 - i > p; double sf = 0; while (test.test(sf)) { sf += Math.exp(computeLogProbability(x)); x--; } // Here either sf(x) >= q, or cdf(x) <= p // Ensure sf(x) <= q, or cdf(x) >= p if (complement && sf > q || !complement && 1 - sf < p) { x++; } return x; } /** * {@inheritDoc} * *

For population size \( N \), number of successes \( K \), and sample * size \( n \), the mean is: * *

\[ n \frac{K}{N} \] */ @Override public double getMean() { return getSampleSize() * (getNumberOfSuccesses() / (double) getPopulationSize()); } /** * {@inheritDoc} * *

For population size \( N \), number of successes \( K \), and sample * size \( n \), the variance is: * *

\[ n \frac{K}{N} \frac{N-K}{N} \frac{N-n}{N-1} \] */ @Override public double getVariance() { final double N = getPopulationSize(); final double K = getNumberOfSuccesses(); final double n = getSampleSize(); return (n * K * (N - K) * (N - n)) / (N * N * (N - 1)); } /** * {@inheritDoc} * *

For population size \( N \), number of successes \( K \), and sample * size \( n \), the lower bound of the support is \( \max \{ 0, n + K - N \} \). * * @return lower bound of the support */ @Override public int getSupportLowerBound() { return lowerBound; } /** * {@inheritDoc} * *

For number of successes \( K \), and sample * size \( n \), the upper bound of the support is \( \min \{ n, K \} \). * * @return upper bound of the support */ @Override public int getSupportUpperBound() { return upperBound; } /** * Return the mid-point {@code x} of the distribution, and the cdf(x). * *

This is not the true median. It is the value where the CDF(x) is closest to 0.5; * as such the CDF may be below 0.5 if the next value of x is further from 0.5. * * @return the mid-point ([x, cdf(x)]) */ private double[] getMidPoint() { double[] v = midpoint; if (v == null) { // Find the closest sum(PDF) to 0.5 int x = lowerBound; double p0 = 0; double p1 = Math.exp(computeLogProbability(x)); // No check of the upper bound required here as the CDF should sum to 1 and 0.5 // is exceeded before a bounds error. while (p1 < HALF) { x++; p0 = p1; p1 += Math.exp(computeLogProbability(x)); } // p1 >= 0.5 > p0 // Pick closet if (p1 - HALF >= HALF - p0) { x--; p1 = p0; } midpoint = v = new double[] {x, p1}; } return v; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy