org.apache.commons.statistics.inference.BinomialTest Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.statistics.inference;
import java.util.Objects;
import org.apache.commons.statistics.distribution.BinomialDistribution;
/**
* Implements binomial test statistics.
*
* Performs an exact test for the statistical significance of deviations from a
* theoretically expected distribution of observations into two categories.
*
* @see Binomial test (Wikipedia)
* @since 1.1
*/
public final class BinomialTest {
/** Default instance. */
private static final BinomialTest DEFAULT = new BinomialTest(AlternativeHypothesis.TWO_SIDED);
/** Alternative hypothesis. */
private final AlternativeHypothesis alternative;
/**
* @param alternative Alternative hypothesis.
*/
private BinomialTest(AlternativeHypothesis alternative) {
this.alternative = alternative;
}
/**
* Return an instance using the default options.
*
*
* - {@link AlternativeHypothesis#TWO_SIDED}
*
*
* @return default instance
*/
public static BinomialTest withDefaults() {
return DEFAULT;
}
/**
* Return an instance with the configured alternative hypothesis.
*
* @param v Value.
* @return an instance
*/
public BinomialTest with(AlternativeHypothesis v) {
return new BinomialTest(Objects.requireNonNull(v));
}
/**
* Performs a binomial test about the probability of success \( \pi \).
*
* The null hypothesis is \( H_0:\pi=\pi_0 \) where \( \pi_0 \) is between 0 and 1.
*
*
The probability of observing \( k \) successes from \( n \) trials with a given
* probability of success \( p \) is:
*
*
\[ \Pr(X=k)=\binom{n}{k}p^k(1-p)^{n-k} \]
*
*
The test is defined by the {@link AlternativeHypothesis}.
*
*
To test \( \pi < \pi_0 \) (less than):
*
*
\[ p = \sum_{i=0}^k\Pr(X=i)=\sum_{i=0}^k\binom{n}{i}\pi_0^i(1-\pi_0)^{n-i} \]
*
*
To test \( \pi > \pi_0 \) (greater than):
*
*
\[ p = \sum_{i=0}^k\Pr(X=i)=\sum_{i=k}^n\binom{n}{i}\pi_0^i(1-\pi_0)^{n-i} \]
*
*
To test \( \pi \ne \pi_0 \) (two-sided) requires finding all \( i \) such that
* \( \mathcal{I}=\{i:\Pr(X=i)\leq \Pr(X=k)\} \) and compute the sum:
*
*
\[ p = \sum_{i\in\mathcal{I}}\Pr(X=i)=\sum_{i\in\mathcal{I}}\binom{n}{i}\pi_0^i(1-\pi_0)^{n-i} \]
*
*
The two-sided p-value represents the likelihood of getting a result at least as
* extreme as the sample, given the provided {@code probability} of success on a
* single trial.
*
*
The test statistic is equal to the estimated proportion \( \frac{k}{n} \).
*
* @param numberOfTrials Number of trials performed.
* @param numberOfSuccesses Number of successes observed.
* @param probability Assumed probability of a single trial under the null
* hypothesis.
* @return test result
* @throws IllegalArgumentException if {@code numberOfTrials} or
* {@code numberOfSuccesses} is negative; {@code probability} is not between 0
* and 1; or if {@code numberOfTrials < numberOfSuccesses}
* @see #with(AlternativeHypothesis)
*/
public SignificanceResult test(int numberOfTrials, int numberOfSuccesses, double probability) {
// Note: The distribution validates number of trials and probability.
// Here we only have to validate the number of successes.
Arguments.checkNonNegative(numberOfSuccesses);
if (numberOfTrials < numberOfSuccesses) {
throw new InferenceException(
"must have n >= k for binomial coefficient (n, k), got n = %d, k = %d",
numberOfSuccesses, numberOfTrials);
}
final BinomialDistribution distribution = BinomialDistribution.of(numberOfTrials, probability);
final double p;
if (alternative == AlternativeHypothesis.GREATER_THAN) {
p = distribution.survivalProbability(numberOfSuccesses - 1);
} else if (alternative == AlternativeHypothesis.LESS_THAN) {
p = distribution.cumulativeProbability(numberOfSuccesses);
} else {
p = twoSidedBinomialTest(numberOfTrials, numberOfSuccesses, probability, distribution);
}
return new BaseSignificanceResult((double) numberOfSuccesses / numberOfTrials, p);
}
/**
* Returns the observed significance level, or p-value, associated with a
* two-sided binomial test about the probability of success \( \pi \).
*
* @param n Number of trials performed.
* @param k Number of successes observed.
* @param probability Assumed probability of a single trial under the null
* hypothesis.
* @param distribution Binomial distribution.
* @return p-value
*/
private static double twoSidedBinomialTest(int n, int k, double probability,
BinomialDistribution distribution) {
// Find all i where Pr(X = i) <= Pr(X = k) and sum them.
// Exploit the known unimodal distribution to increase the
// search speed. Note the search depends only on magnitude differences.
// The current BinomialDistribution is faster using log probability
// as it omits a call to Math.exp.
// Use the mode as the point of largest probability.
// The lower or upper mode is important for the search below.
final int m1 = (int) Math.ceil((n + 1.0) * probability) - 1;
final int m2 = (int) Math.floor((n + 1.0) * probability);
if (k < m1) {
final double pk = distribution.logProbability(k);
// Lower half = cdf(k)
// Find upper half. As k < lower mode i should never
// reach the lower mode based on the probability alone.
// Bracket with the upper mode.
final int i = Searches.searchDescending(m2, n, pk, distribution::logProbability);
return distribution.cumulativeProbability(k) +
distribution.survivalProbability(i - 1);
} else if (k > m2) {
final double pk = distribution.logProbability(k);
// Upper half = sf(k - 1)
// Find lower half. As k > upper mode i should never
// reach the upper mode based on the probability alone.
// Bracket with the lower mode.
final int i = Searches.searchAscending(0, m1, pk, distribution::logProbability);
return distribution.cumulativeProbability(i) +
distribution.survivalProbability(k - 1);
}
// k == mode
// Edge case where the sum of probabilities will be either
// 1 or 1 - Pr(X = mode) where mode != k
final double pk = distribution.probability(k);
final double pm = distribution.probability(k == m1 ? m2 : m1);
return pm > pk ? 1 - pm : 1;
}
}