All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.commons.statistics.inference.FisherExactTest Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.statistics.inference;

import java.util.Objects;
import org.apache.commons.statistics.distribution.HypergeometricDistribution;

/**
 * Implements Fisher's exact test.
 *
 * 

Performs an exact test for the statistical significance of the association (contingency) * between two kinds of categorical classification. * *

Fisher's test applies in the case that the row sums and column sums are fixed in advance * and not random. * * @see Fisher's exact test (Wikipedia) * @since 1.1 */ public final class FisherExactTest { /** Default instance. */ private static final FisherExactTest DEFAULT = new FisherExactTest(AlternativeHypothesis.TWO_SIDED); /** Alternative hypothesis. */ private final AlternativeHypothesis alternative; /** * @param alternative Alternative hypothesis. */ private FisherExactTest(AlternativeHypothesis alternative) { this.alternative = alternative; } /** * Return an instance using the default options. * *

    *
  • {@link AlternativeHypothesis#TWO_SIDED} *
* * @return default instance */ public static FisherExactTest withDefaults() { return DEFAULT; } /** * Return an instance with the configured alternative hypothesis. * * @param v Value. * @return an instance */ public FisherExactTest with(AlternativeHypothesis v) { return new FisherExactTest(Objects.requireNonNull(v)); } /** * Compute the prior odds ratio for the 2-by-2 contingency table. This is the * "sample" or "unconditional" maximum likelihood estimate. For a table of: * *

\[ \left[ {\begin{array}{cc} * a & b \\ * c & d \\ * \end{array} } \right] \] * *

this is: * *

\[ r = \frac{a d}{b c} \] * *

Special cases: *

    *
  • If the denominator is zero, the value is {@linkplain Double#POSITIVE_INFINITY infinity}. *
  • If a row or column sum is zero, the value is {@link Double#NaN NaN}. *
* *

Note: This statistic is equal to the statistic computed by the SciPy function * {@code scipy.stats.fisher_exact}. It is different to the conditional maximum * likelihood estimate computed by R function {@code fisher.test}. * * @param table 2-by-2 contingency table. * @return odds ratio * @throws IllegalArgumentException if the {@code table} is not a 2-by-2 table; any * table entry is negative; or the sum of the table is 0 or larger than a 32-bit signed integer. * @see #test(int[][]) */ public double statistic(int[][] table) { Arguments.checkTable(table); final double a = table[0][0]; final double b = table[0][1]; final double c = table[1][0]; final double d = table[1][1]; return (a * d) / (b * c); } /** * Performs Fisher's exact test on the 2-by-2 contingency table. * *

The test statistic is equal to the prior odds ratio. This is the * "sample" or "unconditional" maximum likelihood estimate. * *

The test is defined by the {@link AlternativeHypothesis}. * *

For a table of [[a, b], [c, d]] the possible values of any table are conditioned * with the same marginals (row and column totals). In this case the possible values {@code x} * of the upper-left element {@code a} are {@code min(0, a - d) <= x <= a + min(b, c)}. *

    *
  • 'two-sided': the odds ratio of the underlying population is not one; the p-value * is the probability that a random table has probability equal to or less than the input table. *
  • 'greater': the odds ratio of the underlying population is greater than one; the p-value * is the probability that a random table has {@code x >= a}. *
  • 'less': the odds ratio of the underlying population is less than one; the p-value * is the probability that a random table has {@code x <= a}. *
* * @param table 2-by-2 contingency table. * @return test result * @throws IllegalArgumentException if the {@code table} is not a 2-by-2 table; any * table entry is negative; or the sum of the table is 0 or larger than a 32-bit signed integer. * @see #with(AlternativeHypothesis) * @see #statistic(int[][]) */ public SignificanceResult test(int[][] table) { Arguments.checkTable(table); final int a = table[0][0]; final int b = table[0][1]; final int c = table[1][0]; final int d = table[1][1]; // Odd-ratio. final double statistic = ((double) a * d) / ((double) b * c); final int nn = a + b + c + d; final int k = a + b; final int n = a + c; // Note: The distribution validates the population size is > 0 final HypergeometricDistribution distribution = HypergeometricDistribution.of(nn, k, n); final double p; if (alternative == AlternativeHypothesis.GREATER_THAN) { p = distribution.survivalProbability(a - 1); } else if (alternative == AlternativeHypothesis.LESS_THAN) { p = distribution.cumulativeProbability(a); } else { p = twoSidedTest(a, distribution); } return new BaseSignificanceResult(statistic, p); } /** * Returns the observed significance level, or p-value, associated with a * two-sided test about the observed value. * * @param k Observed value. * @param distribution Hypergeometric distribution. * @return p-value */ private static double twoSidedTest(int k, HypergeometricDistribution distribution) { // Find all i where Pr(X = i) <= Pr(X = k) and sum them. // Exploit the known unimodal distribution to increase the // search speed. Note the search depends only on magnitude differences. // The current HypergeometricDistribution is faster using log probability // as it omits a call to Math.exp. // Use the mode as the point of largest probability. // The lower or upper mode is important for the search below. final int nn = distribution.getPopulationSize(); final int kk = distribution.getNumberOfSuccesses(); final int n = distribution.getSampleSize(); final double v = ((double) n + 1) * ((double) kk + 1) / (nn + 2.0); final int m1 = (int) Math.ceil(v) - 1; final int m2 = (int) Math.floor(v); if (k < m1) { final double pk = distribution.logProbability(k); // Lower half = cdf(k) // Find upper half. As k < lower mode i should never // reach the lower mode based on the probability alone. // Bracket with the upper mode. final int i = Searches.searchDescending(m2, distribution.getSupportUpperBound(), pk, distribution::logProbability); return distribution.cumulativeProbability(k) + distribution.survivalProbability(i - 1); } else if (k > m2) { final double pk = distribution.logProbability(k); // Upper half = sf(k - 1) // Find lower half. As k > upper mode i should never // reach the upper mode based on the probability alone. // Bracket with the lower mode. final int i = Searches.searchAscending(distribution.getSupportLowerBound(), m1, pk, distribution::logProbability); return distribution.cumulativeProbability(i) + distribution.survivalProbability(k - 1); } // k == mode // Edge case where the sum of probabilities will be either // 1 or 1 - Pr(X = mode) where mode != k final double pk = distribution.probability(k); final double pm = distribution.probability(k == m1 ? m2 : m1); return pm > pk ? 1 - pm : 1; } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy