All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.commons.statistics.inference.GTest Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.statistics.inference;

import org.apache.commons.numbers.core.Sum;
import org.apache.commons.statistics.descriptive.LongMean;
import org.apache.commons.statistics.distribution.ChiSquaredDistribution;

/**
 * Implements G-test (Generalized Log-Likelihood Ratio Test) statistics.
 *
 * 

This is known in statistical genetics as the McDonald-Kreitman test. * The implementation handles both known and unknown distributions. * *

Two samples tests can be used when the distribution is unknown a priori * but provided by one sample, or when the hypothesis under test is that the two * samples come from the same underlying distribution. * * @see G-test (Wikipedia) * @since 1.1 */ public final class GTest { // Note: // The g-test statistic is a summation of terms with positive and negative sign // and thus the sum may exhibit cancellation. This class uses separate high precision // sums of the positive and negative terms which are then combined. // Total cancellation for a large number of terms will not impact // p-values of interest around critical alpha values as the Chi^2 // distribution exhibits strong concentration around its mean (degrees of freedom, k). // The summation only need maintain enough bits in the final sum to distinguish // g values around critical alpha values where 0 << chisq.sf(g, k) << 0.5: g > k, // with k = number of terms - 1. /** Default instance. */ private static final GTest DEFAULT = new GTest(0); /** Degrees of freedom adjustment. */ private final int degreesOfFreedomAdjustment; /** * @param degreesOfFreedomAdjustment Degrees of freedom adjustment. */ private GTest(int degreesOfFreedomAdjustment) { this.degreesOfFreedomAdjustment = degreesOfFreedomAdjustment; } /** * Return an instance using the default options. * *

    *
  • {@linkplain #withDegreesOfFreedomAdjustment(int) Degrees of freedom adjustment = 0} *
* * @return default instance */ public static GTest withDefaults() { return DEFAULT; } /** * Return an instance with the configured degrees of freedom adjustment. * *

The default degrees of freedom for a sample of length {@code n} are * {@code n - 1}. An intrinsic null hypothesis is one where you estimate one or * more parameters from the data in order to get the numbers for your null * hypothesis. For a distribution with {@code p} parameters where up to * {@code p} parameters have been estimated from the data the degrees of freedom * is in the range {@code [n - 1 - p, n - 1]}. * * @param v Value. * @return an instance * @throws IllegalArgumentException if the value is negative */ public GTest withDegreesOfFreedomAdjustment(int v) { return new GTest(Arguments.checkNonNegative(v)); } /** * Computes the G-test goodness-of-fit statistic comparing the {@code observed} counts to * a uniform expected value (each category is equally likely). * *

Note: This is a specialized version of a comparison of {@code observed} * with an {@code expected} array of uniform values. The result is faster than * calling {@link #statistic(double[], long[])} and the statistic is the same, * with an allowance for accumulated floating-point error due to the optimized * routine. * * @param observed Observed frequency counts. * @return G-test statistic * @throws IllegalArgumentException if the sample size is less than 2; * {@code observed} has negative entries; or all the observations are zero. * @see #test(long[]) */ public double statistic(long[] observed) { Arguments.checkValuesRequiredSize(observed.length, 2); Arguments.checkNonNegative(observed); final double e = LongMean.of(observed).getAsDouble(); if (e == 0) { throw new InferenceException(InferenceException.NO_DATA); } // g = 2 * sum{o * ln(o/e)} // = 2 * [ sum{o * ln(o)} - sum(o) * ln(e) ] // The second form has more cancellation as the sums are larger. // Separate sum for positive and negative terms. final Sum sum = Sum.create(); final Sum sum2 = Sum.create(); for (final double o : observed) { if (o > e) { // Positive term sum.add(o * Math.log(o / e)); } else if (o > 0) { // Negative term // Process non-zero counts to avoid 0 * -inf = NaN sum2.add(o * Math.log(o / e)); } } return sum.add(sum2).getAsDouble() * 2; } /** * Computes the G-test goodness-of-fit statistic comparing {@code observed} and {@code expected} * frequency counts. * *

Note:This implementation rescales the values * if necessary to ensure that the sum of the expected and observed counts * are equal. * * @param expected Expected frequency counts. * @param observed Observed frequency counts. * @return G-test statistic * @throws IllegalArgumentException if the sample size is less than 2; the array * sizes do not match; {@code expected} has entries that are not strictly * positive; {@code observed} has negative entries; or all the observations are zero. * @see #test(double[], long[]) */ public double statistic(double[] expected, long[] observed) { // g = 2 * sum{o * ln(o/e)} // The sum of o and e must be the same. final double ratio = StatisticUtils.computeRatio(expected, observed); // High precision sum to reduce cancellation. // Separate sum for positive and negative terms. final Sum sum = Sum.create(); final Sum sum2 = Sum.create(); for (int i = 0; i < observed.length; i++) { final long o = observed[i]; // Process non-zero counts to avoid 0 * -inf = NaN if (o != 0) { final double term = o * Math.log(o / (ratio * expected[i])); if (term < 0) { sum2.add(term); } else { sum.add(term); } } } return sum.add(sum2).getAsDouble() * 2; } /** * Computes a G-test statistic associated with a G-test of * independence based on the input {@code counts} array, viewed as a two-way * table. The formula used to compute the test statistic is: * *

\[ G = 2 \cdot \sum_{ij}{O_{ij}} \cdot \left[ H(r) + H(c) - H(r,c) \right] \] * *

and \( H \) is the * Shannon Entropy of the random variable formed by viewing the elements of * the argument array as incidence counts: * *

\[ H(X) = - {\sum_{x \in \text{Supp}(X)} p(x) \ln p(x)} \] * * @param counts 2-way table. * @return G-test statistic * @throws IllegalArgumentException if the number of rows or columns is less * than 2; the array is non-rectangular; the array has negative entries; or the * sum of a row or column is zero. * @see ChiSquareTest#test(long[][]) */ public double statistic(long[][] counts) { Arguments.checkCategoriesRequiredSize(counts.length, 2); Arguments.checkValuesRequiredSize(counts[0].length, 2); Arguments.checkRectangular(counts); Arguments.checkNonNegative(counts); final int ni = counts.length; final int nj = counts[0].length; // Compute row, column and total sums final double[] sumi = new double[ni]; final double[] sumj = new double[nj]; double n = 0; // We can sum data on the first pass. See below for computation details. final Sum sum = Sum.create(); for (int i = 0; i < ni; i++) { for (int j = 0; j < nj; j++) { final long c = counts[i][j]; sumi[i] += c; sumj[j] += c; if (c > 1) { sum.add(c * Math.log(c)); } } checkNonZero(sumi[i], "Row", i); n += sumi[i]; } for (int j = 0; j < nj; j++) { checkNonZero(sumj[j], "Column", j); } // This computes a modified form of the Shannon entropy H without requiring // normalisation of observations to probabilities and without negation, // i.e. we compute n * [ H(r) + H(c) - H(r,c) ] as [ H'(r,c) - H'(r) - H'(c) ]. 
// H = -sum (p * log(p)) // H' = n * sum (p * log(p)) // = n * sum (o/n * log(o/n)) // = n * [ sum(o/n * log(o)) - sum(o/n * log(n)) ] // = sum(o * log(o)) - n log(n) // After 3 modified entropy sums H'(r,c) - H'(r) - H'(c) compensation is (-1 + 2) * n log(n) sum.addProduct(n, Math.log(n)); // Negative terms final Sum sum2 = Sum.create(); // All these counts are above zero so no check for zeros for (final double c : sumi) { sum2.add(c * -Math.log(c)); } for (final double c : sumj) { sum2.add(c * -Math.log(c)); } return sum.add(sum2).getAsDouble() * 2; } /** * Perform a G-test for goodness-of-fit evaluating the null hypothesis that the {@code observed} * counts conform to a uniform distribution (each category is equally likely). * * @param observed Observed frequency counts. * @return test result * @throws IllegalArgumentException if the sample size is less than 2; * {@code observed} has negative entries; or all the observations are zero * @see #statistic(long[]) */ public SignificanceResult test(long[] observed) { final int df = observed.length - 1; final double g = statistic(observed); final double p = computeP(g, df); return new BaseSignificanceResult(g, p); } /** * Perform a G-test for goodness-of-fit evaluating the null hypothesis that the {@code observed} * counts conform to the {@code expected} counts. * *

The test can be configured to apply an adjustment to the degrees of freedom * if the observed data has been used to create the expected counts. * * @param expected Expected frequency counts. * @param observed Observed frequency counts. * @return test result * @throws IllegalArgumentException if the sample size is less than 2; the array * sizes do not match; {@code expected} has entries that are not strictly * positive; {@code observed} has negative entries; all the observations are zero; or * the adjusted degrees of freedom are not strictly positive * @see #withDegreesOfFreedomAdjustment(int) * @see #statistic(double[], long[]) */ public SignificanceResult test(double[] expected, long[] observed) { final int df = StatisticUtils.computeDegreesOfFreedom(observed.length, degreesOfFreedomAdjustment); final double g = statistic(expected, observed); final double p = computeP(g, df); return new BaseSignificanceResult(g, p); } /** * Perform a G-test of independence based on the input * {@code counts} array, viewed as a two-way table. * * @param counts 2-way table. * @return test result * @throws IllegalArgumentException if the number of rows or columns is less * than 2; the array is non-rectangular; the array has negative entries; or the * sum of a row or column is zero. * @see #statistic(long[][]) */ public SignificanceResult test(long[][] counts) { final double g = statistic(counts); final double df = (counts.length - 1.0) * (counts[0].length - 1.0); final double p = computeP(g, df); return new BaseSignificanceResult(g, p); } /** * Compute the G-test p-value. * * @param g G-test statistic. * @param degreesOfFreedom Degrees of freedom. * @return p-value */ private static double computeP(double g, double degreesOfFreedom) { return ChiSquaredDistribution.of(degreesOfFreedom).survivalProbability(g); } /** * Check the array value is non-zero. 
* * @param value Value * @param name Name of the array * @param index Index in the array * @throws IllegalArgumentException if the value is zero */ private static void checkNonZero(double value, String name, int index) { if (value == 0) { throw new InferenceException(InferenceException.ZERO_AT, name, index); } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy