// org.apache.commons.statistics.inference.ChiSquareTest (Maven / Gradle / Ivy)
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.commons.statistics.inference;
import org.apache.commons.statistics.descriptive.LongMean;
import org.apache.commons.statistics.distribution.ChiSquaredDistribution;
/**
 * Implements chi-square test statistics.
 *
 * <p>This implementation handles both known and unknown distributions.
 *
 * <p>Two samples tests can be used when the distribution is unknown <i>a priori</i>
 * but provided by one sample, or when the hypothesis under test is that the two
 * samples come from the same underlying distribution.
 *
 * <p>Instances hold a single immutable option (the degrees of freedom adjustment)
 * which is only read by {@link #test(double[], long[])}.
 *
 * @see <a href="https://en.wikipedia.org/wiki/Chi-squared_test">Chi-square test (Wikipedia)</a>
 * @since 1.1
 */
public final class ChiSquareTest {
    /** Name for the row. */
    private static final String ROW = "row";
    /** Name for the column. */
    private static final String COLUMN = "column";
    /** Default instance. */
    private static final ChiSquareTest DEFAULT = new ChiSquareTest(0);

    /** Degrees of freedom adjustment. */
    private final int degreesOfFreedomAdjustment;

    /**
     * @param degreesOfFreedomAdjustment Degrees of freedom adjustment.
     */
    private ChiSquareTest(int degreesOfFreedomAdjustment) {
        this.degreesOfFreedomAdjustment = degreesOfFreedomAdjustment;
    }

    /**
     * Return an instance using the default options.
     *
     * <ul>
     * <li>{@linkplain #withDegreesOfFreedomAdjustment(int) Degrees of freedom adjustment = 0}
     * </ul>
     *
     * @return default instance
     */
    public static ChiSquareTest withDefaults() {
        return DEFAULT;
    }

    /**
     * Return an instance with the configured degrees of freedom adjustment.
     *
     * <p>The default degrees of freedom for a sample of length {@code n} are
     * {@code n - 1}. An intrinsic null hypothesis is one where you estimate one or
     * more parameters from the data in order to get the numbers for your null
     * hypothesis. For a distribution with {@code p} parameters where up to
     * {@code p} parameters have been estimated from the data the degrees of freedom
     * is in the range {@code [n - 1 - p, n - 1]}.
     *
     * <p>Note: The adjustment is only applied by {@link #test(double[], long[])};
     * the other test methods in this class do not use it.
     *
     * @param v Value.
     * @return an instance
     * @throws IllegalArgumentException if the value is negative
     */
    public ChiSquareTest withDegreesOfFreedomAdjustment(int v) {
        return new ChiSquareTest(Arguments.checkNonNegative(v));
    }

    /**
     * Computes the chi-square goodness-of-fit statistic comparing the {@code observed} counts to a
     * uniform expected value (each category is equally likely).
     *
     * <p>Note: This is a specialized version of a comparison of {@code observed}
     * with an {@code expected} array of uniform values. This method is faster than
     * calling {@link #statistic(double[], long[])} and the statistic is the same,
     * with an allowance for accumulated floating-point error due to the optimized
     * routine.
     *
     * @param observed Observed frequency counts.
     * @return Chi-square statistic
     * @throws IllegalArgumentException if the sample size is less than 2;
     * {@code observed} has negative entries; or all the observations are zero.
     * @see #test(long[])
     */
    public double statistic(long[] observed) {
        Arguments.checkValuesRequiredSize(observed.length, 2);
        Arguments.checkNonNegative(observed);
        // The uniform expected count is the mean of the observations.
        final double e = LongMean.of(observed).getAsDouble();
        if (e == 0) {
            throw new InferenceException(InferenceException.NO_DATA);
        }
        // chi2 = sum{ (o-e)^2 / e }. Use a single division at the end
        // since the expected value is the same for every category.
        double chi2 = 0;
        for (final long o : observed) {
            final double d = o - e;
            chi2 += d * d;
        }
        return chi2 / e;
    }

    /**
     * Computes the chi-square goodness-of-fit statistic comparing {@code observed} and
     * {@code expected} frequency counts.
     *
     * <p><strong>Note:</strong> This implementation rescales the {@code expected}
     * array if necessary to ensure that the sum of the expected and observed counts
     * are equal.
     *
     * @param expected Expected frequency counts.
     * @param observed Observed frequency counts.
     * @return Chi-square statistic
     * @throws IllegalArgumentException if the sample size is less than 2; the array
     * sizes do not match; {@code expected} has entries that are not strictly
     * positive; {@code observed} has negative entries; or all the observations are zero.
     * @see #test(double[], long[])
     */
    public double statistic(double[] expected, long[] observed) {
        // Input validation is delegated to the ratio computation.
        final double ratio = StatisticUtils.computeRatio(expected, observed);
        // chi2 = sum{ (o-e)^2 / e }
        double chi2 = 0;
        for (int i = 0; i < observed.length; i++) {
            // Rescaled expected count
            final double e = ratio * expected[i];
            final double d = observed[i] - e;
            chi2 += d * d / e;
        }
        return chi2;
    }

    /**
     * Computes the chi-square statistic associated with a chi-square test of
     * independence based on the input {@code counts} array, viewed as a two-way
     * table in row-major format.
     *
     * @param counts 2-way table.
     * @return Chi-square statistic
     * @throws IllegalArgumentException if the number of rows or columns is less
     * than 2; the array is non-rectangular; the array has negative entries; or the
     * sum of a row or column is zero.
     * @see #test(long[][])
     */
    public double statistic(long[][] counts) {
        Arguments.checkCategoriesRequiredSize(counts.length, 2);
        Arguments.checkValuesRequiredSize(counts[0].length, 2);
        Arguments.checkRectangular(counts);
        Arguments.checkNonNegative(counts);
        final int nRows = counts.length;
        final int nCols = counts[0].length;
        // compute row, column and total sums
        final double[] rowSum = new double[nRows];
        final double[] colSum = new double[nCols];
        double sum = 0;
        for (int row = 0; row < nRows; row++) {
            for (int col = 0; col < nCols; col++) {
                rowSum[row] += counts[row][col];
                colSum[col] += counts[row][col];
            }
            checkNonZero(rowSum[row], ROW, row);
            sum += rowSum[row];
        }
        // Column sums are only complete after all rows have been visited
        for (int col = 0; col < nCols; col++) {
            checkNonZero(colSum[col], COLUMN, col);
        }
        // Compute expected counts and chi-square
        double chi2 = 0;
        for (int row = 0; row < nRows; row++) {
            for (int col = 0; col < nCols; col++) {
                final double e = (rowSum[row] * colSum[col]) / sum;
                final double d = counts[row][col] - e;
                chi2 += d * d / e;
            }
        }
        return chi2;
    }

    /**
     * Computes a chi-square statistic associated with a chi-square test of
     * independence of frequency counts in {@code observed1} and {@code observed2}.
     * The sums of frequency counts in the two samples are not required to be the
     * same. The formula used to compute the test statistic is:
     *
     * <p>\[ \sum_i{ \frac{(a_i / K - K * b_i)^2}{a_i + b_i} } \]
     *
     * <p>where {@code a} is {@code observed1}, {@code b} is {@code observed2} and
     *
     * <p>\[ K = \sqrt{ \sum_i{a_i} / \sum_i{b_i} } \]
     *
     * <p>When the two sums are equal {@code K = 1} and each term reduces to
     * {@code (a_i - b_i)^2 / (a_i + b_i)}.
     *
     * <p>Note: This is a specialized version of a 2-by-n contingency table. This
     * method is faster than calling {@link #statistic(long[][])} with the table
     * composed as {@code new long[][]{observed1, observed2}}. The statistic is the
     * same, with an allowance for accumulated floating-point error due to the
     * optimized routine.
     *
     * @param observed1 Observed frequency counts of the first data set.
     * @param observed2 Observed frequency counts of the second data set.
     * @return Chi-square statistic
     * @throws IllegalArgumentException if the sample size is less than 2; the array
     * sizes do not match; either array has entries that are negative; either all
     * counts of {@code observed1} or {@code observed2} are zero; or if the count at
     * some index is zero for both arrays.
     * @see ChiSquareTest#test(long[], long[])
     */
    public double statistic(long[] observed1, long[] observed2) {
        Arguments.checkValuesRequiredSize(observed1.length, 2);
        Arguments.checkValuesSizeMatch(observed1.length, observed2.length);
        Arguments.checkNonNegative(observed1);
        Arguments.checkNonNegative(observed2);
        // Compute and compare count sums
        long colSum1 = 0;
        long colSum2 = 0;
        for (int i = 0; i < observed1.length; i++) {
            final long obs1 = observed1[i];
            final long obs2 = observed2[i];
            // Both values have passed the non-negative check so the bitwise OR
            // is zero only when both counts are zero.
            checkNonZero(obs1 | obs2, ROW, i);
            colSum1 += obs1;
            colSum2 += obs2;
        }
        // Create the same exception message as statistic(long[][])
        checkNonZero(colSum1, COLUMN, 0);
        checkNonZero(colSum2, COLUMN, 1);
        // Compare and compute weight only if different.
        // The weight is K = sqrt(sum1 / sum2).
        final boolean unequalCounts = colSum1 != colSum2;
        final double weight = unequalCounts ?
            Math.sqrt((double) colSum1 / colSum2) : 1;
        // Compute chi-square
        // This exploits an algebraic rearrangement of the generic n*m contingency table case
        // for a single sum squared addition per row.
        double chi2 = 0;
        for (int i = 0; i < observed1.length; i++) {
            final double obs1 = observed1[i];
            final double obs2 = observed2[i];
            // apply weights: a_i / K - K * b_i
            final double d = unequalCounts ?
                obs1 / weight - obs2 * weight :
                obs1 - obs2;
            chi2 += (d * d) / (obs1 + obs2);
        }
        return chi2;
    }

    /**
     * Perform a chi-square goodness-of-fit test evaluating the null hypothesis that
     * the {@code observed} counts conform to a uniform distribution (each category
     * is equally likely).
     *
     * <p>The degrees of freedom are {@code observed.length - 1}.
     *
     * @param observed Observed frequency counts.
     * @return test result
     * @throws IllegalArgumentException if the sample size is less than 2;
     * {@code observed} has negative entries; or all the observations are zero
     * @see #statistic(long[])
     */
    public SignificanceResult test(long[] observed) {
        final int df = observed.length - 1;
        final double chi2 = statistic(observed);
        final double p = computeP(chi2, df);
        return new BaseSignificanceResult(chi2, p);
    }

    /**
     * Perform a chi-square goodness-of-fit test evaluating the null hypothesis that the
     * {@code observed} counts conform to the {@code expected} counts.
     *
     * <p>The test can be configured to apply an adjustment to the degrees of freedom
     * if the observed data has been used to create the expected counts.
     *
     * @param expected Expected frequency counts.
     * @param observed Observed frequency counts.
     * @return test result
     * @throws IllegalArgumentException if the sample size is less than 2; the array
     * sizes do not match; {@code expected} has entries that are not strictly
     * positive; {@code observed} has negative entries; all the observations are zero; or
     * the adjusted degrees of freedom are not strictly positive
     * @see #withDegreesOfFreedomAdjustment(int)
     * @see #statistic(double[], long[])
     */
    public SignificanceResult test(double[] expected, long[] observed) {
        // The degrees of freedom are computed (and validated) before the statistic
        final int df = StatisticUtils.computeDegreesOfFreedom(observed.length, degreesOfFreedomAdjustment);
        final double chi2 = statistic(expected, observed);
        final double p = computeP(chi2, df);
        return new BaseSignificanceResult(chi2, p);
    }

    /**
     * Perform a chi-square test of independence based on the input {@code counts} array,
     * viewed as a two-way table.
     *
     * <p>The degrees of freedom are {@code (rows - 1) * (columns - 1)}.
     *
     * @param counts 2-way table.
     * @return test result
     * @throws IllegalArgumentException if the number of rows or columns is less
     * than 2; the array is non-rectangular; the array has negative entries; or the
     * sum of a row or column is zero.
     * @see #statistic(long[][])
     */
    public SignificanceResult test(long[][] counts) {
        // The statistic validates the table dimensions used below
        final double chi2 = statistic(counts);
        final double df = (counts.length - 1.0) * (counts[0].length - 1.0);
        final double p = computeP(chi2, df);
        return new BaseSignificanceResult(chi2, p);
    }

    /**
     * Perform a chi-square test of independence of frequency counts in
     * {@code observed1} and {@code observed2}.
     *
     * <p>Note: This is a specialized version of a 2-by-n contingency table.
     * The degrees of freedom are {@code observed1.length - 1}.
     *
     * @param observed1 Observed frequency counts of the first data set.
     * @param observed2 Observed frequency counts of the second data set.
     * @return test result
     * @throws IllegalArgumentException if the sample size is less than 2; the array
     * sizes do not match; either array has entries that are negative; either all
     * counts of {@code observed1} or {@code observed2} are zero; or if the count at
     * some index is zero for both arrays.
     * @see #statistic(long[], long[])
     */
    public SignificanceResult test(long[] observed1, long[] observed2) {
        final double chi2 = statistic(observed1, observed2);
        final double p = computeP(chi2, observed1.length - 1.0);
        return new BaseSignificanceResult(chi2, p);
    }

    /**
     * Compute the chi-square test p-value. This is the survival probability of
     * the statistic under a chi-squared distribution with the given degrees of freedom.
     *
     * @param chi2 Chi-square statistic.
     * @param degreesOfFreedom Degrees of freedom.
     * @return p-value
     */
    private static double computeP(double chi2, double degreesOfFreedom) {
        return ChiSquaredDistribution.of(degreesOfFreedom).survivalProbability(chi2);
    }

    /**
     * Check the array value is non-zero.
     *
     * @param value Value
     * @param name Name of the array
     * @param index Index in the array
     * @throws IllegalArgumentException if the value is zero
     */
    private static void checkNonZero(double value, String name, int index) {
        if (value == 0) {
            throw new InferenceException(InferenceException.ZERO_AT, name, index);
        }
    }
}