All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.commons.statistics.inference.ChiSquareTest Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.statistics.inference;

import org.apache.commons.statistics.descriptive.LongMean;
import org.apache.commons.statistics.distribution.ChiSquaredDistribution;

/**
 * Implements chi-square test statistics.
 *
 * 

This implementation handles both known and unknown distributions. * *

Two samples tests can be used when the distribution is unknown a priori * but provided by one sample, or when the hypothesis under test is that the two * samples come from the same underlying distribution. * * @see Chi-square test (Wikipedia) * @since 1.1 */ public final class ChiSquareTest { /** Name for the row. */ private static final String ROW = "row"; /** Name for the column. */ private static final String COLUMN = "column"; /** Default instance. */ private static final ChiSquareTest DEFAULT = new ChiSquareTest(0); /** Degrees of freedom adjustment. */ private final int degreesOfFreedomAdjustment; /** * @param degreesOfFreedomAdjustment Degrees of freedom adjustment. */ private ChiSquareTest(int degreesOfFreedomAdjustment) { this.degreesOfFreedomAdjustment = degreesOfFreedomAdjustment; } /** * Return an instance using the default options. * *

    *
  • {@linkplain #withDegreesOfFreedomAdjustment(int) Degrees of freedom adjustment = 0} *
* * @return default instance */ public static ChiSquareTest withDefaults() { return DEFAULT; } /** * Return an instance with the configured degrees of freedom adjustment. * *

The default degrees of freedom for a sample of length {@code n} are * {@code n - 1}. An intrinsic null hypothesis is one where you estimate one or * more parameters from the data in order to get the numbers for your null * hypothesis. For a distribution with {@code p} parameters where up to * {@code p} parameters have been estimated from the data the degrees of freedom * is in the range {@code [n - 1 - p, n - 1]}. * * @param v Value. * @return an instance * @throws IllegalArgumentException if the value is negative */ public ChiSquareTest withDegreesOfFreedomAdjustment(int v) { return new ChiSquareTest(Arguments.checkNonNegative(v)); } /** * Computes the chi-square goodness-of-fit statistic comparing the {@code observed} counts to a * uniform expected value (each category is equally likely). * *

Note: This is a specialized version of a comparison of {@code observed} * with an {@code expected} array of uniform values. The result is faster than * calling {@link #statistic(double[], long[])} and the statistic is the same, * with an allowance for accumulated floating-point error due to the optimized * routine. * * @param observed Observed frequency counts. * @return Chi-square statistic * @throws IllegalArgumentException if the sample size is less than 2; * {@code observed} has negative entries; or all the observations are zero. * @see #test(long[]) */ public double statistic(long[] observed) { Arguments.checkValuesRequiredSize(observed.length, 2); Arguments.checkNonNegative(observed); final double e = LongMean.of(observed).getAsDouble(); if (e == 0) { throw new InferenceException(InferenceException.NO_DATA); } // chi2 = sum{ (o-e)^2 / e }. Use a single division at the end. double chi2 = 0; for (final long o : observed) { final double d = o - e; chi2 += d * d; } return chi2 / e; } /** * Computes the chi-square goodness-of-fit statistic comparing {@code observed} and * {@code expected} frequency counts. * *

Note:This implementation rescales the {@code expected} * array if necessary to ensure that the sum of the expected and observed counts * are equal. * * @param expected Expected frequency counts. * @param observed Observed frequency counts. * @return Chi-square statistic * @throws IllegalArgumentException if the sample size is less than 2; the array * sizes do not match; {@code expected} has entries that are not strictly * positive; {@code observed} has negative entries; or all the observations are zero. * @see #test(double[], long[]) */ public double statistic(double[] expected, long[] observed) { final double ratio = StatisticUtils.computeRatio(expected, observed); // chi2 = sum{ (o-e)^2 / e } double chi2 = 0; for (int i = 0; i < observed.length; i++) { final double e = ratio * expected[i]; final double d = observed[i] - e; chi2 += d * d / e; } return chi2; } /** * Computes the chi-square statistic associated with a chi-square test of * independence based on the input {@code counts} array, viewed as a two-way * table in row-major format. * * @param counts 2-way table. * @return Chi-square statistic * @throws IllegalArgumentException if the number of rows or columns is less * than 2; the array is non-rectangular; the array has negative entries; or the * sum of a row or column is zero. 
* @see #test(long[][]) */ public double statistic(long[][] counts) { Arguments.checkCategoriesRequiredSize(counts.length, 2); Arguments.checkValuesRequiredSize(counts[0].length, 2); Arguments.checkRectangular(counts); Arguments.checkNonNegative(counts); final int nRows = counts.length; final int nCols = counts[0].length; // compute row, column and total sums final double[] rowSum = new double[nRows]; final double[] colSum = new double[nCols]; double sum = 0; for (int row = 0; row < nRows; row++) { for (int col = 0; col < nCols; col++) { rowSum[row] += counts[row][col]; colSum[col] += counts[row][col]; } checkNonZero(rowSum[row], ROW, row); sum += rowSum[row]; } for (int col = 0; col < nCols; col++) { checkNonZero(colSum[col], COLUMN, col); } // Compute expected counts and chi-square double chi2 = 0; for (int row = 0; row < nRows; row++) { for (int col = 0; col < nCols; col++) { final double e = (rowSum[row] * colSum[col]) / sum; final double d = counts[row][col] - e; chi2 += d * d / e; } } return chi2; } /** * Computes a chi-square statistic associated with a chi-square test of * independence of frequency counts in {@code observed1} and {@code observed2}. * The sums of frequency counts in the two samples are not required to be the * same. The formula used to compute the test statistic is: * *

\[ \sum_i{ \frac{(K * a_i - b_i / K)^2}{a_i + b_i} } \] * *

where * *

\[ K = \sqrt{ \sum_i{a_i} / \sum_i{b_i} } \] * *

Note: This is a specialized version of a 2-by-n contingency table. The * result is faster than calling {@link #statistic(long[][])} with the table * composed as {@code new long[][]{observed1, observed2}}. The statistic is the * same, with an allowance for accumulated floating-point error due to the * optimized routine. * * @param observed1 Observed frequency counts of the first data set. * @param observed2 Observed frequency counts of the second data set. * @return Chi-square statistic * @throws IllegalArgumentException if the sample size is less than 2; the array * sizes do not match; either array has entries that are negative; either all * counts of {@code observed1} or {@code observed2} are zero; or if the count at * some index is zero for both arrays. * @see ChiSquareTest#test(long[], long[]) */ public double statistic(long[] observed1, long[] observed2) { Arguments.checkValuesRequiredSize(observed1.length, 2); Arguments.checkValuesSizeMatch(observed1.length, observed2.length); Arguments.checkNonNegative(observed1); Arguments.checkNonNegative(observed2); // Compute and compare count sums long colSum1 = 0; long colSum2 = 0; for (int i = 0; i < observed1.length; i++) { final long obs1 = observed1[i]; final long obs2 = observed2[i]; checkNonZero(obs1 | obs2, ROW, i); colSum1 += obs1; colSum2 += obs2; } // Create the same exception message as chiSquare(long[][]) checkNonZero(colSum1, COLUMN, 0); checkNonZero(colSum2, COLUMN, 1); // Compare and compute weight only if different final boolean unequalCounts = colSum1 != colSum2; final double weight = unequalCounts ? Math.sqrt((double) colSum1 / colSum2) : 1; // Compute chi-square // This exploits an algebraic rearrangement of the generic n*m contingency table case // for a single sum squared addition per row. double chi2 = 0; for (int i = 0; i < observed1.length; i++) { final double obs1 = observed1[i]; final double obs2 = observed2[i]; // apply weights final double d = unequalCounts ? 
obs1 / weight - obs2 * weight : obs1 - obs2; chi2 += (d * d) / (obs1 + obs2); } return chi2; } /** * Perform a chi-square goodness-of-fit test evaluating the null hypothesis that * the {@code observed} counts conform to a uniform distribution (each category * is equally likely). * * @param observed Observed frequency counts. * @return test result * @throws IllegalArgumentException if the sample size is less than 2; * {@code observed} has negative entries; or all the observations are zero * @see #statistic(long[]) */ public SignificanceResult test(long[] observed) { final int df = observed.length - 1; final double chi2 = statistic(observed); final double p = computeP(chi2, df); return new BaseSignificanceResult(chi2, p); } /** * Perform a chi-square goodness-of-fit test evaluating the null hypothesis that the * {@code observed} counts conform to the {@code expected} counts. * *

The test can be configured to apply an adjustment to the degrees of freedom * if the observed data has been used to create the expected counts. * * @param expected Expected frequency counts. * @param observed Observed frequency counts. * @return test result * @throws IllegalArgumentException if the sample size is less than 2; the array * sizes do not match; {@code expected} has entries that are not strictly * positive; {@code observed} has negative entries; all the observations are zero; or * the adjusted degrees of freedom are not strictly positive * @see #withDegreesOfFreedomAdjustment(int) * @see #statistic(double[], long[]) */ public SignificanceResult test(double[] expected, long[] observed) { final int df = StatisticUtils.computeDegreesOfFreedom(observed.length, degreesOfFreedomAdjustment); final double chi2 = statistic(expected, observed); final double p = computeP(chi2, df); return new BaseSignificanceResult(chi2, p); } /** * Perform a chi-square test of independence based on the input {@code counts} array, * viewed as a two-way table. * * @param counts 2-way table. * @return test result * @throws IllegalArgumentException if the number of rows or columns is less * than 2; the array is non-rectangular; the array has negative entries; or the * sum of a row or column is zero. * @see #statistic(long[][]) */ public SignificanceResult test(long[][] counts) { final double chi2 = statistic(counts); final double df = (counts.length - 1.0) * (counts[0].length - 1.0); final double p = computeP(chi2, df); return new BaseSignificanceResult(chi2, p); } /** * Perform a chi-square test of independence of frequency counts in * {@code observed1} and {@code observed2}. * *

Note: This is a specialized version of a 2-by-n contingency table. * * @param observed1 Observed frequency counts of the first data set. * @param observed2 Observed frequency counts of the second data set. * @return test result * @throws IllegalArgumentException if the sample size is less than 2; the array * sizes do not match; either array has entries that are negative; either all * counts of {@code observed1} or {@code observed2} are zero; or if the count at * some index is zero for both arrays. * @see #statistic(long[], long[]) */ public SignificanceResult test(long[] observed1, long[] observed2) { final double chi2 = statistic(observed1, observed2); final double p = computeP(chi2, observed1.length - 1.0); return new BaseSignificanceResult(chi2, p); } /** * Compute the chi-square test p-value. * * @param chi2 Chi-square statistic. * @param degreesOfFreedom Degrees of freedom. * @return p-value */ private static double computeP(double chi2, double degreesOfFreedom) { return ChiSquaredDistribution.of(degreesOfFreedom).survivalProbability(chi2); } /** * Check the array value is non-zero. * * @param value Value * @param name Name of the array * @param index Index in the array * @throws IllegalArgumentException if the value is zero */ private static void checkNonZero(double value, String name, int index) { if (value == 0) { throw new InferenceException(InferenceException.ZERO_AT, name, index); } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy