All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.commons.math3.stat.inference.OneWayAnova Maven / Gradle / Ivy

Go to download

The Math project is a library of lightweight, self-contained mathematics and statistics components addressing the most common practical problems not immediately available in the Java programming language or commons-lang.

There is a newer version: 3.6.1
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.math3.stat.inference;

import org.apache.commons.math3.distribution.FDistribution;
import org.apache.commons.math3.exception.ConvergenceException;
import org.apache.commons.math3.exception.DimensionMismatchException;
import org.apache.commons.math3.exception.MaxCountExceededException;
import org.apache.commons.math3.exception.NullArgumentException;
import org.apache.commons.math3.exception.OutOfRangeException;
import org.apache.commons.math3.exception.util.LocalizedFormats;
import org.apache.commons.math3.stat.descriptive.summary.Sum;
import org.apache.commons.math3.stat.descriptive.summary.SumOfSquares;

import java.util.Collection;

/**
 * Implements one-way ANOVA (analysis of variance) statistics.
 *
 * 

Tests for differences between two or more categories of univariate data * (for example, the body mass index of accountants, lawyers, doctors and * computer programmers). When two categories are given, this is equivalent to * the {@link org.apache.commons.math3.stat.inference.TTest}. *

* Uses the {@link org.apache.commons.math3.distribution.FDistribution * commons-math F Distribution implementation} to estimate exact p-values.

*

This implementation is based on a description at * http://faculty.vassar.edu/lowry/ch13pt1.html

*
 * Abbreviations: bg = between groups,
 *                wg = within groups,
 *                ss = sum squared deviations
 * 
* * @since 1.2 * @version $Id: OneWayAnova.java 1244107 2012-02-14 16:17:55Z erans $ */ public class OneWayAnova { /** * Default constructor. */ public OneWayAnova() { } /** * Computes the ANOVA F-value for a collection of double[] * arrays. * *

Preconditions:

    *
  • The categoryData Collection must contain * double[] arrays.
  • *
  • There must be at least two double[] arrays in the * categoryData collection and each of these arrays must * contain at least two values.

* This implementation computes the F statistic using the definitional * formula

     *   F = msbg/mswg
* where
     *  msbg = between group mean square
     *  mswg = within group mean square
* are as defined * here

* * @param categoryData Collection of double[] * arrays each containing data for one category * @return Fvalue * @throws NullArgumentException if categoryData is null * @throws DimensionMismatchException if the length of the categoryData * array is less than 2 or a contained double[] array does not have * at least two values */ public double anovaFValue(final Collection categoryData) throws NullArgumentException, DimensionMismatchException { AnovaStats a = anovaStats(categoryData); return a.F; } /** * Computes the ANOVA P-value for a collection of double[] * arrays. * *

Preconditions:

    *
  • The categoryData Collection must contain * double[] arrays.
  • *
  • There must be at least two double[] arrays in the * categoryData collection and each of these arrays must * contain at least two values.

* This implementation uses the * {@link org.apache.commons.math3.distribution.FDistribution * commons-math F Distribution implementation} to estimate the exact * p-value, using the formula

     *   p = 1 - cumulativeProbability(F)
* where F is the F value and cumulativeProbability * is the commons-math implementation of the F distribution.

* * @param categoryData Collection of double[] * arrays each containing data for one category * @return Pvalue * @throws NullArgumentException if categoryData is null * @throws DimensionMismatchException if the length of the categoryData * array is less than 2 or a contained double[] array does not have * at least two values * @throws ConvergenceException if the p-value can not be computed due to a convergence error * @throws MaxCountExceededException if the maximum number of iterations is exceeded */ public double anovaPValue(final Collection categoryData) throws NullArgumentException, DimensionMismatchException, ConvergenceException, MaxCountExceededException { AnovaStats a = anovaStats(categoryData); FDistribution fdist = new FDistribution(a.dfbg, a.dfwg); return 1.0 - fdist.cumulativeProbability(a.F); } /** * Performs an ANOVA test, evaluating the null hypothesis that there * is no difference among the means of the data categories. * *

Preconditions:

    *
  • The categoryData Collection must contain * double[] arrays.
  • *
  • There must be at least two double[] arrays in the * categoryData collection and each of these arrays must * contain at least two values.
  • *
  • alpha must be strictly greater than 0 and less than or equal to 0.5. *

* This implementation uses the * {@link org.apache.commons.math3.distribution.FDistribution * commons-math F Distribution implementation} to estimate the exact * p-value, using the formula

     *   p = 1 - cumulativeProbability(F)
* where F is the F value and cumulativeProbability * is the commons-math implementation of the F distribution.

*

True is returned iff the estimated p-value is less than alpha.

* * @param categoryData Collection of double[] * arrays each containing data for one category * @param alpha significance level of the test * @return true if the null hypothesis can be rejected with * confidence 1 - alpha * @throws NullArgumentException if categoryData is null * @throws DimensionMismatchException if the length of the categoryData * array is less than 2 or a contained double[] array does not have * at least two values * @throws OutOfRangeException if alpha is not in the range (0, 0.5] * @throws ConvergenceException if the p-value can not be computed due to a convergence error * @throws MaxCountExceededException if the maximum number of iterations is exceeded */ public boolean anovaTest(final Collection categoryData, final double alpha) throws NullArgumentException, DimensionMismatchException, OutOfRangeException, ConvergenceException, MaxCountExceededException { if ((alpha <= 0) || (alpha > 0.5)) { throw new OutOfRangeException( LocalizedFormats.OUT_OF_BOUND_SIGNIFICANCE_LEVEL, alpha, 0, 0.5); } return anovaPValue(categoryData) < alpha; } /** * This method actually does the calculations (except P-value). 
* * @param categoryData Collection of double[] * arrays each containing data for one category * @return computed AnovaStats * @throws NullArgumentException if categoryData is null * @throws DimensionMismatchException if the length of the categoryData * array is less than 2 or a contained double[] array does not contain * at least two values */ private AnovaStats anovaStats(final Collection categoryData) throws NullArgumentException, DimensionMismatchException { if (categoryData == null) { throw new NullArgumentException(); } // check if we have enough categories if (categoryData.size() < 2) { throw new DimensionMismatchException( LocalizedFormats.TWO_OR_MORE_CATEGORIES_REQUIRED, categoryData.size(), 2); } // check if each category has enough data and all is double[] for (double[] array : categoryData) { if (array.length <= 1) { throw new DimensionMismatchException( LocalizedFormats.TWO_OR_MORE_VALUES_IN_CATEGORY_REQUIRED, array.length, 2); } } int dfwg = 0; double sswg = 0; Sum totsum = new Sum(); SumOfSquares totsumsq = new SumOfSquares(); int totnum = 0; for (double[] data : categoryData) { Sum sum = new Sum(); SumOfSquares sumsq = new SumOfSquares(); int num = 0; for (int i = 0; i < data.length; i++) { double val = data[i]; // within category num++; sum.increment(val); sumsq.increment(val); // for all categories totnum++; totsum.increment(val); totsumsq.increment(val); } dfwg += num - 1; double ss = sumsq.getResult() - sum.getResult() * sum.getResult() / num; sswg += ss; } double sst = totsumsq.getResult() - totsum.getResult() * totsum.getResult()/totnum; double ssbg = sst - sswg; int dfbg = categoryData.size() - 1; double msbg = ssbg/dfbg; double mswg = sswg/dfwg; double F = msbg/mswg; return new AnovaStats(dfbg, dfwg, F); } /** Convenience class to pass dfbg,dfwg,F values around within AnovaImpl. No get/set methods provided. */ private static class AnovaStats { /** Degrees of freedom in numerator (between groups). 
*/ private final int dfbg; /** Degrees of freedom in denominator (within groups). */ private final int dfwg; /** Statistic. */ private final double F; /** * Constructor * @param dfbg degrees of freedom in numerator (between groups) * @param dfwg degrees of freedom in denominator (within groups) * @param F statistic */ private AnovaStats(int dfbg, int dfwg, double F) { this.dfbg = dfbg; this.dfwg = dfwg; this.F = F; } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy