org.apache.commons.math3.stat.inference.OneWayAnova Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of commons-math3 Show documentation
The Math project is a library of lightweight, self-contained mathematics and statistics components addressing the most common practical problems not immediately available in the Java programming language or commons-lang.
There is a newer version: 3.6.1
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.commons.math3.stat.inference;

import org.apache.commons.math3.distribution.FDistribution;
import org.apache.commons.math3.exception.ConvergenceException;
import org.apache.commons.math3.exception.DimensionMismatchException;
import org.apache.commons.math3.exception.MaxCountExceededException;
import org.apache.commons.math3.exception.NullArgumentException;
import org.apache.commons.math3.exception.OutOfRangeException;
import org.apache.commons.math3.exception.util.LocalizedFormats;
import org.apache.commons.math3.stat.descriptive.summary.Sum;
import org.apache.commons.math3.stat.descriptive.summary.SumOfSquares;

import java.util.Collection;

/**
 * Implements one-way ANOVA (analysis of variance) statistics.
 *
 * <p>Tests for differences between two or more categories of univariate data
 * (for example, the body mass index of accountants, lawyers, doctors and
 * computer programmers).  When two categories are given, this is equivalent to
 * the {@link org.apache.commons.math3.stat.inference.TTest}.</p>
 *
 * <p>Uses the {@link org.apache.commons.math3.distribution.FDistribution
 * commons-math F Distribution implementation} to estimate exact p-values.</p>
 *
 * <p>This implementation is based on a description at
 * <a href="http://faculty.vassar.edu/lowry/ch13pt1.html">
 * http://faculty.vassar.edu/lowry/ch13pt1.html</a></p>
 *
 * <pre>
 * Abbreviations: bg = between groups,
 *                wg = within groups,
 *                ss = sum squared deviations
 * </pre>
 *
 * @since 1.2
 * @version $Id: OneWayAnova.java 1244107 2012-02-14 16:17:55Z erans $
 */
public class OneWayAnova {

    /**
     * Default constructor.
     */
    public OneWayAnova() {
    }

    /**
     * Computes the ANOVA F-value for a collection of {@code double[]}
     * arrays.
     *
     * <p><strong>Preconditions</strong>:</p>
     * <ul>
     * <li>The {@code categoryData} {@code Collection} must contain
     * {@code double[]} arrays.</li>
     * <li>There must be at least two {@code double[]} arrays in the
     * {@code categoryData} collection and each of these arrays must
     * contain at least two values.</li>
     * </ul>
     *
     * <p>This implementation computes the F statistic using the definitional
     * formula</p>
     * <pre>
     *   F = msbg/mswg</pre>
     * <p>where</p>
     * <pre>
     *  msbg = between group mean square
     *  mswg = within group mean square</pre>
     * <p>are as defined
     * <a href="http://faculty.vassar.edu/lowry/ch13pt1.html">here</a>.</p>
     *
     * @param categoryData {@code Collection} of {@code double[]}
     * arrays each containing data for one category
     * @return Fvalue
     * @throws NullArgumentException if {@code categoryData} is {@code null}
     * @throws DimensionMismatchException if the length of the {@code categoryData}
     * array is less than 2 or a contained {@code double[]} array does not have
     * at least two values
     */
    public double anovaFValue(final Collection<double[]> categoryData)
        throws NullArgumentException, DimensionMismatchException {

        return anovaStats(categoryData).F;

    }

    /**
     * Computes the ANOVA P-value for a collection of {@code double[]}
     * arrays.
     *
     * <p><strong>Preconditions</strong>:</p>
     * <ul>
     * <li>The {@code categoryData} {@code Collection} must contain
     * {@code double[]} arrays.</li>
     * <li>There must be at least two {@code double[]} arrays in the
     * {@code categoryData} collection and each of these arrays must
     * contain at least two values.</li>
     * </ul>
     *
     * <p>This implementation uses the
     * {@link org.apache.commons.math3.distribution.FDistribution
     * commons-math F Distribution implementation} to estimate the exact
     * p-value, using the formula</p>
     * <pre>
     *   p = 1 - cumulativeProbability(F)</pre>
     * <p>where {@code F} is the F value and {@code cumulativeProbability}
     * is the commons-math implementation of the F distribution.</p>
     *
     * @param categoryData {@code Collection} of {@code double[]}
     * arrays each containing data for one category
     * @return Pvalue
     * @throws NullArgumentException if {@code categoryData} is {@code null}
     * @throws DimensionMismatchException if the length of the {@code categoryData}
     * array is less than 2 or a contained {@code double[]} array does not have
     * at least two values
     * @throws ConvergenceException if the p-value can not be computed due to a convergence error
     * @throws MaxCountExceededException if the maximum number of iterations is exceeded
     */
    public double anovaPValue(final Collection<double[]> categoryData)
        throws NullArgumentException, DimensionMismatchException,
        ConvergenceException, MaxCountExceededException {

        final AnovaStats a = anovaStats(categoryData);
        final FDistribution fdist = new FDistribution(a.dfbg, a.dfwg);
        return 1.0 - fdist.cumulativeProbability(a.F);

    }

    /**
     * Performs an ANOVA test, evaluating the null hypothesis that there
     * is no difference among the means of the data categories.
     *
     * <p><strong>Preconditions</strong>:</p>
     * <ul>
     * <li>The {@code categoryData} {@code Collection} must contain
     * {@code double[]} arrays.</li>
     * <li>There must be at least two {@code double[]} arrays in the
     * {@code categoryData} collection and each of these arrays must
     * contain at least two values.</li>
     * <li>{@code alpha} must be strictly greater than 0 and less than or
     * equal to 0.5.</li>
     * </ul>
     *
     * <p>This implementation uses the
     * {@link org.apache.commons.math3.distribution.FDistribution
     * commons-math F Distribution implementation} to estimate the exact
     * p-value, using the formula</p>
     * <pre>
     *   p = 1 - cumulativeProbability(F)</pre>
     * <p>where {@code F} is the F value and {@code cumulativeProbability}
     * is the commons-math implementation of the F distribution.</p>
     * <p>True is returned iff the estimated p-value is less than alpha.</p>
     *
     * @param categoryData {@code Collection} of {@code double[]}
     * arrays each containing data for one category
     * @param alpha significance level of the test
     * @return true if the null hypothesis can be rejected with
     * confidence 1 - alpha
     * @throws NullArgumentException if {@code categoryData} is {@code null}
     * @throws DimensionMismatchException if the length of the {@code categoryData}
     * array is less than 2 or a contained {@code double[]} array does not have
     * at least two values
     * @throws OutOfRangeException if {@code alpha} is not in the range (0, 0.5]
     * @throws ConvergenceException if the p-value can not be computed due to a convergence error
     * @throws MaxCountExceededException if the maximum number of iterations is exceeded
     */
    public boolean anovaTest(final Collection<double[]> categoryData,
                             final double alpha)
        throws NullArgumentException, DimensionMismatchException,
        OutOfRangeException, ConvergenceException, MaxCountExceededException {

        // Reject the significance level before doing any statistics work.
        if ((alpha <= 0) || (alpha > 0.5)) {
            throw new OutOfRangeException(
                    LocalizedFormats.OUT_OF_BOUND_SIGNIFICANCE_LEVEL,
                    alpha, 0, 0.5);
        }
        return anovaPValue(categoryData) < alpha;

    }

    /**
     * This method actually does the calculations (except P-value).
     *
     * <p>The within-group and total sums of squared deviations are obtained
     * from running sums and sums of squares ({@code sum(x^2) - sum(x)^2 / n});
     * the between-group sum of squares is their difference.</p>
     *
     * <p>Elements of {@code categoryData} are dereferenced without a null
     * check, so the collection must not contain {@code null} arrays.</p>
     *
     * @param categoryData {@code Collection} of {@code double[]}
     * arrays each containing data for one category
     * @return computed AnovaStats
     * @throws NullArgumentException if {@code categoryData} is {@code null}
     * @throws DimensionMismatchException if the length of the {@code categoryData}
     * array is less than 2 or a contained {@code double[]} array does not contain
     * at least two values
     */
    private AnovaStats anovaStats(final Collection<double[]> categoryData)
        throws NullArgumentException, DimensionMismatchException {

        if (categoryData == null) {
            throw new NullArgumentException();
        }

        // check if we have enough categories
        if (categoryData.size() < 2) {
            throw new DimensionMismatchException(
                    LocalizedFormats.TWO_OR_MORE_CATEGORIES_REQUIRED,
                    categoryData.size(), 2);
        }

        // check if each category has enough data; done up front so that no
        // statistics are accumulated for an input that will be rejected
        for (final double[] array : categoryData) {
            if (array.length <= 1) {
                throw new DimensionMismatchException(
                        LocalizedFormats.TWO_OR_MORE_VALUES_IN_CATEGORY_REQUIRED,
                        array.length, 2);
            }
        }

        int dfwg = 0;
        double sswg = 0;
        final Sum totsum = new Sum();
        final SumOfSquares totsumsq = new SumOfSquares();
        int totnum = 0;

        for (final double[] data : categoryData) {

            final Sum sum = new Sum();
            final SumOfSquares sumsq = new SumOfSquares();
            final int num = data.length;

            for (final double val : data) {
                // within category
                sum.increment(val);
                sumsq.increment(val);

                // for all categories
                totsum.increment(val);
                totsumsq.increment(val);
            }
            totnum += num;
            dfwg += num - 1;
            final double ss = sumsq.getResult() - sum.getResult() * sum.getResult() / num;
            sswg += ss;
        }
        final double sst = totsumsq.getResult() - totsum.getResult() *
            totsum.getResult() / totnum;
        final double ssbg = sst - sswg;
        final int dfbg = categoryData.size() - 1;
        final double msbg = ssbg / dfbg;
        final double mswg = sswg / dfwg;
        final double F = msbg / mswg;

        return new AnovaStats(dfbg, dfwg, F);
    }

    /**
     * Convenience class to pass dfbg, dfwg and F values around within
     * {@link OneWayAnova}. No get/set methods provided.
     */
    private static class AnovaStats {

        /** Degrees of freedom in numerator (between groups). */
        private final int dfbg;

        /** Degrees of freedom in denominator (within groups). */
        private final int dfwg;

        /** Statistic. */
        private final double F;

        /**
         * Constructor
         * @param dfbg degrees of freedom in numerator (between groups)
         * @param dfwg degrees of freedom in denominator (within groups)
         * @param F statistic
         */
        private AnovaStats(int dfbg, int dfwg, double F) {
            this.dfbg = dfbg;
            this.dfwg = dfwg;
            this.F = F;
        }
    }

}