smile.stat.hypothesis.TTest Maven / Gradle / Ivy

Go to download
/*******************************************************************************
 * Copyright (c) 2010-2020 Haifeng Li. All rights reserved.
 *
 * Smile is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 3 of
 * the License, or (at your option) any later version.
 *
 * Smile is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Smile.  If not, see <https://www.gnu.org/licenses/>.
 ******************************************************************************/

package smile.stat.hypothesis;

import smile.math.MathEx;
import smile.math.special.Beta;

/**
 * Student's t test. A t-test is any statistical hypothesis test in which the test statistic has
 * a Student's t distribution if the null hypothesis is true. It is applied
 * when the population is assumed to be normally distributed but the sample
 * sizes are small enough that the statistic on which inference is based is
 * not normally distributed because it relies on an uncertain estimate of
 * standard deviation rather than on a precisely known value.
 * <p>
 * Among the most frequently used t tests are:
 * <ul>
 * <li> A test of whether the mean of a normally distributed population has
 * a value specified in a null hypothesis.
 * <li> A test of the null hypothesis that the means of two normally
 * distributed populations are equal. Given two data sets, each characterized
 * by its mean, standard deviation and number of data points, we can use some
 * kind of t test to determine whether the means are distinct, provided that
 * the underlying distributions can be assumed to be normal. All such tests
 * are usually called Student's t tests, though strictly speaking that name
 * should only be used if the variances of the two populations are also assumed
 * to be equal; the form of the test used when this assumption is dropped is
 * sometimes called Welch's t test. There are different versions of the t test
 * depending on whether the two samples are
 * <ul>
 * <li> unpaired, independent of each other (e.g., individuals randomly
 * assigned into two groups, measured after an intervention and compared
 * with the other group),
 * <li> or paired, so that each member of one sample has a unique relationship
 * with a particular member of the other sample (e.g., the same people measured
 * before and after an intervention).
 * </ul>
 * If the calculated p-value is below the threshold chosen for
 * statistical significance (usually 0.05 or 0.01 level), then
 * the null hypothesis which usually states that the two groups do not differ
 * is rejected in favor of an alternative hypothesis, which typically states
 * that the groups do differ.
 * <li> A test of whether the slope of a regression line differs significantly
 * from 0.
 * </ul>
 *
 * @author Haifeng Li
 */
public class TTest {
    /**
     * A character string indicating what type of test was performed.
     */
    public final String method;

    /**
     * The degree of freedom of t-statistic. It is a double because the
     * unequal-variance (Welch) test generally yields a fractional value.
     */
    public final double df;

    /**
     * t-statistic
     */
    public final double t;

    /**
     * p-value
     */
    public final double pvalue;

    /**
     * Constructor. Instances are only created by the static factory methods
     * of this class.
     *
     * @param method the label of the test that was performed.
     * @param t the t-statistic.
     * @param df the degree of freedom.
     * @param pvalue the p-value.
     */
    private TTest(String method, double t, double df, double pvalue) {
        this.method = method;
        this.t = t;
        this.df = df;
        this.pvalue = pvalue;
    }

    @Override
    public String toString() {
        return String.format("%s t-test(t = %.4f, df = %.3f, p-value = %G)", method, t, df, pvalue);
    }

    /**
     * Independent one-sample t-test whether the mean of a normally distributed
     * population has a value specified in a null hypothesis. Small values of
     * p-value indicate that the array has significantly different mean.
     *
     * @param x the sample; it must contain at least two observations.
     * @param mean the population mean under the null hypothesis.
     * @return the test result with df = n - 1.
     * @throws IllegalArgumentException if x has fewer than two observations.
     */
    public static TTest test(double[] x, double mean) {
        requireAtLeastTwo(x, "x");

        int n = x.length;
        int df = n - 1;

        double mu = MathEx.mean(x);
        double var = MathEx.var(x);

        // Standard error of the sample mean is sqrt(var / n).
        double t = (mu - mean) / Math.sqrt(var / n);

        return new TTest("One Sample", t, df, twoSidedPValue(t, df));
    }

    /**
     * Test if the arrays x and y have significantly different means. The data
     * arrays are assumed to be drawn from populations with unequal variances.
     * Small values of p-value indicate that the two arrays have significantly
     * different means.
     *
     * @param x the first sample; it must contain at least two observations.
     * @param y the second sample; it must contain at least two observations.
     * @return the result of the unequal-variance two-sample test.
     * @throws IllegalArgumentException if either sample has fewer than two
     * observations.
     */
    public static TTest test(double[] x, double[] y) {
        return test(x, y, false);
    }

    /**
     * Test if the arrays x and y have significantly different means.  Small
     * values of p-value indicate that the two arrays have significantly
     * different means.
     *
     * @param x the first sample; it must contain at least two observations.
     * @param y the second sample; it must contain at least two observations.
     * @param equalVariance true if the data arrays are assumed to be
     * drawn from populations with the same true variance. Otherwise, The data
     * arrays are allowed to be drawn from populations with unequal variances.
     * @return the test result. With equalVariance the degree of freedom is
     * n1 + n2 - 2; otherwise it is the (generally fractional)
     * Welch-Satterthwaite approximation.
     * @throws IllegalArgumentException if either sample has fewer than two
     * observations.
     */
    public static TTest test(double[] x, double[] y, boolean equalVariance) {
        requireAtLeastTwo(x, "x");
        requireAtLeastTwo(y, "y");

        int n1 = x.length;
        int n2 = y.length;

        // Both variants of the test need the same per-sample statistics.
        double mu1 = MathEx.mean(x);
        double var1 = MathEx.var(x);

        double mu2 = MathEx.mean(y);
        double var2 = MathEx.var(y);

        if (equalVariance) {
            int df = n1 + n2 - 2;

            // Pooled variance: the per-sample variances weighted by their
            // own degrees of freedom.
            double svar = ((n1 - 1) * var1 + (n2 - 1) * var2) / df;

            double t = (mu1 - mu2) / Math.sqrt(svar * (1.0 / n1 + 1.0 / n2));

            return new TTest("Equal Variance Two Sample", t, df, twoSidedPValue(t, df));
        }

        // Squared standard errors of the two sample means.
        double se1 = var1 / n1;
        double se2 = var2 / n2;

        // Welch-Satterthwaite approximation of the effective degree of freedom.
        double df = MathEx.sqr(se1 + se2) / (MathEx.sqr(se1) / (n1 - 1) + MathEx.sqr(se2) / (n2 - 1));

        double t = (mu1 - mu2) / Math.sqrt(se1 + se2);

        return new TTest("Unequal Variance Two Sample", t, df, twoSidedPValue(t, df));
    }

    /**
     * Given the paired arrays x and y, test if they have significantly
     * different means. Small values of p-value indicate that the two arrays
     * have significantly different means.
     *
     * @param x the first sample.
     * @param y the second sample; y[i] is paired with x[i].
     * @return the test result with df = n - 1, where n is the number of pairs.
     * @throws IllegalArgumentException if x and y have different lengths or
     * contain fewer than two pairs.
     */
    public static TTest testPaired(double[] x, double[] y) {
        if (x.length != y.length) {
            throw new IllegalArgumentException("Input vectors have different size");
        }
        requireAtLeastTwo(x, "x");

        int n = x.length;
        int df = n - 1;

        double mu1 = MathEx.mean(x);
        double var1 = MathEx.var(x);

        double mu2 = MathEx.mean(y);
        double var2 = MathEx.var(y);

        // Sample covariance of the pairs, normalized by n - 1.
        double cov = 0.0;
        for (int j = 0; j < n; j++) {
            cov += (x[j] - mu1) * (y[j] - mu2);
        }
        cov /= df;

        // Var(x - y) = Var(x) + Var(y) - 2 Cov(x, y); dividing by n gives the
        // squared standard error of the mean difference.
        double sd = Math.sqrt((var1 + var2 - 2.0 * cov) / n);
        double t = (mu1 - mu2) / sd;

        return new TTest("Paired", t, df, twoSidedPValue(t, df));
    }

    /**
     * Test whether the Pearson correlation coefficient, the slope of
     * a regression line, differs significantly from 0. Small values of p-value
     * indicate a significant correlation.
     *
     * @param r the Pearson correlation coefficient.
     * @param df the degree of freedom. df = n - 2, where n is the number of samples
     * used in the calculation of r. It must be positive.
     * @return the test result.
     * @throws IllegalArgumentException if df is not positive.
     */
    public static TTest test(double r, int df) {
        if (df < 1) {
            throw new IllegalArgumentException("Invalid degree of freedom: " + df);
        }

        // Keeps the denominator away from zero when r is exactly +1 or -1.
        final double TINY = 1.0e-16;

        double t = r * Math.sqrt(df / ((1.0 - r + TINY) * (1.0 + r + TINY)));

        return new TTest("Pearson correlation coefficient", t, df, twoSidedPValue(t, df));
    }

    /**
     * Rejects samples that are too small to estimate a variance from.
     *
     * @param sample the sample to check.
     * @param name the argument name used in the error message.
     * @throws IllegalArgumentException if the sample has fewer than two
     * observations.
     */
    private static void requireAtLeastTwo(double[] sample, String name) {
        if (sample.length < 2) {
            throw new IllegalArgumentException(
                    "Sample " + name + " must contain at least 2 observations, but has " + sample.length);
        }
    }

    /**
     * Computes the p-value shared by all tests of this class from the
     * t-statistic and its degree of freedom, as
     * regularizedIncompleteBetaFunction(df / 2, 1 / 2, df / (df + t^2)).
     * NOTE(review): this is the two-sided tail probability of Student's t
     * distribution provided the Beta helper takes (alpha, beta, x) in that
     * order -- confirm against smile.math.special.Beta.
     *
     * @param t the t-statistic.
     * @param df the degree of freedom.
     * @return the p-value.
     */
    private static double twoSidedPValue(double t, double df) {
        return Beta.regularizedIncompleteBetaFunction(0.5 * df, 0.5, df / (df + t * t));
    }
}