All Downloads are FREE. Search and download functionalities are using the official Maven repository.

smile.stat.hypothesis.CorTest Maven / Gradle / Ivy

/*******************************************************************************
 * Copyright (c) 2010-2020 Haifeng Li. All rights reserved.
 *
 * Smile is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 3 of
 * the License, or (at your option) any later version.
 *
 * Smile is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with Smile.  If not, see .
 ******************************************************************************/

package smile.stat.hypothesis;

import smile.math.MathEx;
import smile.math.special.Beta;
import smile.math.special.Erf;
import smile.sort.QuickSort;

/**
 * Correlation test. Correlation of two variables is a measure of the degree
 * to which they vary together. More accurately, correlation is the covariation
 * of standardized variables. In positive correlation, as one variable
 * increases, so also does the other. In negative correlation, as one variable
 * increases, the other variable decreases. Perfect positive correlation usually
 * is calculated as a value of 1 (or 100%). Perfect negative correlation usually
 * is calculated as a value of -1. A values of zero shows no correlation at all.
 * 

* Three common types of correlation are Pearson, Spearman (for ranked data) * and Kendall (for uneven or multiple rankings), and can be selected using * the table below. *

* * * * * * * * * * * *
*

* Parametric variables follow normal distribution and linear * relationship between x and y)

*

Y

*

Pearson correlation

*

N

* * * * * * * * * * * * *
*

Equidistant * positions on variables measured?

*

Y

*

* Spearman correlation

*

N

*

Kendall correlation

*
*

* To deal with measures of association between nominal variables, we can use Chi-square * test for independence. For any pair of nominal variables, the data can be * displayed as a contingency table, whose rows are labels by the values of one * nominal variable, whose columns are labels by the values of the other nominal * variable, and whose entries are nonnegative integers giving the number of * observed events for each combination of row and column. * * @author Haifeng Li */ public class CorTest { /** * A character string indicating what type of test was performed. */ public final String method; /** * Correlation coefficient */ public final double cor; /** * Degree of freedom */ public final double df; /** * test statistic */ public final double t; /** * (two-sided) p-value of test */ public final double pvalue; /** * Constructor. */ private CorTest(String method, double cor, double df, double t, double pvalue) { this.method = method; this.cor = cor; this.df = df; this.t = t; this.pvalue = pvalue; } @Override public String toString() { return String.format("%s Correlation Test(cor = %.2f, t = %.4f, df = %.3f, p-value = %G)", method, cor, t, df, pvalue); } /** * Pearson correlation coefficient test. */ public static CorTest pearson(double[] x, double[] y) { final double TINY = 1.0e-16; int n = x.length; double syy = 0.0, sxy = 0.0, sxx = 0.0, ay = 0.0, ax = 0.0; for (int j = 0; j < n; j++) { ax += x[j]; ay += y[j]; } ax /= n; ay /= n; for (int j = 0; j < n; j++) { double xt = x[j] - ax; double yt = y[j] - ay; sxx += xt * xt; syy += yt * yt; sxy += xt * yt; } double r = sxy / (Math.sqrt(sxx * syy) + TINY); int df = n - 2; double t = r * Math.sqrt(df / ((1.0 - r + TINY) * (1.0 + r + TINY))); double prob = Beta.regularizedIncompleteBetaFunction(0.5 * df, 0.5, df / (df + t * t)); return new CorTest("Pearson", r, df, t, prob); } /** * Given a sorted array, replaces the elements by their rank, including * midranking of ties, and returns the sum of f3-f, where * f is the number of elements in each tie. */ private static double crank(double[] w) { int n = w.length; double s = 0.0; int j = 1; while (j < n) { if (w[j] != w[j - 1]) { w[j - 1] = j; ++j; } else { int jt = j + 1; while (jt <= n && w[jt - 1] == w[j - 1]) { jt++; } double rank = 0.5 * (j + jt - 1); for (int ji = j; ji <= (jt - 1); ji++) { w[ji - 1] = rank; } double t = jt - j; s += (t * t * t - t); j = jt; } } if (j == n) { w[n - 1] = n; } return s; } /** * Spearman rank correlation coefficient test. The Spearman Rank Correlation * Coefficient is a form of the Pearson coefficient with the data converted * to rankings (ie. when variables are ordinal). It can be used when there * is non-parametric data and hence Pearson cannot be used. *

* The raw scores are converted to ranks and the differences between * the ranks of each observation on the two variables are calculated. *

* The p-value is calculated by approximation, which is good for n > 10. */ public static CorTest spearman(double[] x, double[] y) { if (x.length != y.length) { throw new IllegalArgumentException("Input vectors have different size"); } int n = x.length; double[] wksp1 = new double[n]; double[] wksp2 = new double[n]; for (int j = 0; j < n; j++) { wksp1[j] = x[j]; wksp2[j] = y[j]; } QuickSort.sort(wksp1, wksp2); double sf = crank(wksp1); QuickSort.sort(wksp2, wksp1); double sg = crank(wksp2); double d = 0.0; for (int j = 0; j < n; j++) { d += MathEx.sqr(wksp1[j] - wksp2[j]); } int en = n; double en3n = en * en * en - en; double fac = (1.0 - sf / en3n) * (1.0 - sg / en3n); double rs = (1.0 - (6.0 / en3n) * (d + (sf + sg) / 12.0)) / Math.sqrt(fac); fac = (rs + 1.0) * (1.0 - rs); double probrs = 0.0; double t = rs * Math.sqrt((en - 2.0) / fac); int df = en - 2; if (fac > 0.0) { probrs = Beta.regularizedIncompleteBetaFunction(0.5 * df, 0.5, df / (df + t * t)); } return new CorTest("Spearman", rs, df, t, probrs); } /** * Kendall rank correlation test. The Kendall Tau Rank Correlation * Coefficient is used to measure the degree of correspondence * between sets of rankings where the measures are not equidistant. * It is used with non-parametric data. The p-value is calculated by * approximation, which is good for n > 10. */ public static CorTest kendall(double[] x, double[] y) { if (x.length != y.length) { throw new IllegalArgumentException("Input vectors have different size"); } int is = 0, n2 = 0, n1 = 0, n = x.length; double aa, a2, a1; for (int j = 0; j < n - 1; j++) { for (int k = j + 1; k < n; k++) { a1 = x[j] - x[k]; a2 = y[j] - y[k]; aa = a1 * a2; if (aa != 0.0) { ++n1; ++n2; if (aa > 0) ++is; else --is; } else { if (a1 != 0.0) ++n1; if (a2 != 0.0) ++n2; } } } double tau = is / (Math.sqrt(n1) * Math.sqrt(n2)); double svar = (4.0 * n + 10.0) / (9.0 * n * (n - 1.0)); double z = tau / Math.sqrt(svar); double prob = Erf.erfcc(Math.abs(z) / 1.4142136); return new CorTest("Kendall", tau, 0, z, prob); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy