// smile.stat.hypothesis.KSTest
/*******************************************************************************
* Copyright (c) 2010-2020 Haifeng Li. All rights reserved.
*
* Smile is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation, either version 3 of
* the License, or (at your option) any later version.
*
* Smile is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Smile. If not, see <https://www.gnu.org/licenses/>.
******************************************************************************/
package smile.stat.hypothesis;
import java.util.Arrays;
import smile.math.MathEx;
import smile.stat.distribution.Distribution;
/**
* The Kolmogorov-Smirnov test (K-S test) is a form of minimum distance
* estimation used as a non-parametric test of equality of one-dimensional
* probability distributions. K-S test is used to compare a sample with a reference
* probability distribution (one-sample K-S test), or to compare two samples
* (two-sample K-S test). The Kolmogorov-Smirnov statistic quantifies a
* distance between the empirical distribution function of the sample and the
* cumulative distribution function of the reference distribution, or between
* the empirical distribution functions of two samples. The null distribution
* of this statistic is calculated under the null hypothesis that the samples
* are drawn from the same distribution (in the two-sample case) or that the
* sample is drawn from the reference distribution (in the one-sample case).
* In each case, the distributions considered under the null hypothesis are
* continuous distributions but are otherwise unrestricted.
*
* The two-sample KS test is one of the most useful and general non-parametric
* methods for comparing two samples, as it is sensitive to differences in
* both location and shape of the empirical cumulative distribution functions
* of the two samples.
*
* The Kolmogorov-Smirnov test can be modified to serve goodness of fit test.
* In the special case of testing for normality of the distribution, samples
* are standardized and compared with a standard normal distribution. This is
* equivalent to setting the mean and variance of the reference distribution
* equal to the sample estimates, and it is known that using the sample to
* modify the null hypothesis reduces the power of a test. Correcting for this
* bias leads to the Lilliefors test. However, even Lilliefors' modification
* is less powerful than the Shapiro-Wilk test or Anderson-Darling test for
* testing normality.
*
* @author Haifeng Li
*/
public class KSTest {
    /**
     * A character string indicating what type of test was performed:
     * the reference distribution's description for the one-sample test,
     * or "Two Sample" for the two-sample test.
     */
    public final String method;
    /**
     * The Kolmogorov-Smirnov statistic D: the maximum absolute distance
     * between the two cumulative distribution functions being compared.
     */
    public final double d;
    /**
     * The (approximate, asymptotic) p-value of the test.
     */
    public final double pvalue;

    /**
     * Private constructor; instances are created by the static test methods.
     *
     * @param method what type of test was performed.
     * @param d the Kolmogorov-Smirnov statistic.
     * @param pvalue the p-value of the test.
     */
    private KSTest(String method, double d, double pvalue) {
        this.method = method;
        this.d = d;
        this.pvalue = pvalue;
    }

    @Override
    public String toString() {
        return String.format("%s Kolmogorov-Smirnov Test(d = %.4f, p-value = %G)", method, d, pvalue);
    }

    /**
     * Cumulative distribution function of the Kolmogorov-Smirnov distribution.
     * Evaluated with one of two rapidly convergent series, switching at
     * z = 1.18 where both achieve near machine precision.
     *
     * @param z the scaled statistic; must be non-negative.
     * @return P(K &le; z).
     */
    private static double pks(double z) {
        if (z < 0.0) {
            throw new IllegalArgumentException("Invalid z: " + z);
        }

        if (z == 0.0) {
            return 0.0;
        }

        if (z < 1.18) {
            // Small-z theta-function series:
            //   P(z) = sqrt(2*pi)/z * sum_{j>=1} exp(-(2j-1)^2 * pi^2 / (8 z^2)).
            // 1.23370055013616983 = pi^2/8, so y = exp(-pi^2/(8 z^2)); the odd
            // powers y, y^9, y^25, y^49 are the (2j-1)^2 exponents for j = 1..4.
            // 2.25675833419102515 = sqrt(16/pi); multiplied by sqrt(-log(y)) it
            // reconstructs the sqrt(2*pi)/z prefactor.
            double y = Math.exp(-1.23370055013616983 / (z * z));
            return 2.25675833419102515 * Math.sqrt(-Math.log(y)) * (y + Math.pow(y, 9) + Math.pow(y, 25) + Math.pow(y, 49));
        } else {
            // Large-z alternating series:
            //   P(z) = 1 - 2 * sum_{j>=1} (-1)^(j-1) exp(-2 j^2 z^2),
            // truncated after three terms (x, x^4, x^9 with x = exp(-2 z^2)).
            double x = Math.exp(-2. * (z * z));
            return 1. - 2. * (x - Math.pow(x, 4) + Math.pow(x, 9));
        }
    }

    /**
     * Complementary cumulative distribution function Q(z) = 1 - P(z) of the
     * Kolmogorov-Smirnov distribution. For z &ge; 1.18 the alternating series
     * is evaluated directly, which avoids the cancellation that computing
     * 1 - pks(z) would incur when Q is tiny.
     *
     * @param z the scaled statistic; must be non-negative.
     * @return P(K &gt; z).
     */
    private static double qks(double z) {
        if (z < 0.0) {
            throw new IllegalArgumentException("Invalid z: " + z);
        }

        if (z == 0.0) {
            return 1.0;
        }

        if (z < 1.18) {
            // In this regime P(z) is small, so 1 - P(z) loses no precision.
            return 1.0 - pks(z);
        }

        // Same truncated alternating series as in pks, used directly.
        double x = Math.exp(-2. * (z * z));
        return 2. * (x - Math.pow(x, 4) + Math.pow(x, 9));
    }

    /**
     * Inverse of the complementary cumulative distribution function of the
     * Kolmogorov-Smirnov distribution, i.e. the z such that qks(z) = q.
     *
     * @param q the tail probability, in (0, 1].
     * @return z with qks(z) = q.
     */
    private static double invqks(double q) {
        if (q <= 0.0 || q > 1.0) {
            throw new IllegalArgumentException("Invalid q: " + q);
        }

        if (q == 1.0) {
            return 0.0;
        }

        if (q > 0.3) {
            // Large-q branch: invert the small-z series. With
            // y = exp(-pi^2/(8 z^2)), the leading equation is
            // y*log(y) = f where f = -(pi/8)*(1-q)^2
            // (0.392699081698724155 = pi/8). Higher-order terms enter
            // through the (1 + y^4 + y^12)^2 correction below.
            double f = -0.392699081698724155 * MathEx.sqr(1. - q);
            // Initial guess from the exact inverse of y*log(y).
            double y = invxlogx(f);
            double t;
            do {
                double logy = Math.log(y);
                double ff = f / MathEx.sqr(1. + Math.pow(y, 4) + Math.pow(y, 12));
                // Newton step for y*log(y) - ff = 0, with the step damped
                // (the Math.max(0.5, ...) factor) to keep the iteration stable.
                double u = (y * logy - ff) / (1. + logy);
                t = u / Math.max(0.5, 1. - 0.5 * u / (y * (1. + logy)));
                y -= t;
            } while (Math.abs(t / y) > 1.e-15);
            // Map y back to z: z = pi / (2 * sqrt(2) * sqrt(-log y)) reduces to
            // (pi/2) / sqrt(-log y) scaled form; 1.57079632679489662 = pi/2.
            return 1.57079632679489662 / Math.sqrt(-Math.log(y));
        } else {
            // Small-q branch: invert the large-z series via fixed-point
            // iteration on x = exp(-2 z^2), solving q/2 = x - x^4 + x^9 - ...
            double x = 0.03;
            double xp;
            do {
                xp = x;
                x = 0.5 * q + Math.pow(x, 4) - Math.pow(x, 9);
                if (x > 0.06) {
                    // Higher-order terms only matter when x is not tiny.
                    x += Math.pow(x, 16) - Math.pow(x, 25);
                }
            } while (Math.abs((xp - x) / x) > 1.e-15);
            return Math.sqrt(-0.5 * Math.log(x));
        }
    }

    /**
     * Inverse of the cumulative distribution function of the
     * Kolmogorov-Smirnov distribution, i.e. the z such that pks(z) = p.
     *
     * @param p the probability, in [0, 1).
     * @return z with pks(z) = p.
     */
    static double invpks(double p) {
        return invqks(1.0 - p);
    }

    /**
     * Inverse function of y = x*log(x), used for the initial guess in invqks.
     * The argument must lie in (-1/e, 0), the range of x*log(x) for x in (0, 1).
     * Works in u = log(x) and returns exp(u).
     *
     * @param y the value of x*log(x); must satisfy -1/e &lt; y &lt; 0.
     * @return the x in (0, 1) with x*log(x) = y.
     */
    private static double invxlogx(double y) {
        // ooe = 1/e, the minimum of x*log(x) (attained at x = 1/e).
        final double ooe = 0.367879441171442322;
        double t, u, to = 0.;
        if (y >= 0. || y <= -ooe) {
            throw new IllegalArgumentException("Invalid y: " + y);
        }

        // Initial guess for u = log(x); the closed form is used away from the
        // flat minimum, a fixed value of -10 near y -> 0- (where x is tiny).
        if (y < -0.2) {
            u = Math.log(ooe - Math.sqrt(2 * ooe * (y + ooe)));
        } else {
            u = -10.;
        }

        // Iterate u_{k+1} = u_k + (log(y/u) - u) * u/(1+u) until the step is
        // negligible; the extra check breaks out when successive steps nearly
        // cancel (oscillation at the limit of double precision).
        do {
            u += (t = (Math.log(y / u) - u) * (u / (1. + u)));
            if (t < 1.e-8 && Math.abs(t + to) < 0.01 * Math.abs(t)) {
                break;
            }
            to = t;
        } while (Math.abs(t / u) > 1.e-15);

        return Math.exp(u);
    }

    /**
     * The one-sample KS test for the null hypothesis that the data set x
     * is drawn from the given distribution. Small values of p-value show that
     * the cumulative distribution function of x is significantly different from
     * the given distribution. The array x is modified by being sorted into
     * ascending order.
     *
     * @param x the sample; sorted in place as a side effect.
     * @param dist the reference distribution.
     * @return the test results.
     */
    public static KSTest test(double[] x, Distribution dist) {
        Arrays.sort(x);
        int n = x.length;
        double en = n;
        double d = 0.0;
        double fo = 0.0;
        for (int j = 0; j < n; j++) {
            // Empirical CDF steps from fo = j/n to fn = (j+1)/n at x[j];
            // the deviation from the reference CDF can be largest on either
            // side of the step, so both distances are checked.
            double fn = (j + 1) / en;
            double ff = dist.cdf(x[j]);
            double dt = Math.max(Math.abs(fo - ff), Math.abs(fn - ff));
            if (dt > d) {
                d = dt;
            }
            fo = fn;
        }

        en = Math.sqrt(en);
        // Asymptotic p-value with the small-sample effective-n correction
        // sqrt(n) + 0.12 + 0.11/sqrt(n) (Stephens' approximation).
        double p = qks((en + 0.12 + 0.11 / en) * d);
        return new KSTest(dist.toString(), d, p);
    }

    /**
     * The two-sample KS test for the null hypothesis that the data sets
     * are drawn from the same distribution. Small values of p-value show that
     * the cumulative distribution function of x is significantly different from
     * that of y. The arrays x and y are modified by being sorted into
     * ascending order.
     *
     * @param x the first sample; sorted in place as a side effect.
     * @param y the second sample; sorted in place as a side effect.
     * @return the test results.
     */
    public static KSTest test(double[] x, double[] y) {
        Arrays.sort(x);
        Arrays.sort(y);
        int j1 = 0, j2 = 0;
        int n1 = x.length, n2 = y.length;
        double en1 = n1;
        double en2 = n2;
        double d = 0.0;
        double d1, d2, dt, fn1 = 0.0, fn2 = 0.0;
        // Merge-walk the two sorted samples, tracking each empirical CDF.
        // The inner do-while loops advance past ties so that all equal values
        // are consumed before the CDF difference is measured.
        while (j1 < n1 && j2 < n2) {
            if ((d1 = x[j1]) <= (d2 = y[j2])) {
                do {
                    fn1 = ++j1 / en1;
                } while (j1 < n1 && d1 == x[j1]);
            }
            if (d2 <= d1) {
                do {
                    fn2 = ++j2 / en2;
                } while (j2 < n2 && d2 == y[j2]);
            }
            if ((dt = Math.abs(fn2 - fn1)) > d) {
                d = dt;
            }
        }

        // Effective sample size for two samples, then the same asymptotic
        // p-value approximation as the one-sample test.
        double en = Math.sqrt(en1 * en2 / (en1 + en2));
        double p = qks((en + 0.12 + 0.11 / en) * d);
        return new KSTest("Two Sample", d, p);
    }
}