/*
* File: KolmogorovSmirnovConfidence.java
* Authors: Kevin R. Dixon
* Company: Sandia National Laboratories
* Project: Cognitive Foundry
*
* Copyright August 15, 2007, Sandia Corporation. Under the terms of Contract
* DE-AC04-94AL85000, there is a non-exclusive license for use of this work by
* or on behalf of the U.S. Government. Export of this program may require a
* license from the United States Government. See CopyrightHistory.txt for
* complete details.
*
*/
package gov.sandia.cognition.statistics.method;
import gov.sandia.cognition.annotation.PublicationReference;
import gov.sandia.cognition.annotation.PublicationType;
import gov.sandia.cognition.collection.NumberComparator;
import gov.sandia.cognition.statistics.CumulativeDistributionFunction;
import gov.sandia.cognition.statistics.distribution.KolmogorovDistribution;
import gov.sandia.cognition.statistics.distribution.UnivariateGaussian;
import gov.sandia.cognition.util.AbstractCloneableSerializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
/**
* Performs a Kolmogorov-Smirnov Confidence Test, often simply called
* the "K-S test". This is a powerful
* nonparametric test that determines the probability that two collections
* of data were generated by the same underlying distribution. It makes
* minimal (if any) assumptions about the underlying data or distributions.
* That is, the distributions are NOT assumed to be Gaussian, etc.
* @author Kevin R. Dixon
* @since 2.0
*
*/
@ConfidenceTestAssumptions(
name="Kolmogorov-Smirnov test",
alsoKnownAs="K-S test",
description={
"Determines if two datasets were drawn from the same univariate distribution.",
"Robust, nonparameteric test that makes no assumptions on the underlying distribution (continuous, discrete, etc.)."
},
assumptions={
"The data were sampled independently from each other."
},
nullHypothesis="The data were drawn from the same distribution.",
dataPaired=false,
dataSameSize=false,
distribution=KolmogorovDistribution.CDF.class,
reference=@PublicationReference(
author="Wikipedia",
title="Kolmogorov-Smirnov test",
type=PublicationType.WebPage,
year=2009,
url="http://en.wikipedia.org/wiki/Kolmogorov-Smirnov_test"
)
)
public class KolmogorovSmirnovConfidence
extends AbstractCloneableSerializable
implements NullHypothesisEvaluator<Collection<? extends Number>>
{
/**
* Default instance of the K-S test.
*/
public static final KolmogorovSmirnovConfidence INSTANCE =
new KolmogorovSmirnovConfidence();
/**
* Creates a new instance of KolmogorovSmirnovConfidence
*/
public KolmogorovSmirnovConfidence()
{
}
/**
* Returns an array of ascending sorted values from the given Collection
* @param data
* Collection of doubles to sort into ascending order
* @return
* Array of ascending sorted values
*/
protected static double[] computeAscendingArray(
Collection<? extends Number> data)
{
double[] values = new double[data.size()];
int index = 0;
for (Number value : data)
{
values[index] = value.doubleValue();
index++;
}
Arrays.sort(values);
return values;
}
/**
* This is the standard K-S test for two collections of data. Determines
* the probability that the two collections of data were generated by
* the same underlying distribution. This is a parameter-free test, so
* the assumptions on the underlying data are minimal (if any).
* @param data1
* First dataset to consider
* @param data2
* Second dataset to consider
* @return
* ConfidenceStatistic from the K-S test.
*/
@PublicationReference(
author={
"William H. Press",
"Saul A. Teukolsky",
"William T. Vetterling",
"Brian P. Flannery"
},
title="Numerical Recipes in C, Second Edition",
type=PublicationType.Book,
year=1992,
pages={625,626},
notes={
"Section 14.3",
"Function kstwo()"
},
url="http://www.nrbook.com/a/bookcpdf.php"
)
@Override
public KolmogorovSmirnovConfidence.Statistic evaluateNullHypothesis(
Collection<? extends Number> data1,
Collection<? extends Number> data2)
{
double[] dataArray1 =
KolmogorovSmirnovConfidence.computeAscendingArray(data1);
double[] dataArray2 =
KolmogorovSmirnovConfidence.computeAscendingArray(data2);
int j1 = 0;
int j2 = 0;
double N1 = dataArray1.length;
double N2 = dataArray2.length;
double fn1 = 0.0;
double fn2 = 0.0;
double D = 0.0;
while ((j1 < N1) && (j2 < N2))
{
double d1 = dataArray1[j1];
double d2 = dataArray2[j2];
if (d1 <= d2 || Double.isNaN(d1))
{
j1++;
fn1 = j1 / N1;
}
if (d2 <= d1 || Double.isNaN(d2))
{
j2++;
fn2 = j2 / N2;
}
double dt = Math.abs(fn2 - fn1);
if (dt > D)
{
D = dt;
}
}
double Ne = Math.sqrt((N1 * N2) / (N1 + N2));
return new KolmogorovSmirnovConfidence.Statistic(Ne, D);
}
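// A minimal usage sketch (not part of the original source): running the
// two-sample test via the shared INSTANCE on two hypothetical samples.
// The data values below are made up, and getNullHypothesisProbability() is
// assumed to be inherited from AbstractConfidenceStatistic.
//
//   Collection<Double> sample1 = Arrays.asList(0.1, 0.4, 0.9, 1.3, 2.0);
//   Collection<Double> sample2 = Arrays.asList(0.2, 0.5, 1.1, 1.2, 1.9);
//   KolmogorovSmirnovConfidence.Statistic ksResult =
//       KolmogorovSmirnovConfidence.INSTANCE.evaluateNullHypothesis(sample1, sample2);
//   double dStatistic = ksResult.getD();
//   double pValue = ksResult.getNullHypothesisProbability();
//   // A small pValue suggests the samples were NOT drawn from the same distribution.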
/**
* This is the standard K-S test for determining if the given data were
* generated by the given CDF. Computes the probability that the given
* data and the given CDF represent the same underlying distribution.
* This is a parameter-free test, so the assumptions on the underlying data
* are minimal (if any). For example, to test if a dataset is normally
* distributed, call
* evaluateNullHypothesis( data, new UnivariateGaussian.CDF() ).
* @param <DomainType> Type of Number to consider
* @param data1 Dataset to consider
* @param function CDF to compare against the given data
* @return
* ConfidenceStatistic from the K-S test.
*/
@PublicationReference(
author={
"William H. Press",
"Saul A. Teukolsky",
"William T. Vetterling",
"Brian P. Flannery"
},
title="Numerical Recipes in C, Second Edition",
type=PublicationType.Book,
year=1992,
pages=625,
notes={
"Section 14.3",
"Function ksone()"
}
)
public static <DomainType extends Number> KolmogorovSmirnovConfidence.Statistic evaluateNullHypothesis(
Collection<? extends DomainType> data1,
CumulativeDistributionFunction<DomainType> function)
{
// This code nulls out the early repeated values. This signals to
// the subsequent loop that only the final repeated domain value
// should be tested, making the discontinuities with discrete-valued
// distributions manageable. -- krdixon 2010-03-24
ArrayList<DomainType> sortedData1 = new ArrayList<DomainType>( data1 );
Collections.sort( sortedData1, NumberComparator.INSTANCE );
for( int n = 1; n < sortedData1.size(); n++ )
{
if( sortedData1.get(n-1).equals( sortedData1.get(n) ) )
{
sortedData1.set(n-1, null);
}
}
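// Illustration only (not in the original source): if sortedData1 were
// {1, 1, 2, 3, 3}, the loop above would leave {null, 1, 2, null, 3}, so
// only the final occurrence of each repeated value is compared against
// the CDF below.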
// This method computes the two-tailed K-S statistic (written "D*")
// This can be transformed into a one-sided statistic by changing the
// "double dt..." line to select the D+ (fn-ff) or D- (fo-ff) terms.
// Knuth prefers the one-sided statistics, but I haven't seen a huge
// difference one way or the other.
double fo = 0.0;
double D = 0.0;
final double Ne = sortedData1.size();
for (int j = 0; j < Ne; j++)
{
double fn = (j + 1) / Ne;
if( sortedData1.get(j) != null )
{
double ff = function.evaluate( sortedData1.get(j) );
double dt = Math.max(Math.abs(fo - ff), Math.abs(fn - ff));
if (dt > D)
{
D = dt;
}
}
fo = fn;
}
return new KolmogorovSmirnovConfidence.Statistic(Ne, D);
}
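// A minimal usage sketch (not part of the original source): comparing a
// hypothetical sample against a standard Gaussian CDF. The data values are
// made up, and a no-argument UnivariateGaussian.CDF() constructor
// (mean 0, variance 1) is assumed here.
//
//   Collection<Double> sample = Arrays.asList(-0.3, 0.1, 0.7, 1.2);
//   KolmogorovSmirnovConfidence.Statistic oneSample =
//       KolmogorovSmirnovConfidence.evaluateNullHypothesis(
//           sample, new UnivariateGaussian.CDF());
//   double pValue = oneSample.getNullHypothesisProbability();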
/**
* Evaluates the hypothesis that the given data were generated according
* to a UnivariateGaussian distribution. A high null-hypothesis
* probability is not conclusive proof that the data were generated by
* a Gaussian. However, a low null-hypothesis probability indicates that
* the data were most likely NOT generated by a Gaussian.
* @param data
* Data to evaluate the possibility that they were generated according to
* a Gaussian Distribution
* @return Confidence statistic from the K-S test
*/
public static KolmogorovSmirnovConfidence.Statistic evaluateGaussianHypothesis(
Collection<? extends Double> data)
{
// First, fit the ML Gaussian to the data
UnivariateGaussian gaussian =
UnivariateGaussian.MaximumLikelihoodEstimator.learn(data, 0.0);
UnivariateGaussian.CDF cdf = new UnivariateGaussian.CDF(gaussian);
// Now, run a standard K-S test against the data and the ML gaussian
return evaluateNullHypothesis(data, cdf);
}
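// A minimal usage sketch (not part of the original source): testing whether
// a hypothetical sample is plausibly Gaussian. Note that the Gaussian is fit
// to the same data by maximum likelihood before the K-S test is run, which
// can make the resulting null-hypothesis probability somewhat optimistic.
//
//   Collection<Double> sample = Arrays.asList(1.1, 0.9, 1.3, 0.8, 1.0);
//   KolmogorovSmirnovConfidence.Statistic gaussTest =
//       KolmogorovSmirnovConfidence.evaluateGaussianHypothesis(sample);
//   boolean looksGaussian = gaussTest.getNullHypothesisProbability() > 0.05;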
/**
* Computes the ConfidenceStatistic associated with a K-S test
*/
public static class Statistic
extends AbstractConfidenceStatistic
{
/**
* This is the test statistic used in the K-S CDF,
* usually known as the D-statistic, which is the maximum
* difference between the two distributions. I use the two-tail
* version of D.
*/
private double D;
/**
* This is the degrees of freedom in the K-S distribution for the
* CDF calculation.
*/
private double Ne;
/**
* Creates a new instance of Statistic
* @param D
* This is the test statistic used in the K-S CDF,
* usually known as the D-statistic, which is the maximum
* difference between the two distributions. I use the two-tail
* version of D.
* @param Ne
* This is the degrees of freedom in the K-S distribution for the
* CDF calculation.
*/
public Statistic(
double Ne,
double D)
{
super(Statistic.KSsignificance(Ne, D));
this.setNe(Ne);
this.setD(D);
}
/**
* Computes the significance of the K-S test from the given degrees of
* freedom and D-statistic. This approximation is from Numerical
* Recipes in C, p. 624
* @param Ne
* Number of degrees of freedom in the data
* @param D
* This is the test statistic used in the K-S CDF,
* usually known as the D-statistic, which is the maximum
* difference between the two distributions. I use the two-tail
* version of D.
* @return
* Probability of the null hypothesis
*/
@PublicationReference(
author={
"William H. Press",
"Saul A. Teukolsky",
"William T. Vetterling",
"Brian P. Flannery"
},
title="Numerical Recipes in C, Second Edition",
type=PublicationType.Book,
year=1992,
pages=624,
notes={
"Section 14.3",
"Equation 14.3.9"
}
)
public static double KSsignificance(
double Ne,
double D)
{
double Nesqrt = Math.sqrt(Ne);
double x = (Nesqrt + 0.12 + 0.11 / Nesqrt) * D;
return 1.0 - KolmogorovDistribution.CDF.evaluate(x);
}
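// A minimal usage sketch (not part of the original source): computing the
// null-hypothesis probability directly from a hypothetical D-statistic and
// effective sample size, mirroring what the Statistic constructor does.
//
//   double Ne = 50.0;   // hypothetical effective degrees of freedom
//   double D = 0.15;    // hypothetical maximum CDF difference
//   double pValue = KolmogorovSmirnovConfidence.Statistic.KSsignificance(Ne, D);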
/**
* Getter for D
* @return
* This is the test statistic used in the K-S CDF,
* usually known as the D-statistic, which is the maximum
* difference between the two distributions. I use the two-tail
* version of D.
*/
public double getD()
{
return this.D;
}
/**
* Setter for D
* @param D
* This is the test statistic used in the K-S CDF,
* usually known as the D-statistic, which is the maximum
* difference between the two distributions. I use the two-tail
* version of D. 0.0 <= D <= 1.0
*/
protected void setD(
double D)
{
if ((D < 0.0) ||
(D > 1.0))
{
throw new IllegalArgumentException("0.0 <= D <= 1.0");
}
this.D = D;
}
/**
* Getter for Ne
* @return
* This is the degrees of freedom in the K-S distribution for the
* CDF calculation.
*/
public double getNe()
{
return this.Ne;
}
/**
* Setter for Ne
* @param Ne
* This is the degrees of freedom in the K-S distribution for the
* CDF calculation.
*/
protected void setNe(
double Ne)
{
if (Ne <= 0.0)
{
throw new IllegalArgumentException("Ne > 0.0");
}
this.Ne = Ne;
}
@Override
public double getTestStatistic()
{
return this.getD();
}
}
}