/*
* LingPipe v. 4.1.0
* Copyright (C) 2003-2011 Alias-i
*
* This program is licensed under the Alias-i Royalty Free License
* Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Alias-i
* Royalty Free License Version 1 for more details.
*
* You should have received a copy of the Alias-i Royalty Free License
* Version 1 along with this program; if not, visit
* http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact
* Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211,
* +1 (718) 290-9170.
*/
package com.aliasi.stats;
import java.util.Arrays;
import java.util.Random;
/**
 * The Statistics class provides static utility methods
 * for statistical computations.
*
* @author Bob Carpenter
* @version 3.5
* @since LingPipe2.0
*/
public class Statistics {
// don't allow instances
private Statistics() {
/* do nothing */
}
/**
 * Returns the Kullback-Leibler divergence of the second specified
 * Dirichlet distribution relative to the first. Each Dirichlet
 * distribution is specified by a single parameter vector, which
 * folds together its mean distribution and its concentration.
*
 * The KL divergence between two Dirichlet distributions with
 * parameters xs and ys of dimensionality n is:
 *
 *   D_KL(Dirichlet(xs) || Dirichlet(ys))
 *     = &int; p(&theta;|xs) log ( p(&theta;|xs) / p(&theta;|ys) ) d&theta;
 *     = log &Gamma;(&Sigma;_{i &lt; n} xs[i])
 *       - log &Gamma;(&Sigma;_{i &lt; n} ys[i])
 *       - &Sigma;_{i &lt; n} log &Gamma;(xs[i])
 *       + &Sigma;_{i &lt; n} log &Gamma;(ys[i])
 *       + &Sigma;_{i &lt; n} (xs[i] - ys[i]) * (&Psi;(xs[i]) - &Psi;(&Sigma;_{k &lt; n} xs[k]))
*
 * where &Gamma; is the gamma function (see {@link
 * com.aliasi.util.Math#log2Gamma(double)}), and where &Psi; is the
 * digamma function (see {@link com.aliasi.util.Math#digamma(double)}).
 *
 * Note that, unlike the other information-theoretic methods in
 * this class, this method computes the divergence with natural
 * logarithms, so the answer is in nats (base e) rather than bits
 * (base 2). The base-2 divergence may be recovered by multiplying
 * the result by log2(e):
 *
 *   klDivergenceDirichlet(xs,ys) * log2(e)
 *     = D_KL(Dirichlet(xs) || Dirichlet(ys)) in bits
*
 * Further descriptions of the computation of KL divergence
 * between Dirichlets may be found in:
 *
 * - W.D. Penny. 2001. Kullback-Liebler Divergences of Normal,
 *   Gamma, Dirichlet and Wishart Densities. Technical report,
 *   Wellcome Department of Cognitive Neurology.
 * - Blei, Franks, Jordan and Mian. 2006. Statistical modeling of
 *   biomedical corpora. BMC Bioinformatics 7:250.
*
*
* @param xs Dirichlet parameter for the first distribution.
* @param ys Dirichlet parameter for the second distribution.
* @return The KL-divergence of the second Dirichlet distribution
* relative to the first.
*/
public static double klDivergenceDirichlet(double[] xs, double[] ys) {
verifyDivergenceDirichletArgs(xs,ys);
double sumXs = sum(xs);
double sumYs = sum(ys);
double divergence
= (logGamma(sumXs)
- logGamma(sumYs));
double digammaSumXs = com.aliasi.util.Math.digamma(sumXs);
for (int i = 0; i < xs.length; ++i) {
divergence +=
(logGamma(ys[i])
- logGamma(xs[i]))
+ (xs[i] - ys[i]) * (com.aliasi.util.Math.digamma(xs[i]) - digammaSumXs);
}
return divergence;
}
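
    // A minimal usage sketch; demoKlDivergenceDirichlet is a
    // hypothetical helper for illustration, not part of the LingPipe
    // API. The divergence of a Dirichlet relative to itself is 0.0,
    // and the value grows as the parameter vectors pull apart.
    static void demoKlDivergenceDirichlet() {
        double[] xs = { 1.0, 2.0, 3.0 };
        double[] ys = { 2.0, 2.0, 2.0 };
        System.out.println(klDivergenceDirichlet(xs, xs)); // 0.0
        System.out.println(klDivergenceDirichlet(xs, ys)); // > 0.0
        // asymmetric: reversing the arguments gives a different value
        System.out.println(klDivergenceDirichlet(ys, xs));
    }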
static void verifyDivergenceDirichletArgs(double[] xs, double[] ys) {
if (xs.length != ys.length) {
String msg = "Parameter arrays must be the same length."
+ " Found xs.length=" + xs.length
+ " ys.length=" + ys.length;
throw new IllegalArgumentException(msg);
}
for (int i = 0; i < xs.length; ++i) {
if (xs[i] <= 0.0 || Double.isInfinite(xs[i]) || Double.isNaN(xs[i])) {
String msg = "All parameters must be positive and finite."
+ " Found xs[" + i + "]=" + xs[i];
throw new IllegalArgumentException(msg);
}
}
for (int i = 0; i < ys.length; ++i) {
if (ys[i] <= 0.0 || Double.isInfinite(ys[i]) || Double.isNaN(ys[i])) {
String msg = "All parameters must be positive and finite."
+ " Found ys[" + i + "]=" + ys[i];
throw new IllegalArgumentException(msg);
}
}
}
/**
* Returns the symmetric version of the Kullback-Leibler divergence
* between the Dirichlet distributions determined by the specified
* parameters.
*
* The symmetrized KL divergence for Dirichlets is just the
* average of both relative divergences:
*
*
 *   D_SKL(Dirichlet(xs), Dirichlet(ys))
 *     = (D_KL(Dirichlet(xs) || Dirichlet(ys))
 *        + D_KL(Dirichlet(ys) || Dirichlet(xs))) / 2
*
*
* See the method documentation for {@link
* #klDivergenceDirichlet(double[],double[])} for a definition of
* the relative divergence for Dirichlet distributions.
*
* @param xs Dirichlet parameter for the first distribution.
* @param ys Dirichlet parameter for the second distribution.
* @return The symmetrized KL-divergence of the distributions.
*/
public static double symmetrizedKlDivergenceDirichlet(double[] xs, double[] ys) {
return (klDivergenceDirichlet(xs,ys) + klDivergenceDirichlet(ys,xs)) / 2.0;
}
static double logGamma(double x) {
return com.aliasi.util.Math.log2Gamma(x)
/ com.aliasi.util.Math.log2(java.lang.Math.E);
}
static double sum(double[] xs) {
double sum = 0.0;
for (int i = 0; i < xs.length; ++i)
sum += xs[i];
return sum;
}
/**
* Returns the Kullback-Leibler divergence of the second
* specified multinomial relative to the first.
*
 *
 * The KL divergence of a multinomial q relative to a multinomial
 * p, both with n outcomes, is:
 *
 *   D_KL(p||q) = &Sigma;_{i &lt; n} p(i) * log2 (p(i) / q(i))
 *
 * The value is guaranteed to be non-negative, and will be 0.0
 * only if the two distributions are identical. If any outcome
 * has zero probability in q and non-zero probability in p, the
 * result is infinite.
*
 * KL divergence is not symmetric; that is, there are p and q
 * such that D_KL(p||q) != D_KL(q||p). See {@link
 * #symmetrizedKlDivergence(double[],double[])} and {@link
 * #jsDivergence(double[],double[])} for symmetric variants.
*
*
 * KL divergence may also be expressed in terms of cross entropy.
 * If H(p,q) is the cross entropy of the distributions p and q,
 * and H(p) is the entropy of p, then:
 *
 *   D_KL(p||q) = H(p,q) - H(p)
*
* @param p First multinomial distribution.
* @param q Second multinomial distribution.
 * @return The KL divergence of the second distribution relative
 * to the first.
 * @throws IllegalArgumentException If the distributions are not
 * the same length or have entries less than zero or greater than 1.
*/
public static double klDivergence(double[] p, double[] q) {
verifyDivergenceArgs(p,q);
double divergence = 0.0;
int len = p.length;
for (int i = 0; i < len; ++i) {
if (p[i] > 0.0 && p[i] != q[i])
divergence += p[i] * com.aliasi.util.Math.log2(p[i] / q[i]);
}
return divergence;
}
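
    // Illustrative sketch (demoKlDivergence is a hypothetical helper,
    // not part of the LingPipe API): KL divergence between a biased
    // and a fair coin, in bits, showing the asymmetry noted above.
    static void demoKlDivergence() {
        double[] p = { 0.9, 0.1 };
        double[] q = { 0.5, 0.5 };
        System.out.println(klDivergence(p, q)); // ~0.531 bits
        System.out.println(klDivergence(q, p)); // ~0.737 bits
    }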
static void verifyDivergenceArgs(double[] p, double[] q) {
if (p.length != q.length) {
String msg = "Input distributions must have same length."
+ " Found p.length=" + p.length
+ " q.length=" + q.length;
throw new IllegalArgumentException(msg);
}
int len = p.length;
for (int i = 0; i < len; ++i) {
if (p[i] < 0.0
|| p[i] > 1.0
|| Double.isNaN(p[i])
|| Double.isInfinite(p[i])) {
String msg = "p[i] must be between 0.0 and 1.0 inclusive."
+ " found p[" + i + "]=" + p[i];
throw new IllegalArgumentException(msg);
}
if (q[i] < 0.0
|| q[i] > 1.0
|| Double.isNaN(q[i])
|| Double.isInfinite(q[i])) {
String msg = "q[i] must be between 0.0 and 1.0 inclusive."
+ " found q[" + i + "] =" + q[i];
throw new IllegalArgumentException(msg);
}
}
}
/**
* Returns the symmetrized KL-divergence between the specified distributions.
* The symmetrization is carried out by averaging their relative
* divergences:
*
*
 *   D_SKL(p,q) = ( D_KL(p||q) + D_KL(q||p) ) / 2
*
*
* @param p First multinomial.
* @param q Second multinomial.
 * @return The symmetrized KL divergence between the multinomials.
* @throws IllegalArgumentException If the distributions are not
* the same length or have entries less than zero or greater than
* 1.
*/
public static double symmetrizedKlDivergence(double[] p, double[] q) {
verifyDivergenceArgs(p,q);
return (klDivergence(p,q) + klDivergence(q,p)) / 2.0;
}
/**
 * Returns the Jensen-Shannon divergence between the specified
 * multinomial distributions. The JS divergence is defined by:
 *
 *   D_JS(p,q) = ( D_KL(p||m) + D_KL(q||m) ) / 2
 *
 * where m is the balanced linear interpolation (that is, the
 * average) of p and q:
 *
 *   m[i] = (p[i] + q[i]) / 2
 *
 * The JS divergence is non-negative, equal to zero only if p and
 * q are the same distribution, and symmetric.
*
* @param p First multinomial.
* @param q Second multinomial.
* @return The JS divergence between the multinomials.
* @throws IllegalArgumentException If the distributions are not
* the same length or have entries less than zero or greater than
* 1.
*/
public static double jsDivergence(double[] p, double[] q) {
verifyDivergenceArgs(p,q);
double[] m = new double[p.length];
for (int i = 0; i < p.length; ++i)
m[i] = (p[i] + q[i])/2.0;
return (klDivergence(p,m) + klDivergence(q,m)) / 2.0;
}
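
    // Illustrative sketch (hypothetical helper, not part of the
    // LingPipe API): unlike plain KL divergence, the JS divergence is
    // symmetric and remains finite even when one distribution has
    // zero-probability outcomes.
    static void demoJsDivergence() {
        double[] p = { 1.0, 0.0 };
        double[] q = { 0.5, 0.5 };
        System.out.println(jsDivergence(p, q));
        System.out.println(jsDivergence(q, p)); // same value as above
    }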
/**
* Returns a permutation of the integers between 0 and
* the specified length minus one.
*
* @param length Size of permutation to return.
* @return Permutation of the specified length.
*/
public static int[] permutation(int length) {
return permutation(length,new Random());
}
/**
* Returns a permutation of the integers between 0 and
* the specified length minus one using the specified
* randomizer.
*
* @param length Size of permutation to return.
* @param random Randomizer to use for permutation.
* @return Permutation of the specified length.
*/
public static int[] permutation(int length, Random random) {
int[] xs = new int[length];
for (int i = 0; i < xs.length; ++i)
xs[i] = i;
        // Fisher-Yates shuffle; the swap position is drawn from
        // 0..i inclusive so that all permutations are equally likely
        for (int i = xs.length; --i > 0; ) {
            int pos = random.nextInt(i + 1);
            int temp = xs[pos];
            xs[pos] = xs[i];
            xs[i] = temp;
        }
return xs;
}
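
    // Illustrative sketch (hypothetical helper, not part of the
    // LingPipe API): a seeded Random makes the permutation
    // reproducible across runs.
    static void demoPermutation() {
        int[] xs = permutation(5, new Random(42L));
        // some reordering of the integers 0..4, fixed for this seed
        System.out.println(Arrays.toString(xs));
    }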
/**
 * Returns Pearson's C&sup2; goodness-of-fit statistic for independence
 * over the specified binary contingency matrix. Asymptotically,
 * this statistic has a &chi;&sup2; distribution with one degree of
 * freedom. The higher the value, the less likely the two
 * outcomes are independent. Note that this method is just a
 * special case of the general chi-squared independence test:
 *
 *   chiSquaredIndependence(both,oneOnly,twoOnly,neither)
 *     = chiSquaredIndependence(new double[][] { {both, oneOnly},
 *                                               {twoOnly, neither} });
 *
 * The specified values make up the following contingency matrix
 * for boolean outcomes labeled One and Two:
 *
 *            +Two      -Two
 *   +One     both      oneOnly
 *   -One     twoOnly   neither
 *
*
 * The value both is the count of events where both outcomes
 * occurred, oneOnly the count where only the first outcome
 * occurred, twoOnly the count where only the second outcome
 * occurred, and neither the count where neither occurred. Let
 * totalCount be the sum of all of the cells in the matrix.
 *
 * From the contingency matrix, marginal probabilities for the
 * two outcomes may be computed:
 *
 *   P(One) = (both + oneOnly) / totalCount
 *   P(Two) = (both + twoOnly) / totalCount
*
*
 * If the two outcomes are independent, the expected values of
 * the matrix cells are:
 *
 *   E(both)    = totalCount * P(One) * P(Two)
 *   E(oneOnly) = totalCount * P(One) * (1-P(Two))
 *   E(twoOnly) = totalCount * (1-P(One)) * P(Two)
 *   E(neither) = totalCount * (1-P(One)) * (1-P(Two))
 *
 * These are used to derive the independence test statistic,
 * which is the squared difference between observed and expected
 * values under the independence assumption, normalized by the
 * expected values:
 *
 *   C&sup2; = (both - E(both))&sup2; / E(both)
 *      + (oneOnly - E(oneOnly))&sup2; / E(oneOnly)
 *      + (twoOnly - E(twoOnly))&sup2; / E(twoOnly)
 *      + (neither - E(neither))&sup2; / E(neither)
*
*
 * Unlike the higher-dimensional case, this statistic applies as
 * a hypothesis test only when all expected values are at least 10.
 *
 * @param both Count of samples with both outcomes.
 * @param oneOnly Count of samples with the first but not the
 * second outcome.
 * @param twoOnly Count of samples with the second but not the
 * first outcome.
 * @param neither Count of samples with neither outcome.
 * @throws IllegalArgumentException If any of the arguments are not
 * non-negative finite numbers.
 * @return Pearson's C&sup2; goodness-of-fit statistic for independence
 * over the specified sample counts.
*/
public static double chiSquaredIndependence(double both, double oneOnly,
double twoOnly,
double neither) {
assertNonNegative("both",both);
assertNonNegative("oneOnly",oneOnly);
assertNonNegative("twoOnly",twoOnly);
assertNonNegative("neither",neither);
double n = both + oneOnly + twoOnly + neither;
double p1 = (both + oneOnly) / n;
double p2 = (both + twoOnly) / n;
double eBoth = n * p1 * p2;
double eOneOnly = n * p1 * (1.0 - p2);
double eTwoOnly = n * (1.0 - p1) * p2;
double eNeither = n * (1.0 - p1) * (1.0 - p2);
return csTerm(both,eBoth)
+ csTerm(oneOnly,eOneOnly)
+ csTerm(twoOnly,eTwoOnly)
+ csTerm(neither,eNeither);
}
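
    // Illustrative sketch (hypothetical helper with made-up counts,
    // not part of the LingPipe API). The statistic is compared
    // against a chi-squared quantile with one degree of freedom,
    // e.g. 3.841 for a 5% significance level.
    static void demoChiSquared2x2() {
        double stat = chiSquaredIndependence(100.0, 20.0, 30.0, 850.0);
        System.out.println(stat);
        System.out.println(stat > 3.841); // true: reject independence at 5%
    }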
/**
 * Returns a two-element array of linear regression coefficients
 * for the specified x and y values. The coefficients returned,
 * { &beta;0, &beta;1 }, define a linear function:
 *
 *   f(x) = &beta;1 * x + &beta;0
 *
 * The coefficients returned produce the linear function f(x)
 * with the smallest squared error:
 *
 *   sqErr(f,xs,ys) = &Sigma;_i (f(xs[i]) - ys[i])&sup2;
 *
 * where the sum is over 0 &lt;= i &lt; xs.length.
 *
 * The function requires only a single pass through the two
 * arrays, with &beta;0 and &beta;1 given by:
 *
 *   &beta;1 = (n * &Sigma;_i x[i] * y[i] - (&Sigma;_i x[i]) * (&Sigma;_i y[i]))
 *        / (n * &Sigma;_i x[i] * x[i] - (&Sigma;_i x[i])&sup2;)
 *
 *   &beta;0 = (&Sigma;_i y[i] - &beta;1 * &Sigma;_i x[i]) / n
 *
 * where n = xs.length = ys.length.
*
* @param xs Array of x values.
* @param ys Parallel array of y values.
* @return Pair of regression coefficients.
* @throws IllegalArgumentException If the arrays are of length
* less than 2, or if the arrays are not of the same length.
*/
public static double[] linearRegression(double[] xs, double[] ys) {
if (xs.length != ys.length) {
String msg = "Require parallel arrays of x and y values."
+ " Found xs.length=" + xs.length
+ " ys.length=" + ys.length;
throw new IllegalArgumentException(msg);
}
if (xs.length < 2) {
String msg = "Require arrays of length >= 2."
+ " Found xs.length=" + xs.length;
throw new IllegalArgumentException(msg);
}
double n = xs.length;
double xSum = 0.0;
double ySum = 0.0;
double xySum = 0.0;
double xxSum = 0.0;
for (int i = 0; i < xs.length; ++i) {
double x = xs[i];
double y = ys[i];
xSum += x;
ySum += y;
xxSum += x * x;
xySum += x * y;
}
double denominator = n * xxSum - xSum * xSum;
if (denominator == 0.0) {
String msg = "Ill formed input. Denominator for beta1 is zero."
+ " Most likely cause is fewer than 2 distinct inputs.";
throw new IllegalArgumentException(msg);
}
double beta1 = (n * xySum - xSum * ySum) / denominator;
double beta0 = (ySum - beta1 * xSum) / n;
return new double[] { beta0, beta1 };
}
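
    // Illustrative sketch (hypothetical helper, not part of the
    // LingPipe API): points lying exactly on y = 2x + 1 recover the
    // intercept and slope exactly.
    static void demoLinearRegression() {
        double[] xs = { 1.0, 2.0, 3.0, 4.0 };
        double[] ys = { 3.0, 5.0, 7.0, 9.0 };
        double[] betas = linearRegression(xs, ys);
        System.out.println(betas[0]); // beta0 = 1.0 (intercept)
        System.out.println(betas[1]); // beta1 = 2.0 (slope)
    }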
/**
 * Returns a two-element array of logistic regression coefficients
 * for the specified x and y values. The coefficients returned,
 * { &beta;0, &beta;1 }, define the logistic function:
 *
 *   f(x) = L / (1 + e^(&beta;1 * x + &beta;0))
 *
 * with the minimum squared error, where L is the specified
 * maximum value. See {@link #linearRegression(double[],double[])}
 * for a definition of squared error. This function takes real
 * values in the open interval (0, L).
 *
 * Logistic regression coefficients are computed using linear
 * regression, after transforming the y values. This is possible
 * because of the following linear relation:
 *
 *   log ((L - y) / y) = &beta;1 * x + &beta;0
*
*
* @param xs Array of x values.
* @param ys Array of y values.
* @param maxValue Maximum value of function.
 * @return Two-element array of logistic regression coefficients.
* @throws IllegalArgumentException If the maximum value is not
* positive and finite, if the arrays are of length less than 2,
* or if the arrays are not of the same length.
*/
public static double[] logisticRegression(double[] xs, double[] ys,
double maxValue) {
if (maxValue <= 0.0 || Double.isInfinite(maxValue) || Double.isNaN(maxValue)) {
String msg = "Require finite max value > 0."
+ " Found maxValue=" + maxValue;
throw new IllegalArgumentException(msg);
}
double[] logisticYs = new double[ys.length];
for (int i = 0; i < ys.length; ++i)
logisticYs[i] = java.lang.Math.log((maxValue - ys[i]) / ys[i]);
return linearRegression(xs,logisticYs);
}
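
    // Illustrative sketch (hypothetical helper, not part of the
    // LingPipe API): y values generated exactly from the logistic
    // curve with L=1.0, beta1=2.0, beta0=0.5 recover those
    // coefficients (up to rounding).
    static void demoLogisticRegression() {
        double maxValue = 1.0;
        double[] xs = { -2.0, -1.0, 1.0, 2.0 };
        double[] ys = new double[xs.length];
        for (int i = 0; i < xs.length; ++i)
            ys[i] = maxValue / (1.0 + java.lang.Math.exp(2.0 * xs[i] + 0.5));
        double[] betas = logisticRegression(xs, ys, maxValue);
        System.out.println(betas[0]); // ~0.5
        System.out.println(betas[1]); // ~2.0
    }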
/**
 * Returns the value of Pearson's C&sup2; goodness-of-fit statistic
 * for independence over the specified contingency matrix.
 * Asymptotically, this statistic has a &chi;&sup2; distribution with
 * (numRows-1)*(numCols-1) degrees of freedom. The higher the
 * value, the less likely the two outcomes are independent.
 *
 * Pearson's C&sup2; statistic is defined as follows:
 *
 *   C&sup2; = &Sigma;_i &Sigma;_j (matrix[i][j] - expectedCount(i,j,matrix))&sup2;
 *                / expectedCount(i,j,matrix)
 *
 * where the expected count is the total count times the maximum
 * likelihood estimate of the row i probability times the maximum
 * likelihood estimate of the column j probability:
 *
 *   expectedCount(i,j,matrix)
 *     = totalCount(matrix)
 *       * rowCount(i,matrix)/totalCount(matrix)
 *       * colCount(j,matrix)/totalCount(matrix)
 *     = rowCount(i,matrix) * colCount(j,matrix) / totalCount(matrix)
 *
 * where
 *
 *   rowCount(i,matrix) = &Sigma;_{0 &lt;= j &lt; numCols} matrix[i][j]
 *   colCount(j,matrix) = &Sigma;_{0 &lt;= i &lt; numRows} matrix[i][j]
 *   totalCount(matrix) = &Sigma;_{0 &lt;= i &lt; numRows} &Sigma;_{0 &lt;= j &lt; numCols} matrix[i][j]
 *
 * The &chi;&sup2; test is a large-sample test and is only valid if all
 * of the expected counts are at least 5. This restriction is
 * often ignored for ranking purposes.
*
 * @param contingencyMatrix The specified contingency matrix.
 * @return Pearson's C&sup2; statistic for independence testing over
 * the contingency matrix.
 * @throws IllegalArgumentException If the matrix is not rectangular
 * or if any value is not a non-negative finite number.
*/
public static double chiSquaredIndependence(double[][] contingencyMatrix) {
int numRows = contingencyMatrix.length;
if (numRows < 2) {
String msg = "Require at least two rows."
+ " Found numRows=" + numRows;
throw new IllegalArgumentException(msg);
}
int numCols = contingencyMatrix[0].length;
if (numCols < 2) {
String msg = "Require at least two cols."
+ " Found numCols=" + numCols;
throw new IllegalArgumentException(msg);
}
double[] rowSums = new double[numRows];
Arrays.fill(rowSums,0.0);
double[] colSums = new double[numCols];
Arrays.fill(colSums,0.0);
double totalCount = 0.0;
for (int i = 0; i < numRows; ++i) {
if (contingencyMatrix[i].length != numCols) {
String msg = "Matrix must be rectangular."
+ "Row 0 length=" + numCols
+ "Row " + i + " length=" + contingencyMatrix[i].length;
throw new IllegalArgumentException(msg);
}
for (int j = 0; j < numCols; ++j) {
double val = contingencyMatrix[i][j];
if (Double.isInfinite(val) || val < 0.0
|| Double.isNaN(val)) {
String msg = "Values must be finite non-negative."
+ " Found matrix[" + i + "][" + j + "]="
+ val;
throw new IllegalArgumentException(msg);
}
rowSums[i] += val;
colSums[j] += val;
totalCount += val;
}
}
double result = 0.0;
for (int i = 0; i < numRows; ++i)
for (int j = 0; j < numCols; ++j)
result += csTerm(contingencyMatrix[i][j],
rowSums[i] * colSums[j] / totalCount);
return result;
}
/**
* Return an array of probabilities resulting from normalizing the
* specified probability ratios. The resulting array of
* probabilities is the same length as the input ratio array and
* each probability is simply the input array's value divided by
* the sum of the ratios.
*
*
 * Warning: This method is implemented by summing the
 * probability ratios and then dividing each element by the sum.
 * Because of the limited precision of double-based arithmetic,
 * if the largest ratio is much larger than the next-largest
 * ratio, then the largest normalized probability will be one
 * and all others will be zero. Java double values follow the
 * IEEE 754 arithmetic standard and thus use 52 bits for their
 * mantissas. Thus only ratios within a factor of 2^52
 * (approximately 10^16) of the maximum ratio will be non-zero.
*
* @param probabilityRatios Ratios of probabilities.
* @return Probabilities resulting from normalizing the ratios.
* @throws IllegalArgumentException If the input contains a value
* that is not a finite non-negative number, or if the input does
* not contain at least one non-zero entry.
*/
public static double[] normalize(double[] probabilityRatios) {
for (int i = 0; i < probabilityRatios.length; ++i) {
if (probabilityRatios[i] < 0.0
|| Double.isInfinite(probabilityRatios[i])
|| Double.isNaN(probabilityRatios[i])) {
String msg = "Probabilities must be finite non-negative."
+ " Found probabilityRatios[" + i + "]="
+ probabilityRatios[i];
throw new IllegalArgumentException(msg);
}
}
double sum = com.aliasi.util.Math.sum(probabilityRatios);
if (sum <= 0.0) {
String msg = "Ratios must sum to number greater than zero."
+ " Found sum=" + sum;
throw new IllegalArgumentException(msg);
}
double[] result = new double[probabilityRatios.length];
for (int i = 0; i < probabilityRatios.length; ++i)
result[i] = probabilityRatios[i]/sum;
return result;
}
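
    // Illustrative sketch (hypothetical helper, not part of the
    // LingPipe API): ratios 1:2:5 normalize to probabilities
    // summing to 1.0.
    static void demoNormalize() {
        double[] ratios = { 1.0, 2.0, 5.0 };
        System.out.println(Arrays.toString(normalize(ratios)));
        // prints [0.125, 0.25, 0.625]
    }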
/**
* Returns the value of the kappa statistic for the specified
* observed and expected probabilities. The kappa statistic
 * provides a kind of adjustment for the expected (random)
* difficulty of a problem. It is defined by:
*
*
* kappa(p,e) = (p - e) / (1 - e)
*
*
* The most typical use for kappa is in evaluating
* classification problems of a machine versus a gold standard or
* between two human annotators to assess inter-annotator
* agreement.
*
* @param observedProb Observed probability.
* @param expectedProb Expected probability.
* @return The value of the kappa statistic for the specified
* probability and expected probability.
*/
public static double kappa(double observedProb, double expectedProb) {
return (observedProb - expectedProb)/(1 - expectedProb);
}
/**
* Returns the mean of the specified array of input values. The mean
* of an array is defined by:
*
*
 *   mean(xs) = &Sigma;_{i &lt; xs.length} xs[i] / xs.length
*
*
* If the array is of length zero, the result is {@link Double#NaN}.
*
* @param xs Array of values.
* @return Mean of array of values.
*/
public static double mean(double[] xs) {
return com.aliasi.util.Math.sum(xs) / (double) xs.length;
}
/**
* Returns the variance of the specified array of input values.
* The variance of an array of values is the mean of the
* squared differences between the values and the mean:
*
*
 *   variance(xs) = &Sigma;_{i &lt; xs.length} (mean(xs) - xs[i])&sup2; / xs.length
*
*
* If the array is of length zero, the result is {@link Double#NaN}.
*
* @param xs Array of values.
* @return Variance of array of values.
*/
public static double variance(double[] xs) {
return variance(xs,mean(xs));
}
/**
* Returns the standard deviation of the specified array of input
* values. The standard deviation is just the square root of the
* variance:
*
*
 *   standardDeviation(xs) = variance(xs)^(1/2)
*
*
* If the array is of length zero, the result is {@link Double#NaN}.
*
* @param xs Array of values.
* @return Standard deviation of array of values.
*/
public static double standardDeviation(double[] xs) {
return java.lang.Math.sqrt(variance(xs));
}
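
    // Illustrative sketch (hypothetical helper, not part of the
    // LingPipe API): for { 2, 4, 4, 4, 5, 5, 7, 9 } the mean is 5.0,
    // the (population) variance 4.0, and the standard deviation 2.0.
    static void demoMoments() {
        double[] xs = { 2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0 };
        System.out.println(mean(xs));              // 5.0
        System.out.println(variance(xs));          // 4.0
        System.out.println(standardDeviation(xs)); // 2.0
    }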
/**
 * Returns Pearson's correlation coefficient between the two
 * specified arrays. The correlation coefficient, traditionally
 * written r, measures the strength of the best linear fit
 * between the two arrays of data. Rescaling either array by a
 * positive constant will not affect the result.
 *
 * The square r&sup2; of the correlation coefficient is the proportion
 * of the variance in the second array explained by a linear
 * relation with the first array.
 *
 * The definition of the correlation coefficient is:
 *
 *   correlation(xs,ys)&sup2;
 *     = sumSqDiff(xs,ys)&sup2;
 *       / (sumSqDiff(xs,xs) * sumSqDiff(ys,ys))
*
*
 * where
 *
 *   sumSqDiff(xs,ys)
 *     = &Sigma;_{i &lt; xs.length} (xs[i] - mean(xs)) * (ys[i] - mean(ys))
 *
 * and thus the terms in the denominator reduce using:
 *
 *   sumSqDiff(xs,xs)
 *     = &Sigma;_{i &lt; xs.length} (xs[i] - mean(xs))&sup2;
*
*
 * See the following for more details:
 *
 * - Eric W. Weisstein. "Correlation Coefficient." From
 *   MathWorld--A Wolfram Web Resource.
 *   http://mathworld.wolfram.com/CorrelationCoefficient.html
 * - Wikipedia. Pearson product-moment correlation coefficient.
*
*
*
* @param xs First array of values.
* @param ys Second array of values.
* @return The correlation coefficient of the two arrays.
* @throws IllegalArgumentException If the arrays are not the same
* length.
*/
public static double correlation(double[] xs, double[] ys) {
if (xs.length != ys.length) {
String msg = "xs and ys must be the same length."
+ " Found xs.length=" + xs.length
+ " ys.length=" + ys.length;
throw new IllegalArgumentException(msg);
}
double meanXs = mean(xs);
double meanYs = mean(ys);
double ssXX = sumSquareDiffs(xs,meanXs);
double ssYY = sumSquareDiffs(ys,meanYs);
double ssXY = sumSquareDiffs(xs,ys,meanXs,meanYs);
return java.lang.Math.sqrt((ssXY*ssXY) / (ssXX * ssYY));
}
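
    // Illustrative sketch (hypothetical helper, not part of the
    // LingPipe API): an exact linear relation yields a correlation
    // of 1.0; note the method returns the absolute value of r, so a
    // perfectly decreasing relation also yields 1.0.
    static void demoCorrelation() {
        double[] xs = { 1.0, 2.0, 3.0, 4.0 };
        double[] ys = { 2.0, 4.0, 6.0, 8.0 };
        System.out.println(correlation(xs, ys)); // 1.0
    }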
/**
* Returns a sample from the discrete distribution represented by the
* specified cumulative probability ratios, using the specified random
* number generator.
*
 * The cumulative probability ratios represent unnormalized
 * probabilities of generating the value of their index or less,
 * that is, unnormalized cumulative probabilities. For instance,
 * consider the cumulative probability ratios { 0.5, 0.5, 3.9, 10.1 }:
 *
 *   Outcome    Value    Unnormalized Prob    Prob
 *   0          0.5      0.5                  0.5/10.1
 *   1          0.5      0.0                  0.0/10.1
 *   2          3.9      3.4                  3.4/10.1
 *   3          10.1     6.2                  6.2/10.1
 *
 * A sample is taken by generating a random number x between 0.0
 * and the value of the last element (here 10.1). The value
 * returned is the index i such that:
 *
 *   cumulativeProbRatios[i-1] &lt; x &lt;= cumulativeProbRatios[i]
 *
 * The corresponding probabilities given the sample input are
 * listed in the last column of the table above.
*
 * Note that if two elements have the same value, there is no
 * chance of generating the outcome with the higher index. In the
 * example above, this corresponds to outcome 1 having
 * probability 0.0.
 *
 * Warning: The cumulative probability ratios are required to
 * meet two conditions. First, all values must be finite and
 * non-negative. Second, the values must be non-decreasing, so
 * that cumulativeProbRatios[i] &lt;= cumulativeProbRatios[i+1].
 * If either of these conditions is not met, the result is
 * undefined.
*
* @param cumulativeProbRatios Cumulative probability for outcome less than
* or equal to index.
* @param random Random number generator for sampling.
* @return A sample from the specified distribution.
*/
public static int sample(double[] cumulativeProbRatios, Random random) {
int low = 0;
int high = cumulativeProbRatios.length - 1;
double x = random.nextDouble() * cumulativeProbRatios[high];
        // binary search for the smallest index i such that
        // x <= cumulativeProbRatios[i]
        while (low < high) {
            int mid = (high + low) / 2;
            if (x > cumulativeProbRatios[mid])
                low = mid + 1;
            else
                high = mid;
        }
return low;
}
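
    // Illustrative sketch (hypothetical helper, not part of the
    // LingPipe API), using the cumulative ratios from the method
    // documentation; the empirical counts approach the probabilities
    // 0.5/10.1, 0.0, 3.4/10.1, and 6.2/10.1.
    static void demoSample() {
        double[] cumulativeProbRatios = { 0.5, 0.5, 3.9, 10.1 };
        Random random = new Random(42L);
        int[] counts = new int[cumulativeProbRatios.length];
        for (int i = 0; i < 10000; ++i)
            ++counts[sample(cumulativeProbRatios, random)];
        System.out.println(Arrays.toString(counts));
    }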
/**
* Returns the log (base 2) of the probability of the specified
* discrete distribution given the specified uniform Dirichlet
* with concentration parameters equal to the specified value.
*
*
 * See {@link #dirichletLog2Prob(double[],double[])} for more
 * information on the Dirichlet distribution. This method returns
 * a result equivalent to the following (though it is implemented
 * more efficiently):
 *
 *   double[] alphas = new double[xs.length];
 *   java.util.Arrays.fill(alphas,alpha);
 *   assert(dirichletLog2Prob(alpha,xs) == dirichletLog2Prob(alphas,xs));
 *
 * For the uniform Dirichlet, the distribution simplifies to the
 * following form:
 *
 *   p(xs | Dirichlet(&alpha;)) = (1/Z(&alpha;)) * &Pi;_{i &lt; k} xs[i]^(&alpha;-1)
 *
 * where
 *
 *   Z(&alpha;) = &Gamma;(&alpha;)^k / &Gamma;(k * &alpha;)
 *
 * Warning: The probability distribution must be proper in the
 * sense of having values between 0 and 1 inclusive and summing
 * to 1.0. This property is not checked by this method.
*
* @param alpha Dirichlet concentration parameter to use for each
* dimension.
* @param xs The distribution whose probability is returned.
 * @return The log (base 2) probability of the specified
 * distribution in the uniform Dirichlet distribution with
 * concentration parameters equal to alpha.
* @throws IllegalArgumentException If the values in the distribution
* are not between 0 and 1 inclusive or if the concentration parameter is
* not positive and finite.
*/
public static double dirichletLog2Prob(double alpha, double[] xs) {
verifyAlpha(alpha);
verifyDistro(xs);
int k = xs.length;
// normalizing term
double result = com.aliasi.util.Math.log2Gamma(k * alpha)
- k * com.aliasi.util.Math.log2Gamma(alpha);
double alphaMinus1 = alpha - 1;
for (int i = 0; i < k; ++i)
result += alphaMinus1 * com.aliasi.util.Math.log2(xs[i]);
return result;
}
/**
* Returns the log (base 2) of the probability of the specified
* discrete distribution given the specified Dirichlet
 * distribution concentration parameters. A Dirichlet
 * distribution is a distribution over k-dimensional discrete
 * distributions.
 *
 * The Dirichlet is widely used because it is the conjugate
 * prior for multinomials in a Bayesian setting, and thus may
 * be used to specify a convenient distribution over discrete
 * distributions.
 *
 * A Dirichlet distribution is specified by a dimensionality k
 * and a concentration parameter &alpha;[i] &gt; 0 for each i &lt; k. The
 * probability of the distribution xs in a Dirichlet distribution
 * of dimension k and concentration parameters &alpha; is given (up to
 * a normalizing constant) by:
 *
 *   p(xs | Dirichlet(&alpha;)) &prop; &Pi;_{i &lt; k} xs[i]^(&alpha;[i]-1)
 *
 * The full distribution is:
 *
 *   p(xs | Dirichlet(&alpha;)) = (1/Z(&alpha;)) * &Pi;_{i &lt; k} xs[i]^(&alpha;[i]-1)
 *
 * where the normalizing constant is given by:
 *
 *   Z(&alpha;) = &Pi;_{i &lt; k} &Gamma;(&alpha;[i]) / &Gamma;(&Sigma;_{i &lt; k} &alpha;[i])
*
* Warning: The probability distribution must be proper
* in the sense of having values between 0 and 1 inclusive and
* summing to 1.0. This property is not checked by this method.
*
 * @param alphas The concentration parameters of the Dirichlet.
 * @param xs The outcome distribution.
 * @return The log (base 2) probability of the outcome
 * distribution given the Dirichlet concentration parameters.
* @throws IllegalArgumentException If the Dirichlet parameters and
* distribution are not arrays of the same length or if the distribution
* parameters in xs are not between 0 and 1 inclusive, or if any of
* the concentration parameters is not positive and finite.
*/
public static double dirichletLog2Prob(double[] alphas, double[] xs) {
if (alphas.length != xs.length) {
String msg = "Dirichlet prior alphas and distribution xs must be the same length."
+ " Found alphas.length=" + alphas.length
+ " xs.length=" + xs.length;
throw new IllegalArgumentException(msg);
}
for (int i = 0; i < alphas.length; ++i)
verifyAlpha(alphas[i]);
verifyDistro(xs);
int k = xs.length;
double result = 0.0;
double alphaSum = 0.0;
for (int i = 0; i < alphas.length; ++i) {
alphaSum += alphas[i];
result -= com.aliasi.util.Math.log2Gamma(alphas[i]);
}
result += com.aliasi.util.Math.log2Gamma(alphaSum);
for (int i = 0; i < k; ++i)
result += (alphas[i] - 1) * com.aliasi.util.Math.log2(xs[i]);
return result;
}
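
    // Illustrative sketch (hypothetical helper, not part of the
    // LingPipe API): the uniform and general forms agree when every
    // concentration parameter equals alpha, as documented above.
    static void demoDirichletLog2Prob() {
        double alpha = 2.0;
        double[] xs = { 0.2, 0.3, 0.5 };
        double[] alphas = new double[xs.length];
        Arrays.fill(alphas, alpha);
        System.out.println(dirichletLog2Prob(alpha, xs));
        System.out.println(dirichletLog2Prob(alphas, xs)); // agrees up to rounding
    }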
static void verifyAlpha(double alpha) {
if (Double.isNaN(alpha) || Double.isInfinite(alpha) || alpha <= 0.0) {
String msg = "Concentration parameter must be positive and finite."
+ " Found alpha=" + alpha;
throw new IllegalArgumentException(msg);
}
}
static void verifyDistro(double[] xs) {
for (int i = 0; i < xs.length; ++i) {
if (xs[i] < 0.0 || xs[i] > 1.0 || Double.isNaN(xs[i]) || Double.isInfinite(xs[i])) {
String msg = "All xs must be betwee 0.0 and 1.0 inclusive."
+ " Found xs[" + i + "]=" + xs[i];
throw new IllegalArgumentException(msg);
}
}
}
    // sumSquareDiffs(xs,mean) = sumSquareDiffs(xs,xs,mean,mean)
static double sumSquareDiffs(double[] xs, double mean) {
double sum = 0.0;
for (int i = 0; i < xs.length; ++i) {
double diff = xs[i] - mean;
sum += diff * diff;
}
return sum;
}
static double sumSquareDiffs(double[] xs, double[] ys, double meanXs, double meanYs) {
double sum = 0.0;
for (int i = 0; i < xs.length; ++i)
sum += (xs[i] - meanXs) * (ys[i] - meanYs);
return sum;
}
static double variance(double[] xs, double mean) {
return sumSquareDiffs(xs,mean) / (double) xs.length;
}
static void assertNonNegative(String variableName, double value) {
if (Double.isInfinite(value) || Double.isNaN(value) || value < 0.0) {
String msg = "Require finite non-negative value."
+ " Found " + variableName + " =" + value;
throw new IllegalArgumentException(msg);
}
}
private static double csTerm(double found, double expected) {
double diff = found - expected;
return diff * diff / expected;
}
}