src.it.unimi.dsi.stat.Jackknife Maven / Gradle / Ivy
package it.unimi.dsi.stat;
/*
* DSI utilities
*
* Copyright (C) 2011-2017 Sebastiano Vigna
*
* This library is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 3 of the License, or (at your option)
* any later version.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, see .
*
*/
import java.math.BigDecimal;
import java.math.MathContext;
import java.math.RoundingMode;
import java.util.List;
/** Applies the jackknife to generic statistics.
*
* This class applies the jackknife method (see, e.g., “A leisurely look at the bootstrap, the jackknife, and cross-validation”, by Bradley Efron and Gail Gong,
* The American Statistician, 37(1):36−48, 1983) to reduce the bias in the estimation of a nonlinear
* statistic of interest (linear statistics, such as the mean, pass through the jackknife without change).
* The {@linkplain Statistic statistic} must take a sample (an array of {@linkplain BigDecimal big decimals}) and return
* corresponding values (again as an array of {@linkplain BigDecimal big decimals}). In case high-precision
* arithmetic is not required, an instance of {@link AbstractStatistic} just takes an array of doubles and returns an
* array of doubles, handling all necessary type conversions.
*
*
The static method {@link #compute(List, Statistic, MathContext)} takes a list
* of samples (arrays of doubles of the same length) and returns an instance of this class containing
* {@linkplain #estimate estimates} and {@linkplain #standardError standard errors} for every
* value computed by the statistic (estimates of the statistic are available both as
* {@linkplain #bigEstimate an array of big decimals} and as {@linkplain #estimate an array of doubles},
* whereas {@linkplain #standardError estimates of standard errors} are provided in double format, only).
*
*
All computations are performed internally using {@link BigDecimal} and a provided {@link MathContext}.
* The method {@link #compute(List, Statistic)} uses {@linkplain #DEFAULT_MATH_CONTEXT 100 decimal digits}.
*
*
The {@linkplain #IDENTITY identical} statistic can be used to compute the (pointwise) empirical mean
* and standard error of a sample.
*
* @author Sebastiano Vigna
*/
public class Jackknife {
/** The default {@link MathContext} used by {@link #compute(List, Statistic)}: 100 digits and {@link RoundingMode#HALF_EVEN}. */
public final static MathContext DEFAULT_MATH_CONTEXT = new MathContext(100, RoundingMode.HALF_EVEN);
/** A vector of high-precision estimates for a statistic of interest. */
public final BigDecimal[] bigEstimate;
/** A vector of estimates for a statistic of interest (obtained by invoking {@link BigDecimal#doubleValue()} on {@link #bigEstimate}). */
public final double[] estimate;
/** A vector of (estimates of the) standard error parallel to {@link #bigEstimate}/{@link #estimate}. */
public final double[] standardError;
public static double[] bigDecimalArray2DoubleArray(final BigDecimal[] input) {
final double[] output = new double[input.length];
for(int i = input.length; i-- != 0;) output[i] = input[i].doubleValue();
return output;
}
public static BigDecimal[] doubleArray2BigDecimalArray(final double[] input) {
final BigDecimal[] output = new BigDecimal[input.length];
for(int i = input.length; i-- != 0;) output[i] = BigDecimal.valueOf(input[i]);
return output;
}
private Jackknife(final BigDecimal[] estimate, final double[] standardError) {
this.standardError = standardError;
this.estimate = bigDecimalArray2DoubleArray(estimate);
this.bigEstimate = estimate;
}
@Override
public String toString() {
StringBuilder s = new StringBuilder();
for(int i = estimate.length; i++ != 0;) s.append(estimate[i]).append('\t').append(standardError[i]).append(System.getProperty("\n"));
return s.toString();
}
/** A statistic to be estimated using the jackknife on a set of samples. */
public static interface Statistic {
/** Computes the statistic.
*
*
Note that the {@link BigDecimal} instances passed to this method are guaranteed to
* have a {@linkplain BigDecimal#scale() scale} set by the caller. If you have to perform divisions,
* please use the supplied {@link MathContext}.
*
* @param sample the samples over which the statistic must be computed.
* @param mc the mathematical context to be used when dividing {@linkplain BigDecimal big decimals}.
* @return the resulting statistic.
*/
public BigDecimal[] compute(BigDecimal[] sample, MathContext mc);
}
/** A statistic that returns the sample. Useful to compute the average and the empirical standard error. */
public static Jackknife.Statistic IDENTITY = new Jackknife.Statistic() {
@Override
public BigDecimal[] compute(final BigDecimal[] sample, final MathContext unused) {
return sample;
}
};
/** An abstract statistic with a {@linkplain #compute(double[]) template method} that
* accepts an array of doubles, returns an array of doubles and handles the data conversions that
* are necessary to call {@link Statistic#compute(BigDecimal[], MathContext)}. Useful if you do not
* want to fiddle with {@link BigDecimal}. */
public abstract static class AbstractStatistic implements Statistic {
public abstract double[] compute(final double[] sample);
@Override
public BigDecimal[] compute(BigDecimal[] bigSample, final MathContext unused) {
return doubleArray2BigDecimalArray(compute(bigDecimalArray2DoubleArray(bigSample)));
}
};
/** Applies the jackknife to a statistic of interest using a list of samples using {@link #DEFAULT_MATH_CONTEXT} as context.
*
* @param samples a list of samples (arrays of doubles of the same length).
* @param f a statistic of interest.
* @return an instance of this class containing estimates of f
and corresponding standard errors
* obtained by the jackknife on the given set of samples.
*/
public static Jackknife compute(final List samples, final Statistic f) {
return compute(samples, f, DEFAULT_MATH_CONTEXT);
}
/** Applies the jackknife to a statistic of interest using a list of samples.
*
* @param samples a list of samples (arrays of doubles of the same length).
* @param f a statistic of interest.
* @param mc the mathematical context to be used when dividing {@linkplain BigDecimal big decimals}.
* @return an instance of this class containing estimates of f
and corresponding standard errors
* obtained by the jackknife on the given set of samples.
*/
public static Jackknife compute(final List samples, final Statistic f, final MathContext mc) {
final int n = samples.size();
final BigDecimal big1OverN = BigDecimal.ONE.divide(BigDecimal.valueOf(n), mc);
final BigDecimal big1OverNMinus1 = BigDecimal.ONE.divide(BigDecimal.valueOf(n - 1), mc);
final BigDecimal bigNMinus1OverN = BigDecimal.valueOf(n - 1).divide(BigDecimal.valueOf(n), mc);
final int l = samples.get(0).length;
final BigDecimal[] sum = new BigDecimal[l];
for(int p = l; p-- != 0;) sum[p] = BigDecimal.ZERO;
// Gather all samples
for(double[] sample: samples) {
if (sample.length != l) throw new IllegalArgumentException("Samples have different sizes: " + sample.length + " != " + l);
for(int p = l; p-- != 0;) sum[p] = sum[p].add(BigDecimal.valueOf(sample[p]), mc);
}
final BigDecimal[] averagedSample = new BigDecimal[l];
for(int p = l; p-- != 0;) averagedSample[p] = sum[p].multiply(big1OverN, mc);
final BigDecimal[] naiveStatistics = f.compute(averagedSample, mc);
final int k = naiveStatistics.length;
final BigDecimal[][] leaveOneOutStatistic = new BigDecimal[n][];
// Compute leave-one-out statistics
for(int s = 0; s < n; s++) {
final BigDecimal[] leaveOneOutSample = new BigDecimal[l];
// Leave-one-out sample
final double[] t = samples.get(s);
for(int p = l; p-- != 0;) leaveOneOutSample[p] = sum[p].subtract(BigDecimal.valueOf(t[p]), mc).multiply(big1OverNMinus1, mc);
// Leave-one-out statistic
leaveOneOutStatistic[s] = f.compute(leaveOneOutSample, mc);
if (leaveOneOutStatistic[s].length != k) throw new IllegalArgumentException("Statistics have different sizes: " + leaveOneOutStatistic[s].length + " != " + k);
}
final BigDecimal[] estimate = new BigDecimal[k];
final double[] standardError = new double[k];
for(int i = k; i-- != 0;) {
BigDecimal e = BigDecimal.valueOf(n).multiply(naiveStatistics[i], mc);
for(int s = n; s-- != 0;) e = e.subtract(leaveOneOutStatistic[s][i].multiply(bigNMinus1OverN, mc), mc);
estimate[i] = e;
BigDecimal variance = BigDecimal.ZERO;
for(int s = n; s-- != 0;) {
final BigDecimal t = naiveStatistics[i].subtract(leaveOneOutStatistic[s][i], mc);
variance = variance.add(t.multiply(t, mc), mc);
}
standardError[i] = Math.sqrt(variance.multiply(bigNMinus1OverN, mc).doubleValue());
}
return new Jackknife(estimate, standardError);
}
}