src.it.unimi.dsi.stat.Jackknife Maven / Gradle / Ivy

Go to download
package it.unimi.dsi.stat;

/*
 * DSI utilities
 *
 * Copyright (C) 2011-2017 Sebastiano Vigna
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 3 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, see .
 *
 */

import java.math.BigDecimal;
import java.math.MathContext;
import java.math.RoundingMode;
import java.util.List;

/** Applies the jackknife to generic statistics.
 *
 * This class applies the jackknife method (see, e.g., “A leisurely look at the bootstrap, the jackknife, and cross-validation”, by Bradley Efron and Gail Gong,
 * The American Statistician, 37(1):36−48, 1983) to reduce the bias in the estimation of a nonlinear
 * statistic of interest (linear statistics, such as the mean, pass through the jackknife without change).
 * The {@linkplain Statistic statistic} must take a sample (an array of {@linkplain BigDecimal big decimals}) and return
 * corresponding values (again as an array of {@linkplain BigDecimal big decimals}). In case high-precision
 * arithmetic is not required, an instance of {@link AbstractStatistic} just takes an array of doubles and returns an
 * array of doubles, handling all necessary type conversions.
 *
 * 
The static method {@link #compute(List, Statistic, MathContext)} takes a list
 * of samples (arrays of doubles of the same length) and returns an instance of this class containing
 * {@linkplain #estimate estimates} and {@linkplain #standardError standard errors} for every
 * value computed by the statistic (estimates of the statistic are available both as
 * {@linkplain #bigEstimate an array of big decimals} and as {@linkplain #estimate an array of doubles},
 * whereas {@linkplain #standardError estimates of standard errors} are provided in double format, only).
 *
 * 
All computations are performed internally using {@link BigDecimal} and a provided {@link MathContext}.
 * The method {@link #compute(List, Statistic)} uses {@linkplain #DEFAULT_MATH_CONTEXT 100 decimal digits}.
 *
 * 
The {@linkplain #IDENTITY identical} statistic can be used to compute the (pointwise) empirical mean
 * and standard error of a sample.
 *
 * @author Sebastiano Vigna
 */

public class Jackknife {
	/** The default {@link MathContext} used by {@link #compute(List, Statistic)}: 100 digits and {@link RoundingMode#HALF_EVEN}. */
	public final static MathContext DEFAULT_MATH_CONTEXT = new MathContext(100, RoundingMode.HALF_EVEN);

	/** A vector of high-precision estimates for a statistic of interest. */
	public final BigDecimal[] bigEstimate;
	/** A vector of estimates for a statistic of interest (obtained by invoking {@link BigDecimal#doubleValue()} on {@link #bigEstimate}). */
	public final double[] estimate;
	/** A vector of (estimates of the) standard error parallel to {@link #bigEstimate}/{@link #estimate}. */
	public final double[] standardError;

	public static double[] bigDecimalArray2DoubleArray(final BigDecimal[] input) {
		final double[] output = new double[input.length];
		for(int i = input.length; i-- != 0;) output[i] = input[i].doubleValue();
		return output;
	}

	public static BigDecimal[] doubleArray2BigDecimalArray(final double[] input) {
		final BigDecimal[] output = new BigDecimal[input.length];
		for(int i = input.length; i-- != 0;) output[i] = BigDecimal.valueOf(input[i]);
		return output;
	}

	private Jackknife(final BigDecimal[] estimate, final double[] standardError) {
		this.standardError = standardError;
		this.estimate = bigDecimalArray2DoubleArray(estimate);
		this.bigEstimate = estimate;
	}

	@Override
	public String toString() {
		StringBuilder s = new StringBuilder();
		for(int i = estimate.length; i++ != 0;) s.append(estimate[i]).append('\t').append(standardError[i]).append(System.getProperty("\n"));
		return s.toString();
	}

	/** A statistic to be estimated using the jackknife on a set of samples. */
	public static interface Statistic {
		/** Computes the statistic.
		 *
		 * Note that the {@link BigDecimal} instances passed to this method are guaranteed to
		 * have a {@linkplain BigDecimal#scale() scale} set by the caller. If you have to perform divisions,
		 * please use the supplied {@link MathContext}.
		 *
		 * @param sample the samples over which the statistic must be computed.
		 * @param mc the mathematical context to be used when dividing {@linkplain BigDecimal big decimals}.
		 * @return the resulting statistic.
		 */
		public BigDecimal[] compute(BigDecimal[] sample, MathContext mc);
	}

	/** A statistic that returns the sample. Useful to compute the average and the empirical standard error. */
	public static Jackknife.Statistic IDENTITY = new Jackknife.Statistic() {
		@Override
		public BigDecimal[] compute(final BigDecimal[] sample, final MathContext unused) {
			return sample;
		}
	};

	/** An abstract statistic with a {@linkplain #compute(double[]) template method} that
	 * accepts an array of doubles, returns an array of doubles and handles the data conversions that
	 * are necessary to call {@link Statistic#compute(BigDecimal[], MathContext)}. Useful if you do not
	 * want to fiddle with {@link BigDecimal}. */
	public abstract static class AbstractStatistic implements Statistic {
		public abstract double[] compute(final double[] sample);

		@Override
		public BigDecimal[] compute(BigDecimal[] bigSample, final MathContext unused) {
			return doubleArray2BigDecimalArray(compute(bigDecimalArray2DoubleArray(bigSample)));
		}
	};


	/** Applies the jackknife to a statistic of interest using a list of samples using {@link #DEFAULT_MATH_CONTEXT} as context.
	 *
	 * @param samples a list of samples (arrays of doubles of the same length).
	 * @param f a statistic of interest.
	 * @return an instance of this class containing estimates of f and corresponding standard errors
	 * obtained by the jackknife on the given set of samples.
	 */
	public static Jackknife compute(final List samples, final Statistic f) {
		return compute(samples, f, DEFAULT_MATH_CONTEXT);
	}

	/** Applies the jackknife to a statistic of interest using a list of samples.
	 *
	 * @param samples a list of samples (arrays of doubles of the same length).
	 * @param f a statistic of interest.
	 * @param mc the mathematical context to be used when dividing {@linkplain BigDecimal big decimals}.
	 * @return an instance of this class containing estimates of f and corresponding standard errors
	 * obtained by the jackknife on the given set of samples.
	 */
	public static Jackknife compute(final List samples, final Statistic f, final MathContext mc) {
		final int n = samples.size();
		final BigDecimal big1OverN =  BigDecimal.ONE.divide(BigDecimal.valueOf(n), mc);
		final BigDecimal big1OverNMinus1 =  BigDecimal.ONE.divide(BigDecimal.valueOf(n - 1), mc);
		final BigDecimal bigNMinus1OverN =  BigDecimal.valueOf(n - 1).divide(BigDecimal.valueOf(n), mc);
		final int l = samples.get(0).length;
		final BigDecimal[] sum = new BigDecimal[l];
		for(int p = l; p-- != 0;) sum[p] = BigDecimal.ZERO;
		// Gather all samples
		for(double[] sample: samples) {
			if (sample.length != l) throw new IllegalArgumentException("Samples have different sizes: " + sample.length + " != " + l);
			for(int p = l; p-- != 0;) sum[p] = sum[p].add(BigDecimal.valueOf(sample[p]), mc);
		}

		final BigDecimal[] averagedSample = new BigDecimal[l];
		for(int p = l; p-- != 0;) averagedSample[p] = sum[p].multiply(big1OverN, mc);
		final BigDecimal[] naiveStatistics = f.compute(averagedSample, mc);
		final int k = naiveStatistics.length;

		final BigDecimal[][] leaveOneOutStatistic = new BigDecimal[n][];
		// Compute leave-one-out statistics
		for(int s = 0; s < n; s++) {
			final BigDecimal[] leaveOneOutSample = new BigDecimal[l];
			// Leave-one-out sample
			final double[] t = samples.get(s);
			for(int p = l; p-- != 0;) leaveOneOutSample[p] = sum[p].subtract(BigDecimal.valueOf(t[p]), mc).multiply(big1OverNMinus1, mc);
			// Leave-one-out statistic
			leaveOneOutStatistic[s] = f.compute(leaveOneOutSample, mc);
			if (leaveOneOutStatistic[s].length != k) throw new IllegalArgumentException("Statistics have different sizes: " + leaveOneOutStatistic[s].length + " != " + k);
		}

		final BigDecimal[] estimate = new BigDecimal[k];
		final double[] standardError = new double[k];
		for(int i = k; i-- != 0;) {
			BigDecimal e = BigDecimal.valueOf(n).multiply(naiveStatistics[i], mc);
			for(int s = n; s-- != 0;) e = e.subtract(leaveOneOutStatistic[s][i].multiply(bigNMinus1OverN, mc), mc);
			estimate[i] = e;

			BigDecimal variance = BigDecimal.ZERO;
			for(int s = n; s-- != 0;) {
				final BigDecimal t = naiveStatistics[i].subtract(leaveOneOutStatistic[s][i], mc);
				variance = variance.add(t.multiply(t, mc), mc);
			}
			standardError[i] = Math.sqrt(variance.multiply(bigNMinus1OverN, mc).doubleValue());
		}

		return new Jackknife(estimate, standardError);
	}
}