src.it.unimi.dsi.stat.Jackknife Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of dsiutils Show documentation
The DSI utilities are a mish mash of classes accumulated during the last ten years in projects developed at the DSI (Dipartimento di Scienze dell'Informazione, i.e., Information Sciences Department), now DI (Dipartimento di Informatica, i.e., Informatics Department), of the Universita` degli Studi di Milano.
There is a newer version: 2.7.3
Show newest version
package it.unimi.dsi.stat;

/*		 
 * DSI utilities
 *
 * Copyright (C) 2011-2016 Sebastiano Vigna 
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 3 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, see .
 *
 */

import java.math.BigDecimal;
import java.math.MathContext;
import java.math.RoundingMode;
import java.util.List;

/** Applies the jackknife to generic statistics.
 * 
 * This class applies the jackknife method (see, e.g., “A leisurely look at the bootstrap, the jackknife, and cross-validation”, by Bradley Efron and Gail Gong,
 * The American Statistician, 37(1):36−48, 1983) to reduce the bias in the estimation of a nonlinear
 * statistic of interest (linear statistics, such as the mean, pass through the jackknife without change). 
 * The {@linkplain Statistic statistic} must take a sample (an array of {@linkplain BigDecimal big decimals}) and return 
 * corresponding values (again as an array of {@linkplain BigDecimal big decimals}). In case high-precision
 * arithmetic is not required, an instance of {@link AbstractStatistic} just takes an array of doubles and returns an
 * array of doubles, handling all necessary type conversions. 
 * 
 * 
The static method {@link #compute(List, Statistic, MathContext)} takes a list
 * of samples (arrays of doubles of the same length) and returns an instance of this class containing
 * {@linkplain #estimate estimates} and {@linkplain #standardError standard errors} for every 
 * value computed by the statistic (estimates of the statistic are available both as 
 * {@linkplain #bigEstimate an array of big decimals} and as {@linkplain #estimate an array of doubles},
 * whereas {@linkplain #standardError estimates of standard errors} are provided in double format, only).
 * 
 * 
All computations are performed internally using {@link BigDecimal} and a provided {@link MathContext}.
 * The method {@link #compute(List, Statistic)} uses {@linkplain #DEFAULT_MATH_CONTEXT 100 decimal digits}.
 * 
 * 
The {@linkplain #IDENTITY identical} statistic can be used to compute the (pointwise) empirical mean
 * and standard error of a sample.
 * 
 * @author Sebastiano Vigna
 */

public class Jackknife {
	/** The default {@link MathContext} used by {@link #compute(List, Statistic)}: 100 digits and {@link RoundingMode#HALF_EVEN}. */
	public final static MathContext DEFAULT_MATH_CONTEXT = new MathContext( 100, RoundingMode.HALF_EVEN );
	
	/** A vector of high-precision estimates for a statistic of interest. */
	public final BigDecimal[] bigEstimate;
	/** A vector of estimates for a statistic of interest (obtained by invoking {@link BigDecimal#doubleValue()} on {@link #bigEstimate}). */
	public final double[] estimate;
	/** A vector of (estimates of the) standard error parallel to {@link #bigEstimate}/{@link #estimate}. */
	public final double[] standardError;
	
	public static double[] bigDecimalArray2DoubleArray( final BigDecimal[] input ) {
		final double[] output = new double[ input.length ];
		for( int i = input.length; i-- != 0; ) output[ i ] = input[ i ].doubleValue();
		return output;
	}
	
	public static BigDecimal[] doubleArray2BigDecimalArray( final double[] input ) {
		final BigDecimal[] output = new BigDecimal[ input.length ];
		for( int i = input.length; i-- != 0; ) output[ i ] = BigDecimal.valueOf( input[ i ] );
		return output;
	}
	
	private Jackknife( final BigDecimal[] estimate, final double[] standardError ) {
		this.standardError = standardError;
		this.estimate = bigDecimalArray2DoubleArray( estimate );
		this.bigEstimate = estimate;
	}

	public String toString() {
		StringBuilder s = new StringBuilder();
		for( int i = estimate.length; i++ != 0; ) s.append( estimate[ i ] ).append( '\t' ).append( standardError[ i ] ).append( System.getProperty( "\n" ) );
		return s.toString();
	}
	
	/** A statistic to be estimated using the jackknife on a set of samples. */
	public static interface Statistic {
		/** Computes the statistic.
		 * 
		 * Note that the {@link BigDecimal} instances passed to this method are guaranteed to 
		 * have a {@linkplain BigDecimal#scale() scale} set by the caller. If you have to perform divisions, 
		 * please use the supplied {@link MathContext}.
		 * 
		 * @param sample the samples over which the statistic must be computed.
		 * @param mc the mathematical context to be used when dividing {@linkplain BigDecimal big decimals}.
		 * @return the resulting statistic.
		 */
		public BigDecimal[] compute( BigDecimal[] sample, MathContext mc );
	}
	
	/** A statistic that returns the sample. Useful to compute the average and the empirical standard error. */
	public static Jackknife.Statistic IDENTITY = new Jackknife.Statistic() {
		@Override
		public BigDecimal[] compute( final BigDecimal[] sample, final MathContext unused ) {
			return sample;
		}
	};

	/** An abstract statistic with a {@linkplain #compute(double[]) template method} that 
	 * accepts an array of doubles, returns an array of doubles and handles the data conversions that
	 * are necessary to call {@link Statistic#compute(BigDecimal[], MathContext)}. Useful if you do not
	 * want to fiddle with {@link BigDecimal}. */
	public abstract static class AbstractStatistic implements Statistic {
		public abstract double[] compute( final double[] sample );

		@Override
		public BigDecimal[] compute( BigDecimal[] bigSample, final MathContext unused ) {
			return doubleArray2BigDecimalArray( compute( bigDecimalArray2DoubleArray( bigSample ) ) );
		}
	};

	
	/** Applies the jackknife to a statistic of interest using a list of samples using {@link #DEFAULT_MATH_CONTEXT} as context.
	 * 
	 * @param samples a list of samples (arrays of doubles of the same length).
	 * @param f a statistic of interest.
	 * @return an instance of this class containing estimates of f and corresponding standard errors 
	 * obtained by the jackknife on the given set of samples.
	 */
	public static Jackknife compute( final List samples, final Statistic f ) {
		return compute( samples, f, DEFAULT_MATH_CONTEXT );
	}
	
	/** Applies the jackknife to a statistic of interest using a list of samples.
	 * 
	 * @param samples a list of samples (arrays of doubles of the same length).
	 * @param f a statistic of interest.
	 * @param mc the mathematical context to be used when dividing {@linkplain BigDecimal big decimals}.
	 * @return an instance of this class containing estimates of f and corresponding standard errors 
	 * obtained by the jackknife on the given set of samples.
	 */
	public static Jackknife compute( final List samples, final Statistic f, final MathContext mc ) {
		final int n = samples.size();
		final BigDecimal big1OverN =  BigDecimal.ONE.divide( BigDecimal.valueOf( n ), mc );
		final BigDecimal big1OverNMinus1 =  BigDecimal.ONE.divide( BigDecimal.valueOf( n - 1 ), mc );
		final BigDecimal bigNMinus1OverN =  BigDecimal.valueOf( n - 1 ).divide( BigDecimal.valueOf( n ), mc );
		final int l = samples.get( 0 ).length;
		final BigDecimal[] sum = new BigDecimal[ l ];
		for( int p = l; p-- != 0; ) sum[ p ] = BigDecimal.ZERO;
		// Gather all samples
		for( double[] sample: samples ) { 
			if ( sample.length != l ) throw new IllegalArgumentException( "Samples have different sizes: " + sample.length + " != " + l );
			for( int p = l; p-- != 0; ) sum[ p ] = sum[ p ].add( BigDecimal.valueOf( sample[ p ] ), mc );
		}

		final BigDecimal[] averagedSample = new BigDecimal[ l ];
		for( int p = l; p-- != 0; ) averagedSample[ p ] = sum[ p ].multiply( big1OverN, mc );
		final BigDecimal[] naiveStatistics = f.compute( averagedSample, mc );
		final int k = naiveStatistics.length;

		final BigDecimal[][] leaveOneOutStatistic = new BigDecimal[ n ][];
		// Compute leave-one-out statistics
		for( int s = 0; s < n; s++ ) {
			final BigDecimal[] leaveOneOutSample = new BigDecimal[ l ];
			// Leave-one-out sample
			final double[] t = samples.get( s );
			for( int p = l; p-- != 0; ) leaveOneOutSample[ p ] = sum[ p ].subtract( BigDecimal.valueOf( t[ p ] ), mc ).multiply( big1OverNMinus1, mc );
			// Leave-one-out statistic
			leaveOneOutStatistic[ s ] = f.compute( leaveOneOutSample, mc );
			if ( leaveOneOutStatistic[ s ].length != k ) throw new IllegalArgumentException( "Statistics have different sizes: " + leaveOneOutStatistic[ s ].length + " != " + k );
		}
		
		final BigDecimal[] estimate = new BigDecimal[ k ];
		final double[] standardError = new double[ k ];
		for( int i = k; i-- != 0; ) {
			BigDecimal e = BigDecimal.valueOf( n ).multiply( naiveStatistics[ i ], mc );  
			for( int s = n; s-- != 0; ) e = e.subtract( leaveOneOutStatistic[ s ][ i ].multiply( bigNMinus1OverN, mc ), mc );
			estimate[ i ] = e;

			BigDecimal variance = BigDecimal.ZERO;
			for( int s = n; s-- != 0; ) {
				final BigDecimal t = naiveStatistics[ i ].subtract( leaveOneOutStatistic[ s ][ i ], mc );
				variance = variance.add( t.multiply( t, mc ), mc );
			}
			standardError[ i ] = Math.sqrt( variance.multiply( bigNMinus1OverN, mc ).doubleValue() );			
		}
		
		return new Jackknife( estimate, standardError );
	}
}