jaitools.numeric.SampleStats Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of jt-all Show documentation
Provides a single jar containing all JAI-tools modules which you can use instead of including individual modules in your project. Note: It does not include the Jiffle scripting language or Jiffle image operator.
The newest version!
/*
 * Copyright 2009-2011 Michael Bedward
 * 
 * This file is part of jai-tools.
 *
 * jai-tools is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 3 of the 
 * License, or (at your option) any later version.
 *
 * jai-tools is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public 
 * License along with jai-tools.  If not, see <http://www.gnu.org/licenses/>.
 * 
 */

package jaitools.numeric;

import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.SortedSet;

import jaitools.CollectionFactory;

/**
 * A collection of static methods to calculate summary statistics for
 * a sample of double-valued data. This class is used by both Jiffle
 * and the KernelStats operator.
 *
 * @author Michael Bedward
 * @author Daniele Romagnoli, GeoSolutions S.A.S.
 * @since 1.0
 * @version $Id: SampleStats.java 1610 2011-03-31 04:44:28Z michael.bedward $
 */
public class SampleStats {
    
    /**
     * Return the maximum of the given values.
     *
     * @param values sample values
     * @param ignoreNaN specifies whether to ignore NaN values
     * @return max value or Double.NaN if the sample is empty
     */
    public static double max(Double[] values, boolean ignoreNaN) {
        if (values == null || values.length == 0) {
            return Double.NaN;
        } else if (values.length == 1) {
            return values[0];
        }
        
        SortedSet set = CollectionFactory.sortedSet();
        set.addAll(Arrays.asList(values));
        if (ignoreNaN) set.remove(Double.NaN);
        return set.last();
    }

    /**
     * Return the mean of the given values.
     *
     * @param values sample values
     * @param ignoreNaN specifies whether to ignore NaN values
     * @return mean value or Double.NaN if the sample is empty
     */
    public static double mean(Double[] values, boolean ignoreNaN) {
        if (values == null || values.length == 0) {
            return Double.NaN;
        } else if (values.length == 1) {
            return values[0];
        }

        double sum = 0.0d;
        int n = 0;
        for (Double val : values) {
            if (val.isNaN()) {
                if (!ignoreNaN) return Double.NaN;
            } else {
                sum += val;
                n++ ;
            }
        }

        return sum / n;
    }

    /**
     * Calculates the minimum of the given values.
     *
     * @param values sample values
     * @param ignoreNaN specifies whether to ignore NaN values
     * @return min value or Double.NaN if the sample is empty
     */
    public static double min(Double[] values, boolean ignoreNaN) {
        if (values == null || values.length == 0) {
            return Double.NaN;
        } else if (values.length == 1) {
            return values[0];
        }
        
        SortedSet set = CollectionFactory.sortedSet();
        set.addAll(Arrays.asList(values));
        if (ignoreNaN) set.remove(Double.NaN);
        return set.first();
    }

    /**
     * Calculates the median of the given values. For a sample with an odd
     * number of elements the median is the mid-point value of the 
     * sorted sample. For an even number of elements it is the mean of
     * the two values on either side of the mid-point. 
     * 
     * @param values sample values (need not be pre-sorted)
     * @param ignoreNaN specifies whether to ignore NaN values
     * @return median value or Double.NaN if the sample is empty
     */
    @SuppressWarnings("empty-statement")
    public static double median(Double[] values, boolean ignoreNaN) {
        if (values == null) {
            return Double.NaN;
        }
        
        List nonNaNValues = CollectionFactory.list();
        nonNaNValues.addAll(Arrays.asList(values));
        if (ignoreNaN) {
            while (nonNaNValues.remove(Double.NaN)) /* deliberately empty */ ;
        }
        
        if (nonNaNValues.isEmpty()) {
            return Double.NaN;
        } else if (nonNaNValues.size() == 1) {
            return nonNaNValues.get(0);
        } else if (nonNaNValues.size() == 2) {
            return (nonNaNValues.get(0) + nonNaNValues.get(1)) / 2;
        }
        
        Collections.sort(nonNaNValues);
        
        int midHi = nonNaNValues.size() / 2;
        int midLo = midHi - 1;
        boolean even = nonNaNValues.size() % 2 == 0;

        Double result = 0.0d;
        int k = 0;
        for (Double val : nonNaNValues) {
            if (k == midHi) {
                if (!even) {
                    return val;
                } else {
                    result += val;
                    return result / 2;
                }
            } else if (even && k == midLo) {
                result += val;
            }
            k++ ;
        }
        
        return 0;  // to suppress compiler warning
    }
    
    /**
     * Calculates the empirical mode (highest frequency value) of the given values.
     * Double.NaN values are ignored. If more than one data value occurs with
     * maximum frequency the following tie-break rules are used:
     * 
     *  for an odd number of tied values, return their median
     * 
 for an even number of tied values, return the value below
     *      the mid-point of the sorted list of tied values
     * 
     * This ensures that the calculated mode occurs in the sample data.
     * Whether or not the mode is meaningful for the sample is up to the user !
     * 
     * @param values sample values
     * @param ignoreNaN specifies whether to ignore NaN values
     * @return calculated mode or Double.NaN if the sample is empty
     */
    @SuppressWarnings("empty-statement")
    public static double mode(Double[] values, boolean ignoreNaN) {
        if (values == null) {
            return Double.NaN;
        }
        
        List list = CollectionFactory.list();
        list.addAll(Arrays.asList(values));
        if (ignoreNaN) {
            while (list.remove(Double.NaN)) /* deliberately empty */ ;
        }
        
        if (list.isEmpty()) {
            return Double.NaN;
        } else if (list.size() == 1) {
            return list.get(0);
        }
        
        Collections.sort(list);
        
        List uniqueValues = CollectionFactory.list();
        List freq = CollectionFactory.list();
        
        Double curVal = list.get(0);
        int curFreq = 1;
        int maxFreq = 1;
        
        for (int i = 1; i < list.size(); i++) {
            if (CompareOp.aequal(curVal, list.get(i))) {
                curFreq++ ;
            } else {
                uniqueValues.add(curVal);
                freq.add(curFreq);
                curVal = list.get(i);
                if (curFreq > maxFreq) maxFreq = curFreq;
                curFreq = 1;
            }
        }
        uniqueValues.add(curVal);
        freq.add(curFreq);
        if (curFreq > maxFreq) maxFreq = curFreq;
        
        List maxFreqIndices = CollectionFactory.list();
        int k = 0;
        for (Integer f : freq) {
            if (f == maxFreq) {
                maxFreqIndices.add(k);
            }
            k++ ;
        }
        
        if (maxFreqIndices.size() == 1) {
            return uniqueValues.get(maxFreqIndices.get(0));
        }

        boolean even = maxFreqIndices.size() % 2 == 0;
        int i = maxFreqIndices.size() / 2;
        if (even) i-- ;
        return uniqueValues.get(maxFreqIndices.get(i));
    }

    /**
     * Calculates the range (max - min) of a set of values.
     *
     * @param values sample values
     * @param ignoreNaN specifies whether to ignore NaN values
     * @return the range or Double.NaN if the set is empty
     */
    public static double range(Double[] values, boolean ignoreNaN) {
        if (values == null || values.length == 0) {
            return Double.NaN;
        } else if (values.length == 1) {
            return 0d;
        }
        
        SortedSet set = CollectionFactory.sortedSet();
        set.addAll(Arrays.asList(values));
        if (ignoreNaN) set.remove(Double.NaN);
        return set.last() - set.first();
    }

    /**
     * Calculates sample variance using the running sample algorithm
     * of Welford (1962) described by Knuth in The Art of Computer
     * Programming (3rd ed) Vol.2 p.232
     * 
     * @param values sample values
     * @param ignoreNaN specifies whether to ignore NaN values
     * @return sample variance
     */
    public static double variance(Double[] values, boolean ignoreNaN) {
        if (values.length < 2) {
            return Double.NaN;
        }

        double mNew, mOld = 0.0d, s = 0.0d;

        int n = 0;
        for (int i = 0; i < values.length; i++) {
            if (Double.isNaN(values[i])) {
                if (!ignoreNaN) {
                    return Double.NaN;
                }
                
            } else {
                n++;
                if (n == 1) {
                    mNew = mOld = values[i];
                } else {
                    mNew = mOld + (values[i] - mOld) / n;
                    s = s + (values[i] - mOld) * (values[i] - mNew);
                    mOld = mNew;
                }
            }
        }

        if (n > 1) {
            return s / (n - 1);
        } else if (n == 1) {
            return 0.0d;
        } else {
            return Double.NaN;
        }
    }

    /**
     * Calculates sample standard deviation. This is a convenience
     * method that calls {@linkplain #variance(java.lang.Double[], boolean) }
     * and returns the square-root of the result
     *
     * @param values sample values
     * @param ignoreNaN specifies whether to ignore NaN values
     * @return sample standard deviation as a double
     */
    public static double sdev(Double[] values, boolean ignoreNaN) {
        double var = variance(values, ignoreNaN);
        return (Double.isNaN(var) ? Double.NaN : Math.sqrt(var));
    }

    /**
     * Calculates the sum of the values.
     * 
     * @param values sample values
     * @param ignoreNaN specifies whether to ignore NaN values
     * @return sum of the values
     */
    public static double sum(Double[] values, boolean ignoreNaN) {
        double sum = 0.0d;

        for (int i = 0; i < values.length; i++) {
            if (Double.isNaN(values[i])) {
                if (!ignoreNaN) {
                    return Double.NaN;
                }
            } else {
              sum = sum + values[i];  
            }
        }
        
        return sum;
    }
}