com.github.TKnudsen.ComplexDataObject.model.tools.StatisticsSupport Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of complex-data-object Show documentation
A library that models real-world objects in Java, referred to as ComplexDataObjects. Other features: IO and preprocessing of ComplexDataObjects.
The newest version!
package com.github.TKnudsen.ComplexDataObject.model.tools;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;

import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics;

/**
 * 
 * Title: StatisticsSupport
 * 
 * 
 * 
 * Description: extension of apache commons math stat descriptive
 * DescriptiveStatistics. DescriptiveStatistics maintains the input data in
 * memory and has the capability of producing "rolling" statistics computed from
 * a "window" consisting of the most recently added values.
 * 
 * Aggregate Statistics Included: min, max, mean, geometric mean, n, sum, sum of
 * squares, standard deviation, variance, percentiles, skewness, kurtosis,
 * median
 * 
 * "Rolling" capability? Yes
 * 
 * Values stored? Yes
 * 
 * 
 * *
 * 
 * Copyright: (c) 2012-2020 Juergen Bernard,
 * https://github.com/TKnudsen/ComplexDataObject
 * 
 * 
 * @author Juergen Bernard
 * @version 1.08
 *
 */
public class StatisticsSupport extends DescriptiveStatistics implements Iterable {

	/**
	 * 
	 */
	private static final long serialVersionUID = -4338838213221265737L;

	private double median = Double.NaN;
	private int count = -1;
	private int uniqueObservations = -1;

	/**
	 * NANs are removed!
	 * 
	 * @param vector
	 */
	public StatisticsSupport(Number[] vector) {
		for (int i = 0; i < vector.length; i++)
			if (!Double.isNaN(vector[i].doubleValue()))
				addValue(vector[i].doubleValue());
	}

	/**
	 * NANs are removed!
	 * 
	 * @param vector
	 */
	public StatisticsSupport(double[] vector) {
		for (int i = 0; i < vector.length; i++)
			if (!Double.isNaN(vector[i]))
				addValue(vector[i]);
	}

	/**
	 * NANs are removed!
	 * 
	 * @param values
	 */
	public StatisticsSupport(Collection values) {
		Iterator iterator = values.iterator();
		while (iterator.hasNext()) {
			Number n = iterator.next();
			if (n != null) {
				Double d = n.doubleValue();
				if (!Double.isNaN(d))
					addValue(d);
			}
		}
	}

	/**
	 * NANs are removed!
	 * 
	 * @param values
	 */
	public StatisticsSupport(List values) {
		for (int i = 0; i < values.size(); i++)
			if (!Double.isNaN(values.get(i)))
				addValue(values.get(i));
	}

	/**
	 * NANs are removed!
	 * 
	 * @param values
	 */
	public StatisticsSupport(Set values) {
		Iterator iterator = values.iterator();
		while (iterator.hasNext()) {
			double d = iterator.next();
			if (!Double.isNaN(d))
				addValue(d);
		}
	}

	/**
	 * 
	 * @param quantile the median is called by typing 50.0, not by 0.5!
	 * @return
	 */
	public double[] getOutliers(double quantile) {
		double dNotOutmin = this.getPercentile(quantile);
		double dNotOutmax = this.getPercentile(100.0 - quantile);

		List outliers = new ArrayList<>();
		for (int i = 0; i < getValues().length; i++)
			if (getValues()[i] < dNotOutmin || getValues()[i] > dNotOutmax)
				outliers.add(getValues()[i]);

		double[] ret = new double[outliers.size()];
		for (int i = 0; i < outliers.size(); i++)
			ret[i] = outliers.get(i);
		return ret;
	}

	private void resetValues() {
		this.median = Double.NaN;
		this.count = -1;
		uniqueObservations = -1;
	}

	@Override
	public void addValue(double v) {
		resetValues();
		super.addValue(v);
	}

	/**
	 * Adds a list of values. may be slow for large data sets. Requires testing.
	 * 
	 * @param values
	 */
	public void addAll(List values) {
		for (Double d : values)
			addValue(d);
	}

	public double getMedian() {
		if (Double.isNaN(median))
			median = getPercentile(50);
		return median;
	}

	public double getMean() {
		return super.getMean();
	}

	public int getCount() {
		if (count == -1)
			count = getValues().length;
		return count;
	}

	@Override
	public double getMax() {
		return super.getMax();
	}

	@Override
	public double getMin() {
		return super.getMin();
	}

	/**
	 * Must be between 0 and 100
	 * 
	 * @param percent
	 * @return
	 */
	public double getPercentile(int percent) {
		return getPercentile((double) percent);
	}

	/**
	 * Must be between 0 and 100
	 * 
	 * @param percent
	 * @return
	 */
	public double getPercentile(double percent) {
		if (percent <= 0)
			return getMin();
		if (percent >= 100)
			return getMax();
		return super.getPercentile((double) percent);
	}

	@Override
	public double getStandardDeviation() {
		return super.getStandardDeviation();
	}

	@Override
	public double getVariance() {
		return super.getVariance();
	}

	/**
	 * Number of different values
	 * 
	 * @return
	 */
	public int getCountUniqueObservations() {
		if (uniqueObservations == -1) {
			List list = DataConversion.doublePrimitivesToList(getValues());
			@SuppressWarnings({ "rawtypes", "unchecked" })
			Set uniqueValues = new HashSet(list);
			uniqueObservations = uniqueValues.size();
		}
		return uniqueObservations;
	}

	@Override
	public double[] getValues() {
		return super.getValues();
	}

	/**
	 * Predicts if a given variable is discrete by examining the ratio
	 * #uniques/#elements.
	 * 
	 * @param percent the variable which values are checked.
	 * @return true if the ratio is smaller than the given parameter (0.01 means
	 *         1%).
	 */
	public boolean isLikelyDiscrete(double ratio) {
		int uniqueObservations = getCountUniqueObservations();

		double ratioObserved = (double) uniqueObservations / getValues().length;

		return (ratioObserved < ratio);
	}

	@Override
	public Iterator iterator() {
		return DataConversion.doublePrimitivesToList(getValues()).iterator();
	}

	/**
	 * Calculates the Entropy for the value distribution. Should only be applied for
	 * positive values. Negative values will be inverted.
	 * 
	 * @return
	 */
	public double getEntropy() {
		if (getCount() == 0)
			return 0;

		double entropy = 0.0;
		for (Iterator iter = iterator(); iter.hasNext();) {
			Double d = iter.next();
			if (d > 0)
				entropy -= (d * Math.log(d));
			else if (d < 0)
				entropy -= (Math.abs(d) * Math.log(Math.abs(d)));
		}

		entropy /= Math.log(2.0);

		return entropy;
	}
}