// jaitools.numeric.StreamingSampleStats Maven / Gradle / Ivy
// Show all versions of jt-all Show documentation
/*
* Copyright 2009-2011 Michael Bedward
*
* This file is part of jai-tools.
*
* jai-tools is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* jai-tools is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with jai-tools. If not, see <http://www.gnu.org/licenses/>.
*
*/
package jaitools.numeric;
import jaitools.CollectionFactory;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
/**
* A class to calculate summary statistics for a sample of Double-valued
* buffers that is received as a (potentially long) stream of values rather
* than in a single batch. Any Double.NaN values in the stream will be
* ignored.
*
* Two options are offered to calculate the sample median. Where it is known a priori
* that the data stream can be accommodated in memory, the exact median can be
* requested with Statistic.MEDIAN. Where the length of the data stream is unknown,
* or known to be too large to be held in memory, an approximate median can be
* calculated using the 'remedian' estimator as described in:
*
* PJ Rousseeuw and GW Bassett (1990)
* The remedian: a robust averaging method for large data sets.
* Journal of the American Statistical Association 85:97-104
*
* This is requested with Statistic.APPROX_MEDIAN.
*
* Note: the 'remedian' estimator performs badly with non-stationary data, e.g. a
* data stream that is monotonically increasing will result in an estimate for the
* median that is too high. If possible, it is best to de-trend or randomly order
* the data prior to streaming it.
*
* Example of use:
* <pre><code>
* StreamingSampleStats strmStats = new StreamingSampleStats();
*
* // set the statistics that will be calculated
* Statistic[] stats = {
*     Statistic.MEAN,
*     Statistic.SDEV,
*     Statistic.RANGE,
*     Statistic.APPROX_MEDIAN
* };
* strmStats.setStatistics(stats);
*
* // some process that generates a long stream of data
* while (somethingBigIsRunning) {
*     double value = ...
*     strmStats.offer(value);
* }
*
* // report the results
* for (Statistic s : stats) {
*     System.out.println(String.format("%s: %.4f", s, strmStats.getStatisticValue(s)));
* }
* </code></pre>
*
*
* @author Michael Bedward
* @author Daniele Romagnoli, GeoSolutions S.A.S.
* @since 1.0
* @version $Id: StreamingSampleStats.java 1504 2011-03-05 10:56:10Z michael.bedward $
*/
public class StreamingSampleStats {
private static final Logger LOGGER = Logger.getLogger("jaitools.numeric");
private ProcessorFactory factory = new ProcessorFactory();
private List processors;
private List> ranges;
private List> noDataRanges;
private final Range.Type rangesType;
/**
* Creates a new sampler and sets the default range type to
* {@link Range.Type#EXCLUDE}.
*/
public StreamingSampleStats() {
this(Range.Type.EXCLUDE);
}
/**
* Creates a new sampler with specified use of {@code Ranges}.
*
* @param rangesType either {@link Range.Type#INCLUDE}
* or {@link Range.Type#EXCLUDE}
*/
public StreamingSampleStats(Range.Type rangesType) {
processors = CollectionFactory.list();
ranges = CollectionFactory.list();
noDataRanges = CollectionFactory.list();
this.rangesType = rangesType;
}
/**
* Adds a statistic to those calculated by this sampler.
* If the same statistic was previously set then calling this method
* has no effect.
*
* @param stat the statistic
* @see Statistic
*/
public void setStatistic(Statistic stat) {
Processor p = findProcessor(stat);
if (p == null) {
p = factory.getForStatistic(stat);
if (p == null) {
LOGGER.log(Level.SEVERE, "Unsupported Statistic: {0}", stat);
} else {
processors.add(p);
// apply cached excluded ranges to the new processor
for (Range range : ranges) {
p.addRange(range, rangesType);
}
for (Range nRange : noDataRanges) {
p.addNoDataRange(nRange);
}
}
}
}
/**
* Adds the given statistics to those that will be calculated by this sampler.
*
* @param stats the statistics
*
* @see #setStatistic(Statistic)
*/
public void setStatistics(Statistic[] stats) {
for (Statistic stat : stats) {
setStatistic(stat);
}
}
/**
* Tests whether the specified statistic is currently set. Note that
* statistics can be set indirectly because of logical groupings. For
* example, if {@code Statistic.MEAN} is set then {@code SDEV} and
* {@code VARIANCE} will also be set as these three are calculated
* together. The same is true for {@code MIN}, {@code MAX} and {@code RANGE}.
*
* @param stat the statistic
*
* @return {@code true} if the statistic has been set; {@code false} otherwise.
*/
public boolean isSet(Statistic stat) {
return findProcessor(stat) != null;
}
/**
* Adds a range of values to be considered as NoData and then to be excluded
* from the calculation of all statistics. NoData ranges take precedence
* over included / excluded data ranges.
*
* @param noData the range defining NoData values
*/
public void addNoDataRange(Range noData) {
noDataRanges.add(new Range(noData));
for (Processor p : processors) {
p.addNoDataRange(noData);
}
}
/**
* Adds a single value to be considered as NoData.
*
* @param noData the value to be treated as NoData
*
* @see #addNoDataRange(jaitools.numeric.Range)
*/
public void addNoDataValue(Double noData) {
if (noData != null && !noData.isNaN()) {
addNoDataRange(new Range(noData));
}
}
/**
* Adds a range of values to include in or exclude from the calculation
* of all statistics. If further statistics are set after calling
* this method the range will be applied to them as well.
*
* @param range the range to include/exclude
*/
public void addRange(Range range) {
ranges.add(new Range(range));
for (Processor p : processors) {
p.addRange(range);
}
}
/**
* Adds a range of values to include in or exclude from the calculation
* of all statistics. If further statistics are set after calling
* this method the range will be applied to them as well.
*
* @param range the range to include/exclude
* @param rangesType one of {@link Range.Type#INCLUDE} or {@link Range.Type#EXCLUDE}
*/
public void addRange(Range range, Range.Type rangesType) {
for (Processor p : processors) {
p.addRange(range, rangesType);
}
ranges.add(new Range(range));
}
/**
* Gets the statistics that are currently set.
*
* @return the statistics
*/
public Set getStatistics() {
Set stats = CollectionFactory.orderedSet();
for (Processor p : processors) {
for (Statistic s : p.getSupported()) {
stats.add(s);
}
}
return stats;
}
/**
* Gets the current value of a running statistic. If there have not
* been enough samples provided to compute the statistic, Double.NaN
* is returned.
*
* @param stat the statistic
* @return the current value of the statistic
*
* @throws IllegalStateException if {@code stat} was not previously set
*/
public Double getStatisticValue(Statistic stat) {
Processor p = findProcessor(stat);
if (p == null) {
throw new IllegalStateException(
"requesting a result for a statistic that hasn't been set: " + stat);
}
return p.get(stat);
}
/**
* Gets the number of sample values that have been accepted for the
* specified {@code Statistic}.
*
* Note that different statistics might have been set at different
* times in the sampling process.
*
* @param stat the statistic
*
* @return number of samples that have been accepted
*
* @throws IllegalArgumentException if the statistic hasn't been set
*/
public long getNumAccepted(Statistic stat) {
Processor p = findProcessor(stat);
if (p == null) {
throw new IllegalArgumentException(
"requesting sample size for a statistic that is not set: " + stat);
}
return p.getNumAccepted();
}
/**
* Gets the number of sample values that have been offered for the
* specified {@code Statistic}. This might be higher than the value
* returned by {@link #getNumAccepted} due to {@code nulls},
* {@code Double.NaNs} and excluded values in the data stream.
*
* Note that different statistics might have been set at different
* times in the sampling process.
*
* @param stat the statistic
*
* @return number of samples that have been accepted
*
* @throws IllegalArgumentException if the statistic hasn't been set
*/
public long getNumOffered(Statistic stat) {
Processor p = findProcessor(stat);
if (p == null) {
throw new IllegalArgumentException(
"requesting sample size for a statistic that is not set: " + stat);
}
return p.getNumOffered();
}
/**
* Gets the number of NaN values that have been offered.
* Note that different statistics might have been set at different
* times in the sampling process.
*
* @param stat the statistic
*
* @return number of NaN samples offered
*
* @throws IllegalArgumentException if the statistic hasn't been set
*/
public long getNumNaN(Statistic stat) {
Processor p = findProcessor(stat);
if (p == null) {
throw new IllegalArgumentException(
"requesting sample size for a statistic that is not set: " + stat);
}
return p.getNumNaN();
}
/**
* Gets the number of NoData values (including NaN) that have been offered.
* Note that different statistics might have been set at different
* times in the sampling process.
*
* @param stat the statistic
*
* @return number of NoData samples offered
*
* @throws IllegalArgumentException if the statistic hasn't been set
*/
public long getNumNoData(Statistic stat) {
Processor p = findProcessor(stat);
if (p == null) {
throw new IllegalArgumentException(
"requesting sample size for a statistic that is not set: " + stat);
}
return p.getNumNoData();
}
/**
* Offers a sample value. Offered values are filtered through excluded ranges.
* {@code Double.NaNs} and {@code nulls} are excluded by default.
*
* @param sample the sample value
*/
public void offer(Double sample) {
for (Processor p : processors) {
p.offer(sample);
}
}
/**
* Offers an array of sample values.
*
* @param samples the sample values
*/
public void offer(Double[] samples) {
for (int i = 0; i < samples.length; i++) {
offer(samples[i]);
}
}
/**
* Searches the list of {@code Processors} for one that supports
* the given {@code Statistic}.
*
* @param stat the statistic
*
* @return the supporting {@code Processor} or null if one has not
* been set for the statistic
*/
private Processor findProcessor(Statistic stat) {
for (Processor p : processors) {
if (p.getSupported().contains(stat)) {
return p;
}
}
return null;
}
/**
* Gets the values of all statistics calculated by this sampler.
*
* @return calculated values
*/
public Map getStatisticValues() {
Map results = CollectionFactory.orderedMap();
for (Statistic s : getStatistics()) {
results.put(s, getStatisticValue(s));
}
return results;
}
}