All Downloads are FREE. Search and download functionalities are using the official Maven repository.

jaitools.numeric.StreamingSampleStats Maven / Gradle / Ivy

/*
 * Copyright 2009 Michael Bedward
 *
 * This file is part of jai-tools.
 *
 * jai-tools is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * jai-tools is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with jai-tools.  If not, see <http://www.gnu.org/licenses/>.
 *
 */
package jaitools.numeric;

import jaitools.CollectionFactory;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.logging.Logger;

/**
 * A class to calculate summary statistics for a sample of Double-valued
 * buffers that is received as a (potentially long) stream of values rather
 * than in a single batch. Any Double.NaN values in the stream will be
 * ignored.
 * 

* Two options are offered to calculate sample median. Where it is known a priori * that the data stream can be accomodated in memory, the exact median can be * requested with Statistic.MEDIAN. Where the length of the data stream is unknown, * or known to be too large to be held in memory, an approximate median can be * calculated using the 'remedian' estimator as described in: *

* PJ Rousseeuw and GW Bassett (1990) * The remedian: a robust averaging method for large data sets. * Journal of the American Statistical Society 85:97-104 *
* This is requested with Statistic.APPROX_MEDIAN. *

* Note: the 'remedian' estimator performs badly with non-stationary data, e.g. a * data stream that is monotonically increasing will result in an estimate for the * median that is too high. If possible, it is best to de-trend or randomly order * the data prior to streaming it. *

* Example of use: *


 * StreamingSampleStats strmStats = new StreamingSampleStats();
 *
 * // set the statistics that will be calculated
 * Statistic[] stats = {
 *     Statistic.MEAN,
 *     Statistic.SDEV,
 *     Statistic.RANGE,
 *     Statistic.APPROX_MEDIAN
 * };
 * strmStats.setStatistics(stats);
 *
 * // some process that generates a long stream of data
 * while (somethingBigIsRunning) {
 *     double value = ...
 *     strmStats.offer(value);
 * }
 *
 * // report the results
 * for (Statistic s : stats) {
 *     System.out.println(String.format("%s: %.4f", s, strmStats.getStatisticValue(s)));
 * }
 * 
 * 
* * @author Michael Bedward * @since 1.0 * @source $URL: https://jai-tools.googlecode.com/svn/tags/1.0.0/utils/src/main/java/jaitools/numeric/StreamingSampleStats.java $ * @version $Id: StreamingSampleStats.java 1100 2010-02-10 07:28:08Z michael.bedward $ */ public class StreamingSampleStats { private static final Logger LOGGER = Logger.getLogger("jaitools.numeric"); private ProcessorFactory factory = new ProcessorFactory(); private List processors; private List> excludedRanges; /** * Constructor */ public StreamingSampleStats() { processors = CollectionFactory.list(); excludedRanges = CollectionFactory.list(); } /** * Set a statistic to be calculated as sample values are added. * If the same statistic was previously set then calling this method * has no effect. * * @param stat the requested statistic * @see Statistic */ public void setStatistic(Statistic stat) { Processor p = findProcessor(stat); if (p == null) { p = factory.getForStatistic(stat); if (p == null) { LOGGER.severe("Unsupported Statistic: " + stat); } else { processors.add(p); // apply cached excluded ranges to the new processor for (Range excluded : excludedRanges) { p.addExcludedRange(excluded); } } } } /** * Convenience method: sets the specified statistics. * * @param stats the statistics * @see #setStatistic(Statistic) */ public void setStatistics(Statistic[] stats) { for (Statistic stat : stats) { setStatistic(stat); } } /** * Query whether the specified statistic is currently set. Note that * statistics can be set indirectly because of logical groupings. For * example, if {@code Statistic.MEAN} is set then {@code SDEV} and * {@code VARIANCE} will also be set as these three are calculated * together. The same is true for {@code MIN}, {@code MAX} and {@code RANGE}. * * @param stat the statistic * * @return true if the statistic has been set; false otherwise. 
*/ public boolean isSet(Statistic stat) { return findProcessor(stat) != null; } /** * Add a range of values to exclude from the calculation of all * statistics. If further statistics are set after calling this method * the excluded range will be applied to them as well. * * @param exclude the {@code Range} to exclude */ public void addExcludedRange(Range exclude) { excludedRanges.add(new Range(exclude)); for (Processor p : processors) { p.addExcludedRange(exclude); } } /** * Get the statistics that are currently set. * * @return the statistics */ public Set getStatistics() { Set stats = CollectionFactory.orderedSet(); for (Processor p : processors) { for (Statistic s : p.getSupported()) { stats.add(s); } } return stats; } /** * Get the (current) value of a running statistic. If there have not * been enough samples provided to compute the statistic, Double.NaN * is returned. * * @param stat * @return the (current) value of the statistic * * @throws IllegalStateException if stat was not previously set */ public Double getStatisticValue(Statistic stat) { Processor p = findProcessor(stat); if (p == null) { throw new IllegalStateException( "requesting a result for a statistic that hasn't been set: " + stat); } return p.get(stat); } /** * Get the number of sample values that have been accepted for the * specified {@code Statistic}. *

* Note that different statistics might have been set at different * times in the sampling process. * * @param stat the statistic * * @return number of samples that have been accepted * * @throws IllegalArgumentException if the statistic hasn't been set */ public long getNumAccepted(Statistic stat) { Processor p = findProcessor(stat); if (p == null) { throw new IllegalArgumentException( "requesting sample size for a statistic that is not set: " + stat); } return p.getNumAccepted(); } /** * Get the number of sample values that have been offered for the * specified {@code Statistic}. This might be higher than the value * returned by {@linkplain #getNumAccepted} due to {@code nulls}, * {@code Double.NaNs} and excluded values in the data stream. *

* Note that different statistics might have been set at different * times in the sampling process. * * @param stat the statistic * * @return number of samples that have been accepted * * @throws IllegalArgumentException if the statistic hasn't been set */ public long getNumOffered(Statistic stat) { Processor p = findProcessor(stat); if (p == null) { throw new IllegalArgumentException( "requesting sample size for a statistic that is not set: " + stat); } return p.getNumOffered(); } /** * Offer a sample value. Offered values are filtered through excluded ranges. * {@code Double.NaNs} and {@code nulls} are excluded by default. * * @param sample the sample value * * @see #getNumOffered * @see #getNumAccepted */ public void offer(Double sample) { for (Processor p : processors) { p.offer(sample); } } /** * Convenience method: adds an array of new sample values and * updates all currently set statistics. * * @param samples the new sample values */ public void addSamples(Double[] samples) { for (Processor p : processors) { for (int i = 0; i < samples.length; i++) { offer(samples[i]); } } } /** * Search the list of {@code Processors} for one that supports * the given {@code Statistic}. * * @param stat the statistic * * @return the supporting {@code Processor} or null if one has not * been set for the statistic */ private Processor findProcessor(Statistic stat) { for (Processor p : processors) { if (p.getSupported().contains(stat)) { return p; } } return null; } public Map getStatisticValues() { Map results = CollectionFactory.orderedMap(); for (Statistic s : getStatistics()) { results.put(s, getStatisticValue(s)); } return results; } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy