jaitools.numeric.ApproxMedianProcessor Maven / Gradle / Ivy
Show all versions of jt-all Show documentation
/*
* Copyright 2009-2010 Michael Bedward
*
* This file is part of jai-tools.
*
* jai-tools is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* jai-tools is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with jai-tools. If not, see <http://www.gnu.org/licenses/>.
*
*/
package jaitools.numeric;
import jaitools.CollectionFactory;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
/**
* Processor for the approximate median {@code Statistic.APPROX_MEDIAN}.
*
* Calculation of an exact median is only possible by storing all sample
* values. For very large data streams, this processor will calculate an
* approximate median using the remedian estimator:
*
* PJ Rousseeuw and GW Bassett (1990)
* The remedian: a robust averaging method for large data sets.
* Journal of the American Statistical Association 85:97-104
*
*
* The remedian estimator performs badly with non-stationary data, e.g. a
* data stream that is monotonically increasing will result in an over-estimate.
* If possible (which it probably isn't), it will help to de-trend or randomly
* order the data prior to streaming it.
*
* @see Statistic
* @see StreamingSampleStats
*
* @author Michael Bedward
* @since 1.0
* @version $Id: ApproxMedianProcessor.java 1383 2011-02-10 11:22:29Z michael.bedward $
*/
public class ApproxMedianProcessor extends AbstractProcessor {

    /** The only statistic that this processor reports. */
    private static final Set<Statistic> SUPPORTED = Collections.singleton(Statistic.APPROX_MEDIAN);

    /**
     * Capacity of each buffer. This must be an odd value so that a full
     * buffer has a single middle element.
     */
    private static final int BASE = 21;

    /** Index of the middle element of a full, sorted buffer. */
    private static final int MEDIAN_POS = BASE / 2;

    /** Set whenever a sample is stored; cleared once the remedian has been recalculated. */
    private boolean needsCalculation = true;

    /** Most recently calculated remedian; only meaningful while needsCalculation is false. */
    private double remedian;

    /*
     * Fixed-capacity store of BASE values. The values in data[0..pos-1] are
     * the live contents; anything at or beyond pos is stale.
     */
    private static class Buffer {
        double[] data = new double[BASE];
        int pos = 0;

        /* Append a value. The caller must ensure that the buffer is not full. */
        void add(double value) {
            data[pos++] = value;
        }

        boolean isFull() {
            return pos >= BASE;
        }
    }

    /** Buffers ordered by level: level 0 holds raw samples, level i+1 holds medians of level i. */
    private List<Buffer> buffers;

    /** The level 0 buffer, which receives incoming samples directly. */
    private Buffer buf0;

    /*
     * A buffered value paired with the weight given to it when the final
     * weighted median is taken. Ordering is by value only.
     */
    private static class WeightedSample implements Comparable<WeightedSample> {
        double value;
        long weight;

        public int compareTo(WeightedSample other) {
            return Double.compare(value, other.value);
        }
    }

    /**
     * Default constructor. Creates the level 0 buffer.
     */
    public ApproxMedianProcessor() {
        buffers = CollectionFactory.list();
        buf0 = new Buffer();
        buffers.add(buf0);
    }

    /**
     * {@inheritDoc}
     */
    public Collection<Statistic> getSupported() {
        return SUPPORTED;
    }

    /**
     * {@inheritDoc}
     */
    protected boolean update(Double sample) {
        if (!isAccepted(sample)) {
            return false;
        }

        // Make room first: a full level 0 buffer is collapsed to its median,
        // which is pushed up to level 1, before the new sample is stored.
        if (buf0.isFull()) {
            cascade(0);
        }
        buf0.add(sample);
        needsCalculation = true;
        return true;
    }

    /**
     * {@inheritDoc}
     *
     * @return {@code Double.NaN} if {@code getNumAccepted()} is zero; the
     *         stored sample if it is one; otherwise the remedian estimate
     *
     * @throws IllegalArgumentException if {@code stat} is not
     *         {@code Statistic.APPROX_MEDIAN}
     */
    public Double get(Statistic stat) {
        if (!SUPPORTED.contains(stat)) {
            throw new IllegalArgumentException(stat + " not supported by " + getClass().getName());
        }

        if (getNumAccepted() == 0) {
            return Double.NaN;
        }
        if (getNumAccepted() == 1) {
            return buf0.data[0];
        }

        if (needsCalculation) {
            remedian = calculateRemedian();
            needsCalculation = false;
        }
        return remedian;
    }

    /*
     * Calculate the remedian as the weighted median of the buffer values
     * where the weight for each value in buffer i is BASE^i, i = 0..numBuffers-1
     */
    private double calculateRemedian() {
        List<WeightedSample> samples = CollectionFactory.list();
        long weight = 1;
        for (Buffer buf : buffers) {
            for (int i = 0; i < buf.pos; i++) {
                WeightedSample datum = new WeightedSample();
                datum.value = buf.data[i];
                datum.weight = weight;
                samples.add(datum);
            }
            weight = weight * BASE;
        }
        Collections.sort(samples);

        // Half of the accepted count, rounded up. Rounding down stops one
        // element early for odd counts (three samples would give the minimum,
        // and a full buffer would give index MEDIAN_POS - 1 instead of
        // MEDIAN_POS, disagreeing with cascade). Even counts are unaffected.
        long nHalf = (getNumAccepted() + 1) / 2;

        // Walk up the sorted values until the cumulative weight reaches nHalf.
        // The loop is bounded by the list, so it cannot run past the end even
        // if the buffered weights were to total less than getNumAccepted().
        double result = Double.NaN;
        long n = 0;
        for (WeightedSample datum : samples) {
            result = datum.value;
            n += datum.weight;
            if (n >= nHalf) {
                break;
            }
        }
        return result;
    }

    /*
     * Calculate the median of the values in the full buffer at
     * the given level and store the result in the next
     * available position of the buffer at level+1, creating this
     * next buffer if necessary. If the next buffer is also full
     * it is cascaded with a recursive call.
     */
    private void cascade(int level) {
        Buffer buf = buffers.get(level);

        // The buffer is about to be emptied so it can be sorted in place
        Arrays.sort(buf.data);
        double median = buf.data[MEDIAN_POS];

        Buffer nextBuf;
        if (level + 1 < buffers.size()) {
            nextBuf = buffers.get(level + 1);
        } else {
            nextBuf = new Buffer();
            buffers.add(nextBuf);
        }

        // Free a slot in the next level before storing this level's median
        if (nextBuf.isFull()) {
            cascade(level + 1);
        }

        buf.pos = 0;
        nextBuf.add(median);
    }
}