weka.core.stats.NumericStats Maven / Gradle / Ivy

This package provides a generic configuration class and distributed map/reduce style tasks for Weka.

/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    NumericStats
 *    Copyright (C) 2014 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.core.stats;

import java.io.Serializable;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;

import weka.core.Attribute;
import weka.core.Utils;
import weka.distributed.CSVToARFFHeaderMapTask;

import com.clearspring.analytics.stream.quantile.TDigest;

/**
 * Class for computing numeric stats
 * 
 * @author Mark Hall (mhall{[at]}pentaho{[dot]}com)
 * @version $Revision: 11580 $
 */
public class NumericStats extends Stats implements Serializable {

  /** Default compression for TDigest quantile estimators */
  public static final double Q_COMPRESSION = 50.0;

  /** For serialization */
  private static final long serialVersionUID = 5328158049841703129L;

  /** Holds the actual stats values */
  protected double[] m_stats =
    new double[ArffSummaryNumericMetric.values().length];

  /** For quantiles/histograms (if we are estimating them) */
  protected transient TDigest m_quantileEstimator;

  /** Holds the serialized quantile estimator (if we are using one) */
  protected byte[] m_encodedTDigestEstimator;

  /**
   * The compression level to use (bigger = less compression/more accuracy/more
   * space/more time)
   */
  protected double m_quantileCompression = Q_COMPRESSION;

  /** Labels for a histogram */
  List<String> m_binLabels;

  /** Bin frequencies */
  List<Double> m_binFreqs;

  /**
   * Constructs a new NumericStats
   * 
   * @param attributeName the name of the attribute that these statistics are
   *          for
   */
  public NumericStats(String attributeName) {
    super(attributeName);

    m_stats[ArffSummaryNumericMetric.MIN.ordinal()] = Utils.missingValue();
    m_stats[ArffSummaryNumericMetric.MAX.ordinal()] = Utils.missingValue();
    m_stats[ArffSummaryNumericMetric.FIRSTQUARTILE.ordinal()] =
      Utils.missingValue();
    m_stats[ArffSummaryNumericMetric.MEDIAN.ordinal()] = Utils.missingValue();
    m_stats[ArffSummaryNumericMetric.THIRDQUARTILE.ordinal()] =
      Utils.missingValue();
  }

  /**
   * Construct a new NumericStats
   * 
   * @param attributeName the name of the attribute that these statistics are
   *          for
   * @param quantileCompression the degree of compression for quantile
   *          estimation (bigger = less compression)
   */
  public NumericStats(String attributeName, double quantileCompression) {
    this(attributeName);

    m_quantileCompression = quantileCompression;
  }

  /**
   * Update the incremental aggregateable portions of this NumericStats with the
   * supplied value
   * 
   * @param value the value to update with
   * @param weight the weight to use
   * @param treatZeroAsMissing true if zeros count as missing
   * @param updateQuantiles true if we should update our quantile estimator
   */
  public void update(double value, double weight, boolean treatZeroAsMissing,
    boolean updateQuantiles) {
    if (Utils.isMissingValue(value) || (treatZeroAsMissing && value == 0)) {
      m_stats[ArffSummaryNumericMetric.MISSING.ordinal()] += weight;
    } else {
      m_stats[ArffSummaryNumericMetric.COUNT.ordinal()] += weight;
      m_stats[ArffSummaryNumericMetric.SUM.ordinal()] += value * weight;
      m_stats[ArffSummaryNumericMetric.SUMSQ.ordinal()] +=
        value * value * weight;
      if (Double.isNaN(m_stats[ArffSummaryNumericMetric.MIN.ordinal()])) {
        m_stats[ArffSummaryNumericMetric.MIN.ordinal()] =
          m_stats[ArffSummaryNumericMetric.MAX.ordinal()] = value;
      } else if (value < m_stats[ArffSummaryNumericMetric.MIN.ordinal()]) {
        m_stats[ArffSummaryNumericMetric.MIN.ordinal()] = value;
      } else if (value > m_stats[ArffSummaryNumericMetric.MAX.ordinal()]) {
        m_stats[ArffSummaryNumericMetric.MAX.ordinal()] = value;
      }

      if (updateQuantiles) {
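        // Lazily maintain the TDigest: decode a previously serialized
        // estimator if one is present, create a fresh one on first use, and
        // add the value (TDigest takes an integer count, so weights are
        // floored, with a minimum of 1)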
        if (m_encodedTDigestEstimator != null) {
          deSerializeCurrentQuantileEstimator();
        }
        if (m_quantileEstimator == null) {
          m_quantileEstimator =
            new TDigest(m_quantileCompression, new Random(1));
        }
        m_quantileEstimator.add(value, (int) (weight < 1 ? 1 : weight));
      }
    }
  }

  /**
   * Return the array of statistics
   * 
   * @return the array of statistics
   */
  public double[] getStats() {
    return m_stats;
  }

  /**
   * Sets the array of statistics. Does not check to see if the supplied array
   * is of the correct length.
   * 
   * @param stats the stats array to use
   */
  public void setStats(double[] stats) {
    m_stats = stats;
  }

  /**
   * Get the quantile estimator in use (if any)
   * 
   * @return the quantile estimator
   */
  public TDigest getQuantileEstimator() {
    return m_quantileEstimator;
  }

  /**
   * Set the quantile estimator to use
   * 
   * @param estimator the estimator to use
   */
  public void setQuantileEstimator(TDigest estimator) {
    m_quantileEstimator = estimator;
  }

  /**
   * Serialize the current TDigest quantile estimator to a byte array so that
   * it survives Java serialization of this object (the TDigest field itself
   * is transient)
   */
  public void serializeCurrentQuantileEstimator() {
    if (m_quantileEstimator != null) {
      ByteBuffer buff = ByteBuffer.allocate(m_quantileEstimator.byteSize());
      m_quantileEstimator.asSmallBytes(buff);
      m_encodedTDigestEstimator = buff.array();
    }
  }

  /**
   * Decode the current TDigest quantile estimator from its serialized byte form
   */
  public void deSerializeCurrentQuantileEstimator() {
    if (m_encodedTDigestEstimator != null) {
      ByteBuffer buff = ByteBuffer.wrap(m_encodedTDigestEstimator);
      m_quantileEstimator = TDigest.fromBytes(buff);
      m_encodedTDigestEstimator = null;
    }
  }

  /**
   * Set histogram data for this numeric stats
   * 
   * @param labs bin labels
   * @param freqs bin frequencies
   */
  public void setHistogramData(List<String> labs, List<Double> freqs) {
    m_binLabels = labs;
    m_binFreqs = freqs;
  }

  /**
   * Get the histogram labels
   * 
   * @return the list of histogram labels or null if not set
   */
  public List<String> getHistogramBinLabels() {
    return m_binLabels;
  }

  /**
   * Get the histogram bin frequencies
   * 
   * @return the list of histogram bin frequencies or null if not set
   */
  public List<Double> getHistogramFrequencies() {
    return m_binFreqs;
  }

  @Override
  public Attribute makeAttribute() {
    ArrayList<String> vals = new ArrayList<String>();

    for (ArffSummaryNumericMetric m : ArffSummaryNumericMetric.values()) {
      if (m.ordinal() > m_stats.length - 1) {
        continue;
      }
      if (m == ArffSummaryNumericMetric.FIRSTQUARTILE
        || m == ArffSummaryNumericMetric.MEDIAN
        || m == ArffSummaryNumericMetric.THIRDQUARTILE) {
        if (Utils
          .isMissingValue(m_stats[ArffSummaryNumericMetric.FIRSTQUARTILE
            .ordinal()])
          && Utils.isMissingValue(m_stats[ArffSummaryNumericMetric.MEDIAN
            .ordinal()])
          && Utils
            .isMissingValue(m_stats[ArffSummaryNumericMetric.THIRDQUARTILE
              .ordinal()])) {
          continue;
        }
      }
      String v = m.makeAttributeValue(m_stats[m.ordinal()]);
      vals.add(v);
    }

    // histogram (if present)
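    // (encoded as a single string value of the form label1:freq1!label2:freq2!...)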
    if (m_binLabels != null && m_binLabels.size() > 0) {
      StringBuilder b = new StringBuilder();
      for (int i = 0; i < m_binLabels.size(); i++) {
        String v = m_binLabels.get(i) + ":" + m_binFreqs.get(i);
        b.append(v);
        if (i < m_binLabels.size() - 1) {
          b.append("!");
        }
      }
      vals.add(b.toString());
    }

    Attribute a =
      new Attribute(CSVToARFFHeaderMapTask.ARFF_SUMMARY_ATTRIBUTE_PREFIX
        + m_attributeName, vals);
    return a;
  }

  /**
   * Convert a summary meta attribute into a NumericStats object (does not
   * recover the internal TDigest quantile estimator)
   * 
   * @param a the summary meta attribute to convert
   * @return a NumericStats instance
   * @throws IllegalArgumentException if a problem occurs
   */
  public static NumericStats attributeToStats(Attribute a)
    throws IllegalArgumentException {
    if (!a.isNominal()) {
      throw new IllegalArgumentException("Stats attribute is not nominal!");
    }

    // we assume that either just the set of aggregateable stats will
    // be present or all the stats (i.e. + quartiles and histogram)
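    // (length + 1 = all stats plus the encoded histogram value; length - 3 =
    // just the aggregateable stats, without the three quartile entries)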
    if (a.numValues() != ArffSummaryNumericMetric.values().length + 1
      && a.numValues() != ArffSummaryNumericMetric.values().length - 3) {
      throw new IllegalArgumentException("Was expecting there to be either "
        + (ArffSummaryNumericMetric.values().length + 1) + " or "
        + (ArffSummaryNumericMetric.values().length - 3)
        + " values in a summary attribute, but found " + a.numValues());
    }

    double[] stats = new double[ArffSummaryNumericMetric.values().length];
    stats[ArffSummaryNumericMetric.MIN.ordinal()] = Utils.missingValue();
    stats[ArffSummaryNumericMetric.MAX.ordinal()] = Utils.missingValue();
    stats[ArffSummaryNumericMetric.FIRSTQUARTILE.ordinal()] =
      Utils.missingValue();
    stats[ArffSummaryNumericMetric.MEDIAN.ordinal()] = Utils.missingValue();
    stats[ArffSummaryNumericMetric.THIRDQUARTILE.ordinal()] =
      Utils.missingValue();

    for (ArffSummaryNumericMetric m : ArffSummaryNumericMetric.values()) {

      if (m.ordinal() < a.numValues()) {
        String v = a.value(m.ordinal());

        double value = m.toValue(v, m.toString());
        stats[m.ordinal()] = value;
      }
    }

    List<String> histLabs = null;
    List<Double> histFreqs = null;
    if (a.numValues() > ArffSummaryNumericMetric.values().length) {
      String hist = a.value(a.numValues() - 1);
      histLabs = new ArrayList<String>();
      histFreqs = new ArrayList<Double>();

      String[] parts = hist.split("!");
      for (String p : parts) {
        String[] entry = p.split(":");
        histLabs.add(entry[0]);
        histFreqs.add(Double.parseDouble(entry[1]));
      }
    }

    NumericStats s =
      new NumericStats(a.name().replace(
        CSVToARFFHeaderMapTask.ARFF_SUMMARY_ATTRIBUTE_PREFIX, ""));
    s.m_stats = stats;
    s.setHistogramData(histLabs, histFreqs);

    return s;
  }

  /**
   * Compute the derived statistics
   */
  public void computeDerived() {
    double count = m_stats[ArffSummaryNumericMetric.COUNT.ordinal()];
    double sum = m_stats[ArffSummaryNumericMetric.SUM.ordinal()];
    double sumSq = m_stats[ArffSummaryNumericMetric.SUMSQ.ordinal()];
    double mean = 0;

    double stdDev = 0;
    if (count > 0) {
      mean = sum / count;
      // stdDev = Double.POSITIVE_INFINITY;
      if (count > 1) {
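        // sample variance from the sufficient statistics:
        // (sumSq - sum * sum / count) / (count - 1)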
        stdDev = sumSq - (sum * sum) / count;
        stdDev /= (count - 1);
        if (stdDev < 0) {
          stdDev = 0;
        }
        stdDev = Math.sqrt(stdDev);
      }
    }

    m_stats[ArffSummaryNumericMetric.MEAN.ordinal()] = mean;
    m_stats[ArffSummaryNumericMetric.STDDEV.ordinal()] = stdDev;
  }

  /**
   * Computes the derived statistics plus quartiles and histogram data from
   * the quantile estimator. If quantiles are not being estimated then this is
   * equivalent to just calling computeDerived().
   */
  public void computeQuartilesAndHistogram() {
    computeDerived();

    if (m_quantileEstimator == null) {
      return;
    }

    m_stats[ArffSummaryNumericMetric.FIRSTQUARTILE.ordinal()] =
      m_quantileEstimator.quantile(0.25);
    m_stats[ArffSummaryNumericMetric.MEDIAN.ordinal()] =
      m_quantileEstimator.quantile(0.5);
    m_stats[ArffSummaryNumericMetric.THIRDQUARTILE.ordinal()] =
      m_quantileEstimator.quantile(0.75);

    double min = m_stats[ArffSummaryNumericMetric.MIN.ordinal()];
    double count = m_stats[ArffSummaryNumericMetric.COUNT.ordinal()];
    NumericAttributeBinData binData =
      new NumericAttributeBinData(m_attributeName, count, min,
        m_stats[ArffSummaryNumericMetric.MAX.ordinal()],
        m_stats[ArffSummaryNumericMetric.STDDEV.ordinal()],
        m_stats[ArffSummaryNumericMetric.MISSING.ordinal()], -1);

    // heuristic based on count & std. dev
    int numBins = binData.getNumBins();

    // Check against compression (another hokey heuristic)
    numBins = Math.min(numBins, (int) m_quantileCompression * 2 / 10);
    if (numBins != binData.getNumBins()) {
      binData =
        new NumericAttributeBinData(m_attributeName, count, min,
          m_stats[ArffSummaryNumericMetric.MAX.ordinal()],
          m_stats[ArffSummaryNumericMetric.STDDEV.ordinal()],
          m_stats[ArffSummaryNumericMetric.MISSING.ordinal()], numBins);
    }

    double binWidth = binData.getBinWidth();
    double prev = 0;

    for (int i = 0; i < numBins; i++) {
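      // Estimate this bin's frequency as the difference between the
      // estimated CDF at this bin's upper boundary and the previous one,
      // scaled up by the (weighted) count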
      double lower = min + (i * binWidth);
      double upper = min + ((i + 1) * binWidth);
      double midVal = lower + ((upper - lower) / 2.0);
      double cdf = m_quantileEstimator.cdf(upper);

      boolean ok = !Double.isInfinite(cdf) && !Double.isNaN(cdf);

      double freq = ok ? cdf : 0;
      if (i > 0 && ok) {
        freq = cdf - prev;
      }
      if (freq < 0) {
        freq = 0;
      }
      freq *= count;

      binData.addValue(midVal, freq);

      if (ok) {
        prev = cdf;
      }
    }

    m_binLabels = binData.getBinLabels();
    m_binFreqs = binData.getBinFreqs();
  }
}
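
A minimal usage sketch (not part of the source above): the class name NumericStatsExample, the attribute name and the data values are purely illustrative, and ArffSummaryNumericMetric is assumed to live in the same weka.core.stats package, as the unqualified references in the source suggest. It accumulates a handful of values, computes the derived stats, quartiles and histogram, and round-trips the result through the summary meta attribute.

import weka.core.Attribute;
import weka.core.stats.ArffSummaryNumericMetric;
import weka.core.stats.NumericStats;

public class NumericStatsExample {
  public static void main(String[] args) {
    // accumulate a few values with unit weight, updating the TDigest
    // quantile estimator as we go
    NumericStats stats =
      new NumericStats("petalwidth", NumericStats.Q_COMPRESSION);
    for (double v : new double[] { 0.2, 1.3, 2.1, 1.8, 0.4 }) {
      stats.update(v, 1.0, false, true);
    }

    // derive mean/stddev and estimate quartiles + histogram data
    stats.computeQuartilesAndHistogram();
    double[] s = stats.getStats();
    System.out.println("mean = "
      + s[ArffSummaryNumericMetric.MEAN.ordinal()] + ", median = "
      + s[ArffSummaryNumericMetric.MEDIAN.ordinal()]);

    // round-trip through the summary meta attribute representation
    Attribute summary = stats.makeAttribute();
    NumericStats recovered = NumericStats.attributeToStats(summary);
    System.out.println(recovered.getHistogramBinLabels());
  }
}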