All Downloads are FREE. Search and download functionalities are using the official Maven repository.

weka.core.stats.NumericAttributeBinData Maven / Gradle / Ivy

/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    NumericAttributeBinData
 *    Copyright (C) 2014 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.core.stats;

import java.util.ArrayList;
import java.util.List;

import weka.core.Aggregateable;
import weka.core.Attribute;
import weka.core.Utils;

/**
 * Class for managing bin data for a histogram based on an attribute.
 * <p>
 * A histogram is a sequence of bins, each described by its upper cut point
 * and a (weighted) frequency. Bins are either derived from summary statistics
 * (equal-width bins whose count is chosen by {@link #numBinsHeuristic}) or
 * read back from labels/frequencies that were computed earlier. Histograms
 * for the same attribute and with the same number of bins can be combined via
 * {@link #aggregate(NumericAttributeBinData)}.
 * 
 * @author Mark Hall (mhall{[at]}pentaho{[dot]}com)
 * @version $Revision: 11019 $
 */
public class NumericAttributeBinData implements
  Aggregateable<NumericAttributeBinData> {

  /** Maximum bins to create */
  public static final int MAX_BINS = 15;

  /** The name of the attribute */
  protected String m_attName;

  /** Holds the number of bins chosen for this attribute */
  protected int m_numBins;

  /** Holds the width of each bin (equal width bins) */
  protected double m_binWidth;

  /** The cut points (upper bounds) for the bins */
  protected double[] m_binCutpoints;

  /** The frequency of each bin */
  protected double[] m_binFreqs;

  /** The frequency of missing value */
  protected double m_missingFreq;

  /**
   * Constructor that reads the summary statistics (and any histogram data
   * already present) from a summary attribute.
   * 
   * @param attName the name of the attribute
   * @param summaryAtt the summary attribute containing the bin cutpoints and
   *          frequencies
   * @param maxBins the maximum number of bins to allow (if setting bin
   *          cutpoints based on range and overall count) or -1 to use the
   *          default max.
   */
  public NumericAttributeBinData(String attName, Attribute summaryAtt,
    int maxBins) {
    m_attName = attName;

    double missingFreq =
      ArffSummaryNumericMetric.MISSING.valueFromAttribute(summaryAtt);

    double numPoints =
      ArffSummaryNumericMetric.COUNT.valueFromAttribute(summaryAtt);
    double min = ArffSummaryNumericMetric.MIN.valueFromAttribute(summaryAtt);
    double max = ArffSummaryNumericMetric.MAX.valueFromAttribute(summaryAtt);
    double stdDev =
      ArffSummaryNumericMetric.STDDEV.valueFromAttribute(summaryAtt);

    NumericStats stats = NumericStats.attributeToStats(summaryAtt);
    List<String> binLabels = stats.getHistogramBinLabels();
    List<Double> binFreqs = stats.getHistogramFrequencies();

    setup(attName, numPoints, min, max, stdDev, missingFreq, binLabels,
      binFreqs, maxBins);
  }

  /**
   * Constructor that derives equal-width bins from summary statistics. All
   * bin frequencies start at zero.
   * 
   * @param attName the name of the attribute
   * @param numPoints the number of points that have been seen for this
   *          attribute
   * @param min the minimum value
   * @param max the maximum value
   * @param stdDev the standard deviation
   * @param missingFreq the number of missing values
   * @param maxBins the maximum number of bins to allow (if setting bin
   *          cutpoints based on range and overall count) or -1 to use the
   *          default max.
   */
  public NumericAttributeBinData(String attName, double numPoints,
    double min, double max, double stdDev, double missingFreq, int maxBins) {
    // Record the name here as well: setup() does not store it, and without
    // it getAttributeName() returns null and aggregate() fails with a
    // NullPointerException.
    m_attName = attName;

    setup(attName, numPoints, min, max, stdDev, missingFreq, null, null,
      maxBins);
  }

  /**
   * Set up histogram. If both {@code binLabels} and {@code binFreqs} are
   * null, equal-width bins are computed from the supplied statistics and all
   * frequencies are zero; otherwise cut points and frequencies are taken from
   * the two lists.
   * 
   * @param attName the name of the attribute (stored by the constructors; not
   *          used in this method)
   * @param numPoints the number of points seen
   * @param min the minimum
   * @param max the maximum
   * @param stdDev the standard deviation
   * @param missingFreq the number of missing values
   * @param binLabels a list of bin labels to use (may be null). Each label is
   *          a cut point optionally followed by "]", i.e. the format produced
   *          by {@link #getBinLabels()}
   * @param binFreqs a list of frequencies corresponding to bins (may be null)
   * @param maxBins the maximum number of bins to allow (if setting bin
   *          cutpoints based on range and overall count) or -1 to use the
   *          default max.
   */
  protected void setup(String attName, double numPoints, double min,
    double max, double stdDev, double missingFreq, List<String> binLabels,
    List<Double> binFreqs, int maxBins) {

    m_missingFreq = missingFreq;

    if (binLabels == null && binFreqs == null) {
      // No precomputed histogram: choose the bin count heuristically. With no
      // observed points there is nothing to bin, so use zero bins.
      int numBins =
        numPoints > 0 ?
          numBinsHeuristic(stdDev, numPoints, min, max, maxBins > 0 ? maxBins
            : MAX_BINS) : 0;

      m_binCutpoints = new double[numBins];
      m_binFreqs = new double[numBins];

      double step = 0;
      if (numBins > 0) {
        double range = max - min;
        step = range / numBins;

        for (int i = 0; i < numBins; i++) {
          if (i == numBins - 1) {
            // The last cut point is max itself rather than min + n * step
            // (presumably so that rounding in the step arithmetic cannot
            // leave the maximum outside the final bin).
            m_binCutpoints[i] = max;
          } else {
            m_binCutpoints[i] = min + ((i + 1) * step);
          }
        }
      }

      m_numBins = numBins;
      m_binWidth = step;
    } else {
      m_numBins = binLabels.size();
      m_binCutpoints = new double[m_numBins];
      m_binFreqs = new double[m_numBins];

      for (int i = 0; i < m_numBins; i++) {
        // Labels carry a trailing "]"; strip it to recover the cut point
        String l = binLabels.get(i).replace("]", "");
        m_binCutpoints[i] = Double.parseDouble(l);

        m_binFreqs[i] = binFreqs.get(i);
      }

      // With a single bin there are no adjacent cut points to difference, so
      // the whole range is used as the width
      m_binWidth =
        m_numBins > 1 ? m_binCutpoints[1] - m_binCutpoints[0] : max - min;
    }
  }

  /**
   * Get the number of bins for this attribute
   * 
   * @return the number of bins
   */
  public int getNumBins() {
    return m_numBins;
  }

  /**
   * Get the bin width for this attribute
   * 
   * @return the bin width for this attribute
   */
  public double getBinWidth() {
    return m_binWidth;
  }

  /**
   * Get a list of bin labels for this histogram. Each label is the bin's cut
   * point formatted via {@code Utils.doubleToString(c, 3)} followed by "]".
   * 
   * @return a new list of bin labels, one per bin
   */
  public List<String> getBinLabels() {
    List<String> labs = new ArrayList<String>(m_binCutpoints.length);

    for (double c : m_binCutpoints) {
      labs.add(Utils.doubleToString(c, 3) + "]");
    }

    return labs;
  }

  /**
   * Get a list of bin frequencies for this histogram
   * 
   * @return a new list of bin frequencies, one per bin
   */
  public List<Double> getBinFreqs() {
    List<Double> freqs = new ArrayList<Double>(m_binFreqs.length);

    for (double f : m_binFreqs) {
      freqs.add(f);
    }

    return freqs;
  }

  /**
   * Get the number of missing values
   * 
   * @return the number of missing values
   */
  public double getMissingFreq() {
    return m_missingFreq;
  }

  /**
   * Get the name of the attribute that this histogram is for
   * 
   * @return the name of the attribute that this histogram is for
   */
  public String getAttributeName() {
    return m_attName;
  }

  /**
   * Add a value to the histogram. Finds the first bin whose cut point is
   * greater than or equal to the value and increases its frequency by the
   * weight. A value that is larger than every cut point (or is NaN) matches
   * no bin and is not counted.
   * 
   * @param value the value to add
   * @param weight the weight
   */
  public void addValue(double value, double weight) {
    for (int i = 0; i < m_binCutpoints.length; i++) {
      if (value <= m_binCutpoints[i]) {
        m_binFreqs[i] += weight;
        break;
      }
    }
  }

  /**
   * Textual description of the histogram: one "label : frequency" line per
   * bin.
   * 
   * @return the histogram as a string
   */
  @Override
  public String toString() {
    StringBuilder b = new StringBuilder();

    List<String> labs = getBinLabels();
    for (int i = 0; i < m_binCutpoints.length; i++) {
      b.append(labs.get(i)).append(" : ").append(m_binFreqs[i]).append("\n");
    }

    return b.toString();
  }

  /**
   * Add the bin frequencies of another histogram to this one. Missing value
   * counts are not combined.
   * 
   * @param b the histogram to aggregate into this one
   * @return this histogram
   * @throws Exception if the histograms are for different attributes or have
   *           differing numbers of bins
   */
  @Override
  public NumericAttributeBinData aggregate(NumericAttributeBinData b)
    throws Exception {

    if (!b.m_attName.equals(m_attName)) {
      throw new Exception(
        "Can't aggregate histograms for different attributes!");
    }

    if (b.m_binCutpoints.length != m_binCutpoints.length) {
      throw new Exception("Can't aggregate histogram data for attribute '"
        + m_attName + "' - differing numbers of bins");
    }

    // don't aggregate missing as this is already global (computed on
    // the first pass over the data)

    for (int i = 0; i < m_binFreqs.length; i++) {
      m_binFreqs[i] += b.m_binFreqs[i];
    }

    return this;
  }

  @Override
  public void finalizeAggregation() throws Exception {
    // Nothing to do
  }

  /**
   * Compute the number of bins for a histogram given summary stats. The
   * interval width is taken as {@code 3.49 * stdDev * numPoints^(-1/3)} and
   * the number of bins is the range divided by that width, rounded, and then
   * limited to the interval [1, maxBins].
   * 
   * @param stdDev the standard deviation of the variable in question
   * @param numPoints the number of observed data points
   * @param min the minimum value
   * @param max the maximum
   * @param maxBins the maximum number of bins to allow
   * @return the number of bins
   */
  public static int numBinsHeuristic(double stdDev, double numPoints,
    double min, double max, int maxBins) {
    double intervalWidth =
      3.49 * stdDev * StrictMath.pow(numPoints, (-1.0 / 3.0));
    double range = max - min;

    // StrictMath.round yields a long (0 for NaN, Long.MAX_VALUE for positive
    // infinity). Clamp while still a long: narrowing to int first would wrap
    // ratios above Integer.MAX_VALUE to negative or arbitrary values and so
    // report 1 bin where maxBins is the right answer.
    long numBins = StrictMath.max(1L, StrictMath.round(range / intervalWidth));

    if (numBins > maxBins) {
      numBins = maxBins;
    }

    // Safe: numBins is now either maxBins or a value in [1, maxBins]
    return (int) numBins;
  }

  /**
   * Small demonstration: builds a histogram from hard-coded summary
   * statistics and prints it.
   * 
   * @param args ignored
   */
  public static void main(String[] args) {
    try {
      double count = 4898430;
      double min = 0;
      double max = 1.379963888E9;
      double missing = 0;
      double stdDev = 941431.170584845;

      NumericAttributeBinData b =
        new NumericAttributeBinData("test", count, min, max, stdDev, missing,
          -1);
      System.out.println(b);
    } catch (Exception ex) {
      ex.printStackTrace();
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy