All downloads are free. Search and download functionality uses the official Maven repository.

weka.core.stats.StringStats Maven / Gradle / Ivy

Go to download

This package provides a generic configuration class and distributed map/reduce-style tasks for Weka.

There is a newer version: 1.0.17
Show newest version
/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    StringStats
 *    Copyright (C) 2014 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.core.stats;

import java.io.Serializable;
import java.util.ArrayList;

import weka.core.Attribute;
import weka.core.Utils;
import weka.core.tokenizers.WordTokenizer;
import weka.distributed.CSVToARFFHeaderMapTask;
import distributed.core.DistributedJobConfig;

/**
 * Class for computing string-related stats. Computes counts, means, min max
 * etc. for string length and word count. Internally two {@link NumericStats}
 * objects are maintained: one is updated with the length of each string and
 * the other with the number of tokens found in each string.
 * 
 * @author Mark Hall (mhall{[at]}pentaho{[dot]}com)
 * 
 */
public class StringStats extends Stats implements Serializable {

  /** ID prefix for string length stats */
  protected static final String STRLEN_PREFIX = "strlen_";

  /** ID prefix for word count stats */
  protected static final String WORDC_PREFIX = "wordc_";

  /** For serialization */
  private static final long serialVersionUID = -743216387263285114L;

  /** The NumericStats to keep track of string length stats */
  protected NumericStats m_stringLengthStats;

  /** The NumericStats to keep track of word count stats */
  protected NumericStats m_wordStats;

  /** The tokenizer used to count the words in each string value */
  protected StringStats.CountWordTokenizer m_tokenizer =
    new CountWordTokenizer();

  /** The (weighted) count of missing values for this field */
  protected double m_missingCount;

  /**
   * Constructs a new StringStats
   * 
   * @param attributeName the name of the attribute/field to collect stats for
   */
  public StringStats(String attributeName) {
    super(attributeName);

    // both underlying stats objects are keyed on the same attribute name
    m_stringLengthStats = new NumericStats(attributeName);
    m_wordStats = new NumericStats(attributeName);
  }

  /**
   * Get the underlying NumericStats object that is tracking string length stats
   * 
   * @return the string length stats
   */
  public NumericStats getStringLengthStats() {
    return m_stringLengthStats;
  }

  /**
   * Get the underlying NumericStats object that is tracking word count stats
   * 
   * @return the word count stats
   */
  public NumericStats getWordCountStats() {
    return m_wordStats;
  }

  /**
   * Makes a single summary attribute that holds both the string length and the
   * word count stats. The labels of the string length summary attribute are
   * added first, each prefixed with {@link #STRLEN_PREFIX}, followed by the
   * labels of the word count summary attribute, each prefixed with
   * {@link #WORDC_PREFIX}.
   * 
   * @return a summary attribute named with the ARFF summary attribute prefix
   *         followed by the name of the attribute these stats are for
   */
  @Override
  public Attribute makeAttribute() {

    ArrayList<String> vals = new ArrayList<String>();
    Attribute stringStats = m_stringLengthStats.makeAttribute();
    Attribute wordStats = m_wordStats.makeAttribute();

    for (int i = 0; i < stringStats.numValues(); i++) {
      vals.add(STRLEN_PREFIX + stringStats.value(i));
    }

    for (int i = 0; i < wordStats.numValues(); i++) {
      vals.add(WORDC_PREFIX + wordStats.value(i));
    }

    return new Attribute(CSVToARFFHeaderMapTask.ARFF_SUMMARY_ATTRIBUTE_PREFIX
      + m_attributeName, vals);
  }

  /**
   * Update with a new value. A value for which
   * {@code DistributedJobConfig.isEmpty()} returns true is treated as missing:
   * the missing count is incremented by the weight and both underlying stats
   * objects are updated with a missing value. Otherwise the string length
   * stats are updated with the length of the value and the word count stats
   * with the number of tokens the tokenizer reports for the value.
   * 
   * @param value the string value to update with
   * @param weight the weight for the update
   */
  public void update(String value, double weight) {
    if (DistributedJobConfig.isEmpty(value)) {
      m_missingCount += weight;
      m_stringLengthStats.update(Utils.missingValue(), weight, false, false);
      m_wordStats.update(Utils.missingValue(), weight, false, false);
    } else {
      m_stringLengthStats.update(value.length(), weight, false, false);

      // tokenize() has to be called before countTokens() so that the count
      // refers to the current value
      m_tokenizer.tokenize(value);
      m_wordStats.update(m_tokenizer.countTokens(), weight, false, false);
    }
  }

  /**
   * Compute derived statistics - e.g. mean, standard deviation. Delegates to
   * both the string length and the word count stats.
   */
  public void computeDerived() {
    m_stringLengthStats.computeDerived();
    m_wordStats.computeDerived();
  }

  /**
   * Convert a meta summary attribute containing string stats into a StringStats
   * object. Labels starting with {@link #STRLEN_PREFIX} are routed to the
   * string length stats and labels starting with {@link #WORDC_PREFIX} to the
   * word count stats; labels with neither prefix are ignored. Only the leading
   * prefix is removed from a label or from the attribute name, so names that
   * happen to contain a prefix string further along are left intact.
   * 
   * @param a a meta summary attribute
   * @return a StringStats object
   */
  public static StringStats attributeToStats(Attribute a) {

    ArrayList<String> strLenVals = new ArrayList<String>();
    ArrayList<String> wordVals = new ArrayList<String>();

    for (int i = 0; i < a.numValues(); i++) {
      String label = a.value(i);
      if (label.startsWith(STRLEN_PREFIX)) {
        strLenVals.add(label.substring(STRLEN_PREFIX.length()));
      } else if (label.startsWith(WORDC_PREFIX)) {
        wordVals.add(label.substring(WORDC_PREFIX.length()));
      }
    }

    // rebuild one plain numeric summary attribute per kind of stat so that
    // NumericStats can parse each of them
    Attribute strLenA = new Attribute(a.name(), strLenVals);
    Attribute wordCA = new Attribute(a.name(), wordVals);
    NumericStats strLenStats = NumericStats.attributeToStats(strLenA);
    NumericStats wordCStats = NumericStats.attributeToStats(wordCA);

    StringStats strStats =
      new StringStats(stripLeadingPrefix(a.name(),
        CSVToARFFHeaderMapTask.ARFF_SUMMARY_ATTRIBUTE_PREFIX));
    strStats.m_stringLengthStats = strLenStats;
    strStats.m_wordStats = wordCStats;

    // the missing count is read back from the string length summary
    strStats.m_missingCount =
      ArffSummaryNumericMetric.MISSING.valueFromAttribute(strLenA);

    return strStats;
  }

  /**
   * Removes a prefix from the start of a string, if it is present there.
   * 
   * @param s the string to strip
   * @param prefix the prefix to remove
   * @return s without the leading prefix, or s unchanged if it does not start
   *         with the prefix
   */
  private static String stripLeadingPrefix(String s, String prefix) {
    return s.startsWith(prefix) ? s.substring(prefix.length()) : s;
  }

  /**
   * Extends the standard Weka WordTokenizer to expose the countTokens() method
   * in the underlying Tokenizer
   * 
   * @author Mark Hall (mhall{[at]}pentaho{[dot]}com)
   */
  protected static class CountWordTokenizer extends WordTokenizer {
    /** For serialization */
    private static final long serialVersionUID = -7144386321420719962L;

    /**
     * Get the number of tokens, as reported by the underlying tokenizer.
     * NOTE(review): presumably tokenize() must have been called first so that
     * the underlying tokenizer is set - confirm against WordTokenizer.
     * 
     * @return the number of tokens in the currently set string to be tokenized
     */
    public int countTokens() {
      return m_Tokenizer.countTokens();
    }
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy