weka.core.DictionaryBuilder Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of weka-dev Show documentation
The Waikato Environment for Knowledge Analysis (WEKA), a machine learning workbench. This version represents the developer version, the "bleeding edge" of development, you could say. New functionality gets added to this version.
There is a newer version: 3.9.6
Show newest version
/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    DictionaryBuilder.java
 *    Copyright (C) 2015 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.core;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.OutputStream;
import java.io.Reader;
import java.io.Serializable;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Enumeration;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.Vector;

import weka.core.stemmers.NullStemmer;
import weka.core.stemmers.Stemmer;
import weka.core.stopwords.Null;
import weka.core.stopwords.StopwordsHandler;
import weka.core.tokenizers.Tokenizer;
import weka.core.tokenizers.WordTokenizer;
import weka.gui.ProgrammaticProperty;

/**
 * Class for building and maintaining a dictionary of terms. Has methods for
 * loading, saving and aggregating dictionaries. Supports loading/saving in
 * binary and textual format. Textual format is expected to have one or two
 * comma separated values per line, in the format:
 * <p>
 *
 * <pre>
 * term [,doc_count]
 * </pre>
 *
 * where <code>doc_count</code> is the number of documents that the term has
 * occurred in.
 * 
 * @author Mark Hall (mhall{[at]}pentaho{[dot]}com)
 * @version $Revision: 15573 $
 */
public class DictionaryBuilder implements Aggregateable,
  OptionHandler, Serializable {

  /** For serialization */
  private static final long serialVersionUID = 5579506627960356012L;

  /** Input structure */
  protected Instances m_inputFormat;

  /** Output structure */
  protected Instances m_outputFormat;

  /**
   * Holds the dictionaries (one per class) that are compiled while processing.
   * NOTE(review): declared as a raw Map array here; the key/value types are
   * not visible in this declaration - confirm against the code that fills it.
   */
  protected Map[] m_dictsPerClass;

  /**
   * Holds the final dictionary that is consolidated across classes and pruned
   * according to m_wordsToKeep. First element of array contains the index of
   * the word. The second (optional) element contains the document count for the
   * word (i.e. number of training docs the word occurs in).
   * NOTE(review): the "array" mentioned above is presumably the map value;
   * the declaration is a raw Map, so this cannot be confirmed from here.
   */
  protected Map m_consolidatedDict;

  /**
   * Holds the tokenized input vector. Marked transient, so it is excluded
   * from default Java serialization of this object.
   */
  protected transient Map m_inputVector;

  /**
   * True if the final number of words to keep should not be applied on a per
   * class basis
   */
  protected boolean m_doNotOperateOnPerClassBasis;

  /** Whether to output frequency counts instead of presence indicators */
  protected boolean m_outputCounts;

  /** True if all tokens should be downcased. */
  protected boolean m_lowerCaseTokens;

  /** The stemming algorithm; initialised to a NullStemmer. */
  protected Stemmer m_stemmer = new NullStemmer();

  /** Stopword handler to use; initialised to the Null handler. */
  protected StopwordsHandler m_stopwordsHandler = new Null();

  /**
   * The default number of words (per class if there is a class attribute
   * assigned) to attempt to keep.
   */
  protected int m_wordsToKeep = 1000;

  /**
   * Prune dictionary (per class) of low freq terms after every x documents. 0 =
   * no periodic pruning
   */
  protected long m_periodicPruneRate;

  /** Minimum frequency to retain dictionary entries */
  protected int m_minFrequency = 1;

  /** Count of input vectors seen */
  protected int m_count = 0;

  /** The tokenizer algorithm to use; initialised to a WordTokenizer. */
  protected Tokenizer m_tokenizer = new WordTokenizer();

  /** Range of columns to convert to word vectors; initialised to "first-last". */
  protected Range m_selectedRange = new Range("first-last");

  /**
   * Holds the class index. Initialised to -1 (presumably meaning that no class
   * attribute is set - confirm against the code that assigns it).
   */
  protected int m_classIndex = -1;

  /** Number of classes */
  protected int m_numClasses = 1;

  /** A String prefix for the attribute names. */
  protected String m_Prefix = "";

  /** True if the TF transform is to be applied */
  protected boolean m_TFTransform;

  /** True if the IDF transform is to be applied */
  protected boolean m_IDFTransform;

  /** Whether to normalize to average length of training docs */
  protected boolean m_normalize;

  /** The sum of document lengths */
  protected double m_docLengthSum;

  /** The average document length */
  protected double m_avgDocLength;

  /** Whether to keep the dictionary(s) sorted alphabetically */
  protected boolean m_sortDictionary;

  /** True if the input data contains string attributes to convert */
  protected boolean m_inputContainsStringAttributes;

  /**
   * Stores the average document length that is used when normalizing.
   *
   * @param averageDocLength the average document length to store
   */
  @ProgrammaticProperty
  public void setAverageDocLength(double averageDocLength) {
    this.m_avgDocLength = averageDocLength;
  }

  /**
   * Returns the average document length that is used when normalizing.
   *
   * @return the currently stored average document length
   */
  public double getAverageDocLength() {
    return this.m_avgDocLength;
  }

  /**
   * Returns the tip text for the sortDictionary property.
   *
   * @return a short description of the property
   */
  public String sortDictionaryTipText() {
    final String tip = "Sort the dictionary alphabetically";
    return tip;
  }

  /**
   * Sets the flag that controls whether the dictionary is kept in alphabetical
   * order while it is being built. According to the original documentation, a
   * sorted dictionary is backed by a TreeMap, which is slower than the default
   * unsorted LinkedHashMap.
   *
   * @param sortDictionary true if the dictionary should be kept sorted
   */
  public void setSortDictionary(boolean sortDictionary) {
    this.m_sortDictionary = sortDictionary;
  }

  /**
   * Returns the flag that controls whether the dictionary is kept in
   * alphabetical order while it is being built.
   *
   * @return true if the dictionary is to be kept sorted
   */
  public boolean getSortDictionary() {
    return this.m_sortDictionary;
  }

  /**
   * Returns whether word counts, rather than 0/1 presence indicators, are
   * written to the output instances.
   *
   * @return true if word counts are output
   */
  public boolean getOutputWordCounts() {
    return this.m_outputCounts;
  }

  /**
   * Chooses between word counts and 0/1 presence indicators for the output
   * instances.
   *
   * @param outputWordCounts true if word counts should be output
   */
  public void setOutputWordCounts(boolean outputWordCounts) {
    this.m_outputCounts = outputWordCounts;
  }

  /**
   * Returns the tip text for the outputWordCounts property.
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String outputWordCountsTipText() {
    // The first fragment ends with a space so that the two parts do not run
    // together as "0 or 1(indicating ...".
    return "Output word counts rather than boolean 0 or 1 "
      + "(indicating presence or absence of a word).";
  }

  /**
   * Returns the Range object describing the selected attributes.
   *
   * @return the Range instance currently held by this builder
   */
  public Range getSelectedRange() {
    return this.m_selectedRange;
  }

  /**
   * Replaces the selected range with a new Range built from the supplied
   * string.
   * NOTE(review): a fresh Range instance is created, so settings made on the
   * previous instance (e.g. via setInvertSelection) are presumably not carried
   * over - confirm against Range.
   *
   * @param newSelectedRange the range specification to construct the Range from
   */
  public void setSelectedRange(String newSelectedRange) {
    Range replacement = new Range(newSelectedRange);
    m_selectedRange = replacement;
  }

  /**
   * Returns the tip text for the attributeIndices property.
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String attributeIndicesTipText() {
    final String tip = "Specify range of attributes to act on. "
      + "This is a comma separated list of attribute indices, with "
      + "\"first\" and \"last\" valid values. Specify an inclusive "
      + "range with \"-\". E.g: \"first-3,5,6-10,last\".";
    return tip;
  }

  /**
   * Returns the current range selection as reported by the selected Range.
   *
   * @return a string containing a comma separated list of ranges
   */
  public String getAttributeIndices() {
    String ranges = m_selectedRange.getRanges();
    return ranges;
  }

  /**
   * Updates the existing selected Range with a new list of attributes to work
   * on.
   *
   * @param rangeList a string representing the list of attributes. Since the
   *          string will typically come from a user, attributes are indexed
   *          from 1, e.g. first-3,5,6-last
   * @throws IllegalArgumentException if an invalid range list is supplied
   */
  public void setAttributeIndices(String rangeList) {
    this.m_selectedRange.setRanges(rangeList);
  }

  /**
   * Sets the attributes to process from an array of indices. The array is
   * converted to a range list string and handed to
   * {@link #setAttributeIndices(String)}.
   *
   * @param attributes an array containing indexes of attributes to process.
   *          Since the array will typically come from a program, attributes are
   *          indexed from 0.
   * @throws IllegalArgumentException if an invalid set of ranges is supplied
   */
  public void setAttributeIndicesArray(int[] attributes) {
    String rangeList = Range.indicesToRangeList(attributes);
    setAttributeIndices(rangeList);
  }

  /**
   * Returns the tip text for the invertSelection property.
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String invertSelectionTipText() {
    final String tip = "Set attribute selection mode. If false, only selected "
      + "attributes in the range will be worked on; if "
      + "true, only non-selected attributes will be processed.";
    return tip;
  }

  /**
   * Returns the invert setting of the selected Range.
   *
   * @return the invert flag currently held by the selected Range
   */
  public boolean getInvertSelection() {
    boolean inverted = m_selectedRange.getInvert();
    return inverted;
  }

  /**
   * Passes the invert setting on to the selected Range, which determines
   * whether the selected columns are processed or skipped.
   *
   * @param invert the new invert setting
   */
  public void setInvertSelection(boolean invert) {
    this.m_selectedRange.setInvert(invert);
  }

  /**
   * Returns the number of words (per class if there is a class attribute
   * assigned) that the builder attempts to keep.
   *
   * @return the target number of words in the output vector (per class if
   *         assigned)
   */
  public int getWordsToKeep() {
    return this.m_wordsToKeep;
  }

  /**
   * Stores the number of words (per class if there is a class attribute
   * assigned) that the builder should attempt to keep.
   *
   * @param newWordsToKeep the target number of words in the output vector (per
   *          class if assigned)
   */
  public void setWordsToKeep(int newWordsToKeep) {
    this.m_wordsToKeep = newWordsToKeep;
  }

  /**
   * Returns the tip text for the wordsToKeep property.
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String wordsToKeepTipText() {
    final String tip = "The number of words (per class if there is a class "
      + "attribute assigned) to attempt to keep.";
    return tip;
  }

  /**
   * Returns the periodic pruning rate, expressed as a number of instances.
   *
   * @return the rate at which the dictionary is periodically pruned
   */
  public long getPeriodicPruning() {
    return this.m_periodicPruneRate;
  }

  /**
   * Stores the periodic pruning rate, expressed as a number of instances.
   *
   * @param newPeriodicPruning the rate at which the dictionary is periodically
   *          pruned
   */
  public void setPeriodicPruning(long newPeriodicPruning) {
    this.m_periodicPruneRate = newPeriodicPruning;
  }

  /**
   * Returns the tip text for the periodicPruning property.
   * <p>
   * The text describes the rate as a number of instances, matching the
   * documentation of the pruning-rate field and of
   * {@link #getPeriodicPruning()} / {@link #setPeriodicPruning(long)}; the
   * previous wording ("x% of the input dataset") contradicted them.
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String periodicPruningTipText() {
    return "Specify the rate (number of instances) at which to periodically prune the dictionary; "
      + "0 means no periodic pruning. "
      + "wordsToKeep prunes after creating a full dictionary. You may not have enough "
      + "memory for this approach.";
  }

  /**
   * Returns whether word frequencies are transformed into log(1+fij), where
   * fij is the frequency of word i in document (instance) j.
   *
   * @return true if word frequencies are to be transformed
   */
  public boolean getTFTransform() {
    return this.m_TFTransform;
  }

  /**
   * Chooses whether word frequencies are transformed into log(1+fij), where
   * fij is the frequency of word i in document (instance) j.
   *
   * @param TFTransform true if word frequencies are to be transformed
   */
  public void setTFTransform(boolean TFTransform) {
    m_TFTransform = TFTransform;
  }

  /**
   * Returns the tip text for the TFTransform property.
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String TFTransformTipText() {
    final String tip =
      "Sets whether if the word frequencies should be transformed into:\n"
        + "    log(1+fij) \n"
        + "       where fij is the frequency of word i in document (instance) j.";
    return tip;
  }

  /**
   * Returns the prefix used for attribute names.
   *
   * @return the current attribute name prefix
   */
  public String getAttributeNamePrefix() {
    return this.m_Prefix;
  }

  /**
   * Stores the prefix to use for attribute names.
   *
   * @param newPrefix the string to use as the attribute name prefix
   */
  public void setAttributeNamePrefix(String newPrefix) {
    this.m_Prefix = newPrefix;
  }

  /**
   * Returns the tip text for the attributeNamePrefix property.
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String attributeNamePrefixTipText() {
    final String tip = "Prefix for the created attribute names. (default: \"\")";
    return tip;
  }

  /**
   * Returns whether the word frequencies in a document are transformed into
   * fij*log(num of Docs/num of Docs with word i), where fij is the frequency
   * of word i in document (instance) j.
   *
   * @return true if the word frequencies are to be transformed
   */
  public boolean getIDFTransform() {
    return m_IDFTransform;
  }

  /**
   * Chooses whether the word frequencies in a document are transformed into
   * fij*log(num of Docs/num of Docs with word i), where fij is the frequency
   * of word i in document (instance) j.
   *
   * @param IDFTransform true if the word frequencies are to be transformed
   */
  public void setIDFTransform(boolean IDFTransform) {
    m_IDFTransform = IDFTransform;
  }

  /**
   * Returns the tip text for the IDFTransform property.
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String IDFTransformTipText() {
    final String tip =
      "Sets whether if the word frequencies in a document should be transformed into: \n"
        + "   fij*log(num of Docs/num of Docs with word i) \n"
        + "      where fij is the frequency of word i in document (instance) j.";
    return tip;
  }

  /**
   * Returns whether word frequencies for a document are normalized.
   *
   * @return true if word frequencies should be normalized
   */
  public boolean getNormalize() {
    return this.m_normalize;
  }

  /**
   * Chooses whether word frequencies for a document are normalized.
   *
   * @param n true if word frequencies should be normalized
   */
  public void setNormalize(boolean n) {
    this.m_normalize = n;
  }

  /**
   * Returns the tip text for the normalize property.
   *
   * @return the tip text for this property
   */
  public String normalizeTipText() {
    final String tip = "Whether word frequencies for a document (instance) "
      + "should be normalized or not";
    return tip;
  }

  /**
   * Returns a tip text describing document-length normalization.
   *
   * @return tip text suitable for displaying in the explorer/experimenter gui
   */
  public String normalizeDocLengthTipText() {
    final String tip = "Sets whether if the word frequencies for a document "
      + "(instance) should be normalized or not.";
    return tip;
  }

  /**
   * Returns whether tokens are converted to lower case.
   *
   * @return true if the tokens are to be downcased
   */
  public boolean getLowerCaseTokens() {
    return m_lowerCaseTokens;
  }

  /**
   * Chooses whether tokens are converted to lower case. (Doesn't affect
   * non-alphabetic characters in tokens.)
   *
   * @param downCaseTokens true if only lower case tokens are to be formed
   */
  public void setLowerCaseTokens(boolean downCaseTokens) {
    m_lowerCaseTokens = downCaseTokens;
  }

  /**
   * Returns the tip text for the lowerCaseTokens property.
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String lowerCaseTokensTipText() {
    final String tip = "If set then all the word tokens are converted to "
      + "lower case before being added to the dictionary.";
    return tip;
  }

  /**
   * Returns the tip text for the doNotOperateOnPerClassBasis property.
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String doNotOperateOnPerClassBasisTipText() {
    final String tip = "If this is set, the maximum number of words and "
      + "the minimum term frequency is not enforced on a "
      + "per-class basis but based on the documents in all "
      + "the classes (even if a class attribute is set).";
    return tip;
  }

  /**
   * Returns the flag that disables per-class application of the number of
   * words to keep.
   *
   * @return the DoNotOperateOnPerClassBasis value
   */
  public boolean getDoNotOperateOnPerClassBasis() {
    return this.m_doNotOperateOnPerClassBasis;
  }

  /**
   * Stores the flag that disables per-class application of the number of words
   * to keep.
   *
   * @param newDoNotOperateOnPerClassBasis the new DoNotOperateOnPerClassBasis
   *          value
   */
  public void setDoNotOperateOnPerClassBasis(
    boolean newDoNotOperateOnPerClassBasis) {
    m_doNotOperateOnPerClassBasis = newDoNotOperateOnPerClassBasis;
  }

  /**
   * Returns the tip text for the minTermFreq property.
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String minTermFreqTipText() {
    final String tip =
      "Sets the minimum term frequency. This is enforced on a per-class basis.";
    return tip;
  }

  /**
   * Returns the minimum frequency required to retain dictionary entries.
   *
   * @return the MinTermFreq value
   */
  public int getMinTermFreq() {
    return this.m_minFrequency;
  }

  /**
   * Stores the minimum frequency required to retain dictionary entries.
   *
   * @param newMinTermFreq the new MinTermFreq value
   */
  public void setMinTermFreq(int newMinTermFreq) {
    this.m_minFrequency = newMinTermFreq;
  }

  /**
   * Returns the current stemming algorithm. The field is initialised to a
   * NullStemmer and {@link #setStemmer(Stemmer)} substitutes a NullStemmer for
   * null, so "no stemming" is represented by a NullStemmer instance.
   *
   * @return the current stemming algorithm
   */
  public Stemmer getStemmer() {
    return this.m_stemmer;
  }

  /**
   * Sets the stemming algorithm to use. Passing null means no stemming at all:
   * a new NullStemmer is stored instead.
   *
   * @param value the configured stemming algorithm, or null
   * @see NullStemmer
   */
  public void setStemmer(Stemmer value) {
    m_stemmer = (value == null) ? new NullStemmer() : value;
  }

  /**
   * Returns the tip text for the stemmer property.
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String stemmerTipText() {
    final String tip = "The stemming algorithm to use on the words.";
    return tip;
  }

  /**
   * Returns the stopwords handler currently in use.
   *
   * @return the stopwords handler
   */
  public StopwordsHandler getStopwordsHandler() {
    return this.m_stopwordsHandler;
  }

  /**
   * Sets the stopwords handler to use. Passing null stores a new Null handler
   * instead.
   *
   * @param value the stopwords handler, if null, Null is used
   */
  public void setStopwordsHandler(StopwordsHandler value) {
    m_stopwordsHandler = (value == null) ? new Null() : value;
  }

  /**
   * Returns the tip text for the stopwordsHandler property.
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String stopwordsHandlerTipText() {
    final String tip =
      "The stopwords handler to use (Null means no stopwords are used).";
    return tip;
  }

  /**
   * Returns the tokenizer algorithm currently in use.
   *
   * @return the current tokenizer algorithm
   */
  public Tokenizer getTokenizer() {
    return this.m_tokenizer;
  }

  /**
   * Sets the tokenizer algorithm to use. For consistency with
   * {@link #setStemmer(Stemmer)} and
   * {@link #setStopwordsHandler(StopwordsHandler)}, passing null does not store
   * null but falls back to a new WordTokenizer (the same type the field is
   * initialised with).
   *
   * @param value the configured tokenizing algorithm, or null to use a
   *          WordTokenizer
   * @see WordTokenizer
   */
  public void setTokenizer(Tokenizer value) {
    if (value != null) {
      m_tokenizer = value;
    } else {
      // Mirror the field's initial value rather than leaving a null tokenizer.
      m_tokenizer = new WordTokenizer();
    }
  }

  /**
   * Returns the tip text for the tokenizer property.
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String tokenizerTipText() {
    final String tip = "The tokenizing algorithm to use on the strings.";
    return tip;
  }

  /**
   * Returns an enumeration describing the available options.
   * 
   * @return an enumeration of all the available options
   */
  @Override
  public Enumeration