All Downloads are FREE. Search and download functionalities are using the official Maven repository.

weka.core.DictionaryBuilder Maven / Gradle / Ivy

/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see http://www.gnu.org/licenses/.
 */

/*
 *    DictionaryBuilder.java
 *    Copyright (C) 2015 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.core;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.OutputStream;
import java.io.Reader;
import java.io.Serializable;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Enumeration;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.Vector;

import weka.core.stemmers.NullStemmer;
import weka.core.stemmers.Stemmer;
import weka.core.stopwords.Null;
import weka.core.stopwords.StopwordsHandler;
import weka.core.tokenizers.Tokenizer;
import weka.core.tokenizers.WordTokenizer;
import weka.gui.ProgrammaticProperty;

/**
 * Class for building and maintaining a dictionary of terms. Has methods for
 * loading, saving and aggregating dictionaries. Supports loading/saving in
 * binary and textual format. Textual format is expected to have one or two
 * comma separated values per line of the format:
 *
 * {@code term [,doc_count]}
 *
 * where
 *
 * {@code doc_count}
 *
*
 * is the number of documents that the term has occurred in.
 *
 * @author Mark Hall (mhall{[at]}pentaho{[dot]}com)
 * @version $Revision: 13725 $
 */
public class DictionaryBuilder implements Aggregateable, OptionHandler,
  Serializable {

  /** For serialization */
  private static final long serialVersionUID = 5579506627960356012L;

  /** Input structure */
  protected Instances m_inputFormat;

  /** Output structure */
  protected Instances m_outputFormat;

  /**
   * Holds the dictionaries (one per class) that are compiled while processing.
   * NOTE(review): declared as an array of raw Maps; the key/value types are
   * not visible in this declaration - confirm against the code that fills it.
   */
  protected Map[] m_dictsPerClass;

  /**
   * Holds the final dictionary that is consolidated across classes and pruned
   * according to m_wordsToKeep. First element of array contains the index of
   * the word. The second (optional) element contains the document count for the
   * word (i.e. number of training docs the word occurs in).
   * NOTE(review): the map is declared raw; the "array" mentioned above is
   * presumably the map value - confirm.
   */
  protected Map m_consolidatedDict;

  /**
   * Holds the tokenized input vector. Marked transient, so it is not written
   * out when this object is serialized.
   */
  protected transient Map m_inputVector;

  /**
   * True if the final number of words to keep should not be applied on a per
   * class basis
   */
  protected boolean m_doNotOperateOnPerClassBasis;

  /** Whether to output frequency counts instead of presence indicators */
  protected boolean m_outputCounts;

  /** True if all tokens should be downcased. */
  protected boolean m_lowerCaseTokens;

  /** The stemming algorithm; defaults to a NullStemmer. */
  protected Stemmer m_stemmer = new NullStemmer();

  /** Stopword handler to use; defaults to the Null handler. */
  protected StopwordsHandler m_stopwordsHandler = new Null();

  /**
   * The default number of words (per class if there is a class attribute
   * assigned) to attempt to keep.
   */
  protected int m_wordsToKeep = 1000;

  /**
   * Prune dictionary (per class) of low freq terms after every x documents. 0 =
   * no periodic pruning
   */
  protected long m_periodicPruneRate;

  /** Minimum frequency to retain dictionary entries */
  protected int m_minFrequency = 1;

  /** Count of input vectors seen */
  protected int m_count = 0;

  /** The tokenizer algorithm to use; defaults to a WordTokenizer.
*/
  protected Tokenizer m_tokenizer = new WordTokenizer();

  /** The columns that get converted to word vectors. */
  protected Range m_selectedRange = new Range("first-last");

  /** Index of the class attribute. */
  protected int m_classIndex = -1;

  /** How many classes there are. */
  protected int m_numClasses = 1;

  /** Prefix prepended to the attribute names. */
  protected String m_Prefix = "";

  /** Flag: apply the TF transform. */
  protected boolean m_TFTransform;

  /** Flag: apply the IDF transform. */
  protected boolean m_IDFTransform;

  /** Flag: normalize to the average length of the training docs. */
  protected boolean m_normalize;

  /** Running sum of document lengths. */
  protected double m_docLengthSum;

  /** Average document length. */
  protected double m_avgDocLength;

  /** Flag: keep the dictionary(s) sorted alphabetically. */
  protected boolean m_sortDictionary;

  /** Flag: the input data contains string attributes to convert. */
  protected boolean m_inputContainsStringAttributes;

  /**
   * Stores the average document length that is used when normalizing.
   *
   * @param averageDocLength the average document length to use
   */
  @ProgrammaticProperty
  public void setAverageDocLength(double averageDocLength) {
    this.m_avgDocLength = averageDocLength;
  }

  /**
   * Reports the average document length that is used when normalizing.
   *
   * @return the average document length
   */
  public double getAverageDocLength() {
    return this.m_avgDocLength;
  }

  /**
   * Tip text for the sortDictionary property.
   *
   * @return the tip text for this property
   */
  public String sortDictionaryTipText() {
    final String tip = "Sort the dictionary alphabetically";
    return tip;
  }

  /**
   * Set whether to keep the dictionary sorted alphabetically as it is built.
   * Setting this to true uses a TreeMap internally (which is slower than the
   * default unsorted LinkedHashMap).
* * @param sortDictionary true to keep the dictionary sorted alphabetically */ public void setSortDictionary(boolean sortDictionary) { m_sortDictionary = sortDictionary; } /** * Get whether to keep the dictionary sorted alphabetically as it is built. * Setting this to true uses a TreeMap internally (which is slower than the * default unsorted LinkedHashMap). * * @return true to keep the dictionary sorted alphabetically */ public boolean getSortDictionary() { return m_sortDictionary; } /** * Gets whether output instances contain 0 or 1 indicating word presence, or * word counts. * * @return true if word counts should be output. */ public boolean getOutputWordCounts() { return m_outputCounts; } /** * Sets whether output instances contain 0 or 1 indicating word presence, or * word counts. * * @param outputWordCounts true if word counts should be output. */ public void setOutputWordCounts(boolean outputWordCounts) { m_outputCounts = outputWordCounts; } /** * Returns the tip text for this property. * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String outputWordCountsTipText() { return "Output word counts rather than boolean 0 or 1" + "(indicating presence or absence of a word)."; } /** * Get the value of m_SelectedRange. * * @return Value of m_SelectedRange. */ public Range getSelectedRange() { return m_selectedRange; } /** * Set the value of m_SelectedRange. * * @param newSelectedRange Value to assign to m_SelectedRange. */ public void setSelectedRange(String newSelectedRange) { m_selectedRange = new Range(newSelectedRange); } /** * Returns the tip text for this property. * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String attributeIndicesTipText() { return "Specify range of attributes to act on." + " This is a comma separated list of attribute indices, with" + " \"first\" and \"last\" valid values. Specify an inclusive" + " range with \"-\". 
E.g: \"first-3,5,6-10,last\"."; } /** * Gets the current range selection. * * @return a string containing a comma separated list of ranges */ public String getAttributeIndices() { return m_selectedRange.getRanges(); } /** * Sets which attributes are to be worked on. * * @param rangeList a string representing the list of attributes. Since the * string will typically come from a user, attributes are indexed * from 1.
* eg: first-3,5,6-last * @throws IllegalArgumentException if an invalid range list is supplied */ public void setAttributeIndices(String rangeList) { m_selectedRange.setRanges(rangeList); } /** * Sets which attributes are to be processed. * * @param attributes an array containing indexes of attributes to process. * Since the array will typically come from a program, attributes are * indexed from 0. * @throws IllegalArgumentException if an invalid set of ranges is supplied */ public void setAttributeIndicesArray(int[] attributes) { setAttributeIndices(Range.indicesToRangeList(attributes)); } /** * Returns the tip text for this property. * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String invertSelectionTipText() { return "Set attribute selection mode. If false, only selected" + " attributes in the range will be worked on; if" + " true, only non-selected attributes will be processed."; } /** * Gets whether the supplied columns are to be processed or skipped. * * @return true if the supplied columns will be kept */ public boolean getInvertSelection() { return m_selectedRange.getInvert(); } /** * Sets whether selected columns should be processed or skipped. * * @param invert the new invert setting */ public void setInvertSelection(boolean invert) { m_selectedRange.setInvert(invert); } /** * Gets the number of words (per class if there is a class attribute assigned) * to attempt to keep. * * @return the target number of words in the output vector (per class if * assigned). */ public int getWordsToKeep() { return m_wordsToKeep; } /** * Sets the number of words (per class if there is a class attribute assigned) * to attempt to keep. * * @param newWordsToKeep the target number of words in the output vector (per * class if assigned). */ public void setWordsToKeep(int newWordsToKeep) { m_wordsToKeep = newWordsToKeep; } /** * Returns the tip text for this property. 
* * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String wordsToKeepTipText() { return "The number of words (per class if there is a class attribute " + "assigned) to attempt to keep."; } /** * Gets the rate (number of instances) at which the dictionary is periodically * pruned. * * @return the rate at which the dictionary is periodically pruned */ public long getPeriodicPruning() { return m_periodicPruneRate; } /** * Sets the rate (number of instances) at which the dictionary is periodically * pruned * * * @param newPeriodicPruning the rate at which the dictionary is periodically * pruned */ public void setPeriodicPruning(long newPeriodicPruning) { m_periodicPruneRate = newPeriodicPruning; } /** * Returns the tip text for this property. * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String periodicPruningTipText() { return "Specify the rate (x% of the input dataset) at which to periodically prune the dictionary. " + "wordsToKeep prunes after creating a full dictionary. You may not have enough " + "memory for this approach."; } /** * Gets whether if the word frequencies should be transformed into log(1+fij) * where fij is the frequency of word i in document(instance) j. * * @return true if word frequencies are to be transformed. */ public boolean getTFTransform() { return m_TFTransform; } /** * Sets whether if the word frequencies should be transformed into log(1+fij) * where fij is the frequency of word i in document(instance) j. * * @param TFTransform true if word frequencies are to be transformed. */ public void setTFTransform(boolean TFTransform) { this.m_TFTransform = TFTransform; } /** * Returns the tip text for this property. 
*
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String TFTransformTipText() {
    final String tip =
      "Sets whether if the word frequencies should be transformed into:\n"
        + "  log(1+fij) \n"
        + " where fij is the frequency of word i in document (instance) j.";
    return tip;
  }

  /**
   * Reports the prefix used for attribute names.
   *
   * @return The current attribute name prefix.
   */
  public String getAttributeNamePrefix() {
    return this.m_Prefix;
  }

  /**
   * Stores the prefix used for attribute names.
   *
   * @param newPrefix String to use as the attribute name prefix.
   */
  public void setAttributeNamePrefix(String newPrefix) {
    this.m_Prefix = newPrefix;
  }

  /**
   * Tip text for the attributeNamePrefix property.
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String attributeNamePrefixTipText() {
    return "Prefix for the created attribute names. (default: \"\")";
  }

  /**
   * Gets whether the word frequencies in a document should be transformed
   * into: fij*log(num of Docs/num of Docs with word i), where fij is the
   * frequency of word i in document (instance) j.
   *
   * @return true if the word frequencies are to be transformed.
   */
  public boolean getIDFTransform() {
    return m_IDFTransform;
  }

  /**
   * Sets whether the word frequencies in a document should be transformed
   * into: fij*log(num of Docs/num of Docs with word i), where fij is the
   * frequency of word i in document (instance) j.
   *
   * @param IDFTransform true if the word frequencies are to be transformed
   */
  public void setIDFTransform(boolean IDFTransform) {
    m_IDFTransform = IDFTransform;
  }

  /**
   * Tip text for the IDFTransform property.
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String IDFTransformTipText() {
    final String tip =
      "Sets whether if the word frequencies in a document should be transformed into: \n"
        + " fij*log(num of Docs/num of Docs with word i) \n"
        + " where fij is the frequency of word i in document (instance) j.";
    return tip;
  }

  /**
   * Reports whether word frequencies for a document should be normalized.
   *
   * @return true if word frequencies should be normalized
   */
  public boolean getNormalize() {
    return this.m_normalize;
  }

  /**
   * Stores whether word frequencies for a document should be normalized.
   *
   * @param n true if word frequencies should be normalized
   */
  public void setNormalize(boolean n) {
    this.m_normalize = n;
  }

  /**
   * Tip text for the normalize property.
   *
   * @return the tip text for this property
   */
  public String normalizeTipText() {
    return "Whether word frequencies for a document (instance) should be normalized or not";
  }

  /**
   * Tip text for a property named normalizeDocLength.
   *
   * @return tip text for this property suitable for displaying in the
   *         explorer/experimenter gui
   */
  public String normalizeDocLengthTipText() {
    final String tip =
      "Sets whether if the word frequencies for a document (instance)"
        + " should be normalized or not.";
    return tip;
  }

  /**
   * Reports whether the tokens are to be downcased or not.
   *
   * @return true if the tokens are to be downcased.
   */
  public boolean getLowerCaseTokens() {
    return m_lowerCaseTokens;
  }

  /**
   * Sets whether if the tokens are to be downcased or not. (Doesn't affect
   * non-alphabetic characters in tokens).
   *
   * @param downCaseTokens should be true if only lower case tokens are to be
   *          formed.
*/ public void setLowerCaseTokens(boolean downCaseTokens) { this.m_lowerCaseTokens = downCaseTokens; } /** * Returns the tip text for this property. * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String lowerCaseTokensTipText() { return "If set then all the word tokens are converted to lower case " + "before being added to the dictionary."; } /** * Returns the tip text for this property. * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String doNotOperateOnPerClassBasisTipText() { return "If this is set, the maximum number of words and the " + "minimum term frequency is not enforced on a per-class " + "basis but based on the documents in all the classes " + "(even if a class attribute is set)."; } /** * Get the DoNotOperateOnPerClassBasis value. * * @return the DoNotOperateOnPerClassBasis value. */ public boolean getDoNotOperateOnPerClassBasis() { return m_doNotOperateOnPerClassBasis; } /** * Set the DoNotOperateOnPerClassBasis value. * * @param newDoNotOperateOnPerClassBasis The new DoNotOperateOnPerClassBasis * value. */ public void setDoNotOperateOnPerClassBasis( boolean newDoNotOperateOnPerClassBasis) { this.m_doNotOperateOnPerClassBasis = newDoNotOperateOnPerClassBasis; } /** * Returns the tip text for this property. * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String minTermFreqTipText() { return "Sets the minimum term frequency. This is enforced " + "on a per-class basis."; } /** * Get the MinTermFreq value. * * @return the MinTermFreq value. */ public int getMinTermFreq() { return m_minFrequency; } /** * Set the MinTermFreq value. * * @param newMinTermFreq The new MinTermFreq value. */ public void setMinTermFreq(int newMinTermFreq) { m_minFrequency = newMinTermFreq; } /** * Returns the current stemming algorithm, null if none is used. 
* * @return the current stemming algorithm, null if none set */ public Stemmer getStemmer() { return m_stemmer; } /** * the stemming algorithm to use, null means no stemming at all (i.e., the * NullStemmer is used). * * @param value the configured stemming algorithm, or null * @see NullStemmer */ public void setStemmer(Stemmer value) { if (value != null) { m_stemmer = value; } else { m_stemmer = new NullStemmer(); } } /** * Returns the tip text for this property. * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String stemmerTipText() { return "The stemming algorithm to use on the words."; } /** * Gets the stopwords handler. * * @return the stopwords handler */ public StopwordsHandler getStopwordsHandler() { return m_stopwordsHandler; } /** * Sets the stopwords handler to use. * * @param value the stopwords handler, if null, Null is used */ public void setStopwordsHandler(StopwordsHandler value) { if (value != null) { m_stopwordsHandler = value; } else { m_stopwordsHandler = new Null(); } } /** * Returns the tip text for this property. * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String stopwordsHandlerTipText() { return "The stopwords handler to use (Null means no stopwords are used)."; } /** * Returns the current tokenizer algorithm. * * @return the current tokenizer algorithm */ public Tokenizer getTokenizer() { return m_tokenizer; } /** * the tokenizer algorithm to use. * * @param value the configured tokenizing algorithm */ public void setTokenizer(Tokenizer value) { m_tokenizer = value; } /** * Returns the tip text for this property. * * @return tip text for this property suitable for displaying in the * explorer/experimenter gui */ public String tokenizerTipText() { return "The tokenizing algorithm to use on the strings."; } /** * Returns an enumeration describing the available options. 
* * @return an enumeration of all the available options */ @Override public Enumeration