Many resources are needed to download a project. Please understand that we have to compensate our server costs. Thank you in advance. Project price only 1 $
You can buy this project and download/modify it how often you want.
The Waikato Environment for Knowledge Analysis (WEKA), a machine
learning workbench. This is the developer version, the "bleeding edge"
of development, to which new functionality gets added.
/*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/*
* DictionaryBuilder.java
* Copyright (C) 2015 University of Waikato, Hamilton, New Zealand
*
*/
package weka.core;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.OutputStream;
import java.io.Reader;
import java.io.Serializable;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Enumeration;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.Vector;
import weka.core.stemmers.NullStemmer;
import weka.core.stemmers.Stemmer;
import weka.core.stopwords.Null;
import weka.core.stopwords.StopwordsHandler;
import weka.core.tokenizers.Tokenizer;
import weka.core.tokenizers.WordTokenizer;
import weka.gui.ProgrammaticProperty;
/**
* Class for building and maintaining a dictionary of terms. Has methods for
* loading, saving and aggregating dictionaries. Supports loading/saving in
* binary and textual format. Textual format is expected to have one or two
* comma separated values per line, in the format:
* <p>
*
* <pre>
* term [,doc_count]
* </pre>
*
* where <code>doc_count</code> is the number of documents that the term has
* occurred in.
*
* @author Mark Hall (mhall{[at]}pentaho{[dot]}com)
* @version $Revision: 14442 $
*/
public class DictionaryBuilder implements Aggregateable,
OptionHandler, Serializable {
/** For serialization */
private static final long serialVersionUID = 5579506627960356012L;
/** Input structure */
protected Instances m_inputFormat;
/** Output structure */
protected Instances m_outputFormat;
/** Holds the dictionaries (one per class) that are compiled while processing */
protected Map[] m_dictsPerClass;
/**
 * Holds the final dictionary that is consolidated across classes and pruned
 * according to m_wordsToKeep. First element of array contains the index of
 * the word. The second (optional) element contains the document count for the
 * word (i.e. number of training docs the word occurs in).
 *
 * NOTE(review): the map's type parameters are not visible in this
 * declaration; the description above implies the values are arrays - confirm.
 */
protected Map m_consolidatedDict;
/** Holds the tokenized input vector. Transient, so it is not serialized. */
protected transient Map m_inputVector;
/**
 * True if the final number of words to keep should not be applied on a per
 * class basis
 */
protected boolean m_doNotOperateOnPerClassBasis;
/** Whether to output frequency counts instead of presence indicators */
protected boolean m_outputCounts;
/** True if all tokens should be downcased. */
protected boolean m_lowerCaseTokens;
/**
 * The stemming algorithm. Defaults to a NullStemmer; setStemmer() also
 * substitutes a NullStemmer when passed null.
 */
protected Stemmer m_stemmer = new NullStemmer();
/**
 * Stopword handler to use. Defaults to the Null handler;
 * setStopwordsHandler() also substitutes Null when passed null.
 */
protected StopwordsHandler m_stopwordsHandler = new Null();
/**
 * The default number of words (per class if there is a class attribute
 * assigned) to attempt to keep.
 */
protected int m_wordsToKeep = 1000;
/**
 * Prune dictionary (per class) of low freq terms after every x documents. 0 =
 * no periodic pruning
 */
protected long m_periodicPruneRate;
/** Minimum frequency to retain dictionary entries (defaults to 1) */
protected int m_minFrequency = 1;
/** Count of input vectors seen */
protected int m_count = 0;
/** The tokenizer algorithm to use. Defaults to a WordTokenizer. */
protected Tokenizer m_tokenizer = new WordTokenizer();
/** Range of columns to convert to word vectors. Defaults to "first-last". */
protected Range m_selectedRange = new Range("first-last");
/** Holds the class index (initialized to -1) */
protected int m_classIndex = -1;
/** Number of classes (initialized to 1) */
protected int m_numClasses = 1;
/** A String prefix for the attribute names. Empty by default. */
protected String m_Prefix = "";
/** True if the TF transform is to be applied */
protected boolean m_TFTransform;
/** True if the IDF transform is to be applied */
protected boolean m_IDFTransform;
/** Whether to normalize to average length of training docs */
protected boolean m_normalize;
/** The sum of document lengths */
protected double m_docLengthSum;
/**
 * The average document length. Can be supplied directly via
 * setAverageDocLength().
 */
protected double m_avgDocLength;
/** Whether to keep the dictionary(s) sorted alphabetically */
protected boolean m_sortDictionary;
/** True if the input data contains string attributes to convert */
protected boolean m_inputContainsStringAttributes;
/**
 * Supplies the average document length that is used when normalizing.
 *
 * @param averageDocLength the average document length to store
 */
@ProgrammaticProperty
public void setAverageDocLength(double averageDocLength) {
  this.m_avgDocLength = averageDocLength;
}
/**
 * Reports the average document length that is used when normalizing.
 *
 * @return the currently stored average document length
 */
public double getAverageDocLength() {
  return this.m_avgDocLength;
}
/**
 * Tip text describing the sortDictionary property.
 *
 * @return the tip text for this property
 */
public String sortDictionaryTipText() {
  final String tipText = "Sort the dictionary alphabetically";
  return tipText;
}
/**
 * Chooses whether the dictionary is kept in alphabetical order while it is
 * being built. Enabling this uses a TreeMap internally, which is slower than
 * the default unsorted LinkedHashMap.
 *
 * @param sortDictionary true to keep the dictionary sorted alphabetically
 */
public void setSortDictionary(boolean sortDictionary) {
  this.m_sortDictionary = sortDictionary;
}
/**
 * Reports whether the dictionary is kept in alphabetical order while it is
 * being built. When enabled a TreeMap is used internally, which is slower
 * than the default unsorted LinkedHashMap.
 *
 * @return true if the dictionary is kept sorted alphabetically
 */
public boolean getSortDictionary() {
  return this.m_sortDictionary;
}
/**
 * Reports whether output instances carry word counts rather than 0/1
 * indicators of word presence.
 *
 * @return true if word counts should be output.
 */
public boolean getOutputWordCounts() {
  return this.m_outputCounts;
}
/**
 * Chooses whether output instances carry word counts rather than 0/1
 * indicators of word presence.
 *
 * @param outputWordCounts true if word counts should be output.
 */
public void setOutputWordCounts(boolean outputWordCounts) {
  this.m_outputCounts = outputWordCounts;
}
/**
 * Returns the tip text for the outputWordCounts property.
 *
 * @return tip text for this property suitable for displaying in the
 *         explorer/experimenter gui
 */
public String outputWordCountsTipText() {
  // The separating space before the parenthesis was missing, which rendered
  // as "0 or 1(indicating ..." in the GUI.
  return "Output word counts rather than boolean 0 or 1 "
    + "(indicating presence or absence of a word).";
}
/**
 * Returns the Range object describing the columns to convert.
 *
 * @return the currently selected range
 */
public Range getSelectedRange() {
  return this.m_selectedRange;
}
/**
 * Replaces the selected range with a new Range built from the given string.
 *
 * NOTE(review): this swaps in a brand new Range instance instead of updating
 * the existing one (as setAttributeIndices() does), so presumably a
 * previously configured invert flag is not carried over - confirm against
 * Range's constructor defaults.
 *
 * @param newSelectedRange the range specification to construct the new Range
 *          from
 */
public void setSelectedRange(String newSelectedRange) {
  final Range replacement = new Range(newSelectedRange);
  m_selectedRange = replacement;
}
/**
 * Returns the tip text for the attributeIndices property.
 *
 * @return tip text for this property suitable for displaying in the
 *         explorer/experimenter gui
 */
public String attributeIndicesTipText() {
  // Assembled piecewise; the resulting text is a single line.
  StringBuilder tip = new StringBuilder();
  tip.append("Specify range of attributes to act on.");
  tip.append(" This is a comma separated list of attribute indices, with");
  tip.append(" \"first\" and \"last\" valid values. Specify an inclusive");
  tip.append(" range with \"-\". E.g: \"first-3,5,6-10,last\".");
  return tip.toString();
}
/**
 * Returns the current range selection in string form, as reported by the
 * underlying Range object.
 *
 * @return a string containing a comma separated list of ranges
 */
public String getAttributeIndices() {
  final String ranges = m_selectedRange.getRanges();
  return ranges;
}
/**
 * Defines which attributes are to be worked on by updating the ranges of the
 * existing Range object.
 *
 * @param rangeList a string representing the list of attributes. Since the
 *          string will typically come from a user, attributes are indexed
 *          from 1. E.g.: first-3,5,6-last
 * @throws IllegalArgumentException if an invalid range list is supplied
 */
public void setAttributeIndices(String rangeList) {
  this.m_selectedRange.setRanges(rangeList);
}
/**
 * Defines which attributes are to be processed from an array of indices. The
 * array is converted to a range list string and handed to
 * {@link #setAttributeIndices(String)}.
 *
 * @param attributes an array containing indexes of attributes to process.
 *          Since the array will typically come from a program, attributes are
 *          indexed from 0.
 * @throws IllegalArgumentException if an invalid set of ranges is supplied
 */
public void setAttributeIndicesArray(int[] attributes) {
  final String rangeList = Range.indicesToRangeList(attributes);
  setAttributeIndices(rangeList);
}
/**
 * Returns the tip text for the invertSelection property.
 *
 * @return tip text for this property suitable for displaying in the
 *         explorer/experimenter gui
 */
public String invertSelectionTipText() {
  String tip = "Set attribute selection mode. If false, only selected";
  tip += " attributes in the range will be worked on; if";
  tip += " true, only non-selected attributes will be processed.";
  return tip;
}
/**
 * Reports the invert setting of the selected range, i.e. whether the
 * non-selected rather than the selected attributes are processed.
 *
 * @return the invert flag of the selected range
 */
public boolean getInvertSelection() {
  final boolean inverted = m_selectedRange.getInvert();
  return inverted;
}
/**
 * Chooses whether the selected columns should be processed or skipped by
 * updating the invert flag of the selected range.
 *
 * @param invert the new invert setting
 */
public void setInvertSelection(boolean invert) {
  this.m_selectedRange.setInvert(invert);
}
/**
 * Returns the number of words (per class if there is a class attribute
 * assigned) that the builder attempts to keep.
 *
 * @return the target number of words in the output vector (per class if
 *         assigned).
 */
public int getWordsToKeep() {
  return this.m_wordsToKeep;
}
/**
 * Stores the number of words (per class if there is a class attribute
 * assigned) that the builder attempts to keep. The value is stored as given,
 * without validation.
 *
 * @param newWordsToKeep the target number of words in the output vector (per
 *          class if assigned).
 */
public void setWordsToKeep(int newWordsToKeep) {
  this.m_wordsToKeep = newWordsToKeep;
}
/**
 * Returns the tip text for the wordsToKeep property.
 *
 * @return tip text for this property suitable for displaying in the
 *         explorer/experimenter gui
 */
public String wordsToKeepTipText() {
  final String firstPart =
    "The number of words (per class if there is a class attribute ";
  final String secondPart = "assigned) to attempt to keep.";
  return firstPart + secondPart;
}
/**
 * Returns the rate (number of instances) at which the dictionary is
 * periodically pruned.
 *
 * @return the rate at which the dictionary is periodically pruned
 */
public long getPeriodicPruning() {
  return this.m_periodicPruneRate;
}
/**
 * Stores the rate (number of instances) at which the dictionary is
 * periodically pruned.
 *
 * @param newPeriodicPruning the rate at which the dictionary is periodically
 *          pruned
 */
public void setPeriodicPruning(long newPeriodicPruning) {
  this.m_periodicPruneRate = newPeriodicPruning;
}
/**
 * Returns the tip text for the periodicPruning property.
 *
 * The rate is described as a number of instances, matching the documentation
 * of the periodic prune rate field and of getPeriodicPruning() /
 * setPeriodicPruning(); the previous wording ("x% of the input dataset")
 * contradicted them.
 *
 * @return tip text for this property suitable for displaying in the
 *         explorer/experimenter gui
 */
public String periodicPruningTipText() {
  return "Specify the rate (number of instances) at which to periodically prune the dictionary. "
    + "wordsToKeep prunes after creating a full dictionary. You may not have enough "
    + "memory for this approach.";
}
/**
 * Reports whether word frequencies should be transformed into log(1+fij),
 * where fij is the frequency of word i in document (instance) j.
 *
 * @return true if word frequencies are to be transformed.
 */
public boolean getTFTransform() {
  return this.m_TFTransform;
}
/**
 * Chooses whether word frequencies should be transformed into log(1+fij),
 * where fij is the frequency of word i in document (instance) j.
 *
 * @param TFTransform true if word frequencies are to be transformed.
 */
public void setTFTransform(boolean TFTransform) {
  m_TFTransform = TFTransform;
}
/**
 * Returns the tip text for the TFTransform property.
 *
 * @return tip text for this property suitable for displaying in the
 *         explorer/experimenter gui
 */
public String TFTransformTipText() {
  // Three-line tip: description, formula, legend.
  StringBuilder tip = new StringBuilder();
  tip.append("Sets whether if the word frequencies should be transformed into:\n ");
  tip.append(" log(1+fij) \n");
  tip.append(" where fij is the frequency of word i in document (instance) j.");
  return tip.toString();
}
/**
 * Returns the prefix used for attribute names.
 *
 * @return The current attribute name prefix.
 */
public String getAttributeNamePrefix() {
  return this.m_Prefix;
}
/**
 * Stores the prefix to use for attribute names. The value is stored as given.
 *
 * @param newPrefix String to use as the attribute name prefix.
 */
public void setAttributeNamePrefix(String newPrefix) {
  this.m_Prefix = newPrefix;
}
/**
 * Returns the tip text for the attributeNamePrefix property.
 *
 * @return tip text for this property suitable for displaying in the
 *         explorer/experimenter gui
 */
public String attributeNamePrefixTipText() {
  final String description = "Prefix for the created attribute names. ";
  final String defaultNote = "(default: \"\")";
  return description + defaultNote;
}
/**
 * Reports whether the word frequencies in a document should be transformed
 * into:
 * fij*log(num of Docs/num of Docs with word i)
 * where fij is the frequency of word i in document (instance) j.
 *
 * @return true if the word frequencies are to be transformed.
 */
public boolean getIDFTransform() {
  return m_IDFTransform;
}
/**
 * Chooses whether the word frequencies in a document should be transformed
 * into:
 * fij*log(num of Docs/num of Docs with word i)
 * where fij is the frequency of word i in document (instance) j.
 *
 * @param IDFTransform true if the word frequencies are to be transformed
 */
public void setIDFTransform(boolean IDFTransform) {
  m_IDFTransform = IDFTransform;
}
/**
 * Returns the tip text for the IDFTransform property.
 *
 * @return tip text for this property suitable for displaying in the
 *         explorer/experimenter gui
 */
public String IDFTransformTipText() {
  // Description, formula and legend, in that order.
  StringBuilder tip = new StringBuilder();
  tip.append("Sets whether if the word frequencies in a document should be ");
  tip.append("transformed into: \n");
  tip.append(" fij*log(num of Docs/num of Docs with word i) \n");
  tip.append(" where fij is the frequency of word i in document (instance) j.");
  return tip.toString();
}
/**
 * Reports whether word frequencies for a document should be normalized.
 *
 * @return true if word frequencies should be normalized
 */
public boolean getNormalize() {
  return this.m_normalize;
}
/**
 * Chooses whether word frequencies for a document should be normalized.
 *
 * @param n true if word frequencies should be normalized
 */
public void setNormalize(boolean n) {
  this.m_normalize = n;
}
/**
 * Tip text describing the normalize property.
 *
 * @return the tip text for this property
 */
public String normalizeTipText() {
  String tip = "Whether word frequencies for a document (instance) should ";
  tip += "be normalized or not";
  return tip;
}
/**
 * Returns the tip text for a normalizeDocLength property.
 *
 * NOTE(review): no matching normalizeDocLength getter/setter is visible in
 * this part of the file; this looks like a companion to normalizeTipText() -
 * confirm whether it is still needed.
 *
 * @return tip text for this property suitable for displaying in the
 *         explorer/experimenter gui
 */
public String normalizeDocLengthTipText() {
  final String head =
    "Sets whether if the word frequencies for a document (instance) ";
  final String tail = "should be normalized or not.";
  return head + tail;
}
/**
 * Reports whether tokens are to be downcased.
 *
 * @return true if the tokens are to be downcased.
 */
public boolean getLowerCaseTokens() {
  return m_lowerCaseTokens;
}
/**
 * Chooses whether tokens are to be downcased. (Doesn't affect non-alphabetic
 * characters in tokens).
 *
 * @param downCaseTokens should be true if only lower case tokens are to be
 *          formed.
 */
public void setLowerCaseTokens(boolean downCaseTokens) {
  m_lowerCaseTokens = downCaseTokens;
}
/**
 * Returns the tip text for the lowerCaseTokens property.
 *
 * @return tip text for this property suitable for displaying in the
 *         explorer/experimenter gui
 */
public String lowerCaseTokensTipText() {
  String tip = "If set then all the word tokens are converted to lower case ";
  tip += "before being added to the dictionary.";
  return tip;
}
/**
 * Returns the tip text for the doNotOperateOnPerClassBasis property.
 *
 * @return tip text for this property suitable for displaying in the
 *         explorer/experimenter gui
 */
public String doNotOperateOnPerClassBasisTipText() {
  StringBuilder tip = new StringBuilder();
  tip.append("If this is set, the maximum number of words and the ");
  tip.append("minimum term frequency is not enforced on a per-class ");
  tip.append("basis but based on the documents in all the classes ");
  tip.append("(even if a class attribute is set).");
  return tip.toString();
}
/**
 * Returns the DoNotOperateOnPerClassBasis flag.
 *
 * @return the DoNotOperateOnPerClassBasis value.
 */
public boolean getDoNotOperateOnPerClassBasis() {
  return this.m_doNotOperateOnPerClassBasis;
}
/**
 * Stores the DoNotOperateOnPerClassBasis flag.
 *
 * @param newDoNotOperateOnPerClassBasis The new DoNotOperateOnPerClassBasis
 *          value.
 */
public void setDoNotOperateOnPerClassBasis(
  boolean newDoNotOperateOnPerClassBasis) {
  m_doNotOperateOnPerClassBasis = newDoNotOperateOnPerClassBasis;
}
/**
 * Returns the tip text for the minTermFreq property.
 *
 * @return tip text for this property suitable for displaying in the
 *         explorer/experimenter gui
 */
public String minTermFreqTipText() {
  final String what = "Sets the minimum term frequency. This is enforced ";
  final String scope = "on a per-class basis.";
  return what + scope;
}
/**
 * Returns the minimum term frequency.
 *
 * @return the MinTermFreq value.
 */
public int getMinTermFreq() {
  return this.m_minFrequency;
}
/**
 * Stores the minimum term frequency. The value is stored as given, without
 * validation.
 *
 * @param newMinTermFreq The new MinTermFreq value.
 */
public void setMinTermFreq(int newMinTermFreq) {
  this.m_minFrequency = newMinTermFreq;
}
/**
 * Returns the current stemming algorithm. The field starts out as a
 * NullStemmer and setStemmer() substitutes a NullStemmer for null, so "no
 * stemming" is represented by a NullStemmer rather than by null.
 *
 * @return the current stemming algorithm
 */
public Stemmer getStemmer() {
  return this.m_stemmer;
}
/**
 * Stores the stemming algorithm to use. Passing null means no stemming at
 * all, in which case a NullStemmer is installed.
 *
 * @param value the configured stemming algorithm, or null
 * @see NullStemmer
 */
public void setStemmer(Stemmer value) {
  // Never store null: fall back to the no-op stemmer instead.
  m_stemmer = (value == null) ? new NullStemmer() : value;
}
/**
 * Returns the tip text for the stemmer property.
 *
 * @return tip text for this property suitable for displaying in the
 *         explorer/experimenter gui
 */
public String stemmerTipText() {
  final String tipText = "The stemming algorithm to use on the words.";
  return tipText;
}
/**
 * Returns the stopwords handler currently in use.
 *
 * @return the stopwords handler
 */
public StopwordsHandler getStopwordsHandler() {
  return this.m_stopwordsHandler;
}
/**
 * Stores the stopwords handler to use.
 *
 * @param value the stopwords handler, if null, Null is used
 */
public void setStopwordsHandler(StopwordsHandler value) {
  // Never store null: fall back to the Null handler instead.
  if (value == null) {
    m_stopwordsHandler = new Null();
    return;
  }
  m_stopwordsHandler = value;
}
/**
 * Returns the tip text for the stopwordsHandler property.
 *
 * @return tip text for this property suitable for displaying in the
 *         explorer/experimenter gui
 */
public String stopwordsHandlerTipText() {
  final String tipText =
    "The stopwords handler to use (Null means no stopwords are used).";
  return tipText;
}
/**
 * Returns the tokenizer algorithm currently in use.
 *
 * @return the current tokenizer algorithm
 */
public Tokenizer getTokenizer() {
  return this.m_tokenizer;
}
/**
 * Stores the tokenizer algorithm to use. Passing null installs a
 * WordTokenizer (the field's initial value), mirroring how setStemmer() and
 * setStopwordsHandler() fall back to their defaults instead of storing null.
 *
 * @param value the configured tokenizing algorithm, or null for the default
 *          WordTokenizer
 * @see WordTokenizer
 */
public void setTokenizer(Tokenizer value) {
  if (value != null) {
    m_tokenizer = value;
  } else {
    // Previously null was stored as-is, unlike the sibling setters.
    m_tokenizer = new WordTokenizer();
  }
}
/**
 * Returns the tip text for the tokenizer property.
 *
 * @return tip text for this property suitable for displaying in the
 *         explorer/experimenter gui
 */
public String tokenizerTipText() {
  final String tipText = "The tokenizing algorithm to use on the strings.";
  return tipText;
}
/**
* Returns an enumeration describing the available options.
*
* @return an enumeration of all the available options
*/
@Override
public Enumeration