weka.core.converters.DictionarySaver Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of weka-dev Show documentation
Show all versions of weka-dev Show documentation
The Waikato Environment for Knowledge Analysis (WEKA), a machine
learning workbench. This version represents the developer version, the
"bleeding edge" of development, you could say. New functionality gets added
to this version.
/*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
/*
* DictionarySaver.java
* Copyright (C) 2015 University of Waikato, Hamilton, New Zealand
*
*/
package weka.core.converters;
import java.io.BufferedOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import weka.core.Capabilities;
import weka.core.DictionaryBuilder;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.OptionMetadata;
import weka.core.RevisionUtils;
import weka.core.stemmers.NullStemmer;
import weka.core.stemmers.Stemmer;
import weka.core.stopwords.Null;
import weka.core.stopwords.StopwordsHandler;
import weka.core.tokenizers.Tokenizer;
/**
* Writes a dictionary constructed from string
* attributes in incoming instances to a destination.
*
*
*
* Valid options are:
*
*
*
* -binary-dict
* Save as a binary serialized dictionary
*
*
*
* -R <range>
* Specify range of attributes to act on. This is a comma separated list of attribute
* indices, with "first" and "last" valid values.
*
*
*
* -V
* Set attributes selection mode. If false, only selected attributes in the range will
* be worked on. If true, only non-selected attributes will be processed
*
*
*
* -L
* Convert all tokens to lowercase when matching against dictionary entries.
*
*
*
* -stemmer <spec>
* The stemming algorithm (classname plus parameters) to use.
*
*
*
* -stopwords-handler <spec>
* The stopwords handler to use (default = Null)
*
*
*
* -tokenizer <spec>
* The tokenizing algorithm (classname plus parameters) to use.
* (default: weka.core.tokenizers.WordTokenizer)
*
*
*
* -P <integer>
* Prune the dictionary every x instances
* (default = 0 - i.e. no periodic pruning)
*
*
*
* -W <integer>
* The number of words (per class if there is a class attribute assigned) to attempt to keep.
*
*
*
* -M <integer>
* The minimum term frequency to use when pruning the dictionary
* (default = 1).
*
*
*
* -O
* If this is set, the maximum number of words and the
* minimum term frequency is not enforced on a per-class
* basis but based on the documents in all the classes
* (even if a class attribute is set).
*
*
*
* -sort
* Sort the dictionary alphabetically
*
*
*
* -i <the input file>
* The input file
*
*
*
* -o <the output file>
* The output file
*
*
*
*
* @author Mark Hall (mhall{[at]}pentaho{[dot]}com)
* @version $Revision: 12690 $
*/
public class DictionarySaver extends AbstractFileSaver implements
BatchConverter, IncrementalConverter {
private static final long serialVersionUID = -19891905988830722L;
protected transient OutputStream m_binaryStream;
/**
* The dictionary builder to use
*/
protected DictionaryBuilder m_dictionaryBuilder = new DictionaryBuilder();
/**
* Whether the dictionary file contains a binary serialized dictionary, rather
* than plain text
*/
protected boolean m_dictionaryIsBinary;
/**
* Prune the dictionary every x instances. <=0 means no periodic pruning
*/
private long m_periodicPruningRate;
public DictionarySaver() {
resetOptions();
}
/**
* Returns a string describing this Saver.
*
* @return a description of the Saver suitable for displaying in the
* explorer/experimenter gui
*/
public String globalInfo() {
return "Writes a dictionary constructed from string attributes in "
+ "incoming instances to a destination.";
}
/**
* Set whether to save the dictionary as a binary serialized dictionary,
* rather than a plain text one
*
* @param binary true if the dictionary is to be saved as binary rather than
* plain text
*/
@OptionMetadata(displayName = "Save dictionary in binary form",
description = "Save as a binary serialized dictionary",
commandLineParamName = "binary-dict",
commandLineParamSynopsis = "-binary-dict", commandLineParamIsFlag = true,
displayOrder = 2)
public void setSaveBinaryDictionary(boolean binary) {
m_dictionaryIsBinary = binary;
}
/**
* Get whether to save the dictionary as a binary serialized dictionary,
* rather than a plain text one
*
* @return true if the dictionary is to be saved as binary rather than plain
* text
*/
public boolean getSaveBinaryDictionary() {
return m_dictionaryIsBinary;
}
/**
* Gets the current range selection.
*
* @return a string containing a comma separated list of ranges
*/
public String getAttributeIndices() {
return m_dictionaryBuilder.getAttributeIndices();
}
/**
* Sets which attributes are to be worked on.
*
* @param rangeList a string representing the list of attributes. Since the
* string will typically come from a user, attributes are indexed
* from 1.
* eg: first-3,5,6-last
* @throws IllegalArgumentException if an invalid range list is supplied
*/
@OptionMetadata(displayName = "Range of attributes to operate on",
description = "Specify range of attributes to act on. This is a comma "
+ "separated list of attribute\nindices, with \"first\" and "
+ "\"last\" valid values.", commandLineParamName = "R",
commandLineParamSynopsis = "-R ", displayOrder = 4)
public void setAttributeIndices(String rangeList) {
m_dictionaryBuilder.setAttributeIndices(rangeList);
}
/**
* Gets whether the supplied columns are to be processed or skipped.
*
* @return true if the supplied columns will be kept
*/
public boolean getInvertSelection() {
return m_dictionaryBuilder.getInvertSelection();
}
/**
* Sets whether selected columns should be processed or skipped.
*
* @param invert the new invert setting
*/
@OptionMetadata(
displayName = "Invert selection",
description = "Set attributes selection mode. "
+ "If false, only selected attributes in the range will\nbe worked on. If true, "
+ "only non-selected attributes will be processed",
commandLineParamName = "V", commandLineParamSynopsis = "-V",
commandLineParamIsFlag = true, displayOrder = 5)
public
void setInvertSelection(boolean invert) {
m_dictionaryBuilder.setInvertSelection(invert);
}
/**
* Gets whether if the tokens are to be downcased or not.
*
* @return true if the tokens are to be downcased.
*/
public boolean getLowerCaseTokens() {
return m_dictionaryBuilder.getLowerCaseTokens();
}
/**
* Sets whether if the tokens are to be downcased or not. (Doesn't affect
* non-alphabetic characters in tokens).
*
* @param downCaseTokens should be true if only lower case tokens are to be
* formed.
*/
@OptionMetadata(displayName = "Lower case tokens",
description = "Convert all tokens to lowercase when matching against "
+ "dictionary entries.", commandLineParamName = "L",
commandLineParamSynopsis = "-L", commandLineParamIsFlag = true,
displayOrder = 10)
public void setLowerCaseTokens(boolean downCaseTokens) {
m_dictionaryBuilder.setLowerCaseTokens(downCaseTokens);
}
/**
* the stemming algorithm to use, null means no stemming at all (i.e., the
* NullStemmer is used).
*
* @param value the configured stemming algorithm, or null
* @see NullStemmer
*/
@OptionMetadata(displayName = "Stemmer to use",
description = "The stemming algorithm (classname plus parameters) to use.",
commandLineParamName = "stemmer",
commandLineParamSynopsis = "-stemmer ", displayOrder = 11)
public void setStemmer(Stemmer value) {
if (value != null) {
m_dictionaryBuilder.setStemmer(value);
} else {
m_dictionaryBuilder.setStemmer(new NullStemmer());
}
}
/**
* Returns the current stemming algorithm, null if none is used.
*
* @return the current stemming algorithm, null if none set
*/
public Stemmer getStemmer() {
return m_dictionaryBuilder.getStemmer();
}
/**
* Sets the stopwords handler to use.
*
* @param value the stopwords handler, if null, Null is used
*/
@OptionMetadata(displayName = "Stop words handler",
description = "The stopwords handler to use (default = Null)",
commandLineParamName = "stopwords-handler",
commandLineParamSynopsis = "-stopwords-handler ", displayOrder = 12)
public void setStopwordsHandler(StopwordsHandler value) {
if (value != null) {
m_dictionaryBuilder.setStopwordsHandler(value);
} else {
m_dictionaryBuilder.setStopwordsHandler(new Null());
}
}
/**
* Gets the stopwords handler.
*
* @return the stopwords handler
*/
public StopwordsHandler getStopwordsHandler() {
return m_dictionaryBuilder.getStopwordsHandler();
}
/**
* the tokenizer algorithm to use.
*
* @param value the configured tokenizing algorithm
*/
@OptionMetadata(
displayName = "Tokenizer",
description = "The tokenizing algorithm (classname plus parameters) to use.\n"
+ "(default: weka.core.tokenizers.WordTokenizer)",
commandLineParamName = "tokenizer",
commandLineParamSynopsis = "-tokenizer ", displayOrder = 13)
public
void setTokenizer(Tokenizer value) {
m_dictionaryBuilder.setTokenizer(value);
}
/**
* Returns the current tokenizer algorithm.
*
* @return the current tokenizer algorithm
*/
public Tokenizer getTokenizer() {
return m_dictionaryBuilder.getTokenizer();
}
/**
* Gets the rate at which the dictionary is periodically pruned, as a
* percentage of the dataset size.
*
* @return the rate at which the dictionary is periodically pruned
*/
public long getPeriodicPruning() {
return m_periodicPruningRate;
}
/**
* Sets the rate at which the dictionary is periodically pruned, as a
* percentage of the dataset size.
*
* @param newPeriodicPruning the rate at which the dictionary is periodically
* pruned
*/
@OptionMetadata(
displayName = "Periodic pruning rate",
description = "Prune the "
+ "dictionary every x instances\n(default = 0 - i.e. no periodic pruning)",
commandLineParamName = "P", commandLineParamSynopsis = "-P ",
displayOrder = 14)
public
void setPeriodicPruning(long newPeriodicPruning) {
m_periodicPruningRate = newPeriodicPruning;
}
/**
* Gets the number of words (per class if there is a class attribute assigned)
* to attempt to keep.
*
* @return the target number of words in the output vector (per class if
* assigned).
*/
public int getWordsToKeep() {
return m_dictionaryBuilder.getWordsToKeep();
}
/**
* Sets the number of words (per class if there is a class attribute assigned)
* to attempt to keep.
*
* @param newWordsToKeep the target number of words in the output vector (per
* class if assigned).
*/
@OptionMetadata(
displayName = "Number of words to attempt to keep",
description = "The number of words (per class if there is a class attribute "
+ "assigned) to attempt to keep.", commandLineParamName = "W",
commandLineParamSynopsis = "-W ", displayOrder = 15)
public
void setWordsToKeep(int newWordsToKeep) {
m_dictionaryBuilder.setWordsToKeep(newWordsToKeep);
}
/**
* Get the MinTermFreq value.
*
* @return the MinTermFreq value.
*/
public int getMinTermFreq() {
return m_dictionaryBuilder.getMinTermFreq();
}
/**
* Set the MinTermFreq value.
*
* @param newMinTermFreq The new MinTermFreq value.
*/
@OptionMetadata(
displayName = "Minimum term frequency",
description = "The minimum term frequency to use when pruning the dictionary\n"
+ "(default = 1).", commandLineParamName = "M",
commandLineParamSynopsis = "-M ", displayOrder = 16)
public
void setMinTermFreq(int newMinTermFreq) {
m_dictionaryBuilder.setMinTermFreq(newMinTermFreq);
}
/**
* Get the DoNotOperateOnPerClassBasis value.
*
* @return the DoNotOperateOnPerClassBasis value.
*/
public boolean getDoNotOperateOnPerClassBasis() {
return m_dictionaryBuilder.getDoNotOperateOnPerClassBasis();
}
/**
* Set the DoNotOperateOnPerClassBasis value.
*
* @param newDoNotOperateOnPerClassBasis The new DoNotOperateOnPerClassBasis
* value.
*/
@OptionMetadata(displayName = "Do not operate on a per-class basis",
description = "If this is set, the maximum number of words and the\n"
+ "minimum term frequency is not enforced on a per-class\n"
+ "basis but based on the documents in all the classes\n"
+ "(even if a class attribute is set).", commandLineParamName = "O",
commandLineParamSynopsis = "-O", commandLineParamIsFlag = true,
displayOrder = 17)
public void setDoNotOperateOnPerClassBasis(
boolean newDoNotOperateOnPerClassBasis) {
m_dictionaryBuilder
.setDoNotOperateOnPerClassBasis(newDoNotOperateOnPerClassBasis);
}
/**
* Set whether to keep the dictionary sorted alphabetically or not
*
* @param sorted true to keep the dictionary sorted
*/
@OptionMetadata(displayName = "Sort dictionary",
description = "Sort the dictionary alphabetically",
commandLineParamName = "sort", commandLineParamSynopsis = "-sort",
commandLineParamIsFlag = true, displayOrder = 18)
public void setKeepDictionarySorted(boolean sorted) {
m_dictionaryBuilder.setSortDictionary(sorted);
}
/**
* Get whether to keep the dictionary sorted alphabetically or not
*
* @return true to keep the dictionary sorted
*/
public boolean getKeepDictionarySorted() {
return m_dictionaryBuilder.getSortDictionary();
}
/**
* Returns the Capabilities of this saver.
*
* @return the capabilities of this object
* @see Capabilities
*/
@Override
public Capabilities getCapabilities() {
Capabilities result = super.getCapabilities();
// attributes
result.enable(Capabilities.Capability.NOMINAL_ATTRIBUTES);
result.enable(Capabilities.Capability.NUMERIC_ATTRIBUTES);
result.enable(Capabilities.Capability.DATE_ATTRIBUTES);
result.enable(Capabilities.Capability.STRING_ATTRIBUTES);
result.enable(Capabilities.Capability.MISSING_VALUES);
// class
result.enable(Capabilities.Capability.NOMINAL_CLASS);
result.enable(Capabilities.Capability.NUMERIC_CLASS);
result.enable(Capabilities.Capability.DATE_CLASS);
result.enable(Capabilities.Capability.STRING_CLASS);
result.enable(Capabilities.Capability.MISSING_CLASS_VALUES);
result.enable(Capabilities.Capability.NO_CLASS);
return result;
}
@Override
public String getFileDescription() {
return "Plain text or binary serialized dictionary files created from text "
+ "in string attributes";
}
@Override
public void writeIncremental(Instance inst) throws IOException {
int writeMode = getWriteMode();
Instances structure = getInstances();
if (getRetrieval() == BATCH || getRetrieval() == NONE) {
throw new IOException("Batch and incremental saving cannot be mixed.");
}
if (writeMode == WAIT) {
if (structure == null) {
setWriteMode(CANCEL);
if (inst != null) {
throw new IOException("Structure (header Information) has to be set "
+ "in advance");
}
} else {
setWriteMode(STRUCTURE_READY);
}
writeMode = getWriteMode();
}
if (writeMode == CANCEL) {
cancel();
}
if (writeMode == STRUCTURE_READY) {
m_dictionaryBuilder.reset();
try {
m_dictionaryBuilder.setup(structure);
} catch (Exception ex) {
throw new IOException(ex);
}
setWriteMode(WRITE);
writeMode = getWriteMode();
}
if (writeMode == WRITE) {
if (structure == null) {
throw new IOException("No instances information available.");
}
if (inst != null) {
m_dictionaryBuilder.processInstance(inst);
} else {
try {
m_dictionaryBuilder.finalizeDictionary();
} catch (Exception e) {
throw new IOException(e);
}
if (retrieveFile() == null && getWriter() == null) {
if (getSaveBinaryDictionary()) {
throw new IOException(
"Can't output binary dictionary to standard out!");
}
m_dictionaryBuilder.saveDictionary(System.out);
} else {
if (getSaveBinaryDictionary()) {
m_dictionaryBuilder.saveDictionary(m_binaryStream);
} else {
m_dictionaryBuilder.saveDictionary(getWriter());
}
}
resetStructure();
resetWriter();
}
}
}
@Override
public void writeBatch() throws IOException {
if (getInstances() == null) {
throw new IOException("No instances to save");
}
if (getRetrieval() == INCREMENTAL) {
throw new IOException("Batch and incremental saving cannot be mixed.");
}
setRetrieval(BATCH);
setWriteMode(WRITE);
m_dictionaryBuilder.reset();
try {
m_dictionaryBuilder.setup(getInstances());
} catch (Exception ex) {
throw new IOException(ex);
}
for (int i = 0; i < getInstances().numInstances(); i++) {
m_dictionaryBuilder.processInstance(getInstances().instance(i));
}
try {
m_dictionaryBuilder.finalizeDictionary();
} catch (Exception ex) {
throw new IOException(ex);
}
if (retrieveFile() == null && getWriter() == null) {
if (getSaveBinaryDictionary()) {
throw new IOException("Can't output binary dictionary to standard out!");
}
m_dictionaryBuilder.saveDictionary(System.out);
setWriteMode(WAIT);
return;
}
if (getSaveBinaryDictionary()) {
m_dictionaryBuilder.saveDictionary(m_binaryStream);
} else {
m_dictionaryBuilder.saveDictionary(getWriter());
}
setWriteMode(WAIT);
resetWriter();
setWriteMode(CANCEL);
}
@Override
public void resetOptions() {
super.resetOptions();
setFileExtension(".dict");
}
@Override
public void resetWriter() {
super.resetWriter();
m_binaryStream = null;
}
@Override
public void setDestination(OutputStream output) throws IOException {
super.setDestination(output);
m_binaryStream = new BufferedOutputStream(output);
}
@Override
public String getRevision() {
return RevisionUtils.extract("$Revision: 12690 $");
}
public static void main(String[] args) {
runFileSaver(new DictionarySaver(), args);
}
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy