
cc.mallet.util.BulkLoader Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of mallet Show documentation
MALLET is a Java-based package for statistical natural language processing,
document classification, clustering, topic modeling, information extraction,
and other machine learning applications to text.
package cc.mallet.util;
import cc.mallet.types.*;
import cc.mallet.pipe.*;
import cc.mallet.pipe.iterator.*;
import java.util.*;
import java.io.*;
/**
* This class reads through a single file, breaking each line
* into data and (optional) name and label fields.
*/
/**
 * Command-line tool that reads a single delimited file (one instance per
 * line), splits each line into data and optional name and label fields via
 * a configurable regular expression, optionally prunes low-frequency or
 * overly common features, and saves the result as a serialized MALLET
 * instance list.
 */
public class BulkLoader {

    // ----- command-line options ---------------------------------------

    static CommandOption.File inputFile = new CommandOption.File
        (BulkLoader.class, "input", "FILE", true, null,
         "The file containing data, one instance per line", null);

    static CommandOption.File outputFile = new CommandOption.File
        (BulkLoader.class, "output", "FILE", true, new File("mallet.data"),
         "Write the instance list to this file", null);

    static CommandOption.Boolean preserveCase = new CommandOption.Boolean
        (BulkLoader.class, "preserve-case", "[TRUE|FALSE]", false, false,
         "If true, do not force all strings to lowercase.", null);

    static CommandOption.Boolean removeStopWords = new CommandOption.Boolean
        (BulkLoader.class, "remove-stopwords", "[TRUE|FALSE]", false, false,
         "If true, remove common \"stop words\" from the text.\nThis option invokes a minimal English stoplist. ", null);

    static CommandOption.File stoplistFile = new CommandOption.File
        (BulkLoader.class, "stoplist", "FILE", true, null,
         "Read newline-separated words from this file,\n and remove them from text. This option overrides\n the default English stoplist triggered by --remove-stopwords.", null);

    // NOTE(review): declared but never referenced in this class — presumably
    // intended to switch FeatureSequence vs. FeatureVector output; confirm
    // whether it should be honored in writeInstanceList.
    static CommandOption.Boolean keepSequence = new CommandOption.Boolean
        (BulkLoader.class, "keep-sequence", "[TRUE|FALSE]", false, false,
         "If true, final data will be a FeatureSequence rather than a FeatureVector.", null);

    static CommandOption.String lineRegex = new CommandOption.String
        (BulkLoader.class, "line-regex", "REGEX", true, "^([^\\t]*)\\t([^\\t]*)\\t(.*)",
         "Regular expression containing regex-groups for label, name and data.", null);

    static CommandOption.Integer nameGroup = new CommandOption.Integer
        (BulkLoader.class, "name", "INTEGER", true, 1,
         "The index of the group containing the instance name.\n Use 0 to indicate that this field is not used.", null);

    static CommandOption.Integer labelGroup = new CommandOption.Integer
        (BulkLoader.class, "label", "INTEGER", true, 2,
         "The index of the group containing the label string.\n Use 0 to indicate that this field is not used.", null);

    static CommandOption.Integer dataGroup = new CommandOption.Integer
        (BulkLoader.class, "data", "INTEGER", true, 3,
         "The index of the group containing the data.", null);

    static CommandOption.Integer pruneCount = new CommandOption.Integer
        (BulkLoader.class, "prune-count", "N", false, 0,
         "Reduce features to those that occur more than N times.", null);

    static CommandOption.Double docProportionCutoff = new CommandOption.Double
        (BulkLoader.class, "prune-doc-frequency", "N", false, 1.0,
         "Remove features that occur in more than (X*100)% of documents. 0.05 is equivalent to IDF of 3.0.", null);

    /**
     * Read the data from inputFile once, accumulating feature counts, and
     * add every word that fails a pruning criterion (occurs fewer than
     * {@code pruneCount.value} times, or appears in more than
     * {@code docProportionCutoff.value} of all documents) to the stoplist
     * of {@code prunedTokenizer}.
     *
     * @param prunedTokenizer the tokenizer that will later be used to write
     *        instances; its stoplist is extended in place
     * @throws IOException if the input file cannot be read
     */
    public static void generateStoplist(SimpleTokenizer prunedTokenizer)
        throws IOException {

        // FIX: the reader was previously never closed (resource leak).
        FileReader input = new FileReader(inputFile.value);
        try {
            CsvIterator reader = new CsvIterator(input,
                                                 lineRegex.value,
                                                 dataGroup.value,
                                                 labelGroup.value,
                                                 nameGroup.value);

            // FIX: typed pipe list instead of a raw ArrayList.
            ArrayList<Pipe> pipes = new ArrayList<Pipe>();
            Alphabet alphabet = new Alphabet();

            // Tokenize with a deep copy so this counting pass does not
            // disturb the tokenizer used for the real import.
            SimpleTokenizer st = prunedTokenizer.deepClone();
            StringList2FeatureSequence sl2fs = new StringList2FeatureSequence(alphabet);
            FeatureCountPipe featureCounter = new FeatureCountPipe(alphabet, null);
            FeatureDocFreqPipe docCounter = new FeatureDocFreqPipe(alphabet, null);

            if (! preserveCase.value) {
                pipes.add(new CharSequenceLowercase());
            }
            pipes.add(st);
            pipes.add(sl2fs);
            if (pruneCount.value > 0) {
                pipes.add(featureCounter);
            }
            if (docProportionCutoff.value < 1.0) {
                pipes.add(docCounter);
            }

            Pipe serialPipe = new SerialPipes(pipes);
            Iterator<Instance> iterator = serialPipe.newIteratorFrom(reader);

            int count = 0;

            // We aren't really interested in the instance itself, just the
            // feature counts the counting pipes accumulate as a side effect.
            while (iterator.hasNext()) {
                count++;
                if (count % 100000 == 0) {
                    System.out.println(count);
                }
                iterator.next();
            }

            if (pruneCount.value > 0) {
                featureCounter.addPrunedWordsToStoplist(prunedTokenizer, pruneCount.value);
            }
            if (docProportionCutoff.value < 1.0) {
                docCounter.addPrunedWordsToStoplist(prunedTokenizer, docProportionCutoff.value);
            }
        } finally {
            input.close();
        }
    }

    /**
     * Read the data from inputFile a final time, pipe it through the
     * (possibly stoplist-extended) tokenizer, and serialize the resulting
     * InstanceList to outputFile.
     *
     * @param prunedTokenizer the tokenizer to use for the import
     * @throws IOException if the input file cannot be read
     */
    public static void writeInstanceList(SimpleTokenizer prunedTokenizer)
        throws IOException {

        // FIX: the reader was previously never closed (resource leak).
        FileReader input = new FileReader(inputFile.value);
        try {
            CsvIterator reader = new CsvIterator(input,
                                                 lineRegex.value,
                                                 dataGroup.value,
                                                 labelGroup.value,
                                                 nameGroup.value);

            // FIX: typed pipe list instead of a raw ArrayList.
            ArrayList<Pipe> pipes = new ArrayList<Pipe>();
            Alphabet alphabet = new Alphabet();
            StringList2FeatureSequence sl2fs = new StringList2FeatureSequence(alphabet);

            if (! preserveCase.value) {
                pipes.add(new CharSequenceLowercase());
            }
            pipes.add(prunedTokenizer);
            pipes.add(sl2fs);

            Pipe serialPipe = new SerialPipes(pipes);

            InstanceList instances = new InstanceList(serialPipe);
            instances.addThruPipe(reader);
            instances.save(outputFile.value);
        } finally {
            input.close();
        }
    }

    /**
     * Command-line entry point: parse options, build the tokenizer
     * (explicit stoplist file overrides the default English stoplist),
     * optionally run the pruning pass, then write the instance list.
     *
     * @param args command-line arguments; see the CommandOption fields above
     * @throws IOException if the input file cannot be read or the output
     *         file cannot be written
     */
    public static void main (String[] args) throws IOException {

        // Process the command-line options
        CommandOption.setSummary (BulkLoader.class,
                                  "Efficient tool for importing large amounts of text into Mallet format");
        CommandOption.process (BulkLoader.class, args);

        SimpleTokenizer tokenizer;
        if (stoplistFile.value != null) {
            tokenizer = new SimpleTokenizer(stoplistFile.value);
        }
        else if (removeStopWords.value) {
            tokenizer = new SimpleTokenizer(SimpleTokenizer.USE_DEFAULT_ENGLISH_STOPLIST);
        }
        else {
            tokenizer = new SimpleTokenizer(SimpleTokenizer.USE_EMPTY_STOPLIST);
        }

        // Only pay for the extra counting pass when a pruning option is set.
        if (pruneCount.value > 0 || docProportionCutoff.value < 1.0) {
            generateStoplist(tokenizer);
        }

        writeInstanceList(tokenizer);
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy