
cc.mallet.util.BulkLoader Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of jcore-mallet-2.0.9 Show documentation
MALLET is a Java-based package for statistical natural language processing, document classification, clustering, topic modeling, information extraction, and other machine learning applications to text.
The newest version!
package cc.mallet.util;
import cc.mallet.types.*;
import cc.mallet.pipe.*;
import cc.mallet.pipe.iterator.*;
import java.util.*;
import java.io.*;
/**
* This class reads through a single file, breaking each line
* into data and (optional) name and label fields.
*/
/**
 * Command-line tool for efficiently importing large amounts of line-oriented
 * text into MALLET's serialized {@code InstanceList} format. Each input line
 * is split by a regular expression into (optional) name, (optional) label,
 * and data groups. Optionally prunes low-frequency and high-document-frequency
 * features by building a stoplist in a first pass over the data.
 */
public class BulkLoader {

    static CommandOption.File inputFile = new CommandOption.File
        (BulkLoader.class, "input", "FILE", true, null,
         "The file containing data, one instance per line", null);

    static CommandOption.File outputFile = new CommandOption.File
        (BulkLoader.class, "output", "FILE", true, new File("mallet.data"),
         "Write the instance list to this file", null);

    static CommandOption.Boolean preserveCase = new CommandOption.Boolean
        (BulkLoader.class, "preserve-case", "[TRUE|FALSE]", false, false,
         "If true, do not force all strings to lowercase.", null);

    static CommandOption.Boolean removeStopWords = new CommandOption.Boolean
        (BulkLoader.class, "remove-stopwords", "[TRUE|FALSE]", false, false,
         "If true, remove common \"stop words\" from the text.\nThis option invokes a minimal English stoplist. ", null);

    static CommandOption.File stoplistFile = new CommandOption.File
        (BulkLoader.class, "stoplist", "FILE", true, null,
         "Read newline-separated words from this file,\n and remove them from text. This option overrides\n the default English stoplist triggered by --remove-stopwords.", null);

    static CommandOption.Boolean keepSequence = new CommandOption.Boolean
        (BulkLoader.class, "keep-sequence", "[TRUE|FALSE]", false, false,
         "If true, final data will be a FeatureSequence rather than a FeatureVector.", null);

    static CommandOption.String lineRegex = new CommandOption.String
        (BulkLoader.class, "line-regex", "REGEX", true, "^([^\\t]*)\\t([^\\t]*)\\t(.*)",
         "Regular expression containing regex-groups for label, name and data.", null);

    static CommandOption.Integer nameGroup = new CommandOption.Integer
        (BulkLoader.class, "name", "INTEGER", true, 1,
         "The index of the group containing the instance name.\n Use 0 to indicate that this field is not used.", null);

    static CommandOption.Integer labelGroup = new CommandOption.Integer
        (BulkLoader.class, "label", "INTEGER", true, 2,
         "The index of the group containing the label string.\n Use 0 to indicate that this field is not used.", null);

    static CommandOption.Integer dataGroup = new CommandOption.Integer
        (BulkLoader.class, "data", "INTEGER", true, 3,
         "The index of the group containing the data.", null);

    static CommandOption.Integer pruneCount = new CommandOption.Integer
        (BulkLoader.class, "prune-count", "N", false, 0,
         "Reduce features to those that occur more than N times.", null);

    static CommandOption.Double docProportionCutoff = new CommandOption.Double
        (BulkLoader.class, "prune-doc-frequency", "N", false, 1.0,
         "Remove features that occur in more than (X*100)% of documents. 0.05 is equivalent to IDF of 3.0.", null);

    /**
     * Builds a {@link CsvIterator} over the given reader using the configured
     * line regex and group indices. Shared by both passes so the two methods
     * cannot drift apart in how they parse the input.
     *
     * @param input source of raw input lines (one instance per line)
     * @return an iterator producing raw string instances
     */
    private static CsvIterator newLineIterator(Reader input) {
        return new CsvIterator(input,
                               lineRegex.value,
                               dataGroup.value,
                               labelGroup.value,
                               nameGroup.value);
    }

    /**
     * Read the data from inputFile and add to {@code prunedTokenizer}'s
     * stoplist all words that do not occur pruneCount.value times or more,
     * and/or words that occur in more than docProportionCutoff.value of
     * documents.
     *
     * @param prunedTokenizer the tokenizer that will be used to write instances;
     *                        its stoplist is extended in place
     * @throws IOException if the input file cannot be read
     */
    public static void generateStoplist(SimpleTokenizer prunedTokenizer)
        throws IOException {

        Alphabet alphabet = new Alphabet();

        // Clone the tokenizer so this counting pass does not disturb the
        // instance (and stoplist state) used later by writeInstanceList.
        SimpleTokenizer tokenizer = prunedTokenizer.deepClone();
        FeatureCountPipe featureCounter = new FeatureCountPipe(alphabet, null);
        FeatureDocFreqPipe docCounter = new FeatureDocFreqPipe(alphabet, null);

        // Was a raw ArrayList; parameterize to avoid unchecked warnings.
        ArrayList<Pipe> pipes = new ArrayList<Pipe>();
        if (! preserveCase.value) {
            pipes.add(new CharSequenceLowercase());
        }
        pipes.add(tokenizer);
        pipes.add(new StringList2FeatureSequence(alphabet));
        // Only attach the counting pipes that are actually needed.
        if (pruneCount.value > 0) {
            pipes.add(featureCounter);
        }
        if (docProportionCutoff.value < 1.0) {
            pipes.add(docCounter);
        }

        Pipe serialPipe = new SerialPipes(pipes);

        // NOTE(review): FileReader uses the platform default charset; if input
        // is known to be UTF-8, an explicit InputStreamReader would be safer —
        // left unchanged to preserve existing behavior.
        // try-with-resources fixes the original's unclosed-reader leak.
        try (FileReader fileReader = new FileReader(inputFile.value)) {
            Iterator<Instance> iterator =
                serialPipe.newIteratorFrom(newLineIterator(fileReader));

            // We aren't really interested in the instances themselves,
            // just the side effect of accumulating feature counts.
            int count = 0;
            while (iterator.hasNext()) {
                count++;
                if (count % 100000 == 0) {
                    System.out.println(count); // progress indicator
                }
                iterator.next();
            }
        }

        if (pruneCount.value > 0) {
            featureCounter.addPrunedWordsToStoplist(prunedTokenizer, pruneCount.value);
        }
        if (docProportionCutoff.value < 1.0) {
            docCounter.addPrunedWordsToStoplist(prunedTokenizer, docProportionCutoff.value);
        }
    }

    /**
     * Read the data from inputFile, pipe it through lowercasing (unless
     * disabled), the given tokenizer, and feature-sequence conversion, and
     * serialize the resulting {@link InstanceList} to outputFile.
     *
     * @param prunedTokenizer tokenizer (with any pruned-word stoplist applied)
     * @throws IOException if the input file cannot be read
     */
    public static void writeInstanceList(SimpleTokenizer prunedTokenizer)
        throws IOException {

        Alphabet alphabet = new Alphabet();

        ArrayList<Pipe> pipes = new ArrayList<Pipe>();
        if (! preserveCase.value) {
            pipes.add(new CharSequenceLowercase());
        }
        pipes.add(prunedTokenizer);
        pipes.add(new StringList2FeatureSequence(alphabet));

        Pipe serialPipe = new SerialPipes(pipes);

        InstanceList instances = new InstanceList(serialPipe);
        // try-with-resources fixes the original's unclosed-reader leak.
        try (FileReader fileReader = new FileReader(inputFile.value)) {
            instances.addThruPipe(newLineIterator(fileReader));
        }
        instances.save(outputFile.value);
    }

    /**
     * Entry point: parses command-line options, selects the tokenizer's
     * stoplist (explicit file &gt; default English list &gt; empty), optionally
     * runs a counting pass to extend the stoplist with pruned words, then
     * writes the instance list.
     */
    public static void main (String[] args) throws IOException {

        // Process the command-line options
        CommandOption.setSummary (BulkLoader.class,
                                  "Efficient tool for importing large amounts of text into Mallet format");
        CommandOption.process (BulkLoader.class, args);

        SimpleTokenizer tokenizer;
        if (stoplistFile.value != null) {
            // An explicit stoplist file overrides --remove-stopwords.
            tokenizer = new SimpleTokenizer(stoplistFile.value);
        }
        else if (removeStopWords.value) {
            tokenizer = new SimpleTokenizer(SimpleTokenizer.USE_DEFAULT_ENGLISH_STOPLIST);
        }
        else {
            tokenizer = new SimpleTokenizer(SimpleTokenizer.USE_EMPTY_STOPLIST);
        }

        // First pass (feature counting) only when some pruning is requested.
        if (pruneCount.value > 0 || docProportionCutoff.value < 1.0) {
            generateStoplist(tokenizer);
        }

        writeInstanceList(tokenizer);
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy