
cc.mallet.util.DBBulkLoader Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of jcore-mallet-2.0.9 Show documentation
Show all versions of jcore-mallet-2.0.9 Show documentation
MALLET is a Java-based package for statistical natural language processing, document classification, clustering, topic modeling, information extraction, and other machine learning applications to text.
The newest version!
package cc.mallet.util;
import cc.mallet.types.*;
import cc.mallet.pipe.*;
import cc.mallet.pipe.iterator.*;
import java.util.*;
import java.util.logging.*;
import java.io.*;
/**
 * Command-line tool that bulk-loads text instances into a database-backed
 * instance store.
 *
 * <p>Every input file is read with a {@link CsvIterator} using a three-field,
 * tab-separated line pattern. The instances are sent through a pipe sequence
 * (an {@link NGramPreprocessor} followed by a tokenizer) and handed to a
 * {@link DBInstanceStore} opened on the configured output database.
 *
 * <p>Two modes are supported by {@link #main(String[])}:
 * <ul>
 *   <li>with a vocabulary file: a fixed alphabet is loaded and a
 *       {@link FixedVocabTokenizer} is used;</li>
 *   <li>without one: a {@link SimpleTokenizer} (optionally seeded from a
 *       stoplist file and extended by count-based pruning) builds a new
 *       alphabet.</li>
 * </ul>
 */
public class DBBulkLoader {

    protected static Logger logger = MalletLogger.getLogger(DBBulkLoader.class.getName());

    /** Line pattern shared by every reader: three tab-separated fields. */
    private static final String LINE_REGEX = "(.*?)\\t(.*?)\\t(.*)";

    // Regex group indices handed to CsvIterator, in its constructor order
    // (data group, target group, name/URI group).
    private static final int DATA_GROUP = 3;
    private static final int TARGET_GROUP = 2;
    private static final int NAME_GROUP = 1;

    /** How many instances are read between two progress lines on stdout. */
    private static final int PROGRESS_INTERVAL = 100000;

    static CommandOption.SpacedStrings inputFiles = new CommandOption.SpacedStrings
        (DBBulkLoader.class, "input", "FILE [FILE ...]", true, null,
         "The file containing data, one instance per line", null);

    static CommandOption.String outputDatabase = new CommandOption.String
        (DBBulkLoader.class, "output", "STRING", true, "mallet-db",
         "Write the instance list to this database", null);

    // NOTE(review): declared but never read anywhere in this class.
    static CommandOption.Boolean preserveCase = new CommandOption.Boolean
        (DBBulkLoader.class, "preserve-case", "[TRUE|FALSE]", false, false,
         "If true, do not force all strings to lowercase.", null);

    static CommandOption.File vocabularyFile = new CommandOption.File
        (DBBulkLoader.class, "vocabulary", "FILE", true, null,
         "Read newline-separated words from this file.", null);

    static CommandOption.SpacedStrings replacementFiles = new CommandOption.SpacedStrings
        (DBBulkLoader.class, "replacement-files", "FILE [FILE ...]", true, null,
         "files containing string replacements, one per line:\n 'A B [tab] C' replaces A B with C,\n 'A B' replaces A B with A_B", null);

    static CommandOption.SpacedStrings deletionFiles = new CommandOption.SpacedStrings
        (DBBulkLoader.class, "deletion-files", "FILE [FILE ...]", true, null,
         "files containing strings to delete after replacements but before tokenization (ie multiword stop terms)", null);

    static CommandOption.File stoplistFile = new CommandOption.File
        (DBBulkLoader.class, "stoplist", "FILE", true, null,
         "Read newline-separated words from this file and remove them from text.", null);

    // NOTE(review): declared but never read anywhere in this class.
    static CommandOption.Boolean keepSequence = new CommandOption.Boolean
        (DBBulkLoader.class, "keep-sequence", "[TRUE|FALSE]", false, true,
         "If true, final data will be a FeatureSequence rather than a FeatureVector.", null);

    static CommandOption.Integer pruneCount = new CommandOption.Integer
        (DBBulkLoader.class, "prune-count", "N", false, 0,
         "Reduce features to those that occur more than N times.", null);

    /**
     * Makes a counting pass over all files in {@code inputFiles} and then asks
     * the feature counter to add pruned words to the stoplist of
     * {@code prunedTokenizer}, using {@code pruneCount.value} as the threshold.
     * The exact threshold semantics are those of
     * {@code FeatureCountPipe.addPrunedWordsToStoplist}.
     *
     * <p>The counting pipeline works on a deep clone of the tokenizer; the
     * original tokenizer is only touched by the final stoplist update.
     *
     * @param prunedTokenizer the tokenizer that will later be used to write
     *                        instances; it receives the pruned words
     * @param preprocessor    the preprocessing pipe placed in front of the
     *                        tokenizer during counting
     * @throws IOException if an input file cannot be opened, read or closed
     */
    public static void generateStoplist(SimpleTokenizer prunedTokenizer, NGramPreprocessor preprocessor)
        throws IOException {

        Alphabet alphabet = new Alphabet();
        FeatureCountPipe featureCounter = new FeatureCountPipe(alphabet, null);

        ArrayList<Pipe> pipes = new ArrayList<Pipe>();
        pipes.add(preprocessor);
        pipes.add(prunedTokenizer.deepClone());
        pipes.add(new StringList2FeatureSequence(alphabet));
        pipes.add(featureCounter);
        Pipe serialPipe = new SerialPipes(pipes);

        for (String filename : inputFiles.value) {
            logger.info("pruning from " + filename);

            Reader input = new FileReader(filename);
            try {
                CsvIterator reader =
                    new CsvIterator(input, LINE_REGEX, DATA_GROUP, TARGET_GROUP, NAME_GROUP);
                Iterator<?> iterator = serialPipe.newIteratorFrom(reader);

                // The instances themselves are discarded; pulling them through
                // the pipeline is what accumulates the feature counts.
                int count = 0;
                while (iterator.hasNext()) {
                    count++;
                    if (count % PROGRESS_INTERVAL == 0) {
                        System.out.println(count);
                    }
                    iterator.next();
                }
            } finally {
                // Release the file handle even when the pipeline throws.
                input.close();
            }
        }

        featureCounter.addPrunedWordsToStoplist(prunedTokenizer, pruneCount.value);
    }

    /**
     * Pipes every file in {@code inputFiles} through the given pipes and saves
     * the resulting instances, followed by the data and target alphabets of
     * the pipeline, to a {@link DBInstanceStore} opened on
     * {@code outputDatabase.value}.
     *
     * @param pipes the pipes to chain, in order, into one {@link SerialPipes}
     *              (the raw parameter type is kept for source compatibility)
     * @throws Exception if a file cannot be read or the store reports a failure
     */
    public static void writeInstanceList(ArrayList pipes) throws Exception {
        Pipe serialPipe = new SerialPipes(pipes);
        DBInstanceStore saver = new DBInstanceStore(outputDatabase.value);

        for (String filename : inputFiles.value) {
            logger.info("importing from " + filename);

            Reader input = new FileReader(filename);
            try {
                CsvIterator reader =
                    new CsvIterator(input, LINE_REGEX, DATA_GROUP, TARGET_GROUP, NAME_GROUP);
                saver.saveInstances(serialPipe.newIteratorFrom(reader));
            } finally {
                // Release the file handle once this file has been handed to the store.
                input.close();
            }
        }

        saver.saveAlphabets(serialPipe.getDataAlphabet(), serialPipe.getTargetAlphabet());
        saver.cleanup();
    }

    /**
     * Entry point: parses the command-line options, loads replacement and
     * deletion files into the preprocessor, then imports the input files with
     * either a fixed vocabulary or a freshly built alphabet.
     *
     * @param args command-line arguments, processed by {@link CommandOption}
     * @throws IllegalArgumentException if no input files were specified
     * @throws Exception on any I/O or database failure
     */
    public static void main (String[] args) throws Exception {
        logger.info("starting");

        // Process the command-line options
        CommandOption.setSummary (DBBulkLoader.class,
            "Efficient tool for importing large amounts of text and saving to an embedded Java database");
        CommandOption.process (DBBulkLoader.class, args);

        // Fail with a readable message instead of a bare NullPointerException
        // from the input-file loops further down.
        if (inputFiles.value == null) {
            throw new IllegalArgumentException(
                "No input files specified: the \"input\" option is required");
        }

        NGramPreprocessor preprocessor = new NGramPreprocessor();

        if (replacementFiles.value != null) {
            for (String filename : replacementFiles.value) {
                System.out.println("including replacements from " + filename);
                preprocessor.loadReplacements(filename);
            }
        }

        if (deletionFiles.value != null) {
            for (String filename : deletionFiles.value) {
                System.out.println("including deletions from " + filename);
                preprocessor.loadDeletions(filename);
            }
        }

        ArrayList<Pipe> pipes = new ArrayList<Pipe>();
        pipes.add(preprocessor);

        if (vocabularyFile.value != null) {
            // Fixed-vocabulary mode: the alphabet is frozen before tokenizing.
            Alphabet alphabet = AlphabetFactory.loadFromFile(vocabularyFile.value);
            alphabet.stopGrowth();
            logger.info("loaded alphabet of size " + alphabet.size());

            pipes.add(new FixedVocabTokenizer(alphabet));
        }
        else {
            // Open-vocabulary mode: optionally extend the stoplist with pruned
            // words before the real import pass.
            SimpleTokenizer tokenizer = new SimpleTokenizer(stoplistFile.value);
            if (pruneCount.value > 0) {
                generateStoplist(tokenizer, preprocessor);
            }

            pipes.add(tokenizer);
            pipes.add(new StringList2FeatureSequence(new Alphabet()));
        }

        writeInstanceList(pipes);
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy