
cc.mallet.util.Replacer Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of jcore-mallet-2.0.9 Show documentation
Show all versions of jcore-mallet-2.0.9 Show documentation
MALLET is a Java-based package for statistical natural language processing, document classification, clustering, topic modeling, information extraction, and other machine learning applications to text.
The newest version!
package cc.mallet.util;
import cc.mallet.types.*;
import cc.mallet.pipe.*;
import cc.mallet.pipe.iterator.*;
import java.util.*;
import java.util.logging.*;
import java.io.*;
/**
 * Command-line tool that replaces and deletes n-grams as specified in
 * configuration files.
 *
 * <p>Input: tab-delimited text, one instance per line, parsed with the
 * {@code line-regex} option (by default three tab-separated fields, with
 * group 1 used as the name, group 2 as the label and group 3 as the data).
 *
 * <p>Output: one line per input instance, written as
 * {@code name <tab> label <tab> data}, where the instance has been passed
 * through an {@link NGramPreprocessor} loaded with the configured
 * replacement and deletion files.
 */
public class Replacer {

    protected static Logger logger = MalletLogger.getLogger(Replacer.class.getName());

    static CommandOption.SpacedStrings inputFiles = new CommandOption.SpacedStrings
        (Replacer.class, "input", "FILE [FILE ...]", true, null,
         "The file(s) containing data, one instance per line", null);

    static CommandOption.File outputFile = new CommandOption.File
        (Replacer.class, "output", "FILE", true, new File("mallet.data"),
         "Write the strings with replacements applied to this file", null);

    static CommandOption.SpacedStrings replacementFiles = new CommandOption.SpacedStrings
        (Replacer.class, "replacement-files", "FILE [FILE ...]", true, null,
         "files containing string replacements, one per line:\n 'A B [tab] C' replaces A B with C,\n 'A B' replaces A B with A_B", null);

    static CommandOption.SpacedStrings deletionFiles = new CommandOption.SpacedStrings
        (Replacer.class, "deletion-files", "FILE [FILE ...]", true, null,
         "files containing strings to delete after replacements but before tokenization (ie multiword stop terms)", null);

    static CommandOption.String lineRegex = new CommandOption.String
        (Replacer.class, "line-regex", "REGEX", true, "^([^\\t]*)\\t([^\\t]*)\\t(.*)",
         "Regular expression containing regex-groups for label, name and data.", null);

    static CommandOption.Integer nameGroup = new CommandOption.Integer
        (Replacer.class, "name", "INTEGER", true, 1,
         "The index of the group containing the instance name.\n Use 0 to indicate that this field is not used.", null);

    static CommandOption.Integer labelGroup = new CommandOption.Integer
        (Replacer.class, "label", "INTEGER", true, 2,
         "The index of the group containing the label string.\n Use 0 to indicate that this field is not used.", null);

    static CommandOption.Integer dataGroup = new CommandOption.Integer
        (Replacer.class, "data", "INTEGER", true, 3,
         "The index of the group containing the data.", null);

    /**
     * Entry point: parses the command-line options, loads the replacement and
     * deletion files into an {@link NGramPreprocessor}, then streams every
     * instance of every input file through it and writes the result to the
     * output file.
     *
     * @param args command-line arguments, handled by {@link CommandOption#process}
     * @throws IllegalArgumentException if no input files were specified
     * @throws Exception if option processing, loading a configuration file,
     *         or reading/writing a data file fails
     */
    public static void main (String[] args) throws Exception {
        // Process the command-line options
        CommandOption.setSummary (Replacer.class,
                                  "Tool for modifying text with n-gram preprocessing");
        CommandOption.process (Replacer.class, args);

        // The "input" option defaults to null; fail with a clear message
        // before creating (and truncating) the output file.
        if (inputFiles.value == null) {
            throw new IllegalArgumentException(
                "No input files were specified (option \"input\")");
        }

        NGramPreprocessor preprocessor = new NGramPreprocessor();

        if (replacementFiles.value != null) {
            for (String filename: replacementFiles.value) {
                System.out.println("including replacements from " + filename);
                preprocessor.loadReplacements(filename);
            }
        }

        if (deletionFiles.value != null) {
            for (String filename: deletionFiles.value) {
                System.out.println("including deletions from " + filename);
                preprocessor.loadDeletions(filename);
            }
        }

        PrintWriter out = new PrintWriter(outputFile.value);
        try {
            for (String filename: inputFiles.value) {
                logger.info("Loading " + filename);
                writeProcessedInstances(filename, preprocessor, out);
            }
        } finally {
            // Always flush and release the output file, even if an input fails.
            out.close();
        }
    }

    /**
     * Reads one input file, runs each instance through the preprocessor and
     * writes it to {@code out} as a tab-separated name/label/data line.
     * The input file is closed before returning, whether or not an error occurred.
     */
    private static void writeProcessedInstances (String filename,
                                                 NGramPreprocessor preprocessor,
                                                 PrintWriter out) throws IOException {
        FileReader fileReader = new FileReader(filename);
        try {
            CsvIterator reader = new CsvIterator(fileReader,
                                                 lineRegex.value,
                                                 dataGroup.value,
                                                 labelGroup.value,
                                                 nameGroup.value);

            Iterator<Instance> iterator = preprocessor.newIteratorFrom(reader);

            int count = 0;

            // We're not saving the instance list, just writing to the out file.
            // next() must be called exactly once per iteration: a second call
            // would drop every other instance and could run past the end.
            while (iterator.hasNext()) {
                Instance instance = iterator.next();

                out.println(instance.getName() + "\t" + instance.getTarget() + "\t" + instance.getData());

                count++;
                if (count % 10000 == 0) {
                    logger.info("instance " + count);
                }
            }
        } finally {
            fileReader.close();
        }
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy