
cc.mallet.regression.tui.RegressionImporter Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of mallet Show documentation
Show all versions of mallet Show documentation
MALLET is a Java-based package for statistical natural language processing,
document classification, clustering, topic modeling, information extraction,
and other machine learning applications to text.
package cc.mallet.regression.tui;
import java.io.*;
import java.util.*;
import java.util.regex.*;
import cc.mallet.pipe.*;
import cc.mallet.pipe.iterator.*;
import cc.mallet.types.*;
import cc.mallet.util.*;
/** Load data suitable for linear and Poisson regression */
public class RegressionImporter {
static CommandOption.File inputFile = new CommandOption.File
(RegressionImporter.class, "input", "FILE", true, null,
"The file containing data to be classified, one instance per line", null);
static CommandOption.File outputFile = new CommandOption.File
(RegressionImporter.class, "output", "FILE", true, new File("text.vectors"),
"Write the instance list to this file; Using - indicates stdout.", null);
static CommandOption.String lineRegex = new CommandOption.String
(RegressionImporter.class, "line-regex", "REGEX", true, "^\\s*(\\S*)[\\s,]*(.*)$",
"Regular expression containing regex-groups for response, variables and name fields.\n" +
"Default is response followed by explanatory variables, with no instance name.", null);
static CommandOption.Integer labelOption = new CommandOption.Integer
(RegressionImporter.class, "response", "INTEGER", true, 1,
"The index of the group containing the response variables.\n" +
" Use 0 to indicate that the label field is not used.", null);
static CommandOption.Integer nameOption = new CommandOption.Integer
(RegressionImporter.class, "name", "INTEGER", true, 0,
"The index of the group containing the instance name.\n" +
" Use 0 to indicate that the name field is not used.", null);
static CommandOption.Integer dataOption = new CommandOption.Integer
(RegressionImporter.class, "data", "INTEGER", true, 2,
"The index of the group containing the explanatory variables.", null);
static CommandOption.Boolean integerResponse = new CommandOption.Boolean
(RegressionImporter.class, "integer-response", "[TRUE|FALSE]", false, false,
"If true, interpret the response variable as an integer rather\n" +
" than a double precision real number. Use for Poisson regression.", null);
static CommandOption.Boolean useFeatureValuePairs = new CommandOption.Boolean
(RegressionImporter.class, "use-feature-value-pairs", "[TRUE|FALSE]", false, false,
"If true, process the data field as a series of \"feature=value\" pairs rather\n" +
" than an ordered sequence of variables. Useful when most variables are 0.", null);
static CommandOption.SpacedStrings fieldNames = new CommandOption.SpacedStrings
(RegressionImporter.class, "field-names", "[A B C ...]", false, null,
"Use this option to specify names for the explanatory variables\n" +
" when you are not using feature name/value pairs", null);
public static void main (String[] args) throws IOException {
// Process the command-line options
CommandOption.setSummary (RegressionImporter.class,
"A tool for importing data suitable for linear and Poisson regression");
CommandOption.process (RegressionImporter.class, args);
ArrayList pipeList = new ArrayList();
if (useFeatureValuePairs.value) {
// Break data string into tokens
pipeList.add(new FeatureValueString2FeatureVector());
}
else if (fieldNames.value != null) {
// Break the data into an array on whitespace and then convert
// that to a feature vector, using user-specified names.
pipeList.add(new ValueString2FeatureVector(fieldNames.value));
}
else {
// Break the data into an array on whitespace and then convert
// that to a feature vector.
pipeList.add(new ValueString2FeatureVector());
}
// Convert the target to a numeric value,
// which should be an integer for Poisson regression.
if (integerResponse.value) {
pipeList.add(new Target2Integer());
}
else {
pipeList.add(new Target2Double());
}
// Create an iterator that will create an instance from
// each line in a file by breaking the line into
// [NAME] [LABEL] [feature=value feature=value ...]
CsvIterator reader =
new CsvIterator(new FileReader(inputFile.value),
lineRegex.value,
dataOption.value, labelOption.value, nameOption.value);
// Construct a new instance list, passing it the pipe
// we want to use to process instances.
InstanceList instances = new InstanceList( new SerialPipes(pipeList) );
// Now process each instance provided by the iterator.
instances.addThruPipe(reader);
System.out.println(instances.getDataAlphabet());
instances.save(outputFile.value);
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy