All downloads are FREE. The search and download functionality uses the official Maven repository.

cc.mallet.topics.tui.DMRLoader Maven / Gradle / Ivy

Go to download

MALLET is a Java-based package for statistical natural language processing, document classification, clustering, topic modeling, information extraction, and other machine learning applications to text.

There is a newer version: 2.0.12
Show newest version
package cc.mallet.topics.tui;

import cc.mallet.classify.*;
import cc.mallet.types.*;
import cc.mallet.pipe.*;

import java.util.logging.*;
import java.util.*;
import java.util.zip.*;
import java.io.*;

import gnu.trove.*;

/**
 *  This class loads data into the format for the MALLET
 *  Dirichlet-multinomial regression (DMR). DMR topic models
 *  learn topic assignments conditioned on observed features.
 *  <p>
 *  The input format consists of two files, one for text and
 *  the other for features. The "text" file consists of one document
 *  per line. This class will tokenize and remove stopwords.
 *  <p>
 *  The "features" file contains whitespace-delimited features in this format:
 *  <code>blue heavy width=12.08</code>
 *  Features without explicit values ("blue" and "heavy" in the example) are set to 1.0.
 *  <p>
 *  Line <i>i</i> of the features file describes line <i>i</i> of the text file.
 *  A document whose features line is empty is dropped.
 */
public class DMRLoader implements Serializable {

	/**
	 *  Opens a file for line-oriented reading, transparently decompressing
	 *  it when its name ends in <code>.gz</code>.
	 *  <p>
	 *  Characters are decoded with the platform default charset in both
	 *  cases. The caller owns the returned reader and must close it.
	 *
	 *  @param file the plain-text or gzip-compressed file to read
	 *  @return a buffered reader positioned at the start of the file
	 *  @throws IOException if the file cannot be opened or is not valid gzip data
	 */
	public static BufferedReader openReader(File file) throws IOException {
		if (! file.toString().endsWith(".gz")) {
			return new BufferedReader(new FileReader(file));
		}

		FileInputStream rawStream = new FileInputStream(file);
		try {
			return new BufferedReader(new InputStreamReader(new GZIPInputStream(rawStream)));
		} catch (IOException e) {
			// GZIPInputStream reads the header in its constructor; if that fails
			// nobody else holds the raw stream, so release it here.
			closeQuietly(rawStream);
			throw e;
		}
	}

	/**
	 *  Reads parallel text and features files, runs every retained document
	 *  through the tokenizing pipe, and serializes the resulting
	 *  <code>InstanceList</code> to <code>instancesFile</code>.
	 *  <p>
	 *  Each instance is built from the text line and the features line of the
	 *  same input row. Rows whose features line is empty are skipped. Instance
	 *  names are a running count, starting at 1, of the rows that were kept
	 *  (skipped rows do not advance the count).
	 *
	 *  @param wordsFile     one document per line, optionally gzip-compressed
	 *  @param featuresFile  one whitespace-delimited feature list per line, optionally gzip-compressed
	 *  @param instancesFile destination for the serialized instance list
	 *  @throws IOException if a file cannot be read or written, or if the
	 *          features file has fewer lines than the words file
	 */
	public void load(File wordsFile, File featuresFile, File instancesFile) throws IOException, FileNotFoundException {

		Pipe instancePipe =
			new SerialPipes (new Pipe[] {
				(Pipe) new TargetStringToFeatures(),
				(Pipe) new CharSequence2TokenSequence(),
				(Pipe) new TokenSequenceLowercase(),
				(Pipe) new TokenSequenceRemoveStopwords(false, false),
				(Pipe) new TokenSequence2FeatureSequence()
			});

		InstanceList instances = new InstanceList (instancePipe);
		ArrayList<Instance> instanceBuffer = new ArrayList<Instance>();

		BufferedReader wordsReader = null;
		BufferedReader featuresReader = null;
		try {
			wordsReader = openReader(wordsFile);
			featuresReader = openReader(featuresFile);

			int inputRow = 0;      // physical row in the input files, for error messages only
			int lineNumber = 1;    // name given to the next retained instance

			String wordsLine;
			while ((wordsLine = wordsReader.readLine()) != null) {
				inputRow++;

				String featuresLine = featuresReader.readLine();
				if (featuresLine == null) {
					// Fail with an exception rather than terminating the JVM so that
					// embedding applications can recover and the readers still get closed.
					throw new IOException("ran out of features: " + featuresFile
										  + " has no line " + inputRow
										  + " to match " + wordsFile);
				}

				if (featuresLine.equals("")) { continue; }

				instanceBuffer.add(new Instance(wordsLine, featuresLine, String.valueOf(lineNumber), null));
				lineNumber++;
			}
		} finally {
			closeQuietly(wordsReader);
			closeQuietly(featuresReader);
		}

		instances.addThruPipe(instanceBuffer.iterator());

		FileOutputStream fileOut = new FileOutputStream(instancesFile);
		try {
			ObjectOutputStream oos = new ObjectOutputStream(new BufferedOutputStream(fileOut));
			oos.writeObject(instances);
			// Closing the object stream flushes the buffer; a failure here must
			// propagate because it means the output file is incomplete.
			oos.close();
		} finally {
			// No-op after a successful oos.close(); releases the descriptor otherwise.
			fileOut.close();
		}
	}

	/**
	 *  Command-line entry point.
	 *  <p>
	 *  Usage: <code>DMRLoader [words file] [features file] [instances file]</code>.
	 *  Exits with status 1 when the argument count is wrong.
	 *
	 *  @param args the words file, the features file, and the output instances file
	 *  @throws IOException if loading or writing fails
	 */
	public static void main (String[] args) throws FileNotFoundException, IOException {

		if (args.length != 3) {
			System.err.println("Usage: DMRLoader [words file] [features file] [instances file]");
			System.exit(1);
		}

		File wordsFile = new File(args[0]);
		File featuresFile = new File(args[1]);
		File instancesFile = new File(args[2]);

		DMRLoader loader = new DMRLoader();
		loader.load(wordsFile, featuresFile, instancesFile);
	}

	/** Closes an input resource, ignoring close failures; a null argument is allowed. */
	private static void closeQuietly(Closeable resource) {
		if (resource == null) { return; }
		try {
			resource.close();
		} catch (IOException ignored) {
			// Only used for input streams: nothing was written, so a failed
			// close cannot lose data and must not mask an earlier exception.
		}
	}

	private static final long serialVersionUID = 1;
	private static final int CURRENT_SERIAL_VERSION = 0;

}





© 2015 - 2025 Weber Informatics LLC | Privacy Policy