cc.mallet.topics.tui.DMRLoader Maven / Gradle / Ivy

Go to download

Show more of this group Show more artifacts with this name
Show all versions of mallet Show documentation

MALLET is a Java-based package for statistical natural language processing, document classification, clustering, topic modeling, information extraction, and other machine learning applications to text.

There is a newer version: 2.0.12

Show newest version

package cc.mallet.topics.tui;

import cc.mallet.classify.*;
import cc.mallet.types.*;
import cc.mallet.pipe.*;

import java.util.logging.*;
import java.util.*;
import java.util.zip.*;
import java.io.*;

import gnu.trove.*;

/**
 *  This class loads data into the format for the MALLET 
 *   Dirichlet-multinomial regression (DMR). DMR topic models
 *  learn topic assignments conditioned on observed features.
 *  
 *  The input format consists of two files, one for text and
 *   the other for features. The "text" file consists of one document
 *   per line. This class will tokenize and remove stopwords.
 *  
 *  The "features" file contains whitespace-delimited features in this format:
 *    blue heavy width=12.08
 *  Features without explicit values ("blue" and "heavy" in the example) are set to 1.0.
 */

public class DMRLoader implements Serializable {

	/**
	 *  Opens a buffered character reader on <code>file</code>. If the file
	 *   name ends in ".gz" the contents are decompressed on the fly.
	 *   In both cases bytes are decoded with the platform default charset.
	 *
	 *  @param file the plain or gzip-compressed text file to read
	 *  @return a reader on the file contents; the caller is responsible for closing it
	 *  @throws IOException if the file cannot be opened, or if a ".gz" file
	 *   cannot be read as gzip data
	 */
	public static BufferedReader openReader(File file) throws IOException {
		if (! file.toString().endsWith(".gz")) {
			return new BufferedReader(new FileReader(file));
		}

		InputStream fileStream = new FileInputStream(file);
		boolean opened = false;
		try {
			// The GZIPInputStream constructor can throw; without the finally
			// below the already-open file handle would be leaked in that case.
			BufferedReader reader =
				new BufferedReader(new InputStreamReader(new GZIPInputStream(fileStream)));
			opened = true;
			return reader;
		} finally {
			if (! opened) {
				closeQuietly(fileStream);
			}
		}
	}

	/**
	 *  Reads the words file and the features file in parallel, one line from
	 *   each per document, passes the resulting instances through the import
	 *   pipe and serializes the finished <code>InstanceList</code> to
	 *   <code>instancesFile</code>.
	 *
	 *  A document whose features line is empty is skipped. Each remaining
	 *   instance is created with its 1-based line number in the input files
	 *   (as a string) as the third constructor argument, so an instance can
	 *   be traced back to its source line even when earlier lines were skipped.
	 *
	 *  @param wordsFile     one document per line (optionally gzip-compressed)
	 *  @param featuresFile  one line of features per document, aligned with
	 *   <code>wordsFile</code> (optionally gzip-compressed)
	 *  @param instancesFile destination for the serialized instance list
	 *  @throws FileNotFoundException if a file cannot be opened
	 *  @throws IOException on any read or write failure, or if the features
	 *   file has fewer lines than the words file
	 */
	public void load(File wordsFile, File featuresFile, File instancesFile) throws IOException, FileNotFoundException {

		// Pipes are applied to every instance in the order listed here.
		Pipe instancePipe =
			new SerialPipes (new Pipe[] {
					(Pipe) new TargetStringToFeatures(),
					(Pipe) new CharSequence2TokenSequence(),
					(Pipe) new TokenSequenceLowercase(),
					(Pipe) new TokenSequenceRemoveStopwords(false, false),
					(Pipe) new TokenSequence2FeatureSequence()
				});

		InstanceList instances = new InstanceList (instancePipe);

		ArrayList instanceBuffer = new ArrayList();

		BufferedReader wordsReader = null;
		BufferedReader featuresReader = null;

		try {
			wordsReader = openReader(wordsFile);
			featuresReader = openReader(featuresFile);

			int lineNumber = 0;
			String wordsLine = null;
			String featuresLine = null;

			while ((wordsLine = wordsReader.readLine()) != null) {
				// Count every line that is read, including ones skipped below,
				// so the number stored with an instance is its real line number.
				lineNumber++;

				if ((featuresLine = featuresReader.readLine()) == null) {
					// Report the mismatch to the caller instead of terminating
					// the whole JVM (with a success status) from library code.
					throw new IOException("ran out of features: " + featuresFile
										  + " has no line " + lineNumber
										  + " to match " + wordsFile);
				}

				if (featuresLine.equals("")) { continue; }

				instanceBuffer.add(new Instance(wordsLine, featuresLine, String.valueOf(lineNumber), null));
			}
		} finally {
			// Both inputs are read-only, so a failure while closing them
			// cannot lose data and must not mask an earlier exception.
			closeQuietly(featuresReader);
			closeQuietly(wordsReader);
		}

		instances.addThruPipe(instanceBuffer.iterator());

		OutputStream fileOut = new FileOutputStream(instancesFile);
		try {
			ObjectOutputStream oos =
				new ObjectOutputStream(new BufferedOutputStream(fileOut));
			oos.writeObject(instances);
			// Closing flushes the buffered data; a failure here means the
			// output is incomplete, so it is allowed to propagate.
			oos.close();
		} finally {
			// No-op after a successful close; releases the file handle if
			// constructing the stream or writing the object failed.
			closeQuietly(fileOut);
		}
	}

	/**
	 *  Closes <code>resource</code> if it is non-null, ignoring any
	 *   <code>IOException</code> raised by the close itself.
	 */
	private static void closeQuietly(Closeable resource) {
		if (resource == null) {
			return;
		}
		try {
			resource.close();
		} catch (IOException ignored) {
			// Only used on clean-up paths where the close result does not matter.
		}
	}

	/**
	 *  Command-line entry point.
	 *  Usage: DMRLoader [words file] [features file] [instances file]
	 *
	 *  @param args exactly three paths: words file, features file, output instances file
	 *  @throws FileNotFoundException if a file cannot be opened
	 *  @throws IOException if reading the inputs or writing the output fails
	 */
	public static void main (String[] args) throws FileNotFoundException, IOException {

		if (args.length != 3) {
			System.err.println("Usage: DMRLoader [words file] [features file] [instances file]");
			// Wrong usage is an error; signal it with a non-zero exit status.
			System.exit(1);
		}

		File wordsFile = new File(args[0]);
		File featuresFile = new File(args[1]);
		File instancesFile = new File(args[2]);

		DMRLoader loader = new DMRLoader();
		loader.load(wordsFile, featuresFile, instancesFile);

	}

	private static final long serialVersionUID = 1;
	private static final int CURRENT_SERIAL_VERSION = 0;
}