/*
 * cc.mallet.util.BulkLoader -- part of the "mallet" artifact (available via Maven / Gradle / Ivy).
 *
 * MALLET is a Java-based package for statistical natural language processing, document classification, clustering, topic modeling, information extraction, and other machine learning applications to text.
 *
 * Note: this listing is not from the newest release; a newer version (2.0.12) is available.
 */
package cc.mallet.util;

import cc.mallet.types.*;
import cc.mallet.pipe.*;
import cc.mallet.pipe.iterator.*;

import java.util.*;
import java.io.*;

/**
 *  Command-line tool that reads through a single file, breaking each line
 *   into data and (optional) name and label fields, and saves the
 *   resulting instance list to the output file.
 *
 *  <p>The input file is read in up to two passes: an optional counting pass
 *   ({@link #generateStoplist}) that is run only when a pruning option is
 *   set, followed by the pass that builds and saves the instances
 *   ({@link #writeInstanceList}).
 */

public class BulkLoader {

	static CommandOption.File inputFile = new CommandOption.File
		(BulkLoader.class, "input", "FILE", true, null,
		 "The file containing data, one instance per line", null);

	static CommandOption.File outputFile = new CommandOption.File
		(BulkLoader.class, "output", "FILE", true, new File("mallet.data"),
		 "Write the instance list to this file", null);

	static CommandOption.Boolean preserveCase = new CommandOption.Boolean
		(BulkLoader.class, "preserve-case", "[TRUE|FALSE]", false, false,
		 "If true, do not force all strings to lowercase.", null);

	static CommandOption.Boolean removeStopWords = new CommandOption.Boolean
		(BulkLoader.class, "remove-stopwords", "[TRUE|FALSE]", false, false,
		 "If true, remove common \"stop words\" from the text.\nThis option invokes a minimal English stoplist. ", null);

	static CommandOption.File stoplistFile = new CommandOption.File
		(BulkLoader.class, "stoplist", "FILE", true, null,
		 "Read newline-separated words from this file,\n   and remove them from text. This option overrides\n   the default English stoplist triggered by --remove-stopwords.", null);

	// NOTE(review): this option is declared, but its value is never read
	//  anywhere in this class -- confirm whether it is still meant to have an effect.
	static CommandOption.Boolean keepSequence = new CommandOption.Boolean
		(BulkLoader.class, "keep-sequence", "[TRUE|FALSE]", false, false,
		 "If true, final data will be a FeatureSequence rather than a FeatureVector.", null);

	static CommandOption.String lineRegex = new CommandOption.String
		(BulkLoader.class, "line-regex", "REGEX", true, "^([^\\t]*)\\t([^\\t]*)\\t(.*)",
		 "Regular expression containing regex-groups for label, name and data.", null);

	static CommandOption.Integer nameGroup = new CommandOption.Integer
		(BulkLoader.class, "name", "INTEGER", true, 1,
		 "The index of the group containing the instance name.\n   Use 0 to indicate that this field is not used.", null);

	static CommandOption.Integer labelGroup = new CommandOption.Integer
		(BulkLoader.class, "label", "INTEGER", true, 2,
		 "The index of the group containing the label string.\n   Use 0 to indicate that this field is not used.", null);

	static CommandOption.Integer dataGroup = new CommandOption.Integer
		(BulkLoader.class, "data", "INTEGER", true, 3,
		 "The index of the group containing the data.", null);

	static CommandOption.Integer pruneCount = new CommandOption.Integer
		(BulkLoader.class, "prune-count", "N", false, 0,
		 "Reduce features to those that occur more than N times.", null);

	static CommandOption.Double docProportionCutoff = new CommandOption.Double
		(BulkLoader.class, "prune-doc-frequency", "N", false, 1.0,
		 "Remove features that occur in more than (X*100)% of documents. 0.05 is equivalent to IDF of 3.0.", null);

	/**
	 *  Makes one counting pass over inputFile and then asks the counting
	 *   pipes to add the words they prune to the stoplist of
	 *   <code>prunedTokenizer</code>.
	 *
	 *  <p>The pass itself tokenizes with a deep clone of
	 *   <code>prunedTokenizer</code>; the tokenizer passed in is only handed
	 *   to the counters after the whole file has been read. The total-count
	 *   pipe is used only when pruneCount is greater than 0, and the
	 *   document-frequency pipe only when docProportionCutoff is below 1.0.
	 *
	 *  @param prunedTokenizer the tokenizer that receives the pruned words;
	 *   <code>main</code> later passes the same tokenizer to
	 *   {@link #writeInstanceList}
	 *  @throws IOException if the input file cannot be opened or closed
	 */

	public static void generateStoplist(SimpleTokenizer prunedTokenizer)
		throws IOException {

		boolean pruneByCount = pruneCount.value > 0;
		boolean pruneByDocFrequency = docProportionCutoff.value < 1.0;

		Alphabet alphabet = new Alphabet();
		FeatureCountPipe featureCounter = new FeatureCountPipe(alphabet, null);
		FeatureDocFreqPipe docCounter = new FeatureDocFreqPipe(alphabet, null);

		ArrayList<Pipe> pipes = new ArrayList<Pipe>();
		if (! preserveCase.value) {
			pipes.add(new CharSequenceLowercase());
		}
		// Count with a copy so that the caller's tokenizer is untouched
		//  until the counters update its stoplist below.
		pipes.add(prunedTokenizer.deepClone());
		pipes.add(new StringList2FeatureSequence(alphabet));
		if (pruneByCount) {
			pipes.add(featureCounter);
		}
		if (pruneByDocFrequency) {
			pipes.add(docCounter);
		}

		Pipe serialPipe = new SerialPipes(pipes);

		Reader input = new FileReader(inputFile.value);
		try {
			CsvIterator reader = new CsvIterator(input,
												 lineRegex.value,
												 dataGroup.value,
												 labelGroup.value,
												 nameGroup.value);

			Iterator<Instance> iterator = serialPipe.newIteratorFrom(reader);

			int count = 0;

			// We aren't really interested in the instance itself,
			//  just the total feature counts.
			while (iterator.hasNext()) {
				count++;
				if (count % 100000 == 0) {
					// Progress report every 100,000 instances.
					System.out.println(count);
				}
				iterator.next();
			}
		}
		finally {
			// The original code never closed the input file.
			input.close();
		}

		if (pruneByCount) {
			featureCounter.addPrunedWordsToStoplist(prunedTokenizer, pruneCount.value);
		}
		if (pruneByDocFrequency) {
			docCounter.addPrunedWordsToStoplist(prunedTokenizer, docProportionCutoff.value);
		}
	}


	/**
	 *  Reads inputFile through a pipeline of (optional) lowercasing,
	 *   <code>prunedTokenizer</code> and a StringList2FeatureSequence pipe,
	 *   then saves the resulting instance list to outputFile.
	 *
	 *  @param prunedTokenizer the tokenizer used to split each line's data
	 *  @throws IOException if the input file cannot be opened or closed
	 */

	public static void writeInstanceList(SimpleTokenizer prunedTokenizer)
		throws IOException {

		Alphabet alphabet = new Alphabet();

		ArrayList<Pipe> pipes = new ArrayList<Pipe>();
		if (! preserveCase.value) {
			pipes.add(new CharSequenceLowercase());
		}
		pipes.add(prunedTokenizer);
		pipes.add(new StringList2FeatureSequence(alphabet));

		Pipe serialPipe = new SerialPipes(pipes);
		InstanceList instances = new InstanceList(serialPipe);

		Reader input = new FileReader(inputFile.value);
		try {
			CsvIterator reader = new CsvIterator(input,
												 lineRegex.value,
												 dataGroup.value,
												 labelGroup.value,
												 nameGroup.value);
			instances.addThruPipe(reader);
		}
		finally {
			// The original code never closed the input file.
			input.close();
		}

		instances.save(outputFile.value);
	}


	/**
	 *  Command-line entry point: processes the options, chooses the
	 *   tokenizer's initial stoplist, runs the counting pass if any pruning
	 *   option is set, and then writes the instance list.
	 *
	 *  @param args the command-line arguments, handed to CommandOption.process
	 *  @throws IOException if reading the input file fails
	 *  @throws IllegalArgumentException if no input file was specified
	 */

	public static void main (String[] args) throws IOException {

		// Process the command-line options
		CommandOption.setSummary (BulkLoader.class,
								  "Efficient tool for importing large amounts of text into Mallet format");
		CommandOption.process (BulkLoader.class, args);

		// The input option defaults to null; fail with a clear message
		//  rather than a NullPointerException from FileReader.
		if (inputFile.value == null) {
			throw new IllegalArgumentException("No input file specified; use the --input option.");
		}

		// An explicit stoplist file takes precedence over --remove-stopwords.
		SimpleTokenizer tokenizer;

		if (stoplistFile.value != null) {
			tokenizer = new SimpleTokenizer(stoplistFile.value);
		}
		else if (removeStopWords.value) {
			tokenizer = new SimpleTokenizer(SimpleTokenizer.USE_DEFAULT_ENGLISH_STOPLIST);
		}
		else {
			tokenizer = new SimpleTokenizer(SimpleTokenizer.USE_EMPTY_STOPLIST);
		}

		// The extra counting pass is only needed when some pruning was requested.
		if (pruneCount.value > 0 || docProportionCutoff.value < 1.0) {
			generateStoplist(tokenizer);
		}

		writeInstanceList(tokenizer);
	}

}