All Downloads are FREE. The search and download functionality uses the official Maven repository.

cc.mallet.util.DocumentLengths Maven / Gradle / Ivy

Go to download

MALLET is a Java-based package for statistical natural language processing, document classification, clustering, topic modeling, information extraction, and other machine learning applications to text.

The newest version!
package cc.mallet.util;

import cc.mallet.types.*;
import java.util.logging.*;
import java.io.*;

/**
 * Command-line tool that loads an instance list from a file and prints the
 * size of each instance's {@link FeatureSequence} to standard output, one
 * value per line.
 */
public class DocumentLengths {

	protected static Logger logger = MalletLogger.getLogger(DocumentLengths.class.getName());

	/**
	 * The "input" command-line option: filename of the instance list to read.
	 * NOTE(review): the null passed after the boolean appears to be the default
	 * value, so this option's value is presumably null when the option is not
	 * supplied -- confirm against CommandOption.String.
	 */
	static cc.mallet.util.CommandOption.String inputFile = new cc.mallet.util.CommandOption.String
		(DocumentLengths.class, "input", "FILENAME", true, null,
		 "Filename for the input instance list", null);

	/**
	 * Processes the command-line options, loads the instance list named by the
	 * "input" option, and prints the size of each instance's data.
	 *
	 * Exits with status 1 if no input file was given, or as soon as an instance
	 * is found whose data is not a FeatureSequence (sizes of earlier instances
	 * have already been printed at that point).
	 *
	 * @param args command-line arguments, handed to CommandOption.process
	 * @throws Exception anything raised while processing options or loading the instance list
	 */
	public static void main(String[] args) throws Exception {

		CommandOption.setSummary (DocumentLengths.class,
								  "Print the length of FeatureSequences in an instance list");
		CommandOption.process (DocumentLengths.class, args);

		// Without this guard a missing option reaches new File(null), which
		// fails with an uninformative NullPointerException.
		if (inputFile.value == null) {
			System.err.println("No input file specified (use --input FILENAME)");
			System.exit(1);
		}

		InstanceList instances = InstanceList.load (new File(inputFile.value));
		for (Instance instance: instances) {
			Object data = instance.getData();

			// Only sequence data has a meaningful length here; anything else
			// (including null data) is reported as an error.
			if (! (data instanceof FeatureSequence)) {
				System.err.println("DocumentLengths is only applicable to FeatureSequence objects (use --keep-sequence when importing)");
				System.exit(1);
			}

			FeatureSequence words = (FeatureSequence) data;
			System.out.println(words.size());
		}
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy