All downloads are FREE. Search and download functionality uses the official Maven repository.

cc.mallet.util.FeatureCountTool Maven / Gradle / Ivy

Go to download

MALLET is a Java-based package for statistical natural language processing, document classification, clustering, topic modeling, information extraction, and other machine learning applications to text.

The newest version!
package cc.mallet.util;

import cc.mallet.types.*;
import gnu.trove.*;

import java.util.Formatter;
import java.util.Locale;
import java.util.logging.*;
import java.io.*;

import java.text.NumberFormat;

public class FeatureCountTool {

	/** Logger for this class, obtained from MalletLogger under the class's fully qualified name. */
	protected static Logger logger = MalletLogger.getLogger(FeatureCountTool.class.getName());

	/**
	 * Command-line option declared with owner FeatureCountTool.class and name "input".
	 * main() reads its value as the path of the instance list file to load.
	 * NOTE(review): the remaining constructor arguments (true, null, ..., null) are
	 * presumably "argument required", the default value and a long description --
	 * confirm against CommandOption.String.
	 */
	static cc.mallet.util.CommandOption.String inputFile = new cc.mallet.util.CommandOption.String
		(FeatureCountTool.class, "input", "FILENAME", true, null,
		 "Filename for the input instance list", null);
	
	// One slot per entry of the data alphabet (allocated in the constructor).
	// Presumably the accumulated count of each feature, filled in by count() -- TODO confirm.
	double[] featureCounts;
	// The instance list supplied to the constructor.
	InstanceList instances;
	// Size of the instance list's data alphabet at construction time.
	int numFeatures;
	// One slot per entry of the data alphabet (allocated in the constructor).
	// Presumably the number of instances in which each feature occurs -- TODO confirm.
	int[] documentFrequencies;

	public FeatureCountTool (InstanceList instances) {
		this.instances = instances;
		numFeatures = instances.getDataAlphabet().size();

		featureCounts = new double[numFeatures];
		documentFrequencies = new int[numFeatures];
	}

	/**
	 * Returns the per-feature count array held by this tool.
	 * The internal array itself is returned, not a copy.
	 *
	 * @return the array of feature counts, indexed by feature index
	 */
	public double[] getFeatureCounts() {
		return this.featureCounts;
	}

	/**
	 * Returns the per-feature document frequency array held by this tool.
	 * The internal array itself is returned, not a copy.
	 *
	 * @return the array of document frequencies, indexed by feature index
	 */
	public int[] getDocumentFrequencies() {
		return this.documentFrequencies;
	}

	public void count() {

		TIntIntHashMap docCounts = new TIntIntHashMap();
		
		int index = 0;

		if (instances.size() == 0) { 
			logger.info("Instance list is empty");
			return;
		}

		if (instances.get(0).getData() instanceof FeatureSequence) {

			for (Instance instance: instances) {
				FeatureSequence features = (FeatureSequence) instance.getData();
				
				for (int i=0; i= minCount && featureCounts[inputType] <= maxCount && documentFrequencies[inputType] >= minDocs && documentFrequencies[inputType] <= maxDocs) {
				outputAlphabet.lookupIndex(inputAlphabet.lookupObject(inputType));
			}
		}
		
		return outputAlphabet;
	}

	/**
	 * Command-line entry point: processes the command-line options, loads the
	 * instance list from the file named by the "input" option, counts its
	 * features and prints the counts.
	 *
	 * @param args command-line arguments, handed to CommandOption.process
	 * @throws IllegalArgumentException if no input file was specified
	 * @throws Exception if option processing, loading or counting fails
	 */
	public static void main (String[] args) throws Exception {
		CommandOption.setSummary (FeatureCountTool.class,
								  "Print feature counts and instances per feature (eg document frequencies) in an instance list");
		CommandOption.process (FeatureCountTool.class, args);

		// The option's default value is null; fail with an actionable message
		// instead of the bare NullPointerException that new File(null) throws.
		if (inputFile.value == null) {
			throw new IllegalArgumentException("No input file specified; use --input FILENAME");
		}

		InstanceList instances = InstanceList.load (new File(inputFile.value));
		FeatureCountTool counter = new FeatureCountTool(instances);
		counter.count();
		counter.printCounts();
	}


}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy