cc.mallet.util.FeatureCountTool Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of mallet Show documentation
Show all versions of mallet Show documentation
MALLET is a Java-based package for statistical natural language processing,
document classification, clustering, topic modeling, information extraction,
and other machine learning applications to text.
The newest version!
package cc.mallet.util;
import cc.mallet.types.*;
import gnu.trove.*;
import java.util.Formatter;
import java.util.Locale;
import java.util.logging.*;
import java.io.*;
import java.text.NumberFormat;
public class FeatureCountTool {
/** Logger for this tool, obtained from MalletLogger under this class's name. */
protected static Logger logger =
    MalletLogger.getLogger(FeatureCountTool.class.getName());

/**
 * Command-line option named "input". Its {@code value} is used by
 * {@code main} as the path of the instance list file to load.
 */
static cc.mallet.util.CommandOption.String inputFile =
    new cc.mallet.util.CommandOption.String(
        FeatureCountTool.class,
        "input",
        "FILENAME",
        true,
        null,
        "Filename for the input instance list",
        null);

/** The instance list this tool was constructed with. */
InstanceList instances;

/** Size of the instance list's data alphabet, captured at construction time. */
int numFeatures;

/** Per-feature counts, indexed by feature index; length is {@code numFeatures}. */
double[] featureCounts;

/**
 * Per-feature instance ("document") frequencies, indexed by feature index;
 * length is {@code numFeatures}.
 */
int[] documentFrequencies;
public FeatureCountTool (InstanceList instances) {
this.instances = instances;
numFeatures = instances.getDataAlphabet().size();
featureCounts = new double[numFeatures];
documentFrequencies = new int[numFeatures];
}
/**
 * Returns the per-feature count array, indexed by feature index.
 *
 * <p>This is the tool's own backing array rather than a copy, so changes made
 * by the caller are visible to this object and vice versa.
 *
 * @return the internal feature-count array
 */
public double[] getFeatureCounts() {
    return this.featureCounts;
}
/**
 * Returns the per-feature document-frequency array, indexed by feature index.
 *
 * <p>This is the tool's own backing array rather than a copy, so changes made
 * by the caller are visible to this object and vice versa.
 *
 * @return the internal document-frequency array
 */
public int[] getDocumentFrequencies() {
    return this.documentFrequencies;
}
public void count() {
TIntIntHashMap docCounts = new TIntIntHashMap();
int index = 0;
if (instances.size() == 0) {
logger.info("Instance list is empty");
return;
}
if (instances.get(0).getData() instanceof FeatureSequence) {
for (Instance instance: instances) {
FeatureSequence features = (FeatureSequence) instance.getData();
for (int i=0; i= minCount && featureCounts[inputType] <= maxCount && documentFrequencies[inputType] >= minDocs && documentFrequencies[inputType] <= maxDocs) {
outputAlphabet.lookupIndex(inputAlphabet.lookupObject(inputType));
}
}
return outputAlphabet;
}
/**
 * Command-line entry point.
 *
 * <p>Registers the tool's summary text, processes the command-line options,
 * loads the instance list from the file named by the "input" option, then
 * runs the count and prints the results.
 *
 * @param args command-line arguments, handed to {@code CommandOption.process}
 * @throws Exception propagated unchanged from option processing, loading,
 *         counting or printing
 */
public static void main (String[] args) throws Exception {
    CommandOption.setSummary (
        FeatureCountTool.class,
        "Print feature counts and instances per feature (eg document frequencies) in an instance list");
    CommandOption.process (FeatureCountTool.class, args);

    File instanceFile = new File(inputFile.value);
    FeatureCountTool tool = new FeatureCountTool(InstanceList.load (instanceFile));

    tool.count();
    tool.printCounts();
}
}