
cc.mallet.util.FeatureCountTool Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of jcore-mallet-2.0.9 Show documentation
Show all versions of jcore-mallet-2.0.9 Show documentation
MALLET is a Java-based package for statistical natural language processing, document classification, clustering, topic modeling, information extraction, and other machine learning applications to text.
The newest version!
package cc.mallet.util;
import cc.mallet.types.*;
import gnu.trove.*;
import java.util.Formatter;
import java.util.Locale;
import java.util.logging.*;
import java.io.*;
import java.text.NumberFormat;
public class FeatureCountTool {
// Class-wide logger, obtained from MalletLogger under this class's fully qualified name.
protected static Logger logger = MalletLogger.getLogger(FeatureCountTool.class.getName());
// Command-line option named "input" (argument label FILENAME): the instance list file that main() loads.
// NOTE(review): the fifth constructor argument is null here; presumably that is the default value,
// in which case inputFile.value stays null when the option is omitted. Confirm against CommandOption.
static cc.mallet.util.CommandOption.String inputFile = new cc.mallet.util.CommandOption.String
(FeatureCountTool.class, "input", "FILENAME", true, null,
"Filename for the input instance list", null);
// One slot per entry of the data alphabet; allocated (all zeros) in the constructor.
double[] featureCounts;
// The instance list handed to the constructor.
InstanceList instances;
// Size of the instance list's data alphabet, captured once at construction time.
int numFeatures;
// One slot per entry of the data alphabet; allocated (all zeros) in the constructor.
int[] documentFrequencies;
public FeatureCountTool (InstanceList instances) {
this.instances = instances;
numFeatures = instances.getDataAlphabet().size();
featureCounts = new double[numFeatures];
documentFrequencies = new int[numFeatures];
}
/**
 * Gives access to the per-feature count array.
 *
 * @return the internal array itself (not a copy), with one slot per feature
 */
public double[] getFeatureCounts() {
    return this.featureCounts;
}
/**
 * Gives access to the per-feature document-frequency array.
 *
 * @return the internal array itself (not a copy), with one slot per feature
 */
public int[] getDocumentFrequencies() {
    return this.documentFrequencies;
}
// Start of the counting pass over the instance list.
// NOTE(review): only the opening of this method survives in this copy; see the note further down.
public void count() {
// Scratch int-to-int map; its use is in the part of the method that is missing here.
TIntIntHashMap docCounts = new TIntIntHashMap();
int index = 0;
// Nothing to count: log and leave the arrays untouched.
if (instances.size() == 0) {
logger.info("Instance list is empty");
return;
}
// The data type is decided from the FIRST instance only; every instance is then cast
// to FeatureSequence without a further check.
if (instances.get(0).getData() instanceof FeatureSequence) {
for (Instance instance: instances) {
FeatureSequence features = (FeatureSequence) instance.getData();
// NOTE(review): the next line is corrupted. Everything between a less-than sign in the
// loop header and a later greater-than sign appears to have been dropped (it looks like
// markup stripping), so the header "for (int i=0; i" runs straight into the condition of
// what seems to be an unrelated filter over inputType. Lost in the gap: the rest of
// count(), the printCounts() method that main() calls, and the head of the method that
// declares inputAlphabet, outputAlphabet, inputType, minCount, maxCount, minDocs and
// maxDocs. This file will not compile as-is; restore the span from the upstream release
// rather than reconstructing it by hand.
for (int i=0; i= minCount && featureCounts[inputType] <= maxCount && documentFrequencies[inputType] >= minDocs && documentFrequencies[inputType] <= maxDocs) {
// Looks up the input alphabet's object for this feature in the output alphabet.
// NOTE(review): presumably lookupIndex adds the entry when absent; confirm against Alphabet.
outputAlphabet.lookupIndex(inputAlphabet.lookupObject(inputType));
}
}
return outputAlphabet;
}
/**
 * Command-line entry point: loads an instance list from the file named by the
 * "input" option, counts its features, and prints the counts.
 *
 * @param args command-line arguments, handed to CommandOption.process
 * @throws IllegalArgumentException if no input file was specified
 * @throws Exception if loading the instance list fails
 */
public static void main (String[] args) throws Exception {
    CommandOption.setSummary (FeatureCountTool.class,
        "Print feature counts and instances per feature (eg document frequencies) in an instance list");
    CommandOption.process (FeatureCountTool.class, args);

    // Without this guard a missing option reaches new File((String) null), which
    // fails with a bare NullPointerException that says nothing about the cause.
    if (inputFile.value == null) {
        throw new IllegalArgumentException(
            "No input file specified; use --input FILENAME to name the instance list");
    }

    InstanceList instances = InstanceList.load (new File(inputFile.value));
    FeatureCountTool counter = new FeatureCountTool(instances);
    counter.count();
    counter.printCounts();
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy