
cc.mallet.util.FeatureCooccurrenceCounter Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of jcore-mallet-2.0.9 Show documentation
MALLET is a Java-based package for statistical natural language processing, document classification, clustering, topic modeling, information extraction, and other machine learning applications to text.
The newest version!
package cc.mallet.util;
import cc.mallet.types.*;
import gnu.trove.*;
import java.util.Arrays;
import java.util.logging.*;
import java.text.NumberFormat;
import java.io.*;
public class FeatureCooccurrenceCounter {
// Class-wide logger, routed through MALLET's logging wrapper.
private static Logger logger = MalletLogger.getLogger(FeatureCooccurrenceCounter.class.getName());
// --input: serialized InstanceList of training data; instances must carry
// FeatureSequence (or FeatureSequenceWithBigrams) data, not FeatureVector.
static CommandOption.String inputFile = new CommandOption.String
(FeatureCooccurrenceCounter.class, "input", "FILENAME", true, null,
"The filename from which to read the list of training instances. Use - for stdin. " +
"The instances must be FeatureSequence or FeatureSequenceWithBigrams, not FeatureVector", null);
// --weights-filename: output file for the word-word weight table.
static CommandOption.String weightsFile = new CommandOption.String
(FeatureCooccurrenceCounter.class, "weights-filename", "FILENAME", true, null,
"The filename to write the word-word weights file.", null);
// --idf-cutoff: IDF threshold (default 3.0) below which a word is not linked.
static CommandOption.Double idfCutoff = new CommandOption.Double
(FeatureCooccurrenceCounter.class, "idf-cutoff", "NUMBER", true, 3.0,
"Words with IDF below this threshold will not be linked to any other word.", null);
// --unlinked-filename: optional output listing words that received no links.
static CommandOption.String unlinkedFile = new CommandOption.String
(FeatureCooccurrenceCounter.class, "unlinked-filename", "FILENAME", true, null,
"A file to write words that were not linked.", null);
// featureFeatureCounts[f] maps a co-occurring feature id to its count with feature f.
TIntIntHashMap[] featureFeatureCounts;
// The training corpus whose data alphabet defines the feature space.
InstanceList instances;
// Size of the data alphabet; indexes featureFeatureCounts and documentFrequencies.
int numFeatures;
// documentFrequencies[f] = number of documents containing feature f
// (filled in by count() — NOTE(review): that method's body is corrupted in this copy).
int[] documentFrequencies;
/**
 * Creates a counter over the given training instances.
 *
 * Sizes all per-feature tables from the instances' data alphabet and
 * allocates one empty co-occurrence map per feature.
 *
 * @param instances training corpus; its data alphabet defines the feature space
 */
public FeatureCooccurrenceCounter (InstanceList instances) {
	this.instances = instances;
	this.numFeatures = instances.getDataAlphabet().size();
	this.documentFrequencies = new int[numFeatures];
	this.featureFeatureCounts = new TIntIntHashMap[numFeatures];
	// Pre-create every map so count() never has to null-check.
	for (int f = numFeatures - 1; f >= 0; f--) {
		featureFeatureCounts[f] = new TIntIntHashMap();
	}
}
// Tallies document frequencies and pairwise feature co-occurrence counts
// over the corpus.
// NOTE(review): this region is corrupted — the HTML-to-text extraction
// stripped '<' comparisons and everything between them and the next '>' in
// several places. The line "for (int i=0; i 0) {" below is a fusion point
// where the rest of count(), the whole header and early body of
// printCounts() (including declarations of logTotalDocs, logCache,
// featureIDF, keys, alphabet, out, output, and the per-feature loop) were
// lost. This block does NOT compile as-is; recover it from the upstream
// MALLET 2.0.9 source rather than attempting to reconstruct it here.
public void count() {
// Per-document feature counts, reused across the corpus scan.
TIntIntHashMap featureCounts = new TIntIntHashMap();
int index = 0;
for (Instance instance: instances) {
FeatureSequence features = (FeatureSequence) instance.getData();
// CORRUPTED: loop condition and a large span of code are missing here.
for (int i=0; i 0) {
// From here on we are inside what was printCounts(): rank each linked
// word by an IDF-scaled co-occurrence weight.
IDSorter[] sortedWeights = new IDSorter[keys.length];
int i = 0;
for (int key: keys) {
// keyIDF = log(totalDocs) - log(df(key)), via a precomputed log table.
double keyIDF = (logTotalDocs - logCache[documentFrequencies[key]]);
if (keyIDF - idfCutoff.value > 0) {
// Weight combines the key's IDF margin over the cutoff (relative to the
// current feature's margin) with the co-occurrence rate
// count(feature,key) / df(feature).
sortedWeights[i] =
new IDSorter(key,
((keyIDF - idfCutoff.value) / (featureIDF - idfCutoff.value)) *
((double) featureCounts.get(key) / (documentFrequencies[feature]) ));
}
else {
// Below the IDF cutoff: keep the slot but give it zero weight.
sortedWeights[i] =
new IDSorter(key, 0);
}
i++;
}
Arrays.sort(sortedWeights);
// Emit at most the top 10 neighbors, stopping early below weight 0.05.
for (i = 0; i < 10; i++) {
if (i >= sortedWeights.length) { break; }
int key = sortedWeights[i].getID();
Object word = alphabet.lookupObject(sortedWeights[i].getID());
double weight = sortedWeights[i].getWeight();
if (weight < 0.05) { break; }
output.append("\t" + word + "\t" + weight);
}
}
// One output line per feature: the word followed by its weighted neighbors.
out.println(output);
}
out.close();
}
/**
 * Command-line entry point: reads a serialized InstanceList (--input),
 * counts feature co-occurrences, and writes the word-word weight table.
 *
 * @param args command-line options; see the CommandOption declarations above
 * @throws Exception if loading the instance list or writing output fails
 */
public static void main (String[] args) throws Exception {
	CommandOption.setSummary (FeatureCooccurrenceCounter.class,
							  "Build a file containing weights between word types");
	CommandOption.process (FeatureCooccurrenceCounter.class, args);

	// Deserialize the training corpus named by --input.
	InstanceList trainingData = InstanceList.load (new File(inputFile.value));

	FeatureCooccurrenceCounter cooccurrenceCounter = new FeatureCooccurrenceCounter(trainingData);
	cooccurrenceCounter.count();
	cooccurrenceCounter.printCounts();
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy