All downloads are free. The search and download functionality uses the official Maven repository.

cc.mallet.util.FeatureCooccurrenceCounter Maven / Gradle / Ivy

Go to download

MALLET is a Java-based package for statistical natural language processing, document classification, clustering, topic modeling, information extraction, and other machine learning applications to text.

The newest version!
package cc.mallet.util;

import cc.mallet.types.*;
import gnu.trove.*;

import java.util.Arrays;
import java.util.logging.*;
import java.text.NumberFormat;
import java.io.*;

public class FeatureCooccurrenceCounter {

	private static Logger logger = MalletLogger.getLogger(FeatureCooccurrenceCounter.class.getName());
	
	static CommandOption.String inputFile = new CommandOption.String
		(FeatureCooccurrenceCounter.class, "input", "FILENAME", true, null,
		  "The filename from which to read the list of training instances.  Use - for stdin.  " +
		 "The instances must be FeatureSequence or FeatureSequenceWithBigrams, not FeatureVector", null);
	
	static CommandOption.String weightsFile = new CommandOption.String
		(FeatureCooccurrenceCounter.class, "weights-filename", "FILENAME", true, null,
		 "The filename to write the word-word weights file.", null);
	
	static CommandOption.Double idfCutoff = new CommandOption.Double
		(FeatureCooccurrenceCounter.class, "idf-cutoff", "NUMBER", true, 3.0,
		 "Words with IDF below this threshold will not be linked to any other word.", null);
	
	static CommandOption.String unlinkedFile = new CommandOption.String
		(FeatureCooccurrenceCounter.class, "unlinked-filename", "FILENAME", true, null,
		 "A file to write words that were not linked.", null);


	TIntIntHashMap[] featureFeatureCounts;
	InstanceList instances;
	int numFeatures;
	int[] documentFrequencies;

	public FeatureCooccurrenceCounter (InstanceList instances) {
		this.instances = instances;
		numFeatures = instances.getDataAlphabet().size();

		featureFeatureCounts = new TIntIntHashMap[numFeatures];
		for (int feature = 0; feature < numFeatures; feature++) {
			featureFeatureCounts[feature] = new TIntIntHashMap();
		}

		documentFrequencies = new int[numFeatures];
	}

	public void count() {
		
		TIntIntHashMap featureCounts = new TIntIntHashMap();
		
		int index = 0;

		for (Instance instance: instances) {
			FeatureSequence features = (FeatureSequence) instance.getData();

			for (int i=0; i 0) {
				IDSorter[] sortedWeights = new IDSorter[keys.length];
				
				int i = 0;
				for (int key: keys) {
					double keyIDF = (logTotalDocs - logCache[documentFrequencies[key]]);

					if (keyIDF - idfCutoff.value > 0) {
						sortedWeights[i] =
							new IDSorter(key,
										 ((keyIDF - idfCutoff.value) / (featureIDF - idfCutoff.value)) *
										 ((double) featureCounts.get(key) / (documentFrequencies[feature]) ));
					}
					else { 
						sortedWeights[i] =
                            new IDSorter(key, 0);
					}
					i++;
				}
				
				Arrays.sort(sortedWeights);
				
				for (i = 0; i < 10; i++) {
					if (i >= sortedWeights.length) { break; }
					
					int key = sortedWeights[i].getID();
					
					Object word = alphabet.lookupObject(sortedWeights[i].getID());
					double weight = sortedWeights[i].getWeight();

					if (weight < 0.05) { break; }
					
					output.append("\t" + word + "\t" + weight);
				}
			}

			out.println(output);
		}
		
		out.close();
	}

	public static void main (String[] args) throws Exception {
		CommandOption.setSummary (FeatureCooccurrenceCounter.class,
								  "Build a file containing weights between word types");
		CommandOption.process (FeatureCooccurrenceCounter.class, args);

		InstanceList training = InstanceList.load (new File(inputFile.value));

		FeatureCooccurrenceCounter counter = new FeatureCooccurrenceCounter(training);
		counter.count();
		counter.printCounts();
	}


}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy