All Downloads are FREE. Search and download functionalities are using the official Maven repository.

cc.mallet.pipe.FeatureDocFreqPipe Maven / Gradle / Ivy

Go to download

MALLET is a Java-based package for statistical natural language processing, document classification, clustering, topic modeling, information extraction, and other machine learning applications to text.

The newest version!
package cc.mallet.pipe;

import cc.mallet.types.*;
import gnu.trove.*;

/** 
 *  Pruning low-count features can be a good way to save memory and computation.
 *   However, in order to use Vectors2Vectors, you need to write out the unpruned
 *   instance list, read it back into memory, collect statistics, create new 
 *   instances, and then write everything back out.
 * 

* This class supports a simpler method that makes two passes over the data: * one to collect statistics and create an augmented "stop list", and a * second to actually create instances. */ public class FeatureDocFreqPipe extends Pipe { FeatureCounter counter; int numInstances; public FeatureDocFreqPipe() { super(new Alphabet(), null); counter = new FeatureCounter(this.getDataAlphabet()); numInstances = 0; } public FeatureDocFreqPipe(Alphabet dataAlphabet, Alphabet targetAlphabet) { super(dataAlphabet, targetAlphabet); counter = new FeatureCounter(dataAlphabet); numInstances = 0; } public Instance pipe(Instance instance) { TIntIntHashMap localCounter = new TIntIntHashMap(); if (instance.getData() instanceof FeatureSequence) { FeatureSequence features = (FeatureSequence) instance.getData(); for (int position = 0; position < features.size(); position++) { localCounter.adjustOrPutValue(features.getIndexAtPosition(position), 1, 1); } } else { throw new IllegalArgumentException("Looking for a FeatureSequence, found a " + instance.getData().getClass()); } for (int feature: localCounter.keys()) { counter.increment(feature); } numInstances++; return instance; } /** * Add all pruned words to the internal stoplist of a SimpleTokenizer. * * @param docFrequencyCutoff Remove words that occur in greater than this proportion of documents. 0.05 corresponds to IDF >= 3. */ public void addPrunedWordsToStoplist(SimpleTokenizer tokenizer, double docFrequencyCutoff) { Alphabet currentAlphabet = getDataAlphabet(); for (int feature = 0; feature < currentAlphabet.size(); feature++) { if ((double) counter.get(feature) / numInstances > docFrequencyCutoff) { tokenizer.stop((String) currentAlphabet.lookupObject(feature)); } } } static final long serialVersionUID = 1; }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy