cc.mallet.pipe.FeatureDocFreqPipe Maven / Gradle / Ivy

Go to download

Show more of this group Show more artifacts with this name
Show all versions of jcore-mallet-2.0.9 Show documentation

MALLET is a Java-based package for statistical natural language processing, document classification, clustering, topic modeling, information extraction, and other machine learning applications to text.

The newest version!

package cc.mallet.pipe;

import cc.mallet.types.*;
import gnu.trove.*;

/** 
 *  Pruning low-count features can be a good way to save memory and computation.
 *   However, in order to use Vectors2Vectors, you need to write out the unpruned
 *   instance list, read it back into memory, collect statistics, create new 
 *   instances, and then write everything back out.
 * 
 *  This class supports a simpler method that makes two passes over the data:
 *   one to collect statistics and create an augmented "stop list", and a
 *   second to actually create instances.
 * 
 *  The statistic collected by this pipe is document frequency: each distinct
 *   feature of an instance is counted once for that instance, so features that
 *   occur in too large a proportion of the instances can be added to the stop list.
 */

public class FeatureDocFreqPipe extends Pipe {

	/** Per-feature tally; incremented once per instance for each distinct feature index seen in it. */
	FeatureCounter counter;

	/** Number of instances that have successfully passed through {@link #pipe(Instance)}. */
	int numInstances;

	/**
	 *  Create a pipe with a fresh data alphabet and no target alphabet.
	 */
	public FeatureDocFreqPipe() {
		super(new Alphabet(), null);

		counter = new FeatureCounter(this.getDataAlphabet());
		numInstances = 0;
	}

	/**
	 *  Create a pipe that counts features against existing alphabets.
	 *
	 * @param dataAlphabet Alphabet handed to the superclass and to the feature counter.
	 * @param targetAlphabet Alphabet handed to the superclass as the target alphabet.
	 */
	public FeatureDocFreqPipe(Alphabet dataAlphabet, Alphabet targetAlphabet) {
		super(dataAlphabet, targetAlphabet);

		counter = new FeatureCounter(dataAlphabet);
		numInstances = 0;
	}

	/**
	 *  Record which features occur in this instance and pass the instance through unchanged.
	 *   Each distinct feature index is counted once per instance, no matter how many
	 *   times it appears in the sequence.
	 *
	 * @param instance Instance whose data must be a FeatureSequence.
	 * @return The same instance, unmodified.
	 * @throws IllegalArgumentException if the instance data is null or not a FeatureSequence.
	 */
	public Instance pipe(Instance instance) {

		Object data = instance.getData();

		if (!(data instanceof FeatureSequence)) {
			// instanceof is false for null, so null data lands here too. Describe it
			//  explicitly: calling getClass() on it would raise a NullPointerException
			//  and mask the real problem.
			throw new IllegalArgumentException("Looking for a FeatureSequence, found a " +
											   (data == null ? "null" : data.getClass()));
		}

		FeatureSequence features = (FeatureSequence) data;

		// Only the key set matters here: it collapses repeated occurrences of a
		//  feature within this instance into a single entry.
		TIntIntHashMap localCounter = new TIntIntHashMap();

		for (int position = 0; position < features.size(); position++) {
			localCounter.adjustOrPutValue(features.getIndexAtPosition(position), 1, 1);
		}

		for (int feature: localCounter.keys()) {
			counter.increment(feature);
		}

		numInstances++;

		return instance;
	}

	/** 
	 *  Add all pruned words to the internal stoplist of a SimpleTokenizer.
	 *   Does nothing if no instances have been piped yet, since no document
	 *   proportion can be computed.
	 * 
	 * @param tokenizer Tokenizer whose stoplist receives the pruned words.
	 * @param docFrequencyCutoff Remove words that occur in greater than this proportion of documents.
	 *   A cutoff of 0.05 keeps only words whose IDF (natural log) is roughly 3 or more.
	 */
	public void addPrunedWordsToStoplist(SimpleTokenizer tokenizer, double docFrequencyCutoff) {
		if (numInstances == 0) {
			// Avoid dividing by zero: without any documents there is nothing to prune.
			return;
		}

		Alphabet currentAlphabet = getDataAlphabet();

		for (int feature = 0; feature < currentAlphabet.size(); feature++) {
			double docProportion = (double) counter.get(feature) / numInstances;

			if (docProportion > docFrequencyCutoff) {
				tokenizer.stop((String) currentAlphabet.lookupObject(feature));
			}
		}
	}

	static final long serialVersionUID = 1;

}