cc.mallet.fst.confidence.QBCSequenceConfidenceEstimator Maven / Gradle / Ivy

Go to download

Show more of this group Show more artifacts with this name
Show all versions of jcore-mallet-2.0.9 Show documentation

MALLET is a Java-based package for statistical natural language processing, document classification, clustering, topic modeling, information extraction, and other machine learning applications to text.

The newest version!

/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org.  For further
   information, see the file `LICENSE' included with this distribution. */

/** 
		@author Aron Culotta [email protected]
*/

package cc.mallet.fst.confidence;

import java.util.logging.*;
import java.util.*;

import cc.mallet.fst.*;
import cc.mallet.types.*;
import cc.mallet.util.MalletLogger;

/**
 * Estimates the confidence of an entire sequence by the "disagreement"
 * among a committee of CRFs.
 *
 * Every committee member labels the input sequence with its best output
 * sequence; the per-token vote entropy of those labelings is averaged
 * over the sequence and negated, so that unanimous committees yield the
 * highest confidence.
 */
public class QBCSequenceConfidenceEstimator extends TransducerSequenceConfidenceEstimator
{

	private static final Logger logger = MalletLogger.getLogger(
		QBCSequenceConfidenceEstimator.class.getName());

	/** Committee members whose predictions are compared against each other. */
	Transducer[] committee;

	/**
	 * @param model     transducer handed to the superclass constructor
	 * @param committee transducers whose best output sequences are compared
	 *                  to measure disagreement
	 */
	public QBCSequenceConfidenceEstimator (Transducer model, Transducer[] committee) {
		super(model);
		this.committee = committee;
	}

	/**
	 * Calculates the confidence in the tagging of a {@link Instance}.
	 *
	 * @param instance  instance whose data is the input {@link Sequence}
	 * @param startTags not used by this estimator
	 * @param inTags    not used by this estimator
	 * @return the negated average vote entropy of the committee, a value
	 *         in [-1, 0]; a higher value means higher confidence that the
	 *         sequence is correct
	 * @throws IllegalStateException if the committee is empty
	 */
	public double estimateConfidenceFor (Instance instance,
																			 Object[] startTags,
																			 Object[] inTags) {
		if (committee.length == 0)
			throw new IllegalStateException(
				"Cannot estimate committee disagreement with an empty committee");

		Sequence input = (Sequence) instance.getData();
		Sequence[] predictions = new Sequence[committee.length];
		for (int i = 0; i < committee.length; i++)
			predictions[i] = new MaxLatticeDefault (committee[i], input).bestOutputSequence();

		// Higher return value means higher confidence this sequence is correct.
		return -1.0 * avgVoteEntropy(predictions);
	}

	/**
	 * Calculates the "vote entropy" for each token and averages it over
	 * the sequence. Vote entropy as computed here is
	 *
	 * - \frac{1}{\log k} \sum_c \frac{V(c,e)}{k} \log \frac{V(c,e)}{k}
	 *
	 * where k is committee size, e is the token, c is class, and V(c,e)
	 * is the number of committee members assigning class c to token e.
	 * Labels are compared by their toString() value.
	 *
	 * @param predictions one predicted sequence per committee member;
	 *                    must contain at least one sequence
	 * @return the mean vote entropy, or 0.0 for an empty sequence (the
	 *         unguarded division would yield NaN)
	 */
	private double avgVoteEntropy (Sequence[] predictions) {
		int numTokens = predictions[0].size();
		if (numTokens == 0)
			return 0.0;

		double sum = 0.0;
		for (int i = 0; i < numTokens; i++) {
			Map<String, Integer> label2Count = new HashMap<String, Integer>();
			for (int j = 0; j < predictions.length; j++) {
				String label = predictions[j].get(i).toString();
				Integer count = label2Count.get(label);
				label2Count.put(label, count == null ? 1 : count + 1);
			}
			sum += voteEntropy(label2Count, predictions.length);
		}
		return sum / numTokens;
	}

	/**
	 * Computes the normalized vote entropy of a single token.
	 *
	 * @param label2Count number of votes received by each label
	 * @param numVotes    total number of votes cast (the committee size)
	 * @return entropy of the vote distribution divided by log(numVotes);
	 *         0.0 when fewer than two votes were cast, because a single
	 *         voter cannot disagree and log(1) == 0 would otherwise turn
	 *         the result into NaN
	 */
	private double voteEntropy (Map<String, Integer> label2Count, int numVotes) {
		if (numVotes < 2)
			return 0.0;

		double sum = 0.0;
		for (int count : label2Count.values()) {
			double quot = (double) count / numVotes;
			sum += quot * Math.log(quot);
		}
		return -1.0 * sum / Math.log((double) numVotes);
	}
}