cc.mallet.fst.confidence.TransducerSequenceConfidenceEstimator Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of mallet Show documentation
Show all versions of mallet Show documentation
MALLET is a Java-based package for statistical natural language processing,
document classification, clustering, topic modeling, information extraction,
and other machine learning applications to text.
The newest version!
/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
http://www.cs.umass.edu/~mccallum/mallet
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */
/**
@author Aron Culotta [email protected]
*/
package cc.mallet.fst.confidence;
import java.util.logging.*;
import java.util.*;
import cc.mallet.fst.*;
import cc.mallet.pipe.iterator.*;
import cc.mallet.types.*;
import cc.mallet.util.MalletLogger;
/**
* Abstract class that estimates the confidence of a {@link Sequence}
* extracted by a {@link Transducer}. Note that this is different from
* {@link TransducerConfidenceEstimator}, which estimates the
* confidence for a single {@link Segment}.
*/
abstract public class TransducerSequenceConfidenceEstimator
{
	private static final Logger logger = MalletLogger.getLogger(TransducerSequenceConfidenceEstimator.class.getName());

	/** The trained Transducer which performed the extractions. */
	protected Transducer model;

	/**
	 * Creates an estimator bound to the given model.
	 *
	 * @param model the trained {@link Transducer} used to produce the
	 *              predicted output sequences
	 */
	public TransducerSequenceConfidenceEstimator (Transducer model) {
		this.model = model;
	}

	/**
	 * Calculates the confidence in the tagging of a {@link Sequence}.
	 *
	 * @param instance  the instance whose tagging is being scored
	 * @param startTags labels for the start states (B-) of all segments
	 * @param inTags    labels for the continue states (I-) of all segments
	 * @return the confidence estimate for the instance
	 */
	abstract public double estimateConfidenceFor (
		Instance instance, Object[] startTags, Object[] inTags);

	/**
	 * Ranks all {@link Sequence}s in this {@link InstanceList} by
	 * confidence estimate.
	 *
	 * For every instance the best output sequence under {@link #model} is
	 * computed, the confidence is obtained from
	 * {@link #estimateConfidenceFor}, and both are wrapped in an
	 * {@link InstanceWithConfidence}. Each confidence is logged at INFO level.
	 *
	 * @param ilist        list of segmentation instances
	 * @param startTags    represent the labels for the start states (B-)
	 *                     of all segments
	 * @param continueTags represent the labels for the continue state
	 *                     (I-) of all segments
	 * @return array of {@link InstanceWithConfidence}s sorted by their
	 *         natural ordering (as defined by
	 *         {@code InstanceWithConfidence.compareTo}; NOTE(review): the
	 *         direction of that ordering is not visible here -- confirm
	 *         against that class). The array has exactly one entry per
	 *         instance and is empty when {@code ilist} is empty.
	 */
	public InstanceWithConfidence[] rankInstancesByConfidence (InstanceList ilist,
	                                                           Object[] startTags,
	                                                           Object[] continueTags) {
		List<InstanceWithConfidence> scored = new ArrayList<InstanceWithConfidence> (ilist.size());
		for (int i = 0; i < ilist.size(); i++) {
			Instance instance = ilist.get (i);
			Sequence predicted = new MaxLatticeDefault (model, (Sequence) instance.getData()).bestOutputSequence();
			double confidence = estimateConfidenceFor (instance, startTags, continueTags);
			scored.add (new InstanceWithConfidence (instance, confidence, predicted));
			logger.info ("instance#"+i+" confidence="+confidence);
		}
		// Size the destination array exactly. Handing toArray a length-1 array
		// for an empty list would return that same array with a null at index 0,
		// so callers would receive [null] instead of an empty result.
		InstanceWithConfidence[] ranked = scored.toArray (new InstanceWithConfidence[scored.size()]);
		// Stable natural-order sort, same ordering Collections.sort applied to the list.
		Arrays.sort (ranked);
		return ranked;
	}
}