cc.mallet.fst.confidence.IsolatedSegmentTransducerCorrector Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of mallet Show documentation
Show all versions of mallet Show documentation
MALLET is a Java-based package for statistical natural language processing,
document classification, clustering, topic modeling, information extraction,
and other machine learning applications to text.
The newest version!
/* Copyright (C) 2002 Univ. of Massachusetts Amherst, Computer Science Dept.
This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
http://www.cs.umass.edu/~mccallum/mallet
This software is provided under the terms of the Common Public License,
version 1.0, as published by http://www.opensource.org. For further
information, see the file `LICENSE' included with this distribution. */
/**
@author Aron Culotta [email protected]
*/
package cc.mallet.fst.confidence;
import java.util.ArrayList;
import java.util.logging.*;
import cc.mallet.fst.*;
import cc.mallet.types.*;
import cc.mallet.util.MalletLogger;
/**
* Corrects a subset of the {@link Segment}s produced by a {@link
* Transducer}. It's most useful to find the {@link Segment}s that the
* {@link Transducer} is least confident in and correct those using
* the true {@link Labeling}
* (<code>correctLeastConfidentSegments</code>). Unlike in {@link
* ConstrainedViterbi}, the corrected segment does not affect the
* labeling of other segments in the sequence. This class is provided
* for comparison with that approach.
*/
public class IsolatedSegmentTransducerCorrector implements TransducerCorrector
{
	private static final Logger logger = MalletLogger.getLogger(IsolatedSegmentTransducerCorrector.class.getName());

	/** Ranks the segments of an instance so that the least confident one can be selected. */
	TransducerConfidenceEstimator confidenceEstimator;

	/** Transducer supplied at construction; stored here but not read by the code in this class. */
	Transducer model;

	/**
	 * Creates a corrector that uses the given estimator to rank segments.
	 *
	 * @param confidenceEstimator estimator used to order the segments of each instance
	 * @param model the transducer associated with this corrector
	 */
	public IsolatedSegmentTransducerCorrector (TransducerConfidenceEstimator confidenceEstimator,
	                                           Transducer model) {
		this.confidenceEstimator = confidenceEstimator;
		this.model = model;
	}

	/**
	 * Creates a corrector whose estimator is a
	 * {@link ConstrainedForwardBackwardConfidenceEstimator} built from <code>model</code>.
	 *
	 * @param model the transducer associated with this corrector
	 */
	public IsolatedSegmentTransducerCorrector (Transducer model) {
		this (new ConstrainedForwardBackwardConfidenceEstimator (model), model);
	}

	/**
	 * For each instance, takes the first segment returned by the confidence
	 * estimator's ranking and overwrites the predicted tags inside that segment
	 * with the true tags. Tags outside the segment are copied from the prediction
	 * unchanged.
	 *
	 * @param ilist original Transducer InstanceList
	 * @param startTags start segment tags (B-)
	 * @param continueTags continue segment tags (I-)
	 * @return a list with one entry per Instance in <code>ilist</code>, in the same
	 * order. Each entry is the corrected tagging of that Instance as a
	 * {@link Sequence}, or <code>null</code> when no correction was produced (for
	 * example when the estimator returned no segments for the Instance). Note that
	 * these corrections will not affect tokens outside of the corrected segment.
	 */
	public ArrayList correctLeastConfidentSegments (InstanceList ilist, Object[] startTags,
	                                                Object[] continueTags) {
		ArrayList correctedPredictionList = new ArrayList ();
		// Only build the per-segment / per-token debug strings when they will be emitted.
		final boolean verbose = logger.isLoggable (Level.FINE);

		for (int i=0; i < ilist.size(); i++) {
			logger.fine ("correcting instance# " + i + " / " + ilist.size());
			Instance instance = ilist.get (i);
			Segment[] orderedSegments =
				confidenceEstimator.rankSegmentsByConfidence (instance, startTags, continueTags);

			// An instance without any ranked segment has nothing to correct. Keep the
			// result list aligned with ilist by adding the same null placeholder that
			// is used below for "no correction", instead of failing on orderedSegments[0].
			if (orderedSegments == null || orderedSegments.length == 0) {
				logger.fine ("no segments found for instance# " + i + "; nothing to correct");
				correctedPredictionList.add (null);
				continue;
			}

			// The segment at index 0 of the ranking is the one that gets corrected.
			Segment leastConfidentSegment = orderedSegments[0];
			if (verbose) {
				logger.fine ("Ordered Segments:\nTrue sequence: " + leastConfidentSegment.getTruth());
				for (int j=0; j < orderedSegments.length; j++) {
					logger.fine (orderedSegments[j].toString());
				}
			}

			// _do not_ run constrained viterbi on this sequence with the
			// constraint that this segment is tagged correctly.
			// instead, simply replace the labeling of the corrected
			// segment.
			MultiSegmentationEvaluator eval = new MultiSegmentationEvaluator (new InstanceList[0], new String[0], startTags, continueTags);
			Sequence truth = leastConfidentSegment.getTruth();
			Sequence predicted = leastConfidentSegment.getPredicted();
			int numIncorrect = eval.numIncorrectSegments (truth, predicted);

			// Segment bounds are inclusive on both ends; read them once, not per token.
			int segmentStart = leastConfidentSegment.getStart();
			int segmentEnd = leastConfidentSegment.getEnd();
			String[] sequence = new String[truth.size()];
			for (int j=0; j < sequence.length; j++) {
				boolean insideSegment = j >= segmentStart && j <= segmentEnd;
				sequence[j] = (String) (insideSegment ? truth.get (j) : predicted.get (j));
			}
			ArraySequence segmentCorrectedOutput = new ArraySequence (sequence);

			if (verbose) {
				logger.fine ("Original prediction: ");
				for (int j=0; j < predicted.size(); j++)
					logger.fine ((String)predicted.get (j) + "\t");
				logger.fine ("\nCorrected prediction: ");
				for (int j=0; j < segmentCorrectedOutput.size(); j++)
					logger.fine ((String)segmentCorrectedOutput.get (j) + "\t");
				logger.fine ("");
			}

			// NOTE(review): a segment count is presumably never negative, which would
			// make the null branch unreachable -- confirm whether "> 0" was intended.
			if (numIncorrect > -1)
				correctedPredictionList.add (segmentCorrectedOutput);
			else
				correctedPredictionList.add (null);
		}
		return correctedPredictionList;
	}
}