
com.aliasi.chunk.RescoringChunker Maven / Gradle / Ivy
Show all versions of aliasi-lingpipe Show documentation
/*
* LingPipe v. 4.1.0
* Copyright (C) 2003-2011 Alias-i
*
* This program is licensed under the Alias-i Royalty Free License
* Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Alias-i
* Royalty Free License Version 1 for more details.
*
* You should have received a copy of the Alias-i Royalty Free License
* Version 1 along with this program; if not, visit
* http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact
* Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211,
* +1 (718) 290-9170.
*/
package com.aliasi.chunk;
import com.aliasi.util.BoundedPriorityQueue;
import com.aliasi.util.ScoredObject;
import com.aliasi.util.Strings;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
/**
* A RescoringChunker
provides first best, n-best and
* confidence chunking by rescoring n-best chunkings derived from a
* contained chunker.
*
* Concrete subclasses must implement the abstract method {@link
* #rescore(Chunking)}, which provides a score for a chunking. There
* are no restrictions on how this score is computed; most typically,
* it will be a longer-distance/higher-order model than the contained
* chunker and provide more accurate results.
*
*
The n-best chunker works by generating the top analyses from the
* contained chunker. The number of such analyses considered is
* determined in the constructor for this class. These are then
* placed in a bounded priority queue with the bound determined by the
* maximum specified in the call to {@link
* #nBest(char[],int,int,int)}.
*
The first-best chunker methods {@link #chunk(CharSequence)} and
* {@link #chunk(char[],int,int)} operate by choosing the top scoring
* chunking from the rescoring of the contained chunker. The number
* of chunkings from the contained chunker that are rescored is
* determined in the constructor. This is more memory and time
* efficient than running the n-best chunking.
*
*
N-Best Chunks
*
* The {@link #nBestChunks(char[],int,int,int)} method is implemented
* by walking over the n-best analyses generated by {@link
* #nBest(char[],int,int,int)} with a maximum n-best for full analyses
* set to the value of {@link #numChunkingsRescored()}, which may be
* changed using {@link #setNumChunkingsRescored(int)}. For each
* analysis, the chunks are pulled out and their weight is incremented
* by the n-best analysis weight. Normalization is carried out by
* dividing by the total probability mass in the returned n-best list.
*
* Caching
*
* There is no caching in the rescoring chunker per se. Any caching
* needs to be carried out in the contained n-best chunker, which
* is available as the return result of {@link #baseChunker()}.
*
* @author Bob Carpenter
* @version 3.8
* @since LingPipe2.3
* @param the type of the underlying n-best chunker
*/
public abstract class RescoringChunker
implements NBestChunker, ConfidenceChunker {
final B mChunker;
int mNumChunkingsRescored;
/**
* Construct a rescoring chunker that contains the specified base
* chunker and considers the specified number of chunkings for
* rescoring.
*
* @param chunker Base n-best chunker.
* @param numChunkingsRescored Number of chunkings generated
* by the base chunker to rescore.
*/
public RescoringChunker(B chunker, int numChunkingsRescored) {
mChunker = chunker;
mNumChunkingsRescored = numChunkingsRescored;
}
/**
* Returns the score for a chunking. This method is used to
* rescore the chunkings returned by the base chunker to order
* them for n-best or first-best return by this chunker. Although
* the base chunker's score is ignored, it may be incorporated
* in a subclass's implementation of this method.
*
* The rescoring should be in the form of log (base 2) joint
* probability estimate for the specified chunking. For the
* simple whole-analysis rescoring method {@link
* #nBest(char[],int,int,int)}, this is not checked, and any
* values may be used in practice. For the n-best chunk method
* {@link #nBestChunks(char[],int,int,int)}, the scores are
* treated as log probabilities, but renormalized in order to
* compute conditional chunk probability estimates.
*
* @param chunking Chunking to rescore.
* @return The new score for this chunking.
*/
public abstract double rescore(Chunking chunking);
/**
* The base chunker that generates hypotheses to rescore. Note
* that this is the actual chunker used by this class, so any
* changes to it will affect this class's behavior. Common changes
* involve setting the underlying chunker's configuration.
*
* @return The base chunker.
*/
public B baseChunker() {
return mChunker;
}
/**
* Return the number of chunkings to generate from the base
* chunker for rescoring.
*
* @return The number of base chunkings to rescore.
*/
public int numChunkingsRescored() {
return mNumChunkingsRescored;
}
/**
* Set the number of base chunkings to rescore. This value will
* be used in every chunking method to determine the underlying
* number of chunkings considered.
*
* @param numChunkingsRescored Number of base chunkings to
* rescore.
*/
public void setNumChunkingsRescored(int numChunkingsRescored) {
mNumChunkingsRescored = numChunkingsRescored;
}
/**
* Returns the first-best chunking for the specified character
* sequence. See the class documentation above for implementation
* details.
*
* @param cSeq Character sequence to chunk.
* @return First-best chunking of the specified character sequence.
*/
public Chunking chunk(CharSequence cSeq) {
char[] cs = Strings.toCharArray(cSeq);
return chunk(cs,0,cs.length);
}
/**
* Returns the first-best chunking for the specified character
* slice. See the class documentation above for implementation
* details.
*
* @param cs Underlying character array.
* @param start Index of first character to analyze.
* @param end Index of one past the last character to analyze.
* @return First-best chunking of the specified character slice.
*/
public Chunking chunk(char[] cs, int start, int end) {
return firstBest(mChunker.nBest(cs,start,end,mNumChunkingsRescored));
}
/**
* Returns the n-best chunkings of the specified character slice.
* See the class documentation above for implementation details.
*
* @param cs Underlying character array.
* @param start Index of first character to analyze.
* @param end Index of one past the last character to analyze.
* @return Iterator over the n-best chunkings of the specified
* character slice.
*/
public Iterator> nBest(char[] cs, int start, int end,
int maxNBest) {
return nBest(mChunker.nBest(cs,start,end,
mNumChunkingsRescored),
maxNBest);
}
/**
* Returns the n-best chunks for the specified character slice up to
* the specified maximum number of chunks.
*
* See the class documentation above for implementation details.
*
* @param cs Underlying characters.
* @param start Index of first character in slice.
* @param end Index of one past last character in slice.
* @param maxNBest Maximum number of chunks to return.
*/
public Iterator nBestChunks(char[] cs, int start, int end,
int maxNBest) {
double totalScore = 0.0;
Map chunkToScore = new HashMap();
Iterator> it = nBest(cs,start,end,mNumChunkingsRescored);
while (it.hasNext()) {
ScoredObject so = it.next();
double score = java.lang.Math.pow(2.0,so.score());
totalScore += score;
Chunking chunking = so.getObject();
for (Chunk chunk : chunking.chunkSet()) {
Chunk unscoredChunk
= ChunkFactory.createChunk(chunk.start(), chunk.end(),
chunk.type());
Double currentScoreD = chunkToScore.get(chunk);
double currentScore = currentScoreD == null
? 0.0
: currentScoreD.doubleValue();
double nextScore = currentScore + score;
chunkToScore.put(unscoredChunk,Double.valueOf(nextScore));
}
}
BoundedPriorityQueue bpq
= new BoundedPriorityQueue(ScoredObject.comparator(),
maxNBest);
for (Map.Entry entry : chunkToScore.entrySet()) {
Chunk chunk = entry.getKey();
double conditionalEstimate = entry.getValue().doubleValue()
/ totalScore;
Chunk scored = ChunkFactory.createChunk(chunk.start(),
chunk.end(),
chunk.type(),
conditionalEstimate);
bpq.offer(scored);
}
return bpq.iterator();
}
private Chunking firstBest(Iterator> nBestChunkingIt) {
Chunking bestChunking = null;
double bestScore = Double.NEGATIVE_INFINITY;
while (nBestChunkingIt.hasNext()) {
ScoredObject scoredChunking = nBestChunkingIt.next();
Chunking chunking = scoredChunking.getObject();
double score = rescore(chunking);
if (score > bestScore) {
bestScore = score;
bestChunking = chunking;
}
}
return bestChunking;
}
private Iterator>
nBest(Iterator> nBestChunkingIt,
int maxNBest) {
BoundedPriorityQueue> queue
= new BoundedPriorityQueue>(ScoredObject.comparator(),
maxNBest);
while (nBestChunkingIt.hasNext()) {
ScoredObject scoredChunking
= nBestChunkingIt.next();
Chunking chunking = scoredChunking.getObject();
double score = rescore(chunking);
queue.offer(new ScoredObject(chunking,score));
}
return queue.iterator();
}
}