// com.aliasi.chunk.ChunkingEvaluation Maven / Gradle / Ivy
// Show all versions of aliasi-lingpipe Show documentation
/*
* LingPipe v. 4.1.0
* Copyright (C) 2003-2011 Alias-i
*
* This program is licensed under the Alias-i Royalty Free License
* Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the Alias-i
* Royalty Free License Version 1 for more details.
*
* You should have received a copy of the Alias-i Royalty Free License
* Version 1 along with this program; if not, visit
* http://alias-i.com/lingpipe/licenses/lingpipe-license-1.txt or contact
* Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211,
* +1 (718) 290-9170.
*/
package com.aliasi.chunk;
import com.aliasi.classify.PrecisionRecallEvaluation;
import com.aliasi.util.Strings;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
/**
* A <code>ChunkingEvaluation</code> stores and reports the results of
* evaluating response chunkings against reference chunkings. Cases
* to evaluate are supplied in the form of a reference and response
* chunking through the method {@link #addCase(Chunking,Chunking)}.
*
* The sets of true positive,
* false positive and false negative chunks are available through the
* methods {@link #truePositiveSet()}, {@link #falsePositiveSet()},
* and {@link #falseNegativeSet()}. True positives are chunks that
* are in both the reference and response, false positives are chunks
* in the response but not the reference, and false negatives are in
* the reference, but not the response. There is no notion of true
* negative in this task, a fact that is reflected in the results of
* the precision-recall evaluation.
*
* <p>The main method of reporting is through an instance of {@link
* com.aliasi.classify.PrecisionRecallEvaluation} returned by
* the method {@link #precisionRecallEvaluation()}. The return result
* provides an object capable of extensive reporting for
* classification tasks such as chunking. The instances of true and
* false positive and negatives are described above; chunks are
* compared by start offset, end offset and type only, so any scores
* attached to the chunks play no role in the evaluation.
*
* <p>This evaluator works solely on the basis of chunk offset and
* exact match. There is no notion of alignment or mapping, as found,
* for example, in the MUC Scoring Software User's Manual, and its
* descendants such as the 2005 ACE Evaluation Plan. In this regard,
* we follow the model of the CoNLL 2000 Chunking Task.
*
* <p>This evaluation is able to handle overlapping chunks with
* results being reported in the same manner. In particular, the
* labeled precision and recall components of the approach that later
* became known as PARSEVAL can be generated by using the
* <code>ChunkingEvaluation</code> class.
*
* @author Bob Carpenter
* @version 3.8
* @since LingPipe2.1
*/
public class ChunkingEvaluation {

    // Each case is a two-element array: { reference chunking, response chunking }.
    private final Set<Chunking[]> mCases
        = new HashSet<Chunking[]>();
    private final Set<ChunkAndCharSeq> mTruePositiveSet
        = new HashSet<ChunkAndCharSeq>();
    private final Set<ChunkAndCharSeq> mFalsePositiveSet
        = new HashSet<ChunkAndCharSeq>();
    private final Set<ChunkAndCharSeq> mFalseNegativeSet
        = new HashSet<ChunkAndCharSeq>();

    // Textual rendering of the most recently added case, overwritten by
    // every call to addCase; null until the first case is added.
    // Package-private so that other classes in this package can read it.
    String mLastCase = null;

    /**
     * Construct a chunking evaluation.
     */
    public ChunkingEvaluation() {
        /* do nothing */
    }

    /**
     * Return the set of cases consisting of pairs of reference and
     * response chunkings.  The elements of the set returned are of
     * type <code>Chunking[]</code>, with the first element being the
     * reference chunking and the second element being the response
     * chunking.
     *
     * <p>The set returned is an unmodifiable view of the underlying
     * set of cases and will change as cases are added to this
     * evaluation.
     *
     * @return The set of cases.
     */
    public Set<Chunking[]> cases() {
        return Collections.unmodifiableSet(mCases);
    }

    /**
     * Returns a chunking evaluation which consists of the current
     * chunking evaluation restricted to the specified type.  A
     * new evaluation is constructed and populated with the same
     * cases as this evaluation, but with the reference and response
     * chunkings both restricted to only include answers of the
     * specified type.
     *
     * @param chunkType Type of chunk to be evaluated.
     * @return ChunkingEvaluation Evaluation for this type.
     */
    public ChunkingEvaluation perTypeEvaluation(String chunkType) {
        ChunkingEvaluation evaluation = new ChunkingEvaluation();
        for (Chunking[] testCase : cases()) {
            Chunking referenceChunking = testCase[0];
            Chunking responseChunking = testCase[1];
            Chunking referenceChunkingRestricted
                = restrictTo(referenceChunking,chunkType);
            Chunking responseChunkingRestricted
                = restrictTo(responseChunking,chunkType);
            evaluation.addCase(referenceChunkingRestricted,
                               responseChunkingRestricted);
        }
        return evaluation;
    }

    /**
     * Returns a new chunking over the same character sequence as the
     * specified chunking, containing only those of its chunks whose
     * type equals the specified type.
     *
     * @param chunking Chunking to filter.
     * @param type Chunk type to retain.
     * @return The filtered chunking.
     */
    static Chunking restrictTo(Chunking chunking, String type) {
        CharSequence cs = chunking.charSequence();
        ChunkingImpl chunkingOut = new ChunkingImpl(cs);
        for (Chunk chunk : chunking.chunkSet()) {
            if (chunk.type().equals(type)) {
                chunkingOut.add(chunk);
            }
        }
        return chunkingOut;
    }

    /**
     * Renders the chunks of the specified chunking as a single line
     * of text terminated by a newline.  Chunks are visited in the
     * iteration order of the chunk set.  Each chunk is padded with
     * spaces from the end of the previously rendered chunk up to its
     * own start, then drawn as the first character of its type
     * (<code>'!'</code> if the type is the empty string) followed by
     * one period for each remaining character of the chunk.
     * Zero-length chunks contribute only their padding, and no padding
     * is added for a chunk starting before the end of the previous one.
     *
     * @param chunking Chunking whose chunks are rendered.
     * @return The rendered line, including the trailing newline.
     */
    static String formatChunks(Chunking chunking) {
        StringBuilder sb = new StringBuilder();
        int pos = 0;
        for (Chunk chunk : chunking.chunkSet()) {
            int start = chunk.start();
            int end = chunk.end();
            // a negative gap (overlapping chunks) appends nothing
            appendSpaces(sb,start-pos);
            String type = chunk.type();
            char marker = type.length() > 0 ? type.charAt(0) : '!';
            int chunkLength = end-start;
            if (chunkLength > 0) {
                sb.append(marker);
            }
            for (int j = 1; j < chunkLength; ++j) {
                sb.append('.');
            }
            pos = end;
        }
        sb.append('\n');
        return sb.toString();
    }

    /**
     * Renders a header for displaying a chunking: an indented line
     * listing every chunk as <code>(start,end):type</code>, an
     * indented line holding the underlying character sequence, and
     * up to three indented ruler lines produced by {@link
     * #printMods(int,int,StringBuilder,int)} for bases 1, 10 and 100.
     *
     * @param indent Number of spaces prefixed to each line.
     * @param chunking Chunking to describe.
     * @return The rendered header, ending in a newline.
     */
    static String formatHeader(int indent, Chunking chunking) {
        String cs = chunking.charSequence().toString();
        StringBuilder sb = new StringBuilder();
        appendSpaces(sb,indent);
        sb.append("CHUNKS= ");
        for (Chunk chunk : chunking.chunkSet()) {
            sb.append('(').append(chunk.start())
                .append(',').append(chunk.end())
                .append("):").append(chunk.type())
                .append(' ');
        }
        // The buffer ends in a space at this point (from "CHUNKS= " or
        // from the last chunk entry), so the line always needs closing.
        sb.append('\n');
        appendSpaces(sb,indent);
        sb.append(cs);
        sb.append('\n');
        int length = cs.length();
        // Every ruler line written by printMods ends in a newline, so
        // the result is newline-terminated whether or not any is written.
        printMods(1,length,sb,indent);
        printMods(10,length,sb,indent);
        printMods(100,length,sb,indent);
        return sb.toString();
    }

    /**
     * Appends one ruler line to the specified buffer, or nothing at
     * all if <code>length &lt;= base</code>.  The line consists of
     * <code>indent</code> spaces, then one character per position
     * <code>i</code> in <code>[0,length)</code>, then a newline.  For
     * base 1 every position shows the digit <code>i % 10</code>.  For
     * other bases, positions with <code>i &gt;= base</code> that are
     * multiples of 10 show the digit <code>(i / base) % 10</code>, and
     * all other positions show a space.
     *
     * @param base Power of ten selecting which decimal digit to show.
     * @param length Number of positions in the ruler.
     * @param sb Buffer to which the line is appended.
     * @param indent Number of spaces prefixed to the line.
     */
    static void printMods(int base, int length, StringBuilder sb, int indent) {
        if (length <= base) {
            return;
        }
        appendSpaces(sb,indent);
        for (int i = 0; i < length; ++i) {
            boolean showDigit = base == 1 || (i >= base && i % 10 == 0);
            if (showDigit) {
                sb.append((i/base)%10);
            } else {
                sb.append(' ');
            }
        }
        sb.append('\n');
    }

    /**
     * Appends the specified number of spaces to the buffer; appends
     * nothing if the count is zero or negative.
     */
    private static void appendSpaces(StringBuilder sb, int count) {
        for (int i = 0; i < count; ++i) {
            sb.append(' ');
        }
    }

    /**
     * Add an evaluation case consisting of a reference chunk
     * set and a response chunk set.
     *
     * <p>Response chunks that also occur in the reference are
     * recorded as true positives, the remaining response chunks as
     * false positives, and reference chunks with no matching
     * response chunk as false negatives.  Chunks are compared on
     * start, end and type only.
     *
     * @param referenceChunking Chunking of reference chunks.
     * @param responseChunking Chunking of response chunks.
     * @throws IllegalArgumentException If the chunkings are not
     * over the same character sequence.
     */
    public void addCase(Chunking referenceChunking,
                        Chunking responseChunking) {
        CharSequence cSeq = referenceChunking.charSequence();
        CharSequence responseCSeq = responseChunking.charSequence();
        // validate before touching any state of this evaluation
        if (!Strings.equalCharSequence(cSeq,responseCSeq)) {
            String msg = "Char sequences must be same."
                + " Reference char seq=" + cSeq
                + " Response char seq=" + responseCSeq;
            throw new IllegalArgumentException(msg);
        }

        StringBuilder sb = new StringBuilder();
        sb.append("\n");
        sb.append(formatHeader(5,referenceChunking)); // 5 is indent for " REF " and "RESP "
        sb.append("\n REF ");
        sb.append(formatChunks(referenceChunking));
        sb.append("RESP ");
        sb.append(formatChunks(responseChunking));
        sb.append("\n");
        mLastCase = sb.toString();

        mCases.add(new Chunking[] { referenceChunking, responseChunking });

        // Work on private copies: matched chunks are removed from refSet
        // below, so whatever is left in it afterwards is a false negative.
        Set<Chunk> refSet = unscoredChunkSet(referenceChunking);
        Set<Chunk> respSet = unscoredChunkSet(responseChunking);
        for (Chunk respChunk : respSet) {
            boolean inRef = refSet.remove(respChunk);
            ChunkAndCharSeq ccs = new ChunkAndCharSeq(respChunk,cSeq);
            if (inRef) {
                mTruePositiveSet.add(ccs);
            } else {
                mFalsePositiveSet.add(ccs);
            }
        }
        for (Chunk refChunk : refSet) {
            mFalseNegativeSet.add(new ChunkAndCharSeq(refChunk,cSeq));
        }
    }

    /**
     * Returns a new mutable set holding, for each chunk of the
     * specified chunking, a chunk created from only its start, end
     * and type.
     *
     * @param chunking Chunking whose chunks are copied.
     * @return Fresh mutable set of the copied chunks.
     */
    static Set<Chunk> unscoredChunkSet(Chunking chunking) {
        Set<Chunk> result = new HashSet<Chunk>();
        for (Chunk chunk : chunking.chunkSet()) {
            result.add(ChunkFactory.createChunk(chunk.start(),
                                                chunk.end(),
                                                chunk.type()));
        }
        return result;
    }

    /**
     * Returns the set of true positives.  True positives are chunks
     * that were in both a reference and response chunking case.  The
     * set returned contains instances of {@link ChunkAndCharSeq},
     * which combine a chunk and a character sequence.
     *
     * <p>The set is unmodifiable, but tracks the changes in this
     * evaluator.
     *
     * @return The set of true positives.
     */
    public Set<ChunkAndCharSeq> truePositiveSet() {
        return Collections.unmodifiableSet(mTruePositiveSet);
    }

    /**
     * Returns the set of false positives.  False positives are
     * response chunks that are not reference chunks.  The set returned
     * contains instances of {@link ChunkAndCharSeq}, which combine a
     * chunk and a character sequence.
     *
     * <p>The set is unmodifiable, but tracks the changes in this
     * evaluator.
     *
     * @return The set of false positives.
     */
    public Set<ChunkAndCharSeq> falsePositiveSet() {
        return Collections.unmodifiableSet(mFalsePositiveSet);
    }

    /**
     * Returns the set of false negatives.  False negatives are
     * reference chunks which are not response chunks.  The set
     * returned contains instances of {@link ChunkAndCharSeq}, which
     * combine a chunk and a character sequence.
     *
     * <p>The set is unmodifiable, but tracks the changes in this
     * evaluator.
     *
     * @return The set of false negatives.
     */
    public Set<ChunkAndCharSeq> falseNegativeSet() {
        return Collections.unmodifiableSet(mFalseNegativeSet);
    }

    /**
     * Return the precision-recall evaluation for this chunker.  A new
     * evaluation is constructed on each call from the current sizes
     * of the true positive, false negative and false positive sets,
     * so changes to it will not affect the results returned by this
     * class, and it does not track cases added later.
     *
     * @return The precision-recall evaluation.
     */
    public PrecisionRecallEvaluation precisionRecallEvaluation() {
        int tp = truePositiveSet().size();
        int fn = falseNegativeSet().size();
        int fp = falsePositiveSet().size();
        // the final count is fixed at zero: this class records no
        // true negatives
        return new PrecisionRecallEvaluation(tp,fn,fp,0);
    }

    /**
     * Returns the precision-recall evaluation for this chunking
     * as a string.
     *
     * @return This evaluation as a string.
     */
    @Override
    public String toString() {
        return precisionRecallEvaluation().toString();
    }
}