gate.util.OntologyMeasures Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of gate-core Show documentation
Show all versions of gate-core Show documentation
GATE - general architecture for text engineering - is open source
software capable of solving almost any text processing problem. This
artifact enables you to embed the core GATE Embedded with its essential
dependencies. You will be able to use the GATE Embedded API and load and
store GATE XML documents. This artifact is the perfect dependency for
CREOLE plugins or for applications that need to customize the GATE
dependencies due to conflict with their own dependencies or for a lower
footprint.
The newest version!
/**
* Copyright (c) 1995-2012, The University of Sheffield. See the file
* COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
*
* This file is part of GATE (see http://gate.ac.uk/), and is free
* software, licenced under the GNU Library General Public License,
* Version 2, June 1991 (in the distribution as file licence.html,
* and also available at http://gate.ac.uk/gate/licence.html).
*
* Thomas Heitz - 09/06/2010
*
* $Id$
*/
package gate.util;
import gate.Annotation;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.net.URL;
import java.text.NumberFormat;
import java.util.*;
/**
* Modified version of Precision and Recall called BDM that takes into
* account the distance of two concepts in an ontology.
*/
public class OntologyMeasures {
/**
 * Creates an instance with no BDM file set; the per-type BDM and differ
 * maps start out empty (they are initialised where the fields are
 * declared).
 */
public OntologyMeasures() {
// empty constructor
}
/**
 * Constructor to be used when you have a collection of OntologyMeasures
 * and want to consider it as only one OntologyMeasures.
 * Then you can only use the methods getPrecision/Recall/FMeasure...().
 * <p>
 * For every annotation type the BDM totals of the given measures are added
 * together, and the AnnotationDiffer objects recorded for that type are
 * handed to a single new AnnotationDiffer.
 *
 * @param measures collection to be regrouped in one OntologyMeasures
 */
public OntologyMeasures(Collection<OntologyMeasures> measures) {
  // annotation type -> every differ recorded for that type in the collection
  Map<String, List<AnnotationDiffer>> differsByTypeMap =
    new HashMap<String, List<AnnotationDiffer>>();
  for (OntologyMeasures measure : measures) {
    for (Map.Entry<String, Float> entry : measure.bdmByTypeMap.entrySet()) {
      Float previousBdm = bdmByTypeMap.get(entry.getKey());
      // set the bdmByTypeMap to be the sum of those in the collection
      bdmByTypeMap.put(entry.getKey(),
        (previousBdm == null ? 0f : previousBdm) + entry.getValue());
    }
    for (Map.Entry<String, AnnotationDiffer> entry :
           measure.differByTypeMap.entrySet()) {
      List<AnnotationDiffer> differs = differsByTypeMap.get(entry.getKey());
      if (differs == null) {
        // first differ seen for this type: register its list once
        differs = new ArrayList<AnnotationDiffer>();
        differsByTypeMap.put(entry.getKey(), differs);
      }
      differs.add(entry.getValue());
    }
  }
  // combine the list of AnnotationDiffer for each type
  for (Map.Entry<String, List<AnnotationDiffer>> entry :
         differsByTypeMap.entrySet()) {
    differByTypeMap.put(entry.getKey(),
      new AnnotationDiffer(entry.getValue()));
  }
}
/**
* For a document get the annotation differs that contain the type to compare
* and the annotation differs that may have miscategorized annotations
* for this type. Then we try to find miscategorized types that are close
* enough from the main type and use their BDM value to get an augmented
* precision, recall and fscore.
*
* @param differs annotation differ for the type and for possible
* miscategorized types.
*/
public void calculateBdm(Collection differs) {
if (bdmByConceptsMap == null) {
// load BDM file with scores for each concept/annotation type pair
bdmByConceptsMap = read(bdmFileUrl); // read the bdm scores
}
// calculate BDM from the spurious and missing annotations
Set unpairedResponseAnnotations = new HashSet();
Set unpairedKeyAnnotations;
// will use the whole spurious annotations as the second set to compare
for (AnnotationDiffer differ : differs) {
unpairedResponseAnnotations.addAll(
differ.getAnnotationsOfType(AnnotationDiffer.SPURIOUS_TYPE));
}
bdmByTypeMap.clear();
for (AnnotationDiffer differ : differs) {
unpairedKeyAnnotations = differ.getAnnotationsOfType(
AnnotationDiffer.MISSING_TYPE);
if (!bdmByTypeMap.containsKey(differ.getAnnotationType())) {
bdmByTypeMap.put(differ.getAnnotationType(), 0f);
}
// use the missing annotations as the first set to compare
for (Annotation unpairedKeyAnnotation : unpairedKeyAnnotations) {
String type = unpairedKeyAnnotation.getType();
// Out.prln("unpairedKeyAnnotation: " + unpairedKeyAnnotation.toString());
Iterator iterator = unpairedResponseAnnotations.iterator();
// use the spurious annotations as the second set to compare
while (iterator.hasNext()) {
Annotation unpairedResponseAnnotation = iterator.next();
// Out.prln("unpairedResponsAnnotation: "
// + unpairedResponseAnnotation.toString());
float bdm = 0;
// annotations have the same start and end offsets
if (unpairedKeyAnnotation.coextensive(unpairedResponseAnnotation)) {
// compare both features values with BDM pairs
if (differ.getSignificantFeaturesSet() != null) {
if (!type.equals(unpairedResponseAnnotation.getType())) {
continue; // types must be the same
}
for (Object feature : differ.getSignificantFeaturesSet()) {
if (unpairedKeyAnnotation.getFeatures() == null
|| unpairedResponseAnnotation.getFeatures() == null) {
continue;
}
// Out.prln("Feature: " + feature);
String keyLabel = (String)
unpairedKeyAnnotation.getFeatures().get(feature);
// Out.prln("KeyLabel: " + keyLabel);
String responseLabel = (String)
unpairedResponseAnnotation.getFeatures().get(feature);
// Out.prln("ResponseLabel: " + responseLabel);
if (keyLabel == null || responseLabel == null) {
// do nothing
} else if (bdmByConceptsMap.containsKey(
keyLabel + ", " + responseLabel)) {
bdm += bdmByConceptsMap.get(keyLabel + ", " + responseLabel);
} else if (bdmByConceptsMap.containsKey(
responseLabel + ", " + keyLabel)) {
bdm += bdmByConceptsMap.get(responseLabel + ", " + keyLabel);
}
}
bdm = bdm / differ.getSignificantFeaturesSet().size();
} else { // compare both types with BDM pairs
if (bdmByConceptsMap.containsKey(
type + ',' + unpairedResponseAnnotation.getType())) {
bdm = bdmByConceptsMap.get(
type + ',' + unpairedResponseAnnotation.getType());
} else if (bdmByConceptsMap.containsKey(
unpairedResponseAnnotation.getType() + ", " + type)) {
bdm = bdmByConceptsMap.get(
unpairedResponseAnnotation.getType() + ", " + type);
}
}
if (bdm > 0) {
bdmByTypeMap.put(type, bdmByTypeMap.get(type) + bdm);
iterator.remove();
// Out.prln("BDM: " + bdmByTypeMap.get(type));
}
}
}
}
}
differByTypeMap.clear();
Map> differsByTypeMap =
new HashMap>();
for (AnnotationDiffer differ : differs) {
// we consider that all annotations in AnnotationDiffer are the same type
String type = differ.getAnnotationType();
List differsType = differsByTypeMap.get(type);
if (differsType == null) {
differsType = new ArrayList();
}
differsType.add(differ);
differsByTypeMap.put(type, differsType);
}
// combine the list of AnnotationDiffer for each type
for (Map.Entry> entry :
differsByTypeMap.entrySet()) {
differByTypeMap.put(entry.getKey(),
new AnnotationDiffer(entry.getValue()));
}
}
/**
 * Strict precision of one annotation type, augmented with its BDM total:
 * (BDM total + Correct) / (Correct + Spurious).
 * Returns 1.0 when the type has neither correct nor spurious annotations.
 *
 * @param type annotation type
 * @return strict precision with BDM correction
 */
public double getPrecisionStrictBdm(String type) {
  AnnotationDiffer typeDiffer = differByTypeMap.get(type);
  int responses = typeDiffer.getCorrectMatches() + typeDiffer.getSpurious();
  if (responses == 0) {
    return 1.0;
  }
  float credited = bdmByTypeMap.get(type) + typeDiffer.getCorrectMatches();
  return credited / responses;
}
/**
 * Unweighted mean, over every annotation type in the differ map, of the
 * strict precision with BDM correction. The result is NaN when the map
 * is empty.
 *
 * @return mean strict precision with BDM correction
 */
public double getPrecisionStrictBdm() {
  double sum = 0;
  Iterator<String> types = differByTypeMap.keySet().iterator();
  while (types.hasNext()) {
    sum += getPrecisionStrictBdm(types.next());
  }
  return sum / differByTypeMap.size();
}
/**
 * Strict recall of one annotation type, augmented with its BDM total:
 * (BDM total + Correct) / (Correct + Missing).
 * Returns 1.0 when the type has neither correct nor missing annotations.
 *
 * @param type annotation type
 * @return strict recall with BDM correction
 */
public double getRecallStrictBdm(String type) {
  AnnotationDiffer typeDiffer = differByTypeMap.get(type);
  int keys = typeDiffer.getCorrectMatches() + typeDiffer.getMissing();
  if (keys == 0) {
    return 1.0;
  }
  float credited = bdmByTypeMap.get(type) + typeDiffer.getCorrectMatches();
  return credited / keys;
}
/**
 * Unweighted mean, over every annotation type in the differ map, of the
 * strict recall with BDM correction. The result is NaN when the map is
 * empty.
 *
 * @return mean strict recall with BDM correction
 */
public double getRecallStrictBdm() {
  double sum = 0;
  Iterator<String> types = differByTypeMap.keySet().iterator();
  while (types.hasNext()) {
    sum += getRecallStrictBdm(types.next());
  }
  return sum / differByTypeMap.size();
}
/**
 * F-measure of one annotation type computed from the strict precision P
 * and strict recall R with BDM correction:
 * (beta^2 + 1) * P * R / (beta^2 * P + R).
 * A NaN result is replaced by 0.0.
 *
 * @param type annotation type
 * @param beta relative weight of precision and recall
 * @return strict F-measure with BDM correction
 */
public double getFMeasureStrictBdm(String type, double beta) {
  final double p = getPrecisionStrictBdm(type);
  final double r = getRecallStrictBdm(type);
  final double weight = beta * beta;
  final double fMeasure = ((weight + 1) * p * r) / (weight * p + r);
  return Double.isNaN(fMeasure) ? 0.0 : fMeasure;
}
/**
 * Unweighted mean, over every annotation type in the differ map, of the
 * strict F-measure with BDM correction. The result is NaN when the map is
 * empty.
 *
 * @param beta relative weight of precision and recall
 * @return mean strict F-measure with BDM correction
 */
public double getFMeasureStrictBdm(double beta) {
  double sum = 0;
  Iterator<String> types = differByTypeMap.keySet().iterator();
  while (types.hasNext()) {
    sum += getFMeasureStrictBdm(types.next(), beta);
  }
  return sum / differByTypeMap.size();
}
/**
 * Lenient precision of one annotation type, augmented with its BDM total:
 * (BDM total + Correct + Partially correct) / (Correct + Spurious).
 * Returns 1.0 when the type has neither correct nor spurious annotations.
 * <p>
 * NOTE(review): partially correct matches are counted in the numerator but
 * not in the denominator, so the value can exceed 1 -- confirm that this
 * is intended.
 *
 * @param type annotation type
 * @return lenient precision with BDM correction
 */
public double getPrecisionLenientBdm(String type) {
  AnnotationDiffer typeDiffer = differByTypeMap.get(type);
  int responses = typeDiffer.getCorrectMatches() + typeDiffer.getSpurious();
  if (responses == 0) {
    return 1.0;
  }
  float credited = bdmByTypeMap.get(type) + typeDiffer.getCorrectMatches()
    + typeDiffer.getPartiallyCorrectMatches();
  return credited / responses;
}
/**
 * Unweighted mean, over every annotation type in the differ map, of the
 * lenient precision with BDM correction. The result is NaN when the map
 * is empty.
 *
 * @return mean lenient precision with BDM correction
 */
public double getPrecisionLenientBdm() {
  double sum = 0;
  Iterator<String> types = differByTypeMap.keySet().iterator();
  while (types.hasNext()) {
    sum += getPrecisionLenientBdm(types.next());
  }
  return sum / differByTypeMap.size();
}
/**
 * Lenient recall of one annotation type, augmented with its BDM total:
 * (BDM total + Correct + Partially correct) / (Correct + Missing).
 * Returns 1.0 when the type has neither correct nor missing annotations.
 * <p>
 * NOTE(review): partially correct matches are counted in the numerator but
 * not in the denominator, so the value can exceed 1 -- confirm that this
 * is intended.
 *
 * @param type annotation type
 * @return lenient recall with BDM correction
 */
public double getRecallLenientBdm(String type) {
  AnnotationDiffer typeDiffer = differByTypeMap.get(type);
  int keys = typeDiffer.getCorrectMatches() + typeDiffer.getMissing();
  if (keys == 0) {
    return 1.0;
  }
  float credited = bdmByTypeMap.get(type) + typeDiffer.getCorrectMatches()
    + typeDiffer.getPartiallyCorrectMatches();
  return credited / keys;
}
/**
 * Unweighted mean, over every annotation type in the differ map, of the
 * lenient recall with BDM correction. The result is NaN when the map is
 * empty.
 *
 * @return mean lenient recall with BDM correction
 */
public double getRecallLenientBdm() {
  double sum = 0;
  Iterator<String> types = differByTypeMap.keySet().iterator();
  while (types.hasNext()) {
    sum += getRecallLenientBdm(types.next());
  }
  return sum / differByTypeMap.size();
}
/**
 * F-measure of one annotation type computed from the lenient precision P
 * and lenient recall R with BDM correction:
 * (beta^2 + 1) * P * R / (beta^2 * P + R).
 * A NaN result is replaced by 0.0.
 *
 * @param type annotation type
 * @param beta relative weight of precision and recall
 * @return lenient F-measure with BDM correction
 */
public double getFMeasureLenientBdm(String type, double beta) {
  final double p = getPrecisionLenientBdm(type);
  final double r = getRecallLenientBdm(type);
  final double weight = beta * beta;
  final double fMeasure = ((weight + 1) * p * r) / (weight * p + r);
  return Double.isNaN(fMeasure) ? 0.0 : fMeasure;
}
/**
 * Unweighted mean, over every annotation type in the differ map, of the
 * lenient F-measure with BDM correction. The result is NaN when the map
 * is empty.
 *
 * @param beta relative weight of precision and recall
 * @return mean lenient F-measure with BDM correction
 */
public double getFMeasureLenientBdm(double beta) {
  double sum = 0;
  Iterator<String> types = differByTypeMap.keySet().iterator();
  while (types.hasNext()) {
    sum += getFMeasureLenientBdm(types.next(), beta);
  }
  return sum / differByTypeMap.size();
}
/**
 * Gets the average of the strict and lenient precision values, with BDM
 * correction, for one annotation type.
 *
 * @param type annotation type
 * @return a double value.
 */
public double getPrecisionAverageBdm(String type) {
  double lenient = getPrecisionLenientBdm(type);
  double strict = getPrecisionStrictBdm(type);
  return (lenient + strict) / 2.0;
}
/**
 * Gets the average of the strict and lenient precision values, each one
 * with BDM correction and averaged over all annotation types.
 *
 * @return a double value.
 */
public double getPrecisionAverageBdm() {
  double lenient = getPrecisionLenientBdm();
  double strict = getPrecisionStrictBdm();
  return (lenient + strict) / 2.0;
}
/**
 * Gets the average of the strict and lenient recall values, with BDM
 * correction, for one annotation type.
 *
 * @param type annotation type
 * @return a double value.
 */
public double getRecallAverageBdm(String type) {
  double lenient = getRecallLenientBdm(type);
  double strict = getRecallStrictBdm(type);
  return (lenient + strict) / 2.0;
}
/**
 * Gets the average of the strict and lenient recall values, each one with
 * BDM correction and averaged over all annotation types.
 *
 * @return a double value.
 */
public double getRecallAverageBdm() {
  double lenient = getRecallLenientBdm();
  double strict = getRecallStrictBdm();
  return (lenient + strict) / 2.0;
}
/**
 * Gets the average of the strict and lenient F-Measure values, with BDM
 * correction, for one annotation type.
 *
 * @param type annotation type
 * @param beta The relative weight of precision and recall.
 * @return a double value.
 */
public double getFMeasureAverageBdm(String type, double beta) {
  double lenient = getFMeasureLenientBdm(type, beta);
  double strict = getFMeasureStrictBdm(type, beta);
  return (lenient + strict) / 2.0;
}
/**
 * Gets the average of strict and lenient F-Measure values, each one with
 * BDM correction and averaged over all annotation types.
 *
 * @param beta The relative weight of precision and recall. A value of 1
 * gives equal weights to precision and recall. A value of 0 takes the recall
 * value completely out of the equation.
 * @return a double value.
 */
public double getFMeasureAverageBdm(double beta) {
  double lenient = getFMeasureLenientBdm(beta);
  double strict = getFMeasureStrictBdm(beta);
  return (lenient + strict) / 2.0;
}
/**
 * Sets the location of the BDM file and discards the scores already
 * loaded, so that the next call to calculateBdm reads the file again.
 *
 * @param url URL of the BDM file
 */
public void setBdmFile(URL url) {
  this.bdmByConceptsMap = null;
  this.bdmFileUrl = url;
}
/**
 * Read the BDM scores from a file.
 * <p>
 * The first line is skipped as a header. Every other line is split on
 * ", " and must have more than three fields, although only the first three
 * are used: stripped of their first 4, 9 and 4 characters respectively
 * (presumably "key=", "response=" and "bdm=" prefixes -- TODO confirm
 * against the BDM file format) they give the two concepts and their score.
 * A line that does not fit this format is reported through Out and skipped.
 * A failure to open or read the file is reported through Out together with
 * a stack trace, and the scores read so far are returned.
 *
 * @param bdmFile URL of the BDM file, may be null
 * @return map from a pair of concepts, written "concept1, concept2", to
 * their BDM score; never null, empty when bdmFile is null
 */
public Map<String, Float> read(URL bdmFile) {
  Map<String, Float> scores = new HashMap<String, Float>();
  if (bdmFile == null) {
    Out.prln("There is no BDM file specified.");
    return scores;
  }
  BufferedReader bdmResultsReader = null;
  try {
    bdmResultsReader = new BomStrippingInputStreamReader(
      new FileInputStream(Files.fileFromURL(bdmFile)), "UTF-8");
    bdmResultsReader.readLine(); // skip the first line as the header
    String line = bdmResultsReader.readLine();
    while (line != null) {
      String[] terms = line.split(", ");
      Float score = null;
      // the length checks keep substring() from throwing on short fields
      if (terms.length > 3 && terms[0].length() >= 4
       && terms[1].length() >= 9 && terms[2].length() >= 4) {
        try {
          score = Float.valueOf(terms[2].substring(4));
        } catch (NumberFormatException ignored) {
          // score stays null: reported below as an incorrectly formatted
          // line instead of abandoning the rest of the file
        }
      }
      if (score != null) {
        String oneCon = terms[0].substring(4);
        String anoCon = terms[1].substring(9);
        scores.put(oneCon + ", " + anoCon, score);
      } else {
        Out.prln("File " + bdmFile.toString() + " has incorrect format " +
          "for the line [" + line + "].");
      }
      line = bdmResultsReader.readLine();
    }
  } catch(Exception e) {
    Out.prln("There is something wrong with the BDM file.");
    e.printStackTrace();
  } finally {
    if (bdmResultsReader != null) {
      try {
        bdmResultsReader.close();
      } catch (IOException e) {
        e.printStackTrace();
      }
    }
  }
  return scores;
}
/**
 * Builds one row of results. The row starts with the title, followed by
 * the numbers of correct, missing, spurious and partially correct matches
 * taken from an AnnotationDiffer built from the differs of all annotation
 * types, then precision, recall and F-measure for each requested measure.
 *
 * @param measures measure names; each one must be a String whose text
 * between its second character and its first '-' is the F-measure beta,
 * and which ends with "strict", "strict BDM", "lenient", "lenient BDM",
 * "average" or "average BDM". A name with another ending adds no cell.
 * @param title first cell of the row
 * @return the cells of the row; precision, recall and F-measure are
 * formatted with exactly two fraction digits in the English locale
 */
public List<String> getMeasuresRow(Object[] measures, String title) {
  List<AnnotationDiffer> differs = new ArrayList<AnnotationDiffer>(
    getDifferByTypeMap().values());
  AnnotationDiffer differ = new AnnotationDiffer(differs);
  NumberFormat f = NumberFormat.getInstance(Locale.ENGLISH);
  f.setMaximumFractionDigits(2);
  f.setMinimumFractionDigits(2);
  List<String> row = new ArrayList<String>();
  row.add(title);
  row.add(Integer.toString(differ.getCorrectMatches()));
  row.add(Integer.toString(differ.getMissing()));
  row.add(Integer.toString(differ.getSpurious()));
  row.add(Integer.toString(differ.getPartiallyCorrectMatches()));
  for (Object object : measures) {
    String measure = (String) object;
    // beta is written between the first character and the first '-'
    double beta = Double.parseDouble(
      measure.substring(1, measure.indexOf('-')));
    if (measure.endsWith("strict")) {
      row.add(f.format(differ.getPrecisionStrict()));
      row.add(f.format(differ.getRecallStrict()));
      row.add(f.format(differ.getFMeasureStrict(beta)));
    } else if (measure.endsWith("strict BDM")) {
      row.add(f.format(getPrecisionStrictBdm()));
      row.add(f.format(getRecallStrictBdm()));
      row.add(f.format(getFMeasureStrictBdm(beta)));
    } else if (measure.endsWith("lenient")) {
      row.add(f.format(differ.getPrecisionLenient()));
      row.add(f.format(differ.getRecallLenient()));
      row.add(f.format(differ.getFMeasureLenient(beta)));
    } else if (measure.endsWith("lenient BDM")) {
      row.add(f.format(getPrecisionLenientBdm()));
      row.add(f.format(getRecallLenientBdm()));
      row.add(f.format(getFMeasureLenientBdm(beta)));
    } else if (measure.endsWith("average")) {
      row.add(f.format(differ.getPrecisionAverage()));
      row.add(f.format(differ.getRecallAverage()));
      row.add(f.format(differ.getFMeasureAverage(beta)));
    } else if (measure.endsWith("average BDM")) {
      row.add(f.format(getPrecisionAverageBdm()));
      row.add(f.format(getRecallAverageBdm()));
      row.add(f.format(getFMeasureAverageBdm(beta)));
    }
  }
  return row;
}
/**
 * Be careful, don't modify it.
 * That's not a copy because it would take too much memory.
 * @return differ by type map: the map held by this object, from annotation
 * type to the AnnotationDiffer for that type
 */
public Map<String, AnnotationDiffer> getDifferByTypeMap() {
  return differByTypeMap;
}
/** Sum of the BDM scores credited to each annotation type. */
protected Map<String, Float> bdmByTypeMap = new HashMap<String, Float>();
/** Location of the BDM file, as given to setBdmFile. */
protected URL bdmFileUrl;
/** AnnotationDiffer combining all the differs of each annotation type. */
protected Map<String, AnnotationDiffer> differByTypeMap =
  new HashMap<String, AnnotationDiffer>();
/**
 * BDM score of each pair of concepts, keyed "concept1, concept2";
 * null until the BDM file has been read.
 */
protected Map<String, Float> bdmByConceptsMap;
}