All downloads are FREE. The search and download functionality uses the official Maven repository.

gate.util.OntologyMeasures Maven / Gradle / Ivy

Go to download

GATE - general architecture for text engineering - is open source software capable of solving almost any text processing problem. This artifact enables you to embed the core GATE Embedded with its essential dependencies. You will be able to use the GATE Embedded API and load and store GATE XML documents. This artifact is the perfect dependency for CREOLE plugins or for applications that need to customize the GATE dependencies due to conflicts with their own dependencies or for a lower footprint.

The newest version!
/**
 *  Copyright (c) 1995-2012, The University of Sheffield. See the file
 *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 * Thomas Heitz - 09/06/2010
 *
 *  $Id$
 */

package gate.util;

import gate.Annotation;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.net.URL;
import java.text.NumberFormat;
import java.util.*;

/**
 * Modified version of Precision and Recall called BDM that takes into
 * account the distance of two concepts in an ontology.
 */
public class OntologyMeasures {

  public OntologyMeasures() {
    // empty constructor
  }

  /**
   * Constructor to be used when you have a collection of OntologyMeasures
   * and want to consider it as only one OntologyMeasures.
   * Then you can only use the methods getPrecision/Recall/FMeasure...().
   * @param measures collection to be regrouped in one OntologyMeasures
   */
  public OntologyMeasures(Collection measures) {
    Map> differsByTypeMap =
      new HashMap>();
    for (OntologyMeasures measure : measures) {
      for (Map.Entry entry : measure.bdmByTypeMap.entrySet()) {
        float previousBdm = 0;
        if (bdmByTypeMap.containsKey(entry.getKey())) {
          previousBdm = bdmByTypeMap.get(entry.getKey());
        }
        // set the bdmByTypeMap to be the sum of those in the collection
        bdmByTypeMap.put(entry.getKey(), previousBdm + entry.getValue());
      }
      for (Map.Entry entry :
             measure.differByTypeMap.entrySet()) {
        List differs = differsByTypeMap.get(entry.getKey());
        if (differs == null) {
          differs = new ArrayList();
        }
        differs.add(entry.getValue());
        differsByTypeMap.put(entry.getKey(), differs);
      }
    }
    // combine the list of AnnotationDiffer for each type
    for (Map.Entry> entry :
           differsByTypeMap.entrySet()) {
      differByTypeMap.put(entry.getKey(),
        new AnnotationDiffer(entry.getValue()));
    }
  }

  /**
   * For a document get the annotation differs that contain the type to compare
   * and the annotation differs that may have miscategorized annotations
   * for this type. Then we try to find miscategorized types that are close
   * enough from the main type and use their BDM value to get an augmented
   * precision, recall and fscore.
   *
   * @param differs annotation differ for the type and for possible
   * miscategorized types.
   */
  public void calculateBdm(Collection differs) {

    if (bdmByConceptsMap == null) {
      // load BDM file with scores for each concept/annotation type pair
      bdmByConceptsMap = read(bdmFileUrl); // read the bdm scores
    }

    // calculate BDM from the spurious and missing annotations
    Set unpairedResponseAnnotations = new HashSet();
    Set unpairedKeyAnnotations;

    // will use the whole spurious annotations as the second set to compare
    for (AnnotationDiffer differ : differs) {
      unpairedResponseAnnotations.addAll(
        differ.getAnnotationsOfType(AnnotationDiffer.SPURIOUS_TYPE));
    }

    bdmByTypeMap.clear();

    for (AnnotationDiffer differ : differs) {
      unpairedKeyAnnotations = differ.getAnnotationsOfType(
        AnnotationDiffer.MISSING_TYPE);
      if (!bdmByTypeMap.containsKey(differ.getAnnotationType())) {
        bdmByTypeMap.put(differ.getAnnotationType(), 0f);
      }

      // use the missing annotations as the first set to compare
      for (Annotation unpairedKeyAnnotation : unpairedKeyAnnotations) {
        String type = unpairedKeyAnnotation.getType();
//        Out.prln("unpairedKeyAnnotation: " + unpairedKeyAnnotation.toString());
        Iterator iterator = unpairedResponseAnnotations.iterator();

        // use the spurious annotations as the second set to compare
        while (iterator.hasNext()) {
          Annotation unpairedResponseAnnotation = iterator.next();
//          Out.prln("unpairedResponsAnnotation: "
//            + unpairedResponseAnnotation.toString());
          float bdm = 0;

          // annotations have the same start and end offsets
          if (unpairedKeyAnnotation.coextensive(unpairedResponseAnnotation)) {

            // compare both features values with BDM pairs
            if (differ.getSignificantFeaturesSet() != null) {
              if (!type.equals(unpairedResponseAnnotation.getType())) {
                continue; // types must be the same
              }
              for (Object feature : differ.getSignificantFeaturesSet()) {
                if (unpairedKeyAnnotation.getFeatures() == null
                 || unpairedResponseAnnotation.getFeatures() == null) {
                  continue;
                }
//                Out.prln("Feature: " + feature);
                String keyLabel = (String)
                  unpairedKeyAnnotation.getFeatures().get(feature);
//                Out.prln("KeyLabel: " + keyLabel);
                String responseLabel = (String)
                  unpairedResponseAnnotation.getFeatures().get(feature);
//                Out.prln("ResponseLabel: " + responseLabel);
                if (keyLabel == null || responseLabel == null) {
                  // do nothing
                } else if (bdmByConceptsMap.containsKey(
                                              keyLabel + ", " + responseLabel)) {
                  bdm += bdmByConceptsMap.get(keyLabel + ", " + responseLabel);
                } else if (bdmByConceptsMap.containsKey(
                                              responseLabel + ", " + keyLabel)) {
                  bdm += bdmByConceptsMap.get(responseLabel + ", " + keyLabel);
                }
              }
              bdm = bdm / differ.getSignificantFeaturesSet().size();

            } else { // compare both types with BDM pairs
              if (bdmByConceptsMap.containsKey(
                    type + ',' + unpairedResponseAnnotation.getType())) {
                bdm = bdmByConceptsMap.get(
                    type + ',' + unpairedResponseAnnotation.getType());
              } else if (bdmByConceptsMap.containsKey(
                           unpairedResponseAnnotation.getType() + ", " + type)) {
                bdm = bdmByConceptsMap.get(
                           unpairedResponseAnnotation.getType() + ", " + type);
              }
            }
            if (bdm > 0) {
              bdmByTypeMap.put(type, bdmByTypeMap.get(type) + bdm);
              iterator.remove();
//              Out.prln("BDM: " + bdmByTypeMap.get(type));
            }
          }
        }
      }
    }

    differByTypeMap.clear();
    Map> differsByTypeMap =
      new HashMap>();

    for (AnnotationDiffer differ : differs) {
      // we consider that all annotations in AnnotationDiffer are the same type
      String type = differ.getAnnotationType();
      List differsType = differsByTypeMap.get(type);
      if (differsType == null) {
        differsType = new ArrayList();
      }
      differsType.add(differ);
      differsByTypeMap.put(type, differsType);
    }

    // combine the list of AnnotationDiffer for each type
    for (Map.Entry> entry :
          differsByTypeMap.entrySet()) {
      differByTypeMap.put(entry.getKey(),
        new AnnotationDiffer(entry.getValue()));
    }
  }

  /**
   * AP = (sum of BDMs for BDM-matching pair spurious/missing + Correct)
   *    / (Correct + Spurious)
   * @param type annotation type
   * @return strict precision with BDM correction
   */
  public double getPrecisionStrictBdm(String type) {
    AnnotationDiffer differ = differByTypeMap.get(type);
    if (differ.getCorrectMatches() + differ.getSpurious() == 0) {
      return 1.0;
    }
    return (bdmByTypeMap.get(type) + differ.getCorrectMatches())
         / (differ.getCorrectMatches() + differ.getSpurious());
  }

  public double getPrecisionStrictBdm() {
    double result = 0;
    for (String type : differByTypeMap.keySet()) {
      result += getPrecisionStrictBdm(type);
    }
    return result / differByTypeMap.size();
  }

  public double getRecallStrictBdm(String type) {
    AnnotationDiffer differ = differByTypeMap.get(type);
    if (differ.getCorrectMatches() + differ.getMissing() == 0) {
      return 1.0;
    }
    return (bdmByTypeMap.get(type) + differ.getCorrectMatches())
         / (differ.getCorrectMatches() + differ.getMissing());
  }

  public double getRecallStrictBdm() {
    double result = 0;
    for (String type : differByTypeMap.keySet()) {
      result += getRecallStrictBdm(type);
    }
    return result / differByTypeMap.size();
  }

  public double getFMeasureStrictBdm(String type, double beta) {
    double precision = getPrecisionStrictBdm(type);
    double recall = getRecallStrictBdm(type);
    double betaSq = beta * beta;
    double answer = ((betaSq + 1) * precision * recall)
                  / (betaSq * precision + recall);
    if(Double.isNaN(answer)) answer = 0.0;
    return answer;
  }

  public double getFMeasureStrictBdm(double beta) {
    double result = 0;
    for (String type : differByTypeMap.keySet()) {
      result += getFMeasureStrictBdm(type, beta);
    }
    return result / differByTypeMap.size();
  }

  public double getPrecisionLenientBdm(String type) {
    AnnotationDiffer differ = differByTypeMap.get(type);
    if (differ.getCorrectMatches() + differ.getSpurious() == 0) {
      return 1.0;
    }
    return (bdmByTypeMap.get(type) + differ.getCorrectMatches()
          + differ.getPartiallyCorrectMatches())
         / (differ.getCorrectMatches() + differ.getSpurious());
  }

  public double getPrecisionLenientBdm() {
    double result = 0;
    for (String type : differByTypeMap.keySet()) {
      result += getPrecisionLenientBdm(type);
    }
    return result / differByTypeMap.size();
  }

  public double getRecallLenientBdm(String type) {
    AnnotationDiffer differ = differByTypeMap.get(type);
    if (differ.getCorrectMatches() + differ.getMissing() == 0) {
      return 1.0;
    }
    return (bdmByTypeMap.get(type) + differ.getCorrectMatches()
          + differ.getPartiallyCorrectMatches())
         / (differ.getCorrectMatches() + differ.getMissing());
  }

  public double getRecallLenientBdm() {
    double result = 0;
    for (String type : differByTypeMap.keySet()) {
      result += getRecallLenientBdm(type);
    }
    return result / differByTypeMap.size();
  }

  public double getFMeasureLenientBdm(String type, double beta) {
    double precision = getPrecisionLenientBdm(type);
    double recall = getRecallLenientBdm(type);
    double betaSq = beta * beta;
    double answer = ((betaSq + 1) * precision * recall)
                  / (betaSq * precision + recall);
    if(Double.isNaN(answer)) answer = 0.0;
    return answer;
  }

  public double getFMeasureLenientBdm(double beta) {
    double result = 0;
    for (String type : differByTypeMap.keySet()) {
      result += getFMeasureLenientBdm(type, beta);
    }
    return result / differByTypeMap.size();
  }

  public double getPrecisionAverageBdm(String type) {
    return (getPrecisionLenientBdm(type) + getPrecisionStrictBdm(type)) / 2.0;
  }

  /**
   * Gets the average of the strict and lenient precision values.
   * @return a double value.
   */
  public double getPrecisionAverageBdm() {
    return (getPrecisionLenientBdm() + getPrecisionStrictBdm()) / 2.0;
  }

  public double getRecallAverageBdm(String type) {
    return (getRecallLenientBdm(type) + getRecallStrictBdm(type)) / 2.0;
  }

  /**
   * Gets the average of the strict and lenient recall values.
   * @return a double value.
   */
  public double getRecallAverageBdm() {
    return (getRecallLenientBdm() + getRecallStrictBdm()) / 2.0;
  }

  public double getFMeasureAverageBdm(String type, double beta) {
    return (getFMeasureLenientBdm(type, beta)
          + getFMeasureStrictBdm(type, beta))
          / 2.0;
  }

  /**
   * Gets the average of strict and lenient F-Measure values.
   * @param beta The relative weight of precision and recall. A value of 1
   * gives equal weights to precision and recall. A value of 0 takes the recall
   * value completely out of the equation.
   * @return a doublevalue.
   */
  public double getFMeasureAverageBdm(double beta) {
    return (getFMeasureLenientBdm(beta) + getFMeasureStrictBdm(beta)) / 2.0;
  }

  public void setBdmFile(URL url) {
    bdmFileUrl = url;
    bdmByConceptsMap = null;
  }

  /**
   * Read the BDM scores from a file.
   * @param bdmFile URL of the BDM file
   * @return map from a pair of concepts to their BDM score
   */
  public Map read(URL bdmFile) {
    Map bdmByConceptsMap = new HashMap();
    if (bdmFile == null) {
      Out.prln("There is no BDM file specified.");
      return bdmByConceptsMap;
    }
    BufferedReader bdmResultsReader = null;
    try {
      bdmResultsReader = new BomStrippingInputStreamReader(
        new FileInputStream(Files.fileFromURL(bdmFile)), "UTF-8");
      bdmResultsReader.readLine(); // skip the first line as the header
      String line = bdmResultsReader.readLine();
      while (line != null) {
        String[] terms = line.split(", ");
        if (terms.length > 3) {
          String oneCon = terms[0].substring(4);
          String anoCon = terms[1].substring(9);
          String bdmS = terms[2].substring(4);
          bdmByConceptsMap.put(oneCon + ", " + anoCon, Float.valueOf(bdmS));
        } else {
          Out.prln("File " + bdmFile.toString() + " has incorrect format" +
            "for the line [" + line + "].");
        }
        line = bdmResultsReader.readLine();
      }

    } catch(Exception e) {
      Out.prln("There is something wrong with the BDM file.");
      e.printStackTrace();

    } finally {
      if (bdmResultsReader != null) {
        try {
          bdmResultsReader.close();
        } catch (IOException e) {
          e.printStackTrace();
        }
      }
    }
    return bdmByConceptsMap;
  }

  public List getMeasuresRow(Object[] measures, String title) {
    List differs = new ArrayList(
      getDifferByTypeMap().values());
    AnnotationDiffer differ = new AnnotationDiffer(differs);
    NumberFormat f = NumberFormat.getInstance(Locale.ENGLISH);
    f.setMaximumFractionDigits(2);
    f.setMinimumFractionDigits(2);
    List row = new ArrayList();
    row.add(title);
    row.add(Integer.toString(differ.getCorrectMatches()));
    row.add(Integer.toString(differ.getMissing()));
    row.add(Integer.toString(differ.getSpurious()));
    row.add(Integer.toString(differ.getPartiallyCorrectMatches()));
    for (Object object : measures) {
      String measure = (String) object;
      double beta = Double.valueOf(
        measure.substring(1,measure.indexOf('-')));
      if (measure.endsWith("strict")) {
        row.add(f.format(differ.getPrecisionStrict()));
        row.add(f.format(differ.getRecallStrict()));
        row.add(f.format(differ.getFMeasureStrict(beta)));
      } else if (measure.endsWith("strict BDM")) {
        row.add(f.format(getPrecisionStrictBdm()));
        row.add(f.format(getRecallStrictBdm()));
        row.add(f.format(getFMeasureStrictBdm(beta)));
      } else if (measure.endsWith("lenient")) {
        row.add(f.format(differ.getPrecisionLenient()));
        row.add(f.format(differ.getRecallLenient()));
        row.add(f.format(differ.getFMeasureLenient(beta)));
      } else if (measure.endsWith("lenient BDM")) {
        row.add(f.format(getPrecisionLenientBdm()));
        row.add(f.format(getRecallLenientBdm()));
        row.add(f.format(getFMeasureLenientBdm(beta)));
      } else if (measure.endsWith("average")) {
        row.add(f.format(differ.getPrecisionAverage()));
        row.add(f.format(differ.getRecallAverage()));
        row.add(f.format(differ.getFMeasureAverage(beta)));
      } else if (measure.endsWith("average BDM")) {
        row.add(f.format(getPrecisionAverageBdm()));
        row.add(f.format(getRecallAverageBdm()));
        row.add(f.format(getFMeasureAverageBdm(beta)));
      }
    }
    return row;
  }

  /**
   * Be careful, don't modify it.
   * That's not a copy because it would take too much memory.
   * @return differ by type map
   */
  public Map getDifferByTypeMap() {
    return differByTypeMap;
  }

  protected Map bdmByTypeMap = new HashMap();
  protected URL bdmFileUrl;
  protected Map differByTypeMap =
    new HashMap();
  protected Map bdmByConceptsMap;
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy