gate.util.ClassificationMeasures Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of gate-core Show documentation
GATE - general architecture for text engineering - is open source software capable of solving almost any text processing problem. This artifact enables you to embed the core GATE Embedded with its essential dependencies. You will be able to use the GATE Embedded API and load and store GATE XML documents. This artifact is the perfect dependency for CREOLE plugins or for applications that need to customize the GATE dependencies due to conflicts with their own dependencies or for a lower footprint.
The newest version!
/**
 *  Copyright (c) 1995-2012, The University of Sheffield. See the file
 *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  $Id: ContingencyTable.java 12125 2010-01-04 14:44:43Z ggorrell $
 */

package gate.util;

import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;

import gate.Annotation;
import gate.AnnotationSet;


/**
 * Given two annotation sets, a type and a feature,
 * compares the feature values. It finds matching annotations and treats
 * the feature values as classifications. Its purpose is to calculate the
 * extent of agreement between the feature values in the two annotation
 * sets. It computes observed agreement and Kappa measures.
 */
public class ClassificationMeasures {
  
  /** Array of dimensions categories * categories. */
  private float[][] confusionMatrix;
  
  /** Cohen's kappa. */
  private float kappaCohen = 0;
  
  /** Scott's pi or Siegel & Castellan's kappa */
  private float kappaPi = 0;
  
  private boolean isCalculatedKappas = false;
  
  /** List of feature values that are the labels of the confusion matrix */
  private TreeSet featureValues;

  public ClassificationMeasures() {
    // empty constructor
  }

  /**
   * Portion of the instances on which the annotators agree.
   * @return a number between 0 and 1. 1 means perfect agreements.
   */
  public float getObservedAgreement()
  {
    float agreed = getAgreedTrials();
    float total = getTotalTrials();
    if(total>0) {
      return agreed/total;
    } else {
      return 0;
    }
  }

  /**
   * Kappa is defined as the observed agreements minus the agreement
   * expected by chance.
   * The Cohen’s Kappa is based on the individual distribution of each
   * annotator.
   * @return a number between -1 and 1. 1 means perfect agreements.
   */
  public float getKappaCohen()
  {
    if(!isCalculatedKappas){
      computeKappaPairwise();
      isCalculatedKappas = true;
    }
    return kappaCohen;
  }

  /**
   * Kappa is defined as the observed agreements minus the agreement
   * expected by chance.
   * The Siegel & Castellan’s Kappa is based on the assumption that all the
   * annotators have the same distribution.
   * @return a number between -1 and 1. 1 means perfect agreements.
   */
  public float getKappaPi()
  {
    if(!isCalculatedKappas){
      computeKappaPairwise();
      isCalculatedKappas = true;
    }
    return kappaPi;
  }
  
  /**
   * To understand exactly which types are being confused with which other
   * types you will need to view this array in conjunction with featureValues,
   * which gives the class labels (annotation types) in the correct order.
   * @return confusion matrix describing how annotations in one
   * set are classified in the other and vice versa
   */
  public float[][] getConfusionMatrix(){
      return confusionMatrix.clone();
  }
  
  /**
   * This is necessary to make sense of the confusion matrix.
   * @return list of annotation types (class labels) in the
   * order in which they appear in the confusion matrix
   */
  public SortedSet getFeatureValues(){
    return Collections.unmodifiableSortedSet(featureValues);
  }
  
  /**
   * Create a confusion matrix in which annotations of identical span
   * bearing the specified feature name are compared in terms of feature value.
   * Compiles list of classes (feature values) on the fly.
   *
   * @param aS1 annotation set to compare to the second
   * @param aS2 annotation set to compare to the first
   * @param type annotation type containing the features to compare
   * @param feature feature name whose values will be compared
   * @param verbose message error output when ignoring annotations
   */
  public void calculateConfusionMatrix(AnnotationSet aS1, AnnotationSet aS2,
    String type, String feature, boolean verbose)
  {   
    // We'll accumulate a list of the feature values (a.k.a. class labels)
    featureValues = new TreeSet();

    // Keep a track of any feature names we've logged as being merged
    Set mergedFeatures = new HashSet();

    // Make a hash of hashes for the counts.
    HashMap> countMap =
      new HashMap>();
    
    // Get all the annotations of the correct type containing
    // the correct feature
    HashSet featureSet = new HashSet();
    featureSet.add(feature);
    AnnotationSet relevantAnns1 = aS1.get(type, featureSet);
    AnnotationSet relevantAnns2 = aS2.get(type, featureSet);
    
    // For each annotation in aS1, find the match in aS2
    for (Annotation relevantAnn1 : relevantAnns1) {

      // First we need to check that this annotation is not identical in span
      // to anything else in the same set. Duplicates should be excluded.
      List dupeAnnotations = new ArrayList();
      for (Annotation aRelevantAnns1 : relevantAnns1) {
        if (aRelevantAnns1.equals(relevantAnn1)) { continue; }
        if (aRelevantAnns1.coextensive(relevantAnn1)) {
          dupeAnnotations.add(aRelevantAnns1);
          dupeAnnotations.add(relevantAnn1);
        }
      }

      if (dupeAnnotations.size() > 1) {
        if (verbose) {
          Out.prln("ClassificationMeasures: " +
            "Same span annotations in set 1 detected! Ignoring.");
          Out.prln(Arrays.toString(dupeAnnotations.toArray()));
        }
      } else {
        // Find the match in as2
        List  coextensiveAnnotations = new ArrayList();
        for (Annotation relevantAnn2 : relevantAnns2) {
          if (relevantAnn2.coextensive(relevantAnn1)) {
            coextensiveAnnotations.add(relevantAnn2);
          }
        }

        if (coextensiveAnnotations.size() == 0) {
          if (verbose) {
            Out.prln("ClassificationMeasures: Annotation in set 1 " +
              "with no counterpart in set 2 detected! Ignoring.");
            Out.prln(relevantAnn1.toString());
          }
        } else if (coextensiveAnnotations.size() == 1) {

          // get the feature values from the two annotations
          Object featObj1 = relevantAnn1.getFeatures().get(feature);
          Object featObj2 = coextensiveAnnotations.get(0).getFeatures().get(feature);

          // convert the values to Strings (which will merge values of different
          // types that have the same string representation)
          String featVal1 = String.valueOf(featObj1);
          String featVal2 = String.valueOf(featObj2);

          // if we are merging feature values of differing types then print a
          // warning for that feature, but only the first time
          if (!featObj1.getClass().equals(featObj2.getClass())) {
            if (!mergedFeatures.contains(feature)) {
              Err.println("Values for feature '" + feature
                  + "' are of differing types, results may be inconsistent");
              mergedFeatures.add(feature);
            }
          }

          // Make sure both are present in our feature value list
          featureValues.add(featVal1);
          featureValues.add(featVal2);

          // Update the matrix hash of hashes
          // Get the right hashmap for the as1 feature value
          HashMap subHash = countMap.get(featVal1);
          if (subHash == null) {
            // This is a new as1 feature value, since it has no subhash yet
            HashMap subHashForNewAS1FeatVal =
              new HashMap();

            // Since it is a new as1 feature value, there can be no existing
            // as2 feature values paired with it. So we make a new one for this
            // as2 feature value
            subHashForNewAS1FeatVal.put(featVal2, (float) 1);

            countMap.put(featVal1, subHashForNewAS1FeatVal);
          } else {
            // Increment the count
            Float count = subHash.get(featVal2);
            if (count == null) {
              subHash.put(featVal2, (float) 1);
            } else {
              subHash.put(featVal2, (float) count.intValue() + 1);
            }

          }
        } else if (coextensiveAnnotations.size() > 1) {
          if (verbose) {
            Out.prln("ClassificationMeasures: " +
              "Same span annotations in set 2 detected! Ignoring.");
            Out.prln(Arrays.toString(coextensiveAnnotations.toArray()));
          }
        }
      }
    }
    
    // Now we have this hash of hashes, but the calculation implementations
    // require an array of floats. So for now we can just translate it.
    confusionMatrix = convert2DHashTo2DFloatArray(countMap, featureValues);
  }
  
  /**
   * Given a list of ClassificationMeasures, this will combine to make
   * a megatable. Then you can use kappa getters to get micro average
   * figures for the entire set.
   * @param tables tables to combine
   */
  public ClassificationMeasures(Collection tables) {
    /* A hash of hashes for the actual values.
     * This will later be converted to a 2D float array for
     * compatibility with the existing code. */
    HashMap> countMap =
      new HashMap>();
    
    /* Make a new feature values set which is a superset of all the others */
    TreeSet newFeatureValues = new TreeSet();
    
    /* Now we are going to add each new contingency table in turn */

    for (ClassificationMeasures table : tables) {
      int it1index = 0;
      for (String featureValue1 : table.featureValues) {
        newFeatureValues.add(featureValue1);
        int it2index = 0;
        for (String featureValue2 : table.featureValues) {

          /* So we have the labels of the count we want to add */
          /* What is the value we want to add? */
          Float valtoadd = table.confusionMatrix[it1index][it2index];

          HashMap subHash = countMap.get(featureValue1);
          if (subHash == null) {
            /* This is a new as1 feature value, since it has no subhash yet */
            HashMap subHashForNewAS1FeatVal =
              new HashMap();

            /* Since it is a new as1 feature value, there can be no existing
             *  as2 feature values paired with it. So we make a new one for this
             *  as2 feature value */
            subHashForNewAS1FeatVal.put(featureValue2, valtoadd);

            countMap.put(featureValue1, subHashForNewAS1FeatVal);
          } else {
            /* Increment the count */
            Float count = subHash.get(featureValue2);
            if (count == null) {
              subHash.put(featureValue2, valtoadd);
            } else {
              subHash.put(featureValue2, count.intValue() + valtoadd);
            }
          }
          it2index++;
        }
        it1index++;
      }
    }
    
    confusionMatrix = convert2DHashTo2DFloatArray(countMap, newFeatureValues);
    featureValues = newFeatureValues;
    isCalculatedKappas = false;
  }
  
  /** Compute Cohen's and Pi kappas for two annotators.
   */
  protected void computeKappaPairwise()
  {
    // Compute the agreement
    float observedAgreement = getObservedAgreement();
    int numCats = featureValues.size();
    // compute the agreement by chance
    // Get the marginal sum for each annotator
    float[] marginalArrayC = new float[numCats];
    float[] marginalArrayR = new float[numCats];
    float totalSum = 0;
    for(int i = 0; i < numCats; ++i) {
      float sum = 0;
      for(int j = 0; j < numCats; ++j)
        sum += confusionMatrix[i][j];
      marginalArrayC[i] = sum;
      totalSum += sum;
      sum = 0;
      for(int j = 0; j < numCats; ++j)
        sum += confusionMatrix[j][i];
      marginalArrayR[i] = sum;
    }
    
    // Compute Cohen's p(E)
    float pE = 0;
    if(totalSum > 0) {
      float doubleSum = totalSum * totalSum;
      for(int i = 0; i < numCats; ++i)
        pE += (marginalArrayC[i] * marginalArrayR[i]) / doubleSum;
    }
    
    // Compute Cohen's Kappa
    if (pE == 1.0F) { // prevent division by zero
      kappaCohen = 1.0F;
    }
    else if (totalSum > 0) 
      kappaCohen = (observedAgreement - pE) / (1.0F - pE);
    else kappaCohen = 0;
    
    // Compute S&C's chance agreement
    pE = 0;
    if(totalSum > 0) {
      float doubleSum = 2 * totalSum;
      for(int i = 0; i < numCats; ++i) {
        float p = (marginalArrayC[i] + marginalArrayR[i]) / doubleSum;
        pE += p * p;
      }
    }
    
    if (pE == 1.0F) { // prevent division by zero
      kappaPi = 1.0F;
    }
    else if (totalSum > 0)
      kappaPi = (observedAgreement - pE) / (1.0F - pE);
    else kappaPi = 0;
    
    // Compute the specific agreement for each label using marginal sums
    float[][] sAgreements = new float[numCats][2];
    for(int i = 0; i < numCats; ++i) {
      if(marginalArrayC[i] + marginalArrayR[i]>0) 
        sAgreements[i][0] = (2 * confusionMatrix[i][i])
          / (marginalArrayC[i] + marginalArrayR[i]);
      else sAgreements[i][0] = 0.0f;
      if(2 * totalSum - marginalArrayC[i] - marginalArrayR[i]>0)
        sAgreements[i][1] = (2 * (totalSum - marginalArrayC[i]
          - marginalArrayR[i] + confusionMatrix[i][i]))
          / (2 * totalSum - marginalArrayC[i] - marginalArrayR[i]);
      else sAgreements[i][1] = 0.0f;
    }
  }
  
  /** Gets the number of annotations for which the two annotation sets
   * are in agreement with regards to the annotation type.
   * @return Number of agreed trials
   */
  public float getAgreedTrials(){
    float sumAgreed = 0;
    for(int i = 0; i < featureValues.size(); ++i) {
      sumAgreed += confusionMatrix[i][i];
    }
    return sumAgreed;
  }
  
  /** Gets the total number of annotations in the two sets.
   * Note that only matched annotations (identical span) are
   * considered.
   * @return Number of trials
   */
  public float getTotalTrials(){
    float sumTotal = 0;
    for(int i = 0; i < featureValues.size(); ++i) {
      for(int j = 0; j < featureValues.size(); ++j) {
        sumTotal += confusionMatrix[i][j];
      }
    }
    return sumTotal;
  }
  
  /**
   * @param title matrix title
   * @return confusion matrix as a list of list of String
   */
  public List> getConfusionMatrix(String title) {
    List> matrix = new ArrayList>();
    List row = new ArrayList();
    row.add(" ");
    matrix.add(row); // spacer
    row = new ArrayList();
    row.add(title);
    matrix.add(row); // title
    SortedSet features = new TreeSet(getFeatureValues());
    row = new ArrayList();
    row.add("A \\ B");
    row.addAll(features);
    matrix.add(row); // heading horizontal
    for (float[] confusionValues : getConfusionMatrix()) {
      row = new ArrayList();
      row.add(features.first()); // heading vertical
      features.remove(features.first());
      for (float confusionValue : confusionValues) {
        row.add(String.valueOf((int) confusionValue));
      }
      matrix.add(row); // confusion values
    }
    return matrix;
  }

  public List getMeasuresRow(Object[] measures, String documentName) {
    NumberFormat f = NumberFormat.getInstance(Locale.ENGLISH);
    f.setMaximumFractionDigits(2);
    f.setMinimumFractionDigits(2);
    List row = new ArrayList();
    row.add(documentName);
    row.add(String.valueOf((int) getAgreedTrials()));
    row.add(String.valueOf((int) getTotalTrials()));
    for (Object object : measures) {
      String measure = (String) object;
      if (measure.equals("Observed agreement")) {
        row.add(f.format(getObservedAgreement()));
      }
      if (measure.equals("Cohen's Kappa")) {
        float result = getKappaCohen();
        row.add(Float.isNaN(result) ? "" : f.format(result));
      }
      if (measure.equals("Pi's Kappa")) {
        float result = getKappaPi();
        row.add(Float.isNaN(result) ? "" : f.format(result));
      }
    }
    return row;
  }

  /**
   * Convert between two formats of confusion matrix.
   * A hashmap of hashmaps is easier to populate but an array is better for
   * matrix computation.
   * @param countMap count for each label as in confusion matrix
   * @param featureValues sorted set of labels that will define the dimensions
   * @return converted confusion matrix as an 2D array
   */
  private float[][] convert2DHashTo2DFloatArray(
    HashMap> countMap,
    TreeSet featureValues)
  {
    int dimensionOfContingencyTable = featureValues.size();
    float[][] matrix =
      new float[dimensionOfContingencyTable][dimensionOfContingencyTable];
    int i=0;
    int j=0;
    for (String featureValue1 : featureValues) {
      HashMap hashForThisAS1FeatVal =
        countMap.get(featureValue1);
      j = 0;
      for (String featureValue2 : featureValues) {
        Float count = null;
        if (hashForThisAS1FeatVal != null) {
          count = hashForThisAS1FeatVal.get(featureValue2);
        }
        if (count != null) {
          matrix[i][j] = count;
        } else {
          matrix[i][j] = 0;
        }
        j++;
      }
      i++;
    }    
    return matrix;
  }
  
}