All downloads are free. The search and download features use the official Maven repository.

gate.creole.coref.NominalCoref Maven / Gradle / Ivy

Go to download

ANNIE is a general purpose information extraction system that provides the building blocks of many other GATE applications.

There is a newer version: 9.1
Show newest version
/*
 *  NominalCoref.java
 *
 *  Copyright (c) 1995-2012, The University of Sheffield. See the file
 *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  $Id: NominalCoref.java 19742 2016-11-16 17:58:23Z markagreenwood $
 */

package gate.creole.coref;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;

import gate.Annotation;
import gate.AnnotationSet;
import gate.Document;
import gate.FeatureMap;
import gate.Resource;
import gate.creole.ExecutionException;
import gate.creole.ResourceInstantiationException;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.creole.metadata.Optional;
import gate.creole.metadata.RunTime;
import gate.util.Err;
import gate.util.OffsetComparator;
import gate.util.SimpleFeatureMapImpl;

@CreoleResource(name="ANNIE Nominal Coreferencer", comment="Nominal Coreference resolution component", helpURL="http://gate.ac.uk/userguide/sec:annie:pronom-coref", icon="nominal-coreferencer")
public class NominalCoref extends AbstractCoreferencer {

  private static final long serialVersionUID = 1497388811557744017L;

  public static final String COREF_DOCUMENT_PARAMETER_NAME = "document";

  public static final String COREF_ANN_SET_PARAMETER_NAME = "annotationSetName";

  // Annotation type names and gazetteer values used by execute() to select
  // the candidate nominals from the input annotation set.
  /** Annotation type requested from the input set to collect people. */
  private static final String PERSON_CATEGORY = "Person";
  /** Annotation type requested from the input set to collect job titles. */
  private static final String JOBTITLE_CATEGORY = "JobTitle";
  /** Annotation type requested from the input set to collect organizations. */
  private static final String ORGANIZATION_CATEGORY = "Organization";
  /** Annotation type queried together with a feature constraint for organization nouns. */
  private static final String LOOKUP_CATEGORY = "Lookup";
  /** Value required for the LOOKUP_MAJOR_TYPE_FEATURE_NAME feature of organization-noun Lookups. */
  private static final String ORGANIZATION_NOUN_CATEGORY = "organization_noun";
  

  // instance state
  // (disabled) shared comparator, kept for reference only:
  //private static AnnotationOffsetComparator ANNOTATION_OFFSET_COMPARATOR;
  /**
   * Name of the annotation set to work on; when null or empty, preprocess()
   * falls back to the document's default annotation set.
   */
  private String annotationSetName;
  /**
   * The annotation set selected by preprocess() for the current run. Despite
   * the name it is the named set whenever annotationSetName is non-empty.
   */
  private AnnotationSet defaultAnnotations;
  /**
   * Result map exposed through getResolvedAnaphora(); emptied at the start
   * of every run by preprocess() and replaced by reInit().
   */
  private HashMap anaphor2antecedent;

    /*  static {
    ANNOTATION_OFFSET_COMPARATOR = new AnnotationOffsetComparator();
    }*/

  /**
   * Creates the coreferencer, passing the type tag "NOMINAL" to the
   * superclass constructor and starting with an empty
   * anaphor-to-antecedent map.
   */
  public NominalCoref() {
    super("NOMINAL");
    anaphor2antecedent = new HashMap<>();
  }

  /**
   * Initialises this resource by delegating to the superclass.
   *
   * @return whatever the superclass initialisation returns
   * @throws ResourceInstantiationException if the superclass initialisation fails
   */
  @Override
  public Resource init() throws ResourceInstantiationException {
    final Resource initialised = super.init();
    return initialised;
  } // init()

  /**
   * Reinitialises the processing resource so that it is in the same state
   * as after {@link #init()}: the anaphor-to-antecedent map is replaced
   * with a new, empty one and {@code init()} is run again.
   *
   * @throws ResourceInstantiationException if {@code init()} fails
   */
  @Override
  public void reInit() throws ResourceInstantiationException {
    // discard results of any earlier run before initialising again
    anaphor2antecedent = new HashMap<>();
    this.init();
  } // reInit()


  /**
   * Sets the document to run on. The value is handed to the superclass
   * unchanged; no null check is performed here (execute() rejects a missing
   * document).
   *
   * @param newDocument the document to process
   */
  @Override
  public void setDocument(Document newDocument) {
    super.setDocument(newDocument);
  }

  /**
   * Sets the name of the annotation set this resource works with. A null or
   * empty name makes preprocess() use the document's default annotation set.
   *
   * @param annotationSetName the annotation set name, may be null
   */
  @Override
  @RunTime
  @Optional
  @CreoleParameter(comment="The annotation set to be used for the generated annotations")
  public void setAnnotationSetName(String annotationSetName) {
    this.annotationSetName = annotationSetName;
  }

  /**
   * Returns the annotation set name last given to
   * {@link #setAnnotationSetName(String)}.
   *
   * @return the configured annotation set name, possibly null
   */
  @Override
  public String getAnnotationSetName() {
    return this.annotationSetName;
  }

  /**
   * This method runs the coreferencer. It assumes that all the needed parameters
   * are set. If they are not, an exception will be fired.
   *
   * The process goes like this:
   * - Create a sorted list of Person and JobTitle annotations.
   * - Loop through the annotations
   *    If it is a Person, we add it to the top of a stack.
   *    If it is a job title, we subject it to a series of tests. If it 
   *      passes, we associate it with the Person annotation at the top
   *      of the stack
   */
  @Override
  public void execute() throws ExecutionException{

    Annotation[] nominalArray;

    //0. preconditions
    if (null == this.document) {
      throw new ExecutionException("[coreference] Document is not set!");
    }

    //1. preprocess
    preprocess();

    // Out.println("Total annotations: " + defaultAnnotations.size());

    // Get a sorted array of Tokens.
    // The tests for job titles often require getting previous and subsequent
    // tokens, so to save work, we create a single, sorted list of 
    // tokens.
    Annotation[] tokens = defaultAnnotations.get(TOKEN_ANNOTATION_TYPE).
        toArray(new Annotation[0]);
    java.util.Arrays.sort(tokens, new OffsetComparator());

    // The current token is the token at the start of the current annotation.
    int currentToken = 0;

    // get Person entities
    //FeatureMap personConstraint = new SimpleFeatureMapImpl();
    //personConstraint.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME,
    //                          PERSON_CATEGORY);
    Set personConstraint = new HashSet();
    personConstraint.add(PERSON_CATEGORY);
    AnnotationSet people =
      this.defaultAnnotations.get(personConstraint);

    // get all JobTitle entities
    //FeatureMap constraintJobTitle = new SimpleFeatureMapImpl();
    //constraintJobTitle.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME, JOBTITLE_CATEGORY);
    Set jobTitleConstraint = new HashSet();
    jobTitleConstraint.add(JOBTITLE_CATEGORY);
    
    AnnotationSet jobTitles = 
      this.defaultAnnotations.get(jobTitleConstraint);

    FeatureMap orgNounConstraint = new SimpleFeatureMapImpl();
    orgNounConstraint.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME,
                          ORGANIZATION_NOUN_CATEGORY);
    AnnotationSet orgNouns =
      this.defaultAnnotations.get(LOOKUP_CATEGORY, orgNounConstraint);

    Set orgConstraint = new HashSet();
    orgConstraint.add(ORGANIZATION_CATEGORY);

    AnnotationSet organizations =
      this.defaultAnnotations.get(orgConstraint);

    // combine them into a list of nominals
    Set nominals = new HashSet();
    if (people != null) {
      nominals.addAll(people);
    }
    if (jobTitles != null) {
      nominals.addAll(jobTitles);
    }
    if (orgNouns != null) {
      nominals.addAll(orgNouns);
    }
    if (organizations != null) {
      nominals.addAll(organizations);
    }

    //  Out.println("total nominals: " + nominals.size());

    // sort them according to offset
    nominalArray = nominals.toArray(new Annotation[0]);
    java.util.Arrays.sort(nominalArray, new OffsetComparator());
    
    ArrayList previousPeople = new ArrayList();
    ArrayList previousOrgs = new ArrayList();
    
        
    // process all nominals
    for (int i=0; i iter = annotations.iterator();
    while (iter.hasNext()) {
      Annotation current = iter.next();
      if (a.overlaps(current)) {
        return true;
      }
    }
      
    return false;
  }

  /**
   * Use this method to keep the current token pointer at the right point
   * in the token list.
   *
   * If the target annotation starts after the current token, the pointer is
   * moved forward to the first token whose start offset is not smaller than
   * the target's start offset. If it starts before the current token, the
   * pointer is moved backward to the first token whose start offset is not
   * larger. The scan never leaves the array: when no such token exists the
   * pointer stops at the last (respectively first) token instead of running
   * past the end of the array.
   *
   * @param target the annotation whose start offset the pointer should reach
   * @param currentPosition index in {@code tokens} to start scanning from
   * @param tokens token annotations; presumably ordered by start offset
   *        (execute() sorts its token array with an OffsetComparator) —
   *        the scan is only meaningful for an ordered array
   * @return the adjusted index, or {@code currentPosition} unchanged when
   *         {@code tokens} is empty
   */
  private int advanceTokenPosition(Annotation target, int currentPosition,
                                   Object[] tokens) {
    // Nothing to scan; avoid indexing into an empty array.
    if (tokens.length == 0) {
      return currentPosition;
    }

    final int lastIndex = tokens.length - 1;
    long targetOffset = target.getStartNode().getOffset().longValue();
    long currentOffset = ((Annotation) tokens[currentPosition])
      .getStartNode().getOffset().longValue();

    if (targetOffset > currentOffset) {
      // Move forward, but stop on the last token rather than beyond it.
      while (targetOffset > currentOffset && currentPosition < lastIndex) {
        currentPosition++;
        currentOffset = ((Annotation) tokens[currentPosition])
          .getStartNode().getOffset().longValue();
      }
    }
    else if (targetOffset < currentOffset) {
      // Move backward, but stop on the first token rather than before it.
      while (targetOffset < currentOffset && currentPosition > 0) {
        currentPosition--;
        currentOffset = ((Annotation) tokens[currentPosition])
          .getStartNode().getOffset().longValue();
      }
    }

    return currentPosition;
  }

  /**
   * Return the number of tokens between the end of annotation 1 and the
   * beginning of annotation 2. Will return 0 if they are not in order.
   *
   * Starting at {@code currentPosition}, tokens are visited until one starts
   * at or after the start of {@code second}, or until the token array is
   * exhausted; every visited token that starts at or after the end of
   * {@code first} is counted. The scan is bounded by the array length, so a
   * second annotation that starts beyond the last token no longer causes an
   * out-of-bounds access.
   *
   * @param first the earlier annotation; its end offset opens the gap
   * @param second the later annotation; its start offset closes the gap
   * @param currentPosition index in {@code tokens} to start scanning from
   * @param tokens token annotations; presumably ordered by start offset —
   *        the early exit is only meaningful for an ordered array
   * @return the number of counted tokens, 0 if none
   */
  private int countInterveningTokens(Annotation first, Annotation second,
                                     int currentPosition, Object[] tokens) {
    long startOffset = first.getEndNode().getOffset().longValue();
    long endOffset = second.getStartNode().getOffset().longValue();

    int interveningTokens = 0;
    for (int position = currentPosition; position < tokens.length; position++) {
      long tokenOffset = ((Annotation) tokens[position])
        .getStartNode().getOffset().longValue();
      if (tokenOffset >= endOffset) {
        // reached the second annotation: the gap is over
        break;
      }
      if (tokenOffset >= startOffset) {
        interveningTokens++;
      }
    }
    return interveningTokens;
  }

  /**
   * Get the next token after an annotation: scanning forward from
   * {@code currentPosition}, returns the first token whose start offset is
   * not smaller than the annotation's end offset.
   *
   * NOTE(review): the scan has no upper bound, so if no such token exists
   * from {@code currentPosition} onwards the array access throws an
   * ArrayIndexOutOfBoundsException.
   *
   * @param current the annotation to look behind
   * @param currentPosition index in {@code tokens} to start scanning from
   * @param tokens token annotations; presumably ordered by start offset
   * @return the first token found that starts at or after the end of
   *         {@code current}
   */
  private Annotation getFollowingToken(Annotation current, int currentPosition,
                                       Object[] tokens) {
    final long boundary = current.getEndNode().getOffset().longValue();

    int index = currentPosition;
    Annotation candidate = (Annotation) tokens[index];
    while (candidate.getStartNode().getOffset().longValue() < boundary) {
      index++;
      candidate = (Annotation) tokens[index];
    }
    return candidate;
  }
	
  /** Get the text of an annotation */
  @SuppressWarnings("unused")
  private String stringValue(Annotation ann) {
    Object[] tokens = getSortedTokens(ann);
	
    StringBuffer output = new StringBuffer();
    for (int i=0;i getResolvedAnaphora() {
    return this.anaphor2antecedent;
  }

  /**
   * Prepares a run: empties the anaphor-to-antecedent map and selects the
   * annotation set to work on (the document's default set when no set name
   * is configured, the named set otherwise). If the selected set is null or
   * empty a warning is written to the error stream; no exception is thrown
   * in that case.
   *
   * @throws ExecutionException declared for callers; not thrown by the
   *         statements in this method
   */
  private void preprocess() throws ExecutionException {

    // forget the results of any previous run
    this.anaphor2antecedent.clear();

    // pick the input annotation set
    final boolean noSetNameGiven =
        this.annotationSetName == null || this.annotationSetName.equals("");
    this.defaultAnnotations = noSetNameGiven
        ? this.document.getAnnotations()
        : this.document.getAnnotations(annotationSetName);

    // nothing to work with: warn only
    if (this.defaultAnnotations == null || this.defaultAnnotations.isEmpty()) {
      Err.prln("Coref Warning: No annotations found for processing!");
    }

    /*
    // initialise the quoted text fragments
    AnnotationSet sentQuotes = this.defaultAnnotations.get(QUOTED_TEXT_TYPE);

    //if none then return
    if (null == sentQuotes) {
      this.quotedText = new Quote[0];
    }
    else {
      this.quotedText = new Quote[sentQuotes.size()];

      Object[] quotesArray = sentQuotes.toArray();
      java.util.Arrays.sort(quotesArray,ANNOTATION_OFFSET_COMPARATOR);

      for (int i =0; i < quotesArray.length; i++) {
        this.quotedText[i] = new Quote((Annotation)quotesArray[i],i);
      }
    }
    */
  }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy