All downloads are free. Search and download functionality uses the official Maven repository.

gate.creole.orthomatcher.OrthoMatcher Maven / Gradle / Ivy

Go to download

ANNIE is a general purpose information extraction system that provides the building blocks of many other GATE applications.

There is a newer version: 9.1
Show newest version
/*
 *  OrthoMatcher.java
 *
 *  Copyright (c) 1995-2012, The University of Sheffield. See the file
 *  COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  Kalina Bontcheva, 24/August/2001
 *
 *  Major update by Andrew Borthwick of Spock Networks, 11/13/2007 - 8/3/2008:
 *    1.  matchWithPrevious now searches for matching annotations in order, starting from current and working backwards
 *    until it finds a match.  This compares with the previous behavior, which searched randomly among previous annotations
 *    for a match (because it used an iterator across an AnnotationSet, whereas now we iterate across an ArrayList)
 *    2.  We no longer require that identical strings always refer to the same entity.  We can correctly match
 *    the sequence "David Jones ... David ... David Smith ... David" as referring to two people, tying the first
 *    David to "David Jones" and the second David to "David Smith".  Ditto with David Jones .. Mr. Jones ..
 *    Richard Jones .. Mr. Jones
 *    3.  We now allow for nickname matches for Persons (David = Dave) via the "fuzzyMatch" method which is referenced
 *    in some of the matching rules.
 *    4.  Optional parameter highPrecisionOrgs only allows high precision matches for organizations and
 *    turns off the riskier rules.  Under this option, need to match on something like IBM = IBM Corp.
 *    5.  Various fixes to a number of rules
 *
 *  $Id: OrthoMatcher.java 8929 2007-07-12 16:49:55Z ian_roberts $
 */

package gate.creole.orthomatcher;

import java.io.BufferedReader;
import java.io.IOException;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.io.IOUtils;
import org.apache.log4j.Logger;

import gate.Annotation;
import gate.AnnotationSet;
import gate.Resource;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.ResourceInstantiationException;
import gate.creole.ResourceReference;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.creole.metadata.Optional;
import gate.creole.metadata.RunTime;
import gate.util.BomStrippingInputStreamReader;
import gate.util.GateRuntimeException;
import gate.util.InvalidOffsetException;
import gate.util.OffsetComparator;
import gate.util.Out;
@CreoleResource(name="ANNIE OrthoMatcher", comment="ANNIE orthographical coreference component.", helpURL="http://gate.ac.uk/userguide/sec:annie:orthomatcher", icon="ortho-matcher")
public class OrthoMatcher extends AbstractLanguageAnalyser {

  /** Serialization version identifier for this resource class. */
  private static final long serialVersionUID = -6258229350677707465L;

  /** Class-wide log4j logger. */
  protected static final Logger log = Logger.getLogger(OrthoMatcher.class);

  /**
   * Compile-time switch that gates the extra per-annotation debug logging
   * performed by the matching methods of this class.
   */
  public static final boolean DEBUG = false;

  // Parameter-name constants. Each value is the textual name of a parameter
  // (presumably the matching CREOLE parameter of this resource -- confirm
  // against the setters, which are outside this part of the file).

  public static final String
  OM_DOCUMENT_PARAMETER_NAME = "document";

  public static final String
  OM_ANN_SET_PARAMETER_NAME = "annotationSetName";

  public static final String
  OM_CASE_SENSITIVE_PARAMETER_NAME = "caseSensitive";

  public static final String
  OM_ANN_TYPES_PARAMETER_NAME = "annotationTypes";

  public static final String
  OM_ORG_TYPE_PARAMETER_NAME = "organizationType";

  public static final String
  OM_PERSON_TYPE_PARAMETER_NAME = "personType";

  public static final String
  OM_EXT_LISTS_PARAMETER_NAME = "extLists";

  // Names of the lookup lists. The values mirror the names of the lookup-table
  // fields declared below (cdg, alias, def_art, prepos, connector, spur_match);
  // presumably they are compared with the list names read from the definition
  // file -- verify against createAnnotList.
  protected static final String CDGLISTNAME = "cdg";
  protected static final String ALIASLISTNAME = "alias";
  protected static final String ARTLISTNAME = "def_art";
  protected static final String PREPLISTNAME = "prepos";
  protected static final String CONNECTORLISTNAME = "connector";
  protected static final String SPURLISTNAME = "spur_match";

  // String values shared with the matching code outside this part of the file.
  protected static final String PUNCTUATION_VALUE = "punctuation";
  protected static final String THE_VALUE = "The";


  /**
   * Name of the annotation set to process. When {@code null} or empty,
   * {@link #execute()} uses the document's default annotation set.
   */
  protected String annotationSetName;

  /**
   * Annotation types whose instances are matched against each other.
   * The constructor seeds this list with the organization type, the person
   * type, "Location" and "Date".
   */
  protected List<String> annotationTypes = new ArrayList<String>(10);

  /** Annotation type treated as an organization. */
  protected String organizationType = ORGANIZATION_ANNOTATION_TYPE;

  /** Annotation type treated as a person. */
  protected String personType = PERSON_ANNOTATION_TYPE;

  /** Annotation type of the not-yet-classified ("unknown") annotations. */
  protected String unknownType = "Unknown";

  /**
   * Internal or external lists. When {@code false}, {@link #execute()}
   * rebuilds the {@code cdg} table from the document on every run.
   */
  protected boolean extLists = true;

  /** Use only high precision rules for Organizations. */
  protected Boolean highPrecisionOrgs = false;

  /** Whether annotations of the unknown type are matched as well. */
  protected boolean matchingUnknowns = true;

  /** This is an internal variable to indicate whether
   *  we matched using a rule that requires that
   *  the newly matched annotation matches all the others.
   *  This is needed, because organizations can share
   *  first/last tokens like News and be different.
   */
  protected boolean allMatchingNeeded = false;

  /** Orthomatching is not case-sensitive by default. */
  protected boolean caseSensitive = false;

  //protected FeatureMap queryFM = Factory.newFeatureMap();

  // Name lookup tables (used for namematch).
  // They are given a larger initial capacity because rehashing is expensive.
  // NOTE(review): key/value types are not visible here; the tables are filled
  // outside this part of the file (cdg is also reassigned in execute()).
  protected HashMap alias = new HashMap(100);
  protected Set cdg = new HashSet();
  protected HashMap spur_match = new HashMap(100);
  protected HashMap def_art = new HashMap(20);
  protected HashMap connector = new HashMap(20);
  protected HashMap prepos = new HashMap(30);

  /** Annotation set being processed by the current run; null between runs. */
  protected AnnotationSet nameAllAnnots = null;

  /** Maps the id of each already processed annotation to its (normalized) string. */
  protected HashMap<Integer, String> processedAnnots = new HashMap<Integer, String>(150);

  /**
   * Maps the id of an unknown annotation that has been matched to the
   * annotation type it should be re-created with.
   */
  protected HashMap<Integer, String> annots2Remove = new HashMap<Integer, String>(75);

  /**
   * Coreference chains (lists of annotation ids) found in the current run;
   * stored on the document under the coreference feature by execute().
   */
  protected List<List<Integer>> matchesDocFeature = new ArrayList<List<Integer>>();

  /** Maps annotation ids to the offset-sorted list of tokens they contain. */
  protected HashMap<Integer, List<Annotation>> tokensMap =
    new HashMap<Integer, List<Annotation>>(150);

  /**
   * Returns the live map from annotation id to that annotation's tokens.
   *
   * @return the internal token map (not a copy); it is cleared at the end of
   *         every execute() call
   */
  public Map<Integer, List<Annotation>> getTokensMap() {
    return tokensMap;
  }

  /** Maps annotation ids to their token lists as used after normalization. */
  protected Map<Integer, List<Annotation>> normalizedTokensMap =
    new HashMap<Integer, List<Annotation>>(150);

  // Scratch state describing the pair of annotations currently being compared;
  // set by matchAnnotations before the rules are applied.
  protected Annotation shortAnnot;
  protected Annotation longAnnot;

  protected List<Annotation> tokensLongAnnot;
  protected List<Annotation> tokensShortAnnot;

  protected List<Annotation> normalizedTokensLongAnnot, normalizedTokensShortAnnot;

  /**
   * URL to the file containing the definition for this orthomatcher.
   * init() fails with a ResourceInstantiationException when it is null.
   */
  private ResourceReference definitionFileURL;

  /**
   * Likelihood threshold handed to the orthography helper created in init().
   * It must be non-null when the definition file declares a "nickname" list.
   */
  private Double minimumNicknameLikelihood;

  /** The encoding used for the definition file and associated lists.*/
  private String encoding;

  /**
   * Matching rules keyed by rule number; populated by initRules() and then
   * offered to modifyRules(Map) for customisation.
   */
  private Map rules=new HashMap();

  /** to be initialized in init() */
  private AnnotationOrthography orthoAnnotation;

  /**
   * Creates a matcher whose default annotation types are, in this order, the
   * organization type, the person type, "Location" and "Date".
   */
  public OrthoMatcher () {
    Collections.addAll(annotationTypes,
            organizationType, personType, "Location", "Date");
  }

  /**
   * Initialise the rules. The orthomatcher loads its built-in rules,
   * registering MatchRule0 .. MatchRule17 under the keys 0 .. 17.
   * Each rule receives a reference to this matcher.
   */
  private void initRules(){
    // NOTE: this must be executed after spur_match is loaded
    // (init() reads the definition lists before calling this method).
    rules.put(0,  new MatchRule0(this));
    rules.put(1,  new MatchRule1(this));
    rules.put(2,  new MatchRule2(this));
    rules.put(3,  new MatchRule3(this));
    rules.put(4,  new MatchRule4(this));
    rules.put(5,  new MatchRule5(this));
    rules.put(6,  new MatchRule6(this));
    rules.put(7,  new MatchRule7(this));
    rules.put(8,  new MatchRule8(this));
    rules.put(9,  new MatchRule9(this));
    rules.put(10, new MatchRule10(this));
    rules.put(11, new MatchRule11(this));
    rules.put(12, new MatchRule12(this));
    rules.put(13, new MatchRule13(this));
    rules.put(14, new MatchRule14(this));
    rules.put(15, new MatchRule15(this));
    rules.put(16, new MatchRule16(this));
    rules.put(17, new MatchRule17(this));

  }

  /**
   * Extension hook: override this method to add, replace or remove rules.
   * init() calls it once, right after the built-in rules have been
   * registered. The default implementation does nothing.
   *
   * @param rules the rule table built by initRules(), keyed by rule number
   */
  protected void modifyRules(Map rules) {

  }

  /**
   * Initialises this resource and returns it.
   * <p>
   * Reads the definition file line by line. Every line of the form
   * {@code file:list} either names the nickname file (when the list part is
   * "nickname") or is passed to createAnnotList. Lines without a colon are
   * ignored. Afterwards the orthography helper is created and the matching
   * rules are set up.
   *
   * @return this resource
   * @throws ResourceInstantiationException if no definition file is
   *         configured, if a nickname list is declared without
   *         minimumNicknameLikelihood being set, or if reading fails
   */
  @SuppressWarnings("resource")
  @Override
  public Resource init() throws ResourceInstantiationException {
    if (definitionFileURL == null) {
      throw new ResourceInstantiationException(
      "No URL provided for the definition file!");
    }

    String nicknameFile = null;
    BufferedReader definitionReader = null;
    try {
      definitionReader = new BomStrippingInputStreamReader(
          definitionFileURL.openStream(), encoding);

      for (String line = definitionReader.readLine(); line != null;
           line = definitionReader.readLine()) {
        int colonAt = line.indexOf(":");
        if (colonAt == -1) {
          // not a "file:list" entry
          continue;
        }
        String fileName = line.substring(0, colonAt);
        String listName = line.substring(colonAt + 1);

        if (!"nickname".equals(listName)) {
          createAnnotList(fileName, listName);
          continue;
        }
        // the nickname list is only usable together with a likelihood threshold
        if (minimumNicknameLikelihood == null) {
          throw new ResourceInstantiationException(
              "No value for the required parameter " +
              "minimumNicknameLikelihood!");
        }
        nicknameFile = fileName;
      }
      definitionReader.close();

      // the nickname file is resolved relative to the definition file
      URL nicknameURL = (nicknameFile == null)
              ? null
              : new URL(definitionFileURL.toURL(), nicknameFile);
      this.orthoAnnotation = new BasicAnnotationOrthography(
              personType, extLists, unknownType, nicknameURL,
              minimumNicknameLikelihood, encoding);
      initRules();
      modifyRules(rules);
    } catch (IOException ioe) {
      throw new ResourceInstantiationException(ioe);
    } finally {
      IOUtils.closeQuietly(definitionReader);
    }

    return this;
  } // init()


  /**  Run the resource. It doesn't make sense not to override
   *  this in subclasses so the default implementation signals an
   *  exception.
   */
  @Override
  public void execute() throws ExecutionException{
    try{
      //check the input
      if(document == null) {
        throw new ExecutionException(
                "No document for namematch!"
        );
      }
      fireStatusChanged("OrthoMatcher processing: " +  document.getName());

      // get the annotations from document
      if ((annotationSetName == null)|| (annotationSetName.equals("")))
        nameAllAnnots = document.getAnnotations();
      else
        nameAllAnnots = document.getAnnotations(annotationSetName);

      //if none found, print warning and exit
      if ((nameAllAnnots == null) || nameAllAnnots.isEmpty()) {
        Out.prln("OrthoMatcher Warning: No annotations found for processing");
        return;
      }

      //check if we've been run on this document before
      //and clean the doc if needed
      docCleanup();
      @SuppressWarnings("unchecked")
      Map>> matchesMap = (Map>>)document.getFeatures().
      get(DOCUMENT_COREF_FEATURE_NAME);


      // creates the cdg list from the document
      //no need to create otherwise, coz already done in init()
      if (!extLists)
        cdg=orthoAnnotation.buildTables(nameAllAnnots);


      //Match all name annotations and unknown annotations
      matchNameAnnotations();

      //used to check if the Orthomatcher works properly
      //OrthoMatcherHelper.setMatchesPositions(nameAllAnnots);

      // set the matches of the document
      //    determineMatchesDocument();
      if (! matchesDocFeature.isEmpty()) {
        if(matchesMap == null){
          matchesMap = new HashMap>>();
        }
        matchesMap.put(nameAllAnnots.getName(), matchesDocFeature);
        // System.out.println("matchesMap is: " + matchesMap);
        //we need to put it even if it was already present in order to triger
        //the update events
        document.getFeatures().put(DOCUMENT_COREF_FEATURE_NAME, matchesMap);

        //cannot do clear() as this has already been put on the document
        //so I need a new one for the next run of matcher
        matchesDocFeature = new ArrayList>();


        fireStatusChanged("OrthoMatcher completed");
      }
    }finally{
      //make sure the cleanup happens even if there are errors.
      //    Out.prln("Processed strings" + processedAnnots.values());
      //clean-up the internal data structures for next run
      nameAllAnnots = null;
      processedAnnots.clear();
      annots2Remove.clear();
      tokensMap.clear();
      normalizedTokensMap.clear();
      matchesDocFeature = new ArrayList>();
      longAnnot = null;
      shortAnnot = null;
      tokensLongAnnot = null;
      tokensShortAnnot = null;

      //if (log.isDebugEnabled()) OrthoMatcherHelper.saveUsedTable();
    }
  } // run()

  /**
   * Matches the annotations of every configured annotation type.
   * <p>
   * For each type the annotations are visited in offset order. An annotation
   * whose string has been seen before is linked to the earlier occurrence
   * directly (except single-token persons, which always go through the
   * rules); all others are normalized and compared with the preceding
   * annotations via matchWithPrevious. After each type, unknown annotations
   * are matched too when matchingUnknowns is set.
   *
   * @throws ExecutionException propagated from matchUnknown
   */
  protected void matchNameAnnotations() throws ExecutionException{
    // go through all the annotation types
    Iterator<String> iterAnnotationTypes = annotationTypes.iterator();
    while (iterAnnotationTypes.hasNext()) {
      String annotationType = iterAnnotationTypes.next();

      AnnotationSet nameAnnots = nameAllAnnots.get(annotationType);

      // continue if no such annotations exist
      if (nameAnnots.isEmpty()) continue;

      AnnotationSet tokensNameAS = nameAllAnnots.get(TOKEN_ANNOTATION_TYPE);
      if (tokensNameAS.isEmpty()) continue;

      ArrayList<Annotation> sortedNameAnnots = new ArrayList<Annotation>(nameAnnots);
      Collections.sort(sortedNameAnnots,new OffsetComparator());
      for (int snaIndex = 0;snaIndex < sortedNameAnnots.size();snaIndex++) {
        Annotation tempAnnot = sortedNameAnnots.get(snaIndex);
        Annotation nameAnnot = nameAllAnnots.get(tempAnnot.getId()); // Not sure if this matters

        // get string and value
        String annotString = orthoAnnotation.getStringForAnnotation(nameAnnot, document);

        //convert to lower case if we are not doing a case sensitive match
        if (!caseSensitive)
          annotString = annotString.toLowerCase();

        if (DEBUG && log.isDebugEnabled()) {
          log.debug("Now processing the annotation:  "
                  + orthoAnnotation.getStringForAnnotation(nameAnnot, document) + " Id: " + nameAnnot.getId()
                  + " Type: " + nameAnnot.getType() + " Offset: " + nameAnnot.getStartNode().getOffset());
        }

        // get the tokens
        List<Annotation> tokens = new ArrayList<Annotation>(
                tokensNameAS.getContained(nameAnnot.getStartNode().getOffset(),
                nameAnnot.getEndNode().getOffset()));

        //if no tokens to match, do nothing
        if (tokens.isEmpty()) {
          if (log.isDebugEnabled()) {
            log.debug("Didn't find any tokens for the following annotation.  We will be unable to perform coref on this annotation.  \n String:  "
                    + orthoAnnotation.getStringForAnnotation(nameAnnot, document) + " Id: " + nameAnnot.getId() + " Type: " + nameAnnot.getType());
          }
          continue;
        }
        Collections.sort(tokens, new gate.util.OffsetComparator());
        //check if these actually do not end after the name
        //needed coz new tokeniser conflates
        //strings with dashes. So British Gas-style is two tokens
        //instead of three. So cannot match properly British Gas
        //      tokens = checkTokens(tokens);
        tokensMap.put(nameAnnot.getId(), tokens);
        // separate copy, so that normalization never alters the raw token list
        normalizedTokensMap.put(nameAnnot.getId(), new ArrayList<Annotation>(tokens));

        //first check whether we have not matched such a string already
        //if so, just consider it matched, don't bother calling the rules
        // Exception:  AB, Spock:
        // Note that we require one-token Person annotations to be matched even if an identical string
        // has been matched earlier because there could be multiple people named "David", for instance,
        // on a page.
        if (processedAnnots.containsValue(annotString) &&
                (! (nameAnnot.getType().equals(personType) && (tokens.size() == 1)))) {
          Annotation returnAnnot = orthoAnnotation.updateMatches(nameAnnot, annotString,processedAnnots,nameAllAnnots,matchesDocFeature);
          if (returnAnnot != null) {
            if (DEBUG && log.isDebugEnabled()) {
              log.debug("Exact match criteria matched " + annotString + " from (id: " + nameAnnot.getId() + ", offset: " + nameAnnot.getStartNode().getOffset() + ") to " +
                      "(id: " + returnAnnot.getId() + ", offset: " + returnAnnot.getStartNode().getOffset() + ")");
            }
            processedAnnots.put(nameAnnot.getId(), annotString);
            continue;
          }
          // no earlier annotation could be linked: fall through to the rules
        } else if (processedAnnots.isEmpty()) {
          // very first annotation: nothing to compare it with yet
          processedAnnots.put(nameAnnot.getId(), annotString);
          continue;
        }

        //if a person, then remove their title before matching
        if (nameAnnot.getType().equals(personType)) {
          annotString = orthoAnnotation.stripPersonTitle(annotString, nameAnnot,document,tokensMap,normalizedTokensMap,nameAllAnnots);
          normalizePersonName(nameAnnot);
        }
        else if (nameAnnot.getType().equals(organizationType))
          annotString = normalizeOrganizationName(annotString, nameAnnot);

        if(null == annotString || "".equals(annotString) || tokens.isEmpty()) {
          if (log.isDebugEnabled()) {
            log.debug("Annotation ID " + nameAnnot.getId() + " of type" + nameAnnot.getType() +
            " refers to a null or empty string or one with no tokens after normalization.  Unable to process further.");
          }
          continue;
        }
        //otherwise try matching with previous annotations
        matchWithPrevious(nameAnnot, annotString,sortedNameAnnots,snaIndex);

        //finally add the current annotations to the processed map
        processedAnnots.put(nameAnnot.getId(), annotString);
      }//for through name annotations
      if (matchingUnknowns) {
        matchUnknown(sortedNameAnnots);
      }
    }//while through annotation types

  }

  /**
   * Tries to match every annotation of the unknown type against the already
   * processed annotations and, for those that match, replaces the unknown
   * annotation with one of the matched type.
   * <p>
   * Called from matchNameAnnotations once per annotation type.
   *
   * @param sortedAnnotationsForAType offset-sorted annotations of the type
   *        currently being processed; candidates for matchWithPrevious
   * @throws ExecutionException declared for subclasses; not thrown by the
   *         code visible here
   */
  protected void matchUnknown(ArrayList<Annotation> sortedAnnotationsForAType) throws ExecutionException {
    //get all Unknown annotations
    AnnotationSet unknownAnnots = nameAllAnnots.get(unknownType);
    annots2Remove.clear();
    if (unknownAnnots.isEmpty()) return;

    AnnotationSet nameAllTokens = nameAllAnnots.get(TOKEN_ANNOTATION_TYPE);
    if (nameAllTokens.isEmpty()) return;

    Iterator<Annotation> iter = unknownAnnots.iterator();
    //loop through the unknown annots
    while (iter.hasNext()) {
      Annotation unknown = iter.next();

      // get string and value
      String unknownString = orthoAnnotation.getStringForAnnotation(unknown, document);
      //convert to lower case if we are not doing a case sensitive match
      if (!caseSensitive)
        unknownString = unknownString.toLowerCase();

      //get the tokens
      List<Annotation> tokens = new ArrayList<Annotation>(nameAllTokens.getContained(
              unknown.getStartNode().getOffset(),
              unknown.getEndNode().getOffset()
      ));
      if (tokens.isEmpty())
        continue;
      Collections.sort(tokens, new gate.util.OffsetComparator());
      tokensMap.put(unknown.getId(), tokens);
      // store a copy, as matchNameAnnotations does, so that the two maps
      // never share one mutable list
      normalizedTokensMap.put(unknown.getId(), new ArrayList<Annotation>(tokens));


      //first check whether we have not matched such a string already
      //if so, just consider it matched, don't bother calling the rules
      if (processedAnnots.containsValue(unknownString)) {
        Annotation matchedAnnot = orthoAnnotation.updateMatches(unknown, unknownString,processedAnnots,nameAllAnnots,matchesDocFeature);
        if (matchedAnnot == null) {
          if (log.isDebugEnabled()) {
            log.debug("Orthomatcher: Unable to find the annotation: " +
                    orthoAnnotation.getStringForAnnotation(unknown, document) +
            " in matchUnknown");
          }
          // fall through and try the rules instead
        }
        else {
          if (matchedAnnot.getType().equals(unknownType)) {
            // matched another unknown: inherit whatever type that one will get
            annots2Remove.put(unknown.getId(),
                    annots2Remove.get(matchedAnnot.getId()));
          }
          else
            annots2Remove.put(unknown.getId(), matchedAnnot.getType());
          processedAnnots.put(unknown.getId(), unknownString);
          unknown.getFeatures().put("NMRule", unknownType);
          continue;
        }
      }

      //check if we should do sub-string matching in case it's hyphenated
      //for example US-led
      if (tokens.size() == 1
              && "hyphen".equals(unknown.getFeatures().get(TOKEN_KIND_FEATURE_NAME))) {
        if (matchHyphenatedUnknowns(unknown, unknownString, iter))
          continue;
      }//if

      // TODO:  The below results in a assigning the unknown's to the last annotation that it matches in a document.
      // It would probably be better to first start with things which precede the current unknown and then do
      // annotations after
      matchWithPrevious(unknown, unknownString,sortedAnnotationsForAType,sortedAnnotationsForAType.size());

    } //while though unknowns

    if (! annots2Remove.isEmpty()) {
      Iterator<Integer> unknownIter = annots2Remove.keySet().iterator();
      while (unknownIter.hasNext()) {
        Integer unknId = unknownIter.next();
        Annotation unknown = nameAllAnnots.get(unknId);
        // re-create the annotation under its new type, sharing the features
        Integer newID = nameAllAnnots.add(
                unknown.getStartNode(),
                unknown.getEndNode(),
                annots2Remove.get(unknId),
                unknown.getFeatures()
        );
        nameAllAnnots.remove(unknown);

        //change the id in the matches list
        @SuppressWarnings("unchecked")
        List<Integer> mList = (List<Integer>)unknown.getFeatures().
        get(ANNOTATION_COREF_FEATURE_NAME);
        if (mList != null) {
          // unknId is an Integer, so this removes the element, not an index
          mList.remove(unknId);
          mList.add(newID);
        }
      }//while
    }//if
  }

  /**
   * Tries to match the part of a hyphenated unknown annotation that precedes
   * the first hyphen (e.g. "US" in "US-led") against the strings already
   * processed. Only an exact match of that prefix counts.
   * <p>
   * On success the unknown annotation is removed (from the iterator's set and
   * from the processed annotation set) and replaced by an annotation of the
   * matched type that spans only the prefix.
   *
   * @param unknown        the unknown annotation under consideration
   * @param unknownString  its (possibly lower-cased) string
   * @param iter           iterator currently positioned on {@code unknown};
   *                       used to remove it from the set being iterated
   * @return {@code true} if the prefix was matched and the annotation replaced;
   *         {@code false} if the string has no hyphen, the prefix is unknown,
   *         or no matching annotation could be found
   */
  private boolean matchHyphenatedUnknowns(Annotation unknown, String unknownString,
          Iterator<Annotation> iter){
    //only take the substring before the hyphen
    int stringEnd = unknownString.indexOf("-");
    if (stringEnd < 0) {
      // no ASCII hyphen in the string: substring(0, -1) would throw
      return false;
    }
    unknownString = unknownString.substring(0, stringEnd);

    //check if we've already matched this string
    //because only exact match of the substring are considered
    if (!processedAnnots.containsValue(unknownString)) {
      return false;
    }

    Annotation matchedAnnot = orthoAnnotation.updateMatches(unknown, unknownString,processedAnnots,nameAllAnnots,matchesDocFeature);
    if (matchedAnnot == null) {
      // updateMatches may fail to find the annotation (matchUnknown handles
      // the same case); leave the unknown untouched for the regular rules
      return false;
    }

    iter.remove();
    String newType;
    if (matchedAnnot.getType().equals(unknownType))
      newType = annots2Remove.get(matchedAnnot.getId());
    else
      newType = matchedAnnot.getType();

    Integer newID;
    try {
      // the replacement covers only the text before the hyphen
      newID = nameAllAnnots.add(
              unknown.getStartNode().getOffset(),
              unknown.getStartNode().getOffset() + stringEnd,
                      newType,
                      unknown.getFeatures()
      );
    } catch (InvalidOffsetException ex) {
      // keep the original exception as the cause
      throw new GateRuntimeException(ex.getMessage(), ex);
    }
    nameAllAnnots.remove(unknown);

    //change the id in the matches list
    @SuppressWarnings("unchecked")
    List<Integer> mList = (List<Integer>)unknown.getFeatures().
    get(ANNOTATION_COREF_FEATURE_NAME);
    if (mList != null) {
      // getId() is an Integer, so this removes the element, not an index
      mList.remove(unknown.getId());
      mList.add(newID);
    }

    return true;
  }

  /**
   * Attempt to match nameAnnot against all previous annotations of the same type, which are passed down
   * in listOfThisType.  Matches are tested in order from most recent to oldest, and the search stops at
   * the first match.
   * <p>
   * Candidates are skipped when they are of a different type (unless nameAnnot is unknown), when both
   * annotations are unknown, when the pair is already matched, or when both are persons with opposite
   * genders.
   *
   * @param nameAnnot    Annotation we are trying to match
   * @param annotString  Normalized string representation of annotation
   * @param listOfThisType  ArrayList of Annotations of the same type as nameAnnot
   * @param startIndex   Index in listOfThisType that we will start from in matching the current annotation
   *                     (exclusive: the first candidate is the element at startIndex - 1)
   */
  protected void matchWithPrevious(Annotation nameAnnot, String annotString,
          ArrayList<Annotation> listOfThisType,
          int startIndex) {
    boolean matchedUnknown = false;
    boolean nameIsUnknown = nameAnnot.getType().equals(unknownType);

    for (int curIndex = startIndex - 1;curIndex >= 0;curIndex--) {
      Integer prevId = listOfThisType.get(curIndex).getId();
      Annotation prevAnnot = nameAllAnnots.get(prevId);  // Note that this line probably isn't necessary anymore

      //check if the two are from the same type or the new one is unknown
      if (prevAnnot == null
              || (! prevAnnot.getType().equals(nameAnnot.getType()) && ! nameIsUnknown))
        continue;
      //do not compare two unknown annotations either
      //they are only matched to those of known types
      if (nameIsUnknown && prevAnnot.getType().equals(unknownType))
        continue;

      //check if we have already matched this annotation to the new one
      if (orthoAnnotation.matchedAlready(nameAnnot, prevAnnot,matchesDocFeature,nameAllAnnots) )
        continue;

      //now changed to a rule, here we just match by gender
      if (prevAnnot.getType().equals(personType)) {
        String prevGender =
          (String) prevAnnot.getFeatures().get(PERSON_GENDER_FEATURE_NAME);
        String nameGender =
          (String) nameAnnot.getFeatures().get(PERSON_GENDER_FEATURE_NAME);
        if (prevGender != null && nameGender != null) {
          boolean femaleVersusMale = nameGender.equalsIgnoreCase("female")
                  && prevGender.equalsIgnoreCase("male");
          boolean maleVersusFemale = prevGender.equalsIgnoreCase("female")
                  && nameGender.equalsIgnoreCase("male");
          if (femaleVersusMale || maleVersusFemale)
            continue; //we don't have a match if the two genders are different
        }
      }//if

      //if the two annotations match
      //
      // A. Borthwick, Spock:  If the earlier annotation is shorter than the current annotation and it
      // has already been matched with a longer annotations, then don't match it with the current annotation.
      // Reasoning is that with the sequence David Jones . . . David  . . . David Smith, we don't want to match
      // David Smith with David.  However, with the sequence, David  . . . David Jones, it's okay to match the
      // shorter version with the longer, because it hasn't already been matched with a longer.
      //
      // The flag must be sampled BEFORE matchAnnotations runs, so that we can tell below
      // whether the feature appeared during this very match.
      boolean prevAnnotUsedToMatchWithLonger = prevAnnot.getFeatures().containsKey("matchedWithLonger");
      if (matchAnnotations(nameAnnot, annotString,  prevAnnot)) {
        orthoAnnotation.updateMatches(nameAnnot, prevAnnot,matchesDocFeature,nameAllAnnots);
        if (DEBUG) {
          log.debug("Just matched nameAnnot " + nameAnnot.getId() + " with prevAnnot " + prevAnnot.getId());
        }

        if (!prevAnnotUsedToMatchWithLonger && prevAnnot.getFeatures().containsKey("matchedWithLonger")) {
          // We have just matched the previous annotation with a longer annotation for the first time.  We need
          // to propagate the matchedWithLonger property to all other annotations which coreffed with the previous annotation
          // so that we don't match them with a longer annotation
          propagatePropertyToExactMatchingMatches(prevAnnot,"matchedWithLonger",true);
        }
        //if unknown annotation, we need to change to the new type
        if (nameIsUnknown) {
          matchedUnknown = true;
          if (prevAnnot.getType().equals(unknownType))
            annots2Remove.put(nameAnnot.getId(),
                    annots2Remove.get(prevAnnot.getId()));
          else
            annots2Remove.put(nameAnnot.getId(), prevAnnot.getType());
          //also put an attribute to indicate that
          nameAnnot.getFeatures().put("NMRule", unknownType);
        }//if unknown
        break; //no need to match further
      }//if annotations matched

    }//for through previous annotations

    // matched unknowns are recorded here; annotations of known types are
    // recorded by the caller
    if (matchedUnknown)
      processedAnnots.put(nameAnnot.getId(), annotString);


  }//matchWithPrevious

  /**
   * Copies a feature onto every annotation in updateAnnot's coreference list
   * whose string fuzzy-matches updateAnnot's own (lower-cased) string.
   * <p>
   * This is a best-effort operation: ids that no longer resolve to an
   * annotation are skipped, and any unexpected exception is logged rather
   * than propagated.
   *
   * @param updateAnnot  annotation whose coreference list is walked
   * @param featureName  name of the feature to set
   * @param value        value to set for that feature
   */
  protected void propagatePropertyToExactMatchingMatches(Annotation updateAnnot,String featureName,Object value) {
    try {
      @SuppressWarnings("unchecked")
      List<Integer> matchesList = (List<Integer>) updateAnnot.getFeatures().get(ANNOTATION_COREF_FEATURE_NAME);
      if ((matchesList == null) || matchesList.isEmpty()) {
        return;
      }

      String updateAnnotString = orthoAnnotation.getStringForAnnotation(updateAnnot, document).toLowerCase();
      for (Integer nextId : matchesList) {
        Annotation a = nameAllAnnots.get(nextId);
        if (a == null) {
          // stale id: skip it instead of aborting the remaining propagation
          continue;
        }

        if (orthoAnnotation.fuzzyMatch(orthoAnnotation.getStringForAnnotation(a, document),updateAnnotString)) {
          if (DEBUG) {
            log.debug("propogateProperty: " + featureName + " " + value + " from: " + updateAnnot.getId() + " to: " + a.getId());
          }
          a.getFeatures().put(featureName, value);
        }
      }
    }
    catch (Exception e) {
      log.error("Error in propogatePropertyToExactMatchingMatches", e);
    }
  }

  /**
   * Decides whether a new annotation matches a previously processed one.
   * <p>
   * Overlapping annotations and previous annotations without a recorded
   * string never match. Otherwise the longer/shorter roles are assigned by
   * string length (ties keep the previous annotation as the longer one), the
   * scratch fields used by the rules are filled in, and the rules are applied.
   * If the previous annotation already has coreferents and the applied rule
   * set allMatchingNeeded, the new annotation must also match all of those.
   *
   * @param newAnnot     the annotation being matched
   * @param annotString  its normalized string
   * @param prevAnnot    the earlier annotation to compare with
   * @return {@code true} if the two annotations are considered a match
   */
  protected boolean matchAnnotations(Annotation newAnnot, String annotString,
          Annotation prevAnnot) {
    //do not match two annotations that overlap
    if (newAnnot.overlaps(prevAnnot)) {
      return false;
    }

    String prevAnnotString = processedAnnots.get(prevAnnot.getId());
    if (prevAnnotString == null) {
      // the previous annotation was never recorded as processed
      return false;
    }

    // find which annotation string of the two is longer
    //  this is useful for some of the matching rules
    boolean longerPrevious = annotString.length() <= prevAnnotString.length();
    String longName;
    String shortName;
    if (longerPrevious) {
      longName = prevAnnotString;
      shortName = annotString;
      longAnnot = prevAnnot;
      shortAnnot = newAnnot;
    } else {
      longName = annotString;
      shortName = prevAnnotString;
      longAnnot = newAnnot;
      shortAnnot = prevAnnot;
    }

    Integer longId = longAnnot.getId();
    Integer shortId = shortAnnot.getId();
    tokensLongAnnot = tokensMap.get(longId);
    normalizedTokensLongAnnot = normalizedTokensMap.get(longId);
    tokensShortAnnot = tokensMap.get(shortId);
    normalizedTokensShortAnnot = normalizedTokensMap.get(shortId);

    @SuppressWarnings("unchecked")
    List<Integer> matchesList = (List<Integer>) prevAnnot.getFeatures().
    get(ANNOTATION_COREF_FEATURE_NAME);
    // sampled before the rules run, exactly as the decision was ordered before
    boolean prevHasNoCoreferents = (matchesList == null || matchesList.isEmpty());

    boolean rulesMatched = apply_rules_namematch(prevAnnot.getType(), shortName, longName,
            prevAnnot, newAnnot, longerPrevious);

    if (prevHasNoCoreferents) {
      return rulesMatched;
    }
    if (!rulesMatched) {
      return false;
    }

    //if these two match, then let's see if all the other matching one will too
    //that's needed, because sometimes names can share a token (e.g., first or
    //last but not be the same
    if (!allMatchingNeeded) {
      return true;
    }

    // The rule requires that transitivity is not assumed: the new annotation
    // has to match every other member of the previous annotation's chain.
    allMatchingNeeded = false;
    List<Integer> toMatchList = new ArrayList<Integer>(matchesList);
    toMatchList.remove(prevAnnot.getId());
    return matchOtherAnnots(toMatchList, newAnnot, annotString);
  }

  /**
   * Checks whether the new annotation matches every annotation whose id is
   * listed in {@code toMatchList}.
   * <p>
   * The new annotation has to match all of them because assuming
   * transitivity does not always work when two different entities share a
   * common token, e.g. "BT Cellnet", "BT" and "British Telecom".
   * <p>
   * Side effect: for every comparison the long/short annotation fields and
   * the corresponding token-list fields are re-pointed at the pair being
   * compared (rules such as {@code noMatchRule2} read those fields).
   *
   * @param toMatchList ids of the annotations that must all match
   * @param newAnnot the annotation being tested
   * @param annotString the string compared on behalf of {@code newAnnot}
   * @return {@code true} if the list is empty or every listed annotation
   *         matches; {@code false} at the first mismatch, or when the text of
   *         a listed annotation cannot be read from the document
   */
  protected boolean matchOtherAnnots( List toMatchList, Annotation newAnnot,
          String annotString) {

    // an empty list never enters the loop, i.e. it matches trivially
    for (int pos = 0; pos < toMatchList.size(); pos++) {
      Annotation prevAnnot = nameAllAnnots.get((Integer) toMatchList.get(pos));

      // prefer the already processed string, fall back to the document text
      String prevAnnotString = processedAnnots.get(prevAnnot.getId());
      if (prevAnnotString == null) {
        try {
          prevAnnotString = document.getContent().getContent(
                  prevAnnot.getStartNode().getOffset(),
                  prevAnnot.getEndNode().getOffset()
          ).toString();
        } catch (InvalidOffsetException ioe) {
          return false;
        }
      }

      // decide which of the two strings is the longer one; several matching
      // rules depend on that. On equal length the NEW annotation is treated
      // as the long one.
      final boolean longerPrevious = annotString.length() < prevAnnotString.length();
      final String longName;
      final String shortName;
      if (longerPrevious) {
        longName = prevAnnotString;
        shortName = annotString;
        longAnnot = prevAnnot;
        shortAnnot = newAnnot;
      } else {
        longName = annotString;
        shortName = prevAnnotString;
        longAnnot = newAnnot;
        shortAnnot = prevAnnot;
      }

      tokensLongAnnot = tokensMap.get(longAnnot.getId());
      normalizedTokensLongAnnot = normalizedTokensMap.get(longAnnot.getId());
      tokensShortAnnot = tokensMap.get(shortAnnot.getId());
      normalizedTokensShortAnnot = normalizedTokensMap.get(shortAnnot.getId());

      // stop at the first annotation that does not match
      if (!apply_rules_namematch(prevAnnot.getType(), shortName, longName,
              prevAnnot, newAnnot, longerPrevious)) {
        return false;
      }
    }
    return true;
  }

  @SuppressWarnings("unchecked")
  protected void docCleanup() {
    Object matchesValue = document.getFeatures().get(DOCUMENT_COREF_FEATURE_NAME);
    if (matchesValue != null && (matchesValue instanceof Map))
      ((Map>>)matchesValue).remove(nameAllAnnots.getName());
    else if (matchesValue != null) {
      document.getFeatures().put(DOCUMENT_COREF_FEATURE_NAME, new HashMap>>());
    }

    //get all annotations that have a matches feature
    HashSet fNames = new HashSet();
    fNames.add(ANNOTATION_COREF_FEATURE_NAME);
    AnnotationSet annots =
      nameAllAnnots.get(null, fNames);

    //  Out.prln("Annots to cleanup" + annots);

    if (annots == null || annots.isEmpty())
      return;

    Iterator iter = annots.iterator();
    while (iter.hasNext()) {
      while (iter.hasNext())
        iter.next().getFeatures().remove(ANNOTATION_COREF_FEATURE_NAME);
    } //while
  }//cleanup


  /** Matches a run of one or more period ('.') characters. */
  static Pattern periodPat = Pattern.compile("[\\.]+");

  /**
   * Builds the normalised token list of a person annotation.
   * <p>
   * Every token of the annotation whose kind is punctuation is tagged with
   * the {@code "ortho_stop"} feature. The list stored in
   * {@code normalizedTokensMap} for the annotation is then replaced by a copy
   * that leaves out all tokens carrying that feature (including tokens that
   * were tagged before this call).
   *
   * @param annot the person annotation whose tokens are normalised
   * @throws ExecutionException declared for callers; not thrown by this
   *         implementation
   */
  protected void normalizePersonName (Annotation annot) throws ExecutionException {
    List<Annotation> tokens = normalizedTokensMap.get(annot.getId());

    // tag punctuation; constant-first equals() means a token without a kind
    // feature is simply not treated as punctuation instead of raising an NPE
    for (Annotation token : tokens) {
      if (PUNCTUATION_VALUE.equals(token.getFeatures().get(TOKEN_KIND_FEATURE_NAME))) {
        token.getFeatures().put("ortho_stop", true);
      }
    }

    // keep, in their original order, only the tokens that are not stop tokens
    List<Annotation> normalizedTokens = new ArrayList<Annotation>(tokens.size());
    for (Annotation token : tokens) {
      if (!token.getFeatures().containsKey("ortho_stop")) {
        normalizedTokens.add(token);
      }
    }
    normalizedTokensMap.put(annot.getId(), normalizedTokens);
  }

  /**
   * Returns the organization name without a leading "The" and without a
   * trailing company designator.
   * <p>
   * Side effects:
   * <ul>
   * <li>the token list held in {@code tokensMap} for the annotation is
   * modified in place (a leading "The" and a trailing cdg entry are
   * removed);</li>
   * <li>tokens that are punctuation, have category {@code DT} or {@code IN},
   * or appear on the cdg list are tagged with {@code "ortho_stop"};</li>
   * <li>a copy of the remaining tokens without the tagged ones is stored in
   * {@code normalizedTokensMap}.</li>
   * </ul>
   *
   * @param annotString the annotation string (not used by this implementation)
   * @param annot the organization annotation to normalise
   * @return the remaining token strings joined by single spaces, lower-cased
   *         unless running case-sensitively; empty if no token remains
   */
  protected String normalizeOrganizationName (String annotString, Annotation annot){

    List<Annotation> tokens = tokensMap.get(annot.getId());

    // strip a starting "The" first; the emptiness check has to precede
    // get(0), otherwise an annotation without tokens fails with an
    // IndexOutOfBoundsException
    if (!tokens.isEmpty()
            && THE_VALUE.equalsIgnoreCase(
                    (String) tokens.get(0).getFeatures().get(TOKEN_STRING_FEATURE_NAME))) {
      tokens.remove(0);
    }

    if (!tokens.isEmpty()) {

      // New code by A. Borthwick of Spock Networks
      // June 13, 2008
      // Tag everything on the cdg list, which now encompasses not just cdg's,
      // but also other stopwords
      for (Annotation token : tokens) {
        String tokenString = (String) token.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
        Object kind = token.getFeatures().get(TOKEN_KIND_FEATURE_NAME);
        String category = (String) token.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME);
        if (!caseSensitive)  {
          tokenString = tokenString.toLowerCase();
        }
        // constant-first comparisons are null-safe for tokens that lack a
        // kind or a category feature
        if (PUNCTUATION_VALUE.equals(kind)
                || "DT".equals(category) || "IN".equals(category)
                || cdg.contains(tokenString)) {
          token.getFeatures().put("ortho_stop", true);
        }
      }

      // AB, Spock:  Need to check for CDG even for 1 token so we don't automatically match
      // a one-token annotation called "Company", for instance
      int lastIndex = tokens.size() - 1;
      String compareString =
              (String) tokens.get(lastIndex).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
      if (!caseSensitive) {
        compareString = compareString.toLowerCase();
      }
      if (cdg.contains(compareString)) {
        tokens.remove(lastIndex);
      }

    }

    // the normalised list is the remaining tokens minus the tagged ones
    List<Annotation> normalizedTokens = new ArrayList<Annotation>(tokens.size());
    for (Annotation token : tokens) {
      if (!token.getFeatures().containsKey("ortho_stop")) {
        normalizedTokens.add(token);
      }
    }
    normalizedTokensMap.put(annot.getId(), normalizedTokens);

    // rebuild the name from ALL remaining tokens (tagged ones included)
    StringBuilder newString = new StringBuilder(50);
    for (int i = 0; i < tokens.size(); i++){
      if (i > 0) {
        newString.append(" ");
      }
      newString.append((String) tokens.get(i).getFeatures().get(TOKEN_STRING_FEATURE_NAME));
    }

    if (caseSensitive)
      return newString.toString();

    return newString.toString().toLowerCase();
  }

  /**
   * Creates one of the lookup tables by reading a list file.
   * <p>
   * The file is resolved relative to the definition file URL and read line by
   * line with the configured encoding. For the cdg list every line becomes
   * one entry: trimmed, runs of punctuation replaced by a single space and,
   * unless running case-sensitively, lower-cased. For every other list a line
   * is split at the first '£' into an expression (lower-cased unless running
   * case-sensitively) and a code, and stored in the table selected by
   * {@code nameList}; lines without '£' are ignored.
   *
   * @param nameFile name of the list file, relative to the definition file
   * @param nameList name of the list being loaded; selects the target table
   * @throws IOException if the file cannot be opened or read
   */
  protected void createAnnotList(String nameFile, String nameList)
          throws IOException {
    // create the relative URL
    URL fileURL = new URL(definitionFileURL.toURL(), nameFile);
    // the list type does not change while reading, so test it only once
    final boolean cdgList = nameList.equals(CDGLISTNAME);
    BufferedReader bufferedReader = null;
    try {
      bufferedReader =
              new BomStrippingInputStreamReader(fileURL.openStream(), encoding);

      String lineRead = null;
      while((lineRead = bufferedReader.readLine()) != null) {
        if(cdgList) {
          // Lower-case only in case-insensitive mode. Previously the line was
          // lower-cased unconditionally, which made the caseSensitive branch
          // dead and kept cdg entries from ever matching mixed-case tokens in
          // case-sensitive mode.
          String entry = lineRead.trim();
          if(!caseSensitive) entry = entry.toLowerCase();
          cdg.add(punctPat.matcher(entry).replaceAll(" ").trim());
        }// if
        else {
          int index = lineRead.indexOf("£");
          if(index != -1) {
            String expr = lineRead.substring(0, index);
            // if not case-sensitive, we need to downcase all strings
            if(!caseSensitive) expr = expr.toLowerCase();
            String code = lineRead.substring(index + 1);
            if(nameList.equals(ALIASLISTNAME)) {
              alias.put(expr, code);
            } else if(nameList.equals(ARTLISTNAME)) {
              def_art.put(expr, code);
            } else if(nameList.equals(PREPLISTNAME)) {
              prepos.put(expr, code);
            } else if(nameList.equals(CONNECTORLISTNAME)) {
              connector.put(expr, code);
            } else if(nameList.equals(SPURLISTNAME)) {
              spur_match.put(expr, code);
            }
          }// if
        }// else

      }// while
    } finally {
      IOUtils.closeQuietly(bufferedReader);
    }
  }// createAnnotList


  /**
   * Skeleton of a pairwise comparison of two person name strings.
   * <p>
   * It is meant to become public eventually. It is private (and therefore
   * not functional) for now because OrthoMatcher relies on the tokenization
   * of the names, which is held in the fields {@code tokensShortAnnot} and
   * {@code tokensLongAnnot}; only the rules that can be called directly on
   * strings are applied here.
   *
   * @param name1 the first name string
   * @param name2 the second name string
   * @return  true if the two names indicate the same person
   */
  @SuppressWarnings("unused")
  private boolean pairwise_person_name_match(String name1, String name2) {
    // on equal length name2 is treated as the long name
    final boolean firstIsLonger = name1.length() > name2.length();
    final String longName = firstIsLonger ? name1 : name2;
    final String shortName = firstIsLonger ? name2 : name1;

    // rule 0 firing vetoes the match (matchRule0)
    if (rules.get(0).value(longName,shortName)) {
      return false;
    }

    if (longName.equals(shortName)
            || rules.get(2).value(longName, shortName)
            || rules.get(3).value(longName, shortName)) {
      return true;
    }

    // basic_person_match_criteria cannot be used here because it relies on
    // the fields tokensShortAnnot and tokensLongAnnot, so only what can be
    // called directly is called
    return rules.get(0).value(longName, shortName);
  }

  /**
   * Applies the basic person matching rules to a pair of names.
   * <p>
   * Note that some of the match rules invoked from here rely on fields of
   * this class rather than on the arguments alone.
   *
   * @param shortName the shorter of the two names
   * @param longName the longer of the two names
   * @param mr per-rule result array handed to
   *        {@code OrthoMatcherHelper.executeDisjunction}; the caller inspects
   *        it afterwards to decide which annotation to mark
   * @return the result of the disjunction of rules 1, 5, 6, 13, 15 and 16
   */
  private boolean basic_person_match_criteria(String shortName,
          String longName, boolean mr[]) {
    //kalina: added 16, so it matches names when contain more than one first and one last name
    final int[] personRules = {1,5,6,13,15,16};
    return OrthoMatcherHelper.executeDisjunction(rules, personRules, longName, shortName, mr);
  }


  /**
   * apply_rules_namematch: apply rules similarly to lasie1.5's namematch.
   * <p>
   * Order of evaluation: rule 0 (spurious matches) vetoes everything; then
   * the rules for all annotation types; then the organisation rules; finally
   * the person rules, which are guarded by the two no-match rules. On a
   * successful person match the {@code "matchedWithLonger"} feature may be
   * set on one of the two annotations.
   *
   * @param annotationType type of the annotations being compared
   * @param shortName the shorter name string
   * @param longName the longer name string
   * @param prevAnnot the earlier annotation
   * @param followAnnot the later annotation
   * @param longerPrevious whether the earlier annotation holds the longer name
   * @return {@code true} if the two names are considered a match
   */
  private boolean apply_rules_namematch(String annotationType, String shortName,
          String longName,Annotation prevAnnot,
          Annotation followAnnot,
          boolean longerPrevious) {
    boolean mr[] = new boolean[rules.size()];
    if (DEBUG) {
      log.debug("Now matching " + longName + "(id: " + longAnnot.getId() + ") to "
              + shortName + "(id: " + shortAnnot.getId() + ")");
    }

    // first apply rule for spurious matches i.e. rule0
    if (rules.get(0).value(longName,shortName)) {
      return false;
    }

    // rules for all annotations
    // no longer use rule1, coz the check for same string is done via the hash table
    if (OrthoMatcherHelper.executeDisjunction(rules, new int[] {2,3},longName,shortName,mr)) {
      return true;
    }

    // rules for organisation annotations ("Facility" is an ACE addition)
    if (annotationType.equals(organizationType) || annotationType.equals("Facility")) {
      // Should basically only match when you have a match of all tokens other
      // than CDG's and function words; high precision mode uses the safer set
      final int[] orgRules = highPrecisionOrgs
              ? new int[] {7,8,10,11,17}
              : new int[] {4,6,7,8,9,10,11,12,14};
      if (OrthoMatcherHelper.executeDisjunction(rules, orgRules, longName, shortName, mr)) {
        return true;
      }
    }

    // everything below applies to person annotations only
    if (!annotationType.equals(personType)) {
      return false;
    }
    if (noMatchRule1(longName, shortName,prevAnnot, longerPrevious) ||
            noMatchRule2(longName, shortName)) {
      return false;
    }
    if (!basic_person_match_criteria(shortName,longName,mr)) {
      return false;
    }

    final boolean sameLength = longName.length() == shortName.length();
    if (!sameLength && (mr[4] || mr[5] || mr[14] || mr[15])) {
      // For 4, 5, 14, and 15, need to mark shorter annot
      Annotation shorter = longerPrevious ? followAnnot : prevAnnot;
      shorter.getFeatures().put("matchedWithLonger", true);
    }
    else if (sameLength && mr[1]) {
      // identical-length match on rule 1: propagate the marker along the chain
      if (prevAnnot.getFeatures().containsKey("matchedWithLonger")) {
        followAnnot.getFeatures().put("matchedWithLonger", true);
      }
    }
    return true;
  }//apply_rules


  /**
   * Sets the extLists flag.
   *
   * @param newExtLists the new value; must not be {@code null} because it is
   *        unboxed
   */
  @Optional
  @CreoleParameter(comment="External lists otherwise internal", defaultValue="true")
  public void setExtLists(Boolean newExtLists) {
    this.extLists = newExtLists.booleanValue();
  }//setExtLists

  /**
   * Sets the caseSensitive flag.
   *
   * @param newCase the new value; must not be {@code null} because it is
   *        unboxed
   */
  @Optional
  @CreoleParameter(comment="Should this resource diferentiate on case?",defaultValue="false")
  public void setCaseSensitive(Boolean newCase) {
    this.caseSensitive = newCase.booleanValue();
  }//setCaseSensitive

  /**
   * Sets the name of the annotation set that holds the annotation types.
   *
   * @param newAnnotationSetName the annotation set name
   */
  @RunTime
  @Optional
  @CreoleParameter(comment="Annotation set name where are the annotation types (annotationTypes)")
  public void setAnnotationSetName(String newAnnotationSetName) {
    this.annotationSetName = newAnnotationSetName;
  }//setAnnotationSetName

  /**
   * Sets the types of the annotations to use.
   *
   * @param newType the annotation type names
   */
  @RunTime
  @Optional
  @CreoleParameter(comment="Name of the annotation types to use", defaultValue="Organization;Person;Location;Date")
  public void setAnnotationTypes(List newType) {
    this.annotationTypes = newType;
  }//setAnnotationTypes

  /**
   * Sets whether the Unknown annotations are processed.
   *
   * @param processOrNot the new value; must not be {@code null} because it is
   *        unboxed
   */
  @Optional
  @CreoleParameter(comment="Should we process 'Unknown' annotations?", defaultValue="true")
  public void setProcessUnknown(Boolean processOrNot) {
    matchingUnknowns = processOrNot.booleanValue();
  }//setProcessUnknown

  /**
   * Sets the annotation type that is treated as an organization.
   *
   * @param newOrganizationType the annotation type name
   */
  @Optional
  @CreoleParameter(comment="Annotation name for the organizations", defaultValue="Organization")
  public void setOrganizationType(String newOrganizationType) {
    this.organizationType = newOrganizationType;
  }//setOrganizationType

  /**
   * Sets the annotation type that is treated as a person.
   *
   * @param newPersonType the annotation type name
   */
  @Optional
  @CreoleParameter(comment="Annotation name for the persons", defaultValue="Person")
  public void setPersonType(String newPersonType) {
    this.personType = newPersonType;
  }//setPersonType

  /**
   * Gets the name of the annotation set.
   *
   * @return the annotation set name
   */
  public String getAnnotationSetName() {
    return this.annotationSetName;
  }//getAnnotationSetName

  /**
   * Gets the types of the annotations in use.
   *
   * @return the annotation type names
   */
  public List getAnnotationTypes() {
    return this.annotationTypes;
  }//getAnnotationTypes

  /**
   * Gets the annotation type that is treated as an organization.
   *
   * @return the organization annotation type name
   */
  public String getOrganizationType() {
    return this.organizationType;
  }

  /**
   * Gets the annotation type that is treated as a person.
   *
   * @return the person annotation type name
   */
  public String getPersonType() {
    return this.personType;
  }

  /**
   * Gets the extLists flag.
   *
   * @return the current value of the flag
   */
  public Boolean getExtLists() {
    return this.extLists;
  }

  /**
   * Tells whether we are running in a case-sensitive mode.
   *
   * @return the current value of the caseSensitive flag
   */
  public Boolean getCaseSensitive() {
    return this.caseSensitive;
  }

  /**
   * Tells whether or not the Unknown annotations are processed.
   *
   * @return the current value of the flag
   */
  public Boolean getProcessUnknown() {
    return this.matchingUnknowns;
  }



  /**
   * No Match Rule 1.
   * <p>
   * Avoids the problem of matching
   * <pre>
   * David Jones ...
   * David ...
   * David Smith
   * </pre>
   * Since "David" was matched with David Jones, we don't match David with
   * David Smith.
   *
   * @param s1 not used by this rule
   * @param s2 not used by this rule
   * @param previousAnnot the earlier annotation of the pair
   * @param longerPrevious whether the earlier annotation holds the longer name
   * @return {@code true} (the match must be rejected) only when the earlier
   *         annotation is not the longer one and carries the
   *         {@code "matchedWithLonger"} feature
   */
  public boolean noMatchRule1(String s1,
          String s2,Annotation previousAnnot, boolean longerPrevious) {
    // short-circuit keeps previousAnnot untouched when it is the longer one
    return !longerPrevious
            && previousAnnot.getFeatures().containsKey("matchedWithLonger");
  }//noMatchRule1

  /**
   * Detects a middle token which indicates that the name string contains a
   * nickname or a compound last name.
   * <p>
   * Only the tokens strictly between the first and the last one are
   * inspected; each is lower-cased, trimmed and searched with the
   * {@code badMiddleTokens} pattern.
   *
   * @param tokArray the token annotations of a name
   * @return {@code true} if any middle token matches the pattern
   */
  private boolean detectBadMiddleTokens(List<Annotation> tokArray) {
    for (int j = 1;j < tokArray.size() - 1;j++) {
      String currentToken = (String) tokArray.get(j).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
      if (badMiddleTokens.matcher(currentToken.toLowerCase().trim()).find()) {
        // We have found a case of a ", ', (, ), "de" or "von"
        return true;
      }
    }
    return false;
  }

  /**
   * NoMatch Rule #2: Do we have a mismatch of middle initial?
   * Condition(s):  Only applies to person names with more than two tokens in the name
   * <p>
   * Want George W. Bush != George H. W. Bush and George Walker Bush != George Herbert Walker Bush
   * and
   * John T. Smith != John Q. Smith
   * however
   * John T. Smith == John Thomas Smith
   * be careful about
   * Hillary Rodham Clinton == Hillary Rodham-Clinton
   * be careful about
   * Carlos Bueno de Lopez == Bueno de Lopez
   * and
   * Cynthia Morgan de Rothschild == Cynthia de Rothschild
   * <p>
   * The rule works on the token-list fields of the pair currently being
   * compared, not on the two string arguments, which are only used for
   * debug logging.
   *
   * @param s1 the longer name, used in log messages only
   * @param s2 the shorter name, used in log messages only
   * @return {@code true} if the match must be rejected
   */
  @SuppressWarnings("unused")
  public boolean noMatchRule2(String s1,String s2) {
    // only applies when both names have more than two normalised tokens
    if (normalizedTokensLongAnnot.size() <= 2 || normalizedTokensShortAnnot.size() <= 2) {
      return false;
    }

    final int longCount = normalizedTokensLongAnnot.size();
    final int shortCount = normalizedTokensShortAnnot.size();
    boolean mismatch = false;

    if (longCount == shortCount) {
      // same number of tokens: compare the middle tokens position by position
      // and stop at the first pair that is neither equal nor an initial match
      int pos = 1;
      while (!mismatch && pos < longCount - 1) {
        String longMiddle = (String) normalizedTokensLongAnnot.get(pos).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
        String shortMiddle = (String) normalizedTokensShortAnnot.get(pos).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
        if (!caseSensitive) {
          longMiddle = longMiddle.toLowerCase();
          shortMiddle = shortMiddle.toLowerCase();
        }
        boolean sameMiddle = rules.get(1).value(longMiddle,shortMiddle) ||
                OrthoMatcherHelper.initialMatch(longMiddle, shortMiddle);
        mismatch = !sameMiddle;
        pos++;
      }
    }
    else {
      String firstNameLong = (String) normalizedTokensLongAnnot.get(0).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
      String firstNameShort = (String) normalizedTokensShortAnnot.get(0).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
      String lastNameLong = (String) normalizedTokensLongAnnot.get(longCount - 1).
      getFeatures().get(TOKEN_STRING_FEATURE_NAME);
      String lastNameShort = (String) normalizedTokensShortAnnot.get(shortCount - 1).
      getFeatures().get(TOKEN_STRING_FEATURE_NAME);

      // Must have a match on first and last name for this non-match rule to
      // take effect when the number of tokens differs
      if (rules.get(1).value(firstNameLong,firstNameShort) &&
              (rules.get(1).value(lastNameLong,lastNameShort))) {
        if (detectBadMiddleTokens(tokensLongAnnot) || detectBadMiddleTokens(tokensShortAnnot)) {
          // Exclude the William (Bill) H. Gates vs. William H. Gates case and the
          // Cynthia Morgan de Rothschild vs. Cynthia de Rothschild case
          if (DEBUG && log.isDebugEnabled()) {
            log.debug("noMatchRule2Name did not non-match because of bad middle tokens " + s1 + "(id: " + longAnnot.getId() + ") to "
                    + s2+ "(id: " + shortAnnot.getId() + ")");
          }
          return false;
        }
        // Covers the George W. Bush vs George H. W. Bush and George Walker Bush
        // vs. George Herbert Walker Bush cases
        mismatch = true;
      }
    }

    if (mismatch && log.isDebugEnabled() && DEBUG)  {
      log.debug("noMatchRule2Name non-matched  " + s1 + "(id: " + longAnnot.getId() + ") to "
              + s2+ "(id: " + shortAnnot.getId() + ")");
    }
    return mismatch;
  }//noMatchRule2

  /**
   * Sets the location of the definition file. The list files read by
   * {@code createAnnotList} are resolved relative to this location.
   *
   * @param definitionFileURL reference to the definition file
   */
  @CreoleParameter(comment="The URL to the definition file", defaultValue="resources/othomatcher/listsNM.def", suffixes="def")
  public void setDefinitionFileURL(ResourceReference definitionFileURL) {
    this.definitionFileURL = definitionFileURL;
  }
  
  /**
   * Sets the location of the definition file from a plain {@link URL} by
   * wrapping it in a {@code ResourceReference}.
   *
   * @param definitionFileURL URL of the definition file
   * @throws RuntimeException if the URL cannot be converted
   * @deprecated use the overload taking a {@code ResourceReference}
   */
  @Deprecated
  public void setDefinitionFileURL(URL definitionFileURL) {
    final ResourceReference reference;
    try {
      reference = new ResourceReference(definitionFileURL);
    } catch (URISyntaxException e) {
      throw new RuntimeException("Error converting URL to ResourceReference", e);
    }
    setDefinitionFileURL(reference);
  }

  /**
   * Gets the location of the definition file.
   *
   * @return the reference to the definition file
   */
  public ResourceReference getDefinitionFileURL() {
    return this.definitionFileURL;
  }
  
  /**
   * Sets the encoding used for reading the definition file; the same value
   * is used by {@code createAnnotList} when opening the list files.
   *
   * @param encoding the character encoding name
   */
  @CreoleParameter(comment="The encoding used for reading the definition file", defaultValue="UTF-8")
  public void setEncoding(String encoding) {
    this.encoding = encoding;
  }
  /**
   * Gets the encoding used for reading the definition file.
   *
   * @return the character encoding name
   */
  public String getEncoding() {
    return this.encoding;
  }


  /**
   * Gets the minimum likelihood that a name is a nickname.
   *
   * @return the configured minimum likelihood
   */
  public Double getMinimumNicknameLikelihood() {
    return this.minimumNicknameLikelihood;
  }

  /**
   * Sets the minimum likelihood that a name is a nickname.
   *
   * @param minimumNicknameLikelihood the new minimum likelihood
   */
  @CreoleParameter(comment="Minimum likelihood that a name is a nickname", defaultValue="0.50")
  public void setMinimumNicknameLikelihood(Double minimumNicknameLikelihood) {
    this.minimumNicknameLikelihood = minimumNicknameLikelihood;
  }

  /**
   * Tells whether only the high precision rules are used for organizations.
   *
   * @return the highPrecisionOrgs
   */
  public Boolean getHighPrecisionOrgs() {
    return this.highPrecisionOrgs;
  }

  /**
   * Selects which rule set is applied to organization annotations: when set,
   * the rule matching uses the high precision rule list instead of the
   * default one.
   *
   * @param highPrecisionOrgs the highPrecisionOrgs to set
   */
  @Optional
  @CreoleParameter(comment="Use very safe features for matching orgs, such as ACME = ACME, Inc.", defaultValue="false")  
  public void setHighPrecisionOrgs(Boolean highPrecisionOrgs) {
    this.highPrecisionOrgs = highPrecisionOrgs;
  }

  /**
   * Sets the orthography helper held in the {@code orthoAnnotation} field.
   *
   * @param orthography the orthography implementation to use
   */
  public void setOrthography(AnnotationOrthography orthography) {
    this.orthoAnnotation = orthography;
  }

  /**
   * Gets the orthography helper held in the {@code orthoAnnotation} field.
   *
   * @return the orthography implementation in use
   */
  public AnnotationOrthography getOrthography() {
    return this.orthoAnnotation;
  }

  /**
   * Matches a run of one or more punctuation characters (POSIX class Punct).
   * createAnnotList uses it to replace punctuation in cdg list entries by a
   * single space.
   */
  static Pattern punctPat = Pattern.compile("[\\p{Punct}]+");
  // The UTF characters are right and left double and single curly quotes
  // Besides runs of quotes and parentheses anywhere in the input, the pattern
  // matches an input that consists solely of "de" or of "von";
  // detectBadMiddleTokens applies it with find() to lower-cased middle tokens.
  static Pattern badMiddleTokens = Pattern.compile("[\u201c\u201d\u2018\u2019\'\\(\\)\"]+|^de$|^von$");
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy