/*
* OrthoMatcher.java
*
* Copyright (c) 1995-2012, The University of Sheffield. See the file
* COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
*
* This file is part of GATE (see http://gate.ac.uk/), and is free
* software, licenced under the GNU Library General Public License,
* Version 2, June 1991 (in the distribution as file licence.html,
* and also available at http://gate.ac.uk/gate/licence.html).
*
* Kalina Bontcheva, 24/August/2001
*
* Major update by Andrew Borthwick of Spock Networks, 11/13/2007 - 8/3/2008:
* 1. matchWithPrevious now searches for matching annotations in order, starting from current and working backwards
* until it finds a match. This compares with the previous behavior, which searched randomly among previous annotations
* for a match (because it used an iterator across an AnnotationSet, whereas now we iterate across an ArrayList)
* 2. We no longer require that identical strings always refer to the same entity. We can correctly match
* the sequence "David Jones ... David ... David Smith ... David" as referring to two people, tying the first
* David to "David Jones" and the second David to "David Smith". Ditto with David Jones .. Mr. Jones ..
* Richard Jones .. Mr. Jones
* 3. We now allow for nickname matches for Persons (David = Dave) via the "fuzzyMatch" method which is referenced
* in some of the matching rules.
* 4. Optional parameter highPrecisionOrgs only allows high precision matches for organizations and
* turns off the riskier rules. Under this option, need to match on something like IBM = IBM Corp.
* 5. Various fixes to a number of rules
*
* $Id: OrthoMatcher.java 8929 2007-07-12 16:49:55Z ian_roberts $
*/
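/*
 * Usage sketch (added for documentation, not part of the original source): a
 * minimal example of creating and running this PR from the GATE embedded API.
 * The document URL and the absence of explicit parameters are assumptions;
 * ANNIE's tokeniser, gazetteer and NE transducer are assumed to have run
 * first so that Token, Person and Organization annotations already exist.
 *
 *   Gate.init();
 *   Document doc = Factory.newDocument(new URL("http://example.com/news.html"));
 *   LanguageAnalyser ortho = (LanguageAnalyser) Factory.createResource(
 *       "gate.creole.orthomatcher.OrthoMatcher", Factory.newFeatureMap());
 *   ortho.setDocument(doc);
 *   ortho.execute();
 */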
package gate.creole.orthomatcher;
import java.io.BufferedReader;
import java.io.IOException;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.IOUtils;
import org.apache.log4j.Logger;
import gate.Annotation;
import gate.AnnotationSet;
import gate.Resource;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.ResourceInstantiationException;
import gate.creole.ResourceReference;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.creole.metadata.Optional;
import gate.creole.metadata.RunTime;
import gate.util.BomStrippingInputStreamReader;
import gate.util.GateRuntimeException;
import gate.util.InvalidOffsetException;
import gate.util.OffsetComparator;
import gate.util.Out;
@CreoleResource(name="ANNIE OrthoMatcher", comment="ANNIE orthographical coreference component.", helpURL="http://gate.ac.uk/userguide/sec:annie:orthomatcher", icon="ortho-matcher")
public class OrthoMatcher extends AbstractLanguageAnalyser {
private static final long serialVersionUID = -6258229350677707465L;
protected static final Logger log = Logger.getLogger(OrthoMatcher.class);
public static final boolean DEBUG = false;
public static final String
OM_DOCUMENT_PARAMETER_NAME = "document";
public static final String
OM_ANN_SET_PARAMETER_NAME = "annotationSetName";
public static final String
OM_CASE_SENSITIVE_PARAMETER_NAME = "caseSensitive";
public static final String
OM_ANN_TYPES_PARAMETER_NAME = "annotationTypes";
public static final String
OM_ORG_TYPE_PARAMETER_NAME = "organizationType";
public static final String
OM_PERSON_TYPE_PARAMETER_NAME = "personType";
public static final String
OM_EXT_LISTS_PARAMETER_NAME = "extLists";
protected static final String CDGLISTNAME = "cdg";
protected static final String ALIASLISTNAME = "alias";
protected static final String ARTLISTNAME = "def_art";
protected static final String PREPLISTNAME = "prepos";
protected static final String CONNECTORLISTNAME = "connector";
protected static final String SPURLISTNAME = "spur_match";
protected static final String PUNCTUATION_VALUE = "punctuation";
protected static final String THE_VALUE = "The";
/**the name of the annotation set*/
protected String annotationSetName;
/** the types of the annotation */
protected List<String> annotationTypes = new ArrayList<String>(10);
/** the organization type*/
protected String organizationType = ORGANIZATION_ANNOTATION_TYPE;
/** the person type*/
protected String personType = PERSON_ANNOTATION_TYPE;
protected String unknownType = "Unknown";
/** internal or external list */
protected boolean extLists = true;
/** Use only high precision rules for Organizations */
protected Boolean highPrecisionOrgs = false;
/** matching unknowns or not*/
protected boolean matchingUnknowns = true;
/** This is an internal variable to indicate whether
* we matched using a rule that requires that
* the newly matched annotation matches all the others
* This is needed, because organizations can share
* first/last tokens like News and be different
*/
protected boolean allMatchingNeeded = false;
/** Orthomatching is not case-sensitive by default */
protected boolean caseSensitive = false;
//protected FeatureMap queryFM = Factory.newFeatureMap();
// name lookup tables (used for namematch)
//gave them bigger default size, coz rehash is expensive
protected HashMap<String,String> alias = new HashMap<String,String>(100);
protected Set<String> cdg = new HashSet<String>();
protected HashMap<String,String> spur_match = new HashMap<String,String>(100);
protected HashMap<String,String> def_art = new HashMap<String,String>(20);
protected HashMap<String,String> connector = new HashMap<String,String>(20);
protected HashMap<String,String> prepos = new HashMap<String,String>(30);
protected AnnotationSet nameAllAnnots = null;
protected HashMap<Integer,String> processedAnnots = new HashMap<Integer,String>(150);
protected HashMap<Integer,String> annots2Remove = new HashMap<Integer,String>(75);
protected List<List<Integer>> matchesDocFeature = new ArrayList<List<Integer>>();
//maps annotation ids to array lists of tokens
protected HashMap<Integer, List<Annotation>> tokensMap = new HashMap<Integer, List<Annotation>>(150);
public Map<Integer, List<Annotation>> getTokensMap() {
return tokensMap;
}
protected Map<Integer, List<Annotation>> normalizedTokensMap = new HashMap<Integer, List<Annotation>>(150);
protected Annotation shortAnnot;
protected Annotation longAnnot;
protected List<Annotation> tokensLongAnnot;
protected List<Annotation> tokensShortAnnot;
protected List<Annotation> normalizedTokensLongAnnot, normalizedTokensShortAnnot;
/**
* URL to the file containing the definition for this orthomatcher
*/
private ResourceReference definitionFileURL;
private Double minimumNicknameLikelihood;
/** The encoding used for the definition file and associated lists.*/
private String encoding;
private Map<Integer, OrthoMatcherRule> rules = new HashMap<Integer, OrthoMatcherRule>();
/** to be initialized in init() */
private AnnotationOrthography orthoAnnotation;
public OrthoMatcher () {
annotationTypes.add(organizationType);
annotationTypes.add(personType);
annotationTypes.add("Location");
annotationTypes.add("Date");
}
/** Initialise the rules. The orthomatcher loads its built-in rules. */
private void initRules(){
//this line should be executed after spur_match is loaded
rules.put(0, new MatchRule0(this));
rules.put(1, new MatchRule1(this));
rules.put(2, new MatchRule2(this));
rules.put(3, new MatchRule3(this));
rules.put(4, new MatchRule4(this));
rules.put(5, new MatchRule5(this));
rules.put(6, new MatchRule6(this));
rules.put(7, new MatchRule7(this));
rules.put(8, new MatchRule8(this));
rules.put(9, new MatchRule9(this));
rules.put(10, new MatchRule10(this));
rules.put(11, new MatchRule11(this));
rules.put(12, new MatchRule12(this));
rules.put(13, new MatchRule13(this));
rules.put(14, new MatchRule14(this));
rules.put(15, new MatchRule15(this));
rules.put(16, new MatchRule16(this));
rules.put(17, new MatchRule17(this));
}
/** Override this method to add, replace, remove rules */
protected void modifyRules(Map<Integer, OrthoMatcherRule> rules) {
}
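/*
 * Illustrative only: a subclass can customise the rule table via the
 * modifyRules hook above. "MyOrthoMatcher" is a hypothetical class name and
 * the particular edits are examples, not recommendations; the map's value
 * type is assumed to be the OrthoMatcherRule interface from this package.
 *
 *   public class MyOrthoMatcher extends OrthoMatcher {
 *     @Override
 *     protected void modifyRules(Map<Integer, OrthoMatcherRule> rules) {
 *       rules.remove(14);                      // drop one of the riskier org rules
 *       rules.put(17, new MatchRule17(this));  // or (re-)register a rule under an id
 *     }
 *   }
 */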
/** Initialise this resource, and return it. */
@SuppressWarnings("resource")
@Override
public Resource init() throws ResourceInstantiationException {
//initialise the list of annotations which we will match
if(definitionFileURL == null){
throw new ResourceInstantiationException(
"No URL provided for the definition file!");
}
String nicknameFile = null;
BufferedReader reader = null;
//at this point we have the definition file
try{
reader = new BomStrippingInputStreamReader(
definitionFileURL.openStream(), encoding);
String lineRead = null;
//boolean foundANickname = false;
while ((lineRead = reader.readLine()) != null){
int index = lineRead.indexOf(":");
if (index != -1){
String nameFile = lineRead.substring(0,index);
String nameList = lineRead.substring(index+1,lineRead.length());
if (nameList.equals("nickname")) {
if (minimumNicknameLikelihood == null) {
throw new ResourceInstantiationException(
"No value for the required parameter " +
"minimumNicknameLikelihood!");
}
nicknameFile = nameFile;
} else {
createAnnotList(nameFile,nameList);
}
}// if
}//while
reader.close();
URL nicknameURL = null;
if (nicknameFile != null)
nicknameURL = new URL(definitionFileURL.toURL(), nicknameFile);
this.orthoAnnotation = new BasicAnnotationOrthography(
personType,extLists,unknownType,nicknameURL,
minimumNicknameLikelihood, encoding);
initRules();
modifyRules(rules);
}catch(IOException ioe){
throw new ResourceInstantiationException(ioe);
}
finally {
IOUtils.closeQuietly(reader);
}
return this;
} // init()
/** Run the resource over the current document: match the name annotations
 * (and, optionally, Unknown annotations) and store the resulting coreference
 * chains as a document feature.
 */
@Override
public void execute() throws ExecutionException{
try{
//check the input
if(document == null) {
throw new ExecutionException(
"No document for namematch!"
);
}
fireStatusChanged("OrthoMatcher processing: " + document.getName());
// get the annotations from document
if ((annotationSetName == null)|| (annotationSetName.equals("")))
nameAllAnnots = document.getAnnotations();
else
nameAllAnnots = document.getAnnotations(annotationSetName);
//if none found, print warning and exit
if ((nameAllAnnots == null) || nameAllAnnots.isEmpty()) {
Out.prln("OrthoMatcher Warning: No annotations found for processing");
return;
}
//check if we've been run on this document before
//and clean the doc if needed
docCleanup();
@SuppressWarnings("unchecked")
Map<String, List<List<Integer>>> matchesMap = (Map<String, List<List<Integer>>>)document.getFeatures().
get(DOCUMENT_COREF_FEATURE_NAME);
// creates the cdg list from the document
//no need to create otherwise, coz already done in init()
if (!extLists)
cdg=orthoAnnotation.buildTables(nameAllAnnots);
//Match all name annotations and unknown annotations
matchNameAnnotations();
//used to check if the Orthomatcher works properly
//OrthoMatcherHelper.setMatchesPositions(nameAllAnnots);
// set the matches of the document
// determineMatchesDocument();
if (! matchesDocFeature.isEmpty()) {
if(matchesMap == null){
matchesMap = new HashMap<String, List<List<Integer>>>();
}
matchesMap.put(nameAllAnnots.getName(), matchesDocFeature);
// System.out.println("matchesMap is: " + matchesMap);
//we need to put it even if it was already present in order to trigger
//the update events
document.getFeatures().put(DOCUMENT_COREF_FEATURE_NAME, matchesMap);
//cannot do clear() as this has already been put on the document
//so I need a new one for the next run of matcher
matchesDocFeature = new ArrayList<List<Integer>>();
fireStatusChanged("OrthoMatcher completed");
}
}finally{
//make sure the cleanup happens even if there are errors.
// Out.prln("Processed strings" + processedAnnots.values());
//clean-up the internal data structures for next run
nameAllAnnots = null;
processedAnnots.clear();
annots2Remove.clear();
tokensMap.clear();
normalizedTokensMap.clear();
matchesDocFeature = new ArrayList<List<Integer>>();
longAnnot = null;
shortAnnot = null;
tokensLongAnnot = null;
tokensShortAnnot = null;
//if (log.isDebugEnabled()) OrthoMatcherHelper.saveUsedTable();
}
} // run()
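/*
 * Illustrative only: how client code might read back the coreference chains
 * that execute() stores on the document. The map layout (annotation-set name
 * mapped to lists of annotation-id chains) mirrors what is written above;
 * the variable names are assumptions and DOCUMENT_COREF_FEATURE_NAME is the
 * same constant used in execute().
 *
 *   @SuppressWarnings("unchecked")
 *   Map<String, List<List<Integer>>> corefMap = (Map<String, List<List<Integer>>>)
 *       doc.getFeatures().get(DOCUMENT_COREF_FEATURE_NAME);
 *   if (corefMap != null) {
 *     List<List<Integer>> chains = corefMap.get(doc.getAnnotations().getName());
 *     if (chains != null) {
 *       for (List<Integer> chain : chains) {
 *         // each chain holds the ids of annotations judged to corefer
 *         System.out.println("Coref chain: " + chain);
 *       }
 *     }
 *   }
 */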
protected void matchNameAnnotations() throws ExecutionException{
// go through all the annotation types
Iterator<String> iterAnnotationTypes = annotationTypes.iterator();
while (iterAnnotationTypes.hasNext()) {
String annotationType = iterAnnotationTypes.next();
AnnotationSet nameAnnots = nameAllAnnots.get(annotationType);
// continue if no such annotations exist
if (nameAnnots.isEmpty()) continue;
AnnotationSet tokensNameAS = nameAllAnnots.get(TOKEN_ANNOTATION_TYPE);
if (tokensNameAS.isEmpty()) continue;
ArrayList<Annotation> sortedNameAnnots = new ArrayList<Annotation>(nameAnnots);
Collections.sort(sortedNameAnnots,new OffsetComparator());
for (int snaIndex = 0;snaIndex < sortedNameAnnots.size();snaIndex++) {
Annotation tempAnnot = sortedNameAnnots.get(snaIndex);
Annotation nameAnnot = nameAllAnnots.get(tempAnnot.getId()); // Not sure if this matters
// get string and value
String annotString = orthoAnnotation.getStringForAnnotation(nameAnnot, document);
//convert to lower case if we are not doing a case sensitive match
if (!caseSensitive)
annotString = annotString.toLowerCase();
if (DEBUG) {
if (log.isDebugEnabled()) {
log.debug("Now processing the annotation: "
+ orthoAnnotation.getStringForAnnotation(nameAnnot, document) + " Id: " + nameAnnot.getId()
+ " Type: " + nameAnnot.getType() + " Offset: " + nameAnnot.getStartNode().getOffset());
}
}
// get the tokens
List<Annotation> tokens = new ArrayList<Annotation>(tokensNameAS.getContained(nameAnnot.getStartNode().getOffset(),
nameAnnot.getEndNode().getOffset()));
//if no tokens to match, do nothing
if (tokens.isEmpty()) {
if (log.isDebugEnabled()) {
log.debug("Didn't find any tokens for the following annotation. We will be unable to perform coref on this annotation. \n String: "
+ orthoAnnotation.getStringForAnnotation(nameAnnot, document) + " Id: " + nameAnnot.getId() + " Type: " + nameAnnot.getType());
}
continue;
}
Collections.sort(tokens, new gate.util.OffsetComparator());
//check that the tokens do not end after the name
//needed because the new tokeniser conflates
//strings with dashes, so "British Gas-style" is two tokens
//instead of three and "British Gas" cannot be matched properly
// tokens = checkTokens(tokens);
tokensMap.put(nameAnnot.getId(), tokens);
normalizedTokensMap.put(nameAnnot.getId(), new ArrayList<Annotation>(tokens));
//first check whether we have not matched such a string already
//if so, just consider it matched, don't bother calling the rules
// Exception: AB, Spock:
// Note that we require one-token Person annotations to be matched even if an identical string
// has been matched earlier because there could be multiple people named "David", for instance,
// on a page.
if (processedAnnots.containsValue(annotString) &&
(! (nameAnnot.getType().equals(personType) && (tokens.size() == 1)))) {
Annotation returnAnnot = orthoAnnotation.updateMatches(nameAnnot, annotString,processedAnnots,nameAllAnnots,matchesDocFeature);
if (returnAnnot != null) {
if (DEBUG) {
if (log.isDebugEnabled()) {
log.debug("Exact match criteria matched " + annotString + " from (id: " + nameAnnot.getId() + ", offset: " + nameAnnot.getStartNode().getOffset() + ") to " +
"(id: " + returnAnnot.getId() + ", offset: " + returnAnnot.getStartNode().getOffset() + ")");
}
}
processedAnnots.put(nameAnnot.getId(), annotString);
continue;
}
} else if (processedAnnots.isEmpty()) {
// System.out.println("First item put in processedAnnots: " + annotString);
processedAnnots.put(nameAnnot.getId(), annotString);
continue;
}
//if a person, then remove their title before matching
if (nameAnnot.getType().equals(personType)) {
annotString = orthoAnnotation.stripPersonTitle(annotString, nameAnnot,document,tokensMap,normalizedTokensMap,nameAllAnnots);
normalizePersonName(nameAnnot);
}
else if (nameAnnot.getType().equals(organizationType))
annotString = normalizeOrganizationName(annotString, nameAnnot);
if(null == annotString || "".equals(annotString) || tokens.isEmpty()) {
if (log.isDebugEnabled()) {
log.debug("Annotation ID " + nameAnnot.getId() + " of type" + nameAnnot.getType() +
" refers to a null or empty string or one with no tokens after normalization. Unable to process further.");
}
continue;
}
//otherwise try matching with previous annotations
matchWithPrevious(nameAnnot, annotString,sortedNameAnnots,snaIndex);
// Out.prln("Putting in previous " + nameAnnot + ": string " + annotString);
//finally add the current annotations to the processed map
processedAnnots.put(nameAnnot.getId(), annotString);
}//while through name annotations
if (matchingUnknowns) {
matchUnknown(sortedNameAnnots);
}
}//while through annotation types
}
protected void matchUnknown(ArrayList<Annotation> sortedAnnotationsForAType) throws ExecutionException {
//get all Unknown annotations
AnnotationSet unknownAnnots = nameAllAnnots.get(unknownType);
annots2Remove.clear();
if (unknownAnnots.isEmpty()) return;
AnnotationSet nameAllTokens = nameAllAnnots.get(TOKEN_ANNOTATION_TYPE);
if (nameAllTokens.isEmpty()) return;
Iterator<Annotation> iter = unknownAnnots.iterator();
//loop through the unknown annots
while (iter.hasNext()) {
Annotation unknown = iter.next();
// get string and value
String unknownString = orthoAnnotation.getStringForAnnotation(unknown, document);
//convert to lower case if we are not doing a case sensitive match
if (!caseSensitive)
unknownString = unknownString.toLowerCase();
// System.out.println("Now trying to match the unknown string: " + unknownString);
//get the tokens
List<Annotation> tokens = new ArrayList<Annotation>(nameAllTokens.getContained(
unknown.getStartNode().getOffset(),
unknown.getEndNode().getOffset()
));
if (tokens.isEmpty())
continue;
Collections.sort(tokens, new gate.util.OffsetComparator());
tokensMap.put(unknown.getId(), tokens);
normalizedTokensMap.put(unknown.getId(), tokens);
//first check whether we have not matched such a string already
//if so, just consider it matched, don't bother calling the rules
if (processedAnnots.containsValue(unknownString)) {
Annotation matchedAnnot = orthoAnnotation.updateMatches(unknown, unknownString,processedAnnots,nameAllAnnots,matchesDocFeature);
if (matchedAnnot == null) {
log.debug("Orthomatcher: Unable to find the annotation: " +
orthoAnnotation.getStringForAnnotation(unknown, document) +
" in matchUnknown");
}
else {
if (matchedAnnot.getType().equals(unknownType)) {
annots2Remove.put(unknown.getId(),
annots2Remove.get(matchedAnnot.getId()));
}
else
annots2Remove.put(unknown.getId(), matchedAnnot.getType());
processedAnnots.put(unknown.getId(), unknownString);
unknown.getFeatures().put("NMRule", unknownType);
continue;
}
}
//check if we should do sub-string matching in case it's hyphenated
//for example US-led
if (tokens.size() == 1
&& "hyphen".equals(unknown.getFeatures().get(TOKEN_KIND_FEATURE_NAME))) {
if (matchHyphenatedUnknowns(unknown, unknownString, iter))
continue;
}//if
// TODO: The below results in assigning the unknown to the last annotation that it matches in a document.
// It would probably be better to first start with things which precede the current unknown and then do
// annotations after
matchWithPrevious(unknown, unknownString,sortedAnnotationsForAType,sortedAnnotationsForAType.size());
} //while though unknowns
if (! annots2Remove.isEmpty()) {
Iterator<Integer> unknownIter = annots2Remove.keySet().iterator();
while (unknownIter.hasNext()) {
Integer unknId = unknownIter.next();
Annotation unknown = nameAllAnnots.get(unknId);
Integer newID = nameAllAnnots.add(
unknown.getStartNode(),
unknown.getEndNode(),
annots2Remove.get(unknId),
unknown.getFeatures()
);
nameAllAnnots.remove(unknown);
//change the id in the matches list
@SuppressWarnings("unchecked")
List<Integer> mList = (List<Integer>)unknown.getFeatures().
get(ANNOTATION_COREF_FEATURE_NAME);
mList.remove(unknId);
mList.add(newID);
}//while
}//if
}
private boolean matchHyphenatedUnknowns(Annotation unknown, String unknownString,
Iterator iter){
boolean matched = false;
//only take the substring before the hyphen
int stringEnd = unknownString.indexOf("-");
unknownString = unknownString.substring(0, stringEnd);
//check if we've already matched this string
//because only exact matches of the substring are considered
if (processedAnnots.containsValue(unknownString)) {
matched = true;
Annotation matchedAnnot = orthoAnnotation.updateMatches(unknown, unknownString,processedAnnots,nameAllAnnots,matchesDocFeature);
//only do the matching if not a person, because we do not match
//those on sub-strings
iter.remove();
String newType;
if (matchedAnnot.getType().equals(unknownType))
newType = annots2Remove.get(matchedAnnot.getId());
else
newType = matchedAnnot.getType();
Integer newID;
try {
newID = nameAllAnnots.add(
unknown.getStartNode().getOffset(),
unknown.getStartNode().getOffset() + stringEnd,
newType,
unknown.getFeatures()
);
} catch (InvalidOffsetException ex) {
throw new GateRuntimeException(ex.getMessage());
}
nameAllAnnots.remove(unknown);
//change the id in the matches list
@SuppressWarnings("unchecked")
List<Integer> mList = (List<Integer>)unknown.getFeatures().
get(ANNOTATION_COREF_FEATURE_NAME);
mList.remove(unknown.getId());
mList.add(newID);
}
return matched;
}
/**
* Attempt to match nameAnnot against all previous annotations of the same type, which are passed down
* in listOfThisType. Matches are tested in order from most recent to oldest.
* @param nameAnnot Annotation we are trying to match
* @param annotString Normalized string representation of annotation
* @param listOfThisType ArrayList of Annotations of the same type as nameAnnot
* @param startIndex Index in listOfThisType that we will start from in matching the current annotation
*/
protected void matchWithPrevious(Annotation nameAnnot, String annotString,
ArrayList<Annotation> listOfThisType,
int startIndex) {
boolean matchedUnknown = false;
// Out.prln("matchWithPrevious now processing: " + annotString);
for (int curIndex = startIndex - 1;curIndex >= 0;curIndex--) {
Integer prevId = listOfThisType.get(curIndex).getId();
Annotation prevAnnot = nameAllAnnots.get(prevId); // Note that this line probably isn't necessary anymore
//check if the two are from the same type or the new one is unknown
if (prevAnnot == null || (! prevAnnot.getType().equals(nameAnnot.getType())
&& ! nameAnnot.getType().equals(unknownType))
)
continue;
//do not compare two unknown annotations either
//they are only matched to those of known types
if ( nameAnnot.getType().equals(unknownType)
&& prevAnnot.getType().equals(unknownType))
continue;
//check if we have already matched this annotation to the new one
if (orthoAnnotation.matchedAlready(nameAnnot, prevAnnot,matchesDocFeature,nameAllAnnots) )
continue;
//now changed to a rule, here we just match by gender
if (prevAnnot.getType().equals(personType)) {
String prevGender =
(String) prevAnnot.getFeatures().get(PERSON_GENDER_FEATURE_NAME);
String nameGender =
(String) nameAnnot.getFeatures().get(PERSON_GENDER_FEATURE_NAME);
if ( prevGender != null
&& nameGender != null
&& ( (nameGender.equalsIgnoreCase("female")
&&
prevGender.equalsIgnoreCase("male")
)
||
(prevGender.equalsIgnoreCase("female")
&& nameGender.equalsIgnoreCase("male")
)
)
) //if condition
continue; //we don't have a match if the two genders are different
}//if
//if the two annotations match
//
// A. Borthwick, Spock: If the earlier annotation is shorter than the current annotation and it
// has already been matched with a longer annotation, then don't match it with the current annotation.
// Reasoning is that with the sequence David Jones . . . David . . . David Smith, we don't want to match
// David Smith with David. However, with the sequence, David . . . David Jones, it's okay to match the
// shorter version with the longer, because it hasn't already been matched with a longer.
boolean prevAnnotUsedToMatchWithLonger = prevAnnot.getFeatures().containsKey("matchedWithLonger");
if (matchAnnotations(nameAnnot, annotString, prevAnnot)) {
orthoAnnotation.updateMatches(nameAnnot, prevAnnot,matchesDocFeature,nameAllAnnots);
if (DEBUG) {
log.debug("Just matched nameAnnot " + nameAnnot.getId() + " with prevAnnot " + prevAnnot.getId());
}
if (!prevAnnotUsedToMatchWithLonger && prevAnnot.getFeatures().containsKey("matchedWithLonger")) {
// We have just matched the previous annotation with a longer annotation for the first time. We need
// to propagate the matchedWithLonger property to all other annotations which coreffed with the previous annotation
// so that we don't match them with a longer annotation
propagatePropertyToExactMatchingMatches(prevAnnot,"matchedWithLonger",true);
}
//if unknown annotation, we need to change to the new type
if (nameAnnot.getType().equals(unknownType)) {
matchedUnknown = true;
if (prevAnnot.getType().equals(unknownType))
annots2Remove.put(nameAnnot.getId(),
annots2Remove.get(prevAnnot.getId()));
else
annots2Remove.put(nameAnnot.getId(), prevAnnot.getType());
//also put an attribute to indicate that
nameAnnot.getFeatures().put("NMRule", unknownType);
}//if unknown
break; //no need to match further
}//if annotations matched
}//while through previous annotations
if (matchedUnknown)
processedAnnots.put(nameAnnot.getId(), annotString);
}//matchWithPrevious
protected void propagatePropertyToExactMatchingMatches(Annotation updateAnnot,String featureName,Object value) {
try {
@SuppressWarnings("unchecked")
List<Integer> matchesList = (List<Integer>) updateAnnot.getFeatures().get(ANNOTATION_COREF_FEATURE_NAME);
if ((matchesList == null) || matchesList.isEmpty()) {
return;
}
else {
String updateAnnotString = orthoAnnotation.getStringForAnnotation(updateAnnot, document).toLowerCase();
for (Integer nextId : matchesList) {
Annotation a = nameAllAnnots.get(nextId);
if (orthoAnnotation.fuzzyMatch(orthoAnnotation.getStringForAnnotation(a, document),updateAnnotString)) {
if (DEBUG) {
log.debug("propogateProperty: " + featureName + " " + value + " from: " + updateAnnot.getId() + " to: " + a.getId());
}
a.getFeatures().put(featureName, value);
}
}
}
}
catch (Exception e) {
log.error("Error in propogatePropertyToExactMatchingMatches", e);
}
}
protected boolean matchAnnotations(Annotation newAnnot, String annotString,
Annotation prevAnnot) {
//do not match two annotations that overlap
if (newAnnot.overlaps(prevAnnot))
return false;
// find which annotation string of the two is longer
// this is useful for some of the matching rules
String prevAnnotString = processedAnnots.get(prevAnnot.getId());
// Out.prln("matchAnnotations processing " + annotString + " and " + prevAnnotString);
if (prevAnnotString == null) {
// Out.prln("We discovered that the following string is null!: " + prevAnnot.getId() +
// " For the previous annotation " + getStringForAnnotation(prevAnnot, document) +
// " which has annotation type " + prevAnnot.getType() +
// " Tried to compared it to the annotation string " + annotString);
return false;
}
String longName = prevAnnotString;
String shortName = annotString;
longAnnot = prevAnnot;
shortAnnot = newAnnot;
boolean longerPrevious = true;
if (shortName.length()>longName.length()) {
String temp = longName;
longName = shortName;
shortName = temp;
Annotation tempAnn = longAnnot;
longAnnot = shortAnnot;
shortAnnot = tempAnn;
longerPrevious = false;
}//if
tokensLongAnnot = tokensMap.get(longAnnot.getId());
normalizedTokensLongAnnot = normalizedTokensMap.get(longAnnot.getId());
tokensShortAnnot = tokensMap.get(shortAnnot.getId());
normalizedTokensShortAnnot = normalizedTokensMap.get(shortAnnot.getId());
@SuppressWarnings("unchecked")
List<Integer> matchesList = (List<Integer>) prevAnnot.getFeatures().
get(ANNOTATION_COREF_FEATURE_NAME);
if (matchesList == null || matchesList.isEmpty())
return apply_rules_namematch(prevAnnot.getType(), shortName,longName,
prevAnnot,newAnnot,longerPrevious);
//if these two match, then let's see if all the other matching one will too
//that's needed, because sometimes names can share a token (e.g., first or
//last name) but not be the same
if (apply_rules_namematch(prevAnnot.getType(), shortName,longName,prevAnnot,newAnnot,
longerPrevious)) {
/**
* Check whether we need to ensure that there is a match with the rest
* of the matching annotations, because the rule requires that
* transitivity is not assumed.
*/
if (allMatchingNeeded) {
allMatchingNeeded = false;
List<Integer> toMatchList = new ArrayList<Integer>(matchesList);
// if (newAnnot.getType().equals(unknownType))
// Out.prln("Matching new " + annotString + " with annots " + toMatchList);
toMatchList.remove(prevAnnot.getId());
return matchOtherAnnots(toMatchList, newAnnot, annotString);
} else
return true;
}
return false;
}
/** This method checks whether the new annotation matches
* all annotations given in the toMatchList (it contains ids)
* The idea is that the new annotation needs to match all those,
* because assuming transitivity does not always work, when
* two different entities share a common token: e.g., BT Cellnet
* and BT and British Telecom.
*/
protected boolean matchOtherAnnots(List<Integer> toMatchList, Annotation newAnnot,
String annotString) {
//if the list is empty, then we're matching all right :-)
if (toMatchList.isEmpty())
return true;
boolean matchedAll = true;
int i = 0;
while (matchedAll && i < toMatchList.size()) {
Annotation prevAnnot = nameAllAnnots.get(toMatchList.get(i));
// find which annotation string of the two is longer
// this is useful for some of the matching rules
String prevAnnotString = processedAnnots.get(prevAnnot.getId());
if (prevAnnotString == null)
try {
prevAnnotString = document.getContent().getContent(
prevAnnot.getStartNode().getOffset(),
prevAnnot.getEndNode().getOffset()
).toString();
} catch (InvalidOffsetException ioe) {
return false;
}//try
String longName = prevAnnotString;
String shortName = annotString;
longAnnot = prevAnnot;
shortAnnot = newAnnot;
boolean longerPrevious = true;
if (shortName.length()>=longName.length()) {
String temp = longName;
longName = shortName;
shortName = temp;
Annotation tempAnn = longAnnot;
longAnnot = shortAnnot;
shortAnnot = tempAnn;
longerPrevious = false;
}//if
tokensLongAnnot = tokensMap.get(longAnnot.getId());
normalizedTokensLongAnnot = normalizedTokensMap.get(longAnnot.getId());
tokensShortAnnot = tokensMap.get(shortAnnot.getId());
normalizedTokensShortAnnot = normalizedTokensMap.get(shortAnnot.getId());
matchedAll = apply_rules_namematch(prevAnnot.getType(), shortName,longName,prevAnnot,newAnnot,
longerPrevious);
// if (newAnnot.getType().equals(unknownType))
// Out.prln("Loop: " + shortName + " and " + longName + ": result: " + matchedAll);
i++;
}//while
return matchedAll;
}
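/*
 * Worked example (assumed mentions) of the check above: if "BT Cellnet" and
 * "BT" already corefer and a new mention "British Telecom" matches "BT" on
 * its own, matchOtherAnnots additionally requires it to match "BT Cellnet";
 * that comparison fails, so the shared token "BT" does not collapse the two
 * organisations into a single chain.
 */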
@SuppressWarnings("unchecked")
protected void docCleanup() {
Object matchesValue = document.getFeatures().get(DOCUMENT_COREF_FEATURE_NAME);
if (matchesValue != null && (matchesValue instanceof Map))
((Map<String, List<List<Integer>>>)matchesValue).remove(nameAllAnnots.getName());
else if (matchesValue != null) {
document.getFeatures().put(DOCUMENT_COREF_FEATURE_NAME, new HashMap<String, List<List<Integer>>>());
}
//get all annotations that have a matches feature
HashSet<String> fNames = new HashSet<String>();
fNames.add(ANNOTATION_COREF_FEATURE_NAME);
AnnotationSet annots =
nameAllAnnots.get(null, fNames);
// Out.prln("Annots to cleanup" + annots);
if (annots == null || annots.isEmpty())
return;
Iterator<Annotation> iter = annots.iterator();
while (iter.hasNext()) {
iter.next().getFeatures().remove(ANNOTATION_COREF_FEATURE_NAME);
} //while
}//cleanup
static Pattern periodPat = Pattern.compile("[\\.]+");
protected void normalizePersonName (Annotation annot) throws ExecutionException {
List<Annotation> tokens = normalizedTokensMap.get(annot.getId());
for (int i = tokens.size() - 1; i >= 0; i--) {
String tokenString = ((String) tokens.get(i).getFeatures().get(TOKEN_STRING_FEATURE_NAME));
String kind = (String) tokens.get(i).getFeatures().get(TOKEN_KIND_FEATURE_NAME);
//String category = (String) tokens.get(i).getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME);
if (!caseSensitive) {
tokenString = tokenString.toLowerCase();
}
// log.debug("tokenString: " + tokenString + " kind: " + kind + " category: " + category);
if (kind.equals(PUNCTUATION_VALUE) ) {
// log.debug("Now tagging it!");
tokens.get(i).getFeatures().put("ortho_stop", true);
}
}
List<Annotation> normalizedTokens = new ArrayList<Annotation>(tokens);
for (int j = normalizedTokens.size() - 1; j >= 0;j--) {
if (normalizedTokens.get(j).getFeatures().containsKey("ortho_stop")) {
// log.debug("Now removing " + normalizedTokens.get(j).getFeatures().get(TOKEN_STRING_FEATURE_NAME));
normalizedTokens.remove(j);
}
}
// log.debug("normalizedTokens size is: " + normalizedTokens.size());
normalizedTokensMap.put(annot.getId(), normalizedTokens);
}
/** return an organization name without its designator and without a leading "The" */
protected String normalizeOrganizationName (String annotString, Annotation annot){
List<Annotation> tokens = tokensMap.get(annot.getId());
//strip starting The first
if ( ((String) tokens.get(0).getFeatures().get(TOKEN_STRING_FEATURE_NAME))
.equalsIgnoreCase(THE_VALUE))
tokens.remove(0);
if (tokens.size() > 0) {
// New code by A. Borthwick of Spock Networks
// June 13, 2008
// Strip everything on the cdg list, which now encompasses not just cdg's, but also other stopwords
// Start from the right side so we don't mess up the arraylist
for (int i = tokens.size() - 1; i >= 0; i--) {
String tokenString = ((String) tokens.get(i).getFeatures().get(TOKEN_STRING_FEATURE_NAME));
String kind = (String) tokens.get(i).getFeatures().get(TOKEN_KIND_FEATURE_NAME);
String category = (String) tokens.get(i).getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME);
if (!caseSensitive) {
tokenString = tokenString.toLowerCase();
}
// Out.prln("tokenString: " + tokenString + " kind: " + kind + " category: " + category);
if (kind.equals(PUNCTUATION_VALUE) ||
( (category != null) && (category.equals("DT") || category.equals("IN")) )
|| cdg.contains(tokenString)) {
// Out.prln("Now tagging it!");
tokens.get(i).getFeatures().put("ortho_stop", true);
}
}
// AB, Spock: Need to check for CDG even for 1 token so we don't automatically match
// a one-token annotation called "Company", for instance
String compareString = (String) tokens.get(tokens.size()-1).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
if (!caseSensitive) {
compareString = compareString.toLowerCase();
}
if (cdg.contains(compareString)) {
tokens.remove(tokens.size()-1);
}
}
ArrayList<Annotation> normalizedTokens = new ArrayList<Annotation>(tokens);
for (int j = normalizedTokens.size() - 1; j >= 0;j--) {
if (normalizedTokens.get(j).getFeatures().containsKey("ortho_stop")) {
normalizedTokens.remove(j);
}
}
normalizedTokensMap.put(annot.getId(), normalizedTokens);
StringBuffer newString = new StringBuffer(50);
for (int i = 0; i < tokens.size(); i++){
newString.append((String) tokens.get(i).getFeatures().get(TOKEN_STRING_FEATURE_NAME) );
if (i != tokens.size()-1)
newString.append(" ");
}
// Out.prln("Strip CDG returned: " + newString + "for string " + annotString);
if (caseSensitive)
return newString.toString();
return newString.toString().toLowerCase();
}
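/*
 * Worked example (assumed inputs) of the normalisation above, with
 * caseSensitive == false and both "company" and "ltd" assumed to be on the
 * cdg list: for an annotation spanning "The General Electric Company Ltd",
 * the leading "The" and the trailing cdg token "Ltd" are dropped, so the
 * returned matching string is "general electric company", while
 * normalizedTokensMap keeps only [General, Electric] because "Company" is
 * tagged ortho_stop as a cdg word.
 */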
/** creates the lookup tables */
protected void createAnnotList(String nameFile, String nameList)
throws IOException {
// create the relative URL
URL fileURL = new URL(definitionFileURL.toURL(), nameFile);
BufferedReader bufferedReader = null;
try {
bufferedReader =
new BomStrippingInputStreamReader(fileURL.openStream(), encoding);
String lineRead = null;
while((lineRead = bufferedReader.readLine()) != null) {
if(nameList.compareTo(CDGLISTNAME) == 0) {
Matcher matcher = punctPat.matcher(lineRead.toLowerCase().trim());
lineRead = matcher.replaceAll(" ").trim();
if(caseSensitive)
cdg.add(lineRead);
else cdg.add(lineRead.toLowerCase());
}// if
else {
int index = lineRead.indexOf("£");
if(index != -1) {
String expr = lineRead.substring(0, index);
// if not case-sensitive, we need to downcase all strings
if(!caseSensitive) expr = expr.toLowerCase();
String code = lineRead.substring(index + 1, lineRead.length());
if(nameList.equals(ALIASLISTNAME)) {
alias.put(expr, code);
} else if(nameList.equals(ARTLISTNAME)) {
def_art.put(expr, code);
} else if(nameList.equals(PREPLISTNAME)) {
prepos.put(expr, code);
} else if(nameList.equals(CONNECTORLISTNAME)) {
connector.put(expr, code);
} else if(nameList.equals(SPURLISTNAME)) {
spur_match.put(expr, code);
}
}// if
}// else
}// while
} finally {
IOUtils.closeQuietly(bufferedReader);
}
}// createAnnotList
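/*
 * Format sketch (inferred from the parsing code above; the file names and the
 * sample entries are assumptions, not the shipped resources): the definition
 * file holds one "file:listname" pair per line, cdg list files hold one plain
 * string per line, and the other list files hold "expression£code" entries.
 *
 *   listsNM.def:   cdg.lst:cdg
 *                  alias.lst:alias
 *                  nickname.lst:nickname
 *
 *   cdg.lst:       ltd
 *                  corp
 *
 *   alias.lst:     International Business Machines£IBM
 */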
/**
* This is the skeleton of a function which should be available in OrthoMatcher to allow a pairwise comparison of two name strings
* It should eventually be made public. It is private here (and thus non-functional) because OrthoMatcher is currently reliant
* on the tokenization of the names, which are held in the global variables tokensShortAnnot and tokensLongAnnot
*
* @param name1
* @param name2
* @return true if the two names indicate the same person
*/
@SuppressWarnings("unused")
private boolean pairwise_person_name_match(String name1, String name2) {
String shortName,longName;
if (name1.length() > name2.length()) {
longName = name1;
shortName = name2;
}
else {
longName = name2;
shortName = name1;
}
if (rules.get(0).value(longName,shortName)) {//matchRule0(longName,shortName)
return false;
}
else {
if (longName.equals(shortName) || rules.get(2).value(longName, shortName) ||
rules.get(3).value(longName, shortName)) {
return true;
}
else {
return (rules.get(0).value(longName, shortName));
// boolean throwAway[] = new boolean[17];
// return basic_person_match_criteria(shortName,longName,throwAway);
// The above doesn't work because basic_person_match_criteria is reliant on the global
// variables tokensShortAnnot and tokensLongAnnot so I just call what I can directly
}
}
}
/**
* basic_person_match_criteria
* Note that this function relies on various global variables in some other match rules.
*/
private boolean basic_person_match_criteria(String shortName,
String longName, boolean mr[]) {
if ( // For 4, 5, 14, and 15, need to mark shorter annot
//kalina: added 16, so it matches names that contain more than one first name and one last name
OrthoMatcherHelper.executeDisjunction(rules, new int[] {1,5,6,13,15,16},longName,shortName,mr)
) {
return true;
}
return false;
}
/** apply_rules_namematch: apply rules similarly to lasie1.5's namematch */
private boolean apply_rules_namematch(String annotationType, String shortName,
String longName,Annotation prevAnnot,
Annotation followAnnot,
boolean longerPrevious) {
boolean mr[] = new boolean[rules.size()];
// first apply rule for spurious matches i.e. rule0
if (DEBUG) {
log.debug("Now matching " + longName + "(id: " + longAnnot.getId() + ") to "
+ shortName + "(id: " + shortAnnot.getId() + ")");
}
if (rules.get(0).value(longName,shortName))
return false;
if (
(// rules for all annotations
//no longer use rule1, coz I do the check for same string via the hash table
OrthoMatcherHelper.executeDisjunction(rules, new int[] {2,3},longName,shortName,mr)
) // rules for all annotations
||
(// rules for organisation annotations
(annotationType.equals(organizationType)
//ACE addition
|| annotationType.equals("Facility")
)
&&
// Should basically only match when you have a match of all tokens other than
// CDG's and function words
(
(!highPrecisionOrgs && OrthoMatcherHelper.executeDisjunction(rules,new int[] {4,6,7,8,9,10,11,12,14},longName,shortName,mr))
||
(highPrecisionOrgs && OrthoMatcherHelper.executeDisjunction(rules,new int[] {7,8,10,11,17},longName,shortName,mr))
)
)
) {// rules for organisation annotations
return true;
}
if (// rules for person annotations
( annotationType.equals(personType))) {
if (noMatchRule1(longName, shortName,prevAnnot, longerPrevious) ||
noMatchRule2(longName, shortName)) {
// Out.prln("noMatchRule1 rejected match between " + longName + " and " + shortName);
return false;
}
else {
if ( basic_person_match_criteria(shortName,longName,mr))
{
if ((longName.length() != shortName.length()) && (mr[4] || mr[5] || mr[14] || mr[15])) {
if (longerPrevious) {
followAnnot.getFeatures().put("matchedWithLonger", true);
}
else {
prevAnnot.getFeatures().put("matchedWithLonger", true);
}
}
else if ((longName.length() == shortName.length()) && (mr[1])) {
if (prevAnnot.getFeatures().containsKey("matchedWithLonger")) {
followAnnot.getFeatures().put("matchedWithLonger", true);
}
}
return true;
}
return false;
}
}
return false;
}//apply_rules
/** set the extLists flag */
@Optional
@CreoleParameter(comment="External lists otherwise internal", defaultValue="true")
public void setExtLists(Boolean newExtLists) {
extLists = newExtLists.booleanValue();
}//setextLists
/** set the caseSensitive flag */
@Optional
@CreoleParameter(comment="Should this resource diferentiate on case?",defaultValue="false")
public void setCaseSensitive(Boolean newCase) {
caseSensitive = newCase.booleanValue();
}//setCaseSensitive
/** set the annotation set name*/
@RunTime
@Optional
@CreoleParameter(comment="Annotation set name where are the annotation types (annotationTypes)")
public void setAnnotationSetName(String newAnnotationSetName) {
annotationSetName = newAnnotationSetName;
}//setAnnotationSetName
/** set the types of the annotations*/
@RunTime
@Optional
@CreoleParameter(comment="Name of the annotation types to use", defaultValue="Organization;Person;Location;Date")
public void setAnnotationTypes(List<String> newType) {
annotationTypes = newType;
}//setAnnotationTypes
/** set whether to process the Unknown annotations*/
@Optional
@CreoleParameter(comment="Should we process 'Unknown' annotations?", defaultValue="true")
public void setProcessUnknown(Boolean processOrNot) {
this.matchingUnknowns = processOrNot.booleanValue();
}//setProcessUnknown
@Optional
@CreoleParameter(comment="Annotation name for the organizations", defaultValue="Organization")
public void setOrganizationType(String newOrganizationType) {
organizationType = newOrganizationType;
}//setOrganizationType
@Optional
@CreoleParameter(comment="Annotation name for the persons", defaultValue="Person")
public void setPersonType(String newPersonType) {
personType = newPersonType;
}//setPersonType
/**get the name of the annotation set*/
public String getAnnotationSetName() {
return annotationSetName;
}//getAnnotationSetName
/** get the types of the annotation*/
public List<String> getAnnotationTypes() {
return annotationTypes;
}//getAnnotationTypes
public String getOrganizationType() {
return organizationType;
}
public String getPersonType() {
return personType;
}
public Boolean getExtLists() {
return extLists;
}
/** Are we running in a case-sensitive mode?*/
public Boolean getCaseSensitive() {
return caseSensitive;
}
/** Return whether or not we're processing the Unknown annots*/
public Boolean getProcessUnknown() {
return matchingUnknowns;
}
/**
No Match Rule 1:
Avoids the problem of matching
David Jones ...
David ...
David Smith
Since "David" was matched with David Jones, we don't match David with David Smith.
*/
public boolean noMatchRule1(String s1,
String s2,Annotation previousAnnot, boolean longerPrevious) {
// if (DEBUG) {
// try {
// String annotString = getStringForAnnotation(previousAnnot, document );
// log.debug("Previous annotation was " + annotString + "(id: " + previousAnnot.getId() + ")" + " features are " + previousAnnot.getFeatures());
// }
// catch (ExecutionException e) {}
// }
if (longerPrevious || !previousAnnot.getFeatures().containsKey("matchedWithLonger")) {
return false;
}
else {
return true;
}
}//noMatchRule1
/**
* returns true if it detects a middle token which indicates that the name string contains a nickname or a
* compound last name
*/
private boolean detectBadMiddleTokens(List<Annotation> tokArray) {
for (int j = 1;j < tokArray.size() - 1;j++) {
String currentToken = (String) tokArray.get(j).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
Matcher matcher = badMiddleTokens.matcher(currentToken.toLowerCase().trim());
if (matcher.find()) {
// We have found a case of a ", ',
return true;
}
}
return false;
}
/**
* NoMatch Rule #2: Do we have a mismatch of middle initial?
* Condition(s): Only applies to person names with more than two tokens in the name
*
* Want George W. Bush != George H. W. Bush and George Walker Bush != George Herbert Walker Bush
* and
* John T. Smith != John Q. Smith
* however
* John T. Smith == John Thomas Smith
* be careful about
* Hillary Rodham Clinton == Hillary Rodham-Clinton
* be careful about
* Carlos Bueno de Lopez == Bueno de Lopez
* and
* Cynthia Morgan de Rothschild == Cynthia de Rothschild
*/
@SuppressWarnings("unused")
public boolean noMatchRule2(String s1,String s2) {
if (normalizedTokensLongAnnot.size()>2 && normalizedTokensShortAnnot.size()>2) {
boolean retval = false;
if (normalizedTokensLongAnnot.size() != normalizedTokensShortAnnot.size()) {
String firstNameLong = (String) normalizedTokensLongAnnot.get(0).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
String firstNameShort = (String) normalizedTokensShortAnnot.get(0).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
String lastNameLong = (String) normalizedTokensLongAnnot.get(normalizedTokensLongAnnot.size() - 1).
getFeatures().get(TOKEN_STRING_FEATURE_NAME);
String lastNameShort = (String) normalizedTokensShortAnnot.get(normalizedTokensShortAnnot.size() - 1).
getFeatures().get(TOKEN_STRING_FEATURE_NAME);
if (rules.get(1).value(firstNameLong,firstNameShort) &&
(rules.get(1).value(lastNameLong,lastNameShort))) {
// Must have a match on first and last name for this non-match rule to take effect when the number of tokens differs
if (detectBadMiddleTokens(tokensLongAnnot) || detectBadMiddleTokens(tokensShortAnnot)) {
// Exclude the William (Bill) H. Gates vs. William H. Gates case and the
// Cynthia Morgan de Rothschild vs. Cynthia de Rothschild case
if (DEBUG && log.isDebugEnabled()) {
log.debug("noMatchRule2Name did not non-match because of bad middle tokens " + s1 + "(id: " + longAnnot.getId() + ") to "
+ s2+ "(id: " + shortAnnot.getId() + ")");
}
return false;
}
else {
// Covers the George W. Bush vs George H. W. Bush and George Walker Bush vs. George Herbert Walker Bush cases
retval = true;
}
}
}
else {
for (int i = 1; i < normalizedTokensLongAnnot.size() - 1;i++) {
String s1_middle = (String) normalizedTokensLongAnnot.get(i).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
String s2_middle = (String) normalizedTokensShortAnnot.get(i).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
if (!caseSensitive) {
s1_middle = s1_middle.toLowerCase();
s2_middle = s2_middle.toLowerCase();
}
// log.debug("noMatchRule2 comparing substring " + s1_middle + " to " + s2_middle);
if (!(rules.get(1).value(s1_middle,s2_middle) ||
OrthoMatcherHelper.initialMatch(s1_middle, s2_middle))) {
// We found a mismatching middle name
retval = true;
break;
}
}
}
if (retval && log.isDebugEnabled() && DEBUG) {
log.debug("noMatchRule2Name non-matched " + s1 + "(id: " + longAnnot.getId() + ") to "
+ s2+ "(id: " + shortAnnot.getId() + ")");
}
return retval;
} // if (normalizedTokensLongAnnot.size()>2 && normalizedTokensShortAnnot.size()>2)
return false;
}//noMatchRule2
@CreoleParameter(comment="The URL to the definition file", defaultValue="resources/othomatcher/listsNM.def", suffixes="def")
public void setDefinitionFileURL(ResourceReference definitionFileURL) {
this.definitionFileURL = definitionFileURL;
}
@Deprecated
public void setDefinitionFileURL(URL definitionFileURL) {
try {
this.setDefinitionFileURL(new ResourceReference(definitionFileURL));
} catch (URISyntaxException e) {
throw new RuntimeException("Error converting URL to ResourceReference", e);
}
}
public ResourceReference getDefinitionFileURL() {
return definitionFileURL;
}
@CreoleParameter(comment="The encoding used for reading the definition file", defaultValue="UTF-8")
public void setEncoding(String encoding) {
this.encoding = encoding;
}
public String getEncoding() {
return encoding;
}
public Double getMinimumNicknameLikelihood() {
return minimumNicknameLikelihood;
}
@CreoleParameter(comment="Minimum likelihood that a name is a nickname", defaultValue="0.50")
public void setMinimumNicknameLikelihood(Double minimumNicknameLikelihood) {
this.minimumNicknameLikelihood = minimumNicknameLikelihood;
}
/**
* @return the highPrecisionOrgs
*/
public Boolean getHighPrecisionOrgs() {
return highPrecisionOrgs;
}
/**
* @param highPrecisionOrgs the highPrecisionOrgs to set
*/
@Optional
@CreoleParameter(comment="Use very safe features for matching orgs, such as ACME = ACME, Inc.", defaultValue="false")
public void setHighPrecisionOrgs(Boolean highPrecisionOrgs) {
this.highPrecisionOrgs = highPrecisionOrgs;
}
public void setOrthography(AnnotationOrthography orthography) {
this.orthoAnnotation = orthography;
}
public AnnotationOrthography getOrthography() {
return orthoAnnotation;
}
static Pattern punctPat = Pattern.compile("[\\p{Punct}]+");
// The UTF characters are right and left double and single curly quotes
static Pattern badMiddleTokens = Pattern.compile("[\u201c\u201d\u2018\u2019\'\\(\\)\"]+|^de$|^von$");
}