gate.creole.orthomatcher.BasicAnnotationOrthography Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of annie Show documentation
Show all versions of annie Show documentation
ANNIE is a general purpose information extraction system that
provides the building blocks of many other GATE applications.
package gate.creole.orthomatcher;
import static gate.creole.orthomatcher.OrthoMatcherHelper.getStringForSpan;
import static gate.creole.orthomatcher.OrthoMatcherHelper.round2Places;
import java.io.BufferedReader;
import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import org.apache.commons.io.IOUtils;
import org.apache.log4j.Logger;
import gate.Annotation;
import gate.AnnotationSet;
import gate.Document;
import gate.Factory;
import gate.FeatureMap;
import gate.creole.ANNIEConstants;
import gate.creole.ExecutionException;
import gate.util.BomStrippingInputStreamReader;
import gate.util.Err;
import gate.util.InvalidOffsetException;
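/**
 * An {@link AnnotationOrthography} that supplies the primary behaviour of the
 * OrthoMatcher processing resource in GATE: annotation string normalisation,
 * nickname-aware fuzzy matching, person-title stripping and the bookkeeping of
 * the match lists stored on annotations.
 */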
public class BasicAnnotationOrthography implements AnnotationOrthography {
private final boolean extLists;
private final String personType;
private final String unknownType;
private Map> nicknameMap =
new HashMap>();
private final Double minimumNicknameLikelihood;
/** Class-wide log4j logger; protected, so it is visible to subclasses. */
protected static final Logger log =
    Logger.getLogger(BasicAnnotationOrthography.class);

/**
 * Stores the configuration and, if a nickname file is given, loads it.
 * A failure to read the file is logged as a warning and otherwise ignored,
 * leaving whatever part of the nickname table had been loaded.
 *
 * @param personType annotation type used for gender propagation
 * @param extLists whether buildTables should skip the Lookup-based "cdg" scan
 * @param unknownType annotation type treated as compatible with any type
 * @param nicknameFile location of the nickname file, or null for none
 * @param minimumNicknameLikelihood threshold used while loading nicknames
 * @param encoding character encoding of the nickname file
 */
public BasicAnnotationOrthography(String personType, boolean extLists,
    String unknownType, URL nicknameFile, Double minimumNicknameLikelihood,
    String encoding) {
  // The threshold must be in place before initNicknames reads it.
  this.minimumNicknameLikelihood = minimumNicknameLikelihood;
  this.unknownType = unknownType;
  this.extLists = extLists;
  this.personType = personType;

  if (nicknameFile == null) {
    return;
  }
  try {
    this.initNicknames(encoding, nicknameFile);
  } catch (IOException e) {
    log.warn("Could not load nickname map.", e);
  }
}
/**
 * Returns the string that OrthoMatcherHelper.getStringForSpan yields for the
 * annotation's start and end offsets, with every run of whitespace replaced
 * by a single space.
 *
 * @param a the annotation whose span is read
 * @param d the document passed on to getStringForSpan
 * @return the whitespace-normalised string
 * @throws ExecutionException propagated from getStringForSpan, the only call
 *         here that can raise it
 */
@Override
public String getStringForAnnotation(Annotation a, gate.Document d)
    throws ExecutionException {
  final Long from = a.getStartNode().getOffset();
  final Long to = a.getEndNode().getOffset();
  // Collapse line breaks, tabs and repeated blanks into one space each.
  return getStringForSpan(from, to, d).replaceAll("\\s+", " ");
}
/**
 * Compares two strings ignoring case and, failing that, through the nickname
 * table in both directions.
 *
 * @return true if the lower-cased strings are equal, or if either one is a
 *         key of the nickname table whose value set contains the other
 */
@Override
public boolean fuzzyMatch(String s1, String s2) {
  final String first = s1.toLowerCase();
  final String second = s2.toLowerCase();
  if (first.equals(second)) {
    return true;
  }

  // first used as the table key, second as one of its mapped names
  final Set<String> namesForFirst = nicknameMap.get(first);
  if (namesForFirst != null && namesForFirst.contains(second)) {
    return true;
  }

  // and the other way round
  final Set<String> namesForSecond = nicknameMap.get(second);
  return namesForSecond != null && namesForSecond.contains(first);
}
/**
 * Checks that every token of firstName that is not marked as a stop word has
 * a counterpart in secondName.
 *
 * A token counts as a stop word when its feature map contains the key
 * "ortho_stop". Token strings are read from the feature named by
 * TOKEN_STRING_FEATURE_NAME and compared with
 * OrthoMatcherHelper.straightCompare.
 *
 * @param firstName tokens that must all be found (stop words excepted)
 * @param secondName tokens to search in
 * @param TOKEN_STRING_FEATURE_NAME name of the feature holding a token's string
 * @param caseSensitive passed through to straightCompare
 * @return true if all of the tokens in firstName are either found in
 *         secondName or are stop words
 */
@Override
public boolean allNonStopTokensInOtherAnnot(List<Annotation> firstName,
    List<Annotation> secondName, String TOKEN_STRING_FEATURE_NAME,
    boolean caseSensitive) {
  for (Annotation token : firstName) {
    if (token.getFeatures().containsKey("ortho_stop")) {
      continue; // stop words never need a counterpart
    }
    String tokenString =
        (String) token.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
    boolean foundInSecond = false;
    for (Annotation candidate : secondName) {
      String candidateString =
          (String) candidate.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
      if (OrthoMatcherHelper.straightCompare(tokenString, candidateString,
          caseSensitive)) {
        foundInSecond = true;
        break;
      }
    }
    if (!foundInSecond) {
      return false;
    }
  }
  return true;
}
/**
* Return a person name without a title. Also remove title from global
* variable tokensMap
*/
@Override
public String stripPersonTitle(String annotString, Annotation annot,
Document doc, Map> tokensMap,
Map> normalizedTokensMap, AnnotationSet nameAllAnnots)
throws ExecutionException {
FeatureMap queryFM = Factory.newFeatureMap();
// get the offsets
Long startAnnot = annot.getStartNode().getOffset();
Long endAnnot = annot.getEndNode().getOffset();
// determine "Lookup" annotation set
queryFM.clear();
queryFM.put("majorType", "title");
AnnotationSet as1 = nameAllAnnots.getContained(startAnnot, endAnnot);
if(as1 == null || as1.isEmpty()) return annotString;
AnnotationSet as = as1.get("Lookup", queryFM);
if(as != null && !as.isEmpty()) {
List titles = new ArrayList(as);
Collections.sort(titles, new gate.util.OffsetComparator());
Iterator iter = titles.iterator();
while(iter.hasNext()) {
Annotation titleAnn = iter.next();
// we've not found a title at the start offset,
// there's no point in looking further
// coz titles come first
if(titleAnn.getStartNode().getOffset().compareTo(startAnnot) != 0)
return annotString;
try {
// the title from the current annotation
String annotTitle =
doc.getContent()
.getContent(titleAnn.getStartNode().getOffset(),
titleAnn.getEndNode().getOffset()).toString();
// eliminate the title from annotation string and return the result
if(annotTitle.length() < annotString.length()) {
// remove from the array of tokens, so then we can compare properly
// the remaining tokens
// log.debug("Removing title from: " + annot + " with string " +
// annotString);
// log.debug("Tokens are " + tokensMap.get(annot.getId()));
// log.debug("Title is " + annotTitle);
tokensMap.get(annot.getId()).remove(0);
normalizedTokensMap.get(annot.getId()).remove(0);
return annotString.substring(annotTitle.length() + 1,
annotString.length());
}
} catch(InvalidOffsetException ioe) {
throw new ExecutionException("Invalid offset of the annotation");
}// try
}// while
}// if
return annotString;
}
@Override
public boolean matchedAlready(Annotation annot1, Annotation annot2,
List> matchesDocFeature, AnnotationSet nameAllAnnots) {
// the two annotations are already matched if the matches list of the first
// contains the id of the second
@SuppressWarnings("unchecked")
List matchesList =
(List)annot1.getFeatures().get(OrthoMatcher.ANNOTATION_COREF_FEATURE_NAME);
if((matchesList == null) || matchesList.isEmpty())
return false;
else if(matchesList.contains(annot2.getId())) return true;
return false;
}
@Override
public Annotation updateMatches(Annotation newAnnot, String annotString,
Map processedAnnots, AnnotationSet nameAllAnnots,
List> matchesDocFeature) {
Annotation matchedAnnot = null;
Integer id;
// first find a processed annotation with the same string
// TODO: Andrew Borthwick 7/26/08: The below is very inefficient. We should
// be doing a lookup into a hash
// which is indexed on string rather than testing every id. Need to have the
// index be String + Type
// for safety
Iterator iter = processedAnnots.keySet().iterator();
// System.out.println("ID's examined: ");
while(iter.hasNext()) {
id = iter.next();
String oldString = processedAnnots.get(id);
// System.out.print(id + " ");
if(annotString.equals(oldString)) {
Annotation tempAnnot = nameAllAnnots.get(id);
if(tempAnnot == null) {
log.debug("Orthomatcher: TempAnnot is null when looking at "
+ annotString + " | " + oldString + " | old id: " + id);
return null;
}
// Below is a new Spock addition to prevent unpredictable behavior when
// the same string is given more than one type. We want to return null
// if there is no match on name + type (other than Unknown)
if(newAnnot.getType().equals(unknownType)
|| tempAnnot.getType().equals(newAnnot.getType())) {
matchedAnnot = tempAnnot;
break;
}
}
}// while
// System.out.println();
if(matchedAnnot == null) return null;
@SuppressWarnings("unchecked")
List matchesList =
(List)matchedAnnot.getFeatures().get(OrthoMatcher.ANNOTATION_COREF_FEATURE_NAME);
if((matchesList == null) || matchesList.isEmpty()) {
// no previous matches, so need to add
if(matchesList == null) {
matchesList = new ArrayList();
matchedAnnot.getFeatures().put(OrthoMatcher.ANNOTATION_COREF_FEATURE_NAME,
matchesList);
matchesDocFeature.add(matchesList);
}// if
matchesList.add(matchedAnnot.getId());
matchesList.add(newAnnot.getId());
} else {
// just add the new annotation
matchesList.add(newAnnot.getId());
}// if
// add the matches list to the new annotation
newAnnot.getFeatures().put(OrthoMatcher.ANNOTATION_COREF_FEATURE_NAME,
matchesList);
return matchedAnnot;
}
@Override
public void updateMatches(Annotation newAnnot, Annotation prevAnnot,
List> matchesDocFeature, AnnotationSet nameAllAnnots) {
@SuppressWarnings("unchecked")
List matchesList =
(List)prevAnnot.getFeatures().get(
OrthoMatcher.ANNOTATION_COREF_FEATURE_NAME);
if((matchesList == null) || matchesList.isEmpty()) {
// no previous matches, so need to add
if(matchesList == null) {
matchesList = new ArrayList();
prevAnnot.getFeatures().put(OrthoMatcher.ANNOTATION_COREF_FEATURE_NAME,
matchesList);
matchesDocFeature.add(matchesList);
}// if
matchesList.add(prevAnnot.getId());
matchesList.add(newAnnot.getId());
} else {
// just add the new annotation
matchesList.add(newAnnot.getId());
}// if
// add the matches list to the new annotation
newAnnot.getFeatures().put(OrthoMatcher.ANNOTATION_COREF_FEATURE_NAME,
matchesList);
// propagate the gender if two persons are matched
if(prevAnnot.getType().equals(this.personType)) {
String prevGender =
(String)prevAnnot.getFeatures().get(
OrthoMatcher.PERSON_GENDER_FEATURE_NAME);
String newGender =
(String)newAnnot.getFeatures().get(
OrthoMatcher.PERSON_GENDER_FEATURE_NAME);
boolean unknownPrevGender = isUnknownGender(prevGender);
boolean unknownNewGender = isUnknownGender(newGender);
if(unknownPrevGender && !unknownNewGender)
prevAnnot.getFeatures().put(OrthoMatcher.PERSON_GENDER_FEATURE_NAME,
newGender);
else if(unknownNewGender && !unknownPrevGender)
newAnnot.getFeatures().put(OrthoMatcher.PERSON_GENDER_FEATURE_NAME,
prevGender);
}// if
}
/**
 * Builds the table of "cdg" strings for namematch info (used by the
 * namematch rules).
 *
 * Unless external lists are configured, the table holds the document text of
 * every annotation of type OrthoMatcher.LOOKUP_ANNOTATION_TYPE in
 * nameAllAnnots whose major type feature is "cdg". Annotations with offsets
 * that are invalid for the document are reported to the GATE error writer
 * and skipped.
 *
 * @param nameAllAnnots annotation set to scan; also supplies the document
 * @return the collected strings; empty when external lists are configured or
 *         no matching annotation exists
 */
@Override
public Set<String> buildTables(AnnotationSet nameAllAnnots) {
  Set<String> cdg = new HashSet<String>();
  if (extLists) {
    return cdg;
  }

  // i.e. get cdg from Lookup annotations
  FeatureMap cdgConstraint = Factory.newFeatureMap();
  cdgConstraint.put(
      gate.creole.ANNIEConstants.LOOKUP_MAJOR_TYPE_FEATURE_NAME, "cdg");
  AnnotationSet nameAnnots =
      nameAllAnnots.get(OrthoMatcher.LOOKUP_ANNOTATION_TYPE, cdgConstraint);
  if (nameAnnots == null || nameAnnots.isEmpty()) {
    return cdg;
  }

  // The document is the same for every annotation, so fetch it once.
  gate.Document doc = nameAllAnnots.getDocument();
  for (Annotation annot : nameAnnots) {
    Long offsetStartAnnot = annot.getStartNode().getOffset();
    Long offsetEndAnnot = annot.getEndNode().getOffset();
    try {
      // get the actual string
      cdg.add(doc.getContent().getContent(offsetStartAnnot, offsetEndAnnot)
          .toString());
    } catch (InvalidOffsetException ioe) {
      ioe.printStackTrace(Err.getPrintWriter());
    }
  }
  return cdg;
}// buildTables
/**
 * Tells whether a gender value carries no usable information.
 *
 * @param gender the gender feature value, may be null
 * @return false only when gender equals "male" or "female" ignoring case;
 *         true for null and for every other value
 */
@Override
public boolean isUnknownGender(String gender) {
  final boolean known =
      gender != null
          && (gender.equalsIgnoreCase("male")
              || gender.equalsIgnoreCase("female"));
  return !known;
} // isUnknownGender
protected Map> initNicknames(String nicknameFileEncoding,
java.net.URL fileURL) throws IOException {
Pattern spacePat = Pattern.compile("(\\s+)");
nicknameMap = new HashMap>();
// create the relative URL
BufferedReader reader = null;
try {
reader = new BomStrippingInputStreamReader(fileURL.openStream(),
nicknameFileEncoding);
String lineRead = null;
while((lineRead = reader.readLine()) != null) {
if(lineRead.length() == 0 || lineRead.charAt(0) == '#') {
continue;
}
List nickNameLine =
Arrays.asList(spacePat.split(lineRead
.toLowerCase().trim()));
if(nickNameLine.size() != 3
&& (nickNameLine.size() != 4 && ((nickNameLine.get(3) != "M") || nickNameLine
.get(3) != "F"))) {
continue;
}
if(round2Places(Double.valueOf(nickNameLine.get(2))) < OrthoMatcherHelper
.round2Places(minimumNicknameLikelihood)) {
continue;
}
if(nicknameMap.containsKey(nickNameLine.get(0))) {
/*
* System.out.println("Adding to existing nickname of " +
* nickNameLine.get(0) + " " + nickNameLine.get(1));
*/
nicknameMap.get(nickNameLine.get(0)).add(nickNameLine.get(1));
} else {
/*
* System.out.println("Adding new nickname of " +
* nickNameLine.get(0) + " " + nickNameLine.get(1));
*/
nicknameMap.put(
nickNameLine.get(0),
new HashSet(
Collections.singleton(nickNameLine.get(1))));
}
}
} finally {
IOUtils.closeQuietly(reader);
}
return nicknameMap;
}
}