
/*
 * gate.creole.orthomatcher.BasicAnnotationOrthography (Maven / Gradle / Ivy)
 * Go to download
 * ANNIE is a general-purpose information extraction system that
 * provides the building blocks of many other GATE applications.
 */
package gate.creole.orthomatcher;
import static gate.creole.orthomatcher.OrthoMatcherHelper.getStringForSpan;
import static gate.creole.orthomatcher.OrthoMatcherHelper.round2Places;
import java.io.BufferedReader;
import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import org.apache.commons.io.IOUtils;
import org.apache.log4j.Logger;
import gate.Annotation;
import gate.AnnotationSet;
import gate.Document;
import gate.Factory;
import gate.FeatureMap;
import gate.creole.ANNIEConstants;
import gate.creole.ExecutionException;
import gate.util.BomStrippingInputStreamReader;
import gate.util.Err;
import gate.util.InvalidOffsetException;
/*
 * Basic AnnotationOrthography implementation: it defines the primary
 * behaviour of the OrthoMatcher processing resource in GATE.
 */
public class BasicAnnotationOrthography implements AnnotationOrthography {
private final boolean extLists;
private final String personType;
private final String unknownType;
private Map> nicknameMap =
new HashMap>();
private final Double minimumNicknameLikelihood;
/**
 * Creates the orthography and, if a nickname file is given, loads it.
 *
 * @param personType annotation type treated as a person
 * @param extLists whether external lists are in use (see {@code buildTables})
 * @param unknownType annotation type treated as "unknown"
 * @param nicknameFile location of the nickname file; may be {@code null}, in
 *          which case no nicknames are loaded
 * @param minimumNicknameLikelihood threshold applied while loading nicknames
 * @param encoding character encoding of the nickname file
 */
public BasicAnnotationOrthography(String personType, boolean extLists,
    String unknownType, URL nicknameFile, Double minimumNicknameLikelihood,
    String encoding) {
  this.extLists = extLists;
  this.personType = personType;
  this.unknownType = unknownType;
  // must be set before the nickname file is read: the loader consults it
  this.minimumNicknameLikelihood = minimumNicknameLikelihood;

  if(nicknameFile != null) {
    try {
      this.initNicknames(encoding, nicknameFile);
    } catch(IOException e) {
      // best effort: carry on without nicknames
      log.warn("Could not load nickname map.", e);
    }
  }
}
/** Logger shared by this class and its subclasses. */
protected static final Logger log =
    Logger.getLogger(BasicAnnotationOrthography.class);
/**
 * Returns the document text covered by the annotation, with every run of
 * whitespace collapsed to a single space.
 *
 * @param a the annotation whose span is read
 * @param d the document the annotation belongs to
 * @return the whitespace-normalised text of the span
 * @throws ExecutionException declared by the interface; raised by the span
 *           lookup helper if it fails
 */
@Override
public String getStringForAnnotation(Annotation a, gate.Document d)
    throws ExecutionException {
  final Long spanStart = a.getStartNode().getOffset();
  final Long spanEnd = a.getEndNode().getOffset();
  final String rawText = getStringForSpan(spanStart, spanEnd, d);
  // regular-expression substitution: squeeze whitespace runs
  return rawText.replaceAll("\\s+", " ");
}
/**
 * Case-insensitive comparison that also consults the nickname table.
 *
 * @param s1 first string (must not be {@code null})
 * @param s2 second string (must not be {@code null})
 * @return {@code true} if the lower-cased strings are equal, or if either one
 *         is a key in the nickname table whose value set contains the other
 */
@Override
public boolean fuzzyMatch(String s1, String s2) {
  String s1Lower = s1.toLowerCase();
  String s2Lower = s2.toLowerCase();
  if(s1Lower.equals(s2Lower)) {
    return true;
  }
  // the table is directional, so try both orientations
  return nicknameTableContains(s1Lower, s2Lower)
      || nicknameTableContains(s2Lower, s1Lower);
}

/** Tells whether the nickname table maps {@code key} to a set containing {@code value}. */
private boolean nicknameTableContains(String key, String value) {
  Set<String> mapped = nicknameMap.get(key);
  return mapped != null && mapped.contains(value);
}
/**
 * Checks that every token of {@code firstName} that is not marked as a stop
 * word has a matching token in {@code secondName}.
 *
 * @param firstName tokens that must all be covered
 * @param secondName tokens to search for matches
 * @param TOKEN_STRING_FEATURE_NAME name of the feature holding each token's
 *          string
 * @param caseSensitive passed through to the string comparison
 * @return true if all of the tokens in firstName are either found in second
 *         name or are stop words
 */
@Override
public boolean allNonStopTokensInOtherAnnot(List<Annotation> firstName,
    List<Annotation> secondName, String TOKEN_STRING_FEATURE_NAME,
    boolean caseSensitive) {
  for(Annotation token : firstName) {
    // tokens carrying the "ortho_stop" feature are stop words and never block a match
    if(token.getFeatures().containsKey("ortho_stop")) {
      continue;
    }
    String tokenString =
        (String)token.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
    boolean foundAMatchInSecond = false;
    for(Annotation candidate : secondName) {
      String candidateString =
          (String)candidate.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
      if(OrthoMatcherHelper.straightCompare(tokenString, candidateString,
          caseSensitive)) {
        foundAMatchInSecond = true;
        break;
      }
    }
    if(!foundAMatchInSecond) {
      return false;
    }
  }
  return true;
}
/**
* Return a person name without a title. Also remove title from global
* variable tokensMap
*/
@Override
public String stripPersonTitle(String annotString, Annotation annot,
Document doc, Map> tokensMap,
Map> normalizedTokensMap, AnnotationSet nameAllAnnots)
throws ExecutionException {
FeatureMap queryFM = Factory.newFeatureMap();
// get the offsets
Long startAnnot = annot.getStartNode().getOffset();
Long endAnnot = annot.getEndNode().getOffset();
// determine "Lookup" annotation set
queryFM.clear();
queryFM.put("majorType", "title");
AnnotationSet as1 = nameAllAnnots.getContained(startAnnot, endAnnot);
if(as1 == null || as1.isEmpty()) return annotString;
AnnotationSet as = as1.get("Lookup", queryFM);
if(as != null && !as.isEmpty()) {
List titles = new ArrayList(as);
Collections.sort(titles, new gate.util.OffsetComparator());
Iterator iter = titles.iterator();
while(iter.hasNext()) {
Annotation titleAnn = iter.next();
// we've not found a title at the start offset,
// there's no point in looking further
// coz titles come first
if(titleAnn.getStartNode().getOffset().compareTo(startAnnot) != 0)
return annotString;
try {
// the title from the current annotation
String annotTitle =
doc.getContent()
.getContent(titleAnn.getStartNode().getOffset(),
titleAnn.getEndNode().getOffset()).toString();
// eliminate the title from annotation string and return the result
if(annotTitle.length() < annotString.length()) {
// remove from the array of tokens, so then we can compare properly
// the remaining tokens
// log.debug("Removing title from: " + annot + " with string " +
// annotString);
// log.debug("Tokens are " + tokensMap.get(annot.getId()));
// log.debug("Title is " + annotTitle);
tokensMap.get(annot.getId()).remove(0);
normalizedTokensMap.get(annot.getId()).remove(0);
return annotString.substring(annotTitle.length() + 1,
annotString.length());
}
} catch(InvalidOffsetException ioe) {
throw new ExecutionException("Invalid offset of the annotation");
}// try
}// while
}// if
return annotString;
}
@Override
public boolean matchedAlready(Annotation annot1, Annotation annot2,
List> matchesDocFeature, AnnotationSet nameAllAnnots) {
// the two annotations are already matched if the matches list of the first
// contains the id of the second
@SuppressWarnings("unchecked")
List matchesList =
(List)annot1.getFeatures().get(OrthoMatcher.ANNOTATION_COREF_FEATURE_NAME);
if((matchesList == null) || matchesList.isEmpty())
return false;
else if(matchesList.contains(annot2.getId())) return true;
return false;
}
@Override
public Annotation updateMatches(Annotation newAnnot, String annotString,
Map processedAnnots, AnnotationSet nameAllAnnots,
List> matchesDocFeature) {
Annotation matchedAnnot = null;
Integer id;
// first find a processed annotation with the same string
// TODO: Andrew Borthwick 7/26/08: The below is very inefficient. We should
// be doing a lookup into a hash
// which is indexed on string rather than testing every id. Need to have the
// index be String + Type
// for safety
Iterator iter = processedAnnots.keySet().iterator();
// System.out.println("ID's examined: ");
while(iter.hasNext()) {
id = iter.next();
String oldString = processedAnnots.get(id);
// System.out.print(id + " ");
if(annotString.equals(oldString)) {
Annotation tempAnnot = nameAllAnnots.get(id);
if(tempAnnot == null) {
log.debug("Orthomatcher: TempAnnot is null when looking at "
+ annotString + " | " + oldString + " | old id: " + id);
return null;
}
// Below is a new Spock addition to prevent unpredictable behavior when
// the same string is given more than one type. We want to return null
// if there is no match on name + type (other than Unknown)
if(newAnnot.getType().equals(unknownType)
|| tempAnnot.getType().equals(newAnnot.getType())) {
matchedAnnot = tempAnnot;
break;
}
}
}// while
// System.out.println();
if(matchedAnnot == null) return null;
@SuppressWarnings("unchecked")
List matchesList =
(List)matchedAnnot.getFeatures().get(OrthoMatcher.ANNOTATION_COREF_FEATURE_NAME);
if((matchesList == null) || matchesList.isEmpty()) {
// no previous matches, so need to add
if(matchesList == null) {
matchesList = new ArrayList();
matchedAnnot.getFeatures().put(OrthoMatcher.ANNOTATION_COREF_FEATURE_NAME,
matchesList);
matchesDocFeature.add(matchesList);
}// if
matchesList.add(matchedAnnot.getId());
matchesList.add(newAnnot.getId());
} else {
// just add the new annotation
matchesList.add(newAnnot.getId());
}// if
// add the matches list to the new annotation
newAnnot.getFeatures().put(OrthoMatcher.ANNOTATION_COREF_FEATURE_NAME,
matchesList);
return matchedAnnot;
}
@Override
public void updateMatches(Annotation newAnnot, Annotation prevAnnot,
List> matchesDocFeature, AnnotationSet nameAllAnnots) {
@SuppressWarnings("unchecked")
List matchesList =
(List)prevAnnot.getFeatures().get(
OrthoMatcher.ANNOTATION_COREF_FEATURE_NAME);
if((matchesList == null) || matchesList.isEmpty()) {
// no previous matches, so need to add
if(matchesList == null) {
matchesList = new ArrayList();
prevAnnot.getFeatures().put(OrthoMatcher.ANNOTATION_COREF_FEATURE_NAME,
matchesList);
matchesDocFeature.add(matchesList);
}// if
matchesList.add(prevAnnot.getId());
matchesList.add(newAnnot.getId());
} else {
// just add the new annotation
matchesList.add(newAnnot.getId());
}// if
// add the matches list to the new annotation
newAnnot.getFeatures().put(OrthoMatcher.ANNOTATION_COREF_FEATURE_NAME,
matchesList);
// propagate the gender if two persons are matched
if(prevAnnot.getType().equals(this.personType)) {
String prevGender =
(String)prevAnnot.getFeatures().get(
OrthoMatcher.PERSON_GENDER_FEATURE_NAME);
String newGender =
(String)newAnnot.getFeatures().get(
OrthoMatcher.PERSON_GENDER_FEATURE_NAME);
boolean unknownPrevGender = isUnknownGender(prevGender);
boolean unknownNewGender = isUnknownGender(newGender);
if(unknownPrevGender && !unknownNewGender)
prevAnnot.getFeatures().put(OrthoMatcher.PERSON_GENDER_FEATURE_NAME,
newGender);
else if(unknownNewGender && !unknownPrevGender)
newAnnot.getFeatures().put(OrthoMatcher.PERSON_GENDER_FEATURE_NAME,
prevGender);
}// if
}
/**
 * Tables for namematch info (used by the namematch rules).
 * <p>
 * Unless external lists are in use, collects the document text of every
 * Lookup annotation whose major type is "cdg".
 *
 * @param nameAllAnnots annotation set to read the Lookup annotations (and the
 *          document) from
 * @return the collected strings; empty when external lists are in use or no
 *         matching Lookup annotation exists
 */
@Override
public Set<String> buildTables(AnnotationSet nameAllAnnots) {
  Set<String> cdg = new HashSet<String>();
  if(extLists) {
    // nothing is read from Lookup annotations in this mode
    return cdg;
  }

  // get all Lookup annotations which are cdg
  FeatureMap cdgQuery = Factory.newFeatureMap();
  cdgQuery.put(ANNIEConstants.LOOKUP_MAJOR_TYPE_FEATURE_NAME, "cdg");
  AnnotationSet nameAnnots =
      nameAllAnnots.get(OrthoMatcher.LOOKUP_ANNOTATION_TYPE, cdgQuery);
  if(nameAnnots == null || nameAnnots.isEmpty()) {
    return cdg;
  }

  gate.Document doc = nameAllAnnots.getDocument();
  for(Annotation annot : nameAnnots) {
    Long offsetStartAnnot = annot.getStartNode().getOffset();
    Long offsetEndAnnot = annot.getEndNode().getOffset();
    try {
      // get the actual string
      cdg.add(doc.getContent().getContent(offsetStartAnnot, offsetEndAnnot)
          .toString());
    } catch(InvalidOffsetException ioe) {
      // report and skip this annotation; the rest are still collected
      ioe.printStackTrace(Err.getPrintWriter());
    }
  }
  return cdg;
}// buildTables
/**
 * Tells whether a gender value carries no usable information.
 *
 * @param gender the gender feature value; may be {@code null}
 * @return {@code false} only for "male" or "female" (compared ignoring case);
 *         {@code true} for {@code null} and every other value
 */
@Override
public boolean isUnknownGender(String gender) {
  boolean known =
      gender != null
          && (gender.equalsIgnoreCase("male") || gender
              .equalsIgnoreCase("female"));
  return !known;
} // isUnknownGender
protected Map> initNicknames(String nicknameFileEncoding,
java.net.URL fileURL) throws IOException {
Pattern spacePat = Pattern.compile("(\\s+)");
nicknameMap = new HashMap>();
// create the relative URL
BufferedReader reader = null;
try {
reader = new BomStrippingInputStreamReader(fileURL.openStream(),
nicknameFileEncoding);
String lineRead = null;
while((lineRead = reader.readLine()) != null) {
if(lineRead.length() == 0 || lineRead.charAt(0) == '#') {
continue;
}
List nickNameLine =
Arrays.asList(spacePat.split(lineRead
.toLowerCase().trim()));
if(nickNameLine.size() != 3
&& (nickNameLine.size() != 4 && ((nickNameLine.get(3) != "M") || nickNameLine
.get(3) != "F"))) {
continue;
}
if(round2Places(Double.valueOf(nickNameLine.get(2))) < OrthoMatcherHelper
.round2Places(minimumNicknameLikelihood)) {
continue;
}
if(nicknameMap.containsKey(nickNameLine.get(0))) {
/*
* System.out.println("Adding to existing nickname of " +
* nickNameLine.get(0) + " " + nickNameLine.get(1));
*/
nicknameMap.get(nickNameLine.get(0)).add(nickNameLine.get(1));
} else {
/*
* System.out.println("Adding new nickname of " +
* nickNameLine.get(0) + " " + nickNameLine.get(1));
*/
nicknameMap.put(
nickNameLine.get(0),
new HashSet(
Collections.singleton(nickNameLine.get(1))));
}
}
} finally {
IOUtils.closeQuietly(reader);
}
return nicknameMap;
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy