gate.creole.coref.PronominalCoref Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of annie Show documentation
Show all versions of annie Show documentation
ANNIE is a general purpose information extraction system that
provides the building blocks of many other GATE applications.
/*
* PronominalCoref.java
*
* Copyright (c) 1995-2012, The University of Sheffield. See the file
* COPYRIGHT.txt in the software or at http://gate.ac.uk/gate/COPYRIGHT.txt
*
* This file is part of GATE (see http://gate.ac.uk/), and is free
* software, licenced under the GNU Library General Public License,
* Version 2, June 1991 (in the distribution as file licence.html,
* and also available at http://gate.ac.uk/gate/licence.html).
*
* Marin Dimitrov, 30/Dec/2001
*
* $Id: PronominalCoref.java 19742 2016-11-16 17:58:23Z markagreenwood $
*/
package gate.creole.coref;
import java.io.Serializable;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import gate.Annotation;
import gate.AnnotationSet;
import gate.Document;
import gate.Factory;
import gate.FeatureMap;
import gate.Gate;
import gate.LanguageAnalyser;
import gate.Node;
import gate.Resource;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.ResourceInstantiationException;
import gate.creole.ResourceReference;
import gate.creole.metadata.CreoleParameter;
import gate.creole.metadata.CreoleResource;
import gate.util.Benchmark;
import gate.util.Benchmarkable;
import gate.util.Err;
import gate.util.SimpleFeatureMapImpl;
@CreoleResource(isPrivate = true)
public class PronominalCoref extends AbstractLanguageAnalyser
implements Benchmarkable {
private static final long serialVersionUID = 3860815557386683264L;
public static final String COREF_DOCUMENT_PARAMETER_NAME = "document";
public static final String COREF_ANN_SET_PARAMETER_NAME = "annotationSetName";
public static final String TRANSD_ENCODING_PARAMETER_NAME = "encoding";
public static final String TRANSD_GRAMMAR_URL_PARAMETER_NAME = "grammarURL";
/** --- */
private static final boolean DEBUG = false;
private ResourceReference qtGrammarURL;
public ResourceReference getQuotedGrammarURL() {
return qtGrammarURL;
}
@CreoleParameter(defaultValue="resources/coref/quoted_text.jape")
public void setQuotedGrammarURL(ResourceReference qtGrammarURL) {
this.qtGrammarURL = qtGrammarURL;
}
@Deprecated
public void setQuotedGrammarURL(URL qtGrammarURL) {
try {
this.setQuotedGrammarURL(new ResourceReference(qtGrammarURL));
} catch (URISyntaxException e) {
throw new RuntimeException("Error converting URL to ResourceReference", e);
}
}
private ResourceReference pleonGrammarURL;
public ResourceReference getPleonasmGrammarURL() {
return pleonGrammarURL;
}
@CreoleParameter(defaultValue="resources/coref/pleonasm.jape")
public void setPleonasmGrammarURL(ResourceReference pleonGrammarURL) {
this.pleonGrammarURL = pleonGrammarURL;
}
@Deprecated
public void setPleonasmGrammarURL(URL pleonGrammarURL) {
try {
this.setPleonasmGrammarURL(new ResourceReference(pleonGrammarURL));
} catch (URISyntaxException e) {
throw new RuntimeException("Error converting URL to ResourceReference", e);
}
}
//annotation types
private static final String QUOTED_TEXT_TYPE = "QuotedText";
private static final String PLEONASTIC_TYPE = "PleonasticIt";
//annotation features
private static final String PRP_CATEGORY = "PRP";
private static final String PRP$_CATEGORY = "PRP$";
//scope
private static final int SENTENCES_IN_SCOPE = 3;
/** --- */
private static AnnotationOffsetComparator ANNOTATION_OFFSET_COMPARATOR;
/** --- */
private String annotationSetName;
/** --- */
private LanguageAnalyser qtTransducer;
/** --- */
private LanguageAnalyser pleonTransducer;
/** --- */
private AnnotationSet defaultAnnotations;
/** --- */
private transient Sentence[] textSentences;
/** --- */
private transient Quote[] quotedText;
/** --- */
private Annotation[] pleonasticIt;
/** --- */
private Map personGender;
/** --- */
private HashMap anaphor2antecedent;
/** --- */
private static final FeatureMap PRP_RESTRICTION;
private boolean resolveIt = true;
/** default ORGANIZATIONS,LOCATION**/
private Set inanimatedSet;
private String inanimatedEntityTypes;
private String benchmarkId;
/** --- */
static {
ANNOTATION_OFFSET_COMPARATOR = new AnnotationOffsetComparator();
PRP_RESTRICTION = new SimpleFeatureMapImpl();
PRP_RESTRICTION.put(TOKEN_CATEGORY_FEATURE_NAME,PRP_CATEGORY);
}
/** Initialise this resource, and return it. */
@Override
public Resource init() throws ResourceInstantiationException {
personGender = new HashMap();
anaphor2antecedent = new HashMap();
inanimatedSet = new HashSet();
//1. initialise quoted text transducer
FeatureMap params = Factory.newFeatureMap();
params.put(TRANSD_GRAMMAR_URL_PARAMETER_NAME, qtGrammarURL);
params.put(TRANSD_ENCODING_PARAMETER_NAME, "UTF-8");
if (qtTransducer == null) {
features = Factory.newFeatureMap();
Gate.setHiddenAttribute(features, true);
qtTransducer = (LanguageAnalyser)Factory.createResource("gate.creole.Transducer",
params, features);
qtTransducer.setName("PronominalCoref-QT " + System.currentTimeMillis());
}
else {
qtTransducer.setParameterValues(params);
qtTransducer.reInit();
}
//2. initialise pleonastic transducer
params = Factory.newFeatureMap();
params.put(TRANSD_GRAMMAR_URL_PARAMETER_NAME, pleonGrammarURL);
params.put(TRANSD_ENCODING_PARAMETER_NAME, "UTF-8");
if (pleonTransducer == null) {
features = Factory.newFeatureMap();
Gate.setHiddenAttribute(features, true);
pleonTransducer = (LanguageAnalyser)Factory.createResource("gate.creole.Transducer",
params, features);
pleonTransducer.setName("PronominalCoref-Pleon " + System.currentTimeMillis());
}
else {
pleonTransducer.setParameterValues(params);
pleonTransducer.reInit();
}
return this;
} // init()
@Override
public void cleanup() {
super.cleanup();
Factory.deleteResource(qtTransducer);
Factory.deleteResource(pleonTransducer);
}
/** Set the document to run on. */
@Override
public void setDocument(Document newDocument) {
//0. precondition
// assert (null != newDocument);
//1. set doc for aggregated components
qtTransducer.setDocument(newDocument);
pleonTransducer.setDocument(newDocument);
//3. delegate
super.setDocument(newDocument);
}
/** --- */
public void setAnnotationSetName(String annotationSetName) {
this.annotationSetName = annotationSetName;
}
/** --- */
public String getAnnotationSetName() {
return annotationSetName;
}
/** --- */
public void setResolveIt(Boolean newValue) {
this.resolveIt = newValue.booleanValue();
}
/** --- */
public Boolean getResolveIt() {
return resolveIt;
}
/**
* This method runs the coreferencer. It assumes that all the needed parameters
* are set. If they are not, an exception will be fired.
*/
@SuppressWarnings("unchecked")
@Override
public void execute() throws ExecutionException{
//0. preconditions
if(null == this.document) {
throw new ExecutionException("[coreference] Document is not set!");
}
//1. preprocess
preprocess();
/*
//2. remove corefs from previous run
String annSetName = this.annotationSetName == null ? "COREF"
: this.annotationSetName;
AnnotationSet corefSet = this.document.getAnnotations(annSetName);
if (false == corefSet.isEmpty()) {
corefSet.clear();
}
*/
//3.get personal pronouns
FeatureMap constraintPRP = new SimpleFeatureMapImpl();
constraintPRP.put(TOKEN_CATEGORY_FEATURE_NAME,PRP_CATEGORY);
AnnotationSet personalPronouns = this.defaultAnnotations.get(TOKEN_ANNOTATION_TYPE,constraintPRP);
//4.get possesive pronouns
FeatureMap constraintPRP$ = new SimpleFeatureMapImpl();
constraintPRP$.put(TOKEN_CATEGORY_FEATURE_NAME,PRP$_CATEGORY);
AnnotationSet possesivePronouns = this.defaultAnnotations.get(TOKEN_ANNOTATION_TYPE,constraintPRP$);
//5.combine them
List pronouns = new ArrayList();
if (personalPronouns != null && !personalPronouns.isEmpty()) {
pronouns.addAll(personalPronouns);
}
if (possesivePronouns != null && !possesivePronouns.isEmpty()) {
pronouns.addAll(possesivePronouns);
}
//6.do we have pronouns at all?
if (pronouns.isEmpty()) {
//do nothing
return;
}
//7.sort them according to offset
Annotation[] arrPronouns = pronouns.toArray(new Annotation[pronouns.size()]);
Arrays.sort(arrPronouns,ANNOTATION_OFFSET_COMPARATOR);
//8.cleanup - ease the GC
//as of JDK6 this is no longer helpful
//pronouns = null;
//personalPronouns = null;
//possesivePronouns = null;
int prnSentIndex = 0;
//10. process all pronouns
for (int i=0; i< arrPronouns.length; i++) {
Annotation currPronoun = arrPronouns[i];
while (this.textSentences[prnSentIndex].getEndOffset().longValue() <
currPronoun.getEndNode().getOffset().longValue()) {
prnSentIndex++;
}
Sentence currSentence = this.textSentences[prnSentIndex];
assert (currSentence.getStartOffset().longValue() <= currPronoun.getStartNode().getOffset().longValue());
assert (currSentence.getEndOffset().longValue() >= currPronoun.getEndNode().getOffset().longValue());
//11. find antecedent (if any) for pronoun
Annotation antc = findAntecedent(currPronoun,prnSentIndex);
//12. add to the ana2ant hashtable
this.anaphor2antecedent.put(currPronoun,antc);
}
//done
}
/** --- */
public Map getResolvedAnaphora() {
return this.anaphor2antecedent;
}
/** --- */
private Annotation findAntecedent(Annotation currPronoun,int prnSentIndex) {
//0. preconditions
assert (null != currPronoun);
assert (prnSentIndex >= 0);
assert (currPronoun.getType().equals(TOKEN_ANNOTATION_TYPE));
assert (currPronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) ||
currPronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY));
//1.
String strPronoun = (String)currPronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
assert (null != strPronoun);
//2. delegate processing to the appropriate methods
if (strPronoun.equalsIgnoreCase("HE") ||
strPronoun.equalsIgnoreCase("HIM") ||
strPronoun.equalsIgnoreCase("HIS") ||
strPronoun.equalsIgnoreCase("HIMSELF")) {
return _resolve$HE$HIM$HIS$HIMSELF$(currPronoun,prnSentIndex);
}
else if (strPronoun.equalsIgnoreCase("SHE") ||
strPronoun.equalsIgnoreCase("HER") ||
strPronoun.equalsIgnoreCase("HERS") ||
strPronoun.equalsIgnoreCase("HERSELF")) {
return _resolve$SHE$HER$HERS$HERSELF$(currPronoun,prnSentIndex);
}
else if (strPronoun.equalsIgnoreCase("IT") ||
strPronoun.equalsIgnoreCase("ITS") ||
strPronoun.equalsIgnoreCase("ITSELF")) {
return _resolve$IT$ITS$ITSELF$(currPronoun,prnSentIndex);
}
else if (strPronoun.equalsIgnoreCase("I") ||
strPronoun.equalsIgnoreCase("ME") ||
strPronoun.equalsIgnoreCase("MY") ||
strPronoun.equalsIgnoreCase("MINE") ||
strPronoun.equalsIgnoreCase("MYSELF")) {
return _resolve$I$ME$MY$MINE$MYSELF$(currPronoun,prnSentIndex);
}
else {
if (DEBUG) {
gate.util.Err.println("["+strPronoun+"] is not handled yet...");
}
return null;
}
}
boolean isPleonastic(Annotation pronoun) {
//0. preconditions
assert (null != pronoun);
String str = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
assert (str.equalsIgnoreCase("IT"));
//1. do we have pleonasms in this text?
if (this.pleonasticIt.length == 0) {
return false;
}
//2. find closest pleonasm index
@SuppressWarnings("unchecked")
int closestPleonasmIndex = Arrays.binarySearch(this.pleonasticIt,
pronoun,
ANNOTATION_OFFSET_COMPARATOR);
//normalize index
if (closestPleonasmIndex < 0) {
closestPleonasmIndex = -closestPleonasmIndex -1 -1;
}
//still not good?
if (closestPleonasmIndex < 0) {
closestPleonasmIndex = 0;
}
//get closest pleonasm
Annotation pleonasm = this.pleonasticIt[closestPleonasmIndex];
//System.out.println(pleonasm);
//System.out.println(pronoun);
//3. return true only if the proboun is contained in pleonastic fragment
boolean result = (pleonasm.getStartNode().getOffset().intValue() <= pronoun.getStartNode().getOffset().intValue()
&&
pleonasm.getEndNode().getOffset().intValue() >= pronoun.getEndNode().getOffset().intValue());
//System.out.println("is pleon=["+result+"]");
return result;
}
/** --- */
private Annotation _resolve$HE$HIM$HIS$HIMSELF$(Annotation pronoun, int sentenceIndex) {
//0. preconditions
assert (pronoun.getType().equals(TOKEN_ANNOTATION_TYPE));
assert (pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) ||
pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY));
String pronounString = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
assert (pronounString.equalsIgnoreCase("HE") ||
pronounString.equalsIgnoreCase("HIM") ||
pronounString.equalsIgnoreCase("HIS") ||
pronounString.equalsIgnoreCase("HIMSELF"));
//1.
boolean antecedentFound = false;
int scopeFirstIndex = sentenceIndex - SENTENCES_IN_SCOPE;
if (scopeFirstIndex < 0 ) scopeFirstIndex = 0;
int currSentenceIndex = sentenceIndex;
Annotation bestAntecedent = null;
while (currSentenceIndex >= scopeFirstIndex || antecedentFound == false) {
Sentence currSentence = this.textSentences[currSentenceIndex];
AnnotationSet persons = currSentence.getPersons();
Iterator it = persons.iterator();
while (it.hasNext()) {
Annotation currPerson = it.next();
String gender = this.personGender.get(currPerson);
if (null == gender ||
gender.equalsIgnoreCase("MALE") ||
gender.equalsIgnoreCase("UNKNOWN")) {
//hit
antecedentFound = true;
if (null == bestAntecedent) {
bestAntecedent = currPerson;
}
else {
bestAntecedent = _chooseAntecedent$HE$HIM$HIS$SHE$HER$HERS$HIMSELF$HERSELF$(bestAntecedent,currPerson,pronoun);
}
}
}
if (0 == currSentenceIndex--)
break;
}
return bestAntecedent;
}
/** --- */
private Annotation _resolve$SHE$HER$HERS$HERSELF$(Annotation pronoun, int sentenceIndex) {
//0. preconditions
assert (pronoun.getType().equals(TOKEN_ANNOTATION_TYPE));
assert (pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) ||
pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY));
String pronounString = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
assert (pronounString.equalsIgnoreCase("SHE") ||
pronounString.equalsIgnoreCase("HER") ||
pronounString.equalsIgnoreCase("HERS") ||
pronounString.equalsIgnoreCase("HERSELF"));
//1.
boolean antecedentFound = false;
int scopeFirstIndex = sentenceIndex - SENTENCES_IN_SCOPE;
if (scopeFirstIndex < 0 ) scopeFirstIndex = 0;
int currSentenceIndex = sentenceIndex;
Annotation bestAntecedent = null;
while (currSentenceIndex >= scopeFirstIndex || antecedentFound == false) {
Sentence currSentence = this.textSentences[currSentenceIndex];
AnnotationSet persons = currSentence.getPersons();
Iterator it = persons.iterator();
while (it.hasNext()) {
Annotation currPerson = it.next();
String gender = this.personGender.get(currPerson);
if (null == gender ||
gender.equalsIgnoreCase("FEMALE") ||
gender.equalsIgnoreCase("UNKNOWN")) {
//hit
antecedentFound = true;
if (null == bestAntecedent) {
bestAntecedent = currPerson;
}
else {
bestAntecedent = _chooseAntecedent$HE$HIM$HIS$SHE$HER$HERS$HIMSELF$HERSELF$(bestAntecedent,currPerson,pronoun);
}
}
}
if (0 == currSentenceIndex--)
break;
}
return bestAntecedent;
}
/** --- */
private Annotation _resolve$IT$ITS$ITSELF$(Annotation pronoun, int sentenceIndex) {
//do not resolve it pronouns if disabled by the user
if (! resolveIt)
return null;
//0. preconditions
assert (pronoun.getType().equals(TOKEN_ANNOTATION_TYPE));
assert (pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) ||
pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY));
String pronounString = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
assert (pronounString.equalsIgnoreCase("IT") ||
pronounString.equalsIgnoreCase("ITS") ||
pronounString.equalsIgnoreCase("ITSELF"));
//0.5 check if the IT is pleonastic
if (pronounString.equalsIgnoreCase("IT") &&
isPleonastic(pronoun)) {
//System.out.println("PLEONASM...");
return null;
}
//1.
int scopeFirstIndex = sentenceIndex - 1;
if (scopeFirstIndex < 0 ) scopeFirstIndex = 0;
int currSentenceIndex = sentenceIndex;
Annotation bestAntecedent = null;
while (currSentenceIndex >= scopeFirstIndex) {
Sentence currSentence = this.textSentences[currSentenceIndex];
Set org_loc = currSentence.getInanimated();
Iterator it = org_loc.iterator();
while (it.hasNext()) {
Annotation currOrgLoc = it.next();
if (null == bestAntecedent) {
//discard cataphoric references
if (currOrgLoc.getStartNode().getOffset().longValue() <
pronoun.getStartNode().getOffset().longValue()) {
bestAntecedent = currOrgLoc;
}
}
else {
bestAntecedent = this._chooseAntecedent$IT$ITS$ITSELF$(bestAntecedent,currOrgLoc,pronoun);
}
}
if (0 == currSentenceIndex--)
break;
}
return bestAntecedent;
}
/** --- */
private Annotation _resolve$I$ME$MY$MINE$MYSELF$(Annotation pronoun, int sentenceIndex) {
//0. preconditions
assert (pronoun.getType().equals(TOKEN_ANNOTATION_TYPE));
assert (pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) ||
pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY));
String pronounString = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
assert (pronounString.equalsIgnoreCase("I") ||
pronounString.equalsIgnoreCase("MY") ||
pronounString.equalsIgnoreCase("ME") ||
pronounString.equalsIgnoreCase("MINE") ||
pronounString.equalsIgnoreCase("MYSELF"));
//0.5 sanity check
//if there are not quotes at all in the text then exit
if (0 == this.quotedText.length) {
//System.out.println("TEXT WITH NO QUOTES ENCOUNTERED...");
return null;
}
//1.
Annotation bestAntecedent = null;
@SuppressWarnings("unchecked")
int closestQuoteIndex = Arrays.binarySearch(this.quotedText,pronoun,ANNOTATION_OFFSET_COMPARATOR);
//normalize index
if (closestQuoteIndex < 0) {
closestQuoteIndex = -closestQuoteIndex -1 -1;
}
//still not good?
if (closestQuoteIndex < 0) {
closestQuoteIndex = 0;
}
//get closest Quote
Quote quoteContext = this.quotedText[closestQuoteIndex];
//assure that the pronoun is contained in the quoted text fragment
//otherwise exit
if (pronoun.getStartNode().getOffset().intValue() > quoteContext.getEndOffset().intValue() ||
pronoun.getEndNode().getOffset().intValue() < quoteContext.getStartOffset().intValue()) {
//oops, probably incorrect text - I/My/Me is not part of quoted text fragment
//exit
//System.out.println("Oops! ["+pronounString+"] not part of quoted fragment...");
return null;
}
//get the Persons that precede/succeed the quoted fragment
//the order is:
//
//[1]. if there exists a Person or pronoun in {he, she} following the quoted fragment but
//in the same sentence, then use it
//i.e. ["PRN1(x)...", said X ...A, B, C ....]
//
//[2]. if there is a Person (NOT a pronoun) in the same sentence,
// preceding the quote, then use it
//i.e. . [A, B, C...X ..."PRN1(x) ..."...]
//
//try [1]
//get the succeeding Persons/pronouns
Set succCandidates = quoteContext.getAntecedentCandidates(Quote.ANTEC_AFTER);
if (false == succCandidates.isEmpty()) {
//cool, we have candidates, pick up the one closest to the end quote
Iterator it = succCandidates.iterator();
while (it.hasNext()) {
Annotation currCandidate = it.next();
if (null == bestAntecedent || ANNOTATION_OFFSET_COMPARATOR.compare(bestAntecedent,currCandidate) > 0) {
//wow, we have a candidate that is closer to the quote
bestAntecedent = currCandidate;
}
}
}
//try [2]
//get the preceding Persons/pronouns
if (null == bestAntecedent) {
Set precCandidates = quoteContext.getAntecedentCandidates(Quote.ANTEC_BEFORE);
if (false == precCandidates.isEmpty()) {
//cool, we have candidates, pick up the one closest to the end quote
Iterator it = precCandidates.iterator();
while (it.hasNext()) {
Annotation currCandidate = it.next();
if (null == bestAntecedent || ANNOTATION_OFFSET_COMPARATOR.compare(bestAntecedent,currCandidate) < 0) {
//wow, we have a candidate that is closer to the quote
bestAntecedent = currCandidate;
}
}
}
}
//try [3]
//get the Persons/pronouns back in context
if (null == bestAntecedent) {
Set precCandidates = quoteContext.getAntecedentCandidates(Quote.ANTEC_BACK);
if (false == precCandidates.isEmpty()) {
//cool, we have candidates, pick up the one closest to the end quote
Iterator it = precCandidates.iterator();
while (it.hasNext()) {
Annotation currCandidate = it.next();
if (null == bestAntecedent || ANNOTATION_OFFSET_COMPARATOR.compare(bestAntecedent,currCandidate) > 0) {
//wow, we have a candidate that is closer to the quote
bestAntecedent = currCandidate;
}
}
}
}
return bestAntecedent;
}
/** --- */
@SuppressWarnings("unchecked")
private void preprocess() throws ExecutionException {
//0.5 cleanup
this.personGender.clear();
this.anaphor2antecedent.clear();
//1.get all annotation in the input set
if ( this.annotationSetName == null || this.annotationSetName.equals("")) {
this.defaultAnnotations = this.document.getAnnotations();
}
else {
this.defaultAnnotations = this.document.getAnnotations(annotationSetName);
}
//if none found, print warning and exit
if (this.defaultAnnotations == null || this.defaultAnnotations.isEmpty()) {
Err.prln("Coref Warning: No annotations found for processing!");
return;
}
// get the list of inanimated entity types
if (inanimatedEntityTypes==null||inanimatedEntityTypes.equals(""))
inanimatedEntityTypes="Organization;Location";
String[] types = inanimatedEntityTypes.split(";");
this.inanimatedSet.addAll(Arrays.asList(types));
//2.1 remove QT annotations if left from previous execution
AnnotationSet qtSet = this.defaultAnnotations.get(QUOTED_TEXT_TYPE);
if (qtSet != null && !qtSet.isEmpty()) {
this.defaultAnnotations.removeAll(qtSet);
}
//2.2. run quoted text transducer to generate "Quoted Text" annotations
Benchmark.executeWithBenchmarking(this.qtTransducer,
Benchmark.createBenchmarkId("qtTransducer",
getBenchmarkId()), this, null);
//3.1 remove pleonastic annotations if left from previous execution
AnnotationSet pleonSet = this.defaultAnnotations.get(PLEONASTIC_TYPE);
if (pleonSet != null && !pleonSet.isEmpty()) {
this.defaultAnnotations.removeAll(pleonSet);
}
//3.2 run quoted text transducer to generate "Pleonasm" annotations
Benchmark.executeWithBenchmarking(pleonTransducer,
Benchmark.createBenchmarkId("pleonTransducer",
getBenchmarkId()), this, null);
//4.get all SENTENCE annotations
AnnotationSet sentenceAnnotations = this.defaultAnnotations.get(SENTENCE_ANNOTATION_TYPE);
this.textSentences = new Sentence[sentenceAnnotations.size()];
Annotation[] sentenceArray = sentenceAnnotations.toArray(new Annotation[sentenceAnnotations.size()]);
Arrays.sort(sentenceArray,ANNOTATION_OFFSET_COMPARATOR);
for (int i=0; i< sentenceArray.length; i++) {
Annotation currSentence = sentenceArray[i];
Long sentStartOffset = currSentence.getStartNode().getOffset();
Long sentEndOffset = currSentence.getEndNode().getOffset();
AnnotationSet tempASOffsets = this.defaultAnnotations.getContained(
sentStartOffset,sentEndOffset);
//4.1. get PERSONS in this sentence
AnnotationSet sentPersons = tempASOffsets.get(PERSON_ANNOTATION_TYPE);
//4.2. get inanimated entities (ORGANIZATIONS,LOCATION) in this sentence
AnnotationSet sentInans = tempASOffsets.get(this.inanimatedSet);
//4.5. create a Sentence for the SENTENCE annotation
this.textSentences[i] = new Sentence(i,
0,
sentStartOffset,
sentEndOffset,
sentPersons,
sentInans
);
//4.6. for all PERSONs in the sentence - find their gender using the
//orthographic coreferences if the gender of some entity is unknown
Iterator itPersons = sentPersons.iterator();
while (itPersons.hasNext()) {
Annotation currPerson = itPersons.next();
String gender = this.findPersonGender(currPerson);
this.personGender.put(currPerson,gender);
}
}
//5. initialise the quoted text fragments
AnnotationSet sentQuotes = this.defaultAnnotations.get(QUOTED_TEXT_TYPE);
//if none then return
if (null == sentQuotes) {
this.quotedText = new Quote[0];
}
else {
this.quotedText = new Quote[sentQuotes.size()];
Annotation[] quotesArray = sentQuotes.toArray(new Annotation[sentQuotes.size()]);
Arrays.sort(quotesArray,ANNOTATION_OFFSET_COMPARATOR);
for (int i =0; i < quotesArray.length; i++) {
this.quotedText[i] = new Quote(quotesArray[i],i);
}
}
//6. initialuse the plonastic It annotations
AnnotationSet plaonasticSet = this.defaultAnnotations.get(PLEONASTIC_TYPE);
if (null == plaonasticSet) {
this.pleonasticIt = new Annotation[0];
}
else {
this.pleonasticIt = new Annotation[plaonasticSet.size()];
Annotation[] quotesArray = plaonasticSet.toArray(new Annotation[plaonasticSet.size()]);
Arrays.sort(quotesArray,ANNOTATION_OFFSET_COMPARATOR);
for (int i=0; i< this.pleonasticIt.length; i++) {
this.pleonasticIt[i] = quotesArray[i];
}
}
}
/** --- */
private String findPersonGender(Annotation person) {
String result = (String)person.getFeatures().get(PERSON_GENDER_FEATURE_NAME);
if (null==result) {
//gender is unknown - try to find it from the ortho coreferences
@SuppressWarnings("unchecked")
List orthoMatches = (List)person.getFeatures().get(ANNOTATION_COREF_FEATURE_NAME);
if (null != orthoMatches) {
Iterator itMatches = orthoMatches.iterator();
while (itMatches.hasNext()) {
Integer correferringID = itMatches.next();
Annotation coreferringEntity = this.defaultAnnotations.get(correferringID);
if (coreferringEntity != null) {
assert (coreferringEntity.getType().equalsIgnoreCase(PERSON_ANNOTATION_TYPE));
String correferringGender = (String)coreferringEntity.getFeatures().get(PERSON_GENDER_FEATURE_NAME);
if (null != correferringGender) {
result = correferringGender;
break;
}
}
}
}
}
return result;
}
@SuppressWarnings("rawtypes")
private static class AnnotationOffsetComparator implements Comparator, Serializable {
private static final long serialVersionUID = 4529121506801473785L;
private int _getOffset(Object o) {
if (o instanceof Annotation) {
return ((Annotation)o).getEndNode().getOffset().intValue();
}
else if (o instanceof Sentence) {
return ((Sentence)o).getStartOffset().intValue();
}
else if (o instanceof Quote) {
return ((Quote)o).getStartOffset().intValue();
}
else if (o instanceof Node) {
return ((Node)o).getOffset().intValue();
}
else {
throw new IllegalArgumentException();
}
}
@Override
public int compare(Object o1,Object o2) {
//0. preconditions
assert (null != o1);
assert (null != o2);
assert (o1 instanceof Annotation ||
o1 instanceof Sentence ||
o1 instanceof Quote ||
o1 instanceof Node);
assert (o2 instanceof Annotation ||
o2 instanceof Sentence ||
o2 instanceof Quote ||
o2 instanceof Node);
int offset1 = _getOffset(o1);
int offset2 = _getOffset(o2);
return offset1 - offset2;
}
}
/** --- */
private Annotation _chooseAntecedent$HE$HIM$HIS$SHE$HER$HERS$HIMSELF$HERSELF$(Annotation ant1, Annotation ant2, Annotation pronoun) {
//0. preconditions
assert (null != ant1);
assert (null != ant2);
assert (null != pronoun);
assert (pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) ||
pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY));
String pronounString = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
assert (pronounString.equalsIgnoreCase("SHE") ||
pronounString.equalsIgnoreCase("HER") ||
pronounString.equalsIgnoreCase("HERS") ||
pronounString.equalsIgnoreCase("HERSELF") ||
pronounString.equalsIgnoreCase("HE") ||
pronounString.equalsIgnoreCase("HIM") ||
pronounString.equalsIgnoreCase("HIS") ||
pronounString.equalsIgnoreCase("HIMSELF"));
Long offset1 = ant1.getStartNode().getOffset();
Long offset2 = ant2.getStartNode().getOffset();
Long offsetPrn = pronoun.getStartNode().getOffset();
long diff1 = offsetPrn.longValue() - offset1.longValue();
long diff2 = offsetPrn.longValue() - offset2.longValue();
// assert (diff1 != 0 && diff2 != 0);
//reject candidates that overlap with the pronoun
if (diff1 == 0) {
return ant2;
}
else if (diff2 == 0) {
return ant1;
}
//get the one CLOSEST AND PRECEDING the pronoun
if (diff1 > 0 && diff2 > 0) {
//we have [...antecedentA...AntecedentB....pronoun...] ==> choose B
if (diff1 < diff2)
return ant1;
else
return ant2;
}
else if (diff1 < 0 && diff2 < 0) {
//we have [...pronoun ...antecedentA...AntecedentB.......] ==> choose A
if (Math.abs(diff1) < Math.abs(diff2))
return ant1;
else
return ant2;
}
else {
assert (Math.abs(diff1 + diff2) < Math.abs(diff1) + Math.abs(diff2));
//we have [antecedentA...pronoun...AntecedentB] ==> choose A
if (diff1 > 0)
return ant1;
else
return ant2;
}
}
/** --- */
private Annotation _chooseAntecedent$IT$ITS$ITSELF$(Annotation ant1, Annotation ant2, Annotation pronoun) {
//0. preconditions
assert (null != ant1);
assert (null != ant2);
assert (null != pronoun);
assert (pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP_CATEGORY) ||
pronoun.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME).equals(PRP$_CATEGORY));
String pronounString = (String)pronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
assert (pronounString.equalsIgnoreCase("IT") ||
pronounString.equalsIgnoreCase("ITS") ||
pronounString.equalsIgnoreCase("ITSELF"));
Long offset1 = ant1.getStartNode().getOffset();
Long offset2 = ant2.getStartNode().getOffset();
Long offsetPrn = pronoun.getStartNode().getOffset();
long diff1 = offsetPrn.longValue() - offset1.longValue();
long diff2 = offsetPrn.longValue() - offset2.longValue();
// assert (diff1 != 0 && diff2 != 0);
//reject candidates that overlap with the pronoun
if (diff1 == 0) {
return ant2;
}
else if (diff2 == 0) {
return ant1;
}
//get the one CLOSEST AND PRECEDING the pronoun
if (diff1 > 0 && diff2 > 0) {
//we have [...antecedentA...AntecedentB....pronoun...] ==> choose B
if (diff1 < diff2)
return ant1;
else
return ant2;
}
else if (diff1 > 0){
assert (Math.abs(diff1 + diff2) < Math.abs(diff1) + Math.abs(diff2));
//we have [antecedentA...pronoun...AntecedentB] ==> choose A
return ant1;
}
else if (diff2 > 0){
assert (Math.abs(diff1 + diff2) < Math.abs(diff1) + Math.abs(diff2));
//we have [antecedentA...pronoun...AntecedentB] ==> choose A
return ant2;
}
else {
//both possible antecedents are BEHIND the anaophoric pronoun - i.e. we have either
//cataphora, or nominal antecedent, or an antecedent that is further back in scope
//in any case - discard the antecedents
return null;
}
}
/** --- */
private class Quote {
/** --- */
public static final int ANTEC_AFTER = 1;
/** --- */
public static final int ANTEC_BEFORE = 2;
/** --- */
public static final int ANTEC_BACK = 3;
/** --- */
private Set antecedentsBefore;
/** --- */
private Set antecedentsAfter;
/** --- */
private Set antecedentsBackInContext;
/** --- */
private Annotation quoteAnnotation;
/** --- */
private int quoteIndex;
/** --- */
public Quote(Annotation quoteAnnotation, int index) {
this.quoteAnnotation = quoteAnnotation;
this.quoteIndex = index;
init();
}
/** --- */
private void init() {
//0.preconditions
assert (null != textSentences);
//0.5 create a restriction for PRP pos tokens
FeatureMap prpTokenRestriction = new SimpleFeatureMapImpl();
prpTokenRestriction.put(TOKEN_CATEGORY_FEATURE_NAME,PRP_CATEGORY);
//1. generate the precPersons set
//1.1 locate the sentece containing the opening quote marks
@SuppressWarnings("unchecked")
int quoteStartPos = Arrays.binarySearch(textSentences,
this.quoteAnnotation.getStartNode(),
ANNOTATION_OFFSET_COMPARATOR);
//normalize index
int startSentenceIndex = quoteStartPos >= 0 ? quoteStartPos
: -quoteStartPos -1 -1; // blame Sun, not me
//still not good?
if (startSentenceIndex < 0) {
startSentenceIndex = 0;
}
//1.2. get the persons and restrict to these that precede the quote (i.e. not contained
//in the quote)
this.antecedentsBefore = generateAntecedentCandidates(startSentenceIndex,
this.quoteIndex,
ANTEC_BEFORE);
//2. generate the precPersonsInCOntext set
//2.1. get the persons from the sentence precedeing the sentence containing the quote start
if (startSentenceIndex > 0) {
this.antecedentsBackInContext = generateAntecedentCandidates(startSentenceIndex -1,
this.quoteIndex,
ANTEC_BACK);
}
//2. generate the succ Persons set
//2.1 locate the sentece containing the closing quote marks
@SuppressWarnings("unchecked")
int quoteEndPos = Arrays.binarySearch(textSentences,
this.quoteAnnotation.getEndNode(),
ANNOTATION_OFFSET_COMPARATOR);
//normalize it
int endSentenceIndex = quoteEndPos >= 0 ? quoteEndPos
: -quoteEndPos -1 -1; // blame Sun, not me
//still not good?
if (endSentenceIndex < 0) {
endSentenceIndex = 0;
}
this.antecedentsAfter = generateAntecedentCandidates(endSentenceIndex,
this.quoteIndex,
ANTEC_AFTER);
//generate t
}
/** --- */
private Set generateAntecedentCandidates(int sentenceNumber,
int quoteNumber ,
int mode) {
//0. preconditions
assert (sentenceNumber >=0);
assert (quoteNumber >=0);
assert (mode == Quote.ANTEC_AFTER ||
mode == Quote.ANTEC_BEFORE ||
mode == Quote.ANTEC_BACK);
//1. get sentence
Sentence sentence = textSentences[sentenceNumber];
//2. get the persons
Set antecedents = new HashSet(sentence.getPersons());
//4. now get the he/she pronouns in the relevant context
AnnotationSet annotations = null;
switch(mode) {
case ANTEC_BEFORE:
annotations = defaultAnnotations.getContained(sentence.getStartOffset(),
this.getStartOffset());
break;
case ANTEC_AFTER:
annotations = defaultAnnotations.getContained(this.getEndOffset(),
sentence.getEndOffset());
break;
case ANTEC_BACK:
annotations = defaultAnnotations.getContained(sentence.getStartOffset(),
sentence.getEndOffset());
break;
}
//4. get the pronouns
//restrict to he/she pronouns
if (null != annotations) {
AnnotationSet pronouns = annotations.get(TOKEN_ANNOTATION_TYPE,PRP_RESTRICTION);
if (null != pronouns) {
Iterator it = pronouns.iterator();
while (it.hasNext()) {
Annotation currPronoun = it.next();
//add to succPersons only if HE/SHE
String pronounString = (String)currPronoun.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
if (null != pronounString &&
(pronounString.equalsIgnoreCase("he") || pronounString.equalsIgnoreCase("she"))
)
antecedents.add(currPronoun);
}//while
}//if
}//if
//3. depending on the mode, may have to restrict persons to these that precede/succeed
//the quoted fragment
//
//for ANTEC_BEFORE, get the ones #preceding# the quote, contained in the sentence where
//the quote *starts*
//
//for ANTEC_AFTER, get the ones #succeeding# the quote, contained in the sentence where
//the quote *ends*
//
//for ANTEC_BACK, we are operating in the context of the sentence previous to the
//sentence where the quote starts. I.e. we're resolbinf a case like
// [sss "q1q1q1q1" s1s1s1s1]["q2q2q2q2"]
//...and we want to get the entities from the s1s1 part - they *succeed* the #previous# quote
//Note that the cirrent sentence is the first one, not the second
//
Iterator itPersons = antecedents.iterator();
while (itPersons.hasNext()) {
Annotation currPerson = itPersons.next();
//cut
if (Quote.ANTEC_BEFORE == mode &&
currPerson.getStartNode().getOffset().intValue() > getStartOffset().intValue()) {
//restrict only to persosn preceding
itPersons.remove();
}
else if (Quote.ANTEC_AFTER == mode &&
currPerson.getStartNode().getOffset().intValue() < getEndOffset().intValue()) {
//restrict only to persons succeeding the quote
itPersons.remove();
}
else if (Quote.ANTEC_BACK == mode) {
//this one is tricky
//locate the quote previous to the one we're resolving
//(since we're operating in the sentence previous to the quote being resolved
//wew try to find if any quote (prevQuote) exist in this sentence and get the
//persons succeeding it)
//get prev quote
//is the curr quote the first one?
if (quoteNumber >0) {
Quote prevQuote = PronominalCoref.this.quotedText[quoteNumber-1];
//restrict to the succeeding persons
if (currPerson.getStartNode().getOffset().longValue() < prevQuote.getEndOffset().longValue()) {
itPersons.remove();
}
}
}
}
return antecedents;
}
/** --- */
public Long getStartOffset() {
return this.quoteAnnotation.getStartNode().getOffset();
}
/** --- */
public Long getEndOffset() {
return this.quoteAnnotation.getEndNode().getOffset();
}
/** --- */
public Set getAntecedentCandidates(int type) {
switch(type) {
case ANTEC_AFTER:
return null != this.antecedentsAfter ?
this.antecedentsAfter :
new HashSet();
case ANTEC_BEFORE:
return null != this.antecedentsBefore ?
this.antecedentsBefore :
new HashSet();
case ANTEC_BACK:
return null != this.antecedentsBackInContext ?
this.antecedentsBackInContext :
new HashSet();
default:
throw new IllegalArgumentException();
}
}
}
/** --- */
private class Sentence {
/** --- */
@SuppressWarnings("unused")
private int sentNumber;
/** --- */
@SuppressWarnings("unused")
private int paraNumber;
/** --- */
private Long startOffset;
/** --- */
private Long endOffset;
/** --- */
private AnnotationSet persons;
/** --- */
private AnnotationSet inanimated;
/** --- */
public Sentence(int sentNumber,
int paraNumber,
Long startOffset,
Long endOffset,
AnnotationSet persons,
AnnotationSet inanimated) {
this.sentNumber = sentNumber;
this.paraNumber = paraNumber;
this.startOffset = startOffset;
this.endOffset = endOffset;
this.persons = persons;
this.inanimated = inanimated;
}
/** --- */
public Long getStartOffset() {
return this.startOffset;
}
/** --- */
public Long getEndOffset() {
return this.endOffset;
}
/** --- */
public AnnotationSet getPersons() {
return this.persons;
}
public AnnotationSet getInanimated() {
return this.inanimated;
}
}
public String getInanimatedEntityTypes() {
return inanimatedEntityTypes;
}
public void setInanimatedEntityTypes(String inanimatedEntityTypes) {
this.inanimatedEntityTypes = inanimatedEntityTypes;
}
/* (non-Javadoc)
* @see gate.util.Benchmarkable#getBenchmarkId()
*/
@Override
public String getBenchmarkId() {
if(benchmarkId == null) {
return getName();
}
else {
return benchmarkId;
}
}
/* (non-Javadoc)
* @see gate.util.Benchmarkable#setBenchmarkId(java.lang.String)
*/
@Override
public void setBenchmarkId(String benchmarkId) {
this.benchmarkId = benchmarkId;
}
}