edu.stanford.nlp.coref.md.DependencyCorefMentionFinder Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of stanford-corenlp Show documentation
Show all versions of stanford-corenlp Show documentation
Stanford CoreNLP provides a set of natural language analysis tools which can take raw English-language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher-level text understanding applications.
package edu.stanford.nlp.coref.md;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Properties;
import java.util.Set;
import edu.stanford.nlp.coref.CorefProperties;
import edu.stanford.nlp.coref.data.Dictionaries;
import edu.stanford.nlp.coref.data.Mention;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations.BasicDependenciesAnnotation;
import edu.stanford.nlp.semgraph.SemanticGraphEdge;
import edu.stanford.nlp.semgraph.SemanticGraphUtils;
import edu.stanford.nlp.trees.GrammaticalRelation;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation;
import edu.stanford.nlp.trees.UniversalEnglishGrammaticalRelations;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.IntPair;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.logging.Redwood;
public class DependencyCorefMentionFinder extends CorefMentionFinder {
/** A logger for this class. Declared final so the shared channel cannot be reassigned. */
private static final Redwood.RedwoodChannels log = Redwood.channels(DependencyCorefMentionFinder.class);
/**
 * Creates a mention finder configured from {@code props}.
 *
 * <p>The language is taken from {@code CorefProperties.getLanguage(props)}. When
 * {@code CorefProperties.isMentionDetectionTraining(props)} is true no classifier is loaded and
 * {@link #mdClassifier} stays {@code null}; otherwise the classifier is read from the location
 * returned by {@code CorefProperties.getMentionDetectionModel(props)}.
 *
 * @param props coref configuration properties
 * @throws ClassNotFoundException declared by the signature; presumably raised while reading the
 *     serialized classifier
 * @throws IOException declared by the signature; presumably raised while reading the
 *     serialized classifier
 */
public DependencyCorefMentionFinder(Properties props) throws ClassNotFoundException, IOException {
  this.lang = CorefProperties.getLanguage(props);
  if (CorefProperties.isMentionDetectionTraining(props)) {
    // training mode: no pre-trained classifier is loaded
    mdClassifier = null;
  } else {
    mdClassifier = IOUtils.readObjectFromURLOrClasspathOrFileSystem(CorefProperties.getMentionDetectionModel(props));
  }
}
/**
 * Mention-detection classifier. The constructor leaves this {@code null} when
 * {@code CorefProperties.isMentionDetectionTraining(props)} is true, and otherwise assigns the
 * object read from the location given by {@code CorefProperties.getMentionDetectionModel(props)}.
 */
public MentionDetectionClassifier mdClassifier = null;
/** Main method of mention detection.
* Extract all NP, PRP or NE, and filter out by manually written patterns.
*
* For every sentence of the document this builds a fresh mention list plus two span sets,
* then fills them from pre-marked entities, named entities and dependency-derived
* noun/pronoun heads, and finally records the sentence's named-entity strings.
*
* NOTE(review): this copy of the source lost every generic type argument, and the text
* between the head-finding loop and the header of assignMentionIDs is missing (see the
* note further down). The code is kept exactly as found; restore it from the upstream
* file before compiling.
*/
@Override
public List> findMentions(Annotation doc, Dictionaries dict, Properties props) {
// one mention list per sentence, in sentence order
List> predictedMentions = new ArrayList<>();
// named-entity strings collected across the whole document
Set neStrings = Generics.newHashSet();
// one mention-span set per sentence, in sentence order
List> mentionSpanSetList = Generics.newArrayList();
List sentences = doc.get(CoreAnnotations.SentencesAnnotation.class);
for (CoreMap s : sentences) {
List mentions = new ArrayList<>();
predictedMentions.add(mentions);
// spans already turned into mentions for this sentence (used for de-duplication)
Set mentionSpanSet = Generics.newHashSet();
// spans of named entities found in this sentence
Set namedEntitySpanSet = Generics.newHashSet();
// extraction order matters: each step sees the spans recorded by the previous ones
extractPremarkedEntityMentions(s, mentions, mentionSpanSet, namedEntitySpanSet);
HybridCorefMentionFinder.extractNamedEntityMentions(s, mentions, mentionSpanSet, namedEntitySpanSet);
extractNPorPRPFromDependency(s, mentions, mentionSpanSet, namedEntitySpanSet);
addNamedEntityStrings(s, neStrings, namedEntitySpanSet);
mentionSpanSetList.add(mentionSpanSet);
}
// extractNamedEntityModifiers(sentences, mentionSpanSetList, predictedMentions, neStrings);
// NOTE(review): the next line is truncated. It starts a per-sentence loop inside
// findMentions and ends with the parameter list of what looks like
// assignMentionIDs(predictedMentions, maxID); everything in between is missing
// from this copy and cannot be inferred from the visible code.
for(int i=0 ; i> predictedMentions, int maxID) {
// gives every mention, sentence by sentence, the next id after maxID
for(List mentions : predictedMentions) {
for(Mention m : mentions) {
m.mentionID = (++maxID);
}
}
}
/**
 * Marks bare plurals as generic: a mention whose original span is a single token and whose
 * head word carries the part-of-speech tag {@code NNS} gets {@code generic = true}.
 *
 * @param mentions mentions to inspect; matching mentions are modified in place
 */
protected static void setBarePlural(List<Mention> mentions) {
  for (Mention m : mentions) {
    String pos = m.headWord.get(CoreAnnotations.PartOfSpeechAnnotation.class);
    // constant-first comparison so a head word without a POS annotation is skipped, not an NPE
    if (m.originalSpan.size() == 1 && "NNS".equals(pos)) {
      m.generic = true;
    }
  }
}
/**
 * Uses the basic dependency graph of sentence {@code s} to find candidate mention heads and
 * extracts a mention for each of them.
 *
 * <p>Candidates are all nodes whose POS tag matches {@code N.*|PRP.*|DT}. A candidate attached
 * to its parent by a {@code det} or {@code compound} relation is skipped; every other candidate
 * (including one with no incoming edge, treated as the root) is passed to
 * {@code extractMentionForHeadword}.
 *
 * @param s sentence to process
 * @param mentions mention list of the sentence; new mentions are appended
 * @param mentionSpanSet spans already used by mentions of this sentence; updated
 * @param namedEntitySpanSet spans of named entities in this sentence
 */
private void extractNPorPRPFromDependency(CoreMap s, List<Mention> mentions, Set<IntPair> mentionSpanSet, Set<IntPair> namedEntitySpanSet) {
  SemanticGraph basic = s.get(BasicDependenciesAnnotation.class);
  List<IndexedWord> nounsOrPrp = basic.getAllNodesByPartOfSpeechPattern("N.*|PRP.*|DT"); // DT is for "this, these, etc"

  for (IndexedWord w : nounsOrPrp) {
    SemanticGraphEdge edge = basic.getEdge(basic.getParent(w), w);
    // if edge is null, it's root
    String shortname = (edge == null) ? "root" : edge.getRelation().getShortName();

    // TODO: what to remove? remove more?
    // plain equality instead of a regex: same result, no pattern compilation per node
    if ("det".equals(shortname) || "compound".equals(shortname)) {
      continue;
    }
    extractMentionForHeadword(w, basic, s, mentions, mentionSpanSet, namedEntitySpanSet);
  }
}
/**
 * Extracts the mention(s) headed by {@code headword}.
 *
 * <p>Pronoun heads (tag starting with {@code PRP}) are delegated to
 * {@code extractPronounForHeadword}. For any other head the noun-phrase span is computed with
 * {@code getNPSpan}, a trailing comma and a leading {@code IN} token are trimmed, and the span
 * is added as a mention. If the head has conjunct children, the part of the span that precedes
 * the first conjunct (skipping {@code CC} and comma tokens) is added as a second mention,
 * provided it still contains the head.
 *
 * @param headword candidate head node
 * @param dep dependency graph used to compute spans
 * @param s sentence containing the head
 * @param mentions mention list of the sentence; new mentions are appended
 * @param mentionSpanSet spans already used by mentions of this sentence; updated
 * @param namedEntitySpanSet spans of named entities in this sentence
 */
private void extractMentionForHeadword(IndexedWord headword, SemanticGraph dep, CoreMap s, List<Mention> mentions, Set<IntPair> mentionSpanSet, Set<IntPair> namedEntitySpanSet) {
  // pronoun
  if (headword.tag().startsWith("PRP")) {
    extractPronounForHeadword(headword, dep, s, mentions, mentionSpanSet, namedEntitySpanSet);
    return;
  }

  List<CoreLabel> sent = s.get(CoreAnnotations.TokensAnnotation.class);
  SemanticGraph basic = s.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
  SemanticGraph enhanced = s.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
  if (enhanced == null) {
    // no enhanced graph available: fall back to the basic one
    enhanced = basic;
  }

  // add NP mention
  IntPair npSpan = getNPSpan(headword, dep, sent);
  int beginIdx = npSpan.get(0);
  int endIdx = npSpan.get(1) + 1; // exclusive end
  if (",".equals(sent.get(endIdx - 1).word())) { endIdx--; } // try not to have span that ends with ,
  if ("IN".equals(sent.get(beginIdx).tag())) { beginIdx++; } // try to remove first IN.
  addMention(beginIdx, endIdx, headword, mentions, mentionSpanSet, namedEntitySpanSet, sent, basic, enhanced);

  //
  // extract the first element in conjunction (A and B -> extract A here "A and B", "B" will be extracted above)
  //
  Set<IndexedWord> conjChildren = dep.getChildrenWithReln(headword, UniversalEnglishGrammaticalRelations.CONJUNCT);
  if (conjChildren.isEmpty()) {
    return;
  }

  // to make sure we find the first conjunction
  IndexedWord conjChild = dep.getChildWithReln(headword, UniversalEnglishGrammaticalRelations.CONJUNCT);
  for (IndexedWord c : conjChildren) {
    if (c.index() < conjChild.index()) {
      conjChild = c;
    }
  }

  IndexedWord left = SemanticGraphUtils.leftMostChildVertice(conjChild, dep);
  // walk left from the first conjunct, past any coordinating conjunctions / commas
  for (int endIdxFirstElement = left.index() - 1; endIdxFirstElement > beginIdx; endIdxFirstElement--) {
    String tag = sent.get(endIdxFirstElement - 1).tag();
    boolean separator = "CC".equals(tag) || ",".equals(tag);
    if (!separator) {
      // only keep the shortened span if the head is still inside it
      if (headword.index() - 1 < endIdxFirstElement) {
        addMention(beginIdx, endIdxFirstElement, headword, mentions, mentionSpanSet, namedEntitySpanSet, sent, basic, enhanced);
      }
      break;
    }
  }
}
/**
 * return the left and right most node except copula relation (nsubj & cop) and some others (maybe discourse?)
 * e.g., you are the person -> return "the person"
 *
 * <p>Children of the head that come up to and including the copula child (in child-list order)
 * are ignored, as are children attached by {@code dep}, {@code discourse} or {@code punct}.
 * The span runs from the leftmost descendant of the first remaining child to the rightmost
 * descendant of the last remaining child, widened to include the head itself.
 *
 * @param headword head node of the noun phrase
 * @param dep dependency graph to walk
 * @param sent sentence tokens (not used by this method)
 * @return zero-based, inclusive {@code (begin, end)} token indices of the span
 */
private IntPair getNPSpan(IndexedWord headword, SemanticGraph dep, List<CoreLabel> sent) {
  int headwordIdx = headword.index() - 1;
  List<IndexedWord> children = dep.getChildList(headword);

  // check if we have copula relation
  IndexedWord cop = dep.getChildWithReln(headword, UniversalEnglishGrammaticalRelations.COPULA);
  int startIdx = (cop == null) ? 0 : children.indexOf(cop) + 1;

  // children which will be inside of NP
  List<IndexedWord> insideNP = Generics.newArrayList();
  for (int i = startIdx; i < children.size(); i++) {
    IndexedWord child = children.get(i);
    String relation = dep.getEdge(headword, child).getRelation().getShortName();
    // plain equality instead of a regex: same result, no pattern compilation per child
    boolean skipped = "dep".equals(relation) || "discourse".equals(relation) || "punct".equals(relation);
    if (!skipped) {
      insideNP.add(child);
    }
  }

  if (insideNP.isEmpty()) {
    return new IntPair(headwordIdx, headwordIdx); // the headword is the only word
  }

  Pair<IndexedWord, IndexedWord> firstChildLeftRight = SemanticGraphUtils.leftRightMostChildVertices(insideNP.get(0), dep);
  Pair<IndexedWord, IndexedWord> lastChildLeftRight = SemanticGraphUtils.leftRightMostChildVertices(insideNP.get(insideNP.size() - 1), dep);

  // headword can be first or last word
  int beginIdx = Math.min(headwordIdx, firstChildLeftRight.first.index() - 1);
  int endIdx = Math.max(headwordIdx, lastChildLeftRight.second.index() - 1);
  return new IntPair(beginIdx, endIdx);
}
/**
 * Earlier span computation, kept alongside {@code getNPSpan}; no caller is visible in this file.
 *
 * <p>Without a copula child the span covers the leftmost to rightmost descendant of the head
 * (widened to include the head). With a copula child the span begins at the leftmost descendant
 * of the child that follows the copula in the child list, or at the head when the copula is the
 * last child.
 *
 * @param headword head node of the noun phrase
 * @param dep dependency graph to walk
 * @param sent sentence tokens (not used by this method)
 * @return zero-based, inclusive {@code (begin, end)} token indices of the span
 */
private IntPair getNPSpanOld(IndexedWord headword, SemanticGraph dep, List<CoreLabel> sent) {
  int headwordIdx = headword.index() - 1;
  IndexedWord cop = dep.getChildWithReln(headword, UniversalEnglishGrammaticalRelations.COPULA);
  Pair<IndexedWord, IndexedWord> leftRight = SemanticGraphUtils.leftRightMostChildVertices(headword, dep);

  // headword can be first or last word
  int beginIdx = Math.min(headwordIdx, leftRight.first.index() - 1);
  int endIdx = Math.max(headwordIdx, leftRight.second.index() - 1);

  // no copula relation
  if (cop == null) {
    return new IntPair(beginIdx, endIdx);
  }

  // if we have copula relation
  List<IndexedWord> children = dep.getChildList(headword);
  int copIdx = children.indexOf(cop);
  if (copIdx + 1 < children.size()) {
    beginIdx = Math.min(headwordIdx, SemanticGraphUtils.leftMostChildVertice(children.get(copIdx + 1), dep).index() - 1);
  } else {
    beginIdx = headwordIdx;
  }
  return new IntPair(beginIdx, endIdx);
}
/**
 * Adds a mention for the token span {@code [beginIdx, endIdx)} unless that span is already in
 * {@code mentionSpanSet} or {@code insideNE} reports it as lying inside a named entity.
 *
 * <p>The new mention is created with a placeholder id of {@code -1}; its head index, head word
 * and lower-cased head string are taken from {@code headword}.
 *
 * @param beginIdx zero-based index of the first token of the span
 * @param endIdx zero-based index one past the last token of the span
 * @param headword dependency node of the mention head (its index is one-based)
 * @param mentions mention list of the sentence; the new mention is appended
 * @param mentionSpanSet spans already used by mentions of this sentence; updated
 * @param namedEntitySpanSet spans of named entities in this sentence
 * @param sent tokens of the sentence
 * @param basic basic dependency graph passed on to the mention
 * @param enhanced enhanced dependency graph passed on to the mention
 */
private void addMention(int beginIdx, int endIdx, IndexedWord headword, List<Mention> mentions, Set<IntPair> mentionSpanSet, Set<IntPair> namedEntitySpanSet, List<CoreLabel> sent, SemanticGraph basic, SemanticGraph enhanced) {
  IntPair mSpan = new IntPair(beginIdx, endIdx);
  if (mentionSpanSet.contains(mSpan) || insideNE(mSpan, namedEntitySpanSet)) {
    return; // duplicate span, or span nested in a named entity
  }

  int dummyMentionId = -1;
  // copy the sub-list so the mention does not hold a view onto the sentence token list
  Mention m = new Mention(dummyMentionId, beginIdx, endIdx, sent, basic, enhanced, new ArrayList<>(sent.subList(beginIdx, endIdx)));
  m.headIndex = headword.index() - 1;
  m.headWord = sent.get(m.headIndex);
  m.headString = m.headWord.word().toLowerCase(Locale.ENGLISH);
  mentions.add(m);
  mentionSpanSet.add(mSpan);
}
/**
 * Extracts the mention(s) headed by a pronoun.
 *
 * <p>The basic mention is the pronoun token itself, extended by one token when the next word is
 * "all" or "both" and the graph has an edge from the pronoun to it. When the pronoun has
 * conjunct children (e.g., "you and I"), the whole span returned by {@code getNPSpan} is added
 * as a further mention, with a trailing comma trimmed.
 *
 * @param headword pronoun node
 * @param dep dependency graph used to compute spans
 * @param s sentence containing the pronoun
 * @param mentions mention list of the sentence; new mentions are appended
 * @param mentionSpanSet spans already used by mentions of this sentence; updated
 * @param namedEntitySpanSet spans of named entities in this sentence
 */
private void extractPronounForHeadword(IndexedWord headword, SemanticGraph dep, CoreMap s, List<Mention> mentions, Set<IntPair> mentionSpanSet, Set<IntPair> namedEntitySpanSet) {
  List<CoreLabel> sent = s.get(CoreAnnotations.TokensAnnotation.class);
  SemanticGraph basic = s.get(SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class);
  SemanticGraph enhanced = s.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
  if (enhanced == null) {
    // no enhanced graph available: fall back to the basic one
    enhanced = basic;
  }

  int beginIdx = headword.index() - 1;
  int endIdx = headword.index();

  // handle "you all", "they both" etc
  if (sent.size() > headword.index()) {
    String nextWord = sent.get(headword.index()).word();
    if ("all".equals(nextWord) || "both".equals(nextWord)) {
      // the "safe" lookup returns null instead of throwing when the token has no graph node
      IndexedWord c = dep.getNodeByIndexSafe(headword.index() + 1);
      if (c != null && dep.getEdge(headword, c) != null) {
        endIdx++;
      }
    }
  }

  // same duplicate / inside-NE checks and mention construction as for noun phrases
  addMention(beginIdx, endIdx, headword, mentions, mentionSpanSet, namedEntitySpanSet, sent, basic, enhanced);

  // when pronoun is a part of conjunction (e.g., you and I)
  Set<IndexedWord> conjChildren = dep.getChildrenWithReln(headword, UniversalEnglishGrammaticalRelations.CONJUNCT);
  if (!conjChildren.isEmpty()) {
    IntPair npSpan = getNPSpan(headword, dep, sent);
    beginIdx = npSpan.get(0);
    endIdx = npSpan.get(1) + 1;
    if (",".equals(sent.get(endIdx - 1).word())) { endIdx--; } // try not to have span that ends with ,
    addMention(beginIdx, endIdx, headword, mentions, mentionSpanSet, namedEntitySpanSet, sent, basic, enhanced);
  }
}
/**
 * Runs {@link #findHeadInDependency(CoreMap, Mention)} on every mention of a sentence.
 *
 * @param s sentence the mentions belong to
 * @param mentions mentions whose head fields should be filled in; modified in place
 */
public static void findHeadInDependency(CoreMap s, List<Mention> mentions) {
  for (Mention m : mentions) {
    findHeadInDependency(s, m);
  }
}
/**
 * Fills in the head fields of every mention of sentence {@code s} using the dependency-based
 * head finder of this class.
 *
 * @param s sentence the mentions belong to
 * @param mentions mentions whose head fields should be filled in; modified in place
 */
@Override
public void findHead(CoreMap s, List<Mention> mentions) {
  // same per-mention loop as the static list overload, so reuse it
  findHeadInDependency(s, mentions);
}
// TODO: still errors in head finder
/**
 * Sets {@code headIndex}, {@code headWord} and {@code headString} of {@code m} from the basic
 * dependency graph of sentence {@code s}. Mentions that already have a head word are left
 * untouched.
 *
 * <p>The search starts at the last token of the mention that has a node in the graph and
 * climbs to the parent for as long as the parent is still inside the mention span. If no token
 * of the mention has a node, the last token of the mention is used as the head.
 *
 * @param s sentence the mention belongs to
 * @param m mention to update in place
 */
public static void findHeadInDependency(CoreMap s, Mention m) {
  if (m.headWord != null) {
    return; // head already assigned
  }

  List<CoreLabel> sent = s.get(CoreAnnotations.TokensAnnotation.class);
  SemanticGraph basicDep = s.get(BasicDependenciesAnnotation.class);

  // when there's punctuation, no node found in the dependency tree
  int curIdx;
  IndexedWord cur = null;
  for (curIdx = m.endIndex - 1; curIdx >= m.startIndex; curIdx--) {
    cur = basicDep.getNodeByIndexSafe(curIdx + 1); // graph indices are one-based
    if (cur != null) {
      break;
    }
  }
  if (cur == null) {
    // nothing in the span is in the graph: fall back to the last token
    curIdx = m.endIndex - 1;
  }

  // climb towards the root while the parent stays inside [startIndex, endIndex)
  while (cur != null) {
    IndexedWord p = basicDep.getParent(cur);
    if (p == null || p.index() - 1 < m.startIndex || p.index() - 1 >= m.endIndex) {
      break;
    }
    curIdx = p.index() - 1;
    cur = basicDep.getNodeByIndexSafe(curIdx + 1);
  }

  m.headIndex = curIdx;
  m.headWord = sent.get(m.headIndex);
  m.headString = m.headWord.word().toLowerCase(Locale.ENGLISH);
}
}