edu.stanford.nlp.dcoref.RuleBasedCorefMentionFinder Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of stanford-corenlp Show documentation
Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.
package edu.stanford.nlp.dcoref;
import edu.stanford.nlp.trees.*;
import edu.stanford.nlp.util.logging.Redwood;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.MultiTokenTag;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.parser.common.ParserAnnotations;
import edu.stanford.nlp.parser.common.ParserConstraint;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.AnnotationPipeline;
import edu.stanford.nlp.pipeline.Annotator;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
import edu.stanford.nlp.trees.tregex.TregexMatcher;
import edu.stanford.nlp.trees.tregex.TregexPattern;
import edu.stanford.nlp.util.*;
public class RuleBasedCorefMentionFinder implements CorefMentionFinder {

  /** A logger for this class. */
  private static final Redwood.RedwoodChannels log = Redwood.channels(RuleBasedCorefMentionFinder.class);

  /** Whether {@link #extractPredictedMentions} assigns fresh mention IDs. */
  protected boolean assignIds = true;
  // protected int maxID = -1;

  /** Head finder used to pick the syntactic head of each mention span. */
  private final HeadFinder headFinder;

  /** Lazily initialized parser (see {@link #getParser()}) used when reparsing mention extents. */
  protected Annotator parserProcessor;

  /** Whether a mention extent may be reparsed when no exact tree span matches it. */
  private final boolean allowReparsing;

  public RuleBasedCorefMentionFinder() {
    this(Constants.ALLOW_REPARSING);
  }

  public RuleBasedCorefMentionFinder(boolean allowReparsing) {
    SieveCoreferenceSystem.logger.fine("Using SEMANTIC HEAD FINDER!!!!!!!!!!!!!!!!!!!");
    this.headFinder = new SemanticHeadFinder();
    this.allowReparsing = allowReparsing;
  }
/** When mention boundaries are given */
public List> filterPredictedMentions(List> allGoldMentions, Annotation doc, Dictionaries dict){
List> predictedMentions = new ArrayList<>();
for(int i = 0 ; i < allGoldMentions.size(); i++){
CoreMap s = doc.get(CoreAnnotations.SentencesAnnotation.class).get(i);
List goldMentions = allGoldMentions.get(i);
List mentions = new ArrayList<>();
predictedMentions.add(mentions);
mentions.addAll(goldMentions);
findHead(s, mentions);
// todo [cdm 2013]: This block seems to do nothing - the two sets are never used
Set mentionSpanSet = Generics.newHashSet();
Set namedEntitySpanSet = Generics.newHashSet();
for(Mention m : mentions) {
mentionSpanSet.add(new IntPair(m.startIndex, m.endIndex));
if(!m.headWord.get(CoreAnnotations.NamedEntityTagAnnotation.class).equals("O")) {
namedEntitySpanSet.add(new IntPair(m.startIndex, m.endIndex));
}
}
setBarePlural(mentions);
removeSpuriousMentions(s, mentions, dict);
}
return predictedMentions;
}
/** Main method of mention detection.
* Extract all NP, PRP or NE, and filter out by manually written patterns.
*/
@Override
public List> extractPredictedMentions(Annotation doc, int maxID, Dictionaries dict) {
// this.maxID = _maxID;
List> predictedMentions = new ArrayList<>();
for (CoreMap s : doc.get(CoreAnnotations.SentencesAnnotation.class)) {
List mentions = new ArrayList<>();
predictedMentions.add(mentions);
Set mentionSpanSet = Generics.newHashSet();
Set namedEntitySpanSet = Generics.newHashSet();
extractPremarkedEntityMentions(s, mentions, mentionSpanSet, namedEntitySpanSet);
extractNamedEntityMentions(s, mentions, mentionSpanSet, namedEntitySpanSet);
extractNPorPRP(s, mentions, mentionSpanSet, namedEntitySpanSet);
extractEnumerations(s, mentions, mentionSpanSet, namedEntitySpanSet);
findHead(s, mentions);
setBarePlural(mentions);
removeSpuriousMentions(s, mentions, dict);
}
// assign mention IDs
if(assignIds) assignMentionIDs(predictedMentions, maxID);
return predictedMentions;
}
protected static void assignMentionIDs(List> predictedMentions, int maxID) {
for(List mentions : predictedMentions) {
for(Mention m : mentions) {
m.mentionID = (++maxID);
}
}
}
protected static void setBarePlural(List mentions) {
for (Mention m : mentions) {
String pos = m.headWord.get(CoreAnnotations.PartOfSpeechAnnotation.class);
if(m.originalSpan.size()==1 && pos.equals("NNS")) m.generic = true;
}
}
protected static void extractPremarkedEntityMentions(CoreMap s, List mentions, Set mentionSpanSet, Set namedEntitySpanSet) {
List sent = s.get(CoreAnnotations.TokensAnnotation.class);
SemanticGraph dependency = s.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
int beginIndex = -1;
for(CoreLabel w : sent) {
MultiTokenTag t = w.get(CoreAnnotations.MentionTokenAnnotation.class);
if (t != null) {
// Part of a mention
if (t.isStart()) {
// Start of mention
beginIndex = w.get(CoreAnnotations.IndexAnnotation.class) - 1;
}
if (t.isEnd()) {
// end of mention
int endIndex = w.get(CoreAnnotations.IndexAnnotation.class);
if (beginIndex >= 0) {
IntPair mSpan = new IntPair(beginIndex, endIndex);
int dummyMentionId = -1;
Mention m = new Mention(dummyMentionId, beginIndex, endIndex, dependency, new ArrayList<>(sent.subList(beginIndex, endIndex)));
mentions.add(m);
mentionSpanSet.add(mSpan);
beginIndex = -1;
} else {
SieveCoreferenceSystem.logger.warning("Start of marked mention not found in sentence: "
+ t + " at tokenIndex=" + (w.get(CoreAnnotations.IndexAnnotation.class)-1)+ " for "
+ s.get(CoreAnnotations.TextAnnotation.class));
}
}
}
}
}
protected static void extractNamedEntityMentions(CoreMap s, List mentions, Set mentionSpanSet, Set namedEntitySpanSet) {
List sent = s.get(CoreAnnotations.TokensAnnotation.class);
SemanticGraph dependency = s.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
String preNE = "O";
int beginIndex = -1;
for(CoreLabel w : sent) {
String nerString = w.get(CoreAnnotations.NamedEntityTagAnnotation.class);
if(!nerString.equals(preNE)) {
int endIndex = w.get(CoreAnnotations.IndexAnnotation.class) - 1;
if(!preNE.matches("O|QUANTITY|CARDINAL|PERCENT|DATE|DURATION|TIME|SET")){
if(w.get(CoreAnnotations.TextAnnotation.class).equals("'s")) endIndex++;
IntPair mSpan = new IntPair(beginIndex, endIndex);
// Need to check if beginIndex < endIndex because, for
// example, there could be a 's mislabeled by the NER and
// attached to the previous NER by the earlier heuristic
if(beginIndex < endIndex && !mentionSpanSet.contains(mSpan)) {
int dummyMentionId = -1;
Mention m = new Mention(dummyMentionId, beginIndex, endIndex, dependency, new ArrayList<>(sent.subList(beginIndex, endIndex)));
mentions.add(m);
mentionSpanSet.add(mSpan);
namedEntitySpanSet.add(mSpan);
}
}
beginIndex = endIndex;
preNE = nerString;
}
}
// NE at the end of sentence
if(!preNE.matches("O|QUANTITY|CARDINAL|PERCENT|DATE|DURATION|TIME|SET")) {
IntPair mSpan = new IntPair(beginIndex, sent.size());
if(!mentionSpanSet.contains(mSpan)) {
int dummyMentionId = -1;
Mention m = new Mention(dummyMentionId, beginIndex, sent.size(), dependency, new ArrayList<>(sent.subList(beginIndex, sent.size())));
mentions.add(m);
mentionSpanSet.add(mSpan);
namedEntitySpanSet.add(mSpan);
}
}
}
private static final TregexPattern npOrPrpMentionPattern = TregexPattern.compile("/^(?:NP|PRP)/");
protected static void extractNPorPRP(CoreMap s, List mentions, Set mentionSpanSet, Set namedEntitySpanSet) {
List sent = s.get(CoreAnnotations.TokensAnnotation.class);
Tree tree = s.get(TreeCoreAnnotations.TreeAnnotation.class);
tree.indexLeaves();
SemanticGraph dependency = s.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
TregexPattern tgrepPattern = npOrPrpMentionPattern;
TregexMatcher matcher = tgrepPattern.matcher(tree);
while (matcher.find()) {
Tree t = matcher.getMatch();
List mLeaves = t.getLeaves();
int beginIdx = ((CoreLabel)mLeaves.get(0).label()).get(CoreAnnotations.IndexAnnotation.class)-1;
int endIdx = ((CoreLabel)mLeaves.get(mLeaves.size()-1).label()).get(CoreAnnotations.IndexAnnotation.class);
if (",".equals(sent.get(endIdx-1).word())) { endIdx--; } // try not to have span that ends with ,
IntPair mSpan = new IntPair(beginIdx, endIdx);
if(!mentionSpanSet.contains(mSpan) && !insideNE(mSpan, namedEntitySpanSet)) {
int dummyMentionId = -1;
Mention m = new Mention(dummyMentionId, beginIdx, endIdx, dependency, new ArrayList<>(sent.subList(beginIdx, endIdx)), t);
mentions.add(m);
mentionSpanSet.add(mSpan);
}
}
}
/** Extract enumerations (A, B, and C) */
private static final TregexPattern enumerationsMentionPattern = TregexPattern.compile("NP < (/^(?:NP|NNP|NML)/=m1 $.. (/^CC|,/ $.. /^(?:NP|NNP|NML)/=m2))");
protected static void extractEnumerations(CoreMap s, List mentions, Set mentionSpanSet, Set namedEntitySpanSet) {
List sent = s.get(CoreAnnotations.TokensAnnotation.class);
Tree tree = s.get(TreeCoreAnnotations.TreeAnnotation.class);
SemanticGraph dependency = s.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
TregexPattern tgrepPattern = enumerationsMentionPattern;
TregexMatcher matcher = tgrepPattern.matcher(tree);
Map spanToMentionSubTree = Generics.newHashMap();
while (matcher.find()) {
matcher.getMatch();
Tree m1 = matcher.getNode("m1");
Tree m2 = matcher.getNode("m2");
List mLeaves = m1.getLeaves();
int beginIdx = ((CoreLabel)mLeaves.get(0).label()).get(CoreAnnotations.IndexAnnotation.class)-1;
int endIdx = ((CoreLabel)mLeaves.get(mLeaves.size()-1).label()).get(CoreAnnotations.IndexAnnotation.class);
spanToMentionSubTree.put(new IntPair(beginIdx, endIdx), m1);
mLeaves = m2.getLeaves();
beginIdx = ((CoreLabel)mLeaves.get(0).label()).get(CoreAnnotations.IndexAnnotation.class)-1;
endIdx = ((CoreLabel)mLeaves.get(mLeaves.size()-1).label()).get(CoreAnnotations.IndexAnnotation.class);
spanToMentionSubTree.put(new IntPair(beginIdx, endIdx), m2);
}
for(IntPair mSpan : spanToMentionSubTree.keySet()){
if(!mentionSpanSet.contains(mSpan) && !insideNE(mSpan, namedEntitySpanSet)) {
int dummyMentionId = -1;
Mention m = new Mention(dummyMentionId, mSpan.get(0), mSpan.get(1), dependency,
new ArrayList<>(sent.subList(mSpan.get(0), mSpan.get(1))), spanToMentionSubTree.get(mSpan));
mentions.add(m);
mentionSpanSet.add(mSpan);
}
}
}
/** Check whether a mention is inside of a named entity */
private static boolean insideNE(IntPair mSpan, Set namedEntitySpanSet) {
for (IntPair span : namedEntitySpanSet){
if(span.get(0) <= mSpan.get(0) && mSpan.get(1) <= span.get(1)) return true;
}
return false;
}
protected void findHead(CoreMap s, List mentions) {
Tree tree = s.get(TreeCoreAnnotations.TreeAnnotation.class);
List sent = s.get(CoreAnnotations.TokensAnnotation.class);
tree.indexSpans(0);
for (Mention m : mentions){
Tree head = findSyntacticHead(m, tree, sent);
m.headIndex = ((CoreLabel) head.label()).get(CoreAnnotations.IndexAnnotation.class)-1;
m.headWord = sent.get(m.headIndex);
m.headString = m.headWord.get(CoreAnnotations.TextAnnotation.class).toLowerCase(Locale.ENGLISH);
int start = m.headIndex - m.startIndex;
if (start < 0 || start >= m.originalSpan.size()) {
SieveCoreferenceSystem.logger.warning("Invalid index for head " + start + "=" + m.headIndex + "-" + m.startIndex
+ ": originalSpan=[" + StringUtils.joinWords(m.originalSpan, " ") + "], head=" + m.headWord);
SieveCoreferenceSystem.logger.warning("Setting head string to entire mention");
m.headIndex = m.startIndex;
m.headWord = m.originalSpan.size() > 0 ? m.originalSpan.get(0) : sent.get(m.startIndex);
m.headString = m.originalSpan.toString();
}
}
}
protected Tree findSyntacticHead(Mention m, Tree root, List tokens) {
// mention ends with 's
int endIdx = m.endIndex;
if (m.originalSpan.size() > 0) {
String lastWord = m.originalSpan.get(m.originalSpan.size()-1).get(CoreAnnotations.TextAnnotation.class);
if((lastWord.equals("'s") || lastWord.equals("'"))
&& m.originalSpan.size() != 1 ) endIdx--;
}
Tree exactMatch = findTreeWithSpan(root, m.startIndex, endIdx);
//
// found an exact match
//
if (exactMatch != null) {
return safeHead(exactMatch, endIdx);
}
// no exact match found
// in this case, we parse the actual extent of the mention, embedded in a sentence
// context, so as to make the parser work better :-)
if (allowReparsing) {
int approximateness = 0;
List extentTokens = new ArrayList<>();
extentTokens.add(initCoreLabel("It"));
extentTokens.add(initCoreLabel("was"));
final int ADDED_WORDS = 2;
for (int i = m.startIndex; i < endIdx; i++) {
// Add everything except separated dashes! The separated dashes mess with the parser too badly.
CoreLabel label = tokens.get(i);
if ( ! "-".equals(label.word())) {
// necessary to copy tokens in case the parser does things like
// put new indices on the tokens
extentTokens.add((CoreLabel) label.labelFactory().newLabel(label));
} else {
approximateness++;
}
}
extentTokens.add(initCoreLabel("."));
// constrain the parse to the part we're interested in.
// Starting from ADDED_WORDS comes from skipping "It was".
// -1 to exclude the period.
// We now let it be any kind of nominal constituent, since there
// are VP and S ones
ParserConstraint constraint = new ParserConstraint(ADDED_WORDS, extentTokens.size() - 1, Pattern.compile(".*"));
List constraints = Collections.singletonList(constraint);
Tree tree = parse(extentTokens, constraints);
convertToCoreLabels(tree); // now unnecessary, as parser uses CoreLabels?
tree.indexSpans(m.startIndex - ADDED_WORDS); // remember it has ADDED_WORDS extra words at the beginning
Tree subtree = findPartialSpan(tree, m.startIndex);
// There was a possible problem that with a crazy parse, extentHead could be one of the added words, not a real word!
// Now we make sure in findPartialSpan that it can't be before the real start, and in safeHead, we disallow something
// passed the right end (that is, just that final period).
Tree extentHead = safeHead(subtree, endIdx);
assert(extentHead != null);
// extentHead is a child in the local extent parse tree. we need to find the corresponding node in the main tree
// Because we deleted dashes, it's index will be >= the index in the extent parse tree
CoreLabel l = (CoreLabel) extentHead.label();
Tree realHead = funkyFindLeafWithApproximateSpan(root, l.value(), l.get(CoreAnnotations.BeginIndexAnnotation.class), approximateness);
assert(realHead != null);
return realHead;
}
// If reparsing wasn't allowed, try to find a span in the tree
// which happens to have the head
Tree wordMatch = findTreeWithSmallestSpan(root, m.startIndex, endIdx);
if (wordMatch != null) {
Tree head = safeHead(wordMatch, endIdx);
if (head != null) {
int index = ((CoreLabel) head.label()).get(CoreAnnotations.IndexAnnotation.class)-1;
if (index >= m.startIndex && index < endIdx) {
return head;
}
}
}
// If that didn't work, guess that it's the last word
int lastNounIdx = endIdx-1;
for(int i=m.startIndex ; i < m.endIndex ; i++) {
if(tokens.get(i).tag().startsWith("N")) lastNounIdx = i;
else if(tokens.get(i).tag().startsWith("W")) break;
}
List leaves = root.getLeaves();
Tree endLeaf = leaves.get(lastNounIdx);
return endLeaf;
}
/** Find the tree that covers the portion of interest. */
private static Tree findPartialSpan(final Tree root, final int start) {
CoreLabel label = (CoreLabel) root.label();
int startIndex = label.get(CoreAnnotations.BeginIndexAnnotation.class);
if (startIndex == start) {
return root;
}
for (Tree kid : root.children()) {
CoreLabel kidLabel = (CoreLabel) kid.label();
int kidStart = kidLabel.get(CoreAnnotations.BeginIndexAnnotation.class);
int kidEnd = kidLabel.get(CoreAnnotations.EndIndexAnnotation.class);
if (kidStart <= start && kidEnd > start) {
return findPartialSpan(kid, start);
}
}
throw new RuntimeException("Shouldn't happen: " + start + " " + root);
}
private static Tree funkyFindLeafWithApproximateSpan(Tree root, String token, int index, int approximateness) {
// log.info("Searching " + root + "\n for " + token + " at position " + index + " (plus up to " + approximateness + ")");
List leaves = root.getLeaves();
for (Tree leaf : leaves) {
CoreLabel label = CoreLabel.class.cast(leaf.label());
Integer indexInteger = label.get(CoreAnnotations.IndexAnnotation.class);
if (indexInteger == null) continue;
int ind = indexInteger - 1;
if (token.equals(leaf.value()) && ind >= index && ind <= index + approximateness) {
return leaf;
}
}
// this shouldn't happen
// throw new RuntimeException("RuleBasedCorefMentionFinder: ERROR: Failed to find head token");
SieveCoreferenceSystem.logger.warning("RuleBasedCorefMentionFinder: Failed to find head token:\n" +
"Tree is: " + root + "\n" +
"token = |" + token + "|" + index + "|, approx=" + approximateness);
for (Tree leaf : leaves) {
if (token.equals(leaf.value())) {
//log.info("Found something: returning " + leaf);
return leaf;
}
}
int fallback = Math.max(0, leaves.size() - 2);
SieveCoreferenceSystem.logger.warning("RuleBasedCorefMentionFinder: Last resort: returning as head: " + leaves.get(fallback));
return leaves.get(fallback); // last except for the added period.
}
private static CoreLabel initCoreLabel(String token) {
CoreLabel label = new CoreLabel();
label.set(CoreAnnotations.TextAnnotation.class, token);
label.set(CoreAnnotations.ValueAnnotation.class, token);
return label;
}
private Tree parse(List tokens) {
return parse(tokens, null);
}
private Tree parse(List tokens,
List constraints) {
CoreMap sent = new Annotation("");
sent.set(CoreAnnotations.TokensAnnotation.class, tokens);
sent.set(ParserAnnotations.ConstraintAnnotation.class, constraints);
Annotation doc = new Annotation("");
List sents = new ArrayList<>(1);
sents.add(sent);
doc.set(CoreAnnotations.SentencesAnnotation.class, sents);
getParser().annotate(doc);
sents = doc.get(CoreAnnotations.SentencesAnnotation.class);
return sents.get(0).get(TreeCoreAnnotations.TreeAnnotation.class);
}
private Annotator getParser() {
if(parserProcessor == null){
Annotator parser = StanfordCoreNLP.getExistingAnnotator("parse");
if (parser == null) {
// TODO: these assertions rule out the possibility of alternately named parse/pos annotators
throw new AssertionError("Failed to get parser - this should not be possible");
}
if (parser.requires().contains(CoreAnnotations.PartOfSpeechAnnotation.class)) {
Annotator tagger = StanfordCoreNLP.getExistingAnnotator("pos");
if (tagger == null) {
throw new AssertionError("Parser required tagger, but failed to find the pos annotator");
}
List annotators = Generics.newArrayList();
annotators.add(tagger);
annotators.add(parser);
parserProcessor = new AnnotationPipeline(annotators);
} else {
parserProcessor = parser;
}
}
return parserProcessor;
}
// This probably isn't needed now; everything is always a core label. But no-op.
private static void convertToCoreLabels(Tree tree) {
Label l = tree.label();
if (! (l instanceof CoreLabel)) {
CoreLabel cl = new CoreLabel();
cl.setValue(l.value());
tree.setLabel(cl);
}
for (Tree kid : tree.children()) {
convertToCoreLabels(kid);
}
}
private Tree safeHead(Tree top, int endIndex) {
// The trees passed in do not have the CoordinationTransformer
// applied, but that just means the SemanticHeadFinder results are
// slightly worse.
Tree head = top.headTerminal(headFinder);
// One obscure failure case is that the added period becomes the head. Disallow this.
if (head != null) {
Integer headIndexInteger = ((CoreLabel) head.label()).get(CoreAnnotations.IndexAnnotation.class);
if (headIndexInteger != null) {
int headIndex = headIndexInteger - 1;
if (headIndex < endIndex) {
return head;
}
}
}
// if no head found return the right-most leaf
List leaves = top.getLeaves();
int candidate = leaves.size() - 1;
while (candidate >= 0) {
head = leaves.get(candidate);
Integer headIndexInteger = ((CoreLabel) head.label()).get(CoreAnnotations.IndexAnnotation.class);
if (headIndexInteger != null) {
int headIndex = headIndexInteger - 1;
if (headIndex < endIndex) {
return head;
}
}
candidate--;
}
// fallback: return top
return top;
}
static Tree findTreeWithSmallestSpan(Tree tree, int start, int end) {
List leaves = tree.getLeaves();
Tree startLeaf = leaves.get(start);
Tree endLeaf = leaves.get(end - 1);
return Trees.getLowestCommonAncestor(Arrays.asList(startLeaf, endLeaf), tree);
}
private static Tree findTreeWithSpan(Tree tree, int start, int end) {
CoreLabel l = (CoreLabel) tree.label();
if (l != null && l.containsKey(CoreAnnotations.BeginIndexAnnotation.class) && l.containsKey(CoreAnnotations.EndIndexAnnotation.class)) {
int myStart = l.get(CoreAnnotations.BeginIndexAnnotation.class);
int myEnd = l.get(CoreAnnotations.EndIndexAnnotation.class);
if (start == myStart && end == myEnd){
// found perfect match
return tree;
} else if (end < myStart) {
return null;
} else if (start >= myEnd) {
return null;
}
}
// otherwise, check inside children - a match is possible
for (Tree kid : tree.children()) {
if (kid == null) continue;
Tree ret = findTreeWithSpan(kid, start, end);
// found matching child
if (ret != null) return ret;
}
// no match
return null;
}
/** Filter out all spurious mentions */
protected static void removeSpuriousMentions(CoreMap s, List mentions, Dictionaries dict) {
Tree tree = s.get(TreeCoreAnnotations.TreeAnnotation.class);
List sent = s.get(CoreAnnotations.TokensAnnotation.class);
Set remove = Generics.newHashSet();
for(Mention m : mentions){
String headPOS = m.headWord.get(CoreAnnotations.PartOfSpeechAnnotation.class);
String headNE = m.headWord.get(CoreAnnotations.NamedEntityTagAnnotation.class);
// pleonastic it
if(isPleonastic(m, tree)) remove.add(m);
// non word such as 'hmm'
if(dict.nonWords.contains(m.headString)) remove.add(m);
// quantRule : not starts with 'any', 'all' etc
if (m.originalSpan.size() > 0 && dict.quantifiers.contains(m.originalSpan.get(0).get(CoreAnnotations.TextAnnotation.class).toLowerCase(Locale.ENGLISH))) remove.add(m);
// partitiveRule
if (partitiveRule(m, sent, dict)) remove.add(m);
// bareNPRule
if (headPOS.equals("NN") && !dict.temporals.contains(m.headString)
&& (m.originalSpan.size()==1 || m.originalSpan.get(0).get(CoreAnnotations.PartOfSpeechAnnotation.class).equals("JJ"))) {
remove.add(m);
}
// remove generic rule
// if(m.generic==true) remove.add(m);
if (m.headString.equals("%")) remove.add(m);
if (headNE.equals("PERCENT") || headNE.equals("MONEY")) remove.add(m);
// adjective form of nations
if (dict.isAdjectivalDemonym(m.spanToString())) remove.add(m);
// stop list (e.g., U.S., there)
if (inStopList(m)) remove.add(m);
}
// nested mention with shared headword (except apposition, enumeration): pick larger one
for (Mention m1 : mentions){
for (Mention m2 : mentions){
if (m1==m2 || remove.contains(m1) || remove.contains(m2)) continue;
if (m1.sentNum==m2.sentNum && m1.headWord==m2.headWord && m2.insideIn(m1)) {
if (m2.endIndex < sent.size() && (sent.get(m2.endIndex).get(CoreAnnotations.PartOfSpeechAnnotation.class).equals(",")
|| sent.get(m2.endIndex).get(CoreAnnotations.PartOfSpeechAnnotation.class).equals("CC"))) {
continue;
}
remove.add(m2);
}
}
}
mentions.removeAll(remove);
}
private static boolean inStopList(Mention m) {
String mentionSpan = m.spanToString().toLowerCase(Locale.ENGLISH);
if (mentionSpan.equals("u.s.") || mentionSpan.equals("u.k.")
|| mentionSpan.equals("u.s.s.r")) return true;
if (mentionSpan.equals("there") || mentionSpan.startsWith("etc.")
|| mentionSpan.equals("ltd.")) return true;
if (mentionSpan.startsWith("'s ")) return true;
if (mentionSpan.endsWith("etc.")) return true;
return false;
}
private static boolean partitiveRule(Mention m, List sent, Dictionaries dict) {
return m.startIndex >= 2
&& sent.get(m.startIndex - 1).get(CoreAnnotations.TextAnnotation.class).equalsIgnoreCase("of")
&& dict.parts.contains(sent.get(m.startIndex - 2).get(CoreAnnotations.TextAnnotation.class).toLowerCase(Locale.ENGLISH));
}
/** Check whether pleonastic 'it'. E.g., It is possible that ... */
private static final TregexPattern[] pleonasticPatterns = getPleonasticPatterns();
private static boolean isPleonastic(Mention m, Tree tree) {
if ( ! m.spanToString().equalsIgnoreCase("it")) return false;
for (TregexPattern p : pleonasticPatterns) {
if (checkPleonastic(m, tree, p)) {
// SieveCoreferenceSystem.logger.fine("RuleBasedCorefMentionFinder: matched pleonastic pattern '" + p + "' for " + tree);
return true;
}
}
return false;
}
private static TregexPattern[] getPleonasticPatterns() {
final String[] patterns = {
// cdm 2013: I spent a while on these patterns. I fixed a syntax error in five patterns ($.. split with space), so it now shouldn't exception in checkPleonastic. This gave 0.02% on CoNLL11 dev
// I tried some more precise patterns but they didn't help. Indeed, they tended to hurt vs. the higher recall patterns.
//"NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:is|was|become|became)/) $.. (VP < (VBN $.. /S|SBAR/))))", // overmatches
// "@NP < (PRP=m1 < it|IT|It) $.. (@VP < (/^V.*/ < /^(?i:is|was|be|becomes|become|became)$/ $.. (@VP < (VBN < expected|hoped $.. @SBAR))))", // this one seems more accurate, but ...
"@NP < (PRP=m1 < it|IT|It) $.. (@VP < (/^V.*/ < /^(?i:is|was|be|becomes|become|became)$/ $.. (@VP < (VBN $.. @S|SBAR))))", // in practice, go with this one (best results)
"NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:is|was|become|became)/) $.. (ADJP $.. (/S|SBAR/))))",
"NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:is|was|become|became)/) $.. (ADJP < (/S|SBAR/))))",
// "@NP < (PRP=m1 < it|IT|It) $.. (@VP < (/^V.*/ < /^(?i:is|was|be|becomes|become|became)$/ $.. (@ADJP < (/^(?:JJ|VB)/ < /^(?i:(?:hard|tough|easi)(?:er|est)?|(?:im|un)?(?:possible|interesting|worthwhile|likely|surprising|certain)|disappointing|pointless|easy|fine|okay)$/) [ < @S|SBAR | $.. (@S|SBAR !< (IN !< for|For|FOR|that|That|THAT)) ] )))", // does worse than above 2 on CoNLL11 dev
"NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:is|was|become|became)/) $.. (NP < /S|SBAR/)))",
"NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:is|was|become|became)/) $.. (NP $.. ADVP $.. /S|SBAR/)))",
// "@NP < (PRP=m1 < it|IT|It) $.. (@VP < (/^V.*/ < /^(?i:is|was|be|becomes|become|became)$/ $.. (@NP $.. @ADVP $.. @SBAR)))", // cleft examples, generalized to not need ADVP; but gave worse CoNLL12 dev numbers....
// these next 5 had buggy space in "$ ..", which I fixed
"NP < (PRP=m1) $.. (VP < (MD $.. (VP < ((/^V.*/ < /^(?:be|become)/) $.. (VP < (VBN $.. /S|SBAR/))))))",
"NP < (PRP=m1) $.. (VP < (MD $.. (VP < ((/^V.*/ < /^(?:be|become)/) $.. (ADJP $.. (/S|SBAR/))))))", // extraposed. OK 1/2 correct; need non-adverbial case
"NP < (PRP=m1) $.. (VP < (MD $.. (VP < ((/^V.*/ < /^(?:be|become)/) $.. (ADJP < (/S|SBAR/))))))", // OK: 3/3 good matches on dev; but 3/4 wrong on WSJ
// certain can be either but relatively likely pleonastic with it ... be
// "@NP < (PRP=m1 < it|IT|It) $.. (@VP < (MD $.. (@VP < ((/^V.*/ < /^(?:be|become)/) $.. (@ADJP < (/^JJ/ < /^(?i:(?:hard|tough|easi)(?:er|est)?|(?:im|un)?(?:possible|interesting|worthwhile|likely|surprising|certain)|disappointing|pointless|easy|fine|okay))$/) [ < @S|SBAR | $.. (@S|SBAR !< (IN !< for|For|FOR|that|That|THAT)) ] )))))", // GOOD REPLACEMENT ; 2nd clause is for extraposed ones
"NP < (PRP=m1) $.. (VP < (MD $.. (VP < ((/^V.*/ < /^(?:be|become)/) $.. (NP < /S|SBAR/)))))",
"NP < (PRP=m1) $.. (VP < (MD $.. (VP < ((/^V.*/ < /^(?:be|become)/) $.. (NP $.. ADVP $.. /S|SBAR/)))))",
"NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:seems|appears|means|follows)/) $.. /S|SBAR/))",
"NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:turns|turned)/) $.. PRT $.. /S|SBAR/))"
};
TregexPattern[] tgrepPatterns = new TregexPattern[patterns.length];
for (int i = 0; i < tgrepPatterns.length; i++) {
tgrepPatterns[i] = TregexPattern.compile(patterns[i]);
}
return tgrepPatterns;
}
private static boolean checkPleonastic(Mention m, Tree tree, TregexPattern tgrepPattern) {
try {
TregexMatcher matcher = tgrepPattern.matcher(tree);
while (matcher.find()) {
Tree np1 = matcher.getNode("m1");
if (((CoreLabel)np1.label()).get(CoreAnnotations.BeginIndexAnnotation.class)+1 == m.headWord.get(CoreAnnotations.IndexAnnotation.class)) {
return true;
}
}
} catch (Exception e) {
e.printStackTrace();
}
return false;
}
}