All Downloads are FREE. Search and download functionalities are using the official Maven repository.

edu.stanford.nlp.dcoref.RuleBasedCorefMentionFinder Maven / Gradle / Ivy

Go to download

Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.

There is a newer version: 4.5.7
Show newest version
package edu.stanford.nlp.dcoref; 
import edu.stanford.nlp.trees.*;
import edu.stanford.nlp.util.logging.Redwood;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.MultiTokenTag;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.parser.common.ParserAnnotations;
import edu.stanford.nlp.parser.common.ParserConstraint;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.AnnotationPipeline;
import edu.stanford.nlp.pipeline.Annotator;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
import edu.stanford.nlp.trees.tregex.TregexMatcher;
import edu.stanford.nlp.trees.tregex.TregexPattern;
import edu.stanford.nlp.util.*;

public class RuleBasedCorefMentionFinder implements CorefMentionFinder  {

  /** A logger for this class */
  private static Redwood.RedwoodChannels log = Redwood.channels(RuleBasedCorefMentionFinder.class);

  protected boolean assignIds = true;
//  protected int maxID = -1;
  private final HeadFinder headFinder;
  protected Annotator parserProcessor;

  private final boolean allowReparsing;

  public RuleBasedCorefMentionFinder() {
    this(Constants.ALLOW_REPARSING);
  }

  public RuleBasedCorefMentionFinder(boolean allowReparsing) {
    SieveCoreferenceSystem.logger.fine("Using SEMANTIC HEAD FINDER!!!!!!!!!!!!!!!!!!!");
    this.headFinder = new SemanticHeadFinder();
    this.allowReparsing = allowReparsing;
  }

  /** When mention boundaries are given */
  public List> filterPredictedMentions(List> allGoldMentions, Annotation doc, Dictionaries dict){
    List> predictedMentions = new ArrayList<>();

    for(int i = 0 ; i < allGoldMentions.size(); i++){
      CoreMap s = doc.get(CoreAnnotations.SentencesAnnotation.class).get(i);
      List goldMentions = allGoldMentions.get(i);
      List mentions = new ArrayList<>();
      predictedMentions.add(mentions);
      mentions.addAll(goldMentions);
      findHead(s, mentions);

      // todo [cdm 2013]: This block seems to do nothing - the two sets are never used
      Set mentionSpanSet = Generics.newHashSet();
      Set namedEntitySpanSet = Generics.newHashSet();
      for(Mention m : mentions) {
        mentionSpanSet.add(new IntPair(m.startIndex, m.endIndex));
        if(!m.headWord.get(CoreAnnotations.NamedEntityTagAnnotation.class).equals("O")) {
          namedEntitySpanSet.add(new IntPair(m.startIndex, m.endIndex));
        }
      }

      setBarePlural(mentions);
      removeSpuriousMentions(s, mentions, dict);
    }
    return predictedMentions;
  }

  /** Main method of mention detection.
   *  Extract all NP, PRP or NE, and filter out by manually written patterns.
   */
  @Override
  public List> extractPredictedMentions(Annotation doc, int maxID, Dictionaries dict) {
//    this.maxID = _maxID;
    List> predictedMentions = new ArrayList<>();
    for (CoreMap s : doc.get(CoreAnnotations.SentencesAnnotation.class)) {

      List mentions = new ArrayList<>();
      predictedMentions.add(mentions);
      Set mentionSpanSet = Generics.newHashSet();
      Set namedEntitySpanSet = Generics.newHashSet();

      extractPremarkedEntityMentions(s, mentions, mentionSpanSet, namedEntitySpanSet);
      extractNamedEntityMentions(s, mentions, mentionSpanSet, namedEntitySpanSet);
      extractNPorPRP(s, mentions, mentionSpanSet, namedEntitySpanSet);
      extractEnumerations(s, mentions, mentionSpanSet, namedEntitySpanSet);
      findHead(s, mentions);
      setBarePlural(mentions);
      removeSpuriousMentions(s, mentions, dict);
    }

    // assign mention IDs
    if(assignIds) assignMentionIDs(predictedMentions, maxID);

    return predictedMentions;
  }

  protected static void assignMentionIDs(List> predictedMentions, int maxID) {
    for(List mentions : predictedMentions) {
      for(Mention m : mentions) {
        m.mentionID = (++maxID);
      }
    }
  }

  protected static void setBarePlural(List mentions) {
    for (Mention m : mentions) {
      String pos = m.headWord.get(CoreAnnotations.PartOfSpeechAnnotation.class);
      if(m.originalSpan.size()==1 && pos.equals("NNS")) m.generic = true;
    }
  }

  protected static void extractPremarkedEntityMentions(CoreMap s, List mentions, Set mentionSpanSet, Set namedEntitySpanSet) {
    List sent = s.get(CoreAnnotations.TokensAnnotation.class);
    SemanticGraph dependency = s.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
    int beginIndex = -1;
    for(CoreLabel w : sent) {
      MultiTokenTag t = w.get(CoreAnnotations.MentionTokenAnnotation.class);
      if (t != null) {
        // Part of a mention
        if (t.isStart()) {
          // Start of mention
          beginIndex = w.get(CoreAnnotations.IndexAnnotation.class) - 1;
        }
        if (t.isEnd()) {
          // end of mention
          int endIndex = w.get(CoreAnnotations.IndexAnnotation.class);
          if (beginIndex >= 0) {
            IntPair mSpan = new IntPair(beginIndex, endIndex);
            int dummyMentionId = -1;
            Mention m = new Mention(dummyMentionId, beginIndex, endIndex, dependency, new ArrayList<>(sent.subList(beginIndex, endIndex)));
            mentions.add(m);
            mentionSpanSet.add(mSpan);
            beginIndex = -1;
          } else {
            SieveCoreferenceSystem.logger.warning("Start of marked mention not found in sentence: "
                    + t + " at tokenIndex=" + (w.get(CoreAnnotations.IndexAnnotation.class)-1)+ " for "
                    + s.get(CoreAnnotations.TextAnnotation.class));
          }
        }
      }
    }
  }

  protected static void extractNamedEntityMentions(CoreMap s, List mentions, Set mentionSpanSet, Set namedEntitySpanSet) {
    List sent = s.get(CoreAnnotations.TokensAnnotation.class);
    SemanticGraph dependency = s.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);
    String preNE = "O";
    int beginIndex = -1;
    for(CoreLabel w : sent) {
      String nerString = w.get(CoreAnnotations.NamedEntityTagAnnotation.class);
      if(!nerString.equals(preNE)) {
        int endIndex = w.get(CoreAnnotations.IndexAnnotation.class) - 1;
        if(!preNE.matches("O|QUANTITY|CARDINAL|PERCENT|DATE|DURATION|TIME|SET")){
          if(w.get(CoreAnnotations.TextAnnotation.class).equals("'s")) endIndex++;
          IntPair mSpan = new IntPair(beginIndex, endIndex);
          // Need to check if beginIndex < endIndex because, for
          // example, there could be a 's mislabeled by the NER and
          // attached to the previous NER by the earlier heuristic
          if(beginIndex < endIndex && !mentionSpanSet.contains(mSpan)) {
            int dummyMentionId = -1;
            Mention m = new Mention(dummyMentionId, beginIndex, endIndex, dependency, new ArrayList<>(sent.subList(beginIndex, endIndex)));
            mentions.add(m);
            mentionSpanSet.add(mSpan);
            namedEntitySpanSet.add(mSpan);
          }
        }
        beginIndex = endIndex;
        preNE = nerString;
      }
    }
    // NE at the end of sentence
    if(!preNE.matches("O|QUANTITY|CARDINAL|PERCENT|DATE|DURATION|TIME|SET")) {
      IntPair mSpan = new IntPair(beginIndex, sent.size());
      if(!mentionSpanSet.contains(mSpan)) {
        int dummyMentionId = -1;
        Mention m = new Mention(dummyMentionId, beginIndex, sent.size(), dependency, new ArrayList<>(sent.subList(beginIndex, sent.size())));
        mentions.add(m);
        mentionSpanSet.add(mSpan);
        namedEntitySpanSet.add(mSpan);
      }
    }
  }

  private static final TregexPattern npOrPrpMentionPattern = TregexPattern.compile("/^(?:NP|PRP)/");

  protected static void extractNPorPRP(CoreMap s, List mentions, Set mentionSpanSet, Set namedEntitySpanSet) {
    List sent = s.get(CoreAnnotations.TokensAnnotation.class);
    Tree tree = s.get(TreeCoreAnnotations.TreeAnnotation.class);
    tree.indexLeaves();
    SemanticGraph dependency = s.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);

    TregexPattern tgrepPattern = npOrPrpMentionPattern;
    TregexMatcher matcher = tgrepPattern.matcher(tree);
    while (matcher.find()) {
      Tree t = matcher.getMatch();
      List mLeaves = t.getLeaves();
      int beginIdx = ((CoreLabel)mLeaves.get(0).label()).get(CoreAnnotations.IndexAnnotation.class)-1;
      int endIdx = ((CoreLabel)mLeaves.get(mLeaves.size()-1).label()).get(CoreAnnotations.IndexAnnotation.class);
      if (",".equals(sent.get(endIdx-1).word())) { endIdx--; } // try not to have span that ends with ,
      IntPair mSpan = new IntPair(beginIdx, endIdx);
      if(!mentionSpanSet.contains(mSpan) && !insideNE(mSpan, namedEntitySpanSet)) {
        int dummyMentionId = -1;
        Mention m = new Mention(dummyMentionId, beginIdx, endIdx, dependency, new ArrayList<>(sent.subList(beginIdx, endIdx)), t);
        mentions.add(m);
        mentionSpanSet.add(mSpan);
      }
    }
  }
  /** Extract enumerations (A, B, and C) */
  private static final TregexPattern enumerationsMentionPattern = TregexPattern.compile("NP < (/^(?:NP|NNP|NML)/=m1 $.. (/^CC|,/ $.. /^(?:NP|NNP|NML)/=m2))");

  protected static void extractEnumerations(CoreMap s, List mentions, Set mentionSpanSet, Set namedEntitySpanSet) {
    List sent = s.get(CoreAnnotations.TokensAnnotation.class);
    Tree tree = s.get(TreeCoreAnnotations.TreeAnnotation.class);
    SemanticGraph dependency = s.get(SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class);

    TregexPattern tgrepPattern = enumerationsMentionPattern;
    TregexMatcher matcher = tgrepPattern.matcher(tree);
    Map spanToMentionSubTree = Generics.newHashMap();
    while (matcher.find()) {
      matcher.getMatch();
      Tree m1 = matcher.getNode("m1");
      Tree m2 = matcher.getNode("m2");

      List mLeaves = m1.getLeaves();
      int beginIdx = ((CoreLabel)mLeaves.get(0).label()).get(CoreAnnotations.IndexAnnotation.class)-1;
      int endIdx = ((CoreLabel)mLeaves.get(mLeaves.size()-1).label()).get(CoreAnnotations.IndexAnnotation.class);
      spanToMentionSubTree.put(new IntPair(beginIdx, endIdx), m1);

      mLeaves = m2.getLeaves();
      beginIdx = ((CoreLabel)mLeaves.get(0).label()).get(CoreAnnotations.IndexAnnotation.class)-1;
      endIdx = ((CoreLabel)mLeaves.get(mLeaves.size()-1).label()).get(CoreAnnotations.IndexAnnotation.class);
      spanToMentionSubTree.put(new IntPair(beginIdx, endIdx), m2);
    }

    for(IntPair mSpan : spanToMentionSubTree.keySet()){
      if(!mentionSpanSet.contains(mSpan) && !insideNE(mSpan, namedEntitySpanSet)) {
        int dummyMentionId = -1;
        Mention m = new Mention(dummyMentionId, mSpan.get(0), mSpan.get(1), dependency,
                new ArrayList<>(sent.subList(mSpan.get(0), mSpan.get(1))), spanToMentionSubTree.get(mSpan));
        mentions.add(m);
        mentionSpanSet.add(mSpan);
      }
    }
  }

  /** Check whether a mention is inside of a named entity */
  private static boolean insideNE(IntPair mSpan, Set namedEntitySpanSet) {
    for (IntPair span : namedEntitySpanSet){
      if(span.get(0) <= mSpan.get(0) && mSpan.get(1) <= span.get(1)) return true;
    }
    return false;
  }

  protected void findHead(CoreMap s, List mentions) {
    Tree tree = s.get(TreeCoreAnnotations.TreeAnnotation.class);
    List sent = s.get(CoreAnnotations.TokensAnnotation.class);
    tree.indexSpans(0);
    for (Mention m : mentions){
      Tree head = findSyntacticHead(m, tree, sent);
      m.headIndex = ((CoreLabel) head.label()).get(CoreAnnotations.IndexAnnotation.class)-1;
      m.headWord = sent.get(m.headIndex);
      m.headString = m.headWord.get(CoreAnnotations.TextAnnotation.class).toLowerCase(Locale.ENGLISH);
      int start = m.headIndex - m.startIndex;
      if (start < 0 || start >= m.originalSpan.size()) {
        SieveCoreferenceSystem.logger.warning("Invalid index for head " + start + "=" + m.headIndex + "-" + m.startIndex
                + ": originalSpan=[" + StringUtils.joinWords(m.originalSpan, " ") + "], head=" + m.headWord);
        SieveCoreferenceSystem.logger.warning("Setting head string to entire mention");
        m.headIndex = m.startIndex;
        m.headWord = m.originalSpan.size() > 0 ? m.originalSpan.get(0) : sent.get(m.startIndex);
        m.headString = m.originalSpan.toString();
      }
    }
  }

  protected Tree findSyntacticHead(Mention m, Tree root, List tokens) {
    // mention ends with 's
    int endIdx = m.endIndex;
    if (m.originalSpan.size() > 0) {
      String lastWord = m.originalSpan.get(m.originalSpan.size()-1).get(CoreAnnotations.TextAnnotation.class);
        if((lastWord.equals("'s") || lastWord.equals("'"))
            && m.originalSpan.size() != 1 ) endIdx--;
    }

    Tree exactMatch = findTreeWithSpan(root, m.startIndex, endIdx);
    //
    // found an exact match
    //
    if (exactMatch != null) {
      return safeHead(exactMatch, endIdx);
    }

    // no exact match found
    // in this case, we parse the actual extent of the mention, embedded in a sentence
    // context, so as to make the parser work better :-)
    if (allowReparsing) {
      int approximateness = 0;
      List extentTokens = new ArrayList<>();
      extentTokens.add(initCoreLabel("It"));
      extentTokens.add(initCoreLabel("was"));
      final int ADDED_WORDS = 2;
      for (int i = m.startIndex; i < endIdx; i++) {
        // Add everything except separated dashes! The separated dashes mess with the parser too badly.
        CoreLabel label = tokens.get(i);
        if ( ! "-".equals(label.word())) {
          // necessary to copy tokens in case the parser does things like
          // put new indices on the tokens
          extentTokens.add((CoreLabel) label.labelFactory().newLabel(label));
        } else {
          approximateness++;
        }
      }
      extentTokens.add(initCoreLabel("."));

      // constrain the parse to the part we're interested in.
      // Starting from ADDED_WORDS comes from skipping "It was".
      // -1 to exclude the period.
      // We now let it be any kind of nominal constituent, since there
      // are VP and S ones
      ParserConstraint constraint = new ParserConstraint(ADDED_WORDS, extentTokens.size() - 1, Pattern.compile(".*"));
      List constraints = Collections.singletonList(constraint);
      Tree tree = parse(extentTokens, constraints);
      convertToCoreLabels(tree);  // now unnecessary, as parser uses CoreLabels?
      tree.indexSpans(m.startIndex - ADDED_WORDS);  // remember it has ADDED_WORDS extra words at the beginning
      Tree subtree = findPartialSpan(tree, m.startIndex);
      // There was a possible problem that with a crazy parse, extentHead could be one of the added words, not a real word!
      // Now we make sure in findPartialSpan that it can't be before the real start, and in safeHead, we disallow something
      // passed the right end (that is, just that final period).
      Tree extentHead = safeHead(subtree, endIdx);
      assert(extentHead != null);
      // extentHead is a child in the local extent parse tree. we need to find the corresponding node in the main tree
      // Because we deleted dashes, it's index will be >= the index in the extent parse tree
      CoreLabel l = (CoreLabel) extentHead.label();
      Tree realHead = funkyFindLeafWithApproximateSpan(root, l.value(), l.get(CoreAnnotations.BeginIndexAnnotation.class), approximateness);
      assert(realHead != null);
      return realHead;
    }

    // If reparsing wasn't allowed, try to find a span in the tree
    // which happens to have the head
    Tree wordMatch = findTreeWithSmallestSpan(root, m.startIndex, endIdx);
    if (wordMatch != null) {
      Tree head = safeHead(wordMatch, endIdx);
      if (head != null) {
        int index = ((CoreLabel) head.label()).get(CoreAnnotations.IndexAnnotation.class)-1;
        if (index >= m.startIndex && index < endIdx) {
          return head;
        }
      }
    }

    // If that didn't work, guess that it's the last word

    int lastNounIdx = endIdx-1;
    for(int i=m.startIndex ; i < m.endIndex ; i++) {
      if(tokens.get(i).tag().startsWith("N")) lastNounIdx = i;
      else if(tokens.get(i).tag().startsWith("W")) break;
    }

    List leaves = root.getLeaves();
    Tree endLeaf = leaves.get(lastNounIdx);
    return endLeaf;
  }

  /** Find the tree that covers the portion of interest. */
  private static Tree findPartialSpan(final Tree root, final int start) {
    CoreLabel label = (CoreLabel) root.label();
    int startIndex = label.get(CoreAnnotations.BeginIndexAnnotation.class);
    if (startIndex == start) {
      return root;
    }
    for (Tree kid : root.children()) {
      CoreLabel kidLabel = (CoreLabel) kid.label();
      int kidStart = kidLabel.get(CoreAnnotations.BeginIndexAnnotation.class);
      int kidEnd = kidLabel.get(CoreAnnotations.EndIndexAnnotation.class);
      if (kidStart <= start && kidEnd > start) {
        return findPartialSpan(kid, start);
      }
    }
    throw new RuntimeException("Shouldn't happen: " + start + " " + root);
  }

  private static Tree funkyFindLeafWithApproximateSpan(Tree root, String token, int index, int approximateness) {
    // log.info("Searching " + root + "\n  for " + token + " at position " + index + " (plus up to " + approximateness + ")");
    List leaves = root.getLeaves();
    for (Tree leaf : leaves) {
      CoreLabel label = CoreLabel.class.cast(leaf.label());
      Integer indexInteger = label.get(CoreAnnotations.IndexAnnotation.class);
      if (indexInteger == null) continue;
      int ind = indexInteger - 1;
      if (token.equals(leaf.value()) && ind >= index && ind <= index + approximateness) {
        return leaf;
      }
    }
    // this shouldn't happen
    //    throw new RuntimeException("RuleBasedCorefMentionFinder: ERROR: Failed to find head token");
    SieveCoreferenceSystem.logger.warning("RuleBasedCorefMentionFinder: Failed to find head token:\n" +
                       "Tree is: " + root + "\n" +
                       "token = |" + token + "|" + index + "|, approx=" + approximateness);
    for (Tree leaf : leaves) {
      if (token.equals(leaf.value())) {
        //log.info("Found something: returning " + leaf);
        return leaf;
      }
    }
    int fallback = Math.max(0, leaves.size() - 2);
    SieveCoreferenceSystem.logger.warning("RuleBasedCorefMentionFinder: Last resort: returning as head: " + leaves.get(fallback));
    return leaves.get(fallback); // last except for the added period.
  }

  private static CoreLabel initCoreLabel(String token) {
    CoreLabel label = new CoreLabel();
    label.set(CoreAnnotations.TextAnnotation.class, token);
    label.set(CoreAnnotations.ValueAnnotation.class, token);
    return label;
  }

  private Tree parse(List tokens) {
    return parse(tokens, null);
  }

  private Tree parse(List tokens,
                     List constraints) {
    CoreMap sent = new Annotation("");
    sent.set(CoreAnnotations.TokensAnnotation.class, tokens);
    sent.set(ParserAnnotations.ConstraintAnnotation.class, constraints);
    Annotation doc = new Annotation("");
    List sents = new ArrayList<>(1);
    sents.add(sent);
    doc.set(CoreAnnotations.SentencesAnnotation.class, sents);
    getParser().annotate(doc);
    sents = doc.get(CoreAnnotations.SentencesAnnotation.class);
    return sents.get(0).get(TreeCoreAnnotations.TreeAnnotation.class);
  }

  private Annotator getParser() {
    if(parserProcessor == null){
      Annotator parser = StanfordCoreNLP.getExistingAnnotator("parse");
      if (parser == null) {
        // TODO: these assertions rule out the possibility of alternately named parse/pos annotators
        throw new AssertionError("Failed to get parser - this should not be possible");
      }
      if (parser.requires().contains(CoreAnnotations.PartOfSpeechAnnotation.class)) {
        Annotator tagger = StanfordCoreNLP.getExistingAnnotator("pos");
        if (tagger == null) {
          throw new AssertionError("Parser required tagger, but failed to find the pos annotator");
        }
        List annotators = Generics.newArrayList();
        annotators.add(tagger);
        annotators.add(parser);        
        parserProcessor = new AnnotationPipeline(annotators);
      } else {
        parserProcessor = parser;
      }
    }
    return parserProcessor;
  }

  // This probably isn't needed now; everything is always a core label. But no-op.
  private static void convertToCoreLabels(Tree tree) {
    Label l = tree.label();
    if (! (l instanceof CoreLabel)) {
      CoreLabel cl = new CoreLabel();
      cl.setValue(l.value());
      tree.setLabel(cl);
    }

    for (Tree kid : tree.children()) {
      convertToCoreLabels(kid);
    }
  }

  private Tree safeHead(Tree top, int endIndex) {
    // The trees passed in do not have the CoordinationTransformer
    // applied, but that just means the SemanticHeadFinder results are
    // slightly worse.
    Tree head = top.headTerminal(headFinder);
    // One obscure failure case is that the added period becomes the head. Disallow this.
    if (head != null) {
      Integer headIndexInteger = ((CoreLabel) head.label()).get(CoreAnnotations.IndexAnnotation.class);
      if (headIndexInteger != null) {
        int headIndex = headIndexInteger - 1;
        if (headIndex < endIndex) {
          return head;
        }
      }
    }
    // if no head found return the right-most leaf
    List leaves = top.getLeaves();
    int candidate = leaves.size() - 1;
    while (candidate >= 0) {
      head = leaves.get(candidate);
      Integer headIndexInteger = ((CoreLabel) head.label()).get(CoreAnnotations.IndexAnnotation.class);
      if (headIndexInteger != null) {
        int headIndex = headIndexInteger - 1;
        if (headIndex < endIndex) {
          return head;
        }
      }
      candidate--;
    }
    // fallback: return top
    return top;
  }

  static Tree findTreeWithSmallestSpan(Tree tree, int start, int end) {
    List leaves = tree.getLeaves();
    Tree startLeaf = leaves.get(start);
    Tree endLeaf = leaves.get(end - 1);
    return Trees.getLowestCommonAncestor(Arrays.asList(startLeaf, endLeaf), tree);
  }

  private static Tree findTreeWithSpan(Tree tree, int start, int end) {
    CoreLabel l = (CoreLabel) tree.label();
    if (l != null && l.containsKey(CoreAnnotations.BeginIndexAnnotation.class) && l.containsKey(CoreAnnotations.EndIndexAnnotation.class)) {
      int myStart = l.get(CoreAnnotations.BeginIndexAnnotation.class);
      int myEnd = l.get(CoreAnnotations.EndIndexAnnotation.class);
      if (start == myStart && end == myEnd){
        // found perfect match
        return tree;
      } else if (end < myStart) {
        return null;
      } else if (start >= myEnd) {
        return null;
      }
    }

    // otherwise, check inside children - a match is possible
    for (Tree kid : tree.children()) {
      if (kid == null) continue;
      Tree ret = findTreeWithSpan(kid, start, end);
      // found matching child
      if (ret != null) return ret;
    }

    // no match
    return null;
  }

  /** Filter out all spurious mentions */
  protected static void removeSpuriousMentions(CoreMap s, List mentions, Dictionaries dict) {
    Tree tree = s.get(TreeCoreAnnotations.TreeAnnotation.class);
    List sent = s.get(CoreAnnotations.TokensAnnotation.class);
    Set remove = Generics.newHashSet();


    for(Mention m : mentions){
      String headPOS = m.headWord.get(CoreAnnotations.PartOfSpeechAnnotation.class);
      String headNE = m.headWord.get(CoreAnnotations.NamedEntityTagAnnotation.class);
      // pleonastic it
      if(isPleonastic(m, tree)) remove.add(m);

      // non word such as 'hmm'
      if(dict.nonWords.contains(m.headString)) remove.add(m);

      // quantRule : not starts with 'any', 'all' etc
      if (m.originalSpan.size() > 0 && dict.quantifiers.contains(m.originalSpan.get(0).get(CoreAnnotations.TextAnnotation.class).toLowerCase(Locale.ENGLISH))) remove.add(m);

      // partitiveRule
      if (partitiveRule(m, sent, dict)) remove.add(m);

      // bareNPRule
      if (headPOS.equals("NN") && !dict.temporals.contains(m.headString)
          && (m.originalSpan.size()==1 || m.originalSpan.get(0).get(CoreAnnotations.PartOfSpeechAnnotation.class).equals("JJ"))) {
        remove.add(m);
      }

      // remove generic rule
      //  if(m.generic==true) remove.add(m);

      if (m.headString.equals("%")) remove.add(m);
      if (headNE.equals("PERCENT") || headNE.equals("MONEY")) remove.add(m);

      // adjective form of nations
      if (dict.isAdjectivalDemonym(m.spanToString())) remove.add(m);

      // stop list (e.g., U.S., there)
      if (inStopList(m)) remove.add(m);
    }

    // nested mention with shared headword (except apposition, enumeration): pick larger one
    for (Mention m1 : mentions){
      for (Mention m2 : mentions){
        if (m1==m2 || remove.contains(m1) || remove.contains(m2)) continue;
        if (m1.sentNum==m2.sentNum && m1.headWord==m2.headWord && m2.insideIn(m1)) {
          if (m2.endIndex < sent.size() && (sent.get(m2.endIndex).get(CoreAnnotations.PartOfSpeechAnnotation.class).equals(",")
              || sent.get(m2.endIndex).get(CoreAnnotations.PartOfSpeechAnnotation.class).equals("CC"))) {
            continue;
          }
          remove.add(m2);
        }
      }
    }
    mentions.removeAll(remove);
  }

  private static boolean inStopList(Mention m) {
    String mentionSpan = m.spanToString().toLowerCase(Locale.ENGLISH);
    if (mentionSpan.equals("u.s.") || mentionSpan.equals("u.k.")
        || mentionSpan.equals("u.s.s.r")) return true;
    if (mentionSpan.equals("there") || mentionSpan.startsWith("etc.")
        || mentionSpan.equals("ltd.")) return true;
    if (mentionSpan.startsWith("'s ")) return true;
    if (mentionSpan.endsWith("etc.")) return true;

    return false;
  }

  private static boolean partitiveRule(Mention m, List sent, Dictionaries dict) {
    return m.startIndex >= 2
            && sent.get(m.startIndex - 1).get(CoreAnnotations.TextAnnotation.class).equalsIgnoreCase("of")
            && dict.parts.contains(sent.get(m.startIndex - 2).get(CoreAnnotations.TextAnnotation.class).toLowerCase(Locale.ENGLISH));
  }

  /** Check whether pleonastic 'it'. E.g., It is possible that ... */
  private static final TregexPattern[] pleonasticPatterns = getPleonasticPatterns();
  private static boolean isPleonastic(Mention m, Tree tree) {
    if ( ! m.spanToString().equalsIgnoreCase("it")) return false;
    for (TregexPattern p : pleonasticPatterns) {
      if (checkPleonastic(m, tree, p)) {
        // SieveCoreferenceSystem.logger.fine("RuleBasedCorefMentionFinder: matched pleonastic pattern '" + p + "' for " + tree);
        return true;
      }
    }
    return false;
  }

  private static TregexPattern[] getPleonasticPatterns() {
    final String[] patterns = {
            // cdm 2013: I spent a while on these patterns. I fixed a syntax error in five patterns ($.. split with space), so it now shouldn't exception in checkPleonastic. This gave 0.02% on CoNLL11 dev
            // I tried some more precise patterns but they didn't help. Indeed, they tended to hurt vs. the higher recall patterns.

            //"NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:is|was|become|became)/) $.. (VP < (VBN $.. /S|SBAR/))))", // overmatches
            // "@NP < (PRP=m1 < it|IT|It) $.. (@VP < (/^V.*/ < /^(?i:is|was|be|becomes|become|became)$/ $.. (@VP < (VBN < expected|hoped $.. @SBAR))))",  // this one seems more accurate, but ...
            "@NP < (PRP=m1 < it|IT|It) $.. (@VP < (/^V.*/ < /^(?i:is|was|be|becomes|become|became)$/ $.. (@VP < (VBN $.. @S|SBAR))))",  // in practice, go with this one (best results)

            "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:is|was|become|became)/) $.. (ADJP $.. (/S|SBAR/))))",
            "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:is|was|become|became)/) $.. (ADJP < (/S|SBAR/))))",
            // "@NP < (PRP=m1 < it|IT|It) $.. (@VP < (/^V.*/ < /^(?i:is|was|be|becomes|become|became)$/ $.. (@ADJP < (/^(?:JJ|VB)/ < /^(?i:(?:hard|tough|easi)(?:er|est)?|(?:im|un)?(?:possible|interesting|worthwhile|likely|surprising|certain)|disappointing|pointless|easy|fine|okay)$/) [ < @S|SBAR | $.. (@S|SBAR !< (IN !< for|For|FOR|that|That|THAT)) ] )))", // does worse than above 2 on CoNLL11 dev

            "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:is|was|become|became)/) $.. (NP < /S|SBAR/)))",
            "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:is|was|become|became)/) $.. (NP $.. ADVP $.. /S|SBAR/)))",
            // "@NP < (PRP=m1 < it|IT|It) $.. (@VP < (/^V.*/ < /^(?i:is|was|be|becomes|become|became)$/ $.. (@NP $.. @ADVP $.. @SBAR)))", // cleft examples, generalized to not need ADVP; but gave worse CoNLL12 dev numbers....

            // these next 5 had buggy space in "$ ..", which I fixed
            "NP < (PRP=m1) $.. (VP < (MD $.. (VP < ((/^V.*/ < /^(?:be|become)/) $.. (VP < (VBN $.. /S|SBAR/))))))",

            "NP < (PRP=m1) $.. (VP < (MD $.. (VP < ((/^V.*/ < /^(?:be|become)/) $.. (ADJP $.. (/S|SBAR/))))))", // extraposed. OK 1/2 correct; need non-adverbial case
            "NP < (PRP=m1) $.. (VP < (MD $.. (VP < ((/^V.*/ < /^(?:be|become)/) $.. (ADJP < (/S|SBAR/))))))", // OK: 3/3 good matches on dev; but 3/4 wrong on WSJ
            // certain can be either but relatively likely pleonastic with it ... be
            // "@NP < (PRP=m1 < it|IT|It) $.. (@VP < (MD $.. (@VP < ((/^V.*/ < /^(?:be|become)/) $.. (@ADJP < (/^JJ/ < /^(?i:(?:hard|tough|easi)(?:er|est)?|(?:im|un)?(?:possible|interesting|worthwhile|likely|surprising|certain)|disappointing|pointless|easy|fine|okay))$/) [ < @S|SBAR | $.. (@S|SBAR !< (IN !< for|For|FOR|that|That|THAT)) ] )))))", // GOOD REPLACEMENT ; 2nd clause is for extraposed ones

            "NP < (PRP=m1) $.. (VP < (MD $.. (VP < ((/^V.*/ < /^(?:be|become)/) $.. (NP < /S|SBAR/)))))",
            "NP < (PRP=m1) $.. (VP < (MD $.. (VP < ((/^V.*/ < /^(?:be|become)/) $.. (NP $.. ADVP $.. /S|SBAR/)))))",

            "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:seems|appears|means|follows)/) $.. /S|SBAR/))",

            "NP < (PRP=m1) $.. (VP < ((/^V.*/ < /^(?:turns|turned)/) $.. PRT $.. /S|SBAR/))"
    };

    TregexPattern[] tgrepPatterns = new TregexPattern[patterns.length];
    for (int i = 0; i < tgrepPatterns.length; i++) {
      tgrepPatterns[i] = TregexPattern.compile(patterns[i]);
    }
    return tgrepPatterns;
  }

  private static boolean checkPleonastic(Mention m, Tree tree, TregexPattern tgrepPattern) {
    try {
      TregexMatcher matcher = tgrepPattern.matcher(tree);
      while (matcher.find()) {
        Tree np1 = matcher.getNode("m1");
        if (((CoreLabel)np1.label()).get(CoreAnnotations.BeginIndexAnnotation.class)+1 == m.headWord.get(CoreAnnotations.IndexAnnotation.class)) {
          return true;
        }
      }
    } catch (Exception e) {
      e.printStackTrace();
    }
    return false;
  }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy