edu.stanford.nlp.dcoref.MentionExtractor Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of stanford-corenlp Show documentation
Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.
There is a newer version: 4.5.7
Show newest version
//
// StanfordCoreNLP -- a suite of NLP tools
// Copyright (c) 2009-2011 The Board of Trustees of
// The Leland Stanford Junior University. All Rights Reserved.
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
//
// For more information, bug reports, fixes, contact:
//    Christopher Manning
//    Dept of Computer Science, Gates 1A
//    Stanford CA 94305-9010
//    USA
//

package edu.stanford.nlp.dcoref;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;

import edu.stanford.nlp.classify.LogisticClassifier;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.trees.HeadFinder;
import edu.stanford.nlp.trees.SemanticHeadFinder;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.tregex.TregexMatcher;
import edu.stanford.nlp.trees.tregex.TregexPattern;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Pair;

/**
 * Generic mention extractor from a corpus.
 *
 * @author Jenny Finkel
 * @author Mihai Surdeanu
 * @author Karthik Raghunathan
 * @author Heeyoung Lee
 * @author Sudarshan Rangarajan
 */
public class MentionExtractor {

  private final HeadFinder headFinder;

  protected String currentDocumentID;

  protected final Dictionaries dictionaries;
  protected final Semantics semantics;

  public CorefMentionFinder mentionFinder;
  protected StanfordCoreNLP stanfordProcessor;
  protected LogisticClassifier singletonPredictor;

  /** The maximum mention ID: for preventing duplicated mention ID assignment */
  protected int maxID = -1;

  public static final boolean VERBOSE = false;

  public MentionExtractor(Dictionaries dict, Semantics semantics) {
    this.headFinder = new SemanticHeadFinder();
    this.dictionaries = dict;
    this.semantics = semantics;
    this.mentionFinder = new RuleBasedCorefMentionFinder();  // Default
  }

  public void setMentionFinder(CorefMentionFinder mentionFinder)
  {
    this.mentionFinder = mentionFinder;
  }

  /**
   * Extracts the info relevant for coref from the next document in the corpus
   * @return List of mentions found in each sentence ordered according to the tree traversal.
   * @throws Exception
   */
  public Document nextDoc() throws Exception { return null; }

  /**
   * Reset so that we start at the beginning of the document collection
   */
  public void resetDocs() {
    maxID = -1;
    currentDocumentID = null;
  }

  public Document arrange(
      Annotation anno,
      List> words,
      List trees,
      List> unorderedMentions) throws Exception {
    return arrange(anno, words, trees, unorderedMentions, null, false);
  }

  protected int getHeadIndex(Tree t) {
    // The trees passed in do not have the CoordinationTransformer
    // applied, but that just means the SemanticHeadFinder results are
    // slightly worse.
    Tree ht = t.headTerminal(headFinder);
    if(ht==null) return -1;  // temporary: a key which is matched to nothing
    CoreLabel l = (CoreLabel) ht.label();
    return l.get(CoreAnnotations.IndexAnnotation.class);
  }
  private String treeToKey(Tree t) {
    int idx = getHeadIndex(t);
    String key = Integer.toString(idx) + ':' + t.toString();
    return key;
  }

  public Document arrange(
      Annotation anno,
      List> words,
      List trees,
      List> unorderedMentions,
      List> unorderedGoldMentions,
      boolean doMergeLabels) throws Exception {
    List> predictedOrderedMentionsBySentence = arrange(anno, words, trees, unorderedMentions, doMergeLabels);
    List> goldOrderedMentionsBySentence = null;
//    SieveCoreferenceSystem.debugPrintMentions(System.err, "UNORDERED GOLD MENTIONS:", unorderedGoldMentions);

    if(unorderedGoldMentions != null) {
      goldOrderedMentionsBySentence = arrange(anno, words, trees, unorderedGoldMentions, doMergeLabels);
    }
//    SieveCoreferenceSystem.debugPrintMentions(System.err, "ORDERED GOLD MENTIONS:", goldOrderedMentionsBySentence);
    return new Document(anno, predictedOrderedMentionsBySentence, goldOrderedMentionsBySentence, dictionaries);
  }

  /**
   * Post-processes the extracted mentions. Here we set the Mention fields required for coref and order mentions by tree-traversal order.
   * @param words List of words in each sentence, in textual order
   * @param trees List of trees, one per sentence
   * @param unorderedMentions List of unordered, unprocessed mentions
   *                 Each mention MUST have startIndex and endIndex set!
   *                 Optionally, if scoring is desired, mentions must have mentionID and originalRef set.
   *                 All the other Mention fields are set here.
   * @return List of mentions ordered according to the tree traversal
   * @throws Exception
   */
  public List> arrange(
      Annotation anno,
      List> words,
      List trees,
      List> unorderedMentions,
      boolean doMergeLabels) throws Exception {

    List> orderedMentionsBySentence = new ArrayList<>();

    //
    // traverse all sentences and process each individual one
    //
    for (int sent = 0, sz = words.size(); sent < sz; sent ++) {
      List sentence = words.get(sent);
      Tree tree = trees.get(sent);
      List mentions = unorderedMentions.get(sent);
      Map> mentionsToTrees = Generics.newHashMap();

      // merge the parse tree of the entire sentence with the sentence words
      if(doMergeLabels) mergeLabels(tree, sentence);

      //
      // set the surface information and the syntactic info in each mention
      // startIndex and endIndex MUST be set before!
      //
      for (Mention mention: mentions) {
        mention.contextParseTree = tree;
        mention.sentenceWords = sentence;
        mention.originalSpan = new ArrayList<>(mention.sentenceWords.subList(mention.startIndex, mention.endIndex));
        if(!((CoreLabel) tree.label()).containsKey(CoreAnnotations.BeginIndexAnnotation.class)) tree.indexSpans(0);
        if(mention.headWord==null) {
          Tree headTree = ((RuleBasedCorefMentionFinder) mentionFinder).findSyntacticHead(mention, tree, sentence);
          mention.headWord = (CoreLabel)headTree.label();
          mention.headIndex = mention.headWord.get(CoreAnnotations.IndexAnnotation.class) - 1;
        }
        if(mention.mentionSubTree==null) {
          // mentionSubTree = highest NP that has the same head
          Tree headTree = tree.getLeaves().get(mention.headIndex);
          if (headTree == null) { throw new RuntimeException("Missing head tree for a mention!"); }
          Tree t = headTree;
          while ((t = t.parent(tree)) != null) {
            if (t.headTerminal(headFinder) == headTree && t.value().equals("NP")) {
              mention.mentionSubTree = t;
            } else if(mention.mentionSubTree != null){
              break;
            }
          }
          if (mention.mentionSubTree == null) {
            mention.mentionSubTree = headTree;
          }
        }

        List mentionsForTree = mentionsToTrees.get(treeToKey(mention.mentionSubTree));
        if(mentionsForTree == null){
          mentionsForTree = new ArrayList<>();
          mentionsToTrees.put(treeToKey(mention.mentionSubTree), mentionsForTree);
        }
        mentionsForTree.add(mention);

        // generates all fields required for coref, such as gender, number, etc.
        mention.process(dictionaries, semantics, this, singletonPredictor);
      }

      //
      // Order all mentions in tree-traversal order
      //
      List orderedMentions = new ArrayList<>();
      orderedMentionsBySentence.add(orderedMentions);

      // extract all mentions in tree traversal order (alternative: tree.postOrderNodeList())
      for (Tree t : tree.preOrderNodeList()) {
        List lm = mentionsToTrees.get(treeToKey(t));
        if(lm != null){
          for(Mention m: lm){
            orderedMentions.add(m);
          }
        }
      }

      //
      // find appositions, predicate nominatives, relative pronouns in this sentence
      //
      findSyntacticRelations(tree, orderedMentions);
      assert(mentions.size() == orderedMentions.size());
    }
    return orderedMentionsBySentence;
  }

  /**
   * Sets the label of the leaf nodes of a Tree to be the CoreLabels in the given sentence.
   * The original value() of the Tree nodes is preserved, and otherwise the label of tree
   * leaves becomes the label from the List.
   */
  // todo [cdm 2015]: This clearly shouldn't be here! Maybe it's not needed at all now since parsing code does this?
  public static void mergeLabels(Tree tree, List sentence) {
    int idx = 0;
    for (Tree t : tree.getLeaves()) {
      CoreLabel cl = sentence.get(idx ++);
      String value = t.value();
      cl.set(CoreAnnotations.ValueAnnotation.class, value);
      t.setLabel(cl);
    }
    tree.indexLeaves();
  }

  private static boolean inside(int i, Mention m) {
    return i >= m.startIndex && i < m.endIndex;
  }

  /** Find syntactic relations (e.g., appositives) in a sentence */
  private void findSyntacticRelations(Tree tree, List orderedMentions) {
    markListMemberRelation(orderedMentions);

    Set> appos = Generics.newHashSet();
    // TODO: This apposition finding doesn't seem to be very good - what about using "appos" from dependencies?
    findAppositions(tree, appos);
    markMentionRelation(orderedMentions, appos, "APPOSITION");

    Set> preNomi = Generics.newHashSet();
    findPredicateNominatives(tree, preNomi);
    markMentionRelation(orderedMentions, preNomi, "PREDICATE_NOMINATIVE");

    Set> relativePronounPairs = Generics.newHashSet();
    findRelativePronouns(tree, relativePronounPairs);
    markMentionRelation(orderedMentions, relativePronounPairs, "RELATIVE_PRONOUN");
  }

  /** Find syntactic pattern in a sentence by tregex */
  private void findTreePattern(Tree tree, String tregex, Set> foundPairs) {
    try {
      TregexPattern tgrepPattern = TregexPattern.compile(tregex);
      findTreePattern(tree, tgrepPattern, foundPairs);
    } catch (Exception e) {
      // shouldn't happen....
      throw new RuntimeException(e);
    }
  }

  private void findTreePattern(Tree tree, TregexPattern tgrepPattern, Set> foundPairs) {
    try {
      TregexMatcher m = tgrepPattern.matcher(tree);
      while (m.find()) {
        Tree t = m.getMatch();
        Tree np1 = m.getNode("m1");
        Tree np2 = m.getNode("m2");
        Tree np3 = null;
        if(tgrepPattern.pattern().contains("m3")) np3 = m.getNode("m3");
        addFoundPair(np1, np2, t, foundPairs);
        if(np3!=null) addFoundPair(np2, np3, t, foundPairs);
      }
    } catch (Exception e) {
      // shouldn't happen....
      throw new RuntimeException(e);
    }
  }

  private void addFoundPair(Tree np1, Tree np2, Tree t,
      Set> foundPairs) {
    Tree head1 = np1.headTerminal(headFinder);
    Tree head2 = np2.headTerminal(headFinder);
    int h1 = ((CoreMap) head1.label()).get(CoreAnnotations.IndexAnnotation.class) - 1;
    int h2 = ((CoreMap) head2.label()).get(CoreAnnotations.IndexAnnotation.class) - 1;
    Pair p = new Pair<>(h1, h2);
    foundPairs.add(p);
  }

  private static final TregexPattern appositionPattern = TregexPattern.compile("NP=m1 < (NP=m2 $.. (/,/ $.. NP=m3))");
  private static final TregexPattern appositionPattern2 = TregexPattern.compile("NP=m1 < (NP=m2 $.. (/,/ $.. (SBAR < (WHNP < WP|WDT=m3))))");
  private static final TregexPattern appositionPattern3 = TregexPattern.compile("/^NP(?:-TMP|-ADV)?$/=m1 < (NP=m2 $- /^,$/ $-- NP=m3 !$ CC|CONJP)");
  private static final TregexPattern appositionPattern4 = TregexPattern.compile("/^NP(?:-TMP|-ADV)?$/=m1 < (PRN=m2 < (NP < /^NNS?|CD$/ $-- /^-LRB-$/ $+ /^-RRB-$/))");
  private void findAppositions(Tree tree, Set> appos) {
    findTreePattern(tree, appositionPattern, appos);
    findTreePattern(tree, appositionPattern2, appos);
    findTreePattern(tree, appositionPattern3, appos);
    findTreePattern(tree, appositionPattern4, appos);
  }

  private static final TregexPattern predicateNominativePattern = TregexPattern.compile("S < (NP=m1 $.. (VP < ((/VB/ < /^(am|are|is|was|were|'m|'re|'s|be)$/) $.. NP=m2)))");
  private static final TregexPattern predicateNominativePattern2 = TregexPattern.compile("S < (NP=m1 $.. (VP < (VP < ((/VB/ < /^(be|been|being)$/) $.. NP=m2))))");
  private void findPredicateNominatives(Tree tree, Set> preNomi) {
    //    String predicateNominativePattern2 = "NP=m1 $.. (VP < ((/VB/ < /^(am|are|is|was|were|'m|'re|'s|be)$/) $.. NP=m2))";
    findTreePattern(tree, predicateNominativePattern, preNomi);
    findTreePattern(tree, predicateNominativePattern2, preNomi);
  }

  private static final TregexPattern relativePronounPattern = TregexPattern.compile("NP < (NP=m1 $.. (SBAR < (WHNP < WP|WDT=m2)))");
  private void findRelativePronouns(Tree tree, Set> relativePronounPairs) {
    findTreePattern(tree, relativePronounPattern, relativePronounPairs);
  }

  private static void markListMemberRelation(List orderedMentions) {
    for(Mention m1 : orderedMentions){
      for(Mention m2 : orderedMentions){
        // Mark if m2 and m1 are in list relationship
        if (m1.isListMemberOf(m2)) {
          m2.addListMember(m1);
          m1.addBelongsToList(m2);
        } else if (m2.isListMemberOf(m1)) {
          m1.addListMember(m2);
          m2.addBelongsToList(m1);
        }
      }
    }
  }

  private static void markMentionRelation(List orderedMentions, Set> foundPairs, String flag) {
    for(Mention m1 : orderedMentions){
      for(Mention m2 : orderedMentions){
        // Ignore if m2 and m1 are in list relationship
        if (m1.isListMemberOf(m2) || m2.isListMemberOf(m1) || m1.isMemberOfSameList(m2)) {
          SieveCoreferenceSystem.logger.finest("Not checking '" + m1 + "' and '" + m2 + "' for " + flag + ": in list relationship");
          continue;
        }
        for(Pair foundPair: foundPairs){
          if((foundPair.first == m1.headIndex && foundPair.second == m2.headIndex)){
            switch (flag) {
              case "APPOSITION":
                m2.addApposition(m1);
                break;
              case "PREDICATE_NOMINATIVE":
                m2.addPredicateNominatives(m1);
                break;
              case "RELATIVE_PRONOUN":
                m2.addRelativePronoun(m1);
                break;
              default:
                throw new RuntimeException("check flag in markMentionRelation (dcoref/MentionExtractor.java)");
            }
          }
        }
      }
    }
  }

  /**
   * Finds the tree the matches this span exactly
   * @param tree Leaves must be indexed!
   * @param first First element in the span (first position has offset 1)
   * @param last Last element included in the span (first position has offset 1)
   */
  public static Tree findExactMatch(Tree tree, int first, int last) {
    List leaves = tree.getLeaves();
    int thisFirst = ((CoreMap) leaves.get(0).label()).get(CoreAnnotations.IndexAnnotation.class);
    int thisLast = ((CoreMap) leaves.get(leaves.size() - 1).label()).get(CoreAnnotations.IndexAnnotation.class);
    if(thisFirst == first && thisLast == last) {
      return tree;
    } else {
      Tree [] kids = tree.children();
      for(Tree k: kids){
        Tree t = findExactMatch(k, first, last);
        if(t != null) return t;
      }
    }
    return null;
  }

  /** Load Stanford Processor: skip unnecessary annotator */
  protected static StanfordCoreNLP loadStanfordProcessor(Properties props) {
    boolean replicateCoNLL = Boolean.parseBoolean(props.getProperty(Constants.REPLICATECONLL_PROP, "false"));

    Properties pipelineProps = new Properties(props);
    StringBuilder annoSb = new StringBuilder("");
    if (!Constants.USE_GOLD_POS && !replicateCoNLL)  {
      annoSb.append("pos, lemma");
    } else {
      annoSb.append("lemma");
    }
    if(Constants.USE_TRUECASE) {
      annoSb.append(", truecase");
    }
    if (!Constants.USE_GOLD_NE && !replicateCoNLL)  {
      annoSb.append(", ner");
    }
    if (!Constants.USE_GOLD_PARSES && !replicateCoNLL)  {
      annoSb.append(", parse");
    }
    String annoStr = annoSb.toString();
    SieveCoreferenceSystem.logger.info("MentionExtractor ignores specified annotators, using annotators=" + annoStr);
    pipelineProps.setProperty("annotators", annoStr);
    return new StanfordCoreNLP(pipelineProps, false);
  }

  public static void initializeUtterance(List tokens) {
    for(CoreLabel l : tokens){
      if (l.get(CoreAnnotations.UtteranceAnnotation.class) == null) {
        l.set(CoreAnnotations.UtteranceAnnotation.class, 0);
      }
    }
  }

}