edu.stanford.nlp.pipeline.MentionAnnotator Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of stanford-corenlp Show documentation

Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.

There is a newer version: 4.5.7

Show newest version

package edu.stanford.nlp.pipeline;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Properties;
import java.util.Set;

import edu.stanford.nlp.coref.CorefCoreAnnotations;
import edu.stanford.nlp.coref.CorefProperties;
import edu.stanford.nlp.coref.data.Dictionaries;
import edu.stanford.nlp.coref.data.Mention;
import edu.stanford.nlp.coref.md.CorefMentionFinder;
import edu.stanford.nlp.coref.md.DependencyCorefMentionFinder;
import edu.stanford.nlp.coref.md.HybridCorefMentionFinder;
import edu.stanford.nlp.coref.md.RuleBasedCorefMentionFinder;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.semgraph.SemanticGraphCoreAnnotations;
import edu.stanford.nlp.trees.HeadFinder;
import edu.stanford.nlp.trees.SemanticHeadFinder;
import edu.stanford.nlp.trees.TreeCoreAnnotations;
import edu.stanford.nlp.trees.international.pennchinese.ChineseSemanticHeadFinder;
import edu.stanford.nlp.util.ArraySet;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.PropertiesUtils;
import edu.stanford.nlp.util.logging.Redwood;

/**
 * This class adds mention information to an Annotation.
 *
 * After annotation each sentence will have a List representing the Mentions in the sentence
 *
 * the List containing the Mentions will be put under the annotation
 * {@link edu.stanford.nlp.coref.CorefCoreAnnotations.CorefMentionsAnnotation}.
 *
 * @author heeyoung
 * @author Jason Bolton
 */

public class MentionAnnotator extends TextAnnotationCreator implements Annotator  {

  /** A logger for this class */
  private static Redwood.RedwoodChannels log = Redwood.channels(MentionAnnotator.class);

  HeadFinder headFinder;
  CorefMentionFinder md;
  String mdName;
  Dictionaries dictionaries;
  Properties corefProperties;

  Set> mentionAnnotatorRequirements = new HashSet<>();

  public MentionAnnotator(Properties props) {
    try {
      corefProperties = props;
      //System.out.println("corefProperties: "+corefProperties);
      dictionaries = new Dictionaries(props);
      //System.out.println("got dictionaries");
      headFinder = getHeadFinder(props);
      //System.out.println("got head finder");
      md = getMentionFinder(props, headFinder);
      log.info("Using mention detector type: "+mdName);
      mentionAnnotatorRequirements.addAll(Arrays.asList(
          CoreAnnotations.TokensAnnotation.class,
          CoreAnnotations.SentencesAnnotation.class,
          CoreAnnotations.PartOfSpeechAnnotation.class,
          CoreAnnotations.NamedEntityTagAnnotation.class,
          CoreAnnotations.IndexAnnotation.class,
          CoreAnnotations.TextAnnotation.class,
          CoreAnnotations.ValueAnnotation.class,
          SemanticGraphCoreAnnotations.BasicDependenciesAnnotation.class,
          SemanticGraphCoreAnnotations.EnhancedDependenciesAnnotation.class

      ));
    } catch (Exception e) {
      e.printStackTrace();
      log.info("Error with building coref mention annotator!");
    }
  }

  @Override
  public void annotate(Annotation annotation) {
    List sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
    // TO DO: be careful, this could introduce a really hard to find bug
    // this is necessary for Chinese coreference
    // removeNested needs to be set to "false" for newswire text or big performance drop
    String docID = annotation.get(CoreAnnotations.DocIDAnnotation.class);
    if (docID == null) {
      docID = "";
    }
    if (docID.contains("nw") && (CorefProperties.conll(corefProperties)
        || corefProperties.getProperty("coref.input.type", "raw").equals("conll")) &&
            CorefProperties.getLanguage(corefProperties) == Locale.CHINESE &&
            PropertiesUtils.getBool(corefProperties,"coref.specialCaseNewswire")) {
      corefProperties.setProperty("removeNestedMentions", "false");
    } else {
      corefProperties.setProperty("removeNestedMentions", "true");
    }
    List> mentions = md.findMentions(annotation, dictionaries, corefProperties);
    int mentionIndex = 0;
    int currIndex = 0;
    for (CoreMap sentence : sentences) {
      List mentionsForThisSentence = mentions.get(currIndex);
      sentence.set(CorefCoreAnnotations.CorefMentionsAnnotation.class, mentionsForThisSentence);
      // increment to next list of mentions
      currIndex++;
      // assign latest mentionID
      for (Mention m : mentionsForThisSentence) {
        m.mentionID = mentionIndex;
        mentionIndex++;
      }
    }
  }

  private static HeadFinder getHeadFinder(Properties props) {
    Locale lang = CorefProperties.getLanguage(props);
    if(lang == Locale.ENGLISH) return new SemanticHeadFinder();
    else if(lang == Locale.CHINESE) return new ChineseSemanticHeadFinder();
    else {
      throw new RuntimeException("Invalid language setting: cannot load HeadFinder");
    }
  }

  private CorefMentionFinder getMentionFinder(Properties props, HeadFinder headFinder)
          throws ClassNotFoundException, IOException {

    switch (CorefProperties.mdType(props)) {
      case DEPENDENCY:
        mdName = "dependency";
        return new DependencyCorefMentionFinder(props);

      case HYBRID:
        mdName = "hybrid";
        mentionAnnotatorRequirements.add(TreeCoreAnnotations.TreeAnnotation.class);
        mentionAnnotatorRequirements.add(CoreAnnotations.BeginIndexAnnotation.class);
        mentionAnnotatorRequirements.add(CoreAnnotations.EndIndexAnnotation.class);
        return new HybridCorefMentionFinder(headFinder, props);

      case RULE:
      default:
        mentionAnnotatorRequirements.add(TreeCoreAnnotations.TreeAnnotation.class);
        mentionAnnotatorRequirements.add(CoreAnnotations.BeginIndexAnnotation.class);
        mentionAnnotatorRequirements.add(CoreAnnotations.EndIndexAnnotation.class);
        mdName = "rule";
        return new RuleBasedCorefMentionFinder(headFinder, props);
    }
  }

  @Override
  public Set> requires() {
    return mentionAnnotatorRequirements;
  }

  @Override
  public Set> requirementsSatisfied() {
    return Collections.unmodifiableSet(new ArraySet<>(Arrays.asList(
        CorefCoreAnnotations.CorefMentionsAnnotation.class,
        CoreAnnotations.ParagraphAnnotation.class,
        CoreAnnotations.SpeakerAnnotation.class,
        CoreAnnotations.UtteranceAnnotation.class
    )));
  }

}