edu.stanford.nlp.pipeline.QuantifiableEntityNormalizingAnnotator Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of stanford-corenlp Show documentation

Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.

There is a newer version: 4.5.7

Show newest version

package edu.stanford.nlp.pipeline; 
import edu.stanford.nlp.util.logging.Redwood;

import edu.stanford.nlp.ie.QuantifiableEntityNormalizer;
import edu.stanford.nlp.ie.regexp.NumberSequenceClassifier;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.util.ArraySet;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.PropertiesUtils;
import edu.stanford.nlp.util.Timing;

import java.util.*;

/**
 * This class provides a facility for normalizing content of numerical named
 * entities (number, money, date, time) in the pipeline package world. It uses a
 * lot of code with {@link edu.stanford.nlp.ie.QuantifiableEntityNormalizer}.
 * New stuff should generally be added there so as to reduce code duplication.
 * 
 * @author Jenny Finkel
 * @author Christopher Manning (extended for RTE)
 * @author Chris Cox (original version)
 */

public class QuantifiableEntityNormalizingAnnotator implements Annotator  {

  /** A logger for this class */
  private static Redwood.RedwoodChannels log = Redwood.channels(QuantifiableEntityNormalizingAnnotator.class);

  private Timing timer = new Timing();
  private final boolean VERBOSE;
  private static final String DEFAULT_BACKGROUND_SYMBOL = "O";
  private final boolean collapse;  // TODO: collpase = true won't work properly (see annotateTokens)

  public static final String BACKGROUND_SYMBOL_PROPERTY = "background";
  public static final String COLLAPSE_PROPERTY = "collapse";

  public QuantifiableEntityNormalizingAnnotator() {
    this(DEFAULT_BACKGROUND_SYMBOL, true);
  }

  public QuantifiableEntityNormalizingAnnotator(boolean verbose) {
    this(DEFAULT_BACKGROUND_SYMBOL, verbose);
  }

  public QuantifiableEntityNormalizingAnnotator(String name, Properties props) {
    String property = name + "." + BACKGROUND_SYMBOL_PROPERTY;
    String backgroundSymbol = props.getProperty(property, DEFAULT_BACKGROUND_SYMBOL);
    // this next line is yuck as QuantifiableEntityNormalizer is still static
    QuantifiableEntityNormalizer.BACKGROUND_SYMBOL = backgroundSymbol;
    property = name + "." + COLLAPSE_PROPERTY;
    collapse = PropertiesUtils.getBool(props, property, false);
    if (this.collapse) {
      log.info("WARNING: QuantifiableEntityNormalizingAnnotator does not work well with collapse=true");
    }
    VERBOSE = false;
  }

  /**
   * Do quantity entity normalization and collapse together multitoken quantity
   * entities into a single token.
   * 
   * @param backgroundSymbol
   *          NER background symbol
   * @param verbose
   *          Whether to write messages
   */
  public QuantifiableEntityNormalizingAnnotator(String backgroundSymbol, boolean verbose) {
    this(backgroundSymbol, verbose, false);
  }

  /**
   * Do quantity entity normalization and collapse together multitoken quantity
   * entities into a single token.
   * 
   * @param verbose
   *          Whether to write messages
   * @param collapse
   *          Whether to collapse multitoken quantity entities.
   */
  public QuantifiableEntityNormalizingAnnotator(boolean verbose, boolean collapse) {
    this(DEFAULT_BACKGROUND_SYMBOL, verbose, collapse);
  }

  public QuantifiableEntityNormalizingAnnotator(String backgroundSymbol, boolean verbose, boolean collapse) {
    // this next line is yuck as QuantifiableEntityNormalizer is still static
    QuantifiableEntityNormalizer.BACKGROUND_SYMBOL = backgroundSymbol;
    VERBOSE = verbose;
    this.collapse = collapse;
    if (this.collapse) {
      log.info("WARNING: QuantifiableEntityNormalizingAnnotator does not work well with collapse=true");
    }
  }

  public void annotate(Annotation annotation) {
    if (VERBOSE) {
      timer.start();
      log.info("Normalizing quantifiable entities...");
    }
    if (annotation.containsKey(CoreAnnotations.SentencesAnnotation.class)) {
      List sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
      for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
        List tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
        annotateTokens(tokens);
      }
      if (VERBOSE) {
        timer.stop("done.");
        log.info("output: " + sentences + '\n');
      }
    } else if (annotation.containsKey(CoreAnnotations.TokensAnnotation.class)) {
      List tokens = annotation.get(CoreAnnotations.TokensAnnotation.class);
      annotateTokens(tokens);
    } else {
      throw new RuntimeException("unable to find sentences in: " + annotation);
    }
  }

  private  void annotateTokens(List tokens) {
    // Make a copy of the tokens before annotating because QuantifiableEntityNormalizer may change the POS too
    List words = new ArrayList<>();
    for (CoreLabel token : tokens) {
      CoreLabel word = new CoreLabel();
      word.setWord(token.word());
      word.setNER(token.ner());
      word.setTag(token.tag());

      // copy fields potentially set by SUTime
      NumberSequenceClassifier.transferAnnotations(token, word);

      words.add(word);
    }
    doOneSentence(words);
    // TODO: If collapsed is set, tokens for entities are collapsed into one node then
    // (words.size() != tokens.size() and the logic below just don't work!!!
    for (int i = 0; i < words.size(); i++) {
      String ner = words.get(i).ner();
      tokens.get(i).setNER(ner);
      tokens.get(i).set(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class,
              words.get(i).get(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class));
    }
  }

  private  void doOneSentence(List words) {
    QuantifiableEntityNormalizer.addNormalizedQuantitiesToEntities(words, collapse);
  }


  @Override
  public Set> requires() {
    return Collections.unmodifiableSet(new ArraySet<>(Arrays.asList(
        CoreAnnotations.TextAnnotation.class,
        CoreAnnotations.TokensAnnotation.class,
        CoreAnnotations.CharacterOffsetBeginAnnotation.class,
        CoreAnnotations.CharacterOffsetEndAnnotation.class,
        CoreAnnotations.SentencesAnnotation.class
    )));
  }

  @Override
  public Set> requirementsSatisfied() {
    // technically it adds some NER, but someone who wants full NER
    // labels will be very disappointed, so we do not claim to produce NER
    return Collections.singleton(CoreAnnotations.NormalizedNamedEntityTagAnnotation.class);
  }
}