edu.stanford.nlp.pipeline.NumberAnnotator Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of stanford-corenlp Show documentation

Stanford CoreNLP provides a set of natural language analysis tools that can take raw English-language text as input and give the base forms of words and their parts of speech; recognize whether they are names of companies, people, etc.; normalize dates, times, and numeric quantities; mark up the structure of sentences in terms of phrases and word dependencies; and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher-level text understanding applications.

There is a newer version: 4.5.7

Show newest version

package edu.stanford.nlp.pipeline; 
import edu.stanford.nlp.util.logging.Redwood;

import edu.stanford.nlp.ie.AbstractSequenceClassifier;
import edu.stanford.nlp.ie.regexp.NumberSequenceClassifier;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.util.ArraySet;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.PropertiesUtils;

import java.util.*;

/**
 * This calls NumberSequenceClassifier, which is a rule based classifier, which
 * adds a NUMBER entity tag to numbers not already given another entity tag, and
 * also has additional rules for marking MONEY, TIME, and DATE. It assumes that
 * tokens already have a (POS) TagAnnotation, and an original round of NER that
 * covers MONEY and American DATE/TIME formats, such as MUC NER in
 * AnswerAnnotation, to which we add.
 *
 * @author Jenny Finkel
 */

public class NumberAnnotator implements Annotator  {

  /** A logger for this class */
  private static Redwood.RedwoodChannels log = Redwood.channels(NumberAnnotator.class);

  private final AbstractSequenceClassifier nsc;

  private boolean VERBOSE = true;
  private static final String DEFAULT_BACKGROUND_SYMBOL = "O";
  private final String BACKGROUND_SYMBOL;

  public static final String BACKGROUND_SYMBOL_PROPERTY = "background";

  public NumberAnnotator() {
    this(DEFAULT_BACKGROUND_SYMBOL, true, NumberSequenceClassifier.USE_SUTIME_DEFAULT);
  }

  public NumberAnnotator(boolean verbose) {
    this(DEFAULT_BACKGROUND_SYMBOL, verbose, NumberSequenceClassifier.USE_SUTIME_DEFAULT);
  }

  public NumberAnnotator(boolean verbose, boolean useSUTime) {
    this(DEFAULT_BACKGROUND_SYMBOL, verbose, useSUTime);
  }

  public NumberAnnotator(String backgroundSymbol, boolean verbose, boolean useSUTime) {
    BACKGROUND_SYMBOL = backgroundSymbol;
    VERBOSE = verbose;
    nsc = new NumberSequenceClassifier(useSUTime);
  }

  public NumberAnnotator(String name, Properties props) {
    String property = name + "." + BACKGROUND_SYMBOL_PROPERTY;
    BACKGROUND_SYMBOL = props.getProperty(property, DEFAULT_BACKGROUND_SYMBOL);
    boolean useSUTime = PropertiesUtils.getBool(props,
        NumberSequenceClassifier.USE_SUTIME_PROPERTY,
        NumberSequenceClassifier.USE_SUTIME_DEFAULT);
    VERBOSE = false;
    nsc = new NumberSequenceClassifier(useSUTime);
  }

  @Override
  public void annotate(Annotation annotation) {
    if (VERBOSE) {
      log.info("Adding number annotation ... ");
    }

    if (annotation.containsKey(CoreAnnotations.SentencesAnnotation.class)) {
      // classify tokens for each sentence
      for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
        List tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
        doOneSentenceNew(tokens, annotation, sentence);
      }
      if (VERBOSE) {
        log.info("done. Output: " + annotation.get(CoreAnnotations.SentencesAnnotation.class));
      }
    } else if (annotation.containsKey(CoreAnnotations.TokensAnnotation.class)) {
      List tokens = annotation.get(CoreAnnotations.TokensAnnotation.class);
      doOneSentenceNew(tokens, annotation, null);
    } else {
      throw new RuntimeException("unable to find sentences in: " + annotation);
    }
  }

  private void doOneSentenceNew(List words, Annotation doc, CoreMap sentence) {
    List newWords = NumberSequenceClassifier.copyTokens(words, sentence);

    nsc.classifyWithGlobalInformation(newWords, doc, sentence);

    Iterator newFLIter = newWords.iterator();
    for (CoreLabel origWord : words) {
      CoreLabel newWord = newFLIter.next();
      String before = origWord.ner();
      String newGuess = newWord.get(CoreAnnotations.AnswerAnnotation.class);
      // log.info(origWord.word());
      // log.info(origWord.ner());
      if (VERBOSE)
        log.info(newWord);
      // log.info("-------------------------------------");
      if ((before == null || before.equals(BACKGROUND_SYMBOL) || before.equals("MISC"))
          && !newGuess.equals(BACKGROUND_SYMBOL)) {
        origWord.setNER(newGuess);
      }

      // transfer other annotations generated by SUTime or NumberNormalizer
      NumberSequenceClassifier.transferAnnotations(newWord, origWord);
    }
  }


  @Override
  public Set> requires() {
    return Collections.unmodifiableSet(new ArraySet<>(Arrays.asList(
        CoreAnnotations.TokensAnnotation.class,
        CoreAnnotations.SentencesAnnotation.class
        )));
  }

  @Override
  public Set> requirementsSatisfied() {
    // technically it adds some NER, but someone who wants full NER
    // labels will be very disappointed, so we do not claim to produce NER
    return Collections.singleton(CoreAnnotations.NumerizedTokensAnnotation.class);
  }
}