edu.stanford.nlp.pipeline.MorphaAnnotator Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of stanford-corenlp Show documentation

Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.

There is a newer version: 4.5.7

Show newest version

package edu.stanford.nlp.pipeline;

import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.process.Morphology;
import edu.stanford.nlp.util.ArraySet;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.logging.Redwood;

import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Set;


/**
 * This class will add the lemmas of all the words to the Annotation.
 * It assumes that the Annotation already contains the tokenized words as
 * a {@code List} for a list of sentences under the
 * {@code SentencesAnnotation.class} key.
 * The Annotator adds lemma information to each CoreLabel,
 * in the LemmaAnnotation.class.
 *
 * @author Jenny Finkel
 */
public class MorphaAnnotator implements Annotator {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(MorphaAnnotator.class);

  private boolean VERBOSE = false;


  private static final String[] prep = {"abroad", "across", "after", "ahead", "along", "aside", "away", "around", "back", "down", "forward", "in", "off", "on", "over", "out", "round", "together", "through", "up"};
  private static final List particles = Arrays.asList(prep);

  public MorphaAnnotator() {
    this(true);
  }

  public MorphaAnnotator(boolean verbose) {
    VERBOSE = verbose;
  }

  @Override
  public void annotate(Annotation annotation) {
    if (VERBOSE) {
      log.info("Finding lemmas ...");
    }
    Morphology morphology = new Morphology();
    if (annotation.containsKey(CoreAnnotations.SentencesAnnotation.class)) {
      for (CoreMap sentence : annotation.get(CoreAnnotations.SentencesAnnotation.class)) {
        List tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
        //log.info("Lemmatizing sentence: " + tokens);
        for (CoreLabel token : tokens) {
          String text = token.get(CoreAnnotations.TextAnnotation.class);
          String posTag = token.get(CoreAnnotations.PartOfSpeechAnnotation.class);
          addLemma(morphology, CoreAnnotations.LemmaAnnotation.class, token, text, posTag);
        }
      }
    } else {
      throw new RuntimeException("Unable to find words/tokens in: " +
                                 annotation);
    }
  }


  private static void addLemma(Morphology morpha,
                        Class> ann,
                        CoreMap map, String word, String tag) {
    if ( ! tag.isEmpty()) {
      String phrasalVerb = phrasalVerb(morpha, word, tag);
      if (phrasalVerb == null) {
        map.set(ann, morpha.lemma(word, tag));
      } else {
        map.set(ann, phrasalVerb);
      }
    } else {
      map.set(ann, morpha.stem(word));
    }
  }


  /** If a token is a phrasal verb with an underscore between a verb and a
   *  particle, return the phrasal verb lemmatized. If not, return null
   */
  private static String phrasalVerb(Morphology morpha, String word, String tag) {

    // must be a verb and contain an underscore
    assert(word != null);
    assert(tag != null);
    if(!tag.startsWith("VB")  || !word.contains("_")) return null;

    // check whether the last part is a particle
    String[] verb = word.split("_");
    if(verb.length != 2) return null;
    String particle = verb[1];
    if(particles.contains(particle)) {
      String base = verb[0];
      String lemma = morpha.lemma(base, tag);
      return lemma + '_' + particle;
    }

    return null;
  }


  @Override
  public Set> requires() {
    return Collections.unmodifiableSet(new ArraySet<>(Arrays.asList(
        CoreAnnotations.TextAnnotation.class,
        CoreAnnotations.TokensAnnotation.class,
        CoreAnnotations.SentencesAnnotation.class,
        CoreAnnotations.PartOfSpeechAnnotation.class
    )));
  }

  @Override
  public Set> requirementsSatisfied() {
    return Collections.singleton(CoreAnnotations.LemmaAnnotation.class);
  }

}