edu.stanford.nlp.pipeline.TokenizerAnnotator Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of stanford-corenlp Show documentation
Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.
There is a newer version: 4.5.7
Show newest version
package edu.stanford.nlp.pipeline;

import java.io.Reader;
import java.io.StringReader;
import java.util.*;

import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.process.WhitespaceTokenizer;
import edu.stanford.nlp.international.spanish.process.SpanishTokenizer;
import edu.stanford.nlp.international.french.process.FrenchTokenizer;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.PropertiesUtils;
import edu.stanford.nlp.util.logging.Redwood;


/**
 * This class will PTB tokenize the input.  It assumes that the original
 * String is under the CoreAnnotations.TextAnnotation field
 * and it will add the output from the
 * InvertiblePTBTokenizer ({@code List}) under
 * CoreAnnotation.TokensAnnotation.
 *
 * @author Jenny Finkel
 * @author Christopher Manning
 * @author Ishita Prasad
 */
public class TokenizerAnnotator implements Annotator  {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(TokenizerAnnotator.class);

  /**
   * Enum to identify the different TokenizerTypes. To add a new
   * TokenizerType, add it to the list with a default options string
   * and add a clause in getTokenizerType to identify it.
   */
  public enum TokenizerType {
    Unspecified(null, null, "invertible,ptb3Escaping=true"),
    Arabic     ("ar", null, ""),
    Chinese    ("zh", null, ""),
    Spanish    ("es", "SpanishTokenizer", "invertible,ptb3Escaping=true,splitAll=true"),
    English    ("en", "PTBTokenizer", "invertible,ptb3Escaping=true"),
    German     ("de", null, "invertible,ptb3Escaping=true"),
    French     ("fr", "FrenchTokenizer", ""),
    Whitespace (null, "WhitespaceTokenizer", "");

    private final String abbreviation;
    private final String className;
    private final String defaultOptions;

    TokenizerType(String abbreviation, String className, String defaultOptions) {
      this.abbreviation = abbreviation;
      this.className = className;
      this.defaultOptions = defaultOptions;
    }

    public String getDefaultOptions() {
      return defaultOptions;
    }

    private static final Map nameToTokenizerMap = initializeNameMap();

    private static Map initializeNameMap() {
      Map map = Generics.newHashMap();
      for (TokenizerType type : TokenizerType.values()) {
        if (type.abbreviation != null) {
          map.put(type.abbreviation.toUpperCase(), type);
        }
        map.put(type.toString().toUpperCase(), type);
      }
      return Collections.unmodifiableMap(map);
    }

    private static final Map classToTokenizerMap = initializeClassMap();

    private static Map initializeClassMap() {
      Map map = Generics.newHashMap();
      for (TokenizerType type : TokenizerType.values()) {
        if (type.className != null) {
          map.put(type.className.toUpperCase(), type);
        }
      }
      return Collections.unmodifiableMap(map);
    }

    /**
     * Get TokenizerType based on what's in the properties.
     *
     * @param props Properties to find tokenizer options in
     * @return An element of the TokenizerType enum indicating the tokenizer to use
     */
    public static TokenizerType getTokenizerType(Properties props) {
      String tokClass = props.getProperty("tokenize.class", null);
      boolean whitespace = Boolean.valueOf(props.getProperty("tokenize.whitespace", "false"));
      String language = props.getProperty("tokenize.language", null);

      if(whitespace) {
        return Whitespace;
      }

      if (tokClass != null) {
        TokenizerType type = classToTokenizerMap.get(tokClass.toUpperCase());
        if (type == null) {
          throw new IllegalArgumentException("TokenizerAnnotator: unknown tokenize.class property " + tokClass);
        }
        return type;
      }

      if (language != null) {
        TokenizerType type = nameToTokenizerMap.get(language.toUpperCase());
        if (type == null) {
          throw new IllegalArgumentException("TokenizerAnnotator: unknown tokenize.language property " + language);
        }
        return type;
      }

      return Unspecified;
    }
  } // end enum TokenizerType


  public static final String EOL_PROPERTY = "tokenize.keepeol";

  private final boolean VERBOSE;
  private final TokenizerFactory factory;

  /** new segmenter properties **/
  private final boolean useSegmenter;
  private final Annotator segmenterAnnotator;

  // CONSTRUCTORS

  /** Gives a non-verbose, English tokenizer. */
  public TokenizerAnnotator() {
    this(false);
  }

  public TokenizerAnnotator(boolean verbose) {
    this(verbose, TokenizerType.English);
  }

  public TokenizerAnnotator(String lang) {
    this(true, lang, null);
  }

  public TokenizerAnnotator(boolean verbose, TokenizerType lang) {
    this(verbose, lang.toString());
  }

  public TokenizerAnnotator(boolean verbose, String lang) {
    this(verbose, lang, null);
  }

  public TokenizerAnnotator(boolean verbose, String lang, String options) {
    this(verbose, lang == null ? null : PropertiesUtils.asProperties("tokenize.language", lang), options);
  }

  public TokenizerAnnotator(boolean verbose, Properties props) {
    this(verbose, props, null);
  }

  public TokenizerAnnotator(boolean verbose, Properties props, String options) {
    if (props == null) {
      props = new Properties();
    }
    // check if segmenting must be done
    if (props.getProperty("tokenize.language") != null &&
            LanguageInfo.isSegmenterLanguage(props.getProperty("tokenize.language"))) {
      useSegmenter = true;
      if (LanguageInfo.getLanguageFromString(
              props.getProperty("tokenize.language")) == LanguageInfo.HumanLanguage.ARABIC)
        segmenterAnnotator = new ArabicSegmenterAnnotator("segment", props);
      else if (LanguageInfo.getLanguageFromString(
              props.getProperty("tokenize.language")) == LanguageInfo.HumanLanguage.CHINESE)
        segmenterAnnotator = new ChineseSegmenterAnnotator("segment", props);
      else {
        segmenterAnnotator = null;
        throw new RuntimeException("No segmenter implemented for: "+
                LanguageInfo.getLanguageFromString(props.getProperty("tokenize.language")));
      }
    } else {
      useSegmenter = false;
      segmenterAnnotator = null;
    }
    VERBOSE = PropertiesUtils.getBool(props, "tokenize.verbose", verbose);
    TokenizerType type = TokenizerType.getTokenizerType(props);
    factory = initFactory(type, props, options);
  }

  /**
   * initFactory returns the right type of TokenizerFactory based on the options in the properties file
   * and the type. When adding a new Tokenizer, modify TokenizerType.getTokenizerType() to retrieve
   * your tokenizer from the properties file, and then add a class is the switch structure here to
   * instantiate the new Tokenizer type.
   *
   * @param type the TokenizerType
   * @param props the properties file
   * @param extraOptions extra things that should be passed into the tokenizer constructor
   */
  private static TokenizerFactory initFactory(TokenizerType type, Properties props, String extraOptions) throws IllegalArgumentException{
    TokenizerFactory factory;
    String options = props.getProperty("tokenize.options", null);

    // set it to the equivalent of both extraOptions and options
    // TODO: maybe we should always have getDefaultOptions() and
    // expect the user to turn off default options.  That would
    // require all options to have negated options, but
    // currently there are some which don't have that
    if (options == null) {
      options = type.getDefaultOptions();
    }
    if (extraOptions != null) {
      if (extraOptions.endsWith(",")) {
        options = extraOptions + options;
      } else {
        options = extraOptions + ',' + options;
      }
    }

    switch(type) {

    case Arabic:
    case Chinese:
      factory = null;
      break;

    case Spanish:
      factory = SpanishTokenizer.factory(new CoreLabelTokenFactory(), options);
      break;

    case French:
      factory = FrenchTokenizer.factory(new CoreLabelTokenFactory(), options);
      break;

    case Whitespace:
      boolean eolIsSignificant = Boolean.valueOf(props.getProperty(EOL_PROPERTY, "false"));
      eolIsSignificant = eolIsSignificant || Boolean.valueOf(props.getProperty(StanfordCoreNLP.NEWLINE_SPLITTER_PROPERTY, "false"));
      factory = new WhitespaceTokenizer.WhitespaceTokenizerFactory<>(new CoreLabelTokenFactory(), eolIsSignificant);
      break;

    case English:
    case German:
      factory = PTBTokenizer.factory(new CoreLabelTokenFactory(), options);
      break;

    case Unspecified:
      log.info("No tokenizer type provided. Defaulting to PTBTokenizer.");
      factory = PTBTokenizer.factory(new CoreLabelTokenFactory(), options);
      break;

    default:
      throw new IllegalArgumentException("No valid tokenizer type provided.\n" +
                                         "Use -tokenize.language, -tokenize.class, or -tokenize.whitespace \n" +
                                         "to specify a tokenizer.");
    }
    return factory;
  }

  /**
   * Returns a thread-safe tokenizer
   */
  public Tokenizer getTokenizer(Reader r) {
    return factory.getTokenizer(r);
  }

  /**
   * Does the actual work of splitting TextAnnotation into CoreLabels,
   * which are then attached to the TokensAnnotation.
   */
  @Override
  public void annotate(Annotation annotation) {
    if (VERBOSE) {
      log.info("Tokenizing ... ");
    }

    // for Arabic and Chinese use a segmenter instead
    if (useSegmenter) {
      segmenterAnnotator.annotate(annotation);
      return;
    }

    if (annotation.containsKey(CoreAnnotations.TextAnnotation.class)) {
      String text = annotation.get(CoreAnnotations.TextAnnotation.class);
      Reader r = new StringReader(text);
      // don't wrap in BufferedReader.  It gives you nothing for in-memory String unless you need the readLine() method!

      List tokens = getTokenizer(r).tokenize();
      // cdm 2010-05-15: This is now unnecessary, as it is done in CoreLabelTokenFactory
      // for (CoreLabel token: tokens) {
      // token.set(CoreAnnotations.TextAnnotation.class, token.get(CoreAnnotations.TextAnnotation.class));
      // }

      annotation.set(CoreAnnotations.TokensAnnotation.class, tokens);
      if (VERBOSE) {
        log.info("done.");
        log.info("Tokens: " + annotation.get(CoreAnnotations.TokensAnnotation.class));
      }
    } else {
      throw new RuntimeException("Tokenizer unable to find text in annotation: " + annotation);
    }
  }

  @Override
  public Set> requires() {
    return Collections.emptySet();
  }

  @Override
  public Set> requirementsSatisfied() {
    return new HashSet<>(Arrays.asList(
        CoreAnnotations.TextAnnotation.class,
        CoreAnnotations.TokensAnnotation.class,
        CoreAnnotations.CharacterOffsetBeginAnnotation.class,
        CoreAnnotations.CharacterOffsetEndAnnotation.class,
        CoreAnnotations.BeforeAnnotation.class,
        CoreAnnotations.AfterAnnotation.class,
        CoreAnnotations.TokenBeginAnnotation.class,
        CoreAnnotations.TokenEndAnnotation.class,
        CoreAnnotations.PositionAnnotation.class,
        CoreAnnotations.IndexAnnotation.class,
        CoreAnnotations.OriginalTextAnnotation.class,
        CoreAnnotations.ValueAnnotation.class
    ));
  }

}