edu.stanford.nlp.process.TokenizerFactory Maven / Gradle / Ivy

Go to download

Show more of this group Show more artifacts with this name
Show all versions of stanford-parser Show documentation

Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.

The newest version!

package edu.stanford.nlp.process;

import edu.stanford.nlp.objectbank.IteratorFromReaderFactory;

import java.io.Reader;

/**
 * A TokenizerFactory is a factory that can build a Tokenizer (an extension of Iterator)
 * from a java.io.Reader.
 *
 * IMPORTANT NOTE:
 *
 * A TokenizerFactory should also provide two static methods:
 *
 * {@code public static TokenizerFactory newTokenizerFactory(); }
 * {@code public static TokenizerFactory newWordTokenizerFactory(String options); }
 *
 * These are expected by certain JavaNLP code (e.g., LexicalizedParser),
 * which wants to produce a TokenizerFactory by reflection.
 *
 * @author Christopher Manning
 *
 * @param  The type of the tokens returned by the Tokenizer
 */
public interface TokenizerFactory extends IteratorFromReaderFactory {

  /** Get a tokenizer for this reader.
   *
   *  @param r A Reader (which is assumed to already by buffered, if appropriate)
   *  @return A Tokenizer
   */
  Tokenizer getTokenizer(Reader r);

  /** Get a tokenizer for this reader.
   *
   *  @param r A Reader (which is assumed to already by buffered, if appropriate)
   *  @param extraOptions Options for how this tokenizer should behave
   *  @return A Tokenizer
   */
  Tokenizer getTokenizer(Reader r, String extraOptions);

  /** Sets default options for how tokenizers built from this factory should behave.
   *
   *  @param options Options for how this tokenizer should behave
   */
  void setOptions(String options);

}