edu.stanford.nlp.process.Tokenizer Maven / Gradle / Ivy

Go to download

Show more of this group Show more artifacts with this name
Show all versions of stanford-parser Show documentation

Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.

There is a newer version: 3.9.2

Show newest version

package edu.stanford.nlp.process;


import java.util.Iterator;
import java.util.List;

/**
 * Tokenizers break up text into individual Objects. These objects may be
 * Strings, Words, or other Objects.  A Tokenizer extends the Iterator
 * interface, but provides a lookahead operation {@code peek()}.  An
 * implementation of this interface is expected to have a constructor that
 * takes a single argument, a Reader.
 *
 * @param <T> the type of token produced by this Tokenizer
 * @author Teg Grenager ([email protected])
 */
public interface Tokenizer<T> extends Iterator<T> {

  /**
   * Returns the next token from this Tokenizer.
   *
   * @return the next token in the token stream.
   * @throws java.util.NoSuchElementException
   *          if the token stream has no more tokens.
   */
  @Override
  public T next();

  /**
   * Returns true if and only if this Tokenizer has more elements.
   *
   * @return {@code true} if a further call to {@code next()} or
   *         {@code peek()} would return a token.
   */
  @Override
  public boolean hasNext();

  /**
   * Removes from the underlying collection the last element returned by
   * the iterator.  This is an optional operation for Iterators - a
   * Tokenizer normally would not support it. This method can be called
   * only once per call to next.
   *
   * @throws UnsupportedOperationException
   *          if the implementation does not support removal (the
   *          convention of the {@link Iterator} contract).
   */
  @Override
  public void remove();

  /**
   * Returns the next token, without removing it, from the Tokenizer, so
   * that the same token will be again returned on the next call to
   * {@code next()} or {@code peek()}.
   *
   * @return the next token in the token stream.
   * @throws java.util.NoSuchElementException
   *          if the token stream has no more tokens.
   */
  public T peek();

  /**
   * Returns all tokens of this Tokenizer as a List for convenience.
   *
   * @return A list of all the tokens
   */
  public List<T> tokenize();

}