// edu.stanford.nlp.process.AbstractTokenizer, from the stanford-parser artifact (Maven / Gradle / Ivy).
// Stanford Parser processes raw text in English, Chinese, German, Arabic, and French,
// and extracts constituency parse trees.
package edu.stanford.nlp.process;

import java.util.ArrayList;
import java.util.List;
import java.util.NoSuchElementException;

// import edu.stanford.nlp.util.logging.Redwood;


/**
 * An abstract tokenizer. Tokenizers extending AbstractTokenizer need only
 * implement the {@code getNext()} method. This implementation does not
 * allow null tokens, since null is used in the protected {@code nextToken}
 * field to signify that no more tokens are available.
 * <p>
 * The class keeps at most one token of lookahead: {@link #hasNext()} and
 * {@link #peek()} fetch a token via {@code getNext()} and cache it in
 * {@code nextToken}; {@link #next()} hands the cached token out and clears
 * the cache.
 *
 * @param <T> the type of token produced by this tokenizer
 * @author Teg Grenager
 */

public abstract class AbstractTokenizer<T> implements Tokenizer<T>  {

  // /** A logger for this class */
  // private static final Redwood.RedwoodChannels log = Redwood.channels(AbstractTokenizer.class);

  /** For tokenizing carriage returns.
   *  We return this token as a representation of newlines when a tokenizer has the option
   *  {@code tokenizeNLs = true}. It is assumed that no tokenizer allows *NL* as a token.
   *  This is certainly true for PTBTokenizer-derived tokenizers, where the asterisks would
   *  become separate tokens.
   */
  public static final String NEWLINE_TOKEN = "*NL*";


  /**
   * One-token lookahead buffer. {@code null} means that no token is currently
   * buffered; the next call to {@code next()}, {@code hasNext()} or
   * {@code peek()} will then ask {@code getNext()} for one.
   */
  protected T nextToken; // = null;

  /**
   * Internally fetches the next token.
   *
   * @return the next token in the token stream, or null if none exists.
   */
  protected abstract T getNext();

  /**
   * Returns the next token from this Tokenizer and clears the lookahead buffer.
   *
   * @return the next token in the token stream.
   * @throws java.util.NoSuchElementException
   *          if the token stream has no more tokens.
   */
  @Override
  public T next() {
    if (nextToken == null) {
      nextToken = getNext();
    }
    T result = nextToken;
    // Consume the buffered token so the following call fetches a fresh one.
    nextToken = null;
    if (result == null) {
      throw new NoSuchElementException();
    }
    return result;
  }

  /**
   * Returns {@code true} if this Tokenizer has more elements.
   * If no token is buffered, one is fetched with {@code getNext()} and kept
   * in {@code nextToken} for the following {@code next()} or {@code peek()}.
   *
   * @return {@code true} if a further token is available, {@code false} otherwise.
   */
  @Override
  public boolean hasNext() {
    if (nextToken == null) {
      nextToken = getNext();
    }
    return nextToken != null;
  }

  /**
   * This is an optional operation, by default not supported.
   *
   * @throws UnsupportedOperationException always, in this implementation.
   */
  @Override
  public void remove() {
    throw new UnsupportedOperationException();
  }

  /**
   * This is an optional operation, by default supported.
   * The returned token stays buffered, so a following {@code next()} returns
   * the same token.
   *
   * @return The next token in the token stream.
   * @throws java.util.NoSuchElementException
   *          if the token stream has no more tokens.
   */
  @Override
  public T peek() {
    if (nextToken == null) {
      nextToken = getNext();
    }
    if (nextToken == null) {
      throw new NoSuchElementException();
    }
    return nextToken;
  }

  // Assume that the text we are being asked to tokenize is usually more than 10 tokens; save 5 reallocations
  private static final int DEFAULT_TOKENIZE_LIST_SIZE = 64;

  /**
   * Returns text as a List of tokens, consuming every remaining token.
   *
   * @return A list of all tokens remaining in the underlying Reader
   */
  @Override
  public List<T> tokenize() {
    // Declared as ArrayList (not List) because trimToSize() is needed below.
    ArrayList<T> result = new ArrayList<>(DEFAULT_TOKENIZE_LIST_SIZE);
    while (hasNext()) {
      result.add(next());
    }
    // log.info("tokenize() produced " + result);
    // if it was tiny, reallocate small
    if (result.size() <= DEFAULT_TOKENIZE_LIST_SIZE / 4) {
      result.trimToSize();
    }
    return result;
  }

}