All Downloads are FREE. The search and download functionality uses the official Maven repository.

edu.stanford.nlp.process.TokenizerAdapter Maven / Gradle / Ivy

Go to download

Stanford Parser processes raw text in English, Chinese, German, Arabic, and French, and extracts constituency parse trees.

There is a newer version: 3.9.2
Show newest version
package edu.stanford.nlp.process;


import java.io.IOException;
import java.io.StreamTokenizer;


/**
 * This class adapts between a java.io.StreamTokenizer
 * and an edu.stanford.nlp.process.Tokenizer.
 * <p>
 * Each token produced by the wrapped StreamTokenizer is converted
 * to a String; see {@link #getNext()} for the exact mapping.
 *
 * @author Christopher Manning
 * @version 2004/04/01
 */
public class TokenizerAdapter extends AbstractTokenizer {

  /** The wrapped tokenizer from which all tokens are read. */
  protected final StreamTokenizer st;

  /**
   * The String returned for an end-of-line token.  Defaults to the
   * empty String; change it with {@link #setEolString(String)}.
   */
  protected String eolString = "";


  /**
   * Create a new TokenizerAdapter.  In general, it is
   * recommended that the passed in StreamTokenizer should
   * have had resetSyntax() done to it, so that numbers are
   * returned as entered as tokens of type String, though this
   * code will cope as best it can.
   *
   * @param st The internal java.io.StreamTokenizer
   */
  public TokenizerAdapter(StreamTokenizer st) {
    this.st = st;
  }


  /**
   * Internally fetches the next token.
   * <p>
   * The token type reported by the inner StreamTokenizer is mapped
   * as follows:
   * <ul>
   * <li>end of line: the current eolString;</li>
   * <li>end of file: null;</li>
   * <li>word: the word text;</li>
   * <li>number: the value formatted with Double.toString, so the
   *     result can differ from the input text (for example an
   *     input of 3 is returned as "3.0");</li>
   * <li>quoted string: the body of the string, without the
   *     delimiting quote characters;</li>
   * <li>any other single character: a one-character String.</li>
   * </ul>
   * An IOException raised by the inner tokenizer is not propagated;
   * null is returned instead, so a read failure looks the same as
   * end of input to the caller.
   *
   * @return the next token in the token stream, or null if none exists.
   */
  @Override
  public String getNext() {
    try {
      int nextTok = st.nextToken();
      switch (nextTok) {
        case StreamTokenizer.TT_EOL:
          return eolString;
        case StreamTokenizer.TT_EOF:
          return null;
        case StreamTokenizer.TT_WORD:
          return st.sval;
        case StreamTokenizer.TT_NUMBER:
          return Double.toString(st.nval);
        default:
          // For a quoted string token StreamTokenizer sets ttype to the
          // quote character and puts the string body in sval; returning
          // only the character would silently drop the quoted text.
          // NOTE(review): relies on sval being null for plain
          // single-character tokens, as in the OpenJDK implementation --
          // confirm if a custom StreamTokenizer subclass is passed in.
          if (st.sval != null) {
            return st.sval;
          }
          return String.valueOf((char) nextTok);
      }
    } catch (IOException ioe) {
      // deliberately best-effort: do nothing, return null
      return null;
    }
  }


  /**
   * Set the String returned when the inner tokenizer
   * returns an end-of-line token.  This will only happen if the
   * inner tokenizer has been set to eolIsSignificant(true).
   *
   * @param eolString The String used to represent eol.  It is not allowed
   *                  to be null (which would confuse line ends and file end)
   * @throws IllegalArgumentException if eolString is null
   */
  public void setEolString(String eolString) {
    if (eolString == null) {
      throw new IllegalArgumentException("eolString cannot be null");
    }
    this.eolString = eolString;
  }


  /**
   * Say whether the String is the end-of-line token for
   * this tokenizer.
   *
   * @param str The String being tested; null is never the
   *            end-of-line token
   * @return Whether it is the end-of-line token
   */
  public boolean isEol(String str) {
    return eolString.equals(str);
  }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy