/*
 * edu.stanford.nlp.process.WhitespaceTokenizer, from the stanford-parser Maven artifact.
 * Stanford Parser processes raw text in English, Chinese, German, Arabic, and French,
 * and extracts constituency parse trees.
 */
package edu.stanford.nlp.process;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Reader;
import java.util.Iterator;
import java.util.Properties;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.util.PropertiesUtils;
import edu.stanford.nlp.util.StringUtils;

/**
 * A WhitespaceTokenizer is a tokenizer that splits on and discards only
 * whitespace characters.
 * This implementation can return Word, CoreLabel or other LexedToken objects. It has a parameter
 * for whether to make EOL a token or whether to treat EOL characters as whitespace.
 * If an EOL is a token, the class returns it as a Word with String value "\n".
 *
 * Implementation note: This was rewritten in Apr 2006 to discard the old StreamTokenizer-based
 * implementation and to replace it with a Unicode compliant JFlex-based version.
 * This tokenizer treats as Whitespace almost exactly the same characters deemed Whitespace by the
 * Java function {@link java.lang.Character#isWhitespace(int) isWhitespace}. That is, a whitespace
 * is a Unicode SPACE_SEPARATOR, LINE_SEPARATOR or PARAGRAPH_SEPARATOR other than the non-breaking
 * space characters, or one of the control characters U+0009-U+000D or U+001C-U+001F. The one
 * addition is to also allow U+0085 as a line ending character, for compatibility with certain
 * IBM systems.
 * For including "spaces" in tokens, it is recommended that you represent them as the non-break space
 * character U+00A0.
 *
 * @author Joseph Smarr
 * @author Teg Grenager
 * @author Roger Levy
 * @author Christopher Manning
 */
public class WhitespaceTokenizer<T extends HasWord> extends AbstractTokenizer<T> {

  /** Underlying lexer; {@code null} when the tokenizer was constructed with a null Reader. */
  private final WhitespaceLexer lexer;

  /** Whether end-of-line tokens are returned to the caller instead of being skipped. */
  private final boolean eolIsSignificant;

  /**
   * A factory which vends WhitespaceTokenizers.
   *
   * @param <T> The type of token produced by the tokenizers this factory creates
   * @author Christopher Manning
   */
  public static class WhitespaceTokenizerFactory<T extends HasWord> implements TokenizerFactory<T> {

    private static final long serialVersionUID = -5438594683910349897L;

    /** Current default for whether newlines are returned as tokens; changed by setOptions. */
    private boolean tokenizeNLs;

    /** Token factory handed to every tokenizer this factory creates. */
    @SuppressWarnings("serial")
    private final LexedTokenFactory<T> factory;

    /**
     * Constructs a new TokenizerFactory that returns Word objects and
     * treats carriage returns as normal whitespace.
     * THIS METHOD IS INVOKED BY REFLECTION BY SOME OF THE JAVANLP
     * CODE TO LOAD A TOKENIZER FACTORY.  IT SHOULD BE PRESENT IN A
     * TokenizerFactory.
     *
     * @return A TokenizerFactory that returns Word objects
     */
    public static TokenizerFactory<Word> newTokenizerFactory() {
      return new WhitespaceTokenizerFactory<>(new WordTokenFactory(), false);
    }

    /**
     * Constructs a factory whose tokenizers do not return newline tokens.
     *
     * @param factory The token factory used to build each token
     */
    public WhitespaceTokenizerFactory(LexedTokenFactory<T> factory) {
      this(factory, false);
    }

    /**
     * Constructs a factory configured from an options String.
     * The only option read here is the boolean {@code tokenizeNLs}, which defaults to false.
     *
     * @param factory The token factory used to build each token
     * @param options Options String, parsed with {@code StringUtils.stringToProperties}
     */
    public WhitespaceTokenizerFactory(LexedTokenFactory<T> factory,
                                      String options) {
      this.factory = factory;
      Properties prop = StringUtils.stringToProperties(options);
      this.tokenizeNLs = PropertiesUtils.getBool(prop, "tokenizeNLs", false);
    }

    /**
     * Constructs a factory with an explicit newline policy.
     *
     * @param factory The token factory used to build each token
     * @param tokenizeNLs Whether created tokenizers return newlines as tokens
     */
    public WhitespaceTokenizerFactory(LexedTokenFactory<T> factory,
                                      boolean tokenizeNLs) {
      this.factory = factory;
      this.tokenizeNLs = tokenizeNLs;
    }

    /**
     * Returns a tokenizer over the Reader, viewed as an Iterator.
     *
     * @param r The Reader to tokenize
     * @return The same object that {@link #getTokenizer(Reader)} returns
     */
    @Override
    public Iterator<T> getIterator(Reader r) {
      return getTokenizer(r);
    }

    /**
     * Creates a tokenizer over the Reader using this factory's current newline setting.
     *
     * @param r The Reader to tokenize
     * @return A new WhitespaceTokenizer
     */
    @Override
    public Tokenizer<T> getTokenizer(Reader r) {
      return new WhitespaceTokenizer<>(factory, r, tokenizeNLs);
    }

    /**
     * Creates a tokenizer over the Reader, letting {@code extraOptions} override the
     * {@code tokenizeNLs} setting for this one tokenizer only. The factory's own
     * setting is used as the default and is not modified.
     *
     * @param r The Reader to tokenize
     * @param extraOptions Options String, parsed with {@code StringUtils.stringToProperties}
     * @return A new WhitespaceTokenizer
     */
    @Override
    public Tokenizer<T> getTokenizer(Reader r, String extraOptions) {
      Properties prop = StringUtils.stringToProperties(extraOptions);
      boolean tokenizeNewlines = PropertiesUtils.getBool(prop, "tokenizeNLs", this.tokenizeNLs);

      return new WhitespaceTokenizer<>(factory, r, tokenizeNewlines);
    }

    /**
     * Updates this factory's {@code tokenizeNLs} setting from an options String.
     * The current value is kept if the option is not given.
     *
     * @param options Options String, parsed with {@code StringUtils.stringToProperties}
     */
    @Override
    public void setOptions(String options) {
      Properties prop = StringUtils.stringToProperties(options);
      tokenizeNLs = PropertiesUtils.getBool(prop, "tokenizeNLs", tokenizeNLs);
    }

  } // end class WhitespaceTokenizerFactory


  /**
   * Creates a factory for tokenizers that return CoreLabel tokens.
   *
   * @param options Options String passed to the WhitespaceTokenizerFactory constructor
   * @return A factory producing CoreLabel tokenizers
   */
  public static WhitespaceTokenizerFactory<CoreLabel> newCoreLabelTokenizerFactory(String options) {
    return new WhitespaceTokenizerFactory<>(new CoreLabelTokenFactory(), options);
  }

  /**
   * Creates a factory for tokenizers that return CoreLabel tokens and
   * do not return newline tokens.
   *
   * @return A factory producing CoreLabel tokenizers
   */
  public static WhitespaceTokenizerFactory<CoreLabel> newCoreLabelTokenizerFactory() {
    return new WhitespaceTokenizerFactory<>(new CoreLabelTokenFactory());
  }

  /**
   * Internally fetches the next token.
   *
   * @return the next token in the token stream, or null if none exists.
   */
  @SuppressWarnings("unchecked")
  @Override
  protected T getNext() {
    if (lexer == null) {
      return null;
    }
    try {
      // The lexer is not generically typed; the tokens it yields come from the
      // LexedTokenFactory passed to the constructor, hence the unchecked casts.
      T token = (T) lexer.next();
      // Skip over end-of-line tokens unless the caller asked to see them.
      while (token != null && !eolIsSignificant
              && WhitespaceLexer.NEWLINE.equals(token.word())) {
        token = (T) lexer.next();
      }
      return token;
    } catch (IOException e) {
      // Existing contract: a read failure is reported to the caller as "no more tokens".
      return null;
    }
  }

  /**
   * Constructs a new WhitespaceTokenizer.
   *
   * @param factory The token factory passed on to the underlying lexer.
   * @param r The Reader that is its source. May be null, in which case the
   *          tokenizer yields no tokens.
   * @param eolIsSignificant Whether eol tokens should be returned.
   */
  public WhitespaceTokenizer(LexedTokenFactory<T> factory,
                             Reader r, boolean eolIsSignificant) {
    this.eolIsSignificant = eolIsSignificant;
    // The conditional below is perhaps currently needed in LexicalizedParser, since
    // it passes in a null arg while doing type-checking for sentence escaping
    // but StreamTokenizer barfs on that.  But maybe shouldn't be here.
    this.lexer = (r != null) ? new WhitespaceLexer(r, factory) : null;
  }

  /**
   * Creates a tokenizer returning CoreLabel tokens that skips newlines.
   *
   * @param r The Reader to tokenize
   * @return A new CoreLabel tokenizer
   */
  public static WhitespaceTokenizer<CoreLabel> newCoreLabelWhitespaceTokenizer(Reader r) {
    return new WhitespaceTokenizer<>(new CoreLabelTokenFactory(), r, false);
  }

  /**
   * Creates a tokenizer returning CoreLabel tokens.
   *
   * @param r The Reader to tokenize
   * @param tokenizeNLs Whether newlines are returned as tokens
   * @return A new CoreLabel tokenizer
   */
  public static WhitespaceTokenizer<CoreLabel> newCoreLabelWhitespaceTokenizer(Reader r, boolean tokenizeNLs) {
    return new WhitespaceTokenizer<>(new CoreLabelTokenFactory(), r, tokenizeNLs);
  }

  /**
   * Creates a tokenizer returning Word tokens that skips newlines.
   *
   * @param r The Reader to tokenize
   * @return A new Word tokenizer
   */
  public static WhitespaceTokenizer<Word> newWordWhitespaceTokenizer(Reader r) {
    return newWordWhitespaceTokenizer(r, false);
  }

  /**
   * Creates a tokenizer returning Word tokens.
   *
   * @param r The Reader to tokenize
   * @param eolIsSignificant Whether eol tokens should be returned
   * @return A new Word tokenizer
   */
  public static WhitespaceTokenizer<Word> newWordWhitespaceTokenizer(Reader r, boolean eolIsSignificant) {
    return new WhitespaceTokenizer<>(new WordTokenFactory(), r, eolIsSignificant);
  }

  /**
   * Creates a factory for Word tokenizers that skip newlines.
   *
   * @return A factory producing Word tokenizers
   */
  public static TokenizerFactory<Word> factory() {
    return new WhitespaceTokenizerFactory<>(new WordTokenFactory(), false);
  }

  /**
   * Creates a factory for Word tokenizers.
   *
   * @param eolIsSignificant Whether created tokenizers return eol tokens
   * @return A factory producing Word tokenizers
   */
  public static TokenizerFactory<Word> factory(boolean eolIsSignificant) {
    return new WhitespaceTokenizerFactory<>(new WordTokenFactory(), eolIsSignificant);
  }

  /**
   * Reads a file from the argument and prints its tokens one per line.
   * This is mainly as a testing aid, but it can also be quite useful
   * standalone to turn a corpus into a one token per line file of tokens.
   * If the first argument is {@code -cr}, newlines are printed as
   * {@code ***CR***}. The last argument, unless it is {@code -cr}, names the
   * input file; otherwise standard input is read. All I/O is UTF-8.
   *
   * Usage: {@code java edu.stanford.nlp.process.WhitespaceTokenizer [-cr] [filename] }
   *
   * @param args Command line arguments
   * @throws IOException If can't open files, etc.
   */
  public static void main(String[] args) throws IOException {
    boolean eolIsSignificant = (args.length > 0 && "-cr".equals(args[0]));
    String lastArg = (args.length > 0) ? args[args.length - 1] : null;
    boolean readFromFile = (lastArg != null && !"-cr".equals(lastArg));

    // Not closed on purpose: closing this writer would close System.out.
    // Autoflush is on, so each println is flushed immediately.
    PrintWriter pw =
      new PrintWriter(new OutputStreamWriter(System.out, "UTF-8"), true);

    // try-with-resources so that an opened input file is always released.
    try (Reader reader = new InputStreamReader(
            readFromFile ? new FileInputStream(lastArg) : System.in, "UTF-8")) {
      WhitespaceTokenizer<Word> tokenizer =
              new WhitespaceTokenizer<>(new WordTokenFactory(), reader, eolIsSignificant);
      while (tokenizer.hasNext()) {
        Word w = tokenizer.next();
        if (WhitespaceLexer.NEWLINE.equals(w.value())) {
          pw.println("***CR***");
        } else {
          pw.println(w);
        }
      }
    }
  }

} // end class WhitespaceTokenizer