
edu.stanford.nlp.process.WhitespaceTokenizer Maven / Gradle / Ivy


Stanford CoreNLP provides a set of natural language analysis tools which can take raw English language text input and give the base forms of words, their parts of speech, whether they are names of companies, people, etc., normalize dates, times, and numeric quantities, mark up the structure of sentences in terms of phrases and word dependencies, and indicate which noun phrases refer to the same entities. It provides the foundational building blocks for higher level text understanding applications.
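
As a rough illustration of the analyses listed above, the usual entry point is the StanfordCoreNLP annotation pipeline. The following is only a sketch under assumptions: the annotator list, sample sentence, and class name PipelineSketch are illustrative, and the CoreNLP models jar must be on the classpath. It is not part of the WhitespaceTokenizer source shown below.

import java.util.Properties;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

public class PipelineSketch {
  public static void main(String[] args) {
    // Choose which analyses to run (assumption: these annotators cover the
    // tokenization, POS tagging, lemmatization, and NER described above).
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize, ssplit, pos, lemma, ner");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    Annotation document = new Annotation("Stanford University was founded in 1891.");
    pipeline.annotate(document);

    // Print each token with its lemma, part of speech, and named-entity tag.
    for (CoreMap sentence : document.get(CoreAnnotations.SentencesAnnotation.class)) {
      for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
        System.out.println(token.word() + "\t"
            + token.get(CoreAnnotations.LemmaAnnotation.class) + "\t"
            + token.get(CoreAnnotations.PartOfSpeechAnnotation.class) + "\t"
            + token.get(CoreAnnotations.NamedEntityTagAnnotation.class));
      }
    }
  }
}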

package edu.stanford.nlp.process;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Reader;
import java.util.Iterator;
import java.util.Properties;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.util.PropertiesUtils;
import edu.stanford.nlp.util.StringUtils;

/**
 * A WhitespaceTokenizer is a tokenizer that splits on and discards only
 * whitespace characters.
 * This implementation returns Word objects. It has a parameter for whether
 * to make EOL a token or whether to treat EOL characters as whitespace.
 * If an EOL is a token, the class returns it as a Word with String value "\n".
 * <p>
 * Implementation note: This was rewritten in Apr 2006 to discard the
 * old StreamTokenizer based implementation and to replace it with a
 * Unicode compliant JFlex-based version.
 *
 * @author Joseph Smarr ([email protected])
 * @author Teg Grenager ([email protected])
 * @author Roger Levy
 * @author Christopher Manning
 */
public class WhitespaceTokenizer<T extends HasWord> extends AbstractTokenizer<T> {

  private WhitespaceLexer lexer;
  private final boolean eolIsSignificant;

  /**
   * A factory which vends WhitespaceTokenizers.
   *
   * @author Christopher Manning
   */
  public static class WhitespaceTokenizerFactory<T extends HasWord> implements TokenizerFactory<T> {

    private boolean tokenizeNLs;
    private LexedTokenFactory<T> factory;

    /**
     * Constructs a new TokenizerFactory that returns Word objects and
     * treats carriage returns as normal whitespace.
     * THIS METHOD IS INVOKED BY REFLECTION BY SOME OF THE JAVANLP
     * CODE TO LOAD A TOKENIZER FACTORY. IT SHOULD BE PRESENT IN A
     * TokenizerFactory.
     *
     * @return A TokenizerFactory that returns Word objects
     */
    public static TokenizerFactory<Word> newTokenizerFactory() {
      return new WhitespaceTokenizerFactory<>(new WordTokenFactory(), false);
    }

    public WhitespaceTokenizerFactory(LexedTokenFactory<T> factory) {
      this(factory, false);
    }

    public WhitespaceTokenizerFactory(LexedTokenFactory<T> factory, String options) {
      this.factory = factory;
      Properties prop = StringUtils.stringToProperties(options);
      this.tokenizeNLs = PropertiesUtils.getBool(prop, "tokenizeNLs", false);
    }

    public WhitespaceTokenizerFactory(LexedTokenFactory<T> factory, boolean tokenizeNLs) {
      this.factory = factory;
      this.tokenizeNLs = tokenizeNLs;
    }

    public Iterator<T> getIterator(Reader r) {
      return getTokenizer(r);
    }

    public Tokenizer<T> getTokenizer(Reader r) {
      return new WhitespaceTokenizer<>(factory, r, tokenizeNLs);
    }

    public Tokenizer<T> getTokenizer(Reader r, String extraOptions) {
      Properties prop = StringUtils.stringToProperties(extraOptions);
      boolean tokenizeNewlines = PropertiesUtils.getBool(prop, "tokenizeNLs", this.tokenizeNLs);
      return new WhitespaceTokenizer<>(factory, r, tokenizeNewlines);
    }

    public void setOptions(String options) {
      Properties prop = StringUtils.stringToProperties(options);
      tokenizeNLs = PropertiesUtils.getBool(prop, "tokenizeNLs", tokenizeNLs);
    }

  } // end class WhitespaceTokenizerFactory

  public static WhitespaceTokenizerFactory<CoreLabel> newCoreLabelTokenizerFactory(String options) {
    return new WhitespaceTokenizerFactory<>(new CoreLabelTokenFactory(), options);
  }

  public static WhitespaceTokenizerFactory<CoreLabel> newCoreLabelTokenizerFactory() {
    return new WhitespaceTokenizerFactory<>(new CoreLabelTokenFactory());
  }

  /**
   * Internally fetches the next token.
   *
   * @return the next token in the token stream, or null if none exists.
   */
  @SuppressWarnings("unchecked")
  @Override
  protected T getNext() {
    T token = null;
    if (lexer == null) {
      return token;
    }
    try {
      token = (T) lexer.next();
      while (token != null && token.word().equals(WhitespaceLexer.NEWLINE)) {
        if (eolIsSignificant) {
          return token;
        } else {
          token = (T) lexer.next();
        }
      }
    } catch (IOException e) {
      // do nothing, return null
    }
    return token;
  }

  /**
   * Constructs a new WhitespaceTokenizer.
   *
   * @param r The Reader that is its source.
   * @param eolIsSignificant Whether eol tokens should be returned.
   */
  public WhitespaceTokenizer(LexedTokenFactory<T> factory, Reader r, boolean eolIsSignificant) {
    this.eolIsSignificant = eolIsSignificant;
    // The conditional below is perhaps currently needed in LexicalizedParser, since
    // it passes in a null arg while doing type-checking for sentence escaping
    // but StreamTokenizer barfs on that. But maybe shouldn't be here.
    if (r != null) {
      lexer = new WhitespaceLexer(r, factory);
    }
  }

  public static WhitespaceTokenizer<CoreLabel> newCoreLabelWhitespaceTokenizer(Reader r) {
    return new WhitespaceTokenizer<>(new CoreLabelTokenFactory(), r, false);
  }

  public static WhitespaceTokenizer<CoreLabel> newCoreLabelWhitespaceTokenizer(Reader r, boolean tokenizeNLs) {
    return new WhitespaceTokenizer<>(new CoreLabelTokenFactory(), r, tokenizeNLs);
  }

  public static WhitespaceTokenizer<Word> newWordWhitespaceTokenizer(Reader r) {
    return newWordWhitespaceTokenizer(r, false);
  }

  public static WhitespaceTokenizer<Word> newWordWhitespaceTokenizer(Reader r, boolean eolIsSignificant) {
    return new WhitespaceTokenizer<>(new WordTokenFactory(), r, eolIsSignificant);
  }

  /* ----
   * Sets the source of this Tokenizer to be the Reader r.

  private void setSource(Reader r) {
    lexer = new WhitespaceLexer(r);
  }
  ---- */

  public static TokenizerFactory<Word> factory() {
    return new WhitespaceTokenizerFactory<>(new WordTokenFactory(), false);
  }

  public static TokenizerFactory<Word> factory(boolean eolIsSignificant) {
    return new WhitespaceTokenizerFactory<>(new WordTokenFactory(), eolIsSignificant);
  }

  /**
   * Reads a file from the argument and prints its tokens one per line.
   * This is mainly as a testing aid, but it can also be quite useful
   * standalone to turn a corpus into a one token per line file of tokens.
   * <p>
   * Usage: java edu.stanford.nlp.process.WhitespaceTokenizer filename
   *
   * @param args Command line arguments
   * @throws IOException If can't open files, etc.
   */
  public static void main(String[] args) throws IOException {
    boolean eolIsSignificant = (args.length > 0 && args[0].equals("-cr"));
    Reader reader = ((args.length > 0 && !args[args.length - 1].equals("-cr")) ?
                     new InputStreamReader(new FileInputStream(args[args.length - 1]), "UTF-8") :
                     new InputStreamReader(System.in, "UTF-8"));
    WhitespaceTokenizer<Word> tokenizer =
        new WhitespaceTokenizer<>(new WordTokenFactory(), reader, eolIsSignificant);
    PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out, "UTF-8"), true);
    while (tokenizer.hasNext()) {
      Word w = tokenizer.next();
      if (w.value().equals(WhitespaceLexer.NEWLINE)) {
        pw.println("***CR***");
      } else {
        pw.println(w);
      }
    }
  }

} // end class WhitespaceTokenizer
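
For completeness, a minimal sketch of calling this class programmatically rather than through main(). The factory method newCoreLabelWhitespaceTokenizer and the word() accessor come from the source above; the demo class name, sample string, and StringReader wrapping are illustrative assumptions only.

import java.io.StringReader;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.process.WhitespaceTokenizer;

public class WhitespaceTokenizerDemo {
  public static void main(String[] args) {
    // Split a short string on whitespace only; with tokenizeNLs=false the
    // newline is treated as ordinary whitespace and discarded.
    WhitespaceTokenizer<CoreLabel> tokenizer =
        WhitespaceTokenizer.newCoreLabelWhitespaceTokenizer(
            new StringReader("Colorless green ideas\nsleep furiously ."), false);
    while (tokenizer.hasNext()) {
      System.out.println(tokenizer.next().word());
    }
    // Passing true instead would return each newline as a separate "\n" token,
    // matching the eolIsSignificant behavior described in the class javadoc.
  }
}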




