edu.berkeley.nlp.io.PTBTokenizer Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of berkeleyparser Show documentation
The Berkeley parser analyzes the grammatical structure of natural language using probabilistic context-free grammars (PCFGs).
The newest version!
/**
 * 
 */
package edu.berkeley.nlp.io;


import java.io.*;
import java.util.*;

import edu.berkeley.nlp.util.StringUtils;

/**
 * Tokenizer implementation that conforms to the Penn Treebank tokenization
 * conventions.
 * This tokenizer is a Java implementation of Professor Chris Manning's Flex
 * tokenizer, pgtt-treebank.l.  It reads raw text from a {@link Reader} via a
 * {@link PTBLexer} and hands the lexer's tokens back one at a time. It can
 * optionally return carriage returns as tokens; a carriage-return token is
 * one that equals <code>PTBLexer.cr</code>.
 * <p>
 * NOTE(review): this port compares tokens directly against
 * <code>PTBLexer.cr</code>, so tokens are presumably plain Strings rather
 * than Word objects -- confirm against PTBLexer.
 *
 * @author Teg Grenager ([email protected])
 */
public class PTBTokenizer extends AbstractTokenizer {

  // whether carriage returns should be returned as tokens
  private final boolean tokenizeCRs;
  // the underlying lexer; stays null until setSource(Reader) is called
  PTBLexer lexer;

  /**
   * Constructs a new PTBTokenizer that treats carriage returns as normal
   * whitespace. No source is specified, so {@link #getNext()} yields
   * <code>null</code> until {@link #setSource(Reader)} is called.
   */
  public PTBTokenizer() {
    this(false);
  }

  /**
   * Constructs a new PTBTokenizer with no source that optionally returns
   * carriage returns as their own token.
   *
   * @param tokenizeCRs if true, tokens equal to <code>PTBLexer.cr</code> are
   *                    returned; if false, they are skipped
   */
  public PTBTokenizer(boolean tokenizeCRs) {
    this.tokenizeCRs = tokenizeCRs;
  }

  /**
   * Constructs a new PTBTokenizer over the given Reader that treats carriage
   * returns as normal whitespace.
   *
   * @param r the Reader to tokenize
   */
  public PTBTokenizer(Reader r) {
    this(r, false);
  }

  /**
   * Constructs a new PTBTokenizer over the given Reader that optionally
   * returns carriage returns as their own token.
   *
   * @param r           the Reader to tokenize
   * @param tokenizeCRs if true, tokens equal to <code>PTBLexer.cr</code> are
   *                    returned; if false, they are skipped
   */
  public PTBTokenizer(Reader r, boolean tokenizeCRs) {
    this.tokenizeCRs = tokenizeCRs;
    setSource(r);
  }

  /**
   * Gets the next token from the lexer if possible.
   *
   * @return the next token, or <code>null</code> when no source has been
   *         set, the lexer has no more tokens, or the lexer fails
   */
  protected Object getNext() {
    if (lexer == null) {
      return null;
    }
    Object token = null;
    try {
      token = lexer.next();
      // Skip carriage-return tokens unless the caller asked for them.
      // equals(Object) is false for null and for non-matching types, so no
      // cast is needed and unexpected token types pass through untouched.
      while (!tokenizeCRs && PTBLexer.cr.equals(token)) {
        token = lexer.next();
      }
    } catch (Exception e) {
      // A lexer failure ends tokenization. Reset the token as well: if the
      // failure happened while skipping a carriage return, the variable still
      // holds that CR, which must not leak out when tokenizeCRs is false.
      nextToken = null;
      token = null;
    }
    return token;
  }


  /** Reads a file from the argument and prints its tokens one per line.
   *  This is mainly as a testing aid, but it can also be quite useful
   *  standalone to turn a corpus into a one token per line file of tokens.
   *
   *  Usage: java edu.berkeley.nlp.io.PTBTokenizer [-cr] filename
   *
   *  @param args Command line arguments: an optional <code>-cr</code> flag
   *              (return carriage returns as tokens) followed by the name
   *              of the file to tokenize
   *  @throws IOException if the file cannot be opened or closed
   */
  public static void main(String[] args) throws IOException {
    if (args.length < 1) {
      System.err.println("usage: java edu.berkeley.nlp.io." +
              "PTBTokenizer [-cr] filename");
      return;
    }
    // The flag only counts when a separate filename follows it; a lone
    // argument is always the filename.
    boolean returnCRs = args.length > 1 && "-cr".equals(args[0]);
    Reader reader = new FileReader(args[args.length - 1]);
    try {
      PTBTokenizer tokenizer = new PTBTokenizer(reader, returnCRs);
      List words = tokenizer.tokenize();
      for (int i = 0; i < words.size(); i++) {
        System.out.println(words.get(i));
      }
    } finally {
      // Release the file handle even if tokenization fails.
      reader.close();
    }
  }

  /**
   * Sets the source of this Tokenizer to be the Reader r. A new lexer is
   * created over the Reader, replacing any previous one.
   *
   * @param r the Reader to tokenize
   */
  public void setSource(Reader r) {
    lexer = new PTBLexer(r);
  }

  /**
   * Returns a presentable version of the given PTB-tokenized text.
   * PTB tokenization splits up punctuation and does various other things
   * that make simply joining the tokens with spaces look bad. So join
   * the tokens with spaces and run the result through this method to produce
   * nice looking text. It's not perfect, but it works pretty well.
   *
   * @param ptbText PTB-tokenized text
   * @return the concatenation of the tokens produced by PTB2TextLexer; if
   *         the lexer throws an IOException, the stack trace is printed and
   *         the text collected so far is returned
   */
  public static String ptb2Text(String ptbText) {
    StringBuffer sb = new StringBuffer(ptbText.length()); // probably an overestimate
    PTB2TextLexer lexer = new PTB2TextLexer(new StringReader(ptbText));
    String token;
    try {
      while ((token = lexer.next()) != null) {
        sb.append(token);
      }
    } catch (IOException e) {
      // Best effort: report the problem and fall through with partial output.
      e.printStackTrace();
    }
    return sb.toString();
  }

  /**
   * Returns a presentable version of the given PTB-tokenized words.
   * The elements of the list are joined with
   * <code>StringUtils.join(List)</code> and the result is passed to
   * {@link #ptb2Text(String)}. The list itself is only read, never modified,
   * so unmodifiable lists are accepted.
   *
   * @param ptbWords the PTB tokens to join
   * @return a presentable rendering of the joined tokens
   */
  public static String ptb2Text(List ptbWords) {
    return ptb2Text(StringUtils.join(ptbWords));
  }

  /**
   * Returns a factory whose tokenizers treat carriage returns as normal
   * whitespace.
   */
  public static TokenizerFactory factory() {
    return new PTBTokenizerFactory();
  }

  /**
   * Returns a factory whose tokenizers optionally return carriage returns
   * as their own token.
   *
   * @param tokenizeCRs passed on to every PTBTokenizer the factory creates
   */
  public static TokenizerFactory factory(boolean tokenizeCRs) {
    return new PTBTokenizerFactory(tokenizeCRs);
  }


  /**
   * TokenizerFactory that creates PTBTokenizer instances sharing one
   * carriage-return setting.
   */
  static class PTBTokenizerFactory implements TokenizerFactory {

    // carriage-return setting handed to every tokenizer this factory creates
    protected boolean tokenizeCRs;

    /**
     * Constructs a new PTBTokenizerFactory whose tokenizers treat carriage
     * returns as normal whitespace.
     */
    public PTBTokenizerFactory() {
      this(false);
    }

    /**
     * Constructs a new PTBTokenizerFactory whose tokenizers optionally return
     * carriage returns as their own token.
     *
     * @param tokenizeCRs if true, created tokenizers return tokens equal to
     *                    <code>PTBLexer.cr</code>; if false, they skip them
     */
    public PTBTokenizerFactory(boolean tokenizeCRs) {
      this.tokenizeCRs = tokenizeCRs;
    }

    /**
     * Returns a new tokenizer over the given Reader, viewed as an Iterator.
     *
     * @param r the Reader to tokenize
     */
    public Iterator getIterator(Reader r) {
      return getTokenizer(r);
    }

    /**
     * Returns a new PTBTokenizer over the given Reader, using this factory's
     * carriage-return setting.
     *
     * @param r the Reader to tokenize
     */
    public Tokenizer getTokenizer(Reader r) {
      return new PTBTokenizer(r, tokenizeCRs);
    }

  }

}