package edu.stanford.nlp.process;
// Stanford English Tokenizer -- a deterministic, fast high-quality tokenizer
// Copyright (c) 2002-2009 The Board of Trustees of
// The Leland Stanford Junior University. All Rights Reserved.
//
// This program is free software; you can redistribute it and/or
// modify it under the terms of the GNU General Public License
// as published by the Free Software Foundation; either version 2
// of the License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
//
// For more information, bug reports, fixes, contact:
// Christopher Manning
// Dept of Computer Science, Gates 1A
// Stanford CA 94305-9010
// USA
// [email protected]
// http://nlp.stanford.edu/software/
import java.io.*;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.Word;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.PropertiesUtils;
import edu.stanford.nlp.util.StringUtils;
/**
* A fast, rule-based tokenizer implementation, which produces Penn Treebank
* style tokenization of English text. It was initially written to conform
* to Penn Treebank tokenization conventions over ASCII text, but now provides
* a range of tokenization options over a broader space of Unicode text.
* It reads raw text and outputs
* tokens of classes that implement edu.stanford.nlp.ling.HasWord
* (typically a Word or a CoreLabel). It can
* optionally return end-of-line as a token.
*
* New code is encouraged to use the {@link #PTBTokenizer(Reader,LexedTokenFactory,String)}
* constructor. The other constructors are historical.
* You specify the type of result tokens with a
* LexedTokenFactory, and control the treatment of tokens via mostly boolean
* options given in a comma-separated options String
* (e.g., "invertible,normalizeParentheses=true").
* If the String is {@code null} or empty, you get the traditional
* PTB3 normalization behaviour (i.e., you get ptb3Escaping=true). If you
* want no normalization, then you should pass in the String
* "ptb3Escaping=false". The known option names are:
*
* - invertible: Store enough information about the original form of the
* token and the whitespace around it that a list of tokens can be
* faithfully converted back to the original String. Valid only if the
* LexedTokenFactory is an instance of CoreLabelTokenFactory. The
* keys used in it are: TextAnnotation for the tokenized form,
* OriginalTextAnnotation for the original string, BeforeAnnotation and
* AfterAnnotation for the whitespace before and after a token, and
* perhaps CharacterOffsetBeginAnnotation and CharacterOffsetEndAnnotation to record
* token begin/after end character offsets, if they were specified to be recorded
* in TokenFactory construction. (Like the String class, begin and end
* are done so end - begin gives the token length.) Default is false.
* (See the reconstruction sketch after this option list.)
*
* - tokenizeNLs: Whether end-of-lines should become tokens (or just
* be treated as part of whitespace). Default is false.
*
* - ptb3Escaping: Enable all traditional PTB3 token transforms
* (like parentheses becoming -LRB-, -RRB-). This is a macro flag that
* sets or clears all the options below. (Default setting of the various
* properties below that this flag controls is equivalent to it being set
* to true.)
*
* - americanize: Whether to rewrite common British English spellings
* as American English spellings. (This is useful if your training
* material uses American English spelling, such as the Penn Treebank.)
* Default is true.
*
* - normalizeSpace: Whether any spaces in tokens (in phone numbers or fractions)
* get turned into U+00A0 (non-breaking space). It's dangerous to turn
* this off for most of our Stanford NLP software, which assumes no
* spaces in tokens. Default is true.
*
* - normalizeAmpersandEntity: Whether to map the XML entity &amp; to an
* ampersand. Default is true.
*
* - normalizeCurrency: Whether to do some awful lossy currency mappings
* to turn common currency characters into $, #, or "cents", reflecting
* the fact that nothing else appears in the old PTB3 WSJ. (No Euro!)
* Default is true.
*
* - normalizeFractions: Whether to map certain common composed
* fraction characters to spelled out letter forms like "1/2".
* Default is true.
*
* - normalizeParentheses: Whether to map round parentheses to -LRB-,
* -RRB-, as in the Penn Treebank. Default is true.
*
* - normalizeOtherBrackets: Whether to map other common bracket characters
* to -LCB-, -LRB-, -RCB-, -RRB-, roughly as in the Penn Treebank.
* Default is true.
*
* - asciiQuotes: Whether to map all quote characters to the traditional ' and ".
* Default is false.
*
* - latexQuotes: Whether to map quotes to ``, `, ', '', as in LaTeX
* and the PTB3 WSJ (though this is now heavily frowned on in Unicode).
* If true, this takes precedence over the setting of unicodeQuotes;
* if both are false, no mapping is done. Default is true.
*
* - unicodeQuotes: Whether to map quotes to the range U+2018 to U+201D,
* the preferred unicode encoding of single and double quotes.
* Default is false.
*
* - ptb3Ellipsis: Whether to map ellipses to three dots (...), the
* old PTB3 WSJ coding of an ellipsis. If true, this takes precedence
* over the setting of unicodeEllipsis; if both are false, no mapping
* is done. Default is true.
*
* - unicodeEllipsis: Whether to map dot and optional space sequences to
* U+2026, the Unicode ellipsis character. Default is false.
*
* - ptb3Dashes: Whether to turn various dash characters into "--",
* the dominant encoding of dashes in the PTB3 WSJ. Default is true.
*
* - keepAssimilations: true to tokenize "gonna", false to tokenize
* "gon na". Default is true.
*
* - escapeForwardSlashAsterisk: Whether to put a backslash escape in front
* of / and * as the old PTB3 WSJ does for some reason (something to do
* with Lisp readers??). Default is true.
*
* - untokenizable: What to do with untokenizable characters (ones not
* known to the tokenizer). Six options combining whether to log a
* warning for none, the first, or all, and whether to delete them or
* to include them as single character tokens in the output: noneDelete,
* firstDelete, allDelete, noneKeep, firstKeep, allKeep.
* The default is "firstDelete".
*
* - strictTreebank3: PTBTokenizer deliberately deviates from strict PTB3
* WSJ tokenization in two cases. Setting this improves compatibility
* for those cases. They are: (i) When an acronym is followed by a
* sentence end, such as "Corp." at the end of a sentence, the PTB3
* has tokens of "Corp" and ".", while by default PTBTokenizer duplicates
* the period, returning tokens of "Corp." and ".", and (ii) PTBTokenizer
* will return numbers with a whole number and a fractional part like
* "5 7/8" as a single token, with a non-breaking space in the middle,
* while the PTB3 separates them into two tokens "5" and "7/8".
* (Exception: for only "U.S." the treebank does have the two tokens
* "U.S." and "." like our default; strictTreebank3 now does that too.)
* The default is false.
*
*
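* For example, here is a minimal sketch of typical use (the file name is illustrative):
*
* <pre>{@code
* PTBTokenizer<CoreLabel> ptbt = new PTBTokenizer<>(
*     new FileReader("sample.txt"), new CoreLabelTokenFactory(), "");
* while (ptbt.hasNext()) {
*   CoreLabel label = ptbt.next();
*   System.out.println(label.word());
* }
* }</pre>
*
* With the invertible option (which requires a CoreLabelTokenFactory), the original
* text can be reconstructed from the Before/OriginalText/After annotations, roughly:
*
* <pre>{@code
* PTBTokenizer<CoreLabel> tok = new PTBTokenizer<>(
*     new StringReader("Hello, world. (Really!)"), new CoreLabelTokenFactory(), "invertible");
* StringBuilder sb = new StringBuilder();
* CoreLabel cl = null;
* while (tok.hasNext()) {
*   cl = tok.next();
*   sb.append(cl.get(CoreAnnotations.BeforeAnnotation.class));
*   sb.append(cl.get(CoreAnnotations.OriginalTextAnnotation.class));
* }
* if (cl != null) sb.append(cl.get(CoreAnnotations.AfterAnnotation.class));
* // sb.toString() should now equal the original input
* }</pre>
*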
* A single instance of a PTBTokenizer is not thread safe, as it uses
* a non-threadsafe JFlex object to do the processing. Multiple
* instances can be created safely, though. A single instance of a
* PTBTokenizerFactory is also not thread safe, as it keeps its
* options in a local variable.
*
*
* @author Tim Grow (his tokenizer is a Java implementation of Professor
* Chris Manning's Flex tokenizer, pgtt-treebank.l)
* @author Teg Grenager ([email protected])
* @author Jenny Finkel (integrating in invertible PTB tokenizer)
* @author Christopher Manning (redid API, added many options, maintenance)
*/
public class PTBTokenizer<T extends HasWord> extends AbstractTokenizer<T> {
// the underlying lexer
private final PTBLexer lexer;
/**
* Constructs a new PTBTokenizer that returns Word tokens and which treats
* carriage returns as normal whitespace.
*
* @param r The Reader whose contents will be tokenized
* @return A PTBTokenizer that tokenizes a stream to objects of type
* {@link Word}
*/
public static PTBTokenizer<Word> newPTBTokenizer(Reader r) {
return new PTBTokenizer<>(r, new WordTokenFactory(), "");
}
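// Illustrative use of the factory method above; tokenize() (from the Tokenizer
// interface) collects the remaining tokens into a List:
//   List<Word> words = PTBTokenizer.newPTBTokenizer(new StringReader("Hello world.")).tokenize();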
/**
* Constructs a new PTBTokenizer that makes CoreLabel tokens.
* It optionally returns carriage returns
* as their own token. CRs come back as tokens whose text is
* the value of {@code PTBLexer.NEWLINE_TOKEN}.
*
* @param r The Reader to read tokens from
* @param tokenizeNLs Whether to return newlines as separate tokens
* (otherwise they normally disappear as whitespace)
* @param invertible if set to true, then will produce CoreLabels which
* will have fields for the string before and after, and the
* character offsets
* @return A PTBTokenizer which returns CoreLabel objects
*/
public static PTBTokenizer<CoreLabel> newPTBTokenizer(Reader r, boolean tokenizeNLs, boolean invertible) {
return new PTBTokenizer<>(r, tokenizeNLs, invertible, false, new CoreLabelTokenFactory());
}
/**
* Constructs a new PTBTokenizer that optionally returns carriage returns
* as their own token, and has a custom LexedTokenFactory.
* If asked for, CRs come back as tokens whose text is
* the value of {@code PTBLexer.NEWLINE_TOKEN}. This constructor translates
* between the traditional boolean options of PTBTokenizer and the new
* options String.
*
* @param r The Reader to read tokens from
* @param tokenizeNLs Whether to return newlines as separate tokens
* (otherwise they normally disappear as whitespace)
* @param invertible if set to true, then will produce CoreLabels which
* will have fields for the string before and after, and the
* character offsets
* @param suppressEscaping If true, all the traditional Penn Treebank
* normalizations are turned off. Otherwise, they all happen.
* @param tokenFactory The LexedTokenFactory to use to create
* tokens from the text.
*/
private PTBTokenizer(final Reader r,
final boolean tokenizeNLs,
final boolean invertible,
final boolean suppressEscaping,
final LexedTokenFactory<T> tokenFactory) {
StringBuilder options = new StringBuilder();
if (suppressEscaping) {
options.append("ptb3Escaping=false");
} else {
options.append("ptb3Escaping=true"); // i.e., turn on all the historical PTB normalizations
}
if (tokenizeNLs) {
options.append(",tokenizeNLs");
}
if (invertible) {
options.append(",invertible");
}
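// e.g., tokenizeNLs=true and invertible=true (with suppressEscaping=false)
// produce the options String "ptb3Escaping=true,tokenizeNLs,invertible"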
lexer = new PTBLexer(r, tokenFactory, options.toString());
}
/**
* Constructs a new PTBTokenizer with a custom LexedTokenFactory.
* Many options for tokenization and what is returned can be set via
* the options String. See the class documentation for details on
* the options String. This is the new recommended constructor!
*
* @param r The Reader to read tokens from.
* @param tokenFactory The LexedTokenFactory to use to create
* tokens from the text.
* @param options Options to the lexer. See the extensive documentation
* in the class javadoc. The String may be null or empty,
* which means that all traditional PTB normalizations are
* done. You can pass in "ptb3Escaping=false" and have no
* normalizations done (that is, the behavior of the old
* suppressEscaping=true option).
*/
public PTBTokenizer(final Reader r,
final LexedTokenFactory<T> tokenFactory,
final String options) {
lexer = new PTBLexer(r, tokenFactory, options);
}
/**
* Internally fetches the next token.
*
* @return the next token in the token stream, or null if none exists.
*/
@Override
@SuppressWarnings("unchecked")
protected T getNext() {
// if (lexer == null) {
// return null;
// }
try {
return (T) lexer.next();
} catch (IOException e) {
throw new RuntimeIOException(e);
}
// cdm 2007: this shouldn't be necessary: PTBLexer decides for itself whether to return CRs based on the same flag!
// get rid of CRs if necessary
// while (!tokenizeNLs && PTBLexer.cr.equals(((HasWord) token).word())) {
// token = (T)lexer.next();
// }
// horatio: we used to catch exceptions here, which led to broken
// behavior and made it very difficult to debug whatever the
// problem was.
}
/**
* Returns the string literal inserted for newlines when the -tokenizeNLs
* option is set.
*
* @return string literal inserted for "\n".
*/
public static String getNewlineToken() { return PTBLexer.NEWLINE_TOKEN; }
/**
* Returns a presentable version of the given PTB-tokenized text.
* PTB tokenization splits up punctuation and does various other things
* that make simply joining the tokens with spaces look bad. So join
* the tokens with space and run it through this method to produce nice
* looking text. It's not perfect, but it works pretty well.
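*
* For example (illustrative; the exact output depends on the untokenizing rules):
* <pre>{@code
* PTBTokenizer.ptb2Text("They saw -LRB- and heard -RRB- it .");
* // returns roughly: "They saw (and heard) it."
* }</pre>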
*
* @param ptbText A String in PTB3-escaped form
* @return An approximation to the original String
*/
public static String ptb2Text(String ptbText) {
StringBuilder sb = new StringBuilder(ptbText.length()); // probably an overestimate
PTB2TextLexer lexer = new PTB2TextLexer(new StringReader(ptbText));
try {
for (String token; (token = lexer.next()) != null; ) {
sb.append(token);
}
} catch (IOException e) {
e.printStackTrace();
}
return sb.toString();
}
/**
* Returns a presentable version of a given PTB token. For instance,
* it transforms -LRB- into (.
*/
public static String ptbToken2Text(String ptbText) {
return ptb2Text(' ' + ptbText + ' ').trim();
}
/**
* Writes a presentable version of the given PTB-tokenized text.
* PTB tokenization splits up punctuation and does various other things
* that make simply joining the tokens with spaces look bad. So join
* the tokens with space and run it through this method to produce nice
* looking text. It's not perfect, but it works pretty well.
*/
public static int ptb2Text(Reader ptbText, Writer w) throws IOException {
int numTokens = 0;
PTB2TextLexer lexer = new PTB2TextLexer(ptbText);
for (String token; (token = lexer.next()) != null; ) {
numTokens++;
w.write(token);
}
return numTokens;
}
private static void untok(List<String> inputFileList, List<String> outputFileList, String charset) throws IOException {
final long start = System.nanoTime();
int numTokens = 0;
int sz = inputFileList.size();
if (sz == 0) {
Reader r = new InputStreamReader(System.in, charset);
BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(System.out, charset));
numTokens = ptb2Text(r, writer);
writer.close();
} else {
for (int j = 0; j < sz; j++) {
Reader r = IOUtils.readerFromString(inputFileList.get(j), charset);
BufferedWriter writer;
if (outputFileList == null) {
writer = new BufferedWriter(new OutputStreamWriter(System.out, charset));
} else {
writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputFileList.get(j)), charset));
}
numTokens += ptb2Text(r, writer);
writer.close();
r.close();
}
}
final long duration = System.nanoTime() - start;
final double wordsPerSec = (double) numTokens / ((double) duration / 1000000000.0);
System.err.printf("PTBTokenizer untokenized %d tokens at %.2f tokens per second.%n", numTokens, wordsPerSec);
}
/**
* Returns a presentable version of the given PTB-tokenized words.
* Pass in a List of Strings and this method will
* join the words with spaces and call {@link #ptb2Text(String)} on the
* output.
*
* @param ptbWords A list of String
* @return A presentable version of the given PTB-tokenized words
*/
public static String ptb2Text(List<String> ptbWords) {
return ptb2Text(StringUtils.join(ptbWords));
}
/**
* Returns a presentable version of the given PTB-tokenized words.
* Pass in a List of Words or a Document and this method will
* join the words with spaces and call {@link #ptb2Text(String)} on the
* output. This method will take the word() values to prevent additional
* text from creeping in (e.g., POS tags).
*
* @param ptbWords A list of HasWord objects
* @return A presentable version of the given PTB-tokenized words
*/
public static String labelList2Text(List<? extends HasWord> ptbWords) {
List<String> words = new ArrayList<>();
for (HasWord hw : ptbWords) {
words.add(hw.word());
}
return ptb2Text(words);
}
private static void tok(List<String> inputFileList, List<String> outputFileList, String charset, Pattern parseInsidePattern, String options, boolean preserveLines, boolean dump, boolean lowerCase) throws IOException {
final long start = System.nanoTime();
long numTokens = 0;
int numFiles = inputFileList.size();
if (numFiles == 0) {
Reader stdin = IOUtils.readerFromStdin(charset);
BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(System.out, charset));
numTokens += tokReader(stdin, writer, parseInsidePattern, options, preserveLines, dump, lowerCase);
IOUtils.closeIgnoringExceptions(writer);
} else {
for (int j = 0; j < numFiles; j++) {
Reader r = IOUtils.readerFromString(inputFileList.get(j), charset);
BufferedWriter out = (outputFileList == null) ?
new BufferedWriter(new OutputStreamWriter(System.out, charset)) :
new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputFileList.get(j)), charset));
numTokens += tokReader(r, out, parseInsidePattern, options, preserveLines, dump, lowerCase);
r.close();
IOUtils.closeIgnoringExceptions(out);
} // end for j going through inputFileList
}
final long duration = System.nanoTime() - start;
final double wordsPerSec = (double) numTokens / ((double) duration / 1000000000.0);
System.err.printf("PTBTokenizer tokenized %d tokens at %.2f tokens per second.%n", numTokens, wordsPerSec);
}
private static int tokReader(Reader r, BufferedWriter writer, Pattern parseInsidePattern, String options, boolean preserveLines, boolean dump, boolean lowerCase) throws IOException {
int numTokens = 0;
boolean beginLine = true;
boolean printing = (parseInsidePattern == null); // start off printing, unless you're looking for a start entity
Matcher m = null;
if (parseInsidePattern != null) {
m = parseInsidePattern.matcher(""); // create once as performance hack
// System.err.printf("parseInsidePattern is: |%s|%n", parseInsidePattern);
}
for (PTBTokenizer<CoreLabel> tokenizer = new PTBTokenizer<>(r, new CoreLabelTokenFactory(), options); tokenizer.hasNext(); ) {
CoreLabel obj = tokenizer.next();
// String origStr = obj.get(CoreAnnotations.TextAnnotation.class).replaceFirst("\n+$", ""); // DanC added this to fix a lexer bug, hopefully now corrected
String origStr = obj.get(CoreAnnotations.TextAnnotation.class);
String str;
if (lowerCase) {
str = origStr.toLowerCase(Locale.ENGLISH);
obj.set(CoreAnnotations.TextAnnotation.class, str);
} else {
str = origStr;
}
if (m != null && m.reset(origStr).matches()) {
printing = m.group(1).isEmpty(); // turn on printing if no end element slash, turn it off if there is
// System.err.printf("parseInsidePattern matched against: |%s|, printing is %b.%n", origStr, printing);
} else if (printing) {
if (dump) {
// after having checked for tags, change str to be exhaustive
str = obj.toString();
}
if (preserveLines) {
if (PTBLexer.NEWLINE_TOKEN.equals(origStr)) {
beginLine = true;
writer.newLine();
} else {
if ( ! beginLine) {
writer.write(' ');
} else {
beginLine = false;
}
// writer.write(str.replace("\n", ""));
writer.write(str);
}
} else {
writer.write(str);
writer.newLine();
}
}
numTokens++;
}
return numTokens;
}
/** @return A PTBTokenizerFactory that vends Word tokens. */
public static TokenizerFactory<Word> factory() {
return PTBTokenizerFactory.newTokenizerFactory();
}
/** @return A PTBTokenizerFactory that vends CoreLabel tokens. */
public static TokenizerFactory<CoreLabel> factory(boolean tokenizeNLs, boolean invertible) {
return PTBTokenizerFactory.newPTBTokenizerFactory(tokenizeNLs, invertible);
}
/** @return A PTBTokenizerFactory that vends CoreLabel tokens with default tokenization. */
public static TokenizerFactory<CoreLabel> coreLabelFactory() {
return PTBTokenizerFactory.newPTBTokenizerFactory(new CoreLabelTokenFactory(), "");
}
/** Get a TokenizerFactory that does Penn Treebank tokenization.
* This is now the recommended factory method to use.
*
* @param factory A TokenFactory that determines what form of token is returned by the Tokenizer
* @param options A String specifying options (see the class javadoc for details)
* @param <T> The type of the tokens built by the LexedTokenFactory
* @return A TokenizerFactory that does Penn Treebank tokenization
*/
public static <T extends HasWord> TokenizerFactory<T> factory(LexedTokenFactory<T> factory, String options) {
return new PTBTokenizerFactory<>(factory, options);
}
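// A minimal sketch of using such a factory (build it once, reuse across readers):
//   TokenizerFactory<CoreLabel> tf = PTBTokenizer.factory(new CoreLabelTokenFactory(), "invertible");
//   List<CoreLabel> tokens = tf.getTokenizer(new StringReader("It works.")).tokenize();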
/** This class provides a factory which will vend instances of PTBTokenizer
* which wrap a provided Reader. See the documentation for
* {@link PTBTokenizer} for details of the parameters and options.
*
* @see PTBTokenizer
* @param <T> The class of the returned tokens
*/
public static class PTBTokenizerFactory<T extends HasWord> implements TokenizerFactory<T> {
protected final LexedTokenFactory<T> factory;
protected String options;
/**
* Constructs a new TokenizerFactory that returns Word objects and
* treats carriage returns as normal whitespace.
* THIS METHOD IS INVOKED BY REFLECTION BY SOME OF THE JAVANLP
* CODE TO LOAD A TOKENIZER FACTORY. IT SHOULD BE PRESENT IN A
* TokenizerFactory.
*
* @return A TokenizerFactory that returns Word objects
*/
public static TokenizerFactory<Word> newTokenizerFactory() {
return newPTBTokenizerFactory(new WordTokenFactory(), "");
}
/**
* Constructs a new PTBTokenizerFactory that returns Word objects and
* uses the options passed in.
* THIS METHOD IS INVOKED BY REFLECTION BY SOME OF THE JAVANLP
* CODE TO LOAD A TOKENIZER FACTORY. IT SHOULD BE PRESENT IN A
* TokenizerFactory.
*
* @param options A String of options
* @return A TokenizerFactory that returns Word objects
*/
public static PTBTokenizerFactory<Word> newWordTokenizerFactory(String options) {
return new PTBTokenizerFactory<>(new WordTokenFactory(), options);
}
/**
* Constructs a new PTBTokenizerFactory that returns CoreLabel objects and
* uses the options passed in.
*
* @param options A String of options. For the default, recommended
* options for PTB-style tokenization compatibility, pass
* in an empty String.
* @return A TokenizerFactory that returns CoreLabel objects
*/
public static PTBTokenizerFactory<CoreLabel> newCoreLabelTokenizerFactory(String options) {
return new PTBTokenizerFactory<>(new CoreLabelTokenFactory(), options);
}
/**
* Constructs a new PTBTokenizerFactory that uses the LexedTokenFactory and
* options passed in.
*
* @param tokenFactory The LexedTokenFactory
* @param options A String of options
* @return A TokenizerFactory that returns objects of the type of the
* LexedTokenFactory
*/
public static <T extends HasWord> PTBTokenizerFactory<T> newPTBTokenizerFactory(LexedTokenFactory<T> tokenFactory, String options) {
return new PTBTokenizerFactory<>(tokenFactory, options);
}
public static PTBTokenizerFactory<CoreLabel> newPTBTokenizerFactory(boolean tokenizeNLs, boolean invertible) {
return new PTBTokenizerFactory<>(tokenizeNLs, invertible, false, new CoreLabelTokenFactory());
}
// Constructors
// This one is historical
private PTBTokenizerFactory(boolean tokenizeNLs, boolean invertible, boolean suppressEscaping, LexedTokenFactory<T> factory) {
this.factory = factory;
StringBuilder optionsSB = new StringBuilder();
if (suppressEscaping) {
optionsSB.append("ptb3Escaping=false");
} else {
optionsSB.append("ptb3Escaping=true"); // i.e., turn on all the historical PTB normalizations
}
if (tokenizeNLs) {
optionsSB.append(",tokenizeNLs");
}
if (invertible) {
optionsSB.append(",invertible");
}
this.options = optionsSB.toString();
}
/** Make a factory for PTBTokenizers.
*
* @param tokenFactory A factory for the token type that the tokenizer will return
* @param options Options to the tokenizer (see the class documentation for details)
*/
private PTBTokenizerFactory(LexedTokenFactory<T> tokenFactory, String options) {
this.factory = tokenFactory;
this.options = options;
}
/** Returns a tokenizer wrapping the given Reader. */
@Override
public Iterator<T> getIterator(Reader r) {
return getTokenizer(r);
}
/** Returns a tokenizer wrapping the given Reader. */
@Override
public Tokenizer<T> getTokenizer(Reader r) {
return new PTBTokenizer<>(r, factory, options);
}
@Override
public Tokenizer<T> getTokenizer(Reader r, String extraOptions) {
if (options == null || options.isEmpty()) {
return new PTBTokenizer<>(r, factory, extraOptions);
} else {
return new PTBTokenizer<>(r, factory, options + ',' + extraOptions);
}
}
@Override
public void setOptions(String options) {
this.options = options;
}
} // end static class PTBTokenizerFactory
/**
* Command-line option specification.
*/
private static Map<String,Integer> optionArgDefs() {
Map<String,Integer> optionArgDefs = Generics.newHashMap();
optionArgDefs.put("options", 1);
optionArgDefs.put("ioFileList", 0);
optionArgDefs.put("lowerCase", 0);
optionArgDefs.put("dump", 0);
optionArgDefs.put("untok", 0);
optionArgDefs.put("encoding", 1);
optionArgDefs.put("parseInside", 1);
optionArgDefs.put("preserveLines", 0);
return optionArgDefs;
}
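// Illustrative: given these defs, the arguments "-encoding iso-8859-1 -preserveLines in.txt"
// parse to properties { encoding=iso-8859-1, preserveLines=true, ""=in.txt }; a value of 1
// means the flag consumes one argument, 0 means it is a boolean flag.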
/**
* Reads files given as arguments and prints their tokens, by default as
* one per line. This is useful either for testing or to run
* standalone to turn a corpus into a one-token-per-line file of tokens.
* This main method assumes that the input file is in utf-8 encoding,
* unless an encoding is specified.
*
* Usage:
* java edu.stanford.nlp.process.PTBTokenizer [options] filename+
*
*
* Options:
*
* - -options options Set various tokenization options
* (see the documentation in the class javadoc)
*
* - -preserveLines Produce space-separated tokens, except
* when the original had a line break, not one-token-per-line
*
* - -encoding encoding Specifies a character encoding. If you do not
* specify one, the default is utf-8 (not the platform default).
*
* - -lowerCase Lowercase all tokens (on tokenization)
*
* - -parseInside regex Names an XML-style element or a regular expression
* over such elements. The tokenizer will only tokenize inside elements
* that match this regex. (This is done by regex matching, not an XML
* parser, but works well for simple XML documents, or other SGML-style
* documents, such as Linguistic Data Consortium releases, which adopt
* the convention that a line of a file is either XML markup or
* character data but never both.)
*
* - -ioFileList file* The remaining command-line arguments are treated as
* filenames that themselves contain lists of pairs of input-output
* filenames (2 column, whitespace separated).
*
* - -dump Print the whole of each CoreLabel, not just the value (word)
*
* - -untok Heuristically untokenize tokenized text
*
* - -h, -help Print usage info
*
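* For example (an illustrative invocation):
* <pre>{@code
* java edu.stanford.nlp.process.PTBTokenizer -preserveLines -encoding utf-8 input.txt > input.tok
* }</pre>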
*
* @param args Command line arguments
* @throws IOException If any file I/O problem
*/
public static void main(String[] args) throws IOException {
Properties options = StringUtils.argsToProperties(args, optionArgDefs());
boolean showHelp = PropertiesUtils.getBool(options, "help", false);
showHelp = PropertiesUtils.getBool(options, "h", showHelp);
if (showHelp) {
System.err.println("Usage: java edu.stanford.nlp.process.PTBTokenizer [options]* filename*");
System.err.println(" options: -h|-preserveLines|-lowerCase|-dump|-ioFileList|-encoding|-parseInside|-options");
System.exit(0);
}
StringBuilder optionsSB = new StringBuilder();
String tokenizerOptions = options.getProperty("options", null);
if (tokenizerOptions != null) {
optionsSB.append(tokenizerOptions);
}
boolean preserveLines = PropertiesUtils.getBool(options, "preserveLines", false);
if (preserveLines) {
optionsSB.append(",tokenizeNLs");
}
boolean inputOutputFileList = PropertiesUtils.getBool(options, "ioFileList", false);
boolean lowerCase = PropertiesUtils.getBool(options, "lowerCase", false);
boolean dump = PropertiesUtils.getBool(options, "dump", false);
boolean untok = PropertiesUtils.getBool(options, "untok", false);
String charset = options.getProperty("encoding", "utf-8");
String parseInsideKey = options.getProperty("parseInside", null);
Pattern parseInsidePattern = null;
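// Illustrative: -parseInside TEXT compiles a pattern matching "<TEXT>", "<TEXT attr=...>",
// and "</TEXT>"; capture group 1 holds the optional "/", which tokReader uses to turn
// printing on (empty group) or off ("/").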
if (parseInsideKey != null) {
try {
// We still allow space, but PTBTokenizer will change space to U+00A0, so we need to match that too
parseInsidePattern = Pattern.compile("<(/?)(?:" + parseInsideKey + ")(?:(?:\\s|\u00A0)[^>]*?)?>");
} catch (PatternSyntaxException e) {
// just go with null parseInsidePattern
}
}
// Other arguments are filenames
String parsedArgStr = options.getProperty("",null);
String[] parsedArgs = (parsedArgStr == null) ? null : parsedArgStr.split("\\s+");
ArrayList<String> inputFileList = new ArrayList<>();
ArrayList<String> outputFileList = null;
if (inputOutputFileList && parsedArgs != null) {
outputFileList = new ArrayList<>();
for (String fileName : parsedArgs) {
BufferedReader r = IOUtils.readerFromString(fileName, charset);
for (String inLine; (inLine = r.readLine()) != null; ) {
String[] fields = inLine.split("\\s+");
inputFileList.add(fields[0]);
if (fields.length > 1) {
outputFileList.add(fields[1]);
} else {
outputFileList.add(fields[0] + ".tok");
}
}
r.close();
}
} else if (parsedArgs != null) {
// Concatenate input files into a single output file
inputFileList.addAll(Arrays.asList(parsedArgs));
}
if (untok) {
untok(inputFileList, outputFileList, charset);
} else {
tok(inputFileList, outputFileList, charset, parseInsidePattern, optionsSB.toString(), preserveLines, dump, lowerCase);
}
} // end main
} // end PTBTokenizer