edu.berkeley.nlp.tokenizer.PTB2TextLexer Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of berkeleyparser Show documentation
The Berkeley parser analyzes the grammatical structure of natural language using probabilistic context-free grammars (PCFGs).
The newest version!
/**
 * 
 */
package edu.berkeley.nlp.tokenizer;
/* The following code was generated by JFlex 1.3.5 on 12/2/02 9:02 PM */


import java.util.*;
import java.io.*;

/**
 * doesn't handle \"can not\" to \"cannot\".
 */


/**
 * This class is a scanner generated by 
 * JFlex 1.3.5
 * on 12/2/02 9:02 PM from the specification file
 * file:/nlp/u1/jsmarr/cs276a/src/googleling/PTB2TextLexer.flex
 */
class PTB2TextLexer {

  /** This character denotes the end of file */
  final public static int YYEOF = -1;

  /** initial size of the lookahead buffer */
  final private static int YY_BUFFERSIZE = 16384;

  /** lexical states */
  final public static int YYINITIAL = 0;

  /** 
   * Translates characters to character classes
   */
  final private static String yycmap_packed =
    "\12\0\1\6\25\0\1\1\1\5\2\0\1\25\1\24\1\0\1\2"+
    "\1\17\1\20\2\0\1\5\1\13\1\4\13\0\1\5\1\5\3\0"+
    "\1\5\2\0\1\16\1\21\10\0\1\14\1\0\1\11\3\0\1\15"+
    "\1\0\1\12\6\0\1\22\1\0\1\23\2\0\1\3\15\0\1\7"+
    "\5\0\1\10\uff8b\0";

  /** 
   * Translates characters to character classes
   */
  final private static char [] yycmap = yy_unpack_cmap(yycmap_packed);

  /** 
   * Translates a state to a row index in the transition table
   */
  final private static int yy_rowMap [] = { 
        0,    22,    44,    66,    88,   110,   132,   154,   176,   198, 
      220,    88,   242,   264,   286,    88,    88,    88,    88,   308, 
      330,    88,    88,    88,    88,   352,   374,   396,   418,   440, 
      462,    88,    88,   484,   506,   528,   550,   572,   594
  };

  /** 
   * The packed transition table of the DFA (part 0)
   */
  final private static String yy_packed0 =
    "\1\2\1\3\1\2\1\4\2\2\1\5\4\2\1\6"+
    "\3\2\1\7\2\2\1\10\2\2\1\11\1\2\1\0"+
    "\4\2\1\0\17\2\2\0\1\12\1\0\1\13\1\14"+
    "\1\0\1\15\1\0\1\16\1\0\1\17\4\0\1\20"+
    "\2\0\1\21\1\22\1\0\1\2\1\23\1\2\1\24"+
    "\2\2\1\0\17\2\26\0\1\2\1\0\4\2\1\0"+
    "\5\2\1\25\12\2\1\26\4\2\1\0\20\2\1\27"+
    "\4\2\1\0\20\2\1\30\4\2\1\0\17\2\1\14"+
    "\1\0\1\31\3\14\1\0\17\14\4\0\1\32\23\0"+
    "\1\33\25\0\1\34\40\0\1\35\10\0\1\2\1\31"+
    "\4\2\1\0\20\2\1\0\4\2\1\0\6\2\1\36"+
    "\3\2\1\37\4\2\4\0\1\14\31\0\1\40\27\0"+
    "\1\41\30\0\1\42\3\0\1\43\4\0\1\2\1\0"+
    "\4\2\1\0\7\2\1\44\10\2\1\0\4\2\1\0"+
    "\7\2\1\45\7\2\16\0\1\46\25\0\1\47\7\0"+
    "\1\2\1\0\4\2\1\0\4\2\1\7\13\2\1\0"+
    "\4\2\1\0\4\2\1\10\12\2\13\0\1\20\25\0"+
    "\1\21\12\0";

  /** 
   * The transition table of the DFA
   */
  final private static int yytrans [] = yy_unpack();


  /* error codes */
  final private static int YY_UNKNOWN_ERROR = 0;
  final private static int YY_ILLEGAL_STATE = 1;
  final private static int YY_NO_MATCH = 2;
  final private static int YY_PUSHBACK_2BIG = 3;

  /* error messages for the codes above */
  final private static String YY_ERROR_MSG[] = {
    "Unkown internal scanner error",
    "Internal error: unknown state",
    "Error: could not match input",
    "Error: pushback value was too large"
  };

  /**
   * YY_ATTRIBUTE[aState] contains the attributes of state aState
   */
  private final static byte YY_ATTRIBUTE[] = {
     0,  1,  1,  1,  9,  1,  1,  1,  1,  0,  1,  9,  0,  0,  0,  9, 
     9,  9,  9,  1,  1,  9,  9,  9,  9,  0,  0,  0,  0,  1,  1,  9, 
     9,  0,  0,  1,  1,  0,  0
  };

  /** the input device */
  private java.io.Reader yy_reader;

  /** the current state of the DFA */
  private int yy_state;

  /** the current lexical state */
  private int yy_lexical_state = YYINITIAL;

  /** this buffer contains the current text to be matched and is
      the source of the yytext() string */
  private char yy_buffer[] = new char[YY_BUFFERSIZE];

  /** the textposition at the last accepting state */
  private int yy_markedPos;

  /** the textposition at the last state to be included in yytext */
  private int yy_pushbackPos;

  /** the current text position in the buffer */
  private int yy_currentPos;

  /** startRead marks the beginning of the yytext() string in the buffer */
  private int yy_startRead;

  /** endRead marks the last character in the buffer, that has been read
      from input */
  private int yy_endRead;

  /** number of newlines encountered up to the start of the matched text */
  private int yyline;

  /** the number of characters up to the start of the matched text */
  private int yychar;

  /**
   * the number of characters from the last newline up to the start of the 
   * matched text
   */
  private int yycolumn; 

  /** 
   * yy_atBOL == true <=> the scanner is currently at the beginning of a line
   */
  private boolean yy_atBOL = true;

  /** yy_atEOF == true <=> the scanner is at the EOF */
  private boolean yy_atEOF;

  /* user code: */

/*
"'T WAS"
{ return("'TWAS"); }
"'T was"
{ return("'Twas"); }
"'t was"
{ return("'twas"); }
"'T IS"
{ return("'TIS"); }
"'T is"
{ return("'Tis"); }
"'t is"
{ return("'tis"); }
*/



  /**
   * Creates a new scanner
   * There is also a java.io.InputStream version of this constructor.
   *
   * @param   in  the java.io.Reader to read input from.
   */
  PTB2TextLexer(java.io.Reader in) {
    this.yy_reader = in;
  }

  /**
   * Creates a new scanner.
   * There is also java.io.Reader version of this constructor.
   *
   * @param   in  the java.io.Inputstream to read input from.
   */
  PTB2TextLexer(java.io.InputStream in) {
    this(new java.io.InputStreamReader(in));
  }

  /** 
   * Unpacks the split, compressed DFA transition table.
   *
   * @return the unpacked transition table
   */
  private static int [] yy_unpack() {
    int [] trans = new int[616];
    int offset = 0;
    offset = yy_unpack(yy_packed0, offset, trans);
    return trans;
  }

  /** 
   * Unpacks the compressed DFA transition table.
   *
   * @param packed   the packed transition table
   * @return         the index of the last entry
   */
  private static int yy_unpack(String packed, int offset, int [] trans) {
    int i = 0;       /* index in packed string  */
    int j = offset;  /* index in unpacked array */
    int l = packed.length();
    while (i < l) {
      int count = packed.charAt(i++);
      int value = packed.charAt(i++);
      value--;
      do trans[j++] = value; while (--count > 0);
    }
    return j;
  }

  /** 
   * Unpacks the compressed character translation table.
   *
   * @param packed   the packed character translation table
   * @return         the unpacked character translation table
   */
  private static char [] yy_unpack_cmap(String packed) {
    char [] map = new char[0x10000];
    int i = 0;  /* index in packed string  */
    int j = 0;  /* index in unpacked array */
    while (i < 86) {
      int  count = packed.charAt(i++);
      char value = packed.charAt(i++);
      do map[j++] = value; while (--count > 0);
    }
    return map;
  }


  /**
   * Refills the input buffer.
   *
   * @return      false, iff there was new input.
   * 
   * @exception   java.io.IOException  if any I/O-Error occurs
   */
  private boolean yy_refill() throws java.io.IOException {

    /* first: make room (if you can) */
    if (yy_startRead > 0) {
      System.arraycopy(yy_buffer, yy_startRead,
                       yy_buffer, 0, 
                       yy_endRead-yy_startRead);

      /* translate stored positions */
      yy_endRead-= yy_startRead;
      yy_currentPos-= yy_startRead;
      yy_markedPos-= yy_startRead;
      yy_pushbackPos-= yy_startRead;
      yy_startRead = 0;
    }

    /* is the buffer big enough? */
    if (yy_currentPos >= yy_buffer.length) {
      /* if not: blow it up */
      char newBuffer[] = new char[yy_currentPos*2];
      System.arraycopy(yy_buffer, 0, newBuffer, 0, yy_buffer.length);
      yy_buffer = newBuffer;
    }

    /* finally: fill the buffer with new input */
    int numRead = yy_reader.read(yy_buffer, yy_endRead, 
                                            yy_buffer.length-yy_endRead);

    if (numRead < 0) {
      return true;
    }
    else {
      yy_endRead+= numRead;  
      return false;
    }
  }


  /**
   * Closes the input stream.
   */
  final public void yyclose() throws java.io.IOException {
    yy_atEOF = true;            /* indicate end of file */
    yy_endRead = yy_startRead;  /* invalidate buffer    */

    if (yy_reader != null)
      yy_reader.close();
  }


  /**
   * Closes the current stream, and resets the
   * scanner to read from a new input stream.
   *
   * All internal variables are reset, the old input stream 
   * cannot be reused (internal buffer is discarded and lost).
   * Lexical state is set to YY_INITIAL.
   *
   * @param reader   the new input stream 
   */
  final public void yyreset(java.io.Reader reader) throws java.io.IOException {
    yyclose();
    yy_reader = reader;
    yy_atBOL  = true;
    yy_atEOF  = false;
    yy_endRead = yy_startRead = 0;
    yy_currentPos = yy_markedPos = yy_pushbackPos = 0;
    yyline = yychar = yycolumn = 0;
    yy_lexical_state = YYINITIAL;
  }


  /**
   * Returns the current lexical state.
   */
  final public int yystate() {
    return yy_lexical_state;
  }


  /**
   * Enters a new lexical state
   *
   * @param newState the new lexical state
   */
  final public void yybegin(int newState) {
    yy_lexical_state = newState;
  }


  /**
   * Returns the text matched by the current regular expression.
   */
  final public String yytext() {
    return new String( yy_buffer, yy_startRead, yy_markedPos-yy_startRead );
  }


  /**
   * Returns the character at position pos from the 
   * matched text. 
   * 
   * It is equivalent to yytext().charAt(pos), but faster
   *
   * @param pos the position of the character to fetch. 
   *            A value from 0 to yylength()-1.
   *
   * @return the character at position pos
   */
  final public char yycharat(int pos) {
    return yy_buffer[yy_startRead+pos];
  }


  /**
   * Returns the length of the matched text region.
   */
  final public int yylength() {
    return yy_markedPos-yy_startRead;
  }


  /**
   * Reports an error that occured while scanning.
   *
   * In a wellformed scanner (no or only correct usage of 
   * yypushback(int) and a match-all fallback rule) this method 
   * will only be called with things that "Can't Possibly Happen".
   * If this method is called, something is seriously wrong
   * (e.g. a JFlex bug producing a faulty scanner etc.).
   *
   * Usual syntax/scanner level error handling should be done
   * in error fallback rules.
   *
   * @param   errorCode  the code of the errormessage to display
   */
  private void yy_ScanError(int errorCode) {
    String message;
    try {
      message = YY_ERROR_MSG[errorCode];
    }
    catch (ArrayIndexOutOfBoundsException e) {
      message = YY_ERROR_MSG[YY_UNKNOWN_ERROR];
    }

    throw new Error(message);
  } 


  /**
   * Pushes the specified amount of characters back into the input stream.
   *
   * They will be read again by then next call of the scanning method
   *
   * @param number  the number of characters to be read again.
   *                This number must not be greater than yylength()!
   */
  private void yypushback(int number)  {
    if ( number > yylength() )
      yy_ScanError(YY_PUSHBACK_2BIG);

    yy_markedPos -= number;
  }


  /**
   * Resumes scanning until the next regular expression is matched,
   * the end of input is encountered or an I/O-Error occurs.
   *
   * @return      the next token
   * @exception   java.io.IOException  if any I/O-Error occurs
   */
  public String next() throws java.io.IOException {
    int yy_input;
    int yy_action;

    // cached fields:
    int yy_currentPos_l;
    int yy_startRead_l;
    int yy_markedPos_l;
    int yy_endRead_l = yy_endRead;
    char [] yy_buffer_l = yy_buffer;
    char [] yycmap_l = yycmap;

    int [] yytrans_l = yytrans;
    int [] yy_rowMap_l = yy_rowMap;
    byte [] yy_attr_l = YY_ATTRIBUTE;

    while (true) {
      yy_markedPos_l = yy_markedPos;

      yy_action = -1;

      yy_startRead_l = yy_currentPos_l = yy_currentPos = 
                       yy_startRead = yy_markedPos_l;

      yy_state = yy_lexical_state;


      yy_forAction: {
        while (true) {

          if (yy_currentPos_l < yy_endRead_l)
            yy_input = yy_buffer_l[yy_currentPos_l++];
          else if (yy_atEOF) {
            yy_input = YYEOF;
            break yy_forAction;
          }
          else {
            // store back cached positions
            yy_currentPos  = yy_currentPos_l;
            yy_markedPos   = yy_markedPos_l;
            boolean eof = yy_refill();
            // get translated positions and possibly new buffer
            yy_currentPos_l  = yy_currentPos;
            yy_markedPos_l   = yy_markedPos;
            yy_buffer_l      = yy_buffer;
            yy_endRead_l     = yy_endRead;
            if (eof) {
              yy_input = YYEOF;
              break yy_forAction;
            }
            else {
              yy_input = yy_buffer_l[yy_currentPos_l++];
            }
          }
          int yy_next = yytrans_l[ yy_rowMap_l[yy_state] + yycmap_l[yy_input] ];
          if (yy_next == -1) break yy_forAction;
          yy_state = yy_next;

          int yy_attributes = yy_attr_l[yy_state];
          if ( (yy_attributes & 1) == 1 ) {
            yy_action = yy_state; 
            yy_markedPos_l = yy_currentPos_l; 
            if ( (yy_attributes & 8) == 8 ) break yy_forAction;
          }

        }
      }

      // store back cached position
      yy_markedPos = yy_markedPos_l;

      switch (yy_action) {

        case 32: 
          {  return("N'T");  }
        case 40: break;
        case 31: 
          {  return("n't");  }
        case 41: break;
        case 10: 
        case 11: 
          {  return(yytext().substring(1, yytext().length()));  }
        case 42: break;
        case 1: 
        case 3: 
        case 5: 
        case 6: 
        case 7: 
        case 8: 
        case 19: 
        case 20: 
        case 29: 
        case 30: 
        case 35: 
        case 36: 
          {  return(yytext());  }
        case 43: break;
        case 2: 
          {  return(yytext());  }
        case 44: break;
        case 4: 
          {  return(yytext());  }
        case 45: break;
        case 24: 
          {  return("\"");  }
        case 46: break;
        case 23: 
          {  return("$");  }
        case 47: break;
        case 22: 
          {  return("[");  }
        case 48: break;
        case 21: 
          {  return("(");  }
        case 49: break;
        case 15: 
          {  return(")");  }
        case 50: break;
        case 16: 
          {  return("]");  }
        case 51: break;
        case 17: 
          {  return("%");  }
        case 52: break;
        case 18: 
          {  return("`");  }
        case 53: break;
        default: 
          if (yy_input == YYEOF && yy_startRead == yy_currentPos) {
            yy_atEOF = true;
              {  return(null);  }
          } 
          else {
            yy_ScanError(YY_NO_MATCH);
          }
      }
    }
  }

  /**
   * Runs the scanner on input files.
   *
   * This is a standalone scanner, i.e. it will print any unmatched
   * text to System.out unchanged.
   *
   * @param argv   the command line, contains the filenames to run
   *               the scanner on.
   */
  public static void main(String argv[]) {
    if (argv.length == 0) {
      System.out.println("Usage : java PTB2TextLexer ");
    }
    else {
      for (int i = 0; i < argv.length; i++) {
        PTB2TextLexer scanner = null;
        try {
          scanner = new PTB2TextLexer( new java.io.FileReader(argv[i]) );
          while ( !scanner.yy_atEOF ) scanner.next();
        }
        catch (java.io.FileNotFoundException e) {
          System.out.println("File not found : \""+argv[i]+"\"");
        }
        catch (java.io.IOException e) {
          System.out.println("IO error scanning file \""+argv[i]+"\"");
          System.out.println(e);
        }
        catch (Exception e) {
          System.out.println("Unexpected exception:");
          e.printStackTrace();
        }
      }
    }
  }


}