All downloads are free. The search and download functionality uses the official Maven repository.

edu.berkeley.nlp.tokenizer.PTB2TextLexer Maven / Gradle / Ivy

Go to download

The Berkeley parser analyzes the grammatical structure of natural language using probabilistic context-free grammars (PCFGs).

The newest version!
/**
 * 
 */
package edu.berkeley.nlp.tokenizer;
/* The following code was generated by JFlex 1.3.5 on 12/2/02 9:02 PM */


import java.util.*;
import java.io.*;

/**
 * Note: this lexer does not rewrite "can not" as "cannot".
 */


/**
 * Scanner generated by JFlex 1.3.5 on 12/2/02 9:02 PM from the specification
 * file file:/nlp/u1/jsmarr/cs276a/src/googleling/PTB2TextLexer.flex.
 *
 * <p>Each call to {@link #next()} consumes the longest match from the input and
 * returns one piece of output text. Depending on the matched rule the action
 * either returns the matched text unchanged, returns it without its first
 * character, or returns a fixed replacement string. Examples (derived from the
 * transition tables below):
 * <ul>
 *   <li>{@code " n't"} yields {@code "n't"} and {@code " N'T"} yields {@code "N'T"}</li>
 *   <li>{@code "`` "} and {@code " ''"} yield a double quote</li>
 *   <li>{@code "-LRB- "} and {@code "( "} yield {@code "("};
 *       {@code " -RRB-"} and {@code " )"} yield {@code ")"}</li>
 *   <li>{@code " ."}, {@code " ,"}, {@code " 's"} lose their leading space</li>
 * </ul>
 *
 * <p>{@code next()} returns {@code null} once the end of input is reached.
 * Instances keep mutable scanning state and are not written for concurrent use.
 */
class PTB2TextLexer {

  /** This character denotes the end of file */
  public static final int YYEOF = -1;

  /** initial size of the lookahead buffer */
  private static final int YY_BUFFERSIZE = 16384;

  /** lexical states */
  public static final int YYINITIAL = 0;

  /**
   * Run-length encoded (count, class) pairs that translate characters to
   * character classes; unpacked by {@link #yy_unpack_cmap(String)}.
   */
  private static final String yycmap_packed =
    "\12\0\1\6\25\0\1\1\1\5\2\0\1\25\1\24\1\0\1\2"+
    "\1\17\1\20\2\0\1\5\1\13\1\4\13\0\1\5\1\5\3\0"+
    "\1\5\2\0\1\16\1\21\10\0\1\14\1\0\1\11\3\0\1\15"+
    "\1\0\1\12\6\0\1\22\1\0\1\23\2\0\1\3\15\0\1\7"+
    "\5\0\1\10\uff8b\0";

  /**
   * Unpacked character-to-class table, indexed by char value.
   */
  private static final char [] yycmap = yy_unpack_cmap(yycmap_packed);

  /**
   * Translates a state to a row index in the transition table.
   * Several states share the same row.
   */
  private static final int yy_rowMap [] = {
        0,    22,    44,    66,    88,   110,   132,   154,   176,   198,
      220,    88,   242,   264,   286,    88,    88,    88,    88,   308,
      330,    88,    88,    88,    88,   352,   374,   396,   418,   440,
      462,    88,    88,   484,   506,   528,   550,   572,   594
  };

  /**
   * The packed transition table of the DFA (part 0): run-length encoded
   * (count, nextState + 1) pairs, so a packed value of 0 means "no transition".
   */
  private static final String yy_packed0 =
    "\1\2\1\3\1\2\1\4\2\2\1\5\4\2\1\6"+
    "\3\2\1\7\2\2\1\10\2\2\1\11\1\2\1\0"+
    "\4\2\1\0\17\2\2\0\1\12\1\0\1\13\1\14"+
    "\1\0\1\15\1\0\1\16\1\0\1\17\4\0\1\20"+
    "\2\0\1\21\1\22\1\0\1\2\1\23\1\2\1\24"+
    "\2\2\1\0\17\2\26\0\1\2\1\0\4\2\1\0"+
    "\5\2\1\25\12\2\1\26\4\2\1\0\20\2\1\27"+
    "\4\2\1\0\20\2\1\30\4\2\1\0\17\2\1\14"+
    "\1\0\1\31\3\14\1\0\17\14\4\0\1\32\23\0"+
    "\1\33\25\0\1\34\40\0\1\35\10\0\1\2\1\31"+
    "\4\2\1\0\20\2\1\0\4\2\1\0\6\2\1\36"+
    "\3\2\1\37\4\2\4\0\1\14\31\0\1\40\27\0"+
    "\1\41\30\0\1\42\3\0\1\43\4\0\1\2\1\0"+
    "\4\2\1\0\7\2\1\44\10\2\1\0\4\2\1\0"+
    "\7\2\1\45\7\2\16\0\1\46\25\0\1\47\7\0"+
    "\1\2\1\0\4\2\1\0\4\2\1\7\13\2\1\0"+
    "\4\2\1\0\4\2\1\10\12\2\13\0\1\20\25\0"+
    "\1\21\12\0";

  /**
   * The transition table of the DFA; entry -1 means "no transition".
   */
  private static final int yytrans [] = yy_unpack();


  /* error codes */
  private static final int YY_UNKNOWN_ERROR = 0;
  private static final int YY_ILLEGAL_STATE = 1;
  private static final int YY_NO_MATCH = 2;
  private static final int YY_PUSHBACK_2BIG = 3;

  /* error messages for the codes above */
  private static final String YY_ERROR_MSG[] = {
    "Unkown internal scanner error",
    "Internal error: unknown state",
    "Error: could not match input",
    "Error: pushback value was too large"
  };

  /**
   * YY_ATTRIBUTE[aState] contains the attributes of state aState.
   * As used by next(): bit 1 marks an accepting state, bit 8 marks a state
   * after which scanning stops immediately.
   */
  private static final byte YY_ATTRIBUTE[] = {
     0,  1,  1,  1,  9,  1,  1,  1,  1,  0,  1,  9,  0,  0,  0,  9,
     9,  9,  9,  1,  1,  9,  9,  9,  9,  0,  0,  0,  0,  1,  1,  9,
     9,  0,  0,  1,  1,  0,  0
  };

  /** the input device */
  private java.io.Reader yy_reader;

  /** the current state of the DFA */
  private int yy_state;

  /** the current lexical state */
  private int yy_lexical_state = YYINITIAL;

  /** this buffer contains the current text to be matched and is
      the source of the yytext() string */
  private char yy_buffer[] = new char[YY_BUFFERSIZE];

  /** the textposition at the last accepting state */
  private int yy_markedPos;

  /** the textposition at the last state to be included in yytext */
  private int yy_pushbackPos;

  /** the current text position in the buffer */
  private int yy_currentPos;

  /** startRead marks the beginning of the yytext() string in the buffer */
  private int yy_startRead;

  /** endRead marks the last character in the buffer, that has been read
      from input */
  private int yy_endRead;

  /** number of newlines encountered up to the start of the matched text */
  private int yyline;

  /** the number of characters up to the start of the matched text */
  private int yychar;

  /**
   * the number of characters from the last newline up to the start of the
   * matched text
   */
  private int yycolumn;

  /**
   * yy_atBOL == true <=> the scanner is currently at the beginning of a line
   */
  private boolean yy_atBOL = true;

  /** yy_atEOF == true <=> the scanner is at the EOF */
  private boolean yy_atEOF;

  /* user code: */

/*
 * The following rules are commented out in the specification and are NOT
 * part of the generated automaton:
 *
"'T WAS"
{ return("'TWAS"); }
"'T was"
{ return("'Twas"); }
"'t was"
{ return("'twas"); }
"'T IS"
{ return("'TIS"); }
"'T is"
{ return("'Tis"); }
"'t is"
{ return("'tis"); }
*/



  /**
   * Creates a new scanner.
   * There is also a java.io.InputStream version of this constructor.
   *
   * @param   in  the java.io.Reader to read input from.
   */
  PTB2TextLexer(java.io.Reader in) {
    this.yy_reader = in;
  }

  /**
   * Creates a new scanner.
   * There is also a java.io.Reader version of this constructor.
   * The stream is decoded with the platform default charset.
   *
   * @param   in  the java.io.InputStream to read input from.
   */
  PTB2TextLexer(java.io.InputStream in) {
    this(new java.io.InputStreamReader(in));
  }

  /**
   * Unpacks the split, compressed DFA transition table.
   *
   * @return the unpacked transition table
   */
  private static int [] yy_unpack() {
    int [] trans = new int[616];
    yy_unpack(yy_packed0, 0, trans);
    return trans;
  }

  /**
   * Unpacks the compressed DFA transition table.
   *
   * @param packed   the packed transition table
   * @param offset   index in {@code trans} at which to start writing
   * @param trans    the array receiving the unpacked entries
   * @return         the index just past the last entry written
   */
  private static int yy_unpack(String packed, int offset, int [] trans) {
    int i = 0;       /* index in packed string  */
    int j = offset;  /* index in unpacked array */
    int l = packed.length();
    while (i < l) {
      int count = packed.charAt(i++);
      int value = packed.charAt(i++);
      value--;  /* stored as state + 1 so that 0 can encode -1 */
      do trans[j++] = value; while (--count > 0);
    }
    return j;
  }

  /**
   * Unpacks the compressed character translation table.
   *
   * @param packed   the packed character translation table
   * @return         the unpacked character translation table
   */
  private static char [] yy_unpack_cmap(String packed) {
    char [] map = new char[0x10000];
    int i = 0;  /* index in packed string  */
    int j = 0;  /* index in unpacked array */
    int l = packed.length();
    while (i < l) {
      int  count = packed.charAt(i++);
      char value = packed.charAt(i++);
      do map[j++] = value; while (--count > 0);
    }
    return map;
  }


  /**
   * Refills the input buffer.
   *
   * @return      <code>false</code>, iff there was new input.
   *
   * @exception   java.io.IOException  if any I/O-Error occurs
   */
  private boolean yy_refill() throws java.io.IOException {

    /* first: make room (if you can) */
    if (yy_startRead > 0) {
      System.arraycopy(yy_buffer, yy_startRead,
                       yy_buffer, 0,
                       yy_endRead-yy_startRead);

      /* translate stored positions */
      yy_endRead-= yy_startRead;
      yy_currentPos-= yy_startRead;
      yy_markedPos-= yy_startRead;
      yy_pushbackPos-= yy_startRead;
      yy_startRead = 0;
    }

    /* is the buffer big enough? */
    if (yy_currentPos >= yy_buffer.length) {
      /* if not: blow it up */
      char newBuffer[] = new char[yy_currentPos*2];
      System.arraycopy(yy_buffer, 0, newBuffer, 0, yy_buffer.length);
      yy_buffer = newBuffer;
    }

    /* finally: fill the buffer with new input */
    int numRead = yy_reader.read(yy_buffer, yy_endRead,
                                            yy_buffer.length-yy_endRead);

    if (numRead > 0) {
      yy_endRead+= numRead;
      return false;
    }

    if (numRead == 0) {
      /* A reader may deliver 0 characters without being at end of stream.
         Reporting "new input" here would make next() consume a stale buffer
         slot, so force progress with a single-character read instead. */
      int c = yy_reader.read();
      if (c == -1) {
        return true;
      }
      yy_buffer[yy_endRead++] = (char) c;
      return false;
    }

    /* numRead < 0: end of stream */
    return true;
  }


  /**
   * Closes the input stream.
   *
   * @exception   java.io.IOException  if closing the reader fails
   */
  public final void yyclose() throws java.io.IOException {
    yy_atEOF = true;            /* indicate end of file */
    yy_endRead = yy_startRead;  /* invalidate buffer    */

    if (yy_reader != null)
      yy_reader.close();
  }


  /**
   * Closes the current stream, and resets the
   * scanner to read from a new input stream.
   *
   * All internal variables are reset, the old input stream
   * cannot be reused (internal buffer is discarded and lost).
   * Lexical state is set to YYINITIAL.
   *
   * @param reader   the new input stream
   * @exception   java.io.IOException  if closing the old reader fails
   */
  public final void yyreset(java.io.Reader reader) throws java.io.IOException {
    yyclose();
    yy_reader = reader;
    yy_atBOL  = true;
    yy_atEOF  = false;
    yy_endRead = yy_startRead = 0;
    yy_currentPos = yy_markedPos = yy_pushbackPos = 0;
    yyline = yychar = yycolumn = 0;
    yy_lexical_state = YYINITIAL;
  }


  /**
   * Returns the current lexical state.
   */
  public final int yystate() {
    return yy_lexical_state;
  }


  /**
   * Enters a new lexical state
   *
   * @param newState the new lexical state
   */
  public final void yybegin(int newState) {
    yy_lexical_state = newState;
  }


  /**
   * Returns the text matched by the current regular expression.
   */
  public final String yytext() {
    return new String( yy_buffer, yy_startRead, yy_markedPos-yy_startRead );
  }


  /**
   * Returns the character at position <code>pos</code> from the
   * matched text.
   *
   * It is equivalent to yytext().charAt(pos), but faster
   *
   * @param pos the position of the character to fetch.
   *            A value from 0 to yylength()-1.
   *
   * @return the character at position pos
   */
  public final char yycharat(int pos) {
    return yy_buffer[yy_startRead+pos];
  }


  /**
   * Returns the length of the matched text region.
   */
  public final int yylength() {
    return yy_markedPos-yy_startRead;
  }


  /**
   * Reports an error that occurred while scanning by throwing an
   * {@link Error} carrying the message for the given code. Codes outside the
   * message table are reported with the "unknown error" message.
   *
   * In a wellformed scanner (no or only correct usage of
   * yypushback(int) and a match-all fallback rule) this method
   * will only be called with things that "Can't Possibly Happen".
   *
   * Usual syntax/scanner level error handling should be done
   * in error fallback rules.
   *
   * @param   errorCode  the code of the errormessage to display
   */
  private void yy_ScanError(int errorCode) {
    String message;
    if (errorCode >= 0 && errorCode < YY_ERROR_MSG.length) {
      message = YY_ERROR_MSG[errorCode];
    }
    else {
      message = YY_ERROR_MSG[YY_UNKNOWN_ERROR];
    }

    throw new Error(message);
  }


  /**
   * Pushes the specified amount of characters back into the input stream.
   *
   * They will be read again by then next call of the scanning method
   *
   * @param number  the number of characters to be read again.
   *                This number must not be greater than yylength()!
   */
  private void yypushback(int number)  {
    if ( number > yylength() )
      yy_ScanError(YY_PUSHBACK_2BIG);

    yy_markedPos -= number;
  }


  /**
   * Resumes scanning until the next regular expression is matched,
   * the end of input is encountered or an I/O-Error occurs.
   *
   * @return      the next token, or <code>null</code> at end of input
   * @exception   java.io.IOException  if any I/O-Error occurs
   */
  public String next() throws java.io.IOException {
    int yy_input;
    int yy_action;

    // cached fields:
    int yy_currentPos_l;
    int yy_startRead_l;
    int yy_markedPos_l;
    int yy_endRead_l = yy_endRead;
    char [] yy_buffer_l = yy_buffer;
    char [] yycmap_l = yycmap;

    int [] yytrans_l = yytrans;
    int [] yy_rowMap_l = yy_rowMap;
    byte [] yy_attr_l = YY_ATTRIBUTE;

    while (true) {
      yy_markedPos_l = yy_markedPos;

      yy_action = -1;

      // the new match starts where the previous one ended
      yy_startRead_l = yy_currentPos_l = yy_currentPos =
                       yy_startRead = yy_markedPos_l;

      yy_state = yy_lexical_state;


      yy_forAction: {
        while (true) {

          if (yy_currentPos_l < yy_endRead_l)
            yy_input = yy_buffer_l[yy_currentPos_l++];
          else if (yy_atEOF) {
            yy_input = YYEOF;
            break yy_forAction;
          }
          else {
            // store back cached positions
            yy_currentPos  = yy_currentPos_l;
            yy_markedPos   = yy_markedPos_l;
            boolean eof = yy_refill();
            // get translated positions and possibly new buffer
            yy_currentPos_l  = yy_currentPos;
            yy_markedPos_l   = yy_markedPos;
            yy_buffer_l      = yy_buffer;
            yy_endRead_l     = yy_endRead;
            if (eof) {
              yy_input = YYEOF;
              break yy_forAction;
            }
            else {
              yy_input = yy_buffer_l[yy_currentPos_l++];
            }
          }
          int yy_next = yytrans_l[ yy_rowMap_l[yy_state] + yycmap_l[yy_input] ];
          if (yy_next == -1) break yy_forAction;
          yy_state = yy_next;

          int yy_attributes = yy_attr_l[yy_state];
          if ( (yy_attributes & 1) == 1 ) {
            // accepting state: remember it as the longest match so far
            yy_action = yy_state;
            yy_markedPos_l = yy_currentPos_l;
            // bit 8: no need to look further, stop right here
            if ( (yy_attributes & 8) == 8 ) break yy_forAction;
          }

        }
      }

      // store back cached position
      yy_markedPos = yy_markedPos_l;

      switch (yy_action) {

        case 32:
          return "N'T";
        case 31:
          return "n't";
        case 10:
        case 11:
          // drop the first character of the match
          return yytext().substring(1);
        case 1:
        case 2:
        case 3:
        case 4:
        case 5:
        case 6:
        case 7:
        case 8:
        case 19:
        case 20:
        case 29:
        case 30:
        case 35:
        case 36:
          return yytext();
        case 24:
          return "\"";
        case 23:
          return "$";
        case 22:
          return "[";
        case 21:
          return "(";
        case 15:
          return ")";
        case 16:
          return "]";
        case 17:
          return "%";
        case 18:
          return "`";
        default:
          if (yy_input == YYEOF && yy_startRead == yy_currentPos) {
            yy_atEOF = true;
            return null;
          }
          else {
            // always throws
            yy_ScanError(YY_NO_MATCH);
          }
      }
    }
  }

  /**
   * Runs the scanner on input files.
   *
   * Every file named on the command line is scanned to its end; the tokens
   * returned by next() are discarded. Problems opening or reading a file are
   * reported on System.out and processing continues with the next file.
   * Each file's reader is closed once that file has been processed.
   *
   * @param argv   the command line, contains the filenames to run
   *               the scanner on.
   */
  public static void main(String argv[]) {
    if (argv.length == 0) {
      System.out.println("Usage : java PTB2TextLexer <inputfile>");
      return;
    }

    for (int i = 0; i < argv.length; i++) {
      PTB2TextLexer scanner = null;
      try {
        scanner = new PTB2TextLexer( new java.io.FileReader(argv[i]) );
        while ( !scanner.yy_atEOF ) {
          scanner.next();
        }
      }
      catch (java.io.FileNotFoundException e) {
        System.out.println("File not found : \""+argv[i]+"\"");
      }
      catch (java.io.IOException e) {
        System.out.println("IO error scanning file \""+argv[i]+"\"");
        System.out.println(e);
      }
      catch (Exception e) {
        System.out.println("Unexpected exception:");
        e.printStackTrace();
      }
      finally {
        // release the file handle even when scanning failed
        if (scanner != null) {
          try {
            scanner.yyclose();
          }
          catch (java.io.IOException e) {
            System.out.println("IO error scanning file \""+argv[i]+"\"");
            System.out.println(e);
          }
        }
      }
    }
  }


}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy