All Downloads are FREE. Search and download functionalities are using the official Maven repository.

java_cup.lexer Maven / Gradle / Ivy

/*
 * Copyright 2021 EPAM Systems, Inc
 *
 * See the NOTICE file distributed with this work for additional information
 * regarding copyright ownership. Licensed under the Apache License,
 * Version 2.0 (the "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package java_cup;

import java_cup.runtime.Symbol;
import java.util.Hashtable;

/** This class implements a small scanner (aka lexical analyzer or lexer) for
 *  the JavaCup specification.  This scanner reads characters from standard 
 *  input (System.in) and returns integers corresponding to the terminal 
 *  number of the next Symbol. Once end of input is reached the EOF Symbol is 
 *  returned on every subsequent call.

* Symbols currently returned include:

 *    Symbol        Constant Returned     Symbol        Constant Returned
 *    ------        -----------------     ------        -----------------
 *    "package"     PACKAGE               "import"      IMPORT 
 *    "code"        CODE                  "action"      ACTION 
 *    "parser"      PARSER                "terminal"    TERMINAL
 *    "non"         NON                   "init"        INIT 
 *    "scan"        SCAN                  "with"        WITH
 *    "start"       START                 "precedence"  PRECEDENCE
 *    "left"        LEFT		  "right"       RIGHT
 *    "nonassoc"    NONASSOC		  "%prec        PRECENT_PREC  
 *      [           LBRACK                  ]           RBRACK
 *      {           LBRACE                  }           RBRACE
 *      ;           SEMI 
 *      ,           COMMA                   *           STAR 
 *      .           DOT                     :           COLON
 *      ::=         COLON_COLON_EQUALS      |           BAR
 *    identifier    ID                    {:...:}       CODE_STRING
 *    "nonterminal" NONTERMINAL
 *  
* All symbol constants are defined in sym.java which is generated by * JavaCup from parser.cup.

* * In addition to the scanner proper (called first via init() then with * next_token() to get each Symbol) this class provides simple error and * warning routines and keeps a count of errors and warnings that is * publicly accessible.

* * This class is "static" (i.e., it has only static members and methods). * * @version last updated: 7/3/96 * @author Frank Flannery */ public class lexer { /*-----------------------------------------------------------*/ /*--- Constructor(s) ----------------------------------------*/ /*-----------------------------------------------------------*/ /** The only constructor is private, so no instances can be created. */ private lexer() { } /*-----------------------------------------------------------*/ /*--- Static (Class) Variables ------------------------------*/ /*-----------------------------------------------------------*/ /** First character of lookahead. */ protected static int next_char; /** Second character of lookahead. */ protected static int next_char2; /** Second character of lookahead. */ protected static int next_char3; /** Second character of lookahead. */ protected static int next_char4; /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/ /** EOF constant. */ protected static final int EOF_CHAR = -1; /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/ /** Table of keywords. Keywords are initially treated as identifiers. * Just before they are returned we look them up in this table to see if * they match one of the keywords. The string of the name is the key here, * which indexes Integer objects holding the symbol number. */ protected static Hashtable keywords = new Hashtable(23); /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/ /** Table of single character symbols. For ease of implementation, we * store all unambiguous single character Symbols in this table of Integer * objects keyed by Integer objects with the numerical value of the * appropriate char (currently Character objects have a bug which precludes * their use in tables). */ protected static Hashtable char_symbols = new Hashtable(11); /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/ /** Current line number for use in error messages. */ protected static int current_line = 1; /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/ /** Character position in current line. */ protected static int current_position = 1; /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/ /** Character position in current line. */ protected static int absolute_position = 1; /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/ /** Count of total errors detected so far. */ public static int error_count = 0; /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/ /** Count of warnings issued so far */ public static int warning_count = 0; /*-----------------------------------------------------------*/ /*--- Static Methods ----------------------------------------*/ /*-----------------------------------------------------------*/ /** Initialize the scanner. This sets up the keywords and char_symbols * tables and reads the first two characters of lookahead. */ public static void init() throws java.io.IOException { /* set up the keyword table */ keywords.put("package", new Integer(sym.PACKAGE)); keywords.put("import", new Integer(sym.IMPORT)); keywords.put("code", new Integer(sym.CODE)); keywords.put("action", new Integer(sym.ACTION)); keywords.put("parser", new Integer(sym.PARSER)); keywords.put("terminal", new Integer(sym.TERMINAL)); keywords.put("non", new Integer(sym.NON)); keywords.put("nonterminal",new Integer(sym.NONTERMINAL));// [CSA] keywords.put("init", new Integer(sym.INIT)); keywords.put("scan", new Integer(sym.SCAN)); keywords.put("with", new Integer(sym.WITH)); keywords.put("start", new Integer(sym.START)); keywords.put("precedence", new Integer(sym.PRECEDENCE)); keywords.put("left", new Integer(sym.LEFT)); keywords.put("right", new Integer(sym.RIGHT)); keywords.put("nonassoc", new Integer(sym.NONASSOC)); /* set up the table of single character symbols */ char_symbols.put(new Integer(';'), new Integer(sym.SEMI)); char_symbols.put(new Integer(','), new Integer(sym.COMMA)); char_symbols.put(new Integer('*'), new Integer(sym.STAR)); char_symbols.put(new Integer('.'), new Integer(sym.DOT)); char_symbols.put(new Integer('|'), new Integer(sym.BAR)); char_symbols.put(new Integer('['), new Integer(sym.LBRACK)); char_symbols.put(new Integer(']'), new Integer(sym.RBRACK)); char_symbols.put(new Integer('}'), new Integer(sym.RBRACE)); char_symbols.put(new Integer('<'), new Integer(sym.LT)); char_symbols.put(new Integer('>'), new Integer(sym.GT)); /* read two characters of lookahead */ next_char = System.in.read(); if (next_char == EOF_CHAR) { next_char2 = EOF_CHAR; next_char3 = EOF_CHAR; next_char4 = EOF_CHAR; } else { next_char2 = System.in.read(); if (next_char2 == EOF_CHAR) { next_char3 = EOF_CHAR; next_char4 = EOF_CHAR; } else { next_char3 = System.in.read(); if (next_char3 == EOF_CHAR) { next_char4 = EOF_CHAR; } else { next_char4 = System.in.read(); } } } } /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/ /** Advance the scanner one character in the input stream. This moves * next_char2 to next_char and then reads a new next_char2. */ protected static void advance() throws java.io.IOException { int old_char; old_char = next_char; next_char = next_char2; if (next_char == EOF_CHAR) { next_char2 = EOF_CHAR; next_char3 = EOF_CHAR; next_char4 = EOF_CHAR; } else { next_char2 = next_char3; if (next_char2 == EOF_CHAR) { next_char3 = EOF_CHAR; next_char4 = EOF_CHAR; } else { next_char3 = next_char4; if (next_char3 == EOF_CHAR) { next_char4 = EOF_CHAR; } else { next_char4 = System.in.read(); } } } /* count this */ absolute_position++; current_position++; if (old_char == '\n' || (old_char == '\r' && next_char!='\n')) { current_line++; current_position = 1; } } /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/ /** Emit an error message. The message will be marked with both the * current line number and the position in the line. Error messages * are printed on standard error (System.err). * @param message the message to print. */ public static void emit_error(String message) { System.err.println("Error at " + current_line + "(" + current_position + "): " + message); error_count++; } /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/ /** Emit a warning message. The message will be marked with both the * current line number and the position in the line. Messages are * printed on standard error (System.err). * @param message the message to print. */ public static void emit_warn(String message) { System.err.println("Warning at " + current_line + "(" + current_position + "): " + message); warning_count++; } /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/ /** Determine if a character is ok to start an id. * @param ch the character in question. */ protected static boolean id_start_char(int ch) { /* allow for % in identifiers. a hack to allow my %prec in. Should eventually make lex spec for this frankf */ return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || (ch == '_'); // later need to deal with non-8-bit chars here } /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/ /** Determine if a character is ok for the middle of an id. * @param ch the character in question. */ protected static boolean id_char(int ch) { return id_start_char(ch) || (ch >= '0' && ch <= '9'); } /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/ /** Try to look up a single character symbol, returns -1 for not found. * @param ch the character in question. */ protected static int find_single_char(int ch) { Integer result; result = (Integer)char_symbols.get(new Integer((char)ch)); if (result == null) return -1; else return result.intValue(); } /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/ /** Handle swallowing up a comment. Both old style C and new style C++ * comments are handled. */ protected static void swallow_comment() throws java.io.IOException { /* next_char == '/' at this point */ /* is it a traditional comment */ if (next_char2 == '*') { /* swallow the opener */ advance(); advance(); /* swallow the comment until end of comment or EOF */ for (;;) { /* if its EOF we have an error */ if (next_char == EOF_CHAR) { emit_error("Specification file ends inside a comment"); return; } /* if we can see the closer we are done */ if (next_char == '*' && next_char2 == '/') { advance(); advance(); return; } /* otherwise swallow char and move on */ advance(); } } /* is its a new style comment */ if (next_char2 == '/') { /* swallow the opener */ advance(); advance(); /* swallow to '\n', '\r', '\f', or EOF */ while (next_char != '\n' && next_char != '\r' && next_char != '\f' && next_char!=EOF_CHAR) advance(); return; } /* shouldn't get here, but... if we get here we have an error */ emit_error("Malformed comment in specification -- ignored"); advance(); } /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/ /** Swallow up a code string. Code strings begin with "{:" and include all characters up to the first occurrence of ":}" (there is no way to include ":}" inside a code string). The routine returns a String object suitable for return by the scanner. */ protected static Symbol do_code_string() throws java.io.IOException { StringBuffer result = new StringBuffer(); /* at this point we have lookahead of "{:" -- swallow that */ advance(); advance(); /* save chars until we see ":}" */ while (!(next_char == ':' && next_char2 == '}')) { /* if we have run off the end issue a message and break out of loop */ if (next_char == EOF_CHAR) { emit_error("Specification file ends inside a code string"); break; } /* otherwise record the char and move on */ result.append(new Character((char)next_char)); advance(); } /* advance past the closer and build a return Symbol */ advance(); advance(); return new Symbol(sym.CODE_STRING, result.toString()); } /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/ /** Process an identifier. Identifiers begin with a letter, underscore, * or dollar sign, which is followed by zero or more letters, numbers, * underscores or dollar signs. This routine returns a String suitable * for return by the scanner. */ protected static Symbol do_id() throws java.io.IOException { StringBuffer result = new StringBuffer(); String result_str; Integer keyword_num; char buffer[] = new char[1]; /* next_char holds first character of id */ buffer[0] = (char)next_char; result.append(buffer,0,1); advance(); /* collect up characters while they fit in id */ while(id_char(next_char)) { buffer[0] = (char)next_char; result.append(buffer,0,1); advance(); } /* extract a string and try to look it up as a keyword */ result_str = result.toString(); keyword_num = (Integer)keywords.get(result_str); /* if we found something, return that keyword */ if (keyword_num != null) return new Symbol(keyword_num.intValue()); /* otherwise build and return an id Symbol with an attached string */ return new Symbol(sym.ID, result_str); } /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/ /** Return one Symbol. This is the main external interface to the scanner. * It consumes sufficient characters to determine the next input Symbol * and returns it. To help with debugging, this routine actually calls * real_next_token() which does the work. If you need to debug the * parser, this can be changed to call debug_next_token() which prints * a debugging message before returning the Symbol. */ public static Symbol next_token() throws java.io.IOException { return real_next_token(); } /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/ /** Debugging version of next_token(). This routine calls the real scanning * routine, prints a message on System.out indicating what the Symbol is, * then returns it. */ public static Symbol debug_next_token() throws java.io.IOException { Symbol result = real_next_token(); System.out.println("# next_Symbol() => " + result.sym); return result; } /*. . . . . . . . . . . . . . . . . . . . . . . . . . . . . .*/ /** The actual routine to return one Symbol. This is normally called from * next_token(), but for debugging purposes can be called indirectly from * debug_next_token(). */ protected static Symbol real_next_token() throws java.io.IOException { int sym_num; for (;;) { /* look for white space */ if (next_char == ' ' || next_char == '\t' || next_char == '\n' || next_char == '\f' || next_char == '\r') { /* advance past it and try the next character */ advance(); continue; } /* look for a single character symbol */ sym_num = find_single_char(next_char); if (sym_num != -1) { /* found one -- advance past it and return a Symbol for it */ advance(); return new Symbol(sym_num); } /* look for : or ::= */ if (next_char == ':') { /* if we don't have a second ':' return COLON */ if (next_char2 != ':') { advance(); return new Symbol(sym.COLON); } /* move forward and look for the '=' */ advance(); if (next_char2 == '=') { advance(); advance(); return new Symbol(sym.COLON_COLON_EQUALS); } else { /* return just the colon (already consumed) */ return new Symbol(sym.COLON); } } /* find a "%prec" string and return it. otherwise, a '%' was found, which has no right being in the specification otherwise */ if (next_char == '%') { advance(); if ((next_char == 'p') && (next_char2 == 'r') && (next_char3 == 'e') && (next_char4 == 'c')) { advance(); advance(); advance(); advance(); return new Symbol(sym.PERCENT_PREC); } else { emit_error("Found extraneous percent sign"); } } /* look for a comment */ if (next_char == '/' && (next_char2 == '*' || next_char2 == '/')) { /* swallow then continue the scan */ swallow_comment(); continue; } /* look for start of code string */ if (next_char == '{') if (next_char2 == ':') return do_code_string(); else { advance (); return new Symbol(sym.LBRACE); } /* look for an id or keyword */ if (id_start_char(next_char)) return do_id(); /* look for EOF */ if (next_char == EOF_CHAR) return new Symbol(sym.EOF); /* if we get here, we have an unrecognized character */ emit_warn("Unrecognized character '" + new Character((char)next_char) + "'(" + next_char + ") -- ignored"); /* advance past it */ advance(); } } /*-----------------------------------------------------------*/ }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy