All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.sf.eBus.text.TokenLexer Maven / Gradle / Ivy

The newest version!
//
// Copyright 2001 - 2008 Charles W. Rapp
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

package net.sf.eBus.text;

import java.io.EOFException;
import java.io.IOException;
import java.io.Reader;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.util.Collection;
import java.util.Locale;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Provides a generalized token lexer capability. This lexer
 * ability is beyond {@code java.util.StringTokenizer} in that
 * it identifies the token type along with the token and converts
 * the token string into the type's corresponding Java instance.
 * There are nine (9) pre-defined token types and two special
 * types: {@link net.sf.eBus.text.TokenLexer#ERROR} and
 * {@link net.sf.eBus.text.TokenLexer#EOF}. {@code ERROR} is returned
 * when an recoverable error occurred. {@code EOF} is returned
 * when the input end is reached and no more tokens will be
 * returned.
 * 

* The pre-defined token types are: *

    *
  1. * {@link net.sf.eBus.text.TokenLexer#CHARACTER}: * a single character between single quotes ('). *
  2. *
  3. * {@link net.sf.eBus.text.TokenLexer#COMMENT}: * Either a {@code //} or slash star comment. * Supports nested comments. *
  4. *
  5. * {@link net.sf.eBus.text.TokenLexer#FLOAT}: A decimal number. *
  6. *
  7. * {@link net.sf.eBus.text.TokenLexer#INTEGER}: * An integer number. *
  8. *
  9. * {@link net.sf.eBus.text.TokenLexer#NAME}: * An alpha-numeric identifier. *
  10. *
  11. * {@link net.sf.eBus.text.TokenLexer#OPERATOR}: * Punctuation only identifier. *
  12. *
  13. * {@link net.sf.eBus.text.TokenLexer#SOURCE}: * Raw, unanalyzed input. *
  14. *
  15. * {@link net.sf.eBus.text.TokenLexer#STRING}: * Zero or more characters between double quotes * (""). *
  16. *
* There is support for user-defined keyword, operator and * delimiter tokens. When a {@link net.sf.eBus.text.TokenLexer#NAME} * token is found, the user keywords map is checked if it * contains the token as a keyword. If so, then the associated * token type is returned instead of {@code NAME}. When a * {@link net.sf.eBus.text.TokenLexer#OPERATOR} token is found, * both the user operators and delimiters maps are checked. *

* The user-defined token maps should meet the following * criteria: *

*
    *
  • * The token type values must be >= to * {@link net.sf.eBus.text.TokenLexer#NEXT_TOKEN}. *
  • *
  • * The token type values do not need be unique either within * or across maps. *
  • *
  • * The token type values do not need to be consecutive. *
  • *
* The basic algorithm using {@code TokenLexer} is: *
 *   
 import java.io.Reader;
 import net.sf.eBus.text.TokenLexer;
 import net.sf.eBus.text.Token;
 ...
 TokenLexer lexer = new TokenLexer(Keywords, Operators, Delimiters);
 Token token;
 Reader input = ...;

 // Set the input to be tokenized.
 lexer.input(input);

 // Continue retrieving until no more tokens.
 while ((token = lexer.nextToken()).type() != TokenLexer.EOF)
 {
     // Process the next token based on token type.
 }

 // Finish up the tokenization.
   
 * 
*

Raw Lexical Mode

*

* Users may not want the lexer to analyze input between two * well-defined delimiters. This data is collected and returned * as a {@link net.sf.eBus.text.TokenLexer#SOURCE} token when the * terminating delimiter is reached. Raw mode requires both an * an opening and closing delimiter specified. This allows the * lexer to track the appearance of nested delimiters within the * input and return only when the top-level terminating delimiter * is found. *

*

* Raw lexical mode is used when input contains sub-text to be * handled by a different lexer. *

p * * @author Charles Rapp */ @SuppressWarnings("unchecked") public final class TokenLexer { //--------------------------------------------------------------- // Enums. // /** * The lexer will either analyze the tokens identifying the * type or collect raw input until a terminating delimiter * is found. */ public enum LexMode { /** * When in cooked mode identify the token type. */ COOKED, /** * When in raw mode, collect characters until the * terminating delimiter is found. * RAW is used to read in all characters between parens, * braces, etc. RAW mode will read in an entire file * if the open, close delimiters are mismatched. */ RAW } // end of enum LexMode //--------------------------------------------------------------- // Member data. // //----------------------------------------------------------- // Constants. // /** * When the raw mode open character is set to U+0000, this * means there is no open character, only a close character. */ public static final char NO_OPEN_CHAR = 0; // Read in this many bytes at a time into the buffer. private static final int MAX_BUFFER_LEN = 4096; // Read into the input buffer starting at this offset. private static final int BUFFER_OFFSET = 2; // The read buffer's allocated size in bytes. private static final int READ_BUFFER_LEN = MAX_BUFFER_LEN + BUFFER_OFFSET; // New line characters. private static final int EOL = 10; private static final int CR = 13; // Each token type has an integer value. These token type // values are package-wide scope so the parser can access // them. /** * An error occurred when seeking the next token (0). */ public static final int ERROR = 0; /** * The end of the input is reached (1). */ public static final int EOF = 1; /** * A single-quoted character token (2). Token value is a * {@code java.lang.Character} instance. */ public static final int CHARACTER = 2; /** * Either a {@code //} or a slash star * comment (3). Nested comments are supported. */ public static final int COMMENT = 3; /** * A floating point number (4). Token value is a * {@code java.lang.Double} instance. */ public static final int FLOAT = 4; /** * An integer number (5). Token value is a * {@code java.lang.Long} instance. */ public static final int INTEGER = 5; /** * An alphanumberic identifier (6). If the token appears in * the user-defined keywords map, then the user's token type * is returned instead. */ public static final int NAME = 6; /** * Token consists solely of punctuation characters (7). * If the token is in the user-defined operator or * delimiter map, then the user's token type is returned * instead. *

* Punctuation characters are: *

     *   
     * !  "  #  $  %  &  '  ( )  *
     * +  ,  -  .  /  :  ;  <  =  >
     * ?  @  [  \  ]  ^  _  `  {  }
     * |  ~
     *   
     * 
*/ public static final int OPERATOR = 7; /** * Raw, unanalyzed input (8). * @see net.sf.eBus.text.TokenLexer.LexMode#RAW */ public static final int SOURCE = 8; /** * A double-quoted string (9). */ public static final int STRING = 9; /** * There are eleven (11) predefined token types. */ public static final int TOKEN_COUNT = STRING + 1; /** * User-defined tokens must be >= 11. */ public static final int NEXT_TOKEN = TOKEN_COUNT; // The ASCII characters all have explicit transitions. // Unicode characters are simply given the unicode // transition. private static final int MIN_ASCII_CHAR = 0; private static final int MAX_ASCII_CHAR = 128; // The recognized punctuation characters. private static final int[] PUNCTUATION = { '!', '"', '#', '$', '%', '&', '\'', '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '}', '|', '~' }; //----------------------------------------------------------- // Statics. // /** * Logging subsystem interface. */ private static final Logger sLogger = LoggerFactory.getLogger(TokenLexer.class); // Use this array to convert a token type from integer ID // back to a human-readable name. private static String[] sTypeName; // Create an array which maps ASCII characters to // transitions. private static Method[] sTransMethod; static { String transName = ""; sTypeName = new String[TOKEN_COUNT]; sTypeName[ERROR] = "ERROR"; sTypeName[CHARACTER] = "CHARACTER"; sTypeName[COMMENT] = "COMMENT"; sTypeName[EOF] = "EOF"; sTypeName[FLOAT] = "FLOAT"; sTypeName[INTEGER] = "INTEGER"; sTypeName[NAME] = "NAME"; sTypeName[SOURCE] = "SOURCE"; sTypeName[STRING] = "STRING"; // Set up the transition map and token types. sTransMethod = new Method[TokenLexer.MAX_ASCII_CHAR]; try { int i; final Class fsmClass = TokenLexerContext.class; Method unicode; Method whitespace; Method alpha; Method digit; Method punctuation; Method eol; transName = "unicode"; unicode = fsmClass.getDeclaredMethod( transName, char.class); transName = "whitespace"; whitespace = fsmClass.getDeclaredMethod( transName, char.class); transName = "alpha"; alpha = fsmClass.getDeclaredMethod( transName, char.class); transName = "digit"; digit = fsmClass.getDeclaredMethod( transName, char.class); transName = "punctuation"; punctuation = fsmClass.getDeclaredMethod( transName, char.class); transName = "EOL"; eol = fsmClass.getDeclaredMethod( transName, char.class); // Set all transitions to unicode and then set known // characters to other transitions. for (i = MIN_ASCII_CHAR; i < MAX_ASCII_CHAR; ++i) { if (Character.isWhitespace(i)) { sTransMethod[i] = whitespace; } else { sTransMethod[i] = unicode; } } for (i = 'a'; i <= 'z'; ++i) { sTransMethod[i] = alpha; } for (i = 'A'; i <= 'Z'; ++i) { sTransMethod[i] = alpha; } for (i = '0'; i <= '9'; ++i) { sTransMethod[i] = digit; } // Only new line and carriage return are recognized // as end-of-line. sTransMethod[EOL] = eol; sTransMethod[CR] = eol; // Punctuation characters. for (i = 0; i < PUNCTUATION.length; ++i) { sTransMethod[PUNCTUATION[i]] = punctuation; } } catch (NoSuchMethodException | SecurityException jex) { sLogger.error( "INITIALIZATION ERROR! No such method as LexerContext.{}(char).", transName); } } // end of static //----------------------------------------------------------- // Locals. // // The lexer's state map. private TokenLexerContext mLexerFSM; // The file being parsed. private Reader mReader; // Either we are in "cooked" mode and looking for tokens or // we are in "raw" mode and are not processing the characters // but simply collecting them. private LexMode mMode; // Stop the event loop - a token has been found. private boolean mStopFlag; // Store the latest token in this object. private Token mToken; // Collect the token in a string buffer before making a // string out of it. private final StringBuilder mTokenBuffer; // The lexer is processing this line. private int mLineNumber; // Read in a buffer-full of data rather than one character // at a time. private final char[] mReadBuffer; // The actual number of read characters in the buffer. // May be less than the buffer's size. private int mBufferSize; // The next character to be processed. private int mReadIndex; // The offset into the input. private int mOffset; // True when the end-of-file is reached. private boolean mEofFlag; // When this flag is turned on, the parseer wants us to // collect a "raw" token. Keep track of nested clauses // using the depth count. private char mOpenChar; private char mCloseChar; // Maps keyword to token type. private final Map mKeywords; // Maps operators to token type. private final Map mOperators; // Maps delimiters to token type. private final Map mDelimiters; //--------------------------------------------------------------- // Member methods. // //----------------------------------------------------------- // Constructors. // /** * Creates a message layout lexer using the specified * keywords, operator and delimiters. These maps may be * empty but not {@code null}. * @param keywords Keyword to integer identifier mapping. * @param operators Operator to integer identifier mapping. * @param delimiters Delimiter to integer identifier mapping. * @exception IllegalArgumentException * if any of the user maps contains a value < * {@link net.sf.eBus.text.TokenLexer#NEXT_TOKEN}. */ public TokenLexer(final Map keywords, final Map operators, final Map delimiters) { mReader = null; mTokenBuffer = new StringBuilder(); mReadBuffer = new char[READ_BUFFER_LEN]; mBufferSize = 0; mReadIndex = 0; mOffset = 0; mLineNumber = 0; mEofFlag = false; mKeywords = keywords; mOperators = operators; mDelimiters = delimiters; // Check the maps validity. validate(keywords.values(), "keywords"); validate(operators.values(), "operators"); validate(delimiters.values(), "delimiters"); // We are in the "cooked" processing mode by default. mMode = LexMode.COOKED; mLexerFSM = null; } // end of TokenLexer(Map) // // end of Constructors. //----------------------------------------------------------- //----------------------------------------------------------- // Get methods. // /** * Returns the current line number being tokenized. * @return the current line number being tokenized. */ public int lineNumber() { return (mLineNumber); } // end of lineNumber() /** * Returns the current offset into the input. * @return the current offset into the input. */ public int offset() { return (mOffset); } // end of offset() /** * Returns the current lexer mode. * @return the current lexer mode. */ public LexMode mode() { return (mMode); } // end of mode() // // end of Get methods. //----------------------------------------------------------- //----------------------------------------------------------- // Set methods. // /** * Extract tokens from this input stream. * @param reader Tokenize this input. */ public void input(final Reader reader) { mReader = reader; mBufferSize = 0; mReadIndex = 0; mOffset = 0; mLineNumber = 0; mEofFlag = false; mLexerFSM = new TokenLexerContext(this); // mLexerFSM.setDebugFlag(true); mLexerFSM.enterStartState(); } // end of input(Reader) /** * Switch to raw tokenization. * @param openChar The open clause delimiter. * @param closeChar The close clause delimiter. * @see #cookedMode() */ public void rawMode(final char openChar, final char closeChar) { mMode = LexMode.RAW; mOpenChar = openChar; mCloseChar = closeChar; } // end of rawMode(char, char) /** * Switch back to cooked tokenization. * @see #rawMode(char, char) */ public void cookedMode() { mMode = LexMode.COOKED; mTokenBuffer.delete(0, mTokenBuffer.length()); } // end of cookedMode() // // end of Set methods. //----------------------------------------------------------- /** * Returns the next token found in the input stream. If there * are no more tokens in the input stream, then * {@link net.sf.eBus.text.TokenLexer#EOF} is returned. * @return the next token found in the input stream. * @throws IllegalStateException * if input reader is not set. */ public Token nextToken() { Token retval; if (mReader == null) { throw (new IllegalStateException("reader not set")); } if (mEofFlag) { retval = new Token(EOF, null, "", mLineNumber); } else if (mMode == LexMode.COOKED) { retval = nextCookedToken(); } else { retval = nextRawToken(); } return (retval); } // end of nextToken() //----------------------------------------------------------- // State Machine Actions // // Returns the current token. /* package */ String token() { return (mTokenBuffer.toString()); } // end of token() /** * Clears out the collected token buffer and resets the token * object to its initial state. */ /* package */ void startToken() { mToken = null; mTokenBuffer.delete(0, mTokenBuffer.length()); } // end of startToken() /** * Appends character to the token. * @param c append this character. */ /* package */ void appendToken(final char c) { mTokenBuffer.append(c); } // end of appendToken(char) /* package */ void endToken(final int type) { final String tokenStr = mTokenBuffer.toString(); // If this is a NAME type, then check if this name is // actually a message type, data type or keyword. switch (type) { case NAME: mToken = nameToken(tokenStr); break; case OPERATOR: mToken = operatorToken(tokenStr); break; case CHARACTER: mToken = new Token(type, tokenStr.charAt(0), tokenStr, mLineNumber); break; case FLOAT: mToken = floatToken(tokenStr); break; case INTEGER: mToken = longToken(tokenStr); break; default: mToken = new Token( type, tokenStr, tokenStr, mLineNumber); break; } mStopFlag = true; } // end of endToken(int) // A malformed token has been detected. /* package */ void badToken(final String errorMsg) { mToken = new Token( ERROR, errorMessage(errorMsg), "", mLineNumber); // Stop tokenizing. mStopFlag = true; } // end of badToken(String) // Back up one character in the file so that the character // will be read again when nextToken() is called. This is // usually done when one token is terminated by another. /* package */ void ungetChar() { --mReadIndex; --mOffset; } // end of ungetChar() // Returns true if there is a delimiter matching the // character. /* package */ boolean isDelimiter(final char c) { return (mDelimiters.containsKey(c)); } // end of containsDelimiter(char) // Returns the delimiter type. /* package */ int delimiterType(final char c) { return (mDelimiters.get(c)); } // end of delimiterType(char) // // end of State Machine Actions //----------------------------------------------------------- // Returns the next cooked token. private Token nextCookedToken() { char c; // Clear out the token and get ready to work. startToken(); try { mStopFlag = false; while (!mStopFlag) { c = readChar(); // If the character's integer value is greater // than 127, then issue a unicode transition // and let the lexer FSM decide whether it is // acceptable or not. if (c >= sTransMethod.length) { mLexerFSM.unicode(c); } else { // Translate character into a transition. sTransMethod[c].invoke(mLexerFSM, c); // If this is an end-of-line character, add // one to the current line number. CR-LF is // a single end-of-line. if (c == EOL) { ++mLineNumber; } } } } catch (EOFException e) { // If this is the end of the source file, let // the parser know. mLexerFSM.EOF(); } catch (InvocationTargetException | IllegalAccessException invokex) { badToken("Unknown token"); } catch (IOException ioex) { badToken(errorMessage(ioex.getMessage())); } return (mToken); } // end of nextCookedToken() // Keep reading in characters until the close character is // found. private Token nextRawToken() { final int startLine = mLineNumber; char c; String value; // Clear out the token and get ready to work. startToken(); // Keep reading until told to stop or the // end-of-file is reached. try { int depth = 0; mStopFlag = false; while (!mStopFlag) { c = readChar(); // When working in RAW mode, the close character // may naturally occur. The only way we have of // knowing if the close character is for us or // not is by keeping track of the matching open // characters seen. When an open character is // seen, add one to the depth. When a close // character is seen, then either: // + if the depth is zero, this is the end of the // raw code; return the token. // + if the depth is greater than zero, then // subtract one from the depth. mTokenBuffer.append(c); if (c == mCloseChar) { --depth; mStopFlag = (depth <= 0); } // If this is the open character, then // add one to the depth which lets us // know that the next close character // does *not* end the raw code section. else if (mOpenChar != NO_OPEN_CHAR && c == mOpenChar) { ++depth; } // If this is an end-of-line character, // add one to the current line number. // CR-LF is a single end-of-line. else if (c == EOL) { ++mLineNumber; } } value = mTokenBuffer.toString(); mToken = new Token(SOURCE, value, value, startLine); } catch (EOFException e) { final StringBuilder msg = new StringBuilder(); msg.append( "User source code contains an unbalanced "); msg.append(mOpenChar); msg.append(", "); msg.append(mCloseChar); msg.append(" pair."); // If this is the end of the source file, then the // raw code section has an unbalanced open character/ // close character pair. badToken(msg.toString()); } catch (IOException ioex) { badToken(errorMessage(ioex.getMessage())); } return (mToken); } // end of nextRawToken() // Returns either an name or a keyword token. private Token nameToken(final String token) { int type = NAME; Object value = token; // Convert the token string to all uppercase since // the message type and keywords are stored as such. final String key = token.toUpperCase(Locale.US); // If this key a keyword? if (mKeywords.containsKey(key)) { // The returned value is an integer containing // the token type. Extract the type and return // the token string. type = mKeywords.get(key); value = key; } return (new Token(type, value, token, mLineNumber)); } // end of nameToken() // Returns either an operator token or an error. private Token operatorToken(final String token) { int type; Object value = token; // Is this a known operator? if (!mOperators.containsKey(token)) { // No, it is unknown. Change this to an error. type = ERROR; value = "unknown operator"; } else { type = mOperators.get(token); } return (new Token(type, value, token, mLineNumber)); } // end of operatorToken(String) // Returns either a float token or an error. private Token floatToken(final String token) { int type = FLOAT; Object value; try { value = Double.valueOf(token); } catch (NumberFormatException formex) { final StringBuilder buffer = new StringBuilder(); buffer.append("invalid float, "); buffer.append(formex.getMessage()); type = ERROR; value = buffer.toString(); } return (new Token(type, value, token, mLineNumber)); } // end of floatToken(String) // Returns either a long token or an error. private Token longToken(final String initialToken) { String token = initialToken; int type = INTEGER; Object value; // If the first character is a '+', then strip that // from the string. if (token.charAt(0) == '+') { token = token.substring(1); } try { value = Long.valueOf(token); } catch (NumberFormatException formex) { final StringBuilder buffer = new StringBuilder(); buffer.append("invalid integer, "); buffer.append(formex.getMessage()); type = ERROR; value = buffer.toString(); } return (new Token(type, value, token, mLineNumber)); } // end of longToken(String) // Read the next character. Actually, this routine reads in // a large buffer and data returns the next character from // there. The idea is to do a few large, efficient reads and // make single character reads to be array retrievals. // NOTE: this lexer backs up at most two characters. So // when reading in a new buffer, copy the last two characters // to the first two bytes and read in the next maximum number // of bytes. private char readChar() throws IOException { char retval; // If we are at the end of the buffer, read the // next buffer-full. if (mReadIndex == mBufferSize) { int size; int offset = 0; int length; // Copy the last two bytes to the first two bytes. // Why? Because the lexer can back up to two bytes. if (mBufferSize > 2) { offset = 2; mReadBuffer[0] = mReadBuffer[mBufferSize - 2]; mReadBuffer[1] = mReadBuffer[mBufferSize - 1]; } else if (mBufferSize > 1) { offset = 1; mReadBuffer[0] = mReadBuffer[mBufferSize - 1]; } length = (MAX_BUFFER_LEN - offset); size = mReader.read(mReadBuffer, offset, length); // Has end of file been reached? if (size < 0) { // Yes. mBufferSize = 0; mEofFlag = true; throw (new EOFException("end-of-file reached")); } else { // The buffer's true size is the number of bytes // read plus the offset. mBufferSize = size + offset; mReadIndex = offset; } } retval = mReadBuffer[mReadIndex]; ++mReadIndex; ++mOffset; return (retval); } // end of readChar() // Make sure the token type values are in the user-defined // zone. Called for effect only. private void validate(final Collection values, final String name) { values.stream() .filter(value -> (value < NEXT_TOKEN)) .forEachOrdered( value -> { throw ( new IllegalArgumentException( String.format( "invalid %s token type (%s)", name, value))); }); } // end of validate(Collection, String) /** * Returns an error message starting with the given prefix * and containing the raw token buffer contents. * @param prefix prepend error message with this text. * @return token error message. */ private String errorMessage(final String prefix) { return ( String.format( "%s (token: %s)", prefix, mTokenBuffer.toString())); } // end of errorMessage(String) } // end of class TokenLexer




© 2015 - 2024 Weber Informatics LLC | Privacy Policy