// net.sf.eBus.text.TokenLexer Maven / Gradle / Ivy
//
// Go to download
// Show more of this group Show more artifacts with this name
// Show all versions of util Show documentation
// The newest version!
//
// Copyright 2001 - 2008 Charles W. Rapp
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//

package net.sf.eBus.text;

import java.io.EOFException;
import java.io.IOException;
import java.io.Reader;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.util.Collection;
import java.util.Locale;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Provides a generalized token lexer capability. This lexer
 * ability is beyond {@code java.util.StringTokenizer} in that
 * it identifies the token type along with the token and converts
 * the token string into the type's corresponding Java instance.
 * There are eight (8) pre-defined token types and two special
 * types: {@link net.sf.eBus.text.TokenLexer#ERROR} and
 * {@link net.sf.eBus.text.TokenLexer#EOF}. {@code ERROR} is returned
 * when a recoverable error occurred. {@code EOF} is returned
 * when the input end is reached and no more tokens will be
 * returned.
 * 
 * The pre-defined token types are:
 * 

 *   
 *     {@link net.sf.eBus.text.TokenLexer#CHARACTER}:
 *     a single character between single quotes (').
 *   
 *   
 *     {@link net.sf.eBus.text.TokenLexer#COMMENT}:
 *     Either a {@code //} or slash star comment.
 *     Supports nested comments.
 *   
 *   
 *     {@link net.sf.eBus.text.TokenLexer#FLOAT}: A decimal number.
 *   
 *   
 *     {@link net.sf.eBus.text.TokenLexer#INTEGER}:
 *     An integer number.
 *   
 *   
 *     {@link net.sf.eBus.text.TokenLexer#NAME}:
 *     An alpha-numeric identifier.
 *   
 *   
 *     {@link net.sf.eBus.text.TokenLexer#OPERATOR}:
 *     Punctuation only identifier.
 *   
 *   
 *     {@link net.sf.eBus.text.TokenLexer#SOURCE}:
 *     Raw, unanalyzed input.
 *   
 *   
 *     {@link net.sf.eBus.text.TokenLexer#STRING}:
 *     Zero or more characters between double quotes
 *     ("").
 *   
 * 
 * There is support for user-defined keyword, operator and
 * delimiter tokens. When a {@link net.sf.eBus.text.TokenLexer#NAME}
 * token is found, the user keywords map is checked if it
 * contains the token as a keyword. If so, then the associated
 * token type is returned instead of {@code NAME}. When a
 * {@link net.sf.eBus.text.TokenLexer#OPERATOR} token is found,
 * both the user operators and delimiters maps are checked.
 * 
 * The user-defined token maps should meet the following
 * criteria:
 * 
 * 
 *   
 *     The token type values must be greater than or equal to
 *     {@link net.sf.eBus.text.TokenLexer#NEXT_TOKEN}.
 *   
 *   
 *     The token type values do not need to be unique either
 *     within or across maps.
 *   
 *   
 *     The token type values do not need to be consecutive.
 *   
 * 
 * The basic algorithm using {@code TokenLexer} is:
 * <pre><code>
 * import java.io.Reader;
 * import net.sf.eBus.text.TokenLexer;
 * import net.sf.eBus.text.Token;
 * ...
 * TokenLexer lexer = new TokenLexer(Keywords, Operators, Delimiters);
 * Token token;
 * Reader input = ...;
 *
 * // Set the input to be tokenized.
 * lexer.input(input);
 *
 * // Continue retrieving until no more tokens.
 * while ((token = lexer.nextToken()).type() != TokenLexer.EOF)
 * {
 *     // Process the next token based on token type.
 * }
 *
 * // Finish up the tokenization.
 * </code></pre>
 *
 * Raw Lexical Mode
 * 
 * Users may not want the lexer to analyze input between two
 * well-defined delimiters. This data is collected and returned
 * as a {@link net.sf.eBus.text.TokenLexer#SOURCE} token when the
 * terminating delimiter is reached. Raw mode requires both an
 * opening and a closing delimiter to be specified. This allows the
 * lexer to track the appearance of nested delimiters within the
 * input and return only when the top-level terminating delimiter
 * is found.
 * 
 * 
 * Raw lexical mode is used when input contains sub-text to be
 * handled by a different lexer.
 *
 *
 * @author Charles Rapp
 */

@SuppressWarnings("unchecked")
public final class TokenLexer
{
//---------------------------------------------------------------
// Enums.
//

    /**
     * The lexer's operating mode. The lexer will either analyze
     * the tokens identifying the type ({@link #COOKED}) or
     * collect raw input until a terminating delimiter is found
     * ({@link #RAW}).
     */
    public enum LexMode
    {
        /**
         * When in cooked mode identify the token type.
         */
        COOKED,

        /**
         * When in raw mode, collect characters until the
         * terminating delimiter is found.
         * RAW is used to read in all characters between parens,
         * braces, etc. RAW mode will read in an entire file
         * if the open, close delimiters are mismatched.
         */
        RAW
    } // end of enum LexMode

//---------------------------------------------------------------
// Member data.
//

    //-----------------------------------------------------------
    // Constants.
    //

    /**
     * When the raw mode open character is set to U+0000, this
     * means there is no open character, only a close character.
     */
    public static final char NO_OPEN_CHAR = 0;

    // Read in at most this many characters at a time into the
    // buffer.
    private static final int MAX_BUFFER_LEN = 4096;

    // Number of extra slots added to the read buffer on top of
    // MAX_BUFFER_LEN (see READ_BUFFER_LEN).
    private static final int BUFFER_OFFSET = 2;

    // The read buffer's allocated size in characters.
    private static final int READ_BUFFER_LEN =
        MAX_BUFFER_LEN + BUFFER_OFFSET;

    // New line characters: line feed (10) and carriage
    // return (13).
    private static final int EOL = 10;
    private static final int  CR = 13;

    // Each token type has an integer value. These token type
    // values are public so the parser can access them.

    /**
     * An error occurred when seeking the next token (0).
     */
    public static final int ERROR         =  0;

    /**
     * The end of the input is reached (1).
     */
    public static final int EOF           =  1;

    /**
     * A single-quoted character token (2). Token value is a
     * {@code java.lang.Character} instance.
     */
    public static final int CHARACTER     =  2;

    /**
     * Either a {@code //} or a slash star
     * comment (3). Nested comments are supported.
     */
    public static final int COMMENT       =  3;

    /**
     * A floating point number (4). Token value is a
     * {@code java.lang.Double} instance.
     */
    public static final int FLOAT         =  4;

    /**
     * An integer number (5). Token value is a
     * {@code java.lang.Long} instance.
     */
    public static final int INTEGER       =  5;

    /**
     * An alphanumeric identifier (6). If the token appears in
     * the user-defined keywords map, then the user's token type
     * is returned instead.
     */
    public static final int NAME          =  6;

    /**
     * Token consists solely of punctuation characters (7).
     * If the token is in the user-defined operator or
     * delimiter map, then the user's token type is returned
     * instead.
     * 
     * Punctuation characters are:
     * 
     *   
     * !  "  #  $  %  &  '  ( )  *
     * +  ,  -  .  /  :  ;  <  =  >
     * ?  @  [  \  ]  ^  _  `  {  }
     * |  ~
     *   
     * 
     */
    public static final int OPERATOR      =  7;

    /**
     * Raw, unanalyzed input (8).
     * @see net.sf.eBus.text.TokenLexer.LexMode#RAW
     */
    public static final int SOURCE        =  8;

    /**
     * A double-quoted string (9).
     */
    public static final int STRING        =  9;

    /**
     * There are ten (10) predefined token types, counting the
     * special {@link #ERROR} and {@link #EOF} types.
     */
    public static final int TOKEN_COUNT   = STRING + 1;

    /**
     * User-defined tokens must be greater than or equal to this
     * value (10).
     */
    public static final int NEXT_TOKEN    = TOKEN_COUNT;

    // The ASCII characters all have explicit transitions.
    // Unicode characters are simply given the unicode
    // transition. MAX_ASCII_CHAR is an exclusive upper bound:
    // it is the size of the transition table.
    private static final int MIN_ASCII_CHAR = 0;
    private static final int MAX_ASCII_CHAR = 128;

    // The recognized punctuation characters.
    private static final int[] PUNCTUATION =
    {
        '!', '"', '#', '$', '%', '&', '\'', '(', ')', '*',
        '+', ',', '-', '.', '/', ':', ';',  '<', '=', '>',
        '?', '@', '[', '\\', ']', '^', '_', '`', '{', '}',
        '|', '~'
    };

    //-----------------------------------------------------------
    // Statics.
    //

    /**
     * Logging subsystem interface.
     */
    private static final Logger sLogger =
        LoggerFactory.getLogger(TokenLexer.class);

    // Use this array to convert a predefined token type from
    // its integer ID back to a human-readable name.
    private static final String[] sTypeName;

    // Maps each ASCII character (by character code) to the lexer
    // FSM transition method issued for that character.
    private static final Method[] sTransMethod;

    static
    {
        // Name of the transition currently being looked up. It
        // is declared outside the try block so that the catch
        // clause can report which lookup failed.
        String transName = "";

        sTypeName = new String[TOKEN_COUNT];
        sTypeName[ERROR]         = "ERROR";
        sTypeName[EOF]           = "EOF";
        sTypeName[CHARACTER]     = "CHARACTER";
        sTypeName[COMMENT]       = "COMMENT";
        sTypeName[FLOAT]         = "FLOAT";
        sTypeName[INTEGER]       = "INTEGER";
        sTypeName[NAME]          = "NAME";
        sTypeName[OPERATOR]      = "OPERATOR";
        sTypeName[SOURCE]        = "SOURCE";
        sTypeName[STRING]        = "STRING";

        // Set up the transition map.
        sTransMethod = new Method[MAX_ASCII_CHAR];

        try
        {
            int i;
            final Class<TokenLexerContext> fsmClass =
                TokenLexerContext.class;
            final Method unicode;
            final Method whitespace;
            final Method alpha;
            final Method digit;
            final Method punctuation;
            final Method eol;

            transName = "unicode";
            unicode =
                fsmClass.getDeclaredMethod(
                    transName, char.class);

            transName = "whitespace";
            whitespace =
                fsmClass.getDeclaredMethod(
                    transName, char.class);

            transName = "alpha";
            alpha =
                fsmClass.getDeclaredMethod(
                    transName, char.class);

            transName = "digit";
            digit =
                fsmClass.getDeclaredMethod(
                    transName, char.class);

            transName = "punctuation";
            punctuation =
                fsmClass.getDeclaredMethod(
                    transName, char.class);

            transName = "EOL";
            eol =
                fsmClass.getDeclaredMethod(
                    transName, char.class);

            // Start with every character mapped to either the
            // whitespace or the unicode transition and then
            // overwrite the entries for the known characters.
            for (i = MIN_ASCII_CHAR; i < MAX_ASCII_CHAR; ++i)
            {
                sTransMethod[i] =
                    (Character.isWhitespace(i) ?
                     whitespace :
                     unicode);
            }

            for (i = 'a'; i <= 'z'; ++i)
            {
                sTransMethod[i] = alpha;
            }

            for (i = 'A'; i <= 'Z'; ++i)
            {
                sTransMethod[i] = alpha;
            }

            for (i = '0'; i <= '9'; ++i)
            {
                sTransMethod[i] = digit;
            }

            // Only new line and carriage return are recognized
            // as end-of-line. This must come after the
            // whitespace pass above because both characters are
            // also whitespace.
            sTransMethod[EOL] = eol;
            sTransMethod[CR] = eol;

            // Punctuation characters.
            for (i = 0; i < PUNCTUATION.length; ++i)
            {
                sTransMethod[PUNCTUATION[i]] = punctuation;
            }
        }
        catch (NoSuchMethodException | SecurityException jex)
        {
            // Report the failed lookup together with its cause;
            // the trailing exception argument is logged by SLF4J
            // as the throwable.
            sLogger.error(
                "INITIALIZATION ERROR! Unable to access TokenLexerContext.{}(char).",
                transName,
                jex);
        }
    } // end of static

    //-----------------------------------------------------------
    // Locals.
    //

    // The lexer's state map. Created anew by input(Reader).
    private TokenLexerContext mLexerFSM;

    // The input being tokenized.
    private Reader mReader;

    // Either we are in "cooked" mode and looking for tokens or
    // we are in "raw" mode and are not processing the characters
    // but simply collecting them.
    private LexMode mMode;

    // Stop the event loop - a token has been found.
    private boolean mStopFlag;

    // Store the latest token in this object.
    private Token mToken;

    // Collect the token in a string buffer before making a
    // string out of it.
    private final StringBuilder mTokenBuffer;

    // The lexer is processing this line.
    private int mLineNumber;

    // Read in a buffer-full of data rather than one character
    // at a time.
    private final char[] mReadBuffer;

    // The actual number of read characters in the buffer.
    // May be less than the buffer's size.
    private int mBufferSize;

    // Index of the next character to be processed.
    private int mReadIndex;

    // The offset into the input.
    private int mOffset;

    // True when the end-of-file is reached.
    private boolean mEofFlag;

    // The open and close delimiters used when collecting a
    // "raw" token. mOpenChar may be NO_OPEN_CHAR.
    private char mOpenChar;
    private char mCloseChar;

    // Maps keyword to token type.
    private final Map<String, Integer> mKeywords;

    // Maps operators to token type.
    private final Map<String, Integer> mOperators;

    // Maps delimiters to token type.
    private final Map<Character, Integer> mDelimiters;

//---------------------------------------------------------------
// Member methods.
//

    //-----------------------------------------------------------
    // Constructors.
    //

    /**
     * Creates a token lexer using the specified keywords,
     * operators and delimiters. These maps may be empty but not
     * {@code null}. The lexer starts in
     * {@link LexMode#COOKED cooked} mode and has no input until
     * {@link #input(Reader)} is called.
     * @param keywords Keyword to integer identifier mapping.
     * @param operators Operator to integer identifier mapping.
     * @param delimiters Delimiter to integer identifier mapping.
     * @exception NullPointerException
     * if any of the user maps is {@code null}.
     * @exception IllegalArgumentException
     * if any of the user maps contains a value &lt;
     * {@link net.sf.eBus.text.TokenLexer#NEXT_TOKEN}.
     */
    public TokenLexer(final Map<String, Integer> keywords,
                      final Map<String, Integer> operators,
                      final Map<Character, Integer> delimiters)
    {
        // Fail fast and name the offending argument rather than
        // letting an anonymous NullPointerException escape from
        // the validation below.
        java.util.Objects.requireNonNull(
            keywords, "keywords is null");
        java.util.Objects.requireNonNull(
            operators, "operators is null");
        java.util.Objects.requireNonNull(
            delimiters, "delimiters is null");

        // Check the maps validity before storing them.
        validate(keywords.values(), "keywords");
        validate(operators.values(), "operators");
        validate(delimiters.values(), "delimiters");

        mKeywords = keywords;
        mOperators = operators;
        mDelimiters = delimiters;

        // No input yet: the reader and state machine are set by
        // input(Reader).
        mReader = null;
        mLexerFSM = null;

        mTokenBuffer = new StringBuilder();
        mReadBuffer = new char[READ_BUFFER_LEN];
        mBufferSize = 0;
        mReadIndex = 0;
        mOffset = 0;
        mLineNumber = 0;
        mEofFlag = false;

        // We are in the "cooked" processing mode by default.
        mMode = LexMode.COOKED;
    } // end of TokenLexer(Map, Map, Map)

    //
    // end of Constructors.
    //-----------------------------------------------------------

    //-----------------------------------------------------------
    // Get methods.
    //

    /**
     * Returns the line number currently being tokenized. The
     * count is reset to zero by {@link #input(Reader)} and is
     * incremented each time a line feed character is read.
     * @return the current line number.
     */
    public int lineNumber()
    {
        return mLineNumber;
    } // end of lineNumber()

    /**
     * Returns the current character offset into the input. The
     * offset is reset to zero by {@link #input(Reader)}.
     * @return the current offset into the input.
     */
    public int offset()
    {
        return mOffset;
    } // end of offset()

    /**
     * Returns the mode the lexer is currently operating in.
     * @return either {@link LexMode#COOKED} or
     * {@link LexMode#RAW}.
     * @see #rawMode(char, char)
     * @see #cookedMode()
     */
    public LexMode mode()
    {
        return mMode;
    } // end of mode()

    //
    // end of Get methods.
    //-----------------------------------------------------------

    //-----------------------------------------------------------
    // Set methods.
    //

    /**
     * Sets the input from which tokens are extracted. The
     * read buffer position, input offset, line number and
     * end-of-file flag are all reset and a new lexer state
     * machine is created and placed in its start state. The
     * lexer mode is not changed.
     * @param reader Tokenize this input.
     */
    public void input(final Reader reader)
    {
        // Forget everything about the previous input.
        mEofFlag = false;
        mLineNumber = 0;
        mOffset = 0;
        mReadIndex = 0;
        mBufferSize = 0;
        mReader = reader;

        // Each input is tokenized by its own state machine.
        final TokenLexerContext fsm =
            new TokenLexerContext(this);

        mLexerFSM = fsm;
        fsm.enterStartState();
    } // end of input(Reader)

    /**
     * Switches the lexer to raw tokenization. Subsequent calls
     * to {@link #nextToken()} collect characters up to the
     * closing delimiter instead of identifying token types.
     * @param openChar The open clause delimiter. Pass
     * {@link #NO_OPEN_CHAR} if there is no open delimiter.
     * @param closeChar The close clause delimiter.
     * @see #cookedMode()
     */
    public void rawMode(final char openChar,
                        final char closeChar)
    {
        mOpenChar = openChar;
        mCloseChar = closeChar;
        mMode = LexMode.RAW;
    } // end of rawMode(char, char)

    /**
     * Switches the lexer back to cooked tokenization and
     * discards whatever text is in the token buffer.
     * @see #rawMode(char, char)
     */
    public void cookedMode()
    {
        mTokenBuffer.setLength(0);
        mMode = LexMode.COOKED;
    } // end of cookedMode()

    //
    // end of Set methods.
    //-----------------------------------------------------------

    /**
     * Returns the next token found in the input stream. Once
     * the end of the input has been reached, every call returns
     * a {@link net.sf.eBus.text.TokenLexer#EOF} token. Otherwise
     * the token is extracted according to the current
     * {@link #mode() lexer mode}.
     * @return the next token found in the input stream.
     * @throws IllegalStateException
     * if input reader is not set.
     */
    public Token nextToken()
    {
        if (mReader == null)
        {
            throw (new IllegalStateException("reader not set"));
        }

        // Nothing left to read: keep reporting end-of-file.
        if (mEofFlag)
        {
            return (new Token(EOF, null, "", mLineNumber));
        }

        return (mMode == LexMode.COOKED ?
                nextCookedToken() :
                nextRawToken());
    } // end of nextToken()

    //-----------------------------------------------------------
    // State Machine Actions
    //

    // Returns the text collected so far for the current token.
    /* package */ String token()
    {
        final String collected = mTokenBuffer.toString();

        return collected;
    } // end of token()

    /**
     * Prepares for collecting a new token: empties the token
     * buffer and drops the previously found token.
     */
    /* package */ void startToken()
    {
        mTokenBuffer.setLength(0);
        mToken = null;
    } // end of startToken()

    /**
     * Appends the character to the end of the token buffer.
     * @param c append this character.
     */
    /* package */ void appendToken(final char c)
    {
        mTokenBuffer.append(c);
    } // end of appendToken(char)

    /**
     * Completes the current token: converts the collected token
     * text into a {@code Token} of the given type and stops the
     * tokenizing loop. {@code NAME} and {@code OPERATOR} tokens
     * are looked up in the user-defined maps; {@code CHARACTER},
     * {@code FLOAT} and {@code INTEGER} token text is converted
     * to the corresponding Java value. All other types use the
     * token text itself as the token value.
     * @param type the token type detected by the state machine.
     */
    /* package */ void endToken(final int type)
    {
        final String text = mTokenBuffer.toString();
        final Token result;

        if (type == NAME)
        {
            // May turn out to be a user-defined keyword.
            result = nameToken(text);
        }
        else if (type == OPERATOR)
        {
            result = operatorToken(text);
        }
        else if (type == CHARACTER)
        {
            result =
                new Token(
                    type, text.charAt(0), text, mLineNumber);
        }
        else if (type == FLOAT)
        {
            result = floatToken(text);
        }
        else if (type == INTEGER)
        {
            result = longToken(text);
        }
        else
        {
            result = new Token(type, text, text, mLineNumber);
        }

        mToken = result;
        mStopFlag = true;
    } // end of endToken(int)

    // A malformed token has been detected. Sets the current
    // token to an ERROR token whose value is the given message
    // followed by the collected token text, and stops the
    // tokenizing loop.
    /* package */ void badToken(final String errorMsg)
    {
        final String detail = errorMessage(errorMsg);

        mToken = new Token(ERROR, detail, "", mLineNumber);
        mStopFlag = true;
    } // end of badToken(String)

    // Steps back one character in the input by moving both the
    // read index and the input offset back by one, so that the
    // character is read again by the next readChar() call.
    // This is usually done when one token is terminated by
    // another.
    /* package */ void ungetChar()
    {
        mReadIndex--;
        mOffset--;
    } // end of ungetChar()

    // Returns true if the user-defined delimiters map has an
    // entry for the character.
    /* package */ boolean isDelimiter(final char c)
    {
        return mDelimiters.containsKey(c);
    } // end of isDelimiter(char)

    // Returns the token type stored in the user-defined
    // delimiters map for the character.
    /* package */ int delimiterType(final char c)
    {
        return mDelimiters.get(c);
    } // end of delimiterType(char)

    //
    // end of State Machine Actions
    //-----------------------------------------------------------

    // Returns the next cooked token. Characters are read one at
    // a time and fed to the lexer state machine as transitions
    // until the state machine reports a token (or a bad token).
    // Reaching the end of the input issues the EOF transition;
    // any other failure results in an ERROR token.
    private Token nextCookedToken()
    {
        // Clear out the token and get ready to work.
        startToken();
        mStopFlag = false;

        try
        {
            while (!mStopFlag)
            {
                final char c = readChar();

                // Characters outside the transition table
                // (value greater than 127) are given the
                // unicode transition and the lexer FSM decides
                // whether they are acceptable or not.
                if (c >= sTransMethod.length)
                {
                    mLexerFSM.unicode(c);
                }
                else
                {
                    // Translate character into a transition.
                    sTransMethod[c].invoke(mLexerFSM, c);

                    // Only the line feed advances the line
                    // number so that CR-LF counts as a single
                    // end-of-line.
                    if (c == EOL)
                    {
                        ++mLineNumber;
                    }
                }
            }
        }
        catch (EOFException eofex)
        {
            // If this is the end of the source file, let
            // the state machine know.
            mLexerFSM.EOF();
        }
        catch (InvocationTargetException |
               IllegalAccessException invokex)
        {
            // The transition could not be carried out. Keep the
            // cause available for diagnosis instead of silently
            // dropping it.
            sLogger.debug("lexer transition failed", invokex);

            badToken("Unknown token");
        }
        catch (IOException ioex)
        {
            // badToken() adds the collected token text to the
            // message itself, so pass the bare I/O message.
            badToken(ioex.getMessage());
        }

        return (mToken);
    } // end of nextCookedToken()

    // Returns the next raw token: every character read, up to
    // and including the terminating close character, is
    // collected unanalyzed into a SOURCE token. The token's line
    // number is the line on which collection started. Reaching
    // the end of the input before the terminating close
    // character results in an ERROR token.
    private Token nextRawToken()
    {
        final int startLine = mLineNumber;

        // Clear out the token and get ready to work.
        startToken();

        // Keep reading until told to stop or the
        // end-of-file is reached.
        try
        {
            int depth = 0;

            mStopFlag = false;
            while (!mStopFlag)
            {
                final char c = readChar();

                // Every character, delimiters included, is part
                // of the raw token.
                mTokenBuffer.append(c);

                // When working in RAW mode, the close character
                // may naturally occur inside the raw text. The
                // matching open characters are counted to tell
                // such nested close characters from the
                // terminating one: each close character takes
                // one off the depth and collection stops as
                // soon as the depth is zero or less.
                if (c == mCloseChar)
                {
                    --depth;
                    mStopFlag = (depth <= 0);
                }
                // An open character adds one to the depth. This
                // is skipped when there is no open character.
                else if (mOpenChar != NO_OPEN_CHAR &&
                         c == mOpenChar)
                {
                    ++depth;
                }
                // Only the line feed advances the line number
                // so that CR-LF counts as a single end-of-line.
                else if (c == EOL)
                {
                    ++mLineNumber;
                }
            }

            final String value = mTokenBuffer.toString();

            mToken = new Token(SOURCE, value, value, startLine);
        }
        catch (EOFException eofex)
        {
            final StringBuilder msg = new StringBuilder();

            msg.append(
                "User source code contains an unbalanced ");
            msg.append(mOpenChar);
            msg.append(", ");
            msg.append(mCloseChar);
            msg.append(" pair.");

            // If this is the end of the source file, then the
            // raw code section has an unbalanced open character/
            // close character pair.
            badToken(msg.toString());
        }
        catch (IOException ioex)
        {
            // badToken() adds the collected token text to the
            // message itself, so pass the bare I/O message.
            badToken(ioex.getMessage());
        }

        return (mToken);
    } // end of nextRawToken()

    // Returns either a NAME token or, when the upper-cased
    // token text is a key in the user-defined keywords map, a
    // token of the keyword's type whose value is the upper-cased
    // text.
    private Token nameToken(final String token)
    {
        // Keywords are looked up by their all uppercase form.
        final String key = token.toUpperCase(Locale.US);
        final int type;
        final Object value;

        if (!mKeywords.containsKey(key))
        {
            // An ordinary name.
            type = NAME;
            value = token;
        }
        else
        {
            // The map value is the keyword's token type.
            type = mKeywords.get(key);
            value = key;
        }

        return (new Token(type, value, token, mLineNumber));
    } // end of nameToken(String)

    // Returns a token of the type stored in the user-defined
    // operators map for the token text or, when the text is not
    // in that map, an ERROR token.
    private Token operatorToken(final String token)
    {
        final int type;
        final Object value;

        if (mOperators.containsKey(token))
        {
            // A known operator: use the user's token type.
            type = mOperators.get(token);
            value = token;
        }
        else
        {
            // Not a known operator. Change this to an error.
            type = ERROR;
            value = "unknown operator";
        }

        return (new Token(type, value, token, mLineNumber));
    } // end of operatorToken(String)

    // Returns a FLOAT token whose value is the Double parsed
    // from the token text or, when the text cannot be parsed,
    // an ERROR token describing the failure.
    private Token floatToken(final String token)
    {
        int type;
        Object value;

        try
        {
            value = Double.valueOf(token);
            type = FLOAT;
        }
        catch (NumberFormatException formex)
        {
            value = "invalid float, " + formex.getMessage();
            type = ERROR;
        }

        return (new Token(type, value, token, mLineNumber));
    } // end of floatToken(String)

    // Returns an INTEGER token whose value is the Long parsed
    // from the token text or, when the text cannot be parsed,
    // an ERROR token describing the failure. A leading '+' is
    // removed before parsing and is not part of the returned
    // token's text.
    private Token longToken(final String initialToken)
    {
        // Drop the leading plus sign, if there is one.
        final String token =
            (initialToken.charAt(0) == '+' ?
             initialToken.substring(1) :
             initialToken);
        int type;
        Object value;

        try
        {
            value = Long.valueOf(token);
            type = INTEGER;
        }
        catch (NumberFormatException formex)
        {
            value = "invalid integer, " + formex.getMessage();
            type = ERROR;
        }

        return (new Token(type, value, token, mLineNumber));
    } // end of longToken(String)

    // Returns the next input character. This routine actually
    // reads a large buffer-full at a time and returns the next
    // character from there. The idea is to do a few large,
    // efficient reads and make single character reads array
    // retrievals.
    // NOTE: this lexer backs up at most two characters. So
    // when reading in a new buffer-full, the last (up to) two
    // characters of the old buffer contents are first copied to
    // the front of the buffer and the new characters are read
    // in behind them.
    // Throws EOFException (and sets the end-of-file flag) when
    // the reader reports the end of the input.
    private char readChar()
        throws IOException
    {
        // If we are at the end of the buffer, read the
        // next buffer-full.
        if (mReadIndex == mBufferSize)
        {
            // Carry over as many characters as the lexer may
            // back up, or the entire buffer contents if there
            // are fewer than that. A buffer holding exactly one
            // or two characters must be carried over in full.
            final int carry =
                Math.min(mBufferSize, BUFFER_OFFSET);

            System.arraycopy(mReadBuffer,
                             (mBufferSize - carry),
                             mReadBuffer,
                             0,
                             carry);

            final int size =
                mReader.read(mReadBuffer,
                             carry,
                             (MAX_BUFFER_LEN - carry));

            // Has end of file been reached?
            if (size < 0)
            {
                // Yes.
                mBufferSize = 0;
                mEofFlag = true;

                throw (new EOFException("end-of-file reached"));
            }

            // The buffer's true size is the number of
            // characters read plus the carried over characters.
            // Reading resumes right behind the carried over
            // characters.
            mBufferSize = size + carry;
            mReadIndex = carry;
        }

        final char retval = mReadBuffer[mReadIndex];

        ++mReadIndex;
        ++mOffset;

        return (retval);
    } // end of readChar()

    // Makes sure the token type values are in the user-defined
    // zone, that is, greater than or equal to NEXT_TOKEN. Throws
    // IllegalArgumentException naming the offending map and
    // value for the first value which is null or less than
    // NEXT_TOKEN. Called for effect only.
    private void validate(final Collection<Integer> values,
                          final String name)
    {
        for (Integer value : values)
        {
            // A missing type is as invalid as one which collides
            // with the predefined token types.
            if (value == null || value < NEXT_TOKEN)
            {
                throw (
                    new IllegalArgumentException(
                        String.format(
                            "invalid %s token type (%s)",
                            name,
                            value)));
            }
        }
    } // end of validate(Collection, String)

    /**
     * Builds an error message consisting of the given prefix
     * followed by the text currently held in the token buffer.
     * @param prefix prepend error message with this text.
     * @return token error message.
     */
    private String errorMessage(final String prefix)
    {
        final String collected = mTokenBuffer.toString();

        return (prefix + "  (token: " + collected + ")");
    } // end of errorMessage(String)
} // end of class TokenLexer