//
// Copyright 2001 - 2008 Charles W. Rapp
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
package net.sf.eBus.text;
import java.io.EOFException;
import java.io.IOException;
import java.io.Reader;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.util.Collection;
import java.util.Locale;
import java.util.Map;
import java.util.logging.Logger;
/**
* Provides a generalized token lexer capability. This lexer
* goes beyond {@code java.util.StringTokenizer} in that
* it identifies the token type along with the token and converts
* the token string into the type's corresponding Java instance.
* There are eight (8) pre-defined token types and two special
* types: {@link net.sf.eBus.text.TokenLexer#ERROR} and
* {@link net.sf.eBus.text.TokenLexer#EOF}. {@code ERROR} is
* returned when a recoverable error occurs. {@code EOF} is
* returned when the input end is reached and no more tokens will
* be returned.
*
* <p>The pre-defined token types are:</p>
* <ul>
*   <li>
*     {@link net.sf.eBus.text.TokenLexer#CHARACTER}:
*     a single character between single quotes (').
*   </li>
*   <li>
*     {@link net.sf.eBus.text.TokenLexer#COMMENT}:
*     either a {@code //} or a slash-star comment.
*     Nested comments are supported.
*   </li>
*   <li>
*     {@link net.sf.eBus.text.TokenLexer#FLOAT}:
*     a decimal number.
*   </li>
*   <li>
*     {@link net.sf.eBus.text.TokenLexer#INTEGER}:
*     an integer number.
*   </li>
*   <li>
*     {@link net.sf.eBus.text.TokenLexer#NAME}:
*     an alphanumeric identifier.
*   </li>
*   <li>
*     {@link net.sf.eBus.text.TokenLexer#OPERATOR}:
*     a punctuation-only identifier.
*   </li>
*   <li>
*     {@link net.sf.eBus.text.TokenLexer#SOURCE}:
*     raw, unanalyzed input.
*   </li>
*   <li>
*     {@link net.sf.eBus.text.TokenLexer#STRING}:
*     zero or more characters between double quotes (").
*   </li>
* </ul>
*
* <p>
* There is support for user-defined keyword, operator and
* delimiter tokens. When a {@link net.sf.eBus.text.TokenLexer#NAME}
* token is found, the user keywords map is checked to see whether
* it contains the token as a keyword. If so, then the associated
* token type is returned instead of {@code NAME}. When a
* {@link net.sf.eBus.text.TokenLexer#OPERATOR} token is found,
* both the user operators and delimiters maps are checked.
* </p>
* <p>
* The user-defined token maps should meet the following
* criteria:
* </p>
* <ul>
*   <li>
*     The token type values must be &gt;=
*     {@link net.sf.eBus.text.TokenLexer#NEXT_TOKEN}.
*   </li>
*   <li>
*     The token type values do not need to be unique, either
*     within or across maps.
*   </li>
*   <li>
*     The token type values do not need to be consecutive.
*   </li>
* </ul>
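* <p>
* For example, a client might populate these maps as follows.
* This is a sketch only; the token names and values below are
* illustrative, not part of the eBus API:
* </p>
* <pre>
*   // Hypothetical user-defined token types.
*   public static final int IF_TOKEN = TokenLexer.NEXT_TOKEN;
*   public static final int ASSIGN_TOKEN = TokenLexer.NEXT_TOKEN + 1;
*   public static final int SEMICOLON_TOKEN = TokenLexer.NEXT_TOKEN + 2;
*   ...
*   keywords.put("IF", IF_TOKEN);      // Keywords are stored upper case.
*   operators.put("=", ASSIGN_TOKEN);
*   delimiters.put(';', SEMICOLON_TOKEN);
* </pre>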
* <p>
* The basic algorithm for using {@code TokenLexer} is:
* </p>
* <pre>
*   import java.io.Reader;
*   import net.sf.eBus.text.TokenLexer;
*   import net.sf.eBus.text.Token;
*   ...
*   TokenLexer lexer = new TokenLexer(keywords, operators, delimiters);
*   Token token;
*   Reader input = ...;
*
*   // Set the input to be tokenized.
*   lexer.input(input);
*
*   // Continue retrieving until no more tokens.
*   while ((token = lexer.nextToken()).type() != TokenLexer.EOF)
*   {
*       // Process the next token based on token type.
*   }
*
*   // Finish up the tokenization.
* </pre>
*
* <p><b>Raw Lexical Mode</b></p>
* <p>
* Users may not want the lexer to analyze the input between two
* well-defined delimiters. This data is collected and returned
* as a {@link net.sf.eBus.text.TokenLexer#SOURCE} token when the
* terminating delimiter is reached. Raw mode requires that a
* closing delimiter be specified; an opening delimiter is
* optional ({@link net.sf.eBus.text.TokenLexer#NO_OPEN_CHAR}
* means there is none). When an opening delimiter is given, the
* lexer tracks the appearance of nested delimiter pairs within
* the input and returns only when the top-level terminating
* delimiter is found.
* </p>
* <p>
* Raw lexical mode is used when the input contains sub-text to
* be handled by a different lexer.
* </p>
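* <p>
* For example, to collect everything between a balanced pair of
* braces as a single {@code SOURCE} token (a sketch; the
* surrounding parse logic is application-specific):
* </p>
* <pre>
*   // Switch to raw mode before the opening '{' is read so
*   // that nested brace pairs are tracked correctly.
*   lexer.rawMode('{', '}');
*
*   // The returned token's type is TokenLexer.SOURCE and its
*   // value contains the braced text, outer braces included.
*   Token source = lexer.nextToken();
*
*   // Return to normal tokenization.
*   lexer.cookedMode();
* </pre>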
*
* @author Charles Rapp
*/
@SuppressWarnings("unchecked")
public final class TokenLexer
{
//---------------------------------------------------------------
// Enums.
//
/**
* The lexer will either analyze the tokens identifying the
* type or collect raw input until a terminating delimiter
* is found.
*/
public enum LexMode
{
/**
* When in cooked mode identify the token type.
*/
COOKED,
/**
* When in raw mode, collect characters until the
* terminating delimiter is found.
* RAW is used to read in all characters between parens,
* braces, etc. RAW mode will read the entire remaining
* input if the open and close delimiters are mismatched.
*/
RAW
} // end of enum LexMode
//---------------------------------------------------------------
// Member data.
//
//-----------------------------------------------------------
// Constants.
//
/**
* When the raw mode open character is set to U+0000, this
* means there is no open character, only a close character.
*/
public static final char NO_OPEN_CHAR = 0;
// Read in this many bytes at a time into the buffer.
private static final int MAX_BUFFER_LEN = 4096;
// Read into the input buffer starting at this offset.
private static final int BUFFER_OFFSET = 2;
// The read buffer's allocated size in bytes.
private static final int READ_BUFFER_LEN =
MAX_BUFFER_LEN + BUFFER_OFFSET;
// New line characters.
private static final int EOL = 10;
private static final int CR = 13;
// Each token type has an integer value. These token type
// values are package-wide scope so the parser can access
// them.
/**
* An error occurred when seeking the next token (0).
*/
public static final int ERROR = 0;
/**
* The end of the input is reached (1).
*/
public static final int EOF = 1;
/**
* A single-quoted character token (2). Token value is a
* {@code java.lang.Character} instance.
*/
public static final int CHARACTER = 2;
/**
* Either a {@code //} or a slash-star
* comment (3). Nested comments are supported.
*/
public static final int COMMENT = 3;
/**
* A floating point number (4). Token value is a
* {@code java.lang.Double} instance.
*/
public static final int FLOAT = 4;
/**
* An integer number (5). Token value is a
* {@code java.lang.Long} instance.
*/
public static final int INTEGER = 5;
/**
* An alphanumeric identifier (6). If the token appears in
* the user-defined keywords map, then the user's token type
* is returned instead.
*/
public static final int NAME = 6;
/**
* Token consists solely of punctuation characters (7).
* If the token is in the user-defined operator or
* delimiter map, then the user's token type is returned
* instead.
* <p>
* Punctuation characters are:
* </p>
* <pre>
*   ! " # $ % &amp; ' ( ) *
*   + , - . / : ; &lt; = &gt;
*   ? @ [ \ ] ^ _ ` { }
*   | ~
* </pre>
*/
public static final int OPERATOR = 7;
/**
* Raw, unanalyzed input (8).
* @see net.sf.eBus.text.TokenLexer.LexMode#RAW
*/
public static final int SOURCE = 8;
/**
* A double-quoted string (9).
*/
public static final int STRING = 9;
/**
* There are ten (10) predefined token types.
*/
public static final int TOKEN_COUNT = STRING + 1;
/**
* User-defined tokens must be &gt;= {@code TOKEN_COUNT} (10).
*/
public static final int NEXT_TOKEN = TOKEN_COUNT;
// The ASCII characters all have explicit transitions.
// Unicode characters are simply given the unicode
// transition.
private static final int MIN_ASCII_CHAR = 0;
private static final int MAX_ASCII_CHAR = 128;
// The recognized punctuation characters.
private static final int[] PUNCTUATION =
{
'!', '"', '#', '$', '%', '&', '\'', '(', ')', '*',
'+', ',', '-', '.', '/', ':', ';', '<', '=', '>',
'?', '@', '[', '\\', ']', '^', '_', '`', '{', '}',
'|', '~'
};
//-----------------------------------------------------------
// Statics.
//
/**
* Logging subsystem interface.
*/
private static final Logger sLogger =
Logger.getLogger(TokenLexer.class.getName());
// Use this array to convert a token type from integer ID
// back to a human-readable name.
private static String[] sTypeName;
// Create an array which maps ASCII characters to
// transitions.
private static Method[] sTransMethod;
static
{
String transName = "";
sTypeName = new String[TOKEN_COUNT];
sTypeName[ERROR] = "ERROR";
sTypeName[CHARACTER] = "CHARACTER";
sTypeName[COMMENT] = "COMMENT";
sTypeName[EOF] = "EOF";
sTypeName[FLOAT] = "FLOAT";
sTypeName[INTEGER] = "INTEGER";
sTypeName[NAME] = "NAME";
sTypeName[OPERATOR] = "OPERATOR";
sTypeName[SOURCE] = "SOURCE";
sTypeName[STRING] = "STRING";
// Set up the transition map and token types.
sTransMethod = new Method[TokenLexer.MAX_ASCII_CHAR];
try
{
int i;
final Class<?> fsmClass = TokenLexerContext.class;
Method unicode;
Method whitespace;
Method alpha;
Method digit;
Method punctuation;
Method eol;
transName = "unicode";
unicode =
fsmClass.getDeclaredMethod(
transName, char.class);
transName = "whitespace";
whitespace =
fsmClass.getDeclaredMethod(
transName, char.class);
transName = "alpha";
alpha =
fsmClass.getDeclaredMethod(
transName, char.class);
transName = "digit";
digit =
fsmClass.getDeclaredMethod(
transName, char.class);
transName = "punctuation";
punctuation =
fsmClass.getDeclaredMethod(
transName, char.class);
transName = "EOL";
eol =
fsmClass.getDeclaredMethod(
transName, char.class);
// Set all transitions to unicode and then set known
// characters to other transitions.
for (i = MIN_ASCII_CHAR; i < MAX_ASCII_CHAR; ++i)
{
if (Character.isWhitespace(i))
{
sTransMethod[i] = whitespace;
}
else
{
sTransMethod[i] = unicode;
}
}
for (i = 'a'; i <= 'z'; ++i)
{
sTransMethod[i] = alpha;
}
for (i = 'A'; i <= 'Z'; ++i)
{
sTransMethod[i] = alpha;
}
for (i = '0'; i <= '9'; ++i)
{
sTransMethod[i] = digit;
}
// Only new line and carriage return are recognized
// as end-of-line.
sTransMethod[EOL] = eol;
sTransMethod[CR] = eol;
// Punctuation characters.
for (i = 0; i < PUNCTUATION.length; ++i)
{
sTransMethod[PUNCTUATION[i]] = punctuation;
}
}
catch (NoSuchMethodException | SecurityException jex)
{
sLogger.severe(
String.format(
"INITIALIZATION ERROR! No such method as TokenLexerContext.%s(char).%n",
transName));
}
} // end of static
//-----------------------------------------------------------
// Locals.
//
// The lexer's state map.
private TokenLexerContext mLexerFSM;
// The file being parsed.
private Reader mReader;
// Either we are in "cooked" mode and looking for tokens or
// we are in "raw" mode and are not processing the characters
// but simply collecting them.
private LexMode mMode;
// Stop the event loop - a token has been found.
private boolean mStopFlag;
// Store the latest token in this object.
private Token mToken;
// Collect the token in a string buffer before making a
// string out of it.
private final StringBuilder mTokenBuffer;
// The lexer is processing this line.
private int mLineNumber;
// Read in a buffer-full of data rather than one character
// at a time.
private final char[] mReadBuffer;
// The actual number of read characters in the buffer.
// May be less than the buffer's size.
private int mBufferSize;
// The next character to be processed.
private int mReadIndex;
// The offset into the input.
private int mOffset;
// True when the end-of-file is reached.
private boolean mEofFlag;
// When this flag is turned on, the parser wants us to
// collect a "raw" token. Keep track of nested clauses
// using the depth count.
private char mOpenChar;
private char mCloseChar;
// Maps keyword to token type.
private final Map mKeywords;
// Maps operators to token type.
private final Map mOperators;
// Maps delimiters to token type.
private final Map mDelimiters;
//---------------------------------------------------------------
// Member methods.
//
//-----------------------------------------------------------
// Constructors.
//
/**
* Creates a token lexer using the specified keywords,
* operators and delimiters maps. These maps may be
* empty but not {@code null}.
* @param keywords Keyword to integer identifier mapping.
* @param operators Operator to integer identifier mapping.
* @param delimiters Delimiter to integer identifier mapping.
* @exception IllegalArgumentException
* if any of the user maps contains a value &lt;
* {@link net.sf.eBus.text.TokenLexer#NEXT_TOKEN}.
*/
public TokenLexer(final Map<String, Integer> keywords,
final Map<String, Integer> operators,
final Map<Character, Integer> delimiters)
{
mReader = null;
mTokenBuffer = new StringBuilder();
mReadBuffer = new char[READ_BUFFER_LEN];
mBufferSize = 0;
mReadIndex = 0;
mOffset = 0;
mLineNumber = 0;
mEofFlag = false;
mKeywords = keywords;
mOperators = operators;
mDelimiters = delimiters;
// Check the maps validity.
validate(keywords.values(), "keywords");
validate(operators.values(), "operators");
validate(delimiters.values(), "delimiters");
// We are in the "cooked" processing mode by default.
mMode = LexMode.COOKED;
mLexerFSM = null;
} // end of TokenLexer(Map, Map, Map)
//
// end of Constructors.
//-----------------------------------------------------------
//-----------------------------------------------------------
// Get methods.
//
/**
* Returns the current line number being tokenized.
* @return the current line number being tokenized.
*/
public int lineNumber()
{
return (mLineNumber);
} // end of lineNumber()
/**
* Returns the current offset into the input.
* @return the current offset into the input.
*/
public int offset()
{
return (mOffset);
} // end of offset()
/**
* Returns the current lexer mode.
* @return the current lexer mode.
*/
public LexMode mode()
{
return (mMode);
} // end of mode()
//
// end of Get methods.
//-----------------------------------------------------------
//-----------------------------------------------------------
// Set methods.
//
/**
* Extract tokens from this input stream.
* @param reader Tokenize this input.
*/
public void input(final Reader reader)
{
mReader = reader;
mBufferSize = 0;
mReadIndex = 0;
mOffset = 0;
mLineNumber = 0;
mEofFlag = false;
mLexerFSM = new TokenLexerContext(this);
// mLexerFSM.setDebugFlag(true);
mLexerFSM.enterStartState();
} // end of input(Reader)
/**
* Switch to raw tokenization.
* @param openChar The open clause delimiter.
* @param closeChar The close clause delimiter.
* @see #cookedMode()
*/
public void rawMode(final char openChar,
final char closeChar)
{
mMode = LexMode.RAW;
mOpenChar = openChar;
mCloseChar = closeChar;
} // end of rawMode(char, char)
/**
* Switch back to cooked tokenization.
* @see #rawMode(char, char)
*/
public void cookedMode()
{
mMode = LexMode.COOKED;
mTokenBuffer.delete(0, mTokenBuffer.length());
} // end of cookedMode()
//
// end of Set methods.
//-----------------------------------------------------------
/**
* Returns the next token found in the input stream. If there
* are no more tokens in the input stream, then
* {@link net.sf.eBus.text.TokenLexer#EOF} is returned.
* @return the next token found in the input stream.
* @throws IllegalStateException
* if input reader is not set.
*/
public Token nextToken()
{
Token retval;
if (mReader == null)
{
throw (new IllegalStateException("reader not set"));
}
if (mEofFlag)
{
retval = new Token(EOF, null, "", mLineNumber);
}
else if (mMode == LexMode.COOKED)
{
retval = nextCookedToken();
}
else
{
retval = nextRawToken();
}
return (retval);
} // end of nextToken()
//-----------------------------------------------------------
// State Machine Actions
//
// Returns the current token.
/* package */ String token()
{
return (mTokenBuffer.toString());
} // end of token()
/**
* Clears out the collected token buffer and resets the token
* object to its initial state.
*/
/* package */ void startToken()
{
mToken = null;
mTokenBuffer.delete(0, mTokenBuffer.length());
} // end of startToken()
/**
* Appends character to the token.
* @param c append this character.
*/
/* package */ void appendToken(final char c)
{
mTokenBuffer.append(c);
} // end of appendToken(char)
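// Converts the collected token text into a typed Token
// instance and stops the tokenization loop. NAME, OPERATOR,
// CHARACTER, FLOAT and INTEGER tokens get type-specific
// handling; all other types carry the raw token string.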
/* package */ void endToken(final int type)
{
final String tokenStr = mTokenBuffer.toString();
// If this is a NAME type, then check if this name is
// actually a message type, data type or keyword.
switch (type)
{
case NAME:
mToken = nameToken(tokenStr);
break;
case OPERATOR:
mToken = operatorToken(tokenStr);
break;
case CHARACTER:
mToken =
new Token(type,
tokenStr.charAt(0),
tokenStr,
mLineNumber);
break;
case FLOAT:
mToken = floatToken(tokenStr);
break;
case INTEGER:
mToken = longToken(tokenStr);
break;
default:
mToken =
new Token(
type, tokenStr, tokenStr, mLineNumber);
break;
}
mStopFlag = true;
} // end of endToken(int)
// A malformed token has been detected.
/* package */ void badToken(final String errorMsg)
{
mToken =
new Token(
ERROR, errorMessage(errorMsg), "", mLineNumber);
// Stop tokenizing.
mStopFlag = true;
} // end of badToken(String)
// Back up one character in the file so that the character
// will be read again when nextToken() is called. This is
// usually done when one token is terminated by another.
/* package */ void ungetChar()
{
--mReadIndex;
--mOffset;
} // end of ungetChar()
// Returns true if there is a delimiter matching the
// character.
/* package */ boolean isDelimiter(final char c)
{
return (mDelimiters.containsKey(c));
} // end of isDelimiter(char)
// Returns the delimiter type.
/* package */ int delimiterType(final char c)
{
return (mDelimiters.get(c));
} // end of delimiterType(char)
//
// end of State Machine Actions
//-----------------------------------------------------------
// Returns the next cooked token.
private Token nextCookedToken()
{
char c;
// Clear out the token and get ready to work.
startToken();
try
{
mStopFlag = false;
while (!mStopFlag)
{
c = readChar();
// If the character's integer value is greater
// than 127, then issue a unicode transition
// and let the lexer FSM decide whether it is
// acceptable or not.
if (c >= sTransMethod.length)
{
mLexerFSM.unicode(c);
}
else
{
// Translate character into a transition.
sTransMethod[c].invoke(mLexerFSM, c);
// If this is an end-of-line character, add
// one to the current line number. CR-LF is
// a single end-of-line.
if (c == EOL)
{
++mLineNumber;
}
}
}
}
catch (EOFException e)
{
// If this is the end of the source file, let
// the parser know.
mLexerFSM.EOF();
}
catch (InvocationTargetException |
IllegalAccessException invokex)
{
badToken("Unknown token");
}
catch (IOException ioex)
{
badToken(errorMessage(ioex.getMessage()));
}
return (mToken);
} // end of nextCookedToken()
// Keep reading in characters until the close character is
// found.
private Token nextRawToken()
{
final int startLine = mLineNumber;
char c;
String value;
// Clear out the token and get ready to work.
startToken();
// Keep reading until told to stop or the
// end-of-file is reached.
try
{
int depth = 0;
mStopFlag = false;
while (!mStopFlag)
{
c = readChar();
// When working in RAW mode, the close character
// may naturally occur. The only way we have of
// knowing if the close character is for us or
// not is by keeping track of the matching open
// characters seen. When an open character is
// seen, add one to the depth. When a close
// character is seen, then either:
// + if the depth is zero, this is the end of the
// raw code; return the token.
// + if the depth is greater than zero, then
// subtract one from the depth.
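// Illustrative trace with open '{', close '}' and input
// "{ a { b } c }":
//   '{' -> depth 1, '{' -> depth 2, '}' -> depth 1,
//   '}' -> depth 0 -> stop. The SOURCE token contains the
//   entire braced text, outer braces included.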
mTokenBuffer.append(c);
if (c == mCloseChar)
{
--depth;
mStopFlag = (depth <= 0);
}
// If this is the open character, then
// add one to the depth which lets us
// know that the next close character
// does *not* end the raw code section.
else if (mOpenChar != NO_OPEN_CHAR &&
c == mOpenChar)
{
++depth;
}
// If this is an end-of-line character,
// add one to the current line number.
// CR-LF is a single end-of-line.
else if (c == EOL)
{
++mLineNumber;
}
}
value = mTokenBuffer.toString();
mToken = new Token(SOURCE, value, value, startLine);
}
catch (EOFException e)
{
final StringBuilder msg = new StringBuilder();
msg.append(
"User source code contains an unbalanced ");
msg.append(mOpenChar);
msg.append(", ");
msg.append(mCloseChar);
msg.append(" pair.");
// If this is the end of the source file, then the
// raw code section has an unbalanced open character/
// close character pair.
badToken(msg.toString());
}
catch (IOException ioex)
{
badToken(errorMessage(ioex.getMessage()));
}
return (mToken);
} // end of nextRawToken()
// Returns either a name token or a keyword token.
private Token nameToken(final String token)
{
int type = NAME;
Object value = token;
// Convert the token string to all uppercase since
// the message type and keywords are stored as such.
final String key = token.toUpperCase(Locale.US);
// Is this key a keyword?
if (mKeywords.containsKey(key))
{
// The returned value is an integer containing
// the token type. Extract the type and return
// the token string.
type = mKeywords.get(key);
value = key;
}
return (new Token(type, value, token, mLineNumber));
} // end of nameToken()
// Returns either an operator token or an error.
private Token operatorToken(final String token)
{
int type;
Object value = token;
// Is this a known operator?
if (!mOperators.containsKey(token))
{
// No, it is unknown. Change this to an error.
type = ERROR;
value = "unknown operator";
}
else
{
type = mOperators.get(token);
}
return (new Token(type, value, token, mLineNumber));
} // end of operatorToken(String)
// Returns either a float token or an error.
private Token floatToken(final String token)
{
int type = FLOAT;
Object value;
try
{
value = Double.valueOf(token);
}
catch (NumberFormatException formex)
{
final StringBuilder buffer = new StringBuilder();
buffer.append("invalid float, ");
buffer.append(formex.getMessage());
type = ERROR;
value = buffer.toString();
}
return (new Token(type, value, token, mLineNumber));
} // end of floatToken(String)
// Returns either a long token or an error.
private Token longToken(final String initialToken)
{
String token = initialToken;
int type = INTEGER;
Object value;
// If the first character is a '+', then strip that
// from the string.
if (token.charAt(0) == '+')
{
token = token.substring(1);
}
try
{
value = Long.valueOf(token);
}
catch (NumberFormatException formex)
{
final StringBuilder buffer = new StringBuilder();
buffer.append("invalid integer, ");
buffer.append(formex.getMessage());
type = ERROR;
value = buffer.toString();
}
return (new Token(type, value, token, mLineNumber));
} // end of longToken(String)
// Read the next character. Actually, this routine reads in
// a large buffer and returns the next character from there.
// The idea is to do a few large, efficient reads and make
// single-character reads simple array retrievals.
// NOTE: this lexer backs up at most two characters. So
// when reading in a new buffer, copy the last two characters
// to the first two positions and read in the next maximum
// number of characters.
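// Illustrative trace: if the exhausted buffer ends in
// ['q', 'r'], the refill leaves the buffer as
// ['q', 'r', <newly read characters>...] with mReadIndex = 2,
// so ungetChar() can still step back over 'r' and 'q'.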
private char readChar()
throws IOException
{
char retval;
// If we are at the end of the buffer, read the
// next buffer-full.
if (mReadIndex == mBufferSize)
{
int size;
int offset = 0;
int length;
// Copy the last two bytes to the first two bytes.
// Why? Because the lexer can back up to two bytes.
if (mBufferSize > 2)
{
offset = 2;
mReadBuffer[0] = mReadBuffer[mBufferSize - 2];
mReadBuffer[1] = mReadBuffer[mBufferSize - 1];
}
else if (mBufferSize > 1)
{
offset = 1;
mReadBuffer[0] = mReadBuffer[mBufferSize - 1];
}
length = (MAX_BUFFER_LEN - offset);
size = mReader.read(mReadBuffer, offset, length);
// Has end of file been reached?
if (size < 0)
{
// Yes.
mBufferSize = 0;
mEofFlag = true;
throw (new EOFException("end-of-file reached"));
}
else
{
// The buffer's true size is the number of bytes
// read plus the offset.
mBufferSize = size + offset;
mReadIndex = offset;
}
}
retval = mReadBuffer[mReadIndex];
++mReadIndex;
++mOffset;
return (retval);
} // end of readChar()
// Make sure the token type values are in the user-defined
// zone. Called for effect only.
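// For example, a keywords map entry with token type value 3
// would be rejected since 3 < NEXT_TOKEN (10).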
private void validate(final Collection<Integer> values,
final String name)
{
values.stream()
.filter(value -> (value < NEXT_TOKEN))
.forEachOrdered(
value ->
{
throw (
new IllegalArgumentException(
String.format(
"invalid %s token type (%s)",
name,
value)));
});
} // end of validate(Collection, String)
/**
* Returns an error message starting with the given prefix
* and containing the raw token buffer contents.
* @param prefix prepend error message with this text.
* @return token error message.
*/
private String errorMessage(final String prefix)
{
return (
String.format(
"%s (token: %s)",
prefix,
mTokenBuffer.toString()));
} // end of errorMessage(String)
} // end of class TokenLexer