// cdc.applic.expressions.parsing.Tokenizer (Maven / Gradle / Ivy artifact listing)
// Artifact: cdc-applic-expressions (all versions and documentation are listed with the artifact)
package cdc.applic.expressions.parsing;
import cdc.applic.expressions.LexicalException;
import cdc.applic.expressions.SyntacticException;
import cdc.applic.expressions.literals.EscapingUtils;
import cdc.util.lang.Checks;
/**
 * Implementation of Tokenizer.
 * <p>
 * This class analyzes an expression or piece of expression and identifies its tokens.
 * It does not check conformity of the expression to a grammar.
 * <p>
 * There are 2 modes of tokenization:
 * <ul>
 * <li>strict mode must be used when names, operators, ... are expected,</li>
 * <li>non strict mode must be used when values are expected
 * (right side of an equality operator or inside a set).</li>
 * </ul>
 * A typical usage would be:
 * <pre>{@code
 * final Tokenizer tokenizer = new Tokenizer();
 * tokenizer.init("my expression");
 * while (tokenizer.hasMoreTokens()) {
 *     final boolean strict = ...
 *     final Token token = tokenizer.nextToken(strict);
 *     // Do something with token
 * }
 * }</pre>
 *
 * @author Damien Carbonne
 */
public final class Tokenizer {
    /** The expression to tokenize. */
    private String expression;

    /** Type of the current token ({@code null} right after {@link #init(String)}). */
    private TokenType tokenType;

    /** Index of token first character (inclusive), {@code -1} right after {@link #init(String)}. */
    private int begin;

    /** Index of token last character (exclusive), {@code -1} right after {@link #init(String)}. */
    private int end;

    /**
     * Equivalent character array.
     * <p>
     * Accessing this array is faster than accessing expression (less checks).
     */
    private char[] chars;

    /**
     * Length of chars (chars.length).
     */
    private int charsLength;

    /**
     * Current analysis position.
     */
    private int pos;

    public Tokenizer() {
        super();
    }

    /**
     * @param c The char.
     * @return {@code true} if {@code c} is an ASCII digit ({@code '0'} to {@code '9'}).
     */
    private static boolean isDigit(char c) {
        return '0' <= c && c <= '9';
    }

    /**
     * Returns {@code true} when there is a char at an index.
     * <p>
     * Only the upper bound is checked: callers must pass a non-negative index.
     *
     * @param index The tested index.
     * @return {@code true} when there is a char at {@code index}.
     */
    private boolean hasCharAt(int index) {
        return index < charsLength;
    }

    /**
     * Returns {@code true} when a token boundary exists at a location.
     * <p>
     * A boundary exists at {@code index} when:
     * <ul>
     * <li>{@code index} is past the last char (end of expression),</li>
     * <li>or the char at {@code index} is a one-char separator
     * (in non strict mode, {@code '.'} is not considered as a boundary),</li>
     * <li>or the chars at {@code index} and {@code index + 1} form {@code "->"}.</li>
     * </ul>
     *
     * @param index The location.
     * @param strict If {@code true}, a strict interpretation is used.
     * @return {@code true} when a token boundary exists at {@code index}.
     */
    private boolean hasBoundaryAt(int index,
                                  boolean strict) {
        if (!hasCharAt(index)) {
            // End of chars
            return true;
        }
        if (OneCharSeparators.BEST_MATCHER.test(chars[index])) {
            return strict || chars[index] != '.';
        }
        return matchesAt(index, '-') && hasCharAt(index + 1) && matchesAt(index + 1, '>');
    }

    /**
     * Returns {@code true} when a number boundary exists at a location.
     * <p>
     * A number boundary is a non strict boundary: {@code '.'} never terminates a number.
     *
     * @param index The location.
     * @return {@code true} when a number boundary exists at {@code index}.
     */
    private boolean hasNumberBoundaryAt(int index) {
        return hasBoundaryAt(index, false);
    }

    /**
     * Increments {@code pos} while the designated character is a space.
     * <p>
     * After that, the character designated by {@code pos}, if it exists, is not a
     * space.
     */
    private void skipSpaces() {
        while (pos < charsLength && Spaces.BEST_MATCHER.test(chars[pos])) {
            pos++;
        }
    }

    /**
     * Increments {@code pos} while the designated character is a digit.
     * <p>
     * After that, the character designated by {@code pos}, if it exists, is not a
     * digit.
     *
     * @return The number of skipped digits.
     */
    private int skipDigits() {
        final int start = pos;
        while (pos < charsLength && isDigit(chars[pos])) {
            pos++;
        }
        return pos - start;
    }

    /**
     * Kind of number recognized by {@link #skipPossibleNumber()}.
     */
    private enum NumberType {
        INTEGER(TokenType.INTEGER),
        REAL(TokenType.REAL);

        private final TokenType tokenType;

        NumberType(TokenType tokenType) {
            this.tokenType = tokenType;
        }

        public TokenType getTokenType() {
            return tokenType;
        }
    }

    /**
     * Skip all chars that correspond to a number:
     * <ul>
     * <li>integer: {@code [0-9]+ [number boundary]}</li>
     * <li>real: {@code [0-9]+[.][0-9]+([eE][+-]?[0-9]+)?[number boundary]}</li>
     * </ul>
     * Must be called from {@link #next(boolean)}, after {@code begin} has been set,
     * either with {@code pos} designating the first digit (a sign was consumed),
     * or with {@code pos} designating the char that follows the first digit.
     * <p>
     * After that, {@code pos} designates the first character following the number.
     * <p>
     * Errors are reported from {@code begin} (start of the token, sign included)
     * to the position where analysis stopped.
     *
     * @return The number type.
     * @throws LexicalException When a number can not be parsed:
     *             it is malformed or not followed by a number boundary.
     */
    private NumberType skipPossibleNumber() {
        NumberType result = NumberType.INTEGER;
        // Remaining digits of the integer part (possibly none)
        skipDigits();
        if (hasCharAt(pos) && matchesAt(pos, '.')) {
            pos++;
            if (skipDigits() == 0) {
                throw new LexicalException(LexicalException.Detail.INVALID_NUMBER,
                                           "Real number must have decimals, at " + pos,
                                           expression,
                                           begin,
                                           pos);
            }
            if (hasCharAt(pos) && matchesAt(pos, 'e', 'E')) {
                pos++;
                // Optional exponent sign
                if (hasCharAt(pos) && matchesAt(pos, '+', '-')) {
                    pos++;
                }
                if (skipDigits() == 0) {
                    throw new LexicalException(LexicalException.Detail.INVALID_NUMBER,
                                               "Exponent must be followed by digits, at " + pos,
                                               expression,
                                               begin,
                                               pos);
                }
            }
            result = NumberType.REAL;
        }
        if (!hasNumberBoundaryAt(pos)) {
            throw new LexicalException(LexicalException.Detail.MISSING_BOUNDARY,
                                       "A number must be followed by boundary, at " + pos,
                                       expression,
                                       begin,
                                       pos);
        }
        return result;
    }

    /**
     * Increments {@code pos} until a boundary is found.
     *
     * @param strict If {@code true}, a strict interpretation of boundaries is used.
     */
    private void skipText(boolean strict) {
        while (pos < charsLength && !hasBoundaryAt(pos, strict)) {
            pos++;
        }
    }

    /**
     * Searches the closing delimiter of an escaped text.
     * <p>
     * It is a delimiter that is not followed by another delimiter
     * (a doubled delimiter is an escaped delimiter).<br>
     * Must be invoked with {@code pos} designating the first character after the opening delimiter.<br>
     * After that, {@code pos} designates the first character following the escaped text,
     * or the end of chars if no closing delimiter was found.
     *
     * @param delimiter The delimiter character.
     * @return {@code true} when closing delimiter has been found.
     */
    private boolean skipDelimitedText(char delimiter) {
        while (pos < charsLength) {
            final char current = chars[pos];
            pos++;
            if (current == delimiter) {
                if (pos < charsLength && chars[pos] == delimiter) {
                    // Doubled delimiter: skip it and continue exploration
                    pos++;
                } else {
                    // One char after closing delimiter
                    return true;
                }
            }
        }
        return false;
    }

    /**
     * Searches the closing '"' character.
     * <p>
     * It is a '"' not followed by another '"'.<br>
     * Must be invoked with {@code pos} designating the first character after the opening '"'.<br>
     * After that, {@code pos} designates the first character following the escaped text.
     *
     * @return {@code true} when closing '"' has been found.
     */
    private boolean skipDoubleQuotesText() {
        return skipDelimitedText('"');
    }

    /**
     * Searches the closing '$' character.
     * <p>
     * It is a '$' not followed by another '$'.<br>
     * Must be invoked with {@code pos} designating the first character after the opening '$'.<br>
     * After that, {@code pos} designates the first character following the dollar text.
     *
     * @return {@code true} when closing '$' has been found.
     */
    private boolean skipDollarText() {
        return skipDelimitedText('$');
    }

    /**
     * Returns {@code true} when the character at a given index corresponds to a given
     * character.
     * <p>
     * The index is not checked: callers must ensure that a char exists at {@code index}.
     *
     * @param index The tested index.
     * @param c The searched character.
     * @return {@code true} when the character at {@code index} corresponds to {@code c}.
     */
    private boolean matchesAt(int index,
                              char c) {
        return chars[index] == c;
    }

    /**
     * Returns {@code true} when the character at a given index corresponds to one of 2
     * given characters.
     * <p>
     * The index is not checked: callers must ensure that a char exists at {@code index}.
     *
     * @param index The tested index.
     * @param c1 The first searched character.
     * @param c2 The second searched character.
     * @return {@code true} when the character at {@code index} corresponds to {@code c1} or {@code c2}.
     */
    private boolean matchesAt(int index,
                              char c1,
                              char c2) {
        return chars[index] == c1 || chars[index] == c2;
    }

    /**
     * Initializes this tokenizer with an expression.
     * <p>
     * Leading spaces are skipped, so that {@link #hasMoreTokens()} is immediately meaningful.
     *
     * @param expression The expression to tokenize.
     * @throws IllegalArgumentException When {@code expression} is {@code null}.
     */
    public void init(String expression) {
        Checks.isNotNull(expression, "expression");
        this.expression = expression;
        this.chars = expression.toCharArray();
        this.charsLength = chars.length;
        this.pos = 0;
        this.begin = -1;
        this.end = -1;
        this.tokenType = null;
        skipSpaces();
    }

    /**
     * @return The expression being tokenized.
     */
    public String getExpression() {
        return expression;
    }

    /**
     * @return The current token type.
     */
    public TokenType getTokenType() {
        return tokenType;
    }

    /**
     * @return The begin index (inclusive) of current token.
     */
    public int getBeginIndex() {
        return begin;
    }

    /**
     * @return The end index (exclusive) of current token.
     */
    public int getEndIndex() {
        return end;
    }

    /**
     * @return The text of current token, or an empty string when the current token is empty.
     */
    public String getText() {
        if (end <= begin) {
            return "";
        }
        return expression.substring(getBeginIndex(), getEndIndex());
    }

    /**
     * @return The unescaped text of current token.
     *         Unescaping is only applied to double quotes and dollar texts.
     */
    public String getUnescapedText() {
        final String text = getText();
        if (tokenType == TokenType.DOUBLE_QUOTES_TEXT) {
            return EscapingUtils.unescapeDoubleQuotes(text);
        } else if (tokenType == TokenType.DOLLAR_TEXT) {
            return EscapingUtils.unescapeDollars(text);
        } else {
            return text;
        }
    }

    /**
     * @return The current token.
     */
    public Token getToken() {
        return new Token(tokenType, expression, begin, end);
    }

    /**
     * @return {@code true} when more tokens follow.
     */
    public boolean hasMoreTokens() {
        return pos < charsLength;
    }

    /**
     * Sets the current token type, consumes additional chars and ends the token at the new position.
     *
     * @param type The token type.
     * @param count The number of additional chars to consume (possibly 0).
     */
    private void accept(TokenType type,
                        int count) {
        tokenType = type;
        pos += count;
        end = pos;
    }

    /**
     * Accepts a recognized reserved word, which is interpreted as text in non strict mode.
     *
     * @param reserved The token type of the reserved word.
     * @param strict If {@code true}, token type is {@code reserved}, otherwise it is TEXT.
     * @param count The number of additional chars to consume.
     */
    private void acceptWord(TokenType reserved,
                            boolean strict,
                            int count) {
        accept(strict ? reserved : TokenType.TEXT, count);
    }

    /**
     * Consumes chars till next boundary and sets current token as TEXT.
     *
     * @param strict If {@code true}, a strict interpretation of boundaries is used.
     */
    private void acceptText(boolean strict) {
        skipText(strict);
        tokenType = TokenType.TEXT;
        end = pos;
    }

    /**
     * Consumes a number and sets current token as INTEGER or REAL.
     *
     * @throws LexicalException When the number is invalid.
     */
    private void acceptNumber() {
        tokenType = skipPossibleNumber().getTokenType();
        end = pos;
    }

    /**
     * Recognizes '!', '!=', '!<', '!<:', '!<=', '!>' and '!>=', the leading '!' being consumed.
     */
    private void scanNegation() {
        if (!hasCharAt(pos)) {
            // ! + nothing
            accept(TokenType.NOT, 0);
        } else if (matchesAt(pos, '=')) {
            // !=
            accept(TokenType.NOT_EQUAL, 1);
        } else if (matchesAt(pos, '<')) {
            if (hasCharAt(pos + 1) && matchesAt(pos + 1, ':')) {
                // !<:
                accept(TokenType.NOT_IN, 2);
            } else if (hasCharAt(pos + 1) && matchesAt(pos + 1, '=')) {
                // !<=
                accept(TokenType.NEITHER_LESS_NOR_EQUAL, 2);
            } else {
                // !< + nothing or non interesting
                accept(TokenType.NOT_LESS, 1);
            }
        } else if (matchesAt(pos, '>')) {
            if (hasCharAt(pos + 1) && matchesAt(pos + 1, '=')) {
                // !>=
                accept(TokenType.NEITHER_GREATER_NOR_EQUAL, 2);
            } else {
                // !> + nothing or non interesting
                accept(TokenType.NOT_GREATER, 1);
            }
        } else {
            // ! + non interesting
            accept(TokenType.NOT, 0);
        }
    }

    /**
     * Recognizes signed numbers, text starting with a sign, and optionally '->',
     * the leading sign being consumed.
     *
     * @param strict If {@code true}, a strict interpretation of boundaries is used.
     * @param arrowAllowed If {@code true} (sign is '-'), a following '>' yields IMPL.
     * @throws LexicalException When an invalid number is found.
     */
    private void scanSigned(boolean strict,
                            boolean arrowAllowed) {
        if (!hasCharAt(pos)) {
            // Isolated sign
            accept(TokenType.TEXT, 0);
        } else if (arrowAllowed && matchesAt(pos, '>')) {
            // ->
            accept(TokenType.IMPL, 1);
        } else if (isDigit(chars[pos])) {
            acceptNumber();
        } else {
            acceptText(strict);
        }
    }

    /**
     * Recognizes '<', '<:', '<=' and '<->', the leading '<' being consumed.
     */
    private void scanLess() {
        if (!hasCharAt(pos)) {
            // < + nothing
            accept(TokenType.LESS, 0);
        } else if (matchesAt(pos, ':')) {
            // <:
            accept(TokenType.IN, 1);
        } else if (matchesAt(pos, '=')) {
            // <=
            accept(TokenType.LESS_OR_EQUAL, 1);
        } else if (hasCharAt(pos + 1)
                && matchesAt(pos, '-')
                && matchesAt(pos + 1, '>')) {
            // <->
            accept(TokenType.EQUIV, 2);
        } else {
            // < + non interesting
            accept(TokenType.LESS, 0);
        }
    }

    /**
     * Recognizes '>', '>=' and '>-<', the leading '>' being consumed.
     */
    private void scanGreater() {
        if (!hasCharAt(pos)) {
            // > + nothing
            accept(TokenType.GREATER, 0);
        } else if (matchesAt(pos, '=')) {
            // >=
            accept(TokenType.GREATER_OR_EQUAL, 1);
        } else if (hasCharAt(pos + 1)
                && matchesAt(pos, '-')
                && matchesAt(pos + 1, '<')) {
            // >-<
            accept(TokenType.XOR, 2);
        } else {
            // > + non interesting
            accept(TokenType.GREATER, 0);
        }
    }

    /**
     * Recognizes a '"' escaped text, the opening '"' being consumed.
     *
     * @throws LexicalException When the closing '"' is missing or the escaped text is empty.
     */
    private void scanDoubleQuotes() {
        if (!skipDoubleQuotesText()) {
            throw new LexicalException(LexicalException.Detail.MISSING_CLOSING_DOUBLE_QUOTES,
                                       "Closing '\"' not found",
                                       expression,
                                       begin,
                                       -1);
        }
        // Only the opening and closing delimiters were found
        if (pos == begin + 2) {
            throw new LexicalException(LexicalException.Detail.EMPTY_DOUBLE_QUOTES_ESCAPED_TEXT,
                                       "Invalid empty escaped text",
                                       expression,
                                       begin,
                                       pos);
        }
        tokenType = TokenType.DOUBLE_QUOTES_TEXT;
        end = pos;
    }

    /**
     * Recognizes a '$' escaped text, the opening '$' being consumed.
     *
     * @throws LexicalException When the closing '$' is missing or the dollar text is empty.
     */
    private void scanDollar() {
        if (!skipDollarText()) {
            throw new LexicalException(LexicalException.Detail.MISSING_CLOSING_DOLLAR,
                                       "Closing '$' not found",
                                       expression,
                                       begin,
                                       -1);
        }
        // Only the opening and closing delimiters were found
        if (pos == begin + 2) {
            throw new LexicalException(LexicalException.Detail.EMPTY_DOLLAR_ESCAPED_TEXT,
                                       "Invalid empty dollar text",
                                       expression,
                                       begin,
                                       pos);
        }
        tokenType = TokenType.DOLLAR_TEXT;
        end = pos;
    }

    /**
     * Moves to next token.
     * <p>
     * When no more chars are available, the current token becomes an empty
     * {@link TokenType#EPSILON} token located at the current position.<br>
     * Otherwise, the token is recognized and the spaces that follow it are skipped.
     * <p>
     * Note that {@code true} and {@code false} are recognized as TRUE and FALSE in both modes,
     * and that {@code not in} is recognized as NOT_IN in both modes.
     *
     * @param strict If {@code true}, a strict interpretation of reserved words is used.
     *            In that case, a sequence that matches a reserved word is interpreted as the reserved word.
     *            Otherwise, the sequence is interpreted as standard text.
     *
     * @throws LexicalException When tokenization fails.
     */
    public void next(boolean strict) {
        if (!hasCharAt(pos)) {
            tokenType = TokenType.EPSILON;
            begin = pos;
            end = begin;
            return;
        }

        begin = pos;
        // Current char
        final char c = chars[pos];
        // Advance reading position.
        pos++;
        // Index of token last character (exclusive)
        end = pos;

        if (OneCharTokens.BEST_MATCHER.test(c)) {
            // Handling of special characters that correspond to a token
            // type without further reading.
            tokenType = OneCharTokens.BEST_MAPPER.apply(c);
        } else {
            // Handling of other characters
            switch (c) {
            case '!' -> scanNegation();
            case '-' -> scanSigned(strict, true);
            case '+' -> scanSigned(strict, false);
            case '<' -> scanLess();
            case '>' -> scanGreater();
            case '"' -> scanDoubleQuotes();
            case '$' -> scanDollar();
            case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' -> acceptNumber();
            case 'a', 'A' -> {
                // Recognize [aA][nN][dD]
                if (hasCharAt(pos + 1)
                        && matchesAt(pos, 'n', 'N')
                        && matchesAt(pos + 1, 'd', 'D')
                        && hasBoundaryAt(pos + 2, strict)) {
                    acceptWord(TokenType.AND, strict, 2);
                } else {
                    acceptText(strict);
                }
            }
            case 'f', 'F' -> {
                // Recognize [fF][aA][lL][sS][eE] (in both modes)
                if (hasCharAt(pos + 3)
                        && matchesAt(pos, 'a', 'A')
                        && matchesAt(pos + 1, 'l', 'L')
                        && matchesAt(pos + 2, 's', 'S')
                        && matchesAt(pos + 3, 'e', 'E')
                        && hasBoundaryAt(pos + 4, strict)) {
                    accept(TokenType.FALSE, 4);
                } else {
                    acceptText(strict);
                }
            }
            case 'i', 'I' -> {
                // Recognize [iI][nN] and [iI][mM][pP] and [iI][fF][fF]
                if (hasCharAt(pos)
                        && matchesAt(pos, 'n', 'N')
                        && hasBoundaryAt(pos + 1, strict)) {
                    acceptWord(TokenType.IN, strict, 1);
                } else if (hasCharAt(pos + 1)
                        && matchesAt(pos, 'm', 'M')
                        && matchesAt(pos + 1, 'p', 'P')
                        && hasBoundaryAt(pos + 2, strict)) {
                    acceptWord(TokenType.IMPL, strict, 2);
                } else if (hasCharAt(pos + 1)
                        && matchesAt(pos, 'f', 'F')
                        && matchesAt(pos + 1, 'f', 'F')
                        && hasBoundaryAt(pos + 2, strict)) {
                    acceptWord(TokenType.EQUIV, strict, 2);
                } else {
                    acceptText(strict);
                }
            }
            case 'n', 'N' -> {
                // Recognize [nN][oO][tT] and [nN][oO][tT] [iI][nN]
                if (hasCharAt(pos + 1)
                        && matchesAt(pos, 'o', 'O')
                        && matchesAt(pos + 1, 't', 'T')
                        && hasBoundaryAt(pos + 2, strict)) {
                    // Token ends after 'not', unless 'in' follows
                    pos += 2;
                    end = pos;
                    skipSpaces();
                    if (hasCharAt(pos + 1)
                            && matchesAt(pos, 'i', 'I')
                            && matchesAt(pos + 1, 'n', 'N')
                            && hasBoundaryAt(pos + 2, strict)) {
                        accept(TokenType.NOT_IN, 2);
                    } else {
                        // end is left after 'not', even if spaces were skipped
                        tokenType = strict ? TokenType.NOT : TokenType.TEXT;
                    }
                } else {
                    acceptText(strict);
                }
            }
            case 'o', 'O' -> {
                // Recognize [oO][rR]
                if (hasCharAt(pos)
                        && matchesAt(pos, 'r', 'R')
                        && hasBoundaryAt(pos + 1, strict)) {
                    acceptWord(TokenType.OR, strict, 1);
                } else {
                    acceptText(strict);
                }
            }
            case 't', 'T' -> {
                // Recognize [tT][oO] and [tT][rR][uU][eE] (the latter in both modes)
                if (hasCharAt(pos)
                        && matchesAt(pos, 'o', 'O')
                        && hasBoundaryAt(pos + 1, strict)) {
                    acceptWord(TokenType.TO, strict, 1);
                } else if (hasCharAt(pos + 2)
                        && matchesAt(pos, 'r', 'R')
                        && matchesAt(pos + 1, 'u', 'U')
                        && matchesAt(pos + 2, 'e', 'E')
                        && hasBoundaryAt(pos + 3, strict)) {
                    accept(TokenType.TRUE, 3);
                } else {
                    acceptText(strict);
                }
            }
            case 'x', 'X' -> {
                // Recognize [xX][oO][rR]
                if (hasCharAt(pos + 1)
                        && matchesAt(pos, 'o', 'O')
                        && matchesAt(pos + 1, 'r', 'R')
                        && hasBoundaryAt(pos + 2, strict)) {
                    acceptWord(TokenType.XOR, strict, 2);
                } else {
                    acceptText(strict);
                }
            }
            default -> acceptText(strict);
            }
        }
        skipSpaces();
    }

    /**
     * Moves to next token and returns it.
     * <p>
     * When no more token are available, returns a Token of type {@link TokenType#EPSILON}.
     *
     * @param strict If {@code true}, a strict interpretation is used.
     * @return The following token.
     * @throws LexicalException When tokenization fails.
     * @throws SyntacticException When the parsed expression is invalid.
     */
    public Token nextToken(boolean strict) {
        next(strict);
        return getToken();
    }
}