All Downloads are FREE. Search and download functionalities are using the official Maven repository.

cdc.applic.expressions.parsing.Tokenizer Maven / Gradle / Ivy

The newest version!
package cdc.applic.expressions.parsing;

import cdc.applic.expressions.LexicalException;
import cdc.applic.expressions.SyntacticException;
import cdc.applic.expressions.literals.EscapingUtils;
import cdc.util.lang.Checks;

/**
 * Implementation of Tokenizer.
 * 

* This class analyzes an expression or piece of expression and identifies its tokens. * It does not check conformity of the expression to a grammar. *

* There are 2 modes of tokenization: *

    *
  • strict mode must be used when names, operators, .. are expected *
  • non strict mode must be used when values are expected (right side of an equality operator or inside a set). *
* A typical usage would be: *

 * final Tokenizer tokenizer = new Tokenizer();
 * tokenizer.init("my expression");
 * while (tokenizer.hasMoreTokens()) {
 *     final boolean strict = ...
 *     final Token token = tokenizer.nextToken(strict);
 *     // Do something with token
 * }
 * 
* * @author Damien Carbonne */ public final class Tokenizer { /** The expression to tokenize. */ private String expression; private TokenType tokenType; /** Index of token first character (inclusive) **/ private int begin; /** Index of token last character (exclusive) */ private int end; /** * Equivalent character array. *

* Accessing this array is faster that accessing expression (less checks). */ private char[] chars; /** * Length of chars (chars.length). */ private int charsLength; /** * Current analysis position. */ private int pos; public Tokenizer() { super(); } /** * @param c The char. * @return {@code true} if {@code c} is digit. */ private static boolean isDigit(char c) { return '0' <= c && c <= '9'; } /** * Returns {@code true} when there is a char at an index. * * @param index The tested index. * @return {@code true} when there is a char at {@code index}. */ private boolean hasCharAt(int index) { return index < charsLength; } /** * Returns {@code true} when a token boundary exists at a location. *

* This must be called after a ??? or digit has been found. * It searches special chars. * * @param index The location. * @param strict If {@code true}, a strict interpretation is used. * @return {@code true} when a token boundary exists at {@code index}. */ private boolean hasBoundaryAt(int index, boolean strict) { if (hasCharAt(index)) { if (OneCharSeparators.BEST_MATCHER.test(chars[index])) { return strict || chars[index] != '.'; } else { return matchesAt(index, '-') && hasCharAt(index + 1) && matchesAt(index + 1, '>'); } } else { // End of chars return true; } } private boolean hasNumberBoundaryAt(int index) { if (hasCharAt(index)) { if (chars[index] != '.' && OneCharSeparators.BEST_MATCHER.test(chars[index])) { return true; } else { return matchesAt(index, '-') && hasCharAt(index + 1) && matchesAt(index + 1, '>'); } } else { // End of chars return true; } } /** * Increments {@code pos} while the designated character is a space. *

* After that, the character designated by {@code pos}, if it exists, is not a * space. */ private void skipSpaces() { while (pos < charsLength && Spaces.BEST_MATCHER.test(chars[pos])) { pos++; } } /** * Increments {@code pos} while the designated character is a digit. *

* After that, the character designated by {@code pos}, if it exists, is not a * digit. * * @return The number of skipped digits. */ private int skipDigits() { final int mem = pos; while (pos < charsLength && isDigit(chars[pos])) { pos++; } return pos - mem; } private enum NumberType { INTEGER(TokenType.INTEGER), REAL(TokenType.REAL); private final TokenType tokenType; private NumberType(TokenType tokenType) { this.tokenType = tokenType; } public TokenType getTokenType() { return tokenType; } } /** * Skip all chars that correspond to a number: *

    *
  • integer: {@code [0-9]+ [number boundary]} *
  • real: {@code [0-9]+[.][0-9]+([eE][+-]?[0-9]+)?[number boundary]} *
* Must be called with current char being a digit. *

* After that, {@code pos} designates the first character following the number. * * @return The number type. * @throws LexicalException When a number can not be parsed: * it is malformed or not followed by a number boundary. */ private NumberType skipPossibleNumber() { final int beginIndex = pos; final NumberType result; skipDigits(); if (hasCharAt(pos) && matchesAt(pos, '.')) { pos++; // Accept no decimal digits ? final int decimals = skipDigits(); if (decimals == 0) { throw new LexicalException(LexicalException.Detail.INVALID_NUMBER, "Real number must have decimals, at " + pos, expression, beginIndex, pos); } if (hasCharAt(pos) && matchesAt(pos, 'e', 'E')) { pos++; if (hasCharAt(pos) && matchesAt(pos, '+', '-')) { pos++; } final int exponent = skipDigits(); if (exponent > 0) { result = NumberType.REAL; } else { throw new LexicalException(LexicalException.Detail.INVALID_NUMBER, "Exponent must be followed by digits, at " + pos, expression, beginIndex, pos); } } else { result = NumberType.REAL; } } else { result = NumberType.INTEGER; } if (hasNumberBoundaryAt(pos)) { return result; } else { throw new LexicalException(LexicalException.Detail.MISSING_BOUNDARY, "A number must be followed by boundary, at " + pos, expression, beginIndex, pos); } } private void skipText(boolean strict) { while (pos < charsLength && !hasBoundaryAt(pos, strict)) { pos++; } } /** * Searches the closing '"' character. *

* It is a '"' not followed by another '"'. * Must be invoked with {@code pos} designating the first character after the opening '"'. * After that, {@code pos} designates the first character following the escaped text. * * @return {@code true} when closing '"' has been found. */ private boolean skipDoubleQuotesText() { while (pos < charsLength) { if (chars[pos] == '"') { pos++; if (pos < charsLength && chars[pos] == '"') { // One char after '""' pos++; // continue exploration } else { // One char after closing '"' return true; } } else { pos++; } } return false; } /** * Searches the closing '$' character. *

* It is a '$' not followed by another '$'. * Must be invoked with {@code pos} designating the first character after the opening '$'. * After that, {@code pos} designates the first character following the dollar text. * * @return {@code true} when closing '$' has been found. */ private boolean skipDollarText() { while (pos < charsLength) { if (chars[pos] == '$') { pos++; if (pos < charsLength && chars[pos] == '$') { // One char after '$$' pos++; // continue exploration } else { // One char after closing '$' return true; } } else { pos++; } } return false; } /** * Returns {@code true} when the character at a given index corresponds to a given * character. * * @param c The searched character. * @param index The tested index. * @return {@code true} when the character at {@code index} corresponds to {@code c}. */ private boolean matchesAt(int index, char c) { return chars[index] == c; } /** * Returns {@code true} when the character at a given index corresponds to one of 2 * given characters. * * @param c1 The first searched character. * @param c2 The second searched character. * @param index The tested index. * @return {@code true} when the character at {@code index} corresponds to {@code c1} or {@code c2}. */ private boolean matchesAt(int index, char c1, char c2) { return chars[index] == c1 || chars[index] == c2; } /** * Initializes this tokenizer with an expression. * * @param expression The expression to tokenize. * @throws IllegalArgumentException When {@code expression} is {@code null}. */ public void init(String expression) { Checks.isNotNull(expression, "expression"); this.expression = expression; this.chars = expression.toCharArray(); this.charsLength = chars.length; this.pos = 0; this.begin = -1; this.end = -1; this.tokenType = null; skipSpaces(); } /** * @return The expression being tokenized. */ public String getExpression() { return expression; } /** * @return The current token type. */ public TokenType getTokenType() { return tokenType; } /** * @return The begin index of current token. */ public int getBeginIndex() { return begin; } /** * @return The end index of current token. */ public int getEndIndex() { return end; } /** * @return The text of current token. */ public String getText() { if (end <= begin) { return ""; } else { return expression.substring(getBeginIndex(), getEndIndex()); } } /** * @return The unescaped text of current token. */ public String getUnescapedText() { final String text = getText(); if (tokenType == TokenType.DOUBLE_QUOTES_TEXT) { return EscapingUtils.unescapeDoubleQuotes(text); } else if (tokenType == TokenType.DOLLAR_TEXT) { return EscapingUtils.unescapeDollars(text); } else { return text; } } /** * @return The current token. */ public Token getToken() { return new Token(tokenType, expression, begin, end); } /** * @return {@code true} when more tokens follow. */ public boolean hasMoreTokens() { return pos < charsLength; } /** * Moves to next token. * * @param strict If {@code true}, a strict interpretation of reserved words is used.
* In that case, a sequence that matches a reserved word is interpreted as the reserved word.
* Otherwise, the sequence is interpreted as standard text. * * @throws LexicalException When tokenization fails. */ public void next(boolean strict) { if (hasCharAt(pos)) { begin = pos; // Current char final char c = chars[pos]; // Advance reading position. pos++; // Index of token last character (exclusive) end = pos; if (OneCharTokens.BEST_MATCHER.test(c)) { // Handling of special characters that correspond to a token // type without further reading. tokenType = OneCharTokens.BEST_MAPPER.apply(c); } else { // Handling of other characters switch (c) { case '!' -> { // Recognize '!', '!=','!<', '!<:', '!<=', '!>' and'!>=' if (hasCharAt(pos)) { // ! + something if (matchesAt(pos, '=')) { // != tokenType = TokenType.NOT_EQUAL; pos++; end = pos; } else if (matchesAt(pos, '<')) { // !< if (hasCharAt(pos + 1)) { // !< + something if (matchesAt(pos + 1, ':')) { // !<: tokenType = TokenType.NOT_IN; pos += 2; end = pos; } else if (matchesAt(pos + 1, '=')) { // !<= tokenType = TokenType.NEITHER_LESS_NOR_EQUAL; pos += 2; end = pos; } else { // !< + non interesting tokenType = TokenType.NOT_LESS; pos += 1; end = pos; } } else { // !< + nothing tokenType = TokenType.NOT_LESS; pos += 1; end = pos; } } else if (matchesAt(pos, '>')) { // !> if (hasCharAt(pos + 1)) { // !> + something if (matchesAt(pos + 1, '=')) { // !>= tokenType = TokenType.NEITHER_GREATER_NOR_EQUAL; pos += 2; end = pos; } else { // !> + non interesting tokenType = TokenType.NOT_GREATER; pos += 1; end = pos; } } else { // !> + nothing tokenType = TokenType.NOT_GREATER; pos += 1; end = pos; } } else { // ! + + non interesting tokenType = TokenType.NOT; } } else { // ! + nothing tokenType = TokenType.NOT; } } case '-' -> { // Recognize '->' and negative numbers if (hasCharAt(pos)) { if (matchesAt(pos, '>')) { tokenType = TokenType.IMPL; pos++; end = pos; } else if (isDigit(chars[pos])) { final NumberType type = skipPossibleNumber(); tokenType = type.getTokenType(); end = pos; } else { skipText(strict); tokenType = TokenType.TEXT; end = pos; } } else { tokenType = TokenType.TEXT; end = pos; } } case '+' -> { // Recognize positive numbers if (hasCharAt(pos)) { if (isDigit(chars[pos])) { final NumberType type = skipPossibleNumber(); tokenType = type.getTokenType(); end = pos; } else { skipText(strict); tokenType = TokenType.TEXT; end = pos; } } else { tokenType = TokenType.TEXT; end = pos; } } case '<' -> { // Recognize '<', '<:' '<=' and '<->' if (hasCharAt(pos)) { // < + something if (matchesAt(pos, ':')) { // <: tokenType = TokenType.IN; pos++; end = pos; } else if (matchesAt(pos, '=')) { // <= tokenType = TokenType.LESS_OR_EQUAL; pos++; end = pos; } else if (hasCharAt(pos + 1) && matchesAt(pos, '-') && matchesAt(pos + 1, '>')) { // <-> tokenType = TokenType.EQUIV; pos += 2; end = pos; } else { // < + non interesting tokenType = TokenType.LESS; end = pos; } } else { // < + nothing tokenType = TokenType.LESS; end = pos; } } case '>' -> { // Recognize '>', '>=', '>-<' if (hasCharAt(pos)) { // > + something if (matchesAt(pos, '=')) { // >= tokenType = TokenType.GREATER_OR_EQUAL; pos++; end = pos; } else if (hasCharAt(pos + 1) && matchesAt(pos, '-') && matchesAt(pos + 1, '<')) { tokenType = TokenType.XOR; pos += 2; end = pos; } else { // > + non interesting tokenType = TokenType.GREATER; end = pos; } } else { // > + nothing tokenType = TokenType.GREATER; end = pos; } } case '"' -> { // Found a '"' escaped text final boolean closed = skipDoubleQuotesText(); if (!closed) { throw new LexicalException(LexicalException.Detail.MISSING_CLOSING_DOUBLE_QUOTES, "Closing '\"' not found", expression, begin, -1); } if (pos == begin + 2) { throw new LexicalException(LexicalException.Detail.EMPTY_DOUBLE_QUOTES_ESCAPED_TEXT, "Invalid empty escaped text", expression, begin, pos); } tokenType = TokenType.DOUBLE_QUOTES_TEXT; end = pos; } case '$' -> { // Found a '$' escaped text final boolean closed = skipDollarText(); if (!closed) { throw new LexicalException(LexicalException.Detail.MISSING_CLOSING_DOLLAR, "Closing '$' not found", expression, begin, -1); } if (pos == begin + 2) { throw new LexicalException(LexicalException.Detail.EMPTY_DOLLAR_ESCAPED_TEXT, "Invalid empty dollar text", expression, begin, pos); } tokenType = TokenType.DOLLAR_TEXT; end = pos; } case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' -> { final NumberType type = skipPossibleNumber(); tokenType = type.getTokenType(); end = pos; } case 'a', 'A' -> { // Recognize [aA][nN][dD] if (hasCharAt(pos + 1) && matchesAt(pos, 'n', 'N') && matchesAt(pos + 1, 'd', 'D') && hasBoundaryAt(pos + 2, strict)) { if (strict) { tokenType = TokenType.AND; } else { tokenType = TokenType.TEXT; } pos += 2; } else { skipText(strict); tokenType = TokenType.TEXT; } end = pos; } case 'f', 'F' -> { // Recognize [fF][aA][lL][sS][eE] if (hasCharAt(pos + 3) && matchesAt(pos, 'a', 'A') && matchesAt(pos + 1, 'l', 'L') && matchesAt(pos + 2, 's', 'S') && matchesAt(pos + 3, 'e', 'E') && hasBoundaryAt(pos + 4, strict)) { tokenType = TokenType.FALSE; pos += 4; } else { skipText(strict); tokenType = TokenType.TEXT; } end = pos; } case 'i', 'I' -> { // Recognize [iI][nN] and [iI][mM][pP] and [iI][fF][fF] if (hasCharAt(pos) && matchesAt(pos, 'n', 'N') && hasBoundaryAt(pos + 1, strict)) { if (strict) { tokenType = TokenType.IN; } else { tokenType = TokenType.TEXT; } pos++; } else if (hasCharAt(pos + 1) && matchesAt(pos, 'm', 'M') && matchesAt(pos + 1, 'p', 'P') && hasBoundaryAt(pos + 2, strict)) { if (strict) { tokenType = TokenType.IMPL; } else { tokenType = TokenType.TEXT; } pos += 2; } else if (hasCharAt(pos + 1) && matchesAt(pos, 'f', 'F') && matchesAt(pos + 1, 'f', 'F') && hasBoundaryAt(pos + 2, strict)) { if (strict) { tokenType = TokenType.EQUIV; } else { tokenType = TokenType.TEXT; } pos += 2; } else { skipText(strict); tokenType = TokenType.TEXT; } end = pos; } case 'n', 'N' -> { // Recognize [nN][oO][tT] and [nN][oO][tT] [iI][nN] if (hasCharAt(pos + 1) && matchesAt(pos, 'o', 'O') && matchesAt(pos + 1, 't', 'T') && hasBoundaryAt(pos + 2, strict)) { pos += 2; end = pos; skipSpaces(); if (hasCharAt(pos + 1) && matchesAt(pos, 'i', 'I') && matchesAt(pos + 1, 'n', 'N') && hasBoundaryAt(pos + 2, strict)) { tokenType = TokenType.NOT_IN; pos += 2; end = pos; } else { if (strict) { tokenType = TokenType.NOT; } else { tokenType = TokenType.TEXT; } } } else { skipText(strict); tokenType = TokenType.TEXT; end = pos; } } case 'o', 'O' -> { // Recognize [oO][rR] if (hasCharAt(pos) && matchesAt(pos, 'r', 'R') && hasBoundaryAt(pos + 1, strict)) { if (strict) { tokenType = TokenType.OR; } else { tokenType = TokenType.TEXT; } pos++; } else { skipText(strict); tokenType = TokenType.TEXT; } end = pos; } case 't', 'T' -> { // Recognize [tT][oO] and [tT][rR][uU][eE] if (hasCharAt(pos) && matchesAt(pos, 'o', 'O') && hasBoundaryAt(pos + 1, strict)) { if (strict) { tokenType = TokenType.TO; } else { tokenType = TokenType.TEXT; } pos++; } else if (hasCharAt(pos + 2) && matchesAt(pos, 'r', 'R') && matchesAt(pos + 1, 'u', 'U') && matchesAt(pos + 2, 'e', 'E') && hasBoundaryAt(pos + 3, strict)) { tokenType = TokenType.TRUE; pos += 3; } else { skipText(strict); tokenType = TokenType.TEXT; } end = pos; } case 'x', 'X' -> { // Recognize [xX][oO][rR] if (hasCharAt(pos + 1) && matchesAt(pos, 'o', 'O') && matchesAt(pos + 1, 'r', 'R') && hasBoundaryAt(pos + 2, strict)) { if (strict) { tokenType = TokenType.XOR; } else { tokenType = TokenType.TEXT; } pos += 2; } else { skipText(strict); tokenType = TokenType.TEXT; } end = pos; } default -> { skipText(strict); tokenType = TokenType.TEXT; end = pos; } } } skipSpaces(); } else { tokenType = TokenType.EPSILON; begin = pos; end = begin; } } /** * Moves to next token and returns it. *

* When no more token are available, returns a Token of type {@link TokenType#EPSILON}. * * @param strict If {@code true}, a strict interpretation is used. * @return The following token. * @throws SyntacticException When the parsed expression is invalid. */ public Token nextToken(boolean strict) { next(strict); return getToken(); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy