/*
 * cdc.applic.expressions.parsing.Tokenizer
 *
 * Part of the cdc-applic-expressions artifact (Applicabilities Expressions).
 * Source listing taken from the newest published version.
 */
package cdc.applic.expressions.parsing;

import cdc.applic.expressions.LexicalException;
import cdc.applic.expressions.SyntacticException;
import cdc.applic.expressions.literals.EscapingUtils;
import cdc.util.lang.Checks;

/**
 * Implementation of Tokenizer.
 * <p>
 * This class analyzes an expression or piece of expression and identifies its tokens.<br>
 * It does not check conformity of the expression to a grammar.
 * <p>
 * There are 2 modes of tokenization:
 * <ul>
 * <li><b>strict</b> mode must be used when names, operators, ... are expected.
 * <li><b>non strict</b> mode must be used when values are expected
 * (right side of an equality operator or inside a set).
 * </ul>
 * A typical usage would be:
 * <pre><code>
 * final Tokenizer tokenizer = new Tokenizer();
 * tokenizer.init("my expression");
 * while (tokenizer.hasMoreTokens()) {
 *     final boolean strict = ...
 *     final Token token = tokenizer.nextToken(strict);
 *     // Do something with token
 * }
 * </code></pre>
 *
 * @author Damien Carbonne
 */
public final class Tokenizer {
    /** The expression to tokenize. */
    private String expression;

    /** Type of the current token, or {@code null} when no token has been read since last initialization. */
    private TokenType tokenType;
    /** Index of the first character of the current token (inclusive). */
    private int begin;
    /** Index following the last character of the current token (exclusive). */
    private int end;

    /**
     * Equivalent character array.
     * <p>
     * Accessing this array is faster than accessing expression (less checks).
     */
    private char[] chars;

    /**
     * Length of chars (chars.length).
     */
    private int charsLength;

    /**
     * Current analysis position: index in {@code chars} of the next character to read.
     */
    private int pos;

    public Tokenizer() {
        super();
    }

    /**
     * Tells whether a character is an ASCII decimal digit.
     *
     * @param c The tested character.
     * @return {@code true} if {@code c} belongs to {@code '0'..'9'}.
     */
    private static boolean isDigit(char c) {
        return c >= '0' && c <= '9';
    }

    /**
     * Tells whether an index designates a character of the analyzed expression.
     * <p>
     * Only the upper bound is checked: {@code index} is compared to the number of characters.
     *
     * @param index The tested index.
     * @return {@code true} when {@code index} is strictly less than the number of characters.
     */
    private boolean hasCharAt(int index) {
        return charsLength > index;
    }

    /**
     * Returns {@code true} when a token boundary exists at a location.
     * <p>
     * A boundary exists at {@code index} when:
     * <ul>
     * <li>{@code index} is past the last character (end of expression),
     * <li>or the character at {@code index} is a one-char separator;
     * {@code '.'} is however only considered as a boundary in strict mode,
     * <li>or the 2 characters sequence {@code "->"} starts at {@code index}.
     * </ul>
     *
     * @param index The location.
     * @param strict If {@code true}, a strict interpretation is used.
     * @return {@code true} when a token boundary exists at {@code index}.
     */
    private boolean hasBoundaryAt(int index,
                                  boolean strict) {
        if (!hasCharAt(index)) {
            // End of chars
            return true;
        }

        final char current = chars[index];
        if (OneCharSeparators.BEST_MATCHER.test(current)) {
            // A separator always ends a token, except '.' in non strict mode.
            return strict || current != '.';
        }

        // Not a one-char separator: the only remaining boundary is '->'.
        return current == '-' && hasCharAt(index + 1) && chars[index + 1] == '>';
    }

    /**
     * Returns {@code true} when a number boundary exists at a location.
     * <p>
     * A number boundary exists at {@code index} when:
     * <ul>
     * <li>{@code index} is past the last character (end of expression),
     * <li>or the character at {@code index} is a one-char separator different from {@code '.'},
     * <li>or the 2 characters sequence {@code "->"} starts at {@code index}.
     * </ul>
     *
     * @param index The location.
     * @return {@code true} when a number boundary exists at {@code index}.
     */
    private boolean hasNumberBoundaryAt(int index) {
        if (!hasCharAt(index)) {
            // End of chars
            return true;
        }

        final char current = chars[index];
        if (current != '.' && OneCharSeparators.BEST_MATCHER.test(current)) {
            return true;
        }

        // Not an accepted one-char separator: the only remaining boundary is '->'.
        return current == '-' && hasCharAt(index + 1) && chars[index + 1] == '>';
    }

    /**
     * Moves {@code pos} forward as long as it designates a space.
     * <p>
     * On return, {@code pos} is either past the last character or designates
     * a character that is not a space.
     */
    private void skipSpaces() {
        int index = pos;
        while (index < charsLength && Spaces.BEST_MATCHER.test(chars[index])) {
            index++;
        }
        pos = index;
    }

    /**
     * Moves {@code pos} forward as long as it designates a digit.
     * <p>
     * On return, {@code pos} is either past the last character or designates
     * a character that is not a digit.
     *
     * @return The number of skipped digits (possibly 0).
     */
    private int skipDigits() {
        final int start = pos;
        int index = start;
        while (index < charsLength && isDigit(chars[index])) {
            index++;
        }
        pos = index;
        return index - start;
    }

    /**
     * Kind of number recognized by the tokenizer, bound to the matching {@link TokenType}.
     */
    private enum NumberType {
        /** Number bound to {@link TokenType#INTEGER}. */
        INTEGER(TokenType.INTEGER),
        /** Number bound to {@link TokenType#REAL}. */
        REAL(TokenType.REAL);

        /** The token type that corresponds to this number type. */
        private final TokenType type;

        // Enum constructors are implicitly private.
        NumberType(TokenType type) {
            this.type = type;
        }

        /**
         * @return The token type associated to this number type.
         */
        public TokenType getTokenType() {
            return type;
        }
    }

    /**
     * Skips all chars that correspond to the remaining part of a number:
     * <ul>
     * <li>integer: {@code [0-9]+ [number boundary]}
     * <li>real: {@code [0-9]+[.][0-9]+([eE][+-]?[0-9]+)?[number boundary]}
     * </ul>
     * Must be called from {@link #next(boolean)}, after the leading character of the
     * number has been consumed, so that {@code pos == begin + 1}:
     * <ul>
     * <li>either a sign was consumed and {@code pos} designates the first digit,
     * <li>or the first digit was consumed and {@code pos} designates what follows it.
     * </ul>
     * After that, {@code pos} designates the first character following the number.
     * <p>
     * When an exception is raised, the indices passed to it are the start of the
     * whole token ({@code begin}, like the other throw sites of this class) and
     * the position where the analysis stopped.
     *
     * @return The number type.
     * @throws LexicalException When a number can not be parsed:
     *             it is malformed or not followed by a number boundary.
     */
    private NumberType skipPossibleNumber() {
        // Use the token start (sign or first digit) and not pos:
        // pos is already one char further, and using it would drop the
        // leading char from the reported span (e.g. an empty span for "1x").
        final int beginIndex = begin;
        final NumberType result;
        skipDigits();

        if (hasCharAt(pos) && matchesAt(pos, '.')) {
            pos++;
            // A '.' must be followed by at least one decimal digit.
            final int decimals = skipDigits();
            if (decimals == 0) {
                throw new LexicalException(LexicalException.Detail.INVALID_NUMBER,
                                           "Real number must have decimals, at " + pos,
                                           expression,
                                           beginIndex,
                                           pos);
            }
            if (hasCharAt(pos) && matchesAt(pos, 'e', 'E')) {
                pos++;
                // Optional exponent sign
                if (hasCharAt(pos) && matchesAt(pos, '+', '-')) {
                    pos++;
                }
                final int exponent = skipDigits();
                if (exponent > 0) {
                    result = NumberType.REAL;
                } else {
                    throw new LexicalException(LexicalException.Detail.INVALID_NUMBER,
                                               "Exponent must be followed by digits, at " + pos,
                                               expression,
                                               beginIndex,
                                               pos);
                }
            } else {
                result = NumberType.REAL;
            }
        } else {
            result = NumberType.INTEGER;
        }
        if (hasNumberBoundaryAt(pos)) {
            return result;
        } else {
            throw new LexicalException(LexicalException.Detail.MISSING_BOUNDARY,
                                       "A number must be followed by boundary, at " + pos,
                                       expression,
                                       beginIndex,
                                       pos);
        }
    }

    /**
     * Moves {@code pos} forward until it designates a token boundary.
     * <p>
     * On return, {@code pos} is either past the last character or designates
     * the first character of a boundary, as defined by {@code hasBoundaryAt}.
     *
     * @param strict If {@code true}, a strict interpretation of boundaries is used.
     */
    private void skipText(boolean strict) {
        int index = pos;
        while (index < charsLength && !hasBoundaryAt(index, strict)) {
            index++;
        }
        pos = index;
    }

    /**
     * Searches the closing '"' character.
     * <p>
     * It is a '"' that is not immediately followed by another '"'
     * (a doubled '""' is skipped as part of the text).<br>
     * Must be invoked with {@code pos} designating the first character after the opening '"'.<br>
     * When the closing '"' is found, {@code pos} designates the first character following it.
     * Otherwise, {@code pos} is past the last character.
     *
     * @return {@code true} when closing '"' has been found.
     */
    private boolean skipDoubleQuotesText() {
        while (pos < charsLength) {
            final char current = chars[pos];
            // In all cases, the current char is consumed.
            pos++;
            if (current != '"') {
                // Standard char: continue exploration
                continue;
            }
            // A '"' has just been consumed: look at what follows.
            if (pos >= charsLength || chars[pos] != '"') {
                // It was the closing '"': pos is one char after it.
                return true;
            }
            // It was the first half of '""': consume the second half.
            pos++;
        }
        return false;
    }

    /**
     * Searches the closing '$' character.
     * <p>
     * It is a '$' that is not immediately followed by another '$'
     * (a doubled '$$' is skipped as part of the text).<br>
     * Must be invoked with {@code pos} designating the first character after the opening '$'.<br>
     * When the closing '$' is found, {@code pos} designates the first character following it.
     * Otherwise, {@code pos} is past the last character.
     *
     * @return {@code true} when closing '$' has been found.
     */
    private boolean skipDollarText() {
        while (pos < charsLength) {
            final char current = chars[pos];
            // In all cases, the current char is consumed.
            pos++;
            if (current != '$') {
                // Standard char: continue exploration
                continue;
            }
            // A '$' has just been consumed: look at what follows.
            if (pos >= charsLength || chars[pos] != '$') {
                // It was the closing '$': pos is one char after it.
                return true;
            }
            // It was the first half of '$$': consume the second half.
            pos++;
        }
        return false;
    }

    /**
     * Returns {@code true} when the character at a given index is a given character.
     * <p>
     * No bound check is done: {@code index} must designate an existing character.
     *
     * @param index The tested index.
     * @param c The searched character.
     * @return {@code true} when the character at {@code index} is {@code c}.
     */
    private boolean matchesAt(int index,
                              char c) {
        return c == chars[index];
    }

    /**
     * Returns {@code true} when the character at a given index is one of 2 given characters.
     * <p>
     * No bound check is done: {@code index} must designate an existing character.
     *
     * @param index The tested index.
     * @param c1 The first searched character.
     * @param c2 The second searched character.
     * @return {@code true} when the character at {@code index} is {@code c1} or {@code c2}.
     */
    private boolean matchesAt(int index,
                              char c1,
                              char c2) {
        final char current = chars[index];
        return current == c1 || current == c2;
    }

    /**
     * Initializes this tokenizer with an expression.
     * <p>
     * Any previous state is discarded: there is no current token
     * (type is {@code null}, begin and end indices are {@code -1}),
     * and analysis position is set on the first non-space character of {@code expression}.
     *
     * @param expression The expression to tokenize.
     * @throws IllegalArgumentException When {@code expression} is {@code null}.
     */
    public void init(String expression) {
        Checks.isNotNull(expression, "expression");

        // Attach the expression and its array view.
        final char[] array = expression.toCharArray();
        this.expression = expression;
        this.chars = array;
        this.charsLength = array.length;

        // Forget current token.
        this.tokenType = null;
        this.begin = -1;
        this.end = -1;

        // Start analysis on the first meaningful character.
        this.pos = 0;
        skipSpaces();
    }

    /**
     * Returns the expression that was passed to the last call to {@link #init(String)}.
     *
     * @return The expression being tokenized, or {@code null} if this tokenizer was never initialized.
     */
    public String getExpression() {
        return this.expression;
    }

    /**
     * Returns the type of the current token.
     *
     * @return The current token type, or {@code null} if no token
     *         has been read since last initialization.
     */
    public TokenType getTokenType() {
        return this.tokenType;
    }

    /**
     * Returns the index of the first character of the current token.
     *
     * @return The begin index (inclusive) of current token.
     *         It is {@code -1} right after {@link #init(String)}.
     */
    public int getBeginIndex() {
        return this.begin;
    }

    /**
     * Returns the index that follows the last character of the current token.
     *
     * @return The end index (exclusive) of current token.
     *         It is {@code -1} right after {@link #init(String)}.
     */
    public int getEndIndex() {
        return this.end;
    }

    /**
     * Returns the text of the current token, as it appears in the expression.
     * <p>
     * When the current token is empty or undefined (end index is not
     * greater than begin index), an empty string is returned.
     *
     * @return The text of current token.
     */
    public String getText() {
        return end > begin ? expression.substring(begin, end) : "";
    }

    /**
     * Returns the text of the current token, after removal of escaping.
     * <p>
     * Unescaping is delegated to {@link EscapingUtils} for double quotes
     * and dollar texts. The text of any other token is returned unchanged.
     *
     * @return The unescaped text of current token.
     */
    public String getUnescapedText() {
        final String raw = getText();
        if (tokenType == TokenType.DOUBLE_QUOTES_TEXT) {
            return EscapingUtils.unescapeDoubleQuotes(raw);
        }
        if (tokenType == TokenType.DOLLAR_TEXT) {
            return EscapingUtils.unescapeDollars(raw);
        }
        // Not an escaped token (or no token at all).
        return raw;
    }

    /**
     * Builds a {@link Token} from the current token type, the expression
     * and the current begin and end indices.
     *
     * @return The current token.
     */
    public Token getToken() {
        return new Token(this.tokenType, this.expression, this.begin, this.end);
    }

    /**
     * Tells whether the analysis position still designates a character.
     * <p>
     * As spaces are skipped at the end of {@link #init(String)} and after each
     * token, trailing spaces are not seen as additional tokens.
     *
     * @return {@code true} when more tokens follow.
     */
    public boolean hasMoreTokens() {
        return hasCharAt(pos);
    }

    /**
     * Moves to next token.
     * <p>
     * After this call, the token type and the begin and end indices describe the new
     * current token, and spaces that follow it have been skipped.<br>
     * When no character remains, the current token is an empty token of type
     * {@link TokenType#EPSILON}, located at the current analysis position.
     *
     * @param strict If {@code true}, a strict interpretation of reserved words is used.<br>
     *            In that case, a sequence that matches a reserved word is interpreted as the reserved word.<br>
     *            Otherwise, the sequence is interpreted as standard text.
     *
     * @throws LexicalException When tokenization fails.
     */
    public void next(boolean strict) {
        if (hasCharAt(pos)) {
            begin = pos;
            // Current char
            final char c = chars[pos];
            // Advance reading position.
            pos++;
            // Index of token last character (exclusive)
            end = pos;
            // From here, pos == begin + 1 and designates the char that follows c (if any).

            if (OneCharTokens.BEST_MATCHER.test(c)) {
                // Handling of special characters that correspond to a token
                // type without further reading.
                tokenType = OneCharTokens.BEST_MAPPER.apply(c);
            } else {
                // Handling of other characters
                switch (c) {
                case '!' -> {
                    // Recognize '!', '!=', '!<', '!<:', '!<=', '!>' and '!>='
                    if (hasCharAt(pos)) {
                        // ! + something
                        if (matchesAt(pos, '=')) {
                            // !=
                            tokenType = TokenType.NOT_EQUAL;
                            pos++;
                            end = pos;
                        } else if (matchesAt(pos, '<')) {
                            // !<
                            if (hasCharAt(pos + 1)) {
                                // !< + something
                                if (matchesAt(pos + 1, ':')) {
                                    // !<:
                                    tokenType = TokenType.NOT_IN;
                                    pos += 2;
                                    end = pos;
                                } else if (matchesAt(pos + 1, '=')) {
                                    // !<=
                                    tokenType = TokenType.NEITHER_LESS_NOR_EQUAL;
                                    pos += 2;
                                    end = pos;
                                } else {
                                    // !< + non interesting
                                    tokenType = TokenType.NOT_LESS;
                                    pos += 1;
                                    end = pos;
                                }
                            } else {
                                // !< + nothing
                                tokenType = TokenType.NOT_LESS;
                                pos += 1;
                                end = pos;
                            }
                        } else if (matchesAt(pos, '>')) {
                            // !>
                            if (hasCharAt(pos + 1)) {
                                // !> + something
                                if (matchesAt(pos + 1, '=')) {
                                    // !>=
                                    tokenType = TokenType.NEITHER_GREATER_NOR_EQUAL;
                                    pos += 2;
                                    end = pos;
                                } else {
                                    // !> + non interesting
                                    tokenType = TokenType.NOT_GREATER;
                                    pos += 1;
                                    end = pos;
                                }
                            } else {
                                // !> + nothing
                                tokenType = TokenType.NOT_GREATER;
                                pos += 1;
                                end = pos;
                            }
                        } else {
                            // ! + non interesting: the token is the single '!' (end was already set)
                            tokenType = TokenType.NOT;
                        }
                    } else {
                        // ! + nothing
                        tokenType = TokenType.NOT;
                    }
                }

                case '-' -> {
                    // Recognize '->' and negative numbers
                    if (hasCharAt(pos)) {
                        if (matchesAt(pos, '>')) {
                            // ->
                            tokenType = TokenType.IMPL;
                            pos++;
                            end = pos;
                        } else if (isDigit(chars[pos])) {
                            // - + digit: pos designates the first digit of the number
                            final NumberType type = skipPossibleNumber();
                            tokenType = type.getTokenType();
                            end = pos;
                        } else {
                            // - + something else: standard text starting with '-'
                            skipText(strict);
                            tokenType = TokenType.TEXT;
                            end = pos;
                        }
                    } else {
                        // - + nothing: text reduced to '-'
                        tokenType = TokenType.TEXT;
                        end = pos;
                    }
                }

                case '+' -> {
                    // Recognize positive numbers
                    if (hasCharAt(pos)) {
                        if (isDigit(chars[pos])) {
                            // + + digit: pos designates the first digit of the number
                            final NumberType type = skipPossibleNumber();
                            tokenType = type.getTokenType();
                            end = pos;
                        } else {
                            // + + something else: standard text starting with '+'
                            skipText(strict);
                            tokenType = TokenType.TEXT;
                            end = pos;
                        }
                    } else {
                        // + + nothing: text reduced to '+'
                        tokenType = TokenType.TEXT;
                        end = pos;
                    }
                }

                case '<' -> {
                    // Recognize '<', '<:', '<=' and '<->'
                    if (hasCharAt(pos)) {
                        // < + something
                        if (matchesAt(pos, ':')) {
                            // <:
                            tokenType = TokenType.IN;
                            pos++;
                            end = pos;
                        } else if (matchesAt(pos, '=')) {
                            // <=
                            tokenType = TokenType.LESS_OR_EQUAL;
                            pos++;
                            end = pos;
                        } else if (hasCharAt(pos + 1)
                                && matchesAt(pos, '-')
                                && matchesAt(pos + 1, '>')) {
                            // <->
                            tokenType = TokenType.EQUIV;
                            pos += 2;
                            end = pos;
                        } else {
                            // < + non interesting
                            tokenType = TokenType.LESS;
                            end = pos;
                        }
                    } else {
                        // < + nothing
                        tokenType = TokenType.LESS;
                        end = pos;
                    }
                }

                case '>' -> {
                    // Recognize '>', '>=' and '>-<'
                    if (hasCharAt(pos)) {
                        // > + something
                        if (matchesAt(pos, '=')) {
                            // >=
                            tokenType = TokenType.GREATER_OR_EQUAL;
                            pos++;
                            end = pos;
                        } else if (hasCharAt(pos + 1)
                                && matchesAt(pos, '-')
                                && matchesAt(pos + 1, '<')) {
                            // >-<
                            tokenType = TokenType.XOR;
                            pos += 2;
                            end = pos;
                        } else {
                            // > + non interesting
                            tokenType = TokenType.GREATER;
                            end = pos;
                        }
                    } else {
                        // > + nothing
                        tokenType = TokenType.GREATER;
                        end = pos;
                    }
                }

                case '"' -> {
                    // Found a '"' escaped text
                    final boolean closed = skipDoubleQuotesText();
                    if (!closed) {
                        throw new LexicalException(LexicalException.Detail.MISSING_CLOSING_DOUBLE_QUOTES,
                                                   "Closing '\"' not found",
                                                   expression,
                                                   begin,
                                                   -1);
                    }
                    // Only the opening and closing '"' were read: nothing between them.
                    if (pos == begin + 2) {
                        throw new LexicalException(LexicalException.Detail.EMPTY_DOUBLE_QUOTES_ESCAPED_TEXT,
                                                   "Invalid empty escaped text",
                                                   expression,
                                                   begin,
                                                   pos);
                    }
                    tokenType = TokenType.DOUBLE_QUOTES_TEXT;
                    end = pos;
                }

                case '$' -> {
                    // Found a '$' escaped text
                    final boolean closed = skipDollarText();
                    if (!closed) {
                        throw new LexicalException(LexicalException.Detail.MISSING_CLOSING_DOLLAR,
                                                   "Closing '$' not found",
                                                   expression,
                                                   begin,
                                                   -1);
                    }
                    // Only the opening and closing '$' were read: nothing between them.
                    if (pos == begin + 2) {
                        throw new LexicalException(LexicalException.Detail.EMPTY_DOLLAR_ESCAPED_TEXT,
                                                   "Invalid empty dollar text",
                                                   expression,
                                                   begin,
                                                   pos);
                    }
                    tokenType = TokenType.DOLLAR_TEXT;
                    end = pos;
                }

                case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9' -> {
                    // The first digit is already consumed: pos designates what follows it.
                    final NumberType type = skipPossibleNumber();
                    tokenType = type.getTokenType();
                    end = pos;
                }

                case 'a', 'A' -> {
                    // Recognize [aA][nN][dD]
                    if (hasCharAt(pos + 1)
                            && matchesAt(pos, 'n', 'N')
                            && matchesAt(pos + 1, 'd', 'D')
                            && hasBoundaryAt(pos + 2, strict)) {
                        if (strict) {
                            tokenType = TokenType.AND;
                        } else {
                            tokenType = TokenType.TEXT;
                        }
                        pos += 2;
                    } else {
                        skipText(strict);
                        tokenType = TokenType.TEXT;
                    }
                    end = pos;
                }

                case 'f', 'F' -> {
                    // Recognize [fF][aA][lL][sS][eE]
                    // NOTE(review): FALSE is produced whatever the value of strict,
                    // unlike and/or/in/... that become TEXT in non strict mode. Confirm this is intended.
                    if (hasCharAt(pos + 3)
                            && matchesAt(pos, 'a', 'A')
                            && matchesAt(pos + 1, 'l', 'L')
                            && matchesAt(pos + 2, 's', 'S')
                            && matchesAt(pos + 3, 'e', 'E')
                            && hasBoundaryAt(pos + 4, strict)) {
                        tokenType = TokenType.FALSE;
                        pos += 4;
                    } else {
                        skipText(strict);
                        tokenType = TokenType.TEXT;
                    }
                    end = pos;
                }

                case 'i', 'I' -> {
                    // Recognize [iI][nN] and [iI][mM][pP] and [iI][fF][fF]
                    if (hasCharAt(pos)
                            && matchesAt(pos, 'n', 'N')
                            && hasBoundaryAt(pos + 1, strict)) {
                        // in
                        if (strict) {
                            tokenType = TokenType.IN;
                        } else {
                            tokenType = TokenType.TEXT;
                        }
                        pos++;
                    } else if (hasCharAt(pos + 1)
                            && matchesAt(pos, 'm', 'M')
                            && matchesAt(pos + 1, 'p', 'P')
                            && hasBoundaryAt(pos + 2, strict)) {
                        // imp
                        if (strict) {
                            tokenType = TokenType.IMPL;
                        } else {
                            tokenType = TokenType.TEXT;
                        }
                        pos += 2;
                    } else if (hasCharAt(pos + 1)
                            && matchesAt(pos, 'f', 'F')
                            && matchesAt(pos + 1, 'f', 'F')
                            && hasBoundaryAt(pos + 2, strict)) {
                        // iff
                        if (strict) {
                            tokenType = TokenType.EQUIV;
                        } else {
                            tokenType = TokenType.TEXT;
                        }
                        pos += 2;
                    } else {
                        skipText(strict);
                        tokenType = TokenType.TEXT;
                    }
                    end = pos;
                }

                case 'n', 'N' -> {
                    // Recognize [nN][oO][tT] and [nN][oO][tT] [iI][nN]
                    if (hasCharAt(pos + 1)
                            && matchesAt(pos, 'o', 'O')
                            && matchesAt(pos + 1, 't', 'T')
                            && hasBoundaryAt(pos + 2, strict)) {
                        // not: record its end before looking for a following 'in'
                        pos += 2;
                        end = pos;
                        skipSpaces();
                        if (hasCharAt(pos + 1)
                                && matchesAt(pos, 'i', 'I')
                                && matchesAt(pos + 1, 'n', 'N')
                                && hasBoundaryAt(pos + 2, strict)) {
                            // not in: one single token that includes the separating spaces
                            // NOTE(review): NOT_IN is produced whatever the value of strict,
                            // whereas 'not' alone becomes TEXT in non strict mode. Confirm this is intended.
                            tokenType = TokenType.NOT_IN;
                            pos += 2;
                            end = pos;
                        } else {
                            // not alone: end still designates the char that follows 'not'
                            if (strict) {
                                tokenType = TokenType.NOT;
                            } else {
                                tokenType = TokenType.TEXT;
                            }
                        }
                    } else {
                        skipText(strict);
                        tokenType = TokenType.TEXT;
                        end = pos;
                    }
                }

                case 'o', 'O' -> {
                    // Recognize [oO][rR]
                    if (hasCharAt(pos)
                            && matchesAt(pos, 'r', 'R')
                            && hasBoundaryAt(pos + 1, strict)) {
                        if (strict) {
                            tokenType = TokenType.OR;
                        } else {
                            tokenType = TokenType.TEXT;
                        }
                        pos++;
                    } else {
                        skipText(strict);
                        tokenType = TokenType.TEXT;
                    }
                    end = pos;
                }

                case 't', 'T' -> {
                    // Recognize [tT][oO] and [tT][rR][uU][eE]
                    if (hasCharAt(pos)
                            && matchesAt(pos, 'o', 'O')
                            && hasBoundaryAt(pos + 1, strict)) {
                        // to
                        if (strict) {
                            tokenType = TokenType.TO;
                        } else {
                            tokenType = TokenType.TEXT;
                        }
                        pos++;
                    } else if (hasCharAt(pos + 2)
                            && matchesAt(pos, 'r', 'R')
                            && matchesAt(pos + 1, 'u', 'U')
                            && matchesAt(pos + 2, 'e', 'E')
                            && hasBoundaryAt(pos + 3, strict)) {
                        // true
                        // NOTE(review): TRUE is produced whatever the value of strict,
                        // unlike 'to' that becomes TEXT in non strict mode. Confirm this is intended.
                        tokenType = TokenType.TRUE;
                        pos += 3;
                    } else {
                        skipText(strict);
                        tokenType = TokenType.TEXT;
                    }
                    end = pos;
                }

                case 'x', 'X' -> {
                    // Recognize [xX][oO][rR]
                    if (hasCharAt(pos + 1)
                            && matchesAt(pos, 'o', 'O')
                            && matchesAt(pos + 1, 'r', 'R')
                            && hasBoundaryAt(pos + 2, strict)) {
                        if (strict) {
                            tokenType = TokenType.XOR;
                        } else {
                            tokenType = TokenType.TEXT;
                        }
                        pos += 2;
                    } else {
                        skipText(strict);
                        tokenType = TokenType.TEXT;
                    }
                    end = pos;
                }

                default -> {
                    // Any other char starts a standard text that extends till next boundary
                    skipText(strict);
                    tokenType = TokenType.TEXT;
                    end = pos;
                }
                }
            }
            // Place pos on the first char of the next token (if any)
            skipSpaces();
        } else {
            // Nothing left to read: empty token located at current position
            tokenType = TokenType.EPSILON;
            begin = pos;
            end = begin;
        }
    }

    /**
     * Moves to next token and returns it.
     * <p>
     * This is equivalent to calling {@link #next(boolean)} then {@link #getToken()}.<br>
     * When no more tokens are available, returns a Token of type {@link TokenType#EPSILON}.
     *
     * @param strict If {@code true}, a strict interpretation is used.
     * @return The following token.
     * @throws LexicalException When tokenization fails.
     */
    public Token nextToken(boolean strict) {
        this.next(strict);
        return this.getToken();
    }
}