All Downloads are FREE. Search and download functionalities are using the official Maven repository.

javafx.css.CssLexer Maven / Gradle / Ivy

The newest version!
/*
 * Copyright (c) 2010, 2022, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.  Oracle designates this
 * particular file as subject to the "Classpath" exception as provided
 * by Oracle in the LICENSE file that accompanied this code.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */

package javafx.css;

import java.io.IOException;
import java.io.Reader;
import java.util.HashMap;
import java.util.Map;

import com.sun.javafx.css.parser.LexerState;
import com.sun.javafx.css.parser.Recognizer;
import com.sun.javafx.css.parser.Token;


final class CssLexer {
    // Token type codes emitted by this lexer. They are passed as the type
    // argument of Token and as the type of the LexerStates declared below.
    static final int STRING = 10;
    static final int IDENT = 11;
    static final int FUNCTION = 12;
    static final int NUMBER = 13;

    // numbers followed by a unit
    static final int CM = 14;
    static final int EMS = 15;
    static final int EXS = 16;
    static final int IN = 17;
    static final int MM = 18;
    static final int PC = 19;
    static final int PT = 20;
    static final int PX = 21;
    static final int PERCENTAGE = 22;
    static final int DEG = 23;
    static final int GRAD = 24;
    static final int RAD = 25;
    static final int TURN = 26;

    // punctuation
    static final int GREATER = 27;
    static final int LBRACE = 28;
    static final int RBRACE = 29;
    static final int SEMI = 30;
    static final int COLON = 31;
    static final int SOLIDUS = 32;
    static final int STAR = 33;
    static final int LPAREN = 34;
    static final int RPAREN = 35;
    static final int COMMA = 36;
    static final int HASH = 37;
    static final int DOT = 38;

    static final int IMPORTANT_SYM = 39;
    static final int WS = 40;
    static final int NL = 41;
    static final int FONT_FACE = 42;
    static final int URL = 43;
    static final int IMPORT = 44;
    static final int SECONDS = 45;
    static final int MS = 46;
    static final int AT_KEYWORD = 47;

    // Case-insensitive recognizers, one per ASCII letter. They are used to
    // spell out keywords and unit names character by character.
    private final Recognizer A = c -> c == 'A' || c == 'a';
    private final Recognizer B = c -> c == 'B' || c == 'b';
    private final Recognizer C = c -> c == 'C' || c == 'c';
    private final Recognizer D = c -> c == 'D' || c == 'd';
    private final Recognizer E = c -> c == 'E' || c == 'e';
    private final Recognizer F = c -> c == 'F' || c == 'f';
    private final Recognizer G = c -> c == 'G' || c == 'g';
    private final Recognizer H = c -> c == 'H' || c == 'h';
    private final Recognizer I = c -> c == 'I' || c == 'i';
    private final Recognizer J = c -> c == 'J' || c == 'j';
    private final Recognizer K = c -> c == 'K' || c == 'k';
    private final Recognizer L = c -> c == 'L' || c == 'l';
    private final Recognizer M = c -> c == 'M' || c == 'm';
    private final Recognizer N = c -> c == 'N' || c == 'n';
    private final Recognizer O = c -> c == 'O' || c == 'o';
    private final Recognizer P = c -> c == 'P' || c == 'p';
    private final Recognizer Q = c -> c == 'Q' || c == 'q';
    private final Recognizer R = c -> c == 'R' || c == 'r';
    private final Recognizer S = c -> c == 'S' || c == 's';
    private final Recognizer T = c -> c == 'T' || c == 't';
    private final Recognizer U = c -> c == 'U' || c == 'u';
    private final Recognizer V = c -> c == 'V' || c == 'v';
    private final Recognizer W = c -> c == 'W' || c == 'w';
    private final Recognizer X = c -> c == 'X' || c == 'x';
    private final Recognizer Y = c -> c == 'Y' || c == 'y';
    private final Recognizer Z = c -> c == 'Z' || c == 'z';

    // any ASCII letter, either case
    private final Recognizer ALPHA =
            c -> (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');

    // any char value from U+0080 upwards
    private final Recognizer NON_ASCII = c -> c >= '\u0080' && c <= '\uFFFF';

    // single punctuation characters
    private final Recognizer DOT_CHAR = c -> c == '.';
    private final Recognizer GREATER_CHAR = c -> c == '>';
    private final Recognizer LBRACE_CHAR = c -> c == '{';
    private final Recognizer RBRACE_CHAR = c -> c == '}';
    private final Recognizer SEMI_CHAR = c -> c == ';';
    private final Recognizer COLON_CHAR = c -> c == ':';
    private final Recognizer SOLIDUS_CHAR = c -> c == '/';
    private final Recognizer MINUS_CHAR = c -> c == '-';
    private final Recognizer PLUS_CHAR = c -> c == '+';
    private final Recognizer STAR_CHAR = c -> c == '*';
    private final Recognizer LPAREN_CHAR = c -> c == '(';
    private final Recognizer RPAREN_CHAR = c -> c == ')';
    private final Recognizer COMMA_CHAR = c -> c == ',';
    private final Recognizer UNDERSCORE_CHAR = c -> c == '_';
    private final Recognizer HASH_CHAR = c -> c == '#';

    // space, tab, carriage return, line feed or form feed
    private final Recognizer WS_CHARS =
            c -> c == ' ' || c == '\t' || c == '\r' || c == '\n' || c == '\f';

    // carriage return or line feed
    private final Recognizer NL_CHARS = c -> c == '\n' || c == '\r';

    private final Recognizer DIGIT = c -> c >= '0' && c <= '9';

    private final Recognizer HEX_DIGIT =
            c -> (c >= '0' && c <= '9')
              || (c >= 'a' && c <= 'f')
              || (c >= 'A' && c <= 'F');

    // Start state. It is given no recognizers; accepts() is overridden so
    // that every character is accepted.
    final LexerState initState = new LexerState("initState", null) {
        @Override
        public boolean accepts(int c) {
            return true;
        }
    };

    // '#'
    final LexerState hashState = new LexerState("hashState", HASH_CHAR);

    // '-'
    final LexerState minusState = new LexerState("minusState", MINUS_CHAR);

    // '+'
    final LexerState plusState = new LexerState("plusState", PLUS_CHAR);

    // A '.' on its own is a DOT token, but it may also begin a number.
    final LexerState dotState = new LexerState(DOT, "dotState", DOT_CHAR);

    // nmstart: [_a-z]|{nonascii}|{escape}
    final LexerState nmStartState =
            new LexerState(IDENT, "nmStartState", UNDERSCORE_CHAR, ALPHA);

    // nmchar: [_a-z0-9-]|{nonascii}|{escape}
    final LexerState nmCharState =
            new LexerState(IDENT, "nmCharState", UNDERSCORE_CHAR, ALPHA, DIGIT, MINUS_CHAR);

    // Accepts the same characters as nmCharState, but is a separate state so
    // that name characters following '#' yield a HASH token instead of IDENT.
    final LexerState hashNameCharState =
            new LexerState(HASH, "hashNameCharState", UNDERSCORE_CHAR, ALPHA, DIGIT, MINUS_CHAR);

    // A '(' directly after an identifier makes the token a FUNCTION. When the
    // accumulated text starts with "url(", the remainder is read by
    // consumeUrl() and that method's result is used as the token type.
    final LexerState lparenState = new LexerState(FUNCTION, "lparenState", LPAREN_CHAR) {
        @Override
        public int getType() {
            if (text.indexOf("url(") != 0) {
                return super.getType();
            }
            try {
                return consumeUrl();
            } catch (IOException ioe) {
                // a read failure inside the url makes the token invalid
                return Token.INVALID;
            }
        }
    };


    // digits before an optional decimal mark
    final LexerState leadingDigitsState =
            new LexerState(NUMBER, "leadingDigitsState", DIGIT);

    // A '.' that follows leading digits, a plus or a minus is a decimal mark.
    final LexerState decimalMarkState = new LexerState("decimalMarkState", DOT_CHAR);

    // digits after the decimal mark
    final LexerState trailingDigitsState =
            new LexerState(NUMBER, "trailingDigitsState", DIGIT);

    // unit suffix of a number, see http://www.w3.org/TR/css3-values/
    final LexerState unitsState = new UnitsState();

    /**
     * Builds the transition table of the lexer's state machine.
     *
     * Each key is a state; its value lists the states that may follow it.
     * The list is searched in order and the first state that accepts the
     * current character wins, so the order of the entries is significant.
     * A state with no entry (for example lparenState) has no successors and
     * therefore always ends the token.
     *
     * @return the map from a state to its reachable successor states
     */
    private Map<LexerState, LexerState[]> createStateMap() {

        final Map<LexerState, LexerState[]> map = new HashMap<>();

        // initState -- [#] --> hashState
        // initState -- [-] --> minusState
        // initState -- [_a-z] --> nmStartState
        // initState -- [+] --> plusState
        // initState -- [0-9] --> leadingDigitsState
        // initState -- [.] --> dotState
        map.put(
                initState,
                new LexerState[] {
                    hashState,
                    minusState,
                    nmStartState,
                    plusState,
                    leadingDigitsState,
                    dotState
                }
        );

        // minus could be the start of an ident or a number
        // minusState -- [_a-z] --> nmStartState
        // minusState -- [0-9] --> leadingDigitsState
        // minusState -- [.] --> decimalMarkState
        map.put(
                minusState,
                new LexerState[] {
                    nmStartState,
                    leadingDigitsState,
                    decimalMarkState,
                }
        );

        //
        // # {name}
        // hash {nmchar}+
        // hashState -- [_a-z0-9-] --> hashNameCharState
        // hashNameCharState -- [_a-z0-9-] --> hashNameCharState
        //
        map.put(
                hashState,
                new LexerState[] {
                    hashNameCharState
                }
        );

        map.put(
                hashNameCharState,
                new LexerState[] {
                    hashNameCharState,
                }
        );


        //
        // {ident}
        // ident '-'? {nmchar}+
        // nmStartState -- [_a-z0-9-] --> nmCharState
        // nmCharState -- [_a-z0-9-] --> nmCharState
        // nmCharState -- [(] --> lparenState
        //
        map.put(
                nmStartState,
                new LexerState[] {
                    nmCharState
                }
        );

        map.put(
                nmCharState,
                new LexerState[] {
                    nmCharState,
                    lparenState
                }
        );

        // from plusState, next state must be a digit or a dot
        map.put(
                plusState,
                new LexerState[] {
                    leadingDigitsState,
                    decimalMarkState
                }
        );

        // from leadingDigitsState, next state must be
        // another digit, a decimal mark, or units
        map.put(
                leadingDigitsState,
                new LexerState[] {
                    leadingDigitsState,
                    decimalMarkState,
                    unitsState
                }
        );

        // from decimal mark, next state must be a digit.
        // Need to map both dotState and decimalMarkState
        // since dot might be the first character and would
        // not be seen as a decimal point.
        map.put(
                dotState,
                new LexerState[] {
                    trailingDigitsState
                }
        );

        map.put(
                decimalMarkState,
                new LexerState[] {
                    trailingDigitsState
                }
        );

        // from trailingDigitsState, next state must be another digit or units
        map.put(
                trailingDigitsState,
                new LexerState[] {
                    trailingDigitsState,
                    unitsState,
                }
        );

        // UnitsState stays in UnitsState
        map.put(
                unitsState,
                new LexerState[] {
                    unitsState
                }
        );

        return map;
    }

    /**
     * Creates a lexer positioned in its start state, with an empty token
     * text buffer and a freshly built state transition table. No reader is
     * attached yet.
     */
    CssLexer() {
        text = new StringBuilder(64);
        stateMap = createStateMap();
        currentState = initState;
    }

    /**
     * Attaches a new input source and rewinds the lexer: line, position and
     * offset counters are reset, the pending token is dropped, the state
     * machine returns to its start state and the first character is read.
     * If that first read fails, the pending token becomes the EOF token.
     *
     * @param reader the source of characters to tokenize
     */
    void setReader(Reader reader) {
        this.reader = reader;

        // rewind the position bookkeeping before the first readChar()
        line = 1;
        pos = 0;
        offset = 0;
        lastc = -1;

        token = null;
        currentState = initState;

        try {
            ch = readChar();
        } catch (IOException ioe) {
            token = Token.EOF_TOKEN;
        }
    }

    /**
     * Scans an important_sym. Called with ch positioned on the '!'.
     *
     * CSS 2.1 grammar for important_sym:
     * "!"({w}|{comment})*{I}{M}{P}{O}{R}{T}{A}{N}{T}
     *
     * @return an IMPORTANT_SYM token when "important" (any case) follows the
     *         '!', optionally preceded by white space and comments; an
     *         INVALID token when a '/' that does not start a comment is
     *         found; a SKIP token when something other than "important"
     *         follows, in which case input is skipped up to, but not
     *         including, the next ';' or '}'; the EOF token when the input
     *         ends first
     * @throws IOException if reading from the underlying reader fails
     */
    private Token scanImportant()  throws IOException{
        final Recognizer[] important_sym =
                new Recognizer[] { I, M, P, O, R, T, A, N, T };

        text.append((char)ch);

        // get past the '!'
        ch = readChar();

        while(true) {

            switch (ch) {

                case Token.EOF:
                    token = Token.EOF_TOKEN;
                    return token;

                case '/':
                    ch = readChar();
                    if (ch == '*') skipComment();
                    else if (ch == '/') skipEOL();
                    else {
                        text.append('/').append((char)ch);
                        int temp = offset;
                        offset = pos;
                        return new Token(Token.INVALID, text.toString(), line, temp);
                    }
                    break;

                case ' ':
                case '\t':
                case '\r':
                case '\n':
                case '\f':
                    ch = readChar();
                    break;

                default:
                    // Match "important" one letter at a time. A character is
                    // consumed only after it has matched, so the character that
                    // breaks the match is still current afterwards. Consuming
                    // it would swallow a terminating ';' or '}' and make the
                    // recovery loop below skip the following declaration too.
                    int current = 0;
                    while (current < important_sym.length &&
                           important_sym[current].recognize(ch)) {
                        text.append((char)ch);
                        ch = readChar();
                        current++;
                    }

                    if (current == important_sym.length) {
                        final int temp = offset;
                        offset = pos-1; // will have read one char too many
                        return new Token(IMPORTANT_SYM, "!important", line, temp);
                    }

                    // Not "important": skip to the end of the declaration.
                    while (ch != ';' &&
                           ch != '}' &&
                           ch != Token.EOF) {
                        ch = readChar();
                    }

                    if (ch == Token.EOF) {
                        // remember EOF, as the EOF case above does
                        token = Token.EOF_TOKEN;
                        return token;
                    }

                    final int temp = offset;
                    offset = pos-1; // will have read one char too many
                    return new Token(Token.SKIP, text.toString(), line, temp);
            }
        }
    }

    // http://www.ietf.org/rfc/rfc3986
    // http://www.w3.org/TR/2011/REC-CSS2-20110607/syndata.html#uri
    // http://www.w3.org/TR/css3-syntax/#consume-a-url-token
    /**
     * Reads the remainder of a url token. Called after "url(" has been
     * consumed, with ch on the first character after the '('. On return the
     * text buffer holds the URL without "url(", quotes or the closing ')'.
     *
     * @return URL when a well-formed url was read (a missing ')' at end of
     *         input is tolerated); Token.INVALID when the url is malformed,
     *         in which case input has been consumed up to and including the
     *         next un-escaped ')'; Token.EOF when the input ends before any
     *         url character is seen or while skipping a malformed url
     * @throws IOException if reading from the underlying reader fails
     */
    private int consumeUrl() throws IOException {

        // drop "url(" so that text holds only the URL itself
        text.delete(0, text.length());

        skipUrlWhitespace();

        if (ch == Token.EOF) {
            return Token.EOF;
        }

        if (ch == '\'' || ch == '"') {

            final int endQuote = ch;

            ch = readChar();

            // Consume the string. EOF or an un-escaped newline is an error
            // and ends the string early.
            while (ch != endQuote &&
                   ch != Token.EOF &&
                   !NL_CHARS.recognize(ch)) {

                if (ch == '\\') {
                    consumeUrlEscape();
                } else {
                    text.append((char)ch);
                    ch = readChar();
                }
            }

            if (ch == endQuote) {

                ch = readChar();
                skipUrlWhitespace();

                // After consuming white-space, the char has to be rparen or EOF. Error otherwise.
                if (ch == ')') {
                    // consume the rparen
                    ch = readChar();
                    return URL;
                }

                if (ch == Token.EOF) {
                    return URL;
                }
            }

        } else {

            // The first character is taken verbatim, except for ')': that is
            // the end of an empty url such as "url()" and must be left for
            // the loop below instead of being stored as URL text.
            if (ch != ')') {
                text.append((char)ch);
                ch = readChar();
            }

            while (true) {

                skipUrlWhitespace();

                if (ch == ')') {
                    // consume the rparen
                    ch = readChar();
                    return URL;
                }

                if (ch == Token.EOF) {
                    return URL;
                }

                if (ch == '\\') {
                    consumeUrlEscape();
                    continue;
                }

                // quotes and '(' are not allowed in an unquoted url
                if (ch == '\'' || ch == '"' || ch == '(') {
                    break;
                }

                text.append((char)ch);
                ch = readChar();

            }
        }

        // if we get to here, then the token is bad
        return consumeBadUrlRemnants();
    }

    // Advances ch past any run of white space characters.
    private void skipUrlWhitespace() throws IOException {
        while (WS_CHARS.recognize(ch)) {
            ch = readChar();
        }
    }

    // Handles a backslash inside a url, with ch on the backslash. An escaped
    // newline is dropped, any other escaped character is appended to text
    // verbatim, and a backslash at end of input is ignored.
    // Note: this does not handle the algorithm for consuming hex-digits.
    private void consumeUrlEscape() throws IOException {

        ch = readChar();

        if (NL_CHARS.recognize(ch)) {

            // consume newline
            while (NL_CHARS.recognize(ch)) {
                ch = readChar();
            }

        } else if (ch != Token.EOF) {
            text.append((char)ch);
            ch = readChar();
        }
    }

    // Discards the rest of a malformed url: everything up to and including
    // the next un-escaped ')'. Returns Token.INVALID when that ')' is found
    // and Token.EOF when the input ends first.
    private int consumeBadUrlRemnants() throws IOException {

        while (ch != Token.EOF) {

            if (ch == '\\') {
                // skip the escaped character so that "\)" does not end the token
                ch = readChar();
                if (ch == Token.EOF) {
                    break;
                }
            } else if (ch == ')') {
                ch = readChar();
                return Token.INVALID;
            }

            ch = readChar();
        }

        return Token.EOF;
    }

    /**
     * Lexer state that recognizes the unit suffix of a number, for example
     * the "px" in "10px".
     *
     * Every unit in the units table starts out as a candidate. Each accepted
     * character eliminates the candidates that do not have that character at
     * the current position. When the token ends, getType() reports the type
     * of the unit only if exactly one candidate is left and all of its
     * characters were matched; anything else (an unknown unit such as
     * "10xyzzy", or an incomplete one such as "10c") is Token.INVALID.
     */
    private class UnitsState extends LexerState {

        private final Recognizer[][] units = {

            // TODO: all units from http://www.w3.org/TR/css3-values/
            // If units are added, unitTypes must be updated!
            { C, M },
            { D, E, G },
            { E, M },
            { E, X },
            { G, R, A, D },
            { I, N },
            { M, M },
            { M, S },
            { P, C },
            { P, T },
            { P, X },
            { R, A, D },
            { S },
            { T, U, R, N },
            { (c) -> c == '%'}
        };

        // Token type of each unit; entry n belongs to units[n].
        private final int[] unitTypes = {
            CM,
            DEG,
            EMS,
            EXS,
            GRAD,
            IN,
            MM,
            MS,
            PC,
            PT,
            PX,
            RAD,
            SECONDS,
            TURN,
            PERCENTAGE
        };

        // Mask with one bit set per unit, i.e. every unit is a candidate.
        private final int allUnitsMask = (1 << units.length) - 1;

        // One bit per unit; bit n is set while units[n] is still a candidate
        private int unitsMask = allUnitsMask;

        // Offset into inner array of units
        private int index = -1;

        UnitsState() {
            super(-1, "UnitsState", null);
        }

        /**
         * Returns the token type of the unit that was scanned and resets
         * this state so that it can scan the next unit.
         *
         * @return the type of the matched unit, or Token.INVALID if the
         *         scanned characters are not exactly one of the known units
         */
        @Override
        public int getType() {

            int type = Token.INVALID;

            // A unit is recognized only when a single candidate survived...
            final boolean singleCandidate =
                    unitsMask != 0 && (unitsMask & (unitsMask - 1)) == 0;

            if (singleCandidate) {
                final int n = Integer.numberOfTrailingZeros(unitsMask);

                // ...and the input supplied every character of it. Without
                // this check a prefix that is unique to one unit, such as
                // the "c" of "cm", would be reported as that unit.
                if (index == units[n].length - 1) {
                    type = unitTypes[n];
                }
            }

            // reset
            unitsMask = allUnitsMask;
            index = -1;

            return type;
        }

        /**
         * Accepts every letter and '%', narrowing the set of candidate
         * units as a side effect.
         *
         * @param c the character to test
         * @return true if c is an ASCII letter or '%', false otherwise
         */
        @Override
        public boolean accepts(int c) {

            // Ensure that something bogus like '10xyzzy' is
            // consumed as a token by only returning false
            // if the char is not alpha or %
            if (!ALPHA.recognize(c) && c != '%') {
                return false;
            }

            // If unitsMask is zero, then we've already figured out that
            // this is an invalid token, but we want to accept c so that
            // '10xyzzy' is consumed as a token, albeit an invalid one.
            if (unitsMask == 0) return true;

            index += 1;

            for (int n=0 ; n < units.length; n++) {

                final int u = 1 << n;

                // the unit at this index already failed. Move on.
                if ((unitsMask & u) == 0) continue;

                if ((index >= units[n].length) || !(units[n][index].recognize(c))) {
                    // not a match, turn off this bit
                    unitsMask &= ~u;
                }

            }


            return true;
        }

    }

    /**
     * Skips a block comment. Expects ch to be the '*' of the opening "/*".
     * On return ch is the first character after the closing "*" "/" pair,
     * or -1 if the input ended inside the comment.
     *
     * @throws IOException if reading from the underlying reader fails
     */
    private  void skipComment() throws IOException {

        // Step over the '*' that opened the comment. Otherwise it would be
        // taken for the first half of the closing delimiter and "/*/" would
        // be read as a complete comment.
        if (ch == '*') {
            ch = readChar();
        }

        while(ch != -1) {
            if (ch == '*') {
                ch = readChar();
                if (ch == '/') {
                    offset = pos;
                    ch=readChar();
                    break;
                }
                // not the end; re-examine ch, it may be another '*'
            } else {
                ch = readChar();
            }
        }
    }

    /**
     * Skips the rest of the current line.
     *
     * An end of line is cr, lf or crlf. On return ch is the '\n' that ended
     * the line, or, for a line ended by a cr that is not followed by lf, the
     * first character of the next line, or -1 if the input ended first.
     *
     * @throws IOException if reading from the underlying reader fails
     */
    private void skipEOL() throws IOException {

        // The previously read character. It has to be updated on every
        // iteration so that a cr not followed by lf is recognized as the
        // end of the line.
        int prev = ch;

        while (ch != -1) {

            ch = readChar();

            // EOL is cr, lf, or crlf
            if ((ch == '\n') || (prev == '\r' && ch != '\n')) {
                break;
            }

            prev = ch;
        }

    }

    // Number of characters read on the current line; maintained by readChar().
    private int pos = 0;
    // Position on the current line at which the next token starts; passed to
    // new Tokens and advanced after each token is created.
    private int offset = 0;
    // Current line number, starting at 1; incremented by readChar().
    private int line = 1;
    // The character returned by the previous readChar() call, or -1 initially.
    private int lastc = -1;

    /**
     * Reads the next character from the reader and keeps the line and
     * position counters up to date.
     *
     * The counters roll over to a new line only when the character after a
     * line break is read (after '\n', or after a '\r' that is not followed
     * by '\n'), because the NL token is created after the line break itself
     * has been read.
     *
     * @return the character read, or whatever the reader returns at end of input
     * @throws IOException if the reader fails
     */
    private int readChar() throws IOException {

        final int next = reader.read();

        final boolean afterLineFeed = lastc == '\n';
        final boolean afterLoneCarriageReturn = lastc == '\r' && next != '\n';

        if (afterLineFeed || afterLoneCarriageReturn) {
            // next is already the first char of the new line, hence pos 1
            line++;
            offset = 0;
            pos = 1;
        } else {
            pos++;
        }

        lastc = next;
        return next;
    }

    /**
     * Returns the next token of the input.
     *
     * A pending token, if any, is handed out first; it stays pending when it
     * is the EOF token, so EOF is returned on every later call as well.
     * Otherwise tokens are scanned until one is found that is not equal to
     * Token.SKIP_TOKEN. The text buffer and the state machine are reset
     * before returning.
     *
     * @return the next token; may be null if getToken() returns null
     */
    Token nextToken() {

        final Token result;

        if (token == null) {
            // scan, silently dropping tokens that are to be skipped
            Token candidate = getToken();
            while (candidate != null && Token.SKIP_TOKEN.equals(candidate)) {
                candidate = getToken();
            }
            result = candidate;
        } else {
            result = token;
            if (result.getType() != Token.EOF) {
                token = null;
            }
        }

        // get ready for the next token
        text.setLength(0);
        currentState = initState;

        return result;
    }

    // Scans and returns a single token starting at the current character ch.
    //
    // The visible part of the method first looks up the states reachable from
    // currentState and then falls into a switch on ch that builds tokens for
    // single punctuation characters, white space, line breaks, '!' and '@'.
    // An IOException raised while reading is reported as the EOF token.
    //
    // NOTE(review): the line that starts the for loop below is truncated in
    // this copy of the file: everything between "n" and "':" is missing. The
    // lost text presumably held the rest of the loop, the handling of the
    // state found, the head of the switch on ch and its first cases (probably
    // removed together with a '<' ... '>' span during extraction). Restore it
    // from the upstream source; the method does not compile as it stands.
    private Token getToken() {

        try {
            while (true) {
                // set to true by a case that has read ahead and must not
                // consume another character at the end of the loop
                charNotConsumed = false;

                // states that may follow the current one, if any
                final LexerState[] reachableStates =
                        currentState != null ? stateMap.get(currentState) : null;

                final int max = reachableStates != null ? reachableStates.length : 0;

                LexerState newState = null;
                // NOTE(review): truncated line, see the comment on the method
                for (int n=0; n':

                        token = new Token(GREATER,">", line, offset);
                        offset = pos;
                        break;

                    // Single character tokens: create the token at the
                    // current offset, then let the next token start at pos.
                    case '{':
                        token = new Token(LBRACE,"{", line, offset);
                        offset = pos;
                        break;

                    case '}':
                        token = new Token(RBRACE,"}", line, offset);
                        offset = pos;
                        break;

                    case ';':
                        token = new Token(SEMI,";", line, offset);
                        offset = pos;
                        break;

                    case ':':
                        token = new Token(COLON,":", line, offset);
                        offset = pos;
                        break;

                    case '*':
                        token = new Token(STAR,"*", line, offset);
                        offset = pos;
                        break;

                    case '(':
                        token = new Token(LPAREN,"(", line, offset);
                        offset = pos;
                        break;

                    case ')':
                        token = new Token(RPAREN,")", line, offset);
                        offset = pos;
                        break;

                    case ',':
                        token = new Token(COMMA,",", line, offset);
                        offset = pos;
                        break;

                    case '.':
                        token = new Token(DOT,".", line, offset);
                        offset = pos;
                        break;

                    // each white space character is a WS token of its own
                    case ' ':
                    case '\t':
                    case '\f':
                        token = new Token(WS, Character.toString((char)ch), line, offset);
                        offset = pos;
                        break;


                    // A carriage return is a NL token by itself or together
                    // with a directly following line feed. The token text is
                    // the escaped form (backslash followed by a letter), not
                    // the control character itself.
                    case '\r':
                        token = new Token(NL, "\\r", line, offset);
                        // offset and pos are reset on next readChar

                        ch = readChar();
                        if (ch == '\n') {
                            token = new Token(NL, "\\r\\n", line, offset);
                            // offset and pos are reset on next readChar
                        } else {
                            // already read the next character, so return
                            // return the NL token here (avoid the readChar
                            // at the end of the loop below)
                            final Token tok = token;
                            // if the input ended after the '\r', leave the EOF
                            // token pending for the next call
                            token = (ch == -1) ? Token.EOF_TOKEN : null;
                            return tok;
                        }
                        break;

                    case '\n':
                        token = new Token(NL, "\\n", line, offset);
                        // offset and pos are reset on next readChar
                        break;

                    // '!' is handed to scanImportant(), whose token is
                    // returned as is
                    case '!':
                        Token tok = scanImportant();
                        return tok;

                    case '@':
                        token = new Token(AT_KEYWORD, "@", line, offset);
                        offset = pos;
                        break;

                    // any other character is an INVALID token of its own
                    default:
//                      System.err.println("hit default case: ch = " + Character.toString((char)ch));
                        token = new Token(Token.INVALID, Character.toString((char)ch), line, offset);
                        offset = pos;
                        break;
                }

                if (token == null) {
//                    System.err.println("token is null! ch = " + Character.toString((char)ch));
                    // no case produced a token: report an INVALID token without text
                    token = new Token(Token.INVALID, null, line, offset);
                    offset = pos;
                } else if (token.getType() == Token.EOF) {
                    // the EOF token is returned while it remains pending in the token field
                    return token;
                }

                // consume the character the token was built from, unless the
                // input has ended or a case has already read ahead
                if (ch != -1 && !charNotConsumed) ch = readChar();

                // hand the token out and clear the pending slot
                final Token tok = token;
                token = null;
                return tok;
            }
        } catch (IOException ioe) {
            // a failed read ends the token stream
            token = Token.EOF_TOKEN;
            return token;
        }
    }

    // The current, not yet consumed character, or -1 at end of input.
    private int ch;
    // Set by getToken() when it has read ahead, so that the character in ch
    // is not skipped when the token is returned.
    private boolean charNotConsumed = false;
    // Source of the characters; set by setReader().
    private Reader reader;
    // A token that is pending, to be returned by the next call of nextToken().
    private Token token;
    // Transition table of the state machine: the states reachable from each
    // state, in the order in which they are tried. Declared with its type
    // arguments so that get() yields a LexerState[] rather than an Object.
    private final Map<LexerState, LexerState[]> stateMap;
    // The state the state machine is currently in.
    private LexerState currentState;
    // The text of the token that is being scanned.
    private final StringBuilder text;

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy