All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.netbeans.lib.html.lexer.HtmlLexer Maven / Gradle / Ivy

There is a newer version: RELEASE240
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.netbeans.lib.html.lexer;

import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.netbeans.api.html.lexer.HTMLTokenId;
import org.netbeans.api.html.lexer.HtmlLexerPlugin;
import org.netbeans.api.lexer.InputAttributes;
import org.netbeans.api.lexer.LanguagePath;
import org.netbeans.api.lexer.Token;
import org.netbeans.spi.lexer.Lexer;
import org.netbeans.spi.lexer.LexerInput;
import org.netbeans.spi.lexer.LexerRestartInfo;
import org.netbeans.spi.lexer.TokenFactory;
import org.netbeans.spi.lexer.TokenPropertyProvider;

/**
 * Lexical analyzer for HTML. Based on original HTML lexer from html/editor module.
 *
 * @author Petr Nejedly
 * @author Miloslav Metelka
 * @author Jan Lahoda
 * @author Marek Fukala
 * @version 1.00
 */

public final class HtmlLexer implements Lexer {

    // Logger named after this class.
    private static final Logger LOGGER = Logger.getLogger(HtmlLexer.class.getName());
    // True only when the JVM system property "j2ee_lexer_debug" is set to "true".
    private static final boolean LOG = Boolean.getBoolean("j2ee_lexer_debug"); //NOI18N

    // Local alias for the end-of-input marker; nextToken() compares input.read() against it.
    private static final int EOF = LexerInput.EOF;

    // Character source of this lexer; taken from the restart info in the constructor.
    private final LexerInput input;

    // Factory taken from the restart info in the constructor.
    // NOTE(review): presumably used by the token(...) helpers to create tokens - they are not visible here.
    private final TokenFactory tokenFactory;

    /**
     * Snapshot of all lexer fields that make up its restartable state.
     * Instances are compared by value so that equal snapshots can be shared
     * through the states cache.
     */
    private static final class CompoundState {
        private int lexerState;
        private int lexerSubState;
        private int lexerEmbeddingState;
        private byte customELIndex;
        private String attribute;
        private String tag;
        private String scriptType;
        private boolean quoteType;

        public CompoundState(int lexerState, int lexerSubState, int lexerEmbeddingState, String attributeName, String tagName, String scriptType, byte customELIndex, boolean quoteType) {
            this.lexerState = lexerState;
            this.lexerSubState = lexerSubState;
            this.lexerEmbeddingState = lexerEmbeddingState;
            this.customELIndex = customELIndex;
            this.quoteType = quoteType;
            this.attribute = attributeName;
            this.tag = tagName;
            this.scriptType = scriptType;
        }

        /** Null-tolerant string equality: same reference, or equal content. */
        private static boolean sameText(String left, String right) {
            return left == right || (left != null && left.equals(right));
        }

        /** Hash code of the given string, or 0 for {@code null}. */
        private static int hashOrZero(String text) {
            return text == null ? 0 : text.hashCode();
        }

        /** Two states are equal when they have the same class and all fields match. */
        @Override
        public boolean equals(Object obj) {
            if (obj == null || getClass() != obj.getClass()) {
                return false;
            }
            final CompoundState that = (CompoundState) obj;
            return lexerState == that.lexerState
                    && lexerSubState == that.lexerSubState
                    && lexerEmbeddingState == that.lexerEmbeddingState
                    && sameText(attribute, that.attribute)
                    && sameText(tag, that.tag)
                    && sameText(scriptType, that.scriptType)
                    && customELIndex == that.customELIndex
                    && quoteType == that.quoteType;
        }

        @Override
        public int hashCode() {
            int result = 3;
            result = 17 * result + lexerState;
            result = 17 * result + lexerSubState;
            result = 17 * result + lexerEmbeddingState;
            result = 17 * result + hashOrZero(attribute);
            result = 17 * result + hashOrZero(tag);
            result = 17 * result + hashOrZero(scriptType);
            //the custom EL index contributes only when one is actually set
            if (customELIndex > 0) {
                result = 17 * result + customELIndex;
            }
            //the quote type contributes only in the quoted-value states
            final boolean inQuotedValue = lexerState == ISI_VAL_QUOT
                    || lexerState == ISI_VAL_QUOT_EL
                    || lexerState == ISI_VAL_QUOT_ESC;
            if (inQuotedValue) {
                result = 17 * result + (quoteType ? 1 : 0);
            }
            return result;
        }

        /** Compact debug form; zero/null parts other than the main state are omitted. */
        @Override
        public String toString() {
            final StringBuilder out = new StringBuilder();
            out.append("HLS(hc=").append(hashCode()); //NOI18N
            out.append(",s=").append(lexerState); //NOI18N
            if (lexerSubState > 0) {
                out.append(",ss=").append(lexerSubState); //NOI18N
            }
            if (lexerEmbeddingState > 0) {
                out.append(",es=").append(lexerEmbeddingState); //NOI18N
            }
            if (tag != null) {
                out.append(",tag=").append(tag); //NOI18N
            }
            if (attribute != null) {
                out.append(",attribute=").append(attribute); //NOI18N
            }
            if (scriptType != null) {
                out.append(",scriptType=").append(scriptType); //NOI18N
            }
            return out.append(')').toString(); //NOI18N
        }

    }

    /**
     * Canonical instance for every distinct state returned by {@link #state()};
     * each state is stored as both key and value so an equal snapshot can be looked up.
     */
    private final Map<CompoundState, CompoundState> STATES_CACHE = new HashMap<>();

    /**
     * Returns a snapshot of the current lexer fields.
     * <p>
     * Equal snapshots are served from {@code STATES_CACHE}, so that lexing of
     * large files does not keep a separate state object per call.
     *
     * @return the cached {@code CompoundState} equal to the current lexer fields
     */
    @Override
    public Object state() {
        CompoundState currentState = new CompoundState(lexerState, lexerSubState, lexerEmbeddingState, attribute, tag, scriptType, customELIndex, quoteType);
        CompoundState cached = STATES_CACHE.get(currentState);
        if (cached != null) {
            return cached;
        }
        //first time this combination is seen - it becomes the canonical instance
        STATES_CACHE.put(currentState, currentState);
        return currentState;
    }

    //script and style tag names; nextToken() compares the lexed tag name against them
    private static final String SCRIPT = "script"; //NOI18N
    private static final String STYLE = "style"; //NOI18N

    //attribute names matched (case-insensitively) by isStyleAttributeName()
    private static final String[] STYLE_ATTRS = new String[]{"style", "id", "class"}; //NOI18N

    /** Internal state of the lexical analyzer before entering subanalyzer of
     * character references. It is initially set to INIT, but before first usage,
     * this will be overwritten with state, which originated transition to
     * charref subanalyzer.
     */
    private int lexerSubState = INIT;
    private int lexerState    = INIT;

    private String attribute;
    private String tag; //tag name of the current context tag

    /**
     * Value of the "type" attribute in SCRIPT tag
     */
    private String scriptType;

    //tag name with namespace prefix to collection of attributes which should have
    //css class embedding by default
    private Map> cssClassTagAttrMap;
    private String CSS_CLASS_MAP_PROPERTY_KEY = "cssClassTagAttrMap"; //NOI18N //semi api

    /** indicated whether we are in a script */
    private int lexerEmbeddingState = INIT;

    private byte customELIndex = INIT;

    /**
     * Indicates the quote type in ISI_VAL_QUOT state.
     *
     * true means double qoute, false single quote.
     */
    private boolean quoteType;

    // Token property key under which EL_CONTENT tokens carry the zero-based index
    // of the custom EL delimiter pair (customELIndex - 1).
    public static final String EL_CONTENT_PROVIDER_INDEX = "elci"; //NOI18N

    // Values of lexerEmbeddingState: it is set to ISI_SCRIPT / ISI_STYLE when the
    // analyzer lexes a "script" / "style" open tag name
    private static final int ISI_SCRIPT = 1;
    private static final int ISI_STYLE = 2;

    // Internal states (values of lexerState / lexerSubState)
    // NOTE(review): the numbering has gaps (5-7, 20-35, 38, 41-47) and the lexer
    // refers to states such as ISA_REF, ISI_XML_PI, ISI_TAG_SLASH, ISI_SCRIPT_CONTENT,
    // ISI_STYLE_CONTENT and ISI_VAL_QUOT_ESC that are not declared in this list -
    // presumably their declarations were lost; restore them from the upstream source.
    private static final int INIT = 0;
    private static final int ISI_TEXT = 1;    // Plain text between tags
    private static final int ISI_ERROR = 2;   // Syntax error in HTML syntax
    private static final int ISA_LT = 3;      // After start of tag delimiter - "<"
    private static final int ISA_SLASH = 4;   // After ETAGO - "</"
    private static final int ISI_TAG = 8;     // Inside tag - "<[a..Z]+"
    private static final int ISP_TAG_X = 9;   // X-switch after TAG's name
    private static final int ISP_TAG_WS = 10; // In WS in TAG
    private static final int ISI_ARG = 11;    // Inside tag's argument (attribute name)
    private static final int ISP_ARG_X = 12;  // X-switch after ARGUMENT's name
    private static final int ISP_ARG_WS = 13; // Inside WS after argument awaiting '='
    private static final int ISP_EQ = 14;     // X-switch after '=' in TAG's ARGUMENT
    private static final int ISP_EQ_WS = 15;  // In WS after '='
    private static final int ISI_VAL = 16;    // Non-quoted value
    private static final int ISI_VAL_QUOT = 17;   // quoted value
    private static final int ISI_VAL_QUOT_EL = 18;   // in EL in quoted value
    private static final int ISA_SGML_ESCAPE = 19;  // After "<!"
    private static final int ISI_SCRIPT_CONTENT_AFTER_LT = 36; //after < in script content
    private static final int ISI_SCRIPT_CONTENT_ENDTAG = 37; //presumably after "</" in script content - original comment garbled, TODO confirm
    private static final int ISI_STYLE_CONTENT_AFTER_LT = 39; //after < in style content
    private static final int ISI_STYLE_CONTENT_ENDTAG = 40; //presumably after "</" in style content - original comment garbled, TODO confirm
    private static final int ISI_XML_PI_QM = 48; //after ? in XML PI

    private static final int ISI_EL = 49; //inside custom EL, after an open delimiter, e.g. {{.....}}

    /**
     * Lower-case names of the attributes that are treated as JavaScript event handlers.
     * See http://www.w3.org/TR/html401/interact/scripts.html
     */
    static final Set<String> EVENT_HANDLER_NAMES = new HashSet<>();
    static {
        EVENT_HANDLER_NAMES.add("onload"); // NOI18N
        EVENT_HANDLER_NAMES.add("onunload"); // NOI18N
        EVENT_HANDLER_NAMES.add("onclick"); // NOI18N
        EVENT_HANDLER_NAMES.add("ondblclick"); // NOI18N
        EVENT_HANDLER_NAMES.add("onmousedown"); // NOI18N
        EVENT_HANDLER_NAMES.add("onmouseup"); // NOI18N
        EVENT_HANDLER_NAMES.add("onmouseover"); // NOI18N
        EVENT_HANDLER_NAMES.add("onmousemove"); // NOI18N
        EVENT_HANDLER_NAMES.add("onmouseout"); // NOI18N
        EVENT_HANDLER_NAMES.add("onfocus"); // NOI18N
        EVENT_HANDLER_NAMES.add("onblur"); // NOI18N
        EVENT_HANDLER_NAMES.add("onkeypress"); // NOI18N
        EVENT_HANDLER_NAMES.add("onkeydown"); // NOI18N
        EVENT_HANDLER_NAMES.add("onkeyup"); // NOI18N
        EVENT_HANDLER_NAMES.add("onsubmit"); // NOI18N
        EVENT_HANDLER_NAMES.add("onreset"); // NOI18N
        EVENT_HANDLER_NAMES.add("onselect"); // NOI18N
        EVENT_HANDLER_NAMES.add("onchange"); // NOI18N
        EVENT_HANDLER_NAMES.add("ondrag"); // NOI18N
        EVENT_HANDLER_NAMES.add("ondrop"); // NOI18N

        // IMPORTANT - if you add any that DON'T start with "on" here, make sure you
        // update the optimized prefix check in isJavascriptEventHandlerName, which
        // only consults this set for names starting with "on". That method also
        // lower-cases the name before the lookup, so entries must be lower case.
    }

    // NOTE(review): presumably the script "type" value that gets JavaScript
    // treatment - its use is not visible in this part of the file, confirm.
    private static final String SUPPORTED_SCRIPT_TYPE = "text/javascript"; //NOI18N

    //flyweight token images: fixed texts of the operator and tag-delimiter tokens
    private static final String IMG_EQUAL_SIGN = "="; //NOI18N
    private static final String IMG_CLOSE_TAG_SYMBOL = ">"; //NOI18N
    private static final String IMG_CLOSE_TAG_SYMBOL2 = "/>"; //NOI18N
    private static final String IMG_OPEN_TAG_SYMBOL = "<"; //NOI18N
    private static final String IMG_OPEN_TAG_SYMBOL2 = " info) {
        this.input = info.input();
        this.tokenFactory = info.tokenFactory();
        if (info.state() == null) {
            this.lexerSubState = INIT;
            this.lexerState = INIT;
            this.lexerEmbeddingState = INIT;
            this.customELIndex = INIT;
            this.quoteType = false;
        } else {
            CompoundState cs = (CompoundState) info.state();
            lexerState = cs.lexerState;
            lexerSubState = cs.lexerSubState;
            lexerEmbeddingState = cs.lexerEmbeddingState;
            attribute = cs.attribute;
            tag = cs.tag;
            customELIndex = cs.customELIndex;
            quoteType = cs.quoteType;
        }

        InputAttributes inputAttributes = info.inputAttributes();
        if (inputAttributes != null) {
            cssClassTagAttrMap = (Map>)inputAttributes.getValue(
                    LanguagePath.get(HTMLTokenId.language()), CSS_CLASS_MAP_PROPERTY_KEY); //NOI18N
        }
    }

    /**
     * Tells whether the given character is a basic Latin letter.
     *
     * @param character the character (code point) to test
     * @return true for 'a'..'z' and 'A'..'Z', false for anything else
     */
    private boolean isAZ( int character ) {
        final boolean lowerCase = 'a' <= character && character <= 'z';
        final boolean upperCase = 'A' <= character && character <= 'Z';
        return lowerCase || upperCase;
    }

    /**
     * Tells whether the given character may be part of a tag name.
     *
     * @param character the character (code point) to test
     * @return true for letters, digits and the characters '-', '_', '.' and ':'
     */
    private boolean isName( int character ) {
        switch (character) {
            case '-':
            case '_':
            case '.':
            case ':':
                return true;
            default:
                return Character.isLetterOrDigit(character);
        }
    }

    /**
     * Tells whether the given character may be part of an attribute name.
     *
     * @param character the character (code point) to test
     * @return false for whitespace, '/', '&gt;', '&lt;', '=' and the NUL character,
     *         true for everything else
     */
    private boolean isAttributeName( int character ) {
        switch (character) {
            case '/':
            case '>':
            case '<':
            case '=':
            case 0:
                return false;
            default:
                return !Character.isWhitespace(character);
        }
    }

    /**
     * Resolves whether the given char is whitespace in terms of the HTML 4.0 specs.
     * According to the specs, the following characters are treated as whitespace:
     * Space - '\u0020', Tab - '\u0009',
     * Formfeed - '\u000C', Zero-width space - '\u200B',
     * Carriage return - '\u000D' and Line feed - '\u000A'.
     * CRs are included for completeness only, they should never appear in a document.
     */

    private boolean isWS( int character ) {
        //Why is there the || character == '@'???
        //----------------------------------------
        //See the issue #149968. It is the simplest
        //and not very harmful solution to that.
        //In principle we need to recognize three at signs
        // (@@@) anywhere in the html code and ignore them.
        //This mark can occur in the generated virtual
        //html code and denotes the places where there is
        //some templating language in the real document.
        //To fix this completely properly I would have to
        //either somehow preprocess the text or introduce some
        //more states to the already complicated lexer.
        //The side effect of this change is that a single at sign
        //won't be signalled as an error in the editor and is lexed as whitespace,
        //which doesn't sound too bad.
        //
        //note: the language construct where one generates
        //attribute name doesn't work, but I consider this a quite
        //unusuall: 
="center" /> return Character.isWhitespace(character) || character == '@'; } private boolean isJavascriptEventHandlerName(CharSequence attributeName) { if(attributeName == null) { return false; } if(attributeName.length() > 2) { char firstChar = attributeName.charAt(0); char secondChar = attributeName.charAt(1); if((firstChar == 'o' || firstChar == 'O') && (secondChar == 'n' || secondChar == 'N')) { return EVENT_HANDLER_NAMES.contains(attributeName.toString().toLowerCase(Locale.ENGLISH)); } } return false; } private boolean isStyleAttributeName(CharSequence chs) { if(chs == null) { return false; } outer: for (int j = 0; j < STYLE_ATTRS.length; j++) { if (chs.length() == STYLE_ATTRS[j].length()) { for (int i = 0; i < chs.length(); i++) { if (Character.toLowerCase(chs.charAt(i)) != Character.toLowerCase(STYLE_ATTRS[j].charAt(i))) { continue outer; } } return true; } } return false; } private CharSequence getScriptType(CharSequence attributeValue, boolean quoted) { char lastChar = attributeValue.charAt(attributeValue.length() - 1); boolean hasEndQuote = attributeValue.length() > 1 && (lastChar == '\'' || lastChar == '"'); return quoted ? attributeValue.subSequence(1, attributeValue.length() - (hasEndQuote ? 
1 : 0)) : attributeValue; } private boolean followsCloseTag(CharSequence closeTagName) { int actChar; int prev_read = input.readLength(); //remember the size of the read sequence //substract the first read character int read = 0; while(true) { actChar = input.read(); read++; if(!(Character.isLetter(actChar) || Character.isDigit(actChar) || (actChar == '_') || (actChar == '-') || (actChar == ':') || (actChar == '.') || (actChar == '/')) || (actChar == EOF)) { // EOL or not alpha //end of tagname CharSequence tagName = input.readText().subSequence(prev_read, prev_read + read - 1); input.backup(read); //put the lookahead text back to the buffer if(equals(closeTagName, tagName, true, true)) { if(actChar == '>') { return true; } } return false; } } } @Override public Token nextToken() { int actChar; main: while (true) { actChar = input.read(); if (actChar == EOF) { if(input.readLengthEOF() == 1) { return null; //just EOL is read } else { //there is something else in the buffer except EOL //we will return last token now input.backup(1); //backup the EOL, we will return null in next nextToken() call break; } } //System.out.println("HTMLSyntax: parseToken tokenOffset=" + tokenOffset + ", actChar='" + actChar + "', offset=" + offset + ", state=" + getStateName(state) + // ", stopOffset=" + stopOffset + ", lastBuffer=" + lastBuffer); switch( lexerState ) { case INIT: // DONE switch( actChar ) { case '<': lexerState = ISA_LT; continue main; case '&': lexerState = ISA_REF; lexerSubState = ISI_TEXT; continue main; default: lexerState = ISI_TEXT; break; } //fall through to ISI_TEXT case ISI_TEXT: // DONE switch( actChar ) { case '<': case '&': lexerState = INIT; input.backup(1); if(input.readLength() > 0) { //is there any text before & or < ??? 
return token(HTMLTokenId.TEXT); } break; } //custom EL support delimiters: for(byte delimiterIndex = 0; delimiterIndex < customELQuery.getOpenDelimiters().length; delimiterIndex++ ) { String openDelimiter = customELQuery.getOpenDelimiters()[delimiterIndex]; if(openDelimiter == null) { continue; } int alreadyRead = input.readLength(); char read = (char)actChar; //first char is already read for(int i = 0; i < openDelimiter.length(); i++) { char delimChar = openDelimiter.charAt(i); if(read != delimChar) { //no match input.backup(input.readLengthEOF() - alreadyRead); //backup text continue delimiters; //and try next one } if((i+1) < openDelimiter.length()) { //will be next loop, read char read = (char)input.read(); } } //we've found an open delimiter //check if the there was already something read before checking the delimiter, //if so then return it and re-run this step again so then we can return //clean token for the delimiter if(input.readLength() > openDelimiter.length()) { input.backup(openDelimiter.length()); return token(HTMLTokenId.TEXT); } else { //return the open symbol token and switch to "in el" state lexerState = ISI_EL; customELIndex = (byte)(delimiterIndex + 1); //0 is reserved for "no delimiter", 1 means delimiter with index 0 //save the provider's index in the delimiter token's property so once can recognize what should be //the delimiters' content if it is empty //TODO "contentMimetype" INTO API??? 
return token(HTMLTokenId.EL_OPEN_DELIMITER, new HtmlTokenPropertyProvider(EL_EXPRESSION_CONTENT_MIMETYPE_TOKEN_PROPERTY_KEY, customELQuery.getMimeTypes()[delimiterIndex])); } } break; case ISI_EL: delimiters: for(byte delimiterIndex = 0; delimiterIndex < customELQuery.getOpenDelimiters().length; delimiterIndex++ ) { String closeDelimiter = customELQuery.getCloseDelimiters()[delimiterIndex]; if(closeDelimiter == null) { continue; } int alreadyRead = input.readLength(); char read = (char)actChar; //first char is already read for(int i = 0; i < closeDelimiter.length(); i++) { char delimChar = closeDelimiter.charAt(i); if(read != delimChar) { //no match input.backup(input.readLength() - alreadyRead); //backup text continue delimiters; //and try next one } if((i+1) < closeDelimiter.length()) { //will be next loop, read char read = (char)input.read(); } } //we've found a close delimiter //check if the there was already something read before checking the delimiter, //if so then return it and re-run this step again so then we can return //clean token for the delimiter if(input.readLength() > closeDelimiter.length()) { input.backup(closeDelimiter.length()); //save the provider's index in the token's property so we can set the corresponding embdding in HTMLTokenId.language() return token(HTMLTokenId.EL_CONTENT, new HtmlTokenPropertyProvider(EL_CONTENT_PROVIDER_INDEX, (byte)(customELIndex - 1))); } else { //return the open symbol token and switch to "in el" state lexerState = INIT; customELIndex = INIT; return token(HTMLTokenId.EL_CLOSE_DELIMITER); } } break; case ISI_ERROR: // DONE lexerState = INIT; tag = null; return token(HTMLTokenId.ERROR); case ISA_LT: // PENDING other transitions - e.g ' 1) { //lexer restart check, token already returned before last EOF input.backup(1); return token(HTMLTokenId.TAG_OPEN_SYMBOL); } break; } switch( actChar ) { case '/': // ETAGO - ': // Empty start tag <>, RELAXED lexerState = INIT; return token(HTMLTokenId.TAG_CLOSE_SYMBOL); case '!': 
lexerState = ISA_SGML_ESCAPE; break; case '?': lexerState = ISI_XML_PI; break; default: input.backup(1); lexerState = ISI_TEXT; break; } break; case ISI_XML_PI: if(actChar == '?') { lexerState = ISI_XML_PI_QM; break; } //else stay in XML PI break; case ISI_XML_PI_QM: if(actChar == '>') { //XML PI token lexerState = INIT; return token(HTMLTokenId.XML_PI); } else { lexerState = ISI_XML_PI; break; } case ISA_SLASH: // DONE if( isAZ( actChar ) ) { // ': // Empty end tag , RELAXED lexerState = INIT; return token(HTMLTokenId.TAG_CLOSE_SYMBOL); default: // Part of text, e.g. 1) { //lexer restart check, token already returned before last EOF input.backup(1); return token(HTMLTokenId.TAG_CLOSE); } break; case ISP_ENDTAG_X: // DONE if( isWS( actChar ) ) { lexerState = ISP_ENDTAG_WS; break; } tag = null; switch( actChar ) { case '>': // Closing of endtag, e.g. _ lexerState = INIT; return token(HTMLTokenId.TAG_CLOSE_SYMBOL); case '<': // next tag, e.g. 1) { //lexer restart check, token already returned before last EOF input.backup(1); return token(HTMLTokenId.WS); } break; case ISI_TAG: // DONE if( isName( actChar ) ) break; // Still in tag identifier, eat next char lexerState = ISP_TAG_X; if(input.readLength() > 1) { //lexer restart check, token already returned before last EOF input.backup(1); //test if the tagname is SCRIPT tag = input.readText().toString(); if(equals(SCRIPT, tag, true, true)) { lexerEmbeddingState = ISI_SCRIPT; } if(equals(STYLE, tag, true, true)) { lexerEmbeddingState = ISI_STYLE; } return token(HTMLTokenId.TAG_OPEN); } break; case ISP_TAG_X: // DONE if( isWS( actChar ) ) { lexerState = ISP_TAG_WS; break; } if( isAttributeName(actChar) ) { lexerState = ISI_ARG; break; } switch( actChar ) { case '/': lexerState = ISI_TAG_SLASH; break; case '>': switch (lexerEmbeddingState) { case INIT: lexerState = INIT; break; case ISI_SCRIPT: //script w/ "text/html" content type workaround //do lex the script content as normal html code if(scriptType != null && 
"text/html".equalsIgnoreCase(scriptType)) { //NOI18N lexerEmbeddingState = INIT; scriptType = null; lexerState = INIT; } else { lexerState = ISI_SCRIPT_CONTENT; } break; case ISI_STYLE: lexerState = ISI_STYLE_CONTENT; break; } tag = null; return token(HTMLTokenId.TAG_CLOSE_SYMBOL); case '<': tag = null; lexerState = INIT; input.backup(1); break; default: lexerState = ISP_TAG_X_ERROR; break; } break; case ISP_TAG_X_ERROR: if(isWS(actChar)) { lexerState = tag == null ? INIT : ISP_TAG_X; input.backup(1); //backup the WS return token(HTMLTokenId.ERROR); } switch(actChar) { case '/': case '>': lexerState = tag == null ? INIT : ISP_TAG_X; input.backup(1); //lets reread the token again return token(HTMLTokenId.ERROR); } //stay in error break; case ISP_TAG_WS: // DONE if( isWS( actChar ) ) break; // eat all WS lexerState = ISP_TAG_X; if(input.readLength() > 1) { //lexer restart check, token already returned before last EOF input.backup(1); return token(HTMLTokenId.WS); } case ISI_TAG_SLASH: tag = null; switch( actChar ) { case '>': lexerEmbeddingState = INIT; //possibly cancel 'in script' if empty tag found lexerState = INIT; return token(HTMLTokenId.TAG_CLOSE_SYMBOL); default: lexerState = tag == null ? INIT : ISP_TAG_X; input.backup(1); return token(HTMLTokenId.ERROR); } case ISI_SCRIPT_CONTENT: switch( actChar ) { case '<' : lexerState = ISI_SCRIPT_CONTENT_AFTER_LT; break; default: break; } break; case ISI_SCRIPT_CONTENT_AFTER_LT: if (actChar == '/') { if (followsCloseTag(SCRIPT)) { //end of script section found lexerEmbeddingState = INIT; lexerState = INIT; tag = null; String type = scriptType; scriptType = null; input.backup(input.readLength() > 2 ? 
2 : input.readLength()); //backup the ' 0) { //the script has a body return token(HTMLTokenId.SCRIPT, new HtmlTokenPropertyProvider(HTMLTokenId.SCRIPT_TYPE_TOKEN_PROPERTY, type)); //NOI18N } else { break; } } } lexerState = ISI_SCRIPT_CONTENT; break; case ISI_STYLE_CONTENT: switch( actChar ) { case '<' : lexerState = ISI_STYLE_CONTENT_AFTER_LT; break; default: break; } break; case ISI_STYLE_CONTENT_AFTER_LT: if (actChar == '/') { if (followsCloseTag(STYLE)) { //end of script section found lexerEmbeddingState = INIT; lexerState = INIT; tag = null; input.backup(input.readLength() > 2 ? 2 : input.readLength()); //backup the ' 0) { //the script has a body return token(HTMLTokenId.STYLE); } else { break; } } } lexerState = ISI_STYLE_CONTENT; break; case ISI_ARG: // DONE if( isAttributeName(actChar) ) break; // eat next char lexerState = ISP_ARG_X; if(input.readLength() > 1) { //lexer restart check, token already returned before last EOF input.backup(1); attribute =input.readText().toString(); return token(HTMLTokenId.ARGUMENT); } break; case ISP_ARG_X: if( isWS( actChar ) ) { lexerState = ISP_ARG_WS; break; } if( isAttributeName(actChar) ) { lexerState = ISI_ARG; break; } switch( actChar ) { case '/': case '>': input.backup(1); lexerState = ISP_TAG_X; break; case '<': lexerState = INIT; input.backup(1); break; case '=': lexerState = ISP_EQ; return token(HTMLTokenId.OPERATOR); default: lexerState = ISI_ERROR; input.backup(1); break; } break; case ISP_ARG_WS: if( isWS( actChar ) ) break; // Eat all WhiteSpace lexerState = ISP_ARG_X; if(input.readLength() > 1) { //lexer restart check, token already returned before last EOF input.backup(1); return token(HTMLTokenId.WS); } break; case ISP_EQ: if( isWS( actChar ) ) { lexerState = ISP_EQ_WS; break; } switch( actChar ) { case '\'': quoteType = false; lexerState = ISI_VAL_QUOT; break; case '"': quoteType = true; lexerState = ISI_VAL_QUOT; break; case '/': case '>': case '<': input.backup(1); lexerState = ISP_TAG_X; break; 
default: lexerState = ISI_VAL; //everything else if attribute value break; } break; case ISP_EQ_WS: if( isWS( actChar ) ) break; // Consume all WS lexerState = ISP_EQ; if(input.readLength() > 1) { //lexer restart check, token already returned before last EOF input.backup(1); return token(HTMLTokenId.WS); } break; case ISI_VAL: if(actChar == '/') { //slash in unquoted value -- may be there but not followed by >. //In such case IMO the value should be closed char next = (char)input.read(); input.backup(1); //backup the next char if(next != '>') { //continue lexing the value break; } } else if(!isWS(actChar) && actChar != '>' && actChar != '<') { break; //continue lexing the attribute value } //finish lexing the value lexerState = ISP_TAG_X; if(input.readLength() > 1) { //lexer restart check, token already returned before last EOF input.backup(1); Token resolveValueToken = resolveValueToken(); attribute = null; return resolveValueToken; } break; case ISI_VAL_QUOT: //custom EL support delimiters: for(byte delimiterIndex = 0; delimiterIndex < customELQuery.getOpenDelimiters().length; delimiterIndex++ ) { String openDelimiter = customELQuery.getOpenDelimiters()[delimiterIndex]; if(openDelimiter == null) { continue; } int alreadyRead = input.readLength(); char read = (char)actChar; //first char is already read for(int i = 0; i < openDelimiter.length(); i++) { char delimChar = openDelimiter.charAt(i); if(read != delimChar) { //no match input.backup(input.readLength() - alreadyRead); //backup text continue delimiters; //and try next one } if((i+1) < openDelimiter.length()) { //will be next loop, read char read = (char)input.read(); } } //we've found an open delimiter //check if the there was already something read before checking the delimiter, //if so then return it and re-run this step again so then we can return //clean token for the delimiter if(input.readLength() > openDelimiter.length()) { input.backup(openDelimiter.length()); return resolveValueToken(); } else { 
//return the open symbol token and switch to "in el" state lexerState = ISI_VAL_QUOT_EL; customELIndex = (byte)(delimiterIndex + 1); //0 is reserved for "no delimiter", 1 means delimiter with index 0 //save the provider's index in the delimiter token's property so once can recognize what should be //the delimiters' content if it is empty //TODO "contentMimetype" INTO API??? return token(HTMLTokenId.EL_OPEN_DELIMITER, new HtmlTokenPropertyProvider(EL_EXPRESSION_CONTENT_MIMETYPE_TOKEN_PROPERTY_KEY, customELQuery.getMimeTypes()[delimiterIndex])); } } switch (actChar) { case '\\': //may be escaped quote lexerState = ISI_VAL_QUOT_ESC; break; case '\'': case '"': if(actChar == '\'' && !quoteType || actChar == '"' && quoteType) { //reset the 'script embedding will follow state' if the value represents a //type attribute value of a script tag if(equals(SCRIPT, tag, true, true) && equals("type", attribute, true, true)) { //NOI18N //inside script tag scriptType = getScriptType(input.readText(), true).toString(); } lexerState = ISP_TAG_X; Token resolveValueToken = resolveValueToken(); attribute = null; return resolveValueToken; } } break; // else simply consume next char of VALUE case ISI_VAL_QUOT_EL: delimiters: for(byte delimiterIndex = 0; delimiterIndex < customELQuery.getOpenDelimiters().length; delimiterIndex++ ) { String closeDelimiter = customELQuery.getCloseDelimiters()[delimiterIndex]; if(closeDelimiter == null) { continue; } int alreadyRead = input.readLength(); char read = (char)actChar; //first char is already read for(int i = 0; i < closeDelimiter.length(); i++) { char delimChar = closeDelimiter.charAt(i); if(read != delimChar) { //no match input.backup(input.readLength() - alreadyRead); //backup text continue delimiters; //and try next one } if((i+1) < closeDelimiter.length()) { //will be next loop, read char read = (char)input.read(); } } //we've found a close delimiter //check if the there was already something read before checking the delimiter, //if so then 
return it and re-run this step again so then we can return //clean token for the delimiter if(input.readLength() > closeDelimiter.length()) { input.backup(closeDelimiter.length()); //save the provider's index in the token's property so we can set the corresponding embdding in HTMLTokenId.language() return token(HTMLTokenId.EL_CONTENT, new HtmlTokenPropertyProvider(EL_CONTENT_PROVIDER_INDEX, (byte)(customELIndex - 1))); } else { //return the close symbol token and switch to "in value" state lexerState = ISI_VAL_QUOT; customELIndex = INIT; return token(HTMLTokenId.EL_CLOSE_DELIMITER); } } break; case ISI_VAL_QUOT_ESC: //Just consume the escaped char. //The state prevents the quoted value //to be finished by an escaped quote. lexerState = ISI_VAL_QUOT; break; case ISA_SGML_ESCAPE: // DONE if( isAZ(actChar) ) { lexerState = ISI_SGML_DECL; break; } switch( actChar ) { case '-': lexerState = ISA_SGML_DASH; break; default: lexerState = ISI_TEXT; input.backup(1); continue; } break; case ISA_SGML_DASH: // DONE switch( actChar ) { case '-': lexerState = ISI_HTML_COMMENT; break; default: lexerState = ISI_TEXT; input.backup(1); continue; } break; case ISI_HTML_COMMENT: // DONE switch( actChar ) { case '-': lexerState = ISA_HTML_COMMENT_DASH; break; //create an HTML comment token for each line of the comment - a performance fix for #43532 case '\n': //leave the some state - we are still in an HTML comment, //we just need to create a token for each line. 
return token(HTMLTokenId.BLOCK_COMMENT); } break; case ISA_HTML_COMMENT_DASH: switch( actChar ) { case '-': lexerState = ISI_HTML_COMMENT_WS; break; default: lexerState = ISI_HTML_COMMENT; continue; } break; case ISI_HTML_COMMENT_WS: // DONE switch( actChar ) { case '>': lexerState = INIT; return token(HTMLTokenId.BLOCK_COMMENT); default: lexerState = ISI_HTML_COMMENT; input.backup(2); //backup everything except the first comma break; } break; case ISI_SGML_DECL: if(Character.isWhitespace(actChar)) { lexerState = ISI_SGML_DECL_WS; if(input.readLength() > 1) { input.backup(1); //backup the whitespace return token(HTMLTokenId.DECLARATION); } break; } switch( actChar ) { case '>': if(input.readLength() > 1) { input.backup(1); //backup the '<' char return token(HTMLTokenId.DECLARATION); } else { //just the symbol read - return it as a part of declaration lexerState = INIT; return token(HTMLTokenId.DECLARATION); } } break; case ISI_SGML_DECL_WS: if(actChar == '-') { if( input.readLength() == 1 ) { lexerState = ISA_SGML_DECL_DASH; break; } else { if(input.readLength() > 1) { //lexer restart check, token already returned before last EOF input.backup(1); return token(HTMLTokenId.DECLARATION); } } } else if(!Character.isWhitespace(actChar)) { lexerState = ISI_SGML_DECL; input.backup(1); return token(HTMLTokenId.WS); } break; case ISA_SGML_DECL_DASH: if( actChar == '-' ) { lexerState = ISI_SGML_COMMENT; break; } else { lexerState = ISI_SGML_DECL; input.backup(1); continue; } case ISI_SGML_COMMENT: switch( actChar ) { case '-': lexerState = ISA_SGML_COMMENT_DASH; break; } break; case ISA_SGML_COMMENT_DASH: if( actChar == '-' ) { lexerState = ISI_SGML_DECL; return token(HTMLTokenId.SGML_COMMENT); } else { lexerState = ISI_SGML_COMMENT; input.backup(1); continue; } case ISA_REF: if( isAZ( actChar ) ) { lexerState = ISI_REF_NAME; break; } if( actChar == '#' ) { lexerState = ISA_REF_HASH; break; } lexerState = lexerSubState; input.backup(1); continue; case ISI_REF_NAME: if( 
isName( actChar ) ) break; lexerState = lexerSubState; if(input.readLength() > 1) { //lexer restart check, token already returned before last EOF if( actChar != ';' ) { input.backup(1); return token(HTMLTokenId.TEXT); } return token(HTMLTokenId.CHARACTER); } break; case ISA_REF_HASH: if( actChar >= '0' && actChar <= '9' ) { lexerState = ISI_REF_DEC; break; } if( actChar == 'x' || actChar == 'X' ) { lexerState = ISA_REF_X; break; } if( isAZ( actChar ) ) { lexerState = lexerSubState; return token(HTMLTokenId.ERROR); } lexerState = lexerSubState; input.backup(1); continue; case ISI_REF_DEC: if( actChar >= '0' && actChar <= '9' ) break; lexerState = lexerSubState; if(input.readLength() > 1) { //lexer restart check, token already returned before last EOF if( actChar != ';' ) input.backup(1); return token(HTMLTokenId.CHARACTER); } break; case ISA_REF_X: if( (actChar >= '0' && actChar <= '9') || (actChar >= 'a' && actChar <= 'f') || (actChar >= 'A' && actChar <= 'F') ) { lexerState = ISI_REF_HEX; break; } lexerState = lexerSubState; if(input.readLength() > 1) { //lexer restart check, token already returned before last EOF input.backup(1); return token(HTMLTokenId.ERROR); // error on previous "&#x" sequence } break; case ISI_REF_HEX: if( (actChar >= '0' && actChar <= '9') || (actChar >= 'a' && actChar <= 'f') || (actChar >= 'A' && actChar <= 'F') ) break; lexerState = lexerSubState; if(input.readLength() > 1) { //lexer restart check, token already returned before last EOF if( actChar != ';' ) input.backup(1); return token(HTMLTokenId.CHARACTER); } break; } } // end of while(offset...) /** At this stage there's no more text in the scanned buffer. * Scanner first checks whether this is completely the last * available buffer. 
*/ switch( lexerState ) { case INIT: if (input.readLength() == 0) { return null; } break; case ISI_TEXT: case ISA_LT: case ISA_SLASH: case ISA_SGML_ESCAPE: case ISA_SGML_DASH: case ISI_TAG_SLASH: return token(HTMLTokenId.TEXT); case ISI_XML_PI: case ISI_XML_PI_QM: return token(HTMLTokenId.XML_PI); case ISA_REF: case ISA_REF_HASH: if( lexerSubState == ISI_TEXT ) return token(HTMLTokenId.TEXT); else return token(HTMLTokenId.VALUE); case ISI_HTML_COMMENT: case ISA_HTML_COMMENT_DASH: case ISI_HTML_COMMENT_WS: return token(HTMLTokenId.BLOCK_COMMENT); case ISI_TAG: lexerState = ISP_TAG_X; //test if the tagname is SCRIPT if(equals(SCRIPT, input.readText(), true, true)) { lexerEmbeddingState = ISI_SCRIPT; } if(equals(STYLE, input.readText(), true, true)) { lexerEmbeddingState = ISI_STYLE; } return token(HTMLTokenId.TAG_OPEN); case ISI_ENDTAG: return token(HTMLTokenId.TAG_CLOSE); case ISI_ARG: return token(HTMLTokenId.ARGUMENT); case ISI_ERROR: case ISP_TAG_X_ERROR: return token(HTMLTokenId.ERROR); case ISP_ARG_WS: case ISP_TAG_WS: case ISP_ENDTAG_WS: case ISP_EQ_WS: return token(HTMLTokenId.WS); case ISP_ARG_X: case ISP_TAG_X: case ISP_ENDTAG_X: case ISP_EQ: return token(HTMLTokenId.WS); case ISI_VAL: case ISI_VAL_QUOT: case ISI_VAL_QUOT_ESC: return resolveValueToken(); case ISI_SGML_DECL: case ISA_SGML_DECL_DASH: case ISI_SGML_DECL_WS: return token(HTMLTokenId.DECLARATION); case ISI_SGML_COMMENT: case ISA_SGML_COMMENT_DASH: return token(HTMLTokenId.SGML_COMMENT); case ISI_REF_NAME: case ISI_REF_DEC: case ISA_REF_X: case ISI_REF_HEX: return token(HTMLTokenId.TEXT); case ISI_SCRIPT_CONTENT: case ISI_SCRIPT_CONTENT_ENDTAG: case ISI_SCRIPT_CONTENT_AFTER_LT: return token(HTMLTokenId.SCRIPT); case ISI_STYLE_CONTENT: case ISI_STYLE_CONTENT_ENDTAG: case ISI_STYLE_CONTENT_AFTER_LT: return token(HTMLTokenId.STYLE); case ISI_EL: case ISI_VAL_QUOT_EL: return token(HTMLTokenId.EL_CONTENT, new HtmlTokenPropertyProvider(EL_CONTENT_PROVIDER_INDEX, (byte)(customELIndex - 1))); } assert 
input.readLength() == 0 : "Returning null even if some chars still needs to be tokenized! "
                + "lexer state=" + lexerState + "; "
                + "lexer substate=" + lexerSubState + "; "
                + "lexer embedding state=" + lexerEmbeddingState + "; "
                + "readtext='" + input.readText() + "'";

        return null;
    }

    private static final String CLASS_ATTR_NAME = "class"; //NOI18N
    private static final String ID_ATTR_NAME = "id"; //NOI18N

    /**
     * Creates the token for the attribute value that has just been read.
     * <p>
     * The kind of token is decided from the current {@code attribute} and
     * {@code tag}, checked in this order:
     * <ol>
     * <li>javascript event handler attribute - {@code VALUE_JAVASCRIPT} token,</li>
     * <li>style attribute (see {@code isStyleAttributeName}) - css value token,</li>
     * <li>attribute registered for the tag in {@code cssClassTagAttrMap} -
     *     {@code VALUE_CSS} token carrying the "class" token property,</li>
     * <li>attribute for which a lexer plugin supplies an embedding mimetype -
     *     {@code VALUE} token carrying that mimetype as a property,</li>
     * <li>anything else - plain {@code VALUE} token.</li>
     * </ol>
     *
     * @return the attribute value token
     */
    private Token resolveValueToken() {
        assert attribute != null;

        //onclick and similar method javascript embedding
        if (isJavascriptEventHandlerName(attribute)) {
            return token(HTMLTokenId.VALUE_JAVASCRIPT);
        }

        //style, id or class attribute value css embedding
        if (isStyleAttributeName(attribute)) {
            return createCssValueToken();
        }

        //generic css "class" embedding
        if (cssClassTagAttrMap != null && tag != null) {
            Collection attrs = cssClassTagAttrMap.get(tag);
            if (attrs != null && attrs.contains(attribute)) {
                //the attribute's value should have css "class" selector embedding
                return token(HTMLTokenId.VALUE_CSS, CLASS_TOKEN_PP);
            }
        }

        //lexer plugins:
        String embeddingMimeType = HtmlPlugins.getDefault().createAttributeEmbedding(tag, attribute);
        if (embeddingMimeType != null) {
            LOGGER.log(Level.FINE, "creating html attribute value token {0} in tag {1} with embedding {2}",
                    new Object[]{attribute, tag, embeddingMimeType});
            return token(HTMLTokenId.VALUE,
                    new HtmlTokenPropertyProvider(ATTRIBUTE_VALUE_EMBEDDING_MIMETYPE_TOKEN_PROPERTY_KEY, embeddingMimeType));
        }

        return token(HTMLTokenId.VALUE);
    }

    /**
     * Creates a {@code VALUE_CSS} token for the current attribute.
     * <p>
     * The "class" and "id" attributes (compared case insensitively) get the
     * corresponding token property provider, any other attribute gets none.
     *
     * @return the css value token
     */
    private Token createCssValueToken() {
        TokenPropertyProvider provider;
        if (equals(CLASS_ATTR_NAME, attribute, true, true)) {
            provider = CLASS_TOKEN_PP;
        } else if (equals(ID_ATTR_NAME, attribute, true, true)) {
            provider = ID_TOKEN_PP;
        } else {
            provider = null;
        }

        return token(HTMLTokenId.VALUE_CSS, provider);
    }

    /**
     * Creates a token of the given id without any token property provider.
     *
     * @param tokenId id of the created token
     * @return the created token
     */
    private Token token(HTMLTokenId tokenId) {
        return token(tokenId, null);
    }

    /**
     * Creates a token of the given id for the text currently read from the input.
     * <p>
     * When a property provider is given a property token is created. Otherwise
     * a flyweight token is returned for the well known images (the equal sign,
     * the tag open/close symbols and cached tag and attribute names) and a
     * regular token in all other cases.
     *
     * @param tokenId id of the created token
     * @param tokenPropertyProvider provider of the token properties, may be {@code null}
     * @return the created token
     */
    private Token token(HTMLTokenId tokenId, TokenPropertyProvider tokenPropertyProvider) {
        if (LOG) {
            if (input.readLength() == 0) {
                LOGGER.log(Level.INFO, "Found zero length token: "); //NOI18N
            }
            LOGGER.log(Level.INFO, "[{0}] token (''{1}''; id={2}; state={3})\n",
                    new Object[]{this.getClass().getSimpleName(), input.readText().toString(), tokenId, state()}); //NOI18N
        }

        if (tokenPropertyProvider != null) {
            return tokenFactory.createPropertyToken(tokenId, input.readLength(), tokenPropertyProvider);
        }

        CharSequence image = input.readText();
        switch (tokenId) {
            case OPERATOR:
                return tokenFactory.getFlyweightToken(tokenId, IMG_EQUAL_SIGN);

            case TAG_CLOSE_SYMBOL:
                switch (image.charAt(0)) {
                    case '/':
                        if (input.readLength() > 1 && image.charAt(1) == '>') {
                            return tokenFactory.getFlyweightToken(tokenId, IMG_CLOSE_TAG_SYMBOL2);
                        }
                        break;
                    case '>':
                        return tokenFactory.getFlyweightToken(tokenId, IMG_CLOSE_TAG_SYMBOL);
                    default:
                        break;
                }
                //no flyweight image matched - do not fall through to the
                //tag name cache lookup, create a regular token instead
                break;

            case TAG_OPEN_SYMBOL:
                if (image.charAt(0) == '<') {
                    if (input.readLength() > 1) {
                        if (image.charAt(1) == '/') {
                            return tokenFactory.getFlyweightToken(tokenId, IMG_OPEN_TAG_SYMBOL2);
                        }
                    } else {
                        return tokenFactory.getFlyweightToken(tokenId, IMG_OPEN_TAG_SYMBOL);
                    }
                }
                //no flyweight image matched - do not fall through to the
                //tag name cache lookup, create a regular token instead
                break;

            case TAG_OPEN:
            case TAG_CLOSE:
                String cachedTagName = HtmlElements.getCachedTagName(image);
                if (cachedTagName != null) {
                    assert (cachedTagName.length() <= input.readLength()) : "readlength == " + input.readLength() + "; text=" + cachedTagName + "; image=" + image;
                    return tokenFactory.getFlyweightToken(tokenId, cachedTagName);
                }
                break;

            case ARGUMENT:
                String cachedAttrName = HtmlElements.getCachedAttrName(image);
                if (cachedAttrName != null) {
                    assert (cachedAttrName.length() <= input.readLength()) : "readlength == " + input.readLength() + "; text=" + cachedAttrName + "; image=" + image;
                    return tokenFactory.getFlyweightToken(tokenId, cachedAttrName);
                }
                break;

            default:
                break;
        }

        return tokenFactory.createToken(tokenId);
    }

    /** Does nothing. */
    @Override
    public void release() {
    }

    /**
     * Compares the content of two character sequences.
     *
     * @param text1 first sequence, may be {@code null}
     * @param text2 second sequence, may be {@code null}
     * @param ignoreCase if {@code true} the characters are compared after
     * being converted by {@link Character#toLowerCase(char)}
     * @param optimized only relevant when {@code ignoreCase} is set: the first
     * sequence is taken as being lowercase already, so only the characters of
     * the second sequence are converted (one call to
     * {@code Character.toLowerCase()} per compared pair)
     * @return {@code true} if both arguments are the same instance (including
     * both being {@code null}) or have the same length and equal characters
     */
    private static boolean equals(CharSequence text1, CharSequence text2, boolean ignoreCase, boolean optimized) {
        if (text1 == text2) {
            return true;
        }
        if (text1 == null || text2 == null) {
            return false;
        }
        if (text1.length() != text2.length()) {
            return false;
        }

        //compare content
        boolean lowerFirst = ignoreCase && !optimized;
        for (int i = 0; i < text1.length(); i++) {
            char ch1 = lowerFirst ? Character.toLowerCase(text1.charAt(i)) : text1.charAt(i);
            char ch2 = ignoreCase ? Character.toLowerCase(text2.charAt(i)) : text2.charAt(i);
            if (ch1 != ch2) {
                return false;
            }
        }
        return true;
    }

    /**
     * Token property provider holding exactly one key-value pair.
     */
    private static class HtmlTokenPropertyProvider implements TokenPropertyProvider {

        private final String key;
        private final Object value;

        HtmlTokenPropertyProvider(String key, Object value) {
            this.key = key;
            this.value = value;
        }

        /**
         * @return the held value if the given key equals the held key,
         * {@code null} otherwise
         */
        @Override
        public Object getValue(Token token, Object key) {
            if (this.key.equals(key)) {
                return value;
            } else {
                return null;
            }
        }
    }

    private static final TokenPropertyProvider CLASS_TOKEN_PP =
            new HtmlTokenPropertyProvider(HTMLTokenId.VALUE_CSS_TOKEN_TYPE_PROPERTY, HTMLTokenId.VALUE_CSS_TOKEN_TYPE_CLASS);
    private static final TokenPropertyProvider ID_TOKEN_PP =
            new HtmlTokenPropertyProvider(HTMLTokenId.VALUE_CSS_TOKEN_TYPE_PROPERTY, HTMLTokenId.VALUE_CSS_TOKEN_TYPE_ID);

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy