// org.netbeans.lib.html.lexer.HtmlLexer Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.netbeans.lib.html.lexer;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.netbeans.api.html.lexer.HTMLTokenId;
import org.netbeans.api.html.lexer.HtmlLexerPlugin;
import org.netbeans.api.lexer.InputAttributes;
import org.netbeans.api.lexer.LanguagePath;
import org.netbeans.api.lexer.Token;
import org.netbeans.spi.lexer.Lexer;
import org.netbeans.spi.lexer.LexerInput;
import org.netbeans.spi.lexer.LexerRestartInfo;
import org.netbeans.spi.lexer.TokenFactory;
import org.netbeans.spi.lexer.TokenPropertyProvider;
/**
* Lexical analyzer for HTML. Based on original HTML lexer from html/editor module.
*
* @author Petr Nejedly
* @author Miloslav Metelka
* @author Jan Lahoda
* @author Marek Fukala
* @version 1.00
*/
public final class HtmlLexer implements Lexer {
/** Logger named after this class. */
private static final Logger LOGGER = Logger.getLogger(HtmlLexer.class.getName());
/** Debug switch, read once at class initialization from the boolean system property "j2ee_lexer_debug". */
private static final boolean LOG = Boolean.getBoolean("j2ee_lexer_debug"); //NOI18N
/** Local alias for {@link LexerInput#EOF}. */
private static final int EOF = LexerInput.EOF;
/** Input of this lexer. NOTE(review): presumably assigned by a constructor that is outside this part of the file - confirm. */
private final LexerInput input;
/** Token factory of this lexer. NOTE(review): presumably assigned together with {@code input} - confirm. */
private final TokenFactory tokenFactory;
/**
 * Value object bundling the lexer's state variables (main state, sub state,
 * embedding state, current attribute and tag names, script type, custom EL
 * index and quote type).
 *
 * Instances are compared field by field and serve as both key and value of
 * the enclosing lexer's state cache, so {@link #equals(Object)} and
 * {@link #hashCode()} must stay consistent with each other.
 */
private static final class CompoundState {

    private int lexerState;
    private int lexerSubState;
    private int lexerEmbeddingState;
    private byte customELIndex;
    private String attribute;
    private String tag;
    private String scriptType;
    private boolean quoteType;

    /**
     * Creates a state holding exactly the given values.
     *
     * @param lexerState main lexer state
     * @param lexerSubState lexer sub state
     * @param lexerEmbeddingState embedding state
     * @param attributeName attribute name, may be null
     * @param tagName tag name, may be null
     * @param scriptType script type, may be null
     * @param customELIndex custom EL index
     * @param quoteType quote type flag
     */
    public CompoundState(int lexerState, int lexerSubState, int lexerEmbeddingState, String attributeName, String tagName, String scriptType, byte customELIndex, boolean quoteType) {
        this.lexerState = lexerState;
        this.lexerSubState = lexerSubState;
        this.lexerEmbeddingState = lexerEmbeddingState;
        this.customELIndex = customELIndex;
        this.attribute = attributeName;
        this.tag = tagName;
        this.scriptType = scriptType;
        this.quoteType = quoteType;
    }

    /**
     * Two states are equal when they are of the same class and every field
     * matches; string fields are compared null-safely.
     */
    @Override
    public boolean equals(Object obj) {
        if (obj == null || getClass() != obj.getClass()) {
            return false;
        }
        final CompoundState that = (CompoundState) obj;
        return lexerState == that.lexerState
                && lexerSubState == that.lexerSubState
                && lexerEmbeddingState == that.lexerEmbeddingState
                && sameText(attribute, that.attribute)
                && sameText(tag, that.tag)
                && sameText(scriptType, that.scriptType)
                && customELIndex == that.customELIndex
                && quoteType == that.quoteType;
    }

    /** Null-safe string comparison: identical references, or a non-null left side equal to the right side. */
    private static boolean sameText(String left, String right) {
        return left == right || (left != null && left.equals(right));
    }

    /**
     * Hash over the same fields as {@link #equals(Object)}. The custom EL
     * index contributes only when positive and the quote type only in the
     * quoted-value states, so the hash is unaffected outside those cases.
     */
    @Override
    public int hashCode() {
        int result = 3;
        result = result * 17 + lexerState;
        result = result * 17 + lexerSubState;
        result = result * 17 + lexerEmbeddingState;
        result = result * 17 + hashOrZero(attribute);
        result = result * 17 + hashOrZero(tag);
        result = result * 17 + hashOrZero(scriptType);
        // do not alter the hash code if there's no custom EL index set
        if (customELIndex > 0) {
            result = result * 17 + customELIndex;
        }
        // the quote type matters only while inside a quoted attribute value
        final boolean inQuotedValue = lexerState == ISI_VAL_QUOT
                || lexerState == ISI_VAL_QUOT_EL
                || lexerState == ISI_VAL_QUOT_ESC;
        if (inQuotedValue) {
            result = result * 17 + (quoteType ? 1 : 0);
        }
        return result;
    }

    /** Hash code of the given string, or 0 for null. */
    private static int hashOrZero(String text) {
        return text == null ? 0 : text.hashCode();
    }

    /**
     * Debug representation: hash code and main state always, sub state and
     * embedding state only when positive, the string fields only when set.
     */
    @Override
    public String toString() {
        final StringBuilder out = new StringBuilder("HLS(hc="); //NOI18N
        out.append(hashCode()).append(",s=").append(lexerState); //NOI18N
        if (lexerSubState > 0) {
            out.append(",ss=").append(lexerSubState); //NOI18N
        }
        if (lexerEmbeddingState > 0) {
            out.append(",es=").append(lexerEmbeddingState); //NOI18N
        }
        if (tag != null) {
            out.append(",tag=").append(tag); //NOI18N
        }
        if (attribute != null) {
            out.append(",attribute=").append(attribute); //NOI18N
        }
        if (scriptType != null) {
            out.append(",scriptType=").append(scriptType); //NOI18N
        }
        return out.append(')').toString(); //NOI18N
    }
}
/**
 * Canonical instances of every state returned by {@link #state()} so far;
 * each state maps to itself so an equal, previously returned instance can be
 * found and reused.
 */
private final HashMap<CompoundState, CompoundState> STATES_CACHE = new HashMap<>();

/**
 * Returns the current lexer state as a {@link CompoundState}.
 *
 * A fresh snapshot of the state fields is built on every call, but if an
 * equal snapshot was returned before, that earlier instance is returned
 * instead and the fresh one is discarded.
 *
 * @return the cached instance equal to the current state, never null
 */
@Override
public Object state() {
    //cache the states so lexing of large files does not eat too much memory
    final CompoundState currentState = new CompoundState(lexerState, lexerSubState, lexerEmbeddingState, attribute, tag, scriptType, customELIndex, quoteType);
    final CompoundState cached = STATES_CACHE.get(currentState);
    if (cached != null) {
        return cached;
    }
    // first occurrence of this state: it becomes the canonical instance
    STATES_CACHE.put(currentState, currentState);
    return currentState;
}
// Names of the script and style tags (lower case).
private static final String SCRIPT = "script"; //NOI18N
private static final String STYLE = "style"; //NOI18N
// The attribute names "style", "id" and "class".
// NOTE(review): presumably the attributes whose values get CSS embedding - confirm against the usages.
private static final String[] STYLE_ATTRS = new String[]{"style", "id", "class"}; //NOI18N
/** Internal state of the lexical analyzer before entering the subanalyzer of
* character references. It is initially set to INIT, but before first usage
* it is overwritten with the state which originated the transition to the
* charref subanalyzer.
*/
private int lexerSubState = INIT;
/** Current internal state of the analyzer; starts at INIT and is part of the state snapshot returned by state(). */
private int lexerState = INIT;
/** Attribute name that is part of the lexer state; null until set. NOTE(review): presumably the attribute currently being lexed - confirm. */
private String attribute;
private String tag; //tag name of the current context tag
/**
* Value of the "type" attribute in SCRIPT tag
*/
private String scriptType;
//tag name with namespace prefix to collection of attributes which should have
//css class embedding by default
private Map> cssClassTagAttrMap;
private String CSS_CLASS_MAP_PROPERTY_KEY = "cssClassTagAttrMap"; //NOI18N //semi api
/**
* Indicates whether we are in a script; starts at INIT.
* NOTE(review): presumably holds ISI_SCRIPT or ISI_STYLE while inside the
* corresponding tag body - confirm against the state machine.
*/
private int lexerEmbeddingState = INIT;
/** Custom EL index that is part of the lexer state; starts at INIT (0). Only positive values contribute to CompoundState.hashCode(). */
private byte customELIndex = INIT;
/**
* Indicates the quote type in ISI_VAL_QUOT state.
*
* true means double quote, false single quote.
*/
private boolean quoteType;
/** Public key with the value "elci". NOTE(review): presumably the name of a token property carrying the EL content provider index - confirm. */
public static final String EL_CONTENT_PROVIDER_INDEX = "elci"; //NOI18N
// Internal 'in script' / 'in style' embedding states. The historical comment
// says the 'scriptState' internal state is set to ISI_SCRIPT when the analyzer
// goes into a script tag body.
// NOTE(review): presumably these values are stored in lexerEmbeddingState - confirm.
// Their numeric values overlap with the internal states below.
private static final int ISI_SCRIPT = 1;
private static final int ISI_STYLE = 2;
// Internal states of the analyzer.
// NOTE(review): judging by the comments, the prefixes appear to mean
// ISI = inside, ISA = after, ISP = a decision point ("X-switch") or whitespace.
private static final int INIT = 0;
private static final int ISI_TEXT = 1; // Plain text between tags
private static final int ISI_ERROR = 2; // Syntax error in HTML syntax
private static final int ISA_LT = 3; // After start of tag delimiter - "<"
private static final int ISA_SLASH = 4; // After ETAGO - "</"
private static final int ISI_ENDTAG = 5; // Inside endtag - "</[a..Z]+"
private static final int ISP_ENDTAG_X = 6; // X-switch after ENDTAG's name
private static final int ISP_ENDTAG_WS = 7; // In WS in ENDTAG - e.g. "</a >"
private static final int ISI_TAG = 8; // Inside tag - "<[a..Z]+"
private static final int ISP_TAG_X = 9; // X-switch after TAG's name
private static final int ISP_TAG_WS = 10; // In WS in TAG - e.g. "<a >"
private static final int ISI_ARG = 11; // Inside tag's argument - e.g. "<a href"
private static final int ISP_ARG_X = 12; // X-switch after ARGUMENT's name
private static final int ISP_ARG_WS = 13; // Inside WS after argument awaiting '='
private static final int ISP_EQ = 14; // X-switch after '=' in TAG's ARGUMENT
private static final int ISP_EQ_WS = 15; // In WS after '='
private static final int ISI_VAL = 16; // Non-quoted value
private static final int ISI_VAL_QUOT = 17; // quoted value
private static final int ISI_VAL_QUOT_EL = 18; // in EL in quoted value
private static final int ISA_SGML_ESCAPE = 19; // After the SGML escape - presumably "<!" (the original comment text was garbled)
// NOTE(review): no states with values 20-35 are declared in this part of the
// file, and ISI_VAL_QUOT_ESC (used by CompoundState.hashCode()) is not declared
// here either - confirm that they are declared elsewhere.
private static final int ISI_SCRIPT_CONTENT_AFTER_LT = 36; //after < in script content
private static final int ISI_SCRIPT_CONTENT_ENDTAG = 37; //after </ in script content
private static final int ISI_STYLE_CONTENT = 38; //after