All Downloads are FREE. Search and download functionalities are using the official Maven repository.

jodd.lagarto.LagartoParser Maven / Gradle / Ivy

Go to download

Jodd Lagarto is a fast and versatile all-purpose HTML parser. It includes Jerry and CSSelly.

There is a newer version: 6.0.6
Show newest version
// Copyright (c) 2003-present, Jodd Team (http://jodd.org)
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.

package jodd.lagarto;

import jodd.util.ArraysUtil;
import jodd.util.CharArraySequence;
import jodd.util.CharUtil;
import jodd.util.StringPool;
import jodd.util.UnsafeUtil;
import jodd.net.HtmlDecoder;

import static jodd.util.CharUtil.equalsOne;
import static jodd.util.CharUtil.isAlpha;
import static jodd.util.CharUtil.isDigit;

/**
 * HTML/XML content parser/tokenizer using {@link TagVisitor} for callbacks.
 * Works by the HTML5 specs for tokenization, as described
 * on WhatWG.
 * Differences from the specs:
 *
 * <ul>
 * <li>text is emitted as a block of text, and not character by character.</li>
 * <li>tag name case (and letter case of other entities) is not changed, but case-sensitive
 * information exists for matching.</li>
 * <li>the whole tokenization process is implemented here, without going into the tree building.
 * This applies for switching to the RAWTEXT state.</li>
 * <li>script tag is emitted separately</li>
 * <li>conditional comments added</li>
 * <li>xml states and callbacks added</li>
 * </ul>
 *

* There are two ways how text is passed back to the visitor. * By default it is passed as CharBuffer, which * gives excellent performances. However, if you need more Strings * than enable it, and all text will be strings. This is faster * then first converting to char buffer and then to strings. */ public class LagartoParser extends Scanner { protected TagVisitor visitor; protected ParsedTag tag; protected ParsedDoctype doctype; protected long parsingTime; /** * Creates parser on char array. */ public LagartoParser(final char[] charArray) { initialize(charArray); } /** * Creates parser on a String. */ public LagartoParser(final String string) { initialize(UnsafeUtil.getChars(string)); } /** * Initializes parser. */ @Override protected void initialize(final char[] input) { super.initialize(input); this.tag = new ParsedTag(); this.doctype = new ParsedDoctype(); this.text = new char[1024]; this.textLen = 0; this.parsingTime = -1; } // ---------------------------------------------------------------- configuration protected LagartoParserConfig config = new LagartoParserConfig(); /** * Returns {@link jodd.lagarto.LagartoParserConfig configuration} for the parser. */ public LagartoParserConfig getConfig() { return config; } /** * Sets parser configuration. */ public void setConfig(final LagartoParserConfig config) { this.config = config; } // ---------------------------------------------------------------- parse protected boolean parsing; /** * Parses content and callback provided {@link TagVisitor}. */ public void parse(final TagVisitor visitor) { tag.init(config.caseSensitive); this.parsingTime = System.currentTimeMillis(); this.visitor = visitor; visitor.start(); parsing = true; while (parsing) { state.parse(); } emitText(); visitor.end(); this.parsingTime = System.currentTimeMillis() - parsingTime; } /** * Returns parsing time in milliseconds. 
*/ public long getParsingTime() { return parsingTime; } // ---------------------------------------------------------------- start & end /** * Data state. */ protected State DATA_STATE = new State() { @Override public void parse() { while (true) { ndx++; if (isEOF()) { emitText(); parsing = false; return; } char c = input[ndx]; if (c == '<') { emitText(); state = TAG_OPEN; return; } if (c == '&') { consumeCharacterReference(); continue; } textEmitChar(c); } } }; protected void consumeCharacterReference(final char allowedChar) { ndx++; if (isEOF()) { return; } char c = input[ndx]; if (c == allowedChar) { ndx--; return; } _consumeAttrCharacterReference(); } protected void consumeCharacterReference() { ndx++; if (isEOF()) { return; } _consumeCharacterReference(); } private void _consumeCharacterReference() { int unconsumeNdx = ndx - 1; char c = input[ndx]; if (equalsOne(c, CONTINUE_CHARS)) { ndx = unconsumeNdx; textEmitChar('&'); return; } if (c == '#') { _consumeNumber(unconsumeNdx); } else { String name = HtmlDecoder.detectName(input, ndx); if (name == null) { // this error is not quite as by the spec. The spec says that // only a sequence of alphanumeric chars ending with semicolon // gives na error errorCharReference(); textEmitChar('&'); ndx = unconsumeNdx; return; } // missing legacy attribute thing ndx += name.length(); textEmitChars(HtmlDecoder.lookup(name)); c = input[ndx]; if (c != ';') { errorCharReference(); ndx--; } } } private void _consumeAttrCharacterReference() { final int unconsumeNdx = ndx - 1; char c = input[ndx]; if (equalsOne(c, CONTINUE_CHARS)) { ndx = unconsumeNdx; textEmitChar('&'); return; } if (c == '#') { _consumeNumber(unconsumeNdx); } else { final String name = HtmlDecoder.detectName(input, ndx); if (name == null) { // this error is not quite as by the spec. 
The spec says that // only a sequence of alphanumeric chars ending with semicolon // gives na error errorCharReference(); textEmitChar('&'); ndx = unconsumeNdx; return; } // missing legacy attribute thing ndx += name.length(); c = input[ndx]; if (c == ';') { textEmitChars(HtmlDecoder.lookup(name)); } else { textEmitChar('&'); ndx = unconsumeNdx; } } } private void _consumeNumber(final int unconsumeNdx) { ndx++; if (isEOF()) { ndx = unconsumeNdx; return; } char c = input[ndx]; int value = 0; int digitCount = 0; if (c == 'X' || c == 'x') { while (true) { ndx++; if (isEOF()) { ndx = unconsumeNdx; return; } c = input[ndx]; if (isDigit(c)) { value *= 16; value += c - '0'; digitCount++; } else if ((c >= 'a') && (c <= 'f')) { value *= 16; value += c - 'a' + 10; digitCount++; } else if ((c >= 'A') && (c <= 'F')) { value *= 16; value += c - 'A' + 10; digitCount++; } else { break; } } } else { while (isDigit(c)) { value *= 10; value += c - '0'; ndx++; if (isEOF()) { ndx = unconsumeNdx; return; } c = input[ndx]; digitCount++; } } if (digitCount == 0) { // no character matches the range errorCharReference(); ndx = unconsumeNdx; return; } if (c != ';') { errorCharReference(); ndx--; // go back, as pointer is on the next char } boolean isErr = true; switch (value) { case 0: c = REPLACEMENT_CHAR; break; case 0x80: c = '\u20AC'; break; case 0x81: c = '\u0081'; break; case 0x82: c = '\u201A'; break; case 0x83: c = '\u0192'; break; case 0x84: c = '\u201E'; break; case 0x85: c = '\u2026'; break; case 0x86: c = '\u2020'; break; case 0x87: c = '\u2021'; break; case 0x88: c = '\u02C6'; break; case 0x89: c = '\u2030'; break; case 0x8A: c = '\u0160'; break; case 0x8B: c = '\u2039'; break; case 0x8C: c = '\u0152'; break; case 0x8D: c = '\u008D'; break; case 0x8E: c = '\u017D'; break; case 0x8F: c = '\u008F'; break; case 0x90: c = '\u0090'; break; case 0x91: c = '\u2018'; break; case 0x92: c = '\u2019'; break; case 0x93: c = '\u201C'; break; case 0x94: c = '\u201D'; break; case 0x95: c = 
'\u2022'; break; case 0x96: c = '\u2013'; break; case 0x97: c = '\u2014'; break; case 0x98: c = '\u02DC'; break; case 0x99: c = '\u2122'; break; case 0x9A: c = '\u0161'; break; case 0x9B: c = '\u203A'; break; case 0x9C: c = '\u0153'; break; case 0x9D: c = '\u009D'; break; case 0x9E: c = '\u017E'; break; case 0x9F: c = '\u0178'; break; default: isErr = false; } if (isErr) { errorCharReference(); textEmitChar(c); return; } if (((value >= 0xD800) && (value <= 0xDFF)) || (value > 0x10FFFF)) { errorCharReference(); textEmitChar(REPLACEMENT_CHAR); return; } c = (char) value; textEmitChar(c); if ( ((c >= 0x0001) && (c <= 0x0008)) || ((c >= 0x000D) && (c <= 0x001F)) || ((c >= 0x007F) && (c <= 0x009F)) || ((c >= 0xFDD0) && (c <= 0xFDEF)) ) { errorCharReference(); return; } if (equalsOne(c, INVALID_CHARS)) { errorCharReference(); } } protected State TAG_OPEN = new State() { @Override public void parse() { tag.start(ndx); ndx++; if (isEOF()) { errorEOF(); state = DATA_STATE; textEmitChar('<'); return; } char c = input[ndx]; if (c == '!') { state = MARKUP_DECLARATION_OPEN; return; } if (c == '/') { state = END_TAG_OPEN; return; } if (isAlpha(c)) { state = TAG_NAME; return; } if (config.parseXmlTags) { if (match(XML)) { ndx += XML.length - 1; if (xmlDeclaration == null) { xmlDeclaration = new XmlDeclaration(); } state = xmlDeclaration.XML_BETWEEN; return; } } if (c == '?') { errorInvalidToken(); state = BOGUS_COMMENT; return; } errorInvalidToken(); state = DATA_STATE; textEmitChar('<'); ndx--; } }; protected State END_TAG_OPEN = new State() { @Override public void parse() { ndx++; if (isEOF()) { errorEOF(); state = DATA_STATE; return; } char c = input[ndx]; if (isAlpha(c)) { tag.setType(TagType.END); state = TAG_NAME; return; } errorInvalidToken(); state = BOGUS_COMMENT; } }; protected State TAG_NAME = new State() { @Override public void parse() { int nameNdx = ndx; while (true) { ndx++; if (isEOF()) { errorEOF(); state = DATA_STATE; return; } char c = input[ndx]; if 
(equalsOne(c, TAG_WHITESPACES)) { state = BEFORE_ATTRIBUTE_NAME; tag.setName(charSequence(nameNdx, ndx)); break; } if (c == '/') { state = SELF_CLOSING_START_TAG; tag.setName(charSequence(nameNdx, ndx)); break; } if (c == '>') { state = DATA_STATE; tag.setName(charSequence(nameNdx, ndx)); emitTag(); break; } } } }; protected State BEFORE_ATTRIBUTE_NAME = new State() { @Override public void parse() { while (true) { ndx++; if (isEOF()) { errorEOF(); state = DATA_STATE; return; } char c = input[ndx]; if (equalsOne(c, TAG_WHITESPACES)) { continue; } if (c == '/') { state = SELF_CLOSING_START_TAG; return; } if (c == '>') { state = DATA_STATE; emitTag(); return; } if (equalsOne(c, ATTR_INVALID_1)) { errorInvalidToken(); } state = ATTRIBUTE_NAME; return; } } }; protected State ATTRIBUTE_NAME = new State() { @Override public void parse() { attrStartNdx = ndx; while (true) { ndx++; if (isEOF()) { errorEOF(); state = DATA_STATE; return; } char c = input[ndx]; if (equalsOne(c, TAG_WHITESPACES)) { attrEndNdx = ndx; state = AFTER_ATTRIBUTE_NAME; return; } if (c == '/') { attrEndNdx = ndx; _addAttribute(); state = SELF_CLOSING_START_TAG; return; } if (c == '=') { attrEndNdx = ndx; state = BEFORE_ATTRIBUTE_VALUE; return; } if (c == '>') { state = DATA_STATE; attrEndNdx = ndx; _addAttribute(); emitTag(); return; } if (equalsOne(c, ATTR_INVALID_2)) { errorInvalidToken(); } } } }; protected State AFTER_ATTRIBUTE_NAME = new State() { @Override public void parse() { while(true) { ndx++; if (isEOF()) { errorEOF(); state = DATA_STATE; return; } char c = input[ndx]; if (equalsOne(c, TAG_WHITESPACES)) { continue; } if (c == '/') { state = SELF_CLOSING_START_TAG; return; } if (c == '=') { state = BEFORE_ATTRIBUTE_VALUE; return; } if (c == '>') { state = DATA_STATE; emitTag(); return; } if (equalsOne(c, ATTR_INVALID_2)) { errorInvalidToken(); } _addAttribute(); state = ATTRIBUTE_NAME; return; } } }; protected State BEFORE_ATTRIBUTE_VALUE = new State() { @Override public void parse() { while 
(true) { ndx++; if (isEOF()) { errorEOF(); state = DATA_STATE; return; } char c = input[ndx]; if (equalsOne(c, TAG_WHITESPACES)) { continue; } if (c == '\"') { state = ATTR_VALUE_DOUBLE_QUOTED; return; } if (c == '\'') { state = ATTR_VALUE_SINGLE_QUOTED; return; } if (c == '&') { state = ATTR_VALUE_UNQUOTED; ndx--; return; } if (c == '>') { _addAttribute(); errorInvalidToken(); state = DATA_STATE; emitTag(); return; } if (equalsOne(c, ATTR_INVALID_3)) { errorInvalidToken(); } state = ATTR_VALUE_UNQUOTED; return; } } }; protected State ATTR_VALUE_UNQUOTED = new State() { @Override public void parse() { textStart(); textEmitChar(input[ndx]); while (true) { ndx++; if (isEOF()) { errorEOF(); state = DATA_STATE; return; } char c = input[ndx]; if (equalsOne(c, TAG_WHITESPACES)) { _addAttributeWithValue(); state = BEFORE_ATTRIBUTE_NAME; return; } if (c == '&') { consumeCharacterReference('>'); continue; } if (c == '>') { _addAttributeWithValue(); state = DATA_STATE; emitTag(); return; } if (equalsOne(c, ATTR_INVALID_4)) { errorInvalidToken(); } textEmitChar(c); } } }; protected State ATTR_VALUE_SINGLE_QUOTED = new State() { @Override public void parse() { textStart(); while (true) { ndx++; if (isEOF()) { errorEOF(); state = DATA_STATE; return; } char c = input[ndx]; if (c == '\'') { _addAttributeWithValue(); state = AFTER_ATTRIBUTE_VALUE_QUOTED; return; } if (c == '&') { consumeCharacterReference('\''); continue; } textEmitChar(c); } } }; protected State ATTR_VALUE_DOUBLE_QUOTED = new State() { @Override public void parse() { textStart(); while (true) { ndx++; if (isEOF()) { errorEOF(); state = DATA_STATE; return; } char c = input[ndx]; if (c == '"') { _addAttributeWithValue(); state = AFTER_ATTRIBUTE_VALUE_QUOTED; return; } if (c == '&') { consumeCharacterReference('\"'); continue; } textEmitChar(c); } } }; protected State AFTER_ATTRIBUTE_VALUE_QUOTED = new State() { @Override public void parse() { ndx++; if (isEOF()) { errorEOF(); state = DATA_STATE; return; } char c = 
input[ndx]; if (equalsOne(c, TAG_WHITESPACES)) { state = BEFORE_ATTRIBUTE_NAME; return; } if (c == '/') { state = SELF_CLOSING_START_TAG; return; } if (c == '>') { state = DATA_STATE; emitTag(); return; } errorInvalidToken(); state = BEFORE_ATTRIBUTE_NAME; ndx--; } }; protected State SELF_CLOSING_START_TAG = new State() { @Override public void parse() { ndx++; if (isEOF()) { errorEOF(); state = DATA_STATE; return; } char c = input[ndx]; if (c == '>') { tag.setType(TagType.SELF_CLOSING); state = DATA_STATE; emitTag(); return; } errorInvalidToken(); state = BEFORE_ATTRIBUTE_NAME; ndx--; } }; // ---------------------------------------------------------------- special protected State BOGUS_COMMENT = new State() { @Override public void parse() { int commentEndNdx = find('>', ndx, total); if (commentEndNdx == -1) { commentEndNdx = total; } emitComment(ndx, commentEndNdx); state = DATA_STATE; ndx = commentEndNdx; } }; protected State MARKUP_DECLARATION_OPEN = new State() { @Override public void parse() { ndx++; if (isEOF()) { errorEOF(); state = BOGUS_COMMENT; return; } if (match(COMMENT_DASH)) { state = COMMENT_START; ndx++; return; } if (matchUpperCase(T_DOCTYPE)) { state = DOCTYPE; ndx += T_DOCTYPE.length - 1; return; } if (config.enableConditionalComments) { // CC: downlevel-revealed starting if (match(CC_IF)) { int ccEndNdx = find(CC_END, ndx + CC_IF.length, total); if (ccEndNdx == -1) { ccEndNdx = total; } CharSequence expression = charSequence(ndx + 1, ccEndNdx); visitor.condComment(expression, true, false, false); ndx = ccEndNdx + 1; state = DATA_STATE; return; } // CC: downlevel-* ending tag if (match(CC_ENDIF)) { ndx += CC_ENDIF.length; int ccEndNdx = find('>', ndx, total); if (ccEndNdx == -1) { ccEndNdx = total; } if (match(COMMENT_DASH, ccEndNdx - 2)) { // downlevel-hidden ending tag visitor.condComment(_ENDIF, false, true, false); } else { visitor.condComment(_ENDIF, false, false, false); } ndx = ccEndNdx; state = DATA_STATE; return; } } if 
(config.parseXmlTags) { if (match(CDATA)) { ndx += CDATA.length - 1; if (xmlDeclaration == null) { xmlDeclaration = new XmlDeclaration(); } state = xmlDeclaration.CDATA; return; } } errorInvalidToken(); state = BOGUS_COMMENT; } }; // ---------------------------------------------------------------- RAWTEXT protected int rawTextStart; protected int rawTextEnd; protected char[] rawTagName; protected State RAWTEXT = new State() { @Override public void parse() { while (true) { ndx++; if (isEOF()) { state = DATA_STATE; return; } char c = input[ndx]; if (c == '<') { rawTextEnd = ndx; state = RAWTEXT_LESS_THAN_SIGN; return; } } } }; protected State RAWTEXT_LESS_THAN_SIGN = new State() { @Override public void parse() { ndx++; if (isEOF()) { state = RAWTEXT; return; } char c = input[ndx]; if (c == '/') { state = RAWTEXT_END_TAG_OPEN; return; } state = RAWTEXT; } }; protected State RAWTEXT_END_TAG_OPEN = new State() { @Override public void parse() { ndx++; if (isEOF()) { state = RAWTEXT; return; } char c = input[ndx]; if (isAlpha(c)) { state = RAWTEXT_END_TAG_NAME; return; } state = RAWTEXT; } }; protected State RAWTEXT_END_TAG_NAME = new State() { @Override public void parse() { int rawtextEndTagNameStartNdx = ndx; while (true) { ndx++; if (isEOF()) { state = RAWTEXT; return; } char c = input[ndx]; if (equalsOne(c, TAG_WHITESPACES)) { if (isAppropriateTagName(rawTagName, rawtextEndTagNameStartNdx, ndx)) { textEmitChars(rawTextStart, rawTextEnd); emitText(); state = BEFORE_ATTRIBUTE_NAME; tag.start(rawTextEnd); tag.setName(charSequence(rawtextEndTagNameStartNdx, ndx)); tag.setType(TagType.END); } else { state = RAWTEXT; } return; } if (c == '/') { if (isAppropriateTagName(rawTagName, rawtextEndTagNameStartNdx, ndx)) { textEmitChars(rawTextStart, rawTextEnd); emitText(); state = SELF_CLOSING_START_TAG; tag.start(rawTextEnd); tag.setName(charSequence(rawtextEndTagNameStartNdx, ndx)); tag.setType(TagType.SELF_CLOSING); } else { state = RAWTEXT; } return; } if (c == '>') { if 
(isAppropriateTagName(rawTagName, rawtextEndTagNameStartNdx, ndx)) { textEmitChars(rawTextStart, rawTextEnd); emitText(); state = DATA_STATE; tag.start(rawTextEnd); tag.setName(charSequence(rawtextEndTagNameStartNdx, ndx)); tag.setType(TagType.END); tag.end(ndx); emitTag(); } else { state = RAWTEXT; } return; } if (isAlpha(c)) { continue; } state = RAWTEXT; return; } } }; // ---------------------------------------------------------------- RCDATA protected int rcdataTagStart = -1; protected char[] rcdataTagName; protected State RCDATA = new State() { @Override public void parse() { while (true) { ndx++; if (isEOF()) { state = DATA_STATE; return; } char c = input[ndx]; if (c == '<') { rcdataTagStart = ndx; state = RCDATA_LESS_THAN_SIGN; return; } if (c == '&') { consumeCharacterReference(); continue; } textEmitChar(c); } } }; protected State RCDATA_LESS_THAN_SIGN = new State() { @Override public void parse() { ndx++; if (isEOF()) { state = RCDATA; return; } char c = input[ndx]; if (c == '/') { state = RCDATA_END_TAG_OPEN; return; } state = RCDATA; textEmitChar('<'); textEmitChar(c); } }; protected State RCDATA_END_TAG_OPEN = new State() { @Override public void parse() { ndx++; if (isEOF()) { state = RCDATA; return; } char c = input[ndx]; if (isAlpha(c)) { state = RCDATA_END_TAG_NAME; return; } state = RCDATA; textEmitChar('<'); textEmitChar('/'); textEmitChar(c); } }; protected State RCDATA_END_TAG_NAME = new State() { @Override public void parse() { int rcdataEndTagNameStartNdx = ndx; while (true) { ndx++; if (isEOF()) { state = RCDATA; return; } char c = input[ndx]; if (equalsOne(c, TAG_WHITESPACES)) { if (isAppropriateTagName(rcdataTagName, rcdataEndTagNameStartNdx, ndx)) { emitText(); state = BEFORE_ATTRIBUTE_NAME; tag.start(rcdataTagStart); tag.setName(charSequence(rcdataEndTagNameStartNdx, ndx)); tag.setType(TagType.END); } else { state = RCDATA; } return; } if (c == '/') { if (isAppropriateTagName(rcdataTagName, rcdataEndTagNameStartNdx, ndx)) { emitText(); 
state = SELF_CLOSING_START_TAG; tag.start(rcdataTagStart); tag.setName(charSequence(rcdataEndTagNameStartNdx, ndx)); tag.setType(TagType.SELF_CLOSING); } else { state = RCDATA; } return; } if (c == '>') { if (isAppropriateTagName(rcdataTagName, rcdataEndTagNameStartNdx, ndx)) { emitText(); state = DATA_STATE; tag.start(rcdataTagStart); tag.setName(charSequence(rcdataEndTagNameStartNdx, ndx)); tag.setType(TagType.END); tag.end(ndx); emitTag(); } else { state = RCDATA; } return; } if (isAlpha(c)) { continue; } state = RCDATA; return; } } }; // ---------------------------------------------------------------- comments protected int commentStart; protected State COMMENT_START = new State() { @Override public void parse() { ndx++; commentStart = ndx; if (isEOF()) { errorEOF(); state = DATA_STATE; emitComment(commentStart, total); return; } char c = input[ndx]; if (c == '-') { state = COMMENT_START_DASH; return; } if (c == '>') { errorInvalidToken(); state = DATA_STATE; emitComment(commentStart, ndx); return; } state = COMMENT; } }; protected State COMMENT_START_DASH = new State() { @Override public void parse() { ndx++; if (isEOF()) { errorEOF(); state = DATA_STATE; emitComment(commentStart, total); return; } char c = input[ndx]; if (c == '-') { state = COMMENT_END; return; } if (c == '>') { errorInvalidToken(); state = DATA_STATE; emitComment(commentStart, ndx); } state = COMMENT; } }; protected State COMMENT = new State() { @Override public void parse() { while (true) { ndx++; if (isEOF()) { errorEOF(); state = DATA_STATE; emitComment(commentStart, total); return; } char c = input[ndx]; if (c == '-') { state = COMMENT_END_DASH; return; } } } }; protected State COMMENT_END_DASH = new State() { @Override public void parse() { ndx++; if (isEOF()) { errorEOF(); state = DATA_STATE; emitComment(commentStart, total); return; } char c = input[ndx]; if (c == '-') { state = COMMENT_END; return; } state = COMMENT; } }; protected State COMMENT_END = new State() { @Override public 
void parse() { ndx++; if (isEOF()) { errorEOF(); state = DATA_STATE; emitComment(commentStart, total); return; } char c = input[ndx]; if (c == '>') { state = DATA_STATE; emitComment(commentStart, ndx - 2); return; } if (c == '!') { errorInvalidToken(); state = COMMENT_END_BANG; return; } if (c == '-') { errorInvalidToken(); } else { errorInvalidToken(); state = COMMENT; } } }; protected State COMMENT_END_BANG = new State() { @Override public void parse() { ndx++; if (isEOF()) { errorEOF(); state = DATA_STATE; emitComment(commentStart, total); return; } char c = input[ndx]; if (c == '-') { state = COMMENT_END_DASH; return; } if (c == '>') { state = DATA_STATE; emitComment(commentStart, ndx - 3); return; } state = COMMENT; } }; // ---------------------------------------------------------------- DOCTYPE protected State DOCTYPE = new State() { @Override public void parse() { ndx++; if (isEOF()) { errorEOF(); state = DATA_STATE; doctype.setQuirksMode(true); emitDoctype(); return; } char c = input[ndx]; if (equalsOne(c, TAG_WHITESPACES)) { state = BEFORE_DOCTYPE_NAME; return; } errorInvalidToken(); state = BEFORE_DOCTYPE_NAME; ndx--; } }; protected State BEFORE_DOCTYPE_NAME = new State() { @Override public void parse() { while (true) { ndx++; if (isEOF()) { errorEOF(); state = DATA_STATE; doctype.setQuirksMode(true); emitDoctype(); return; } char c = input[ndx]; if (equalsOne(c, TAG_WHITESPACES)) { continue; } if (c == '>') { errorInvalidToken(); state = DATA_STATE; doctype.setQuirksMode(true); emitDoctype(); return; } state = DOCTYPE_NAME; return; } } }; protected State DOCTYPE_NAME = new State() { @Override public void parse() { int nameStartNdx = ndx; while (true) { ndx++; if (isEOF()) { errorEOF(); state = DATA_STATE; doctype.setName(charSequence(nameStartNdx, ndx)); doctype.setQuirksMode(true); emitDoctype(); return; } char c = input[ndx]; if (equalsOne(c, TAG_WHITESPACES)) { state = AFTER_DOCUMENT_NAME; doctype.setName(charSequence(nameStartNdx, ndx)); return; } if 
(c == '>') { state = DATA_STATE; doctype.setName(charSequence(nameStartNdx, ndx)); emitDoctype(); return; } } } }; protected State AFTER_DOCUMENT_NAME = new State() { @Override public void parse() { while (true) { ndx++; if (isEOF()) { errorEOF(); state = DATA_STATE; doctype.setQuirksMode(true); emitDoctype(); return; } char c = input[ndx]; if (equalsOne(c, TAG_WHITESPACES)) { continue; } if (c == '>') { state = DATA_STATE; emitDoctype(); return; } if (matchUpperCase(A_PUBLIC)) { ndx += A_PUBLIC.length - 1; state = AFTER_DOCTYPE_PUBLIC_KEYWORD; return; } if (matchUpperCase(A_SYSTEM)) { ndx += A_SYSTEM.length - 1; state = AFTER_DOCTYPE_SYSTEM_KEYWORD; return; } errorInvalidToken(); state = BOGUS_DOCTYPE; doctype.setQuirksMode(true); return; } } }; protected int doctypeIdNameStart; protected State AFTER_DOCTYPE_PUBLIC_KEYWORD = new State() { @Override public void parse() { ndx++; if (isEOF()) { errorEOF(); state = DATA_STATE; doctype.setQuirksMode(true); emitDoctype(); return; } char c = input[ndx]; if (equalsOne(c, TAG_WHITESPACES)) { state = BEFORE_DOCTYPE_PUBLIC_IDENTIFIER; return; } if (c == '\"') { errorInvalidToken(); doctypeIdNameStart = ndx + 1; state = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED; return; } if (c == '\'') { errorInvalidToken(); doctypeIdNameStart = ndx + 1; state = DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED; return; } if (c == '>') { errorInvalidToken(); state = DATA_STATE; doctype.setQuirksMode(true); emitDoctype(); return; } errorInvalidToken(); state = BOGUS_DOCTYPE; doctype.setQuirksMode(true); } }; protected State BEFORE_DOCTYPE_PUBLIC_IDENTIFIER = new State() { @Override public void parse() { while(true) { ndx++; if (isEOF()) { errorEOF(); state = DATA_STATE; emitDoctype(); return; } char c = input[ndx]; if (equalsOne(c, TAG_WHITESPACES)) { continue; } if (c == '\"') { doctypeIdNameStart = ndx + 1; state = DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED; return; } if (c == '\'') { doctypeIdNameStart = ndx + 1; state = 
DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED; return; } if (c == '>') { errorInvalidToken(); state = DATA_STATE; doctype.setQuirksMode(true); emitDoctype(); return; } errorInvalidToken(); doctype.setQuirksMode(true); state = BOGUS_DOCTYPE; return; } } }; protected State DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED = new State() { @Override public void parse() { while (true) { ndx++; if (isEOF()) { doctype.setPublicIdentifier(charSequence(doctypeIdNameStart, ndx)); errorEOF(); state = DATA_STATE; doctype.setQuirksMode(true); emitDoctype(); } char c = input[ndx]; if (c == '\"') { doctype.setPublicIdentifier(charSequence(doctypeIdNameStart, ndx)); state = AFTER_DOCTYPE_PUBLIC_IDENTIFIER; return; } if (c == '>') { doctype.setPublicIdentifier(charSequence(doctypeIdNameStart, ndx)); errorInvalidToken(); state = DATA_STATE; doctype.setQuirksMode(true); emitDoctype(); return; } } } }; protected State DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED = new State() { @Override public void parse() { while (true) { ndx++; if (isEOF()) { doctype.setPublicIdentifier(charSequence(doctypeIdNameStart, ndx)); errorEOF(); state = DATA_STATE; doctype.setQuirksMode(true); emitDoctype(); } char c = input[ndx]; if (c == '\'') { doctype.setPublicIdentifier(charSequence(doctypeIdNameStart, ndx)); state = AFTER_DOCTYPE_PUBLIC_IDENTIFIER; return; } if (c == '>') { doctype.setPublicIdentifier(charSequence(doctypeIdNameStart, ndx)); errorInvalidToken(); state = DATA_STATE; doctype.setQuirksMode(true); emitDoctype(); return; } } } }; protected State AFTER_DOCTYPE_PUBLIC_IDENTIFIER = new State() { @Override public void parse() { ndx++; if (isEOF()) { errorEOF(); state = DATA_STATE; doctype.setQuirksMode(true); emitDoctype(); return; } char c = input[ndx]; if (equalsOne(c, TAG_WHITESPACES)) { state = BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS; return; } if (c == '>') { state = DATA_STATE; emitDoctype(); return; } if (c == '\"') { errorInvalidToken(); doctypeIdNameStart = ndx + 1; state = 
DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED; return; } if (c == '\'') { errorInvalidToken(); doctypeIdNameStart = ndx + 1; state = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED; return; } errorInvalidToken(); doctype.setQuirksMode(true); state = BOGUS_DOCTYPE; } }; protected State BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS = new State() { @Override public void parse() { while (true) { ndx++; if (isEOF()) { errorEOF(); state = DATA_STATE; emitDoctype(); return; } char c = input[ndx]; if (equalsOne(c, TAG_WHITESPACES)) { continue; } if (c == '>') { state = DATA_STATE; emitDoctype(); return; } if (c == '\"') { doctypeIdNameStart = ndx + 1; state = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED; return; } if (c == '\'') { doctypeIdNameStart = ndx + 1; state = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED; return; } errorInvalidToken(); doctype.setQuirksMode(true); state = BOGUS_DOCTYPE; return; } } }; protected State BOGUS_DOCTYPE = new State() { @Override public void parse() { while (true) { ndx++; if (isEOF()) { state = DATA_STATE; emitDoctype(); return; } char c = input[ndx]; if (c == '>') { state = DATA_STATE; emitDoctype(); return; } } } }; protected State AFTER_DOCTYPE_SYSTEM_KEYWORD = new State() { @Override public void parse() { ndx++; if (isEOF()) { errorEOF(); state = DATA_STATE; doctype.setQuirksMode(true); emitDoctype(); return; } char c = input[ndx]; if (equalsOne(c, TAG_WHITESPACES)) { state = BEFORE_DOCTYPE_SYSTEM_IDENTIFIER; return; } if (c == '\"') { errorInvalidToken(); doctypeIdNameStart = ndx + 1; state = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED; return; } if (c == '\'') { errorInvalidToken(); doctypeIdNameStart = ndx + 1; state = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED; return; } if (c == '>') { errorInvalidToken(); state = DATA_STATE; doctype.setQuirksMode(true); emitDoctype(); return; } errorInvalidToken(); state = BOGUS_DOCTYPE; doctype.setQuirksMode(true); } }; protected State BEFORE_DOCTYPE_SYSTEM_IDENTIFIER = new State() { @Override public void parse() { 
while(true) { ndx++; if (isEOF()) { errorEOF(); state = DATA_STATE; emitDoctype(); return; } char c = input[ndx]; if (equalsOne(c, TAG_WHITESPACES)) { continue; } if (c == '\"') { doctypeIdNameStart = ndx + 1; state = DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED; return; } if (c == '\'') { doctypeIdNameStart = ndx + 1; state = DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED; return; } if (c == '>') { errorInvalidToken(); state = DATA_STATE; doctype.setQuirksMode(true); emitDoctype(); return; } errorInvalidToken(); doctype.setQuirksMode(true); state = BOGUS_DOCTYPE; return; } } }; protected State DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED = new State() { @Override public void parse() { while (true) { ndx++; if (isEOF()) { doctype.setSystemIdentifier(charSequence(doctypeIdNameStart, ndx)); errorEOF(); state = DATA_STATE; doctype.setQuirksMode(true); emitDoctype(); } char c = input[ndx]; if (c == '\"') { doctype.setSystemIdentifier(charSequence(doctypeIdNameStart, ndx)); state = AFTER_DOCTYPE_SYSTEM_IDENTIFIER; return; } if (c == '>') { doctype.setSystemIdentifier(charSequence(doctypeIdNameStart, ndx)); errorInvalidToken(); state = DATA_STATE; doctype.setQuirksMode(true); emitDoctype(); return; } } } }; protected State DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED = new State() { @Override public void parse() { while (true) { ndx++; if (isEOF()) { doctype.setSystemIdentifier(charSequence(doctypeIdNameStart, ndx)); errorEOF(); state = DATA_STATE; doctype.setQuirksMode(true); emitDoctype(); } char c = input[ndx]; if (c == '\'') { doctype.setSystemIdentifier(charSequence(doctypeIdNameStart, ndx)); state = AFTER_DOCTYPE_SYSTEM_IDENTIFIER; return; } if (c == '>') { doctype.setSystemIdentifier(charSequence(doctypeIdNameStart, ndx)); errorInvalidToken(); state = DATA_STATE; doctype.setQuirksMode(true); emitDoctype(); return; } } } }; protected State AFTER_DOCTYPE_SYSTEM_IDENTIFIER = new State() { @Override public void parse() { while(true) { ndx++; if (isEOF()) { errorEOF(); state = DATA_STATE; 
doctype.setQuirksMode(true); emitDoctype(); return; } char c = input[ndx]; if (equalsOne(c, TAG_WHITESPACES)) { continue; } if (c == '>') { state = DATA_STATE; emitDoctype(); return; } errorInvalidToken(); state = BOGUS_DOCTYPE; // does NOT set the quirks mode! } } }; // ---------------------------------------------------------------- SCRIPT protected int scriptStartNdx = -1; protected int scriptEndNdx = -1; protected int scriptEndTagName = -1; protected State SCRIPT_DATA = new State() { @Override public void parse() { while(true) { ndx++; if (isEOF()) { emitScript(scriptStartNdx, total); state = DATA_STATE; return; } char c = input[ndx]; if (c == '<') { scriptEndNdx = ndx; state = SCRIPT_DATA_LESS_THAN_SIGN; return; } } } }; protected State SCRIPT_DATA_LESS_THAN_SIGN = new State() { @Override public void parse() { ndx++; if (isEOF()) { state = SCRIPT_DATA; ndx--; return; } char c = input[ndx]; if (c == '/') { state = SCRIPT_DATA_END_TAG_OPEN; return; } if (c == '!') { if (scriptEscape == null) { // create script escape states only if really needed scriptEscape = new ScriptEscape(); } state = scriptEscape.SCRIPT_DATA_ESCAPE_START; return; } state = SCRIPT_DATA; } }; protected State SCRIPT_DATA_END_TAG_OPEN = new State() { @Override public void parse() { ndx++; if (isEOF()) { state = SCRIPT_DATA; ndx--; return; } char c = input[ndx]; if (isAlpha(c)) { state = SCRIPT_DATA_END_TAG_NAME; scriptEndTagName = ndx; return; } state = SCRIPT_DATA; } }; protected State SCRIPT_DATA_END_TAG_NAME = new State() { @Override public void parse() { while (true) { ndx++; if (isEOF()) { state = SCRIPT_DATA; return; } char c = input[ndx]; if (equalsOne(c, TAG_WHITESPACES)) { if (isAppropriateTagName(T_SCRIPT, scriptEndTagName, ndx)) { state = BEFORE_ATTRIBUTE_NAME; } else { state = SCRIPT_DATA; } return; } if (c == '/') { if (isAppropriateTagName(T_SCRIPT, scriptEndTagName, ndx)) { state = SELF_CLOSING_START_TAG; } else { state = SCRIPT_DATA; } return; } if (c == '>') { if 
(isAppropriateTagName(T_SCRIPT, scriptEndTagName, ndx)) { state = DATA_STATE; emitScript(scriptStartNdx, scriptEndNdx); } else { state = SCRIPT_DATA; } return; } if (isAlpha(c)) { continue; } state = SCRIPT_DATA; return; } } }; // ---------------------------------------------------------------- SCRIPT ESCAPE protected ScriptEscape scriptEscape = null; /** * Since escaping states inside the SCRIPT tag are rare, we want to use them * lazy, only when really needed. Therefore, they are all grouped inside separate * class that will be instantiated only if needed. */ protected class ScriptEscape { protected int doubleEscapedNdx = -1; protected int doubleEscapedEndTag = -1; protected State SCRIPT_DATA_ESCAPE_START = new State() { @Override public void parse() { ndx++; if (isEOF()) { state = SCRIPT_DATA; ndx--; return; } char c = input[ndx]; if (c == '-') { state = SCRIPT_DATA_ESCAPE_START_DASH; return; } state = SCRIPT_DATA; } }; protected State SCRIPT_DATA_ESCAPE_START_DASH = new State() { @Override public void parse() { ndx++; if (isEOF()) { state = SCRIPT_DATA; return; } char c = input[ndx]; if (c == '-') { state = SCRIPT_DATA_ESCAPED_DASH_DASH; return; } state = SCRIPT_DATA; } }; protected State SCRIPT_DATA_ESCAPED_DASH_DASH = new State() { @Override public void parse() { ndx++; if (isEOF()) { errorEOF(); state = DATA_STATE; return; } char c = input[ndx]; if (c == '-') { return; } if (c == '<') { state = SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN; return; } if (c == '>') { state = SCRIPT_DATA; return; } state = SCRIPT_DATA_ESCAPED; } }; protected State SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN = new State() { @Override public void parse() { ndx++; if (isEOF()) { state = SCRIPT_DATA_ESCAPED; return; } char c = input[ndx]; if (c == '/') { doubleEscapedNdx = -1; state = SCRIPT_DATA_ESCAPED_END_TAG_OPEN; return; } if (isAlpha(c)) { doubleEscapedNdx = ndx; state = SCRIPT_DATA_DOUBLE_ESCAPE_START; return; } state = SCRIPT_DATA_ESCAPED; } }; protected State SCRIPT_DATA_ESCAPED = new 
State() {
            // Anonymous body of the SCRIPT_DATA_ESCAPED state, whose declaration
            // precedes this line.
            @Override
            public void parse() {
                while (true) {
                    ndx++;

                    if (isEOF()) {
                        errorEOF();
                        emitScript(scriptStartNdx, total);
                        state = DATA_STATE;
                        return;
                    }

                    char c = input[ndx];

                    if (c == '-') {
                        state = SCRIPT_DATA_ESCAPED_DASH;
                        break;
                    }

                    if (c == '<') {
                        state = SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN;
                        return;
                    }
                }
            }
        };

        /**
         * One '-' has been seen in escaped script data: a second '-' moves to
         * the dash-dash state, '<' may start a tag, anything else returns to
         * SCRIPT_DATA_ESCAPED.
         */
        protected State SCRIPT_DATA_ESCAPED_DASH = new State() {
            @Override
            public void parse() {
                ndx++;

                if (isEOF()) {
                    errorEOF();
                    state = DATA_STATE;
                    return;
                }

                char c = input[ndx];

                if (c == '-') {
                    state = SCRIPT_DATA_ESCAPED_DASH_DASH;
                    return;
                }

                if (c == '<') {
                    // A '<' has to be examined by the less-than-sign state. It used to
                    // go to the dash-dash state, which swallowed the next character
                    // and could miss an end tag written directly after "-".
                    state = SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN;
                    return;
                }

                state = SCRIPT_DATA_ESCAPED;
            }
        };

        /**
         * After "&lt;/" in escaped script data: a letter starts the end tag name.
         */
        protected State SCRIPT_DATA_ESCAPED_END_TAG_OPEN = new State() {
            @Override
            public void parse() {
                ndx++;

                if (isEOF()) {
                    state = SCRIPT_DATA_ESCAPED;
                    return;
                }

                char c = input[ndx];

                if (isAlpha(c)) {
                    // Remember where the name starts and where the script body ends:
                    // ndx is on the first letter, '/' is at ndx - 1, '<' at ndx - 2.
                    scriptEndTagName = ndx;
                    scriptEndNdx = ndx - 2;
                    state = SCRIPT_DATA_ESCAPED_END_TAG_NAME;
                    // Without this return the assignment above was immediately
                    // overwritten and the end tag name state was never entered.
                    return;
                }

                state = SCRIPT_DATA_ESCAPED;
            }
        };

        /**
         * Reads the end tag name that starts at scriptEndTagName. Mirrors
         * SCRIPT_DATA_END_TAG_NAME, but falls back to SCRIPT_DATA_ESCAPED when
         * the name does not match T_SCRIPT.
         */
        protected State SCRIPT_DATA_ESCAPED_END_TAG_NAME = new State() {
            @Override
            public void parse() {
                while (true) {
                    ndx++;

                    if (isEOF()) {
                        state = SCRIPT_DATA_ESCAPED;
                        return;
                    }

                    char c = input[ndx];

                    if (equalsOne(c, TAG_WHITESPACES)) {
                        if (isAppropriateTagName(T_SCRIPT, scriptEndTagName, ndx)) {
                            state = BEFORE_ATTRIBUTE_NAME;
                        } else {
                            state = SCRIPT_DATA_ESCAPED;
                        }
                        return;
                    }

                    if (c == '/') {
                        if (isAppropriateTagName(T_SCRIPT, scriptEndTagName, ndx)) {
                            state = SELF_CLOSING_START_TAG;
                        } else {
                            state = SCRIPT_DATA_ESCAPED;
                        }
                        return;
                    }

                    if (c == '>') {
                        if (isAppropriateTagName(T_SCRIPT, scriptEndTagName, ndx)) {
                            state = DATA_STATE;
                            // Emit the script body like the non-escaped end tag state
                            // does. emitTag() would re-emit the pending script start
                            // tag and switch straight back to script data.
                            emitScript(scriptStartNdx, scriptEndNdx);
                        } else {
                            state = SCRIPT_DATA_ESCAPED;
                        }
                        return;
                    }

                    if (isAlpha(c)) {
                        continue;
                    }

                    state = SCRIPT_DATA_ESCAPED;
                    return;
                }
            }
        };

        // ---------------------------------------------------------------- SCRIPT DOUBLE ESCAPE

        /**
         * Reads a tag name that starts at doubleEscapedNdx inside escaped
         * script data.
         */
        protected State SCRIPT_DATA_DOUBLE_ESCAPE_START = new State() {
            @Override
            public void parse() {
                while (true) {
                    ndx++;

                    if (isEOF()) {
                        state = SCRIPT_DATA_ESCAPED;
                        return;
                    }

                    char c =
input[ndx]; if (equalsOne(c, TAG_WHITESPACES_OR_END)) { if (isAppropriateTagName(T_SCRIPT, doubleEscapedNdx, ndx)) { state = SCRIPT_DATA_DOUBLE_ESCAPED; } else { state = SCRIPT_DATA_ESCAPED; } return; } if (isAlpha(c)) { continue; } state = SCRIPT_DATA_ESCAPED; return; } } }; protected State SCRIPT_DATA_DOUBLE_ESCAPED = new State() { @Override public void parse() { while (true) { ndx++; if (isEOF()) { errorEOF(); state = DATA_STATE; return; } char c = input[ndx]; if (c == '-') { state = SCRIPT_DATA_DOUBLE_ESCAPED_DASH; return; } if (c == '<') { state = SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN; return; } } } }; protected State SCRIPT_DATA_DOUBLE_ESCAPED_DASH = new State() { @Override public void parse() { ndx++; if (isEOF()) { errorEOF(); state = DATA_STATE; return; } char c = input[ndx]; if (c == '-') { state = SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH; return; } if (c == '<') { state = SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN; return; } state = SCRIPT_DATA_DOUBLE_ESCAPED; } }; protected State SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH = new State() { @Override public void parse() { while (true) { ndx++; if (isEOF()) { errorEOF(); state = DATA_STATE; return; } char c = input[ndx]; if (c == '-') { continue; } if (c == '<') { state = SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN; return; } if (c == '>') { state = SCRIPT_DATA; return; } state = SCRIPT_DATA_DOUBLE_ESCAPED; return; } } }; protected State SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN = new State() { @Override public void parse() { ndx++; if (isEOF()) { state = SCRIPT_DATA_DOUBLE_ESCAPED; return; } char c = input[ndx]; if (c == '/') { state = SCRIPT_DATA_DOUBLE_ESCAPE_END; return; } state = SCRIPT_DATA_DOUBLE_ESCAPED; } }; protected State SCRIPT_DATA_DOUBLE_ESCAPE_END = new State() { @Override public void parse() { doubleEscapedEndTag = ndx + 1; while (true) { ndx++; if (isEOF()) { state = SCRIPT_DATA_DOUBLE_ESCAPED; return; } char c = input[ndx]; if (equalsOne(c, TAG_WHITESPACES_OR_END)) { if 
(isAppropriateTagName(T_SCRIPT, doubleEscapedEndTag, ndx)) { state = SCRIPT_DATA_ESCAPED; } else { state = SCRIPT_DATA_DOUBLE_ESCAPED; } return; } if (isAlpha(c)) { continue; } state = SCRIPT_DATA_DOUBLE_ESCAPED; return; } } }; } // ---------------------------------------------------------------- xml protected XmlDeclaration xmlDeclaration = null; protected class XmlDeclaration { protected int xmlAttrCount = 0; protected int xmlAttrStartNdx = -1; protected CharSequence version; protected CharSequence encoding; protected CharSequence standalone; protected char attrQuote; protected void reset() { xmlAttrCount = 0; xmlAttrStartNdx = -1; version = encoding = standalone = null; } protected State XML_BETWEEN = new State() { @Override public void parse() { while (true) { ndx++; if (isEOF()) { errorEOF(); state = DATA_STATE; return; } char c = input[ndx]; if (equalsOne(c, TAG_WHITESPACES)) { continue; } if (c == '?') { state = XML_CLOSE; return; } switch (xmlAttrCount) { case 0: if (match(XML_VERSION)) { ndx += XML_VERSION.length - 1; state = AFTER_XML_ATTRIBUTE_NAME; return; } break; case 1: if (match(XML_ENCODING)) { ndx += XML_ENCODING.length - 1; state = AFTER_XML_ATTRIBUTE_NAME; return; } break; case 2: if (match(XML_STANDALONE)) { ndx += XML_STANDALONE.length - 1; state = AFTER_XML_ATTRIBUTE_NAME; return; } break; } errorInvalidToken(); state = DATA_STATE; } } }; protected State AFTER_XML_ATTRIBUTE_NAME = new State() { @Override public void parse() { while(true) { ndx++; if (isEOF()) { errorEOF(); state = DATA_STATE; return; } char c = input[ndx]; if (equalsOne(c, TAG_WHITESPACES)) { continue; } if (c == '=') { state = BEFORE_XML_ATTRIBUTE_VALUE; return; } errorInvalidToken(); state = DATA_STATE; return; } } }; protected State BEFORE_XML_ATTRIBUTE_VALUE = new State() { @Override public void parse() { while(true) { ndx++; if (isEOF()) { errorEOF(); state = DATA_STATE; return; } char c = input[ndx]; if (equalsOne(c, TAG_WHITESPACES)) { continue; } if (c == '\"' || c == 
'\'') { state = XML_ATTRIBUTE_VALUE; attrQuote = c; return; } errorInvalidToken(); state = DATA_STATE; return; } } }; protected State XML_ATTRIBUTE_VALUE = new State() { @Override public void parse() { xmlAttrStartNdx = ndx + 1; while(true) { ndx++; if (isEOF()) { errorEOF(); state = DATA_STATE; return; } char c = input[ndx]; if (c == attrQuote) { CharSequence value = charSequence(xmlAttrStartNdx, ndx); switch (xmlAttrCount) { case 0: version = value; break; case 1: encoding = value; break; case 2: standalone = value; break; } xmlAttrCount++; state = XML_BETWEEN; return; } } } }; protected State XML_CLOSE = new State() { @Override public void parse() { ndx++; if (isEOF()) { errorEOF(); state = DATA_STATE; return; } char c = input[ndx]; if (c == '>') { emitXml(); state = DATA_STATE; return; } errorInvalidToken(); state = DATA_STATE; } }; // ---------------------------------------------------------------- CDATA protected State CDATA = new State() { @Override public void parse() { ndx++; int cdataEndNdx = find(CDATA_END, ndx, total); if (cdataEndNdx == -1) { cdataEndNdx = total; } CharSequence cdata = charSequence(ndx, cdataEndNdx); emitCData(cdata); ndx = cdataEndNdx + 2; state = DATA_STATE; } }; } // ---------------------------------------------------------------- text protected char[] text; protected int textLen; private void ensureCapacity() { if (textLen == text.length) { text = ArraysUtil.resize(text, textLen << 1); } } private void ensureCapacity(final int growth) { int desiredLen = textLen + growth; if (desiredLen > text.length) { text = ArraysUtil.resize(text, Math.max(textLen << 1, desiredLen)); } } /** * Emits characters into the local text buffer. */ protected void textEmitChar(final char c) { ensureCapacity(); text[textLen++] = c; } /** * Resets text buffer. 
*/ protected void textStart() { textLen = 0; } protected void textEmitChars(int from, final int to) { ensureCapacity(to - from); while (from < to) { text[textLen++] = input[from++]; } } protected void textEmitChars(final char[] buffer) { ensureCapacity(buffer.length); for (char aBuffer : buffer) { text[textLen++] = aBuffer; } } protected CharSequence textWrap() { if (textLen == 0) { return CharArraySequence.EMPTY; } return new String(text, 0, textLen); // todo use charSequence pointer instead! } // ---------------------------------------------------------------- attr protected int attrStartNdx = -1; protected int attrEndNdx = -1; private void _addAttribute() { _addAttribute(charSequence(attrStartNdx, attrEndNdx), null); } private void _addAttributeWithValue() { _addAttribute(charSequence(attrStartNdx, attrEndNdx), textWrap().toString()); } private void _addAttribute(final CharSequence attrName, final CharSequence attrValue) { if (tag.getType() == TagType.END) { _error("Ignored end tag attribute"); } else { if (tag.hasAttribute(attrName)) { _error("Ignored duplicated attribute: " + attrName); } else { tag.addAttribute(attrName, attrValue); } } attrStartNdx = -1; attrEndNdx = -1; textLen = 0; } protected void emitTag() { tag.end(ndx + 1); if (config.calculatePosition) { tag.setPosition(position(tag.getTagPosition())); } if (tag.getType().isStartingTag()) { if (matchTagName(T_SCRIPT)) { scriptStartNdx = ndx + 1; state = SCRIPT_DATA; return; } // detect RAWTEXT tags if (config.enableRawTextModes) { for (char[] rawtextTagName : RAWTEXT_TAGS) { if (matchTagName(rawtextTagName)) { tag.setRawTag(true); state = RAWTEXT; rawTextStart = ndx + 1; rawTagName = rawtextTagName; break; } } // detect RCDATA tag for (char[] rcdataTextTagName : RCDATA_TAGS) { if (matchTagName(rcdataTextTagName)) { state = RCDATA; rcdataTagStart = ndx + 1; rcdataTagName = rcdataTextTagName; break; } } } tag.increaseDeepLevel(); } visitor.tag(tag); if (tag.getType().isEndingTag()) { 
tag.decreaseDeepLevel(); } } /** * Emits a comment. Also checks for conditional comments! */ protected void emitComment(final int from, final int to) { if (config.enableConditionalComments) { // CC: downlevel-hidden starting if (match(CC_IF, from)) { int endBracketNdx = find(']', from + 3, to); CharSequence expression = charSequence(from + 1, endBracketNdx); ndx = endBracketNdx + 1; char c = input[ndx]; if (c != '>') { errorInvalidToken(); } visitor.condComment(expression, true, true, false); state = DATA_STATE; return; } if (to > CC_ENDIF2.length && match(CC_ENDIF2, to - CC_ENDIF2.length)) { // CC: downlevel-hidden ending visitor.condComment(_ENDIF, false, true, true); state = DATA_STATE; return; } } CharSequence comment = charSequence(from, to); visitor.comment(comment); commentStart = -1; } /** * Emits text if there is some content. */ protected void emitText() { if (textLen != 0) { visitor.text(textWrap()); } textLen = 0; } protected void emitScript(final int from, final int to) { tag.increaseDeepLevel(); tag.setRawTag(true); visitor.script(tag, charSequence(from, to)); tag.decreaseDeepLevel(); scriptStartNdx = -1; scriptEndNdx = -1; } protected void emitDoctype() { visitor.doctype(doctype); doctype.reset(); } protected void emitXml() { visitor.xml(xmlDeclaration.version, xmlDeclaration.encoding, xmlDeclaration.standalone); xmlDeclaration.reset(); } protected void emitCData(final CharSequence charSequence) { visitor.cdata(charSequence); } // ---------------------------------------------------------------- error protected void errorEOF() { _error("Parse error: EOF"); } protected void errorInvalidToken() { _error("Parse error: invalid token"); } protected void errorCharReference() { _error("Parse error: invalid character reference"); } /** * Prepares error message and reports it to the visitor. 
*/ protected void _error(String message) { if (config.calculatePosition) { Position currentPosition = position(ndx); message = message .concat(StringPool.SPACE) .concat(currentPosition.toString()); } else { message = message .concat(" [@") .concat(Integer.toString(ndx)) .concat(StringPool.RIGHT_SQ_BRACKET); } visitor.error(message); } // ---------------------------------------------------------------- util private boolean isAppropriateTagName(final char[] lowerCaseNameToMatch, final int from, final int to) { final int len = to - from; if (len != lowerCaseNameToMatch.length) { return false; } for (int i = from, k = 0; i < to; i++, k++) { char c = input[i]; c = CharUtil.toLowerAscii(c); if (c != lowerCaseNameToMatch[k]) { return false; } } return true; } private boolean matchTagName(final char[] tagNameLowercase) { final CharSequence charSequence = tag.getName(); int length = tagNameLowercase.length; if (charSequence.length() != length) { return false; } for (int i = 0; i < length; i++) { char c = charSequence.charAt(i); c = CharUtil.toLowerAscii(c); if (c != tagNameLowercase[i]) { return false; } } return true; } // ---------------------------------------------------------------- state protected State state = DATA_STATE; // ---------------------------------------------------------------- names private static final char[] TAG_WHITESPACES = new char[] {'\t', '\n', '\r', ' '}; private static final char[] TAG_WHITESPACES_OR_END = new char[] {'\t', '\n', '\r', ' ', '/', '>'}; private static final char[] CONTINUE_CHARS = new char[] {'\t', '\n', '\r', ' ', '<', '&'}; private static final char[] ATTR_INVALID_1 = new char[] {'\"', '\'', '<', '='}; private static final char[] ATTR_INVALID_2 = new char[] {'\"', '\'', '<'}; private static final char[] ATTR_INVALID_3 = new char[] {'<', '=', '`'}; private static final char[] ATTR_INVALID_4 = new char[] {'"', '\'', '<', '=', '`'}; private static final char[] COMMENT_DASH = new char[] {'-', '-'}; private static final char[] 
T_DOCTYPE = new char[] {'D', 'O', 'C', 'T', 'Y', 'P', 'E'}; private static final char[] T_SCRIPT = new char[] {'s', 'c', 'r', 'i', 'p', 't'}; private static final char[] T_XMP = new char[] {'x', 'm', 'p'}; private static final char[] T_STYLE = new char[] {'s', 't', 'y', 'l', 'e'}; private static final char[] T_IFRAME = new char[] {'i', 'f', 'r', 'a', 'm', 'e'}; private static final char[] T_NOFRAMES = new char[] {'n', 'o', 'f', 'r', 'a', 'm', 'e', 's'}; private static final char[] T_NOEMBED = new char[] {'n', 'o', 'e', 'm', 'b', 'e', 'd'}; private static final char[] T_NOSCRIPT = new char[] {'n', 'o', 's', 'c', 'r', 'i', 'p', 't'}; private static final char[] T_TEXTAREA = new char[] {'t', 'e', 'x', 't', 'a', 'r', 'e', 'a'}; private static final char[] T_TITLE = new char[] {'t', 'i', 't', 'l', 'e'}; private static final char[] A_PUBLIC = new char[] {'P', 'U', 'B', 'L', 'I', 'C'}; private static final char[] A_SYSTEM = new char[] {'S', 'Y', 'S', 'T', 'E', 'M'}; private static final char[] CDATA = new char[] {'[', 'C', 'D', 'A', 'T', 'A', '['}; private static final char[] CDATA_END = new char[] {']', ']', '>'}; private static final char[] XML = new char[] {'?', 'x', 'm', 'l'}; private static final char[] XML_VERSION = new char[] {'v', 'e', 'r', 's', 'i', 'o', 'n'}; private static final char[] XML_ENCODING = new char[] {'e', 'n', 'c', 'o', 'd', 'i', 'n', 'g'}; private static final char[] XML_STANDALONE = new char[] {'s', 't', 'a', 'n', 'd', 'a', 'l', 'o', 'n', 'e'}; private static final char[] CC_IF = new char[] {'[', 'i', 'f', ' '}; private static final char[] CC_ENDIF = new char[] {'[', 'e', 'n', 'd', 'i', 'f', ']'}; private static final char[] CC_ENDIF2 = new char[] {'<', '!', '[', 'e', 'n', 'd', 'i', 'f', ']'}; private static final char[] CC_END = new char[] {']', '>'}; // CDATA private static final char[][] RAWTEXT_TAGS = new char[][] { T_XMP, T_STYLE, T_IFRAME, T_NOEMBED, T_NOFRAMES, T_NOSCRIPT, T_SCRIPT }; private static final char[][] RCDATA_TAGS = new char[][] 
{
            T_TEXTAREA, T_TITLE
    };

    private static final char REPLACEMENT_CHAR = '\uFFFD';

    // The commented-out values below have more than four hex digits, so they do
    // not fit into a single char literal; they are kept for reference only.
    private static final char[] INVALID_CHARS = new char[] {'\u000B', '\uFFFE', '\uFFFF'};
    //, '\u1FFFE', '\u1FFFF', '\u2FFFE', '\u2FFFF', '\u3FFFE', '\u3FFFF', '\u4FFFE,
    // '\u4FFFF', '\u5FFFE', '\u5FFFF', '\u6FFFE', '\u6FFFF', '\u7FFFE', '\u7FFFF', '\u8FFFE', '\u8FFFF', '\u9FFFE,
    // '\u9FFFF', '\uAFFFE', '\uAFFFF', '\uBFFFE', '\uBFFFF', '\uCFFFE', '\uCFFFF', '\uDFFFE', '\uDFFFF', '\uEFFFE,
    // '\uEFFFF', '\uFFFFE', '\uFFFFF', '\u10FFFE', '\u10FFFF',

    /** Expression reported to the visitor for a closing conditional comment. */
    private static final CharSequence _ENDIF = "endif";

}





// © 2015 - 2024 Weber Informatics LLC | Privacy Policy