All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.seasar.mayaa.impl.builder.parser.HtmlStandardScanner Maven / Gradle / Ivy

Go to download

Mayaa is server side web template engine that is strongly aware of work sharing between programmers and designers based on HTML based templates.

The newest version!
/*
 * Copyright 2004-2022 the Seasar Foundation and the Others.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 * either express or implied. See the License for the specific language
 * governing permissions and limitations under the License.
 */
package org.seasar.mayaa.impl.builder.parser;

import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.nio.BufferUnderflowException;
import java.nio.CharBuffer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.EmptyStackException;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;
import java.util.Stack;
import java.util.regex.Pattern;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.xerces.impl.Constants;
import org.apache.xerces.impl.XMLErrorReporter;
import org.apache.xerces.xni.Augmentations;
import org.apache.xerces.xni.QName;
import org.apache.xerces.xni.XMLAttributes;
import org.apache.xerces.xni.XMLDocumentHandler;
import org.apache.xerces.xni.XMLLocator;
import org.apache.xerces.xni.XMLString;
import org.apache.xerces.xni.XNIException;
import org.apache.xerces.xni.parser.XMLComponent;
import org.apache.xerces.xni.parser.XMLComponentManager;
import org.apache.xerces.xni.parser.XMLConfigurationException;
import org.apache.xerces.xni.parser.XMLDocumentScanner;
import org.apache.xerces.xni.parser.XMLInputSource;
import org.seasar.mayaa.impl.builder.parser.HtmlTokenizer.TagToken;
import org.seasar.mayaa.impl.knowledge.HTMLKnowledge;
import org.xml.sax.Locator;

/**
 * 
 * 外部の文書宣言は解決しない
 * フォーマル公開識別子については妥当性の検証を行わない。
 * HTML文字参照以外は解決しない
 */
public class HtmlStandardScanner implements XMLComponent, XMLDocumentScanner {
    static final Log LOG = LogFactory.getLog(HtmlStandardScanner.class);
    static final Log LOG_TOKENHANDLER = LogFactory.getLog(HtmlStandardScanner.class.getName() + ".TokenHandler");
    static final Log LOG_TOKENIZER = LogFactory.getLog(HtmlStandardScanner.class.getName() + ".Tokenizer");

    static final String NS_URI_HTML = "http://www.w3.org/1999/xhtml";

    static final String NS_URI_MATHML = "http://www.w3.org/1998/Math/MathML";

    static final String NS_URI_SVG = "http://www.w3.org/2000/svg";

    static final String NS_URI_XLINK = "http://www.w3.org/1999/xlink";

    static final String NS_URI_XML = "http://www.w3.org/XML/1998/namespace";

    static final String NS_URI_XMLNS = "http://www.w3.org/2000/xmlns/";

    static final Attributes EMPTY_ATTRIBUTES = new Attributes();

    static final Pattern REGEX_WHITESPACE_ONLY = Pattern.compile("\\s+");

    static final QName QN_HTML = new QName(null, "html", "html", NS_URI_HTML);
    static final QName QN_HEAD = new QName(null, "head", "head", NS_URI_HTML);
    static final QName QN_BODY = new QName(null, "body", "body", NS_URI_HTML);
    static final QName QN_TEMPLATE = new QName(null, "template", "template", NS_URI_HTML);

    static {
        QN_HEAD.hashCode();
    }
    /**  */
    XMLDocumentHandler documentHandler = null;

    HtmlTokenizer tokenizer;

    //ByteBuffer buffer = ByteBuffer.allocate(8 * 1024);
    // CharBuffer buffer = CharBuffer.allocate(8 * 1024);
    XMLInputSource inputSource;

    @Override
    public void setInputSource(XMLInputSource inputSource) throws IOException {
        tokenizer.setInputSource(inputSource);
        this.inputSource = inputSource;
    }

    @Override
    public void setDocumentHandler(XMLDocumentHandler handler) {
        this.documentHandler = handler;
    }

    @Override
    public XMLDocumentHandler getDocumentHandler() {
        return documentHandler;
    }

    // FEATURES
    public static final String FEATURE_PREFIX = "http://mayaa.seasar.org/parser/feature/";
    public static final String FEATURE_DELETE_UNEXPECTED_ELEMENT = FEATURE_PREFIX + "delete-unexpected-element";
    public static final String FEATURE_INSERT_IMPLIED_ELEMENT = FEATURE_PREFIX + "insert-implied-element";
    public static final String FEATURE_DOCUMENT_FRAGMENT = FEATURE_PREFIX + "document-fragment";

    boolean featureInsertImpliedElement = false;
    boolean featureDeleteUnexpectedElement = false;
    boolean featureDocumentFragment = true;
    
    @Override
    public String[] getRecognizedFeatures() {
        return new String[] {
            FEATURE_DELETE_UNEXPECTED_ELEMENT,
            FEATURE_INSERT_IMPLIED_ELEMENT,
            FEATURE_DOCUMENT_FRAGMENT,
        };
    }

    @Override
    public Boolean getFeatureDefault(String featureId) {
        switch (featureId) {
            case FEATURE_DELETE_UNEXPECTED_ELEMENT: return Boolean.FALSE;
            case FEATURE_INSERT_IMPLIED_ELEMENT: return Boolean.FALSE;
            case FEATURE_DOCUMENT_FRAGMENT: return Boolean.TRUE;
            default: return Boolean.FALSE;
        }
    }

    @Override
    public void setFeature(String featureId, boolean state) throws XMLConfigurationException {
        switch (featureId) {
            case FEATURE_DELETE_UNEXPECTED_ELEMENT:
                featureDeleteUnexpectedElement = state;
                break;
            case FEATURE_INSERT_IMPLIED_ELEMENT:
                featureInsertImpliedElement = state;
                break;
            case FEATURE_DOCUMENT_FRAGMENT:
                featureDocumentFragment = state;
                break;
        }
    }
    // FEATURES:END

    // PROPERTEIS
    static final String ERROR_REPORTER = Constants.XERCES_PROPERTY_PREFIX + Constants.ERROR_REPORTER_PROPERTY;

    @Override
    public String[] getRecognizedProperties() {
        return new String[] {
        };
    }

    @Override
    public Object getPropertyDefault(String propertyId) {
        return null;
    }

    @Override
    public void setProperty(String propertyId, Object value) throws XMLConfigurationException {
    }
    // PROPERTEIS:END

    // INTERNAL UTILS
    boolean matchOneOfThese(String target, String... compare) {
        for (String c : compare) {
            if (c.equalsIgnoreCase(target)) {
                return true;
            }
        }
        return false;
    }

    QName createHtmlQName(String tagName) {
        return new QName(null, tagName, tagName, NS_URI_HTML);
    }

    void insertStartTag(TagToken tagToken, XMLAttributes attributes) {
        QName qName = createHtmlQName(tagToken.nameBuilder.toString());
        insertStartTag(qName, attributes);
    }

    void insertStartTag(QName qName, XMLAttributes attributes) {
        if (HTMLKnowledge.isVoidElementLocalPart(qName.localpart)) {
            if (LOG_TOKENHANDLER.isTraceEnabled()) {
                LOG_TOKENHANDLER.trace(String.format("%s: Insert void tag: <%s>", insertionMode, qName.localpart));
            }
            documentHandler.emptyElement(qName, attributes, null);
        } else {
            if (LOG_TOKENHANDLER.isTraceEnabled()) {
                LOG_TOKENHANDLER.trace(String.format("%s: Insert start tag: <%s>", insertionMode, qName.localpart));
            }
            unclosedElementStack.push(qName);
            documentHandler.startElement(qName, attributes, null);
        }
    }

    void insertEndTag(TagToken tagToken) {
        QName qName = createHtmlQName(tagToken.nameBuilder.toString());
        insertEndTag(qName);
    }

    void insertEndTag(final QName qName) {
        try {
            do {
                QName qn = unclosedElementStack.peek();
                if (qName.localpart.equals(qn.localpart)) {
                    break;
                }
                if (LOG_TOKENHANDLER.isTraceEnabled()) {
                    LOG_TOKENHANDLER.trace(String.format("%s: Insert implied end tag: ", insertionMode, qn.localpart));
                }
                unclosedElementStack.pop();
                documentHandler.endElement(qn, null);
            } while(true);

            if (LOG_TOKENHANDLER.isTraceEnabled()) {
                LOG_TOKENHANDLER.trace(String.format("%s: Insert end tag: ", insertionMode, qName.localpart));
            }
            unclosedElementStack.pop();
            documentHandler.endElement(qName, null);
        } catch (EmptyStackException e) {
            // An appropriate end tag token is an end tag token whose tag name matches the tag name of the last start tag 
            // to have been emitted from this tokenizer, if any. If no start tag has been emitted from this tokenizer,
            // then no end tag token is appropriate.
        }
    }
    void insertImpliedStartTag(QName qName) {
        if (HTMLKnowledge.isVoidElementLocalPart(qName.localpart)) {
            if (LOG_TOKENHANDLER.isTraceEnabled()) {
                LOG_TOKENHANDLER.trace(String.format("%s: Insert implied void tag: <%s>", insertionMode, qName.localpart));
            }
            documentHandler.emptyElement(qName, EMPTY_ATTRIBUTES, null);
        } else {
            if (LOG_TOKENHANDLER.isTraceEnabled()) {
                LOG_TOKENHANDLER.trace(String.format("%s: Insert implied start tag: <%s>", insertionMode, qName.localpart));
            }
            unclosedElementStack.push(qName);
            documentHandler.startElement(qName, EMPTY_ATTRIBUTES, null);
        }
    }
    void insertImpliedEndTag(QName qName) {
        if (LOG_TOKENHANDLER.isTraceEnabled()) {
            LOG_TOKENHANDLER.trace(String.format("%s: Insert implied end tag: ", insertionMode, qName.localpart));
        }
        unclosedElementStack.pop();
        documentHandler.endElement(qName, null);
    }

    void insertImpliedEndTagIfOpened(final QName qName) {
        for (QName qn : unclosedElementStack) {
            if (qn.localpart.equalsIgnoreCase(qName.localpart)) {
                insertEndTag(qName);
                break;
            }
        }
    }

    boolean isStartTagOf(TagToken tagToken, String ... tagName) {
        if (tagToken.isEndTag) {
            return false;
        }
        final String rawname = tagToken.nameBuilder.toString();
        if (tagName.length == 1) {
            return tagName[0].equalsIgnoreCase(rawname);
        } else if (tagName.length > 1 && matchOneOfThese(rawname, tagName)) {
            return true;
        }
        return false;
    }

    boolean isEndTagOf(TagToken tagToken, String ... tagName) {
        if (!tagToken.isEndTag) {
            return false;
        }
        final String rawname = tagToken.nameBuilder.toString();
        if (tagName.length == 1) {
            return tagName[0].equalsIgnoreCase(rawname);
        } else if (tagName.length > 1 && matchOneOfThese(rawname, tagName)) {
            return true;
        }
        return false;
    }

    // INTERNAL UTILS:END

    /**
     * Initially, the stack of open elements is empty.
     * https://html.spec.whatwg.org/multipage/parsing.html#the-stack-of-open-elements
     */
    Stack unclosedElementStack = new Stack<>();
    /*
     * Document Handler
     */
    enum InsertionMode {
        Initial(TokenHandlerInitial.class),
        BeforeHtml(TokenHandlerBeforeHtml.class),
        BeforeHead(TokenHandlerBeforeHead.class),
        InHead(TokenHandlerInHead.class),
        InHeadNoScript(TokenHandlerBase.class),
        AfterHead(TokenHandlerAfterHead.class),
        InBody(TokenHandlerInBody.class),
        Text(TokenHandlerBase.class),
        InTable(TokenHandlerBase.class),
        InTableText(TokenHandlerBase.class),
        InCaption(TokenHandlerBase.class),
        InColumnGroup(TokenHandlerBase.class),
        InTableBody(TokenHandlerBase.class),
        InRow(TokenHandlerBase.class),
        InCell(TokenHandlerBase.class),
        InSelect(TokenHandlerBase.class),
        InSelectInTable(TokenHandlerBase.class),
        InTemplate(TokenHandlerBase.class),
        AfterBody(TokenHandlerAfterBody.class),
        InFrameset(TokenHandlerBase.class),
        AfterFrameset(TokenHandlerBase.class),
        AfterAfterBody(TokenHandlerBase.class),
        AfterAfterFrameset(TokenHandlerBase.class);

        Class handlerClass;
        TokenHandler handler;

        InsertionMode(Class handlerClass) {
            this.handlerClass = handlerClass;
        }
    }

    public boolean fragmentCase = false;

    InsertionMode insertionMode = InsertionMode.Initial;

    private void setInsertionMode(InsertionMode insertionMode) {
        if (HtmlStandardScanner.LOG_TOKENHANDLER.isTraceEnabled()) {
            HtmlStandardScanner.LOG_TOKENHANDLER.trace("Insertion mode:" + this.insertionMode + " -> " + insertionMode);
        }
        this.insertionMode = insertionMode;
    }
    
    class TokenHandlerInitial extends TokenHandlerBase {
        @Override
        public void emitText(HtmlTokenizer tokenizer, HtmlLocation location, String text) {
            // ignore whitespaces before doctype, otherwise change insertion mode to "before html"
            if (REGEX_WHITESPACE_ONLY.matcher(text).matches()) {
                // Ignore whitespace originally, but to retain template as it.
                super.emitText(tokenizer, location, text);
            } else {
                super.emitText(tokenizer, location, text);
                setInsertionMode(InsertionMode.BeforeHtml);
                insertionMode.handler.emitText(tokenizer, location, text);
            }
        }

        @Override
        public void emitDoctype(HtmlTokenizer tokenizer, HtmlLocation location, String doctypeName, String publicId, String systemId) {
            if (documentHandler != null) {
                documentHandler.doctypeDecl(doctypeName, publicId, systemId, null);
            }
            fragmentCase = false;
            setInsertionMode(InsertionMode.BeforeHtml);
        }

        @Override
        public void emitXmlDecl(HtmlTokenizer tokenizer, HtmlLocation location, String version, String encoding, String standalone) {
            documentHandler.xmlDecl(version, encoding, standalone, null);
        }

        @Override
        public void emitTag(HtmlTokenizer tokenizer, HtmlLocation location, TagToken tagToken, Attributes attributes) {
            setInsertionMode(InsertionMode.BeforeHtml);
            insertionMode.handler.emitTag(tokenizer, location, tagToken, attributes);
        }
    }

    /**
     * Handle tokens in "before html" insertion mode.
     * https://html.spec.whatwg.org/multipage/parsing.html#the-before-html-insertion-mode
     */
    class TokenHandlerBeforeHtml extends TokenHandlerBase {

        @Override
        public void emitText(HtmlTokenizer tokenizer, HtmlLocation location, String text) {
            // ignore whitespaces before doctype, otherwise change insertion mode to "before html"
            if (REGEX_WHITESPACE_ONLY.matcher(text).matches()) {
                // Ignore whitespace originally, but to retain template as it.
                super.emitText(tokenizer, location, text);
            } else {
                super.emitText(tokenizer, location, text);
                if (!fragmentCase) {
                    insertImpliedStartTag(QN_HTML);
                }
                setInsertionMode(InsertionMode.BeforeHead);
            }
        }

        @Override
        public void emitTag(HtmlTokenizer tokenizer, HtmlLocation location, TagToken tagToken, Attributes attributes) {
            final String tagName = tagToken.nameBuilder.toString();
            if (tagToken.isEndTag && !matchOneOfThese(tagName, "head", "body", "html", "br")) {
                reportError("parse-error", new Object[]{ tagName });
                // AND IGNORE THIS TOKEN
            } else if (isStartTagOf(tagToken, "html")) {
                insertStartTag(QN_HTML, attributes);
                setInsertionMode(InsertionMode.BeforeHead);
            } else {
                if (!fragmentCase) {
                    insertImpliedStartTag(QN_HTML);
                }
                setInsertionMode(InsertionMode.BeforeHead);

                insertionMode.handler.emitTag(tokenizer, location, tagToken, attributes);
            }
        }
    }

    /**
     * Handle tokens in "before head" insertion mode.
     * https://html.spec.whatwg.org/multipage/parsing.html#the-before-head-insertion-mode
     */
    class TokenHandlerBeforeHead extends TokenHandlerBase {
        @Override
        public void emitText(HtmlTokenizer tokenizer, HtmlLocation location, String text) {
            super.emitText(tokenizer, location, text);
        }

        @Override
        public void emitTag(HtmlTokenizer tokenizer, HtmlLocation location, TagToken tagToken, Attributes attributes) {
            final String tagName = tagToken.nameBuilder.toString();
            if (tagToken.isEndTag && !matchOneOfThese(tagName, "head", "body", "html", "br")) {
                reportError("parse-error", new Object[]{ tagName });
                // AND IGNORE THIS TOKEN
            } else if (isStartTagOf(tagToken, "html")) {
                reportError("parse-error", new Object[]{ tagName });
                // Parse error.
                if (unclosedElementStack.contains(QN_TEMPLATE)) {
                    // If there is a template element on the stack of open elements, then ignore the token.
                    // IGNORE THIS TOKEN
                } else {
                    // Otherwise, for each attribute on the token, check to see if the attribute is already 
                    // present on the top element of the stack of open elements. If it is not, add the attribute
                    // and its corresponding value to that element.
                }
                documentHandler.startElement(QN_HEAD, attributes, null);
                setInsertionMode(InsertionMode.InHead);
            } else if (isStartTagOf(tagToken, "head")) {
                insertStartTag(tagToken, attributes);
                setInsertionMode(InsertionMode.InHead);
            } else {
                if (fragmentCase) {
                    setInsertionMode(InsertionMode.AfterHead);
                } else if (isStartTagOf(tagToken, "body") && !featureInsertImpliedElement) {
                    setInsertionMode(InsertionMode.AfterHead);
                } else {
                    insertImpliedStartTag(QN_HEAD);
                    setInsertionMode(InsertionMode.InHead);
                }

                insertionMode.handler.emitTag(tokenizer, location, tagToken, attributes);
            }
        }
    }

    /**
     * Handle tokens in "in head" insertion mode.
     * https://html.spec.whatwg.org/multipage/parsing.html#the-in-head-insertion-mode
     */
    class TokenHandlerInHead extends TokenHandlerBase {
        @Override
        public void emitTag(HtmlTokenizer tokenizer, HtmlLocation location, TagToken tagToken, Attributes attributes) {
            if (isStartTagOf(tagToken, "meta", "base", "basefont", "bgsound", "link", "title", "template", "script", "noscript", "noframe", "style")) {
                insertStartTag(tagToken, attributes);
            } else if (isEndTagOf(tagToken, "template")) {
                insertEndTag(tagToken);
            } else if (isEndTagOf(tagToken, "head")) {
                insertEndTag(tagToken);
                setInsertionMode(InsertionMode.AfterHead);
            } else if (tagToken.isEndTag) {
                insertEndTag(tagToken);
            } else {
                // headがすでに閉じている状態でbody以外のタグが検出されたとき
                insertImpliedEndTagIfOpened(QN_HEAD);
                setInsertionMode(InsertionMode.AfterHead);
                insertionMode.handler.emitTag(tokenizer, location, tagToken, attributes);
            }
        }
    }

    /**
     * Handle tokens in "after head" insertion mode.
     * https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inhead
     */
    class TokenHandlerAfterHead extends TokenHandlerBase {
        @Override
        public void emitTag(HtmlTokenizer tokenizer, HtmlLocation location, TagToken tagToken, Attributes attributes) {
            if (isStartTagOf(tagToken, "head", "meta", "base", "basefont", "bgsound", "link", "title", "template", "script", "noscript", "noframe", "style")) {
                insertStartTag(tagToken, attributes);
            } else if (isStartTagOf(tagToken, "body")) {
                insertStartTag(QN_BODY, attributes);
                setInsertionMode(InsertionMode.InBody);
            } else if (tagToken.isEndTag) {
                insertEndTag(tagToken);
            } else {
                // headがすでに閉じている状態でbody以外のタグが検出されたとき
                if (!fragmentCase) {
                    insertImpliedStartTag(QN_BODY);
                }
                setInsertionMode(InsertionMode.InBody);
                super.emitTag(tokenizer, location, tagToken, attributes);
            }
        }
    }

    /**
     * Handle tokens in "in body" insertion mode.
     * https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inbody
     */
    class TokenHandlerInBody extends TokenHandlerBase {
        @Override
        public void emitTag(HtmlTokenizer tokenizer, HtmlLocation location, TagToken tagToken, Attributes attributes) {
            final String tagName = tagToken.nameBuilder.toString();
            if (isStartTagOf(tagToken, "html")) {
                reportError("parse-error", new Object[]{ tagName });
                // Parse error.
                if (unclosedElementStack.contains(QN_TEMPLATE)) {
                    // If there is a template element on the stack of open elements, then ignore the token.
                    // IGNORE THIS TOKEN
                } else {
                    // Otherwise, for each attribute on the token, check to see if the attribute is already 
                    // present on the top element of the stack of open elements. If it is not, add the attribute
                    // and its corresponding value to that element.
                }
            } else if (isStartTagOf(tagToken, "body")) {
                reportError("parse-error", new Object[]{ tagName });
            } else if (isEndTagOf(tagToken, "body")) {
                insertEndTag(tagToken);
                setInsertionMode(InsertionMode.AfterBody);
            } else if (featureDeleteUnexpectedElement && isStartTagOf(tagToken, "head", "noscript")) {
                //     // IGNORE
            } else if (!tagToken.isEndTag) {
                insertStartTag(tagToken, attributes);
            } else if (tagToken.isEndTag) {
                insertEndTag(tagToken);
            } else {
                super.emitTag(tokenizer, location, tagToken, attributes);
            }
        }
    }

    /**
     * Handle tokens in "after body" insertion mode.
     * https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-afterbody
     */
    class TokenHandlerAfterBody extends TokenHandlerBase {
    }

    class TokenHandlerBase implements TokenHandler {

        @Override
        public void emitXmlDecl(HtmlTokenizer tokenizer, HtmlLocation location, String version, String encoding, String standalone) {
            reportError("parse-error", new Object[]{ "xmlDecl", version, encoding, standalone });
            // IGNORE THIS TOKEN
        }

        @Override
        public void emitDoctype(HtmlTokenizer tokenizer, HtmlLocation location, String doctypeName, String publicId, String systemId) {
            reportError("parse-error", new Object[]{ "doctype", doctypeName, publicId, systemId });
            // AND IGNORE THIS TOKEN
        }
    
        @Override
        public void emitComment(HtmlTokenizer tokenizer, HtmlLocation location, String comment) {
            char[] ch = comment.toCharArray();
            documentHandler.comment(new XMLString(ch, 0, ch.length), null);
        }
    
        @Override
        public void emitText(HtmlTokenizer tokenizer, HtmlLocation location, String text) {
            char[] ch = text.toCharArray();
            documentHandler.characters(new XMLString(ch, 0, ch.length), null);
        }
    
        @Override
        public void emitTag(HtmlTokenizer tokenizer, HtmlLocation location, TagToken tagToken, Attributes attributes) {        
            final String tagName = tagToken.nameBuilder.toString();
            if (tagToken.isSelfClosingTag) {
                documentHandler.emptyElement(createHtmlQName(tagName), attributes, null);
            } else if (tagToken.isEndTag) {
                insertEndTag(createHtmlQName(tagName));
            } else {
                insertStartTag(createHtmlQName(tagName), attributes);
            }
        }

        @Override
        public void reportError(String msgId, Object[] args) {
            if (HtmlStandardScanner.LOG.isInfoEnabled()) {
                LOG.info("ERROR:" + msgId + (args == null ? "": (" " + Arrays.toString(args))));
            }
            // reportFatalError(msgId, args);
        }

        public void reportWarn(String msgId, Object[] args) {
            if (LOG.isInfoEnabled()) {
                LOG.info(String.format("WARN: %s%s", insertionMode, msgId, (args == null ? "": (" " + Arrays.toString(args)))));
            }
        }

        @Override
        public TokenHandler getNextHandler() {
            return insertionMode.handler;
        }
    }

    XMLErrorReporter errorReporter; 

    @Override
    public void reset(XMLComponentManager componentManager) throws XMLConfigurationException {
        for (InsertionMode m: InsertionMode.values()) {
            try {
                m.handler = m.handlerClass.getDeclaredConstructor(this.getClass()).newInstance(this);
            } catch (InstantiationException | IllegalAccessException | IllegalArgumentException
                 | SecurityException | InvocationTargetException | NoSuchMethodException e) {
                    throw new IllegalStateException(e);
            }
        }

        // fSymbolTable = (SymbolTable)componentManager.getProperty(SYMBOL_TABLE);
        errorReporter = (XMLErrorReporter)componentManager.getProperty(ERROR_REPORTER);

        tokenizer = new HtmlTokenizer();
        tokenizer.setInputSource(inputSource);
        tokenizer.reset();

        setInsertionMode(InsertionMode.Initial);
        unclosedElementStack = new Stack<>();
        fragmentCase = featureDocumentFragment;
    }

    @Override
    public boolean scanDocument(boolean complete) throws IOException, XNIException {
        try {
            XMLLocator locator = tokenizer.getLocator();
            documentHandler.startDocument(locator, inputSource.getEncoding(), null, null);

            tokenizer.runTokenizer(insertionMode.handler);
            // return success
            return true;    
        } catch (BufferUnderflowException e) {
            return false;
        }
    }

} // class ElementStack


class ScanningInterruptedExeption extends Exception {
    private static final long serialVersionUID = -592372281502837246L;

    public ScanningInterruptedExeption() {
        super();
    }

    public ScanningInterruptedExeption(Throwable throwable) {
        super(throwable);
    }
}

class HtmlLocation implements Locator, XMLLocator, Cloneable {
    int line = 1;
    int column = 1;
    int offset = 0;
    String publicId = null;
    String systemId = null;

    @Override
    public String toString() {
        return String.format("(%06d)l%d:c%d", offset, line, column);
    }

    public void copyPositionTo(HtmlLocation copy) {
        copy.column = column;
        copy.line = line;
        copy.offset = offset;
    }

    @Override
    public int getLineNumber() {
        return line;
    }

    @Override
    public int getColumnNumber() {
        return column;
    }

    @Override
    public int getCharacterOffset() {
        return offset;
    }

    @Override
    public String getPublicId() {
        return publicId;
    }

    @Override
    public String getSystemId() {
        return systemId;
    }

    @Override
    public String getLiteralSystemId() {
        return getSystemId();
    }

    @Override
    public String getBaseSystemId() {
        return getSystemId();
    }

    @Override
    public String getExpandedSystemId() {
        return getSystemId();
    }

    @Override
    public String getEncoding() {
        return null;
    }

    @Override
    public String getXMLVersion() {
        return null;
    }
}

class Attributes implements XMLAttributes {
    class Attribute {
        QName attrName;
        String attrType;
        String attrValue;
        Attribute(QName attrName, String attrType, String attrValue) {
            this.attrName = attrName;
            this.attrType = attrType;
            this.attrValue = attrValue;
        }

        @Override
        public int hashCode() {
            final int prime = 31;
            int result = 1;
            result = prime * result + getEnclosingInstance().hashCode();
            result = prime * result + Objects.hash(attrName, attrType, attrValue);
            return result;
        }
        
        @Override
        public boolean equals(Object obj) {
            if (this == obj)
                return true;
            if (!(obj instanceof Attribute))
                return false;
            Attribute other = (Attribute) obj;
            if (!getEnclosingInstance().equals(other.getEnclosingInstance()))
                return false;
            return Objects.equals(attrName, other.attrName) && Objects.equals(attrType, other.attrType)
                    && Objects.equals(attrValue, other.attrValue);
        }
        

        /* (non-Javadoc)
         * @see java.lang.Object#toString()
         */
        
        @Override
        public String toString() {
            return attrName.rawname + "=\"" + attrValue.toString() + "\"";
        }

        private Attributes getEnclosingInstance() {
            return Attributes.this;
        }
        
    }
    ArrayList attributes = new ArrayList<>();

    
    /* (non-Javadoc)
     * @see java.lang.Object#toString()
     */
    
    @Override
    public String toString() {
        return attributes.toString();
    }

    @Override
    public int addAttribute(QName attrName, String attrType, String attrValue) {
        attributes.add(new Attribute(attrName, attrType, attrValue));
        return attributes.size() - 1;
    }

    @Override
    public void removeAllAttributes() {
        attributes.clear();
    }

    @Override
    public void removeAttributeAt(int attrIndex) {
        attributes.remove(attrIndex);
    }

    @Override
    public int getLength() {
        return attributes.size();
    }

    @Override
    public int getIndex(String qName) {
        int idx = qName.indexOf(':');
        QName nm = null;
        if (idx == -1) {
            nm = new QName(null, qName, qName, null);
        } else {
            nm = new QName(qName.substring(0, idx), qName.substring(idx + 1), qName, null);
        }
        return attributes.indexOf(new Attribute(nm, null, null));
    }

    @Override
    public int getIndex(String uri, String localPart) {
        QName nm = new QName(null, localPart, localPart, uri);
        return attributes.indexOf(new Attribute(nm, null, null));
    }

    @Override
    public void setName(int attrIndex, QName attrName) {
        Attribute a = attributes.get(attrIndex);
        if (a != null) {
            a.attrName = attrName;
        }
    }

    @Override
    public void getName(int attrIndex, QName attrName) {
        Attribute a = attributes.get(attrIndex);
        if (a != null) {
            attrName.setValues(a.attrName);
        }
    }

    @Override
    public String getPrefix(int index) {
        Attribute a = attributes.get(index);
        if (a != null) {
            return a.attrName.prefix;
        }
        return null;
    }

    @Override
    public String getURI(int index) {
        Attribute a = attributes.get(index);
        if (a != null) {
            return a.attrName.uri;
        }
        return null;
    }

    @Override
    public String getLocalName(int index) {
        Attribute a = attributes.get(index);
        if (a != null) {
            return a.attrName.localpart;
        }
        return null;
    }

    @Override
    public String getQName(int index) {
        Attribute a = attributes.get(index);
        if (a != null) {
            return a.attrName.rawname;
        }
        return null;
    }

    @Override
    public void setType(int attrIndex, String attrType) {
        Attribute a = attributes.get(attrIndex);
        if (a != null) {
            a.attrType = attrType;
        }
    }

    @Override
    public String getType(int index) {
        Attribute a = attributes.get(index);
        if (a != null) {
            return a.attrType;
        }
        return null;
    }

    @Override
    public String getType(String qName) {
        int index = getIndex(qName);
        if (index != -1) {
            return attributes.get(index).attrType;
        }
        return null;
    }

    @Override
    public String getType(String uri, String localName) {
        int index = getIndex(uri, localName);
        if (index != -1) {
            return attributes.get(index).attrType;
        }
        return null;
    }

    @Override
    public void setValue(int attrIndex, String attrValue) {
        Attribute a = attributes.get(attrIndex);
        if (a != null) {
            a.attrValue = attrValue;
        }
    }

    @Override
    public String getValue(int index) {
        Attribute a = attributes.get(index);
        if (a != null) {
            return a.attrValue;
        }
        return null;
    }

    @Override
    public String getValue(String qName) {
        int attrIndex = getIndex(qName);
        Attribute a = attributes.get(attrIndex);
        if (a != null) {
            return a.attrValue;
        }
        return null;
    }

    @Override
    public String getValue(String uri, String localName) {
        int attrIndex = getIndex(uri, localName);
        Attribute a = attributes.get(attrIndex);
        if (a != null) {
            return a.attrValue;
        }
        return null;
    }

    @Override
    public void setNonNormalizedValue(int attrIndex, String attrValue) {
        Attribute a = attributes.get(attrIndex);
        if (a != null) {
            a.attrValue = attrValue;
        }
    }

    @Override
    public String getNonNormalizedValue(int attrIndex) {
        Attribute a = attributes.get(attrIndex);
        if (a != null) {
            return a.attrValue;
        }
        return null;
    }

    @Override
    public void setSpecified(int attrIndex, boolean specified) {
    }

    @Override
    public boolean isSpecified(int attrIndex) {
        return false;
    }

    @Override
    public Augmentations getAugmentations(int attributeIndex) {
        return null;
    }

    @Override
    public Augmentations getAugmentations(String uri, String localPart) {
        return null;
    }

    @Override
    public Augmentations getAugmentations(String qName) {
        return null;
    }

    @Override
    public void setAugmentations(int attrIndex, Augmentations augs) {
    }
}

interface TokenHandler {
    void reportError(String msgId, Object[] args);

    void emitText(HtmlTokenizer tokenizer, HtmlLocation location, String text);

    void emitXmlDecl(HtmlTokenizer tokenizer, HtmlLocation location, String version, String encoding, String standalone);

    void emitDoctype(HtmlTokenizer tokenizer, HtmlLocation location, String doctypeName, String publicId, String systemId);

    void emitComment(HtmlTokenizer tokenizer, HtmlLocation location, String comment);

    void emitTag(HtmlTokenizer tokenizer, HtmlLocation location, TagToken tagToken, Attributes attributes);

    TokenHandler getNextHandler();

}

class HtmlTokenizer {

    // Represent EOF
    private static final char CHAR_SUB = 0x1A;

    private XMLInputSource inputSource;
    CharBuffer cbuf;

    private int pushedBack = CHAR_SUB;
    private int lastChar = CHAR_SUB;
    private StringBuilder characterBuilder = new StringBuilder();
    private Attributes attributes = new Attributes();
    private HtmlLocation location = new HtmlLocation();
    private HtmlLocation currentLocation = new HtmlLocation();

    private TagToken lastStartTagToken = null;
    TokenizeState tokenizeState = TokenizeState.Data;

    enum TokenizeState {
        Data,
        RcData,
        RawText,
        ScriptData,
        TagOpen,
        EndTagOpen,
        TagName,
        RcDataLessThanSign,
        RcDataEndTagOpen,
        RcDataEndTagName,
        RawTextLessThanSign,
        RawTextEndTagOpen,
        RawTextEndTagName,
        ScriptDataLessThanSign,
        ScriptDataEndTagOpen,
        ScriptDataEndTagName,
        ScriptDataEscapeStart,
        ScriptDataEscapeStartDash,
        ScriptDataEscaped,
        ScriptDataEscapedDash,
        ScriptDataEscapedLessThanSign,
        ScriptDataEscapedEndTagOpen,
        ScriptDataEscapedEndTagName,
        ScriptDataDoubleEscapeStart,
        ScriptDataDoubleEscaped,
        ScriptDataDoubleEscapedDash,
        ScriptDataDoubleEscapedDashDash,
        ScriptDataDoubleEscapedLessThanSign,
        ScriptDataDoubleEscapeEnd,
        BeforeAttributeName,
        AttributeName,
        AfterAttributeName,
        BeforeAttributeValue,
        AttributeValueDoubleQuoted,
        AttributeValueSingleQuoted,
        AttributeValueUnquoted,
        AfterAttributeValueQuoted,
        SelfClosingStartTag,
        BogusComment,
        MarkupDeclarationOpen,
        CommentStart,
        CommentStartDash,
        Comment,
        CommentLessThanSign,
        CommentLessThanSignBang,
        CommentLessThanSignBangDash,
        CommentLessThanSignBangDashDash,
        CommentEndDash,
        CommentEnd,
        CommentEndBang,
        Doctype,
        BeforeDoctypeName,
        DoctypeName,
        AfterDoctypeName,
        AfterDoctypePublicKeyword,
        BeforeDoctypePublicIdentifier,
        DoctypePublicIdentifierDoubleQuoted,
        DoctypePublicIdentifierSingleQuoted,
        AfterDoctypePublicIdentifier,
        BetweenDoctypePublicAndSystemIdentifiers,
        AfterDoctypeSystemKeyword,
        BeforeDoctypeSystemIdentifier,
        DoctypeSystemIdentifierDoubleQuoted,
        DoctypeSystemIdentifierSingleQuoted,
        AfterDoctypeSystemIdentifier,
        BogusDoctype,
        CDataSection,
        CDataSectionBracket,
        CDataSectionEnd,
        CharacterReference,
        NamedCharacterReference,
        AmbiguousAmpersand,
        NumericCharacterReference,
        HexadecimalCharacterReferenceStart,
        DecimalCharacterReferenceStart,
        HexadecimalCharacterReference,
        DecimalCharacterReference,
        NumericCharacterReferenceEnd,
        ScriptDataEscapedDashDash,
    }

    class Doctype {
        StringBuilder nameBuilder;
        boolean forceQuirkFlag = false;
        StringBuilder publicIdBuilder;
        StringBuilder systemIdBuilder;
    }

    class TagToken {
        StringBuilder nameBuilder = new StringBuilder();
        boolean isEndTag;
        boolean isSelfClosingTag;
    }

    public void setTokenizerState(TokenizeState state) {
        this.tokenizeState = state;
    }

    public XMLLocator getLocator() {
        return location;
    }

    public void reset() {
        // RESET PARSING STATES
        attributes.removeAllAttributes();
        lastStartTagToken = null;
        location.column = 1;
        location.line = 1;
        location.offset = 1;
        if (inputSource != null) {
            location.publicId = inputSource.getPublicId();
            location.systemId = inputSource.getSystemId();
        } else {
            location.publicId = null;
            location.systemId = null;
        }
        location.copyPositionTo(currentLocation);
        cbuf = CharBuffer.allocate(4096);
        cbuf.flip();
    }

    public void setInputSource(XMLInputSource inputSource) {
        this.inputSource = inputSource;
        location.column = 1;
        location.line = 1;
        location.offset = 1;
        if (inputSource != null) {
            location.publicId = inputSource.getPublicId();
            location.systemId = inputSource.getSystemId();    
        } else {
            location.publicId = null;
            location.systemId = null;
        }
    }

    private char getChar() throws ScanningInterruptedExeption, IOException {
        if (cbuf.remaining() == 0) {
            cbuf.flip();
            cbuf.clear();
            if (inputSource.getCharacterStream().read(cbuf) == 0) {
                cbuf.flip();
                return CHAR_SUB;
            };
            cbuf.flip();
        }
        if (pushedBack != CHAR_SUB) {
            final char c = (char) pushedBack;
            pushedBack = CHAR_SUB;
            return c;
        }
        try {
            int c = cbuf.get();
            while (c == '\r') {
                c = cbuf.get();
            }

            lastChar = c;
            currentLocation.offset++;
            if (c == '\n') {
                currentLocation.line++;
                currentLocation.column = 1;
            } else {
                currentLocation.column++;
            }
            return (char) c;
        } catch (BufferUnderflowException e) {
            return CHAR_SUB;
        }
    }

    private void pushBack() {
        pushedBack = lastChar;
        // pushbackされた文字は次回すぐに読み出されるためcurrentLocationは戻さない
    }

    private boolean skipStringIgnoreCase(String string) throws IOException {
        final char[] array = cbuf.array();
        final int start = cbuf.position() + cbuf.arrayOffset();
        final int end = start + string.length();
        for (int i = start, j = 0; i < end; ++i, ++j) {
            final char b = string.charAt(j);
            if (Character.toUpperCase(array[i]) != Character.toUpperCase(b)) {
                return false;
            }
        }
        cbuf.position(cbuf.position() + string.length());
        currentLocation.offset += string.length();
        currentLocation.column += string.length();
        return true;
    }

    private boolean skipString(String string) throws IOException {
        final char[] array = cbuf.array();
        final int start = cbuf.position() + cbuf.arrayOffset();
        final int end = start + string.length();
        for (int i = start, j = 0; i < end; ++i, ++j) {
            if (array[i] != string.charAt(j)) {
                return false;
            }
        }
        cbuf.position(cbuf.position() + string.length());
        currentLocation.offset += string.length();
        currentLocation.column += string.length();
        return true;
    }

    private boolean isApplicableAttributeName(final String token) {
        if (token == null || token.length() == 0) {
            return false;
        }
        if (token.charAt(0) == '$' && token.length() > 2 && token.charAt(1) == '{' && token.charAt(token.length() - 1) == '}') {
            // 属性名としての変数参照は無視する
            return false;
        }
        return true;
        // return ATTR_NAME_PATTERN.matcher(token).matches();
    }

    private boolean isAppropriateEndTagToken(TagToken tagToken) {
        if (lastStartTagToken == null) {
            return false;
        }
        if (lastStartTagToken.nameBuilder.toString().equalsIgnoreCase(tagToken.nameBuilder.toString())) {
            return true;
        }
        return false;
    }

    private boolean containSameAttributeName(String name) {
        if (attributes == null) {
            return false;
        }
        return attributes.getIndex(name) != -1;
    }

    private void appendTextNode(int c) {
        characterBuilder.append((char) c);
    }

    private void appendTextNode(char c) {
        characterBuilder.append(c);
    }

    private void appendTextNode(String string) {
        characterBuilder.append(string);
    }

    private void emitAttribute(String prefix, String name, String value) {
        if (!isApplicableAttributeName(name)) {
            return;
        }
        // add to current attribute list.
        attributes.addAttribute(new QName(prefix, name, name, null), name, value);
    }

    private void emitEof() throws ScanningInterruptedExeption {
        throw new ScanningInterruptedExeption();
    }

    private void emitDoctype(TokenHandler handler, Doctype doctype) {
        String doctypeName = doctype.nameBuilder == null ? null: doctype.nameBuilder.toString();
        String publicId = doctype.publicIdBuilder == null ? null: doctype.publicIdBuilder.toString();
        String systemId = doctype.systemIdBuilder == null ? null: doctype.systemIdBuilder.toString();

        if (HtmlStandardScanner.LOG_TOKENIZER.isTraceEnabled()) {
            HtmlStandardScanner.LOG_TOKENIZER.trace(String.format("%s: DOCTYPE %s %s %s", location.toString(), doctypeName, publicId, systemId));
        }
        handler.emitDoctype(null, location, doctypeName, publicId, systemId);
        currentLocation.copyPositionTo(location);
    }

    private Map extractAttributes(String str) {
        char quoteChar = 0;
        HashMap map = new HashMap<>();
        StringBuilder key = null;
        StringBuilder value = null;
        final int BEFORE_KEY = 1;
        final int KEY = 2;
        final int AFTER_KEY = 3;
        final int BEFORE_VALUE = 4;
        final int VALUE = 5;
        int state = BEFORE_KEY;

        for (char c: str.toCharArray()) {
            switch (state) {
                case BEFORE_KEY:
                    if (Character.isWhitespace(c)) {
                        // SKIP WHITE SPACE
                    } else {
                        key = new StringBuilder();
                        key.append(c);
                        state = KEY;
                    }
                    break;
                case KEY:
                    if (Character.isWhitespace(c)) {
                        state = AFTER_KEY;
                    } else if (c == '=') {
                        state = BEFORE_VALUE;
                    } else {
                        key.append(c);    
                    }
                    break;
                case AFTER_KEY:
                    if (Character.isAlphabetic(c)) {
                        map.put(key.toString(), "");
                        key = new StringBuilder();
                        key.append(c);
                    } else if (Character.isWhitespace(c)) {
                        // SKIP WHITE SPACE
                    } else if (c == '=') {
                        state = BEFORE_VALUE;
                    }
                    break;
                case BEFORE_VALUE:
                    if (Character.isAlphabetic(c)) {
                        quoteChar = ' ';
                        value = new StringBuilder();
                        value.append(c);
                        state = VALUE;
                    } else if (Character.isWhitespace(c)) {
                        // SKIP WHITE SPACE
                    } else if (c == '"' || c == '\'') {
                        quoteChar = c;
                        value = new StringBuilder();
                        state = VALUE;
                    }
                    break;
                case VALUE:
                    if (quoteChar == ' ' && Character.isWhitespace(c)) {
                        map.put(key.toString(), value.toString());
                        state = BEFORE_KEY;
                    } else if (c == quoteChar) {
                        map.put(key.toString(), value.toString());
                        state = BEFORE_KEY;
                    } else {
                        value.append(c);
                    }
                    break;
            }
        }
        return map;
    }
    private void emitComment(TokenHandler handler, String comment, boolean mayXmlDecl) {
        if (mayXmlDecl && comment.startsWith("?xml ")) {
            if (HtmlStandardScanner.LOG_TOKENIZER.isTraceEnabled()) {
                HtmlStandardScanner.LOG_TOKENIZER.trace(String.format("%s: XML DECL <%s>", location.toString(), comment));
            }
            Map map = extractAttributes(comment.substring(5));
            handler.emitXmlDecl(this, location, map.get("version"), map.get("encoding"), map.get("standalone"));
        } else {
            if (HtmlStandardScanner.LOG_TOKENIZER.isTraceEnabled()) {
                HtmlStandardScanner.LOG_TOKENIZER.trace(String.format("%s: COMMENT ", location.toString(), comment));
            }
            handler.emitComment(this, location, comment);
        }
        currentLocation.copyPositionTo(location);
    }

    private void emitTag(TokenHandler handler, TagToken tagToken) {
        final String tagName = tagToken.nameBuilder.toString();
        if (tagToken.isSelfClosingTag) {
            if (HtmlStandardScanner.LOG_TOKENIZER.isTraceEnabled()) {
                HtmlStandardScanner.LOG_TOKENIZER.trace(String.format("%s: ELEM(EMPTY) %s %s", location.toString(), tagName, attributes));
            }
        } else if (tagToken.isEndTag) {
            if (HtmlStandardScanner.LOG_TOKENIZER.isTraceEnabled()) {
                HtmlStandardScanner.LOG_TOKENIZER.trace(String.format("%s: ELEM END /%s %s", location.toString(), tagName, attributes));
            }
        } else {
            if (HtmlStandardScanner.LOG_TOKENIZER.isTraceEnabled()) {
                HtmlStandardScanner.LOG_TOKENIZER.trace(String.format("%s: ELEM START %s %s", location.toString(), tagName, attributes));
            }
        }

        handler.emitTag(this, location, tagToken, attributes);

        if (tagToken.isEndTag) {
            lastStartTagToken = null;
        } else {
            lastStartTagToken = tagToken;

            // https://html.spec.whatwg.org/multipage/parsing.html#parsing-html-fragments
            // 4. Set the state of the HTML parser's tokenization stage as follows, switching on the context element:
            switch (tagName) {
                case "title":
                case "textarea":
                    setTokenizerState(TokenizeState.RcData);
                    break;
                case "style":
                case "xmp":
                case "iframe":
                case "noembed":
                case "noframes":
                    setTokenizerState(TokenizeState.RawText);
                    break;
                case "script":
                    setTokenizerState(TokenizeState.ScriptData);
                    break;
                case "noscript":
                    setTokenizerState(TokenizeState.RawText);
                    break;
                default:
                    break;
            }
            attributes.removeAllAttributes();
        }
        currentLocation.copyPositionTo(location);
    }

    private void emitSelfClosingTag(TokenHandler handler, TagToken tagToken) {
        tagToken.isSelfClosingTag = true;
        if (HtmlStandardScanner.LOG_TOKENIZER.isTraceEnabled()) {
            final String tagName = tagToken.nameBuilder.toString();
            HtmlStandardScanner.LOG_TOKENIZER.trace(String.format("%s: ELEM(EMPTY) %s %s", location.toString(), tagName, attributes));
        }

        if (tagToken.isEndTag) {
            lastStartTagToken = null;
        } else {
            lastStartTagToken = null;
        }
        handler.emitTag(this, location, tagToken, attributes);
        attributes.removeAllAttributes();
        currentLocation.copyPositionTo(location);
    }

    private void emitTextIfAvailable(TokenHandler handler) {
        if (characterBuilder.length() > 0) {
            if (HtmlStandardScanner.LOG_TOKENIZER.isTraceEnabled()) {
                String text = characterBuilder.toString();
                text = text.replace("\n", "\\n");
                text = text.replace("\t", "\\t");
                final int LIMIT_LANGTH = 40;
                if (text.length() > LIMIT_LANGTH) {
                    text = text.substring(0, LIMIT_LANGTH/2-1) + "..."
                         + text.substring(text.length() - LIMIT_LANGTH/2-2, text.length());
                }
                HtmlStandardScanner.LOG_TOKENIZER.trace(String.format("%s: TEXT \"%s\"", location, text));
            }
            handler.emitText(this, location, characterBuilder.toString()); 
            characterBuilder = new StringBuilder();
            currentLocation.copyPositionTo(location);
        }
    }

    void runTokenizer(TokenHandler handler) {
        TagToken tagToken = null;
        StringBuilder attrNameBuilder = new StringBuilder();
        StringBuilder attrValueBuilder = new StringBuilder();
        StringBuilder commentBuilder = new StringBuilder();
        Doctype doctype = new Doctype();
        StringBuilder temporaryBuffer = new StringBuilder();
        boolean mayXmlDeclAsBogusComment = false;


        TokenizeState lastTokenizeState = null;
        try {
            char c = 0xFFFF;
            do {
                if (HtmlStandardScanner.LOG_TOKENIZER.isTraceEnabled()) {
                    if (lastTokenizeState != tokenizeState) {                        
                        HtmlStandardScanner.LOG_TOKENIZER.trace("Tokenize state:" + lastTokenizeState + " -> " + tokenizeState);
                        lastTokenizeState = tokenizeState;
                    }
                }
                switch (tokenizeState) {
                    case Data:
                        c = getChar();
                        if (c == '<') {
                            tokenizeState = TokenizeState.TagOpen;
                        // } else if (c == '&') {
                        //     tokenizeState = TokenizeState.CharacterReference;
                        } else if (c == 0) {
                            handler.reportError("unexpected-null-character", null);
                            appendTextNode(0);
                        } else if (c == CHAR_SUB) {
                            emitTextIfAvailable(handler);
                            emitEof();
                        } else {
                            appendTextNode(c);
                        }
                        break;

                    case RcData:
                        // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state
                        c = getChar();
                        if (c == '<') {
                            tokenizeState = TokenizeState.RcDataLessThanSign;
                        // } else if (c == '&') {
                        //     tokenizeState = TokenizeState.CharacterReference;
                        } else if (c == 0) {
                            handler.reportError("unexpected-null-character", null);
                            appendTextNode(0xFFFD);
                        } else if (c == CHAR_SUB) {
                            emitTextIfAvailable(handler);
                            emitEof();
                        } else {
                            appendTextNode(c);
                        }
                        break;

                    case RawText:
                        // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-state
                        c = getChar();
                        if (c == '<') {
                            tokenizeState = TokenizeState.RawTextLessThanSign;
                        } else if (c == 0) {
                            handler.reportError("unexpected-null-character", null);
                            appendTextNode(0xFFFD);
                        } else if (c == CHAR_SUB) {
                            emitTextIfAvailable(handler);
                            emitEof();
                        } else {
                            appendTextNode(c);
                        }
                        break;

                    case ScriptData:
                        // https://html.spec.whatwg.org/multipage/parsing.html#script-data-state
                        c = getChar();
                        if (c == '<') {
                            tokenizeState = TokenizeState.ScriptDataLessThanSign;
                        } else if (c == 0) {
                            handler.reportError("unexpected-null-character", null);
                            appendTextNode(0xFFFD);
                        } else if (c == CHAR_SUB) {
                            emitTextIfAvailable(handler);
                            emitEof();
                        } else {
                            appendTextNode(c);
                        }
                        break;

                    case TagOpen:
                        emitTextIfAvailable(handler);
                        c = getChar();
                        if (c == '!') {
                            tokenizeState = TokenizeState.MarkupDeclarationOpen;
                        } else if (c == '/') {
                            tokenizeState = TokenizeState.EndTagOpen;
                        } else if (Character.isAlphabetic(c)) {
                            tagToken = new TagToken();
                            tagToken.nameBuilder.append(c /*Character.toLowerCase(c)*/);
                            tokenizeState = TokenizeState.TagName;
                        } else if (c == '?') {
                            handler.reportError("unexpected-question-mark-instead-of-tag-name", null);
                            commentBuilder = new StringBuilder();
                            commentBuilder.append(c);
                            mayXmlDeclAsBogusComment = true;
                            tokenizeState = TokenizeState.BogusComment;
                        } else if (c == CHAR_SUB) {
                            handler.reportError("eof-before-tag-name", null);
                            appendTextNode('<');
                        } else {
                            handler.reportError("invalid-first-character-of-tag-name", null);
                            tokenizeState = TokenizeState.Data;
                            appendTextNode('<');
                        }
                        break;

                    case EndTagOpen:
                        c = getChar();
                        if (Character.isAlphabetic(c)) {
                            tagToken = new TagToken();
                            tagToken.isEndTag = true;
                            tagToken.nameBuilder.append(c /*Character.toLowerCase(c)*/);
                            tokenizeState = TokenizeState.TagName;
                        } else if (c == '>') {
                            handler.reportError("missing-end-tag-name", null);
                            tokenizeState = TokenizeState.Data;
                        } else if (c == CHAR_SUB) {
                            handler.reportError("eof-before-tag-name", null);
                            appendTextNode('<');
                            appendTextNode(0x002F);
                            emitEof();
                        } else {
                            handler.reportError("invalid-first-character-of-tag-name", null);
                            tokenizeState = TokenizeState.BogusComment;
                            appendTextNode('<');
                        }
                        break;

                    case TagName:
                        // https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
                        c = getChar();
                        if (Character.isAlphabetic(c)) {
                            tagToken.nameBuilder.append(c /*Character.toLowerCase(c)*/);
                        } else if (c == '\t'/*TAB*/ || c == '\n'/*LINEFEED*/ || c == 0x0C/*FORMFEED*/ || c == ' ') {
                            tokenizeState = TokenizeState.BeforeAttributeName;
                        } else if (c == '/') {
                            tokenizeState = TokenizeState.SelfClosingStartTag;
                        } else if (c == '>') {
                            tokenizeState = TokenizeState.Data;
                            emitTextIfAvailable(handler);
                            emitTag(handler, tagToken);
                        } else if (c == 0) {
                            handler.reportError("unexpected-null-character", null);
                            tagToken.nameBuilder.append((char) 0xFFFD);
                        } else if (c == CHAR_SUB) {
                            handler.reportError("eof-before-tag-name", null);
                            appendTextNode('<');
                            appendTextNode(0x002F);
                            emitEof();
                        } else {
                            tagToken.nameBuilder.append(c);
                        }
                        break;
                    case RcDataLessThanSign:
                        // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-less-than-sign-state
                        c = getChar();
                        if (c == '/') {
                            temporaryBuffer = new StringBuilder();
                            tokenizeState = TokenizeState.RcDataEndTagOpen;
                        } else {
                            appendTextNode('<');
                            pushBack();
                            tokenizeState = TokenizeState.RcData;
                        }
                        break;

                    case RcDataEndTagOpen:
                        // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-less-than-sign-state
                        c = getChar();
                        if (Character.isAlphabetic(c)) {
                            tagToken = new TagToken();
                            tagToken.isEndTag = true;
                            pushBack();
                            tokenizeState = TokenizeState.RcDataEndTagName;
                        } else {
                            appendTextNode('<');
                            appendTextNode('/');
                            pushBack();
                            tokenizeState = TokenizeState.RcData;
                        }
                        break;

                    case RcDataEndTagName:
                        // https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-name-state
                        c = getChar();
                        boolean doElseClause = true;
                        if (c == '\t'/*TAB*/ || c == '\n'/*LINEFEED*/ || c == 0x0C/*FORMFEED*/ || c == ' ') {
                            if (isAppropriateEndTagToken(tagToken)) {
                                tokenizeState = TokenizeState.BeforeAttributeName;
                                doElseClause = false;
                            }
                        } else if (c == '/') {
                            if (isAppropriateEndTagToken(tagToken)) {
                                tokenizeState = TokenizeState.SelfClosingStartTag;
                                doElseClause = false;
                            }
                        } else if (c == '>') {
                            if (isAppropriateEndTagToken(tagToken)) {
                                tokenizeState = TokenizeState.Data;
                                emitTextIfAvailable(handler);
                                emitTag(handler, tagToken);
                                doElseClause = false;
                            }
                        } else if (Character.isAlphabetic(c)) {
                            tagToken.nameBuilder.append(c /*Character.toLowerCase(c)*/);
                            temporaryBuffer.append((char) c);
                            doElseClause = false;
                        }
                        if (doElseClause) {
                            appendTextNode('<');
                            appendTextNode('/');
                            appendTextNode(temporaryBuffer.toString());
                            pushBack();
                            tokenizeState = TokenizeState.RcData;
                        }
                        break;

                    case RawTextLessThanSign:
                        // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-less-than-sign-state
                        c = getChar();
                        if (c == '/') {
                            temporaryBuffer = new StringBuilder();
                            tokenizeState = TokenizeState.RawTextEndTagOpen;
                        } else {
                            appendTextNode('<');
                            pushBack();
                            tokenizeState = TokenizeState.RawText;
                        }
                        break;


                    case RawTextEndTagOpen:
                        // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-open-state
                        c = getChar();
                        if (Character.isAlphabetic(c)) {
                            tagToken = new TagToken();
                            tagToken.isEndTag = true;
                            pushBack();
                            tokenizeState = TokenizeState.RawTextEndTagName;
                        } else {
                            appendTextNode('<');
                            appendTextNode('/');
                            pushBack();
                            tokenizeState = TokenizeState.RawText;
                        }
                        break;

                    case RawTextEndTagName:
                        // https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-name-state
                        c = getChar();
                        doElseClause = true;
                        if (c == '\t'/*TAB*/ || c == '\n'/*LINEFEED*/ || c == 0x0C/*FORMFEED*/ || c == ' ') {
                            if (isAppropriateEndTagToken(tagToken)) {
                                tokenizeState = TokenizeState.BeforeAttributeName;
                                doElseClause = false;
                            }
                        } else if (c == '/') {
                            if (isAppropriateEndTagToken(tagToken)) {
                                tokenizeState = TokenizeState.SelfClosingStartTag;
                                doElseClause = false;
                            }
                        } else if (c == '>') {
                            if (isAppropriateEndTagToken(tagToken)) {
                                tokenizeState = TokenizeState.Data;
                                emitTextIfAvailable(handler);
                                emitTag(handler, tagToken);
                                doElseClause = false;
                            }
                        } else if (Character.isAlphabetic(c)) {
                            tagToken.nameBuilder.append(c /*Character.toLowerCase(c)*/);
                            temporaryBuffer.append((char) c);
                            doElseClause = false;
                        }
                        if (doElseClause) {
                            appendTextNode('<');
                            appendTextNode('/');
                            appendTextNode(temporaryBuffer.toString());
                            pushBack();
                            tokenizeState = TokenizeState.RawText;
                        }
                        break;

                    case ScriptDataLessThanSign:
                        // https://html.spec.whatwg.org/multipage/parsing.html#script-data-less-than-sign-state
                        c = getChar();
                        if (c == '/') {
                            temporaryBuffer = new StringBuilder();
                            tokenizeState = TokenizeState.ScriptDataEndTagOpen;
                        } else if (c == '!') {
                            temporaryBuffer = new StringBuilder();
                            tokenizeState = TokenizeState.ScriptDataEscapeStart;
                            appendTextNode('<');
                            appendTextNode('!');
                        } else {
                            appendTextNode('<');
                            pushBack();
                            tokenizeState = TokenizeState.ScriptData;
                        }
                        break;

                    case ScriptDataEndTagOpen:
                        // https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-open-state
                        c = getChar();
                        if (Character.isAlphabetic(c)) {
                            tagToken = new TagToken();
                            tagToken.isEndTag = true;
                            pushBack();
                            tokenizeState = TokenizeState.ScriptDataEndTagName;
                        } else {
                            appendTextNode('<');
                            appendTextNode('/');
                            pushBack();
                            tokenizeState = TokenizeState.ScriptData;
                        }
                        break;

                    case ScriptDataEndTagName:
                        // https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-name-state
                        c = getChar();
                        doElseClause = true;
                        if (c == '\t'/*TAB*/ || c == '\n'/*LINEFEED*/ || c == 0x0C/*FORMFEED*/ || c == ' ') {
                            if (isAppropriateEndTagToken(tagToken)) {
                                tokenizeState = TokenizeState.BeforeAttributeName;
                                doElseClause = false;
                            }
                        } else if (c == '/') {
                            if (isAppropriateEndTagToken(tagToken)) {
                                tokenizeState = TokenizeState.SelfClosingStartTag;
                                doElseClause = false;
                            }
                        } else if (c == '>') {
                            if (isAppropriateEndTagToken(tagToken)) {
                                tokenizeState = TokenizeState.Data;
                                emitTextIfAvailable(handler);
                                emitTag(handler, tagToken);
                                doElseClause = false;
                            }
                        } else if (Character.isAlphabetic(c)) {
                            tagToken.nameBuilder.append(c /*Character.toLowerCase(c)*/);
                            temporaryBuffer.append((char) c);
                            doElseClause = false;
                        }
                        if (doElseClause) {
                            appendTextNode('<');
                            appendTextNode('/');
                            appendTextNode(temporaryBuffer.toString());
                            pushBack();
                            tokenizeState = TokenizeState.ScriptData;
                        }
                        break;

                    case ScriptDataEscapeStart:
                        // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-state
                        c = getChar();
                        if (c == '-') {
                            tokenizeState = TokenizeState.ScriptDataEscapedDash;
                            appendTextNode('-');
                        } else {
                            pushBack();
                            tokenizeState = TokenizeState.ScriptData;
                        }
                        break;


                    case ScriptDataEscapeStartDash:
                        // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-dash-state
                        c = getChar();
                        if (c == '-') {
                            tokenizeState = TokenizeState.ScriptDataEscapedDashDash;
                            appendTextNode('-');
                        } else {
                            pushBack();
                            tokenizeState = TokenizeState.ScriptData;
                        }
                        break;

                    case ScriptDataEscaped:
                        // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-state
                        c = getChar();
                        if (c == '-') {
                            tokenizeState = TokenizeState.ScriptDataEscapedDash;
                            appendTextNode('-');
                        } else if (c == '<') {
                            tokenizeState = TokenizeState.ScriptDataEscapedLessThanSign;
                        } else if (c == 0) {
                            handler.reportError("unexpected-null-character", null);
                            appendTextNode(0xFFFD);
                        } else if (c == CHAR_SUB) {
                            handler.reportError("eof-in-script-html-comment-like-text", null);
                            emitTextIfAvailable(handler);
                            emitEof();
                        } else {
                            appendTextNode(c);
                        }
                        break;

                    case ScriptDataEscapedDash:
                        // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-state
                        c = getChar();
                        if (c == '-') {
                            tokenizeState = TokenizeState.ScriptDataEscapedDashDash;
                            appendTextNode('-');
                        } else if (c == '<') {
                            tokenizeState = TokenizeState.ScriptDataEscapedLessThanSign;
                        } else if (c == 0) {
                            handler.reportError("unexpected-null-character", null);
                            tokenizeState = TokenizeState.ScriptDataEscaped;
                            appendTextNode(0xFFFD);
                        } else if (c == CHAR_SUB) {
                            handler.reportError("eof-in-script-html-comment-like-text", null);
                            emitTextIfAvailable(handler);
                            emitEof();
                        } else {
                            tokenizeState = TokenizeState.ScriptDataEscaped;
                            appendTextNode(c);
                        }
                        break;

                    case ScriptDataEscapedDashDash:
                        // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-state
                        c = getChar();
                        if (c == '-') {
                            appendTextNode('-');
                        } else if (c == '<') {
                            tokenizeState = TokenizeState.ScriptDataEscapedLessThanSign;
                        } else if (c == '>') {
                            tokenizeState = TokenizeState.ScriptData;
                            appendTextNode('>');
                        } else if (c == 0) {
                            handler.reportError("unexpected-null-character", null);
                            tokenizeState = TokenizeState.ScriptDataEscaped;
                            appendTextNode(0xFFFD);
                        } else if (c == CHAR_SUB) {
                            handler.reportError("eof-in-script-html-comment-like-text", null);
                            emitTextIfAvailable(handler);
                            emitEof();
                        } else {
                            tokenizeState = TokenizeState.ScriptDataEscaped;
                            appendTextNode(c);
                        }
                        break;

                    case ScriptDataEscapedLessThanSign:
                        // https://html.spec.whatwg.org/multipage/parsing.html#script-data-less-than-sign-state
                        c = getChar();
                        if (c == '/') {
                            temporaryBuffer = new StringBuilder();
                            tokenizeState = TokenizeState.ScriptDataEscapedEndTagOpen;
                        } else if (Character.isAlphabetic(c)) {
                            temporaryBuffer = new StringBuilder();
                            appendTextNode('<');
                            pushBack();
                            tokenizeState = TokenizeState.ScriptDataDoubleEscapeStart;
                        } else {
                            appendTextNode('<');
                            pushBack();
                            tokenizeState = TokenizeState.ScriptDataEscaped;
                        }
                        break;

                    case ScriptDataEscapedEndTagOpen:
                        // https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-open-state
                        c = getChar();
                        if (Character.isAlphabetic(c)) {
                            tagToken = new TagToken();
                            tagToken.isEndTag = true;
                            pushBack();
                            tokenizeState = TokenizeState.ScriptDataEscapedEndTagName;
                        } else {
                            appendTextNode('<');
                            appendTextNode('/');
                            pushBack();
                            tokenizeState = TokenizeState.ScriptDataEscaped;
                        }
                        break;

                    case ScriptDataEscapedEndTagName:
                        // https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-name-state
                        c = getChar();
                        doElseClause = true;
                        if (c == '\t'/*TAB*/ || c == '\n'/*LINEFEED*/ || c == 0x0C/*FORMFEED*/ || c == ' ') {
                            if (isAppropriateEndTagToken(tagToken)) {
                                tokenizeState = TokenizeState.BeforeAttributeName;
                                doElseClause = false;
                            }
                        } else if (c == '/') {
                            if (isAppropriateEndTagToken(tagToken)) {
                                tokenizeState = TokenizeState.SelfClosingStartTag;
                                doElseClause = false;
                            }
                        } else if (c == '>') {
                            if (isAppropriateEndTagToken(tagToken)) {
                                tokenizeState = TokenizeState.Data;
                                emitTextIfAvailable(handler);
                                emitTag(handler, tagToken);
                                doElseClause = false;
                            }
                        } else if (Character.isAlphabetic(c)) {
                            tagToken.nameBuilder.append(c /*Character.toLowerCase(c)*/);
                            temporaryBuffer.append((char) c);
                            doElseClause = false;
                        }
                        if (doElseClause) {
                            appendTextNode('<');
                            appendTextNode('/');
                            appendTextNode(temporaryBuffer.toString());
                            pushBack();
                            tokenizeState = TokenizeState.ScriptDataEscaped;
                        }
                        break;


                    case ScriptDataDoubleEscapeStart:
                        // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-start-state
                        c = getChar();
                        if (c == '\t'/*TAB*/ || c == '\n'/*LINEFEED*/ || c == 0x0C/*FORMFEED*/ || c == ' ' || c == '/' || c == '>') {
                            if (temporaryBuffer.length() >= 6) {
                                String firstSixChars = temporaryBuffer.substring(0, 6);
                                if ("script".equalsIgnoreCase(firstSixChars)) {
                                    tokenizeState = TokenizeState.ScriptDataDoubleEscaped;
                                    appendTextNode(c);
                                } else {
                                    tokenizeState = TokenizeState.ScriptDataEscaped;
                                    appendTextNode(c);
                                }
                            } else {
                                tokenizeState = TokenizeState.ScriptDataEscaped;
                                appendTextNode(c);
                            }
                        } else if (Character.isAlphabetic(c)) {
                            temporaryBuffer.append(c /*Character.toLowerCase(c)*/);
                            appendTextNode(c);
                        } else {
                            pushBack();
                            tokenizeState = TokenizeState.ScriptDataEscaped;
                        }
                        break;

                    case ScriptDataDoubleEscaped:
                        // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-state
                        c = getChar();
                        if (c == '-') {
                            tokenizeState = TokenizeState.ScriptDataDoubleEscapedDash;
                            appendTextNode('-');
                        } else if (c == '<') {
                            tokenizeState = TokenizeState.ScriptDataDoubleEscapedLessThanSign;
                            appendTextNode('<');
                        } else if (c == 0) {
                            handler.reportError("unexpected-null-character", null);
                            appendTextNode(0xFFFD);
                        } else if (c == CHAR_SUB) {
                            handler.reportError("eof-in-script-html-comment-like-text", null);
                            emitTextIfAvailable(handler);
                            emitEof();
                        } else {
                            appendTextNode(c);
                        }
                        break;


                    case ScriptDataDoubleEscapedDash:
                        // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-state
                        c = getChar();
                        if (c == '-') {
                            tokenizeState = TokenizeState.ScriptDataDoubleEscapedDashDash;
                            appendTextNode('-');
                        } else if (c == '<') {
                            tokenizeState = TokenizeState.ScriptDataDoubleEscapedLessThanSign;
                            appendTextNode('>');
                        } else if (c == 0) {
                            handler.reportError("unexpected-null-character", null);
                            tokenizeState = TokenizeState.ScriptDataDoubleEscaped;
                            appendTextNode(0xFFFD);
                        } else if (c == CHAR_SUB) {
                            handler.reportError("eof-in-script-html-comment-like-text", null);
                            emitTextIfAvailable(handler);
                            emitEof();
                        } else {
                            tokenizeState = TokenizeState.ScriptDataDoubleEscaped;
                            appendTextNode(c);
                        }
                        break;

                    case ScriptDataDoubleEscapedDashDash:
                        // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-dash-state
                        c = getChar();
                        if (c == '-') {
                            appendTextNode('-');
                        } else if (c == '<') {
                            tokenizeState = TokenizeState.ScriptDataDoubleEscapedLessThanSign;
                        } else if (c == '>') {
                            tokenizeState = TokenizeState.ScriptData;
                            appendTextNode('>');
                        } else if (c == 0) {
                            handler.reportError("unexpected-null-character", null);
                            tokenizeState = TokenizeState.ScriptDataDoubleEscaped;
                            appendTextNode(0xFFFD);
                        } else if (c == CHAR_SUB) {
                            handler.reportError("eof-in-script-html-comment-like-text", null);
                            emitTextIfAvailable(handler);
                            emitEof();
                        } else {
                            tokenizeState = TokenizeState.ScriptDataDoubleEscaped;
                            appendTextNode(c);
                        }
                        break;

                    case ScriptDataDoubleEscapedLessThanSign:
                        // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-less-than-sign-state
                        c = getChar();
                        if (c == '/') {
                            temporaryBuffer = new StringBuilder();
                            tokenizeState = TokenizeState.ScriptDataDoubleEscapeEnd;
                            appendTextNode('/');
                        } else {
                            pushBack();
                            tokenizeState = TokenizeState.ScriptDataDoubleEscaped;
                        }
                        break;


                    case ScriptDataDoubleEscapeEnd:
                        // https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-start-state
                        c = getChar();
                        if (c == '\t'/*TAB*/ || c == '\n'/*LINEFEED*/ || c == 0x0C/*FORMFEED*/ || c == ' ' || c == '/' || c == '>') {
                            if (temporaryBuffer.length() >= 6) {
                                String firstSixChars = temporaryBuffer.substring(0, 6);
                                if ("script".equalsIgnoreCase(firstSixChars)) {
                                    tokenizeState = TokenizeState.ScriptDataEscaped;
                                    appendTextNode(c);
                                } else {
                                    tokenizeState = TokenizeState.ScriptDataDoubleEscaped;
                                    appendTextNode(c);
                                }
                            } else {
                                tokenizeState = TokenizeState.ScriptDataDoubleEscaped;
                                appendTextNode(c);
                            }
                        } else if (Character.isAlphabetic(c)) {
                            temporaryBuffer.append(c /*Character.toLowerCase(c)*/);
                            appendTextNode(c);
                        } else {
                            pushBack();
                            tokenizeState = TokenizeState.ScriptDataDoubleEscaped;
                        }
                        break;

                    case BeforeAttributeName:
                        // https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state
                        c = getChar();
                        if (c == '\t'/*TAB*/ || c == '\n'/*LINEFEED*/ || c == 0x0C/*FORMFEED*/ || c == ' ') {
                            // Ignore these.
                        } else if (c == '/' || c == '>' || c == CHAR_SUB) {
                            pushBack();
                            attrNameBuilder = new StringBuilder();
                            attrValueBuilder = new StringBuilder();
                            tokenizeState = TokenizeState.AfterAttributeName;
                        } else if (c == '=') {
                            // https://html.spec.whatwg.org/multipage/parsing.html#parse-error-unexpected-equals-sign-before-attribute-name
                            handler.reportError("unexpected-equals-sign-before-attribute-name", null);
                            attrNameBuilder = new StringBuilder();
                            attrValueBuilder = new StringBuilder();
                            tokenizeState = TokenizeState.AttributeName;
                            attrNameBuilder.append("=");
                        } else {
                            pushBack();
                            attrNameBuilder = new StringBuilder();
                            attrValueBuilder = new StringBuilder();
                            tokenizeState = TokenizeState.AttributeName;
                        }
                        break;

                    case AttributeName:
                        // https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state
                        c = getChar();
                        if (c == '\t'/*TAB*/ || c == '\n'/*LINEFEED*/ || c == 0x0C/*FORMFEED*/ || c == ' '
                            || c == '/' || c == '>' || c == CHAR_SUB) {
                            pushBack();
                            tokenizeState = TokenizeState.AfterAttributeName;
                        } else if (c == '=') {
                            tokenizeState = TokenizeState.BeforeAttributeValue;
                        } else if (c == 0) {
                            handler.reportError("unexpected-null-character", null);
                            attrNameBuilder.append((char) 0xFFFD);
                        } else if (c == '"' || c == '\'' || c == '<') {
                            handler.reportError("unexpected-null-character", null);
                            attrNameBuilder.append((char) c);
                        } else {
                            attrNameBuilder.append(c);
                        }

                        if (tokenizeState != TokenizeState.AttributeName) {
                            // check duplication
                            if (containSameAttributeName(attrNameBuilder.toString())) {
                                handler.reportError("duplicate-attribute", null);
                                attrNameBuilder = new StringBuilder(); // reject this attribute 
                            }
                        }
                        break;
                    case AfterAttributeName:
                        // https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-name-state
                        c = getChar();
                        if (c == '\t'/*TAB*/ || c == '\n'/*LINEFEED*/ || c == 0x0C/*FORMFEED*/ || c == ' ') {
                            // Ignore these.
                        } else if (c == '/') {
                            emitAttribute(null, attrNameBuilder.toString(), attrValueBuilder.toString());
                            tokenizeState = TokenizeState.SelfClosingStartTag;
                        } else if (c == '=') {
                            tokenizeState = TokenizeState.BeforeAttributeValue;
                        } else if (c == '>') {
                            emitAttribute(null, attrNameBuilder.toString(), attrValueBuilder.toString());
                            emitTextIfAvailable(handler);
                            tokenizeState = TokenizeState.Data;
                            emitTag(handler, tagToken);
                        } else if (c == CHAR_SUB) {
                            handler.reportError("eof-in-tag", null);
                            emitAttribute(null, attrNameBuilder.toString(), attrValueBuilder.toString());
                            emitTextIfAvailable(handler);
                            emitEof();
                        } else {
                            emitTextIfAvailable(handler);
                            emitAttribute(null, attrNameBuilder.toString(), "");
                            attrNameBuilder = new StringBuilder();
                            attrValueBuilder = new StringBuilder();
                            pushBack();
                            tokenizeState = TokenizeState.AttributeName;
                        }
                        break;

                    case BeforeAttributeValue:
                        // https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-value-state
                        c = getChar();
                        if (c == '\t'/*TAB*/ || c == '\n'/*LINEFEED*/ || c == 0x0C/*FORMFEED*/ || c == ' ') {
                            // Ignore these.
                        } else if (c == '"') {
                            tokenizeState = TokenizeState.AttributeValueDoubleQuoted;
                        } else if (c == '\'') {
                            tokenizeState = TokenizeState.AttributeValueSingleQuoted;
                        } else if (c == '>') {
                            handler.reportError("missing-attribute-value", null);
                            emitTextIfAvailable(handler);
                            tokenizeState = TokenizeState.Data;
                            emitTag(handler, tagToken);
                        } else {
                            pushBack();
                            tokenizeState = TokenizeState.AttributeValueUnquoted;
                        }
                        break;
                    case AttributeValueDoubleQuoted:
                        c = getChar();
                        if (c == '"') {
                            // emitTextIfAvailable(handler);
                            emitAttribute(null, attrNameBuilder.toString(), attrValueBuilder.toString());
                            tokenizeState = TokenizeState.AfterAttributeValueQuoted;
                        } else if (c == 0) {
                            handler.reportError("unexpected-null-character", null);
                            attrValueBuilder.append((char) 0xFFFD);
                        } else if (c == CHAR_SUB) {
                            handler.reportError("eof-in-tag", null);
                            emitTextIfAvailable(handler);
                            emitEof();
                        } else {
                            attrValueBuilder.append(c);
                        }
                        break;
                    case AttributeValueSingleQuoted:
                        c = getChar();
                        if (c == '\'') {
                            emitTextIfAvailable(handler);
                            emitAttribute(null, attrNameBuilder.toString(), attrValueBuilder.toString());
                            tokenizeState = TokenizeState.AfterAttributeValueQuoted;
                        } else if (c == 0) {
                            handler.reportError("unexpected-null-character", null);
                            attrValueBuilder.append((char) 0xFFFD);
                        } else if (c == CHAR_SUB) {
                            handler.reportError("eof-in-tag", null);
                            emitTextIfAvailable(handler);
                            emitEof();
                        } else {
                            attrValueBuilder.append(c);
                        }
                        break;

                    case AttributeValueUnquoted:
                        // https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-unquoted-state
                        c = getChar();
                        if (c == '\t'/*TAB*/ || c == '\n'/*LINEFEED*/ || c == 0x0C/*FORMFEED*/ || c == ' ') {
                            emitTextIfAvailable(handler);
                            emitAttribute(null, attrNameBuilder.toString(), attrValueBuilder.toString());
                            tokenizeState = TokenizeState.BeforeAttributeName;
                        } else if (c == '>') {
                            emitTextIfAvailable(handler);
                            emitAttribute(null, attrNameBuilder.toString(), attrValueBuilder.toString());
                            tokenizeState = TokenizeState.Data;
                            emitTag(handler, tagToken);
                        } else if (c == 0) {
                            handler.reportError("unexpected-null-character", null);
                            attrValueBuilder.append((char) 0xFFFD);
                        } else if (c == '"' || c == '\'' || c == '<' || c == '=' || c == '`') {
                            handler.reportError("unexpected-character-in-unquoted-attribute-value", null);
                            attrValueBuilder.append(c);
                        } else if (c == CHAR_SUB) {
                            handler.reportError("eof-in-tag", null);
                            emitTextIfAvailable(handler);
                            emitEof();
                        } else {
                            attrValueBuilder.append(c);
                        }
                        break;

                    case AfterAttributeValueQuoted:
                        // https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-value-unquoted-state
                        c = getChar();
                        if (c == '\t'/*TAB*/ || c == '\n'/*LINEFEED*/ || c == 0x0C/*FORMFEED*/ || c == ' ') {
                            tokenizeState = TokenizeState.BeforeAttributeName;
                        } else if (c == '/') {
                            tokenizeState = TokenizeState.SelfClosingStartTag;
                        } else if (c == '>') {
                            emitTextIfAvailable(handler);
                            emitAttribute(null, attrNameBuilder.toString(), attrValueBuilder.toString());
                            tokenizeState = TokenizeState.Data;
                            emitTag(handler, tagToken);
                        } else if (c == CHAR_SUB) {
                            handler.reportError("eof-in-tag", null);
                            emitTextIfAvailable(handler);
                            emitEof();
                        } else {
                            handler.reportError("missing-whitespace-between-attributes", null);
                            pushBack();
                            tokenizeState = TokenizeState.BeforeAttributeName;
                        }
                        break;

                    case SelfClosingStartTag:
                        // https://html.spec.whatwg.org/multipage/parsing.html#self-closing-start-tag-state
                        emitTextIfAvailable(handler);
                        c = getChar();
                        if (c == '>') {
                            emitSelfClosingTag(handler, tagToken);
                            tokenizeState = TokenizeState.Data;
                        } else if (c == CHAR_SUB) {
                            handler.reportError("eof-in-tag", null);
                            emitEof();
                        } else {
                            handler.reportError("unexpected-solidus-in-tag", null);
                            pushBack();
                            tokenizeState = TokenizeState.BeforeAttributeName;
                        }
                        break;

                    case BogusComment:
                        // https://html.spec.whatwg.org/multipage/parsing.html#self-closing-start-tag-state
                        c = getChar();
                        if (c == '>') {
                            emitTextIfAvailable(handler);
                            emitComment(handler, commentBuilder.toString(), mayXmlDeclAsBogusComment);
                            mayXmlDeclAsBogusComment = false;
                            tokenizeState = TokenizeState.Data;
                        } else if (c == CHAR_SUB) {
                            emitTextIfAvailable(handler);
                            emitComment(handler, commentBuilder.toString(), mayXmlDeclAsBogusComment);
                            mayXmlDeclAsBogusComment = false;
                            handler.reportError("eof-in-comment", null);
                            emitTextIfAvailable(handler);
                            emitEof();
                        } else if (c == 0) {
                            handler.reportError("unexpected-null-character", null);
                            commentBuilder.append((char) 0xFFFD);
                        } else {
                            commentBuilder.append((char) c);
                        }
                        break;

                    case MarkupDeclarationOpen:
                        // https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
                        if (skipString("--")) {
                            commentBuilder = new StringBuilder();
                            tokenizeState = TokenizeState.CommentStart;
                        } else if (skipStringIgnoreCase("DOCTYPE")) {
                            tokenizeState = TokenizeState.Doctype;
                        } else if (skipString("[CDATA[")) {
                            handler.reportError("cdata-in-html-content", null);
                            commentBuilder = new StringBuilder();
                            commentBuilder.append("[CDATA[");
                            tokenizeState = TokenizeState.BogusComment;
                        } else {
                            handler.reportError("incorrectly-opened-comment", null);
                            tokenizeState = TokenizeState.BogusComment;
                        }
                        break;
                    case CommentStart:
                        // https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state
                        c = getChar();
                        if (c == '-') {
                            tokenizeState = TokenizeState.CommentStartDash;
                        } else if (c == '>') {
                            handler.reportError("abrupt-closing-of-empty-comment", null);
                            emitTextIfAvailable(handler);
                            emitComment(handler, commentBuilder.toString(), mayXmlDeclAsBogusComment);
                            mayXmlDeclAsBogusComment = false;
                            tokenizeState = TokenizeState.Data;
                        } else {
                            tokenizeState = TokenizeState.Comment;
                            pushBack();
                        }
                        break;
                    case CommentStartDash:
                        // https://html.spec.whatwg.org/multipage/parsing.html#comment-start-dash-state
                        c = getChar();
                        if (c == '-') {
                            tokenizeState = TokenizeState.CommentEnd;
                        } else if (c == '>') {
                            handler.reportError("abrupt-closing-of-empty-comment", null);
                            emitTextIfAvailable(handler);
                            emitComment(handler, commentBuilder.toString(), false);
                            tokenizeState = TokenizeState.Data;
                        } else if (c == CHAR_SUB) {
                            emitTextIfAvailable(handler);
                            emitComment(handler, commentBuilder.toString(), false);
                            handler.reportError("eof-in-comment", null);
                            emitTextIfAvailable(handler);
                            emitEof();
                        } else {
                            commentBuilder.append((char) '-');
                            tokenizeState = TokenizeState.Comment;
                            pushBack();
                            commentBuilder.append((char) c);
                        }
                        break;


                    case Comment:
                        // https://html.spec.whatwg.org/multipage/parsing.html
                        c = getChar();
                        if (c == '<') {
                            commentBuilder.append((char) '<');
                            tokenizeState = TokenizeState.CommentLessThanSign;
                        } else if (c == '-') {
                            tokenizeState = TokenizeState.CommentEndDash;
                        } else if (c == 0) {
                            handler.reportError("unexpected-null-character", null);
                            commentBuilder.append((char) 0xFFFD);
                        } else if (c == CHAR_SUB) {
                            emitTextIfAvailable(handler);
                            emitComment(handler, commentBuilder.toString(), false);
                            handler.reportError("eof-in-comment", null);
                            emitTextIfAvailable(handler);
                            emitEof();
                        } else {
                            commentBuilder.append((char) c);
                        }
                        break;

                    case CommentLessThanSign:
                        // https://html.spec.whatwg.org/multipage/parsing.html
                        c = getChar();
                        if (c == '!') {
                            commentBuilder.append((char) '!');
                            tokenizeState = TokenizeState.CommentLessThanSignBang;
                        } else if (c == '<') {
                            commentBuilder.append((char) c);
                        } else {
                            pushBack();
                            tokenizeState = TokenizeState.Comment;
                        }
                        break;

                    case CommentLessThanSignBang:
                        // https://html.spec.whatwg.org/multipage/parsing.html
                        c = getChar();
                        if (c == '-') {
                            tokenizeState = TokenizeState.CommentLessThanSignBangDash;
                        } else {
                            pushBack();
                            tokenizeState = TokenizeState.Comment;
                        }
                        break;
                    
                    case CommentLessThanSignBangDash:
                        // https://html.spec.whatwg.org/multipage/parsing.html
                        c = getChar();
                        if (c == '-') {
                            tokenizeState = TokenizeState.CommentLessThanSignBangDashDash;
                        } else {
                            pushBack();
                            tokenizeState = TokenizeState.CommentEndDash;
                        }
                        break;

                    case CommentLessThanSignBangDashDash:
                        // https://html.spec.whatwg.org/multipage/parsing.html
                        c = getChar();
                        if (c == '>' || c == CHAR_SUB) {
                            pushBack();
                            tokenizeState = TokenizeState.CommentEnd;
                        } else {
                            handler.reportError("nested-comment", null);
                            pushBack();
                            tokenizeState = TokenizeState.CommentEnd;
                        }
                        break;

                    case CommentEndDash:
                        // https://html.spec.whatwg.org/multipage/parsing.html
                        c = getChar();
                        if (c == '-') {
                            tokenizeState = TokenizeState.CommentEnd;
                        } else if (c == CHAR_SUB) {
                            handler.reportError("eof-in-comment", null);
                            emitTextIfAvailable(handler);
                            emitComment(handler, commentBuilder.toString(), false);
                            emitTextIfAvailable(handler);
                            emitEof();
                        } else {
                            commentBuilder.append((char) '-');
                            pushBack();
                            tokenizeState = TokenizeState.Comment;
                        }
                        break;

                    case CommentEnd:
                        // https://html.spec.whatwg.org/multipage/parsing.html
                        c = getChar();
                        if (c == '>') {
                            emitTextIfAvailable(handler);
                            emitComment(handler, commentBuilder.toString(), false);
                            tokenizeState = TokenizeState.Data;
                        } else if (c == '!') {
                            tokenizeState = TokenizeState.CommentEndBang;
                        } else if (c == '-') {
                            commentBuilder.append('-');
                        } else if (c == CHAR_SUB) {
                            emitTextIfAvailable(handler);
                            emitComment(handler, commentBuilder.toString(), false);
                            handler.reportError("eof-in-comment", null);
                            emitTextIfAvailable(handler);
                            emitEof();
                        } else {
                            commentBuilder.append((char) '-');
                            commentBuilder.append((char) '-');
                            pushBack();
                            tokenizeState = TokenizeState.Comment;
                        }
                        break;

                    case CommentEndBang:
                        // https://html.spec.whatwg.org/multipage/parsing.html
                        c = getChar();
                        if (c == '-') {
                            commentBuilder.append('-');
                            commentBuilder.append('-');
                            commentBuilder.append('!');
                            tokenizeState = TokenizeState.CommentEndDash;
                        } else if (c == '>') {
                            emitTextIfAvailable(handler);
                            emitComment(handler, commentBuilder.toString(), false);
                            handler.reportError("incorrectly-closed-comment", null);
                            tokenizeState = TokenizeState.Data;
                        } else if (c == CHAR_SUB) {
                            emitTextIfAvailable(handler);
                            emitComment(handler, commentBuilder.toString(), false);
                            handler.reportError("eof-in-comment", null);
                            emitTextIfAvailable(handler);
                            emitEof();
                        } else {
                            commentBuilder.append((char) '-');
                            commentBuilder.append((char) '-');
                            commentBuilder.append((char) '!');
                            pushBack();
                            tokenizeState = TokenizeState.Comment;
                        }
                        break;

                    case Doctype:
                        // https://html.spec.whatwg.org/multipage/parsing.html#doctype-state
                        c = getChar();
                        if (c == '\t'/*TAB*/ || c == '\n'/*LINEFEED*/ || c == 0x0C/*FORMFEED*/ || c == ' ') {
                            tokenizeState = TokenizeState.BeforeDoctypeName;
                        } else if (c == '>') {
                            pushBack();
                            tokenizeState = TokenizeState.BeforeDoctypeName;
                        } else if (c == CHAR_SUB) {
                            handler.reportError("eof-in-doctype", null);
                            doctype.forceQuirkFlag = true;
                            emitTextIfAvailable(handler);
                            emitDoctype(handler, doctype);
                            emitTextIfAvailable(handler);
                            emitEof();
                        } else {
                            handler.reportError("missing-whitespace-before-doctype-name", null);
                            pushBack();
                            tokenizeState = TokenizeState.BeforeDoctypeName;
                        }
                        break;

                    case BeforeDoctypeName:
                        // https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-name-state
                        c = getChar();
                        if (c == '\t'/*TAB*/ || c == '\n'/*LINEFEED*/ || c == 0x0C/*FORMFEED*/ || c == ' ') {
                            // Ignore these
                        } else if (Character.isAlphabetic(c)) {
                            doctype.nameBuilder = new StringBuilder();
                            doctype.nameBuilder.append(c /*Character.toLowerCase(c)*/);
                            tokenizeState = TokenizeState.DoctypeName;
                        } else if (c == 0) {
                            handler.reportError("unexpected-null-character", null);
                            doctype.nameBuilder.append((char) 0xFFFD);
                            tokenizeState = TokenizeState.DoctypeName;
                        } else if (c == '>' ) {
                            handler.reportError("missing-doctype-name", null);
                            doctype.forceQuirkFlag = true;
                            emitTextIfAvailable(handler);
                            emitDoctype(handler, doctype);
                            tokenizeState = TokenizeState.Data;
                        } else if (c == CHAR_SUB) {
                            handler.reportError("eof-in-doctype", null);
                            doctype.forceQuirkFlag = true;
                            emitTextIfAvailable(handler);
                            emitDoctype(handler, doctype);
                            emitTextIfAvailable(handler);
                            emitEof();
                        } else {
                            doctype.nameBuilder = new StringBuilder();
                            doctype.nameBuilder.append(c);
                            tokenizeState = TokenizeState.DoctypeName;
                        }
                        break;

                    case DoctypeName:
                        // https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-name-state
                        c = getChar();
                        if (c == '\t'/*TAB*/ || c == '\n'/*LINEFEED*/ || c == 0x0C/*FORMFEED*/ || c == ' ') {
                            tokenizeState = TokenizeState.AfterDoctypeName;
                        } else if (Character.isAlphabetic(c)) {
                            doctype.nameBuilder.append(c /*Character.toLowerCase(c)*/);
                        } else if (c == 0) {
                            handler.reportError("unexpected-null-character", null);
                            doctype.nameBuilder.append((char) 0xFFFD);
                        } else if (c == '>' ) {
                            emitTextIfAvailable(handler);
                            emitDoctype(handler, doctype);
                            tokenizeState = TokenizeState.Data;
                        } else if (c == CHAR_SUB) {
                            handler.reportError("eof-in-doctype", null);
                            emitTextIfAvailable(handler);
                            emitDoctype(handler, doctype);
                            emitEof();
                        } else {
                            doctype.nameBuilder.append(c);
                        }
                        break;

                    case AfterDoctypeName:
                        // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-name-state
                        c = getChar();
                        if (c == '\t'/*TAB*/ || c == '\n'/*LINEFEED*/ || c == 0x0C/*FORMFEED*/ || c == ' ') {
                            // Ignore these
                        } else if (c == '>' ) {
                            emitTextIfAvailable(handler);
                            emitDoctype(handler, doctype);
                            tokenizeState = TokenizeState.Data;
                        } else if (c == CHAR_SUB) {
                            handler.reportError("eof-in-doctype", null);
                            doctype.forceQuirkFlag = true;
                            emitTextIfAvailable(handler);
                            emitDoctype(handler, doctype);
                            emitEof();
                        } else {
                            if ((c == 'p' || c == 'P') && skipStringIgnoreCase("UBLIC")) {
                                tokenizeState = TokenizeState.AfterDoctypePublicKeyword;
                            } else if ((c == 's' || c == 'S') && skipStringIgnoreCase("YSTEM")) {
                                tokenizeState = TokenizeState.AfterDoctypeSystemKeyword;
                            } else {
                                handler.reportError("invalid-character-sequence-after-doctype-name", null);
                                pushBack();
                                doctype.forceQuirkFlag = true;
                                tokenizeState = TokenizeState.BogusDoctype;
                            }
                        }
                        break;

                    case AfterDoctypePublicKeyword:
                        // https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-name-state
                        c = getChar();
                        if (c == '\t'/*TAB*/ || c == '\n'/*LINEFEED*/ || c == 0x0C/*FORMFEED*/ || c == ' ') {
                            tokenizeState = TokenizeState.BeforeDoctypePublicIdentifier;
                        } else if (c == '"') {
                            handler.reportError("missing-whitespace-after-doctype-public-keyword", null);
                            doctype.publicIdBuilder = new StringBuilder();
                            tokenizeState = TokenizeState.DoctypePublicIdentifierDoubleQuoted;
                        } else if (c == '\'') {
                            handler.reportError("missing-whitespace-after-doctype-public-keyword", null);
                            doctype.publicIdBuilder = new StringBuilder();
                            tokenizeState = TokenizeState.DoctypePublicIdentifierSingleQuoted;
                        } else if (c == '>') {
                            handler.reportError("missing-doctype-public-identifier", null);
                            doctype.forceQuirkFlag = true;
                            emitTextIfAvailable(handler);
                            emitDoctype(handler, doctype);
                            tokenizeState = TokenizeState.Data;
                        } else if (c == CHAR_SUB) {
                            handler.reportError("eof-in-doctype", null);
                            doctype.forceQuirkFlag = true;
                            emitTextIfAvailable(handler);
                            emitDoctype(handler, doctype);
                            emitEof();
                        } else {
                            handler.reportError("missing-quote-before-doctype-public-identifier", null);
                            doctype.forceQuirkFlag = true;
                            pushBack();
                            tokenizeState = TokenizeState.BogusDoctype;
                        }
                        break;

                    case BeforeDoctypePublicIdentifier:
                        // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-name-state
                        c = getChar();
                        if (c == '\t'/*TAB*/ || c == '\n'/*LINEFEED*/ || c == 0x0C/*FORMFEED*/ || c == ' ') {
                            // Ignore these
                        } else if (c == '"') {
                            doctype.publicIdBuilder = new StringBuilder();
                            tokenizeState = TokenizeState.DoctypePublicIdentifierDoubleQuoted;
                        } else if (c == '\'') {
                            doctype.publicIdBuilder = new StringBuilder();
                            tokenizeState = TokenizeState.DoctypePublicIdentifierSingleQuoted;
                        } else if (c == '>') {
                            handler.reportError("missing-doctype-public-identifier", null);
                            doctype.forceQuirkFlag = true;
                            emitTextIfAvailable(handler);
                            emitDoctype(handler, doctype);
                            tokenizeState = TokenizeState.Data;
                        } else if (c == CHAR_SUB) {
                            handler.reportError("eof-in-doctype", null);
                            doctype.forceQuirkFlag = true;
                            emitTextIfAvailable(handler);
                            emitDoctype(handler, doctype);
                            emitEof();
                        } else {
                            handler.reportError("missing-quote-before-doctype-public-identifier", null);
                            doctype.forceQuirkFlag = true;
                            pushBack();
                            tokenizeState = TokenizeState.BogusDoctype;
                        }
                        break;

                    case DoctypePublicIdentifierDoubleQuoted:
                        // https://html.spec.whatwg.org/multipage/parsing.html#doctype-public-identifier-(double-quoted)-state
                        c = getChar();
                        if (c == '"') {
                            tokenizeState = TokenizeState.AfterDoctypePublicIdentifier;
                        } else if (c == 0) {
                            handler.reportError("unexpected-null-character", null);
                            doctype.publicIdBuilder.append((char) 0xFFFD);
                        } else if (c == '>') {
                            handler.reportError("abrupt-doctype-public-identifier", null);
                            doctype.forceQuirkFlag = true;
                            emitTextIfAvailable(handler);
                            emitDoctype(handler, doctype);
                            tokenizeState = TokenizeState.Data;
                        } else if (c == CHAR_SUB) {
                            handler.reportError("eof-in-doctype", null);
                            doctype.forceQuirkFlag = true;
                            emitTextIfAvailable(handler);
                            emitDoctype(handler, doctype);
                            emitEof();
                        } else {
                            doctype.publicIdBuilder.append((char) c);
                        }
                        break;

                    case DoctypePublicIdentifierSingleQuoted:
                        // https://html.spec.whatwg.org/multipage/parsing.html#doctype-public-identifier-(single-quoted)-state
                        c = getChar();
                        if (c == '\'') {
                            tokenizeState = TokenizeState.AfterDoctypePublicIdentifier;
                        } else if (c == 0) {
                            handler.reportError("unexpected-null-character", null);
                            doctype.publicIdBuilder.append((char) 0xFFFD);
                        } else if (c == '>') {
                            handler.reportError("abrupt-doctype-public-identifier", null);
                            doctype.forceQuirkFlag = true;
                            emitTextIfAvailable(handler);
                            emitDoctype(handler, doctype);
                            tokenizeState = TokenizeState.Data;
                        } else if (c == CHAR_SUB) {
                            handler.reportError("eof-in-doctype", null);
                            doctype.forceQuirkFlag = true;
                            emitTextIfAvailable(handler);
                            emitDoctype(handler, doctype);
                            emitEof();
                        } else {
                            doctype.publicIdBuilder.append((char) c);
                        }
                        break;

                    case AfterDoctypePublicIdentifier:
                        // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-public-identifier-state
                        c = getChar();
                        if (c == '\t'/*TAB*/ || c == '\n'/*LINEFEED*/ || c == 0x0C/*FORMFEED*/ || c == ' ') {
                            tokenizeState = TokenizeState.BetweenDoctypePublicAndSystemIdentifiers;
                        } else if (c == '>') {
                            emitTextIfAvailable(handler);
                            emitDoctype(handler, doctype);
                            tokenizeState = TokenizeState.Data;
                        } else if (c == '"') {
                            handler.reportError("missing-whitespace-between-doctype-public-and-system-identifiers", null);
                            doctype.systemIdBuilder = new StringBuilder();
                            tokenizeState = TokenizeState.DoctypeSystemIdentifierDoubleQuoted;
                        } else if (c == '\'') {
                            handler.reportError("missing-whitespace-between-doctype-public-and-system-identifiers", null);
                            doctype.systemIdBuilder = new StringBuilder();
                            tokenizeState = TokenizeState.DoctypeSystemIdentifierSingleQuoted;
                        } else if (c == CHAR_SUB) {
                            handler.reportError("eof-in-doctype", null);
                            doctype.forceQuirkFlag = true;
                            emitTextIfAvailable(handler);
                            emitDoctype(handler, doctype);
                            emitEof();
                        } else {
                            handler.reportError("missing-quote-before-doctype-system-identifier", null);
                            doctype.forceQuirkFlag = true;
                            pushBack();
                            tokenizeState = TokenizeState.BogusDoctype;
                        }
                        break;

                    case BetweenDoctypePublicAndSystemIdentifiers:
                        // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-public-identifier-state
                        c = getChar();
                        if (c == '\t'/*TAB*/ || c == '\n'/*LINEFEED*/ || c == 0x0C/*FORMFEED*/ || c == ' ') {
                            // ignore these
                        } else if (c == '>') {
                            emitTextIfAvailable(handler);
                            emitDoctype(handler, doctype);
                            tokenizeState = TokenizeState.Data;
                        } else if (c == '"') {
                            doctype.systemIdBuilder = new StringBuilder();
                            tokenizeState = TokenizeState.DoctypeSystemIdentifierDoubleQuoted;
                        } else if (c == '\'') {
                            doctype.systemIdBuilder = new StringBuilder();
                            tokenizeState = TokenizeState.DoctypeSystemIdentifierSingleQuoted;
                        } else if (c == CHAR_SUB) {
                            handler.reportError("eof-in-doctype", null);
                            doctype.forceQuirkFlag = true;
                            emitTextIfAvailable(handler);
                            emitDoctype(handler, doctype);
                            emitEof();
                        } else {
                            handler.reportError("missing-quote-before-doctype-system-identifier", null);
                            doctype.forceQuirkFlag = true;
                            pushBack();
                            tokenizeState = TokenizeState.BogusDoctype;
                        }
                        break;

                    case AfterDoctypeSystemKeyword:
                        // https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-name-state
                        c = getChar();
                        if (c == '\t'/*TAB*/ || c == '\n'/*LINEFEED*/ || c == 0x0C/*FORMFEED*/ || c == ' ') {
                            tokenizeState = TokenizeState.BeforeDoctypePublicIdentifier;
                        } else if (c == '"') {
                            handler.reportError("missing-whitespace-after-doctype-system-keyword", null);
                            doctype.publicIdBuilder = new StringBuilder();
                            tokenizeState = TokenizeState.DoctypeSystemIdentifierDoubleQuoted;
                        } else if (c == '\'') {
                            handler.reportError("missing-whitespace-after-doctype-system-keyword", null);
                            doctype.publicIdBuilder = new StringBuilder();
                            tokenizeState = TokenizeState.DoctypeSystemIdentifierSingleQuoted;
                        } else if (c == '>') {
                            handler.reportError("missing-doctype-system-identifier", null);
                            doctype.forceQuirkFlag = true;
                            emitTextIfAvailable(handler);
                            emitDoctype(handler, doctype);
                            tokenizeState = TokenizeState.Data;
                        } else if (c == CHAR_SUB) {
                            handler.reportError("eof-in-doctype", null);
                            doctype.forceQuirkFlag = true;
                            emitTextIfAvailable(handler);
                            emitDoctype(handler, doctype);
                            emitEof();
                        } else {
                            handler.reportError("missing-quote-before-doctype-system-identifier", null);
                            doctype.forceQuirkFlag = true;
                            pushBack();
                            tokenizeState = TokenizeState.BogusDoctype;
                        }
                        break;

                    case BeforeDoctypeSystemIdentifier:
                        // https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-system-identifier-state
                        c = getChar();
                        if (c == '\t'/*TAB*/ || c == '\n'/*LINEFEED*/ || c == 0x0C/*FORMFEED*/ || c == ' ') {
                            // Ignore these
                        } else if (c == '"') {
                            doctype.systemIdBuilder = new StringBuilder();
                            tokenizeState = TokenizeState.DoctypeSystemIdentifierDoubleQuoted;
                        } else if (c == '\'') {
                            doctype.systemIdBuilder = new StringBuilder();
                            tokenizeState = TokenizeState.DoctypeSystemIdentifierSingleQuoted;
                        } else if (c == '>') {
                            handler.reportError("missing-doctype-system-identifier", null);
                            doctype.forceQuirkFlag = true;
                            emitTextIfAvailable(handler);
                            emitDoctype(handler, doctype);
                            tokenizeState = TokenizeState.Data;
                        } else if (c == CHAR_SUB) {
                            handler.reportError("eof-in-doctype", null);
                            doctype.forceQuirkFlag = true;
                            emitTextIfAvailable(handler);
                            emitDoctype(handler, doctype);
                            emitEof();
                        } else {
                            handler.reportError("missing-quote-before-doctype-system-identifier", null);
                            doctype.forceQuirkFlag = true;
                            pushBack();
                            tokenizeState = TokenizeState.BogusDoctype;
                        }
                        break;


                    case DoctypeSystemIdentifierDoubleQuoted:
                        // https://html.spec.whatwg.org/multipage/parsing.html#doctype-system-identifier-(double-quoted)-state
                        c = getChar();
                        if (c == '"') {
                            tokenizeState = TokenizeState.AfterDoctypeSystemIdentifier;
                        } else if (c == 0) {
                            handler.reportError("unexpected-null-character", null);
                            doctype.systemIdBuilder.append((char) 0xFFFD);
                        } else if (c == '>') {
                            handler.reportError("abrupt-doctype-system-identifier", null);
                            doctype.forceQuirkFlag = true;
                            emitTextIfAvailable(handler);
                            emitDoctype(handler, doctype);
                            tokenizeState = TokenizeState.Data;
                        } else if (c == CHAR_SUB) {
                            handler.reportError("eof-in-doctype", null);
                            doctype.forceQuirkFlag = true;
                            emitTextIfAvailable(handler);
                            emitDoctype(handler, doctype);
                            emitEof();
                        } else {
                            doctype.systemIdBuilder.append((char) c);
                        }
                        break;

                    case DoctypeSystemIdentifierSingleQuoted:
                        // https://html.spec.whatwg.org/multipage/parsing.html#doctype-system-identifier-(single-quoted)-state
                        c = getChar();
                        if (c == '\'') {
                            doctype.systemIdBuilder = new StringBuilder();
                            tokenizeState = TokenizeState.AfterDoctypeSystemIdentifier;
                        } else if (c == 0) {
                            handler.reportError("unexpected-null-character", null);
                            doctype.systemIdBuilder.append((char) 0xFFFD);
                        } else if (c == '>') {
                            handler.reportError("abrupt-doctype-system-identifier", null);
                            doctype.forceQuirkFlag = true;
                            emitTextIfAvailable(handler);
                            emitDoctype(handler, doctype);
                            tokenizeState = TokenizeState.Data;
                        } else if (c == CHAR_SUB) {
                            handler.reportError("eof-in-doctype", null);
                            doctype.forceQuirkFlag = true;
                            emitTextIfAvailable(handler);
                            emitDoctype(handler, doctype);
                            emitEof();
                        } else {
                            doctype.systemIdBuilder.append((char) c);
                        }
                        break;

                    case AfterDoctypeSystemIdentifier:
                        // https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-system-identifier-state
                        c = getChar();
                        if (c == '\t'/*TAB*/ || c == '\n'/*LINEFEED*/ || c == 0x0C/*FORMFEED*/ || c == ' ') {
                            // ignore these
                        } else if (c == '>') {
                            emitTextIfAvailable(handler);
                            emitDoctype(handler, doctype);
                            tokenizeState = TokenizeState.Data;
                        } else if (c == CHAR_SUB) {
                            handler.reportError("eof-in-doctype", null);
                            doctype.forceQuirkFlag = true;
                            emitTextIfAvailable(handler);
                            emitDoctype(handler, doctype);
                            emitEof();
                        } else {
                            handler.reportError("unexpected-character-after-doctype-system-identifier", null);
                            doctype.forceQuirkFlag = false; // (This does not set the current DOCTYPE token's force-quirks flag to on.) 
                            pushBack();
                            tokenizeState = TokenizeState.BogusDoctype;
                        }
                        break;

                    case BogusDoctype:
                        // https://html.spec.whatwg.org/multipage/parsing.html#bogus-doctype-state
                        c = getChar();
                        if (c == '>') {
                            emitTextIfAvailable(handler);
                            emitDoctype(handler, doctype);
                            tokenizeState = TokenizeState.Data;
                        } else if (c == 0) {
                            handler.reportError("unexpected-null-character", null);
                            // ignore this character
                        } else if (c == CHAR_SUB) {
                            emitTextIfAvailable(handler);
                            emitDoctype(handler, doctype);
                            emitEof();
                        } else {
                            // ignore this character
                        }
                        break;

                    case CDataSection:
                        // https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-state
                        c = getChar();
                        if (c == ']') {
                            tokenizeState = TokenizeState.CDataSectionBracket;
                        } else if (c == CHAR_SUB) {
                            handler.reportError("eof-in-cdata", null);
                            emitEof();
                        } else {
                            appendTextNode(c); 
                        }
                        break;

                    case CDataSectionBracket:
                        // https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-bracket-state
                        c = getChar();
                        if (c == ']') {
                            tokenizeState = TokenizeState.CDataSectionEnd;
                        } else {
                            appendTextNode(']'); 
                            pushBack();
                            tokenizeState = TokenizeState.CDataSection;
                        }
                        break;

                    case CDataSectionEnd:
                        // https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-end-state
                        c = getChar();
                        if (c == ']') {
                            appendTextNode(']'); 
                        } else if (c == '>') {
                            tokenizeState = TokenizeState.Data;
                        } else {
                            appendTextNode(']'); 
                            pushBack();
                            tokenizeState = TokenizeState.CDataSection;
                        }
                        break;
                    
                    case CharacterReference:
                        // not supported
                        break;
                    case NamedCharacterReference:
                        // not supported
                        break;
                    case AmbiguousAmpersand:
                        // not supported
                        break;
                    case NumericCharacterReference:
                        // not supported
                        break;
                    case HexadecimalCharacterReferenceStart:
                        // not supported
                        break;
                    case DecimalCharacterReferenceStart:
                        // not supported
                        break;
                    case HexadecimalCharacterReference:
                        // not supported
                        break;
                    case DecimalCharacterReference:
                        // not supported
                        break;
                    case NumericCharacterReferenceEnd:
                        // not supported
                        break;
                }
                handler = handler.getNextHandler();
            } while (c != CHAR_SUB);
        } catch (ScanningInterruptedExeption e) {
            // EOF
            return;
        } catch (IOException e) {
            HtmlStandardScanner.LOG.error("Parser error " + e.getMessage(), e);
            return;
        }
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy