org.htmlunit.cyberneko.HTMLScanner Maven / Gradle / Ivy

Go to download
/*
 * Copyright (c) 2002-2009 Andy Clark, Marc Guillemot
 * Copyright (c) 2017-2024 Ronald Brill
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.htmlunit.cyberneko;

import java.io.BufferedReader;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.util.Locale;

import org.htmlunit.cyberneko.io.PlaybackInputStream;
import org.htmlunit.cyberneko.util.MiniStack;
import org.htmlunit.cyberneko.xerces.util.EncodingMap;
import org.htmlunit.cyberneko.xerces.util.NamespaceSupport;
import org.htmlunit.cyberneko.xerces.util.URI;
import org.htmlunit.cyberneko.xerces.util.XMLAttributesImpl;
import org.htmlunit.cyberneko.xerces.xni.Augmentations;
import org.htmlunit.cyberneko.xerces.xni.NamespaceContext;
import org.htmlunit.cyberneko.xerces.xni.QName;
import org.htmlunit.cyberneko.xerces.xni.XMLAttributes;
import org.htmlunit.cyberneko.xerces.xni.XMLDocumentHandler;
import org.htmlunit.cyberneko.xerces.xni.XMLLocator;
import org.htmlunit.cyberneko.xerces.xni.XMLString;
import org.htmlunit.cyberneko.xerces.xni.XNIException;
import org.htmlunit.cyberneko.xerces.xni.parser.XMLComponentManager;
import org.htmlunit.cyberneko.xerces.xni.parser.XMLConfigurationException;
import org.htmlunit.cyberneko.xerces.xni.parser.XMLDocumentScanner;
import org.htmlunit.cyberneko.xerces.xni.parser.XMLInputSource;

/**
 * A simple HTML scanner. This scanner makes no attempt to balance tags or fix
 * other problems in the source document — it just scans what it can and
 * generates XNI document "events", ignoring errors of all kinds.
 * 
 * This component recognizes the following features:
 * 

 * http://cyberneko.org/html/features/augmentations
 * 
http://cyberneko.org/html/features/report-errors
 * 
http://apache.org/xml/features/scanner/notify-builtin-refs
 * 
http://cyberneko.org/html/features/scanner/notify-builtin-refs
 * 
http://cyberneko.org/html/features/scanner/script/strip-cdata-delims
 * 
http://cyberneko.org/html/features/scanner/script/strip-comment-delims
 * 
http://cyberneko.org/html/features/scanner/style/strip-cdata-delims
 * 
http://cyberneko.org/html/features/scanner/style/strip-comment-delims
 * 
http://cyberneko.org/html/features/scanner/ignore-specified-charset
 * 
http://cyberneko.org/html/features/scanner/cdata-sections
 * 
http://cyberneko.org/html/features/override-doctype
 * 
http://cyberneko.org/html/features/insert-doctype
 * 
http://cyberneko.org/html/features/parse-noscript-content
 * 
http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe
 * 
http://cyberneko.org/html/features/scanner/allow-selfclosing-tags
 * 
 * 
 * This component recognizes the following properties:
 * 

 * http://cyberneko.org/html/properties/names/elems
 * 
http://cyberneko.org/html/properties/names/attrs
 * 
http://cyberneko.org/html/properties/default-encoding
 * 
http://cyberneko.org/html/properties/error-reporter
 * 
http://cyberneko.org/html/properties/doctype/pubid
 * 
http://cyberneko.org/html/properties/doctype/sysid
 * 
 *
 * @see HTMLElements
 *
 * @author Andy Clark
 * @author Marc Guillemot
 * @author Ahmed Ashour
 * @author Ronald Brill
 * @author René Schwietzke
 */
public class HTMLScanner implements XMLDocumentScanner, XMLLocator, HTMLComponent {

    // doctype info: HTML 4.01 strict

    /** HTML 4.01 strict public identifier ("-//W3C//DTD HTML 4.01//EN"). */
    public static final String HTML_4_01_STRICT_PUBID = "-//W3C//DTD HTML 4.01//EN";

    /**
     * HTML 4.01 strict system identifier ("http://www.w3.org/TR/html4/strict.dtd").
     */
    public static final String HTML_4_01_STRICT_SYSID = "http://www.w3.org/TR/html4/strict.dtd";

    // doctype info: HTML 4.01 loose

    /**
     * HTML 4.01 transitional public identifier ("-//W3C//DTD HTML 4.01
     * Transitional//EN").
     */
    public static final String HTML_4_01_TRANSITIONAL_PUBID = "-//W3C//DTD HTML 4.01 Transitional//EN";

    /**
     * HTML 4.01 transitional system identifier
     * ("http://www.w3.org/TR/html4/loose.dtd").
     */
    public static final String HTML_4_01_TRANSITIONAL_SYSID = "http://www.w3.org/TR/html4/loose.dtd";

    // doctype info: HTML 4.01 frameset

    /**
     * HTML 4.01 frameset public identifier ("-//W3C//DTD HTML 4.01 Frameset//EN").
     */
    public static final String HTML_4_01_FRAMESET_PUBID = "-//W3C//DTD HTML 4.01 Frameset//EN";

    /**
     * HTML 4.01 frameset system identifier
     * ("http://www.w3.org/TR/html4/frameset.dtd").
     */
    public static final String HTML_4_01_FRAMESET_SYSID = "http://www.w3.org/TR/html4/frameset.dtd";

    // features

    /** Include infoset augmentations. */
    protected static final String AUGMENTATIONS = "http://cyberneko.org/html/features/augmentations";

    /** Report errors. */
    protected static final String REPORT_ERRORS = "http://cyberneko.org/html/features/report-errors";

    /**
     * Strip HTML comment delimiters ("<!−−" and
     * "−−>") from SCRIPT tag contents.
     */
    public static final String SCRIPT_STRIP_COMMENT_DELIMS = "http://cyberneko.org/html/features/scanner/script/strip-comment-delims";

    /**
     * Strip XHTML CDATA delimiters ("<![CDATA[" and "]]>") from SCRIPT tag
     * contents.
     */
    public static final String SCRIPT_STRIP_CDATA_DELIMS = "http://cyberneko.org/html/features/scanner/script/strip-cdata-delims";

    /**
     * Strip HTML comment delimiters ("<!−−" and
     * "−−>") from STYLE tag contents.
     */
    public static final String STYLE_STRIP_COMMENT_DELIMS = "http://cyberneko.org/html/features/scanner/style/strip-comment-delims";

    /**
     * Strip XHTML CDATA delimiters ("<![CDATA[" and "]]>") from STYLE tag
     * contents.
     */
    public static final String STYLE_STRIP_CDATA_DELIMS = "http://cyberneko.org/html/features/scanner/style/strip-cdata-delims";

    /**
     * Ignore specified charset found in the <meta equiv='Content-Type'
     * content='text/html;charset=…'> tag or in the <?xml …
     * encoding='…'> processing instruction
     */
    public static final String IGNORE_SPECIFIED_CHARSET = "http://cyberneko.org/html/features/scanner/ignore-specified-charset";

    /** Scan CDATA sections. */
    public static final String CDATA_SECTIONS = "http://cyberneko.org/html/features/scanner/cdata-sections";

    /** Override doctype declaration public and system identifiers. */
    public static final String OVERRIDE_DOCTYPE = "http://cyberneko.org/html/features/override-doctype";

    /** Insert document type declaration. */
    public static final String INSERT_DOCTYPE = "http://cyberneko.org/html/features/insert-doctype";

    /** Parse <noscript>...</noscript> content */
    public static final String PARSE_NOSCRIPT_CONTENT = "http://cyberneko.org/html/features/parse-noscript-content";

    /** Allows self closing <iframe/> tag */
    public static final String ALLOW_SELFCLOSING_IFRAME = "http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe";

    /** Allows self closing tags e.g. <div/> (XHTML) */
    public static final String ALLOW_SELFCLOSING_TAGS = "http://cyberneko.org/html/features/scanner/allow-selfclosing-tags";

    /** Normalize attribute values. */
    protected static final String NORMALIZE_ATTRIBUTES = "http://cyberneko.org/html/features/scanner/normalize-attrs";

    /** Recognized features. */
    private static final String[] RECOGNIZED_FEATURES = {
        AUGMENTATIONS,
        REPORT_ERRORS,
        SCRIPT_STRIP_CDATA_DELIMS,
        SCRIPT_STRIP_COMMENT_DELIMS,
        STYLE_STRIP_CDATA_DELIMS,
        STYLE_STRIP_COMMENT_DELIMS,
        IGNORE_SPECIFIED_CHARSET,
        CDATA_SECTIONS,
        OVERRIDE_DOCTYPE,
        INSERT_DOCTYPE,
        NORMALIZE_ATTRIBUTES,
        PARSE_NOSCRIPT_CONTENT,
        ALLOW_SELFCLOSING_IFRAME,
        ALLOW_SELFCLOSING_TAGS, };

    /** Recognized features defaults. */
    private static final Boolean[] RECOGNIZED_FEATURES_DEFAULTS = {
        null,
        null,
        Boolean.FALSE,
        Boolean.FALSE,
        Boolean.FALSE,
        Boolean.FALSE,
        Boolean.FALSE,
        Boolean.FALSE,
        Boolean.FALSE,
        Boolean.FALSE,
        Boolean.FALSE,
        Boolean.TRUE,
        Boolean.FALSE,
        Boolean.FALSE, };

    // properties

    /** Modify HTML element names: { "upper", "lower", "default" }. */
    protected static final String NAMES_ELEMS = "http://cyberneko.org/html/properties/names/elems";

    /** Modify HTML attribute names: { "upper", "lower", "default" }. */
    protected static final String NAMES_ATTRS = "http://cyberneko.org/html/properties/names/attrs";

    /** Default encoding. */
    protected static final String DEFAULT_ENCODING = "http://cyberneko.org/html/properties/default-encoding";

    /** Error reporter. */
    protected static final String ERROR_REPORTER = "http://cyberneko.org/html/properties/error-reporter";

    /** Doctype declaration public identifier. */
    protected static final String DOCTYPE_PUBID = "http://cyberneko.org/html/properties/doctype/pubid";

    /** Doctype declaration system identifier. */
    protected static final String DOCTYPE_SYSID = "http://cyberneko.org/html/properties/doctype/sysid";

    /** Recognized properties. */
    private static final String[] RECOGNIZED_PROPERTIES = {
        NAMES_ELEMS,
        NAMES_ATTRS,
        DEFAULT_ENCODING,
        ERROR_REPORTER,
        DOCTYPE_PUBID,
        DOCTYPE_SYSID};

    /** Recognized properties defaults. */
    private static final Object[] RECOGNIZED_PROPERTIES_DEFAULTS = {
        null,
        null,
        "Windows-1252",
        null,
        HTML_4_01_TRANSITIONAL_PUBID,
        HTML_4_01_TRANSITIONAL_SYSID};

    // states

    /** State: content. */
    protected static final short STATE_CONTENT = 0;

    /** State: markup bracket. */
    protected static final short STATE_MARKUP_BRACKET = 1;

    /** State: start document. */
    protected static final short STATE_START_DOCUMENT = 10;

    /** State: end document. */
    protected static final short STATE_END_DOCUMENT = 11;

    // modify HTML names

    /** Don't modify HTML names. */
    protected static final short NAMES_NO_CHANGE = 0;

    /** Uppercase HTML names. */
    protected static final short NAMES_UPPERCASE = 1;

    /** Lowercase HTML names. */
    protected static final short NAMES_LOWERCASE = 2;

    // defaults

    /* Default buffer size, 10 cache lines minus overhead
     * A smaller buffer creates less cache misses compared
     * to 2048 bytes or more.
     */
    protected static final int DEFAULT_BUFFER_SIZE = (10 * 64) - 24;

    // debugging

    /** Set to true to debug changes in the scanner. */
    private static final boolean DEBUG_SCANNER = false;

    /** Set to true to debug changes in the scanner state. */
    private static final boolean DEBUG_SCANNER_STATE = false;

    /** Set to true to debug the buffer. */
    private static final boolean DEBUG_BUFFER = false;

    /** Set to true to debug character encoding handling. */
    private static final boolean DEBUG_CHARSET = false;

    /** Set to true to debug callbacks. */
    protected static final boolean DEBUG_CALLBACKS = false;

    // static vars

    /** Synthesized event info item. */
    protected static final HTMLEventInfo SYNTHESIZED_ITEM = new HTMLEventInfo.SynthesizedItem();

    // features

    /** Augmentations. */
    private boolean fAugmentations_;

    /** Report errors. */
    boolean fReportErrors_;

    /** Strip CDATA delimiters from SCRIPT tags. */
    boolean fScriptStripCDATADelims_;

    /** Strip comment delimiters from SCRIPT tags. */
    boolean fScriptStripCommentDelims_;

    /** Strip CDATA delimiters from STYLE tags. */
    boolean fStyleStripCDATADelims_;

    /** Strip comment delimiters from STYLE tags. */
    boolean fStyleStripCommentDelims_;

    /** Ignore specified character set. */
    boolean fIgnoreSpecifiedCharset_;

    /** CDATA sections. */
    boolean fCDATASections_;

    /** Override doctype declaration public and system identifiers. */
    private boolean fOverrideDoctype_;

    /** Insert document type declaration. */
    boolean fInsertDoctype_;

    /** Normalize attribute values. */
    boolean fNormalizeAttributes_;

    /** Parse noscript content. */
    boolean fParseNoScriptContent_;

    /** Allows self closing iframe tags. */
    boolean fAllowSelfclosingIframe_;

    /** Allows self closing tags. */
    boolean fAllowSelfclosingTags_;

    // properties

    /** Modify HTML element names. */
    protected short fNamesElems;

    /** Modify HTML attribute names. */
    protected short fNamesAttrs;

    /** Default encoding. */
    protected String fDefaultIANAEncoding;

    /** Error reporter. */
    protected HTMLErrorReporter fErrorReporter;

    /** Doctype declaration public identifier. */
    protected String fDoctypePubid;

    /** Doctype declaration system identifier. */
    protected String fDoctypeSysid;

    // boundary locator information

    /** Beginning line number. */
    protected int fBeginLineNumber;

    /** Beginning column number. */
    protected int fBeginColumnNumber;

    /** Beginning character offset in the file. */
    protected int fBeginCharacterOffset;

    /** Ending line number. */
    protected int fEndLineNumber;

    /** Ending column number. */
    protected int fEndColumnNumber;

    /** Ending character offset in the file. */
    protected int fEndCharacterOffset;

    // state

    /** The playback byte stream. */
    protected PlaybackInputStream fByteStream;

    /** Current entity. */
    CurrentEntity fCurrentEntity;

    /** The current entity stack. */
    protected final MiniStack fCurrentEntityStack = new MiniStack<>();

    /** The current scanner. */
    protected Scanner fScanner;

    /** The current scanner state. */
    protected short fScannerState;

    /** The document handler. */
    protected XMLDocumentHandler fDocumentHandler;

    /** Auto-detected IANA encoding. */
    protected String fIANAEncoding;

    /** Auto-detected Java encoding. */
    protected String fJavaEncoding;

    /** Element count. */
    protected int fElementCount;

    /** Element depth. */
    protected int fElementDepth;

    // scanners

    /** Content scanner. */
    protected Scanner fContentScanner = new ContentScanner();

    /**
     * Special scanner used for elements whose content needs to be scanned as plain
     * text, ignoring markup such as elements and entity references. For example:
     * <SCRIPT> and <COMMENT>.
     */
    protected final SpecialScanner fSpecialScanner = new SpecialScanner();

    // temp vars

    /** String buffer. */
    protected final XMLString fStringBuffer = new XMLString();

    /** String buffer. */
    final XMLString fStringBuffer2 = new XMLString();

    /** String buffer, larger because scripts areas are larger */
    final XMLString fScanScriptContent = new XMLString(128);

    final XMLString fScanUntilEndTag = new XMLString();

    final XMLString fScanComment = new XMLString();

    private final XMLString fScanLiteral = new XMLString();

    /** Single boolean array. */
    final boolean[] fSingleBoolean = {false};

    final HTMLConfiguration htmlConfiguration_;

    /**
     * Our location item, to be reused because {@link Augmentations}
     * says so, so let's save on memory
     */
    private final LocationItem fLocationItem = new LocationItem();

    /**
     * Creates a new HTMLScanner with the given configuration
     *
     * @param htmlConfiguration the configuration to use
     */
    HTMLScanner(final HTMLConfiguration htmlConfiguration) {
        this.htmlConfiguration_ = htmlConfiguration;
    }

    /**
     * Pushes an input source onto the current entity stack. This enables the
     * scanner to transparently scan new content (e.g. the output written by an
     * embedded script). At the end of the current entity, the scanner returns where
     * it left off at the time this entity source was pushed.
     * 
     * Note: This functionality is experimental at this time and is
     * subject to change in future releases of NekoHTML.
     *
     * @param inputSource The new input source to start scanning.
     * @see #evaluateInputSource(XMLInputSource)
     */
    public void pushInputSource(final XMLInputSource inputSource) {
        final Reader reader = getReader(inputSource);

        fCurrentEntityStack.push(fCurrentEntity);
        final String encoding = inputSource.getEncoding();
        final String publicId = inputSource.getPublicId();
        final String baseSystemId = inputSource.getBaseSystemId();
        final String literalSystemId = inputSource.getSystemId();
        final String expandedSystemId = expandSystemId(literalSystemId, baseSystemId);
        fCurrentEntity = new CurrentEntity(reader, encoding, publicId, baseSystemId, literalSystemId, expandedSystemId);
    }

    private Reader getReader(final XMLInputSource inputSource) {
        final Reader reader = inputSource.getCharacterStream();
        if (reader == null) {
            try {
                return new InputStreamReader(inputSource.getByteStream(), fJavaEncoding);
            }
            catch (final UnsupportedEncodingException e) {
                // should not happen as this encoding is already used to parse the "main" source
            }
        }
        return reader;
    }

    /**
     * Immediately evaluates an input source and add the new content (e.g. the
     * output written by an embedded script).
     *
     * @param inputSource The new input source to start evaluating.
     * @see #pushInputSource(XMLInputSource)
     */
    public void evaluateInputSource(final XMLInputSource inputSource) {
        final Scanner previousScanner = fScanner;
        final short previousScannerState = fScannerState;
        final CurrentEntity previousEntity = fCurrentEntity;
        final Reader reader = getReader(inputSource);

        final String encoding = inputSource.getEncoding();
        final String publicId = inputSource.getPublicId();
        final String baseSystemId = inputSource.getBaseSystemId();
        final String literalSystemId = inputSource.getSystemId();
        final String expandedSystemId = expandSystemId(literalSystemId, baseSystemId);
        fCurrentEntity = new CurrentEntity(reader, encoding, publicId, baseSystemId, literalSystemId, expandedSystemId);
        setScanner(fContentScanner);
        setScannerState(STATE_CONTENT);
        try {
            do {
                fScanner.scan(false);
            }
            while (fScannerState != STATE_END_DOCUMENT);
        }
        catch (final IOException e) {
            // ignore
        }
        setScanner(previousScanner);
        setScannerState(previousScannerState);
        fCurrentEntity = previousEntity;
    }

    /**
     * Cleans up used resources. For example, if scanning is terminated early, then
     * this method ensures all remaining open streams are closed.
     *
     * @param closeall Close all streams, including the original. This is used in
     *                 cases when the application has opened the original document
     *                 stream and should be responsible for closing it.
     */
    public void cleanup(final boolean closeall) {
        final int size = fCurrentEntityStack.size();
        if (size > 0) {
            // current entity is not the original, so close it
            if (fCurrentEntity != null) {
                fCurrentEntity.closeQuietly();
            }
            // close remaining streams
            for (int i = closeall ? 0 : 1; i < size; i++) {
                fCurrentEntity = fCurrentEntityStack.pop();
                fCurrentEntity.closeQuietly();
            }
        }
        else if (closeall && fCurrentEntity != null) {
            fCurrentEntity.closeQuietly();
        }
    }

    /** Returns the encoding. */
    @Override
    public String getEncoding() {
        return fCurrentEntity != null ? fCurrentEntity.encoding_ : null;
    }

    /** Returns the public identifier. */
    @Override
    public String getPublicId() {
        return fCurrentEntity != null ? fCurrentEntity.publicId : null;
    }

    /** Returns the base system identifier. */
    @Override
    public String getBaseSystemId() {
        return fCurrentEntity != null ? fCurrentEntity.baseSystemId : null;
    }

    /** Returns the literal system identifier. */
    @Override
    public String getLiteralSystemId() {
        return fCurrentEntity != null ? fCurrentEntity.literalSystemId : null;
    }

    /** Returns the expanded system identifier. */
    @Override
    public String getExpandedSystemId() {
        return fCurrentEntity != null ? fCurrentEntity.expandedSystemId : null;
    }

    /** Returns the current line number. */
    @Override
    public int getLineNumber() {
        return fCurrentEntity != null ? fCurrentEntity.getLineNumber() : -1;
    }

    /** Returns the current column number. */
    @Override
    public int getColumnNumber() {
        return fCurrentEntity != null ? fCurrentEntity.getColumnNumber() : -1;
    }

    /** Returns the XML version. */
    @Override
    public String getXMLVersion() {
        return fCurrentEntity != null ? fCurrentEntity.version : null;
    }

    /** Returns the character offset. */
    @Override
    public int getCharacterOffset() {
        return fCurrentEntity != null ? fCurrentEntity.getCharacterOffset() : -1;
    }

    /** Returns the default state for a feature. */
    @Override
    public Boolean getFeatureDefault(final String featureId) {
        final int length = RECOGNIZED_FEATURES != null ? RECOGNIZED_FEATURES.length : 0;
        for (int i = 0; i < length; i++) {
            if (RECOGNIZED_FEATURES[i].equals(featureId)) {
                return RECOGNIZED_FEATURES_DEFAULTS[i];
            }
        }
        return null;
    }

    /** Returns the default state for a property. */
    @Override
    public Object getPropertyDefault(final String propertyId) {
        final int length = RECOGNIZED_PROPERTIES != null ? RECOGNIZED_PROPERTIES.length : 0;
        for (int i = 0; i < length; i++) {
            if (RECOGNIZED_PROPERTIES[i].equals(propertyId)) {
                return RECOGNIZED_PROPERTIES_DEFAULTS[i];
            }
        }
        return null;
    }

    /** Returns recognized features. */
    @Override
    public String[] getRecognizedFeatures() {
        return RECOGNIZED_FEATURES;
    }

    /** Returns recognized properties. */
    @Override
    public String[] getRecognizedProperties() {
        return RECOGNIZED_PROPERTIES;
    }

    /** Resets the component. */
    @Override
    public void reset(final XMLComponentManager manager) throws XMLConfigurationException {

        // get features
        fAugmentations_ = manager.getFeature(AUGMENTATIONS);
        fReportErrors_ = manager.getFeature(REPORT_ERRORS);
        fScriptStripCDATADelims_ = manager.getFeature(SCRIPT_STRIP_CDATA_DELIMS);
        fScriptStripCommentDelims_ = manager.getFeature(SCRIPT_STRIP_COMMENT_DELIMS);
        fStyleStripCDATADelims_ = manager.getFeature(STYLE_STRIP_CDATA_DELIMS);
        fStyleStripCommentDelims_ = manager.getFeature(STYLE_STRIP_COMMENT_DELIMS);
        fIgnoreSpecifiedCharset_ = manager.getFeature(IGNORE_SPECIFIED_CHARSET);
        fCDATASections_ = manager.getFeature(CDATA_SECTIONS);
        fOverrideDoctype_ = manager.getFeature(OVERRIDE_DOCTYPE);
        fInsertDoctype_ = manager.getFeature(INSERT_DOCTYPE);
        fNormalizeAttributes_ = manager.getFeature(NORMALIZE_ATTRIBUTES);
        fParseNoScriptContent_ = manager.getFeature(PARSE_NOSCRIPT_CONTENT);
        fAllowSelfclosingIframe_ = manager.getFeature(ALLOW_SELFCLOSING_IFRAME);
        fAllowSelfclosingTags_ = manager.getFeature(ALLOW_SELFCLOSING_TAGS);

        // get properties
        fNamesElems = getNamesValue(String.valueOf(manager.getProperty(NAMES_ELEMS)));
        fNamesAttrs = getNamesValue(String.valueOf(manager.getProperty(NAMES_ATTRS)));
        fDefaultIANAEncoding = String.valueOf(manager.getProperty(DEFAULT_ENCODING));
        fErrorReporter = (HTMLErrorReporter) manager.getProperty(ERROR_REPORTER);
        fDoctypePubid = String.valueOf(manager.getProperty(DOCTYPE_PUBID));
        fDoctypeSysid = String.valueOf(manager.getProperty(DOCTYPE_SYSID));
    }

    /** Sets a feature. */
    @Override
    public void setFeature(final String featureId, final boolean state) {

        if (featureId.equals(AUGMENTATIONS)) {
            fAugmentations_ = state;
        }
        else if (featureId.equals(IGNORE_SPECIFIED_CHARSET)) {
            fIgnoreSpecifiedCharset_ = state;
        }
        else if (featureId.equals(SCRIPT_STRIP_CDATA_DELIMS)) {
            fScriptStripCDATADelims_ = state;
        }
        else if (featureId.equals(SCRIPT_STRIP_COMMENT_DELIMS)) {
            fScriptStripCommentDelims_ = state;
        }
        else if (featureId.equals(STYLE_STRIP_CDATA_DELIMS)) {
            fStyleStripCDATADelims_ = state;
        }
        else if (featureId.equals(STYLE_STRIP_COMMENT_DELIMS)) {
            fStyleStripCommentDelims_ = state;
        }
        else if (featureId.equals(PARSE_NOSCRIPT_CONTENT)) {
            fParseNoScriptContent_ = state;
        }
        else if (featureId.equals(ALLOW_SELFCLOSING_IFRAME)) {
            fAllowSelfclosingIframe_ = state;
        }
        else if (featureId.equals(ALLOW_SELFCLOSING_TAGS)) {
            fAllowSelfclosingTags_ = state;
        }
    }

    /** Sets a property. */
    @Override
    public void setProperty(final String propertyId, final Object value) throws XMLConfigurationException {

        if (propertyId.equals(NAMES_ELEMS)) {
            fNamesElems = getNamesValue(String.valueOf(value));
            return;
        }

        if (propertyId.equals(NAMES_ATTRS)) {
            fNamesAttrs = getNamesValue(String.valueOf(value));
            return;
        }

        if (propertyId.equals(DEFAULT_ENCODING)) {
            fDefaultIANAEncoding = String.valueOf(value);
            return;
        }
    }

    /** Sets the input source. */
    @Override
    public void setInputSource(final XMLInputSource source) throws IOException {

        // reset state
        fElementCount = 0;
        fElementDepth = -1;
        fByteStream = null;
        fCurrentEntityStack.clear();

        fBeginLineNumber = 1;
        fBeginColumnNumber = 1;
        fBeginCharacterOffset = 0;
        fEndLineNumber = fBeginLineNumber;
        fEndColumnNumber = fBeginColumnNumber;
        fEndCharacterOffset = fBeginCharacterOffset;

        // reset encoding information
        fIANAEncoding = fDefaultIANAEncoding;
        fJavaEncoding = fIANAEncoding;

        // get location information
        String encoding = source.getEncoding();
        final String publicId = source.getPublicId();
        final String baseSystemId = source.getBaseSystemId();
        final String literalSystemId = source.getSystemId();
        final String expandedSystemId = expandSystemId(literalSystemId, baseSystemId);

        // open stream
        Reader reader = source.getCharacterStream();
        if (reader == null) {
            InputStream inputStream = source.getByteStream();
            if (inputStream == null) {
                final URL url = new URL(expandedSystemId);
                inputStream = url.openStream();
            }
            fByteStream = new PlaybackInputStream(inputStream);
            final String[] encodings = new String[2];
            if (encoding == null) {
                fByteStream.detectEncoding(encodings);
            }
            else {
                encodings[0] = encoding;
            }
            if (encodings[0] == null) {
                encodings[0] = fDefaultIANAEncoding;
                if (fReportErrors_) {
                    fErrorReporter.reportWarning("HTML1000", null);
                }
            }
            if (encodings[1] == null) {
                encodings[1] = EncodingMap.getIANA2JavaMapping(encodings[0].toUpperCase(Locale.ROOT));
                if (encodings[1] == null) {
                    encodings[1] = encodings[0];
                    if (fReportErrors_) {
                        fErrorReporter.reportWarning("HTML1001", new Object[] {encodings[0]});
                    }
                }
            }
            fIANAEncoding = encodings[0];
            fJavaEncoding = encodings[1];
            encoding = fIANAEncoding;
            reader = new BufferedReader(new InputStreamReader(fByteStream, fJavaEncoding));
        }
        fCurrentEntity = new CurrentEntity(reader, encoding, publicId, baseSystemId, literalSystemId, expandedSystemId);

        // set scanner and state
        setScanner(fContentScanner);
        setScannerState(STATE_START_DOCUMENT);
    }

    /** Scans the document. */
    @Override
    public boolean scanDocument(final boolean complete) throws XNIException, IOException {
        do {
            if (!fScanner.scan(complete)) {
                return false;
            }
        }
        while (complete);

        return true;
    }

    /** Sets the document handler. */
    @Override
    public void setDocumentHandler(final XMLDocumentHandler handler) {
        fDocumentHandler = handler;
    }

    /** Returns the document handler. */
    @Override
    public XMLDocumentHandler getDocumentHandler() {
        return fDocumentHandler;
    }

    // Returns the value of the specified attribute, ignoring case.
    protected static String getValue(final XMLAttributes attrs, final String aname) {
        if (attrs != null) {
            final int length = attrs.getLength();
            for (int i = 0; i < length; i++) {
                if (attrs.getQName(i).equalsIgnoreCase(aname)) {
                    return attrs.getValue(i);
                }
            }
        }
        return null;
    }

    /**
     * Expands a system id and returns the system id as a URI, if it can be
     * expanded. A return value of null means that the identifier is already
     * expanded. An exception thrown indicates a failure to expand the id.
     *
     * @param systemId     The systemId to be expanded.
     * @param baseSystemId baseSystemId
     *
     * @return Returns the URI string representing the expanded system identifier. A
     *         null value indicates that the given system identifier is already
     *         expanded.
     *
     */
    @SuppressWarnings("unused")
    public static String expandSystemId(final String systemId, final String baseSystemId) {

        // check for bad parameters id
        if (systemId == null || systemId.length() == 0) {
            return systemId;
        }
        // if id already expanded, return
        try {
            new URI(systemId);
            return systemId;
        }
        catch (final URI.MalformedURIException e) {
            // continue on...
        }
        // normalize id
        final String id = fixURI(systemId);

        // normalize base
        URI base;
        URI uri = null;
        try {
            if (baseSystemId == null || baseSystemId.length() == 0 || baseSystemId.equals(systemId)) {

                String dir;
                try {
                    dir = fixURI(System.getProperty("user.dir"))
                            // deal with blanks in paths; maybe we have to do better uri encoding here
                            .replaceAll(" ", "%20");

                }
                catch (final SecurityException se) {
                    dir = "";
                }
                if (!dir.endsWith("/")) {
                    dir = dir + "/";
                }
                base = new URI("file", "", dir, null, null);
            }
            else {
                try {
                    base = new URI(fixURI(baseSystemId));
                }
                catch (final URI.MalformedURIException e) {
                    String dir;
                    try {
                        dir = fixURI(System.getProperty("user.dir"))
                                // deal with blanks in paths; maybe we have to do better uri encoding here
                                .replaceAll(" ", "%20");
                    }
                    catch (final SecurityException se) {
                        dir = "";
                    }
                    if (baseSystemId.indexOf(':') != -1) {
                        // for xml schemas we might have baseURI with
                        // a specified drive
                        base = new URI("file", "", fixURI(baseSystemId), null, null);
                    }
                    else {
                        if (!dir.endsWith("/")) {
                            dir = dir + "/";
                        }
                        dir = dir + fixURI(baseSystemId);
                        base = new URI("file", "", dir, null, null);
                    }
                }
            }

            // expand id
            uri = new URI(base, id);
        }
        catch (final URI.MalformedURIException e) {
            // let it go through
        }

        if (uri == null) {
            return systemId;
        }
        return uri.toString();
    }

    /**
     * Fixes a platform dependent filename to standard URI form.
     *
     * @param str The string to fix.
     *
     * @return Returns the fixed URI string.
     */
    protected static String fixURI(String str) {

        // handle platform dependent strings
        str = str.replace(java.io.File.separatorChar, '/');

        // Windows fix
        if (str.length() >= 2) {
            final char ch1 = str.charAt(1);
            // change "C:blah" to "/C:blah"
            if (ch1 == ':') {
                final char ch0 = String.valueOf(str.charAt(0)).toUpperCase(Locale.ROOT).charAt(0);
                if (ch0 >= 'A' && ch0 <= 'Z') {
                    str = "/" + str;
                }
            }
            // change "//blah" to "file://blah"
            else if (ch1 == '/' && str.charAt(0) == '/') {
                str = "file:" + str;
            }
        }

        // done
        return str;
    }

    // Modifies the given name based on the specified mode.
    protected static String modifyName(final String name, final short mode) {
        switch (mode) {
            case NAMES_UPPERCASE:
                return name.toUpperCase(Locale.ROOT);
            case NAMES_LOWERCASE:
                return name.toLowerCase(Locale.ROOT);
        }
        return name;
    }

    // Converts HTML names string value to constant value.
    //
    // @see #NAMES_NO_CHANGE
    // @see #NAMES_LOWERCASE
    // @see #NAMES_UPPERCASE
    protected static short getNamesValue(final String value) {
        if ("lower".equals(value)) {
            return NAMES_LOWERCASE;
        }
        if ("upper".equals(value)) {
            return NAMES_UPPERCASE;
        }
        return NAMES_NO_CHANGE;
    }
    // debugging

    // Sets the scanner.
    protected void setScanner(final Scanner scanner) {
        fScanner = scanner;
        if (DEBUG_SCANNER) {
            System.out.print("$$$ setScanner(");
            System.out.print(scanner != null ? scanner.getClass().getName() : "null");
            System.out.println(");");
        }
    }

    // Sets the scanner state.
    protected void setScannerState(final short state) {
        fScannerState = state;
        if (DEBUG_SCANNER_STATE) {
            System.out.print("$$$ setScannerState(");
            switch (fScannerState) {
                case STATE_CONTENT: {
                    System.out.print("STATE_CONTENT");
                    break;
                }
                case STATE_MARKUP_BRACKET: {
                    System.out.print("STATE_MARKUP_BRACKET");
                    break;
                }
                case STATE_START_DOCUMENT: {
                    System.out.print("STATE_START_DOCUMENT");
                    break;
                }
                case STATE_END_DOCUMENT: {
                    System.out.print("STATE_END_DOCUMENT");
                    break;
                }
            }
            System.out.println(");");
        }
    }

    // scanning

    // Scans a DOCTYPE line.
    protected void scanDoctype() throws IOException {
        String root = null;
        String pubid = null;
        String sysid = null;

        if (skipSpaces()) {
            root = scanName(true);
            if (root == null) {
                if (fReportErrors_) {
                    fErrorReporter.reportError("HTML1014", null);
                }
            }
            else {
                root = modifyName(root, fNamesElems);
            }
            if (skipSpaces()) {
                if (skip("PUBLIC", false)) {
                    skipSpaces();
                    pubid = scanLiteral();
                    if (skipSpaces()) {
                        sysid = scanLiteral();
                    }
                }
                else if (skip("SYSTEM", false)) {
                    skipSpaces();
                    sysid = scanLiteral();
                }
            }
        }
        int c;
        while ((c = fCurrentEntity.read()) != -1) {
            if (c == '<') {
                fCurrentEntity.rewind();
                break;
            }
            if (c == '>') {
                break;
            }
            if (c == '[') {
                skipMarkup(true);
                break;
            }
        }

        if (fDocumentHandler != null) {
            if (fOverrideDoctype_) {
                pubid = fDoctypePubid;
                sysid = fDoctypeSysid;
            }
            fEndLineNumber = fCurrentEntity.getLineNumber();
            fEndColumnNumber = fCurrentEntity.getColumnNumber();
            fEndCharacterOffset = fCurrentEntity.getCharacterOffset();
            fDocumentHandler.doctypeDecl(root, pubid, sysid, locationAugs());
        }
    }

    // Scans a quoted literal.
    protected String scanLiteral() throws IOException {
        final int quote = fCurrentEntity.read();

        if (quote == '"' || quote == '\'') {
            final XMLString str = fScanLiteral.clear();
            int c;
            while ((c = fCurrentEntity.read()) != -1) {
                if (c == quote) {
                    break;
                }
                if (c == '\n' || c == '\r') {
                    fCurrentEntity.rewind();
                    // NOTE: This collapses newlines to a single space.
                    // [Q] Is this the right thing to do here? -Ac
                    skipNewlines();
                    str.append(' ');
                }
                else if (c == '<') {
                    fCurrentEntity.rewind();
                    break;
                }
                else {
                    if (!str.appendCodePoint(c)) {
                        if (fReportErrors_) {
                            fErrorReporter.reportError("HTML1005", new Object[] {"&#" + c + ';'});
                        }
                    }
                }
            }
            if (c == -1) {
                if (fReportErrors_) {
                    fErrorReporter.reportError("HTML1007", null);
                }
                throw new EOFException();
            }
            return str.toString();
        }
        fCurrentEntity.rewind();
        return null;
    }

    // Scans a name.
    protected String scanName(final boolean strict) throws IOException {
        if (DEBUG_BUFFER) {
            fCurrentEntity.debugBufferIfNeeded("(scanName: ");
        }
        if (fCurrentEntity.offset_ == fCurrentEntity.length_) {
            if (fCurrentEntity.load(0) == -1) {
                if (DEBUG_BUFFER) {
                    fCurrentEntity.debugBufferIfNeeded(")scanName: ");
                }
                return null;
            }
        }
        int offset = fCurrentEntity.offset_;
        while (true) {
            while (fCurrentEntity.hasNext()) {
                final char c = fCurrentEntity.getNextChar();
                // this has been split up to cater to the needs of branch prediction
                if (strict && (!Character.isLetterOrDigit(c) && c != '-' && c != '.' && c != ':' && c != '_')) {
                    fCurrentEntity.rewind();
                    break;
                }
                // we check for the regular space first because isWhitespace is no inlineable and hence expensive
                // regular space should be the norm as well as newlines
                else if (!strict && (c == ' ' || c == '\n' || c == '=' || c == '/' || c == '>' || Character.isWhitespace(c))) {
                    fCurrentEntity.rewind();
                    break;
                }
            }
            if (fCurrentEntity.offset_ == fCurrentEntity.length_) {
                final int length = fCurrentEntity.length_ - offset;
                System.arraycopy(fCurrentEntity.buffer_, offset, fCurrentEntity.buffer_, 0, length);
                final int count = fCurrentEntity.load(length);
                offset = 0;
                if (count == -1) {
                    break;
                }
            }
            else {
                break;
            }
        }
        final int length = fCurrentEntity.offset_ - offset;
        final String name = length > 0 ? new String(fCurrentEntity.buffer_, offset, length) : null;
        if (DEBUG_BUFFER) {
            fCurrentEntity.debugBufferIfNeeded(")scanName: ", " -> \"" + name + '"');
        }
        return name;
    }

    // Scans a tag name.
    protected String scanTagName() throws IOException {
        if (DEBUG_BUFFER) {
            fCurrentEntity.debugBufferIfNeeded("(scanName: ");
        }
        if (fCurrentEntity.offset_ == fCurrentEntity.length_) {
            if (fCurrentEntity.load(0) == -1) {
                if (DEBUG_BUFFER) {
                    fCurrentEntity.debugBufferIfNeeded(")scanName: ");
                }
                return null;
            }
        }
        int offset = fCurrentEntity.offset_;
        boolean isFirst = true;

        while (true) {
            while (fCurrentEntity.hasNext()) {
                final char c = fCurrentEntity.getNextChar();

                if (isFirst) {
                    isFirst = false;

                    // first char has to be ASCII alpha
                    if (!('A' <= c && c <= 'Z' || 'a' <= c && c <= 'z')) {
                        fCurrentEntity.rewind();
                        break;
                    }
                }
                else {
                    if (c == '\t' || c == '\r' || c == '\n' || c == ' ' || c == 0
                            || c == '/' || c == '>') {
                        fCurrentEntity.rewind();
                        break;
                    }
                }
            }
            if (fCurrentEntity.offset_ == fCurrentEntity.length_) {
                final int length = fCurrentEntity.length_ - offset;
                System.arraycopy(fCurrentEntity.buffer_, offset, fCurrentEntity.buffer_, 0, length);
                final int count = fCurrentEntity.load(length);
                offset = 0;
                if (count == -1) {
                    break;
                }
            }
            else {
                break;
            }
        }

        final int length = fCurrentEntity.offset_ - offset;
        final String name = length > 0 ? new String(fCurrentEntity.buffer_, offset, length) : null;
        if (DEBUG_BUFFER) {
            fCurrentEntity.debugBufferIfNeeded(")scanName: ", " -> \"" + name + '"');
        }

        return name;
    }

    // Scans an entity reference.
    protected int scanEntityRef(final XMLString str, final boolean content) throws IOException {
        str.clearAndAppend('&');

        // use readPreservingBufferContent inside this method to be sure we can rewind

        int nextChar = readPreservingBufferContent();
        if (nextChar == -1) {
            return returnEntityRefString(str, content);
        }
        str.append((char) nextChar);

        if ('#' == nextChar) {
            final HTMLUnicodeEntitiesParser parser = new HTMLUnicodeEntitiesParser();

            nextChar = readPreservingBufferContent();
            if (nextChar != -1) {
                str.append((char) nextChar);
            }

            while (nextChar != -1 && parser.parseNumeric(nextChar)) {
                nextChar = readPreservingBufferContent();
                if (nextChar != -1) {
                    str.append((char) nextChar);
                }
            }

            final String match = parser.getMatch();
            if (match == null) {
                final String consumed = str.toString();
                fCurrentEntity.rewind(consumed.length() - 1);
                str.clearAndAppend('&');
            }
            else {
                fCurrentEntity.rewind(parser.getRewindCount());
                str.clear().append(match);
            }
            return returnEntityRefString(str, content);
        }

        // we read regular entities such as < here
        int readCount = 1;
        // this will be our state of the parsing, we have to feed that back to the parser
        HTMLNamedEntitiesParser.State result = null;
        // in case of incorrect entities such as ¬in where we are supposed to recognize
        // ¬, we have to keep the last matching state, so we can fall back to it
        HTMLNamedEntitiesParser.State lastMatchingResult = null;

        while (nextChar != -1) {
            final HTMLNamedEntitiesParser.State intermediateResult = HTMLNamedEntitiesParser.get().lookup(nextChar, result);

            if (intermediateResult.endNode_) {
                result = intermediateResult;
                break;
            }
            if (intermediateResult == result) {
                // nothing changed, more characters have not done anything
                break;
            }
            if (intermediateResult.isMatch_) {
                lastMatchingResult = intermediateResult;
            }
            result = intermediateResult;

            nextChar = readPreservingBufferContent();
            if (nextChar != -1) {
                str.append((char) nextChar);
                readCount++;
            }
        }

        // it might happen that we read <a but need just < so
        // we have to go back to the last match
        if (!result.isMatch_ && lastMatchingResult != null) {
            result = lastMatchingResult;
        }

        // hopefully, we got something, otherwise we have to go
        // the error route
        if (result.isMatch_) {
            // in case we overran because the entity was broken or
            // not terminated by a ;, we have to reset the char
            // position because we read one more char than the entity has
            fCurrentEntity.rewind(readCount - result.length_);

            // if we have a correct character that is terminate by ;
            // we can keep things simple
            if (result.endsWithSemicolon_) {
                str.clear().append(result.resolvedValue_);
            }
            else {
                if (fReportErrors_) {
                    fErrorReporter.reportWarning("HTML1004", null);
                }

                // If there is a match
                // {
                //      If the character reference was consumed as part of an attribute, and the last character matched is not
                //      a U+003B SEMICOLON character (;), and the next input character is either a U+003D EQUALS SIGN character (=)
                //      or an ASCII alphanumeric,
                //      then, for historical reasons, flush code points consumed as a character reference and switch to the return state.

                //      Otherwise:
                //      1. If the last character matched is not a U+003B SEMICOLON character (;), then this is a missing-semicolon-after-character-reference parse error.
                //      2. Set the temporary buffer to the empty string. Append one or two characters corresponding to the character reference name
                //      (as given by the second column of the named character references table) to the temporary buffer.
                //      3. Flush code points consumed as a character reference. Switch to the return state.
                // }
                // Otherwise
                // {
                //      Flush code points consumed as a character reference. Switch to the ambiguous ampersand state.
                // }
                if (content) {
                    str.clear().append(result.resolvedValue_);
                }
                else {
                    // look ahead
                    // 13.2.5.73
                    final int matchLength = result.length_ + 1;
                    if (matchLength < str.length()) {
                        nextChar = str.charAt(matchLength);
                        if ('=' == nextChar || '0' <= nextChar && nextChar <= '9' || 'A' <= nextChar && nextChar <= 'Z'
                                || 'a' <= nextChar && nextChar <= 'z') {
                            // we just shorten our temp str instead of copying stuff around
                            str.shortenBy(str.length() - result.length_ - 1);
                        }
                        else {
                            str.clear().append(result.resolvedValue_);
                        }
                    }
                    else {
                        str.clear().append(result.resolvedValue_);
                    }
                }
            }
        }
        else {
            // Entity not found, rewind and continue
            // broken from here, aka keeping everything
            fCurrentEntity.rewind(readCount);
            str.clearAndAppend('&');
        }

        return returnEntityRefString(str, content);
    }

    private int returnEntityRefString(final XMLString str, final boolean content) {
        if (content && fDocumentHandler != null && fElementCount >= fElementDepth) {
            fEndLineNumber = fCurrentEntity.getLineNumber();
            fEndColumnNumber = fCurrentEntity.getColumnNumber();
            fEndCharacterOffset = fCurrentEntity.getCharacterOffset();
            fDocumentHandler.characters(str, locationAugs());
        }
        return -1;
    }

    // Returns true if the specified text is present and is skipped.
    protected boolean skip(final String s, final boolean caseSensitive) throws IOException {
        final int length = s != null ? s.length() : 0;
        for (int i = 0; i < length; i++) {
            if (fCurrentEntity.offset_ == fCurrentEntity.length_) {
                System.arraycopy(fCurrentEntity.buffer_, fCurrentEntity.offset_ - i, fCurrentEntity.buffer_, 0, i);
                if (fCurrentEntity.load(i) == -1) {
                    fCurrentEntity.offset_ = 0;
                    return false;
                }
            }
            char c0 = s.charAt(i);
            char c1 = fCurrentEntity.getNextChar();
            if (!caseSensitive) {
                c0 = String.valueOf(c0).toUpperCase(Locale.ROOT).charAt(0);
                c1 = String.valueOf(c1).toUpperCase(Locale.ROOT).charAt(0);
            }
            if (c0 != c1) {
                fCurrentEntity.rewind(i + 1);
                return false;
            }
        }
        return true;
    }

    // Skips markup.
    protected boolean skipMarkup(final boolean balance) throws IOException {
        if (DEBUG_BUFFER) {
            fCurrentEntity.debugBufferIfNeeded("(skipMarkup: ");
        }
        int depth = 1;
        boolean slashgt = false;
        OUTER: while (true) {
            if (fCurrentEntity.offset_ == fCurrentEntity.length_) {
                if (fCurrentEntity.load(0) == -1) {
                    break OUTER;
                }
            }
            while (fCurrentEntity.hasNext()) {
                char c = fCurrentEntity.getNextChar();
                if (balance && c == '<') {
                    depth++;
                }
                else if (c == '>') {
                    depth--;
                    if (depth == 0) {
                        break OUTER;
                    }
                }
                else if (c == '/') {
                    if (fCurrentEntity.offset_ == fCurrentEntity.length_) {
                        if (fCurrentEntity.load(0) == -1) {
                            break OUTER;
                        }
                    }
                    c = fCurrentEntity.getNextChar();
                    if (c == '>') {
                        slashgt = true;
                        depth--;
                        if (depth == 0) {
                            break OUTER;
                        }
                    }
                    else {
                        fCurrentEntity.rewind();
                    }
                }
                else if (c == '\r' || c == '\n') {
                    fCurrentEntity.rewind();
                    skipNewlines();
                }
            }
        }
        if (DEBUG_BUFFER) {
            fCurrentEntity.debugBufferIfNeeded(")skipMarkup: ", " -> " + slashgt);
        }
        return slashgt;
    }

    // Skips whitespace.
    protected boolean skipSpaces() throws IOException {
        if (DEBUG_BUFFER) {
            fCurrentEntity.debugBufferIfNeeded("(skipSpaces: ");
        }
        boolean spaces = false;
        while (true) {
            if (fCurrentEntity.offset_ == fCurrentEntity.length_) {
                if (fCurrentEntity.load(0) == -1) {
                    break;
                }
            }
            final char c = fCurrentEntity.getNextChar();
            // compare against the usual suspects first before going
            // the expensive route
            if (c == ' ' || c == '\n' || Character.isWhitespace(c)) {
                spaces = true;
                // unix \n might dominate
                if (c == '\n' || c == '\r') {
                    fCurrentEntity.rewind();
                    skipNewlines();
                }
            }
            else {
                fCurrentEntity.rewind();
                break;
            }
        }
        if (DEBUG_BUFFER) {
            fCurrentEntity.debugBufferIfNeeded(")skipSpaces: ", " -> " + spaces);
        }
        return spaces;
    }

    // Skips newlines and returns the number of newlines skipped.
    protected int skipNewlines() throws IOException {
        if (DEBUG_BUFFER) {
            fCurrentEntity.debugBufferIfNeeded("(skipNewlines: ");
        }

        if (fCurrentEntity.offset_ == fCurrentEntity.length_) {
            if (fCurrentEntity.load(0) == -1) {
                if (DEBUG_BUFFER) {
                    fCurrentEntity.debugBufferIfNeeded(")skipNewlines: ");
                }
                return 0;
            }
        }
        char c = fCurrentEntity.getCurrentChar();
        int newlines = 0;
        if (c == '\n' || c == '\r') {
            do {
                c = fCurrentEntity.getNextChar();
                if (c == '\n') {
                    newlines++;
                    if (fCurrentEntity.offset_ == fCurrentEntity.length_) {
                        fCurrentEntity.offset_ = newlines;
                        if (fCurrentEntity.load(newlines) == -1) {
                            break;
                        }
                    }
                }
                else if (c == '\r') {
                    newlines++;
                    if (fCurrentEntity.offset_ == fCurrentEntity.length_) {
                        fCurrentEntity.offset_ = newlines;
                        if (fCurrentEntity.load(newlines) == -1) {
                            break;
                        }
                    }
                    if (fCurrentEntity.getCurrentChar() == '\n') {
                        fCurrentEntity.offset_++;
                        fCurrentEntity.characterOffset_++;
                    }
                }
                else {
                    fCurrentEntity.rewind();
                    break;
                }
            }
            while (fCurrentEntity.offset_ < fCurrentEntity.length_ - 1);
            fCurrentEntity.incLine(newlines);
        }
        if (DEBUG_BUFFER) {
            fCurrentEntity.debugBufferIfNeeded(")skipNewlines: ", " -> " + newlines);
        }
        return newlines;
    }

    // infoset utility methods

    // Returns an augmentations object with a location item added.
    protected final Augmentations locationAugs() {
        // we don't have to create a new LocationItem all the time, because the interface says:
        // Methods that receive Augmentations are required to copy the information
        // if it is to be saved for use beyond the scope of the method.
        if (fAugmentations_) {
            fLocationItem.setValues(fBeginLineNumber, fBeginColumnNumber, fBeginCharacterOffset, fEndLineNumber,
                    fEndColumnNumber, fEndCharacterOffset);
            return fLocationItem;
        }
        return null;
    }

    // Returns an augmentations object with a synthesized item added.
    protected final Augmentations synthesizedAugs() {
        if (fAugmentations_) {
            return SYNTHESIZED_ITEM;
        }
        return null;
    }

    // Returns true if the name is a built-in XML general entity reference.
    protected static boolean builtinXmlRef(final String name) {
        return "amp".equals(name) || "lt".equals(name) || "gt".equals(name) || "quot".equals(name)
                || "apos".equals(name);
    }

    /**
     * Basic scanner interface.
     *
     * @author Andy Clark
     */
    public interface Scanner {

        /**
         * Scans part of the document. This interface allows scanning to be performed in
         * a pulling manner.
         *
         * @param complete True if the scanner should not return until scanning is
         *                 complete.
         *
         * @return True if additional scanning is required.
         *
         * @throws IOException Thrown if I/O error occurs.
         */
        boolean scan(boolean complete) throws IOException;
    }

    /**
     * Current entity.
     *
     * @author Andy Clark
     */
    private static final class CurrentEntity {

        /** Character stream. */
        private Reader stream_;

        /** Encoding. */
        String encoding_;

        /** Public identifier. */
        public final String publicId;

        /** Base system identifier. */
        public final String baseSystemId;

        /** Literal system identifier. */
        public final String literalSystemId;

        /** Expanded system identifier. */
        final String expandedSystemId;

        /** XML version. */
        public final String version = "1.0";

        /** Line number. */
        private int lineNumber_ = 1;

        /** Column number. */
        private int columnNumber_ = 1;

        /** Character offset in the file. */
        int characterOffset_ = 0;

        // buffer

        /** Character buffer. */
        char[] buffer_ = new char[DEFAULT_BUFFER_SIZE];

        /** Offset into character buffer. */
        int offset_ = 0;

        /** Length of characters read into character buffer. */
        int length_ = 0;

        private boolean endReached_ = false;

        // Constructs an entity from the specified stream.
        CurrentEntity(final Reader stream, final String encoding, final String publicId,
                final String baseSystemId, final String literalSystemId, final String expandedSystemId) {
            stream_ = stream;
            this.encoding_ = encoding;
            this.publicId = publicId;
            this.baseSystemId = baseSystemId;
            this.literalSystemId = literalSystemId;
            this.expandedSystemId = expandedSystemId;
        }

        char getCurrentChar() {
            return buffer_[offset_];
        }

        /**
         * @return the current character and moves to next one.
         */
        char getNextChar() {
            characterOffset_++;
            columnNumber_++;
            return buffer_[offset_++];
        }

        void closeQuietly() {
            try {
                stream_.close();
            }
            catch (final IOException e) {
                // ignore
            }
        }

        /**
         * Indicates if there are characters left.
         */
        boolean hasNext() {
            return offset_ < length_;
        }

        /**
         * Loads a new chunk of data into the buffer and returns the number of
         * characters loaded or -1 if no additional characters were loaded.
         *
         * @param loadOffset The offset at which new characters should be loaded.
         * @return count
         * @throws IOException in case of io problems
         */
        protected int load(final int loadOffset) throws IOException {
            if (DEBUG_BUFFER) {
                debugBufferIfNeeded("(load: ");
            }
            // resize buffer, if needed
            if (loadOffset == this.buffer_.length) {
                final int adjust = this.buffer_.length / 4;
                final char[] array = new char[buffer_.length + adjust];
                System.arraycopy(this.buffer_, 0, array, 0, this.length_);
                this.buffer_ = array;
            }
            // read a block of characters
            final int count = stream_.read(buffer_, loadOffset, buffer_.length - loadOffset);
            if (count == -1) {
                this.length_ = loadOffset;
                this.endReached_ = true;
            }
            else {
                this.length_ = count + loadOffset;
            }
            this.offset_ = loadOffset;
            if (DEBUG_BUFFER) {
                debugBufferIfNeeded(")load: ", " -> " + count);
            }
            return count;
        }

        // Reads a single character.
        protected int read() throws IOException {
            if (DEBUG_BUFFER) {
                debugBufferIfNeeded("(read: ");
            }

            if (offset_ == length_) {
                if (endReached_) {
                    return -1;
                }
                if (load(0) == -1) {
                    if (DEBUG_BUFFER) {
                        System.out.println(")read: -> -1");
                    }
                    return -1;
                }
            }
            final char c = buffer_[offset_];
            offset_++;
            characterOffset_++;
            columnNumber_++;

            if (DEBUG_BUFFER) {
                debugBufferIfNeeded(")read: ", " -> " + c);
            }

            return c;
        }

        /** Prints the contents of the character buffer to standard out. */
        private void debugBufferIfNeeded(final String prefix) {
            debugBufferIfNeeded(prefix, "");
        }

        /** Prints the contents of the character buffer to standard out. */
        private void debugBufferIfNeeded(final String prefix, final String suffix) {
            if (DEBUG_BUFFER) {
                System.out.print(prefix);
                System.out.print('[');
                System.out.print(length_);
                System.out.print(' ');
                System.out.print(offset_);
                if (length_ > 0) {
                    System.out.print(" \"");
                    for (int i = 0; i < length_; i++) {
                        if (i == offset_) {
                            System.out.print('^');
                        }
                        final char c = buffer_[i];
                        switch (c) {
                            case '\r':
                                System.out.print("\\r");
                                break;
                            case '\n':
                                System.out.print("\\n");
                                break;
                            case '\t':
                                System.out.print("\\t");
                                break;
                            case '"':
                                System.out.print("\\\"");
                                break;
                            default:
                                System.out.print(c);
                        }
                    }
                    if (offset_ == length_) {
                        System.out.print('^');
                    }
                    System.out.print('"');
                }
                System.out.print(']');
                System.out.print(suffix);
                System.out.println();
            }
        }

        void setStream(final InputStreamReader inputStreamReader) {
            stream_ = inputStreamReader;
            offset_ = 0;
            length_ = 0;
            characterOffset_ = 0;
            lineNumber_ = 1;
            columnNumber_ = 1;
            encoding_ = inputStreamReader.getEncoding();
        }

        /**
         * Goes back, cancelling the effect of the previous read() call.
         */
        void rewind() {
            offset_--;
            characterOffset_--;
            columnNumber_--;
        }

        void rewind(final int i) {
            offset_ -= i;
            characterOffset_ -= i;
            columnNumber_ -= i;
        }

        void incLine() {
            lineNumber_++;
            columnNumber_ = 1;
        }

        void incLine(final int nbLines) {
            lineNumber_ += nbLines;
            columnNumber_ = 1;
        }

        public int getLineNumber() {
            return lineNumber_;
        }

        void resetBuffer(final XMLString xmlBuffer, final int lineNumber, final int columnNumber,
                final int characterOffset) {
            lineNumber_ = lineNumber;
            columnNumber_ = columnNumber;
            this.characterOffset_ = characterOffset;

            // TODO RBRi
            this.buffer_ = xmlBuffer.getChars();
            this.offset_ = 0;
            this.length_ = xmlBuffer.length();
        }

        int getColumnNumber() {
            return columnNumber_;
        }

        void restorePosition(final int originalOffset, final int originalColumnNumber, final int originalCharacterOffset) {
            this.offset_ = originalOffset;
            this.columnNumber_ = originalColumnNumber;
            this.characterOffset_ = originalCharacterOffset;
        }

        int getCharacterOffset() {
            return characterOffset_;
        }
    }

    /*
     * Script parsing states based on https://html.spec.whatwg.org/multipage/parsing.html#script-data-state
     */
    private enum ScanScriptState {
        /** Script data state */
        DATA,
        /** Script data escaped state */
        ESCAPED,
        /** Script data escaped less-than sign state */
        ESCAPED_LT,
        /** Script data double escaped state */
        DOUBLE_ESCAPED,
        /** Script data double escaped less-than sign state */
        DOUBLE_ESCAPED_LT,
    }

    /**
     * The primary HTML document scanner.
     *
     * @author Andy Clark
     */
    public class ContentScanner implements Scanner {

        /** A qualified name. */
        private final QName qName_ = new QName();

        /** Attributes. */
        private final XMLAttributesImpl attributes_ = new XMLAttributesImpl();

        /** Scan. */
        @Override
        public boolean scan(final boolean complete) throws IOException {
            boolean next;
            do {
                try {
                    next = false;
                    switch (fScannerState) {
                        case STATE_CONTENT: {
                            fBeginLineNumber = fCurrentEntity.getLineNumber();
                            fBeginColumnNumber = fCurrentEntity.getColumnNumber();
                            fBeginCharacterOffset = fCurrentEntity.getCharacterOffset();
                            final int c = fCurrentEntity.read();
                            if (c == -1) {
                                throw new EOFException();
                            }
                            if (c == '<') {
                                setScannerState(STATE_MARKUP_BRACKET);
                                next = true;
                            }
                            else if (c == '&') {
                                scanEntityRef(fStringBuffer, true);
                            }
                            else {
                                fCurrentEntity.rewind();
                                scanCharacters();
                            }
                            break;
                        }
                        case STATE_MARKUP_BRACKET: {
                            final int c = fCurrentEntity.read();
                            if (c == -1) {
                                if (fReportErrors_) {
                                    fErrorReporter.reportError("HTML1003", null);
                                }
                                if (fDocumentHandler != null && fElementCount >= fElementDepth) {
                                    fStringBuffer.clearAndAppend('<');
                                    fDocumentHandler.characters(fStringBuffer, null);
                                }
                                throw new EOFException();
                            }
                            if (c == '!') {
                                // process some strange self closing comments first
                                if (skip("--->", false)
                                        || skip("-->", false)
                                        || skip("->", false)
                                        || skip(">", false)) {
                                    fEndLineNumber = fCurrentEntity.getLineNumber();
                                    fEndColumnNumber = fCurrentEntity.getColumnNumber();
                                    fEndCharacterOffset = fCurrentEntity.getCharacterOffset();
                                    // using EMPTY here is slightly dangerous but a review showed
                                    // that all implementations of comment() only read the data
                                    // never do anything else with it, so safe for now
                                    fDocumentHandler.comment(XMLString.EMPTY, locationAugs());
                                }
                                else if (skip("-!>", false)) {
                                    fEndLineNumber = fCurrentEntity.getLineNumber();
                                    fEndColumnNumber = fCurrentEntity.getColumnNumber();
                                    fEndCharacterOffset = fCurrentEntity.getCharacterOffset();
                                    final XMLString str = new XMLString();
                                    str.append("-!");
                                    fDocumentHandler.comment(str, locationAugs());
                                }
                                else if (skip("--", false)) {
                                    scanComment();
                                }
                                else if (skip("[CDATA[", false)) {
                                    scanCDATA();
                                }
                                else if (skip("DOCTYPE", false)) {
                                    scanDoctype();
                                }
                                else {
                                    if (fReportErrors_) {
                                        fErrorReporter.reportError("HTML1002", null);
                                    }
                                    skipMarkup(true);
                                }
                            }
                            else if (c == '?') {
                                scanPI();
                            }
                            else if (c == '/') {
                                scanEndElement();
                            }
                            else {
                                fCurrentEntity.rewind();
                                fElementCount++;
                                fSingleBoolean[0] = false;

                                final String ename = scanStartElement(fSingleBoolean);
                                final String enameLC = ename == null ? null : ename.toLowerCase(Locale.ROOT);

                                fBeginLineNumber = fCurrentEntity.getLineNumber();
                                fBeginColumnNumber = fCurrentEntity.getColumnNumber();
                                fBeginCharacterOffset = fCurrentEntity.getCharacterOffset();

                                if ("script".equals(enameLC)) {
                                    scanScriptContent();
                                }
                                else if (!fAllowSelfclosingTags_ && !fAllowSelfclosingIframe_ && "iframe".equals(enameLC)) {
                                    scanUntilEndTag("iframe");
                                }
                                else if (!fParseNoScriptContent_ && "noscript".equals(enameLC)) {
                                    scanUntilEndTag("noscript");
                                }
                                else if ("noframes".equals(enameLC)) {
                                    scanUntilEndTag("noframes");
                                }
                                else if ("noembed".equals(enameLC)) {
                                    scanUntilEndTag("noembed");
                                }
                                else if (ename != null && htmlConfiguration_.getHtmlElements().getElement(enameLC).isSpecial()
                                        && (!"title".equals(enameLC) || isEnded(enameLC))) {
                                    if ("plaintext".equals(enameLC)) {
                                        setScanner(new PlainTextScanner());
                                    }
                                    else {
                                        setScanner(fSpecialScanner.setElementName(ename));
                                        setScannerState(STATE_CONTENT);
                                    }
                                    return true;
                                }
                            }
                            setScannerState(STATE_CONTENT);
                            break;
                        }
                        case STATE_START_DOCUMENT: {
                            if (fDocumentHandler != null && fElementCount >= fElementDepth) {
                                if (DEBUG_CALLBACKS) {
                                    System.out.println("startDocument()");
                                }
                                final XMLLocator locator = HTMLScanner.this;
                                final String encoding = fIANAEncoding;
                                final Augmentations augs = locationAugs();
                                final NamespaceContext nscontext = new NamespaceSupport();
                                fDocumentHandler.startDocument(locator, encoding, nscontext, augs);
                            }
                            if (fInsertDoctype_ && fDocumentHandler != null) {
                                String root = htmlConfiguration_.getHtmlElements().getElement(HTMLElements.HTML).name;
                                root = modifyName(root, fNamesElems);
                                final String pubid = fDoctypePubid;
                                final String sysid = fDoctypeSysid;
                                fDocumentHandler.doctypeDecl(root, pubid, sysid, synthesizedAugs());
                            }
                            setScannerState(STATE_CONTENT);
                            break;
                        }
                        case STATE_END_DOCUMENT: {
                            if (fDocumentHandler != null && fElementCount >= fElementDepth && complete) {
                                if (DEBUG_CALLBACKS) {
                                    System.out.println("endDocument()");
                                }
                                fEndLineNumber = fCurrentEntity.getLineNumber();
                                fEndColumnNumber = fCurrentEntity.getColumnNumber();
                                fEndCharacterOffset = fCurrentEntity.getCharacterOffset();
                                fDocumentHandler.endDocument(locationAugs());
                            }
                            return false;
                        }
                        default: {
                            throw new RuntimeException("unknown scanner state: " + fScannerState);
                        }
                    }
                }
                catch (final EOFException e) {
                    if (fCurrentEntityStack.isEmpty()) {
                        setScannerState(STATE_END_DOCUMENT);
                    }
                    else {
                        fCurrentEntity = fCurrentEntityStack.pop();
                    }
                    next = true;
                }
            }
            while (next || complete);
            return true;
        }

        /**
         * Scans the content of