All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.htmlunit.cyberneko.HTMLScanner Maven / Gradle / Ivy

/*
 * Copyright (c) 2002-2009 Andy Clark, Marc Guillemot
 * Copyright (c) 2017-2024 Ronald Brill
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.htmlunit.cyberneko;

import java.io.BufferedReader;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.Locale;

import org.htmlunit.cyberneko.io.PlaybackInputStream;
import org.htmlunit.cyberneko.util.MiniStack;
import org.htmlunit.cyberneko.xerces.util.EncodingTranslator;
import org.htmlunit.cyberneko.xerces.util.NamespaceSupport;
import org.htmlunit.cyberneko.xerces.util.StandardEncodingTranslator;
import org.htmlunit.cyberneko.xerces.util.URI;
import org.htmlunit.cyberneko.xerces.util.XMLAttributesImpl;
import org.htmlunit.cyberneko.xerces.xni.Augmentations;
import org.htmlunit.cyberneko.xerces.xni.NamespaceContext;
import org.htmlunit.cyberneko.xerces.xni.QName;
import org.htmlunit.cyberneko.xerces.xni.XMLAttributes;
import org.htmlunit.cyberneko.xerces.xni.XMLDocumentHandler;
import org.htmlunit.cyberneko.xerces.xni.XMLLocator;
import org.htmlunit.cyberneko.xerces.xni.XMLString;
import org.htmlunit.cyberneko.xerces.xni.XNIException;
import org.htmlunit.cyberneko.xerces.xni.parser.XMLComponentManager;
import org.htmlunit.cyberneko.xerces.xni.parser.XMLConfigurationException;
import org.htmlunit.cyberneko.xerces.xni.parser.XMLDocumentSource;
import org.htmlunit.cyberneko.xerces.xni.parser.XMLInputSource;

/**
 * A simple HTML scanner. This scanner makes no attempt to balance tags or fix
 * other problems in the source document — it just scans what it can and
 * generates XNI document "events", ignoring errors of all kinds.
 * <p>
 * This component recognizes the following features:
 * <ul>
 * <li>http://cyberneko.org/html/features/augmentations
 * <li>http://cyberneko.org/html/features/report-errors
 * <li>http://cyberneko.org/html/features/scanner/script/strip-cdata-delims
 * <li>http://cyberneko.org/html/features/scanner/script/strip-comment-delims
 * <li>http://cyberneko.org/html/features/scanner/style/strip-cdata-delims
 * <li>http://cyberneko.org/html/features/scanner/style/strip-comment-delims
 * <li>http://cyberneko.org/html/features/scanner/ignore-specified-charset
 * <li>http://cyberneko.org/html/features/scanner/cdata-sections
 * <li>http://cyberneko.org/html/features/override-doctype
 * <li>http://cyberneko.org/html/features/insert-doctype
 * <li>http://cyberneko.org/html/features/parse-noscript-content
 * <li>http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe
 * <li>http://cyberneko.org/html/features/scanner/allow-selfclosing-tags
 * <li>http://cyberneko.org/html/features/scanner/normalize-attrs
 * <li>http://cyberneko.org/html/features/scanner/plain-attr-values
 * </ul>
 * <p>
 * This component recognizes the following properties:
 * <ul>
 * <li>http://cyberneko.org/html/properties/names/elems
 * <li>http://cyberneko.org/html/properties/names/attrs
 * <li>http://cyberneko.org/html/properties/default-encoding
 * <li>http://cyberneko.org/html/properties/error-reporter
 * <li>http://cyberneko.org/html/properties/encoding-translator
 * <li>http://cyberneko.org/html/properties/doctype/pubid
 * <li>http://cyberneko.org/html/properties/doctype/sysid
 * </ul>
* * @see HTMLElements * * @author Andy Clark * @author Marc Guillemot * @author Ahmed Ashour * @author Ronald Brill * @author René Schwietzke */ public class HTMLScanner implements XMLDocumentSource, XMLLocator, HTMLComponent { // doctype info: HTML 4.01 strict /** HTML 4.01 strict public identifier ("-//W3C//DTD HTML 4.01//EN"). */ public static final String HTML_4_01_STRICT_PUBID = "-//W3C//DTD HTML 4.01//EN"; /** * HTML 4.01 strict system identifier ("http://www.w3.org/TR/html4/strict.dtd"). */ public static final String HTML_4_01_STRICT_SYSID = "http://www.w3.org/TR/html4/strict.dtd"; // doctype info: HTML 4.01 loose /** * HTML 4.01 transitional public identifier ("-//W3C//DTD HTML 4.01 * Transitional//EN"). */ public static final String HTML_4_01_TRANSITIONAL_PUBID = "-//W3C//DTD HTML 4.01 Transitional//EN"; /** * HTML 4.01 transitional system identifier * ("http://www.w3.org/TR/html4/loose.dtd"). */ public static final String HTML_4_01_TRANSITIONAL_SYSID = "http://www.w3.org/TR/html4/loose.dtd"; // doctype info: HTML 4.01 frameset /** * HTML 4.01 frameset public identifier ("-//W3C//DTD HTML 4.01 Frameset//EN"). */ public static final String HTML_4_01_FRAMESET_PUBID = "-//W3C//DTD HTML 4.01 Frameset//EN"; /** * HTML 4.01 frameset system identifier * ("http://www.w3.org/TR/html4/frameset.dtd"). */ public static final String HTML_4_01_FRAMESET_SYSID = "http://www.w3.org/TR/html4/frameset.dtd"; // features /** Include infoset augmentations. */ public static final String AUGMENTATIONS = "http://cyberneko.org/html/features/augmentations"; /** Report errors. */ public static final String REPORT_ERRORS = "http://cyberneko.org/html/features/report-errors"; /** * Strip HTML comment delimiters ("<!−−" and * "−−>") from SCRIPT tag contents. */ public static final String SCRIPT_STRIP_COMMENT_DELIMS = "http://cyberneko.org/html/features/scanner/script/strip-comment-delims"; /** * Strip XHTML CDATA delimiters ("<![CDATA[" and "]]>") from SCRIPT tag * contents. 
*/
public static final String SCRIPT_STRIP_CDATA_DELIMS = "http://cyberneko.org/html/features/scanner/script/strip-cdata-delims";

/**
 * Strip HTML comment delimiters ({@code <!--} and {@code -->}) from STYLE
 * tag contents.
 */
public static final String STYLE_STRIP_COMMENT_DELIMS = "http://cyberneko.org/html/features/scanner/style/strip-comment-delims";

/**
 * Strip XHTML CDATA delimiters ("&lt;![CDATA[" and "]]&gt;") from STYLE tag
 * contents.
 */
public static final String STYLE_STRIP_CDATA_DELIMS = "http://cyberneko.org/html/features/scanner/style/strip-cdata-delims";

/**
 * Ignore specified charset found in the
 * {@code <meta equiv='Content-Type' content='text/html;charset=…'>} tag or in
 * the {@code <?xml … encoding='…'>} processing instruction.
 */
public static final String IGNORE_SPECIFIED_CHARSET = "http://cyberneko.org/html/features/scanner/ignore-specified-charset";

/** Scan CDATA sections. */
public static final String CDATA_SECTIONS = "http://cyberneko.org/html/features/scanner/cdata-sections";

/** Override doctype declaration public and system identifiers. */
public static final String OVERRIDE_DOCTYPE = "http://cyberneko.org/html/features/override-doctype";

/** Insert document type declaration. */
public static final String INSERT_DOCTYPE = "http://cyberneko.org/html/features/insert-doctype";

/** Parse {@code <noscript>...</noscript>} content. */
public static final String PARSE_NOSCRIPT_CONTENT = "http://cyberneko.org/html/features/parse-noscript-content";

/** Allows self closing {@code <iframe/>} tag. */
public static final String ALLOW_SELFCLOSING_IFRAME = "http://cyberneko.org/html/features/scanner/allow-selfclosing-iframe";

/** Allows self closing tags e.g. {@code <div/>} (XHTML). */
public static final String ALLOW_SELFCLOSING_TAGS = "http://cyberneko.org/html/features/scanner/allow-selfclosing-tags";

/** Normalize attribute values. */
public static final String NORMALIZE_ATTRIBUTES = "http://cyberneko.org/html/features/scanner/normalize-attrs";

/**
 * Store the plain attribute values also.
*/ public static final String PLAIN_ATTRIBUTE_VALUES = "http://cyberneko.org/html/features/scanner/plain-attr-values"; /** Recognized features. */ private static final String[] RECOGNIZED_FEATURES = { AUGMENTATIONS, REPORT_ERRORS, SCRIPT_STRIP_CDATA_DELIMS, SCRIPT_STRIP_COMMENT_DELIMS, STYLE_STRIP_CDATA_DELIMS, STYLE_STRIP_COMMENT_DELIMS, IGNORE_SPECIFIED_CHARSET, CDATA_SECTIONS, OVERRIDE_DOCTYPE, INSERT_DOCTYPE, NORMALIZE_ATTRIBUTES, PLAIN_ATTRIBUTE_VALUES, PARSE_NOSCRIPT_CONTENT, ALLOW_SELFCLOSING_IFRAME, ALLOW_SELFCLOSING_TAGS, }; /** Recognized features defaults. */ private static final Boolean[] RECOGNIZED_FEATURES_DEFAULTS = { null, null, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.FALSE, Boolean.TRUE, Boolean.FALSE, Boolean.FALSE, }; // properties /** Modify HTML element names: { "upper", "lower", "default" }. */ public static final String NAMES_ELEMS = "http://cyberneko.org/html/properties/names/elems"; /** Modify HTML attribute names: { "upper", "lower", "default" }. */ public static final String NAMES_ATTRS = "http://cyberneko.org/html/properties/names/attrs"; /** Default encoding. */ public static final String DEFAULT_ENCODING = "http://cyberneko.org/html/properties/default-encoding"; /** Error reporter. */ public static final String ERROR_REPORTER = "http://cyberneko.org/html/properties/error-reporter"; /** Encoding translator. */ public static final String ENCODING_TRANSLATOR = "http://cyberneko.org/html/properties/encoding-translator"; /** Doctype declaration public identifier. */ public static final String DOCTYPE_PUBID = "http://cyberneko.org/html/properties/doctype/pubid"; /** Doctype declaration system identifier. */ public static final String DOCTYPE_SYSID = "http://cyberneko.org/html/properties/doctype/sysid"; /** Recognized properties. 
*/ private static final String[] RECOGNIZED_PROPERTIES = { NAMES_ELEMS, NAMES_ATTRS, DEFAULT_ENCODING, ERROR_REPORTER, ENCODING_TRANSLATOR, DOCTYPE_PUBID, DOCTYPE_SYSID}; /** Recognized properties defaults. */ private static final Object[] RECOGNIZED_PROPERTIES_DEFAULTS = { null, null, "Windows-1252", null, StandardEncodingTranslator.INSTANCE, HTML_4_01_TRANSITIONAL_PUBID, HTML_4_01_TRANSITIONAL_SYSID}; // states /** State: content. */ protected static final short STATE_CONTENT = 0; /** State: markup bracket. */ protected static final short STATE_MARKUP_BRACKET = 1; /** State: start document. */ protected static final short STATE_START_DOCUMENT = 10; /** State: end document. */ protected static final short STATE_END_DOCUMENT = 11; // modify HTML names /** Don't modify HTML names. */ protected static final short NAMES_NO_CHANGE = 0; /** Uppercase HTML names. */ protected static final short NAMES_UPPERCASE = 1; /** Lowercase HTML names. */ protected static final short NAMES_LOWERCASE = 2; // defaults /* Default buffer size, 10 cache lines minus overhead * A smaller buffer creates less cache misses compared * to 2048 bytes or more. */ protected static final int DEFAULT_BUFFER_SIZE = (10 * 64) - 24; // debugging /** Set to true to debug changes in the scanner. */ private static final boolean DEBUG_SCANNER = false; /** Set to true to debug changes in the scanner state. */ private static final boolean DEBUG_SCANNER_STATE = false; /** Set to true to debug the buffer. */ private static final boolean DEBUG_BUFFER = false; /** Set to true to debug character encoding handling. */ private static final boolean DEBUG_CHARSET = false; /** Set to true to debug callbacks. */ protected static final boolean DEBUG_CALLBACKS = false; // static vars /** Synthesized event info item. */ protected static final HTMLEventInfo SYNTHESIZED_ITEM = new HTMLEventInfo.SynthesizedItem(); // features /** Augmentations. */ private boolean fAugmentations_; /** Report errors. 
*/ boolean fReportErrors_; /** Strip CDATA delimiters from SCRIPT tags. */ boolean fScriptStripCDATADelims_; /** Strip comment delimiters from SCRIPT tags. */ boolean fScriptStripCommentDelims_; /** Strip CDATA delimiters from STYLE tags. */ boolean fStyleStripCDATADelims_; /** Strip comment delimiters from STYLE tags. */ boolean fStyleStripCommentDelims_; /** Ignore specified character set. */ boolean fIgnoreSpecifiedCharset_; /** CDATA sections. */ boolean fCDATASections_; /** Override doctype declaration public and system identifiers. */ private boolean fOverrideDoctype_; /** Insert document type declaration. */ boolean fInsertDoctype_; /** Normalize attribute values. */ boolean fNormalizeAttributes_; /** Store the plain attribute values also. */ boolean fPlainAttributeValues_; /** Parse noscript content. */ boolean fParseNoScriptContent_; /** Allows self closing iframe tags. */ boolean fAllowSelfclosingIframe_; /** Allows self closing tags. */ boolean fAllowSelfclosingTags_; // properties /** Modify HTML element names. */ protected short fNamesElems; /** Modify HTML attribute names. */ protected short fNamesAttrs; /** Default encoding. */ protected String fDefaultIANAEncoding; /** Error reporter. */ protected HTMLErrorReporter fErrorReporter; /** Error reporter. */ protected EncodingTranslator fEncodingTranslator; /** Doctype declaration public identifier. */ protected String fDoctypePubid; /** Doctype declaration system identifier. */ protected String fDoctypeSysid; // boundary locator information /** Beginning line number. */ protected int fBeginLineNumber; /** Beginning column number. */ protected int fBeginColumnNumber; /** Beginning character offset in the file. */ protected int fBeginCharacterOffset; /** Ending line number. */ protected int fEndLineNumber; /** Ending column number. */ protected int fEndColumnNumber; /** Ending character offset in the file. */ protected int fEndCharacterOffset; // state /** The playback byte stream. 
*/ protected PlaybackInputStream fByteStream; /** Current entity. */ CurrentEntity fCurrentEntity; /** The current entity stack. */ protected final MiniStack fCurrentEntityStack = new MiniStack<>(); /** The current scanner. */ protected Scanner fScanner; /** The current scanner state. */ protected short fScannerState; /** The document handler. */ protected XMLDocumentHandler fDocumentHandler; /** Auto-detected IANA encoding. */ protected String fIANAEncoding; /** Auto-detected Java encoding. */ protected String fJavaEncoding; /** Element count. */ protected int fElementCount; /** Element depth. */ protected int fElementDepth; /** if not empty this is the name of the tag, that requires the special scanner. */ private String fFragmentSpecialScannerTag_; // scanners /** Content scanner. */ protected Scanner fContentScanner = new ContentScanner(); /** * Special scanner used for elements whose content needs to be scanned as plain * text, ignoring markup such as elements and entity references. For example: * <SCRIPT> and <COMMENT>. */ protected final SpecialScanner fSpecialScanner = new SpecialScanner(); /** * Special scanner used script tags. */ protected final ScriptScanner fScriptScanner = new ScriptScanner(); // temp vars /** String buffer. */ protected final XMLString fStringBuffer = new XMLString(); /** String buffer used when resolving entity refs. */ final XMLString fStringBufferEntiyRef = new XMLString(); final XMLString fStringBufferPlainAttribValue = new XMLString(); final XMLString fScanUntilEndTag = new XMLString(); final XMLString fScanComment = new XMLString(); private final XMLString fScanLiteral = new XMLString(); /** Single boolean array. 
*/ final boolean[] fSingleBoolean = {false}; final HTMLConfiguration htmlConfiguration_; /** * Our location item, to be reused because {@link Augmentations} * says so, so let's save on memory */ private final LocationItem fLocationItem = new LocationItem(); /** * Creates a new HTMLScanner with the given configuration * * @param htmlConfiguration the configuration to use */ HTMLScanner(final HTMLConfiguration htmlConfiguration) { htmlConfiguration_ = htmlConfiguration; } /** * Pushes an input source onto the current entity stack. This enables the * scanner to transparently scan new content (e.g. the output written by an * embedded script). At the end of the current entity, the scanner returns where * it left off at the time this entity source was pushed. *

* Note: This functionality is experimental at this time and is * subject to change in future releases of NekoHTML. * * @param inputSource The new input source to start scanning. * @see #evaluateInputSource(XMLInputSource) */ public void pushInputSource(final XMLInputSource inputSource) { final Reader reader = getReader(inputSource); fCurrentEntityStack.push(fCurrentEntity); final String encoding = inputSource.getEncoding(); final String publicId = inputSource.getPublicId(); final String baseSystemId = inputSource.getBaseSystemId(); final String literalSystemId = inputSource.getSystemId(); final String expandedSystemId = expandSystemId(literalSystemId, baseSystemId); fCurrentEntity = new CurrentEntity(reader, encoding, publicId, baseSystemId, literalSystemId, expandedSystemId); } private Reader getReader(final XMLInputSource inputSource) { final Reader reader = inputSource.getCharacterStream(); if (reader == null) { try { if (StandardEncodingTranslator.REPLACEMENT.equalsIgnoreCase(fJavaEncoding)) { return new StringReader("\uFFFD"); } return new InputStreamReader(inputSource.getByteStream(), fJavaEncoding); } catch (final UnsupportedEncodingException e) { // should not happen as this encoding is already used to parse the "main" source } } return reader; } /** * Immediately evaluates an input source and add the new content (e.g. the * output written by an embedded script). * * @param inputSource The new input source to start evaluating. 
* @see #pushInputSource(XMLInputSource) */ public void evaluateInputSource(final XMLInputSource inputSource) { final Scanner previousScanner = fScanner; final short previousScannerState = fScannerState; final CurrentEntity previousEntity = fCurrentEntity; final Reader reader = getReader(inputSource); final String encoding = inputSource.getEncoding(); final String publicId = inputSource.getPublicId(); final String baseSystemId = inputSource.getBaseSystemId(); final String literalSystemId = inputSource.getSystemId(); final String expandedSystemId = expandSystemId(literalSystemId, baseSystemId); fCurrentEntity = new CurrentEntity(reader, encoding, publicId, baseSystemId, literalSystemId, expandedSystemId); setScanner(fContentScanner); setScannerState(STATE_CONTENT); try { while (fScanner.scan(false)) { // do nothing } } catch (final IOException e) { // ignore } // preserve the plaintext scanning process setScanner(fScanner instanceof PlainTextScanner ? new PlainTextScanner() : previousScanner); setScannerState(previousScannerState); fCurrentEntity = previousEntity; } /** * Cleans up used resources. For example, if scanning is terminated early, then * this method ensures all remaining open streams are closed. * * @param closeall Close all streams, including the original. This is used in * cases when the application has opened the original document * stream and should be responsible for closing it. */ public void cleanup(final boolean closeall) { final int size = fCurrentEntityStack.size(); if (size > 0) { // current entity is not the original, so close it if (fCurrentEntity != null) { fCurrentEntity.closeQuietly(); } // close remaining streams for (int i = closeall ? 0 : 1; i < size; i++) { fCurrentEntity = fCurrentEntityStack.pop(); fCurrentEntity.closeQuietly(); } } else if (closeall && fCurrentEntity != null) { fCurrentEntity.closeQuietly(); } } /** Returns the encoding. */ @Override public String getEncoding() { return fCurrentEntity != null ? 
fCurrentEntity.encoding_ : null; } /** Returns the public identifier. */ @Override public String getPublicId() { return fCurrentEntity != null ? fCurrentEntity.publicId : null; } /** Returns the base system identifier. */ @Override public String getBaseSystemId() { return fCurrentEntity != null ? fCurrentEntity.baseSystemId : null; } /** Returns the literal system identifier. */ @Override public String getLiteralSystemId() { return fCurrentEntity != null ? fCurrentEntity.literalSystemId : null; } /** Returns the expanded system identifier. */ @Override public String getExpandedSystemId() { return fCurrentEntity != null ? fCurrentEntity.expandedSystemId : null; } /** Returns the current line number. */ @Override public int getLineNumber() { return fCurrentEntity != null ? fCurrentEntity.getLineNumber() : -1; } /** Returns the current column number. */ @Override public int getColumnNumber() { return fCurrentEntity != null ? fCurrentEntity.getColumnNumber() : -1; } /** Returns the XML version. */ @Override public String getXMLVersion() { return fCurrentEntity != null ? fCurrentEntity.version : null; } /** Returns the character offset. */ @Override public int getCharacterOffset() { return fCurrentEntity != null ? fCurrentEntity.getCharacterOffset() : -1; } /** Returns the default state for a feature. */ @Override public Boolean getFeatureDefault(final String featureId) { final int length = RECOGNIZED_FEATURES != null ? RECOGNIZED_FEATURES.length : 0; for (int i = 0; i < length; i++) { if (RECOGNIZED_FEATURES[i].equals(featureId)) { return RECOGNIZED_FEATURES_DEFAULTS[i]; } } return null; } /** Returns the default state for a property. */ @Override public Object getPropertyDefault(final String propertyId) { final int length = RECOGNIZED_PROPERTIES != null ? RECOGNIZED_PROPERTIES.length : 0; for (int i = 0; i < length; i++) { if (RECOGNIZED_PROPERTIES[i].equals(propertyId)) { return RECOGNIZED_PROPERTIES_DEFAULTS[i]; } } return null; } /** Returns recognized features. 
*/ @Override public String[] getRecognizedFeatures() { return RECOGNIZED_FEATURES; } /** Returns recognized properties. */ @Override public String[] getRecognizedProperties() { return RECOGNIZED_PROPERTIES; } /** Resets the component. */ @Override public void reset(final XMLComponentManager manager) throws XMLConfigurationException { // get features fAugmentations_ = manager.getFeature(AUGMENTATIONS); fReportErrors_ = manager.getFeature(REPORT_ERRORS); fScriptStripCDATADelims_ = manager.getFeature(SCRIPT_STRIP_CDATA_DELIMS); fScriptStripCommentDelims_ = manager.getFeature(SCRIPT_STRIP_COMMENT_DELIMS); fStyleStripCDATADelims_ = manager.getFeature(STYLE_STRIP_CDATA_DELIMS); fStyleStripCommentDelims_ = manager.getFeature(STYLE_STRIP_COMMENT_DELIMS); fIgnoreSpecifiedCharset_ = manager.getFeature(IGNORE_SPECIFIED_CHARSET); fCDATASections_ = manager.getFeature(CDATA_SECTIONS); fOverrideDoctype_ = manager.getFeature(OVERRIDE_DOCTYPE); fInsertDoctype_ = manager.getFeature(INSERT_DOCTYPE); fNormalizeAttributes_ = manager.getFeature(NORMALIZE_ATTRIBUTES); fPlainAttributeValues_ = manager.getFeature(PLAIN_ATTRIBUTE_VALUES); fParseNoScriptContent_ = manager.getFeature(PARSE_NOSCRIPT_CONTENT); fAllowSelfclosingIframe_ = manager.getFeature(ALLOW_SELFCLOSING_IFRAME); fAllowSelfclosingTags_ = manager.getFeature(ALLOW_SELFCLOSING_TAGS); // get properties fNamesElems = getNamesValue(String.valueOf(manager.getProperty(NAMES_ELEMS))); fNamesAttrs = getNamesValue(String.valueOf(manager.getProperty(NAMES_ATTRS))); fDefaultIANAEncoding = String.valueOf(manager.getProperty(DEFAULT_ENCODING)); fErrorReporter = (HTMLErrorReporter) manager.getProperty(ERROR_REPORTER); fEncodingTranslator = (EncodingTranslator) manager.getProperty(ENCODING_TRANSLATOR); fDoctypePubid = String.valueOf(manager.getProperty(DOCTYPE_PUBID)); fDoctypeSysid = String.valueOf(manager.getProperty(DOCTYPE_SYSID)); final QName[] fragmentContextStack = (QName[]) manager.getProperty(HTMLTagBalancer.FRAGMENT_CONTEXT_STACK); 
if (fragmentContextStack != null) { final int length = fragmentContextStack.length; if (length > 0) { final QName lastQname = fragmentContextStack[length - 1]; final String name = lastQname.getLocalpart(); final String nameLC = name.toLowerCase(Locale.ROOT); if (htmlConfiguration_.getHtmlElements().getElement(nameLC).isSpecial()) { fFragmentSpecialScannerTag_ = name; } } } } /** Sets a feature. */ @Override public void setFeature(final String featureId, final boolean state) { if (featureId.equals(AUGMENTATIONS)) { fAugmentations_ = state; } else if (featureId.equals(IGNORE_SPECIFIED_CHARSET)) { fIgnoreSpecifiedCharset_ = state; } else if (featureId.equals(SCRIPT_STRIP_CDATA_DELIMS)) { fScriptStripCDATADelims_ = state; } else if (featureId.equals(SCRIPT_STRIP_COMMENT_DELIMS)) { fScriptStripCommentDelims_ = state; } else if (featureId.equals(STYLE_STRIP_CDATA_DELIMS)) { fStyleStripCDATADelims_ = state; } else if (featureId.equals(STYLE_STRIP_COMMENT_DELIMS)) { fStyleStripCommentDelims_ = state; } else if (featureId.equals(PARSE_NOSCRIPT_CONTENT)) { fParseNoScriptContent_ = state; } else if (featureId.equals(ALLOW_SELFCLOSING_IFRAME)) { fAllowSelfclosingIframe_ = state; } else if (featureId.equals(ALLOW_SELFCLOSING_TAGS)) { fAllowSelfclosingTags_ = state; } } /** Sets a property. */ @Override public void setProperty(final String propertyId, final Object value) throws XMLConfigurationException { if (propertyId.equals(NAMES_ELEMS)) { fNamesElems = getNamesValue(String.valueOf(value)); return; } if (propertyId.equals(NAMES_ATTRS)) { fNamesAttrs = getNamesValue(String.valueOf(value)); return; } if (propertyId.equals(DEFAULT_ENCODING)) { fDefaultIANAEncoding = String.valueOf(value); return; } } /** * Sets the input source. * * @param source The input source. * * @throws IOException Thrown on i/o error. 
*/
public void setInputSource(final XMLInputSource source) throws IOException {
    // reset state
    fElementCount = 0;
    fElementDepth = -1;
    fByteStream = null;
    fCurrentEntityStack.clear();

    fBeginLineNumber = 1;
    fBeginColumnNumber = 1;
    fBeginCharacterOffset = 0;
    fEndLineNumber = fBeginLineNumber;
    fEndColumnNumber = fBeginColumnNumber;
    fEndCharacterOffset = fBeginCharacterOffset;

    // reset encoding information
    fIANAEncoding = fDefaultIANAEncoding;
    fJavaEncoding = fIANAEncoding;

    // get location information
    final String encoding = source.getEncoding();
    final String publicId = source.getPublicId();
    final String baseSystemId = source.getBaseSystemId();
    final String literalSystemId = source.getSystemId();
    final String expandedSystemId = expandSystemId(literalSystemId, baseSystemId);

    // open stream
    Reader reader = source.getCharacterStream();
    if (reader == null) {
        InputStream inputStream = source.getByteStream();
        if (inputStream == null) {
            // neither a reader nor a byte stream was supplied: fetch the system id
            final URL url = new URL(expandedSystemId);
            inputStream = url.openStream();
        }
        fByteStream = new PlaybackInputStream(inputStream);

        // encodings[0] ends up as the IANA name, encodings[1] as the Java name
        final String[] encodings = new String[2];
        if (encoding == null) {
            fByteStream.detectEncoding(encodings);
        }
        else {
            encodings[0] = encoding;
        }
        if (encodings[0] == null) {
            // nothing specified and nothing detected: fall back to the default
            encodings[0] = fDefaultIANAEncoding;
            if (fReportErrors_) {
                fErrorReporter.reportWarning("HTML1000", null);
            }
        }
        if (encodings[1] == null) {
            encodings[1] = fEncodingTranslator.encodingNameFromLabel(encodings[0]);
            if (encodings[1] == null
                    || (!StandardEncodingTranslator.REPLACEMENT.equalsIgnoreCase(encodings[1])
                            && !Charset.isSupported(encodings[1]))) {
                // no usable translation: try the IANA name as the Java name
                encodings[1] = encodings[0];
                if (fReportErrors_) {
                    fErrorReporter.reportWarning("HTML1001", new Object[] {encodings[0]});
                }
            }
        }
        fIANAEncoding = encodings[0];
        fJavaEncoding = encodings[1];
        if (StandardEncodingTranslator.REPLACEMENT.equalsIgnoreCase(fJavaEncoding)) {
            // the "replacement" encoding yields a single U+FFFD instead of the content
            reader = new BufferedReader(new StringReader("\uFFFD"));
        }
        else {
            reader = new BufferedReader(new InputStreamReader(fByteStream, fJavaEncoding));
        }
    }
    fCurrentEntity = new CurrentEntity(reader, fIANAEncoding, publicId, baseSystemId, literalSystemId, expandedSystemId);

    // set scanner and state
    if (fFragmentSpecialScannerTag_ != null) {
        final String tagLC = fFragmentSpecialScannerTag_.toLowerCase(Locale.ROOT);
        if ("script".equals(tagLC)) {
            setScanner(fScriptScanner);
        }
        else if ("plaintext".equals(tagLC)) {
            setScanner(new PlainTextScanner());
        }
        else {
            setScanner(fSpecialScanner.setElementName(fFragmentSpecialScannerTag_));
            setScannerState(STATE_CONTENT);
        }
    }
    else {
        setScanner(fContentScanner);
        setScannerState(STATE_START_DOCUMENT);
    }
}

/**
 * Scans a document.
 *
 * @param complete True if the scanner should scan the document completely,
 *                 pushing all events to the registered document handler. A
 *                 value of false indicates that the scanner should only
 *                 scan the next portion of the document and return. A scanner
 *                 instance is permitted to completely scan a document if it
 *                 does not support this "pull" scanning model.
 *
 * @return True if there is more to scan, false otherwise.
 * @throws IOException  Thrown on i/o error.
 * @throws XNIException on error.
 */
public boolean scanDocument(final boolean complete) throws XNIException, IOException {
    do {
        if (!fScanner.scan(complete)) {
            return false;
        }
    }
    while (complete);
    return true;
}

/** Sets the document handler. */
@Override
public void setDocumentHandler(final XMLDocumentHandler handler) {
    fDocumentHandler = handler;
}

/** Returns the document handler. */
@Override
public XMLDocumentHandler getDocumentHandler() {
    return fDocumentHandler;
}

// Returns the value of the specified attribute, ignoring case.
protected static String getValue(final XMLAttributes attrs, final String aname) { if (attrs != null) { final int length = attrs.getLength(); for (int i = 0; i < length; i++) { if (attrs.getQName(i).equalsIgnoreCase(aname)) { return attrs.getValue(i); } } } return null; } /** * Expands a system id and returns the system id as a URI, if it can be * expanded. A return value of null means that the identifier is already * expanded. An exception thrown indicates a failure to expand the id. * * @param systemId The systemId to be expanded. * @param baseSystemId baseSystemId * * @return Returns the URI string representing the expanded system identifier. A * null value indicates that the given system identifier is already * expanded. * */ @SuppressWarnings("unused") public static String expandSystemId(final String systemId, final String baseSystemId) { // check for bad parameters id if (systemId == null || systemId.length() == 0) { return systemId; } // if id already expanded, return try { new URI(systemId); return systemId; } catch (final URI.MalformedURIException e) { // continue on... 
} // normalize id final String id = fixURI(systemId); // normalize base URI base; URI uri = null; try { if (baseSystemId == null || baseSystemId.length() == 0 || baseSystemId.equals(systemId)) { String dir; try { dir = fixURI(System.getProperty("user.dir")) // deal with blanks in paths; maybe we have to do better uri encoding here .replaceAll(" ", "%20"); } catch (final SecurityException se) { dir = ""; } if (!dir.endsWith("/")) { dir = dir + "/"; } base = new URI("file", "", dir, null, null); } else { try { base = new URI(fixURI(baseSystemId)); } catch (final URI.MalformedURIException e) { String dir; try { dir = fixURI(System.getProperty("user.dir")) // deal with blanks in paths; maybe we have to do better uri encoding here .replaceAll(" ", "%20"); } catch (final SecurityException se) { dir = ""; } if (baseSystemId.indexOf(':') != -1) { // for xml schemas we might have baseURI with // a specified drive base = new URI("file", "", fixURI(baseSystemId), null, null); } else { if (!dir.endsWith("/")) { dir = dir + "/"; } dir = dir + fixURI(baseSystemId); base = new URI("file", "", dir, null, null); } } } // expand id uri = new URI(base, id); } catch (final URI.MalformedURIException e) { // let it go through } if (uri == null) { return systemId; } return uri.toString(); } /** * Fixes a platform dependent filename to standard URI form. * * @param str The string to fix. * * @return Returns the fixed URI string. 
*/ protected static String fixURI(String str) { // handle platform dependent strings str = str.replace(java.io.File.separatorChar, '/'); // Windows fix if (str.length() >= 2) { final char ch1 = str.charAt(1); // change "C:blah" to "/C:blah" if (ch1 == ':') { final char ch0 = String.valueOf(str.charAt(0)).toUpperCase(Locale.ROOT).charAt(0); if (ch0 >= 'A' && ch0 <= 'Z') { str = "/" + str; } } // change "//blah" to "file://blah" else if (ch1 == '/' && str.charAt(0) == '/') { str = "file:" + str; } } // done return str; } // Modifies the given name based on the specified mode. protected static String modifyName(final String name, final short mode) { switch (mode) { case NAMES_UPPERCASE: return name.toUpperCase(Locale.ROOT); case NAMES_LOWERCASE: return name.toLowerCase(Locale.ROOT); } return name; } // Converts HTML names string value to constant value. // // @see #NAMES_NO_CHANGE // @see #NAMES_LOWERCASE // @see #NAMES_UPPERCASE protected static short getNamesValue(final String value) { if ("lower".equals(value)) { return NAMES_LOWERCASE; } if ("upper".equals(value)) { return NAMES_UPPERCASE; } return NAMES_NO_CHANGE; } // debugging // Sets the scanner. protected void setScanner(final Scanner scanner) { fScanner = scanner; if (DEBUG_SCANNER) { System.out.print("$$$ setScanner("); System.out.print(scanner != null ? scanner.getClass().getName() : "null"); System.out.println(");"); } } // Sets the scanner state. protected void setScannerState(final short state) { fScannerState = state; if (DEBUG_SCANNER_STATE) { System.out.print("$$$ setScannerState("); switch (fScannerState) { case STATE_CONTENT: { System.out.print("STATE_CONTENT"); break; } case STATE_MARKUP_BRACKET: { System.out.print("STATE_MARKUP_BRACKET"); break; } case STATE_START_DOCUMENT: { System.out.print("STATE_START_DOCUMENT"); break; } case STATE_END_DOCUMENT: { System.out.print("STATE_END_DOCUMENT"); break; } } System.out.println(");"); } } // scanning // Scans a DOCTYPE line. 
/**
 * Scans the part of a DOCTYPE declaration that follows the "DOCTYPE" keyword
 * and reports it to the document handler (if one is set).
 * <p>
 * The root name is passed through {@link #modifyName(String, short)}; a missing
 * root name is reported as HTML1014 when error reporting is enabled. After the
 * optional PUBLIC / SYSTEM identifiers the rest of the declaration is consumed
 * up to and including the closing '&gt;'. A '&lt;' is pushed back and ends the
 * declaration, a '[' hands over to {@link #skipMarkup(boolean)}.
 *
 * @throws IOException in case of io problems; an EOFException is thrown by
 *         {@link #scanLiteral()} if the input ends inside a quoted literal
 */
protected void scanDoctype() throws IOException {
    String root = null;
    String pubid = null;
    String sysid = null;

    if (skipSpaces()) {
        root = scanName(true);
        if (root == null) {
            if (fReportErrors_) {
                fErrorReporter.reportError("HTML1014", null);
            }
        }
        else {
            root = modifyName(root, fNamesElems);
        }
        if (skipSpaces()) {
            if (skip("PUBLIC")) {
                skipSpaces();
                pubid = scanLiteral();
                if (skipSpaces()) {
                    sysid = scanLiteral();
                }
            }
            else if (skip("SYSTEM")) {
                skipSpaces();
                sysid = scanLiteral();
            }
        }
    }

    // consume whatever is left of the declaration
    int c;
    while ((c = fCurrentEntity.read()) != -1) {
        if (c == '<') {
            // start of the next markup; leave it for the caller
            fCurrentEntity.rewind();
            break;
        }
        if (c == '>') {
            break;
        }
        if (c == '[') {
            skipMarkup(true);
            break;
        }
    }

    if (fDocumentHandler != null) {
        if (fOverrideDoctype_) {
            pubid = fDoctypePubid;
            sysid = fDoctypeSysid;
        }
        fEndLineNumber = fCurrentEntity.getLineNumber();
        fEndColumnNumber = fCurrentEntity.getColumnNumber();
        fEndCharacterOffset = fCurrentEntity.getCharacterOffset();
        fDocumentHandler.doctypeDecl(root, pubid, sysid, locationAugs());
    }
}

/**
 * Scans a literal enclosed in single or double quotes.
 * <p>
 * Newlines inside the literal are collapsed to a single space. A '&lt;' inside
 * the literal is pushed back and terminates it.
 *
 * @return the literal without the quotes, or null if the next character is
 *         not a quote (that character is pushed back) or the input is exhausted
 * @throws IOException in case of io problems; an EOFException (after reporting
 *         HTML1007 when error reporting is enabled) if the input ends inside
 *         the literal
 */
protected String scanLiteral() throws IOException {
    final int quote = fCurrentEntity.read();
    if (quote == '"' || quote == '\'') {
        final XMLString str = fScanLiteral.clear();
        int c;
        while ((c = fCurrentEntity.read()) != -1) {
            if (c == quote) {
                break;
            }
            if (c == '\n' || c == '\r') {
                fCurrentEntity.rewind();
                // NOTE: This collapses newlines to a single space.
                // [Q] Is this the right thing to do here? -Ac
                skipNewlines();
                str.append(' ');
            }
            else if (c == '<') {
                fCurrentEntity.rewind();
                break;
            }
            else {
                if (!str.appendCodePoint(c)) {
                    if (fReportErrors_) {
                        fErrorReporter.reportError("HTML1005", new Object[] {"&#" + c + ';'});
                    }
                }
            }
        }
        if (c == -1) {
            if (fReportErrors_) {
                fErrorReporter.reportError("HTML1007", null);
            }
            throw new EOFException();
        }
        return str.toString();
    }

    // read() does not advance the position when it reports the end of the input,
    // so only push back if a character was really consumed; rewinding at the end
    // of the input would move the offset in front of the consumed data
    if (quote != -1) {
        fCurrentEntity.rewind();
    }
    return null;
}

// Scans a name.
/**
 * Scans a name starting at the current position of the current entity.
 * <p>
 * In strict mode the name ends in front of the first character that is neither
 * a letter or digit nor one of '-', '.', ':' and '_'. In non-strict mode the
 * name ends in front of the first whitespace character, '=', '/' or '&gt;'.
 * The terminating character is pushed back.
 *
 * @param strict whether to use the strict character set described above
 * @return the scanned name or null if no character could be read or the name
 *         would be empty
 * @throws IOException in case of io problems
 */
protected String scanName(final boolean strict) throws IOException {
    if (DEBUG_BUFFER) {
        fCurrentEntity.debugBufferIfNeeded("(scanName: ");
    }
    if (fCurrentEntity.offset_ == fCurrentEntity.length_) {
        if (fCurrentEntity.load(0) == -1) {
            if (DEBUG_BUFFER) {
                fCurrentEntity.debugBufferIfNeeded(")scanName: ");
            }
            return null;
        }
    }
    // start of the name inside the buffer; reset to 0 whenever the buffer is compacted below
    int offset = fCurrentEntity.offset_;
    while (true) {
        while (fCurrentEntity.hasNext()) {
            final char c = fCurrentEntity.getNextChar();
            // this has been split up to cater to the needs of branch prediction
            if (strict && (!Character.isLetterOrDigit(c) && c != '-' && c != '.' && c != ':' && c != '_')) {
                fCurrentEntity.rewind();
                break;
            }
            // we check for the regular space first because isWhitespace is not inlineable and hence expensive
            // regular space should be the norm as well as newlines
            else if (!strict && (c == ' ' || c == '\n' || c == '=' || c == '/' || c == '>' || Character.isWhitespace(c))) {
                fCurrentEntity.rewind();
                break;
            }
        }
        if (fCurrentEntity.offset_ == fCurrentEntity.length_) {
            // buffer exhausted in the middle of the name: move what we have to the
            // front of the buffer and load more characters behind it
            final int length = fCurrentEntity.length_ - offset;
            System.arraycopy(fCurrentEntity.buffer_, offset, fCurrentEntity.buffer_, 0, length);
            final int count = fCurrentEntity.load(length);
            offset = 0;
            if (count == -1) {
                break;
            }
        }
        else {
            // stopped in front of a terminating character
            break;
        }
    }
    final int length = fCurrentEntity.offset_ - offset;
    final String name = length > 0 ? new String(fCurrentEntity.buffer_, offset, length) : null;
    if (DEBUG_BUFFER) {
        fCurrentEntity.debugBufferIfNeeded(")scanName: ", " -> \"" + name + '"');
    }
    return name;
}

// Scans a tag name.
protected String scanTagName() throws IOException { if (DEBUG_BUFFER) { fCurrentEntity.debugBufferIfNeeded("(scanName: "); } if (fCurrentEntity.offset_ == fCurrentEntity.length_) { if (fCurrentEntity.load(0) == -1) { if (DEBUG_BUFFER) { fCurrentEntity.debugBufferIfNeeded(")scanName: "); } return null; } } int offset = fCurrentEntity.offset_; boolean isFirst = true; while (true) { while (fCurrentEntity.hasNext()) { final char c = fCurrentEntity.getNextChar(); if (isFirst) { isFirst = false; // first char has to be ASCII alpha if (!('A' <= c && c <= 'Z' || 'a' <= c && c <= 'z')) { fCurrentEntity.rewind(); break; } } else { if (c == '\t' || c == '\r' || c == '\n' || c == ' ' || c == 0 || c == '/' || c == '>') { fCurrentEntity.rewind(); break; } } } if (fCurrentEntity.offset_ == fCurrentEntity.length_) { final int length = fCurrentEntity.length_ - offset; System.arraycopy(fCurrentEntity.buffer_, offset, fCurrentEntity.buffer_, 0, length); final int count = fCurrentEntity.load(length); offset = 0; if (count == -1) { break; } } else { break; } } final int length = fCurrentEntity.offset_ - offset; final String name = length > 0 ? new String(fCurrentEntity.buffer_, offset, length) : null; if (DEBUG_BUFFER) { fCurrentEntity.debugBufferIfNeeded(")scanName: ", " -> \"" + name + '"'); } return name; } // Scans an entity reference. 
/**
 * Scans a character reference; the leading '&amp;' has already been consumed
 * by the caller.
 * <p>
 * Numeric references ("&amp;#...") are handled by HTMLUnicodeEntitiesParser,
 * named references by HTMLNamedEntitiesParser. If nothing can be resolved, the
 * consumed characters are pushed back and only the '&amp;' is kept.
 *
 * @param str working buffer; on return it holds the resolved value, the raw
 *        text (attribute case described in the method body) or just "&amp;"
 * @param plainValue if not null, the raw text consumed for the reference (or
 *        just '&amp;' if nothing was resolved) is appended to it
 * @param content true if scanning content; the result is then forwarded to
 *        the document handler by {@link #returnEntityRefString(XMLString, boolean)}
 * @return the value returned by returnEntityRefString (always -1)
 * @throws IOException in case of io problems
 */
protected int scanEntityRef(final XMLString str, final XMLString plainValue, final boolean content) throws IOException {
    str.clearAndAppend('&');

    // use readPreservingBufferContent inside this method to be sure we can rewind
    int nextChar = readPreservingBufferContent();
    if (nextChar == -1) {
        // a lone '&' at the very end of the input
        if (plainValue != null) {
            plainValue.append(str);
        }
        return returnEntityRefString(str, content);
    }
    str.append((char) nextChar);

    if ('#' == nextChar) {
        // numeric character reference
        final HTMLUnicodeEntitiesParser parser = new HTMLUnicodeEntitiesParser();

        // feed characters to the parser until it stops accepting them
        do {
            nextChar = readPreservingBufferContent();
            if (nextChar != -1) {
                str.append((char) nextChar);
            }
        }
        while (nextChar != -1 && parser.parseNumeric(nextChar));

        final String match = parser.getMatch();
        if (match == null) {
            // not a valid numeric reference: push back everything except the '&'
            final String consumed = str.toString();
            fCurrentEntity.rewind(consumed.length() - 1);
            if (plainValue != null) {
                plainValue.append('&');
            }
            str.clearAndAppend('&');
        }
        else {
            fCurrentEntity.rewind(parser.getRewindCount());
            if (plainValue != null) {
                plainValue.append(str);
            }
            str.clear().append(match);
        }
        return returnEntityRefString(str, content);
    }

    // we read regular entities such as &lt; here
    // number of characters consumed after the '&'
    int readCount = 1;

    // this will be our state of the parsing, we have to feed that back to the parser
    HTMLNamedEntitiesParser.State result = null;

    // in case of incorrect entities such as &notin where we are supposed to recognize
    // &not, we have to keep the last matching state, so we can fall back to it
    HTMLNamedEntitiesParser.State lastMatchingResult = null;

    while (nextChar != -1) {
        final HTMLNamedEntitiesParser.State intermediateResult = HTMLNamedEntitiesParser.get().lookup(nextChar, result);

        if (intermediateResult.endNode_) {
            result = intermediateResult;
            break;
        }
        if (intermediateResult == result) {
            // nothing changed, more characters have not done anything
            break;
        }
        if (intermediateResult.isMatch_) {
            lastMatchingResult = intermediateResult;
        }
        result = intermediateResult;

        nextChar = readPreservingBufferContent();
        if (nextChar != -1) {
            str.append((char) nextChar);
            readCount++;
        }
    }

    // it might happen that we read &lta but need just &lt so
    // we have to go back to the last match
    if (!result.isMatch_ && lastMatchingResult != null) {
        result = lastMatchingResult;
    }

    // hopefully, we got something, otherwise we have to go
    // the error route
    if (result.isMatch_) {
        // in case we overrun because the entity was broken or
        // not terminated by a ;, we have to reset the char
        // position because we read one more char than the entity has
        fCurrentEntity.rewind(readCount - result.length_);

        // if we have a correct character that is terminated by ;
        // we can keep things simple
        if (result.endsWithSemicolon_) {
            if (plainValue != null) {
                plainValue.append(str);
            }
            str.clear().append(result.resolvedValue_);
        }
        else {
            if (fReportErrors_) {
                fErrorReporter.reportWarning("HTML1004", null);
            }

            // If there is a match
            // {
            //      If the character reference was consumed as part of an attribute, and the last character matched is not
            //      a U+003B SEMICOLON character (;), and the next input character is either a U+003D EQUALS SIGN character (=)
            //      or an ASCII alphanumeric,
            //      then, for historical reasons, flush code points consumed as a character reference and switch to the return state.
            //      Otherwise:
            //      1. If the last character matched is not a U+003B SEMICOLON character (;), then this is a missing-semicolon-after-character-reference parse error.
            //      2. Set the temporary buffer to the empty string. Append one or two characters corresponding to the character reference name
            //         (as given by the second column of the named character references table) to the temporary buffer.
            //      3. Flush code points consumed as a character reference. Switch to the return state.
            // }
            // Otherwise
            // {
            //      Flush code points consumed as a character reference. Switch to the ambiguous ampersand state.
            // }
            if (content) {
                if (plainValue != null) {
                    plainValue.append(str);
                }
                str.clear().append(result.resolvedValue_);
            }
            else {
                // look ahead
                // 13.2.5.73
                // index in str of the character following the matched name ('&' + name)
                final int matchLength = result.length_ + 1;
                if (matchLength < str.length()) {
                    nextChar = str.charAt(matchLength);
                    if ('=' == nextChar
                            || '0' <= nextChar && nextChar <= '9'
                            || 'A' <= nextChar && nextChar <= 'Z'
                            || 'a' <= nextChar && nextChar <= 'z') {
                        // historical attribute handling: keep the raw text and do not resolve
                        // we just shorten our temp str instead of copying stuff around
                        str.shortenBy(str.length() - result.length_ - 1);
                        if (plainValue != null) {
                            plainValue.append(str);
                        }
                    }
                    else {
                        if (plainValue != null) {
                            plainValue.append(str);
                        }
                        str.clear().append(result.resolvedValue_);
                    }
                }
                else {
                    if (plainValue != null) {
                        plainValue.append(str);
                    }
                    str.clear().append(result.resolvedValue_);
                }
            }
        }
    }
    else {
        // Entity not found, rewind and continue
        // broken from here, aka keeping everything
        fCurrentEntity.rewind(readCount);
        if (plainValue != null) {
            plainValue.append('&');
        }
        str.clearAndAppend('&');
    }

    return returnEntityRefString(str, content);
}

/**
 * Finishes the scan of a character reference. When scanning content and a
 * document handler is set and fElementCount has reached fElementDepth, the
 * end location is updated and the text is reported as characters.
 *
 * @param str the text to report
 * @param content true if the reference was found in content
 * @return always -1
 */
private int returnEntityRefString(final XMLString str, final boolean content) {
    if (content && fDocumentHandler != null && fElementCount >= fElementDepth) {
        fEndLineNumber = fCurrentEntity.getLineNumber();
        fEndColumnNumber = fCurrentEntity.getColumnNumber();
        fEndCharacterOffset = fCurrentEntity.getCharacterOffset();
        fDocumentHandler.characters(str, locationAugs());
    }
    return -1;
}

/**
 * Checks whether the given text is next in the input and skips it if so.
 * The comparison is done on the upper cased characters, hence it is
 * case-insensitive.
 *
 * @param s the text to look for; null is treated like the empty string
 * @return true if the text was present and has been skipped; false otherwise,
 *         in which case the characters read so far are pushed back
 * @throws IOException in case of io problems
 */
protected boolean skip(final String s) throws IOException {
    final int length = s != null ? s.length() : 0;
    for (int i = 0; i < length; i++) {
        if (fCurrentEntity.offset_ == fCurrentEntity.length_) {
            // buffer exhausted: keep the i characters matched so far at the front
            // of the buffer and load more behind them
            System.arraycopy(fCurrentEntity.buffer_, fCurrentEntity.offset_ - i, fCurrentEntity.buffer_, 0, i);
            if (fCurrentEntity.load(i) == -1) {
                // input ended inside the candidate text; go back to the start of the kept characters
                fCurrentEntity.offset_ = 0;
                return false;
            }
        }
        char c0 = s.charAt(i);
        char c1 = fCurrentEntity.getNextChar();
        c0 = String.valueOf(c0).toUpperCase(Locale.ROOT).charAt(0);
        c1 = String.valueOf(c1).toUpperCase(Locale.ROOT).charAt(0);
        if (c0 != c1) {
            // mismatch: push back everything read by this call
            fCurrentEntity.rewind(i + 1);
            return false;
        }
    }
    return true;
}

/**
 * Skips markup up to and including the '&gt;' that closes it, or up to the
 * end of the input.
 *
 * @param balance if true, every '&lt;' found on the way requires one more
 *        closing '&gt;'
 * @return true if a "/&gt;" sequence was encountered while skipping
 * @throws IOException in case of io problems
 */
protected boolean skipMarkup(final boolean balance) throws IOException {
    if (DEBUG_BUFFER) {
        fCurrentEntity.debugBufferIfNeeded("(skipMarkup: ");
    }
    // number of '>' still needed to finish
    int depth = 1;
    boolean slashgt = false;
    OUTER: while (true) {
        if (fCurrentEntity.offset_ == fCurrentEntity.length_) {
            if (fCurrentEntity.load(0) == -1) {
                break OUTER;
            }
        }
        while (fCurrentEntity.hasNext()) {
            char c = fCurrentEntity.getNextChar();
            if (balance && c == '<') {
                depth++;
            }
            else if (c == '>') {
                depth--;
                if (depth == 0) {
                    break OUTER;
                }
            }
            else if (c == '/') {
                // look at the next character to detect "/>"
                if (fCurrentEntity.offset_ == fCurrentEntity.length_) {
                    if (fCurrentEntity.load(0) == -1) {
                        break OUTER;
                    }
                }
                c = fCurrentEntity.getNextChar();
                if (c == '>') {
                    slashgt = true;
                    depth--;
                    if (depth == 0) {
                        break OUTER;
                    }
                }
                else {
                    fCurrentEntity.rewind();
                }
            }
            else if (c == '\r' || c == '\n') {
                // let skipNewlines do the line counting
                fCurrentEntity.rewind();
                skipNewlines();
            }
        }
    }
    if (DEBUG_BUFFER) {
        fCurrentEntity.debugBufferIfNeeded(")skipMarkup: ", " -> " + slashgt);
    }
    return slashgt;
}

// Skips whitespace.
/**
 * Skips all whitespace characters (as defined by Character.isWhitespace)
 * starting at the current position. Newlines are handed over to
 * {@link #skipNewlines()} to keep the line counting correct.
 *
 * @return true if at least one whitespace character was skipped
 * @throws IOException in case of io problems
 */
protected boolean skipSpaces() throws IOException {
    if (DEBUG_BUFFER) {
        fCurrentEntity.debugBufferIfNeeded("(skipSpaces: ");
    }
    boolean spaces = false;
    while (true) {
        if (fCurrentEntity.offset_ == fCurrentEntity.length_) {
            if (fCurrentEntity.load(0) == -1) {
                break;
            }
        }
        final char c = fCurrentEntity.getNextChar();
        // compare against the usual suspects first before going
        // the expensive route
        if (c == ' ' || c == '\n' || Character.isWhitespace(c)) {
            spaces = true;
            // unix \n might dominate
            if (c == '\n' || c == '\r') {
                fCurrentEntity.rewind();
                skipNewlines();
            }
        }
        else {
            // first non-whitespace character; leave it for the caller
            fCurrentEntity.rewind();
            break;
        }
    }
    if (DEBUG_BUFFER) {
        fCurrentEntity.debugBufferIfNeeded(")skipSpaces: ", " -> " + spaces);
    }
    return spaces;
}

/**
 * Skips newlines starting at the current position and increases the line
 * number of the current entity by the number of newlines skipped. A "\r\n"
 * pair is counted as one newline.
 *
 * @return the number of newlines skipped; 0 if the current character is not
 *         a newline or no character could be loaded
 * @throws IOException in case of io problems
 */
protected int skipNewlines() throws IOException {
    if (DEBUG_BUFFER) {
        fCurrentEntity.debugBufferIfNeeded("(skipNewlines: ");
    }
    if (fCurrentEntity.offset_ == fCurrentEntity.length_) {
        if (fCurrentEntity.load(0) == -1) {
            if (DEBUG_BUFFER) {
                fCurrentEntity.debugBufferIfNeeded(")skipNewlines: ");
            }
            return 0;
        }
    }
    char c = fCurrentEntity.getCurrentChar();
    int newlines = 0;
    if (c == '\n' || c == '\r') {
        do {
            c = fCurrentEntity.getNextChar();
            if (c == '\n') {
                newlines++;
                if (fCurrentEntity.offset_ == fCurrentEntity.length_) {
                    // buffer exhausted; load more characters at position 'newlines'
                    fCurrentEntity.offset_ = newlines;
                    if (fCurrentEntity.load(newlines) == -1) {
                        break;
                    }
                }
            }
            else if (c == '\r') {
                newlines++;
                if (fCurrentEntity.offset_ == fCurrentEntity.length_) {
                    // buffer exhausted; load more characters at position 'newlines'
                    fCurrentEntity.offset_ = newlines;
                    if (fCurrentEntity.load(newlines) == -1) {
                        break;
                    }
                }
                // swallow the '\n' of a "\r\n" pair without counting another line
                if (fCurrentEntity.getCurrentChar() == '\n') {
                    fCurrentEntity.offset_++;
                    fCurrentEntity.characterOffset_++;
                }
            }
            else {
                // not a newline; leave it for the caller
                fCurrentEntity.rewind();
                break;
            }
        }
        while (fCurrentEntity.offset_ < fCurrentEntity.length_ - 1);
        fCurrentEntity.incLine(newlines);
    }
    if (DEBUG_BUFFER) {
        fCurrentEntity.debugBufferIfNeeded(")skipNewlines: ", " -> " + newlines);
    }
    return newlines;
}

// infoset utility methods

// Returns an augmentations
object with a location item added. protected final Augmentations locationAugs() { // we don't have to create a new LocationItem all the time, because the interface says: // Methods that receive Augmentations are required to copy the information // if it is to be saved for use beyond the scope of the method. if (fAugmentations_) { fLocationItem.setValues(fBeginLineNumber, fBeginColumnNumber, fBeginCharacterOffset, fEndLineNumber, fEndColumnNumber, fEndCharacterOffset); return fLocationItem; } return null; } // Returns an augmentations object with a synthesized item added. protected final Augmentations synthesizedAugs() { if (fAugmentations_) { return SYNTHESIZED_ITEM; } return null; } /** * Basic scanner interface. */ public interface Scanner { /** * Scans part of the document. This interface allows scanning to be performed in * a pulling manner. * * @param complete True if the scanner should not return until scanning is * complete. * * @return True if additional scanning is required. * * @throws IOException Thrown if I/O error occurs. */ boolean scan(boolean complete) throws IOException; } /** * Current entity. */ private static final class CurrentEntity { /** Character stream. */ private Reader stream_; /** Encoding. */ String encoding_; /** Public identifier. */ public final String publicId; /** Base system identifier. */ public final String baseSystemId; /** Literal system identifier. */ public final String literalSystemId; /** Expanded system identifier. */ final String expandedSystemId; /** XML version. */ public final String version = "1.0"; /** Line number. */ private int lineNumber_ = 1; /** Column number. */ int columnNumber_ = 1; /** Character offset in the file. */ int characterOffset_ = 0; // buffer /** Character buffer. */ char[] buffer_ = new char[DEFAULT_BUFFER_SIZE]; /** Offset into character buffer. */ int offset_ = 0; /** Length of characters read into character buffer. 
*/ int length_ = 0; private boolean endReached_ = false; // Constructs an entity from the specified stream. CurrentEntity(final Reader stream, final String encoding, final String publicId, final String baseSystemId, final String literalSystemId, final String expandedSystemId) { stream_ = stream; encoding_ = encoding; this.publicId = publicId; this.baseSystemId = baseSystemId; this.literalSystemId = literalSystemId; this.expandedSystemId = expandedSystemId; } char getCurrentChar() { return buffer_[offset_]; } /** * @return the current character and moves to next one. */ char getNextChar() { characterOffset_++; columnNumber_++; return buffer_[offset_++]; } void closeQuietly() { try { stream_.close(); } catch (final IOException e) { // ignore } } /** * Indicates if there are characters left. */ boolean hasNext() { return offset_ < length_; } /** * Loads a new chunk of data into the buffer and returns the number of * characters loaded or -1 if no additional characters were loaded. * * @param loadOffset The offset at which new characters should be loaded. * @return count * @throws IOException in case of io problems */ int load(final int loadOffset) throws IOException { if (DEBUG_BUFFER) { debugBufferIfNeeded("(load: "); } // resize buffer, if needed if (loadOffset == buffer_.length) { final int adjust = buffer_.length / 4; final char[] array = new char[buffer_.length + adjust]; System.arraycopy(buffer_, 0, array, 0, length_); buffer_ = array; } // read a block of characters final int count = stream_.read(buffer_, loadOffset, buffer_.length - loadOffset); if (count == -1) { length_ = loadOffset; endReached_ = true; } else { length_ = count + loadOffset; } offset_ = loadOffset; if (DEBUG_BUFFER) { debugBufferIfNeeded(")load: ", " -> " + count); } return count; } // Reads a single character. 
int read() throws IOException { if (DEBUG_BUFFER) { debugBufferIfNeeded("(read: "); } if (offset_ == length_) { if (endReached_) { return -1; } if (load(0) == -1) { if (DEBUG_BUFFER) { System.out.println(")read: -> -1"); } return -1; } } final char c = buffer_[offset_]; offset_++; characterOffset_++; columnNumber_++; if (DEBUG_BUFFER) { debugBufferIfNeeded(")read: ", " -> " + c); } return c; } /** Prints the contents of the character buffer to standard out. */ private void debugBufferIfNeeded(final String prefix) { debugBufferIfNeeded(prefix, ""); } /** Prints the contents of the character buffer to standard out. */ private void debugBufferIfNeeded(final String prefix, final String suffix) { System.out.print(prefix); System.out.print('['); System.out.print(length_); System.out.print(' '); System.out.print(offset_); if (length_ > 0) { System.out.print(" \""); for (int i = 0; i < length_; i++) { if (i == offset_) { System.out.print('^'); } final char c = buffer_[i]; switch (c) { case '\r': System.out.print("\\r"); break; case '\n': System.out.print("\\n"); break; case '\t': System.out.print("\\t"); break; case '"': System.out.print("\\\""); break; default: System.out.print(c); } } if (offset_ == length_) { System.out.print('^'); } System.out.print('"'); } System.out.print(']'); System.out.print(suffix); System.out.println(); } void setStream(final Reader inputStreamReader, final String encoding) { stream_ = inputStreamReader; offset_ = 0; length_ = 0; characterOffset_ = 0; lineNumber_ = 1; columnNumber_ = 1; encoding_ = encoding; } /** * Goes back, canceling the effect of the previous read() call. 
*/ void rewind() { offset_--; characterOffset_--; columnNumber_--; } void rewind(final int i) { offset_ -= i; characterOffset_ -= i; columnNumber_ -= i; } void incLine() { lineNumber_++; columnNumber_ = 1; } void incLine(final int nbLines) { lineNumber_ += nbLines; columnNumber_ = 1; } public int getLineNumber() { return lineNumber_; } void resetBuffer(final XMLString xmlBuffer, final int lineNumber, final int columnNumber, final int characterOffset) { lineNumber_ = lineNumber; columnNumber_ = columnNumber; characterOffset_ = characterOffset; // TODO RBRi buffer_ = xmlBuffer.getChars(); offset_ = 0; length_ = xmlBuffer.length(); } int getColumnNumber() { return columnNumber_; } int getCharacterOffset() { return characterOffset_; } } /* * Script parsing states based on https://html.spec.whatwg.org/multipage/parsing.html#script-data-state */ private enum ScanScriptState { /** Script data state */ DATA, /** Script data escaped state */ ESCAPED, /** Script data escaped less-than sign state */ ESCAPED_LT, /** Script data double escaped state */ DOUBLE_ESCAPED, /** Script data double escaped less-than sign state */ DOUBLE_ESCAPED_LT, } /** * The primary HTML document scanner. */ public class ContentScanner implements Scanner { /** A qualified name. */ private final QName qName_ = new QName(); /** Attributes. */ private final XMLAttributesImpl attributes_ = new XMLAttributesImpl(); /** Scan. 
*/ @Override public boolean scan(final boolean complete) throws IOException { boolean next; do { try { next = false; switch (fScannerState) { case STATE_CONTENT: { fBeginLineNumber = fCurrentEntity.getLineNumber(); fBeginColumnNumber = fCurrentEntity.getColumnNumber(); fBeginCharacterOffset = fCurrentEntity.getCharacterOffset(); final int c = fCurrentEntity.read(); if (c == -1) { throw new EOFException(); } if (c == '<') { setScannerState(STATE_MARKUP_BRACKET); next = true; } else if (c == '&') { scanEntityRef(fStringBuffer, null, true); } else { fCurrentEntity.rewind(); scanCharacters(); } break; } case STATE_MARKUP_BRACKET: { final int c = fCurrentEntity.read(); if (c == -1) { if (fReportErrors_) { fErrorReporter.reportError("HTML1003", null); } if (fDocumentHandler != null && fElementCount >= fElementDepth) { fStringBuffer.clearAndAppend('<'); fDocumentHandler.characters(fStringBuffer, null); } throw new EOFException(); } if (c == '!') { // process some strange self closing comments first if (skip("--->") || skip("-->") || skip("->") || skip(">")) { fEndLineNumber = fCurrentEntity.getLineNumber(); fEndColumnNumber = fCurrentEntity.getColumnNumber(); fEndCharacterOffset = fCurrentEntity.getCharacterOffset(); // using EMPTY here is slightly dangerous but a review showed // that all implementations of comment() only read the data // never do anything else with it, so safe for now fDocumentHandler.comment(XMLString.EMPTY, locationAugs()); } else if (skip("-!>")) { fEndLineNumber = fCurrentEntity.getLineNumber(); fEndColumnNumber = fCurrentEntity.getColumnNumber(); fEndCharacterOffset = fCurrentEntity.getCharacterOffset(); final XMLString str = new XMLString(); str.append("-!"); fDocumentHandler.comment(str, locationAugs()); } else if (skip("--")) { scanComment(); } else if (skip("[CDATA[")) { scanCDATA(); } else if (skip("DOCTYPE")) { scanDoctype(); } else { if (fReportErrors_) { fErrorReporter.reportError("HTML1002", null); } skipMarkup(true); } } else if (c == 
'?') { scanPI(); } else if (c == '/') { scanEndElement(); } else { fCurrentEntity.rewind(); fElementCount++; fSingleBoolean[0] = false; final String ename = scanStartElement(fSingleBoolean); final String enameLC = ename == null ? null : ename.toLowerCase(Locale.ROOT); fBeginLineNumber = fCurrentEntity.getLineNumber(); fBeginColumnNumber = fCurrentEntity.getColumnNumber(); fBeginCharacterOffset = fCurrentEntity.getCharacterOffset(); if ("script".equals(enameLC)) { setScanner(fScriptScanner); setScannerState(STATE_CONTENT); return true; } else if (!fAllowSelfclosingTags_ && !fAllowSelfclosingIframe_ && "iframe".equals(enameLC)) { scanUntilEndTag("iframe"); } else if (!fParseNoScriptContent_ && "noscript".equals(enameLC)) { scanUntilEndTag("noscript"); } else if ("noframes".equals(enameLC)) { scanUntilEndTag("noframes"); } else if ("noembed".equals(enameLC)) { scanUntilEndTag("noembed"); } else if (ename != null && htmlConfiguration_.getHtmlElements().getElement(enameLC).isSpecial()) { // title inside svg if ("title".equals(enameLC) && htmlConfiguration_.getTagBalancer().fOpenedSvg) { setScannerState(STATE_CONTENT); break; } if ("plaintext".equals(enameLC)) { setScanner(new PlainTextScanner()); } else { setScanner(fSpecialScanner.setElementName(ename)); setScannerState(STATE_CONTENT); } return true; } } setScannerState(STATE_CONTENT); break; } case STATE_START_DOCUMENT: { if (fDocumentHandler != null && fElementCount >= fElementDepth) { if (DEBUG_CALLBACKS) { System.out.println("startDocument()"); } final XMLLocator locator = HTMLScanner.this; final String encoding = fIANAEncoding; final Augmentations augs = locationAugs(); final NamespaceContext nscontext = new NamespaceSupport(); fDocumentHandler.startDocument(locator, encoding, nscontext, augs); } if (fInsertDoctype_ && fDocumentHandler != null) { String root = htmlConfiguration_.getHtmlElements().getElement(HTMLElements.HTML).name; root = modifyName(root, fNamesElems); final String pubid = fDoctypePubid; final 
String sysid = fDoctypeSysid; fDocumentHandler.doctypeDecl(root, pubid, sysid, synthesizedAugs()); } setScannerState(STATE_CONTENT); break; } case STATE_END_DOCUMENT: { if (fDocumentHandler != null && fElementCount >= fElementDepth && complete) { if (DEBUG_CALLBACKS) { System.out.println("endDocument()"); } fEndLineNumber = fCurrentEntity.getLineNumber(); fEndColumnNumber = fCurrentEntity.getColumnNumber(); fEndCharacterOffset = fCurrentEntity.getCharacterOffset(); fDocumentHandler.endDocument(locationAugs()); } return false; } default: { throw new RuntimeException("unknown scanner state: " + fScannerState); } } if (fScanner instanceof PlainTextScanner) { return true; } } catch (final EOFException e) { if (fCurrentEntityStack.isEmpty()) { setScannerState(STATE_END_DOCUMENT); } else { fCurrentEntity = fCurrentEntityStack.pop(); } next = true; } } while (next || complete); return true; } /** * Scans the content of





© 2015 - 2024 Weber Informatics LLC | Privacy Policy