// org.htmlunit.cyberneko.HTMLConfiguration (Maven / Gradle / Ivy)
/*
* Copyright (c) 2002-2009 Andy Clark, Marc Guillemot
* Copyright (c) 2017-2024 Ronald Brill
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.htmlunit.cyberneko;
import java.io.IOException;
import java.text.MessageFormat;
import java.util.ArrayList;
import java.util.List;
import java.util.MissingResourceException;
import java.util.ResourceBundle;
import org.htmlunit.cyberneko.filters.NamespaceBinder;
import org.htmlunit.cyberneko.xerces.util.ParserConfigurationSettings;
import org.htmlunit.cyberneko.xerces.xni.XMLDocumentHandler;
import org.htmlunit.cyberneko.xerces.xni.XNIException;
import org.htmlunit.cyberneko.xerces.xni.parser.XMLConfigurationException;
import org.htmlunit.cyberneko.xerces.xni.parser.XMLDocumentFilter;
import org.htmlunit.cyberneko.xerces.xni.parser.XMLDocumentSource;
import org.htmlunit.cyberneko.xerces.xni.parser.XMLErrorHandler;
import org.htmlunit.cyberneko.xerces.xni.parser.XMLInputSource;
import org.htmlunit.cyberneko.xerces.xni.parser.XMLParseException;
import org.htmlunit.cyberneko.xerces.xni.parser.XMLParserConfiguration;
/**
 * An XNI-based parser configuration that can be used to parse HTML
 * documents. This configuration can be used directly in order to
 * parse HTML documents or can be used in conjunction with any XNI
 * based tools, such as the Xerces2 implementation.
 * <p>
 * This configuration recognizes the following features:
 * <ul>
 * <li>http://cyberneko.org/html/features/augmentations
 * <li>http://cyberneko.org/html/features/report-errors
 * <li>http://cyberneko.org/html/features/report-errors/simple
 * <li><i>and</i>
 * <li>the features supported by the scanner and tag balancer components.
 * </ul>
 * <p>
 * This configuration recognizes the following properties:
 * <ul>
 * <li>http://cyberneko.org/html/properties/names/elems
 * <li>http://cyberneko.org/html/properties/names/attrs
 * <li>http://cyberneko.org/html/properties/filters
 * <li>http://cyberneko.org/html/properties/error-reporter
 * <li><i>and</i>
 * <li>the properties supported by the scanner and tag balancer.
 * </ul>
 * <p>
 * For complete usage information, refer to the documentation.
 *
 * @see HTMLScanner
 * @see HTMLTagBalancer
 * @see HTMLErrorReporter
 *
 * @author Andy Clark
 */
public class HTMLConfiguration extends ParserConfigurationSettings implements XMLParserConfiguration {
// features

/** Feature identifier: namespaces (set to {@code true} by the constructor). */
protected static final String NAMESPACES = "http://xml.org/sax/features/namespaces";

/** Feature identifier: include infoset augmentations (set to {@code false} by the constructor). */
protected static final String AUGMENTATIONS = "http://cyberneko.org/html/features/augmentations";

/** Feature identifier: report errors (set to {@code false} by the constructor). */
protected static final String REPORT_ERRORS = "http://cyberneko.org/html/features/report-errors";

/** Feature identifier: simple report format (set to {@code false} by the constructor). */
protected static final String SIMPLE_ERROR_FORMAT = "http://cyberneko.org/html/features/report-errors/simple";

// properties

/** Property identifier: modify HTML element names: { "upper", "lower", "default" }. */
protected static final String NAMES_ELEMS = "http://cyberneko.org/html/properties/names/elems";

/** Property identifier: modify HTML attribute names: { "upper", "lower", "default" }. */
protected static final String NAMES_ATTRS = "http://cyberneko.org/html/properties/names/attrs";

/** Property identifier: pipeline filters (an array of {@link XMLDocumentFilter}). */
protected static final String FILTERS = "http://cyberneko.org/html/properties/filters";

/** Property identifier: error reporter. */
protected static final String ERROR_REPORTER = "http://cyberneko.org/html/properties/error-reporter";

// other

/** Error domain passed to the error handler for every reported warning/error. */
protected static final String ERROR_DOMAIN = "http://cyberneko.org/html";

/** Document handler; attached to the end of the pipeline by {@code reset()}. */
private XMLDocumentHandler documentHandler_;

/** Error handler; package-private because the nested error reporter reads it. */
XMLErrorHandler errorHandler_;

/**
 * Stream opened by parser. Therefore, must close stream manually upon
 * termination of parsing.
 */
private boolean closeStream_;

/**
 * Components registered with this configuration.
 * The element type must be declared: the configuration iterates this list
 * with {@code HTMLComponent} loop variables, which a raw {@code List} does not permit.
 */
private final List<HTMLComponent> htmlComponents_ = new ArrayList<>(2);

/**
 * Document scanner.
 * Created by a field initializer, i.e. before any constructor body has run;
 * {@code createDocumentScanner()} therefore executes while {@code htmlElements_}
 * is still unassigned.
 */
final HTMLScanner documentScanner_ = createDocumentScanner();

/** HTML tag balancer. */
private final HTMLTagBalancer tagBalancer_ = new HTMLTagBalancer(this);

/** Namespace binder. */
private final NamespaceBinder namespaceBinder_ = new NamespaceBinder(this);

/** HTML element definitions handed to the constructor. */
private final HTMLElements htmlElements_;
/**
 * Creates a configuration backed by a newly created {@link HTMLElements} instance.
 */
public HTMLConfiguration() {
    this(new HTMLElements());
}

/**
 * Creates a configuration that uses the given HTML element definitions.
 * Registers the scanner, tag balancer and namespace binder as components and
 * installs the configuration's own features and properties with their defaults.
 *
 * @param htmlElements the element definitions to use
 */
public HTMLConfiguration(final HTMLElements htmlElements) {
    htmlElements_ = htmlElements;

    // built-in pipeline components; each one contributes its own features/properties
    addComponent(documentScanner_);
    addComponent(tagBalancer_);
    addComponent(namespaceBinder_);

    // features owned by the configuration itself, followed by their default states
    addRecognizedFeatures(new String[] {
        AUGMENTATIONS,
        NAMESPACES,
        REPORT_ERRORS,
        SIMPLE_ERROR_FORMAT,
    });
    setFeature(AUGMENTATIONS, false);
    setFeature(NAMESPACES, true);
    setFeature(REPORT_ERRORS, false);
    setFeature(SIMPLE_ERROR_FORMAT, false);

    // properties owned by the configuration itself, followed by their default values
    addRecognizedProperties(new String[] {
        NAMES_ELEMS,
        NAMES_ATTRS,
        FILTERS,
        ERROR_REPORTER,
    });
    setProperty(NAMES_ELEMS, "default");
    setProperty(NAMES_ATTRS, "lower");
    setProperty(ERROR_REPORTER, new ErrorReporter());
}
/**
* Creates the document scanner used by this configuration.
* <p>
* This method is invoked from the initializer of the {@code documentScanner_}
* field, i.e. while the configuration is still being constructed. An overriding
* implementation therefore runs before the subclass's own field initializers
* and constructor body have been executed.
*
* @return a new {@link HTMLScanner} created for this configuration
*/
protected HTMLScanner createDocumentScanner() {
return new HTMLScanner(this);
}
/**
* Pushes an input source onto the current entity stack. This
* enables the scanner to transparently scan new content (e.g.
* the output written by an embedded script). At the end of the
* current entity, the scanner returns where it left off at the
* time this entity source was pushed.
* <p>
* <strong>Hint:</strong>
* To use this feature to insert the output of {@code <SCRIPT>}
* tags, remember to buffer the entire output of the
* processed instructions before pushing a new input source.
* Otherwise, events may appear out of sequence.
* <p>
* The call is forwarded unchanged to the document scanner.
*
* @param inputSource The new input source to start scanning.
* @see #evaluateInputSource(XMLInputSource)
*/
public void pushInputSource(final XMLInputSource inputSource) {
documentScanner_.pushInputSource(inputSource);
}
/**
* <strong>EXPERIMENTAL: may change in next release.</strong>
* <p>
* Immediately evaluates an input source and adds the new content (e.g.
* the output written by an embedded script).
* The call is forwarded unchanged to the document scanner.
*
* @param inputSource The new input source to start scanning.
* @see #pushInputSource(XMLInputSource)
*/
public void evaluateInputSource(final XMLInputSource inputSource) {
documentScanner_.evaluateInputSource(inputSource);
}
/**
* Sets the state of a feature on this configuration and then forwards the
* same feature/state pair to every registered {@link HTMLComponent}.
*
* @param featureId the feature identifier
* @param state the new state of the feature
* @throws XMLConfigurationException propagated from the superclass
* implementation or from a component's {@code setFeature}
*/
@Override
public void setFeature(final String featureId, final boolean state)
throws XMLConfigurationException {
super.setFeature(featureId, state);
// the superclass call comes first, so an exception thrown by it leaves the components untouched
for (final HTMLComponent component : htmlComponents_) {
component.setFeature(featureId, state);
}
}
/**
 * Stores a property value in this configuration and hands the same
 * id/value pair to every registered component.
 * <p>
 * When the filters property is set, each filter in the new array that is
 * also an {@link HTMLComponent} is registered as a component first, so it
 * already takes part in the propagation of this very call.
 *
 * @param propertyId the property identifier
 * @param value the new property value
 * @throws XMLConfigurationException propagated from the superclass
 *         implementation or from a component's {@code setProperty}
 */
@Override
public void setProperty(final String propertyId, final Object value)
        throws XMLConfigurationException {
    super.setProperty(propertyId, value);

    if (propertyId.equals(FILTERS)) {
        // read the value back through the getter, i.e. exactly what the superclass stored
        final XMLDocumentFilter[] pipelineFilters = (XMLDocumentFilter[]) getProperty(FILTERS);
        final int filterCount = pipelineFilters == null ? 0 : pipelineFilters.length;
        for (int i = 0; i < filterCount; i++) {
            final XMLDocumentFilter candidate = pipelineFilters[i];
            if (candidate instanceof HTMLComponent) {
                addComponent((HTMLComponent) candidate);
            }
        }
    }

    for (final HTMLComponent registered : htmlComponents_) {
        registered.setProperty(propertyId, value);
    }
}
/**
* Sets the document handler. The handler is only stored here; it is attached
* to the end of the pipeline when {@code reset()} rebuilds the handler chain.
* If the handler also implements {@link HTMLTagBalancingListener}, it is
* additionally registered as listener on the tag balancer.
*
* @param handler the new document handler
*/
@Override
public void setDocumentHandler(final XMLDocumentHandler handler) {
documentHandler_ = handler;
// there is no else branch: a listener registered by an earlier call is not
// cleared here when the new handler does not implement the listener interface
if (handler instanceof HTMLTagBalancingListener) {
tagBalancer_.setTagBalancingListener((HTMLTagBalancingListener) handler);
}
}
/**
* Returns the document handler last passed to
* {@link #setDocumentHandler(XMLDocumentHandler)}.
*
* @return the document handler, or {@code null} if none has been set
*/
@Override
public XMLDocumentHandler getDocumentHandler() {
return documentHandler_;
}
/**
* Sets the error handler that receives the warnings and errors reported
* through this configuration's error reporter.
*
* @param handler the new error handler; with {@code null} the error
* reporter does not forward warnings or errors
*/
@Override
public void setErrorHandler(final XMLErrorHandler handler) {
errorHandler_ = handler;
}
/**
* Returns the error handler last passed to
* {@link #setErrorHandler(XMLErrorHandler)}.
*
* @return the error handler, or {@code null} if none has been set
*/
@Override
public XMLErrorHandler getErrorHandler() {
return errorHandler_;
}
/**
* Returns the HTML element definitions this configuration was constructed with.
*
* @return the HTMLElements
*/
public HTMLElements getHtmlElements() {
return htmlElements_;
}
/**
 * Returns the components registered with this configuration.
 * The returned list is the configuration's internal list, not a copy.
 *
 * @return the list of {@link HTMLComponent}s
 */
public List<HTMLComponent> getHtmlComponents() {
    return htmlComponents_;
}
/**
* Returns the document scanner created for this configuration.
*
* @return the DocumentScanner
*/
public HTMLScanner getDocumentScanner() {
return documentScanner_;
}
/**
* Returns the tag balancer created for this configuration.
*
* @return the TagBalancer
*/
public HTMLTagBalancer getTagBalancer() {
return tagBalancer_;
}
/**
* Returns the namespace binder created for this configuration.
*
* @return the NamespaceBinder
*/
public NamespaceBinder getNamespaceBinder() {
return namespaceBinder_;
}
/**
* Parses a document completely: installs the input source via
* {@link #setInputSource(XMLInputSource)} and then calls
* {@link #parse(boolean)} with {@code true}.
*
* @param source the input source of the document to parse
* @throws XNIException propagated from the two calls above
* @throws IOException propagated from the two calls above
*/
@Override
public void parse(final XMLInputSource source) throws XNIException, IOException {
setInputSource(source);
parse(true);
}
/**
 * Prepares the configuration for parsing the given document: the pipeline
 * is reset first, then the input source is handed to the document scanner.
 * It is also recorded whether the parser has to close the stream itself,
 * which is the case when the input source carries neither a byte stream
 * nor a character stream.
 *
 * @param inputSource The document's input source.
 *
 * @exception XMLConfigurationException Thrown if there is a
 *                        configuration error when initializing the
 *                        parser.
 * @exception IOException Thrown on I/O error.
 *
 * @see #parse(boolean)
 */
@Override
public void setInputSource(final XMLInputSource inputSource)
        throws XMLConfigurationException, IOException {
    reset();

    // a stream supplied by the caller stays the caller's responsibility
    final boolean callerSuppliedStream =
            inputSource.getByteStream() != null || inputSource.getCharacterStream() != null;
    closeStream_ = !callerSuppliedStream;

    documentScanner_.setInputSource(inputSource);
}
/**
 * Parses the document in a pull parsing fashion. Resources are released
 * through {@link #cleanup()} as soon as the scanner reports that nothing
 * is left to parse, and also when scanning fails with an exception.
 *
 * @param complete True if the pull parser should parse the
 *                 remaining document completely.
 *
 * @return True if there is more document to parse.
 *
 * @exception XNIException Any XNI exception, possibly wrapping
 *                         another exception.
 * @exception IOException  An IO exception from the parser, possibly
 *                         from a byte stream or character stream
 *                         supplied by the parser.
 *
 * @see #setInputSource
 */
@Override
public boolean parse(final boolean complete) throws XNIException, IOException {
    try {
        if (documentScanner_.scanDocument(complete)) {
            return true;
        }
        // document exhausted
        cleanup();
        return false;
    }
    catch (final XNIException | IOException e) {
        cleanup();
        throw e;
    }
}
/**
* If the application decides to terminate parsing before the xml document
* is fully parsed, the application should call this method to free any
* resource allocated during parsing. For example, close all opened streams.
* <p>
* Delegates to the document scanner and passes along the flag recorded by
* {@link #setInputSource(XMLInputSource)} that tells whether the stream was
* opened by the parser.
*/
@Override
public void cleanup() {
documentScanner_.cleanup(closeStream_);
}
/**
 * Registers a component with this configuration. The features and properties
 * the component recognizes become recognized by the configuration as well,
 * and every default the component provides for them is applied through
 * {@link #setFeature(String, boolean)} / {@link #setProperty(String, Object)}.
 *
 * @param component the component to register
 */
protected void addComponent(final HTMLComponent component) {
    htmlComponents_.add(component);

    // features: make them known, then apply the defaults the component defines
    final String[] featureIds = component.getRecognizedFeatures();
    addRecognizedFeatures(featureIds);
    final int featureCount = featureIds == null ? 0 : featureIds.length;
    for (int i = 0; i < featureCount; i++) {
        final String featureId = featureIds[i];
        final Boolean defaultState = component.getFeatureDefault(featureId);
        if (defaultState != null) {
            setFeature(featureId, defaultState.booleanValue());
        }
    }

    // properties: same procedure
    final String[] propertyIds = component.getRecognizedProperties();
    addRecognizedProperties(propertyIds);
    final int propertyCount = propertyIds == null ? 0 : propertyIds.length;
    for (int i = 0; i < propertyCount; i++) {
        final String propertyId = propertyIds[i];
        final Object defaultValue = component.getPropertyDefault(propertyId);
        if (defaultValue != null) {
            setProperty(propertyId, defaultValue);
        }
    }
}
/**
* Resets the parser configuration: resets every registered component and
* rebuilds the handler chain of the pipeline.
* <p>
* Resulting handler chain: document scanner, then the namespace binder
* (only if the namespaces feature is enabled), then the tag balancer, then
* the filters of the filters property in array order, and finally the
* document handler.
*
* @throws XMLConfigurationException as declared; propagated from the
* components' {@code reset} or from the feature/property lookups
*/
protected void reset() throws XMLConfigurationException {
// reset components
for (final HTMLComponent component : htmlComponents_) {
component.reset(this);
}
// configure pipeline
XMLDocumentSource lastSource = documentScanner_;
if (getFeature(NAMESPACES)) {
lastSource.setDocumentHandler(namespaceBinder_);
// NOTE(review): the binder's source is set to the tag balancer although the
// binder receives its events from the scanner - confirm this is intended
namespaceBinder_.setDocumentSource(tagBalancer_);
lastSource = namespaceBinder_;
}
lastSource.setDocumentHandler(tagBalancer_);
// NOTE(review): the balancer's source is always the scanner, even when the
// namespace binder sits in between - confirm this is intended
tagBalancer_.setDocumentSource(documentScanner_);
lastSource = tagBalancer_;
final XMLDocumentFilter[] filters = (XMLDocumentFilter[]) getProperty(FILTERS);
if (filters != null) {
// chain the filters one after another behind the tag balancer
for (final XMLDocumentFilter filter : filters) {
filter.setDocumentSource(lastSource);
lastSource.setDocumentHandler(filter);
lastSource = filter;
}
}
// the application's handler (possibly null) terminates the chain
lastSource.setDocumentHandler(documentHandler_);
}
/**
 * Defines an error reporter for reporting HTML errors. There is no such
 * thing as a fatal error in parsing HTML. I/O errors are fatal but should
 * throw an {@code IOException} directly instead of reporting an error.
 * <p>
 * When used in a configuration, the error reporter instance is set as a
 * property with the following property identifier:
 * <pre>
 * "http://cyberneko.org/html/properties/error-reporter"
 * </pre>
 * (see {@link HTMLConfiguration#ERROR_REPORTER}). Components in the
 * configuration can query the error reporter using this property identifier.
 * <p>
 * <strong>Note:</strong>
 * All reported errors are within the domain "http://cyberneko.org/html".
 *
 * @author Andy Clark
 */
protected class ErrorReporter implements HTMLErrorReporter {

    /** Error messages; loaded lazily on the first request for a non-simple format. */
    private ResourceBundle errorMessages_;

    /**
     * Formats a message without reporting an error.
     * Unless the simple error format feature is enabled, the message pattern
     * is looked up in the error message bundle and formatted with
     * {@link MessageFormat}. If the bundle or the key is missing, the simple
     * format is used instead.
     *
     * @param key the message key
     * @param args the message arguments, may be {@code null}
     * @return the formatted message
     */
    @Override
    public String formatMessage(final String key, final Object[] args) {
        if (!getFeature(SIMPLE_ERROR_FORMAT)) {
            try {
                // getBundle() is inside the try as well: it throws
                // MissingResourceException when the bundle cannot be found,
                // and error reporting must degrade to the simple format
                // rather than fail
                if (errorMessages_ == null) {
                    errorMessages_ =
                        ResourceBundle.getBundle("org/htmlunit/cyberneko/res/ErrorMessages");
                }
                final String value = errorMessages_.getString(key);
                return MessageFormat.format(value, args);
            }
            catch (final MissingResourceException ignored) {
                // bundle or key not available: fall through to the simple format
            }
        }
        return formatSimpleMessage(key, args);
    }

    /**
     * Reports a warning to the configuration's error handler, if one is set.
     *
     * @param key the message key
     * @param args the message arguments, may be {@code null}
     * @throws XMLParseException as declared; may be raised by the error handler
     */
    @Override
    public void reportWarning(final String key, final Object[] args)
            throws XMLParseException {
        if (errorHandler_ != null) {
            errorHandler_.warning(ERROR_DOMAIN, key, createException(key, args));
        }
    }

    /**
     * Reports an error to the configuration's error handler, if one is set.
     *
     * @param key the message key
     * @param args the message arguments, may be {@code null}
     * @throws XMLParseException as declared; may be raised by the error handler
     */
    @Override
    public void reportError(final String key, final Object[] args)
            throws XMLParseException {
        if (errorHandler_ != null) {
            errorHandler_.error(ERROR_DOMAIN, key, createException(key, args));
        }
    }

    /**
     * Creates the parse exception handed to the error handler.
     *
     * @param key the message key
     * @param args the message arguments, may be {@code null}
     * @return a parse exception built from the document scanner and the formatted message
     */
    protected XMLParseException createException(final String key, final Object[] args) {
        final String message = formatMessage(key, args);
        return new XMLParseException(documentScanner_, message);
    }

    /**
     * Formats a message in the simple format: the error domain, {@code '#'},
     * the key and then every argument, each preceded by a tab character.
     *
     * @param key the message key
     * @param args the message arguments, may be {@code null}
     * @return the simple message
     */
    protected String formatSimpleMessage(final String key, final Object[] args) {
        final StringBuilder str = new StringBuilder();
        str.append(ERROR_DOMAIN);
        str.append('#');
        str.append(key);
        if (args != null) {
            for (final Object arg : args) {
                str.append('\t');
                str.append(arg);
            }
        }
        return str.toString();
    }
}
}