// com.googlecode.html.HTMLConfiguration (listing header: Maven / Gradle / Ivy)
/*
* Copyright 2002-2009 Andy Clark, Marc Guillemot
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
* in compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License
* is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing permissions and limitations under
* the License.
*/
package com.googlecode.html;
import com.googlecode.html.filters.NamespaceBinder;
import com.googlecode.html.xercesbridge.XercesBridge;
import org.apache.xerces.util.DefaultErrorHandler;
import org.apache.xerces.util.ParserConfigurationSettings;
import org.apache.xerces.xni.XMLDTDContentModelHandler;
import org.apache.xerces.xni.XMLDTDHandler;
import org.apache.xerces.xni.XMLDocumentHandler;
import org.apache.xerces.xni.XNIException;
import org.apache.xerces.xni.parser.*;
import java.io.IOException;
import java.text.MessageFormat;
import java.util.Locale;
import java.util.MissingResourceException;
import java.util.ResourceBundle;
import java.util.Vector;
/**
 * An XNI-based parser configuration that can be used to parse HTML documents. This configuration
 * can be used directly in order to parse HTML documents or can be used in conjunction with any XNI
 * based tools, such as the Xerces2 implementation.
 * <p>
 * This configuration recognizes the following features:
 * <ul>
 * <li>http://cyberneko.org/html/features/augmentations</li>
 * <li>http://cyberneko.org/html/features/report-errors</li>
 * <li>http://cyberneko.org/html/features/report-errors/simple</li>
 * <li>http://cyberneko.org/html/features/balance-tags</li>
 * <li><i>and</i> the features supported by the scanner and tag balancer components.</li>
 * </ul>
 * <p>
 * This configuration recognizes the following properties:
 * <ul>
 * <li>http://cyberneko.org/html/properties/names/elems</li>
 * <li>http://cyberneko.org/html/properties/names/attrs</li>
 * <li>http://cyberneko.org/html/properties/filters</li>
 * <li>http://cyberneko.org/html/properties/error-reporter</li>
 * <li><i>and</i> the properties supported by the scanner and tag balancer.</li>
 * </ul>
 * <p>
 * For complete usage information, refer to the documentation.
 *
 * @author Andy Clark
 * @version $Id: HTMLConfiguration.java,v 1.9 2005/02/14 03:56:54 andyc Exp $
 * @see HTMLScanner
 * @see HTMLTagBalancer
 * @see HTMLErrorReporter
 */
public class HTMLConfiguration extends ParserConfigurationSettings implements
XMLPullParserConfiguration {
//
// Constants
//
// features
/**
 * Error reporter used by this configuration for reporting HTML errors. There is no such thing
 * as a fatal error in parsing HTML. I/O errors are fatal but should throw an
 * <code>IOException</code> directly instead of reporting an error.
 * <p>
 * The enclosing configuration registers its reporter instance under the property identifier
 * held by <code>ERROR_REPORTER</code>
 * ("http://cyberneko.org/html/properties/error-reporter"); components in the configuration can
 * query the error reporter using that property identifier.
 * <p>
 * Note: All reported errors are within the domain "http://cyberneko.org/html"
 * (<code>ERROR_DOMAIN</code>).
 *
 * @author Andy Clark
 */
protected class ErrorReporter implements HTMLErrorReporter {
    //
    // Data
    //
    /**
     * Cached bundle of localized error messages; <code>null</code> until first needed and
     * whenever the configuration locale has changed since the last lookup.
     */
    protected ResourceBundle fErrorMessages;
    /**
     * Locale that was current the last time the message bundle was (re)selected.
     */
    protected Locale fLastLocale;
    //
    // HTMLErrorReporter methods
    //
    /**
     * Formats a message without reporting an error.
     * <p>
     * Unless the simple-format feature is enabled, the message pattern is looked up in the
     * localized resource bundle and formatted with <code>MessageFormat</code>. If the bundle
     * itself or the key is missing, the simple format is used instead.
     *
     * @param key  The message key.
     * @param args The message arguments; may be <code>null</code>.
     * @return The formatted message.
     */
    public String formatMessage(String key, Object[] args) {
        if (!getFeature(SIMPLE_ERROR_FORMAT)) {
            // drop the cached bundle when the configuration locale has changed
            if (!fLocale.equals(fLastLocale)) {
                fErrorMessages = null;
                fLastLocale = fLocale;
            }
            try {
                // Loading the bundle is inside the try block on purpose: getBundle throws
                // MissingResourceException when the bundle cannot be found, and that must
                // degrade to the simple format exactly like a missing key does.
                if (fErrorMessages == null) {
                    fErrorMessages = ResourceBundle.getBundle("org/cyberneko/html/res/ErrorMessages",
                            fLocale);
                }
                String value = fErrorMessages.getString(key);
                return MessageFormat.format(value, args);
            } catch (MissingResourceException e) {
                // bundle or key not available: fall through to the simple format
            }
        }
        return formatSimpleMessage(key, args);
    } // formatMessage(String,Object[]):String
    /**
     * Reports an error to the configuration's error handler, if one is set.
     *
     * @param key  The message key.
     * @param args The message arguments; may be <code>null</code>.
     * @throws XMLParseException Declared so that the error handler may abort parsing.
     */
    public void reportError(String key, Object[] args) throws XMLParseException {
        if (fErrorHandler != null) {
            fErrorHandler.error(ERROR_DOMAIN, key, createException(key, args));
        }
    } // reportError(String,Object[])
    /**
     * Reports a warning to the configuration's error handler, if one is set.
     *
     * @param key  The message key.
     * @param args The message arguments; may be <code>null</code>.
     * @throws XMLParseException Declared so that the error handler may abort parsing.
     */
    public void reportWarning(String key, Object[] args) throws XMLParseException {
        if (fErrorHandler != null) {
            fErrorHandler.warning(ERROR_DOMAIN, key, createException(key, args));
        }
    } // reportWarning(String,Object[])
    //
    // Protected methods
    //
    /**
     * Creates a parse exception carrying the formatted message; the document scanner is passed
     * to the exception as its location source.
     *
     * @param key  The message key.
     * @param args The message arguments; may be <code>null</code>.
     * @return The new exception (not thrown here).
     */
    protected XMLParseException createException(String key, Object[] args) {
        String message = formatMessage(key, args);
        return new XMLParseException(fDocumentScanner, message);
    } // createException(String,Object[]):XMLParseException
    /**
     * Formats a message as <code>domain#key</code>, followed by a tab and the tab-separated
     * arguments when there are any.
     *
     * @param key  The message key.
     * @param args The message arguments; may be <code>null</code> or empty.
     * @return The simple, locale-independent message.
     */
    protected String formatSimpleMessage(String key, Object[] args) {
        StringBuffer str = new StringBuffer();
        str.append(ERROR_DOMAIN);
        str.append('#');
        str.append(key);
        if (args != null && args.length > 0) {
            str.append('\t');
            for (int i = 0; i < args.length; i++) {
                if (i > 0) {
                    str.append('\t');
                }
                str.append(String.valueOf(args[i]));
            }
        }
        return str.toString();
    } // formatSimpleMessage(String,Object[]):String
} // class ErrorReporter
//
// Constants
//
// Note: the identifiers below are listed alphabetically, not grouped by kind; each Javadoc
// states whether the identifier is registered as a feature or as a property by the constructor.
/**
 * Feature identifier: include infoset augmentations. Set to false by the default constructor.
 */
protected static final String AUGMENTATIONS = "http://cyberneko.org/html/features/augmentations";
/**
 * Feature identifier: balance tags. When enabled, the tag balancer is inserted into the
 * pipeline on reset. Set to true by the default constructor.
 */
protected static final String BALANCE_TAGS = "http://cyberneko.org/html/features/balance-tags";
/**
 * Error domain passed to the error handler with every reported error and warning.
 */
protected static final String ERROR_DOMAIN = "http://cyberneko.org/html";
/**
 * Property identifier: error reporter. The default constructor stores this configuration's
 * error reporter instance under this property.
 */
protected static final String ERROR_REPORTER = "http://cyberneko.org/html/properties/error-reporter";
/**
 * Property identifier: pipeline filters. The value is read as an array of
 * <code>XMLDocumentFilter</code>.
 */
protected static final String FILTERS = "http://cyberneko.org/html/properties/filters";
/**
 * Property identifier: modify HTML attribute names: { "upper", "lower", "default" }.
 * Set to "lower" by the default constructor.
 */
protected static final String NAMES_ATTRS = "http://cyberneko.org/html/properties/names/attrs";
/**
 * Property identifier: modify HTML element names: { "upper", "lower", "default" }.
 * Set to "upper" by the default constructor.
 */
protected static final String NAMES_ELEMS = "http://cyberneko.org/html/properties/names/elems";
/**
 * Feature identifier: namespaces. When enabled, the namespace binder is inserted into the
 * pipeline on reset. Set to true by the default constructor.
 */
protected static final String NAMESPACES = "http://xml.org/sax/features/namespaces";
/**
 * Feature identifier: report errors. Set to false by the default constructor.
 */
protected static final String REPORT_ERRORS = "http://cyberneko.org/html/features/report-errors";
/**
 * Feature identifier: simple report format. When enabled, error messages are not looked up
 * in the localized resource bundle. Set to false by the default constructor.
 */
protected static final String SIMPLE_ERROR_FORMAT = "http://cyberneko.org/html/features/report-errors/simple";
//
// Xerces version flags (assigned by the static initializer; false unless detection succeeds)
//
/**
 * Parser version is Xerces 2.0.0.
 */
protected static boolean XERCES_2_0_0 = false;
/**
 * Parser version is Xerces 2.0.1.
 */
protected static boolean XERCES_2_0_1 = false;
/**
 * Parser version is XML4J 4.0.x.
 */
protected static boolean XML4J_4_0_x = false;
/**
 * Document source class array.
 * NOTE(review): not referenced anywhere else in this class as shown; confirm whether it is
 * still needed.
 */
private static final Class[] DOCSOURCE = {XMLDocumentSource.class};
//
// Static initializer
//
// Probes the Xerces version class reflectively and records which of the known-problematic
// parser versions (if any) is on the class path. Detection is best-effort: on any failure
// all three flags keep their initial value of false.
static {
    try {
        final String versionClassName = "org.apache.xerces.impl.Version";
        Object versionObject = ObjectFactory.createObject(versionClassName, versionClassName);
        String reported = String.valueOf(
                versionObject.getClass().getField("fVersion").get(versionObject));
        XERCES_2_0_0 = "Xerces-J 2.0.0".equals(reported);
        XERCES_2_0_1 = "Xerces-J 2.0.1".equals(reported);
        XML4J_4_0_x = reported.startsWith("XML4J 4.0.");
    } catch (Throwable ignored) {
        // version class or field not available: leave the flags untouched
    }
} // <clinit>()
//
// Data
//
// Note: the fields below are listed alphabetically, not grouped by role.
/**
 * Stream opened by parser. Therefore, must close stream manually upon termination of parsing.
 * Assigned in setInputSource: true when the input source supplies neither a byte stream nor a
 * character stream.
 */
protected boolean fCloseStream;
/**
 * Document handler: the final receiver at the end of the pipeline configured by reset().
 */
protected XMLDocumentHandler fDocumentHandler;
/**
 * Document scanner: the head of the pipeline. Obtained from createDocumentScanner() while the
 * instance fields are being initialized, i.e. before the constructor body runs.
 */
protected final HTMLScanner fDocumentScanner = createDocumentScanner();
/**
 * DTD content model handler. Only stored and returned by this class.
 */
protected XMLDTDContentModelHandler fDTDContentModelHandler;
/**
 * DTD handler. Only stored and returned by this class.
 */
protected XMLDTDHandler fDTDHandler;
/**
 * Entity resolver. Only stored and returned by this class.
 */
protected XMLEntityResolver fEntityResolver;
/**
 * Error handler that receives reported errors and warnings; starts out as a
 * DefaultErrorHandler and may be set to null, in which case nothing is reported.
 */
protected XMLErrorHandler fErrorHandler = new DefaultErrorHandler();
/**
 * Error reporter registered by the constructor under the error-reporter property.
 */
protected final HTMLErrorReporter fErrorReporter = new ErrorReporter();
/**
 * Registered HTML components (elements are HTMLComponent instances), in registration order.
 * Feature and property changes are forwarded to each of them.
 */
protected final Vector fHTMLComponents = new Vector(2);
/**
 * Locale used for error messages; starts out as the default locale.
 */
protected Locale fLocale = Locale.getDefault();
/**
 * Namespace binder; part of the pipeline when the namespaces feature is enabled.
 */
protected final NamespaceBinder fNamespaceBinder = new NamespaceBinder();
/**
 * HTML tag balancer; part of the pipeline when the balance-tags feature is enabled.
 */
protected final HTMLTagBalancer fTagBalancer = new HTMLTagBalancer();
//
// Constructors
//
/**
 * Default constructor. Registers the scanner, tag balancer and namespace binder as components,
 * declares the features and properties this configuration recognizes, and applies their
 * default values. Extra identifiers are declared when one of the old Xerces 2.0.x / XML4J 4.0.x
 * versions was detected.
 */
public HTMLConfiguration() {
    // built-in components; each one contributes its own recognized features/properties
    addComponent(fDocumentScanner);
    addComponent(fTagBalancer);
    addComponent(fNamespaceBinder);

    // features recognized by the configuration itself, followed by their defaults
    final String validationFeature = "http://xml.org/sax/features/validation";
    addRecognizedFeatures(new String[]{
        AUGMENTATIONS, NAMESPACES, validationFeature, REPORT_ERRORS, SIMPLE_ERROR_FORMAT,
        BALANCE_TAGS});
    setFeature(AUGMENTATIONS, false);
    setFeature(NAMESPACES, true);
    setFeature(validationFeature, false);
    setFeature(REPORT_ERRORS, false);
    setFeature(SIMPLE_ERROR_FORMAT, false);
    setFeature(BALANCE_TAGS, true);

    // HACK: Xerces 2.0.0
    // NOTE: This feature should not be required but it causes a problem if it is not
    //       recognized. Fixed in subsequent releases of Xerces.
    if (XERCES_2_0_0) {
        addRecognizedFeatures(new String[]{
            "http://apache.org/xml/features/scanner/notify-builtin-refs"});
    }

    // HACK: Xerces 2.0.1
    // NOTE: Same story as above for these two features.
    if (XERCES_2_0_0 || XERCES_2_0_1 || XML4J_4_0_x) {
        addRecognizedFeatures(new String[]{
            "http://apache.org/xml/features/validation/schema/normalized-value",
            "http://apache.org/xml/features/scanner/notify-char-refs"});
    }

    // properties recognized by the configuration itself, followed by their defaults
    addRecognizedProperties(new String[]{NAMES_ELEMS, NAMES_ATTRS, FILTERS, ERROR_REPORTER});
    setProperty(NAMES_ELEMS, "upper");
    setProperty(NAMES_ATTRS, "lower");
    setProperty(ERROR_REPORTER, fErrorReporter);

    // HACK: Xerces 2.0.0
    // NOTE: Works around a problem in the Xerces 2.0.0 AbstractSAXParser: if it uses a parser
    //       configuration that does not have a SymbolTable, it removes *all* attributes.
    //       Fixed in subsequent releases of Xerces.
    if (XERCES_2_0_0) {
        final String symbolTableProperty = "http://apache.org/xml/properties/internal/symbol-table";
        final String symbolTableClass = "org.apache.xerces.util.SymbolTable";
        addRecognizedProperties(new String[]{symbolTableProperty});
        setProperty(symbolTableProperty,
                ObjectFactory.createObject(symbolTableClass, symbolTableClass));
    }
} // <init>()
//
// Public methods
//
/**
 * Frees resources allocated during parsing. An application that stops parsing before the
 * document has been fully parsed should call this method, for example so that streams opened
 * by the parser get closed. The scanner is told whether the stream was opened by the parser.
 */
public void cleanup() {
    final boolean openedByParser = fCloseStream;
    fDocumentScanner.cleanup(openedByParser);
} // cleanup()
/**
 * EXPERIMENTAL: may change in next release.
 * <p>
 * Immediately evaluates an input source and adds the new content (e.g. the output written by
 * an embedded script). The work is delegated to the document scanner.
 *
 * @param inputSource The new input source to start scanning.
 * @see #pushInputSource(XMLInputSource)
 */
public void evaluateInputSource(XMLInputSource inputSource) {
    final HTMLScanner scanner = fDocumentScanner;
    scanner.evaluateInputSource(inputSource);
} // evaluateInputSource(XMLInputSource)
// XMLParserConfiguration methods
//
/**
 * Returns the document handler.
 *
 * @return The handler last passed to setDocumentHandler, or <code>null</code> if none was set.
 */
public XMLDocumentHandler getDocumentHandler() {
    return this.fDocumentHandler;
} // getDocumentHandler():XMLDocumentHandler
/**
 * Returns the DTD content model handler.
 *
 * @return The handler last passed to setDTDContentModelHandler, or <code>null</code> if none
 *         was set.
 */
public XMLDTDContentModelHandler getDTDContentModelHandler() {
    return this.fDTDContentModelHandler;
} // getDTDContentModelHandler():XMLDTDContentModelHandler
/**
 * Returns the DTD handler.
 *
 * @return The handler last passed to setDTDHandler, or <code>null</code> if none was set.
 */
public XMLDTDHandler getDTDHandler() {
    return this.fDTDHandler;
} // getDTDHandler():XMLDTDHandler
/**
 * Returns the entity resolver.
 *
 * @return The resolver last passed to setEntityResolver, or <code>null</code> if none was set.
 */
public XMLEntityResolver getEntityResolver() {
    return this.fEntityResolver;
} // getEntityResolver():XMLEntityResolver
/**
 * Returns the error handler.
 *
 * @return The current error handler; a DefaultErrorHandler unless replaced through
 *         setErrorHandler (which may also have set it to <code>null</code>).
 */
public XMLErrorHandler getErrorHandler() {
    return this.fErrorHandler;
} // getErrorHandler():XMLErrorHandler
/**
 * Returns the locale.
 *
 * @return The locale currently used for error messages.
 */
public Locale getLocale() {
    return this.fLocale;
} // getLocale():Locale
/**
 * Parses the document in a pull parsing fashion.
 * <p>
 * {@link #cleanup()} is invoked whenever scanning does not leave more of the document to
 * parse: when the scanner reports the end of the document and when scanning terminates
 * abruptly with any exception or error.
 *
 * @param complete True if the pull parser should parse the remaining document completely.
 * @return True if there is more document to parse.
 * @throws XNIException Any XNI exception, possibly wrapping another exception.
 * @throws IOException An IO exception from the parser, possibly from a byte stream or
 *                     character stream supplied by the parser.
 * @see #setInputSource
 */
public boolean parse(boolean complete) throws XNIException, IOException {
    // Stays false if scanDocument throws, so the finally block also cleans up on failures
    // other than XNIException/IOException (e.g. an unchecked exception raised by a handler
    // in the pipeline), which previously skipped cleanup and could leave a stream open.
    boolean more = false;
    try {
        more = fDocumentScanner.scanDocument(complete);
        return more;
    } finally {
        if (!more) {
            cleanup();
        }
    }
} // parse(boolean):boolean
/**
 * Parses a document completely: installs the input source and then runs the pull parser to
 * the end of the document.
 *
 * @param source The document's input source.
 * @throws XNIException Any XNI exception, possibly wrapping another exception.
 * @throws IOException  An IO exception from the parser.
 */
public void parse(XMLInputSource source) throws XNIException, IOException {
    final boolean untilEndOfDocument = true;
    setInputSource(source);
    parse(untilEndOfDocument);
} // parse(XMLInputSource)
/**
 * Pushes an input source onto the current entity stack. This enables the scanner to
 * transparently scan new content (e.g. the output written by an embedded script). At the end
 * of the current entity, the scanner returns where it left off at the time this entity source
 * was pushed. The work is delegated to the document scanner.
 * <p>
 * <strong>Hint:</strong> To use this feature to insert the output of &lt;SCRIPT&gt; tags,
 * remember to buffer the <em>entire</em> output of the processed instructions before pushing
 * a new input source. Otherwise, events may appear out of sequence.
 *
 * @param inputSource The new input source to start scanning.
 * @see #evaluateInputSource(XMLInputSource)
 */
public void pushInputSource(XMLInputSource inputSource) {
    final HTMLScanner scanner = fDocumentScanner;
    scanner.pushInputSource(inputSource);
} // pushInputSource(XMLInputSource)
/**
 * Sets the document handler. A handler that also implements HTMLTagBalancingListener is
 * additionally registered with the tag balancer as its listener.
 *
 * @param handler The new document handler.
 */
public void setDocumentHandler(XMLDocumentHandler handler) {
    fDocumentHandler = handler;
    if (!(handler instanceof HTMLTagBalancingListener)) {
        // Nothing to register. Note that a listener registered by an earlier call is
        // left in place here.
        return;
    }
    HTMLTagBalancingListener listener = (HTMLTagBalancingListener) handler;
    fTagBalancer.setTagBalancingListener(listener);
} // setDocumentHandler(XMLDocumentHandler)
/**
 * Sets the DTD content model handler. The handler is only stored by this configuration.
 *
 * @param handler The new DTD content model handler.
 */
public void setDTDContentModelHandler(XMLDTDContentModelHandler handler) {
    this.fDTDContentModelHandler = handler;
} // setDTDContentModelHandler(XMLDTDContentModelHandler)
/**
 * Sets the DTD handler. The handler is only stored by this configuration.
 *
 * @param handler The new DTD handler.
 */
public void setDTDHandler(XMLDTDHandler handler) {
    this.fDTDHandler = handler;
} // setDTDHandler(XMLDTDHandler)
/**
 * Sets the entity resolver. The resolver is only stored by this configuration.
 *
 * @param resolver The new entity resolver.
 */
public void setEntityResolver(XMLEntityResolver resolver) {
    this.fEntityResolver = resolver;
} // setEntityResolver(XMLEntityResolver)
/**
 * Sets the error handler that receives reported errors and warnings.
 *
 * @param handler The new error handler; with <code>null</code> the error reporter reports
 *                nothing.
 */
public void setErrorHandler(XMLErrorHandler handler) {
    this.fErrorHandler = handler;
} // setErrorHandler(XMLErrorHandler)
/**
 * Sets a feature on this configuration and forwards the new state to every registered HTML
 * component.
 *
 * @param featureId The feature identifier.
 * @param state     The new state of the feature.
 * @throws XMLConfigurationException Propagated from the superclass or from a component.
 */
public void setFeature(String featureId, boolean state) throws XMLConfigurationException {
    super.setFeature(featureId, state);
    // the component count is read once, before any component is notified
    final int componentCount = fHTMLComponents.size();
    int index = 0;
    while (index < componentCount) {
        HTMLComponent target = (HTMLComponent) fHTMLComponents.elementAt(index);
        target.setFeature(featureId, state);
        index++;
    }
} // setFeature(String,boolean)
//
// XMLPullParserConfiguration methods
//
// parsing
/**
 * Sets the input source for the document to parse. The configuration is reset first; then it
 * is recorded whether the parser will have to close the stream itself, and the source is
 * handed to the document scanner.
 *
 * @param inputSource The document's input source.
 * @throws XMLConfigurationException Thrown if there is a configuration error when
 *                                   initializing the parser.
 * @throws IOException Thrown on I/O error.
 * @see #parse(boolean)
 */
public void setInputSource(XMLInputSource inputSource) throws XMLConfigurationException,
        IOException {
    reset();
    // a stream supplied by the caller is not ours to close
    final boolean suppliedByCaller = inputSource.getByteStream() != null
            || inputSource.getCharacterStream() != null;
    fCloseStream = !suppliedByCaller;
    fDocumentScanner.setInputSource(inputSource);
} // setInputSource(XMLInputSource)
/**
 * Sets the locale used for error messages.
 *
 * @param locale The new locale; <code>null</code> selects the default locale.
 */
public void setLocale(Locale locale) {
    fLocale = (locale != null) ? locale : Locale.getDefault();
} // setLocale(Locale)
/**
 * Sets a property on this configuration and forwards the new value to every registered HTML
 * component. When the filters property is set, each filter that is also an HTMLComponent is
 * first registered as a component, so it receives the forwarded value as well.
 *
 * @param propertyId The property identifier.
 * @param value      The new value of the property.
 * @throws XMLConfigurationException Propagated from the superclass or from a component.
 */
public void setProperty(String propertyId, Object value) throws XMLConfigurationException {
    super.setProperty(propertyId, value);

    if (propertyId.equals(FILTERS)) {
        XMLDocumentFilter[] pipelineFilters = (XMLDocumentFilter[]) getProperty(FILTERS);
        int filterCount = pipelineFilters != null ? pipelineFilters.length : 0;
        for (int f = 0; f < filterCount; f++) {
            XMLDocumentFilter candidate = pipelineFilters[f];
            if (candidate instanceof HTMLComponent) {
                addComponent((HTMLComponent) candidate);
            }
        }
    }

    // the component count is read after any filter components have been registered
    final int componentCount = fHTMLComponents.size();
    int index = 0;
    while (index < componentCount) {
        HTMLComponent target = (HTMLComponent) fHTMLComponents.elementAt(index);
        target.setProperty(propertyId, value);
        index++;
    }
} // setProperty(String,Object)
//
// Protected methods
//
/**
 * Adds a component: appends it to the component list, declares the features and properties it
 * recognizes, and applies every default the component provides for them.
 *
 * @param component The component to register.
 */
protected void addComponent(HTMLComponent component) {
    fHTMLComponents.addElement(component);

    // features: declare them, then apply the non-null defaults
    String[] featureIds = component.getRecognizedFeatures();
    addRecognizedFeatures(featureIds);
    if (featureIds != null) {
        for (int f = 0; f < featureIds.length; f++) {
            Boolean defaultState = component.getFeatureDefault(featureIds[f]);
            if (defaultState == null) {
                continue;
            }
            setFeature(featureIds[f], defaultState.booleanValue());
        }
    }

    // properties: declare them, then apply the non-null defaults
    String[] propertyIds = component.getRecognizedProperties();
    addRecognizedProperties(propertyIds);
    if (propertyIds != null) {
        for (int p = 0; p < propertyIds.length; p++) {
            Object defaultValue = component.getPropertyDefault(propertyIds[p]);
            if (defaultValue == null) {
                continue;
            }
            setProperty(propertyIds[p], defaultValue);
        }
    }
} // addComponent(HTMLComponent)
/**
 * Creates the document scanner used by this configuration. It is invoked while the instance
 * fields are being initialized, so an override must not rely on state set up by the
 * constructor body.
 *
 * @return A new HTMLScanner.
 */
protected HTMLScanner createDocumentScanner() {
    HTMLScanner scanner = new HTMLScanner();
    return scanner;
} // createDocumentScanner():HTMLScanner
//
// Interfaces
//
/**
 * Resets the parser configuration: resets every registered component and then rebuilds the
 * document pipeline as scanner, optional namespace binder, optional tag balancer, the
 * configured filters in order, and finally the document handler.
 *
 * @throws XMLConfigurationException Propagated from a component or a feature/property lookup.
 */
protected void reset() throws XMLConfigurationException {
    // reset components
    final int componentCount = fHTMLComponents.size();
    int index = 0;
    while (index < componentCount) {
        ((HTMLComponent) fHTMLComponents.elementAt(index)).reset(this);
        index++;
    }

    // configure pipeline; "upstream" is always the current tail of the chain
    XMLDocumentSource upstream = fDocumentScanner;
    if (getFeature(NAMESPACES)) {
        upstream.setDocumentHandler(fNamespaceBinder);
        // NOTE(review): the binder's source is set to the tag balancer although the binder
        // is placed before it; the filter loop below uses the upstream component instead.
        // Kept as is -- confirm against NamespaceBinder/HTMLTagBalancer before changing.
        fNamespaceBinder.setDocumentSource(fTagBalancer);
        upstream = fNamespaceBinder;
    }
    if (getFeature(BALANCE_TAGS)) {
        upstream.setDocumentHandler(fTagBalancer);
        // NOTE(review): always the scanner, even when the namespace binder is the actual
        // upstream component; see the note above.
        fTagBalancer.setDocumentSource(fDocumentScanner);
        upstream = fTagBalancer;
    }

    XMLDocumentFilter[] pipelineFilters = (XMLDocumentFilter[]) getProperty(FILTERS);
    int filterCount = pipelineFilters != null ? pipelineFilters.length : 0;
    for (int f = 0; f < filterCount; f++) {
        XMLDocumentFilter next = pipelineFilters[f];
        XercesBridge.getInstance().XMLDocumentFilter_setDocumentSource(next, upstream);
        upstream.setDocumentHandler(next);
        upstream = next;
    }

    // whatever ended up last feeds the application's document handler
    upstream.setDocumentHandler(fDocumentHandler);
} // reset()
} // class HTMLConfiguration