All Downloads are FREE. Search and download functionalities are using the official Maven repository.

nu.validator.htmlparser.dom.HtmlDocumentBuilder Maven / Gradle / Ivy

/*
 * Copyright (c) 2007 Henri Sivonen
 * Copyright (c) 2007-2008 Mozilla Foundation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a 
 * copy of this software and associated documentation files (the "Software"), 
 * to deal in the Software without restriction, including without limitation 
 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
 * and/or sell copies of the Software, and to permit persons to whom the 
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in 
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
 * DEALINGS IN THE SOFTWARE.
 */

package nu.validator.htmlparser.dom;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

import nu.validator.htmlparser.common.DoctypeExpectation;
import nu.validator.htmlparser.common.DocumentModeHandler;
import nu.validator.htmlparser.common.Heuristics;
import nu.validator.htmlparser.common.XmlViolationPolicy;
import nu.validator.htmlparser.io.Driver;

import org.w3c.dom.DOMImplementation;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentFragment;
import org.xml.sax.EntityResolver;
import org.xml.sax.ErrorHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

/**
 * This class implements an HTML5 parser that exposes data through the DOM
 * interface.
 *
 * <p>By default, when using the constructor without arguments, this parser
 * coerces XML 1.0-incompatible infosets into XML 1.0 infosets. This
 * corresponds to <code>ALTER_INFOSET</code> as the general XML violation
 * policy. To make the parser support non-conforming HTML fully per the HTML 5
 * spec while on the other hand potentially violating the DOM API contract,
 * set the general XML violation policy to <code>ALLOW</code>. <em>This does
 * not work with a standard DOM implementation.</em> Halting on
 * XML-incompatible parser outputs is possible by setting the general XML
 * violation policy to <code>FATAL</code>.
 *
 * <p>The doctype is not represented in the tree.
 *
 * <p>The document mode is represented as user data <code>DocumentMode</code>
 * object with the key <code>nu.validator.document-mode</code> on the document
 * node.
 *
 * <p>The form pointer is also stored as user data with the key
 * <code>nu.validator.form-pointer</code>.
 *
 * @version $Id: HtmlDocumentBuilder.java 391 2008-08-06 19:34:39Z hsivonen $
 * @author hsivonen
 */
public class HtmlDocumentBuilder extends DocumentBuilder {

    /**
     * Returns the JAXP DOM implementation.
     *
     * <p>A namespace-aware {@link DocumentBuilderFactory} is used to obtain
     * it. A {@link ParserConfigurationException} is rethrown wrapped in a
     * {@link RuntimeException}.
     *
     * @return the JAXP DOM implementation
     */
    private static DOMImplementation jaxpDOMImplementation() {
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        factory.setNamespaceAware(true);
        DocumentBuilder builder;
        try {
            builder = factory.newDocumentBuilder();
        } catch (ParserConfigurationException e) {
            throw new RuntimeException(e);
        }
        return builder.getDOMImplementation();
    }

    /**
     * The tokenizer.
     */
    private final Driver tokenizer;

    /**
     * The tree builder.
     */
    private final DOMTreeBuilder domTreeBuilder;

    /**
     * The DOM impl.
     */
    private final DOMImplementation implementation;

    /**
     * The entity resolver; <code>null</code> until one is set.
     */
    private EntityResolver entityResolver;

    /**
     * Instantiates the document builder with a specific DOM
     * implementation and XML violation policy.
     *
     * <p>The xmlns policy of the tokenizer is always set to
     * <code>ALTER_INFOSET</code>; the given policy is applied through
     * {@link #setXmlPolicy(XmlViolationPolicy)}.
     *
     * @param implementation
     *            the DOM implementation
     * @param xmlPolicy the policy
     */
    public HtmlDocumentBuilder(DOMImplementation implementation,
            XmlViolationPolicy xmlPolicy) {
        this.implementation = implementation;
        this.domTreeBuilder = new DOMTreeBuilder(implementation);
        this.tokenizer = new Driver(domTreeBuilder);
        this.tokenizer.setXmlnsPolicy(XmlViolationPolicy.ALTER_INFOSET);
        setXmlPolicy(xmlPolicy);
    }

    /**
     * Instantiates the document builder with a specific DOM implementation
     * and the infoset-altering XML violation policy.
     *
     * @param implementation
     *            the DOM implementation
     */
    public HtmlDocumentBuilder(DOMImplementation implementation) {
        this(implementation, XmlViolationPolicy.ALTER_INFOSET);
    }

    /**
     * Instantiates the document builder with the JAXP DOM implementation
     * and the infoset-altering XML violation policy.
     */
    public HtmlDocumentBuilder() {
        this(XmlViolationPolicy.ALTER_INFOSET);
    }

    /**
     * Instantiates the document builder with the JAXP DOM implementation
     * and a specific XML violation policy.
     *
     * @param xmlPolicy the policy
     */
    public HtmlDocumentBuilder(XmlViolationPolicy xmlPolicy) {
        this(jaxpDOMImplementation(), xmlPolicy);
    }

    /**
     * Returns the DOM implementation.
     *
     * @return the DOM implementation
     * @see javax.xml.parsers.DocumentBuilder#getDOMImplementation()
     */
    @Override
    public DOMImplementation getDOMImplementation() {
        return implementation;
    }

    /**
     * Returns <code>true</code>.
     *
     * @return <code>true</code>
     * @see javax.xml.parsers.DocumentBuilder#isNamespaceAware()
     */
    @Override
    public boolean isNamespaceAware() {
        return true;
    }

    /**
     * Returns <code>false</code>.
     *
     * @return <code>false</code>
     * @see javax.xml.parsers.DocumentBuilder#isValidating()
     */
    @Override
    public boolean isValidating() {
        return false;
    }

    /**
     * For API compatibility. Creates a document through the DOM
     * implementation with no namespace URI, no qualified name and no doctype.
     *
     * @return the new document
     * @see javax.xml.parsers.DocumentBuilder#newDocument()
     */
    @Override
    public Document newDocument() {
        return implementation.createDocument(null, null, null);
    }

    /**
     * Parses a document from a SAX <code>InputSource</code>.
     *
     * @param is the source
     * @return the doc
     * @throws SAXException if stuff goes wrong
     * @throws IOException if IO goes wrong
     * @throws IllegalArgumentException if the source is <code>null</code> or
     *             has no byte stream, no character stream and no system ID
     * @see javax.xml.parsers.DocumentBuilder#parse(org.xml.sax.InputSource)
     */
    @Override
    public Document parse(InputSource is) throws SAXException, IOException {
        // A null fragment context selects whole-document parsing.
        domTreeBuilder.setFragmentContext(null);
        tokenize(is);
        return domTreeBuilder.getDocument();
    }

    /**
     * Parses a document fragment from a SAX <code>InputSource</code>.
     *
     * @param is the source
     * @param context the context element name; must not be <code>null</code>
     * @return the document fragment
     * @throws SAXException if stuff goes wrong
     * @throws IOException if IO goes wrong
     * @throws IllegalArgumentException if the source is <code>null</code> or
     *             has no byte stream, no character stream and no system ID
     */
    public DocumentFragment parseFragment(InputSource is, String context)
            throws IOException, SAXException {
        domTreeBuilder.setFragmentContext(context.intern());
        tokenize(is);
        return domTreeBuilder.getDocumentFragment();
    }

    /**
     * Sets the entity resolver for URI-only inputs.
     *
     * @param resolver the resolver
     * @see javax.xml.parsers.DocumentBuilder#setEntityResolver(org.xml.sax.EntityResolver)
     */
    @Override
    public void setEntityResolver(EntityResolver resolver) {
        this.entityResolver = resolver;
    }

    /**
     * Sets the error handler on both the tree builder and the tokenizer.
     *
     * @param errorHandler the handler
     * @see javax.xml.parsers.DocumentBuilder#setErrorHandler(org.xml.sax.ErrorHandler)
     */
    @Override
    public void setErrorHandler(ErrorHandler errorHandler) {
        domTreeBuilder.setErrorHandler(errorHandler);
        tokenizer.setErrorHandler(errorHandler);
    }

    /**
     * Sets whether comment nodes appear in the tree.
     *
     * @param ignoreComments <code>true</code> to ignore comments
     * @see nu.validator.htmlparser.impl.TreeBuilder#setIgnoringComments(boolean)
     */
    public void setIgnoringComments(boolean ignoreComments) {
        domTreeBuilder.setIgnoringComments(ignoreComments);
    }

    /**
     * Sets whether the parser considers scripting to be enabled for noscript
     * treatment.
     *
     * @param scriptingEnabled <code>true</code> to enable
     * @see nu.validator.htmlparser.impl.TreeBuilder#setScriptingEnabled(boolean)
     */
    public void setScriptingEnabled(boolean scriptingEnabled) {
        domTreeBuilder.setScriptingEnabled(scriptingEnabled);
    }

    /**
     * Toggles the checking of the NFC normalization of source.
     *
     * @param enable <code>true</code> to check normalization
     * @see nu.validator.htmlparser.impl.Tokenizer#setCheckingNormalization(boolean)
     */
    public void setCheckingNormalization(boolean enable) {
        tokenizer.setCheckingNormalization(enable);
    }

    /**
     * Sets the policy for consecutive hyphens in comments.
     *
     * @param commentPolicy the policy
     * @see nu.validator.htmlparser.impl.Tokenizer#setCommentPolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
     */
    public void setCommentPolicy(XmlViolationPolicy commentPolicy) {
        tokenizer.setCommentPolicy(commentPolicy);
    }

    /**
     * Sets the policy for non-XML characters except white space.
     *
     * @param contentNonXmlCharPolicy the policy
     * @see nu.validator.htmlparser.impl.Tokenizer#setContentNonXmlCharPolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
     */
    public void setContentNonXmlCharPolicy(
            XmlViolationPolicy contentNonXmlCharPolicy) {
        tokenizer.setContentNonXmlCharPolicy(contentNonXmlCharPolicy);
    }

    /**
     * Sets the policy for non-XML white space.
     *
     * @param contentSpacePolicy the policy
     * @see nu.validator.htmlparser.impl.Tokenizer#setContentSpacePolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
     */
    public void setContentSpacePolicy(XmlViolationPolicy contentSpacePolicy) {
        tokenizer.setContentSpacePolicy(contentSpacePolicy);
    }

    /**
     * Whether the HTML 4 mode reports boolean attributes in a way that repeats
     * the name in the value.
     *
     * @param html4ModeCompatibleWithXhtml1Schemata the flag value forwarded
     *            to the tokenizer
     */
    public void setHtml4ModeCompatibleWithXhtml1Schemata(
            boolean html4ModeCompatibleWithXhtml1Schemata) {
        tokenizer.setHtml4ModeCompatibleWithXhtml1Schemata(html4ModeCompatibleWithXhtml1Schemata);
    }

    /**
     * Whether to map the HTML <code>lang</code> attribute to
     * <code>xml:lang</code>.
     *
     * @param mappingLangToXmlLang <code>true</code> to map lang to xml:lang
     * @see nu.validator.htmlparser.impl.Tokenizer#setMappingLangToXmlLang(boolean)
     */
    public void setMappingLangToXmlLang(boolean mappingLangToXmlLang) {
        tokenizer.setMappingLangToXmlLang(mappingLangToXmlLang);
    }

    /**
     * Sets the policy for dealing with names that aren't XML 1.0 4th ed. plus
     * Namespaces NCNames. The policy is applied to both the tokenizer and the
     * tree builder.
     *
     * @param namePolicy the policy
     * @see nu.validator.htmlparser.impl.Tokenizer#setNamePolicy(nu.validator.htmlparser.common.XmlViolationPolicy)
     */
    public void setNamePolicy(XmlViolationPolicy namePolicy) {
        tokenizer.setNamePolicy(namePolicy);
        domTreeBuilder.setNamePolicy(namePolicy);
    }

    /**
     * This is a catch-all convenience method for setting name, content space,
     * content non-XML char and comment policies in one go.
     *
     * @param xmlPolicy the policy
     */
    public void setXmlPolicy(XmlViolationPolicy xmlPolicy) {
        setNamePolicy(xmlPolicy);
        setContentSpacePolicy(xmlPolicy);
        setContentNonXmlCharPolicy(xmlPolicy);
        setCommentPolicy(xmlPolicy);
    }

    /**
     * Does nothing.
     *
     * @param bogusXmlnsPolicy ignored
     * @deprecated this method has no effect
     */
    @Deprecated
    public void setBogusXmlnsPolicy(XmlViolationPolicy bogusXmlnsPolicy) {
    }

    /**
     * Sets the doctype expectation.
     *
     * @param doctypeExpectation
     *            the doctypeExpectation to set
     * @see nu.validator.htmlparser.impl.TreeBuilder#setDoctypeExpectation(nu.validator.htmlparser.common.DoctypeExpectation)
     */
    public void setDoctypeExpectation(DoctypeExpectation doctypeExpectation) {
        domTreeBuilder.setDoctypeExpectation(doctypeExpectation);
    }

    /**
     * Sets the document mode handler.
     *
     * @param documentModeHandler the handler forwarded to the tree builder
     * @see nu.validator.htmlparser.impl.TreeBuilder#setDocumentModeHandler(nu.validator.htmlparser.common.DocumentModeHandler)
     */
    public void setDocumentModeHandler(DocumentModeHandler documentModeHandler) {
        domTreeBuilder.setDocumentModeHandler(documentModeHandler);
    }

    /**
     * Sets the encoding sniffing heuristics.
     *
     * @param heuristics the heuristics to set
     * @see nu.validator.htmlparser.impl.Tokenizer#setHeuristics(nu.validator.htmlparser.common.Heuristics)
     */
    public void setHeuristics(Heuristics heuristics) {
        tokenizer.setHeuristics(heuristics);
    }

    /**
     * Tokenizes the input source.
     *
     * <p>If the source carries neither a byte stream nor a character stream,
     * its system ID is used to obtain one: first through the entity resolver
     * (when one is set), and otherwise, or when the resolver yields nothing
     * usable, by opening the system ID as a URL.
     *
     * @param is the source
     * @throws SAXException if stuff goes wrong
     * @throws IOException if IO goes wrong
     * @throws MalformedURLException if the system ID is malformed and the
     *             entity resolver did not supply a stream
     * @throws IllegalArgumentException if the source is <code>null</code> or
     *             has no byte stream, no character stream and no system ID
     */
    private void tokenize(InputSource is) throws SAXException, IOException,
            MalformedURLException {
        if (is == null) {
            throw new IllegalArgumentException("Null input.");
        }
        if (is.getByteStream() == null && is.getCharacterStream() == null) {
            String systemId = is.getSystemId();
            if (systemId == null) {
                throw new IllegalArgumentException(
                        "No byte stream, no character stream nor URI.");
            }
            InputSource resolved = null;
            if (entityResolver != null) {
                resolved = entityResolver.resolveEntity(is.getPublicId(),
                        systemId);
            }
            // The resolver may return null to request default handling, and a
            // resolved source normally carries only ONE of the two streams.
            // Fall back to opening the URL only when no stream is available.
            if (resolved == null
                    || (resolved.getByteStream() == null
                            && resolved.getCharacterStream() == null)) {
                resolved = new InputSource();
                resolved.setSystemId(systemId);
                resolved.setByteStream(new URL(systemId).openStream());
            }
            is = resolved;
        }
        tokenizer.tokenize(is);
    }
}





© 2015 - 2025 Weber Informatics LLC | Privacy Policy