org.apache.html.dom.HTMLBuilder Maven / Gradle / Ivy
Go to download
Xerces2 provides high performance, fully compliant XML parsers in the Apache Xerces family. This new version of Xerces continues to build upon the Xerces Native Interface (XNI), a complete framework for building parser components and configurations that is extremely modular and easy to program.
The Apache Xerces2 parser is the reference implementation of XNI but other parser components, configurations, and parsers can be written using the Xerces Native Interface. For complete design and implementation documents, refer to the XNI Manual.
Xerces2 provides fully conforming XML Schema 1.0 and 1.1 processors. An experimental implementation of the "XML Schema Definition Language (XSD): Component Designators (SCD) Candidate Recommendation (January 2010)" is also provided for evaluation. For more information, refer to the XML Schema page.
Xerces2 also provides a complete implementation of the Document Object Model Level 3 Core and Load/Save W3C Recommendations and provides a complete implementation of the XML Inclusions (XInclude) W3C Recommendation. It also provides support for OASIS XML Catalogs v1.1.
Xerces2 is able to parse documents written according to the XML 1.1 Recommendation, except that it does not yet provide an option to enable normalization checking as described in section 2.13 of this specification. It also handles namespaces according to the XML Namespaces 1.1 Recommendation, and will correctly serialize XML 1.1 documents if the DOM level 3 load/save APIs are in use.
/*
* The Apache Software License, Version 1.1
*
*
* Copyright (c) 1999,2000 The Apache Software Foundation. All rights
* reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Xerces" and "Apache Software Foundation" must
* not be used to endorse or promote products derived from this
* software without prior written permission. For written
* permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* nor may "Apache" appear in their name, without prior written
* permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation and was
* originally based on software copyright (c) 1999, International
* Business Machines, Inc., http://www.apache.org. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
*/
package org.apache.html.dom;
import java.util.Vector;
import org.apache.xerces.dom.ElementImpl;
import org.apache.xerces.dom.ProcessingInstructionImpl;
import org.apache.xerces.dom.TextImpl;
import org.w3c.dom.Node;
import org.w3c.dom.html.HTMLDocument;
import org.xml.sax.AttributeList;
import org.xml.sax.DocumentHandler;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
/**
* This is a SAX document handler that is used to build an HTML document.
* It can build a document from any SAX parser, but is specifically tuned
* for working with the OpenXML HTML parser.
*
*
* @version $Revision: 1.6 $ $Date: 2003/05/08 20:13:09 $
* @author Assaf Arkin
*/
public class HTMLBuilder
    implements DocumentHandler
{


    /**
     * The document that is being built. It is created when the first
     * element is started and is null between {@link #startDocument} and
     * that first element.
     */
    protected HTMLDocumentImpl    _document;


    /**
     * The current element in the document into which elements, text and
     * other nodes will be inserted. It becomes the document element when
     * the first element starts, follows each element that is currently
     * being parsed, and is null whenever no element is open.
     */
    protected ElementImpl        _current;


    /**
     * A reference to the current locator, this is generally the parser
     * itself. The locator is used to locate errors and identify the
     * source locations of elements.
     */
    private Locator         _locator;


    /**
     * Applies only to whitespace appearing between element tags in element content,
     * as per the SAX definition, and true by default.
     */
    private boolean         _ignoreWhitespace = true;


    /**
     * Indicates whether finished building a document. If so, can start building
     * another document. Must be initially true to get the first document processed.
     */
    private boolean         _done = true;


    /**
     * The document is only created the same time as the document element, however, certain
     * nodes may precede the document element (comment and PI), and they are accumulated
     * in this vector, in the order in which they were reported.
     */
    protected Vector         _preRootNodes;


    /**
     * Begins a new document. The previous document reference is dropped;
     * the new document itself is only created by the first call to
     * {@link #startElement}.
     *
     * @throws SAXException (HTM001) if the previous document has not been ended
     */
    public void startDocument()
        throws SAXException
    {
        if ( ! _done )
            throw new SAXException( "HTM001 State error: startDocument fired twice on one builder." );
        _document = null;
        _done = false;
    }


    /**
     * Ends the current document and marks the builder as ready for another one.
     *
     * @throws SAXException (HTM002) if no document was ever created, or
     *  (HTM003) if an element is still open
     */
    public void endDocument()
        throws SAXException
    {
        if ( _document == null )
            throw new SAXException( "HTM002 State error: document never started or missing document element." );
        if ( _current != null )
            throw new SAXException( "HTM003 State error: document ended before end of document element." );
        _current = null;
        _done = true;
    }


    /**
     * Starts a new element. The first element reported creates the document and
     * is mapped onto the document element returned by the new document (the tag
     * name is not used to create it); every later element is created from
     * <code>tagName</code> and appended to the current element.
     *
     * @param tagName The element tag name, must not be null
     * @param attrList The attributes to set on the element, may be null
     * @throws SAXException (HTM004) if <code>tagName</code> is null, (HTM005) if the
     *  new document has no document element, or (HTM006) if the document element
     *  has already been closed
     */
    public synchronized void startElement( String tagName, AttributeList attrList )
        throws SAXException
    {
        ElementImpl elem;
        Node        node;
        int         count;
        int         i;

        if ( tagName == null )
            throw new SAXException( "HTM004 Argument 'tagName' is null." );

        // If this is the root element, this is the time to create a new document,
        // because only now do we know that a document element is present.
        if ( _document == null )
        {
            // No need to create the element explicitly.
            _document = new HTMLDocumentImpl();
            elem = (ElementImpl) _document.getDocumentElement();
            _current = elem;
            if ( _current == null )
                throw new SAXException( "HTM005 State error: Document.getDocumentElement returns null." );

            // Insert nodes (comment and PI) that appear before the root element.
            // Each node goes immediately before the root, so walking the vector
            // front to back keeps the nodes in the order they were reported.
            if ( _preRootNodes != null )
            {
                count = _preRootNodes.size();
                for ( i = 0 ; i < count ; ++i )
                {
                    node = (Node) _preRootNodes.elementAt( i );
                    // Nodes queued before the document existed were created
                    // without an owner document; bring them into this document
                    // rather than inserting a foreign node.
                    if ( node.getOwnerDocument() != _document )
                        node = _document.importNode( node, true );
                    _document.insertBefore( node, elem );
                }
                _preRootNodes = null;
            }
        }
        else
        {
            // This is a state error, indicates that document has been parsed in full,
            // or that there are two root elements.
            if ( _current == null )
                throw new SAXException( "HTM006 State error: startElement called after end of document element." );
            elem = (ElementImpl) _document.createElement( tagName );
            _current.appendChild( elem );
            _current = elem;
        }

        // Add the attributes (specified and not-specified) to this element.
        if ( attrList != null )
        {
            count = attrList.getLength();
            for ( i = 0 ; i < count ; ++i )
                elem.setAttribute( attrList.getName( i ), attrList.getValue( i ) );
        }
    }


    /**
     * Ends the current element and makes its parent the current element.
     * Tag names are compared ignoring case.
     *
     * @param tagName The tag name of the element being closed
     * @throws SAXException (HTM007) if no element is open, or (HTM008) if
     *  <code>tagName</code> does not match the element that is open
     */
    public void endElement( String tagName )
        throws SAXException
    {
        String expected;

        if ( _current == null )
            throw new SAXException( "HTM007 State error: endElement called with no current node." );
        expected = _current.getNodeName();
        if ( ! expected.equalsIgnoreCase( tagName ) )
            throw new SAXException( "HTM008 State error: mismatch in closing tag name " + tagName +
                                    " (expected " + expected + ")" );

        // Move up to the parent element. When you reach the top (closing the root element),
        // the parent is document and current is null.
        if ( _current.getParentNode() == _current.getOwnerDocument() )
            _current = null;
        else
            _current = (ElementImpl) _current.getParentNode();
    }


    /**
     * Appends a text node holding <code>text</code> to the current element.
     *
     * @param text The character data
     * @throws SAXException (HTM009) if no element is open
     */
    public void characters( String text )
        throws SAXException
    {
        if ( _current == null )
            throw new SAXException( "HTM009 State error: character data found outside of root element." );
        _current.appendChild( new TextImpl( _document, text ) );
    }


    /**
     * Appends a text node holding the given range of characters to the current element.
     *
     * @param text The character buffer
     * @param start Index of the first character to use
     * @param length Number of characters to use
     * @throws SAXException (HTM010) if no element is open
     */
    public void characters( char[] text, int start, int length )
        throws SAXException
    {
        if ( _current == null )
            throw new SAXException( "HTM010 State error: character data found outside of root element." );
        _current.appendChild( new TextImpl( _document, new String( text, start, length ) ) );
    }


    /**
     * Handles whitespace in element content. The whitespace is dropped unless
     * whitespace is not being ignored and an element is currently open, in
     * which case it is appended to that element as a text node.
     *
     * @param text The character buffer
     * @param start Index of the first character to use
     * @param length Number of characters to use
     */
    public void ignorableWhitespace( char[] text, int start, int length )
        throws SAXException
    {
        // There is nowhere to attach whitespace reported outside of an open element.
        if ( ! _ignoreWhitespace && _current != null )
            _current.appendChild( new TextImpl( _document, new String( text, start, length ) ) );
    }


    /**
     * Adds a processing instruction. Before the document exists it is queued
     * in {@link #_preRootNodes}; after the document element has been closed it
     * is appended to the document; otherwise it is appended to the current element.
     *
     * @param target The processing instruction target
     * @param instruction The processing instruction data
     */
    public void processingInstruction( String target, String instruction )
        throws SAXException
    {
        // Processing instruction may appear before the document element (in fact, before the
        // document has been created), or after the document element has been closed.
        if ( _current != null )
            _current.appendChild( new ProcessingInstructionImpl( _document, target, instruction ) );
        else
        if ( _document != null )
            _document.appendChild( new ProcessingInstructionImpl( _document, target, instruction ) );
        else
        {
            if ( _preRootNodes == null )
                _preRootNodes = new Vector();
            _preRootNodes.addElement( new ProcessingInstructionImpl( null, target, instruction ) );
        }
    }


    /**
     * Returns the document built so far.
     *
     * @return The document, or null if no element has been started since
     *  the last call to {@link #startDocument}
     */
    public HTMLDocument getHTMLDocument()
    {
        return (HTMLDocument) _document;
    }


    /**
     * Records the locator supplied by the parser.
     *
     * @param locator The locator
     */
    public void setDocumentLocator( Locator locator )
    {
        _locator = locator;
    }


}