
// org.enhydra.xml.xmlc.html.parsers.HTMLDocBuilder Maven / Gradle / Ivy
/*
* Enhydra Java Application Server Project
*
* The contents of this file are subject to the Enhydra Public License
* Version 1.1 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License on
* the Enhydra web site ( http://www.enhydra.org/ ).
*
* Software distributed under the License is distributed on an "AS IS"
* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
* the License for the specific terms governing rights and limitations
* under the License.
*
* The Initial Developer of the Enhydra Application Server is Lutris
* Technologies, Inc. The Enhydra Application Server and portions created
* by Lutris Technologies, Inc. are Copyright Lutris Technologies, Inc.
* All Rights Reserved.
*
* Contributor(s):
*
* $Id: HTMLDocBuilder.java,v 1.2 2005/01/26 08:29:24 jkjome Exp $
*/
package org.enhydra.xml.xmlc.html.parsers;
import java.util.HashSet;
import java.util.Locale;
import org.enhydra.xml.xmlc.XMLCError;
import org.enhydra.xml.xmlc.XMLCException;
import org.enhydra.xml.xmlc.dom.XMLCDocument;
import org.enhydra.xml.xmlc.dom.XMLCDomFactory;
import org.w3c.dom.Comment;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.html.HTMLDocument;
import org.xml.sax.InputSource;
/**
* Class used by HTML parser to build a DOM.
*
* The document builder functions assume they are being called in the order the
* document is parsed. They keep a current node where new child nodes are
* appended.
*/
public class HTMLDocBuilder {
    /**
     * XMLC Document object.
     */
    private final XMLCDocument fXmlcDoc;

    /**
     * Factory for creating the document.
     */
    private final XMLCDomFactory fDomFactory;

    /**
     * The document.
     */
    private final HTMLDocument fDocument;

    /**
     * Have we got the parser callback for the document element.
     * This is used to determine where to insert comments, since the
     * document element pre-exists.
     */
    private boolean fGotDocElement;

    /**
     * The current node that is being constructed.  This functions as a stack
     * during document construction: opening an element pushes it, closing
     * an element moves back to its parent.
     */
    private Node fCurrentNode;

    /**
     * Table used to determine what tags have been closed by
     * fixUnrecognizedTagNesting.  Created lazily on the first call to
     * that method.
     */
    private HashSet fClosedUnrecognizedElements = null;

    /**
     * Constructor.  Creates the XMLCDocument object and the DOM document,
     * and makes the document the current node.
     *
     * @param domFactory Factory used to create the document.
     * @param input Input source being parsed; if it specifies an encoding,
     *  the encoding is recorded in the XMLC document.
     * @throws XMLCException If the factory creates a document that is not
     *  a HTMLDocument.
     */
    public HTMLDocBuilder(XMLCDomFactory domFactory,
                          InputSource input) throws XMLCException {
        fXmlcDoc = new XMLCDocument(domFactory);
        fDomFactory = domFactory;

        Document doc = fXmlcDoc.createDocument(null, null);
        if (!(doc instanceof HTMLDocument)) {
            throw new XMLCException("DOM factory ("
                                    + fDomFactory.getClass().getName()
                                    + ") created a document that was not a HTMLDocument, got "
                                    + doc.getClass().getName());
        }
        fDocument = (HTMLDocument)doc;
        fCurrentNode = fDocument;

        String encoding = input.getEncoding();
        if (encoding != null) {
            fXmlcDoc.setEncoding(encoding);
        }
    }

    /**
     * Generate an error about a parser event on the document contents
     * arriving before the document has been created.
     */
    private void docNotCreatedError() {
        throw new XMLCError("Bug: parser event on document contents occured before document is created");
    }

    /**
     * Get the XMLC document associated with this object.
     */
    public XMLCDocument getXMLCDocument() {
        return fXmlcDoc;
    }

    /**
     * Determine if an element name is a frameset-only element.
     * The comparison ignores case.
     */
    private boolean isFrameSetElement(String tagName) {
        return tagName.equalsIgnoreCase("frameset")
            || tagName.equalsIgnoreCase("noframes");
    }

    /**
     * Start a new Element.  The new element becomes the current node.
     *
     * @param tagName Name of the tag being opened.
     */
    public void startElement(String tagName) {
        // The tag name is matched ignoring case, consistent with
        // isFrameSetElement; creating a second root element for an
        // upper-case HTML tag could not succeed anyway.
        if (tagName.equalsIgnoreCase("html")) {
            // Document element already exists
            fCurrentNode = fDocument.getDocumentElement();
            fGotDocElement = true;
        } else {
            Element element = fDocument.createElement(tagName);
            fCurrentNode.appendChild(element);
            fCurrentNode = element;
        }
        if (isFrameSetElement(tagName)) {
            fXmlcDoc.setIsHtmlFrameSet();
        }
    }

    /**
     * Add an attribute to the element on the top of the
     * stack.
     *
     * @param name Attribute name.
     * @param value Attribute value.
     */
    public void addAttribute(String name, String value) {
        // Report a misplaced attribute the same way finishElement reports
        // a misplaced end tag, rather than failing with a ClassCastException.
        if (!(fCurrentNode instanceof Element)) {
            throw new XMLCError("DOM node top of stack not a element for attribute "
                                + name);
        }
        ((Element)fCurrentNode).setAttribute(name, value);
    }

    /**
     * Finish the element being constructed.  The parent of the current
     * node becomes the current node.
     */
    public void finishElement() {
        if (fCurrentNode == null) {
            throw new XMLCError("node stack underflow; malformed document");
        }
        if (!(fCurrentNode instanceof Element)) {
            throw new XMLCError("DOM node top of stack not a element for end tag");
        }
        fCurrentNode = fCurrentNode.getParentNode();
    }

    /**
     * Add a <code>Text</code> node as the last child of the current node.
     *
     * @param data Text contents of the node.
     */
    public void addTextNode(String data) {
        if (fDocument == null) {
            docNotCreatedError();
        }
        fCurrentNode.appendChild(fDocument.createTextNode(data));
    }

    /**
     * Add a <code>Comment</code> node.
     *
     * @param data Text of the comment.
     */
    public void addComment(String data) {
        Comment comment = fDocument.createComment(data);

        // Handle insertion before document element (current should always
        // be document, but we might be handling some invalid node).
        if ((!fGotDocElement) && (fCurrentNode == fDocument)) {
            fCurrentNode.insertBefore(comment, fDocument.getDocumentElement());
        } else {
            fCurrentNode.appendChild(comment);
        }
    }

    /**
     * Get the node on the top of the stack during parsing.
     * FIXME: Added to work around bugs in the swing parser.
     */
    public Node getCurrentNode() {
        return fCurrentNode;
    }

    /**
     * Pop the current node off of the stack.  This is *only* used
     * during error recover from a broken parser.
     * FIXME: Added to work around bugs in the swing parser.
     */
    public void popCurrentNode() {
        // Same underflow report as finishElement, instead of a bare
        // NullPointerException.
        if (fCurrentNode == null) {
            throw new XMLCError("node stack underflow; malformed document");
        }
        fCurrentNode = fCurrentNode.getParentNode();
    }

    /**
     * Recursive part of findUnrecognizedTag.  Checks the children of
     * parent, then repeats the search on each ancestor in turn.
     *
     * @param tagNameUpper Upper-case tag name to match against node names.
     * @param parent Node whose children are searched.
     * @return The matching node that has not already been closed, or null
     *  if none is found.
     */
    private Node recursiveFindUnrecognizedTag(String tagNameUpper,
                                              Node parent) {
        // Search right to left.
        for (Node child = parent.getLastChild(); child != null;
             child = child.getPreviousSibling()) {
            if (child.getNodeName().equals(tagNameUpper)
                && !fClosedUnrecognizedElements.contains(child)) {
                return child;  // Found it!
            }
        }

        // Search up the tree.
        Node grandParent = parent.getParentNode();
        if (grandParent != null) {
            return recursiveFindUnrecognizedTag(tagNameUpper, grandParent);
        } else {
            return null;
        }
    }

    /**
     * Find the element for an unrecognized tag.  The search starts with the
     * children of the current node, scanned from right to left, and then
     * continues in the same way with the children of each ancestor, moving
     * up the parse stack.
     *
     * @param tagNameUpper Upper-case name of the tag being closed.
     * @return The element created for the opening tag.
     * @throws XMLCException If no matching opening element is found.
     */
    private Node findUnrecognizedTag(String tagNameUpper) throws XMLCException {
        Node openingElement = null;
        if (fCurrentNode != null) {
            openingElement = recursiveFindUnrecognizedTag(tagNameUpper,
                                                          fCurrentNode);
        }
        if (openingElement == null) {
            throw new XMLCException("could not find matching opening tag for </"
                                    + tagNameUpper + ">");
        }
        if (openingElement.getFirstChild() != null) {
            throw new XMLCError("attempt to fix nesting for <"
                                + tagNameUpper
                                + "> found a node that already has children");
        }
        return openingElement;
    }

    /**
     * Make nodes to the right of an element the element's children.
     */
    private void makeRightSiblingsChildren(Node openingElement) {
        Node sibling;
        // Appending a sibling moves it under openingElement, so the next
        // call to getNextSibling returns the following sibling.
        while ((sibling = openingElement.getNextSibling()) != null) {
            openingElement.appendChild(sibling);
        }
    }

    /**
     * Used to correct nesting when handling an unknown tag.  This is called
     * when the end tag is encountered.  The tree is walked backwards from the
     * top of the stack to find the element pushed for the open tag.  All of
     * the siblings to the right of that element are moved to be children of
     * the element.  The stack is popped back until the parent of the
     * element being closed is on top.  This was put in to support the
     * swing parser.
     *
     * @param tagName Name of the tag being closed.
     * @throws XMLCException If no matching opening element is found.
     */
    public void fixUnrecognizedTagNesting(String tagName)
        throws XMLCException {
        // Use a fixed locale so the result does not depend on the default
        // locale's casing rules (e.g. the Turkish dotted capital I).
        String tagNameUpper = tagName.toUpperCase(Locale.ENGLISH);
        if (fClosedUnrecognizedElements == null) {
            fClosedUnrecognizedElements = new HashSet();
        }

        // Find and correct
        Node openingElement = findUnrecognizedTag(tagNameUpper);
        makeRightSiblingsChildren(openingElement);
        fClosedUnrecognizedElements.add(openingElement);

        // Clean up the stack
        Node openingParent = openingElement.getParentNode();
        while (fCurrentNode != openingParent) {
            popCurrentNode();
        }
    }
}