// com.googlecode.html.HTMLTagBalancer Maven / Gradle / Ivy
/*
* Copyright 2002-2009 Andy Clark, Marc Guillemot
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
* in compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License
* is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing permissions and limitations under
* the License.
*/
package com.googlecode.html;
import com.googlecode.html.HTMLElements.Element;
import com.googlecode.html.filters.NamespaceBinder;
import com.googlecode.html.xercesbridge.XercesBridge;
import org.apache.xerces.util.XMLAttributesImpl;
import org.apache.xerces.xni.*;
import org.apache.xerces.xni.parser.XMLComponentManager;
import org.apache.xerces.xni.parser.XMLConfigurationException;
import org.apache.xerces.xni.parser.XMLDocumentFilter;
import org.apache.xerces.xni.parser.XMLDocumentSource;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
/**
 * Balances tags in an HTML document. This component receives document events and tries to correct
 * many common mistakes that human (and computer) HTML document authors make. This tag balancer can:
 * <ul>
 * <li>add missing parent elements;
 * <li>automatically close elements with optional end tags; and
 * <li>handle mis-matched inline element tags.
 * </ul>
 * <p>
 * This component recognizes the following features:
 * <ul>
 * <li>http://cyberneko.org/html/features/augmentations
 * <li>http://cyberneko.org/html/features/report-errors
 * <li>http://cyberneko.org/html/features/balance-tags/document-fragment
 * <li>http://cyberneko.org/html/features/balance-tags/ignore-outside-content
 * </ul>
 * <p>
 * This component recognizes the following properties:
 * <ul>
 * <li>http://cyberneko.org/html/properties/names/elems
 * <li>http://cyberneko.org/html/properties/names/attrs
 * <li>http://cyberneko.org/html/properties/error-reporter
 * <li>http://cyberneko.org/html/properties/balance-tags/fragment-context-stack
 * </ul>
 *
 * @author Andy Clark
 * @author Marc Guillemot
 * @version $Id: HTMLTagBalancer.java,v 1.20 2005/02/14 04:06:22 andyc Exp $
 * @see HTMLElements
 */
public class HTMLTagBalancer implements XMLDocumentFilter, HTMLComponent {
//
// Constants
//
// features
/**
 * Element info for each start element. This information is used when closing unbalanced inline
 * elements. For example:
 *
 * <pre>
 * &lt;i&gt;unbalanced &lt;b&gt;HTML&lt;/i&gt; content&lt;/b&gt;
 * </pre>
 *
 * It seems that it is a waste of processing and memory to copy the attributes for every start
 * element even if there are no unbalanced inline elements in the document. However, if the
 * attributes are not saved, then important attributes such as style information would
 * be lost.
 *
 * @author Andy Clark
 */
public static class Info {

    //
    // Data
    //

    /**
     * Private copy of the element attributes. Stays {@code null} when the constructor was given
     * no attributes or an empty attribute list.
     */
    public XMLAttributes attributes;

    /**
     * The element (stored by reference, not copied).
     */
    public HTMLElements.Element element;

    /**
     * Private copy of the element qualified name.
     */
    public QName qname;

    //
    // Constructors
    //

    /**
     * Creates an element information object without attributes.
     * <p>
     * Note: This constructor makes a copy of the qualified name.
     *
     * @param element The element.
     * @param qname   The element qualified name.
     */
    public Info(HTMLElements.Element element, QName qname) {
        this(element, qname, null);
    } // (HTMLElements.Element,QName)

    /**
     * Creates an element information object.
     * <p>
     * Note: This constructor makes a copy of the qualified name and of the attributes.
     *
     * @param element    The element.
     * @param qname      The element qualified name.
     * @param attributes The element attributes; may be {@code null}.
     */
    public Info(HTMLElements.Element element, QName qname, XMLAttributes attributes) {
        this.element = element;
        this.qname = new QName(qname);
        if (attributes != null) {
            int length = attributes.getLength();
            if (length > 0) {
                QName aqname = new QName();
                XMLAttributes newattrs = new XMLAttributesImpl();
                for (int i = 0; i < length; i++) {
                    attributes.getName(i, aqname);
                    String type = attributes.getType(i);
                    String value = attributes.getValue(i);
                    String nonNormalizedValue = attributes.getNonNormalizedValue(i);
                    boolean specified = attributes.isSpecified(i);
                    // Use the index reported by addAttribute rather than the source index:
                    // the two differ as soon as the target collapses an attribute whose name
                    // is already present, and the follow-up setters must hit the slot that
                    // was actually written.
                    int index = newattrs.addAttribute(aqname, type, value);
                    newattrs.setNonNormalizedValue(index, nonNormalizedValue);
                    newattrs.setSpecified(index, specified);
                }
                this.attributes = newattrs;
            }
        }
    } // (HTMLElements.Element,QName,XMLAttributes)

    /**
     * Simple representation to make debugging easier: the default object representation
     * followed by the qualified name.
     */
    @Override
    public String toString() {
        return super.toString() + qname;
    }
} // class Info
/**
 * Unsynchronized, array-backed stack of element information. Both the backing array and the
 * stack pointer are public fields.
 */
public static class InfoStack {

    //
    // Data
    //

    /**
     * Backing storage; slots {@code 0} to {@code top - 1} hold the stacked entries.
     */
    public Info[] data = new Info[10];

    /**
     * Number of stacked entries, which is also the index of the next free slot.
     */
    public int top;

    //
    // Public methods
    //

    /**
     * Returns the most recently pushed entry without removing it.
     */
    public Info peek() {
        final int last = top - 1;
        return data[last];
    } // peek():Info

    /**
     * Removes the most recently pushed entry and returns it.
     */
    public Info pop() {
        top -= 1;
        return data[top];
    } // pop():Info

    /**
     * Adds an entry on top of the stack, enlarging the backing array by ten slots when it is
     * full.
     */
    public void push(Info info) {
        final int capacity = data.length;
        if (top == capacity) {
            final Info[] grown = new Info[top + 10];
            System.arraycopy(data, 0, grown, 0, top);
            data = grown;
        }
        data[top++] = info;
    } // push(Info)

    /**
     * Debugging aid: lists the entries from the top of the stack down to the bottom.
     */
    @Override
    public String toString() {
        final StringBuilder text = new StringBuilder("InfoStack(");
        int index = top;
        while (--index >= 0) {
            text.append(data[index]);
            if (index > 0) {
                text.append(", ");
            }
        }
        return text.append(")").toString();
    }
} // class InfoStack
/**
 * Holds a private copy of the name and augmentations of an element that has been placed in a
 * buffer to be consumed later.
 */
static class ElementEntry {
    private final Augmentations augs_;
    private final QName name_;

    /**
     * Copies the given element name and, when present, the given augmentations.
     *
     * @param element the element name to copy
     * @param augs    the augmentations to copy; may be {@code null}
     */
    ElementEntry(final QName element, final Augmentations augs) {
        name_ = new QName(element);
        if (augs == null) {
            augs_ = null;
        }
        else {
            augs_ = new HTMLAugmentations(augs);
        }
    }
}
//
// Constants
//
// The declarations below are in alphabetical order; each Javadoc states whether the
// identifier names a feature or a property.
//
/**
 * EXPERIMENTAL: may change in next release.
 * Property identifier: the stack of elements in which context a document fragment should
 * be parsed.
 */
public static final String FRAGMENT_CONTEXT_STACK = "http://cyberneko.org/html/properties/balance-tags/fragment-context-stack";
/**
 * Feature identifier: include infoset augmentations.
 */
protected static final String AUGMENTATIONS = "http://cyberneko.org/html/features/augmentations";
/**
 * Feature identifier: document fragment balancing only.
 */
protected static final String DOCUMENT_FRAGMENT = "http://cyberneko.org/html/features/balance-tags/document-fragment";
/**
 * Feature identifier: document fragment balancing only (deprecated spelling of
 * {@link #DOCUMENT_FRAGMENT}).
 */
protected static final String DOCUMENT_FRAGMENT_DEPRECATED = "http://cyberneko.org/html/features/document-fragment";
/**
 * Property identifier: error reporter.
 */
protected static final String ERROR_REPORTER = "http://cyberneko.org/html/properties/error-reporter";
/**
 * Feature identifier: ignore outside content.
 */
protected static final String IGNORE_OUTSIDE_CONTENT = "http://cyberneko.org/html/features/balance-tags/ignore-outside-content";
/**
 * Property identifier: modify HTML attribute names: { "upper", "lower", "default" }.
 */
protected static final String NAMES_ATTRS = "http://cyberneko.org/html/properties/names/attrs";
/**
 * Property identifier: modify HTML element names: { "upper", "lower", "default" }.
 */
protected static final String NAMES_ELEMS = "http://cyberneko.org/html/properties/names/elems";
/**
 * Names mode: lowercase HTML names.
 */
protected static final short NAMES_LOWERCASE = 2;
/**
 * Names mode: match HTML element names. Shares the value 0 with {@link #NAMES_NO_CHANGE}.
 */
protected static final short NAMES_MATCH = 0;
/**
 * Names mode: don't modify HTML names. Shares the value 0 with {@link #NAMES_MATCH}.
 */
protected static final short NAMES_NO_CHANGE = 0;
/**
 * Names mode: uppercase HTML names.
 */
protected static final short NAMES_UPPERCASE = 1;
/**
 * Feature identifier: namespaces.
 */
protected static final String NAMESPACES = "http://xml.org/sax/features/namespaces";
/**
 * Feature identifier: report errors.
 */
protected static final String REPORT_ERRORS = "http://cyberneko.org/html/features/report-errors";
/**
 * Synthesized event info item.
 */
protected static final HTMLEventInfo SYNTHESIZED_ITEM = new HTMLEventInfo.SynthesizedItem();
/**
 * Recognized features.
 */
private static final String[] RECOGNIZED_FEATURES = {
NAMESPACES, AUGMENTATIONS, REPORT_ERRORS, DOCUMENT_FRAGMENT_DEPRECATED,
DOCUMENT_FRAGMENT, IGNORE_OUTSIDE_CONTENT,};
/**
 * Recognized features defaults.
 * NOTE(review): presumably index-aligned with {@link #RECOGNIZED_FEATURES}, with
 * {@code null} meaning "no default" -- confirm against the code that reads this array.
 */
private static final Boolean[] RECOGNIZED_FEATURES_DEFAULTS = {
null, null, null, null, Boolean.FALSE, Boolean.FALSE,};
/**
 * Recognized properties.
 */
private static final String[] RECOGNIZED_PROPERTIES = {
NAMES_ELEMS, NAMES_ATTRS, ERROR_REPORTER, FRAGMENT_CONTEXT_STACK,};
/**
 * Recognized properties defaults.
 * NOTE(review): presumably index-aligned with {@link #RECOGNIZED_PROPERTIES}, with
 * {@code null} meaning "no default" -- confirm against the code that reads this array.
 */
private static final Object[] RECOGNIZED_PROPERTIES_DEFAULTS = {null, null, null, null,};
/**
 * Converts HTML names string value to constant value.
 *
 * @param value the textual names mode; {@code "lower"} and {@code "upper"} are recognized,
 *              every other value (including {@code null}) selects "no change"
 * @return {@link #NAMES_LOWERCASE}, {@link #NAMES_UPPERCASE} or {@link #NAMES_NO_CHANGE}
 * @see #NAMES_NO_CHANGE
 * @see #NAMES_LOWERCASE
 * @see #NAMES_UPPERCASE
 */
protected static final short getNamesValue(String value) {
    // Compare literal-first so that a null value falls through to "no change" instead of
    // raising a NullPointerException.
    if ("lower".equals(value)) {
        return NAMES_LOWERCASE;
    }
    if ("upper".equals(value)) {
        return NAMES_UPPERCASE;
    }
    return NAMES_NO_CHANGE;
} // getNamesValue(String):short
/**
 * Modifies the given name based on the specified mode.
 * <p>
 * Case conversion is done with {@link Locale#ROOT} so that the result does not depend on the
 * default locale of the JVM (with a Turkish default locale, for instance, the locale-sensitive
 * conversion maps "i" to a dotted capital I, which would corrupt names such as "title").
 *
 * @param name the name to convert
 * @param mode {@link #NAMES_UPPERCASE} or {@link #NAMES_LOWERCASE} to convert the case; with
 *             any other value the name is returned unchanged
 * @return the converted name
 */
protected static final String modifyName(String name, short mode) {
    switch (mode) {
        case NAMES_UPPERCASE:
            return name.toUpperCase(Locale.ROOT);
        case NAMES_LOWERCASE:
            return name.toLowerCase(Locale.ROOT);
        default:
            return name;
    }
} // modifyName(String,short):String
//
// Data
//
// The fields below are in alphabetical order: protected fields first, then private ones.
//
/**
 * Allows self closing iframe tags. Consulted by emptyElement() to decide whether an empty
 * iframe element is closed immediately.
 */
protected boolean fAllowSelfclosingIframe;
/**
 * Include infoset augmentations.
 */
protected boolean fAugmentations;
/**
 * Document fragment balancing only.
 */
protected boolean fDocumentFragment;
/**
 * The document handler that receives the forwarded events. May be null: the event methods
 * check for null before forwarding.
 */
protected XMLDocumentHandler fDocumentHandler;
/**
 * The document source.
 */
protected XMLDocumentSource fDocumentSource;
/**
 * The element stack.
 */
protected final InfoStack fElementStack = new InfoStack();
/**
 * Error reporter.
 */
protected HTMLErrorReporter fErrorReporter;
/**
 * Ignore outside content.
 */
protected boolean fIgnoreOutsideContent;
/**
 * The inline stack.
 */
protected final InfoStack fInlineStack = new InfoStack();
/**
 * Modify HTML attribute names.
 */
protected short fNamesAttrs;
/**
 * Modify HTML element names.
 */
protected short fNamesElems;
/**
 * Namespaces.
 */
protected boolean fNamespaces;
/**
 * True if a form is in the stack (allow to discard opening of nested forms)
 */
protected boolean fOpenedForm;
/**
 * Report errors.
 */
protected boolean fReportErrors;
/**
 * True if seen anything. Important for xml declaration.
 */
protected boolean fSeenAnything;
/**
 * True if seen &lt;body&gt; element.
 */
protected boolean fSeenBodyElement;
/**
 * True if a doctype declaration has been seen and forwarded (set by doctypeDecl()).
 */
protected boolean fSeenDoctype;
/**
 * True if seen &lt;head&gt; element.
 */
protected boolean fSeenHeadElement;
/**
 * True if root element has been seen.
 */
protected boolean fSeenRootElement;
/**
 * True if seen the end of the document element. In other words, this variable is set to false
 * until the end &lt;/HTML&gt; tag is seen (or synthesized). This is used to ensure that
 * extraneous events after the end of the document element do not make the document stream
 * ill-formed.
 */
protected boolean fSeenRootElementEnd;
/**
 * Tag balancing listener.
 * NOTE(review): presumably notified about tags the balancer decides to ignore -- confirm
 * against the usages outside this section.
 */
protected HTMLTagBalancingListener tagBalancingListener;
/**
 * Buffer of end elements; per the inline type comment it holds ElementEntry objects.
 * characters() consults its size when deciding whether whitespace is dropped.
 */
private List/* ElementEntry */endElementsBuffer_ = new ArrayList();
/**
 * Empty attributes.
 */
private final XMLAttributes fEmptyAttrs = new XMLAttributesImpl();
/**
 * Augmentations.
 */
private final HTMLAugmentations fInfosetAugs = new HTMLAugmentations();
/**
 * NOTE(review): presumably true while an end element is being synthesized by the balancer
 * itself -- confirm against the usages outside this section.
 */
private boolean forcedEndElement_ = false;
/**
 * NOTE(review): presumably true while a start element is being synthesized by the balancer
 * itself -- confirm against the usages outside this section.
 */
private boolean forcedStartElement_ = false;
/**
 * A qualified name.
 */
private final QName fQName = new QName();
/**
 * Stack of elements determining the context in which a document fragment should be parsed
 */
private QName[] fragmentContextStack_ = null;
/**
 * Size of the fragment context stack.
 */
private int fragmentContextStackSize_ = 0; // not 0 only when a fragment is
// parsed and
// fragmentContextStack_ is set
/**
 * Collects the text that characters() receives while no element is open yet (and the
 * document is not parsed as a fragment).
 */
private LostText lostText_ = new LostText();
/**
 * Handles character content.
 * <p>
 * Text arriving after the end of the root element is discarded, and text arriving while no
 * element is open (outside of fragment mode) is stored in the lost-text buffer instead of being
 * forwarded. Outside of fragment mode, non-whitespace text forces the body to be started when
 * the root element has not been seen yet or when the current element is the head or the html
 * element, while whitespace-only text is dropped in a few positions (see the comments in the
 * body). Whatever remains is forwarded to the document handler.
 *
 * @param text the character content
 * @param augs the augmentations passed along with the content
 * @throws XNIException passed through from the callees
 */
public void characters(final XMLString text, final Augmentations augs) throws XNIException {
    // nothing is forwarded once the root element has been closed
    if (fSeenRootElementEnd) {
        return;
    }

    // characters before the first opening tag are put aside
    if (!fDocumentFragment && fElementStack.top == 0) {
        lostText_.add(text, augs);
        return;
    }

    // skip over leading whitespace; the text is blank when the scan reaches the end
    final int limit = text.offset + text.length;
    int cursor = text.offset;
    while (cursor < limit && Character.isWhitespace(text.ch[cursor])) {
        cursor++;
    }
    final boolean blank = cursor >= limit;

    if (!fDocumentFragment) {
        // bare characters before the root element: whitespace is dropped, anything else
        // starts the body
        if (!fSeenRootElement) {
            if (blank) {
                return;
            }
            forceStartBody();
        }

        if (blank) {
            // whitespace is ignored while fewer than two elements are open or exactly one
            // end element is buffered
            if (fElementStack.top < 2 || endElementsBuffer_.size() == 1) {
                return;
            }
        }
        else {
            // character content in head
            // NOTE: This frequently happens when a title is directly followed by body text.
            final HTMLElements.Element current = fElementStack.peek().element;
            if (current.code == HTMLElements.HEAD || current.code == HTMLElements.HTML) {
                final String hname = modifyName("head", fNamesElems);
                final String bname = modifyName("body", fNamesElems);
                if (fReportErrors) {
                    fErrorReporter.reportWarning("HTML2009", new Object[]{hname, bname});
                }
                forceStartBody();
            }
        }
    }

    // hand the text over to the handler, if there is one
    if (fDocumentHandler != null) {
        fDocumentHandler.characters(text, augs);
    }
} // characters(XMLString,Augmentations)
/**
 * Handles a comment: records that something has been seen, lets
 * {@code consumeEarlyTextIfNeeded()} run, and then forwards the comment to the document
 * handler when one is set.
 *
 * @param text the comment text
 * @param augs the augmentations passed along with the comment
 * @throws XNIException passed through from the callees
 */
public void comment(XMLString text, Augmentations augs) throws XNIException {
    fSeenAnything = true;
    consumeEarlyTextIfNeeded();

    final XMLDocumentHandler handler = fDocumentHandler;
    if (handler == null) {
        return;
    }
    handler.comment(text, augs);
} // comment(XMLString,Augmentations)
//
// XMLDocumentSource methods
//
/**
 * Handles a doctype declaration.
 * <p>
 * Only the first declaration that arrives before the root element is forwarded to the document
 * handler. A declaration arriving after the root element (error "HTML2010") or after an earlier
 * declaration (error "HTML2011") is dropped; the error is reported only when error reporting is
 * enabled.
 *
 * @param rootElementName the name of the root element
 * @param publicId        the public identifier
 * @param systemId        the system identifier
 * @param augs            the augmentations passed along with the declaration
 * @throws XNIException passed through from the callees
 */
public void doctypeDecl(String rootElementName, String publicId, String systemId,
        Augmentations augs) throws XNIException {
    fSeenAnything = true;

    // misplaced or repeated declaration: report it if requested, never forward it
    if (fSeenRootElement || fSeenDoctype) {
        if (fReportErrors) {
            final String key = fSeenRootElement ? "HTML2010" : "HTML2011";
            fErrorReporter.reportError(key, null);
        }
        return;
    }

    fSeenDoctype = true;
    if (fDocumentHandler != null) {
        fDocumentHandler.doctypeDecl(rootElementName, publicId, systemId, augs);
    }
} // doctypeDecl(String,String,String,Augmentations)
// @since Xerces 2.1.0
/**
 * Handles an empty (self-closing) element: the element is always started, but it is closed
 * right away only when it is empty by definition, unknown, or an iframe while self-closing
 * iframes are allowed.
 *
 * @param element the element name
 * @param attrs   the element attributes
 * @param augs    the augmentations passed along with the element
 * @throws XNIException passed through from the callees
 */
public void emptyElement(final QName element, XMLAttributes attrs, Augmentations augs)
        throws XNIException {
    startElement(element, attrs, augs);

    // browsers ignore the self-closing indication on tags that are not empty by definition,
    // but not on unknown elements
    final HTMLElements.Element elem = getElement(element);
    final boolean closeImmediately = elem.isEmpty()
            || elem.code == HTMLElements.UNKNOWN
            || (elem.code == HTMLElements.IFRAME && fAllowSelfclosingIframe);
    if (!closeImmediately) {
        return;
    }
    endElement(element, augs);
} // emptyElement(QName,XMLAttributes,Augmentations)
//
// XMLDocumentHandler methods
//
// since Xerces-J 2.2.0
/**
 * Handles the end of a CDATA section by forwarding it to the document handler, unless the end
 * of the root element has already been seen or no handler is set.
 *
 * @param augs the augmentations passed along with the event
 * @throws XNIException passed through from the handler
 */
public void endCDATA(Augmentations augs) throws XNIException {
    final boolean documentStillOpen = !fSeenRootElementEnd;
    if (documentStillOpen && fDocumentHandler != null) {
        fDocumentHandler.endCDATA(augs);
    }
} // endCDATA(Augmentations)
// old methods
/**
* End document.
*/
public void endDocument(Augmentations augs) throws XNIException {
// and