// com.googlecode.html.filters.Purifier Maven / Gradle / Ivy
/*
* Copyright 2004-2008 Andy Clark
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
* in compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License
* is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing permissions and limitations under
* the License.
*/
package com.googlecode.html.filters;
import com.googlecode.html.HTMLAugmentations;
import com.googlecode.html.HTMLEventInfo;
import com.googlecode.html.xercesbridge.XercesBridge;
import org.apache.xerces.util.XMLChar;
import org.apache.xerces.util.XMLStringBuffer;
import org.apache.xerces.xni.*;
import org.apache.xerces.xni.parser.XMLComponentManager;
import org.apache.xerces.xni.parser.XMLConfigurationException;
/**
 * This filter purifies the HTML input to ensure XML well-formedness. The purification process
 * includes:
 * <ul>
 * <li>fixing illegal characters in the document, including
 *   <ul>
 *   <li>element and attribute names,
 *   <li>processing instruction target and data,
 *   <li>document text;
 *   </ul>
 * <li>ensuring the string "--" does not appear in the content of a comment;
 * <li>ensuring the string "]]&gt;" does not appear in the content of a CDATA section;
 * <li>ensuring that the XML declaration has required pseudo-attributes and that the values are
 *     correct; and
 * <li>synthesizing missing namespace bindings.
 * </ul>
 * <p>
 * Illegal characters in XML names are converted to the character sequence "_u####_" where "####" is
 * the value of the Unicode character represented in hexadecimal, whereas illegal characters
 * appearing in document content are converted to the character sequence "\\u####".
 * <p>
 * In comments, the character '-' is replaced by the character sequence "- " to prevent "--" from
 * ever appearing in the comment content. For CDATA sections, the character ']' is replaced by the
 * character sequence "] " to prevent "]]" from appearing.
 * <p>
 * The URI used for synthesized namespace bindings is
 * "http://cyberneko.org/html/ns/synthesized/<i>number</i>" where <i>number</i> is generated to
 * ensure uniqueness.
 *
 * @author Andy Clark
 * @version $Id: Purifier.java,v 1.5 2005/02/14 03:56:54 andyc Exp $
 */
public class Purifier extends DefaultFilter {

    //
    // Constants
    //

    /**
     * URI prefix of synthesized namespace bindings. The value of a per-document counter is
     * appended to it for every binding that gets synthesized. The constant name is misspelled
     * but is part of the public API, so it is kept as is.
     */
    public static final String SYNTHESIZED_NAMESPACE_PREFX =
            "http://cyberneko.org/html/ns/synthesized/";

    /** Feature identifier: include infoset augmentations. */
    protected static final String AUGMENTATIONS = "http://cyberneko.org/html/features/augmentations";

    /** Feature identifier: namespaces. */
    protected static final String NAMESPACES = "http://xml.org/sax/features/namespaces";

    /** Synthesized event info item. */
    protected static final HTMLEventInfo SYNTHESIZED_ITEM = new HTMLEventInfo.SynthesizedItem();

    /** Recognized features. Not referenced anywhere else inside this class. */
    private static final String[] RECOGNIZED_FEATURES = {NAMESPACES, AUGMENTATIONS,};

    //
    // Data
    //

    // features

    /** State of the augmentations feature, read in {@link #reset}. */
    protected boolean fAugmentations;

    /** State of the namespaces feature, read in {@link #reset}. */
    protected boolean fNamespaces;

    // state

    /** True if inside a CDATA section. */
    protected boolean fInCDATASection;

    /** True if root element was seen. */
    protected boolean fSeenRootElement;

    // doctype declaration info

    /** True if the doctype declaration was seen. */
    protected boolean fSeenDoctype;

    /** Public identifier of doctype declaration. */
    protected String fPublicId;

    /** System identifier of doctype declaration. */
    protected String fSystemId;

    // namespace info

    /** Namespace information. */
    protected NamespaceContext fNamespaceContext;

    /** Synthesized namespace binding count; reset to zero at the start of every document. */
    protected int fSynthesizedNamespaceCount;

    // temp vars

    /** Reusable augmentations object handed out by {@link #synthesizedAugs()}. */
    private final HTMLAugmentations fInfosetAugs = new HTMLAugmentations();

    /** Reusable qualified name. */
    private final QName fQName = new QName();

    /** Reusable string buffer; the purified text passed downstream lives in this buffer. */
    private final XMLStringBuffer fStringBuffer = new XMLStringBuffer();

    //
    // Protected static methods
    //

    /**
     * Returns a padded hexadecimal string for the given value.
     *
     * @param c      the value to format
     * @param padlen the minimum number of digits; shorter results are left-padded with '0'
     * @return the upper-case hexadecimal representation of {@code c}
     */
    protected static String toHexString(int c, int padlen) {
        String hex = Integer.toHexString(c).toUpperCase();
        int missing = padlen - hex.length();
        if (missing <= 0) {
            return hex;
        }
        StringBuffer str = new StringBuffer(padlen);
        for (int i = 0; i < missing; i++) {
            str.append('0');
        }
        return str.append(hex).toString();
    } // toHexString(int,int):String

    //
    // XMLComponent methods
    //

    /**
     * Resets the filter state and reads the namespaces and augmentations feature settings from
     * the component manager.
     *
     * @param manager the component manager to query for feature values
     * @throws XMLConfigurationException propagated from the manager's feature lookup
     */
    public void reset(XMLComponentManager manager) throws XMLConfigurationException {
        // state
        fInCDATASection = false;
        // features
        fNamespaces = manager.getFeature(NAMESPACES);
        fAugmentations = manager.getFeature(AUGMENTATIONS);
    } // reset(XMLComponentManager)

    //
    // XMLDocumentHandler methods
    //

    /**
     * Start document. Creates a fresh namespace context when the namespaces feature is on.
     */
    public void startDocument(XMLLocator locator, String encoding, Augmentations augs)
            throws XNIException {
        fNamespaceContext = fNamespaces ? new NamespaceBinder.NamespaceSupport() : null;
        fSynthesizedNamespaceCount = 0;
        handleStartDocument();
        super.startDocument(locator, encoding, augs);
    } // startDocument(XMLLocator,String,Augmentations)

    /**
     * Start document. Uses the namespace context supplied by the caller.
     */
    public void startDocument(XMLLocator locator, String encoding, NamespaceContext nscontext,
            Augmentations augs) throws XNIException {
        fNamespaceContext = nscontext;
        fSynthesizedNamespaceCount = 0;
        handleStartDocument();
        super.startDocument(locator, encoding, nscontext, augs);
    } // startDocument(XMLLocator,NamespaceContext,String,Augmentations)

    /**
     * XML declaration. The values are corrected before the event is passed on:
     * the version is forced to "1.0", an empty encoding is dropped, and the standalone value is
     * normalized to "yes" or "no" (the only values XML 1.0 permits) or dropped if unrecognized.
     */
    public void xmlDecl(String version, String encoding, String standalone, Augmentations augs)
            throws XNIException {
        if (!"1.0".equals(version)) {
            version = "1.0";
        }
        if (encoding != null && encoding.length() == 0) {
            encoding = null;
        }
        super.xmlDecl(version, encoding, purifyStandalone(standalone), augs);
    } // xmlDecl(String,String,String,Augmentations)

    /**
     * Doctype declaration. The declaration is only recorded here; it is emitted later, together
     * with the first element, by {@link #handleStartElement}.
     */
    public void doctypeDecl(String root, String pubid, String sysid, Augmentations augs)
            throws XNIException {
        fSeenDoctype = true;
        // NOTE: It doesn't matter what the root element name is because
        //       it must match the root element. -Ac
        fPublicId = pubid;
        fSystemId = sysid;
        // NOTE: If the public identifier is specified, then a system
        //       identifier must also be specified. -Ac
        if (fPublicId != null && fSystemId == null) {
            fSystemId = "";
        }
        // NOTE: Can't save the augmentations because the object state
        //       is transient. -Ac
    } // doctypeDecl(String,String,String,Augmentations)

    /**
     * Comment. Every '-' is followed by a space so that "--" never appears in the content.
     */
    public void comment(XMLString text, Augmentations augs) throws XNIException {
        super.comment(padAfter(purifyText(text), '-'), augs);
    } // comment(XMLString,Augmentations)

    /**
     * Processing instruction. Both the target and the data are purified.
     */
    public void processingInstruction(String target, XMLString data, Augmentations augs)
            throws XNIException {
        target = purifyName(target, true);
        data = purifyText(data);
        super.processingInstruction(target, data, augs);
    } // processingInstruction(String,XMLString,Augmentations)

    /**
     * Start element.
     */
    public void startElement(QName element, XMLAttributes attrs, Augmentations augs)
            throws XNIException {
        handleStartElement(element, attrs);
        super.startElement(element, attrs, augs);
    } // startElement(QName,XMLAttributes,Augmentations)

    /**
     * Empty element.
     */
    public void emptyElement(QName element, XMLAttributes attrs, Augmentations augs)
            throws XNIException {
        handleStartElement(element, attrs);
        super.emptyElement(element, attrs, augs);
    } // emptyElement(QName,XMLAttributes,Augmentations)

    /**
     * Start CDATA section.
     */
    public void startCDATA(Augmentations augs) throws XNIException {
        fInCDATASection = true;
        super.startCDATA(augs);
    } // startCDATA(Augmentations)

    /**
     * End CDATA section.
     */
    public void endCDATA(Augmentations augs) throws XNIException {
        fInCDATASection = false;
        super.endCDATA(augs);
    } // endCDATA(Augmentations)

    /**
     * Characters. Inside a CDATA section every ']' is additionally followed by a space so that
     * "]]" never appears in the content.
     */
    public void characters(XMLString text, Augmentations augs) throws XNIException {
        text = purifyText(text);
        if (fInCDATASection) {
            text = padAfter(text, ']');
        }
        super.characters(text, augs);
    } // characters(XMLString,Augmentations)

    /**
     * End element. The name is purified and, with namespaces on, a missing URI of a prefixed
     * name is looked up in the namespace context.
     */
    public void endElement(QName element, Augmentations augs) throws XNIException {
        element = purifyQName(element);
        if (fNamespaces && element.prefix != null && element.uri == null) {
            element.uri = fNamespaceContext.getURI(element.prefix);
        }
        super.endElement(element, augs);
    } // endElement(QName,Augmentations)

    //
    // Protected methods
    //

    /**
     * Handle start document: forget any doctype and root element seen in a previous document.
     */
    protected void handleStartDocument() {
        fSeenDoctype = false;
        fSeenRootElement = false;
    } // handleStartDocument()

    /**
     * Handle start element: purifies the element and attribute names, synthesizes namespace
     * bindings for prefixed names without a URI and, on the first element after a recorded
     * doctype declaration, emits that declaration with the element's raw name as root name.
     *
     * @param element the element name; purified in place
     * @param attrs   the element's attributes, may be {@code null}
     */
    protected void handleStartElement(QName element, XMLAttributes attrs) {
        // handle element and attributes
        element = purifyQName(element);
        int attrCount = attrs != null ? attrs.getLength() : 0;
        // Walk backwards: synthesized bindings are added to the same attribute list while
        // iterating.
        for (int i = attrCount - 1; i >= 0; i--) {
            // purify attribute name
            attrs.getName(i, fQName);
            attrs.setName(i, purifyQName(fQName));
            // synthesize namespace bindings
            if (fNamespaces
                    && !fQName.rawname.equals("xmlns")
                    && !fQName.rawname.startsWith("xmlns:")) {
                // NOTE: Must get attribute name again because the
                //       purifyQName method does not guarantee that
                //       the same QName object is returned. -Ac
                attrs.getName(i, fQName);
                if (fQName.prefix != null && fQName.uri == null) {
                    synthesizeBinding(attrs, fQName.prefix);
                }
            }
        }
        // synthesize namespace bindings
        if (fNamespaces && element.prefix != null && element.uri == null) {
            synthesizeBinding(attrs, element.prefix);
        }
        // synthesize doctype declaration
        if (!fSeenRootElement && fSeenDoctype) {
            Augmentations augs = synthesizedAugs();
            super.doctypeDecl(element.rawname, fPublicId, fSystemId, augs);
        }
        // mark start element as seen
        fSeenRootElement = true;
    } // handleStartElement(QName,XMLAttributes)

    /**
     * Synthesize namespace binding: adds an "xmlns:<i>ns</i>" attribute carrying a freshly
     * numbered synthesized URI and declares the prefix in the namespace context.
     *
     * @param attrs the attribute list receiving the declaration; when {@code null} only the
     *              namespace context is updated
     * @param ns    the prefix to bind
     */
    protected void synthesizeBinding(XMLAttributes attrs, String ns) {
        final String prefix = "xmlns";
        final String qname = prefix + ':' + ns;
        final String avalue = SYNTHESIZED_NAMESPACE_PREFX + fSynthesizedNamespaceCount++;
        // add attribute (handleStartElement tolerates a null attribute list, so do the same)
        if (attrs != null) {
            fQName.setValues(prefix, ns, qname, NamespaceBinder.NAMESPACES_URI);
            attrs.addAttribute(fQName, "CDATA", avalue);
        }
        // bind namespace
        XercesBridge.getInstance().NamespaceContext_declarePrefix(fNamespaceContext, ns, avalue);
    } // synthesizeBinding(XMLAttributes,String)

    /**
     * Returns an augmentations object with a synthesized item added.
     *
     * @return the shared augmentations instance holding only the synthesized item, or
     *         {@code null} when the augmentations feature is off
     */
    protected final Augmentations synthesizedAugs() {
        HTMLAugmentations augs = null;
        if (fAugmentations) {
            augs = fInfosetAugs;
            augs.removeAllItems();
            augs.putItem(AUGMENTATIONS, SYNTHESIZED_ITEM);
        }
        return augs;
    } // synthesizedAugs():Augmentations

    /**
     * Purify qualified name. The prefix, local part and raw name are purified in place.
     *
     * @param qname the name to purify
     * @return the same {@code qname} instance
     */
    protected QName purifyQName(QName qname) {
        qname.prefix = purifyName(qname.prefix, true);
        qname.localpart = purifyName(qname.localpart, true);
        qname.rawname = purifyName(qname.rawname, false);
        return qname;
    } // purifyQName(QName):QName

    /**
     * Purify name. Every character that is not allowed at its position is replaced by "_u####_",
     * where "####" is the character value in hexadecimal.
     * <p>
     * With namespaces on, colons after the first position are restricted: none is allowed when
     * {@code localpart} is true, and only the first of them is allowed otherwise. A colon in the
     * first position is not taken into account by this restriction.
     *
     * @param name      the name to purify, may be {@code null}
     * @param localpart true if the name is a prefix, local part or other colon-free name;
     *                  false if it is a raw name that may contain a prefix separator
     * @return {@code null} for a {@code null} name, the very same string if nothing had to be
     *         replaced, otherwise the purified copy
     */
    protected String purifyName(String name, boolean localpart) {
        if (name == null) {
            return null;
        }
        final int length = name.length();
        // Allocated lazily: most names are already legal and can be returned untouched.
        StringBuffer purified = null;
        boolean seenColon = localpart;
        for (int i = 0; i < length; i++) {
            final char c = name.charAt(i);
            final boolean illegal;
            if (i == 0) {
                illegal = !XMLChar.isNameStart(c);
            } else {
                illegal = (fNamespaces && c == ':' && seenColon) || !XMLChar.isName(c);
                seenColon = seenColon || c == ':';
            }
            if (illegal) {
                if (purified == null) {
                    purified = new StringBuffer(length + 8);
                    purified.append(name.substring(0, i));
                }
                purified.append("_u").append(toHexString(c, 4)).append('_');
            } else if (purified != null) {
                purified.append(c);
            }
        }
        return purified != null ? purified.toString() : name;
    } // purifyName(String):String

    /**
     * Purify content. Every character that is not a legal XML character is replaced by the
     * sequence "\\u####", where "####" is the character value in hexadecimal. A high surrogate
     * immediately followed by a low surrogate encodes a supplementary character, which is legal
     * in XML, so such a pair is copied unchanged; an unpaired surrogate is still replaced.
     *
     * @param text the text to purify
     * @return the shared string buffer holding the purified text; its content is only valid
     *         until the buffer is reused
     */
    protected XMLString purifyText(XMLString text) {
        fStringBuffer.length = 0;
        final int end = text.offset + text.length;
        for (int i = text.offset; i < end; i++) {
            final char c = text.ch[i];
            if (XMLChar.isHighSurrogate(c) && i + 1 < end
                    && XMLChar.isLowSurrogate(text.ch[i + 1])) {
                // valid surrogate pair: keep both halves and skip the low one
                fStringBuffer.append(c);
                fStringBuffer.append(text.ch[++i]);
            } else if (XMLChar.isInvalid(c)) {
                fStringBuffer.append("\\u");
                fStringBuffer.append(toHexString(c, 4));
            } else {
                fStringBuffer.append(c);
            }
        }
        return fStringBuffer;
    } // purifyText(XMLString):XMLString

    //
    // Private methods
    //

    /**
     * Returns the text with a space inserted after every occurrence of {@code target}.
     *
     * @param text   the text to pad; may be the shared string buffer itself
     * @param target the character that must always be followed by a space
     * @return {@code text} itself if it does not contain {@code target}, otherwise the shared
     *         string buffer holding the padded text
     */
    private XMLString padAfter(XMLString text, char target) {
        final int end = text.offset + text.length;
        int first = text.offset;
        while (first < end && text.ch[first] != target) {
            first++;
        }
        if (first == end) {
            return text;
        }
        // Build the result on the side because text may be backed by fStringBuffer.
        StringBuffer padded = new StringBuffer(text.length + 16);
        padded.append(text.ch, text.offset, first - text.offset);
        for (int i = first; i < end; i++) {
            final char c = text.ch[i];
            padded.append(c);
            if (c == target) {
                padded.append(' ');
            }
        }
        fStringBuffer.length = 0;
        fStringBuffer.append(padded.toString());
        return fStringBuffer;
    } // padAfter(XMLString,char):XMLString

    /**
     * Normalizes the standalone pseudo-attribute of the XML declaration. "yes" and "no" are
     * accepted in any letter case; "true" and "false" are leniently mapped to them.
     *
     * @param standalone the value found in the document, may be {@code null}
     * @return "yes", "no", or {@code null} if the value is absent or unrecognized
     */
    private static String purifyStandalone(String standalone) {
        if (standalone == null) {
            return null;
        }
        if (standalone.equalsIgnoreCase("yes") || standalone.equalsIgnoreCase("true")) {
            return "yes";
        }
        if (standalone.equalsIgnoreCase("no") || standalone.equalsIgnoreCase("false")) {
            return "no";
        }
        return null;
    } // purifyStandalone(String):String

} // class Purifier