com.googlecode.html.filters.ElementRemover Maven / Gradle / Ivy
/*
* Copyright 2002-2009 Andy Clark, Marc Guillemot
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
* in compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License
* is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing permissions and limitations under
* the License.
*/
package com.googlecode.html.filters;
import org.apache.xerces.xni.*;
import java.util.Hashtable;
/**
* This class is a document filter capable of removing specified elements from the processing
* stream. There are two options for processing document elements:
*
* - specifying those elements which should be accepted and, optionally, which attributes of that
* element should be kept; and
*
- specifying those elements whose tags and content should be completely removed from the event
* stream.
*
*
* The first option allows the application to specify which elements appearing in the event stream
* should be accepted and, therefore, passed on to the next stage in the pipeline. All elements
* not in the list of acceptable elements have their start and end tags stripped from the
* event stream unless those elements appear in the list of elements to be removed.
*
* The second option allows the application to specify which elements should be completely removed
* from the event stream. When an element appears that is to be removed, the element's start and end
* tag as well as all of that element's content is removed from the event stream.
*
* A common use of this filter would be to only allow rich-text and linking elements as well as the
* character content to pass through the filter — all other elements would be stripped. The
* following code shows how to configure this filter to perform this task:
*
*
* ElementRemover remover = new ElementRemover();
* remover.acceptElement("b", null);
* remover.acceptElement("i", null);
* remover.acceptElement("u", null);
* remover.acceptElement("a", new String[] { "href" });
*
*
* However, this would still allow the text content of other elements to pass through, which may not
* be desirable. In order to further "clean" the input, the removeElement
option can be
* used. The following piece of code adds the ability to completely remove any <SCRIPT> tags
* and content from the stream.
*
*
* remover.removeElement("script");
*
*
* Note: All text and accepted element children of a stripped element is retained.
* To completely remove an element's content, use the removeElement
method.
*
* Note: Care should be taken when using this filter because the output may not be
* a well-balanced tree. Specifically, if the application removes the <HTML> element (with or
* without retaining its children), the resulting document event stream will no longer be
* well-formed.
*
* @author Andy Clark
* @version $Id: ElementRemover.java,v 1.5 2005/02/14 03:56:54 andyc Exp $
*/
public class ElementRemover extends DefaultFilter {
//
// Constants
//
/**
* A "null" object.
*/
protected static final Object NULL = new Object();
//
// Data
//
// information
/**
* Accepted elements.
*/
protected Hashtable fAcceptedElements = new Hashtable();
/**
* The element depth.
*/
protected int fElementDepth;
// state
/**
* The element depth at element removal.
*/
protected int fRemovalElementDepth;
/**
* Removed elements.
*/
protected Hashtable fRemovedElements = new Hashtable();
//
// Public methods
//
/**
* Specifies that the given element should be accepted and, optionally, which attributes of that
* element should be kept.
*
* @param element The element to accept.
* @param attributes The list of attributes to be kept or null if no attributes should be kept
* for this element.
*
* see #removeElement
*/
public void acceptElement(String element, String[] attributes) {
Object key = element.toLowerCase();
Object value = NULL;
if (attributes != null) {
String[] newarray = new String[attributes.length];
for (int i = 0; i < attributes.length; i++) {
newarray[i] = attributes[i].toLowerCase();
}
value = attributes;
}
fAcceptedElements.put(key, value);
} // acceptElement(String,String[])
/**
* Characters.
*/
public void characters(XMLString text, Augmentations augs) throws XNIException {
if (fElementDepth <= fRemovalElementDepth) {
super.characters(text, augs);
}
} // characters(XMLString,Augmentations)
//
// XMLDocumentHandler methods
//
// since Xerces-J 2.2.0
/**
* Comment.
*/
public void comment(XMLString text, Augmentations augs) throws XNIException {
if (fElementDepth <= fRemovalElementDepth) {
super.comment(text, augs);
}
} // comment(XMLString,Augmentations)
// old methods
/**
* Empty element.
*/
public void emptyElement(QName element, XMLAttributes attributes, Augmentations augs)
throws XNIException {
if (fElementDepth <= fRemovalElementDepth && handleOpenTag(element, attributes)) {
super.emptyElement(element, attributes, augs);
}
} // emptyElement(QName,XMLAttributes,Augmentations)
/**
* End CDATA section.
*/
public void endCDATA(Augmentations augs) throws XNIException {
if (fElementDepth <= fRemovalElementDepth) {
super.endCDATA(augs);
}
} // endCDATA(Augmentations)
/**
* End element.
*/
public void endElement(QName element, Augmentations augs) throws XNIException {
if (fElementDepth <= fRemovalElementDepth && elementAccepted(element.rawname)) {
super.endElement(element, augs);
}
fElementDepth--;
if (fElementDepth == fRemovalElementDepth) {
fRemovalElementDepth = Integer.MAX_VALUE;
}
} // endElement(QName,Augmentations)
/**
* End general entity.
*/
public void endGeneralEntity(String name, Augmentations augs) throws XNIException {
if (fElementDepth <= fRemovalElementDepth) {
super.endGeneralEntity(name, augs);
}
} // endGeneralEntity(String,Augmentations)
/**
* End prefix mapping.
*/
public void endPrefixMapping(String prefix, Augmentations augs) throws XNIException {
if (fElementDepth <= fRemovalElementDepth) {
super.endPrefixMapping(prefix, augs);
}
} // endPrefixMapping(String,Augmentations)
/**
* Ignorable whitespace.
*/
public void ignorableWhitespace(XMLString text, Augmentations augs) throws XNIException {
if (fElementDepth <= fRemovalElementDepth) {
super.ignorableWhitespace(text, augs);
}
} // ignorableWhitespace(XMLString,Augmentations)
/**
* Processing instruction.
*/
public void processingInstruction(String target, XMLString data, Augmentations augs)
throws XNIException {
if (fElementDepth <= fRemovalElementDepth) {
super.processingInstruction(target, data, augs);
}
} // processingInstruction(String,XMLString,Augmentations)
/**
* Specifies that the given element should be completely removed. If an element is encountered
* during processing that is on the remove list, the element's start and end tags as well as all
* of content contained within the element will be removed from the processing stream.
*
* @param element The element to completely remove.
*/
public void removeElement(String element) {
Object key = element.toLowerCase();
Object value = NULL;
fRemovedElements.put(key, value);
} // removeElement(String)
/**
* Start CDATA section.
*/
public void startCDATA(Augmentations augs) throws XNIException {
if (fElementDepth <= fRemovalElementDepth) {
super.startCDATA(augs);
}
} // startCDATA(Augmentations)
/**
* Start document.
*/
public void startDocument(XMLLocator locator, String encoding, Augmentations augs)
throws XNIException {
startDocument(locator, encoding, null, augs);
} // startDocument(XMLLocator,String,Augmentations)
/**
* Start document.
*/
public void startDocument(XMLLocator locator, String encoding, NamespaceContext nscontext,
Augmentations augs) throws XNIException {
fElementDepth = 0;
fRemovalElementDepth = Integer.MAX_VALUE;
super.startDocument(locator, encoding, nscontext, augs);
} // startDocument(XMLLocator,String,NamespaceContext,Augmentations)
/**
* Start element.
*/
public void startElement(QName element, XMLAttributes attributes, Augmentations augs)
throws XNIException {
if (fElementDepth <= fRemovalElementDepth && handleOpenTag(element, attributes)) {
super.startElement(element, attributes, augs);
}
fElementDepth++;
} // startElement(QName,XMLAttributes,Augmentations)
/**
* Start general entity.
*/
public void startGeneralEntity(String name, XMLResourceIdentifier id, String encoding,
Augmentations augs) throws XNIException {
if (fElementDepth <= fRemovalElementDepth) {
super.startGeneralEntity(name, id, encoding, augs);
}
} // startGeneralEntity(String,XMLResourceIdentifier,String,Augmentations)
/**
* Start prefix mapping.
*/
public void startPrefixMapping(String prefix, String uri, Augmentations augs)
throws XNIException {
if (fElementDepth <= fRemovalElementDepth) {
super.startPrefixMapping(prefix, uri, augs);
}
} // startPrefixMapping(String,String,Augmentations)
/**
* Text declaration.
*/
public void textDecl(String version, String encoding, Augmentations augs) throws XNIException {
if (fElementDepth <= fRemovalElementDepth) {
super.textDecl(version, encoding, augs);
}
} // textDecl(String,String,Augmentations)
//
// Protected methods
//
/**
* Returns true if the specified element is accepted.
*/
protected boolean elementAccepted(String element) {
Object key = element.toLowerCase();
return fAcceptedElements.containsKey(key);
} // elementAccepted(String):boolean
/**
* Returns true if the specified element should be removed.
*/
protected boolean elementRemoved(String element) {
Object key = element.toLowerCase();
return fRemovedElements.containsKey(key);
} // elementRemoved(String):boolean
/**
* Handles an open tag.
*/
protected boolean handleOpenTag(QName element, XMLAttributes attributes) {
if (elementAccepted(element.rawname)) {
Object key = element.rawname.toLowerCase();
Object value = fAcceptedElements.get(key);
if (value != NULL) {
String[] anames = (String[]) value;
int attributeCount = attributes.getLength();
LOOP:
for (int i = 0; i < attributeCount; i++) {
String aname = attributes.getQName(i).toLowerCase();
for (int j = 0; j < anames.length; j++) {
if (anames[j].equals(aname)) {
continue LOOP;
}
}
attributes.removeAttributeAt(i--);
attributeCount--;
}
} else {
attributes.removeAllAttributes();
}
return true;
} else if (elementRemoved(element.rawname)) {
fRemovalElementDepth = fElementDepth;
}
return false;
} // handleOpenTag(QName,XMLAttributes):boolean
} // class DefaultFilter