org.owasp.validator.html.scan.AntiSamyDOMScanner

A library for performing fast, configurable cleansing of HTML coming from untrusted sources.
/*
* Copyright (c) 2007-2023, Arshan Dabirsiaghi, Jason Li
*
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
*
* Redistributions of source code must retain the above copyright notice, this list of conditions
* and the following disclaimer. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the documentation and/or other
* materials provided with the distribution. Neither the name of OWASP nor the names of its
* contributors may be used to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
* IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package org.owasp.validator.html.scan;
import java.io.IOException;
import java.io.StringReader;
import java.io.StringWriter;
import java.util.List;
import java.util.Queue;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.batik.css.parser.ParseException;
import org.htmlunit.cyberneko.parsers.DOMFragmentParser;
import org.htmlunit.cyberneko.xerces.dom.DocumentImpl;
import org.owasp.validator.css.CssScanner;
import org.owasp.validator.html.CleanResults;
import org.owasp.validator.html.Policy;
import org.owasp.validator.html.PolicyException;
import org.owasp.validator.html.ScanException;
import org.owasp.validator.html.model.Attribute;
import org.owasp.validator.html.model.Tag;
import org.owasp.validator.html.util.ErrorMessageUtil;
import org.owasp.validator.html.util.HTMLEntityEncoder;
import org.w3c.dom.Comment;
import org.w3c.dom.DOMException;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.ProcessingInstruction;
import org.w3c.dom.Text;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXNotRecognizedException;
import org.xml.sax.SAXNotSupportedException;
/**
* This is where the magic lives (all the HTML scanning/filtration logic resides here). This class
* should not be called directly. All scanning should be done through an AntiSamy.scan()
* method invocation.
*
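* <p>A minimal usage sketch (illustrative only; the policy file name is a placeholder, not
* something defined by this class):
*
* <pre>{@code
* Policy policy = Policy.getInstance("antisamy-policy.xml");
* CleanResults results = new AntiSamy().scan(untrustedHtml, policy, AntiSamy.DOM);
* String safeHtml = results.getCleanHTML();
* }</pre>
*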
* @author Arshan Dabirsiaghi
*/
public class AntiSamyDOMScanner extends AbstractAntiSamyScanner {
private Document document = new DocumentImpl();
private DocumentFragment dom = document.createDocumentFragment();
private CleanResults results = null;
private static final int maxDepth = 250;
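// Control characters, surrogates, and U+FFFE/U+FFFF are not legal in XML; the character-class
// intersection ("&&[^...]") exempts tab, line feed, and carriage return.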
private static final Pattern invalidXmlCharacters =
Pattern.compile("[\\u0000-\\u001F\\uD800-\\uDFFF\\uFFFE-\\uFFFF&&[^\\u0009\\u000A\\u000D]]");
private static final Pattern conditionalDirectives =
Pattern.compile("!?\\[\\s*(?:end)?if[^]]*\\]>?");
private static final Queue<CachedItem> cachedItems = new ConcurrentLinkedQueue<CachedItem>();
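// A CachedItem bundles a reusable parser with a pre-compiled matcher; instances are pooled in
// cachedItems so repeated scans avoid re-creating them.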
static class CachedItem {
private final DOMFragmentParser parser;
private final Matcher invalidXmlCharMatcher = invalidXmlCharacters.matcher("");
CachedItem() throws SAXNotSupportedException, SAXNotRecognizedException {
this.parser = getDomParser();
}
DOMFragmentParser getDomFragmentParser() {
return parser;
}
}
/**
* Create an instance of this class configured to use the specified policy.
*
* @param policy The policy to use.
*/
public AntiSamyDOMScanner(Policy policy) {
super(policy);
}
/**
* Create an instance of this class configured to use the default AntiSamy policy.
*
* @throws PolicyException thrown when there is a problem validating or parsing the policy file.
* Any validation errors not caught by the XML validation will be thrown with this exception.
*/
public AntiSamyDOMScanner() throws PolicyException {
super();
}
/**
* This is where the magic lives.
*
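* <p>Callers typically inspect the returned results rather than trusting the input (illustrative
* sketch; variable names are placeholders):
*
* <pre>{@code
* CleanResults results = new AntiSamy().scan(untrustedHtml, policy);
* String safeHtml = results.getCleanHTML();
* if (results.getNumberOfErrors() > 0) {
*   // the input was modified during cleaning; see results.getErrorMessages()
* }
* }</pre>
*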
* @param html A String whose contents is to be sanitized per the configured AntiSamy policy.
* @return A CleanResults object with (possibly) an XMLDocumentFragment
* object and a String representation of the cleaned HTML, as well as some scan statistics.
* Note that ONLY the cleaned HTML can be considered trustworthy. The absence of errorMessages
* in the CleanResults does NOT necessarily indicate the input was safe (i.e., contained no
* attacks).
* @throws ScanException When there is a problem encountered while scanning the HTML input.
*/
@Override
public CleanResults scan(String html) throws ScanException {
if (html == null) {
throw new ScanException(new NullPointerException("Null HTML input"));
}
errorMessages.clear();
int maxInputSize = policy.getMaxInputSize();
if (maxInputSize < html.length()) {
addError(ErrorMessageUtil.ERROR_INPUT_SIZE, new Object[] {html.length(), maxInputSize});
throw new ScanException(errorMessages.get(0));
}
isNofollowAnchors = policy.isNofollowAnchors();
isNoopenerAndNoreferrerAnchors = policy.isNoopenerAndNoreferrerAnchors();
isValidateParamAsEmbed = policy.isValidateParamAsEmbed();
long startOfScan = System.currentTimeMillis();
try {
CachedItem cachedItem;
cachedItem = cachedItems.poll();
if (cachedItem == null) {
cachedItem = new CachedItem();
}
/*
* We have to replace any invalid XML characters to prevent NekoHTML
* from breaking when it gets passed encodings like %21.
*/
html = stripNonValidXMLCharacters(html, cachedItem.invalidXmlCharMatcher);
/*
* First thing we do is call the HTML cleaner ("NekoHTML") on it
* with the appropriate options. We choose not to omit tags due to
* the fallibility of our own listing in the ever changing world of
* W3C.
*/
DOMFragmentParser parser = cachedItem.getDomFragmentParser();
try {
parser.parse(new InputSource(new StringReader(html)), dom);
} catch (Exception e) {
throw new ScanException(e);
}
processChildren(dom, 0);
/*
* Serialize the output and then return the resulting DOM object and
* its string representation.
*/
final String trimmedHtml = html;
StringWriter out = new StringWriter();
@SuppressWarnings("deprecation")
org.apache.xml.serialize.OutputFormat format = getOutputFormat();
//noinspection deprecation
org.apache.xml.serialize.HTMLSerializer serializer = getHTMLSerializer(out, format);
serializer.serialize(dom);
/*
* Get the String out of the StringWriter and rip out the XML
* declaration if the Policy says we should.
*/
final String trimmed = trim(trimmedHtml, out.getBuffer().toString());
Callable<String> cleanHtml =
new Callable<String>() {
@Override
public String call() throws Exception {
return trimmed;
}
};
/*
* Return the DOM object as well as string HTML.
*/
results = new CleanResults(startOfScan, cleanHtml, dom, errorMessages);
cachedItems.add(cachedItem);
return results;
} catch (SAXException | IOException e) {
throw new ScanException(e);
}
}
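/**
* Creates a CyberNeko DOMFragmentParser configured for AntiSamy: element names are lower-cased,
* CDATA delimiters in style content are kept (not stripped), CDATA sections are reported, and
* noscript content is not parsed.
*/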
static DOMFragmentParser getDomParser()
throws SAXNotRecognizedException, SAXNotSupportedException {
DOMFragmentParser parser = new DOMFragmentParser();
parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
parser.setFeature("http://cyberneko.org/html/features/scanner/style/strip-cdata-delims", false);
parser.setFeature("http://cyberneko.org/html/features/scanner/cdata-sections", true);
parser.setFeature("http://cyberneko.org/html/features/parse-noscript-content", false);
return parser;
}
/**
* The workhorse of the scanner. Recursively scans document elements according to the policy. This
* should be called implicitly through the AntiSamy.scan() method.
*
* @param node The node to validate.
*/
private void recursiveValidateTag(final Node node, int currentStackDepth) throws ScanException {
currentStackDepth++;
if (currentStackDepth > maxDepth) {
throw new ScanException("Too many nested tags");
}
if (node instanceof Comment) {
processCommentNode(node);
return;
}
boolean isElement = node instanceof Element;
NodeList eleChildNodes = node.getChildNodes();
if (isElement && eleChildNodes.getLength() == 0) {
if (removeDisallowedEmpty(node)) {
return;
}
}
if (node instanceof Text && Node.CDATA_SECTION_NODE == node.getNodeType()) {
stripCData(node);
return;
}
if (node instanceof ProcessingInstruction) {
removePI(node);
}
if (!isElement) {
return;
}
final Element ele = (Element) node;
final Node parentNode = ele.getParentNode();
final String tagName = ele.getNodeName();
final String tagNameLowerCase = tagName.toLowerCase();
Tag tagRule = policy.getTagByLowercaseName(tagNameLowerCase);
/*
* If <param> and no policy and isValidateParamAsEmbed and policy in
* place for