org.owasp.validator.html.scan.AntiSamyDOMScanner Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of com.liferay.portal.security.antisamy
Liferay Portal Security AntiSamy
/*
* Copyright (c) 2007-2021, Arshan Dabirsiaghi, Jason Li
*
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
*
* Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
* Neither the name of OWASP nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package org.owasp.validator.html.scan;
import org.apache.batik.css.parser.ParseException;
import org.apache.xerces.dom.DocumentImpl;
import net.sourceforge.htmlunit.cyberneko.parsers.DOMFragmentParser;
import org.owasp.validator.css.CssScanner;
import org.owasp.validator.html.CleanResults;
import org.owasp.validator.html.Policy;
import org.owasp.validator.html.PolicyException;
import org.owasp.validator.html.ScanException;
import org.owasp.validator.html.model.Attribute;
import org.owasp.validator.html.model.Tag;
import org.owasp.validator.html.util.ErrorMessageUtil;
import org.owasp.validator.html.util.HTMLEntityEncoder;
import org.w3c.dom.Comment;
import org.w3c.dom.DOMException;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.ProcessingInstruction;
import org.w3c.dom.Text;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXNotRecognizedException;
import org.xml.sax.SAXNotSupportedException;
import java.io.IOException;
import java.io.StringReader;
import java.io.StringWriter;
import java.util.List;
import java.util.Queue;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * This is where the magic lives. All the scanning/filtration logic resides
 * here, but it should not be called directly. All scanning should be done
 * through the {@code AntiSamy.scan()} method.
 *
 * @author Arshan Dabirsiaghi
 */
public class AntiSamyDOMScanner extends AbstractAntiSamyScanner {
// Working DOM used to accumulate the sanitized fragment for this scanner instance.
private Document document = new DocumentImpl();
private DocumentFragment dom = document.createDocumentFragment();
// Results of the most recent scan; null until scan() completes.
private CleanResults results = null;
// Maximum allowed tag-nesting depth, guarding against stack exhaustion on
// pathologically nested input.
private static final int maxDepth = 250;
// Characters illegal in XML 1.0 (C0 controls, surrogates, U+FFFE/U+FFFF),
// excluding TAB, LF and CR which are permitted.
private static final Pattern invalidXmlCharacters =
Pattern.compile("[\\u0000-\\u001F\\uD800-\\uDFFF\\uFFFE-\\uFFFF&&[^\\u0009\\u000A\\u000D]]");
// Matches IE conditional-comment directives such as "[if IE]" / "[endif]".
private static final Pattern conditionalDirectives = Pattern.compile("!?\\[\\s*(?:end)?if[^]]*\\]>?");
// Pool of reusable parser/matcher pairs shared across threads. Generic type
// restored: the scraped source had erased it to a raw Queue, whose poll()
// returns Object and cannot be assigned to CachedItem in scan().
private static final Queue<CachedItem> cachedItems = new ConcurrentLinkedQueue<CachedItem>();
/**
 * Bundle of reusable, per-scan state: a configured {@code DOMFragmentParser}
 * and a pre-compiled matcher for invalid XML characters. Instances are pooled
 * in {@code cachedItems} so each scan can avoid the cost of re-creating them.
 */
static class CachedItem {
// Parser configured by getDomParser(); exposed via getDomFragmentParser().
private final DOMFragmentParser parser;
// Reusable matcher (reset against each input); read directly by scan().
private final Matcher invalidXmlCharMatcher = invalidXmlCharacters.matcher("");
CachedItem() throws SAXNotSupportedException, SAXNotRecognizedException {
this.parser = getDomParser();
}
DOMFragmentParser getDomFragmentParser() {
return parser;
}
}
/**
 * Creates a scanner that validates HTML against the given policy.
 *
 * @param policy The policy to enforce while scanning.
 */
public AntiSamyDOMScanner(Policy policy) {
super(policy);
}
/* UnusedDeclaration TODO Investigate */
/**
 * Creates a scanner using the default policy resolved by the superclass.
 *
 * @throws PolicyException When the default policy cannot be loaded.
 */
public AntiSamyDOMScanner() throws PolicyException {
super();
}
/**
 * This is where the magic lives.
 *
 * @param html A String whose contents we want to scan.
 * @return A {@link CleanResults} object with an {@code XMLDocumentFragment}
 *         object and its {@code String} representation, as well as some scan
 *         statistics.
 * @throws ScanException When there is a problem encountered while scanning
 *         the HTML.
 */
@Override
public CleanResults scan(String html) throws ScanException {
    if (html == null) {
        throw new ScanException(new NullPointerException("Null html input"));
    }
    errorMessages.clear();
    // Reject oversized input up front, before any parsing work is done.
    int maxInputSize = policy.getMaxInputSize();
    if (maxInputSize < html.length()) {
        addError(ErrorMessageUtil.ERROR_INPUT_SIZE, new Object[]{html.length(), maxInputSize});
        throw new ScanException(errorMessages.get(0));
    }
    // Snapshot per-scan policy switches consulted during tag validation.
    isNofollowAnchors = policy.isNofollowAnchors();
    isNoopenerAndNoreferrerAnchors = policy.isNoopenerAndNoreferrerAnchors();
    isValidateParamAsEmbed = policy.isValidateParamAsEmbed();
    long startOfScan = System.currentTimeMillis();
    try {
        // Reuse a pooled parser/matcher pair when one is available; the cast
        // keeps this edit compiling even against the raw-typed queue
        // declaration in the scraped source.
        CachedItem cachedItem = (CachedItem) cachedItems.poll();
        if (cachedItem == null) {
            cachedItem = new CachedItem();
        }
        /*
         * We have to replace any invalid XML characters to prevent NekoHTML
         * from breaking when it gets passed encodings like %21.
         */
        html = stripNonValidXMLCharacters(html, cachedItem.invalidXmlCharMatcher);
        /*
         * First thing we do is call the HTML cleaner ("NekoHTML") on it
         * with the appropriate options. We choose not to omit tags due to
         * the fallibility of our own listing in the ever changing world of
         * W3C.
         */
        DOMFragmentParser parser = cachedItem.getDomFragmentParser();
        try {
            parser.parse(new InputSource(new StringReader(html)), dom);
        } catch (Exception e) {
            throw new ScanException(e);
        }
        processChildren(dom, 0);
        /*
         * Serialize the output and then return the resulting DOM object and
         * its string representation.
         */
        final String trimmedHtml = html;
        StringWriter out = new StringWriter();
        @SuppressWarnings("deprecation")
        org.apache.xml.serialize.OutputFormat format = getOutputFormat();
        //noinspection deprecation
        org.apache.xml.serialize.HTMLSerializer serializer = getHTMLSerializer(out, format);
        serializer.serialize(dom);
        /*
         * Get the String out of the StringWriter and rip out the XML
         * declaration if the Policy says we should.
         */
        final String trimmed = trim(trimmedHtml, out.getBuffer().toString());
        // Generics restored: the scraped source had dropped <String>, leaving
        // a raw Callable and an unchecked-conversion warning.
        Callable<String> cleanHtml = new Callable<String>() {
            public String call() throws Exception {
                return trimmed;
            }
        };
        /*
         * Return the DOM object as well as string HTML.
         */
        results = new CleanResults(startOfScan, cleanHtml, dom, errorMessages);
        // Return the reusable state to the pool for subsequent scans.
        cachedItems.add(cachedItem);
        return results;
    } catch (SAXException | IOException e) {
        throw new ScanException(e);
    }
}
/**
 * Builds a {@code DOMFragmentParser} configured for AntiSamy scanning:
 * element names are lower-cased, CDATA delimiters inside style content are
 * kept, and CDATA sections are preserved. Strict attribute-name enforcement
 * is additionally enabled when a NekoHTML build that supports it is present.
 *
 * @return A freshly configured parser instance.
 * @throws SAXNotRecognizedException If a mandatory property or feature is unknown.
 * @throws SAXNotSupportedException If a mandatory property or feature cannot be set.
 */
static DOMFragmentParser getDomParser()
throws SAXNotRecognizedException, SAXNotSupportedException {
    final DOMFragmentParser fragmentParser = new DOMFragmentParser();
    fragmentParser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
    fragmentParser.setFeature("http://cyberneko.org/html/features/scanner/style/strip-cdata-delims", false);
    fragmentParser.setFeature("http://cyberneko.org/html/features/scanner/cdata-sections", true);
    try {
        fragmentParser.setFeature("http://cyberneko.org/html/features/enforce-strict-attribute-names", true);
    } catch (SAXNotRecognizedException ignored) {
        // Best effort: this feature exists only in the patched NekoHTML.
        // When that build is not on the classpath we proceed without strict
        // attribute-name enforcement rather than failing the scan.
    }
    return fragmentParser;
}
/**
* The workhorse of the scanner. Recursively scans document elements
* according to the policy. This should be called implicitly through the
* AntiSamy.scan() method.
*
* @param node The node to validate.
*/
private void recursiveValidateTag(final Node node, int currentStackDepth) throws ScanException {
currentStackDepth++;
if(currentStackDepth > maxDepth) {
throw new ScanException("Too many nested tags");
}
if (node instanceof Comment) {
processCommentNode(node);
return;
}
boolean isElement = node instanceof Element;
NodeList eleChildNodes = node.getChildNodes();
if (isElement && eleChildNodes.getLength() == 0) {
if (removeDisallowedEmpty(node)){
return;
}
}
if (node instanceof Text && Node.CDATA_SECTION_NODE == node.getNodeType()) {
stripCData(node);
return;
}
if (node instanceof ProcessingInstruction) {
removePI(node);
}
if (!isElement) {
return;
}
final Element ele = (Element) node;
final Node parentNode = ele.getParentNode();
final String tagName = ele.getNodeName();
final String tagNameLowerCase = tagName.toLowerCase();
Tag tagRule = policy.getTagByLowercaseName(tagNameLowerCase);
/*
* If and no policy and isValidateParamAsEmbed and policy in
* place for