
org.owasp.validator.html.scan.AntiSamyDOMScanner Maven / Gradle / Ivy
The newest version!
/*
* Copyright (c) 2007-2008, Arshan Dabirsiaghi, Jason Li
*
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
*
* Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
* Neither the name of OWASP nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package org.owasp.validator.html.scan;
import java.io.IOException;
import java.io.StringReader;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.Locale;
import java.util.MissingResourceException;
import java.util.ResourceBundle;
import java.util.regex.Pattern;
import org.apache.batik.css.parser.ParseException;
import org.apache.xerces.dom.DocumentImpl;
import org.apache.xml.serialize.HTMLSerializer;
import org.apache.xml.serialize.OutputFormat;
import org.apache.xml.serialize.XHTMLSerializer;
import org.cyberneko.html.parsers.DOMFragmentParser;
import org.owasp.validator.css.CssScanner;
import org.owasp.validator.html.CleanResults;
import org.owasp.validator.html.Policy;
import org.owasp.validator.html.PolicyException;
import org.owasp.validator.html.ScanException;
import org.owasp.validator.html.model.Attribute;
import org.owasp.validator.html.model.Tag;
import org.owasp.validator.html.util.ErrorMessageUtil;
import org.owasp.validator.html.util.HTMLEntityEncoder;
import org.w3c.dom.Comment;
import org.w3c.dom.DOMException;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXNotRecognizedException;
/**
* This is where the magic lives. All the scanning/filtration logic resides here, but it should not be called
* directly. All scanning should be done through an <code>AntiSamy.scan()</code> method.
*
* @author Arshan Dabirsiaghi
*
*/
public class AntiSamyDOMScanner {
/* Policy consulted by scan() for the maximum input size and for directives. */
private Policy policy;
/* Result holder, initially null. NOTE(review): presumably filled in at the end of scan() -- confirm. */
private CleanResults results = null;
/*
* Error messages for the current scan; scan() reads the first entry when the input is too large.
* NOTE(review): presumably appended to by addError() -- confirm.
*/
private ArrayList errorMessages = new ArrayList();
/* Owner document used to create the fragment below. */
private Document document = new DocumentImpl();
/* Fragment that the HTML parser populates in scan(). */
private DocumentFragment dom = document.createDocumentFragment();
/* Default character-encoding name. */
public static final String DEFAULT_ENCODING_ALGORITHM = "UTF-8";
/*
* Language and country of the fallback locale that initializeErrors() uses when no
* "AntiSamy" resource bundle can be found for the current locale.
*/
private static final String DEFAULT_LOCALE_LANG = "en";
private static final String DEFAULT_LOCALE_LOC = "US";
/*
* Built-in rule for the "param" tag: its "name" and "value" attributes accept
* anything (Policy.ANYTHING_REGEXP) and the tag's action is Policy.ACTION_VALIDATE.
*/
private static final Tag BASIC_PARAM_TAG_RULE;
static {
    Attribute nameAttribute = new Attribute("name");
    nameAttribute.addAllowedRegExp(Policy.ANYTHING_REGEXP);

    Attribute valueAttribute = new Attribute("value");
    valueAttribute.addAllowedRegExp(Policy.ANYTHING_REGEXP);

    // Assemble the rule on a local first, then publish it to the final field once.
    Tag paramRule = new Tag("param");
    paramRule.addAttribute(nameAttribute);
    paramRule.addAttribute(valueAttribute);
    paramRule.setAction(Policy.ACTION_VALIDATE);

    BASIC_PARAM_TAG_RULE = paramRule;
}
/* Message bundle; loaded by initializeErrors(). */
private ResourceBundle messages = null;
/* Locale used to look up the message bundle; starts as the JVM default locale. */
private Locale locale = Locale.getDefault();
/* Set in scan() from the Policy.ANCHORS_NOFOLLOW directive (true only when the directive is "true"). */
private boolean isNofollowAnchors = false;
/* Set in scan() from the Policy.VALIDATE_PARAM_AS_EMBED directive (true only when the directive is "true"). */
private boolean isValidateParamAsEmbed = false;
/*
* Hardcoded list of tags that are allowed to be empty, i.e. to have no child nodes.
* NOTE(review): presumably consulted by the check for childless elements during
* scanning -- confirm against that code.
*/
private String[] allowedEmptyTags = {
"br", "hr", "a",
"img", "link", "iframe", "script", "object", "applet",
"frame", "base", "param", "meta", "input", "textarea", "embed",
"basefont", "col" };
/**
 * Loads the "AntiSamy" message bundle for the current locale. If no bundle can be
 * found for that locale, the bundle for the default locale (DEFAULT_LOCALE_LANG /
 * DEFAULT_LOCALE_LOC) is loaded instead.
 */
public void initializeErrors() {
    ResourceBundle bundle;
    try {
        bundle = ResourceBundle.getBundle("AntiSamy", locale);
    } catch (MissingResourceException missingBundle) {
        // Nothing available for the configured locale: fall back to the default one.
        Locale fallbackLocale = new Locale(DEFAULT_LOCALE_LANG, DEFAULT_LOCALE_LOC);
        bundle = ResourceBundle.getBundle("AntiSamy", fallbackLocale);
    }
    messages = bundle;
}
/**
* This is where the magic lives.
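* @param html A String whose contents we want to scan.
* @return A <code>CleanResults</code> object with an <code>XMLDocumentFragment</code> object and its String
* representation, as well as some scan statistics.
* @throws ScanException For example when the input is null, is longer than the policy's maximum input size,
* or cannot be parsed.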
*/
public CleanResults scan(String html, String inputEncoding, String outputEncoding) throws ScanException {
if ( html == null ) {
throw new ScanException(new NullPointerException("Null input"));
}
initializeErrors();
int maxInputSize = policy.getMaxInputSize();
if ( maxInputSize < html.length() ) {
addError(ErrorMessageUtil.ERROR_INPUT_SIZE, new Object[] { new Integer(html.length()), new Integer(maxInputSize) });
throw new ScanException( errorMessages.get(0).toString() );
}
isNofollowAnchors = "true".equals(policy.getDirective(Policy.ANCHORS_NOFOLLOW));
isValidateParamAsEmbed = "true".equals(policy.getDirective(Policy.VALIDATE_PARAM_AS_EMBED));
Date start = new Date();
try {
/*
* We have to replace any invalid XML characters to prevent NekoHTML from breaking when it gets passed
* encodings like %21.
*/
html = stripNonValidXMLCharacters(html);
/*
* First thing we do is call the HTML cleaner ("NekoHTML") on it with the appropriate options. We choose
* not to omit tags due to the fallibility of our own listing in the ever changing world
* of W3C.
*/
DOMFragmentParser parser = new DOMFragmentParser();
parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
parser.setProperty("http://cyberneko.org/html/properties/default-encoding",inputEncoding);
parser.setFeature("http://cyberneko.org/html/features/scanner/style/strip-cdata-delims", false);
parser.setFeature("http://cyberneko.org/html/features/scanner/cdata-sections", true);
try {
parser.setFeature("http://cyberneko.org/html/features/enforce-strict-attribute-names", true);
} catch (SAXNotRecognizedException se) {
// this indicates that the patched nekohtml is not on the classpath
}
try {
parser.parse(new InputSource(new StringReader(html)),dom);
} catch (Exception e) {
throw new ScanException(e);
}
/*
* Call the work horse.
*/
for(int i = 0;i?", ""));
}
}
return;
}
if ( node instanceof Element && node.getChildNodes().getLength() == 0 ) {
boolean isEmptyAllowed = false;
for(int i=0; i and no policy and isValidateParamAsEmbed and policy in place for