All downloads are free. Search and download functionality uses the official Maven repository.

org.owasp.validator.html.scan.AntiSamyDOMScanner Maven / Gradle / Ivy

The newest version!
/*
 * Copyright (c) 2007-2008, Arshan Dabirsiaghi, Jason Li
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
 *
 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
 * Neither the name of OWASP nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

package org.owasp.validator.html.scan;

import java.io.IOException;
import java.io.StringReader;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.Locale;
import java.util.MissingResourceException;
import java.util.ResourceBundle;
import java.util.regex.Pattern;

import org.apache.batik.css.parser.ParseException;
import org.apache.xerces.dom.DocumentImpl;
import org.apache.xml.serialize.HTMLSerializer;
import org.apache.xml.serialize.OutputFormat;
import org.apache.xml.serialize.XHTMLSerializer;
import org.cyberneko.html.parsers.DOMFragmentParser;
import org.owasp.validator.css.CssScanner;
import org.owasp.validator.html.CleanResults;
import org.owasp.validator.html.Policy;
import org.owasp.validator.html.PolicyException;
import org.owasp.validator.html.ScanException;
import org.owasp.validator.html.model.Attribute;
import org.owasp.validator.html.model.Tag;
import org.owasp.validator.html.util.ErrorMessageUtil;
import org.owasp.validator.html.util.HTMLEntityEncoder;
import org.w3c.dom.Comment;
import org.w3c.dom.DOMException;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXNotRecognizedException;



/**
 * This is where the magic lives. All the scanning/filtration logic resides here, but it should not be called
 * directly. All scanning should be done through an AntiSamy.scan() method.
 *
 * @author Arshan Dabirsiaghi
 *
 */

public class AntiSamyDOMScanner {

	// Policy consulted by scan() for the maximum input size, directives and per-tag rules.
	private Policy policy;
	// NOTE(review): presumably holds the outcome of the most recent scan; starts out unset.
	private CleanResults results = null;
	// Error messages collected while scanning; scan() reports the first entry when the input is too large.
	private ArrayList errorMessages = new ArrayList();
	// Owner document from which the working fragment below is created.
	private Document document = new DocumentImpl();
	// Fragment that the HTML parser populates with the parsed input.
	private DocumentFragment dom = document.createDocumentFragment();

	// NOTE(review): presumably the encoding callers fall back to when none is supplied -- confirm at call sites.
	public static final String DEFAULT_ENCODING_ALGORITHM = "UTF-8";

	// Language and country of the fallback locale used when no message bundle matches the current locale.
	private static final String DEFAULT_LOCALE_LANG = "en";
	private static final String DEFAULT_LOCALE_LOC = "US";
	
	/*
	 * Stand-in rule for "param" tags: it permits the "name" and "value" attributes, each checked against
	 * Policy.ANYTHING_REGEXP, and carries the validate action. It is substituted for a "param" tag that has
	 * no policy entry of its own when param-as-embed validation is enabled.
	 */
	private static final Tag BASIC_PARAM_TAG_RULE;
	static {
		final Attribute nameAttribute = new Attribute("name");
		nameAttribute.addAllowedRegExp(Policy.ANYTHING_REGEXP);

		final Attribute valueAttribute = new Attribute("value");
		valueAttribute.addAllowedRegExp(Policy.ANYTHING_REGEXP);

		// Assemble the rule on a local first, then publish it through the constant in one assignment.
		final Tag paramRule = new Tag("param");
		paramRule.addAttribute(nameAttribute);
		paramRule.addAttribute(valueAttribute);
		paramRule.setAction(Policy.ACTION_VALIDATE);

		BASIC_PARAM_TAG_RULE = paramRule;
	}

	// Message bundle used for error reporting; loaded by initializeErrors().
	private ResourceBundle messages = null;
	// Locale used to look up the message bundle; starts as the JVM default locale.
	private Locale locale = Locale.getDefault();

	// Set by scan(): true only when the policy's ANCHORS_NOFOLLOW directive equals "true".
	private boolean isNofollowAnchors = false;
	// Set by scan(): true only when the policy's VALIDATE_PARAM_AS_EMBED directive equals "true". When set, a
	// "param" tag without its own policy entry can be validated via BASIC_PARAM_TAG_RULE, provided the
	// "embed" tag's policy action is validate.
	private boolean isValidateParamAsEmbed = false;
	
	/*
	 * Hardcoded list of tag names that are allowed to be empty, i.e. to have no child nodes.
	 * NOTE(review): the earlier description ("strictly barred from having children") does not match the
	 * field name or entries such as "a" and "textarea"; this list appears to be consulted by the
	 * empty-element check during validation -- confirm against that code.
	 */
	private String[] allowedEmptyTags = { 
			"br", "hr", "a",
			"img", "link", "iframe", "script", "object", "applet",
			"frame", "base", "param", "meta", "input", "textarea", "embed",
			"basefont", "col"  };

	/**
	 * Loads the "AntiSamy" message bundle into {@code messages}.
	 *
	 * The bundle is first requested for this scanner's current locale. If that lookup fails with a
	 * MissingResourceException, the bundle is requested again for the hardcoded fallback locale
	 * (DEFAULT_LOCALE_LANG / DEFAULT_LOCALE_LOC). A failure of the fallback lookup is not caught here.
	 */
	public void initializeErrors() {

		ResourceBundle bundle;

		try {
			bundle = ResourceBundle.getBundle("AntiSamy", locale);
		} catch (MissingResourceException missingForLocale) {
			// Nothing available for the current locale: retry with the fixed default locale.
			Locale fallbackLocale = new Locale(DEFAULT_LOCALE_LANG, DEFAULT_LOCALE_LOC);
			bundle = ResourceBundle.getBundle("AntiSamy", fallbackLocale);
		}

		messages = bundle;
	}

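	/**
	 * This is where the magic lives.
	 * @param html A String whose contents we want to scan.
	 * @param inputEncoding The encoding handed to the HTML parser as its default encoding.
	 * @param outputEncoding Presumably the encoding applied when the cleaned markup is serialized (TODO confirm).
	 * @return A CleanResults object with an XMLDocumentFragment object and its String representation, as well as some scan statistics.
	 * @throws ScanException If the input is null, is longer than the policy's maximum input size, or cannot be parsed.
	 */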

	public CleanResults scan(String html, String inputEncoding, String outputEncoding) throws ScanException {

		if ( html == null ) {
			throw new ScanException(new NullPointerException("Null input"));
		}

		initializeErrors();

		int maxInputSize = policy.getMaxInputSize();

		if ( maxInputSize < html.length() ) {
			addError(ErrorMessageUtil.ERROR_INPUT_SIZE, new Object[] { new Integer(html.length()), new Integer(maxInputSize) });
			throw new ScanException( errorMessages.get(0).toString() );
		}

		isNofollowAnchors = "true".equals(policy.getDirective(Policy.ANCHORS_NOFOLLOW));
		isValidateParamAsEmbed = "true".equals(policy.getDirective(Policy.VALIDATE_PARAM_AS_EMBED));

		Date start = new Date();

		try {

			/*
			 * We have to replace any invalid XML characters to prevent NekoHTML from breaking when it gets passed
			 * encodings like %21.
			 */

			html = stripNonValidXMLCharacters(html);

			/*
			 * First thing we do is call the HTML cleaner ("NekoHTML") on it with the appropriate options. We choose
			 * not to omit tags due to the fallibility of our own listing in the ever changing world
			 * of W3C.
			 */

			DOMFragmentParser parser = new DOMFragmentParser();
			parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
			parser.setProperty("http://cyberneko.org/html/properties/default-encoding",inputEncoding);

			parser.setFeature("http://cyberneko.org/html/features/scanner/style/strip-cdata-delims", false);
			parser.setFeature("http://cyberneko.org/html/features/scanner/cdata-sections", true);

			try {
				parser.setFeature("http://cyberneko.org/html/features/enforce-strict-attribute-names", true);
			} catch (SAXNotRecognizedException se) {
				// this indicates that the patched nekohtml is not on the classpath
			}

			try {
				parser.parse(new InputSource(new StringReader(html)),dom);
			} catch (Exception e) {
				throw new ScanException(e);
			}

			/*
			 * Call the work horse.
			 */

			for(int i = 0;i?", ""));
				}		
			}
			
			return;
		}

		if ( node instanceof Element && node.getChildNodes().getLength() == 0 ) {

			boolean isEmptyAllowed = false;

			for(int i=0; i and no policy and isValidateParamAsEmbed and policy in place for  and
		 *  policy is to validate, use custom policy to get the tag through to the validator.
		 */
		boolean masqueradingParam = false;
		if (tag == null && isValidateParamAsEmbed && "param".equals(tagName.toLowerCase())) {
			Tag embedPolicy = policy.getTagByName("embed");
			if (embedPolicy != null && Policy.ACTION_VALIDATE.equals(embedPolicy.getAction())) {
				tag = BASIC_PARAM_TAG_RULE;
				masqueradingParam = true;
			}
		}
		
		if ((tag == null && "encode".equals(policy.getDirective("onUnknownTag"))) ||
			(tag != null && "encode".equals(tag.getAction())) ) {


			addError(ErrorMessageUtil.ERROR_TAG_ENCODED, new Object[] {HTMLEntityEncoder.htmlEntityEncode(tagName)});

			/*
			 * We have to filter out the tags only. This means the content should remain. First
			 * step is to validate before promoting its children.
			 */

			for (int i = 0; i < node.getChildNodes().getLength(); i++) {

				tmp = node.getChildNodes().item(i);

				recursiveValidateTag(tmp);

				/*
				 * This indicates the node was removed/failed validation.
       			 */
				if (tmp.getParentNode() == null) {
					i--;
				}
			}

			/*
			 * Transform the tag to text, HTML-encode it and promote the children. The tag will
			 * be kept in the fragment as one or two text Nodes located before and after the
			 * children; representing how the tag used to wrap them.
			 */

			encodeAndPromoteChildren(ele);

			return;

		} else if (tag == null || Policy.ACTION_FILTER.equals(tag.getAction())) {

			if ( tag == null ) {
				addError( ErrorMessageUtil.ERROR_TAG_NOT_IN_POLICY, new Object[] { HTMLEntityEncoder.htmlEntityEncode(tagName)} );
			} else {
				addError( ErrorMessageUtil.ERROR_TAG_FILTERED, new Object[] { HTMLEntityEncoder.htmlEntityEncode(tagName)} );
			}


			/*
			 * We have to filter out the tags only. This means
			 * the content should remain. First step is to validate
			 * before promoting its children.
			 */

			for(int i=0;i as , now is the time to convert it.
			 */
			String nameValue = null;
			if (masqueradingParam) {
				nameValue = ele.getAttribute("name");
				if (nameValue != null && ! "".equals(nameValue) ) {
					String valueValue = ele.getAttribute("value");
					ele.setAttribute(nameValue, valueValue);
					ele.removeAttribute("name");
					ele.removeAttribute("value");
					tag = policy.getTagByName("embed");
				}
			}
			
			/*
			 * Check to see if it's a