All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.owasp.validator.html.scan.AntiSamyDOMScanner Maven / Gradle / Ivy

Go to download

A library for performing fast, configurable cleansing of HTML coming from untrusted sources.

There is a newer version: 1.7.5
Show newest version
/*
 * Copyright (c) 2007-2011, Arshan Dabirsiaghi, Jason Li
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
 *
 * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
 * Neither the name of OWASP nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
package org.owasp.validator.html.scan;

import org.apache.batik.css.parser.ParseException;
import org.apache.xerces.dom.DocumentImpl;
import org.cyberneko.html.parsers.DOMFragmentParser;
import org.owasp.validator.css.CssScanner;
import org.owasp.validator.css.ExternalCssScanner;
import org.owasp.validator.html.CleanResults;
import org.owasp.validator.html.Policy;
import org.owasp.validator.html.PolicyException;
import org.owasp.validator.html.ScanException;
import org.owasp.validator.html.model.Attribute;
import org.owasp.validator.html.model.Tag;
import org.owasp.validator.html.util.ErrorMessageUtil;
import org.owasp.validator.html.util.HTMLEntityEncoder;
import org.w3c.dom.*;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXNotRecognizedException;
import org.xml.sax.SAXNotSupportedException;

import java.io.IOException;
import java.io.StringReader;
import java.io.StringWriter;
import java.util.*;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * This is where the magic lives. All the scanning/filtration logic resides
 * here, but it should not be called directly. All scanning should be done
 * through an AntiSamy.scan() method.
 * 
 * @author Arshan Dabirsiaghi
 * 
 */
public class AntiSamyDOMScanner extends AbstractAntiSamyScanner {

    /** Owner document, used here as the factory for the fragment that receives parsed input. */
    private Document document = new DocumentImpl();

    /** Fragment the HTML parser writes into; it is then filtered in place and serialized. */
    private DocumentFragment dom = document.createDocumentFragment();

    /** Outcome of the most recent scan; stays null until a scan has completed. */
    private CleanResults results = null;

    /** Deepest element nesting tolerated before a scan is aborted. */
    private static final int maxDepth = 250;

    /**
     * Matches C0 control characters other than tab, line feed and carriage return,
     * UTF-16 surrogate code units, and U+FFFE/U+FFFF.
     */
    private static final Pattern invalidXmlCharacters =
            Pattern.compile("[\\u0000-\\u001F\\uD800-\\uDFFF\\uFFFE-\\uFFFF&&[^\\u0009\\u000A\\u000D]]");

    /**
     * Matches conditional-comment directives of the form [if ...] and [endif],
     * together with an optional leading "<", "!" and an optional trailing ">".
     * The literal had been reduced to "?", which is a dangling quantifier and makes
     * Pattern.compile throw PatternSyntaxException while the class is initialised.
     * NOTE(review): the pattern was reconstructed; confirm against the code that
     * applies it (not visible in this part of the file).
     */
    private static final Pattern conditionalDirectives =
            Pattern.compile("<?!?\\[\\s*(?:end)?if[^]]*\\]>?");

    /**
     * Pool of parser/matcher bundles shared by all scanner instances. The element type
     * is declared so that poll() yields a CachedItem without a cast.
     */
    private static final Queue<CachedItem> cachedItems = new ConcurrentLinkedQueue<CachedItem>();

    /**
     * Pairs a configured fragment parser with a reusable matcher for
     * invalidXmlCharacters, so both can be borrowed from and handed back to the
     * cachedItems queue as one unit.
     */
    static class CachedItem {
        private final DOMFragmentParser parser;
        private final Matcher invalidXmlCharMatcher;

        /**
         * Creates the matcher (over an empty input) and a freshly configured parser.
         *
         * @throws SAXNotSupportedException propagated from getDomParser()
         * @throws SAXNotRecognizedException propagated from getDomParser()
         */
        CachedItem() throws SAXNotSupportedException, SAXNotRecognizedException {
            invalidXmlCharMatcher = invalidXmlCharacters.matcher("");
            parser = getDomParser();
        }

        /** @return the parser owned by this item */
        DOMFragmentParser getDomFragmentParser() {
            return parser;
        }
    }

    /**
     * Creates a scanner bound to the supplied policy by delegating to the
     * superclass constructor.
     *
     * @param policy
     *            the policy passed on to the superclass
     */
    public AntiSamyDOMScanner(Policy policy) {
        super(policy);
    }

    /**
     * Creates a scanner through the no-argument superclass constructor; which
     * policy that selects is decided by the superclass and is not visible here.
     *
     * @throws PolicyException
     *             propagated from the superclass constructor
     * @noinspection UnusedDeclaration Todo Investigate
     */
    public AntiSamyDOMScanner() throws PolicyException {
        super();
    }

    /**
     * This is where the magic lives. Parses the supplied HTML into a DOM
     * fragment, filters that fragment against the policy and serializes the
     * cleaned result.
     *
     * @param html
     *            A String whose contents we want to scan.
     * @return A CleanResults object with an
     *         XMLDocumentFragment object and its String
     *         representation, as well as some scan statistics.
     * @throws ScanException
     *             if html is null, if it is longer than the policy's maximum
     *             input size, or if parsing, filtering or serializing fails.
     */
    public CleanResults scan(String html) throws ScanException {

        if (html == null) {
            throw new ScanException(new NullPointerException("Null input"));
        }

        errorMessages.clear();
        int maxInputSize = policy.getMaxInputSize();

        if (maxInputSize < html.length()) {
            addError(ErrorMessageUtil.ERROR_INPUT_SIZE, new Object[]{html.length(), maxInputSize});
            throw new ScanException(errorMessages.get(0));
        }

        isNofollowAnchors = policy.isNofollowAnchors();
        isValidateParamAsEmbed = policy.isValidateParamAsEmbed();

        long startOfScan = System.currentTimeMillis();

        try {

            // Borrow a parser/matcher bundle from the shared pool; build one on a miss.
            CachedItem cachedItem = cachedItems.poll();
            if (cachedItem == null) {
                cachedItem = new CachedItem();
            }

            /*
             * We have to replace any invalid XML characters to prevent NekoHTML
             * from breaking when it gets passed encodings like %21.
             */
            html = stripNonValidXMLCharacters(html, cachedItem.invalidXmlCharMatcher);

            /*
             * Start from an empty fragment. Without this, calling scan() a second
             * time on the same instance would append to the nodes left over from
             * the previous call and emit them again.
             */
            dom = document.createDocumentFragment();

            /*
             * First thing we do is call the HTML cleaner ("NekoHTML") on it
             * with the appropriate options. We choose not to omit tags due to
             * the fallibility of our own listing in the ever changing world of
             * W3C.
             */
            DOMFragmentParser parser = cachedItem.getDomFragmentParser();

            try {
                parser.parse(new InputSource(new StringReader(html)), dom);
            } catch (Exception e) {
                throw new ScanException(e);
            }

            processChildren(dom, 0);

            /*
             * Serialize the output and then return the resulting DOM object and
             * its string representation.
             */
            StringWriter out = new StringWriter();

            @SuppressWarnings("deprecation")
            org.apache.xml.serialize.OutputFormat format = getOutputFormat();

            //noinspection deprecation
            org.apache.xml.serialize.HTMLSerializer serializer = getHTMLSerializer(out, format);
            serializer.serialize(dom);

            /*
             * Get the String out of the StringWriter and rip out the XML
             * declaration if the Policy says we should.
             */
            final String trimmed = trim(html, out.getBuffer().toString());

            // Typed as Callable<String> so the result needs no unchecked conversion.
            Callable<String> cleanHtml = new Callable<String>() {
                public String call() throws Exception {
                    return trimmed;
                }
            };

            // Return the DOM object as well as string HTML.
            results = new CleanResults(startOfScan, cleanHtml, dom, errorMessages);

            // Only a bundle that completed a scan without an exception goes back to the pool.
            cachedItems.add(cachedItem);
            return results;

        } catch (SAXException e) {
            throw new ScanException(e);
        } catch (IOException e) {
            throw new ScanException(e);
        }

    }

    /**
     * Builds a NekoHTML fragment parser with the settings the scanner relies on:
     * the element-name property set to "lower", the strip-cdata-delims feature
     * switched off and the cdata-sections feature switched on. Strict attribute
     * name enforcement is requested as well, but only takes effect when the
     * parser on the classpath recognises that feature.
     *
     * @return the configured parser
     * @throws SAXNotRecognizedException
     *             if a mandatory property or feature is unknown to the parser
     * @throws SAXNotSupportedException
     *             if a property or feature value is rejected by the parser
     */
    static DOMFragmentParser getDomParser()
            throws SAXNotRecognizedException, SAXNotSupportedException {
        final DOMFragmentParser fragmentParser = new DOMFragmentParser();

        fragmentParser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
        fragmentParser.setFeature("http://cyberneko.org/html/features/scanner/style/strip-cdata-delims", false);
        fragmentParser.setFeature("http://cyberneko.org/html/features/scanner/cdata-sections", true);

        try {
            fragmentParser.setFeature("http://cyberneko.org/html/features/enforce-strict-attribute-names", true);
        } catch (SAXNotRecognizedException ignored) {
            // Deliberately ignored: this indicates that the patched nekohtml
            // is not on the classpath, so the optional feature is unavailable.
        }

        return fragmentParser;
    }

    /**
     * The workhorse of the scanner. Recursively scans document elements
     * according to the policy. This should be called implicitly through the
     * AntiSamy.scan() method.
     *
     * @param node
     *            The node to validate.
     * @param currentStackDepth
     *            Nesting depth of the caller; it is incremented on entry and
     *            compared against maxDepth.
     * @throws ScanException
     *             if the nesting depth exceeds maxDepth, or if handling the
     *             node or its children fails.
     */
    private void recursiveValidateTag(final Node node, int currentStackDepth) throws ScanException {

        currentStackDepth++;

        if(currentStackDepth > maxDepth) {
            throw new ScanException("Too many nested tags");
        }

        if (node instanceof Comment) {
            processCommentNode(node);
            return;
        }

        boolean isElement = node instanceof Element;
        NodeList eleChildNodes = node.getChildNodes();
        if (isElement && eleChildNodes.getLength() == 0) {
            if (removeDisallowedEmpty(node)){
                return;
            }
        }

        if (node instanceof Text && Node.CDATA_SECTION_NODE == node.getNodeType()) {
            stripCData(node);
            return;
        }

        if (node instanceof ProcessingInstruction) {
            // No explicit return here; control continues to the non-element guard below.
            removePI(node);
        }

        if (!isElement) {
            return;
        }

        final Element ele = (Element) node;
        final Node parentNode = ele.getParentNode();

        final String tagName = ele.getNodeName();
        /*
         * Lower-case with a fixed locale. The no-argument toLowerCase() uses
         * the JVM default locale, under which (e.g. Turkish) "I" does not map
         * to "i", so the policy lookup would depend on where the code runs.
         */
        final String tagNameLowerCase = tagName.toLowerCase(Locale.ENGLISH);
        Tag tagRule = policy.getTagByLowercaseName(tagNameLowerCase);

        /*
         * If <param> and no policy and isValidateParamAsEmbed and policy in
         * place for <embed> and <embed> policy is to validate, use custom
         * policy to get the tag through to the validator.
         */
        Tag embedTag = policy.getEmbedTag();
        boolean masqueradingParam = isMasqueradingParam(tagRule, embedTag, tagNameLowerCase);
        if (masqueradingParam){
            tagRule = Constants.BASIC_PARAM_TAG_RULE;
        }

        if ((tagRule == null && policy.isEncodeUnknownTag()) || (tagRule != null && tagRule.isAction( "encode"))) {
            encodeTag(currentStackDepth, ele, tagName, eleChildNodes);
        } else if (tagRule == null || tagRule.isAction( Policy.ACTION_FILTER)) {
            actionFilter(currentStackDepth, ele, tagName, tagRule, eleChildNodes);
        } else if (tagRule.isAction( Policy.ACTION_VALIDATE)) {
            actionValidate(currentStackDepth, ele, parentNode, tagName, tagNameLowerCase, tagRule, masqueradingParam, embedTag, eleChildNodes);
        } else if (tagRule.isAction( Policy.ACTION_TRUNCATE)) {
            actionTruncate(ele, tagName, eleChildNodes);
        } else {
            /*
             * If we reached this that means that the tag's action is "remove",
             * which means to remove the tag (including its contents).
             */
            addError(ErrorMessageUtil.ERROR_TAG_DISALLOWED, new Object[]{HTMLEntityEncoder.htmlEntityEncode(tagName)});
            removeNode(ele);
        }
    }

    /**
     * Tells whether an element should be run through the embed tag's rule: it
     * has no rule of its own, the isValidateParamAsEmbed flag is set, its name
     * is "param", and an embed rule exists whose action is validate.
     *
     * @param tagRule
     *            rule found for the element, or null when there is none
     * @param embedTag
     *            the policy's embed rule, or null when there is none
     * @param tagNameLowerCase
     *            lower-cased element name
     * @return true only when every condition above holds
     */
    private boolean isMasqueradingParam(Tag tagRule, Tag embedTag, String tagNameLowerCase){
        return tagRule == null
                && isValidateParamAsEmbed
                && "param".equals(tagNameLowerCase)
                && embedTag != null
                && embedTag.isAction( Policy.ACTION_VALIDATE);
    }

    /**
     * Handles an element whose tag is to be encoded: records the error,
     * processes the children, then hands the element to
     * encodeAndPromoteChildren.
     *
     * @param currentStackDepth
     *            current nesting depth, passed on when processing the children
     * @param ele
     *            the element being encoded
     * @param tagName
     *            the element's name; it is entity-encoded before being reported
     * @param eleChildNodes
     *            the element's child nodes
     * @throws ScanException
     *             propagated from processing the children
     */
    private void encodeTag(int currentStackDepth, Element ele, String tagName, NodeList eleChildNodes) throws ScanException {
        final Object[] errorArgs = new Object[]{HTMLEntityEncoder.htmlEntityEncode(tagName)};
        addError(ErrorMessageUtil.ERROR_TAG_ENCODED, errorArgs);

        // The children are processed before the element itself is transformed.
        processChildren(eleChildNodes, currentStackDepth);

        /*
         * Transform the tag to text, HTML-encode it and promote the
         * children. The tag will be kept in the fragment as one or two text
         * Nodes located before and after the children; representing how the
         * tag used to wrap them.
         */
        encodeAndPromoteChildren(ele);
    }

    /**
     * Handles an element that is to be filtered: records why (no rule at all,
     * or a rule whose action is filter), processes the children, then hands
     * the element to promoteChildren.
     *
     * @param currentStackDepth
     *            current nesting depth, passed on when processing the children
     * @param ele
     *            the element being filtered
     * @param tagName
     *            the element's name; it is entity-encoded before being reported
     * @param tag
     *            the rule for the element, or null when the policy has none
     * @param eleChildNodes
     *            the element's child nodes
     * @throws ScanException
     *             propagated from processing the children
     */
    private void actionFilter(int currentStackDepth, Element ele, String tagName, Tag tag, NodeList eleChildNodes) throws ScanException {
        final Object[] errorArgs = new Object[]{HTMLEntityEncoder.htmlEntityEncode(tagName)};

        if (tag != null) {
            addError(ErrorMessageUtil.ERROR_TAG_FILTERED, errorArgs);
        } else {
            addError(ErrorMessageUtil.ERROR_TAG_NOT_IN_POLICY, errorArgs);
        }

        processChildren(eleChildNodes, currentStackDepth);
        promoteChildren(ele);
    }

    private void actionValidate(int currentStackDepth, Element ele, Node parentNode, String tagName, String tagNameLowerCase, Tag tag, boolean masqueradingParam, Tag embedTag, NodeList eleChildNodes) throws ScanException {
        /*
    * If doing <param> as <embed>, now is the time to convert it.
    */
        String nameValue = null;
        if (masqueradingParam) {
            nameValue = ele.getAttribute("name");
            if (nameValue != null && !"".equals(nameValue)) {
                String valueValue = ele.getAttribute("value");
                ele.setAttribute(nameValue, valueValue);
                ele.removeAttribute("name");
                ele.removeAttribute("value");
                tag = embedTag;
            }
        }

        /*
    * Check to see if it's a