All downloads are FREE. Search and download functionality uses the official Maven repository.

org.owasp.validator.html.scan.AntiSamyDOMScanner Maven / Gradle / Ivy

Go to download

A library for performing fast, configurable cleansing of HTML coming from untrusted sources.

The newest version!
/*
 * Copyright (c) 2007-2023, Arshan Dabirsiaghi, Jason Li
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
 *
 * Redistributions of source code must retain the above copyright notice, this list of conditions
 * and the following disclaimer. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the documentation and/or other
 * materials provided with the distribution. Neither the name of OWASP nor the names of its
 * contributors may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
 * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

package org.owasp.validator.html.scan;

import java.io.IOException;
import java.io.StringReader;
import java.io.StringWriter;
import java.util.List;
import java.util.Locale;
import java.util.Queue;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.batik.css.parser.ParseException;
import org.htmlunit.cyberneko.parsers.DOMFragmentParser;
import org.htmlunit.cyberneko.xerces.dom.DocumentImpl;
import org.owasp.validator.css.CssScanner;
import org.owasp.validator.html.CleanResults;
import org.owasp.validator.html.Policy;
import org.owasp.validator.html.PolicyException;
import org.owasp.validator.html.ScanException;
import org.owasp.validator.html.model.Attribute;
import org.owasp.validator.html.model.Tag;
import org.owasp.validator.html.util.ErrorMessageUtil;
import org.owasp.validator.html.util.HTMLEntityEncoder;
import org.w3c.dom.Comment;
import org.w3c.dom.DOMException;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.ProcessingInstruction;
import org.w3c.dom.Text;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXNotRecognizedException;
import org.xml.sax.SAXNotSupportedException;

/**
 * This is where the magic lives (all the HTML scanning/filtration logic resides here). This class
 * should not be called directly. All scanning should be done through an
 * {@code AntiSamy.scan()} method invocation.
 *
 * @author Arshan Dabirsiaghi
 */
public class AntiSamyDOMScanner extends AbstractAntiSamyScanner {
  /** Owner document; used here only as the factory for the fragment that receives parsed input. */
  private Document document = new DocumentImpl();

  /** Fragment that the parser fills in {@code scan} and that is then filtered in place. */
  private DocumentFragment dom = document.createDocumentFragment();

  /** Outcome of the most recent successful {@code scan}; {@code null} until one completes. */
  private CleanResults results = null;

  /** Deepest nesting level accepted while walking the tree; deeper input aborts the scan. */
  private static final int maxDepth = 250;

  /**
   * Matches characters in U+0000-U+001F (except tab, line feed and carriage return), the surrogate
   * range U+D800-U+DFFF, and U+FFFE-U+FFFF.
   */
  private static final Pattern invalidXmlCharacters =
      Pattern.compile("[\\u0000-\\u001F\\uD800-\\uDFFF\\uFFFE-\\uFFFF&&[^\\u0009\\u000A\\u000D]]");

  /**
   * Matches conditional-comment directives such as {@code [if IE]>} and {@code <![endif]}.
   *
   * <p>NOTE(review): the previous literal was {@code "?"}, which is not a valid regular expression
   * (dangling quantifier) and made class initialization fail with a PatternSyntaxException. The
   * expression below is a reconstruction of the intended pattern - confirm against the upstream
   * AntiSamy source before release.
   */
  private static final Pattern conditionalDirectives =
      Pattern.compile("<?!?\\[\\s*(?:end)?if[^\\]]*\\]>?");

  /**
   * Pool of parser/matcher pairs shared by all scanner instances. {@code scan} takes an item from
   * the pool (creating one when it is empty) and puts it back after a successful scan.
   */
  private static final Queue<CachedItem> cachedItems = new ConcurrentLinkedQueue<>();

  /**
   * Pairs a configured fragment parser with a reusable matcher for the invalid-XML-character
   * pattern, so that both can be handed to a scan as a single pooled unit.
   */
  static class CachedItem {
    private final Matcher invalidXmlCharMatcher;
    private final DOMFragmentParser parser;

    /**
     * Builds a fresh matcher (bound to an empty input) and a newly configured parser.
     *
     * @throws SAXNotSupportedException propagated from {@code getDomParser()}
     * @throws SAXNotRecognizedException propagated from {@code getDomParser()}
     */
    CachedItem() throws SAXNotSupportedException, SAXNotRecognizedException {
      invalidXmlCharMatcher = invalidXmlCharacters.matcher("");
      parser = getDomParser();
    }

    /** @return the parser owned by this item */
    DOMFragmentParser getDomFragmentParser() {
      return this.parser;
    }
  }

  /**
   * Builds a DOM scanner that sanitizes input according to the supplied policy.
   *
   * @param policy The policy to use; passed straight through to the base scanner.
   */
  public AntiSamyDOMScanner(final Policy policy) {
    super(policy);
  }

  /**
   * Builds a DOM scanner that relies on the base scanner's no-argument setup, i.e. the default
   * AntiSamy policy.
   *
   * @throws PolicyException if the default policy file cannot be parsed or validated, including
   *     validation problems that XML validation alone does not catch.
   */
  public AntiSamyDOMScanner() throws PolicyException {
    // Explicit for symmetry with the policy-taking constructor.
    super();
  }

  /**
   * Sanitizes the supplied HTML according to the configured policy.
   *
   * <p>The input is size-checked, stripped of characters that are invalid in XML, parsed into the
   * scanner's DOM fragment, filtered node by node, and finally serialized back to a string.
   *
   * @param html A String whose contents is to be sanitized per the configured AntiSamy policy.
   * @return A CleanResults object holding the cleaned {@code DocumentFragment} and a String
   *     representation of the cleaned HTML, as well as some scan statistics. Note that ONLY the
   *     cleaned HTML can be considered trustworthy. The absence of errorMessages in the
   *     CleanResults does NOT necessarily indicate the input was safe (i.e., contained no
   *     attacks).
   * @throws ScanException When {@code html} is null, when it is longer than the policy's maximum
   *     input size, or when parsing, filtering or serialization fails.
   */
  @Override
  public CleanResults scan(String html) throws ScanException {

    if (html == null) {
      throw new ScanException(new NullPointerException("Null HTML input"));
    }

    errorMessages.clear();
    int maxInputSize = policy.getMaxInputSize();

    if (maxInputSize < html.length()) {
      addError(ErrorMessageUtil.ERROR_INPUT_SIZE, new Object[] {html.length(), maxInputSize});
      throw new ScanException(errorMessages.get(0));
    }

    // Snapshot the policy switches consulted while the tree is being filtered.
    isNofollowAnchors = policy.isNofollowAnchors();
    isNoopenerAndNoreferrerAnchors = policy.isNoopenerAndNoreferrerAnchors();
    isValidateParamAsEmbed = policy.isValidateParamAsEmbed();

    long startOfScan = System.currentTimeMillis();

    try {

      /*
       * Reuse a pooled parser/matcher pair when one is available. The cast is a no-op when the
       * pool is declared with a type argument and keeps this assignment compiling when it is
       * declared as a raw Queue.
       */
      CachedItem cachedItem = (CachedItem) cachedItems.poll();
      if (cachedItem == null) {
        cachedItem = new CachedItem();
      }

      /*
       * We have to replace any invalid XML characters to prevent NekoHTML
       * from breaking when it gets passed encodings like %21.
       */
      html = stripNonValidXMLCharacters(html, cachedItem.invalidXmlCharMatcher);

      /*
       * First thing we do is call the HTML cleaner ("NekoHTML") on it
       * with the appropriate options. We choose not to omit tags due to
       * the fallibility of our own listing in the ever changing world of
       * W3C.
       */
      DOMFragmentParser parser = cachedItem.getDomFragmentParser();

      try {
        parser.parse(new InputSource(new StringReader(html)), dom);
      } catch (Exception e) {
        // Any parser failure, checked or not, is reported to the caller as a scan failure.
        throw new ScanException(e);
      }

      processChildren(dom, 0);

      /*
       * Serialize the output and then return the resulting DOM object and
       * its string representation.
       */
      StringWriter out = new StringWriter();

      @SuppressWarnings("deprecation")
      org.apache.xml.serialize.OutputFormat format = getOutputFormat();

      //noinspection deprecation
      org.apache.xml.serialize.HTMLSerializer serializer = getHTMLSerializer(out, format);
      serializer.serialize(dom);

      /*
       * Post-process the serialized text with trim(), which receives both the (already stripped)
       * input and the serializer output.
       */
      final String cleanedHtml = trim(html, out.getBuffer().toString());

      // The cleaned text is already computed; the Callable simply hands it out on request.
      Callable<String> cleanHtml =
          new Callable<String>() {
            @Override
            public String call() {
              return cleanedHtml;
            }
          };

      /*
       * Return the DOM object as well as string HTML.
       */
      results = new CleanResults(startOfScan, cleanHtml, dom, errorMessages);

      /*
       * Only an item that completed a scan without error goes back to the pool; on any failure
       * above it is dropped (presumably so a parser in an unknown state is never reused).
       */
      cachedItems.add(cachedItem);
      return results;

    } catch (SAXException | IOException e) {
      throw new ScanException(e);
    }
  }

  /**
   * Creates a new fragment parser and applies the settings this scanner relies on.
   *
   * @return a newly created, configured parser
   * @throws SAXNotRecognizedException if the parser does not recognize one of the identifiers
   * @throws SAXNotSupportedException if the parser rejects one of the requested values
   */
  static DOMFragmentParser getDomParser()
      throws SAXNotRecognizedException, SAXNotSupportedException {
    final DOMFragmentParser fragmentParser = new DOMFragmentParser();

    // Element-name handling is set to "lower" (presumably lower-cases element names - see the
    // parser's settings documentation).
    fragmentParser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");

    // Scanner/parse switches, named after their identifiers: keep CDATA delimiters inside style
    // content, enable CDATA sections, and turn off parsing of noscript content.
    fragmentParser.setFeature(
        "http://cyberneko.org/html/features/scanner/style/strip-cdata-delims", false);
    fragmentParser.setFeature("http://cyberneko.org/html/features/scanner/cdata-sections", true);
    fragmentParser.setFeature("http://cyberneko.org/html/features/parse-noscript-content", false);

    return fragmentParser;
  }

  /**
   * The workhorse of the scanner. Recursively scans document elements according to the policy. This
   * should be called implicitly through the AntiSamy.scan() method.
   *
   * <p>Comments, CDATA sections and processing instructions are handed to their dedicated handlers;
   * every other non-element node is left untouched. Elements are dispatched on the action of the
   * policy rule that matches their lower-cased name.
   *
   * @param node The node to validate.
   * @param currentStackDepth Nesting depth of the caller; it is incremented on entry and checked
   *     against {@code maxDepth}.
   * @throws ScanException When the nesting depth exceeds {@code maxDepth}, or when one of the
   *     action handlers fails.
   */
  private void recursiveValidateTag(final Node node, int currentStackDepth) throws ScanException {

    currentStackDepth++;

    if (currentStackDepth > maxDepth) {
      throw new ScanException("Too many nested tags");
    }

    if (node instanceof Comment) {
      processCommentNode(node);
      return;
    }

    boolean isElement = node instanceof Element;
    NodeList eleChildNodes = node.getChildNodes();
    // A childless element that removeDisallowedEmpty() took out needs no further processing.
    if (isElement && eleChildNodes.getLength() == 0 && removeDisallowedEmpty(node)) {
      return;
    }

    if (node instanceof Text && Node.CDATA_SECTION_NODE == node.getNodeType()) {
      stripCData(node);
      return;
    }

    if (node instanceof ProcessingInstruction) {
      removePI(node);
    }

    // Everything below applies to elements only.
    if (!isElement) {
      return;
    }

    final Element ele = (Element) node;
    final Node parentNode = ele.getParentNode();

    final String tagName = ele.getNodeName();
    // Locale.ROOT keeps the lookup key independent of the JVM default locale (under a Turkish
    // locale, for example, "I" would otherwise lower-case to a dotless i).
    final String tagNameLowerCase = tagName.toLowerCase(Locale.ROOT);
    Tag tagRule = policy.getTagByLowercaseName(tagNameLowerCase);

    /*
     * If this is a <param> with no rule of its own, isValidateParamAsEmbed is
     * on, and the <embed> rule's action is "validate", use a custom rule to
     * get the tag through to the validator.
     */
    Tag embedTag = policy.getEmbedTag();
    boolean masqueradingParam = isMasqueradingParam(tagRule, embedTag, tagNameLowerCase);
    if (masqueradingParam) {
      tagRule = Constants.BASIC_PARAM_TAG_RULE;
    }

    if ((tagRule == null && policy.isEncodeUnknownTag())
        || (tagRule != null && tagRule.isAction(Policy.ACTION_ENCODE))) {
      encodeTag(currentStackDepth, ele, tagName, eleChildNodes);
    } else if (tagRule == null || tagRule.isAction(Policy.ACTION_FILTER)) {
      actionFilter(currentStackDepth, ele, tagName, tagRule, eleChildNodes);
    } else if (tagRule.isAction(Policy.ACTION_VALIDATE)) {
      actionValidate(
          currentStackDepth,
          ele,
          parentNode,
          tagName,
          tagNameLowerCase,
          tagRule,
          masqueradingParam,
          embedTag,
          eleChildNodes);
    } else if (tagRule.isAction(Policy.ACTION_TRUNCATE)) {
      actionTruncate(ele, tagName, eleChildNodes);
    } else {
      /*
       * If we reached this that means that the tag's action is "remove",
       * which means to remove the tag (including its contents).
       */
      addError(
          ErrorMessageUtil.ERROR_TAG_DISALLOWED,
          new Object[] {HTMLEntityEncoder.htmlEntityEncode(tagName)});
      removeNode(ele);
    }
  }

  /**
   * Tells whether a tag should be treated as a {@code param} standing in for {@code embed}.
   *
   * @param tagRule the policy rule found for the tag, or {@code null} when there is none
   * @param embedTag the policy's embed rule, possibly {@code null}
   * @param tagNameLowerCase the lower-cased tag name
   * @return {@code true} only when the tag has no rule, isValidateParamAsEmbed is set, the tag is
   *     named "param", and an embed rule exists whose action is "validate"
   */
  private boolean isMasqueradingParam(Tag tagRule, Tag embedTag, String tagNameLowerCase) {
    final boolean unruledParam =
        tagRule == null && isValidateParamAsEmbed && "param".equals(tagNameLowerCase);
    return unruledParam && embedTag != null && embedTag.isAction(Policy.ACTION_VALIDATE);
  }

  /**
   * Applies the "encode" treatment to an element: records the error, filters the element's
   * children, then replaces the element itself with encoded text while keeping the children.
   *
   * @param currentStackDepth current nesting depth, passed on to the child processing
   * @param ele the element being encoded
   * @param tagName the element's name, used in the error message
   * @param eleChildNodes the element's child nodes
   * @throws ScanException if processing the children fails
   */
  private void encodeTag(int currentStackDepth, Element ele, String tagName, NodeList eleChildNodes)
      throws ScanException {
    final Object[] errorArgs = {HTMLEntityEncoder.htmlEntityEncode(tagName)};
    addError(ErrorMessageUtil.ERROR_TAG_ENCODED, errorArgs);

    // Children are filtered first, while they are still attached to the element.
    processChildren(eleChildNodes, currentStackDepth);

    /*
     * Turn the tag into HTML-encoded text and promote its children. The tag
     * stays in the fragment as one or two text nodes placed before and after
     * the children, mirroring how it used to wrap them.
     */
    encodeAndPromoteChildren(ele);
  }

  /**
   * Applies the "filter" treatment to an element: records why the tag is being dropped, filters
   * its children, then promotes them in place of the element.
   *
   * @param currentStackDepth current nesting depth, passed on to the child processing
   * @param ele the element being filtered
   * @param tagName the element's name, used in the error message
   * @param tag the policy rule for the element, or {@code null} when the policy has none
   * @param eleChildNodes the element's child nodes
   * @throws ScanException if processing the children fails
   */
  private void actionFilter(
      int currentStackDepth, Element ele, String tagName, Tag tag, NodeList eleChildNodes)
      throws ScanException {
    final Object[] errorArgs = {HTMLEntityEncoder.htmlEntityEncode(tagName)};

    // A missing rule and an explicit "filter" rule are reported with different messages.
    addError(
        tag == null
            ? ErrorMessageUtil.ERROR_TAG_NOT_IN_POLICY
            : ErrorMessageUtil.ERROR_TAG_FILTERED,
        errorArgs);

    processChildren(eleChildNodes, currentStackDepth);
    promoteChildren(ele);
  }

  private void actionValidate(
      int currentStackDepth,
      Element ele,
      Node parentNode,
      String tagName,
      String tagNameLowerCase,
      Tag tag,
      boolean masqueradingParam,
      Tag embedTag,
      NodeList eleChildNodes)
      throws ScanException {
    /*
     * If doing <param> as <embed>, now is the time to convert it.
     */
    String nameValue = null;
    if (masqueradingParam) {
      nameValue = ele.getAttribute("name");
      if (nameValue != null && !"".equals(nameValue)) {
        String valueValue = ele.getAttribute("value");
        ele.setAttribute(nameValue, valueValue);
        ele.removeAttribute("name");
        ele.removeAttribute("value");
        tag = embedTag;
      }
    }

    /*
     * Check to see if it's a