// org.owasp.validator.html.scan.AntiSamySAXScanner Maven / Gradle / Ivy
//
// Go to download
// Show more of this group Show more artifacts with this name
// Show all versions of antisamy Show documentation
// The newest version!
/*
 * Copyright (c) 2007-2023, Arshan Dabirsiaghi, Jason Li
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted
 * provided that the following conditions are met:
 *
 * Redistributions of source code must retain the above copyright notice, this list of conditions
 * and the following disclaimer. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the documentation and/or other
 * materials provided with the distribution. Neither the name of OWASP nor the names of its
 * contributors may be used to endorse or promote products derived from this software without
 * specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
 * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

package org.owasp.validator.html.scan;

import java.io.Reader;
import java.io.StringReader;
import java.io.StringWriter;
import java.io.Writer;
import java.util.Queue;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentLinkedQueue;
import javax.xml.XMLConstants;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.sax.SAXResult;
import javax.xml.transform.sax.SAXSource;
import org.htmlunit.cyberneko.parsers.SAXParser;
import org.owasp.validator.html.CleanResults;
import org.owasp.validator.html.Policy;
import org.owasp.validator.html.ScanException;
import org.owasp.validator.html.util.ErrorMessageUtil;
import org.xml.sax.InputSource;
import org.xml.sax.SAXNotRecognizedException;
import org.xml.sax.SAXNotSupportedException;

/**
 * This class allows you to use a SAX scanner to scan HTML rather than a DOM scanner. Its primary
 * purpose is to support incremental scanning of large blocks of HTML using the {@code scan(Reader
 * reader, Writer writer)} API so the HTML input and output don't all have to be held in memory at
 * the same time. It should not be called directly. All scanning should be done through an {@code
 * AntiSamy.scan()} method invocation. The HTML sanitization logic built into {@code
 * AntiSamyDOMScanner} is leveraged by this class as well.
 *
 * <p>Each scan uses a {@link CachedItem} (transformer + parser + filter). Items are taken from,
 * and after a successful scan returned to, a static queue shared by all instances of this class.
 *
 * @author Arshan Dabirsiaghi
 */
public class AntiSamySAXScanner extends AbstractAntiSamyScanner {

  /** Pool of reusable transformer/parser/filter bundles, shared by every scanner instance. */
  private static final Queue<CachedItem> cachedItems = new ConcurrentLinkedQueue<>();

  /** Factory used to create every {@link Transformer}; configured once in the static block. */
  private static final TransformerFactory sTransformerFactory;

  static {
    // Per issue #103, an IllegalArgumentException could be thrown below if the SAX parser does
    // not support these JAXP 1.5 features. This did actually occur in certain environments where we
    // let the TransformerFactory create whatever instance it decided to create. For example, if
    // xalan:2.7.2 was on the classpath, which doesn't support these JAXP features. However, this
    // should never happen anymore because, by default, we now force the use of the
    // JDK provided Xalan SAX parser, which DOES support these features. However, if someone REALLY
    // wants to use a different implementation, they can set the new property
    // "antisamy.transformerfactory.impl" to whatever they prefer to use, but that class must
    // implement the two attributes we set.

    String transformerFactoryImpl =
        System.getProperty(
            "antisamy.transformerfactory.impl",
            "com.sun.org.apache.xalan.internal.xsltc.trax.TransformerFactoryImpl");

    sTransformerFactory = TransformerFactory.newInstance(transformerFactoryImpl, null);

    // Disable external entities, etc.
    sTransformerFactory.setAttribute(XMLConstants.ACCESS_EXTERNAL_DTD, "");
    sTransformerFactory.setAttribute(XMLConstants.ACCESS_EXTERNAL_STYLESHEET, "");
  }

  /**
   * Bundles the three objects a scan needs so they can be pooled and reused together. The
   * constructor registers the filter on the parser through the cyberneko "filters" property.
   */
  static class CachedItem {
    private final Transformer transformer;
    private final SAXParser saxParser;
    private final MagicSAXFilter magicSAXFilter;

    /**
     * Stores the given collaborators and installs {@code magicSAXFilter} as the parser's only
     * filter.
     *
     * @param transformer The transformer that drives the parse and feeds the serializer.
     * @param saxParser The parser on which the filter is installed.
     * @param magicSAXFilter The filter to install on {@code saxParser}.
     * @throws RuntimeException If the parser rejects the filters property.
     */
    CachedItem(Transformer transformer, SAXParser saxParser, MagicSAXFilter magicSAXFilter) {
      this.transformer = transformer;
      this.saxParser = saxParser;
      this.magicSAXFilter = magicSAXFilter;
      MagicSAXFilter[] filters = {magicSAXFilter};
      try {
        saxParser.setProperty("http://cyberneko.org/html/properties/filters", filters);
      } catch (SAXNotRecognizedException | SAXNotSupportedException e) {
        throw new RuntimeException(e);
      }
    }
  }

  /**
   * Create an instance of this class configured to use the specified policy.
   *
   * @param policy The policy to use.
   */
  public AntiSamySAXScanner(Policy policy) {
    super(policy);
  }

  /**
   * The SAXScanner can't remember and return the complete scan results in all cases because the
   * {@code scan(Reader reader, Writer writer)} version of the scan() API incrementally scans the
   * input to generate incremental output. So to be safe, this class always returns null for
   * getResults(). As such, when using this class, you can only rely on the CleanResults returned
   * by the scan() method you invoked, or the output from the writer passed into {@code
   * scan(Reader reader, Writer writer)}.
   *
   * @return always returns null
   */
  @Override
  public CleanResults getResults() {
    return null;
  }

  /**
   * Scan the provided HTML using the AntiSamy policy specified when this Scanner instance was
   * constructed, and return a CleanResult object that contains the sanitized output, along with
   * possibly some error messages and scan time statistics. This version of the scan() API is the
   * one that is typically used.
   *
   * @param html A String whose contents is to be sanitized per the configured AntiSamy policy.
   * @return A CleanResults object with (possibly) an {@code XMLDocumentFragment} object and a
   *     String representation of the cleaned HTML, as well as some scan statistics. Note that ONLY
   *     the cleaned HTML can be considered trustworthy. The absence of errorMessages in the
   *     CleanResults does NOT necessarily indicate the input was safe (i.e., contained no
   *     attacks).
   * @throws ScanException When there is a problem encountered while scanning the HTML input.
   */
  @Override
  public CleanResults scan(String html) throws ScanException {
    return scan(html, this.policy);
  }

  /**
   * This is where the magic lives. Scan the provided HTML and return a CleanResult object that
   * contains the sanitized output, along with possibly some error messages and scan time
   * statistics.
   *
   * <p>The input is rejected up front if it is null or longer than the policy's maximum input
   * size. Otherwise it is streamed through {@link #scan(Reader, Writer)} into an in-memory buffer,
   * and the returned CleanResults produces the cleaned HTML lazily by trimming that buffer.
   *
   * @param html A String whose contents is to be sanitized per the configured AntiSamy policy.
   * @param policy Intended to override the policy specified when this class was instantiated.
   *     NOTE(review): the current implementation does not consult this argument; both the input
   *     size check and the scan itself read the policy this scanner was constructed with.
   * @return A CleanResults object with (possibly) an {@code XMLDocumentFragment} object and a
   *     String representation of the cleaned HTML, as well as some scan statistics. Note that ONLY
   *     the cleaned HTML can be considered trustworthy. The absence of errorMessages in the
   *     CleanResults does NOT necessarily indicate the input was safe (i.e., contained no
   *     attacks).
   * @throws ScanException When the input is null, exceeds the maximum input size, or there is a
   *     problem encountered while scanning the HTML input.
   */
  public CleanResults scan(String html, Policy policy) throws ScanException {
    if (html == null) {
      throw new ScanException(new NullPointerException("Null html input"));
    }

    int maxInputSize = this.policy.getMaxInputSize();

    if (html.length() > maxInputSize) {
      addError(ErrorMessageUtil.ERROR_INPUT_SIZE, new Object[] {html.length(), maxInputSize});
      throw new ScanException(errorMessages.get(0));
    }

    final StringWriter out = new StringWriter();
    StringReader reader = new StringReader(html);

    CleanResults results = scan(reader, out);

    // Captured by the anonymous class below; kept as an explicit final local.
    final String tainted = html;
    Callable<String> cleanCallable =
        new Callable<String>() {
          @Override
          public String call() throws Exception {
            return trim(tainted, out.toString());
          }
        };
    return new CleanResults(
        results.getStartOfScan(), cleanCallable, null, results.getErrorMessages());
  }

  /**
   * Using a SAX parser, this supports Streams for input and output. The use case is where the input
   * is large and the caller does not need or want the entire string in memory all at one time.
   *
   * <p>A {@link CachedItem} is taken from the shared pool (or created if the pool is empty) and is
   * put back only after the transform completes without throwing.
   *
   * @param reader A Reader which can feed the SAXParser a little input at a time
   * @param writer A Writer that can take a little output at a time
   * @return CleanResults where the cleanHtml is null. If a caller wants the HTML as a string, it
   *     must capture the contents of the writer (i.e., use a StringWriter).
   * @throws ScanException When there is a problem encountered while scanning the HTML input. Any
   *     exception raised during the scan is wrapped in a ScanException.
   */
  public CleanResults scan(Reader reader, Writer writer) throws ScanException {
    try {

      CachedItem cachedItem = cachedItems.poll();
      if (cachedItem == null) {
        cachedItem = new CachedItem(getNewTransformer(), getParser(), new MagicSAXFilter(messages));
      }

      SAXParser parser = cachedItem.saxParser;
      // A pooled filter may have been used for an earlier scan; reset it with this policy first.
      cachedItem.magicSAXFilter.reset(policy);

      long startOfScan = System.currentTimeMillis();

      SAXSource source = new SAXSource(parser, new InputSource(reader));

      Transformer transformer = cachedItem.transformer;
      boolean formatOutput = policy.isFormatOutput();
      boolean omitXml = policy.isOmitXmlDeclaration();

      transformer.setOutputProperty(OutputKeys.INDENT, formatOutput ? "yes" : "no");
      transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, omitXml ? "yes" : "no");
      transformer.setOutputProperty(OutputKeys.METHOD, "html");

      //noinspection deprecation
      final org.apache.xml.serialize.OutputFormat format = getOutputFormat();
      //noinspection deprecation
      final org.apache.xml.serialize.HTMLSerializer serializer = getHTMLSerializer(writer, format);

      transformer.transform(source, new SAXResult(serializer));

      // Replace this scanner's error list with the messages collected by the filter.
      errorMessages.clear();
      errorMessages.addAll(cachedItem.magicSAXFilter.getErrorMessages());

      // Only reached when the transform succeeded; an item whose scan threw is not re-pooled.
      cachedItems.add(cachedItem);
      return new CleanResults(startOfScan, (String) null, null, errorMessages);

    } catch (Exception e) {
      throw new ScanException(e);
    }
  }

  /**
   * Return a new Transformer instance. This is wrapped in a synchronized method because there is no
   * guarantee that the TransformerFactory is thread-safe.
   *
   * @return a new Transformer instance.
   * @throws RuntimeException If the factory cannot create a Transformer.
   */
  private static synchronized Transformer getNewTransformer() {
    try {
      return sTransformerFactory.newTransformer();
    } catch (TransformerConfigurationException e) {
      throw new RuntimeException(e);
    }
  }

  /**
   * Get a properly configured SAXParser instance: namespaces off, document-fragment balancing on,
   * CDATA section scanning on, noscript content parsing off, and element names lower-cased.
   *
   * @return A newly created and configured SAXParser instance.
   * @throws RuntimeException If the parser rejects one of the features or properties.
   */
  private static SAXParser getParser() {
    try {
      SAXParser parser = new SAXParser();
      parser.setFeature("http://xml.org/sax/features/namespaces", false);
      parser.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment", true);
      parser.setFeature("http://cyberneko.org/html/features/scanner/cdata-sections", true);
      parser.setFeature("http://cyberneko.org/html/features/parse-noscript-content", false);

      parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
      return parser;
    } catch (SAXNotRecognizedException | SAXNotSupportedException e) {
      throw new RuntimeException(e);
    }
  }
}