// org.owasp.html.HtmlPolicyBuilder (Maven / Gradle / Ivy)
// Copyright (c) 2011, Mike Samuel
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// Neither the name of the OWASP nor the names of its contributors may
// be used to endorse or promote products derived from this software
// without specific prior written permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
package org.owasp.html;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import javax.annotation.Nullable;
import javax.annotation.concurrent.NotThreadSafe;
import com.google.common.base.Function;
import com.google.common.base.Predicate;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
/**
* Conveniences for configuring policies for the {@link HtmlSanitizer}.
*
* Usage
*
* To create a policy, first construct an instance of this class; then call
* {@code allow…} methods to turn on tags, attributes, and other
* processing modes; and finally call {@code build(renderer)} or
* {@code toFactory()}.
*
*
* <pre>
* // Define the policy.
* Function&lt;HtmlStreamEventReceiver, HtmlSanitizer.Policy&gt; policy
*     = new HtmlPolicyBuilder()
*         .allowElements("a", "p")
*         .allowAttributes("href").onElements("a")
*         .toFactory();
*
* // Sanitize your output.
* HtmlSanitizer.sanitize(myHtml, policy.apply(myHtmlStreamRenderer));
* </pre>
*
*
* Embedded Content
*
* Embedded URLs are filtered by
* {@link HtmlPolicyBuilder#allowUrlProtocols protocol}.
* There is a {@link HtmlPolicyBuilder#allowStandardUrlProtocols canned policy}
* so you can easily white-list widely used policies that don't violate the
* current page's origin. See "Customization" below for ways to do further
* filtering. If you allow links it might be worthwhile to
* {@link HtmlPolicyBuilder#requireRelNofollowOnLinks() require}
* {@code rel=nofollow}.
*
*
* This class simply throws out all embedded JS.
* Use a custom element or attribute policy to allow through
* signed or otherwise known-safe code.
* Check out the Caja project if you need a way to contain third-party JS.
*
*
* This class does not attempt to faithfully parse and sanitize CSS.
* It does provide {@link HtmlPolicyBuilder#allowStyling() one} styling option
* that allows through a few CSS properties that allow textual styling, but that
* disallow image loading, history stealing, layout breaking, code execution,
* etc.
*
*
* Customization
*
* You can easily do custom processing on tags and attributes by supplying your
* own {@link ElementPolicy element policy} or
* {@link AttributePolicy attribute policy} when calling
* {@code allow…}.
* E.g. to convert headers into {@code <div>}s, you could use an element policy
*
*
* <pre>
* new HtmlPolicyBuilder()
*     .allowElements(
*         new ElementPolicy() {
*           public String apply(String elementName, List&lt;String&gt; attributes) {
*             attributes.add("class");
*             attributes.add("header-" + elementName);
*             return "div";
*           }
*         },
*         "h1", "h2", "h3", "h4", "h5", "h6")
*     .build(outputChannel)
* </pre>
*
*
* Rules of Thumb
*
* Throughout this class, several rules hold:
*
* - Everything is denied by default. There are
* {@code disallow…} methods, but those reverse
* allows instead of rolling back overly permissive defaults.
* - The order of allows and disallows does not matter.
* Disallows trump allows whether they occur before or after them.
* The only method that needs to be called in a particular place is
* {@link HtmlPolicyBuilder#build}.
* Allows or disallows after {@code build} is called have no
* effect on the already built policy.
*
* - Element and attribute policies are applied in the following order:
* element specific attribute policy, global attribute policy, element
* policy.
* Element policies come last so they can observe all the post-processed
* attributes, and so they can add attributes that are exempt from
* attribute policies.
* Element specific policies go first, so they can normalize content to
* a form that might be acceptable to a more simplistic global policy.
*
*
* Thread safety and efficiency
*
* This class is not thread-safe. The resulting policy will not violate its
* security guarantees as a result of race conditions, but is not thread safe
* because it maintains state to track whether text inside disallowed elements
* should be suppressed.
*
* The resulting policy can be reused, but if you use the
* {@link HtmlPolicyBuilder#toFactory()} method instead of {@link #build}, then
* binding policies to output channels is cheap so there's no need.
*
*
* @author Mike Samuel ([email protected])
*/
@TCB
@NotThreadSafe
public class HtmlPolicyBuilder {
/**
* The default set of elements that are removed if they have no attributes.
* Since {@code <a>} is in this set, by default, a policy will remove
* {@code <a href="javascript:alert(1337)">} because its URL is not allowed
* and it has no other attributes that would warrant it appearing in the
* output.
*/
public static final ImmutableSet DEFAULT_SKIP_IF_EMPTY
= ImmutableSet.of("a", "font", "img", "input", "span");
private final Map elPolicies = Maps.newLinkedHashMap();
private final Map> attrPolicies
= Maps.newLinkedHashMap();
private final Map globalAttrPolicies
= Maps.newLinkedHashMap();
private final Set allowedProtocols = Sets.newLinkedHashSet();
private final Set skipIfEmpty = Sets.newLinkedHashSet(
DEFAULT_SKIP_IF_EMPTY);
private final Map textContainers = Maps.newLinkedHashMap();
private HtmlStreamEventProcessor postprocessor =
HtmlStreamEventProcessor.Processors.IDENTITY;
private HtmlStreamEventProcessor preprocessor =
HtmlStreamEventProcessor.Processors.IDENTITY;
private CssSchema stylingPolicySchema = null;
private AttributePolicy styleUrlPolicy =
AttributePolicy.REJECT_ALL_ATTRIBUTE_POLICY;
private boolean requireRelNofollowOnLinks;
/**
 * Allows the named elements by registering each of them with
 * {@link ElementPolicy#IDENTITY_ELEMENT_POLICY}.
 *
 * @param elementNames names of the elements to allow.
 * @return this builder, so that calls can be chained.
 */
public HtmlPolicyBuilder allowElements(String... elementNames) {
  ElementPolicy identityPolicy = ElementPolicy.IDENTITY_ELEMENT_POLICY;
  return allowElements(identityPolicy, elementNames);
}
/**
 * Disallows the named elements by registering each of them with
 * {@link ElementPolicy#REJECT_ALL_ELEMENT_POLICY}. Since elements are
 * disallowed by default, this is only needed to carve an exception out of
 * an earlier allow.
 *
 * @param elementNames names of the elements to disallow.
 * @return this builder, so that calls can be chained.
 */
public HtmlPolicyBuilder disallowElements(String... elementNames) {
  ElementPolicy rejectPolicy = ElementPolicy.REJECT_ALL_ELEMENT_POLICY;
  return allowElements(rejectPolicy, elementNames);
}
/**
 * Allows the given elements, subject to the given policy.
 *
 * <p>Each name is canonicalized, and the supplied policy is joined with any
 * policy already registered under that name. An element that may hold plain
 * textual content is also recorded as a text container, unless it already
 * has a text-container entry.
 *
 * @param policy May remove or add attributes, change the element name, or
 *     deny the element.
 * @param elementNames names of the elements the policy applies to.
 * @return this builder, so that calls can be chained.
 */
public HtmlPolicyBuilder allowElements(
    ElementPolicy policy, String... elementNames) {
  invalidateCompiledState();
  for (String rawName : elementNames) {
    String canonical = HtmlLexer.canonicalName(rawName);
    // The joined policy is always stored, even when it is the always-reject
    // policy: a reject has to stick around so that it keeps overriding any
    // later allow for the same element name.
    elPolicies.put(
        canonical,
        ElementPolicy.Util.join(elPolicies.get(canonical), policy));
    if (textContainers.containsKey(canonical)) {
      // An existing text-container entry is left untouched.
      continue;
    }
    if (TagBalancingHtmlStreamEventReceiver
        .allowsPlainTextualContent(canonical)) {
      textContainers.put(canonical, true);
    }
  }
  return this;
}
/**
 * A canned policy that allows a fixed list of common inline formatting
 * elements, such as {@code b}, {@code i}, {@code em}, and {@code span}.
 *
 * @return this builder, so that calls can be chained.
 */
public HtmlPolicyBuilder allowCommonInlineFormattingElements() {
  // NOTE(review): "o" is not a standard HTML element name; kept as-is to
  // preserve behavior, but worth confirming that it is intentional.
  String[] inlineFormattingNames = {
    "b", "i", "font", "s", "u", "o", "sup", "sub", "ins", "del", "strong",
    "strike", "tt", "code", "big", "small", "br", "span", "em",
  };
  return allowElements(inlineFormattingNames);
}
/**
 * A canned policy that allows a fixed list of common block elements:
 * paragraphs, divs, the six heading levels, lists and their items, and
 * block quotes.
 *
 * @return this builder, so that calls can be chained.
 */
public HtmlPolicyBuilder allowCommonBlockElements() {
  String[] blockNames = {
    "p", "div", "h1", "h2", "h3", "h4", "h5", "h6", "ul", "ol", "li",
    "blockquote",
  };
  return allowElements(blockNames);
}
/**
* Allows text content in the named elements.
* By default, text content is allowed in any
* {@link #allowElements allowed elements} that can contain character data per
* the HTML5 spec, but text content is not allowed by default in elements that
* contain content of other kinds (like JavaScript in {@code