// org.owasp.html.HtmlSanitizer (Maven / Gradle / Ivy)
// Copyright (c) 2011, Mike Samuel
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// Neither the name of the OWASP nor the names of its contributors may
// be used to endorse or promote products derived from this software
// without specific prior written permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
package org.owasp.html;
import java.util.LinkedList;
import java.util.List;
import javax.annotation.Nullable;
import com.google.common.collect.Lists;
/**
* Consumes an HTML stream, and dispatches events to a policy object which
* decides which elements and attributes to allow.
*/
public final class HtmlSanitizer {
/**
 * Receives events based on the HTML stream, and applies a policy to decide
 * what HTML constructs to allow.
 * Typically, implementations use an {@link HtmlStreamRenderer} to produce
 * the sanitized output.
 *
 * <p>
 * Implementations of this class are in the TCB.
 */
@TCB
public interface Policy extends HtmlStreamEventReceiver {
  /**
   * Called when an HTML tag like {@code <a href="...">} is seen in the input.
   *
   * @param elementName a normalized (lower-case for non-namespaced names)
   *     element name.
   * @param attrs a list of alternating attribute name and value pairs.
   *     For efficiency, this list may be mutated by the implementation
   *     during this method call, but ownership reverts to the caller on
   *     method exit.
   *     The values are raw -- HTML entities have been decoded.
   *     Specifically, implementations are allowed to use a list iterator
   *     and remove all disallowed attributes, add necessary attributes, and
   *     then pass the list to an {@link HtmlStreamRenderer}.
   */
  void openTag(String elementName, List<String> attrs);

  /**
   * Called when an HTML tag like {@code </a>} is seen in the input.
   *
   * @param elementName a normalized (lower-case for non-namespaced names)
   *     element name.
   */
  void closeTag(String elementName);

  /**
   * Called when textual content is seen.
   *
   * @param textChunk raw content -- HTML entities have been decoded.
   */
  void text(String textChunk);
}
/**
 * Applies the given policy to a snippet of HTML.
 *
 * <p>
 * This method is not in the TCB.
 *
 * <p>
 * Nothing is returned: policies are assumed to render whatever they accept
 * and to do nothing for whatever they reject.
 * Use {@link HtmlStreamRenderer} to render content to an output buffer.
 *
 * <p>
 * This overload simply delegates to
 * {@link #sanitize(String, Policy, HtmlStreamEventProcessor)}, supplying
 * {@code HtmlStreamEventProcessor.Processors.IDENTITY} as the preprocessor.
 *
 * @param html A snippet of HTML to sanitize. {@code null} is treated as the
 *     empty string and will not result in a {@code NullPointerException}.
 * @param policy The Policy that will receive events based on the tokens in
 *     HTML. Typically, this policy ends up routing the events to an
 *     {@link HtmlStreamRenderer} after filtering.
 *     {@link HtmlPolicyBuilder} provides an easy way to create policies.
 */
public static void sanitize(
    @Nullable String html, final Policy policy) {
  HtmlStreamEventProcessor identityProcessor =
      HtmlStreamEventProcessor.Processors.IDENTITY;
  sanitize(html, policy, identityProcessor);
}
/**
 * Sanitizes the given HTML by applying the given policy to it.
 *
 * <p>
 * This method is not in the TCB.
 *
 * <p>
 * This method has no return value since policies are assumed to render things
 * they accept and do nothing on things they reject.
 * Use {@link HtmlStreamRenderer} to render content to an output buffer.
 *
 * <p>
 * The input is tokenized with an {@link HtmlLexer}. Text tokens are decoded
 * and forwarded as text events, tag tokens become open/close tag events, and
 * every other token type (comments, processing instructions, etc.) is
 * dropped. An attribute that has no value is reported with its own name
 * repeated as the value, so the attribute list always alternates
 * name, value.
 *
 * @param html A snippet of HTML to sanitize. {@code null} is treated as the
 *     empty string and will not result in a {@code NullPointerException}.
 * @param policy The Policy that will receive events based on the tokens in
 *     HTML. Typically, this policy ends up routing the events to an
 *     {@link HtmlStreamRenderer} after filtering.
 *     {@link HtmlPolicyBuilder} provides an easy way to create policies.
 * @param preprocessor A processor that may wrap the policy to reinterpret
 *     parse events.
 *     Since the policy encapsulates its output buffer, this is not in the
 *     policy's TCB.
 */
public static void sanitize(
    @Nullable String html, final Policy policy,
    HtmlStreamEventProcessor preprocessor) {
  String htmlContent = html != null ? html : "";

  HtmlStreamEventReceiver receiver = initializePolicy(policy, preprocessor);

  receiver.openDocument();

  HtmlLexer lexer = new HtmlLexer(htmlContent);
  // Use a linked list so that policies can use Iterator.remove() in an O(1)
  // way.  The same list instance is reused for every open tag.
  LinkedList<String> attrs = Lists.newLinkedList();
  while (lexer.hasNext()) {
    HtmlToken token = lexer.next();
    switch (token.type) {
      case TEXT:
        receiver.text(
            Encoding.decodeHtml(
                htmlContent.substring(token.start, token.end), false));
        break;
      case UNESCAPED:
        // Not entity-decoded; only banned code units are stripped.
        receiver.text(Encoding.stripBannedCodeunits(
            htmlContent.substring(token.start, token.end)));
        break;
      case TAGBEGIN:
        if (htmlContent.charAt(token.start + 1) == '/') {  // A close tag.
          // Skip the leading "</" to get at the element name.
          receiver.closeTag(HtmlLexer.canonicalElementName(
              htmlContent.substring(token.start + 2, token.end)));
          skipToTagEnd(lexer);
        } else {
          readTagAttributes(lexer, htmlContent, attrs);
          // Skip the leading "<" to get at the element name.
          receiver.openTag(
              HtmlLexer.canonicalElementName(
                  htmlContent.substring(token.start + 1, token.end)),
              attrs);
        }
        break;
      default:
        // Ignore comments, XML prologues, processing instructions, and other
        // stuff that shouldn't show up in the output.
        break;
    }
  }

  receiver.closeDocument();
}

/**
 * Consumes tokens from the lexer up to and including the next
 * {@code TAGEND} token, or until the lexer is exhausted.
 */
private static void skipToTagEnd(HtmlLexer lexer) {
  while (lexer.hasNext()
         && lexer.next().type != HtmlTokenType.TAGEND) {
    // skip tokens until we see a ">"
  }
}

/**
 * Reads the body of an open tag from the lexer, replacing the contents of
 * {@code attrs} with alternating attribute names and decoded values.
 * Consumes tokens up to and including the closing {@code TAGEND}, or until
 * the lexer is exhausted.
 *
 * @param lexer the token source, positioned just after a TAGBEGIN token.
 * @param htmlContent the text the lexer's token offsets refer to.
 * @param attrs cleared and then filled with name, value pairs.
 */
private static void readTagAttributes(
    HtmlLexer lexer, String htmlContent, LinkedList<String> attrs) {
  attrs.clear();

  // False while the most recently added entry is a name still waiting for
  // its value.
  boolean attrsReadyForName = true;
  tagBody:
  while (lexer.hasNext()) {
    HtmlToken tagBodyToken = lexer.next();
    switch (tagBodyToken.type) {
      case ATTRNAME:
        if (!attrsReadyForName) {
          // Last attribute added was valueless, so reuse its name as its
          // value to keep the list in name, value order.
          attrs.add(attrs.getLast());
        } else {
          attrsReadyForName = false;
        }
        attrs.add(HtmlLexer.canonicalAttributeName(
            htmlContent.substring(tagBodyToken.start, tagBodyToken.end)));
        break;
      case ATTRVALUE:
        String attributeContentRaw = stripQuotes(
            htmlContent.substring(tagBodyToken.start, tagBodyToken.end));
        attrs.add(Encoding.decodeHtml(attributeContentRaw, true));
        attrsReadyForName = true;
        break;
      case TAGEND:
        break tagBody;
      default:
        // Just drop anything not recognized
        break;
    }
  }
  if (!attrsReadyForName) {
    // The final attribute in the tag was valueless.
    attrs.add(attrs.getLast());
  }
}
private static String stripQuotes(String encodedAttributeValue) {
int n = encodedAttributeValue.length();
if (n > 0) {
char last = encodedAttributeValue.charAt(n - 1);
if (last == '"' || last == '\'') {
int start = 0;
if (n != 1 && last == encodedAttributeValue.charAt(0)) {
start = 1;
} else {
// Browsers deal with missing left quotes :
// but generally do not deal with missing right :