All downloads are free. The search and download features use the official Maven repository.

com.google.gwt.safehtml.shared.SimpleHtmlSanitizer Maven / Gradle / Ivy

There is a newer version: 0.26
Show newest version
/*
 * Copyright 2010 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package com.google.gwt.safehtml.shared;

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

/**
 * A simple and relatively inexpensive HTML sanitizer.
 *
 * 

* This sanitizer accepts the subset of HTML consisting of the following * attribute-free tags: * *

    *
  • {@code }, {@code }, {@code }
  • *
  • {@code

    }, {@code

    }, {@code

    }, * {@code

    }, {@code

    }, {@code
    }
  • *
  • {@code
      }, {@code
        }. {@code
      1. }
      2. *
      3. {@code
        }
      4. *
    * * as well as numeric HTML entities and HTML entity references. Any HTML * metacharacters that do not appear as part of markup in this subset will be * HTML-escaped. */ public final class SimpleHtmlSanitizer implements HtmlSanitizer { private static final SimpleHtmlSanitizer INSTANCE = new SimpleHtmlSanitizer(); private static final Set TAG_WHITELIST = new HashSet( Arrays.asList("b", "em", "i", "h1", "h2", "h3", "h4", "h5", "h6", "hr", "ul", "ol", "li")); /** * Return a singleton SimpleHtmlSanitizer instance. * * @return the instance */ public static SimpleHtmlSanitizer getInstance() { return INSTANCE; } /** * HTML-sanitizes a string. * *

    * The input string is processed as described above. The result of sanitizing * the string is guaranteed to be safe to use (with respect to XSS * vulnerabilities) in HTML contexts, and is returned as an instance of the * {@link SafeHtml} type. * * @param html the input String * @return a sanitized SafeHtml instance */ public static SafeHtml sanitizeHtml(String html) { if (html == null) { throw new NullPointerException("html is null"); } return new SafeHtmlString(simpleSanitize(html)); } /* * Sanitize a string containing simple HTML markup as defined above. The * approach is as follows: We split the string at each occurence of '<'. Each * segment thus obtained is inspected to determine if the leading '<' was * indeed the start of a whitelisted tag or not. If so, the tag is emitted * unescaped, and the remainder of the segment (which cannot contain any * additional tags) is emitted in escaped form. Otherwise, the entire segment * is emitted in escaped form. * * In either case, EscapeUtils.htmlEscapeAllowEntities is used to escape, * which escapes HTML but does not double escape existing syntactially valid * HTML entities. */ // TODO(xtof): should this be in a utils class? private static String simpleSanitize(String text) { StringBuilder sanitized = new StringBuilder(); boolean firstSegment = true; for (String segment : text.split("<", -1)) { if (firstSegment) { /* * the first segment is never part of a valid tag; note that if the * input string starts with a tag, we will get an empty segment at the * beginning. */ firstSegment = false; sanitized.append(SafeHtmlUtils.htmlEscapeAllowEntities(segment)); continue; } /* * determine if the current segment is the start of an attribute-free tag * or end-tag in our whitelist */ int tagStart = 0; // will be 1 if this turns out to be an end tag. 
int tagEnd = segment.indexOf('>'); String tag = null; boolean isValidTag = false; if (tagEnd > 0) { if (segment.charAt(0) == '/') { tagStart = 1; } tag = segment.substring(tagStart, tagEnd); if (TAG_WHITELIST.contains(tag)) { isValidTag = true; } } if (isValidTag) { // append the tag, not escaping it if (tagStart == 0) { sanitized.append('<'); } else { // we had seen an end-tag sanitized.append("'); // append the rest of the segment, escaping it sanitized.append(SafeHtmlUtils.htmlEscapeAllowEntities( segment.substring(tagEnd + 1))); } else { // just escape the whole segment sanitized.append("<").append( SafeHtmlUtils.htmlEscapeAllowEntities(segment)); } } return sanitized.toString(); } /* * Note: We purposely do not provide a method to create a SafeHtml from * another (arbitrary) SafeHtml via sanitization, as this would permit the * construction of SafeHtml objects that are not stable in the sense that for * a {@code SafeHtml s} it may not be true that {@code s.asString()} equals * {@code SimpleHtmlSanitizer.sanitizeHtml(s.asString()).asString()}. While * this is not currently an issue, it might become one and result in * unexpected behavior if this class were to become serializable and enforce * its class invariant upon deserialization. */ // prevent external instantiation private SimpleHtmlSanitizer() { } public SafeHtml sanitize(String html) { return sanitizeHtml(html); } }