All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.gwtproject.safehtml.shared.SimpleHtmlSanitizer Maven / Gradle / Ivy

The newest version!
/*
 * Copyright © 2019 The GWT Project Authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.gwtproject.safehtml.shared;

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

/**
 * A simple and relatively inexpensive HTML sanitizer.
 *
 * 

This sanitizer accepts the subset of HTML consisting of the following attribute-free tags: * *

    *
  • {@code }, {@code }, {@code } *
  • {@code

    }, {@code

    }, {@code

    }, {@code

    }, {@code

    }, {@code
    } *
  • {@code
      }, {@code
        }, {@code
      1. } *
      2. {@code
        }, {@code
        }, {@code } *
    * *

    as well as numeric HTML entities and HTML entity references. Any HTML metacharacters that do * not appear as part of markup in this subset will be HTML-escaped. */ public final class SimpleHtmlSanitizer implements HtmlSanitizer { private static final SimpleHtmlSanitizer INSTANCE = new SimpleHtmlSanitizer(); private static final Set TAG_WHITELIST = new HashSet( Arrays.asList( "b", "em", "i", "h1", "h2", "h3", "h4", "h5", "h6", "hr", "ul", "ol", "li", "strong", "br")); // prevent external instantiation private SimpleHtmlSanitizer() {} /** * Return a singleton SimpleHtmlSanitizer instance. * * @return the instance */ public static SimpleHtmlSanitizer getInstance() { return INSTANCE; } @Override public SafeHtml sanitize(String html) { return sanitizeHtml(html); } /* * Note: We purposely do not provide a method to create a SafeHtml from * another (arbitrary) SafeHtml via sanitization, as this would permit the * construction of SafeHtml objects that are not stable in the sense that for * a {@code SafeHtml s} it may not be true that {@code s.asString()} equals * {@code SimpleHtmlSanitizer.sanitizeHtml(s.asString()).asString()}. While * this is not currently an issue, it might become one and result in * unexpected behavior if this class were to become serializable and enforce * its class invariant upon deserialization. */ /** * HTML-sanitizes a string. * *

    The input string is processed as described above. The result of sanitizing the string is * guaranteed to be safe to use (with respect to XSS vulnerabilities) in HTML contexts, and is * returned as an instance of the {@link SafeHtml} type. * * @param html the input String * @return a sanitized SafeHtml instance */ public static SafeHtml sanitizeHtml(String html) { if (html == null) { throw new NullPointerException("html is null"); } return new SafeHtmlString(simpleSanitize(html)); } /* * Sanitize a string containing simple HTML markup as defined above. The * approach is as follows: We split the string at each occurence of '<'. Each * segment thus obtained is inspected to determine if the leading '<' was * indeed the start of a whitelisted tag or not. If so, the tag is emitted * unescaped, and the remainder of the segment (which cannot contain any * additional tags) is emitted in escaped form. Otherwise, the entire segment * is emitted in escaped form. * * In either case, EscapeUtils.htmlEscapeAllowEntities is used to escape, * which escapes HTML but does not double escape existing syntactially valid * HTML entities. */ // TODO(xtof): should this be in a utils class? private static String simpleSanitize(String text) { StringBuilder sanitized = new StringBuilder(); boolean firstSegment = true; for (String segment : text.split("<", -1)) { if (firstSegment) { /* * the first segment is never part of a valid tag; note that if the * input string starts with a tag, we will get an empty segment at the * beginning. */ firstSegment = false; sanitized.append(SafeHtmlUtils.htmlEscapeAllowEntities(segment)); continue; } /* * determine if the current segment is the start of an attribute-free tag * or end-tag in our whitelist */ int tagStart = 0; // will be 1 if this turns out to be an end tag. int tagEnd = segment.indexOf('>'); String tag = null; boolean isValidTag = false; if (tagEnd > 0) { if (segment.charAt(0) == '/') { tagStart = 1; } tag = segment.substring(tagStart, tagEnd); if (TAG_WHITELIST.contains(tag)) { isValidTag = true; } } if (isValidTag) { // append the tag, not escaping it if (tagStart == 0) { sanitized.append('<'); } else { // we had seen an end-tag sanitized.append("'); // append the rest of the segment, escaping it sanitized.append(SafeHtmlUtils.htmlEscapeAllowEntities(segment.substring(tagEnd + 1))); } else { // just escape the whole segment sanitized.append("<").append(SafeHtmlUtils.htmlEscapeAllowEntities(segment)); } } return sanitized.toString(); } }