com.google.gwt.safehtml.shared.SafeHtmlUtils Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of gwt-servlet Show documentation
There is a newer version: 2.10.0
/*
 * Copyright 2010 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */
package com.google.gwt.safehtml.shared;

import com.google.gwt.regexp.shared.RegExp;
import com.google.gwt.safehtml.shared.annotations.IsSafeHtml;
import com.google.gwt.safehtml.shared.annotations.SuppressIsSafeHtmlCastCheck;

/**
 * Utility class containing static methods for escaping and sanitizing strings.
 */
public final class SafeHtmlUtils {

  /**
   * Pattern for the text between the leading {@code '&'} and the terminating
   * {@code ';'} of an HTML entity reference: a lower-case name, a decimal
   * numeric reference ({@code #123}) or a hexadecimal numeric reference
   * ({@code #x1F}). It is applied with {@link String#matches(String)}, so the
   * whole candidate must match.
   *
   * NOTE(review): names containing upper-case letters or digits, and the
   * {@code #X} hex prefix, are not matched by this pattern, so such references
   * are not recognised as entities. Confirm whether that is intentional
   * before widening it.
   */
  private static final String HTML_ENTITY_REGEX = "[a-z]+|#[0-9]+|#x[0-9a-fA-F]+";

  /**
   * A {@link SafeHtml} instance wrapping the empty string.
   */
  public static final SafeHtml EMPTY_SAFE_HTML = new SafeHtmlString("");

  // Matches a single HTML metacharacter (&, <, >, ' or "); used as a quick
  // "does this string need escaping at all?" test.
  private static final RegExp HTML_CHARS_RE = RegExp.compile("[&<>'\"]");
  // One expression per metacharacter, each compiled with the "g" flag for use
  // with RegExp.replace when escaping a whole string.
  private static final RegExp AMP_RE = RegExp.compile("&", "g");
  private static final RegExp GT_RE = RegExp.compile(">", "g");
  private static final RegExp LT_RE = RegExp.compile("<", "g");
  private static final RegExp SQUOT_RE = RegExp.compile("\'", "g");
  private static final RegExp QUOT_RE = RegExp.compile("\"", "g");

  /**
   * Returns a {@link SafeHtml} constructed from a safe string, i.e., without escaping
   * the string.
   *
   * 
   * Important: For this method to be able to honor the {@link SafeHtml}
   * contract, all uses of this method must satisfy the following constraints:
   *
   * 
   *
   * The argument expression must be fully determined at compile time.
   *
   * The value of the argument must end in "inner HTML" context and not
   * contain incomplete HTML tags. I.e., the following is not a correct use of
   * this method, because the {@code <a>} tag is incomplete:
   *
   *    * {@code shb.appendHtmlConstant("':
        return ">";
      case '"':
        return """;
      case '\'':
        return "'";
      default:
        return "" + c;
    }
  }

  /**
   * HTML-escapes a string.
   *
   * Note: The following variants of this function were profiled on FF40,
   * Chrome44, Safari 8 and IE11:
   * 

   * For each metachar, check indexOf, then use s.replace(regex, string)
   * 
For each metachar use s.replace(regex, string)
   * 
Manual replace each metachar by looping through characters in a loop.
   * 
Check if any metachar is present using a regex, then use #1.
   * 
Check if any metachar is present using a regex, then use #2.
   * 
Check if any metachar is present using a regex, then use #3.
   * 
   *
   * For all browsers #4 was found to be the fastest, and is used below.
   *
   * The only out-lier was firefox with #6 being the optimal option, but #6
   * performs considerably worse in all other browsers.
   *
   * @param s the string to be escaped
   * @return the input string, with all occurrences of HTML meta-characters
   *         replaced with their corresponding HTML Entity References
   */
  public static String htmlEscape(String s) {
    if (!HTML_CHARS_RE.test(s)) {
      return s;
    }
    if (s.indexOf("&") != -1) {
      s = AMP_RE.replace(s, "&");
    }
    if (s.indexOf("<") != -1) {
      s = LT_RE.replace(s, "<");
    }
    if (s.indexOf(">") != -1) {
      s = GT_RE.replace(s, ">");
    }
    if (s.indexOf("\"") != -1) {
      s = QUOT_RE.replace(s, """);
    }
    if (s.indexOf("'") != -1) {
      s = SQUOT_RE.replace(s, "'");
    }
    return s;
  }

  /**
   * HTML-escapes a string, but does not double-escape HTML-entities already
   * present in the string.
   *
   * <p>The input is split at every {@code '&'}. For each piece that follows an
   * ampersand, the text up to the first {@code ';'} is checked against the
   * entity pattern: if it matches, the ampersand and the entity are copied
   * through verbatim and only the remainder is escaped; otherwise the
   * ampersand itself is emitted as {@code &amp;} and the whole piece is
   * escaped.
   *
   * @param text the string to be escaped
   * @return the input string, with all occurrences of HTML meta-characters
   *         replaced with their corresponding HTML Entity References, with the
   *         exception that ampersand characters are not double-escaped if they
   *         form the start of an HTML Entity Reference
   */
  @IsSafeHtml
  @SuppressIsSafeHtmlCastCheck
  public static String htmlEscapeAllowEntities(String text) {
    StringBuilder escaped = new StringBuilder();

    boolean firstSegment = true;
    // A limit of -1 keeps trailing empty segments, so an ampersand at the very
    // end of the input is not lost.
    for (String segment : text.split("&", -1)) {
      if (firstSegment) {
        /*
         * The first segment is never part of an entity reference, so we always
         * escape it.
         * Note that if the input starts with an ampersand, we will get an empty
         * segment before that.
         */
        firstSegment = false;
        escaped.append(htmlEscape(segment));
        continue;
      }

      // entityEnd > 0 (rather than >= 0) rejects "&;", which has no entity name.
      int entityEnd = segment.indexOf(';');
      if (entityEnd > 0 && segment.substring(0, entityEnd).matches(HTML_ENTITY_REGEX)) {
        // Append the entity without escaping.
        escaped.append("&").append(segment.substring(0, entityEnd + 1));

        // Append the rest of the segment, escaped.
        escaped.append(htmlEscape(segment.substring(entityEnd + 1)));
      } else {
        // The segment did not start with an entity reference, so the ampersand
        // that preceded it is a bare metacharacter: escape it, and escape the
        // whole segment.
        escaped.append("&amp;").append(htmlEscape(segment));
      }
    }

    return escaped.toString();
  }

  // Private no-arg constructor: this is a utility class of static methods, so
  // it must not be instantiated from outside.
  private SafeHtmlUtils() {
  }
}