All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.owasp.html.Encoding Maven / Gradle / Ivy

There is a newer version: 20240325.1
Show newest version
// Copyright (c) 2012, Mike Samuel
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// Neither the name of the OWASP nor the names of its contributors may
// be used to endorse or promote products derived from this software
// without specific prior written permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.

package org.owasp.html;

import java.io.IOException;

import javax.annotation.Nullable;

/** Encoders and decoders for HTML. */
public final class Encoding {

  /**
   * Decodes HTML entities to produce a string containing only valid
   * Unicode scalar values.
   *
   * @param s text/html
   * @return text/plain
   */
  public static String decodeHtml(String s) {
    int firstAmp = s.indexOf('&');
    int safeLimit = longestPrefixOfGoodCodeunits(s);
    if ((firstAmp & safeLimit) < 0) { return s; }

    StringBuilder sb;
    {
      int n = s.length();
      sb = new StringBuilder(n);
      int pos = 0;
      int amp = firstAmp;
      while (amp >= 0) {
        sb.append(s, pos, amp);
        int end = HtmlEntities.appendDecodedEntity(s, amp, n, sb);
        pos = end;
        amp = s.indexOf('&', end);
      }
      sb.append(s, pos, n);
    }

    stripBannedCodeunits(
        sb,
        firstAmp < 0
          ? safeLimit : safeLimit < 0
          ? firstAmp : Math.min(firstAmp, safeLimit));

    return sb.toString();
  }

  /**
   * Returns the portion of its input that consists of XML safe chars.
   * @see XML Ch. 2.2 - Characters
   */
  @TCB
  static String stripBannedCodeunits(String s) {
    int safeLimit = longestPrefixOfGoodCodeunits(s);
    if (safeLimit < 0) { return s; }

    StringBuilder sb = new StringBuilder(s);
    stripBannedCodeunits(sb, safeLimit);
    return sb.toString();
  }

  /**
   * Leaves in the input buffer only code-units that comprise XML safe chars.
   * @see XML Ch. 2.2 - Characters
   */
  @TCB
  static void stripBannedCodeunits(StringBuilder sb) {
    stripBannedCodeunits(sb, 0);
  }

  @TCB
  private static void stripBannedCodeunits(StringBuilder sb, int start) {
    int k = start;
    for (int i = start, n = sb.length(); i < n; ++i) {
      char ch = sb.charAt(i);
      if (ch < 0x20) {
        if (IS_BANNED_ASCII[ch]) {
          continue;
        }
      } else if (0xd800 <= ch) {
        if (ch <= 0xdfff) {
          if (i+1 < n) {
            char next = sb.charAt(i+1);
            if (Character.isSurrogatePair(ch, next)) {
              sb.setCharAt(k++, ch);
              sb.setCharAt(k++, next);
              ++i;
            }
          }
          continue;
        } else if ((ch & 0xfffe) == 0xfffe) {
          continue;
        }
      }
      sb.setCharAt(k++, ch);
    }
    sb.setLength(k);
  }

  /**
   * The number of code-units at the front of s that form code-points in the
   * XML Character production.
   * @return -1 if all of s is in the XML Character production.
   */
  @TCB
  private static int longestPrefixOfGoodCodeunits(String s) {
    int n = s.length(), i;
    for (i = 0; i < n; ++i) {
      char ch = s.charAt(i);
      if (ch < 0x20) {
        if (IS_BANNED_ASCII[ch]) {
          return i;
        }
      } else if (0xd800 <= ch) {
        if (ch <= 0xdfff) {
          if (i+1 < n && Character.isSurrogatePair(ch, s.charAt(i+1))) {
            ++i;  // Skip over low surrogate since we know it's ok.
          } else {
            return i;
          }
        } else if ((ch & 0xfffe) == 0xfffe) {
          return i;
        }
      }
    }
    return -1;
  }

  /**
   * Appends an encoded form of plainText to output where the encoding is
   * sufficient to prevent an HTML parser from interpreting any characters in
   * the appended chunk as part of an attribute or tag boundary.
   *
   * @param plainText text/plain
   * @param output a buffer of text/html that has a well-formed HTML prefix that
   *     ends after the open-quote of an attribute value and does not yet contain
   *     a corresponding close quote.
   *     Modified in place.
   */
  static void encodeHtmlAttribOnto(String plainText, Appendable output)
      throws IOException {
    encodeHtmlOnto(plainText, output, "{\u200B");
  }

  /**
   * Appends an encoded form of plainText to putput where the encoding is
   * sufficient to prevent an HTML parser from transitioning out of the
   * 
   * Data state.
   *
   * This is suitable for encoding a text node inside any element that does not
   * require special handling as a context element (see "context element" in
   * 
   * step 4.)
   *
   * @param plainText text/plain
   * @param output a buffer of text/html that has a well-formed HTML prefix that
   *     would leave an HTML parser in the Data state if it were to encounter a space
   *     character as the next character.  In practice this means that the buffer
   *     does not contain partial tags or comments, and does not have an unclosed
   *     element with a special content model.
   */
  static void encodePcdataOnto(String plainText, Appendable output)
      throws IOException {
    // Avoid problems with client-side template languages like
    // Angular & Polymer which attach special significance to text like
    // {{...}}.
    // We split brackets so that these template languages don't end up
    // executing expressions in sanitized text.
    encodeHtmlOnto(plainText, output, "{");
  }

  /**
   * Appends an encoded form of plainText to putput where the encoding is
   * sufficient to prevent an HTML parser from transitioning out of the
   * 
   * RCDATA state.
   *
   * This is suitable for encoding a text node inside a {@code