com.squareup.haha.guava.net.PercentEscaper Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of haha Show documentation
Java library to automate the analysis of Android heap dumps.
There is a newer version: 2.1
/*
 * Copyright (C) 2008 The Guava Authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.squareup.haha.guava.net;

import static com.squareup.haha.guava.base.Preconditions.checkNotNull;

import com.squareup.haha.guava.annotations.Beta;
import com.squareup.haha.guava.annotations.GwtCompatible;
import com.squareup.haha.guava.escape.UnicodeEscaper;

/**
 * A {@code UnicodeEscaper} that escapes some set of Java characters using a
 * UTF-8 based percent encoding scheme. The set of safe characters (those which
 * remain unescaped) can be specified on construction.
 *
 * This class is primarily used for creating URI escapers in {@link
 * UrlEscapers} but can be used directly if required. While URI escapers impose
 * specific semantics on which characters are considered 'safe', this class has
 * a minimal set of restrictions.
 *
 * 
When escaping a String, the following rules apply:
 * 

 * All specified safe characters remain unchanged.
 * 
If {@code plusForSpace} was specified, the space character " " is
 *     converted into a plus sign {@code "+"}.
 * 
All other characters are converted into one or more bytes using UTF-8
 *     encoding and each byte is then represented by the 3-character string
 *     "%XX", where "XX" is the two-digit, uppercase, hexadecimal representation
 *     of the byte value.
 * 
 *
 * For performance reasons the only currently supported character encoding of
 * this class is UTF-8.
 *
 * 
Note: This escaper produces uppercase hexadecimal sequences. From
 * RFC 3986:

 * "URI producers and normalizers should use uppercase hexadecimal digits
 * for all percent-encodings."
 *
 * @author David Beaumont
 * @since 15.0
 */
@Beta
@GwtCompatible
public final class PercentEscaper extends UnicodeEscaper {

  // In some escapers spaces are escaped to '+'
  private static final char[] PLUS_SIGN = { '+' };

  // Percent escapers output upper case hex digits (uri escapers require this).
  private static final char[] UPPER_HEX_DIGITS =
      "0123456789ABCDEF".toCharArray();

  /**
   * If true we should convert space to the {@code +} character.
   */
  private final boolean plusForSpace;

  /**
   * An array of flags where for any {@code char c} if {@code safeOctets[c]} is
   * true then {@code c} should remain unmodified in the output. If
   * {@code c > safeOctets.length} then it should be escaped.
   */
  private final boolean[] safeOctets;

  /**
   * Constructs a percent escaper with the specified safe characters and
   * optional handling of the space character.
   *
   * Not that it is allowed, but not necessarily desirable to specify {@code %}
   * as a safe character. This has the effect of creating an escaper which has no
   * well defined inverse but it can be useful when escaping additional characters.
   *
   * @param safeChars a non null string specifying additional safe characters
   *        for this escaper (the ranges 0..9, a..z and A..Z are always safe and
   *        should not be specified here)
   * @param plusForSpace true if ASCII space should be escaped to {@code +}
   *        rather than {@code %20}
   * @throws IllegalArgumentException if any of the parameters were invalid
   */
  public PercentEscaper(String safeChars, boolean plusForSpace) {
    // TODO(user): Switch to static factory methods for creation now that class is final.
    // TODO(user): Support escapers where alphanumeric chars are not safe.
    checkNotNull(safeChars);  // eager for GWT.
    // Avoid any misunderstandings about the behavior of this escaper
    if (safeChars.matches(".*[0-9A-Za-z].*")) {
      throw new IllegalArgumentException(
          "Alphanumeric characters are always 'safe' and should not be " +
          "explicitly specified");
    }
    safeChars += "abcdefghijklmnopqrstuvwxyz" +
                 "ABCDEFGHIJKLMNOPQRSTUVWXYZ" +
                 "0123456789";
    // Avoid ambiguous parameters. Safe characters are never modified so if
    // space is a safe character then setting plusForSpace is meaningless.
    if (plusForSpace && safeChars.contains(" ")) {
      throw new IllegalArgumentException(
          "plusForSpace cannot be specified when space is a 'safe' character");
    }
    this.plusForSpace = plusForSpace;
    this.safeOctets = createSafeOctets(safeChars);
  }

  /**
   * Creates a boolean array with entries corresponding to the character values
   * specified in safeChars set to true. The array is as small as is required to
   * hold the given character information.
   */
  private static boolean[] createSafeOctets(String safeChars) {
    int maxChar = -1;
    char[] safeCharArray = safeChars.toCharArray();
    for (char c : safeCharArray) {
      maxChar = Math.max(c, maxChar);
    }
    boolean[] octets = new boolean[maxChar + 1];
    for (char c : safeCharArray) {
      octets[c] = true;
    }
    return octets;
  }

  /*
   * Overridden for performance. For unescaped strings this improved the
   * performance of the uri escaper from ~760ns to ~400ns as measured by
   * {@link CharEscapersBenchmark}.
   */
  @Override
  protected int nextEscapeIndex(CharSequence csq, int index, int end) {
    checkNotNull(csq);
    for (; index < end; index++) {
      char c = csq.charAt(index);
      if (c >= safeOctets.length || !safeOctets[c]) {
        break;
      }
    }
    return index;
  }

  /*
   * Overridden for performance. For unescaped strings this improved the
   * performance of the uri escaper from ~400ns to ~170ns as measured by
   * {@link CharEscapersBenchmark}.
   */
  @Override
  public String escape(String s) {
    checkNotNull(s);
    int slen = s.length();
    for (int index = 0; index < slen; index++) {
      char c = s.charAt(index);
      if (c >= safeOctets.length || !safeOctets[c]) {
        return escapeSlow(s, index);
      }
    }
    return s;
  }

  /**
   * Escapes the given Unicode code point in UTF-8.
   */
  @Override
  protected char[] escape(int cp) {
    // We should never get negative values here but if we do it will throw an
    // IndexOutOfBoundsException, so at least it will get spotted.
    if (cp < safeOctets.length && safeOctets[cp]) {
      return null;
    } else if (cp == ' ' && plusForSpace) {
      return PLUS_SIGN;
    } else if (cp <= 0x7F) {
      // Single byte UTF-8 characters
      // Start with "%--" and fill in the blanks
      char[] dest = new char[3];
      dest[0] = '%';
      dest[2] = UPPER_HEX_DIGITS[cp & 0xF];
      dest[1] = UPPER_HEX_DIGITS[cp >>> 4];
      return dest;
    } else if (cp <= 0x7ff) {
      // Two byte UTF-8 characters [cp >= 0x80 && cp <= 0x7ff]
      // Start with "%--%--" and fill in the blanks
      char[] dest = new char[6];
      dest[0] = '%';
      dest[3] = '%';
      dest[5] = UPPER_HEX_DIGITS[cp & 0xF];
      cp >>>= 4;
      dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
      cp >>>= 2;
      dest[2] = UPPER_HEX_DIGITS[cp & 0xF];
      cp >>>= 4;
      dest[1] = UPPER_HEX_DIGITS[0xC | cp];
      return dest;
    } else if (cp <= 0xffff) {
      // Three byte UTF-8 characters [cp >= 0x800 && cp <= 0xffff]
      // Start with "%E-%--%--" and fill in the blanks
      char[] dest = new char[9];
      dest[0] = '%';
      dest[1] = 'E';
      dest[3] = '%';
      dest[6] = '%';
      dest[8] = UPPER_HEX_DIGITS[cp & 0xF];
      cp >>>= 4;
      dest[7] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
      cp >>>= 2;
      dest[5] = UPPER_HEX_DIGITS[cp & 0xF];
      cp >>>= 4;
      dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
      cp >>>= 2;
      dest[2] = UPPER_HEX_DIGITS[cp];
      return dest;
    } else if (cp <= 0x10ffff) {
      char[] dest = new char[12];
      // Four byte UTF-8 characters [cp >= 0xffff && cp <= 0x10ffff]
      // Start with "%F-%--%--%--" and fill in the blanks
      dest[0] = '%';
      dest[1] = 'F';
      dest[3] = '%';
      dest[6] = '%';
      dest[9] = '%';
      dest[11] = UPPER_HEX_DIGITS[cp & 0xF];
      cp >>>= 4;
      dest[10] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
      cp >>>= 2;
      dest[8] = UPPER_HEX_DIGITS[cp & 0xF];
      cp >>>= 4;
      dest[7] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
      cp >>>= 2;
      dest[5] = UPPER_HEX_DIGITS[cp & 0xF];
      cp >>>= 4;
      dest[4] = UPPER_HEX_DIGITS[0x8 | (cp & 0x3)];
      cp >>>= 2;
      dest[2] = UPPER_HEX_DIGITS[cp & 0x7];
      return dest;
    } else {
      // If this ever happens it is due to bug in UnicodeEscaper, not bad input.
      throw new IllegalArgumentException(
          "Invalid unicode character value " + cp);
    }
  }
}