org.owasp.encoder.URIEncoder Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of aem-sdk-api Show documentation
Show all versions of aem-sdk-api Show documentation
The Adobe Experience Manager SDK
// Copyright (c) 2012 Jeff Ichnowski
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// * Redistributions of source code must retain the above
// copyright notice, this list of conditions and the following
// disclaimer.
//
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following
// disclaimer in the documentation and/or other materials
// provided with the distribution.
//
// * Neither the name of the OWASP nor the names of its
// contributors may be used to endorse or promote products
// derived from this software without specific prior written
// permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
// INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
// OF THE POSSIBILITY OF SUCH DAMAGE.
package org.owasp.encoder;
import java.nio.CharBuffer;
import java.nio.charset.CoderResult;
/**
* URIEncoder -- An encoder for URI based contexts.
*
* @author jeffi
*/
class URIEncoder extends Encoder {
/**
* Number of characters in the range '0' to '9'.
*/
static final int CHARS_0_TO_9 = 10;
/**
* Number of characters in the range 'a' to 'z'.
*/
static final int CHARS_A_TO_Z = 26;
/**
* Number of bits in a long.
*/
static final int LONG_BITS = 64;
/**
* Maximum number of characters quired to encode a single input character.
*/
static final int MAX_ENCODED_CHAR_LENGTH = 9;
/**
* Number of characters used to '%' encode a single hex-value.
*/
static final int PERCENT_ENCODED_LENGTH = 3;
/**
* Maximum code-point value that can be encoded with 2 utf-8 bytes.
*/
static final int MAX_UTF8_2_BYTE = 0x7ff;
/**
* When the encoded output requires 2 bytes, this is the high bits of the first byte.
*/
static final int UTF8_2_BYTE_FIRST_MSB = 0xc0;
/**
* When the encoded output requires 3 bytes, this is the high bits of the first byte.
*/
static final int UTF8_3_BYTE_FIRST_MSB = 0xe0;
/**
* When the encoded output requires 4 bytes, this is the high bits of the first byte.
*/
static final int UTF8_4_BYTE_FIRST_MSB = 0xf0;
/**
* For all characters in a 2-4 byte encoded sequence after the first this is the high bits of the input bytes.
*/
static final int UTF8_BYTE_MSB = 0x80;
/**
* UTF-8 encodes 6-bits of the code-point in each output UTF-8 byte.
*/
static final int UTF8_SHIFT = 6;
/**
* This is the mask containing 6-ones in the lower 6-bits.
*/
static final int UTF8_MASK = 0x3f;
/**
* The character to use when replacing an invalid character.
*/
static final char INVALID_REPLACEMENT_CHARACTER = '-';
/**
* RFC 3986 -- "The uppercase hexadecimal digits 'A' through 'F' are equivalent to the lowercase digits 'a' through 'f',
* respectively. If two URIs differ only in the case of hexadecimal digits used in percent- encoded octets, they are
* equivalent. For consistency, URI producers and normalizers should use uppercase hexadecimal digits for all percent-
* encodings."
*/
static final char[] UHEX = "0123456789ABCDEF".toCharArray();
// ASCII table
// 0x20: ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
// 0x40: @ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _
// 0x60: ` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~
// ASCII table of RFC 3986 "Unreserved Characters"
// 0x20: - . 0 1 2 3 4 5 6 7 8 9
// 0x40: A B C D E F G H I J K L M N O P Q R S T U V W X Y Z _
// 0x60: a b c d e f g h i j k l m n o p q r s t u v w x y z ~
// Note: (1L << n) - 1 is bit arithmetic to get n 1 bits.
// e.g. (1L << 10) = binary 10000000000
// binary 10000000000 = 1111111111
/**
* RFC 3986 Unreserved Characters. The first 64.
*
* unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
*
*/
static final long UNRESERVED_MASK_LOW
= (((1L << CHARS_0_TO_9) - 1) << '0') | (1L << '-') | (1L << '.');
/**
* RFC 3986 Unreserved Characters. The second 64.
*
* unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
*
*/
static final long UNRESERVED_MASK_HIGH
= (((1L << CHARS_A_TO_Z) - 1) << ('a' - LONG_BITS))
| (((1L << CHARS_A_TO_Z) - 1) << ('A' - LONG_BITS))
| (1L << ('_' - LONG_BITS)) | (1L << ('~' - LONG_BITS));
/**
* RFC 3986 Reserved Characters. The first 64.
*
* reserved = gen-delims / sub-delims
*
* gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
*
* sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
* / "*" / "+" / "," / ";" / "="
*
*/
static final long RESERVED_MASK_LOW
= // gen-delims
(1L << ':') | (1L << '/') | (1L << '?') | (1L << '#')
| // sub-delims
(1L << '!') | (1L << '$') | (1L << '&') | (1L << '\'')
| (1L << '(') | (1L << ')') | (1L << '*') | (1L << '+')
| (1L << ',') | (1L << ';') | (1L << '=');
/**
* The second 64 RFC 3986 Reserved characters.
*/
static final long RESERVED_MASK_HIGH
= // gen-delims
(1L << ('[' - LONG_BITS)) | (1L << (']' - LONG_BITS)) | (1L << ('@' - LONG_BITS));
/**
* Encoding mode of operation for URI encodes. The modes define which characters get encoded using %-encoding, and which do
* not.
*/
public enum Mode {
/**
* In "component" mode, only the unreserved characters are left unescaped. Everything else is escaped.
*/
COMPONENT(UNRESERVED_MASK_LOW, UNRESERVED_MASK_HIGH),
/**
* In "full" mode, all unreserved and reserved characters are left unescaped. Anything else is escaped.
*/
FULL_URI(UNRESERVED_MASK_LOW | RESERVED_MASK_LOW,
UNRESERVED_MASK_HIGH | RESERVED_MASK_HIGH),;
/**
* The low bit-mask--copied into the _lowMask of the encoder.
*/
final long _lowMask;
/**
* The high bit-mask--copied into the _highMask of the encoder.
*/
final long _highMask;
/**
* Constructor to create a mode with the specified bit-masks.
*
* @param low the low bit-mask (characters 0 to 63)
* @param high the high bit-mask (characters 64 to 127)
*/
Mode(long low, long high) {
_lowMask = low;
_highMask = high;
}
/**
* Accessor for the low bit-mask.
*
* @return _lowMask
*/
long lowMask() {
return _lowMask;
}
/**
* Accessor for the high bit-mask.
*
* @return _highMask
*/
long highMask() {
return _highMask;
}
}
/**
* The bit-mask of characters that do not need to be escaped, for characters with code-points in the range 0 to 63.
*/
private final long _lowMask;
/**
* The bit-mask of characters that do not need to be escaped, for character with code-points in the range 64 to 127.
*/
private final long _highMask;
/**
* The encoding mode for this encoder--used primarily for toString().
*/
private final Mode _mode;
/**
* Constructor equivalent to @{code URIEncoder(Mode.FULL_URI)}.
*/
URIEncoder() {
this(Mode.FULL_URI);
}
/**
* Constructor for the URIEncoder the specifies the encoding mode the URIEncoder will use.
*
* @param mode the encoding mode for this encoder.
*/
URIEncoder(Mode mode) {
_mode = mode;
_lowMask = mode.lowMask();
_highMask = mode.highMask();
}
@Override
protected int maxEncodedLength(int n) {
// '\uffff' becomes 3 UTF-8 bytes, percent encoded to 9 characters.
//
// Note: Surrogate pairs (2 chars) become 4 UTF-8 or 12 characters,
// or 6 encoded characters per input char.
return n * MAX_ENCODED_CHAR_LENGTH;
}
@Override
protected int firstEncodedOffset(String input, int off, int len) {
final int n = off + len;
for (int i = off; i < n; ++i) {
char ch = input.charAt(i);
if (ch <= Unicode.DEL) {
if (!(ch < LONG_BITS ? (_lowMask & (1L << ch)) != 0 : (_highMask & (1L << (ch - LONG_BITS))) != 0)) {
return i;
// } else {
// // valid
}
} else {
return i;
}
}
return n;
}
@Override
protected CoderResult encodeArrays(CharBuffer input, CharBuffer output, boolean endOfInput) {
final char[] in = input.array();
final char[] out = output.array();
int i = input.arrayOffset() + input.position();
final int n = input.arrayOffset() + input.limit();
int j = output.arrayOffset() + output.position();
final int m = output.arrayOffset() + output.limit();
for (; i < n; ++i) {
char ch = in[i];
if (ch <= Unicode.DEL) {
if (ch < LONG_BITS ? (_lowMask & (1L << ch)) != 0 : (_highMask & (1L << (ch - LONG_BITS))) != 0) {
if (j >= m) {
return overflow(input, i, output, j);
}
out[j++] = ch;
} else {
// one UTF-8 byte
if (j + PERCENT_ENCODED_LENGTH > m) {
return overflow(input, i, output, j);
}
out[j++] = '%';
out[j++] = UHEX[ch >>> HEX_SHIFT];
out[j++] = UHEX[ch & HEX_MASK];
}
} else if (ch <= MAX_UTF8_2_BYTE) {
if (j + 2 * PERCENT_ENCODED_LENGTH > m) {
return overflow(input, i, output, j);
}
// two UTF-8 bytes
int b1 = UTF8_2_BYTE_FIRST_MSB | (ch >>> UTF8_SHIFT);
out[j++] = '%';
out[j++] = UHEX[b1 >>> HEX_SHIFT];
out[j++] = UHEX[b1 & HEX_MASK];
int b2 = UTF8_BYTE_MSB | (ch & UTF8_MASK);
out[j++] = '%';
out[j++] = UHEX[b2 >>> HEX_SHIFT];
out[j++] = UHEX[b2 & HEX_MASK];
} else if (ch < Character.MIN_HIGH_SURROGATE || ch > Character.MAX_LOW_SURROGATE) {
if (j + 3 * PERCENT_ENCODED_LENGTH > m) {
return overflow(input, i, output, j);
}
// three UTF-8 bytes
int b1 = UTF8_3_BYTE_FIRST_MSB | (ch >>> 2 * UTF8_SHIFT);
out[j++] = '%';
out[j++] = UHEX[b1 >>> HEX_SHIFT];
out[j++] = UHEX[b1 & HEX_MASK];
int b2 = UTF8_BYTE_MSB | ((ch >>> UTF8_SHIFT) & UTF8_MASK);
out[j++] = '%';
out[j++] = UHEX[b2 >>> HEX_SHIFT];
out[j++] = UHEX[b2 & HEX_MASK];
int b3 = UTF8_BYTE_MSB | (ch & UTF8_MASK);
out[j++] = '%';
out[j++] = UHEX[b3 >>> HEX_SHIFT];
out[j++] = UHEX[b3 & HEX_MASK];
} else if (ch <= Character.MAX_HIGH_SURROGATE) {
// surrogate pair: 2 UTF-16 => 4 UTF-8 bytes
if (i + 1 < n) {
if (Character.isLowSurrogate(in[i + 1])) {
if (j + 4 * PERCENT_ENCODED_LENGTH > m) {
return overflow(input, i, output, j);
}
int cp = Character.toCodePoint(ch, in[++i]);
int b1 = UTF8_4_BYTE_FIRST_MSB | (cp >>> 3 * UTF8_SHIFT);
out[j++] = '%';
out[j++] = UHEX[b1 >>> HEX_SHIFT];
out[j++] = UHEX[b1 & HEX_MASK];
int b2 = UTF8_BYTE_MSB | ((cp >>> 2 * UTF8_SHIFT) & UTF8_MASK);
out[j++] = '%';
out[j++] = UHEX[b2 >>> HEX_SHIFT];
out[j++] = UHEX[b2 & HEX_MASK];
int b3 = UTF8_BYTE_MSB | ((cp >>> UTF8_SHIFT) & UTF8_MASK);
out[j++] = '%';
out[j++] = UHEX[b3 >>> HEX_SHIFT];
out[j++] = UHEX[b3 & HEX_MASK];
int b4 = UTF8_BYTE_MSB | (cp & UTF8_MASK);
out[j++] = '%';
out[j++] = UHEX[b4 >>> HEX_SHIFT];
out[j++] = UHEX[b4 & HEX_MASK];
} else {
if (j >= m) {
return overflow(input, i, output, j);
}
out[j++] = INVALID_REPLACEMENT_CHARACTER;
}
} else if (endOfInput) {
if (j >= m) {
return overflow(input, i, output, j);
}
out[j++] = INVALID_REPLACEMENT_CHARACTER;
} else {
break;
}
} else {
// invalid (low surrogate without preceeding high surrogate)
if (j >= m) {
return overflow(input, i, output, j);
}
out[j++] = INVALID_REPLACEMENT_CHARACTER;
}
}
return underflow(input, i, output, j);
}
@Override
public String toString() {
return "URIEncoder(mode=" + _mode + ")";
}
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy