net.java.truecommons.shed.UriEncoder Maven / Gradle / Ivy
/*
* Copyright (C) 2005-2012 Schlichtherle IT Services.
* All rights reserved. Use is subject to license terms.
*/
package net.java.truecommons.shed;
import java.net.URISyntaxException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import static java.nio.charset.CoderResult.OVERFLOW;
import static java.nio.charset.CoderResult.UNDERFLOW;
import javax.annotation.CheckForNull;
import javax.annotation.concurrent.NotThreadSafe;
/**
* Escapes illegal characters in URI components according to
* RFC 2396
* and its updates in
* RFC 2732
* for IPv6 addresses.
*
* @see
* RFC 2396: Uniform Resource Identifiers (URI): Generic Syntax
* @see
* RFC 2732: Format for Literal IPv6 Addresses in URL's
* @see UriBuilder
* @see UriDecoder
* @author Christian Schlichtherle
*/
@NotThreadSafe
final class UriEncoder {
/** The default character set. */
public static final Charset UTF8 = Charset.forName("UTF-8");
private static final char[] HEX = {
'0', '1', '2', '3', '4', '5', '6', '7',
'8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
};
private static final String
ALPHANUM_CHARS = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
private static final String MARK_CHARS = "-_.!~*'()";
private static final String
DEFAULT_LEGAL_CHARS = ALPHANUM_CHARS + MARK_CHARS + ",;$&+=@";
private final CharsetEncoder encoder;
private final boolean encode;
private final boolean raw;
private @CheckForNull StringBuilder stringBuilder;
/**
* Constructs a new URI encoder which uses the UTF-8 character set to
* escape non-US-ASCII characters.
* Equivalent to {@link #UriEncoder(Charset, boolean) UriEncoder(UTF8, false)}.
*/
UriEncoder() {
this(UTF8, false);
}
/**
* Constructs a new URI codec which uses the UTF-8 character set to encode
* non-US-ASCII characters.
* Equivalent to {@link #UriEncoder(Charset, boolean) UriEncoder(UTF8, false)}.
*
* @param raw If {@code true}, then the {@code '%'} character doesn't get
* quoted.
*/
UriEncoder(boolean raw) {
this(UTF8, raw);
}
/**
* Constructs a new URI codec which uses the given character set to encode
* non-US-ASCII characters.
* Equivalent to {@link #UriEncoder(Charset, boolean) UriEncoder(charset, false)}.
*
* @param charset the character set to use for encoding non-US-ASCII
* characters.
* If this parameter is {@code null},
* then non-US-ASCII characters will get encoded to {@code UTF-8}
* if and only if {@link Character#isISOControl(char)} or
* {@link Character#isSpaceChar(char)} is {@code true},
* so that most non-US-ASCII character would get preserved.
* Note that providing any other value than {@code null} or
* {@code UTF-8} will void interoperability with most applications.
*/
UriEncoder(@CheckForNull Charset charset) {
this(charset, false);
}
/**
* Constructs a new URI codec which uses the given character set to escape
* non-US-ASCII characters.
*
*
* @param charset the character set to use for encoding non-US-ASCII
* characters.
* If this parameter is {@code null},
* then non-US-ASCII characters will get encoded to {@code UTF-8}
* if and only if {@link Character#isISOControl(char)} or
* {@link Character#isSpaceChar(char)} is {@code true},
* so that most non-US-ASCII character would get preserved.
* Note that providing any other value than {@code null} or
* {@code UTF-8} will void interoperability with most applications.
* @param raw If {@code true}, then the {@code '%'} character doesn't get
* quoted.
*/
UriEncoder(@CheckForNull Charset charset, final boolean raw) {
if (!(this.encode = null != charset))
charset = UTF8;
this.encoder = charset.newEncoder();
this.raw = raw;
}
boolean isRaw() {
return raw;
}
/**
* Encodes all characters in the string {@code dS} which are illegal within
* the URI component {@code comp}.
*
* Note that calling this method on an already encoded string escapes any
* escape sequences again, that is, each occurence of the character
* {@code '%'} is substituted with the string {@code "%25"} again.
*
* @param dS the decoded string to encode.
* @param comp the URI component to encode.
* @return The encoded string.
* @throws IllegalArgumentException on any encoding error with a
* {@link URISyntaxException} as its
* {@link IllegalArgumentException#getCause() cause}.
* This exception should never occur if the character set of this
* codec is UTF-8.
*/
public String encode(String dS, Encoding comp) {
try {
StringBuilder eS = encode(dS, comp, null);
return null != eS ? eS.toString() : dS;
} catch (URISyntaxException ex) {
throw new IllegalArgumentException(ex);
}
}
/**
* Encodes all characters in the string {@code dS} which are illegal within
* the URI component {@code comp} to the string builder {@code eS}.
*
* Note that calling this method on an already encoded string escapes
* any escape sequences again, that is, each occurence of the character
* {@code '%'} is substituted with the string {@code "%25"} again.
*
* @param dS the decoded string to encode.
* @param comp the URI component to encode.
* @param eS the string builder to which all encoded characters shall get
* appended.
* @return If {@code dS} contains only legal characters for the URI
* component {@code comp}, then {@code null} gets returned.
* Otherwise, if {@code eS} is not {@code null}, then it gets
* returned with all encoded characters appended to it.
* Otherwise, a temporary string builder gets returned which solely
* contains all encoded characters.
* This temporary string builder may get cleared and reused upon
* the next call to any method of this object.
* @throws URISyntaxException on any encoding error.
* This exception should never occur if the character set of this
* codec is UTF-8.
* If it occurs however, {@code eS} is left in an undefined state.
*/
public @CheckForNull StringBuilder encode(
final String dS,
final Encoding comp,
@CheckForNull StringBuilder eS) // encoded String
throws URISyntaxException {
final String[] escapes = comp.escapes;
final CharBuffer dC = CharBuffer.wrap(dS); // decoded characters
ByteBuffer eB = null; // encoded bytes
final CharsetEncoder enc = encoder;
final boolean encode = this.encode;
while (dC.hasRemaining()) {
dC.mark();
final char dc = dC.get(); // decoded character
if (dc < 0x80) {
final String es = escapes[dc]; // escape sequence
if (!(null == es || '%' == dc && raw)) {
if (null == eB) {
if (null == eS) {
if (null == (eS = stringBuilder))
eS = stringBuilder = new StringBuilder();
else
eS.setLength(0);
eS.append(dS, 0, dC.position() - 1); // prefix until current character
}
eB = ByteBuffer.allocate(3);
}
eS.append(es);
} else if (null != eS) {
eS.append(dc);
}
} else if (Character.isISOControl(dc) ||
Character.isSpaceChar(dc) ||
encode) {
if (null == eB) {
if (null == eS) {
if (null == (eS = stringBuilder))
eS = stringBuilder = new StringBuilder();
else
eS.setLength(0);
eS.append(dS, 0, dC.position() - 1); // prefix until current character
}
eB = ByteBuffer.allocate(3);
}
final int p = dC.position();
dC.reset();
dC.limit(p);
{ // Encode dC -> eB.
CoderResult cr;
if (UNDERFLOW != (cr = enc.reset().encode(dC, eB, true))
|| UNDERFLOW != (cr = enc.flush(eB))) {
assert OVERFLOW != cr;
throw new QuotedUriSyntaxException(dS, cr.toString());
}
}
eB.flip();
quote(eB, eS);
eB.clear();
dC.limit(dC.capacity());
} else if (null != eS) {
eS.append(dc);
}
}
return null == eB ? null : eS;
}
private static void quote(final char dc, final StringBuilder eS) {
quote(UTF8.encode(CharBuffer.wrap(Character.toString(dc))), eS);
}
private static void quote(final ByteBuffer eB, final StringBuilder eS) {
while (eB.hasRemaining()) {
final byte eb = eB.get();
eS.append('%');
eS.append(HEX[(eb >> 4) & 0xf]);
eS.append(HEX[ eb & 0xf]);
}
}
/**
* Defines the escape sequences for illegal characters in various URI
* components.
*/
@SuppressWarnings("PackageVisibleInnerClass")
public enum Encoding {
/**
* Encoding which can be safely used for any URI component, except the
* URI scheme component which does not allow escape sequences.
* This encoding may produce redundant escape sequences, however.
*/
ANY(DEFAULT_LEGAL_CHARS),
/** Encoding for exclusive use with the URI authority component. */
AUTHORITY(DEFAULT_LEGAL_CHARS + ":[]"),
/**
* Encoding for exclusive use with the URI path component
* where the path may contain arbitrary characters.
* This encoding may produce redundant escape sequences for absolute
* paths, however.
*
* @see #ABSOLUTE_PATH
*/
PATH(DEFAULT_LEGAL_CHARS + "/"),
/**
* Encoding for exclusive use with the URI path component
* where the path starts with the separator character {@code '/'}.
*
* @see #PATH
*/
ABSOLUTE_PATH(DEFAULT_LEGAL_CHARS + ":/"),
/** Encoding for exclusive use with the URI query component. */
QUERY(DEFAULT_LEGAL_CHARS + ":/?"),
/** Encoding for exclusive use with the URI fragment component. */
FRAGMENT(DEFAULT_LEGAL_CHARS + ":/?");
private final String[] escapes = new String[0x80];
private Encoding(final String legal) {
// Populate table of getEscapeSequence sequences.
final StringBuilder sb = new StringBuilder();
for (char c = 0; c < 0x80; c++) {
if (legal.indexOf(c) >= 0)
continue;
sb.setLength(0);
quote(c, sb);
escapes[c] = sb.toString();
}
}
}
}