All Downloads are FREE. Search and download functionalities are using the official Maven repository.

net.java.truecommons.shed.UriEncoder Maven / Gradle / Ivy

/*
 * Copyright (C) 2005-2012 Schlichtherle IT Services.
 * All rights reserved. Use is subject to license terms.
 */
package net.java.truecommons.shed;

import java.net.URISyntaxException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import static java.nio.charset.CoderResult.OVERFLOW;
import static java.nio.charset.CoderResult.UNDERFLOW;
import javax.annotation.CheckForNull;
import javax.annotation.concurrent.NotThreadSafe;

/**
 * Escapes illegal characters in URI components according to
 * RFC 2396
 * and its updates in
 * RFC 2732
 * for IPv6 addresses.
 *
 * @see 
 *      RFC 2396: Uniform Resource Identifiers (URI): Generic Syntax
 * @see 
 *      RFC 2732: Format for Literal IPv6 Addresses in URL's
 * @see UriBuilder
 * @see UriDecoder
 * @author Christian Schlichtherle
 */
@NotThreadSafe
final class UriEncoder {

    /** The default character set. */
    public static final Charset UTF8 = Charset.forName("UTF-8");

    private static final char[] HEX = {
        '0', '1', '2', '3', '4', '5', '6', '7',
        '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
    };

    private static final String
            ALPHANUM_CHARS = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
    private static final String MARK_CHARS = "-_.!~*'()";
    private static final String
            DEFAULT_LEGAL_CHARS = ALPHANUM_CHARS + MARK_CHARS + ",;$&+=@";

    private final CharsetEncoder encoder;
    private final boolean encode;
    private final boolean raw;
    private @CheckForNull StringBuilder stringBuilder;

    /**
     * Constructs a new URI encoder which uses the UTF-8 character set to
     * escape non-US-ASCII characters.
     * Equivalent to {@link #UriEncoder(Charset, boolean) UriEncoder(UTF8, false)}.
     */
    UriEncoder() {
        this(UTF8, false);
    }

    /**
     * Constructs a new URI codec which uses the UTF-8 character set to encode
     * non-US-ASCII characters.
     * Equivalent to {@link #UriEncoder(Charset, boolean) UriEncoder(UTF8, false)}.
     * 
     * @param raw If {@code true}, then the {@code '%'} character doesn't get
     *        quoted.
     */
    UriEncoder(boolean raw) {
        this(UTF8, raw);
    }

    /**
     * Constructs a new URI codec which uses the given character set to encode
     * non-US-ASCII characters.
     * Equivalent to {@link #UriEncoder(Charset, boolean) UriEncoder(charset, false)}.
     * 
     * @param charset the character set to use for encoding non-US-ASCII
     *        characters.
     *        If this parameter is {@code null},
     *        then non-US-ASCII characters will get encoded to {@code UTF-8}
     *        if and only if {@link Character#isISOControl(char)} or
     *        {@link Character#isSpaceChar(char)} is {@code true},
     *        so that most non-US-ASCII character would get preserved.
     *        Note that providing any other value than {@code null} or
     *        {@code UTF-8} will void interoperability with most applications.
     */
    UriEncoder(@CheckForNull Charset charset) {
        this(charset, false);
    }

    /**
     * Constructs a new URI codec which uses the given character set to escape
     * non-US-ASCII characters.
     * 

* * @param charset the character set to use for encoding non-US-ASCII * characters. * If this parameter is {@code null}, * then non-US-ASCII characters will get encoded to {@code UTF-8} * if and only if {@link Character#isISOControl(char)} or * {@link Character#isSpaceChar(char)} is {@code true}, * so that most non-US-ASCII character would get preserved. * Note that providing any other value than {@code null} or * {@code UTF-8} will void interoperability with most applications. * @param raw If {@code true}, then the {@code '%'} character doesn't get * quoted. */ UriEncoder(@CheckForNull Charset charset, final boolean raw) { if (!(this.encode = null != charset)) charset = UTF8; this.encoder = charset.newEncoder(); this.raw = raw; } boolean isRaw() { return raw; } /** * Encodes all characters in the string {@code dS} which are illegal within * the URI component {@code comp}. *

* Note that calling this method on an already encoded string escapes any * escape sequences again, that is, each occurence of the character * {@code '%'} is substituted with the string {@code "%25"} again. * * @param dS the decoded string to encode. * @param comp the URI component to encode. * @return The encoded string. * @throws IllegalArgumentException on any encoding error with a * {@link URISyntaxException} as its * {@link IllegalArgumentException#getCause() cause}. * This exception should never occur if the character set of this * codec is UTF-8. */ public String encode(String dS, Encoding comp) { try { StringBuilder eS = encode(dS, comp, null); return null != eS ? eS.toString() : dS; } catch (URISyntaxException ex) { throw new IllegalArgumentException(ex); } } /** * Encodes all characters in the string {@code dS} which are illegal within * the URI component {@code comp} to the string builder {@code eS}. *

* Note that calling this method on an already encoded string escapes * any escape sequences again, that is, each occurence of the character * {@code '%'} is substituted with the string {@code "%25"} again. * * @param dS the decoded string to encode. * @param comp the URI component to encode. * @param eS the string builder to which all encoded characters shall get * appended. * @return If {@code dS} contains only legal characters for the URI * component {@code comp}, then {@code null} gets returned. * Otherwise, if {@code eS} is not {@code null}, then it gets * returned with all encoded characters appended to it. * Otherwise, a temporary string builder gets returned which solely * contains all encoded characters. * This temporary string builder may get cleared and reused upon * the next call to any method of this object. * @throws URISyntaxException on any encoding error. * This exception should never occur if the character set of this * codec is UTF-8. * If it occurs however, {@code eS} is left in an undefined state. */ public @CheckForNull StringBuilder encode( final String dS, final Encoding comp, @CheckForNull StringBuilder eS) // encoded String throws URISyntaxException { final String[] escapes = comp.escapes; final CharBuffer dC = CharBuffer.wrap(dS); // decoded characters ByteBuffer eB = null; // encoded bytes final CharsetEncoder enc = encoder; final boolean encode = this.encode; while (dC.hasRemaining()) { dC.mark(); final char dc = dC.get(); // decoded character if (dc < 0x80) { final String es = escapes[dc]; // escape sequence if (!(null == es || '%' == dc && raw)) { if (null == eB) { if (null == eS) { if (null == (eS = stringBuilder)) eS = stringBuilder = new StringBuilder(); else eS.setLength(0); eS.append(dS, 0, dC.position() - 1); // prefix until current character } eB = ByteBuffer.allocate(3); } eS.append(es); } else if (null != eS) { eS.append(dc); } } else if (Character.isISOControl(dc) || Character.isSpaceChar(dc) || encode) { if (null == eB) { if (null == eS) { if (null == (eS = stringBuilder)) eS = stringBuilder = new StringBuilder(); else eS.setLength(0); eS.append(dS, 0, dC.position() - 1); // prefix until current character } eB = ByteBuffer.allocate(3); } final int p = dC.position(); dC.reset(); dC.limit(p); { // Encode dC -> eB. CoderResult cr; if (UNDERFLOW != (cr = enc.reset().encode(dC, eB, true)) || UNDERFLOW != (cr = enc.flush(eB))) { assert OVERFLOW != cr; throw new QuotedUriSyntaxException(dS, cr.toString()); } } eB.flip(); quote(eB, eS); eB.clear(); dC.limit(dC.capacity()); } else if (null != eS) { eS.append(dc); } } return null == eB ? null : eS; } private static void quote(final char dc, final StringBuilder eS) { quote(UTF8.encode(CharBuffer.wrap(Character.toString(dc))), eS); } private static void quote(final ByteBuffer eB, final StringBuilder eS) { while (eB.hasRemaining()) { final byte eb = eB.get(); eS.append('%'); eS.append(HEX[(eb >> 4) & 0xf]); eS.append(HEX[ eb & 0xf]); } } /** * Defines the escape sequences for illegal characters in various URI * components. */ @SuppressWarnings("PackageVisibleInnerClass") public enum Encoding { /** * Encoding which can be safely used for any URI component, except the * URI scheme component which does not allow escape sequences. * This encoding may produce redundant escape sequences, however. */ ANY(DEFAULT_LEGAL_CHARS), /** Encoding for exclusive use with the URI authority component. */ AUTHORITY(DEFAULT_LEGAL_CHARS + ":[]"), /** * Encoding for exclusive use with the URI path component * where the path may contain arbitrary characters. * This encoding may produce redundant escape sequences for absolute * paths, however. * * @see #ABSOLUTE_PATH */ PATH(DEFAULT_LEGAL_CHARS + "/"), /** * Encoding for exclusive use with the URI path component * where the path starts with the separator character {@code '/'}. * * @see #PATH */ ABSOLUTE_PATH(DEFAULT_LEGAL_CHARS + ":/"), /** Encoding for exclusive use with the URI query component. */ QUERY(DEFAULT_LEGAL_CHARS + ":/?"), /** Encoding for exclusive use with the URI fragment component. */ FRAGMENT(DEFAULT_LEGAL_CHARS + ":/?"); private final String[] escapes = new String[0x80]; private Encoding(final String legal) { // Populate table of getEscapeSequence sequences. final StringBuilder sb = new StringBuilder(); for (char c = 0; c < 0x80; c++) { if (legal.indexOf(c) >= 0) continue; sb.setLength(0); quote(c, sb); escapes[c] = sb.toString(); } } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy