// com.composum.sling.core.util.UrlCodec, from the Maven artifact composum-nodes-commons
// ("general components and objects to use the Sling API").
// Note: this listing is not taken from the newest release; a newer version, 4.3.4, exists.
package com.composum.sling.core.util;

import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import java.nio.BufferOverflowException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.*;
import java.util.Objects;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

/**
 * Codecs for the various URL parts.
 * Unlike {@link org.apache.commons.codec.net.URLCodec} this is focused on Strings
 * and thus the decoder can leave unknown characters untouched: "ä%C3%A4"
 * is decoded to "ää" instead of "?ä" as {@link org.apache.commons.codec.net.URLCodec#decode(String)} would do.
 * <p>
 * Each codec is configured with a regex character class of admissible characters which are passed
 * through untouched when encoding; everything else is percent-encoded using the codec's charset.
 */
public class UrlCodec {

    private static final Logger LOG = LoggerFactory.getLogger(UrlCodec.class);

    /**
     * The characters which can always appear in any URL without being encoded: the "unreserved"
     * chars. Unfortunately there are different recommendations about encoding $!*'(), so we exclude them.
     * Possibly we could include the "extra" chars !*'(), . We exclude ~ since it was declared unsafe in path part of an URL.
     * <p>
     * The value is a fragment of a regex character class; the '-' comes first so that it is always taken
     * literally, no matter what is appended.
     * NOTE(review): value and visibility were reconstructed from the description above (RFC 3986
     * unreserved characters without '~') - confirm against the upstream source.
     */
    protected static final String PART_URL_SAFECHARS = "-a-zA-Z0-9_.";

    /**
     * Codec for the path part of an URL.
     */
    public static final UrlCodec PATH = new UrlCodec(PART_URL_SAFECHARS + "/:@" +
            "!$&'()*+,=" // the sub-delims  https://tools.ietf.org/html/rfc3986#section-2.2
            // We omit ; which is admissible in JCR resources, but is removed from the suffix in
            // SlingPathInfo if not quoted.
            , StandardCharsets.UTF_8);

    /**
     * Codec for the authority of an URL.
     */
    public static final UrlCodec AUTHORITY = new UrlCodec(PART_URL_SAFECHARS +
            "!$&'()*+,;=" // sub-delims
            , StandardCharsets.UTF_8);

    /**
     * Codec for the query part of an URL. When decoding, a '+' is treated as an encoded space.
     */
    public static final UrlCodec QUERYPART = new UrlCodec(PART_URL_SAFECHARS + "/:@"
            + "!$'()*,;" // the sub-delims; we exclude & and = since they are meta-characters here,
            // and also + since that encodes space and would have to be handled differently
            , StandardCharsets.UTF_8) {

        /**
         * Swaps any '+' in the character class for a space: a space may stay in the encoder output
         * (it is turned into '+' by {@link #encodePostprocess(StringBuffer)}), a literal '+' may not.
         */
        @Override
        protected String charsToEncode(String admissibleCharacters) {
            return admissibleCharacters.replaceAll("\\+", " ");
        }

        /**
         * Turns every space that was left unencoded into a '+'.
         */
        @Override
        protected void encodePostprocess(StringBuffer out) {
            for (int i = 0; i < out.length(); ++i) {
                if (out.charAt(i) == ' ') {
                    out.setCharAt(i, '+');
                }
            }
        }

        /**
         * Turns every '+' into the space it stands for before the percent-decoding runs.
         */
        @Override
        protected String decodePreprocess(String encoded) {
            return StringUtils.replaceChars(encoded, '+', ' ');
        }
    };

    /**
     * Codec for the fragment part of an URL.
     */
    public static final UrlCodec FRAGMENT = new UrlCodec(PART_URL_SAFECHARS + "!$&'()*+,;=" +
            "/?:@", StandardCharsets.UTF_8);

    /**
     * Codec for opaque URLs that are not parsed. Contains all unreserved, reserved and extra characters
     */
    public static final UrlCodec OPAQUE = new UrlCodec(PART_URL_SAFECHARS + "$.!*'()," + ";/?:@=&", StandardCharsets.UTF_8);

    /**
     * Matches one or several percent encoded bytes.
     */
    protected static final Pattern PAT_ENCODED_CHARACTERS = Pattern.compile("(%[0-9a-fA-F][0-9a-fA-F])+");

    /**
     * Matches a percent sign followed by something that's not a hexadecimally encoded byte.
     */
    protected static final Pattern PAT_INVALID_ENCODED_CHARACTER = Pattern.compile("%(?![0-9a-fA-F][0-9a-fA-F]).{0,2}");

    /**
     * {@value #INVALID_CHARACTER_MARKER} is inserted whenever something could not be decoded,
     * or sometimes when it's encoded - see {@link #encode(String)}.
     */
    protected static final String INVALID_CHARACTER_MARKER = "\ufffd";

    /**
     * The digits used when writing a percent encoded byte, indexed by their value.
     */
    protected static final String HEXDIGITS = "0123456789ABCDEF";

    /**
     * The charset used to turn characters into the bytes that are percent encoded, and back.
     */
    protected final Charset charset;

    /**
     * The regex character class content (without the brackets) this codec was created with.
     */
    protected final String admissibleCharacters;

    /**
     * Matches one or more characters not in the {@link #admissibleCharacters}.
     */
    protected final Pattern charsToEncodeRegex;
    /**
     * Matches an arbitrarily long sequence of admissible chars and percent encodings.
     */
    protected final Pattern validationRegex;

    /**
     * Lazily computed result of {@link #getInvalidCharacterMarkerForEncoding()}.
     */
    protected transient String invalidCharacterMarkerForEncoding;

    /**
     * Initializes the Codec with a range of admissible characters.
     * <p>
     * Note that the overridable hook {@link #charsToEncode(String)} is called from here, that is, before
     * a subclass had the chance to initialize its own fields.
     *
     * @param admissibleCharacters all characters that remain untouched when encoding, can contain ranges like a-z in simple regex character classes. (Thus, - has to be first or last character if it needs to be included.) The quoting character '%' must never be admissible, since it always has to be encoded.
     * @param charset              the charset needed for the encoder and decoder.
     * @throws IllegalArgumentException if the admissibleCharacters contain '%'
     * @throws PatternSyntaxException   if the admissibleCharacters are not a well formed character class
     */
    public UrlCodec(@NotNull String admissibleCharacters, @NotNull Charset charset) throws IllegalArgumentException, PatternSyntaxException {
        this.charset = Objects.requireNonNull(charset);
        this.admissibleCharacters = Objects.requireNonNull(admissibleCharacters);
        this.charsToEncodeRegex = Pattern.compile("([^" + charsToEncode(admissibleCharacters) + "])+");
        // '%' has to be among the characters to encode, otherwise the encoding would be ambiguous.
        if (!charsToEncodeRegex.matcher("%").matches()) {
            throw new IllegalArgumentException("Quoting character '%' cannot be admissible.");
        }
        this.validationRegex = Pattern.compile("([" + admissibleCharacters + "]|%[0-9a-fA-F][0-9a-fA-F])*");
    }

    /**
     * Hook to calculate the set of characters to encode from the admissibleCharacters: the result is
     * used as the content of the negated character class of {@link #charsToEncodeRegex}.
     * Called from the constructor, so implementations must not rely on instance state.
     */
    protected String charsToEncode(String admissibleCharacters) {
        return admissibleCharacters;
    }

    /**
     * Encodes all characters which are not admissible to percent-encodings wrt. the given charset.
     * If characters are not in the charset, they will silently be encoded as a replacement character,
     * which is either {@value #INVALID_CHARACTER_MARKER} or '?' if one of these is admissible, or the encoding
     * of {@value #INVALID_CHARACTER_MARKER} for the charset (which might be an encoded '?').
     * The rest of the run of inadmissible characters following such a character is replaced as well.
     *
     * @param encoded the raw string to encode (despite its name); null and "" are returned unchanged
     * @return the encoded string
     */
    @Nullable
    public String encode(@Nullable String encoded) {
        return encode(encoded, false);
    }

    /**
     * Encodes all characters which are not admissible to percent-encodings wrt. the given charset.
     * If characters are not in the charset, we will throw an {@link IllegalArgumentException}.
     *
     * @param encoded the raw string to encode (despite its name); null and "" are returned unchanged
     * @return the encoded string
     * @throws IllegalArgumentException if a character cannot be encoded
     */
    @Nullable
    public String encodeValidated(@Nullable String encoded) throws IllegalArgumentException {
        return encode(encoded, true);
    }

    /**
     * Decodes percent encoded characters in the string, never throwing exceptions: if an undecodeable
     * byte sequence is encountered it's replaced with the replacement character {@value #INVALID_CHARACTER_MARKER}.
     * The only exception we make here is that a % sign without a hexadecimal number is passed through unchanged,
     * so that this can be used to preventively decode strings that might be encoded - which is not 100% safe, though, since there might be something looking like a % encoded character: e.g. "an%effect" will be decoded to "an\ufffdfect".
     *
     * @param encoded the string to decode; null and "" are returned unchanged
     * @return the decoded string
     */
    @Nullable
    public String decode(@Nullable String encoded) {
        return decode(encoded, false);
    }

    /**
     * Implementation of {@link #encode(String)} and {@link #encodeValidated(String)}.
     *
     * @param encoded the raw string to encode; null and "" are returned unchanged
     * @param doThrow if true, characters that cannot be encoded in the charset raise an exception instead of
     *                being replaced by the marker of {@link #getInvalidCharacterMarkerForEncoding()}
     * @return the encoded string
     * @throws IllegalArgumentException if doThrow is set and a character cannot be encoded
     */
    @Nullable
    protected String encode(@Nullable String encoded, boolean doThrow) {
        if (encoded == null || encoded.isEmpty()) {
            return encoded;
        }
        Matcher matcher = charsToEncodeRegex.matcher(encoded);
        ByteBuffer bytes = ByteBuffer.allocate(100);
        CharsetEncoder charsetEncoder = charset.newEncoder();
        StringBuffer out = new StringBuffer();
        while (matcher.find()) { // found some not admissible characters we need to encode
            // copies the admissible characters before the match and drops the match itself
            matcher.appendReplacement(out, "");

            CharSequence match = encoded.subSequence(matcher.start(), matcher.end());
            CharBuffer matchBuffer;
            boolean overflow;
            boolean error;
            do {
                bytes.clear();
                charsetEncoder.reset();
                matchBuffer = CharBuffer.wrap(match);
                CoderResult encodeResult = charsetEncoder.encode(matchBuffer, bytes, true);
                CoderResult flushResult = charsetEncoder.flush(bytes);
                overflow = encodeResult.isOverflow() || flushResult.isOverflow();
                error = encodeResult.isError() || flushResult.isError();

                if (overflow) { // enlarge byte buffer and try again
                    bytes = ByteBuffer.allocate((int) Math.max(2 * bytes.capacity(),
                            match.length() * charsetEncoder.maxBytesPerChar() * 1.2
                    ));
                }
            } while (overflow);

            // percent encode the bytes encoded from the not admissible characters
            bytes.flip();
            writePercentEncoded(bytes, out);

            if (error) {
                LOG.debug("Could not encode {} to {}", matcher.group(), charset.name());
                if (doThrow) {
                    throw new IllegalArgumentException("Could not encode " + matcher.group());
                } else {
                    // the encoder stopped at the offending character: one marker for each character
                    // of the match it did not consume
                    out.append(StringUtils.repeat(getInvalidCharacterMarkerForEncoding(),
                            matcher.end() - matcher.start() - matchBuffer.position()));
                }
            }
        }
        matcher.appendTail(out);
        encodePostprocess(out);
        return out.toString();
    }

    /**
     * Hook for finalizing encoding: called with the complete encoded output, which can be modified in place.
     */
    protected void encodePostprocess(StringBuffer out) {
    }

    /**
     * Appends each remaining byte of the buffer as '%' followed by two uppercase hexadecimal digits.
     *
     * @param bytes the bytes to write, read from the buffer's position up to its limit
     * @param out   where the percent encoded bytes are appended
     */
    protected void writePercentEncoded(ByteBuffer bytes, StringBuffer out) {
        while (bytes.hasRemaining()) {
            int unsigned = bytes.get() & 0xff; // bytes are signed in Java
            out.append('%')
                    .append(HEXDIGITS.charAt(unsigned >> 4))
                    .append(HEXDIGITS.charAt(unsigned & 0x0f));
        }
    }

    /**
     * To mark characters that could not properly be encoded, we use {@value #INVALID_CHARACTER_MARKER} or ? if
     * one of these is admissible, or {@value #INVALID_CHARACTER_MARKER} encoded if that belongs to the charset, or ? encoded if
     * it's not. The value is computed on first use and then kept.
     */
    protected String getInvalidCharacterMarkerForEncoding() {
        if (invalidCharacterMarkerForEncoding == null) {
            if (!charsToEncodeRegex.matcher(INVALID_CHARACTER_MARKER).matches()) {
                invalidCharacterMarkerForEncoding = INVALID_CHARACTER_MARKER;
            } else if (!charsToEncodeRegex.matcher("?").matches()) {
                invalidCharacterMarkerForEncoding = "?";
            } else {
                ByteBuffer markerBytes = charset.encode(INVALID_CHARACTER_MARKER);
                StringBuffer buf = new StringBuffer();
                writePercentEncoded(markerBytes, buf);
                invalidCharacterMarkerForEncoding = buf.toString();
            }
        }
        return invalidCharacterMarkerForEncoding;
    }

    /**
     * Decodes percent encoded characters in the string but throws an {@link IllegalArgumentException} if the input string is invalid:
     * if it contains an unencoded quoting character % recognizable because it is not followed by a 2 digit hexadecimal number or it does not encode a character in the charset.
     *
     * @param encoded the string to decode; null and "" are returned unchanged
     * @return the decoded string
     * @throws IllegalArgumentException if encoded is not a validly encoded String
     */
    @Nullable
    public String decodeValidated(@Nullable String encoded) throws IllegalArgumentException {
        return decode(encoded, true);
    }

    /**
     * Implementation of {@link #decode(String)} and {@link #decodeValidated(String)}. Each run of percent
     * encoded bytes is decoded with the charset; everything else is copied unchanged.
     *
     * @param encoded the string to decode; it is passed through {@link #decodePreprocess(String)} first
     * @param doThrow if true, invalid input raises an exception; if false each undecodable byte sequence is
     *                replaced by {@value #INVALID_CHARACTER_MARKER} and decoding continues after it
     * @return the decoded string; the preprocessed input itself if that is null, empty or without any '%'
     * @throws IllegalArgumentException if doThrow is set and the input is not validly encoded
     */
    @Nullable
    protected String decode(@Nullable String encoded, boolean doThrow) throws IllegalArgumentException {
        encoded = decodePreprocess(encoded);
        if (encoded == null || encoded.isEmpty() || !encoded.contains("%")) {
            return encoded;
        }
        if (doThrow) {
            Matcher fail = PAT_INVALID_ENCODED_CHARACTER.matcher(encoded);
            if (fail.find()) {
                throw new IllegalArgumentException("Invalid encoded character " + fail.group());
            }
        }
        Matcher m = PAT_ENCODED_CHARACTERS.matcher(encoded);
        // Every encoded byte takes 3 input characters and yields at most one output character or marker.
        CharBuffer out = CharBuffer.allocate(encoded.length() + 100);
        ByteBuffer bytes = ByteBuffer.allocate(100);
        CharsetDecoder charsetDecoder = charset.newDecoder();
        int appended = 0; // index in encoded up to which everything was transferred to out
        try {
            while (m.find()) {
                out.append(encoded, appended, m.start());
                appended = m.end();
                if (bytes.capacity() < (m.end() - m.start()) / 3) {
                    bytes = ByteBuffer.allocate(m.end() - m.start());
                }
                bytes.clear();
                for (int i = m.start() + 1; i < m.end(); i += 3) { // i points to the first digit after each %
                    bytes.put((byte) (16 * unhex(encoded.charAt(i)) + unhex(encoded.charAt(i + 1))));
                }
                charsetDecoder.reset();
                bytes.flip();
                CoderResult result = charsetDecoder.decode(bytes, out, true);
                while (result.isError()) {
                    // throws if doThrow, otherwise writes a marker for the offending bytes
                    checkResult(encoded, doThrow, out, result);
                    // The decoder stops in front of the offending bytes: skip them and go on, so that
                    // valid bytes later in the same run are not lost.
                    bytes.position(Math.min(bytes.limit(), bytes.position() + result.length()));
                    result = charsetDecoder.decode(bytes, out, true);
                }
                checkResult(encoded, doThrow, out, result);
                result = charsetDecoder.flush(out);
                checkResult(encoded, doThrow, out, result);
            }
            out.append(encoded, appended, encoded.length());
        } catch (BufferOverflowException e) { // impossible
            LOG.error("Bug: Buffer overflow in decoding {}", encoded, e);
            if (doThrow) {
                throw e;
            } else {
                return out.flip().toString() + INVALID_CHARACTER_MARKER;
            }
        }
        return out.flip().toString();
    }

    /**
     * Hook to preprocess something about to be decoded. Receives the (possibly null) input of the decoder.
     */
    protected String decodePreprocess(String encoded) {
        return encoded;
    }

    /**
     * Handles the result of a decoder call: an error is either turned into an exception or marked in the
     * output with {@value #INVALID_CHARACTER_MARKER}; an overflow is logged.
     *
     * @param encoded the string that is being decoded, for messages
     * @param doThrow whether an error should raise an exception
     * @param out     the buffer for the decoded output, to which the marker is written
     * @param result  the result to check
     * @throws IllegalArgumentException if doThrow is set and the result is an error
     */
    protected void checkResult(@NotNull String encoded, boolean doThrow, CharBuffer out, CoderResult result) throws IllegalArgumentException {
        if (result.isError()) {
            if (doThrow) {
                try {
                    result.throwException();
                } catch (CharacterCodingException e) {
                    throw new IllegalArgumentException(e);
                }
            } else {
                out.put(INVALID_CHARACTER_MARKER);
            }
        }
        if (result.isOverflow()) {
            LOG.error("Bug: overflow when decoding {}", encoded);
        }
    }

    /**
     * Returns the value (0 to 15) of a hexadecimal digit, which can be upper or lower case.
     *
     * @throws IllegalArgumentException if c is not a hexadecimal digit
     */
    protected byte unhex(char c) {
        if (c >= '0' && c <= '9') return (byte) (c - '0');
        if (c >= 'a' && c <= 'f') return (byte) (10 + c - 'a');
        if (c >= 'A' && c <= 'F') return (byte) (10 + c - 'A');
        throw new IllegalArgumentException("Invalid hex char " + c);
    }

    /**
     * Verifies that the given String is encoded: all characters are admissible, % is always followed by a
     * two digit hexadecimal number, and the encoded bytes can be decoded with the charset.
     *
     * @param encoded the string to check; null and "" are considered valid
     * @return true if the string is validly encoded
     */
    public boolean isValid(@Nullable String encoded) {
        if (encoded == null || encoded.isEmpty()) {
            return true;
        }
        if (!validationRegex.matcher(encoded).matches()) {
            if (LOG.isDebugEnabled()) {
                // find the valid prefix again to be able to report where the trouble starts
                Matcher m = validationRegex.matcher(encoded);
                if (m.lookingAt()) { // happens always
                    String invalidChars = StringUtils.abbreviate(encoded.substring(m.end()), 4);
                    LOG.debug("Inadmissible character(s) at {} in input {}", invalidChars, encoded);
                }
            }
            return false;
        }
        if (!encoded.contains("%")) {
            return true;
        }
        Matcher matcher = PAT_INVALID_ENCODED_CHARACTER.matcher(encoded);
        if (matcher.find()) {
            LOG.debug("Invalidly encoded character {} in input {}", matcher.group(), encoded);
            return false;
        }
        try { // check whether there are characters in there that do not belong to our charset
            decode(encoded, true);
        } catch (IllegalArgumentException e) {
            return false;
        }
        return true;
    }

    @Override
    public String toString() {
        return "UrlCodec{" +
                "charset=" + charset +
                ", admissibleCharacters='" + admissibleCharacters + '\'' +
                '}';
    }
}