org.htmlunit.util.UrlUtils Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of xlt Show documentation
XLT (Xceptance LoadTest) is an extensive load and performance test tool developed and maintained by Xceptance.
There is a newer version: 8.4.1
/*
 * Copyright (c) 2002-2023 Gargoyle Software Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.htmlunit.util;

import static java.nio.charset.StandardCharsets.US_ASCII;
import static java.nio.charset.StandardCharsets.UTF_8;

import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLEncoder;
import java.net.URLStreamHandler;
import java.nio.charset.Charset;
import java.util.BitSet;
import java.util.Locale;
import java.util.Objects;

import org.apache.commons.codec.DecoderException;
import org.apache.commons.codec.net.URLCodec;

import org.htmlunit.WebAssert;
import org.htmlunit.protocol.AnyHandler;
import org.htmlunit.protocol.javascript.JavaScriptURLConnection;

/**
 * URL utilities class that makes it easy to create new URLs based off of old URLs
 * without having to assemble or parse them yourself.
 *
 * @author Daniel Gredler
 * @author Martin Tamme
 * @author Sudhan Moghe
 * @author Marc Guillemot
 * @author Ahmed Ashour
 * @author Ronald Brill
 * @author Joerg Werner
 * @author Hartmut Arlt
 */
public final class UrlUtils {

    /** "about". */
    public static final String ABOUT = "about";
    /** "about:". */
    public static final String ABOUT_SCHEME = ABOUT + ":";
    /** "about:blank". */
    public static final String ABOUT_BLANK = ABOUT_SCHEME + "blank";
    /** URL for "about:blank". */
    public static final URL URL_ABOUT_BLANK;

    private static final URLStreamHandler JS_HANDLER;
    private static final URLStreamHandler ABOUT_HANDLER;
    private static final URLStreamHandler DATA_HANDLER;

    private static final BitSet PATH_ALLOWED_CHARS = new BitSet(256);
    private static final BitSet QUERY_ALLOWED_CHARS = new BitSet(256);
    private static final BitSet ANCHOR_ALLOWED_CHARS = new BitSet(256);
    private static final BitSet HASH_ALLOWED_CHARS = new BitSet(256);

    /*
      URI allowed char initialization; based on HttpClient 3.1's URI bit sets.
     */
    static {
        // make sure the handlers are available first (before calling toUrlSafe())
        JS_HANDLER = new org.htmlunit.protocol.javascript.Handler();
        ABOUT_HANDLER = new org.htmlunit.protocol.about.Handler();
        DATA_HANDLER = new org.htmlunit.protocol.data.Handler();

        try {
            URL_ABOUT_BLANK = new URL(null, ABOUT_BLANK, ABOUT_HANDLER);
        }
        catch (final MalformedURLException e) {
            // should never happen
            throw new RuntimeException(e);
        }

        final BitSet reserved = new BitSet(256);
        reserved.set(';');
        reserved.set('/');
        reserved.set('?');
        reserved.set(':');
        reserved.set('@');
        reserved.set('&');
        reserved.set('=');
        reserved.set('+');
        reserved.set('$');
        reserved.set(',');

        final BitSet mark = new BitSet(256);
        mark.set('-');
        mark.set('_');
        mark.set('.');
        mark.set('!');
        mark.set('~');
        mark.set('*');
        mark.set('\'');
        mark.set('(');
        mark.set(')');

        final BitSet alpha = new BitSet(256);
        for (int i = 'a'; i <= 'z'; i++) {
            alpha.set(i);
        }
        for (int i = 'A'; i <= 'Z'; i++) {
            alpha.set(i);
        }

        final BitSet digit = new BitSet(256);
        for (int i = '0'; i <= '9'; i++) {
            digit.set(i);
        }

        final BitSet alphanumeric = new BitSet(256);
        alphanumeric.or(alpha);
        alphanumeric.or(digit);

        final BitSet unreserved = new BitSet(256);
        unreserved.or(alphanumeric);
        unreserved.or(mark);

        final BitSet hex = new BitSet(256);
        hex.or(digit);
        for (int i = 'a'; i <= 'f'; i++) {
            hex.set(i);
        }
        for (int i = 'A'; i <= 'F'; i++) {
            hex.set(i);
        }

        final BitSet escaped = new BitSet(256);
        escaped.set('%');
        escaped.or(hex);

        final BitSet uric = new BitSet(256);
        uric.or(reserved);
        uric.or(unreserved);
        uric.or(escaped);

        final BitSet pchar = new BitSet(256);
        pchar.or(unreserved);
        pchar.or(escaped);
        pchar.set(':');
        pchar.set('@');
        pchar.set('&');
        pchar.set('=');
        pchar.set('+');
        pchar.set('$');
        pchar.set(',');

        final BitSet segment = new BitSet(256);
        segment.or(pchar);
        segment.set(';');
        segment.or(pchar);

        final BitSet pathSegments = new BitSet(256);
        pathSegments.set('/');
        pathSegments.or(segment);

        final BitSet absPath = new BitSet(256);
        absPath.set('/');
        absPath.or(pathSegments);

        final BitSet allowedAbsPath = new BitSet(256);
        allowedAbsPath.or(absPath);

        final BitSet allowedFragment = new BitSet(256);
        allowedFragment.or(uric);

        final BitSet allowedQuery = new BitSet(256);
        allowedQuery.or(uric);

        final BitSet allowedHash = new BitSet(256);
        allowedHash.or(uric);

        PATH_ALLOWED_CHARS.or(allowedAbsPath);
        QUERY_ALLOWED_CHARS.or(allowedQuery);
        ANCHOR_ALLOWED_CHARS.or(allowedFragment);
        HASH_ALLOWED_CHARS.or(allowedHash);
    }

    /**
     * Disallow instantiation of this class.
     */
    private UrlUtils() {
        // Empty.
    }

    /**
     * Constructs a URL instance based on the specified URL string, taking into account the fact that the
     * specified URL string may represent an "about:..." URL, a "javascript:..." URL, or
     * a data:... URL.
     *
     * The caller should be sure that URL strings passed to this method will parse correctly as URLs, as
     * this method never expects to have to handle {@link MalformedURLException}s.
     *
     * @param url the URL string to convert into a URL instance
     * @return the constructed URL instance
     */
    public static URL toUrlSafe(final String url) {
        try {
            return toUrlUnsafe(url);
        }
        catch (final MalformedURLException e) {
            // Should never happen.
            throw new RuntimeException(e);
        }
    }

    /**
     * Constructs a URL instance based on the specified URL string, taking into account the fact that the
     * specified URL string may represent an "about:..." URL, a "javascript:..." URL, or
     * a data:... URL.
     *
     * Unlike {@link #toUrlSafe(String)}, the caller need not be sure that URL strings passed to this
     * method will parse correctly as URLs.
     *
     * @param url the URL string to convert into a URL instance
     * @return the constructed URL instance
     * @throws MalformedURLException if the URL string cannot be converted to a URL instance
     */
    public static URL toUrlUnsafe(final String url) throws MalformedURLException {
        WebAssert.notNull("url", url);

        final String protocol = org.apache.commons.lang3.StringUtils.substringBefore(url, ":").toLowerCase(Locale.ROOT);

        if (protocol.isEmpty() || UrlUtils.isNormalUrlProtocol(protocol)) {
            final URL response = new URL(url);
            if (response.getProtocol().startsWith("http")
                    && org.apache.commons.lang3.StringUtils.isEmpty(response.getHost())) {
                throw new MalformedURLException("Missing host name in url: " + url);
            }
            return response;
        }

        if (JavaScriptURLConnection.JAVASCRIPT_PREFIX.equals(protocol + ":")) {
            return new URL(null, url, JS_HANDLER);
        }

        if (ABOUT.equals(protocol)) {
            if (org.apache.commons.lang3.StringUtils.equalsIgnoreCase(ABOUT_BLANK, url)) {
                return URL_ABOUT_BLANK;
            }
            return new URL(null, url, ABOUT_HANDLER);
        }

        if ("data".equals(protocol)) {
            return new URL(null, url, DATA_HANDLER);
        }

        return new URL(null, url, AnyHandler.INSTANCE);
    }

    /**
     * Encodes illegal characters in the specified URL's path, query string and anchor according to the URL
     * encoding rules observed in real browsers.
     *
     * For example, this method changes
     * "http://first/?a=b c" to "http://first/?a=b%20c".
     *
     * @param url the URL to encode
     * @param minimalQueryEncoding whether or not to perform minimal query encoding, like IE does
     * @param charset the charset
     * @return the encoded URL
     */
    public static URL encodeUrl(final URL url, final boolean minimalQueryEncoding, final Charset charset) {
        if (!isNormalUrlProtocol(url.getProtocol())) {
            return url; // javascript:, about:, data: and anything not supported like foo:
        }

        try {
            String path = url.getPath();
            if (path != null) {
                path = encode(path, PATH_ALLOWED_CHARS, UTF_8);
            }
            String query = url.getQuery();
            if (query != null) {
                if (minimalQueryEncoding) {
                    query = org.apache.commons.lang3.StringUtils.replace(query, " ", "%20");
                }
                else {
                    query = encode(query, QUERY_ALLOWED_CHARS, charset);
                }
            }
            String anchor = url.getRef();
            if (anchor != null) {
                anchor = encode(anchor, ANCHOR_ALLOWED_CHARS, UTF_8);
            }
            return createNewUrl(url.getProtocol(), url.getUserInfo(), url.getHost(),
                                url.getPort(), path, anchor, query);
        }
        catch (final MalformedURLException e) {
            // Impossible... I think.
            throw new RuntimeException(e);
        }
    }

    /**
     * Encodes and escapes the specified URI anchor string.
     *
     * @param anchor the anchor string to encode and escape
     * @return the encoded and escaped anchor string
     */
    public static String encodeAnchor(final String anchor) {
        if (anchor == null) {
            return null;
        }
        return encode(anchor, ANCHOR_ALLOWED_CHARS, UTF_8);
    }

    /**
     * Encodes and escapes the specified URI hash string.
     *
     * @param hash the anchor string to encode and escape
     * @return the encoded and escaped hash string
     */
    public static String encodeHash(final String hash) {
        if (hash == null) {
            return null;
        }
        return encode(hash, HASH_ALLOWED_CHARS, UTF_8);
    }

    /**
     * Encodes and escapes the specified URI hash string.
     *
     * @param query the query string to encode and escape
     * @return the encoded and escaped hash string
     */
    public static String encodeQuery(final String query) {
        if (query == null) {
            return null;
        }
        return encode(query, QUERY_ALLOWED_CHARS, UTF_8);
    }

    /**
     * Unescapes and decodes the specified string.
     *
     * @param escaped the string to be unescaped and decoded
     * @return the unescaped and decoded string
     */
    public static String decode(final String escaped) {
        try {
            final byte[] bytes = escaped.getBytes(US_ASCII);
            final byte[] bytes2 = URLCodec.decodeUrl(bytes);
            return new String(bytes2, UTF_8);
        }
        catch (final DecoderException e) {
            // Should never happen.
            throw new RuntimeException(e);
        }
    }

    /**
     * Escapes and encodes the specified string. Based on HttpClient 3.1's URIUtil.encode() method.
     *
     * @param unescaped the string to encode
     * @param allowed allowed characters that shouldn't be escaped
     * @param charset the charset to use
     * @return the escaped string
     */
    private static String encode(final String unescaped, final BitSet allowed, final Charset charset) {
        final byte[] bytes = unescaped.getBytes(charset);
        final byte[] bytes2 = URLCodec.encodeUrl(allowed, bytes);
        return encodePercentSign(bytes2);
    }

    /**
     * Encodes every occurrence of the escape character '%' in the given input
     * string that is not followed by two hexadecimal characters.
     * @param input the input bytes
     * @return the given input string where every occurrence of % in
     * invalid escape sequences has been replace by %25
     */
    @SuppressWarnings("PMD.UselessParentheses")
    private static String encodePercentSign(final byte[] input) {
        if (input == null) {
            return null;
        }

        final StringBuilder result = new StringBuilder(new String(input, US_ASCII));
        int state = -0;
        int offset = 0;
        for (int i = 0; i < input.length; i++) {
            final byte b = input[i];
            if (state == 0 && b == '%') {
                state = 1;
            }
            else if (state == 1 || state == 2) {
                if (('0' <= b && b <= '9')
                        || ('A' <= b && b <= 'F')
                        || ('a' <= b && b <= 'f')) {
                    state++;
                    if (state == 3) {
                        state = 0;
                    }
                }
                else {
                    final int st = i - state + offset;
                    result.replace(st, st + 1, "%25");
                    offset = offset + 2;
                    state = b == '%' ? 1 : 0;
                }
            }
        }
        if (state == 1 || state == 2) {
            final int st = input.length - state + offset;
            result.replace(st, st + 1, "%25");
        }
        return result.toString();
    }

    /**
     * Creates and returns a new URL using only the protocol and authority from the given one.
     * @param u the URL on which to base the returned URL
     * @return a new URL using only the protocol and authority from the given one
     * @throws MalformedURLException if there is a problem creating the new URL
     */
    public static URL getUrlWithoutPathRefQuery(final URL u) throws MalformedURLException {
        return createNewUrl(u.getProtocol(), u.getAuthority(), null, null, null);
    }

    /**
     * Creates and returns a new URL using only the protocol, authority and path
     * from the given one.
     * @param u the URL on which to base the returned URL
     * @return a new URL using only the protocol and authority from the given one
     * @throws MalformedURLException if there is a problem creating the new URL
     */
    public static URL getUrlWithoutRef(final URL u) throws MalformedURLException {
        return createNewUrl(u.getProtocol(), u.getAuthority(), u.getPath(), null, u.getQuery());
    }

    /**
     * Creates and returns a new URL identical to the specified URL, except using the specified protocol.
     * @param u the URL on which to base the returned URL
     * @param newProtocol the new protocol to use in the returned URL
     * @return a new URL identical to the specified URL, except using the specified protocol
     * @throws MalformedURLException if there is a problem creating the new URL
     */
    public static URL getUrlWithNewProtocol(final URL u, final String newProtocol) throws MalformedURLException {
        return createNewUrl(newProtocol, u.getAuthority(), u.getPath(), u.getRef(), u.getQuery());
    }

    /**
     * Creates and returns a new URL identical to the specified URL, except using the specified host.
     * @param u the URL on which to base the returned URL
     * @param newHost the new host to use in the returned URL
     * @return a new URL identical to the specified URL, except using the specified host
     * @throws MalformedURLException if there is a problem creating the new URL
     */
    public static URL getUrlWithNewHost(final URL u, final String newHost)
        throws MalformedURLException {
        return createNewUrl(u.getProtocol(), u.getUserInfo(), newHost,
                            u.getPort(), u.getPath(), u.getRef(), u.getQuery());
    }

    /**
     * Creates and returns a new URL identical to the specified URL, except using the specified host.
     * @param u the URL on which to base the returned URL
     * @param newHost the new host to use in the returned URL
     * @param newPort the new port to use in the returned URL
     * @return a new URL identical to the specified URL, except using the specified host
     * @throws MalformedURLException if there is a problem creating the new URL
     */
    public static URL getUrlWithNewHostAndPort(final URL u, final String newHost, final int newPort)
        throws MalformedURLException {
        return createNewUrl(u.getProtocol(), u.getUserInfo(), newHost, newPort, u.getPath(), u.getRef(), u.getQuery());
    }

    /**
     * Creates and returns a new URL identical to the specified URL, except using the specified port.
     * @param u the URL on which to base the returned URL
     * @param newPort the new port to use in the returned URL or -1 to remove it
     * @return a new URL identical to the specified URL, except using the specified port
     * @throws MalformedURLException if there is a problem creating the new URL
     */
    public static URL getUrlWithNewPort(final URL u, final int newPort) throws MalformedURLException {
        return createNewUrl(u.getProtocol(), u.getUserInfo(), u.getHost(),
                            newPort, u.getPath(), u.getRef(), u.getQuery());
    }

    /**
     * Creates and returns a new URL identical to the specified URL, except using the specified path.
     * @param u the URL on which to base the returned URL
     * @param newPath the new path to use in the returned URL
     * @return a new URL identical to the specified URL, except using the specified path
     * @throws MalformedURLException if there is a problem creating the new URL
     */
    public static URL getUrlWithNewPath(final URL u, final String newPath) throws MalformedURLException {
        return createNewUrl(u.getProtocol(), u.getAuthority(), newPath, u.getRef(), u.getQuery());
    }

    /**
     * Creates and returns a new URL identical to the specified URL, except using the specified reference.
     * @param u the URL on which to base the returned URL
     * @param newRef the new reference to use in the returned URL or null to remove it
     * @return a new URL identical to the specified URL, except using the specified reference
     * @throws MalformedURLException if there is a problem creating the new URL
     */
    public static URL getUrlWithNewRef(final URL u, final String newRef) throws MalformedURLException {
        return createNewUrl(u.getProtocol(), u.getAuthority(), u.getPath(), newRef, u.getQuery());
    }

    /**
     * Creates and returns a new URL identical to the specified URL, except using the specified query string.
     * @param u the URL on which to base the returned URL
     * @param newQuery the new query string to use in the returned URL
     * @return a new URL identical to the specified URL, except using the specified query string
     * @throws MalformedURLException if there is a problem creating the new URL
     */
    public static URL getUrlWithNewQuery(final URL u, final String newQuery) throws MalformedURLException {
        return createNewUrl(u.getProtocol(), u.getAuthority(), u.getPath(), u.getRef(), newQuery);
    }

    /**
     * Creates and returns a new URL identical to the specified URL, ignoring path, protocol and query.
     * @param u the URL on which to base the returned URL
     * @return a new URL identical to the specified URL, ignoring path, protocol and query
     * @throws MalformedURLException if there is a problem creating the new URL
     */
    public static URL getUrlWithProtocolAndAuthority(final URL u) throws MalformedURLException {
        return createNewUrl(u.getProtocol(), u.getAuthority(), null, null, null);
    }

    /**
     * Creates and returns a new URL identical to the specified URL but with a changed user name.
     * @param u the URL on which to base the returned URL
     * @param newUserName the new user name or null to remove it
     * @return a new URL identical to the specified URL; only user name updated
     * @throws MalformedURLException if there is a problem creating the new URL
     */
    public static URL getUrlWithNewUserName(final URL u, final String newUserName) throws MalformedURLException {
        String newUserInfo = newUserName == null ? "" : newUserName;
        final String userInfo = u.getUserInfo();
        if (org.apache.commons.lang3.StringUtils.isNotBlank(userInfo)) {
            final int colonIdx = userInfo.indexOf(':');
            if (colonIdx > -1) {
                newUserInfo = newUserInfo + userInfo.substring(colonIdx);
            }
        }
        return createNewUrl(u.getProtocol(), newUserInfo.isEmpty() ? null : newUserInfo,
                u.getHost(), u.getPort(), u.getPath(), u.getRef(), u.getQuery());
    }

    /**
     * Creates and returns a new URL identical to the specified URL but with a changed user password.
     * @param u the URL on which to base the returned URL
     * @param newUserPassword the new user password or null to remove it
     * @return a new URL identical to the specified URL; only user name updated
     * @throws MalformedURLException if there is a problem creating the new URL
     */
    public static URL getUrlWithNewUserPassword(final URL u, final String newUserPassword)
            throws MalformedURLException {
        String newUserInfo = newUserPassword == null ? "" : ':' + newUserPassword;
        final String userInfo = u.getUserInfo();
        if (org.apache.commons.lang3.StringUtils.isNotBlank(userInfo)) {
            final int colonIdx = userInfo.indexOf(':');
            if (colonIdx > -1) {
                newUserInfo = userInfo.substring(0, colonIdx) + newUserInfo;
            }
            else {
                newUserInfo = userInfo + newUserInfo;
            }
        }
        return createNewUrl(u.getProtocol(), newUserInfo.isEmpty() ? null : newUserInfo,
                u.getHost(), u.getPort(), u.getPath(), u.getRef(), u.getQuery());
    }

    /**
     * Creates a new URL based on the specified fragments.
     * @param protocol the protocol to use (may not be {@code null})
     * @param userInfo the user info to use (may be {@code null})
     * @param host the host to use (may not be {@code null})
     * @param port the port to use (may be -1 if no port is specified)
     * @param path the path to use (may be {@code null} and may omit the initial '/')
     * @param ref the reference to use (may be {@code null} and must not include the '#')
     * @param query the query to use (may be {@code null} and must not include the '?')
     * @return a new URL based on the specified fragments
     * @throws MalformedURLException if there is a problem creating the new URL
     */
    private static URL createNewUrl(final String protocol, final String userInfo, final String host, final int port,
            final String path, final String ref, final String query) throws MalformedURLException {
        final StringBuilder s = new StringBuilder();
        s.append(protocol).append("://");
        if (userInfo != null) {
            s.append(userInfo).append('@');
        }
        s.append(host);
        if (port != -1) {
            s.append(':').append(port);
        }
        if (path != null && !path.isEmpty()) {
            if ('/' != path.charAt(0)) {
                s.append('/');
            }
            s.append(path);
        }
        if (query != null) {
            s.append('?').append(query);
        }
        if (ref != null) {
            if (ref.isEmpty() || ref.charAt(0) != '#') {
                s.append('#');
            }
            s.append(ref);
        }

        return new URL(s.toString());
    }

    /**
     * Creates a new URL based on the specified fragments.
     * @param protocol the protocol to use (may not be {@code null})
     * @param authority the authority to use (may not be {@code null})
     * @param path the path to use (may be {@code null} and may omit the initial '/')
     * @param ref the reference to use (may be {@code null} and must not include the '#')
     * @param query the query to use (may be {@code null} and must not include the '?')
     * @return a new URL based on the specified fragments
     * @throws MalformedURLException if there is a problem creating the new URL
     */
    private static URL createNewUrl(final String protocol, final String authority,
            final String path, final String ref, final String query) throws MalformedURLException {

        // pre-compute length of StringBuilder
        int len = protocol.length() + 1;
        if (authority != null && !authority.isEmpty()) {
            len += 2 + authority.length();
        }
        if (path != null) {
            len += path.length();
        }
        if (query != null) {
            len += 1 + query.length();
        }
        if (ref != null) {
            len += 1 + ref.length();
        }

        final StringBuilder s = new StringBuilder(len);
        s.append(protocol).append(':');
        if (authority != null && !authority.isEmpty()) {
            s.append("//");
            s.append(authority);
        }
        if (path != null) {
            s.append(path);
        }
        if (query != null) {
            s.append('?');
            s.append(query);
        }
        if (ref != null) {
            if (ref.isEmpty() || ref.charAt(0) != '#') {
                s.append('#');
            }
            s.append(ref);
        }

        return toUrlSafe(s.toString());
    }

    /**
     * Resolves a given relative URL against a base URL. See
     * RFC1808
     * Section 4 for more details.
     *
     * @param baseUrl     The base URL in which to resolve the specification.
     * @param relativeUrl The relative URL to resolve against the base URL.
     * @return the resolved specification.
     */
    public static String resolveUrl(final String baseUrl, final String relativeUrl) {
        if (baseUrl == null) {
            throw new IllegalArgumentException("Base URL must not be null");
        }
        if (relativeUrl == null) {
            throw new IllegalArgumentException("Relative URL must not be null");
        }
        final Url url = resolveUrl(parseUrl(baseUrl), relativeUrl);

        return url.toString();
    }

    /**
     * Resolves a given relative URL against a base URL. See
     * RFC1808
     * Section 4 for more details.
     *
     * @param baseUrl     The base URL in which to resolve the specification.
     * @param relativeUrl The relative URL to resolve against the base URL.
     * @return the resolved specification.
     */
    public static String resolveUrl(final URL baseUrl, final String relativeUrl) {
        if (baseUrl == null) {
            throw new IllegalArgumentException("Base URL must not be null");
        }
        return resolveUrl(baseUrl.toExternalForm(), relativeUrl);
    }

    /**
     * Parses a given specification using the algorithm depicted in
     * RFC1808:
     * 
     * Section 2.4: Parsing a URL
     * 

     *   An accepted method for parsing URLs is useful to clarify the
     *   generic-RL syntax of Section 2.2 and to describe the algorithm for
     *   resolving relative URLs presented in Section 4. This section
     *   describes the parsing rules for breaking down a URL (relative or
     *   absolute) into the component parts described in Section 2.1.  The
     *   rules assume that the URL has already been separated from any
     *   surrounding text and copied to a "parse string". The rules are
     *   listed in the order in which they would be applied by the parser.
     *
     * @param spec The specification to parse.
     * @return the parsed specification.
     */
    @SuppressWarnings("PMD.UselessParentheses")
    private static Url parseUrl(String spec) {
        final Url url = new Url();
        int startIndex = 0;
        int endIndex = spec.length();

        // see https://url.spec.whatwg.org/#concept-basic-url-parser
        //   * If input contains any leading or trailing C0 control or space, validation error.
        //     Remove any leading and trailing C0 control or space from input.
        //   * If input contains any ASCII tab or newline, validation error.
        //     Remove all ASCII tab or newline from input.

        if (endIndex > startIndex) {
            StringBuilder sb = null;
            boolean before = true;
            int trailing = 0;

            for (int i = 0; i < endIndex; i++) {
                final char c = spec.charAt(i);
                boolean remove = false;

                if (c == '\t' | c == '\r' | c == '\n') {
                    remove = true;
                }
                else if ('\u0000' <= c && c <= '\u0020') {
                    if (before) {
                        remove = true;
                    }
                    else {
                        trailing++;
                    }
                }
                else {
                    before &= false;
                    trailing = 0;
                }

                if (remove) {
                    if (sb == null) {
                        sb = new StringBuilder(spec.substring(0, i));
                    }
                }
                else if (sb != null) {
                    sb.append(c);
                }
            }

            if (sb == null) {
                if (trailing > 0) {
                    endIndex = spec.length() - trailing;
                    spec = spec.substring(0, endIndex);
                }
            }
            else {
                if (trailing > 0) {
                    spec = sb.substring(0, sb.length() - trailing);
                }
                else {
                    spec = sb.toString();
                }
                endIndex = spec.length();
            }
        }

        // Section 2.4.1: Parsing the Fragment Identifier
        //
        //   If the parse string contains a crosshatch "#" character, then the
        //   substring after the first (left-most) crosshatch "#" and up to the
        //   end of the parse string is the  identifier. If the
        //   crosshatch is the last character, or no crosshatch is present, then
        //   the fragment identifier is empty. The matched substring, including
        //   the crosshatch character, is removed from the parse string before
        //   continuing.
        //
        //   Note that the fragment identifier is not considered part of the URL.
        //   However, since it is often attached to the URL, parsers must be able
        //   to recognize and set aside fragment identifiers as part of the
        //   process.
        final int crosshatchIndex = StringUtils.indexOf(spec, '#', startIndex, endIndex);

        if (crosshatchIndex >= 0) {
            url.fragment_ = spec.substring(crosshatchIndex + 1, endIndex);
            endIndex = crosshatchIndex;
        }
        // Section 2.4.2: Parsing the Scheme
        //
        //   If the parse string contains a colon ":" after the first character
        //   and before any characters not allowed as part of a scheme name (i.e.,
        //   any not an alphanumeric, plus "+", period ".", or hyphen "-"), the
        //    of the URL is the substring of characters up to but not
        //   including the first colon. These characters and the colon are then
        //   removed from the parse string before continuing.
        final int colonIndex = StringUtils.indexOf(spec, ':', startIndex, endIndex);

        if (colonIndex > 0) {
            final String scheme = spec.substring(startIndex, colonIndex);
            if (isValidScheme(scheme)) {
                url.scheme_ = scheme;
                startIndex = colonIndex + 1;
            }
        }
        // Section 2.4.3: Parsing the Network Location/Login
        //
        //   If the parse string begins with a double-slash "//", then the
        //   substring of characters after the double-slash and up to, but not
        //   including, the next slash "/" character is the network location/login
        //   () of the URL. If no trailing slash "/" is present, the
        //   entire remaining parse string is assigned to . The double-
        //   slash and  are removed from the parse string before
        //   continuing.
        //
        // Note: We also accept a question mark "?" or a semicolon ";" character as
        //       delimiters for the network location/login () of the URL.
        final int locationStartIndex;
        int locationEndIndex;

        if (spec.startsWith("//", startIndex)) {
            locationStartIndex = startIndex + 2;
            locationEndIndex = StringUtils.indexOf(spec, '/', locationStartIndex, endIndex);
            if (locationEndIndex >= 0) {
                startIndex = locationEndIndex;
            }
        }
        else {
            locationStartIndex = -1;
            locationEndIndex = -1;
        }
        // Section 2.4.4: Parsing the Query Information
        //
        //   If the parse string contains a question mark "?" character, then the
        //   substring after the first (left-most) question mark "?" and up to the
        //   end of the parse string is the  information. If the question
        //   mark is the last character, or no question mark is present, then the
        //   query information is empty. The matched substring, including the
        //   question mark character, is removed from the parse string before
        //   continuing.
        final int questionMarkIndex = StringUtils.indexOf(spec, '?', startIndex, endIndex);

        if (questionMarkIndex >= 0) {
            if ((locationStartIndex >= 0) && (locationEndIndex < 0)) {
                // The substring of characters after the double-slash and up to, but not
                // including, the question mark "?" character is the network location/login
                // () of the URL.
                locationEndIndex = questionMarkIndex;
                startIndex = questionMarkIndex;
            }
            url.query_ = spec.substring(questionMarkIndex + 1, endIndex);
            endIndex = questionMarkIndex;
        }
        // Section 2.4.5: Parsing the Parameters
        //
        //   If the parse string contains a semicolon ";" character, then the
        //   substring after the first (left-most) semicolon ";" and up to the end
        //   of the parse string is the parameters (). If the semicolon
        //   is the last character, or no semicolon is present, then  is
        //   empty. The matched substring, including the semicolon character, is
        //   removed from the parse string before continuing.
        final int semicolonIndex = StringUtils.indexOf(spec, ';', startIndex, endIndex);

        if (semicolonIndex >= 0) {
            if ((locationStartIndex >= 0) && (locationEndIndex < 0)) {
                // The substring of characters after the double-slash and up to, but not
                // including, the semicolon ";" character is the network location/login
                // () of the URL.
                locationEndIndex = semicolonIndex;
                startIndex = semicolonIndex;
            }
            url.parameters_ = spec.substring(semicolonIndex + 1, endIndex);
            endIndex = semicolonIndex;
        }
        // Section 2.4.6: Parsing the Path
        //
        //   After the above steps, all that is left of the parse string is the
        //   URL  and the slash "/" that may precede it. Even though the
        //   initial slash is not part of the URL path, the parser must remember
        //   whether or not it was present so that later processes can
        //   differentiate between relative and absolute paths. Often this is
        //   done by simply storing the preceding slash along with the path.
        if ((locationStartIndex >= 0) && (locationEndIndex < 0)) {
            // The entire remaining parse string is assigned to the network
            // location/login () of the URL.
            locationEndIndex = endIndex;
        }
        else if (startIndex < endIndex) {
            url.path_ = spec.substring(startIndex, endIndex);
        }
        // Set the network location/login () of the URL.
        if ((locationStartIndex >= 0) && (locationEndIndex >= 0)) {
            url.location_ = spec.substring(locationStartIndex, locationEndIndex);
        }
        return url;
    }

    /**
     * Returns true if specified string is a valid scheme name.
     * 

     * https://tools.ietf.org/html/rfc1738
     * 

     * Scheme names consist of a sequence of characters. The lower case
     * letters "a"--"z", digits, and the characters plus ("+"), period
     * ("."), and hyphen ("-") are allowed. For resiliency, programs
     * interpreting URLs should treat upper case letters as equivalent to
     * lower case in scheme names (e.g., allow "HTTP" as well as "http").
     *
     * @param scheme the scheme string to check
     * @return true if valid
     */
    @SuppressWarnings("PMD.UselessParentheses")
    public static boolean isValidScheme(final String scheme) {
        final int length = scheme.length();
        if (length < 1) {
            return false;
        }

        char c = scheme.charAt(0);
        boolean isValid = ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z');
        if (!isValid) {
            return false;
        }

        for (int i = 1; i < length; i++) {
            c = scheme.charAt(i);
            isValid =
                    ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z')
                    || ('0' <= c && c <= '9')
                    || c == '+'
                    || c == '.'
                    || c == '-';
            if (!isValid) {
                return false;
            }
        }

        return true;
    }

    /**
     * Resolves a given relative URL against a base URL using the algorithm
     * depicted in RFC1808:
     * 

     * Section 4: Resolving Relative URLs
     * 
     *   This section describes an example algorithm for resolving URLs within
     *   a context in which the URLs may be relative, such that the result is
     *   always a URL in absolute form. Although this algorithm cannot
     *   guarantee that the resulting URL will equal that intended by the
     *   original author, it does guarantee that any valid URL (relative or
     *   absolute) can be consistently transformed to an absolute form given a
     *   valid base URL.
     *
     * @param baseUrl     The base URL in which to resolve the specification.
     * @param relativeUrl The relative URL to resolve against the base URL.
     * @return the resolved specification.
     */
    private static Url resolveUrl(final Url baseUrl, final String relativeUrl) {
        final Url url = parseUrl(relativeUrl);
        // Step 1: The base URL is established according to the rules of
        //         Section 3.  If the base URL is the empty string (unknown),
        //         the embedded URL is interpreted as an absolute URL and
        //         we are done.
        if (baseUrl == null) {
            return url;
        }
        // Step 2: Both the base and embedded URLs are parsed into their
        //         component parts as described in Section 2.4.
        //      a) If the embedded URL is entirely empty, it inherits the
        //         entire base URL (i.e., is set equal to the base URL)
        //         and we are done.
        if (relativeUrl.isEmpty()) {
            return new Url(baseUrl);
        }
        //      b) If the embedded URL starts with a scheme name, it is
        //         interpreted as an absolute URL and we are done.
        if (url.scheme_ != null) {
            return url;
        }
        //      c) Otherwise, the embedded URL inherits the scheme of
        //         the base URL.
        url.scheme_ = baseUrl.scheme_;
        // Step 3: If the embedded URL's  is non-empty, we skip to
        //         Step 7.  Otherwise, the embedded URL inherits the 
        //         (if any) of the base URL.
        if (url.location_ != null) {
            return url;
        }
        url.location_ = baseUrl.location_;
        // Step 4: If the embedded URL path is preceded by a slash "/", the
        //         path is not relative and we skip to Step 7.
        if (url.path_ != null && !url.path_.isEmpty() && url.path_.charAt(0) == '/') {
            url.path_ = removeLeadingSlashPoints(url.path_);
            return url;
        }
        // Step 5: If the embedded URL path is empty (and not preceded by a
        //         slash), then the embedded URL inherits the base URL path,
        //         and
        if (url.path_ == null) {
            url.path_ = baseUrl.path_;
            //  a) if the embedded URL's  is non-empty, we skip to
            //     step 7; otherwise, it inherits the  of the base
            //     URL (if any) and
            if (url.parameters_ != null) {
                return url;
            }
            url.parameters_ = baseUrl.parameters_;
            //  b) if the embedded URL's  is non-empty, we skip to
            //     step 7; otherwise, it inherits the  of the base
            //     URL (if any) and we skip to step 7.
            if (url.query_ != null) {
                return url;
            }
            url.query_ = baseUrl.query_;
            return url;
        }
        // Step 6: The last segment of the base URL's path (anything
        //         following the rightmost slash "/", or the entire path if no
        //         slash is present) is removed and the embedded URL's path is
        //         appended in its place.  The following operations are
        //         then applied, in order, to the new path:
        final String basePath = baseUrl.path_;
        String path = "";

        if (basePath == null) {
            path = "/";
        }
        else {
            final int lastSlashIndex = basePath.lastIndexOf('/');

            if (lastSlashIndex >= 0) {
                path = basePath.substring(0, lastSlashIndex + 1);
            }
        }

        path = path.concat(url.path_);
        //      a) All occurrences of "./", where "." is a complete path
        //         segment, are removed.
        int pathSegmentIndex;

        while ((pathSegmentIndex = path.indexOf("/./")) >= 0) {
            path = path.substring(0, pathSegmentIndex + 1).concat(path.substring(pathSegmentIndex + 3));
        }
        //      b) If the path ends with "." as a complete path segment,
        //         that "." is removed.
        if (path.endsWith("/.")) {
            path = path.substring(0, path.length() - 1);
        }
        //      c) All occurrences of "/../", where  is a
        //         complete path segment not equal to "..", are removed.
        //         Removal of these path segments is performed iteratively,
        //         removing the leftmost matching pattern on each iteration,
        //         until no matching pattern remains.
        while ((pathSegmentIndex = path.indexOf("/../")) > 0) {
            final String pathSegment = path.substring(0, pathSegmentIndex);
            final int slashIndex = pathSegment.lastIndexOf('/');

            if (slashIndex >= 0) {
                if (!"..".equals(pathSegment.substring(slashIndex))) {
                    path = path.substring(0, slashIndex + 1).concat(path.substring(pathSegmentIndex + 4));
                }
            }
            else {
                path = path.substring(pathSegmentIndex + 4);
            }
        }
        //      d) If the path ends with "/..", where  is a
        //         complete path segment not equal to "..", that
        //         "/.." is removed.
        if (path.endsWith("/..")) {
            final String pathSegment = path.substring(0, path.length() - 3);
            final int slashIndex = pathSegment.lastIndexOf('/');

            if (slashIndex >= 0) {
                path = path.substring(0, slashIndex + 1);
            }
        }

        path = removeLeadingSlashPoints(path);

        url.path_ = path;
        // Step 7: The resulting URL components, including any inherited from
        //         the base URL, are recombined to give the absolute form of
        //         the embedded URL.
        return url;
    }

    /**
     * "../" after the leading "/" should be removed as browsers do (not in RFC)
     */
    private static String removeLeadingSlashPoints(final String path) {
        int i = 1;
        while (path.startsWith("../", i)) {
            i = i + 3;
        }

        if (i > 1) {
            return "/" + path.substring(i);
        }

        return path;
    }

    /**
     * Class Url represents a Uniform Resource Locator.
     *
     * @author Martin Tamme
     */
    private static class Url {

        private String scheme_;
        private String location_;
        private String path_;
        private String parameters_;
        private String query_;
        private String fragment_;

        /**
         * Creates a Url object.
         */
        Url() {
        }

        /**
         * Creates a Url object from the specified
         * Url object.
         *
         * @param url a Url object.
         */
        Url(final Url url) {
            scheme_ = url.scheme_;
            location_ = url.location_;
            path_ = url.path_;
            parameters_ = url.parameters_;
            query_ = url.query_;
            fragment_ = url.fragment_;
        }

        /**
         * Returns a string representation of the Url object.
         *
         * @return a string representation of the Url object.
         */
        @Override
        public String toString() {
            final StringBuilder sb = new StringBuilder();

            if (scheme_ != null) {
                sb.append(scheme_);
                sb.append(':');
            }
            if (location_ != null) {
                sb.append("//");
                sb.append(location_);
            }
            if (path_ != null) {
                sb.append(path_);
            }
            if (parameters_ != null) {
                sb.append(';');
                sb.append(parameters_);
            }
            if (query_ != null) {
                sb.append('?');
                sb.append(query_);
            }
            if (fragment_ != null) {
                sb.append('#');
                sb.append(fragment_);
            }
            return sb.toString();
        }
    }

    static boolean isNormalUrlProtocol(final String protocol) {
        return "http".equals(protocol) || "https".equals(protocol) || "file".equals(protocol);
    }

    /**
     * More or less the same as sameFile(URL, URL) but without
     * resolving the host to an IP address for comparing.
     * Additionally we do some path normalization.
     *
     * @param u1 a URL object
     * @param u2 a URL object
     * @return true if u1 and u2 refer to the same file
     */
    @SuppressWarnings("PMD.UselessParentheses")
    public static boolean sameFile(final URL u1, final URL u2) {
        if (u1 == u2) {
            return true;
        }
        if (u1 == null || u2 == null) {
            return false;
        }

        // Compare the protocols.
        final String p1 = u1.getProtocol();
        final String p2 = u2.getProtocol();
        if (!(p1 == p2 || (p1 != null && p1.equalsIgnoreCase(p2)))) {
            return false;
        }

        // Compare the ports.
        final int port1 = (u1.getPort() == -1) ? u1.getDefaultPort() : u1.getPort();
        final int port2 = (u2.getPort() == -1) ? u2.getDefaultPort() : u2.getPort();
        if (port1 != port2) {
            return false;
        }

        // Compare the hosts.
        final String h1 = u1.getHost();
        final String h2 = u2.getHost();
        if (!(h1 == h2 || (h1 != null && h1.equalsIgnoreCase(h2)))) {
            return false;
        }

        // Compare the files.
        String f1 = u1.getFile();
        if (f1.isEmpty()) {
            f1 = "/";
        }
        String f2 = u2.getFile();
        if (f2.isEmpty()) {
            f2 = "/";
        }
        if (f1.indexOf('.') > 0 || f2.indexOf('.') > 0) {
            try {
                f1 = u1.toURI().normalize().toURL().getFile();
                f2 = u2.toURI().normalize().toURL().getFile();
            }
            catch (final RuntimeException re) {
                throw re;
            }
            catch (final Exception e) {
                // ignore
            }
        }

        return Objects.equals(f1, f2);
    }

    /**
     * Helper that constructs a normalized url string
     * usable as cache key.
     *
     * @param url a URL object
     * @return the normalized string
     */
    public static String normalize(final URL url) {
        final StringBuilder result = new StringBuilder();
        result.append(url.getProtocol())
                .append("://")
                .append(url.getHost())
                .append(':')
                .append((url.getPort() == -1) ? url.getDefaultPort() : url.getPort());

        // Compare the files.
        String f = url.getFile();
        if (f.isEmpty()) {
            result.append('/');
        }
        else {
            if (f.indexOf('.') > 0) {
                try {
                    f = url.toURI().normalize().toURL().getFile();
                }
                catch (final Exception e) {
                    // ignore
                }
            }
            result.append(f);
        }

        return result.toString();
    }

    /**
     * Constructs a {@link URI} using the specified URL.
     *
     * @param url the URL
     * @param query the query
     *
     * @throws URISyntaxException
     *         If both a scheme and a path are given but the path is
     *         relative, if the URI string constructed from the given
     *         components violates RFC 2396, or if the authority
     *         component of the string is present but cannot be parsed
     *         as a server-based authority
     * @return the URI
     */
    public static URI toURI(final URL url, final String query) throws URISyntaxException {
        final String scheme = url.getProtocol();
        final String host = url.getHost();
        final int port = url.getPort();
        final String path = url.getPath();
        final StringBuilder buffer = new StringBuilder();
        if (host != null) {
            if (scheme != null) {
                buffer.append(scheme);
                buffer.append("://");
            }
            buffer.append(host);
            if (port > 0) {
                buffer.append(':');
                buffer.append(port);
            }
        }
        if (path == null || path.isEmpty() || path.charAt(0) != '/') {
            buffer.append('/');
        }
        if (path != null) {
            buffer.append(path);
        }
        if (query != null) {
            buffer.append('?');
            buffer.append(query);
        }
        return new URI(buffer.toString());
    }

    /**
     * @param part the part to encode
     * @return the ecoded string
     */
    public static String encodeQueryPart(final String part) {
        if (part == null || part.isEmpty()) {
            return "";
        }

        try {
            return URLEncoder.encode(part, "UTF-8");
        }
        catch (final UnsupportedEncodingException e) {
            return part;
        }
    }

    /**
     * Removes the well known ports if it can be deduced from protocol.
     * @param url the url to clean up
     * @return a new URL without the port or the given one
     * @throws MalformedURLException if the URL string cannot be converted to a URL instance
     */
    public static URL removeRedundantPort(final URL url) throws MalformedURLException {
        if (("https".equals(url.getProtocol()) && url.getPort() == 443)
                || ("http".equals(url.getProtocol()) && url.getPort() == 80)) {
            return getUrlWithNewPort(url, -1);
        }
        return url;
    }

}