// com.networknt.url.URLNormalizer Maven / Gradle / Ivy
// Show all versions of http-url Show documentation
/* Copyright 2010-2017 Norconex Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.networknt.url;
import com.networknt.utility.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.Serializable;
import java.io.UnsupportedEncodingException;
import java.net.*;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.Objects;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
*
* The general idea behind URL normalization is to make different URLs
* "equivalent" (i.e. eliminate URL variations pointing to the same resource).
* To achieve this, {@code URLNormalizer} takes a URL and modifies it to its
* most basic or standard form (for the context in which it is used).
* Of course {@code URLNormalizer} can simply be used as a generic
* URL manipulation tool for your needs.
*
*
* You would typically "build" your normalized URL by invoking each method
* of interest, in the relevant order, using a similar approach:
*
*
* String url = "Http://Example.com:80//foo/index.html";
* URL normalizedURL = new URLNormalizer(url)
* .lowerCaseSchemeHost()
* .removeDefaultPort()
* .removeDuplicateSlashes()
* .removeDirectoryIndex()
* .addWWW()
* .toURL();
* System.out.println(normalizedURL.toString());
* // Output: http://www.example.com/foo/
*
* Several normalization methods implemented come from the
* RFC 3986 standard.
* These standards and several more normalization techniques
* are very well summarized on the Wikipedia article titled
*
* URL Normalization.
* This class implements most normalizations described on that article and
* borrows several of its examples, as well as a few additional ones.
*
*
* The normalization methods available can be broken down into three
* categories:
*
*
* Preserving Semantics
*
* The following normalizations are part of the
* RFC 3986 standard
* and should result in equivalent
* URLs (one that identifies the same resource):
*
*
* - {@link #lowerCaseSchemeHost()
* Convert scheme and host to lower case}
* - {@link #upperCaseEscapeSequence()
* Convert escape sequence to upper case}
* - {@link #decodeUnreservedCharacters()
* Decode percent-encoded unreserved characters}
* - {@link #removeDefaultPort() Removing default ports}
* - {@link #encodeNonURICharacters()
* URL-Encode non-ASCII characters}
* - {@link #encodeSpaces() Encode spaces to plus sign}
*
*
* Usually Preserving Semantics
*
* The following techniques will generate a semantically equivalent URL for
* the majority of use cases but are not enforced as a standard.
*
*
* - {@link #addTrailingSlash() Add trailing slash}
* - {@link #removeDotSegments() Remove .dot segments}
*
*
* Not Preserving Semantics
*
* These normalizations will fail to produce semantically equivalent URLs in
* many cases. They usually work best when you have a good understanding of
* the web site behind the supplied URL and know which normalizations can be
* considered to produce semantically equivalent URLs for that site.
*
*
* - {@link #removeDirectoryIndex() Remove directory index}
* - {@link #removeFragment() Remove fragment (#)}
* - {@link #replaceIPWithDomainName() Replace IP with domain name}
* - {@link #unsecureScheme() Unsecure schema (https → http)}
* - {@link #secureScheme() Secure schema (http → https)}
* - {@link #removeDuplicateSlashes() Remove duplicate slashes}
* - {@link #removeWWW() Remove "www."}
* - {@link #addWWW() Add "www."}
* - {@link #sortQueryParameters() Sort query parameters}
* - {@link #removeEmptyParameters() Remove empty query parameters}
* - {@link #removeTrailingQuestionMark() Remove trailing question mark (?)}
* - {@link #removeSessionIds() Remove session IDs}
*
*
* Refer to each method below for its description and examples (or click on a
* normalization name above).
*
* @author Pascal Essiembre
*/
public class URLNormalizer implements Serializable {
// Serialization version identifier of this Serializable class.
private static final long serialVersionUID = 7236478212865008971L;
private static final Logger logger = LoggerFactory.getLogger(URLNormalizer.class);
// Matches one percent-encoded triplet (e.g. %7E or %c2), hex digits in any
// case; group 1 is the whole triplet.
private static final Pattern PATTERN_PERCENT_ENCODED_CHAR =
Pattern.compile("(%[0-9a-f]{2})", Pattern.CASE_INSENSITIVE);
// Matches a path whose last segment is one of the known directory index
// files; group 1 is everything up to and including the last slash.
private static final Pattern PATTERN_PATH_LAST_SEGMENT = Pattern.compile(
"(.*/)(index\\.html|index\\.htm|index\\.shtml|index\\.php"
+ "|default\\.html|default\\.htm|home\\.html|home\\.htm|index\\.php5"
+ "|index\\.php4|index\\.php3|index\\.cgi|placeholder\\.html"
+ "|default\\.asp)$", Pattern.CASE_INSENSITIVE);
// Matches hosts that look like a domain name; used to decide whether a host
// should be treated as an IP address instead.
// NOTE(review): the last label is limited to 2-5 letters, so hosts with a
// longer top-level domain do not match — confirm this is intended.
private static final Pattern PATTERN_DOMAIN = Pattern.compile(
"^[a-z0-9]+([\\-\\.]{1}[a-z0-9]+)*\\.[a-z]{2,5}$",
Pattern.CASE_INSENSITIVE);
// Splits a URL into its scheme (group 1) and the remainder starting at
// "://" (group 2).
private static final Pattern PATTERN_SCHEMA = Pattern.compile(
"(.*?)(://.*)$",
Pattern.CASE_INSENSITIVE);
// The URL being normalized; every normalization method rewrites this value.
private String url;
/**
* Creates a new {@code URLNormalizer} instance from a {@link URL}.
* The URL is converted to its string form and handed to
* {@link #URLNormalizer(String)}; a {@code null} URL is passed on as a
* {@code null} string, so it is subject to that constructor's validation.
* @param url the url to normalize
*/
public URLNormalizer(URL url) {
this(Objects.toString(url, null));
}
/**
*
* Create a new URLNormalizer
instance.
*
* Since 1.8.0, spaces in URLs are no longer converted to + automatically.
* Use {@link #encodeNonURICharacters()} or {@link #encodeSpaces()}.
*
* @param url the url to normalize
*/
public URLNormalizer(String url) {
super();
if (StringUtils.isBlank(url)) {
throw new IllegalArgumentException("URL argument cannot be null.");
}
this.url = url.trim();
// Check it is a valid URL.
try {
new URL(this.url);
} catch (MalformedURLException e) {
throw new RuntimeException("Invalid URL: " + url, e);
}
}
/**
 * Converts the scheme and host to lower case.
 * {@code HTTP://www.Example.com/ → http://www.example.com/}
 * <p>
 * The first case-insensitive occurrence of the scheme, then of the host,
 * is replaced by its lower-case form. Both values are matched literally,
 * so hosts containing regular-expression meta characters (dots, or the
 * brackets of an IPv6 literal) are handled safely. Lower-casing uses
 * {@link Locale#ROOT} so the result does not depend on the default locale.
 * If the current value can no longer be parsed as a URL, it is left as is.
 * @return this instance
 */
public URLNormalizer lowerCaseSchemeHost() {
    URL u = toURL();
    if (u == null) {
        // toURL() has already logged the problem; nothing sensible to do.
        return this;
    }
    String protocol = u.getProtocol();
    url = Pattern.compile(Pattern.quote(protocol), Pattern.CASE_INSENSITIVE)
            .matcher(url)
            .replaceFirst(Matcher.quoteReplacement(
                    protocol.toLowerCase(Locale.ROOT)));
    String host = u.getHost();
    if (host != null && !host.isEmpty()) {
        // Quote the host: an unquoted "[::1]" would be a character class
        // and "." would match any character.
        url = Pattern.compile(Pattern.quote(host), Pattern.CASE_INSENSITIVE)
                .matcher(url)
                .replaceFirst(Matcher.quoteReplacement(
                        host.toLowerCase(Locale.ROOT)));
    }
    return this;
}
/**
 * Converts letters in URL-encoded escape sequences to upper case.
 * {@code http://www.example.com/a%c2%b1b →
 * http://www.example.com/a%C2%B1b}
 * @return this instance
 */
public URLNormalizer upperCaseEscapeSequence() {
    if (!url.contains("%")) {
        return this;
    }
    final Matcher escapes = PATTERN_PERCENT_ENCODED_CHAR.matcher(url);
    final StringBuilder result = new StringBuilder(url.length());
    int copiedUpTo = 0;
    while (escapes.find()) {
        // Copy the text between the previous escape and this one verbatim,
        // then the escape itself in upper case.
        result.append(url, copiedUpTo, escapes.start());
        result.append(escapes.group(1).toUpperCase());
        copiedUpTo = escapes.end();
    }
    result.append(url, copiedUpTo, url.length());
    url = result.toString();
    return this;
}
/**
 * Decodes percent-encoded unreserved characters (letters, digits,
 * hyphen, period, underscore and tilde). Other escape sequences are
 * left untouched.
 * {@code http://www.example.com/%7Eusername/ →
 * http://www.example.com/~username/}
 * @return this instance
 */
public URLNormalizer decodeUnreservedCharacters() {
    if (!url.contains("%")) {
        return this;
    }
    final String charsetName = StandardCharsets.UTF_8.toString();
    final Matcher escapes = PATTERN_PERCENT_ENCODED_CHAR.matcher(url);
    final StringBuffer decoded = new StringBuffer();
    try {
        while (escapes.find()) {
            final String triplet = escapes.group(1).toUpperCase();
            if (!isEncodedUnreservedCharacter(triplet)) {
                // Reserved or other character: stays encoded; it is copied
                // over by a later appendReplacement/appendTail call.
                continue;
            }
            escapes.appendReplacement(
                    decoded, URLDecoder.decode(triplet, charsetName));
        }
    } catch (UnsupportedEncodingException e) {
        logger.debug("UTF-8 is not supported by your system. "
                + "URL will remain unchanged:" + url, e);
    }
    url = escapes.appendTail(decoded).toString();
    return this;
}
/**
* Encodes all characters that are not supported characters
* in a URI (not to confuse with URL), as defined
* by the RFC 3986
* standard. This includes all non-ASCII characters.
*
* The current value is converted with {@link #toURI()} and replaced by the
* result of {@link URI#toASCIIString()}.
*
* Since this method also encodes spaces to the plus sign (+), there is
* no need to also invoke {@link #encodeSpaces()}.
* NOTE(review): the space handling is performed by HttpURL.toURI, which is
* not visible here; the statement above comes from the original
* documentation — confirm.
*
* http://www.example.com/^a [b]/ →
* http://www.example.com/%5Ea+%5Bb%5D/
* @return this instance
* @since 1.8.0
*/
public URLNormalizer encodeNonURICharacters() {
url = toURI().toASCIIString();
return this;
}
/**
 * Encodes space characters. Spaces located before the first question
 * mark (typically the URL path) are percent-encoded to {@code %20};
 * spaces located after it (the query string) are encoded to plus
 * signs (+). When nothing but blanks follows the first question mark,
 * only the encoded part before it is kept.
 * <p>
 * To encode all non-ASCII characters (including spaces), use
 * {@link #encodeNonURICharacters()} instead.
 * <p>
 * {@code http://www.example.com/a b c?d e →
 * http://www.example.com/a%20b%20c?d+e}
 * @return this instance
 * @since 1.8.0
 */
public URLNormalizer encodeSpaces() {
    final String beforeQuery = StringUtils.replace(
            StringUtils.substringBefore(url, "?"), " ", "%20", -1);
    final String query = StringUtils.substringAfter(url, "?");
    if (StringUtils.isBlank(query)) {
        url = beforeQuery;
        return this;
    }
    url = beforeQuery + "?" + StringUtils.replace(query, " ", "+", -1);
    return this;
}
/**
 * Removes the default port (80 for http, and 443 for https).
 * {@code http://www.example.com:80/bar.html →
 * http://www.example.com/bar.html}
 * <p>
 * Only the port that terminates the authority (the part between
 * {@code ://} and the first {@code /}, {@code ?} or {@code #}) is removed,
 * so a matching {@code :80} / {@code :443} sequence inside user
 * information or an IPv6 literal is never touched. If the current value
 * can no longer be parsed as a URL, it is left as is.
 * @return this instance
 */
public URLNormalizer removeDefaultPort() {
    URL u = toURL();
    if (u == null) {
        return this;
    }
    String protocol = u.getProtocol();
    int port = u.getPort();
    boolean defaultPort =
            ("http".equalsIgnoreCase(protocol)
                    && port == HttpURL.DEFAULT_HTTP_PORT)
            || ("https".equalsIgnoreCase(protocol)
                    && port == HttpURL.DEFAULT_HTTPS_PORT);
    if (!defaultPort) {
        return this;
    }
    int schemeEnd = url.indexOf("://");
    if (schemeEnd < 0) {
        return this;
    }
    // Locate the end of the authority component.
    int authorityStart = schemeEnd + "://".length();
    int authorityEnd = url.length();
    for (int i = authorityStart; i < url.length(); i++) {
        char c = url.charAt(i);
        if (c == '/' || c == '?' || c == '#') {
            authorityEnd = i;
            break;
        }
    }
    // Strip the port only when it is literally the authority's suffix.
    String portSuffix = ":" + port;
    int portStart = authorityEnd - portSuffix.length();
    if (portStart >= authorityStart
            && url.regionMatches(
                    portStart, portSuffix, 0, portSuffix.length())) {
        url = url.substring(0, portStart) + url.substring(authorityEnd);
    }
    return this;
}
/**
 * Adds a trailing slash (/) to a URL ending with a directory. A URL is
 * considered to end with a directory if the last path segment,
 * before fragment (#) or query string (?), does not contain a dot,
 * typically representing an extension.
 * <p>
 * <b>Please Note:</b> URLs do not always denote a directory structure
 * and many URLs can qualify to this method without truly representing a
 * directory. Adding a trailing slash to these URLs could potentially break
 * its semantic equivalence.
 * <p>
 * {@code http://www.example.com/alice →
 * http://www.example.com/alice/}
 * @return this instance
 * @since 1.11.0 (renamed from "addTrailingSlash")
 */
public URLNormalizer addDirectoryTrailingSlash() {
    final String root = HttpURL.getRoot(url);
    final String currentPath = toURL().getPath();
    final String lastSegment =
            StringUtils.substringAfterLast(currentPath, "/");
    // Nothing to do for an empty last segment or one that looks like a file.
    if (StringUtils.isBlank(lastSegment) || lastSegment.contains(".")) {
        return this;
    }
    url = StringUtils.replaceOnce(
            url, root + currentPath, root + currentPath + "/");
    return this;
}
/**
 * Adds a trailing slash (/) right after the domain for URLs with no
 * path, before any fragment (#) or query string (?). URLs that already
 * have a path are left unchanged.
 * <p>
 * <b>Please Note:</b> Adding a trailing slash to URLs could
 * potentially break its semantic equivalence.
 * <p>
 * {@code http://www.example.com →
 * http://www.example.com/}
 * @return this instance
 * @since 1.12.0
 */
public URLNormalizer addDomainTrailingSlash() {
    final String root = HttpURL.getRoot(url);
    if (StringUtils.isBlank(toURL().getPath())) {
        // No path at all: insert the slash right after the URL root.
        url = StringUtils.replaceOnce(url, root, root + "/");
    }
    return this;
}
/**
* Adds a trailing slash (/) to a URL ending with a directory. A URL is
* considered to end with a directory if the last path segment,
* before fragment (#) or query string (?), does not contain a dot,
* typically representing an extension.
*
* This method only delegates to {@link #addDirectoryTrailingSlash()}.
*
* Please Note: URLs do not always denote a directory structure
* and many URLs can qualify to this method without truly representing a
* directory. Adding a trailing slash to these URLs could potentially break
* its semantic equivalence.
* http://www.example.com/alice →
* http://www.example.com/alice/
* @return this instance
* @deprecated Since 1.11.0, use {@link #addDirectoryTrailingSlash()}
*/
@Deprecated
public URLNormalizer addTrailingSlash() {
return addDirectoryTrailingSlash();
}
/**
 * Removes any trailing slash (/) from a URL, before fragment
 * (#) or query string (?).
 * <p>
 * <b>Please Note:</b> Removing trailing slashes from URLs
 * could potentially break their semantic equivalence.
 * <p>
 * {@code http://www.example.com/alice/ →
 * http://www.example.com/alice}
 * @return this instance
 * @since 1.11.0
 */
public URLNormalizer removeTrailingSlash() {
    final String root = HttpURL.getRoot(url);
    final String currentPath = toURL().getPath();
    if (!currentPath.endsWith("/")) {
        return this;
    }
    // Drop exactly one slash from the end of the path.
    final String shortened =
            currentPath.substring(0, currentPath.length() - 1);
    url = StringUtils.replaceOnce(
            url, root + currentPath, root + shortened);
    return this;
}
/**
* Removes the unnecessary "." and ".." segments from the URL path.
*
* As of 2.3.0, the algorithm used to remove the dot segments
* is the one prescribed by RFC 3986 (section 5.2.4): the path is consumed
* from an input buffer and rebuilt, segment by segment, in an output buffer.
* The rebuilt path then replaces the original path text in the URL.
*
* http://www.example.com/../a/b/../c/./d.html →
* http://www.example.com/a/c/d.html
*
* Please Note: URLs do not always represent a clean hierarchy
* structure and the dots/double-dots may have a different signification
* on some sites. Removing them from a URL could potentially break
* its semantic equivalence.
* @return this instance
* @see URI#normalize()
*/
public URLNormalizer removeDotSegments() {
String path = toURL().getPath().trim();
// (Bulleted comments are from RFC3986, section-5.2.4)
// 1. The input buffer is initialized with the now-appended path
// components and the output buffer is initialized to the empty
// string.
StringBuilder in = new StringBuilder(path);
StringBuilder out = new StringBuilder();
// 2. While the input buffer is not empty, loop as follows:
while (in.length() > 0) {
// A. If the input buffer begins with a prefix of "../" or "./",
// then remove that prefix from the input buffer; otherwise,
if (startsWith(in, "../")) {
deleteStart(in, "../");
} else if (startsWith(in, "./")) {
deleteStart(in, "./");
}
// B. if the input buffer begins with a prefix of "/./" or "/.",
// where "." is a complete path segment, then replace that
// prefix with "/" in the input buffer; otherwise,
else if (startsWith(in, "/./")) {
replaceStart(in, "/./", "/");
} else if (equalStrings(in, "/.")) {
replaceStart(in, "/.", "/");
}
// C. if the input buffer begins with a prefix of "/../" or "/..",
// where ".." is a complete path segment, then replace that
// prefix with "/" in the input buffer and remove the last
// segment and its preceding "/" (if any) from the output
// buffer; otherwise,
else if (startsWith(in, "/../")) {
replaceStart(in, "/../", "/");
removeLastSegment(out);
} else if (equalStrings(in, "/..")) {
replaceStart(in, "/..", "/");
removeLastSegment(out);
}
// D. if the input buffer consists only of "." or "..", then remove
// that from the input buffer; otherwise,
else if (equalStrings(in, "..")) {
deleteStart(in, "..");
} else if (equalStrings(in, ".")) {
deleteStart(in, ".");
}
// E. move the first path segment in the input buffer to the end of
// the output buffer, including the initial "/" character (if
// any) and any subsequent characters up to, but not including,
// the next "/" character or the end of the input buffer.
else {
// Search from index 1 so that a leading "/" stays with its segment.
int nextSlashIndex = in.indexOf("/", 1);
if (nextSlashIndex > -1) {
out.append(in.substring(0, nextSlashIndex));
in.delete(0, nextSlashIndex);
} else {
out.append(in);
in.setLength(0);
}
}
}
// 3. Finally, the output buffer is returned as the result of
// remove_dot_segments.
// NOTE(review): this replaces the first textual occurrence of the path in
// the URL; presumably that is always the path component itself — confirm.
url = StringUtils.replaceOnce(url, path, out.toString());
return this;
}
/**
 * Tells whether the whole content of the buffer is exactly the given string.
 * @param b the buffer to test
 * @param str the expected content
 * @return {@code true} if both hold the same character sequence
 */
private static boolean equalStrings(StringBuilder b, String str) {
    return str.contentEquals(b);
}
/**
 * Tells whether the buffer begins with the given string.
 * @param b the buffer to test
 * @param str the expected prefix
 * @return {@code true} if the buffer starts with {@code str}
 */
private static boolean startsWith(StringBuilder b, String str) {
    final int prefixLength = str.length();
    if (b.length() < prefixLength) {
        return false;
    }
    for (int i = 0; i < prefixLength; i++) {
        if (b.charAt(i) != str.charAt(i)) {
            return false;
        }
    }
    return true;
}
/**
 * Replaces the beginning of the buffer: as many characters as
 * {@code toreplace} is long are removed from the start and
 * {@code replacement} is inserted in their place. The removed characters
 * are not compared with {@code toreplace}; callers check the prefix first.
 * @param b the buffer to modify
 * @param toreplace the prefix whose length determines how much is removed
 * @param replacement the text inserted at the start
 */
private void replaceStart(
        StringBuilder b, String toreplace, String replacement) {
    b.delete(0, toreplace.length()).insert(0, replacement);
}
/**
 * Removes as many characters from the start of the buffer as {@code str}
 * is long. The removed characters are not compared with {@code str}.
 * @param b the buffer to modify
 * @param str the prefix whose length determines how much is removed
 */
private void deleteStart(StringBuilder b, String str) {
    final int prefixLength = str.length();
    b.delete(0, prefixLength);
}
/**
 * Truncates the buffer at its last slash, removing that slash and
 * everything after it. A buffer without any slash is emptied.
 * @param b the buffer to truncate
 */
private void removeLastSegment(StringBuilder b) {
    // lastIndexOf yields -1 when there is no slash, which maps to length 0.
    b.setLength(Math.max(0, b.lastIndexOf("/")));
}
/**
 * Removes directory index files. They are often not needed in URLs.
 * {@code http://www.example.com/a/index.html →
 * http://www.example.com/a/}
 * <p>
 * Index files must be the last URL path segment to be considered.
 * The following are considered index files (matched case-insensitively):
 * <ul>
 *   <li>index.html</li>
 *   <li>index.htm</li>
 *   <li>index.shtml</li>
 *   <li>index.php</li>
 *   <li>default.html</li>
 *   <li>default.htm</li>
 *   <li>home.html</li>
 *   <li>home.htm</li>
 *   <li>index.php5</li>
 *   <li>index.php4</li>
 *   <li>index.php3</li>
 *   <li>index.cgi</li>
 *   <li>placeholder.html</li>
 *   <li>default.asp</li>
 * </ul>
 * <b>Please Note:</b> There are no guarantees a URL without its
 * index files will be semantically equivalent, or even be valid.
 * @return this instance
 */
public URLNormalizer removeDirectoryIndex() {
    final String currentPath = toURL().getPath();
    final Matcher indexFile = PATTERN_PATH_LAST_SEGMENT.matcher(currentPath);
    if (indexFile.matches()) {
        // Group 1 is the path up to and including its last slash.
        url = StringUtils.replaceOnce(url, currentPath, indexFile.group(1));
    }
    return this;
}
/**
 * Removes the URL fragment (from the "#" character until the end).
 * {@code http://www.example.com/bar.html#section1 →
 * http://www.example.com/bar.html}
 * @return this instance
 */
public URLNormalizer removeFragment() {
    final Matcher fragment = Pattern.compile("(.*?)(#.*)").matcher(url);
    if (fragment.find()) {
        // Keep only what precedes the first "#".
        url = fragment.replaceFirst("$1");
    }
    return this;
}
/**
 * Replaces IP address with domain name. This is often not
 * reliable due to virtual domain names and can be slow, as it has
 * to access the network.
 * {@code http://208.77.188.166/ → http://www.example.com/}
 * <p>
 * Hosts that already look like a domain name, and URLs without a host,
 * are left unchanged. The URL is also left unchanged when the reverse
 * lookup yields no name (the lookup then returns the textual IP address)
 * or when the host cannot be resolved. The host is replaced literally,
 * so dots and IPv6 brackets are not interpreted as regular-expression
 * syntax.
 * @return this instance
 */
public URLNormalizer replaceIPWithDomainName() {
    URL u = toURL();
    if (u == null) {
        return this;
    }
    String currentHost = u.getHost();
    if (currentHost == null || currentHost.isEmpty()
            || PATTERN_DOMAIN.matcher(currentHost).matches()) {
        return this;
    }
    try {
        InetAddress addr = InetAddress.getByName(currentHost);
        String host = addr.getHostName();
        // getHostName() falls back to the textual IP when no name is found;
        // substituting that would at best be a no-op and at worst strip the
        // brackets of an IPv6 literal.
        boolean unresolved = host.equalsIgnoreCase(addr.getHostAddress());
        if (!unresolved && !currentHost.equalsIgnoreCase(host)) {
            url = url.replaceFirst(
                    Pattern.quote(currentHost),
                    Matcher.quoteReplacement(host));
        }
    } catch (UnknownHostException e) {
        logger.debug("Cannot resolve IP to host for :" + u.getHost(), e);
    }
    return this;
}
/**
 * Converts {@code https} scheme to {@code http}. URLs with any other
 * scheme are left unchanged; the letter case of the scheme is preserved.
 * {@code https://www.example.com/ → http://www.example.com/}
 * @return this instance
 */
public URLNormalizer unsecureScheme() {
    final Matcher parts = PATTERN_SCHEMA.matcher(url);
    if (!parts.find()) {
        return this;
    }
    final String scheme = parts.group(1);
    if ("https".equalsIgnoreCase(scheme)) {
        // Drop the trailing "s"/"S"; "$2" re-emits everything from "://".
        final String unsecured = scheme.substring(0, scheme.length() - 1);
        url = parts.replaceFirst(unsecured + "$2");
    }
    return this;
}
/**
 * Converts {@code http} scheme to {@code https}. URLs with any other
 * scheme are left unchanged; a lower-case "s" is appended to the scheme
 * as it is written.
 * {@code http://www.example.com/ → https://www.example.com/}
 * @return this instance
 */
public URLNormalizer secureScheme() {
    final Matcher parts = PATTERN_SCHEMA.matcher(url);
    if (!parts.find()) {
        return this;
    }
    final String scheme = parts.group(1);
    if ("http".equalsIgnoreCase(scheme)) {
        // "$2" re-emits everything from "://" onwards.
        url = parts.replaceFirst(scheme + "s$2");
    }
    return this;
}
/**
 * Removes duplicate slashes. Two or more adjacent slash ("/")
 * characters in the URL path will be converted into one.
 * {@code http://www.example.com/foo//bar.html
 * → http://www.example.com/foo/bar.html}
 * @return this instance
 */
public URLNormalizer removeDuplicateSlashes() {
    final String root = HttpURL.getRoot(url);
    final String currentPath = toURL().getPath();
    final String collapsedPath = currentPath.replaceAll("/{2,}", "/");
    // Anchor the replacement on root + path so only the path is rewritten.
    url = StringUtils.replaceOnce(
            url, root + currentPath, root + collapsedPath);
    return this;
}
/**
 * Removes "www." domain name prefix (matched case-insensitively).
 * {@code http://www.example.com/ → http://example.com/}
 * @return this instance
 */
public URLNormalizer removeWWW() {
    final String currentHost = toURL().getHost();
    url = StringUtils.replaceOnce(
            url,
            currentHost,
            StringUtils.removeStartIgnoreCase(currentHost, "www."));
    return this;
}
/**
 * Adds "www." domain name prefix, unless the host already starts with it
 * (in any letter case).
 * {@code http://example.com/ → http://www.example.com/}
 * @return this instance
 */
public URLNormalizer addWWW() {
    final String currentHost = toURL().getHost();
    final String prefix = "www.";
    if (currentHost.regionMatches(true, 0, prefix, 0, prefix.length())) {
        return this;
    }
    url = StringUtils.replaceOnce(url, currentHost, prefix + currentHost);
    return this;
}
/**
 * Sorts query parameters. Each {@code key=value} pair is sorted as a
 * whole string, in natural (lexicographic) order. A fragment following
 * the query string is preserved after the sorted parameters.
 * {@code http://www.example.com/?z=bb&y=cc&z=aa →
 * http://www.example.com/?y=cc&z=aa&z=bb}
 * @return this instance
 */
public URLNormalizer sortQueryParameters() {
    // Does it have query parameters?
    if (!url.contains("?")) {
        return this;
    }
    String queryString = StringUtils.substringAfter(url, "?");
    // Extract the fragment (if any) so it is not sorted with the parameters.
    String fragment = StringUtils.substringAfter(queryString, "#");
    if (StringUtils.isNotBlank(fragment)) {
        fragment = "#" + fragment;
    }
    queryString = StringUtils.substringBefore(queryString, "#");

    List<String> keyValues = new ArrayList<>();
    Collections.addAll(keyValues, StringUtils.split(queryString, '&'));
    Collections.sort(keyValues);

    String sortedQueryString = StringUtils.join(keyValues, '&');
    // Leave the URL untouched when there was nothing to sort.
    if (StringUtils.isNotBlank(sortedQueryString)) {
        url = StringUtils.substringBefore(
                url, "?") + "?" + sortedQueryString + fragment;
    }
    return this;
}
/**
 * Removes empty parameters: a parameter is kept only when it has an
 * equal sign with a non-blank name before it and a non-blank value after
 * it. A fragment following the query string is not considered part of
 * the last parameter and is preserved. When no parameter would remain,
 * the URL is left unchanged.
 * {@code http://www.example.com/display?a=b&a=&c=d&e=&f=g
 * → http://www.example.com/display?a=b&c=d&f=g}
 * @return this instance
 */
public URLNormalizer removeEmptyParameters() {
    // Does it have query parameters?
    if (!url.contains("?")) {
        return this;
    }
    String queryString = StringUtils.substringAfter(url, "?");
    // Set the fragment aside (as sortQueryParameters() does); otherwise
    // "c=#top" would look like a parameter with the value "#top".
    String fragment = StringUtils.substringAfter(queryString, "#");
    if (StringUtils.isNotBlank(fragment)) {
        fragment = "#" + fragment;
    }
    queryString = StringUtils.substringBefore(queryString, "#");

    List<String> keyValues = new ArrayList<>();
    for (String param : StringUtils.split(queryString, '&')) {
        if (param.contains("=")
                && StringUtils.isNotBlank(
                        StringUtils.substringAfter(param, "="))
                && StringUtils.isNotBlank(
                        StringUtils.substringBefore(param, "="))) {
            keyValues.add(param);
        }
    }
    String cleanQueryString = StringUtils.join(keyValues, '&');
    if (StringUtils.isNotBlank(cleanQueryString)) {
        url = StringUtils.substringBefore(
                url, "?") + "?" + cleanQueryString + fragment;
    }
    return this;
}
/**
 * Removes trailing question mark ("?"). The question mark is only
 * removed when it is the last character and the only question mark
 * in the URL.
 * {@code http://www.example.com/display? →
 * http://www.example.com/display}
 * @return this instance
 */
public URLNormalizer removeTrailingQuestionMark() {
    final int lastIndex = url.length() - 1;
    // The first "?" being the last character means it is the only one.
    if (lastIndex >= 0 && url.indexOf('?') == lastIndex) {
        url = url.substring(0, lastIndex);
    }
    return this;
}
/**
 * Removes a URL-based session id. It removes PHP (PHPSESSID),
 * ASP (ASPSESSIONID), and Java EE (jsessionid) session ids.
 * {@code http://www.example.com/servlet;jsessionid=1E6FEC0D14D044541DD84D2D013D29ED?a=b
 * → http://www.example.com/servlet?a=b}
 * <p>
 * A URL in which no session id is found is left untouched; in
 * particular, a trailing question mark or a leading ampersand in its
 * query string is not altered.
 * <p>
 * <b>Please Note:</b> Removing session IDs from URLs is often
 * a good way to have the URL return an error once invoked.
 * @return this instance
 */
public URLNormalizer removeSessionIds() {
    if (StringUtils.containsIgnoreCase(url, ";jsessionid=")) {
        url = url.replaceFirst(
                "(;jsessionid=([A-F0-9]+)((\\.\\w+)*))", "");
        return this;
    }
    String base = StringUtils.substringBefore(url, "?");
    String query = StringUtils.substringAfter(url, "?");
    String cleaned = query;
    if (StringUtils.containsIgnoreCase(url, "PHPSESSID=")) {
        cleaned = query.replaceFirst("(&|^)(PHPSESSID=[0-9a-zA-Z]*)", "");
    } else if (StringUtils.containsIgnoreCase(url, "ASPSESSIONID")) {
        cleaned = query.replaceFirst(
                "(&|^)(ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]*)", "");
    }
    if (cleaned.equals(query)) {
        // Nothing was removed: do not rebuild the URL, which would silently
        // drop a trailing "?" or a leading "&".
        return this;
    }
    if (StringUtils.isBlank(cleaned)) {
        // The session id was the only parameter: drop the "?" as well.
        url = base;
    } else {
        // Removing a leading session id leaves a dangling "&" behind.
        url = base + "?" + StringUtils.removeStart(cleaned, "&");
    }
    return this;
}
/**
 * Removes trailing hash character ("#").
 * {@code http://www.example.com/path# →
 * http://www.example.com/path}
 * <p>
 * This only removes the hash character if it is the last character
 * and the only hash character in the URL.
 * To remove an entire URL fragment, use {@link #removeFragment()}.
 *
 * @return this instance
 * @since 1.13.0
 */
public URLNormalizer removeTrailingHash() {
    final int lastIndex = url.length() - 1;
    // The first "#" being the last character means it is the only one.
    if (lastIndex >= 0 && url.indexOf('#') == lastIndex) {
        url = url.substring(0, lastIndex);
    }
    return this;
}
/**
* Returns the normalized URL as string, that is, the current value with
* every normalization invoked so far applied.
* @return URL
*/
@Override
public String toString() {
return url;
}
/**
 * Returns the normalized URL as {@link URI}.
 * @return URI, or {@code null} if the current URL value is blank
 */
public URI toURI() {
    return StringUtils.isBlank(url) ? null : HttpURL.toURI(url);
}
/**
 * Returns the normalized URL as {@link URL}.
 * @return URL, or {@code null} if the current URL value is blank or
 *         cannot be parsed (the latter is logged)
 */
public URL toURL() {
    URL parsed = null;
    if (!StringUtils.isBlank(url)) {
        try {
            parsed = new URL(url);
        } catch (MalformedURLException e) {
            logger.info("URL does not appear to be valid and cannot be parsed:"
                    + url, e);
        }
    }
    return parsed;
}
/**
 * Tells whether a percent-encoded triplet stands for an unreserved
 * character: a letter, a digit, a hyphen, a period, an underscore or a
 * tilde. Ranges are checked by comparing the triplet text itself, so the
 * hexadecimal digits are expected in upper case (the caller upper-cases
 * the triplet before invoking this method).
 * @param enc the triplet, e.g. {@code %7E}
 * @return {@code true} if it encodes an unreserved character
 */
private boolean isEncodedUnreservedCharacter(String enc) {
    // ALPHA: A-Z and a-z
    final boolean upperAlpha =
            enc.compareTo("%41") >= 0 && enc.compareTo("%5A") <= 0;
    final boolean lowerAlpha =
            enc.compareTo("%61") >= 0 && enc.compareTo("%7A") <= 0;
    // DIGIT: 0-9
    final boolean digit =
            enc.compareTo("%30") >= 0 && enc.compareTo("%39") <= 0;
    // hyphen, period, underscore, tilde
    return upperAlpha || lowerAlpha || digit
            || equalsAny(enc, "%2D", "%2E", "%5F", "%7E");
}
/**
 * Tells whether the source object equals at least one of the targets,
 * comparing with {@link Objects#equals(Object, Object)}.
 * @param source the object to look for
 * @param targets the candidates; when the array itself is {@code null},
 *        the result is whether {@code source} is {@code null} too
 * @return {@code true} if a target equals the source
 */
private static boolean equalsAny(Object source, Object... targets) {
    if (targets == null) {
        return source == null;
    }
    boolean found = false;
    for (int i = 0; i < targets.length && !found; i++) {
        found = Objects.equals(source, targets[i]);
    }
    return found;
}
}