com.yahoo.net.URI Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of vespajlib Show documentation
Show all versions of vespajlib Show documentation
Library for use in Java components of Vespa. Shared code which does
not fit anywhere else.
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.net;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import static com.yahoo.text.Lowercase.toLowerCase;
/**
* An URI. This is a pure (immutable) value object.
*
* This does more normalization of hierarchical URIs (URLs) than
* described in the RFC and allows hosts with underscores.
*
* @author bratseth
*/
public class URI implements Cloneable, Comparable {
/** The uri string */
private String uri;
/** The scheme of the uri */
private String scheme = null;
/** The host part of the uri */
private String host = null;
/** The port number of the uri, or -1 if no port is explicitly given */
private int port = -1;
/** The part of the uri following the host (host and port) */
private String rest = null;
private static final Pattern tokenizePattern = Pattern.compile("[^\\w\\-]");
private boolean parsedDomain = false;
private String domain = null;
private boolean parsedMainTld = false;
private String mainTld = null;
private boolean parsedPath = false;
private String path = null;
private boolean parsedParams = false;
private String params = null;
private boolean parsedFilename = false;
private String filename = null;
private boolean parsedExtension = false;
private String extension = null;
private boolean parsedQuery = false;
private String query = null;
private boolean parsedFragment = false;
private String fragment = null;
/** The explanation of why this uri is invalid, or null if it is valid */
private String invalidExplanation = null;
/** True if this uri is opaque, false if it is hierarchical */
private boolean opaque = true;
/**
* Creates an URI without keeping the fragment (the part starting by #).
* If the uri is hierarchical, it is normalized and incorrect hierarchical uris
* which looks like urls are attempted repaired.
*
* Relative uris are not supported.
*
* @param uriString the uri string
* @throws NullPointerException if the given uriString is null
*/
public URI(String uriString) {
this(uriString, false);
}
/**
* Creates a URI, optionally keeping the fragment (the part starting by #).
* If the uri is hierarchical, it is normalized and incorrect hierarchical uris
* which looks like urls are attempted repaired.
*
* Relative uris are not supported.
*
* @param uriString the uri string
* @param keepFragment true to keep the fragment
* @throws NullPointerException if the given uriString is null
*/
public URI(String uriString, boolean keepFragment) {
this(uriString, keepFragment, false);
}
/**
* Creates a URI, optionally keeping the fragment (the part starting by #).
* If the uri is hierarchical, it is normalized and incorrect hierarchical uris
* which looks like urls are attempted repaired.
*
* Relative uris are not supported.
*
* @param uriString the uri string
* @param keepFragment true to keep the fragment
* @param hierarchicalOnly will force any uri string given to be parsed as
* a hierarchical one, causing the uri to be invalid if it isn't
* @throws NullPointerException if the given uriString is null
*/
public URI(String uriString, boolean keepFragment, boolean hierarchicalOnly) {
if (uriString == null) {
throw new NullPointerException("Can not create an uri from null");
}
if (!keepFragment) {
int fragmentIndex = uriString.indexOf("#");
if (fragmentIndex >= 0) {
uriString = uriString.substring(0, fragmentIndex);
}
}
try {
this.uri = uriString.trim();
opaque = isOpaque(uri);
// No further parsing of opaque uris
if (isOpaque() && !hierarchicalOnly) {
return;
}
opaque = false;
normalizeHierarchical();
} catch (IllegalArgumentException e) {
if (e.getMessage() != null) {
invalidExplanation = e.getMessage();
} else {
Throwable t = e.getCause();
if (t != null && t.getMessage() != null) {
invalidExplanation = t.getMessage();
} else {
invalidExplanation = "Invalid uri: " + e;
}
}
}
}
/** Creates an url type uri */
public URI(String scheme, String host, int port, String rest) {
this.scheme = scheme;
this.host = host;
this.port = port;
this.rest = rest;
recombine();
normalizeHierarchical();
opaque = false;
}
/** Returns whether an url is opaque or hierarchical */
private boolean isOpaque(String uri) {
int colonIndex = uri.indexOf(":");
if (colonIndex < 0) {
return true;
} else {
return !(uri.length() > colonIndex + 1
&& uri.charAt(colonIndex + 1) == '/');
}
}
/**
* Returns whether this is a valid URI (after normalizing).
* All non-hierarchical uri's containing a scheme is valid.
*/
public boolean isValid() {
return invalidExplanation == null;
}
/**
* Normalizes this hierarchical uri according to FRC 2396 and the Overture
* standard. Before normalizing, some simple heuristics are use to make
* the uri complete if needed. After normalizing, the scheme,
* host, port and rest of this uri is set if defined.
*
* @throws IllegalArgumentException if this uri can not be normalized into a legal uri
*/
private void normalizeHierarchical() {
complete();
escapeNonAscii();
unescapeHtmlEntities();
decompose();
lowCaseHost();
removeDefaultPortNumber();
removeTrailingHostDot();
makeDoubleSlashesSingle();
recombine();
}
/** Applies simple heuristics to complete this uri if needed */
private void complete() {
if (uri.startsWith("www.")) {
uri = "http://" + uri;
} else if (uri.startsWith("WWW")) {
uri = "http://" + uri;
} else if (uri.startsWith("/http:")) {
uri = uri.substring(1);
} else if (isFileURIShortHand(uri)) {
uri = "file://" + uri;
}
}
private boolean isFileURIShortHand(String uri) {
if (uri.indexOf(":\\") == 1) {
return true;
}
if (uri.indexOf("c:/") == 0) {
return true;
}
if (uri.indexOf("d:/") == 0) {
return true;
}
return false;
}
/**
* Decomposes this uri into scheme, host, port and rest.
*/
private void decompose() {
java.net.URI neturi = java.net.URI.create(uri).normalize();
scheme = neturi.getScheme();
host = neturi.getHost();
boolean portAlreadyParsed = false;
// No host if the host contains underscores
if (host == null) {
host = neturi.getAuthority();
if (host != null) {
int colonPos = host.lastIndexOf(":");
if (!scheme.equals("file") && colonPos > -1) {
//we probably have an (illegal) URI of type http://under_score.com:5000/
try {
port = Integer.parseInt(host.substring(colonPos + 1));
host = host.substring(0, colonPos);
portAlreadyParsed = true;
} catch (NumberFormatException nfe) {
//empty
}
}
}
}
if ("file".equalsIgnoreCase(scheme)) {
if (host == null) {
host = "localhost";
} else {
host = repairWindowsDrive(host, uri);
}
}
if (host == null) {
throw new IllegalArgumentException(
"A complete uri must specify a host");
}
if (!portAlreadyParsed) {
port = neturi.getPort();
}
rest = (neturi.getRawPath() != null ? neturi.getRawPath() : "")
+ (neturi.getRawQuery() != null
? ("?" + neturi.getRawQuery())
: "")
+ (neturi.getRawFragment() != null
? ("#" + neturi.getRawFragment())
: "");
}
/** c: turns to c when interpreted by URI. Repair it */
private String repairWindowsDrive(String host, String uri) {
if (host.length() != 1) {
return host;
}
int driveIndex = uri.indexOf(host + ":");
if (driveIndex == 5 || driveIndex == 7) { // file: or file://
return host + ":";
} else {
return host;
}
}
/** "http://a/\u00E6" → "http://a/%E6;" */
private void escapeNonAscii() {
char[] uriChars = uri.toCharArray();
StringBuilder result = new StringBuilder(uri.length());
for (char uriChar : uriChars) {
if (uriChar >= 0x80 || uriChar == 0x22) {
result.append("%");
result.append(Integer.toHexString(uriChar));
result.append(";");
} else {
result.append(uriChar);
}
}
uri = result.toString();
}
/** "http://a/&" → "http://a/&" Currently ampersand only */
private void unescapeHtmlEntities() {
int ampIndex = uri.indexOf("&");
if (ampIndex < 0) {
return;
}
StringBuilder result = new StringBuilder(uri.substring(0, ampIndex));
while (ampIndex >= 0) {
result.append("&");
int nextAmpIndex = uri.indexOf("&", ampIndex + 5);
result.append(
uri.substring(ampIndex + 5,
nextAmpIndex > 0 ? nextAmpIndex : uri.length()));
ampIndex = nextAmpIndex;
}
uri = result.toString();
}
/** "HTTP://a" → "http://a" */
private void lowCaseHost() {
host = toLowerCase(host);
}
/** "http://a:80" → "http://a" and "https://a:443" → https//a */
private void removeDefaultPortNumber() {
if (port == 80 && scheme.equals("http")) {
port = -1;
} else if (port == 443 && scheme.equals("https")) {
port = -1;
}
}
/** "http://a./b" → "http://a/b" */
private void removeTrailingHostDot() {
if (host.endsWith(".")) {
host = host.substring(0, host.length() - 1);
}
}
/** "http://a//b" → "http://a/b" */
private void makeDoubleSlashesSingle() {
StringBuilder result = new StringBuilder(rest.length());
char[] restChars = rest.toCharArray();
for (int i = 0; i < restChars.length; i++) {
if (!(i + 1 < restChars.length && restChars[i] == '/'
&& restChars[i + 1] == '/')) {
result.append(restChars[i]);
}
}
rest = result.toString();
}
/** Recombines the uri from the scheme, host, port and rest */
private void recombine() {
StringBuilder recombined = new StringBuilder(100);
recombined.append(scheme);
recombined.append("://");
recombined.append(host);
if (port > -1) {
recombined.append(":").append(port);
}
if (rest != null) {
if (!rest.startsWith("/")) {
recombined.append("/");
}
recombined.append(rest);
} else {
recombined.append("/"); // RFC 2396 violation, as required by search
}
uri = recombined.toString();
}
/**
* Returns the normalized scheme of this URI.
*
* @return the normalized scheme (protocol), or null if there is none,
* which may only be the case with non-hierarchical URIs
*/
public String getScheme() {
return scheme;
}
/**
* Returns whether this URI is hierarchical or opaque.
* A typical example of an hierarchical URI is an URL,
* while URI's are mailto, news and such.
*
* @return true if the url is opaque, false if it is hierarchical
*/
public final boolean isOpaque() {
return opaque;
}
/**
* Returns the normalized host of this URI.
*
* @return the normalized host, or null if there is none, which may
* only be the case if this is a non-hierarchical uri
*/
public String getHost() {
return host;
}
/** Returns the port number of this scheme if set explicitly, or -1 otherwise */
public int getPort() {
return port;
}
/**
* Returns the rest of this uri, that is what is following the host or port.
* This is path, query and fragment as defined in RFC 2396. Returns an empty string
* if this uri has no rest.
*/
public String getRest() {
if (rest == null) {
return null;
} else if (rest.equals("/")) {
return "";
} else {
return rest;
}
}
public String getDomain() {
if (parsedDomain) {
return domain;
}
String host = getHost();
if (host == null) return null;
int firstDotPos = host.indexOf(".");
int lastDotPos = host.lastIndexOf(".");
String domain;
if (firstDotPos < 0) {
// "." was not found at all
domain = host;
} else if (firstDotPos == lastDotPos) {
//there is only one "." in the host
domain = host;
} else {
//for www.host.com return host.com
//TODO: Must be corrected when implementing tldlist
domain = host.substring(firstDotPos + 1, host.length());
}
this.parsedDomain = true;
this.domain = domain;
return domain;
}
public String getMainTld() {
if (parsedMainTld) {
return mainTld;
}
String host = getHost();
if (host == null) return null;
int lastDotPos = host.lastIndexOf(".");
String mainTld;
if (lastDotPos < 0) {
//no ".", no TLD
mainTld = null;
} else if (lastDotPos == host.length() - 1) {
//the "." is the last character
mainTld = null;
} else {
//for www.yahoo.co.uk return uk
//TODO: Implement list of TLDs from config?
mainTld = host.substring(lastDotPos + 1);
}
this.parsedMainTld = true;
this.mainTld = mainTld;
return mainTld;
}
public String getPath() {
if (parsedPath) {
return path;
}
String rest = this.rest;
if (rest == null) return null;
rest = removeFragment(rest);
int queryPos = rest.lastIndexOf("?");
if (queryPos > -1) {
rest = rest.substring(0, queryPos);
}
this.parsedPath = true;
this.path = rest;
return this.path;
}
private String removeFragment(String path) {
int fragmentPos = path.lastIndexOf("#");
return (fragmentPos > -1) ? path.substring(0, fragmentPos) : path;
}
public String getFilename() {
if (parsedFilename) {
return filename;
}
String path = getPath();
if (path == null) return null;
path = removeParams(path);
int lastSlash = path.lastIndexOf("/");
String filename;
if (lastSlash < 0) {
//there is no slash, return the path, excluding params
filename = path;
} else if (lastSlash == path.length() - 1) {
//the slash is the last character, there is no filename here
filename = "";
} else {
filename = path.substring(lastSlash + 1);
}
this.parsedFilename = true;
this.filename = filename;
return filename;
}
private String removeParams(String filename) {
int firstSemicolon = filename.indexOf(";");
if (firstSemicolon < 0) {
//there are no params
return filename;
}
return filename.substring(0, firstSemicolon);
}
public String getExtension() {
if (parsedExtension) {
return extension;
}
String filename = getFilename();
if (filename == null) return null;
int lastDotPos = filename.lastIndexOf(".");
String extension;
if (lastDotPos < 0) {
//there is no ".", there is no extension
extension = null;
} else if (lastDotPos == filename.length() - 1) {
//the "." is the last character, there is no extension
extension = null;
} else {
extension = filename.substring(lastDotPos + 1);
}
this.parsedExtension = true;
this.extension = extension;
return extension;
}
public String getQuery() {
if (parsedQuery) {
return query;
}
String rest = this.rest;
if (rest == null) return null;
rest = removeFragment(rest);
int queryPos = rest.lastIndexOf("?");
String query = null;
if (queryPos > -1) {
//we have a query
query = rest.substring(queryPos+1);
}
this.parsedQuery = true;
this.query = query;
return query;
}
public String getFragment() {
if (parsedFragment) {
return fragment;
}
String path = this.rest;
if (path == null) return null;
int fragmentPos = path.lastIndexOf("#");
String fragment = null;
if (fragmentPos > -1) {
//we have a fragment
fragment = path.substring(fragmentPos+1);
}
this.parsedFragment = true;
this.fragment = fragment;
return fragment;
}
public String getParams() {
if (parsedParams) {
return params;
}
String path = getPath();
if (path == null) return null;
int semicolonPos = path.indexOf(";");
String params;
if (semicolonPos < 0) {
//there is no semicolon, there are no params here
params = null;
} else if (semicolonPos == path.length() - 1) {
//the semicolon is the last character, there are no params here
params = null;
} else {
params = path.substring(semicolonPos + 1);
}
this.parsedParams = true;
this.params = params;
return params;
}
public static String[] tokenize(String item) {
return tokenizePattern.split(item);
}
public List tokenize() {
List tokens = new ArrayList<>();
tokens.addAll(tokenize(URLContext.URL_SCHEME, getScheme()));
tokens.addAll(tokenize(URLContext.URL_HOST, getHost()));
tokens.addAll(tokenize(URLContext.URL_PORT, getPort() > -1 ? "" + getPort() : null));
tokens.addAll(tokenize(URLContext.URL_PATH, getPath()));
tokens.addAll(tokenize(URLContext.URL_QUERY, getQuery()));
tokens.addAll(tokenize(URLContext.URL_FRAGMENT, getFragment()));
return tokens;
}
private List tokenize(URLContext context, String item) {
if (item == null) {
return new ArrayList<>(0);
}
String[] tokenStrings = tokenize(item);
List tokens = new ArrayList<>(tokenStrings.length);
for (String tokenString : tokenStrings) {
if (tokenString.length() > 0) {
tokens.add(new Token(context, tokenString));
}
}
return tokens;
}
/** Returns an explanation of why this uri is invalid, or null if it is valid */
public String getInvalidExplanation() {
return invalidExplanation;
}
public int hashCode() {
return uri.hashCode();
}
public boolean equals(Object object) {
if (!(object instanceof URI)) {
return false;
}
return (toString().equals(object.toString()));
}
public int compareTo(URI object) {
return toString().compareTo(object.toString());
}
public Object clone() {
try {
return super.clone();
} catch (CloneNotSupportedException e) {
throw new RuntimeException("Someone made me unclonable!", e);
}
}
/** Returns a new URI with a changed scheme */
public URI setScheme(String scheme) {
return new URI(scheme, host, port, rest);
}
/** Returns a new URI with a changed host (or authority) */
public URI setHost(String host) {
return new URI(scheme, host, port, rest);
}
/** Returns a new URI with a changed port */
public URI setPort(int port) {
return new URI(scheme, host, port, rest);
}
/** Returns a new URI with a changed rest */
public URI setRest(String rest) {
return new URI(scheme, host, port, rest);
}
/** Returns a new uri with the an additional parameter */
public URI addParameter(String name, String value) {
String newRest = rest;
if (newRest == null) {
newRest = "";
}
if ( newRest.contains("?")) {
newRest += "&";
} else {
newRest += "?";
}
newRest += name + "=" + value;
return new URI(scheme, host, port, newRest);
}
/** Returns this uri as a string */
public String stringValue() {
return uri;
}
/** Returns this URI as a string */
public String toString() {
return uri;
}
/**
* Returns the depth of this uri.
* The depth of an hierarchical uri equals the number of slashes
* which are not separating the protocol and the host, and not at the end.
*
* @return the depth of this uri if it is hierarchical, or 0 if it is opaque
*/
public int getDepth() {
int colonIndex = uri.indexOf(':');
// count number of slashes in the Uri
int currentIndex = colonIndex;
int depth = 0;
while (currentIndex != -1) {
currentIndex = uri.indexOf('/', currentIndex);
if (currentIndex != -1) {
depth++;
currentIndex++;
}
}
if (uri.charAt(colonIndex + 1) == '/') {
depth--;
}
if (uri.charAt(colonIndex + 2) == '/') {
depth--;
}
if ((uri.charAt(uri.length() - 1) == '/')
&& ((uri.length() - 1) > (colonIndex + 2))) {
depth--;
}
return depth;
}
public static class Token {
private final URLContext context;
private final String token;
private Token(URLContext context, String token) {
this.context = context;
this.token = token;
}
public URLContext getContext() {
return context;
}
public String getToken() {
return token;
}
}
public enum URLContext {
URL_SCHEME(0, "scheme"),
URL_HOST(1, "host"),
URL_DOMAIN(2, "domain"),
URL_MAINTLD(3, "maintld"),
URL_PORT(4, "port"),
URL_PATH(5, "path"),
URL_FILENAME(6, "filename"),
URL_EXTENSION(7, "extension"),
URL_PARAMS(8, "params"),
URL_QUERY(9, "query"),
URL_FRAGMENT(10, "fragment");
public final int id;
public final String name;
URLContext(int id, String name) {
this.id = id;
this.name = name;
}
}
}