org.jsoup.helper.UrlBuilder Maven / Gradle / Ivy
Go to download
SDK for dev_appserver (local development) with some of the dependencies shaded (repackaged)
package org.jsoup.helper;
import org.jsoup.Connection;
import org.jsoup.internal.StringUtil;
import org.jspecify.annotations.Nullable;
import java.io.UnsupportedEncodingException;
import java.net.IDN;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLDecoder;
import java.net.URLEncoder;
import static org.jsoup.helper.DataUtil.UTF_8;
/**
A utility class to normalize input URLs. jsoup internal; API subject to change.
Normalization includes puny-coding the host, and encoding non-ascii path components. Any non-ascii characters in
the query string (or the fragment/anchor) are escaped, but any existing escapes in those components are preserved.
*/
final class UrlBuilder {
URL u;
@Nullable StringBuilder q;
UrlBuilder(URL inputUrl) {
this.u = inputUrl;
if (u.getQuery() != null)
q = StringUtil.borrowBuilder().append(u.getQuery());
}
URL build() {
try {
// use the URI class to encode non-ascii in path
URI uri = new URI(
u.getProtocol(),
u.getUserInfo(),
IDN.toASCII(decodePart(u.getHost())), // puny-code
u.getPort(),
null, null, null // path, query and fragment appended later so as not to encode
);
StringBuilder normUrl = StringUtil.borrowBuilder().append(uri.toASCIIString());
appendToAscii(u.getPath(), false, normUrl);
if (q != null) {
normUrl.append('?');
appendToAscii(StringUtil.releaseBuilder(q), true, normUrl);
}
if (u.getRef() != null) {
normUrl.append('#');
appendToAscii(u.getRef(), false, normUrl);
}
u = new URL(StringUtil.releaseBuilder(normUrl));
return u;
} catch (MalformedURLException | URISyntaxException | UnsupportedEncodingException e) {
// we assert here so that any incomplete normalization issues can be caught in devel. but in practise,
// the remote end will be able to handle it, so in prod we just pass the original URL.
// The UnsupportedEncodingException would never happen as always UTF8
assert Validate.assertFail(e.toString());
return u;
}
}
void appendKeyVal(Connection.KeyVal kv) throws UnsupportedEncodingException {
if (q == null)
q = StringUtil.borrowBuilder();
else
q.append('&');
q
.append(URLEncoder.encode(kv.key(), UTF_8.name()))
.append('=')
.append(URLEncoder.encode(kv.value(), UTF_8.name()));
}
private static String decodePart(String encoded) {
try {
return URLDecoder.decode(encoded, UTF_8.name());
} catch (UnsupportedEncodingException e) {
throw new RuntimeException(e); // wtf!
}
}
private static void appendToAscii(String s, boolean spaceAsPlus, StringBuilder sb) throws UnsupportedEncodingException {
// minimal normalization of Unicode -> Ascii, and space normal. Existing escapes are left as-is.
for (int i = 0; i < s.length(); i++) {
int c = s.codePointAt(i);
if (c == ' ') {
sb.append(spaceAsPlus ? '+' : "%20");
} else if (c > 127) { // out of ascii range
sb.append(URLEncoder.encode(new String(Character.toChars(c)), UTF_8.name()));
// ^^ is a bit heavy-handed - if perf critical, we could optimize
if (Character.charCount(c) == 2) i++; // advance past supplemental
} else {
sb.append((char) c);
}
}
}
}