
com.github.dirkraft.geturi.GetURI Maven / Gradle / Ivy
/**
* Copyright (c) 2014 Jason Dunkelberger (dirkraft)
*
* See the file license.txt for copying permission.
*/
package com.github.dirkraft.geturi;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class GetURI {
private static final Pattern PAT_BAD_QUERY_CHAR = Pattern.compile("^Illegal character in (query|path) at index (\\d+):.*");
/**
* Maximum number of exceptions to tolerate and attempt to fix in appeasing URI.
*/
public static final int MAX_GRIT = 100;
/**
* A collection of UGHWTF to attempt to validify (to make valid) urls that would otherwise trigger
* {@link URISyntaxException}s with {@link URI}. The problem is Java's rfc3986-strict URI parser. Servers accept
* whatever bytes you send to them. By "fixing" a URI, you are changing the link. This tries to
* do the minimum possible to a URI through the java.net.URI
exception wall. Yay, you can still use
* Apache HttpClient, or whatever else is hard set on java.net.URI, unless of course these changes cause the server
* at the end to return something else, in which case, (╯°□°)╯︵ ┻━┻
*
* This method is probably not performant, but it guarantees the absolute minimum changes required to pass
* validation because it bases "fixes" directly on the URISyntaxExceptions themselves.
*
* Rant:
* YEAH, you could "fix" the URL to coincide with some well-defined standard with reserved characters
* (I'm looking at you, rfc3986). Then you wouldn't be doing something so terrible as parsing exception
* message strings (as this method does), and in the least performant way possible.
*
* But the assumption there is that you would be parsing the (invalid) form of the URI identical to the way
* java.net.URI does in its private code, because if you tried to eagerly fix all the broken stuff, you may have
* changed the original URL TOO MUCH to even be the same request anymore. The thing is, servers accept
* rfc3986-invalid requests. Java just won't make them. So the standard is not at the other end. You can try to
* eagerly fix it in one pass if you wish, but you would have to clone and modify java.net.URI's logic to make
* Java happy and minimize changes to the original request.
*
* Also realize that the standard defines the correct form, not how to fallback and resolve errors in
* non-compliant forms. The correctest fix would be to fork Apache HttpClient, replace all usages of java.net.URI
* with a custom URI class with configurable, arbitrary amounts of encoding and validation, so that any request, no
* matter how subjectively horrible, could put its bytes on the wire to give the horrible remote server gods at the
* other ends of the ether whatever byte sacrifice they require to release their particular document-form blessings.
*
* So what I'm saying is: don't try to "fix" this anymore than is necessary. Also don't try
* to be efficient in "fixing" URLs. You don't even know what "fixed" is. You can't because it's really up
* to each server. Do not stare directly at java.net.URI source.
*
*
Everything is fine.
*
* @throws RuntimeException "orDieTryin", if unable to appease the URI gods of Java.
*/
public static URI orDieTryin(String url) throws RuntimeException {
URI uri = null;
for (int i = 0; uri == null && i < MAX_GRIT; i++) { // maximum cleaning iterations for safety
try {
uri = new URI(url);
} catch (URISyntaxException e) {
if (e.getMessage() == null) {
throw new RuntimeException(e);
}
boolean handled = false;
Matcher m = PAT_BAD_QUERY_CHAR.matcher(e.getMessage());
if (m.find()) {
int badCharIdx = Integer.parseInt(m.group(2));
char badChar = url.charAt(badCharIdx);
String hex = Integer.toHexString(badChar);
if (hex.length() == 1) {
hex = '0' + hex;
}
if (hex.length() != 2) {
throw new RuntimeException("Character " + badChar + " hexed too large " + hex +
". I only know how to make 2-digit %-hex codes.");
}
String pfx = url.substring(0, badCharIdx);
String sfx = badCharIdx + 1 == url.length() ? "" : url.substring(badCharIdx + 1);
url = pfx + '%' + hex + sfx;
handled = true;
}
if (!handled) {
throw new RuntimeException(e);
}
}
}
return uri;
}
}