/*
 * org.archive.url.BasicURLCanonicalizer
 *
 * Source listing taken from the webarchive-commons artifact (newest version),
 * which is available for Maven / Gradle / Ivy builds.
 */
package org.archive.url;

import java.net.IDN;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CoderResult;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.google.common.net.InetAddresses;

/**
 * Canonicalizer that does more or less basic fixup. Based initially on the
 * rules specified at
 * https://developers.google.com/safe-browsing/developers_guide_v2#Canonicalization
 * These rules are designed for clients of Google's "experimental" Safe
 * Browsing API to "check URLs against Google's constantly-updated blacklists
 * of suspected phishing and malware pages".
 *
 * <p>
 * This class differs from Google in its treatment of non-ascii input. Google's
 * rules don't really address this except with one example test case, which
 * seems to suggest taking raw input bytes and pct-encoding them byte for byte.
 * Since the input to this class consists of java strings, not raw bytes, that
 * wouldn't be possible, even if deemed preferable. Instead,
 * BasicURLCanonicalizer expresses non-ascii characters as pct-encoded UTF-8.
 *
 * <p>
 * All output is produced independently of the JVM's default locale.
 */
public class BasicURLCanonicalizer implements URLCanonicalizer {
	/**
	 * Dotted host whose first part starts with '0'. Every part is restricted
	 * to octal digits; parts after the first are optional in the pattern.
	 */
	static final Pattern OCTAL_IP = Pattern
			.compile("^(0[0-7]*)(\\.[0-7]+)?(\\.[0-7]+)?(\\.[0-7]+)?$");

	/**
	 * Dotted host whose first part starts with 1-9. Every part is restricted
	 * to decimal digits; parts after the first are optional in the pattern.
	 */
	static final Pattern DECIMAL_IP = Pattern
			.compile("^([1-9][0-9]*)(\\.[0-9]+)?(\\.[0-9]+)?(\\.[0-9]+)?$");

	/** Host consisting of digits only (a single-number IP address). */
	private static final Pattern ALL_DIGITS = Pattern.compile("^\\d+$");

	/** One or more dots at the start of a host. */
	private static final Pattern LEADING_DOTS = Pattern.compile("^\\.+");

	/** A run of two or more consecutive dots inside a host. */
	private static final Pattern REPEATED_DOTS = Pattern.compile("\\.\\.+");

	/** A single dot at the end of a host. */
	private static final Pattern TRAILING_DOT = Pattern.compile("\\.$");

	/** Splits a path into its '/'-separated segments. */
	private static final Pattern SINGLE_FORWARDSLASH_PATTERN = Pattern
			.compile("/");

	/** Upper-case hex digits used when writing %XX escapes. */
	private static final char[] HEX_DIGITS = "0123456789ABCDEF".toCharArray();

	/** Lazily initialized UTF-8 charset; access through {@link #UTF8()}. */
	protected static Charset _UTF8 = null;

	/**
	 * Canonicalizes the given URL in place: clears the hash, minimally
	 * escapes user, password and query, normalizes the host (IDN to ASCII,
	 * dot cleanup, IP normalization or lower-casing plus escaping) and
	 * normalizes and escapes the path.
	 *
	 * @param url
	 *            URL to canonicalize; modified in place
	 */
	public void canonicalize(HandyURL url) {
		url.setHash(null);
		url.setAuthUser(minimalEscape(url.getAuthUser()));
		url.setAuthPass(minimalEscape(url.getAuthPass()));

		url.setQuery(minimalEscape(url.getQuery()));

		String hostE = unescapeRepeatedly(url.getHost());
		String host = null;
		if (hostE != null) {
			try {
				host = IDN.toASCII(hostE);
			} catch (IllegalArgumentException e) {
				// TODO: decide whether failures other than "A prohibited
				// code point was found" should be rethrown. For now every
				// host that IDN rejects is kept in its unescaped form.
				host = hostE;
			}
			// strip leading dots, collapse runs of dots, strip one trailing
			// dot (in that order)
			host = LEADING_DOTS.matcher(host).replaceAll("");
			host = REPEATED_DOTS.matcher(host).replaceAll(".");
			host = TRAILING_DOT.matcher(host).replaceAll("");
		}

		String ip = attemptIPFormats(host);
		if (ip != null) {
			host = ip;
		} else if (host != null) {
			// Locale.ROOT: the default locale must not influence the result
			// (e.g. Turkish maps 'I' to a dotless i)
			host = escapeOnce(host.toLowerCase(Locale.ROOT));
		}
		url.setHost(host);

		// now the path:
		String path = unescapeRepeatedly(url.getPath());
		url.setPath(escapeOnce(normalizePath(path)));
	}

	/**
	 * Resolves "." and ".." segments and collapses repeated slashes in a
	 * path. Whatever precedes the first '/' is dropped, a ".." with nothing
	 * left to pop is kept as is, and a trailing '/' is preserved.
	 *
	 * @param path
	 *            unescaped path, may be null
	 * @return normalized path, always starting with '/'; "/" for null or
	 *         when no segment remains
	 */
	public String normalizePath(String path) {
		if (path == null) {
			return "/";
		}
		// -1 gives an empty trailing element if path ends with '/':
		String[] segments = SINGLE_FORWARDSLASH_PATTERN.split(path, -1);
		List<String> kept = new ArrayList<String>(segments.length);
		// segments[0] is the text before the first '/' and is always skipped
		for (int i = 1; i < segments.length; i++) {
			String segment = segments[i];
			if (".".equals(segment)) {
				// skip
				continue;
			}
			if ("..".equals(segment) && !kept.isEmpty()) {
				// pop the last kept segment
				kept.remove(kept.size() - 1);
			} else {
				// regular segment, or a ".." at the root
				// TODO: leave a root-level ".."? let's do for now...
				kept.add(segment);
			}
		}

		int numKept = kept.size();
		if (numKept == 0) {
			return "/";
		}
		StringBuilder sb = new StringBuilder(path.length() + 1);
		sb.append('/');
		for (int i = 0; i < numKept - 1; i++) {
			String segment = kept.get(i);
			if (segment.length() > 0) {
				// skipping empty segments omits multiple slashes:
				sb.append(segment).append('/');
			}
		}
		// the last segment is appended even when empty (trailing slash)
		sb.append(kept.get(numKept - 1));
		return sb.toString();
	}

	/**
	 * Tries to read the host as an IPv4 address written as a single number,
	 * as four octal parts or as four decimal parts.
	 *
	 * <p>
	 * NOTE(review): a single number that does not fit in 32 bits is silently
	 * truncated by the int conversion rather than rejected; confirm whether
	 * that is intended.
	 *
	 * @param host
	 *            host to examine, may be null
	 * @return dotted-decimal address, or null if the host is not in one of
	 *         the recognized forms
	 */
	public String attemptIPFormats(String host) { // throws URIException {
		if (host == null) {
			return null;
		}
		if (ALL_DIGITS.matcher(host).matches()) {
			try {
				long value = Long.parseLong(host);
				return InetAddresses.fromInteger((int) value).getHostAddress();
			} catch (NumberFormatException e) {
				// too many digits for a long: not treated as an IP
				return null;
			}
		}
		// check for octal:
		Matcher octal = OCTAL_IP.matcher(host);
		if (octal.matches()) {
			return dottedQuad(octal, 8);
		}
		Matcher decimal = DECIMAL_IP.matcher(host);
		if (decimal.matches()) {
			return dottedQuad(decimal, 10);
		}
		return null;
	}

	/**
	 * Converts the four groups of a matched {@link #OCTAL_IP} or
	 * {@link #DECIMAL_IP} matcher into dotted-decimal form.
	 *
	 * @param m
	 *            matcher on which matches() has succeeded
	 * @param radix
	 *            radix in which every part is parsed
	 * @return dotted-decimal address, or null if a part is missing, does not
	 *         parse or lies outside 0-255
	 */
	private static String dottedQuad(Matcher m, int radix) {
		int[] ip = new int[4];
		for (int i = 0; i < ip.length; i++) {
			String group = m.group(i + 1);
			if (group == null) {
				// fewer than four parts
				return null;
			}
			// groups after the first include their leading '.'
			String digits = (i == 0) ? group : group.substring(1);
			int octet;
			try {
				octet = Integer.parseInt(digits, radix);
			} catch (NumberFormatException e) {
				return null;
			}
			if ((octet < 0) || (octet > 255)) {
				return null;
				// throw new URIException("Bad Host("+host+")");
			}
			ip[i] = octet;
		}
		// plain concatenation keeps the digits independent of the default
		// locale
		return ip[0] + "." + ip[1] + "." + ip[2] + "." + ip[3];
	}

	/**
	 * Fully unescapes the input, then escapes it exactly once.
	 *
	 * @param input
	 *            string to re-escape, may be null
	 * @return minimally escaped string, or null for null input
	 */
	public String minimalEscape(String input) {
		return escapeOnce(unescapeRepeatedly(input));
	}

	/**
	 * @return the UTF-8 charset, looked up on first use
	 */
	protected static Charset UTF8() {
		if (_UTF8 == null) {
			_UTF8 = Charset.forName("UTF-8");
		}
		return _UTF8;
	}

	/**
	 * Percent-encodes the UTF-8 bytes of the input that are at or below the
	 * space character, at or above 0x80, or equal to '#' or '%'.
	 *
	 * @param input
	 *            String to be percent-encoded. Assumed to be fully unescaped.
	 * @return percent-encoded string, the input itself if nothing needed
	 *         escaping, or null for null input
	 */
	public String escapeOnce(String input) {
		if (input == null) {
			return null;
		}

		byte[] utf8bytes = input.getBytes(UTF8());
		// created lazily, on the first byte that needs escaping
		StringBuilder sb = null;

		for (int i = 0; i < utf8bytes.length; i++) {
			int b = utf8bytes[i] & 0xff;
			boolean ok = (b > 32) && (b < 128) && (b != '#') && (b != '%');
			if (ok) {
				if (sb != null) {
					sb.append((char) b);
				}
				continue;
			}
			if (sb == null) {
				/*
				 * everything up to this point has been an ascii character
				 * not needing escaping, so byte index == char index
				 */
				sb = new StringBuilder(input.substring(0, i));
			}
			if (b == '%' && i < utf8bytes.length - 2) {
				// Any hex escapes left at this point represent non-UTF-8
				// encoded characters. Unescape them, so they don't get
				// double escaped
				int hex1 = getHex(utf8bytes[i + 1]);
				if (hex1 >= 0) {
					int hex2 = getHex(utf8bytes[i + 2]);
					if (hex2 >= 0) {
						i = i + 2;
						b = hex1 * 16 + hex2;
					}
				}
			}
			// b is within 0..255 here, so this always writes two digits
			sb.append('%').append(HEX_DIGITS[b >> 4])
					.append(HEX_DIGITS[b & 0x0f]);
		}
		if (sb == null) {
			return input;
		}
		return sb.toString();
	}

	/**
	 * Applies {@link #decode(String)} until the result stops changing.
	 *
	 * @param input
	 *            possibly multiply escaped string, may be null
	 * @return fully unescaped string, or null for null input
	 */
	public String unescapeRepeatedly(String input) {
		if (input == null) {
			return null;
		}
		while (true) {
			String un = decode(input);
			if (un.equals(input)) {
				return input;
			}
			input = un;
		}
	}

	/**
	 * Performs one pass of percent-decoding. Escapes below 0x80 become the
	 * corresponding character; runs of escapes starting at 0x80 or above are
	 * decoded as UTF-8, and the parts of a run that are not valid UTF-8 are
	 * left in their %xx form.
	 *
	 * @param input
	 *            string to decode, may be null
	 * @return decoded string, the input itself if it contains no valid
	 *         escape, or null for null input
	 */
	public String decode(String input) {
		if (input == null) {
			return null;
		}
		// created lazily, on the first valid %xx escape
		StringBuilder sb = null;
		// index in input where the pending run of UTF-8 escapes starts, or -1
		int pctUtf8SeqStart = -1;
		ByteBuffer bbuf = null;
		CharsetDecoder utf8decoder = null;
		int i = 0;
		int h1, h2;
		while (i < input.length()) {
			char c = input.charAt(i);
			if (i <= input.length() - 3 && c == '%'
					&& (h1 = getHex(input.charAt(i + 1))) >= 0
					&& (h2 = getHex(input.charAt(i + 2))) >= 0) {
				if (sb == null) {
					sb = new StringBuilder(input.length());
					if (i > 0) {
						sb.append(input.substring(0, i));
					}
				}
				int b = ((h1 << 4) + h2) & 0xff;
				if (pctUtf8SeqStart < 0 && b < 0x80) { // plain ascii
					sb.append((char) b);
				} else {
					if (pctUtf8SeqStart < 0) {
						pctUtf8SeqStart = i;
						if (bbuf == null) {
							// enough for every escape that could still follow
							bbuf = ByteBuffer
									.allocate((input.length() - i) / 3);
						}
					}
					// once a run has started, ascii escapes join it too
					bbuf.put((byte) b);
				}
				i += 3;
			} else {
				if (pctUtf8SeqStart >= 0) {
					// a literal character ends the pending run
					if (utf8decoder == null) {
						utf8decoder = UTF8().newDecoder();
					}
					appendDecodedPctUtf8(sb, bbuf, input, pctUtf8SeqStart, i,
							utf8decoder);
					pctUtf8SeqStart = -1;
					bbuf.clear();
				}
				if (sb != null) {
					sb.append(c);
				}
				i++;
			}
		}
		if (pctUtf8SeqStart >= 0) {
			// input ended inside a run
			if (utf8decoder == null) {
				utf8decoder = UTF8().newDecoder();
			}
			appendDecodedPctUtf8(sb, bbuf, input, pctUtf8SeqStart, i,
					utf8decoder);
		}

		if (sb != null) {
			return sb.toString();
		} else {
			return input;
		}
	}

	/**
	 * Decodes bytes in bbuf as utf-8 and appends decoded characters to sb. If
	 * decoding of any portion fails, appends the un-decodable %xx%xx sequence
	 * extracted from inputStr instead of decoded characters. See "bad unicode"
	 * tests in BasicURLCanonicalizerTest#testDecode(). Variables only make sense
	 * within context of {@link #decode(String)}.
	 * 
	 * @param sb
	 *            StringBuilder to append to
	 * @param bbuf
	 *            raw bytes decoded from %-encoded input
	 * @param inputStr
	 *            full input string
	 * @param seqStart
	 *            start index inclusive within inputStr of %-encoded sequence
	 * @param seqEnd
	 *            end index exclusive within inputStr of %-encoded sequence
	 * @param utf8decoder
	 *            decoder to use; it is reset before decoding
	 */
	private void appendDecodedPctUtf8(StringBuilder sb, ByteBuffer bbuf,
			String inputStr, int seqStart, int seqEnd,
			CharsetDecoder utf8decoder) {
		// assert bbuf.position() * 3 == seqEnd - seqStart;
		utf8decoder.reset();
		CharBuffer cbuf = CharBuffer.allocate(bbuf.position());
		bbuf.flip();
		while (bbuf.position() < bbuf.limit()) {
			CoderResult coderResult = utf8decoder.decode(bbuf, cbuf, true);
			// append whatever was decoded before the decoder stopped
			cbuf.flip();
			sb.append(cbuf);
			if (coderResult.isMalformed()) {
				// put the malformed %xx%xx into the result un-decoded; each
				// byte in bbuf corresponds to three chars of inputStr
				CharSequence undecodablePctHex = inputStr.subSequence(seqStart
						+ 3 * bbuf.position(), seqStart + 3 * bbuf.position()
						+ 3 * coderResult.length());
				sb.append(undecodablePctHex);

				// there could be more good stuff after the bad
				bbuf.position(bbuf.position() + coderResult.length());
			}
			cbuf.clear();
		}
	}

	/**
	 * @param b
	 *            character or byte value to interpret as a hex digit
	 * @return value 0-15 of the hex digit (either case), or -1 if b is not a
	 *         hex digit
	 */
	public int getHex(final int b) {
		if (b >= '0' && b <= '9') {
			return b - '0';
		}
		if (b >= 'A' && b <= 'F') {
			return 10 + (b - 'A');
		}
		if (b >= 'a' && b <= 'f') {
			return 10 + (b - 'a');
		}
		return -1;
	}

}