// All downloads are free. Search and download functionality uses the official Maven repository.
//
// org.javaweb.utils.URLCanonicalizerUtils Maven / Gradle / Ivy
//
// There is a newer version: 2.0.3
// Show newest version
/*
 * Copyright yz 2016-01-14  Email:[email protected].
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.javaweb.utils;

import java.net.*;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.regex.Pattern;

/**
 * URL canonicalization (normalization) utilities.
 * <p>
 * See http://en.wikipedia.org/wiki/URL_normalization for a reference.
 * Note: some parts of the code are adapted from
 * http://stackoverflow.com/a/4057470/405418
 *
 * @author Yasser Ganjisaffar
 */
public class URLCanonicalizerUtils {

	/**
	 * Matches an explicit {@code http://} or {@code https://} prefix, ignoring case.
	 */
	private static final Pattern HTTP_SCHEME_PREFIX = Pattern.compile("^https?://", Pattern.CASE_INSENSITIVE);

	/**
	 * Matches a run of one or more consecutive slashes.
	 */
	private static final Pattern SLASH_RUN = Pattern.compile("/+");

	/**
	 * Canonicalizes a URL given as text and returns it as text.
	 *
	 * @param url the URL to canonicalize; {@code http://} is prepended when it
	 *            does not already start with {@code http://} or {@code https://}
	 * @return the canonical URL in external form, or {@code null} when
	 * {@code url} is {@code null} or cannot be parsed
	 */
	public static String getCanonicalURL(String url) {
		final URL canonicalURL = getCanonicalURL(url, null);
		return canonicalURL == null ? null : canonicalURL.toExternalForm();
	}

	/**
	 * Builds a canonical {@link URL} object.
	 * <p>
	 * The path has line breaks removed, backslashes turned into slashes, dot
	 * segments resolved and duplicate slashes collapsed. Query parameters are
	 * sorted by name, re-encoded, and session-id parameters are dropped. The
	 * protocol and host are lowercased and the protocol's default port is
	 * omitted. Any fragment is discarded.
	 *
	 * @param href    the URL (or, when {@code context} is given, a possibly
	 *                relative reference) to canonicalize
	 * @param context the base URL used to resolve {@code href}, or {@code null}
	 *                to treat {@code href} as a standalone URL
	 * @return the canonical URL, or {@code null} when {@code href} is
	 * {@code null} or either argument cannot be parsed as a URL
	 */
	public static URL getCanonicalURL(String href, String context) {
		if (href == null) {
			return null;
		}

		try {
			String spec = href;

			// Guess a scheme only for standalone input. When a context is supplied the
			// href may be relative, and prefixing it would make it absolute and cause
			// the context to be ignored.
			if (context == null && !HTTP_SCHEME_PREFIX.matcher(spec).find()) {
				spec = "http://" + spec;
			}

			final URL parsed = (context == null) ? new URL(spec) : new URL(new URL(context), spec);

			final String path        = canonicalizePath(parsed.getPath());
			final String queryString = canonicalizeQuery(parsed.getQuery());

			// Drop default port: example.com:80 -> example.com
			int port = parsed.getPort();
			if (port == parsed.getDefaultPort()) {
				port = -1;
			}

			// Locale.ROOT keeps the lowercasing independent of the JVM default locale.
			final String protocol = parsed.getProtocol().toLowerCase(Locale.ROOT);
			final String host     = parsed.getHost().toLowerCase(Locale.ROOT);

			return new URL(protocol, host, port, normalizePath(path) + queryString);
		} catch (MalformedURLException ex) {
			return null;
		}
	}

	/**
	 * Cleans up the path component of a URL.
	 *
	 * @param rawPath the path as returned by {@link URL#getPath()}
	 * @return the cleaned path; {@code "/"} when the result would be empty
	 */
	private static String canonicalizePath(final String rawPath) {
		// Strip line breaks.
		String path = rawPath.replace("\r", "").replace("\n", "");

		// Turn every backslash into a slash; runs are collapsed further below.
		path = path.replace('\\', '/');

		/*
		 * Normalize: no segments equal to ".", and no segments equal to ".."
		 * that are preceded by a segment not equal to "..".
		 */
		try {
			path = new URI(path).normalize().toString();
		} catch (URISyntaxException ignored) {
			// Best effort: a path that is not a valid URI is kept as it is.
		}

		// Convert '//' -> '/'
		path = SLASH_RUN.matcher(path).replaceAll("/");

		// Drop leading '/../' segments, which cannot climb above the root.
		while (path.startsWith("/../")) {
			path = path.substring(3);
		}

		path = path.trim();

		return path.isEmpty() ? "/" : path;
	}

	/**
	 * Produces the canonical query part of a URL, including the leading
	 * {@code '?'}.
	 *
	 * @param rawQuery the query as returned by {@link URL#getQuery()}, may be
	 *                 {@code null}
	 * @return {@code "?"} followed by the canonical parameters, or an empty
	 * string when no parameter remains
	 */
	private static String canonicalizeQuery(final String rawQuery) {
		final SortedMap<String, String> params = createParameterMap(rawQuery);

		if (params == null || params.isEmpty()) {
			return "";
		}

		final String canonicalParams = canonicalize(params);
		return canonicalParams.isEmpty() ? "" : "?" + canonicalParams;
	}

	/**
	 * Takes a query string, separates the constituent name-value pairs, and
	 * stores them in a SortedMap ordered by the natural order of the names.
	 * When a name occurs more than once, the last value wins.
	 *
	 * @param queryString the raw query string, may be {@code null}
	 * @return Null if there is no query string.
	 */
	private static SortedMap<String, String> createParameterMap(final String queryString) {
		if (queryString == null || queryString.isEmpty()) {
			return null;
		}

		final SortedMap<String, String> params = new TreeMap<String, String>();

		for (final String pair : queryString.split("&")) {
			if (pair.isEmpty()) {
				continue;
			}

			// With a limit of 2 the split yields two tokens whenever '=' is present
			// (either of which may be empty) and a single token otherwise.
			final String[] tokens = pair.split("=", 2);
			params.put(tokens[0], tokens.length == 2 ? tokens[1] : "");
		}

		return params;
	}

	/**
	 * Canonicalize the query string. Parameters named {@code jsessionid},
	 * {@code phpsessid} or {@code aspsessionid} (in any letter case) are
	 * omitted, and a parameter with an empty value is written without
	 * {@code '='}.
	 *
	 * @param sortedParamMap Parameter name-value pairs in sorted order.
	 * @return Canonical form of query string.
	 */
	private static String canonicalize(final SortedMap<String, String> sortedParamMap) {
		if (sortedParamMap == null || sortedParamMap.isEmpty()) {
			return "";
		}

		final StringBuilder sb = new StringBuilder(100);

		for (final Map.Entry<String, String> pair : sortedParamMap.entrySet()) {
			final String key = pair.getKey().toLowerCase(Locale.ROOT);

			if (key.equals("jsessionid") || key.equals("phpsessid") || key.equals("aspsessionid")) {
				continue;
			}

			if (sb.length() > 0) {
				sb.append('&');
			}

			sb.append(percentEncodeRfc3986(pair.getKey()));

			if (!pair.getValue().isEmpty()) {
				sb.append('=');
				sb.append(percentEncodeRfc3986(pair.getValue()));
			}
		}

		return sb.toString();
	}

	/**
	 * Percent-encode values according the RFC 3986. The built-in Java
	 * URLEncoder does not encode according to the RFC, so we make the extra
	 * replacements. The input is decoded first so that already-encoded text is
	 * not encoded twice; a literal {@code '+'} is kept as {@code %2B}.
	 *
	 * @param string Text to encode, possibly already percent-encoded.
	 * @return Encoded string per RFC 3986; when decoding or encoding fails, the
	 * text reached before the failure is returned.
	 */
	private static String percentEncodeRfc3986(String string) {
		try {
			// Protect literal '+' so that the decoder does not turn it into a space.
			string = string.replace("+", "%2B");
			string = URLDecoder.decode(string, "UTF-8");
			string = URLEncoder.encode(string, "UTF-8");
			return string.replace("+", "%20").replace("*", "%2A").replace("%7E", "~");
		} catch (Exception ignored) {
			// Best effort: e.g. a malformed percent escape makes the decoder throw.
			return string;
		}
	}

	/**
	 * Applies the final path fix-ups: an encoded tilde is written literally
	 * and spaces are percent-encoded.
	 *
	 * @param path the cleaned path
	 * @return the path with {@code %7E} replaced by {@code ~} and spaces
	 * replaced by {@code %20}
	 */
	private static String normalizePath(final String path) {
		return path.replace("%7E", "~").replace(" ", "%20");
	}

}




// © 2015 - 2025 Weber Informatics LLC | Privacy Policy