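// org.zaproxy.zap.spider.URLCanonicalizer (class from the "zap" artifact, as published for Maven / Gradle / Ivy).
// The two lines above the licence header were navigation text of the page this source was copied from.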
/*
* Zed Attack Proxy (ZAP) and its related class files.
*
* ZAP is an HTTP/HTTPS proxy for assessing web application security.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* ZAP: Based on work by Yasser Ganjisaffar
* from project http://code.google.com/p/crawler4j/
*/
package org.zaproxy.zap.spider;
import java.net.URI;
import java.net.URL;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.httpclient.URIException;
import org.apache.commons.httpclient.util.URIUtil;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.zaproxy.zap.spider.SpiderParam.HandleParametersOption;
/**
 * The URLCanonicalizer is used for the process of converting a URL into a canonical (normalized)
 * form. See "URL normalization" (RFC 3986, section 6) for a reference.
 *
 * <p>Note: some parts of the code are adapted from an answer on Stack Overflow.
 *
 * <p>Added support for OData URLs.
 */
public final class URLCanonicalizer {
/** Logger used by this class to report URLs that cannot be canonicalized. */
private static final Logger log = LogManager.getLogger(URLCanonicalizer.class);
/** Scheme name that is compared, ignoring case, with a URI's scheme to detect plain HTTP. */
private static final String HTTP_SCHEME = "http";
/** Port considered the default one for {@link #HTTP_SCHEME}. */
private static final int HTTP_DEFAULT_PORT = 80;
/** Scheme name that is compared, ignoring case, with a URI's scheme to detect HTTPS. */
private static final String HTTPS_SCHEME = "https";
/** Port considered the default one for {@link #HTTPS_SCHEME}. */
private static final int HTTPS_DEFAULT_PORT = 443;
/**
 * The Constant IRRELEVANT_PARAMETERS defining the parameter names which are ignored in the URL.
 *
 * <p>All the names are stored in lower case.
 */
private static final Set<String> IRRELEVANT_PARAMETERS = new HashSet<>(3);

static {
    IRRELEVANT_PARAMETERS.add("jsessionid");
    IRRELEVANT_PARAMETERS.add("phpsessid");
    IRRELEVANT_PARAMETERS.add("aspsessionid");
}
/**
 * OData support: matches a path section made of a {@code /}, a resource name (word characters
 * or {@code %}) and a parenthesised identifier (word characters or single quotes), for example
 * {@code /Resource('id')}.
 *
 * <p>The first group is the resource name, the second group is the ID (any surrounding quotes
 * are captured as part of the value).
 */
private static final Pattern patternResourceIdentifierUnquoted =
Pattern.compile("/([\\w%]*)\\(([\\w']*)\\)");
/**
 * OData support: detects a section containing composite IDs, for example {@code
 * /Resource(field1=a,field2=3)}.
 *
 * <p>The only group captures everything between the parentheses. The quantifier is greedy, so
 * the group extends up to the last closing parenthesis that still allows a match.
 */
private static final Pattern patternResourceMultipleIdentifier =
Pattern.compile("/[\\w%]*\\((.*)\\)");
/**
 * OData support: extracts the details of the multiple IDs, one {@code name=value} pair per
 * match. The first group is the name, the second group is the value.
 */
private static final Pattern patternResourceMultipleIdentifierDetail =
Pattern.compile("([\\w%]*)=([\\w']*)");
/**
 * Private constructor to avoid initialization of object: every member of this class is static.
 */
private URLCanonicalizer() {}
/**
 * Gets the canonical form of a URL that is not resolved against any base URL.
 *
 * @param url the url to canonicalize
 * @return whatever {@link #getCanonicalURL(String, String)} returns for the given url and a
 *     {@code null} base URL
 */
public static String getCanonicalURL(String url) {
    final String noBaseUrl = null;
    return getCanonicalURL(url, noBaseUrl);
}
/**
 * Gets the canonical url, starting from a relative or absolute url found in a given context
 * (baseURL).
 *
 * <p>The url is resolved against the base URL, its path is normalized (no {@code //}, no
 * leading {@code /../}), its query parameters are sorted and filtered, the default port of the
 * scheme is dropped and the scheme and host are lower-cased.
 *
 * @param url the url string defining the reference
 * @param baseURL the context in which this url was found, {@code null} is handled as an empty
 *     base
 * @return the canonical url, or {@code null} if the resolved url has no scheme, no authority or
 *     no host, or if any exception is raised while processing it
 */
public static String getCanonicalURL(String url, String baseURL) {
    try {
        /* Build the absolute URL, from the url and the baseURL */
        String resolvedURL = URLResolver.resolveUrl(baseURL == null ? "" : baseURL, url);
        log.debug("Resolved URL: {}", resolvedURL);

        URI canonicalURI;
        try {
            canonicalURI = new URI(resolvedURL);
        } catch (Exception e) {
            /* Not parsable as is: try again with the encoded form of the resolved URL. */
            canonicalURI = new URI(URIUtil.encodeQuery(resolvedURL));
        }

        /* Some checking. */
        if (canonicalURI.getScheme() == null) {
            log.warn(
                    "Protocol could not be reliably evaluated from uri: {} and base url: {}",
                    canonicalURI,
                    baseURL);
            return null;
        }
        if (canonicalURI.getRawAuthority() == null) {
            log.debug(
                    "Ignoring URI with no authority (host[\":\"port]): {} (on base {})",
                    canonicalURI,
                    baseURL);
            return null;
        }
        if (canonicalURI.getHost() == null) {
            log.warn(
                    "Host could not be reliably evaluated from: {} (on base {})",
                    canonicalURI,
                    baseURL);
            return null;
        }

        /*
         * Normalize: no empty segments (i.e., "//"), no segments equal to ".", and no segments equal to
         * ".." that are preceded by a segment not equal to "..".
         */
        String path = canonicalURI.normalize().getRawPath();

        /* Convert '//' -> '/' (repeated, as one pass over "///" still leaves "//") */
        while (path.contains("//")) {
            path = path.replace("//", "/");
        }

        /* Drop starting '/../' (keeping the slash that follows the "..") */
        while (path.startsWith("/../")) {
            path = path.substring(3);
        }

        /* Trim */
        path = path.trim();

        /* Process parameters and sort them. */
        final SortedSet<QueryParameter> params =
                createSortedParameters(canonicalURI.getRawQuery());
        final String canonicalParams = canonicalize(params);
        final String queryString = canonicalParams.isEmpty() ? "" : "?" + canonicalParams;

        /* Add starting slash if needed */
        if (path.isEmpty()) {
            path = "/";
        }

        /* Drop default port: example.com:80 -> example.com */
        int port = canonicalURI.getPort();
        if (isDefaultPort(canonicalURI.getScheme(), port)) {
            port = -1;
        }

        /*
         * Lowercasing protocol and host. Locale.ROOT keeps the result independent from the
         * default locale (e.g. with a Turkish locale 'I' would become a dotless 'i').
         */
        String protocol = canonicalURI.getScheme().toLowerCase(Locale.ROOT);
        String host = canonicalURI.getHost().toLowerCase(Locale.ROOT);
        String pathAndQueryString = normalizePath(path) + queryString;

        URL result = new URL(protocol, host, port, pathAndQueryString);
        return result.toExternalForm();
    } catch (Exception ex) {
        log.warn(
                "Error while Processing URL [{}] in the spidering process (on base {}): {}",
                url,
                baseURL,
                ex.getMessage());
        return null;
    }
}
/**
 * Checks if the given port is the one used by default for the given scheme.
 *
 * <p>Note: Only HTTP and HTTPS schemes are taken into account, the scheme is compared ignoring
 * the case.
 *
 * @param scheme the scheme, may be {@code null}
 * @param port the port
 * @return {@code true} if the scheme is HTTP or HTTPS and the port is the default one for it,
 *     {@code false} otherwise.
 */
private static boolean isDefaultPort(String scheme, int port) {
    final int defaultPort;
    if (HTTP_SCHEME.equalsIgnoreCase(scheme)) {
        defaultPort = HTTP_DEFAULT_PORT;
    } else if (HTTPS_SCHEME.equalsIgnoreCase(scheme)) {
        defaultPort = HTTPS_DEFAULT_PORT;
    } else {
        // Unknown (or missing) scheme, no default port to compare with.
        return false;
    }
    return port == defaultPort;
}
/**
 * Builds a String representation of the URI with cleaned parameters, that can be used when
 * checking if an URI was already visited. The URI provided as a parameter should be already
 * cleaned and canonicalized, so it should be built with a result from {@link
 * #getCanonicalURL(String)}.
 *
 * <p>When building the URI representation, the same format should be used for all the cases, as
 * it may affect the number of times the pages are visited and reported if the option
 * HandleParametersOption is changed while the spider is running.
 *
 * <p>With {@code USE_ALL} the representation is {@code uri.toString()}; with {@code
 * IGNORE_COMPLETELY} it is the scheme, host, port (if any) and cleaned path; with {@code
 * IGNORE_VALUE} the names of the query parameters are appended to the latter.
 *
 * @param uri the uri
 * @param handleParameters the handle parameters option
 * @param handleODataParametersVisited Should we handle specific OData parameters
 * @return the string representation of the URI
 * @throws URIException the URI exception
 */
public static String buildCleanedParametersURIRepresentation(
        org.apache.commons.httpclient.URI uri,
        SpiderParam.HandleParametersOption handleParameters,
        boolean handleODataParametersVisited)
        throws URIException {
    // All the information is relevant: the default string representation already has it.
    if (handleParameters.equals(HandleParametersOption.USE_ALL)) {
        return uri.toString();
    }

    final boolean queryIgnored =
            handleParameters.equals(HandleParametersOption.IGNORE_COMPLETELY);
    if (!queryIgnored && !handleParameters.equals(HandleParametersOption.IGNORE_VALUE)) {
        // Should not be reached
        return uri.toString();
    }

    final String baseWithPath =
            createBaseUriWithCleanedPath(uri, handleParameters, handleODataParametersVisited);
    if (queryIgnored) {
        // The query is not part of the representation at all.
        return baseWithPath;
    }

    // Only the values are ignored: add the parameters' names to the uri representation.
    final String parameterNames = getCleanedQuery(uri.getEscapedQuery());
    return parameterNames.isEmpty() ? baseWithPath : baseWithPath + '?' + parameterNames;
}
/**
 * Creates the representation of the given URI up to (and including) its cleaned path, that is,
 * the result of {@code createBaseUri} followed by the result of {@code getCleanedPath}.
 *
 * @param uri the uri
 * @param handleParameters the handle parameters option, passed on to the cleaning of the path
 * @param handleODataParametersVisited whether the path is cleaned as an OData path
 * @return the base of the URI followed by the cleaned path
 * @throws URIException if thrown while reading the components of the uri
 */
private static String createBaseUriWithCleanedPath(
        org.apache.commons.httpclient.URI uri,
        HandleParametersOption handleParameters,
        boolean handleODataParametersVisited)
        throws URIException {
    final String base = createBaseUri(uri);
    final String cleanedPath =
            getCleanedPath(uri.getEscapedPath(), handleParameters, handleODataParametersVisited);
    return base + cleanedPath;
}
/**
 * Creates the {@code scheme://host[:port]} part of the given URI, the port is added only if it
 * is not {@code -1}.
 *
 * @param uri the uri
 * @return the scheme, host and (if any) port of the URI
 * @throws URIException if thrown while reading the components of the uri
 */
private static String createBaseUri(org.apache.commons.httpclient.URI uri) throws URIException {
    final String schemeAndHost = uri.getScheme() + "://" + uri.getHost();
    final int port = uri.getPort();
    return port == -1 ? schemeAndHost : schemeAndHost + ':' + port;
}
/**
 * Gets the path to use in the representation of a URI.
 *
 * @param escapedPath the escaped path of the URI, may be {@code null}
 * @param handleParameters the handle parameters option, used only when cleaning OData paths
 * @param handleODataParametersVisited {@code true} if the path should be cleaned with {@code
 *     cleanODataPath}, {@code false} to keep it unchanged
 * @return an empty string for a {@code null} path, otherwise the (possibly cleaned) path
 */
private static String getCleanedPath(
        String escapedPath,
        HandleParametersOption handleParameters,
        boolean handleODataParametersVisited) {
    if (escapedPath == null) {
        return "";
    }
    return handleODataParametersVisited
            ? cleanODataPath(escapedPath, handleParameters)
            : escapedPath;
}
/**
 * Builds a query that contains just the names of the parameters of the given query, joined with
 * {@code &}.
 *
 * <p>Each name is added only once, in the order given by the sorted parameters. Names contained
 * in {@code IRRELEVANT_PARAMETERS} or starting with {@code utm_} are left out.
 *
 * @param escapedQuery the escaped query, may be {@code null}
 * @return the names of the parameters, or an empty string if there are none
 */
private static String getCleanedQuery(String escapedQuery) {
    // Get the parameters' names
    final SortedSet<QueryParameter> params = createSortedParameters(escapedQuery);
    if (params == null || params.isEmpty()) {
        return "";
    }

    final Set<String> seenNames = new HashSet<>();
    final StringBuilder cleanedQueryBuilder = new StringBuilder();
    for (QueryParameter parameter : params) {
        final String name = parameter.getName();
        // add() returns false for names already processed (same name, different value).
        if (!seenNames.add(name)) {
            continue;
        }
        // Ignore irrelevant parameters
        if (IRRELEVANT_PARAMETERS.contains(name) || name.startsWith("utm_")) {
            continue;
        }
        if (cleanedQueryBuilder.length() > 0) {
            cleanedQueryBuilder.append('&');
        }
        cleanedQueryBuilder.append(name);
    }
    return cleanedQueryBuilder.toString();
}
/**
 * Clean the path in the case of an OData Uri containing a resource identifier (simple or
 * multiple).
 *
 * <p>Only the first section of the path that matches is cleaned:
 *
 * <ul>
 *   <li>single ID, e.g. {@code /Resource(1)}: with {@code IGNORE_COMPLETELY} or {@code
 *       IGNORE_VALUE} the ID is removed ({@code /Resource()});
 *   <li>composite ID, e.g. {@code /Resource(field1=a,field2=3)}: with {@code IGNORE_COMPLETELY}
 *       the content of the parentheses is removed, otherwise only the names are kept ({@code
 *       /Resource(field1,field2)}).
 * </ul>
 *
 * <p>The positions reported by the matcher are used to cut the path, so the section that
 * actually matched is the one changed even if the same text occurs earlier in the path.
 *
 * @param path The path to clean
 * @param handleParameters The cleaning mode
 * @return A cleaned path, or the given path if {@code USE_ALL} is used or nothing matches
 */
private static String cleanODataPath(String path, HandleParametersOption handleParameters) {
    if (HandleParametersOption.USE_ALL.equals(handleParameters)) {
        return path;
    }

    final boolean ignoreCompletely =
            HandleParametersOption.IGNORE_COMPLETELY.equals(handleParameters);
    final boolean ignoreValue = HandleParametersOption.IGNORE_VALUE.equals(handleParameters);

    // check for single ID (unnamed)
    final Matcher singleId = patternResourceIdentifierUnquoted.matcher(path);
    if (singleId.find()) {
        if (!ignoreCompletely && !ignoreValue) {
            return path;
        }
        // Remove just the ID (group 2), which leaves "resourceName()" in the path.
        return path.substring(0, singleId.start(2)) + path.substring(singleId.end(2));
    }

    final Matcher compositeId = patternResourceMultipleIdentifier.matcher(path);
    if (!compositeId.find()) {
        return path;
    }

    // We've found a composite identifier. i.e: /Resource(field1=a,field2=3)
    final String beforeIds = path.substring(0, compositeId.start(1));
    final String afterIds = path.substring(compositeId.end(1));
    if (ignoreCompletely) {
        return beforeIds + afterIds;
    }

    // Keep the names of the fields (separated by commas) but not their values.
    final StringBuilder sb = new StringBuilder(beforeIds);
    final Matcher detail = patternResourceMultipleIdentifierDetail.matcher(compositeId.group(1));
    boolean first = true;
    while (detail.find()) {
        if (!first) {
            sb.append(',');
        }
        sb.append(detail.group(1));
        first = false;
    }
    sb.append(afterIds);
    return sb.toString();
}
/**
 * Creates a sorted set with all the parameters from the given {@code query}, ordered
 * lexicographically by name and value.
 *
 * <p>The query is split on {@code &}, empty pairs are skipped. Each pair is split on its first
 * {@code =}: what comes before is the name, what comes after is the value (empty if the pair
 * has no {@code =}).
 *
 * @param queryString the query string
 * @return a sorted set with all parameters, or {@code null} if the query string is {@code null}
 *     or empty. The set is empty (not {@code null}) if the query has only empty pairs.
 */
private static SortedSet<QueryParameter> createSortedParameters(final String queryString) {
    if (queryString == null || queryString.isEmpty()) {
        return null;
    }

    final SortedSet<QueryParameter> params = new TreeSet<>();
    for (final String pair : queryString.split("&")) {
        if (pair.isEmpty()) {
            continue;
        }
        // With a limit of 2 there are two tokens whenever the pair has a '=' (even as first or
        // last character), and just one token when it has none.
        final String[] tokens = pair.split("=", 2);
        final String value = tokens.length > 1 ? tokens[1] : "";
        params.add(new QueryParameter(tokens[0], value));
    }
    return params;
}
/**
 * Canonicalize the query string.
 *
 * <p>The parameters are joined with {@code &}, as {@code name=value} or just {@code name} if the
 * value is empty. Parameters whose lower-cased name is contained in {@code
 * IRRELEVANT_PARAMETERS} or starts with {@code utm_} are left out.
 *
 * @param sortedParameters Parameter name-value pairs in lexicographical order, may be {@code
 *     null}.
 * @return Canonical form of query string, empty if there are no (relevant) parameters.
 */
private static String canonicalize(final SortedSet<QueryParameter> sortedParameters) {
    if (sortedParameters == null || sortedParameters.isEmpty()) {
        return "";
    }

    final StringBuilder sb = new StringBuilder(100);
    for (QueryParameter parameter : sortedParameters) {
        // Locale.ROOT: the check must not depend on the default locale, e.g. with a Turkish
        // locale "JSESSIONID" would not be lower-cased to "jsessionid".
        final String name = parameter.getName().toLowerCase(Locale.ROOT);
        // Ignore irrelevant parameters
        if (IRRELEVANT_PARAMETERS.contains(name) || name.startsWith("utm_")) {
            continue;
        }
        if (sb.length() > 0) {
            sb.append('&');
        }
        // The name is added with its original case.
        sb.append(parameter.getName());
        if (!parameter.getValue().isEmpty()) {
            sb.append('=');
            sb.append(parameter.getValue());
        }
    }
    return sb.toString();
}
/**
 * Normalize path: percent-encoded tildes ({@code %7E} or {@code %7e}, the hexadecimal digits of
 * a percent-encoding are case-insensitive) are decoded to {@code ~} and spaces are encoded as
 * {@code %20}.
 *
 * @param path the path, must not be {@code null}
 * @return the normalized path
 */
private static String normalizePath(final String path) {
    return path.replace("%7E", "~").replace("%7e", "~").replace(" ", "%20");
}
/**
* A query parameter, with non-{@code null} name and value.
*
* The query parameters are ordered by name and value.
*/
private static class QueryParameter implements Comparable {
private final String name;
private final String value;
public QueryParameter(String name, String value) {
if (name == null) {
throw new IllegalArgumentException("Parameter name must not be null.");
}
if (value == null) {
throw new IllegalArgumentException("Parameter value must not be null.");
}
this.name = name;
this.value = value;
}
public String getName() {
return name;
}
public String getValue() {
return value;
}
@Override
public int compareTo(QueryParameter other) {
if (other == null) {
return 1;
}
int result = name.compareTo(other.name);
if (result != 0) {
return result;
}
return value.compareTo(other.value);
}
@Override
public int hashCode() {
final int prime = 31;
int result = 1;
result = prime * result + name.hashCode();
result = prime * result + value.hashCode();
return result;
}
@Override
public boolean equals(Object obj) {
if (this == obj) {
return true;
}
if (obj == null) {
return false;
}
if (getClass() != obj.getClass()) {
return false;
}
QueryParameter other = (QueryParameter) obj;
if (!name.equals(other.name)) {
return false;
}
if (!value.equals(other.value)) {
return false;
}
return true;
}
}
}