All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.archive.url.UsableURIFactory Maven / Gradle / Ivy

The newest version!
/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.url;

import gnu.inet.encoding.IDNA;
import gnu.inet.encoding.IDNAException;

import java.io.UnsupportedEncodingException;
import java.util.BitSet;
import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.URI;
import org.apache.commons.httpclient.URIException;
import org.archive.util.TextUtils;

/**
 * Factory that returns UsableURIs.
 * 
 * Does escaping and fixup on URIs massaging in accordance with RFC2396 and to
 * match browser practice. For example, it removes any '..' if first thing in
 * the path as per IE, converts backslashes preceding the query string to
 * forward slashes, and discards any 'fragment'/anchor portion of the URI. This
 * class will also fail URIs if they are longer than IE's allowed maximum
 * length.
 * 
 * 

* TODO: Test logging. * * @author stack */ public class UsableURIFactory extends URI { private static final long serialVersionUID = 2L; /** * Logging instance. */ private static Logger logger = Logger.getLogger(UsableURIFactory.class.getName()); /** * The single instance of this factory. */ private static final UsableURIFactory factory = new UsableURIFactory(); /** * RFC 2396-inspired regex. * * From the RFC Appendix B: *

     * URI Generic Syntax                August 1998
     *
     * B. Parsing a URI Reference with a Regular Expression
     *
     * As described in Section 4.3, the generic URI syntax is not sufficient
     * to disambiguate the components of some forms of URI.  Since the
     * "greedy algorithm" described in that section is identical to the
     * disambiguation method used by POSIX regular expressions, it is
     * natural and commonplace to use a regular expression for parsing the
     * potential four components and fragment identifier of a URI reference.
     *
     * The following line is the regular expression for breaking-down a URI
     * reference into its components.
     *
     * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
     * 12            3  4          5       6  7        8 9
     *
     * The numbers in the second line above are only to assist readability;
     * they indicate the reference points for each subexpression (i.e., each
     * paired parenthesis).  We refer to the value matched for subexpression
     * <n> as $<n>.  For example, matching the above expression to
     *
     * http://www.ics.uci.edu/pub/ietf/uri/#Related
     *
     * results in the following subexpression matches:
     *
     * $1 = http:
     * $2 = http
     * $3 = //www.ics.uci.edu
     * $4 = www.ics.uci.edu
     * $5 = /pub/ietf/uri/
     * $6 = <undefined>
     * $7 = <undefined>
     * $8 = #Related
     * $9 = Related
     *
     * where <undefined> indicates that the component is not present, as is
     * the case for the query component in the above example.  Therefore, we
     * can determine the value of the four components and fragment as
     *
     * scheme    = $2
     * authority = $4
     * path      = $5
     * query     = $7
     * fragment  = $9
     * 
* * -- *

Below differs from the rfc regex in that... * (1) it has java escaping of regex characters * (2) we allow a URI made of a fragment only (Added extra * group so indexing is off by one after scheme). * (3) scheme is limited to legal scheme characters */ final public static Pattern RFC2396REGEX = Pattern.compile( "^(([a-zA-Z][a-zA-Z0-9\\+\\-\\.]*):)?((//([^/?#]*))?([^?#]*)(\\?([^#]*))?)?(#(.*))?"); // 12 34 5 6 7 8 9 A // 2 1 54 6 87 3 A9 // 1: scheme // 2: scheme: // 3: //authority/path // 4: //authority // 5: authority // 6: path // 7: ?query // 8: query // 9: #fragment // A: fragment public static final String SLASHDOTDOTSLASH = "^(/\\.\\./)+"; public static final String SLASH = "/"; public static final String HTTP = "http"; public static final String HTTP_PORT = ":80"; public static final String HTTPS = "https"; public static final String HTTPS_PORT = ":443"; public static final String DOT = "."; public static final String EMPTY_STRING = ""; public static final String NBSP = "\u00A0"; public static final String SPACE = " "; public static final String ESCAPED_SPACE = "%20"; public static final String TRAILING_ESCAPED_SPACE = "^(.*)(%20)+$"; public static final String PIPE = "|"; public static final String PIPE_PATTERN = "\\|"; public static final String ESCAPED_PIPE = "%7C"; public static final String CIRCUMFLEX = "^"; public static final String CIRCUMFLEX_PATTERN = "\\^"; public static final String ESCAPED_CIRCUMFLEX = "%5E"; public static final String QUOT = "\""; public static final String ESCAPED_QUOT = "%22"; public static final String SQUOT = "'"; public static final String ESCAPED_SQUOT = "%27"; public static final String APOSTROPH = "`"; public static final String ESCAPED_APOSTROPH = "%60"; public static final String LSQRBRACKET = "["; public static final String LSQRBRACKET_PATTERN = "\\["; public static final String ESCAPED_LSQRBRACKET = "%5B"; public static final String RSQRBRACKET = "]"; public static final String RSQRBRACKET_PATTERN = "\\]"; public 
static final String ESCAPED_RSQRBRACKET = "%5D"; public static final String LCURBRACKET = "{"; public static final String LCURBRACKET_PATTERN = "\\{"; public static final String ESCAPED_LCURBRACKET = "%7B"; public static final String RCURBRACKET = "}"; public static final String RCURBRACKET_PATTERN = "\\}"; public static final String ESCAPED_RCURBRACKET = "%7D"; public static final String BACKSLASH = "\\"; public static final String ESCAPED_BACKSLASH = "%5C"; public static final String STRAY_SPACING = "[\n\r\t]+"; public static final String IMPROPERESC_REPLACE = "%25$1"; public static final String IMPROPERESC = "%((?:[^\\p{XDigit}])|(?:.[^\\p{XDigit}])|(?:\\z))"; public static final String COMMERCIAL_AT = "@"; public static final char PERCENT_SIGN = '%'; public static final char COLON = ':'; /** * First percent sign in string followed by two hex chars. */ public static final String URI_HEX_ENCODING = "^[^%]*%[\\p{XDigit}][\\p{XDigit}].*"; /** * Authority port number regex. */ protected final static Pattern PORTREGEX = Pattern.compile("(.*:)([0-9]+)$"); /** * Characters we'll accept in the domain label part of a URI * authority: ASCII letters-digits-hyphen (LDH) plus underscore, * with single intervening '.' characters. * * (We accept '_' because DNS servers have tolerated for many * years counter to spec; we also accept dash patterns and ACE * prefixes that will be rejected by IDN-punycoding attempt.) */ protected final static String ACCEPTABLE_ASCII_DOMAIN = "^(?:[a-zA-Z0-9_-]++(?:\\.)?)++$"; /** * Pattern that looks for case of three or more slashes after the * scheme. If found, we replace them with two only as mozilla does. */ protected final static Pattern HTTP_SCHEME_SLASHES = Pattern.compile("^(https?://)/+(.*)"); /** * Pattern that looks for case of two or more slashes in a path. */ final static Pattern MULTIPLE_SLASHES = Pattern.compile("//+"); /** * Protected constructor. */ protected UsableURIFactory() { super(); } /** * @param uri URI as string. 
* @return An instance of UURI * @throws URIException */ public static UsableURI getInstance(String uri) throws URIException { return UsableURIFactory.factory.create(uri); } /** * @param uri URI as string. * @param charset Character encoding of the passed uri string. * @return An instance of UURI * @throws URIException */ public static UsableURI getInstance(String uri, String charset) throws URIException { return UsableURIFactory.factory.create(uri, charset); } /** * @param base Base uri to use resolving passed relative uri. * @param relative URI as string. * @return An instance of UURI * @throws URIException */ public static UsableURI getInstance(UsableURI base, String relative) throws URIException { // return base.resolve(relative); return UsableURIFactory.factory.create(base, relative); } /** * @param uri URI as string. * @return Instance of UURI. * @throws URIException */ protected UsableURI create(String uri) throws URIException { return create(uri, UsableURI.getDefaultProtocolCharset()); } /** * @param uri URI as string. * @param charset Original encoding of the string. * @return Instance of UURI. * @throws URIException */ protected UsableURI create(String uri, String charset) throws URIException { UsableURI uuri = makeOne(fixup(uri, null, charset), true, charset); if (logger.isLoggable(Level.FINE)) { logger.fine("URI " + uri + " PRODUCT " + uuri.toString() + " CHARSET " + charset); } return validityCheck(uuri); } /* for subclasses to override and call their own constructor */ protected UsableURI makeOne(String fixedUpUri, boolean escaped, String charset) throws URIException { return new UsableURI(fixedUpUri, escaped, charset); } protected UsableURI makeOne(UsableURI base, UsableURI relative) throws URIException { return new UsableURI(base, relative); } /** * @param base UURI to use as a base resolving relative. * @param relative Relative URI. * @return Instance of UURI. 
* @throws URIException */ protected UsableURI create(UsableURI base, String relative) throws URIException { UsableURI relativeUURI = makeOne(fixup(relative, base, base.getProtocolCharset()), true, base.getProtocolCharset()); UsableURI uuri = makeOne(base, relativeUURI); if (logger.isLoggable(Level.FINE)) { logger.fine(" URI " + relative + " PRODUCT " + uuri.toString() + " CHARSET " + base.getProtocolCharset() + " BASE " + base); } return validityCheck(uuri); } /** * Check the generated UURI. * * At the least look at length of uuri string. We were seeing case * where before escaping, string was < MAX_URL_LENGTH but after was * >. Letting out a too-big message was causing us troubles later * down the processing chain. * @param uuri Created uuri to check. * @return The passed uuri so can easily inline this check. * @throws URIException */ protected UsableURI validityCheck(UsableURI uuri) throws URIException { if (uuri.getRawURI().length > UsableURI.MAX_URL_LENGTH) { throw new URIException("Created (escaped) uuri > " + UsableURI.MAX_URL_LENGTH +": "+uuri.toString()); } return uuri; } /** * Do heritrix fix-up on passed uri string. * * Does heritrix escaping; usually escaping done to make our behavior align * with IEs. This method codifies our experience pulling URIs from the * wilds. Its does all the escaping we want; its output can always be * assumed to be 'escaped' (though perhaps to a laxer standard than the * vanilla HttpClient URI class or official specs might suggest). * * @param uri URI as string. * @param base May be null. * @return A fixed up URI string. */ private String fixup(String uri, final URI base, final String charset) throws URIException { if (uri == null) { throw new NullPointerException(); } else if (uri.length() == 0 && base == null) { throw new URIException("URI length is zero (and not relative)."); } if (uri.length() > UsableURI.MAX_URL_LENGTH) { // We check length here and again later after all convertions. 
throw new URIException("URI length > " + UsableURI.MAX_URL_LENGTH + ": " + uri); } // Replace nbsp with normal spaces (so that they get stripped if at // ends, or encoded if in middle) if (uri.indexOf(NBSP) >= 0) { uri = TextUtils.replaceAll(NBSP, uri, SPACE); } // Get rid of any trailing spaces or new-lines. uri = uri.trim(); // IE converts backslashes preceding the query string to slashes, rather // than to %5C. Since URIs that have backslashes usually work only with // IE, we will convert backslashes to slashes as well. int nextBackslash = uri.indexOf(BACKSLASH); if (nextBackslash >= 0) { int queryStart = uri.indexOf('?'); StringBuilder tmp = new StringBuilder(uri); while (nextBackslash >= 0 && (queryStart < 0 || nextBackslash < queryStart)) { tmp.setCharAt(nextBackslash, '/'); nextBackslash = uri.indexOf(BACKSLASH, nextBackslash + 1); } uri = tmp.toString(); } // Remove stray TAB/CR/LF uri = TextUtils.replaceAll(STRAY_SPACING, uri, EMPTY_STRING); // Test for the case of more than two slashes after the http(s) scheme. // Replace with two slashes as mozilla does if found. // See [ 788219 ] URI Syntax Errors stop page parsing. // Matcher matcher = HTTP_SCHEME_SLASHES.matcher(uri); Matcher matcher = TextUtils.getMatcher(HTTP_SCHEME_SLASHES.pattern(), uri); if (matcher.matches()) { uri = matcher.group(1) + matcher.group(2); } TextUtils.recycleMatcher(matcher); // For further processing, get uri elements. See the RFC2396REGEX // comment above for explanation of group indices used in the below. 
// matcher = RFC2396REGEX.matcher(uri); matcher = TextUtils.getMatcher(RFC2396REGEX.pattern(), uri); if (!matcher.matches()) { throw new URIException("Failed parse of " + uri); } String uriScheme = checkUriElementAndLowerCase(matcher.group(2)); String uriSchemeSpecificPart = checkUriElement(matcher.group(3)); String uriAuthority = checkUriElement(matcher.group(5)); String uriPath = checkUriElement(matcher.group(6)); String uriQuery = checkUriElement(matcher.group(8)); // UNUSED String uriFragment = checkUriElement(matcher.group(10)); TextUtils.recycleMatcher(matcher); matcher = null; // Test if relative URI. If so, need a base to resolve against. if (uriScheme == null || uriScheme.length() <= 0) { if (base == null) { throw new URIException("Relative URI but no base: " + uri); } } else { checkHttpSchemeSpecificPartSlashPrefix(base, uriScheme, uriSchemeSpecificPart); } // fixup authority portion: lowercase/IDN-punycode any domain; // remove stray trailing spaces uriAuthority = fixupAuthority(uriAuthority,charset); // Do some checks if absolute path. if (uriSchemeSpecificPart != null && uriSchemeSpecificPart.startsWith(SLASH)) { if (uriPath != null) { // Eliminate '..' if its first thing in the path. IE does this. uriPath = TextUtils.replaceFirst(SLASHDOTDOTSLASH, uriPath, SLASH); } // Ensure root URLs end with '/': browsers always send "/" // on the request-line, so we should consider "http://host" // to be "http://host/". if (uriPath == null || EMPTY_STRING.equals(uriPath)) { uriPath = SLASH; } } if (uriAuthority != null) { if (uriScheme != null && uriScheme.length() > 0 && uriScheme.equals(HTTP)) { uriAuthority = checkPort(uriAuthority); uriAuthority = stripTail(uriAuthority, HTTP_PORT); } else if (uriScheme != null && uriScheme.length() > 0 && uriScheme.equals(HTTPS)) { uriAuthority = checkPort(uriAuthority); uriAuthority = stripTail(uriAuthority, HTTPS_PORT); } // Strip any prefix dot or tail dots from the authority. 
uriAuthority = stripTail(uriAuthority, DOT); uriAuthority = stripPrefix(uriAuthority, DOT); } else { // no authority; may be relative. consider stripping scheme // to work-around org.apache.commons.httpclient.URI bug // ( http://issues.apache.org/jira/browse/HTTPCLIENT-587 ) if (uriScheme != null && base != null && uriScheme.equals(base.getScheme())) { // uriScheme redundant and will only confound httpclient.URI uriScheme = null; } } // Ensure minimal escaping. Use of 'lax' URI and URLCodec // means minimal escaping isn't necessarily complete/consistent. // There is a chance such lax encoding will throw exceptions // later at inconvenient times. // // One reason for these bad escapings -- though not the only -- // is that the page is using an encoding other than the ASCII or the // UTF-8 that is our default URI encoding. In this case the parent // class is burping on the passed URL encoding. If the page encoding // was passed into this factory, the encoding seems to be parsed // correctly (See the testEscapedEncoding unit test). // // This fixup may cause us to miss content. There is the charset case // noted above. TODO: Look out for cases where we fail other than for // the above given reason which will be fixed when we address // '[ 913687 ] Make extractors interrogate for charset'. uriPath = ensureMinimalEscaping(uriPath, charset); uriQuery = ensureMinimalEscaping(uriQuery, charset, LaxURLCodec.QUERY_SAFE); // Preallocate. The '1's and '2's in below are space for ':', // '//', etc. URI characters. StringBuilder s = new StringBuilder( ((uriScheme != null)? uriScheme.length(): 0) + 1 // ';' + ((uriAuthority != null)? uriAuthority.length(): 0) + 2 // '//' + ((uriPath != null)? uriPath.length(): 0) + 1 // '?' + ((uriQuery != null)? 
uriQuery.length(): 0)); appendNonNull(s, uriScheme, ":", true); appendNonNull(s, uriAuthority, "//", false); appendNonNull(s, uriPath, "", false); appendNonNull(s, uriQuery, "?", false); return s.toString(); } /** * If http(s) scheme, check scheme specific part begins '//'. * @throws URIException * @see Section 3.1. Common Internet * Scheme Syntax */ protected void checkHttpSchemeSpecificPartSlashPrefix(final URI base, final String scheme, final String schemeSpecificPart) throws URIException { if (scheme == null || scheme.length() <= 0) { return; } if (!scheme.equals("http") && !scheme.equals("https")) { return; } if ( schemeSpecificPart == null || !schemeSpecificPart.startsWith("//")) { // only acceptable if schemes match if (base == null || !scheme.equals(base.getScheme())) { throw new URIException( "relative URI with scheme only allowed for " + "scheme matching base"); } return; } if (schemeSpecificPart.length() <= 2) { throw new URIException("http scheme specific part is " + "too short: " + schemeSpecificPart); } } /** * Fixup 'authority' portion of URI, by removing any stray * encoded spaces, lowercasing any domain names, and applying * IDN-punycoding to Unicode domains. * * @param uriAuthority the authority string to fix * @return fixed version * @throws URIException */ private String fixupAuthority(String uriAuthority, String charset) throws URIException { // Lowercase the host part of the uriAuthority; don't destroy any // userinfo capitalizations. Make sure no illegal characters in // domainlabel substring of the uri authority. if (uriAuthority != null) { // Get rid of any trailing escaped spaces: // http://www.archive.org%20. Rare but happens. // TODO: reevaluate: do IE or firefox do such mid-URI space-removal? // if not, we shouldn't either. 
while(uriAuthority.endsWith(ESCAPED_SPACE)) { uriAuthority = uriAuthority.substring(0,uriAuthority.length()-3); } // lowercase & IDN-punycode only the domain portion int atIndex = uriAuthority.indexOf(COMMERCIAL_AT); int portColonIndex = uriAuthority.indexOf(COLON,(atIndex<0)?0:atIndex); if(atIndex<0 && portColonIndex<0) { // most common case: neither userinfo nor port return fixupDomainlabel(uriAuthority); } else if (atIndex<0 && portColonIndex>-1) { // next most common: port but no userinfo String domain = fixupDomainlabel(uriAuthority.substring(0,portColonIndex)); String port = uriAuthority.substring(portColonIndex); return domain + port; } else if (atIndex>-1 && portColonIndex<0) { // uncommon: userinfo, no port String userinfo = ensureMinimalEscaping(uriAuthority.substring(0,atIndex+1),charset); String domain = fixupDomainlabel(uriAuthority.substring(atIndex+1)); return userinfo + domain; } else { // uncommon: userinfo, port String userinfo = ensureMinimalEscaping(uriAuthority.substring(0,atIndex+1),charset); String domain = fixupDomainlabel(uriAuthority.substring(atIndex+1,portColonIndex)); String port = uriAuthority.substring(portColonIndex); return userinfo + domain + port; } } return uriAuthority; } /** * Fixup the domain label part of the authority. * * We're more lax than the spec. in that we allow underscores. * * @param label Domain label to fix. * @return Return fixed domain label. 
* @throws URIException */ private String fixupDomainlabel(String label) throws URIException { // apply IDN-punycoding, as necessary try { // TODO: optimize: only apply when necessary, or // keep cache of recent encodings label = IDNA.toASCII(label); } catch (IDNAException e) { if(TextUtils.matches(ACCEPTABLE_ASCII_DOMAIN,label)) { // domain name has ACE prefix, leading/trailing dash, or // underscore -- but is still a name we wish to tolerate; // simply continue } else { // problematic domain: neither ASCII acceptable characters // nor IDN-punycodable, so throw exception // TODO: change to HeritrixURIException so distinguishable // from URIExceptions in library code URIException ue = new URIException(e+" "+label); ue.initCause(e); throw ue; } } label = label.toLowerCase(); return label; } /** * Ensure that there all characters needing escaping * in the passed-in String are escaped. Stray '%' characters * are *not* escaped, as per browser behavior. * * @param u String to escape * @param charset * @return string with any necessary escaping applied */ private String ensureMinimalEscaping(String u, final String charset) { return ensureMinimalEscaping(u, charset, LaxURLCodec.EXPANDED_URI_SAFE); } /** * Ensure that there all characters needing escaping * in the passed-in String are escaped. Stray '%' characters * are *not* escaped, as per browser behavior. * * @param u String to escape * @param charset * @param bitset * @return string with any necessary escaping applied */ private String ensureMinimalEscaping(String u, final String charset, final BitSet bitset) { if (u == null) { return null; } for (int i = 0; i < u.length(); i++) { char c = u.charAt(i); if (!bitset.get(c)) { try { u = LaxURLCodec.DEFAULT.encode(bitset, u, charset); } catch (UnsupportedEncodingException e) { e.printStackTrace(); } break; } } return u; } /** * Check port on passed http authority. 
Make sure the size is not larger * than allowed: See the 'port' definition on this * page, http://www.kerio.com/manual/wrp/en/418.htm. * Also, we've seen port numbers of '0080' whose leading zeros confuse * the parent class. Strip the leading zeros. * * @param uriAuthority * @return Null or an amended port number. * @throws URIException */ private String checkPort(String uriAuthority) throws URIException { // Matcher m = PORTREGEX.matcher(uriAuthority); Matcher m = TextUtils.getMatcher(PORTREGEX.pattern(), uriAuthority); if (m.matches()) { String no = m.group(2); if (no != null && no.length() > 0) { // First check if the port has leading zeros // as in '0080'. Strip them if it has and // then reconstitute the uriAuthority. Be careful // of cases where port is '0' or '000'. while (no.charAt(0) == '0' && no.length() > 1) { no = no.substring(1); } uriAuthority = m.group(1) + no; // Now makesure the number is legit. int portNo = 0; try { portNo = Integer.parseInt(no); } catch (NumberFormatException nfe) { // just catch and leave portNo at illegal 0 } if (portNo <= 0 || portNo > 65535) { throw new URIException("Port out of bounds: " + uriAuthority); } } } TextUtils.recycleMatcher(m); return uriAuthority; } /** * @param b Buffer to append to. * @param str String to append if not null. * @param substr Suffix or prefix to use if str is not null. * @param suffix True if substr is a suffix. */ private void appendNonNull(StringBuilder b, String str, String substr, boolean suffix) { if (str != null && str.length() > 0) { if (!suffix) { b.append(substr); } b.append(str); if (suffix) { b.append(substr); } } } /** * @param str String to work on. * @param prefix Prefix to strip if present. * @return str w/o prefix. */ private String stripPrefix(String str, String prefix) { return str.startsWith(prefix)? str.substring(prefix.length(), str.length()): str; } /** * @param str String to work on. * @param tail Tail to strip if present. * @return str w/o tail. 
*/ private static String stripTail(String str, String tail) { return str.endsWith(tail)? str.substring(0, str.length() - tail.length()): str; } /** * @param element to examine. * @return Null if passed null or an empty string otherwise * element. */ private String checkUriElement(String element) { return (element == null || element.length() <= 0)? null: element; } /** * @param element to examine and lowercase if non-null. * @return Null if passed null or an empty string otherwise * element lowercased. */ private String checkUriElementAndLowerCase(String element) { String tmp = checkUriElement(element); return (tmp != null)? tmp.toLowerCase(): tmp; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy