All downloads are free. Search and download functionality uses the official Maven repository.

org.archive.url.URLParser Maven / Gradle / Ivy

The newest version!
package org.archive.url;

import java.net.URISyntaxException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class URLParser {
    /**
     * RFC 2396-inspired regex.
     *
     * From the RFC Appendix B:
     * 
     * URI Generic Syntax                August 1998
     *
     * B. Parsing a URI Reference with a Regular Expression
     *
     * As described in Section 4.3, the generic URI syntax is not sufficient
     * to disambiguate the components of some forms of URI.  Since the
     * "greedy algorithm" described in that section is identical to the
     * disambiguation method used by POSIX regular expressions, it is
     * natural and commonplace to use a regular expression for parsing the
     * potential four components and fragment identifier of a URI reference.
     *
     * The following line is the regular expression for breaking-down a URI
     * reference into its components.
     *
     * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
     *  12            3  4          5       6  7        8 9
     *
     * The numbers in the second line above are only to assist readability;
     * they indicate the reference points for each subexpression (i.e., each
     * paired parenthesis).  We refer to the value matched for subexpression
     * <n> as $<n>.  For example, matching the above expression to
     *
     * http://www.ics.uci.edu/pub/ietf/uri/#Related
     *
     * results in the following subexpression matches:
     *
     * $1 = http:
     * $2 = http
     * $3 = //www.ics.uci.edu
     * $4 = www.ics.uci.edu
     * $5 = /pub/ietf/uri/
     * $6 = <undefined>
     * $7 = <undefined>
     * $8 = #Related
     * $9 = Related
     *
     * where <undefined> indicates that the component is not present, as is
     * the case for the query component in the above example.  Therefore, we
     * can determine the value of the four components and fragment as
     *
     * scheme    = $2
     * authority = $4
     * path      = $5
     * query     = $7
     * fragment  = $9
     * 
* * -- *

Below differs from the rfc regex in that... * (1) it has java escaping of regex characters * (2) we allow a URI made of a fragment only (Added extra * group so indexing is off by one after scheme). * (3) scheme is limited to legal scheme characters */ final public static Pattern RFC2396REGEX = Pattern.compile( "^(([a-zA-Z][a-zA-Z0-9\\+\\-\\.]*):)?((//([^/?#]*))?([^?#]*)(\\?([^#]*))?)?(#(.*))?"); // 12 34 5 6 7 8 9 A // 2 1 54 6 87 3 A9 // 1: scheme // 2: scheme: // 3: //authority/path // 4: //authority // 5: authority // 6: path // 7: ?query // 8: query // 9: #fragment // A: fragment public static final String COMMERCIAL_AT = "@"; public static final char PERCENT_SIGN = '%'; public static final char COLON = ':'; public static final String STRAY_SPACING = "[\n\r\t\\p{Zl}\\p{Zp}\u0085]+"; /** * Pattern that looks for case of three or more slashes after the * scheme. If found, we replace them with two only as mozilla does. */ final static Pattern HTTP_SCHEME_SLASHES = Pattern.compile("^(https?://)/+(.*)"); /** * ARC/WARC specific DNS resolution record. */ public final static String DNS_SCHEME = "dns:"; /** * ARC header record. */ public final static String FILEDESC_SCHEME = "filedesc:"; /** * WARC header record. */ public final static String WARCINFO_SCHEME = "warcinfo:"; /** * HTTP */ public final static String HTTP_SCHEME = "http://"; /** * HTTPS */ public final static String HTTPS_SCHEME = "https://"; /** * FTP */ public final static String FTP_SCHEME = "ftp://"; /** * MMS */ public final static String MMS_SCHEME = "mms://"; /** * RTSP */ public final static String RTSP_SCHEME = "rtsp://"; /** * Default scheme to assume if unspecified. No context implied... 
*/ public final static String DEFAULT_SCHEME = HTTP_SCHEME; /** * go brewster */ public final static String WAIS_SCHEME = "wais://"; /** * array of static Strings for all "known" schemes */ public final static String ALL_SCHEMES[] = { HTTP_SCHEME, HTTPS_SCHEME, FTP_SCHEME, MMS_SCHEME, RTSP_SCHEME, WAIS_SCHEME }; public final static Pattern ALL_SCHEMES_PATTERN = Pattern.compile("(?i)^(http|https|ftp|mms|rtsp|wais)://.*"); /** * Attempt to find the scheme (http://, https://, etc) from a given URL. * @param url URL String to parse for a scheme. * @return the scheme, including trailing "://" if known, null otherwise. */ public static String urlToScheme(final String url) { for(final String scheme : ALL_SCHEMES) { if(url.startsWith(scheme)) { return scheme; } } return null; } public static String addDefaultSchemeIfNeeded(String urlString) { if(urlString == null) { return null; } // add http:// if no scheme is present: Matcher m2 = ALL_SCHEMES_PATTERN.matcher(urlString); if(m2.matches()) { return urlString; } return DEFAULT_SCHEME + urlString; } public static HandyURL parse(String urlString) throws URISyntaxException { // first strip leading or trailing spaces: // TODO: this strips too much - stripping non-printables urlString = urlString.trim(); // then remove leading, trailing, and internal TAB, CR, LF: urlString = urlString.replaceAll(STRAY_SPACING,""); // check for non-standard URLs: if(urlString.startsWith(DNS_SCHEME) || urlString.startsWith(FILEDESC_SCHEME) || urlString.startsWith(WARCINFO_SCHEME)) { HandyURL h = new HandyURL(); // TODO: we could set the authority - to allow SURT stuff to work.. h.setOpaque(urlString); return h; } // add http:// if no scheme is present.. urlString = addDefaultSchemeIfNeeded(urlString); // replace leading http:/// with http:// Matcher m1 = HTTP_SCHEME_SLASHES.matcher(urlString); if (m1.matches()) { urlString = m1.group(1) + m1.group(2); } // cross fingers, toes, eyes... 
Matcher matcher = RFC2396REGEX.matcher(urlString); if(!matcher.matches()) { throw new URISyntaxException(urlString, "string does not match RFC 2396 regex"); } String uriScheme = matcher.group(2); String uriAuthority = matcher.group(5); String uriPath = matcher.group(6); String uriQuery = matcher.group(8); String uriFragment = matcher.group(10); // Split Authority into USER:PASS@HOST:PORT String userName = null; String userPass = null; String hostname = null; int port = HandyURL.DEFAULT_PORT; String userInfo = null; String colonPort = null; int atIndex = uriAuthority.indexOf(COMMERCIAL_AT); int portColonIndex = -1; int startColonIndex = 0; if (atIndex > -1) { startColonIndex = atIndex; } if (uriAuthority.charAt(startColonIndex) == '[') { // IPv6 address startColonIndex = uriAuthority.indexOf(']', (startColonIndex + 1)); } portColonIndex = uriAuthority.indexOf(COLON, startColonIndex); if(atIndex<0 && portColonIndex<0) { // most common case: neither userinfo nor port hostname = uriAuthority; } else if (atIndex<0 && portColonIndex>-1) { // next most common: port but no userinfo hostname = uriAuthority.substring(0,portColonIndex); colonPort = uriAuthority.substring(portColonIndex); } else if (atIndex>-1 && portColonIndex<0) { // uncommon: userinfo, no port userInfo = uriAuthority.substring(0,atIndex); hostname = uriAuthority.substring(atIndex+1); } else { // uncommon: userinfo, port userInfo = uriAuthority.substring(0,atIndex); hostname = uriAuthority.substring(atIndex+1,portColonIndex); colonPort = uriAuthority.substring(portColonIndex); } if(colonPort != null) { if(colonPort.startsWith(":")) { if (colonPort.length() == 1) { // a bare colon (http://example.com:/), use default port } else { try { port = Integer.parseInt(colonPort.substring(1)); } catch(NumberFormatException e) { throw new URISyntaxException(urlString, "bad port " + colonPort.substring(1)); } } } else { // XXX: what's happened?! 
} } if(userInfo != null) { int passColonIndex = userInfo.indexOf(COLON); if(passColonIndex == -1) { // no password: userName = userInfo; } else { userName = userInfo.substring(0, passColonIndex); userPass = userInfo.substring(passColonIndex + 1); } } return new HandyURL(uriScheme,userName,userPass,hostname, port,uriPath,uriQuery,uriFragment); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy