org.archive.url.URLParser Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of webarchive-commons Show documentation
The newest version!
package org.archive.url;

import java.net.URISyntaxException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class URLParser {
    /**
     * RFC 2396-inspired regex.
     *
     * From RFC 2396 ("URI Generic Syntax", August 1998), Appendix B:
     *
     * B. Parsing a URI Reference with a Regular Expression
     *
     * As described in Section 4.3, the generic URI syntax is not sufficient
     * to disambiguate the components of some forms of URI.  Since the
     * "greedy algorithm" described in that section is identical to the
     * disambiguation method used by POSIX regular expressions, it is
     * natural and commonplace to use a regular expression for parsing the
     * potential four components and fragment identifier of a URI reference.
     *
     * The following line is the regular expression for breaking-down a URI
     * reference into its components.
     *
     * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
     *  12            3  4          5       6  7        8 9
     *
     * The numbers in the second line above are only to assist readability;
     * they indicate the reference points for each subexpression (i.e., each
     * paired parenthesis).  We refer to the value matched for subexpression
     * <n> as $<n>.  For example, matching the above expression to
     *
     * http://www.ics.uci.edu/pub/ietf/uri/#Related
     *
     * results in the following subexpression matches:
     *
     * $1 = http:
     * $2 = http
     * $3 = //www.ics.uci.edu
     * $4 = www.ics.uci.edu
     * $5 = /pub/ietf/uri/
     * $6 = <undefined>
     * $7 = <undefined>
     * $8 = #Related
     * $9 = Related
     *
     * where <undefined> indicates that the component is not present, as is
     * the case for the query component in the above example.  Therefore, we
     * can determine the value of the four components and fragment as
     *
     * scheme    = $2
     * authority = $4
     * path      = $5
     * query     = $7
     * fragment  = $9
     * 
     *
     * -- 
     * Below differs from the rfc regex in that... 
     * (1) it has java escaping of regex characters 
     * (2) we allow a URI made of a fragment only (Added extra
     * group so indexing is off by one after scheme).
     * (3) scheme is limited to legal scheme characters 
     */
    final public static Pattern RFC2396REGEX = Pattern.compile(
            "^(([a-zA-Z][a-zA-Z0-9\\+\\-\\.]*):)?((//([^/?#]*))?([^?#]*)(\\?([^#]*))?)?(#(.*))?");
    //        12                                 34  5          6       7   8          9 A
    //                                       2 1             54        6          87 3      A9
    // The first marker line sits under the opening parenthesis of each capture
    // group, the second under its closing parenthesis ("A" stands for group 10).
    // Every group is optional, so any of them may come back as null.
    //
    // 1: scheme:                 (scheme plus its trailing colon)
    // 2: scheme
    // 3: //authority/path?query  (everything between the scheme and the fragment)
    // 4: //authority
    // 5: authority
    // 6: path
    // 7: ?query
    // 8: query
    // 9: #fragment
    // A: fragment                (group 10)
    
    /** Separator between the userinfo and the host inside a URL authority. */
    public static final String COMMERCIAL_AT = "@";
    /** The percent sign; not referenced by the code in this class. */
    public static final char PERCENT_SIGN = '%';
    /**
     * Separator between host and port in an authority, and between user
     * name and password inside the userinfo.
     */
    public static final char COLON = ':';
    /**
     * Regex matching one or more consecutive LF, CR, TAB, Unicode line
     * separator (Zl), Unicode paragraph separator (Zp) or NEL (U+0085)
     * characters. {@link #parse(String)} deletes every match, wherever it
     * occurs in the URL.
     */
    public static final String STRAY_SPACING = "[\n\r\t\\p{Zl}\\p{Zp}\u0085]+";
    
    /**
     * Pattern that looks for the case of three or more slashes after an
     * "http:" or "https:" scheme.  If found, we replace them with two only,
     * as mozilla does.  Group 1 is the scheme together with the first two
     * slashes, group 2 is everything after the surplus slashes.  The match
     * is case-sensitive, so only lower-case schemes are recognized.
     */
    final static Pattern HTTP_SCHEME_SLASHES =
        Pattern.compile("^(https?://)/+(.*)");

	/**
	 * Prefix of ARC/WARC specific DNS resolution records.
	 * {@link #parse(String)} returns URLs starting with it as opaque URLs.
	 */
	public final static String DNS_SCHEME = "dns:";
	/**
	 * Prefix of ARC header records.
	 * {@link #parse(String)} returns URLs starting with it as opaque URLs.
	 */
	public final static String FILEDESC_SCHEME = "filedesc:";
	/**
	 * Prefix of WARC header records.
	 * {@link #parse(String)} returns URLs starting with it as opaque URLs.
	 */
	public final static String WARCINFO_SCHEME = "warcinfo:";
	
	/**
	 * HTTP scheme prefix, including the trailing "://".
	 */
	public final static String HTTP_SCHEME = "http://";
	/**
	 * HTTPS scheme prefix, including the trailing "://".
	 */
	public final static String HTTPS_SCHEME = "https://";
	/**
	 * FTP scheme prefix, including the trailing "://".
	 */
	public final static String FTP_SCHEME = "ftp://";
	/**
	 * MMS scheme prefix, including the trailing "://".
	 */
	public final static String MMS_SCHEME = "mms://";
	/**
	 * RTSP scheme prefix, including the trailing "://".
	 */
	public final static String RTSP_SCHEME = "rtsp://";
	
	/**
	 * Default scheme to assume if unspecified. No context implied...
	 * This is what {@link #addDefaultSchemeIfNeeded(String)} prepends.
	 */
	public final static String DEFAULT_SCHEME = HTTP_SCHEME;	
	
	/**
	 * WAIS scheme prefix, including the trailing "://" (go brewster).
	 */
	public final static String WAIS_SCHEME = "wais://";
	
	/**
	 * Array of static Strings for all "known" schemes, each including its
	 * trailing "://". {@link #urlToScheme(String)} tests them in this order.
	 */
	public final static String ALL_SCHEMES[] = { 
		HTTP_SCHEME,
		HTTPS_SCHEME,
		FTP_SCHEME,
		MMS_SCHEME,
		RTSP_SCHEME,
		WAIS_SCHEME
	};
	
	/**
	 * Case-insensitive pattern for a string that begins with one of the
	 * known schemes followed by "://". It lists the same schemes as
	 * {@link #ALL_SCHEMES}. The trailing {@code .*} does not match line
	 * terminators, so {@code matches()} fails on input that contains one.
	 */
	public final static Pattern ALL_SCHEMES_PATTERN =
		Pattern.compile("(?i)^(http|https|ftp|mms|rtsp|wais)://.*");
	
	/**
	 * Attempt to find the scheme (http://, https://, etc) from a given URL.
	 * The URL is compared, case-sensitively, against each entry of
	 * {@link #ALL_SCHEMES} in array order and the first entry that the URL
	 * starts with is returned.
	 *
	 * @param url URL String to parse for a scheme; may be null.
	 * @return the scheme, including trailing "://" if known, null otherwise
	 *         (including when {@code url} itself is null).
	 */
	public static String urlToScheme(final String url) {
		// Mirror addDefaultSchemeIfNeeded(): a null URL has no scheme rather
		// than being a reason to throw NullPointerException.
		if(url == null) {
			return null;
		}
		for(final String scheme : ALL_SCHEMES) {
			if(url.startsWith(scheme)) {
				return scheme;
			}
		}
		return null;
	}
    
	/**
	 * Prepends {@link #DEFAULT_SCHEME} to a URL that does not already start
	 * with one of the known schemes (see {@link #ALL_SCHEMES_PATTERN}; the
	 * comparison is case-insensitive).
	 *
	 * @param urlString the URL to check; may be null.
	 * @return null if {@code urlString} is null, {@code urlString} unchanged
	 *         if it already begins with a known scheme followed by "://",
	 *         otherwise {@code DEFAULT_SCHEME + urlString}.
	 */
	public static String addDefaultSchemeIfNeeded(String urlString) {
		if(urlString == null) {
			return null;
		}
		// Only the start of the string matters. lookingAt() anchors the
		// pattern at the beginning without requiring its trailing ".*" to
		// consume the whole input, so a URL that contains a line terminator
		// is still recognized as having a scheme instead of getting a
		// second "http://" prepended.
		if(ALL_SCHEMES_PATTERN.matcher(urlString).lookingAt()) {
			return urlString;
		}
		// add http:// if no scheme is present:
		return DEFAULT_SCHEME + urlString;
	}
	
    /**
     * Parses a URL string into a {@link HandyURL}.
     *
     * <p>Processing order:
     * <ol>
     * <li>the input is trimmed and every run matching
     *     {@link #STRAY_SPACING} is deleted;</li>
     * <li>if the result starts with {@link #DNS_SCHEME},
     *     {@link #FILEDESC_SCHEME} or {@link #WARCINFO_SCHEME}, a HandyURL
     *     holding the cleaned string as its opaque value is returned;</li>
     * <li>{@link #DEFAULT_SCHEME} is prepended unless a known scheme is
     *     already present;</li>
     * <li>three or more slashes after "http:" / "https:" are reduced to
     *     two;</li>
     * <li>the string is split with {@link #RFC2396REGEX} and the authority
     *     is broken down into user, password, host and port.</li>
     * </ol>
     *
     * <p>A host written as a bracketed IPv6 literal keeps its brackets, and
     * the colons inside the brackets are not mistaken for the port
     * separator, whether or not userinfo precedes the host. An empty
     * authority or empty host yields an empty hostname. A missing port, or a
     * bare trailing colon, leaves the port at {@code HandyURL.DEFAULT_PORT}.
     *
     * @param urlString the URL to parse; must not be null.
     * @return the parsed URL.
     * @throws URISyntaxException if the cleaned string does not match
     *         {@link #RFC2396REGEX}, or if the text after the port colon is
     *         not parsable by {@link Integer#parseInt(String)}.
     * @throws NullPointerException if {@code urlString} is null.
     */
    public static HandyURL parse(String urlString) throws URISyntaxException {

        // first strip leading or trailing spaces:
        // TODO: this strips too much - stripping non-printables
        urlString = urlString.trim();

        // then remove leading, trailing, and internal TAB, CR, LF (and the
        // other separators listed in STRAY_SPACING):
        urlString = urlString.replaceAll(STRAY_SPACING, "");

        // check for non-standard URLs; these are kept as opaque strings:
        if (urlString.startsWith(DNS_SCHEME)
                || urlString.startsWith(FILEDESC_SCHEME)
                || urlString.startsWith(WARCINFO_SCHEME)) {
            HandyURL opaqueUrl = new HandyURL();
            // TODO: we could set the authority - to allow SURT stuff to work..
            opaqueUrl.setOpaque(urlString);
            return opaqueUrl;
        }

        // add http:// if no scheme is present..
        urlString = addDefaultSchemeIfNeeded(urlString);

        // replace leading http:/// with http://
        Matcher slashes = HTTP_SCHEME_SLASHES.matcher(urlString);
        if (slashes.matches()) {
            urlString = slashes.group(1) + slashes.group(2);
        }

        // cross fingers, toes, eyes...
        Matcher matcher = RFC2396REGEX.matcher(urlString);
        if (!matcher.matches()) {
            throw new URISyntaxException(urlString,
                    "string does not match RFC 2396 regex");
        }
        String uriScheme = matcher.group(2);
        String uriAuthority = matcher.group(5);
        String uriPath = matcher.group(6);
        String uriQuery = matcher.group(8);
        String uriFragment = matcher.group(10);

        // Not expected to be null, because a scheme followed by "//" has been
        // ensured above, but treat a missing authority as empty rather than
        // failing with a NullPointerException.
        if (uriAuthority == null) {
            uriAuthority = "";
        }

        // Split Authority into USER:PASS@HOST:PORT
        int atIndex = uriAuthority.indexOf(COMMERCIAL_AT);
        // the host begins right after the '@', or at 0 without userinfo
        int hostStart = atIndex + 1;

        // Where to start looking for the port colon. For a bracketed IPv6
        // literal the search must begin at the closing bracket so that the
        // colons inside the address are skipped. The length check keeps an
        // empty authority or empty host ("http://", "user@") from indexing
        // past the end of the string.
        int colonSearchStart = hostStart;
        if (hostStart < uriAuthority.length()
                && uriAuthority.charAt(hostStart) == '[') {
            int closeBracket = uriAuthority.indexOf(']', hostStart + 1);
            if (closeBracket > -1) {
                colonSearchStart = closeBracket;
            }
        }
        int portColonIndex = uriAuthority.indexOf(COLON, colonSearchStart);

        String userInfo = null;
        if (atIndex > -1) {
            userInfo = uriAuthority.substring(0, atIndex);
        }
        int hostEnd = (portColonIndex > -1)
                ? portColonIndex : uriAuthority.length();
        String hostname = uriAuthority.substring(hostStart, hostEnd);

        int port = HandyURL.DEFAULT_PORT;
        if (portColonIndex > -1) {
            String portText = uriAuthority.substring(portColonIndex + 1);
            // a bare colon (http://example.com:/) keeps the default port
            if (portText.length() > 0) {
                try {
                    port = Integer.parseInt(portText);
                } catch (NumberFormatException e) {
                    URISyntaxException badPort = new URISyntaxException(
                            urlString, "bad port " + portText);
                    // URISyntaxException has no cause-taking constructor
                    badPort.initCause(e);
                    throw badPort;
                }
            }
        }

        String userName = null;
        String userPass = null;
        if (userInfo != null) {
            int passColonIndex = userInfo.indexOf(COLON);
            if (passColonIndex == -1) {
                // no password:
                userName = userInfo;
            } else {
                userName = userInfo.substring(0, passColonIndex);
                userPass = userInfo.substring(passColonIndex + 1);
            }
        }
        return new HandyURL(uriScheme, userName, userPass, hostname,
                port, uriPath, uriQuery, uriFragment);
    }
}