All downloads are free. Search and download functionality uses the official Maven repository.

org.archive.url.LaxURI Maven / Gradle / Ivy

The newest version!
/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.url;

import java.util.Arrays;
import java.util.BitSet;

import org.apache.commons.httpclient.URI;
import org.apache.commons.httpclient.URIException;
import org.apache.commons.httpclient.util.EncodingUtil;

/**
 * URI subclass which allows partial/inconsistent encoding, matching
 * the URIs which will be relayed in requests from popular web
 * browsers (esp. Mozilla Firefox and MS IE).
 * 
 * @author gojomo
 */
public class LaxURI extends URI {

    /** Serialization version identifier for this class. */
    private static final long serialVersionUID = 5273922211722239537L;

    /**
     * Shared, long-lived character array spelling the "http" scheme.
     * setURI() substitutes this array for equal per-instance arrays.
     */
    protected static final char[] HTTP_SCHEME = "http".toCharArray();

    /**
     * Shared, long-lived character array spelling the "https" scheme.
     * setURI() substitutes this array for equal per-instance arrays.
     */
    protected static final char[] HTTPS_SCHEME = "https".toCharArray();
    
    /**
     * Builds a fresh 256-bit set holding every bit of <code>base</code>
     * plus one bit for each character of <code>extraChars</code>.
     *
     * @param base character set to start from (not modified)
     * @param extraChars characters to allow in addition; may be empty
     * @return the new, independent BitSet
     */
    private static BitSet relaxed(BitSet base, String extraChars) {
        BitSet result = new BitSet(256);
        result.or(base);
        for (int i = 0; i < extraChars.length(); i++) {
            result.set(extraChars.charAt(i));
        }
        return result;
    }

    /**
     * Superclass rel_segment characters, plus an unescaped ':'.
     * TODO: add additional allowances as need is demonstrated
     */
    protected static final BitSet lax_rel_segment = relaxed(rel_segment, ":");

    /**
     * Superclass abs_path characters, plus '|' (tests indicate
     * Firefox (1.0.6) doesn't escape it).
     */
    protected static final BitSet lax_abs_path = relaxed(abs_path, "|");

    /**
     * Union of the two lax sets above. Declared after them on purpose:
     * static initialization runs in textual order.
     */
    protected static final BitSet lax_rel_path = relaxed(lax_rel_segment, "");
    static {
        lax_rel_path.or(lax_abs_path);
    }

    /**
     * Superclass query characters, plus { } | [ ] ^ (tests indicate
     * Firefox doesn't escape any of these in the query).
     */
    protected static final BitSet lax_query = relaxed(query, "{}|[]^");
    
    // ---- constructors: every one simply hands its arguments to URI ----

    /**
     * Pass-through to the superclass constructor of the same signature.
     *
     * @param uri the URI string
     * @param escaped forwarded unchanged to the superclass
     * @param charset forwarded unchanged to the superclass
     * @throws URIException if the superclass constructor throws it
     */
    public LaxURI(String uri, boolean escaped, String charset)
            throws URIException {
        super(uri, escaped, charset);
    }

    /**
     * Pass-through to the superclass (base, relative) constructor.
     *
     * @param base forwarded unchanged to the superclass
     * @param relative forwarded unchanged to the superclass
     * @throws URIException if the superclass constructor throws it
     */
    public LaxURI(URI base, URI relative) throws URIException {
        super(base, relative);
    }

    /**
     * Pass-through to the superclass constructor of the same signature.
     *
     * @param uri the URI string
     * @param escaped forwarded unchanged to the superclass
     * @throws URIException if the superclass constructor throws it
     */
    public LaxURI(String uri, boolean escaped) throws URIException {
        super(uri, escaped);
    }

    /**
     * Creates an instance through the superclass's no-argument
     * constructor (invoked implicitly).
     */
    public LaxURI() {
    }

    /**
     * Returns the decoded form of the internal _uri field.
     * Overridden so that this class's static decode() is used.
     *
     * @return the decoded URI, or null when _uri is null
     * @throws URIException if thrown while decoding
     */
    public String getURI() throws URIException {
        if (_uri == null) {
            return null;
        }
        return decode(_uri, getProtocolCharset());
    }
    
    /**
     * Returns the decoded form of getRawPath().
     * Overridden so that this class's static decode() is used.
     *
     * @return the decoded path, or null when the raw path is null
     * @throws URIException if thrown while decoding
     */
    public String getPath() throws URIException {
        final char[] rawPath = getRawPath();
        if (rawPath == null) {
            return null;
        }
        return decode(rawPath, getProtocolCharset());
    }

    /**
     * Returns the decoded form of getRawPathQuery().
     * Overridden so that this class's static decode() is used.
     *
     * @return the decoded path-and-query, or null when the raw value is null
     * @throws URIException if thrown while decoding
     */
    public String getPathQuery() throws URIException {
        final char[] raw = getRawPathQuery();
        if (raw == null) {
            return null;
        }
        return decode(raw, getProtocolCharset());
    }
    /**
     * Decodes a char[] component by delegating to this class's
     * String-based decode(); declared here so that callers inside this
     * class reach the lax decoder.
     *
     * @param component the characters to decode; must not be null
     * @param charset charset used to build the resulting String
     * @return the decoded String
     * @throws IllegalArgumentException if component is null
     * @throws URIException if the String-based decode() throws it
     */
    protected static String decode(char[] component, String charset)
            throws URIException {
        if (component != null) {
            return decode(String.valueOf(component), charset);
        }
        throw new IllegalArgumentException(
                "Component array of chars may not be null");
    }

    /**
     * Decodes a String component using IA's LaxURLCodec, which never
     * throws DecoderException; hence no try/catch is needed around the
     * decoding call.
     *
     * @param component the text to decode; must not be null
     * @param charset charset used to turn the decoded bytes into a String
     * @return the decoded String
     * @throws IllegalArgumentException if component is null
     * @throws URIException declared for signature compatibility
     */
    protected static String decode(String component, String charset)
            throws URIException {
        if (component == null) {
            throw new IllegalArgumentException(
                    "Component array of chars may not be null");
        }
        // component -> ASCII bytes -> loosely URL-decoded bytes -> String
        final byte[] asciiBytes = EncodingUtil.getAsciiBytes(component);
        final byte[] decodedBytes = LaxURLCodec.decodeUrlLoose(asciiBytes);
        return EncodingUtil.getString(decodedBytes, charset);
    }
    
    /**
     * Validates a component after first passing the acceptable-character
     * BitSet through lax().
     *
     * @param component the characters to check
     * @param generous the acceptable-character set requested by the caller
     * @return the superclass validation result against the laxed set
     */
    protected boolean validate(char[] component, BitSet generous) {
        final BitSet acceptable = lax(generous);
        return super.validate(component, acceptable);
    }

    /**
     * Ranged variant: validates a component after first passing the
     * acceptable-character BitSet through lax().
     *
     * @param component the characters to check
     * @param soffset forwarded unchanged to the superclass
     * @param eoffset forwarded unchanged to the superclass
     * @param generous the acceptable-character set requested by the caller
     * @return the superclass validation result against the laxed set
     */
    protected boolean validate(char[] component, int soffset, int eoffset,
            BitSet generous) {
        final BitSet acceptable = lax(generous);
        return super.validate(component, soffset, eoffset, acceptable);
    }
    
    /**
     * Maps one of the superclass's predefined character sets to this
     * class's more permissive counterpart, so that validation better
     * matches what web browsers actually leave unencoded.
     *
     * Matching is by object identity, so only the superclass's own static
     * instances (rel_segment, abs_path, query, rel_path) are swapped;
     * any other BitSet is returned untouched.
     *
     * @param generous original BitSet
     * @return the lax replacement, or <code>generous</code> itself when no
     *         replacement is defined
     */
    protected BitSet lax(BitSet generous) {
        final BitSet chosen;
        if (generous == rel_segment) {
            chosen = lax_rel_segment;
        } else if (generous == abs_path) {
            chosen = lax_abs_path;
        } else if (generous == query) {
            chosen = lax_query;
        } else if (generous == rel_path) {
            chosen = lax_rel_path;
        } else {
            // no lax counterpart: use the caller's set as is
            chosen = generous;
        }
        return chosen;
    }
    
    /**
     * Parses the authority via the superclass, then shares one char[]
     * between _host and _authority where possible.
     *
     * In the web crawl/http domain most URIs carry no port or user info,
     * so _host and _authority hold the same text, yet the superclass
     * always allocates two separate arrays. Per the original author, the
     * two arrays have equal length if and only if their contents are
     * identical, so a length comparison alone decides whether _host can
     * simply point at the _authority array.
     *
     * @param original forwarded unchanged to the superclass
     * @param escaped forwarded unchanged to the superclass
     * @throws URIException if the superclass parse throws it
     * @see org.apache.commons.httpclient.URI#parseAuthority(java.lang.String, boolean)
     */
    protected void parseAuthority(String original, boolean escaped)
            throws URIException {
        super.parseAuthority(original, escaped);
        if (_host == null || _authority == null) {
            return;
        }
        if (_host.length == _authority.length) {
            // same length => same content (see above): drop the duplicate
            _host = _authority;
        }
    }
    
    
    /**
     * Replaces an "http" or "https" _scheme array with the matching
     * shared static array, then defers to the superclass.
     *
     * In the web-crawl domain most schemes are one of these two, but the
     * superclass always creates a new char[] per instance; pointing at a
     * long-lived static array instead saves that allocation (12-14 bytes
     * per instance by the original author's estimate).
     *
     * @see org.apache.commons.httpclient.URI#setURI()
     */
    protected void setURI() {
        final char[] current = _scheme;
        // Arrays.equals is null-safe and compares lengths before contents
        if (Arrays.equals(current, HTTP_SCHEME)) {
            _scheme = HTTP_SCHEME;
        } else if (Arrays.equals(current, HTTPS_SCHEME)) {
            _scheme = HTTPS_SCHEME;
        }
        super.setURI();
    }
    
    /**
     * IA OVERRIDDEN IN LaxURI TO INCLUDE FIX FOR 
     * http://issues.apache.org/jira/browse/HTTPCLIENT-588
     * AND
     * http://webteam.archive.org/jira/browse/HER-1268
     * 
     * In order to avoid any possilbity of conflict with non-ASCII characters,
     * Parse a URI reference as a String with the character
     * encoding of the local system or the document.
     * 

* The following line is the regular expression for breaking-down a URI * reference into its components. *

     *   ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
     *    12            3  4          5       6  7        8 9
     * 

* For example, matching the above expression to * http://jakarta.apache.org/ietf/uri/#Related * results in the following subexpression matches: *

     *               $1 = http:
     *  scheme    =  $2 = http
     *               $3 = //jakarta.apache.org
     *  authority =  $4 = jakarta.apache.org
     *  path      =  $5 = /ietf/uri/
     *               $6 = <undefined>
     *  query     =  $7 = <undefined>
     *               $8 = #Related
     *  fragment  =  $9 = Related
     * 

* * @param original the original character sequence * @param escaped true if original is escaped * @throws URIException If an error occurs. */ protected void parseUriReference(String original, boolean escaped) throws URIException { // validate and contruct the URI character sequence if (original == null) { throw new URIException("URI-Reference required"); } /* @ * ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))? */ String tmp = original.trim(); /* * The length of the string sequence of characters. * It may not be equal to the length of the byte array. */ int length = tmp.length(); /* * Remove the delimiters like angle brackets around an URI. */ if (length > 0) { char[] firstDelimiter = { tmp.charAt(0) }; if (validate(firstDelimiter, delims)) { if (length >= 2) { char[] lastDelimiter = { tmp.charAt(length - 1) }; if (validate(lastDelimiter, delims)) { tmp = tmp.substring(1, length - 1); length = length - 2; } } } } /* * The starting index */ int from = 0; /* * The test flag whether the URI is started from the path component. */ boolean isStartedFromPath = false; int atColon = tmp.indexOf(':'); int atSlash = tmp.indexOf('/'); if (!tmp.startsWith("//") && (atColon <= 0 || (atSlash >= 0 && atSlash < atColon))) { isStartedFromPath = true; } /* *

         *     @@@@@@@@
         *  ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
         * 

*/ int at = indexFirstOf(tmp, isStartedFromPath ? "/?#" : ":/?#", from); if (at == -1) { at = 0; } /* * Parse the scheme. *

         *  scheme    =  $2 = http
         *              @
         *  ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
         * 

*/ if (at > 0 && at < length && tmp.charAt(at) == ':') { char[] target = tmp.substring(0, at).toLowerCase().toCharArray(); if (validate(target, scheme)) { _scheme = target; from = ++at; } else { // IA CHANGE: // do nothing; allow interpretation as URI with // later colon in other syntactical component } } /* * Parse the authority component. *

         *  authority =  $4 = jakarta.apache.org
         *                  @@
         *  ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
         * 

*/ // Reset flags _is_net_path = _is_abs_path = _is_rel_path = _is_hier_part = false; if (0 <= at && at < length && tmp.charAt(at) == '/') { // Set flag _is_hier_part = true; if (at + 2 < length && tmp.charAt(at + 1) == '/' && !isStartedFromPath) { // the temporary index to start the search from int next = indexFirstOf(tmp, "/?#", at + 2); if (next == -1) { next = (tmp.substring(at + 2).length() == 0) ? at + 2 : tmp.length(); } parseAuthority(tmp.substring(at + 2, next), escaped); from = at = next; // Set flag _is_net_path = true; } if (from == at) { // Set flag _is_abs_path = true; } } /* * Parse the path component. *

         *  path      =  $5 = /ietf/uri/
         *                                @@@@@@
         *  ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
         * 

*/ if (from < length) { // rel_path = rel_segment [ abs_path ] int next = indexFirstOf(tmp, "?#", from); if (next == -1) { next = tmp.length(); } if (!_is_abs_path) { if (!escaped && prevalidate(tmp.substring(from, next), disallowed_rel_path) || escaped && validate(tmp.substring(from, next).toCharArray(), rel_path)) { // Set flag _is_rel_path = true; } else if (!escaped && prevalidate(tmp.substring(from, next), disallowed_opaque_part) || escaped && validate(tmp.substring(from, next).toCharArray(), opaque_part)) { // Set flag _is_opaque_part = true; } else { // the path component may be empty _path = null; } } String s = tmp.substring(from, next); if (escaped) { setRawPath(s.toCharArray()); } else { setPath(s); } at = next; } // set the charset to do escape encoding String charset = getProtocolCharset(); /* * Parse the query component. *

         *  query     =  $7 = 
         *                                        @@@@@@@@@
         *  ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
         * 

*/ if (0 <= at && at + 1 < length && tmp.charAt(at) == '?') { int next = tmp.indexOf('#', at + 1); if (next == -1) { next = tmp.length(); } if (escaped) { _query = tmp.substring(at + 1, next).toCharArray(); if (!validate(_query, query)) { throw new URIException("Invalid query"); } } else { _query = encode(tmp.substring(at + 1, next), allowed_query, charset); } at = next; } /* * Parse the fragment component. *

         *  fragment  =  $9 = Related
         *                                                   @@@@@@@@
         *  ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
         * 

*/ if (0 <= at && at + 1 <= length && tmp.charAt(at) == '#') { if (at + 1 == length) { // empty fragment _fragment = "".toCharArray(); } else { _fragment = (escaped) ? tmp.substring(at + 1).toCharArray() : encode(tmp.substring(at + 1), allowed_fragment, charset); } } // set this URI. setURI(); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy