All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.archive.url.UsableURI Maven / Gradle / Ivy

The newest version!
/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.url;

import gnu.inet.encoding.IDNA;
import java.io.File;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.net.URI;
import java.net.URISyntaxException;

import org.apache.commons.httpclient.URIException;
import org.archive.util.SURT;
import org.archive.util.TextUtils;

/**
 * Usable URI.
 * 
 * This class wraps {@link org.apache.commons.httpclient.URI} adding caching
 * and methods. It cannot be instantiated directly.  Go via UURIFactory.
 * 
 *  

We used to use {@link java.net.URI} for parsing URIs but ran across * quirky behaviors and bugs. {@link java.net.URI} is not subclassable -- * its final -- and its unlikely that java.net.URI will change any time soon * (See Gordon's considered petition here: * java.net.URI * should have loose/tolerant/compatibility option (or allow reuse)). * *

This class tries to cache calculated strings such as the extracted host * and this class as a string rather than have the parent class rerun its * calculation everytime. * * @author gojomo * @author stack * * @see org.apache.commons.httpclient.URI */ public class UsableURI extends LaxURI implements CharSequence, Serializable { private static final long serialVersionUID = -1277570889914647093L; //private static Logger LOGGER = // Logger.getLogger(UURI.class.getName()); /** * Consider URIs too long for IE as illegal. */ public final static int MAX_URL_LENGTH = 2083; public static final String MASSAGEHOST_PATTERN = "^www\\d*\\."; /** * Cache of the host name. * * Super class calculates on every call. Profiling shows us spend 30% of * total elapsed time in URI class. */ private transient String cachedHost = null; /** * Cache of this uuri escaped as a string. * * Super class calculates on every call. Profiling shows us spend 30% of * total elapsed time in URI class. */ private transient String cachedEscapedURI = null; /** * Cache of this uuri escaped as a string. * * Super class calculates on every call. Profiling shows us spend 30% of * total elapsed time in URI class. */ private transient String cachedString = null; /** * Cached authority minus userinfo. */ private transient String cachedAuthorityMinusUserinfo = null; /** * Cache of this uuri in SURT format */ private transient String surtForm = null; // Technically, underscores are disallowed in the domainlabel // portion of hostname according to rfc2396 but we'll be more // loose and allow them. See: [ 1072035 ] [uuri] Underscore in // host messes up port parsing. static { hostname.set('_'); } /** * Shutdown access to default constructor. */ protected UsableURI() { super(); } /** * @param uri String representation of an absolute URI. * @param escaped If escaped. * @param charset Charset to use. * @throws org.apache.commons.httpclient.URIException */ protected UsableURI(String uri, boolean escaped, String charset) throws URIException { super(uri, escaped, charset); normalize(); } /** * @param relative String representation of URI. * @param base Parent UURI to use derelativizing. * @throws org.apache.commons.httpclient.URIException */ protected UsableURI(UsableURI base, UsableURI relative) throws URIException { super(base, relative); normalize(); } /** * @param uri String representation of a URI. * @param escaped If escaped. * @throws NullPointerException * @throws URIException */ protected UsableURI(String uri, boolean escaped) throws URIException, NullPointerException { super(uri,escaped); normalize(); } /** * @param uri URI as string that is resolved relative to this UURI. * @return UURI that uses this UURI as base. * @throws URIException */ public UsableURI resolve(String uri) throws URIException { return resolve(uri, false, // assume not escaped this.getProtocolCharset()); } /** * @param uri URI as string that is resolved relative to this UURI. * @param e True if escaped. * @return UURI that uses this UURI as base. * @throws URIException */ public UsableURI resolve(String uri, boolean e) throws URIException { return resolve(uri, e, this.getProtocolCharset()); } /** * @param uri URI as string that is resolved relative to this UURI. * @param e True if uri is escaped. * @param charset Charset to use. * @return UURI that uses this UURI as base. * @throws URIException */ public UsableURI resolve(String uri, boolean e, String charset) throws URIException { return new UsableURI(this, new UsableURI(uri, e, charset)); } /** * Test an object if this UURI is equal to another. * * @param obj an object to compare * @return true if two URI objects are equal */ public boolean equals(Object obj) { // normalize and test each components if (obj == this) { return true; } if (!(obj instanceof UsableURI)) { return false; } UsableURI another = (UsableURI) obj; // scheme if (!equals(this._scheme, another._scheme)) { return false; } // is_opaque_part or is_hier_part? and opaque if (!equals(this._opaque, another._opaque)) { return false; } // is_hier_part // has_authority if (!equals(this._authority, another._authority)) { return false; } // path if (!equals(this._path, another._path)) { return false; } // has_query if (!equals(this._query, another._query)) { return false; } // UURIs do not have fragments return true; } /** * Strips www variants from the host. * * Strips www[0-9]*\. from the host. If calling getHostBaseName becomes a * performance issue we should consider adding the hostBasename member that * is set on initialization. * * @return Host's basename. * @throws URIException */ public String getHostBasename() throws URIException { // caching eliminated because this is rarely used // (only benefits legacy DomainScope, which should // be retired). Saves 4-byte object pointer in UURI // instances. return (this.getReferencedHost() == null) ? null : TextUtils.replaceFirst(MASSAGEHOST_PATTERN, this.getReferencedHost(), UsableURIFactory.EMPTY_STRING); } /** * Returns an alternate, functional String representation -- in this * case, a String of the URI represented by this UURI instance. * * @return */ public synchronized String toCustomString() { if (this.cachedString == null) { this.cachedString = super.toString(); coalesceUriStrings(); } return this.cachedString; } /** * Override to cache result * * TODO: eliminate, moving most callers to toCustomString, to avoid * overloading/diluting toString() * (see http://webteam.archive.org/confluence/display/Heritrix/Preserve+toString%28%29 ) * @return String representation of this URI */ public String toString() { return toCustomString(); } /** * In the case of a puny encoded IDN, this method returns the decoded Unicode version. *

* Most of this implementation is copied from {@link org.apache.commons.httpclient.URI#setURI()}. * * @return decoded IDN version of URI */ public String toUnicodeHostString() { if (!_is_hostname) { return toString(); } try { StringBuilder buf = new StringBuilder(); if (_scheme != null) { buf.append(_scheme); buf.append(':'); } if (_is_net_path) { buf.append("//"); if (_authority != null) { // has_authority if (_userinfo != null) { buf.append(_userinfo).append('@'); } buf.append(IDNA.toUnicode(getHost())); if (_port >= 0) { buf.append(':').append(_port); } } } if (_opaque != null && _is_opaque_part) { buf.append(_opaque); } else if (_path != null) { // _is_hier_part or _is_relativeURI if (_path.length != 0) { buf.append(_path); } } if (_query != null) { // has_query buf.append('?'); buf.append(_query); } return buf.toString(); } catch (URIException ex) { throw new RuntimeException(ex); } } public synchronized String getEscapedURI() { if (this.cachedEscapedURI == null) { this.cachedEscapedURI = super.getEscapedURI(); coalesceUriStrings(); } return this.cachedEscapedURI; } /** * The two String fields cachedString and cachedEscapedURI are * usually identical; if so, coalesce into a single instance. */ protected void coalesceUriStrings() { if (this.cachedString != null && this.cachedEscapedURI != null && this.cachedString.length() == this.cachedEscapedURI.length()) { // lengths will only be identical if contents are identical // (deescaping will always shrink length), so coalesce to // use only single cached instance this.cachedString = this.cachedEscapedURI; } } public synchronized String getHost() throws URIException { if (this.cachedHost == null) { // If this._host is null, 3.0 httpclient throws // illegalargumentexception. Don't go there. if (this._host != null) { this.cachedHost = super.getHost(); coalesceHostAuthorityStrings(); } } return this.cachedHost; } /** * The two String fields cachedHost and cachedAuthorityMinusUserInfo are * usually identical; if so, coalesce into a single instance. */ protected void coalesceHostAuthorityStrings() { if (this.cachedAuthorityMinusUserinfo != null && this.cachedHost != null && this.cachedHost.length() == this.cachedAuthorityMinusUserinfo.length()) { // lengths can only be identical if contents // are identical; use only one instance this.cachedAuthorityMinusUserinfo = this.cachedHost; } } /** * Return the referenced host in the UURI, if any, also extracting the * host of a DNS-lookup URI where necessary. * * @return the target or topic host of the URI * @throws URIException */ public String getReferencedHost() throws URIException { String referencedHost = this.getHost(); if(referencedHost==null && this.getScheme().equals("dns")) { // extract target domain of DNS lookup String possibleHost = this.getCurrentHierPath(); if(possibleHost != null && possibleHost.matches("[-_\\w\\.:]+")) { referencedHost = possibleHost; } } return referencedHost; } /** * @return Return the 'SURT' format of this UURI */ public String getSurtForm() { if (surtForm == null) { surtForm = SURT.fromURI(this.toString()); } return surtForm; } /** * Return the authority minus userinfo (if any). * * If no userinfo present, just returns the authority. * * @return The authority stripped of any userinfo if present. * @throws URIException */ public String getAuthorityMinusUserinfo() throws URIException { if (this.cachedAuthorityMinusUserinfo == null) { String tmp = getAuthority(); if (tmp != null && tmp.length() > 0) { int index = tmp.indexOf('@'); if (index >= 0 && index < tmp.length()) { tmp = tmp.substring(index + 1); } } this.cachedAuthorityMinusUserinfo = tmp; coalesceHostAuthorityStrings(); } return this.cachedAuthorityMinusUserinfo; } /* (non-Javadoc) * @see java.lang.CharSequence#length() */ public int length() { return getEscapedURI().length(); } /* (non-Javadoc) * @see java.lang.CharSequence#charAt(int) */ public char charAt(int index) { return getEscapedURI().charAt(index); } /* (non-Javadoc) * @see java.lang.CharSequence#subSequence(int, int) */ public CharSequence subSequence(int start, int end) { return getEscapedURI().subSequence(start,end); } /* (non-Javadoc) * @see java.lang.Comparable#compareTo(java.lang.Object) */ public int compareTo(Object arg0) { return getEscapedURI().compareTo(arg0.toString()); } /** * Test if passed String has likely URI scheme prefix. * @param possibleUrl URL string to examine. * @return True if passed string looks like it could be an URL. */ public static boolean hasScheme(String possibleUrl) { boolean result = false; for (int i = 0; i < possibleUrl.length(); i++) { char c = possibleUrl.charAt(i); if (c == ':') { if (i != 0) { result = true; } break; } if (!scheme.get(c)) { break; } } return result; } /** * @param pathOrUri A file path or a URI. * @return Path parsed from passed pathOrUri. * @throws URISyntaxException */ public static String parseFilename(final String pathOrUri) throws URISyntaxException { String path = pathOrUri; if (UsableURI.hasScheme(pathOrUri)) { URI url = new URI(pathOrUri); path = url.getPath(); } return (new File(path)).getName(); } private void writeObject(ObjectOutputStream stream) throws IOException { stream.writeUTF(toCustomString()); } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy