// org.archive.url.UsableURI (Maven / Gradle / Ivy artifact listing header)
/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.url;
import gnu.inet.encoding.IDNA;
import java.io.File;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.net.URI;
import java.net.URISyntaxException;
import org.apache.commons.httpclient.URIException;
import org.archive.util.SURT;
import org.archive.util.TextUtils;
/**
* Usable URI.
*
* This class wraps {@link org.apache.commons.httpclient.URI} adding caching
* and methods. It cannot be instantiated directly. Go via UURIFactory.
*
* We used to use {@link java.net.URI} for parsing URIs but ran across
* quirky behaviors and bugs. {@link java.net.URI} is not subclassable --
* it's final -- and it's unlikely that java.net.URI will change any time soon
* (see Gordon's considered petition: "java.net.URI should have
* loose/tolerant/compatibility option (or allow reuse)").
*
* This class tries to cache calculated strings, such as the extracted host
* and this URI rendered as a string, rather than have the parent class
* rerun its calculation every time.
*
* @author gojomo
* @author stack
*
* @see org.apache.commons.httpclient.URI
*/
public class UsableURI extends LaxURI
implements CharSequence, Serializable {
private static final long serialVersionUID = -1277570889914647093L;
//private static Logger LOGGER =
// Logger.getLogger(UURI.class.getName());
/**
 * Maximum URL length: URIs too long for Internet Explorer are considered
 * illegal. The limit is not enforced by the code visible in this class.
 */
public final static int MAX_URL_LENGTH = 2083;
/**
 * Regular expression matching a leading "www", optional digits and a dot.
 * Used by {@link #getHostBasename()} to strip that prefix from a host.
 */
public static final String MASSAGEHOST_PATTERN = "^www\\d*\\.";
/**
 * Cache of the host name, filled lazily by {@link #getHost()}.
 *
 * Super class calculates on every call. Profiling shows us spend 30% of
 * total elapsed time in URI class.
 */
private transient String cachedHost = null;
/**
 * Cache of this uuri escaped as a string, filled lazily by
 * {@link #getEscapedURI()}.
 *
 * Super class calculates on every call. Profiling shows us spend 30% of
 * total elapsed time in URI class.
 */
private transient String cachedEscapedURI = null;
/**
 * Cache of the value returned by the superclass {@code toString()},
 * filled lazily by {@link #toCustomString()}. May end up referring to the
 * same String instance as {@link #cachedEscapedURI} (see
 * {@link #coalesceUriStrings()}).
 *
 * Super class calculates on every call. Profiling shows us spend 30% of
 * total elapsed time in URI class.
 */
private transient String cachedString = null;
/**
 * Cached authority minus userinfo, filled lazily by
 * {@link #getAuthorityMinusUserinfo()}.
 */
private transient String cachedAuthorityMinusUserinfo = null;
/**
 * Cache of this uuri in SURT format, filled lazily by
 * {@link #getSurtForm()}.
 */
private transient String surtForm = null;
// Technically, underscores are disallowed in the domainlabel
// portion of a hostname according to RFC 2396, but we are deliberately
// more lenient and allow them. See: [ 1072035 ] [uuri] Underscore in
// host messes up port parsing.
// NOTE(review): 'hostname' is not declared in this file; presumably it is
// the inherited set of characters accepted in hostnames -- confirm against
// the superclass. This initializer runs once, at class initialization.
static {
hostname.set('_');
}
/**
 * No-argument constructor; only invokes the superclass no-argument
 * constructor. Unlike the other constructors of this class it does not
 * call {@code normalize()}.
 *
 * Declared {@code protected} to shut down general access to the default
 * constructor.
 */
protected UsableURI() {
super();
}
/**
 * Builds an instance from a URI string by passing all arguments to the
 * superclass constructor and then calling {@code normalize()}.
 *
 * @param uri String representation of an absolute URI.
 * @param escaped If escaped.
 * @param charset Charset to use.
 * @throws org.apache.commons.httpclient.URIException declared by this
 * constructor; presumably raised when parsing or normalization fails.
 */
protected UsableURI(String uri, boolean escaped, String charset)
throws URIException {
super(uri, escaped, charset);
normalize();
}
/**
 * Builds an instance from a base URI and a relative URI by passing both
 * to the superclass constructor and then calling {@code normalize()}.
 *
 * @param base Parent UURI to use derelativizing.
 * @param relative URI to be resolved against {@code base}.
 * @throws org.apache.commons.httpclient.URIException declared by this
 * constructor; presumably raised when resolution or normalization fails.
 */
protected UsableURI(UsableURI base, UsableURI relative) throws URIException {
super(base, relative);
normalize();
}
/**
 * Builds an instance from a URI string by passing both arguments to the
 * superclass constructor and then calling {@code normalize()}. No charset
 * is supplied by this constructor.
 *
 * @param uri String representation of a URI.
 * @param escaped If escaped.
 * @throws NullPointerException declared by this constructor; presumably
 * raised when {@code uri} is null -- confirm against the superclass.
 * @throws URIException declared by this constructor; presumably raised
 * when parsing or normalization fails.
 */
protected UsableURI(String uri, boolean escaped) throws URIException, NullPointerException {
super(uri,escaped);
normalize();
}
/**
 * Resolves the given URI string against this UURI, treating the string
 * as not escaped and using this UURI's protocol charset.
 *
 * @param uri URI as string that is resolved relative to this UURI.
 * @return UURI that uses this UURI as base.
 * @throws URIException if the three-argument {@code resolve} fails.
 */
public UsableURI resolve(String uri)
        throws URIException {
    final boolean alreadyEscaped = false; // assume not escaped
    final String protocolCharset = this.getProtocolCharset();
    return resolve(uri, alreadyEscaped, protocolCharset);
}
/**
 * Resolves the given URI string against this UURI using this UURI's
 * protocol charset.
 *
 * @param uri URI as string that is resolved relative to this UURI.
 * @param e True if escaped.
 * @return UURI that uses this UURI as base.
 * @throws URIException if the three-argument {@code resolve} fails.
 */
public UsableURI resolve(String uri, boolean e)
        throws URIException {
    final String protocolCharset = this.getProtocolCharset();
    return resolve(uri, e, protocolCharset);
}
/**
 * Resolves the given URI string against this UURI: the string is first
 * turned into a UsableURI of its own, which is then combined with this
 * instance as the base.
 *
 * @param uri URI as string that is resolved relative to this UURI.
 * @param e True if uri is escaped.
 * @param charset Charset to use.
 * @return UURI that uses this UURI as base.
 * @throws URIException if either UsableURI cannot be constructed.
 */
public UsableURI resolve(String uri, boolean e, String charset)
        throws URIException {
    final UsableURI relative = new UsableURI(uri, e, charset);
    return new UsableURI(this, relative);
}
/**
 * Tests whether this UURI is equal to another object.
 *
 * Two instances are equal when their scheme, opaque part, authority, path
 * and query all match; the components are compared in that order with the
 * two-argument {@code equals} helper (not defined in this file).
 * Fragments are not compared, since UURIs do not have fragments.
 *
 * @param obj an object to compare
 * @return true if {@code obj} is this instance, or is a UsableURI whose
 * compared components all match; false otherwise
 */
@Override
public boolean equals(Object obj) {
    if (this == obj) {
        return true;
    }
    if (!(obj instanceof UsableURI)) {
        return false;
    }
    final UsableURI other = (UsableURI) obj;
    // Short-circuits on the first differing component, in the order:
    // scheme, opaque part, authority, path, query.
    return equals(this._scheme, other._scheme)
            && equals(this._opaque, other._opaque)
            && equals(this._authority, other._authority)
            && equals(this._path, other._path)
            && equals(this._query, other._query);
}
/**
 * Strips www variants from the host.
 *
 * Removes the first match of {@link #MASSAGEHOST_PATTERN}
 * (www[0-9]*\.) from the referenced host. The result is not cached
 * because this is rarely used (it only benefits legacy DomainScope, which
 * should be retired); that saves an object pointer per UURI instance.
 *
 * @return Host's basename, or null if there is no referenced host.
 * @throws URIException if {@link #getReferencedHost()} fails.
 */
public String getHostBasename() throws URIException {
    // Look the host up once: getReferencedHost() may run a regex match
    // for dns: URIs, so calling it twice repeats that work.
    final String host = this.getReferencedHost();
    if (host == null) {
        return null;
    }
    return TextUtils.replaceFirst(MASSAGEHOST_PATTERN, host,
            UsableURIFactory.EMPTY_STRING);
}
/**
 * Returns an alternate, functional String representation -- in this
 * case, a String of the URI represented by this UURI instance.
 *
 * The value comes from the superclass {@code toString()} and is cached
 * on first use; later calls return the cached String.
 *
 * @return String form of this URI
 */
public synchronized String toCustomString() {
    if (this.cachedString != null) {
        return this.cachedString;
    }
    this.cachedString = super.toString();
    // Share one String instance with the escaped form when possible.
    coalesceUriStrings();
    return this.cachedString;
}
/**
 * Overridden so the result is cached; simply delegates to
 * {@link #toCustomString()}.
 *
 * TODO: eliminate, moving most callers to toCustomString, to avoid
 * overloading/diluting toString()
 * (see http://webteam.archive.org/confluence/display/Heritrix/Preserve+toString%28%29 )
 *
 * @return String representation of this URI
 */
@Override
public String toString() {
    return this.toCustomString();
}
/**
 * In the case of a puny encoded IDN, this method returns the decoded
 * Unicode version.
 *
 * When this URI's host is not flagged as a hostname, the regular
 * {@link #toString()} value is returned. Otherwise the URI is rebuilt
 * from its components with the host passed through
 * {@code IDNA.toUnicode}. No fragment is appended.
 *
 * Most of this implementation is copied from
 * {@link org.apache.commons.httpclient.URI#setURI()}.
 *
 * @return decoded IDN version of URI
 * @throws RuntimeException wrapping any URIException raised while
 * rebuilding the string
 */
public String toUnicodeHostString() {
    if (!_is_hostname) {
        return toString();
    }
    try {
        final StringBuilder out = new StringBuilder();
        if (_scheme != null) {
            out.append(_scheme).append(':');
        }
        if (_is_net_path) {
            out.append("//");
            if (_authority != null) { // has_authority
                if (_userinfo != null) {
                    out.append(_userinfo).append('@');
                }
                final String unicodeHost = IDNA.toUnicode(getHost());
                out.append(unicodeHost);
                if (_port >= 0) {
                    out.append(':').append(_port);
                }
            }
        }
        if (_opaque != null && _is_opaque_part) {
            out.append(_opaque);
        } else if (_path != null && _path.length != 0) {
            // _is_hier_part or _is_relativeURI
            out.append(_path);
        }
        if (_query != null) { // has_query
            out.append('?').append(_query);
        }
        return out.toString();
    } catch (URIException ex) {
        throw new RuntimeException(ex);
    }
}
/**
 * Returns the escaped form of this URI, computing it through the
 * superclass on the first call and returning the cached String afterwards.
 *
 * @return escaped string form of this URI
 */
@Override
public synchronized String getEscapedURI() {
    if (this.cachedEscapedURI != null) {
        return this.cachedEscapedURI;
    }
    this.cachedEscapedURI = super.getEscapedURI();
    // Share one String instance with cachedString when possible.
    coalesceUriStrings();
    return this.cachedEscapedURI;
}
/**
 * The two String fields cachedString and cachedEscapedURI are
 * usually identical; if so, coalesce into a single instance.
 *
 * Does nothing unless both caches are populated and have equal length.
 */
protected void coalesceUriStrings() {
    final String plain = this.cachedString;
    final String escaped = this.cachedEscapedURI;
    if (plain == null || escaped == null) {
        return;
    }
    if (plain.length() != escaped.length()) {
        return;
    }
    // Lengths will only be identical if contents are identical
    // (deescaping will always shrink length), so keep just the one
    // cached instance.
    this.cachedString = escaped;
}
/**
 * Returns the host of this URI, computing it through the superclass on
 * first use and returning the cached String afterwards.
 *
 * @return the host, or null when this URI has no host
 * @throws URIException if the superclass lookup fails
 */
@Override
public synchronized String getHost() throws URIException {
    // Only ask the superclass when a host is present: with a null _host,
    // httpclient 3.0 throws IllegalArgumentException. Don't go there.
    if (this.cachedHost == null && this._host != null) {
        this.cachedHost = super.getHost();
        coalesceHostAuthorityStrings();
    }
    return this.cachedHost;
}
/**
 * The two String fields cachedHost and cachedAuthorityMinusUserinfo are
 * usually identical; if so, coalesce into a single instance.
 *
 * Does nothing unless both caches are populated and have equal length.
 */
protected void coalesceHostAuthorityStrings() {
    final String authority = this.cachedAuthorityMinusUserinfo;
    final String host = this.cachedHost;
    if (authority == null || host == null) {
        return;
    }
    if (host.length() != authority.length()) {
        return;
    }
    // Lengths can only be identical if contents are identical;
    // use only one instance.
    this.cachedAuthorityMinusUserinfo = host;
}
/**
 * Return the referenced host in the UURI, if any, also extracting the
 * host of a DNS-lookup URI where necessary.
 *
 * When {@link #getHost()} yields null and the scheme is "dns", the
 * current hierarchical path is used as the host, provided it consists
 * only of word characters, '-', '_', '.' and ':'.
 *
 * @return the target or topic host of the URI, or null if there is none
 * @throws URIException if reading the host or path fails
 */
public String getReferencedHost() throws URIException {
    final String host = this.getHost();
    // Constant-first comparison: a URI with neither host nor scheme (for
    // example a relative reference) must yield null here, not a
    // NullPointerException.
    if (host != null || !"dns".equals(this.getScheme())) {
        return host;
    }
    // extract target domain of DNS lookup
    final String possibleHost = this.getCurrentHierPath();
    if (possibleHost != null && possibleHost.matches("[-_\\w\\.:]+")) {
        return possibleHost;
    }
    return null;
}
/**
 * Returns the 'SURT' format of this UURI, derived from
 * {@link #toString()} via {@code SURT.fromURI} on first use and cached
 * afterwards.
 *
 * @return Return the 'SURT' format of this UURI
 */
public String getSurtForm() {
    if (surtForm != null) {
        return surtForm;
    }
    surtForm = SURT.fromURI(this.toString());
    return surtForm;
}
/**
 * Return the authority minus userinfo (if any).
 *
 * If no userinfo present, just returns the authority. The result is
 * cached once it is non-null.
 *
 * @return The authority stripped of any userinfo if present; null when
 * {@code getAuthority()} returns null.
 * @throws URIException if reading the authority fails
 */
public String getAuthorityMinusUserinfo()
        throws URIException {
    if (this.cachedAuthorityMinusUserinfo == null) {
        String tmp = getAuthority();
        if (tmp != null && tmp.length() > 0) {
            // Split at the LAST '@': a host[:port] can never contain '@',
            // whereas userinfo can (e.g. an e-mail address used as a
            // user name, once the authority has been de-escaped).
            final int index = tmp.lastIndexOf('@');
            if (index >= 0) {
                tmp = tmp.substring(index + 1);
            }
        }
        this.cachedAuthorityMinusUserinfo = tmp;
        coalesceHostAuthorityStrings();
    }
    return this.cachedAuthorityMinusUserinfo;
}
/**
 * Returns the length of the escaped form of this URI.
 *
 * @return number of chars in {@link #getEscapedURI()}
 * @see java.lang.CharSequence#length()
 */
@Override
public int length() {
    final String escaped = getEscapedURI();
    return escaped.length();
}
/**
 * Returns the char at the given position of the escaped form of this URI.
 *
 * @param index zero-based position within {@link #getEscapedURI()}
 * @return the char at {@code index}
 * @see java.lang.CharSequence#charAt(int)
 */
@Override
public char charAt(int index) {
    final String escaped = getEscapedURI();
    return escaped.charAt(index);
}
/**
 * Returns a sub-sequence of the escaped form of this URI.
 *
 * @param start start index, inclusive
 * @param end end index, exclusive
 * @return the requested part of {@link #getEscapedURI()}
 * @see java.lang.CharSequence#subSequence(int, int)
 */
@Override
public CharSequence subSequence(int start, int end) {
    final String escaped = getEscapedURI();
    return escaped.subSequence(start, end);
}
/**
 * Orders this URI against another object by comparing the escaped form of
 * this URI with the other object's {@code toString()} value, using
 * {@link String#compareTo(String)}.
 *
 * @param arg0 object to compare against; must not be null
 * @return negative, zero or positive as defined by String.compareTo
 * @see java.lang.Comparable#compareTo(java.lang.Object)
 */
public int compareTo(Object arg0) {
    final String mine = getEscapedURI();
    final String theirs = arg0.toString();
    return mine.compareTo(theirs);
}
/**
 * Test if passed String has likely URI scheme prefix.
 *
 * Scans from the start: the answer is true when a ':' is reached at a
 * position other than 0 and every char before it is in the {@code scheme}
 * character set. It is false when a char outside that set comes first,
 * when the ':' is the first char, or when no ':' is found.
 *
 * @param possibleUrl URL string to examine.
 * @return True if passed string looks like it could be an URL.
 */
public static boolean hasScheme(String possibleUrl) {
    final int len = possibleUrl.length();
    for (int pos = 0; pos < len; pos++) {
        final char ch = possibleUrl.charAt(pos);
        if (ch == ':') {
            // A colon only ends a scheme if something precedes it.
            return pos != 0;
        }
        if (!scheme.get(ch)) {
            return false;
        }
    }
    return false;
}
/**
 * Extracts the file name (last path element) from a file path or a URI.
 *
 * If the argument looks like it has a scheme (see
 * {@link #hasScheme(String)}) it is parsed as a {@link java.net.URI} and
 * the URI's path is used; otherwise the argument itself is treated as the
 * path. For an opaque URI, which has no path, the scheme-specific part is
 * used instead.
 *
 * @param pathOrUri A file path or a URI.
 * @return Name of the last element of the path parsed from pathOrUri.
 * @throws URISyntaxException if pathOrUri has a scheme but is not a valid
 * URI.
 */
public static String parseFilename(final String pathOrUri)
        throws URISyntaxException {
    String path = pathOrUri;
    if (UsableURI.hasScheme(pathOrUri)) {
        final URI parsed = new URI(pathOrUri);
        path = parsed.getPath();
        if (path == null) {
            // Opaque URIs (e.g. "mailto:..." or "jar:file:...") have no
            // path; new File(null) would throw NullPointerException. The
            // scheme-specific part is never null.
            path = parsed.getSchemeSpecificPart();
        }
    }
    return new File(path).getName();
}
/**
 * Serialization hook: writes the string form of this URI, as returned by
 * {@link #toCustomString()}, to the stream with
 * {@link ObjectOutputStream#writeUTF(String)}.
 *
 * NOTE(review): no matching readObject is visible in this file; confirm
 * how the written string is consumed on deserialization.
 *
 * @param stream stream to write to
 * @throws IOException if writing to the stream fails
 */
private void writeObject(ObjectOutputStream stream) throws IOException {
stream.writeUTF(toCustomString());
}
}