org.archive.util.UriUtils Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of heritrix-commons Show documentation
Show all versions of heritrix-commons Show documentation
The Archive Commons Code Libraries project contains general Java utility
libraries, as used by the Heritrix crawler and other projects.
The newest version!
/* UriUtils
*
* $Id: MimetypeUtils.java 3119 2005-02-17 20:39:21Z stack-sf $
*
* Created on April 15, 2010
*
* Copyright (C) 2010 Internet Archive.
*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Heritrix is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or
* any later version.
*
* Heritrix is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser Public License for more details.
*
* You should have received a copy of the GNU Lesser Public License
* along with Heritrix; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
package org.archive.util;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import org.apache.commons.codec.DecoderException;
import org.apache.commons.httpclient.URIException;
import org.archive.net.UURI;
import org.archive.url.LaxURLCodec;
/**
* URI-related utilities.
*
* Primarily, a place to centralize and better document and test certain URI-related heuristics
* that may be useful in many places.
*
* The choice of when to consider a string likely enough to be a URI that we try crawling it
* is, so far, based on rather arbitrary rules-of-thumb. We have not quantitatively tested
* how often the strings that pass these tests yield meaningful (not 404, non-soft-404,
* non-garbage) replies. We are willing to accept some level of mistaken requests, knowing
* that their cost is usually negligible, if that allows us to discover meaningful content
* that could not be discovered via other heuristics.
*
* Our intuitive understanding so far is that: strings that appear to have ./.. relative-path
* prefixes, dot-extensions, or path-slashes are good candidates for trying as URIs, even
* though with some Javascript/HTML-VALUE-attributes, this yields a lot of false positives.
*
* We want to get strings like....
*
* photo.jpg
* /photos
* /photos/
* ./photos
* ../../photos
* photos/index.html
*
* ...but we will thus also sometimes try strings that were other kinds of variables/
* parameters, like...
*
* rectangle.x
* 11.2px
* text/xml
* width:6.33
*
* Until better rules, exception-blacklists or even site-sensitive dynamic adjustment of
* heuristics (eg: this site, guesses yield 200s, keep guessing; this site, guesses are
* all 404s, stop guessing) are developed, crawl operators should monitor their crawls
* (and contact email) for cases where speculative crawling is generating many errors, and
* use settings like ExtractorHTML's 'extract-javascript' and 'extract-value-attributes' or
* disable ExtractorJS entirely when they want to curtail those errors.
*
* The 'legacy' tests are those used in H1 at least through 1.14.4. They have
* some known problems, but are not yet being dropped until more experience
* with the 'new' isLikelyUri() test is collected (in H3). Enable the 'xest'
* methods of the UriUtilsTest class for details.
*
* @author gojomo
*/
public class UriUtils {
// java.util.logging logger named after this class; the heuristics below use it for
// FINE-level accept/reject tracing and for INFO-level decode failures.
private static final Logger LOGGER = Logger.getLogger(UriUtils.class.getName());
/**
 * Tests whether the given character sequence looks like a {@code data:} URI.
 * The pattern is case-insensitive and allows leading whitespace before the
 * {@code data:} scheme.
 *
 * @param candidate the text to examine
 * @return the result of matching {@code candidate} against the data-URI pattern
 */
public static boolean isDataUri(CharSequence candidate) {
    final String dataUriRegex = "(?i)\\s*data:.*";
    return TextUtils.matches(dataUriRegex, candidate);
}
// naive likely-uri test:
// no '<' or '>'
// at least one '.' or '/';
// used by isPossibleUri() and as the first gate in isVeryLikelyUri()
protected static final String NAIVE_LIKELY_URI_PATTERN = "[^<>]*[\\./][^<>]*";
/**
 * Applies the naive likely-URI test ({@link #NAIVE_LIKELY_URI_PATTERN}) to
 * the given text: no angle brackets and at least one '.' or '/'.
 *
 * @param candidate the text to examine
 * @return the result of matching {@code candidate} against the naive pattern
 */
public static boolean isPossibleUri(CharSequence candidate) {
    final boolean passesNaiveTest = TextUtils.matches(NAIVE_LIKELY_URI_PATTERN, candidate);
    return passesNaiveTest;
}
/**
 * Tests whether a string is plausibly a URI: it must pass the naive
 * {@link #isPossibleUri(CharSequence)} test and must not be flagged by
 * {@link #isLikelyFalsePositive(CharSequence)}.
 *
 * @param candidate the text to examine
 * @return true if {@code candidate} passes the naive test and is not
 * flagged as a likely false positive
 * @deprecated produces too many false positives,
 * {@link #isVeryLikelyUri(CharSequence)} is preferred
 */
@Deprecated
public static boolean isLikelyUri(CharSequence candidate) {
    return isPossibleUri(candidate) && !isLikelyFalsePositive(candidate);
}
/**
 * Audio, image and video mimetype strings. A candidate that is exactly equal
 * to one of these entries is treated by the likely-URI heuristics in this
 * class as a mimetype rather than a URI. Comparison is exact (case-sensitive),
 * which is why some types are listed in more than one capitalization.
 */
protected final static String[] AUDIO_VIDEO_IMAGE_MIMETYPES = new String[] {
"audio/aiff",
"audio/asf",
"audio/basic",
"audio/m4a",
"audio/mid",
"audio/midi",
"audio/mp3",
"audio/mp4",
"audio/mp4a-latm",
"audio/mpeg",
"audio/mpeg3",
"audio/mpegurl",
"audio/mpg",
"audio/ogg",
"audio/playlist",
"audio/unknown",
"audio/vnd.qcelp",
"audio/vnd.rn-realaudio",
"audio/wav",
"audio/x-aiff",
"audio/x-m4a",
"audio/x-midi",
"audio/x-mp3",
"audio/x-mpeg",
"audio/x-mpeg3",
"audio/x-mpegurl",
"audio/x-ms-wax",
"audio/x-ms-wma",
"audio/x-ms-wmv",
"audio/x-pn-realaudio",
"audio/x-pn-realaudio-plugin",
"audio/x-realaudio",
"audio/x-scpls",
"audio/x-wav",
"image/bitmap",
"image/bmp",
"image/BMP",
"image/cur",
"image/fits",
"image/gif",
"image/GIF",
"image/ico",
"image/icon",
"image/jp2",
"image/jpeg",
"image/JPEG",
"image/jpeg-cmyk",
"image/jpg",
"image/JPG",
"image/pdf",
"image/pict",
"image/pjpeg",
"image/png",
"image/PNG",
"image/svg+xml",
"image/tiff",
"image/vnd.adobe.photoshop",
"image/vnd.djvu",
"image/vnd.dwg",
"image/vnd.dxf",
"image/vnd.microsoft.icon",
"image/vnd.ms-modi",
"image/vnd.ms-photo",
"image/vnd.wap.wbmp",
"image/x-bitmap",
"image/x-bmp",
"image/x-citrix-pjpeg",
"image/x-dcraw",
"image/x-djvu",
"image/x.djvu",
"image/x-emf",
"image/x-eps",
"image/x-guffaw",
"image/x-ico",
"image/xicon",
"image/x-icon",
"image/x-jg",
"image/x-ms-bmp",
"image/x-MS-bmp",
"image/x-pcx",
"image/x-photoshop",
"image/x-pict",
"image/x-png",
"image/x-portable-anymap",
"image/x-portable-bitmap",
"image/x-portable-graymap",
"image/x-portable-pixmap",
"image/x-psd",
"image/x-quicktime",
"image/x-rgb",
"image/x-windows-bmp",
"image/x-wmf",
"image/x-xbitmap",
"image/x-xbm",
"image/x-xfig",
"image/x-xpixmap",
"video/3gpp",
"video/asx",
"video/avi",
"video/f4v",
"video/flv",
"video/m4v",
"video/mp4",
"video/MP4",
"video/mp4v-es",
"video/mpeg",
"video/mpeg3",
"video/mpeg4",
"video/mpg4",
"video/msvideo",
"video/ogg",
"video/quicktime",
"video/swf",
"video/unknown",
"video/vnd.objectvideo",
"video/webm",
"video/wmv",
"video/x-dv",
"video/x-flv",
"video/x-m4v",
"video/x-mp4",
"video/x-mpeg",
"video/x-ms-asf",
"video/x-ms-asx",
"video/x-msvideo",
"video/x-ms-wm",
"video/x-ms-wma",
"video/x-ms-wmv",
"video/x-ms-wmx",
"video/x-ms-wvx",
"video/x-pn-realaudio",
"video/x-pn-realvideo",
"video/x-sgi-movie",
"video/x-swf"
};
protected static final Set AUDIO_VIDEO_IMAGE_MIMETYPE_SET = new HashSet();
static {
AUDIO_VIDEO_IMAGE_MIMETYPE_SET.addAll(Arrays.asList(AUDIO_VIDEO_IMAGE_MIMETYPES));
}
/**
 * Applies a series of rule-of-thumb checks to decide whether a string that
 * passed the naive likely-URI test is more probably something else: a
 * mimetype, a decimal number, a code fragment, etc. Each decision is traced
 * at FINE log level.
 *
 * @param candidate the text to examine; must not be null
 * @return true if {@code candidate} should be rejected as a probable false
 * positive (including when it is empty), false if none of the checks fired
 */
protected static boolean isLikelyFalsePositive(CharSequence candidate) {
    // An empty string cannot be a URI; answering here also keeps the
    // charAt(0) test further down from throwing on empty input.
    if (candidate.length() == 0) {
        return reject("is empty", candidate);
    }
    if (TextUtils.matches("(?:text|application)/[^/]+", candidate)) {
        return reject("looks like an application or text mimetype", candidate);
    }
    // the set holds Strings, so convert before the lookup
    if (AUDIO_VIDEO_IMAGE_MIMETYPE_SET.contains(candidate.toString())) {
        return reject("looks like an audio video or image mimetype", candidate);
    }
    if (TextUtils.matches("\\d+(?:\\.\\d+)*", candidate)) {
        return reject("looks like a decimal number", candidate);
    }
    if (TextUtils.matches(".*[$()'\"\\[\\]{}|].*", candidate)) {
        return reject("contains unusual characters", candidate);
    }
    // starting or ending with + particularly common because of string concatenation in javascript
    if (TextUtils.matches("^[,;+:].*|.*[.,;+:]$", candidate)) {
        return reject("starts or ends with an unusual starting or ending character", candidate);
    }
    if (candidate.charAt(0) == '.' && !TextUtils.matches("^\\.{1,2}/.*", candidate)) {
        return reject("starts with '.' (but not './' or '../')", candidate);
    }
    if (TextUtils.matches("^.*[^:]//.*$", candidate)) {
        return reject("contains '//' (but not '://')", candidate);
    }
    // look for things that look like hostnames and not filenames?
    // look for too many dots but make sure we take into account that url may have hostname?
    if (LOGGER.isLoggable(Level.FINE)) {
        LOGGER.fine("accepted: does not look like a false positive: " + candidate);
    }
    return false;
}

/** Logs the rejection reason at FINE level and returns true, so callers can {@code return reject(...)}. */
private static boolean reject(String reason, CharSequence candidate) {
    if (LOGGER.isLoggable(Level.FINE)) {
        LOGGER.fine("rejected: " + reason + ": " + candidate);
    }
    return true;
}
/**
 * Perform additional fixup of likely-URI Strings: unescape HTML-escaped
 * ampersands, URI-decode strings that begin with an encoded
 * {@code http(s)%3A}, and prepend a scheme to strings that look like a
 * scheme-less absolute URI (dotted domain whose last label is a TLD).
 *
 * @param candidate the likely-URI string to fix up
 * @param base URI of the document the string came from; used only to
 * preserve the scheme when the candidate starts with the same host. May be
 * null, in which case {@code http://} is used.
 * @return String changed/decoded to increase likelihood it is a
 * meaningful non-404 URI
 */
public static String speculativeFixup(String candidate, UURI base) {
    String retVal = candidate;
    // unescape ampersands (the pattern must be the escaped form "&amp;";
    // replacing "&" with "&" would do nothing)
    retVal = TextUtils.replaceAll("&amp;", retVal, "&");
    // uri-decode if begins with encoded 'http(s)?%3A'
    if (TextUtils.matches("(?i)^https?%3A.*", retVal)) {
        try {
            retVal = LaxURLCodec.DEFAULT.decode(retVal);
        } catch (DecoderException e) {
            LOGGER.log(Level.INFO, "unable to decode", e);
        }
    }
    // TODO: more URI-decoding if there are %-encoded parts?
    // detect scheme-less intended-absolute-URI
    // intent: "opens with what looks like a dotted-domain, and
    // last segment is a top-level-domain (eg "com", "org", etc)"
    Matcher m = TextUtils.getMatcher("(?:[^./]+\\.)+([^./]+)(?:/.*)?",
            retVal);
    try {
        if (m.matches() && ArchiveUtils.isTld(m.group(1))) {
            String schemePlus = "http://";
            // if on exact same host preserve scheme (eg https)
            try {
                // base, or its host, may be absent; fall back to http://
                String baseHost = (base == null) ? null : base.getHost();
                if (baseHost != null && retVal.startsWith(baseHost)) {
                    schemePlus = base.getScheme() + "://";
                }
            } catch (URIException e) {
                // error retrieving source host - ignore it
            }
            retVal = schemePlus + retVal;
        }
    } finally {
        // hand the matcher back even if a check above throws
        TextUtils.recycleMatcher(m);
    }
    return retVal;
}
protected static final Set HTML_TAGS = new HashSet();
static {
HTML_TAGS.addAll(Arrays.asList("a", "abbr", "acronym", "address",
"applet", "area", "article", "aside", "audio", "b",
"base", "basefont", "bdi", "bdo", "big", "blockquote",
"body", "br", "button", "canvas", "caption", "center",
"cite", "code", "col", "colgroup", "command",
"datalist", "dd", "del", "details", "dfn", "dir",
"div", "dl", "dt", "em", "embed", "fieldset",
"figcaption", "figure", "font", "footer", "form",
"frame", "frameset", "head", "header", "hgroup", "h1",
"h2", "h3", "h4", "h5", "h6", "hr", "html", "i",
"iframe", "img", "input", "ins", "kbd", "keygen",
"label", "legend", "li", "link", "map", "mark", "menu",
"meta", "meter", "nav", "noframes", "noscript",
"object", "ol", "optgroup", "option", "output", "p",
"param", "pre", "progress", "q", "rp", "rt", "ruby",
"s", "samp", "script", "section", "select", "small",
"source", "span", "strike", "strong", "style", "sub",
"summary", "sup", "table", "tbody", "td", "textarea",
"tfoot", "th", "thead", "time", "title", "tr", "track",
"tt", "u", "ul", "var", "video", "wbr"));
}
protected static final Set KNOWN_GOOD_FILE_EXTENSIONS = new HashSet();
static {
/*
* Real known use cases for this are .min.js, .min.css, and we've seen
* .jpg files with an extra dot in them. Other extensions are included
* in the list somewhat arbitrarily.
*/
KNOWN_GOOD_FILE_EXTENSIONS.addAll(Arrays.asList(".jpg", ".js", ".css",
".png", ".gif", ".swf", ".flv", ".mp4", ".mp3", ".jpeg",
".html", ".pdf"));
}
// Regex fragment for one query-string pair: a name of letters/underscores, '=',
// then a value made of word chars, '-', '/', '.', or %XX escapes (value may be empty).
protected static final String QNV = "[a-zA-Z_]+=(?:[\\w-/.]|%[0-9a-fA-F]{2})*"; // name=value for query strings
// Regex for a relative or server-relative URI, assembled piece by piece below.
// Every piece is optional, so callers apply further checks after matching.
// group(1) filename
// group(2) filename extension with leading '.'
protected static final String LIKELY_RELATIVE_URI_PATTERN =
"(?:\\.?/)?" // may start with "/" or "./"
+ "(?:(?:[\\s\\w-]+|\\.\\.)(?:/))*" // may have path/segments/segment2
+ "([\\s\\w-]+(?:\\.[\\w-]+)??(\\.[a-zA-Z0-9]{2,5})?)?" // may have a filename with or without an extension
+ "(?:\\?(?:"+ QNV + ")(?:&(?:" + QNV + "))*)?" // may have a ?query=string
+ "(?:#[\\w-]+)?"; // may have a #fragment
/**
 * Stricter likely-URI test. Accepts absolute http(s) URIs and
 * protocol-relative URIs with a dotted host outright; otherwise the
 * candidate must match {@link #LIKELY_RELATIVE_URI_PATTERN} and survive a
 * set of additional false-positive filters (multi-dot filenames, embedded
 * whitespace, mimetypes, decimal numbers, CSS-selector look-alikes).
 *
 * @param candidate the text to examine
 * @return true if {@code candidate} is considered very likely to be a URI
 */
public static boolean isVeryLikelyUri(CharSequence candidate) {
    // must have a . or /
    if (!TextUtils.matches(NAIVE_LIKELY_URI_PATTERN, candidate)) {
        return false;
    }
    // absolute uri
    if (TextUtils.matches("^(?i)https?://[^<>\\s/]+\\.[^<>\\s/]+(?:/[^<>\\s]*)?", candidate)) {
        return true;
    }
    // "protocol-relative" uri
    if (TextUtils.matches("^//[^<>\\s/]+\\.[^<>\\s/]+(?:/[^<>\\s]*)?", candidate)) {
        return true;
    }
    // relative or server-relative uri
    String filename;
    String extension;
    Matcher matcher = TextUtils.getMatcher(LIKELY_RELATIVE_URI_PATTERN, candidate);
    try {
        if (!matcher.matches()) {
            return false;
        }
        // copy the groups out before the matcher is handed back
        filename = matcher.group(1);
        extension = matcher.group(2);
    } finally {
        TextUtils.recycleMatcher(matcher);
    }
    /*
     * Remaining tests discard stuff that the
     * LIKELY_RELATIVE_URI_PATTERN can't catch
     */
    // if filename contains two dots, it must end with a known good extension
    if (filename != null && extension != null
            && filename.indexOf('.') != filename.lastIndexOf('.')
            && !KNOWN_GOOD_FILE_EXTENSIONS.contains(extension)) {
        return false;
    }
    // embedded whitespace is tolerated only with a known good extension
    if (TextUtils.matches(".*\\s+.*", candidate)
            && (extension == null
                    || !KNOWN_GOOD_FILE_EXTENSIONS.contains(extension))) {
        return false;
    }
    // text or application mimetype
    if (TextUtils.matches("(?:text|application)/[^/]+", candidate)) {
        return false;
    }
    // audio, video or image mimetype
    // (the set holds Strings; a non-String CharSequence would never be
    // found by contains(), so convert first)
    if (AUDIO_VIDEO_IMAGE_MIMETYPE_SET.contains(candidate.toString())) {
        return false;
    }
    // decimal number
    if (TextUtils.matches("\\d+(?:\\.\\d+)*", candidate)) {
        return false;
    }
    // likely css class, e.g. "div.menu", "a.help", etc
    Matcher m = TextUtils.getMatcher("([^./]+)\\.([^./]+)", candidate);
    try {
        // Locale.ROOT keeps the lower-casing independent of the default
        // locale (e.g. Turkish dotless i)
        if (m.matches()
                && HTML_TAGS.contains(m.group(1).toLowerCase(java.util.Locale.ROOT))) {
            return false;
        }
    } finally {
        TextUtils.recycleMatcher(m);
    }
    return true;
}
//
// legacy likely-URI test from ExtractorJS
//
// determines whether a string is likely URI
// (no whitespace or '<' '>', has an internal dot or some slash,
// begins and ends with either '/' or a word-char)
// Note: the opening alternative also permits up to two dots before a
// leading '/', i.e. "./" and "../" prefixes.
protected static final String STRING_URI_DETECTOR =
"(?:\\w|[\\.]{0,2}/)[\\S&&[^<>]]*(?:\\.|/)[\\S&&[^<>]]*(?:\\w|/)";
// blacklist of strings that STRING_URI_DETECTOR picks up as URIs,
// which are known to be problematic, and NOT to be
// added to outLinks
// (compared for exact content equality in isLikelyUriJavascriptContextLegacy)
protected final static String[] STRING_URI_DETECTOR_EXCEPTIONS = {
"text/javascript"
};
/**
 * Legacy likely-URI test taken from ExtractorJS: the candidate must match
 * {@link #STRING_URI_DETECTOR} and must not be one of the
 * {@link #STRING_URI_DETECTOR_EXCEPTIONS}.
 *
 * @param candidate the text to examine
 * @return true if the detector matches and the candidate is not a listed
 * exception
 */
public static boolean isLikelyUriJavascriptContextLegacy(CharSequence candidate) {
    boolean detectorMatches = TextUtils.matches(STRING_URI_DETECTOR, candidate);
    if (detectorMatches) {
        for (int i = 0; i < STRING_URI_DETECTOR_EXCEPTIONS.length; i++) {
            String knownException = STRING_URI_DETECTOR_EXCEPTIONS[i];
            if (knownException.contentEquals(candidate)) {
                // explicitly blacklisted string
                return false;
            }
        }
        // detector matched and nothing in the blacklist did
        return true;
    }
    return false;
}
//
// legacy likely-URI test from ExtractorHTML
//
// much like the javascript likely-URI extractor, but
// without requiring quotes -- this can indicate whether
// an HTML tag attribute that isn't definitionally a
// URI might be one anyway, as in form-tag VALUE attributes
//
// Shape of the regex: up to two leading dots, then a run free of dots,
// whitespace and quote characters, followed by one or more ".segment" parts.
protected static final String LIKELY_URI_PATH =
"(\\.{0,2}[^\\.\\n\\r\\s\"']*(\\.[^\\.\\n\\r\\s\"']+)+)";
/**
 * Legacy likely-URI test taken from ExtractorHTML: checks the candidate
 * against {@link #LIKELY_URI_PATH}.
 *
 * @param candidate the text to examine, e.g. an HTML attribute value
 * @return the result of matching {@code candidate} against the legacy pattern
 */
public static boolean isLikelyUriHtmlContextLegacy(CharSequence candidate) {
    final boolean looksLikePath = TextUtils.matches(LIKELY_URI_PATH, candidate);
    return looksLikePath;
}
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy