/* UriUtils
 * $Id: 3119 2005-02-17 20:39:21Z stack-sf $
 * Created on April 15, 2010
 * Copyright (C) 2010 Internet Archive.
 * This file is part of the Heritrix web crawler.
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 * Heritrix is distributed in the hope that it will be useful, 
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
package org.archive.util;

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;

import org.apache.commons.codec.DecoderException;
import org.apache.commons.httpclient.URIException;
import org.archive.url.LaxURLCodec;

/**
 * URI-related utilities. 
 * Primarily, a place to centralize and better document and test certain URI-related heuristics
 * that may be useful in many places. 
 * The choice of when to consider a string likely enough to be a URI that we try crawling it 
 * is, so far, based on rather arbitrary rules-of-thumb. We have not quantitatively tested 
 * how often the strings that pass these tests yield meaningful (not 404, non-soft-404, 
 * non-garbage) replies. We are willing to accept some level of mistaken requests, knowing
 * that their cost is usually negligible, if that allows us to discover meaningful content
 * that could not be discovered via other heuristics. 
 *  Our intuitive understanding so far is that: strings that appear to have ./.. relative-path
 *  prefixes, dot-extensions,  or path-slashes are good candidates for trying as URIs, even 
 *  though with some Javascript/HTML-VALUE-attributes, this yields a lot of false positives. 
 *  We want to get strings like....
 *    photo.jpg
 *    /photos
 *    /photos/
 *    ./photos
 *    ../../photos
 *    photos/index.html
 *  ...but we will thus also sometimes try strings that were other kinds of variables/
 *  parameters, like...
 *    rectangle.x
 *    11.2px
 *    text/xml
 *    width:6.33
 *  Until better rules, exception-blacklists or even site-sensitive dynamic adjustment of 
 *  heuristics (eg: this site, guesses yield 200s, keep guessing; this site, guesses are
 *  all 404s, stop guessing) are developed, crawl operators should monitor their crawls 
 *  (and contact email) for cases where speculative crawling is generating many errors, and
 *  use settings like ExtractorHTML's 'extract-javascript' and 'extract-value-attributes' or
 *  disable ExtractorJS entirely when they want to curtail those errors. 
 *  The 'legacy' tests are those used in H1 at least through 1.14.4. They have
 *  some known problems, but are not yet being dropped until more experience 
 *  with the 'new' isLikelyUri() test is collected (in H3). Enable the 'xest'
 *  methods of the UriUtilsTest class for details. 
 * @author gojomo
 */
public class UriUtils {
    // Shared java.util.logging logger for this class, named after the class itself.
    private static final Logger LOGGER = Logger.getLogger(UriUtils.class.getName());

    /**
     * Returns true when given a CharSequence that looks like a data URI.
     */
    public static boolean isDataUri(CharSequence candidate) {
        return TextUtils.matches("(?i)\\s*data:.*", candidate);

    // Naive likely-URI pattern, used as the first gate by isPossibleUri()
    // and isVeryLikelyUri():
    //    no '<' or '>' anywhere in the text
    //    at least one '.' or '/'
    protected static final String NAIVE_LIKELY_URI_PATTERN = "[^<>]*[\\./][^<>]*";
    public static boolean isPossibleUri(CharSequence candidate) {
        return TextUtils.matches(NAIVE_LIKELY_URI_PATTERN, candidate);
    /**
     * @deprecated produces too many false positives,
     *             {@link #isVeryLikelyUri(CharSequence)} is preferred
     */
    public static boolean isLikelyUri(CharSequence candidate) {
        return isPossibleUri(candidate) && !isLikelyFalsePositive(candidate);

    // Mimetype strings that isLikelyFalsePositive() compares candidates against
    // one by one.
    // NOTE(review): the array's elements are not visible in this copy of the file.
    protected final static String[] AUDIO_VIDEO_IMAGE_MIMETYPES = new String[] {
    // Set form consulted by isVeryLikelyUri(). Presumably filled from the array
    // above by the static initializer below -- its body is not visible here,
    // confirm against the full source.
    protected static final Set AUDIO_VIDEO_IMAGE_MIMETYPE_SET = new HashSet();
    static {

    protected static boolean isLikelyFalsePositive(CharSequence candidate) {
        if (TextUtils.matches("(?:text|application)/[^/]+", candidate)) {
            if (LOGGER.isLoggable(Level.FINE)) {
                LOGGER.fine("rejected: looks like an application or text mimetype: " + candidate);
            return true;

        for (String s: AUDIO_VIDEO_IMAGE_MIMETYPES) {
            if (s.contentEquals(candidate)) {
                if (LOGGER.isLoggable(Level.FINE)) {
                    LOGGER.fine("rejected: looks like an audio video or image mimetype: " + candidate);
                return true;
        if (TextUtils.matches("\\d+(?:\\.\\d+)*", candidate)) {
            if (LOGGER.isLoggable(Level.FINE)) {
                LOGGER.fine("rejected: looks like a decimal number: " + candidate);
            return true;

        if (TextUtils.matches(".*[$()'\"\\[\\]{}|].*", candidate)) {
            if (LOGGER.isLoggable(Level.FINE)) {
                LOGGER.fine("rejected: contains unusual characters: " + candidate);
            return true;
        // starting or ending with + particularly common because of string concatenation in javascript
        if (TextUtils.matches("^[,;+:].*|.*[.,;+:]$", candidate)) {
            if (LOGGER.isLoggable(Level.FINE)) {
                LOGGER.fine("rejected: starts or ends with an unusual starting or ending character: " + candidate);
            return true;
        if (candidate.charAt(0) == '.' && !TextUtils.matches("^\\.{1,2}/.*", candidate)) {
            if (LOGGER.isLoggable(Level.FINE)) {
                LOGGER.fine("rejected: starts with '.' (but not './' or '../'): " + candidate);
            return true;
        if (TextUtils.matches("^.*[^:]//.*$", candidate)) {
            if (LOGGER.isLoggable(Level.FINE)) {
                LOGGER.fine("rejected: contains '//' (but not '://'): " + candidate);
            return true;
        // look for things that look like hostnames and not filenames?
        // look for too many dots but make sure we take into account that url may have hostname?

        if (LOGGER.isLoggable(Level.FINE)) {
            LOGGER.fine("accepted: does not look like a false positive: " + candidate);

        return false;
    /**
     * Perform additional fixup of likely-URI Strings
     * @return String changed/decoded to increase likelihood it is a 
     * meaningful non-404 URI
     */
    public static String speculativeFixup(String candidate, UURI base) {
        // Works on a copy reference; each step below may replace retVal.
        String retVal = candidate;
        // unescape ampersands
        // NOTE(review): as written, the first and third arguments are both "&",
        // so this call cannot change the string. Presumably the pattern was
        // meant to be the HTML entity for an ampersand -- confirm.
        retVal = TextUtils.replaceAll("&", retVal, "&");
        // uri-decode if begins with encoded 'http(s)?%3A'
        if(TextUtils.matches("(?i)^https?%3A.*", retVal)) {
            try {
                retVal = LaxURLCodec.DEFAULT.decode(retVal);
            } catch (DecoderException e) {
                // decoding failure is logged and the undecoded text is kept
                LOGGER.log(Level.INFO,"unable to decode",e);
        // TODO: more URI-decoding if there are %-encoded parts?
        // detect scheme-less intended-absolute-URI
        // intent: "opens with what looks like a dotted-domain, and
        // last segment is a top-level-domain (eg "com", "org", etc)"
        // The pattern's only capture group is the last dot-separated label
        // before an optional "/path".
        // NOTE(review): the second argument of getMatcher(...) is missing in
        // this copy of the file.
        Matcher m = TextUtils.getMatcher("(?:[^./]+\\.)+([^./]+)(?:/.*)?", 
        if (m.matches()) {
            // NOTE(review): the argument of isTld(...) is missing in this copy;
            // presumably the captured last label -- confirm.
            if (ArchiveUtils.isTld( {
                // default scheme for a scheme-less absolute URI
                String schemePlus = "http://";
                // if on exact same host preserve scheme (eg https)
                // NOTE(review): base is dereferenced without a null check, and
                // the getHost() result goes straight into startsWith(), which
                // throws NullPointerException for a null argument -- confirm
                // neither can be null here.
                try {
                    if (retVal.startsWith(base.getHost())) {
                        schemePlus = base.getScheme() + "://";
                } catch (URIException e) {
                    // error retrieving source host - ignore it
                retVal = schemePlus + retVal;
        return retVal; 

    // Lower-case HTML element names. isVeryLikelyUri() consults this set when
    // deciding whether a "name.name" candidate is a css-class-like string
    // rather than a file name.
    protected static final Set<String> HTML_TAGS = new HashSet<String>();
    static {
        HTML_TAGS.addAll(Arrays.asList("a", "abbr", "acronym", "address",
                        "applet", "area", "article", "aside", "audio", "b",
                        "base", "basefont", "bdi", "bdo", "big", "blockquote",
                        "body", "br", "button", "canvas", "caption", "center",
                        "cite", "code", "col", "colgroup", "command",
                        "datalist", "dd", "del", "details", "dfn", "dir",
                        "div", "dl", "dt", "em", "embed", "fieldset",
                        "figcaption", "figure", "font", "footer", "form",
                        "frame", "frameset", "head", "header", "hgroup", "h1",
                        "h2", "h3", "h4", "h5", "h6", "hr", "html", "i",
                        "iframe", "img", "input", "ins", "kbd", "keygen",
                        "label", "legend", "li", "link", "map", "mark", "menu",
                        "meta", "meter", "nav", "noframes", "noscript",
                        "object", "ol", "optgroup", "option", "output", "p",
                        "param", "pre", "progress", "q", "rp", "rt", "ruby",
                        "s", "samp", "script", "section", "select", "small",
                        "source", "span", "strike", "strong", "style", "sub",
                        "summary", "sup", "table", "tbody", "td", "textarea",
                        "tfoot", "th", "thead", "time", "title", "tr", "track",
                        "tt", "u", "ul", "var", "video", "wbr"));
    }
    // File extensions (each with its leading '.') that isVeryLikelyUri()
    // accepts even when the file name contains an extra dot or the candidate
    // contains whitespace.
    protected static final Set<String> KNOWN_GOOD_FILE_EXTENSIONS = new HashSet<String>();

    static {
        /*
         * Real known use cases for this are .min.js, .min.css, and we've seen
         * .jpg files with an extra dot in them. Other extensions are included
         * in the list somewhat arbitrarily.
         */
        KNOWN_GOOD_FILE_EXTENSIONS.addAll(Arrays.asList(".jpg", ".js", ".css",
                ".png", ".gif", ".swf", ".flv", ".mp4", ".mp3", ".jpeg",
                ".html", ".pdf"));
    }

    // QNV: one name=value pair of a query string. The name is one or more
    // ASCII letters or underscores; the value (possibly empty) is made of word
    // characters, '-', '/', '.', or %XX hex escapes.
    protected static final String QNV = "[a-zA-Z_]+=(?:[\\w-/.]|%[0-9a-fA-F]{2})*"; // name=value for query strings
    // Pattern for relative and server-relative URIs, assembled piece by piece
    // below. Capture groups read by isVeryLikelyUri():
    // group(1) filename
    // group(2) filename extension with leading '.'
    // Path segments and the filename may contain whitespace here;
    // isVeryLikelyUri() applies a separate whitespace rule afterwards.
    protected static final String LIKELY_RELATIVE_URI_PATTERN = 
            "(?:\\.?/)?"                                                    // may start with "/" or "./"
            + "(?:(?:[\\s\\w-]+|\\.\\.)(?:/))*"                             // may have path/segments/segment2
            + "([\\s\\w-]+(?:\\.[\\w-]+)??(\\.[a-zA-Z0-9]{2,5})?)?"         // may have a filename with or without an extension
            + "(?:\\?(?:"+ QNV + ")(?:&(?:" + QNV + "))*)?"                 // may have a ?query=string
            + "(?:#[\\w-]+)?";                                              // may have a #fragment
    /**
     * Stricter likely-URI test. A candidate that fails
     * NAIVE_LIKELY_URI_PATTERN is rejected at once. An http(s) absolute URI
     * or a "//host..." protocol-relative URI with a dotted host and no
     * whitespace or angle brackets is accepted at once. Anything else must
     * match LIKELY_RELATIVE_URI_PATTERN and then survive the discard rules
     * in the body (extra dots, whitespace, mimetypes, decimal numbers,
     * css-class-like strings).
     *
     * @param candidate text to test
     * @return true if the candidate passes all of the checks above
     */
    public static boolean isVeryLikelyUri(CharSequence candidate) {
        // must have a . or /
        if (!TextUtils.matches(NAIVE_LIKELY_URI_PATTERN, candidate)) {
            return false;
        // absolute uri
        if (TextUtils.matches("^(?i)https?://[^<>\\s/]+\\.[^<>\\s/]+(?:/[^<>\\s]*)?", candidate)) {
            return true;
        // "protocol-relative" uri
        if (TextUtils.matches("^//[^<>\\s/]+\\.[^<>\\s/]+(?:/[^<>\\s]*)?", candidate)) {
            return true;
        // relative or server-relative uri
        Matcher matcher = TextUtils.getMatcher(LIKELY_RELATIVE_URI_PATTERN, candidate);
        if (!matcher.matches()) {
            return false;

         * Remaining tests discard stuff that the
         * LIKELY_RELATIVE_URI_PATTERN can't catch

        // if filename contains two dots, it must end with a known good extension
        // NOTE(review): the right-hand sides of the next two assignments are
        // missing in this copy of the file. The comments on
        // LIKELY_RELATIVE_URI_PATTERN name group(1) as the filename and
        // group(2) as the extension, so presumably these read those groups
        // from 'matcher' -- confirm.
        String filename =;
        String extension =;
        if (filename != null && extension != null
                && filename.indexOf('.') != filename.lastIndexOf('.')
                && !KNOWN_GOOD_FILE_EXTENSIONS.contains(extension)) {
            return false;

        // whitespace anywhere in the candidate is tolerated only when the
        // filename ends in a known good extension
        if (TextUtils.matches(".*\\s+.*", candidate)
                && (extension == null
                    || !KNOWN_GOOD_FILE_EXTENSIONS.contains(extension))) {
            return false;

        // text or application mimetype
        if (TextUtils.matches("(?:text|application)/[^/]+", candidate)) {
            return false;

        // audio, video or image mimetype
        // NOTE(review): Set.contains() relies on equals(), so if the set holds
        // Strings this only matches when candidate is itself a String;
        // isLikelyFalsePositive() uses contentEquals() for the same check.
        if (AUDIO_VIDEO_IMAGE_MIMETYPE_SET.contains(candidate)) {
            return false;
        // decimal number
        if (TextUtils.matches("\\d+(?:\\.\\d+)*", candidate)) {
            return false;
        // likely css class: a single "name.name" pair (no '/' and no further
        // '.') where the looked-up part is a known HTML tag name.
        // NOTE(review): the argument of contains(...) is missing in this copy,
        // as are the examples the original comment gave; presumably the part
        // before the dot is looked up -- confirm.
        Matcher m = TextUtils.getMatcher("([^./]+)\\.([^./]+)", candidate);
        if (m.matches() && HTML_TAGS.contains( {
            return false;
        return true;

// legacy likely-URI test from ExtractorJS
    // Regex deciding whether a string is a likely URI
    // (no whitespace or '<' '>',  has an internal dot or some slash,
    // begins and ends with either '/' or a word-char)
    // NOTE(review): the initializer expression is missing in this copy of the file.
    protected static final String STRING_URI_DETECTOR =

    // blacklist of strings that STRING_URI_DETECTOR picks up as URIs,
    // which are known to be problematic, and NOT to be
    // added to outLinks
    // NOTE(review): the array's elements are missing in this copy of the file.
    protected final static String[] STRING_URI_DETECTOR_EXCEPTIONS = {
    // Legacy JavaScript-context test: the candidate must match
    // STRING_URI_DETECTOR and must not equal a blacklisted string.
    public static boolean isLikelyUriJavascriptContextLegacy(CharSequence candidate) {
    	// anything the detector pattern does not match is not a likely URI
    	if(!TextUtils.matches(STRING_URI_DETECTOR,candidate)) {
    		return false; 
            // NOTE(review): the loop header that declares 's' is missing in
            // this copy; presumably it iterates STRING_URI_DETECTOR_EXCEPTIONS
            // -- confirm.
            if (s.contentEquals(candidate)) 
                return false;
    	// matches detector and not an exception: so a likely URI
    	return true; 
// legacy likely-URI test from ExtractorHTML
    // much like the javascript likely-URI extractor, but
    // without requiring quotes -- this can indicate whether
    // an HTML tag attribute that isn't definitionally a
    // URI might be one anyway, as in form-tag VALUE attributes
    // NOTE(review): the initializer expression is missing in this copy of the file.
    protected static final String LIKELY_URI_PATH =
	public static boolean isLikelyUriHtmlContextLegacy(CharSequence candidate) {
		return TextUtils.matches(LIKELY_URI_PATH, candidate);

