// org.archive.util.UriUtils Maven / Gradle / Ivy

/* UriUtils
 * 
 * $Id: MimetypeUtils.java 3119 2005-02-17 20:39:21Z stack-sf $
 * 
 * Created on April 15, 2010
 *
 * Copyright (C) 2010 Internet Archive.
 * 
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 * 
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 * 
 * Heritrix is distributed in the hope that it will be useful, 
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 * 
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
package org.archive.util;

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;

import org.apache.commons.codec.DecoderException;
import org.apache.commons.httpclient.URIException;
import org.archive.net.UURI;
import org.archive.url.LaxURLCodec;


/**
 * URI-related utilities. 
 * 
 * Primarily, a place to centralize and better document and test certain URI-related heuristics
 * that may be useful in many places. 
 * 
 * The choice of when to consider a string likely enough to be a URI that we try crawling it 
 * is, so far, based on rather arbitrary rules-of-thumb. We have not quantitatively tested 
 * how often the strings that pass these tests yield meaningful (not 404, non-soft-404, 
 * non-garbage) replies. We are willing to accept some level of mistaken requests, knowing
 * that their cost is usually negligible, if that allows us to discover meaningful content
 * that could not be discovered via other heuristics. 
 * 
 *  Our intuitive understanding so far is that: strings that appear to have ./.. relative-path
 *  prefixes, dot-extensions,  or path-slashes are good candidates for trying as URIs, even 
 *  though with some Javascript/HTML-VALUE-attributes, this yields a lot of false positives. 
 *  
 *  We want to get strings like....
 *  
 *    photo.jpg
 *    /photos
 *    /photos/
 *    ./photos
 *    ../../photos
 *    photos/index.html
 *  
 *  ...but we will thus also sometimes try strings that were other kinds of variables/
 *  parameters, like...
 *  
 *    rectangle.x
 *    11.2px
 *    text/xml
 *    width:6.33
 * 
 *  Until better rules, exception-blacklists or even site-sensitive dynamic adjustment of 
 *  heuristics (eg: this site, guesses yield 200s, keep guessing; this site, guesses are
 *  all 404s, stop guessing) are developed, crawl operators should monitor their crawls 
 *  (and contact email) for cases where speculative crawling is generating many errors, and
 *  use settings like ExtractorHTML's 'extract-javascript' and 'extract-value-attributes' or
 *  disable ExtractorJS entirely when they want to curtail those errors. 
 *  
 *  The 'legacy' tests are those used in H1 at least through 1.14.4. They have
 *  some known problems, but are not yet being dropped until more experience 
 *  with the 'new' isLikelyUri() test is collected (in H3). Enable the 'xest'
 *  methods of the UriUtilsTest class for details. 
 *  
 * @author gojomo
 */
public class UriUtils {
    /** Logger for this class, named after the fully-qualified class name. */
    private static final Logger LOGGER = Logger.getLogger(UriUtils.class.getName());

    /**
     * Tests whether the given text looks like a {@code data:} URI.
     *
     * The pattern used is case-insensitive and allows leading whitespace
     * before the {@code data:} scheme prefix.
     *
     * @param candidate text to test
     * @return true when the candidate matches the data-URI pattern
     */
    public static boolean isDataUri(CharSequence candidate) {
        final String dataUriPattern = "(?i)\\s*data:.*";
        final boolean looksLikeDataUri = TextUtils.matches(dataUriPattern, candidate);
        return looksLikeDataUri;
    }

    /**
     * Naive likely-URI pattern:
     * <ul>
     *   <li>no {@code '<'} or {@code '>'} anywhere</li>
     *   <li>at least one {@code '.'} or {@code '/'}</li>
     * </ul>
     * Used as the cheap first-pass filter by {@link #isPossibleUri(CharSequence)}
     * and {@link #isVeryLikelyUri(CharSequence)}.
     */
    protected static final String NAIVE_LIKELY_URI_PATTERN = "[^<>]*[\\./][^<>]*";
    
    /**
     * Cheapest URI plausibility test: checks the candidate against
     * {@link #NAIVE_LIKELY_URI_PATTERN} only.
     *
     * @param candidate text to test
     * @return true when the candidate matches the naive pattern
     */
    public static boolean isPossibleUri(CharSequence candidate) {
        final String naivePattern = NAIVE_LIKELY_URI_PATTERN;
        return TextUtils.matches(naivePattern, candidate);
    }
    
    /**
     * Tests whether the candidate passes the naive pattern test
     * ({@link #isPossibleUri(CharSequence)}) and is not flagged by
     * {@link #isLikelyFalsePositive(CharSequence)}.
     *
     * @param candidate text to test
     * @return true when the candidate is possibly a URI and not a likely
     *         false positive
     * @deprecated produces too many false positives,
     *             {@link #isVeryLikelyUri(CharSequence)} is preferred
     */
    @Deprecated
    public static boolean isLikelyUri(CharSequence candidate) {
        // Order matters: isLikelyFalsePositive is only consulted for
        // candidates that already contain at least a '.' or '/'.
        return isPossibleUri(candidate) && !isLikelyFalsePositive(candidate);
    }

    /**
     * Audio, video and image mimetype strings. A candidate that is exactly
     * equal to one of these is treated as a mimetype rather than a URI by
     * {@link #isLikelyFalsePositive(CharSequence)}.
     *
     * Comparison against these entries is case-sensitive, which is why some
     * mimetypes are listed in more than one capitalization (e.g.
     * {@code image/bmp} and {@code image/BMP}).
     */
    protected final static String[] AUDIO_VIDEO_IMAGE_MIMETYPES = new String[] {
            "audio/aiff",
            "audio/asf",
            "audio/basic",
            "audio/m4a",
            "audio/mid",
            "audio/midi",
            "audio/mp3",
            "audio/mp4",
            "audio/mp4a-latm",
            "audio/mpeg",
            "audio/mpeg3",
            "audio/mpegurl",
            "audio/mpg",
            "audio/ogg",
            "audio/playlist",
            "audio/unknown",
            "audio/vnd.qcelp",
            "audio/vnd.rn-realaudio",
            "audio/wav",
            "audio/x-aiff",
            "audio/x-m4a",
            "audio/x-midi",
            "audio/x-mp3",
            "audio/x-mpeg",
            "audio/x-mpeg3",
            "audio/x-mpegurl",
            "audio/x-ms-wax",
            "audio/x-ms-wma",
            "audio/x-ms-wmv",
            "audio/x-pn-realaudio",
            "audio/x-pn-realaudio-plugin",
            "audio/x-realaudio",
            "audio/x-scpls",
            "audio/x-wav",
            "image/bitmap",
            "image/bmp",
            "image/BMP",
            "image/cur",
            "image/fits",
            "image/gif",
            "image/GIF",
            "image/ico",
            "image/icon",
            "image/jp2",
            "image/jpeg",
            "image/JPEG",
            "image/jpeg-cmyk",
            "image/jpg",
            "image/JPG",
            "image/pdf",
            "image/pict",
            "image/pjpeg",
            "image/png",
            "image/PNG",
            "image/svg+xml",
            "image/tiff",
            "image/vnd.adobe.photoshop",
            "image/vnd.djvu",
            "image/vnd.dwg",
            "image/vnd.dxf",
            "image/vnd.microsoft.icon",
            "image/vnd.ms-modi",
            "image/vnd.ms-photo",
            "image/vnd.wap.wbmp",
            "image/x-bitmap",
            "image/x-bmp",
            "image/x-citrix-pjpeg",
            "image/x-dcraw",
            "image/x-djvu",
            "image/x.djvu",
            "image/x-emf",
            "image/x-eps",
            "image/x-guffaw",
            "image/x-ico",
            "image/xicon",
            "image/x-icon",
            "image/x-jg",
            "image/x-ms-bmp",
            "image/x-MS-bmp",
            "image/x-pcx",
            "image/x-photoshop",
            "image/x-pict",
            "image/x-png",
            "image/x-portable-anymap",
            "image/x-portable-bitmap",
            "image/x-portable-graymap",
            "image/x-portable-pixmap",
            "image/x-psd",
            "image/x-quicktime",
            "image/x-rgb",
            "image/x-windows-bmp",
            "image/x-wmf",
            "image/x-xbitmap",
            "image/x-xbm",
            "image/x-xfig",
            "image/x-xpixmap",
            "video/3gpp",
            "video/asx",
            "video/avi",
            "video/f4v",
            "video/flv",
            "video/m4v",
            "video/mp4",
            "video/MP4",
            "video/mp4v-es",
            "video/mpeg",
            "video/mpeg3",
            "video/mpeg4",
            "video/mpg4",
            "video/msvideo",
            "video/ogg",
            "video/quicktime",
            "video/swf",
            "video/unknown",
            "video/vnd.objectvideo",
            "video/webm",
            "video/wmv",
            "video/x-dv",
            "video/x-flv",
            "video/x-m4v",
            "video/x-mp4",
            "video/x-mpeg",
            "video/x-ms-asf",
            "video/x-ms-asx",
            "video/x-msvideo",
            "video/x-ms-wm",
            "video/x-ms-wma",
            "video/x-ms-wmv",
            "video/x-ms-wmx",
            "video/x-ms-wvx",
            "video/x-pn-realaudio",
            "video/x-pn-realvideo",
            "video/x-sgi-movie",
            "video/x-swf"
    };
    protected static final Set AUDIO_VIDEO_IMAGE_MIMETYPE_SET = new HashSet();
    static {
        AUDIO_VIDEO_IMAGE_MIMETYPE_SET.addAll(Arrays.asList(AUDIO_VIDEO_IMAGE_MIMETYPES));
    }

    /**
     * Applies a series of rejection heuristics to a candidate string that has
     * already passed a naive URI test. The candidate is considered a likely
     * false positive when it:
     * <ul>
     *   <li>is empty</li>
     *   <li>looks like a text/* or application/* mimetype</li>
     *   <li>equals one of {@link #AUDIO_VIDEO_IMAGE_MIMETYPES}</li>
     *   <li>looks like a (dotted) decimal number</li>
     *   <li>contains any of {@code $ ( ) ' " [ ] { } |}</li>
     *   <li>starts with one of {@code , ; + :} or ends with one of
     *       {@code . , ; + :}</li>
     *   <li>starts with '.' but not with './' or '../'</li>
     *   <li>contains '//' preceded by a character other than ':'</li>
     * </ul>
     * Each decision is logged at FINE level.
     *
     * @param candidate text to test; must not be null
     * @return true if the candidate is rejected by any heuristic, false if
     *         none of them fires
     */
    protected static boolean isLikelyFalsePositive(CharSequence candidate) {
        // An empty string cannot be a URI; rejecting it up front also keeps
        // the charAt(0) test further down from indexing past the end.
        if (candidate.length() == 0) {
            if (LOGGER.isLoggable(Level.FINE)) {
                LOGGER.fine("rejected: empty string");
            }
            return true;
        }

        if (TextUtils.matches("(?:text|application)/[^/]+", candidate)) {
            if (LOGGER.isLoggable(Level.FINE)) {
                LOGGER.fine("rejected: looks like an application or text mimetype: " + candidate);
            }
            return true;
        }

        // contentEquals (rather than a Set lookup) so that non-String
        // CharSequence candidates are compared by content
        for (String s: AUDIO_VIDEO_IMAGE_MIMETYPES) {
            if (s.contentEquals(candidate)) {
                if (LOGGER.isLoggable(Level.FINE)) {
                    LOGGER.fine("rejected: looks like an audio video or image mimetype: " + candidate);
                }
                return true;
            }
        }
        
        if (TextUtils.matches("\\d+(?:\\.\\d+)*", candidate)) {
            if (LOGGER.isLoggable(Level.FINE)) {
                LOGGER.fine("rejected: looks like a decimal number: " + candidate);
            }
            return true;
        }

        if (TextUtils.matches(".*[$()'\"\\[\\]{}|].*", candidate)) {
            if (LOGGER.isLoggable(Level.FINE)) {
                LOGGER.fine("rejected: contains unusual characters: " + candidate);
            }
            return true;
        }
        
        // starting or ending with + particularly common because of string concatenation in javascript
        if (TextUtils.matches("^[,;+:].*|.*[.,;+:]$", candidate)) {
            if (LOGGER.isLoggable(Level.FINE)) {
                LOGGER.fine("rejected: starts or ends with an unusual starting or ending character: " + candidate);
            }
            return true;
        }
        if (candidate.charAt(0) == '.' && !TextUtils.matches("^\\.{1,2}/.*", candidate)) {
            if (LOGGER.isLoggable(Level.FINE)) {
                LOGGER.fine("rejected: starts with '.' (but not './' or '../'): " + candidate);
            }
            return true;
        }
        
        if (TextUtils.matches("^.*[^:]//.*$", candidate)) {
            if (LOGGER.isLoggable(Level.FINE)) {
                LOGGER.fine("rejected: contains '//' (but not '://'): " + candidate);
            }
            return true;
        }
        
        // look for things that look like hostnames and not filenames?
        // look for too many dots but make sure we take into account that url may have hostname?

        if (LOGGER.isLoggable(Level.FINE)) {
            LOGGER.fine("accepted: does not look like a false positive: " + candidate);
        }

        return false;
    }
    
    /**
     * Perform additional fixup of likely-URI Strings. In order:
     * <ol>
     *   <li>HTML-escaped ampersands ({@code &amp;amp;}) are unescaped to
     *       {@code &amp;}</li>
     *   <li>if the string begins with a URI-encoded {@code http%3A} or
     *       {@code https%3A}, the whole string is URI-decoded; a decode
     *       failure is logged at INFO and the string is left as-is</li>
     *   <li>if the string opens with what looks like a dotted domain whose
     *       last label is a top-level domain, a scheme is prepended:
     *       the scheme of {@code base} when the string starts with the
     *       host of {@code base}, otherwise {@code http}</li>
     * </ol>
     *
     * @param candidate likely-URI string to fix up
     * @param base URI of the document the candidate was found in; may be
     *        null, in which case {@code http} is always used for
     *        scheme-less absolute URIs
     * @return String changed/decoded to increase likelihood it is a 
     * meaningful non-404 URI
     */
    public static String speculativeFixup(String candidate, UURI base) {
        String retVal = candidate;
        
        // unescape ampersands: the pattern must be the escaped entity,
        // replacing "&" with "&" would be a no-op
        retVal = TextUtils.replaceAll("&amp;", retVal, "&");
        
        // uri-decode if begins with encoded 'http(s)?%3A'
        if (TextUtils.matches("(?i)^https?%3A.*", retVal)) {
            try {
                retVal = LaxURLCodec.DEFAULT.decode(retVal);
            } catch (DecoderException e) {
                LOGGER.log(Level.INFO,"unable to decode",e);
            }
        }
        
        // TODO: more URI-decoding if there are %-encoded parts?
        
        // detect scheme-less intended-absolute-URI
        // intent: "opens with what looks like a dotted-domain, and 
        // last segment is a top-level-domain (eg "com", "org", etc)" 
        Matcher m = TextUtils.getMatcher("(?:[^./]+\\.)+([^./]+)(?:/.*)?", 
                retVal);
        try {
            if (m.matches() && ArchiveUtils.isTld(m.group(1))) {
                String schemePlus = "http://";
                // if on same host preserve scheme (eg https)
                // NOTE(review): this is a prefix test, so a candidate host
                // that merely begins with the base host also matches
                try {
                    String baseHost = (base == null) ? null : base.getHost();
                    if (baseHost != null && retVal.startsWith(baseHost)) {
                        schemePlus = base.getScheme() + "://";
                    }
                } catch (URIException e) {
                    // error retrieving source host - ignore it
                }
                retVal = schemePlus + retVal;
            }
        } finally {
            // hand the matcher back even if one of the checks above throws
            TextUtils.recycleMatcher(m);
        }
        
        return retVal; 
    }

    protected static final Set HTML_TAGS = new HashSet();
    static {
        HTML_TAGS.addAll(Arrays.asList("a", "abbr", "acronym", "address",
                        "applet", "area", "article", "aside", "audio", "b",
                        "base", "basefont", "bdi", "bdo", "big", "blockquote",
                        "body", "br", "button", "canvas", "caption", "center",
                        "cite", "code", "col", "colgroup", "command",
                        "datalist", "dd", "del", "details", "dfn", "dir",
                        "div", "dl", "dt", "em", "embed", "fieldset",
                        "figcaption", "figure", "font", "footer", "form",
                        "frame", "frameset", "head", "header", "hgroup", "h1",
                        "h2", "h3", "h4", "h5", "h6", "hr", "html", "i",
                        "iframe", "img", "input", "ins", "kbd", "keygen",
                        "label", "legend", "li", "link", "map", "mark", "menu",
                        "meta", "meter", "nav", "noframes", "noscript",
                        "object", "ol", "optgroup", "option", "output", "p",
                        "param", "pre", "progress", "q", "rp", "rt", "ruby",
                        "s", "samp", "script", "section", "select", "small",
                        "source", "span", "strike", "strong", "style", "sub",
                        "summary", "sup", "table", "tbody", "td", "textarea",
                        "tfoot", "th", "thead", "time", "title", "tr", "track",
                        "tt", "u", "ul", "var", "video", "wbr"));
    }
    
    protected static final Set KNOWN_GOOD_FILE_EXTENSIONS = new HashSet();

    static {
        /*
         * Real known use cases for this are .min.js, .min.css, and we've seen
         * .jpg files with an extra dot in them. Other extensions are included
         * in the list somewhat arbitrarily.
         */
        KNOWN_GOOD_FILE_EXTENSIONS.addAll(Arrays.asList(".jpg", ".js", ".css",
                ".png", ".gif", ".swf", ".flv", ".mp4", ".mp3", ".jpeg",
                ".html", ".pdf"));
    }

    /**
     * Regex fragment for one query-string {@code name=value} pair: the name
     * is one or more ASCII letters or underscores; the value is zero or more
     * word characters, '-', '/', '.', or %XX hex escapes.
     */
    protected static final String QNV = "[a-zA-Z_]+=(?:[\\w-/.]|%[0-9a-fA-F]{2})*"; // name=value for query strings
    /**
     * Pattern for a relative or server-relative URI, assembled from the
     * commented pieces below. Capturing groups:
     * <ul>
     *   <li>group(1) filename</li>
     *   <li>group(2) filename extension with leading '.'</li>
     * </ul>
     * Either group is null when the corresponding optional part is absent.
     */
    protected static final String LIKELY_RELATIVE_URI_PATTERN = 
            "(?:\\.?/)?"                                                    // may start with "/" or "./"
            + "(?:(?:[\\s\\w-]+|\\.\\.)(?:/))*"                             // may have path/segments/segment2
            + "([\\s\\w-]+(?:\\.[\\w-]+)??(\\.[a-zA-Z0-9]{2,5})?)?"         // may have a filename with or without an extension
            + "(?:\\?(?:"+ QNV + ")(?:&(?:" + QNV + "))*)?"                 // may have a ?query=string
            + "(?:#[\\w-]+)?";                                              // may have a #fragment
    
    
    /**
     * Stricter likely-URI test. A candidate is accepted when it passes the
     * naive pattern test and either
     * <ul>
     *   <li>looks like an absolute http(s) URI with a dotted host,</li>
     *   <li>looks like a protocol-relative ("//host.tld/...") URI, or</li>
     *   <li>matches {@link #LIKELY_RELATIVE_URI_PATTERN} and is not
     *       rejected by the follow-up checks: multi-dot filename or
     *       embedded whitespace without a known-good extension, text/
     *       application/audio/video/image mimetype, decimal number, or
     *       "htmltag.something" (likely CSS selector).</li>
     * </ul>
     *
     * @param candidate text to test; must not be null
     * @return true when the candidate is very likely a URI
     */
    public static boolean isVeryLikelyUri(CharSequence candidate) {
        // must have a . or /
        if (!TextUtils.matches(NAIVE_LIKELY_URI_PATTERN, candidate)) {
            return false;
        }
        
        // absolute uri
        if (TextUtils.matches("^(?i)https?://[^<>\\s/]+\\.[^<>\\s/]+(?:/[^<>\\s]*)?", candidate)) {
            return true;
        }
        
        // "protocol-relative" uri
        if (TextUtils.matches("^//[^<>\\s/]+\\.[^<>\\s/]+(?:/[^<>\\s]*)?", candidate)) {
            return true;
        }
        
        // relative or server-relative uri; copy the groups out so the
        // matcher can be recycled right away
        final String filename;
        final String extension;
        Matcher matcher = TextUtils.getMatcher(LIKELY_RELATIVE_URI_PATTERN, candidate);
        try {
            if (!matcher.matches()) {
                return false;
            }
            filename = matcher.group(1);
            extension = matcher.group(2);
        } finally {
            TextUtils.recycleMatcher(matcher);
        }

        /*
         * Remaining tests discard stuff that the
         * LIKELY_RELATIVE_URI_PATTERN can't catch
         */

        // if filename contains two dots, it must end with a known good extension
        if (filename != null && extension != null
                && filename.indexOf('.') != filename.lastIndexOf('.')
                && !KNOWN_GOOD_FILE_EXTENSIONS.contains(extension)) {
            return false;
        }

        // embedded whitespace is only tolerated with a known good extension
        if (TextUtils.matches(".*\\s+.*", candidate)
                && (extension == null
                    || !KNOWN_GOOD_FILE_EXTENSIONS.contains(extension))) {
            return false;
        }

        // text or application mimetype
        if (TextUtils.matches("(?:text|application)/[^/]+", candidate)) {
            return false;
        }

        // audio, video or image mimetype; the set holds Strings, so the
        // lookup key must be a String -- a non-String CharSequence would
        // never be equal to any element
        if (AUDIO_VIDEO_IMAGE_MIMETYPE_SET.contains(candidate.toString())) {
            return false;
        }
        
        // decimal number
        if (TextUtils.matches("\\d+(?:\\.\\d+)*", candidate)) {
            return false;
        }
        
        // likely css class, e.g. "div.menu", "a.help", etc
        Matcher m = TextUtils.getMatcher("([^./]+)\\.([^./]+)", candidate);
        try {
            if (m.matches() && HTML_TAGS.contains(m.group(1).toLowerCase())) {
                return false;
            }
        } finally {
            TextUtils.recycleMatcher(m);
        }
        
        return true;
    }


    
//
// legacy likely-URI test from ExtractorJS
//
    /**
     * Legacy (ExtractorJS) pattern that determines whether a string is a
     * likely URI: no whitespace or '&lt;' '&gt;', has an internal dot or
     * some slash, begins with a word-char or an optional-dots-then-slash
     * prefix, and ends with either '/' or a word-char.
     */
    protected static final String STRING_URI_DETECTOR =
        "(?:\\w|[\\.]{0,2}/)[\\S&&[^<>]]*(?:\\.|/)[\\S&&[^<>]]*(?:\\w|/)";

 
    /**
     * Blacklist of strings that STRING_URI_DETECTOR picks up as URIs,
     * which are known to be problematic, and NOT to be
     * added to outLinks. Entries are compared to the whole candidate,
     * case-sensitively.
     */
    protected final static String[] STRING_URI_DETECTOR_EXCEPTIONS = {
        "text/javascript"
    };
    
    /**
     * Legacy likely-URI test taken from ExtractorJS: the candidate must
     * match {@link #STRING_URI_DETECTOR} and must not be one of the
     * {@link #STRING_URI_DETECTOR_EXCEPTIONS}.
     *
     * @param candidate text to test
     * @return true when the candidate matches the detector and is not a
     *         blacklisted exception
     */
    public static boolean isLikelyUriJavascriptContextLegacy(CharSequence candidate) {
        if (TextUtils.matches(STRING_URI_DETECTOR, candidate)) {
            // matched the detector: likely a URI unless explicitly blacklisted
            for (int i = 0; i < STRING_URI_DETECTOR_EXCEPTIONS.length; i++) {
                final String blacklisted = STRING_URI_DETECTOR_EXCEPTIONS[i];
                if (blacklisted.contentEquals(candidate)) {
                    return false;
                }
            }
            return true;
        }
        return false;
    }
    
    
//
// legacy likely-URI test from ExtractorHTML
// 
	
    /**
     * Legacy (ExtractorHTML) likely-URI pattern.
     *
     * Much like the javascript likely-URI extractor, but
     * without requiring quotes -- this can indicate whether
     * an HTML tag attribute that isn't definitionally a
     * URI might be one anyway, as in form-tag VALUE attributes.
     *
     * Shape: up to two leading dots, then a run of characters that are
     * not dots, whitespace or quotes, followed by one or more
     * dot-prefixed segments of such characters.
     */
    protected static final String LIKELY_URI_PATH =
     "(\\.{0,2}[^\\.\\n\\r\\s\"']*(\\.[^\\.\\n\\r\\s\"']+)+)";
	
	/**
	 * Legacy likely-URI test taken from ExtractorHTML: checks the candidate
	 * against {@link #LIKELY_URI_PATH} only.
	 *
	 * @param candidate text to test
	 * @return true when the candidate matches the legacy HTML-context pattern
	 */
	public static boolean isLikelyUriHtmlContextLegacy(CharSequence candidate) {
		final boolean matchesLegacyPath = TextUtils.matches(LIKELY_URI_PATH, candidate);
		return matchesLegacyPath;
	}
}