org.archive.modules.extractor.ExtractorUniversal Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of heritrix-modules Show documentation
This project contains some of the configurable modules used within the Heritrix application to crawl the web. The modules in this project can be used in applications other than Heritrix, however.
There is a newer version: 3.5.0
Show newest version
/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.modules.extractor;

import java.io.IOException;
import java.io.InputStream;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.io.IOUtils;
import org.archive.modules.CrawlURI;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;


/**
 * A last-ditch extractor that scans the raw bytes of the content and tries to
 * extract anything that looks like a link.
 *
 * If used, it should always be specified as the last link extractor in the
 * order file.
 * 
 * To accomplish this it will scan through the bytes and try to build up
 * strings of consecutive bytes that all represent characters that are valid
 * in a URL (see #isURLableChar(int) for details).
 * Once it hits the end of such a string (i.e. finds a character that
 * should not be in a URL) it will try to determine if it has found a URL.
 * This is done by seeing if the string is an IP address prefixed with
 * http(s):// or contains a dot followed by a Top Level Domain and end of
 * string or a slash.
 *
 * @author Kristinn Sigurdsson
 */
public class ExtractorUniversal extends ContentExtractor {

    // Serialization version marker. Nothing in this class reads it, hence the
    // "unused" suppression.
    @SuppressWarnings("unused")
    private static final long serialVersionUID = 3L;

    /**
     * How deep to look into files for URI strings, in bytes. The instance
     * initializer below installs the default of 1 MiB; a value of zero or
     * less is treated by {@code innerExtract} as "no limit".
     */
    {
        // 1024 * 1024 bytes = 1 MiB
        setMaxSizeToParse(1024L * 1024L);
    }
    /**
     * Returns the configured scan depth in bytes, as stored under the
     * {@code "maxSizeToParse"} key of {@code kp}.
     *
     * @return the maximum number of bytes to scan for URI strings
     */
    public long getMaxSizeToParse() {
        final Object stored = kp.get("maxSizeToParse");
        return ((Long) stored).longValue();
    }
    /**
     * Stores the scan depth under the {@code "maxSizeToParse"} key of
     * {@code kp}.
     *
     * @param threshold maximum number of bytes to scan for URI strings
     */
    public void setMaxSizeToParse(long threshold) {
        final Long boxed = Long.valueOf(threshold);
        kp.put("maxSizeToParse", boxed);
    }

    /**
     * Matches a string that begins with http:// or https:// followed by
     * something that looks like an IPv4 address (four groups of one to three
     * digits, separated by three dots). Does not ensure that the numbers are
     * each in the range 0-255.
     *
     * Within this class the pattern is applied with {@code Matcher.matches()},
     * so the whole candidate has to be scheme plus address; a trailing port,
     * path or query makes the match fail.
     */
    protected static final Pattern IP_ADDRESS = Pattern.compile(
        "((http://)|(https://))(\\d(\\d)?(\\d)?\\.\\d(\\d)?(\\d)?\\.\\d(\\d)?(\\d)?\\.\\d(\\d)?(\\d)?)");

    /**
     * Matches a string that consists of a TLD (no leading '.') optionally
     * followed by a '/' slash and anything else. If followed by a slash then
     * nothing after the slash is of consequence.
     *
     * The alternatives are all lower case, and {@code isTLD} applies the
     * pattern with {@code Matcher.matches()}, so the whole input has to be
     * "tld" or "tld/...".
     *
     * NOTE(review): this is a hard-coded snapshot of the TLD registry. TLDs
     * that are not listed (for example "eu") are not recognized, and the list
     * still carries historical entries such as cs, yu and tp.
     */
    public static final Pattern TLDs = Pattern.compile(
          "(ac(/.*)?)"  // ac  Ascension Island
        + "|(ad(/.*)?)" // ad  Andorra
        + "|(ae(/.*)?)" // ae  United Arab Emirates
        + "|(af(/.*)?)" // af  Afghanistan
        + "|(ag(/.*)?)" // ag  Antigua and Barbuda
        + "|(ai(/.*)?)" // ai  Anguilla
        + "|(al(/.*)?)" // al  Albania
        + "|(am(/.*)?)" // am  Armenia
        + "|(an(/.*)?)" // an  Netherlands Antilles
        + "|(ao(/.*)?)" // ao  Angola
        + "|(aero(/.*)?)" // aero Air-transport industry
        + "|(aq(/.*)?)" // aq  Antarctica
        + "|(ar(/.*)?)" // ar  Argentina
        + "|(as(/.*)?)" // as  American Samoa
        + "|(at(/.*)?)" // at  Austria
        + "|(au(/.*)?)" // au  Australia
        + "|(aw(/.*)?)" // aw  Aruba
        + "|(az(/.*)?)" // az  Azerbaijan
        + "|(ba(/.*)?)" // ba  Bosnia and Herzegovina
        + "|(bb(/.*)?)" // bb  Barbados
        + "|(bd(/.*)?)" // bd  Bangladesh
        + "|(be(/.*)?)" // be  Belgium
        + "|(bf(/.*)?)" // bf  Burkina Faso
        + "|(bg(/.*)?)" // bg  Bulgaria
        + "|(bh(/.*)?)" // bh  Bahrain
        + "|(bi(/.*)?)" // bi  Burundi
        + "|(biz(/.*)?)" // biz Businesses
        + "|(bj(/.*)?)" // bj  Benin
        + "|(bm(/.*)?)" // bm  Bermuda
        + "|(bn(/.*)?)" // bn  Brunei Darussalam
        + "|(bo(/.*)?)" // bo  Bolivia
        + "|(br(/.*)?)" // br  Brazil
        + "|(bs(/.*)?)" // bs  Bahamas
        + "|(bt(/.*)?)" // bt  Bhutan
        + "|(bv(/.*)?)" // bv  Bouvet Island
        + "|(bw(/.*)?)" // bw  Botswana
        + "|(by(/.*)?)" // by  Belarus (Byelorussia)
        + "|(bz(/.*)?)" // bz  Belize
        + "|(ca(/.*)?)" // ca  Canada
        + "|(cc(/.*)?)" // cc  Cocos Islands (Keeling)
        + "|(cd(/.*)?)" // cd  Congo, Democratic Republic of the
        + "|(cf(/.*)?)" // cf  Central African Republic
        + "|(cg(/.*)?)" // cg  Congo, Republic of
        + "|(ch(/.*)?)" // ch  Switzerland
        + "|(ci(/.*)?)" // ci  Cote d'Ivoire (Ivory Coast)
        + "|(ck(/.*)?)" // ck  Cook Islands
        + "|(cl(/.*)?)" // cl  Chile
        + "|(cm(/.*)?)" // cm  Cameroon
        + "|(cn(/.*)?)" // cn  China
        + "|(co(/.*)?)" // co  Colombia
        + "|(com(/.*)?)" // com Commercial
        + "|(coop(/.*)?)" // coop Cooperatives
        + "|(cr(/.*)?)" // cr  Costa Rica
        + "|(cs(/.*)?)" // cs  Czechoslovakia
        + "|(cu(/.*)?)" // cu  Cuba
        + "|(cv(/.*)?)" // cv  Cape Verde
        + "|(cx(/.*)?)" // cx  Christmas Island
        + "|(cy(/.*)?)" // cy  Cyprus
        + "|(cz(/.*)?)" // cz  Czech Republic
        + "|(de(/.*)?)" // de  Germany
        + "|(dj(/.*)?)" // dj  Djibouti
        + "|(dk(/.*)?)" // dk  Denmark
        + "|(dm(/.*)?)" // dm  Dominica
        + "|(do(/.*)?)" // do  Dominican Republic
        + "|(dz(/.*)?)" // dz  Algeria
        + "|(ec(/.*)?)" // ec  Ecuador
        + "|(edu(/.*)?)" // edu Educational Institution
        + "|(ee(/.*)?)" // ee  Estonia
        + "|(eg(/.*)?)" // eg  Egypt
        + "|(eh(/.*)?)" // eh  Western Sahara
        + "|(er(/.*)?)" // er  Eritrea
        + "|(es(/.*)?)" // es  Spain
        + "|(et(/.*)?)" // et  Ethiopia
        + "|(fi(/.*)?)" // fi  Finland
        + "|(fj(/.*)?)" // fj  Fiji
        + "|(fk(/.*)?)" // fk  Falkland Islands
        + "|(fm(/.*)?)" // fm  Micronesia, Federal State of
        + "|(fo(/.*)?)" // fo  Faroe Islands
        + "|(fr(/.*)?)" // fr  France
        + "|(ga(/.*)?)" // ga  Gabon
        + "|(gd(/.*)?)" // gd  Grenada
        + "|(ge(/.*)?)" // ge  Georgia
        + "|(gf(/.*)?)" // gf  French Guiana
        + "|(gg(/.*)?)" // gg  Guernsey
        + "|(gh(/.*)?)" // gh  Ghana
        + "|(gi(/.*)?)" // gi  Gibraltar
        + "|(gl(/.*)?)" // gl  Greenland
        + "|(gm(/.*)?)" // gm  Gambia
        + "|(gn(/.*)?)" // gn  Guinea
        + "|(gov(/.*)?)" // gov Government (US)
        + "|(gp(/.*)?)" // gp  Guadeloupe
        + "|(gq(/.*)?)" // gq  Equatorial Guinea
        + "|(gr(/.*)?)" // gr  Greece
        + "|(gs(/.*)?)" // gs  South Georgia and the South Sandwich Islands
        + "|(gt(/.*)?)" // gt  Guatemala
        + "|(gu(/.*)?)" // gu  Guam
        + "|(gw(/.*)?)" // gw  Guinea-Bissau
        + "|(gy(/.*)?)" // gy  Guyana
        + "|(hk(/.*)?)" // hk  Hong Kong
        + "|(hm(/.*)?)" // hm  Heard and McDonald Islands
        + "|(hn(/.*)?)" // hn  Honduras
        + "|(hr(/.*)?)" // hr  Croatia/Hrvatska
        + "|(ht(/.*)?)" // ht  Haiti
        + "|(hu(/.*)?)" // hu  Hungary
        + "|(id(/.*)?)" // id  Indonesia
        + "|(ie(/.*)?)" // ie  Ireland
        + "|(il(/.*)?)" // il  Israel
        + "|(im(/.*)?)" // im  Isle of Man
        + "|(in(/.*)?)" // in  India
        + "|(info(/.*)?)" // info
        + "|(int(/.*)?)" // int Int. Organizations
        + "|(io(/.*)?)" // io  British Indian Ocean Territory
        + "|(iq(/.*)?)" // iq  Iraq
        + "|(ir(/.*)?)" // ir  Iran, Islamic Republic of
        + "|(is(/.*)?)" // is  Iceland
        + "|(it(/.*)?)" // it  Italy
        + "|(je(/.*)?)" // je  Jersey
        + "|(jm(/.*)?)" // jm  Jamaica
        + "|(jo(/.*)?)" // jo  Jordan
        + "|(jp(/.*)?)" // jp  Japan
        + "|(ke(/.*)?)" // ke  Kenya
        + "|(kg(/.*)?)" // kg  Kyrgyzstan
        + "|(kh(/.*)?)" // kh  Cambodia
        + "|(ki(/.*)?)" // ki  Kiribati
        + "|(km(/.*)?)" // km  Comoros
        + "|(kn(/.*)?)" // kn  Saint Kitts and Nevis
        + "|(kp(/.*)?)" // kp  Korea, Democratic People's Republic
        + "|(kr(/.*)?)" // kr  Korea, Republic of
        + "|(kw(/.*)?)" // kw  Kuwait
        + "|(ky(/.*)?)" // ky  Cayman Islands
        + "|(kz(/.*)?)" // kz  Kazakhstan
        + "|(la(/.*)?)" // la  Lao People's Democratic Republic
        + "|(lb(/.*)?)" // lb  Lebanon
        + "|(lc(/.*)?)" // lc  Saint Lucia
        + "|(li(/.*)?)" // li  Liechtenstein
        + "|(lk(/.*)?)" // lk  Sri Lanka
        + "|(lr(/.*)?)" // lr  Liberia
        + "|(ls(/.*)?)" // ls  Lesotho
        + "|(lt(/.*)?)" // lt  Lithuania
        + "|(lu(/.*)?)" // lu  Luxembourg
        + "|(lv(/.*)?)" // lv  Latvia
        + "|(ly(/.*)?)" // ly  Libyan Arab Jamahiriya
        + "|(ma(/.*)?)" // ma  Morocco
        + "|(mc(/.*)?)" // mc  Monaco
        + "|(md(/.*)?)" // md  Moldova, Republic of
        + "|(mg(/.*)?)" // mg  Madagascar
        + "|(mh(/.*)?)" // mh  Marshall Islands
        + "|(mil(/.*)?)" // mil Military (US Dept of Defense)
        + "|(mk(/.*)?)" // mk  Macedonia, Former Yugoslav Republic
        + "|(ml(/.*)?)" // ml  Mali
        + "|(mm(/.*)?)" // mm  Myanmar
        + "|(mn(/.*)?)" // mn  Mongolia
        + "|(mo(/.*)?)" // mo  Macau
        + "|(mp(/.*)?)" // mp  Northern Mariana Islands
        + "|(mq(/.*)?)" // mq  Martinique
        + "|(mr(/.*)?)" // mr  Mauritania
        + "|(ms(/.*)?)" // ms  Montserrat
        + "|(mt(/.*)?)" // mt  Malta
        + "|(mu(/.*)?)" // mu  Mauritius
        + "|(museum(/.*)?)" // museum Museums
        + "|(mv(/.*)?)" // mv  Maldives
        + "|(mw(/.*)?)" // mw  Malawi
        + "|(mx(/.*)?)" // mx  Mexico
        + "|(my(/.*)?)" // my  Malaysia
        + "|(mz(/.*)?)" // mz  Mozambique
        + "|(na(/.*)?)" // na  Namibia
        + "|(name(/.*)?)" // name Individuals
        + "|(nc(/.*)?)" // nc  New Caledonia
        + "|(ne(/.*)?)" // ne  Niger
        + "|(net(/.*)?)" // net networks
        + "|(nf(/.*)?)" // nf  Norfolk Island
        + "|(ng(/.*)?)" // ng  Nigeria
        + "|(ni(/.*)?)" // ni  Nicaragua
        + "|(nl(/.*)?)" // nl  Netherlands
        + "|(no(/.*)?)" // no  Norway
        + "|(np(/.*)?)" // np  Nepal
        + "|(nr(/.*)?)" // nr  Nauru
        + "|(nt(/.*)?)" // nt  Neutral Zone
        + "|(nu(/.*)?)" // nu  Niue
        + "|(nz(/.*)?)" // nz  New Zealand
        + "|(om(/.*)?)" // om  Oman
        + "|(org(/.*)?)" // org Organization (non-profit)
        + "|(pa(/.*)?)" // pa  Panama
        + "|(pe(/.*)?)" // pe  Peru
        + "|(pf(/.*)?)" // pf  French Polynesia
        + "|(pg(/.*)?)" // pg  Papua New Guinea
        + "|(ph(/.*)?)" // ph  Philippines
        + "|(pk(/.*)?)" // pk  Pakistan
        + "|(pl(/.*)?)" // pl  Poland
        + "|(pm(/.*)?)" // pm  St. Pierre and Miquelon
        + "|(pn(/.*)?)" // pn  Pitcairn Island
        + "|(pr(/.*)?)" // pr  Puerto Rico
        + "|(pro(/.*)?)" // pro Accountants, lawyers, and physicians
        + "|(ps(/.*)?)" // ps  Palestinian Territories
        + "|(pt(/.*)?)" // pt  Portugal
        + "|(pw(/.*)?)" // pw  Palau
        + "|(py(/.*)?)" // py  Paraguay
        + "|(qa(/.*)?)" // qa  Qatar
        + "|(re(/.*)?)" // re  Reunion Island
        + "|(ro(/.*)?)" // ro  Romania
        + "|(ru(/.*)?)" // ru  Russian Federation
        + "|(rw(/.*)?)" // rw  Rwanda
        + "|(sa(/.*)?)" // sa  Saudi Arabia
        + "|(sb(/.*)?)" // sb  Solomon Islands
        + "|(sc(/.*)?)" // sc  Seychelles
        + "|(sd(/.*)?)" // sd  Sudan
        + "|(se(/.*)?)" // se  Sweden
        + "|(sg(/.*)?)" // sg  Singapore
        + "|(sh(/.*)?)" // sh  St. Helena
        + "|(si(/.*)?)" // si  Slovenia
        + "|(sj(/.*)?)" // sj  Svalbard and Jan Mayen Islands
        + "|(sk(/.*)?)" // sk  Slovak Republic
        + "|(sl(/.*)?)" // sl  Sierra Leone
        + "|(sm(/.*)?)" // sm  San Marino
        + "|(sn(/.*)?)" // sn  Senegal
        + "|(so(/.*)?)" // so  Somalia
        + "|(sr(/.*)?)" // sr  Suriname
        + "|(sv(/.*)?)" // sv  El Salvador
        + "|(st(/.*)?)" // st  Sao Tome and Principe
        + "|(sy(/.*)?)" // sy  Syrian Arab Republic
        + "|(sz(/.*)?)" // sz  Swaziland
        + "|(tc(/.*)?)" // tc  Turks and Caicos Islands
        + "|(td(/.*)?)" // td  Chad
        + "|(tf(/.*)?)" // tf  French Southern Territories
        + "|(tg(/.*)?)" // tg  Togo
        + "|(th(/.*)?)" // th  Thailand
        + "|(tj(/.*)?)" // tj  Tajikistan
        + "|(tk(/.*)?)" // tk  Tokelau
        + "|(tm(/.*)?)" // tm  Turkmenistan
        + "|(tn(/.*)?)" // tn  Tunisia
        + "|(to(/.*)?)" // to  Tonga
        + "|(tp(/.*)?)" // tp  East Timor
        + "|(tr(/.*)?)" // tr  Turkey
        + "|(tt(/.*)?)" // tt  Trinidad and Tobago
        + "|(tv(/.*)?)" // tv  Tuvalu
        + "|(tw(/.*)?)" // tw  Taiwan
        + "|(tz(/.*)?)" // tz  Tanzania
        + "|(ua(/.*)?)" // ua  Ukraine
        + "|(ug(/.*)?)" // ug  Uganda
        + "|(uk(/.*)?)" // uk  United Kingdom
        + "|(um(/.*)?)" // um  US Minor Outlying Islands
        + "|(us(/.*)?)" // us  United States
        + "|(uy(/.*)?)" // uy  Uruguay
        + "|(uz(/.*)?)" // uz  Uzbekistan
        + "|(va(/.*)?)" // va  Holy See (City Vatican State)
        + "|(vc(/.*)?)" // vc  Saint Vincent and the Grenadines
        + "|(ve(/.*)?)" // ve  Venezuela
        + "|(vg(/.*)?)" // vg  Virgin Islands (British)
        + "|(vi(/.*)?)" // vi  Virgin Islands (USA)
        + "|(vn(/.*)?)" // vn  Vietnam
        + "|(vu(/.*)?)" // vu  Vanuatu
        + "|(wf(/.*)?)" // wf  Wallis and Futuna Islands
        + "|(ws(/.*)?)" // ws  Western Samoa
        + "|(ye(/.*)?)" // ye  Yemen
        + "|(yt(/.*)?)" // yt  Mayotte
        + "|(yu(/.*)?)" // yu  Yugoslavia
        + "|(za(/.*)?)" // za  South Africa
        + "|(zm(/.*)?)" // zm  Zambia
        + "|(zw(/.*)?)" // zw  Zimbabwe
        );

    /**
     * Constructor. Performs no work of its own; the default scan depth is
     * installed by the instance initializer that calls
     * {@link #setMaxSizeToParse(long)}.
     */
    public ExtractorUniversal() {
    }

    
    /**
     * Always answers {@code true}: this extractor applies no filter of its
     * own to the URI it is handed.
     *
     * @param uri the URI under consideration (not inspected)
     * @return always {@code true}
     */
    @Override
    protected boolean shouldExtract(CrawlURI uri) {
        return true;
    }
    
    
    /**
     * Scans the recorded content byte by byte, collecting runs of characters
     * accepted by {@link #isURLableChar(int)}. Whenever such a run ends and
     * it is longer than three characters and contains a dot, the run is
     * tested with {@link #looksLikeAnURL(String)} and, if it qualifies, added
     * as a speculative outlink.
     *
     * At most {@link #getMaxSizeToParse()} bytes are read; a value of zero or
     * less means "no limit". A run that is still pending when the stream
     * genuinely ends is evaluated too. A run cut short by the byte limit is
     * discarded, because it may be only the front part of a longer string.
     *
     * A stream-level {@link IOException} stops the scan and is recorded in
     * the URI's non-fatal failures. A candidate that cannot be turned into a
     * URI is recorded the same way but does not stop the scan.
     *
     * @param curi the URI whose recorded content is scanned
     * @return always {@code true}, to flag link extraction as completed
     */
    @Override
    protected boolean innerExtract(CrawlURI curi) {
        InputStream instream = null;
        try {
            instream = curi.getRecorder().getContentReplayInputStream();

            long maxdepth = getMaxSizeToParse();
            if (maxdepth <= 0) {
                maxdepth = Long.MAX_VALUE;
            }
            final long maxURLLength = UURI.MAX_URL_LENGTH;

            StringBuilder lookat = new StringBuilder();
            boolean foundDot = false;
            long counter = 0;

            int ch = instream.read();
            while (ch != -1 && ++counter <= maxdepth) {
                if (lookat.length() > maxURLLength) {
                    // Exceeded maximum length of a URL. Start fresh.
                    lookat.setLength(0);
                    foundDot = false;
                } else if (isURLableChar(ch)) {
                    if (ch == '.') {
                        foundDot = true;
                    }
                    lookat.append((char) ch);
                } else {
                    // A non-URL character terminates the current run. It
                    // takes a bare minimum of 4 characters, one of them a
                    // dot, to be worth testing.
                    if (lookat.length() > 3 && foundDot) {
                        emitUniversalCandidate(curi, lookat.toString());
                    }
                    lookat.setLength(0);
                    foundDot = false;
                }
                ch = instream.read();
            }

            // The stream ended in the middle of a run: there is no
            // terminating character to trigger the test above, so do it
            // here. Skipped when the byte limit ended the loop instead.
            if (ch == -1 && foundDot && lookat.length() > 3
                    && lookat.length() <= maxURLLength) {
                emitUniversalCandidate(curi, lookat.toString());
            }
        } catch (IOException e) {
            curi.getNonFatalFailures().add(e);
        } finally {
            IOUtils.closeQuietly(instream);
        }
        // Set flag to indicate that link extraction is completed.
        return true;
    }

    /**
     * Tests one collected run and, if it looks like a URL, trims it and adds
     * it to the URI as a speculative outlink.
     *
     * @param curi the URI the outlink is added to
     * @param candidate the run of URL-able characters to test
     */
    private void emitUniversalCandidate(CrawlURI curi, String candidate) {
        if (!looksLikeAnURL(candidate)) {
            return;
        }

        // Cleanup: there may be junk in front of the protocol or dots at
        // the end.
        String newURL = candidate;
        int protocolAt = newURL.toLowerCase(Locale.ROOT).indexOf("http");
        if (protocolAt > 0) {
            newURL = newURL.substring(protocolAt);
        }
        while (newURL.endsWith(".")) {
            // URLs can't end with a dot. Strip it off.
            newURL = newURL.substring(0, newURL.length() - 1);
        }
        if (newURL.length() == 0) {
            return;
        }

        try {
            UURI dest = UURIFactory.getInstance(newURL);
            addOutlink(curi, dest, LinkContext.SPECULATIVE_MISC,
                    Hop.SPECULATIVE);
            // Count only candidates that actually became an outlink.
            numberOfLinksExtracted.incrementAndGet();
        } catch (IOException e) {
            // A malformed candidate is expected when guessing at raw bytes;
            // note it and keep scanning the rest of the content.
            curi.getNonFatalFailures().add(e);
        }
    }

    /**
     * This method takes a look at a string and determines if it could be a URL.
     * To qualify the string must either be "http://" or "https://" followed
     * by something that looks like an IP address, or contain within the
     * string (possibly at the end but not at the beginning) a TLD
     * (Top Level Domain) preceded by a dot and followed by either the end of
     * the string or a slash.
     *
     * @param lookat The string to examine in an effort to determine if it
     * could be a URL
     * @return True if the string matches the above criteria for a URL.
     */
    private boolean looksLikeAnURL(String lookat) {
        if (lookat.startsWith("http://") || lookat.startsWith("https://")) {
            // Check if the whole string is scheme + IP address. If so we
            // are done, otherwise fall through to the TLD test.
            Matcher ip = IP_ADDRESS.matcher(lookat);
            if (ip.matches()) {
                return true;
            }
        }

        int dot = lookat.indexOf('.');
        if (dot == 0) {
            // An URL can't start with a .tld.
            return false;
        }

        // Only a short window after each dot is handed to isTLD. It has to
        // be one character longer than the longest TLD ("museum", 6 chars)
        // so that the character following the TLD is still visible;
        // otherwise "museumfoo" would be cut down to "museum" and accepted.
        final int window = 7;
        while (dot != -1) {
            int start = dot + 1;
            int end = Math.min(lookat.length(), start + window);
            if (isTLD(lookat.substring(start, end))) {
                return true;
            }
            dot = lookat.indexOf('.', start);
        }

        return false;
    }

    /**
     * Checks if a string is a known Top Level Domain, optionally followed by
     * a slash and further characters. Nothing may precede the TLD. The
     * comparison ignores case.
     *
     * @param potentialTLD The string (usually 2-6 chars) to check if it starts
     * with a TLD.
     * @return True if the given string is a known TLD or a known TLD followed
     * by a slash; false for anything shorter than two characters.
     *
     * @see #TLDs
     */
    private boolean isTLD(String potentialTLD) {
        if (potentialTLD.length() < 2) {
            return false;
        }

        // Locale.ROOT keeps the lower-casing independent of the JVM's
        // default locale; under a Turkish locale 'I' would otherwise become
        // a dotless i and "INFO" or "IT" would never match.
        Matcher uri = TLDs.matcher(potentialTLD.toLowerCase(Locale.ROOT));
        return uri.matches();
    }

    /**
     * Determines if a char (as represented by an int in the range of 0-255)
     * is a character that can be present in a URL. This method takes a
     * strict approach to what characters can be in a URL.
     *
     * <p>The following are considered to be 'URLable':
     * <ul>
     *   <li>{@code # $ % &} values 35-38</li>
     *   <li>{@code + , - . /} values 43-47</li>
     *   <li>{@code [0-9]} values 48-57</li>
     *   <li>{@code : ; = ? @} values 58-59, 61, 63-64</li>
     *   <li>{@code [A-Z]} values 65-90</li>
     *   <li>{@code _} value 95</li>
     *   <li>{@code [a-z]} values 97-122</li>
     *   <li>{@code ~} value 126</li>
     * </ul>
     *
     * <p>To summarize, the following ranges are considered URLable:
     * 35-38, 43-59, 61, 63-90, 95, 97-122, 126
     *
     * @param ch The character (represented by an int) to test.
     * @return True if it is a URLable character, false otherwise.
     */
    private boolean isURLableChar(int ch) {
        // Everything outside '#'..'~' is rejected outright.
        if (ch < 35 || ch > 126) {
            return false;
        }
        // 'a'..'~': lower-case letters, plus the tilde itself.
        if (ch >= 97) {
            return ch <= 122 || ch == 126;
        }
        // '?'..'`': '?', '@', upper-case letters, plus the underscore.
        if (ch >= 63) {
            return ch <= 90 || ch == 95;
        }
        // '+'..'>': punctuation, digits, ':' and ';', plus '='.
        if (ch >= 43) {
            return ch <= 59 || ch == 61;
        }
        // '#'..'*': only '#', '$', '%' and '&' qualify.
        return ch <= 38;
    }
}