org.archive.modules.extractor.ExtractorHTML Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of heritrix-modules Show documentation
This project contains some of the configurable modules used within the Heritrix application to crawl the web. The modules in this project can be used in applications other than Heritrix, however.
There is a newer version: 3.6.0
Show newest version
/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package org.archive.modules.extractor;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;

import org.apache.commons.httpclient.URIException;
import org.archive.io.ReplayCharSequence;
import org.archive.modules.CoreAttributeConstants;
import org.archive.modules.CrawlMetadata;
import org.archive.modules.CrawlURI;
import org.archive.modules.net.RobotsPolicy;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.util.DevUtils;
import org.archive.util.TextUtils;
import org.archive.util.UriUtils;
import org.springframework.beans.factory.InitializingBean;
import org.springframework.beans.factory.annotation.Autowired;

/**
 * Basic link-extraction, from an HTML content-body,
 * using regular expressions.
 *
 * NOTE: This processor may open a ReplayCharSequence from the 
 * CrawlURI's Recorder, without closing that ReplayCharSequence, to allow
 * reuse by later processors in sequence. In the usual (Heritrix) case, a 
 * call after all processing to the Recorder's endReplays() method ensures
 * timely close of any reused ReplayCharSequences. Reuse of this processor
 * elsewhere should ensure a similar cleanup call to Recorder.endReplays()
 * occurs. 
 * 
 * TODO: Compare against extractors based on HTML parsing libraries for 
 * accuracy, completeness, and speed.
 * 
 * @author gojomo
 */
public class ExtractorHTML extends ContentExtractor implements InitializingBean {

    // Serialization version marker; annotated "unused" because nothing in
    // the visible code reads it directly.
    @SuppressWarnings("unused")
    private static final long serialVersionUID = 2L;

    // java.util.logging logger named after this class.
    private static Logger logger =
        Logger.getLogger(ExtractorHTML.class.getName());

    // Placeholder tokens embedded in the regex templates of this class.
    // afterPropertiesSet() replaces each token with the corresponding
    // configured numeric limit before the pattern is used.
    private final static String MAX_ELEMENT_REPLACE = "MAX_ELEMENT";
    
    private final static String MAX_ATTR_NAME_REPLACE = "MAX_ATTR_NAME";
    
    private final static String MAX_ATTR_VAL_REPLACE = "MAX_ATTR_VAL";

    // Public key constant; its use is not visible in this part of the file
    // (presumably the key under which robots META content is recorded --
    // TODO confirm).
    public final static String A_META_ROBOTS = "meta-robots";
    
    // Key of the CrawlURI data list to which extract() appends the offset
    // of each FORM tag it encounters.
    public final static String A_FORM_OFFSETS = "form-offsets";
    
    // Instance initializer: installs the default limit of 64.
    {
        setMaxElementLength(64);
    }

    /**
     * Returns the limit stored under the "maxElementLength" key of
     * {@code kp}. afterPropertiesSet() substitutes this value for the
     * MAX_ELEMENT placeholder of the tag pattern.
     *
     * @return the configured limit
     */
    public int getMaxElementLength() {
        return ((Integer) kp.get("maxElementLength")).intValue();
    }

    /**
     * Stores a new limit under the "maxElementLength" key of {@code kp}.
     *
     * @param max the limit to store
     */
    public void setMaxElementLength(int max) {
        kp.put("maxElementLength", Integer.valueOf(max));
    }
      
    
    /**
     * Relevant tag extractor.
     *
     * <p>This pattern extracts either:
     * <ul>
     * <li>(1) whole &lt;script&gt;...&lt;/script&gt; or</li>
     * <li>(2) &lt;style&gt;...&lt;/style&gt; or</li>
     * <li>(3) &lt;meta ...&gt; or</li>
     * <li>(4) any other open-tag with at least one attribute (eg matches
     * "&lt;a href='boo'&gt;" but not "&lt;/a&gt;" or "&lt;br&gt;")</li>
     * </ul>
     *
     * <p>groups:
     * <ul>
     * <li>1: SCRIPT SRC=foo&gt;boo&lt;/SCRIPT</li>
     * <li>2: just script open tag</li>
     * <li>3: STYLE TYPE=moo&gt;zoo&lt;/STYLE</li>
     * <li>4: just style open tag</li>
     * <li>5: entire other tag, without '&lt;' '&gt;'</li>
     * <li>6: element</li>
     * <li>7: META</li>
     * <li>8: !-- comment --</li>
     * </ul>
     *
     * <p>HER-1998 - Modified part 8 to allow conditional html comments.
     * Conditional HTML comment example:
     * "&lt;!--[if expression]&gt; HTML &lt;![endif]--&gt;"
     *
     * <p>This technique is commonly used to reference CSS &amp; JavaScript
     * that are designed to deal with the quirks of a specific version of
     * Internet Explorer. There is another syntax for conditional comments
     * which already gets parsed by the regex since it doesn't start with
     * "&lt;!--" Ex. &lt;!if expression&gt; HTML &lt;!endif&gt;
     *
     * <p>https://en.wikipedia.org/wiki/Conditional_Comments
     *
     * <p>The literal contains the MAX_ELEMENT placeholder, which
     * afterPropertiesSet() replaces with the configured maximum element
     * length before the pattern is used.
     */
    // version w/ less unnecessary backtracking
    static final String RELEVANT_TAG_EXTRACTOR =
      "(?is)<(?:((script[^>]*+)>.*?</script)" + // 1, 2
      "|((style[^>]*+)>.*?</style)" + // 3, 4
      "|(((meta)|(?:\\w{1,"+MAX_ELEMENT_REPLACE+"}))\\s+[^>]*+)" + // 5, 6, 7
      // the negative lookahead keeps "<!--[if" and "<!-->" from being
      // consumed as ordinary comments
      "|(!--(?!\\[if|>).*?--))>"; // 8

//    version w/ problems with unclosed script tags 
//    static final String RELEVANT_TAG_EXTRACTOR =
//    "(?is)<(?:((script.*?)>.*?</(?:script))|((style.*?)>.*?</(?:style))|(((meta)|(?:\\w+))\\s+.*?)|(!--.*?--))>";


      
//    // this pattern extracts 'href' or 'src' attributes from
//    // any open-tag innards matched by the above
//    static Pattern RELEVANT_ATTRIBUTE_EXTRACTOR = Pattern.compile(
//     "(?is)(\\w+)(?:\\s+|(?:\\s.*?\\s))(?:(href)|(src))\\s*=(?:(?:\\s*\"(.+?)\")|(?:\\s*'(.+?)')|(\\S+))");
//
//    // this pattern extracts 'robots' attributes
//    static Pattern ROBOTS_ATTRIBUTE_EXTRACTOR = Pattern.compile(
//     "(?is)(\\w+)\\s+.*?(?:(robots))\\s*=(?:(?:\\s*\"(.+)\")|(?:\\s*'(.+)')|(\\S+))");

    // Instance initializer: installs the default limit of 64 characters.
    {
        setMaxAttributeNameLength(64);
    }

    /**
     * Returns the limit stored under the "maxAttributeNameLength" key of
     * {@code kp}. afterPropertiesSet() substitutes this value for the
     * MAX_ATTR_NAME placeholder of the attribute pattern.
     *
     * @return the configured limit
     */
    public int getMaxAttributeNameLength() {
        return ((Integer) kp.get("maxAttributeNameLength")).intValue();
    }

    /**
     * Stores a new limit under the "maxAttributeNameLength" key of
     * {@code kp}.
     *
     * @param max the limit to store
     */
    public void setMaxAttributeNameLength(int max) {
        kp.put("maxAttributeNameLength", Integer.valueOf(max));
    }


    // Instance initializer: installs the default limit of 2048 (2K).
    {
        setMaxAttributeValLength(2048);
    }

    /**
     * Returns the limit stored under the "maxAttributeValLength" key of
     * {@code kp}. afterPropertiesSet() substitutes this value for the
     * MAX_ATTR_VAL placeholder of the attribute pattern.
     *
     * @return the configured limit
     */
    public int getMaxAttributeValLength() {
        return ((Integer) kp.get("maxAttributeValLength")).intValue();
    }

    /**
     * Stores a new limit under the "maxAttributeValLength" key of
     * {@code kp}.
     *
     * @param max the limit to store
     */
    public void setMaxAttributeValLength(int max) {
        kp.put("maxAttributeValLength", Integer.valueOf(max));
    }
      
    // TODO: perhaps cut to near MAX_URI_LENGTH
    
    /**
     * Regex template that matches one {@code name = value} attribute at a
     * time inside the tag innards captured by the relevant-tag pattern.
     * Attribute names known to carry URIs (or script/style code) each get
     * their own capture group so processGeneralTag() can tell them apart
     * via {@code attr.start(n)}; every other name lands in group 13.
     *
     * The literal contains the MAX_ATTR_NAME and MAX_ATTR_VAL placeholders;
     * afterPropertiesSet() replaces them with the configured limits before
     * the pattern is used.
     *
     * A quoted value may be terminated either by its closing quote or by
     * end of input (the {@code (?:"|$)} / {@code (?:'|$)} alternatives), so
     * an unclosed quote still yields a value.
     */
    // this pattern extracts attributes from any open-tag innards
    // matched by the above. attributes known to be URIs of various
    // sorts are matched specially
    static final String EACH_ATTRIBUTE_EXTRACTOR =
      "(?is)\\s?((href)|(action)|(on\\w*)" // 1, 2, 3, 4 
     +"|((?:src)|(?:srcset)|(?:lowsrc)|(?:background)|(?:cite)" // ...
     +"|(?:longdesc)|(?:usemap)|(?:profile)|(?:datasrc))" // 5
     +"|(codebase)|((?:classid)|(?:data))|(archive)|(code)" // 6, 7, 8, 9
     +"|(value)|(style)|(method)" // 10, 11, 12
     +"|([-\\w]{1,"+MAX_ATTR_NAME_REPLACE+"}))" // 13
     +"\\s*=\\s*"
     +"(?:(?:\"(.{0,"+MAX_ATTR_VAL_REPLACE+"}?)(?:\"|$))" // 14
     +"|(?:'(.{0,"+MAX_ATTR_VAL_REPLACE+"}?)(?:'|$))" // 15
     +"|(\\S{1,"+MAX_ATTR_VAL_REPLACE+"}))"; // 16
    // groups:
    // 1: attribute name
    // 2: HREF - single URI relative to doc base, or occasionally javascript:
    // 3: ACTION - single URI relative to doc base, or occasionally javascript:
    // 4: ON[WHATEVER] - script handler
    // 5: SRC,SRCSET,LOWSRC,BACKGROUND,CITE,LONGDESC,USEMAP,PROFILE, or DATASRC
    //    single URI relative to doc base
    // 6: CODEBASE - a single URI relative to doc base, affecting other
    //    attributes
    // 7: CLASSID, DATA - a single URI relative to CODEBASE (if supplied)
    // 8: ARCHIVE - one or more space-delimited URIs relative to CODEBASE
    //    (if supplied)
    // 9: CODE - a single URI relative to the CODEBASE (if specified).
    // 10: VALUE - often includes a uri path on forms
    // 11: STYLE - inline attribute style info
    // 12: METHOD - form GET/POST
    // 13: any other attribute
    // 14: double-quote delimited attr value
    // 15: single-quote delimited attr value
    // 16: space-delimited attr value

    
    // Regex used by processGeneralTag() to split a multi-valued ARCHIVE
    // attribute into its individual entries.
    static final String WHITESPACE = "\\s";
    // Suffix appended to an APPLET's CODE value when it does not already
    // end with it.
    static final String CLASSEXT =".class";
    // Element names that processGeneralTag() compares (case-insensitively)
    // against the current element to select special handling.
    static final String APPLET = "applet";
    static final String BASE = "base";
    static final String LINK = "link";
    static final String FRAME = "frame";
    static final String IFRAME = "iframe";

    
    // Instance initializer: the setting defaults to true.
    {
        setTreatFramesAsEmbedLinks(true);
    }

    /**
     * If true, FRAME/IFRAME SRC-links are treated as embedded resources (like
     * IMG, 'E' hop-type), otherwise they are treated as navigational links.
     * Default is true.
     *
     * @return the value stored under the "treatFramesAsEmbedLinks" key
     */
    public boolean getTreatFramesAsEmbedLinks() {
        return ((Boolean) kp.get("treatFramesAsEmbedLinks")).booleanValue();
    }

    /**
     * Stores the setting under the "treatFramesAsEmbedLinks" key of
     * {@code kp}.
     *
     * @param asEmbeds true to treat FRAME/IFRAME SRC-links as embeds
     */
    public void setTreatFramesAsEmbedLinks(boolean asEmbeds) {
        kp.put("treatFramesAsEmbedLinks", Boolean.valueOf(asEmbeds));
    }
    
    // Instance initializer: the setting defaults to false.
    {
        setIgnoreFormActionUrls(false);
    }

    /**
     * If true, URIs appearing as the ACTION attribute in HTML FORMs are
     * ignored. Default is false.
     *
     * @return the value stored under the "ignoreFormActionUrls" key
     */
    public boolean getIgnoreFormActionUrls() {
        return ((Boolean) kp.get("ignoreFormActionUrls")).booleanValue();
    }

    /**
     * Stores the setting under the "ignoreFormActionUrls" key of
     * {@code kp}.
     *
     * @param ignoreActions true to ignore FORM ACTION URIs
     */
    public void setIgnoreFormActionUrls(boolean ignoreActions) {
        kp.put("ignoreFormActionUrls", Boolean.valueOf(ignoreActions));
    }

    // Instance initializer: the setting defaults to true.
    {
        setExtractOnlyFormGets(true);
    }

    /**
     * If true, only ACTION URIs with a METHOD of GET (explicit or implied)
     * are extracted. Default is true.
     *
     * @return the value stored under the "extractOnlyFormGets" key
     */
    public boolean getExtractOnlyFormGets() {
        return ((Boolean) kp.get("extractOnlyFormGets")).booleanValue();
    }

    /**
     * Stores the setting under the "extractOnlyFormGets" key of {@code kp}.
     *
     * @param onlyGets true to extract ACTION URIs only for GET forms
     */
    public void setExtractOnlyFormGets(boolean onlyGets) {
        kp.put("extractOnlyFormGets", Boolean.valueOf(onlyGets));
    }
    
    // Instance initializer: the setting defaults to true.
    {
        setExtractJavascript(true);
    }

    /**
     * If true, in-page Javascript is scanned for strings that
     * appear likely to be URIs. This typically finds both valid
     * and invalid URIs, and attempts to fetch the invalid URIs
     * sometimes generates webmaster concerns over odd crawler
     * behavior. Default is true.
     *
     * @return the value stored under the "extractJavascript" key
     */
    public boolean getExtractJavascript() {
        return ((Boolean) kp.get("extractJavascript")).booleanValue();
    }

    /**
     * Stores the setting under the "extractJavascript" key of {@code kp}.
     *
     * @param extractJavascript true to scan in-page Javascript for URIs
     */
    public void setExtractJavascript(boolean extractJavascript) {
        kp.put("extractJavascript", Boolean.valueOf(extractJavascript));
    }

    // Instance initializer: the setting defaults to true.
    {
        setExtractValueAttributes(true);
    }

    /**
     * If true, strings that look like URIs found in unusual places (such as
     * form VALUE attributes) will be extracted. This typically finds both valid
     * and invalid URIs, and attempts to fetch the invalid URIs sometimes
     * generate webmaster concerns over odd crawler behavior. Default is true.
     *
     * @return the value stored under the "extractValueAttributes" key
     */
    public boolean getExtractValueAttributes() {
        return ((Boolean) kp.get("extractValueAttributes")).booleanValue();
    }

    /**
     * Stores the setting under the "extractValueAttributes" key of
     * {@code kp}.
     *
     * @param extractValueAttributes true to extract URI-like strings from
     *        VALUE attributes
     */
    public void setExtractValueAttributes(boolean extractValueAttributes) {
        kp.put("extractValueAttributes", Boolean.valueOf(extractValueAttributes));
    }

    // Instance initializer: the setting defaults to true.
    {
        setIgnoreUnexpectedHtml(true);
    }

    /**
     * If true, URIs which end in typical non-HTML extensions (such as .gif)
     * will not be scanned as if it were HTML. Default is true.
     *
     * @return the value stored under the "ignoreUnexpectedHtml" key
     */
    public boolean getIgnoreUnexpectedHtml() {
        return ((Boolean) kp.get("ignoreUnexpectedHtml")).booleanValue();
    }

    /**
     * Stores the setting under the "ignoreUnexpectedHtml" key of
     * {@code kp}.
     *
     * @param ignoreUnexpectedHtml true to skip URIs with typical non-HTML
     *        extensions
     */
    public void setIgnoreUnexpectedHtml(boolean ignoreUnexpectedHtml) {
        kp.put("ignoreUnexpectedHtml", Boolean.valueOf(ignoreUnexpectedHtml));
    }
    
    /**
     * CrawlMetadata provides the robots honoring policy to use when
     * considering a robots META tag.
     */
    protected CrawlMetadata metadata;

    /**
     * @return the CrawlMetadata currently held by this extractor; may be
     *         null if none has been set
     */
    public CrawlMetadata getMetadata() {
        return this.metadata;
    }

    /**
     * Replaces the held CrawlMetadata. Annotated {@code @Autowired}, so it
     * is presumably supplied by the Spring container.
     *
     * @param provider the CrawlMetadata to use
     */
    @Autowired
    public void setMetadata(CrawlMetadata provider) {
        metadata = provider;
    }
    
    /**
     * Javascript extractor to use to process inline javascript. Autowired if
     * available. If null, links will not be extracted from inline javascript.
     */
    protected transient ExtractorJS extractorJS;

    /**
     * @return the Javascript extractor currently held; may be null
     */
    public ExtractorJS getExtractorJS() {
        return this.extractorJS;
    }

    /**
     * Replaces the held Javascript extractor.
     *
     * @param extractorJS the extractor to use for inline javascript
     */
    @Autowired
    public void setExtractorJS(ExtractorJS extractorJS) {
        this.extractorJS = extractorJS;
    }
    
    // Effective regex strings: the RELEVANT_TAG_EXTRACTOR and
    // EACH_ATTRIBUTE_EXTRACTOR templates with their MAX_* placeholders
    // replaced by the configured limits. Both are assigned in
    // afterPropertiesSet() and are null until it has run.
    // NOTE(review): the former "TODO: convert to Strings" appears to be
    // done already -- both fields are Strings.
    private String relevantTagPattern;
    private String eachAttributePattern;
 
    /**
     * No-argument constructor with an empty body. Defaults are installed by
     * the instance initializer blocks of this class; the effective regex
     * strings are built later in afterPropertiesSet().
     */
    public ExtractorHTML() {
    }

    /**
     * Builds the effective regex strings from the two templates by
     * substituting the currently configured limits for their placeholder
     * tokens. Limits changed after this call are not reflected until it is
     * invoked again.
     */
    public void afterPropertiesSet() {
        // tag pattern: only the element-length placeholder applies
        final String elementLimit = String.valueOf(getMaxElementLength());
        this.relevantTagPattern =
            RELEVANT_TAG_EXTRACTOR.replace(MAX_ELEMENT_REPLACE, elementLimit);

        // attribute pattern: name limit first, then value limit
        final String nameLimited = EACH_ATTRIBUTE_EXTRACTOR.replace(
            MAX_ATTR_NAME_REPLACE, String.valueOf(getMaxAttributeNameLength()));
        this.eachAttributePattern = nameLimited.replace(
            MAX_ATTR_VAL_REPLACE, String.valueOf(getMaxAttributeValLength()));
    }
    

    protected void processGeneralTag(CrawlURI curi, CharSequence element,
            CharSequence cs) {

        Matcher attr = TextUtils.getMatcher(eachAttributePattern,cs);

        // Just in case it's an OBJECT or APPLET tag
        String codebase = null;
        ArrayList resources = null;
        
        // Just in case it's a FORM
        CharSequence action = null;
        CharSequence actionContext = null;
        CharSequence method = null; 
        
        // Just in case it's a VALUE whose interpretation depends on accompanying NAME
        CharSequence valueVal = null; 
        CharSequence valueContext = null;
        CharSequence nameVal = null; 
        
        final boolean framesAsEmbeds = 
            getTreatFramesAsEmbedLinks();

        final boolean ignoreFormActions = 
            getIgnoreFormActionUrls();
        
        final boolean extractValueAttributes = 
            getExtractValueAttributes();
        
        final String elementStr = element.toString();

        while (attr.find()) {
            int valueGroup =
                (attr.start(14) > -1) ? 14 : (attr.start(15) > -1) ? 15 : 16;
            int start = attr.start(valueGroup);
            int end = attr.end(valueGroup);
            assert start >= 0: "Start is: " + start + ", " + curi;
            assert end >= 0: "End is :" + end + ", " + curi;
            CharSequence value = cs.subSequence(start, end);
            CharSequence attrName = cs.subSequence(attr.start(1),attr.end(1));
            value = TextUtils.unescapeHtml(value);
            if (attr.start(2) > -1) {
                CharSequence context;
                // HREF
                if ("a".equals(element) && TextUtils.matches("(?i).*data-remote\\s*=\\s*([\"'])true.*\\1", cs)) {
                    context = "a[data-remote='true']/@href";
                } else {
                    context = elementContext(element, attr.group(2));
                }

                if ("a[data-remote='true']/@href".equals(context) || elementStr.equalsIgnoreCase(LINK)) {
                    //  elements treated as embeds (css, ico, etc)
                    processEmbed(curi, value, context);
                } else {
                    // other HREFs treated as links
                    processLink(curi, value, context);
                }
                // Set the relative or absolute base URI if it's not already been modified. 
                // See https://github.com/internetarchive/heritrix3/pull/209
                if (elementStr.equalsIgnoreCase(BASE) && !curi.containsDataKey(CoreAttributeConstants.A_HTML_BASE)) {
                    try {
                        UURI base = UURIFactory.getInstance(curi.getUURI(),value.toString());
                        curi.setBaseURI(base);
                    } catch (URIException e) {
                        logUriError(e, curi.getUURI(), value);
                    }
                }
            } else if (attr.start(3) > -1) {
                // ACTION
                if (!ignoreFormActions) {
                    action = value; 
                    actionContext = elementContext(element, attr.group(3));
                    // handling finished only at end (after METHOD also collected)
                }
            } else if (attr.start(4) > -1) {
                // ON____
                processScriptCode(curi, value); // TODO: context?
            } else if (attr.start(5) > -1) {
                // SRC etc.
                CharSequence context = elementContext(element, attr.group(5));
                if (!context.toString().toLowerCase().startsWith("data:")) {

                    // true, if we expect another HTML page instead of an image etc.
                    final Hop hop;

                    if (!framesAsEmbeds
                            && (elementStr.equalsIgnoreCase(FRAME) || elementStr
                            .equalsIgnoreCase(IFRAME))) {
                        hop = Hop.NAVLINK;
                    } else {
                        hop = Hop.EMBED;
                    }
                    processEmbed(curi, value, context, hop);
                }
            } else if (attr.start(6) > -1) {
                // CODEBASE
                codebase = (value instanceof String)?
                    (String)value: value.toString();
                CharSequence context = elementContext(element,
                    attr.group(6));
                processLink(curi, codebase, context);
            } else if (attr.start(7) > -1) {
                // CLASSID, DATA
                if (resources == null) {
                    resources = new ArrayList();
                }
                resources.add(value.toString());
            } else if (attr.start(8) > -1) {
                // ARCHIVE
                if (resources==null) {
                    resources = new ArrayList();
                }
                String[] multi = TextUtils.split(WHITESPACE, value);
                for(int i = 0; i < multi.length; i++ ) {
                    resources.add(multi[i]);
                }
            } else if (attr.start(9) > -1) {
                // CODE
                if (resources==null) {
                    resources = new ArrayList();
                }
                // If element is applet and code value does not end with
                // '.class' then append '.class' to the code value.
                if (elementStr.equalsIgnoreCase(APPLET) &&
                        !value.toString().toLowerCase().endsWith(CLASSEXT)) {
                    resources.add(value.toString() + CLASSEXT);
                } else {
                    resources.add(value.toString());
                }
            } else if (attr.start(10) > -1) {
                // VALUE, with possibility of URI
                // store value, context for handling at end
                valueVal = value; 
                valueContext = elementContext(element,attr.group(10));
            } else if (attr.start(11) > -1) {
                // STYLE inline attribute
                // then, parse for URIs
                numberOfLinksExtracted.addAndGet(ExtractorCSS.processStyleCode(
                        this, curi, value));        
            } else if (attr.start(12) > -1) {
                // METHOD
                method = value;
                // form processing finished at end (after ACTION also collected)
            } else if (attr.start(13) > -1) {
                if("NAME".equalsIgnoreCase(attrName.toString())) {
                    // remember 'name' for end-analysis
                    nameVal = value; 
                }
                if("FLASHVARS".equalsIgnoreCase(attrName.toString())) {
                    // consider FLASHVARS attribute immediately
                    valueContext = elementContext(element,attr.group(13));
                    considerQueryStringValues(curi, value, valueContext,Hop.SPECULATIVE);
                }
                // any other attribute
                // ignore for now
                // could probe for path- or script-looking strings, but
                // those should be vanishingly rare in other attributes,
                // and/or symptomatic of page bugs
            }
        }
        TextUtils.recycleMatcher(attr);

        // handle codebase/resources
        if (resources != null) {
            Iterator iter = resources.iterator();
            UURI codebaseURI = null;
            String res = null;
            try {
                if (codebase != null) {
                    // TODO: Pass in the charset.
                    codebaseURI = UURIFactory.
                        getInstance(curi.getUURI(), codebase);
                }
                while(iter.hasNext()) {
                    res = iter.next().toString();
                    res = (String) TextUtils.unescapeHtml(res);
                    if (codebaseURI != null) {
                        res = codebaseURI.resolve(res).toString();
                    }
                    processEmbed(curi, res, element); // TODO: include attribute too
                }
            } catch (URIException e) {
                curi.getNonFatalFailures().add(e);
            } catch (IllegalArgumentException e) {
                DevUtils.logger.log(Level.WARNING, "processGeneralTag()\n" +
                    "codebase=" + codebase + " res=" + res + "\n" +
                    DevUtils.extraInfo(), e);
            }
        }
           
        // finish handling form action, now method is available
        if(action != null) {
            if(method == null || "GET".equalsIgnoreCase(method.toString()) 
                        || ! getExtractOnlyFormGets()) {
                processLink(curi, action, actionContext);
            }
        }
        
        // finish handling VALUE
        if(valueVal != null) {
            if ("PARAM".equalsIgnoreCase(elementStr) && nameVal != null
                    && "flashvars".equalsIgnoreCase(nameVal.toString())) {
                // special handling for 0) {
               curi.getNonFatalFailures().add(cs.getCodingException()); 
           }
           // Set flag to indicate that link extraction is completed.
           return true;
        } catch (IOException e) {
            curi.getNonFatalFailures().add(e);
            logger.log(Level.WARNING,"Failed get of replay char sequence in " +
                Thread.currentThread().getName(), e);
        }
        return false;
    }
    
    /**
     * Determines the charset a document declares for itself, checking in
     * order:
     * <ol>
     * <li>&lt;meta http-equiv="content-type" content="...; charset=..."&gt;</li>
     * <li>&lt;meta charset="..."&gt;</li>
     * <li>&lt;?xml ... encoding="..."?&gt;</li>
     * </ol>
     * The first declaration that yields a charset name wins.
     *
     * @param curi CrawlURI being processed; annotated with
     *        "unsatisfiableCharsetInHTML:NAME" when the declared name is
     *        not a usable charset
     * @param contentPrefix text to search for a declaration (presumably
     *        the leading portion of the document -- confirm with caller)
     * @return the declared Charset, or null if no declaration was found or
     *         the declared name is illegal/unsupported on this JVM
     */
    protected Charset getContentDeclaredCharset(CrawlURI curi, String contentPrefix) {
        String charsetName = null;

        // 1. <meta http-equiv="content-type" content="text/html; charset=iso-8859-1">
        String metaContentType = findDeclaredCharsetGroup(
                "(?is)<meta\\s+[^>]*http-equiv\\s*=\\s*['\"]content-type['\"][^>]*>",
                contentPrefix, 0);
        if (metaContentType != null) {
            // search only inside the matched meta tag for the charset param
            charsetName = findDeclaredCharsetGroup(
                    "charset=([^'\";\\s>]+)", metaContentType, 1);
        }

        if (charsetName == null) {
            // 2. <meta charset="utf-8">
            charsetName = findDeclaredCharsetGroup(
                    "(?si)<meta\\s+[^>]*charset=['\"]([^'\";\\s>]+)['\"]",
                    contentPrefix, 1);
        }

        if (charsetName == null) {
            // 3. <?xml version="1.0" encoding="utf-8"?>
            charsetName = findDeclaredCharsetGroup(
                    "(?is)<\\?xml\\s+[^>]*encoding=['\"]([^'\"]+)['\"]",
                    contentPrefix, 1);
        }

        if (charsetName == null) {
            return null; // none found
        }

        try {
            return Charset.forName(charsetName);
        } catch (IllegalArgumentException iae) {
            // covers both illegal and unsupported charset names
            logger.log(Level.INFO,"Unknown content-encoding '"+charsetName+"' declared; using default");
            curi.getAnnotations().add("unsatisfiableCharsetInHTML:"+charsetName);
            return null;
        }
    }

    /**
     * Returns the requested group of the first match of regex in input, or
     * null when there is no match. The matcher obtained from TextUtils is
     * handed back via recycleMatcher() on every path.
     */
    private static String findDeclaredCharsetGroup(String regex,
            CharSequence input, int group) {
        Matcher matcher = TextUtils.getMatcher(regex, input);
        try {
            return matcher.find() ? matcher.group(group) : null;
        } finally {
            TextUtils.recycleMatcher(matcher);
        }
    }

    /**
     * Run extractor.
     * This method is package visible to ease testing.
     * @param curi CrawlURI we're processing.
     * @param cs Sequence from underlying ReplayCharSequence. This
     * is TRANSIENT data. Make a copy if you want the data to live outside
     * of this extractors' lifetime.
     */
    protected void extract(CrawlURI curi, CharSequence cs) {
        Matcher tags = TextUtils.getMatcher(relevantTagPattern,cs);
        while(tags.find()) {
            if(Thread.interrupted()){
                break;
            }
            if (tags.start(8) > 0) {
                // comment match
                // for now do nothing
            } else if (tags.start(7) > 0) {
                //  match
                int start = tags.start(5);
                int end = tags.end(5);
                assert start >= 0: "Start is: " + start + ", " + curi;
                assert end >= 0: "End is :" + end + ", " + curi;
                if (processMeta(curi,
                    cs.subSequence(start, end))) {

                    // meta tag included NOFOLLOW; abort processing
                    break;
                }
            } else if (tags.start(5) > 0) {
                // generic  match
                int start5 = tags.start(5);
                int end5 = tags.end(5);
                assert start5 >= 0: "Start is: " + start5 + ", " + curi;
                assert end5 >= 0: "End is :" + end5 + ", " + curi;
                int start6 = tags.start(6);
                int end6 = tags.end(6);
                assert start6 >= 0: "Start is: " + start6 + ", " + curi;
                assert end6 >= 0: "End is :" + end6 + ", " + curi;
                String element = cs.subSequence(start6, end6).toString();
                CharSequence attributes = cs.subSequence(start5, end5);
                processGeneralTag(curi,
                    element,
                    attributes);
                // remember FORM to help later extra processing
                if ("form".equalsIgnoreCase(element)) {
                    curi.getDataList(A_FORM_OFFSETS).add((Integer)(start6-1));
                }
               

            } else if (tags.start(1) > 0) {
                //