org.archive.modules.extractor.JerichoExtractorHTML Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of heritrix-modules Show documentation
This project contains some of the configurable modules used within the Heritrix application to crawl the web. The modules in this project can be used in applications other than Heritrix, however.
There is a newer version: 3.5.0
Show newest version
/*RELICENSE_RESEARCH*/
/* JerichoExtractorHTML
 * 
 * Copyright (C) 2006 Olaf Freyer
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 * $Id$
 */

package org.archive.modules.extractor;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.atomic.AtomicLong;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.httpclient.URIException;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.commons.lang.StringUtils;
import org.archive.modules.CoreAttributeConstants;
import org.archive.modules.CrawlURI;
import org.archive.modules.net.RobotsPolicy;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.util.DevUtils;
import org.archive.util.TextUtils;

import au.id.jericho.lib.html.Attribute;
import au.id.jericho.lib.html.Attributes;
import au.id.jericho.lib.html.Element;
import au.id.jericho.lib.html.FormControl;
import au.id.jericho.lib.html.FormControlType;
import au.id.jericho.lib.html.FormField;
import au.id.jericho.lib.html.HTMLElementName;
import au.id.jericho.lib.html.Source;
import au.id.jericho.lib.html.StartTagType;

/**
 * Improved link-extraction from an HTML content-body using jericho-html parser.
 * This extractor extends ExtractorHTML and mimics its workflow - but has some
 * substantial differences when it comes to internal implementation. Instead
 * of heavily relying upon java regular expressions it uses a real html parser
 * library - namely Jericho HTML Parser (http://jerichohtml.sourceforge.net).
 * Using this parser it can better handle broken html (i.e. missing quotes)
 * and also offer improved extraction of HTML form URLs (not only extract
 * the action of a form, but also its default values).
 * Unfortunately this parser also has one major drawback - it has to read the
 * whole document into memory for parsing, thus has an inherent OOME risk.
 * This OOME risk can be reduced/eliminated by limiting the size of documents
 * to be parsed (i.e. using NotExceedsDocumentLengthTresholdDecideRule).
 * Also note that this extractor seems to have a lower overall memory 
 * consumption compared to ExtractorHTML. (still to be confirmed on a larger 
 * scale crawl) 
 * 
 * @author Olaf Freyer
 * @version $Date$ $Revision$
 */
@SuppressWarnings("unchecked")
public class JerichoExtractorHTML extends ExtractorHTML {

    // Serialization version id; flagged "unused" because nothing in this
    // class reads it directly.
    @SuppressWarnings("unused")
    private static final long serialVersionUID = 1684681316546343615L;

    // Class-wide java.util.logging logger, named after this class.
    final private static Logger logger = 
        Logger.getLogger(JerichoExtractorHTML.class.getName());

    // Running count of forms handled by this extractor; printed by report().
    // NOTE(review): presumably incremented in processForm() - confirm there.
    protected AtomicLong numberOfFormsProcessed = new AtomicLong(0);

    /*
    public JerichoExtractorHTML(String name) {
        this(name, "Jericho-HTML extractor. Extracts links from HTML " +
                "documents using Jericho HTML Parser. Offers same " + 
                "basic functionality as ExtractorHTML but better " +
                "handles broken HTML and extraction of default " +
                "values from HTML forms. A word of warning: the used " +
                "parser, the Jericho HTML Parser, reads the whole " +
                "document into memory for " +
                "parsing - thus this extractor has an inherent OOME risk. " +
                "This OOME risk can be reduced/eliminated by limiting the " +
                "size of documents to be parsed (i.e. using " +
                "NotExceedsDocumentLengthTresholdDecideRule). ");
    }*/

    public JerichoExtractorHTML() {
        super();
    }

    /**
     * Collects every attribute whose key starts with {@code "on"}
     * (for example {@code onclick} or {@code onload}), so that the caller
     * can hand the attribute values to the script-code processing.
     * The match is a plain prefix test on the key, so any attribute key
     * beginning with "on" is returned, not only known event handlers.
     *
     * @param attributes the attributes of a start tag; must not be null
     * @return the matching attributes in iteration order; an empty list
     *         (never null) when nothing matches
     */
    private static List<Attribute> findOnAttributes(Attributes attributes) {
        List<Attribute> result = new ArrayList<Attribute>();
        // Jericho's Attributes is a raw collection type, so the element type
        // has to be supplied through an (unchecked) cast for the typed loop
        // variable to compile.
        for (Attribute attr : (Iterable<Attribute>) attributes) {
            if (attr.getKey().startsWith("on")) {
                result.add(attr);
            }
        }
        return result;
    }


    protected void processGeneralTag(CrawlURI curi, Element element,
            Attributes attributes) {
        Attribute attr;
        String attrValue;
        List attrList;
        String elementName = element.getName();

        // Just in case it's an OBJECT or APPLET tag
        String codebase = null;
        ArrayList resources = null;

        final boolean framesAsEmbeds = getTreatFramesAsEmbedLinks();

        final boolean ignoreFormActions = getIgnoreFormActionUrls();
        
        final boolean overlyEagerLinkDetection = getExtractValueAttributes();

        // HREF
        if (((attr = attributes.get("href")) != null) &&
            ((attrValue = attr.getValue()) != null)) {
            CharSequence context = elementContext(elementName, attr
                    .getKey());
            if ("link".equals(elementName)) {
                //  elements treated as embeds (css, ico, etc)
                processEmbed(curi, attrValue, context);
            } else {
                // other HREFs treated as links
                processLink(curi, attrValue, context);
            }
            // Set the relative or absolute base URI if it's not already been modified. 
            // See https://github.com/internetarchive/heritrix3/pull/209
            if ("base".equals(elementName) && !curi.containsDataKey(CoreAttributeConstants.A_HTML_BASE)) {
                try {
                    UURI base = UURIFactory.getInstance(curi.getUURI(),attrValue);
                    curi.setBaseURI(base);
                } catch (URIException e) {
                    logUriError(e, curi.getUURI(), attrValue);
                }
            }
        }
        // ACTION
        if (((attr = attributes.get("action")) != null) &&
                 ((attrValue = attr.getValue()) != null)) {
            if (!ignoreFormActions) {
                CharSequence context = elementContext(elementName, attr
                        .getKey());
                processLink(curi, attrValue, context);
            }
        }
        // ON_
        if ((attrList = findOnAttributes(attributes)).size() != 0) {
            for (Iterator attrIter = attrList.iterator(); attrIter.hasNext();) {
                attr = (Attribute) attrIter.next();
                CharSequence valueSegment = attr.getValueSegment();
                if (valueSegment != null)
                    processScriptCode(curi, valueSegment);

            }
        }
        // SRC atc.
        if ((((attr = attributes.get("src")) != null)
                || ((attr = attributes.get("lowsrc")) != null)
                || ((attr = attributes.get("background")) != null)
                || ((attr = attributes.get("cite")) != null)
                || ((attr = attributes.get("longdesc")) != null)
                || ((attr = attributes.get("usemap")) != null)
                || ((attr = attributes.get("profile")) != null)
                || ((attr = attributes.get("datasrc")) != null)) &&
                   ((attrValue = attr.getValue()) != null)) {

            final Hop hopType;
            CharSequence context = elementContext(elementName, attr.getKey());

            if (!framesAsEmbeds
                    && ("frame".equals(elementName) || "iframe"
                            .equals(elementName)))
                hopType = Hop.NAVLINK;
            else
                hopType = Hop.EMBED;

            processEmbed(curi, attrValue, context, hopType);
        }
        // CODEBASE
        if (((attr = attributes.get("codebase")) != null) &&
                 ((attrValue = attr.getValue()) != null)) {
            codebase = StringEscapeUtils.unescapeHtml(attrValue);
            CharSequence context = elementContext(elementName, attr.getKey());
            processEmbed(curi, codebase, context);
        }
        // CLASSID DATA
        if ((((attr = attributes.get("classid")) != null)
                || ((attr = attributes.get("data")) != null)) &&
                   ((attrValue = attr.getValue()) != null)) {
            if (resources == null)
                resources = new ArrayList();
            resources.add(attrValue);
        }
        // ARCHIVE
        if (((attr = attributes.get("archive")) != null) &&
                 ((attrValue = attr.getValue()) != null)) {
            if (resources == null)
                resources = new ArrayList();
            String[] multi = TextUtils.split(WHITESPACE, attrValue);
            for (int i = 0; i < multi.length; i++) {
                resources.add(multi[i]);
            }
        }
        // CODE
        if (((attr = attributes.get("code")) != null) &&
                 ((attrValue = attr.getValue()) != null)) {
            if (resources == null)
                resources = new ArrayList();
            // If element is applet and code value does not end with
            // '.class' then append '.class' to the code value.
            if (APPLET.equals(elementName) && !attrValue.endsWith(CLASSEXT)) {
                resources.add(attrValue + CLASSEXT);
            } else {
                resources.add(attrValue);
            }
        }
        // VALUE
        if (((attr = attributes.get("value")) != null) &&
                 ((attrValue = attr.getValue()) != null)) {
            CharSequence valueContext = elementContext(elementName, attr.getKey());
            if("PARAM".equalsIgnoreCase(elementName) 
                    && "flashvars".equalsIgnoreCase(attributes.get("name").getValue())) {
                // special handling for )element.findFormFields()) {
            // for each form control
            for (FormControl formControl : (Iterable)formField.getFormControls()) {
                // get name of control element (and URLEncode it)
                String controlName = formControl.getName();

                // retrieve list of values - submit needs special handling
                Collection controlValues;
                if (!(formControl.getFormControlType() ==
                        FormControlType.SUBMIT)) {
                    controlValues = formControl.getValues();
                } else {
                    controlValues = formControl.getPredefinedValues();
                }

                if (controlValues.size() > 0) {
                    // for each value set
                    for (String value : controlValues) {
                        queryURL += "&" + controlName + "=" + value;
                    }
                } else {
                    queryURL += "&" + controlName + "=";
                }
            }
        }

        // clean up url
        if (action == null) {
            queryURL = queryURL.replaceFirst("&", "?");
        } else {
            if (!action.contains("?"))
                queryURL = queryURL.replaceFirst("&", "?");
            queryURL = action + queryURL;
        }

        CharSequence context = elementContext(element.getName(),
            "name=" + name);
        processLink(curi, queryURL, context);

    }

    /**
     * Run extractor. Parses the given character sequence with the Jericho
     * parser and dispatches every normal start tag to the matching handler:
     * META, SCRIPT, STYLE and FORM elements have dedicated handlers; any
     * other element that carries at least one attribute goes through
     * {@link #processGeneralTag(CrawlURI, Element, Attributes)}.
     * Processing stops at the first META element for which
     * {@code processMeta} returns {@code true}.
     * This method is protected so subclasses and same-package tests can
     * call it.
     * 
     * @param curi
     *            CrawlURI we're processing.
     * @param cs
     *            Sequence from underlying ReplayCharSequence.
     */
    protected void extract(CrawlURI curi, CharSequence cs) {
        Source source = new Source(cs);
        // The element type must be declared for the typed for-each below to
        // compile; Jericho itself hands back an untyped list.
        List<Element> elements = source.findAllElements(StartTagType.NORMAL);
        for (Element element : elements) {
            String elementName = element.getName();
            if (elementName.equals(HTMLElementName.META)) {
                if (processMeta(curi, element)) {
                    // meta tag included NOFOLLOW; abort processing
                    break;
                }
            } else if (elementName.equals(HTMLElementName.SCRIPT)) {
                processScript(curi, element);
            } else if (elementName.equals(HTMLElementName.STYLE)) {
                processStyle(curi, element);
            } else if (elementName.equals(HTMLElementName.FORM)) {
                processForm(curi, element);
            } else {
                // Attribute-less tags cannot carry links; skip them.
                Attributes attributes = element.getAttributes();
                if (!attributes.isEmpty()) {
                    processGeneralTag(curi, element, attributes);
                }
            }
        }
    }

    /*
     * (non-Javadoc)
     * 
     * Returns the superclass report followed by one extra line stating how
     * many forms this extractor has processed.
     * 
     * @see org.archive.crawler.framework.Processor#report()
     */
    public String report() {
        String parentReport = super.report();
        String formsLine = "  " + this.numberOfFormsProcessed + " forms processed\n";
        return new StringBuilder()
                .append(parentReport)
                .append(formsLine)
                .toString();
    }
}