/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.modules.extractor;
import java.util.concurrent.atomic.AtomicLong;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.httpclient.URIException;
import org.archive.modules.CrawlURI;
import org.archive.modules.Processor;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.json.JSONException;
import org.json.JSONObject;
import org.springframework.beans.factory.annotation.Autowired;
/**
* Extracts links from fetched URIs. This class provides error handling
* for some common issues that occur when parsing document content. You
* almost certainly want to subclass {@link ContentExtractor} instead of
* this class.
*
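* <p>A minimal sketch of a direct subclass, for illustration only: the
* class name is hypothetical, {@code shouldProcess} is inherited from
* {@link org.archive.modules.Processor}, and the constants
* {@code LinkContext.NAVLINK_MISC} and {@code Hop.NAVLINK} are assumed
* from this package:
* <pre>{@code
* public class TrivialExtractor extends Extractor {
*     protected boolean shouldProcess(CrawlURI uri) {
*         return true; // consider every fetched URI
*     }
*     protected void extract(CrawlURI uri) {
*         // Hypothetical: report one fixed outlink for every URI.
*         addOutlink(uri, "http://example.com/", LinkContext.NAVLINK_MISC,
*                 Hop.NAVLINK);
*         numberOfLinksExtracted.incrementAndGet();
*     }
* }
* }</pre>
*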
* @author pjack
*/
public abstract class Extractor extends Processor {
/** Logger. */
private static final Logger LOGGER =
Logger.getLogger(Extractor.class.getName());
protected AtomicLong numberOfLinksExtracted = new AtomicLong(0);
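/**
* Fallback parameters used when no {@link ExtractorParameters} bean is
* wired in: at most 6000 outlinks per URI, no independent extraction,
* and no extraction from 404 responses.
*/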
public static final ExtractorParameters DEFAULT_PARAMETERS =
new ExtractorParameters() {
public int getMaxOutlinks() {
return 6000;
}
public boolean getExtractIndependently() {
return false;
}
public boolean getExtract404s() {
return false;
}
};
protected transient UriErrorLoggerModule loggerModule;
public UriErrorLoggerModule getLoggerModule() {
return this.loggerModule;
}
@Autowired
public void setLoggerModule(UriErrorLoggerModule loggerModule) {
this.loggerModule = loggerModule;
}
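// Instance initializer: start with the built-in defaults; Spring may
// replace them later via the optional autowired setter below.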
{
setExtractorParameters(DEFAULT_PARAMETERS);
}
protected transient ExtractorParameters extractorParameters;
public ExtractorParameters getExtractorParameters() {
return extractorParameters;
}
@Autowired(required=false)
public void setExtractorParameters(ExtractorParameters helper) {
this.extractorParameters = helper;
}
/**
* Processes the given URI. This method just delegates to
* {@link #extract(CrawlURI)}, catching a few usually non-fatal
* problems (NullPointerException, StackOverflowError,
* CoderMalfunctionError) so they can be highlighted in the
* relevant log(s) rather than propagating.
*
* Notably, StackOverflowError is caught here, as it seems to
* happen often when dealing with document parsing APIs.
*
* @param uri the URI to extract links from
*/
protected final void innerProcess(CrawlURI uri)
throws InterruptedException {
try {
extract(uri);
} catch (NullPointerException npe) {
handleException(uri, npe);
} catch (StackOverflowError soe) {
handleException(uri, soe);
} catch (java.nio.charset.CoderMalfunctionError cme) {
// See http://sourceforge.net/tracker/index.php?func=detail&aid=1540222&group_id=73833&atid=539099
handleException(uri, cme);
}
}
private void handleException(CrawlURI uri, Throwable t) {
// both annotate (to highlight in crawl log) & add as local-error
uri.getAnnotations().add("err=" + t.getClass().getName());
uri.getNonFatalFailures().add(t);
// also log as INFO
// TODO: remove as redundant, given nonfatal logging?
LOGGER.log(Level.INFO, "Exception", t);
}
/**
* Extracts links from the given URI. Subclasses should use
* {@link CrawlURI#getRecorder()} to process the content of the
* URI. Any links that are discovered should be added to the
* {@link CrawlURI#getOutLinks()} set, typically via the
* {@code addOutlink} helpers, which handle URI resolution and
* error logging.
*
* @param uri the URI to extract links from
*/
protected abstract void extract(CrawlURI uri);
/**
* Creates and adds an out-link to the CrawlURI with the given
* URI, context, and hop type, logging any URIException instead
* of propagating it.
*
* @param curi the source CrawlURI whose out-links are extended
* @param uri the discovered URI, possibly relative to curi
* @param context the link context in which the URI was found
* @param hop the hop type of the discovered link
*/
protected void addOutlink(CrawlURI curi, String uri, LinkContext context,
Hop hop) {
try {
UURI dest = UURIFactory.getInstance(curi.getUURI(), uri);
CrawlURI link = curi.createCrawlURI(dest, context, hop);
curi.getOutLinks().add(link);
} catch (URIException e) {
logUriError(e, curi.getUURI(), uri);
}
}
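/**
* Creates and adds an out-link for an already-parsed UURI, logging any
* URIException instead of propagating it.
*/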
protected void addOutlink(CrawlURI curi, UURI uuri, LinkContext context,
Hop hop) {
try {
CrawlURI link = curi.createCrawlURI(uuri, context, hop);
curi.getOutLinks().add(link);
} catch (URIException e) {
logUriError(e, curi.getUURI(), uuri.toString());
}
}
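/**
* Logs a problematic URI via the configured {@link UriErrorLoggerModule}.
*
* @param e the URIException raised while handling the outlink
* @param uuri the source URI
* @param l the problematic outlink as found in the document
*/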
public void logUriError(URIException e, UURI uuri,
CharSequence l) {
loggerModule.logUriError(e, uuri, l);
}
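// Checkpointing: persist and restore the extracted-link counter so the
// tally in report() survives crawl checkpoints.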
@Override
protected JSONObject toCheckpointJson() throws JSONException {
JSONObject json = super.toCheckpointJson();
json.put("numberOfLinksExtracted", numberOfLinksExtracted.get());
return json;
}
@Override
protected void fromCheckpointJson(JSONObject json) throws JSONException {
super.fromCheckpointJson(json);
numberOfLinksExtracted.set(json.getLong("numberOfLinksExtracted"));
}
public String report() {
StringBuilder ret = new StringBuilder();
ret.append(super.report());
ret.append("  ").append(numberOfLinksExtracted.get());
ret.append(" links from ").append(getURICount()).append(" CrawlURIs\n");
return ret.toString();
}
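/**
* Adds a discovered outlink, resolving the given (possibly relative) URI
* against the CrawlURI's base URI, unless the max-outlinks limit has
* already been reached.
*/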
public static CrawlURI addRelativeToBase(CrawlURI uri, int max,
String newUri, LinkContext context, Hop hop) throws URIException {
UURI dest = UURIFactory.getInstance(uri.getBaseURI(), newUri);
return add2(uri, max, dest, context, hop);
}
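/**
* Adds a discovered outlink, resolving the given (possibly relative) URI
* against the CrawlURI's via URI; when no via is available, the base URI
* is used instead and the URI is annotated "usedBaseForVia" (once).
*/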
public static CrawlURI addRelativeToVia(CrawlURI uri, int max, String newUri,
LinkContext context, Hop hop) throws URIException {
UURI relTo = uri.getVia();
if (relTo == null) {
if (!uri.getAnnotations().contains("usedBaseForVia")) {
LOGGER.info("no via where expected; using base instead: " + uri);
uri.getAnnotations().add("usedBaseForVia");
}
relTo = uri.getBaseURI();
}
UURI dest = UURIFactory.getInstance(relTo, newUri);
return add2(uri, max, dest, context, hop);
}
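/**
* Adds a discovered outlink, treating the given URI as absolute,
* unless the max-outlinks limit has already been reached.
*/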
public static void add(CrawlURI uri, int max, String newUri,
LinkContext context, Hop hop) throws URIException {
UURI dest = UURIFactory.getInstance(newUri);
add2(uri, max, dest, context, hop);
}
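/**
* Shared helper: creates and adds the CrawlURI if the outlink count is
* still below max; otherwise counts it as a discarded outlink and
* returns null.
*/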
private static CrawlURI add2(CrawlURI curi, int max, UURI dest,
LinkContext context, Hop hop) throws URIException {
if (curi.getOutLinks().size() < max) {
CrawlURI link = curi.createCrawlURI(dest, context, hop);
curi.getOutLinks().add(link);
return link;
} else {
curi.incrementDiscardedOutLinks();
return null;
}
}
}