// org.archive.crawler.postprocessor.DispositionProcessor
/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.crawler.postprocessor;
import static org.archive.modules.CoreAttributeConstants.A_FETCH_BEGAN_TIME;
import static org.archive.modules.CoreAttributeConstants.A_FETCH_COMPLETED_TIME;
import static org.archive.modules.fetcher.FetchStatusCodes.S_CONNECT_FAILED;
import static org.archive.modules.fetcher.FetchStatusCodes.S_CONNECT_LOST;
import static org.archive.modules.fetcher.FetchStatusCodes.S_DEEMED_NOT_FOUND;
import static org.archive.modules.fetcher.FetchStatusCodes.S_DEFERRED;
import java.util.Map;
import java.util.logging.Logger;
import org.apache.commons.httpclient.URIException;
import org.archive.modules.CrawlMetadata;
import org.archive.modules.CrawlURI;
import org.archive.modules.Processor;
import org.archive.modules.net.CrawlHost;
import org.archive.modules.net.CrawlServer;
import org.archive.modules.net.IgnoreRobotsPolicy;
import org.archive.modules.net.Robotstxt;
import org.archive.modules.net.ServerCache;
import org.springframework.beans.factory.annotation.Autowired;
/**
* A step, late in the processing of a CrawlURI, for marking-up the
* CrawlURI with values to affect frontier disposition, and updating
* information that may have been affected by the fetch. This includes
* robots info and other stats.
*
* (Formerly called CrawlStateUpdater, when it did less.)
*
* @author gojomo
* @version $Date$, $Revision$
*/
public class DispositionProcessor extends Processor {
@SuppressWarnings("unused")
private static final long serialVersionUID = -1072728147960180091L;
private static final Logger logger =
Logger.getLogger(DispositionProcessor.class.getName());
protected ServerCache serverCache;
public ServerCache getServerCache() {
return this.serverCache;
}
@Autowired
public void setServerCache(ServerCache serverCache) {
this.serverCache = serverCache;
}
{
setDelayFactor(5.0f);
}
public float getDelayFactor() {
return (Float) kp.get("delayFactor");
}
/**
* How many multiples of last fetch elapsed time to wait before recontacting
* same server.
*/
public void setDelayFactor(float factor) {
kp.put("delayFactor",factor);
}
{
setMinDelayMs(3000);
}
public int getMinDelayMs() {
return (Integer) kp.get("minDelayMs");
}
/**
* always wait this long after one completion before recontacting same
* server, regardless of multiple
*/
public void setMinDelayMs(int minDelay) {
kp.put("minDelayMs",minDelay);
}
{
setRespectCrawlDelayUpToSeconds(300);
}
public int getRespectCrawlDelayUpToSeconds() {
return (Integer) kp.get("respectCrawlDelayUpToSeconds");
}
/**
* Whether to respect a 'Crawl-Delay' (in seconds) given in a site's
* robots.txt
*/
public void setRespectCrawlDelayUpToSeconds(int respect) {
kp.put("respectCrawlDelayUpToSeconds",respect);
}
{
setMaxDelayMs(30000);
}
public int getMaxDelayMs() {
return (Integer) kp.get("maxDelayMs");
}
/** never wait more than this long, regardless of multiple */
public void setMaxDelayMs(int maxDelay) {
kp.put("maxDelayMs",maxDelay);
}
{
setMaxPerHostBandwidthUsageKbSec(0);
}
public int getMaxPerHostBandwidthUsageKbSec() {
return (Integer) kp.get("maxPerHostBandwidthUsageKbSec");
}
/** maximum per-host bandwidth usage */
public void setMaxPerHostBandwidthUsageKbSec(int max) {
kp.put("maxPerHostBandwidthUsageKbSec",max);
}
{
setForceRetire(false);
}
public boolean getForceRetire() {
return (Boolean) kp.get("forceRetire");
}
/**
* Whether to set a CrawlURI's force-retired directive, retiring
* its queue when it finishes. Mainly intended for URI-specific
* overlay settings; setting true globally will just retire all queues
* after they offer one URI, rapidly ending a crawl.
*/
public void setForceRetire(boolean force) {
kp.put("forceRetire",force);
}
protected CrawlMetadata metadata;
public CrawlMetadata getMetadata() {
return metadata;
}
/**
* Auto-discovered module providing configured (or overridden)
* User-Agent value and RobotsHonoringPolicy
*/
@Autowired
public void setMetadata(CrawlMetadata provider) {
this.metadata = provider;
}
public DispositionProcessor() {
super();
}
@Override
protected boolean shouldProcess(CrawlURI puri) {
return true;
}
@Override
protected void innerProcess(CrawlURI curi) {
// Tally per-server, per-host, per-frontier-class running totals
CrawlServer server = serverCache.getServerFor(curi.getUURI());
String scheme = curi.getUURI().getScheme().toLowerCase();
if (scheme.equals("http") || scheme.equals("https") &&
server != null) {
// Update connection problems counter
if(curi.getFetchStatus() == S_CONNECT_FAILED || curi.getFetchStatus() == S_CONNECT_LOST ) {
server.incrementConsecutiveConnectionErrors();
} else if (curi.getFetchStatus() > 0){
server.resetConsecutiveConnectionErrors();
}
// Update robots info
try {
if ("/robots.txt".equals(curi.getUURI().getPath()) && curi.getFetchStatus() != S_DEFERRED) {
// shortcut retries w/ DEEMED when ignore-all
if (metadata.getRobotsPolicy() instanceof IgnoreRobotsPolicy) {
if(curi.getFetchStatus() < 0 && curi.getFetchStatus()!=S_DEFERRED) {
// prevent the rest of the usual retries
curi.setFetchStatus(S_DEEMED_NOT_FOUND);
}
}
// Update server with robots info
// NOTE: in some cases the curi's status can be changed here
server.updateRobots(curi);
}
}
catch (URIException e) {
logger.severe("Failed get path on " + curi.getUURI());
}
}
// set politeness delay
curi.setPolitenessDelay(politenessDelayFor(curi));
// consider operator-set force-retire
if (getForceRetire()) {
curi.setForceRetire(true);
}
// TODO: set other disposition decisions
// success, failure, retry(retry-delay)
}
/**
* Update any scheduling structures with the new information in this
* CrawlURI. Chiefly means make necessary arrangements for no other URIs at
* the same host to be visited within the appropriate politeness window.
*
* @param curi
* The CrawlURI
* @return millisecond politeness delay
*/
protected long politenessDelayFor(CrawlURI curi) {
long durationToWait = 0;
Map cdata = curi.getData();
if (cdata.containsKey(A_FETCH_BEGAN_TIME)
&& cdata.containsKey(A_FETCH_COMPLETED_TIME)) {
long completeTime = curi.getFetchCompletedTime();
long durationTaken = (completeTime - curi.getFetchBeginTime());
durationToWait = (long)(getDelayFactor() * durationTaken);
long minDelay = getMinDelayMs();
if (minDelay > durationToWait) {
// wait at least the minimum
durationToWait = minDelay;
}
long maxDelay = getMaxDelayMs();
if (durationToWait > maxDelay) {
// wait no more than the maximum
durationToWait = maxDelay;
}
long respectThreshold = getRespectCrawlDelayUpToSeconds() * 1000;
if (durationToWait respectThreshold)
? respectThreshold
: crawlDelay;
if (crawlDelay > durationToWait) {
// wait at least the directive crawl-delay
durationToWait = crawlDelay;
}
}
}
long now = System.currentTimeMillis();
int maxBandwidthKB = getMaxPerHostBandwidthUsageKbSec();
if (maxBandwidthKB > 0) {
// Enforce bandwidth limit
ServerCache cache = this.getServerCache();
CrawlHost host = cache.getHostFor(curi.getUURI());
long minDurationToWait = host.getEarliestNextURIEmitTime()
- now;
float maxBandwidth = maxBandwidthKB * 1.024F; // kilo factor
long processedBytes = curi.getContentSize();
host
.setEarliestNextURIEmitTime((long)(processedBytes / maxBandwidth)
+ now);
if (minDurationToWait > durationToWait) {
durationToWait = minDurationToWait;
}
}
}
return durationToWait;
}
}