// org.archive.crawler.postprocessor.DispositionProcessor
/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.crawler.postprocessor;
import static org.archive.modules.CoreAttributeConstants.A_FETCH_BEGAN_TIME;
import static org.archive.modules.CoreAttributeConstants.A_FETCH_COMPLETED_TIME;
import static org.archive.modules.fetcher.FetchStatusCodes.S_CONNECT_FAILED;
import static org.archive.modules.fetcher.FetchStatusCodes.S_CONNECT_LOST;
import static org.archive.modules.fetcher.FetchStatusCodes.S_DEEMED_NOT_FOUND;
import static org.archive.modules.fetcher.FetchStatusCodes.S_DEFERRED;
import java.util.Map;
import java.util.logging.Logger;
import org.apache.commons.httpclient.URIException;
import org.archive.modules.CrawlMetadata;
import org.archive.modules.CrawlURI;
import org.archive.modules.Processor;
import org.archive.modules.net.CrawlHost;
import org.archive.modules.net.CrawlServer;
import org.archive.modules.net.IgnoreRobotsPolicy;
import org.archive.modules.net.Robotstxt;
import org.archive.modules.net.ServerCache;
import org.springframework.beans.factory.annotation.Autowired;
/**
* A step, late in the processing of a CrawlURI, for marking-up the
* CrawlURI with values to affect frontier disposition, and updating
* information that may have been affected by the fetch. This includes
* robots info and other stats.
*
* (Formerly called CrawlStateUpdater, when it did less.)
*
* @author gojomo
* @version $Date$, $Revision$
*/
public class DispositionProcessor extends Processor {
@SuppressWarnings("unused")
private static final long serialVersionUID = -1072728147960180091L;
private static final Logger logger =
Logger.getLogger(DispositionProcessor.class.getName());
protected ServerCache serverCache;
public ServerCache getServerCache() {
return this.serverCache;
}
@Autowired
public void setServerCache(ServerCache serverCache) {
this.serverCache = serverCache;
}
{
setDelayFactor(5.0f);
}
public float getDelayFactor() {
return (Float) kp.get("delayFactor");
}
/**
* How many multiples of last fetch elapsed time to wait before recontacting
* same server.
*/
public void setDelayFactor(float factor) {
kp.put("delayFactor",factor);
}
{
setMinDelayMs(3000);
}
public int getMinDelayMs() {
return (Integer) kp.get("minDelayMs");
}
/**
* always wait this long after one completion before recontacting same
* server, regardless of multiple
*/
public void setMinDelayMs(int minDelay) {
kp.put("minDelayMs",minDelay);
}
{
setRespectCrawlDelayUpToSeconds(300);
}
public int getRespectCrawlDelayUpToSeconds() {
return (Integer) kp.get("respectCrawlDelayUpToSeconds");
}
/**
* Whether to respect a 'Crawl-Delay' (in seconds) given in a site's
* robots.txt
*/
public void setRespectCrawlDelayUpToSeconds(int respect) {
kp.put("respectCrawlDelayUpToSeconds",respect);
}
{
setMaxDelayMs(30000);
}
public int getMaxDelayMs() {
return (Integer) kp.get("maxDelayMs");
}
/** never wait more than this long, regardless of multiple */
public void setMaxDelayMs(int maxDelay) {
kp.put("maxDelayMs",maxDelay);
}
{
setMaxPerHostBandwidthUsageKbSec(0);
}
public int getMaxPerHostBandwidthUsageKbSec() {
return (Integer) kp.get("maxPerHostBandwidthUsageKbSec");
}
/** maximum per-host bandwidth usage */
public void setMaxPerHostBandwidthUsageKbSec(int max) {
kp.put("maxPerHostBandwidthUsageKbSec",max);
}
{
setForceRetire(false);
}
public boolean getForceRetire() {
return (Boolean) kp.get("forceRetire");
}
/**
* Whether to set a CrawlURI's force-retired directive, retiring
* its queue when it finishes. Mainly intended for URI-specific
* overlay settings; setting true globally will just retire all queues
* after they offer one URI, rapidly ending a crawl.
*/
public void setForceRetire(boolean force) {
kp.put("forceRetire",force);
}
protected CrawlMetadata metadata;
public CrawlMetadata getMetadata() {
return metadata;
}
/**
* Auto-discovered module providing configured (or overridden)
* User-Agent value and RobotsHonoringPolicy
*/
@Autowired
public void setMetadata(CrawlMetadata provider) {
this.metadata = provider;
}
public DispositionProcessor() {
super();
}
@Override
protected boolean shouldProcess(CrawlURI puri) {
return true;
}
@Override
protected void innerProcess(CrawlURI curi) {
// Tally per-server, per-host, per-frontier-class running totals
CrawlServer server = serverCache.getServerFor(curi.getUURI());
String scheme = curi.getUURI().getScheme().toLowerCase();
if (scheme.equals("http") || scheme.equals("https") &&
server != null) {
// Update connection problems counter
if(curi.getFetchStatus() == S_CONNECT_FAILED || curi.getFetchStatus() == S_CONNECT_LOST ) {
server.incrementConsecutiveConnectionErrors();
} else if (curi.getFetchStatus() > 0){
server.resetConsecutiveConnectionErrors();
}
// Update robots info
try {
if ("/robots.txt".equals(curi.getUURI().getPath()) && curi.getFetchStatus() != S_DEFERRED) {
// shortcut retries w/ DEEMED when ignore-all
if (metadata.getRobotsPolicy() instanceof IgnoreRobotsPolicy) {
if(curi.getFetchStatus() < 0 && curi.getFetchStatus()!=S_DEFERRED) {
// prevent the rest of the usual retries
curi.setFetchStatus(S_DEEMED_NOT_FOUND);
}
}
// Update server with robots info
// NOTE: in some cases the curi's status can be changed here
server.updateRobots(curi);
}
}
catch (URIException e) {
logger.severe("Failed get path on " + curi.getUURI());
}
}
// set politeness delay
curi.setPolitenessDelay(politenessDelayFor(curi));
// consider operator-set force-retire
if (getForceRetire()) {
curi.setForceRetire(true);
}
// TODO: set other disposition decisions
// success, failure, retry(retry-delay)
}
/**
* Update any scheduling structures with the new information in this
* CrawlURI. Chiefly means make necessary arrangements for no other URIs at
* the same host to be visited within the appropriate politeness window.
*
* @param curi
* The CrawlURI
* @return millisecond politeness delay
*/
protected long politenessDelayFor(CrawlURI curi) {
long durationToWait = 0;
Map cdata = curi.getData();
if (cdata.containsKey(A_FETCH_BEGAN_TIME)
&& cdata.containsKey(A_FETCH_COMPLETED_TIME)) {
long completeTime = curi.getFetchCompletedTime();
long durationTaken = (completeTime - curi.getFetchBeginTime());
durationToWait = (long)(getDelayFactor() * durationTaken);
long minDelay = getMinDelayMs();
if (minDelay > durationToWait) {
// wait at least the minimum
durationToWait = minDelay;
}
long maxDelay = getMaxDelayMs();
if (durationToWait > maxDelay) {
// wait no more than the maximum
durationToWait = maxDelay;
}
long respectThreshold = getRespectCrawlDelayUpToSeconds() * 1000;
if (durationToWait respectThreshold)
? respectThreshold
: crawlDelay;
if (crawlDelay > durationToWait) {
// wait at least the directive crawl-delay
durationToWait = crawlDelay;
}
}
}
long now = System.currentTimeMillis();
int maxBandwidthKB = getMaxPerHostBandwidthUsageKbSec();
if (maxBandwidthKB > 0) {
// Enforce bandwidth limit
ServerCache cache = this.getServerCache();
CrawlHost host = cache.getHostFor(curi.getUURI());
long minDurationToWait = host.getEarliestNextURIEmitTime()
- now;
float maxBandwidth = maxBandwidthKB * 1.024F; // kilo factor
long processedBytes = curi.getContentSize();
host
.setEarliestNextURIEmitTime((long)(processedBytes / maxBandwidth)
+ now);
if (minDurationToWait > durationToWait) {
durationToWait = minDurationToWait;
}
}
}
return durationToWait;
}
}