// org.archive.crawler.prefetch.RuntimeLimitEnforcer (Maven / Gradle / Ivy)
/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.crawler.prefetch;
import static org.archive.modules.fetcher.FetchStatusCodes.S_BLOCKED_BY_RUNTIME_LIMIT;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.CrawlStatus;
import org.archive.crawler.reporting.StatisticsTracker;
import org.archive.modules.CrawlURI;
import org.archive.modules.ProcessResult;
import org.archive.modules.Processor;
import org.springframework.beans.factory.annotation.Autowired;
/**
* A processor to enforce runtime limits on crawls.
*
* This processor extends and improves on the 'max-time' capability of Heritrix.
* Essentially, the 'Terminate job' option functions the same way as 'max-time'.
* The processor however also enables pausing when the runtime is exceeded and
* the blocking of all URIs.
*
*
* - Pause job - Pauses the crawl. A change (increase) to the
 * runtime duration will make it possible to resume the crawl.
* Attempts to resume the crawl without modifying the run time
* will cause it to be immediately paused again.
* - Terminate job - Terminates the job. Equivalent
* to using the max-time setting on the CrawlController.
* - Block URIs - Blocks each URI with an -5002
* (blocked by custom processor) fetch status code. This will
* cause all the URIs queued to wind up in the crawl.log.
*
*
* The processor allows variable runtime based on host (or other
* override/refinement criteria) however using such overrides only makes sense
* when using 'Block URIs' as pause and terminate will have global impact once
* encountered anywhere.
*
* @author Kristinn Sigurðsson
*/
public class RuntimeLimitEnforcer extends Processor {

    @SuppressWarnings("unused")
    private static final long serialVersionUID = 3L;

    protected static Logger logger = Logger.getLogger(
            RuntimeLimitEnforcer.class.getName());

    /**
     * The action that the processor takes once the runtime has elapsed.
     */
    public static enum Operation {
        /**
         * Pauses the crawl. A change (increase) to the runtime duration will
         * make it possible to resume the crawl. Attempts to resume the crawl
         * without modifying the run time will cause it to be immediately
         * paused again.
         */
        PAUSE,
        /**
         * Terminates the job. Equivalent to using the max-time setting on the
         * CrawlController.
         */
        TERMINATE,
        /**
         * Blocks each URI with an -5002 (blocked by custom processor) fetch
         * status code. This will cause all the URIs queued to wind up in the
         * crawl.log.
         */
        BLOCK_URIS
    };

    /**
     * The amount of time, in seconds, that the crawl will be allowed to run
     * before this processor performs its 'end operation.'
     */
    protected long runtimeSeconds = 24*60*60L; // 1 day

    /**
     * @return the allowed runtime, in seconds.
     */
    public long getRuntimeSeconds() {
        return this.runtimeSeconds;
    }

    /**
     * @param secs the allowed runtime, in seconds.
     */
    public void setRuntimeSeconds(long secs) {
        this.runtimeSeconds = secs;
    }

    /**
     * The action that the processor takes once the runtime has elapsed.
     *
     * Operation: Pause job - Pauses the crawl. A change (increase) to the
     * runtime duration will make it possible to resume the crawl. Attempts to
     * resume the crawl without modifying the run time will cause it to be
     * immediately paused again.
     *
     * Operation: Terminate job - Terminates the job. Equivalent to using the
     * max-time setting on the CrawlController.
     *
     * Operation: Block URIs - Blocks each URI with an -5002 (blocked by custom
     * processor) fetch status code. This will cause all the URIs queued to wind
     * up in the crawl.log.
     */
    protected Operation expirationOperation = Operation.PAUSE;

    public Operation getExpirationOperation() {
        return this.expirationOperation;
    }

    public void setExpirationOperation(Operation op) {
        this.expirationOperation = op;
    }

    /** Injected controller; used to request pause/stop on expiry. */
    protected CrawlController controller;

    public CrawlController getCrawlController() {
        return this.controller;
    }

    @Autowired
    public void setCrawlController(CrawlController controller) {
        this.controller = controller;
    }

    /** Injected stats source; supplies the crawl's elapsed time. */
    protected StatisticsTracker statisticsTracker;

    public StatisticsTracker getStatisticsTracker() {
        return this.statisticsTracker;
    }

    @Autowired
    public void setStatisticsTracker(StatisticsTracker statisticsTracker) {
        this.statisticsTracker = statisticsTracker;
    }

    public RuntimeLimitEnforcer() {
        super();
    }

    /**
     * This processor applies to every CrawlURI; the runtime check itself
     * happens in {@link #innerProcessResult(CrawlURI)}.
     */
    @Override
    protected boolean shouldProcess(CrawlURI puri) {
        return puri instanceof CrawlURI;
    }

    /**
     * Never invoked: this processor overrides
     * {@link #innerProcessResult(CrawlURI)} instead, because it may need to
     * return {@link ProcessResult#FINISH} to short-circuit the chain.
     */
    @Override
    protected void innerProcess(CrawlURI curi) {
        throw new AssertionError();
    }

    /**
     * Compare the crawl's elapsed time against the configured limit and, once
     * exceeded, apply the configured end operation.
     *
     * @param curi the URI currently being processed.
     * @return {@link ProcessResult#FINISH} when the URI was blocked by
     *         {@link Operation#BLOCK_URIS}; otherwise
     *         {@link ProcessResult#PROCEED}.
     * @throws InterruptedException per the Processor contract.
     */
    @Override
    protected ProcessResult innerProcessResult(CrawlURI curi)
            throws InterruptedException {
        CrawlController controller = getCrawlController();
        StatisticsTracker stats = getStatisticsTracker();
        long allowedRuntimeMs = getRuntimeSeconds() * 1000L;
        long currentRuntimeMs = stats.getCrawlElapsedTime();
        if (currentRuntimeMs > allowedRuntimeMs) {
            Operation op = getExpirationOperation();
            if (op == null) {
                // Misconfiguration: log it but let the URI proceed untouched.
                logger.log(Level.SEVERE, "Null value for end-operation "
                        + "when processing " + curi.toString());
            } else {
                // Enum identity dispatch; switch is idiomatic and NPE-safe
                // after the null guard above.
                switch (op) {
                case PAUSE:
                    controller.requestCrawlPause();
                    break;
                case TERMINATE:
                    controller.requestCrawlStop(
                            CrawlStatus.FINISHED_TIME_LIMIT);
                    break;
                case BLOCK_URIS:
                    // Mark the URI blocked so it still shows up in crawl.log,
                    // and stop further processing of it.
                    curi.setFetchStatus(S_BLOCKED_BY_RUNTIME_LIMIT);
                    curi.getAnnotations().add("Runtime exceeded "
                            + allowedRuntimeMs + "ms");
                    return ProcessResult.FINISH;
                }
            }
        }
        return ProcessResult.PROCEED;
    }
}