
org.archive.crawler.prefetch.RuntimeLimitEnforcer Maven / Gradle / Ivy

/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.crawler.prefetch;


import static org.archive.modules.fetcher.FetchStatusCodes.S_BLOCKED_BY_RUNTIME_LIMIT;

import java.util.logging.Level;
import java.util.logging.Logger;

import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.framework.CrawlStatus;
import org.archive.crawler.reporting.StatisticsTracker;
import org.archive.modules.CrawlURI;
import org.archive.modules.ProcessResult;
import org.archive.modules.Processor;
import org.springframework.beans.factory.annotation.Autowired;


/**
 * A processor to enforce runtime limits on crawls.
 * <p>
 * This processor extends and improves on the 'max-time' capability of
 * Heritrix. Essentially, the 'Terminate job' option functions the same way
 * as 'max-time'. The processor however also enables pausing when the runtime
 * is exceeded and the blocking of all URIs.
 * <p>
 * <ol>
 * <li>Pause job - Pauses the crawl. A change (increase) to the runtime
 *     duration will make it possible to resume the crawl. Attempts to resume
 *     the crawl without modifying the run time will cause it to be
 *     immediately paused again.</li>
 * <li>Terminate job - Terminates the job. Equivalent to using the max-time
 *     setting on the CrawlController.</li>
 * <li>Block URIs - Blocks each URI with an -5002 (blocked by custom
 *     processor) fetch status code. This will cause all the URIs queued to
 *     wind up in the crawl.log.</li>
 * </ol>
 * <p>
 * The processor allows variable runtime based on host (or other
 * override/refinement criteria); however, using such overrides only makes
 * sense when using 'Block URIs', as pause and terminate will have global
 * impact once encountered anywhere.
 *
 * @author Kristinn Sigurðsson
 */
public class RuntimeLimitEnforcer extends Processor {

    @SuppressWarnings("unused")
    private static final long serialVersionUID = 3L;

    protected static Logger logger = Logger.getLogger(
            RuntimeLimitEnforcer.class.getName());

    /**
     * The action that the processor takes once the runtime has elapsed.
     */
    public static enum Operation {
        /**
         * Pauses the crawl. A change (increase) to the runtime duration will
         * make it possible to resume the crawl. Attempts to resume the crawl
         * without modifying the run time will cause it to be immediately
         * paused again.
         */
        PAUSE,
        /**
         * Terminates the job. Equivalent to using the max-time setting on
         * the CrawlController.
         */
        TERMINATE,
        /**
         * Blocks each URI with an -5002 (blocked by custom processor) fetch
         * status code. This will cause all the URIs queued to wind up in the
         * crawl.log.
         */
        BLOCK_URIS
    };

    /**
     * The amount of time, in seconds, that the crawl will be allowed to run
     * before this processor performs its 'end operation'.
     */
    protected long runtimeSeconds = 24*60*60L; // 1 day
    public long getRuntimeSeconds() {
        return this.runtimeSeconds;
    }
    public void setRuntimeSeconds(long secs) {
        this.runtimeSeconds = secs;
    }

    /**
     * The action that the processor takes once the runtime has elapsed.
     * <p>
     * Operation: Pause job - Pauses the crawl. A change (increase) to the
     * runtime duration will make it possible to resume the crawl. Attempts
     * to resume the crawl without modifying the run time will cause it to be
     * immediately paused again.
     * <p>
     * Operation: Terminate job - Terminates the job. Equivalent to using the
     * max-time setting on the CrawlController.
     * <p>
     * Operation: Block URIs - Blocks each URI with an -5002 (blocked by
     * custom processor) fetch status code. This will cause all the URIs
     * queued to wind up in the crawl.log.
     */
    protected Operation expirationOperation = Operation.PAUSE;
    public Operation getExpirationOperation() {
        return this.expirationOperation;
    }
    public void setExpirationOperation(Operation op) {
        this.expirationOperation = op;
    }

    protected CrawlController controller;
    public CrawlController getCrawlController() {
        return this.controller;
    }
    @Autowired
    public void setCrawlController(CrawlController controller) {
        this.controller = controller;
    }

    protected StatisticsTracker statisticsTracker;
    public StatisticsTracker getStatisticsTracker() {
        return this.statisticsTracker;
    }
    @Autowired
    public void setStatisticsTracker(StatisticsTracker statisticsTracker) {
        this.statisticsTracker = statisticsTracker;
    }

    public RuntimeLimitEnforcer() {
        super();
    }

    @Override
    protected boolean shouldProcess(CrawlURI puri) {
        return puri instanceof CrawlURI;
    }

    @Override
    protected void innerProcess(CrawlURI curi) {
        throw new AssertionError();
    }

    @Override
    protected ProcessResult innerProcessResult(CrawlURI curi)
            throws InterruptedException {
        CrawlController controller = getCrawlController();
        StatisticsTracker stats = getStatisticsTracker();
        long allowedRuntimeMs = getRuntimeSeconds() * 1000L;
        long currentRuntimeMs = stats.getCrawlElapsedTime();
        if (currentRuntimeMs > allowedRuntimeMs) {
            Operation op = getExpirationOperation();
            if (op != null) {
                if (op.equals(Operation.PAUSE)) {
                    controller.requestCrawlPause();
                } else if (op.equals(Operation.TERMINATE)) {
                    controller.requestCrawlStop(CrawlStatus.FINISHED_TIME_LIMIT);
                } else if (op.equals(Operation.BLOCK_URIS)) {
                    curi.setFetchStatus(S_BLOCKED_BY_RUNTIME_LIMIT);
                    curi.getAnnotations().add(
                            "Runtime exceeded " + allowedRuntimeMs + "ms");
                    return ProcessResult.FINISH;
                }
            } else {
                logger.log(Level.SEVERE, "Null value for end-operation "
                        + " when processing " + curi.toString());
            }
        }
        return ProcessResult.PROCEED;
    }
}
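
For reference, the sketch below shows one way the setters above could be used to configure the processor. It is a minimal, illustrative sketch only: the class and method names (RuntimeLimitEnforcerExample, twelveHourBlockingEnforcer) are invented here, and in a real Heritrix 3 job the processor would normally be declared as a bean in the job configuration so that the @Autowired setters are satisfied by the Spring application context rather than called by hand.

package org.archive.crawler.prefetch;

import org.archive.crawler.framework.CrawlController;
import org.archive.crawler.reporting.StatisticsTracker;

/**
 * Illustrative wiring only (hypothetical helper, not part of Heritrix).
 * The controller and statistics tracker are assumed to come from the
 * crawl job's existing context.
 */
public class RuntimeLimitEnforcerExample {

    public static RuntimeLimitEnforcer twelveHourBlockingEnforcer(
            CrawlController controller, StatisticsTracker stats) {
        RuntimeLimitEnforcer enforcer = new RuntimeLimitEnforcer();
        // Allow 12 hours of crawl time instead of the 1-day default.
        enforcer.setRuntimeSeconds(12 * 60 * 60L);
        // Block remaining URIs with -5002 rather than pausing, so every
        // queued URI still shows up in the crawl.log once the limit hits.
        enforcer.setExpirationOperation(
                RuntimeLimitEnforcer.Operation.BLOCK_URIS);
        // In a job context these two would be injected via @Autowired.
        enforcer.setCrawlController(controller);
        enforcer.setStatisticsTracker(stats);
        return enforcer;
    }
}

Note that for BLOCK_URIS the processor returns ProcessResult.FINISH, which ends normal processing of that URI while still recording it, and the org.archive.crawler.prefetch package name suggests it is meant to sit early in the processing chain, before fetching.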




