// org.archive.crawler.framework.Scoper Maven / Gradle / Ivy
// The newest version!
/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.crawler.framework;
import java.util.logging.Logger;
import org.archive.crawler.reporting.CrawlerLoggerModule;
import org.archive.modules.CrawlURI;
import org.archive.modules.Processor;
import org.archive.modules.deciderules.DecideResult;
import org.archive.modules.deciderules.DecideRule;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.Lifecycle;
/**
 * Abstract base for processors that test {@link CrawlURI}s against a
 * configured scope {@link DecideRule}.
 *
 * <p>Each decision can optionally be written to a dedicated log: when the
 * <code>logToFile</code> setting is enabled at {@link #start()} time, a
 * logger is obtained from the {@link CrawlerLoggerModule} and every
 * ACCEPT / REJECT outcome is reported to it.
 *
 * @author stack
 * @version $Date$, $Revision$
 */
public abstract class Scoper extends Processor implements Lifecycle {

    /** Rule consulted by {@link #isInScope(CrawlURI)}. */
    protected DecideRule scope;

    /**
     * Destination for decision lines; stays <code>null</code> until
     * {@link #start()} creates it, which only happens when file logging
     * is enabled.
     */
    protected Logger fileLogger = null;

    // Instance initializer: file logging is off unless configured otherwise.
    {
        setLogToFile(false);
    }

    /** Module used by {@link #start()} to create {@link #fileLogger}. */
    protected CrawlerLoggerModule loggerModule;

    /** Lifecycle flag maintained by {@link #start()} and {@link #stop()}. */
    protected boolean isRunning = false;

    /**
     * Creates a new Scoper. The superclass no-argument constructor is
     * invoked implicitly.
     */
    public Scoper() {
    }

    /**
     * @return the rule that defines what is in scope
     */
    public DecideRule getScope() {
        return scope;
    }

    /**
     * @param scope the rule that defines what is in scope
     */
    @Autowired
    public void setScope(DecideRule scope) {
        this.scope = scope;
    }

    /**
     * @return whether decisions should be logged to file, as stored under
     * the "logToFile" key of this processor's keyed properties
     */
    public boolean getLogToFile() {
        Boolean enabled = (Boolean) kp.get("logToFile");
        return enabled;
    }

    /**
     * If enabled, log decisions to file named logs/{spring-bean-id}.log. Format
     * is "[timestamp] [decision] [uri]" where decision is 'ACCEPT' or 'REJECT'.
     * The setting is consulted when {@link #start()} runs.
     *
     * @param enabled true to log decisions to file
     */
    public void setLogToFile(boolean enabled) {
        kp.put("logToFile", enabled);
    }

    /**
     * @return the module used to create the decision log
     */
    public CrawlerLoggerModule getLoggerModule() {
        return loggerModule;
    }

    /**
     * @param loggerModule the module used to create the decision log
     */
    @Autowired
    public void setLoggerModule(CrawlerLoggerModule loggerModule) {
        this.loggerModule = loggerModule;
    }

    /**
     * Marks this Scoper as running. Does nothing if it is already running.
     * On the way up, if file logging is enabled and no file logger exists
     * yet, one is requested from the logger module under this bean's name.
     */
    public void start() {
        if (!isRunning) {
            boolean wantsFileLog = getLogToFile();
            if (wantsFileLog && fileLogger == null) {
                fileLogger = loggerModule.setupSimpleLog(getBeanName());
            }
            isRunning = true;
        }
    }

    /**
     * @return true between a call to {@link #start()} and the next call to
     * {@link #stop()}
     */
    public boolean isRunning() {
        return isRunning;
    }

    /**
     * Marks this Scoper as stopped. Only the running flag is cleared; an
     * already-created file logger is kept.
     */
    public void stop() {
        this.isRunning = false;
    }

    /**
     * Tests the given {@link CrawlURI CrawlURI} against the configured scope.
     * An ACCEPT decision is reported to the file logger, if one exists; any
     * other decision is handed to {@link #outOfScope(CrawlURI)}.
     *
     * @param caUri The CrawlURI to test.
     * @return true if the scope's decision for the CrawlURI is ACCEPT, false
     * otherwise.
     */
    protected boolean isInScope(CrawlURI caUri) {
        DecideResult decision = scope.decisionFor(caUri);
        if (decision != DecideResult.ACCEPT) {
            outOfScope(caUri);
            return false;
        }
        if (fileLogger != null) {
            fileLogger.info("ACCEPT " + caUri);
        }
        return true;
    }

    /**
     * Hook invoked by {@link #isInScope(CrawlURI)} for every CrawlURI that
     * was not accepted. This implementation reports a REJECT line to the
     * file logger, if one exists.
     * Override if you don't want logs as coming from this class.
     *
     * @param caUri CrawlURI that is out of scope.
     */
    protected void outOfScope(CrawlURI caUri) {
        if (fileLogger == null) {
            return;
        }
        fileLogger.info("REJECT " + caUri);
    }
}