All downloads are free. The search and download features use the official Maven repository.

org.archive.crawler.framework.Scoper Maven / Gradle / Ivy

The newest version!
/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package org.archive.crawler.framework;

import java.util.logging.Logger;

import org.archive.crawler.reporting.CrawlerLoggerModule;
import org.archive.modules.CrawlURI;
import org.archive.modules.Processor;
import org.archive.modules.deciderules.DecideResult;
import org.archive.modules.deciderules.DecideRule;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.Lifecycle;

/**
 * Abstract base for Scopers: processors that test a {@link CrawlURI}
 * against a configured scope {@link DecideRule}.
 * When file logging is enabled, each ACCEPT/REJECT decision is also
 * written to a dedicated logger.
 * @author stack
 * @version $Date$, $Revision$
 */
public abstract class Scoper extends Processor implements Lifecycle {

    /** Rule consulted by {@link #isInScope(CrawlURI)}. */
    protected DecideRule scope;

    /**
     * @return the scope rule currently in use.
     */
    public DecideRule getScope() {
        return scope;
    }

    /**
     * @param scope the rule that decides whether a CrawlURI is in scope.
     */
    @Autowired
    public void setScope(DecideRule scope) {
        this.scope = scope;
    }

    /**
     * Logger for decisions; remains null unless {@link #start()} sets it up
     * (which it only does when log-to-file is enabled).
     */
    protected Logger fileLogger = null;

    {
        // Decision logging is off unless explicitly switched on.
        setLogToFile(false);
    }

    /**
     * @return whether decision logging to file is enabled, as stored under
     * the "logToFile" keyed property.
     */
    public boolean getLogToFile() {
        Boolean enabled = (Boolean) kp.get("logToFile");
        return enabled;
    }

    /**
     * If enabled, log decisions to file named logs/{spring-bean-id}.log. Format
     * is "[timestamp] [decision] [uri]" where decision is 'ACCEPT' or 'REJECT'.
     */
    public void setLogToFile(boolean enabled) {
        kp.put("logToFile", enabled);
    }

    /** Module asked by {@link #start()} to create the decision logger. */
    protected CrawlerLoggerModule loggerModule;

    /**
     * @return the logger module currently wired in.
     */
    public CrawlerLoggerModule getLoggerModule() {
        return loggerModule;
    }

    /**
     * @param loggerModule module used to set up the decision logger.
     */
    @Autowired
    public void setLoggerModule(CrawlerLoggerModule loggerModule) {
        this.loggerModule = loggerModule;
    }

    /**
     * Constructor.
     */
    public Scoper() {
        super();
    }

    /** True between a call to {@link #start()} and the next {@link #stop()}. */
    protected boolean isRunning = false;

    /**
     * Marks this Scoper as running. On the transition from stopped to
     * running, the decision logger is created if log-to-file is enabled and
     * no logger exists yet. Calling this while already running does nothing.
     */
    public void start() {
        if (!isRunning) {
            boolean needsLogger = getLogToFile() && fileLogger == null;
            if (needsLogger) {
                fileLogger = loggerModule.setupSimpleLog(getBeanName());
            }
            isRunning = true;
        }
    }

    /**
     * @return true if {@link #start()} has been called more recently than
     * {@link #stop()}.
     */
    public boolean isRunning() {
        return isRunning;
    }

    /**
     * Marks this Scoper as no longer running. The decision logger, if any,
     * is left in place.
     */
    public void stop() {
        isRunning = false;
    }

    /**
     * Test the given {@link CrawlURI CrawlURI} against the crawl scope.
     * An accepted URI is logged as "ACCEPT" when a decision logger exists;
     * any other decision is handed to {@link #outOfScope(CrawlURI)}.
     * @param caUri The CrawlURI to be tested.
     * @return true if CrawlURI was accepted by crawl scope, false
     * otherwise.
     */
    protected boolean isInScope(CrawlURI caUri) {
        boolean accepted = scope.decisionFor(caUri) == DecideResult.ACCEPT;
        if (!accepted) {
            outOfScope(caUri);
            return false;
        }
        if (fileLogger != null) {
            fileLogger.info("ACCEPT " + caUri);
        }
        return true;
    }

    /**
     * Called when a CrawlURI is ruled out of scope.
     * Logs the URI as "REJECT" when a decision logger exists.
     * Override if you don't want logs as coming from this class.
     * @param caUri CrawlURI that is out of scope.
     */
    protected void outOfScope(CrawlURI caUri) {
        if (fileLogger == null) {
            return;
        }
        fileLogger.info("REJECT " + caUri);
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy