org.archive.modules.Processor Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of heritrix-modules Show documentation
This project contains some of the configurable modules used within the Heritrix application to crawl the web. The modules in this project can be used in applications other than Heritrix, however.
There is a newer version: 3.5.0
Show newest version
/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.modules;


import java.io.IOException;
import java.util.Set;
import java.util.concurrent.atomic.AtomicLong;

import org.apache.commons.httpclient.HttpStatus;
import org.archive.checkpointing.Checkpoint;
import org.archive.checkpointing.Checkpointable;
import org.archive.modules.credential.Credential;
import org.archive.modules.credential.HttpAuthenticationCredential;
import org.archive.modules.deciderules.AcceptDecideRule;
import org.archive.modules.deciderules.DecideResult;
import org.archive.modules.deciderules.DecideRule;
import org.archive.net.UURI;
import org.archive.spring.HasKeyedProperties;
import org.archive.spring.KeyedProperties;
import org.json.JSONException;
import org.json.JSONObject;
import org.springframework.beans.factory.BeanNameAware;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.Lifecycle;


/**
 * A processor of URIs.  The URI provides the context for the process; 
 * settings can be altered based on the URI.
 * 
 * @author pjack
 */
public abstract class Processor 
implements HasKeyedProperties, 
           Lifecycle, 
           BeanNameAware,
           Checkpointable {
    protected KeyedProperties kp = new KeyedProperties();
    public KeyedProperties getKeyedProperties() {
        return kp;
    }

    protected String beanName; 
    public String getBeanName() {
        return this.beanName;
    }
    public void setBeanName(String name) {
        this.beanName = name;
    }
    
    /** 
     * Whether or not this process will execute for a particular URI. 
     * If this is false for a URI, then the URI isn't processed,
     * regardless of what the DecideRules say.
     */
    {
        setEnabled(true);
    }
    public boolean getEnabled() {
        return (Boolean) kp.get("enabled");
    }
    public void setEnabled(boolean enabled) {
        kp.put("enabled",enabled);
    }
    
    
    /** 
     * Decide rule(s) (also particular to a URI) that determine whether 
     * or not a particular URI is processed here. If the rule(s) answer
     * REJECT, processing is skipped. (ACCEPT or PASS allow processing
     * to continue). 
     */
    {
        setShouldProcessRule(new AcceptDecideRule());
    }
    public DecideRule getShouldProcessRule() {
        return (DecideRule) kp.get("shouldProcessRule");
    }
    public void setShouldProcessRule(DecideRule rule) {
        kp.put("shouldProcessRule", rule);
    }

    /**
     * The number of URIs processed by this processor.
     */
    protected AtomicLong uriCount = new AtomicLong(0);

    
    /**
     * Processes the given URI.  First checks {@link #getEnabled()} and
     * {@link #getShouldProcessRule()}.  If getEnabled() returns false, then nothing happens.
     * If the shouldProcessRule indicates REJECT, then the 
     * {@link #innerRejectProcess(CrawlURI)} method is invoked, and
     * the process method returns.
     * 
     * Next, the {@link #shouldProcess(CrawlURI)} method is 
     * consulted to see if this Processor knows how to handle the given
     * URI.  If it returns false, then nothing futher occurs.
     * 
     * 
FIXME: Should innerRejectProcess be called when ENABLED is false,
     * or when shouldProcess returns false?  The previous Processor 
     * implementation didn't handle it that way.
     * 
     * Otherwise, the URI is considered valid.  This processor's count
     * of handled URIs is incremented, and the 
     * {@link #innerProcess(CrawlURI)} method is invoked to actually
     * perform the process.
     * 
     * @param uri  The URI to process
     * @throws  InterruptedException   if the thread is interrupted
     */
    public ProcessResult process(CrawlURI uri) 
    throws InterruptedException {
        if (!getEnabled()) {
            return ProcessResult.PROCEED;
        }
        
        if (getShouldProcessRule().decisionFor(uri) == DecideResult.REJECT) {
            innerRejectProcess(uri);
            return ProcessResult.PROCEED;
        }
        
        if (shouldProcess(uri)) {
            uriCount.incrementAndGet();
            return innerProcessResult(uri);
        } else {
            return ProcessResult.PROCEED;
        }
    }

    /**
     * Returns the number of URIs this processor has handled.  The returned
     * number does not include URIs that were rejected by the 
     * {@link #getEnabled()} flag, by the {@link #getShouldProcessRule()}, or by the 
     * {@link #shouldProcess(CrawlURI)} method.
     * 
     * @return  the number of URIs this processor has handled
     */
    public long getURICount() {
        return uriCount.get();
    }


    /**
     * Determines whether the given uri should be processed by this 
     * processor.  For instance, a processor that only works on HTML 
     * content might reject the URI if its content type is not 
     * "text/html", if its content length is zero, and so on.
     * 
     * @param uri   the URI to test
     * @return  true if this processor should process that uri; false if not
     */
    protected abstract boolean shouldProcess(CrawlURI uri);

    
    protected ProcessResult innerProcessResult(CrawlURI uri) 
    throws InterruptedException {
        innerProcess(uri);
        return ProcessResult.PROCEED;
    }

    /**
     * Actually performs the process.  By the time this method is invoked,
     * it is known that the given URI passes the {@link #getEnabled()}, the 
     * {@link #getShouldProcessRule()} and the {@link #shouldProcess(CrawlURI)}
     * tests.  
     * 
     * @param uri    the URI to process
     * @throws InterruptedException   if the thread is interrupted
     */
    protected abstract void innerProcess(CrawlURI uri) 
    throws InterruptedException;


    /**
     * Invoked after a URI has been rejected.  The default implementation
     * does nothing; subclasses may override to log rejects or something.
     * 
     * @param uri   the URI that was rejected
     * @throws InterruptedException   if the thread is interrupted
     */
    protected void innerRejectProcess(CrawlURI uri) 
    throws InterruptedException {        
    }


    public static String flattenVia(CrawlURI puri) {
        UURI uuri = puri.getVia();
        return (uuri == null) ? "" : uuri.toString();
    }

    
    public static boolean isSuccess(CrawlURI puri) {
        boolean result = false;
        int statusCode = puri.getFetchStatus();
        if (statusCode == HttpStatus.SC_UNAUTHORIZED &&
            hasHttpAuthenticationCredential(puri)) {
            result = false;
        } else {
            result = (statusCode > 0);
        }
        return result;        
    }
    
    
    public static long getRecordedSize(CrawlURI puri) {
        if (puri.getRecorder() == null) {
            return puri.getContentSize();
        } else {
            return puri.getRecorder().getRecordedInput().getSize();
        }
    }
    

    /**
     * @return True if we have an HttpAuthentication (rfc2617) payload.
     */
    public static boolean hasHttpAuthenticationCredential(CrawlURI puri) {
        Set credentials = puri.getCredentials();
        for (Credential ca: credentials) {
            if (ca instanceof HttpAuthenticationCredential) {
                return true;
            }
        }
        return false;
    }

    // FIXME: Raise to interface
    // FIXME: Internationalize somehow
    // FIXME: Pass in PrintWriter instead creating large in-memory strings
    public String report() {
        return "Processor: "+getClass().getName()+"\n";
    }
    
    protected boolean isRunning = false; 
    public boolean isRunning() {
        return isRunning;
    }

    public void start() {
        if(isRunning) {
            return; 
        }
        isRunning = true; 
        if(recoveryCheckpoint!=null) {
            try {
                JSONObject json = recoveryCheckpoint.loadJson(getBeanName());
                fromCheckpointJson(json); 
            } catch (JSONException e) {
                throw new RuntimeException(e);
            }
        }
    }


    public void stop() {
        isRunning = false; 
    }
    
    public void startCheckpoint(Checkpoint checkpointInProgress) {}
    
    public void doCheckpoint(Checkpoint checkpointInProgress) 
    throws IOException {
        try {
            JSONObject json = toCheckpointJson();
            checkpointInProgress.saveJson(beanName, json); 
        } catch(JSONException j) {
            // impossible
        } 
    }
  
    /**
     * Return a JSONObject of current stat that can be consulted 
     * on recovery to restore necessary values. 
     * 
     * @return JSONObject
     * @throws JSONException
     */
    protected JSONObject toCheckpointJson() throws JSONException {
        JSONObject json = new JSONObject();
        json.put("uriCount", getURICount());
        return json;
    }
    
    /**
     * Restore internal state from JSONObject stored at earlier
     * checkpoint-time.
     * 
     * @param json JSONObject
     * @throws JSONException
     */
    protected void fromCheckpointJson(JSONObject json) throws JSONException {
        uriCount.set(json.getLong("uriCount"));
    }
    
    public void finishCheckpoint(Checkpoint checkpointInProgress) {}
    
    protected Checkpoint recoveryCheckpoint;
    @Autowired(required=false)
    public void setRecoveryCheckpoint(Checkpoint checkpoint) {
        this.recoveryCheckpoint = checkpoint; 
    }
}