// All Downloads are FREE. Search and download functionalities are using the official Maven repository.

// org.archive.modules.Processor Maven / Gradle / Ivy

/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.modules;


import java.io.IOException;
import java.util.Set;
import java.util.concurrent.atomic.AtomicLong;

import org.apache.commons.httpclient.HttpStatus;
import org.archive.checkpointing.Checkpoint;
import org.archive.checkpointing.Checkpointable;
import org.archive.modules.credential.Credential;
import org.archive.modules.credential.HttpAuthenticationCredential;
import org.archive.modules.deciderules.AcceptDecideRule;
import org.archive.modules.deciderules.DecideResult;
import org.archive.modules.deciderules.DecideRule;
import org.archive.net.UURI;
import org.archive.spring.HasKeyedProperties;
import org.archive.spring.KeyedProperties;
import org.json.JSONException;
import org.json.JSONObject;
import org.springframework.beans.factory.BeanNameAware;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.Lifecycle;


/**
 * A processor of URIs.  The URI provides the context for the process; 
 * settings can be altered based on the URI.
 * 
 * @author pjack
 */
public abstract class Processor 
implements HasKeyedProperties, 
           Lifecycle, 
           BeanNameAware,
           Checkpointable {
    protected KeyedProperties kp = new KeyedProperties();
    public KeyedProperties getKeyedProperties() {
        return kp;
    }

    protected String beanName; 
    public String getBeanName() {
        return this.beanName;
    }
    public void setBeanName(String name) {
        this.beanName = name;
    }
    
    /** 
     * Whether or not this process will execute for a particular URI. 
     * If this is false for a URI, then the URI isn't processed,
     * regardless of what the DecideRules say.
     */
    {
        setEnabled(true);
    }
    public boolean getEnabled() {
        return (Boolean) kp.get("enabled");
    }
    public void setEnabled(boolean enabled) {
        kp.put("enabled",enabled);
    }
    
    
    /** 
     * Decide rule(s) (also particular to a URI) that determine whether 
     * or not a particular URI is processed here. If the rule(s) answer
     * REJECT, processing is skipped. (ACCEPT or PASS allow processing
     * to continue). 
     */
    {
        setShouldProcessRule(new AcceptDecideRule());
    }
    public DecideRule getShouldProcessRule() {
        return (DecideRule) kp.get("shouldProcessRule");
    }
    public void setShouldProcessRule(DecideRule rule) {
        kp.put("shouldProcessRule", rule);
    }

    /**
     * The number of URIs processed by this processor.
     */
    protected AtomicLong uriCount = new AtomicLong(0);

    
    /**
     * Processes the given URI.  First checks {@link #getEnabled()} and
     * {@link #getShouldProcessRule()}.  If getEnabled() returns false, then nothing happens.
     * If the shouldProcessRule indicates REJECT, then the 
     * {@link #innerRejectProcess(CrawlURI)} method is invoked, and
     * the process method returns.
     * 
     * 

Next, the {@link #shouldProcess(CrawlURI)} method is * consulted to see if this Processor knows how to handle the given * URI. If it returns false, then nothing futher occurs. * *

FIXME: Should innerRejectProcess be called when ENABLED is false, * or when shouldProcess returns false? The previous Processor * implementation didn't handle it that way. * *

Otherwise, the URI is considered valid. This processor's count * of handled URIs is incremented, and the * {@link #innerProcess(CrawlURI)} method is invoked to actually * perform the process. * * @param uri The URI to process * @throws InterruptedException if the thread is interrupted */ public ProcessResult process(CrawlURI uri) throws InterruptedException { if (!getEnabled()) { return ProcessResult.PROCEED; } if (getShouldProcessRule().decisionFor(uri) == DecideResult.REJECT) { innerRejectProcess(uri); return ProcessResult.PROCEED; } if (shouldProcess(uri)) { uriCount.incrementAndGet(); return innerProcessResult(uri); } else { return ProcessResult.PROCEED; } } /** * Returns the number of URIs this processor has handled. The returned * number does not include URIs that were rejected by the * {@link #getEnabled()} flag, by the {@link #getShouldProcessRule()}, or by the * {@link #shouldProcess(CrawlURI)} method. * * @return the number of URIs this processor has handled */ public long getURICount() { return uriCount.get(); } /** * Determines whether the given uri should be processed by this * processor. For instance, a processor that only works on HTML * content might reject the URI if its content type is not * "text/html", if its content length is zero, and so on. * * @param uri the URI to test * @return true if this processor should process that uri; false if not */ protected abstract boolean shouldProcess(CrawlURI uri); protected ProcessResult innerProcessResult(CrawlURI uri) throws InterruptedException { innerProcess(uri); return ProcessResult.PROCEED; } /** * Actually performs the process. By the time this method is invoked, * it is known that the given URI passes the {@link #getEnabled()}, the * {@link #getShouldProcessRule()} and the {@link #shouldProcess(CrawlURI)} * tests. 
* * @param uri the URI to process * @throws InterruptedException if the thread is interrupted */ protected abstract void innerProcess(CrawlURI uri) throws InterruptedException; /** * Invoked after a URI has been rejected. The default implementation * does nothing; subclasses may override to log rejects or something. * * @param uri the URI that was rejected * @throws InterruptedException if the thread is interrupted */ protected void innerRejectProcess(CrawlURI uri) throws InterruptedException { } public static String flattenVia(CrawlURI puri) { UURI uuri = puri.getVia(); return (uuri == null) ? "" : uuri.toString(); } public static boolean isSuccess(CrawlURI puri) { boolean result = false; int statusCode = puri.getFetchStatus(); if (statusCode == HttpStatus.SC_UNAUTHORIZED && hasHttpAuthenticationCredential(puri)) { result = false; } else { result = (statusCode > 0); } return result; } public static long getRecordedSize(CrawlURI puri) { if (puri.getRecorder() == null) { return puri.getContentSize(); } else { return puri.getRecorder().getRecordedInput().getSize(); } } /** * @return True if we have an HttpAuthentication (rfc2617) payload. 
*/ public static boolean hasHttpAuthenticationCredential(CrawlURI puri) { Set credentials = puri.getCredentials(); for (Credential ca: credentials) { if (ca instanceof HttpAuthenticationCredential) { return true; } } return false; } // FIXME: Raise to interface // FIXME: Internationalize somehow // FIXME: Pass in PrintWriter instead creating large in-memory strings public String report() { return "Processor: "+getClass().getName()+"\n"; } protected boolean isRunning = false; public boolean isRunning() { return isRunning; } public void start() { if(isRunning) { return; } isRunning = true; if(recoveryCheckpoint!=null) { try { JSONObject json = recoveryCheckpoint.loadJson(getBeanName()); fromCheckpointJson(json); } catch (JSONException e) { throw new RuntimeException(e); } } } public void stop() { isRunning = false; } public void startCheckpoint(Checkpoint checkpointInProgress) {} public void doCheckpoint(Checkpoint checkpointInProgress) throws IOException { try { JSONObject json = toCheckpointJson(); checkpointInProgress.saveJson(beanName, json); } catch(JSONException j) { // impossible } } /** * Return a JSONObject of current stat that can be consulted * on recovery to restore necessary values. * * @return JSONObject * @throws JSONException */ protected JSONObject toCheckpointJson() throws JSONException { JSONObject json = new JSONObject(); json.put("uriCount", getURICount()); return json; } /** * Restore internal state from JSONObject stored at earlier * checkpoint-time. * * @param json JSONObject * @throws JSONException */ protected void fromCheckpointJson(JSONObject json) throws JSONException { uriCount.set(json.getLong("uriCount")); } public void finishCheckpoint(Checkpoint checkpointInProgress) {} protected Checkpoint recoveryCheckpoint; @Autowired(required=false) public void setRecoveryCheckpoint(Checkpoint checkpoint) { this.recoveryCheckpoint = checkpoint; } }





// © 2015 - 2025 Weber Informatics LLC | Privacy Policy