org.archive.crawler.prefetch.PreconditionEnforcer Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of heritrix-engine Show documentation
The newest version!
/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.crawler.prefetch;

import static org.archive.modules.fetcher.FetchStatusCodes.S_DOMAIN_PREREQUISITE_FAILURE;
import static org.archive.modules.fetcher.FetchStatusCodes.S_ROBOTS_PRECLUDED;
import static org.archive.modules.fetcher.FetchStatusCodes.S_ROBOTS_PREREQUISITE_FAILURE;
import static org.archive.modules.fetcher.FetchStatusCodes.S_UNFETCHABLE_URI;

import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.reporting.CrawlerLoggerModule;
import org.archive.modules.CrawlMetadata;
import org.archive.modules.CrawlURI;
import org.archive.modules.ProcessResult;
import org.archive.modules.Processor;
import org.archive.modules.credential.Credential;
import org.archive.modules.credential.CredentialStore;
import org.archive.modules.net.CrawlHost;
import org.archive.modules.net.CrawlServer;
import org.archive.modules.net.RobotsPolicy;
import org.archive.modules.net.ServerCache;
import org.archive.net.UURI;
import org.springframework.beans.factory.annotation.Autowired;


/**
 * Ensures the preconditions for a fetch -- such as DNS lookup 
 * or acquiring and respecting a robots.txt policy -- are
 * satisfied before a URI is passed to subsequent stages.
 *
 * @author gojomo
 */
public class PreconditionEnforcer extends Processor  {
    // Serialization version marker; nothing in this class reads it, hence the
    // "unused" suppression. NOTE(review): presumably retained because the
    // Processor hierarchy is (or once was) Serializable -- confirm.
    @SuppressWarnings("unused")
    private static final long serialVersionUID = 3L;
    // java.util.logging logger named after this class; used for all
    // diagnostics emitted by the precondition checks below.
    private static final Logger logger =
        Logger.getLogger(PreconditionEnforcer.class.getName());

    {
        // Default validity window: six hours, expressed in seconds.
        setIpValidityDurationSeconds(6 * 60 * 60);
    }

    /**
     * Reads the DNS-record validity interval from the keyed properties.
     *
     * @return the value stored under "ipValidityDurationSeconds", in seconds
     */
    public int getIpValidityDurationSeconds() {
        final Integer seconds = (Integer) kp.get("ipValidityDurationSeconds");
        return seconds;
    }

    /**
     * Sets the minimum interval for which a dns-record will be considered
     * valid (in seconds). If the record's DNS TTL is larger, that will
     * be used instead. A value of 0 makes {@link #isIpExpired(CrawlURI)}
     * treat an already looked-up record as never expiring.
     *
     * @param duration minimum validity interval, in seconds
     */
    public void setIpValidityDurationSeconds(int duration) {
        kp.put("ipValidityDurationSeconds", Integer.valueOf(duration));
    }

    {
        // Default validity window: twenty-four hours, expressed in seconds.
        setRobotsValidityDurationSeconds(24 * 60 * 60);
    }

    /**
     * Reads the robots.txt validity interval from the keyed properties.
     *
     * @return the value stored under "robotsValidityDurationSeconds",
     *         in seconds
     */
    public int getRobotsValidityDurationSeconds() {
        final Integer seconds =
            (Integer) kp.get("robotsValidityDurationSeconds");
        return seconds;
    }

    /**
     * Sets the time in seconds that fetched robots.txt information is
     * considered to be valid. If the value is set to '0', then the
     * robots.txt information will never expire.
     *
     * @param duration validity interval, in seconds
     */
    public void setRobotsValidityDurationSeconds(int duration) {
        kp.put("robotsValidityDurationSeconds", Integer.valueOf(duration));
    }

    {
        // Default: robots exclusions are enforced, not merely annotated.
        setCalculateRobotsOnly(false);
    }

    /**
     * Reads the "calculate only" robots flag from the keyed properties.
     *
     * @return the value stored under "calculateRobotsOnly"
     */
    public boolean getCalculateRobotsOnly() {
        final Boolean calcOnly = (Boolean) kp.get("calculateRobotsOnly");
        return calcOnly;
    }

    /**
     * Sets whether to only calculate the robots status of a URI, without
     * actually applying any exclusions found. If true, excluded URIs will
     * only be annotated (with "robotExcluded") but still fetched. Default
     * is false.
     *
     * @param calcOnly true to annotate instead of excluding
     */
    public void setCalculateRobotsOnly(boolean calcOnly) {
        kp.put("calculateRobotsOnly", Boolean.valueOf(calcOnly));
    }
    
    /**
     * Auto-discovered module providing the configured (or overridden)
     * User-Agent value and robots policy consulted by
     * {@link #considerRobotsPreconditions(CrawlURI)}.
     */
    protected CrawlMetadata metadata;

    /**
     * @return the crawl metadata module currently held by this processor
     */
    public CrawlMetadata getMetadata() {
        return this.metadata;
    }

    /**
     * Injects the crawl metadata module.
     *
     * @param provider module supplying the User-Agent and robots policy
     */
    @Autowired
    public void setMetadata(CrawlMetadata provider) {
        metadata = provider;
    }
    
    {
        // Start out with an empty store so that declaring one in the
        // configuration is not required.
        setCredentialStore(new CredentialStore());
    }

    /**
     * @return the credential store kept under the "credentialStore" key
     */
    public CredentialStore getCredentialStore() {
        final Object store = kp.get("credentialStore");
        return (CredentialStore) store;
    }

    /**
     * Replaces the credential store; wiring is optional.
     *
     * @param credentials store to consult for credential preconditions
     */
    @Autowired(required=false)
    public void setCredentialStore(CredentialStore credentials) {
        kp.put("credentialStore", credentials);
    }
    
    /**
     * Cache used to look up the CrawlServer and CrawlHost records for a URI.
     */
    protected ServerCache serverCache;

    /**
     * @return the server cache currently held by this processor
     */
    public ServerCache getServerCache() {
        return serverCache;
    }

    /**
     * Injects the server cache.
     *
     * @param serverCache cache of per-server and per-host crawl state
     */
    @Autowired
    public void setServerCache(ServerCache serverCache) {
        this.serverCache = serverCache;
    }
    
    /**
     * Crawl-level logging module; used here to record URI errors raised
     * while setting a credential prerequisite.
     */
    protected CrawlerLoggerModule loggerModule;

    /**
     * @return the crawler logger module currently held by this processor
     */
    public CrawlerLoggerModule getLoggerModule() {
        return loggerModule;
    }

    /**
     * Injects the crawler logger module.
     *
     * @param loggerModule module that receives URI error reports
     */
    @Autowired
    public void setLoggerModule(CrawlerLoggerModule loggerModule) {
        this.loggerModule = loggerModule;
    }
    
    public PreconditionEnforcer() {
        super();
    }
    
    /**
     * Accepts every non-null URI.
     *
     * @param puri candidate URI
     * @return true unless {@code puri} is null
     */
    @Override
    protected boolean shouldProcess(CrawlURI puri) {
        // The parameter is statically a CrawlURI, so only null is rejected.
        return puri != null;
    }
    
    
    /**
     * Unsupported entry point: the body unconditionally throws. This class
     * does its work in {@link #innerProcessResult(CrawlURI)} instead.
     *
     * @param puri ignored
     * @throws AssertionError always
     */
    @Override
    protected void innerProcess(CrawlURI puri) {
        throw new AssertionError();
    }

    
    /**
     * Runs the precondition checks in order: DNS, then (for http/https
     * only) robots.txt, then credentials.
     *
     * @param puri URI about to be fetched
     * @return {@link ProcessResult#FINISH} as soon as one check reports
     *         that processing must stop (a prerequisite was scheduled or a
     *         failure status was set); {@link ProcessResult#PROCEED}
     *         otherwise, including for schemes other than http/https
     */
    @Override
    protected ProcessResult innerProcessResult(CrawlURI puri) {
        if (considerDnsPreconditions(puri)) {
            return ProcessResult.FINISH;
        }

        // make sure we only process schemes we understand (i.e. not dns)
        String scheme = puri.getUURI().getScheme().toLowerCase();
        if (!(scheme.equals("http") || scheme.equals("https"))) {
            if (logger.isLoggable(Level.FINE)) {
                logger.fine("PreconditionEnforcer doesn't understand uri's of type "
                    + scheme + " (ignoring)");
            }
            return ProcessResult.PROCEED;
        }

        if (considerRobotsPreconditions(puri)) {
            return ProcessResult.FINISH;
        }

        // URIs that are themselves prerequisites skip the credential check.
        if (!puri.isPrerequisite() && credentialPrecondition(puri)) {
            return ProcessResult.FINISH;
        }

        // OK, it's allowed

        // For all curis that will in fact be fetched, set appropriate delays.
        // TODO: SOMEDAY: allow per-host, per-protocol, etc. factors
        // curi.setDelayFactor(getDelayFactorFor(curi));
        // curi.setMinimumDelay(getMinimumDelayFor(curi));

        return ProcessResult.PROCEED;
    }

    /**
     * Consider the robots precondition.
     *
     * <p>A URI whose path is exactly "/robots.txt" is flagged as a
     * prerequisite and let through. Otherwise, if the server's robots
     * information has expired, a fetch of "/robots.txt" is marked as a
     * prerequisite of this URI. If valid robots information is available
     * the URI is tested against it; if none is available the URI is given
     * the robots-prerequisite-failure status.
     *
     * @param curi CrawlURI we're checking for any required preconditions.
     * @return True, if this curi has a precondition or processing
     *         should be terminated for some other reason.  False if
     *         we can proceed to process this url.
     * @throws RuntimeException if the "/robots.txt" prerequisite URI cannot
     *         be resolved against this URI (not expected to happen)
     */
    protected boolean considerRobotsPreconditions(CrawlURI curi) {
        // treat /robots.txt fetches specially
        UURI uuri = curi.getUURI();
        try {
            if (uuri != null && "/robots.txt".equals(uuri.getPath())) {
                // allow processing to continue
                curi.setPrerequisite(true);
                return false;
            }
        } catch (URIException e) {
            // Keep going with the normal checks, but record the cause.
            logger.log(Level.SEVERE, "Failed get of path for " + curi, e);
        }

        CrawlServer cs = serverCache.getServerFor(uuri);
        // require /robots.txt if not present
        if (cs.isRobotsExpired(getRobotsValidityDurationSeconds())) {
            // Need to get robots
            if (logger.isLoggable(Level.FINE)) {
                logger.fine( "No valid robots for " + cs  +
                    "; deferring " + curi);
            }

            // Robots expired - should be refetched even though its already
            // crawled.
            try {
                String prereq = uuri.resolve("/robots.txt").toString();
                curi.markPrerequisite(prereq);
            } catch (URIException e1) {
                logger.severe("Failed resolve using " + curi);
                throw new RuntimeException(e1); // shouldn't ever happen
            }
            return true;
        }

        // test against robots.txt if available
        if (cs.isValidRobots()) {
            String ua = metadata.getUserAgent();
            RobotsPolicy robots = metadata.getRobotsPolicy();
            if (robots.allows(ua, curi, cs.getRobotstxt())) {
                return false;
            }
            if (getCalculateRobotsOnly()) {
                // annotate URI as excluded, but continue to process normally
                curi.getAnnotations().add("robotExcluded");
                return false;
            }
            // mark as precluded; in FetchHTTP, this will
            // prevent fetching and cause a skip to the end
            // of processing (unless an intervening processor
            // overrules)
            curi.setFetchStatus(S_ROBOTS_PRECLUDED);
            curi.setError("robots.txt exclusion");
            if (logger.isLoggable(Level.FINE)) {
                logger.fine("robots.txt precluded " + curi);
            }
            return true;
        }

        // No valid robots found => Attempt to get robots.txt failed
        curi.setFetchStatus(S_ROBOTS_PREREQUISITE_FAILURE);
        curi.setError("robots.txt prerequisite failed");
        if (logger.isLoggable(Level.FINE)) {
            logger.fine("robots.txt prerequisite failed " + curi);
        }
        return true;
    }

    /**
     * Consider the DNS precondition.
     *
     * <p>"dns" URIs are flagged as prerequisites and "whois" URIs are let
     * through unchanged. For every other scheme the URI needs a known
     * server and host: a host that was looked up without yielding an IP
     * fails the URI, and a missing or expired lookup schedules a
     * "dns:&lt;host&gt;" prerequisite.
     *
     * @param curi CrawlURI whose dns prerequisite we're to check.
     * @return true if no further processing in this module should occur
     * @throws RuntimeException if the dns prerequisite URI cannot be
     *         created (not expected to happen)
     */
    protected boolean considerDnsPreconditions(CrawlURI curi) {
        final String scheme = curi.getUURI().getScheme();
        if (scheme.equals("dns")) {
            // DNS URIs never have a DNS precondition
            curi.setPrerequisite(true);
            return false;
        }
        if (scheme.equals("whois")) {
            return false;
        }

        CrawlServer cs = serverCache.getServerFor(curi.getUURI());
        if (cs == null) {
            curi.setFetchStatus(S_UNFETCHABLE_URI);
            return true;
        }

        // If we've done a dns lookup and it didn't resolve a host
        // cancel further fetch-processing of this URI, because
        // the domain is unresolvable
        CrawlHost ch = serverCache.getHostFor(curi.getUURI());
        if (ch == null || (ch.getIP() == null && !isIpExpired(curi))) {
            if (logger.isLoggable(Level.FINE)) {
                logger.fine( "no dns for " + ch +
                    " cancelling processing for CrawlURI " + curi.toString());
            }
            curi.setFetchStatus(S_DOMAIN_PREREQUISITE_FAILURE);
            return true;
        }

        // If we haven't done a dns lookup (or it has expired), shoot that
        // off and defer further processing. "dns" URIs cannot reach this
        // point because they returned above.
        if (isIpExpired(curi)) {
            if (logger.isLoggable(Level.FINE)) {
                logger.fine("Deferring processing of CrawlURI " + curi.toString()
                    + " for dns lookup.");
            }
            String preq = "dns:" + ch.getHostName();
            try {
                curi.markPrerequisite(preq);
            } catch (URIException e) {
                throw new RuntimeException(e); // shouldn't ever happen
            }
            return true;
        }

        // DNS preconditions OK
        return false;
    }

    /**
     * Decides whether the IP of the URI's host needs to be (re)looked up.
     *
     * <p>A host that has never been looked up always needs a lookup. A host
     * whose TTL equals {@link CrawlHost#IP_NEVER_EXPIRES}, or a configured
     * validity duration of zero, means the IP never expires. Otherwise the
     * IP expires once the larger of the configured duration and the
     * record's TTL has elapsed since the IP was fetched.
     *
     * @param curi the URI to check.
     * @return true if ip should be looked up.
     */
    public boolean isIpExpired(CrawlURI curi) {
        final CrawlHost crawlHost = serverCache.getHostFor(curi.getUURI());

        if (!crawlHost.hasBeenLookedUp()) {
            // No lookup has happened yet, so one is required.
            return true;
        }
        if (crawlHost.getIpTTL() == CrawlHost.IP_NEVER_EXPIRES) {
            // IP never expires (numeric IP)
            return false;
        }

        final long configuredSeconds = getIpValidityDurationSeconds();
        if (configuredSeconds == 0) {
            // A configured duration of zero means the IP never expires.
            return false;
        }

        // Use the larger of the operator-set minimum duration
        // or the DNS record TTL
        final long effectiveSeconds =
            Math.max(configuredSeconds, crawlHost.getIpTTL());

        // Both inputs are in seconds; only positive values are scaled
        // to milliseconds.
        final long validityMillis = effectiveSeconds > 0
            ? effectiveSeconds * 1000
            : effectiveSeconds;

        final long expiresAt = validityMillis + crawlHost.getIpFetched();
        return expiresAt < System.currentTimeMillis();
    }

   /**
    * Consider credential preconditions.
    *
    * Walks every credential in the store. If this URI is itself the
    * prerequisite (e.g. an html form login url) of a credential, the
    * credential is attached, the fetch type is switched to POST and the
    * URI is let through. Otherwise, for the first credential that matches
    * this URI's root, declares a prerequisite and has not yet been
    * authenticated on the server, that prerequisite is marked on the URI.
    *
    * The whole walk is repeated on every call; overrides and refinements
    * may change what comes back from the credential store.
    *
    * @param curi CrawlURI we're checking for any required preconditions.
    * @return True, if this curi has a precondition that needs to
    *         be met before we can proceed. False if we can proceed to
    *         process this url.
    */
    public boolean credentialPrecondition(final CrawlURI curi) {
        final CredentialStore store = getCredentialStore();
        if (store == null) {
            logger.severe("No credential store for " + curi);
            return false;
        }

        for (Credential candidate : store.getAll()) {
            if (candidate.isPrerequisite(curi)) {
                // This curi is the credential's own prerequisite: let it
                // through, marked with the credential. The check is made
                // BEFORE the root-uri match because a login may live on
                // another domain altogether.
                logger.fine("attaching credential and setting fetch type to POST for recognized form login url " + curi);
                candidate.attach(curi);
                curi.setFetchType(CrawlURI.FetchType.HTTP_POST);
                return false;
            }

            final boolean applies = candidate.rootUriMatch(serverCache, curi)
                && candidate.hasPrerequisite(curi);
            if (!applies || authenticated(candidate, curi)) {
                continue;
            }

            // Not authenticated yet. Queue the prerequisite and stop
            // (assumption: one authentication at a time -- usually one
            // html form).
            final String prereq = candidate.getPrerequisite(curi);
            if (prereq == null || prereq.isEmpty()) {
                CrawlServer server = serverCache.getServerFor(curi.getUURI());
                logger.severe(server.getName() + " has "
                    + " credential(s) of type " + candidate + " but prereq"
                    + " is null.");
                continue;
            }

            try {
                curi.markPrerequisite(prereq);
            } catch (URIException e) {
                logger.severe("unable to set credentials prerequisite "+prereq);
                loggerModule.logUriError(e, curi.getUURI(), prereq);
                return false;
            }
            if (logger.isLoggable(Level.FINE)) {
                logger.fine("Queueing prereq " + prereq + " of type " +
                    candidate + " for " + curi);
            }
            return true;
        }
        return false;
    }

    /**
     * Has passed credential already been authenticated.
     *
     * <p>Looks at the credentials recorded on the URI's server and reports
     * a match when one of them has the same key as {@code credential} and
     * {@code credential} is an instance of that recorded credential's class.
     *
     * @param credential Credential to test.
     * @param curi CrawlURI.
     * @return True if already run.
     */
    protected boolean authenticated(final Credential credential, final CrawlURI curi) {
        CrawlServer server = serverCache.getServerFor(curi.getUURI());
        if (!server.hasCredentials()) {
            return false;
        }
        // Parameterized type is required: iterating a raw Set yields Object
        // and would not compile against the Credential loop variable.
        Set<Credential> credentials = server.getCredentials();
        for (Credential cred : credentials) {
            if (cred.getKey().equals(credential.getKey())
                    && cred.getClass().isInstance(credential)) {
                return true;
            }
        }
        return false;
    }

}