org.archive.crawler.postprocessor.CandidatesProcessor Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of heritrix-engine Show documentation
The newest version!
/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package org.archive.crawler.postprocessor;


import static org.archive.modules.fetcher.FetchStatusCodes.S_DEFERRED;
import static org.archive.modules.fetcher.FetchStatusCodes.S_PREREQUISITE_UNSCHEDULABLE_FAILURE;

import org.apache.commons.httpclient.URIException;
import org.archive.crawler.framework.Frontier;
import org.archive.crawler.reporting.CrawlerLoggerModule;
import org.archive.crawler.spring.SheetOverlaysManager;
import org.archive.modules.CandidateChain;
import org.archive.modules.CrawlURI;
import org.archive.modules.Processor;
import org.archive.modules.SchedulingConstants;
import org.archive.modules.extractor.Hop;
import org.archive.modules.seeds.SeedModule;
import org.archive.spring.KeyedProperties;
import org.archive.util.ArchiveUtils;
import org.springframework.beans.factory.annotation.Autowired;


/**
 * Processor which sends all candidate outlinks through the
 * CandidateChain, scheduling those with non-negative status
 * codes to the frontier. Also performs special handling for
 * 'discovered seeds' -- URIs, as with redirects from seeds,
 * that may deserve special treatment to expand the scope.
 */
public class CandidatesProcessor extends Processor {

    @SuppressWarnings("unused")
    private static final long serialVersionUID = -3L;

    /**
     * Exclusive upper bound on a candidate's hop count for it to be
     * promoted to seed status because a seed redirected to it.
     */
    protected static final int SEEDS_REDIRECT_NEW_SEEDS_MAX_HOPS = 5;

    /**
     * Candidate chain
     */
    protected CandidateChain candidateChain;

    /**
     * @return the chain every candidate is run through before scheduling
     */
    public CandidateChain getCandidateChain() {
        return this.candidateChain;
    }

    /**
     * @param candidateChain the chain every candidate is run through
     *        before scheduling
     */
    @Autowired
    public void setCandidateChain(CandidateChain candidateChain) {
        this.candidateChain = candidateChain;
    }

    /**
     * The frontier to use.
     */
    protected Frontier frontier;

    /**
     * @return the frontier that accepted candidates are scheduled to
     */
    public Frontier getFrontier() {
        return this.frontier;
    }

    /**
     * @param frontier the frontier that accepted candidates are scheduled to
     */
    @Autowired
    public void setFrontier(Frontier frontier) {
        this.frontier = frontier;
    }

    /**
     * Logger module; stored and exposed here but not otherwise used by
     * this class.
     */
    protected CrawlerLoggerModule loggerModule;

    /**
     * @return the configured logger module
     */
    public CrawlerLoggerModule getLoggerModule() {
        return this.loggerModule;
    }

    /**
     * @param loggerModule the logger module to hold
     */
    @Autowired
    public void setLoggerModule(CrawlerLoggerModule loggerModule) {
        this.loggerModule = loggerModule;
    }

    {
        setSeedsRedirectNewSeeds(true);
    }

    /**
     * @return whether redirect targets of seeds are promoted to seeds
     * @see #setSeedsRedirectNewSeeds(boolean)
     */
    public boolean getSeedsRedirectNewSeeds() {
        return (Boolean) kp.get("seedsRedirectNewSeeds");
    }

    /**
     * If enabled, any URL found because a seed redirected to it (original seed
     * returned 301 or 302), will also be treated as a seed, as long as the hop
     * count is less than {@value #SEEDS_REDIRECT_NEW_SEEDS_MAX_HOPS}.
     * Default is true.
     */
    public void setSeedsRedirectNewSeeds(boolean redirect) {
        kp.put("seedsRedirectNewSeeds", redirect);
    }

    {
        setSeedsRedirectNewSeedsAllowTLDs(true);
    }

    /**
     * @return whether seed-redirect targets whose host is a TLD may be
     *         promoted to seeds
     * @see #setSeedsRedirectNewSeedsAllowTLDs(boolean)
     */
    public boolean getSeedsRedirectNewSeedsAllowTLDs() {
        return (Boolean) kp.get("seedsRedirectNewSeedsAllowTLDs");
    }

    /**
     * Refines {@link #setSeedsRedirectNewSeeds(boolean)}. If true (the
     * default), a seed-redirect target is eligible for promotion to seed
     * whatever its host is. If false, the target is promoted only when
     * {@code ArchiveUtils.isTld} reports that the host of its base URI is
     * not a TLD; a target whose host cannot be obtained is not promoted.
     */
    public void setSeedsRedirectNewSeedsAllowTLDs(boolean allowTLDs) {
        kp.put("seedsRedirectNewSeedsAllowTLDs", allowTLDs);
    }

    {
        setProcessErrorOutlinks(false);
    }

    /**
     * @return whether outlinks of error responses are processed
     * @see #setProcessErrorOutlinks(boolean)
     */
    public boolean getProcessErrorOutlinks() {
        return (Boolean) kp.get("processErrorOutlinks");
    }

    /**
     * If true, outlinks from status codes &lt;200 and &gt;=400
     * will be sent through candidates processing. Default is
     * false.
     */
    public void setProcessErrorOutlinks(boolean errorOutlinks) {
        kp.put("processErrorOutlinks", errorOutlinks);
    }

    /**
     * Seed module that promoted ('discovered') seeds are added to.
     */
    protected SeedModule seeds;

    /**
     * @return the seed module that promoted seeds are added to
     */
    public SeedModule getSeeds() {
        return this.seeds;
    }

    /**
     * @param seeds the seed module that promoted seeds are added to
     */
    @Autowired
    public void setSeeds(SeedModule seeds) {
        this.seeds = seeds;
    }

    /**
     * Applies sheet overlays to each candidate before it is run through
     * the candidate chain.
     */
    protected SheetOverlaysManager sheetOverlaysManager;

    /**
     * @return the manager used to apply overlays to candidates
     */
    public SheetOverlaysManager getSheetOverlaysManager() {
        return sheetOverlaysManager;
    }

    /**
     * @param sheetOverlaysManager the manager used to apply overlays to
     *        candidates
     */
    @Autowired
    public void setSheetOverlaysManager(SheetOverlaysManager sheetOverlaysManager) {
        this.sheetOverlaysManager = sheetOverlaysManager;
    }

    /**
     * Usual no-argument constructor
     */
    public CandidatesProcessor() {
    }

    /**
     * Accepts every URI; filtering happens inside
     * {@link #innerProcess(CrawlURI)}.
     *
     * @param puri the URI under consideration (ignored)
     * @return always true
     * @see org.archive.modules.Processor#shouldProcess(org.archive.modules.CrawlURI)
     */
    protected boolean shouldProcess(CrawlURI puri) {
        return true;
    }

    /**
     * Run candidatesChain on a single candidate CrawlURI; if its
     * reported status is nonnegative, schedule to frontier.
     *
     * Also applies special handling of discovered URIs that by
     * convention we want to treat as seeds (which then may be
     * scheduled indirectly via addSeed).
     *
     * While the chain runs, the KeyedProperties overrides of the source
     * are cleared and those of the candidate are loaded; the reverse is
     * done in a finally block, whether or not the chain throws.
     *
     * @param candidate CrawlURI to consider
     * @param source CrawlURI from which candidate was discovered/derived
     * @return candidate's status code at end of candidate chain execution
     * @throws InterruptedException
     */
    public int runCandidateChain(CrawlURI candidate, CrawlURI source) throws InterruptedException {
        // NOTE(review): the promotion check below tolerates a null source,
        // but source is handed unguarded to setFullVia and KeyedProperties
        // here and in the finally block -- confirm those accept null.

        // at least for duration of candidatechain, offer
        // access to full CrawlURI of via
        candidate.setFullVia(source);
        sheetOverlaysManager.applyOverlaysTo(candidate);
        try {
            KeyedProperties.clearOverridesFrom(source);
            KeyedProperties.loadOverridesFrom(candidate);

            // apply special seed-status promotion; evaluated only after the
            // candidate's overrides are loaded, since the settings consulted
            // are read through kp
            if (isSeedRedirectToPromote(candidate, source)) {
                candidate.setSeed(true);
            }

            getCandidateChain().process(candidate, null);
            int statusAfterCandidateChain = candidate.getFetchStatus();
            if (statusAfterCandidateChain >= 0) {
                if (checkForSeedPromotion(candidate)) {
                    /*
                     * We want to guarantee crawling of seed version of
                     * CrawlURI even if same url has already been enqueued,
                     * see https://webarchive.jira.com/browse/HER-1891
                     */
                    candidate.setForceFetch(true);
                    getSeeds().addSeed(candidate); // triggers scheduling
                } else {
                    frontier.schedule(candidate);
                }
            }
            return statusAfterCandidateChain;
        } finally {
            KeyedProperties.clearOverridesFrom(candidate);
            KeyedProperties.loadOverridesFrom(source);
        }
    }

    /**
     * Decides whether a candidate should be marked as a seed because a
     * seed redirected to it.
     *
     * @param candidate the discovered URI
     * @param source the URI the candidate was discovered from; may be null
     * @return true only if seed-redirect promotion is enabled, the source
     *         is a seed, the candidate's last hop is the REFER hop, its hop
     *         count is below {@link #SEEDS_REDIRECT_NEW_SEEDS_MAX_HOPS}, and
     *         either TLD hosts are allowed or its host is not a TLD
     */
    private boolean isSeedRedirectToPromote(CrawlURI candidate, CrawlURI source) {
        return getSeedsRedirectNewSeeds()
                && source != null
                && source.isSeed()
                // constant-first so a missing last hop reads as "not a redirect"
                && Hop.REFER.getHopString().equals(candidate.getLastHop())
                && candidate.getHopCount() < SEEDS_REDIRECT_NEW_SEEDS_MAX_HOPS
                && (getSeedsRedirectNewSeedsAllowTLDs() || domainIsNotTLD(candidate));
    }

    /**
     * Tests whether the host of the candidate's base URI is something other
     * than a TLD, according to {@code ArchiveUtils.isTld}.
     *
     * @param candidate the URI whose host is examined
     * @return true if the host is not a TLD; false if it is a TLD or if the
     *         host could not be obtained
     */
    private boolean domainIsNotTLD(CrawlURI candidate) {
        try {
            return !ArchiveUtils.isTld(candidate.getBaseURI().getHost());
        } catch (URIException ignored) {
            // Deliberately not propagated: when the host cannot be read we
            // cannot show it is a non-TLD, so answer false, which makes the
            // caller skip seed promotion for this candidate.
            return false;
        }
    }

    /**
     * Run candidates chain on either (1) the prerequisite, if one is present
     * and the URI's status is S_DEFERRED; or otherwise (2) all outlinks,
     * unless the URI has an error status (&lt;200 or &gt;=400) and
     * {@link #getProcessErrorOutlinks()} is false, in which case the
     * outlinks are cleared instead.
     *
     * @param curi the URI whose prerequisite or outlinks are to be considered
     * @throws InterruptedException
     * @see org.archive.modules.Processor#innerProcess(org.archive.modules.CrawlURI)
     */
    @Override
    protected void innerProcess(final CrawlURI curi) throws InterruptedException {
        // (1) Handle any prerequisites when S_DEFERRED for prereqs
        if (curi.hasPrerequisiteUri() && curi.getFetchStatus() == S_DEFERRED) {
            CrawlURI prereq = curi.getPrerequisiteUri();

            int prereqStatus = runCandidateChain(prereq, curi);

            if (prereqStatus < 0) {
                // prerequisite was rejected by the candidate chain, so the
                // deferred URI can never have it satisfied
                curi.setFetchStatus(S_PREREQUISITE_UNSCHEDULABLE_FAILURE);
            }
            return;
        }

        // Only consider candidate links of error pages if configured to do so
        int fetchStatus = curi.getFetchStatus();
        boolean errorStatus = fetchStatus < 200 || fetchStatus >= 400;
        if (!getProcessErrorOutlinks() && errorStatus) {
            curi.getOutLinks().clear();
            return;
        }

        // (2) Handle outlinks (usual bulk of discoveries)
        for (CrawlURI candidate : curi.getOutLinks()) {
            runCandidateChain(candidate, curi);
        }
    }

    /**
     * Check if the URI needs special 'discovered seed' treatment. As a side
     * effect, a qualifying URI whose scheduling directive is NORMAL is
     * raised to MEDIUM.
     *
     * @param curi the URI to check
     * @return true if the URI is marked as a seed and has a non-empty via,
     *         meaning it should be scheduled through the seeds module;
     *         false otherwise
     */
    protected boolean checkForSeedPromotion(CrawlURI curi) {
        if (curi.isSeed() && curi.getVia() != null
                && curi.flattenVia().length() > 0) {
            // The only way a seed can have a non-empty via is if it is the
            // result of a seed redirect. Returning true here schedules it
            // via the seeds module, so it may affect scope and be logged
            // as 'discovered' seed.
            //
            // This is a feature. This is handling for case where a seed
            // gets immediately redirected to another page. What we're doing is
            // treating the immediate redirect target as a seed.

            // And it needs rapid scheduling.
            if (curi.getSchedulingDirective() == SchedulingConstants.NORMAL) {
                curi.setSchedulingDirective(SchedulingConstants.MEDIUM);
            }
            return true;
        }
        return false;
    }
}