All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.archive.crawler.frontier.URIAuthorityBasedQueueAssignmentPolicy Maven / Gradle / Ivy

The newest version!
/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package org.archive.crawler.frontier;

import org.apache.commons.lang.StringUtils;
import org.archive.modules.CrawlURI;
import org.archive.net.UURI;
import org.archive.spring.HasKeyedProperties;
import org.archive.util.LongToIntConsistentHash;

/**
 * Abstract base for queue-assignment policies that derive a queue
 * (class) key from the authority-related part of a URI. Subclasses
 * supply the core key via {@link #getCoreKey(UURI)}; this class layers
 * on top of it: optional deferral to a key already present in the
 * CrawlURI, a forced queue assignment, a single shared queue for all
 * whois URIs, a default key when nothing else is available, and
 * optional fan-out of one core key over several parallel subqueues.
 */
public abstract class URIAuthorityBasedQueueAssignmentPolicy
extends 
 QueueAssignmentPolicy 
implements
 HasKeyedProperties {
    private static final long serialVersionUID = 3L;
    
    /** Fallback key, used when no usable key could be derived. */
    protected static String DEFAULT_CLASS_KEY = "default...";

    /** Hash used to map a bucket-basis string onto a subqueue number. */
    protected LongToIntConsistentHash conhash = new LongToIntConsistentHash();
    
    /**
     * Whether to always defer to a previously-assigned key inside 
     * the CrawlURI. If true, any non-empty key already in the CrawlURI
     * will be returned as the classKey. True by default.
     */
    public boolean getDeferToPrevious() {
        return (Boolean) kp.get("deferToPrevious");
    }
    {
        setDeferToPrevious(true);
    }
    public void setDeferToPrevious(boolean defer) {
        kp.put("deferToPrevious",defer);
    }
    
    /**
     * The number of parallel queues to split a core key into. By 
     * default is 1. If larger than 1, the non-authority-based portion
     * of the URI will be used to distribute over that many separate
     * queues. 
     */
    public int getParallelQueues() {
        return (Integer) kp.get("parallelQueues");
    }
    {
        setParallelQueues(1);
    }
    public void setParallelQueues(int count) {
        kp.put("parallelQueues",count);
    }

    /**
     * Selects what the parallel-queue choice is based on. False by
     * default, in which case only the first path segment is hashed, so
     * URIs sharing that segment land in the same subqueue. If true, the
     * full escaped URI is hashed instead, which spreads URIs of one
     * core key over the subqueues without regard to their path prefix.
     * Only relevant when {@link #getParallelQueues()} is larger than 1.
     */
    public boolean getParallelQueuesRandomAssignment() {
        return (Boolean) kp.get("parallelQueuesRandomAssignment");
    }
    {
        setParallelQueuesRandomAssignment(false);
    }
    public void setParallelQueuesRandomAssignment(boolean doRandom) {
        kp.put("parallelQueuesRandomAssignment",doRandom);
    }

    /**
     * Determine the queue key for the given CrawlURI. In order:
     * <ol>
     * <li>if deferToPrevious is set and the CrawlURI already carries a
     *     non-empty key, that key is returned unchanged;</li>
     * <li>otherwise the core key is computed from the policy-basis
     *     UURI, and replaced by the forced queue assignment if one is
     *     configured;</li>
     * <li>any URI with the "whois" scheme goes to the single
     *     "whois..." queue, regardless of the above;</li>
     * <li>an empty candidate yields {@link #DEFAULT_CLASS_KEY};</li>
     * <li>with more than one parallel queue, a "+N" suffix is appended
     *     for subqueue N &gt; 0 (subqueue 0 keeps the bare key).</li>
     * </ol>
     * 
     * @param curi CrawlURI to assign to a queue
     * @return the queue key; never empty
     */
    public String getClassKey(CrawlURI curi) {
        if(getDeferToPrevious()) {
            String previous = curi.getClassKey();
            if(!StringUtils.isEmpty(previous)) {
                return previous;
            }
        }

        UURI basis = curi.getPolicyBasisUURI();
        String candidate = getCoreKey(basis);
        
        // read once so the value tested and the value used cannot differ
        String forced = getForceQueueAssignment();
        if(!StringUtils.isEmpty(forced)) {
            candidate = forced;
        }
        
        // all whois urls in the same queue; constant-first comparison
        // so a URI without a scheme does not cause a NullPointerException
        if ("whois".equals(curi.getUURI().getScheme())) {
            return "whois...";
        }
        
        if(StringUtils.isEmpty(candidate)) {
            return DEFAULT_CLASS_KEY;
        }
        
        int parallelQueues = getParallelQueues();
        if(parallelQueues>1) {
            int subqueue = getSubqueue(basis,parallelQueues);
            if (subqueue>0) {
                candidate += "+"+subqueue;
            }
        }
        return candidate; 
    }
    
    /**
     * Pick the subqueue for a URI by hashing its bucket basis.
     * 
     * @param basisUuri UURI the assignment is based on
     * @param parallelQueues number of subqueues to distribute over
     * @return 0 when the URI offers no bucket basis, otherwise the
     * bucket reported by the hash for that basis
     */
    protected int getSubqueue(UURI basisUuri, int parallelQueues) {
        String basis = bucketBasis(basisUuri);
        if(StringUtils.isEmpty(basis)) {
            return 0; 
        }
        return conhash.bucketFor(basis, parallelQueues);
    }
    
    /**
     * Base subqueue on first path-segment, if any. (Means unbalanced
     * subqueues, but consistency for most-common case where fanout
     * can be at first segment, and it's beneficial to keep similar
     * URIs in same queue.) When parallelQueuesRandomAssignment is set,
     * the full escaped URI is used as the basis instead.
     * 
     * @param uuri UURI to derive the basis from
     * @return the text between the leading character of the raw path
     * and the next '/', or the full escaped URI in random-assignment
     * mode; null when the path is absent or contains no '/' after its
     * first character
     */
    protected String bucketBasis(UURI uuri) {
        if(getParallelQueuesRandomAssignment()){
            return uuri.getEscapedURI();
        }
        char[] rawPath = uuri.getRawPath();
        if(rawPath==null) {
            // no path component at all: nothing to bucket on, so the
            // caller falls back to subqueue 0 instead of failing
            return null;
        }
        String path = new String(rawPath);
        // search from index 1 to skip the leading '/'
        int i = path.indexOf('/',1);
        if(i<0) {
            return null; 
        }
        return path.substring(1,i);
    }

    /**
     * Compute the core (authority-based) queue key for a URI; the
     * concrete derivation is left to subclasses.
     * 
     * @param basis UURI the assignment is based on
     * @return the core key; an empty or null result makes
     * {@link #getClassKey(CrawlURI)} fall back to the forced or
     * default key
     */
    protected abstract String getCoreKey(UURI basis);
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy