All downloads are free. Search and download functionality uses the official Maven repository.

org.archive.crawler.frontier.BucketQueueAssignmentPolicy Maven / Gradle / Ivy

The newest version!
/* BucketQueueAssignmentPolicy
 * 
 * $Header$
 * 
 * Created on May 06, 2005
 *
 *  Copyright (C) 2005 Christian Kohlschuetter
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 */
package org.archive.crawler.frontier;

import org.archive.modules.CrawlURI;
import org.archive.modules.net.CrawlHost;
import org.archive.modules.net.ServerCache;
import org.springframework.beans.factory.annotation.Autowired;

/**
 * Uses the target IPs as basis for queue-assignment,
 * distributing them over a fixed number of sub-queues.
 * 
 * @author Christian Kohlschuetter
 */
public class BucketQueueAssignmentPolicy extends QueueAssignmentPolicy {

    private static final long serialVersionUID = 3L;

    /** Mask applied to the host-name hash; yields suffixes 0..1023 for "NO-IP-" keys. */
    private static final int DEFAULT_NOIP_BITMASK = 1023;

    /** Modulus applied to the IP hash; yields numeric keys 0..1020. */
    private static final int DEFAULT_QUEUES_HOSTS_MODULO = 1021;

    /** Cache consulted by {@link #getClassKey(CrawlURI)} to find the host of a URI. */
    protected ServerCache serverCache;

    /**
     * @return the server cache currently used for host lookups
     */
    public ServerCache getServerCache() {
        return serverCache;
    }

    /**
     * Sets the server cache used for host lookups.
     *
     * @param serverCache the cache to use
     */
    @Autowired
    public void setServerCache(ServerCache serverCache) {
        this.serverCache = serverCache;
    }

    /**
     * Computes the queue key for the given URI.
     *
     * <ul>
     *   <li>No host found in the server cache: the fixed key {@code "NO-HOST"}.</li>
     *   <li>Host found but its IP is {@code null}: {@code "NO-IP-"} followed by
     *       the absolute host-name hash masked with
     *       {@code DEFAULT_NOIP_BITMASK}.</li>
     *   <li>Otherwise: the absolute hash of the host's IP, modulo
     *       {@code DEFAULT_QUEUES_HOSTS_MODULO}, as a decimal string.</li>
     * </ul>
     *
     * @param curi the URI to assign to a queue
     * @return the queue key as described above
     */
    public String getClassKey(final CrawlURI curi) {
        final CrawlHost crawlHost = serverCache.getHostFor(curi.getUURI());

        if (crawlHost == null) {
            return "NO-HOST";
        }

        if (crawlHost.getIP() == null) {
            // Widen to long before Math.abs so that Integer.MIN_VALUE
            // still produces a non-negative value.
            final long nameHash =
                    Math.abs((long) crawlHost.getHostName().hashCode());
            final long bucket = nameHash & DEFAULT_NOIP_BITMASK;
            return "NO-IP-".concat(Long.toString(bucket));
        }

        // Same widening as above, applied to the hash of the IP object.
        final long ipHash = Math.abs((long) crawlHost.getIP().hashCode());
        final long bucket = ipHash % DEFAULT_QUEUES_HOSTS_MODULO;
        return Long.toString(bucket);
    }

    /**
     * Upper bound on the number of distinct keys
     * {@link #getClassKey(CrawlURI)} can produce: 1024 "NO-IP-" keys
     * (mask + 1), 1021 numeric keys (the modulus) and the single
     * "NO-HOST" key.
     *
     * @return {@code DEFAULT_NOIP_BITMASK + DEFAULT_QUEUES_HOSTS_MODULO + 2}
     */
    public int maximumNumberOfKeys() {
        final int noIpKeys = DEFAULT_NOIP_BITMASK + 1;
        final int ipKeys = DEFAULT_QUEUES_HOSTS_MODULO;
        final int noHostKeys = 1;
        return noIpKeys + ipKeys + noHostKeys;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy