org.archive.crawler.frontier.URIAuthorityBasedQueueAssignmentPolicy Maven / Gradle / Ivy
The newest version!
/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.crawler.frontier;
import org.apache.commons.lang.StringUtils;
import org.archive.modules.CrawlURI;
import org.archive.net.UURI;
import org.archive.spring.HasKeyedProperties;
import org.archive.util.LongToIntConsistentHash;
/**
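* Abstract base for queue-assignment policies that build a CrawlURI's
* queue (class) key from a subclass-supplied core key (see
* {@link #getCoreKey(UURI)}), optionally honoring a previously-assigned
* or forced key and optionally splitting one core key across several
* parallel subqueues.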
*/
public abstract class URIAuthorityBasedQueueAssignmentPolicy
        extends QueueAssignmentPolicy
        implements HasKeyedProperties {
    private static final long serialVersionUID = 3L;

    /** Fallback class key, used when no non-empty candidate key is available. */
    protected static String DEFAULT_CLASS_KEY = "default...";

    /** Scheme whose URIs are all funneled into a single shared queue. */
    private static final String WHOIS_SCHEME = "whois";

    /** Class key returned for every URI with the {@code whois} scheme. */
    private static final String WHOIS_CLASS_KEY = "whois...";

    /** Hash used to map a bucket basis string onto one of N parallel subqueues. */
    protected LongToIntConsistentHash conhash = new LongToIntConsistentHash();

    /**
     * Whether to always defer to a previously-assigned key inside
     * the CrawlURI. If true, any non-empty key already in the CrawlURI
     * will be returned as the classKey. Defaults to true.
     *
     * @return the current "deferToPrevious" setting
     */
    public boolean getDeferToPrevious() {
        return (Boolean) kp.get("deferToPrevious");
    }
    {
        // instance initializer: establish the default value
        setDeferToPrevious(true);
    }

    /**
     * Sets whether a key already present in the CrawlURI takes precedence.
     *
     * @param defer true to reuse a previously-assigned key
     */
    public void setDeferToPrevious(boolean defer) {
        kp.put("deferToPrevious", defer);
    }

    /**
     * The number of parallel queues to split a core key into. By
     * default is 1. If larger than 1, the non-authority-based portion
     * of the URI will be used to distribute over that many separate
     * queues.
     *
     * @return the configured number of parallel queues
     */
    public int getParallelQueues() {
        return (Integer) kp.get("parallelQueues");
    }
    {
        // instance initializer: establish the default value
        setParallelQueues(1);
    }

    /**
     * Sets the number of parallel queues a core key is split into.
     *
     * @param count number of parallel queues; splitting only happens
     *        when this is greater than 1
     */
    public void setParallelQueues(int count) {
        kp.put("parallelQueues", count);
    }

    /**
     * Whether to spread URIs over the parallel queues using the whole
     * escaped URI as the hashing basis, instead of only the first path
     * segment. False by default. If true, URIs sharing a first path
     * segment are no longer kept together; the subqueue is still derived
     * from the URI text (via the hash), not from a counter or a random
     * source.
     *
     * @return the current "parallelQueuesRandomAssignment" setting
     */
    public boolean getParallelQueuesRandomAssignment() {
        return (Boolean) kp.get("parallelQueuesRandomAssignment");
    }
    {
        // instance initializer: establish the default value
        setParallelQueuesRandomAssignment(false);
    }

    /**
     * Sets whether the whole escaped URI (rather than the first path
     * segment) is used to choose a parallel subqueue.
     *
     * @param doRandom true to hash on the whole escaped URI
     */
    public void setParallelQueuesRandomAssignment(boolean doRandom) {
        kp.put("parallelQueuesRandomAssignment", doRandom);
    }

    /**
     * Computes the queue (class) key for the given CrawlURI.
     *
     * <p>Resolution order:
     * <ol>
     * <li>a non-empty key already on the CrawlURI, if
     *     {@link #getDeferToPrevious()} is true;</li>
     * <li>{@code "whois..."} for any URI whose scheme is {@code whois};</li>
     * <li>the forced queue assignment, if non-empty, otherwise the core
     *     key from {@link #getCoreKey(UURI)};</li>
     * <li>{@link #DEFAULT_CLASS_KEY} if the candidate is still empty.</li>
     * </ol>
     * When more than one parallel queue is configured and the computed
     * subqueue is greater than zero, {@code "+" + subqueue} is appended.
     *
     * @param curi the URI being assigned to a queue
     * @return the class key, never empty
     */
    public String getClassKey(CrawlURI curi) {
        if (getDeferToPrevious()) {
            String previousKey = curi.getClassKey();
            if (!StringUtils.isEmpty(previousKey)) {
                return previousKey;
            }
        }

        UURI basis = curi.getPolicyBasisUURI();
        String candidate = getCoreKey(basis);

        // Read the forced assignment once so the emptiness check and the
        // value actually used cannot disagree.
        String forcedKey = getForceQueueAssignment();
        if (!StringUtils.isEmpty(forcedKey)) {
            candidate = forcedKey;
        }

        // all whois urls in the same queue; constant-first equals so a
        // null scheme does not throw
        if (WHOIS_SCHEME.equals(curi.getUURI().getScheme())) {
            return WHOIS_CLASS_KEY;
        }

        if (StringUtils.isEmpty(candidate)) {
            return DEFAULT_CLASS_KEY;
        }

        int parallelQueues = getParallelQueues();
        if (parallelQueues > 1) {
            int subqueue = getSubqueue(basis, parallelQueues);
            // subqueue 0 keeps the bare core key
            if (subqueue > 0) {
                candidate += "+" + subqueue;
            }
        }
        return candidate;
    }

    /**
     * Picks the parallel subqueue for a URI.
     *
     * @param basisUuri URI the bucket basis is derived from
     * @param parallelQueues number of subqueues to distribute over
     * @return 0 when no bucket basis is available, otherwise the bucket
     *         chosen by the consistent hash for that basis
     */
    protected int getSubqueue(UURI basisUuri, int parallelQueues) {
        String basis = bucketBasis(basisUuri);
        if (StringUtils.isEmpty(basis)) {
            return 0;
        }
        return conhash.bucketFor(basis, parallelQueues);
    }

    /**
     * Base subqueue on first path-segment, if any. (Means unbalanced
     * subqueues, but consistency for most-common case where fanout
     * can be at first segment, and it's beneficial to keep similar
     * URIs in same queue.)
     *
     * <p>If {@link #getParallelQueuesRandomAssignment()} is true, the
     * whole escaped URI is used as the basis instead.
     *
     * @param uuri URI to derive the bucket basis from
     * @return the escaped URI in random-assignment mode; otherwise the
     *         text between the leading character of the raw path and the
     *         next {@code '/'}, or null when the URI or its raw path is
     *         null or the path has no further {@code '/'}
     */
    protected String bucketBasis(UURI uuri) {
        if (uuri == null) {
            return null;
        }
        if (getParallelQueuesRandomAssignment()) {
            return uuri.getEscapedURI();
        }
        // Defensive: avoid new String((char[]) null) if no raw path is set.
        char[] rawPath = uuri.getRawPath();
        if (rawPath == null) {
            return null;
        }
        String path = new String(rawPath);
        int segmentEnd = path.indexOf('/', 1);
        if (segmentEnd < 0) {
            // no '/' after the first character: no first segment to bucket on
            return null;
        }
        return path.substring(1, segmentEnd);
    }

    /**
     * Supplies the core (unsplit) key for the given basis URI.
     *
     * @param basis URI the key is derived from
     * @return the core key; an empty or null result makes
     *         {@link #getClassKey(CrawlURI)} fall back to a forced key
     *         or {@link #DEFAULT_CLASS_KEY}
     */
    protected abstract String getCoreKey(UURI basis);
}