/*
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Licensed to the Internet Archive (IA) by one or more individual
 * contributors.
 *
 * The IA licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.archive.crawler.processor;

import java.util.regex.Matcher;

import org.archive.crawler.framework.Frontier;
import org.archive.modules.CrawlURI;
import org.archive.net.PublicSuffixes;
import org.archive.util.TextUtils;
import org.springframework.beans.factory.annotation.Autowired;

import st.ata.util.FPGenerator;
/**
 * Maps URIs to one of N crawler names by applying a hash to the
 * URI's (possibly-transformed) classKey.
 *
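 * <p>For example (an illustrative sketch; the key below is hypothetical,
 * not taken from the original source):
 * <pre>{@code
 * // with crawlerCount=4, every classKey maps stably to one of "0".."3"
 * String node = HashCrawlMapper.mapString("org,example,", "", 4);
 * }</pre>
 *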
 * @author gojomo
 * @version $Date$, $Revision$
 */
public class HashCrawlMapper extends CrawlMapper {
    @SuppressWarnings("unused")
    private static final long serialVersionUID = 2L;

    protected Frontier frontier;
    public Frontier getFrontier() {
        return this.frontier;
    }
    @Autowired
    public void setFrontier(Frontier frontier) {
        this.frontier = frontier;
    }

    /**
     * Number of crawlers among which to split up the URIs. Their names are
     * assumed to be 0..N-1.
     */
    protected long crawlerCount = 1L;
    public long getCrawlerCount() {
        return this.crawlerCount;
    }
    public void setCrawlerCount(long count) {
        this.crawlerCount = count;
    }

    {
        // establish the default for the kp-backed setting
        setUsePublicSuffixesRegex(true);
    }
    public boolean getUsePublicSuffixesRegex() {
        return (Boolean) kp.get("usePublicSuffixesRegex");
    }
    /**
     * Whether to use the PublicSuffixes-supplied reduce regex.
     *
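     * <p>When true, the regex from
     * {@link PublicSuffixes#getTopmostAssignedSurtPrefixRegex()} reduces
     * the classKey to its topmost assigned-domain SURT prefix before
     * hashing (illustratively, a key such as {@code org,example,www,}
     * would reduce to {@code org,example,}), so all URIs of one
     * registrable domain map to the same crawler.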
     */
    public void setUsePublicSuffixesRegex(boolean usePublicSuffixes) {
        kp.put("usePublicSuffixesRegex", usePublicSuffixes);
    }

    {
        setReducePrefixRegex("");
    }
    public String getReducePrefixRegex() {
        return (String) kp.get("reducePrefixRegex");
    }
    /**
     * A regex pattern to apply to the classKey, using the first match as the
     * mapping key. If empty (the default), use the full classKey.
     *
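     * <p>For instance (an illustrative pattern, not from the original
     * source): a prefix regex such as {@code ^[^,]+,[^,]+,} would reduce a
     * SURT-ordered classKey like {@code org,example,www,} to
     * {@code org,example,}, grouping all of that domain's URIs on one
     * crawler.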
     */
    public void setReducePrefixRegex(String regex) {
        kp.put("reducePrefixRegex", regex);
    }

    /**
     * Constructor.
     */
    public HashCrawlMapper() {
        super();
    }

    /**
     * Look up the crawler node name to which the given CrawlURI
     * should be mapped.
     *
     * @param cauri CrawlURI to consider
     * @return String node name which should handle URI
     */
    protected String map(CrawlURI cauri) {
        // get classKey, via frontier to generate if necessary
        String key = frontier.getClassKey(cauri);
        String reduceRegex = getReduceRegex(cauri);
        return mapString(key, reduceRegex, getCrawlerCount());
    }

    protected String getReduceRegex(CrawlURI cauri) {
        if (getUsePublicSuffixesRegex()) {
            return PublicSuffixes.getTopmostAssignedSurtPrefixRegex();
        } else {
            return getReducePrefixRegex();
        }
    }

    public static String mapString(String key, String reducePattern,
            long bucketCount) {
        if (reducePattern != null && reducePattern.length() > 0) {
            // reduce the key to the first match of the pattern, if any
            Matcher matcher = TextUtils.getMatcher(reducePattern, key);
            if (matcher.find()) {
                key = matcher.group();
            }
            TextUtils.recycleMatcher(matcher);
        }
        // fingerprint-hash the (possibly reduced) key, then take it modulo
        // bucketCount; negate a negative remainder so the result is 0..N-1
        long fp = FPGenerator.std64.fp(key);
        long bucket = fp % bucketCount;
        return Long.toString(bucket >= 0 ? bucket : -bucket);
    }
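
    /**
     * Illustrative sketch only, not part of the original class: shows how
     * {@link #mapString(String, String, long)} distributes some hypothetical
     * classKeys across three crawler names.
     */
    public static void main(String[] args) {
        String[] keys = { "org,archive,", "com,example,", "org,example,www," };
        for (String key : keys) {
            // with 3 crawlers, each key deterministically maps to "0".."2"
            System.out.println(key + " -> " + mapString(key, "", 3));
        }
    }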
}