![JAR search and dependency download from the Maven repository](/logo.png)
org.archive.modules.deciderules.IpAddressSetDecideRule Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of heritrix-modules Show documentation
Show all versions of heritrix-modules Show documentation
This project contains some of the configurable modules used within the
Heritrix application to crawl the web. The modules in this project can
be used in applications other than Heritrix, however.
package org.archive.modules.deciderules;
import static org.archive.modules.CoreAttributeConstants.A_DNS_SERVER_IP_LABEL;
import java.net.InetAddress;
import java.util.Collections;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.archive.modules.CrawlURI;
import org.archive.modules.net.CrawlHost;
import org.archive.modules.net.ServerCache;
import org.springframework.beans.factory.annotation.Autowired;
/**
 * Decide rule whose predicate is true when the IP address associated with a
 * URI is a member of a configured set of addresses.
 *
 * <p>IpAddressSetDecideRule must be used with
 * {@code org.archive.crawler.prefetch.Preselector#setRecheckScope(boolean)} set
 * to true because it relies on Heritrix' dns lookup to establish the ip address
 * for a URI before it can run.
 *
 * <p>Example configuration:
 * <pre>{@code
 * <bean class="org.archive.modules.deciderules.IpAddressSetDecideRule">
 *  <property name="ipAddresses">
 *   <set>
 *    <value>127.0.0.1</value>
 *    <value>69.89.27.209</value>
 *   </set>
 *  </property>
 *  <property name='decision' value='REJECT' />
 * </bean>
 * }</pre>
 *
 * @author Travis Wellman &lt;[email protected]&gt;
 */
public class IpAddressSetDecideRule extends PredicatedDecideRule {

    private static final Logger logger =
            Logger.getLogger(IpAddressSetDecideRule.class.getName());

    private static final long serialVersionUID = -3670434739183271441L;

    /** Textual IP addresses to match; may be null until configured. */
    private Set<String> ipAddresses;

    /** Source of {@link CrawlHost} records used to find a URI's IP address. */
    protected transient ServerCache serverCache;

    /**
     * Returns a read-only view of the configured addresses.
     *
     * @return the addresses being matched; an empty set if none have been
     *         configured (never null)
     */
    public Set<String> getIpAddresses() {
        if (ipAddresses == null) {
            // Collections.unmodifiableSet(null) would throw; report "nothing
            // configured" as an empty set instead.
            return Collections.emptySet();
        }
        return Collections.unmodifiableSet(ipAddresses);
    }

    /**
     * Sets the addresses to match. The given set is stored by reference, not
     * copied.
     *
     * @param ipAddresses the addresses to match, in the textual form produced
     *        by {@link InetAddress#getHostAddress()}
     */
    public void setIpAddresses(Set<String> ipAddresses) {
        this.ipAddresses = ipAddresses;
    }

    /**
     * Tests whether the IP address determined for the given URI is one of the
     * configured addresses.
     *
     * @param curi the URI under consideration
     * @return true if an address could be determined and it is contained in
     *         the configured set; false otherwise, including when no
     *         addresses have been configured
     */
    @Override
    protected boolean evaluate(CrawlURI curi) {
        String hostAddress = getHostAddress(curi);
        Set<String> addresses = ipAddresses;
        // Set.contains compares with equals(), so no interning is needed.
        return hostAddress != null
                && addresses != null
                && addresses.contains(hostAddress);
    }

    /**
     * @return the server cache used for host lookups
     */
    public ServerCache getServerCache() {
        return this.serverCache;
    }

    /**
     * @param serverCache the server cache to use for host lookups
     */
    @Autowired
    public void setServerCache(ServerCache serverCache) {
        this.serverCache = serverCache;
    }

    /**
     * Determines the IP address to associate with a URI (from
     * WriterPoolProcessor).
     *
     * <p>For {@code dns:} URIs this is the value stored in the URI's data map
     * under {@code A_DNS_SERVER_IP_LABEL}; for all other URIs it is the IP
     * recorded on the {@link CrawlHost} obtained from the server cache.
     *
     * @param curi CrawlURI
     * @return String of IP address or null if unable to determine IP address
     */
    protected String getHostAddress(CrawlURI curi) {
        // special handling for DNS URIs: want address of DNS server.
        // equalsIgnoreCase is locale-independent and tolerates a null scheme.
        String scheme = curi.getUURI().getScheme();
        if ("dns".equalsIgnoreCase(scheme)) {
            return (String) curi.getData().get(A_DNS_SERVER_IP_LABEL);
        }
        // otherwise, host referenced in URI
        // TODO:FIXME: have fetcher insert exact IP contacted into curi,
        // use that rather than inferred by CrawlHost lookup
        String addr = null;
        try {
            CrawlHost crlh = getServerCache().getHostFor(curi.getUURI());
            if (crlh == null) {
                return null;
            }
            InetAddress inetadd = crlh.getIP();
            if (inetadd == null) {
                return null;
            }
            addr = inetadd.getHostAddress();
        } catch (Exception e) {
            // Deliberate best-effort: log error and continue (return null)
            logger.log(Level.WARNING, "Error looking up IP for URI "+curi.getURI(), e);
        }
        return addr;
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy