All downloads are free. The search and download functionality uses the official Maven repository.

org.archive.modules.deciderules.IpAddressSetDecideRule Maven / Gradle / Ivy

Go to download

This project contains some of the configurable modules used within the Heritrix application to crawl the web. The modules in this project can be used in applications other than Heritrix, however.

There is a newer version: 3.6.0
Show newest version
package org.archive.modules.deciderules;

import static org.archive.modules.CoreAttributeConstants.A_DNS_SERVER_IP_LABEL;

import java.net.InetAddress;
import java.util.Collections;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.archive.modules.CrawlURI;
import org.archive.modules.net.CrawlHost;
import org.archive.modules.net.ServerCache;
import org.springframework.beans.factory.annotation.Autowired;

/**
 * IpAddressSetDecideRule must be used with
 * org.archive.crawler.prefetch.Preselector#setRecheckScope(boolean) set
 * to true because it relies on Heritrix' dns lookup to establish the ip address
 * for a URI before it can run.
 * 
 * 
 * <bean class="org.archive.modules.deciderules.IpAddressSetDecideRule">
 *  <property name="ipAddresses">
 *   <set>
 *    <value>127.0.0.1</value>
 *    <value>69.89.27.209</value>
 *   </set>
 *  </property>
 *  <property name='decision' value='REJECT' />
 * </bean>
 * 
* * @author Travis Wellman <[email protected]> */ public class IpAddressSetDecideRule extends PredicatedDecideRule { private static final Logger logger = Logger.getLogger(IpAddressSetDecideRule.class.getName()); private static final long serialVersionUID = -3670434739183271441L; private Set ipAddresses; /** * @return the addresses being matched */ public Set getIpAddresses() { return Collections.unmodifiableSet(ipAddresses); } /** * @param ipAddresses the addresses to match */ public void setIpAddresses(Set ipAddresses) { this.ipAddresses = ipAddresses; } @Override protected boolean evaluate(CrawlURI curi) { String hostAddress = getHostAddress(curi); return hostAddress != null && ipAddresses.contains(hostAddress.intern()); } transient protected ServerCache serverCache; public ServerCache getServerCache() { return this.serverCache; } @Autowired public void setServerCache(ServerCache serverCache) { this.serverCache = serverCache; } /** * from WriterPoolProcessor * * @param curi CrawlURI * @return String of IP address or null if unable to determine IP address */ protected String getHostAddress(CrawlURI curi) { // special handling for DNS URIs: want address of DNS server if (curi.getUURI().getScheme().toLowerCase().equals("dns")) { return (String)curi.getData().get(A_DNS_SERVER_IP_LABEL); } // otherwise, host referenced in URI // TODO:FIXME: have fetcher insert exact IP contacted into curi, // use that rather than inferred by CrawlHost lookup String addr = null; try { CrawlHost crlh = getServerCache().getHostFor(curi.getUURI()); if (crlh == null) { return null; } InetAddress inetadd = crlh.getIP(); if (inetadd == null) { return null; } addr = inetadd.getHostAddress(); } catch (Exception e) { // Log error and continue (return null) logger.log(Level.WARNING, "Error looking up IP for URI "+curi.getURI(), e); } return addr; } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy