All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.archive.modules.deciderules.IpAddressSetDecideRule Maven / Gradle / Ivy

package org.archive.modules.deciderules;

import static org.archive.modules.CoreAttributeConstants.A_DNS_SERVER_IP_LABEL;

import java.net.InetAddress;
import java.util.Collections;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.archive.modules.CrawlURI;
import org.archive.modules.net.CrawlHost;
import org.archive.modules.net.ServerCache;
import org.springframework.beans.factory.annotation.Autowired;

/**
 * IpAddressSetDecideRule must be used with
 * org.archive.crawler.prefetch.Preselector#setRecheckScope(boolean) set
 * to true because it relies on Heritrix' dns lookup to establish the ip address
 * for a URI before it can run.
 * 
 * 
 * <bean class="org.archive.modules.deciderules.IpAddressSetDecideRule">
 *  <property name="ipAddresses">
 *   <set>
 *    <value>127.0.0.1</value>
 *    <value>69.89.27.209</value>
 *   </set>
 *  </property>
 *  <property name='decision' value='REJECT' />
 * </bean>
 * 
* * @author Travis Wellman <[email protected]> */ public class IpAddressSetDecideRule extends PredicatedDecideRule { private static final Logger logger = Logger.getLogger(IpAddressSetDecideRule.class.getName()); private static final long serialVersionUID = -3670434739183271441L; private Set ipAddresses; /** * @return the addresses being matched */ public Set getIpAddresses() { return Collections.unmodifiableSet(ipAddresses); } /** * @param ipAddresses the addresses to match */ public void setIpAddresses(Set ipAddresses) { this.ipAddresses = ipAddresses; } @Override protected boolean evaluate(CrawlURI curi) { String hostAddress = getHostAddress(curi); return hostAddress != null && ipAddresses.contains(hostAddress.intern()); } transient protected ServerCache serverCache; public ServerCache getServerCache() { return this.serverCache; } @Autowired public void setServerCache(ServerCache serverCache) { this.serverCache = serverCache; } /** * from WriterPoolProcessor * * @param curi CrawlURI * @return String of IP address or null if unable to determine IP address */ protected String getHostAddress(CrawlURI curi) { // special handling for DNS URIs: want address of DNS server if (curi.getUURI().getScheme().toLowerCase().equals("dns")) { return (String)curi.getData().get(A_DNS_SERVER_IP_LABEL); } // otherwise, host referenced in URI // TODO:FIXME: have fetcher insert exact IP contacted into curi, // use that rather than inferred by CrawlHost lookup String addr = null; try { CrawlHost crlh = getServerCache().getHostFor(curi.getUURI()); if (crlh == null) { return null; } InetAddress inetadd = crlh.getIP(); if (inetadd == null) { return null; } addr = inetadd.getHostAddress(); } catch (Exception e) { // Log error and continue (return null) logger.log(Level.WARNING, "Error looking up IP for URI "+curi.getURI(), e); } return addr; } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy