All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.archive.modules.fetcher.FetchWhois Maven / Gradle / Ivy

Go to download

This project contains some of the configurable modules used within the Heritrix application to crawl the web. The modules in this project can be used in applications other than Heritrix, however.

There is a newer version: 3.5.0
Show newest version
/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package org.archive.modules.fetcher;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;

import org.apache.commons.httpclient.URIException;
import org.apache.commons.net.whois.WhoisClient;
import org.archive.bdb.BdbModule;
import org.archive.modules.CoreAttributeConstants;
import org.archive.modules.CrawlURI;
import org.archive.modules.ProcessResult;
import org.archive.modules.Processor;
import org.archive.modules.extractor.Extractor;
import org.archive.modules.extractor.Hop;
import org.archive.modules.extractor.LinkContext;
import org.archive.modules.net.CrawlHost;
import org.archive.modules.net.ServerCache;
import org.archive.util.Recorder;
import org.archive.util.TextUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.Lifecycle;

import com.google.common.net.InternetDomainName;
import com.sleepycat.bind.tuple.IntegerBinding;
import com.sleepycat.bind.tuple.StringBinding;
import com.sleepycat.collections.StoredSortedMap;
import com.sleepycat.je.Database;
import com.sleepycat.je.DatabaseException;

/**
 * WHOIS Fetcher (RFC 3912). If this fetcher is enabled, Heritrix will attempt
 * WHOIS lookups on the topmost assigned domain and the IP address of each URL.
 * 
 * 

WHOIS URIs

*

* There is no pre-existing, canonical specification for WHOIS URIs. What * follows is the the format that Heritrix uses, which we propose for general * use. *

* *

* Syntax in ABNF as used in RFC 3986 Uniform Resource Identifier (URI): * Generic Syntax: *

* *
whoisurl = "whois:" [ "//" host [ ":" port ] "/" ] whoisquery
* *

* whoisquery is a url-encoded string. In ABNF, * whoisquery = 1*pchar where pchar is defined in RFC 3986. * host and port also as defined in RFC 3986. *

* *

* To resolve a WHOIS URI which specifies host[:port], open a TCP connection to * the host at the specified port (default 43), send the query (whoisquery, * url-decoded) followed by CRLF, and read the response until the server closes * the connection. For more details see RFC 3912. *

* *

* Resolution of a "serverless" WHOIS URI, which does not specify host[:port], * is implementation-dependent. *

* *

Serverless WHOIS URIs in Heritrix

* *

* For each non-WHOIS URI processed which has an authority, FetchWhois adds 1 or * 2 serverless WHOIS URIs to the CrawlURI's outlinks. These are * "whois:{ipAddress}" and, if the authority includes a hostname, * "whois:{topLevelDomain}". See {@link #addWhoisLinks(CrawlURI)}. *

* *

* Heritrix resolves serverless WHOIS URIs by first querying an initial server, * then following referrals to other servers. In pseudocode: * *

 * if query is an IPv4 address
 *     resolve whois://{@link #DEFAULT_IP_WHOIS_SERVER}/whoisquery
 * else
 *     let domainSuffix = part of query after the last '.' (or the whole query if no '.'), url-encoded
 *     resolve whois://{@link #ULTRA_SUFFIX_WHOIS_SERVER}/domainSuffix
 * 
 * while last response refers to another server, i.e. matches regex {@link #WHOIS_SERVER_REGEX}
 *     if we have a special query formatting rule for this whois server, apply it - see {@link #specialQueryTemplates}
 *     resolve whois://referralServer/whoisquery
 * 
* *

See {@link #deferOrFinishGeneric(CrawlURI, String)}

* * @author nlevitt */ public class FetchWhois extends Processor implements CoreAttributeConstants, FetchStatusCodes, Lifecycle { @SuppressWarnings("unused") private static final long serialVersionUID = 1L; private static Logger logger = Logger.getLogger(FetchWhois.class.getName()); public static final String IP_ADDRESS_REGEX = "\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}"; // North America-centric, but it should refer us to the right server // e.g. "ReferralServer: whois://whois.apnic.net" protected static final String DEFAULT_IP_WHOIS_SERVER = "whois.arin.net"; // look up "com" "net" "fr" "info" etc protected static final String ULTRA_SUFFIX_WHOIS_SERVER = "whois.iana.org"; // [whois://whois.arin.net/192.102.239.53] ReferralServer: whois://whois.apnic.net // [whois://whois.arin.net/208.49.199.10] ReferralServer: rwhois://rwhois.gblx.net:4321 // [whois://whois.arin.net/195.154.120.129] ReferralServer: whois://whois.ripe.net:43 // (obsolete) [whois://whois.iana.org/fr] Whois Server (port 43): whois.nic.fr // [whois://whois.iana.org/fr] whois: whois.nic.fr // [whois://whois.verisign-grs.com/domain%201stbattalion9thmarinesfirebase.net] Whois Server: whois.fastdomain.com // (false positive fixed) WHOIS lookup made at 23:48:04 13-Jan-2011 protected static String WHOIS_SERVER_REGEX = "(?i)^\\s*(?:whois server|ReferralServer|whois)[^:]*:.*?([a-zA-Z0-9-]+\\.[a-zA-Z0-9.:-]+)/*$"; protected enum UrlStatus {IN_PROGRESS, DONE}; private transient Database whoisDb; private transient StoredSortedMap referralServers; private transient StoredSortedMap urlProgress; protected BdbModule bdb; @Autowired public void setBdbModule(BdbModule bdb) { this.bdb = bdb; } protected Map specialQueryTemplates; { // Default special templates. Keep commented out section of // profile-crawler-beans.xml in synch with this. 
specialQueryTemplates = new HashMap(); specialQueryTemplates.put("whois.verisign-grs.com", "domain %s"); specialQueryTemplates.put("whois.arin.net", "z + %s"); specialQueryTemplates.put("whois.denic.de", "-T dn %s"); } public void setSpecialQueryTemplates(Map m) { this.specialQueryTemplates.clear(); this.specialQueryTemplates.putAll(m); } /** * If the socket is unresponsive for this number of milliseconds, give up. * Set to zero for no timeout (Not. recommended. Could hang a thread on an * unresponsive server). This timeout is used timing out socket opens and * for timing out each socket read. Make sure this value is < * {@link #TIMEOUT_SECONDS} for optimal configuration: ensures at least one * retry read. */ { setSoTimeoutMs(20*1000); // 20 seconds } public int getSoTimeoutMs() { return (Integer) kp.get("soTimeoutMs"); } public void setSoTimeoutMs(int timeout) { kp.put("soTimeoutMs",timeout); } private boolean isRunning = false; public void start() { if(isRunning()) { return; } try { BdbModule.BdbConfig dbConfig = new BdbModule.BdbConfig(); dbConfig.setTransactional(false); boolean isRecovery = recoveryCheckpoint != null; dbConfig.setAllowCreate(!isRecovery); whoisDb = bdb.openDatabase("whoisKnowledge", dbConfig, isRecovery); referralServers = new StoredSortedMap(whoisDb, new StringBinding(), new StringBinding(), true); urlProgress = new StoredSortedMap(whoisDb, new StringBinding(), new IntegerBinding(), true); } catch (DatabaseException e) { throw new RuntimeException(e); } isRunning = true; } public boolean isRunning() { return isRunning; } public void stop() { isRunning = false; // BdbModule will handle closing of DB // XXX happens at finish; move to teardown? 
bdb = null; } @Override protected ProcessResult innerProcessResult(CrawlURI curi) throws InterruptedException { if (curi.getUURI().getScheme().equals("whois")) { curi.setFetchBeginTime(System.currentTimeMillis()); String whoisServer = getWhoisServer(curi); String whoisQuery = getWhoisQuery(curi); if (whoisServer == null) { // e.g. whois:foo.org ProcessResult ret = deferOrFinishGeneric(curi, whoisQuery); return ret; } else { // e.g. whois://whois.pir.org/foo.org fetch(curi, whoisServer, whoisQuery); return ProcessResult.PROCEED; } } else { addWhoisLinks(curi); return ProcessResult.PROCEED; } } // handle serverless whois url protected ProcessResult deferOrFinishGeneric(CrawlURI curi, String domainOrIp) { String tryThis = null; String ultraSuffix = domainOrIp.substring(domainOrIp.lastIndexOf('.') + 1).toLowerCase(); if (referralServers.containsKey(domainOrIp)) { tryThis = "whois://" + referralServers.get(domainOrIp) + '/' + domainOrIp; } else if (TextUtils.getMatcher(IP_ADDRESS_REGEX, domainOrIp).matches()) { tryThis = makeWhoisUrl(DEFAULT_IP_WHOIS_SERVER, domainOrIp); } else if (referralServers.containsKey(ultraSuffix)) { tryThis = makeWhoisUrl(referralServers.get(ultraSuffix), domainOrIp); } else if (urlProgress.get(makeWhoisUrl(ULTRA_SUFFIX_WHOIS_SERVER, ultraSuffix)) == null) { tryThis = makeWhoisUrl(ULTRA_SUFFIX_WHOIS_SERVER, ultraSuffix); } else { logger.warning("apparently no whois server for \"" + domainOrIp + "\""); curi.setFetchStatus(S_OTHER_PREREQUISITE_FAILURE); return ProcessResult.PROCEED; } assert(tryThis != null); Integer progress = urlProgress.get(tryThis); if (tryThis == null || (progress != null && progress == UrlStatus.DONE.ordinal())) { if (logger.isLoggable(Level.FINE)) { logger.fine("finished with generic serverless whois uri " + curi); } curi.setFetchStatus(S_WHOIS_GENERIC_FINISHED); return ProcessResult.PROCEED; } else { if (progress == null) { try { if (logger.isLoggable(Level.FINE)) { logger.fine(curi + " marking prerequisite " + tryThis + 
" and deferring"); } CrawlURI caUri = curi.markPrerequisite(tryThis); caUri.setForceFetch(false); urlProgress.put(tryThis, UrlStatus.IN_PROGRESS.ordinal()); } catch (URIException e) { throw new RuntimeException(e); } } else { // this shouldn't happen because prereqs should be crawled before we're attempted again curi.incrementDeferrals(); curi.setFetchStatus(S_DEFERRED); if (logger.isLoggable(Level.FINE)) { logger.fine(curi + ": prerequisite " + tryThis + " is in progress, deferring"); } } return ProcessResult.FINISH; } } protected String makeWhoisUrl(String server, String principal) { try { String query; String template = specialQueryTemplates.get(server.toLowerCase()); if (template != null) { query = template.replaceAll("%s", principal); } else { query = principal; } return "whois://" + server + "/" + URLEncoder.encode(query, "UTF-8"); } catch (UnsupportedEncodingException e) { throw new RuntimeException(e); } } protected void fetch(CrawlURI curi, String whoisServer, String whoisQuery) { WhoisClient client = new WhoisClient(); Recorder recorder = curi.getRecorder(); try { client.setConnectTimeout(getSoTimeoutMs()); client.setDefaultTimeout(getSoTimeoutMs()); if (curi.getUURI().getPort() > 0) { client.connect(whoisServer, curi.getUURI().getPort()); } else { client.connect(whoisServer); } client.setSoTimeout(getSoTimeoutMs()); // must be after connect() curi.getData().put(CoreAttributeConstants.A_WHOIS_SERVER_IP, client.getRemoteAddress().getHostAddress()); recorder.inputWrap(client.getInputStream(whoisQuery)); // look for info about whois server in the response // XXX run regex on the whole thing, rather than line by line? 
BufferedReader reader = new BufferedReader(new InputStreamReader(recorder.getRecordedInput(), "ASCII")); for (String line = reader.readLine(); line != null; line = reader.readLine()) { Matcher matcher = TextUtils.getMatcher(WHOIS_SERVER_REGEX, line); if (matcher.find()) { // gets rid of "domain " for whois.verisign-grs.com queries String key = whoisQuery.replaceFirst("(\\S+\\s+)+", "").toLowerCase(); referralServers.put(key, matcher.group(1).toLowerCase()); if (logger.isLoggable(Level.FINE)) { logger.fine("added referral server " + matcher.group(1) + " to server list for " + key); } } } curi.setContentType("text/plain"); curi.setFetchStatus(S_WHOIS_SUCCESS); } catch (IOException e) { if (logger.isLoggable(Level.FINE)) { logger.fine("failed to connect to whois server for uri " + curi + ": " + e); } curi.getNonFatalFailures().add(e); curi.setFetchStatus(S_CONNECT_FAILED); } finally { recorder.close(); curi.setContentSize(recorder.getRecordedInput().getSize()); logger.fine(curi + ": " + recorder.getRecordedInput().getSize() + " bytes read"); if (client != null && client.isConnected()) try { client.disconnect(); } catch (IOException e) { logger.fine("problem closing connection to whois server for uri " + curi + ": " + e); } urlProgress.put(curi.toString(), UrlStatus.DONE.ordinal()); } } protected String getWhoisQuery(CrawlURI curi) { try { if (curi.getUURI().getAuthority() == null) { // whois:archive-it.org - returns "archive-it.org" return curi.getUURI().getPathQuery(); } else { // whois://whois.pir.org/archive-it.org - getPathQuery() returns"/archive-it.org", so chop off "/" return curi.getUURI().getPathQuery().substring(1); } } catch (URIException e) { logger.log(Level.SEVERE, "Failed to get path/query from uri " + curi, e); return null; } } protected String getWhoisServer(CrawlURI curi) { String whoisServer = null; try { whoisServer = curi.getUURI().getHost(); if (whoisServer != null && whoisServer.length() == 0) { whoisServer = null; } } catch (URIException e) { 
logger.warning("Failed to get host from uri " + curi + ": " + e); whoisServer = null; } return whoisServer; } @Override protected boolean shouldProcess(CrawlURI uri) { // process all uris - non-whois uris get whois outlinks added return true; } protected ServerCache serverCache; public ServerCache getServerCache() { return this.serverCache; } @Autowired public void setServerCache(ServerCache serverCache) { this.serverCache = serverCache; } protected void addWhoisLink(CrawlURI curi, String query) { String whoisUrl = "whois:" + query; try { Extractor.add(curi, Integer.MAX_VALUE, whoisUrl, LinkContext.INFERRED_MISC, Hop.INFERRED); } catch (URIException e) { logger.log(Level.WARNING, "problem with url " + whoisUrl, e); } } /** * Adds outlinks to whois:{domain} and whois:{ipAddress} */ protected void addWhoisLinks(CrawlURI curi) throws InterruptedException { CrawlHost ch = serverCache.getHostFor(curi.getUURI()); if (ch == null) { return; } if (ch.getIP() != null) { // do a whois lookup on the ip address addWhoisLink(curi, ch.getIP().getHostAddress()); } if (InternetDomainName.isValid(ch.getHostName())) { // do a whois lookup on the domain try { String topmostAssigned = InternetDomainName.from(ch.getHostName()).topPrivateDomain().toString(); addWhoisLink(curi, topmostAssigned); } catch (IllegalStateException e) { // java.lang.IllegalStateException: Not under a public suffix: mod.uk logger.warning("problem resolving topmost assigned domain, will try whois lookup on the plain hostname " + ch.getHostName() + " - " + e); addWhoisLink(curi, ch.getHostName()); } } } @Override protected void innerProcess(CrawlURI uri) throws InterruptedException { throw new RuntimeException("this method shouldn't be called - should use innerProcessResult()"); } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy