/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.crawler.prefetch;
import static org.archive.modules.fetcher.FetchStatusCodes.S_DOMAIN_PREREQUISITE_FAILURE;
import static org.archive.modules.fetcher.FetchStatusCodes.S_ROBOTS_PRECLUDED;
import static org.archive.modules.fetcher.FetchStatusCodes.S_ROBOTS_PREREQUISITE_FAILURE;
import static org.archive.modules.fetcher.FetchStatusCodes.S_UNFETCHABLE_URI;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.httpclient.URIException;
import org.archive.crawler.reporting.CrawlerLoggerModule;
import org.archive.modules.CrawlMetadata;
import org.archive.modules.CrawlURI;
import org.archive.modules.ProcessResult;
import org.archive.modules.Processor;
import org.archive.modules.credential.Credential;
import org.archive.modules.credential.CredentialStore;
import org.archive.modules.net.CrawlHost;
import org.archive.modules.net.CrawlServer;
import org.archive.modules.net.RobotsPolicy;
import org.archive.modules.net.ServerCache;
import org.archive.net.UURI;
import org.springframework.beans.factory.annotation.Autowired;
/**
* Ensures the preconditions for a fetch -- such as DNS lookup
* or acquiring and respecting a robots.txt policy -- are
* satisfied before a URI is passed to subsequent stages.
*
* @author gojomo
*/
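/*
 * Usage sketch (illustrative only; the variable names and surrounding
 * setup are assumptions, not taken from this file). The processor is
 * normally declared as a Spring bean in a crawl configuration and
 * autowired, but equivalent programmatic wiring would look like:
 *
 *   PreconditionEnforcer preconditions = new PreconditionEnforcer();
 *   preconditions.setMetadata(metadata);         // a CrawlMetadata bean
 *   preconditions.setServerCache(serverCache);   // a ServerCache bean
 *   preconditions.setLoggerModule(loggerModule); // a CrawlerLoggerModule bean
 *   preconditions.setIpValidityDurationSeconds(6 * 60 * 60);      // default
 *   preconditions.setRobotsValidityDurationSeconds(24 * 60 * 60); // default
 */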
public class PreconditionEnforcer extends Processor {
@SuppressWarnings("unused")
private static final long serialVersionUID = 3L;
private static final Logger logger =
Logger.getLogger(PreconditionEnforcer.class.getName());
{
setIpValidityDurationSeconds(6*60*60); // 6 hours
}
public int getIpValidityDurationSeconds() {
return (Integer) kp.get("ipValidityDurationSeconds");
}
/**
* The minimum interval for which a dns-record will be considered
* valid (in seconds). If the record's DNS TTL is larger, that will
* be used instead.
*/
public void setIpValidityDurationSeconds(int duration) {
kp.put("ipValidityDurationSeconds",duration);
}
{
setRobotsValidityDurationSeconds(24*60*60); // 24 hours
}
public int getRobotsValidityDurationSeconds() {
return (Integer) kp.get("robotsValidityDurationSeconds");
}
/**
* The time in seconds that fetched robots.txt information is considered to
* be valid. If the value is set to '0', then the robots.txt information
* will never expire.
*/
public void setRobotsValidityDurationSeconds(int duration) {
kp.put("robotsValidityDurationSeconds",duration);
}
{
setCalculateRobotsOnly(false);
}
public boolean getCalculateRobotsOnly() {
return (Boolean) kp.get("calculateRobotsOnly");
}
/**
 * Whether to only calculate the robots status of a URI, without actually
 * applying any exclusions found. If true, excluded URIs will only be
 * annotated in the crawl.log, but still fetched. Default is false.
 */
public void setCalculateRobotsOnly(boolean calcOnly) {
kp.put("calculateRobotsOnly",calcOnly);
}
/**
 * Auto-discovered module providing the configured (or overridden)
 * User-Agent value and RobotsPolicy.
 */
protected CrawlMetadata metadata;
public CrawlMetadata getMetadata() {
return metadata;
}
@Autowired
public void setMetadata(CrawlMetadata provider) {
this.metadata = provider;
}
{
// initialize with empty store so declaration not required
setCredentialStore(new CredentialStore());
}
public CredentialStore getCredentialStore() {
return (CredentialStore) kp.get("credentialStore");
}
@Autowired(required=false)
public void setCredentialStore(CredentialStore credentials) {
kp.put("credentialStore",credentials);
}
protected ServerCache serverCache;
public ServerCache getServerCache() {
return this.serverCache;
}
@Autowired
public void setServerCache(ServerCache serverCache) {
this.serverCache = serverCache;
}
protected CrawlerLoggerModule loggerModule;
public CrawlerLoggerModule getLoggerModule() {
return this.loggerModule;
}
@Autowired
public void setLoggerModule(CrawlerLoggerModule loggerModule) {
this.loggerModule = loggerModule;
}
public PreconditionEnforcer() {
super();
}
@Override
protected boolean shouldProcess(CrawlURI puri) {
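// The parameter is already typed as CrawlURI, so this test is
// trivially true for any non-null input: every CrawlURI offered to
// this processor is eligible.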
return (puri instanceof CrawlURI);
}
@Override
protected void innerProcess(CrawlURI puri) {
throw new AssertionError();
}
@Override
protected ProcessResult innerProcessResult(CrawlURI puri) {
CrawlURI curi = puri;
if (considerDnsPreconditions(curi)) {
return ProcessResult.FINISH;
}
// make sure we only process schemes we understand (i.e. not dns)
String scheme = curi.getUURI().getScheme().toLowerCase();
if (! (scheme.equals("http") || scheme.equals("https"))) {
logger.fine("PolitenessEnforcer doesn't understand uri's of type " +
scheme + " (ignoring)");
return ProcessResult.PROCEED;
}
if (considerRobotsPreconditions(curi)) {
return ProcessResult.FINISH;
}
if (!curi.isPrerequisite() && credentialPrecondition(curi)) {
return ProcessResult.FINISH;
}
// OK, it's allowed
// For all curis that will in fact be fetched, set appropriate delays.
// TODO: SOMEDAY: allow per-host, per-protocol, etc. factors
// curi.setDelayFactor(getDelayFactorFor(curi));
// curi.setMinimumDelay(getMinimumDelayFor(curi));
return ProcessResult.PROCEED;
}
/**
 * Consider the robots precondition.
 *
 * @param curi CrawlURI we're checking for any required preconditions.
 * @return true if this curi has a precondition or processing
 * should be terminated for some other reason; false if
 * we can proceed to process this URI.
 */
protected boolean considerRobotsPreconditions(CrawlURI curi) {
// treat /robots.txt fetches specially
UURI uuri = curi.getUURI();
try {
if (uuri != null && uuri.getPath() != null &&
uuri.getPath().equals("/robots.txt")) {
// allow processing to continue
curi.setPrerequisite(true);
return false;
}
} catch (URIException e) {
logger.severe("Failed get of path for " + curi);
}
CrawlServer cs = serverCache.getServerFor(curi.getUURI());
// require /robots.txt if not present
if (cs.isRobotsExpired(getRobotsValidityDurationSeconds())) {
// Need to get robots
if (logger.isLoggable(Level.FINE)) {
logger.fine( "No valid robots for " + cs +
"; deferring " + curi);
}
// Robots expired - should be refetched even though it's already
// crawled.
try {
String prereq = curi.getUURI().resolve("/robots.txt").toString();
curi.markPrerequisite(prereq);
}
catch (URIException e1) {
logger.severe("Failed resolve using " + curi);
throw new RuntimeException(e1); // shouldn't ever happen
}
return true;
}
// test against robots.txt if available
if (cs.isValidRobots()) {
String ua = metadata.getUserAgent();
RobotsPolicy robots = metadata.getRobotsPolicy();
if(!robots.allows(ua, curi, cs.getRobotstxt())) {
if(getCalculateRobotsOnly()) {
// annotate URI as excluded, but continue to process normally
curi.getAnnotations().add("robotExcluded");
return false;
}
// mark as precluded; in FetchHTTP, this will
// prevent fetching and cause a skip to the end
// of processing (unless an intervening processor
// overrules)
curi.setFetchStatus(S_ROBOTS_PRECLUDED);
curi.setError("robots.txt exclusion");
logger.fine("robots.txt precluded " + curi);
return true;
}
return false;
}
// No valid robots found => Attempt to get robots.txt failed
// curi.skipToPostProcessing();
curi.setFetchStatus(S_ROBOTS_PREREQUISITE_FAILURE);
curi.setError("robots.txt prerequisite failed");
if (logger.isLoggable(Level.FINE)) {
logger.fine("robots.txt prerequisite failed " + curi);
}
return true;
}
/**
* @param curi CrawlURI whose dns prerequisite we're to check.
* @return true if no further processing in this module should occur
*/
protected boolean considerDnsPreconditions(CrawlURI curi) {
if(curi.getUURI().getScheme().equals("dns")){
// DNS URIs never have a DNS precondition
curi.setPrerequisite(true);
return false;
} else if (curi.getUURI().getScheme().equals("whois")) {
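// whois URIs are let through without a dns: prerequisite;
// this module enforces no DNS precondition for them.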
return false;
}
CrawlServer cs = serverCache.getServerFor(curi.getUURI());
if(cs == null) {
curi.setFetchStatus(S_UNFETCHABLE_URI);
// curi.skipToPostProcessing();
return true;
}
// If we've done a dns lookup and it didn't resolve a host
// cancel further fetch-processing of this URI, because
// the domain is unresolvable
CrawlHost ch = serverCache.getHostFor(curi.getUURI());
if (ch == null || (ch.getIP() == null && !isIpExpired(curi))) {
if (logger.isLoggable(Level.FINE)) {
logger.fine("no dns for " + ch +
"; cancelling processing for CrawlURI " + curi);
}
curi.setFetchStatus(S_DOMAIN_PREREQUISITE_FAILURE);
// curi.skipToPostProcessing();
return true;
}
// If we haven't done a dns lookup and this isn't a dns uri
// shoot that off and defer further processing
if (isIpExpired(curi) && !curi.getUURI().getScheme().equals("dns")) {
logger.fine("Deferring processing of CrawlURI " + curi.toString()
+ " for dns lookup.");
String preq = "dns:" + ch.getHostName();
try {
curi.markPrerequisite(preq);
} catch (URIException e) {
throw new RuntimeException(e); // shouldn't ever happen
}
return true;
}
// DNS preconditions OK
return false;
}
/** Return true if ip should be looked up.
*
* @param curi the URI to check.
* @return true if ip should be looked up.
*/
public boolean isIpExpired(CrawlURI curi) {
CrawlHost host = serverCache.getHostFor(curi.getUURI());
if (!host.hasBeenLookedUp()) {
// IP has not been looked up yet.
return true;
}
if (host.getIpTTL() == CrawlHost.IP_NEVER_EXPIRES) {
// IP never expires (numeric IP)
return false;
}
long duration = getIpValidityDurationSeconds();
if (duration == 0) {
// Never expire the IP if the duration is zero (set by the operator,
// or set to zero where an attempted lookup in FetchDNS failed).
return false;
}
long ttl = host.getIpTTL();
if (ttl > duration) {
// Use the larger of the operator-set minimum duration
// or the DNS record TTL
duration = ttl;
}
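// Worked example: with the 6-hour default and a DNS TTL of 1 hour, a
// record fetched at time t is looked up again after t + 6h; with a
// TTL of 12 hours, not until t + 12h.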
// Duration and ttl are in seconds. Convert to millis.
if (duration > 0) {
duration *= 1000;
}
return (duration + host.getIpFetched()) < System.currentTimeMillis();
}
/**
 * Consider credential preconditions.
 *
 * Looks to see if there are any credential preconditions (e.g. html form
 * login credentials) for this CrawlServer. If there are, have they
 * been run already? If not, make the running of these logins a
 * precondition of accessing any other url on this CrawlServer.
 *
 * One day, do optimization and avoid running the bulk of the code below.
 * The argument for running the code every time is that overrides and
 * refinements may change what comes back from the credential store.
 *
 * @param curi CrawlURI we're checking for any required preconditions.
 * @return true if this curi has a precondition that needs to be met
 * before we can proceed; false if we can proceed to process this url.
 */
public boolean credentialPrecondition(final CrawlURI curi) {
boolean result = false;
CredentialStore cs = getCredentialStore();
if (cs == null) {
logger.severe("No credential store for " + curi);
return result;
}
for (Credential c: cs.getAll()) {
if (c.isPrerequisite(curi)) {
// This credential has a prerequisite and this curi is it. Let it
// through. Attach the credential to the curi as a mark, and flag
// the curi for POSTing. Note: we test for "is it a prereq" BEFORE
// checking that the curi is of the credential domain, because
// sites such as Yahoo send you to another domain altogether to
// log in.
logger.fine("attaching credential and setting fetch type to POST for recognized form login url " + curi);
c.attach(curi);
curi.setFetchType(CrawlURI.FetchType.HTTP_POST);
break;
}
if (!c.rootUriMatch(serverCache, curi)) {
continue;
}
if (!c.hasPrerequisite(curi)) {
continue;
}
if (!authenticated(c, curi)) {
// Hasn't been authenticated. Queue it and move on. (The
// assumption is that we can do one authentication at a time --
// usually one html form.)
String prereq = c.getPrerequisite(curi);
if (prereq == null || prereq.length() <= 0) {
CrawlServer server = serverCache.getServerFor(curi.getUURI());
logger.severe(server.getName() + " has credential(s) of type "
+ c + " but the prereq is null.");
} else {
try {
curi.markPrerequisite(prereq);
} catch (URIException e) {
logger.severe("unable to set credentials prerequisite "+prereq);
loggerModule.logUriError(e,curi.getUURI(),prereq);
return false;
}
result = true;
if (logger.isLoggable(Level.FINE)) {
logger.fine("Queueing prereq " + prereq + " of type " +
c + " for " + curi);
}
break;
}
}
}
return result;
}
/**
 * Has the passed credential already been authenticated?
 *
 * @param credential Credential to test.
 * @param curi CrawlURI.
 * @return True if already run.
 */
protected boolean authenticated(final Credential credential, final CrawlURI curi) {
CrawlServer server = serverCache.getServerFor(curi.getUURI());
if (!server.hasCredentials()) {
return false;
}
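// A credential counts as authenticated when the server already holds
// a credential of the same concrete type under the same key.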
Set<Credential> credentials = server.getCredentials();
for (Credential cred: credentials) {
if (cred.getKey().equals(credential.getKey())
&& cred.getClass().isInstance(credential)) {
return true;
}
}
return false;
}
}