org.archive.modules.fetcher.FetchHTTP (Maven / Gradle / Ivy)

This project contains some of the configurable modules used within the Heritrix application to crawl the web. These modules can, however, also be used in applications other than Heritrix.

/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.modules.fetcher;

import static org.archive.modules.CrawlURI.FetchType.HTTP_POST;
import static org.archive.modules.fetcher.FetchErrors.LENGTH_TRUNC;
import static org.archive.modules.fetcher.FetchErrors.TIMER_TRUNC;
import static org.archive.modules.fetcher.FetchStatusCodes.S_CONNECT_FAILED;
import static org.archive.modules.fetcher.FetchStatusCodes.S_CONNECT_LOST;
import static org.archive.modules.fetcher.FetchStatusCodes.S_DOMAIN_PREREQUISITE_FAILURE;
import static org.archive.modules.fetcher.FetchStatusCodes.S_UNFETCHABLE_URI;
import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_REFERENCE_LENGTH;

import java.io.IOException;
import java.nio.charset.Charset;
import java.security.MessageDigest;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.net.ssl.SSLContext;
import javax.net.ssl.TrustManager;

import org.apache.commons.httpclient.URIException;
import org.apache.commons.lang.StringUtils;
import org.apache.http.Header;
import org.apache.http.HttpHeaders;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.HttpVersion;
import org.apache.http.ProtocolVersion;
import org.apache.http.auth.AuthScheme;
import org.apache.http.auth.AuthSchemeProvider;
import org.apache.http.auth.MalformedChallengeException;
import org.apache.http.client.AuthenticationStrategy;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.config.AuthSchemes;
import org.apache.http.client.methods.AbstractExecutionAwareRequest;
import org.apache.http.config.Lookup;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.entity.ContentType;
import org.apache.http.impl.auth.BasicSchemeFactory;
import org.apache.http.impl.auth.DigestSchemeFactory;
import org.apache.http.impl.client.ProxyAuthenticationStrategy;
import org.apache.http.impl.client.TargetAuthenticationStrategy;
import org.apache.http.message.BasicHeader;
import org.archive.httpclient.ConfigurableX509TrustManager;
import org.archive.httpclient.ConfigurableX509TrustManager.TrustLevel;
import org.archive.io.RecorderLengthExceededException;
import org.archive.io.RecorderTimeoutException;
import org.archive.modules.CrawlURI;
import org.archive.modules.Processor;
import org.archive.modules.credential.Credential;
import org.archive.modules.credential.CredentialStore;
import org.archive.modules.credential.HttpAuthenticationCredential;
import org.archive.modules.deciderules.AcceptDecideRule;
import org.archive.modules.deciderules.DecideResult;
import org.archive.modules.deciderules.DecideRule;
import org.archive.modules.net.CrawlHost;
import org.archive.modules.net.CrawlServer;
import org.archive.modules.net.ServerCache;
import org.archive.util.Recorder;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.Lifecycle;

/**
 * HTTP fetcher that uses Apache HttpComponents.
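 * <p>
 * A minimal, illustrative Spring bean definition (the bean id and values
 * shown here are assumptions, not recommendations; property names mirror
 * the setters on this class):
 * <pre>
 * &lt;bean id="fetchHttp" class="org.archive.modules.fetcher.FetchHTTP"&gt;
 *   &lt;property name="timeoutSeconds" value="1200"/&gt;
 *   &lt;property name="maxLengthBytes" value="0"/&gt;
 *   &lt;property name="sendConnectionClose" value="true"/&gt;
 * &lt;/bean&gt;
 * </pre>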
 * @author nlevitt
 */
public class FetchHTTP extends Processor implements Lifecycle {
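    // Most settings are stored in the keyed-properties map "kp" inherited
    // from the module base class; the instance-initializer blocks below set
    // their defaults.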

    private static Logger logger = Logger.getLogger(FetchHTTP.class.getName());

    public static final String HTTP_SCHEME = "http";
    public static final String HTTPS_SCHEME = "https";
    
    // Registry of the supported HTTP auth schemes (basic and digest),
    // consulted when handling 401/407 auth challenges.
    protected static final Lookup<AuthSchemeProvider> AUTH_SCHEME_REGISTRY;
    static {
        RegistryBuilder<AuthSchemeProvider> b = RegistryBuilder.create();
        b.register(AuthSchemes.BASIC, new BasicSchemeFactory());
        b.register(AuthSchemes.DIGEST, new DigestSchemeFactory());
        AUTH_SCHEME_REGISTRY = b.build();
    }

    protected ServerCache serverCache;
    public ServerCache getServerCache() {
        return this.serverCache;
    }
    /**
     * Used to do DNS lookups.
     */
    @Autowired
    public void setServerCache(ServerCache serverCache) {
        this.serverCache = serverCache;
    }

    {
        setDigestContent(true);
    }
    public boolean getDigestContent() {
        return (Boolean) kp.get("digestContent");
    }
    /**
     * Whether or not to perform an on-the-fly digest hash of retrieved
     * content-bodies.
     */
    public void setDigestContent(boolean digest) {
        kp.put("digestContent",digest);
    }
 
    protected String digestAlgorithm = "sha1";
    public String getDigestAlgorithm() {
        return digestAlgorithm;
    }
    /**
     * Which algorithm (for example MD5 or SHA-1) to use to perform an
     * on-the-fly digest hash of retrieved content-bodies.
     */
    public void setDigestAlgorithm(String digestAlgorithm) {
        this.digestAlgorithm = digestAlgorithm;
    }

    public UserAgentProvider getUserAgentProvider() {
        return (UserAgentProvider) kp.get("userAgentProvider");
    }
    @Autowired
    public void setUserAgentProvider(UserAgentProvider provider) {
        kp.put("userAgentProvider",provider);
    }

    {
        setSendConnectionClose(true);
    }
    public boolean getSendConnectionClose() {
        return (Boolean) kp.get("sendConnectionClose");
    }
    /**
     * Send 'Connection: close' header with every request.
     */
    public void setSendConnectionClose(boolean sendClose) {
        kp.put("sendConnectionClose",sendClose);
    }
    
    {
        setDefaultEncoding("ISO-8859-1");
    }
    public String getDefaultEncoding() {
        return getDefaultCharset().name();
    }
    /**
     * The character encoding to use for files that do not have one specified in
     * the HTTP response headers. Default: ISO-8859-1.
     */
    public void setDefaultEncoding(String encoding) {
        kp.put("defaultEncoding",Charset.forName(encoding));
    }
    public Charset getDefaultCharset() {
        return (Charset)kp.get("defaultEncoding");
    }

    {
        setUseHTTP11(false);
    }
    public boolean getUseHTTP11() {
        return (Boolean) kp.get("useHTTP11");
    }
    /**
     * Use HTTP/1.1. Note: even when offering an HTTP/1.1 request, 
     * Heritrix may not properly handle persistent/keep-alive connections, 
     * so the sendConnectionClose parameter should remain 'true'. 
     */
    public void setUseHTTP11(boolean useHTTP11) {
        kp.put("useHTTP11",useHTTP11);
    }

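    /** The protocol version offered on requests, per {@link #getUseHTTP11()}. */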
    protected ProtocolVersion getConfiguredHttpVersion() {
        if (getUseHTTP11()) {
            return HttpVersion.HTTP_1_1;
        } else {
            return HttpVersion.HTTP_1_0;
        }
    }

    {
        setIgnoreCookies(false);
    }
    public boolean getIgnoreCookies() {
        return (Boolean) kp.get("ignoreCookies");
    }
    /**
     * Disable cookie handling.
     */
    public void setIgnoreCookies(boolean ignoreCookies) {
        kp.put("ignoreCookies",ignoreCookies);
    }

    {
        setSendReferer(true);
    }
    public boolean getSendReferer() {
        return (Boolean) kp.get("sendReferer");
    }
    /**
     * Send 'Referer' header with every request.
     * <p>
     * The 'Referer' header contains the location the crawler came from, the
     * page the current URI was discovered in. The 'Referer' is usually
     * logged on the remote server and can be of assistance to webmasters
     * trying to figure out how a crawler got to a particular area on a site.
     */
    public void setSendReferer(boolean sendReferer) {
        kp.put("sendReferer", sendReferer);
    }

    {
        setAcceptCompression(false);
    }
    public boolean getAcceptCompression() {
        return (Boolean) kp.get("acceptCompression");
    }
    /**
     * Set headers to accept compressed responses.
     */
    public void setAcceptCompression(boolean acceptCompression) {
        kp.put("acceptCompression", acceptCompression);
    }

    {
        setAcceptHeaders(Arrays.asList("Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"));
    }
    @SuppressWarnings("unchecked")
    public List<String> getAcceptHeaders() {
        return (List<String>) kp.get("acceptHeaders");
    }
    /**
     * Accept headers to include in each request. Each must be the complete
     * header, e.g., 'Accept-Language: en'. (Thus, this can also be used to
     * send headers not beginning 'Accept-'.) By default Heritrix sends an
     * Accept header similar to what a typical browser would send (the value
     * comes from Firefox 4.0).
     */
    public void setAcceptHeaders(List<String> headers) {
        kp.put("acceptHeaders", headers);
    }

    protected AbstractCookieStore cookieStore;
    @Autowired(required=false)
    public void setCookieStore(AbstractCookieStore cookieStore) {
        this.cookieStore = cookieStore;
    }
    public AbstractCookieStore getCookieStore() {
        return cookieStore;
    }

    {
        // initialize with empty store so declaration not required
        setCredentialStore(new CredentialStore());
    }
    public CredentialStore getCredentialStore() {
        return (CredentialStore) kp.get("credentialStore");
    }
    /**
     * Used to store credentials.
     */
    @Autowired(required=false)
    public void setCredentialStore(CredentialStore credentials) {
        kp.put("credentialStore", credentials);
    }

    public String getHttpBindAddress(){
        return (String) kp.get(HTTP_BIND_ADDRESS);
    }
    /**
     * Local IP address or hostname to use when making connections (binding
     * sockets). When not specified, uses default local address(es).
     */
    public void setHttpBindAddress(String address) {
        kp.put(HTTP_BIND_ADDRESS, address);
    }
    public static final String HTTP_BIND_ADDRESS = "httpBindAddress";

    public String getHttpProxyHost() {
        return (String) kp.get("httpProxyHost");
    }
    /**
     * Proxy host IP (set only if needed).
     */
    public void setHttpProxyHost(String host) {
        kp.put("httpProxyHost", host);
    }

    public Integer getHttpProxyPort() {
        return (Integer) kp.get("httpProxyPort");
    }
    /**
     * Proxy port (set only if needed).
     */
    public void setHttpProxyPort(Integer port) {
        kp.put("httpProxyPort", port);
    }

    public String getHttpProxyUser() {
        return (String) kp.get("httpProxyUser");
    }
    /**
     * Proxy user (set only if needed).
     */
    public void setHttpProxyUser(String user) {
        kp.put("httpProxyUser", user);
    }

    public String getHttpProxyPassword() {
        return (String) kp.get("httpProxyPassword");
    }
    /**
     * Proxy password (set only if needed).
     */
    public void setHttpProxyPassword(String password) {
        kp.put("httpProxyPassword", password);
    }

    {
        setMaxFetchKBSec(0); // no limit
    }
    public int getMaxFetchKBSec() {
        return (Integer) kp.get("maxFetchKBSec");
    }
    /**
     * The maximum KB/sec to use when fetching data from a server. The
     * default of 0 means no maximum.
     */
    public void setMaxFetchKBSec(int rate) {
        kp.put("maxFetchKBSec", rate);
    }

    {
        setTimeoutSeconds(20*60); // 20 minutes
    }
    public int getTimeoutSeconds() {
        return (Integer) kp.get("timeoutSeconds");
    }
    /**
     * If the fetch is not completed in this number of seconds, give up (and
     * retry later).
     */
    public void setTimeoutSeconds(int timeout) {
        kp.put("timeoutSeconds", timeout);
    }

    {
        setSoTimeoutMs(20*1000); // 20 seconds
    }
    public int getSoTimeoutMs() {
        return (Integer) kp.get("soTimeoutMs");
    }
    /**
     * If the socket is unresponsive for this number of milliseconds, give up.
     * Set to zero for no timeout (not recommended: it could hang a thread on
     * an unresponsive server). This timeout is used for timing out socket
     * opens and each socket read. Make sure this value is &lt;
     * {@link #getTimeoutSeconds()} for optimal configuration: this ensures
     * at least one retry read.
     */
    public void setSoTimeoutMs(int timeout) {
        kp.put("soTimeoutMs", timeout);
    }

    {
        setMaxLengthBytes(0L); // no limit
    }
    public long getMaxLengthBytes() {
        return (Long) kp.get("maxLengthBytes");
    }
    /**
     * Maximum length in bytes to fetch. Fetch is truncated at this length. A
     * value of 0 means no limit.
     */
    public void setMaxLengthBytes(long length) {
        kp.put("maxLengthBytes", length);
    }

    /**
     * Send 'Range' header when a limit ({@link #getMaxLengthBytes()}) on
     * document size is in effect.
     * <p>
     * Be polite to the HTTP servers and send the 'Range' header, stating
     * that you are only interested in the first n bytes. Only pertinent if
     * {@link #getMaxLengthBytes()} &gt; 0. Sending the 'Range' header
     * results in a '206 Partial Content' status response, which is better
     * than just cutting the response mid-download. On rare occasion, sending
     * 'Range' will generate a '416 Requested Range Not Satisfiable' response.
     */
    {
        setSendRange(false);
    }
    public boolean getSendRange() {
        return (Boolean) kp.get("sendRange");
    }
    public void setSendRange(boolean sendRange) {
        kp.put("sendRange", sendRange);
    }

    {
        // XXX default to false?
        setSendIfModifiedSince(true);
    }
    public boolean getSendIfModifiedSince() {
        return (Boolean) kp.get("sendIfModifiedSince");
    }
    /**
     * Send 'If-Modified-Since' header, if previous 'Last-Modified' fetch
     * history information is available in URI history.
     */
    public void setSendIfModifiedSince(boolean sendIfModifiedSince) {
        kp.put("sendIfModifiedSince", sendIfModifiedSince);
    }

    {
        // XXX default to false?
        setSendIfNoneMatch(true);
    }
    public boolean getSendIfNoneMatch() {
        return (Boolean) kp.get("sendIfNoneMatch");
    }
    /**
     * Send 'If-None-Match' header, if previous 'Etag' fetch history
     * information is available in URI history.
     */
    public void setSendIfNoneMatch(boolean sendIfNoneMatch) {
        kp.put("sendIfNoneMatch", sendIfNoneMatch);
    }

    {
        setShouldFetchBodyRule(new AcceptDecideRule());
    }
    public DecideRule getShouldFetchBodyRule() {
        return (DecideRule) kp.get("shouldFetchBodyRule");
    }
    /**
     * DecideRules applied after receipt of HTTP response headers but before
     * we start to download the body. If any filter returns FALSE, the fetch
     * is aborted. Prerequisites such as robots.txt bypass this filtering
     * (i.e. they cannot be midfetch aborted).
     */
    public void setShouldFetchBodyRule(DecideRule rule) {
        kp.put("shouldFetchBodyRule", rule);
    }

    protected TrustLevel sslTrustLevel = TrustLevel.OPEN;
    public TrustLevel getSslTrustLevel() {
        return sslTrustLevel;
    }
    /**
     * SSL certificate trust level. Range is from the default 'open' (trust
     * all certs including expired, selfsigned, and those for which we do not
     * have a CA) through 'loose' (trust all valid certificates including
     * selfsigned), 'normal' (all valid certificates not including
     * selfsigned) to 'strict' (cert is valid and DN must match servername).
     */
    public synchronized void setSslTrustLevel(TrustLevel sslTrustLevel) {
        if (sslTrustLevel != this.sslTrustLevel) {
            this.sslTrustLevel = sslTrustLevel;
            // force sslContext to be reinitialized with new trust level
            sslContext = null;
        }
    }

    protected transient SSLContext sslContext;
    protected synchronized SSLContext sslContext() {
        if (sslContext == null) {
            try {
                TrustManager trustManager = new ConfigurableX509TrustManager(
                        getSslTrustLevel());
                sslContext = SSLContext.getInstance("SSL");
                sslContext.init(null, new TrustManager[] {trustManager}, null);
            } catch (Exception e) {
                logger.log(Level.WARNING, "Failed configure of ssl context "
                        + e.getMessage(), e);
            }
        }
        return sslContext;
    }

    /**
     * Can this processor fetch the given CrawlURI. May set a fetch status
     * if this processor would usually handle the CrawlURI, but cannot in
     * this instance.
     *
     * @param curi
     * @return True if processor can fetch.
     */
    @Override
    protected boolean shouldProcess(CrawlURI curi) {
        String scheme = curi.getUURI().getScheme();
        if (!(scheme.equals(HTTP_SCHEME) || scheme.equals(HTTPS_SCHEME))) {
            // handles only plain http and https
            return false;
        }

        CrawlHost host = getServerCache().getHostFor(curi.getUURI());
        if (host.getIP() == null && host.hasBeenLookedUp()) {
            curi.setFetchStatus(S_DOMAIN_PREREQUISITE_FAILURE);
            return false;
        }

        return true;
    }

    /**
     * Set the transfer, content encodings based on headers (if necessary).
     *
     * @param rec
     *            Recorder for this request.
     * @param response
     *            Response from the request.
     */
    protected void setOtherCodings(CrawlURI uri, final Recorder rec,
            final HttpResponse response) {
        if (response.getEntity() != null) {
            rec.setInputIsChunked(response.getEntity().isChunked());
            Header contentEncodingHeader = response.getEntity().getContentEncoding();
            if (contentEncodingHeader != null) {
                String ce = contentEncodingHeader.getValue().trim();
                try {
                    rec.setContentEncoding(ce);
                } catch (IllegalArgumentException e) {
                    uri.getAnnotations().add("unsatisfiableContentEncoding:"
                            + StringUtils.stripToEmpty(ce));
                }
            }
        }
    }

    /**
     * Set the character encoding based on the result headers or default.
     *
     * The HttpClient returns its own default encoding ("ISO-8859-1") if one
     * isn't specified in the Content-Type response header. We give the user
     * the option of overriding this, so we need to detect the case where the
     * default is returned.
     *
     * Now, it may well be the case that the default returned by HttpClient
     * and the default defined by the user are the same.
     *
     * TODO:FIXME?: This method does not do the "detect the case where the
     * [HttpClient] default is returned" mentioned above! Why not?
     *
     * @param rec
     *            Recorder for this request.
     * @param response
     *            Response from the request.
     */
    protected void setCharacterEncoding(CrawlURI curi, final Recorder rec,
            final HttpResponse response) {
        rec.setCharset(getDefaultCharset());
        try {
            Charset charset = ContentType.getOrDefault(response.getEntity()).getCharset();
            if (charset != null) {
                rec.setCharset(charset);
            }
        } catch (IllegalArgumentException e) {
            // exception could be UnsupportedCharsetException or IllegalCharsetNameException
            String unsatisfiableCharset;
            try {
                unsatisfiableCharset = response.getFirstHeader("content-type")
                        .getElements()[0].getParameterByName("charset").getValue();
            } catch (Exception f) {
                unsatisfiableCharset = "";
            }
            curi.getAnnotations().add("unsatisfiableCharsetInHeader:"
                    + StringUtils.stripToEmpty(unsatisfiableCharset));
        }
    }

    protected boolean checkMidfetchAbort(CrawlURI curi) {
        if (curi.isPrerequisite()) {
            return false;
        }
        DecideResult r = getShouldFetchBodyRule().decisionFor(curi);
        if (r != DecideResult.REJECT) {
            return false;
        }
        return true;
    }

    protected void doAbort(CrawlURI curi, AbstractExecutionAwareRequest request,
            String annotation) {
        curi.getAnnotations().add(annotation);
        curi.getRecorder().close();
        request.abort();
    }

    protected boolean maybeMidfetchAbort(CrawlURI curi,
            AbstractExecutionAwareRequest request) {
        if (checkMidfetchAbort(curi)) {
            doAbort(curi, request, "midFetchAbort");
            curi.getRecorder().getRecordedInput().chopAtMessageBodyBegin();
            return true;
        } else {
            return false;
        }
    }

    @Override
    protected void innerProcess(final CrawlURI curi) throws InterruptedException {
        // Note begin time
        curi.setFetchBeginTime(System.currentTimeMillis());

        // Get a reference to the HttpRecorder that is set into this ToeThread.
        final Recorder rec = curi.getRecorder();

        // Shall we get a digest on the content downloaded?
        boolean digestContent = getDigestContent();
        String algorithm = null;
        if (digestContent) {
            algorithm = getDigestAlgorithm();
            rec.getRecordedInput().setDigest(algorithm);
        } else {
            // clear
            rec.getRecordedInput().setDigest((MessageDigest)null);
        }

        FetchHTTPRequest req;
        try {
            req = new FetchHTTPRequest(this, curi);
        } catch (URIException e) {
            cleanup(curi, e, e.getMessage(), S_UNFETCHABLE_URI);
            return;
        }

        rec.getRecordedInput().setLimits(getMaxLengthBytes(),
                1000L * (long) getTimeoutSeconds(), (long) getMaxFetchKBSec());

        HttpResponse response = null;
        try {
            response = req.execute();
            addResponseContent(response, curi);
        } catch (ClientProtocolException e) {
            failedExecuteCleanup(curi, e);
            return;
        } catch (IOException e) {
            if ("handshake alert: unrecognized_name".equals(e.getMessage())) {
                req.setDisableSNI(true);
                try {
                    response = req.execute();
                    addResponseContent(response, curi);
                } catch (ClientProtocolException ee) {
                    failedExecuteCleanup(curi, e);
                    return;
                } catch (IOException ee) {
                    failedExecuteCleanup(curi, e);
                    return;
                }
            } else {
                failedExecuteCleanup(curi, e);
                return;
            }
        }

        maybeMidfetchAbort(curi, req.request);

        long contentLength = -1L;
        Header h = response.getLastHeader("content-length");
        if (h != null && h.getValue().trim().length() > 0) {
            contentLength = Long.parseLong(h.getValue());
        }

        try {
            if (!req.request.isAborted()) {
                // Force read-to-end, so that any socket hangs occur here,
                // not in later modules.
                rec.getRecordedInput().readToEndOfContent(contentLength);
            }
        } catch (RecorderTimeoutException ex) {
            doAbort(curi, req.request, TIMER_TRUNC);
        } catch (RecorderLengthExceededException ex) {
            doAbort(curi, req.request, LENGTH_TRUNC);
        } catch (IOException e) {
            cleanup(curi, e, "readFully", S_CONNECT_LOST);
            return;
        } catch (ArrayIndexOutOfBoundsException e) {
            // For weird windows-only ArrayIndex exceptions from native code
            // see http://forum.java.sun.com/thread.jsp?forum=11&thread=378356
            // treating as if it were an IOException
            cleanup(curi, e, "readFully", S_CONNECT_LOST);
            return;
        } finally {
            rec.close(); // ensure recording has stopped
            rec.closeRecorders();

            // Note completion time
            curi.setFetchCompletedTime(System.currentTimeMillis());

            // Set the response charset into the HttpRecord if available.
            setCharacterEncoding(curi, rec, response);
            setSizes(curi, rec);
            setOtherCodings(curi, rec, response);
        }

        if (digestContent) {
            curi.setContentDigest(algorithm,
                    rec.getRecordedInput().getDigestValue());
        }

        if (logger.isLoggable(Level.FINE)) {
            logger.fine(((curi.getFetchType() == HTTP_POST) ? "POST" : "GET")
                    + " " + curi.getUURI().toString() + " "
                    + response.getStatusLine().getStatusCode() + " "
                    + rec.getRecordedInput().getSize() + " "
                    + curi.getContentType());
        }

        if (isSuccess(curi) && req.addedCredentials) {
            // Promote the credentials from the CrawlURI to the CrawlServer
            // so they are available for all subsequent CrawlURIs on this
            // server.
            promoteCredentials(curi);
        } else if (response.getStatusLine().getStatusCode() == HttpStatus.SC_UNAUTHORIZED) {
            // 401 is not 'success'.
            handle401(response, curi);
        } else if (response.getStatusLine().getStatusCode() == HttpStatus.SC_PROXY_AUTHENTICATION_REQUIRED) {
            // 407 - remember Proxy-Authenticate headers for later use
            kp.put("proxyAuthChallenges",
                    extractChallenges(response, curi, ProxyAuthenticationStrategy.INSTANCE));
        }

        if (rec.getRecordedInput().isOpen()) {
            logger.severe(curi.toString() + " RIS still open. Should have"
                    + " been closed by method release: "
                    + Thread.currentThread().getName());
            try {
                rec.getRecordedInput().close();
            } catch (IOException e) {
                logger.log(Level.SEVERE, "second-chance RIS close failed", e);
            }
        }
    }

    /**
     * Promote successful credential to the server.
     *
     * @param curi
     *            CrawlURI whose credentials we are to promote.
     */
    protected void promoteCredentials(final CrawlURI curi) {
        Set<Credential> credentials = curi.getCredentials();
        for (Iterator<Credential> i = credentials.iterator(); i.hasNext();) {
            Credential c = i.next();
            i.remove();

            // The server to attach to may not be the server that hosts
            // this passed curi. It might be of another subdomain.
            // The avatar needs to be added to the server that is dependent
            // on this precondition. Find it by name. Get the name from
            // the credential this avatar represents.
            String cd = c.getDomain();
            if (cd != null) {
                CrawlServer cs = serverCache.getServerFor(cd);
                if (cs != null) {
                    cs.addCredential(c);
                    cs.setHttpAuthChallenges(curi.getHttpAuthChallenges());
                }
            }
        }
    }

    /**
     * Server is looking for basic/digest auth credentials (RFC2617). If we
     * have any, put them into the CrawlURI and have it come around again.
     * Presence of the credential serves as flag to frontier to requeue
     * promptly. If we already tried this domain and still got a 401, then
     * our credentials are bad. Remove them and let this curi die.
     *
     * @param response 401 http response
     * @param curi
     *            CrawlURI that got a 401.
     */
    protected void handle401(HttpResponse response, final CrawlURI curi) {
        Map<String, String> challenges =
                extractChallenges(response, curi, TargetAuthenticationStrategy.INSTANCE);
        AuthScheme authscheme = chooseAuthScheme(challenges,
                HttpHeaders.WWW_AUTHENTICATE);

        // remember WWW-Authenticate headers for later use
        curi.setHttpAuthChallenges(challenges);

        if (authscheme == null) {
            return;
        }
        String realm = authscheme.getRealm();

        // Look to see if this curi had rfc2617 avatars loaded. If so, are
        // any of them for this realm? If so, then the credential failed
        // if we got a 401 and it should be let die a natural 401 death.
        Set<Credential> curiRfc2617Credentials = getCredentials(curi,
                HttpAuthenticationCredential.class);
        HttpAuthenticationCredential extant = HttpAuthenticationCredential.getByRealm(
                curiRfc2617Credentials, realm, curi);
        if (extant != null) {
            // Then, already tried this credential. Remove ANY rfc2617
            // credential since presence of a rfc2617 credential serves
            // as flag to frontier to requeue this curi and let the curi
            // die a natural death.
            extant.detachAll(curi);
            logger.warning("Auth failed (401) though supplied realm " + realm
                    + " to " + curi.toString());
        } else {
            // Look see if we have a credential that corresponds to this
            // realm in credential store. Filter by type and credential
            // domain. If not, let this curi die. Else, add it to the
            // curi and let it come around again. Add in the AuthScheme
            // we got too. It's needed when we go to run the Auth the
            // second time around.
            String serverKey = getServerKey(curi);
            CrawlServer server = serverCache.getServerFor(serverKey);
            Set<Credential> storeRfc2617Credentials = getCredentialStore().subset(curi,
                    HttpAuthenticationCredential.class, server.getName());
            if (storeRfc2617Credentials == null
                    || storeRfc2617Credentials.size() <= 0) {
                logger.fine("No rfc2617 credentials for " + curi);
            } else {
                HttpAuthenticationCredential found = HttpAuthenticationCredential.getByRealm(
                        storeRfc2617Credentials, realm, curi);
                if (found == null) {
                    logger.fine("No rfc2617 credentials for realm " + realm
                            + " in " + curi);
                } else {
                    found.attach(curi);
                    logger.fine("Found credential for scheme " + authscheme
                            + " realm " + realm + " in store for "
                            + curi.toString());
                }
            }
        }
    }

    /**
     * @param response
     * @param curi
     *            CrawlURI that got a 401 or 407.
     * @param authStrategy
     *            Either ProxyAuthenticationStrategy or
     *            TargetAuthenticationStrategy. Determines whether
     *            Proxy-Authenticate or WWW-Authenticate header is consulted.
     *
     * @return Map&lt;authSchemeName -&gt; challenge header value&gt;
     */
    protected Map<String, String> extractChallenges(HttpResponse response,
            final CrawlURI curi, AuthenticationStrategy authStrategy) {
        Map<String, Header> hcChallengeHeaders = null;
        try {
            hcChallengeHeaders = authStrategy.getChallenges(null, response, null);
        } catch (MalformedChallengeException e) {
            logger.fine("Failed challenge parse: " + e.getMessage());
            hcChallengeHeaders = new HashMap<String, Header>();
        }
        if (hcChallengeHeaders.size() < 1) {
            curi.getNonFatalFailures().add(
                    new IllegalStateException("Missing auth challenge headers for uri with response status 401: " + curi));
        }

        // reorganize in non-library-specific way
        Map<String, String> challenges = new HashMap<String, String>();
        for (Entry<String, Header> challenge: hcChallengeHeaders.entrySet()) {
            challenges.put(challenge.getKey(), challenge.getValue().getValue());
        }
        return challenges;
    }

    protected AuthScheme chooseAuthScheme(Map<String, String> challenges,
            String challengeHeaderKey) {
        HashSet<String> authSchemesLeftToTry = new HashSet<String>(challenges.keySet());
        for (String authSchemeName: new String[]{"digest","basic"}) {
            if (authSchemesLeftToTry.remove(authSchemeName)) {
                AuthScheme authScheme =
                        AUTH_SCHEME_REGISTRY.lookup(authSchemeName).create(null);
                BasicHeader challenge = new BasicHeader(challengeHeaderKey,
                        challenges.get(authSchemeName));

                try {
                    authScheme.processChallenge(challenge);
                } catch (MalformedChallengeException e) {
                    logger.fine(e.getMessage() + " " + challenge);
                    continue;
                }
                if (authScheme.isConnectionBased()) {
                    logger.fine("Connection based " + authScheme);
                    continue;
                }

                if (authScheme.getRealm() == null
                        || authScheme.getRealm().length() <= 0) {
                    logger.fine("Empty realm " + authScheme);
                    continue;
                }

                return authScheme;
            }
        }

        for (String unsupportedSchemeName: authSchemesLeftToTry) {
            logger.fine("Unsupported http auth scheme: " + unsupportedSchemeName);
        }

        return null;
    }

    /**
     * @param curi
     *            CrawlURI that got a 401.
     * @param type
     *            Class of credential to get from curi.
     * @return Set of credentials attached to this curi.
     */
    protected Set<Credential> getCredentials(CrawlURI curi, Class<?> type) {
        Set<Credential> result = null;

        if (curi.hasCredentials()) {
            for (Credential c : curi.getCredentials()) {
                if (type.isInstance(c)) {
                    if (result == null) {
                        result = new HashSet<Credential>();
                    }
                    result.add(c);
                }
            }
        }
        return result;
    }

    /**
     * Get a value either from inside the CrawlURI instance, or from
     * settings (module attributes).
     *
     * @param curi
     *            CrawlURI to consult
     * @param key
     *            key to lookup
     * @return value from either CrawlURI (preferred) or settings
     */
    protected Object getAttributeEither(CrawlURI curi, String key) {
        Object r = curi.getData().get(key);
        if (r != null) {
            return r;
        }
        return kp.get(key);
    }

    /**
     * Update CrawlURI internal sizes based on current transaction (and
     * in the case of 304s, history).
     *
     * @param curi CrawlURI
     * @param rec HttpRecorder
     */
    protected void setSizes(CrawlURI curi, Recorder rec) {
        // set reporting size
        curi.setContentSize(rec.getRecordedInput().getSize());

        // add contentSize to extraInfo so it's available to log in the crawl log
        curi.addExtraInfo("contentSize", rec.getRecordedInput().getSize());

        // special handling for 304-not modified
        if (curi.getFetchStatus() == HttpStatus.SC_NOT_MODIFIED
                && curi.getFetchHistory() != null) {
            Map<String, Object>[] history = curi.getFetchHistory();
            if (history[0] != null && history[0].containsKey(A_REFERENCE_LENGTH)) {
                long referenceLength = (Long) history[0].get(A_REFERENCE_LENGTH);
                // carry-forward previous 'reference-length' for future
                curi.getData().put(A_REFERENCE_LENGTH, referenceLength);
                // increase content-size to virtual-size for reporting
                curi.setContentSize(rec.getRecordedInput().getSize() + referenceLength);
            }
        }
    }

    /**
     * This method populates curi with response status and
     * content type.
     *
     * @param curi
     *            CrawlURI to populate.
     * @param response
     *            Response to get status and headers from.
     */
    protected void addResponseContent(HttpResponse response, CrawlURI curi) {
        curi.setFetchStatus(response.getStatusLine().getStatusCode());
        Header ct = response.getLastHeader("content-type");
        curi.setContentType(ct == null ? null : ct.getValue());

        for (Header h: response.getAllHeaders()) {
            curi.putHttpResponseHeader(h.getName(), h.getValue());
        }
    }

    /**
     * Cleanup after a failed method execute.
     *
     * @param curi
     *            CrawlURI we failed on.
     * @param exception
     *            Exception we failed with.
     */
    protected void failedExecuteCleanup(final CrawlURI curi,
            final Exception exception) {
        cleanup(curi, exception, "executeMethod", S_CONNECT_FAILED);
    }

    /**
     * Cleanup after a failed method execute.
     *
     * @param curi
     *            CrawlURI we failed on.
     * @param exception
     *            Exception we failed with.
     * @param message
     *            Message to log with failure.
     * @param status
     *            Status to set on the fetch.
     */
    protected void cleanup(final CrawlURI curi, final Exception exception,
            final String message, final int status) {
        if (logger.isLoggable(Level.FINER)) {
            logger.log(Level.FINER, message + ": " + exception, exception);
        } else if (logger.isLoggable(Level.FINE)) {
            logger.fine(message + ": " + exception);
        }
        curi.getNonFatalFailures().add(exception);
        curi.setFetchStatus(status);
        curi.getRecorder().close();
    }

    public void start() {
        if (isRunning()) {
            return;
        }
        super.start();

        if (getCookieStore() != null) {
            getCookieStore().start();
        }
    }

    public void stop() {
        if (!isRunning()) {
            return;
        }
        super.stop();

        // At the end save cookies to the file specified in the order file.
        if (getCookieStore() != null) {
            AbstractCookieStore r = getCookieStore();
            if (r.getCookiesSaveFile() != null) {
                r.saveCookies(r.getCookiesSaveFile().getFile().getAbsolutePath());
            }
            getCookieStore().stop();
            setCookieStore(null);
        }
    }

    protected static String getServerKey(CrawlURI uri) {
        try {
            return CrawlServer.getServerKey(uri.getUURI());
        } catch (URIException e) {
            logger.log(Level.SEVERE, e.toString() + ": " + uri, e);
            return null;
        }
    }
}




