org.archive.modules.CrawlURI Maven / Gradle / Ivy

This project contains some of the configurable modules used within the Heritrix application to crawl the web. The modules in this project can, however, also be used in applications other than Heritrix.

/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
 
package org.archive.modules;

import static org.archive.modules.CoreAttributeConstants.A_ANNOTATIONS;
import static org.archive.modules.CoreAttributeConstants.A_CREDENTIALS_KEY;
import static org.archive.modules.CoreAttributeConstants.A_DNS_SERVER_IP_LABEL;
import static org.archive.modules.CoreAttributeConstants.A_FETCH_COMPLETED_TIME;
import static org.archive.modules.CoreAttributeConstants.A_FORCE_RETIRE;
import static org.archive.modules.CoreAttributeConstants.A_HERITABLE_KEYS;
import static org.archive.modules.CoreAttributeConstants.A_HTML_BASE;
import static org.archive.modules.CoreAttributeConstants.A_HTTP_AUTH_CHALLENGES;
import static org.archive.modules.CoreAttributeConstants.A_HTTP_RESPONSE_HEADERS;
import static org.archive.modules.CoreAttributeConstants.A_NONFATAL_ERRORS;
import static org.archive.modules.CoreAttributeConstants.A_PREREQUISITE_URI;
import static org.archive.modules.CoreAttributeConstants.A_SOURCE_TAG;
import static org.archive.modules.CoreAttributeConstants.A_SUBMIT_DATA;
import static org.archive.modules.CoreAttributeConstants.A_SUBMIT_ENCTYPE;
import static org.archive.modules.CoreAttributeConstants.A_WARC_RESPONSE_HEADERS;
import static org.archive.modules.SchedulingConstants.NORMAL;
import static org.archive.modules.fetcher.FetchStatusCodes.S_BLOCKED_BY_CUSTOM_PROCESSOR;
import static org.archive.modules.fetcher.FetchStatusCodes.S_BLOCKED_BY_USER;
import static org.archive.modules.fetcher.FetchStatusCodes.S_CONNECT_FAILED;
import static org.archive.modules.fetcher.FetchStatusCodes.S_CONNECT_LOST;
import static org.archive.modules.fetcher.FetchStatusCodes.S_DEEMED_CHAFF;
import static org.archive.modules.fetcher.FetchStatusCodes.S_DEFERRED;
import static org.archive.modules.fetcher.FetchStatusCodes.S_DELETED_BY_USER;
import static org.archive.modules.fetcher.FetchStatusCodes.S_DNS_SUCCESS;
import static org.archive.modules.fetcher.FetchStatusCodes.S_DOMAIN_PREREQUISITE_FAILURE;
import static org.archive.modules.fetcher.FetchStatusCodes.S_DOMAIN_UNRESOLVABLE;
import static org.archive.modules.fetcher.FetchStatusCodes.S_OTHER_PREREQUISITE_FAILURE;
import static org.archive.modules.fetcher.FetchStatusCodes.S_OUT_OF_SCOPE;
import static org.archive.modules.fetcher.FetchStatusCodes.S_PREREQUISITE_UNSCHEDULABLE_FAILURE;
import static org.archive.modules.fetcher.FetchStatusCodes.S_PROCESSING_THREAD_KILLED;
import static org.archive.modules.fetcher.FetchStatusCodes.S_ROBOTS_PRECLUDED;
import static org.archive.modules.fetcher.FetchStatusCodes.S_ROBOTS_PREREQUISITE_FAILURE;
import static org.archive.modules.fetcher.FetchStatusCodes.S_RUNTIME_EXCEPTION;
import static org.archive.modules.fetcher.FetchStatusCodes.S_SERIOUS_ERROR;
import static org.archive.modules.fetcher.FetchStatusCodes.S_TIMEOUT;
import static org.archive.modules.fetcher.FetchStatusCodes.S_TOO_MANY_EMBED_HOPS;
import static org.archive.modules.fetcher.FetchStatusCodes.S_TOO_MANY_LINK_HOPS;
import static org.archive.modules.fetcher.FetchStatusCodes.S_TOO_MANY_RETRIES;
import static org.archive.modules.fetcher.FetchStatusCodes.S_UNATTEMPTED;
import static org.archive.modules.fetcher.FetchStatusCodes.S_UNFETCHABLE_URI;
import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_CONTENT_DIGEST_HISTORY;
import static org.archive.modules.recrawl.RecrawlAttributeConstants.A_FETCH_HISTORY;

import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.PrintWriter;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.httpclient.URIException;
import org.apache.commons.lang.StringUtils;
import org.archive.bdb.AutoKryo;
import org.archive.modules.credential.Credential;
import org.archive.modules.credential.HttpAuthenticationCredential;
import org.archive.modules.extractor.HTMLLinkContext;
import org.archive.modules.extractor.Hop;
import org.archive.modules.extractor.LinkContext;
import org.archive.modules.revisit.RevisitProfile;
import org.archive.net.UURI;
import org.archive.net.UURIFactory;
import org.archive.spring.OverlayContext;
import org.archive.spring.OverlayMapsSource;
import org.archive.util.Base32;
import org.archive.util.Recorder;
import org.archive.util.ReportUtils;
import org.archive.util.Reporter;
import org.json.JSONException;
import org.json.JSONObject;


/**
 * Represents a candidate URI and the associated state it
 * collects as it is crawled.
 *
 * Core state is in instance variables, but a flexible
 * attribute list is also available. Use this 'bucket' to carry
 * custom processing extracted data and state across CrawlURI
 * processing. See {@link #getData()}, etc.
 *
 * Note: getHttpMethod() has been removed starting with Heritrix 3.3.0. HTTP
 * response headers are available using {@link #getHttpResponseHeader(String)}.
 * (HTTP fetchers are responsible for setting the values using
 * {@link #putHttpResponseHeader(String, String)}.)
 *
 * @author Gordon Mohr
 */
public class CrawlURI implements Reporter, Serializable, OverlayContext, Comparable<CrawlURI> {

    private static final long serialVersionUID = 4L;

    private static final Logger logger =
        Logger.getLogger(CrawlURI.class.getName());

    public static final int UNCALCULATED = -1;

    public static enum FetchType { HTTP_GET, HTTP_POST, UNKNOWN };

    /**
     * The URI being crawled. It's transient to save space when storing to BDB.
     */
    private UURI uuri;

    /** Seed status */
    private boolean isSeed = false;

    /**
     * String of letters indicating how this URI was reached from a seed.
     * <pre>
     *  P precondition
     *  R redirection
     *  E embedded (as frame, src, link, codebase, etc.)
     *  X speculative embed (as from javascript, some alternate-format extractors)
     *  L link
     * </pre>
     * For example LLLE (an embedded image on a page 3 links from seed).
     */
    private String pathFromSeed;

    /**
     * Where this URI was (presently) discovered. Transient to allow
     * more efficient custom serialization.
     */
    private UURI via;

    /**
     * Context of URI's discovery, as per the 'context' in Link
     */
    private LinkContext viaContext;

    private int schedulingDirective = NORMAL;

    /**
     * Frontier/Scheduler lifecycle info.
     * This is an identifier set by the Frontier for its
     * purposes. Usually it's the name of the Frontier queue
     * this URI gets queued to. Values can be host + port
     * or IP, etc.
     */
    private String classKey;

    /** assigned precedence */
    private int precedence;

    // Processing progress
    private int fetchStatus = 0;    // default to unattempted
    private int deferrals = 0;      // count of postponements for prerequisites
    private int fetchAttempts = 0;  // the number of fetch attempts that have been made
    transient private int threadNumber;

    // User agent to masquerade as when crawling this URI. If null, globals should be used.
    private String userAgent = null;

    // Once a link extractor has finished processing this curi this will be set as true.
    transient private boolean linkExtractorFinished = false;

    transient private int discardedOutlinks = 0;

    private long contentSize = UNCALCULATED;
    private long contentLength = UNCALCULATED;

    /**
     * Flexible dynamic attributes list.
     *
     * The attribute list is a flexible map of key/value pairs for storing
     * status of this URI for use by other processors. By convention the
     * attribute list is keyed by constants found in the
     * {@link CoreAttributeConstants} interface. Use this list to carry
     * data or state produced by custom processors rather than change this
     * class, CrawlURI.
     */
    protected Map<String,Object> data;

    private boolean forceRevisit = false; // even if already visited

    /**
     * Current http recorder.
     *
     * Gets set upon successful request. Reset at start of processing chain.
     */
    private transient Recorder httpRecorder = null;

    /**
     * Content type of a successfully fetched URI.
     *
     * May be null even on successfully fetched URI.
     */
    private String contentType = null;

    /**
     * True if this CrawlURI has been deemed a prerequisite by the
     * org.archive.crawler.prefetch.PreconditionEnforcer.
     *
     * This flag is used at least inside the precondition enforcer so that
     * subsequent prerequisite tests know to let this CrawlURI through because
     * it is a prerequisite needed by an earlier prerequisite test (e.g. if
     * this is a robots.txt, then the subsequent login credentials prereq
     * test must not throw it out because it is not a login curi).
     */
    private boolean prerequisite = false;

    /** specified fetch-type: GET, POST, or not-yet-known */
    private FetchType fetchType = FetchType.UNKNOWN;

    /**
     * Monotonically increasing number within a crawl;
     * useful for tending towards breadth-first ordering.
     * Will sometimes be truncated to 48 bits, so behavior
     * over 281 trillion instantiated CrawlURIs may be
     * buggy.
     */
    protected long ordinal;

    /**
     * Array to hold keys of data members that persist across URI processings.
     * Any key mentioned in this list will not be cleared out at the end
     * of a pass down the processing chain.
     */
    private static final Collection<String> persistentKeys = new CopyOnWriteArrayList<String>(
            new String[] { A_CREDENTIALS_KEY, A_HTTP_AUTH_CHALLENGES, A_SUBMIT_DATA,
                           A_WARC_RESPONSE_HEADERS, A_ANNOTATIONS, A_SUBMIT_ENCTYPE });

    /** maximum length for pathFromSeed/hopsPath; longer truncated with leading counter **/
    private static final int MAX_HOPS_DISPLAYED = 50;

    /**
     * A digest (hash, usually SHA1) of retrieved content-body.
     */
    private byte[] contentDigest = null;
    private String contentDigestScheme = null;

    /**
     * If this value is non-null, a determination has been made that this
     * CrawlURI instance is a revisit or recrawl. Details are provided by the
     * RevisitProfile object.
     */
    transient private RevisitProfile revisitProfile = null;

    /**
     * Create a new instance of CrawlURI from a {@link UURI}.
     *
     * @param uuri the UURI to base this CrawlURI on.
     */
    public CrawlURI(UURI uuri) {
        this.uuri = uuri;
        this.pathFromSeed = "";
    }

    public static CrawlURI fromHopsViaString(String uriHopsViaContext) throws URIException {
        String args[] = uriHopsViaContext.split("\\s+");
        UURI u = UURIFactory.getInstance(args[0]);
        String pathFromSeed = (args.length > 1) ? args[1] : "";
        UURI via = (args.length > 2 && args[2].length() > 1)
                ? UURIFactory.getInstance(args[2]) : null;
        LinkContext viaContext = (args.length > 3 && args[3].length() > 1)
                ? HTMLLinkContext.get(args[3]) : null;
        CrawlURI caUri = new CrawlURI(u, pathFromSeed, via, viaContext);
        return caUri;
    }
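    // Illustrative sketch (input values are assumed, not from the original source):
    // the whitespace-separated form parsed above is "uri hopsPath via viaContext", e.g.
    //
    //   CrawlURI.fromHopsViaString(
    //       "http://example.com/a.html LLE http://example.com/ href");
    //
    // yields a CrawlURI for a.html, three hops from seed, discovered via the
    // site root in an (assumed) "href" HTML link context.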
    /**
     * @param u uuri instance this CrawlURI wraps.
     * @param pathFromSeed
     * @param via
     * @param viaContext
     */
    public CrawlURI(UURI u, String pathFromSeed, UURI via, LinkContext viaContext) {
        this.uuri = u;
        if (pathFromSeed != null) {
            this.pathFromSeed = pathFromSeed;
        } else {
            this.pathFromSeed = "";
        }
        this.via = via;
        this.viaContext = viaContext;
    }

    /**
     * @return Returns the schedulingDirective.
     */
    public int getSchedulingDirective() {
        return schedulingDirective;
    }

    /**
     * @param priority The schedulingDirective to set.
     */
    public void setSchedulingDirective(int priority) {
        this.schedulingDirective = priority;
    }

    public boolean containsDataKey(String key) {
        if (data == null) {
            return false;
        }
        return data.containsKey(key);
    }

    /**
     * Takes a status code and converts it into a human readable string.
     *
     * @param code the status code
     * @return a human readable string declaring what the status code is.
     */
    public static String fetchStatusCodesToString(int code) {
        switch (code) {
            // DNS
            case S_DNS_SUCCESS : return "DNS-1-OK";
            // HTTP Informational 1xx
            case 100 : return "HTTP-100-Info-Continue";
            case 101 : return "HTTP-101-Info-Switching Protocols";
            // HTTP Successful 2xx
            case 200 : return "HTTP-200-Success-OK";
            case 201 : return "HTTP-201-Success-Created";
            case 202 : return "HTTP-202-Success-Accepted";
            case 203 : return "HTTP-203-Success-Non-Authoritative";
            case 204 : return "HTTP-204-Success-No Content";
            case 205 : return "HTTP-205-Success-Reset Content";
            case 206 : return "HTTP-206-Success-Partial Content";
            // HTTP Redirection 3xx
            case 300 : return "HTTP-300-Redirect-Multiple Choices";
            case 301 : return "HTTP-301-Redirect-Moved Permanently";
            case 302 : return "HTTP-302-Redirect-Found";
            case 303 : return "HTTP-303-Redirect-See Other";
            case 304 : return "HTTP-304-Redirect-Not Modified";
            case 305 : return "HTTP-305-Redirect-Use Proxy";
            case 307 : return "HTTP-307-Redirect-Temporary Redirect";
            // HTTP Client Error 4xx
            case 400 : return "HTTP-400-ClientErr-Bad Request";
            case 401 : return "HTTP-401-ClientErr-Unauthorized";
            case 402 : return "HTTP-402-ClientErr-Payment Required";
            case 403 : return "HTTP-403-ClientErr-Forbidden";
            case 404 : return "HTTP-404-ClientErr-Not Found";
            case 405 : return "HTTP-405-ClientErr-Method Not Allowed";
            case 406 : return "HTTP-406-ClientErr-Not Acceptable";
            case 407 : return "HTTP-407-ClientErr-Proxy Authentication Required";
            case 408 : return "HTTP-408-ClientErr-Request Timeout";
            case 409 : return "HTTP-409-ClientErr-Conflict";
            case 410 : return "HTTP-410-ClientErr-Gone";
            case 411 : return "HTTP-411-ClientErr-Length Required";
            case 412 : return "HTTP-412-ClientErr-Precondition Failed";
            case 413 : return "HTTP-413-ClientErr-Request Entity Too Large";
            case 414 : return "HTTP-414-ClientErr-Request-URI Too Long";
            case 415 : return "HTTP-415-ClientErr-Unsupported Media Type";
            case 416 : return "HTTP-416-ClientErr-Requested Range Not Satisfiable";
            case 417 : return "HTTP-417-ClientErr-Expectation Failed";
            // HTTP Server Error 5xx
            case 500 : return "HTTP-500-ServerErr-Internal Server Error";
            case 501 : return "HTTP-501-ServerErr-Not Implemented";
            case 502 : return "HTTP-502-ServerErr-Bad Gateway";
            case 503 : return "HTTP-503-ServerErr-Service Unavailable";
            case 504 : return "HTTP-504-ServerErr-Gateway Timeout";
            case 505 : return "HTTP-505-ServerErr-HTTP Version Not Supported";
            // Heritrix internal codes (all negative numbers)
            case S_BLOCKED_BY_USER:
                return "Heritrix(" + S_BLOCKED_BY_USER + ")-Blocked by user";
            case S_BLOCKED_BY_CUSTOM_PROCESSOR:
                return "Heritrix(" + S_BLOCKED_BY_CUSTOM_PROCESSOR + ")-Blocked by custom prefetch processor";
            case S_DELETED_BY_USER:
                return "Heritrix(" + S_DELETED_BY_USER + ")-Deleted by user";
            case S_CONNECT_FAILED:
                return "Heritrix(" + S_CONNECT_FAILED + ")-Connection failed";
            case S_CONNECT_LOST:
                return "Heritrix(" + S_CONNECT_LOST + ")-Connection lost";
            case S_DEEMED_CHAFF:
                return "Heritrix(" + S_DEEMED_CHAFF + ")-Deemed chaff";
            case S_DEFERRED:
                return "Heritrix(" + S_DEFERRED + ")-Deferred";
            case S_DOMAIN_UNRESOLVABLE:
                return "Heritrix(" + S_DOMAIN_UNRESOLVABLE + ")-Domain unresolvable";
            case S_OUT_OF_SCOPE:
                return "Heritrix(" + S_OUT_OF_SCOPE + ")-Out of scope";
            case S_DOMAIN_PREREQUISITE_FAILURE:
                return "Heritrix(" + S_DOMAIN_PREREQUISITE_FAILURE + ")-Domain prerequisite failure";
            case S_ROBOTS_PREREQUISITE_FAILURE:
                return "Heritrix(" + S_ROBOTS_PREREQUISITE_FAILURE + ")-Robots prerequisite failure";
            case S_OTHER_PREREQUISITE_FAILURE:
                return "Heritrix(" + S_OTHER_PREREQUISITE_FAILURE + ")-Other prerequisite failure";
            case S_PREREQUISITE_UNSCHEDULABLE_FAILURE:
                return "Heritrix(" + S_PREREQUISITE_UNSCHEDULABLE_FAILURE + ")-Prerequisite unschedulable failure";
            case S_ROBOTS_PRECLUDED:
                return "Heritrix(" + S_ROBOTS_PRECLUDED + ")-Robots precluded";
            case S_RUNTIME_EXCEPTION:
                return "Heritrix(" + S_RUNTIME_EXCEPTION + ")-Runtime exception";
            case S_SERIOUS_ERROR:
                return "Heritrix(" + S_SERIOUS_ERROR + ")-Serious error";
            case S_TIMEOUT:
                return "Heritrix(" + S_TIMEOUT + ")-Timeout";
            case S_TOO_MANY_EMBED_HOPS:
                return "Heritrix(" + S_TOO_MANY_EMBED_HOPS + ")-Too many embed hops";
            case S_TOO_MANY_LINK_HOPS:
                return "Heritrix(" + S_TOO_MANY_LINK_HOPS + ")-Too many link hops";
            case S_TOO_MANY_RETRIES:
                return "Heritrix(" + S_TOO_MANY_RETRIES + ")-Too many retries";
            case S_UNATTEMPTED:
                return "Heritrix(" + S_UNATTEMPTED + ")-Unattempted";
            case S_UNFETCHABLE_URI:
                return "Heritrix(" + S_UNFETCHABLE_URI + ")-Unfetchable URI";
            case S_PROCESSING_THREAD_KILLED:
                return "Heritrix(" + S_PROCESSING_THREAD_KILLED + ")-Processing thread killed";
            // Unknown return code
            default : return Integer.toString(code);
        }
    }
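    // Worked example (values chosen for illustration):
    //   fetchStatusCodesToString(200)           returns "HTTP-200-Success-OK"
    //   fetchStatusCodesToString(S_DNS_SUCCESS) returns "DNS-1-OK"
    //   an unrecognized code such as 999 falls through to the default branch
    //   and is returned as the bare string "999".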
    /**
     * Return the overall/fetch status of this CrawlURI for its
     * current trip through the processing loop.
     *
     * @return a value from FetchStatusCodes
     */
    public int getFetchStatus() {
        return fetchStatus;
    }

    /**
     * Set the overall/fetch status of this CrawlURI for
     * its current trip through the processing loop.
     *
     * @param newstatus a value from FetchStatusCodes
     */
    public void setFetchStatus(int newstatus) {
        fetchStatus = newstatus;
    }

    /**
     * Get the count of attempts (trips through the processing
     * loop) at getting the document referenced by this URI.
     * Compared against a configured maximum to determine when
     * to stop retrying.
     *
     * TODO: Consider renaming as something more generic, as all
     * processing-loops do not necessarily include an attempted
     * network-fetch (for example, when processing is aborted
     * early to enqueue a prerequisite), and this counter may be
     * reset if a URI is starting a fresh series of tries (as when
     * rescheduled at a future time). Perhaps simply 'tryCount'
     * or 'attempts'?
     *
     * @return attempts count
     */
    public int getFetchAttempts() {
        return fetchAttempts;
    }

    /**
     * Increment the count of attempts (trips through the processing
     * loop) at getting the document referenced by this URI.
     */
    public void incrementFetchAttempts() {
        fetchAttempts++;
    }

    /**
     * Reset fetchAttempts counter.
     */
    public void resetFetchAttempts() {
        this.fetchAttempts = 0;
    }

    /**
     * Reset deferrals counter.
     */
    public void resetDeferrals() {
        this.deferrals = 0;
    }

    /**
     * Set a prerequisite for this URI.
     *
     * A prerequisite is a URI that must be crawled before this URI can be
     * crawled.
     *
     * @param pre Link to set as prereq.
     */
    public void setPrerequisiteUri(CrawlURI pre) {
        getData().put(A_PREREQUISITE_URI, pre);
    }

    /**
     * Get the prerequisite for this URI.
     *
     * A prerequisite is a URI that must be crawled before this URI can be
     * crawled.
     *
     * @return the prerequisite for this URI or null if no prerequisite.
     */
    public CrawlURI getPrerequisiteUri() {
        return (CrawlURI) getData().get(A_PREREQUISITE_URI);
    }

    /**
     * Clear prerequisite, if any.
     */
    public CrawlURI clearPrerequisiteUri() {
        return (CrawlURI) getData().remove(A_PREREQUISITE_URI);
    }

    /**
     * @return True if this CrawlURI has a prerequisite.
     */
    public boolean hasPrerequisiteUri() {
        return containsDataKey(A_PREREQUISITE_URI);
    }

    /**
     * Returns true if this CrawlURI is a prerequisite.
     *
     * TODO:FIXME: code elsewhere is confused whether this means
     * that this CrawlURI is a prerequisite for another, or *has* a
     * prerequisite; clean up and rename as necessary.
     *
     * @return true if this CrawlURI is a prerequisite.
     */
    public boolean isPrerequisite() {
        return this.prerequisite;
    }

    /**
     * Set if this CrawlURI is itself a prerequisite URI.
     *
     * @param prerequisite True if this CrawlURI is itself a prerequisite URI.
     */
    public void setPrerequisite(boolean prerequisite) {
        this.prerequisite = prerequisite;
    }

    /**
     * Get the content type of this URI.
     *
     * @return Fetched URI's content type. May be null.
     */
    public String getContentType() {
        return this.contentType;
    }

    /**
     * Set a fetched URI's content type.
     *
     * @param ct Content type.
     */
    public void setContentType(String ct) {
        if (ct == null) {
            ct = "unknown";
        }
        this.contentType = ct;
    }

    /**
     * Set the number of the ToeThread responsible for processing this URI.
     *
     * @param i the ToeThread number.
     */
    public void setThreadNumber(int i) {
        threadNumber = i;
    }

    /**
     * Get the number of the ToeThread responsible for processing this URI.
     *
     * @return the ToeThread number.
     */
    public int getThreadNumber() {
        return threadNumber;
    }

    /**
     * Increment the deferral count.
     */
    public void incrementDeferrals() {
        deferrals++;
    }

    /**
     * Get the deferral count.
     *
     * @return the deferral count.
     */
    public int getDeferrals() {
        return deferrals;
    }

    /**
     * Remove all attributes set on this URI.
     *
     * This method removes the attribute list.
     */
    public void stripToMinimal() {
        data = null;
    }

    /**
     * Get the size in bytes of this URI's recorded content, inclusive
     * of things like protocol headers. It is the responsibility of the
     * classes which fetch the URI to set this value accordingly -- it is
     * not calculated/verified within CrawlURI.
     *
     * This value is consulted in reporting/logging/writing-decisions.
     *
     * @see #setContentSize(long)
     * @return contentSize
     */
    public long getContentSize() {
        return contentSize;
    }

    /**
     * Get the annotations set for this URI.
     *
     * @return the annotations set for this URI.
     */
    public Collection<String> getAnnotations() {
        @SuppressWarnings("unchecked")
        Collection<String> annotations = (Collection<String>) getData().get(A_ANNOTATIONS);
        if (annotations == null) {
            annotations = new LinkedHashSet<String>();
            getData().put(A_ANNOTATIONS, annotations);
        }
        return annotations;
    }

    /**
     * Get total hops from seed.
     *
     * @return int hops count
     */
    public int getHopCount() {
        if (pathFromSeed.length() <= MAX_HOPS_DISPLAYED) {
            return pathFromSeed.length();
        }
        int plusIndex = pathFromSeed.indexOf('+');
        if (plusIndex < 0) {
            // just in case old-style hops-paths slip through
            return pathFromSeed.length();
        }
        // return overflow number + remainder (remainder will be
        // MAX_HOPS_DISPLAYED unless an exceptional condition)
        return Integer.parseInt(pathFromSeed.substring(0, plusIndex))
                + pathFromSeed.length() - (plusIndex + 1);
    }

    /**
     * Get the embed hop count.
     *
     * @return the embed hop count.
     */
    public int getEmbedHopCount() {
        int embedHops = 0;
        for (int i = pathFromSeed.length() - 1; i >= 0; i--) {
            if (pathFromSeed.charAt(i) == Hop.NAVLINK.getHopChar()) {
                break;
            }
            embedHops++;
        }
        return embedHops;
    }

    /**
     * Get the link hop count.
     *
     * @return the link hop count.
     */
    public int getLinkHopCount() {
        int linkHops = 0;
        for (int i = pathFromSeed.length() - 1; i >= 0; i--) {
            if (pathFromSeed.charAt(i) == Hop.NAVLINK.getHopChar()) {
                linkHops++;
            }
        }
        return linkHops;
    }
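    // Worked example (path chosen for illustration): for pathFromSeed "LLLE"
    // -- three navlinks then one embed -- getHopCount() is 4, getLinkHopCount()
    // is 3, and getEmbedHopCount() is 1, since embeds are counted back from the
    // end of the path and counting stops at the last 'L' navlink hop.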
    /**
     * Get the user agent to use for crawling this URI.
     *
     * If null the global setting should be used.
     *
     * @return user agent or null
     */
    public String getUserAgent() {
        return userAgent;
    }

    /**
     * Set the user agent to use when crawling this URI.
     *
     * If not set the global settings should be used.
     *
     * @param string user agent to use
     */
    public void setUserAgent(String string) {
        userAgent = string;
    }

    /**
     * For completed HTTP transactions, the length of the content-body.
     *
     * @return For completed HTTP transactions, the length of the content-body.
     */
    public long getContentLength() {
        if (this.contentLength < 0) {
            this.contentLength = (getRecorder() != null)
                    ? getRecorder().getResponseContentLength() : 0;
        }
        return this.contentLength;
    }

    /**
     * Get size of data recorded (transferred).
     *
     * @return recorded data size
     */
    public long getRecordedSize() {
        return (getRecorder() != null)
                ? getRecorder().getRecordedInput().getSize()
                // if unavailable fall back on content-size
                : getContentSize();
    }

    /**
     * Sets the 'content size' for the URI, which is considered inclusive of
     * all recorded material (such as protocol headers) or even material
     * 'virtually' considered (as in material from a previous fetch
     * confirmed unchanged with a server). (In contrast, content-length
     * matches the HTTP definition, that of the enclosed content-body.)
     *
     * Should be set by a fetcher or other processor as soon as the final size
     * of recorded content is known. Setting to an artificial/incorrect value
     * may affect other reporting/processing.
     */
    public void setContentSize(long l) {
        contentSize = l;
    }
    /**
     * If true then a link extractor has already claimed this CrawlURI and
     * performed link extraction on the document content. This does not
     * preclude other link extractors that may have an interest in this
     * CrawlURI from also doing link extraction.
     *
     * There is an onus on link extractors to set this flag if they have run.
     *
     * @return True if a processor has performed link extraction on this
     * CrawlURI
     *
     * @see #linkExtractorFinished()
     */
    public boolean hasBeenLinkExtracted() {
        return linkExtractorFinished;
    }

    /**
     * Note that link extraction has been performed on this CrawlURI. A
     * processor doing link extraction should invoke this method once it has
     * finished its work. It should invoke it even if no links are extracted.
     * It should only invoke this method if the link extraction was performed
     * on the document body (not the HTTP headers etc.).
     *
     * @see #hasBeenLinkExtracted()
     */
    public void linkExtractorFinished() {
        linkExtractorFinished = true;
        if (discardedOutlinks > 0) {
            getAnnotations().add("dol:" + discardedOutlinks);
        }
    }

    /**
     * Notify CrawlURI it is about to be logged; opportunity
     * for self-annotation.
     */
    public void aboutToLog() {
        if (fetchAttempts > 1) {
            getAnnotations().add(fetchAttempts + "t");
        }
    }

    /**
     * Get the http recorder associated with this URI.
     *
     * @return Returns the httpRecorder. May be null, but it is set early in
     * FetchHTTP, so a null value here indicates a problem.
     */
    public Recorder getRecorder() {
        return httpRecorder;
    }

    /**
     * Set the http recorder to be associated with this URI.
     *
     * @param httpRecorder The httpRecorder to set.
     */
    public void setRecorder(Recorder httpRecorder) {
        this.httpRecorder = httpRecorder;
    }

    /**
     * Return true if this is a http transaction.
     *
     * @return True if this is a http transaction.
     */
    public boolean isHttpTransaction() {
        return getFetchType().equals(FetchType.HTTP_GET)
                || getFetchType().equals(FetchType.HTTP_POST);
    }

    /**
     * Clean up after a run through the processing chain.
     *
     * Called on the end of processing chain by Frontier#finish. Nulls out any
     * state gathered during processing.
     */
    public void processingCleanup() {
        this.httpRecorder = null;
        this.fetchStatus = S_UNATTEMPTED;
        this.setPrerequisite(false);
        this.contentSize = UNCALCULATED;
        this.contentLength = UNCALCULATED;
        // Clear 'links extracted' flag.
        this.linkExtractorFinished = false;
        // Clean the data map of all but registered permanent members.
        this.data = getPersistentDataMap();
        extraInfo = null;
        outLinks = null;
        this.revisitProfile = null;
        // XXX er uh surprised this wasn't here before?
        fetchType = FetchType.UNKNOWN;
    }

    public Map<String,Object> getPersistentDataMap() {
        if (data == null) {
            return null;
        }
        Map<String,Object> result = new HashMap<String,Object>(getData());
        Set<String> retain = new HashSet<String>(persistentKeys);
        if (containsDataKey(A_HERITABLE_KEYS)) {
            @SuppressWarnings("unchecked")
            HashSet<String> heritable = (HashSet<String>) getData().get(A_HERITABLE_KEYS);
            retain.addAll(heritable);
        }
        result.keySet().retainAll(retain);
        return result;
    }

    /**
     * @return Credential avatars. An empty set is created and attached if
     * none were previously set.
     */
    public Set<Credential> getCredentials() {
        @SuppressWarnings("unchecked")
        Set<Credential> r = (Set<Credential>) getData().get(A_CREDENTIALS_KEY);
        if (r == null) {
            r = new HashSet<Credential>();
            getData().put(A_CREDENTIALS_KEY, r);
        }
        return r;
    }

    /**
     * @return True if there are avatars attached to this instance.
     */
    public boolean hasCredentials() {
        return containsDataKey(A_CREDENTIALS_KEY);
    }

    /**
     * Ask this URI if it was a success or not.
     *
     * Only makes sense to call this method after execution of
     * HttpMethod#execute. Regard any status larger than 0 as success
     * except for the below caveat regarding 401s. Use {@link #is2XXSuccess()}
     * if looking for a status code in the 200 range.
     *
     * 401s caveat: If any rfc2617 credential data is present and we got a 401,
     * assume it got loaded in FetchHTTP on the expectation that we're to go
     * around the processing chain again. Report this condition as a failure so
     * we get another crack at the processing chain, only this time we'll be
     * making use of the loaded credential data.
     *
     * @return True if this URI has been successfully processed.
     * @see #is2XXSuccess()
     */
    public boolean isSuccess() {
        boolean result = false;
        int statusCode = this.fetchStatus;
        if (statusCode == 401 && hasRfc2617Credential()) {
            result = false;
        } else {
            result = (statusCode > 0);
        }
        return result;
    }

    /**
     * @return True if status code is in the 2xx range.
     * @see #isSuccess()
     */
    public boolean is2XXSuccess() {
        return this.fetchStatus >= 200 && this.fetchStatus < 300;
    }

    /**
     * @return True if we have an rfc2617 payload.
     */
    public boolean hasRfc2617Credential() {
        Set<Credential> credentials = getCredentials();
        if (credentials != null && credentials.size() > 0) {
            for (Credential credential : credentials) {
                if (credential instanceof HttpAuthenticationCredential) {
                    return true;
                }
            }
        }
        return false;
    }

    /**
     * Set the retained content-digest value (usu. SHA1).
     *
     * @param digestValue
     * @deprecated Use {@link #setContentDigest(String, byte[])}
     */
    public void setContentDigest(byte[] digestValue) {
        setContentDigest("SHA1", digestValue);
    }

    public void setContentDigest(final String scheme, final byte[] digestValue) {
        this.contentDigest = digestValue;
        this.contentDigestScheme = scheme;
    }

    public String getContentDigestSchemeString() {
        if (this.contentDigest == null) {
            return null;
        }
        return this.contentDigestScheme + ":" + getContentDigestString();
    }

    /**
     * Return the retained content-digest value, if any.
     *
     * @return Digest value.
     */
    public byte[] getContentDigest() {
        return contentDigest;
    }

    public String getContentDigestString() {
        if (this.contentDigest == null) {
            return null;
        }
        return Base32.encode(this.contentDigest);
    }

    transient protected Object holder;
    transient protected Object holderKey;

    /**
     * Remember a 'holder' to which some enclosing/queueing
     * facility has assigned this CrawlURI.
     *
     * @param obj
     */
    public void setHolder(Object obj) {
        holder = obj;
    }

    /**
     * Return the 'holder' for the convenience of
     * an external facility.
     *
     * @return holder
     */
    public Object getHolder() {
        return holder;
    }

    /**
     * Remember a 'holderKey' which some enclosing/queueing
     * facility has assigned this CrawlURI.
     *
     * @param obj
     */
    public void setHolderKey(Object obj) {
        holderKey = obj;
    }

    /**
     * Return the 'holderKey' for convenience of
     * an external facility (Frontier).
     *
     * @return holderKey
     */
    public Object getHolderKey() {
        return holderKey;
    }

    /**
     * Get the ordinal (serial number) assigned at creation.
     *
     * @return ordinal
     */
    public long getOrdinal() {
        return ordinal;
    }

    public void setOrdinal(long o) {
        this.ordinal = o;
    }

    /**
     * Spot for an integer cost to be placed by external facility (frontier).
     * Cost is truncated to 8 bits at times, so should not exceed 255.
     */
    protected int holderCost = UNCALCULATED;

    /**
     * Return the 'holderCost' for convenience of external facility (frontier).
     *
     * @return value of holderCost
     */
    public int getHolderCost() {
        return holderCost;
    }

    /**
     * Remember a 'holderCost' which some enclosing/queueing
     * facility has assigned this CrawlURI.
     *
     * @param cost value to remember
     */
    public void setHolderCost(int cost) {
        holderCost = cost;
    }

    /**
     * All discovered outbound urls as CrawlURIs (navlinks, embeds, etc.)
     */
    protected transient Collection<CrawlURI> outLinks;

    /**
     * Returns discovered links. The returned collection might be empty if
     * no links were discovered, or if something like LinksScoper promoted
     * the links to CrawlURIs.
     *
     * @return Collection of all discovered outbound links
     */
    public Collection<CrawlURI> getOutLinks() {
        if (outLinks == null) {
            outLinks = new LinkedHashSet<CrawlURI>();
        }
        return outLinks;
    }

    /**
     * Set the (HTML) Base URI used for derelativizing internal URIs.
     *
     * @param baseHref String base href to use
     * @throws URIException if supplied string cannot be interpreted as URI
     */
    public void setBaseURI(String baseHref) throws URIException {
        getData().put(A_HTML_BASE, UURIFactory.getInstance(baseHref));
    }

    /**
     * Get the (HTML) Base URI used for derelativizing internal URIs.
     *
     * @return UURI base URI previously set
     */
    public UURI getBaseURI() {
        if (!containsDataKey(A_HTML_BASE)) {
            return getUURI();
        }
        return (UURI) getData().get(A_HTML_BASE);
    }

    public static Collection<String> getPersistentDataKeys() {
        return persistentKeys;
    }

    /**
     * Add the key of items you want to persist across processings.
     *
     * @param s Key to add.
     */
    public void addPersistentDataMapKey(String s) {
        if (!persistentKeys.contains(s)) {
            addDataPersistentMember(s);
        }
    }

    /**
     * Add the key of data map items you want to persist across processings.
     *
     * @param key Key to add.
     */
    public static void addDataPersistentMember(String key) {
        persistentKeys.add(key);
    }

    /**
     * Remove the key from those data map members persisted.
     *
     * @param key Key to remove.
     * @return True if list contained the element.
     */
    public static boolean removeDataPersistentMember(String key) {
        return persistentKeys.remove(key);
    }

    private void writeObject(ObjectOutputStream stream) throws IOException {
        stream.defaultWriteObject();
        stream.writeObject((data == null || data.isEmpty()) ? null : data);
    }

    private void readObject(ObjectInputStream stream)
            throws IOException, ClassNotFoundException {
        stream.defaultReadObject();
        @SuppressWarnings("unchecked")
        Map<String,Object> temp = (Map<String,Object>) stream.readObject();
        this.data = temp;
    }

    /**
     * Read a UURI from a String, handling a null or URIException.
     *
     * @param u String or null from which to create UURI
     * @return the best UURI instance creatable
     */
    protected UURI readUuri(String u) {
        if (u == null) {
            return null;
        }
        try {
            return UURIFactory.getInstance(u);
        } catch (URIException ux) {
            // simply continue to next try
        }
        try {
            // try adding a junk scheme
            return UURIFactory.getInstance("invalid:" + u);
        } catch (URIException ux) {
            ux.printStackTrace();
            // ignored; method continues
        }
        try {
            // return total junk
            return UURIFactory.getInstance("invalid:");
        } catch (URIException e) {
            e.printStackTrace();
            return null;
        }
    }

    public String getDNSServerIPLabel() {
        if (data == null) {
            return null;
        } else {
            return (String) data.get(A_DNS_SERVER_IP_LABEL);
        }
    }

    public long getFetchBeginTime() {
        if (containsDataKey(CoreAttributeConstants.A_FETCH_BEGAN_TIME)) {
            return (Long) getData().get(CoreAttributeConstants.A_FETCH_BEGAN_TIME);
        } else {
            return 1L;
        }
    }

    public long getFetchCompletedTime() {
        if (containsDataKey(A_FETCH_COMPLETED_TIME)) {
            return (Long) getData().get(A_FETCH_COMPLETED_TIME);
        } else {
            return 0L;
        }
    }

    public long getFetchDuration() {
        if (!containsDataKey(A_FETCH_COMPLETED_TIME)) {
            return -1;
        }
        long completedTime = getFetchCompletedTime();
        long beganTime = getFetchBeginTime();
        return completedTime - beganTime;
    }

    public FetchType getFetchType() {
        return fetchType;
    }

    public Collection<Throwable> getNonFatalFailures() {
        @SuppressWarnings("unchecked")
        List<Throwable> list = (List<Throwable>) getData().get(A_NONFATAL_ERRORS);
        if (list == null) {
            list = new ArrayList<Throwable>();
            getData().put(A_NONFATAL_ERRORS, list);
        }
        // FIXME: Previous code automatically added annotation when "localized error"
        // was added; override collection to implement that?
        return list;
    }

    public void setDNSServerIPLabel(String label) {
        getData().put(A_DNS_SERVER_IP_LABEL, label);
    }

    public void setError(String msg) {
        // TODO: Figure out where this is read, if ever.
        getData().put("error", msg);
    }

    public void setFetchBeginTime(long time) {
        getData().put(CoreAttributeConstants.A_FETCH_BEGAN_TIME, time);
    }

    public void setFetchCompletedTime(long time) {
        getData().put(A_FETCH_COMPLETED_TIME, time);
    }

    public void setFetchType(FetchType type) {
        fetchType = type;
    }

    public void setForceRetire(boolean b) {
        getData().put(A_FORCE_RETIRE, b);
    }

    public void setBaseURI(UURI base) {
        getData().put(A_HTML_BASE, base);
    }

    public Map<String,Object> getData() {
        if (data == null) {
            data = new HashMap<String,Object>();
        }
        return data;
    }

    /**
     * Convenience method: return (creating if necessary) list at
     * given data key.
     *
     * @param key
     * @return List
     */
    @SuppressWarnings("unchecked")
    public List<Object> getDataList(String key) {
        if (!containsDataKey(key)) {
            getData().put(key, new ArrayList<Object>());
        }
        return (List<Object>) getData().get(key);
    }
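    // Illustrative sketch (the key name "myModule.score" is hypothetical, not
    // from the original source): a custom processor can stash state in the
    // flexible data 'bucket' and, if needed, have it survive
    // processingCleanup() and flow to discovered URIs:
    //
    //   curi.getData().put("myModule.score", 0.8);           // custom state
    //   CrawlURI.addDataPersistentMember("myModule.score");  // survive cleanup
    //   curi.makeHeritable("myModule.score");                // copy to outlinks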
    /**
     * Set the isSeed attribute of this URI.
     *
     * @param b Is this URI a seed, true or false.
     */
    public void setSeed(boolean b) {
        this.isSeed = b;
        if (this.isSeed) {
            if (pathFromSeed == null) {
                this.pathFromSeed = "";
            }
            // seeds created on redirect must have a via to be recognized; don't clear
            // setVia(null);
        }
    }

    /**
     * @return Whether seeded.
     */
    public boolean isSeed() {
        return this.isSeed;
    }

    /**
     * @return UURI
     */
    public UURI getUURI() {
        return this.uuri;
    }

    /**
     * @return String of URI
     */
    public String getURI() {
        return getUURI().toCustomString();
    }

    /**
     * @return path (hop-types) from seed
     */
    public String getPathFromSeed() {
        return this.pathFromSeed;
    }

    /** convenience access to last hop character, as string */
    public String getLastHop() {
        return StringUtils.isEmpty(pathFromSeed)
                ? "" : pathFromSeed.substring(pathFromSeed.length() - 1);
    }

    /**
     * @return URI via which this one was discovered
     */
    public UURI getVia() {
        return this.via;
    }

    public void setVia(UURI via) {
        this.via = via;
    }

    /**
     * @return CharSequence context in which this one was discovered
     */
    public LinkContext getViaContext() {
        return this.viaContext;
    }

    /**
     * @return True if this CrawlURI was result of a redirect:
     * i.e. its parent URI redirected to here, and this URI was what was in
     * the 'Location:' or 'Content-Location:' HTTP header.
     */
    public boolean isLocation() {
        return this.pathFromSeed != null && this.pathFromSeed.length() > 0
                && this.pathFromSeed.charAt(this.pathFromSeed.length() - 1)
                    == Hop.REFER.getHopChar();
    }

    //
    // Reporter implementation
    //

    public String shortReportLine() {
        return ReportUtils.shortReportLine(this);
    }

    @Override
    public Map<String,Object> shortReportMap() {
        Map<String,Object> map = new LinkedHashMap<String,Object>();
        map.put("class", getClass().getName());
        map.put("uri", getUURI().toString());
        map.put("pathFromSeed", pathFromSeed);
        map.put("flattenVia", flattenVia());
        return map;
    }

    @Override
    public void shortReportLineTo(PrintWriter w) {
        String className = this.getClass().getName();
        className = className.substring(className.lastIndexOf(".") + 1);
        w.print(className);
        w.print(" ");
        w.print(getUURI().toString());
        w.print(" ");
        w.print(pathFromSeed);
        w.print(" ");
        w.print(flattenVia());
    }

    /* (non-Javadoc)
     * @see org.archive.util.Reporter#singleLineLegend()
     */
    @Override
    public String shortReportLegend() {
        return "className uri hopsPath viaUri";
    }

    /* (non-Javadoc)
     * @see org.archive.util.Reporter#reportTo(java.io.Writer)
     */
    @Override
    public void reportTo(PrintWriter writer) throws IOException {
        shortReportLineTo(writer);
        writer.print("\n");
    }

    /**
     * Method returns string version of this URI's referral URI.
     *
     * @return String version of referral URI
     */
    public String flattenVia() {
        return (via == null) ? "" : via.toString();
    }

    public String getSourceTag() {
        return (String) getData().get(A_SOURCE_TAG);
    }

    public void setSourceTag(String sourceTag) {
        getData().put(A_SOURCE_TAG, sourceTag);
        makeHeritable(A_SOURCE_TAG);
    }

    /**
     * Make the given key 'heritable', meaning its value will be
     * added to descendant CrawlURIs. Only keys with immutable
     * values should be made heritable -- the value instance may
     * be shared until the data map is serialized/deserialized.
     *
     * @param key to make heritable
     */
    public void makeHeritable(String key) {
        @SuppressWarnings("unchecked")
        HashSet<String> heritableKeys = (HashSet<String>) data.get(A_HERITABLE_KEYS);
        if (heritableKeys == null) {
            heritableKeys = new HashSet<String>();
            heritableKeys.add(A_HERITABLE_KEYS);
            data.put(A_HERITABLE_KEYS, heritableKeys);
        }
        heritableKeys.add(key);
    }

    /**
     * Make the given key non-'heritable', meaning its value will
     * not be added to descendant CrawlURIs. Only meaningful if
     * key was previously made heritable.
     *
     * @param key to make non-heritable
     */
    public void makeNonHeritable(String key) {
        @SuppressWarnings("unchecked")
        HashSet<String> heritableKeys = (HashSet<String>) data.get(A_HERITABLE_KEYS);
        if (heritableKeys == null) {
            return;
        }
        heritableKeys.remove(key);
        if (heritableKeys.size() == 1) {
            // only remaining heritable key is itself; disable completely
            data.remove(A_HERITABLE_KEYS);
        }
    }

    /**
     * Get the token (usually the hostname + port) which indicates
     * what "class" this CrawlURI should be grouped with,
     * for the purposes of ensuring only one item of the
     * class is processed at once, all items of the class
     * are held for a politeness period, etc.
     *
     * @return Token (usually the hostname) which indicates
     * what "class" this CrawlURI should be grouped with.
     */
    public String getClassKey() {
        return classKey;
    }

    public void setClassKey(String key) {
        classKey = key;
    }

    /**
     * If this method returns true, this URI should be fetched even though
     * it already has been crawled. This also implies
     * that this URI will be scheduled for crawl before any other waiting
     * URIs for the same host.
     *
     * This value is used to refetch any expired robots.txt or dns-lookups.
     *
     * @return true if crawling of this URI should be forced
     */
    public boolean forceFetch() {
        return forceRevisit;
    }

    /**
     * Method to signal that this URI should be fetched even though
     * it already has been crawled. Setting this to true also implies
     * that this URI will be scheduled for crawl before any other waiting
     * URIs for the same host.
     *
     * This value is used to refetch any expired robots.txt or dns-lookups.
     *
     * @param b set to true to enforce the crawling of this URI
     */
    public void setForceFetch(boolean b) {
        forceRevisit = b;
    }

    /**
     * Tally up the number of transitive (non-simple-link) hops at
     * the end of this CrawlURI's pathFromSeed.
     *
     * In some cases, URIs with greater than zero but less than some
     * threshold such hops are treated specially.
     *
     * TODO: consider moving link-count in here as well, caching
     * calculation, and refactoring CrawlScope.exceedsMaxHops() to use this.
     *
     * @return Transhop count.
     */
    public int getTransHops() {
        String path = getPathFromSeed();
        int transCount = 0;
        for (int i = path.length() - 1; i >= 0; i--) {
            if (path.charAt(i) == Hop.NAVLINK.getHopChar()) {
                break;
            }
            transCount++;
        }
        return transCount;
    }

    /**
     * Inherit (copy) the relevant keys-values from the ancestor.
     *
     * @param ancestor
     */
    protected void inheritFrom(CrawlURI ancestor) {
        Map<String,Object> adata = ancestor.getData();
        @SuppressWarnings("unchecked")
        HashSet<String> heritableKeys = (HashSet<String>) adata.get(A_HERITABLE_KEYS);
        Map<String,Object> thisData = getData();
        if (heritableKeys != null) {
            for (String key : heritableKeys) {
                thisData.put(key, adata.get(key));
            }
        }
    }

    /**
     * Utility method for creating CrawlURIs found as outlinks of this
     * CrawlURI.
     *
     * Any relative URIs will be treated as relative to this CrawlURI's UURI.
     *
     * @param destination The new URI, possibly a relative URI
     * @param context
     * @param hop
     * @return New CrawlURI with the current CrawlURI set as the one it inherits from
     * @throws URIException
     */
    public CrawlURI createCrawlURI(UURI destination, LinkContext context, Hop hop)
            throws URIException {
        return createCrawlURI(destination.toString(), context, hop);
    }

    public CrawlURI createCrawlURI(String destination, LinkContext context, Hop hop)
            throws URIException {
        UURI u = UURIFactory.getInstance(this.getBaseURI(), destination);
        CrawlURI newCaURI = new CrawlURI(
                u,
                extendHopsPath(getPathFromSeed(), hop.getHopChar()),
                this.getUURI(),
                context);
        newCaURI.inheritFrom(this);
        return newCaURI;
    }

    /**
     * Extend a 'hopsPath' (pathFromSeed string of single-character hop-type
     * symbols), keeping the number of displayed hop-types under
     * MAX_HOPS_DISPLAYED. For longer hops paths, precede the string with an
     * integer and '+', then the displayed hops.
     *
     * @param pathFromSeed
     * @param hopChar
     */
    public static String extendHopsPath(String pathFromSeed, char hopChar) {
        if (pathFromSeed.length() < MAX_HOPS_DISPLAYED) {
            return pathFromSeed + hopChar;
        }
        // overflow format, as parsed by getHopCount(): "overflowCount+lastHops"
        int plusIndex = pathFromSeed.indexOf('+');
        int prevOverflow = (plusIndex < 0)
                ? 0 : Integer.parseInt(pathFromSeed.substring(0, plusIndex));
        return (prevOverflow + 1) + "+" + pathFromSeed.substring(plusIndex + 2) + hopChar;
    }

    @Override
    public String toString() {
        return getUURI().toCustomString();
    }
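    // Worked example (values chosen for illustration): with MAX_HOPS_DISPLAYED
    // of 50, extending a 50-hop path "LL...LE" with 'L' yields "1+L...EL": the
    // oldest displayed hop is dropped and the leading counter records how many
    // hops have overflowed. getHopCount() adds the counter back onto the
    // displayed remainder, so the reported total is still 51.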
    //
    // OverlayContext implementation
    //

    transient protected ArrayList<String> overlayNames = null;
    transient protected OverlayMapsSource overlayMapsSource;

    public boolean haveOverlayNamesBeenSet() {
        return overlayNames != null;
    }

    public ArrayList<String> getOverlayNames() {
        if (overlayNames == null) {
            overlayNames = new ArrayList<String>();
        }
        return overlayNames;
    }

    public Map<String,Object> getOverlayMap(String name) {
        return overlayMapsSource.getOverlayMap(name);
    }

    public void setOverlayMapsSource(OverlayMapsSource overrideMapsSource) {
        this.overlayMapsSource = overrideMapsSource;
    }

    protected String canonicalString;

    public void setCanonicalString(String canonical) {
        this.canonicalString = canonical;
    }

    public String getCanonicalString() {
        if (StringUtils.isEmpty(canonicalString)) {
            logger.warning("canonicalString unset, returning uncanonicalized " + getURI());
            return getURI();
        }
        return canonicalString;
    }

    protected long politenessDelay = -1;

    public void setPolitenessDelay(long polite) {
        this.politenessDelay = polite;
    }

    public long getPolitenessDelay() {
        if (politenessDelay < 0) {
            logger.warning("politenessDelay unset, returning default 5000 for " + this);
            return 5000;
        }
        return this.politenessDelay;
    }

    transient protected CrawlURI fullVia;

    public void setFullVia(CrawlURI curi) {
        this.fullVia = curi;
    }

    public CrawlURI getFullVia() {
        return fullVia;
    }

    /**
     * A future time at which this CrawlURI should be reenqueued.
     */
    protected long rescheduleTime = -1;

    public void setRescheduleTime(long time) {
        this.rescheduleTime = time;
    }

    public long getRescheduleTime() {
        return this.rescheduleTime;
    }

    /**
     * Reset state that should not persist when a URI is
     * rescheduled for a specific future time.
     */
    public void resetForRescheduling() {
        // retries from now don't count against future try
        resetFetchAttempts();
        // deferrals from now don't count against future try
        resetDeferrals();
    }

    public boolean includesRetireDirective() {
        return containsDataKey(A_FORCE_RETIRE)
                && (Boolean) getData().get(A_FORCE_RETIRE);
    }

    protected JSONObject extraInfo;

    public JSONObject getExtraInfo() {
        if (extraInfo == null) {
            extraInfo = new JSONObject();
        }
        return extraInfo;
    }

    public void addExtraInfo(String key, Object value) {
        try {
            getExtraInfo().put(key, value);
        } catch (JSONException e) {
            logger.log(Level.WARNING, "failed to add extra info", e);
        }
    }

    // Kryo support
    @SuppressWarnings("unused")
    private CrawlURI() {}

    public static void autoregisterTo(AutoKryo kryo) {
        // kryo.register(CrawlURI.class, new DeflateCompressor(kryo.newSerializer(CrawlURI.class)));
        kryo.register(CrawlURI.class);
        kryo.autoregister(byte[].class);
        kryo.autoregister(java.util.HashSet.class);
        kryo.autoregister(java.util.HashMap.class);
        kryo.autoregister(org.archive.net.UURI.class);
        kryo.autoregister(org.archive.modules.extractor.HTMLLinkContext.class);
        kryo.autoregister(org.archive.modules.extractor.LinkContext.SimpleLinkContext.class);
        kryo.autoregister(java.util.HashMap[].class);
        kryo.autoregister(org.archive.modules.credential.HttpAuthenticationCredential.class);
        kryo.autoregister(org.archive.modules.credential.HtmlFormCredential.class);
        kryo.autoregister(org.apache.commons.httpclient.NameValuePair.class);
        kryo.autoregister(org.apache.commons.httpclient.NameValuePair[].class);
        kryo.autoregister(FetchType.class);
        kryo.setRegistrationOptional(true);
    }

    /**
     * Do all actions associated with setting a CrawlURI as
     * requiring a prerequisite.
     *
     * @return the newly created prerequisite CrawlURI
     * @throws URIException
     */
    public CrawlURI markPrerequisite(String preq) throws URIException {
        CrawlURI caUri = createCrawlURI(preq, LinkContext.PREREQ_MISC, Hop.PREREQ);
        caUri.setPrerequisite(true);
        // TODO: consider moving some of this to configurable candidate-handling
        int prereqPriority = getSchedulingDirective() - 1;
        if (prereqPriority < 0) {
            prereqPriority = 0;
            logger.severe("Unable to promote prerequisite " + caUri + " above " + this);
        }
        caUri.setSchedulingDirective(prereqPriority);
        caUri.setForceFetch(true);
        setPrerequisiteUri(caUri);
        incrementDeferrals();
        setFetchStatus(S_DEFERRED);
        return caUri;
    }

    public boolean containsContentTypeCharsetDeclaration() {
        // TODO: can this regex be improved? should the test consider if it's legal?
        return getContentType().matches("(?i).*charset=.*");
    }

    /**
     * @param key http response header key (case-insensitive)
     * @return value of the header or null if there is no such header
     * @since 3.3.0
     */
    public String getHttpResponseHeader(String key) {
        @SuppressWarnings("unchecked")
        Map<String,String> httpResponseHeaders =
                (Map<String,String>) getData().get(A_HTTP_RESPONSE_HEADERS);
        if (httpResponseHeaders == null) {
            return null;
        }
        return httpResponseHeaders.get(key.toLowerCase());
    }

    /**
     * @since 3.3.0
     */
    public void putHttpResponseHeader(String key, String value) {
        @SuppressWarnings("unchecked")
        Map<String,String> httpResponseHeaders =
                (Map<String,String>) getData().get(A_HTTP_RESPONSE_HEADERS);
        if (httpResponseHeaders == null) {
            httpResponseHeaders = new HashMap<String,String>();
            getData().put(A_HTTP_RESPONSE_HEADERS, httpResponseHeaders);
        }
        httpResponseHeaders.put(key.toLowerCase(), value);
    }

    @SuppressWarnings("unchecked")
    public Map<String,String> getHttpAuthChallenges() {
        return (Map<String,String>) getData().get(A_HTTP_AUTH_CHALLENGES);
    }

    public void setHttpAuthChallenges(Map<String,String> httpAuthChallenges) {
        getData().put(A_HTTP_AUTH_CHALLENGES, httpAuthChallenges);
    }

    @SuppressWarnings("unchecked")
    public HashMap<String,Object>[] getFetchHistory() {
        return (HashMap<String,Object>[]) getData().get(A_FETCH_HISTORY);
    }

    public HashMap<String,Object> getContentDigestHistory() {
        @SuppressWarnings("unchecked")
        HashMap<String,Object> contentDigestHistory =
                (HashMap<String,Object>) getData().get(A_CONTENT_DIGEST_HISTORY);
        if (contentDigestHistory == null) {
            contentDigestHistory = new HashMap<String,Object>();
            getData().put(A_CONTENT_DIGEST_HISTORY, contentDigestHistory);
        }
        return contentDigestHistory;
    }

    public boolean hasContentDigestHistory() {
        return getData().get(A_CONTENT_DIGEST_HISTORY) != null;
    }

    /**
     * Indicates if this CrawlURI object has been deemed a revisit.
     */
    public boolean isRevisit() {
        return revisitProfile != null;
    }

    public RevisitProfile getRevisitProfile() {
        return revisitProfile;
    }

    public void setRevisitProfile(RevisitProfile revisitProfile) {
        this.revisitProfile = revisitProfile;
    }

    // brought over from old Link class
    @Override
    public int compareTo(CrawlURI o) {
        int cmp = compare(via.toString(), o.via.toString());
        if (cmp == 0) {
            cmp = compare(uuri.toString(), o.uuri.toString());
        }
        if (cmp == 0) {
            cmp = compare(viaContext.toString(), o.viaContext.toString());
        }
        if (cmp == 0) {
            cmp = compare(pathFromSeed, o.pathFromSeed);
        }
        return cmp;
    }

    // brought over from old Link class
    @Override
    public int hashCode() {
        int r = 37;
        return r ^ hash(via.toString()) ^ hash(uuri.toString())
                ^ hash(viaContext.toString()) ^ hash(pathFromSeed);
    }

    // handles nulls
    private static int hash(String a) {
        return a == null ? 0 : a.hashCode();
    }

    // handles nulls
    private static boolean equals(Object a, Object b) {
        return a == null ? b == null : a.equals(b);
    }

    // handles nulls
    private static int compare(String a, String b) {
        if (a == null && b == null) {
            return 0;
        } else if (a == null && b != null) {
            return -1;
        } else if (a != null && b == null) {
            return 1;
        } else {
            return a.compareTo(b);
        }
    }

    // brought over from old Link class
    @Override
    public boolean equals(Object o) {
        if (!(o instanceof CrawlURI)) {
            return false;
        }
        CrawlURI u = (CrawlURI) o;
        return equals(via, u.via)
                && equals(uuri, u.uuri)
                && equals(viaContext, u.viaContext)
                && equals(pathFromSeed, u.pathFromSeed);
    }
}
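The listing is easiest to follow end to end with a small driver. The sketch below is illustrative only: the class name CrawlURIDemo and the example URLs are this page's inventions, and it assumes the stock LinkContext.NAVLINK_MISC constant; it exercises only methods shown in the source above.

import org.apache.commons.httpclient.URIException;
import org.archive.modules.CrawlURI;
import org.archive.modules.extractor.Hop;
import org.archive.modules.extractor.LinkContext;
import org.archive.net.UURIFactory;

public class CrawlURIDemo {
    public static void main(String[] args) throws URIException {
        // A seed: empty hops path, no via.
        CrawlURI seed = new CrawlURI(UURIFactory.getInstance("http://example.com/"));
        seed.setSeed(true);

        // An outlink discovered on the seed page; relative URIs resolve
        // against getBaseURI(), and heritable data is copied across.
        CrawlURI outlink = seed.createCrawlURI("/page.html",
                LinkContext.NAVLINK_MISC, Hop.NAVLINK);
        System.out.println(outlink.getPathFromSeed());   // "L"
        System.out.println(outlink.getHopCount());       // 1

        // Response headers are stored with lowercased keys by
        // putHttpResponseHeader, so lookups are case-insensitive.
        outlink.putHttpResponseHeader("Content-Type", "text/html;charset=utf-8");
        System.out.println(outlink.getHttpResponseHeader("content-type"));
    }
}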