All downloads are free. Search and download functionality uses the official Maven repository.

org.archive.modules.recrawl.RecrawlAttributeConstants Maven / Gradle / Ivy

Go to download

This project contains some of the configurable modules used within the Heritrix application to crawl the web. The modules in this project can be used in applications other than Heritrix, however.

There is a newer version: 3.5.0
Show newest version
/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package org.archive.modules.recrawl;

/**
 * Attribute-name constants for duplication reduction, recrawl and fetch
 * history.
 *
 * <p>Fields declared in an interface are implicitly {@code public static
 * final}, so the modifiers are not repeated on each declaration.
 *
 * @author pjack
 */
public interface RecrawlAttributeConstants {

    // ------------------------------------------------------------------
    // Duplication-reduction / recrawl / history constants
    // ------------------------------------------------------------------

    /** Key for the fetch history array. */
    String A_FETCH_HISTORY = "fetch-history";

    /** Key for the content digest. */
    String A_CONTENT_DIGEST = "content-digest";

    /** Header name, also used as the AList key, for the last-modified timestamp. */
    String A_LAST_MODIFIED_HEADER = "last-modified";

    /** Header name, also used as the AList key, for the ETag. */
    String A_ETAG_HEADER = "etag";

    /** Key for the status, when stored in history. */
    String A_STATUS = "status";

    /** Key for the reference length (content length or virtual length). */
    String A_REFERENCE_LENGTH = "reference-length";

    // ------------------------------------------------------------------
    // Constants for URI-agnostic, content-digest-based dedupe
    // ------------------------------------------------------------------

    /** Key for the content digest history map. */
    String A_CONTENT_DIGEST_HISTORY = "content-digest-history";

    /** Key for the URL that the content payload was written for. */
    String A_ORIGINAL_URL = "original-url";

    /** Key for the WARC record id of the record holding the content payload. */
    String A_WARC_RECORD_ID = "warc-record-id";

    /** Key for the name of the WARC file containing the content payload. */
    String A_WARC_FILENAME = "warc-filename";

    /** Key for the offset, within the WARC file, of the record holding the content payload. */
    String A_WARC_FILE_OFFSET = "warc-file-offset";

    /** Key for the date on which the content payload was written. */
    String A_ORIGINAL_DATE = "content-written-date";

    /**
     * Key for the number of times this content digest has been seen
     * (1 original + n duplicates).
     */
    String A_CONTENT_DIGEST_COUNT = "content-digest-count";

    /**
     * Key for the 'writeTag' (analogous to the HTTP 'etag') that writer
     * processors of all types are encouraged to put in the CrawlURI state.
     *
     * <p>The tag's contents are opaque and private to the writer, though they
     * might generally be a WARC-name/offset/UUID/etc. Its mere presence means
     * the content has been written somewhere. A writer processor that decides
     * not to write fresh content at all (not even a revisit record) because it
     * sees a sufficient previous writeTag in history will usually copy that
     * tag forward to the latest history record.
     *
     * <p>{@link PersistLogProcessor}/{@link PersistStoreProcessor} have an
     * option {@link PersistProcessor#onlyStoreIfWriteTagPresent}, which
     * defaults to true.
     */
    String A_WRITE_TAG = "write-tag";
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy