All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.jwat.warc.WarcConstants Maven / Gradle / Ivy

/**
 * Java Web Archive Toolkit - Software to read and validate ARC, WARC
 * and GZip files. (http://jwat.org/)
 * Copyright 2011-2012 Netarkivet.dk (http://netarkivet.dk/)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.jwat.warc;

import java.util.HashMap;
import java.util.Map;

/**
 * Class containing all relevant WARC constants and structures.
 * Including but not limited to field names and mime-types.
 * Also includes statically initialized structures for validation.
 *
 * <p>Field names, record types, truncation reasons and revisit profiles are
 * each exposed three ways: as string constants, as integer ids
 * ({@code *_IDX_*}) and as a lower-cased-string to id lookup map. Id 0 is
 * reserved in every id space, which is why the id-indexed arrays carry an
 * unused slot at index 0.</p>
 *
 * @author nicl
 */
public class WarcConstants {

    /**
     * This utility class does not require instantiation.
     */
    protected WarcConstants() {
    }

    /**
     * A WARC header block starts with this string including trailing version
     * information.
     * */
    public static final String WARC_MAGIC_HEADER = "WARC/";

    /**
     * End mark used after each record consisting of two newlines.
     * Spelled out as bytes (CR LF CR LF) so the value does not depend on the
     * platform default charset.
     */
    protected static byte[] endMark = {'\r', '\n', '\r', '\n'};

    /** WARC date format string as specified by the WARC ISO standard. */
    public static final String WARC_DATE_FORMAT = "yyyy-MM-dd'T'HH:mm:ss'Z'";

    /**
     * WARC digest format string as specified by the WARC ISO standard.
     * A human readable description of the expected layout
     * ({@code algorithm ":" encoded-digest}), not a parseable pattern.
     */
    public static final String WARC_DIGEST_FORMAT = "<digest-algorithm>:<digest-encoded>";

    /**
     * Content-type format string as specified in RFC2616.
     * A human readable description of the expected layout, not a parseable
     * pattern.
     */
    public static final String CONTENT_TYPE_FORMAT = "<type>/<sub-type>(; <argument>=<value>)*";

    /*
     * WARC content-types (MIME).
     */

    /** WARC mime type. */
    public static final String WARC_MIME_TYPE = "application/warc";

    /** Suggested content-type/media-type for metadata records and others. */
    public static final String CT_APP_WARC_FIELDS = "application/warc-fields";

    /** Suggested content-type for metadata records and others. */
    public static final String CONTENT_TYPE_METADATA = "application";

    /** Suggested media-type for metadata records and others. */
    public static final String MEDIA_TYPE_METADATA = "warc-fields";

    //"text/dns"
    //"application/http;msgtype=request"
    //"application/http;msgtype=response"

    /*
     * Voodoo magic constants.
     */

    /** Trailing newlines after each record as per the WARC ISO standard. */
    public static final int WARC_RECORD_TRAILING_NEWLINES = 2;

    /** Number of WARC fields. */
    public static final int FN_NUMBER = 21;

    /**
     * One more than the highest WARC field id, i.e. the length required for
     * an array indexed by field id (ids run from 1 to {@link #FN_NUMBER},
     * index 0 is unused). Despite the name this is not the last valid index.
     */
    public static final int FN_INDEX_OF_LAST = FN_NUMBER+1;

    /** Number of WARC types. */
    public static final int RT_NUMBER = 8;

    /**
     * One more than the highest WARC type id, i.e. the length required for
     * an array indexed by record type id (ids run from 1 to
     * {@link #RT_NUMBER}, index 0 is the unknown type). Despite the name this
     * is not the last valid index.
     */
    public static final int RT_INDEX_OF_LAST = RT_NUMBER+1;

    /*
     * WARC field names.
     */

    /** Warc-type field name. */
    public static final String FN_WARC_TYPE = "WARC-Type";
    /** Warc-record-id field name. */
    public static final String FN_WARC_RECORD_ID = "WARC-Record-ID";
    /** Warc-date field name. */
    public static final String FN_WARC_DATE = "WARC-Date";
    /** Content-length field name. */
    public static final String FN_CONTENT_LENGTH = "Content-Length";
    /** Content-type field name. */
    public static final String FN_CONTENT_TYPE = "Content-Type";
    /** Warc-concurrent-to field name. */
    public static final String FN_WARC_CONCURRENT_TO = "WARC-Concurrent-To";
    /** Warc-block-digest field name. */
    public static final String FN_WARC_BLOCK_DIGEST = "WARC-Block-Digest";
    /** Warc-payload-digest field name. */
    public static final String FN_WARC_PAYLOAD_DIGEST = "WARC-Payload-Digest";
    /** Warc-ip-address field name. */
    public static final String FN_WARC_IP_ADDRESS = "WARC-IP-Address";
    /** Warc-refers-to field name. */
    public static final String FN_WARC_REFERS_TO = "WARC-Refers-To";
    /** Warc-target-uri field name. */
    public static final String FN_WARC_TARGET_URI = "WARC-Target-URI";
    /** Warc-truncated field name. */
    public static final String FN_WARC_TRUNCATED = "WARC-Truncated";
    /** Warc-warcinfo-id field name. */
    public static final String FN_WARC_WARCINFO_ID = "WARC-Warcinfo-ID";
    /** Warc-filename field name. */
    public static final String FN_WARC_FILENAME = "WARC-Filename";
    /** Warc-profile field name. */
    public static final String FN_WARC_PROFILE = "WARC-Profile";
    /** Warc-identified-payload-type field name. */
    public static final String FN_WARC_IDENTIFIED_PAYLOAD_TYPE = "WARC-Identified-Payload-Type";
    /** Warc-segment-origin-id field name. */
    public static final String FN_WARC_SEGMENT_ORIGIN_ID = "WARC-Segment-Origin-ID";
    /** Warc-segment-number field name. */
    public static final String FN_WARC_SEGMENT_NUMBER = "WARC-Segment-Number";
    /** Warc-segment-total-length field name. */
    public static final String FN_WARC_SEGMENT_TOTAL_LENGTH = "WARC-Segment-Total-Length";
    /** WARC-Refers-To-Target-URI field name. */
    public static final String FN_WARC_REFERS_TO_TARGET_URI = "WARC-Refers-To-Target-URI";
    /** WARC-Refers-To-Date field name. */
    public static final String FN_WARC_REFERS_TO_DATE = "WARC-Refers-To-Date";

    /** WARC field name id to field name mapping table.
     *  Indexed by the {@code FN_IDX_*} ids, which start at 1.
     *  (Index 0 is unused) */
    public static final String[] FN_IDX_STRINGS = {
        null,
        FN_WARC_TYPE,
        FN_WARC_RECORD_ID,
        FN_WARC_DATE,
        FN_CONTENT_LENGTH,
        FN_CONTENT_TYPE,
        FN_WARC_CONCURRENT_TO,
        FN_WARC_BLOCK_DIGEST,
        FN_WARC_PAYLOAD_DIGEST,
        FN_WARC_IP_ADDRESS,
        FN_WARC_REFERS_TO,
        FN_WARC_TARGET_URI,
        FN_WARC_TRUNCATED,
        FN_WARC_WARCINFO_ID,
        FN_WARC_FILENAME,
        FN_WARC_PROFILE,
        FN_WARC_IDENTIFIED_PAYLOAD_TYPE,
        FN_WARC_SEGMENT_ORIGIN_ID,
        FN_WARC_SEGMENT_NUMBER,
        FN_WARC_SEGMENT_TOTAL_LENGTH,
        FN_WARC_REFERS_TO_TARGET_URI,
        FN_WARC_REFERS_TO_DATE
    };

    /** Warc reader warc-type field name id. */
    public static final int FN_IDX_WARC_TYPE = 1;
    /** Warc reader warc-record-id field name id. */
    public static final int FN_IDX_WARC_RECORD_ID = 2;
    /** Warc reader warc-date field name id. */
    public static final int FN_IDX_WARC_DATE = 3;
    /** Warc reader content-length field name id. */
    public static final int FN_IDX_CONTENT_LENGTH = 4;
    /** Warc reader content-type field name id. */
    public static final int FN_IDX_CONTENT_TYPE = 5;
    /** Warc reader warc-concurrent-to field name id. */
    public static final int FN_IDX_WARC_CONCURRENT_TO = 6;
    /** Warc reader warc-block-digest field name id. */
    public static final int FN_IDX_WARC_BLOCK_DIGEST = 7;
    /** Warc reader warc-payload-digest field name id. */
    public static final int FN_IDX_WARC_PAYLOAD_DIGEST = 8;
    /** Warc reader warc-ip-address field name id. */
    public static final int FN_IDX_WARC_IP_ADDRESS = 9;
    /** Warc reader warc-refers-to field name id. */
    public static final int FN_IDX_WARC_REFERS_TO = 10;
    /** Warc reader warc-target-uri field name id. */
    public static final int FN_IDX_WARC_TARGET_URI = 11;
    /** Warc reader warc-truncated field name id. */
    public static final int FN_IDX_WARC_TRUNCATED = 12;
    /** Warc reader warc-warcinfo-id field name id. */
    public static final int FN_IDX_WARC_WARCINFO_ID = 13;
    /** Warc reader warc-filename field name id. */
    public static final int FN_IDX_WARC_FILENAME = 14;                    // warcinfo only
    /** Warc reader warc-profile field name id. */
    public static final int FN_IDX_WARC_PROFILE = 15;                    // revisit only
    /** Warc reader warc-identified-payload-type field name id. */
    public static final int FN_IDX_WARC_IDENTIFIED_PAYLOAD_TYPE = 16;
    /** Warc reader warc-segment-origin-id field name id. */
    public static final int FN_IDX_WARC_SEGMENT_ORIGIN_ID = 17;            // continuation only
    /** Warc reader warc-segment-number field name id. */
    public static final int FN_IDX_WARC_SEGMENT_NUMBER = 18;
    /** Warc reader warc-segment-total-length field name id. */
    public static final int FN_IDX_WARC_SEGMENT_TOTAL_LENGTH = 19;        //continuation only
    /** WARC-Refers-To-Target-URI field name id. */
    public static final int FN_IDX_WARC_REFERS_TO_TARGET_URI = 20;
    /** WARC-Refers-To-Date field name id. */
    public static final int FN_IDX_WARC_REFERS_TO_DATE = 21;

    /**
     * Map used to identify known warc field names.
     * Keys are the lower-cased field names, values the {@code FN_IDX_*} ids.
     */
    public static final Map<String, Integer> fieldNameIdxMap = new HashMap<String, Integer>();

    /**
     * Populate map of known WARC field names.
     */
    static {
        // NOTE(review): toLowerCase() uses the JVM default locale; lookups
        // presumably lower-case their key the same way - confirm against
        // callers before switching to an explicit Locale.
        fieldNameIdxMap.put(FN_WARC_TYPE.toLowerCase(), FN_IDX_WARC_TYPE);
        fieldNameIdxMap.put(FN_WARC_RECORD_ID.toLowerCase(), FN_IDX_WARC_RECORD_ID);
        fieldNameIdxMap.put(FN_WARC_DATE.toLowerCase(), FN_IDX_WARC_DATE);
        fieldNameIdxMap.put(FN_CONTENT_LENGTH.toLowerCase(), FN_IDX_CONTENT_LENGTH);
        fieldNameIdxMap.put(FN_CONTENT_TYPE.toLowerCase(), FN_IDX_CONTENT_TYPE);
        fieldNameIdxMap.put(FN_WARC_CONCURRENT_TO.toLowerCase(), FN_IDX_WARC_CONCURRENT_TO);
        fieldNameIdxMap.put(FN_WARC_BLOCK_DIGEST.toLowerCase(), FN_IDX_WARC_BLOCK_DIGEST);
        fieldNameIdxMap.put(FN_WARC_PAYLOAD_DIGEST.toLowerCase(), FN_IDX_WARC_PAYLOAD_DIGEST);
        fieldNameIdxMap.put(FN_WARC_IP_ADDRESS.toLowerCase(), FN_IDX_WARC_IP_ADDRESS);
        fieldNameIdxMap.put(FN_WARC_REFERS_TO.toLowerCase(), FN_IDX_WARC_REFERS_TO);
        fieldNameIdxMap.put(FN_WARC_TARGET_URI.toLowerCase(), FN_IDX_WARC_TARGET_URI);
        fieldNameIdxMap.put(FN_WARC_TRUNCATED.toLowerCase(), FN_IDX_WARC_TRUNCATED);
        fieldNameIdxMap.put(FN_WARC_WARCINFO_ID.toLowerCase(), FN_IDX_WARC_WARCINFO_ID);
        fieldNameIdxMap.put(FN_WARC_FILENAME.toLowerCase(), FN_IDX_WARC_FILENAME);
        fieldNameIdxMap.put(FN_WARC_PROFILE.toLowerCase(), FN_IDX_WARC_PROFILE);
        fieldNameIdxMap.put(FN_WARC_IDENTIFIED_PAYLOAD_TYPE.toLowerCase(), FN_IDX_WARC_IDENTIFIED_PAYLOAD_TYPE);
        fieldNameIdxMap.put(FN_WARC_SEGMENT_ORIGIN_ID.toLowerCase(), FN_IDX_WARC_SEGMENT_ORIGIN_ID);
        fieldNameIdxMap.put(FN_WARC_SEGMENT_NUMBER.toLowerCase(), FN_IDX_WARC_SEGMENT_NUMBER);
        fieldNameIdxMap.put(FN_WARC_SEGMENT_TOTAL_LENGTH.toLowerCase(), FN_IDX_WARC_SEGMENT_TOTAL_LENGTH);
        fieldNameIdxMap.put(FN_WARC_REFERS_TO_TARGET_URI.toLowerCase(), FN_IDX_WARC_REFERS_TO_TARGET_URI);
        fieldNameIdxMap.put(FN_WARC_REFERS_TO_DATE.toLowerCase(), FN_IDX_WARC_REFERS_TO_DATE);
    }

    /** WARC String field datatype identifier. */
    public static final int FDT_STRING = 0;
    /** WARC Integer field datatype identifier. */
    public static final int FDT_INTEGER = 1;
    /** WARC Long field datatype identifier. */
    public static final int FDT_LONG = 2;
    /** WARC Digest field datatype identifier. */
    public static final int FDT_DIGEST = 3;
    /** WARC ContentType field datatype identifier. */
    public static final int FDT_CONTENTTYPE = 4;
    /** WARC Date field datatype identifier. */
    public static final int FDT_DATE = 5;
    /** WARC InetAddress field datatype identifier. */
    public static final int FDT_INETADDRESS = 6;
    /** WARC URI field datatype identifier. */
    public static final int FDT_URI = 7;

    /** WARC field datatype id to field datatype name mapping table.
     *  Indexed by the {@code FDT_*} ids, which start at 0. */
    public static final String[] FDT_IDX_STRINGS = {
        "String",
        "Integer",
        "Long",
        "Digest",
        "ContentType",
        "Date",
        "InetAddress",
        "URI"
    };

    /** Array to lookup WARC field datatypes.
     *  Indexed by the {@code FN_IDX_*} ids and holding the {@code FDT_*}
     *  datatype of each field; the unused index 0 holds -1. */
    public static final int[] FN_IDX_DT = {
        -1,
        FDT_STRING,
        FDT_URI,
        FDT_DATE,
        FDT_LONG,
        FDT_CONTENTTYPE,
        FDT_URI,
        FDT_DIGEST,
        FDT_DIGEST,
        FDT_INETADDRESS,
        FDT_URI,
        FDT_URI,
        FDT_STRING,
        FDT_URI,
        FDT_STRING,
        FDT_URI,
        FDT_CONTENTTYPE,
        FDT_URI,
        FDT_INTEGER,
        FDT_LONG,
        FDT_URI,
        FDT_DATE
    };

    /*
     * WARC fields that can have multiple occurrences in a Warc header.
     */

    /** Lookup table of Warc fields that can have multiple occurrences.
     *  Indexed by the {@code FN_IDX_*} ids. */
    public static final boolean[] fieldNamesRepeatableLookup = new boolean[FN_INDEX_OF_LAST];

    /**
     * Populate multiple occurrences lookup table.
     */
    static {
        fieldNamesRepeatableLookup[FN_IDX_WARC_CONCURRENT_TO] = true;
    }

    /*
     * WARC record types.
     */

    /** WARC-Type warcinfo id. */
    public static final String RT_WARCINFO = "warcinfo";
    /** WARC-Type response id. */
    public static final String RT_RESPONSE = "response";
    /** WARC-Type resource id. */
    public static final String RT_RESOURCE = "resource";
    /** WARC-Type request id. */
    public static final String RT_REQUEST = "request";
    /** WARC-Type metadata id. */
    public static final String RT_METADATA = "metadata";
    /** WARC-Type revisit id. */
    public static final String RT_REVISIT = "revisit";
    /** WARC-Type conversion id. */
    public static final String RT_CONVERSION = "conversion";
    /** WARC-Type continuation id. */
    public static final String RT_CONTINUATION = "continuation";

    /** WARC type id to field name mapping table.
     *  Indexed by the {@code RT_IDX_*} ids; index 0
     *  ({@link #RT_IDX_UNKNOWN}) has no name. */
    public static final String[] RT_IDX_STRINGS = {
        null,
        RT_WARCINFO,
        RT_RESPONSE,
        RT_RESOURCE,
        RT_REQUEST,
        RT_METADATA,
        RT_REVISIT,
        RT_CONVERSION,
        RT_CONTINUATION
    };

    /** Warc reader unknown warc record type id. */
    public static final int RT_IDX_UNKNOWN = 0;
    /** Warc reader warcinfo warc record type id. */
    public static final int RT_IDX_WARCINFO = 1;
    /** Warc reader response warc record type id. */
    public static final int RT_IDX_RESPONSE = 2;
    /** Warc reader resource warc record type id. */
    public static final int RT_IDX_RESOURCE = 3;
    /** Warc reader request warc record type id. */
    public static final int RT_IDX_REQUEST = 4;
    /** Warc reader metadata warc record type id. */
    public static final int RT_IDX_METADATA = 5;
    /** Warc reader revisit warc record type id. */
    public static final int RT_IDX_REVISIT = 6;
    /** Warc reader conversion warc record type id. */
    public static final int RT_IDX_CONVERSION = 7;
    /** Warc reader continuation warc record type id. */
    public static final int RT_IDX_CONTINUATION = 8;

    /**
     * WARC-Type lookup map.
     * Keys are the lower-cased record type names, values the
     * {@code RT_IDX_*} ids.
     */
    public static final Map<String, Integer> recordTypeIdxMap = new HashMap<String, Integer>();

    /**
     * Populate WARC-Type lookup map.
     */
    static {
        recordTypeIdxMap.put(RT_WARCINFO.toLowerCase(), RT_IDX_WARCINFO);
        recordTypeIdxMap.put(RT_RESPONSE.toLowerCase(), RT_IDX_RESPONSE);
        recordTypeIdxMap.put(RT_RESOURCE.toLowerCase(), RT_IDX_RESOURCE);
        recordTypeIdxMap.put(RT_REQUEST.toLowerCase(), RT_IDX_REQUEST);
        recordTypeIdxMap.put(RT_METADATA.toLowerCase(), RT_IDX_METADATA);
        recordTypeIdxMap.put(RT_REVISIT.toLowerCase(), RT_IDX_REVISIT);
        recordTypeIdxMap.put(RT_CONVERSION.toLowerCase(), RT_IDX_CONVERSION);
        recordTypeIdxMap.put(RT_CONTINUATION.toLowerCase(), RT_IDX_CONTINUATION);
    }

    /*
     * Truncation reason types.
     */

    /** WARC-Truncated length id. */
    public static final String TT_LENGTH = "length";
    /** WARC-Truncated time id. */
    public static final String TT_TIME = "time";
    /** WARC-Truncated disconnect id. */
    public static final String TT_DISCONNECT = "disconnect";
    /** WARC-Truncated unspecified id. */
    public static final String TT_UNSPECIFIED = "unspecified";

    /** WARC truncation reason id to field name mapping table.
     *  Indexed by the {@code TT_IDX_*} ids; index 0
     *  ({@link #TT_IDX_FUTURE_REASON}) has no name. */
    public static final String[] TT_IDX_STRINGS = {
        null,
        TT_LENGTH,
        TT_TIME,
        TT_DISCONNECT,
        TT_UNSPECIFIED
    };

    /** Warc reader future reason id. */
    public static final int TT_IDX_FUTURE_REASON = 0;
    /** Warc reader length reason id. */
    public static final int TT_IDX_LENGTH = 1;
    /** Warc reader time reason id. */
    public static final int TT_IDX_TIME = 2;
    /** Warc reader disconnect reason id. */
    public static final int TT_IDX_DISCONNECT = 3;
    /** Warc reader unspecified reason id. */
    public static final int TT_IDX_UNSPECIFIED = 4;

    /**
     * Lookup map for known truncation reason id's.
     * Keys are the lower-cased reason names, values the {@code TT_IDX_*} ids.
     */
    public static final Map<String, Integer> truncatedTypeIdxMap = new HashMap<String, Integer>();

    /**
     * Populate truncation reason id lookup map.
     */
    static {
        truncatedTypeIdxMap.put(TT_LENGTH.toLowerCase(), TT_IDX_LENGTH);
        truncatedTypeIdxMap.put(TT_TIME.toLowerCase(), TT_IDX_TIME);
        truncatedTypeIdxMap.put(TT_DISCONNECT.toLowerCase(), TT_IDX_DISCONNECT);
        truncatedTypeIdxMap.put(TT_UNSPECIFIED.toLowerCase(), TT_IDX_UNSPECIFIED);
    }

    /*
     * Warc revisit profile ids used in the WARC-Profile header (See ISO).
     */

    /** Revisit WARC-Profile id for identical payload digest. */
    public static final String PROFILE_IDENTICAL_PAYLOAD_DIGEST =
            "http://netpreserve.org/warc/1.0/revisit/identical-payload-digest";

    /** Revisit WARC-Profile id for server not modified. */
    public static final String PROFILE_SERVER_NOT_MODIFIED =
            "http://netpreserve.org/warc/1.0/revisit/server-not-modified";

    /** WARC profile id to field name mapping table.
     *  Indexed by the {@code PROFILE_IDX_*} ids; index 0
     *  ({@link #PROFILE_IDX_UNKNOWN}) has no name. */
    public static final String[] P_IDX_STRINGS = {
        null,
        PROFILE_IDENTICAL_PAYLOAD_DIGEST,
        PROFILE_SERVER_NOT_MODIFIED
    };

    /*
     * Warc revisit profile ids returned by the warc reader.
     * The raw value is also available in case of unknown profiles.
     */

    /** Warc reader id for unknown profile. */
    public static final int PROFILE_IDX_UNKNOWN = 0;
    /** Warc reader id for identical payload digest profile. */
    public static final int PROFILE_IDX_IDENTICAL_PAYLOAD_DIGEST = 1;
    /** Warc reader id for server not modified profile. */
    public static final int PROFILE_IDX_SERVER_NOT_MODIFIED = 2;

    /**
     * Profile lookup map used to identify WARC-Profile values.
     * Keys are the lower-cased profile URIs, values the
     * {@code PROFILE_IDX_*} ids.
     */
    public static final Map<String, Integer> profileIdxMap = new HashMap<String, Integer>();

    /**
     * Populate the lookup map with known WARC-Profile ids.
     */
    static {
        profileIdxMap.put(PROFILE_IDENTICAL_PAYLOAD_DIGEST.toLowerCase(),
                PROFILE_IDX_IDENTICAL_PAYLOAD_DIGEST);
        profileIdxMap.put(PROFILE_SERVER_NOT_MODIFIED.toLowerCase(),
                PROFILE_IDX_SERVER_NOT_MODIFIED);
    }

    /*
     * The different requirement levels as per RFC 2119.
     * (See http://www.ietf.org/rfc/rfc2119.txt)
     */

    /** Warc header can be ignored. */
    public static final int POLICY_IGNORE = 0;
    /** Warc header is mandatory (equal to shall). */
    public static final int POLICY_MANDATORY = 1;
    /** Warc header must be present. */
    public static final int POLICY_SHALL = 2;
    /** Warc header must not be present. */
    public static final int POLICY_SHALL_NOT = 3;
    /** Warc header can be present. */
    public static final int POLICY_MAY = 4;
    /** Warc header should not be present. */
    public static final int POLICY_MAY_NOT = 5;

    /** A (Warc-Types x Warc-Header-Fields) matrix used for policy validation.
     *  First index is a {@code RT_IDX_*} id, second index a
     *  {@code FN_IDX_*} id, value a {@code POLICY_*} level. Cells that are
     *  never assigned keep the array default 0 ({@link #POLICY_IGNORE}).
     *  (See below) */
    public static final int[][] field_policy;

    /**
     * The following section initializes the policy matrix used to check the
     * usage of each known warc header line against each known warc record
     * type.
     * The ISO standard was used to build the data in the matrix.
     */
    static {
        field_policy = new int[RT_INDEX_OF_LAST][FN_INDEX_OF_LAST];

        // Warc-Record-id
        // Warc-Type
        // Warc-Date
        // Content-Length
        // Also required for unknown warc-types.
        for (int i=0; i<=RT_NUMBER; ++i) {
            field_policy[i][FN_IDX_WARC_RECORD_ID] = POLICY_MANDATORY;
            field_policy[i][FN_IDX_WARC_TYPE] = POLICY_MANDATORY;
            field_policy[i][FN_IDX_WARC_DATE] = POLICY_MANDATORY;
            field_policy[i][FN_IDX_CONTENT_LENGTH] = POLICY_MANDATORY;
        }

        // Content-Type
        field_policy[RT_IDX_CONTINUATION][FN_IDX_CONTENT_TYPE] = POLICY_SHALL_NOT;

        // Warc-Ip-Address
        field_policy[RT_IDX_REQUEST][FN_IDX_WARC_IP_ADDRESS] = POLICY_MAY;
        field_policy[RT_IDX_RESPONSE][FN_IDX_WARC_IP_ADDRESS] = POLICY_MAY;
        field_policy[RT_IDX_RESOURCE][FN_IDX_WARC_IP_ADDRESS] = POLICY_MAY;
        field_policy[RT_IDX_METADATA][FN_IDX_WARC_IP_ADDRESS] = POLICY_MAY;
        field_policy[RT_IDX_REVISIT][FN_IDX_WARC_IP_ADDRESS] = POLICY_MAY;
        field_policy[RT_IDX_WARCINFO][FN_IDX_WARC_IP_ADDRESS] = POLICY_SHALL_NOT;
        field_policy[RT_IDX_CONVERSION][FN_IDX_WARC_IP_ADDRESS] = POLICY_SHALL_NOT;
        field_policy[RT_IDX_CONTINUATION][FN_IDX_WARC_IP_ADDRESS] = POLICY_SHALL_NOT;

        // Warc-Concurrent-To
        field_policy[RT_IDX_REQUEST][FN_IDX_WARC_CONCURRENT_TO] = POLICY_MAY;
        field_policy[RT_IDX_RESPONSE][FN_IDX_WARC_CONCURRENT_TO] = POLICY_MAY;
        field_policy[RT_IDX_RESOURCE][FN_IDX_WARC_CONCURRENT_TO] = POLICY_MAY;
        field_policy[RT_IDX_METADATA][FN_IDX_WARC_CONCURRENT_TO] = POLICY_MAY;
        field_policy[RT_IDX_REVISIT][FN_IDX_WARC_CONCURRENT_TO] = POLICY_MAY;
        field_policy[RT_IDX_WARCINFO][FN_IDX_WARC_CONCURRENT_TO] = POLICY_SHALL_NOT;
        field_policy[RT_IDX_CONVERSION][FN_IDX_WARC_CONCURRENT_TO] = POLICY_SHALL_NOT;
        field_policy[RT_IDX_CONTINUATION][FN_IDX_WARC_CONCURRENT_TO] = POLICY_SHALL_NOT;

        // Warc-Refers-To
        field_policy[RT_IDX_METADATA][FN_IDX_WARC_REFERS_TO] = POLICY_MAY;
        field_policy[RT_IDX_CONVERSION][FN_IDX_WARC_REFERS_TO] = POLICY_MAY;
        field_policy[RT_IDX_REVISIT][FN_IDX_WARC_REFERS_TO] = POLICY_MAY;
        field_policy[RT_IDX_WARCINFO][FN_IDX_WARC_REFERS_TO] = POLICY_SHALL_NOT;
        field_policy[RT_IDX_REQUEST][FN_IDX_WARC_REFERS_TO] = POLICY_SHALL_NOT;
        field_policy[RT_IDX_RESPONSE][FN_IDX_WARC_REFERS_TO] = POLICY_SHALL_NOT;
        field_policy[RT_IDX_RESOURCE][FN_IDX_WARC_REFERS_TO] = POLICY_SHALL_NOT;
        field_policy[RT_IDX_CONTINUATION][FN_IDX_WARC_REFERS_TO] = POLICY_SHALL_NOT;

        // Warc-Target-Uri
        field_policy[RT_IDX_REQUEST][FN_IDX_WARC_TARGET_URI] = POLICY_SHALL;
        field_policy[RT_IDX_RESPONSE][FN_IDX_WARC_TARGET_URI] = POLICY_SHALL;
        field_policy[RT_IDX_RESOURCE][FN_IDX_WARC_TARGET_URI] = POLICY_SHALL;
        field_policy[RT_IDX_CONVERSION][FN_IDX_WARC_TARGET_URI] = POLICY_SHALL;
        field_policy[RT_IDX_CONTINUATION][FN_IDX_WARC_TARGET_URI] = POLICY_SHALL;
        field_policy[RT_IDX_REVISIT][FN_IDX_WARC_TARGET_URI] = POLICY_SHALL;
        field_policy[RT_IDX_METADATA][FN_IDX_WARC_TARGET_URI] = POLICY_MAY;
        field_policy[RT_IDX_WARCINFO][FN_IDX_WARC_TARGET_URI] = POLICY_SHALL_NOT;

        // Warc-Warcinfo-Id
        // Warc-Filename
        // Warc-Profile
        // Warc-Segment-Origin-Id
        // Warc-Segment-Total-Length
        // Defaults for every known record type; the type specific exceptions
        // are applied right after the loop.
        for (int i=1; i<=RT_NUMBER; ++i) {
            field_policy[i][FN_IDX_WARC_WARCINFO_ID] = POLICY_MAY;
            field_policy[i][FN_IDX_WARC_FILENAME] = POLICY_SHALL_NOT;
            field_policy[i][FN_IDX_WARC_PROFILE] = POLICY_IGNORE;
            field_policy[i][FN_IDX_WARC_SEGMENT_ORIGIN_ID] = POLICY_SHALL_NOT;
            field_policy[i][FN_IDX_WARC_SEGMENT_TOTAL_LENGTH] = POLICY_SHALL_NOT;
        }
        field_policy[RT_IDX_WARCINFO][FN_IDX_WARC_WARCINFO_ID] = POLICY_MAY_NOT;
        field_policy[RT_IDX_WARCINFO][FN_IDX_WARC_FILENAME] = POLICY_MAY;
        field_policy[RT_IDX_REVISIT][FN_IDX_WARC_PROFILE] = POLICY_MANDATORY;
        field_policy[RT_IDX_CONTINUATION][FN_IDX_WARC_SEGMENT_ORIGIN_ID] = POLICY_MANDATORY;
        // The total length is carried by the last continuation record only,
        // which the matrix cannot tell apart from the other continuation
        // records, hence "may" rather than "mandatory".
        field_policy[RT_IDX_CONTINUATION][FN_IDX_WARC_SEGMENT_TOTAL_LENGTH] = POLICY_MAY;

        // Warc-Segment-Number
        field_policy[RT_IDX_CONTINUATION][FN_IDX_WARC_SEGMENT_NUMBER] = POLICY_MANDATORY;

        // WARC-Refers-To-Target-URI
        // WARC-Refers-To-Date
        for (int i=1; i<=RT_NUMBER; ++i) {
            field_policy[i][FN_IDX_WARC_REFERS_TO_TARGET_URI] = POLICY_SHALL_NOT;
            field_policy[i][FN_IDX_WARC_REFERS_TO_DATE] = POLICY_SHALL_NOT;
        }
        field_policy[RT_IDX_REVISIT][FN_IDX_WARC_REFERS_TO_TARGET_URI] = POLICY_MAY;
        field_policy[RT_IDX_REVISIT][FN_IDX_WARC_REFERS_TO_DATE] = POLICY_MAY;
    }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy