All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.jwat.warc.WarcHeader Maven / Gradle / Ivy

Go to download

Used for reading, writing and validating WARC files. Implemented to follow the WARC/1.0 ISO specification as closely as possible.

There is a newer version: 1.2.1
Show newest version
/**
 * Java Web Archive Toolkit - Software to read and validate ARC, WARC
 * and GZip files. (http://jwat.org/)
 * Copyright 2011-2012 Netarkivet.dk (http://netarkivet.dk/)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.jwat.warc;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.net.InetAddress;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import org.jwat.common.ByteCountingPushBackInputStream;
import org.jwat.common.ContentType;
import org.jwat.common.Diagnosis;
import org.jwat.common.DiagnosisType;
import org.jwat.common.Diagnostics;
import org.jwat.common.HeaderLine;
import org.jwat.common.HeaderLineReader;
import org.jwat.common.MaxLengthRecordingInputStream;
import org.jwat.common.Uri;
import org.jwat.common.UriProfile;

/**
 * Central class for working with WARC headers. This class includes support for
 * reading and writing WARC headers. Methods are also available to validate
 * individual headers and a WARC header as a whole.
 *
 * @author nicl
 */
public class WarcHeader {

    /** An URI with encapsulating <> characters. */
    public static final boolean URI_LTGT = true;

    /** An URI without encapsulating <> characters. */
    public static final boolean URI_NAKED = false;

    /** Associated WarcReader context.
     *  Must be set prior to calling the various methods. */
    protected WarcReader reader;

    /** Version based WARC validation. */
    protected WarcValidatorBase warcValidator;

    /** Diagnostics used to report diagnoses.
     *  Must be set prior to calling the various methods. */
    protected Diagnostics diagnostics;

    /** WARC-Target-URI profile. */
    protected UriProfile warcTargetUriProfile;

    /** URI profile. */
    protected UriProfile uriProfile;

    /** WARC field parser used.
     *  Must be set prior to calling the various methods. */
    protected WarcFieldParsers fieldParsers;

    /** Max size allowed for a record header. */
    protected int recordHeaderMaxSize = -1;

    /** Line reader used to read version lines. */
    protected HeaderLineReader lineReader;

    /** Header line reader used to read the WARC headers. */
    protected HeaderLineReader headerLineReader;

    /** WARC record starting offset relative to the source WARC file input
     *  stream. The offset is correct for both compressed and uncompressed streams. */
    protected long startOffset = -1;

    /*
     * Version related fields.
     */

    /** Was "WARC/" identified while looking for the version string. */
    public boolean bMagicIdentified;
    /** Did the version string include between 2 and 4 substrings delimited by ".". */
    public boolean bVersionParsed;
    /** Is the version format valid. */
    public boolean bValidVersionFormat;
    /** Is the version recognized. (0.17, 0.18 or 1.0) */
    public boolean bValidVersion;

    /** Raw version string. */
    public String versionStr;
    /** Array based on the version string split by the "." delimiter and converted to integers. */
    public int[] versionArr;

    /** Major version number from WARC header. */
    public int major = -1;
    /** Minor version number from WARC header. */
    public int minor = -1;

    /*
     * WARC header fields.
     */

    /** Array used for duplicate header detection. */
    protected boolean[] seen = new boolean[WarcConstants.FN_INDEX_OF_LAST];

    /** Is the header missing one of the mandatory headers. */
    public boolean bMandatoryMissing;

    /** WARC-Type field string value. */
    public String warcTypeStr;
    /** WARC-Type converted to an integer id, if identified. */
    public Integer warcTypeIdx;

    /** WARC-Filename field string value.
     *  (warcinfo record type only) */
    public String warcFilename;

    /** WARC-Record-Id field string value. */
    public String warcRecordIdStr;
    /** WARC-Record-Id converted to an Uri object, if valid. */
    public Uri warcRecordIdUri;

    /** WARC-Date field string value. */
    public String warcDateStr;
    /** WARC-Date converted to a Date object, if valid. */
    public WarcDate warcDate;

    /** Content-Length field string value. */
    public String contentLengthStr;
    /** Content-Length converted to a Long object, if valid. */
    public Long contentLength;

    /** Content-Type field string value. */
    public String contentTypeStr;
    /** Content-Type converted to a ContentType object, if valid. */
    public ContentType contentType;

    /** WARC-Truncated field string value. */
    public String warcTruncatedStr;
    /** WARC-Truncated converted to an integer id, if valid. */
    public Integer warcTruncatedIdx;

    /** WARC-IP-Address field string value. */
    public String warcIpAddress;
    /** WARC-IP-Address converted to an InetAddress object, if valid. */
    public InetAddress warcInetAddress;

    /** List of WARC-Concurrent-To field string values and converted URI objects,  if valid. */
    public List warcConcurrentToList = new LinkedList();

    /** WARC-Refers-To field string value. */
    public String warcRefersToStr;
    /** WARC-Refers-To converted to an Uri object, if valid. */
    public Uri warcRefersToUri;

    /** WARC_Target-URI field string value. */
    public String warcTargetUriStr;
    /** WARC-TargetURI converted to an Uri object, if valid. */
    public Uri warcTargetUriUri;

    /** WARC-Warcinfo-Id field string value. */
    public String warcWarcinfoIdStr;
    /** WARC-Warcinfo-Id converted to an Uri object, if valid. */
    public Uri warcWarcinfoIdUri;

    /** WARC-Block-Digest field string value. */
    public String warcBlockDigestStr;
    /** WARC-Block-Digest converted to a WarcDigest object, if valid. */
    public WarcDigest warcBlockDigest;

    /** WARC-Payload-Digest field string value. */
    public String warcPayloadDigestStr;
    /** WARC-Payload-Digest converted to a WarcDigest object, if valid. */
    public WarcDigest warcPayloadDigest;

    /** WARC-Identified-Payload-Type field string value. */
    public String warcIdentifiedPayloadTypeStr;
    /** WARC-Identified-Payload-Type converted to a ContentType object, if valid. */
    public ContentType warcIdentifiedPayloadType;

    /** WARC-Profile field string value.
     *  (revisit record only) */
    public String warcProfileStr;
    /** WARC-Profile field converted to an Uri object, if valid.
     *  (revisit record only) */
    public Uri warcProfileUri;
    /** WARC-Profile converted to an integer id, if valid.
     *  (revisit record only) */
    public Integer warcProfileIdx;

    /** WARC-Segment-Number field string value. */
    public String warcSegmentNumberStr;
    /** WARC-Segment-Number converted to an Integer object, if valid. */
    public Integer warcSegmentNumber;

    /** WARC-Segment-Origin-Id field string value.
     *  (continuation record only) */
    public String warcSegmentOriginIdStr;
    /** WARC-Segment-Origin-Id converted to an Uri object, if valid.
     *  (continuation record only) */
    public Uri warcSegmentOriginIdUrl;

    /** WARC-Segment-Total-Length field string value.
     *  (continuation record only) */
    public String warcSegmentTotalLengthStr;
    /** WARC-Segment-Total-Length converted to a Long object, if valid.
     *  (continuation record only) */
    public Long warcSegmentTotalLength;

    // see https://docs.google.com/document/d/1QyQBA7Ykgxie75V8Jziz_O7hbhwf7PF6_u9O6w6zgp0/edit
    /** WARC-Refers-To-Target-URI field string value. */
    public String warcRefersToTargetUriStr;
    /** WARC-Refers-To-Target-URI converted to an Uri object, if valid. */
    public Uri warcRefersToTargetUriUri;
    /** WARC-Refers-To-Date */
    public String warcRefersToDateStr;
    /** WARC-Date converted to a Date object, if valid. */
    public WarcDate warcRefersToDate;

    /*
     * WARC header fields collections.
     */

    /** Raw WARC header output stream. */
    protected ByteArrayOutputStream headerBytesOut = new ByteArrayOutputStream();

    /** Raw WARC header byte array. */
    public byte[] headerBytes;

    /** List of parsed header fields. */
    protected List headerList = new LinkedList();

    /** Map of parsed header fields. */
    protected Map headerMap = new HashMap();

    /**
     * Non public constructor to allow unit testing.
     */
    protected WarcHeader() {
    }

    /**
     * Create a new WarcHeader for recreating a header object.
     * @param fieldParsers parsers used for the individual types
     * @param uriProfile uri profile used to validate urls
     * @param diagnostics diagnostics object used by parser
     * @return a WarcHeader prepared for reanimation
     */
    public static WarcHeader initHeader(int recordHeaderMaxSize, HeaderLineReader lineReader, HeaderLineReader headerLineReader, WarcFieldParsers fieldParsers, UriProfile uriProfile, Diagnostics diagnostics) {
        WarcHeader header = new WarcHeader();
        header.warcTargetUriProfile = uriProfile;
        header.uriProfile = uriProfile;
        header.fieldParsers = fieldParsers;
        header.diagnostics = diagnostics;
        header.recordHeaderMaxSize = recordHeaderMaxSize;
        header.lineReader = lineReader;
        header.headerLineReader = headerLineReader;
        return header;
    }

    /**
     * Create and initialize a new WarcHeader for writing.
     * @param writer writer which shall be used
     * @param diagnostics diagnostics object used by writer
     * @return a WarcHeader prepared for writing
     */
    public static WarcHeader initHeader(WarcWriter writer, Diagnostics diagnostics) {
        WarcHeader header = new WarcHeader();
        // Set default version to "1.0".
        header.major = 1;
        header.minor = 0;
        header.warcTargetUriProfile = writer.warcTargetUriProfile;
        header.uriProfile = writer.uriProfile;
        header.fieldParsers = writer.fieldParsers;
        header.diagnostics = diagnostics;
        return header;
    }

    /**
     * Create and initialize a new WarcHeader for reading.
     * @param reader reader which shall be used
     * @param startOffset start offset of header
     * @param diagnostics diagnostics object used by reader
     * @return a WarcHeader prepared for reading
     */
    public static WarcHeader initHeader(WarcReader reader, long startOffset, Diagnostics diagnostics) {
        WarcHeader header = new WarcHeader();
        header.reader = reader;
        header.warcTargetUriProfile = reader.warcTargetUriProfile;
        header.uriProfile = reader.uriProfile;
        header.fieldParsers = reader.fieldParsers;
        header.diagnostics = diagnostics;
        header.recordHeaderMaxSize = reader.recordHeaderMaxSize;
        header.lineReader = reader.lineReader;
        header.headerLineReader = reader.headerLineReader;
        // This is only relevant for uncompressed sequentially read records
        header.startOffset = startOffset;
        return header;
    }

    /**
     * Add a warning diagnosis of the given type on a specific entity with
     * optional extra information. The information varies according to the
     * diagnosis type.
     * @param type diagnosis type
     * @param entity entity examined
     * @param information optional extra information
     */
    protected void addWarningDiagnosis(DiagnosisType type, String entity, String... information) {
        diagnostics.addWarning(new Diagnosis(type, entity, information));
    }

    /**
     * Returns the starting offset of the record in the containing WARC.
     * @return the starting offset of the record
     */
    public long getStartOffset() {
        return startOffset;
    }

    /**
     * Try to parse a WARC header and return a boolean indicating the success or
     * failure of this.
     * @param in input stream with WARC data
     * @return boolean indicating whether a header was parsed or not
     * @throws IOException if an I/O exception occurs while parsing for a header
     */
    public boolean parseHeader(ByteCountingPushBackInputStream in) throws IOException {
        if (parseVersion(in)) {
            // debug
            //System.out.println(wr.bMagicIdentified);
            //System.out.println(wr.bVersionParsed);
            //System.out.println(wr.major + "." + wr.minor);
            warcValidator = reader.validation.getValidatorFor(this, diagnostics);

            MaxLengthRecordingInputStream mrin = new MaxLengthRecordingInputStream(in, recordHeaderMaxSize);
            ByteCountingPushBackInputStream pbin = new ByteCountingPushBackInputStream(mrin, recordHeaderMaxSize);

            parseHeaders(pbin);
            pbin.close();

            warcValidator.checkFields(this);

            headerBytes = headerBytesOut.toByteArray();
        }
        return bMagicIdentified;
    }

    /**
     * Looks forward in the input stream for a valid WARC version line.
     * @param in data input stream
     * @return true, if magic WARC header found
     * @throws IOException if an error occurs while reading version data
     */
    public boolean parseVersion(ByteCountingPushBackInputStream in) throws IOException {
        bMagicIdentified = false;
        bVersionParsed = false;
        boolean bInvalidDataBeforeVersion = false;
        boolean bEmptyLinesBeforeVersion = false;
        HeaderLine line;
        String tmpStr;
        boolean bSeekMagic = true;
        // Loop until when have found something that looks like a version line.
        while (bSeekMagic) {
            // This is only relevant for uncompressed sequentially read records
            startOffset = in.getConsumed();
            line = lineReader.readLine(in);
            if (!lineReader.bEof) {
                switch (line.type) {
                case HeaderLine.HLT_LINE:
                    tmpStr = line.line;
                    // debug
                    //System.out.println(tmpStr);
                    if (tmpStr.length() > 0) {
                        if (tmpStr.toUpperCase().startsWith(WarcConstants.WARC_MAGIC_HEADER)) {
                            bMagicIdentified = true;
                            versionStr = tmpStr.substring(WarcConstants.WARC_MAGIC_HEADER.length());
                            String[] tmpArr = versionStr.split("\\.", -1);        // Not optimal
                            if (tmpArr.length >= 2 && tmpArr.length <= 4) {
                                bVersionParsed = true;
                                bValidVersionFormat = true;
                                versionArr = new int[tmpArr.length];
                                for (int i=0; i 0) {
                        // debug
                        //System.out.println(headerLine.name);
                        //System.out.println(headerLine.value);
                        addHeader(headerLine);
                    } else {
                        // Empty field name.
                        addWarningDiagnosis(DiagnosisType.EMPTY, "Header line");
                    }
                    break;
                case HeaderLine.HLT_LINE:
                    if (headerLine.line.length() == 0) {
                        // Empty line.
                        bLoop = false;
                    } else {
                        // Unknown header line.
                        addWarningDiagnosis(DiagnosisType.UNKNOWN, "Header line", headerLine.line);
                    }
                    break;
                default:
                    throw new IllegalStateException("Invalid HeaderLine output!");
                }
            } else {
                // EOF.
                bLoop = false;
            }
        }
    }

    /**
     * Identify a (WARC) header name, validate the value and set the header.
     * @param headerLine the headerLine
     */
    public void addHeader(HeaderLine headerLine) {
        String fieldName = headerLine.name;
        String fieldValue = headerLine.value;
        WarcConcurrentTo warcConcurrentTo;
        Integer fn_idx = WarcConstants.fieldNameIdxMap.get(fieldName.toLowerCase());
        if (fn_idx != null) {
            // WARC field name defined in WARC specification.
            if (!seen[fn_idx] || WarcConstants.fieldNamesRepeatableLookup[fn_idx]) {
                seen[fn_idx] = true;
                switch (fn_idx.intValue()) {
                case WarcConstants.FN_IDX_WARC_TYPE:
                    warcTypeStr = fieldParsers.parseString(fieldValue, WarcConstants.FN_WARC_TYPE);
                    if (warcTypeStr != null) {
                        warcTypeIdx = WarcConstants.recordTypeIdxMap.get(warcTypeStr.toLowerCase());
                    }
                    if (warcTypeIdx == null && warcTypeStr != null && warcTypeStr.length() > 0) {
                        warcTypeIdx = WarcConstants.RT_IDX_UNKNOWN;
                    }
                    break;
                case WarcConstants.FN_IDX_WARC_RECORD_ID:
                    warcRecordIdStr = fieldValue;
                    warcRecordIdUri = fieldParsers.parseUri(fieldValue, URI_LTGT, uriProfile, WarcConstants.FN_WARC_RECORD_ID);
                    break;
                case WarcConstants.FN_IDX_WARC_DATE:
                    warcDateStr = fieldValue;
                    //warcDate = fieldParsers.parseDate(fieldValue, WarcConstants.FN_WARC_DATE);
                    warcDate = fieldParsers.parseWarcDate(fieldValue, WarcConstants.FN_WARC_DATE);
                    break;
                case WarcConstants.FN_IDX_CONTENT_LENGTH:
                    contentLengthStr = fieldValue;
                    contentLength = fieldParsers.parseLong(fieldValue, WarcConstants.FN_CONTENT_LENGTH);
                    break;
                case WarcConstants.FN_IDX_CONTENT_TYPE:
                    contentTypeStr = fieldValue;
                    contentType = fieldParsers.parseContentType(fieldValue, WarcConstants.FN_CONTENT_TYPE);
                    break;
                case WarcConstants.FN_IDX_WARC_CONCURRENT_TO:
                    Uri tmpUri = fieldParsers.parseUri(fieldValue, URI_LTGT, uriProfile, WarcConstants.FN_WARC_CONCURRENT_TO);
                    if (fieldValue != null && fieldValue.trim().length() > 0) {
                        warcConcurrentTo = new WarcConcurrentTo();
                        warcConcurrentTo.warcConcurrentToStr = fieldValue;
                        warcConcurrentTo.warcConcurrentToUri = tmpUri;
                        warcConcurrentToList.add(warcConcurrentTo);
                    }
                    break;
                case WarcConstants.FN_IDX_WARC_BLOCK_DIGEST:
                    warcBlockDigestStr = fieldValue;
                    warcBlockDigest = fieldParsers.parseDigest(fieldValue, WarcConstants.FN_WARC_BLOCK_DIGEST);
                    break;
                case WarcConstants.FN_IDX_WARC_PAYLOAD_DIGEST:
                    warcPayloadDigestStr = fieldValue;
                    warcPayloadDigest = fieldParsers.parseDigest(fieldValue, WarcConstants.FN_WARC_PAYLOAD_DIGEST);
                    break;
                case WarcConstants.FN_IDX_WARC_IP_ADDRESS:
                    warcIpAddress = fieldValue;
                    warcInetAddress = fieldParsers.parseIpAddress(fieldValue, WarcConstants.FN_WARC_IP_ADDRESS);
                    break;
                case WarcConstants.FN_IDX_WARC_REFERS_TO:
                    warcRefersToStr = fieldValue;
                    warcRefersToUri = fieldParsers.parseUri(fieldValue, URI_LTGT, uriProfile, WarcConstants.FN_WARC_REFERS_TO);
                    break;
                case WarcConstants.FN_IDX_WARC_TARGET_URI:
                    warcTargetUriStr = fieldValue;
                    warcTargetUriUri = fieldParsers.parseUri(fieldValue, URI_NAKED, warcTargetUriProfile, WarcConstants.FN_WARC_TARGET_URI);
                    break;
                case WarcConstants.FN_IDX_WARC_TRUNCATED:
                    warcTruncatedStr = fieldParsers.parseString(fieldValue, WarcConstants.FN_WARC_TRUNCATED);
                    if (warcTruncatedStr != null) {
                        warcTruncatedIdx = WarcConstants.truncatedTypeIdxMap.get(warcTruncatedStr.toLowerCase());
                    }
                    if (warcTruncatedIdx == null && warcTruncatedStr != null && warcTruncatedStr.length() > 0) {
                        warcTruncatedIdx = WarcConstants.TT_IDX_FUTURE_REASON;
                    }
                    break;
                case WarcConstants.FN_IDX_WARC_WARCINFO_ID:
                    warcWarcinfoIdStr = fieldValue;
                    warcWarcinfoIdUri = fieldParsers.parseUri(fieldValue, URI_LTGT, uriProfile, WarcConstants.FN_WARC_WARCINFO_ID);
                    break;
                case WarcConstants.FN_IDX_WARC_FILENAME:
                    warcFilename = fieldParsers.parseString(fieldValue, WarcConstants.FN_WARC_FILENAME);
                    break;
                case WarcConstants.FN_IDX_WARC_PROFILE:
                    warcProfileStr = fieldValue;
                    warcProfileUri = fieldParsers.parseUri(fieldValue, URI_NAKED, uriProfile, WarcConstants.FN_WARC_PROFILE);
                    if (warcProfileStr != null) {
                        warcProfileIdx = WarcConstants.profileIdxMap.get(warcProfileStr.toLowerCase());
                    }
                    if (warcProfileIdx == null && warcProfileStr != null && warcProfileStr.length() > 0) {
                        warcProfileIdx = WarcConstants.WARC_PROFILE_IDX_UNKNOWN;
                    }
                    break;
                case WarcConstants.FN_IDX_WARC_IDENTIFIED_PAYLOAD_TYPE:
                    warcIdentifiedPayloadTypeStr = fieldValue;
                    warcIdentifiedPayloadType = fieldParsers.parseContentType(fieldValue, WarcConstants.FN_WARC_IDENTIFIED_PAYLOAD_TYPE);
                    break;
                case WarcConstants.FN_IDX_WARC_SEGMENT_ORIGIN_ID:
                    warcSegmentOriginIdStr = fieldValue;
                    warcSegmentOriginIdUrl = fieldParsers.parseUri(fieldValue, URI_LTGT, uriProfile, WarcConstants.FN_WARC_SEGMENT_ORIGIN_ID);
                    break;
                case WarcConstants.FN_IDX_WARC_SEGMENT_NUMBER:
                    warcSegmentNumberStr = fieldValue;
                    warcSegmentNumber = fieldParsers.parseInteger(fieldValue, WarcConstants.FN_WARC_SEGMENT_NUMBER);
                    break;
                case WarcConstants.FN_IDX_WARC_SEGMENT_TOTAL_LENGTH:
                    warcSegmentTotalLengthStr = fieldValue;
                    warcSegmentTotalLength = fieldParsers.parseLong(fieldValue, WarcConstants.FN_WARC_SEGMENT_TOTAL_LENGTH);
                    break;
                case WarcConstants.FN_IDX_WARC_REFERS_TO_TARGET_URI:
                    warcRefersToTargetUriStr = fieldValue;
                    warcRefersToTargetUriUri = fieldParsers.parseUri(fieldValue, URI_NAKED, warcTargetUriProfile, WarcConstants.FN_WARC_REFERS_TO_TARGET_URI);
                    break;
                case WarcConstants.FN_IDX_WARC_REFERS_TO_DATE:
                    warcRefersToDateStr = fieldValue;
                    //warcRefersToDate = fieldParsers.parseDate(fieldValue, WarcConstants.FN_WARC_REFERS_TO_DATE);
                    warcRefersToDate = fieldParsers.parseWarcDate(fieldValue, WarcConstants.FN_WARC_REFERS_TO_DATE);
                    break;
                }
            } else {
                // Duplicate field.
                diagnostics.addError(DiagnosisType.DUPLICATE, "'" + fieldName + "' header", fieldValue);
            }
        }
        HeaderLine tmpLine = headerMap.get(fieldName.toLowerCase());
        if (tmpLine == null) {
            headerMap.put(fieldName.toLowerCase(), headerLine);
        } else {
            tmpLine.lines.add(headerLine);
        }
        headerList.add(headerLine);
    }

    /**
     * Get a List of all the headers found during parsing.
     * @return List of HeaderLine
     */
    public List getHeaderList() {
        return Collections.unmodifiableList(headerList);
    }

    /**
     * Get a header line structure or null, if no header line structure is
     * stored with the given header name.
     * @param field header name
     * @return HeaderLine structure or null
     */
    public HeaderLine getHeader(String field) {
        if (field != null && field.length() > 0) {
            return headerMap.get(field.toLowerCase());
        } else {
            return null;
        }
    }

    /**
     * Add a String header using the supplied string and return a
     * HeaderLine object corresponding to how the header would be
     * read.
     * @param fieldName name of field to add
     * @param fieldValue field value string
     * @return HeaderLine object corresponding to what would have been read
     */
    public HeaderLine addHeader(String fieldName, String fieldValue) {
        HeaderLine headerLine = new HeaderLine();
        headerLine.name = fieldName;
        headerLine.value = fieldValue;
        addHeader(headerLine);
        return headerLine;
    }

    /**
     * Add an Integer header using the supplied string and object values and return
     * a HeaderLine object corresponding to how the header would be read.
     * If both string and object values are not null they are used as is.
     * If the string value is null and the object is not null,
     * the object's toString method is called.
     * If the object is null and the string is not null, the string is parsed
     * and validated resulting in an object, if valid.
     * @param fieldName name of field to add
     * @param integerFieldValue Integer field value object
     * @param fieldValueStr Integer field value string
     * @return HeaderLine object corresponding to what would have been read
     */
    public HeaderLine addHeader(String fieldName, Integer integerFieldValue, String fieldValueStr) {
        if (integerFieldValue == null && fieldValueStr != null) {
            integerFieldValue = fieldParsers.parseInteger(fieldValueStr, fieldName);
        } else if (fieldValueStr == null && integerFieldValue != null) {
            fieldValueStr = integerFieldValue.toString();
        }
        return addHeader(fieldName, fieldValueStr, WarcConstants.FDT_INTEGER,
                integerFieldValue, null, null, null, null, null, null);
    }

    /**
     * Add a Long header using the supplied string and object values and return
     * a HeaderLine object corresponding to how the header would be read.
     * If both string and object values are not null they are used as is.
     * If the string value is null and the object is not null,
     * the object's toString method is called.
     * If the object is null and the string is not null, the string is parsed
     * and validated resulting in an object, if valid.
     * @param fieldName name of field to add
     * @param longFieldValue Long field value object
     * @param fieldValueStr Long field value string
     * @return HeaderLine object corresponding to what would have been read
     */
    public HeaderLine addHeader(String fieldName, Long longFieldValue, String fieldValueStr) {
        if (longFieldValue == null && fieldValueStr != null) {
            longFieldValue = fieldParsers.parseLong(fieldValueStr, fieldName);
        } else if (fieldValueStr == null && longFieldValue != null) {
            fieldValueStr = longFieldValue.toString();
        }
        return addHeader(fieldName, fieldValueStr, WarcConstants.FDT_LONG,
                null, longFieldValue, null, null, null, null, null);
    }

    /**
     * Add an Digest header using the supplied string and object values and return
     * a HeaderLine object corresponding to how the header would be read.
     * If both string and object values are not null they are used as is.
     * If the string value is null and the object is not null,
     * the object's toString method is called.
     * If the object is null and the string is not null, the string is parsed
     * and validated resulting in an object, if valid.
     * @param fieldName name of field to add
     * @param digestFieldValue Digest field value object
     * @param fieldValueStr Digest field value string
     * @return HeaderLine object corresponding to what would have been read
     */
    public HeaderLine addHeader(String fieldName, WarcDigest digestFieldValue, String fieldValueStr) {
        if (digestFieldValue == null && fieldValueStr != null) {
            digestFieldValue = fieldParsers.parseDigest(fieldValueStr, fieldName);
        } else if (fieldValueStr == null && digestFieldValue != null) {
            fieldValueStr = digestFieldValue.toString();
        }
        return addHeader(fieldName, fieldValueStr, WarcConstants.FDT_DIGEST,
                null, null, digestFieldValue, null, null, null, null);
    }

    /**
     * Add an Content-Type header using the supplied string and object values and return
     * a HeaderLine object corresponding to how the header would be read.
     * If both string and object values are not null they are used as is.
     * If the string value is null and the object is not null,
     * the object's toString method is called.
     * If the object is null and the string is not null, the string is parsed
     * and validated resulting in an object, if valid.
     * @param fieldName name of field to add
     * @param contentTypeFieldValue ContentType field value object
     * @param fieldValueStr Content-Type field value string
     * @return HeaderLine object corresponding to what would have been read
     */
    public HeaderLine addHeader(String fieldName, ContentType contentTypeFieldValue, String fieldValueStr) {
        if (contentTypeFieldValue == null && fieldValueStr != null) {
            contentTypeFieldValue = fieldParsers.parseContentType(fieldValueStr, fieldName);
        } else if (fieldValueStr == null && contentTypeFieldValue != null) {
            fieldValueStr = contentTypeFieldValue.toString();
        }
        return addHeader(fieldName, fieldValueStr, WarcConstants.FDT_CONTENTTYPE,
                null, null, null, contentTypeFieldValue, null, null, null);
    }

    /**
     * Add an Date header using the supplied string and object values and return
     * a HeaderLine object corresponding to how the header would be read.
     * If both string and object values are not null they are used as is.
     * If the string value is null and the object is not null,
     * the object's toString method is called.
     * If the object is null and the string is not null, the string is parsed
     * and validated resulting in an object, if valid.
     * @param fieldName name of field to add
     * @param dateFieldValue Date field value object
     * @param fieldValueStr Date field value string
     * @return HeaderLine object corresponding to what would have been read
     */
    public HeaderLine addHeader(String fieldName, WarcDate dateFieldValue, String fieldValueStr) {
        if (dateFieldValue == null && fieldValueStr != null) {
            // FIXME Check check check.
            //dateFieldValue = fieldParsers.parseDate(fieldValueStr, fieldName);
            dateFieldValue = fieldParsers.parseWarcDate(fieldValueStr, fieldName);
        } else if (fieldValueStr == null && dateFieldValue != null) {
            // FIXME Check check check.
            //fieldValueStr = WarcDateParser.getDateFormat().format(dateFieldValue);
            // FIXME Tricky since precision is WARC version dependent!
            fieldValueStr = dateFieldValue.toString();
        }
        return addHeader(fieldName, fieldValueStr, WarcConstants.FDT_DATE,
                null, null, null, null, dateFieldValue, null, null);
    }

    /**
     * Add an InetAddress header using the supplied string and object values and return
     * a HeaderLine object corresponding to how the header would be read.
     * If both string and object values are not null they are used as is.
     * If the string value is null and the object is not null,
     * the object's toString method is called.
     * If the object is null and the string is not null, the string is parsed
     * and validated resulting in an object, if valid.
     * @param fieldName name of field to add
     * @param inetAddrFieldValue InetAddress field value object
     * @param fieldValueStr IP-Address field value string
     * @return HeaderLine object corresponding to what would have been read
     */
    public HeaderLine addHeader(String fieldName, InetAddress inetAddrFieldValue, String fieldValueStr) {
        if (inetAddrFieldValue == null && fieldValueStr != null) {
            inetAddrFieldValue = fieldParsers.parseIpAddress(fieldValueStr, fieldName);
        } else if (fieldValueStr == null && inetAddrFieldValue != null) {
            fieldValueStr = inetAddrFieldValue.getHostAddress();
        }
        return addHeader(fieldName, fieldValueStr, WarcConstants.FDT_INETADDRESS,
                null, null, null, null, null, inetAddrFieldValue, null);
    }

    /**
     * Add an URI header using the supplied string and object values and return
     * a HeaderLine object corresponding to how the header would be read.
     * If both string and object values are not null they are used as is.
     * If the string value is null and the object is not null,
     * the object's toString method is called.
     * If the object is null and the string is not null, the string is parsed
     * and validated resulting in an object, if valid.
     * @param fieldName name of field to add
     * @param uriFieldValue URI field value object
     * @param fieldValueStr URI field value string
     * @return HeaderLine object corresponding to what would have been read
     */
    public HeaderLine addHeader(String fieldName, Uri uriFieldValue, String fieldValueStr) {
        if (uriFieldValue == null && fieldValueStr != null) {
            if (WarcConstants.FN_WARC_TARGET_URI.equalsIgnoreCase(fieldName)) {
                uriFieldValue = fieldParsers.parseUri(fieldValueStr, URI_NAKED, warcTargetUriProfile, fieldName);
            } else if (WarcConstants.FN_WARC_PROFILE.equalsIgnoreCase(fieldName)) {
                uriFieldValue = fieldParsers.parseUri(fieldValueStr, URI_NAKED, uriProfile, fieldName);
            } else if (WarcConstants.FN_WARC_REFERS_TO_TARGET_URI.equalsIgnoreCase(fieldName)) {
                uriFieldValue = fieldParsers.parseUri(fieldValueStr, URI_NAKED, warcTargetUriProfile, fieldName);
            } else {
                uriFieldValue = fieldParsers.parseUri(fieldValueStr, URI_LTGT, uriProfile, fieldName);
            }
        } else if (fieldValueStr == null && uriFieldValue != null) {
            if (WarcConstants.FN_WARC_TARGET_URI.equalsIgnoreCase(fieldName)
                    || WarcConstants.FN_WARC_PROFILE.equalsIgnoreCase(fieldName)
                    || WarcConstants.FN_WARC_REFERS_TO_TARGET_URI.equalsIgnoreCase(fieldName)) {
                fieldValueStr = uriFieldValue.toString();
            } else {
                fieldValueStr = "<" + uriFieldValue.toString() + ">";
            }
        }
        return addHeader(fieldName, fieldValueStr, WarcConstants.FDT_URI,
                null, null, null, null, null, null, uriFieldValue);
    }

    /**
     * Add a header with the supplied field name, data type and value and
     * return a HeaderLine corresponding to how the header will
     * be read. The data type is validated against the field data type.
     * The values used are the field value string and the parameter
     * corresponding to the data type.
     * @param fieldName header field name
     * @param fieldValueStr field value in string form
     * @param dt data type of the field value string when converted to an object
     * @param integerFieldValue Integer object field value
     * @param longFieldValue Long object field value
     * @param digestFieldValue Digest object field value
     * @param contentTypeFieldValue ContentType object field value
     * @param dateFieldValue Date object field value
     * @param inetAddrFieldValue InetAddress object field value
     * @param uriFieldValue URI object field value
     * @return HeaderLine object corresponding to what would have been read
     */
    public HeaderLine addHeader(String fieldName, String fieldValueStr, int dt,
            Integer integerFieldValue, Long longFieldValue,
            WarcDigest digestFieldValue, ContentType contentTypeFieldValue,
            WarcDate dateFieldValue, InetAddress inetAddrFieldValue,
            Uri uriFieldValue) {
        Integer fn_idx = WarcConstants.fieldNameIdxMap.get(fieldName.toLowerCase());
        if (fn_idx != null) {
            // Implicit cast from integer to long, if needed.
            if (WarcConstants.FN_IDX_DT[fn_idx] == WarcConstants.FDT_LONG && dt == WarcConstants.FDT_INTEGER) {
                longFieldValue = (long)integerFieldValue;
                dt = WarcConstants.FDT_LONG;
            }
            if (dt == WarcConstants.FN_IDX_DT[fn_idx]) {
                // WARC field name defined in WARC specification.
                if (seen[fn_idx] && !WarcConstants.fieldNamesRepeatableLookup[fn_idx]) {
                    // Duplicate field.
                    diagnostics.addError(DiagnosisType.DUPLICATE, "'" + fieldName + "' header", fieldValueStr);
                }
                seen[fn_idx] = true;
                switch (fn_idx.intValue()) {
                /*
                 * Integer.
                 */
                case WarcConstants.FN_IDX_WARC_SEGMENT_NUMBER:
                    warcSegmentNumberStr = fieldValueStr;
                    warcSegmentNumber = integerFieldValue;
                    break;
                /*
                 * Long.
                 */
                case WarcConstants.FN_IDX_CONTENT_LENGTH:
                    contentLengthStr = fieldValueStr;
                    contentLength = longFieldValue;
                    break;
                case WarcConstants.FN_IDX_WARC_SEGMENT_TOTAL_LENGTH:
                    warcSegmentTotalLengthStr = fieldValueStr;
                    warcSegmentTotalLength = longFieldValue;
                    break;
                /*
                 * Digest.
                 */
                case WarcConstants.FN_IDX_WARC_BLOCK_DIGEST:
                    warcBlockDigestStr = fieldValueStr;
                    warcBlockDigest = digestFieldValue;
                    break;
                case WarcConstants.FN_IDX_WARC_PAYLOAD_DIGEST:
                    warcPayloadDigestStr = fieldValueStr;
                    warcPayloadDigest = digestFieldValue;
                    break;
                /*
                 * ContentType.
                 */
                case WarcConstants.FN_IDX_CONTENT_TYPE:
                    contentTypeStr = fieldValueStr;
                    contentType = contentTypeFieldValue;
                    break;
                case WarcConstants.FN_IDX_WARC_IDENTIFIED_PAYLOAD_TYPE:
                    warcIdentifiedPayloadTypeStr = fieldValueStr;
                    warcIdentifiedPayloadType = contentTypeFieldValue;
                    break;
                /*
                 * Date.
                 */
                case WarcConstants.FN_IDX_WARC_DATE:
                    warcDateStr = fieldValueStr;
                    // FIXME Maybe change parameter type if it does not break too many things.
                    //warcDate = WarcDate.getWarcDate(dateFieldValue);
                    warcDate = dateFieldValue;
                    break;
                case WarcConstants.FN_IDX_WARC_REFERS_TO_DATE:
                    warcRefersToDateStr = fieldValueStr;
                    // FIXME Maybe change parameter type if it does not break too many things.
                    //warcRefersToDate = WarcDate.getWarcDate(dateFieldValue);
                    warcRefersToDate = dateFieldValue;
                    break;
                /*
                 * InetAddress.
                 */
                case WarcConstants.FN_IDX_WARC_IP_ADDRESS:
                    warcIpAddress = fieldValueStr;
                    warcInetAddress = inetAddrFieldValue;
                    break;
                /*
                 * URI.
                 */
                case WarcConstants.FN_IDX_WARC_RECORD_ID:
                    warcRecordIdStr = fieldValueStr;
                    warcRecordIdUri = uriFieldValue;
                    break;
                case WarcConstants.FN_IDX_WARC_CONCURRENT_TO:
                    if (fieldValueStr != null || uriFieldValue != null) {
                        WarcConcurrentTo warcConcurrentTo = new WarcConcurrentTo();
                        warcConcurrentTo.warcConcurrentToStr = fieldValueStr;
                        warcConcurrentTo.warcConcurrentToUri = uriFieldValue;
                        warcConcurrentToList.add(warcConcurrentTo);
                    }
                    break;
                case WarcConstants.FN_IDX_WARC_REFERS_TO:
                    warcRefersToStr = fieldValueStr;
                    warcRefersToUri = uriFieldValue;
                    break;
                case WarcConstants.FN_IDX_WARC_TARGET_URI:
                    warcTargetUriStr = fieldValueStr;
                    warcTargetUriUri = uriFieldValue;
                    break;
                case WarcConstants.FN_IDX_WARC_WARCINFO_ID:
                    warcWarcinfoIdStr = fieldValueStr;
                    warcWarcinfoIdUri = uriFieldValue;
                    break;
                case WarcConstants.FN_IDX_WARC_PROFILE:
                    warcProfileStr = fieldValueStr;
                    warcProfileUri = uriFieldValue;
                    if (warcProfileStr != null) {
                        warcProfileIdx = WarcConstants.profileIdxMap.get(warcProfileStr.toLowerCase());
                    }
                    if (warcProfileIdx == null && warcProfileStr != null && warcProfileStr.length() > 0) {
                        warcProfileIdx = WarcConstants.WARC_PROFILE_IDX_UNKNOWN;
                    }
                    break;
                case WarcConstants.FN_IDX_WARC_SEGMENT_ORIGIN_ID:
                    warcSegmentOriginIdStr = fieldValueStr;
                    warcSegmentOriginIdUrl = uriFieldValue;
                    break;
                case WarcConstants.FN_IDX_WARC_REFERS_TO_TARGET_URI:
                    warcRefersToTargetUriStr = fieldValueStr;
                    warcRefersToTargetUriUri = uriFieldValue;
                    break;
                default:
                    break;
                }
            } else {
                // Invalid datatype for field.
                diagnostics.addError(DiagnosisType.INVALID_EXPECTED,
                        "Invalid datatype for '" + fieldName + "' header",
                        WarcConstants.FDT_IDX_STRINGS[WarcConstants.FN_IDX_DT[fn_idx]],
                        WarcConstants.FDT_IDX_STRINGS[dt]);
                // Consider throwing exception at some point.
            }
        }
        HeaderLine headerLine = new HeaderLine();
        headerLine.name = fieldName;
        headerLine.value = fieldValueStr;
        HeaderLine tmpLine = headerMap.get(fieldName.toLowerCase());
        if (tmpLine == null) {
            headerMap.put(fieldName.toLowerCase(), headerLine);
        } else {
            tmpLine.lines.add(headerLine);
        }
        return headerLine;
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy