All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.martinkl.warc.WARCRecord Maven / Gradle / Ivy

The newest version!
package com.martinkl.warc;

import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.regex.Pattern;

/**
 * Implementation of a record in a WARC file. You create a {@link WARCRecord}
 * by parsing it out of a {@link DataInput} stream. The record exposes no mutators,
 * but note that {@link #getContent()} hands out the internal byte array without
 * copying it.
 *
 * The file format is documented in the
 * [ISO Standard](http://bibnum.bnf.fr/warc/WARC_ISO_28500_version1_latestdraft.pdf).
 * In a nutshell, it's a textual format consisting of lines delimited by `\r\n`.
 * Each record has the following structure:
 *
 * 1. A line indicating the WARC version number, such as `WARC/1.0`.
 * 2. Several header lines (in key-value format, similar to HTTP or email headers),
 *    giving information about the record. The header is terminated by an empty line.
 * 3. A body consisting of raw bytes (the number of bytes is indicated in one of the headers).
 * 4. A final separator of `\r\n\r\n` before the next record starts.
 *
 * There are various different types of records, as documented on
 * {@link Header#getRecordType()}.
 */
public class WARCRecord {

    public static final String WARC_VERSION = "WARC/1.0";
    private static final int MAX_LINE_LENGTH = 10000;
    private static final Pattern VERSION_PATTERN = Pattern.compile("WARC/[0-9\\.]+");
    private static final Pattern CONTINUATION_PATTERN = Pattern.compile("^[\\t ]+.*");
    private static final byte CR = 13;
    private static final byte LF = 10;
    private static final String CRLF = "\r\n";
    private static final byte[] CRLF_BYTES = { CR, LF };

    private final Header header;
    private final byte[] content;

    /**
     * Creates a new WARCRecord by parsing it out of a {@link DataInput} stream.
     * Reads the version line, the headers, exactly `Content-Length` bytes of body
     * and the trailing `\r\n\r\n` separator.
     *
     * @param in The input source from which one record will be read.
     * @throws IOException if the underlying stream fails or ends prematurely.
     * @throws IllegalStateException if the input is not a well-formed WARC record.
     */
    public WARCRecord(DataInput in) throws IOException {
        header = readHeader(in);
        content = new byte[header.getContentLength()];
        in.readFully(content);
        readSeparator(in);
    }

    /**
     * Parses the version line and all header lines up to (and including) the empty
     * line that terminates the header section.
     */
    private static Header readHeader(DataInput in) throws IOException {
        String versionLine = readLine(in);
        if (!VERSION_PATTERN.matcher(versionLine).matches()) {
            throw new IllegalStateException("Expected WARC version, but got: " + versionLine);
        }

        // LinkedHashMap keeps the headers in the order in which they were read.
        Map<String, String> headers = new LinkedHashMap<String, String>();
        String fieldName = null;

        while (true) {
            String line = readLine(in);
            if (fieldName != null && CONTINUATION_PATTERN.matcher(line).matches()) {
                // A line starting with whitespace continues the previous header's value.
                headers.put(fieldName, headers.get(fieldName) + line);
            } else if (line.isEmpty()) {
                break; // the empty line terminates the header section
            } else {
                String[] field = line.split(":", 2);
                if (field.length < 2) {
                    throw new IllegalStateException("Malformed header line: " + line);
                }
                fieldName = field[0].trim();
                headers.put(fieldName, field[1].trim());
            }
        }

        return new Header(headers);
    }

    /**
     * Reads bytes up to the next CR LF pair and returns them (without the CR LF),
     * decoded as UTF-8. A CR that is not immediately followed by LF is treated as
     * ordinary data and kept in the returned line.
     *
     * @throws IllegalStateException if the line grows beyond the maximum line length.
     */
    private static String readLine(DataInput in) throws IOException {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        boolean pendingCR = false;
        while (true) {
            if (out.size() > MAX_LINE_LENGTH) {
                throw new IllegalStateException("Exceeded maximum line length");
            }
            byte b = in.readByte();
            if (pendingCR) {
                if (b == LF) {
                    break;
                }
                // The previous CR was not part of a line terminator, so it is data.
                out.write(CR);
                pendingCR = false;
            }
            if (b == CR) {
                // Hold the CR back until we know whether an LF follows.
                pendingCR = true;
            } else {
                out.write(b);
            }
        }
        return out.toString("UTF-8");
    }

    /**
     * Consumes the four-byte CR LF CR LF separator that follows the record body.
     *
     * @throws IllegalStateException if the next four bytes are anything else.
     */
    private static void readSeparator(DataInput in) throws IOException {
        byte[] sep = new byte[4];
        in.readFully(sep);
        if (sep[0] != CR || sep[1] != LF || sep[2] != CR || sep[3] != LF) {
            throw new IllegalStateException(String.format(
                "Expected final separator CR LF CR LF, but got: %d %d %d %d",
                sep[0], sep[1], sep[2], sep[3]));
        }
    }

    /**
     * Returns the parsed header structure of the WARC record.
     */
    public Header getHeader() {
        return header;
    }

    /**
     * Returns the body of the record, as an unparsed raw array of bytes. The content
     * of the body depends on the type of record (see {@link Header#getRecordType()}).
     * For example, in the case of a `response` type header, the body consists of the
     * full HTTP response returned by the server (HTTP headers followed by the body).
     *
     * The returned array is the record's internal buffer, not a copy, so callers
     * should not modify it.
     */
    public byte[] getContent() {
        return content;
    }

    /**
     * Writes this record to a {@link DataOutput} stream. The output may, in some edge
     * cases, be not byte-for-byte identical to what was parsed from a {@link DataInput}.
     * However it has the same meaning and should not lose any information.
     * @param out The output stream to which this record should be appended.
     * @throws IOException
     */
    public void write(DataOutput out) throws IOException {
        header.write(out);
        out.write(CRLF_BYTES); // empty line terminating the header section
        out.write(content);
        out.write(CRLF_BYTES); // record separator is CR LF CR LF
        out.write(CRLF_BYTES);
    }

    /**
     * Returns a human-readable string representation of the record
     * (the formatted header; the body is not included).
     */
    @Override
    public String toString() {
        return header.toString();
    }


    /**
     * Contains the parsed headers of a {@link WARCRecord}. Each record contains a number
     * of headers in key-value format, where some header keys are standardised, but
     * nonstandard ones can be added.
     *
     * The documentation of the methods in this class is excerpted from the
     * [WARC 1.0 specification](http://bibnum.bnf.fr/warc/WARC_ISO_28500_version1_latestdraft.pdf).
     * Please see the specification for more detail.
     */
    public static class Header {
        private final Map<String, String> fields;

        private Header(Map<String, String> fields) {
            this.fields = fields;
        }

        /**
         * Returns the type of WARC record (the value of the `WARC-Type` header field).
         * WARC 1.0 defines the following record types: (for full definitions, see the
         * [spec](http://bibnum.bnf.fr/warc/WARC_ISO_28500_version1_latestdraft.pdf))
         *
         *  *  `warcinfo`: Describes the records that follow it, up through end of file,
         *     end of input, or until next `warcinfo` record. Typically, this appears once and
         *     at the beginning of a WARC file. For a web archive, it often contains information
         *     about the web crawl which generated the following records.
         *
         *     The format of this descriptive record block may vary, though the use of the
         *     `"application/warc-fields"` content-type is recommended. (...)
         *
         *  *  `response`: The record should contain a complete scheme-specific response, including
         *     network protocol information where possible. For a target-URI of the `http` or
         *     `https` schemes, a `response` record block should contain the full HTTP
         *     response received over the network, including headers. That is, it contains the
         *     'Response' message defined by section 6 of HTTP/1.1 (RFC2616).
         *
         *     The WARC record's Content-Type field should contain the value defined by HTTP/1.1,
         *     `"application/http;msgtype=response"`. The payload of the record is defined as its
         *     'entity-body' (per RFC2616), with any transfer-encoding removed.
         *
         *  *  `resource`: The record contains a resource, without full protocol response
         *     information. For example: a file directly retrieved from a locally accessible
         *     repository or the result of a networked retrieval where the protocol information
         *     has been discarded. For a target-URI of the `http` or `https` schemes, a `resource`
         *     record block shall contain the returned 'entity-body' (per RFC2616, with any
         *     transfer-encodings removed), possibly truncated.
         *
         *  *  `request`: The record holds the details of a complete scheme-specific request,
         *     including network protocol information where possible. For a target-URI of the
         *     `http` or `https` schemes, a `request` record block should contain the full HTTP
         *     request sent over the network, including headers. That is, it contains the
         *     'Request' message defined by section 5 of HTTP/1.1 (RFC2616).
         *
         *     The WARC record's Content-Type field should contain the value defined by HTTP/1.1,
         *     `"application/http;msgtype=request"`. The payload of a `request` record with a
         *     target-URI of scheme `http` or `https` is defined as its 'entity-body' (per
         *     RFC2616), with any transfer-encoding removed.
         *
         *  *  `metadata`: The record contains content created in order to further describe,
         *     explain, or accompany a harvested resource, in ways not covered by other record
         *     types. A `metadata` record will almost always refer to another record of another
         *     type, with that other record holding original harvested or transformed content.
         *
         *     The format of the metadata record block may vary. The `"application/warc-fields"`
         *     format may be used.
         *
         *  *  `revisit`: The record describes the revisitation of content already archived,
         *     and might include only an abbreviated content body which has to be interpreted
         *     relative to a previous record. Most typically, a `revisit` record is used
         *     instead of a `response` or `resource` record to indicate that the content
         *     visited was either a complete or substantial duplicate of material previously
         *     archived.
         *
         *     A `revisit` record shall contain a WARC-Profile field which determines the
         *     interpretation of the record's fields and record block. Please see the
         *     specification for details.
         *
         *  *  `conversion`: The record shall contain an alternative version of another
         *     record's content that was created as the result of an archival process.
         *     Typically, this is used to hold content transformations that maintain viability
         *     of content after widely available rendering tools for the originally stored
         *     format disappear. As needed, the original content may be migrated (transformed)
         *     to a more viable format in order to keep the information usable with current
         *     tools while minimizing loss of information.
         *
         *  *  `continuation`: Record blocks from `continuation` records must be appended to
         *     corresponding prior record blocks (eg. from other WARC files) to create the
         *     logically complete full-sized original record. That is, `continuation`
         *     records are used when a record that would otherwise cause a WARC file size to
         *     exceed a desired limit is broken into segments. A continuation record shall
         *     contain the named fields `WARC-Segment-Origin-ID` and `WARC-Segment-Number`,
         *     and the last `continuation` record of a series shall contain a
         *     `WARC-Segment-Total-Length` field. Please see the specification for details.
         *
         *  *  Other record types may be added in future, so this list is not exclusive.
         *
         * @return The record's `WARC-Type` header field, as a string, or null if absent.
         */
        public String getRecordType() {
            return fields.get("WARC-Type");
        }

        /**
         * A 14-digit UTC timestamp formatted according to YYYY-MM-DDThh:mm:ssZ, described
         * in the W3C profile of ISO8601. The timestamp shall represent the instant that
         * data capture for record creation began. Multiple records written as part of a
         * single capture event shall use the same WARC-Date, even though the times of
         * their writing will not be exactly synchronized.
         *
         * @return The record's `WARC-Date` header field, as a string, or null if absent.
         */
        public String getDateString() {
            return fields.get("WARC-Date");
        }

        /**
         * An identifier assigned to the current record that is globally unique for its
         * period of intended use. No identifier scheme is mandated by this specification,
         * but each record-id shall be a legal URI and clearly indicate a documented and
         * registered scheme to which it conforms (e.g., via a URI scheme prefix such as
         * `http:` or `urn:`).
         *
         * @return The record's `WARC-Record-ID` header field, as a string, or null if absent.
         */
        public String getRecordID() {
            return fields.get("WARC-Record-ID");
        }

        /**
         * The MIME type (RFC2045) of the information contained in the record's block. For
         * example, in HTTP request and response records, this would be `application/http`
         * as per section 19.1 of RFC2616 (or `application/http; msgtype=request` and
         * `application/http; msgtype=response` respectively).
         *
         * In particular, the content-type is *not* the value of the HTTP Content-Type
         * header in an HTTP response, but a MIME type to describe the full archived HTTP
         * message (hence `application/http` if the block contains request or response
         * headers).
         *
         * @return The record's `Content-Type` header field, as a string, or null if absent.
         */
        public String getContentType() {
            return fields.get("Content-Type");
        }

        /**
         * The original URI whose capture gave rise to the information content in this record.
         * In the context of web harvesting, this is the URI that was the target of a
         * crawler's retrieval request. For a `revisit` record, it is the URI that was the
         * target of a retrieval request. Indirectly, such as for a `metadata`, or `conversion`
         * record, it is a copy of the `WARC-Target-URI` appearing in the original record to
         * which the newer record pertains. The URI in this value shall be properly escaped
         * according to RFC3986, and written with no internal whitespace.
         *
         * @return The record's `WARC-Target-URI` header field, as a string, or null if absent.
         */
        public String getTargetURI() {
            return fields.get("WARC-Target-URI");
        }

        /**
         * The number of bytes in the body of the record, similar to RFC2616.
         *
         * @return The record's `Content-Length` header field, parsed into an int.
         * @throws IllegalStateException if the header is missing, is not a valid
         *         integer, or is negative.
         */
        public int getContentLength() {
            String lengthStr = fields.get("Content-Length");
            if (lengthStr == null) {
                throw new IllegalStateException("Missing Content-Length header");
            }
            int length;
            try {
                length = Integer.parseInt(lengthStr);
            } catch (NumberFormatException e) {
                throw new IllegalStateException("Malformed Content-Length header: " + lengthStr);
            }
            // A negative length would otherwise surface later as a NegativeArraySizeException.
            if (length < 0) {
                throw new IllegalStateException("Malformed Content-Length header: " + lengthStr);
            }
            return length;
        }

        /**
         * Returns the value of a selected header field, or null if there is no header with
         * that field name.
         * @param field The name of the header to return (case-sensitive).
         * @return The value associated with that field name, or null if not present.
         */
        public String getField(String field) {
            return fields.get(field);
        }

        /**
         * Appends this header to a {@link DataOutput} stream, in WARC/1.0 format.
         * The empty line that terminates the header section is not written here.
         * @param out The data output to which the header should be written.
         * @throws IOException
         */
        public void write(DataOutput out) throws IOException {
            out.write(toString().getBytes("UTF-8"));
        }

        /**
         * Formats this header in WARC/1.0 format, consisting of a version line followed
         * by colon-delimited key-value pairs, and `\r\n` line endings.
         */
        @Override
        public String toString() {
            StringBuilder buf = new StringBuilder();
            buf.append(WARC_VERSION);
            buf.append(CRLF);
            for (Map.Entry<String, String> field : fields.entrySet()) {
                buf.append(field.getKey());
                buf.append(": ");
                buf.append(field.getValue());
                buf.append(CRLF);
            }
            return buf.toString();
        }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy