org.archive.io.ArchiveRecord

/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.io;

import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.logging.Level;

import org.archive.util.Base32;

/**
 * Archive file Record.
 * @author stack
 * @version $Date$ $Version$
 */
public abstract class ArchiveRecord extends InputStream {

    /**
     * Minimal http response or request header length.
     *
     * I've seen in arcs a content length of 1 with no header.
     */
    protected static final long MIN_HTTP_HEADER_LENGTH =
            Math.min("HTTP/1.1 200 OK\r\n".length(), "GET / HTTP/1.0\n\r".length());
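    // For reference: "HTTP/1.1 200 OK\r\n".length() is 17 and
    // "GET / HTTP/1.0\n\r".length() is 16, so this constant evaluates to 16.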

    protected ArchiveRecordHeader header = null;

    /**
     * Stream to read this record from.
     *
     * <p>The stream can only be read sequentially.  It will only return this
     * record's content, returning a -1 if you try to read beyond the end of
     * the current record.
     *
     * <p>Streams can be markable or not.  If they are, we'll be able to roll
     * back when we've read too far.  If not markable, the assumption is that
     * the underlying stream is managing our not reading too much (this
     * pertains to the skipping over the end of the ARCRecord; see
     * {@link #skip()}).
     */
    protected InputStream in = null;

    /**
     * Position within the Record content, within <code>in</code>.
     * This position is relative within this Record.  It is not the same as
     * the Archive file position.
     */
    protected long position = 0;

    /**
     * Set flag when we've reached the end-of-record.
     */
    protected boolean eor = false;

    /**
     * Compute digest on what we read and add to metadata when done.
     *
     * <p>Currently hardcoded as SHA-1.  TODO: Remove when archive records
     * carry a digest, or else add a facility that allows the arc reader to
     * compare the calculated digest to the one recorded in the arc.
     *
     * <p>Protected instead of private so subclasses can update and complete
     * the digest.
     */
    protected MessageDigest digest = null;

    private String digestStr = null;

    protected boolean strict = false;

    /**
     * Constructor.
     *
     * @param in Stream cued up to be at the start of the record this
     * instance is to represent.
     * @throws IOException
     */
    public ArchiveRecord(InputStream in) throws IOException {
        this(in, null, 0, true, false);
    }

    /**
     * Constructor.
     *
     * @param in Stream cued up to be at the start of the record this
     * instance is to represent.
     * @param header Header data.
     * @throws IOException
     */
    public ArchiveRecord(InputStream in, ArchiveRecordHeader header)
            throws IOException {
        this(in, header, 0, true, false);
    }

    /**
     * Constructor.
     *
     * @param in Stream cued up to be at the start of the record this
     * instance is to represent.
     * @param header Header data.
     * @param bodyOffset Offset into the body.  Usually 0.
     * @param digest True if we're to calculate digest for this record.  Not
     * digesting saves about 15% of cpu during an ARC parse.
     * @param strict Be strict parsing (parsing stops if the ARC is
     * improperly formatted).
     * @throws IOException
     */
    public ArchiveRecord(InputStream in, ArchiveRecordHeader header,
            int bodyOffset, boolean digest, boolean strict)
            throws IOException {
        this.in = in;
        this.header = header;
        this.position = bodyOffset;
        if (digest) {
            try {
                this.digest = MessageDigest.getInstance("SHA1");
            } catch (NoSuchAlgorithmException e) {
                // Convert to IOE because that's more amenable to callers
                // -- they are dealing with it anyway.
                throw new IOException(e.getMessage());
            }
        }
        this.strict = strict;
    }

    public boolean markSupported() {
        return false;
    }

    /**
     * @return Header data for this record.
     */
    public ArchiveRecordHeader getHeader() {
        return this.header;
    }

    protected void setHeader(ArchiveRecordHeader header) {
        this.header = header;
    }

    /**
     * Calling close on a record skips us past this record to the next
     * record in the stream.
     *
     * It does not actually close the stream.  The underlying stream is
     * probably being used by the next arc record.
     *
     * @throws IOException
     */
    public void close() throws IOException {
        if (this.in != null) {
            skip();
            this.in = null;
            if (this.digest != null) {
                this.digestStr = Base32.encode(this.digest.digest());
            }
        }
    }

    /**
     * @return Next character in this Record content, else -1 if at EOR.
     * @throws IOException
     */
    public int read() throws IOException {
        int c = -1;
        if (available() > 0) {
            c = this.in.read();
            if (c == -1) {
                throw new IOException("Premature EOF before end-of-record.");
            }
            if (this.digest != null) {
                this.digest.update((byte) c);
            }
            incrementPosition();
        }
        return c;
    }

    public int read(byte[] b, int offset, int length) throws IOException {
        int read = Math.min(length, available());
        if (read == -1 || read == 0) {
            read = -1;
        } else {
            read = this.in.read(b, offset, read);
            if (read == -1) {
                String msg = "Premature EOF before end-of-record: "
                        + getHeader().getHeaderFields();
                if (isStrict()) {
                    throw new IOException(msg);
                }
                setEor(true);
                System.err.println(Level.WARNING.toString() + " " + msg);
            }
            if (this.digest != null && read >= 0) {
                this.digest.update(b, offset, read);
            }
            incrementPosition(read);
        }
        return read;
    }

    /**
     * This available is not the stream's available.  It is based on what the
     * stated Archive record length is, minus what we've read to date.
     *
     * @return Count of bytes remaining in record content.
     */
    public int available() {
        long amount = getHeader().getLength() - getPosition();
        return (amount > Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) amount);
    }

    /**
     * Skip over this record's content.
     *
     * @throws IOException
     */
    protected void skip() throws IOException {
        if (this.eor) {
            return;
        }
        // Read to the end of the body of the record.  Exhaust the stream.
        // Can't skip direct to end because the underlying stream may be
        // compressed and we're calculating the digest for the record.
        int r = available();
        while (r > 0 && !this.eor) {
            skip(r);
            r = available();
        }
    }

    public long skip(long n) throws IOException {
        final int SKIP_BUFFERSIZE = 1024 * 4;
        byte[] b = new byte[SKIP_BUFFERSIZE];
        long total = 0;
        for (int read = 0; (total < n) && (read != -1);) {
            read = Math.min(SKIP_BUFFERSIZE, (int) (n - total));
            // TODO: Interestingly, reading from a compressed stream, we only
            // read about 500 characters at a time though we ask for 4k.
            // Look at this sometime.
            read = read(b, 0, read);
            if (read <= 0) {
                read = -1;
            } else {
                total += read;
            }
        }
        return total;
    }

    /**
     * @return Returns the strict.
     */
    public boolean isStrict() {
        return this.strict;
    }

    /**
     * @param strict The strict to set.
     */
    public void setStrict(boolean strict) {
        this.strict = strict;
    }

    protected InputStream getIn() {
        return this.in;
    }

    public String getDigestStr() {
        return this.digestStr;
    }

    protected void incrementPosition() {
        this.position++;
    }

    protected void incrementPosition(final long incr) {
        this.position += incr;
    }

    public long getPosition() {
        return this.position;
    }

    protected boolean isEor() {
        return eor;
    }

    protected void setEor(boolean eor) {
        this.eor = eor;
    }

    // Per-field helpers for the CDX line built in outputCdx below; subclasses
    // can override to supply values this base class does not know.
    protected String getStatusCode4Cdx(final ArchiveRecordHeader h) {
        return "-";
    }

    protected String getIp4Cdx(final ArchiveRecordHeader h) {
        return "-";
    }

    protected String getDigest4Cdx(final ArchiveRecordHeader h) {
        return getDigestStr() == null ? "-" : getDigestStr();
    }

    protected String getMimetype4Cdx(final ArchiveRecordHeader h) {
        return h.getMimetype();
    }

    protected String outputCdx(final String strippedFileName)
            throws IOException {
        // Read the whole record so we get a hash out.  Should be safe
        // calling close on an already closed Record.
        close();
        ArchiveRecordHeader h = getHeader();
        StringBuilder buffer =
            new StringBuilder(ArchiveFileConstants.CDX_LINE_BUFFER_SIZE);
        buffer.append(h.getDate());
        buffer.append(ArchiveFileConstants.SINGLE_SPACE);
        buffer.append(getIp4Cdx(h));
        buffer.append(ArchiveFileConstants.SINGLE_SPACE);
        buffer.append(h.getUrl());
        buffer.append(ArchiveFileConstants.SINGLE_SPACE);
        buffer.append(getMimetype4Cdx(h));
        buffer.append(ArchiveFileConstants.SINGLE_SPACE);
        buffer.append(getStatusCode4Cdx(h));
        buffer.append(ArchiveFileConstants.SINGLE_SPACE);
        buffer.append(getDigest4Cdx(h));
        buffer.append(ArchiveFileConstants.SINGLE_SPACE);
        buffer.append(h.getOffset());
        buffer.append(ArchiveFileConstants.SINGLE_SPACE);
        buffer.append(h.getLength());
        buffer.append(ArchiveFileConstants.SINGLE_SPACE);
        buffer.append(strippedFileName != null ? strippedFileName : '-');
        return buffer.toString();
    }

    /**
     * Writes output on STDOUT.
     * @throws IOException
     */
    public void dump() throws IOException {
        dump(System.out);
    }

    /**
     * Writes output on passed os.
     * @throws IOException
     */
    public void dump(final OutputStream os) throws IOException {
        final byte[] outputBuffer = new byte[16 * 1024];
        int read = outputBuffer.length;
        while ((read = read(outputBuffer, 0, outputBuffer.length)) != -1) {
            os.write(outputBuffer, 0, read);
        }
        os.flush();
    }

    /**
     * Is it likely that this record contains headers?
     *
     * <p>This method will return true if the body is an http response that
     * includes http response headers, or the body is an http request that
     * includes request headers, etc.  Be aware that headers in content are
     * distinct from {@link ArchiveRecordHeader} 'headers'.
     *
     * @return True if this Record's content has headers.
     */
    public boolean hasContentHeaders() {
        final String url = getHeader().getUrl();
        if (url == null) {
            return false;
        }
        if (!url.toLowerCase().startsWith("http")) {
            return false;
        }
        if (getHeader().getLength() <= MIN_HTTP_HEADER_LENGTH) {
            return false;
        }
        return true;
    }

    protected void setBodyOffset(int bodyOffset) {
        this.position = bodyOffset;
    }
}
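
For context, here is a minimal usage sketch for the class above. It assumes the companion ArchiveReader and ArchiveReaderFactory classes that ship alongside ArchiveRecord in org.archive.io, and that ArchiveReader is Iterable over its records, as in the Heritrix commons API; treat those names and signatures as assumptions rather than part of the listing above.

import java.io.IOException;

import org.archive.io.ArchiveReader;
import org.archive.io.ArchiveReaderFactory;
import org.archive.io.ArchiveRecord;

public class PrintRecordDigests {
    public static void main(String[] args) throws IOException {
        // Assumed: ArchiveReaderFactory.get(String) opens an ARC/WARC file
        // by path and returns a reader that iterates ArchiveRecords.
        ArchiveReader reader = ArchiveReaderFactory.get(args[0]);
        try {
            for (ArchiveRecord record : reader) {
                // close() exhausts the record (without closing the shared
                // stream), which is what finalizes the SHA-1 digest.
                record.close();
                System.out.println(record.getHeader().getUrl()
                        + " " + record.getDigestStr());
            }
        } finally {
            reader.close();
        }
    }
}

Note that getDigestStr() only returns a value after close(), because the digest is completed as the record body is exhausted; asking for the digest on a partially read record yields null.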




