All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.jwat.warc.WarcReader Maven / Gradle / Ivy

Go to download

Used for reading, writing and validating WARC files. Implemented to follow the WARC/1.0 ISO specification as closely as possible.

There is a newer version: 1.2.1
Show newest version
/**
 * Java Web Archive Toolkit - Software to read and validate ARC, WARC
 * and GZip files. (http://jwat.org/)
 * Copyright 2011-2012 Netarkivet.dk (http://netarkivet.dk/)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.jwat.warc;

import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;
import java.util.Iterator;
import java.util.NoSuchElementException;

import org.jwat.common.Diagnostics;
import org.jwat.common.HeaderLineReader;
import org.jwat.common.UriProfile;

/**
 * Base class for WARC reader implementations.
 *
 * @author nicl
 */
public abstract class WarcReader implements Closeable, Iterable {

    /*
     * Settings.
     */

    /** WARC-Target-URI profile. */
    protected UriProfile warcTargetUriProfile;

    /** URI profile. */
    protected UriProfile uriProfile;

    /** Default block digest algorithm to use if none is present in the
     *  record. */
    protected String blockDigestAlgorithm;

    /** Default encoding scheme used to encode block digest into a string,
     *  if none is detected from the record. */
    protected String blockDigestEncoding = "base32";

    /** Payload Digest enabled/disabled. */
    protected boolean bPayloadDigest = false;

    /** Default payload digest algorithm to use if none is present in the
     *  record. */
    protected String payloadDigestAlgorithm;

    /** Default encoding scheme used to encode payload digest into a string,
     *  if none is detected from the record. */
    protected String payloadDigestEncoding = "base32";

    /** Block Digest enabled/disabled. */
    protected boolean bBlockDigest = false;

    /** Max size allowed for a record header. */
    protected int recordHeaderMaxSize;

    /** Max size allowed for a payload header. */
    protected int payloadHeaderMaxSize;

    /** Line reader used to read version lines. */
    protected HeaderLineReader lineReader;

    /** Header line reader used to read the WARC headers. */
    protected HeaderLineReader headerLineReader;

    /** WARC field parser used. */
    protected WarcFieldParsers fieldParsers;

    /** Report HTTP header errors on reader diagnosis list. */
    protected boolean bReportHttpHeaderError = true;

    /*
     * State.
     */

    /** Used for version based validation. */
    public final WarcValidation validation = new WarcValidation();

    /** Reader level errors and warnings or when no record is available. */
    public final Diagnostics diagnostics = new Diagnostics();

    /** Compliance status for records parsed up to now. */
    protected boolean bIsCompliant = true;

    /** Number of bytes consumed by this reader. */
    protected long consumed = 0;

    /** Records parsed. */
    protected int records = 0;

    /** Aggregated number of errors encountered while parsing. */
    protected int errors = 0;

    /** Aggregate number of warnings encountered while parsing. */
    protected int warnings = 0;

    /** Current WARC record object. */
    protected WarcRecord currentRecord;

    /** Exception thrown while using the iterator. */
    protected Exception iteratorExceptionThrown;

    /** Callback used to monitor and modify internal data as it is being parsed. */
    protected WarcRecordParserCallback wrpCallback;

    /**
     * Method used to initialize a readers internal state.
     * Must be called by all constructors.
     */
    protected void init() {
        warcTargetUriProfile = UriProfile.RFC3986;
        uriProfile = UriProfile.RFC3986;
        recordHeaderMaxSize = 8192;
        payloadHeaderMaxSize = 32768;
        lineReader = HeaderLineReader.getReader();
        lineReader.bNameValue = false;
        lineReader.encoding = HeaderLineReader.ENC_US_ASCII;
        headerLineReader = HeaderLineReader.getReader();
        headerLineReader.bNameValue = true;
        headerLineReader.encoding = HeaderLineReader.ENC_UTF8;
        headerLineReader.bLWS = true;
        headerLineReader.bQuotedText = true;
        headerLineReader.bEncodedWords = true;
        fieldParsers = new WarcFieldParsers();
    }

    /**
     * Reset reader for reuse.
     */
    public void reset() {
        diagnostics.reset();
        bIsCompliant = true;
        consumed = 0;
        records = 0;
        errors = 0;
        warnings = 0;
        currentRecord = null;
    }

    /**
     * Returns a boolean indicating if all records parsed so far are compliant.
     * @return a boolean indicating if all records parsed so far are compliant
     */
    public boolean isCompliant() {
        return bIsCompliant;
    }

    /**
     * Is this reader assuming GZip compressed input.
     * @return boolean indicating the assumption of GZip compressed input
     */
    public abstract boolean isCompressed();

    /**
     * Set the URI profile used to validate WARC-Target URIs.
     * If null, the uriProfile is set to RCF3986.
     * @param uriProfile URI profile to use
     */
    public void setWarcTargetUriProfile(UriProfile uriProfile) {
        if (uriProfile == null) {
            uriProfile = UriProfile.RFC3986;
        }
        this.warcTargetUriProfile = uriProfile;
    }

    /**
     * Get the URI profile used to validate WARC-Target URIs.
     * @return the URI profile used to validate WARC-Target URIs
     */
    public UriProfile getWarcTargetUriProfile() {
        return warcTargetUriProfile;
    }

    /**
     * Set the URI profile used to validate URIs.
     * If null, the uriProfile is set to RCF3986.
     * @param uriProfile URI profile to use
     */
    public void setUriProfile(UriProfile uriProfile) {
        if (uriProfile == null) {
            uriProfile = UriProfile.RFC3986;
        }
        this.uriProfile = uriProfile;
    }

    /**
     * Get the URI profile used to validate URIs.
     * @return the URI profile used to validate URIs
     */
    public UriProfile getUriProfile() {
        return uriProfile;
    }

    /**
     * Get the readers block digest on/off status.
     * @return boolean indicating block digest on/off
     */
    public boolean getBlockDigestEnabled() {
        return bBlockDigest;
    }

    /**
     * Set the readers block digest on/off status. Digest, however,
     * will only be computed if either a Warc-Block-Digest header is
     * present or an optional algorithm has been chosen.
     * The Warc-Block-Digest always takes precedence.
     * @param enabled boolean indicating block digest on/off
     */
    public void setBlockDigestEnabled(boolean enabled) {
        bBlockDigest = enabled;
    }

    /**
     * Get the readers payload digest on/off status.
     * @return boolean indicating payload digest on/off
     */
    public boolean getPayloadDigestEnabled() {
        return bPayloadDigest;
    }

    /**
     * Set the readers payload digest on/off status. Digest, however,
     * will only be computed if either a Warc-Payload-Digest header is
     * present or an optional algorithm has been chosen.
     * The Warc-Payload-Digest always takes precedence.
     * @param enabled boolean indicating payload digest on/off
     */
    public void setPayloadDigestEnabled(boolean enabled) {
        bPayloadDigest = enabled;
    }

    /**
     * Get the default block digest algorithm.
     * @return default block digest algorithm
     */
    public String getBlockDigestAlgorithm() {
        return blockDigestAlgorithm;
    }

    /**
     * Tries to set the default block digest algorithm and returns a boolean
     * indicating whether the algorithm was accepted or not. This algorithm is
     * only used in case no WARC payload digest header is present in the record.
     * @param digestAlgorithm block digest algorithm
     * (null means no default block digest algorithm is selected)
     * @return boolean indicating the validity of the algorithm supplied
     */
    public boolean setBlockDigestAlgorithm(String digestAlgorithm) {
        if (digestAlgorithm == null || digestAlgorithm.length() == 0) {
            blockDigestAlgorithm = null;
            return true;
        }
        if (WarcDigest.digestAlgorithmLength(digestAlgorithm) > 0) {
            blockDigestAlgorithm = digestAlgorithm;
            return true;
        }
        return false;
    }

    /**
     * Get the default payload digest algorithm.
     * @return default payload digest algorithm
     */
    public String getPayloadDigestAlgorithm() {
        return payloadDigestAlgorithm;
    }

    /**
     * Tries to set the default payload digest algorithm and returns a boolean
     * indicating whether the algorithm was accepted or not. This algorithm is
     * only used in case no WARC payload digest header is present in the record.
     * @param digestAlgorithm payload digest algorithm
     * (null means no default payload digest algorithm is selected)
     * @return boolean indicating the validity of the algorithm supplied
     */
    public boolean setPayloadDigestAlgorithm(String digestAlgorithm) {
        if (digestAlgorithm == null || digestAlgorithm.length() == 0) {
            payloadDigestAlgorithm = null;
            return true;
        }
        if (WarcDigest.digestAlgorithmLength(digestAlgorithm) > 0) {
            payloadDigestAlgorithm = digestAlgorithm;
            return true;
        }
        return false;
    }

    /**
     * Get the default block digest encoding scheme.
     * @return default block digest encoding scheme
     */
    public String getBlockDigestEncoding() {
        return blockDigestEncoding;
    }

    /**
     * Set the default block digest encoding scheme. This scheme is only
     * used if none can be inferred from an existing block digest header.
     * @param encodingScheme encoding scheme
     * (null means default block digest is not encoded)
     */
    public void setBlockDigestEncoding(String encodingScheme) {
        if (encodingScheme != null && encodingScheme.length() > 0) {
            blockDigestEncoding = encodingScheme.toLowerCase();
        } else {
            blockDigestEncoding = null;
        }
    }

    /**
     * Get the default payload digest encoding scheme.
     * @return default payload digest encoding scheme
     */
    public String getPayloadDigestEncoding() {
        return payloadDigestEncoding;
    }

    /**
     * Set the default payload digest encoding scheme. This scheme is only
     * used if none can be inferred from an existing payload digest header.
     * @param encodingScheme encoding scheme
     * (null means default payload digest is not encoded)
     */
    public void setPayloadDigestEncoding(String encodingScheme) {
        if (encodingScheme != null && encodingScheme.length() > 0) {
            payloadDigestEncoding = encodingScheme.toLowerCase();
        } else {
            payloadDigestEncoding = null;
        }
    }

    /**
     * Get the max size allowed for a record header.
     * @return max size allowed for a record header
     */
    public int getRecordHeaderMaxSize() {
        return recordHeaderMaxSize;
    }

    /**
     * Set the max size allowed for a record header.
     * @param size max size allowed
     */
    public void setRecordHeaderMaxSize(int size) {
        recordHeaderMaxSize = size;
    }

    /**
     * Get the max size allowed for a payload header.
     * @return max size allowed for a payload header
     */
    public int getPayloadHeaderMaxSize() {
        return payloadHeaderMaxSize;
    }

    /**
     * Set the max size allowed for a payload header.
     * @param size max size allowed
     */
    public void setPayloadHeaderMaxSize(int size) {
        payloadHeaderMaxSize = size;
    }

    /**
     * Get a boolean indicating whether HTTP header error reporting is enabled or disabled.
     * @return boolean indicating whether HTTP header error reporting is enabled or disabled
     */
    public boolean getReportHttpHeaderErrors() {
        return bReportHttpHeaderError;
    }

    /**
     * Enable or disable the readers HTTP header error being included in the readers diagnosis list.
     * @param bReportHttpHeaderError Enable or disable HTTP header error reporting
     */
    public void setReportHttpHeaderErrors(boolean bReportHttpHeaderError) {
        this.bReportHttpHeaderError = bReportHttpHeaderError;
    }

    /**
     * Close current record resource(s) and input stream(s).
     */
    public abstract void close();

    /**
     * Callback method called when the payload has been processed.
     */
    protected abstract void recordClosed();

    /**
     * Get the offset of the current WARC record or -1 if none have been read.
     * @return offset of the current WARC record or -1
     */
    public abstract long getStartOffset();

    /**
     * Get the current offset in the WARC InputStream.
     * @return offset in WARC InputStream
     */
    public abstract long getOffset();

    /**
     * Get number of bytes consumed by this reader.
     * @return number of bytes consumed by this reader
     */
   public abstract long getConsumed();

   /**
     * Parses and gets the next record.
     * This method is for linear access to records.
     * @return the next record
     * @throws IOException I/O exception in parsing process
     */
    public abstract WarcRecord getNextRecord() throws IOException;

    /**
     * Parses and gets the next record from an Inputstream.
     * This method is mainly for random access use since there are serious
     * side-effects involved in using multiple PushBackInputStream
     * instances.
     * @param in InputStream used to read next record
     * @param offset offset provided by caller
     * @return the next record
     * @throws IOException I/O exception in parsing process
     */
    public abstract WarcRecord getNextRecordFrom(InputStream in, long offset)
                                                        throws IOException;

    /**
     * Parses and gets the next record from an Inputstream wrapped
     * by a BufferedInputStream.
     * This method is mainly for random access use since there are serious
     * side-effects involved in using multiple PushBackInputStream
     * instances.
     * @param in InputStream used to read next record
     * @param offset offset provided by caller
     * @param buffer_size buffer size to use
     * @return the next record
     * @throws IOException I/O exception in parsing process
     */
    public abstract WarcRecord getNextRecordFrom(InputStream in, long offset,
                                        int buffer_size) throws IOException;

    /**
     * Gets an exception thrown in the iterator if any or null.
     * @return exception thrown in the iterator if any or null
     */
    public Exception getIteratorExceptionThrown() {
        return iteratorExceptionThrown;
    }

    public WarcRecordParserCallback getWarcRecordParserCallback() {
        return wrpCallback;
    }

    public void setWarcRecordParserCallback(WarcRecordParserCallback wrpCallback) {
        this.wrpCallback = wrpCallback;
    }

    /**
     * Returns an Iterator over the records as they are being
     * parsed. Any exception thrown during parsing is accessible through the
     * getIteratorExceptionThrown method.
     * @return Iterator over the records
     */
    public Iterator iterator() {
        return new Iterator() {

            /** Internal next record updated by either hasNext() or next(). */
            private WarcRecord next;

            /** Entry returned by next(). */
            private WarcRecord current;

            @Override
            public boolean hasNext() {
                if (next == null) {
                    iteratorExceptionThrown = null;
                    try {
                        next = getNextRecord();
                    } catch (IOException e) {
                        iteratorExceptionThrown = e;
                    }
                }
                return (next != null);
            }

            @Override
            public WarcRecord next() {
                if (next == null) {
                    iteratorExceptionThrown = null;
                    try {
                        next = getNextRecord();
                    } catch (IOException e) {
                        iteratorExceptionThrown = e;
                    }
                }
                if (next == null) {
                    throw new NoSuchElementException();
                }
                current = next;
                next = null;
                return current;
            }

            @Override
            public void remove() {
                throw new UnsupportedOperationException();
            }
        };
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy