All downloads are free. Search and download functionality uses the official Maven repository.

org.jwat.warc.WarcReaderFactory Maven / Gradle / Ivy

Go to download

Used for reading, writing and validating WARC files. Implemented to follow the WARC/1.0 ISO specification as closely as possible.

There is a newer version: 1.2.1
Show newest version
/**
 * Java Web Archive Toolkit - Software to read and validate ARC, WARC
 * and GZip files. (http://jwat.org/)
 * Copyright 2011-2012 Netarkivet.dk (http://netarkivet.dk/)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.jwat.warc;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;

import org.jwat.archive.common.ReaderFactoryAbstract;
import org.jwat.common.ByteCountingPushBackInputStream;
import org.jwat.gzip.GzipReader;

/**
 * Factory used for creating WarcReader instances.
 * The general getReader methods auto-detect GZip'ed data and return the
 * matching WarcReader implementation. The remaining factory methods return
 * a specific WarcReader implementation for either compressed or
 * uncompressed records, with or without an associated InputStream.
 * The overloads taking a buffer size wrap the supplied InputStream in a
 * BufferedInputStream; use of buffering speeds up the reader considerably.
 *
 * @author nicl
 */
public class WarcReaderFactory extends ReaderFactoryAbstract {

    /**
     * Pushback buffer size passed to every ByteCountingPushBackInputStream
     * created by this factory.
     */
    public static final int PUSHBACK_BUFFER_SIZE = 32;

    /**
     * Protected constructor; instances are not needed since all factory
     * methods are static, but subclasses may still extend the factory.
     */
    protected WarcReaderFactory() {
    }

    /**
     * Check head of PushBackInputStream for a WARC file identifier.
     * The identifier for WARC files is "WARC/" in the beginning.
     * This simply delegates to {@link #isWarcRecord(ByteCountingPushBackInputStream)}.
     * @param pbin PushBackInputStream with WARC records
     * @return boolean indicating presence of a WARC file identifier
     * @throws IOException if an I/O error occurs while examining head of stream
     */
    public static boolean isWarcFile(ByteCountingPushBackInputStream pbin) throws IOException {
        return isWarcRecord(pbin);
    }

    /**
     * Check head of PushBackInputStream for a WARC record identifier.
     * The identifier for WARC records is "WARC/" in the beginning.
     * @param pbin PushBackInputStream with WARC records
     * @return boolean indicating presence of a WARC magic number
     * @throws IOException if an I/O error occurs while examining head of stream
     */
    public static boolean isWarcRecord(ByteCountingPushBackInputStream pbin) throws IOException {
        // Encode the magic header explicitly as US-ASCII instead of relying on
        // the platform default charset, which is not ASCII-compatible on every
        // JVM. US-ASCII is a charset every Java platform must support, and the
        // UnsupportedEncodingException declared by getBytes(String) is an
        // IOException, so the method signature is unaffected.
        byte[] warcBytes = WarcConstants.WARC_MAGIC_HEADER.getBytes("US-ASCII");
        // Size the buffer from the encoded bytes so both arrays always have
        // the same length, independent of the header's character count.
        byte[] streamBytes = new byte[warcBytes.length];
        // Look for the leading magic bytes in front of every valid WARC record.
        // NOTE(review): peek is expected to leave the stream position
        // unchanged - confirm against ByteCountingPushBackInputStream.
        pbin.peek(streamBytes);
        return Arrays.equals(warcBytes, streamBytes);
    }

    /**
     * Creates a new WarcReader from an InputStream
     * wrapped by a BufferedInputStream.
     * The WarcReader implementation returned is chosen based on
     * GZip auto detection.
     * @param in WARC File represented as InputStream
     * @param buffer_size buffer size to use
     * @return appropriate WarcReader based on data read from
     * InputStream
     * @throws IOException if an I/O exception occurs during initialization
     * @throws IllegalArgumentException if <code>in</code> is null or
     * <code>buffer_size</code> is less than or equal to zero
     */
    public static WarcReader getReader(InputStream in, int buffer_size)
                                                        throws IOException {
        checkInputStream(in);
        checkBufferSize(buffer_size);
        ByteCountingPushBackInputStream pbin = newPushBackStream(
                new BufferedInputStream(in, buffer_size));
        if (GzipReader.isGzipped(pbin)) {
            return new WarcReaderCompressed(new GzipReader(pbin),
                                            buffer_size);
        }
        return new WarcReaderUncompressed(pbin);
    }

    /**
     * Creates a new WarcReader from an InputStream.
     * The WarcReader implementation returned is chosen based on
     * GZip auto detection.
     * @param in WARC File represented as InputStream
     * @return appropriate WarcReader based on data read from
     * InputStream
     * @throws IOException if an I/O exception occurs during initialization
     * @throws IllegalArgumentException if <code>in</code> is null
     */
    public static WarcReader getReader(InputStream in) throws IOException {
        checkInputStream(in);
        ByteCountingPushBackInputStream pbin = newPushBackStream(in);
        if (GzipReader.isGzipped(pbin)) {
            return new WarcReaderCompressed(new GzipReader(pbin));
        }
        return new WarcReaderUncompressed(pbin);
    }

    /**
     * Creates a new WarcReader without any associated
     * InputStream for random access to uncompressed records.
     * @return WarcReader for uncompressed records with no
     * InputStream attached
     */
    public static WarcReaderUncompressed getReaderUncompressed() {
        return new WarcReaderUncompressed();
    }

    /**
     * Creates a new WarcReader from an InputStream
     * primarily for random access to uncompressed records.
     * @param in WARC File represented as InputStream
     * @return WarcReader for uncompressed records read from
     * InputStream
     * @throws IOException I/O exception while initializing reader
     * @throws IllegalArgumentException if <code>in</code> is null
     */
    public static WarcReaderUncompressed getReaderUncompressed(InputStream in)
                                                        throws IOException {
        checkInputStream(in);
        return new WarcReaderUncompressed(newPushBackStream(in));
    }

    /**
     * Creates a new WarcReader from an InputStream
     * wrapped by a BufferedInputStream primarily for random
     * access to uncompressed records.
     * @param in WARC File represented as InputStream
     * @param buffer_size buffer size to use
     * @return WarcReader for uncompressed records read from
     * InputStream
     * @throws IOException I/O exception while initializing reader
     * @throws IllegalArgumentException if <code>in</code> is null or
     * <code>buffer_size</code> is less than or equal to zero
     */
    public static WarcReaderUncompressed getReaderUncompressed(InputStream in,
                                        int buffer_size) throws IOException {
        checkInputStream(in);
        checkBufferSize(buffer_size);
        return new WarcReaderUncompressed(newPushBackStream(
                new BufferedInputStream(in, buffer_size)));
    }

    /**
     * Creates a new WarcReader without any associated
     * InputStream for random access to GZip compressed records.
     * @return WarcReader for GZip compressed records with no
     * InputStream attached
     */
    public static WarcReaderCompressed getReaderCompressed() {
        return new WarcReaderCompressed();
    }

    /**
     * Creates a new WarcReader from an InputStream
     * primarily for random access to GZip compressed records.
     * @param in WARC File represented as InputStream
     * @return WarcReader for GZip compressed records read from
     * InputStream
     * @throws IOException I/O exception while initializing reader
     * @throws IllegalArgumentException if <code>in</code> is null
     */
    public static WarcReaderCompressed getReaderCompressed(InputStream in)
                                                        throws IOException {
        checkInputStream(in);
        return new WarcReaderCompressed(new GzipReader(in));
    }

    /**
     * Creates a new WarcReader from an InputStream
     * wrapped by a BufferedInputStream primarily for random
     * access to GZip compressed records.
     * @param in WARC File represented as InputStream
     * @param buffer_size buffer size to use
     * @return WarcReader for GZip compressed records read from
     * InputStream
     * @throws IOException I/O exception while initializing reader
     * @throws IllegalArgumentException if <code>in</code> is null or
     * <code>buffer_size</code> is less than or equal to zero
     */
    public static WarcReaderCompressed getReaderCompressed(InputStream in,
                                        int buffer_size) throws IOException {
        checkInputStream(in);
        checkBufferSize(buffer_size);
        // NOTE(review): buffer_size only sizes the BufferedInputStream here;
        // unlike the GZip branch of getReader(InputStream, int) it is not
        // forwarded to the WarcReaderCompressed constructor. Left unchanged -
        // confirm whether that difference is intentional.
        return new WarcReaderCompressed(new GzipReader(
                new BufferedInputStream(in, buffer_size)));
    }

    /**
     * Rejects a null InputStream argument.
     * @param in stream argument to validate
     * @throws IllegalArgumentException if <code>in</code> is null
     */
    private static void checkInputStream(InputStream in) {
        if (in == null) {
            throw new IllegalArgumentException(
                    "The inputstream 'in' is null");
        }
    }

    /**
     * Rejects a buffer size that is zero or negative.
     * @param buffer_size buffer size argument to validate
     * @throws IllegalArgumentException if <code>buffer_size</code> is less
     * than or equal to zero
     */
    private static void checkBufferSize(int buffer_size) {
        if (buffer_size <= 0) {
            throw new IllegalArgumentException(
                    "The 'buffer_size' is less than or equal to zero: " +
                    buffer_size);
        }
    }

    /**
     * Wraps a stream in a ByteCountingPushBackInputStream using
     * {@link #PUSHBACK_BUFFER_SIZE} as the pushback buffer size.
     * @param in stream to wrap
     * @return the wrapping ByteCountingPushBackInputStream
     */
    private static ByteCountingPushBackInputStream newPushBackStream(InputStream in) {
        return new ByteCountingPushBackInputStream(in, PUSHBACK_BUFFER_SIZE);
    }

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy