All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.jwat.arc.ArcReaderFactory Maven / Gradle / Ivy

Go to download

Used for reading, writing and validating ARC files. It tries to follow the ad hoc standard as much as possible.

There is a newer version: 1.2.1
Show newest version
/**
 * Java Web Archive Toolkit - Software to read and validate ARC, WARC
 * and GZip files. (http://jwat.org/)
 * Copyright 2011-2012 Netarkivet.dk (http://netarkivet.dk/)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.jwat.arc;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;

import org.jwat.common.ByteCountingPushBackInputStream;
import org.jwat.common.Scheme;
import org.jwat.gzip.GzipReader;

/**
 * Factory used for creating ArcReader instances.
 * The general getReader methods will auto-detect Gzip'ed data
 * and return the appropriate ArcReader instances.
 * The other factory methods can be used to return specific
 * ArcReader instances for compressed or uncompressed records.
 * Readers are available for both sequential and random reading of records.
 * Use of buffered methods and/or buffering speeds up the reader considerably.
 *
 * @author nicl
 */
public class ArcReaderFactory {

    /** Buffer size used by PushbackInputStream. */
    public static final int PUSHBACK_BUFFER_SIZE = 32;

    /**
     * Non-public constructor; all functionality is exposed through the
     * static factory methods. It is declared protected, so only subclasses
     * and classes in this package can invoke it.
     */
    protected ArcReaderFactory() {
    }

    /**
     * Check head of PushBackInputStream for an ARC file identifier.
     * The bytes at the head of the stream are compared against
     * <code>ArcConstants.ARC_MAGIC_HEADER</code>. The stream is only peeked.
     * @param pbin PushBackInputStream with an ARC version block
     * @return boolean indicating presence of an ARC file identifier
     * @throws IOException if an i/o error occurs while examining head of stream
     */
    public static boolean isArcFile(ByteCountingPushBackInputStream pbin) throws IOException {
        // Encode with an explicit charset so the comparison does not depend
        // on the platform default encoding. US-ASCII is a charset every Java
        // platform must support; a failure would surface as an IOException
        // (UnsupportedEncodingException), which is already declared.
        byte[] arcBytes = ArcConstants.ARC_MAGIC_HEADER.getBytes("US-ASCII");
        // Size the peek buffer from the encoded bytes, not the char count.
        byte[] streamBytes = new byte[arcBytes.length];
        // Look for an ARC file identifier in the beginning of the stream.
        pbin.peek(streamBytes);
        return Arrays.equals(arcBytes, streamBytes);
    }

    /**
     * Check head of PushBackInputStream for an ARC record identifier.
     * A record is recognized when the peeked bytes start with a valid
     * scheme, as determined by <code>Scheme.startsWithScheme</code>.
     * The stream is only peeked.
     * @param pbin PushBackInputStream positioned at a possible ARC record
     * @return boolean indicating whether the head of the stream starts
     * with a valid scheme
     * @throws IOException if an i/o error occurs while examining head of stream
     */
    public static boolean isArcRecord(ByteCountingPushBackInputStream pbin) throws IOException {
        // Peek as many bytes as the pushback buffer allocated by the factory
        // methods of this class can hold (same value as the former literal 32).
        byte[] streamBytes = new byte[PUSHBACK_BUFFER_SIZE];
        // Look for a valid scheme in the beginning of the stream.
        pbin.peek(streamBytes);
        return Scheme.startsWithScheme(streamBytes);
    }

    /**
     * Creates a new ArcReader from an InputStream
     * wrapped by a BufferedInputStream.
     * The ArcReader implementation returned is chosen based on
     * GZip auto detection.
     * @param in ARC File represented as InputStream
     * @param buffer_size buffer size to use
     * @return appropriate ArcReader based on data read from
     * InputStream
     * @throws IOException if an i/o exception occurs during initialization
     * @throws IllegalArgumentException if <code>in</code> is null or
     * <code>buffer_size</code> is less than or equal to zero
     */
    public static ArcReader getReader(InputStream in, int buffer_size)
                                                        throws IOException {
        checkInputStream(in);
        checkBufferSize(buffer_size);
        ByteCountingPushBackInputStream pbin =
                wrapPushBack(new BufferedInputStream(in, buffer_size));
        if (GzipReader.isGzipped(pbin)) {
            return new ArcReaderCompressed(new GzipReader(pbin),
                                           buffer_size);
        }
        return new ArcReaderUncompressed(pbin);
    }

    /**
     * Creates a new ArcReader from an InputStream.
     * The ArcReader implementation returned is chosen based on
     * GZip auto detection.
     * @param in ARC File represented as InputStream
     * @return appropriate ArcReader based on data read from
     * InputStream
     * @throws IOException if an i/o exception occurs during initialization
     * @throws IllegalArgumentException if <code>in</code> is null
     */
    public static ArcReader getReader(InputStream in) throws IOException {
        checkInputStream(in);
        ByteCountingPushBackInputStream pbin = wrapPushBack(in);
        if (GzipReader.isGzipped(pbin)) {
            return new ArcReaderCompressed(new GzipReader(pbin));
        }
        return new ArcReaderUncompressed(pbin);
    }

    /**
     * Creates a new ArcReader without any associated
     * InputStream for random access to uncompressed records.
     * @return ArcReader for uncompressed records, not yet bound to an
     * InputStream
     */
    public static ArcReaderUncompressed getReaderUncompressed() {
        return new ArcReaderUncompressed();
    }

    /**
     * Creates a new ArcReader from an InputStream
     * primarily for random access to uncompressed records.
     * @param in ARC File represented as InputStream
     * @return ArcReader for uncompressed records read from
     * InputStream
     * @throws IOException i/o exception while initializing reader
     * @throws IllegalArgumentException if <code>in</code> is null
     */
    public static ArcReaderUncompressed getReaderUncompressed(InputStream in)
                                                        throws IOException {
        checkInputStream(in);
        return new ArcReaderUncompressed(wrapPushBack(in));
    }

    /**
     * Creates a new ArcReader from an InputStream
     * wrapped by a BufferedInputStream primarily for random
     * access to uncompressed records.
     * @param in ARC File represented as InputStream
     * @param buffer_size buffer size to use
     * @return ArcReader for uncompressed records read from
     * InputStream
     * @throws IOException i/o exception while initializing reader
     * @throws IllegalArgumentException if <code>in</code> is null or
     * <code>buffer_size</code> is less than or equal to zero
     */
    public static ArcReaderUncompressed getReaderUncompressed(InputStream in,
                                        int buffer_size) throws IOException {
        checkInputStream(in);
        checkBufferSize(buffer_size);
        return new ArcReaderUncompressed(
                wrapPushBack(new BufferedInputStream(in, buffer_size)));
    }

    /**
     * Creates a new ArcReader without any associated
     * InputStream for random access to GZip compressed records.
     * @return ArcReader for GZip compressed records, not yet bound to an
     * InputStream
     */
    public static ArcReaderCompressed getReaderCompressed() {
        return new ArcReaderCompressed();
    }

    /**
     * Creates a new ArcReader from an InputStream
     * primarily for random access to GZip compressed records.
     * @param in ARC File represented as InputStream
     * @return ArcReader for GZip compressed records read from
     * InputStream
     * @throws IOException i/o exception while initializing reader
     * @throws IllegalArgumentException if <code>in</code> is null
     */
    public static ArcReaderCompressed getReaderCompressed(InputStream in)
                                                        throws IOException {
        checkInputStream(in);
        return new ArcReaderCompressed(new GzipReader(in));
    }

    /**
     * Creates a new ArcReader from an InputStream
     * wrapped by a BufferedInputStream primarily for random
     * access to GZip compressed records.
     * @param in ARC File represented as InputStream
     * @param buffer_size buffer size to use
     * @return ArcReader for GZip compressed records read from
     * InputStream
     * @throws IOException i/o exception while initializing reader
     * @throws IllegalArgumentException if <code>in</code> is null or
     * <code>buffer_size</code> is less than or equal to zero
     */
    public static ArcReaderCompressed getReaderCompressed(InputStream in,
                                        int buffer_size) throws IOException {
        checkInputStream(in);
        checkBufferSize(buffer_size);
        return new ArcReaderCompressed(new GzipReader(
                                new BufferedInputStream(in, buffer_size)));
    }

    /**
     * Rejects a null input stream.
     * @param in stream argument supplied to a factory method
     * @throws IllegalArgumentException if <code>in</code> is null
     */
    private static void checkInputStream(InputStream in) {
        if (in == null) {
            throw new IllegalArgumentException(
                    "The inputstream 'in' is null");
        }
    }

    /**
     * Rejects a non-positive buffer size.
     * @param buffer_size buffer size argument supplied to a factory method
     * @throws IllegalArgumentException if <code>buffer_size</code> is less
     * than or equal to zero
     */
    private static void checkBufferSize(int buffer_size) {
        if (buffer_size <= 0) {
            throw new IllegalArgumentException(
                    "The 'buffer_size' is less than or equal to zero: "
                    + buffer_size);
        }
    }

    /**
     * Wraps a stream in a ByteCountingPushBackInputStream using
     * {@link #PUSHBACK_BUFFER_SIZE} as the pushback buffer size.
     * @param in stream to wrap, already validated by the caller
     * @return pushback stream wrapping <code>in</code>
     * @throws IOException declared because the factory methods that
     * performed this wrapping inline also declare it
     */
    private static ByteCountingPushBackInputStream wrapPushBack(InputStream in)
                                                        throws IOException {
        return new ByteCountingPushBackInputStream(in, PUSHBACK_BUFFER_SIZE);
    }

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy