org.jwat.warc.WarcReaderFactory Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of jwat-warc Show documentation
Show all versions of jwat-warc Show documentation
Used for reading, writing and validating WARC files.
Implemented to follow the WARC/1.0 ISO specification as closely as possible.
/**
* Java Web Archive Toolkit - Software to read and validate ARC, WARC
* and GZip files. (http://jwat.org/)
* Copyright 2011-2012 Netarkivet.dk (http://netarkivet.dk/)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.jwat.warc;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import org.jwat.archive.common.ReaderFactoryAbstract;
import org.jwat.common.ByteCountingPushBackInputStream;
import org.jwat.gzip.GzipReader;
/**
 * Factory used for creating <code>WarcReader</code> instances.
 * The general <code>getReader</code> methods will auto-detect Gzip'ed data
 * and return the appropriate <code>WarcReader</code> instances.
 * The other factory methods can be used to return specific
 * <code>WarcReader</code> instances for compressed or uncompressed records.
 * Readers are available for both sequential and random reading of records.
 * Use of buffered methods and/or buffering speeds up the reader considerably.
 *
 * @author nicl
 */
public class WarcReaderFactory extends ReaderFactoryAbstract {

    /** Buffer size used by <code>PushbackInputStream</code>. */
    public static final int PUSHBACK_BUFFER_SIZE = 32;

    /**
     * Charset used to convert the WARC magic header into bytes. Named
     * explicitly so the comparison does not depend on the platform default
     * charset of the running JVM.
     */
    private static final String MAGIC_HEADER_CHARSET = "UTF-8";

    /**
     * Protected constructor; instances are obtained through the static
     * factory methods of this class rather than by direct construction.
     */
    protected WarcReaderFactory() {
    }

    /**
     * Check head of <code>PushBackInputStream</code> for a WARC file
     * identifier. Delegates to {@link #isWarcRecord}.
     * The identifier for WARC files is "WARC/" in the beginning.
     * @param pbin <code>PushBackInputStream</code> with WARC records
     * @return boolean indicating presence of a WARC file identifier
     * @throws IOException if an I/O error occurs while examining head of stream
     */
    public static boolean isWarcFile(ByteCountingPushBackInputStream pbin) throws IOException {
        return isWarcRecord(pbin);
    }

    /**
     * Check head of <code>PushBackInputStream</code> for a WARC record
     * identifier.
     * The identifier for WARC records is "WARC/" in the beginning.
     * @param pbin <code>PushBackInputStream</code> with WARC records
     * @return boolean indicating presence of a WARC magic number
     * @throws IOException if an I/O error occurs while examining head of stream
     */
    public static boolean isWarcRecord(ByteCountingPushBackInputStream pbin) throws IOException {
        // Encode with a fixed charset; the no-argument getBytes() would use
        // whatever the platform default happens to be.
        byte[] warcBytes = WarcConstants.WARC_MAGIC_HEADER.getBytes(MAGIC_HEADER_CHARSET);
        // Size the buffer from the encoded bytes so both arrays always have
        // the same length when compared.
        byte[] streamBytes = new byte[warcBytes.length];
        // Look for the leading magic bytes in front of every valid WARC record.
        // The value returned by peek is not inspected; the array comparison
        // alone decides the result.
        pbin.peek(streamBytes);
        return Arrays.equals(warcBytes, streamBytes);
    }

    /**
     * Creates a new <code>WarcReader</code> from an <code>InputStream</code>
     * wrapped by a <code>BufferedInputStream</code>.
     * The <code>WarcReader</code> implementation returned is chosen based on
     * GZip auto detection.
     * @param in WARC File represented as <code>InputStream</code>
     * @param buffer_size buffer size to use
     * @return appropriate <code>WarcReader</code> based on data read from
     * <code>InputStream</code>
     * @throws IOException if an I/O exception occurs during initialization
     * @throws IllegalArgumentException if <code>in</code> is null or
     * <code>buffer_size</code> is less than or equal to zero
     */
    public static WarcReader getReader(InputStream in, int buffer_size)
            throws IOException {
        checkInputStream(in);
        checkBufferSize(buffer_size);
        ByteCountingPushBackInputStream pbin =
                new ByteCountingPushBackInputStream(
                        new BufferedInputStream(in, buffer_size),
                        PUSHBACK_BUFFER_SIZE);
        if (GzipReader.isGzipped(pbin)) {
            return new WarcReaderCompressed(new GzipReader(pbin),
                    buffer_size);
        }
        return new WarcReaderUncompressed(pbin);
    }

    /**
     * Creates a new <code>WarcReader</code> from an <code>InputStream</code>.
     * The <code>WarcReader</code> implementation returned is chosen based on
     * GZip auto detection.
     * @param in WARC File represented as <code>InputStream</code>
     * @return appropriate <code>WarcReader</code> based on data read from
     * <code>InputStream</code>
     * @throws IOException if an I/O exception occurs during initialization
     * @throws IllegalArgumentException if <code>in</code> is null
     */
    public static WarcReader getReader(InputStream in) throws IOException {
        checkInputStream(in);
        ByteCountingPushBackInputStream pbin =
                new ByteCountingPushBackInputStream(in, PUSHBACK_BUFFER_SIZE);
        if (GzipReader.isGzipped(pbin)) {
            return new WarcReaderCompressed(new GzipReader(pbin));
        }
        return new WarcReaderUncompressed(pbin);
    }

    /**
     * Creates a new <code>WarcReader</code> without any associated
     * <code>InputStream</code> for random access to uncompressed records.
     * @return <code>WarcReader</code> for uncompressed records, created
     * without an input stream
     */
    public static WarcReaderUncompressed getReaderUncompressed() {
        return new WarcReaderUncompressed();
    }

    /**
     * Creates a new <code>WarcReader</code> from an <code>InputStream</code>
     * primarily for random access to uncompressed records.
     * @param in WARC File represented as <code>InputStream</code>
     * @return <code>WarcReader</code> for uncompressed records read from
     * <code>InputStream</code>
     * @throws IOException I/O exception while initializing reader
     * @throws IllegalArgumentException if <code>in</code> is null
     */
    public static WarcReaderUncompressed getReaderUncompressed(InputStream in)
            throws IOException {
        checkInputStream(in);
        ByteCountingPushBackInputStream pbin =
                new ByteCountingPushBackInputStream(in, PUSHBACK_BUFFER_SIZE);
        return new WarcReaderUncompressed(pbin);
    }

    /**
     * Creates a new <code>WarcReader</code> from an <code>InputStream</code>
     * wrapped by a <code>BufferedInputStream</code> primarily for random
     * access to uncompressed records.
     * @param in WARC File represented as <code>InputStream</code>
     * @param buffer_size buffer size to use
     * @return <code>WarcReader</code> for uncompressed records read from
     * <code>InputStream</code>
     * @throws IOException I/O exception while initializing reader
     * @throws IllegalArgumentException if <code>in</code> is null or
     * <code>buffer_size</code> is less than or equal to zero
     */
    public static WarcReaderUncompressed getReaderUncompressed(InputStream in,
            int buffer_size) throws IOException {
        checkInputStream(in);
        checkBufferSize(buffer_size);
        ByteCountingPushBackInputStream pbin =
                new ByteCountingPushBackInputStream(
                        new BufferedInputStream(in, buffer_size),
                        PUSHBACK_BUFFER_SIZE);
        return new WarcReaderUncompressed(pbin);
    }

    /**
     * Creates a new <code>WarcReader</code> without any associated
     * <code>InputStream</code> for random access to GZip compressed records.
     * @return <code>WarcReader</code> for GZip compressed records, created
     * without an input stream
     */
    public static WarcReaderCompressed getReaderCompressed() {
        return new WarcReaderCompressed();
    }

    /**
     * Creates a new <code>WarcReader</code> from an <code>InputStream</code>
     * primarily for random access to GZip compressed records.
     * @param in WARC File represented as <code>InputStream</code>
     * @return <code>WarcReader</code> for GZip compressed records read from
     * <code>InputStream</code>
     * @throws IOException I/O exception while initializing reader
     * @throws IllegalArgumentException if <code>in</code> is null
     */
    public static WarcReaderCompressed getReaderCompressed(InputStream in)
            throws IOException {
        checkInputStream(in);
        return new WarcReaderCompressed(new GzipReader(in));
    }

    /**
     * Creates a new <code>WarcReader</code> from an <code>InputStream</code>
     * wrapped by a <code>BufferedInputStream</code> primarily for random
     * access to GZip compressed records.
     * @param in WARC File represented as <code>InputStream</code>
     * @param buffer_size buffer size to use
     * @return <code>WarcReader</code> for GZip compressed records read from
     * <code>InputStream</code>
     * @throws IOException I/O exception while initializing reader
     * @throws IllegalArgumentException if <code>in</code> is null or
     * <code>buffer_size</code> is less than or equal to zero
     */
    public static WarcReaderCompressed getReaderCompressed(InputStream in,
            int buffer_size) throws IOException {
        checkInputStream(in);
        checkBufferSize(buffer_size);
        // NOTE(review): getReader(InputStream, int) also hands buffer_size to
        // the WarcReaderCompressed constructor; here it is only used for the
        // BufferedInputStream. Confirm the difference is intentional.
        return new WarcReaderCompressed(new GzipReader(
                new BufferedInputStream(in, buffer_size)));
    }

    /**
     * Rejects a missing input stream.
     * @param in stream supplied by the caller of a factory method
     * @throws IllegalArgumentException if <code>in</code> is null
     */
    private static void checkInputStream(InputStream in) {
        if (in == null) {
            throw new IllegalArgumentException(
                    "The inputstream 'in' is null");
        }
    }

    /**
     * Rejects a buffer size that is zero or negative.
     * @param buffer_size buffer size supplied by the caller of a factory method
     * @throws IllegalArgumentException if <code>buffer_size</code> is less
     * than or equal to zero
     */
    private static void checkBufferSize(int buffer_size) {
        if (buffer_size <= 0) {
            throw new IllegalArgumentException(
                    "The 'buffer_size' is less than or equal to zero: " +
                    buffer_size);
        }
    }
}