org.jwat.arc.ArcReaderFactory Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of jwat-arc Show documentation
Show all versions of jwat-arc Show documentation
Used for reading, writing and validating ARC files.
Trying to follow the ad hoc standard as much as possible.
/**
* Java Web Archive Toolkit - Software to read and validate ARC, WARC
* and GZip files. (http://jwat.org/)
* Copyright 2011-2012 Netarkivet.dk (http://netarkivet.dk/)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.jwat.arc;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import org.jwat.common.ByteCountingPushBackInputStream;
import org.jwat.common.Scheme;
import org.jwat.gzip.GzipReader;
/**
 * Factory used for creating {@code ArcReader} instances.
 * The general {@code getReader} methods will auto-detect Gzip'ed data
 * and return the appropriate {@code ArcReader} instances.
 * The other factory methods can be used to return specific
 * {@code ArcReader} instances for compressed or uncompressed records.
 * Readers are available for both sequential and random reading of records.
 * Use of buffered methods and/or buffering speeds up the reader considerably.
 *
 * @author nicl
 */
public class ArcReaderFactory {

    /** Buffer size used by the {@code ByteCountingPushBackInputStream} wrappers created here. */
    public static final int PUSHBACK_BUFFER_SIZE = 32;

    /**
     * Number of bytes peeked from the head of the stream by
     * {@link #isArcRecord(ByteCountingPushBackInputStream)}.
     * NOTE(review): this equals {@link #PUSHBACK_BUFFER_SIZE}; presumably a
     * peek must not exceed the push-back capacity of the stream - confirm.
     */
    private static final int RECORD_PEEK_SIZE = 32;

    /**
     * Charset used to turn the ARC magic header into bytes. Every Java
     * platform is required to support US-ASCII, so the lookup cannot fail
     * in practice.
     */
    private static final String MAGIC_CHARSET = "US-ASCII";

    /**
     * Constructor hidden from general use; all factory methods are static.
     * It is declared {@code protected} rather than {@code private}, so
     * subclassing remains possible.
     */
    protected ArcReaderFactory() {
    }

    /**
     * Check head of a {@code ByteCountingPushBackInputStream} for an ARC file
     * identifier, i.e. the bytes of {@code ArcConstants.ARC_MAGIC_HEADER}
     * (documented as "filedesc:").
     * The bytes are only peeked, not consumed.
     * @param pbin push-back stream positioned at an ARC version block
     * @return boolean indicating presence of an ARC file identifier
     * @throws IOException if an i/o error occurs while examining head of stream
     */
    public static boolean isArcFile(ByteCountingPushBackInputStream pbin) throws IOException {
        // Encode with an explicit charset: the no-arg getBytes() uses the
        // platform default, which would make the comparison against raw
        // stream bytes platform dependent (e.g. on EBCDIC systems).
        byte[] arcBytes = ArcConstants.ARC_MAGIC_HEADER.getBytes(MAGIC_CHARSET);
        // Size the peek buffer from the encoded form so both arrays always
        // have the same length.
        byte[] streamBytes = new byte[arcBytes.length];
        // Look for an ARC file identifier in the beginning of the stream.
        pbin.peek(streamBytes);
        return Arrays.equals(arcBytes, streamBytes);
    }

    /**
     * Check head of a {@code ByteCountingPushBackInputStream} for an ARC
     * record identifier. The first bytes of the stream are peeked and handed
     * to {@code Scheme.startsWithScheme}, so a record is recognized by
     * starting with a valid scheme.
     * @param pbin push-back stream positioned at the start of a record
     * @return boolean indicating whether the stream starts with a valid scheme
     * @throws IOException if an i/o error occurs while examining head of stream
     */
    public static boolean isArcRecord(ByteCountingPushBackInputStream pbin) throws IOException {
        byte[] streamBytes = new byte[RECORD_PEEK_SIZE];
        // Look for a valid scheme in the beginning of the stream.
        pbin.peek(streamBytes);
        return Scheme.startsWithScheme(streamBytes);
    }

    /**
     * Creates a new {@code ArcReader} from an {@code InputStream}
     * wrapped by a {@code BufferedInputStream}.
     * The {@code ArcReader} implementation returned is chosen based on
     * GZip auto detection.
     * @param in ARC File represented as {@code InputStream}
     * @param buffer_size buffer size to use
     * @return appropriate {@code ArcReader} based on data read from
     * {@code InputStream}
     * @throws IOException if an i/o exception occurs during initialization
     * @throws IllegalArgumentException if {@code in} is null or
     * {@code buffer_size} is not positive
     */
    public static ArcReader getReader(InputStream in, int buffer_size)
            throws IOException {
        requireInputStream(in);
        requireBufferSize(buffer_size);
        ByteCountingPushBackInputStream pbin =
                pushBack(new BufferedInputStream(in, buffer_size));
        if (GzipReader.isGzipped(pbin)) {
            return new ArcReaderCompressed(new GzipReader(pbin), buffer_size);
        }
        return new ArcReaderUncompressed(pbin);
    }

    /**
     * Creates a new {@code ArcReader} from an {@code InputStream}.
     * The {@code ArcReader} implementation returned is chosen based on
     * GZip auto detection.
     * @param in ARC File represented as {@code InputStream}
     * @return appropriate {@code ArcReader} based on data read from
     * {@code InputStream}
     * @throws IOException if an i/o exception occurs during initialization
     * @throws IllegalArgumentException if {@code in} is null
     */
    public static ArcReader getReader(InputStream in) throws IOException {
        requireInputStream(in);
        ByteCountingPushBackInputStream pbin = pushBack(in);
        if (GzipReader.isGzipped(pbin)) {
            return new ArcReaderCompressed(new GzipReader(pbin));
        }
        return new ArcReaderUncompressed(pbin);
    }

    /**
     * Creates a new {@code ArcReader} without any associated
     * {@code InputStream} for random access to uncompressed records.
     * @return {@code ArcReader} for uncompressed records
     */
    public static ArcReaderUncompressed getReaderUncompressed() {
        return new ArcReaderUncompressed();
    }

    /**
     * Creates a new {@code ArcReader} from an {@code InputStream}
     * primarily for random access to uncompressed records.
     * @param in ARC File represented as {@code InputStream}
     * @return {@code ArcReader} for uncompressed records read from
     * {@code InputStream}
     * @throws IOException i/o exception while initializing reader
     * @throws IllegalArgumentException if {@code in} is null
     */
    public static ArcReaderUncompressed getReaderUncompressed(InputStream in)
            throws IOException {
        requireInputStream(in);
        return new ArcReaderUncompressed(pushBack(in));
    }

    /**
     * Creates a new {@code ArcReader} from an {@code InputStream}
     * wrapped by a {@code BufferedInputStream} primarily for random
     * access to uncompressed records.
     * @param in ARC File represented as {@code InputStream}
     * @param buffer_size buffer size to use
     * @return {@code ArcReader} for uncompressed records read from
     * {@code InputStream}
     * @throws IOException i/o exception while initializing reader
     * @throws IllegalArgumentException if {@code in} is null or
     * {@code buffer_size} is not positive
     */
    public static ArcReaderUncompressed getReaderUncompressed(InputStream in,
            int buffer_size) throws IOException {
        requireInputStream(in);
        requireBufferSize(buffer_size);
        return new ArcReaderUncompressed(
                pushBack(new BufferedInputStream(in, buffer_size)));
    }

    /**
     * Creates a new {@code ArcReader} without any associated
     * {@code InputStream} for random access to GZip compressed records.
     * @return {@code ArcReader} for GZip compressed records
     */
    public static ArcReaderCompressed getReaderCompressed() {
        return new ArcReaderCompressed();
    }

    /**
     * Creates a new {@code ArcReader} from an {@code InputStream}
     * primarily for random access to GZip compressed records.
     * The stream is handed to a {@code GzipReader} as-is, without an
     * additional push-back or buffering wrapper.
     * @param in ARC File represented as {@code InputStream}
     * @return {@code ArcReader} for GZip compressed records read from
     * {@code InputStream}
     * @throws IOException i/o exception while initializing reader
     * @throws IllegalArgumentException if {@code in} is null
     */
    public static ArcReaderCompressed getReaderCompressed(InputStream in)
            throws IOException {
        requireInputStream(in);
        return new ArcReaderCompressed(new GzipReader(in));
    }

    /**
     * Creates a new {@code ArcReader} from an {@code InputStream}
     * wrapped by a {@code BufferedInputStream} primarily for random
     * access to GZip compressed records.
     * @param in ARC File represented as {@code InputStream}
     * @param buffer_size buffer size to use
     * @return {@code ArcReader} for GZip compressed records read from
     * {@code InputStream}
     * @throws IOException i/o exception while initializing reader
     * @throws IllegalArgumentException if {@code in} is null or
     * {@code buffer_size} is not positive
     */
    public static ArcReaderCompressed getReaderCompressed(InputStream in,
            int buffer_size) throws IOException {
        requireInputStream(in);
        requireBufferSize(buffer_size);
        return new ArcReaderCompressed(new GzipReader(
                new BufferedInputStream(in, buffer_size)));
    }

    /** Rejects a null input stream with the factory's standard message. */
    private static void requireInputStream(InputStream in) {
        if (in == null) {
            throw new IllegalArgumentException(
                    "The inputstream 'in' is null");
        }
    }

    /** Rejects a zero or negative buffer size with the factory's standard message. */
    private static void requireBufferSize(int buffer_size) {
        if (buffer_size <= 0) {
            throw new IllegalArgumentException(
                    "The 'buffer_size' is less than or equal to zero: "
                    + buffer_size);
        }
    }

    /** Wraps a stream in a push-back stream with {@link #PUSHBACK_BUFFER_SIZE} capacity. */
    private static ByteCountingPushBackInputStream pushBack(InputStream in)
            throws IOException {
        return new ByteCountingPushBackInputStream(in, PUSHBACK_BUFFER_SIZE);
    }

}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy