/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.io.arc;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Iterator;
import java.util.logging.Level;
import org.archive.io.ArchiveReader;
import org.archive.io.ArchiveReaderFactory;
import org.archive.io.ArchiveRecord;
import org.archive.io.ArchiveRecordHeader;
import org.archive.util.FileUtils;
import org.archive.util.zip.GZIPMembersInputStream;
import org.archive.util.zip.GzipHeader;
import org.archive.util.zip.NoGzipMagicException;
import com.google.common.io.CountingInputStream;
/**
* Factory that returns an ARCReader.
*
* Can handle compressed and uncompressed ARCs.
*
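 * <p>A minimal usage sketch (the file name is hypothetical):
 * <pre>{@code
 * ARCReader reader = ARCReaderFactory.get(new File("sample.arc.gz"));
 * try {
 *     for (ArchiveRecord record : reader) {
 *         System.out.println(record.getHeader().getUrl());
 *     }
 * } finally {
 *     reader.close();
 * }
 * }</pre>
 *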
* @author stack
*/
public class ARCReaderFactory extends ArchiveReaderFactory
implements ARCConstants {
/**
* This factory instance.
*/
private static final ARCReaderFactory factory = new ARCReaderFactory();
/**
     * Shut down any access to the default constructor.
*/
protected ARCReaderFactory() {
super();
}
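    /**
     * Get an ARCReader for the passed file path or URL.
     * <p>A minimal sketch (locations are hypothetical):
     * <pre>{@code
     * ARCReader local = ARCReaderFactory.get("/tmp/sample.arc.gz");
     * ARCReader remote = ARCReaderFactory.get("http://example.org/sample.arc.gz");
     * }</pre>
     * @param arcFileOrUrl Path or URL pointing at an ARC.
     * @return An ARCReader.
     * @throws MalformedURLException
     * @throws IOException
     */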
public static ARCReader get(String arcFileOrUrl)
throws MalformedURLException, IOException {
return (ARCReader)ARCReaderFactory.factory.
getArchiveReader(arcFileOrUrl);
}
public static ARCReader get(String arcFileOrUrl, final long offset)
throws MalformedURLException, IOException {
return (ARCReader)ARCReaderFactory.factory.
getArchiveReader(arcFileOrUrl, offset);
}
public static ARCReader get(final File f) throws IOException {
return (ARCReader)ARCReaderFactory.factory.getArchiveReader(f);
}
public static ARCReader get(final File f, final long offset)
throws IOException {
return (ARCReader)ARCReaderFactory.factory.getArchiveReader(f, offset);
}
protected ArchiveReader getArchiveReader(final File f, final long offset)
throws IOException {
return getArchiveReader(f, true, offset);
}
/**
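     * Get an ARCReader, optionally skipping the file-suffix test.
     * <p>For example, a sketch of opening a still-being-written ARC (file
     * name is hypothetical), passing true to skip the suffix test:
     * <pre>{@code
     * ARCReader reader = ARCReaderFactory.get(
     *     new File("crawl-in-progress.arc.gz.open"), true, 0);
     * }</pre>
     *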
* @param f An arcfile to read.
     * @param skipSuffixTest Set to true to skip testing that the ARC has the
     * expected suffix. Pass <code>true</code> to open ARCs with the
     * <code>.open</code> or another non-standard suffix.
* @param offset Have returned ARCReader set to start reading at passed
* offset.
* @return An ARCReader.
* @throws IOException
*/
public static ARCReader get(final File f,
final boolean skipSuffixTest, final long offset)
throws IOException {
return (ARCReader)ARCReaderFactory.factory.getArchiveReader(f,
skipSuffixTest, offset);
}
protected ArchiveReader getArchiveReader(final File arcFile,
final boolean skipSuffixTest, final long offset)
throws IOException {
boolean compressed = testCompressedARCFile(arcFile, skipSuffixTest);
if (!compressed) {
if (!FileUtils.isReadableWithExtensionAndMagic(arcFile,
ARC_FILE_EXTENSION, ARC_MAGIC_NUMBER)) {
throw new IOException(arcFile.getAbsolutePath() +
" is not an Internet Archive ARC file.");
}
}
return compressed?
(ARCReader)ARCReaderFactory.factory.
new CompressedARCReader(arcFile, offset):
(ARCReader)ARCReaderFactory.factory.
new UncompressedARCReader(arcFile, offset);
}
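    /**
     * Get an ArchiveReader on the passed stream.
     * <p>A minimal sketch of reading from an already-open stream (names are
     * hypothetical):
     * <pre>{@code
     * InputStream in = new FileInputStream("sample.arc.gz");
     * ArchiveReader reader = ARCReaderFactory.get("sample.arc.gz", in, true);
     * try {
     *     for (ArchiveRecord record : reader) {
     *         // process record
     *     }
     * } finally {
     *     reader.close();
     * }
     * }</pre>
     * @param s Identifier for the ARC (used as the reader's name).
     * @param is InputStream positioned at the start of ARC data.
     * @param atFirstRecord Set to true if the stream is aligned on the first record.
     * @return An ArchiveReader.
     * @throws IOException
     */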
public static ArchiveReader get(final String s, final InputStream is,
final boolean atFirstRecord)
throws IOException {
return ARCReaderFactory.factory.getArchiveReader(s, is,
atFirstRecord);
}
protected ArchiveReader getArchiveReader(final String arc,
final InputStream is, final boolean atFirstRecord)
throws IOException {
        // We mark() and reset() here, wrapping in a BufferedInputStream if
        // necessary, because testCompressedARCStream() consumes some bytes
        // from the input stream.
InputStream possiblyWrapped;
if (is.markSupported()) {
possiblyWrapped = is;
} else {
possiblyWrapped = new BufferedInputStream(is);
}
possiblyWrapped.mark(100);
boolean compressed = testCompressedARCStream(possiblyWrapped);
possiblyWrapped.reset();
if (compressed) {
return new CompressedARCReader(arc, possiblyWrapped, atFirstRecord);
} else {
return new UncompressedARCReader(arc, possiblyWrapped, atFirstRecord);
}
}
/**
     * Get an ARCReader aligned at <code>offset</code>. This version of get
     * will not bring the ARC local but will try to stream across the net,
     * making an HTTP/1.1 Range request of the remote server
     * (RFC 2616 Section 14.35).
*
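     * <p>A sketch of reading records starting at a remote offset (URL and
     * offset are hypothetical):
     * <pre>{@code
     * ARCReader reader = ARCReaderFactory.get(
     *     new URL("http://example.org/sample.arc.gz"), 1234L);
     * try {
     *     for (ArchiveRecord record : reader) {
     *         // process records from the passed offset onward
     *     }
     * } finally {
     *     reader.close();
     * }
     * }</pre>
     *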
* @param arcUrl HTTP URL for an ARC (All ARCs considered remote).
* @param offset Offset into ARC at which to start fetching.
* @return An ARCReader aligned at offset.
* @throws IOException
*/
public static ARCReader get(final URL arcUrl, final long offset)
throws IOException {
return (ARCReader)ARCReaderFactory.factory.getArchiveReader(arcUrl,
offset);
}
/**
     * Get an ARCReader.
     * Pulls the ARC local into wherever the System property
     * <code>java.io.tmpdir</code> points. It then hands back an ARCReader
     * that points at this local copy. A close on this ARCReader instance
     * will remove the local copy.
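     * <p>For example (URL is hypothetical); closing the reader removes the
     * temporary local copy:
     * <pre>{@code
     * ARCReader reader = ARCReaderFactory.get(new URL("http://example.org/sample.arc.gz"));
     * try {
     *     for (ArchiveRecord record : reader) {
     *         // process record
     *     }
     * } finally {
     *     reader.close(); // also deletes the copy made under java.io.tmpdir
     * }
     * }</pre>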
* @param arcUrl An URL that points at an ARC.
* @return An ARCReader.
* @throws IOException
*/
public static ARCReader get(final URL arcUrl)
throws IOException {
return (ARCReader)ARCReaderFactory.factory.getArchiveReader(arcUrl);
}
/**
* @param arcFile File to test.
     * @return True if <code>arcFile</code> is a compressed ARC.
* @throws IOException
*/
public boolean isCompressed(File arcFile) throws IOException {
return testCompressedARCFile(arcFile);
}
/**
* Check file is compressed and in ARC GZIP format.
*
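     * <p>For example (file name is hypothetical):
     * <pre>{@code
     * boolean gzipped = ARCReaderFactory.testCompressedARCFile(new File("sample.arc.gz"));
     * }</pre>
     *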
     * @param arcFile File to test whether it is a GZIP-compressed
     * Internet Archive ARC file.
*
* @return True if this is an Internet Archive GZIP'd ARC file (It begins
* w/ the Internet Archive GZIP header and has the
* COMPRESSED_ARC_FILE_EXTENSION suffix).
*
     * @exception IOException If file does not exist or is not readable.
*/
public static boolean testCompressedARCFile(File arcFile)
throws IOException {
return testCompressedARCFile(arcFile, false);
}
/**
* Check file is compressed and in ARC GZIP format.
*
     * @param arcFile File to test whether it is a GZIP-compressed
     * Internet Archive ARC file.
* @param skipSuffixCheck Set to true if we're not to test on the
* '.arc.gz' suffix.
*
* @return True if this is an Internet Archive GZIP'd ARC file (It begins
* w/ the Internet Archive GZIP header).
*
     * @exception IOException If file does not exist or is not readable.
*/
public static boolean testCompressedARCFile(File arcFile,
boolean skipSuffixCheck)
throws IOException {
boolean compressedARCFile = false;
FileUtils.assertReadable(arcFile);
if(!skipSuffixCheck && !arcFile.getName().toLowerCase()
.endsWith(COMPRESSED_ARC_FILE_EXTENSION)) {
return compressedARCFile;
}
final InputStream is = new FileInputStream(arcFile);
try {
compressedARCFile = testCompressedARCStream(is);
} finally {
is.close();
}
return compressedARCFile;
}
    public static boolean isARCSuffix(final String arcName) {
        if (arcName == null) {
            return false;
        }
        final String lowered = arcName.toLowerCase();
        return lowered.endsWith(DOT_COMPRESSED_ARC_FILE_EXTENSION)
            || lowered.endsWith(DOT_ARC_FILE_EXTENSION);
    }
/**
     * Tests whether the passed stream is a gzip stream by reading its header.
     * Does not reposition the stream; that is left to the caller.
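     * <p>A sketch of typical use with mark/reset, since this call consumes
     * header bytes from the stream (file name is hypothetical):
     * <pre>{@code
     * InputStream is = new BufferedInputStream(new FileInputStream("sample.arc.gz"));
     * is.mark(100);
     * boolean compressed = ARCReaderFactory.testCompressedARCStream(is);
     * is.reset();
     * }</pre>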
* @param is An InputStream.
* @return True if compressed stream.
* @throws IOException
*/
public static boolean testCompressedARCStream(final InputStream is)
throws IOException {
boolean compressedARCFile = false;
GzipHeader gh = null;
try {
gh = new GzipHeader(is);
} catch (NoGzipMagicException e) {
return false;
}
byte[] fextra = gh.getFextra();
        // Now make sure the following bytes are the IA GZIP extra field.
        // First check length: ARC_GZIP_EXTRA_FIELD includes its length, so
        // subtract two and start the comparison at ARC_GZIP_EXTRA_FIELD
        // offset +2.
        // Some Alexa ARC files' gzip extra fields have changed slightly
        // after the first two bytes, so we just look for the 'LX' extension
        // to identify valid IA ARC files.
if (fextra != null) {
if (fextra.length >= ARC_GZIP_EXTRA_FIELD.length - 2) {
if (fextra[0] == ARC_GZIP_EXTRA_FIELD[2] &&
fextra[1] == ARC_GZIP_EXTRA_FIELD[3]) {
compressedARCFile = true;
}
}
} else {
// Some old arcs don't have an extra header at all, but they're still compressed
compressedARCFile = true;
}
return compressedARCFile;
}
/**
* Uncompressed arc file reader.
* @author stack
*/
public class UncompressedARCReader extends ARCReader {
/**
* Constructor.
* @param f Uncompressed arcfile to read.
* @throws IOException
*/
public UncompressedARCReader(final File f)
throws IOException {
this(f, 0);
}
/**
* Constructor.
*
* @param f Uncompressed arcfile to read.
* @param offset Offset at which to position ARCReader.
* @throws IOException
*/
public UncompressedARCReader(final File f, final long offset)
throws IOException {
            // The ARC file has been tested for existence by the time
            // execution gets here.
setIn(new CountingInputStream(getInputStream(f, offset)));
getIn().skip(offset);
initialize(f.getAbsolutePath());
}
/**
* Constructor.
*
* @param f Uncompressed arc to read.
* @param is InputStream.
*/
public UncompressedARCReader(final String f, final InputStream is, boolean atFirstRecord) {
            // The ARC file has been tested for existence by the time
            // execution gets here.
setIn(new CountingInputStream(is));
setAlignedOnFirstRecord(atFirstRecord);
initialize(f);
}
}
/**
* Compressed arc file reader.
*
* @author stack
*/
public class CompressedARCReader extends ARCReader {
/**
* Constructor.
*
* @param f
* Compressed arcfile to read.
* @throws IOException
*/
public CompressedARCReader(final File f) throws IOException {
this(f, 0);
}
/**
* Constructor.
*
* @param f Compressed arcfile to read.
* @param offset Position at where to start reading file.
* @throws IOException
*/
public CompressedARCReader(final File f, final long offset)
throws IOException {
            // The ARC file has been tested for existence by the time
            // execution gets here.
setIn(new GZIPMembersInputStream(getInputStream(f, offset)));
((GZIPMembersInputStream)getIn()).compressedSeek(offset);
setCompressed((offset == 0)); // TODO: does this make sense???
initialize(f.getAbsolutePath());
}
/**
* Constructor.
*
* @param f Compressed arcfile.
* @param is InputStream to use.
* @throws IOException
*/
public CompressedARCReader(final String f, final InputStream is,
final boolean atFirstRecord)
throws IOException {
            // The ARC file has been tested for existence by the time
            // execution gets here.
setIn(new GZIPMembersInputStream(is));
setCompressed(true);
setAlignedOnFirstRecord(atFirstRecord);
initialize(f);
}
        /**
         * Get record at passed <code>offset</code>.
*
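         * <p>A sketch of random access into a compressed ARC (file name and
         * offset are hypothetical):
         * <pre>{@code
         * CompressedARCReader reader = (CompressedARCReader)
         *     ARCReaderFactory.get(new File("sample.arc.gz"));
         * ARCRecord record = reader.get(76543L);
         * }</pre>
         *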
* @param offset
* Byte index into arcfile at which a record starts.
* @return An ARCRecord reference.
* @throws IOException
*/
public ARCRecord get(long offset) throws IOException {
cleanupCurrentRecord();
((GZIPMembersInputStream)getIn()).compressedSeek(offset);
return createArchiveRecord(getIn(), offset);
}
        public Iterator<ArchiveRecord> iterator() {
            /**
             * Override ArchiveRecordIterator so the returned iterator can be
             * based on the GZIPMembersInputStream member iterator.
             */
            return new ArchiveRecordIterator() {
                private GZIPMembersInputStream gis =
                    (GZIPMembersInputStream)getIn();
                private Iterator<GZIPMembersInputStream> gzipIterator =
                    this.gis.memberIterator();
protected boolean innerHasNext() {
return this.gzipIterator.hasNext();
}
protected ArchiveRecord innerNext() throws IOException {
InputStream is = this.gzipIterator.next();
return createArchiveRecord(is, Math.max(gis.getCurrentMemberStart(), gis.getCurrentMemberEnd()));
}
};
}
protected void gotoEOR(ArchiveRecord rec) throws IOException {
int c;
while ((c = getIn().read())==LINE_SEPARATOR);
if(c==-1) {
return;
}
long skipped = 1;
while (getIn().read()>-1) {
skipped++;
}
            // Report to standard error the number of unexpected characters
            // at the end of this record.
ArchiveRecordHeader meta = (getCurrentRecord() != null)?
rec.getHeader(): null;
String message = "Record STARTING at " +
((GZIPMembersInputStream)getIn()).getCurrentMemberStart() +
" has " + skipped + " trailing byte(s): " +
((meta != null)? meta.toString(): "");
if (isStrict()) {
throw new IOException(message);
}
logStdErr(Level.WARNING, message);
}
}
}