/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.io.arc;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.io.UnsupportedEncodingException;
import java.util.Iterator;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.archive.io.ReplayInputStream;
import org.archive.io.WriterPoolMember;
import org.archive.io.WriterPoolSettings;
import org.archive.util.ArchiveUtils;
import org.archive.util.DevUtils;
import org.archive.util.MimetypeUtils;
/**
* Write ARC files.
*
* Assumption is that the caller is managing access to this ARCWriter ensuring
* only one thread of control accessing this ARC file instance at any one time.
*
* ARC files are described in the Internet Archive's "Arc File Format"
* document. This class writes version 1 of the ARC file format. It also
* writes version 1.1, which is version 1 with data stuffed into the body
* of the first ARC record in the file, the ARC file meta record itself.
*
* <p>An ARC file is three lines of meta data followed by an optional
* 'body', then a couple of '\n', and then: record, '\n', record, '\n',
* record, etc. If we are writing compressed ARC files, then each of the
* ARC file records is individually gzipped and concatenated together to
* make up a single ARC file. In GZIP terms, each ARC record is a GZIP
* member of a total gzipped file.
*
* <p>The GZIPping of the ARC file meta data is exceptional. It is GZIPped
* with an extra GZIP header, a special Internet Archive (IA) extra header
* field (i.e. FEXTRA is set in the GZIP header FLG field and an extra
* field is appended to the GZIP header). The extra field has little in
* it, but its presence denotes this GZIP as an Internet Archive gzipped
* ARC. See RFC 1952 to learn about the GZIP header structure.
*
* <p>This class then does its GZIPping in the following fashion. Each
* GZIP member is written with a new instance of GZIPOutputStream --
* actually ARCWriterGZIPOutputStream, so we can get access to the
* underlying stream. The underlying stream stays open across
* GZIPOutputStream instantiations. For the 'special' GZIPping of the ARC
* file meta data, we cheat by catching the GZIPOutputStream output in a
* byte array and adding the IA GZIP header to it before writing to the
* stream.
*
* <p>I tried writing a resettable GZIPOutputStream and could make it work
* with the Sun JDK, but the IBM JDK threw an NPE inside deflate.reset()
* -- its zlib native call doesn't seem to like the notion of resetting --
* so I gave up on it.
*
* <p>Because of issues such as the above, and troubles with
* GZIPInputStream, we should write our own GZIP*Streams, ones that are
* resettable and conscious of gzip members.
*
* <p>This class will write until we hit >= maxSize. The check is done at
* record boundaries. Records do not span ARC files. We will then close
* the current file, open another, and continue writing.
*
* <p>TESTING: Here is how to test that produced ARC files are good using
* the alexa ARC c-tools:
*
* <pre>
* % av_procarc hx20040109230030-0.arc.gz | av_ziparc > \
*     /tmp/hx20040109230030-0.dat.gz
* % av_ripdat /tmp/hx20040109230030-0.dat.gz > /tmp/hx20040109230030-0.cdx
* </pre>
*
* Examine the produced cdx file to make sure it makes sense. Search for
* 'no-type 0'. If found, then we're opening a gzip record without data to
* write. This is bad.
*
* <p>You can also do <code>gzip -t FILENAME</code> and it will tell you
* if the ARC makes sense to GZIP.
*
* <p>While being written, ARCs have a '.open' suffix appended.
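*
* <p>A minimal usage sketch (a hypothetical example, not from the
* original sources; assumes a WriterPoolSettings instance
* <code>settings</code> is available from the surrounding configuration
* and that <code>content</code> is the record body as a byte array):
*
* <pre>{@code
* AtomicInteger serialNo = new AtomicInteger();
* ARCWriter writer = new ARCWriter(serialNo, settings);
* try {
*     writer.write("http://example.com/", "text/html", "192.0.2.1",
*         System.currentTimeMillis(), content.length,
*         new ByteArrayInputStream(content));
* } finally {
*     writer.close();
* }
* }</pre>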
*
* @author stack
*/
public class ARCWriter extends WriterPoolMember implements ARCConstants, Closeable {
private static final Logger logger =
Logger.getLogger(ARCWriter.class.getName());
/**
* Metadata line pattern.
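*
* <p>Five space-separated fields. For example, a line like the following
* (hypothetical values) matches:
* <pre>http://example.com/ 192.0.2.1 20040110013326 text/html 1234</pre>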
*/
private static final Pattern METADATA_LINE_PATTERN =
Pattern.compile("^\\S+ \\S+ \\S+ \\S+ \\S+(" + LINE_SEPARATOR + "?)$");
/**
* Constructor.
* Takes a stream. Use with caution. There is no upper-bound check on
* size. Will just keep writing.
*
* @param serialNo used to generate unique file name sequences
* @param out Where to write.
* @param arc File the <code>out</code> is connected to.
* @param settings all creation parameters
* @throws IOException
*/
public ARCWriter(final AtomicInteger serialNo, final PrintStream out,
final File arc, final WriterPoolSettings settings)
throws IOException {
super(serialNo, out, arc, settings);
writeFirstRecord(ArchiveUtils.get14DigitDate());
}
/**
* Constructor.
*
* @param serialNo used to generate unique file name sequences
* @param settings all creation parameters
*/
public ARCWriter(final AtomicInteger serialNo, final WriterPoolSettings settings) {
super(serialNo, settings, ARC_FILE_EXTENSION);
}
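/**
* Create a new ARC file and write the ARC file meta record as its
* first record.
*/
@Override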
protected String createFile()
throws IOException {
String name = super.createFile();
writeFirstRecord(currentTimestamp);
return name;
}
private void writeFirstRecord(final String ts)
throws IOException {
write(generateARCFileMetaData(ts));
}
/**
* Write out the ARC meta data.
*
* <p>Generate ARC file meta data. Currently we only do version 1 of the
* ARC file format, or version 1.1 when metadata has been supplied (we
* write it into the body of the first record in the ARC file).
*
* <p>Version 1 metadata looks roughly like this:
*
* <pre>filedesc://testWriteRecord-JunitIAH20040110013326-2.arc 0.0.0.0 \\
* 20040110013326 text/plain 77
* 1 0 InternetArchive
* URL IP-address Archive-date Content-type Archive-length</pre>
*
* <p>If compress is set, then we generate a header that has been gzipped
* in the Internet Archive manner. Such gzipping enables the FEXTRA
* flag in the FLG field of the gzip header. It then appends an extra
* header field: '8', '0', 'L', 'X', '0', '0', '0', '0'. The first two
* bytes are the length of the field and the last six bytes the Internet
* Archive header. To learn about the GZIP format, see RFC 1952. To learn
* about the Internet Archive extra header field, read the source for
* av_ziparc, which can be found at
* <code>alexa/vista/alexa-tools-1.2/src/av_ziparc.cc</code>.
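*
* <p>A sketch of the resulting gzip member layout (derived from the
* manipulations in the method body; offsets are zero-based):
*
* <pre>
* bytes 0-9 : ID1 ID2 CM FLG MTIME(4) XFL OS  (FLG=4 i.e. FEXTRA; OS=3 i.e. Unix)
* bytes 10+ : the 8-byte IA extra field (2-byte length, then 'L', 'X', ...)
* then      : the rest of the gzipped meta record (deflate data, CRC32, ISIZE)
* </pre>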
*
* <p>We do things in this roundabout manner because the java
* GZIPOutputStream does not give access to GZIP header fields.
*
* @param date Date to put into the ARC metadata; if 17 digits long, it
* will be truncated to the traditional 14 digits.
*
* @return Byte array filled with the ARC header.
* @throws IOException
*/
private byte [] generateARCFileMetaData(String date)
throws IOException {
if (date != null && date.length() > 14) {
date = date.substring(0, 14);
}
int metadataBodyLength = getMetadataLength();
// If there is a metadata body, the minor part of the version is '1'
// rather than '0'.
String metadataHeaderLinesTwoAndThree =
getMetadataHeaderLinesTwoAndThree("1 " +
((metadataBodyLength > 0)? "1": "0"));
int recordLength = metadataBodyLength +
metadataHeaderLinesTwoAndThree.getBytes(DEFAULT_ENCODING).length;
String metadataHeaderStr = ARC_MAGIC_NUMBER + getBaseFilename() +
" 0.0.0.0 " + date + " text/plain " + recordLength +
metadataHeaderLinesTwoAndThree;
ByteArrayOutputStream metabaos =
new ByteArrayOutputStream(recordLength);
// Write the metadata header.
metabaos.write(metadataHeaderStr.getBytes(DEFAULT_ENCODING));
// Write the metadata body, if anything to write.
if (metadataBodyLength > 0) {
writeMetaData(metabaos);
}
// Write out a LINE_SEPARATOR to end this record.
metabaos.write(LINE_SEPARATOR);
// Now get bytes of all just written and compress if flag set.
byte [] bytes = metabaos.toByteArray();
if (isCompressed()) {
// GZIP the header but catch the gzipping into a byte array so we
// can add the special IA GZIP header to the product. After
// manipulations, write to the output stream (the Java GZIP
// implementation does not give access to the GZIP header; it
// produces a 'default' header only). We can get away with these
// manipulations because the GZIP 'default' header doesn't
// do the 'optional' CRC'ing of the header.
byte [] gzippedMetaData = ArchiveUtils.gzip(bytes);
if (gzippedMetaData[3] != 0) {
throw new IOException("The GZIP FLG header is unexpectedly " +
"non-zero. Need to add smarter code that can deal " +
"with already-extant extra GZIP header fields.");
}
// Set the GZIP FLG header to '4', which says that the GZIP header
// has extra fields. Then insert the alexa {'L', 'X', '0', '0', '0',
// '0'} 'extra' field. The IA GZIP header will also set byte
// 9 (zero-based), the OS byte, to 3 (Unix). We'll do the same.
gzippedMetaData[3] = 4;
gzippedMetaData[9] = 3;
byte [] assemblyBuffer = new byte[gzippedMetaData.length +
ARC_GZIP_EXTRA_FIELD.length];
// '10' in the below is a pointer past the following bytes of the
// GZIP header: ID1 ID2 CM FLG + MTIME(4-bytes) XFL OS. See
// RFC 1952 for an explanation of the abbreviations just used.
System.arraycopy(gzippedMetaData, 0, assemblyBuffer, 0, 10);
System.arraycopy(ARC_GZIP_EXTRA_FIELD, 0, assemblyBuffer, 10,
ARC_GZIP_EXTRA_FIELD.length);
System.arraycopy(gzippedMetaData, 10, assemblyBuffer,
10 + ARC_GZIP_EXTRA_FIELD.length, gzippedMetaData.length - 10);
bytes = assemblyBuffer;
}
return bytes;
}
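/**
* @param version Version string for the second header line, e.g.
* "1 0" (no metadata body) or "1 1" (metadata body present).
* @return Lines two and three of the ARC file meta record: the version
* line followed by the field-names line.
*/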
public String getMetadataHeaderLinesTwoAndThree(String version) {
StringBuilder buffer = new StringBuilder();
buffer.append(LINE_SEPARATOR);
buffer.append(version);
buffer.append(" InternetArchive");
buffer.append(LINE_SEPARATOR);
buffer.append("URL IP-address Archive-date Content-type Archive-length");
buffer.append(LINE_SEPARATOR);
return buffer.toString();
}
/**
* Write all metadata to the passed <code>baos</code>.
*
* @param baos Stream to write to.
* @throws UnsupportedEncodingException
* @throws IOException
*/
private void writeMetaData(ByteArrayOutputStream baos)
throws UnsupportedEncodingException, IOException {
if (settings.getMetadata() == null) {
return;
}
for (Iterator i = settings.getMetadata().iterator();
i.hasNext();) {
Object obj = i.next();
if (obj instanceof String) {
baos.write(((String)obj).getBytes(DEFAULT_ENCODING));
} else if (obj instanceof File) {
InputStream is = null;
try {
is = new BufferedInputStream(
new FileInputStream((File)obj));
byte [] buffer = new byte[4096];
for (int read = -1; (read = is.read(buffer)) != -1;) {
baos.write(buffer, 0, read);
}
} finally {
if (is != null) {
is.close();
}
}
} else if (obj != null) {
logger.severe("Unsupported metadata type: " + obj);
}
}
}
/**
* @return Total length of metadata.
* @throws UnsupportedEncodingException
*/
private int getMetadataLength()
throws UnsupportedEncodingException {
// Start from zero so a null or empty metadata list yields length 0;
// the sum below is then the exact byte length of the metadata body.
int result = 0;
if (settings.getMetadata() != null) {
for (Iterator i = settings.getMetadata().iterator();
i.hasNext();) {
Object obj = i.next();
if (obj instanceof String) {
result += ((String)obj).getBytes(DEFAULT_ENCODING).length;
} else if (obj instanceof File) {
result += ((File)obj).length();
} else {
logger.severe("Unsupported metadata type: " + obj);
}
}
}
return result;
}
/**
* @deprecated use input-stream version directly instead
*/
public void write(String uri, String contentType, String hostIP,
long fetchBeginTimeStamp, long recordLength,
ByteArrayOutputStream baos)
throws IOException {
write(uri, contentType, hostIP, fetchBeginTimeStamp, recordLength,
new ByteArrayInputStream(baos.toByteArray()), false);
}
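/**
* Write a record, reading its content from the passed stream and
* enforcing the declared <code>recordLength</code> against the content
* actually read.
*/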
public void write(String uri, String contentType, String hostIP,
long fetchBeginTimeStamp, long recordLength, InputStream in)
throws IOException {
write(uri, contentType, hostIP, fetchBeginTimeStamp, recordLength,
in, true);
}
/**
* Write a record with the given metadata/content.
*
* @param uri
* URI for metadata-line
* @param contentType
* MIME content-type for metadata-line
* @param hostIP
* IP for metadata-line
* @param fetchBeginTimeStamp
* timestamp for metadata-line
* @param recordLength
* length for metadata-line; also may be enforced
* @param in
* source InputStream for record content
* @param enforceLength
* whether to enforce the declared length; should be true
* unless intentionally writing bad records for testing
* @throws IOException
*/
public void write(String uri, String contentType, String hostIP,
long fetchBeginTimeStamp, long recordLength, InputStream in,
boolean enforceLength) throws IOException {
preWriteRecordTasks();
try {
write(getMetaLine(uri, contentType, hostIP, fetchBeginTimeStamp,
recordLength).getBytes(UTF8));
copyFrom(in, recordLength, enforceLength);
if (in instanceof ReplayInputStream) {
// check for consumption of entire recorded material
long remaining = ((ReplayInputStream) in).remaining();
// Should be zero at this stage. If not, something is
// wrong.
if (remaining != 0) {
String message = "Gap between expected and actual: "
+ remaining + LINE_SEPARATOR + DevUtils.extraInfo()
+ " writing arc "
+ this.getFile().getAbsolutePath();
DevUtils.warnHandle(new Throwable(message), message);
throw new IOException(message);
}
}
write(LINE_SEPARATOR);
} finally {
postWriteRecordTasks();
}
}
/**
* @param uri URI for the metadata line
* @param contentType MIME content-type for the metadata line
* @param hostIP IP for the metadata line
* @param fetchBeginTimeStamp timestamp for the metadata line
* @param recordLength length for the metadata line
* @return Metadata line for an ARCRecord made of the passed components.
* @exception IOException
*/
protected String getMetaLine(String uri, String contentType, String hostIP,
long fetchBeginTimeStamp, long recordLength)
throws IOException {
if (fetchBeginTimeStamp <= 0) {
throw new IOException("Bogus fetchBeginTimestamp: " +
Long.toString(fetchBeginTimeStamp));
}
return validateMetaLine(createMetaline(uri, hostIP,
ArchiveUtils.get14DigitDate(fetchBeginTimeStamp),
MimetypeUtils.truncate(contentType),
Long.toString(recordLength)));
}
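/**
* Concatenate the passed fields, separated by HEADER_FIELD_SEPARATOR,
* into a metadata line terminated by LINE_SEPARATOR. For example
* (hypothetical values):
* <pre>http://example.com/ 192.0.2.1 20040110013326 text/html 1234</pre>
*/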
public String createMetaline(String uri, String hostIP,
String timeStamp, String mimetype, String recordLength) {
return uri + HEADER_FIELD_SEPARATOR + hostIP +
HEADER_FIELD_SEPARATOR + timeStamp +
HEADER_FIELD_SEPARATOR + mimetype +
HEADER_FIELD_SEPARATOR + recordLength + LINE_SEPARATOR;
}
/**
* Test that the metadata line is valid before writing.
* @param metaLineStr
* @throws IOException
* @return The passed in metaline.
*/
protected String validateMetaLine(String metaLineStr)
throws IOException {
if (metaLineStr.length() > MAX_METADATA_LINE_LENGTH) {
throw new IOException("Metadata line too long ("
+ metaLineStr.length() + ">" + MAX_METADATA_LINE_LENGTH
+ "): " + metaLineStr);
}
Matcher m = METADATA_LINE_PATTERN.matcher(metaLineStr);
if (!m.matches()) {
throw new IOException("Metadata line doesn't match expected" +
" pattern: " + metaLineStr);
}
return metaLineStr;
}
}