/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.io.arc;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.io.UnsupportedEncodingException;
import java.util.Iterator;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.archive.io.ReplayInputStream;
import org.archive.io.WriterPoolMember;
import org.archive.io.WriterPoolSettings;
import org.archive.util.ArchiveUtils;
import org.archive.util.DevUtils;
import org.archive.util.MimetypeUtils;
/**
* Write ARC files.
*
* Assumption is that the caller is managing access to this ARCWriter ensuring
* only one thread of control accessing this ARC file instance at any one time.
*
* ARC files are described in the Internet Archive's "Arc File Format"
* document. This class writes version 1 of the ARC file format. It also
* writes version 1.1, which is version 1 with data stuffed into the body
* of the first ARC record in the file, the ARC file meta record itself.
*
* <p>An ARC file is three lines of meta data followed by an optional
* 'body', then a couple of '\n', and then: record, '\n', record, '\n',
* record, etc. If we are writing compressed ARC files, then each of the
* ARC file records is individually gzipped and concatenated together to
* make up a single ARC file. In GZIP terms, each ARC record is a GZIP
* member of a total gzipped file.
*
* <p>The GZIPping of the ARC file meta data is exceptional. It is GZIPped
* with an extra GZIP header, a special Internet Archive (IA) extra header
* field (i.e. FEXTRA is set in the GZIP header FLG field and an extra
* field is appended to the GZIP header). The extra field has little in
* it, but its presence denotes this GZIP as an Internet Archive gzipped
* ARC. See RFC 1952 to learn about the GZIP header structure.
*
* <p>This class then does its GZIPping in the following fashion. Each
* GZIP member is written with a new instance of GZIPOutputStream --
* actually ARCWriterGZIPOutputStream, so we can get access to the
* underlying stream. The underlying stream stays open across
* GZIPOutputStream instantiations. For the 'special' GZIPping of the ARC
* file meta data, we cheat by catching the GZIPOutputStream output in a
* byte array and adding the IA GZIP header to it before writing to the
* stream.
*
* <p>I tried writing a resettable GZIPOutputStream and could make it work
* with the Sun JDK, but the IBM JDK threw an NPE inside deflate.reset()
* -- its zlib native call doesn't seem to like the notion of resetting --
* so I gave up on it.
*
* <p>Because of issues such as the above, and troubles with
* GZIPInputStream, we should write our own GZIP*Streams, ones that are
* resettable and conscious of gzip members.
*
* <p>This class will write until we hit >= maxSize. The check is done at
* record boundaries. Records do not span ARC files. We will then close
* the current file, open another, and continue writing.
*
* <p>TESTING: Here is how to test that produced ARC files are good using
* the alexa ARC c-tools:
*
* <pre>
* % av_procarc hx20040109230030-0.arc.gz | av_ziparc > \
*     /tmp/hx20040109230030-0.dat.gz
* % av_ripdat /tmp/hx20040109230030-0.dat.gz > /tmp/hx20040109230030-0.cdx
* </pre>
*
* Examine the produced cdx file to make sure it makes sense. Search for
* 'no-type 0'. If found, then we're opening a gzip record without data to
* write. This is bad.
*
* <p>You can also do <code>gzip -t FILENAME</code> and it will tell you
* if the ARC makes sense to GZIP.
*
* <p>While being written, ARCs have a '.open' suffix appended.
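*
* <p>A minimal usage sketch (a hypothetical example, not from the
* original sources; assumes a WriterPoolSettings instance
* <code>settings</code> is available from the surrounding configuration
* and that <code>content</code> is the record body as a byte array):
*
* <pre>{@code
* AtomicInteger serialNo = new AtomicInteger();
* ARCWriter writer = new ARCWriter(serialNo, settings);
* try {
*     writer.write("http://example.com/", "text/html", "192.0.2.1",
*         System.currentTimeMillis(), content.length,
*         new ByteArrayInputStream(content));
* } finally {
*     writer.close();
* }
* }</pre>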
*
* @author stack
*/
public class ARCWriter extends WriterPoolMember implements ARCConstants, Closeable {
private static final Logger logger =
Logger.getLogger(ARCWriter.class.getName());
/**
* Metadata line pattern.
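*
* <p>Five space-separated fields. For example, a line like the following
* (hypothetical values) matches:
* <pre>http://example.com/ 192.0.2.1 20040110013326 text/html 1234</pre>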
*/
private static final Pattern METADATA_LINE_PATTERN =
Pattern.compile("^\\S+ \\S+ \\S+ \\S+ \\S+(" + LINE_SEPARATOR + "?)$");
/**
* Constructor.
* Takes a stream. Use with caution. There is no upper-bound check on
* size. Will just keep writing.
*
* @param serialNo used to generate unique file name sequences
* @param out Where to write.
* @param arc File the <code>out</code> is connected to.
* @param settings all creation parameters
* @throws IOException
*/
public ARCWriter(final AtomicInteger serialNo, final PrintStream out,
final File arc, final WriterPoolSettings settings)
throws IOException {
super(serialNo, out, arc, settings);
writeFirstRecord(ArchiveUtils.get14DigitDate());
}
/**
* Constructor.
*
* @param serialNo used to generate unique file name sequences
* @param settings all creation parameters
*/
public ARCWriter(final AtomicInteger serialNo, final WriterPoolSettings settings) {
super(serialNo, settings, ARC_FILE_EXTENSION);
}
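/**
* Create a new ARC file and write the ARC file meta record as its
* first record.
*/
@Override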
protected String createFile()
throws IOException {
String name = super.createFile();
writeFirstRecord(currentTimestamp);
return name;
}
private void writeFirstRecord(final String ts)
throws IOException {
write(generateARCFileMetaData(ts));
}
/**
* Write out the ARC meta data.
*
* <p>Generate ARC file meta data. Currently we only do version 1 of the
* ARC file format, or version 1.1 when metadata has been supplied (we
* write it into the body of the first record in the ARC file).
*
* <p>Version 1 metadata looks roughly like this:
*
* <pre>filedesc://testWriteRecord-JunitIAH20040110013326-2.arc 0.0.0.0 \\
* 20040110013326 text/plain 77
* 1 0 InternetArchive
* URL IP-address Archive-date Content-type Archive-length</pre>
*
* <p>If compress is set, then we generate a header that has been gzipped
* in the Internet Archive manner. Such gzipping enables the FEXTRA
* flag in the FLG field of the gzip header. It then appends an extra
* header field: '8', '0', 'L', 'X', '0', '0', '0', '0'. The first two
* bytes are the length of the field and the last six bytes the Internet
* Archive header. To learn about the GZIP format, see RFC 1952. To learn
* about the Internet Archive extra header field, read the source for
* av_ziparc, which can be found at
* <code>alexa/vista/alexa-tools-1.2/src/av_ziparc.cc</code>.
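*
* <p>A sketch of the resulting gzip member layout (derived from the
* manipulations in the method body; offsets are zero-based):
*
* <pre>
* bytes 0-9 : ID1 ID2 CM FLG MTIME(4) XFL OS  (FLG=4 i.e. FEXTRA; OS=3 i.e. Unix)
* bytes 10+ : the 8-byte IA extra field (2-byte length, then 'L', 'X', ...)
* then      : the rest of the gzipped meta record (deflate data, CRC32, ISIZE)
* </pre>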
*
* <p>We do things in this roundabout manner because the java
* GZIPOutputStream does not give access to GZIP header fields.
*
* @param date Date to put into the ARC metadata; if 17 digits long, it
* will be truncated to the traditional 14 digits.
*
* @return Byte array filled with the ARC header.
* @throws IOException
*/
private byte [] generateARCFileMetaData(String date)
throws IOException {
if (date != null && date.length() > 14) {
date = date.substring(0, 14);
}
int metadataBodyLength = getMetadataLength();
// If there is a metadata body, the minor part of the version is '1'
// rather than '0'.
String metadataHeaderLinesTwoAndThree =
getMetadataHeaderLinesTwoAndThree("1 " +
((metadataBodyLength > 0)? "1": "0"));
int recordLength = metadataBodyLength +
metadataHeaderLinesTwoAndThree.getBytes(DEFAULT_ENCODING).length;
String metadataHeaderStr = ARC_MAGIC_NUMBER + getBaseFilename() +
" 0.0.0.0 " + date + " text/plain " + recordLength +
metadataHeaderLinesTwoAndThree;
ByteArrayOutputStream metabaos =
new ByteArrayOutputStream(recordLength);
// Write the metadata header.
metabaos.write(metadataHeaderStr.getBytes(DEFAULT_ENCODING));
// Write the metadata body, if anything to write.
if (metadataBodyLength > 0) {
writeMetaData(metabaos);
}
// Write out a LINE_SEPARATOR to end this record.
metabaos.write(LINE_SEPARATOR);
// Now get bytes of all just written and compress if flag set.
byte [] bytes = metabaos.toByteArray();
if (isCompressed()) {
// GZIP the header but catch the gzipping into a byte array so we
// can add the special IA GZIP header to the product. After
// manipulations, write to the output stream (the Java GZIP
// implementation does not give access to the GZIP header; it
// produces a 'default' header only). We can get away with these
// manipulations because the GZIP 'default' header doesn't
// do the 'optional' CRC'ing of the header.
byte [] gzippedMetaData = ArchiveUtils.gzip(bytes);
if (gzippedMetaData[3] != 0) {
throw new IOException("The GZIP FLG header is unexpectedly " +
"non-zero. Need to add smarter code that can deal " +
"with already-extant extra GZIP header fields.");
}
// Set the GZIP FLG header to '4', which says that the GZIP header
// has extra fields. Then insert the alexa {'L', 'X', '0', '0', '0',
// '0'} 'extra' field. The IA GZIP header will also set byte
// 9 (zero-based), the OS byte, to 3 (Unix). We'll do the same.
gzippedMetaData[3] = 4;
gzippedMetaData[9] = 3;
byte [] assemblyBuffer = new byte[gzippedMetaData.length +
ARC_GZIP_EXTRA_FIELD.length];
// '10' in the below is a pointer past the following bytes of the
// GZIP header: ID1 ID2 CM FLG + MTIME(4-bytes) XFL OS. See
// RFC 1952 for an explanation of the abbreviations just used.
System.arraycopy(gzippedMetaData, 0, assemblyBuffer, 0, 10);
System.arraycopy(ARC_GZIP_EXTRA_FIELD, 0, assemblyBuffer, 10,
ARC_GZIP_EXTRA_FIELD.length);
System.arraycopy(gzippedMetaData, 10, assemblyBuffer,
10 + ARC_GZIP_EXTRA_FIELD.length, gzippedMetaData.length - 10);
bytes = assemblyBuffer;
}
return bytes;
}
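/**
* @param version Version string for the second header line, e.g.
* "1 0" (no metadata body) or "1 1" (metadata body present).
* @return Lines two and three of the ARC file meta record: the version
* line followed by the field-names line.
*/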
public String getMetadataHeaderLinesTwoAndThree(String version) {
StringBuilder buffer = new StringBuilder();
buffer.append(LINE_SEPARATOR);
buffer.append(version);
buffer.append(" InternetArchive");
buffer.append(LINE_SEPARATOR);
buffer.append("URL IP-address Archive-date Content-type Archive-length");
buffer.append(LINE_SEPARATOR);
return buffer.toString();
}
/**
* Write all metadata to the passed <code>baos</code>.
*
* @param baos Stream to write to.
* @throws UnsupportedEncodingException
* @throws IOException
*/
private void writeMetaData(ByteArrayOutputStream baos)
throws UnsupportedEncodingException, IOException {
if (settings.getMetadata() == null) {
return;
}
for (Iterator i = settings.getMetadata().iterator();
i.hasNext();) {
Object obj = i.next();
if (obj instanceof String) {
baos.write(((String)obj).getBytes(DEFAULT_ENCODING));
} else if (obj instanceof File) {
InputStream is = null;
try {
is = new BufferedInputStream(
new FileInputStream((File)obj));
byte [] buffer = new byte[4096];
for (int read = -1; (read = is.read(buffer)) != -1;) {
baos.write(buffer, 0, read);
}
} finally {
if (is != null) {
is.close();
}
}
} else if (obj != null) {
logger.severe("Unsupported metadata type: " + obj);
}
}
}
/**
* @return Total length of metadata.
* @throws UnsupportedEncodingException
*/
private int getMetadataLength()
throws UnsupportedEncodingException {
// Start from zero so a null or empty metadata list yields length 0;
// the sum below is then the exact byte length of the metadata body.
int result = 0;
if (settings.getMetadata() != null) {
for (Iterator i = settings.getMetadata().iterator();
i.hasNext();) {
Object obj = i.next();
if (obj instanceof String) {
result += ((String)obj).getBytes(DEFAULT_ENCODING).length;
} else if (obj instanceof File) {
result += ((File)obj).length();
} else {
logger.severe("Unsupported metadata type: " + obj);
}
}
}
return result;
}
/**
* @deprecated use input-stream version directly instead
*/
public void write(String uri, String contentType, String hostIP,
long fetchBeginTimeStamp, long recordLength,
ByteArrayOutputStream baos)
throws IOException {
write(uri, contentType, hostIP, fetchBeginTimeStamp, recordLength,
new ByteArrayInputStream(baos.toByteArray()), false);
}
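/**
* Write a record, reading its content from the passed stream and
* enforcing the declared <code>recordLength</code> against the content
* actually read.
*/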
public void write(String uri, String contentType, String hostIP,
long fetchBeginTimeStamp, long recordLength, InputStream in)
throws IOException {
write(uri, contentType, hostIP, fetchBeginTimeStamp, recordLength,
in, true);
}
/**
* Write a record with the given metadata/content.
*
* @param uri
* URI for metadata-line
* @param contentType
* MIME content-type for metadata-line
* @param hostIP
* IP for metadata-line
* @param fetchBeginTimeStamp
* timestamp for metadata-line
* @param recordLength
* length for metadata-line; also may be enforced
* @param in
* source InputStream for record content
* @param enforceLength
* whether to enforce the declared length; should be true
* unless intentionally writing bad records for testing
* @throws IOException
*/
public void write(String uri, String contentType, String hostIP,
long fetchBeginTimeStamp, long recordLength, InputStream in,
boolean enforceLength) throws IOException {
preWriteRecordTasks();
try {
write(getMetaLine(uri, contentType, hostIP, fetchBeginTimeStamp,
recordLength).getBytes(UTF8));
copyFrom(in, recordLength, enforceLength);
if (in instanceof ReplayInputStream) {
// check for consumption of entire recorded material
long remaining = ((ReplayInputStream) in).remaining();
// Should be zero at this stage. If not, something is
// wrong.
if (remaining != 0) {
String message = "Gap between expected and actual: "
+ remaining + LINE_SEPARATOR + DevUtils.extraInfo()
+ " writing arc "
+ this.getFile().getAbsolutePath();
DevUtils.warnHandle(new Throwable(message), message);
throw new IOException(message);
}
}
write(LINE_SEPARATOR);
} finally {
postWriteRecordTasks();
}
}
/**
* @param uri URI for the metadata line
* @param contentType MIME content-type for the metadata line
* @param hostIP IP for the metadata line
* @param fetchBeginTimeStamp timestamp for the metadata line
* @param recordLength length for the metadata line
* @return Metadata line for an ARCRecord made of the passed components.
* @exception IOException
*/
protected String getMetaLine(String uri, String contentType, String hostIP,
long fetchBeginTimeStamp, long recordLength)
throws IOException {
if (fetchBeginTimeStamp <= 0) {
throw new IOException("Bogus fetchBeginTimestamp: " +
Long.toString(fetchBeginTimeStamp));
}
return validateMetaLine(createMetaline(uri, hostIP,
ArchiveUtils.get14DigitDate(fetchBeginTimeStamp),
MimetypeUtils.truncate(contentType),
Long.toString(recordLength)));
}
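/**
* Concatenate the passed fields, separated by HEADER_FIELD_SEPARATOR,
* into a metadata line terminated by LINE_SEPARATOR. For example
* (hypothetical values):
* <pre>http://example.com/ 192.0.2.1 20040110013326 text/html 1234</pre>
*/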
public String createMetaline(String uri, String hostIP,
String timeStamp, String mimetype, String recordLength) {
return uri + HEADER_FIELD_SEPARATOR + hostIP +
HEADER_FIELD_SEPARATOR + timeStamp +
HEADER_FIELD_SEPARATOR + mimetype +
HEADER_FIELD_SEPARATOR + recordLength + LINE_SEPARATOR;
}
/**
* Test that the metadata line is valid before writing.
* @param metaLineStr
* @throws IOException
* @return The passed in metaline.
*/
protected String validateMetaLine(String metaLineStr)
throws IOException {
if (metaLineStr.length() > MAX_METADATA_LINE_LENGTH) {
throw new IOException("Metadata line too long ("
+ metaLineStr.length() + ">" + MAX_METADATA_LINE_LENGTH
+ "): " + metaLineStr);
}
Matcher m = METADATA_LINE_PATTERN.matcher(metaLineStr);
if (!m.matches()) {
throw new IOException("Metadata line doesn't match expected" +
" pattern: " + metaLineStr);
}
return metaLineStr;
}
}