package com.martinkl.warc;

import java.io.BufferedOutputStream;
import java.io.DataOutputStream;
import java.io.FilterOutputStream;
import java.io.IOException;
import java.io.OutputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.util.Progressable;
import org.apache.hadoop.util.ReflectionUtils;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Writes {@link WARCRecord}s to a WARC file, using Hadoop's filesystem APIs. (This means you
 * can write to HDFS, S3, or any other filesystem supported by Hadoop.) This implementation is
 * not tied to the MapReduce APIs -- that link is provided by the mapred
 * {@link com.martinkl.warc.mapred.WARCOutputFormat} and the mapreduce
 * {@link com.martinkl.warc.mapreduce.WARCOutputFormat}.
 *
 * WARCFileWriter keeps track of how much data it has written (after gzip compression, if
 * compression is enabled); when the file grows beyond a size threshold, it is automatically
 * closed and a new segment is started. A segment number is appended to the filename for that
 * purpose. The segment number always starts at 00000, and by default a new segment is started
 * when the file size exceeds 1GB. To change the target size for a segment, set the
 * {@code warc.output.segment.size} key in the Hadoop configuration to the desired number of
 * bytes. (Files may actually be a bit larger than this threshold, since we finish writing the
 * current record before opening a new file.)
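 *
 * A minimal usage sketch (the output path and the record are illustrative, not part of this
 * class):
 *
 * <pre>{@code
 * Configuration conf = new Configuration();
 * conf.setLong("warc.output.segment.size", 512L * 1024 * 1024); // roll segments at ~512 MB
 * Path prefix = new Path("hdfs:///crawl/output/part-00000");    // hypothetical prefix
 * WARCFileWriter writer = new WARCFileWriter(conf, WARCFileWriter.getGzipCodec(conf), prefix);
 * try {
 *     writer.write(someRecord); // a WARCRecord obtained elsewhere
 * } finally {
 *     writer.close();
 * }
 * }</pre>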
 */
public class WARCFileWriter {
    private static final Logger logger = LoggerFactory.getLogger(WARCFileWriter.class);

    public static final long DEFAULT_MAX_SEGMENT_SIZE = 1000000000L; // 1 GB

    private final Configuration conf;
    private final CompressionCodec codec;
    private final Path workOutputPath;
    private final Progressable progress;
    private final String extensionFormat;
    private final long maxSegmentSize;
    private long segmentsCreated = 0, segmentsAttempted = 0, bytesWritten = 0;
    private CountingOutputStream byteStream;
    private DataOutputStream dataStream;
    /**
     * Creates a WARC file and opens it for writing. If a file with the same name already
     * exists, an attempt number in the filename is incremented until we find a file that
     * doesn't already exist.
     *
     * @param conf The Hadoop configuration.
     * @param codec If null, the file is uncompressed. If non-null, this compression codec
     *              will be used. The codec's default file extension is appended to the filename.
     * @param workOutputPath The directory and filename prefix to which the data should be
     *                       written. We append a segment number and filename extensions to it.
     * @throws IOException if the file cannot be created.
     */
    public WARCFileWriter(Configuration conf, CompressionCodec codec, Path workOutputPath) throws IOException {
        this(conf, codec, workOutputPath, null);
    }
    /**
     * Creates a WARC file and opens it for writing. If a file with the same name already
     * exists, it is *overwritten*. Note that this is different behaviour from the other
     * constructor. Yes, this sucks. It will probably change in a future version.
     *
     * @param conf The Hadoop configuration.
     * @param codec If null, the file is uncompressed. If non-null, this compression codec
     *              will be used. The codec's default file extension is appended to the filename.
     * @param workOutputPath The directory and filename prefix to which the data should be
     *                       written. We append a segment number and filename extensions to it.
     * @param progress An object used by the mapred API for tracking a task's progress.
     * @throws IOException if the file cannot be created.
     */
    public WARCFileWriter(Configuration conf, CompressionCodec codec, Path workOutputPath, Progressable progress)
            throws IOException {
        this.conf = conf;
        this.codec = codec;
        this.workOutputPath = workOutputPath;
        this.progress = progress;
        this.extensionFormat = ".seg-%05d.attempt-%05d.warc" +
                (codec == null ? "" : codec.getDefaultExtension());
        this.maxSegmentSize = conf.getLong("warc.output.segment.size", DEFAULT_MAX_SEGMENT_SIZE);
        createSegment();
    }
    /**
     * Instantiates a Hadoop codec for compressing and decompressing gzip files. This is the
     * most common compression applied to WARC files.
     *
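     * A sketch of typical use; {@code outputPrefix} is assumed to be defined by the caller.
     * Since the constructor treats a null codec as "uncompressed", the null return value can
     * be passed straight through as a fallback:
     *
     * <pre>{@code
     * CompressionCodec gzip = WARCFileWriter.getGzipCodec(conf);
     * if (gzip == null) {
     *     // fall back to uncompressed output
     * }
     * WARCFileWriter writer = new WARCFileWriter(conf, gzip, outputPrefix);
     * }</pre>
     *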
     * @param conf The Hadoop configuration.
     * @return The gzip codec, or null if Hadoop's GzipCodec class could not be found.
     */
    public static CompressionCodec getGzipCodec(Configuration conf) {
        try {
            return (CompressionCodec) ReflectionUtils.newInstance(
                    conf.getClassByName("org.apache.hadoop.io.compress.GzipCodec").asSubclass(CompressionCodec.class),
                    conf);
        } catch (ClassNotFoundException e) {
            logger.warn("GzipCodec could not be instantiated", e);
            return null;
        }
    }
    /**
     * Creates an output segment file and sets up the output streams to point at it.
     * If the file already exists, retries with a different filename. This is a bit nasty --
     * after all, {@link org.apache.hadoop.mapreduce.lib.output.FileOutputFormat}'s work
     * directory concept is supposed to prevent filename clashes -- but it looks like Amazon
     * Elastic MapReduce prevents the use of per-task work directories if the output of a job
     * is on S3.
     *
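     * For example, given a (hypothetical) work output path of {@code part-00000} and gzip
     * compression, the first segment is written to
     * {@code part-00000.seg-00000.attempt-00000.warc.gz}, the second to
     * {@code part-00000.seg-00001.attempt-00000.warc.gz}, and so on; the attempt number is
     * only incremented when a name clash forces a retry.
     *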
     * TODO: Investigate this and find a better solution.
     */
    private void createSegment() throws IOException {
        segmentsAttempted = 0;
        bytesWritten = 0;
        boolean success = false;

        while (!success) {
            Path path = workOutputPath.suffix(String.format(extensionFormat, segmentsCreated, segmentsAttempted));
            FileSystem fs = path.getFileSystem(conf);

            try {
                // The o.a.h.mapred OutputFormats overwrite existing files, whereas
                // the o.a.h.mapreduce OutputFormats don't overwrite. Bizarre...
                // Here, overwrite if progress != null, i.e. if using the mapred API.
                FSDataOutputStream fsStream = (progress == null) ? fs.create(path, false) : fs.create(path, progress);
                byteStream = new CountingOutputStream(new BufferedOutputStream(fsStream));
                dataStream = new DataOutputStream(codec == null ? byteStream : codec.createOutputStream(byteStream));
                segmentsCreated++;
                logger.info("Writing to output file: {}", path);
                success = true;

            } catch (IOException e) {
                // Guard against exceptions with a null message before inspecting it.
                String message = e.getMessage();
                if (message != null && message.startsWith("File already exists")) {
                    logger.warn("Tried to create file {} but it already exists; retrying.", path);
                    segmentsAttempted++; // retry with an incremented attempt number
                } else {
                    throw e;
                }
            }
        }
    }
    /**
     * Appends a {@link WARCRecord} to the file, in WARC/1.0 format. If the current segment
     * has grown beyond the maximum segment size, it is closed and a new segment is started
     * before the record is written.
     *
     * @param record The record to be written.
     * @throws IOException if the record cannot be written.
     */
    public void write(WARCRecord record) throws IOException {
        if (bytesWritten > maxSegmentSize) {
            dataStream.close();
            createSegment();
        }
        record.write(dataStream);
    }
    /**
     * Appends a {@link WARCRecord} wrapped in a {@link WARCWritable} to the file.
     *
     * @param record The wrapper around the record to be written.
     * @throws IOException if the record cannot be written.
     */
    public void write(WARCWritable record) throws IOException {
        if (record.getRecord() != null) write(record.getRecord());
    }
    /**
     * Flushes any buffered data and closes the file.
     *
     * @throws IOException if closing the underlying stream fails.
     */
    public void close() throws IOException {
        dataStream.close();
    }
    /**
     * An output stream wrapper that counts the bytes passing through it, updating
     * {@link #bytesWritten} so we know when the current segment has exceeded the maximum
     * segment size. Since it sits below the compression codec, it counts compressed bytes.
     */
    private class CountingOutputStream extends FilterOutputStream {
        public CountingOutputStream(OutputStream out) {
            super(out);
        }

        @Override
        public void write(byte[] b, int off, int len) throws IOException {
            out.write(b, off, len);
            bytesWritten += len;
        }

        @Override
        public void write(int b) throws IOException {
            out.write(b);
            bytesWritten++;
        }

        // Overriding close() because FilterOutputStream's close() method pre-JDK8 has bad
        // behavior: it silently ignores any exception thrown by flush(). Instead, just close
        // the delegate stream. It should flush itself if necessary. (Thanks to the Guava
        // project for noticing this.)
        @Override
        public void close() throws IOException {
            out.close();
        }
    }
}