All downloads are free. The search and download functionality uses the official Maven repository.

com.martinkl.warc.mapreduce.WARCOutputFormat Maven / Gradle / Ivy

The newest version!
package com.martinkl.warc.mapreduce;

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import com.martinkl.warc.WARCFileWriter;
import com.martinkl.warc.WARCWritable;

/**
 * Hadoop OutputFormat for mapreduce jobs ('new' API) that want to write data to WARC files.
 *
 * <p>Usage:
 *
 * <pre>{@code
 * Job job = new Job(getConf());
 * job.setOutputFormatClass(WARCOutputFormat.class);
 * job.setOutputKeyClass(NullWritable.class);
 * job.setOutputValueClass(WARCWritable.class);
 * FileOutputFormat.setCompressOutput(job, true);
 * }</pre>
 *
 * <p>The tasks generating the output (usually the reducers, but may be the mappers if there
 * are no reducers) should use {@code NullWritable.get()} as the output key, and the
 * {@link WARCWritable} as the output value.
 */
public class WARCOutputFormat extends FileOutputFormat<NullWritable, WARCWritable> {

    /**
     * Creates a new output file in WARC format, and returns a RecordWriter for writing to it.
     *
     * @param context the task attempt whose configuration and work directory are used
     * @return a writer that accepts {@link NullWritable} keys and {@link WARCWritable} values
     * @throws IOException if the underlying {@link WARCFileWriter} cannot be created
     * @throws InterruptedException declared by the overridden method; not thrown by this code
     */
    @Override
    public RecordWriter<NullWritable, WARCWritable> getRecordWriter(TaskAttemptContext context)
            throws IOException, InterruptedException {
        return new WARCWriter(context);
    }

    /**
     * RecordWriter that forwards every value to a single {@link WARCFileWriter}.
     *
     * <p>Deliberately a non-static inner class: the constructor calls the enclosing
     * instance's {@code getDefaultWorkFile} method.
     */
    private class WARCWriter extends RecordWriter<NullWritable, WARCWritable> {
        private final WARCFileWriter writer;

        /**
         * Opens the WARC writer for the given task attempt.
         *
         * @param context the task attempt supplying the configuration, the compression
         *                setting and the work file location
         * @throws IOException if the work file path cannot be determined or the
         *                     {@link WARCFileWriter} cannot be created
         */
        public WARCWriter(TaskAttemptContext context) throws IOException {
            Configuration conf = context.getConfiguration();
            // A null codec is passed when output compression is not enabled for the job.
            CompressionCodec codec = getCompressOutput(context) ? WARCFileWriter.getGzipCodec(conf) : null;
            // Empty extension is requested here.
            // NOTE(review): presumably WARCFileWriter derives the final file name/extension
            // itself -- confirm against WARCFileWriter.
            Path workFile = getDefaultWorkFile(context, "");
            this.writer = new WARCFileWriter(conf, codec, workFile);
        }

        /**
         * Writes one WARC record.
         *
         * @param key   ignored
         * @param value the record to write
         * @throws IOException if the underlying writer fails
         * @throws InterruptedException declared by the overridden method; not thrown by this code
         */
        @Override
        public void write(NullWritable key, WARCWritable value) throws IOException, InterruptedException {
            writer.write(value);
        }

        /**
         * Closes the underlying {@link WARCFileWriter}.
         *
         * @param context the task attempt being closed (unused)
         * @throws IOException if closing the underlying writer fails
         * @throws InterruptedException declared by the overridden method; not thrown by this code
         */
        @Override
        public void close(TaskAttemptContext context) throws IOException, InterruptedException {
            writer.close();
        }
    }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy