
com.martinkl.warc.mapred.WARCInputFormat Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of warc-hadoop Show documentation
Show all versions of warc-hadoop Show documentation
Java library for working with WARC (Web Archive) files in Hadoop MapReduce
The newest version!
package com.martinkl.warc.mapred;
import java.io.EOFException;
import java.io.IOException;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import com.martinkl.warc.WARCFileReader;
import com.martinkl.warc.WARCRecord;
import com.martinkl.warc.WARCWritable;
/**
 * Hadoop InputFormat for mapred jobs ('old' API) that want to process data in WARC files.
 *
 * <p>Usage:
 *
 * <pre>{@code
 * JobConf job = new JobConf(getConf());
 * job.setInputFormat(WARCInputFormat.class);
 * }</pre>
 *
 * <p>Mappers should use a key of {@link org.apache.hadoop.io.LongWritable} (which is
 * 1 for the first record in a file, 2 for the second record, etc.) and a value of
 * {@link WARCWritable}.
 */
public class WARCInputFormat extends FileInputFormat<LongWritable, WARCWritable> {

    /**
     * Opens a WARC file (possibly compressed) for reading, and returns a RecordReader for
     * accessing it.
     *
     * @param split the input split to read; it is cast to {@link FileSplit}, so any other
     *              split type fails with a {@link ClassCastException}
     * @param job the job configuration, passed through to the underlying file reader
     * @param reporter receives the split's string form as the task status
     * @return a reader yielding one {@code (record number, record)} pair per WARC record
     * @throws IOException if the underlying file reader cannot be opened
     */
    @Override
    public RecordReader<LongWritable, WARCWritable> getRecordReader(
            InputSplit split, JobConf job, Reporter reporter) throws IOException {
        reporter.setStatus(split.toString());
        return new WARCReader(job, (FileSplit) split);
    }

    /**
     * Always returns false, as WARC files cannot be split.
     *
     * @param fs ignored
     * @param filename ignored
     * @return {@code false}, unconditionally
     */
    @Override
    protected boolean isSplitable(FileSystem fs, Path filename) {
        return false;
    }

    /**
     * RecordReader that delegates to a {@link WARCFileReader} opened on the split's path.
     * The type arguments are required: against the raw {@code RecordReader} interface,
     * {@code next(LongWritable, WARCWritable)} would not override anything.
     */
    private static final class WARCReader implements RecordReader<LongWritable, WARCWritable> {

        private final WARCFileReader reader;

        /**
         * Opens the file named by the split.
         *
         * @param job the job configuration handed to the file reader
         * @param split the split whose path identifies the WARC file
         * @throws IOException if the file reader cannot be constructed
         */
        WARCReader(JobConf job, FileSplit split) throws IOException {
            reader = new WARCFileReader(job, split.getPath());
        }

        /** Returns a fresh, empty key for the framework to reuse across calls to next(). */
        @Override
        public LongWritable createKey() {
            return new LongWritable();
        }

        /** Returns a fresh, empty value for the framework to reuse across calls to next(). */
        @Override
        public WARCWritable createValue() {
            return new WARCWritable();
        }

        /**
         * Reads the next record into the supplied key/value holders.
         *
         * @param key set to the reader's records-read count after this record is read
         * @param value set to wrap the record that was just read
         * @return {@code true} if a record was read, {@code false} once the reader
         *         signals end of file
         * @throws IOException on any read failure other than end of file
         */
        @Override
        public boolean next(LongWritable key, WARCWritable value) throws IOException {
            try {
                WARCRecord record = reader.read();
                // The count is taken after the read, so it already includes this record.
                key.set(reader.getRecordsRead());
                value.setRecord(record);
                return true;
            } catch (EOFException eof) {
                // The file reader reports end of input by throwing; this is the normal
                // termination path, not an error.
                return false;
            }
        }

        /** Returns the byte count reported by the underlying file reader. */
        @Override
        public long getPos() throws IOException {
            return reader.getBytesRead();
        }

        /** Returns the progress value reported by the underlying file reader. */
        @Override
        public float getProgress() throws IOException {
            return reader.getProgress();
        }

        /** Closes the underlying file reader. */
        @Override
        public void close() throws IOException {
            reader.close();
        }
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy