org.archive.hadoop.ResourceRecordReader

package org.archive.hadoop;

import java.io.IOException;
import java.util.logging.Logger;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.archive.extract.ExtractingResourceFactoryMapper;
import org.archive.extract.ExtractingResourceProducer;
import org.archive.extract.ResourceFactoryMapper;
import org.archive.format.gzip.GZIPMemberSeries;
import org.archive.resource.MetaData;
import org.archive.resource.Resource;
import org.archive.resource.ResourceParseException;
import org.archive.resource.ResourceProducer;
import org.archive.resource.TransformingResourceProducer;
import org.archive.resource.arc.ARCResourceFactory;
import org.archive.resource.gzip.GZIPResourceContainer;
import org.archive.resource.warc.WARCResourceFactory;
import org.archive.streamcontext.HDFSStream;
import org.archive.streamcontext.Stream;
import org.archive.util.StreamCopy;

/**
 * {@link RecordReader} that iterates over the gzip members of an ARC, WARC,
 * or WAT file split, emitting a {@link ResourceContext} key (file name plus
 * member start offset) and each record's top-level {@link MetaData} as the
 * value.
 */
public class ResourceRecordReader
		extends RecordReader<ResourceContext, MetaData> {
	private final static Logger LOG =
		Logger.getLogger(ResourceRecordReader.class.getName());

	WARCResourceFactory wf = new WARCResourceFactory();
	ARCResourceFactory af = new ARCResourceFactory();
	Stream stream;
	GZIPMemberSeries series;
	private ResourceProducer producer;
//	private ResourceExtractor extractor;
	private String name;
	private long startOffset;
	private long length;
	
	// Key/value pair cached by nextKeyValue() for the getCurrent* accessors.
	private ResourceContext cachedK;
	private MetaData cachedV;
	
	/** Closes the underlying resource producer. */
	@Override
	public void close() throws IOException {
		producer.close();
	}

	@Override
	public ResourceContext getCurrentKey() throws IOException, InterruptedException {
		return cachedK;
	}

	@Override
	public MetaData getCurrentValue() throws IOException, InterruptedException {
		return cachedV;
	}

	/** Returns the fraction of this split's bytes consumed so far. */
	@Override
	public float getProgress() throws IOException, InterruptedException {
		if(length == 0) {
			return 0;
		}
		long curOffset = stream.getOffset();
		float amtDone = curOffset - startOffset;
		float flen = (float) length;
		return amtDone / flen;
	}

	/**
	 * Opens the split's file, positions the stream at the split start, and
	 * builds the producer chain: gzip member series, ARC/WARC envelope
	 * parsing (chosen by file suffix), and metadata extraction.
	 */
	@Override
	public void initialize(InputSplit inputSplit, TaskAttemptContext context)
			throws IOException, InterruptedException {
		if(inputSplit instanceof FileSplit) {
			FileSplit fs = (FileSplit) inputSplit;
			Path fsPath = fs.getPath();
			FileSystem fSys = fsPath.getFileSystem(context.getConfiguration());
			FSDataInputStream fsdis = fSys.open(fsPath);
			name = fsPath.getName();
			stream = new HDFSStream(fsdis);
			startOffset = fs.getStart();
			length = fs.getLength();
			long endOffset = startOffset + length;
			stream.setOffset(startOffset);
			series = new GZIPMemberSeries(stream, name, startOffset);
			GZIPResourceContainer prod =
					new GZIPResourceContainer(series, endOffset);
			ResourceProducer envelope;
			if(name.endsWith(".warc.gz") || name.endsWith(".wat.gz")) {
				envelope = new TransformingResourceProducer(prod, wf);
			} else if(name.endsWith(".arc.gz")) {
				envelope = new TransformingResourceProducer(prod, af);
			} else {
				throw new IOException(
						"Input file must end with .arc.gz, .warc.gz, or .wat.gz");
			}
			ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper();
			producer = new ExtractingResourceProducer(envelope, mapper);
		} else {
			throw new IOException("Expected a FileSplit, got: "
					+ inputSplit.getClass().getName());
		}
	}

	/**
	 * Advances to the next resource, draining its body so the gzip member
	 * series moves on to the following member, and caches the key/value pair.
	 */
	@Override
	public boolean nextKeyValue() throws IOException, InterruptedException {
		// TODO: loop while getting resourceparseexceptions:
		try {
			Resource r = producer.getNext();
			if(r != null) {
				// Drain the record body so the gzip member series advances
				// to the start of the next member.
				StreamCopy.readToEOF(r.getInputStream());
				LOG.info(String.format("Extracted offset %d",
						series.getCurrentMemberStartOffset()));
				cachedK = new ResourceContext(name, 
						series.getCurrentMemberStartOffset());
				cachedV = r.getMetaData().getTopMetaData();
				return true;
			}
		} catch (ResourceParseException e) {
			throw new IOException(
					String.format("ResourceParseException at(%s)(%d)",
							name,series.getCurrentMemberStartOffset()),
					e);
		}
		return false;
	}

}
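
For context, here is a minimal sketch of how this reader could be wired into a MapReduce job. The ResourceInputFormat class below is hypothetical and not part of this file; it assumes the reader's (ResourceContext, MetaData) key/value types.

package org.archive.hadoop;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.archive.resource.MetaData;

// Hypothetical InputFormat, shown for illustration only: it hands each
// FileSplit to a ResourceRecordReader.
public class ResourceInputFormat
		extends FileInputFormat<ResourceContext, MetaData> {

	@Override
	public RecordReader<ResourceContext, MetaData> createRecordReader(
			InputSplit split, TaskAttemptContext context) {
		// The framework calls initialize(split, context) on the reader.
		return new ResourceRecordReader();
	}

	@Override
	protected boolean isSplitable(JobContext context, Path file) {
		// Conservative choice for this sketch: the reader honors split
		// offsets, but a split must begin on a gzip member boundary, so
		// splitting safely would require an external member-offset index.
		return false;
	}
}

A job would select this format with job.setInputFormatClass(ResourceInputFormat.class) and receive (ResourceContext, MetaData) pairs in its mapper.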