
package io.github.repir.EntityReader;

import io.github.repir.tools.io.Datafile;
import io.github.repir.tools.extract.Content;
import io.github.repir.tools.lib.Log;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

/**
 * A document reader reads an input file, identifying document markers to store
 * one document at a time in a {@link Content} object, which is used in a map()
 * process. The LongWritable that is passed along indicates the offset in the
 * input file, which can be used to trace problems.
 *
 * Note that Hadoop can split uncompressed files to divide the work between
 * mappers. These splits are likely to place an offset inside a document. The
 * desired course of action is that the mapper that starts reading a document
 * and encounters the InputSplit's ceiling keeps reading past the ceiling (this
 * is allowed; the ceiling is just an indicator). The other mapper starts at the
 * designated offset and scans from that point to the first start-of-document
 * tag. This way, no documents are processed twice or lost. An illustrative
 * sketch of this protocol is given below the class body.
 *
 * EntityReader is used internally by {@link EntityReaderInputFormat}, to read
 * the next entity from a source archive, for processing by a Mapper as
 * {@link EntityWritable}.
 *
 * @author jeroen
 */
public abstract class EntityReader extends RecordReader<LongWritable, Content> {

    public static Log log = new Log(EntityReader.class);
    protected TaskAttemptContext context;
    protected long start;
    protected long end;
    protected Datafile fsin;
    protected LongWritable key = new LongWritable();
    protected Content entitywritable;
    protected FileSystem filesystem;
    protected Configuration conf;
    protected int onlypartition;
    protected int partitions;

    @Override
    public void initialize(InputSplit is, TaskAttemptContext tac) {
        context = tac;
        initialize(is, tac.getConfiguration());
    }

    public void initialize(InputSplit is, Configuration conf) {
        //log.info("initialize");
        try {
            this.conf = conf;
            filesystem = FileSystem.get(conf);
            FileSplit fileSplit = (FileSplit) is;
            Path file = fileSplit.getPath();
            start = fileSplit.getStart();
            end = start + fileSplit.getLength();
            fsin = new Datafile(filesystem, file);
            fsin.setOffset(start);
            fsin.setBufferSize(10000000); // 10 MB read buffer
            fsin.openRead();
            onlypartition = conf.getInt("repository.onlypartition", -1);
            partitions = conf.getInt("repository.partitions", 1);
            initialize(fileSplit);
        } catch (IOException ex) {
            log.exception(ex, "initialize( %s ) conf %s filesystem %s fsin %s", is, conf, filesystem, fsin);
        }
    }

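    /*
     * Illustration only (not in the original source): the two configuration
     * keys read above would be set by the job driver, along these lines
     * (hypothetical values):
     *
     *   Configuration conf = job.getConfiguration();
     *   conf.setInt("repository.partitions", 24);   // total number of partitions
     *   conf.setInt("repository.onlypartition", 3); // or -1 to read all partitions
     */
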
    public abstract void initialize(FileSplit fileSplit);

    /**
     * Reads the input file, scanning for the next document, setting key and
     * entitywritable to the offset and byte contents of the document read.
     *
     * @return true if a next document was read
     */
    @Override
    public abstract boolean nextKeyValue();

    @Override
    public LongWritable getCurrentKey() throws IOException, InterruptedException {
        return key;
    }

    @Override
    public Content getCurrentValue() throws IOException, InterruptedException {
        return entitywritable;
    }

    /**
     * NB this indicates progress as the fraction of the split's data that has
     * been read. For some MapReduce tasks, processing continues for some time
     * after the data has been read, causing the progress indicator to halt at
     * 100%.
     *
     * @return the fraction of this split that has been read
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    public float getProgress() throws IOException, InterruptedException {
        // e.g. with start=0, end=1000 and a current offset of 250, this reports 0.25
        return (fsin.getOffset() - start) / (float) (end - start);
    }

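    /*
     * Illustration only (not in the original source): a consumer (normally the
     * Hadoop framework itself) drives any RecordReader with the standard loop:
     *
     *   while (reader.nextKeyValue()) {
     *       LongWritable offset = reader.getCurrentKey();
     *       Content entity = reader.getCurrentValue();
     *       // map(offset, entity)
     *   }
     *   reader.close();
     */
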
    @Override
    public void close() throws IOException {
        fsin.close();
    }
}
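
/*
 * Illustration only (not part of the original source): a minimal sketch of a
 * concrete subclass, showing the split-boundary protocol described in the
 * class comment. The scanning helpers scanToTag() and readEntityUpTo() are
 * hypothetical stand-ins for the byte-level search that real subclasses
 * perform on the Datafile, and the <DOC>/</DOC> tags are likewise assumed,
 * not taken from any actual archive format.
 */
abstract class ExampleTagReaderSketch extends EntityReader {

    @Override
    public void initialize(FileSplit fileSplit) {
        // A mapper that does not own the very start of the file skips forward
        // to the first document-start tag at or beyond its split offset; the
        // document straddling that offset is finished by the previous mapper,
        // which reads past its own ceiling.
        if (start > 0) {
            scanToTag("<DOC>"); // hypothetical: advance fsin past the next start tag
        }
    }

    @Override
    public boolean nextKeyValue() {
        // Only begin a new document while the offset is below the split
        // ceiling; once begun, the document is read to its end tag even if
        // that crosses the ceiling, so nothing is lost or read twice.
        if (fsin.getOffset() < end) {
            key.set(fsin.getOffset());
            entitywritable = readEntityUpTo("</DOC>"); // hypothetical: returns null at EOF
            return entitywritable != null;
        }
        return false;
    }

    // Hypothetical helpers, standing in for a subclass's byte-level scanning.
    protected abstract void scanToTag(String tag);

    protected abstract Content readEntityUpTo(String endTag);
}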