package io.github.repir.EntityReader.MapReduce;
import io.github.repir.tools.hadoop.FileFilter;
import io.github.repir.EntityReader.EntityReader;
import io.github.repir.EntityReader.EntityReaderTrec;
import io.github.repir.tools.io.HDFSPath;
import io.github.repir.tools.extract.Content;
import static io.github.repir.tools.lib.ClassTools.*;
import io.github.repir.tools.lib.Log;
import io.github.repir.tools.hadoop.Job;
import java.io.IOException;
import java.lang.reflect.Constructor;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;
/**
* EntityReaderInputFormat extends FileInputFormat to supply Hadoop with the
* input to process. To use EntityReaderInputFormat, instantiate it with
* {@link #EntityReaderInputFormat(io.github.repir.tools.hadoop.Job)}, which reads
* the input paths from the job configuration. The configured paths can be files
* or directories; directories are scanned recursively for files. Before a file
* is added to the list of inputs, {@link FileFilter#acceptFile} is called to
* check whether the file should be processed. This way the readme, program and
* .dtd files in the original TREC collections are skipped.
*
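* A minimal usage sketch; {@code job} is assumed to be an already configured
* {@link io.github.repir.tools.hadoop.Job}, and the input path below is a
* hypothetical example:
* <pre>
* job.getConfiguration().set("repository.inputdir", "/data/trec/disk45");
* EntityReaderInputFormat inputformat = new EntityReaderInputFormat(job);
* // the constructor registers this InputFormat on the job, sets NullOutputFormat,
* // and adds all accepted files under the configured input directories
* </pre>
*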
* The input is configured by "repository.inputdir", which can be a comma
* separated list of folders, or an array built from multiple
* "+repository.inputdir=..." entries. The directories are scanned recursively
* for input files. See {@link FileFilter} for how to include or exclude
* specific files.
*
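* For example (hypothetical paths):
* <pre>
* repository.inputdir=/data/trec/disk4,/data/trec/disk5
* +repository.inputdir=/data/trec/wt10g
* </pre>
*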
* By default, valid files are submitted to an instantiation of the configured
* "repository.entityreader". Alternatively, different entity readers can be
* configured for different file types, by assigning an entity reader to files
* that end with a given extension, e.g.
* "+repository.assignentityreader=.pdf EntityReaderPDF".
*
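* For example (all reader class names except {@link EntityReaderTrec} are hypothetical):
* <pre>
* repository.entityreader=EntityReaderTrec
* +repository.assignentityreader=.pdf EntityReaderPDF
* +repository.assignentityreader=.txt EntityReaderText
* </pre>
*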
* Note that Java does not provide a way to uncompress .z files, so the .z files
* on the original TREC disks have to be uncompressed outside this framework.
*
* @author jeroen
*/
public class EntityReaderInputFormat extends FileInputFormat<LongWritable, Content> {

    public static Log log = new Log(EntityReaderInputFormat.class);
    FileFilter filefilter;
    String defaultentityreader;
    Configuration configuration;
    HashMap<String, String> assignentityreader = new HashMap<String, String>();
    Job job;

    // no-arg constructor, required so Hadoop can instantiate this class through reflection
    public EntityReaderInputFormat() {
    }
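
    /**
     * Reads the input directories from "repository.inputdir", registers this class
     * as the job's InputFormat (with NullOutputFormat as output format), and adds
     * all accepted files in those directories as input.
     */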
    public EntityReaderInputFormat(Job job) throws IOException {
        configuration = job.getConfiguration();
        String inputdirs[] = configuration.get("repository.inputdir").split(",");
        filefilter = new FileFilter(configuration);
        loadEntityReaderSettings(configuration);
        job.setInputFormatClass(this.getClass());
        job.setOutputFormatClass(NullOutputFormat.class);
        for (String dir : inputdirs) {
            addDirs(job, dir);
        }
    }
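
    /**
     * Adds the given path as input: a single file is added directly, a directory
     * is scanned recursively and its files are added.
     */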
    public void addDirs(Job job, String dir) throws IOException {
        FileSystem fs = HDFSPath.getFS(configuration);
        ArrayList paths = new ArrayList();
        ArrayList files = new ArrayList();
        if (dir.length() > 0) {
            HDFSPath d = new HDFSPath(fs, dir);
            if (d.isFile()) {
                addFile(job, new Path(dir));
            } else {
                for (String f : d.getFilepathnames()) {
                    addFile(job, new Path(f));
                }
                for (HDFSPath f : d.getDirs()) {
                    addDirs(job, f.getCanonicalPath());
                }
            }
        }
    }
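
    /**
     * Adds the file as an input path if it passes the configured {@link FileFilter}.
     */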
    public void addFile(Job job, Path path) {
        try {
            if (filefilter.acceptFile(path)) {
                addInputPath(job, path);
            }
        } catch (IOException ex) {
            log.exception(ex, "add( %s, %s )", job, path);
        }
    }

    @Override
    public List<InputSplit> getSplits(JobContext job) throws IOException {
        return super.getSplits(job);
    }
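
    /**
     * Reads the default entity reader from "repository.entityreader" (falling back
     * to {@link EntityReaderTrec}) and the per-extension assignments from
     * "repository.assignentityreader".
     */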
    public void loadEntityReaderSettings(Configuration conf) {
        defaultentityreader = conf.get("repository.entityreader", EntityReaderTrec.class.getCanonicalName());
        for (String s : conf.getStrings("repository.assignentityreader", new String[0])) {
            // each entry maps a file extension to an entity reader class name, e.g. ".pdf EntityReaderPDF"
            String part[] = s.split(" +");
            assignentityreader.put(part[0], part[1]);
        }
    }
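
    /**
     * @return the entity reader class name assigned to the extension of the file in
     * this split, or the default entity reader if no assignment matches
     */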
    public String getEntityReaderName(InputSplit is, Configuration conf) {
        if (defaultentityreader == null) {
            loadEntityReaderSettings(conf);
        }
        String file = ((FileSplit) is).getPath().getName();
        for (Map.Entry<String, String> entry : assignentityreader.entrySet()) {
            if (file.toLowerCase().endsWith(entry.getKey())) {
                return entry.getValue();
            }
        }
        return defaultentityreader;
    }
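
    /**
     * Instantiates the {@link EntityReader} configured for the file in this split,
     * which Hadoop uses as the RecordReader for the split.
     */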
    @Override
    public RecordReader<LongWritable, Content> createRecordReader(InputSplit is, TaskAttemptContext tac) {
        //log.info("documentreader %s", getDocumentReader(tac.getConfiguration()));
        Class clazz = toClass(getEntityReaderName(is, tac.getConfiguration()), EntityReader.class.getPackage().getName());
        Constructor c;
        try {
            c = getAssignableConstructor(clazz, EntityReader.class);
            return (RecordReader<LongWritable, Content>) construct(c);
        } catch (ClassNotFoundException ex) {
            log.fatalexception(ex, "createRecordReader()");
        }
        return null;
    }
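
    /**
     * Input files are only split into multiple InputSplits when
     * "repository.splitablesource" is set to true; by default each file is
     * processed as a whole by a single map task.
     */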
    @Override
    protected boolean isSplitable(JobContext context, Path file) {
        return context.getConfiguration().getBoolean("repository.splitablesource", false);
    }
}