org.archive.extract.ProducerUtils Maven / Gradle / Ivy
package org.archive.extract;
import java.io.File;
import java.io.IOException;
import java.net.URL;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.archive.resource.ResourceProducer;
import org.archive.resource.producer.ARCFile;
import org.archive.resource.producer.EnvelopedResourceFile;
import org.archive.resource.producer.WARCFile;
public class ProducerUtils {
public static boolean STRICT_GZ = false;
public static ResourceProducer getProducer(String path) throws IOException {
return getProducer(path,0);
}
public static ResourceProducer getProducer(String path, long offset) throws IOException {
ResourceProducer producer = null;
EnvelopedResourceFile ef = new EnvelopedResourceFile(null);
ef.setStrict(STRICT_GZ);
ARCFile af = new ARCFile();
af.setStrict(STRICT_GZ);
WARCFile wf = new WARCFile();
wf.setStrict(STRICT_GZ);
File file = new File(path);
if(path.startsWith("hdfs://")) {
String name = file.getName();
Path fsPath = new Path(path);
FileSystem fs = fsPath.getFileSystem(new Configuration());
FSDataInputStream fsdis = fs.open(fsPath);
if(path.endsWith(".warc.gz") || path.endsWith(".wat.gz")) {
producer = wf.getGZResourceProducer(fsdis,name,offset);
} else if(path.endsWith(".arc.gz")) {
producer = af.getGZResourceProducer(fsdis,name,offset);
} else if(path.endsWith(".arc")) {
producer = af.getResourceProducer(fsdis,name,offset);
} else if(path.endsWith(".warc") || path.endsWith(".wat")) {
producer = wf.getResourceProducer(fsdis,name,offset);
} else if(path.endsWith(".gz")) {
producer = ef.getGZResourceProducer(fsdis,name,offset);
}
} else if(path.startsWith("http://")) {
String name = file.getName();
URL url = new URL(path);
if(path.endsWith(".warc.gz") || path.endsWith(".wat.gz")) {
producer = wf.getGZResourceProducer(url,name,offset);
} else if(path.endsWith(".arc.gz")) {
producer = af.getGZResourceProducer(url,name,offset);
} else if(path.endsWith(".arc")) {
producer = af.getResourceProducer(url,name,offset);
} else if(path.endsWith(".warc") || path.endsWith(".wat")) {
producer = wf.getResourceProducer(url,name,offset);
} else if(path.endsWith(".gz")) {
producer = ef.getGZResourceProducer(url,name,offset);
}
} else {
if(!(file.exists() && file.canRead())) {
System.err.println(path + " is not a readable file.");
return null;
}
if(path.endsWith(".warc.gz") || path.endsWith(".wat.gz")) {
producer = wf.getGZResourceProducer(file,offset);
} else if(path.endsWith(".arc.gz")) {
producer = af.getGZResourceProducer(file,offset);
} else if(path.endsWith(".arc")) {
producer = af.getResourceProducer(file,offset);
} else if(path.endsWith(".warc") || path.endsWith(".wat")) {
producer = wf.getResourceProducer(file,offset);
} else if(path.endsWith(".gz")) {
producer = ef.getGZResourceProducer(file,offset);
}
}
return producer;
}
}