// org.archive.extract.ResourceExtractor (Maven / Gradle / Ivy)
package org.archive.extract;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.net.URISyntaxException;
import java.nio.charset.Charset;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.archive.RecoverableRecordFormatException;
import org.archive.format.gzip.GZIPFormatException;
import org.archive.resource.Resource;
import org.archive.resource.ResourceConstants;
import org.archive.resource.ResourceParseException;
import org.archive.resource.ResourceProducer;
import org.archive.url.WaybackURLKeyMaker;
/**
 * Command-line tool that reads resources from a source path and writes each
 * one through an {@code ExtractorOutput} chosen by the command-line option.
 *
 * <p>Accepted argument shapes (an optional leading {@code -strict} may
 * precede any of them):
 * <ul>
 *   <li>{@code SRC} - dump every resource with a {@code DumpingExtractorOutput}</li>
 *   <li>{@code -cdx SRC} - CDX output with the default key maker</li>
 *   <li>{@code -cdxURL SRC} - CDX output with a {@code WaybackURLKeyMaker(false)}</li>
 *   <li>{@code -wat SRC} - WAT output</li>
 *   <li>{@code SRC FILTER} - JSON view output restricted by {@code FILTER}</li>
 * </ul>
 *
 * <p>Output goes to the stream given to {@link #setOut(OutputStream)}, or to
 * {@code System.out} when none was set.
 */
public class ResourceExtractor implements ResourceConstants, Tool {

    private final static Logger LOG =
        Logger.getLogger(ResourceExtractor.class.getName());

    /** UTF-8 charset; package-visible, kept for code elsewhere in the package. */
    Charset UTF8 = Charset.forName("utf-8");

    public final static String TOOL_NAME = "extractor";
    public static final String TOOL_DESCRIPTION =
        "A tool for extracting metadata from WARC, ARC, and WAT files";

    /** Destination stream; {@code null} means {@code System.out} is used. */
    private OutputStream out;
    private Configuration conf;

    /**
     * Stores the configuration handed to this tool.
     *
     * @param conf the configuration to keep
     */
    public void setConf(Configuration conf) {
        this.conf = conf;
    }

    /**
     * @return the configuration previously passed to {@link #setConf}
     */
    public Configuration getConf() {
        return conf;
    }

    /**
     * Prints the usage text to {@code System.err}.
     *
     * @param exitCode value to hand back unchanged
     * @return {@code exitCode}, so callers can write {@code return USAGE(1);}
     */
    private static int USAGE(int exitCode) {
        System.err.println("Usage:\n");
        System.err.println("extractor [OPT] SRC");
        System.err.println("\tSRC is the local path, HTTP or HDFS URL to an " +
                "arc, warc, arc.gz, or warc.gz.");
        System.err.println("\tOPT can be one of:");
        System.err.println("\t\t-cdxURL\tProduce output in old URL Wayback CDX format");
        System.err.println("\t\t-cdx\tProduce output in NEW-SURT-Wayback CDX format");
        System.err.println("\t\t\t (note that column 1 is NOT standard Wayback canonicalized)\n");
        // Trailing space after "WARC" keeps the two halves from fusing into "WARCwrapper".
        System.err.println("\t\t-wat\tembed JSON output in a compressed WARC " +
                "wrapper, for storage, or sharing.");
        return exitCode;
    }

    /**
     * Entry point: runs this tool through {@code ToolRunner} and exits the
     * JVM with the tool's return code.
     *
     * @param args command-line arguments, passed on to {@link #run(String[])}
     * @throws Exception whatever {@code ToolRunner.run} propagates
     */
    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(new Configuration(), new ResourceExtractor(), args);
        System.exit(res);
    }

    /**
     * Wraps a byte stream in a UTF-8 {@link PrintWriter}.
     *
     * <p>The writer is created without auto-flush, so callers are
     * responsible for flushing it.
     *
     * @param os stream to write encoded characters to
     * @return a new writer over {@code os}
     */
    private PrintWriter makePrintWriter(OutputStream os)
    {
        return new PrintWriter(new OutputStreamWriter(os, Charset.forName("UTF-8")));
    }

    /**
     * Logs a per-record failure and echoes it to {@code System.err}.
     *
     * <p>The stderr copy is kept because the logger output was reported as
     * not showing up; it ends with a line terminator so it does not run into
     * whatever is printed next.
     *
     * @param logPrefix text placed before the context in the log message only
     * @param context   producer context describing where the failure occurred
     * @param e         the failure being reported
     */
    private static void reportFailure(String logPrefix, Object context, Exception e) {
        LOG.severe(String.format("%s%s: %s", logPrefix, context, e.getMessage()));
        System.err.format("%s: %s%n", context, e.getMessage());
    }

    /**
     * Parses the arguments, builds the producer and output, and streams every
     * resource from the producer to the output.
     *
     * <p>A leading {@code -strict} sets {@code ProducerUtils.STRICT_GZ}; in
     * that mode a {@code GZIPFormatException} or
     * {@code ResourceParseException} is rethrown. Otherwise such failures are
     * reported and the loop continues with the next record.
     *
     * @param args see the class description for the accepted shapes
     * @return {@code 0} on completion, or {@code 1} after printing usage when
     *         the arguments are invalid or no producer exists for the path
     * @throws IOException            propagated from the producer or output
     * @throws ResourceParseException rethrown in strict mode
     * @throws URISyntaxException     declared for the calls made here;
     *                                presumably from resolving the path
     */
    public int run(String[] args)
            throws IndexOutOfBoundsException, FileNotFoundException, IOException,
            ResourceParseException, URISyntaxException {
        // TODO: parse CLI arguments better
        if (args.length < 1 || args.length > 3) {
            return USAGE(1);
        }
        OutputStream os = this.out == null ? System.out : this.out;
        Logger.getLogger("org.archive").setLevel(Level.WARNING);

        int arg = 0;
        if (args[0].equals("-strict")) {
            ProducerUtils.STRICT_GZ = true;
            arg++;
        }
        // "-strict" alone leaves no SRC; report usage instead of reading
        // past the end of the array.
        if (arg >= args.length) {
            return USAGE(1);
        }

        String path = args[arg];
        // Non-null only for the CDX modes, which write through a buffered writer.
        PrintWriter cdxWriter = null;
        ExtractorOutput extractorOut;
        if (args.length == arg + 2) {
            String first = args[arg];
            String second = args[arg + 1];
            if (first.equals("-cdx")) {
                path = second;
                cdxWriter = makePrintWriter(os);
                extractorOut = new RealCDXExtractorOutput(cdxWriter);
            } else if (first.equals("-cdxURL")) {
                path = second;
                cdxWriter = makePrintWriter(os);
                extractorOut = new RealCDXExtractorOutput(cdxWriter, new WaybackURLKeyMaker(false));
            } else if (first.equals("-wat")) {
                path = second;
                extractorOut = new WATExtractorOutput(os);
            } else {
                // No recognised option: first is SRC, second is a JSON filter.
                extractorOut = new JSONViewExtractorOutput(os, second);
            }
        } else {
            extractorOut = new DumpingExtractorOutput(os);
        }

        ResourceProducer producer = ProducerUtils.getProducer(path);
        if (producer == null) {
            return USAGE(1);
        }
        ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper();
        ExtractingResourceProducer exProducer =
            new ExtractingResourceProducer(producer, mapper);
        // NOTE(review): level is set a second time here, as in the original;
        // presumably to survive anything the producer setup changed - confirm.
        Logger.getLogger("org.archive").setLevel(Level.WARNING);

        try {
            while (true) {
                try {
                    Resource r = exProducer.getNext();
                    if (r == null) {
                        break;
                    }
                    extractorOut.output(r);
                } catch (GZIPFormatException e) {
                    reportFailure("", exProducer.getContext(), e);
                    if (ProducerUtils.STRICT_GZ) {
                        throw e;
                    }
                    e.printStackTrace();
                } catch (ResourceParseException e) {
                    reportFailure("", exProducer.getContext(), e);
                    if (ProducerUtils.STRICT_GZ) {
                        throw e;
                    }
                    e.printStackTrace();
                } catch (RecoverableRecordFormatException e) {
                    // this should not get here - ResourceFactory et al should wrap as ResourceParseExceptions...
                    reportFailure("RECOVERABLE - ", exProducer.getContext(), e);
                    e.printStackTrace();
                }
            }
        } finally {
            // The CDX writer has no auto-flush; flush it here in case the
            // output implementation does not do so itself.
            if (cdxWriter != null) {
                cdxWriter.flush();
            }
        }
        return 0;
    }

    /**
     * @return the stream output is written to, or {@code null} if none was
     *         set (in which case {@code System.out} is used)
     */
    public OutputStream getOut() {
        return out;
    }

    /**
     * @param out the stream subsequent runs should write to
     */
    public void setOut(OutputStream out) {
        this.out = out;
    }
}