All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.archive.extract.ResourceExtractor Maven / Gradle / Ivy

There is a newer version: 1.1.9
Show newest version
package org.archive.extract;

import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.net.URISyntaxException;
import java.nio.charset.Charset;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.archive.RecoverableRecordFormatException;
import org.archive.format.gzip.GZIPFormatException;
import org.archive.resource.Resource;
import org.archive.resource.ResourceConstants;
import org.archive.resource.ResourceParseException;
import org.archive.resource.ResourceProducer;
import org.archive.url.WaybackURLKeyMaker;

/**
 * Command-line tool (also runnable through Hadoop's {@link ToolRunner}) that
 * reads records from an ARC/WARC source and writes one output entry per
 * record, in a format selected by the command-line arguments.
 *
 * <p>Output goes to the stream set via {@link #setOut(OutputStream)}, or to
 * {@code System.out} when none was set, unless an output file is named on the
 * command line. The tool never closes a stream it did not open itself.
 */
public class ResourceExtractor implements ResourceConstants, Tool {
	
	private final static Logger LOG =
		Logger.getLogger(ResourceExtractor.class.getName());
	/** Charset used for all character output produced by this tool. */
	Charset UTF8 = Charset.forName("utf-8");
	public final static String TOOL_NAME = "extractor";
	public static final String TOOL_DESCRIPTION = 
		"A tool for extracting metadata from WARC, ARC, and WAT files";
	/** Optional caller-supplied destination; {@code null} means stdout. */
	private OutputStream out;
	private Configuration conf;

	/**
	 * @param conf the Hadoop configuration to associate with this tool
	 */
	public void setConf(Configuration conf) {
		this.conf = conf;
	}

	/**
	 * @return the Hadoop configuration previously set, or {@code null}
	 */
	public Configuration getConf() {
		return conf;
	}

	/**
	 * Prints the command-line help to stderr.
	 *
	 * @param exitCode value to hand back unchanged, so callers can write
	 *        {@code return USAGE(1);}
	 * @return {@code exitCode}
	 */
	private static int USAGE(int exitCode) {
		System.err.println("Usage:\n");
		System.err.println("extractor [-strict] [OPT] SRC [OUT]");
		System.err.println("\tSRC is the local path, HTTP or HDFS URL to an " +
				"arc, warc, arc.gz, or warc.gz.");
		System.err.println("\tOUT, when given after OPT SRC, is a local file " +
				"that receives the output instead of stdout.");
		System.err.println("\t-strict aborts on the first gzip or parse error " +
				"instead of continuing with the next record.");
		System.err.println("\tOPT can be one of:");		
		System.err.println("\t\t-cdxURL\tProduce output in old URL Wayback CDX format");
		System.err.println("\t\t-cdx\tProduce output in NEW-SURT-Wayback CDX format");
		System.err.println("\t\t\t (note that column 1 is NOT standard Wayback canonicalized)\n");
		System.err.println("\t\t-wat\tembed JSON output in a compressed WARC " +
				"wrapper, for storage, or sharing.");
		return exitCode;
	}

	/**
	 * Entry point: runs the tool through {@link ToolRunner} and exits the JVM
	 * with the code returned by {@link #run(String[])}.
	 */
	public static void main(String[] args) throws Exception {
		int res = ToolRunner.run(new Configuration(), new ResourceExtractor(), args);
		System.exit(res);
	}
	
	/** Wraps {@code os} in a UTF-8 encoding {@link PrintWriter}. */
	private PrintWriter makePrintWriter(OutputStream os)
	{
		return new PrintWriter(new OutputStreamWriter(os, UTF8));
	}

	/**
	 * Parses the arguments, builds the matching output formatter and streams
	 * every record of the source through it.
	 *
	 * <p>Accepted shapes (each optionally preceded by {@code -strict}):
	 * {@code SRC}; {@code -cdx|-cdxURL|-wat SRC [OUT]}; or {@code SRC FILTER
	 * [OUT]}, where FILTER is passed to {@code JSONViewExtractorOutput}.
	 *
	 * @param args command-line arguments, 1 to 4 entries
	 * @return 0 on success, 1 when the arguments are unusable or no producer
	 *         could be created for SRC
	 * @throws FileNotFoundException if the output file cannot be opened
	 * @throws IOException on read/write failure
	 * @throws ResourceParseException in strict mode, on the first unparsable
	 *         record
	 */
	public int run(String[] args) 
	throws IndexOutOfBoundsException, FileNotFoundException, IOException,
	ResourceParseException, URISyntaxException {
		// TODO: parse CLI arguments better
		if(args.length < 1 || args.length > 4) {
			return USAGE(1);
		}
		Logger.getLogger("org.archive").setLevel(Level.WARNING);

		int arg = 0;
		if(args[0].equals("-strict")) {
			ProducerUtils.STRICT_GZ = true;
			arg++;
		}
		if(arg >= args.length) {
			// "-strict" was the only argument: there is no SRC to read.
			return USAGE(1);
		}

		OutputStream os = this.out == null ? System.out : this.out;
		// Only streams opened here are closed here; System.out and the
		// caller-supplied stream stay open for their owners.
		FileOutputStream fileOut = null;
		PrintWriter writer = null;
		try {
			String path = args[arg];
			String outputFile = null;
			ExtractorOutput output;
			if(args.length >= arg + 2) {
				// an output file is specified on the command line
				if(args.length == arg + 3) {
					outputFile = args[arg+2];
					fileOut = new FileOutputStream(outputFile);
					os = fileOut;
				}
				String mode = args[arg];
				if(mode.equals("-cdx")) {
					path = args[arg+1];
					writer = makePrintWriter(os);
					output = new RealCDXExtractorOutput(writer);
				} else if(mode.equals("-cdxURL")) {
					path = args[arg+1];
					writer = makePrintWriter(os);
					output = new RealCDXExtractorOutput(writer, new WaybackURLKeyMaker(false));
				} else if(mode.equals("-wat")) {
					path = args[arg+1];
					output = new WATExtractorOutput(os, outputFile);
				} else {
					// first argument is the source, second a JSON view filter
					String filter = args[arg+1];
					output = new JSONViewExtractorOutput(os, filter);
				}
			} else {
				output = new DumpingExtractorOutput(os);
			}

			ResourceProducer producer = ProducerUtils.getProducer(path);
			if(producer == null) {
				return USAGE(1);
			}
			ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper();
			ExtractingResourceProducer exProducer = 
				new ExtractingResourceProducer(producer, mapper);

			extractAll(exProducer, output);
			return 0;
		} finally {
			try {
				// Push out anything still buffered in the writer we created.
				if(writer != null) {
					writer.flush();
				}
				os.flush();
			} finally {
				if(fileOut != null) {
					fileOut.close();
				}
			}
		}
	}

	/**
	 * Pulls records from {@code exProducer} until it returns {@code null},
	 * handing each one to {@code output}. Format errors are reported and the
	 * loop continues, except that gzip and parse errors are rethrown when
	 * {@code ProducerUtils.STRICT_GZ} is set.
	 */
	private void extractAll(ExtractingResourceProducer exProducer,
			ExtractorOutput output)
	throws IndexOutOfBoundsException, FileNotFoundException, IOException,
	ResourceParseException, URISyntaxException {
		while(true) {
			try {
				Resource r = exProducer.getNext();
				if(r == null) {
					break;
				}
				output.output(r);
			} catch(GZIPFormatException e) {
				reportFailure("", exProducer.getContext(), e);
				if(ProducerUtils.STRICT_GZ) {
					throw e;
				}
				e.printStackTrace();
			} catch(ResourceParseException e) {
				reportFailure("", exProducer.getContext(), e);
				if(ProducerUtils.STRICT_GZ) {
					throw e;
				}
				e.printStackTrace();
			} catch(RecoverableRecordFormatException e) {
				// this should not get here - ResourceFactory et al should wrap as ResourceParseExceptions...
				reportFailure("RECOVERABLE - ", exProducer.getContext(), e);
				e.printStackTrace();
			}
		}
	}

	/**
	 * Reports a per-record failure both to the logger and directly to stderr
	 * (the logger output has been observed not to show up), one message per
	 * line.
	 *
	 * @param logPrefix text prepended to the logged message only
	 * @param context producer context describing where the failure occurred
	 * @param e the failure
	 */
	private static void reportFailure(String logPrefix, Object context, Exception e) {
		String message = String.format("%s: %s", context, e.getMessage());
		LOG.severe(logPrefix + message);
		System.err.println(message);
	}

	/**
	 * @return the out
	 */
	public OutputStream getOut() {
		return out;
	}
	/**
	 * @param out the out to set
	 */
	public void setOut(OutputStream out) {
		this.out = out;
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy