All downloads are free. Search and download functionality uses the official Maven repository.

org.archive.extract.WATExtractorOutput Maven / Gradle / Ivy

There is a newer version: 1.1.9
Show newest version
package org.archive.extract;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.nio.charset.Charset;
import java.text.ParseException;
import java.net.UnknownHostException;
import java.util.Date;

import org.archive.format.gzip.GZIPMemberWriter;
import org.archive.format.gzip.GZIPMemberWriterCommittedOutputStream;
import org.archive.format.http.HttpHeaders;
import org.archive.format.json.JSONUtils;
import org.archive.format.warc.WARCRecordWriter;
import org.archive.resource.MetaData;
import org.archive.resource.Resource;
import org.archive.util.IAUtils;
import org.archive.util.DateUtils;
import org.archive.util.StreamCopy;
import org.archive.util.io.CommitedOutputStream;
import org.json.JSONException;

import java.net.InetAddress;
import java.text.DateFormat;
import java.text.SimpleDateFormat;

import java.util.logging.Logger;

/**
 * {@link ExtractorOutput} that serializes the top-level {@link MetaData} of
 * each {@link Resource} as a JSON metadata record (a WAT record) using a
 * {@link WARCRecordWriter}. Every record is written through its own
 * {@link GZIPMemberWriterCommittedOutputStream} and committed once complete.
 * A single warcinfo record is emitted before the first resource record.
 */
public class WATExtractorOutput implements ExtractorOutput {
	WARCRecordWriter recW;
	/** True once the leading warcinfo record has been written. */
	private boolean wroteFirst;
	private final GZIPMemberWriter gzW;
	private static final int DEFAULT_BUFFER_RAM = 1024 * 1024;
	/** Buffer size handed to each per-record committed output stream. */
	private int bufferRAM = DEFAULT_BUFFER_RAM;
	private final static Charset UTF8 = Charset.forName("UTF-8");
	/** Output file name supplied by the caller; may be null or empty. */
	private final String outputFile;

	private static final Logger LOG = Logger.getLogger(WATExtractorOutput.class.getName());

	/**
	 * @param out        stream that receives the gzip members
	 * @param outputFile name recorded in the warcinfo record; when null or
	 *                   empty a name is derived from the container filename
	 *                   of the first resource
	 */
	public WATExtractorOutput(OutputStream out, String outputFile) {
		gzW = new GZIPMemberWriter(out);
		recW = new WARCRecordWriter();
		wroteFirst = false;
		this.outputFile = outputFile;
	}

	/** Creates a fresh committed output stream for one record. */
	private CommitedOutputStream getOutput() {
		return new GZIPMemberWriterCommittedOutputStream(gzW,bufferRAM);
	}

	/**
	 * Writes one metadata record for the given resource, preceded by the
	 * warcinfo record if this is the first call.
	 *
	 * @param resource the resource whose top-level metadata is written
	 * @throws IOException if Envelope.Format is missing or is neither an ARC
	 *                     nor a WARC format, if a required metadata field is
	 *                     absent, or if writing fails
	 */
	public void output(Resource resource) throws IOException {
		// Drain the resource before reading its metadata.
		// NOTE(review): presumably the metadata is only complete once the
		// stream has been consumed - confirm against Resource implementations.
		StreamCopy.readToEOF(resource.getInputStream());
		MetaData top = resource.getMetaData().getTopMetaData();
		CommitedOutputStream cos;
		if(!wroteFirst) {
			cos = getOutput();
			writeWARCInfo(cos,top);
			cos.commit();
			wroteFirst = true;
		}
		String envelopeFormat = JSONUtils.extractSingle(top, "Envelope.Format");
		if(envelopeFormat == null) {
			throw new IOException("Missing Envelope.Format");
		}
		cos = getOutput();
		if(envelopeFormat.startsWith("ARC")) {
			writeARC(cos,top);
		} else if(envelopeFormat.startsWith("WARC")) {
			writeWARC(cos,top);
		} else {
			throw new IOException("Unknown Envelope.Format");
		}
		cos.commit();
	}

	/**
	 * Writes the warcinfo record describing this output.
	 *
	 * @param recOut stream receiving the record
	 * @param md     metadata of the first resource, used to derive a filename
	 *               when none was given to the constructor
	 * @throws IOException if no filename was given and Container.Filename is
	 *                     absent, or if writing fails
	 */
	private void writeWARCInfo(OutputStream recOut, MetaData md) throws IOException {
		// filename is given in the command line
		String filename = outputFile;
		if (filename == null || filename.length() == 0) {
			// if no filename by command line, we construct a default filename based on container filename
			filename = JSONUtils.extractSingle(md, "Container.Filename");
			if (filename == null) {
				throw new IOException("No Container.Filename...");
			}
			// Names with any other suffix are used unchanged.
			if (filename.endsWith(".warc") || filename.endsWith(".warc.gz")) {
				filename = filename.replaceFirst("\\.warc$", ".warc.wat.gz");
				filename = filename.replaceFirst("\\.warc\\.gz$", ".warc.wat.gz");
			} else if (filename.endsWith(".arc") || filename.endsWith(".arc.gz")) {
				filename = filename.replaceFirst("\\.arc$", ".arc.wat.gz");
				filename = filename.replaceFirst("\\.arc\\.gz$", ".arc.wat.gz");
			}
		}
		// removing path from filename
		File tmpFile = new File(filename);
		filename = tmpFile.getName();
		HttpHeaders headers = new HttpHeaders();
		headers.add("software", IAUtils.COMMONS_VERSION);
		headers.addDateHeader("extractedDate", new Date());

		// add ip, hostname; best effort - the record is still written without them
		try {
			InetAddress host = InetAddress.getLocalHost();
			headers.add("ip", host.getHostAddress());
			headers.add("hostname", host.getCanonicalHostName());
		} catch (UnknownHostException e) {
			LOG.warning("unable to obtain local crawl engine host :\n"+e.getMessage());
		}

		headers.add("format", IAUtils.WARC_FORMAT);
		headers.add("conformsTo", IAUtils.WARC_FORMAT_CONFORMS_TO);
		// optional arguments
		if(IAUtils.OPERATOR != null && IAUtils.OPERATOR.length() > 0) {
			headers.add("operator", IAUtils.OPERATOR);
		}
		if(IAUtils.PUBLISHER != null && IAUtils.PUBLISHER.length() > 0) {
			headers.add("publisher", IAUtils.PUBLISHER);
		}
		if(IAUtils.WAT_WARCINFO_DESCRIPTION != null && IAUtils.WAT_WARCINFO_DESCRIPTION.length() > 0) {
			headers.add("description", IAUtils.WAT_WARCINFO_DESCRIPTION);
		}

		ByteArrayOutputStream baos = new ByteArrayOutputStream();
		headers.write(baos);
		recW.writeWARCInfoRecord(recOut,filename,baos.toByteArray());
	}

	/**
	 * Extracts a single value from the metadata, failing if it is absent.
	 *
	 * @param md   metadata to query
	 * @param path path of the value to extract
	 * @return the extracted value, never null
	 * @throws IOException if no value exists at {@code path}
	 */
	private String extractOrIO(MetaData md, String path) throws IOException {
		String value = JSONUtils.extractSingle(md, path);
		if(value == null) {
			throw new IOException("No "+path+" found.");
		}
		return value;
	}

	/**
	 * Writes the metadata record for a resource that came from an ARC file.
	 *
	 * @throws IOException if a required field is absent or writing fails
	 */
	private void writeARC(OutputStream recOut, MetaData md) throws IOException {
		String targetURI = extractOrIO(md, "Envelope.ARC-Header-Metadata.Target-URI");
		String capDateString = extractOrIO(md, "Envelope.ARC-Header-Metadata.Date");
		String filename = extractOrIO(md, "Container.Filename");
		String offset = extractOrIO(md, "Container.Offset");
		// No record id is read from the ARC header here, so build one from the
		// container filename and offset. The format string used to be empty,
		// which produced an empty record id and ignored both arguments.
		String recId = String.format("<urn:arc:%s:%s>",filename,offset);
		writeWARCMDRecord(recOut,md,targetURI,capDateString,recId);
	}

	/**
	 * Writes the metadata record for a resource that came from a WARC file.
	 * For warcinfo records the WARC-Filename is used in place of a target URI.
	 *
	 * @throws IOException if a required field is absent or writing fails
	 */
	private void writeWARC(OutputStream recOut, MetaData md) throws IOException {
		String warcType = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Type");
		String targetURI;
		if(warcType.equals("warcinfo")) {
			targetURI = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Filename");
		} else {
			targetURI = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Target-URI");
		}
		// handle date of generation in WARC format: the record date is the
		// current time, not the source record's WARC-Date.
		// NOTE(review): SimpleDateFormat formats in the JVM default time zone;
		// confirm DateUtils.getSecondsSinceEpoch parses the 14-digit string in
		// the same zone, otherwise the date is skewed by the local UTC offset.
		DateFormat dateFormat = new SimpleDateFormat("yyyyMMddHHmmss");
		String capDateString = dateFormat.format(new Date());
		String recId = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Record-ID");
		writeWARCMDRecord(recOut,md,targetURI,capDateString,recId);
	}

	/**
	 * Serializes the metadata as UTF-8 JSON and writes it as a JSON metadata
	 * record.
	 *
	 * @param recOut        stream receiving the record
	 * @param md            metadata to serialize as the record payload
	 * @param targetURI     target URI passed to the record writer
	 * @param capDateString date string parsed with
	 *                      {@code DateUtils.getSecondsSinceEpoch}; if it cannot
	 *                      be parsed the current time is used instead
	 * @param recId         record id passed to the record writer
	 * @throws IOException if JSON serialization or writing fails
	 */
	private void writeWARCMDRecord(OutputStream recOut, MetaData md,
			String targetURI, String capDateString, String recId)
	throws IOException {

		ByteArrayOutputStream bos = new ByteArrayOutputStream();

		OutputStreamWriter osw = new OutputStreamWriter(bos, UTF8);
		try {
			md.write(osw);
		} catch (JSONException e1) {
			// The cause travels with the IOException; no separate stack dump needed.
			throw new IOException(e1);
		}
		osw.flush();
		Date capDate;
		try {
			capDate = DateUtils.getSecondsSinceEpoch(capDateString);
		} catch (ParseException e) {
			// TODO... not the right thing...
			LOG.warning("unable to parse capture date '"+capDateString
					+"', using current time: "+e.getMessage());
			capDate = new Date();
		}

		recW.writeJSONMetadataRecord(recOut, bos.toByteArray(),
				targetURI, capDate, recId);
	}

	/**
	 * Builds a 14-character string from fixed positions of the input:
	 * characters 0-3, 5-6, 8-9, 11-12, 14-15 and 17-18, skipping the single
	 * characters in between. Presumably intended to turn a date such as
	 * {@code yyyy-MM-ddTHH:mm:ssZ} into {@code yyyyMMddHHmmss}. Not currently
	 * called from within this class.
	 *
	 * @param input string of at least 19 characters
	 * @return the concatenated digit groups
	 * @throws StringIndexOutOfBoundsException if input is shorter than 19 characters
	 */
	private static String transformWARCDate(final String input) {

		StringBuilder output = new StringBuilder(14);

		output.append(input.substring(0,4));
		output.append(input.substring(5,7));
		output.append(input.substring(8,10));
		output.append(input.substring(11,13));
		output.append(input.substring(14,16));
		output.append(input.substring(17,19));

		return output.toString();
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy