All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.archive.extract.WATExtractorOutput Maven / Gradle / Ivy

There is a newer version: 1.3.0
Show newest version
package org.archive.extract;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.nio.charset.Charset;
import java.text.ParseException;
import java.util.Date;

import org.archive.format.gzip.GZIPMemberWriter;
import org.archive.format.gzip.GZIPMemberWriterCommittedOutputStream;
import org.archive.format.http.HttpHeaders;
import org.archive.format.json.JSONUtils;
import org.archive.format.warc.WARCRecordWriter;
import org.archive.resource.MetaData;
import org.archive.resource.Resource;
import org.archive.util.IAUtils;
import org.archive.util.DateUtils;
import org.archive.util.StreamCopy;
import org.archive.util.io.CommitedOutputStream;
import org.json.JSONException;

/**
 * {@link ExtractorOutput} that writes the extracted metadata of each
 * {@link Resource} as a JSON metadata record through a
 * {@link WARCRecordWriter}, into a {@link GZIPMemberWriter} wrapping the
 * target stream.
 *
 * <p>Before the first resource is written, a single warcinfo record is
 * emitted. Every record is buffered in its own committed output stream and
 * committed only after it was written completely.
 *
 * <p>NOTE(review): this class keeps mutable state ({@code wroteFirst}) without
 * any synchronization, so it presumably expects single-threaded use - confirm
 * against callers.
 */
public class WATExtractorOutput implements ExtractorOutput {
	WARCRecordWriter recW;
	/** Becomes true once the leading warcinfo record has been written. */
	private boolean wroteFirst;
	private final GZIPMemberWriter gzW;
	/** Default buffer size, in bytes, handed to each committed output stream. */
	private static final int DEFAULT_BUFFER_RAM = 1024 * 1024;
	private int bufferRAM = DEFAULT_BUFFER_RAM;
	private final static Charset UTF8 = Charset.forName("UTF-8");

	/** Minimum length of a WARC-Date that carries all fields down to seconds. */
	private static final int WARC_DATE_MIN_LENGTH = 19;

	/**
	 * Creates an output that writes its records to {@code out}.
	 *
	 * @param out stream receiving the compressed records
	 */
	public WATExtractorOutput(OutputStream out) {
		gzW = new GZIPMemberWriter(out);
		recW = new WARCRecordWriter();
		wroteFirst = false;
	}

	/**
	 * Creates a fresh buffered stream for one record. Nothing is final until
	 * the caller invokes {@code commit()} on the returned stream.
	 */
	private CommitedOutputStream getOutput() {
		return new GZIPMemberWriterCommittedOutputStream(gzW,bufferRAM);
	}

	/**
	 * Writes one metadata record for {@code resource}, preceded by a warcinfo
	 * record if this is the first resource seen by this instance.
	 *
	 * @param resource the resource whose top-level metadata is written
	 * @throws IOException if required metadata is missing, the envelope
	 *         format is neither {@code ARC} nor {@code WARC}, or writing fails
	 */
	public void output(Resource resource) throws IOException {
		// Drain the resource first; presumably the metadata is only complete
		// once the whole stream was consumed - confirm against Resource.
		StreamCopy.readToEOF(resource.getInputStream());
		MetaData top = resource.getMetaData().getTopMetaData();
		CommitedOutputStream cos;
		if(!wroteFirst) {
			cos = getOutput();
			writeWARCInfo(cos,top);
			cos.commit();
			wroteFirst = true;
		}
		String envelopeFormat = JSONUtils.extractSingle(top, "Envelope.Format");
		if(envelopeFormat == null) {
			throw new IOException("Missing Envelope.Format");
		}
		cos = getOutput();
		if(envelopeFormat.equals("ARC")) {
			writeARC(cos,top);
		} else if(envelopeFormat.equals("WARC")) {
			writeWARC(cos,top);
		} else {
			throw new IOException("Unknown Envelope.Format: " + envelopeFormat);
		}
		// Only reached when the record was written without an exception.
		cos.commit();
	}

	/**
	 * Writes the warcinfo record. Its payload is a header block holding the
	 * {@code Software-Info} and {@code Extracted-Date} fields.
	 *
	 * @param recOut stream receiving the record
	 * @param md metadata providing {@code Container.Filename}
	 * @throws IOException if {@code Container.Filename} is absent or writing fails
	 */
	private void writeWARCInfo(OutputStream recOut, MetaData md) throws IOException {
		String filename = JSONUtils.extractSingle(md, "Container.Filename");
		if(filename == null) {
			throw new IOException("No Container.Filename...");
		}
		HttpHeaders headers = new HttpHeaders();
		headers.add("Software-Info", IAUtils.COMMONS_VERSION);
		headers.addDateHeader("Extracted-Date", new Date());
		ByteArrayOutputStream baos = new ByteArrayOutputStream();
		headers.write(baos);
		recW.writeWARCInfoRecord(recOut,filename,baos.toByteArray());
	}

	/**
	 * Extracts a single value from {@code md}, failing if it is absent.
	 *
	 * @param md metadata to search
	 * @param path path of the value, e.g. {@code Container.Filename}
	 * @return the extracted value, never {@code null}
	 * @throws IOException if no value exists at {@code path}
	 */
	private String extractOrIO(MetaData md, String path) throws IOException {
		String value = JSONUtils.extractSingle(md, path);
		if(value == null) {
			throw new IOException("No "+path+" found.");
		}
		return value;
	}

	/**
	 * Writes the metadata record for a resource that came from an ARC file.
	 *
	 * @param recOut stream receiving the record
	 * @param md top-level metadata of the resource
	 * @throws IOException if a required field is missing or writing fails
	 */
	private void writeARC(OutputStream recOut, MetaData md) throws IOException {
		String targetURI = extractOrIO(md, "Envelope.ARC-Header-Metadata.Target-URI");
		String capDateString = extractOrIO(md, "Envelope.ARC-Header-Metadata.Date");
		String filename = extractOrIO(md, "Container.Filename");
		String offset = extractOrIO(md, "Container.Offset");
		// No record id is read from the ARC metadata, so one is synthesized
		// from the container filename and offset. The format string must
		// contain both placeholders; an empty one yields an empty id.
		String recId = String.format("<urn:arc:%s:%s>",filename,offset);
		writeWARCMDRecord(recOut,md,targetURI,capDateString,recId);
	}

	/**
	 * Writes the metadata record for a resource that came from a WARC file.
	 * For {@code warcinfo} records the WARC-Filename is used in place of the
	 * target URI.
	 *
	 * @param recOut stream receiving the record
	 * @param md top-level metadata of the resource
	 * @throws IOException if a required field is missing, the WARC-Date is
	 *         too short to be converted, or writing fails
	 */
	private void writeWARC(OutputStream recOut, MetaData md) throws IOException {
		String warcType = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Type");
		String targetURI;
		if(warcType.equals("warcinfo")) {
			targetURI = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Filename");
		} else {
			targetURI = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Target-URI");
		}
		String capDateString = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Date");
		capDateString = transformWARCDate(capDateString);
		String recId = extractOrIO(md, "Envelope.WARC-Header-Metadata.WARC-Record-ID");
		writeWARCMDRecord(recOut,md,targetURI,capDateString,recId);
	}

	/**
	 * Serializes {@code md} as UTF-8 and writes it as a JSON metadata record.
	 *
	 * @param recOut stream receiving the record
	 * @param md metadata to serialize as the record payload
	 * @param targetURI target URI of the record
	 * @param capDateString capture date, parsed by
	 *        {@code DateUtils.getSecondsSinceEpoch}
	 * @param recId id of the record this metadata refers to
	 * @throws IOException if serialization or writing fails
	 */
	private void writeWARCMDRecord(OutputStream recOut, MetaData md, 
			String targetURI, String capDateString, String recId)
	throws IOException {

		ByteArrayOutputStream bos = new ByteArrayOutputStream();

		OutputStreamWriter osw = new OutputStreamWriter(bos, UTF8);
		try {
			md.write(osw);
		} catch (JSONException e1) {
			// The cause is preserved in the rethrown exception.
			throw new IOException(e1);
		}
		// Push buffered characters into bos before its bytes are read.
		osw.flush();
		Date capDate;
		try {
			capDate = DateUtils.getSecondsSinceEpoch(capDateString);

		} catch (ParseException e) {
			// Best effort: report the problem and fall back to the current
			// time. TODO... not the right thing...
			e.printStackTrace();
			capDate = new Date();
		}
		
		recW.writeJSONMetadataRecord(recOut, bos.toByteArray(),
				targetURI, capDate, recId);
	}

	/**
	 * Converts a WARC-Date into a 14 character timestamp by concatenating the
	 * characters at positions 0-3, 5-6, 8-9, 11-12, 14-15 and 17-18, i.e.
	 * dropping the single separator characters in between (for an input like
	 * {@code 2012-05-21T10:15:30Z} this yields {@code 20120521101530}).
	 *
	 * @param input the WARC-Date value, at least 19 characters long
	 * @return the 14 character timestamp
	 * @throws IOException if {@code input} is shorter than 19 characters
	 */
	private static String transformWARCDate(final String input) throws IOException {
		if(input.length() < WARC_DATE_MIN_LENGTH) {
			// Fail with a checked, descriptive error rather than a
			// StringIndexOutOfBoundsException from substring().
			throw new IOException("Malformed WARC-Date: " + input);
		}
		
		StringBuilder output = new StringBuilder(14);
		
		output.append(input.substring(0,4));
		output.append(input.substring(5,7));
		output.append(input.substring(8,10));
		output.append(input.substring(11,13));
		output.append(input.substring(14,16));
		output.append(input.substring(17,19));
		
		return output.toString();
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy