
org.jwat.tools.tasks.arc2warc.Arc2Warc Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of jwat-tools Show documentation
Show all versions of jwat-tools Show documentation
JWAT-Tools uses the available JWAT libraries to make high level tasks available either from command-line or programmatically.
Common tasks include: Test, Compress, Decompress, CDX, Arc2Warc.
More specialised tasks include: Changed, ContainerMD, Delete, Extract, Interval, PathIndex, Unpack, Headers2CDX.
package org.jwat.tools.tasks.arc2warc;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.RandomAccessFile;
import java.util.GregorianCalendar;
import java.util.LinkedList;
import java.util.List;
import java.util.TimeZone;
import java.util.UUID;
import org.jwat.arc.ArcReader;
import org.jwat.arc.ArcReaderFactory;
import org.jwat.arc.ArcRecordBase;
import org.jwat.archive.ManagedPayload;
import org.jwat.common.Base32;
import org.jwat.common.ByteCountingPushBackInputStream;
import org.jwat.common.HttpHeader;
import org.jwat.common.RandomAccessFileInputStream;
import org.jwat.warc.WarcConstants;
import org.jwat.warc.WarcDigest;
import org.jwat.warc.WarcRecord;
import org.jwat.warc.WarcWriter;
import org.jwat.warc.WarcWriterFactory;
// TODO Check status and complete.
public class Arc2Warc {
/**
* Source file.
* NOTE(review): never assigned in the visible code; the {@code srcFile}
* parameter of {@code arc2warc} shadows this field - confirm whether it is still needed.
*/
public File srcFile;
/**
* Payload repairer, obtained from {@code RepairPayload.getRepairPayload()} at the
* start of {@code arc2warc} and applied to records whose HTTP header is missing or invalid.
*/
protected RepairPayload repairPayload;
/**
* Collects the exceptions caught while converting (I/O errors from the conversion
* itself and from closing resources, plus any other {@code Throwable}).
* Raw-typed list; the visible code only ever adds {@code Throwable} instances.
*/
public List exceptionList = new LinkedList();
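/**
* Converts an ARC file into a WARC file placed in {@code options.destDir}.
* Exceptions caught during the conversion are added to {@link #exceptionList}.
* @param srcFile a valid arc(.gz) file
* @param options conversion options; {@code prefix}, {@code destDir} and {@code bOverwrite} are read here
*/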
public void arc2warc(File srcFile, Arc2WarcOptions options) {
repairPayload = RepairPayload.getRepairPayload();
try {
String srcFname = srcFile.getName();
RandomAccessFile raf = null;
RandomAccessFileInputStream rafin;
ByteCountingPushBackInputStream file_in = null;
ArcReader reader = null;
boolean bSrcCompressed;
ArcRecordBase arcRecord;
BufferedOutputStream file_out = null;
WarcWriter writer = null;
boolean bDestCompressed;
WarcRecord record;
Long contentLength;
String contentType;
ManagedPayload managedPayload = null;
InputStream payloadStream;
HttpHeader httpHeader;
WarcDigest warcBlockDigest;
WarcDigest warcPayloadDigest;
try {
/*
* Source.
*/
raf = new RandomAccessFile( srcFile, "r" );
rafin = new RandomAccessFileInputStream( raf );
file_in = new ByteCountingPushBackInputStream( new BufferedInputStream( rafin, 8192 ), 32 );
reader = ArcReaderFactory.getReader(file_in, 8192);
bSrcCompressed = reader.isCompressed();
/*
* Destination.
*/
// TODO select converted compression on/off/same.
bDestCompressed = false;
String dstFname = options.prefix + srcFname;
if (dstFname.toLowerCase().endsWith(".gz")) {
dstFname = dstFname.substring( 0, dstFname.length() - ".gz".length() );
}
if (dstFname.toLowerCase().endsWith(".arc")) {
dstFname = dstFname.substring( 0, dstFname.length() - ".arc".length() );
}
dstFname += ".warc";
if (bDestCompressed) {
dstFname += ".gz";
}
String tmpFname = dstFname + ".open";
File tmpDstFile = new File(options.destDir, tmpFname);
File dstFile = new File(options.destDir, dstFname);
if (dstFile.exists()) {
if (!dstFile.isFile()) {
throw new IOException("Destination file is a directory: '" + dstFile.getPath() + "'");
}
if (options.bOverwrite && !dstFile.delete()) {
throw new IOException("Could not delete file: '" + dstFile.getPath() + "'");
}
}
if (!dstFile.exists()) {
if (tmpDstFile.exists()) {
if (!tmpDstFile.isFile()) {
throw new IOException("Temporary destination file is a directory: '" + tmpDstFile.getPath() + "'");
}
if (!tmpDstFile.delete()) {
throw new IOException("Could not delete file: '" + tmpDstFile.getPath() + "'");
}
}
file_out = new BufferedOutputStream(new FileOutputStream(tmpDstFile), 8192);
writer = WarcWriterFactory.getWriter(file_out, 8192, bDestCompressed);
// debug
//System.out.println(srcFname + " -> " + dstFname);
managedPayload = ManagedPayload.checkout();
/*
* Loop record(s).
*/
UUID warcinfoUuid = null;
UUID filedescUuid = null;
UUID recordUuid = null;
int recordCount = 0;
while ((arcRecord = reader.getNextRecord()) != null) {
/*
* Generate filedesc uuid if the arc record is a version block record.
*/
if (arcRecord.recordType == ArcRecordBase.RT_VERSION_BLOCK) {
filedescUuid = UUID.randomUUID();
}
/*
* Is the first record a version block record?
*/
if (recordCount == 0 && arcRecord.recordType != ArcRecordBase.RT_VERSION_BLOCK) {
// TODO Warning, missing filedesc as first record in ARC file.
}
/*
* Write a warcinfo record if it is the first record or if it is a version block record.
*/
if (recordCount == 0 || arcRecord.recordType == ArcRecordBase.RT_VERSION_BLOCK) {
GregorianCalendar cal = new GregorianCalendar();
cal.setTimeZone(TimeZone.getTimeZone("UTC"));
cal.setTimeInMillis(System.currentTimeMillis());
warcinfoUuid = UUID.randomUUID();
record = WarcRecord.createRecord(writer);
record.header.addHeader(WarcConstants.FN_WARC_TYPE, WarcConstants.RT_WARCINFO);
record.header.addHeader(WarcConstants.FN_WARC_DATE, cal.getTime(), null);
record.header.addHeader(WarcConstants.FN_WARC_FILENAME, dstFname);
record.header.addHeader(WarcConstants.FN_WARC_RECORD_ID, "");
record.header.addHeader(WarcConstants.FN_CONTENT_TYPE, "application/warc-fields");
record.header.addHeader(WarcConstants.FN_CONTENT_LENGTH, "0");
// Standard says no.
//record.header.addHeader(WarcConstants.FN_WARC_CONCURRENT_TO, "");
writer.writeHeader(record);
writer.closeRecord();
++recordCount;
}
/*
* Write filedesc metadata if the record is a version block record.
*/
if (arcRecord.recordType == ArcRecordBase.RT_VERSION_BLOCK) {
managedPayload.manageVersionBlock(arcRecord, true);
contentType = "text/plain";
contentLength = managedPayload.payloadLength;
warcBlockDigest = WarcDigest.createWarcDigest("SHA1", managedPayload.blockDigestBytes, "base32", Base32.encodeArray(managedPayload.blockDigestBytes));
record = WarcRecord.createRecord(writer);
record.header.addHeader(WarcConstants.FN_WARC_TYPE, WarcConstants.RT_METADATA);
record.header.addHeader(WarcConstants.FN_WARC_TARGET_URI, arcRecord.header.urlUri, arcRecord.header.urlStr );
record.header.addHeader(WarcConstants.FN_WARC_DATE, arcRecord.header.archiveDate, arcRecord.header.archiveDateStr);
record.header.addHeader(WarcConstants.FN_WARC_RECORD_ID, "");
record.header.addHeader(WarcConstants.FN_WARC_CONCURRENT_TO, "");
record.header.addHeader(WarcConstants.FN_WARC_IP_ADDRESS, arcRecord.header.inetAddress, arcRecord.header.ipAddressStr);
record.header.addHeader(WarcConstants.FN_WARC_WARCINFO_ID, "");
record.header.addHeader(WarcConstants.FN_WARC_BLOCK_DIGEST, warcBlockDigest, null);
record.header.addHeader(WarcConstants.FN_CONTENT_LENGTH, contentLength, null);
record.header.addHeader(WarcConstants.FN_CONTENT_TYPE, contentType);
writer.writeHeader(record);
payloadStream = managedPayload.getPayloadStream();
if (payloadStream != null) {
writer.streamPayload(payloadStream);
payloadStream.close();
payloadStream = null;
}
writer.closeRecord();
arcRecord.close();
++recordCount;
}
else {
/*
* Response.
*/
/*
if (recordCount == 901) {
System.out.println(recordCount);
}
*/
managedPayload.manageArcRecord(arcRecord, true);
httpHeader = managedPayload.httpHeader;
if (httpHeader == null || !httpHeader.isValid()) {
//savePayload(managedPayload);
// Optional
/*
int number = getNextPayloadErrorNumber();
savePayloadErrorArc(arcRecord, managedPayload, number);
*/
managedPayload = repairPayload.repairPayload(managedPayload, arcRecord.header.contentTypeStr, arcRecord.header.archiveDate);
}
httpHeader = managedPayload.httpHeader;
if (httpHeader != null && httpHeader.isValid()) {
if (httpHeader.headerType == HttpHeader.HT_RESPONSE) {
contentType = "application/http; msgtype=response";
}
else if (httpHeader.headerType == HttpHeader.HT_REQUEST) {
contentType = "application/http; msgtype=request";
}
else {
throw new IllegalStateException("Unknown header type!");
}
} else {
contentType = arcRecord.header.contentTypeStr;
}
warcBlockDigest = WarcDigest.createWarcDigest("SHA1", managedPayload.blockDigestBytes, "base32", Base32.encodeArray(managedPayload.blockDigestBytes));
warcPayloadDigest = WarcDigest.createWarcDigest("SHA1", managedPayload.payloadDigestBytes, "base32", Base32.encodeArray(managedPayload.payloadDigestBytes));
recordUuid = UUID.randomUUID();
record = WarcRecord.createRecord(writer);
record.header.addHeader(WarcConstants.FN_WARC_TYPE, WarcConstants.RT_RESPONSE);
record.header.addHeader(WarcConstants.FN_WARC_TARGET_URI, arcRecord.header.urlUri, arcRecord.header.urlStr);
record.header.addHeader(WarcConstants.FN_WARC_DATE, arcRecord.header.archiveDate, arcRecord.header.archiveDateStr);
record.header.addHeader(WarcConstants.FN_WARC_RECORD_ID, "");
record.header.addHeader(WarcConstants.FN_WARC_IP_ADDRESS, arcRecord.header.inetAddress, arcRecord.header.ipAddressStr);
record.header.addHeader(WarcConstants.FN_WARC_WARCINFO_ID, "");
contentLength = managedPayload.httpHeaderLength + managedPayload.payloadLength;
if (contentLength > 0) {
record.header.addHeader(WarcConstants.FN_WARC_BLOCK_DIGEST, warcBlockDigest, null);
if (managedPayload.httpHeaderLength > 0 && managedPayload.payloadLength > 0) {
record.header.addHeader(WarcConstants.FN_WARC_PAYLOAD_DIGEST, warcPayloadDigest, null);
}
}
record.header.addHeader(WarcConstants.FN_CONTENT_LENGTH, contentLength, null);
if (contentType != null) {
record.header.addHeader(WarcConstants.FN_CONTENT_TYPE, contentType);
}
writer.writeHeader(record);
InputStream httpHeaderStream = managedPayload.getHttpHeaderStream();
if (httpHeaderStream != null) {
writer.streamPayload(httpHeaderStream);
httpHeaderStream.close();
httpHeaderStream = null;
}
payloadStream = managedPayload.getPayloadStream();
if (payloadStream != null) {
writer.streamPayload(payloadStream);
payloadStream.close();
payloadStream = null;
}
writer.closeRecord();
arcRecord.close();
++recordCount;
}
}
if (!tmpDstFile.renameTo(dstFile)) {
throw new IOException("Could not rename '" + tmpDstFile.getPath() + "' to '" + dstFile.getPath() + "'");
}
}
}
catch (FileNotFoundException e) {
exceptionList.add(e);
}
catch (IOException e) {
exceptionList.add(e);
}
finally {
if (managedPayload != null) {
managedPayload.checkin();
}
if (writer != null) {
try {
writer.close();
writer = null;
}
catch (IOException e) {
exceptionList.add(e);
}
}
if (file_out != null) {
try {
file_out.close();
file_out = null;
}
catch (IOException e) {
exceptionList.add(e);
}
}
if (reader != null) {
// TODO arcreader.close throw exception?
reader.close();
reader = null;
/*
try {
}
catch (IOException e) {
}
*/
}
if (file_in != null) {
try {
file_in.close();
file_in = null;
}
catch (IOException e) {
exceptionList.add(e);
}
}
if (raf != null) {
try {
raf.close();
raf = null;
}
catch (IOException e) {
exceptionList.add(e);
}
}
}
} catch (Throwable t) {
exceptionList.add(t);
}
for (int i=0; i
© 2015 - 2025 Weber Informatics LLC | Privacy Policy