org.archive.io.arc.ARC2WCDX Maven / Gradle / Ivy
The newest version!
/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.io.arc;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.util.Date;
import java.util.Iterator;
import java.util.zip.GZIPOutputStream;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HeaderGroup;
import org.apache.commons.httpclient.util.DateParseException;
import org.apache.commons.httpclient.util.DateUtil;
import org.archive.io.ArchiveRecord;
import org.archive.util.ArchiveUtils;
import org.archive.util.SURT;
/**
 * Create a 'Wide' CDX from an ARC. Takes one argument, the path to the ARC.
 * Writes .wcdx.gz in same directory.
 *
 * <p>While the index is being written it lives in a file with an extra
 * <code>.open</code> suffix. The suffix is only removed once every record
 * has been written and the output stream has been closed without error,
 * so a leftover <code>.open</code> file marks a failed or incomplete run.
 *
 * @author gojomo
 */
public class ARC2WCDX {
    final public static String WCDX_VERSION="0.1";

    // Fully-qualified so that no additional file-level import is needed.
    private static final java.util.logging.Logger LOGGER =
        java.util.logging.Logger.getLogger(ARC2WCDX.class.getName());

    /**
     * Command-line entry point.
     *
     * @param args exactly one element is used: the path to the ARC
     * @throws IOException if the ARC cannot be opened or closed
     * @throws IllegalArgumentException if no ARC path is supplied
     */
    public static void main(String[] args) throws IOException {
        if (args == null || args.length < 1) {
            throw new IllegalArgumentException(
                "Usage: java " + ARC2WCDX.class.getName() + " <path-to-arc>");
        }
        createWcdx(args[0]);
    }

    /**
     * Open the named ARC, write its WCDX, and close the ARC again.
     *
     * @param arcFilename path of the ARC to index
     * @return see {@link #createWcdx(ARCReader)}
     * @throws IOException if the ARC cannot be opened or closed
     */
    public static Object[] createWcdx(String arcFilename) throws IOException {
        ARCReader reader = ARCReaderFactory.get(arcFilename);
        try {
            return createWcdx(reader);
        } finally {
            // Release the reader even when indexing fails unexpectedly.
            reader.close();
        }
    }

    /**
     * Write a WCDX for every record the given reader yields.
     *
     * <p>Failures while writing are not propagated ("soldier on"): they are
     * logged and the <code>.open</code> file is left behind as an indicator
     * of the error. The reader is not closed by this method.
     *
     * @param reader source of ARC records
     * @return two-element array: <code>[0]</code> the intended final WCDX
     *         path (String; returned even if the run failed, in which case
     *         the data is at that path plus <code>.open</code>),
     *         <code>[1]</code> the number of records written (Long)
     * @throws RuntimeException if the run failed and the <code>.open</code>
     *         marker file could not be created either
     */
    public static Object[] createWcdx(ARCReader reader) {
        reader.setDigest(true);
        String wcdxPath = reader.getReaderIdentifier()
            .replaceAll("\\.arc(\\.gz)?$",".wcdx.gz");
        File wcdxFile = new File(wcdxPath+".open");
        PrintStream writer = null;
        long count = 0;
        boolean completed = false;
        try {
            writer = new PrintStream(
                new GZIPOutputStream(new FileOutputStream(wcdxFile)));
            // write header: legend + timestamp
            writer.println(buildLegend());
            Iterator<?> iter = reader.iterator();
            while (iter.hasNext()) {
                ARCRecord record = (ARCRecord) iter.next();
                // NOTE(review): the record is closed before its metadata is
                // read -- presumably so it is consumed to the end and the
                // digest is final before getDigestStr(); confirm.
                record.close();
                writer.println(buildLine(reader, record));
                count++;
            }
            completed = true;
        } catch (IOException e) {
            // soldier on: but leave '.open' wcdx file as indicator of error
            LOGGER.log(java.util.logging.Level.WARNING,
                "Problem writing " + wcdxFile, e);
            ensureErrorMarkerExists(wcdxFile);
        } catch (RuntimeException e) {
            // soldier on: but leave '.open' wcdx file as indicator of error
            LOGGER.log(java.util.logging.Level.WARNING,
                "Problem writing " + wcdxFile, e);
            ensureErrorMarkerExists(wcdxFile);
        } finally {
            if (writer != null) {
                writer.close();
            }
        }
        if (completed) {
            // Only promote the file after the stream is closed (gzip trailer
            // written, file handle released) and only if PrintStream, which
            // never throws on write failures, recorded no error.
            if (writer.checkError()) {
                LOGGER.warning("Write error on " + wcdxFile
                    + "; leaving '.open' file as indicator of error");
            } else if (!wcdxFile.renameTo(new File(wcdxPath))) {
                LOGGER.warning("Unable to rename " + wcdxFile
                    + " to " + wcdxPath);
            }
        }
        return new Object[] {wcdxPath, count};
    }

    /** Build the space-delimited legend (header) line of the WCDX. */
    private static String buildLegend() {
        StringBuilder legend = new StringBuilder();
        appendField(legend,"CDX");
        appendField(legend,"surt-uri");
        appendField(legend,"b"); // ARC timestamp
        appendField(legend,"http-date");
        appendField(legend,"s"); // status code
        appendField(legend,"m"); // media type
        appendField(legend,"sha1"); // content sha1
        appendField(legend,"g"); // ARC name
        appendField(legend,"V"); // start offset
        appendField(legend,"end-offset"); // TODO: implement
        appendField(legend,"n"); // ARC record length TODO: verify
        appendField(legend,"http-content-length");
        appendField(legend,"http-last-modified");
        appendField(legend,"http-expires");
        appendField(legend,"http-etag");
        appendField(legend,"http-location");
        appendField(legend,"e"); // IP
        appendField(legend,"a"); // original URL
        // WCDX version+creation time: crude version control
        appendField(legend,WCDX_VERSION+"@"+ArchiveUtils.get14DigitDate());
        return legend.toString();
    }

    /**
     * Build the space-delimited WCDX line for one (already closed) record.
     * Fields appear in the same order as in the legend.
     */
    private static String buildLine(ARCReader reader, ARCRecord record)
    throws IOException {
        ARCRecordMetaData h = (ARCRecordMetaData) record.getHeader();
        Header[] httpHeaders = record.getHttpHeaders();
        if (httpHeaders == null) {
            httpHeaders = new Header[0];
        }
        HeaderGroup hg = new HeaderGroup();
        hg.setHeaders(httpHeaders);
        StringBuilder builder = new StringBuilder();
        // SURT-form URI
        appendField(builder,SURT.fromURI(h.getUrl()));
        // record timestamp ('b')
        appendField(builder,h.getDate());
        // http header date
        appendTimeField(builder,hg.getFirstHeader("Date"));
        // response code ('s')
        appendField(builder,h.getStatusCode());
        // media type ('m')
        appendField(builder,h.getMimetype());
        // content checksum (like 'c', but here Base32 SHA1)
        appendField(builder,record.getDigestStr());
        // arc name ('g')
        appendField(builder,reader.getFileName());
        // compressed start offset ('V')
        appendField(builder,h.getOffset());
        // compressed end offset: TODO; leave unavail for now
        appendField(builder, "-");
        // uncompressed (declared in ARC headerline) record length
        appendField(builder,h.getLength());
        // http header content-length
        appendField(builder,hg.getFirstHeader("Content-Length"));
        // http header mod-date
        appendTimeField(builder,hg.getFirstHeader("Last-Modified"));
        // http header expires
        appendTimeField(builder,hg.getFirstHeader("Expires"));
        // http header etag
        appendField(builder,hg.getFirstHeader("ETag"));
        // http header redirect ('Location' header?)
        appendField(builder,hg.getFirstHeader("Location"));
        // ip ('e')
        appendField(builder,h.getIp());
        // original URI
        appendField(builder,h.getUrl());
        // TODO MAYBE - a title from inside content?
        return builder.toString();
    }

    /**
     * Make sure the '.open' file exists so that a failed run is visible
     * on disk even when the failure happened before the file was created.
     *
     * @throws RuntimeException wrapping the IOException if the marker
     *         cannot be created
     */
    private static void ensureErrorMarkerExists(File wcdxFile) {
        if (wcdxFile.exists()) {
            return;
        }
        try {
            wcdxFile.createNewFile();
        } catch (IOException e1) {
            throw new RuntimeException(e1);
        }
    }

    /** Append the single-space field delimiter unless this is the first field. */
    private static void appendDelimiter(StringBuilder builder) {
        if (builder.length() > 0) {
            builder.append(' ');
        }
    }

    /**
     * Append one field, preceded by a delimiter when the builder is not
     * empty. A {@link Header} contributes its trimmed value; a null or
     * empty value is written as "-".
     *
     * <p>NOTE(review): values are written verbatim, so a value containing
     * a space would shift the following columns -- confirm whether
     * consumers rely on a fixed field count.
     *
     * @param builder line under construction
     * @param obj field value; may be null
     */
    protected static void appendField(StringBuilder builder, Object obj) {
        appendDelimiter(builder);
        if (obj instanceof Header) {
            // A header may carry no value at all; treat that as absent.
            String value = ((Header) obj).getValue();
            obj = (value == null) ? null : value.trim();
        }
        builder.append((obj==null||obj.toString().length()==0)?"-":obj);
    }

    /**
     * Append one date field, preceded by a delimiter when the builder is
     * not empty. A {@link Header} value is parsed as an HTTP date and
     * written in 14-digit form; an unparseable value is written as "e" and
     * a missing header or missing value as "-". Any other non-null object
     * is appended as-is.
     *
     * @param builder line under construction
     * @param obj header (or other value) to append; may be null
     */
    protected static void appendTimeField(StringBuilder builder, Object obj) {
        appendDelimiter(builder);
        if (obj == null) {
            builder.append("-");
            return;
        }
        if (obj instanceof Header) {
            String value = ((Header) obj).getValue();
            if (value == null) {
                builder.append("-");
                return;
            }
            try {
                Date date = DateUtil.parseDate(value.trim());
                String d = ArchiveUtils.get14DigitDate(date);
                // Dates that come out as 209x are rewritten to 199x --
                // presumably two-digit '9x' years resolved into the wrong
                // century by the parser; confirm.
                if (d.startsWith("209")) {
                    d = "199"+d.substring(3);
                }
                obj = d;
            } catch (DateParseException e) {
                // 'e' flags a present-but-unparseable date
                builder.append('e');
                return;
            }
        }
        builder.append(obj);
    }
}
//'wide' CDX
//a original url
//b timestamp
//s resp code
//m type
//? content md5 (full 'k'? 'c'?)
//g arc name
//V compressed start offset
//? compressed length
//n? uncompressed length
//? mod date
//? expires
//? server 'date' hdr
//? etag
//r redirect ('Location'?)
//e ip
//MAYBE:
//? TITLE from HTML or other format?
© 2015 - 2024 Weber Informatics LLC | Privacy Policy