
org.jwat.tools.tasks.arc2warc.RepairPayload Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of jwat-tools Show documentation
JWAT-Tools uses the available JWAT libraries to make high level tasks available either from command-line or programmatically.
Common tasks include: Test, Compress, Decompress, CDX, Arc2Warc.
More specialised tasks include: Changed, ContainerMD, Delete, Extract, Interval, PathIndex, Unpack, Headers2CDX.
package org.jwat.tools.tasks.arc2warc;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;
import java.util.TimeZone;
import org.jwat.arc.ArcConstants;
import org.jwat.archive.ManagedPayload;
import org.jwat.common.ArrayUtils;
import org.jwat.common.ByteCountingPushBackInputStream;
import org.jwat.common.ContentType;
import org.jwat.common.HttpHeader;
import org.jwat.tools.core.ManagedPayloadContentType;
import org.jwat.tools.core.ManagedPayloadContentTypeIdentifier;
public class RepairPayload {
/**
 * Per-thread holder of {@link RepairPayload} instances.
 * The first {@code get()} on a thread creates that thread's instance through
 * {@code initialValue()}. The type argument is spelled out so that
 * {@code get()} returns a {@code RepairPayload} rather than {@code Object}.
 */
private static final ThreadLocal<RepairPayload> RepairPayloadTL = new ThreadLocal<RepairPayload>() {
    @Override
    public RepairPayload initialValue() {
        return new RepairPayload();
    }
};
/** Formatter for the value of the generated "Date: " header line. */
private SimpleDateFormat dateFormat;

/**
 * Creates a new RepairPayload object.
 * Sets up the date formatter with the pattern
 * "EEE, dd MMM yyyy HH:mm:ss z", the US locale and the GMT time zone.
 */
private RepairPayload() {
    final SimpleDateFormat headerDateFormat = new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss z", Locale.US);
    final TimeZone gmt = TimeZone.getTimeZone("GMT");
    headerDateFormat.setTimeZone(gmt);
    dateFormat = headerDateFormat;
}
/**
 * Returns the RepairPayload instance held for the calling thread.
 * @return the value stored in the thread-local holder for the current thread
 */
public static RepairPayload getRepairPayload() {
    final RepairPayload threadInstance = RepairPayloadTL.get();
    return threadInstance;
}
/**
 * Scratch buffer (16 KiB). repairPayload() reads the beginning of a text
 * payload into it. The buffer is not cleared between calls, so bytes at or
 * beyond {@code limit} may be left over from an earlier payload.
 */
private byte[] tmpBuf = new byte[16384];
/** Current offset into tmpBuf; reset to 0 at the start of repairPayload(). */
private int position = 0;
/** Number of bytes of tmpBuf filled by the most recent read in repairPayload(). */
private int limit = 0;
/**
 * Push-back wrapper around the payload stream. Bytes read into tmpBuf that
 * belong to the payload are unread into it before it is handed to the new
 * ManagedPayload.
 */
private PushbackInputStream pbin;
/** Collects the bytes of the HTTP header being built; reset before each use. */
private ByteArrayOutputStream httpOut = new ByteArrayOutputStream();
/**
 * Cleared at the start of repairPayload() and checked before each
 * tryrepair_insert_* attempt.
 * NOTE(review): presumably set by those methods on success - confirm against their bodies.
 */
private boolean bRepaired = false;
/**
 * Tries to give a payload a usable HTTP response header.
 * <p>
 * The content type is parsed from {@code contentTypeStr}, or guessed by the
 * content type identifier when the string is the ARC "no-type" value. If no
 * content type can be determined the payload is returned untouched.
 * <ul>
 * <li>Non-text payloads get a generated "HTTP/1.1 200 OK" header.</li>
 * <li>For text payloads the first bytes (at most {@code tmpBuf.length}) are
 * inspected. Three known malformed status lines are replaced by a proper
 * one; otherwise the tryrepair_insert_200/404/500 methods are tried in
 * that order until {@code bRepaired} is set.</li>
 * </ul>
 * Only the {@code limit} bytes actually read are trusted: the buffer is
 * reused between calls, so anything beyond {@code limit} may be stale data
 * from an earlier payload.
 *
 * @param managedPayload payload to inspect
 * @param contentTypeStr content type string of the record
 * @param date value for the generated "Date: " header line, or null to omit it
 * @return the replacement payload when one of the cases handled directly in
 *         this method applied, the result of the last tryrepair_insert_*
 *         call when those were used, otherwise the payload passed in
 * @throws IOException if reading the payload or building the header fails
 */
public ManagedPayload repairPayload(ManagedPayload managedPayload, String contentTypeStr, Date date) throws IOException {
    ContentType contentType = null;
    // This instance is reused between calls, so reset all per-call state.
    position = 0;
    limit = 0;
    pbin = null;
    httpOut.reset();
    bRepaired = false;
    InputStream payloadStream = null;
    try {
        /*
         * no-type -> let the identifier guess (original note: libmagic identify).
         */
        if (ArcConstants.CONTENT_TYPE_NO_TYPE.equalsIgnoreCase(contentTypeStr)) {
            ManagedPayloadContentTypeIdentifier managedPayloadContentTypeIdentifier = ManagedPayloadContentTypeIdentifier.getManagedPayloadContentTypeIdentifier();
            ManagedPayloadContentType managedPayloadContentType = managedPayloadContentTypeIdentifier.guestimateContentType(managedPayload);
            if (managedPayloadContentType != null) {
                contentType = managedPayloadContentType.contentType;
            }
        } else {
            contentType = ContentType.parseContentType(contentTypeStr);
        }
        if (contentType == null) {
            System.out.println("Unknown: " + contentTypeStr);
            return managedPayload;
        }

        payloadStream = managedPayload.getPayloadStream();
        // The push-back capacity must cover everything read into tmpBuf.
        pbin = new PushbackInputStream(payloadStream, tmpBuf.length);

        if (!"text".equalsIgnoreCase(contentType.contentType)) {
            // Nothing to inspect for non-text content: just prepend a header.
            long newPayloadLength = managedPayload.payloadLength;
            return insertHeader(managedPayload, newPayloadLength, "HTTP/1.1 200 OK", contentType, date);
        }

        /*
         * Read first 16K so we can see what can be repaired.
         */
        position = 0;
        int remaining = tmpBuf.length;
        int read = 0;
        while (remaining > 0 && read != -1) {
            read = pbin.read(tmpBuf, position, remaining);
            if (read > 0) {
                position += read;
                remaining -= read;
            }
        }
        limit = position;
        position = 0;

        byte[] fail_match1 = "HTTP/1.0 404: Not found\r\n\r\n".getBytes();
        byte[] fail_match2 = "HTTP/1.1 /images/head_lycos_search.gif\r\n".getBytes();
        byte[] fail_match3 = "http/1.0 301 redirect\r\n".getBytes();

        // Each match is additionally required to lie within the bytes read
        // for this payload, so stale buffer content can never trigger a case.
        if (limit >= fail_match1.length && ArrayUtils.startsWith(fail_match1, tmpBuf)) {
            // Drop the malformed header, push the rest back and generate a new one.
            pbin.unread(tmpBuf, fail_match1.length, limit - fail_match1.length);
            long newPayloadLength = managedPayload.payloadLength - fail_match1.length;
            managedPayload = insertHeader(managedPayload, newPayloadLength, "HTTP/1.0 404 Not found", contentType, date);
            System.out.println("case 1");
        } else if (limit >= fail_match2.length && ArrayUtils.startsWith(fail_match2, tmpBuf)) {
            managedPayload = repairMalformedStatusLine(managedPayload, fail_match2.length, "HTTP/1.1 302 Found\r\n", "case 2");
        } else if (limit >= fail_match3.length && ArrayUtils.startsWith(fail_match3, tmpBuf)) {
            managedPayload = repairMalformedStatusLine(managedPayload, fail_match3.length, "HTTP/1.0 301 Redirect\r\n", "case 3");
        } else {
            position = ArrayUtils.skip(ArrayUtils.SKIP_WHITESPACE, tmpBuf, 0);
            if (position < limit) {
                if (!bRepaired) {
                    managedPayload = tryrepair_insert_200(managedPayload, contentType, date);
                }
                if (!bRepaired) {
                    managedPayload = tryrepair_insert_404(managedPayload, contentType, date);
                }
                if (!bRepaired) {
                    managedPayload = tryrepair_insert_500(managedPayload, contentType, date);
                }
                // A further, disabled idea in the original code was to look
                // for a leading "Location: http://" line at this point.
            } else {
                // All spaces are belong in tmpBuf...
                System.out.println("case 0");
            }
        }
    } finally {
        if (payloadStream != null) {
            payloadStream.close();
            payloadStream = null;
        }
    }
    return managedPayload;
}

/**
 * Replaces a malformed status line at the start of {@code tmpBuf} with
 * {@code statusLine}, keeping the header lines that follow it.
 * <p>
 * The header lines are taken from the end of the malformed status line up
 * to the first CRLFCRLF. The candidate header is validated with
 * {@code HttpHeader.processPayload}; only a valid header leads to a new
 * payload. The bytes from the CRLFCRLF onwards are pushed back into
 * {@code pbin} and become the new payload stream.
 *
 * @param managedPayload payload being repaired
 * @param matchLength length in bytes of the malformed status line, including its CRLF
 * @param statusLine replacement status line, including its trailing CRLF
 * @param caseLabel label printed on success, and with a "fail " prefix on failure
 * @return the new payload on success, otherwise {@code managedPayload} unchanged
 * @throws IOException if building or validating the header fails
 */
private ManagedPayload repairMalformedStatusLine(ManagedPayload managedPayload, int matchLength, String statusLine, String caseLabel) throws IOException {
    byte[] CRLFCRLF = "\r\n\r\n".getBytes();
    position = matchLength;
    int headerEnd = ArrayUtils.indexOf(CRLFCRLF, tmpBuf, position);
    // No terminator, or a terminator that is not entirely inside the bytes
    // read for this payload: there is nothing that can be repaired safely.
    if (headerEnd < 0 || headerEnd + CRLFCRLF.length > limit) {
        System.out.println("fail " + caseLabel);
        return managedPayload;
    }
    httpOut.reset();
    httpOut.write(statusLine.getBytes());
    httpOut.write(tmpBuf, position, headerEnd - position);
    byte[] httpHeaderBytes = httpOut.toByteArray();
    HttpHeader httpHeader = HttpHeader.processPayload(HttpHeader.HT_RESPONSE,
            new ByteCountingPushBackInputStream(new ByteArrayInputStream(httpHeaderBytes), 8192),
            httpHeaderBytes.length,
            null);
    if (httpHeader == null || !httpHeader.isValid()) {
        System.out.println("fail " + caseLabel);
        return managedPayload;
    }
    pbin.unread(tmpBuf, headerEnd, limit - headerEnd);
    long newPayloadLength = managedPayload.payloadLength - headerEnd;
    ManagedPayload newManagedPayload = ManagedPayload.checkout();
    newManagedPayload.managedHttp(httpHeaderBytes, true);
    newManagedPayload.managePayloadInputStream(pbin, newPayloadLength, true);
    managedPayload.checkin();
    System.out.println(caseLabel);
    return newManagedPayload;
}
/**
 * Builds an HTTP header in {@code httpOut} and attaches it, together with
 * the current {@code pbin} stream, to a freshly checked-out ManagedPayload.
 * <p>
 * The header consists of the status line, an optional "Date: " line, a
 * "Content-Length: " line, an optional "Content-Type: " line and
 * "Connection: close", each terminated by CRLF and followed by an empty
 * line. The payload passed in is checked in once the new one is populated.
 *
 * @param managedPayload payload being replaced; checked in by this method
 * @param newPayloadLength value of the Content-Length line and length handed to the new payload
 * @param statusLine status line without trailing CRLF
 * @param contentType value of the Content-Type line, or null to omit the line
 * @param date value of the Date line, or null to omit the line
 * @return the newly checked-out payload
 * @throws IOException if writing the header or populating the payload fails
 */
public ManagedPayload insertHeader(ManagedPayload managedPayload, long newPayloadLength, String statusLine, ContentType contentType, Date date) throws IOException {
    final byte[] lineBreak = "\r\n".getBytes();
    final boolean withDate = (date != null);
    final boolean withContentType = (contentType != null);

    httpOut.reset();

    // Status line.
    httpOut.write(statusLine.getBytes());
    httpOut.write(lineBreak);

    // Optional Date line.
    if (withDate) {
        httpOut.write("Date: ".getBytes());
        httpOut.write(dateFormat.format(date).getBytes());
        httpOut.write(lineBreak);
    }

    // Content-Length line.
    httpOut.write("Content-Length: ".getBytes());
    httpOut.write(String.valueOf(newPayloadLength).getBytes());
    httpOut.write(lineBreak);

    // Optional Content-Type line.
    if (withContentType) {
        httpOut.write("Content-Type: ".getBytes());
        httpOut.write(contentType.toString().getBytes());
        httpOut.write(lineBreak);
    }

    // Connection line followed by the empty line that ends the header.
    httpOut.write("Connection: close".getBytes());
    httpOut.write(lineBreak);
    httpOut.write(lineBreak);

    final ManagedPayload replacement = ManagedPayload.checkout();
    replacement.managedHttp(httpOut.toByteArray(), true);
    replacement.managePayloadInputStream(pbin, newPayloadLength, true);
    managedPayload.checkin();
    return replacement;
}
public ManagedPayload tryrepair_insert_200(ManagedPayload managedPayload, ContentType contentType, Date date) throws IOException {
byte[][] cases = {
"".getBytes(),
"".getBytes(),
"".getBytes(),
"