All downloads are free. Search and download functionality uses the official Maven repository.

org.jwat.tools.tasks.cdx.CDXFile Maven / Gradle / Ivy

Go to download

JWAT-Tools uses the available JWAT libraries to make high level tasks available either from command-line or programmatically. Common tasks include: Test, Compress, Decompress, CDX, Arc2Warc. More specialised tasks include: Changed, ContainerMD, Delete, Extract, Interval, PathIndex, Unpack, Headers2CDX.

There is a newer version: 0.7.1
Show newest version
package org.jwat.tools.tasks.cdx;

import java.io.File;
import java.io.IOException;

import org.jwat.arc.ArcHeader;
import org.jwat.arc.ArcRecord;
import org.jwat.arc.ArcRecordBase;
import org.jwat.archive.ArchiveParser;
import org.jwat.archive.ArchiveParserCallback;
import org.jwat.common.ContentType;
import org.jwat.common.HttpHeader;
import org.jwat.common.UriProfile;
import org.jwat.gzip.GzipEntry;
import org.jwat.warc.WarcConstants;
import org.jwat.warc.WarcHeader;
import org.jwat.warc.WarcRecord;

/**
 * Archive parser callback that collects one {@link CDXEntry} per indexable
 * record of a single archive file.
 *
 * <p>ARC records of type {@code RT_ARC_RECORD} (except {@code filedesc:}
 * records) and WARC records of type {@code response} are turned into entries
 * and appended to {@code result.entries}.</p>
 *
 * <p>NOTE(review): the instance keeps the current {@link CDXResult} in a
 * field, so one instance should presumably not process several files
 * concurrently - confirm against callers.</p>
 */
public class CDXFile implements ArchiveParserCallback {

	/** URL prefix of ARC records that are excluded from the CDX output. */
	private static final String FILEDESC_PREFIX = "filedesc:";

	/** Result being populated by the callbacks; replaced on every {@link #processFile(File)} call. */
	protected CDXResult result;

	public CDXFile() {
	}

	/**
	 * Parses the given archive file and returns the CDX entries collected
	 * by the record callbacks of this object.
	 *
	 * @param srcFile archive file to index
	 * @return a new result holding the source file, its name, the value
	 *         returned by the parser and the collected entries
	 */
	public CDXResult processFile(File srcFile) {
		result = new CDXResult();
		result.srcFile = srcFile;
		result.filename = srcFile.getName();
		ArchiveParser archiveParser = new ArchiveParser();
		archiveParser.uriProfile = UriProfile.RFC3986_ABS_16BIT_LAX;
		archiveParser.bBlockDigestEnabled = true;
		archiveParser.bPayloadDigestEnabled = true;
		result.consumed = archiveParser.parse(srcFile, this);
		return result;
	}

	@Override
	public void apcFileId(File file, int fileId) {
	}

	@Override
	public void apcUpdateConsumed(long consumed) {
	}

	@Override
	public void apcGzipEntryStart(GzipEntry gzipEntry, long startOffset) {
	}

	/**
	 * Builds a CDX entry for an ARC record and always closes the record.
	 *
	 * <p>The mimetype is taken from the HTTP header when it carries a
	 * content type, otherwise from the ARC record header. When an HTTP
	 * content type is present, the status code and the payload length of
	 * the HTTP header are used as well; otherwise the response code is
	 * {@code "-"} and the length is the ARC header's archive length.</p>
	 *
	 * @param arcRecord   record that has just been read
	 * @param startOffset offset stored in the entry
	 * @param compressed  not used by this implementation
	 * @throws IOException if closing the record fails
	 */
	@Override
	public void apcArcRecordStart(ArcRecordBase arcRecord, long startOffset, boolean compressed) throws IOException {
		try {
			if (arcRecord.recordType == ArcRecord.RT_ARC_RECORD) {
				ArcHeader arcHeader = arcRecord.header;
				long length = arcHeader.archiveLength;
				String mimetype = null;
				String responseCode = "-";
				/*
				 * HttpHeader content-type.
				 */
				HttpHeader httpHeader = arcRecord.getHttpHeader();
				if (httpHeader != null && httpHeader.contentType != null) {
					responseCode = httpHeader.statusCodeStr;
					length = httpHeader.getPayloadLength();
					mimetype = shortMimetype(ContentType.parseContentType(httpHeader.contentType), httpHeader.contentType);
				}
				/*
				 * ArcRecord content-type. Both header fields may be missing,
				 * in which case the mimetype stays null instead of failing.
				 */
				if (mimetype == null) {
					mimetype = shortMimetype(arcHeader.contentType, arcHeader.contentTypeStr);
				}
				/*
				 * CDX entry values.
				 */
				CDXEntry entry = new CDXEntry();
				entry.date = arcHeader.archiveDate;
				entry.ip = arcHeader.ipAddressStr;
				entry.url = arcHeader.urlStr;
				entry.mimetype = mimetype;
				entry.responseCode = responseCode;
				entry.checksum = null;
				entry.offset = startOffset;
				entry.length = length;
				// Case-insensitive prefix test that does not depend on the default locale.
				if (entry.url != null && !entry.url.regionMatches(true, 0, FILEDESC_PREFIX, 0, FILEDESC_PREFIX.length())) {
					result.entries.add(entry);
				}
			}
		}
		finally {
			// Close the record even when building the entry failed.
			arcRecord.close();
		}
	}

	/**
	 * Builds a CDX entry for a WARC {@code response} record and always
	 * closes the record.
	 *
	 * <p>The mimetype is taken from the HTTP header when the record has a
	 * usable content type and the HTTP header carries a content type of its
	 * own, otherwise from the WARC record header. An entry is only added
	 * when the record's {@code msgtype} content-type parameter is absent or
	 * equals {@code response} (ignoring case).</p>
	 *
	 * @param warcRecord  record that has just been read
	 * @param startOffset offset stored in the entry
	 * @param compressed  not used by this implementation
	 * @throws IOException if closing the record fails
	 */
	@Override
	public void apcWarcRecordStart(WarcRecord warcRecord, long startOffset, boolean compressed) throws IOException {
		try {
			if (warcRecord.header.warcTypeIdx == WarcConstants.RT_IDX_RESPONSE) {
				WarcHeader warcHeader = warcRecord.header;
				long length = warcHeader.contentLength;
				String responseCode = "-";
				String mimetype = null;
				String msgtype = null;
				/*
				 * Record content-type, parsed from the raw header value when
				 * the header did not provide a parsed one.
				 */
				String recordContentTypeStr = warcHeader.contentTypeStr;
				ContentType recordContentType = warcHeader.contentType;
				if (recordContentType == null) {
					recordContentType = ContentType.parseContentType(recordContentTypeStr);
				}
				/*
				 * HttpHeader content-type.
				 */
				if (recordContentType != null) {
					// Use recordContentType here: warcHeader.contentType may be null
					// when the value above came from the fallback parse.
					if ("application".equals(recordContentType.contentType) && "http".equals(recordContentType.mediaType)) {
						msgtype = recordContentType.getParameter("msgtype");
					}
					HttpHeader httpHeader = warcRecord.getHttpHeader();
					if (httpHeader != null && httpHeader.contentType != null) {
						responseCode = httpHeader.statusCodeStr;
						length = httpHeader.getPayloadLength();
						mimetype = shortMimetype(ContentType.parseContentType(httpHeader.contentType), httpHeader.contentType);
					}
				}
				/*
				 * WarcRecord content-type.
				 */
				if (mimetype == null) {
					mimetype = shortMimetype(recordContentType, recordContentTypeStr);
				}
				/*
				 * CDX entry values.
				 */
				CDXEntry entry = new CDXEntry();
				entry.date = warcHeader.warcDate;
				entry.ip = warcHeader.warcIpAddress;
				entry.url = warcHeader.warcTargetUriStr;
				entry.mimetype = mimetype;
				entry.responseCode = responseCode;
				entry.checksum = null;
				entry.offset = startOffset;
				entry.length = length;
				String warctype = warcHeader.warcTypeStr;
				if ("response".equalsIgnoreCase(warctype) && (msgtype == null || msgtype.equalsIgnoreCase("response"))) {
					result.entries.add(entry);
				}
			}
		}
		finally {
			// Close the record even when building the entry failed.
			warcRecord.close();
		}
	}

	@Override
	public void apcRuntimeError(Throwable t, long offset, long consumed) {
	}

	@Override
	public void apcDone() {
	}

	/**
	 * Returns the mimetype string for a content type.
	 *
	 * @param parsed parsed content type, or null if none is available
	 * @param raw    raw header value used when {@code parsed} is null
	 * @return {@code parsed.toStringShort()} when {@code parsed} is not
	 *         null, otherwise {@code raw} without parameters, or null when
	 *         both arguments are null
	 */
	private static String shortMimetype(ContentType parsed, String raw) {
		if (parsed != null) {
			return parsed.toStringShort();
		}
		return stripParameters(raw);
	}

	/**
	 * Cuts a raw content-type value at the first {@code ';'} and trims
	 * surrounding whitespace.
	 *
	 * @param contentTypeStr raw content-type value, may be null
	 * @return the trimmed text before the first {@code ';'}, or null when
	 *         the argument is null
	 */
	private static String stripParameters(String contentTypeStr) {
		if (contentTypeStr == null) {
			return null;
		}
		int idx = contentTypeStr.indexOf(';');
		if (idx != -1) {
			contentTypeStr = contentTypeStr.substring(0, idx);
		}
		return contentTypeStr.trim();
	}

}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy