org.jwat.warc.WarcRecord Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of jwat-warc Show documentation
Show all versions of jwat-warc Show documentation
Used for reading, writing and validating WARC files.
Implemented to follow the WARC/1.0 ISO specification as closely as possible.
/**
* Java Web Archive Toolkit - Software to read and validate ARC, WARC
* and GZip files. (http://jwat.org/)
* Copyright 2011-2012 Netarkivet.dk (http://netarkivet.dk/)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.jwat.warc;
import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import org.jwat.common.Base16;
import org.jwat.common.Base32;
import org.jwat.common.Base64;
import org.jwat.common.ByteCountingPushBackInputStream;
import org.jwat.common.Diagnosis;
import org.jwat.common.DiagnosisType;
import org.jwat.common.Diagnostics;
import org.jwat.common.HeaderLine;
import org.jwat.common.HttpHeader;
import org.jwat.common.NewlineParser;
import org.jwat.common.Payload;
import org.jwat.common.PayloadOnClosedHandler;
/**
* This class represents a parsed WARC record header block including
* possible validation and format warnings/errors encountered in the process.
* The payload of the WARC record is accessible through a wrapped payload
* object.
*
* @author nicl
*/
public class WarcRecord implements PayloadOnClosedHandler, Closeable {
/** Reader instance used, required for file compliance. */
protected WarcReader reader;
/** Input stream used to read this record. */
protected ByteCountingPushBackInputStream in;
/** Is this record compliant ie. error free. */
protected boolean bIsCompliant;
/** WARC record parsing start offset relative to the source WARC file input
* stream. Used to keep track of the uncompressed amount of bytes consumed. */
protected long startOffset = -1;
/** Uncompressed bytes consumed while validating this record. */
protected long consumed;
/** Validation errors and warnings. */
public final Diagnostics diagnostics = new Diagnostics();
/** Newline parser for counting/validating trailing newlines. */
public NewlineParser nlp = new NewlineParser();
/** Is Warc-Block-Digest valid. (Null is equal to not tested) */
public Boolean isValidBlockDigest = null;
/** Is Warc-Payload-Digest valid. (Null is equal to not tested) */
public Boolean isValidPayloadDigest = null;
/** Number of trailing newlines after record. */
public int trailingNewlines;
/*
* Header-Fields.
*/
/** WARC header. */
public WarcHeader header;
/*
* Payload
*/
/** Has payload been closed before. */
protected boolean bPayloadClosed;
/** Has record been closed before. */
protected boolean bClosed;
/** Payload object if any exists. */
protected Payload payload;
/** HTTP header content parsed from payload. */
protected HttpHeader httpHeader;
/** Computed block digest. */
public WarcDigest computedBlockDigest;
/** Computed payload digest. */
public WarcDigest computedPayloadDigest;
/**
* Non public constructor to allow unit testing.
*/
protected WarcRecord() {
}
/**
* Create a WarcRecord
and prepare it for writing.
* @param writer writer which will be used to write the record
* @return a WarcRecord
ready to be changed and then written
*/
public static WarcRecord createRecord(WarcWriter writer) {
WarcRecord record = new WarcRecord();
record.header = WarcHeader.initHeader(writer, record.diagnostics);
writer.fieldParsers.diagnostics = record.diagnostics;
return record;
}
/**
* Given an InputStream
it tries to read and validate a WARC
* header block.
* @param in InputStream
containing WARC record data
* @param reader WarcReader
used, with access to user defined
* options
* @return WarcRecord
or null
* @throws IOException I/O exception in the process of reading record
*/
public static WarcRecord parseRecord(ByteCountingPushBackInputStream in,
WarcReader reader) throws IOException {
WarcRecord record = new WarcRecord();
record.in = in;
record.reader = reader;
record.startOffset = in.getConsumed();
// Initialize WarcHeader with required context.
record.header = WarcHeader.initHeader(reader, in.getConsumed(), record.diagnostics);
WarcHeader header = record.header;
// Initialize WarcFieldParser to report diagnoses here.
reader.fieldParsers.diagnostics = record.diagnostics;
if (header.parseHeader(in)) {
++reader.records;
if (reader.wrpCallback != null) {
reader.wrpCallback.warcParsedRecordHeader(reader, record.startOffset, header);
}
/*
* Payload processing.
*/
if (header.contentLength != null && header.contentLength > 0) {
/*
* Payload.
*/
String digestAlgorithm = null;
if (reader.bBlockDigest) {
if (header.warcBlockDigest != null && header.warcBlockDigest.algorithm != null) {
// If a WARC block digest header is present in the
// record, use that algorithm.
digestAlgorithm = header.warcBlockDigest.algorithm;
} else {
// If no WARC block digest header is present,
// use the optional user specified algorithm.
// Can be null in which case nothing is computed.
digestAlgorithm = reader.blockDigestAlgorithm;
}
}
record.payload = Payload.processPayload(in, header.contentLength,
reader.payloadHeaderMaxSize, digestAlgorithm);
record.payload.setOnClosedHandler(record);
/*
* HttpHeader.
*/
if (header.contentType != null
&& header.contentType.contentType.equals("application")
&& header.contentType.mediaType.equals("http")) {
String value = header.contentType.getParameter("msgtype");
// request
int httpHeaderType = 0;
if ("response".equalsIgnoreCase(value)) {
httpHeaderType = HttpHeader.HT_RESPONSE;
} else if ("request".equalsIgnoreCase(value)) {
httpHeaderType = HttpHeader.HT_REQUEST;
}
if (httpHeaderType != 0) {
digestAlgorithm = null;
if (reader.bPayloadDigest) {
if (header.warcPayloadDigest != null && header.warcPayloadDigest.algorithm != null) {
// If a WARC payload digest header is present in the
// record, use that algorithm.
digestAlgorithm = header.warcPayloadDigest.algorithm;
} else {
// If no WARC payload digest header is present,
// use the optional user specified algorithm.
// Can be null in which case nothing is computed.
digestAlgorithm = reader.payloadDigestAlgorithm;
}
}
// Try to read a valid HTTP request/response header from the payload.
record.httpHeader = HttpHeader.processPayload(httpHeaderType,
record.payload.getInputStream(), header.contentLength,
digestAlgorithm);
if (record.httpHeader != null) {
if (record.httpHeader.isValid()) {
record.payload.setPayloadHeaderWrapped(record.httpHeader);
} else if (reader.bReportHttpHeaderError) {
record.diagnostics.addWarning(
DiagnosisType.ERROR, "http header", "Unable to parse http header!");
}
}
}
}
}
// Preliminary compliance status, will be updated when the
// payload/record is closed.
if (record.diagnostics.hasErrors() || record.diagnostics.hasWarnings()) {
record.bIsCompliant = false;
} else {
record.bIsCompliant = true;
}
reader.bIsCompliant &= record.bIsCompliant;
} else {
// In case no record is found the errors/warnings in the record object are transfered to the Reader.
long excess = in.getConsumed() - record.startOffset;
reader.consumed += excess;
reader.diagnostics.addAll(record.diagnostics);
if (record.diagnostics.hasErrors() || record.diagnostics.hasWarnings()) {
reader.errors += record.diagnostics.getErrors().size();
reader.warnings += record.diagnostics.getWarnings().size();
reader.bIsCompliant = false;
}
// Require one or more records to be present.
if (reader.records == 0) {
reader.diagnostics.addError(new Diagnosis(DiagnosisType.ERROR_EXPECTED, "WARC file", "One or more records"));
++reader.errors;
reader.bIsCompliant = false;
}
if (excess != 0) {
reader.diagnostics.addError(new Diagnosis(DiagnosisType.UNDESIRED_DATA, "Trailing data", "Garbage data found at offset=" + record.startOffset + " - length=" + excess));
}
// EOF
record = null;
}
return record;
}
/**
* Called when the payload object is closed and final steps in the
* validation process can be performed.
* @throws IOException I/O exception in final validation processing
*/
@Override
public void payloadClosed() throws IOException {
if (!bPayloadClosed) {
if (payload != null) {
// Check for truncated payload.
if (payload.getUnavailable() > 0) {
// Payload length mismatch - Payload truncated
addErrorDiagnosis(DiagnosisType.INVALID_DATA, "Payload length mismatch", "Payload truncated");
}
/*
* Check block digest.
*/
byte[] digest = payload.getDigest();
// Check for computed block digest.
if (digest != null) {
computedBlockDigest = new WarcDigest();
computedBlockDigest.digestBytes = digest;
}
// Auto detect encoding used in WARC header.
if (header.warcBlockDigest != null && header.warcBlockDigest.digestString != null) {
isValidBlockDigest = processWarcDigest(header.warcBlockDigest, computedBlockDigest, "block");
}
// Adjust information about computed block digest.
if (computedBlockDigest != null) {
processComputedDigest(computedBlockDigest,
reader.blockDigestAlgorithm, reader.blockDigestEncoding, "block");
}
// Revisit payload digest refers to the original. Continuation payload digest in first record also refers to original.
if ((header.warcTypeIdx != null && header.warcTypeIdx != WarcConstants.RT_IDX_REVISIT && header.warcTypeIdx != WarcConstants.RT_IDX_CONTINUATION) && httpHeader != null && httpHeader.isValid()) {
/*
* Check payload digest.
*/
digest = httpHeader.getDigest();
// Check for computed payload digest.
if (digest != null) {
computedPayloadDigest = new WarcDigest();
computedPayloadDigest.digestBytes = digest;
}
// Auto detect encoding used in WARC header.
if (header.warcPayloadDigest != null && header.warcPayloadDigest.digestString != null ) {
isValidPayloadDigest = processWarcDigest(header.warcPayloadDigest, computedPayloadDigest, "payload");
}
// Adjust information about computed payload digest.
if (computedPayloadDigest != null) {
processComputedDigest(computedPayloadDigest,
reader.payloadDigestAlgorithm, reader.payloadDigestEncoding, "payload");
}
}
}
// Check for trailing newlines.
trailingNewlines = nlp.parseCRLFs(in, diagnostics);
if (trailingNewlines != WarcConstants.WARC_RECORD_TRAILING_NEWLINES) {
addErrorDiagnosis(DiagnosisType.INVALID_EXPECTED,
"Trailing newlines",
Integer.toString(trailingNewlines),
Integer.toString(WarcConstants.WARC_RECORD_TRAILING_NEWLINES));
}
// isCompliant status update.
if (diagnostics.hasErrors() || diagnostics.hasWarnings()) {
bIsCompliant = false;
reader.errors += diagnostics.getErrors().size();
reader.warnings += diagnostics.getWarnings().size();
} else {
bIsCompliant = true;
}
reader.bIsCompliant &= bIsCompliant;
// Updated consumed after payload has been consumed.
consumed = in.getConsumed() - startOffset;
// Don't not close payload again.
bPayloadClosed = true;
// Callback.
reader.recordClosed();
}
}
/**
* Auto-detect encoding used in WARC digest header and compare it to the
* internal one, if it has been computed.
* @param warcDigest digest from WARC header
* @param computedDigest internally compute digest
* @param digestName used to identify the digest ("block" or "payload")
* @return WARC digest validity indication
*/
protected Boolean processWarcDigest(WarcDigest warcDigest, WarcDigest computedDigest, String digestName) {
byte[] digest;
Boolean isValidDigest = null;
int digestAlgorithmLength = WarcDigest.digestAlgorithmLength(warcDigest.algorithm);
digest = Base16.decodeToArray(warcDigest.digestString);
if (digest != null && digest.length == digestAlgorithmLength) {
warcDigest.digestBytes = digest;
warcDigest.encoding = "base16";
}
if (warcDigest.digestBytes == null) {
digest = Base32.decodeToArray(warcDigest.digestString, true);
if (digest != null && digest.length == digestAlgorithmLength) {
warcDigest.digestBytes = digest;
warcDigest.encoding = "base32";
}
if (warcDigest.digestBytes == null) {
digest = Base64.decodeToArray(warcDigest.digestString, true);
if (digest != null && digest.length == digestAlgorithmLength) {
warcDigest.digestBytes = digest;
warcDigest.encoding = "base64";
}
}
}
if (warcDigest.encoding == null) {
// Encoding - Unrecognized block digest encoding scheme
addErrorDiagnosis(DiagnosisType.UNKNOWN,
"Record " + digestName + " digest encoding scheme",
warcDigest.digestString);
}
if (computedDigest != null) {
computedDigest.algorithm = warcDigest.algorithm;
computedDigest.encoding = warcDigest.encoding;
if (warcDigest.digestBytes != null) {
if (!Arrays.equals(computedDigest.digestBytes, warcDigest.digestBytes)) {
// Block digest - Computed block digest does not match
addErrorDiagnosis(DiagnosisType.INVALID_EXPECTED,
"Incorrect " + digestName + " digest",
Base16.encodeArray(warcDigest.digestBytes),
Base16.encodeArray(computedDigest.digestBytes));
isValidDigest = false;
} else {
isValidDigest = true;
}
} else {
isValidDigest = false;
}
}
return isValidDigest;
}
/**
* Adjust algorithm and encoding information about computed block digest.
* @param computedDigest internally compute digest
* @param digestAlgorithm default algorithm
* @param digestEncoding default encoding
* @param digestName used to identify the digest ("block" or "payload")
*/
protected void processComputedDigest(WarcDigest computedDigest, String digestAlgorithm, String digestEncoding, String digestName) {
if (computedDigest.algorithm == null) {
computedDigest.algorithm = digestAlgorithm;
}
if (computedDigest.encoding == null && digestEncoding != null) {
if ("base32".equals(digestEncoding)) {
computedDigest.encoding = "base32";
} else if ("base64".equals(digestEncoding)) {
computedDigest.encoding = "base64";
} else if ("base16".equals(digestEncoding)) {
computedDigest.encoding = "base16";
} else {
// Encoding - Unknown block digest encoding scheme ..
addErrorDiagnosis(DiagnosisType.UNKNOWN,
"Default " + digestName + " digest encoding scheme",
digestEncoding);
}
}
if (computedDigest.encoding != null) {
if ("base32".equals(computedDigest.encoding)) {
computedDigest.digestString = Base32.encodeArray(computedDigest.digestBytes);
} else if ("base64".equals(computedDigest.encoding)) {
computedDigest.digestString = Base64.encodeArray(computedDigest.digestBytes);
} else if ("base16".equals(computedDigest.encoding)) {
computedDigest.digestString = Base16.encodeArray(computedDigest.digestBytes);
}
}
}
/**
* Check to see if the record has been closed.
* @return boolean indicating whether this record is closed or not
*/
public boolean isClosed() {
return bClosed;
}
/**
* Close resources associated with the WARC record.
* Mainly payload stream if any.
* @throws IOException if unable to close resources
*/
public void close() throws IOException {
if (!bClosed) {
// Ensure input stream is at the end of the record payload.
if (payload != null) {
payload.close();
}
payloadClosed();
reader = null;
in = null;
bClosed = true;
}
}
/**
* Returns a boolean indicating the ISO compliance status of this record.
* @return a boolean indicating the ISO compliance status of this record
*/
public boolean isCompliant() {
return bIsCompliant;
}
/**
* Get the record offset relative to the start of the WARC file
* InputStream
.
* @return the record offset relative to the start of the WARC file
*/
public long getStartOffset() {
return header.startOffset;
}
/**
* Return number of uncompressed bytes consumed validating this record.
* @return number of uncompressed bytes consumed validating this record
*/
public long getConsumed() {
return consumed;
}
/**
* Get a List
of all the non-standard WARC headers found
* during parsing.
* @return List
of HeaderLine
*/
public List getHeaderList() {
return Collections.unmodifiableList(header.headerList);
}
/**
* Get a non-standard WARC header or null, if nothing is stored for this
* header name.
* @param field header name
* @return HeaderLine
structure or null
*/
public HeaderLine getHeader(String field) {
if (field != null && field.length() > 0) {
return header.headerMap.get(field.toLowerCase());
} else {
return null;
}
}
/**
* Specifies whether this record has a payload or not.
* @return true/false whether the ARC record has a payload
*/
public boolean hasPayload() {
return (payload != null);
}
/**
* Return Payload object.
* @return payload or null
*/
public Payload getPayload() {
return payload;
}
/**
* Payload content InputStream
getter.
* @return Payload content InputStream
*/
public InputStream getPayloadContent() {
return (payload != null) ? payload.getInputStream() : null;
}
/**
* Returns the HttpHeader
object if identified in the payload,
* or null.
* @return the HttpHeader
object if identified or null
*/
public HttpHeader getHttpHeader() {
return httpHeader;
}
/**
* Add an error diagnosis of the given type on a specific entity with
* optional extra information. The information varies according to the
* diagnosis type.
* @param type diagnosis type
* @param entity entity examined
* @param information optional extra information
*/
protected void addErrorDiagnosis(DiagnosisType type, String entity, String... information) {
diagnostics.addError(new Diagnosis(type, entity, information));
}
}