// All downloads are free. Search and download functionality uses the official Maven repository.

// edu.harvard.hul.ois.jhove.module.WarcModule Maven / Gradle / Ivy

package edu.harvard.hul.ois.jhove.module;

import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.io.RandomAccessFile;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import org.jwat.common.Diagnosis;
import org.jwat.common.Diagnostics;
import org.jwat.common.InputStreamNoSkip;
import org.jwat.common.RandomAccessFileInputStream;
import org.jwat.common.UriProfile;
import org.jwat.warc.WarcReader;
import org.jwat.warc.WarcReaderFactory;
import org.jwat.warc.WarcRecord;

import edu.harvard.hul.ois.jhove.Agent;
import edu.harvard.hul.ois.jhove.Agent.Builder;
import edu.harvard.hul.ois.jhove.AgentType;
import edu.harvard.hul.ois.jhove.Document;
import edu.harvard.hul.ois.jhove.DocumentType;
import edu.harvard.hul.ois.jhove.ErrorMessage;
import edu.harvard.hul.ois.jhove.Identifier;
import edu.harvard.hul.ois.jhove.IdentifierType;
import edu.harvard.hul.ois.jhove.InfoMessage;
import edu.harvard.hul.ois.jhove.JhoveException;
import edu.harvard.hul.ois.jhove.ModuleBase;
import edu.harvard.hul.ois.jhove.Property;
import edu.harvard.hul.ois.jhove.PropertyArity;
import edu.harvard.hul.ois.jhove.PropertyType;
import edu.harvard.hul.ois.jhove.RepInfo;
import edu.harvard.hul.ois.jhove.module.warc.WarcRecordProperties;

/**
 * JHOVE module for identifying, validating and characterizing WARC files.
 * Ported from the JHOVE2 WARC module and based on the JWAT-tool, both
 * created by [email protected] (nclarkekb@git). 
 * 
 * This is a non-recursive validation. It only validates the WARC file format 
 * and WARC headers, not the actual payload of the WARC records.
 * 
 * @author [email protected]
 */
public class WarcModule extends ModuleBase {

    /*------------ MODULE DEFINITIONS ---------------*/
    /** Vendor agent reported for this module. */
    private static final Agent KB_AGENT = new Builder(
            "Royal Library of Denmark", AgentType.STANDARD)
            .address("Søren Kierkegaards Plads 1, 1219 København K, Denmark")
            .fax("+45 3393 2218")
            .web("http://kb.dk").build();

    private static final String NAME = "WARC-kb";
    private static final String RELEASE = "1.0";
    // Day written as 7, not 07: a leading zero makes an octal literal in Java.
    private static final int[] DATE = {2015, 12, 7};
    private static final String[] FORMAT = {
            "WARC", "WARC, Web ARChive file format"
    };
    private static final String COVERAGE = "WARC, 28500:2009";
    private static final String[] MIMETYPE = {"application/warc", "application/warc-fields"};
    private static final String WELLFORMED = "";
    private static final String VALIDITY = "The file is well-formed";
    private static final String REPINFO = "";
    private static final String NOTE = "";
    private static final String RIGHTS = "Copyright 2015 by The Royal Library of Denmark. " +
            "Released under the GNU Lesser General Public License.";

    /* DEFAULT VALUES */
    private static final boolean DEFAULT_COMPUTE_BLOCK_DIGEST = true;
    private static final String DEFAULT_BLOCK_DIGEST_ALGORITHM = "sha1";
    private static final String DEFAULT_BLOCK_DIGEST_ENCODING = "base32";
    private static final boolean DEFAULT_COMPUTE_PAYLOAD_DIGEST = true;
    private static final String DEFAULT_PAYLOAD_DIGEST_ALGORITHM = "sha1";
    private static final String DEFAULT_PAYLOAD_DIGEST_ENCODING = "base32";
    private static final boolean DEFAULT_STRICT_TARGET_URI_VALIDATION = false;
    private static final boolean DEFAULT_STRICT_URI_VALIDATION = false;

    /** Size of the buffer handed to the WARC reader factory. */
    private static final int READER_BUFFER_SIZE = 8192;

    /*-------------- Local variables --------------*/

    private boolean bComputeBlockDigest;
    private String blockDigestAlgorithm;
    private String blockDigestEncoding;

    private boolean bComputePayloadDigest;
    private String payloadDigestAlgorithm;
    private String payloadDigestEncoding;

    private boolean bStrictTargetUriValidation;
    private boolean bStrictUriValidation;

    /**
     * Map of the WARC record versions and their count.
     * Used for reporting the most seen version, as the version for the WARC file.
     */
    private Map<String, Integer> versions;
    /**
     * List of Property elements for the records of the WARC-file.
     * Each Property contains a map of all properties for a given record.
     */
    private List<Property> recordProperties;

    /**
     * Constructor. Registers the module metadata with the base class, then
     * sets the vendor/specification and the default option values.
     */
    public WarcModule() {
        super(NAME, RELEASE, DATE, FORMAT, COVERAGE, MIMETYPE, WELLFORMED,
                VALIDITY, REPINFO, NOTE, RIGHTS, false);
        setVendorAndSpecification();
        initialiseVariables();
    }

    /**
     * Sets the vendor and specification for this module.
     */
    private void setVendorAndSpecification() {
        _vendor = KB_AGENT;

        Document doc = new Document("WARC (Web ARChive) file format",
                DocumentType.WEB);
        // Should probably have IIPC and others as authors
        doc.setPublisher(Agent.newIsoInstance());
        doc.setDate("2009");
        doc.setIdentifier(new Identifier("28500:2009",
                IdentifierType.ISO));
        _specification.add(doc);
    }

    /**
     * Initialises the variables: empties the per-file state and restores
     * every option to its default value.
     */
    private void initialiseVariables() {
        resetParseState();

        bComputeBlockDigest = DEFAULT_COMPUTE_BLOCK_DIGEST;
        blockDigestAlgorithm = DEFAULT_BLOCK_DIGEST_ALGORITHM;
        blockDigestEncoding = DEFAULT_BLOCK_DIGEST_ENCODING;

        bComputePayloadDigest = DEFAULT_COMPUTE_PAYLOAD_DIGEST;
        payloadDigestAlgorithm = DEFAULT_PAYLOAD_DIGEST_ALGORITHM;
        payloadDigestEncoding = DEFAULT_PAYLOAD_DIGEST_ENCODING;

        bStrictTargetUriValidation = DEFAULT_STRICT_TARGET_URI_VALIDATION;
        bStrictUriValidation = DEFAULT_STRICT_URI_VALIDATION;
    }

    /**
     * Discards the state collected while parsing a file.
     * New collections are allocated instead of clearing the old ones, because
     * the previous record list has been handed to a RepInfo as the value of
     * its "Records" property and must not be emptied behind its back.
     */
    private void resetParseState() {
        versions = new HashMap<String, Integer>();
        recordProperties = new ArrayList<Property>();
    }

    /** Reset parameter settings.
     *  Returns to a default state without any parameters.
     */
    @Override
    public void resetParams() throws Exception {
        initialiseVariables();
    }

    /**
     * Parses a WARC file given as a random access file, by wrapping it in a
     * stream and delegating to {@link #parse(InputStream, RepInfo, int)}.
     * The wrapping stream is closed afterwards, also on failure.
     * @param file The file to parse
     * @param info The representation info, where to report the results
     * @throws IOException if an IO error occurs while processing
     */
    @Override
    public void parse(RandomAccessFile file, RepInfo info) throws IOException {
        InputStream stream = null;
        try {
            stream = new RandomAccessFileInputStream(file);
            parse(stream, info, 0);
        } finally {
            if (stream != null) {
                stream.close();
            }
        }
    }

    /**
     * Parses a WARC stream, and reports validity, well-formedness, diagnostics
     * and record properties on the given representation info.
     * A JhoveException raised while parsing is reported as an error message
     * and marks the file as neither valid nor well-formed.
     * @param stream The stream to read the WARC content from
     * @param info The representation info, where to report the results
     * @param parseIndex Not used by this module
     * @return Always 0
     * @throws IOException if an IO error occurs while processing
     */
    @Override
    public int parse(InputStream stream, RepInfo info, int parseIndex) throws IOException {
        // Start from empty per-file state, so that records and version counts
        // from an earlier parse on this instance never end up in this report.
        resetParseState();

        WarcReader reader = WarcReaderFactory.getReader(
                new InputStreamNoSkip(stream), READER_BUFFER_SIZE);
        try {
            setReaderOptions(reader);
            parseRecords(reader);

            boolean compliant = reader.isCompliant();
            info.setValid(compliant);
            info.setWellFormed(compliant);

            reportResults(reader, info);

            info.setSigMatch(_name);
            info.setFormat(_format[0]);
            info.setMimeType(_mimeType[0]);
        } catch (JhoveException e) {
            info.setMessage(new ErrorMessage(e.getMessage()));
            info.setValid(false);
            info.setWellFormed(false);
        } finally {
            if (reader != null) {
                reader.close();
            }
        }
        return 0;
    }

    /**
     * Set digest and URI validation options for WARC reader.
     * @param reader WARC reader instance
     * @throws JhoveException if the reader rejects the block or payload digest algorithm
     */
    protected void setReaderOptions(WarcReader reader) throws JhoveException {
        reader.setBlockDigestEnabled(bComputeBlockDigest);
        reader.setPayloadDigestEnabled(bComputePayloadDigest);
        if (!reader.setBlockDigestAlgorithm(blockDigestAlgorithm)) {
            throw new JhoveException("Invalid block digest algorithm: " + blockDigestAlgorithm);
        }
        if (!reader.setPayloadDigestAlgorithm(payloadDigestAlgorithm)) {
            throw new JhoveException("Invalid payload digest algorithm: " + payloadDigestAlgorithm);
        }
        reader.setBlockDigestEncoding(blockDigestEncoding);
        reader.setPayloadDigestEncoding(payloadDigestEncoding);
        if (bStrictTargetUriValidation) {
            reader.setWarcTargetUriProfile(UriProfile.RFC3986);
        } else {
            reader.setWarcTargetUriProfile(UriProfile.RFC3986_ABS_16BIT_LAX);
        }
        if (bStrictUriValidation) {
            reader.setUriProfile(UriProfile.RFC3986);
        } else {
            reader.setUriProfile(UriProfile.RFC3986_ABS_16BIT_LAX);
        }
    }

    /**
     * Parse WARC records. Parsing should be straight forward with all records accessible through the same source.
     * The diagnostics of every record are added to the diagnostics of the reader.
     * @param reader WARC reader used to parse records
     * @throws EOFException if EOF occurs prematurely
     * @throws IOException if an IO error occurs while processing
     * @throws JhoveException if the reader is null, or a serious problem needs to be reported
     */
    protected void parseRecords(WarcReader reader) throws IOException, JhoveException {
        if (reader == null) {
            throw new JhoveException("WarcReader has not been properly instantiated.");
        }
        WarcRecord record;
        while ((record = reader.getNextRecord()) != null) {
            processRecord(record);
            reader.diagnostics.addAll(record.diagnostics);
        }
    }

    /**
     * Process a WARC record: counts its version (when the version format is
     * valid), stores its properties, and closes the record.
     * Does not characterize the record payload.
     * @param record WARC record from WARC reader
     * @throws EOFException if EOF occurs prematurely
     * @throws IOException if an IO error occurs while processing
     * @throws JhoveException if a serious problem needs to be reported
     */
    protected void processRecord(WarcRecord record) throws IOException, JhoveException {
        if (record.header.bValidVersionFormat) {
            String version = record.header.versionStr;
            Integer seen = versions.get(version);
            versions.put(version, seen == null ? 1 : seen + 1);
        }

        WarcRecordProperties properties = new WarcRecordProperties(record);
        Property p = new Property("Record", PropertyType.STRING, PropertyArity.MAP, properties.getProperties());

        recordProperties.add(p);

        record.close();
    }

    /**
     * Report the results of the characterization.
     * Errors are reported as error messages and mark the file as inconsistent,
     * warnings are reported as info messages. The version with the highest
     * record count is reported as the version of the file.
     * @param reader The WARC reader, which has read the WARC-file.
     * @param repInfo The representation info, where to report the results.
     * @throws JhoveException
     * @throws IOException
     */
    private void reportResults(WarcReader reader, RepInfo repInfo) throws JhoveException, IOException {
        Diagnostics diagnostics = reader.diagnostics;
        if (diagnostics.hasErrors()) {
            for (Diagnosis d : diagnostics.getErrors()) {
                repInfo.setMessage(new ErrorMessage(extractDiagnosisType(d), extractDiagnosisMessage(d)));
            }
            repInfo.setConsistent(false);
        }
        if (diagnostics.hasWarnings()) {
            // Report warnings on source object.
            for (Diagnosis d : diagnostics.getWarnings()) {
                repInfo.setMessage(new InfoMessage(extractDiagnosisType(d), extractDiagnosisMessage(d)));
            }
        }

        int maxCount = -1;
        for (Entry<String, Integer> e : versions.entrySet()) {
            int count = e.getValue();
            // Strictly greater: on a tie the version iterated first is kept.
            if (count > maxCount) {
                maxCount = count;
                repInfo.setVersion(e.getKey());
            }

            // NOTE(review): _features belongs to the base class and is not
            // reset here, so these entries presumably pile up across parsed
            // files - confirm against ModuleBase.
            _features.add(count + " WARC records of version " + e.getKey());
        }

        repInfo.setProperty(new Property("Records", PropertyType.PROPERTY, PropertyArity.LIST, recordProperties));
        repInfo.setSize(reader.getConsumed());
    }

    /**
     * Extracts the diagnosis type.
     * @param d The diagnosis whose type should be extracted
     * @return The name of the type of the diagnosis
     */
    private String extractDiagnosisType(Diagnosis d) {
        return d.type.name();
    }

    /**
     * Extracts the message from the diagnosis.
     * @param d The diagnosis
     * @return The message containing the entity, followed by each of the
     * information elements separated by commas.
     */
    private String extractDiagnosisMessage(Diagnosis d) {
        StringBuilder res = new StringBuilder();
        res.append("Entity: ").append(d.entity);
        for (String i : d.information) {
            res.append(", ").append(i);
        }
        return res.toString();
    }
}




// © 2015 - 2025 Weber Informatics LLC | Privacy Policy