All Downloads are FREE. Search and download functionalities are using the official Maven repository.

edu.harvard.hul.ois.jhove.module.GzipModule Maven / Gradle / Ivy

package edu.harvard.hul.ois.jhove.module;

import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.io.RandomAccessFile;
import java.util.ArrayList;
import java.util.List;

import org.jwat.common.Diagnosis;
import org.jwat.common.Diagnostics;
import org.jwat.common.InputStreamNoSkip;
import org.jwat.common.RandomAccessFileInputStream;
import org.jwat.gzip.GzipEntry;
import org.jwat.gzip.GzipReader;

import edu.harvard.hul.ois.jhove.Agent;
import edu.harvard.hul.ois.jhove.Agent.Builder;
import edu.harvard.hul.ois.jhove.AgentType;
import edu.harvard.hul.ois.jhove.Document;
import edu.harvard.hul.ois.jhove.DocumentType;
import edu.harvard.hul.ois.jhove.ErrorMessage;
import edu.harvard.hul.ois.jhove.Identifier;
import edu.harvard.hul.ois.jhove.IdentifierType;
import edu.harvard.hul.ois.jhove.InfoMessage;
import edu.harvard.hul.ois.jhove.JhoveException;
import edu.harvard.hul.ois.jhove.ModuleBase;
import edu.harvard.hul.ois.jhove.Property;
import edu.harvard.hul.ois.jhove.PropertyArity;
import edu.harvard.hul.ois.jhove.PropertyType;
import edu.harvard.hul.ois.jhove.RepInfo;
import edu.harvard.hul.ois.jhove.module.gzip.GzipEntryProperties;

/**
 * JHOVE module for identifying, validating and characterizing GZIP files.
 * Ported from the JHOVE2 GZIP module, created by lbihanic, selghissassi, nicl
 * 
 * JHOVE2 GZip module.  This module parses and validates GZip files
 * in compliance with
 * RFC 1952 (GZIP
 * file format specification version 4.3) and supports multiple member
 * GZIP files.

* * This is a non-recursive validation. It only validates the GZIP file format, * not the actual content within the WARC records. * * @author [email protected] */ public class GzipModule extends ModuleBase { /*------------ MODULE DEFINITIONS ---------------*/ private static final Agent KB_AGENT = new Builder( "Royal Library of Denmark", AgentType.STANDARD) .address("Søren Kierkegaards Plads 1, 1219 København K, Denmark") .fax("+45 3393 2218") .web("http://kb.dk").build(); private static final String NAME = "GZIP-kb"; private static final String RELEASE = "0.1"; private static final int[] DATE = {2015, 12, 8}; private static final String[] FORMAT = {"GZIP"}; private static final String COVERAGE = "GZIP, https://tools.ietf.org/html/rfc1952"; private static final String[] MIMETYPE = {"application/gzip", "application/x-gzip"}; private static final String WELLFORMED = ""; private static final String VALIDITY = "The file is well-formed"; private static final String REPINFO = ""; private static final String NOTE = ""; private static final String RIGHTS = "Copyright 2015 by The Royal Library of Denmark. " + "Released under the GNU Lesser General Public License."; /** * List of Property elements for the entry of the GZIP-file. * Each Property contains a map of all properties for a given entry. */ private List entryProperties; /** * Constructor. */ public GzipModule() { super(NAME, RELEASE, DATE, FORMAT, COVERAGE, MIMETYPE, WELLFORMED, VALIDITY, REPINFO, NOTE, RIGHTS, false); setVendorAndSpecification(); initialiseVariables(); } /** * Sets the vendor and specification for this module. */ private void setVendorAndSpecification() { _vendor = KB_AGENT; Document doc = new Document("GZIP", DocumentType.RFC); // Should probably have IIPC and others as authors Agent ietfAgent = new Agent.Builder("IETF", AgentType.STANDARD).web( "http://www.ietf.org").build(); doc.setPublisher(ietfAgent); doc.setDate("1996"); doc.setIdentifier(new Identifier("https://www.ietf.org/rfc/rfc1952.txt", IdentifierType.RFC)); _specification.add(doc); // TODO figure out, why the tests fail, when the signature is added. // Signature sig = new ExternalSignature(".gz", SignatureType.EXTENSION, // SignatureUseType.OPTIONAL); // _signature.add(sig); } /** * Initializes the variables. */ private void initialiseVariables() { entryProperties = new ArrayList(); } /** Reset parameter settings. * Returns to a default state without any parameters. */ @Override public void resetParams() throws Exception { initialiseVariables(); } @Override public void parse(RandomAccessFile file, RepInfo info) throws IOException { InputStream stream = new RandomAccessFileInputStream(file); parse(stream, info, 0); } @Override public int parse(InputStream stream, RepInfo info, int parseIndex) throws IOException { GzipReader reader = new GzipReader(new InputStreamNoSkip(stream), 8192); try { parseRecords(reader); info.setValid(reader.isCompliant()); info.setWellFormed(reader.isCompliant()); reportResults(reader, info); info.setSigMatch(_name); info.setFormat(_format[0]); info.setVersion("4.3"); // Is it really version 4.3? info.setMimeType(_mimeType[0]); } catch (Exception e) { info.setMessage(new ErrorMessage(e.getMessage())); info.setValid(false); info.setWellFormed(false); } finally { if(reader != null) { reader.close(); reader = null; } } return 0; } /** * Parse GZIP entries. Parsing should be straight forward with all records accessible through the same source. * @param reader GZIP reader used to parse records * @throws EOFException if EOF occurs prematurely * @throws IOException if an IO error occurs while processing * @throws JHOVE2Exception if a serious problem needs to be reported */ protected void parseRecords(GzipReader reader) throws EOFException, IOException, JhoveException { if (reader != null) { GzipEntry entry; while ((entry = reader.getNextEntry()) != null) { processEntry(entry); reader.diagnostics.addAll(entry.diagnostics); } } else { throw new JhoveException("WarcReader is has not been properly instantiated."); } } /** * Process a GZIP entry. * Extracts all the properties of the entry into a map, and puts this map on the list. * @param entry ZGIP entry from GZIP reader * @throws EOFException if EOF occurs prematurely * @throws IOException if an IO error occurs while processing * @throws JhoveException if a serious problem needs to be reported */ protected void processEntry(GzipEntry entry) throws EOFException, IOException, JhoveException { GzipEntryProperties properties = new GzipEntryProperties(entry); Property p = new Property("Record", PropertyType.STRING, PropertyArity.MAP, properties.getProperties()); entryProperties.add(p); entry.close(); } /** * Report the results of the characterization. * @param reader The GZIP reader, which has read the GZIP-file. * @param repInfo The representation info, where to report the results. * @throws JhoveException * @throws IOException */ private void reportResults(GzipReader reader, RepInfo repInfo) throws JhoveException, IOException { Diagnostics diagnostics = reader.diagnostics; if (diagnostics.hasErrors()) { for (Diagnosis d : diagnostics.getErrors()) { repInfo.setMessage(new ErrorMessage(extractDiagnosisType(d), extractDiagnosisMessage(d))); } repInfo.setConsistent(false); } if (diagnostics.hasWarnings()) { // Report warnings on source object. for (Diagnosis d : diagnostics.getWarnings()) { repInfo.setMessage(new InfoMessage(extractDiagnosisType(d), extractDiagnosisMessage(d))); } } repInfo.setProperty(new Property("Records", PropertyType.PROPERTY, PropertyArity.LIST, entryProperties)); repInfo.setSize(reader.getConsumed()); } /** * Extracts the diagnosis type. * @param d The diagnosis whose type should be extracted * @return The type of diagnosis */ private String extractDiagnosisType(Diagnosis d) { return d.type.name(); } /** * Extracts the message from the diagnosis. * @param d The diagnosis * @return The message containing entity and informations. */ private String extractDiagnosisMessage(Diagnosis d) { StringBuilder res = new StringBuilder(); res.append("Entity: " + d.entity); for(String i : d.information) { res.append(", " + i); } return res.toString(); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy