org.archive.io.ArchiveReader Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of webarchive-commons Show documentation
There is a newer version: 1.1.9
/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package org.archive.io;


import java.io.BufferedInputStream;
import java.io.BufferedWriter;
import java.io.Closeable;
import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.archive.util.MimetypeUtils;
import org.archive.util.zip.GZIPMembersInputStream;

import com.google.common.io.CountingInputStream;


/**
 * Reader for an Archive file of Archive {@link ArchiveRecord}s.
 * @author stack
 * @version $Date$ $Version$
 */
public abstract class ArchiveReader implements ArchiveFileConstants, Iterable, Closeable {    
    /**
     * Whether this Archive file is compressed.  Starts out false.
     */
    private boolean compressed = false;
    
    /**
     * Whether we should digest as we read.  Starts out true.
     */
    private boolean digest = true;
    
    /**
     * Whether the parse should be strict.  Starts out false.
     */
    private boolean strict = false;
    
    /**
     * Archive file input stream.
     *
     * Keep it around so we can close it when done.
     *
     * Set in constructor. Should support at least 1 byte mark/reset.
     * Make it protected so subclasses have access.
     */
    protected InputStream in = null;
    
    /**
     * Maximum number of recoverable exceptions in a row.
     * If more than this number occur in a row, we'll let the exception out
     * rather than go back in for yet another retry.
     */
    public static final int MAX_ALLOWED_RECOVERABLES = 10;
    

    /**
     * The Record currently being read.
     *
     * Keep this ongoing reference so we'll close the record even if the caller
     * doesn't.
     */
    private ArchiveRecord currentRecord = null;
    
    /**
     * Descriptive string for the Archive file we're going against:
     * full path, url, etc. -- depends on context in which file was made.
     */
    private String identifier = null;
    
    /**
     * Archive file version.  Null until a version has been set.
     */
    private String version = null;
    
    
    protected ArchiveReader() {
        super();
    }
    
    /**
     * Convenience hook for subclass constructors: records the identifier
     * of the Archive file being read.
     *
     * @param i Identifier for the Archive file this reader goes against.
     */
    protected void initialize(final String i) {
        this.setReaderIdentifier(i);
    }
    
    /**
     * Convenience method for constructors: opens the passed file wrapped in
     * a buffered stream.
     *
     * <p>Note that {@code offset} is not consulted here; the returned stream
     * is positioned at the start of the file and any positioning is left to
     * the caller.
     *
     * @param f File to read.
     * @param offset Offset at which the caller intends to start reading
     * (unused by this method).
     * @return Buffered InputStream over the file.
     * @throws IOException If the file cannot be opened.
     */
    protected InputStream getInputStream(final File f, final long offset)
    throws IOException {
        return new BufferedInputStream(new FileInputStream(f));
    }

    /**
     * @return True if this Archive file has been flagged as compressed.
     */
    public boolean isCompressed() {
        return compressed;
    }

    /**
     * Get record at passed offset.
     *
     * <p>Any record still open is cleaned up first, then the stream is moved
     * forward from its current record position to {@code offset} and a
     * record is created there.  Only forward movement is supported.
     *
     * @param offset Byte index into file at which a record starts.
     * @return An Archive Record reference.
     * @throws IOException If cleanup or reading fails, or if the stream ends
     * before {@code offset} is reached.
     * @throws UnsupportedOperationException If {@code offset} lies before
     * the current position.
     */
    public ArchiveRecord get(long offset) throws IOException {
        cleanupCurrentRecord();
        long posn = positionForRecord(in);
        if (offset < posn) {
            throw new UnsupportedOperationException("no reverse seeking: at "+posn+" requested "+offset);
        }
        // InputStream.skip() is allowed to skip fewer bytes than requested
        // (even zero) without being at end of stream, so keep going until
        // the whole distance has been covered.
        long remaining = offset - posn;
        while (remaining > 0) {
            long skipped = in.skip(remaining);
            if (skipped > 0) {
                remaining -= skipped;
            } else if (in.read() >= 0) {
                // skip() made no progress; a single-byte read both advances
                // the stream and tells us whether we are at end of stream.
                remaining--;
            } else {
                throw new EOFException("End of stream reached " + remaining
                    + " bytes before requested offset " + offset);
            }
        }
        return createArchiveRecord(this.in, offset);
    }
    
    /**
     * Creates a record at the stream's current record position.
     *
     * @return Archive Record created against the current offset.
     * @throws IOException
     */
    public ArchiveRecord get() throws IOException {
        final long here = positionForRecord(in);
        return createArchiveRecord(this.in, here);
    }

    /**
     * Closes the underlying stream, if there is one, and drops the
     * reference to it.  Does nothing when no stream is held.
     *
     * @throws IOException If closing the stream fails.
     */
    public void close() throws IOException {
        if (this.in == null) {
            return;
        }
        this.in.close();
        this.in = null;
    }
    
    /**
     * Clean out the current record if there is one: close it, move to its
     * end-of-record, then forget it.  The reference is only cleared once
     * both steps have completed without throwing.
     *
     * @throws IOException
     */
    protected void cleanupCurrentRecord() throws IOException {
        if (this.currentRecord == null) {
            return;
        }
        this.currentRecord.close();
        gotoEOR(this.currentRecord);
        this.currentRecord = null;
    }
    
    /**
     * Return an Archive Record homed on <code>offset</code> into
     * <code>is</code>.  Implemented by subclasses; called from
     * {@link #get()} and {@link #get(long)} with this reader's stream.
     *
     * @param is Stream to read Record from.
     * @param offset Offset to find Record at.
     * @return ArchiveRecord instance.
     * @throws IOException
     */
    protected abstract ArchiveRecord createArchiveRecord(InputStream is,
    	long offset)
    throws IOException;
    
    /**
     * Skip over any trailing new lines at end of the record so we're lined up
     * ready to read the next.  Called from {@link #cleanupCurrentRecord()}
     * after the record has been closed.
     *
     * @param record Record whose end we should move to.
     * @throws IOException
     */
    protected abstract void gotoEOR(ArchiveRecord record) throws IOException;
    
    /**
     * @return File extension for the Archive format this reader handles
     * (presumably without a leading dot -- confirm in implementations).
     */
    public abstract String getFileExtension();
    /**
     * @return File extension including the leading dot, e.g. '.arc' or
     * '.warc'.  Used when stripping extensions from file names.
     */
    public abstract String getDotFileExtension();

    /**
     * @return Version of this Archive file, or null if none has been set.
     */
    public String getVersion() {
        return version;
    }

    /**
     * Validate the Archive file.
     *
     * This method iterates over the file throwing exception if it fails
     * to successfully parse any record.
     *
     * Assumes the stream is at the start of the file.  The number of
     * records is treated as unknown.
     *
     * @return List of all read Archive Headers.
     *
     * @throws IOException
     */
    public List validate() throws IOException {
        // -1 tells validate(int) that the expected record count is unknown.
        final int unknownRecordCount = -1;
        return validate(unknownRecordCount);
    }

    /**
     * Validate the Archive file.
     *
     * This method iterates over the file throwing exception if it fails
     * to successfully parse.
     *
     * We start validation from wherever we are in the stream.
     *
     * <p>Side effect: strict parsing is switched on via
     * {@link #setStrict(boolean)} and is left on when this method returns.
     *
     * @param numRecords Number of records expected.  Pass -1 if number is
     * unknown.
     *
     * @return List of all read metadatas. As we validate records, we add
     * a reference to the read metadata.
     *
     * @throws IOException If a record is empty (non-positive length and the
     * no-type mimetype) or the record count does not match
     * <code>numRecords</code>.
     */
    public List validate(int numRecords)
    throws IOException {
        List<Object> hdrList = new ArrayList<Object>();
        int recordCount = 0;
        setStrict(true);
        // Typed iterator so next() yields an ArchiveRecord without a cast.
        for (Iterator<ArchiveRecord> i = iterator(); i.hasNext();) {
            recordCount++;
            ArchiveRecord r = i.next();
            if (r.getHeader().getLength() <= 0
                && r.getHeader().getMimetype().
                    equals(MimetypeUtils.NO_TYPE_MIMETYPE)) {
                throw new IOException("record content is empty.");
            }
            r.close();
            hdrList.add(r.getHeader());
        }

        if (numRecords != -1 && recordCount != numRecords) {
            throw new IOException("Count of records, "
                    + recordCount
                    + " is not equal to expected "
                    + numRecords);
        }

        return hdrList;
    }

    /**
     * Test Archive file is valid.
     * Assumes the stream is at the start of the file.  Be aware that this
     * method makes a pass over the whole file by way of {@link #validate()}.
     * @return True if file can be successfully parsed; false if any
     * exception was thrown while validating.
     */
    public boolean isValid() {
        try {
            validate();
            return true;
        } catch(Exception e) {
            // File is not valid if exception thrown parsing.
            return false;
        }
    }

    /**
     * @return True if the parse is strict.
     */
    public boolean isStrict() {
        return strict;
    }

    /**
     * Turns strict parsing on or off.
     *
     * @param s True if the parse should be strict.
     */
    public void setStrict(boolean s) {
        strict = s;
    }

    /**
     * Turns digesting on or off.
     *
     * @param d True if we're to digest as we read.
     */
    public void setDigest(boolean d) {
        digest = d;
    }

    /**
     * @return True if digesting as we read is switched on.
     */
    public boolean isDigest() {
        return digest;
    }
 
    /**
     * @return Logger named after the runtime class of this reader.
     */
    protected Logger getLogger() {
        final String loggerName = this.getClass().getName();
        return Logger.getLogger(loggerName);
    }
    
    /**
     * Returns an ArchiveRecord iterator.
     * Any record still outstanding is cleaned up before the iterator is
     * handed out.
     * Of note, on IOException, especially if ZipException reading compressed
     * ARCs, rather than fail the iteration, try moving to the next record.
     * If {@link ArchiveReader#strict} is not set, this will usually succeed.
     * @return An iterator over ARC records.
     * @exception RuntimeException Wrapping the IOException if cleaning up
     * the outstanding record fails.
     */
    public Iterator iterator() {
        try {
            // Eat up any record outstanding.
            cleanupCurrentRecord();
        } catch (IOException cleanupFailure) {
            throw new RuntimeException(cleanupFailure);
        }
        final Iterator records = new ArchiveRecordIterator();
        return records;
    }

	/**
	 * Records whether this Archive file is compressed.
	 *
	 * @param compressed True if the Archive file is compressed.
	 */
	protected void setCompressed(boolean compressed) {
		this.compressed = compressed;
	}

    /**
     * @return The record currently being tracked by this reader, or null
     * if none.  After construction has the arcfile header record.
     * @see #get()
     */
    protected ArchiveRecord getCurrentRecord() {
        return currentRecord;
    }

	/**
	 * Remembers the passed record as the current one and hands it back.
	 *
	 * @param r Record to track as current.
	 * @return The same record that was passed in.
	 */
	protected ArchiveRecord currentRecord(final ArchiveRecord r) {
		currentRecord = r;
		return r;
	}

	/**
	 * @return The Archive file input stream, or null if none is held.
	 */
	protected InputStream getIn() {
		return this.in;
	}

	/**
	 * Installs the stream this reader reads from.
	 *
	 * @param in Archive file input stream.
	 */
	protected void setIn(InputStream in) {
		this.in = in;
	}

	/**
	 * Records the Archive file version.
	 *
	 * @param version Archive file version.
	 */
	protected void setVersion(String version) {
		this.version = version;
	}

	/**
	 * @return Descriptive string for the Archive file this reader goes
	 * against, or null if none has been set.
	 */
	public String getReaderIdentifier() {
		return identifier;
	}

	/**
	 * Records the identifier of the Archive file this reader goes against.
	 *
	 * @param i Identifier: full path, url, etc.
	 */
	protected void setReaderIdentifier(final String i) {
		identifier = i;
	}
	
    /**
     * Log on stderr.
     * Logging should go via the logging system.  This method
     * bypasses the logging system going direct to stderr.
     * Should not generally be used.  It's used for rare messages
     * that come of cmdline usage of ARCReader ERRORs and WARNINGs.
     * Override if using ARCReader in a context where no stderr or
     * where you'd like to redirect stderr to other than System.err.
     * @param level Level to log message at.
     * @param message Message to log.
     */
    public void logStdErr(Level level, String message) {
        final String line = level.toString() + " " + message;
        System.err.println(line);
    }
    
//    /**
//     * Add buffering to RandomAccessInputStream.
//     */
//    protected class RandomAccessBufferedInputStream
//    extends BufferedInputStream implements RepositionableStream {
//
//        public RandomAccessBufferedInputStream(RandomAccessInputStream is)
//        		throws IOException {
//            super(is);
//        }
//
//        public RandomAccessBufferedInputStream(RandomAccessInputStream is, int size)
//        		throws IOException {
//            super(is, size);
//        }
//
//        public long position() throws IOException {
//            // Current position is the underlying files position
//            // minus the amount thats in the buffer yet to be read.
//            return ((RandomAccessInputStream)this.in).position() -
//            	(this.count - this.pos);
//        }
//
//        public void position(long position) throws IOException {
//            // Force refill of buffer whenever there's been a seek.
//            this.pos = 0;
//            this.count = 0;
//            ((RandomAccessInputStream)this.in).position(position);
//        }
//        
//        public int available() throws IOException {
//            // Avoid overflow on large datastreams
//            long amount = (long)in.available() + (long)(count - pos);
//            return (amount >= Integer.MAX_VALUE)? Integer.MAX_VALUE: (int)amount;
//        }
//    }
    
    /**
     * Inner ArchiveRecord Iterator class.
     * Throws RuntimeExceptions in {@link #hasNext()} and {@link #next()} if
     * trouble pulling record from underlying stream.
     * @author stack
     */
    protected class ArchiveRecordIterator implements Iterator<ArchiveRecord> {
        private final Logger logger =
            Logger.getLogger(this.getClass().getName());

        /**
         * Cleans up any record still open, then probes the stream for
         * more content.
         *
         * @return True if we have more records to read.
         * @exception RuntimeException Can throw an IOException wrapped in a
         * RuntimeException if a problem reading underlying stream (Corrupted
         * gzip, etc.).  Only thrown when the reader is strict.
         */
        @Override
        public boolean hasNext() {
            // Call close on any extant record.  This will scoot us past
            // any content not yet read.
            try {
                cleanupCurrentRecord();
            } catch (IOException e) {
                if (isStrict()) {
                    throw new RuntimeException(e);
                }
                if (e instanceof EOFException) {
                    logger.warning("Premature EOF cleaning up " + 
                        currentRecord.getHeader().toString() + ": " +
                        e.getMessage());
                    return false;
                }
                // If not strict, try going again.  We might be able to skip
                // over the bad record.
                logger.log(Level.WARNING,"Trying skip of failed record cleanup of " +
                    currentRecord.getHeader().toString() + ": " +
                    e.getMessage(), e);
            }
            return innerHasNext();
        }

        /**
         * Probes for more content by reading a single byte and then
         * resetting the stream back to where it was.
         *
         * @return True if a byte could be read; false at end of stream or
         * if the probe itself failed (the failure is logged).
         */
        protected boolean innerHasNext(){
            try {
                getIn().mark(1);
                int c = getIn().read();
                getIn().reset();
                return c > -1;
            } catch (IOException e) {
                logger.log(Level.WARNING,"problem probing for more content",e);
                return false;
            }
        }

        /**
         * Tries to move to next record if we get
         * {@link RecoverableIOException}. If not strict
         * tries to move to next record if we get an
         * {@link IOException}.
         * @return Next object.
         * @exception RuntimeException Throws a runtime exception,
         * usually a wrapping of an IOException, if trouble getting
         * a record (Throws exception rather than return null).
         */
        @Override
        public ArchiveRecord next() {
            // Remembered only so failures can report where the record began.
            long offset = -1;
            try {
                offset = positionForRecord(getIn());
                return exceptionNext();
            } catch (IOException e) {
                if (!isStrict()) {
                    // Retry though an IOE.  Maybe we will succeed reading
                    // subsequent record.
                    try {
                        if (hasNext()) {
                            getLogger().warning("Bad Record. Trying skip " +
                                "(Record start " +  offset + "): " +
                                e.getMessage());
                            return exceptionNext();
                        }
                        // Else we are at last record.  Iterator#next is
                        // expecting value. We do not have one. Throw exception.
                        throw new RuntimeException("Retried but no next " + 
                            "record (Record start " + offset + ")", e);
                    } catch (IOException e1) {
                        throw new RuntimeException("After retry (Offset " +
                                offset + ")", e1);
                    }
                }
                throw new RuntimeException("(Record start " + offset + ")", e);
            }
        }

        /**
         * A next that throws exceptions and has handling of
         * recoverable exceptions moving us to next record. Can call
         * hasNext which itself may throw exceptions.
         * @return Next record.
         * @throws IOException
         * @throws RuntimeException Thrown when we've reached maximum
         * retries, or when a recoverable failure is followed by there
         * being no further record to try.
         */
        protected ArchiveRecord exceptionNext()
        throws IOException, RuntimeException {
            ArchiveRecord result = null;
            IOException ioe = null;
            for (int i = MAX_ALLOWED_RECOVERABLES; i > 0 &&
                    result == null; i--) {
                // Forget any failure from the previous attempt; only the
                // outcome of the latest attempt matters after the loop.
                ioe = null;
                try {
                    result = innerNext();
                } catch (RecoverableIOException e) {
                    ioe = e;
                    getLogger().warning(e.getMessage());
                    if (hasNext()) {
                        continue;
                    }
                    // No records left.  Throw exception rather than
                    // return null.  The caller is expecting to get
                    // back a record since they've just called
                    // hasNext.
                    break;
                }
            }
            if (ioe != null) {
                // Then we did MAX_ALLOWED_RECOVERABLES retries.  Throw
                // the recoverable ioe wrapped in a RuntimeException so
                // it goes out pass checks for IOE.
                throw new RuntimeException("Retried " +
                    MAX_ALLOWED_RECOVERABLES + " times in a row", ioe);
            }
            return result;
        }

        /**
         * Fetches the record at the stream's current record position.
         *
         * @return Record at the current position.
         * @throws IOException
         */
        protected ArchiveRecord innerNext() throws IOException {
            return get(positionForRecord(getIn()));
        }

        /**
         * Removal is not supported.
         *
         * @throws UnsupportedOperationException Always.
         */
        @Override
        public void remove() {
            throw new UnsupportedOperationException();
        }
    }
    
    /**
     * Works out the position to associate with a record read from the
     * passed stream.
     *
     * @param in Stream to query; must be either a GZIPMembersInputStream
     * or a CountingInputStream (anything else fails the cast).
     * @return The current member start for a GZIPMembersInputStream,
     * otherwise the CountingInputStream's count.
     */
    protected static long positionForRecord(InputStream in) {
        if (in instanceof GZIPMembersInputStream) {
            return ((GZIPMembersInputStream)in).getCurrentMemberStart();
        }
        return ((CountingInputStream)in).getCount();
    }
    
    /**
     * Removes a trailing extension from a name.
     *
     * @param name Name to strip.
     * @param ext Extension to remove, e.g. '.gz'.
     * @return <code>name</code> without the trailing <code>ext</code>, or
     * <code>name</code> unchanged if it does not end with <code>ext</code>.
     */
    protected static String stripExtension(final String name,
            final String ext) {
        if (name.endsWith(ext)) {
            final int keep = name.length() - ext.length();
            return name.substring(0, keep);
        }
        return name;
    }
    
    /**
     * @return short name of Archive file: the last name element of the
     * reader identifier when treated as a file path.
     */
    public String getFileName() {
        final File archive = new File(getReaderIdentifier());
        return archive.getName();
    }

    /**
     * @return short name of Archive file with the compression extension
     * and this reader's dot file extension stripped off.
     */
    public String getStrippedFileName() {
        final String shortName = getFileName();
        final String dotExtension = getDotFileExtension();
        return getStrippedFileName(shortName, dotExtension);
    }
    
    /**
     * Strips first the compressed-file extension and then the passed
     * extension from the end of a name.
     *
     * @param name Name of ARCFile.
     * @param dotFileExtension '.arc' or '.warc', etc.
     * @return short name of Archive file.
     */
    public static String getStrippedFileName(String name,
            final String dotFileExtension) {
        final String withoutCompression = stripExtension(name,
            ArchiveFileConstants.DOT_COMPRESSED_FILE_EXTENSION);
        return stripExtension(withoutCompression, dotFileExtension);
    }
    
    /**
     * Interprets a string as a boolean, ignoring case.
     *
     * @param value Value to test.
     * @return True if value is 'true' in any letter case; false otherwise,
     * including for null and the empty string.
     */
    protected static boolean getTrueOrFalse(final String value) {
        return value != null
            && value.length() > 0
            && Boolean.TRUE.toString().equals(value.toLowerCase());
    }
    
    /**
     * Outputs the whole file in the passed format.  Formats handled here
     * are DUMP, GZIP_DUMP, CDX and CDX_FILE; digesting is switched off
     * before either kind of dump.
     *
     * @param format Format to use outputting.
     * @throws IOException
     * @throws java.text.ParseException
     * @return True if handled; false if the format is not one of the above.
     */
    protected boolean output(final String format)
    throws IOException, java.text.ParseException {
        if (format.equals(DUMP)) {
            // No point digesting dumping.
            setDigest(false);
            dump(false);
            return true;
        }
        if (format.equals(GZIP_DUMP)) {
            // No point digesting dumping.
            setDigest(false);
            dump(true);
            return true;
        }
        // Write output as pseudo-CDX file.  See
        // http://www.archive.org/web/researcher/cdx_legend.php
        // and http://www.archive.org/web/researcher/example_cdx.php.
        // Hash is hard-coded straight SHA-1 hash of content.
        if (format.equals(CDX)) {
            cdxOutput(false);
            return true;
        }
        if (format.equals(CDX_FILE)) {
            cdxOutput(true);
            return true;
        }
        return false;
    }
    
    /**
     * Writes a CDX header line followed by one CDX line per record.
     *
     * <p>When writing to a file, the file name is the reader identifier
     * with the compression extension and this reader's dot extension
     * stripped, plus '.' and the CDX constant.  The file is closed on the
     * way out whether or not writing succeeded.
     *
     * @param toFile True to write to a file; false to write to STDOUT.
     * @throws IOException If opening, writing or closing the file fails, or
     * a record's CDX line cannot be produced.
     */
    protected void cdxOutput(boolean toFile)
    throws IOException {
        BufferedWriter cdxWriter = null;
        if (toFile) {
            String cdxFilename = stripExtension(getReaderIdentifier(),
                DOT_COMPRESSED_FILE_EXTENSION);
            cdxFilename = stripExtension(cdxFilename, getDotFileExtension());
            cdxFilename += ('.' + CDX);
            cdxWriter = new BufferedWriter(new FileWriter(cdxFilename));
        }

        // Everything after the writer is opened runs inside the try so the
        // file is closed even if writing the header fails.
        try {
            String header = "CDX b e a m s c " + ((isCompressed()) ? "V" : "v")
                + " n g";
            if (toFile) {
                cdxWriter.write(header);
                cdxWriter.newLine();
            } else {
                System.out.println(header);
            }

            String strippedFileName = getStrippedFileName();
            for (Iterator<ArchiveRecord> ii = iterator(); ii.hasNext();) {
                ArchiveRecord r = ii.next();
                if (toFile) {
                    cdxWriter.write(r.outputCdx(strippedFileName));
                    cdxWriter.newLine();
                } else {
                    System.out.println(r.outputCdx(strippedFileName));
                }
            }
        } finally {
            if (cdxWriter != null) {
                cdxWriter.close();
            }
        }
    }
    
    /**
     * Output the record at the current position using passed format
     * specifier.  Only CDX (printed to STDOUT) and DUMP are handled here;
     * digesting is switched off before a dump.
     *
     * @param format What format to use outputting.
     * @throws IOException
     * @return True if handled; false for any other format.
     */
    public boolean outputRecord(final String format)
    throws IOException {
        if (format.equals(CDX)) {
            System.out.println(get().outputCdx(getStrippedFileName()));
            return true;
        }
        if (format.equals(ArchiveFileConstants.DUMP)) {
            // No point digesting if dumping content.
            setDigest(false);
            get().dump();
            return true;
        }
        return false;
    }

    /**
     * Dump this file on STDOUT
     * @param compress True if dumped output is compressed.
     * @throws IOException
     * @throws java.text.ParseException
     */
    public abstract void dump(final boolean compress)
    throws IOException, java.text.ParseException;
    
    /**
     * @param f Presumably the local file to delete when the returned reader
     * is closed -- confirm against implementations.
     * @return an ArchiveReader that will delete a local file on close.  Used
     * when we bring Archive files local and need to clean up afterward.
     */
    public abstract ArchiveReader getDeleteFileOnCloseReader(final File f);
    
    /**
     * Output a single record from the passed reader using passed format
     * specifier, failing if the reader does not handle that format.
     *
     * @param r ARCReader instance to output.
     * @param format What format to use outputting.
     * @throws IOException If the format is not handled for a single
     * record, or outputting fails.
     */
    protected static void outputRecord(final ArchiveReader r,
        final String format)
    throws IOException {
        final boolean handled = r.outputRecord(format);
        if (handled) {
            return;
        }
        throw new IOException(
            "Unsupported format (or unsupported on a single record): "
            + format);
    }
    
    /**
     * Builds the base command-line options: help (-h), offset (-o, takes a
     * value), digest (-d, takes a value), strict (-s) and format (-f, takes
     * a value).
     *
     * @return Base Options object filled out with help, digest, strict, etc.
     * options.
     */
    protected static Options getOptions() {
        Options options = new Options();
        options.addOption(new Option("h","help", false,
            "Prints this message and exits."));
        options.addOption(new Option("o","offset", true,
            "Outputs record at this offset into file."));
        options.addOption(new Option("d","digest", true,
            "Pass true|false. Expensive. Default: true (SHA-1)."));
        options.addOption(new Option("s","strict", false,
            "Strict mode. Fails parse if incorrectly formatted file."));
        // Help text lists each format name in balanced quotes.
        options.addOption(new Option("f","format", true,
            "Output options: 'cdx', 'cdxfile', 'dump', 'gzipdump', " +
            "or 'nohead'. Default: 'cdx'."));
        return options;
    }
}