// org.archive.io.arc.ARCReader
/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.io.arc;
import java.io.ByteArrayOutputStream;
import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Logger;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.archive.io.ArchiveReader;
import org.archive.io.ArchiveRecord;
import org.archive.io.ArchiveRecordHeader;
import org.archive.io.RecoverableIOException;
import org.archive.io.WriterPoolMember;
import org.archive.util.ArchiveUtils;
/**
 * Get an iterator on an ARC file or get a record by absolute position.
 *
 * <p>ARC files are described in the Internet Archive's ARC File Format
 * documentation.
 *
 * <p>This class knows how to parse an ARC file.  Pass it a file path
 * or an URL to an ARC.  It can parse ARC Version 1 and 2.
 *
 * <p>The iterator returns {@link ARCRecord} though {@link Iterator#next()}
 * is returning java.lang.Object.  Cast the return.
 *
 * <p>Profiling java.io vs. memory-mapped ByteBufferInputStream shows the
 * latter slightly slower -- but not by much.  TODO: Test more.  Just
 * change {@link #getInputStream(File, long)}.
 *
 * @author stack
 * @version $Date$ $Revision$
 */
public abstract class ARCReader extends ArchiveReader
implements ARCConstants, Closeable {
    /** Class logger.  Not currently used in this class; kept for debugging. */
    private static final Logger logger =
        Logger.getLogger(ARCReader.class.getName());

    /**
     * Set to true if we are aligned on first record of Archive file.
     * We used to depend on offset: if offset was zero, then we were
     * aligned on first record.  This is no longer necessarily the case when
     * a Reader is created at an offset into an Archive file: the offset is
     * zero but it is relative to where we started reading.
     */
    private boolean alignedOnFirstRecord = true;

    /** Whether records should parse any HTTP headers they contain. */
    private boolean parseHttpHeaders = true;

    protected ARCReader() {
        super();
    }

    /**
     * Skip over any trailing new lines at end of the record so we're lined up
     * ready to read the next.
     *
     * @param record record whose end we are positioned at.
     * @throws IOException if a character other than LINE_SEPARATOR is read
     * and the underlying stream does not support mark/reset (so we cannot
     * back up), or on any underlying read failure.
     */
    protected void gotoEOR(ArchiveRecord record) throws IOException {
        if (getIn().available() <= 0) {
            return;
        }

        // Remove any trailing LINE_SEPARATOR.
        int c = -1;
        while (getIn().available() > 0) {
            if (getIn().markSupported()) {
                getIn().mark(1);
            }
            c = getIn().read();
            if (c != -1) {
                if (c == LINE_SEPARATOR) {
                    continue;
                }
                if (getIn().markSupported()) {
                    // We've overread.  We're probably in next record.  There
                    // is no way of telling for sure.  It may be dross at end
                    // of current record.  Backup.
                    getIn().reset();
                    break;
                }
                ArchiveRecordHeader h = (getCurrentRecord() != null)?
                    record.getHeader(): null;
                throw new IOException("Read " + (char)c +
                    " when only " + LINE_SEPARATOR + " expected. " +
                    getReaderIdentifier() + ((h != null)?
                        h.getHeaderFields().toString(): ""));
            }
        }
    }

    /**
     * Create new arc record.
     *
     * <p>Encapsulate housekeeping that has to do with creating a new record.
     *
     * <p>Call this method at end of constructor to read in the arcfile
     * header.  There will be problems reading subsequent arc records if you
     * don't, since the arcfile header has the list of metadata fields for
     * all records that follow.
     *
     * <p>When parsing through ARCs writing out CDX info, we spend about
     * 38% of CPU in here -- about 30% of which is in getTokenizedHeaderLine
     * -- of which 16% is reading.
     *
     * @param is InputStream to use.
     * @param offset Absolute offset into arc file.
     * @return An arc record.
     * @throws IOException on read/parse failure; the failing offset is
     * appended to the message (RecoverableIOExceptions pass through
     * unwrapped).
     */
    protected ARCRecord createArchiveRecord(InputStream is, long offset)
    throws IOException {
        try {
            String version = super.getVersion();
            ARCRecord record = new ARCRecord(is, getReaderIdentifier(), offset,
                isDigest(), isStrict(), isParseHttpHeaders(),
                isAlignedOnFirstRecord(), version);
            // NOTE(review): this condition can never be true -- version was
            // just read from super.getVersion(), so version != null implies
            // super.getVersion() != null.  The intent was presumably to pick
            // up the version the new record parsed from the arcfile header;
            // confirm against ARCRecord before changing.
            if (version != null && super.getVersion() == null)
                super.setVersion(version);
            currentRecord(record);
        } catch (IOException e) {
            if (e instanceof RecoverableIOException) {
                // Don't mess with RecoverableIOExceptions.  Let them out.
                throw e;
            }
            // Rewrap with the offset appended to aid debugging, keeping both
            // the original stack trace and the cause chain.
            IOException newE = new IOException(e.getMessage() + " (Offset " +
                offset + ").");
            newE.initCause(e);
            newE.setStackTrace(e.getStackTrace());
            throw newE;
        }
        return (ARCRecord)getCurrentRecord();
    }

    /**
     * Returns version of this ARC file.  Usually read from first record of
     * ARC.  If we're reading without having first read the first record --
     * e.g. random access into middle of an ARC -- then version will not have
     * been set.  For now, we return a default, version 1.1.  Later, if more
     * than just one version of ARC, we could look at such as the meta line
     * to see what version of ARC this is.
     *
     * @return Version of this ARC file.
     */
    public String getVersion() {
        return (super.getVersion() == null)? "1.1": super.getVersion();
    }

    /** @return true if this reader is aligned on the Archive file's first record. */
    protected boolean isAlignedOnFirstRecord() {
        return alignedOnFirstRecord;
    }

    /** @param alignedOnFirstRecord whether we're aligned on the first record. */
    protected void setAlignedOnFirstRecord(boolean alignedOnFirstRecord) {
        this.alignedOnFirstRecord = alignedOnFirstRecord;
    }

    /**
     * @return Returns the parseHttpHeaders.
     */
    public boolean isParseHttpHeaders() {
        return this.parseHttpHeaders;
    }

    /**
     * @param parse The parseHttpHeaders to set.
     */
    public void setParseHttpHeaders(boolean parse) {
        this.parseHttpHeaders = parse;
    }

    public String getFileExtension() {
        return ARC_FILE_EXTENSION;
    }

    public String getDotFileExtension() {
        return DOT_ARC_FILE_EXTENSION;
    }

    /**
     * Output this ARC file in the passed format.
     *
     * @param format output format identifier.
     * @return true if the format was handled.
     * @throws IOException if format is HEADER or NOHEAD -- those are only
     * supported for single records -- or on read failure.
     * @throws java.text.ParseException on record date parse failure.
     */
    protected boolean output(final String format)
    throws IOException, java.text.ParseException {
        boolean result = super.output(format);
        if (!result && (format.equals(NOHEAD) || format.equals(HEADER))) {
            throw new IOException(format +
                " format only supported for single Records");
        }
        return result;
    }

    /**
     * Output the record we're aligned on in the passed format.  Adds the
     * ARC-specific NOHEAD (content only) and HEADER (HTTP header only)
     * formats on top of whatever the superclass handles.
     *
     * @param format output format identifier.
     * @return true if the format was handled.
     * @throws IOException on read failure.
     */
    public boolean outputRecord(final String format) throws IOException {
        boolean result = super.outputRecord(format);
        if (result) {
            return result;
        }
        if (format.equals(NOHEAD)) {
            // No point digesting if dumping content.
            setDigest(false);
            ARCRecord r = (ARCRecord) get();
            r.skipHttpHeader();
            r.dump();
            result = true;
        } else if (format.equals(HEADER)) {
            // No point digesting if dumping content.
            setDigest(false);
            ARCRecord r = (ARCRecord) get();
            r.dumpHttpHeader();
            result = true;
        }
        return result;
    }

    /**
     * Dump this ARC file on stdout, rewriting it record by record.
     *
     * @param compress true to write the dump gzip-compressed.
     * @throws IOException on read or write failure.
     * @throws java.text.ParseException if a record's 14-digit date fails to
     * parse.
     */
    public void dump(final boolean compress)
    throws IOException, java.text.ParseException {
        // No point digesting if we're doing a dump.
        setDigest(false);
        boolean firstRecord = true;
        ARCWriter writer = null;
        for (Iterator<ArchiveRecord> ii = iterator(); ii.hasNext();) {
            ARCRecord r = (ARCRecord)ii.next();
            // We're to dump the arc on stdout.
            // Get the first record's data if any.
            ARCRecordMetaData meta = r.getMetaData();
            if (firstRecord) {
                firstRecord = false;
                // Get an ARCWriter.  The first record (the arcfile header)
                // becomes the new writer's metadata.
                ByteArrayOutputStream baos =
                    new ByteArrayOutputStream(r.available());
                // This is slow but done only once at top of ARC.
                while (r.available() > 0) {
                    baos.write(r.read());
                }
                List<String> listOfMetadata = new ArrayList<String>();
                listOfMetadata.add(baos.toString(WriterPoolMember.UTF8));
                // Assume getArc returns full path to file.  ARCWriter
                // or new File will complain if it is otherwise.
                List<File> outDirs = new ArrayList<File>();
                WriterPoolSettingsData settings =
                    new WriterPoolSettingsData("", "", -1L, compress, outDirs,
                        listOfMetadata);
                // NOTE(review): the writer wraps System.out and is
                // deliberately never closed; closing it would close stdout.
                writer = new ARCWriter(new AtomicInteger(), System.out,
                    new File(meta.getArc()), settings);
                continue;
            }
            writer.write(meta.getUrl(), meta.getMimetype(), meta.getIp(),
                ArchiveUtils.parse14DigitDate(meta.getDate()).getTime(),
                (int)meta.getLength(), r);
        }
    }

    /**
     * @param f local Archive file to delete when the returned reader is
     * closed.
     * @return an ArchiveReader that will delete a local file on close.  Used
     * when we bring Archive files local and need to clean up afterward.
     */
    public ARCReader getDeleteFileOnCloseReader(final File f) {
        final ARCReader d = this;
        // Return a decorator that forwards everything to this reader and, on
        // close, additionally deletes the passed file (once).
        return new ARCReader() {
            private final ARCReader delegate = d;
            private File archiveFile = f;

            public void close() throws IOException {
                this.delegate.close();
                if (this.archiveFile != null) {
                    if (archiveFile.exists()) {
                        archiveFile.delete();
                    }
                    // Null out so a second close() is a no-op.
                    this.archiveFile = null;
                }
            }

            public ArchiveRecord get(long o) throws IOException {
                return this.delegate.get(o);
            }

            public boolean isDigest() {
                return this.delegate.isDigest();
            }

            public boolean isStrict() {
                return this.delegate.isStrict();
            }

            public Iterator iterator() {
                return this.delegate.iterator();
            }

            public void setDigest(boolean d) {
                this.delegate.setDigest(d);
            }

            public void setStrict(boolean s) {
                this.delegate.setStrict(s);
            }

            public List validate() throws IOException {
                return this.delegate.validate();
            }

            @Override
            public ArchiveRecord get() throws IOException {
                return this.delegate.get();
            }

            @Override
            public String getVersion() {
                return this.delegate.getVersion();
            }

            @Override
            public List validate(int noRecords) throws IOException {
                return this.delegate.validate(noRecords);
            }

            @Override
            protected ARCRecord createArchiveRecord(InputStream is,
                    long offset)
            throws IOException {
                return this.delegate.createArchiveRecord(is, offset);
            }

            @Override
            protected void gotoEOR(ArchiveRecord record) throws IOException {
                this.delegate.gotoEOR(record);
            }

            @Override
            public void dump(boolean compress)
            throws IOException, java.text.ParseException {
                this.delegate.dump(compress);
            }

            @Override
            public String getDotFileExtension() {
                return this.delegate.getDotFileExtension();
            }

            @Override
            public String getFileExtension() {
                return this.delegate.getFileExtension();
            }
        };
    }

    // Static methods follow.

    /**
     * Print CLI usage and exit.
     *
     * @param formatter Help formatter instance.
     * @param options Usage options.
     * @param exitCode Exit code.
     */
    private static void usage(HelpFormatter formatter, Options options,
            int exitCode) {
        formatter.printHelp("java org.archive.io.arc.ARCReader" +
            " [--digest=true|false] \\\n" +
            " [--format=cdx|cdxfile|dump|gzipdump|header|nohead]" +
            " [--offset=#] \\\n[--strict] [--parse] ARC_FILE|ARC_URL",
            options);
        System.exit(exitCode);
    }

    /**
     * Write out the arcfile.
     *
     * @param reader reader to output.
     * @param format Format to use outputting.
     * @throws IOException if the format is unsupported or on read failure.
     * @throws java.text.ParseException on record date parse failure.
     */
    protected static void output(ARCReader reader, String format)
    throws IOException, java.text.ParseException {
        if (!reader.output(format)) {
            throw new IOException("Unsupported format: " + format);
        }
    }

    /**
     * Generate a CDX index file for an ARC file.
     *
     * @param urlOrPath The ARC file to generate a CDX index for.
     * @throws IOException on read failure.
     * @throws java.text.ParseException on record date parse failure.
     */
    public static void createCDXIndexFile(String urlOrPath)
    throws IOException, java.text.ParseException {
        ARCReader r = ARCReaderFactory.get(urlOrPath);
        r.setStrict(false);
        r.setParseHttpHeaders(true);
        r.setDigest(true);
        output(r, CDX_FILE);
    }

    /**
     * Command-line interface to ARCReader.
     *
     * <p>Here is the command-line interface:
     * <pre>
     * usage: java org.archive.io.arc.ARCReader [--offset=#] ARCFILE
     *  -h,--help      Prints this message and exits.
     *  -o,--offset    Outputs record at this offset into arc file.
     * </pre>
     *
     * <p>See the script at <code>$HERITRIX_HOME/bin/arcreader</code>; it
     * takes care of classpaths and the calling of ARCReader.
     *
     * <p>Outputs using a pseudo-CDX format as described in the CDX legend
     * and examples published by the Internet Archive.
     * Legend used in below is: 'CDX b e a m s c V (or v if uncompressed) n g'.
     * Hash is hard-coded straight SHA-1 hash of content.
     *
     * @param args Command-line arguments.
     * @throws ParseException Failed parse of the command line.
     * @throws IOException on read failure.
     * @throws java.text.ParseException on record date parse failure.
     */
    @SuppressWarnings("unchecked")
    public static void main(String [] args)
    throws ParseException, IOException, java.text.ParseException {
        Options options = getOptions();
        options.addOption(new Option("p", "parse", false, "Parse headers."));
        PosixParser parser = new PosixParser();
        CommandLine cmdline = parser.parse(options, args, false);
        List<String> cmdlineArgs = cmdline.getArgList();
        Option [] cmdlineOptions = cmdline.getOptions();
        HelpFormatter formatter = new HelpFormatter();

        // If no args, print help.
        if (cmdlineArgs.size() <= 0) {
            usage(formatter, options, 0);
        }

        // Now look at options passed.
        long offset = -1;
        boolean digest = false;
        boolean strict = false;
        boolean parse = false;
        String format = CDX;
        for (int i = 0; i < cmdlineOptions.length; i++) {
            switch(cmdlineOptions[i].getId()) {
            case 'h':
                usage(formatter, options, 0);
                break;

            case 'o':
                offset =
                    Long.parseLong(cmdlineOptions[i].getValue());
                break;

            case 's':
                strict = true;
                break;

            case 'p':
                parse = true;
                break;

            case 'd':
                digest = getTrueOrFalse(cmdlineOptions[i].getValue());
                break;

            case 'f':
                format = cmdlineOptions[i].getValue().toLowerCase();
                boolean match = false;
                // List of supported formats.
                final String [] supportedFormats =
                    {CDX, DUMP, GZIP_DUMP, HEADER, NOHEAD, CDX_FILE};
                for (int ii = 0; ii < supportedFormats.length; ii++) {
                    if (supportedFormats[ii].equals(format)) {
                        match = true;
                        break;
                    }
                }
                if (!match) {
                    usage(formatter, options, 1);
                }
                break;

            default:
                throw new RuntimeException("Unexpected option: " +
                    cmdlineOptions[i].getId());
            }
        }

        if (offset >= 0) {
            // Random access to a single record: exactly one arcfile allowed.
            if (cmdlineArgs.size() != 1) {
                System.out.println("Error: Pass one arcfile only.");
                usage(formatter, options, 1);
            }
            ARCReader arc = ARCReaderFactory.get(cmdlineArgs.get(0), offset);
            arc.setStrict(strict);
            // We must parse headers if we need to skip them.
            if (format.equals(NOHEAD) || format.equals(HEADER)) {
                parse = true;
            }
            arc.setParseHttpHeaders(parse);
            outputRecord(arc, format);
        } else {
            for (String urlOrPath : cmdlineArgs) {
                try {
                    ARCReader r = ARCReaderFactory.get(urlOrPath);
                    r.setStrict(strict);
                    r.setParseHttpHeaders(parse);
                    r.setDigest(digest);
                    output(r, format);
                } catch (RuntimeException e) {
                    // Write out name of file we failed on to help with
                    // debugging, then print the stack trace.
                    // NOTE(review): despite being fed a list of ARCs, we exit
                    // on the first bad one rather than moving on to the next.
                    System.err.println("Exception processing " + urlOrPath +
                        ": " + e.getMessage());
                    e.printStackTrace(System.err);
                    System.exit(1);
                }
            }
        }
    }
}