All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.archive.io.warc.WARCReaderFactory Maven / Gradle / Ivy

The newest version!
/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

package org.archive.io.warc;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Iterator;

import org.archive.io.ArchiveReader;
import org.archive.io.ArchiveReaderFactory;
import org.archive.io.ArchiveRecord;
import org.archive.io.warc.WARCConstants;
import org.archive.util.ArchiveUtils;
import org.archive.util.FileUtils;
import org.archive.util.zip.GZIPMembersInputStream;

import com.google.common.io.CountingInputStream;

/**
 * Factory for WARC Readers.
 * Figures whether to give out a compressed file Reader or an uncompressed
 * Reader.
 * @author stack
 * @version $Date: 2006-08-23 17:59:04 -0700 (Wed, 23 Aug 2006) $ $Version$
 */
public class WARCReaderFactory extends ArchiveReaderFactory
implements WARCConstants {
    private static final WARCReaderFactory factory = new WARCReaderFactory();

    /**
     * Shutdown any access to default constructor.
     * This factory is Singleton.
     */
    private WARCReaderFactory() {
        super();
    }
    
    public static WARCReader get(String arcFileOrUrl)
    throws MalformedURLException, IOException {
    	return (WARCReader)WARCReaderFactory.factory.
    		getArchiveReader(arcFileOrUrl);
    }
    
    public static WARCReader get(final File f) throws IOException {
    	return (WARCReader)WARCReaderFactory.factory.getArchiveReader(f);
    }
    
    /**
     * @param f An arcfile to read.
     * @param offset Have returned Reader set to start reading at this offset.
     * @return A WARCReader.
     * @throws IOException 
     */
    public static WARCReader get(final File f, final long offset)
    throws IOException {
    	return (WARCReader)WARCReaderFactory.factory.
    		getArchiveReader(f, offset);
    }
    
    protected ArchiveReader getArchiveReader(final File f, final long offset)
    throws IOException {
		boolean compressed = testCompressedWARCFile(f);
		if (!compressed) {
			if (!FileUtils.isReadableWithExtensionAndMagic(f,
					DOT_WARC_FILE_EXTENSION, WARC_MAGIC)) {
				throw new IOException(f.getAbsolutePath()
						+ " is not a WARC file.");
			}
		}
		return (WARCReader)(compressed?
			WARCReaderFactory.factory.new CompressedWARCReader(f, offset):
			WARCReaderFactory.factory.new UncompressedWARCReader(f, offset));
	}
    
    public static ArchiveReader get(final String s, final InputStream is,
            final boolean atFirstRecord)
    throws IOException {
        return WARCReaderFactory.factory.getArchiveReader(s, is,
            atFirstRecord);
    }
    
	/*
	 * Note that the ARC companion does this differently, with quite a lot of duplication.
	 * 
	 * @see org.archive.io.arc.ARCReaderFactory.getArchiveReader(String, InputStream, boolean)
	 */
    protected ArchiveReader getArchiveReader(final String f,
			final InputStream is, final boolean atFirstRecord)
			throws IOException {
    	// Check if it's compressed, based on file extension.
    	if( f.endsWith(".gz") ) {
    		return new CompressedWARCReader(f, is, atFirstRecord);
    	} else {
    		return new UncompressedWARCReader(f, is);
    	}
	}
    
    public static WARCReader get(final URL arcUrl, final long offset)
    throws IOException {
        return (WARCReader)WARCReaderFactory.factory.getArchiveReader(arcUrl,
            offset);
    }
    
    /**
     * Get an ARCReader.
     * Pulls the ARC local into whereever the System Property
     * java.io.tmpdir points. It then hands back an ARCReader that
     * points at this local copy.  A close on this ARCReader instance will
     * remove the local copy.
     * @param arcUrl An URL that points at an ARC.
     * @return An ARCReader.
     * @throws IOException 
     */
    public static WARCReader get(final URL arcUrl)
    throws IOException {
        return (WARCReader)WARCReaderFactory.factory.getArchiveReader(arcUrl);
    }
    
    /**
     * Check file is compressed WARC.
     *
     * @param f File to test.
     *
     * @return True if this is compressed WARC (TODO: Just tests if file is
     * GZIP'd file (It begins w/ GZIP MAGIC)).
     *
     * @exception IOException If file does not exist or is not unreadable.
     */
    public static boolean testCompressedWARCFile(final File f)
    throws IOException {
        FileUtils.assertReadable(f);
        boolean compressed = false;
        final InputStream is = new FileInputStream(f);
        try {
            compressed = ArchiveUtils.isGzipped(is);
        } finally {
            is.close();
        }
        return compressed;
    }

    /**
     * Uncompressed WARC file reader.
     * @author stack
     */
    public class UncompressedWARCReader extends WARCReader {
        /**
         * Constructor.
         * @param f Uncompressed arcfile to read.
         * @throws IOException
         */
        public UncompressedWARCReader(final File f)
        throws IOException {
            this(f, 0);
        }

        /**
         * Constructor.
         * 
         * @param f Uncompressed file to read.
         * @param offset Offset at which to position Reader.
         * @throws IOException
         */
        public UncompressedWARCReader(final File f, final long offset)
        throws IOException {
            // File has been tested for existence by time it has come to here.
            setIn(new CountingInputStream(getInputStream(f, offset)));
            getIn().skip(offset);
            initialize(f.getAbsolutePath());
        }
        
        /**
         * Constructor.
         * 
         * @param f Uncompressed file to read.
         * @param is InputStream.
         */
        public UncompressedWARCReader(final String f, final InputStream is) {
            // Arc file has been tested for existence by time it has come
            // to here.
            setIn(new CountingInputStream(is));
            initialize(f);
        }
    }
    
    /**
     * Compressed WARC file reader.
     * 
     * @author stack
     */
    public class CompressedWARCReader extends WARCReader {
        /**
         * Constructor.
         * 
         * @param f Compressed file to read.
         * @throws IOException
         */
        public CompressedWARCReader(final File f) throws IOException {
            this(f, 0);
        }

        /**
         * Constructor.
         * 
         * @param f Compressed arcfile to read.
         * @param offset Position at where to start reading file.
         * @throws IOException
         */
        public CompressedWARCReader(final File f, final long offset)
                throws IOException {
            // File has been tested for existence by time it has come to here.
            setIn(new GZIPMembersInputStream(getInputStream(f, offset)));
            ((GZIPMembersInputStream)getIn()).compressedSeek(offset); 
            setCompressed((offset == 0)); // TODO: does this make sense?!?!
            initialize(f.getAbsolutePath());
        }
        
        /**
         * Constructor.
         * 
         * @param f Compressed arcfile.
         * @param is InputStream to use.
         * @param atFirstRecord
         * @throws IOException
         */
        public CompressedWARCReader(final String f, final InputStream is,
            final boolean atFirstRecord)
        throws IOException {
            // Arc file has been tested for existence by time it has come
            // to here.
            setIn(new GZIPMembersInputStream(is));
            setCompressed(true);
            initialize(f);
            // TODO: Ignore atFirstRecord. Probably doesn't apply in WARC world.
        }
        
        /**
         * Get record at passed offset.
         * 
         * @param offset Byte index into file at which a record starts.
         * @return A WARCRecord reference.
         * @throws IOException
         */
        public WARCRecord get(long offset) throws IOException {
            cleanupCurrentRecord();
            ((GZIPMembersInputStream)getIn()).compressedSeek(offset);
            return (WARCRecord) createArchiveRecord(getIn(), offset);
        }
        
        public Iterator iterator() {
            /**
             * Override ArchiveRecordIterator so can base returned iterator on
             * GzippedInputStream iterator.
             */
            return new ArchiveRecordIterator() {
                private GZIPMembersInputStream gis =
                    (GZIPMembersInputStream)getIn();

                private Iterator gzipIterator = this.gis.memberIterator();

                protected boolean innerHasNext() {
                    return this.gzipIterator.hasNext();
                }

                protected ArchiveRecord innerNext() throws IOException {
                    // Get the position before gzipIterator.next moves
                    // it on past the gzip header.
                    InputStream is = (InputStream) this.gzipIterator.next();
                    return createArchiveRecord(is, Math.max(gis.getCurrentMemberStart(), gis.getCurrentMemberEnd()));
                }
            };
        }
        
        protected void gotoEOR(ArchiveRecord rec) throws IOException {
            long skipped = 0; 
            while (getIn().read()>-1) {
                skipped++;
            }
            if(skipped>4) {
                System.err.println("unexpected extra data after record "+rec);
            }
            return;
        }
    }
    
    public static boolean isWARCSuffix(final String f) {
    	return (f == null)?
    		false:
    		(f.toLowerCase().endsWith(DOT_COMPRESSED_WARC_FILE_EXTENSION))?
    		    true:
    			(f.toLowerCase().endsWith(DOT_WARC_FILE_EXTENSION))?
    			true: false;
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy