All downloads are free. The search and download functionality uses the official Maven repository.

org.archive.io.ArchiveReaderFactory Maven / Gradle / Ivy

The newest version!
/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.io;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.Locale;

import org.archive.io.arc.ARCReaderFactory;
import org.archive.io.warc.WARCReaderFactory;
import org.archive.net.md5.Md5URLConnection;
import org.archive.net.rsync.RsyncURLConnection;
import org.archive.url.UsableURI;
import org.archive.util.FileUtils;


/**
 * Factory that returns an Archive file Reader.
 * Returns Readers for ARCs or WARCs, choosing between the two by looking at
 * the suffix of the file name (or of the identifying string/URL).
 *
 * <p>The static <code>get</code> methods delegate to a single shared factory
 * instance; the protected <code>getArchiveReader</code> methods hold the
 * actual logic.
 *
 * @author stack
 * @version $Date$ $Revision$
 */
public class ArchiveReaderFactory implements ArchiveFileConstants {
    /** System property listing the packages searched for URL protocol handlers. */
    private static final String PROTOCOL_HANDLER_PKGS =
        "java.protocol.handler.pkgs";

    /** Package prefix registered as an additional protocol handler location. */
    private static final String ARCHIVE_HANDLER_PKG = "org.archive.net";

    /**
     * Prefix for temporary local copies of remote archives.
     * {@link File#createTempFile(String, String)} rejects prefixes shorter
     * than three characters.
     */
    private static final String LOCAL_COPY_PREFIX = "archive-";

    // Static block to enable S3 URLs (and any other handlers that live under
    // the org.archive.net package).
    static {
        registerProtocolHandlerPackage();
    }

    private static final ArchiveReaderFactory factory =
        new ArchiveReaderFactory();

    /**
     * Shutdown any public access to default constructor.
     * Protected rather than private so subclasses can still be instantiated.
     */
    protected ArchiveReaderFactory() {
        super();
    }

    /**
     * Adds {@link #ARCHIVE_HANDLER_PKG} to the '|'-separated
     * <code>java.protocol.handler.pkgs</code> system property unless it is
     * already listed there.
     */
    private static void registerProtocolHandlerPackage() {
        final String existing = System.getProperty(PROTOCOL_HANDLER_PKGS);
        if (existing == null || existing.length() == 0) {
            System.setProperty(PROTOCOL_HANDLER_PKGS, ARCHIVE_HANDLER_PKG);
            return;
        }
        // Compare whole entries so that e.g. "org.archive.network" is not
        // mistaken for our package.
        final String[] pkgs = existing.split("\\|");
        for (int i = 0; i < pkgs.length; i++) {
            if (ARCHIVE_HANDLER_PKG.equals(pkgs[i].trim())) {
                return;
            }
        }
        System.setProperty(PROTOCOL_HANDLER_PKGS,
            existing + "|" + ARCHIVE_HANDLER_PKG);
    }

    /**
     * Get an Archive file Reader on passed path or url.
     * Does primitive heuristic figuring if path or URL.
     * @param arcFileOrUrl File path or URL pointing at an Archive file.
     * @return An Archive file Reader.
     * @throws MalformedURLException If the argument looks like a URL but
     * cannot be parsed as one.
     * @throws IOException If the archive cannot be opened or is neither an
     * ARC nor a WARC.
     */
    public static ArchiveReader get(final String arcFileOrUrl)
    throws MalformedURLException, IOException {
        return ArchiveReaderFactory.factory.getArchiveReader(arcFileOrUrl);
    }

    /**
     * Instance counterpart of {@link #get(String)}; reads from offset 0.
     * @param arcFileOrUrl File path or URL pointing at an Archive file.
     * @return An Archive file Reader.
     * @throws MalformedURLException
     * @throws IOException
     */
    protected ArchiveReader getArchiveReader(final String arcFileOrUrl)
    throws MalformedURLException, IOException {
        return getArchiveReader(arcFileOrUrl, 0);
    }

    /**
     * Decide whether the argument is a URL or a file path and open a reader
     * on it at the given offset.
     * @param arcFileOrUrl File path or URL pointing at an Archive file.
     * @param offset Offset at which the returned reader starts.
     * @return An Archive file Reader.
     * @throws MalformedURLException
     * @throws IOException
     */
    protected ArchiveReader getArchiveReader(final String arcFileOrUrl,
        final long offset)
    throws MalformedURLException, IOException {
        // Requiring the first ':' to sit past index 1 keeps strings whose
        // "scheme" is a single character (such as a Windows drive letter,
        // "C:\...") on the file path branch.
        final boolean looksLikeUrl = UsableURI.hasScheme(arcFileOrUrl)
            && arcFileOrUrl.indexOf(":") > 1;
        if (looksLikeUrl) {
            return get(new URL(arcFileOrUrl), offset);
        }
        return get(new File(arcFileOrUrl), offset);
    }

    /**
     * @param f An Archive file to read.
     * @return An ArchiveReader
     * @throws IOException
     */
    public static ArchiveReader get(final File f) throws IOException {
        return ArchiveReaderFactory.factory.getArchiveReader(f);
    }

    /**
     * Instance counterpart of {@link #get(File)}; reads from offset 0.
     * @param f An Archive file to read.
     * @return An ArchiveReader
     * @throws IOException
     */
    protected ArchiveReader getArchiveReader(final File f)
    throws IOException {
        return getArchiveReader(f, 0);
    }

    /**
     * @param f An Archive file to read.
     * @param offset Have returned Reader set to start reading at this offset.
     * @return An ArchiveReader
     * @throws IOException
     */
    public static ArchiveReader get(final File f, final long offset)
    throws IOException {
        return ArchiveReaderFactory.factory.getArchiveReader(f, offset);
    }

    /**
     * Open an ARC or WARC reader on a local file, chosen by file name suffix.
     * @param f An Archive file to read.
     * @param offset Have returned Reader set to start reading at this offset.
     * @return An ArchiveReader
     * @throws IOException If the file name has neither an ARC nor a WARC
     * suffix, or the underlying factory fails.
     */
    protected ArchiveReader getArchiveReader(final File f,
        final long offset)
    throws IOException {
        final String name = f.getName();
        if (ARCReaderFactory.isARCSuffix(name)) {
            return ARCReaderFactory.get(f, true, offset);
        }
        if (WARCReaderFactory.isWARCSuffix(name)) {
            return WARCReaderFactory.get(f, offset);
        }
        throw new IOException("Unknown file extension (Not ARC nor WARC): "
            + name);
    }

    /**
     * Wrap a Reader around passed Stream.
     * @param s Identifying String for this Stream used in error messages.
     * Must be a string that ends with the name of the file we're to put
     * an ArchiveReader on.  This code looks at file endings to figure
     * whether to return an ARC or WARC reader.
     * @param is Stream.  Stream will be wrapped with implementation of
     * RepositionableStream unless already supported.
     * @param atFirstRecord Are we at first Record?
     * @return ArchiveReader.
     * @throws IOException
     */
    public static ArchiveReader get(final String s, final InputStream is,
        final boolean atFirstRecord)
    throws IOException {
        return ArchiveReaderFactory.factory.getArchiveReader(s, is,
            atFirstRecord);
    }

    /**
     * Open an ARC or WARC reader on a stream, chosen by the suffix of the
     * identifying string.
     * @param id Identifying String for the stream; its ending decides
     * between ARC and WARC.
     * @param is Stream to read from.
     * @param atFirstRecord Are we at first Record?
     * @return ArchiveReader.
     * @throws IOException If <code>id</code> has neither an ARC nor a WARC
     * suffix, or the underlying factory fails.
     */
    protected ArchiveReader getArchiveReader(final String id,
        final InputStream is, final boolean atFirstRecord)
    throws IOException {
        if (ARCReaderFactory.isARCSuffix(id)) {
            return ARCReaderFactory.get(id, is, atFirstRecord);
        }
        if (WARCReaderFactory.isWARCSuffix(id)) {
            return WARCReaderFactory.get(id, is, atFirstRecord);
        }
        throw new IOException("Unknown extension (Not ARC nor WARC): " + id);
    }

    /**
     * Get an Archive Reader aligned at offset.
     * This version of get will not bring the file local but will try to
     * stream across the net making an HTTP 1.1 Range request on remote
     * http server (RFC 2616 Section 14.35).
     * @param u HTTP URL for an Archive file.
     * @param offset Offset into file at which to start fetching.
     * @return An ArchiveReader aligned at offset.
     * @throws IOException
     */
    public static ArchiveReader get(final URL u, final long offset)
    throws IOException {
        return ArchiveReaderFactory.factory.getArchiveReader(u, offset);
    }

    /**
     * Open a streaming reader on a URL, optionally starting at an offset.
     * If creating the reader fails, the stream opened on the connection is
     * closed before the exception propagates.
     * @param f URL for an Archive file.
     * @param offset Offset into file at which to start fetching; 0 means
     * read from the start (no Range header is sent).
     * @return An ArchiveReader aligned at offset.
     * @throws IOException
     */
    protected ArchiveReader getArchiveReader(final URL f, final long offset)
    throws IOException {
        // Get URL connection.
        final URLConnection connection = f.openConnection();
        if (connection instanceof HttpURLConnection) {
            addUserAgent((HttpURLConnection)connection);
        }
        if (offset != 0) {
            // Use an open-ended Range request (Assumes HTTP 1.1 on other
            // end): everything from offset to the end of the resource.
            connection.addRequestProperty("Range", "bytes=" + offset + "-");
            // TODO: should actually verify that server respected 'Range' request
            // (spec allows them to ignore; 206 response or Content-Range header
            // should be present if Range satisfied; multipart/byteranges could be
            // a problem).
        }

        final InputStream in = connection.getInputStream();
        try {
            return getArchiveReader(f.toString(), in, (offset == 0));
        } catch (IOException e) {
            // No reader took ownership of the stream; don't leak it.
            closeQuietly(in);
            throw e;
        } catch (RuntimeException e) {
            closeQuietly(in);
            throw e;
        }
    }

    /**
     * Get an ArchiveReader on a URL.
     * If the URL's path names an existing local file, that file is read
     * directly.  Otherwise, URLs whose protocol starts with "http" or equals
     * "s3" are streamed; any other URL is first made local (see
     * {@link #makeARCLocal(URLConnection)}) and the returned reader removes
     * the local copy on close.
     * @param u An URL that points at an Archive file.
     * @return An ArchiveReader.
     * @throws IOException
     */
    public static ArchiveReader get(final URL u)
    throws IOException {
        return ArchiveReaderFactory.factory.getArchiveReader(u);
    }

    /**
     * Instance counterpart of {@link #get(URL)}.
     * @param u An URL that points at an Archive file.
     * @return An ArchiveReader.
     * @throws IOException
     */
    protected ArchiveReader getArchiveReader(final URL u)
    throws IOException {
        // If url represents a local file then return file it points to.
        final String path = u.getPath();
        if (path != null) {
            // TODO: Add scheme check and host check.
            final File local = new File(path);
            if (local.exists()) {
                return get(local, 0);
            }
        }

        final String scheme = u.getProtocol();
        if (scheme.startsWith("http") || scheme.equals("s3")) {
            // Try streaming if http or s3 URLs rather than copying local
            // and then reading (Passing an offset will get us an Reader
            // that wraps a Stream).
            return get(u, 0);
        }

        return makeARCLocal(u.openConnection());
    }

    /**
     * Bring the resource behind the connection onto local disk and open a
     * reader on the local copy.  The returned reader deletes the local copy
     * when it is closed.
     * @param connection HTTP, rsync or md5 connection to an Archive file.
     * @return An ArchiveReader on the local copy.
     * @throws IOException If the download fails, no local file results, or
     * the local copy cannot be opened as an ARC or WARC.
     * @throws UnsupportedOperationException If the connection is of any
     * other type.
     */
    protected ArchiveReader makeARCLocal(final URLConnection connection)
    throws IOException {
        final File localFile;
        if (connection instanceof HttpURLConnection) {
            // If http url connection, bring down the resource local.
            localFile = downloadToTempFile((HttpURLConnection)connection);
        } else if (connection instanceof RsyncURLConnection) {
            // Then, connect and this will create a local file.
            // See implementation of the rsync handler.
            connection.connect();
            localFile = ((RsyncURLConnection)connection).getFile();
        } else if (connection instanceof Md5URLConnection) {
            // Then, connect and this will create a local file.
            // See implementation of the md5 handler.
            connection.connect();
            localFile = ((Md5URLConnection)connection).getFile();
        } else {
            throw new UnsupportedOperationException("No support for " +
                connection);
        }

        if (localFile == null) {
            // Fail with a descriptive IOException rather than an NPE below.
            throw new IOException("No local file obtained for " + connection);
        }

        final ArchiveReader reader;
        try {
            reader = get(localFile, 0);
        } catch (IOException e) {
            localFile.delete();
            throw e;
        }

        // Return a delegate that does cleanup of downloaded file on close.
        return reader.getDeleteFileOnCloseReader(localFile);
    }

    /**
     * Download the body of an HTTP connection into a new temporary file.
     * The temporary file's name ends with the last segment of the URL path
     * so that the ARC/WARC suffix checks still work on the local copy.  The
     * temporary file is deleted again if connecting or downloading fails.
     * @param connection Connection to download from.
     * @return The temporary file holding the downloaded content.
     * @throws IOException If connecting or downloading fails.
     */
    private File downloadToTempFile(final HttpURLConnection connection)
    throws IOException {
        final String path = connection.getURL().getPath();
        final int lastSlash = path.lastIndexOf('/');
        final File localFile;
        if (lastSlash >= 0) {
            // createTempFile always yields a fresh, uniquely named file, so
            // there is no stale copy to clean up.  The prefix must be at
            // least three characters long (an empty prefix is rejected with
            // an IllegalArgumentException).
            localFile = File.createTempFile(LOCAL_COPY_PREFIX,
                path.substring(lastSlash + 1));
        } else {
            localFile = File.createTempFile(ArchiveReader.class.getName(),
                ".tmp");
        }
        try {
            addUserAgent(connection);
            connection.connect();
            FileUtils.readFullyToFile(connection.getInputStream(), localFile);
        } catch (IOException ioe) {
            localFile.delete();
            throw ioe;
        }
        return localFile;
    }

    /**
     * Close a stream, ignoring any failure to do so.  Used only on error
     * paths where another exception is already being propagated.
     * @param in Stream to close; may be null.
     */
    private static void closeQuietly(final InputStream in) {
        if (in == null) {
            return;
        }
        try {
            in.close();
        } catch (IOException ignored) {
            // The original failure is the one worth reporting.
        }
    }

    /**
     * Set the User-Agent request header to this factory's class name.
     * @param connection Connection to add the header to.
     */
    protected void addUserAgent(final HttpURLConnection connection) {
        connection.addRequestProperty("User-Agent", this.getClass().getName());
    }

    /**
     * @param f File to test.
     * @return True if f is compressed, judged solely by whether its
     * lower-cased name ends with <code>DOT_COMPRESSED_FILE_EXTENSION</code>.
     * @throws IOException
     */
    protected boolean isCompressed(final File f) throws IOException {
        // Locale.ROOT keeps the case folding independent of the JVM's
        // default locale.
        return f.getName().toLowerCase(Locale.ROOT).
            endsWith(DOT_COMPRESSED_FILE_EXTENSION);
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy