All downloads are free. Search and download functionality uses the official Maven repository.

org.archive.io.RecordingInputStream Maven / Gradle / Ivy

There is a newer version: 1.1.9
Show newest version
/*
 *  This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 *  Licensed to the Internet Archive (IA) by one or more individual 
 *  contributors. 
 *
 *  The IA licenses this file to You under the Apache License, Version 2.0
 *  (the "License"); you may not use this file except in compliance with
 *  the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */
package org.archive.io;

import java.io.IOException;
import java.io.InputStream;
import java.net.SocketException;
import java.net.SocketTimeoutException;
import java.security.MessageDigest;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.commons.io.IOUtils;


/**
 * Stream which records all data read from it, which it acquires from a wrapped
 * input stream.
 *
 * Makes use of a RecordingOutputStream for recording because of its being
 * file backed so we can write massive amounts of data w/o worrying about
 * overflowing memory.
 *
 * <p>Lifecycle as visible in this class: construct once, then
 * {@link #open(InputStream)} to wrap a stream, read (every byte read is also
 * written to the recorder), and {@link #close()}. The stream counts as open
 * exactly while a wrapped stream is held (see {@link #isOpen()}).
 *
 * @author gojomo
 *
 */
public class RecordingInputStream
    extends InputStream {

    protected static Logger logger =
        Logger.getLogger("org.archive.io.RecordingInputStream");

    /**
     * Where we are recording to. Created in the constructor and never
     * replaced afterwards.
     */
    private final RecordingOutputStream recordingOutputStream;

    /**
     * Stream to record; {@code null} whenever this stream is not open.
     */
    private InputStream in = null;

    /**
     * Reusable buffer to avoid reallocation on each drain of the wrapped
     * stream ({@link #readFully()}, {@link #readToEndOfContent(long)},
     * {@link #readFullyOrUntil(long)}).
     */
    protected byte[] drainBuffer = new byte[16*1024];

    /**
     * Create a new RecordingInputStream.
     *
     * @param bufferSize Size of buffer to use.
     * @param backingFilename Name of backing file.
     */
    public RecordingInputStream(int bufferSize, String backingFilename)
    {
        this.recordingOutputStream = new RecordingOutputStream(bufferSize,
            backingFilename);
    }

    /**
     * Begin recording the given stream.
     *
     * <p>If opening the recorder fails, this stream (including the passed
     * wrapped stream) is closed again before the exception is rethrown.
     *
     * @param wrappedStream stream whose content is to be read and recorded
     * @throws IOException if this stream is already open, or if the
     * recorder could not be opened
     */
    public void open(InputStream wrappedStream) throws IOException {
        if (logger.isLoggable(Level.FINE)) {
            logger.fine("wrapping " + wrappedStream + " in thread "
                    + Thread.currentThread().getName());
        }
        if (isOpen()) {
            // error; a new stream must not be wrapped while a previously
            // wrapped stream is still open
            throw new IOException("RIS already open for "
                    + Thread.currentThread().getName());
        }
        try {
            this.in = wrappedStream;
            this.recordingOutputStream.open();
        } catch (IOException ioe) {
            close(); // ...and rethrow...
            throw ioe;
        }
    }

    /**
     * Return the wrapped stream, reading the field only once so the caller
     * cannot trip over it being cleared between a check and its use.
     *
     * @return the wrapped stream, never {@code null}
     * @throws IOException if this stream is not open
     */
    private InputStream requireOpen() throws IOException {
        InputStream wrapped = this.in;
        if (wrapped == null) {
            throw new IOException("Stream closed " +
                Thread.currentThread().getName());
        }
        return wrapped;
    }

    /**
     * Read one byte from the wrapped stream and record it.
     *
     * @return the byte read, or -1 at end of stream (nothing is recorded
     * in that case)
     * @throws IOException if this stream is not open, or the read or the
     * recording fails
     */
    @Override
    public int read() throws IOException {
        int b = requireOpen().read();
        if (b != -1) {
            this.recordingOutputStream.write(b);
        }
        return b;
    }

    /**
     * Read up to {@code len} bytes from the wrapped stream into {@code b}
     * and record the bytes actually read.
     *
     * @param b destination buffer
     * @param off offset in {@code b} at which to store the data
     * @param len maximum number of bytes to read
     * @return count of bytes read, as returned by the wrapped stream
     * @throws IOException if this stream is not open, or the read or the
     * recording fails
     */
    @Override
    public int read(byte[] b, int off, int len) throws IOException {
        int count = requireOpen().read(b, off, len);
        if (count > 0) {
            this.recordingOutputStream.write(b, off, count);
        }
        return count;
    }

    /**
     * Read bytes from the wrapped stream into {@code b} and record the
     * bytes actually read.
     *
     * @param b destination buffer
     * @return count of bytes read, as returned by the wrapped stream
     * @throws IOException if this stream is not open, or the read or the
     * recording fails
     */
    @Override
    public int read(byte[] b) throws IOException {
        int count = requireOpen().read(b);
        if (count > 0) {
            this.recordingOutputStream.write(b, 0, count);
        }
        return count;
    }

    /**
     * Quietly close the wrapped stream and the recorder, and mark this
     * stream as no longer open. Failures while closing are suppressed by
     * {@code IOUtils.closeQuietly}.
     */
    @Override
    public void close() throws IOException {
        if (logger.isLoggable(Level.FINE)) {
            logger.fine("closing " + this.in + " in thread "
                    + Thread.currentThread().getName());
        }
        IOUtils.closeQuietly(this.in);
        this.in = null;
        IOUtils.closeQuietly(this.recordingOutputStream);
    }

    /**
     * @return the recorder's replay stream
     * (see {@link RecordingOutputStream#getReplayInputStream()})
     * @throws IOException if the recorder fails to provide it
     */
    public ReplayInputStream getReplayInputStream() throws IOException {
        return this.recordingOutputStream.getReplayInputStream();
    }

    /**
     * @return the recorder's message-body replay stream
     * (see {@link RecordingOutputStream#getMessageBodyReplayInputStream()})
     * @throws IOException if the recorder fails to provide it
     */
    public ReplayInputStream getMessageBodyReplayInputStream() throws IOException {
        return this.recordingOutputStream.getMessageBodyReplayInputStream();
    }

    /**
     * Read (and thereby record) the wrapped stream until end of stream.
     *
     * @return the recorder's size after draining
     * @throws IOException if this stream is not open, or a read or the
     * recording fails
     */
    public long readFully() throws IOException {
        while (read(drainBuffer) != -1) {
            // Empty out stream.
            continue;
        }
        return this.recordingOutputStream.getSize();
    }

    /**
     * Read (and thereby record) until the message body has reached
     * {@code contentLength} bytes or the wrapped stream ends. The bytes
     * already recorded past the message-body start count towards the
     * target. Does nothing if this stream is not open.
     *
     * @param contentLength target body length; if zero or negative, read
     * until end of stream
     * @throws IOException failed read, or a recorder limit was hit
     * @throws InterruptedException if the thread was interrupted while
     * reading
     */
    public void readToEndOfContent(long contentLength)
            throws IOException, InterruptedException {
        // Check we're open before proceeding.
        if (!isOpen()) {
            // TODO: should this be a noisier exception-raising error?
            return;
        }

        long alreadyRead = recordingOutputStream.position
            - recordingOutputStream.getMessageBodyBegin();
        drain(contentLength, alreadyRead);
    }

    /**
     * Read all of a stream (Or read until we timeout or have read to the max).
     * Does nothing if this stream is not open.
     *
     * @param softMaxLength Maximum length to read; if zero or < 0, then no
     * limit. If met, return normally.
     * @throws IOException failed read.
     * @throws RecorderLengthExceededException
     * @throws RecorderTimeoutException
     * @throws InterruptedException
     * @deprecated no replacement is documented here;
     * {@link #readToEndOfContent(long)} performs a similar bounded drain.
     */
    @Deprecated
    public void readFullyOrUntil(long softMaxLength)
        throws IOException, RecorderLengthExceededException,
            RecorderTimeoutException, InterruptedException {
        // Check we're open before proceeding.
        if (!isOpen()) {
            // TODO: should this be a noisier exception-raising error?
            return;
        }

        drain(softMaxLength, 0L);
    }

    /**
     * Shared drain loop behind {@link #readToEndOfContent(long)} and
     * {@link #readFullyOrUntil(long)}: keeps reading into
     * {@link #drainBuffer} until end of stream or until the running total
     * reaches {@code softMax}.
     *
     * <p>A {@link SocketTimeoutException} is treated as transient: the
     * recorder's limits are checked and the loop retries. Any other
     * IOException (e.g. a {@link SocketException}) propagates unchanged.
     *
     * @param softMax total at which to stop; if zero or negative, no limit
     * @param alreadyRead bytes counted towards {@code softMax} before the
     * first read
     * @throws IOException failed read, or a recorder limit was hit
     * @throws InterruptedException if the thread's interrupt flag was found
     * set after a read or after a socket timeout
     */
    private void drain(long softMax, long alreadyRead)
            throws IOException, InterruptedException {
        long totalBytes = alreadyRead;
        while (softMax <= 0 || totalBytes < softMax) {
            try {
                // read no more than soft max
                long maxToRead = (softMax <= 0)
                    ? drainBuffer.length
                    : Math.min(drainBuffer.length, softMax - totalBytes);
                // nor more than hard max
                maxToRead = Math.min(maxToRead,
                    recordingOutputStream.getRemainingLength());
                // but always at least 1: per the original note, attempting
                // to read/record one more byte is what triggers the
                // recorder's hard max exception once nothing remains
                maxToRead = Math.max(maxToRead, 1);

                // maxToRead <= drainBuffer.length, so the cast cannot truncate
                int bytesRead = read(drainBuffer, 0, (int) maxToRead);
                if (bytesRead == -1) {
                    break;
                }
                totalBytes += bytesRead;

                if (Thread.interrupted()) {
                    throw new InterruptedException("Interrupted during IO");
                }
            } catch (SocketTimeoutException e) {
                // A socket timeout is just a transient problem, meaning
                // nothing was available in the configured timeout period,
                // but something else might become available later.
                // Take this opportunity to check the overall
                // timeout (below).  One reason for this timeout is
                // servers that keep up the connection, 'keep-alive', even
                // though we asked them to not keep the connection open.
                if (logger.isLoggable(Level.FINE)) {
                    logger.log(Level.FINE, "socket timeout", e);
                }
                // check for interrupt
                if (Thread.interrupted()) {
                    throw new InterruptedException("Interrupted during IO");
                }
                // check for overall timeout
                recordingOutputStream.checkLimits();
            } catch (NullPointerException e) {
                // [ 896757 ] NPEs in Andy's Th-Fri Crawl.
                // A crawl was showing NPE's in this part of the code but can
                // not reproduce.  Rethrow with diagnostics; the original
                // exception is attached as the cause so its stack trace,
                // which points at the real failure site, is not lost.
                NullPointerException diagnostic = new NullPointerException(
                    "Stream " + this.in + ", " +
                    e.getMessage() + " " + Thread.currentThread().getName());
                diagnostic.initCause(e);
                throw diagnostic;
            }
        }
    }

    /**
     * @return the recorder's size
     * (see {@link RecordingOutputStream#getSize()})
     */
    public long getSize() {
        return this.recordingOutputStream.getSize();
    }

    /**
     * Tell the recorder that the message body begins at its current point
     * (delegates to {@link RecordingOutputStream#markMessageBodyBegin()}).
     */
    public void markContentBegin() {
        this.recordingOutputStream.markMessageBodyBegin();
    }

    /**
     * @return the recorder's message-body start
     * (see {@link RecordingOutputStream#getMessageBodyBegin()})
     */
    public long getContentBegin() {
        return this.recordingOutputStream.getMessageBodyBegin();
    }

    /**
     * Begin digesting recorded data
     * (delegates to {@link RecordingOutputStream#startDigest()}).
     */
    public void startDigest() {
        this.recordingOutputStream.startDigest();
    }

    /**
     * Convenience method for setting SHA1 digest.
     */
    public void setSha1Digest() {
        this.recordingOutputStream.setSha1Digest();
    }

    /**
     * Sets a digest algorithm which may be applied to recorded data.
     * As usually only a subset of the recorded data should
     * be fed to the digest, you must also call startDigest()
     * to begin digesting.
     *
     * @param algorithm name of the digest algorithm, passed to the recorder
     */
    public void setDigest(String algorithm) {
        this.recordingOutputStream.setDigest(algorithm);
    }

    /**
     * Sets a digest function which may be applied to recorded data.
     * As usually only a subset of the recorded data should
     * be fed to the digest, you must also call startDigest()
     * to begin digesting.
     *
     * @param md digest instance, passed to the recorder
     */
    public void setDigest(MessageDigest md) {
        this.recordingOutputStream.setDigest(md);
    }

    /**
     * Return the digest value for any recorded, digested data. Call
     * only after all data has been recorded; otherwise, the running
     * digest state is ruined.
     *
     * @return the digest final value
     */
    public byte[] getDigestValue() {
        return this.recordingOutputStream.getDigestValue();
    }

    /**
     * @return the recorder's response content length
     * (see {@link RecordingOutputStream#getResponseContentLength()})
     */
    public long getResponseContentLength() {
        return this.recordingOutputStream.getResponseContentLength();
    }

    /**
     * Close the recorder only
     * (delegates to {@link RecordingOutputStream#closeRecorder()}); the
     * wrapped stream is not touched by this method.
     *
     * @throws IOException if the recorder fails to close
     */
    public void closeRecorder() throws IOException {
        this.recordingOutputStream.closeRecorder();
    }

    /**
     * @return True if we've been opened.
     */
    public boolean isOpen()
    {
        return this.in != null;
    }

    /**
     * Mark the current position in both the wrapped stream and the
     * recorder. Does nothing when this stream is not open, since
     * {@code mark} cannot report a checked error.
     *
     * @param readlimit passed to the wrapped stream's {@code mark}
     */
    @Override
    public synchronized void mark(int readlimit) {
        InputStream wrapped = this.in;
        if (wrapped == null) {
            return;
        }
        wrapped.mark(readlimit);
        this.recordingOutputStream.mark();
    }

    /**
     * @return whether the wrapped stream supports mark/reset; false when
     * this stream is not open
     */
    @Override
    public boolean markSupported() {
        InputStream wrapped = this.in;
        return wrapped != null && wrapped.markSupported();
    }

    /**
     * Reset both the wrapped stream and the recorder to the last mark.
     *
     * @throws IOException if this stream is not open, or the wrapped
     * stream or recorder fails to reset
     */
    @Override
    public synchronized void reset() throws IOException {
        requireOpen().reset();
        this.recordingOutputStream.reset();
    }

    /**
     *  Set limits to be enforced by internal recording-out
     *
     * @param hardMax passed through to the recorder
     * @param timeoutMs passed through to the recorder
     * @param maxRateKBps passed through to the recorder
     */
    public void setLimits(long hardMax, long timeoutMs, long maxRateKBps) {
        recordingOutputStream.setLimits(hardMax, timeoutMs, maxRateKBps);
    }

    /**
     * Expose the amount of in-memory buffering used by the internal
     * recording stream.
     * @return int buffer size
     */
    public int getRecordedBufferLength() {
        return recordingOutputStream.getBufferLength();
    }

    /**
     * See doc on {@link RecordingOutputStream#chopAtMessageBodyBegin()}
     */
    public void chopAtMessageBodyBegin() {
        recordingOutputStream.chopAtMessageBodyBegin();
    }

    /**
     * Prepare the recorder for another use
     * (delegates to {@link RecordingOutputStream#clearForReuse()}).
     *
     * @throws IOException if the recorder fails to clear
     */
    public void clearForReuse() throws IOException {
        recordingOutputStream.clearForReuse();
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy