org.archive.io.RecordingInputStream Maven / Gradle / Ivy
The newest version!
/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.io;
import java.io.IOException;
import java.io.InputStream;
import java.net.SocketException;
import java.net.SocketTimeoutException;
import java.security.MessageDigest;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.io.IOUtils;
/**
* Stream which records all data read from it, which it acquires from a wrapped
* input stream.
*
* Makes use of a RecordingOutputStream for recording because of its being
* file backed so we can write massive amounts of data w/o worrying about
* overflowing memory.
*
* @author gojomo
*
*/
public class RecordingInputStream
extends InputStream {
protected static Logger logger =
Logger.getLogger("org.archive.io.RecordingInputStream");
/**
* Where we are recording to.
*/
private RecordingOutputStream recordingOutputStream;
/**
* Stream to record.
*/
private InputStream in = null;
/**
* Reusable buffer to avoid reallocation on each readFullyUntil
*/
protected byte[] drainBuffer = new byte[16*1024];
/**
* Create a new RecordingInputStream.
*
* @param bufferSize Size of buffer to use.
* @param backingFilename Name of backing file.
*/
public RecordingInputStream(int bufferSize, String backingFilename)
{
this.recordingOutputStream = new RecordingOutputStream(bufferSize,
backingFilename);
}
public void open(InputStream wrappedStream) throws IOException {
if (logger.isLoggable(Level.FINE)) {
logger.fine("wrapping " + wrappedStream + " in thread "
+ Thread.currentThread().getName());
}
if(isOpen()) {
// error; should not be opening/wrapping in an unclosed
// stream remains open
throw new IOException("RIS already open for "
+Thread.currentThread().getName());
}
try {
this.in = wrappedStream;
this.recordingOutputStream.open();
} catch (IOException ioe) {
close(); // ...and rethrow...
throw ioe;
}
}
public int read() throws IOException {
if (!isOpen()) {
throw new IOException("Stream closed " +
Thread.currentThread().getName());
}
int b = this.in.read();
if (b != -1) {
assert this.recordingOutputStream != null: "ROS is null " +
Thread.currentThread().getName();
this.recordingOutputStream.write(b);
}
return b;
}
public int read(byte[] b, int off, int len) throws IOException {
if (!isOpen()) {
throw new IOException("Stream closed " +
Thread.currentThread().getName());
}
int count = this.in.read(b,off,len);
if (count > 0) {
assert this.recordingOutputStream != null: "ROS is null " +
Thread.currentThread().getName();
this.recordingOutputStream.write(b,off,count);
}
return count;
}
public int read(byte[] b) throws IOException {
if (!isOpen()) {
throw new IOException("Stream closed " +
Thread.currentThread().getName());
}
int count = this.in.read(b);
if (count > 0) {
assert this.recordingOutputStream != null: "ROS is null " +
Thread.currentThread().getName();
this.recordingOutputStream.write(b,0,count);
}
return count;
}
public void close() throws IOException {
if (logger.isLoggable(Level.FINE)) {
logger.fine("closing " + this.in + " in thread "
+ Thread.currentThread().getName());
}
IOUtils.closeQuietly(this.in);
this.in = null;
IOUtils.closeQuietly(this.recordingOutputStream);
}
public ReplayInputStream getReplayInputStream() throws IOException {
return this.recordingOutputStream.getReplayInputStream();
}
public ReplayInputStream getMessageBodyReplayInputStream() throws IOException {
return this.recordingOutputStream.getMessageBodyReplayInputStream();
}
public long readFully() throws IOException {
while(read(drainBuffer) != -1) {
// Empty out stream.
continue;
}
return this.recordingOutputStream.getSize();
}
public void readToEndOfContent(long contentLength)
throws IOException, InterruptedException {
// Check we're open before proceeding.
if (!isOpen()) {
// TODO: should this be a noisier exception-raising error?
return;
}
long totalBytes = recordingOutputStream.position - recordingOutputStream.getMessageBodyBegin();
long bytesRead = -1L;
long maxToRead = -1;
while (contentLength <= 0 || totalBytes < contentLength) {
try {
// read no more than soft max
maxToRead = (contentLength <= 0)
? drainBuffer.length
: Math.min(drainBuffer.length, contentLength - totalBytes);
// nor more than hard max
maxToRead = Math.min(maxToRead, recordingOutputStream.getRemainingLength());
// but always at least 1 (to trigger hard max exception) XXX wtf is this?
maxToRead = Math.max(maxToRead, 1);
bytesRead = read(drainBuffer,0,(int)maxToRead);
if (bytesRead == -1) {
break;
}
totalBytes += bytesRead;
if (Thread.interrupted()) {
throw new InterruptedException("Interrupted during IO");
}
} catch (SocketTimeoutException e) {
// A socket timeout is just a transient problem, meaning
// nothing was available in the configured timeout period,
// but something else might become available later.
// Take this opportunity to check the overall
// timeout (below). One reason for this timeout is
// servers that keep up the connection, 'keep-alive', even
// though we asked them to not keep the connection open.
if (logger.isLoggable(Level.FINE)) {
logger.log(Level.FINE, "socket timeout", e);
}
// check for interrupt
if (Thread.interrupted()) {
throw new InterruptedException("Interrupted during IO");
}
// check for overall timeout
recordingOutputStream.checkLimits();
} catch (SocketException se) {
throw se;
} catch (NullPointerException e) {
// [ 896757 ] NPEs in Andy's Th-Fri Crawl.
// A crawl was showing NPE's in this part of the code but can
// not reproduce. Adding this rethrowing catch block w/
// diagnostics to help should we come across the problem in the
// future.
throw new NullPointerException("Stream " + this.in + ", " +
e.getMessage() + " " + Thread.currentThread().getName());
}
}
}
/**
* Read all of a stream (Or read until we timeout or have read to the max).
* @param softMaxLength Maximum length to read; if zero or < 0, then no
* limit. If met, return normally.
* @throws IOException failed read.
* @throws RecorderLengthExceededException
* @throws RecorderTimeoutException
* @throws InterruptedException
* @deprecated
*/
public void readFullyOrUntil(long softMaxLength)
throws IOException, RecorderLengthExceededException,
RecorderTimeoutException, InterruptedException {
// Check we're open before proceeding.
if (!isOpen()) {
// TODO: should this be a noisier exception-raising error?
return;
}
long totalBytes = 0L;
long bytesRead = -1L;
long maxToRead = -1;
while (true) {
try {
// read no more than soft max
maxToRead = (softMaxLength <= 0)
? drainBuffer.length
: Math.min(drainBuffer.length, softMaxLength - totalBytes);
// nor more than hard max
maxToRead = Math.min(maxToRead, recordingOutputStream.getRemainingLength());
// but always at least 1 (to trigger hard max exception
maxToRead = Math.max(maxToRead, 1);
bytesRead = read(drainBuffer,0,(int)maxToRead);
if (bytesRead == -1) {
break;
}
totalBytes += bytesRead;
if (Thread.interrupted()) {
throw new InterruptedException("Interrupted during IO");
}
} catch (SocketTimeoutException e) {
// A socket timeout is just a transient problem, meaning
// nothing was available in the configured timeout period,
// but something else might become available later.
// Take this opportunity to check the overall
// timeout (below). One reason for this timeout is
// servers that keep up the connection, 'keep-alive', even
// though we asked them to not keep the connection open.
if (logger.isLoggable(Level.FINE)) {
logger.log(Level.FINE, "socket timeout", e);
}
// check for interrupt
if (Thread.interrupted()) {
throw new InterruptedException("Interrupted during IO");
}
// check for overall timeout
recordingOutputStream.checkLimits();
} catch (SocketException se) {
throw se;
} catch (NullPointerException e) {
// [ 896757 ] NPEs in Andy's Th-Fri Crawl.
// A crawl was showing NPE's in this part of the code but can
// not reproduce. Adding this rethrowing catch block w/
// diagnostics to help should we come across the problem in the
// future.
throw new NullPointerException("Stream " + this.in + ", " +
e.getMessage() + " " + Thread.currentThread().getName());
}
// if have read 'enough', just finish
if (softMaxLength > 0 && totalBytes >= softMaxLength) {
break; // return
}
}
}
public long getSize() {
return this.recordingOutputStream.getSize();
}
public void markContentBegin() {
this.recordingOutputStream.markMessageBodyBegin();
}
public long getContentBegin() {
return this.recordingOutputStream.getMessageBodyBegin();
}
public void startDigest() {
this.recordingOutputStream.startDigest();
}
/**
* Convenience method for setting SHA1 digest.
*/
public void setSha1Digest() {
this.recordingOutputStream.setSha1Digest();
}
/**
* Sets a digest algorithm which may be applied to recorded data.
* As usually only a subset of the recorded data should
* be fed to the digest, you must also call startDigest()
* to begin digesting.
*
* @param algorithm
*/
public void setDigest(String algorithm) {
this.recordingOutputStream.setDigest(algorithm);
}
/**
* Sets a digest function which may be applied to recorded data.
* As usually only a subset of the recorded data should
* be fed to the digest, you must also call startDigest()
* to begin digesting.
*
* @param md
*/
public void setDigest(MessageDigest md) {
this.recordingOutputStream.setDigest(md);
}
/**
* Return the digest value for any recorded, digested data. Call
* only after all data has been recorded; otherwise, the running
* digest state is ruined.
*
* @return the digest final value
*/
public byte[] getDigestValue() {
return this.recordingOutputStream.getDigestValue();
}
public long getResponseContentLength() {
return this.recordingOutputStream.getResponseContentLength();
}
public void closeRecorder() throws IOException {
this.recordingOutputStream.closeRecorder();
}
/**
* @return True if we've been opened.
*/
public boolean isOpen()
{
return this.in != null;
}
@Override
public synchronized void mark(int readlimit) {
this.in.mark(readlimit);
this.recordingOutputStream.mark();
}
@Override
public boolean markSupported() {
return this.in.markSupported();
}
@Override
public synchronized void reset() throws IOException {
this.in.reset();
this.recordingOutputStream.reset();
}
/**
* Set limits to be enforced by internal recording-out
*/
public void setLimits(long hardMax, long timeoutMs, long maxRateKBps) {
recordingOutputStream.setLimits(hardMax, timeoutMs, maxRateKBps);
}
/**
* Expose the amount of in-memory buffering used by the internal
* recording stream.
* @return int buffer size
*/
public int getRecordedBufferLength() {
return recordingOutputStream.getBufferLength();
}
/**
* See doc on {@link RecordingOutputStream#chopAtMessageBodyBegin()}
*/
public void chopAtMessageBodyBegin() {
recordingOutputStream.chopAtMessageBodyBegin();
}
public void clearForReuse() throws IOException {
recordingOutputStream.clearForReuse();
}
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy