// org.archive.io.RecordingOutputStream (Maven / Gradle / Ivy)
/*
* This file is part of the Heritrix web crawler (crawler.archive.org).
*
* Licensed to the Internet Archive (IA) by one or more individual
* contributors.
*
* The IA licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.archive.io;
import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.logging.Level;
import java.util.logging.Logger;
/**
* An output stream that records all writes to wrapped output
* stream.
*
* A RecordingOutputStream can be wrapped around any other
* OutputStream to record all bytes written to it. You can
* then request a ReplayInputStream to read those bytes.
*
* The RecordingOutputStream uses an in-memory buffer and
* backing disk file to allow it to record streams of
* arbitrary length limited only by available disk space.
*
* <p>As long as the stream recorded is smaller than the
* in-memory buffer, no disk access will occur.
*
* <p>Recorded content can be recovered as a ReplayInputStream
* (via getReplayInputStream() or, for only the content after
* the message-body-begin mark is set, getMessageBodyReplayInputStream() )
* or as a ReplayCharSequence (via getReplayCharSequence()).
*
* <p>This class is also used as a straight output stream
* by {@link RecordingInputStream} to which it records all reads.
* {@link RecordingInputStream} is exploiting the file backed buffer
* facility of this class passing {@code null} for the stream
* to wrap. TODO: Make a FileBackedOutputStream class that is
* subclassed by RecordingInputStream.
*
* @author gojomo
*
*/
public class RecordingOutputStream extends OutputStream {
protected static Logger logger =
Logger.getLogger(RecordingOutputStream.class.getName());
/**
* Size of recording.
*
* Later passed to ReplayInputStream on creation. It uses it to know when
* EOS.
*/
protected long size = 0;
protected String backingFilename;
protected OutputStream diskStream = null;
/**
* Buffer we write recordings to.
*
* We write all recordings here first till its full. Thereafter we
* write the backing file.
*/
private byte[] buffer;
/** current virtual position in the recording */
long position;
/** flag to disable recording */
private boolean recording;
/**
* True if we're to digest content.
*/
private boolean shouldDigest = false;
/**
* Digest instance.
*/
private MessageDigest digest = null;
/**
* Define for SHA1 algarithm.
*/
private static final String SHA1 = "SHA1";
/**
* Maximum amount of header material to accept without the content
* body beginning -- if more, throw a RecorderTooMuchHeaderException.
* TODO: make configurable? make smaller?
*/
protected static final long MAX_HEADER_MATERIAL = 1024*1024; // 1MB
// configurable max length, max time limits
/** maximum length of material to record before throwing exception */
protected long maxLength = Long.MAX_VALUE;
/** maximum time to record before throwing exception */
protected long timeoutMs = Long.MAX_VALUE;
/** maximum rate to record (adds delays to hit target rate) */
protected long maxRateBytesPerMs = Long.MAX_VALUE;
/** time recording begins for timeout, rate calculations */
protected long startTime = Long.MAX_VALUE;
/**
* When recording HTTP, where the content-body starts.
*/
protected long messageBodyBeginMark;
/**
* While messageBodyBeginMark is not set, the last two bytes seen.
*
*
* This class does automatic detection of http message body begin (i.e. end
* of http headers). Unfortunately httpcomponents did not want to add
* functionality to help us with this, see
* https://issues.apache.org/jira/browse/HTTPCORE-325
*
*
* It works like this: while messageBodyBeginMark is not set, we remember
* the last two bytes seen, and look at each byte we write. If the
* lastTwoBytes+currentByte is "\n\r\n", or lastTwoBytes[1]+currentByte is
* "\n\n" then we call markMessageBodyBegin() at the position after
* currentByte.
*
*
* An assumption here is that protocols other than http don't have headers,
* and for those protocols the user of this class will call
* markMessageBodyBegin() at position 0 before writing anything.
*/
protected int[] lastTwoBytes = new int[] {-1, -1};
/**
* Stream to record.
*/
private OutputStream out = null;
// mark/reset support
/** furthest position reached before any reset()s */
private long maxPosition = 0;
/** remembered position to reset() to */
private long markPosition = 0;
/**
* Create a new RecordingOutputStream.
*
* @param bufferSize Buffer size to use.
* @param backingFilename Name of backing file to use.
*/
public RecordingOutputStream(int bufferSize, String backingFilename) {
this.buffer = new byte[bufferSize];
this.backingFilename = backingFilename;
recording = true;
}
/**
* Wrap the given stream, both recording and passing along any data written
* to this RecordingOutputStream.
*
* @throws IOException If failed creation of backing file.
*/
public void open() throws IOException {
this.open(null);
}
/**
* Wrap the given stream, both recording and passing along any data written
* to this RecordingOutputStream.
*
* @param wrappedStream Stream to wrap. May be null for case where we
* want to write to a file backed stream only.
*
* @throws IOException If failed creation of backing file.
*/
public void open(OutputStream wrappedStream) throws IOException {
if(isOpen()) {
// error; should not be opening/wrapping in an unclosed
// stream remains open
throw new IOException("ROS already open for "
+Thread.currentThread().getName());
}
clearForReuse();
this.out = wrappedStream;
startTime = System.currentTimeMillis();
}
protected OutputStream ensureDiskStream() throws FileNotFoundException {
if (this.diskStream == null) {
FileOutputStream fis = new FileOutputStream(this.backingFilename);
this.diskStream = new FastBufferedOutputStream(fis);
}
return this.diskStream;
}
public void write(int b) throws IOException {
if(position 0) {
write(b[off]);
off++;
len--;
}
if(recording) {
record(b, off, len);
}
if (this.out != null) {
this.out.write(b, off, len);
}
checkLimits();
}
/**
* Check any enforced limits.
*/
protected void checkLimits() throws RecorderIOException {
// too much material before finding end of headers?
if (messageBodyBeginMark<0) {
// no mark yet
if(position>MAX_HEADER_MATERIAL) {
throw new RecorderTooMuchHeaderException();
}
}
// overlong?
if(position>maxLength) {
throw new RecorderLengthExceededException();
}
// taking too long?
long duration = System.currentTimeMillis() - startTime;
duration = Math.max(duration,1); // !divzero
if(duration>timeoutMs) {
throw new RecorderTimeoutException();
}
// need to throttle reading to hit max configured rate?
if(position/duration >= maxRateBytesPerMs) {
long desiredDuration = position / maxRateBytesPerMs;
try {
Thread.sleep(desiredDuration-duration);
} catch (InterruptedException e) {
logger.log(Level.WARNING,
"bandwidth throttling sleep interrupted", e);
}
}
}
/**
* Record the given byte for later recovery
*
* @param b Int to record.
*
* @exception IOException Failed write to backing file.
*/
private void record(int b) throws IOException {
if (this.shouldDigest) {
this.digest.update((byte)b);
}
if (this.position >= this.buffer.length) {
this.ensureDiskStream().write(b);
} else {
this.buffer[(int) this.position] = (byte) b;
}
this.position++;
}
/**
* Record the given byte-array range for recovery later
*
* @param b Buffer to record.
* @param off Offset into buffer at which to start recording.
* @param len Length of buffer to record.
*
* @exception IOException Failed write to backing file.
*/
private void record(byte[] b, int off, int len) throws IOException {
if(this.shouldDigest) {
assert this.digest != null: "Digest is null.";
this.digest.update(b, off, len);
}
tailRecord(b, off, len);
}
/**
* Record without digesting.
*
* @param b Buffer to record.
* @param off Offset into buffer at which to start recording.
* @param len Length of buffer to record.
*
* @exception IOException Failed write to backing file.
*/
private void tailRecord(byte[] b, int off, int len) throws IOException {
if(this.position >= this.buffer.length){
this.ensureDiskStream().write(b, off, len);
this.position += len;
} else {
assert this.buffer != null: "Buffer is null";
int toCopy = (int)Math.min(this.buffer.length - this.position, len);
assert b != null: "Passed buffer is null";
System.arraycopy(b, off, this.buffer, (int)this.position, toCopy);
this.position += toCopy;
// TODO verify these are +1 -1 right
if (toCopy < len) {
tailRecord(b, off + toCopy, len - toCopy);
}
}
}
public void close() throws IOException {
if(messageBodyBeginMark<0) {
// if unset, consider 0 posn as content-start
// (so that a -1 never survives to replay step)
messageBodyBeginMark = 0;
}
if (this.out != null) {
this.out.close();
this.out = null;
}
closeRecorder();
}
protected synchronized void closeDiskStream()
throws IOException {
if (this.diskStream != null) {
this.diskStream.close();
this.diskStream = null;
}
}
public void closeRecorder() throws IOException {
recording = false;
closeDiskStream(); // if any
// This setting of size is important. Its passed to ReplayInputStream
// on creation. It uses it to know EOS.
if (this.size == 0) {
this.size = this.position;
}
}
/* (non-Javadoc)
* @see java.io.OutputStream#flush()
*/
public void flush() throws IOException {
if (this.out != null) {
this.out.flush();
}
if (this.diskStream != null) {
this.diskStream.flush();
}
}
public ReplayInputStream getReplayInputStream() throws IOException {
return getReplayInputStream(0);
}
public ReplayInputStream getReplayInputStream(long skip) throws IOException {
// If this method is being called, then assumption must be that the
// stream is closed. If it ain't, then the stream gotten won't work
// -- the size will zero so any attempt at a read will get back EOF.
assert this.out == null: "Stream is still open.";
ReplayInputStream replay = new ReplayInputStream(this.buffer,
this.size, this.messageBodyBeginMark, this.backingFilename);
replay.skip(skip);
return replay;
}
/**
* Return a replay stream, cued up to begining of content
*
* @throws IOException
* @return An RIS.
*/
public ReplayInputStream getMessageBodyReplayInputStream() throws IOException {
return getReplayInputStream(this.messageBodyBeginMark);
}
public long getSize() {
return this.size;
}
/**
* Remember the current position as the start of the "message
* body". Useful when recording HTTP traffic as a way to start
* replays after the headers.
*/
public void markMessageBodyBegin() {
this.messageBodyBeginMark = this.position;
startDigest();
}
/**
* Return stored message-body-begin-mark (which is also end-of-headers)
*/
public long getMessageBodyBegin() {
return this.messageBodyBeginMark;
}
/**
* Starts digesting recorded data, if a MessageDigest has been
* set.
*/
public void startDigest() {
if (this.digest != null) {
this.digest.reset();
this.shouldDigest = true;
}
}
/**
* Convenience method for setting SHA1 digest.
* @see #setDigest(String)
*/
public void setSha1Digest() {
setDigest(SHA1);
}
/**
* Sets a digest function which may be applied to recorded data.
* The difference between calling this method and {@link #setDigest(MessageDigest)}
* is that this method tries to reuse MethodDigest instance if already allocated
* and of appropriate algorithm.
* @param algorithm Message digest algorithm to use.
* @see #setDigest(MessageDigest)
*/
public void setDigest(String algorithm) {
try {
// Reuse extant digest if its sha1 algorithm.
if (this.digest == null ||
!this.digest.getAlgorithm().equals(algorithm)) {
setDigest(MessageDigest.getInstance(algorithm));
}
} catch (NoSuchAlgorithmException e) {
e.printStackTrace();
}
}
/**
* Sets a digest function which may be applied to recorded data.
*
* As usually only a subset of the recorded data should
* be fed to the digest, you must also call startDigest()
* to begin digesting.
*
* @param md Message digest function to use.
*/
public void setDigest(MessageDigest md) {
this.digest = md;
}
/**
* Return the digest value for any recorded, digested data. Call
* only after all data has been recorded; otherwise, the running
* digest state is ruined.
*
* @return the digest final value
*/
public byte[] getDigestValue() {
if(this.digest == null) {
return null;
}
return this.digest.digest();
}
public long getResponseContentLength() {
return this.size - this.messageBodyBeginMark;
}
/**
* @return True if this ROS is open.
*/
public boolean isOpen() {
return this.out != null;
}
public int getBufferLength() {
return this.buffer.length;
}
/**
* When used alongside a mark-supporting RecordingInputStream, remember
* a position reachable by a future reset().
*/
public void mark() {
// remember this position for subsequent reset()
this.markPosition = position;
}
/**
* When used alongside a mark-supporting RecordingInputStream, reset
* the position to that saved by previous mark(). Until the position
* again reached "new" material, none of the bytes pushed to this
* stream will be digested or recorded.
*/
public void reset() {
// take note of furthest-position-reached to avoid double-recording
maxPosition = Math.max(maxPosition, position);
// reset to previous position
position = markPosition;
}
/**
* Set limits on length, time, and rate to enforce.
*
* @param length
* @param milliseconds
* @param rateKBps
*/
public void setLimits(long length, long milliseconds, long rateKBps) {
maxLength = (length>0) ? length : Long.MAX_VALUE;
timeoutMs = (milliseconds>0) ? milliseconds : Long.MAX_VALUE;
maxRateBytesPerMs = (rateKBps>0) ? rateKBps*1024/1000 : Long.MAX_VALUE;
}
/**
* Reset limits to effectively-unlimited defaults
*/
public void resetLimits() {
maxLength = Long.MAX_VALUE;
timeoutMs = Long.MAX_VALUE;
maxRateBytesPerMs = Long.MAX_VALUE;
}
/**
* Return number of bytes that could be recorded without hitting
* length limit
*
* @return long byte count
*/
public long getRemainingLength() {
return maxLength - position;
}
/**
* Forget about anything past the point where the content-body starts. This
* is needed to support FetchHTTP's shouldFetchBody setting. See also the
* docs on {@link #lastTwoBytes}
*/
public void chopAtMessageBodyBegin() {
if (messageBodyBeginMark >= 0) {
this.size = messageBodyBeginMark;
this.position = messageBodyBeginMark;
}
}
public void clearForReuse() throws IOException {
this.out = null;
this.position = 0;
this.markPosition = 0;
this.maxPosition = 0;
this.size = 0;
this.messageBodyBeginMark = -1;
// ensure recording turned on
this.recording = true;
// Always begins false; must use startDigest() to begin
this.shouldDigest = false;
if (this.diskStream != null) {
closeDiskStream();
}
}
}