All downloads are free. The search and download functionality uses the official Maven repository.

org.apache.tika.utils.RereadableInputStream Maven / Gradle / Ivy

There is a newer version: 2024.11.18751.20241128T090041Z-241100
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.utils;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.Files;


/**
 * Wraps an input stream, reading it only once, but making it available
 * for rereading an arbitrary number of times.  The stream's bytes are
 * stored in memory up to a user specified maximum, and then stored in a
 * temporary file which is deleted when this class' close() method is called.
 * <p>
 * Typical use: read some or all of the stream, call {@link #rewind()},
 * read again, and finally call {@link #close()}.  This class performs no
 * synchronization of its own.
 */
public class RereadableInputStream extends InputStream {


    /**
     * Input stream originally passed to the constructor.
     */
    private final InputStream originalInputStream;

    /**
     * The inputStream currently being used by this object to read contents;
     * may be the original stream passed in, or a stream that reads
     * the saved copy.  Null once the current stream has been closed.
     */
    private InputStream inputStream;

    /**
     * Maximum number of bytes that can be stored in memory before
     * storage will be moved to a temporary file.
     */
    private final int maxBytesInMemory;

    /**
     * True when the original stream is being read; set to false when
     * reading is set to use the stored data instead.
     */
    private boolean firstPass = true;

    /**
     * Whether or not the stream's contents are being stored in a file
     * as opposed to memory.
     */
    private boolean bufferIsInFile;

    /**
     * The buffer used to store the stream's content; this storage is moved
     * to a file when the stored data's size exceeds maxBytesInMemory.
     */
    private byte[] byteBuffer;

    /**
     * The total number of bytes read from the original stream at the time.
     * The end-of-stream marker is not counted.
     */
    private int size;

    /**
     * File used to store the stream's contents; is null until the stored
     * content's size exceeds maxBytesInMemory.
     */
    private File storeFile;

    /**
     * OutputStream used to save the content of the input stream in a
     * temporary file.
     */
    private OutputStream storeOutputStream;


    /**
     * Specifies whether or not to read to the end of stream on first
     * rewind.  If this is set to false, then the first time when rewind()
     * is called, only those bytes already read from the original stream
     * will be available from then on.
     */
    private final boolean readToEndOfStreamOnFirstRewind;


    /**
     * Specifies whether or not to close the original input stream
     * when this instance stops reading from it (on the first rewind()
     * or on close(), whichever comes first).
     */
    private final boolean closeOriginalStreamOnClose;


    // TODO: At some point it would be better to replace the current approach
    // (specifying the above) with more automated behavior.  The stream could
    // keep the original stream open until EOF was reached.  For example, if:
    //
    // the original stream is 10 bytes, and
    // only 2 bytes are read on the first pass
    // rewind() is called
    // 5 bytes are read
    //
    // In this case, this instance gets the first 2 from its store,
    // and the next 3 from the original stream, saving those additional 3
    // bytes in the store.  In this way, only the maximum number of bytes
    // ever needed must be saved in the store; unused bytes are never read.
    // The original stream is closed when EOF is reached, or when close()
    // is called, whichever comes first.  Using this approach eliminates
    // the need to specify the flag (though makes implementation more complex).



    /**
     * Creates a rereadable input stream.
     *
     * @param inputStream stream containing the source of data
     * @param maxBytesInMemory maximum number of bytes to use to store
     *     the stream's contents in memory before switching to disk; note that
     *     the instance will preallocate a byte array whose size is
     *     maxBytesInMemory.  This byte array will be made available for
     *     garbage collection (i.e. its reference set to null) when the
     *     content size exceeds the array's size, or when there are no more
     *     references to the instance.
     * @param readToEndOfStreamOnFirstRewind Specifies whether or not to
     *     read to the end of stream on first rewind.  If this is set to false,
     *     then when rewind() is first called, only those bytes already read
     *     from the original stream will be available from then on.
     * @param closeOriginalStreamOnClose Specifies whether or not the
     *     original stream is closed by this instance.  When true, the
     *     original stream is closed on the first rewind() or on close(),
     *     whichever comes first; when false, it is never closed by this
     *     instance.
     */
    public RereadableInputStream(InputStream inputStream, int maxBytesInMemory,
            boolean readToEndOfStreamOnFirstRewind,
            boolean closeOriginalStreamOnClose) {
        this.inputStream = inputStream;
        this.originalInputStream = inputStream;
        this.maxBytesInMemory = maxBytesInMemory;
        byteBuffer = new byte[maxBytesInMemory];
        this.readToEndOfStreamOnFirstRewind = readToEndOfStreamOnFirstRewind;
        this.closeOriginalStreamOnClose = closeOriginalStreamOnClose;
    }

    /**
     * Reads a byte from the stream, saving it in the store if it is being
     * read from the original stream.  Implements the abstract
     * InputStream.read().
     *
     * @return the read byte, or -1 on end of stream.
     * @throws IOException if the current stream has been closed, or if
     *     reading from it or writing to the store fails
     */
    @Override
    public int read() throws IOException {
        if (inputStream == null) {
            throw new IOException("Stream is already closed");
        }
        int inputByte = inputStream.read();
        // The end-of-stream marker is not data: storing it would append a
        // bogus 0xFF byte to the saved copy and inflate the size by one.
        if (firstPass && inputByte != -1) {
            saveByte(inputByte);
        }
        return inputByte;
    }

    /**
     * "Rewinds" the stream to the beginning for rereading.  After this
     * call, reads are served from the stored copy rather than from the
     * original stream.
     *
     * @throws IOException if draining the original stream, closing a
     *     stream, or opening the stored copy fails
     */
    public void rewind() throws IOException {

        if (firstPass && readToEndOfStreamOnFirstRewind) {
            // Force read to end of stream to fill store with any
            // remaining bytes from original stream.
            while (read() != -1) {
                // empty loop
            }
        }

        closeStream();
        closeStoreOutputStream();
        firstPass = false;

        // Choose the source from where the data actually lives, not from a
        // size comparison: a store holding exactly maxBytesInMemory bytes is
        // still in memory and has no file.  The in-memory stream is bounded
        // by size so unused tail bytes of the preallocated array are not
        // replayed.
        inputStream = bufferIsInFile
                ? new BufferedInputStream(new FileInputStream(storeFile))
                : new ByteArrayInputStream(byteBuffer, 0, size);
    }

    /**
     * Closes the input stream currently used for reading (may either be
     * the original stream or a memory or file stream after the first pass).
     * The original stream is left open, and remains the current stream,
     * when closeOriginalStreamOnClose is false.
     *
     * @throws IOException if closing the stream fails
     */
    // Does anyone need/want for this to be public?
    private void closeStream() throws IOException {
        if (inputStream != null
                &&
                (inputStream != originalInputStream
                        || closeOriginalStreamOnClose)) {
            inputStream.close();
            inputStream = null;
        }
    }

    /**
     * Closes the stream writing to the temporary file, if one is open.
     *
     * @throws IOException if flushing or closing the stream fails
     */
    private void closeStoreOutputStream() throws IOException {
        if (storeOutputStream != null) {
            storeOutputStream.close();
            storeOutputStream = null;
        }
    }

    /**
     * Closes the input stream and removes the temporary file if one was
     * created.  The temporary file is removed even when closing one of the
     * streams fails.
     *
     * @throws IOException if closing one of the underlying streams fails
     */
    @Override
    public void close() throws IOException {
        try {
            closeStream();
        } finally {
            try {
                closeStoreOutputStream();
                super.close();
            } finally {
                if (storeFile != null) {
                    storeFile.delete();
                }
            }
        }
    }

    /**
     * Returns the number of bytes read from the original stream.
     *
     * @return number of bytes read
     */
    public int getSize() {
        return size;
    }

    /**
     * Saves the byte read from the original stream to the store.  The
     * store moves from memory to a temporary file when the in-memory
     * buffer is already full.
     *
     * @param inputByte byte read from original stream
     * @throws IOException if creating or writing the temporary file fails
     */
    private void saveByte(int inputByte) throws IOException {

        if (!bufferIsInFile && size == maxBytesInMemory) {
            moveStoreToFile();
        }

        if (bufferIsInFile) {
            storeOutputStream.write(inputByte);
        } else {
            byteBuffer[size] = (byte) inputByte;
        }
        ++size;
    }

    /**
     * Creates the temporary file, copies the bytes stored in memory so far
     * into it, and releases the in-memory buffer.
     *
     * @throws IOException if the temporary file cannot be created or
     *     written
     */
    private void moveStoreToFile() throws IOException {
        // Record the file and its stream as soon as they exist so close()
        // can clean them up even if a later step here fails.
        storeFile = Files.createTempFile("TIKA_streamstore_", ".tmp").toFile();
        storeOutputStream = new BufferedOutputStream(
                new FileOutputStream(storeFile));
        storeOutputStream.write(byteBuffer, 0, size);
        bufferIsInFile = true;
        byteBuffer = null; // release for garbage collection
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy