it.unimi.dsi.fastutil.io.FastBufferedInputStream Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of fastutil Show documentation
fastutil extends the Java Collections Framework by providing type-specific maps, sets, lists and priority queues with a small memory footprint and fast access and insertion; provides also big (64-bit) arrays, sets and lists, and fast, practical I/O classes for binary and text files.
There is a newer version: 8.5.15
Show newest version
package it.unimi.dsi.fastutil.io;

/*
 * Copyright (C) 2005-2017 Sebastiano Vigna
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import it.unimi.dsi.fastutil.bytes.ByteArrays;
import java.io.IOException;
import java.io.InputStream;
import java.nio.channels.FileChannel;
import java.util.EnumSet;

/** Lightweight, unsynchronized, aligned input stream buffering class with
 *  {@linkplain #skip(long) true skipping},
 *  {@linkplain MeasurableStream measurability},
 *  {@linkplain RepositionableStream repositionability}
 *  and {@linkplain #readLine(byte[], int, int, EnumSet) line reading} support.
 *
 * This class provides buffering for input streams, but it does so with
 * purposes and an internal logic that are radically different from the ones
 * adopted in {@link java.io.BufferedInputStream}. The main features follow.
 *
 * 

 * There is no support for marking. All methods are unsychronized.
 *
 * 
As an additional feature, this class implements the {@link
 * RepositionableStream} and {@link MeasurableStream} interfaces.
 * An instance of this class will try to cast
 * the underlying byte stream to a {@link RepositionableStream} and to fetch by
 * reflection the {@link java.nio.channels.FileChannel} underlying the given
 * output stream, in this order. If either reference can be successfully
 * fetched, you can use {@link #position(long)} to reposition the stream.
 * Much in the same way, an instance of this class will try to cast the
 * the underlying byte stream to a {@link MeasurableStream}, and if this
 * operation is successful, or if a {@link java.nio.channels.FileChannel} can
 * be detected, then {@link #position()} and {@link #length()} will work as expected.
 *
 *
 * 
Due to erratic and unpredictable behaviour of {@link InputStream#skip(long)},
 * which does not correspond to its specification and which Sun refuses to fix
 * (see bug 6222822;
 * don't be fooled by the “closed, fixed” label),
 * this class peeks at the underlying stream and if it is {@link System#in} it uses
 * repeated reads instead of calling {@link InputStream#skip(long)} on the underlying stream; moreover,
 * skips and reads are tried alternately, so to guarantee that skipping
 * less bytes than requested can be caused only by reaching the end of file.
 *
 * 
This class keeps also track of the number of bytes read so far, so
 * to be able to implement {@link MeasurableStream#position()}
 * independently of underlying input stream.
 *
 * 
This class has limited support for
 * {@linkplain #readLine(byte[], int, int, EnumSet) “reading a line”}
 * (whatever that means) from the underlying input stream. You can choose the set of
 * {@linkplain FastBufferedInputStream.LineTerminator line terminators} that
 * delimit lines.
 *
 * 
 *
 * Warning: Since {@code fastutil} 6.0.0, this class detects
 * a implementations of {@link MeasurableStream} instead of subclasses {@code MeasurableInputStream} (which is deprecated).
 *
 * @since 4.4
 */

public class FastBufferedInputStream extends MeasurableInputStream implements RepositionableStream {

	/** The default size of the internal buffer in bytes (8Ki). */
	public static final int DEFAULT_BUFFER_SIZE = 8 * 1024;

	/** An enumeration of the supported line terminators. */
	public static enum LineTerminator {
		/** A carriage return (CR, ASCII 13). */
		CR,
		/** A line feed (LF, ASCII 10). */
		LF,
		/** A carriage return followed by a line feed (CR/LF, ASCII 13/10). */
		CR_LF
	}

	/** A set containing all available line terminators. */
	public static final EnumSet ALL_TERMINATORS = EnumSet.allOf(LineTerminator.class);

	/** The underlying input stream. */
	protected InputStream is;

	/** The internal buffer. */
	protected byte buffer[];

	/** The current position in the buffer. */
	protected int pos;

	/** The number of bytes ever read (reset upon a call to {@link #position(long)}).
	 * In particular, this will always represent the index (in the underlying input stream)
	 * of the first available byte in the buffer. */
	protected long readBytes;

	/** The number of buffer bytes available starting from {@link #pos}. */
	protected int avail;

	/** The cached file channel underlying {@link #is}, if any. */
	private FileChannel fileChannel;

	/** {@link #is} cast to a positionable stream, if possible. */
	private RepositionableStream repositionableStream;

	/** {@link #is} cast to a measurable stream, if possible. */
	private MeasurableStream measurableStream;

	private static int ensureBufferSize(final int bufferSize) {
		if (bufferSize <= 0) throw new IllegalArgumentException("Illegal buffer size: " + bufferSize);
		return bufferSize;
	}

	/** Creates a new fast buffered input stream by wrapping a given input stream with a given buffer.
	 *
	 * @param is an input stream to wrap.
	 * @param buffer a buffer of positive length.
	 */
	public FastBufferedInputStream(final InputStream is, final byte[] buffer) {
		this.is = is;
		ensureBufferSize(buffer.length);
		this.buffer = buffer;

		if (is instanceof RepositionableStream) repositionableStream = (RepositionableStream)is;
		if (is instanceof MeasurableStream) measurableStream = (MeasurableStream)is;

		if (repositionableStream == null) {

			try {
				fileChannel = (FileChannel)(is.getClass().getMethod("getChannel", new Class[] {})).invoke(is);
			}
			catch(IllegalAccessException e) {}
			catch(IllegalArgumentException e) {}
			catch(NoSuchMethodException e) {}
			catch(java.lang.reflect.InvocationTargetException e) {}
			catch(ClassCastException e) {}
		}
	}

	/** Creates a new fast buffered input stream by wrapping a given input stream with a given buffer size.
	 *
	 * @param is an input stream to wrap.
	 * @param bufferSize the size in bytes of the internal buffer (greater than zero).
	 */
	public FastBufferedInputStream(final InputStream is, final int bufferSize) {
		this(is, new byte[ensureBufferSize(bufferSize)]);
	}

	/** Creates a new fast buffered input stream by wrapping a given input stream with a buffer of {@link #DEFAULT_BUFFER_SIZE} bytes.
	 *
	 * @param is an input stream to wrap.
	 */
	public FastBufferedInputStream(final InputStream is) {
		this(is, DEFAULT_BUFFER_SIZE);
	}

	/** Checks whether no more bytes will be returned.
	 *
	 * 
This method will refill the internal buffer.
	 *
	 * @return true if there are no characters in the internal buffer and
	 * the underlying reader is exhausted.
	 */

	protected boolean noMoreCharacters() throws IOException {
		if (avail == 0) {
			avail = is.read(buffer);
			if (avail <= 0) {
				avail = 0;
				return true;
			}
			pos = 0;
		}
		return false;
	}

	@Override
	public int read() throws IOException {
		if (noMoreCharacters()) return -1;
		avail--;
		readBytes++;
		return buffer[pos++] & 0xFF;
	}

	@Override
	public int read(final byte b[], final int offset, final int length) throws IOException {
		if (length <= avail) {
			System.arraycopy(buffer, pos, b, offset, length);
			pos += length;
			avail -= length;
			readBytes += length;
			return length;
		}

		final int head = avail;

		System.arraycopy(buffer, pos, b, offset, head);
		pos = avail = 0;
		readBytes += head;

		if (length > buffer.length) {
			// We read directly into the destination
			final int result = is.read(b, offset + head, length - head);
			if (result > 0) readBytes += result;
			return result < 0 ? (head == 0 ? -1 : head) : result + head;
		}

		if (noMoreCharacters()) return head == 0 ? -1 : head;

		final int toRead = Math.min(length - head, avail);
		readBytes += toRead;
		System.arraycopy(buffer, 0, b, offset + head, toRead);
		pos = toRead;
		avail -= toRead;

		// Note that head >= 0, and necessarily toRead > 0
		return toRead + head;
	}

	/** Reads a line into the given byte array using {@linkplain #ALL_TERMINATORS all terminators}.
	 *
	 * @param array byte array where the next line will be stored.
	 * @return the number of bytes actually placed in {@code array}, or -1 at end of file.
	 * @see #readLine(byte[], int, int, EnumSet)
	 */

	public int readLine(final byte[] array) throws IOException {
		return readLine(array, 0, array.length, ALL_TERMINATORS);
	}

	/** Reads a line into the given byte array.
	 *
	 * @param array byte array where the next line will be stored.
	 * @param terminators a set containing the line termination sequences that we want
	 * to consider as valid.
	 * @return the number of bytes actually placed in {@code array}, or -1 at end of file.
	 * @see #readLine(byte[], int, int, EnumSet)
	 */

	public int readLine(final byte[] array, final EnumSet terminators) throws IOException {
		return readLine(array, 0, array.length, terminators);
	}

	/** Reads a line into the given byte-array fragment	using {@linkplain #ALL_TERMINATORS all terminators}.
	 *
	 * @param array byte array where the next line will be stored.
	 * @param off the first byte to use in {@code array}.
	 * @param len the maximum number of bytes to read.
	 * @return the number of bytes actually placed in {@code array}, or -1 at end of file.
	 * @see #readLine(byte[], int, int, EnumSet)
	 */
	public int readLine(final byte[] array, final int off, final int len) throws IOException {
		return readLine(array, off, len, ALL_TERMINATORS);
	}

	/** Reads a line into the given byte-array fragment.
	 *
	 * 
Reading lines (i.e., characters) out of a byte stream is not always sensible
	 * (methods available to that purpose in old versions of Java have been mercilessly deprecated).
	 * Nonetheless, in several situations, such as when decoding network protocols or headers
	 * known to be ASCII, it is very useful to be able to read a line from a byte stream.
	 *
	 * 
This method will attempt to read the next line into {@code array} starting at {@code off},
	 * reading at most {@code len} bytes. The read, however, will be stopped by the end of file or
	 * when meeting a {@linkplain LineTerminator line terminator}. Of course, for this operation
	 * to be sensible the encoding of the text contained in the stream, if any, must not generate spurious
	 * carriage returns or line feeds. Note that the termination detection uses a maximisation
	 * criterion, so if you specify both {@link LineTerminator#CR} and
	 * {@link LineTerminator#CR_LF} meeting a pair CR/LF will consider the whole pair a terminator.
	 *
	 * 
Terminators are not copied into array or included in the returned count. The
	 * returned integer can be used to check whether the line is complete: if it is smaller than
	 * {@code len}, then more bytes might be available, but note that this method (contrarily
	 * to {@link #read(byte[], int, int)}) can legitimately return zero when {@code len}
	 * is nonzero just because a terminator was found as the first character. Thus, the intended
	 * usage of this method is to call it on a given array, check whether {@code len} bytes
	 * have been read, and if so try again (possibly extending the array) until a number of read bytes
	 * strictly smaller than {@code len} (possibly, -1) is returned.
	 *
	 * 
If you need to guarantee that a full line is read, use the following idiom:
	 * 
	 * int start = off, len;
	 * while((len = fbis.readLine(array, start, array.length - start, terminators)) == array.length - start) {
	 *     start += len;
	 *     array = ByteArrays.grow(array, array.length + 1);
	 * }
	 * 
	 *
	 * At the end of the loop, the line will be placed in {@code array} starting at
	 * {@code off} (inclusive) and ending at {@code start + Math.max(len, 0)} (exclusive).
	 *
	 * @param array byte array where the next line will be stored.
	 * @param off the first byte to use in {@code array}.
	 * @param len the maximum number of bytes to read.
	 * @param terminators a set containing the line termination sequences that we want
	 * to consider as valid.
	 * @return the number of bytes actually placed in {@code array}, or -1 at end of file.
	 * Note that the returned number will be {@code len} if no line termination sequence
	 * specified in {@code terminators} has been met before scanning {@code len} byte,
	 * and if also we did not meet the end of file.
	 */

	public int readLine(final byte[] array, final int off, final int len, final EnumSet terminators) throws IOException {
		ByteArrays.ensureOffsetLength(array ,off, len);
		if (len == 0) return 0; // 0-length reads always return 0
		if (noMoreCharacters()) return -1;
		int i, k = 0, remaining = len, read = 0; // The number of bytes still to be read
		for(;;) {
			for(i = 0; i < avail && i < remaining && (k = buffer[pos + i]) != '\n' && k != '\r' ; i++);
			System.arraycopy(buffer, pos, array, off + read, i);
			pos += i;
			avail -= i;
			read += i;
			remaining -= i;
			if (remaining == 0) {
				readBytes += read;
				return read; // We did not stop because of a terminator
			}

			if (avail > 0) { // We met a terminator
				if (k == '\n') { // LF first
					pos++;
					avail--;
					if (terminators.contains(LineTerminator.LF)) {
						readBytes += read + 1;
						return read;
					}
					else {
						array[off + read++] = '\n';
						remaining--;
					}
				}
				else if (k == '\r') { // CR first
					pos++;
					avail--;

					if (terminators.contains(LineTerminator.CR_LF)) {
						if (avail > 0) {
							if (buffer[pos] == '\n') { // CR/LF with LF already in the buffer.
								pos ++;
								avail--;
								readBytes += read + 2;
								return read;
							}
						}
						else { // We must search for the LF.
							if (noMoreCharacters()) {
								// Not found a matching LF because of end of file, will return CR in buffer if not a terminator

								if (! terminators.contains(LineTerminator.CR)) {
									array[off + read++] = '\r';
									remaining--;
									readBytes += read;
								}
								else readBytes += read + 1;

								return read;
							}
							if (buffer[0] == '\n') {
								// Found matching LF, won't return terminators in the buffer
								pos++;
								avail--;
								readBytes += read + 2;
								return read;
							}
						}
					}

					if (terminators.contains(LineTerminator.CR)) {
						readBytes += read + 1;
						return read;
					}

					array[off + read++] = '\r';
					remaining--;
				}
			}
			else if (noMoreCharacters()) {
				readBytes += read;
				return read;
			}
		}
	}

	@Override
	public void position(long newPosition) throws IOException {

		final long position = readBytes;

		/** Note that this check will succeed also in the case of
		 * an empty buffer and position == newPosition. This behaviour is
		 * intentional, as it delays buffering to when it is actually
		 * necessary and avoids useless class the underlying stream. */

		if (newPosition <= position + avail && newPosition >= position - pos) {
			pos += newPosition - position;
			avail -= newPosition - position;
			readBytes = newPosition;
			return;
		}

		if (repositionableStream != null) repositionableStream.position(newPosition);
		else if (fileChannel != null) fileChannel.position(newPosition);
		else throw new UnsupportedOperationException("position() can only be called if the underlying byte stream implements the RepositionableStream interface or if the getChannel() method of the underlying byte stream exists and returns a FileChannel");
		readBytes = newPosition;

		avail = pos = 0;
	}

	@Override
	public long position() throws IOException {
		return readBytes;
	}

	/** Returns the length of the underlying input stream, if it is {@linkplain MeasurableStream measurable}.
	 *
	 * @return the length of the underlying input stream.
	 * @throws UnsupportedOperationException if the underlying input stream is not {@linkplain MeasurableStream measurable} and
	 * cannot provide a {@link FileChannel}.
	 */

	@Override
	public long length() throws IOException {
		if (measurableStream != null) return measurableStream.length();
		if (fileChannel != null) return fileChannel.size();
		throw new UnsupportedOperationException();
	}

	/** Skips the given amount of bytes by repeated reads.
	 *
	 * Warning: this method uses destructively the internal buffer.
	 *
	 * @param n the number of bytes to skip.
	 * @return the number of bytes actually skipped.
	 * @see InputStream#skip(long)
	 */

	private long skipByReading(final long n) throws IOException {
		long toSkip = n;
		int len;
		while(toSkip > 0) {
			len = is.read(buffer, 0, (int)Math.min(buffer.length, toSkip));
			if (len > 0) toSkip -= len;
			else break;
		}

		return n - toSkip;
	}

	/** Skips over and discards the given number of bytes of data from this fast buffered input stream.
	 *
	 * 
As explained in the {@linkplain FastBufferedInputStream class documentation}, the semantics
	 * of {@link InputStream#skip(long)} is fatally flawed. This method provides additional semantics as follows:
	 * it will skip the provided number of bytes, unless the end of file has been reached.
	 *
	 * 
Additionally, if the underlying input stream is {@link System#in} this method will use
	 * repeated reads instead of invoking {@link InputStream#skip(long)}.
	 *
	 * @param n the number of bytes to skip.
	 * @return the number of bytes actually skipped; it can be smaller than {@code n}
	 * only if the end of file has been reached.
	 * @see InputStream#skip(long)
	 */

	@Override
	public long skip(final long n) throws IOException {
		if (n <= avail) {
			final int m = (int)n;
			pos += m;
			avail -= m;
			readBytes += n;
			return n;
		}

		long toSkip = n - avail, result = 0;
		avail = 0;

		while (toSkip != 0 && (result = is == System.in ? skipByReading(toSkip) : is.skip(toSkip)) < toSkip) {
			if (result == 0) {
				if (is.read() == -1) break;
				toSkip--;
			}
			else toSkip -= result;
		}

		final long t = n - (toSkip - result);
		readBytes += t;
		return t;
	}

	@Override
	public int available() throws IOException {
		return (int)Math.min(is.available() + (long)avail, Integer.MAX_VALUE);
	}

	@Override
	public void close() throws IOException {
		if (is == null) return;
		if (is != System.in) is.close();
		is = null;
		buffer = null;
	}

	/** Resets the internal logic of this fast buffered input stream, clearing the buffer.
	 *
	 * 
All buffering information is discarded, and the number of bytes read so far
	 * (and thus, also the {@linkplain #position() current position})
	 * is adjusted to reflect this fact.
	 *
	 * This method is mainly useful for re-reading
	 * files that have been overwritten externally.
	 */

	public void flush() {
		if (is == null) return;
		readBytes += avail;
		avail = pos = 0;
	}

	/** Resets the internal logic of this fast buffered input stream.
	 *
	 * @deprecated As of {@code fastutil} 5.0.4, replaced by {@link #flush()}. The old
	 * semantics of this method does not contradict {@link InputStream}'s contract, as
	 * the semantics of {@link #reset()} is undefined if {@link InputStream#markSupported()}
	 * returns false. On the other hand, the name was really a poor choice.
	 */
	@Override
	@Deprecated
	public void reset() {
		flush();
	}
}