// src.it.unimi.dsi.io.FastBufferedReader Maven / Gradle / Ivy
// Show all versions of dsiutils Show documentation
/*
* DSI utilities
*
* Copyright (C) 2007-2020 Sebastiano Vigna
*
* This library is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 3 of the License, or (at your option)
* any later version.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*
*/
package it.unimi.dsi.io;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.Reader;
import it.unimi.dsi.fastutil.Hash;
import it.unimi.dsi.fastutil.chars.CharArrays;
import it.unimi.dsi.fastutil.chars.CharOpenHashSet;
import it.unimi.dsi.fastutil.chars.CharSet;
import it.unimi.dsi.fastutil.chars.CharSets;
import it.unimi.dsi.lang.MutableString;
/** A lightweight, unsynchronised buffered reader based on
* {@linkplain it.unimi.dsi.lang.MutableString mutable strings}.
*
* <p>This class provides buffering for readers, but it does so with
* purposes and an internal logic that are radically different from the ones
* adopted in {@link java.io.BufferedReader}.
*
*
* <p>There is no support for marking. All methods are unsynchronised. All
* methods returning strings do so by writing in a given {@link it.unimi.dsi.lang.MutableString}.
*
*
* <p>Note that instances of this class can wrap {@linkplain #FastBufferedReader(char[]) an array}
* or a {@linkplain #FastBufferedReader(MutableString) mutable string}. In this case,
* instances of this class may be used as a lightweight, unsynchronised
* alternative to {@link java.io.CharArrayReader}
* providing additional services such as word and line breaking.
*
*
* <p>As any {@link it.unimi.dsi.io.WordReader}, this class is serialisable.
* The only field kept is the current buffer size, which will be used to rebuild
* a fast buffered reader with the same buffer size. All other fields will be reset.
*
*
* <h2>Reading words</h2>
*
* <p>This class implements {@link WordReader} in the simplest way: words are defined as
* maximal subsequences of characters satisfying {@link Character#isLetterOrDigit(char)}.
* To alter this behaviour, you have two choices:
* <ul>
* <li>you can provide at construction time a {@link CharSet} of characters that will be considered word constituents
* besides those accepted by {@link Character#isLetterOrDigit(char)};
* <li>you can override the method {@link #isWordConstituent(char)}; in this case,
* you must override the {@link #copy()} method, too.
* </ul>
*
* <p>The second approach is of course more flexible, but the first one is particularly useful from
* the command line as there is a {@linkplain FastBufferedReader#FastBufferedReader(String) constructor
* accepting the additional word constituents as a string}.
*/
public class FastBufferedReader extends Reader implements WordReader {
/** The serialisation version identifier of this class. */
public static final long serialVersionUID = 1L;
/** The default size of the internal buffer in characters (16Ki). */
public final static int DEFAULT_BUFFER_SIZE = 16 * 1024;
/** The buffer size (must be equal to {@link #buffer buffer.length}). */
protected final int bufferSize;
/** A set of additional characters that will be considered as word constituents, beside those accepted by {@link Character#isLetterOrDigit(char)}. */
protected final CharSet wordConstituents;
/** The internal buffer (transient: it is reallocated on deserialisation). */
protected transient char[] buffer;
/** The current position in the buffer. */
protected transient int pos;
/** The number of buffer characters available starting from {@link #pos}. */
protected transient int avail;
/** The underlying reader. */
protected transient Reader reader;
/** Creates a new fast buffered reader with a given buffer size and no additional word constituents.
* The wrapped reader will have to be set later using {@link #setReader(Reader)}.
*
* @param bufferSize the size in characters of the internal buffer (must be strictly positive).
* @throws IllegalArgumentException if {@code bufferSize} is not strictly positive.
*/
public FastBufferedReader(final int bufferSize) {
this (bufferSize, CharSets.EMPTY_SET);
}
/**
 * Builds a reader that owns a freshly allocated buffer and recognises the given extra word constituents.
 * No underlying reader is attached: one must be supplied with {@link #setReader(Reader)} before reading.
 *
 * @param bufferSize the size in characters of the internal buffer (must be strictly positive).
 * @param wordConstituents a set of characters that will be considered word constituents.
 * @throws IllegalArgumentException if {@code bufferSize} is not strictly positive.
 */
public FastBufferedReader(final int bufferSize, final CharSet wordConstituents) {
    if (bufferSize < 1) {
        throw new IllegalArgumentException("The buffer size must be strictly positive.");
    }
    this.bufferSize = bufferSize;
    this.buffer = new char[bufferSize];
    this.wordConstituents = wordConstituents;
}
/** Creates a new fast buffered reader with a buffer of {@link #DEFAULT_BUFFER_SIZE} characters
* and no additional word constituents.
* The wrapped reader will have to be set later using {@link #setReader(Reader)}.
*/
public FastBufferedReader() {
this(DEFAULT_BUFFER_SIZE);
}
/** Creates a new fast buffered reader with a buffer of {@link #DEFAULT_BUFFER_SIZE} characters and given set of additional word constituents.
* The wrapped reader will have to be set later using {@link #setReader(Reader)}.
*
* @param wordConstituents a set of characters that will be considered word constituents.
*/
public FastBufferedReader(final CharSet wordConstituents) {
this(DEFAULT_BUFFER_SIZE, wordConstituents);
}
/** Creates a new fast buffered reader with a buffer of {@link #DEFAULT_BUFFER_SIZE} characters and a set of additional word constituents specified by a string.
*
*
Warning: it is easy to mistake this method for one whose semantics is
* the same as {@link #FastBufferedReader(MutableString)}, that is, wrapping the argument
* string in a reader.
*
* @param wordConstituents a string of characters that will be considered word constituents.
* @throws IllegalArgumentException if {@code wordConstituents} contains duplicate characters.
*/
public FastBufferedReader(final String wordConstituents) {
this(new CharOpenHashSet(wordConstituents.toCharArray(), Hash.VERY_FAST_LOAD_FACTOR));
if (wordConstituents.length() != this.wordConstituents.size()) throw new IllegalArgumentException("Repeated characters in word constituents: \"" + wordConstituents + "\"");
}
/** Creates a new fast buffered reader with a given buffer size and a set of additional word constituents, both specified by strings.
*
* <p>Unlike {@link #FastBufferedReader(String)}, this constructor performs no check for
* repeated characters in {@code wordConstituents}.
*
* @param bufferSize the size in characters of the internal buffer, as a string parsed by
* {@link Integer#parseInt(String)} (must be strictly positive).
* @param wordConstituents a string of characters that will be considered word constituents.
* @throws NumberFormatException if {@code bufferSize} does not contain a parsable integer.
* @throws IllegalArgumentException if the parsed buffer size is not strictly positive.
*/
public FastBufferedReader(final String bufferSize, final String wordConstituents) {
this(Integer.parseInt(bufferSize), new CharOpenHashSet(wordConstituents.toCharArray(), Hash.VERY_FAST_LOAD_FACTOR));
}
/** Creates a new fast buffered reader by wrapping a given reader with a given buffer size.
*
* @param r a reader to wrap.
* @param bufferSize the size in characters of the internal buffer (must be strictly positive).
* @throws IllegalArgumentException if {@code bufferSize} is not strictly positive.
*/
public FastBufferedReader(final Reader r, final int bufferSize) {
this(bufferSize);
this.reader = r;
}
/** Creates a new fast buffered reader by wrapping a given reader with a given buffer size and using a set of additional word constituents.
*
* @param r a reader to wrap.
* @param bufferSize the size in characters of the internal buffer (must be strictly positive).
* @param wordConstituents a set of characters that will be considered word constituents.
* @throws IllegalArgumentException if {@code bufferSize} is not strictly positive.
*/
public FastBufferedReader(final Reader r, final int bufferSize, final CharSet wordConstituents) {
this(bufferSize, wordConstituents);
this.reader = r;
}
/** Creates a new fast buffered reader by wrapping a given reader with a buffer of {@link #DEFAULT_BUFFER_SIZE} characters
* and no additional word constituents.
*
* @param r a reader to wrap.
*/
public FastBufferedReader(final Reader r) {
this(r, DEFAULT_BUFFER_SIZE);
}
/** Creates a new fast buffered reader by wrapping a given reader with a buffer of {@link #DEFAULT_BUFFER_SIZE} characters and using a set of additional word constituents.
*
* @param r a reader to wrap.
* @param wordConstituents a set of characters that will be considered word constituents.
*/
public FastBufferedReader(final Reader r, final CharSet wordConstituents) {
this(r, DEFAULT_BUFFER_SIZE, wordConstituents);
}
/**
 * Builds a reader over a fragment of an existing character array, using a set of additional word constituents.
 *
 * <p>The array is used directly as the internal buffer (it is not copied), and
 * {@link #bufferSize} is set to the length of the whole array. The underlying reader is the
 * {@link NullReader} instance, so once the fragment has been consumed any refill is
 * delegated to that reader.
 *
 * <p>The effect of {@link #setReader(Reader)} on a buffer created with
 * this constructor is undefined.
 *
 * @param array the array that will be wrapped by the reader.
 * @param offset the first character to be used.
 * @param length the number of characters to be used.
 * @param wordConstituents a set of characters that will be considered word constituents.
 */
public FastBufferedReader(final char[] array, final int offset, final int length, final CharSet wordConstituents) {
    // Range validation comes first, so that no field is touched on bad arguments.
    CharArrays.ensureOffsetLength(array, offset, length);
    this.wordConstituents = wordConstituents;
    this.bufferSize = array.length;
    this.buffer = array;
    this.pos = offset;
    this.avail = length;
    this.reader = NullReader.getInstance();
}
/** Creates a new fast buffered reader by wrapping a given fragment of a character array,
* with no additional word constituents.
*
* <p>The effect of {@link #setReader(Reader)} on a buffer created with
* this constructor is undefined.
*
* @param array the array that will be wrapped by the reader.
* @param offset the first character to be used.
* @param length the number of characters to be used.
*/
public FastBufferedReader(final char[] array, final int offset, final int length) {
this(array, offset, length, CharSets.EMPTY_SET);
}
/** Creates a new fast buffered reader by wrapping a given character array (in its entirety) and using a set of additional word constituents.
*
* <p>The effect of {@link #setReader(Reader)} on a buffer created with
* this constructor is undefined.
*
* @param array the array that will be wrapped by the reader.
* @param wordConstituents a set of characters that will be considered word constituents.
*/
public FastBufferedReader(final char[] array, final CharSet wordConstituents) {
this(array, 0, array.length, wordConstituents);
}
/** Creates a new fast buffered reader by wrapping a given character array (in its entirety),
* with no additional word constituents.
*
* <p>The effect of {@link #setReader(Reader)} on a buffer created with
* this constructor is undefined.
*
* @param array the array that will be wrapped by the reader.
*/
public FastBufferedReader(final char[] array) {
this(array, 0, array.length);
}
/** Creates a new fast buffered reader by wrapping a given mutable string and using a set of additional word constituents.
*
* <p>The array returned by {@code s.array()} is wrapped directly, using its first
* {@code s.length()} characters.
*
* <p>The effect of {@link #setReader(Reader)} on a buffer created with
* this constructor is undefined.
*
* @param s the mutable string that will be wrapped by the reader.
* @param wordConstituents a set of characters that will be considered word constituents.
*/
public FastBufferedReader(final MutableString s, final CharSet wordConstituents) {
this(s.array(), 0, s.length(), wordConstituents);
}
/** Creates a new fast buffered reader by wrapping a given mutable string,
* with no additional word constituents.
*
* <p>The array returned by {@code s.array()} is wrapped directly, using its first
* {@code s.length()} characters.
*
* <p>The effect of {@link #setReader(Reader)} on a buffer created with
* this constructor is undefined.
*
* @param s the mutable string that will be wrapped by the reader.
*/
public FastBufferedReader(final MutableString s) {
this(s.array(), 0, s.length());
}
/**
 * Returns a new, unattached reader configured like this one.
 *
 * <p>The copy has its own freshly allocated buffer and the same set of additional word
 * constituents; it has no underlying reader and shares no buffered content with this
 * instance. Subclasses that override {@link #isWordConstituent(char)} must override this
 * method too, as this implementation always instantiates {@code FastBufferedReader}.
 *
 * @return a new fast buffered reader with the same buffer size and word constituents.
 */
@Override
public FastBufferedReader copy() {
    // An instance wrapping a zero-length array has bufferSize == 0, a value that the
    // sizing constructor rejects: fall back to the default size instead of throwing.
    final int size = bufferSize > 0 ? bufferSize : DEFAULT_BUFFER_SIZE;
    return new FastBufferedReader(size, wordConstituents);
}
/**
 * Checks whether no more characters will be returned, refilling the buffer if it is empty.
 *
 * <p>When the buffer still holds characters nothing is read. Otherwise the whole buffer is
 * offered to the underlying reader: on success {@link #pos} is reset to zero and
 * {@link #avail} is set to the number of characters read.
 *
 * @return true if there are no characters in the internal buffer and
 * the underlying reader is exhausted.
 * @throws IOException if the underlying reader throws it.
 */
protected boolean noMoreCharacters() throws IOException {
    if (avail != 0) {
        return false;
    }
    final int filled = reader.read(buffer);
    if (filled <= 0) {
        // Normalise the end-of-stream marker (-1) so that avail never becomes negative.
        avail = 0;
        return true;
    }
    avail = filled;
    pos = 0;
    return false;
}
/**
 * Reads a single character.
 *
 * @return the next character as an integer, or -1 if no more characters are available.
 * @throws IOException if the underlying reader throws it while refilling the buffer.
 */
@Override
public int read() throws IOException {
    if (noMoreCharacters()) {
        return -1;
    }
    avail--;
    final char next = buffer[pos];
    pos++;
    return next;
}
/**
 * Reads characters into a fragment of an array.
 *
 * <p>If the request can be satisfied entirely from the internal buffer, no call to the
 * underlying reader is made. Otherwise the buffered characters are copied first and the
 * remainder is read from the underlying reader directly into {@code b}, bypassing the
 * internal buffer.
 *
 * @param b the destination array.
 * @param offset the first position of {@code b} to be written.
 * @param length the maximum number of characters to read.
 * @return the number of characters read, or -1 if nothing was buffered and the underlying
 * reader reported end of stream.
 * @throws IOException if the underlying reader throws it.
 */
@Override
public int read(final char[] b, int offset, int length) throws IOException {
    CharArrays.ensureOffsetLength(b, offset, length);

    if (avail >= length) {
        System.arraycopy(buffer, pos, b, offset, length);
        avail -= length;
        pos += length;
        return length;
    }

    // Drain what is buffered, then let the underlying reader fill the rest of the request.
    final int buffered = avail;
    System.arraycopy(buffer, pos, b, offset, buffered);
    avail = 0;

    final int fromReader = reader.read(b, offset + buffered, length - buffered);
    if (fromReader >= 0) {
        return fromReader + buffered;
    }
    // End of stream: report the buffered characters, if any, before signalling EOF.
    return buffered != 0 ? buffered : -1;
}
/**
 * Reads a line into the given mutable string.
 *
 * <p>The next line of input (defined as in {@link java.io.BufferedReader#readLine()})
 * will be stored into {@code s}. A line is terminated by a line feed, a carriage return,
 * or a carriage return immediately followed by a line feed; the terminator is consumed
 * but not stored. Note that if {@code s} is
 * not {@linkplain it.unimi.dsi.lang.MutableString loose}
 * this method will be quite inefficient.
 *
 * @param s a mutable string that will be used to store the next line (which could be empty).
 * @return {@code s}, or {@code null} if the end of file was found, in which
 * case {@code s} is unchanged.
 * @throws IOException if the underlying reader throws it.
 */
public MutableString readLine(final MutableString s) throws IOException {
    if (noMoreCharacters()) return null;
    s.length(0);

    char c = 0;
    for (;;) {
        // Scan the buffered characters up to (and excluding) the first line terminator.
        int i = 0;
        while (i < avail && (c = buffer[pos + i]) != '\n' && c != '\r') i++;

        s.append(buffer, pos, i);
        pos += i;
        avail -= i;

        if (avail > 0) {
            // The scan stopped before the end of the buffer, so c is the terminator: consume it.
            pos++;
            avail--;
            // After a CR, swallow a directly following LF. noMoreCharacters() refills the
            // buffer only when it is empty, so the LF is found even across a buffer boundary;
            // the look-ahead uses the current position rather than a fixed index.
            if (c == '\r' && !noMoreCharacters() && buffer[pos] == '\n') {
                pos++;
                avail--;
            }
            return s;
        }

        // Buffer exhausted without a terminator: refill, or return the last (unterminated) line.
        if (noMoreCharacters()) return s;
    }
}
/** Returns whether the given character is a word constituent.
*
* <p>This implementation accepts characters satisfying {@link Character#isLetterOrDigit(char)}
* and characters contained in {@link #wordConstituents}.
*
* <p>The behaviour of this {@link FastBufferedReader} as a {@link WordReader} can
* be radically changed by overriding this method; in that case {@link #copy()} must be
* overridden, too, as it instantiates {@code FastBufferedReader} directly.
*
* @param c a character.
* @return whether {@code c} should be considered a word constituent.
*/
protected boolean isWordConstituent(final char c) {
return Character.isLetterOrDigit(c) || wordConstituents.contains(c);
}
/**
 * Extracts the next word and the non-word that follows it.
 *
 * <p>Both arguments are emptied, then filled with the maximal run of characters accepted by
 * {@link #isWordConstituent(char)} and with the maximal run of characters rejected by it,
 * respectively. Either run may be empty, and either may span several buffer refills.
 *
 * @param word the mutable string that will receive the next word.
 * @param nonWord the mutable string that will receive the following non-word.
 * @return false if no character was available (in which case the arguments are left
 * untouched), true otherwise.
 * @throws IOException if the underlying reader throws it.
 */
@Override
public boolean next(final MutableString word, final MutableString nonWord) throws IOException {
    final char[] chars = this.buffer;

    if (noMoreCharacters()) return false;

    word.length(0);
    nonWord.length(0);

    int run;

    // Word part: keep appending constituent runs while the buffer drains completely and can be refilled.
    do {
        run = 0;
        while (run < avail && isWordConstituent(chars[pos + run])) run++;
        word.append(chars, pos, run);
        pos += run;
        avail -= run;
    } while (avail <= 0 && !noMoreCharacters());

    // The input ended right after the word: the non-word stays empty.
    if (noMoreCharacters()) return true;

    // Non-word part, with the same refill logic.
    do {
        run = 0;
        while (run < avail && !isWordConstituent(chars[pos + run])) run++;
        nonWord.append(chars, pos, run);
        pos += run;
        avail -= run;
    } while (avail <= 0 && !noMoreCharacters());

    return true;
}
/**
 * Attaches a new underlying reader, discarding any buffered characters.
 *
 * <p>This method can also be used to reuse an instance after {@link #close()}: the internal
 * buffer released by {@code close()} is allocated again.
 *
 * @param reader the new underlying reader.
 * @return this fast buffered reader.
 */
@Override
public FastBufferedReader setReader(final Reader reader) {
    this.reader = reader;
    // close() nulls the buffer; without a new one the first refill after reattaching
    // a reader would fail with a NullPointerException.
    if (buffer == null) buffer = new char[bufferSize];
    avail = 0;
    return this;
}
/**
 * Skips characters.
 *
 * <p>Characters still in the internal buffer are skipped first; any remainder is skipped
 * by delegating to the underlying reader.
 *
 * @param n the number of characters to skip (must be nonnegative).
 * @return the number of characters actually skipped.
 * @throws IllegalArgumentException if {@code n} is negative.
 * @throws IOException if the underlying reader throws it.
 */
@Override
public long skip(long n) throws IOException {
    // A negative value would move pos backwards and inflate avail, corrupting the buffer state.
    if (n < 0) throw new IllegalArgumentException("The number of characters to skip must be nonnegative: " + n);

    if (n <= avail) {
        // n fits in an int here, as it is bounded by avail.
        final int k = (int)n;
        pos += k;
        avail -= k;
        return n;
    }

    final int head = avail;
    avail = 0;
    return reader.skip(n - head) + head;
}
/**
 * Closes the underlying reader and releases the references to it and to the internal buffer.
 *
 * <p>If no underlying reader is attached this method does nothing (in particular, the
 * buffer is kept).
 *
 * @throws IOException if closing the underlying reader throws it; in that case the
 * references are not released.
 */
@Override
public void close() throws IOException {
    if (reader != null) {
        reader.close();
        reader = null;
        buffer = null;
    }
}
/** Deserialisation hook: restores the non-transient fields and allocates a new internal buffer
* of {@link #bufferSize} characters. The other transient fields are not assigned here.
*
* @param s the object input stream this instance is being read from.
* @throws IOException if reading from the stream fails.
* @throws ClassNotFoundException if a class of a serialised object cannot be found.
*/
private void readObject(final ObjectInputStream s) throws IOException, ClassNotFoundException {
s.defaultReadObject();
buffer = new char[bufferSize];
}
/** Returns a specification string for this reader; this implementation
* simply returns the result of {@link #toString()}.
*
* @return the same string returned by {@link #toString()}.
*/
public String toSpec() {
return toString();
}
/**
 * Returns the class name of this reader followed, when they differ from the defaults, by its
 * buffer size and/or its additional word constituents between parentheses.
 *
 * <p>The possible shapes are {@code Name}, {@code Name(size)}, {@code Name("chars")} and
 * {@code Name(size,"chars")}. The word constituents are inserted between double quotes
 * as they are, without any escaping.
 *
 * @return a string describing the class and configuration of this reader.
 */
@Override
public String toString() {
    final String name = getClass().getName();
    final boolean defaultSize = bufferSize == DEFAULT_BUFFER_SIZE;

    if (wordConstituents.isEmpty()) {
        return defaultSize ? name : name + "(" + bufferSize + ")";
    }

    final String constituents = new String(wordConstituents.toCharArray());
    return defaultSize
        ? name + "(\"" + constituents + "\")"
        : name + "(" + bufferSize + ",\"" + constituents + "\")";
}
}