All Downloads are FREE. Search and download functionalities are using the official Maven repository.

src.it.unimi.dsi.io.FastBufferedReader Maven / Gradle / Ivy

Go to download

The DSI utilities are a mishmash of classes accumulated during the last twenty years in projects developed at the DSI (Dipartimento di Scienze dell'Informazione, i.e., Information Sciences Department), now DI (Dipartimento di Informatica, i.e., Informatics Department), of the Università degli Studi di Milano.

There is a newer version: 2.7.3
Show newest version
package it.unimi.dsi.io;

/*
 * DSI utilities
 *
 * Copyright (C) 2007-2019 Sebastiano Vigna
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 3 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 */

import it.unimi.dsi.fastutil.Hash;
import it.unimi.dsi.fastutil.chars.CharArrays;
import it.unimi.dsi.fastutil.chars.CharOpenHashSet;
import it.unimi.dsi.fastutil.chars.CharSet;
import it.unimi.dsi.fastutil.chars.CharSets;
import it.unimi.dsi.lang.MutableString;

import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.Reader;

/** A lightweight, unsynchronised buffered reader based on
 * {@linkplain it.unimi.dsi.lang.MutableString mutable strings}.
 *
 * 

This class provides buffering for readers, but it does so with * purposes and an internal logic that are radically different from the ones * adopted in {@link java.io.BufferedReader}. * *

There is no support for marking. All methods are unsychronised. All * methods returning strings do so by writing in a given {@link it.unimi.dsi.lang.MutableString}. * *

Note that instances of this class can wrap {@linkplain #FastBufferedReader(char[]) an array} * or a {@linkplain #FastBufferedReader(MutableString) mutable string}. In this case, * instances of this class may be used as a lightweight, unsynchronised * alternative to {@link java.io.CharArrayReader} * providing additional services such as word and line breaking. * *

As any {@link it.unimi.dsi.io.WordReader}, this class is serialisable. * The only field kept is the current buffer size, which will be used to rebuild * a fast buffered reader with the same buffer size. All other fields will be reset. * *

Reading words

* *

This class implements {@link WordReader} in the simplest way: words are defined as * maximal subsequences of characters satisfying {@link Character#isLetterOrDigit(char)}. * To alter this behaviour, you have two choices: *

    *
  • you can provide at construction time a {@link CharSet} of characters that will be considered word constituents * besides those accepted by {@link Character#isLetterOrDigit(char)}; *
  • you can override the method {@link #isWordConstituent(char)}; in this case, * you must override the {@link #copy()} method, too. *
* *

The second approach is of course more flexible, but the first one is particularly useful from * the command line as there is a {@linkplain FastBufferedReader#FastBufferedReader(String) constructor * accepting the additional word constituents as a string}. */ public class FastBufferedReader extends Reader implements WordReader { public static final long serialVersionUID = 1L; /** The default size of the internal buffer in bytes (16Ki). */ public final static int DEFAULT_BUFFER_SIZE = 16 * 1024; /** The buffer size (must be equal to {@link #buffer buffer.length}). */ protected final int bufferSize; /** A set of additional characters that will be considered as word constituents, beside those accepted by {@link Character#isLetterOrDigit(int)}. */ protected final CharSet wordConstituents; /** The internal buffer. */ protected transient char[] buffer; /** The current position in the buffer. */ protected transient int pos; /** The number of buffer bytes available starting from {@link #pos}. */ protected transient int avail; /** The underlying reader. */ protected transient Reader reader; /** Creates a new fast buffered reader with a given buffer size. * The wrapped reader will have to be set later using {@link #setReader(Reader)}. * * @param bufferSize the size in characters of the internal buffer (must be nonzero). */ public FastBufferedReader(final int bufferSize) { this (bufferSize, CharSets.EMPTY_SET); } /** Creates a new fast buffered reader with a given buffer size and set of additional word constituents. * The wrapped reader will have to be set later using {@link #setReader(Reader)}. * * @param bufferSize the size in characters of the internal buffer (must be nonzero). * @param wordConstituents a set of characters that will be considered word constituents. 
*/ public FastBufferedReader(final int bufferSize, final CharSet wordConstituents) { if (bufferSize <= 0) throw new IllegalArgumentException("The buffer size must be strictly positive."); buffer = new char[this.bufferSize = bufferSize]; this.wordConstituents = wordConstituents; } /** Creates a new fast buffered reader with a buffer of {@link #DEFAULT_BUFFER_SIZE} characters. * The wrapped reader will have to be set later using {@link #setReader(Reader)}. */ public FastBufferedReader() { this(DEFAULT_BUFFER_SIZE); } /** Creates a new fast buffered reader with a buffer of {@link #DEFAULT_BUFFER_SIZE} characters and given set of additional word constituents. * The wrapped reader will have to be set later using {@link #setReader(Reader)}. * @param wordConstituents a set of characters that will be considered word constituents. */ public FastBufferedReader(final CharSet wordConstituents) { this(DEFAULT_BUFFER_SIZE, wordConstituents); } /** Creates a new fast buffered reader with a buffer of {@link #DEFAULT_BUFFER_SIZE} characters and a set of additional word constituents specified by a string. * *

Warning: it is easy to mistake this method for one whose semantics is * the same as {@link #FastBufferedReader(MutableString)}, that is, wrapping the argument * string in a reader. * * @param wordConstituents a string of characters that will be considered word constituents. * @throws IllegalArgumentException if {@code wordConstituents} contains duplicate characters. */ public FastBufferedReader(final String wordConstituents) { this(new CharOpenHashSet(wordConstituents.toCharArray(), Hash.VERY_FAST_LOAD_FACTOR)); if (wordConstituents.length() != this.wordConstituents.size()) throw new IllegalArgumentException("Repeated characters in word constituents: \"" + wordConstituents + "\""); } /** Creates a new fast buffered reader with a given buffer size and a set of additional word constituents, both specified by strings. * @param bufferSize the size in characters of the internal buffer (must be nonzero). * @param wordConstituents a string of characters that will be considered word constituents. */ public FastBufferedReader(final String bufferSize, final String wordConstituents) { this(Integer.parseInt(bufferSize), new CharOpenHashSet(wordConstituents.toCharArray(), Hash.VERY_FAST_LOAD_FACTOR)); } /** Creates a new fast buffered reader by wrapping a given reader with a given buffer size. * * @param r a reader to wrap. * @param bufferSize the size in bytes of the internal buffer. */ public FastBufferedReader(final Reader r, final int bufferSize) { this(bufferSize); this.reader = r; } /** Creates a new fast buffered reader by wrapping a given reader with a given buffer size and using a set of additional word constituents. * * @param r a reader to wrap. * @param bufferSize the size in characters of the internal buffer (must be nonzero). * @param wordConstituents a set of characters that will be considered word constituents. 
*/ public FastBufferedReader(final Reader r, final int bufferSize, final CharSet wordConstituents) { this(bufferSize, wordConstituents); this.reader = r; } /** Creates a new fast buffered reader by wrapping a given reader with a buffer of {@link #DEFAULT_BUFFER_SIZE} characters. * * @param r a reader to wrap. */ public FastBufferedReader(final Reader r) { this(r, DEFAULT_BUFFER_SIZE); } /** Creates a new fast buffered reader by wrapping a given reader with a buffer of {@link #DEFAULT_BUFFER_SIZE} characters and using a set of additional word constituents. * * @param r a reader to wrap. * @param wordConstituents a set of characters that will be considered word constituents. */ public FastBufferedReader(final Reader r, final CharSet wordConstituents) { this(r, DEFAULT_BUFFER_SIZE, wordConstituents); } /** Creates a new fast buffered reader by wrapping a given fragment of a character array and using a set of additional word constituents. *

The effect of {@link #setReader(Reader)} on a buffer created with * this constructor is undefined. * * @param array the array that will be wrapped by the reader. * @param offset the first character to be used. * @param length the number of character to be used. * @param wordConstituents a set of characters that will be considered word constituents. */ public FastBufferedReader(final char[] array, final int offset, final int length, final CharSet wordConstituents) { CharArrays.ensureOffsetLength(array, offset, length); buffer = array; pos = offset; avail = length; bufferSize = array.length; reader = NullReader.getInstance(); this.wordConstituents = wordConstituents; } /** Creates a new fast buffered reader by wrapping a given fragment of a character array. *

The effect of {@link #setReader(Reader)} on a buffer created with * this constructor is undefined. * * @param array the array that will be wrapped by the reader. * @param offset the first character to be used. * @param length the number of character to be used. */ public FastBufferedReader(final char[] array, final int offset, final int length) { this(array, offset, length, CharSets.EMPTY_SET); } /** Creates a new fast buffered reader by wrapping a given character array and using a set of additional word constituents. * *

The effect of {@link #setReader(Reader)} on a buffer created with * this constructor is undefined. * @param array the array that will be wrapped by the reader. * @param wordConstituents a set of characters that will be considered word constituents. */ public FastBufferedReader(final char[] array, final CharSet wordConstituents) { this(array, 0, array.length, wordConstituents); } /** Creates a new fast buffered reader by wrapping a given character array. * *

The effect of {@link #setReader(Reader)} on a buffer created with * this constructor is undefined. * @param array the array that will be wrapped by the reader. */ public FastBufferedReader(final char[] array) { this(array, 0, array.length); } /** Creates a new fast buffered reader by wrapping a given mutable string and using a set of additional word constituents. * *

The effect of {@link #setReader(Reader)} on a buffer created with * this constructor is undefined. * * @param s the mutable string that will be wrapped by the reader. * @param wordConstituents a set of characters that will be considered word constituents. */ public FastBufferedReader(final MutableString s, final CharSet wordConstituents) { this(s.array(), 0, s.length(), wordConstituents); } /** Creates a new fast buffered reader by wrapping a given mutable string. *

The effect of {@link #setReader(Reader)} on a buffer created with * this constructor is undefined. * * @param s the mutable string that will be wrapped by the reader. */ public FastBufferedReader(final MutableString s) { this(s.array(), 0, s.length()); } @Override public FastBufferedReader copy() { return new FastBufferedReader(bufferSize, wordConstituents); } /** Checks whether no more characters will be returned. * * @return true if there are no characters in the internal buffer and * the underlying reader is exhausted. */ protected boolean noMoreCharacters() throws IOException { if (avail == 0) { avail = reader.read(buffer); if (avail <= 0) { avail = 0; return true; } pos = 0; } return false; } @Override public int read() throws IOException { if (noMoreCharacters()) return -1; avail--; return buffer[pos++]; } @Override public int read(final char[] b, int offset, int length) throws IOException { CharArrays.ensureOffsetLength(b, offset, length); if (length <= avail) { System.arraycopy(buffer, pos, b, offset, length); pos += length; avail -= length; return length; } final int head = avail; System.arraycopy(buffer, pos, b, offset, head); offset += head; length -= head; avail = 0; int result; result = reader.read(b, offset, length); return result < 0 ? (head != 0 ? head : -1) : result + head; } /** Reads a line into the given mutable string. * *

The next line of input (defined as in {@link java.io.BufferedReader#readLine()}) * will be stored into s. Note that if s is * not {@linkplain it.unimi.dsi.lang.MutableString loose} * this method will be quite inefficient. * * @param s a mutable string that will be used to store the next line (which could be empty). * @return s, or {@code null} if the end of file was found, in which * case s is unchanged. */ public MutableString readLine(final MutableString s) throws IOException { char c = 0; int i; if (noMoreCharacters()) return null; s.length(0); for(;;) { for(i = 0; i < avail && (c = buffer[pos + i]) != '\n' && c != '\r' ; i++); s.append(buffer, pos, i); pos += i; avail -= i; if (avail > 0) { if (c == '\n') { // LF only. pos++; avail--; } else { // c == '\r' pos++; avail--; if (avail > 0) { if (buffer[pos] == '\n') { // CR/LF with LF already in the buffer. pos ++; avail--; } } else { // We must search for the LF. if (noMoreCharacters()) return s; if (buffer[0] == '\n') { pos++; avail--; } } } return s; } else if (noMoreCharacters()) return s; } } /** Returns whether the given character is a word constituent. * *

The behaviour of this {@link FastBufferedReader} as a {@link WordReader} can * be radically changed by overwriting this method. * * @param c a character. * @return whether c should be considered a word constituent. */ protected boolean isWordConstituent(final char c) { return Character.isLetterOrDigit(c) || wordConstituents.contains(c); } @Override public boolean next(final MutableString word, final MutableString nonWord) throws IOException { int i; final char buffer[] = this.buffer; if (noMoreCharacters()) return false; word.length(0); nonWord.length(0); for(;;) { for(i = 0; i < avail && isWordConstituent(buffer[pos + i]); i++); word.append(buffer, pos, i); pos += i; avail -= i; if (avail > 0 || noMoreCharacters()) break; } if (noMoreCharacters()) return true; for(;;) { for(i = 0; i < avail && ! isWordConstituent(buffer[pos + i]); i++); nonWord.append(buffer, pos, i); pos += i; avail -= i; if (avail > 0 || noMoreCharacters()) return true; } } @Override public FastBufferedReader setReader(final Reader reader) { this.reader = reader; avail = 0; return this; } @Override public long skip(long n) throws IOException { if (n <= avail) { pos += ((int)n); avail -= ((int)n); return n; } final int head = avail; n -= head; avail = 0; return reader.skip(n) + head; } @Override public void close() throws IOException { if (reader == null) return; reader.close(); reader = null; buffer = null; } private void readObject(final ObjectInputStream s) throws IOException, ClassNotFoundException { s.defaultReadObject(); buffer = new char[bufferSize]; } public String toSpec() { return toString(); } @Override public String toString() { final String className = getClass().getName(); if (bufferSize == DEFAULT_BUFFER_SIZE && wordConstituents.isEmpty()) return className; if (wordConstituents.isEmpty()) return className + "(" + bufferSize + ")"; String wordConstituents = new String(this.wordConstituents.toCharArray()); if (bufferSize == DEFAULT_BUFFER_SIZE) return className + "(\"" + 
wordConstituents + "\")"; return className + "(" + bufferSize + ",\"" + wordConstituents + "\")"; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy