// All Downloads are FREE. Search and download functionalities are using the official Maven repository.

// src.it.unimi.dsi.io.FastBufferedReader Maven / Gradle / Ivy

package it.unimi.dsi.io;

/*
 * DSI utilities
 *
 * Copyright (C) 2007-2017 Sebastiano Vigna
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 3 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 */

import it.unimi.dsi.fastutil.Hash;
import it.unimi.dsi.fastutil.chars.CharArrays;
import it.unimi.dsi.fastutil.chars.CharOpenHashSet;
import it.unimi.dsi.fastutil.chars.CharSet;
import it.unimi.dsi.fastutil.chars.CharSets;
import it.unimi.dsi.lang.MutableString;

import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.Reader;

/** A lightweight, unsynchronised buffered reader based on
 * {@linkplain it.unimi.dsi.lang.MutableString mutable strings}.
 *
 * 

This class provides buffering for readers, but it does so with * purposes and an internal logic that are radically different from the ones * adopted in {@link java.io.BufferedReader}. * *

There is no support for marking. All methods are unsychronised. All * methods returning strings do so by writing in a given {@link it.unimi.dsi.lang.MutableString}. * *

Note that instances of this class can wrap {@linkplain #FastBufferedReader(char[]) an array} * or a {@linkplain #FastBufferedReader(MutableString) mutable string}. In this case, * instances of this class may be used as a lightweight, unsynchronised * alternative to {@link java.io.CharArrayReader} * providing additional services such as word and line breaking. * *

As any {@link it.unimi.dsi.io.WordReader}, this class is serialisable. * The only field kept is the current buffer size, which will be used to rebuild * a fast buffered reader with the same buffer size. All other fields will be reset. * *

Reading words

* *

This class implements {@link WordReader} in the simplest way: words are defined as * maximal subsequences of characters satisfying {@link Character#isLetterOrDigit(char)}. * To alter this behaviour, you have two choices: *

    *
  • you can provide at construction time a {@link CharSet} of characters that will be considered word constituents * besides those accepted by {@link Character#isLetterOrDigit(char)}; *
  • you can override the method {@link #isWordConstituent(char)}; in this case, * you must override the {@link #copy()} method, too. *
* *

The second approach is of course more flexible, but the first one is particularly useful from * the command line as there is a {@linkplain FastBufferedReader#FastBufferedReader(String) constructor * accepting the additional word constituents as a string}. */ public class FastBufferedReader extends Reader implements WordReader { public static final long serialVersionUID = 1L; /** The default size of the internal buffer in bytes (16Ki). */ public final static int DEFAULT_BUFFER_SIZE = 16 * 1024; /** The buffer size (must be equal to {@link #buffer buffer.length}). */ protected final int bufferSize; /** A set of additional characters that will be considered as word constituents, beside those accepted by {@link Character#isLetterOrDigit(int)}. */ protected final CharSet wordConstituents; /** The internal buffer. */ protected transient char[] buffer; /** The current position in the buffer. */ protected transient int pos; /** The number of buffer bytes available starting from {@link #pos}. */ protected transient int avail; /** The underlying reader. */ protected transient Reader reader; /** Creates a new fast buffered reader with a given buffer size. * The wrapped reader will have to be set later using {@link #setReader(Reader)}. * * @param bufferSize the size in characters of the internal buffer (must be nonzero). */ public FastBufferedReader(final int bufferSize) { this (bufferSize, CharSets.EMPTY_SET); } /** Creates a new fast buffered reader with a given buffer size and set of additional word constituents. * The wrapped reader will have to be set later using {@link #setReader(Reader)}. * * @param bufferSize the size in characters of the internal buffer (must be nonzero). * @param wordConstituents a set of characters that will be considered word constituents. 
*/ public FastBufferedReader(final int bufferSize, final CharSet wordConstituents) { if (bufferSize <= 0) throw new IllegalArgumentException("The buffer size must be strictly positive."); buffer = new char[this.bufferSize = bufferSize]; this.wordConstituents = wordConstituents; } /** Creates a new fast buffered reader with a buffer of {@link #DEFAULT_BUFFER_SIZE} characters. * The wrapped reader will have to be set later using {@link #setReader(Reader)}. */ public FastBufferedReader() { this(DEFAULT_BUFFER_SIZE); } /** Creates a new fast buffered reader with a buffer of {@link #DEFAULT_BUFFER_SIZE} characters and given set of additional word constituents. * The wrapped reader will have to be set later using {@link #setReader(Reader)}. * @param wordConstituents a set of characters that will be considered word constituents. */ public FastBufferedReader(final CharSet wordConstituents) { this(DEFAULT_BUFFER_SIZE, wordConstituents); } /** Creates a new fast buffered reader with a buffer of {@link #DEFAULT_BUFFER_SIZE} characters and a set of additional word constituents specified by a string. * *

Warning: it is easy to mistake this method for one whose semantics is * the same as {@link #FastBufferedReader(MutableString)}, that is, wrapping the argument * string in a reader. * * @param wordConstituents a string of characters that will be considered word constituents. * @throws IllegalArgumentException if {@code wordConstituents} contains duplicate characters. */ public FastBufferedReader(final String wordConstituents) { this(new CharOpenHashSet(wordConstituents.toCharArray(), Hash.VERY_FAST_LOAD_FACTOR)); if (wordConstituents.length() != this.wordConstituents.size()) throw new IllegalArgumentException("Repeated characters in word constituents: \"" + wordConstituents + "\""); } /** Creates a new fast buffered reader with a given buffer size and a set of additional word constituents, both specified by strings. * @param bufferSize the size in characters of the internal buffer (must be nonzero). * @param wordConstituents a string of characters that will be considered word constituents. */ public FastBufferedReader(final String bufferSize, final String wordConstituents) { this(Integer.parseInt(bufferSize), new CharOpenHashSet(wordConstituents.toCharArray(), Hash.VERY_FAST_LOAD_FACTOR)); } /** Creates a new fast buffered reader by wrapping a given reader with a given buffer size. * * @param r a reader to wrap. * @param bufferSize the size in bytes of the internal buffer. */ public FastBufferedReader(final Reader r, final int bufferSize) { this(bufferSize); this.reader = r; } /** Creates a new fast buffered reader by wrapping a given reader with a given buffer size and using a set of additional word constituents. * * @param r a reader to wrap. * @param bufferSize the size in characters of the internal buffer (must be nonzero). * @param wordConstituents a set of characters that will be considered word constituents. 
*/ public FastBufferedReader(final Reader r, final int bufferSize, final CharSet wordConstituents) { this(bufferSize, wordConstituents); this.reader = r; } /** Creates a new fast buffered reader by wrapping a given reader with a buffer of {@link #DEFAULT_BUFFER_SIZE} characters. * * @param r a reader to wrap. */ public FastBufferedReader(final Reader r) { this(r, DEFAULT_BUFFER_SIZE); } /** Creates a new fast buffered reader by wrapping a given reader with a buffer of {@link #DEFAULT_BUFFER_SIZE} characters and using a set of additional word constituents. * * @param r a reader to wrap. * @param wordConstituents a set of characters that will be considered word constituents. */ public FastBufferedReader(final Reader r, final CharSet wordConstituents) { this(r, DEFAULT_BUFFER_SIZE, wordConstituents); } /** Creates a new fast buffered reader by wrapping a given fragment of a character array and using a set of additional word constituents. *

The effect of {@link #setReader(Reader)} on a buffer created with * this constructor is undefined. * * @param array the array that will be wrapped by the reader. * @param offset the first character to be used. * @param length the number of character to be used. * @param wordConstituents a set of characters that will be considered word constituents. */ public FastBufferedReader(final char[] array, final int offset, final int length, final CharSet wordConstituents) { CharArrays.ensureOffsetLength(array, offset, length); buffer = array; pos = offset; avail = length; bufferSize = array.length; reader = NullReader.getInstance(); this.wordConstituents = wordConstituents; } /** Creates a new fast buffered reader by wrapping a given fragment of a character array. *

The effect of {@link #setReader(Reader)} on a buffer created with * this constructor is undefined. * * @param array the array that will be wrapped by the reader. * @param offset the first character to be used. * @param length the number of character to be used. */ public FastBufferedReader(final char[] array, final int offset, final int length) { this(array, offset, length, CharSets.EMPTY_SET); } /** Creates a new fast buffered reader by wrapping a given character array and using a set of additional word constituents. * *

The effect of {@link #setReader(Reader)} on a buffer created with * this constructor is undefined. * @param array the array that will be wrapped by the reader. * @param wordConstituents a set of characters that will be considered word constituents. */ public FastBufferedReader(final char[] array, final CharSet wordConstituents) { this(array, 0, array.length, wordConstituents); } /** Creates a new fast buffered reader by wrapping a given character array. * *

The effect of {@link #setReader(Reader)} on a buffer created with * this constructor is undefined. * @param array the array that will be wrapped by the reader. */ public FastBufferedReader(final char[] array) { this(array, 0, array.length); } /** Creates a new fast buffered reader by wrapping a given mutable string and using a set of additional word constituents. * *

The effect of {@link #setReader(Reader)} on a buffer created with * this constructor is undefined. * * @param s the mutable string that will be wrapped by the reader. * @param wordConstituents a set of characters that will be considered word constituents. */ public FastBufferedReader(final MutableString s, final CharSet wordConstituents) { this(s.array(), 0, s.length(), wordConstituents); } /** Creates a new fast buffered reader by wrapping a given mutable string. *

The effect of {@link #setReader(Reader)} on a buffer created with * this constructor is undefined. * * @param s the mutable string that will be wrapped by the reader. */ public FastBufferedReader(final MutableString s) { this(s.array(), 0, s.length()); } @Override public FastBufferedReader copy() { return new FastBufferedReader(bufferSize, wordConstituents); } /** Checks whether no more characters will be returned. * * @return true if there are no characters in the internal buffer and * the underlying reader is exhausted. */ protected boolean noMoreCharacters() throws IOException { if (avail == 0) { avail = reader.read(buffer); if (avail <= 0) { avail = 0; return true; } pos = 0; } return false; } @Override public int read() throws IOException { if (noMoreCharacters()) return -1; avail--; return buffer[pos++]; } @Override public int read(final char[] b, int offset, int length) throws IOException { CharArrays.ensureOffsetLength(b, offset, length); if (length <= avail) { System.arraycopy(buffer, pos, b, offset, length); pos += length; avail -= length; return length; } final int head = avail; System.arraycopy(buffer, pos, b, offset, head); offset += head; length -= head; avail = 0; int result; result = reader.read(b, offset, length); return result < 0 ? (head != 0 ? head : -1) : result + head; } /** Reads a line into the given mutable string. * *

The next line of input (defined as in {@link java.io.BufferedReader#readLine()}) * will be stored into s. Note that if s is * not {@linkplain it.unimi.dsi.lang.MutableString loose} * this method will be quite inefficient. * * @param s a mutable string that will be used to store the next line (which could be empty). * @return s, or {@code null} if the end of file was found, in which * case s is unchanged. */ public MutableString readLine(final MutableString s) throws IOException { char c = 0; int i; if (noMoreCharacters()) return null; s.length(0); for(;;) { for(i = 0; i < avail && (c = buffer[pos + i]) != '\n' && c != '\r' ; i++); s.append(buffer, pos, i); pos += i; avail -= i; if (avail > 0) { if (c == '\n') { // LF only. pos++; avail--; } else { // c == '\r' pos++; avail--; if (avail > 0) { if (buffer[pos] == '\n') { // CR/LF with LF already in the buffer. pos ++; avail--; } } else { // We must search for the LF. if (noMoreCharacters()) return s; if (buffer[0] == '\n') { pos++; avail--; } } } return s; } else if (noMoreCharacters()) return s; } } /** Returns whether the given character is a word constituent. * *

The behaviour of this {@link FastBufferedReader} as a {@link WordReader} can * be radically changed by overwriting this method. * * @param c a character. * @return whether c should be considered a word constituent. */ protected boolean isWordConstituent(final char c) { return Character.isLetterOrDigit(c) || wordConstituents.contains(c); } @Override public boolean next(final MutableString word, final MutableString nonWord) throws IOException { int i; final char buffer[] = this.buffer; if (noMoreCharacters()) return false; word.length(0); nonWord.length(0); for(;;) { for(i = 0; i < avail && isWordConstituent(buffer[pos + i]); i++); word.append(buffer, pos, i); pos += i; avail -= i; if (avail > 0 || noMoreCharacters()) break; } if (noMoreCharacters()) return true; for(;;) { for(i = 0; i < avail && ! isWordConstituent(buffer[pos + i]); i++); nonWord.append(buffer, pos, i); pos += i; avail -= i; if (avail > 0 || noMoreCharacters()) return true; } } @Override public FastBufferedReader setReader(final Reader reader) { this.reader = reader; avail = 0; return this; } @Override public long skip(long n) throws IOException { if (n <= avail) { pos += ((int)n); avail -= ((int)n); return n; } final int head = avail; n -= head; avail = 0; return reader.skip(n) + head; } @Override public void close() throws IOException { if (reader == null) return; reader.close(); reader = null; buffer = null; } private void readObject(final ObjectInputStream s) throws IOException, ClassNotFoundException { s.defaultReadObject(); buffer = new char[bufferSize]; } public String toSpec() { return toString(); } @Override public String toString() { final String className = getClass().getName(); if (bufferSize == DEFAULT_BUFFER_SIZE && wordConstituents.isEmpty()) return className; if (wordConstituents.isEmpty()) return className + "(" + bufferSize + ")"; String wordConstituents = new String(this.wordConstituents.toCharArray()); if (bufferSize == DEFAULT_BUFFER_SIZE) return className + "(\"" + 
wordConstituents + "\")"; return className + "(" + bufferSize + ",\"" + wordConstituents + "\")"; } }





// © 2015 - 2024 Weber Informatics LLC | Privacy Policy