// src.it.unimi.dsi.io.DelimitedWordReader Maven / Gradle / Ivy
package it.unimi.dsi.io;
/*
* DSI utilities
*
* Copyright (C) 2009-2017 Sebastiano Vigna
*
* This library is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 3 of the License, or (at your option)
* any later version.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*
*/
import it.unimi.dsi.fastutil.Hash;
import it.unimi.dsi.fastutil.chars.CharOpenHashSet;
import it.unimi.dsi.fastutil.chars.CharSet;
import it.unimi.dsi.fastutil.chars.CharSets;
import it.unimi.dsi.lang.MutableString;
import java.io.Reader;
import java.util.Arrays;
/** A word reader that breaks words on a given set of characters.
 *
 * <p>This class is a simple subclass of {@link FastBufferedReader}. It
 * overrides {@link #isWordConstituent(char)} so that word constituents
 * are defined negatively by a set of delimiters defined at construction time.
 * There is a {@link DelimitedWordReader#DelimitedWordReader(String) constructor
 * accepting the delimiter set as a string}. Note that LF and CR are always considered to be delimiters.
 *
 * <p>The delimiter set passed to a constructor is copied; the internal copy is
 * never modified after construction.
 */
public class DelimitedWordReader extends FastBufferedReader {
    private static final long serialVersionUID = 1L;

    /** The set of delimiters used to break the character stream into words (always includes LF and CR). */
    private final CharOpenHashSet delimiters;

    /** Returns a fresh copy of the given delimiters to which LF and CR have been added.
     *
     * <p>Every constructor funnels through this method, so the invariant
     * &ldquo;LF and CR are always delimiters&rdquo; is enforced in a single place.
     *
     * @param delimiters the delimiters requested by the caller.
     * @return a new set containing {@code delimiters}, {@code '\n'} and {@code '\r'}.
     */
    private static CharOpenHashSet withLineTerminators(CharSet delimiters) {
        final CharOpenHashSet result = new CharOpenHashSet(delimiters, Hash.VERY_FAST_LOAD_FACTOR);
        result.add('\n');
        result.add('\r');
        return result;
    }

    /** Returns whether a character is a word constituent, that is, whether it is not a delimiter.
     *
     * @param c a character.
     * @return true if {@code c} does not belong to the delimiter set.
     */
    @Override
    protected boolean isWordConstituent(char c) {
        return ! delimiters.contains(c);
    }

    /** Creates a new delimited word reader with a given buffer size and set of delimiters.
     * The wrapped reader will have to be set later using {@link #setReader(Reader)}.
     *
     * @param bufferSize the size of the internal buffer, in characters.
     * @param delimiters a set of characters that will be considered word delimiters.
     */
    public DelimitedWordReader(int bufferSize, CharSet delimiters) {
        super(bufferSize);
        this.delimiters = withLineTerminators(delimiters);
    }

    /** Creates a new delimited word reader with a buffer of {@link #DEFAULT_BUFFER_SIZE} characters.
     * The wrapped reader will have to be set later using {@link #setReader(Reader)}.
     *
     * @param delimiters a set of characters that will be considered word delimiters.
     */
    public DelimitedWordReader(CharSet delimiters) {
        this.delimiters = withLineTerminators(delimiters);
    }

    /** Creates a new delimited word reader with a buffer of {@link #DEFAULT_BUFFER_SIZE} characters.
     * The wrapped reader will have to be set later using {@link #setReader(Reader)}.
     *
     * <p><strong>Warning</strong>: it is easy to mistake this method for one whose semantics is
     * the same as {@link FastBufferedReader#FastBufferedReader(MutableString)}, that is, wrapping the argument
     * string in a reader.
     *
     * @param delimiters a set of characters that will be considered word delimiters, specified as a string.
     */
    public DelimitedWordReader(String delimiters) {
        this(new CharOpenHashSet(delimiters.toCharArray()));
    }

    /** Creates a new delimited word reader with a given buffer size and set of delimiters.
     * The wrapped reader will have to be set later using {@link #setReader(Reader)}.
     *
     * @param bufferSize the size of the internal buffer, in characters, specified as a string
     * (it is parsed with {@link Integer#parseInt(String)}).
     * @param delimiters a set of characters that will be considered word delimiters, specified as a string.
     * @throws NumberFormatException if {@code bufferSize} is not a parsable integer.
     */
    public DelimitedWordReader(String bufferSize, String delimiters) {
        this(Integer.parseInt(bufferSize), new CharOpenHashSet(delimiters.toCharArray()));
    }

    /** Creates a new delimited word reader by wrapping a given reader with a given buffer size and using a set of delimiters.
     *
     * @param r a reader to wrap.
     * @param bufferSize the size of the internal buffer, in characters.
     * @param delimiters a set of characters that will be considered word delimiters.
     */
    public DelimitedWordReader(Reader r, int bufferSize, CharSet delimiters) {
        super(r, bufferSize);
        this.delimiters = withLineTerminators(delimiters);
    }

    /** Creates a new delimited word reader by wrapping a given reader with a buffer of {@link #DEFAULT_BUFFER_SIZE} characters using a given set of delimiters.
     *
     * @param r a reader to wrap.
     * @param delimiters a set of characters that will be considered word delimiters.
     */
    public DelimitedWordReader(Reader r, CharSet delimiters) {
        super(r);
        this.delimiters = withLineTerminators(delimiters);
    }

    /** Creates a new delimited word reader by wrapping a given fragment of a character array and using a set of delimiters.
     *
     * <p>The effect of {@link #setReader(Reader)} on a buffer created with
     * this constructor is undefined.
     *
     * @param array the array that will be wrapped by the reader.
     * @param offset the first character to be used.
     * @param length the number of characters to be used.
     * @param delimiters a set of characters that will be considered word delimiters.
     */
    public DelimitedWordReader(char[] array, int offset, int length, CharSet delimiters) {
        super(array, offset, length);
        this.delimiters = withLineTerminators(delimiters);
    }

    /** Creates a new delimited word reader by wrapping a given character array and using a set of delimiters.
     *
     * <p>The effect of {@link #setReader(Reader)} on a buffer created with
     * this constructor is undefined.
     *
     * @param array the array that will be wrapped by the reader.
     * @param delimiters a set of characters that will be considered word delimiters.
     */
    public DelimitedWordReader(char[] array, CharSet delimiters) {
        super(array);
        this.delimiters = withLineTerminators(delimiters);
    }

    /** Creates a new delimited word reader by wrapping a given mutable string and using a set of delimiters.
     *
     * <p>The effect of {@link #setReader(Reader)} on a buffer created with
     * this constructor is undefined.
     *
     * @param s the mutable string that will be wrapped by the reader.
     * @param delimiters a set of characters that will be considered word delimiters.
     */
    public DelimitedWordReader(MutableString s, CharSet delimiters) {
        // The superclass set of word constituents is irrelevant here, because
        // isWordConstituent(char) is overridden in terms of the delimiter set.
        super(s, CharSets.EMPTY_SET);
        this.delimiters = withLineTerminators(delimiters);
    }

    /** Returns a new reader with the same buffer size and the same delimiters as this one.
     *
     * <p>The returned reader wraps no underlying reader: the copy is built with
     * {@link #DelimitedWordReader(int, CharSet)}.
     *
     * @return a new delimited word reader configured like this one.
     */
    @Override
    public DelimitedWordReader copy() {
        // TODO: improve this by sharing the backing set.
        return new DelimitedWordReader(bufferSize, delimiters);
    }

    /** Returns the specification of this reader, which is the same string returned by {@link #toString()}.
     *
     * @return the specification of this reader.
     */
    @Override
    public String toSpec() {
        return toString();
    }

    /** Returns a string of the form <code><var>class</var>("<var>delimiters</var>")</code> or
     * <code><var>class</var>(<var>bufferSize</var>,"<var>delimiters</var>")</code>, matching the
     * arguments of {@link #DelimitedWordReader(String)} and {@link #DelimitedWordReader(String, String)}.
     *
     * <p>LF and CR are omitted, as every constructor adds them. The remaining delimiters
     * are listed in increasing character order, so that two readers with the same delimiter
     * set and buffer size always return the same string, independently of the iteration
     * order of the underlying hash set.
     *
     * @return a string describing this reader.
     */
    @Override
    public String toString() {
        final String className = getClass().getName();

        // Work on a clone: the field must keep LF and CR.
        final CharOpenHashSet explicitDelimiters = delimiters.clone();
        explicitDelimiters.remove('\n');
        explicitDelimiters.remove('\r');

        // Sorting gives a canonical representation; any order denotes the same set.
        final char[] sortedDelimiters = explicitDelimiters.toCharArray();
        Arrays.sort(sortedDelimiters);

        // NOTE(review): delimiters are emitted verbatim between double quotes; a set containing
        // '"' or '\\' may not round-trip through a specification parser -- confirm against the parser.
        final String delimiterSpec = new String(sortedDelimiters);

        if (bufferSize == DEFAULT_BUFFER_SIZE) {
            return className + "(\"" + delimiterSpec + "\")";
        }
        return className + "(" + bufferSize + ",\"" + delimiterSpec + "\")";
    }
}