it.unimi.dsi.io.DelimitedWordReader Maven / Gradle / Ivy
Show all versions of dsi-utils Show documentation
package it.unimi.dsi.io;
/*
* DSI utilities
*
* Copyright (C) 2009 Sebastiano Vigna
*
* This library is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 2.1 of the License, or (at your option)
* any later version.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
*/
import it.unimi.dsi.fastutil.Hash;
import it.unimi.dsi.fastutil.chars.CharOpenHashSet;
import it.unimi.dsi.fastutil.chars.CharSet;
import it.unimi.dsi.fastutil.chars.CharSets;
import it.unimi.dsi.lang.MutableString;
import java.io.Reader;
/** A word reader that breaks words on a given set of characters.
*
* This class is a simple subclass of {@link FastBufferedReader}. It
* overrides {@link #isWordConstituent(char)} so that word constituents
* are defined negatively by a set of delimiters defined at construction time.
* There is a {@link DelimitedWordReader#DelimitedWordReader(String) constructor
* accepting the delimiter set as a string}. Note that LF and CR are always considered to be delimiters.
*
*/
public class DelimitedWordReader extends FastBufferedReader {
    private static final long serialVersionUID = 1L;

    /** The set of delimiters used to break the character stream into words; it always contains LF and CR. */
    private final CharOpenHashSet delimiters;

    /** Builds the internal delimiter set: a private copy of the given set to which LF and CR are added.
     *
     * <p>The caller's set is never modified, because the characters are copied into a fresh
     * {@link CharOpenHashSet} created with {@link Hash#VERY_FAST_LOAD_FACTOR}.
     *
     * @param delimiters the delimiters requested by the caller.
     * @return a new set containing <code>delimiters</code> plus <code>'\n'</code> and <code>'\r'</code>.
     */
    private static CharOpenHashSet withLineTerminators( final CharSet delimiters ) {
        final CharOpenHashSet result = new CharOpenHashSet( delimiters, Hash.VERY_FAST_LOAD_FACTOR );
        result.add( '\n' );
        result.add( '\r' );
        return result;
    }

    /** Returns whether a character belongs to a word: that is, whether it is <em>not</em> a delimiter.
     *
     * @param c a character.
     * @return true if <code>c</code> is not contained in the delimiter set.
     */
    @Override
    protected boolean isWordConstituent( final char c ) {
        return ! delimiters.contains( c );
    }

    /** Creates a new delimited word reader with a given buffer size and set of delimiters.
     * The wrapped reader will have to be set later using {@link #setReader(Reader)}.
     *
     * @param bufferSize the size of the internal buffer.
     * @param delimiters a set of characters that will be considered word delimiters.
     */
    public DelimitedWordReader( final int bufferSize, final CharSet delimiters ) {
        super( bufferSize );
        this.delimiters = withLineTerminators( delimiters );
    }

    /** Creates a new delimited word reader with a buffer of {@link #DEFAULT_BUFFER_SIZE} characters.
     * The wrapped reader will have to be set later using {@link #setReader(Reader)}.
     *
     * @param delimiters a set of characters that will be considered word delimiters.
     */
    public DelimitedWordReader( final CharSet delimiters ) {
        // Implicit super(): the superclass picks its default buffer size.
        this.delimiters = withLineTerminators( delimiters );
    }

    /** Creates a new delimited word reader with a buffer of {@link #DEFAULT_BUFFER_SIZE} characters.
     * The wrapped reader will have to be set later using {@link #setReader(Reader)}.
     *
     * @param delimiters a set of characters that will be considered word delimiters, specified as a string.
     */
    public DelimitedWordReader( final String delimiters ) {
        this( new CharOpenHashSet( delimiters.toCharArray() ) );
    }

    /** Creates a new delimited word reader with a given buffer size and set of delimiters.
     * The wrapped reader will have to be set later using {@link #setReader(Reader)}.
     *
     * @param bufferSize the size of the internal buffer, specified as a string (parsed with {@link Integer#parseInt(String)}).
     * @param delimiters a set of characters that will be considered word delimiters, specified as a string.
     * @throws NumberFormatException if <code>bufferSize</code> is not a parsable integer.
     */
    public DelimitedWordReader( final String bufferSize, final String delimiters ) {
        this( Integer.parseInt( bufferSize ), new CharOpenHashSet( delimiters.toCharArray() ) );
    }

    /** Creates a new delimited word reader by wrapping a given reader with a given buffer size and using a set of delimiters.
     *
     * @param r a reader to wrap.
     * @param bufferSize the size of the internal buffer.
     * @param delimiters a set of characters that will be considered word delimiters.
     */
    public DelimitedWordReader( final Reader r, final int bufferSize, final CharSet delimiters ) {
        super( r, bufferSize );
        this.delimiters = withLineTerminators( delimiters );
    }

    /** Creates a new delimited word reader by wrapping a given reader with a buffer of {@link #DEFAULT_BUFFER_SIZE} characters using a given set of delimiters.
     *
     * @param r a reader to wrap.
     * @param delimiters a set of characters that will be considered word delimiters.
     */
    public DelimitedWordReader( final Reader r, final CharSet delimiters ) {
        super( r );
        this.delimiters = withLineTerminators( delimiters );
    }

    /** Creates a new delimited word reader by wrapping a given fragment of a character array and using a set of delimiters.
     *
     * <p>The effect of {@link #setReader(Reader)} on a buffer created with
     * this constructor is undefined.
     *
     * @param array the array that will be wrapped by the reader.
     * @param offset the first character to be used.
     * @param length the number of characters to be used.
     * @param delimiters a set of characters that will be considered word delimiters.
     */
    public DelimitedWordReader( final char[] array, final int offset, final int length, final CharSet delimiters ) {
        super( array, offset, length );
        this.delimiters = withLineTerminators( delimiters );
    }

    /** Creates a new delimited word reader by wrapping a given character array and using a set of delimiters.
     *
     * <p>The effect of {@link #setReader(Reader)} on a buffer created with
     * this constructor is undefined.
     *
     * @param array the array that will be wrapped by the reader.
     * @param delimiters a set of characters that will be considered word delimiters.
     */
    public DelimitedWordReader( final char[] array, final CharSet delimiters ) {
        super( array );
        this.delimiters = withLineTerminators( delimiters );
    }

    /** Creates a new delimited word reader by wrapping a given mutable string and using a set of delimiters.
     *
     * <p>The effect of {@link #setReader(Reader)} on a buffer created with
     * this constructor is undefined.
     *
     * @param s the mutable string that will be wrapped by the reader.
     * @param delimiters a set of characters that will be considered word delimiters.
     */
    public DelimitedWordReader( final MutableString s, final CharSet delimiters ) {
        // The superclass gets an empty set; word constituency is decided by this class's delimiter set.
        super( s, CharSets.EMPTY_SET );
        this.delimiters = withLineTerminators( delimiters );
    }

    /** Returns a new, reader-less delimited word reader with the same buffer size and the same delimiters as this one.
     *
     * <p>NOTE(review): the superclass is not visible from this file. This method assumes that
     * {@link FastBufferedReader} (or an interface it implements) declares a <code>copy()</code>
     * method returning a supertype of this class; if so, this definition overrides it so that
     * copies keep the delimiter set, and it should be annotated with <code>@Override</code>
     * once that is confirmed.
     *
     * @return a new delimited word reader configured like this one.
     */
    public DelimitedWordReader copy() {
        // The constructor takes its own copy of the set, so the two readers share no mutable state.
        return new DelimitedWordReader( bufferSize, delimiters );
    }

    /** Returns a specification of this reader; it is the same string returned by {@link #toString()}.
     *
     * @return the result of {@link #toString()}.
     */
    public String toSpec() {
        return toString();
    }

    /** Returns a constructor-like textual description of this reader.
     *
     * <p>The result is the fully qualified class name followed by the constructor arguments in
     * parentheses: just the quoted delimiters when the buffer size is {@link #DEFAULT_BUFFER_SIZE},
     * or the buffer size and the quoted delimiters otherwise. LF and CR are omitted, as they are
     * always added at construction time. Delimiters are emitted verbatim (no escaping is performed),
     * in the iteration order of the underlying hash set.
     *
     * @return a string of the form <code>class("delims")</code> or <code>class(size,"delims")</code>.
     */
    @Override
    public String toString() {
        final String className = getClass().getName();
        // Work on a clone so that removing the implicit delimiters does not affect this reader.
        final CharOpenHashSet explicitDelimiters = (CharOpenHashSet)delimiters.clone();
        explicitDelimiters.remove( '\n' );
        explicitDelimiters.remove( '\r' );
        final String quotedDelimiters = "\"" + new String( explicitDelimiters.toCharArray() ) + "\"";
        if ( bufferSize == DEFAULT_BUFFER_SIZE ) return className + "(" + quotedDelimiters + ")";
        return className + "(" + bufferSize + "," + quotedDelimiters + ")";
    }
}