All Downloads are FREE. Search and download functionalities are using the official Maven repository.

src.it.unimi.dsi.io.DelimitedWordReader Maven / Gradle / Ivy

Go to download

The DSI utilities are a mishmash of classes accumulated during the last twenty years in projects developed at the DSI (Dipartimento di Scienze dell'Informazione, i.e., Information Sciences Department), now DI (Dipartimento di Informatica, i.e., Informatics Department), of the Università degli Studi di Milano.

There is a newer version: 2.7.3
Show newest version
package it.unimi.dsi.io;

/*
 * DSI utilities
 *
 * Copyright (C) 2009-2019 Sebastiano Vigna
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 3 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 */


import it.unimi.dsi.fastutil.Hash;
import it.unimi.dsi.fastutil.chars.CharOpenHashSet;
import it.unimi.dsi.fastutil.chars.CharSet;
import it.unimi.dsi.fastutil.chars.CharSets;
import it.unimi.dsi.lang.MutableString;

import java.io.Reader;

/** A word reader that breaks words on a given set of characters.
 *
 * 

This class is a simple subclass of {@link FastBufferedReader}. It * overwrites {@link #isWordConstituent(char)} so that word constituents * are defined negatively by a set of delimiters defined at construction time. * There is a {@link DelimitedWordReader#DelimitedWordReader(String) constructor * accepting the delimiter set as a string}. Note that LF and CR are always considered to be delimiters. * */ public class DelimitedWordReader extends FastBufferedReader { private static final long serialVersionUID = 1L; /** The set of delimiters used to break the character stream into words. */ private final CharOpenHashSet delimiters; @Override protected boolean isWordConstituent(char c) { return ! delimiters.contains(c); } private void addCrLf() { delimiters.add('\n'); delimiters.add('\r'); } /** Creates a new delimited word reader with a given buffer size and set of delimiters. * The wrapped reader will have to be set later using {@link #setReader(Reader)}. * * @param bufferSize the size in bytes of the internal buffer. * @param delimiters a set of characters that will be considered word delimiters. */ public DelimitedWordReader(int bufferSize, CharSet delimiters) { super(bufferSize); this.delimiters = new CharOpenHashSet(delimiters, Hash.VERY_FAST_LOAD_FACTOR); addCrLf(); } /** Creates a new delimited word reader with a buffer of {@link #DEFAULT_BUFFER_SIZE} characters. * The wrapped reader will have to be set later using {@link #setReader(Reader)}. * @param delimiters a set of characters that will be considered word delimiters. */ public DelimitedWordReader(CharSet delimiters) { this.delimiters = new CharOpenHashSet(delimiters, Hash.VERY_FAST_LOAD_FACTOR); addCrLf(); } /** Creates a new delimited word reader with a buffer of {@link #DEFAULT_BUFFER_SIZE} characters. * The wrapped reader will have to be set later using {@link #setReader(Reader)}. * *

Warning: it is easy to mistake this method for one whose semantics is * the same as {@link FastBufferedReader#FastBufferedReader(MutableString)}, that is, wrapping the argument * string in a reader. * * @param delimiters a set of characters that will be considered word delimiters, specified as a string. */ public DelimitedWordReader(String delimiters) { this(new CharOpenHashSet(delimiters.toCharArray())); } /** Creates a new delimited word reader with a given buffer size and set of delimiters. * The wrapped reader will have to be set later using {@link #setReader(Reader)}. * * @param bufferSize the size in bytes of the internal buffer, specified as a string. * @param delimiters a set of characters that will be considered word delimiters, specified as a string. */ public DelimitedWordReader(String bufferSize, String delimiters) { this(Integer.parseInt(bufferSize), new CharOpenHashSet(delimiters.toCharArray())); } /** Creates a new delimited word reader by wrapping a given reader with a given buffer size and using a set of delimiters. * * @param r a reader to wrap. * @param bufferSize the size in bytes of the internal buffer. * @param delimiters a set of characters that will be considered word delimiters. */ public DelimitedWordReader(Reader r, int bufferSize, CharSet delimiters) { super(r, bufferSize); this.delimiters = new CharOpenHashSet(delimiters, Hash.VERY_FAST_LOAD_FACTOR); addCrLf(); } /** Creates a new delimited word reader by wrapping a given reader with a buffer of {@link #DEFAULT_BUFFER_SIZE} characters using a given set of delimiters. * * @param r a reader to wrap. * @param delimiters a set of characters that will be considered word delimiters. */ public DelimitedWordReader(Reader r, CharSet delimiters) { super(r); this.delimiters = new CharOpenHashSet(delimiters, Hash.VERY_FAST_LOAD_FACTOR); addCrLf(); } /** Creates a new delimited word reader by wrapping a given fragment of a character array and using a set delimiters. * *

The effect of {@link #setReader(Reader)} on a buffer created with * this constructor is undefined. * * @param array the array that will be wrapped by the reader. * @param offset the first character to be used. * @param length the number of character to be used. * @param delimiters a set of characters that will be considered word delimiters. */ public DelimitedWordReader(char[] array, int offset, int length, CharSet delimiters) { super(array, offset, length); this.delimiters = new CharOpenHashSet(delimiters, Hash.VERY_FAST_LOAD_FACTOR); addCrLf(); } /** Creates a new delimited word reader by wrapping a given character array and using a set delimiters. * *

The effect of {@link #setReader(Reader)} on a buffer created with * this constructor is undefined. * * @param array the array that will be wrapped by the reader. * @param delimiters a set of characters that will be considered word delimiters. */ public DelimitedWordReader(char[] array, CharSet delimiters) { super(array); this.delimiters = new CharOpenHashSet(delimiters, Hash.VERY_FAST_LOAD_FACTOR); addCrLf(); } /** Creates a new delimited word reader by wrapping a given mutable string and using a set of delimiters. * *

The effect of {@link #setReader(Reader)} on a buffer created with * this constructor is undefined. * * @param s the mutable string that will be wrapped by the reader. * @param delimiters a set of characters that will be considered word delimiters. */ public DelimitedWordReader(MutableString s, CharSet delimiters) { super(s, CharSets.EMPTY_SET); this.delimiters = new CharOpenHashSet(delimiters, Hash.VERY_FAST_LOAD_FACTOR); addCrLf(); } @Override public DelimitedWordReader copy() { // TODO: improve this by sharing the backing set. return new DelimitedWordReader(bufferSize, delimiters); } @Override public String toSpec() { return toString(); } @Override public String toString() { final String className = getClass().getName(); CharOpenHashSet additionalDelimiters = delimiters.clone(); additionalDelimiters.remove('\n'); additionalDelimiters.remove('\r'); String delimiters = new String(additionalDelimiters.toCharArray()); if (bufferSize == DEFAULT_BUFFER_SIZE) return className + "(\"" + delimiters + "\")"; return className + "(" + bufferSize + ",\"" + delimiters + "\")"; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy