
// src.it.unimi.dsi.big.io.FileLinesCollection Maven / Gradle / Ivy
// Show all versions of dsiutils Show documentation
package it.unimi.dsi.big.io;
/*
* DSI utilities
*
* Copyright (C) 2005-2016 Sebastiano Vigna
*
* This library is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 3 of the License, or (at your option)
* any later version.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*
*/
import it.unimi.dsi.fastutil.Size64;
import it.unimi.dsi.fastutil.objects.ObjectBigArrayBigList;
import it.unimi.dsi.fastutil.objects.ObjectBigList;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.SafelyCloseable;
import it.unimi.dsi.lang.MutableString;
import java.io.Closeable;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.AbstractCollection;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.zip.GZIPInputStream;
/** A wrapper exhibiting the lines of a file as a {@link java.util.Collection}.
*
* Warning: the lines returned by iterators generated by
* instances of this class are not cacheable. The returned value is
* a {@link it.unimi.dsi.lang.MutableString} instance that is reused
* at each call, and that is modified by a call to {@link java.util.Iterator#hasNext() hasNext()}.
* Thus, for instance,
*
* ObjectIterators.unwrap( fileLinesColletion.iterator() );
*
* will not give the expected results. Use {@link #allLines()} to get
* the {@linkplain ObjectBigList big list} of all lines (again, under the form of compact {@link it.unimi.dsi.lang.MutableString}s).
* Note also that {@link #toString()} will return a single string containing all
* file lines separated by the string associated with the system property line.separator
.
*
* An instance of this class allows to access the lines of a file as a
* {@link java.util.Collection}. Using {@linkplain java.util.Collection#contains(java.lang.Object)
* direct access} is strongly discouraged (it will require a full scan of the file), but
* the {@link #iterator()} can be fruitfully used to scan the file, and can be called any
* number of times, as it opens an independent input stream at each call. For the
* same reason, the returned iterator type ({@link it.unimi.dsi.io.FileLinesCollection.FileLinesIterator})
* is {@link java.io.Closeable}, and should be closed after usage.
*
*
Using a suitable {@linkplain #FileLinesCollection(CharSequence, String, boolean) constructor}, it is possible
* to specify that the file is compressed in gzip
format (in this case, it will be opened using a {@link GZIPInputStream}).
*
*
Note that the first call to {@link #size64()} will require a full file scan.
*
* @author Sebastiano Vigna
* @since 2.0
*/
public class FileLinesCollection extends AbstractCollection implements Size64 {
/** The filename upon which this file-lines collection is based. */
private final String filename;
/** The encoding of {@link #filename}, or {@code null} for the standard platform encoding. */
private final String encoding;
/** The cached size of the collection. */
private long size = -1;
/** Whether {@link #filename} is zipped. */
private final boolean zipped;
/** Creates a file-lines collection for the specified filename with the specified encoding.
*
* @param filename a filename.
* @param encoding an encoding.
*/
public FileLinesCollection( final CharSequence filename, final String encoding ) {
this( filename, encoding, false );
}
/** Creates a file-lines collection for the specified filename with the specified encoding, optionally assuming
* that the file is compressed using gzip
format.
*
* @param filename a filename.
* @param encoding an encoding.
* @param zipped whether filename
is zipped.
*/
public FileLinesCollection( final CharSequence filename, final String encoding, final boolean zipped ) {
this.zipped = zipped;
this.filename = filename.toString();
this.encoding = encoding;
}
/** An iterator over the lines of a {@link FileLinesCollection}.
*
* Instances of this class open an {@link java.io.InputStream}, and thus should be {@linkplain Closeable#close() closed} after
* usage. A “safety-net” finaliser tries to take care of the cases in which
* closing an instance is impossible. An exhausted iterator, however, will be closed automagically.
*/
public static final class FileLinesIterator implements Iterator, SafelyCloseable {
private FastBufferedReader fbr;
MutableString s = new MutableString(), next;
boolean toAdvance = true;
private FileLinesIterator( final String filename, final String encoding, final boolean zipped ) {
try {
fbr = encoding != null
? new FastBufferedReader( new InputStreamReader( zipped ? new GZIPInputStream( new FileInputStream( filename ) ) : new FileInputStream( filename ), encoding ) )
: new FastBufferedReader( new FileReader( filename ) );
} catch (IOException e) {
throw new RuntimeException( e );
}
}
public boolean hasNext() {
if ( toAdvance ) {
try {
next = fbr.readLine( s );
if ( next == null ) close();
} catch (IOException e) {
throw new RuntimeException( e );
}
toAdvance = false;
}
return next != null;
}
public MutableString next() {
if ( ! hasNext() ) throw new NoSuchElementException();
toAdvance = true;
return s;
}
public void remove() {
throw new UnsupportedOperationException();
}
public synchronized void close() {
if ( fbr == null ) return;
try {
fbr.close();
}
catch ( IOException e ) {
throw new RuntimeException( e );
}
finally {
fbr = null;
}
}
protected synchronized void finalize() throws Throwable {
try {
if ( fbr != null ) close();
}
finally {
super.finalize();
}
}
}
public FileLinesIterator iterator() {
return new FileLinesIterator( filename, encoding, zipped );
}
public synchronized int size() {
return (int)Math.min( Integer.MAX_VALUE, size );
}
public synchronized long size64() {
if ( size == -1 ) {
FileLinesIterator i = iterator();
size = 0;
while( i.hasNext() ) {
size++;
i.next();
}
i.close();
}
return size;
}
/** Returns all lines of the file wrapped by this file-lines collection.
*
* @return all lines of the file wrapped by this file-lines collection.
*/
public ObjectBigList allLines() {
final ObjectBigList result = new ObjectBigArrayBigList();
for( Iterator i = iterator(); i.hasNext(); ) result.add( i.next().copy() );
return result;
}
@Override
@Deprecated
public Object[] toArray() {
throw new UnsupportedOperationException( "Use allLines()" );
}
@Override
@Deprecated
public T[] toArray( T[] a ) {
throw new UnsupportedOperationException( "Use allLines()" );
}
public String toString() {
final MutableString separator = new MutableString( System.getProperty( "line.separator" ) );
final MutableString s = new MutableString();
for( MutableString l: this ) s.append( l ).append( separator );
return s.toString();
}
}