// src.it.unimi.dsi.big.io.FileLinesCollection Maven / Gradle / Ivy
// Show all versions of dsiutils Show documentation
package it.unimi.dsi.big.io;
import java.io.Closeable;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.AbstractCollection;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.zip.GZIPInputStream;
/*
* DSI utilities
*
* Copyright (C) 2005-2020 Sebastiano Vigna
*
* This library is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 3 of the License, or (at your option)
* any later version.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*
*/
import it.unimi.dsi.fastutil.Size64;
import it.unimi.dsi.fastutil.objects.ObjectBigArrayBigList;
import it.unimi.dsi.fastutil.objects.ObjectBigList;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.SafelyCloseable;
import it.unimi.dsi.lang.MutableString;
/** A wrapper exhibiting the lines of a file as a {@link java.util.Collection}.
*
* <p><strong>Warning</strong>: the lines returned by iterators generated by
* instances of this class <em>are not cacheable</em>. The returned value is
* a {@link it.unimi.dsi.lang.MutableString} instance that is reused
* at each call, and that is <em>modified by a call to {@link java.util.Iterator#hasNext() hasNext()}</em>.
* Thus, for instance,
* <pre>
* ObjectIterators.unwrap(fileLinesCollection.iterator());
* </pre>
*
* will not give the expected results. Use {@link #allLines()} to get
* the {@linkplain ObjectBigList big list} of all lines (again, under the form of compact {@link it.unimi.dsi.lang.MutableString}s).
* Note also that {@link #toString()} will return a single string containing all
* file lines, each followed by the string associated with the system property {@code line.separator}.
*
* <p>An instance of this class allows accessing the lines of a file as a
* {@link java.util.Collection}. Using {@linkplain java.util.Collection#contains(java.lang.Object)
* direct access} is strongly discouraged (it will require a full scan of the file), but
* the {@link #iterator()} can be fruitfully used to scan the file, and can be called any
* number of times, as it opens an independent input stream at each call. For the
* same reason, the returned iterator type ({@link FileLinesCollection.FileLinesIterator})
* is {@link java.io.Closeable}, and should be closed after usage.
*
*
* <p>Using a suitable {@linkplain #FileLinesCollection(CharSequence, String, boolean) constructor}, it is possible
* to specify that the file is compressed in {@code gzip} format (in this case, it will be opened using a {@link GZIPInputStream}).
*
*
* <p>Note that the first call to {@link #size64()} will require a full file scan.
*
* @author Sebastiano Vigna
* @since 2.0
*/
public class FileLinesCollection extends AbstractCollection implements Size64 {
/** The filename upon which this file-lines collection is based. */
private final String filename;
/** The encoding of {@link #filename}, or {@code null} for the standard platform encoding. */
private final String encoding;
/** The cached size of the collection. */
private long size = -1;
/** Whether {@link #filename} is zipped. */
private final boolean zipped;
/** Creates a file-lines collection for the specified filename with the specified encoding.
*
* @param filename a filename.
* @param encoding an encoding.
*/
public FileLinesCollection(final CharSequence filename, final String encoding) {
this(filename, encoding, false);
}
/** Creates a file-lines collection for the specified filename with the specified encoding, optionally assuming
* that the file is compressed using gzip
format.
*
* @param filename a filename.
* @param encoding an encoding.
* @param zipped whether filename
is zipped.
*/
public FileLinesCollection(final CharSequence filename, final String encoding, final boolean zipped) {
this.zipped = zipped;
this.filename = filename.toString();
this.encoding = encoding;
}
/** An iterator over the lines of a {@link FileLinesCollection}.
*
* Instances of this class open an {@link java.io.InputStream}, and thus should be {@linkplain Closeable#close() closed} after
* usage. A “safety-net” finaliser tries to take care of the cases in which
* closing an instance is impossible. An exhausted iterator, however, will be closed automagically.
*/
public static final class FileLinesIterator implements Iterator, SafelyCloseable {
private FastBufferedReader fbr;
MutableString s = new MutableString(), next;
boolean toAdvance = true;
private FileLinesIterator(final String filename, final String encoding, final boolean zipped) {
try {
fbr = encoding != null
? new FastBufferedReader(new InputStreamReader(zipped ? new GZIPInputStream(new FileInputStream(filename)) : new FileInputStream(filename), encoding))
: new FastBufferedReader(new FileReader(filename));
} catch (final IOException e) {
throw new RuntimeException(e);
}
}
@Override
public boolean hasNext() {
if (toAdvance) {
try {
next = fbr.readLine(s);
if (next == null) close();
} catch (final IOException e) {
throw new RuntimeException(e);
}
toAdvance = false;
}
return next != null;
}
@Override
public MutableString next() {
if (! hasNext()) throw new NoSuchElementException();
toAdvance = true;
return s;
}
@Override
public synchronized void close() {
if (fbr == null) return;
try {
fbr.close();
}
catch (final IOException e) {
throw new RuntimeException(e);
}
finally {
fbr = null;
}
}
@SuppressWarnings("deprecation")
@Override
protected synchronized void finalize() throws Throwable {
try {
if (fbr != null) close();
}
finally {
super.finalize();
}
}
}
@Override
public FileLinesIterator iterator() {
return new FileLinesIterator(filename, encoding, zipped);
}
@Override
@Deprecated
public synchronized int size() {
return (int)Math.min(Integer.MAX_VALUE, size);
}
@Override
public synchronized long size64() {
if (size == -1) {
final FileLinesIterator i = iterator();
size = 0;
while(i.hasNext()) {
size++;
i.next();
}
i.close();
}
return size;
}
/** Returns all lines of the file wrapped by this file-lines collection.
*
* @return all lines of the file wrapped by this file-lines collection.
*/
public ObjectBigList allLines() {
final ObjectBigList result = new ObjectBigArrayBigList<>();
for(final Iterator i = iterator(); i.hasNext();) result.add(i.next().copy());
return result;
}
@Override
@Deprecated
public Object[] toArray() {
throw new UnsupportedOperationException("Use allLines()");
}
@Override
@Deprecated
public T[] toArray(final T[] a) {
throw new UnsupportedOperationException("Use allLines()");
}
@Override
public String toString() {
final MutableString separator = new MutableString(System.getProperty("line.separator"));
final MutableString s = new MutableString();
for(final MutableString l: this) s.append(l).append(separator);
return s.toString();
}
}