![JAR search and dependency download from the Maven repository](/logo.png)
src.it.unimi.dsi.big.mg4j.document.SubsetDocumentSequence Maven / Gradle / Ivy
Show all versions of mg4j-big Show documentation
package it.unimi.dsi.big.mg4j.document;
/*
* MG4J: Managing Gigabytes for Java
*
* Copyright (C) 2009-2011 Sebastiano Vigna
*
* This library is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 3 of the License, or (at your option)
* any later version.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*
*/
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.longs.LongSet;
import it.unimi.dsi.lang.ObjectParser;
import java.io.IOException;
import java.io.Serializable;
/** A document sequence that exhibits a subset of documents (possibly not contiguous) from a given sequence.
 *
 * <p>Documents of the underlying sequence are numbered by their position in iteration
 * order, starting from zero; only those whose position belongs to the provided
 * set are returned by {@link #iterator()}, in the same relative order.
 *
 * <p>This class provides a string-based constructor that uses the {@link ObjectParser}
 * conventions; it can be used to easily generate subsequences from the command line.
 *
 * @author Paolo Boldi
 *
 */
public class SubsetDocumentSequence extends AbstractDocumentSequence implements Serializable {
	private static final long serialVersionUID = 1L;

	/** The underlying document sequence. */
	final DocumentSequence underlyingSequence;
	/** The set of document pointers to be retained. */
	final LongSet documents;

	/** Creates a new subsequence.
	 *
	 * @param underlyingSequence the underlying document sequence.
	 * @param documents the set of pointers (zero-based positions in the underlying sequence)
	 * of the documents to be retained in the subsequence.
	 */
	public SubsetDocumentSequence( DocumentSequence underlyingSequence, LongSet documents ) {
		this.underlyingSequence = underlyingSequence;
		this.documents = documents;
	}

	/** Creates a new subsequence.
	 *
	 * <p>NOTE(review): the document-pointer file is read with Java serialization
	 * ({@link BinIO#loadObject(CharSequence)}), so it should come from a trusted source only.
	 *
	 * @param underlyingSequenceBasename the basename of the underlying document sequence.
	 * @param documentFileBasename the basename of a file containing a serialized version of the set of document
	 * pointers to be retained.
	 * @throws IOException if either file cannot be read.
	 * @throws ClassNotFoundException if a class of a serialized object cannot be found.
	 */
	public SubsetDocumentSequence( String underlyingSequenceBasename, String documentFileBasename ) throws NumberFormatException, IllegalArgumentException, SecurityException, IOException, ClassNotFoundException {
		// Only a DocumentSequence is needed here: casting to DocumentCollection would
		// needlessly reject underlying sequences that are not full collections.
		this( (DocumentSequence)AbstractDocumentSequence.load( underlyingSequenceBasename ),
				(LongSet)BinIO.loadObject( documentFileBasename ) );
	}

	/** Returns an iterator over the retained documents.
	 *
	 * <p>The returned iterator scans the underlying sequence sequentially, skipping
	 * every document whose position is not in the set of retained pointers.
	 * Closing the returned iterator also closes the underlying iterator.
	 *
	 * @return an iterator returning only the documents of this subsequence.
	 * @throws IOException if the underlying sequence cannot provide an iterator.
	 */
	@Override
	public DocumentIterator iterator() throws IOException {
		final DocumentIterator underlyingIterator = underlyingSequence.iterator();
		return new AbstractDocumentIterator() {
			/** The position, in the underlying sequence, of the last document fetched (-1 before the first fetch). */
			long docPointer = -1;
			/** Whether the underlying iterator has been exhausted. */
			boolean over = false;
			/** Whether {@link #close()} has already been invoked. */
			boolean closed = false;

			@Override
			public Document nextDocument() throws IOException {
				Document doc;
				if ( over ) return null;
				// Advance on the underlying iterator until a retained document (or the end) is found.
				do {
					doc = underlyingIterator.nextDocument();
					docPointer++;
				} while ( doc != null && !documents.contains( docPointer ) );
				over = doc == null;
				return doc;
			}

			@Override
			public void close() throws IOException {
				if ( closed ) return;
				closed = true;
				try {
					underlyingIterator.close();
				}
				finally {
					// Always complete our own shutdown, even if the underlying iterator fails to close.
					super.close();
				}
			}
		};
	}

	/** Returns the factory of the underlying sequence.
	 *
	 * @return the factory of the underlying sequence.
	 */
	@Override
	public DocumentFactory factory() {
		return underlyingSequence.factory();
	}

	/** Closes the underlying sequence and then this sequence.
	 *
	 * @throws IOException if closing the underlying sequence (or this one) fails.
	 */
	@Override
	public void close() throws IOException {
		try {
			underlyingSequence.close();
		}
		finally {
			// Run the superclass cleanup even when the underlying sequence throws on close.
			super.close();
		}
	}
}