![JAR search and dependency download from the Maven repository](/logo.png)
src.it.unimi.dsi.big.mg4j.document.ConcatenatedDocumentCollection Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of mg4j-big Show documentation
Show all versions of mg4j-big Show documentation
MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections written in Java. The big version is a fork of the original MG4J that can handle more than 2^31 terms and documents.
The newest version!
package it.unimi.dsi.big.mg4j.document;
/*
* MG4J: Managing Gigabytes for Java (big)
*
* Copyright (C) 2009-2011 Sebastiano Vigna
*
* This library is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 3 of the License, or (at your option)
* any later version.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, see .
*
*/
import it.unimi.dsi.fastutil.objects.Reference2ObjectMap;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.Serializable;
import java.util.Arrays;
/** A document collection exhibiting a list of underlying document collections, called segments,
* as a single collection. The underlying collections are (virtually) concatenated—that is,
* the first document of the second collection is renumbered to the size of the first collection, and so on.
* All underlying collections must use the same {@linkplain DocumentFactory factory class}.
* @author Sebastiano Vigna
*
*/
public class ConcatenatedDocumentCollection extends AbstractDocumentCollection implements Serializable {
private static final long serialVersionUID = 1L;
/** The name of the collections composing this concatenated document collection. */
private final String[] collectionName;
/** The collections composing this concatenated document collection. */
private transient DocumentCollection[] collection;
/** The length of {@link #collection}. */
private final int n;
/** The array of starting documents (the last element is the overall number of documents). */
private long startDocument[];
/** Creates a new concatenated document collection using giving component collections.
*
* @param collection a list of component collections.
*/
protected ConcatenatedDocumentCollection( final String[] collectionName, final DocumentCollection[] collection ) {
if ( collection.length != collectionName.length ) throw new IllegalArgumentException();
this.collectionName = collectionName;
this.collection = collection;
this.n = collection.length;
}
private void initCollections( final CharSequence filename, boolean rethrow ) throws IllegalArgumentException, SecurityException, IOException, ClassNotFoundException {
try {
collection = new DocumentCollection[ n ];
File parent = filename != null ? new File( filename.toString() ).getParentFile() : null;
for( int i = n; i-- != 0; ) collection[ i ] = (DocumentCollection)AbstractDocumentSequence.load( new File( parent, collectionName[ i ] ).toString() );
if ( n > 0 ) {
Class extends DocumentFactory> factoryClass = collection[ 0 ].factory().getClass();
// TODO: this is crude. We should have a contract for equality of factories, and use equals().
for( int i = 0; i < n; i++ ) if ( collection[ i ].factory().getClass() != factoryClass ) throw new IllegalArgumentException( "All segment in a concatenated document collection must used the same factory class" );
}
startDocument = new long[ n + 1 ];
for( int i = 0; i < n; i++ ) startDocument[ i + 1 ] = startDocument[ i ] + collection[ i ].size();
}
catch( IOException e ) {
if ( rethrow ) throw e;
}
}
private void ensureCollections() {
if ( collection == null ) {
filename( null ); // We need collection, but no one called filename(), so we assume a relative path.
if ( collection == null ) throw new IllegalStateException( "The collections composing this " + ConcatenatedDocumentCollection.class.getName() + " have not been loaded correctly; please use " + AbstractDocumentSequence.class.getSimpleName() + ".load() or call filename() after deserialising this instance, and ensure that the names stored are correct " );
}
}
/** Creates a new, partially uninitialised concatenated document collection using giving component collections names.
*
* @param collectionName a list of names of component collections.
*/
public ConcatenatedDocumentCollection( String... collectionName ) throws IllegalArgumentException, SecurityException {
this.collectionName = collectionName;
n = collectionName.length;
}
public void filename( CharSequence filename ) {
try {
initCollections( filename, true );
}
catch ( IllegalArgumentException e ) {
throw new RuntimeException( e );
}
catch ( SecurityException e ) {
throw new RuntimeException( e );
}
catch ( IOException e ) {
throw new RuntimeException( e );
}
catch ( ClassNotFoundException e ) {
throw new RuntimeException( e );
}
}
public DocumentCollection copy() {
final DocumentCollection[] collection = new DocumentCollection[ n ];
for( int i = n; i-- != 0; ) collection[ i ] = this.collection[ i ].copy();
return new ConcatenatedDocumentCollection( collectionName, collection );
}
public Document document( final long index ) throws IOException {
ensureDocumentIndex( index );
ensureCollections();
int segment = Arrays.binarySearch( startDocument, index );
if ( segment < 0 ) segment = -segment - 2;
return collection[ segment ].document( (int)( index - startDocument[ segment ] ) );
}
public Reference2ObjectMap,Object> metadata( long index ) throws IOException {
ensureDocumentIndex( index );
ensureCollections();
int segment = Arrays.binarySearch( startDocument, index );
if ( segment < 0 ) segment = -segment - 2;
return collection[ segment ].metadata( (int)( index - startDocument[ segment ] ) );
}
public long size() {
return startDocument[ n ];
}
public InputStream stream( final long index ) throws IOException {
ensureDocumentIndex( index );
ensureCollections();
int segment = Arrays.binarySearch( startDocument, index );
if ( segment < 0 ) segment = -segment - 2;
return collection[ segment ].stream( (int)( index - startDocument[ segment ] ) );
}
public DocumentFactory factory() {
ensureCollections();
return collection[ 0 ].factory();
}
private void readObject( final ObjectInputStream s ) throws IOException, ClassNotFoundException {
s.defaultReadObject();
initCollections( null, false );
}
public void close() throws IOException {
super.close();
for( DocumentCollection c: collection ) c.close();
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy