package it.unimi.dsi.big.mg4j.tool;
/*
* MG4J: Managing Gigabytes for Java (big)
*
* Copyright (C) 2005-2011 Sebastiano Vigna
*
* This library is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 3 of the License, or (at your option)
* any later version.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*
*/
import it.unimi.dsi.Util;
import it.unimi.dsi.fastutil.ints.AbstractIntComparator;
import it.unimi.dsi.fastutil.ints.IntBigArrays;
import it.unimi.dsi.fastutil.ints.IntHeapPriorityQueue;
import it.unimi.dsi.fastutil.ints.IntIterator;
import it.unimi.dsi.io.InputBitStream;
import it.unimi.dsi.io.OutputBitStream;
import it.unimi.dsi.big.mg4j.index.CachingOutputBitStream;
import it.unimi.dsi.big.mg4j.index.CompressionFlags.Coding;
import it.unimi.dsi.big.mg4j.index.CompressionFlags.Component;
import it.unimi.dsi.big.mg4j.index.Index;
import it.unimi.dsi.big.mg4j.index.IndexIterator;
import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.net.URISyntaxException;
import java.util.Map;
import org.apache.commons.configuration.ConfigurationException;
import org.apache.log4j.Logger;
import com.martiansoftware.jsap.JSAPException;
/** Pastes several indices.
*
* <p>Pasting is a very slow way of combining indices: we assume
* that not only documents, but also document occurrences, might be scattered
* throughout several indices. When a document appears in several indices,
* its occurrences in the various indices are combined. There are two possibilities:
*
* <ul>
*
* <li><em>standard pasting</em>: position lists are simply concatenated; it
* is the responsibility of the caller to guarantee that they have been numbered
* in an increasing fashion; the sizes of the last input index are the sizes of
* the pasted index;
*
* <li><em>incremental pasting</em>: position lists are concatenated, but each
* list is renumbered by adding to all positions the sum of the sizes of the
* current document in all indices that precede the current one (this kind
* of pasting was the only one available before version 3.0).
*
* </ul>
*
* <p>Standard pasting is used, for instance, to paste the batches of a
* {@linkplain it.unimi.dsi.big.mg4j.document.DocumentFactory.FieldType#VIRTUAL virtual field}
* generated by {@link Scan}; the latter takes care of numbering positions
* correctly. If, however, you index parts of the same document collection on
* different machines using the same {@link VirtualDocumentResolver},
* the resulting indices for virtual fields will all have positions starting
* from zero, and they will need an incremental pasting to be combined
* correctly, as the sketch below illustrates.
*
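* <p>As a minimal sketch of the incremental renumbering rule (all values below are
* made up for illustration):
*
* <pre>{@code
* // Term positions for document d in the two indices being pasted:
* int[] positionsInIndex0 = { 0, 2 };  // document d has size 5 in index 0
* int[] positionsInIndex1 = { 1, 3 };
* // Incremental pasting shifts each position from index 1 by the sum of the
* // sizes of d in all preceding indices (here, just 5):
* // pasted list = { 0, 2, 1 + 5, 3 + 5 } = { 0, 2, 6, 8 }
* }</pre>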
*
* <p>Conceptually, this operation is equivalent to splitting a collection
* vertically: each document is divided into a fixed number <var>n</var>
* of consecutive segments (possibly of length 0), and <var>n</var> indices
* are created, the <var>k</var>-th one using the <var>k</var>-th segment of
* all documents. Pasting the resulting indices will produce an index that is
* identical to the index generated by the original collection. The behaviour
* is analogous to that of the UN*X <samp>paste</samp> command if documents
* are single-line lists of words, as in the following example.
*
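* <p>For instance (an illustrative, made-up example): if file <samp>A</samp> contains
* the lines <samp>a b</samp> and <samp>c</samp>, and file <samp>B</samp> contains
* <samp>x</samp> and <samp>y z</samp>, then <samp>paste -d' ' A B</samp> outputs the
* lines <samp>a b x</samp> and <samp>c y z</samp>; analogously, pasting the index of
* <samp>A</samp> with the index of <samp>B</samp> yields the index of the two
* original documents.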
*
* <p>Note that if every document appears in at most one index, pasting
* is equivalent to {@linkplain it.unimi.dsi.big.mg4j.tool.Merge merging}. It is, however,
* significantly slower, as the presence of the same document in several lists makes
* it necessary to scan each inverted list to be pasted in its entirety just to compute its
* frequency. To do so, an in-memory buffer is allocated; if an inverted list does not fit
* in the buffer, it is spilled to disk. Sizing the buffer correctly, and choosing a fast
* file system for the temporary directory, can significantly affect performance.
*
*
* <p><strong>Warning</strong>: incremental pasting is very memory-intensive, as
* a list of sizes must be loaded for each index. You can use the URI option
* <samp>succinctsizes=1</samp> to load sizes in a succinct format, which will
* ease the problem.
*
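* <p>A minimal programmatic sketch (basenames and numeric parameters are placeholders;
* see the constructor documentation for the meaning of each argument):
*
* <pre>{@code
* new Paste( "pasted",
*     new String[] { "index0?succinctsizes=1", "index1?succinctsizes=1" },
*     false,                                   // metadataOnly: write the whole index
*     true,                                    // incremental: renumber positions
*     64 * 1024,                               // bufferSize for index readers
*     null,                                    // tempFileDir: default temporary directory
*     Paste.DEFAULT_MEMORY_BUFFER_SIZE,        // tempBufferSize
*     CompressionFlags.DEFAULT_STANDARD_INDEX, // writerFlags
*     false, false, -1, -1,                    // interleaved, skips, quantum, height
*     1024 * 1024,                             // skipBufferSize
*     10000                                    // logInterval
* ).run();
* }</pre>
*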
* @author Sebastiano Vigna
* @since 1.0
*/
final public class Paste extends Combine {
@SuppressWarnings("unused")
private static final Logger LOGGER = Util.getLogger( Paste.class );
/** The default size of the temporary bit stream buffer used while pasting. Posting lists larger
* than this size will be precomputed on disk and then added to the index. */
public final static int DEFAULT_MEMORY_BUFFER_SIZE = 16 * 1024 * 1024;
/** The reference array of the document queue. */
protected final long[] doc;
/** Whether this paste is incremental. */
private final boolean incremental;
/** The queue containing document pointers (for remapped indices). */
protected final IntHeapPriorityQueue documentQueue;
/** The temporary cache file used by {@link #combine(int)}. */
private final File tempFile;
/** The temporary output bit stream for {@link #combine(int)}. */
private final CachingOutputBitStream cacheBitStreamOut;
/** The temporary input bit stream for {@link #combine(int)}. */
private final InputBitStream cacheBitStreamIn;
/** The input bit stream used to wrap directly {@link #cacheBitStreamOut}'s buffer. */
private final InputBitStream cacheBitStreamInWrapper;
/** The size of the size list for each index. */
private final long[] sizesSize;
/** Pastes several indices into one.
*
* @param outputBasename the basename of the combined index.
* @param inputBasename the basenames of the input indices.
* @param metadataOnly if true, we save only metadata (term list, frequencies, global counts).
* @param incremental if true, we perform an incremental paste (needs sizes).
* @param bufferSize the buffer size for index readers.
* @param tempFileDir the directory of the temporary file used when pasting.
* @param tempBufferSize the size of the in-memory buffer used when pasting.
* @param writerFlags the flags for the index writer.
* @param interleaved forces an interleaved index.
* @param skips whether to insert skips, in case <code>interleaved</code> is true.
* @param quantum the quantum of skipping structures; if negative, a percentage of space for variable-quantum indices (irrelevant if <code>skips</code> is false).
* @param height the height of skipping towers (irrelevant if <code>skips</code> is false).
* @param skipBufferSize the size of the buffer used to temporarily hold inverted lists during the construction of skipping structures.
* @param logInterval how often we log.
*/
public Paste( final String outputBasename,
final String[] inputBasename,
final boolean metadataOnly,
final boolean incremental,
final int bufferSize,
final File tempFileDir,
final int tempBufferSize,
final Map<Component,Coding> writerFlags,
final boolean interleaved,
final boolean skips,
final int quantum,
final int height,
final int skipBufferSize,
final long logInterval ) throws IOException, ConfigurationException, URISyntaxException, ClassNotFoundException, SecurityException, InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException {
super( outputBasename, inputBasename, metadataOnly, incremental, bufferSize, writerFlags, interleaved, skips, quantum, height, skipBufferSize, logInterval );
this.incremental = incremental;
tempFile = File.createTempFile( "MG4J", ".data", tempFileDir );
cacheBitStreamOut = new CachingOutputBitStream( tempFile, tempBufferSize );
cacheBitStreamIn = new InputBitStream( tempFile, bufferSize );
cacheBitStreamInWrapper = new InputBitStream( cacheBitStreamOut.buffer() );
/* In this case we must allocate position so that it can hold an occurrence list
 * as large as the concatenation of the largest lists of all indices, since combining
 * occurrences may produce lists that long. We use this estimate to allocate position,
 * and update maxCount in combine() to get the real maxCount. */
int estimateForMaxCount = 0, tempSize = 0;
sizesSize = incremental ? new long[ numIndices ] : null;
if ( incremental ) for( int i = index.length; i-- != 0; ) sizesSize[ i ] = index[ i ].sizes.size64();
for( int i = 0; i < numIndices; i++ ) {
if ( index[ i ].hasPayloads ) throw new IllegalArgumentException( "You cannot paste indices with payloads" );
estimateForMaxCount += index[ i ].maxCount;
tempSize = Math.max( tempSize, index[ i ].maxCount );
}
if ( hasPositions ) position = new int[ estimateForMaxCount ];
doc = new long[ numIndices ];
documentQueue = new IntHeapPriorityQueue( numIndices, new DocumentIndexComparator( doc ) );
}
/** A comparator making an integer priority queue work much like an indirect
* priority queue, with the additional property of using the reference index as secondary key.
*/
private final static class DocumentIndexComparator extends AbstractIntComparator {
private final long[] refArray;
public DocumentIndexComparator( final long[] refArray ) {
this.refArray = refArray;
}
public int compare( final int i, final int j ) {
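// Compare document pointers first; ties are broken by index number, so that
// postings for the same document are combined in index order.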
final long t = refArray[ i ] - refArray[ j ];
return t != 0 ? (int)Math.signum( t ) : i - j;
}
}
protected long combineNumberOfDocuments() {
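// The pasted index has as many documents as the largest input index.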
long n = 0;
for( int i = 0; i < numIndices; i++ ) n = Math.max( n, index[ i ].numberOfDocuments );
return n;
}
protected int combineSizes( final OutputBitStream sizesOutputBitStream ) throws IOException {
long currDoc = 0;
int maxDocSize = 0;
if ( incremental ) {
// We accumulate in a big array, for each document, the sum of its sizes over all indices.
size = IntBigArrays.newBigArray( numberOfDocuments );
for( int i = 0; i < numIndices; i++ ) {
final IntIterator sizes = sizes( i );
long j = index[ i ].numberOfDocuments;
currDoc = 0;
while( j-- != 0 ) IntBigArrays.add( size, currDoc++, sizes.nextInt() );
if ( sizes instanceof Closeable ) ((Closeable)sizes).close();
}
// We write the array of summed sizes, computing the maximum document size on the way.
for( int segment = 0; segment < size.length; segment++ )
for( int s: size[ segment ] ) {
maxDocSize = Math.max( maxDocSize, s );
sizesOutputBitStream.writeGamma( s );
}
// We keep the array only if we need sizes.
if ( ! needsSizes ) size = null;
}
else {
size = IntBigArrays.newBigArray( numberOfDocuments );
final IntIterator sizes = sizes( numIndices - 1 );
int s = 0;
// We copy the size list of the last index, storing the elements in an array if needsSizes is true.
for( long j = 0; j < numberOfDocuments; j++ ) {
s = sizes.nextInt();
if ( needsSizes ) IntBigArrays.set( size, j, s );
maxDocSize = Math.max( maxDocSize, s );
sizesOutputBitStream.writeGamma( s );
}
if ( sizes instanceof Closeable ) ((Closeable)sizes).close();
// We keep the array if we need sizes.
if ( ! needsSizes ) size = null;
}
return maxDocSize;
}
protected long combine( final int numUsedIndices ) throws IOException {
/* If we're merging just one list, merging is fine, and moreover
* maxCount need not be updated, as it is already initialised to
* the maximum over all indices. */
int currIndex, count;
long prevDoc = -1;
long currDoc;
int temp[];
OutputBitStream obs;
Index i;
IndexIterator ii;
// Note that the total frequency can be computed only during the merge.
for( int k = numUsedIndices; k-- != 0; ) {
currIndex = usedIndex[ k ];
frequency[ currIndex ] = indexIterator[ currIndex ].frequency();
doc[ currIndex ] = indexIterator[ currIndex ].nextDocument();
documentQueue.enqueue( currIndex );
}
// First phase: we write the inverted list using a quick-and-dirty format in the cache.
cacheBitStreamOut.position( 0 );
long totalFrequency = 0;
int increment, prevIndex, totalCount;
while( ! documentQueue.isEmpty() ) {
// We extract the smallest document pointer, and enqueue it in the new index.
currDoc = doc[ currIndex = documentQueue.firstInt() ];
totalFrequency++;
if ( ! metadataOnly ) cacheBitStreamOut.writeLongDelta( currDoc - prevDoc - 1 );
totalCount = prevIndex = increment = 0;
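/* We now scan all indices containing the current document: counts are summed, and
 * positions are copied, shifted (if pasting is incremental) by the overall size of
 * the current document in all preceding indices. */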
do {
if ( incremental )
while( prevIndex < currIndex ) {
/* Note that some virtual documents might not appear at all in some indices (in which
 * case we implicitly extend the size list with zeroes). */
if ( sizesSize[ prevIndex ] > currDoc ) increment += index[ prevIndex ].sizes.getInt( currDoc );
prevIndex++;
}
i = index[ currIndex ];
ii = indexIterator[ currIndex ];
if ( ! metadataOnly && i.hasCounts ) {
count = ii.count();
if ( i.hasPositions ) {
temp = ii.positionArray();
if ( ! incremental && totalCount > 0 && temp[ 0 ] <= position[ totalCount - 1 ] ) throw new IllegalStateException( "Positions in document " + currDoc + " are not increasing; you probably need to require an incremental pasting" );
for( int k = count; k-- != 0; ) position[ totalCount + k ] = temp[ k ] + increment;
}
totalCount += count;
}
// If we just wrote the last document pointer of this term in the current index, we dequeue it.
if ( --frequency[ currIndex ] == 0 ) documentQueue.dequeue();
else {
doc[ currIndex ] = ii.nextDocument();
documentQueue.changed();
}
} while( ! documentQueue.isEmpty() && doc[ currIndex = documentQueue.firstInt() ] == currDoc );
if ( totalCount > maxCount ) maxCount = totalCount;
if ( ! metadataOnly && hasCounts ) {
cacheBitStreamOut.writeGamma( totalCount );
if ( hasPositions ) {
cacheBitStreamOut.writeDelta( position[ 0 ] );
for( int k = 1; k < totalCount; k++ ) cacheBitStreamOut.writeDelta( position[ k ] - position[ k - 1 ] - 1 );
}
}
prevDoc = currDoc;
}
if ( ! metadataOnly ) {
// Finally, we pour the data into the actual index.
if ( p != 0 ) variableQuantumIndexWriter.newInvertedList( totalFrequency, p, predictedSize, predictedLengthNumBits );
else indexWriter.newInvertedList();
indexWriter.writeFrequency( totalFrequency );
cacheBitStreamOut.align();
final InputBitStream ibs;
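// If the cached inverted list still fits in the memory buffer, we read it back
// directly from the buffer; otherwise, it was spilled to the temporary file.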
if ( cacheBitStreamOut.buffer() != null ) ibs = cacheBitStreamInWrapper;
else {
cacheBitStreamOut.flush();
ibs = cacheBitStreamIn;
ibs.flush();
}
ibs.position( 0 );
currDoc = -1;
for( long j = totalFrequency; j-- != 0; ) {
obs = indexWriter.newDocumentRecord();
indexWriter.writeDocumentPointer( obs, currDoc = ibs.readDelta() + currDoc + 1 );
if ( hasCounts ) {
count = ibs.readGamma();
indexWriter.writePositionCount( obs, count );
if ( hasPositions ) {
position[ 0 ] = ibs.readDelta();
for( int k = 1; k < count; k++ ) position[ k ] = position[ k - 1 ] + ibs.readDelta() + 1;
indexWriter.writeDocumentPositions( obs, position, 0, count, size != null ? IntBigArrays.get( size, currDoc ) : -1 );
}
}
}
}
return totalFrequency;
}
public void run() throws ConfigurationException, IOException {
super.run();
cacheBitStreamOut.close();
tempFile.delete();
}
public static void main( String arg[] ) throws ConfigurationException, SecurityException, JSAPException, IOException, URISyntaxException, ClassNotFoundException, InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException {
Combine.main( arg, Paste.class );
}
}