
it.unimi.dsi.big.mg4j.index.BitStreamIndexWriter


MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections written in Java. The big version is a fork of the original MG4J that can handle more than 2^31 terms and documents.

package it.unimi.dsi.big.mg4j.index;

/*		 
 * MG4J: Managing Gigabytes for Java (big)
 *
 * Copyright (C) 2003-2011 Paolo Boldi and Sebastiano Vigna 
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 3 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 */

import it.unimi.dsi.bits.Fast;
import it.unimi.dsi.io.OutputBitStream;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.big.mg4j.index.CompressionFlags.Coding;
import it.unimi.dsi.big.mg4j.index.CompressionFlags.Component;
import it.unimi.dsi.big.mg4j.index.payload.Payload;
import it.unimi.dsi.big.mg4j.io.InterpolativeCoding;
import it.unimi.dsi.util.Properties;

import java.io.IOException;
import java.util.Map;

/** Writes a bitstream-based interleaved index.
 *
 * <h2>Offsets bit stream</h2>
 *
 * <p>An inverted index may have an associated {@link OutputBitStream} of
 * offsets: this file contains <var>T</var>+1 integers, where <var>T</var>
 * is the number of inverted lists (i.e., the number of terms), and the
 * <var>i</var>-th entry is the position in bits where the <var>i</var>-th
 * inverted list starts (the last entry is actually the length, in bits,
 * of the inverted index file itself).
 *
 * <p>The file actually contains γ-coded gaps: thus, in practice, it is formed by
 * the number zero (the offset of the first list) followed by the length of each inverted list.
 *
 * @author Paolo Boldi
 * @author Sebastiano Vigna
 * @since 0.6
 */
public class BitStreamIndexWriter extends AbstractBitStreamIndexWriter {
	private static final boolean ASSERTS = false;

	/** This value of {@link #state} means that we should call {@link #newInvertedList()}. */
	protected static final int BEFORE_INVERTED_LIST = 0;

	/** This value of {@link #state} means that we are positioned at the start of an inverted list,
	 * and we should call {@link #writeFrequency(long)}. */
	protected static final int BEFORE_FREQUENCY = 1;

	/** This value of {@link #state} means that we are ready to call {@link #newDocumentRecord()}. */
	protected static final int BEFORE_DOCUMENT_RECORD = 2;

	/** This value of {@link #state} means that we just started a new document record, and we
	 * should call {@link #writeDocumentPointer(OutputBitStream, long)}. */
	protected static final int BEFORE_POINTER = 3;

	/** This value of {@link #state} can be assumed only in indices that contain payloads; it
	 * means that we are positioned just before the payload for the current document record. */
	protected static final int BEFORE_PAYLOAD = 4;

	/** This value of {@link #state} can be assumed only in indices that contain counts; it
	 * means that we are positioned just before the count for the current document record. */
	protected static final int BEFORE_COUNT = 5;

	/** This value of {@link #state} can be assumed only in indices that contain document positions;
	 * it means that we are positioned just before the position list of the current document record. */
	protected static final int BEFORE_POSITIONS = 6;

	/** This is the first unused state. Subclasses may start from this value to define new states. */
	protected static final int FIRST_UNUSED_STATE = 7;

	/** The underlying {@link OutputBitStream}. */
	protected OutputBitStream obs;
	/** The offsets {@link OutputBitStream}. */
	private OutputBitStream offsets;
	/** The {@link OutputBitStream} for the number of bits for positions. */
	private OutputBitStream posNumBits;
	/** The current state of the writer. */
	protected int state;
	/** The number of document records that the current inverted list will contain. */
	protected long frequency;
	/** The number of document records already written for the current inverted list. */
	protected long writtenDocuments;
	/** The current document pointer. */
	protected long currentDocument;
	/** The last document pointer in the current list. */
	protected long lastDocument;
	/** The position (in bits) where the last inverted list started. */
	protected long lastInvertedListPos;
	/** The number of bits spent for positions in the current inverted list. */
	private long currPosNumBits;
	/** The parameter <code>b</code> for Golomb coding of pointers. */
	protected int b;
	/** The parameter <code>log2b</code> for Golomb coding of pointers; it is the most significant bit of {@link #b}. */
	protected int log2b;
	/** The maximum number of positions in a document record so far. */
	public int maxCount;

	/** Creates a new index writer, with the specified basename. The index will be written on a file (stemmed with <code>.index</code>).
	 * If <code>writeOffsets</code>, also an offset file will be produced (stemmed with <code>.offsets</code>).
	 * When {@link #close()} is called, the property file will also be produced (stemmed with <code>.properties</code>),
	 * or enriched if it already exists.
	 *
	 * @param basename the basename.
	 * @param numberOfDocuments the number of documents in the collection to be indexed.
	 * @param writeOffsets if <code>true</code>, the offset file will also be produced.
	 * @param flags a flag map setting the coding techniques to be used (see {@link CompressionFlags}).
	 */
	public BitStreamIndexWriter( final CharSequence basename, final long numberOfDocuments, final boolean writeOffsets, final Map<Component,Coding> flags ) throws IOException {
		this( new OutputBitStream( basename + DiskBasedIndex.INDEX_EXTENSION ),
				writeOffsets ? new OutputBitStream( basename + DiskBasedIndex.OFFSETS_EXTENSION ) : null,
				writeOffsets && flags.get( Component.POSITIONS ) != null ? new OutputBitStream( basename + DiskBasedIndex.POSITIONS_NUMBER_OF_BITS_EXTENSION ) : null,
				numberOfDocuments, flags );
	}

	/** Creates a new index writer with payloads using the specified underlying {@link OutputBitStream}.
	 *
	 * @param obs the underlying output bit stream.
	 * @param offset the offset bit stream, or <code>null</code> if offsets should not be written.
	 * @param posNumBits the bit stream for positions bit lengths, or <code>null</code> if such lengths should not be written.
	 * @param numberOfDocuments the number of documents in the collection to be indexed.
	 * @param flags a flag map setting the coding techniques to be used (see {@link CompressionFlags}).
	 */
	public BitStreamIndexWriter( final OutputBitStream obs, final OutputBitStream offset, final OutputBitStream posNumBits, final long numberOfDocuments, final Map<Component,Coding> flags ) {
		super( numberOfDocuments, flags );
		this.obs = obs;
		this.posNumBits = posNumBits;
		this.offsets = offset;
		this.frequency = -1;
		this.currentTerm = -1;
		this.maxCount = 0;
		this.currPosNumBits = -1;
		if ( ! hasCounts && hasPositions ) throw new IllegalArgumentException( "Index would have positions but no counts (this can't happen)" );
	}

	/** Creates a new index writer, with the specified underlying {@link OutputBitStream},
	 * without additional bit streams.
	 *
	 * @param obs the underlying output bit stream.
	 * @param numberOfDocuments the number of documents in the collection to be indexed.
	 * @param flags a flag map setting the coding techniques to be used (see {@link CompressionFlags}).
	 */
	public BitStreamIndexWriter( final OutputBitStream obs, final long numberOfDocuments, final Map<Component,Coding> flags ) {
		this( obs, null, null, numberOfDocuments, flags );
	}

	public long newInvertedList() throws IOException {
		if ( frequency >= 0 && frequency != writtenDocuments ) throw new IllegalStateException( "The number of document records (" + this.writtenDocuments + ") does not match the frequency (" + this.frequency + ")" );
		if ( state != BEFORE_INVERTED_LIST && state != BEFORE_DOCUMENT_RECORD ) throw new IllegalStateException( "Trying to start new inverted list in state " + state );

		// The position (in bits) where the new inverted list starts
		long pos = obs.writtenBits();
		// Reset variables
		writtenDocuments = 0;
		currentTerm++;
		currentDocument = -1;

		// If needed, write the offset
		if ( offsets != null ) offsets.writeLongGamma( pos - lastInvertedListPos );
		lastInvertedListPos = pos;

		if ( posNumBits != null && currPosNumBits != -1 ) {
			posNumBits.writeLongGamma( currPosNumBits );
			currPosNumBits = 0;
		}

		state = BEFORE_FREQUENCY;
		return pos;
	}

	public int writeFrequency( final long frequency ) throws IOException {
		if ( state != BEFORE_FREQUENCY ) throw new IllegalStateException( "Trying to write frequency in state " + state );

		int bitCount;
		// Write the frequency
		switch( frequencyCoding ) {
		case SHIFTED_GAMMA:
			bitCount = obs.writeLongShiftedGamma( frequency - 1 ); // frequency cannot be 0
			break;
		case GAMMA:
			bitCount = obs.writeLongGamma( frequency - 1 ); // frequency cannot be 0
			break;
		case DELTA:
			bitCount = obs.writeLongDelta( frequency - 1 ); // frequency cannot be 0
			break;
		default:
			throw new IllegalStateException( "The required frequency coding (" + frequencyCoding + ") is not supported." );
		}

		this.frequency = frequency;

		// We compute the modulus used for pointer Golomb coding
		if ( pointerCoding == Coding.GOLOMB ) {
			b = BitStreamIndex.golombModulus( frequency, numberOfDocuments );
			log2b = Fast.mostSignificantBit( b );
		}

		state = BEFORE_DOCUMENT_RECORD;
		bitsForFrequencies += bitCount;
		return bitCount;
	}

	public OutputBitStream newDocumentRecord() throws IOException {
		if ( frequency == writtenDocuments ) throw new IllegalStateException( "Document record overflow (written " + this.frequency + " already)" );
		if ( state != BEFORE_DOCUMENT_RECORD ) throw new IllegalStateException( "Trying to start new document record in state " + state );
		writtenDocuments++;
		numberOfPostings++;
		lastDocument = currentDocument;
		state = BEFORE_POINTER;
		return obs;
	}

	public long writeDocumentPointer( final OutputBitStream out, final long pointer ) throws IOException {
		if ( state != BEFORE_POINTER ) throw new IllegalStateException( "Trying to write pointer in state " + state );
		currentDocument = pointer;

		long bitCount = 0;
		if ( frequency != numberOfDocuments ) {
			// We do not write pointers for everywhere occurring documents.
			switch( pointerCoding ) {
			case SHIFTED_GAMMA:
				bitCount = out.writeLongShiftedGamma( pointer - lastDocument - 1 );
				break;
			case UNARY:
				bitCount = out.writeLongUnary( pointer - lastDocument - 1 );
				break;
			case GAMMA:
				bitCount = out.writeLongGamma( pointer - lastDocument - 1 );
				break;
			case DELTA:
				bitCount = out.writeLongDelta( pointer - lastDocument - 1 );
				break;
			case GOLOMB:
				bitCount = out.writeLongGolomb( pointer - lastDocument - 1, b, log2b );
				break;
			default:
				throw new IllegalStateException( "The required pointer coding (" + pointerCoding + ") is not supported." );
			}
		}
		else if ( pointer - lastDocument != 1 ) throw new IllegalStateException( "Term " + currentTerm + " has frequency equal to the number of documents, but pointers are not consecutive integers" );

		state = hasPayloads ? BEFORE_PAYLOAD : hasCounts ? BEFORE_COUNT : BEFORE_DOCUMENT_RECORD;
		bitsForPointers += bitCount;
		return bitCount;
	}

	public int writePayload( final OutputBitStream out, final Payload payload ) throws IOException {
		if ( frequency < 0 ) throw new IllegalStateException( "Trying to write payload without calling newInvertedList" );
		if ( state != BEFORE_PAYLOAD ) throw new IllegalStateException( "Trying to write payload in state " + state );
		final int count = payload.write( out );
		bitsForPayloads += count;
		state = hasCounts ? BEFORE_COUNT : BEFORE_DOCUMENT_RECORD;
		return count;
	}

	public void close() throws IOException {
		if ( state != BEFORE_DOCUMENT_RECORD && state != BEFORE_INVERTED_LIST ) throw new IllegalStateException( "Trying to close index in state " + state );
		if ( frequency >= 0 && frequency != writtenDocuments ) throw new IllegalStateException( "The number of document records (" + this.writtenDocuments + ") does not match the frequency (" + this.frequency + ")" );
		if ( writtenBits() != obs.writtenBits() ) throw new IllegalStateException( "Written bits count mismatch: we say " + writtenBits() + ", the stream says " + obs.writtenBits() );

		if ( offsets != null ) {
			offsets.writeLongGamma( obs.writtenBits() - lastInvertedListPos );
			offsets.close();
		}

		if ( posNumBits != null ) {
			if ( currPosNumBits != -1 ) posNumBits.writeLongGamma( currPosNumBits );
			posNumBits.close();
		}

		obs.close();
	}

	public int writePositionCount( final OutputBitStream out, final int count ) throws IOException {
		if ( frequency < 0 ) throw new IllegalStateException( "Trying to write count without calling newInvertedList" );
		if ( state != BEFORE_COUNT ) throw new IllegalStateException( "Trying to write count in state " + state );

		final int bitCount;
		numberOfOccurrences += count;

		switch( countCoding ) {
		case SHIFTED_GAMMA:
			bitCount = out.writeShiftedGamma( count - 1 );
			break;
		case GAMMA:
			bitCount = out.writeGamma( count - 1 );
			break;
		case UNARY:
			bitCount = out.writeUnary( count - 1 );
			break;
		case DELTA:
			bitCount = out.writeDelta( count - 1 );
			break;
		default:
			throw new IllegalStateException( "The required count coding (" + countCoding + ") is not supported." );
		}

		state = hasPositions ? BEFORE_POSITIONS : BEFORE_DOCUMENT_RECORD;
		bitsForCounts += bitCount;
		return bitCount;
	}

	public int writeDocumentPositions( final OutputBitStream out, final int[] occ, final int offset, final int len, final int docSize ) throws IOException {
		if ( frequency < 0 ) throw new IllegalStateException( "Trying to write occurrences without calling newInvertedList" );
		if ( state != BEFORE_POSITIONS ) throw new IllegalStateException( "Trying to write positions in state " + state );
		if ( ASSERTS ) {
			if ( docSize > 0 ) for( int i = 0; i < len; i++ ) assert occ[ offset + i ] < docSize : "Position " + occ[ offset + i ] + " for document " + currentDocument + " is too large; size is " + docSize;
		}

		int i;
		int prev = -1;
		int bitCount = 0;
		final int end = offset + len;

		switch( positionCoding ) {
		case GAMMA:
			for( i = offset; i < end; i++ ) {
				bitCount += out.writeGamma( occ[ i ] - prev - 1 );
				prev = occ[ i ];
			}
			break;
		case DELTA:
			for( i = offset; i < end; i++ ) {
				bitCount += out.writeDelta( occ[ i ] - prev - 1 );
				prev = occ[ i ];
			}
			break;
		case SHIFTED_GAMMA:
			for( i = offset; i < end; i++ ) {
				bitCount += out.writeShiftedGamma( occ[ i ] - prev - 1 );
				prev = occ[ i ];
			}
			break;
		case GOLOMB:
			if ( len < 3 ) {
				for( i = 0; i < len; i++ ) bitCount += out.writeMinimalBinary( occ[ i ], docSize );
				break;
			}
			// We compute b and log2b for positions
			final int positionB = BitStreamIndex.golombModulus( len, docSize );
			final int positionLog2b = Fast.mostSignificantBit( positionB );
			for( i = offset; i < end; i++ ) {
				bitCount += out.writeGolomb( occ[ i ] - prev - 1, positionB, positionLog2b );
				prev = occ[ i ];
			}
			break;
		case INTERPOLATIVE:
			bitCount = InterpolativeCoding.write( out, occ, 0, len, 0, docSize - 1 );
			break;
		default:
			throw new IllegalStateException( "The required position coding (" + positionCoding + ") is not supported." );
		}

		state = BEFORE_DOCUMENT_RECORD;
		bitsForPositions += bitCount;
		currPosNumBits += bitCount;
		if ( len > maxCount ) maxCount = len;
		return bitCount;
	}

	public long writtenBits() {
		return bitsForFrequencies + bitsForPointers + bitsForPayloads + bitsForCounts + bitsForPositions;
	}

	public Properties properties() {
		Properties result = new Properties();
		result.setProperty( Index.PropertyKeys.DOCUMENTS, numberOfDocuments );
		result.setProperty( Index.PropertyKeys.TERMS, currentTerm + 1 );
		result.setProperty( Index.PropertyKeys.POSTINGS, numberOfPostings );
		result.setProperty( Index.PropertyKeys.MAXCOUNT, maxCount );
		result.setProperty( Index.PropertyKeys.INDEXCLASS, FileIndex.class.getName() );
		// We save all flags, except for the PAYLOAD component, which is just used internally.
		for( Map.Entry<Component,Coding> e: flags.entrySet() )
			if ( e.getKey() != Component.PAYLOADS ) result.addProperty( Index.PropertyKeys.CODING, new MutableString().append( e.getKey() ).append( ':' ).append( e.getValue() ) );
		return result;
	}
}
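
For orientation, the following is a minimal, hypothetical usage sketch of the writer API defined above; it is not part of MG4J, which normally drives this class through its indexing tools. It assumes that CompressionFlags.Component also defines FREQUENCIES, POINTERS and COUNTS constants (only POSITIONS and PAYLOADS appear in this file), and the basename, document pointers and positions are made-up example data.

import it.unimi.dsi.big.mg4j.index.BitStreamIndexWriter;
import it.unimi.dsi.big.mg4j.index.CompressionFlags.Coding;
import it.unimi.dsi.big.mg4j.index.CompressionFlags.Component;
import it.unimi.dsi.io.OutputBitStream;

import java.io.IOException;
import java.util.EnumMap;
import java.util.Map;

public class BitStreamIndexWriterSketch {
	public static void main( final String[] args ) throws IOException {
		// Hypothetical coding choices; omitting a component (e.g., POSITIONS) produces an index without it.
		final Map<Component, Coding> flags = new EnumMap<Component, Coding>( Component.class );
		flags.put( Component.FREQUENCIES, Coding.GAMMA );
		flags.put( Component.POINTERS, Coding.DELTA );
		flags.put( Component.COUNTS, Coding.GAMMA );
		flags.put( Component.POSITIONS, Coding.GAMMA );

		// "example" is a made-up basename: example.index, example.offsets, etc. will be created.
		final long numberOfDocuments = 3;
		final BitStreamIndexWriter writer = new BitStreamIndexWriter( "example", numberOfDocuments, true, flags );

		// One inverted list with postings in two of the three documents.
		writer.newInvertedList();
		writer.writeFrequency( 2 );

		OutputBitStream out = writer.newDocumentRecord();
		writer.writeDocumentPointer( out, 0 );                               // document 0
		writer.writePositionCount( out, 2 );                                 // two occurrences...
		writer.writeDocumentPositions( out, new int[] { 1, 5 }, 0, 2, 10 );  // ...at positions 1 and 5 (document length 10)

		out = writer.newDocumentRecord();
		writer.writeDocumentPointer( out, 2 );                               // document 2
		writer.writePositionCount( out, 1 );
		writer.writeDocumentPositions( out, new int[] { 7 }, 0, 1, 10 );

		writer.close();
		System.out.println( writer.properties() );                           // documents, terms, postings, codings...
	}
}

The BEFORE_* state constants above encode exactly this call order: newInvertedList(), writeFrequency(), and then, for each document record, a pointer followed by an optional payload, a count and, if present, the positions.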





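To make the offsets layout described in the class comment concrete, here is a small sketch (again, not MG4J code) that rebuilds absolute bit offsets from the γ-coded gaps written by newInvertedList() and close(). It assumes that it.unimi.dsi.io.InputBitStream.readLongGamma() is the reading counterpart of OutputBitStream.writeLongGamma(), and readOffsets is a hypothetical helper name.

import it.unimi.dsi.big.mg4j.index.DiskBasedIndex;
import it.unimi.dsi.io.InputBitStream;

import java.io.IOException;

public class OffsetsReaderSketch {
	/** Turns the gap-encoded offsets file back into absolute offsets.
	 *
	 * @param basename the index basename (the file read is basename + DiskBasedIndex.OFFSETS_EXTENSION).
	 * @param numberOfTerms the number of inverted lists T; the file contains T + 1 γ-coded gaps.
	 * @return T + 1 longs: the starting bit position of each inverted list, followed by the index length in bits.
	 */
	public static long[] readOffsets( final CharSequence basename, final int numberOfTerms ) throws IOException {
		final long[] offset = new long[ numberOfTerms + 1 ];
		final InputBitStream ibs = new InputBitStream( basename + DiskBasedIndex.OFFSETS_EXTENSION );
		try {
			long position = 0;
			for ( int i = 0; i <= numberOfTerms; i++ ) {
				// The first gap is zero (start of the first list); each following gap is the length of the previous list.
				position += ibs.readLongGamma();
				offset[ i ] = position;
			}
		}
		finally {
			ibs.close();
		}
		return offset;
	}
}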