src.it.unimi.dsi.big.mg4j.index.AbstractBitStreamIndexWriter Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of mg4j-big Show documentation
MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections written in Java. The big version is a fork of the original MG4J that can handle more than 2^31 terms and documents.
The newest version!
package it.unimi.dsi.big.mg4j.index;

/*		 
 * MG4J: Managing Gigabytes for Java (big)
 *
 * Copyright (C) 2007-2011 Sebastiano Vigna 
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 3 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, see .
 *
 */


import it.unimi.dsi.Util;
import it.unimi.dsi.big.mg4j.index.CompressionFlags.Coding;
import it.unimi.dsi.big.mg4j.index.CompressionFlags.Component;

import java.io.PrintStream;
import java.util.Map;

/** An abstract bitstream-based index writer, providing common variables and a basic {@link #printStats(PrintStream)} implementation.
 * 
 * Compression flags
 *
 * Implementing subclasses need to know the compression method that they should use
 * to write frequencies, pointers, payloads, counts and positions (and whether to write any of them).
 * This information is passed to the {@linkplain #AbstractBitStreamIndexWriter(long, Map) constructor}
 * using a suitable flag map (see {@link CompressionFlags}).
 * 
 * @author Sebastiano Vigna
 * @since 1.2
 */

public abstract class AbstractBitStreamIndexWriter implements IndexWriter {

	/** The number of documents of the collection to be indexed. */
	protected final long numberOfDocuments;
	/** The flag map. */
	public Map flags;
	/** The coding for frequencies. */
	protected Coding frequencyCoding;
	/** The coding for pointers. */
	protected Coding pointerCoding;
	/** The coding for counts. */
	protected Coding countCoding;
	/** The coding for positions. */
	protected Coding positionCoding;
	/** Whether this index contains payloads. */
	protected final boolean hasPayloads;
	/** Whether this index contains counts. */
	protected final boolean hasCounts;
	/** Whether this index contains positions. */
	protected final boolean hasPositions;
	
	/** The number of indexed postings (pairs term/document). */
	protected long numberOfPostings;
	/** The number of indexed occurrences. */
	protected long numberOfOccurrences;
	/** The current term. */
	protected long currentTerm;
	/** The number of bits written for frequencies. */
	public long bitsForFrequencies;
	/** The number of bits written for document pointers. */
	public long bitsForPointers;
	/** The number of bits written for counts. */
	public long bitsForCounts;
	/** The number of bits written for payloads. */
	public long bitsForPayloads;
	/** The number of bits written for positions. */
	public long bitsForPositions;

	public AbstractBitStreamIndexWriter( final long numberOfDocuments, final Map flags ) {
		this.numberOfDocuments = numberOfDocuments;
		this.flags = flags;
		frequencyCoding = flags.get( Component.FREQUENCIES );
		pointerCoding = flags.get( Component.POINTERS );
		countCoding = flags.get( Component.COUNTS );
		positionCoding = flags.get( Component.POSITIONS );
		
		hasPayloads = flags.containsKey(  Component.PAYLOADS ); 
		hasCounts = countCoding != null;
		hasPositions = positionCoding != null;
	}

	public void printStats( PrintStream stats ) {
		stats.println( "Number of documents: " + Util.format( numberOfDocuments ) );
		stats.println( "Number of terms: " + Util.format( currentTerm + 1 ) );
		
		stats.println( "Frequencies: " + Util.format( bitsForFrequencies ) + " bits, " + Util.format( bitsForFrequencies / ( currentTerm + 1.0 ) ) + " bits/frequency." );
		stats.println( "Document pointers: " + Util.format( numberOfPostings ) + " (" + Util.format( bitsForPointers ) + " bits, " + Util.format( bitsForPointers / (double)numberOfPostings ) + " bits/pointer).");

		if ( hasCounts ) stats.println( "Counts: " + Util.format( numberOfPostings ) + " (" + Util.format( bitsForCounts ) + " bits, " + Util.format( bitsForCounts/ (double)numberOfPostings ) + " bits/count).");
		if ( hasPositions ) stats.println( "Occurrences: " + Util.format( numberOfOccurrences ) + " (" +  Util.format( bitsForPositions ) + " bits, " + Util.format( bitsForPositions / (double)numberOfOccurrences ) + " bits/occurrence).");
		if ( hasPayloads ) stats.println( "Payloads: " + Util.format( numberOfPostings )  + " (" + Util.format( bitsForPayloads ) + " bits, " + Util.format( bitsForPayloads / (double)numberOfPostings )  + " bits/payload)." );
		if ( hasPositions ) stats.println( "Total: " + Util.format( writtenBits() ) + " bits, " + Util.format( writtenBits() / (double)numberOfOccurrences ) + " bits/occurrence" );
		else stats.println( "Total: " + Util.format( writtenBits() ) + " bits, " + Util.format( writtenBits() / (double)numberOfPostings ) + " bits/posting" );
	}
}