src.it.unimi.di.mg4j.document.SimpleCompressedDocumentCollectionBuilder Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of mg4j Show documentation
MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections written in Java.
package it.unimi.di.mg4j.document;
/*
* MG4J: Managing Gigabytes for Java
*
* Copyright (C) 2009-2012 Sebastiano Vigna
*
* This library is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 3 of the License, or (at your option)
* any later version.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*
*/
import it.unimi.dsi.Util;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import it.unimi.dsi.fastutil.objects.ObjectList;
import it.unimi.dsi.io.OutputBitStream;
import it.unimi.dsi.io.WordReader;
import it.unimi.dsi.lang.MutableString;
import it.unimi.di.mg4j.document.DocumentFactory.FieldType;
import it.unimi.di.mg4j.document.SimpleCompressedDocumentCollection.FrequencyCodec;
import it.unimi.di.mg4j.tool.Scan;
import it.unimi.di.mg4j.tool.Scan.VirtualDocumentFragment;
import java.io.DataOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.io.PrintStream;
import java.io.Reader;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.output.CountingOutputStream;
/** A builder for {@linkplain SimpleCompressedDocumentCollection simple compressed document collections}.
*
* @author Sebastiano Vigna
*/
public class SimpleCompressedDocumentCollectionBuilder implements DocumentCollectionBuilder {
/** The factory of the base document sequence. */
private final DocumentFactory factory;
/** Whether will are building an exact collection (i.e., whether it stores nonwords). */
private final boolean exact;
/** A frequency keeper used to compress document terms. */
private final FrequencyCodec termsFrequencyKeeper;
/** A frequency keeper used to compress document nonterms, or null
if {@link #exact} is false. */
private final FrequencyCodec nonTermsFrequencyKeeper;
/** The basename of the builder. */
private String basename;
/** The basename of current collection. */
private String basenameSuffix;
/** The output bit stream for documents. */
private OutputBitStream documentsOutputBitStream;
/** The output stream for terms. */
private CountingOutputStream termsOutputStream;
/** The output stream for nonterms, or null
if {@link #exact} is false. */
private CountingOutputStream nonTermsOutputStream;
/** The output bit stream for document offsets. */
private OutputBitStream documentOffsetsObs;
/** The output bit stream for term offsets. */
private OutputBitStream termOffsetsObs;
/** The output bit stream for nonterms offsets, or null
if {@link #exact} is false. */
private OutputBitStream nonTermOffsetsObs;
/** A temporary cache for the content of a field as a list of global term numbers. If the collection is exact, it alternates terms and nonterms. */
private IntArrayList fieldContent;
/** The map from term to global term numbers, in order of appearance. */
private Object2IntOpenHashMap terms;
/** The map from term to global nonterm numbers, in order of appearance, or null
if {@link #exact} is false. */
private Object2IntOpenHashMap nonTerms;
/** The number of documents indexed so far. */
private int documents;
/** The number of words indexed so far. */
private long words;
/** The number of fields indexed so far. */
private long fields;
/** The number of bits used to code words. */
private long bitsForWords;
/** The number of bits used to code nonwords. */
private long bitsForNonWords;
/** The number of bits used to code field lengths (the number of words/nonwords pairs). */
private long bitsForFieldLengths;
/** The number of bits used to code URIs. */
private long bitsForUris;
/** The number of bits used to code document titles. */
private long bitsForTitles;
/** Whether we are compressing non-text or virtual fields. */
private boolean hasNonText;
/** The zip output stream used to store non-text and virtual fields if {@link #hasNonText} is true, or null
otherwise. */
private ZipOutputStream nonTextZipOutputStream;
/** {@link #nonTextZipOutputStream} wrapped in a {@link DataOutputStream}. */
private DataOutputStream nonTextZipDataOutputStream;
public SimpleCompressedDocumentCollectionBuilder( final String basename, final DocumentFactory factory, final boolean exact ) {
this.basename = basename;
this.factory = factory;
this.exact = exact;
this.termsFrequencyKeeper = new SimpleCompressedDocumentCollection.FrequencyCodec();
this.nonTermsFrequencyKeeper = exact ? new SimpleCompressedDocumentCollection.FrequencyCodec() : null;
boolean hasNonText = false;
for( int i = factory.numberOfFields(); i-- != 0; ) hasNonText |= factory.fieldType( i ) != FieldType.TEXT;
this.hasNonText = hasNonText;
terms = new Object2IntOpenHashMap( Scan.INITIAL_TERM_MAP_SIZE );
terms.defaultReturnValue( -1 );
if ( exact ) {
nonTerms = new Object2IntOpenHashMap( Scan.INITIAL_TERM_MAP_SIZE );
nonTerms.defaultReturnValue( -1 );
}
else nonTerms = null;
}
public String basename() {
return basename;
}
public void open( final CharSequence suffix ) throws IOException {
basenameSuffix = basename + suffix;
documentsOutputBitStream = new OutputBitStream( basenameSuffix + SimpleCompressedDocumentCollection.DOCUMENTS_EXTENSION );
termsOutputStream = new CountingOutputStream( new FastBufferedOutputStream( new FileOutputStream( basenameSuffix + SimpleCompressedDocumentCollection.TERMS_EXTENSION ) ) );
nonTermsOutputStream = exact ? new CountingOutputStream( new FastBufferedOutputStream( new FileOutputStream( basenameSuffix + SimpleCompressedDocumentCollection.NONTERMS_EXTENSION ) ) ) : null;
documentOffsetsObs = new OutputBitStream( basenameSuffix + SimpleCompressedDocumentCollection.DOCUMENT_OFFSETS_EXTENSION );
termOffsetsObs = new OutputBitStream( basenameSuffix + SimpleCompressedDocumentCollection.TERM_OFFSETS_EXTENSION );
nonTermOffsetsObs = exact? new OutputBitStream( basenameSuffix + SimpleCompressedDocumentCollection.NONTERM_OFFSETS_EXTENSION ) : null;
fieldContent = new IntArrayList();
if ( hasNonText ) nonTextZipDataOutputStream = new DataOutputStream( nonTextZipOutputStream = new ZipOutputStream( new FastBufferedOutputStream( new FileOutputStream( basenameSuffix + ZipDocumentCollection.ZIP_EXTENSION ) ) ) );
terms.clear();
terms.trim( Scan.INITIAL_TERM_MAP_SIZE );
if ( exact ) {
nonTerms.clear();
nonTerms.trim( Scan.INITIAL_TERM_MAP_SIZE );
}
words = fields = bitsForWords = bitsForNonWords = bitsForFieldLengths = bitsForUris = bitsForTitles = documents = 0;
// First offset
documentOffsetsObs.writeDelta( 0 );
termOffsetsObs.writeDelta( 0 );
if ( exact ) nonTermOffsetsObs.writeDelta( 0 );
}
public void add( MutableString word, MutableString nonWord ) throws IOException {
int t = terms.getInt( word );
if ( t == -1 ) {
terms.put( word.copy(), t = terms.size() );
termsOutputStream.resetByteCount();
word.writeSelfDelimUTF8( termsOutputStream );
termOffsetsObs.writeLongDelta( termsOutputStream.getByteCount() );
}
fieldContent.add( t );
if ( exact ) {
t = nonTerms.getInt( nonWord );
if ( t == -1 ) {
nonTerms.put( nonWord.copy(), t = nonTerms.size() );
nonTermsOutputStream.resetByteCount();
nonWord.writeSelfDelimUTF8( nonTermsOutputStream );
nonTermOffsetsObs.writeLongDelta( nonTermsOutputStream.getByteCount() );
}
fieldContent.add( t );
}
}
public void close() throws IOException {
documentsOutputBitStream.close();
termsOutputStream.close();
IOUtils.closeQuietly( nonTermsOutputStream );
documentOffsetsObs.close();
termOffsetsObs.close();
if ( nonTermOffsetsObs != null ) nonTermOffsetsObs.close();
if ( hasNonText ) {
if ( documents == 0 ) nonTextZipOutputStream.putNextEntry( new ZipEntry( "dummy" ) );
nonTextZipDataOutputStream.close();
}
final SimpleCompressedDocumentCollection simpleCompressedDocumentCollection = new SimpleCompressedDocumentCollection( basenameSuffix, documents, terms.size(), nonTerms != null ? nonTerms.size() : -1, exact, factory );
BinIO.storeObject( simpleCompressedDocumentCollection, basenameSuffix + DocumentCollection.DEFAULT_EXTENSION );
simpleCompressedDocumentCollection.close();
final PrintStream stats = new PrintStream( new FileOutputStream ( basenameSuffix + SimpleCompressedDocumentCollection.STATS_EXTENSION ) );
final long overallBits = bitsForTitles + bitsForUris + bitsForFieldLengths + bitsForWords + bitsForNonWords;
stats.println( "Documents: " + Util.format( documents ) + " (" + Util.format( overallBits ) + ", " + Util.format( overallBits / (double)documents ) + " bits per document)" );
stats.println( "Terms: " + Util.format( terms.size() ) + " (" + Util.format( words ) + " words, " + Util.format( bitsForWords ) + " bits, " + Util.format( bitsForWords / (double)words ) + " bits per word)" );
if ( exact ) stats.println( "Nonterms: " + Util.format( nonTerms.size() ) + " (" + Util.format( words ) + " nonwords, " + Util.format( bitsForNonWords ) + " bits, " + Util.format( bitsForNonWords / (double)words ) + " bits per nonword)" );
stats.println( "Bits for field lengths: " + Util.format( bitsForFieldLengths ) + " (" + Util.format( bitsForFieldLengths / (double)fields ) + " bits per field)" );
stats.println( "Bits for URIs: " + Util.format( bitsForUris ) + " (" + Util.format( bitsForUris / (double)documents ) + " bits per URI)" );
stats.println( "Bits for titles: " + Util.format( bitsForTitles ) + " (" + Util.format( bitsForTitles / (double)documents ) + " bits per title)" );
stats.close();
}
public void endDocument() throws IOException {
documentOffsetsObs.writeLongDelta( documentsOutputBitStream.writtenBits() );
if ( hasNonText ) nonTextZipOutputStream.closeEntry();
}
public void endTextField() throws IOException {
final int size = fieldContent.size();
words += size / ( exact ? 2 : 1 );
bitsForFieldLengths += documentsOutputBitStream.writeDelta( size / ( exact ? 2 : 1 ) );
termsFrequencyKeeper.reset();
if ( exact ) {
nonTermsFrequencyKeeper.reset();
for( int i = 0; i < size; i += 2 ) {
bitsForWords += documentsOutputBitStream.writeDelta( termsFrequencyKeeper.encode( fieldContent.getInt( i ) ) );
bitsForNonWords += documentsOutputBitStream.writeDelta( nonTermsFrequencyKeeper.encode( fieldContent.getInt( i + 1 ) ) );
}
}
else for( int i = 0; i < size; i++ ) bitsForWords += documentsOutputBitStream.writeDelta( termsFrequencyKeeper.encode( fieldContent.getInt( i ) ) );
}
public void nonTextField( Object o ) throws IOException {
final ObjectOutputStream oos = new ObjectOutputStream( nonTextZipDataOutputStream );
oos.writeObject( o );
oos.flush();
}
public static int writeSelfDelimitedUtf8String( final OutputBitStream obs, final CharSequence s ) throws IOException {
final int len = s.length();
int bits = 0;
bits += obs.writeDelta( len );
for( int i = 0; i < len; i++ ) bits += obs.writeZeta( s.charAt( i ), 7 );
return bits;
}
public void startDocument( CharSequence title, CharSequence uri ) throws IOException {
documentsOutputBitStream.writtenBits( 0 );
bitsForUris += writeSelfDelimitedUtf8String( documentsOutputBitStream, uri == null ? "" : uri );
bitsForTitles += writeSelfDelimitedUtf8String( documentsOutputBitStream, title == null ? "" : title );
if ( hasNonText ) {
final ZipEntry currEntry = new ZipEntry( Integer.toString( documents ) );
nonTextZipOutputStream.putNextEntry( currEntry );
}
documents++;
}
public void startTextField() {
fieldContent.size( 0 );
fields++;
}
public void virtualField( final ObjectList fragments ) throws IOException {
nonTextZipDataOutputStream.writeInt( fragments.size() );
for ( VirtualDocumentFragment fragment: fragments ) {
fragment.documentSpecifier().writeSelfDelimUTF8( nonTextZipOutputStream );
fragment.text().writeSelfDelimUTF8( nonTextZipOutputStream );
}
}
@SuppressWarnings("unchecked")
public void build( final DocumentSequence inputSequence ) throws IOException {
final DocumentIterator docIt = inputSequence.iterator();
if ( factory != inputSequence.factory() ) throw new IllegalStateException( "The factory provided by the constructor does not correspond to the factory of the input sequence" );
final int numberOfFields = factory.numberOfFields();
WordReader wordReader;
MutableString word = new MutableString();
MutableString nonWord = new MutableString();
open( "" );
for (;;) {
Document document = docIt.nextDocument();
if ( document == null ) break;
startDocument( document.title(), document.uri() );
for ( int field = 0; field < numberOfFields; field++ ) {
Object content = document.content( field );
if ( factory.fieldType( field ) == FieldType.TEXT ) {
startTextField();
wordReader = document.wordReader( field );
wordReader.setReader( (Reader)content );
while ( wordReader.next( word, nonWord ) ) add( word, nonWord );
endTextField();
}
else if ( factory.fieldType( field ) == FieldType.VIRTUAL ) virtualField( (ObjectList)content );
else nonTextField( content );
}
document.close();
endDocument();
}
docIt.close();
close();
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy