src.it.unimi.di.archive4j.scratch.EstimatePositions Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of archive4j Show documentation
Archive4J is a suite of tools to store compactly term/count information of a document collection.
There is a newer version: 1.3.3
package it.unimi.di.archive4j.scratch;

/*
 * Copyright (C) 2006-2013 Paolo Boldi, Massimo Santini and Sebastiano Vigna
 *
 *  This program is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU General Public License as published by the Free
 *  Software Foundation; either version 2 of the License, or (at your option)
 *  any later version.
 *
 *  This program is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 */
import it.unimi.di.big.mg4j.document.AbstractDocumentSequence;
import it.unimi.di.big.mg4j.document.Document;
import it.unimi.di.big.mg4j.document.DocumentIterator;
import it.unimi.di.big.mg4j.document.DocumentSequence;
import it.unimi.di.big.mg4j.index.TermProcessor;
import it.unimi.di.archive4j.SequentialBitstreamArchive.CompressionFlags.Coding;
import it.unimi.di.archive4j.tool.Preprocess;
import it.unimi.dsi.fastutil.Hash;
import it.unimi.dsi.fastutil.ints.Int2IntLinkedOpenHashMap;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet;
import it.unimi.dsi.io.FileLinesCollection;
import it.unimi.dsi.io.NullOutputStream;
import it.unimi.dsi.io.OutputBitStream;
import it.unimi.dsi.io.WordReader;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.lang.ObjectParser;
import it.unimi.dsi.logging.ProgressLogger;
import it.unimi.dsi.util.StringMap;

import java.io.IOException;
import java.io.PrintWriter;
import java.io.Reader;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import cern.colt.Sorting;
import cern.colt.function.IntComparator;

import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.Switch;
import com.martiansoftware.jsap.UnflaggedOption;

/**
 * Estimates position cost.
 * 
 * @author Alessio Orlandi
 */

public class EstimatePositions {
	protected static final Logger LOGGER = LoggerFactory.getLogger( EstimatePositions.class );

	/**
	 * Estimates how many bits are needed to code term occurrences.
	 *
	 * <p>Codes are written to {@link #obs} and their length is read back through
	 * {@code obs.writtenBits()}. In one-pass mode every occurrence is gamma-coded
	 * by its global rank as soon as it is appended. In two-pass mode occurrences
	 * are buffered for the current document and coded by {@link #nextDocument()},
	 * using the index of the term in the sorted list of the document's distinct terms.
	 */
	protected static class EstimatingWriter {

		/** Stream underlying {@link #obs}, obtained from {@code NullOutputStream.getInstance()}. */
		final NullOutputStream nos = NullOutputStream.getInstance();

		/** Bit stream used only to measure code lengths via {@code writtenBits()}. */
		final OutputBitStream obs = new OutputBitStream( nos, 0 );

		/** Term to global-count rank. */
		final protected int term2Rank[];

		/** Terms in the current document. */
		@SuppressWarnings("hiding")
		private IntArrayList terms = new IntArrayList();

		/** Count of each term in the current document. */
		protected Int2IntLinkedOpenHashMap termsMap = new Int2IntLinkedOpenHashMap();

		/** Whether occurrences are buffered per document ({@code twopass}) and sorted by in-document count ({@code local}). */
		private boolean twopass, local;

		/**
		 * Counters for each term, and the position for the 'missing' term:
		 * {@code counters[r]} accumulates the bits attributed to the term of global rank {@code r},
		 * while {@code counters[missingTerm]} counts occurrences whose term id is negative.
		 */
		public int counters[], missingTerm;

		/** Total number of bits accounted for so far, and number of occurrences with a nonnegative term id. */
		public long written = 0, words;

		/** Coding applied to in-document term indices in two-pass mode. */
		protected Coding coding;

		/**
		 * Creates a new estimating writer.
		 *
		 * @param counts global counts, indexed by term; used to rank terms in descending count order.
		 * @param twopass whether to buffer each document and code it in {@link #nextDocument()}.
		 * @param local whether to sort a document's terms by their in-document count; implies two-pass mode.
		 * @param coding the coding used in two-pass mode (one-pass mode always writes gamma codes).
		 */
		public EstimatingWriter( final long[] counts, boolean twopass, boolean local, Coding coding ) {
			this.twopass = twopass || local;
			this.local = local;
			this.coding = coding;

			final int rank2Term[] = new int[ counts.length ];
			for ( int i = counts.length; i-- != 0; )
				rank2Term[ i ] = i;

			Sorting.quickSort( rank2Term, 0, rank2Term.length, new IntComparator() {
				// Sort in descending order using counts. The counts are compared directly:
				// narrowing their long difference to int could flip its sign or yield 0.
				public int compare( final int a, final int b ) {
					final long countA = counts[ a ], countB = counts[ b ];
					return countB < countA ? -1 : ( countB == countA ? 0 : 1 );
				}
			} );

			term2Rank = new int[ counts.length ];
			for ( int i = rank2Term.length; i-- != 0; )
				term2Rank[ rank2Term[ i ] ] = i;

			// One extra slot, at index counts.length, is reserved for the 'missing' term.
			counters = new int[ counts.length + 1 ];
			missingTerm = counts.length;
		}

		/**
		 * Returns the number of passes this writer performs.
		 *
		 * @return 2 in two-pass mode, 1 otherwise.
		 */
		public int numberOfPasses() {
			return ( twopass ? 2 : 1 );
		}

		/**
		 * Records one occurrence of a term in the current document.
		 *
		 * @param term the term id; a negative value denotes the 'missing' term.
		 * @throws IOException if writing to the bit stream fails.
		 */
		public void append( final int term ) throws IOException {
			/* Two pass: we first need T/F information, so we collect the term. */
			if ( twopass ) {
				terms.add( term );
				return;
			}

			/* One pass: just count gamma-coding the current term. */
			if ( term < 0 ) {
				counters[ missingTerm ]++;
				return;
			}

			final int rank = term2Rank[ term ];
			obs.writeGamma( rank );
			final long writtenNow = obs.writtenBits();
			counters[ rank ] += writtenNow - written;
			written = writtenNow;
			words++;
		}

		/**
		 * Closes the current document. In one-pass mode this does nothing; in two-pass mode
		 * the buffered occurrences are grouped by term, the distinct terms are sorted
		 * (by in-document count if {@code local}, by global rank otherwise) and each term
		 * is charged the length of the code of its index times its in-document frequency.
		 *
		 * @throws IOException if writing to the bit stream fails.
		 */
		public void nextDocument() throws IOException {

			if ( !twopass ) return;

			termsMap.clear();

			// Build T/F, unsorted.
			for ( int t : terms )
				termsMap.put( t, termsMap.get( t ) + 1 );

			final int[] tpTerms = termsMap.keySet().toIntArray();

			/* Sort elements in tpTerms via some criteria. */
			if ( local ) twoPassLocal( tpTerms );
			else twoPassGlobal( tpTerms );

			LOGGER.debug( "Writing {} terms covering {} positions", tpTerms.length, terms.size() );

			for ( int i = 0; i < tpTerms.length; i++ ) {
				final int freq = termsMap.get( tpTerms[ i ] );
				if ( tpTerms[ i ] < 0 ) counters[ missingTerm ] += freq;
				else {
					// Each position is represented by the term rank in the sorted list.
					// The bit counter is reset so that writtenBits() is the length of this single code.
					obs.writtenBits( 0 );

					// Codings not listed below write nothing and thus contribute zero bits.
					switch ( coding ) {
					case DELTA:
						obs.writeDelta( i );
						break;
					case GAMMA:
						obs.writeGamma( i );
						break;
					case NIBBLE:
						obs.writeNibble( i );
						break;
					case UNARY: // FIXME This is an hack.
						obs.writeMinimalBinary( i, tpTerms.length + 1 );
						break;
					case ZETA_2:
						obs.writeZeta( i, 2 );
						break;
					case ZETA_3:
						obs.writeZeta( i, 3 );
						break;
					case ZETA_4:
						obs.writeZeta( i, 4 );
						break;
					case ZETA_5:
						obs.writeZeta( i, 5 );
						break;
					case SHIFTED_GAMMA:
						obs.writeShiftedGamma( i );
						break;

					}

					// Computed in long so that the running total cannot be hit by an int overflow.
					final long quantity = obs.writtenBits() * freq;
					counters[ term2Rank[ tpTerms[ i ] ] ] += quantity;
					written += quantity;
					words += freq;
				}

			}

			terms.clear();
		}

		/**
		 * Sorts the given terms by descending count in the current document.
		 *
		 * @param tpTerms the distinct terms of the current document; sorted in place.
		 */
		protected void twoPassLocal( final int[] tpTerms ) {
			/* Renumbers term locally: sort them by descending count. */
			Sorting.quickSort( tpTerms, 0, tpTerms.length, new IntComparator() {
				public int compare( final int a, final int b ) {
					return termsMap.get( b ) - termsMap.get( a );
				}
			} );
		}

		/**
		 * Sorts the given terms by ascending global rank (i.e., descending global count);
		 * negative terms are treated as having rank -1 and hence come first.
		 *
		 * @param tpTerms the distinct terms of the current document; sorted in place.
		 */
		protected void twoPassGlobal( final int[] tpTerms ) {
			/* Renumbers term globally: sort them by descending global count. */
			Sorting.quickSort( tpTerms, 0, tpTerms.length, new IntComparator() {
				public int compare( final int a, final int b ) {
					return ( a < 0 ? -1 : term2Rank[ a ] ) - ( b < 0 ? -1 : term2Rank[ b ] );
				}
			} );
		}

	}



	/** Term processor applied by {@code buildTF} to every word; set by {@code main} from the {@code termprocessor} option. */
	protected static TermProcessor termProcessor;

	/**
	 * Strings loaded by {@code main} from the file named by a {@code removelist} argument,
	 * when one is reported as user-specified; no other use is visible in this file.
	 */
	protected static ObjectOpenHashSet removed;

	/** Map queried by {@code buildTF} to turn each processed word into a numeric term id; loaded by {@code main}. */
	protected static StringMap terms;

	/**
	 * Map queried by {@code run} with a document URI to obtain the document id;
	 * {@code null} when no {@code urlmap} option is given, in which case ids are assigned sequentially.
	 */
	protected static StringMap urls;

	/** The document sequence, loaded by {@code main} and scanned by {@code run}. */
	protected static DocumentSequence sequence;

	/** Allocated by {@code main} with one slot per entry of {@link #terms}; not otherwise used in this file. */
	protected static int[] allCount;

	/** Reusable buffers that {@code buildTF} passes to {@code WordReader.next()}. */
	protected static final MutableString wordString = new MutableString(), nonWordString = new MutableString();

	/**
	 * Command-line entry point: parses the options, loads the document sequence and the
	 * preprocessing data (generating it first if requested), scans all documents through an
	 * {@link EstimatingWriter} and writes the resulting counters to the output file.
	 *
	 * @param args the command-line arguments.
	 * @throws Exception if parsing, loading, scanning or writing fails.
	 */
	@SuppressWarnings("unchecked")
	public static void main( String[] args ) throws Exception {

		SimpleJSAP jsap = new SimpleJSAP( EstimatePositions.class.getName(), "Estimates.",
				new Parameter[] {

						new Switch( "twopass", '2', "two-pass", "Use indirect referral of terms into counting (2-pass parsing)" ),
						new Switch( "local", 'l', "local", "Local count re-sorting." ),
						new FlaggedOption( "coding", JSAP.STRING_PARSER, "UNARY", JSAP.NOT_REQUIRED, 'c', "coding",
								"Specify code (unary,gamma,shifted_gamma,nibble,delta,zeta_K) for a given component in the form Component:Coding" ).setAllowMultipleDeclarations( true ),
						new Switch( "sorted", 's', "sorted", "Tells if we are guaranteed that input document are sorted (guarantees random access)" ),
						new FlaggedOption( "stringmap", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'm', "stringmap",
								"Specify an alternative string map different from preprocessed one (if any)" ),
						new Switch( "ascii", 'A', "ascii", "Make (debugging) output in ASCII format on stdout" ),
						new FlaggedOption( "urlmap", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'u', "urlmap", "The filename for the graph URLs dictionary" ),
						new FlaggedOption( "sequence", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'S', "sequence", "The filename for the serialized collection containing documents" ),
						new FlaggedOption( "termprocessor", new ObjectParser( TermProcessor.class,
								new String[] { "it.unimi.di.big.mg4j.index", "it.unimi.di.big.mg4j.snowball", "it.unimi.dsi.law.archive" }, new String[] { "getInstance", "" } ),
								"it.unimi.di.big.mg4j.index.NullTermProcessor", JSAP.NOT_REQUIRED, 't', "termprocessor", "The term processor to apply" ),
						new FlaggedOption( "indexedField", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'I', "indexed-field", "The field of the document factory that will be indexed" ),
						new Switch( "generate", 'g', "generate", "Generates preprocessing data automatically." ),
						new FlaggedOption( "preprocessed", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'p', "preprocessed", "Preprocessed data basename" ),
						// The long flag must be given explicitly: with six arguments the last string is the long flag, not the help.
						new FlaggedOption( "numberOfDocuments", JSAP.INTEGER_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'n', "number-of-documents", "Number of documents in the archive" ),
						new UnflaggedOption( "output", JSAP.STRING_PARSER, JSAP.REQUIRED, "The filename for the output  data file" )

				} );

		JSAPResult jsapResult = jsap.parse( args );
		if ( jsap.messagePrinted() ) return;

		// Exactly one source of preprocessing data is allowed.
		final boolean generate = jsapResult.userSpecified( "generate" );
		final boolean preprocessed = jsapResult.userSpecified( "preprocessed" );
		if ( generate && preprocessed ) {
			LOGGER.error( "Options 'generate' and 'preprocessed' are mutually exclusive." );
			return;
		}
		if ( !generate && !preprocessed ) {
			LOGGER.error( "Exactly one of the options 'generate' and 'preprocessed' must be specified." );
			return;
		}

		final String outputFilename = jsapResult.getString( "output" );
		final EstimatingWriter writer;

		sequence = AbstractDocumentSequence.load( jsapResult.getString( "sequence" ) );
		try {
			urls = ( !jsapResult.userSpecified( "urlmap" ) ? null : (StringMap)BinIO.loadObject( jsapResult.getString( "urlmap" ) ) );

			// NOTE(review): no 'removelist' parameter is registered above, so this branch looks unreachable -- confirm.
			if ( jsapResult.userSpecified( "removelist" ) ) {
				final FileLinesCollection lines = new FileLinesCollection( jsapResult.getString( "removelist" ), "UTF-8" );

				removed = new ObjectOpenHashSet( lines.size(), Hash.VERY_FAST_LOAD_FACTOR );
				for ( it.unimi.dsi.lang.MutableString s : lines )
					removed.add( s.copy() );
			}

			termProcessor = (TermProcessor)jsapResult.getObject( "termprocessor" );

			final String preprocessBasename;
			if ( generate ) {
				preprocessBasename = outputFilename + "-gpp";
				Preprocess.run( outputFilename, sequence, termProcessor, jsapResult.getString( "indexedField" ) );
			}
			else preprocessBasename = jsapResult.getString( "preprocessed" );

			terms = (StringMap)BinIO.loadObject( ( jsapResult.userSpecified( "stringmap" ) ? jsapResult.getString( "stringmap" ) : preprocessBasename + Preprocess.TERMS_EXTENSION ) );

			allCount = new int[ terms.size() ];

			// The help text advertises lower-case coding names, whereas the enum constants are upper case.
			final Coding coding = Coding.valueOf( jsapResult.getString( "coding" ).toUpperCase( java.util.Locale.ROOT ) );

			writer = new EstimatingWriter( BinIO.loadLongs( preprocessBasename + Preprocess.COUNTS_EXTENSION ), jsapResult.getBoolean( "twopass" ),
					jsapResult.getBoolean( "local" ), coding );
			final ProgressLogger pl = new ProgressLogger( LOGGER );

			pl.itemsName = "documents";
			pl.logInterval = ProgressLogger.TEN_SECONDS;
			pl.displayFreeMemory = true;
			pl.start( "Parsing documents..." );

			run( pl, writer, sequence.factory().fieldIndex( jsapResult.getString( "indexedField" ) ) );

			pl.done();
		}
		finally {
			// Release the sequence even if loading or scanning fails.
			sequence.close();
		}

		final PrintWriter pw = new PrintWriter( outputFilename );
		try {
			pw.println( "Total number of written bytes: " + writer.written / 8 );
			pw.println( "Instances for the 'missing' term (over " + writer.words + " words): " + writer.counters[ writer.missingTerm ] );
			pw.println( "Counters for single term bits (by frequency-rank descending order) " );
			pw.println( "----" );
			// The last slot holds the 'missing' term, already reported above.
			for ( int i = 0; i < writer.counters.length - 1; i++ )
				pw.println( i + " " + writer.counters[ i ] );
			// PrintWriter swallows I/O errors: surface them instead of leaving a truncated file unnoticed.
			if ( pw.checkError() ) throw new IOException( "An error occurred while writing " + outputFilename );
		}
		finally {
			pw.close();
		}
	}

	/**
	 * Feeds every non-empty word returned by the given reader to the given writer.
	 *
	 * <p>Each word is passed to the static {@link #termProcessor}: if the processor rejects it,
	 * -1 is appended; otherwise the value returned for the (processed) word by the static
	 * {@link #terms} map, narrowed to {@code int}, is appended.
	 *
	 * @param reader the word reader to consume; it must already be attached to its input.
	 * @param writer the writer receiving one {@code append()} call per non-empty word.
	 * @throws IOException if reading from the reader or appending to the writer fails.
	 */
	protected final static void buildTF( WordReader reader, EstimatingWriter writer ) throws IOException {
		// Copying fields speeds up the computation;
		final TermProcessor processor = termProcessor;
		final StringMap map = terms;
		final MutableString word = wordString, nonWord = nonWordString;

		while ( reader.next( word, nonWord ) ) {
			if ( word.length() == 0 ) continue;

			// processTerm() may modify word in place, so the lookup must come after it.
			writer.append( processor.processTerm( word ) ? (int)map.getLong( word ) : -1 );
		}
	}


	/**
	 * Scans all documents of the static {@link #sequence}, feeding the words of the given field
	 * to the given writer and closing each document on the writer once it has been read.
	 *
	 * <p>Document ids come from the static {@link #urls} map when it is not {@code null}, and
	 * from a running counter otherwise; documents with a negative id are skipped.
	 *
	 * @param pl a progress logger updated once per processed document, or {@code null}.
	 * @param writer the writer accumulating the estimate.
	 * @param field the index of the field to read from each document.
	 * @throws IllegalArgumentException if {@code field} is -1.
	 * @throws Exception if iterating, reading or estimating fails.
	 */
	public static void run( ProgressLogger pl, EstimatingWriter writer, int field ) throws Exception {
		// Validate before acquiring the iterator, so that nothing is left open on failure.
		if ( field == -1 ) throw new IllegalArgumentException( "Invalid field index: " + field );

		final DocumentIterator dIterator = sequence.iterator();
		try {
			int urlId = 0;
			Document d;

			while ( ( d = dIterator.nextDocument() ) != null ) {
				try {
					final int docId = (int)( urls != null ? urls.getLong( d.uri() ) : urlId++ );
					if ( docId < 0 ) {
						LOGGER.info( "Skipping document with unknown id for uri : {}", d.uri() );
						continue;
					}
					LOGGER.debug( "Indexing field {} from {}", Integer.valueOf( field ), d.uri() );

					final WordReader wReader = d.wordReader( field );
					wReader.setReader( (Reader)d.content( field ) );
					buildTF( wReader, writer );

					writer.nextDocument();

					if ( pl != null ) pl.lightUpdate();
				}
				finally {
					// Runs for skipped documents and on failure, too.
					d.close();
				}
			}
		}
		finally {
			dIterator.close();
		}
	}

}