it.unimi.di.mg4j.tool.PrecomputeIndex

MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections written in Java.

package it.unimi.di.mg4j.tool;

/*		 
 * MG4J: Managing Gigabytes for Java
 *
 * Copyright (C) 2011-2012 Sebastiano Vigna 
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 3 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 */

import it.unimi.dsi.Util;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.objects.Object2ObjectMap;
import it.unimi.dsi.fastutil.objects.Object2ObjectOpenHashMap;
import it.unimi.dsi.fastutil.objects.Object2ReferenceMap;
import it.unimi.dsi.fastutil.objects.Object2ReferenceOpenHashMap;
import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet;
import it.unimi.dsi.fastutil.objects.ObjectSet;
import it.unimi.dsi.io.FileLinesCollection;
import it.unimi.dsi.io.OutputBitStream;
import it.unimi.dsi.io.FileLinesCollection.FileLinesIterator;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.lang.ObjectParser;
import it.unimi.dsi.logging.ProgressLogger;
import it.unimi.di.mg4j.index.BitStreamHPIndexWriter;
import it.unimi.di.mg4j.index.BitStreamIndex;
import it.unimi.di.mg4j.index.BitStreamIndexWriter;
import it.unimi.di.mg4j.index.CompressionFlags;
import it.unimi.di.mg4j.index.DiskBasedIndex;
import it.unimi.di.mg4j.index.Index;
import it.unimi.di.mg4j.index.IndexReader;
import it.unimi.di.mg4j.index.IndexWriter;
import it.unimi.di.mg4j.index.SkipBitStreamIndexWriter;
import it.unimi.di.mg4j.index.TermProcessor;
import it.unimi.di.mg4j.index.CompressionFlags.Coding;
import it.unimi.di.mg4j.index.CompressionFlags.Component;
import it.unimi.di.mg4j.query.nodes.Query;
import it.unimi.di.mg4j.query.nodes.QueryBuilderVisitorException;
import it.unimi.di.mg4j.query.nodes.Term;
import it.unimi.di.mg4j.query.parser.QueryParserException;
import it.unimi.di.mg4j.query.parser.SimpleParser;
import it.unimi.di.mg4j.search.DocumentIterator;
import it.unimi.di.mg4j.search.DocumentIteratorBuilderVisitor;
import it.unimi.di.mg4j.search.IntervalIterator;
import it.unimi.dsi.util.Interval;
import it.unimi.dsi.util.Properties;

import java.io.BufferedWriter;
import java.io.Closeable;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.lang.reflect.InvocationTargetException;
import java.net.URISyntaxException;
import java.util.Arrays;
import java.util.Map;

import org.apache.commons.configuration.ConfigurationException;
import org.apache.log4j.Logger;

import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.Switch;
import com.martiansoftware.jsap.UnflaggedOption;

import static it.unimi.di.mg4j.search.DocumentIterator.END_OF_LIST;

/** Precomputes an index using a query. The query is evaluated once for each term of the first input
 * index, replacing a settable marker symbol with the current term; every query producing nonempty
 * results generates a posting list, associated with the current term, that contains the results of the query.
 *  
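 * <p>For instance, the following invocation is a hypothetical example: it assumes two existing indices
 * with basenames <code>doc-title</code> and <code>doc-text</code> whose fields are named <code>title</code>
 * and <code>text</code>, and it builds an index <code>doc-both</code> in which the posting list of each
 * term <var>t</var> contains exactly the documents satisfying <code>title:t | text:t</code>:
 * <pre>
 * java it.unimi.di.mg4j.tool.PrecomputeIndex doc-both "title:? | text:?" doc-title doc-text
 * </pre>
 *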
 * @author Sebastiano Vigna
 * @since 4.0
 */

public class PrecomputeIndex {
	private static final Logger LOGGER = Util.getLogger( PrecomputeIndex.class );
	
	/** The overall number of documents. */
	protected final int numberOfDocuments;
	/** The output basename. */
	protected final String outputBasename;
	/** The logging interval. */
	private final long logInterval;
	/** The index writer for the merged index. */ 
	private final IndexWriter indexWriter;
	/** The main input index. */
	private final BitStreamIndex mainIndex;
	/** The parsed representation of the query. */
	private Query query;
	/** The visitor that will be used to instantiate the query. */
	private ReplacingDocumentIteratorBuilderVisitor visitor;
	/** Whether the index we build should have positions. */
	private boolean hasPositions;

	private String fieldName;

	private FileLinesIterator terms;

	private boolean hasCounts;
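	/** A {@link DocumentIteratorBuilderVisitor} that resolves marked query terms to the current term.
	 *
	 * <p>Term nodes whose term is equal to the {@linkplain #marker marker} are resolved to an index
	 * iterator over {@link #currentTerm}, which must be set externally before each visit; all other term
	 * nodes are resolved as usual. One {@link IndexReader} per term node is created lazily and cached,
	 * so readers are reused across per-term visits and are released only by {@link #close()}.
	 */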
	
	protected final static class ReplacingDocumentIteratorBuilderVisitor extends DocumentIteratorBuilderVisitor implements Closeable {
		/** A map from {@linkplain Term term nodes} to the corresponding {@link IndexReader}. */
		private final Object2ObjectMap<Term,IndexReader> term2IndexReader;
		/** The buffer size for index readers. */
		private final int bufferSize;
		/** A marker string. Query terms equal to the marker will be replaced by this builder visitor with an index iterator over {@link #currentTerm}. */
		private final MutableString marker;
		/** The current term of this builder visitor. */
		protected int currentTerm;
		
		public ReplacingDocumentIteratorBuilderVisitor( final MutableString marker, final Object2ReferenceMap<String,Index> indexMap, final Index defaultIndex, final int limit, final int bufferSize ) {
			super( indexMap, defaultIndex, limit );
			this.marker = marker;
			this.bufferSize = bufferSize;
			term2IndexReader = new Object2ObjectOpenHashMap<Term,IndexReader>();
		}

		@Override
		public DocumentIterator visit( Term node ) throws QueryBuilderVisitorException {
			try {
				// Get the current index reader for this node, or instantiate one lazily if necessary.
				IndexReader indexReader = term2IndexReader.get( node );
				if ( indexReader == null ) term2IndexReader.put( node, indexReader = curr.top().getReader( bufferSize ) ); 

				if ( node.term != null && marker.equals( node.term ) ) {
					return indexReader.documents( currentTerm ).weight( weight() );
				}
				else if ( node.termNumber != -1 ) return indexReader.documents( node.termNumber ).weight( weight() );
				return indexReader.documents( node.term ).weight( weight() );
			}
			catch ( IOException e ) {
				throw new QueryBuilderVisitorException( e );
			}
		}

		@Override
		public void close() throws IOException {
			for( IndexReader indexReader: term2IndexReader.values() ) indexReader.close();
			term2IndexReader.clear();
		}
	}
	
	/** Precomputes an index.
	 * 
	 * @param outputBasename the basename of the precomputed index.
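	 * @param inputBasename the basenames of the indices to be queried.
	 * @param queryString a query containing instances of the marker symbol.
	 * @param fieldName an optional field name for the precomputed index, or <code>null</code> to use the field of the first input index.
	 * @param bufferSize the size of an I/O buffer.
	 * @param writerFlags the compression flags for the index writer.
	 * @param interleaved whether to force an interleaved index.
	 * @param skips whether to insert skips.
	 * @param quantum the skip quantum.
	 * @param height the skip height.
	 * @param skipBufferSize the size of the internal temporary buffer used while creating an index with skips.
	 * @param logInterval the minimum time interval between activity logs in milliseconds.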
	 * @throws QueryParserException 
	 * @throws QueryBuilderVisitorException 
	 */
	public PrecomputeIndex( final String outputBasename, final String[] inputBasename, final String queryString, final String fieldName, final int bufferSize, final Map<Component,Coding> writerFlags, boolean interleaved, boolean skips, final int quantum, final int height, final int skipBufferSize, final long logInterval ) throws IOException, ConfigurationException, URISyntaxException, ClassNotFoundException, SecurityException, InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException, QueryParserException, QueryBuilderVisitorException {

		this.logInterval = logInterval;
		this.outputBasename = outputBasename;
		this.fieldName = fieldName;
		BitStreamIndex[] index = new BitStreamIndex[ inputBasename.length ];
		// This will remain true only if *all* input indices have positions
		boolean havePositions = true;

		// The set of indices (for the parser).
		final ObjectSet<String> indices = new ObjectOpenHashSet<String>();
		// The map from indices to term processors (for the parser).
		final Object2ObjectMap<String,TermProcessor> termProcessors = new Object2ObjectOpenHashMap<String,TermProcessor>();
		// The index map (for the visitor).
		final Object2ReferenceMap<String,Index> indexMap = new Object2ReferenceOpenHashMap<String,Index>();
		
		int numberOfDocuments = -1;

		for( int i = 0; i < inputBasename.length; i++ ) {
			index[ i ] = (BitStreamIndex)Index.getInstance( inputBasename[ i ], true, false, true );
			indexMap.put( index[ i ].field, index[ i ] );
			if ( numberOfDocuments == -1 ) numberOfDocuments = index[ i ].numberOfDocuments;
			if ( numberOfDocuments != index[ i ].numberOfDocuments ) throw new IllegalArgumentException( "All indices must have the same number of documents" );
			indices.add( index[ i ].field );
			if ( index[ i ].termProcessor != null ) termProcessors.put( index[ i ].field, index[ i ].termProcessor );
			havePositions &= index[ i ].hasPositions;
		}

		mainIndex = index[ 0 ];
		final int questionMarkPos = inputBasename[ 0 ].indexOf( '?' ); 
		terms = new FileLinesCollection( ( questionMarkPos == -1 ? inputBasename[ 0 ] : inputBasename[ 0 ].substring( 0, questionMarkPos ) ) + DiskBasedIndex.TERMS_EXTENSION, "UTF-8" ).iterator();

		this.numberOfDocuments = numberOfDocuments;
		hasPositions = havePositions;
		query = new SimpleParser( indices, mainIndex.field, termProcessors ).parse( queryString );

		if ( ( hasCounts = writerFlags.containsKey( Component.COUNTS ) ) && ! havePositions ) throw new IllegalArgumentException( "Some of the indices to be combined do not have positions, which are necessary to compute counts." );
		if ( ( hasPositions = writerFlags.containsKey( Component.POSITIONS ) ) && ! havePositions ) throw new IllegalArgumentException( "Some of the indices to be combined do not have positions." );
		
		// If the index we are building has no positions, we are forced to use an interleaved index.
		interleaved |= ! hasPositions;
		// High-performance indices always have skips.
		skips |= ! interleaved;
	
		if ( interleaved ) {
			if ( ! skips ) indexWriter = new BitStreamIndexWriter( outputBasename, numberOfDocuments, true, writerFlags );
			else indexWriter = new SkipBitStreamIndexWriter( outputBasename, numberOfDocuments, true, skipBufferSize, writerFlags, skips ? quantum : -1, skips ? height : -1 );
		}
		else indexWriter = new BitStreamHPIndexWriter( outputBasename, numberOfDocuments, true, skipBufferSize, writerFlags, quantum, height );

		visitor = new ReplacingDocumentIteratorBuilderVisitor( new MutableString("?"), indexMap, mainIndex, Integer.MAX_VALUE, bufferSize );

		LOGGER.debug( "Precomputing index " + outputBasename + " from " + Arrays.toString( inputBasename ) + " using query " + query );
	}
	
	
	public void run() throws IOException, ConfigurationException, QueryBuilderVisitorException {
		final ProgressLogger pl = new ProgressLogger( LOGGER, logInterval );
		pl.displayFreeMemory = true;

		// To write the frequency of each term
		final OutputBitStream frequencies = new OutputBitStream( outputBasename + DiskBasedIndex.FREQUENCIES_EXTENSION );
		// To write the occurrency of each term
		final OutputBitStream occurrencies = hasPositions ? new OutputBitStream( outputBasename + DiskBasedIndex.OCCURRENCIES_EXTENSION ) : null;
		final PrintWriter termFile = new PrintWriter( new BufferedWriter( new OutputStreamWriter( new FileOutputStream( outputBasename + DiskBasedIndex.TERMS_EXTENSION ), "UTF-8" ) ) );

		pl.expectedUpdates = mainIndex.numberOfTerms;
		pl.itemsName = "terms";
		pl.logInterval = logInterval;
		pl.start( "Precomputing..." );
		final IntArrayList positions = new IntArrayList();
		long totOccurrencies = 0;
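		/* For each term of the main index the query is evaluated twice: a first pass computes the
		 * frequency, which must be written before any document record, and a second pass writes the
		 * actual document pointers and, if requested, counts and positions. */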
		
		for( int i = 0; i < mainIndex.numberOfTerms; i++ ) {
			final MutableString term = terms.next();
			visitor.currentTerm = i;
			DocumentIterator documentIterator = query.accept( visitor );
			
			// Precompute frequency.
			long f = 0;
			while( ( documentIterator.nextDocument() ) != END_OF_LIST ) f++;

			if ( f != 0 ) {
				termFile.println( term );
				indexWriter.newInvertedList();
				indexWriter.writeFrequency( (int)f );
				frequencies.writeLongGamma( f );
				documentIterator = query.accept( visitor );
				long occurrency = 0;
				for( int d; ( d = documentIterator.nextDocument() ) != END_OF_LIST; ) {
					OutputBitStream out = indexWriter.newDocumentRecord();
					indexWriter.writeDocumentPointer( out, d );
					if ( hasCounts ) {
						positions.clear();
						IntervalIterator intervalIterator = documentIterator.intervalIterator();
						for( Interval interval; ( interval = intervalIterator.nextInterval() ) != null; ) {
							if ( interval.length() > 1 ) throw new IllegalStateException();
							positions.add( interval.left );
						}

						indexWriter.writePositionCount( out, positions.size() );
						if ( hasPositions ) indexWriter.writeDocumentPositions( out, positions.elements(), 0, positions.size(), -1 );
						occurrency += positions.size();
					}
				}

				totOccurrencies += occurrency;
				if ( occurrencies != null ) occurrencies.writeLongGamma( occurrency );
			}
			
			pl.update();
		}

		visitor.close();
		frequencies.close();
		indexWriter.close();
		termFile.close();
		terms.close();
		if ( occurrencies != null ) occurrencies.close();
		
		pl.done();

		final Properties properties = indexWriter.properties();
		properties.addProperty( Index.PropertyKeys.TERMPROCESSOR, ObjectParser.toSpec( mainIndex.termProcessor ) );
		if ( hasPositions ) properties.addProperty( Index.PropertyKeys.OCCURRENCES, totOccurrencies );
		if ( fieldName != null || mainIndex.field != null ) properties.addProperty( Index.PropertyKeys.FIELD, fieldName != null ? fieldName: mainIndex.field );
		properties.save( outputBasename + DiskBasedIndex.PROPERTIES_EXTENSION );
	}

	public static void main( final String[] arg ) throws JSAPException, ConfigurationException, IOException, URISyntaxException, ClassNotFoundException, SecurityException, InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException, QueryParserException, QueryBuilderVisitorException {
		
		SimpleJSAP jsap = new SimpleJSAP( PrecomputeIndex.class.getName(), "Precomputes an index using a query. The query will be run replacing a settable marker symbol with all terms of the first input index. All queries producing nonempty results will generate a posting list associated to the current term, and containing the results of the query.",
				new Parameter[] {
				new FlaggedOption( "bufferSize", JSAP.INTSIZE_PARSER, Util.formatBinarySize( Combine.DEFAULT_BUFFER_SIZE ), JSAP.NOT_REQUIRED, 'b', "buffer-size", "The size of an I/O buffer." ),
				new FlaggedOption( "comp", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'c', "comp", "A compression flag for the index (may be specified several times)." ).setAllowMultipleDeclarations( true ),
				new Switch( "noSkips", JSAP.NO_SHORTFLAG, "no-skips", "Disables skips." ),
				new Switch( "interleaved", JSAP.NO_SHORTFLAG, "interleaved", "Forces an interleaved index." ),
				new FlaggedOption( "quantum", JSAP.INTEGER_PARSER, Integer.toString( BitStreamIndex.DEFAULT_FIXED_QUANTUM ), JSAP.NOT_REQUIRED, 'Q', "quantum", "Enable skips with given quantum, if positive; fix space occupancy of variable-quantum skip towers in percentage if negative." ),
				new FlaggedOption( "height", JSAP.INTSIZE_PARSER, Integer.toString( BitStreamIndex.DEFAULT_HEIGHT ), JSAP.NOT_REQUIRED, 'H', "height", "The skip height." ),
				new FlaggedOption( "skipBufferSize", JSAP.INTSIZE_PARSER, Util.formatBinarySize( SkipBitStreamIndexWriter.DEFAULT_TEMP_BUFFER_SIZE ), JSAP.NOT_REQUIRED, JSAP.NO_SHORTFLAG, "skip-buffer-size", "The size of the internal temporary buffer used while creating an index with skips." ),
				new FlaggedOption( "logInterval", JSAP.LONG_PARSER, Long.toString( ProgressLogger.DEFAULT_LOG_INTERVAL ), JSAP.NOT_REQUIRED, 'l', "log-interval", "The minimum time interval between activity logs in milliseconds." ),
				new UnflaggedOption( "outputBasename", JSAP.STRING_PARSER, JSAP.REQUIRED, "The basename of the resulting index." ),
				new FlaggedOption( "fieldName", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'f', "field-name", "An optional field name for the precomputed index (by default, the same of the first input index)." ),
				new FlaggedOption( "marker", JSAP.STRING_PARSER, "?", JSAP.NOT_REQUIRED, 'm', "marker", "The term marker: instances in the query will be replaced by the current term from the first input index." ),
				new UnflaggedOption( "query", JSAP.STRING_PARSER, JSAP.NOT_REQUIRED, "A query containing instances of the marker." ),
				new UnflaggedOption( "inputBasename", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.GREEDY, "The basenames of the indices to be queried." )
		});
		
		JSAPResult jsapResult = jsap.parse( arg );
		if ( jsap.messagePrinted() ) return;

		final boolean skips = ! jsapResult.getBoolean( "noSkips" );
		final boolean interleaved = jsapResult.getBoolean( "interleaved" );
		if ( ! skips && ( jsapResult.userSpecified( "quantum" ) || jsapResult.userSpecified( "height" ) ) ) throw new IllegalArgumentException( "You specified quantum or height, but you also disabled skips." );
		
		new PrecomputeIndex( jsapResult.getString( "outputBasename" ), jsapResult.getStringArray( "inputBasename" ), jsapResult.getString( "query" ), jsapResult.getString( "fieldName" ),
				jsapResult.getInt( "bufferSize" ),
				CompressionFlags.valueOf( jsapResult.getStringArray( "comp" ), CompressionFlags.DEFAULT_STANDARD_INDEX ),
				interleaved,
				skips,
				jsapResult.getInt( "quantum" ),
				jsapResult.getInt( "height" ),
				jsapResult.getInt( "skipBufferSize" ),
				jsapResult.getLong( "logInterval" ) ).run(); 
	}
}



