src.it.unimi.di.mg4j.tool.PrecomputeIndex Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of mg4j Show documentation
Show all versions of mg4j Show documentation
MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections written in Java.
package it.unimi.di.mg4j.tool;
/*
* MG4J: Managing Gigabytes for Java
*
* Copyright (C) 2011-2012 Sebastiano Vigna
*
* This library is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 3 of the License, or (at your option)
* any later version.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*
*/
import it.unimi.dsi.Util;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.objects.Object2ObjectMap;
import it.unimi.dsi.fastutil.objects.Object2ObjectOpenHashMap;
import it.unimi.dsi.fastutil.objects.Object2ReferenceMap;
import it.unimi.dsi.fastutil.objects.Object2ReferenceOpenHashMap;
import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet;
import it.unimi.dsi.fastutil.objects.ObjectSet;
import it.unimi.dsi.io.FileLinesCollection;
import it.unimi.dsi.io.OutputBitStream;
import it.unimi.dsi.io.FileLinesCollection.FileLinesIterator;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.lang.ObjectParser;
import it.unimi.dsi.logging.ProgressLogger;
import it.unimi.di.mg4j.index.BitStreamHPIndexWriter;
import it.unimi.di.mg4j.index.BitStreamIndex;
import it.unimi.di.mg4j.index.BitStreamIndexWriter;
import it.unimi.di.mg4j.index.CompressionFlags;
import it.unimi.di.mg4j.index.DiskBasedIndex;
import it.unimi.di.mg4j.index.Index;
import it.unimi.di.mg4j.index.IndexReader;
import it.unimi.di.mg4j.index.IndexWriter;
import it.unimi.di.mg4j.index.SkipBitStreamIndexWriter;
import it.unimi.di.mg4j.index.TermProcessor;
import it.unimi.di.mg4j.index.CompressionFlags.Coding;
import it.unimi.di.mg4j.index.CompressionFlags.Component;
import it.unimi.di.mg4j.query.nodes.Query;
import it.unimi.di.mg4j.query.nodes.QueryBuilderVisitorException;
import it.unimi.di.mg4j.query.nodes.Term;
import it.unimi.di.mg4j.query.parser.QueryParserException;
import it.unimi.di.mg4j.query.parser.SimpleParser;
import it.unimi.di.mg4j.search.DocumentIterator;
import it.unimi.di.mg4j.search.DocumentIteratorBuilderVisitor;
import it.unimi.di.mg4j.search.IntervalIterator;
import it.unimi.dsi.util.Interval;
import it.unimi.dsi.util.Properties;
import java.io.BufferedWriter;
import java.io.Closeable;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.lang.reflect.InvocationTargetException;
import java.net.URISyntaxException;
import java.util.Arrays;
import java.util.Map;
import org.apache.commons.configuration.ConfigurationException;
import org.apache.log4j.Logger;
import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.Switch;
import com.martiansoftware.jsap.UnflaggedOption;
import static it.unimi.di.mg4j.search.DocumentIterator.END_OF_LIST;
/** Precomputes an index by evaluating a query template once for every term of an input index.
 *
 * <p>The query contains one or more occurrences of a <em>marker</em> term. For each term of the
 * first input index, the query is evaluated with every marker occurrence replaced by an index
 * iterator over that term; if the result is nonempty, the term is written to the output index
 * together with a posting list containing the documents (and, depending on the writer flags,
 * the counts and positions) returned by the query.
 *
 * @author Sebastiano Vigna
 * @since 4.0
 */
public class PrecomputeIndex {
	private static final Logger LOGGER = Util.getLogger( PrecomputeIndex.class );

	/** The marker used when none is specified explicitly. */
	public static final String DEFAULT_MARKER = "?";

	/** The overall number of documents. */
	protected final int numberOfDocuments;
	/** The output basename. */
	protected final String outputBasename;
	/** The logging interval. */
	private final long logInterval;
	/** The index writer for the precomputed index. */
	private final IndexWriter indexWriter;
	/** The main input index (the first one specified); its terms drive the precomputation. */
	private final BitStreamIndex mainIndex;
	/** The parsed representation of the query. */
	private final Query query;
	/** The visitor that will be used to instantiate the query. */
	private final ReplacingDocumentIteratorBuilderVisitor visitor;
	/** Whether the index we build should have positions. */
	private final boolean hasPositions;
	/** The field name to be recorded in the output properties, or <code>null</code> to use the field of the main index. */
	private final String fieldName;
	/** An iterator over the terms of the main index, read from its terms file. */
	private final FileLinesIterator terms;
	/** Whether the index we build should have counts. */
	private final boolean hasCounts;

	/** A document-iterator builder visitor that replaces each query term equal to a marker
	 * with an index iterator over {@link #currentTerm}.
	 *
	 * <p>Index readers are created lazily, one per term node, and reused by subsequent
	 * visits until {@link #close()} is called.
	 */
	protected final static class ReplacingDocumentIteratorBuilderVisitor extends DocumentIteratorBuilderVisitor implements Closeable {
		/** A map from {@linkplain Term term nodes} to the corresponding {@link IndexReader}. */
		private final Object2ObjectMap<Term,IndexReader> term2IndexReader;
		/** The buffer size for index readers. */
		private final int bufferSize;
		/** A marker string. Query terms equal to the marker will be replaced by this builder visitor with an index iterator over {@link #currentTerm}. */
		private final MutableString marker;
		/** The current term (as a term number) of this builder visitor. */
		protected int currentTerm;

		/** Creates a new replacing visitor.
		 *
		 * @param marker the marker: term nodes whose term equals it are resolved to {@link #currentTerm}.
		 * @param indexMap a map from field names to indices, passed to the superclass.
		 * @param defaultIndex the default index, passed to the superclass.
		 * @param limit the limit passed to the superclass.
		 * @param bufferSize the buffer size used when creating index readers.
		 */
		public ReplacingDocumentIteratorBuilderVisitor( final MutableString marker, final Object2ReferenceMap<String,Index> indexMap, final Index defaultIndex, final int limit, final int bufferSize ) {
			super( indexMap, defaultIndex, limit );
			this.marker = marker;
			this.bufferSize = bufferSize;
			term2IndexReader = new Object2ObjectOpenHashMap<Term,IndexReader>();
		}

		/** Returns a document iterator for a term node.
		 *
		 * <p>If the node term equals the marker, the iterator enumerates the documents of
		 * {@link #currentTerm}; otherwise, the node is resolved by term number (if set) or by term.
		 *
		 * @param node a term node.
		 * @return a weighted document iterator for the node.
		 * @throws QueryBuilderVisitorException if an {@link IOException} occurs while accessing the index.
		 */
		@Override
		public DocumentIterator visit( final Term node ) throws QueryBuilderVisitorException {
			try {
				// Get the current index reader for this node, or instantiate one lazily if necessary.
				IndexReader indexReader = term2IndexReader.get( node );
				if ( indexReader == null ) {
					indexReader = curr.top().getReader( bufferSize );
					term2IndexReader.put( node, indexReader );
				}

				if ( node.term != null && marker.equals( node.term ) ) {
					return indexReader.documents( currentTerm ).weight( weight() );
				}
				if ( node.termNumber != -1 ) {
					return indexReader.documents( node.termNumber ).weight( weight() );
				}
				return indexReader.documents( node.term ).weight( weight() );
			}
			catch ( IOException e ) {
				throw new QueryBuilderVisitorException( e );
			}
		}

		/** Closes all index readers created so far and forgets them. */
		@Override
		public void close() throws IOException {
			for ( IndexReader indexReader : term2IndexReader.values() ) {
				indexReader.close();
			}
			term2IndexReader.clear();
		}
	}

	/** Creates a new precomputation using the {@linkplain #DEFAULT_MARKER default marker}.
	 *
	 * @param outputBasename the basename of the precomputed index.
	 * @param inputBasename the basenames of the indices to be queried; the terms of the first one drive the precomputation.
	 * @param queryString the query, containing instances of the marker.
	 * @param fieldName the field name of the precomputed index, or <code>null</code> to use the field of the first input index.
	 * @param bufferSize the buffer size for index readers.
	 * @param writerFlags the compression flags for the index writer.
	 * @param interleaved forces an interleaved index.
	 * @param skips whether the index should have skips.
	 * @param quantum the skip quantum.
	 * @param height the skip height.
	 * @param skipBufferSize the size of the temporary buffer used by writers with skips.
	 * @param logInterval the logging interval in milliseconds.
	 * @throws QueryParserException if the query cannot be parsed.
	 * @throws QueryBuilderVisitorException
	 */
	public PrecomputeIndex( final String outputBasename, final String[] inputBasename, final String queryString, final String fieldName, final int bufferSize, final Map<Component,Coding> writerFlags, boolean interleaved, boolean skips, final int quantum, final int height, final int skipBufferSize, final long logInterval ) throws IOException, ConfigurationException, URISyntaxException, ClassNotFoundException, SecurityException, InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException, QueryParserException, QueryBuilderVisitorException {
		this( outputBasename, inputBasename, queryString, DEFAULT_MARKER, fieldName, bufferSize, writerFlags, interleaved, skips, quantum, height, skipBufferSize, logInterval );
	}

	/** Creates a new precomputation.
	 *
	 * @param outputBasename the basename of the precomputed index.
	 * @param inputBasename the basenames of the indices to be queried; the terms of the first one drive the precomputation.
	 * @param queryString the query, containing instances of the marker.
	 * @param marker the term marker: query terms equal to it are replaced by the current term of the first input index.
	 * @param fieldName the field name of the precomputed index, or <code>null</code> to use the field of the first input index.
	 * @param bufferSize the buffer size for index readers.
	 * @param writerFlags the compression flags for the index writer.
	 * @param interleaved forces an interleaved index.
	 * @param skips whether the index should have skips.
	 * @param quantum the skip quantum.
	 * @param height the skip height.
	 * @param skipBufferSize the size of the temporary buffer used by writers with skips.
	 * @param logInterval the logging interval in milliseconds.
	 * @throws IllegalArgumentException if no input index is specified, if the marker is empty, if the
	 * input indices disagree on the number of documents, or if counts or positions are requested but
	 * some input index has no positions.
	 * @throws QueryParserException if the query cannot be parsed.
	 * @throws QueryBuilderVisitorException
	 */
	public PrecomputeIndex( final String outputBasename, final String[] inputBasename, final String queryString, final String marker, final String fieldName, final int bufferSize, final Map<Component,Coding> writerFlags, boolean interleaved, boolean skips, final int quantum, final int height, final int skipBufferSize, final long logInterval ) throws IOException, ConfigurationException, URISyntaxException, ClassNotFoundException, SecurityException, InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException, QueryParserException, QueryBuilderVisitorException {
		if ( inputBasename.length == 0 ) throw new IllegalArgumentException( "You must specify at least one input index" );
		if ( marker == null || marker.length() == 0 ) throw new IllegalArgumentException( "The marker must be a nonempty string" );

		this.logInterval = logInterval;
		this.outputBasename = outputBasename;
		this.fieldName = fieldName;

		final BitStreamIndex[] index = new BitStreamIndex[ inputBasename.length ];
		// This will remain set only if *all* input indices have positions.
		boolean havePositions = true;
		// The set of indices (for the parser).
		final ObjectSet<String> indices = new ObjectOpenHashSet<String>();
		// The map from indices to term processors (for the parser).
		final Object2ObjectMap<String,TermProcessor> termProcessors = new Object2ObjectOpenHashMap<String,TermProcessor>();
		// The index map (for the visitor).
		final Object2ReferenceMap<String,Index> indexMap = new Object2ReferenceOpenHashMap<String,Index>();

		int numberOfDocuments = -1;
		for ( int i = 0; i < inputBasename.length; i++ ) {
			index[ i ] = (BitStreamIndex)Index.getInstance( inputBasename[ i ], true, false, true );
			indexMap.put( index[ i ].field, index[ i ] );
			if ( numberOfDocuments == -1 ) numberOfDocuments = index[ i ].numberOfDocuments;
			if ( numberOfDocuments != index[ i ].numberOfDocuments ) throw new IllegalArgumentException( "All indices must have the same number of documents" );
			indices.add( index[ i ].field );
			if ( index[ i ].termProcessor != null ) termProcessors.put( index[ i ].field, index[ i ].termProcessor );
			havePositions &= index[ i ].hasPositions;
		}

		mainIndex = index[ 0 ];
		this.numberOfDocuments = numberOfDocuments;

		// Validate the query and the flags before opening the terms file or creating any output file.
		query = new SimpleParser( indices, mainIndex.field, termProcessors ).parse( queryString );

		hasCounts = writerFlags.containsKey( Component.COUNTS );
		if ( hasCounts && ! havePositions ) throw new IllegalArgumentException( "Some of the indices to be combined do not have positions, which are necessary to compute counts." );
		hasPositions = writerFlags.containsKey( Component.POSITIONS );
		if ( hasPositions && ! havePositions ) throw new IllegalArgumentException( "Some of the indices to be combined do not have positions." );

		// The terms file is named after the part of the first basename preceding any '?'.
		final int questionMarkPos = inputBasename[ 0 ].indexOf( '?' );
		final String mainBasename = questionMarkPos == -1 ? inputBasename[ 0 ] : inputBasename[ 0 ].substring( 0, questionMarkPos );
		terms = new FileLinesCollection( mainBasename + DiskBasedIndex.TERMS_EXTENSION, "UTF-8" ).iterator();

		// Without positions we are forced to use an interleaved index.
		interleaved |= ! hasPositions;
		// High-performance indices always have skips.
		skips |= ! interleaved;

		if ( ! interleaved ) indexWriter = new BitStreamHPIndexWriter( outputBasename, numberOfDocuments, true, skipBufferSize, writerFlags, quantum, height );
		else if ( skips ) indexWriter = new SkipBitStreamIndexWriter( outputBasename, numberOfDocuments, true, skipBufferSize, writerFlags, quantum, height );
		else indexWriter = new BitStreamIndexWriter( outputBasename, numberOfDocuments, true, writerFlags );

		visitor = new ReplacingDocumentIteratorBuilderVisitor( new MutableString( marker ), indexMap, mainIndex, Integer.MAX_VALUE, bufferSize );
		LOGGER.debug( "Precomputing index " + outputBasename + " from " + Arrays.toString( inputBasename ) + " using query " + query );
	}

	/** Runs the precomputation, writing the index, its terms, frequencies, (optionally) occurrencies and properties.
	 *
	 * <p>For each term of the main index the query is evaluated twice: a first pass counts the
	 * resulting documents (the frequency must be written before the postings), and, if the result
	 * is nonempty, a second pass writes the posting list.
	 *
	 * @throws IOException if an I/O error occurs, including an error while writing the terms file.
	 * @throws IllegalStateException if counts are requested and the query returns an interval longer than one position.
	 */
	public void run() throws IOException, ConfigurationException, QueryBuilderVisitorException {
		final ProgressLogger pl = new ProgressLogger( LOGGER, logInterval );
		pl.displayFreeMemory = true;
		pl.expectedUpdates = mainIndex.numberOfTerms;
		pl.itemsName = "terms";
		pl.logInterval = logInterval;

		// To write the frequency of each term
		OutputBitStream frequencies = null;
		// To write the occurrency of each term
		OutputBitStream occurrencies = null;
		PrintWriter termFile = null;
		long totOccurrencies = 0;

		boolean completed = false;
		try {
			frequencies = new OutputBitStream( outputBasename + DiskBasedIndex.FREQUENCIES_EXTENSION );
			if ( hasPositions ) occurrencies = new OutputBitStream( outputBasename + DiskBasedIndex.OCCURRENCIES_EXTENSION );
			termFile = new PrintWriter( new BufferedWriter( new OutputStreamWriter( new FileOutputStream( outputBasename + DiskBasedIndex.TERMS_EXTENSION ), "UTF-8" ) ) );

			pl.start( "Precomputing..." );
			final IntArrayList positions = new IntArrayList();

			for ( int i = 0; i < mainIndex.numberOfTerms; i++ ) {
				final MutableString term = terms.next();
				visitor.currentTerm = i;

				// First pass: precompute frequency.
				DocumentIterator documentIterator = query.accept( visitor );
				long f = 0;
				while ( documentIterator.nextDocument() != END_OF_LIST ) f++;

				if ( f != 0 ) {
					termFile.println( term );
					indexWriter.newInvertedList();
					indexWriter.writeFrequency( (int)f );
					frequencies.writeLongGamma( f );

					// Second pass: write the posting list.
					documentIterator = query.accept( visitor );
					long occurrency = 0;
					for ( int d; ( d = documentIterator.nextDocument() ) != END_OF_LIST; ) {
						final OutputBitStream out = indexWriter.newDocumentRecord();
						indexWriter.writeDocumentPointer( out, d );
						if ( hasCounts ) {
							positions.clear();
							final IntervalIterator intervalIterator = documentIterator.intervalIterator();
							for ( Interval interval; ( interval = intervalIterator.nextInterval() ) != null; ) {
								// Only single-position intervals can be stored as positions.
								if ( interval.length() > 1 ) throw new IllegalStateException( "The query returned the interval " + interval + " of length greater than one for term " + term + " in document " + d );
								positions.add( interval.left );
							}
							indexWriter.writePositionCount( out, positions.size() );
							if ( hasPositions ) indexWriter.writeDocumentPositions( out, positions.elements(), 0, positions.size(), -1 );
							occurrency += positions.size();
						}
					}
					totOccurrencies += occurrency;
					if ( occurrencies != null ) occurrencies.writeLongGamma( occurrency );
				}
				pl.update();
			}
			completed = true;
		}
		finally {
			// On failure, release whatever was opened without masking the original exception.
			if ( ! completed ) releaseAfterFailure( frequencies, occurrencies, termFile );
		}

		visitor.close();
		frequencies.close();
		indexWriter.close();
		termFile.close();
		terms.close();
		if ( occurrencies != null ) occurrencies.close();

		// PrintWriter never throws on I/O errors: check explicitly so that a truncated terms file is not silently accepted.
		if ( termFile.checkError() ) throw new IOException( "An error occurred while writing " + outputBasename + DiskBasedIndex.TERMS_EXTENSION );

		pl.done();

		final Properties properties = indexWriter.properties();
		properties.addProperty( Index.PropertyKeys.TERMPROCESSOR, ObjectParser.toSpec( mainIndex.termProcessor ) );
		if ( hasPositions ) properties.addProperty( Index.PropertyKeys.OCCURRENCES, totOccurrencies );
		if ( fieldName != null || mainIndex.field != null ) properties.addProperty( Index.PropertyKeys.FIELD, fieldName != null ? fieldName : mainIndex.field );
		properties.save( outputBasename + DiskBasedIndex.PROPERTIES_EXTENSION );
	}

	/** Releases, on a best-effort basis, the resources used by {@link #run()} after a failure.
	 *
	 * <p>Exceptions thrown while closing are logged and otherwise ignored, so that the
	 * exception that caused the failure is the one reaching the caller.
	 *
	 * @param frequencies the frequencies stream, or <code>null</code> if it was not opened.
	 * @param occurrencies the occurrencies stream, or <code>null</code> if it was not opened.
	 * @param termFile the terms writer, or <code>null</code> if it was not opened.
	 */
	private void releaseAfterFailure( final OutputBitStream frequencies, final OutputBitStream occurrencies, final PrintWriter termFile ) {
		// The broad catches are deliberate: we are already propagating a failure and must not replace it.
		try {
			visitor.close();
		}
		catch ( Exception e ) {
			LOGGER.warn( "Cannot close the index readers", e );
		}
		try {
			if ( frequencies != null ) frequencies.close();
		}
		catch ( Exception e ) {
			LOGGER.warn( "Cannot close the frequencies stream", e );
		}
		try {
			if ( occurrencies != null ) occurrencies.close();
		}
		catch ( Exception e ) {
			LOGGER.warn( "Cannot close the occurrencies stream", e );
		}
		if ( termFile != null ) termFile.close();
		try {
			indexWriter.close();
		}
		catch ( Exception e ) {
			LOGGER.warn( "Cannot close the index writer", e );
		}
		try {
			terms.close();
		}
		catch ( Exception e ) {
			LOGGER.warn( "Cannot close the term iterator", e );
		}
	}

	/** Parses the command line and runs a precomputation.
	 *
	 * @param arg the command-line arguments.
	 */
	public static void main( final String[] arg ) throws JSAPException, ConfigurationException, IOException, URISyntaxException, ClassNotFoundException, SecurityException, InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException, QueryParserException, QueryBuilderVisitorException {
		SimpleJSAP jsap = new SimpleJSAP( PrecomputeIndex.class.getName(), "Precomputes an index using a query. The query will be run replacing a settable marker symbol with all terms of the first input index. All queries producing nonempty results will generate a posting list associated to the current term, and containing the results of the query.",
			new Parameter[] {
				new FlaggedOption( "bufferSize", JSAP.INTSIZE_PARSER, Util.formatBinarySize( Combine.DEFAULT_BUFFER_SIZE ), JSAP.NOT_REQUIRED, 'b', "buffer-size", "The size of an I/O buffer." ),
				new FlaggedOption( "comp", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'c', "comp", "A compression flag for the index (may be specified several times)." ).setAllowMultipleDeclarations( true ),
				new Switch( "noSkips", JSAP.NO_SHORTFLAG, "no-skips", "Disables skips." ),
				new Switch( "interleaved", JSAP.NO_SHORTFLAG, "interleaved", "Forces an interleaved index." ),
				new FlaggedOption( "quantum", JSAP.INTEGER_PARSER, Integer.toString( BitStreamIndex.DEFAULT_FIXED_QUANTUM ), JSAP.NOT_REQUIRED, 'Q', "quantum", "Enable skips with given quantum, if positive; fix space occupancy of variable-quantum skip towers in percentage if negative." ),
				new FlaggedOption( "height", JSAP.INTSIZE_PARSER, Integer.toString( BitStreamIndex.DEFAULT_HEIGHT ), JSAP.NOT_REQUIRED, 'H', "height", "The skip height." ),
				new FlaggedOption( "skipBufferSize", JSAP.INTSIZE_PARSER, Util.formatBinarySize( SkipBitStreamIndexWriter.DEFAULT_TEMP_BUFFER_SIZE ), JSAP.NOT_REQUIRED, JSAP.NO_SHORTFLAG, "skip-buffer-size", "The size of the internal temporary buffer used while creating an index with skips." ),
				new FlaggedOption( "logInterval", JSAP.LONG_PARSER, Long.toString( ProgressLogger.DEFAULT_LOG_INTERVAL ), JSAP.NOT_REQUIRED, 'l', "log-interval", "The minimum time interval between activity logs in milliseconds." ),
				new UnflaggedOption( "outputBasename", JSAP.STRING_PARSER, JSAP.REQUIRED, "The basename of the resulting index." ),
				new FlaggedOption( "fieldName", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'f', "field-name", "An optional field name for the precomputed index (by default, the same of the first input index)." ),
				new FlaggedOption( "marker", JSAP.STRING_PARSER, DEFAULT_MARKER, JSAP.NOT_REQUIRED, 'm', "marker", "The term marker: instances in the query will be replaced by the current term from the first input index." ),
				// The query is followed by a required argument, so it can never be omitted: declare it as required.
				new UnflaggedOption( "query", JSAP.STRING_PARSER, JSAP.REQUIRED, "A query containing instances of the marker." ),
				new UnflaggedOption( "inputBasename", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.GREEDY, "The basenames of the indices to be queried." )
			});

		JSAPResult jsapResult = jsap.parse( arg );
		if ( jsap.messagePrinted() ) return;

		final boolean skips = ! jsapResult.getBoolean( "noSkips" );
		final boolean interleaved = jsapResult.getBoolean( "interleaved" );
		if ( ! skips && ( jsapResult.userSpecified( "quantum" ) || jsapResult.userSpecified( "height" ) ) ) throw new IllegalArgumentException( "You specified quantum or height, but you also disabled skips." );

		new PrecomputeIndex( jsapResult.getString( "outputBasename" ),
			jsapResult.getStringArray( "inputBasename" ),
			jsapResult.getString( "query" ),
			jsapResult.getString( "marker" ),
			jsapResult.getString( "fieldName" ),
			jsapResult.getInt( "bufferSize" ),
			CompressionFlags.valueOf( jsapResult.getStringArray( "comp" ), CompressionFlags.DEFAULT_STANDARD_INDEX ),
			interleaved,
			skips,
			jsapResult.getInt( "quantum" ),
			jsapResult.getInt( "height" ),
			jsapResult.getInt( "skipBufferSize" ),
			jsapResult.getLong( "logInterval" ) ).run();
	}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy