
// src.it.unimi.di.mg4j.tool.PrecomputeIndex Maven / Gradle / Ivy
package it.unimi.di.mg4j.tool;
/*
* MG4J: Managing Gigabytes for Java
*
* Copyright (C) 2011-2012 Sebastiano Vigna
*
* This library is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 3 of the License, or (at your option)
* any later version.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*
*/
import it.unimi.dsi.Util;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.objects.Object2ObjectMap;
import it.unimi.dsi.fastutil.objects.Object2ObjectOpenHashMap;
import it.unimi.dsi.fastutil.objects.Object2ReferenceMap;
import it.unimi.dsi.fastutil.objects.Object2ReferenceOpenHashMap;
import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet;
import it.unimi.dsi.fastutil.objects.ObjectSet;
import it.unimi.dsi.io.FileLinesCollection;
import it.unimi.dsi.io.OutputBitStream;
import it.unimi.dsi.io.FileLinesCollection.FileLinesIterator;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.lang.ObjectParser;
import it.unimi.dsi.logging.ProgressLogger;
import it.unimi.di.mg4j.index.BitStreamHPIndexWriter;
import it.unimi.di.mg4j.index.BitStreamIndex;
import it.unimi.di.mg4j.index.BitStreamIndexWriter;
import it.unimi.di.mg4j.index.CompressionFlags;
import it.unimi.di.mg4j.index.DiskBasedIndex;
import it.unimi.di.mg4j.index.Index;
import it.unimi.di.mg4j.index.IndexReader;
import it.unimi.di.mg4j.index.IndexWriter;
import it.unimi.di.mg4j.index.SkipBitStreamIndexWriter;
import it.unimi.di.mg4j.index.TermProcessor;
import it.unimi.di.mg4j.index.CompressionFlags.Coding;
import it.unimi.di.mg4j.index.CompressionFlags.Component;
import it.unimi.di.mg4j.query.nodes.Query;
import it.unimi.di.mg4j.query.nodes.QueryBuilderVisitorException;
import it.unimi.di.mg4j.query.nodes.Term;
import it.unimi.di.mg4j.query.parser.QueryParserException;
import it.unimi.di.mg4j.query.parser.SimpleParser;
import it.unimi.di.mg4j.search.DocumentIterator;
import it.unimi.di.mg4j.search.DocumentIteratorBuilderVisitor;
import it.unimi.di.mg4j.search.IntervalIterator;
import it.unimi.dsi.util.Interval;
import it.unimi.dsi.util.Properties;
import java.io.BufferedWriter;
import java.io.Closeable;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.lang.reflect.InvocationTargetException;
import java.net.URISyntaxException;
import java.util.Arrays;
import java.util.Map;
import org.apache.commons.configuration.ConfigurationException;
import org.apache.log4j.Logger;
import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.Switch;
import com.martiansoftware.jsap.UnflaggedOption;
import static it.unimi.di.mg4j.search.DocumentIterator.END_OF_LIST;
/** Precomputes an index by evaluating a query once for each term of a main input index.
 *
 * <p>The query contains one or more instances of a <em>marker</em> term. For each term of the
 * first input index (the <em>main index</em>), every query term equal to the marker is replaced by
 * an index iterator over the current term, and the query is evaluated. If the result is nonempty,
 * a posting list for the current term containing the results is added to the output index;
 * terms whose query has no results do not appear in the output.
 *
 * @author Sebastiano Vigna
 * @since 4.0
 */
public class PrecomputeIndex {
	private static final Logger LOGGER = Util.getLogger( PrecomputeIndex.class );
	/** The marker used when none is specified. */
	public static final String DEFAULT_MARKER = "?";
	/** The overall number of documents. */
	protected final int numberOfDocuments;
	/** The output basename. */
	protected final String outputBasename;
	/** The logging interval. */
	private final long logInterval;
	/** The index writer for the precomputed index. */
	private final IndexWriter indexWriter;
	/** The main input index. */
	private final BitStreamIndex mainIndex;
	/** The parsed representation of the query. */
	private final Query query;
	/** The visitor that will be used to instantiate the query. */
	private final ReplacingDocumentIteratorBuilderVisitor visitor;
	/** Whether the index we build should have positions. */
	private final boolean hasPositions;
	/** The field name stored in the properties of the output index, or <code>null</code> to use the field of the main index. */
	private final String fieldName;
	/** An iterator over the lines of the term file of the main index. */
	private final FileLinesIterator terms;
	/** Whether the index we build should have counts. */
	private final boolean hasCounts;

	/** A builder visitor that replaces query terms equal to a marker with an index iterator
	 * over a settable {@linkplain #currentTerm current term}.
	 *
	 * <p>Index readers are created lazily, one per term node, and are kept until {@link #close()} is called.
	 */
	protected final static class ReplacingDocumentIteratorBuilderVisitor extends DocumentIteratorBuilderVisitor implements Closeable {
		/** A map from {@linkplain Term term nodes} to the corresponding {@link IndexReader}. */
		private final Object2ObjectMap<Term,IndexReader> term2IndexReader;
		/** The buffer size for index readers. */
		private final int bufferSize;
		/** A marker string. Query terms equal to the marker will be replaced by this builder visitor with an index iterator over {@link #currentTerm}. */
		private final MutableString marker;
		/** The current term of this builder visitor. */
		protected int currentTerm;

		/** Creates a new replacing builder visitor.
		 *
		 * @param marker the marker; query terms equal to it are replaced by {@link #currentTerm}.
		 * @param indexMap the index map, passed to the superclass constructor.
		 * @param defaultIndex the default index, passed to the superclass constructor.
		 * @param limit the limit, passed to the superclass constructor.
		 * @param bufferSize the buffer size used to create index readers.
		 */
		public ReplacingDocumentIteratorBuilderVisitor( final MutableString marker, final Object2ReferenceMap<String,Index> indexMap, final Index defaultIndex, final int limit, final int bufferSize ) {
			super( indexMap, defaultIndex, limit );
			this.marker = marker;
			this.bufferSize = bufferSize;
			term2IndexReader = new Object2ObjectOpenHashMap<Term,IndexReader>();
		}

		/** Returns an index iterator for a term node, using {@link #currentTerm} in place of the node
		 * term if the latter is equal to the marker.
		 *
		 * @param node a term node.
		 * @return an index iterator over {@link #currentTerm} if the node term equals the marker; otherwise, an
		 * index iterator over the term number of the node, if set, or over the term of the node.
		 * @throws QueryBuilderVisitorException if an {@link IOException} occurs while accessing the index.
		 */
		@Override
		public DocumentIterator visit( final Term node ) throws QueryBuilderVisitorException {
			try {
				// Get the current index reader for this node, or instantiate one lazily if necessary.
				IndexReader indexReader = term2IndexReader.get( node );
				if ( indexReader == null ) term2IndexReader.put( node, indexReader = curr.top().getReader( bufferSize ) );
				if ( node.term != null && marker.equals( node.term ) ) return indexReader.documents( currentTerm ).weight( weight() );
				if ( node.termNumber != -1 ) return indexReader.documents( node.termNumber ).weight( weight() );
				return indexReader.documents( node.term ).weight( weight() );
			}
			catch ( IOException e ) {
				throw new QueryBuilderVisitorException( e );
			}
		}

		/** Closes all index readers created so far and forgets them.
		 *
		 * <p>All readers are closed even if closing one of them fails: the first failure
		 * is rethrown at the end, and subsequent failures are logged.
		 *
		 * @throws IOException the first exception thrown while closing a reader, if any.
		 */
		@Override
		public void close() throws IOException {
			IOException firstFailure = null;
			for( IndexReader indexReader: term2IndexReader.values() ) {
				try {
					indexReader.close();
				}
				catch ( IOException e ) {
					if ( firstFailure == null ) firstFailure = e;
					else LOGGER.warn( "Could not close an index reader", e );
				}
			}
			term2IndexReader.clear();
			if ( firstFailure != null ) throw firstFailure;
		}
	}

	/** Precomputes an index using the {@linkplain #DEFAULT_MARKER default marker}.
	 *
	 * <p>This constructor is equivalent to the constructor accepting an explicit marker,
	 * invoked with {@link #DEFAULT_MARKER}.
	 *
	 * @param outputBasename the basename of the precomputed index.
	 * @param inputBasename the basenames of the indices to be queried; the first one is the main index.
	 * @param queryString a query containing instances of the marker.
	 * @param fieldName the field name of the precomputed index, or <code>null</code> to use the field of the main index.
	 * @param bufferSize the buffer size for index readers.
	 * @param writerFlags the flags for the index writer.
	 * @param interleaved forces an interleaved index.
	 * @param skips whether the index should have skips.
	 * @param quantum the quantum, passed to the index writer when skips are used.
	 * @param height the skip height, passed to the index writer when skips are used.
	 * @param skipBufferSize the size of the temporary buffer, passed to the index writer when skips are used.
	 * @param logInterval the logging interval.
	 * @throws QueryParserException if the query cannot be parsed.
	 * @throws QueryBuilderVisitorException
	 */
	public PrecomputeIndex( final String outputBasename, final String[] inputBasename, final String queryString, final String fieldName, final int bufferSize, final Map<Component,Coding> writerFlags, boolean interleaved, boolean skips, final int quantum, final int height, final int skipBufferSize, final long logInterval ) throws IOException, ConfigurationException, URISyntaxException, ClassNotFoundException, SecurityException, InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException, QueryParserException, QueryBuilderVisitorException {
		this( outputBasename, inputBasename, queryString, fieldName, DEFAULT_MARKER, bufferSize, writerFlags, interleaved, skips, quantum, height, skipBufferSize, logInterval );
	}

	/** Precomputes an index.
	 *
	 * <p>NOTE(review): the marker is compared with the terms of the <em>parsed</em> query; presumably a term
	 * processor that modifies the marker would prevent the replacement, so this should be verified
	 * before choosing a marker different from the default.
	 *
	 * @param outputBasename the basename of the precomputed index.
	 * @param inputBasename the basenames of the indices to be queried; the first one is the main index,
	 * whose terms replace the marker. All indices must have the same number of documents.
	 * @param queryString a query containing instances of the marker.
	 * @param fieldName the field name of the precomputed index, or <code>null</code> to use the field of the main index.
	 * @param marker the term marker: query terms equal to the marker are replaced by the current term of the main index.
	 * @param bufferSize the buffer size for index readers.
	 * @param writerFlags the flags for the index writer; the presence of {@link Component#COUNTS} and
	 * {@link Component#POSITIONS} determines whether counts and positions are written.
	 * @param interleaved forces an interleaved index (an interleaved index is used in any case if positions are not written).
	 * @param skips whether the index should have skips (skips are used in any case if the index is not interleaved).
	 * @param quantum the quantum, passed to the index writer when skips are used.
	 * @param height the skip height, passed to the index writer when skips are used.
	 * @param skipBufferSize the size of the temporary buffer, passed to the index writer when skips are used.
	 * @param logInterval the logging interval.
	 * @throws IllegalArgumentException if no input index is specified, if the marker is <code>null</code> or empty,
	 * if the input indices do not have the same number of documents, or if counts or positions are requested but
	 * some input index has no positions.
	 * @throws QueryParserException if the query cannot be parsed.
	 * @throws QueryBuilderVisitorException
	 */
	public PrecomputeIndex( final String outputBasename, final String[] inputBasename, final String queryString, final String fieldName, final String marker, final int bufferSize, final Map<Component,Coding> writerFlags, boolean interleaved, boolean skips, final int quantum, final int height, final int skipBufferSize, final long logInterval ) throws IOException, ConfigurationException, URISyntaxException, ClassNotFoundException, SecurityException, InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException, QueryParserException, QueryBuilderVisitorException {
		if ( inputBasename.length == 0 ) throw new IllegalArgumentException( "At least one input index must be specified" );
		if ( marker == null || marker.length() == 0 ) throw new IllegalArgumentException( "The marker must be a nonempty string" );
		this.logInterval = logInterval;
		this.outputBasename = outputBasename;
		this.fieldName = fieldName;
		final BitStreamIndex[] index = new BitStreamIndex[ inputBasename.length ];
		// This will remain set if *all* input indices have positions
		boolean havePositions = true;
		// The set of indices (for the parser).
		final ObjectSet<String> indices = new ObjectOpenHashSet<String>();
		// The map from indices to term processors (for the parser).
		final Object2ObjectMap<String,TermProcessor> termProcessors = new Object2ObjectOpenHashMap<String,TermProcessor>();
		// The index map (for the visitor).
		final Object2ReferenceMap<String,Index> indexMap = new Object2ReferenceOpenHashMap<String,Index>();
		int numberOfDocuments = -1;
		for( int i = 0; i < inputBasename.length; i++ ) {
			index[ i ] = (BitStreamIndex)Index.getInstance( inputBasename[ i ], true, false, true );
			indexMap.put( index[ i ].field, index[ i ] );
			if ( numberOfDocuments == -1 ) numberOfDocuments = index[ i ].numberOfDocuments;
			if ( numberOfDocuments != index[ i ].numberOfDocuments ) throw new IllegalArgumentException( "All indices must have the same number of documents" );
			indices.add( index[ i ].field );
			if ( index[ i ].termProcessor != null ) termProcessors.put( index[ i ].field, index[ i ].termProcessor );
			havePositions &= index[ i ].hasPositions;
		}
		mainIndex = index[ 0 ];
		this.numberOfDocuments = numberOfDocuments;

		query = new SimpleParser( indices, mainIndex.field, termProcessors ).parse( queryString );

		hasCounts = writerFlags.containsKey( Component.COUNTS );
		if ( hasCounts && ! havePositions ) throw new IllegalArgumentException( "Some of the indices to be combined do not have positions, which are necessary to compute counts." );
		hasPositions = writerFlags.containsKey( Component.POSITIONS );
		if ( hasPositions && ! havePositions ) throw new IllegalArgumentException( "Some of the indices to be combined do not have positions." );

		// The term file is opened only after all arguments have been validated, so that a failed check does not leave it open.
		final int questionMarkPos = inputBasename[ 0 ].indexOf( '?' );
		final String mainBasename = questionMarkPos == -1 ? inputBasename[ 0 ] : inputBasename[ 0 ].substring( 0, questionMarkPos );
		terms = new FileLinesCollection( mainBasename + DiskBasedIndex.TERMS_EXTENSION, "UTF-8" ).iterator();

		// If we do not write positions, we are forced to use an interleaved index.
		interleaved |= ! hasPositions;
		// High-performance indices always have skips.
		skips |= ! interleaved;
		if ( interleaved ) {
			if ( ! skips ) indexWriter = new BitStreamIndexWriter( outputBasename, numberOfDocuments, true, writerFlags );
			else indexWriter = new SkipBitStreamIndexWriter( outputBasename, numberOfDocuments, true, skipBufferSize, writerFlags, quantum, height );
		}
		else indexWriter = new BitStreamHPIndexWriter( outputBasename, numberOfDocuments, true, skipBufferSize, writerFlags, quantum, height );

		visitor = new ReplacingDocumentIteratorBuilderVisitor( new MutableString( marker ), indexMap, mainIndex, Integer.MAX_VALUE, bufferSize );
		LOGGER.debug( "Precomputing index " + outputBasename + " from " + Arrays.toString( inputBasename ) + " using query " + query );
	}

	/** Runs the precomputation, writing the output index and its term, frequency, occurrency (if positions
	 * are written) and property files.
	 *
	 * <p>For each term of the main index the query is evaluated twice: first to count the results
	 * (the frequency must be written before the postings), and then to write the postings.
	 *
	 * <p>If the computation fails, all resources used by this instance are released on a best-effort
	 * basis before the exception is propagated.
	 *
	 * @throws IOException if an I/O error occurs, including errors while writing the term file.
	 * @throws ConfigurationException
	 * @throws QueryBuilderVisitorException if the query cannot be instantiated.
	 */
	public void run() throws IOException, ConfigurationException, QueryBuilderVisitorException {
		final ProgressLogger pl = new ProgressLogger( LOGGER, logInterval );
		pl.displayFreeMemory = true;
		pl.expectedUpdates = mainIndex.numberOfTerms;
		pl.itemsName = "terms";
		pl.logInterval = logInterval;

		// To write the frequency of each term
		OutputBitStream frequencies = null;
		// To write the occurrency of each term
		OutputBitStream occurrencies = null;
		PrintWriter termFile = null;
		long totOccurrencies = 0;
		boolean completed = false;

		try {
			frequencies = new OutputBitStream( outputBasename + DiskBasedIndex.FREQUENCIES_EXTENSION );
			if ( hasPositions ) occurrencies = new OutputBitStream( outputBasename + DiskBasedIndex.OCCURRENCIES_EXTENSION );
			termFile = new PrintWriter( new BufferedWriter( new OutputStreamWriter( new FileOutputStream( outputBasename + DiskBasedIndex.TERMS_EXTENSION ), "UTF-8" ) ) );

			pl.start( "Precomputing..." );
			final IntArrayList positions = new IntArrayList();

			for( int i = 0; i < mainIndex.numberOfTerms; i++ ) {
				final MutableString term = terms.next();
				visitor.currentTerm = i;
				DocumentIterator documentIterator = query.accept( visitor );
				// Precompute frequency.
				long f = 0;
				while( ( documentIterator.nextDocument() ) != END_OF_LIST ) f++;
				if ( f != 0 ) {
					termFile.println( term );
					indexWriter.newInvertedList();
					indexWriter.writeFrequency( (int)f );
					frequencies.writeLongGamma( f );
					// Second pass: this time we actually write the postings.
					documentIterator = query.accept( visitor );
					long occurrency = 0;
					for( int d; ( d = documentIterator.nextDocument() ) != END_OF_LIST; ) {
						final OutputBitStream out = indexWriter.newDocumentRecord();
						indexWriter.writeDocumentPointer( out, d );
						if ( hasCounts ) {
							positions.clear();
							final IntervalIterator intervalIterator = documentIterator.intervalIterator();
							for( Interval interval; ( interval = intervalIterator.nextInterval() ) != null; ) {
								// Each interval becomes a position, so it cannot span more than one position.
								if ( interval.length() > 1 ) throw new IllegalStateException();
								positions.add( interval.left );
							}
							indexWriter.writePositionCount( out, positions.size() );
							if ( hasPositions ) indexWriter.writeDocumentPositions( out, positions.elements(), 0, positions.size(), -1 );
							occurrency += positions.size();
						}
					}
					totOccurrencies += occurrency;
					if ( occurrencies != null ) occurrencies.writeLongGamma( occurrency );
				}
				pl.update();
			}

			// A PrintWriter never throws on write errors: we must check explicitly.
			if ( termFile.checkError() ) throw new IOException( "An error occurred while writing " + outputBasename + DiskBasedIndex.TERMS_EXTENSION );

			visitor.close();
			frequencies.close();
			indexWriter.close();
			termFile.close();
			terms.close();
			if ( occurrencies != null ) occurrencies.close();
			completed = true;
		}
		finally {
			if ( ! completed ) releaseAfterFailure( frequencies, occurrencies, termFile );
		}

		pl.done();
		final Properties properties = indexWriter.properties();
		properties.addProperty( Index.PropertyKeys.TERMPROCESSOR, ObjectParser.toSpec( mainIndex.termProcessor ) );
		if ( hasPositions ) properties.addProperty( Index.PropertyKeys.OCCURRENCES, totOccurrencies );
		if ( fieldName != null || mainIndex.field != null ) properties.addProperty( Index.PropertyKeys.FIELD, fieldName != null ? fieldName: mainIndex.field );
		properties.save( outputBasename + DiskBasedIndex.PROPERTIES_EXTENSION );
	}

	/** Releases, on a best-effort basis, all resources used by {@link #run()} after a failure.
	 *
	 * <p>Exceptions thrown while closing are logged and otherwise ignored, so that
	 * they do not hide the exception that caused the failure.
	 *
	 * @param frequencies the frequency stream, or <code>null</code> if it was not opened.
	 * @param occurrencies the occurrency stream, or <code>null</code> if it was not opened.
	 * @param termFile the term file, or <code>null</code> if it was not opened.
	 */
	private void releaseAfterFailure( final OutputBitStream frequencies, final OutputBitStream occurrencies, final PrintWriter termFile ) {
		try {
			visitor.close();
		}
		catch ( Exception e ) {
			LOGGER.warn( "Could not close the index readers", e );
		}
		if ( frequencies != null ) {
			try {
				frequencies.close();
			}
			catch ( Exception e ) {
				LOGGER.warn( "Could not close the frequency file", e );
			}
		}
		try {
			indexWriter.close();
		}
		catch ( Exception e ) {
			LOGGER.warn( "Could not close the index writer", e );
		}
		// Closing a PrintWriter never throws.
		if ( termFile != null ) termFile.close();
		try {
			terms.close();
		}
		catch ( Exception e ) {
			LOGGER.warn( "Could not close the term file of the main index", e );
		}
		if ( occurrencies != null ) {
			try {
				occurrencies.close();
			}
			catch ( Exception e ) {
				LOGGER.warn( "Could not close the occurrency file", e );
			}
		}
	}

	/** Parses the command line and runs the precomputation.
	 *
	 * @param arg the command-line arguments.
	 * @throws IllegalArgumentException if a quantum or a height is specified together with <code>--no-skips</code>.
	 */
	public static void main( final String[] arg ) throws JSAPException, ConfigurationException, IOException, URISyntaxException, ClassNotFoundException, SecurityException, InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException, QueryParserException, QueryBuilderVisitorException {
		SimpleJSAP jsap = new SimpleJSAP( PrecomputeIndex.class.getName(), "Precomputes an index using a query. The query will be run replacing a settable marker symbol with all terms of the first input index. All queries producing nonempty results will generate a posting list associated to the current term, and containing the results of the query.",
			new Parameter[] {
				new FlaggedOption( "bufferSize", JSAP.INTSIZE_PARSER, Util.formatBinarySize( Combine.DEFAULT_BUFFER_SIZE ), JSAP.NOT_REQUIRED, 'b', "buffer-size", "The size of an I/O buffer." ),
				new FlaggedOption( "comp", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'c', "comp", "A compression flag for the index (may be specified several times)." ).setAllowMultipleDeclarations( true ),
				new Switch( "noSkips", JSAP.NO_SHORTFLAG, "no-skips", "Disables skips." ),
				new Switch( "interleaved", JSAP.NO_SHORTFLAG, "interleaved", "Forces an interleaved index." ),
				new FlaggedOption( "quantum", JSAP.INTEGER_PARSER, Integer.toString( BitStreamIndex.DEFAULT_FIXED_QUANTUM ), JSAP.NOT_REQUIRED, 'Q', "quantum", "Enable skips with given quantum, if positive; fix space occupancy of variable-quantum skip towers in percentage if negative." ),
				new FlaggedOption( "height", JSAP.INTSIZE_PARSER, Integer.toString( BitStreamIndex.DEFAULT_HEIGHT ), JSAP.NOT_REQUIRED, 'H', "height", "The skip height." ),
				new FlaggedOption( "skipBufferSize", JSAP.INTSIZE_PARSER, Util.formatBinarySize( SkipBitStreamIndexWriter.DEFAULT_TEMP_BUFFER_SIZE ), JSAP.NOT_REQUIRED, JSAP.NO_SHORTFLAG, "skip-buffer-size", "The size of the internal temporary buffer used while creating an index with skips." ),
				new FlaggedOption( "logInterval", JSAP.LONG_PARSER, Long.toString( ProgressLogger.DEFAULT_LOG_INTERVAL ), JSAP.NOT_REQUIRED, 'l', "log-interval", "The minimum time interval between activity logs in milliseconds." ),
				new UnflaggedOption( "outputBasename", JSAP.STRING_PARSER, JSAP.REQUIRED, "The basename of the resulting index." ),
				new FlaggedOption( "fieldName", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'f', "field-name", "An optional field name for the precomputed index (by default, the same of the first input index)." ),
				new FlaggedOption( "marker", JSAP.STRING_PARSER, DEFAULT_MARKER, JSAP.NOT_REQUIRED, 'm', "marker", "The term marker: instances in the query will be replaced by the current term from the first input index." ),
				// The query is positional and precedes a required option, so it can never be omitted.
				new UnflaggedOption( "query", JSAP.STRING_PARSER, JSAP.REQUIRED, "A query containing instances of the marker." ),
				new UnflaggedOption( "inputBasename", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.GREEDY, "The basenames of the indices to be queried." )
			});

		JSAPResult jsapResult = jsap.parse( arg );
		if ( jsap.messagePrinted() ) return;

		final boolean skips = ! jsapResult.getBoolean( "noSkips" );
		final boolean interleaved = jsapResult.getBoolean( "interleaved" );
		if ( ! skips && ( jsapResult.userSpecified( "quantum" ) || jsapResult.userSpecified( "height" ) ) ) throw new IllegalArgumentException( "You specified quantum or height, but you also disabled skips." );

		new PrecomputeIndex( jsapResult.getString( "outputBasename" ),
				jsapResult.getStringArray( "inputBasename" ),
				jsapResult.getString( "query" ),
				jsapResult.getString( "fieldName" ),
				jsapResult.getString( "marker" ),
				jsapResult.getInt( "bufferSize" ),
				CompressionFlags.valueOf( jsapResult.getStringArray( "comp" ), CompressionFlags.DEFAULT_STANDARD_INDEX ),
				interleaved,
				skips,
				jsapResult.getInt( "quantum" ),
				jsapResult.getInt( "height" ),
				jsapResult.getInt( "skipBufferSize" ),
				jsapResult.getLong( "logInterval" ) ).run();
	}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy