// [Web-page header captured together with this file; not part of the Java source]
// JAR search and dependency download from the Maven repository
// src.it.unimi.dsi.big.mg4j.query.QueryEngine (artifact: mg4j-big) -- Maven / Gradle / Ivy
package it.unimi.dsi.big.mg4j.query;
/*
* MG4J: Managing Gigabytes for Java (big)
*
* Copyright (C) 2005-2011 Sebastiano Vigna
*
* This library is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 3 of the License, or (at your option)
* any later version.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*
*/
import it.unimi.dsi.Util;
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
import it.unimi.dsi.fastutil.longs.LongSet;
import it.unimi.dsi.fastutil.objects.Object2ReferenceMap;
import it.unimi.dsi.fastutil.objects.ObjectArrayList;
import it.unimi.dsi.fastutil.objects.ObjectArrays;
import it.unimi.dsi.fastutil.objects.Reference2DoubleMap;
import it.unimi.dsi.fastutil.objects.Reference2DoubleOpenHashMap;
import it.unimi.dsi.fastutil.objects.Reference2ObjectArrayMap;
import it.unimi.dsi.fastutil.objects.Reference2ObjectMap;
import it.unimi.dsi.lang.FlyweightPrototype;
import it.unimi.dsi.lang.FlyweightPrototypes;
import it.unimi.dsi.big.mg4j.index.Index;
import it.unimi.dsi.big.mg4j.query.nodes.Query;
import it.unimi.dsi.big.mg4j.query.nodes.QueryBuilderVisitor;
import it.unimi.dsi.big.mg4j.query.nodes.QueryBuilderVisitorException;
import it.unimi.dsi.big.mg4j.query.nodes.QueryTransformer;
import it.unimi.dsi.big.mg4j.query.parser.QueryParser;
import it.unimi.dsi.big.mg4j.query.parser.QueryParserException;
import it.unimi.dsi.big.mg4j.search.DocumentIterator;
import it.unimi.dsi.big.mg4j.search.score.AbstractAggregator;
import it.unimi.dsi.big.mg4j.search.score.DocumentScoreInfo;
import it.unimi.dsi.big.mg4j.search.score.LinearAggregator;
import it.unimi.dsi.big.mg4j.search.score.ScoredDocumentBoundedSizeQueue;
import it.unimi.dsi.big.mg4j.search.score.Scorer;
import java.io.IOException;
import java.util.Arrays;
import java.util.Iterator;
import org.apache.log4j.Logger;
/** An engine that takes a query and returns results, using a programmable
* set of scorers and policies.
*
* <p>This class embodies most of the work that must be done when answering a query.
* Basically, {@link #process(String, int, int, ObjectArrayList) process(query,offset,length,results)} takes <code>query</code>,
* parses it, turns it into a document iterator, scans the results, and deposits
* <code>length</code> results starting at <code>offset</code> into the list <code>results</code>.
*
*
* <p>There are, however, several additional features available. First of all, either by separating
* several queries with commas, or using directly {@link #process(Query[], int, int, ObjectArrayList)}
* it is possible to resolve a series of queries with an “and-then” semantics: results
* are added from each query, provided they did not appear before.
*
*
* <p>It is possible to {@linkplain #score(Scorer[], double[]) score queries} using one or
* more scorers with different weights (see {@link it.unimi.dsi.big.mg4j.search.score}), and also
* set {@linkplain #setWeights(Reference2DoubleMap) different weights for different indices} (they
* will be passed to the scorers). The scorers influence the order when processing each query,
* but results from different “and-then” queries are simply concatenated.
*
*
* <p>When using multiple scorers, {@linkplain #equalize(int) equalisation} can be used
* to avoid the problem associated with the potentially different value ranges of each scorer. Equalisation
* evaluates a settable number of sample documents and normalizes the scorers using the maximum value in
* the sample. See {@link it.unimi.dsi.big.mg4j.search.score.AbstractAggregator} for some elaboration.
*
*
* <p>{@linkplain #multiplex Multiplexing} transforms a query <code>q</code> into <code>index0:q | index1:q &hellip;</code>.
* In other words, the query is multiplexed on all available indices. Note that if inside <code>q</code>
* there are selection operators that specify an index, the inner specification will overwrite
* the external one, so that the semantics of the query is only amplified, but never contradicted.
*
*
* <p>The results returned are instances of {@link it.unimi.dsi.big.mg4j.search.score.DocumentScoreInfo}. If
* an {@linkplain #intervalSelector interval selector} has been set,
* the <code>info</code> field will contain a map from indices to arrays of {@linkplain it.unimi.dsi.big.mg4j.query.SelectedInterval selected intervals}
* satisfying the query (see {@link it.unimi.dsi.big.mg4j.search} for some elaboration on minimal-interval semantics support in MG4J).
*
*
* <p>For examples of usage of this class, please look at {@link it.unimi.dsi.big.mg4j.query.Query}
* and {@link it.unimi.dsi.big.mg4j.query.QueryServlet}.
*
*
* <p><strong>Warning</strong>: This class is <strong>highly experimental</strong>. It has become
* definitely more decent in MG4J, but still needs some refactoring.
*
*
* <p><strong>Warning</strong>: This class is <strong>not</strong>
* thread safe, but it provides {@linkplain it.unimi.dsi.lang.FlyweightPrototype flyweight copies}.
* The {@link #copy()} method is strengthened so as to return an object implementing this interface.
*
* @author Sebastiano Vigna
* @author Paolo Boldi
* @since 1.0
*/
public class QueryEngine implements FlyweightPrototype<QueryEngine> {
	private static final Logger LOGGER = Util.getLogger( QueryEngine.class );
	/** Compile-time switch guarding the internal consistency assertions of this class. */
	private static final boolean ASSERTS = false;

	/** The parser used to parse queries (<code>null</code> if this engine only processes pre-parsed queries). */
	public final QueryParser queryParser;
	/** A map from names to indices. */
	public final Object2ReferenceMap<String,Index> indexMap;
	/** The number of indices used by {@link #queryParser} (the size of {@link #indexMap} at construction time). */
	public final int numIndices;

	/** Whether multiplex is active. */
	public volatile boolean multiplex;
	/** The current interval selector, if any. */
	public volatile IntervalSelector intervalSelector;

	/** The current scorer, or <code>null</code> if no scorer is in use. */
	private Scorer scorer;
	/** The builder visitor used to make queries into document iterators. */
	private final QueryBuilderVisitor<DocumentIterator> builderVisitor;
	/** A map associating a weight with each index. */
	protected final Reference2DoubleOpenHashMap<Index> index2Weight;
	/** A transformer that will be applied to queries before resolving them, or <code>null</code>. */
	private QueryTransformer transformer;
/** Creates a new query engine.
 *
 * <p>Every index appearing as a value of <code>indexMap</code> initially gets the same weight,
 * that is, one over the number of entries of <code>indexMap</code>. The weight map returns
 * {@link Double#NaN} for unknown indices until {@link #setWeights(Reference2DoubleMap)} is called.
 *
 * @param queryParser a query parser, or <code>null</code> if this query engine will {@linkplain #process(Query[], int, int, ObjectArrayList) just process pre-parsed queries}.
 * @param builderVisitor a builder visitor to transform {@linkplain Query queries} into {@linkplain DocumentIterator document iterators}.
 * @param indexMap a map from symbolic name to indices (used for multiplexing and default weight initialisation).
 */
public QueryEngine( final QueryParser queryParser, final QueryBuilderVisitor<DocumentIterator> builderVisitor, final Object2ReferenceMap<String,Index> indexMap ) {
	this.queryParser = queryParser;
	this.builderVisitor = builderVisitor;
	this.indexMap = indexMap;
	numIndices = indexMap.size();
	index2Weight = new Reference2DoubleOpenHashMap<Index>( numIndices );
	index2Weight.defaultReturnValue( Double.NaN ); // Safety measure against improper access
	// The parameter must be typed, or values() would not yield Index instances.
	for( final Index index : indexMap.values() ) index2Weight.put( index, 1.0 / numIndices );
}
/** Returns a new engine configured like this one.
 *
 * <p>The new engine is built on {@link FlyweightPrototypes#copy} copies of the query parser,
 * interval selector and scorer, on a copy of the builder visitor, and on the very same
 * index map; the multiplex flag and the index weights are carried over.
 *
 * <p>NOTE(review): the query transformer set with {@link #transformer(QueryTransformer)}
 * is not propagated to the copy &mdash; confirm whether this is intended.
 *
 * @return a copy of this engine.
 */
public synchronized QueryEngine copy() {
	final QueryEngine duplicate = new QueryEngine(
		FlyweightPrototypes.copy( queryParser ),
		builderVisitor.copy(),
		indexMap
	);

	duplicate.multiplex = multiplex;
	duplicate.intervalSelector = FlyweightPrototypes.copy( intervalSelector );
	// The scorer must be in place before setWeights(), which forwards the weights to it.
	duplicate.scorer = FlyweightPrototypes.copy( scorer );
	duplicate.setWeights( index2Weight );

	return duplicate;
}
/** Activates equalisation with the given number of samples.
 *
 * <p>The request is forwarded to the current scorer, which must be an {@link AbstractAggregator}.
 *
 * @param samples the number of samples for equalisation, or 0 for no equalisation.
 * @throws IllegalStateException if there is no scorer, or the current scorer is not an {@link AbstractAggregator}.
 */
public synchronized void equalize( final int samples ) {
	final Scorer current = scorer;
	if ( current instanceof AbstractAggregator ) {
		((AbstractAggregator)current).equalize( samples );
		return;
	}
	// Not an aggregator: tell apart "no scorer at all" from "wrong kind of scorer".
	throw new IllegalStateException( current == null ? "There is no scorer" : "The current scorer is not aggregated" );
}
/** Sets the scorers for this query engine.
 *
 * <p>If <code>scorer</code> has length zero, scoring is disabled. If it has length 1,
 * the only scorer is used for scoring, and the only element of <code>weight</code> is
 * discarded. Otherwise, a {@link LinearAggregator} is used to combine results from
 * the given scorers, using the given weights. Whenever a scorer is installed, it
 * receives the current index weights of this engine.
 *
 * @param scorer a (possibly empty) array of {@linkplain Scorer scorers}.
 * @param weight a parallel array of weights (not to be confused with index weights).
 */
public synchronized void score( final Scorer[] scorer, final double[] weight ) {
	if ( scorer.length == 0 ) {
		this.scorer = null;
		return;
	}

	// A single scorer is used directly; several scorers are combined linearly.
	final Scorer chosen = scorer.length == 1 ? scorer[ 0 ] : new LinearAggregator( scorer, weight );
	this.scorer = chosen;
	chosen.setWeights( index2Weight );
}
/** Sets a single scorer for this query engine.
 *
 * <p>Equivalent to calling {@link #score(Scorer[], double[])} with a one-element
 * scorer array and a one-element weight array containing 1.
 *
 * @param scorer a scorer.
 * @see #score(Scorer[], double[])
 */
public synchronized void score( final Scorer scorer ) {
	final Scorer[] single = { scorer };
	final double[] unitWeight = { 1 };
	score( single, unitWeight );
}
/** Sets the transformer for this engine, or disables query transformation.
*
* <p>The transformer is applied by {@link #process(String, int, int, ObjectArrayList)} to each
* query right after parsing; the methods taking pre-parsed queries do not use it.
*
* @param transformer a {@linkplain QueryTransformer query transformer}, or <code>null</code>
* to disable query transformation.
*/
public synchronized void transformer( final QueryTransformer transformer ) {
this.transformer = transformer;
}
/** Sets the index weights.
 *
 * <p>The weights currently stored in this engine are replaced by those in the given map
 * (and the default return value of the internal weight map is set to 0); then the given
 * map is passed to the current scorer, if any, using {@link Scorer#setWeights(Reference2DoubleMap)}.
 *
 * <p>It is safe to pass this engine's own weight map (e.g., from a subclass that has just
 * modified {@link #index2Weight} in place): in that case the stored weights are kept as
 * they are and just forwarded to the scorer.
 *
 * @param index2Weight a map from indices to weights.
 */
public synchronized void setWeights( final Reference2DoubleMap<Index> index2Weight ) {
	// Clearing before copying would wipe out all weights if the argument is our own map.
	if ( index2Weight != this.index2Weight ) {
		this.index2Weight.clear();
		this.index2Weight.putAll( index2Weight );
	}
	this.index2Weight.defaultReturnValue( 0 );
	if ( scorer != null ) scorer.setWeights( index2Weight );
}
/** Turns the given query into a multiplexed query if {@link #multiplex} is on.
 *
 * <p>The multiplexed query is the disjunction, using <code> | </code>, of one
 * <code>name:(query)</code> clause for each key of {@link #indexMap}, in the
 * iteration order of the key set.
 *
 * @param query a query.
 * @return <code>query</code>, if {@link #multiplex} is off; a multiplexed version of <code>query</code>, otherwise.
 */
private String multiplex( final String query ) {
	if ( multiplex ) {
		final StringBuilder multiplexed = new StringBuilder();
		for ( final Object indexName : indexMap.keySet() ) {
			// Every clause is nonempty, so a nonempty builder means "not the first clause".
			if ( multiplexed.length() != 0 ) multiplexed.append( " | " );
			multiplexed.append( indexName ).append( ":(" ).append( query ).append( ")" );
		}
		LOGGER.debug( "Multiplex is active: submitting " + multiplexed );
		return multiplexed.toString();
	}
	return query;
}
/** Parses one or more comma-separated queries and deposits in a given array a segment of the
* results corresponding to the queries, using the current settings of this query engine.
*
* Results are accumulated with an “and-then” semantics: results
* are added from each query in order, provided they did not appear before.
*
* @param queries one or more queries separated by commas.
* @param offset the first result to be added to results
.
* @param length the number of results to be added to results
* @param results an array list that will hold all results.
* @return the number of relevant documents scanned while filling results
.
*/
public int process( final String queries, int offset, final int length, final ObjectArrayList>> results ) throws QueryParserException, QueryBuilderVisitorException, IOException {
LOGGER.debug( "Processing query \"" + queries + "\", offset=" + offset + ", length="+ length );
final String[] part = queries.split( "," );
final Query[] partQuery = new Query[ part.length ];
for( int i = 0; i < part.length; i++ ) {
final String q = multiplex( part[ i ] );
partQuery[ i ] = queryParser.parse( q );
if ( transformer != null ) partQuery[ i ] = transformer.transform( partQuery[ i ] );
}
return process( partQuery, offset, length, results );
}
/** Processes one pre-parsed query and deposits in a given array a segment of the
* results corresponding to the query, using the current settings of this query engine.
*
* Results are accumulated with an “and-then” semantics: results
* are added from each query in order, provided they did not appear before.
*
* @param query a query;
* @param offset the first result to be added to results
.
* @param length the number of results to be added to results
* @param results an array list that will hold all results.
* @return the number of documents scanned while filling results
.
*/
public int process( final Query query, final int offset, final int length, final ObjectArrayList>> results ) throws QueryBuilderVisitorException, IOException {
return process( new Query[] { query }, offset, length, results );
}
/** Processes one or more pre-parsed queries and deposits in a given array a segment of the
* results corresponding to the queries, using the current settings of this query engine.
*
* Results are accumulated with an “and-then” semantics: results
* are added from each query in order, provided they did not appear before.
*
* @param query an array of queries.
* @param offset the first result to be added to results
.
* @param length the number of results to be added to results
* @param results an array list that will hold all results.
* @return the number of documents scanned while filling results
.
*/
@SuppressWarnings("unchecked")
public int process( final Query query[], final int offset, final int length, final ObjectArrayList>> results ) throws QueryBuilderVisitorException, IOException {
LOGGER.debug( "Processing Query array \"" + Arrays.toString( query ) + "\", offset=" + offset + ", length="+ length );
results.clear();
double lastMinScore = 1;
int total = 0, count, currOffset = offset, currLength = length;
final LongSet alreadySeen = query.length > 1 ? new LongOpenHashSet() : null;
for( int i = 0; i < query.length; i++ ) {
final int initialResultSize = results.size();
DocumentIterator documentIterator = query[ i ].accept( builderVisitor.prepare() );
count = scorer != null?
getScoredResults( documentIterator, currOffset, currLength, lastMinScore, results, alreadySeen ) :
getResults( documentIterator, currOffset, currLength, results, alreadySeen );
documentIterator.dispose();
if ( results.size() > 0 ) lastMinScore = results.get( results.size() - 1 ).score;
total += count;
currOffset -= count;
if ( currOffset < 0 ) {
currLength += currOffset;
currOffset = 0;
}
// Check whether we have intervals, we want intervals *and* we added some results.
boolean someHavePositions = false;
for( Index index: documentIterator.indices() ) someHavePositions |= index.hasPositions;
if ( someHavePositions && intervalSelector != null && results.size() != initialResultSize ) {
// We must now enrich the returned result with intervals
DocumentScoreInfo> sorted[] =
results.subList( initialResultSize, results.size() ).toArray( new DocumentScoreInfo[ results.size() - initialResultSize ] );
ObjectArrays.quickSort( sorted, DocumentScoreInfo.DOCUMENT_COMPARATOR );
documentIterator = query[ i ].accept( builderVisitor.prepare() );
for( DocumentScoreInfo> dsi: sorted ) {
documentIterator.skipTo( dsi.document );
dsi.info = intervalSelector.select( documentIterator, new Reference2ObjectArrayMap( numIndices ) );
}
documentIterator.dispose();
}
if ( ASSERTS ) assert length >= results.size();
if ( length == results.size() ) break;
}
return total;
}
private int getScoredResults( final DocumentIterator documentIterator, final int offset, final int length, final double lastMinScore, final ObjectArrayList>> results, final LongSet alreadySeen ) throws IOException {
final ScoredDocumentBoundedSizeQueue> top = new ScoredDocumentBoundedSizeQueue>( offset + length );
long document;
int count = 0; // Number of not-already-seen documents
scorer.wrap( documentIterator );
while ( ( document = scorer.nextDocument() ) != -1 ) {
if ( alreadySeen != null && ! alreadySeen.add( document ) ) continue;
count++;
// TODO: we should avoid enqueuing until we really know we shall use the values
top.enqueue( document, scorer.score() );
}
final int n = Math.max( top.size() - offset, 0 ); // Number of actually useful documents, if any
if ( ASSERTS ) assert n <= length : n;
if ( n > 0 ) {
final int s = results.size();
results.size( s + n );
final Object[] elements = results.elements();
// We scale all newly inserted item so that scores are always decreasing
for ( int i = n; i-- != 0; ) elements[ i + s ] = top.dequeue();
// The division by the maximum score was missing in previous versions; can be removed to reproduce regressions.
// TODO: this will change scores if offset leaves out an entire query
final double adjustment = lastMinScore / ( s != 0 ? ((DocumentScoreInfo>)elements[ s ]).score : 1.0 );
for ( int i = n; i-- != 0; ) ((DocumentScoreInfo>)elements[ i + s ]).score *= adjustment;
}
return count;
}
private int getResults( final DocumentIterator documentIterator, final int offset, final int length, final ObjectArrayList>> results, final LongSet alreadySeen ) throws IOException {
long document;
int count = 0; // Number of not-already-seen documents
// Unfortunately, to provide the exact count of results we have to scan the whole iterator.
while ( ( document = documentIterator.nextDocument() ) != -1 ) {
if ( alreadySeen != null && ! alreadySeen.add( document ) ) continue;
if ( count >= offset && count < offset + length ) results.add( new DocumentScoreInfo>( document, -1 ) );
count++;
}
return count;
}
/** Returns the name of the runtime class of this engine followed by the string representation of {@link #indexMap}.
 *
 * @return a string describing this engine.
 */
public String toString() {
	final String className = getClass().getName();
	return className + indexMap;
}
}