src.it.unimi.dsi.big.mg4j.index.MultiTermIndexIterator Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of mg4j-big Show documentation
MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections written in Java. The big version is a fork of the original MG4J that can handle more than 2^31 terms and documents.
The newest version!
package it.unimi.dsi.big.mg4j.index;

/*		 
 * MG4J: Managing Gigabytes for Java (big)
 *
 * Copyright (C) 2003-2011 Paolo Boldi and Sebastiano Vigna 
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 3 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 */

import it.unimi.dsi.big.mg4j.index.payload.Payload;
import it.unimi.dsi.big.mg4j.search.AbstractCompositeDocumentIterator;
import it.unimi.dsi.big.mg4j.search.AbstractUnionDocumentIterator;
import it.unimi.dsi.big.mg4j.search.DocumentIterator;
import it.unimi.dsi.big.mg4j.search.IntervalIterator;
import it.unimi.dsi.big.mg4j.search.OrDocumentIterator;
import it.unimi.dsi.big.mg4j.search.score.BM25Scorer;
import it.unimi.dsi.big.mg4j.search.score.Scorer;
import it.unimi.dsi.big.mg4j.search.visitor.DocumentIteratorVisitor;
import it.unimi.dsi.fastutil.ints.IntArrays;
import it.unimi.dsi.fastutil.ints.IntHeapSemiIndirectPriorityQueue;
import it.unimi.dsi.fastutil.ints.IntIterator;
import it.unimi.dsi.fastutil.ints.IntIterators;
import it.unimi.dsi.fastutil.longs.LongSet;
import it.unimi.dsi.fastutil.objects.ObjectHeapIndirectPriorityQueue;
import it.unimi.dsi.util.Interval;

import java.io.IOException;

/** A virtual {@linkplain IndexIterator index iterator} that merges several component index iterators.
*
* This class adds to {@link it.unimi.dsi.big.mg4j.search.AbstractUnionDocumentIterator}
* an interval iterator generating the OR of the intervals returned for each of the documents involved.
* The main difference with an {@link OrDocumentIterator} built on the same array of component iterators
* is that this class implements {@link IndexIterator} and hence provides a {@link #count()} (the sum
* of counts of those component iterators positioned on the current document) and a {@link #frequency()}. The
* latter is by default the maximum frequency of a component iterator, but it can be set 
* at {@link MultiTermIndexIterator#getInstance(long, Index, IndexIterator[]) construction time}.
* 
* 
* <p>The main raison d'être of this class is support for query expansion: a blind application
* of {@link OrDocumentIterator} to an array of index iterators would mislead {@linkplain Scorer scorers} such as {@link BM25Scorer}
* because low-frequency terms (e.g., hapax legomena) would be responsible for most of the score.
* 
* 
* <p>Note that {@link DocumentIteratorVisitor} has a {@linkplain DocumentIteratorVisitor#visit(IndexIterator) visit method for generic index iterator}
* and a {@linkplain DocumentIteratorVisitor#visit(MultiTermIndexIterator) visit method for instances of this class}.
* This approach provides additional flexibility—a scorer, for instance, might treat an instance of
* this class as a standard {@link IndexIterator}, or it might choose to {@linkplain #front(IndexIterator[]) query which terms actually appear}
* and do something more sophisticated (for instance, using {@linkplain DocumentIterator#weight() weights}).
*/

public class MultiTermIndexIterator extends AbstractUnionDocumentIterator implements IndexIterator {
	/** Compile-time switch for internal assertions; it is not read anywhere in the visible code of this class
	 *  (hence the suppressed warning). */
	@SuppressWarnings("unused")
	private static final boolean ASSERTS = false;
	
	/** Value to be used for term frequency, or {@link Long#MIN_VALUE} to use the max; in any case, this attribute is used to cache
	 *  frequency after the first call to {@link #frequency()}. */
	private long frequency;
	/** The term of this iterator, as set by {@link #term(CharSequence)}. */
	protected String term;
	/** The id of this iterator, as set by {@link #id(int)}. */
	protected int id;
	/** The count of the current document, or -1 if it has not been computed yet: it is reset to -1 by
	 *  {@link #nextDocument()} and {@link #skipTo(long)}, and filled in lazily by {@link #count()}. */
	private int count = -1;
	/** Whether the indices of all underlying index iterators have counts. */
	private final boolean hasCounts; 
	/** Whether the indices of all underlying index iterators have positions. */
	private final boolean hasPositions;
	
	/** Merges the given index iterators into a single index iterator.
	 *
	 * <p>At least one iterator must be provided. No explicit frequency is set, so
	 * {@link #frequency()} will be computed as the maximum over the component iterators.
	 * The index handed to the more general factory method is the one returned by
	 * {@link IndexIterator#index()} on the first iterator.
	 *
	 * @param indexIterator the iterators to be joined (at least one).
	 * @return a merged index iterator.
	 * @throws IllegalArgumentException if no iterators were provided.
	 */
	public static IndexIterator getInstance( final IndexIterator... indexIterator  ) throws IOException {
		// Long.MIN_VALUE is the sentinel meaning "no default frequency: use the maximum".
		final long useMaximumFrequency = Long.MIN_VALUE;
		return getInstance( useMaximumFrequency, indexIterator );
	}

	/** Merges the given index iterators into a single index iterator.
	 *
	 * <p>Note that the special cases of the empty and of the singleton array
	 * are handled efficiently. No explicit frequency is set, so {@link #frequency()}
	 * will be computed as the maximum over the component iterators.
	 *
	 * @param index the index used to obtain an empty iterator when no component iterator is given.
	 * @param indexIterator the iterators to be joined.
	 * @return a merged index iterator.
	 */
	public static IndexIterator getInstance( final Index index, final IndexIterator... indexIterator  ) throws IOException {
		// Long.MIN_VALUE is the sentinel meaning "no default frequency: use the maximum".
		final long useMaximumFrequency = Long.MIN_VALUE;
		return getInstance( useMaximumFrequency, index, indexIterator );
	}

	/** Returns an index iterator that merges the given array of iterators.
	 *  This method requires that at least one iterator is provided; the index passed to
	 *  {@link #getInstance(long, Index, IndexIterator[])} is the one returned by
	 *  {@link IndexIterator#index()} on the first iterator.
	 * 
	 * @param defaultFrequency the default term frequency (or {@link Long#MIN_VALUE} for the max).
	 * @param indexIterator the iterators to be joined (at least one).
	 * @return a merged index iterator. 
	 * @throws IllegalArgumentException if no iterators were provided.
	 */
	public static IndexIterator getInstance( final long defaultFrequency, final IndexIterator... indexIterator  ) throws IOException {
		// NOTE(review): earlier documentation also promised an IllegalArgumentException for iterators
		// running on different indices; no such check is performed here — confirm whether the
		// superclass constructor enforces it.
		if ( indexIterator.length == 0 ) throw new IllegalArgumentException( "At least one index iterator must be provided" );
		return getInstance( defaultFrequency, indexIterator[ 0 ].index(), indexIterator );
	}

	/** Merges the given index iterators into a single index iterator.
	 *
	 * <p>Note that the special cases of the empty and of the singleton array
	 * are handled efficiently: with no iterators the empty iterator of <code>index</code>
	 * is returned, and with exactly one iterator that very iterator is returned unchanged
	 * (in particular, <code>defaultFrequency</code> is not applied to it).
	 *
	 * @param defaultFrequency the default term frequency (or {@link Long#MIN_VALUE} for the max).
	 * @param index the index used to obtain an empty iterator when no component iterator is given;
	 * it is not consulted in the other cases.
	 * @param indexIterator the iterators to be joined.
	 * @return a merged index iterator.
	 */
	public static IndexIterator getInstance( final long defaultFrequency, final Index index, final IndexIterator... indexIterator  ) throws IOException {
		switch ( indexIterator.length ) {
			case 0:
				// Nothing to merge.
				return index.getEmptyIndexIterator();
			case 1:
				// A single component needs no wrapper.
				return indexIterator[ 0 ];
			default:
				return new MultiTermIndexIterator( defaultFrequency, indexIterator );
		}
	}

	
	/** Creates a new index iterator that merges the given array of iterators.
	 *
	 * <p>Besides recording the default frequency, the constructor checks whether the indices of
	 * <em>all</em> component iterators provide counts and positions; {@link #count()} and
	 * {@link #positionArray()} refuse to work when that is not the case.
	 *
	 * @param defaultFrequency the default term frequency (or {@link Long#MIN_VALUE} for the max).
	 * @param indexIterator the iterators to be joined.
	 */
	@SuppressWarnings("cast")
	protected MultiTermIndexIterator( final long defaultFrequency, final IndexIterator... indexIterator ) throws IOException {
		super( (DocumentIterator[]) indexIterator );
		frequency = defaultFrequency;

		// A single component lacking a feature disables that feature for the whole merge.
		boolean allCounts = true, allPositions = true;
		for ( final IndexIterator component : indexIterator ) {
			final Index componentIndex = component.index();
			allCounts &= componentIndex.hasCounts;
			allPositions &= componentIndex.hasPositions;
		}

		hasCounts = allCounts;
		hasPositions = allPositions;
	}

	/** Supplies the interval iterator used by this merged iterator.
	 *
	 * @param index an index; it is ignored by this implementation.
	 * @return a new {@link MultiTermIntervalIterator} bound to this instance.
	 */
	protected IntervalIterator getComposedIntervalIterator( final Index index ) {
		final IntervalIterator merger = new MultiTermIntervalIterator();
		return merger;
	}

	/** Skips to the given document pointer, invalidating the cached count if the iterator actually moves.
	 *
	 * @param n a document pointer.
	 * @return the current document if it is already greater than or equal to <code>n</code>;
	 * otherwise, the result of the superclass method.
	 */
	@Override
	public long skipTo( final long n ) throws IOException {
		if ( n > curr ) {
			// The cached count refers to the document we are leaving: forget it before moving.
			count = -1;
			return super.skipTo( n );
		}
		return curr;
	}
	
	/** Advances to the next document, invalidating the cached count.
	 *
	 * @return the result of the superclass method.
	 */
	@Override
	public long nextDocument() throws IOException {
		// We invalidate count before calling the superclass method.
		count = -1;
		return super.nextDocument();
	}
	
	/** Returns the sum of the counts of those component iterators that are positioned on the current document.
	 *
	 * <p>The sum is computed on the first call for a given document and then cached until
	 * the iterator is moved.
	 *
	 * @return the sum of counts.
	 * @throws IllegalStateException if some underlying index has no counts.
	 */
	public int count() throws IOException {
		ensureOnADocument();
		if ( ! hasCounts ) throw new IllegalStateException( "Some of the underlying iterators do not have counts" );
		// -1 marks a count that has not been computed for the current document yet.
		if ( count != -1 ) return count;

		int total = 0;
		final int frontSize = computeFront();
		for ( int i = 0; i < frontSize; i++ ) total += indexIterator[ front[ i ] ].count();
		return count = total;
	}

	/** Fills the given array with the index iterators composing the current front.
	 *
	 * <p>This method is essentially a safe exposure of the {@linkplain ObjectHeapIndirectPriorityQueue#front(int[]) front of the queue}
	 * merging the component {@linkplain IndexIterator index iterators}.
	 * After a call to {@link #nextDocument()}, you can use this method to know
	 * which terms actually appear in the current document. You can use the public
	 * field {@link AbstractCompositeDocumentIterator#n} to size the argument
	 * array appropriately.
	 *
	 * @param indexIterator an array, at least as large as the number of component index iterators,
	 * that will be partially filled with the index iterators corresponding to terms appearing in the current document.
	 * @return the number of iterators written into <code>indexIterator</code>.
	 */
	public int front( final IndexIterator[] indexIterator ) {
		final int frontSize = computeFront();
		// The parameter hides the component array of this object, hence the explicit "this".
		for ( int slot = 0; slot < frontSize; slot++ ) indexIterator[ slot ] = this.indexIterator[ front[ slot ] ];
		return frontSize;
	}
	
	/** Returns the frequency of this iterator: either the default frequency set at construction time,
	 *  or the maximum frequency of the component iterators.
	 *
	 * <p>In the latter case the maximum is computed on the first call and cached afterwards.
	 *
	 * @return the frequency.
	 */
	public long frequency() throws IOException {
		if ( frequency == Long.MIN_VALUE ) {
			// No frequency available yet: scan all components (not just the front) for the maximum.
			long maxFrequency = Long.MIN_VALUE;
			for ( int i = 0; i < n; i++ ) maxFrequency = Math.max( maxFrequency, indexIterator[ i ].frequency() );
			frequency = maxFrequency; // caching it!
		}
		return frequency;
	}

	/** Sets the term of this iterator.
	 *
	 * @param term a character sequence (it is converted to a string), or <code>null</code>.
	 * @return this iterator.
	 */
	public IndexIterator term( final CharSequence term ) {
		if ( term == null ) this.term = null;
		else this.term = term.toString();
		return this;
	}

	/** Returns the term of this iterator.
	 *
	 * @return the term last set with {@link #term(CharSequence)}.
	 */
	public String term() {
		return this.term;
	}

	/** Returns the term number of the first component iterator.
	 *
	 * @return the term number reported by the component iterator of index 0.
	 */
	public long termNumber() {
		// TODO: this is not particularly sensible
		final IndexIterator firstComponent = indexIterator[ 0 ];
		return firstComponent.termNumber();
	}
	
	/** Sets the id of this iterator.
	 *
	 * @param id the new id.
	 * @return this iterator.
	 */
	public IndexIterator id( final int id ) {
		this.id = id;
		return this;
	}

	/** Returns the id of this iterator.
	 *
	 * @return the id last set with {@link #id(int)}.
	 */
	public int id() {
		return this.id;
	}

	/** Returns the index associated with this iterator.
	 *
	 * @return the value of the <code>soleIndex</code> field, which is declared outside this class.
	 */
	public Index index() {
		return this.soleIndex;
	}

	/** Payloads are not supported by this class.
	 *
	 * @return never returns normally.
	 * @throws UnsupportedOperationException always.
	 */
	public Payload payload() {
		throw new UnsupportedOperationException();
	}

	/** Returns an array containing the positions for the current document.
	 *
	 * <p>When a single component iterator is on the current document, its own position array is
	 * returned. Otherwise the interval iterator of this object is drained and its cache is returned.
	 * In both cases the returned array may be longer than the number of valid positions.
	 *
	 * @return an array whose initial segment contains the positions.
	 * @throws IllegalStateException if some underlying index has no positions.
	 */
	public int[] positionArray() throws IOException {
		if ( ! hasPositions ) throw new IllegalStateException( "Some of the underlying iterators do not have positions" );

		// With a single iterator in the front there is nothing to merge.
		final int frontSize = computeFront();
		if ( frontSize == 1 ) return indexIterator[ front[ 0 ] ].positionArray();

		// Otherwise, complete the merge into the cache of the interval iterator and hand the cache out.
		final MultiTermIntervalIterator merger = (MultiTermIntervalIterator)intervalIterator();
		merger.drain();
		return merger.cache;
	}

	/** Returns an iterator over the first {@link #count()} elements of {@link #positionArray()}.
	 *
	 * <p>The length is obtained from {@link #count()} rather than from the cached field, because the
	 * field is -1 until {@link #count()} has been invoked on the current document.
	 *
	 * @return an iterator over the positions for the current document.
	 */
	public IntIterator positions() throws IOException {		
		// Arguments are evaluated left to right: the position array is fetched first, then the count.
		return IntIterators.wrap( positionArray(), 0, count() );
	}

	/** Copies the positions for the current document into the given array.
	 *
	 * <p>The number of positions is obtained from {@link #count()} rather than from the cached field,
	 * because the field is -1 until {@link #count()} has been invoked on the current document.
	 *
	 * @param position an array that will be filled with the positions.
	 * @return the number of positions written, if <code>position</code> is large enough; otherwise,
	 * the negated number of positions, and <code>position</code> is left untouched.
	 */
	public int positions( int[] position ) throws IOException {
		final int c = count();
		// Too small: report the required size as a negative number without fetching the positions.
		if ( position.length < c ) return -c;
		System.arraycopy( positionArray(), 0, position, 0, c );
		return c;
	}

	/** Sets the weight of this iterator through the superclass method.
	 *
	 * <p>The override only narrows the return type to {@link IndexIterator}.
	 *
	 * @param weight the new weight.
	 * @return this iterator.
	 */
	@Override
	public IndexIterator weight( final double weight ) {
		super.weight( weight );
		return this;
	}
	
	/** Presents this iterator to the visitor as a single node, without descending into the component iterators.
	 *
	 * @param visitor the visitor.
	 * @return the result of invoking <code>visit</code> on this iterator.
	 * @see #acceptDeep(DocumentIteratorVisitor)
	 */
	@Override
	public <T> T accept( DocumentIteratorVisitor<T> visitor ) throws IOException {
		return visitor.visit( this );
	}

	/** Presents this iterator to the visitor as a single node, without descending into the component iterators.
	 *
	 * @param visitor the visitor.
	 * @return the result of invoking <code>visit</code> on this iterator.
	 * @see #acceptDeepOnTruePaths(DocumentIteratorVisitor)
	 */
	@Override
	public <T> T acceptOnTruePaths( DocumentIteratorVisitor<T> visitor ) throws IOException {
		return visitor.visit( this );
	}
	
	/** Invokes the <code>accept</code> method of the superclass, bypassing the override in this class
	 *  (which presents this iterator to the visitor as a single node).
	 *
	 * @param visitor the visitor.
	 * @return the result of the superclass <code>accept</code> method.
	 */
	public <T> T acceptDeep( DocumentIteratorVisitor<T> visitor ) throws IOException {
		return super.accept( visitor );
	}

	/** Invokes the <code>acceptOnTruePaths</code> method of the superclass, bypassing the override in this class
	 *  (which presents this iterator to the visitor as a single node).
	 *
	 * @param visitor the visitor.
	 * @return the result of the superclass <code>acceptOnTruePaths</code> method.
	 */
	public <T> T acceptDeepOnTruePaths( DocumentIteratorVisitor<T> visitor ) throws IOException {
		// This used to delegate to super.accept(), which made it indistinguishable from acceptDeep().
		return super.acceptOnTruePaths( visitor );
	}	
	
	/** An optimised interval iterator with the same semantics as that implemented
	 *  by {@link OrDocumentIterator}, but not allowing duplicate positions.
	 *  
	 *  <p>This iterator provides an additional {@link #drain()} method that exhausts the
	 *  merge queue, leaving however the returned elements in the {@link #cache} array. Moreover,
	 *  the internal state of the iterator is modified so that it continues to behave normally,
	 *  returning however its results from {@link #cache}. In this way we can easily provide
	 *  efficient implementations for {@link IndexIterator#positions()}, {@link IndexIterator#positionArray()},
	 *  and {@link IndexIterator#positions(int[])}.
	 *
	 *  <p>Every interval returned is a singleton built from one position. Positions are merged
	 *  by repeatedly extracting the top of {@link #positionQueue}.
	 */
	private class MultiTermIntervalIterator extends AbstractCompositeIndexIntervalIterator implements IntervalIterator {
		/** Debug switch; it is not read in this class. */
		@SuppressWarnings({ "unused" })
		private final static boolean DEBUG = false;
		/** Enables the sanity check at the end of {@link #reset()}; it hides the constant with the same name of the enclosing class. */
		@SuppressWarnings("hiding")
		private final static boolean ASSERTS = false;

		/** A heap-based indirect priority queue used to keep track of the currently scanned positions. */
		private final IntHeapSemiIndirectPriorityQueue positionQueue;
		/** The cached results of this iterator. */
		public int[] cache;
		/** The number of results emitted by this iterator since the last call to {@link #reset()}. */
		private int emitted;
		/** The number of results extracted in {@link #cache} since the last call to {@link #reset()}. */
		private int extracted;

		/** Creates a new interval iterator sized on the number of component iterators of the enclosing object. */
		public MultiTermIntervalIterator() {
			super( n );
			// The queue contains component indices and uses curr as reference array.
			// NOTE(review): curr is presumably the int array inherited from the superclass, not the
			// current-document field of the enclosing iterator — confirm.
			positionQueue = new IntHeapSemiIndirectPriorityQueue( curr );
			// Small initial capacity: nextInterval() grows the cache on demand.
			cache = new int[ 4 ];
		}

		/** Prepares this iterator for the current document: the cache and the queue are emptied, and
		 *  each component iterator in the current front is enqueued on its first position. */
		public void reset() throws IOException {
			emitted = extracted = 0;
			next = null;
			positionQueue.clear();

			for ( int i = computeFront(), k; i-- != 0; ) {
				k = front[ i ];
				position[ k ] = indexIterator[ k ].positionArray();
				count[ k ] = indexIterator[ k ].count();
				// Element 0 is read unconditionally: each front component is assumed to have at least one position.
				curr[ k ] = position[ k ][ 0 ];
				currPos[ k ] = 0;
				positionQueue.enqueue( k );
			}

			if ( ASSERTS ) assert ! positionQueue.isEmpty();
		}

		/** Adds to the given set the term number of the first component iterator.
		 * 
		 * @param terms a set of term numbers that will be updated.
		 */
		public void intervalTerms( final LongSet terms ) {
			// TODO: this is not particularly sensible
			terms.add( indexIterator[ 0 ].termNumber() );
		}
		
		/** Returns the next interval, that is, a singleton interval containing the next position.
		 * 
		 * <p>Results come, in this order of precedence, from {@link #next} (if set), from the part of
		 * {@link #cache} that has been extracted but not emitted yet, and finally from the merge queue. 
		 * 
		 * @return the next interval, or <code>null</code> if there are no more intervals.
		 * @throws IllegalArgumentException if, after advancing a component, the top of the queue
		 * is on the position that has just been extracted.
		 */
		public Interval nextInterval() {
			// An interval stored in next is returned first, and exactly once.
			if ( next != null ) {
				final Interval result = next;
				next = null;
				return result;
			}
			
			// Positions already extracted (e.g., by drain()) are served from the cache.
			if ( emitted < extracted ) return Interval.valueOf( cache[ emitted++ ] );

			if ( positionQueue.isEmpty() ) return null;

			final int first = positionQueue.first();

			// Append the position at the top of the queue to the cache, enlarging the cache if it is full.
			if ( extracted == cache.length ) cache = IntArrays.grow( cache, extracted + 1 );
			cache[ extracted++ ] = curr[ first ];

			// Advance the component just used: if it has more positions the top of the queue
			// is updated, otherwise the component leaves the queue.
			if ( ++currPos[ first ] < count[ first ] ) {
				curr[ first ] = position[ first ][ currPos[ first ] ];
				positionQueue.changed();
				// The new top being on the position just extracted means that the position is duplicated.
				if ( curr[ positionQueue.first() ] == cache[ extracted - 1 ] ) throw new IllegalArgumentException( "Duplicate positions in " + this );
			}
			else positionQueue.dequeue();
				
			// At this point emitted == extracted - 1, so this is the position just cached.
			return Interval.valueOf( cache[ emitted++ ] );
		}

		/** Returns the extent of this iterator.
		 * 
		 * @return always 1.
		 */
		public int extent() {
			return 1;
		}
		
		/** Drains all elements from the queue, stores them in {@link #cache} and
		 * restores {@link #emitted} so that the iterators continues to work transparently. 
		 */
		
		public void drain() {
			// If next is set, one emitted result is treated as not emitted, so that it will be served again from the cache.
			// NOTE(review): next is not assigned a non-null value anywhere in this class; presumably
			// the superclass stores there the last interval returned — confirm.
			final int emittedNow = emitted - ( next != null ? 1 : 0 );
			next = null;
			// Exhausting the iterator has the side effect of moving every remaining position into the cache.
			while( nextInterval() != null );
			emitted = emittedNow;
		}
 	}
}