src.it.unimi.dsi.big.mg4j.search.ConsecutiveDocumentIterator Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of mg4j-big Show documentation
MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections written in Java. The big version is a fork of the original MG4J that can handle more than 2^31 terms and documents.
The newest version!
package it.unimi.dsi.big.mg4j.search;

/*		 
 * MG4J: Managing Gigabytes for Java (big)
 *
 * Copyright (C) 2003-2011 Paolo Boldi and Sebastiano Vigna 
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 3 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 */

import it.unimi.dsi.big.mg4j.index.Index;
import it.unimi.dsi.fastutil.ints.IntArrays;
import it.unimi.dsi.fastutil.longs.LongSet;
import it.unimi.dsi.util.Interval;
import it.unimi.dsi.util.Intervals;

import java.io.IOException;

/** An iterator returning documents containing consecutive intervals (in query order) 
 * satisfying the underlying queries.
 * 
 * As an additional service, this class makes it possible to specify gaps between
 * intervals. If gaps are specified, a match will satisfy the condition
 * that the left extreme of the first interval is larger than or equal to the
 * first gap, the left extreme of the second interval is equal to
 * the right extreme of the first interval plus the second gap plus one, 
 * the left extreme of the third interval is equal to the right extreme
 * of the second interval plus the third gap plus one and so on.  The standard
 * semantics corresponds thus to the everywhere zero gap array. Note that
 * the returned intervals will contain the leftmost gap, too. 
 * 
 * 
 * <p>This semantics
 * makes it possible to perform phrasal searches “with holes”, typically
 * because of stopwords that have not been indexed. Note that it is possible to specify
 * a gap before the first interval, but not after the last interval,
 * as in general the document length is not known at this level of query resolution.
 * 
 * 
 * <p>This class will handle correctly {@link IntervalIterators#TRUE TRUE} iterators; in this
 * case, the semantics is defined as follows: an interval is in the output if it is formed by the union of disjoint intervals,
 * one from each input list, and each gap of value k corresponds to k iterators
 * returning all document positions as singleton intervals. Since {@link IntervalIterators#TRUE TRUE} represents a list containing just
 * the empty interval, the result is equivalent to dropping {@link IntervalIterators#TRUE TRUE} iterators from the input; as
 * a consequence, the gap of a {@link IntervalIterators#TRUE TRUE} iterator is merged with that of the following iterator.
 * 
 * 
 * <p><strong>Warning</strong>: In case gaps are specified, the mathematically correct semantics would require that
 * gaps before {@link IntervalIterators#TRUE TRUE} iterators that are not followed by any non-{@link IntervalIterators#TRUE TRUE} iterators
 * have the effect of enlarging the resulting intervals on the right side. However,
 * this behaviour is very difficult to implement at this level because document lengths are not known. For this
 * reason, if one or more {@link IntervalIterators#TRUE TRUE} iterators appear at the end of the component iterator list they will be simply dropped.  
 */

public class ConsecutiveDocumentIterator extends AbstractOrderedIntervalDocumentIterator {
	/** The gap array. This is a private copy of the array provided at construction time (or an all-zero
	 * array if none was provided). It is never modified after construction: when cumulative gaps are
	 * needed, {@link #getComposedIntervalIterator(Index)} computes them into a fresh array. */
	private final int gap[];
	
	/** Returns a document iterator that computes the consecutive AND of the given nonzero-length array of iterators.
	 * 
	 * <p>Note that the special case of the singleton array is handled efficiently:
	 * the only element is returned as is.
	 * 
	 * @param index the default index; it is not used by this method.
	 * @param documentIterator the iterators to be composed (at least one).
	 * @return a document iterator that computes the consecutive AND of <code>documentIterator</code>. 
	 * @throws IllegalArgumentException if <code>documentIterator</code> is empty.
	 * @throws IOException declared by the constructor of this class.
	 */
	public static DocumentIterator getInstance( final Index index, final DocumentIterator... documentIterator ) throws IOException {
		if ( documentIterator.length == 0 ) throw new IllegalArgumentException( "The provided array of document iterators is empty." );
		if ( documentIterator.length == 1 ) return documentIterator[ 0 ];
		return new ConsecutiveDocumentIterator( documentIterator, null );
	}
	
	/** Returns a document iterator that computes the consecutive AND of the given nonzero-length array of iterators.
	 * 
	 * <p>Note that the special case of the singleton array is handled efficiently:
	 * the only element is returned as is.
	 * 
	 * @param documentIterator the iterators to be composed (at least one).
	 * @return a document iterator that computes the consecutive AND of <code>documentIterator</code>. 
	 * @throws IllegalArgumentException if <code>documentIterator</code> is empty.
	 * @throws IOException declared by the constructor of this class.
	 */
	public static DocumentIterator getInstance( final DocumentIterator... documentIterator ) throws IOException {
		// The empty and singleton cases are handled by the method we delegate to.
		return getInstance( (Index)null, documentIterator );
	}
	
	/** Returns a document iterator that computes the consecutive AND of the given nonzero-length array of iterators, adding
	 * gaps between intervals.
	 * 
	 * <p>A match will satisfy the condition
	 * that the left extreme of the first interval is larger than or equal to the
	 * first gap, the left extreme of the second interval is equal to 
	 * the right extreme of the first interval plus the second gap plus one, and so on. This semantics
	 * makes it possible to perform phrasal searches &ldquo;with holes&rdquo;, typically
	 * because of stopwords that have not been indexed.
	 * 
	 * <p>If there is just one iterator and its gap is zero (or no gaps are specified), the iterator
	 * is returned as is.
	 * 
	 * @param documentIterator the iterators to be composed (at least one).
	 * @param gap an array of gaps parallel to <code>documentIterator</code>, or <code>null</code> for no gaps. 
	 * @return a document iterator that computes the consecutive AND of <code>documentIterator</code> using the given gaps.  
	 * @throws IllegalArgumentException if <code>documentIterator</code> is empty, or if <code>gap</code> is not
	 * <code>null</code> and its length differs from that of <code>documentIterator</code>.
	 * @throws IOException declared by the constructor of this class.
	 */
	public static DocumentIterator getInstance( final DocumentIterator documentIterator[], final int gap[] ) throws IOException {
		// Same check (and same message) as the other factory methods.
		if ( documentIterator.length == 0 ) throw new IllegalArgumentException( "The provided array of document iterators is empty." );
		if ( gap != null && gap.length != documentIterator.length ) throw new IllegalArgumentException( "The number of gaps (" + gap.length + ") is not equal to the number of document iterators (" + documentIterator.length +")" );
		if ( documentIterator.length == 1 && ( gap == null || gap[ 0 ] == 0 ) ) return documentIterator[ 0 ];
		return new ConsecutiveDocumentIterator( documentIterator, gap );
	}
	
	/** Creates a new consecutive document iterator.
	 * 
	 * @param documentIterator the iterators to be composed; they are passed to the superclass constructor.
	 * @param gap an array of gaps parallel to <code>documentIterator</code> (it will be copied), 
	 * or <code>null</code> for no gaps (in which case an all-zero array is used).
	 * @throws IOException declared by the superclass constructor.
	 */
	protected ConsecutiveDocumentIterator( final DocumentIterator[] documentIterator, final int[] gap ) throws IOException {
		super( documentIterator );
		if ( gap == null ) this.gap = new int[ n ];
		else this.gap = gap.clone();
	}

	/** Returns the interval iterator used to compose the intervals of the component iterators.
	 * 
	 * <p>If {@link #indexIterator} is <code>null</code>, a {@link ConsecutiveIntervalIterator} based on the
	 * original gaps is returned; otherwise, a {@link ConsecutiveIndexIntervalIterator} based
	 * on cumulative gaps is returned. The cumulative gaps are computed into a fresh array, so
	 * this method can be safely invoked more than once.
	 * 
	 * @param unused the index for which the iterator is requested (it is only checked against {@link #soleIndex} when assertions are enabled).
	 * @return an interval iterator composing the component iterators.
	 */
	protected IntervalIterator getComposedIntervalIterator( final Index unused ) {
		if ( ASSERTS ) assert unused == soleIndex;
		if ( indexIterator == null ) return new ConsecutiveIntervalIterator( gap );
		// In this case, gaps must be made cumulative. We work on a copy, so that the original gaps 
		// are preserved and repeated invocations do not accumulate the gaps more than once.
		final int[] cumulativeGap = gap.clone();
		for( int i = 1; i < n; i++ ) cumulativeGap[ i ] += cumulativeGap[ i - 1 ] + 1;
		return new ConsecutiveIndexIntervalIterator( cumulativeGap );
	}

	
	/** An interval iterator returning the BLOCK of the component iterator 
	 * (i.e., intervals made of sequences of consecutive intervals
	 * of the component iterator, in the given order). 
	 * 
	 * <p>In this implementation, {@link #advanced} is
	 * never true when {@link AbstractOrderedIntervalIterator#endOfProcess} is true. 
	 */
	
	private class ConsecutiveIntervalIterator extends AbstractOrderedIntervalIterator {
		/** A cached reference to the gap array. */
		@SuppressWarnings("hiding")
		private final int[] gap;
		/** The actual gaps. They depend on whether some {@link IntervalIterators#TRUE} iterator reduces the iterator array. */
		private final int[] actualGap;
		/** Whether the scan is over. */
		private boolean endOfProcess;
		/** The number of non-{@link IntervalIterators#TRUE} interval iterator. */
		private int m;

		/** Creates a new iterator using the given (non-cumulative) gaps.
		 * 
		 * @param gap the gap array; it is not copied.
		 */
		public ConsecutiveIntervalIterator( final int[] gap ) {
			this.gap = gap;
			// The enlargement is made necessary by the filling loop in reset(), 
			// which prepares the gap of index m after each real iterator (so m can reach n).
			this.actualGap = new int[ n + 1 ];
		}

		/** Loads {@link #curr} with the first interval from each non-{@link IntervalIterators#TRUE} iterator, leaving
		 * in {@link #m} the number of non-{@link IntervalIterators#TRUE} iterators, and in {@link #actualGap}
		 * the gaps to be used for those {@link #m} iterators.
		 * 
		 * <p>After the call, {@link #next} contains the first aligned interval, or <code>null</code>
		 * if there is none (in which case {@link #endOfProcess} is true).
		 * 
		 * @throws IllegalStateException if all component interval iterators are {@link IntervalIterators#TRUE}.
		 */
		public void reset() throws IOException {
			final int[] actualGap = this.actualGap;
			final int[] gap = this.gap;
			final IntervalIterator[] intervalIterator = this.intervalIterator;
			
			actualGap[ m = 0 ] = -1; // The first interval has actual gap zero if it has gap zero, so we compensate here for the increment below.

			int i;
			for( i = 0; i < n; i++ ) {
				// Gaps of TRUE iterators are merged with that of the following real iterator.
				actualGap[ m ] += gap[ i ]; // Accumulate gap
				
				if ( ( intervalIterator[ m ] = documentIterator[ i ].intervalIterator() ) != IntervalIterators.TRUE ) {
					actualGap[ m ]++; // If this interval iterator is real, add one.
					curr[ m ] = Intervals.MINUS_INFINITY;
					actualGap[ ++m ] = 0; // Prepare next gap.
				}
			}

			if ( m == 0 ) throw new IllegalStateException();			

			next = null;
			// Skip the intervals of the first real iterator that start before the leftmost gap.
			do	{
				curr[ 0 ] = intervalIterator[ 0 ].nextInterval(); 
			} while( curr[ 0 ] != null && curr[ 0 ].left < actualGap[ 0 ] );
			if ( ! ( endOfProcess = curr[ 0 ] == null ) ) next = align();
		}
		
		/** Adds to the given set the terms of the current intervals of all non-{@link IntervalIterators#TRUE} iterators.
		 * 
		 * @param terms the set that will be passed to each component iterator.
		 */
		public void intervalTerms( final LongSet terms ) {
			for( int i = m; i-- != 0; ) intervalIterator[ i ].intervalTerms( terms );
		}
		
		/** Starting from the current interval of the first iterator, tries to align the remaining iterators 
		 * so that, for each <code>k</code> &gt; 0, the left extreme of the <code>k</code>-th interval is exactly
		 * the right extreme of the previous interval plus <code>actualGap[ k ]</code>.
		 * 
		 * @return the interval going from the left extreme of the first interval minus the leftmost gap 
		 * to the right extreme of the last interval, or <code>null</code> if some iterator has been exhausted
		 * (in which case {@link #endOfProcess} is set).
		 */
		private Interval align() throws IOException {
			if ( DEBUG ) System.err.println( this + ".align()" );
			
			final Interval[] curr = this.curr;
			final int[] actualGap = this.actualGap;
			final IntervalIterator[] intervalIterator = this.intervalIterator;
			
			if ( DEBUG ) System.err.println( java.util.Arrays.asList( curr ) );
			int k = 0;

			// The outer loop terminates only when the inner loop completes without breaking (k == m).
			while( k < m ) {
				for ( k = 1; k < m; k++ ) {
					// Advance the k-th iterator until it is no longer to the left of the required position.
					while ( curr[ k ].left < curr[ k - 1 ].right + actualGap[ k ] )
						if ( ( curr[ k ] = intervalIterator[ k ].nextInterval() ) == null ) {
							endOfProcess = true;
							return null;
						}

					// If we went beyond the required position, we advance the first iterator and restart the scan.
					if ( curr[ k ].left > curr[ k - 1 ].right + actualGap[ k ] ) {
						if ( endOfProcess = ( ( curr[ 0 ] = intervalIterator[ 0 ].nextInterval() ) == null ) ) return null;
						break;
					} 
				}
			}
			
			// The resulting interval includes the leftmost gap.
			return Interval.valueOf( curr[ 0 ].left - actualGap[ 0 ], curr[ m - 1 ].right );
		}
		
		/** Returns the next interval.
		 * 
		 * @return the interval precomputed by {@link #reset()}, if any; otherwise, the interval obtained
		 * advancing the first iterator and realigning, or <code>null</code> if there are no more intervals.
		 */
		public Interval nextInterval() throws IOException {
			if ( next != null ) {
				final Interval result = next;
				next = null; 
				return result;
			}

			if ( endOfProcess ) return null;
			
			if ( ( curr[ 0 ] = intervalIterator[ 0 ].nextInterval() ) == null ) {
				endOfProcess = true;
				return null;
			}

			return align();
		}
		
		/** Returns the sum of the extents of the non-{@link IntervalIterators#TRUE} iterators 
		 * and of the actual gaps, minus {@link #m}, plus one.
		 * 
		 * <p>The correction compensates the increments applied in {@link #reset()}: there,
		 * the first actual gap starts from &minus;1 and every actual gap of a real iterator is incremented by one. 
		 * 
		 * @return the extent of this iterator.
		 */
		public int extent() {
			int s = 0;
			for ( int i = m; i-- != 0; ) s += intervalIterator[ i ].extent() + actualGap[ i ];
			return s - m + 1;
		}

	}


	/** An interval iterator analogous to {@link ConsecutiveIntervalIterator}, but working directly
	 * on the position arrays of the component index iterators. It is used by 
	 * {@link ConsecutiveDocumentIterator#getComposedIntervalIterator(Index)} when {@link #indexIterator} is not <code>null</code>.
	 * 
	 * <p>The gap array used by this class is <em>cumulative</em>: positions of the <code>i</code>-th iterator
	 * are normalised by subtracting <code>gap[ i ]</code>, so that a match is found when all normalised positions coincide.
	 */
	private class ConsecutiveIndexIntervalIterator extends AbstractOrderedIndexIntervalIterator {
		/** A cached reference to the (cumulative) gap array. */
		@SuppressWarnings("hiding")
		private final int[] gap;
		/** Whether the scan is over. */
		private boolean endOfProcess;
		
		/** Creates a new iterator using the given cumulative gaps.
		 * 
		 * @param gap the cumulative gap array; it is not copied.
		 */
		public ConsecutiveIndexIntervalIterator( final int[] gap ) {
			this.gap = gap;
		}
		
		/** Resets the iterator by reloading counts and positions from the index iterators, and then aligning all iterators.
		 * 
		 * Note that in this class {@link #curr} is used to denote the value of the current position
		 * minus the corresponding {@linkplain #gap cumulative gap}; this method updates {@link #curr} accordingly. */
		public void reset() throws IOException {
			final int[][] position = this.position;
			final int[] currPos = this.currPos;
			final int[] curr = this.curr;
			final int[] count = this.count;
			final int[] gap = this.gap;

			IntArrays.fill( currPos, 0 );
			for( int i = n; i-- != 0; ) {
				count[ i ] = indexIterator[ i ].count();
				position[ i ] = indexIterator[ i ].positionArray();
				curr[ i ] = position[ i ][ 0 ];
			}
			endOfProcess = false;
			// Normalise the first position of each iterator.
			for( int i = n; i-- != 0; ) curr[ i ] -= gap[ i ];
			
			if ( gap[ 0 ] != 0 ) {
				// Go beyond the 0-th gap. This must be done just once.
				next = null;
				while ( curr[ 0 ] < 0 && ++currPos[ 0 ] < count[ 0 ] ) curr[ 0 ] = position[ 0 ][ currPos[ 0 ] ] - gap[ 0 ];
				endOfProcess = currPos[ 0 ] == count[ 0 ];
			}
			if ( ! endOfProcess ) next = align();
		}

		/** Adds to the given set the term numbers of all component index iterators.
		 * 
		 * @param terms the set to which term numbers will be added.
		 */
		public void intervalTerms( final LongSet terms ) {
			for( int i = n; i-- != 0; ) terms.add( indexIterator[ i ].termNumber() );
		}
		
		/** Starting from the current normalised position of the first iterator, advances the iterators 
		 * cyclically until all normalised positions coincide.
		 * 
		 * @return the interval starting at the common normalised position and spanning the last cumulative gap,
		 * or <code>null</code> if the positions of some iterator have been exhausted 
		 * (in which case {@link #endOfProcess} is set).
		 */
		private Interval align() {
			if ( DEBUG ) System.err.println( this + ".align()" );
			if ( n == 1 ) return Interval.valueOf( curr[ 0 ], curr[ 0 ] + gap[ 0 ] );
			
			final int[][] position = this.position;
			final int[] currPos = this.currPos;
			final int[] curr = this.curr;
			final int[] count = this.count;
			final int[] gap = this.gap;
			
			int c, k = 1, l = n <= 2 ? 0 : 2; // This is actually 2 % n
			boolean oneRoundDone = false;
			int[] p;
			int start = curr[ 0 ];
			
			for(;;) {
				p = position[ k ];
				c = currPos[ k ];
				// First, we try to align the k-th term.
				while ( c < count[ k ] && p[ c ] - gap[ k ] < start ) c++;
				// If we exhaust the term positions, it's all over.
				if ( c == count[ k ] ) {
					endOfProcess = true;
					return null;
				}
				curr[ k ] = p[ currPos[ k ] = c ] - gap[ k ];
				// If we went beyond start, we must update start.
				if ( curr[ k ] > start ) start = curr[ k ]; 
				
				// If k == 0, it means we have made a full round of alignment, so the next check is now valid.
				oneRoundDone |= ( k == 0 );
				// If oneRoundDone, all current normalised positions (the values in curr, which already
				// have the cumulative gap subtracted) are squeezed between start and curr[ l ].
				if ( oneRoundDone && curr[ l ] == start ) return Interval.valueOf( curr[ 0 ], curr[ 0 ] + gap[ n - 1 ] ); 
				k = l;
				
				// l always runs one step ahead of k, cyclically.
				if ( ( l = l + 1 ) == n ) l = 0;
			}
		}
		
		/** Returns the next interval.
		 * 
		 * @return the interval precomputed by {@link #reset()}, if any; otherwise, the interval obtained
		 * moving the first iterator to its next position and realigning, or <code>null</code> if there are no more intervals.
		 */
		public Interval nextInterval() {
			if ( next != null ) {
				final Interval result = next;
				next = null; 
				return result;
			}

			if ( endOfProcess ) return null;
			
			if ( ++currPos[ 0 ] < count[ 0 ] ) curr[ 0 ] = position[ 0 ][ currPos[ 0 ] ] - gap[ 0 ];
			else {
				endOfProcess = true;
				return null;
			}
			return align();			
		}

		/** Returns the last cumulative gap plus one, which is the length of every interval returned by {@link #align()} 
		 * when there is more than one component iterator.
		 * 
		 * @return the extent of this iterator.
		 */
		public int extent() {
			return gap[ n - 1 ] + 1;
		}
	}
}