src.it.unimi.dsi.big.mg4j.search.AbstractUnionDocumentIterator Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of mg4j-big Show documentation
MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections written in Java. The big version is a fork of the original MG4J that can handle more than 2^31 terms and documents.
The newest version!
package it.unimi.dsi.big.mg4j.search;

/*		 
 * MG4J: Managing Gigabytes for Java (big)
 *
 * Copyright (C) 2003-2011 Paolo Boldi and Sebastiano Vigna 
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 3 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 */

import it.unimi.dsi.big.mg4j.index.Index;
import it.unimi.dsi.big.mg4j.search.visitor.DocumentIteratorVisitor;
import it.unimi.dsi.fastutil.IndirectPriorityQueue;
import it.unimi.dsi.fastutil.longs.LongHeapSemiIndirectPriorityQueue;
import it.unimi.dsi.fastutil.objects.Reference2ReferenceArrayMap;
import it.unimi.dsi.fastutil.objects.Reference2ReferenceMap;
import it.unimi.dsi.fastutil.objects.Reference2ReferenceMaps;

import java.io.IOException;
import java.util.Iterator;

/** A document iterator generating the union of the documents returned
 * by a number of document iterators.
 * 
 * The pattern of this class is the same as that of {@link AbstractIntersectionDocumentIterator}.
 * Additionally, this class provides a mechanism that makes accessible the set of component
 * document iterators that are {@linkplain #computeFront() positioned on the current document}.
 */

public abstract class AbstractUnionDocumentIterator extends AbstractCompositeDocumentIterator {
	private final static boolean DEBUG = false;

	/** A heap-based semi-indirect priority queue used to keep track of the currently scanned integers. */
	final protected LongHeapSemiIndirectPriorityQueue queue;
	/** The {@link IndirectPriorityQueue#front(int[])} of {@link #queue}, if {@link #frontSize} is not -1. */
	final protected int[] front;
	/** The reference array used for the queue; entry <var>i</var> is the document on which the
	 * <var>i</var>-th component iterator is currently positioned. */
	final protected long[] refArray;
	/** A map from indices to interval iterators. It caches, for each index, the composed iterator
	 * returned by {@link #getComposedIntervalIterator(Index)}, so that it is created only once. */
	final private Reference2ReferenceArrayMap<Index,IntervalIterator> intervalIterators;
	/** A map from indices to the iterators returned for the current document. The key set may
	 * not contain an index because the related iterator has never been requested. Moreover,
	 * the iterator in this map for a given index may differ from the one in {@link #intervalIterators}
	 * because it could be {@link IntervalIterators#TRUE} (in fact, in that case it may even
	 * happen that {@link #intervalIterators} does not contain the index). */
	final private Reference2ReferenceArrayMap<Index,IntervalIterator> currentIterators;
	/** An unmodifiable wrapper around {@link #currentIterators}. */
	final private Reference2ReferenceMap<Index,IntervalIterator> unmodifiableCurrentIterators;

	/** The number of valid entries in {@link #front}, or -1 if the front has not been computed for the current document. */
	protected int frontSize = -1;

	/** Creates a new document iterator that computes the OR of the given array of iterators.
	 * 
	 * <p>Each component iterator is advanced once (by calling its <code>nextDocument()</code>
	 * method) so that {@link #refArray} contains its first document; components that are
	 * empty from the start are never enqueued. 
	 * 
	 * @param documentIterator the iterators to be joined.
	 * @throws IOException if a component iterator throws it while being advanced.
	 */
	protected AbstractUnionDocumentIterator( final DocumentIterator... documentIterator ) throws IOException {
		super( documentIterator );
		this.refArray = new long[ n ];

		queue = new LongHeapSemiIndirectPriorityQueue( refArray );

		intervalIterators = new Reference2ReferenceArrayMap<Index,IntervalIterator>( indices.size() );
		currentIterators = new Reference2ReferenceArrayMap<Index,IntervalIterator>( indices.size() );
		unmodifiableCurrentIterators = Reference2ReferenceMaps.unmodifiable( currentIterators );

		// Only add to the queue nonempty iterators...
		for ( int i = 0; i < n; i++ ) if ( ( refArray[ i ] = documentIterator[ i ].nextDocument() ) != -1 ) queue.enqueue( i );

		// If queue is empty, the process is over
		if ( queue.isEmpty() ) curr = END_OF_LIST;
		// The front can never be larger than the number of components that are still alive.
		front = new int[ queue.size() ];
	}

	/** Skips to the first document of the union that is greater than or equal to the given one.
	 * 
	 * <p>If the current document already satisfies the request, nothing happens. Otherwise,
	 * the component at the top of the queue is repeatedly skipped until the least
	 * document in the queue is at least <code>n</code>; components that reach the end of their
	 * list are removed from the queue.
	 * 
	 * @param n the document to skip to.
	 * @return the current document after the skip, or <code>END_OF_LIST</code> if
	 * all component iterators have been exhausted.
	 * @throws IOException if a component iterator throws it.
	 */
	public long skipTo( final long n ) throws IOException {
		if ( curr >= n ) return curr;

		currentIterators.clear(); 
		frontSize = -1; // Invalidate front

		int first;
		long res;
		while( refArray[ first = queue.first() ] < n ) {
			// Cannot advance the minimum
			if ( ( res = documentIterator[ first ].skipTo( n ) ) == END_OF_LIST ) {
				// Remove it
				queue.dequeue();
				// If nothing else remains, we are done
				if ( queue.isEmpty() ) return curr = END_OF_LIST;
			}
			else {
				// Advance the top element, and signal this fact to the queue
				refArray[ first ] = res;
				queue.changed();
			}
		}
		
		return curr = refArray[ first ];
	}

	/** Moves to the next document of the union.
	 * 
	 * <p>On the first call no component is advanced, as the constructor has already
	 * positioned every component on its first document. On subsequent calls, all components
	 * positioned on the current document are advanced (and removed from the queue if exhausted).
	 * 
	 * @return the next document, or -1 if no more documents are available.
	 * @throws IOException if a component iterator throws it.
	 */
	public long nextDocument() throws IOException {
		if ( curr == END_OF_LIST ) return -1;
		final long c = refArray[ queue.first() ];
		// On the first call, the queue should not be advanced.
		if ( curr == -1 ) return curr = c;
		currentIterators.clear(); 
		frontSize = -1; // Invalidate front
		
		// The least element
		int first;
	
		// Advance all elements equal to the least one
		while( refArray[ first = queue.first() ] == c ) {
			if ( ( refArray[ first ] = documentIterator[ first ].nextDocument() ) != - 1 ) queue.changed();
			else {
				// Remove it
				queue.dequeue();
				// If nothing else remains, we are done
				if ( queue.isEmpty() ) {
					curr = END_OF_LIST;
					return -1;
				}
			}
		}

		return curr = refArray[ first ];
	}
	
	/** Forces computation of the current front, returning the number of indices it contains.
	 * 
	 * After a call to this method, 
	 * the first elements of {@link #front} contain
	 * the indices of the {@linkplain AbstractCompositeDocumentIterator#documentIterator component document iterators}
	 * that are positioned on the current document. If the front has already been
	 * computed for the current document, this method has no side effects.
	 * 
	 * @return the size of the current front (the number of valid entries in {@link #front}).
	 */
	
	protected int computeFront() {
		if ( frontSize == -1 ) frontSize = queue.front( front );
		return frontSize;
	}

	/** Returns the interval iterators of the current document for all indices of this iterator.
	 * 
	 * <p>This method requests {@link #intervalIterator(Index)} for every index, so that
	 * the returned map is completely filled.
	 * 
	 * @return an unmodifiable map from indices to the interval iterators for the current document.
	 * @throws IOException if computing an interval iterator throws it.
	 */
	public Reference2ReferenceMap<Index,IntervalIterator> intervalIterators() throws IOException {
		final Iterator<Index> i = indices.iterator();
		while( i.hasNext() ) intervalIterator( i.next() );
		return unmodifiableCurrentIterators;
	}

	/** Returns the interval iterator of the current document for the given index.
	 * 
	 * <p>The result is cached until the current document changes. It is computed by looking
	 * at the component iterators in the {@linkplain #computeFront() current front}:
	 * <ul>
	 * <li>if the index is not among those of this iterator, or all components in the front
	 * are {@link IntervalIterators#FALSE} for the index, the result is {@link IntervalIterators#FALSE};
	 * <li>if all components that are not {@link IntervalIterators#FALSE} are {@link IntervalIterators#TRUE},
	 * the result is {@link IntervalIterators#TRUE};
	 * <li>if exactly one component provides an actual interval iterator, that iterator is returned directly;
	 * <li>otherwise, the (cached and reset) iterator provided by
	 * {@link #getComposedIntervalIterator(Index)} is returned.
	 * </ul>
	 * 
	 * @param index an index.
	 * @return the interval iterator of the current document for <code>index</code>.
	 * @throws IOException if a component iterator throws it.
	 */
	public IntervalIterator intervalIterator( final Index index ) throws IOException {
		ensureOnADocument();
		if ( DEBUG ) System.err.println( this + ".intervalIterator(" + index + ")" );
		if ( ! indices.contains( index ) ) return IntervalIterators.FALSE;

		IntervalIterator intervalIterator;

		// If the iterator has been created and it's ready, we just return it.		
		if ( ( intervalIterator = currentIterators.get( index ) ) != null ) return intervalIterator;

		int t = 0, f = 0, c = computeFront();
	
		/* We count the number of TRUE and FALSE iterators. In the case of index iterators, we can avoid 
		 * the check and just rely on the index internals.
		 * 
		 * If all iterators are FALSE, we return FALSE. Else if all remaining iterators are TRUE
		 * we return TRUE. 
		 */
		IntervalIterator soleIterator = null;
		if ( indexIterator == null ) 
			for( int i = c; i -- != 0; ) {
				intervalIterator = documentIterator[ front[ i ] ].intervalIterator( index );
				if ( intervalIterator == IntervalIterators.TRUE ) t++;
				else if ( intervalIterator == IntervalIterators.FALSE ) f++;
				else if ( soleIterator == null ) soleIterator = intervalIterator;
			}
		else
			for( int i = c; i -- != 0; ) {
				final Index indexIteratorIndex = indexIterator[ front[ i ] ].index();
				if ( indexIteratorIndex != index ) f++;
				else if ( ! indexIteratorIndex.hasPositions ) t++;
				else if ( soleIterator == null ) soleIterator = indexIterator[ front[ i ] ].intervalIterator( index );
			}

		if ( f == c ) intervalIterator = IntervalIterators.FALSE;
		else if ( f + t == c ) intervalIterator = IntervalIterators.TRUE;
		else if ( f + t < c - 1 ) {
			// At least two nontrivial iterators: we need the composed iterator, which is created lazily.
			intervalIterator = intervalIterators.get( index );
			if ( intervalIterator == null ) intervalIterators.put( index, intervalIterator = getComposedIntervalIterator( index ) );
			intervalIterator.reset();
		} else intervalIterator = soleIterator; // Exactly one nontrivial iterator: no composition is necessary.
				
		currentIterators.put( index, intervalIterator );	
		return intervalIterator;
	}

	/** Creates the interval iterator that composes the interval iterators of the
	 * component iterators for the given index.
	 * 
	 * <p>This method is invoked at most once per index by {@link #intervalIterator(Index)}; the
	 * returned iterator is cached and reset each time it is reused.
	 * 
	 * @param index an index.
	 * @return the composed interval iterator for <code>index</code>.
	 */
	abstract protected IntervalIterator getComposedIntervalIterator( Index index );

	/** Invokes {@link #acceptOnTruePaths(DocumentIteratorVisitor)} only on component
	 * iterators positioned on the current document.
	 * 
	 * @param visitor a visitor.
	 * @return the result of {@link DocumentIteratorVisitor#visitPost(DocumentIterator, Object[])}, or
	 * <code>null</code> if {@link DocumentIteratorVisitor#visitPre(DocumentIterator)} returned false
	 * or the visit of a component returned <code>null</code> (i.e., the visit should stop).
	 * @throws IOException if a component iterator throws it.
	 */
	
	@Override
	public <T> T acceptOnTruePaths( DocumentIteratorVisitor<T> visitor ) throws IOException {
		if ( ! visitor.visitPre( this ) ) return null;
		final int s  = computeFront();
		final T[] a = visitor.newArray( s );
		if ( a == null ) {
			// The visitor does not collect results: we just propagate the visit.
			for( int i = 0; i < s; i++ ) if ( documentIterator[ front[ i ] ].acceptOnTruePaths( visitor ) == null ) return null;
		}
		else {
			for( int i = 0; i < s; i++ ) if ( ( a[ i ] = documentIterator[ front[ i ] ].acceptOnTruePaths( visitor ) ) == null ) return null;
		}
		return visitor.visitPost( this, a );
	}
}