src.it.unimi.dsi.big.mg4j.search.LowPassDocumentIterator Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of mg4j-big Show documentation
MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections written in Java. The big version is a fork of the original MG4J that can handle more than 2^31 terms and documents.
The newest version!
package it.unimi.dsi.big.mg4j.search;

/*		 
 * MG4J: Managing Gigabytes for Java (big)
 *
 * Copyright (C) 2003-2011 Paolo Boldi and Sebastiano Vigna 
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 3 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 */

import it.unimi.dsi.big.mg4j.index.Index;
import it.unimi.dsi.big.mg4j.search.visitor.DocumentIteratorVisitor;
import it.unimi.dsi.fastutil.longs.LongSet;
import it.unimi.dsi.fastutil.objects.Reference2ReferenceArrayMap;
import it.unimi.dsi.fastutil.objects.Reference2ReferenceMap;
import it.unimi.dsi.fastutil.objects.Reference2ReferenceMaps;
import it.unimi.dsi.fastutil.objects.ReferenceSet;
import it.unimi.dsi.util.Interval;

import java.io.IOException;
import java.util.Iterator;



/** A document iterator that filters another document iterator, returning just intervals (and containing
 * documents) whose length does not exceed a given threshold.
 *
 * <p>A document returned by the underlying iterator is kept only if, for at least one of the
 * involved indices, the filtered interval iterator would return at least one interval
 * (see {@link #nextDocument()} and {@link #skipTo(long)}). Interval iterators of the underlying
 * iterator that are {@link IntervalIterators#TRUE} or {@link IntervalIterators#FALSE} are passed
 * upwards unchanged.
 * 
 * @author Paolo Boldi
 * @author Sebastiano Vigna
 * @since 0.9
 */

public class LowPassDocumentIterator extends AbstractDocumentIterator {

	private final static boolean DEBUG = false;
	@SuppressWarnings("unused")
	private final static boolean ASSERTS = false;

	/** The underlying iterator. */
	final private DocumentIterator documentIterator;
	/** If not null, the sole index involved in this iterator. */
	final private Index soleIndex;
	/** The iterator threshold: intervals whose length is greater than this value are discarded. */
	final protected int threshold; 
	/** A map from indices to interval iterators. The filtering iterators stored here are created lazily
	 * and reused (after a reset) for every document. */
	final private Reference2ReferenceArrayMap<Index,IntervalIterator> intervalIterators;
	/** A map from indices to the iterators returned for the current document. The key set may
	 * not contain an index because the related iterator has never been requested. Moreover,
	 * the iterator in this map for a given index may differ from the one in {@link #intervalIterators}
	 * because it could be {@link IntervalIterators#TRUE} (in fact, in that case it may even
	 * happen that {@link #intervalIterators} does not contain the index). */
	final private Reference2ReferenceArrayMap<Index,IntervalIterator> currentIterators;
	/** An unmodifiable wrapper around {@link #currentIterators}. */
	final private Reference2ReferenceMap<Index,IntervalIterator> unmodifiableCurrentIterators;


	/** Creates a new low-pass document iterator over a given iterator.
	 * @param documentIterator the iterator to be filtered.
	 * @param threshold the filter threshold.
	 */
	protected LowPassDocumentIterator( final DocumentIterator documentIterator, final int threshold ) {
		this.documentIterator = documentIterator;
		this.threshold = threshold;
		// We query the underlying iterator directly: calling the overridable indices() from a
		// constructor would run subclass code on a partially constructed object.
		final ReferenceSet<Index> indices = documentIterator.indices();
		final int n = indices.size();
		soleIndex = n == 1 ? indices.iterator().next() : null;
		intervalIterators = new Reference2ReferenceArrayMap<Index,IntervalIterator>( n );
		currentIterators = new Reference2ReferenceArrayMap<Index,IntervalIterator>( n );
		unmodifiableCurrentIterators = Reference2ReferenceMaps.unmodifiable( currentIterators );
	}

	/** Returns a low-pass document iterator over a given iterator.
	 * @param it the iterator to be filtered.
	 * @param threshold the filter threshold.
	 * @return a new low-pass document iterator filtering <code>it</code>.
	 */
	public static LowPassDocumentIterator getInstance( final DocumentIterator it, final int threshold ) {
		return new LowPassDocumentIterator( it, threshold );
	}

	/** Returns the set of indices of the underlying iterator.
	 * @return the indices of the underlying iterator.
	 */
	public ReferenceSet<Index> indices() {
		return documentIterator.indices();
	}

	/** Checks whether the current document survives the filter.
	 * 
	 * @return true if at least one interval iterator for the current document has a next interval.
	 */
	private boolean isValid() throws IOException {
		/* The policy here is that a low-pass is valid if at least one of the underlying
		 * interval iterators, once filtered, would return at least one interval. Note
		 * that TRUE iterators are not actually filtered, so they always 
		 * return true on a call to hasNext(). */
		
		if ( soleIndex != null ) return intervalIterator( soleIndex ).hasNext();
		
		for( final Index index: indices() ) if ( intervalIterator( index ).hasNext() ) return true;
		return false;
	}

	/** Advances to the next document of the underlying iterator that survives the filter.
	 * 
	 * @return the next valid document, or -1 if the underlying iterator returned -1.
	 */
	public long nextDocument() throws IOException {
		// The per-document iterator cache must be emptied before each candidate is examined.
		do currentIterators.clear(); while( ( curr = documentIterator.nextDocument() ) != -1 && ! isValid() );
		final long d = curr;
		// fromNextDocument() is defined outside this class; presumably it turns the -1
		// returned on exhaustion into the value kept in curr (TODO confirm).
		curr = fromNextDocument( d );
		return d;
	}
	
	/** Delegates to the underlying iterator.
	 * @return the result of <code>mayHaveNext()</code> on the underlying iterator.
	 */
	public boolean mayHaveNext() {
		return documentIterator.mayHaveNext();
	}

	/** Skips to the first valid document that is greater than or equal to the given one.
	 * 
	 * @param n a document pointer.
	 * @return the current document if it is already greater than or equal to <code>n</code>;
	 * otherwise, the current document after skipping on the underlying iterator and discarding
	 * candidates that do not survive the filter.
	 */
	public long skipTo( final long n ) throws IOException {
		if ( DEBUG ) System.err.println( "********* skipTo(" + n+ "); last = " + curr );
		if ( curr >= n ) return curr;

		currentIterators.clear();
		// We first try to get a candidate document; if it is filtered out, nextDocument()
		// looks for the next valid one and updates curr accordingly.
		if ( ( curr = documentIterator.skipTo( n ) ) != END_OF_LIST && ! isValid() ) nextDocument(); 
		return curr;
	}

	/** Returns an unmodifiable map from each index to its interval iterator for the current document.
	 * 
	 * @return an unmodifiable view of the iterators for the current document, after an iterator
	 * has been requested for every index.
	 */
	public Reference2ReferenceMap<Index,IntervalIterator> intervalIterators() throws IOException {
		// Requesting each iterator has the side effect of filling currentIterators.
		for( final Index index: indices() ) intervalIterator( index );
		return unmodifiableCurrentIterators;
	}

	/** Returns the interval iterator for the sole index of this iterator.
	 * 
	 * @return the interval iterator for the sole index.
	 * @throws IllegalStateException if this iterator does not involve exactly one index.
	 */
	public IntervalIterator intervalIterator() throws IOException {
		if ( soleIndex == null ) throw new IllegalStateException();
		return intervalIterator( soleIndex );
	}

	/** Returns the interval iterator for the given index and the current document.
	 * 
	 * @param index an index.
	 * @return {@link IntervalIterators#FALSE} if the index is not among those of the underlying iterator
	 * or if no interval survives the filter; the underlying interval iterator itself if it is 
	 * {@link IntervalIterators#TRUE} or {@link IntervalIterators#FALSE}; a filtering iterator otherwise.
	 */
	public IntervalIterator intervalIterator( final Index index ) throws IOException {
		ensureOnADocument();
		if ( DEBUG ) System.err.println( this + ".intervalIterator(" + index + ")" );
		if ( ! documentIterator.indices().contains( index ) ) return IntervalIterators.FALSE;

		IntervalIterator intervalIterator;

		// If the iterator has been created and it's ready, we just return it.		
		if ( ( intervalIterator = currentIterators.get( index ) ) != null ) return intervalIterator;

		intervalIterator = documentIterator.intervalIterator( index );
			
		/* If the underlying iterator is TRUE or FALSE, then our contribution to the result is not relevant,
		 * and we just pass this information upwards. E.g., consider the query (A OR title:B)~2 with
		 * a document containing A but not B in its title. When evaluating the query for the title index,
		 * the subquery before the low-pass operator evaluates to TRUE, meaning that its truth is independent
		 * of the title field. This fact is not changed by the low-pass operator. */
		
		if ( intervalIterator != IntervalIterators.TRUE && intervalIterator != IntervalIterators.FALSE ) {
			// The filtering iterator is created once per index and then reset for each document.
			intervalIterator = intervalIterators.get( index );
			if ( intervalIterator == null ) intervalIterators.put( index, intervalIterator = new LowPassIntervalIterator( index ) );
			intervalIterator.reset();
			// If every interval has been filtered out, for this index the document does not match.
			if ( ! intervalIterator.hasNext() ) intervalIterator = IntervalIterators.FALSE;
		}
		
		currentIterators.put( index, intervalIterator );	
		return intervalIterator;
	}

	/** Disposes the underlying iterator. */
	public void dispose() throws IOException {
		documentIterator.dispose();
	}
	
	/** Accepts a visitor, passing it to the underlying iterator.
	 * 
	 * @param visitor the visitor.
	 * @return null if <code>visitPre()</code> returns false or the visit of the underlying iterator
	 * returns null; the result of <code>visitPost()</code> otherwise.
	 */
	public <T> T accept( final DocumentIteratorVisitor<T> visitor ) throws IOException {
		if ( ! visitor.visitPre( this ) ) return null;
		final T[] a = visitor.newArray( 1 );
		// The visitor may return no array: in that case the subresult is just checked, not stored.
		if ( a == null ) {
			if ( documentIterator.accept( visitor ) == null ) return null;
		}
		else {
			if ( ( a[ 0 ] = documentIterator.accept( visitor ) ) == null ) return null;
		}
		return visitor.visitPost( this, a );
	}

	/** Accepts a visitor, passing it to the underlying iterator using <code>acceptOnTruePaths()</code>.
	 * 
	 * @param visitor the visitor.
	 * @return null if <code>visitPre()</code> returns false or the visit of the underlying iterator
	 * returns null; the result of <code>visitPost()</code> otherwise.
	 */
	public <T> T acceptOnTruePaths( final DocumentIteratorVisitor<T> visitor ) throws IOException {
		if ( ! visitor.visitPre( this ) ) return null;
		final T[] a = visitor.newArray( 1 );
		// The visitor may return no array: in that case the subresult is just checked, not stored.
		if ( a == null ) {
			if ( documentIterator.acceptOnTruePaths( visitor ) == null ) return null;			
		}
		else {
			if ( ( a[ 0 ] = documentIterator.acceptOnTruePaths( visitor ) ) == null ) return null;
		}
		return visitor.visitPost( this, a );
	}
	
	public String toString() {
	   return this.getClass().getSimpleName() + "(" + documentIterator + ", " + threshold + ")";
	}
	
	/** An interval iterator returning just the intervals whose length does not exceed {@link #threshold}. */
	
	private class LowPassIntervalIterator extends AbstractIntervalIterator implements IntervalIterator {
		/** The index of this iterator. */
		final Index index;
		/** The underlying interval iterator; it is null until {@link #reset()} is called. */
		private IntervalIterator intervalIterator;
		
		/** Creates a new filtering interval iterator for the given index.
		 * @param index the index of this iterator.
		 */
		public LowPassIntervalIterator( final Index index ) {
			this.index = index;
		}

		/** Clears the pending interval and fetches again the underlying interval iterator
		 * for {@link #index} from the enclosing document iterator. */
		public void reset( ) throws IOException {
			// next is declared outside this class (presumably the look-ahead slot 
			// of AbstractIntervalIterator -- TODO confirm).
			next = null;
			intervalIterator = documentIterator.intervalIterator( index );
		}

		public void intervalTerms( final LongSet terms ) {
			// Just delegate to the filtered iterator
			intervalIterator.intervalTerms( terms );
		}
		
		/** Returns the next interval whose length does not exceed the threshold.
		 * 
		 * @return the pending interval, if any; otherwise, the next interval of the underlying
		 * iterator whose length is at most {@link #threshold}, or null if there is none.
		 */
		public Interval nextInterval() throws IOException {
			// If an interval is pending, we hand it out exactly once.
			if ( next != null ) {
				final Interval result = next;
				next = null; 
				return result;
			}

			// Otherwise, we skip all intervals that are too long.
			Interval result;
			while( ( result = intervalIterator.nextInterval() ) != null && result.length() > threshold );
			return result;
		}
		
		/** Returns the extent of the underlying iterator, capped at the threshold.
		 * @return the minimum between the extent of the underlying iterator and {@link #threshold}.
		 */
		public int extent() {
			return Math.min( intervalIterator.extent(), threshold );
		}
		
		public String toString() {
		   return getClass().getSimpleName() + "(" + intervalIterator + ", " + threshold + ")";
		}
	}
}