All Downloads are FREE. The search and download functionality uses the official Maven repository.

src.it.unimi.dsi.big.mg4j.search.AndDocumentIterator Maven / Gradle / Ivy

Go to download

MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections written in Java. The big version is a fork of the original MG4J that can handle more than 2^31 terms and documents.

The newest version!
package it.unimi.dsi.big.mg4j.search;

/*		 
 * MG4J: Managing Gigabytes for Java (big)
 *
 * Copyright (C) 2003-2011 Paolo Boldi and Sebastiano Vigna 
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 3 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 */

import it.unimi.dsi.big.mg4j.index.Index;
import it.unimi.dsi.fastutil.ints.IntHeapSemiIndirectPriorityQueue;
import it.unimi.dsi.fastutil.longs.LongSet;
import it.unimi.dsi.fastutil.objects.ObjectArrays;
import it.unimi.dsi.fastutil.objects.ObjectHeapSemiIndirectPriorityQueue;
import it.unimi.dsi.util.Interval;
import it.unimi.dsi.util.Intervals;

import java.io.IOException;
import java.util.NoSuchElementException;


/** A document iterator that returns the AND of a number of document iterators.
 *
 * 

This class adds to {@link it.unimi.dsi.big.mg4j.search.AbstractIntersectionDocumentIterator} * an interval iterator generating the AND of the intervals returned for each of the documents involved. */ public class AndDocumentIterator extends AbstractIntersectionDocumentIterator { private static final boolean ASSERTS = false; private final static boolean DEBUG = false; /** Returns a document iterator that computes the AND of the given array of iterators. * *

Note that the special case of the empty and of the singleton arrays * are handled efficiently. * * @param index the default index; relevant only if it has zero length. * @param documentIterator the iterators to be joined. * @return a document iterator that computes the AND of it. * @throws IOException */ public static DocumentIterator getInstance( final Index index, final DocumentIterator... documentIterator ) throws IOException { if ( documentIterator.length == 0 ) return TrueDocumentIterator.getInstance( index ); if ( documentIterator.length == 1 ) return documentIterator[ 0 ]; return new AndDocumentIterator( documentIterator ); } /** Returns a document iterator that computes the AND of the given nonzero-length array of iterators. * *

Note that the special case of the singleton array is handled efficiently. * * @param documentIterator the iterators to be joined (at least one). * @return a document iterator that computes the AND of it. * @throws IOException */ public static DocumentIterator getInstance( final DocumentIterator... documentIterator ) throws IOException { if ( documentIterator.length == 0 ) throw new IllegalArgumentException( "The provided array of document iterators is empty." ); return getInstance( null, documentIterator ); } protected AndDocumentIterator( final DocumentIterator[] documentIterator ) { super( documentIterator ); } protected IntervalIterator getComposedIntervalIterator( final Index index ) { return indexIterator == null ? new AndIntervalIterator( index ) : new AndIndexIntervalIterator( index ); } /** An interval iterator returning the AND (in the Clarke−Cormack−Burkowski lattice) of the component interval iterators. */ private class AndIntervalIterator extends AbstractCompositeIntervalIterator implements IntervalIterator { /** The index of this iterator. */ final private Index index; /** A heap-based indirect priority queue used to keep track of the currently scanned intervals. */ final private ObjectHeapSemiIndirectPriorityQueue queue; /** Whether the scan is over. */ private boolean endOfProcess; /** The left extreme of the last returned interval, or {@link Integer#MIN_VALUE} after a {@link #reset()}. */ private int lastLeft; /** The maximum right extreme currently in the queue. */ private int maxRight; /** Creates a new AND interval iterator. * * @param index the index of the iterator. */ public AndIntervalIterator( final Index index ) { super( n ); // We just set up some internal data, but we perform no initialisation. 
this.index = index; queue = new ObjectHeapSemiIndirectPriorityQueue( curr, Intervals.STARTS_BEFORE_OR_PROLONGS ); } public void reset() throws IOException { ObjectArrays.fill( curr, null ); queue.clear(); next = null; maxRight = Integer.MIN_VALUE; lastLeft = Integer.MIN_VALUE; endOfProcess = false; for ( int i = 0; i < n; i++ ) { intervalIterator[ i ] = documentIterator[ i ].intervalIterator( index ); // TRUE and FALSE iterators are simply skipped. if ( intervalIterator[ i ] != IntervalIterators.TRUE && intervalIterator[ i ] != IntervalIterators.FALSE ) { curr[ i ] = intervalIterator[ i ].nextInterval(); queue.enqueue( i ); maxRight = Math.max( maxRight, curr[ i ].right ); } } if ( ASSERTS ) assert ! queue.isEmpty(); } public void intervalTerms( final LongSet terms ) { for( int i = n; i-- != 0; ) intervalIterator[ i ].intervalTerms( terms ); } public boolean hasNext() { try { return hasNextInternal(); } catch ( IOException e ) { throw new RuntimeException( e ); } } private boolean hasNextInternal() throws IOException { if ( endOfProcess ) return false; int first; while ( curr[ first = queue.first() ].left == lastLeft ) { if ( ( curr[ first ] = intervalIterator[ first ].nextInterval() ) == null ) { endOfProcess = true; return false; } maxRight = Math.max( maxRight, curr[ first ].right ); queue.changed(); } return true; } public Interval nextInterval() throws IOException { if ( ! hasNextInternal() ) return null; int nextLeft, nextRight; do { final int first = queue.first(); nextLeft = curr[ first ].left; nextRight = maxRight; if ( DEBUG ) System.err.println( this + " is saving interval " + Interval.valueOf( nextLeft, nextRight ) ); /* We check whether the current top is equal to the span of the queue, and * whether the top interval iterator is exhausted. In both cases, the * current span is guaranteed to be a minimal interval. 
*/ if ( curr[ first ].right == maxRight || ( endOfProcess = ( curr[ first ] = intervalIterator[ first ].nextInterval() ) == null ) ) break; maxRight = Math.max( maxRight, curr[ first ].right ); queue.changed(); } while ( maxRight == nextRight ); return Interval.valueOf( lastLeft = nextLeft, nextRight ); } public int extent() { int s = 0; for ( int i = n; i-- != 0; ) s += intervalIterator[ i ].extent(); return s; } } /** An interval iterator returning the AND (in the Clarke−Cormack−Burkowski lattice) of the component interval iterators. */ private class AndIndexIntervalIterator extends AbstractCompositeIndexIntervalIterator implements IntervalIterator { /** The index of this iterator. */ final private Index index; /** A heap-based indirect priority queue used to keep track of the currently scanned positions. */ final private IntHeapSemiIndirectPriorityQueue queue; /** Whether the scan is over. */ private boolean endOfProcess; /** The left extreme of the last returned interval, or {@link Integer#MIN_VALUE} after a {@link #reset()}. */ private int lastLeft; /** The maximum right extreme currently in the queue. */ private int maxRight; /** Creates a new AND interval iterator. * * @param index the index of the iterator. */ public AndIndexIntervalIterator( final Index index ) { super( n ); // We just set up some internal data, but we perform no initialisation. this.index = index; queue = new IntHeapSemiIndirectPriorityQueue( curr ); } public void reset() throws IOException { queue.clear(); maxRight = Integer.MIN_VALUE; lastLeft = Integer.MIN_VALUE; next = null; endOfProcess = false; for ( int i = 0; i < n; i++ ) { // The case != index is identical to the TRUE/FALSE case in AndIntervalIterator. 
final Index indexIteratorIndex = indexIterator[ i ].index(); if ( indexIteratorIndex == index && indexIteratorIndex.hasPositions ) { position[ i ] = indexIterator[ i ].positionArray(); count[ i ] = indexIterator[ i ].count(); curr[ i ] = position[ i ][ currPos[ i ] = 0 ]; queue.enqueue( i ); maxRight = Math.max( maxRight, curr[ i ] ); } } if ( ASSERTS ) assert ! queue.isEmpty(); } public void intervalTerms( final LongSet terms ) { for( int i = n; i-- != 0; ) terms.add( indexIterator[ i ].termNumber() ); } public boolean hasNext() { if ( endOfProcess ) return false; int first; while ( curr[ first = queue.first() ] == lastLeft ) { if ( ++currPos[ first ] == count[ first ] ) { endOfProcess = true; return false; } curr[ first ] = position[ first ][ currPos[ first ] ]; maxRight = Math.max( maxRight, curr[ first ] ); queue.changed(); } return true; } public Interval next() { final Interval nextInterval = nextInterval(); if ( nextInterval == null ) throw new NoSuchElementException(); return nextInterval; } public Interval nextInterval() { if ( ! hasNext() ) return null; int nextLeft, nextRight; do { final int first = queue.first(); nextLeft = curr[ first ]; nextRight = maxRight; if ( DEBUG ) System.err.println( this + " is saving interval " + Interval.valueOf( nextLeft, nextRight ) ); /* We check whether all iterators are on the same position, and * whether the top interval iterator is exhausted. In both cases, the * current span is guaranteed to be a minimal interval. */ if ( curr[ first ] == maxRight || ( endOfProcess = ++currPos[ first ] == count[ first ] ) ) break; curr[ first ] = position[ first ][ currPos[ first ] ]; if ( maxRight < curr[ first ] ) maxRight = curr[ first ]; queue.changed(); } while ( maxRight == nextRight ); return Interval.valueOf( lastLeft = nextLeft, nextRight ); } public int extent() { return n; } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy