All downloads are FREE. The search and download functionality uses the official Maven repository.

src.it.unimi.dsi.big.mg4j.search.OrDocumentIterator Maven / Gradle / Ivy

Go to download

MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections written in Java. The big version is a fork of the original MG4J that can handle more than 2^31 terms and documents.

The newest version!
package it.unimi.dsi.big.mg4j.search;

/*		 
 * MG4J: Managing Gigabytes for Java (big)
 *
 * Copyright (C) 2003-2011 Paolo Boldi and Sebastiano Vigna 
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 3 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 */

import it.unimi.dsi.big.mg4j.index.Index;
import it.unimi.dsi.big.mg4j.index.IndexIterator;
import it.unimi.dsi.fastutil.ints.IntHeapSemiIndirectPriorityQueue;
import it.unimi.dsi.fastutil.longs.LongSet;
import it.unimi.dsi.fastutil.objects.ObjectHeapSemiIndirectPriorityQueue;
import it.unimi.dsi.util.Interval;
import it.unimi.dsi.util.Intervals;

import java.io.IOException;

/** An iterator on documents that returns the OR of a number of document iterators.
 *
 * <p>This class adds to {@link it.unimi.dsi.big.mg4j.search.AbstractUnionDocumentIterator}
 * an interval iterator generating the OR of the intervals returned for each of the documents involved.
 */
public class OrDocumentIterator extends AbstractUnionDocumentIterator implements DocumentIterator {
	@SuppressWarnings("unused")
	private final static boolean DEBUG = false;

	/** Returns a document iterator that computes the OR of the given array of iterators.
	 *
	 * <p>Note that the special case of the empty and of the singleton arrays
	 * are handled efficiently: an empty array yields a {@link FalseDocumentIterator}
	 * on <code>index</code>, and a singleton array yields its only element.
	 *
	 * @param index the default index; relevant only if <code>documentIterator</code> has zero length.
	 * @param documentIterator the iterators to be joined.
	 * @return a document iterator that computes the OR of <code>documentIterator</code>.
	 * @throws IOException
	 */
	public static DocumentIterator getInstance( final Index index, DocumentIterator... documentIterator ) throws IOException {
		if ( documentIterator.length == 0 ) return FalseDocumentIterator.getInstance( index );
		if ( documentIterator.length == 1 ) return documentIterator[ 0 ];
		return new OrDocumentIterator( documentIterator );
	}

	/** Returns a document iterator that computes the OR of the given nonzero-length array of iterators.
	 *
	 * <p>Note that the special case of the singleton array is handled efficiently.
	 *
	 * @param documentIterator the iterators to be joined.
	 * @return a document iterator that computes the OR of <code>documentIterator</code>.
	 * @throws IllegalArgumentException if <code>documentIterator</code> is empty.
	 * @throws IOException
	 */
	public static DocumentIterator getInstance( DocumentIterator... documentIterator ) throws IOException {
		if ( documentIterator.length == 0 ) throw new IllegalArgumentException( "The provided array of document iterators is empty." );
		// The index is only consulted for an empty array, which has just been ruled out.
		return getInstance( null, documentIterator );
	}

	/** Creates a new document iterator that computes the OR of the given array of iterators.
	 *
	 * @param documentIterator the iterators to be joined.
	 * @throws IOException
	 */
	protected OrDocumentIterator( final DocumentIterator... documentIterator ) throws IOException {
		super( documentIterator );
	}

	/** Returns the interval iterator for the given index: the position-based
	 * {@link OrIndexIntervalIterator} when <code>indexIterator</code> is available,
	 * the generic interval-based iterator otherwise.
	 *
	 * @param index the index for which intervals are requested.
	 * @return a new interval iterator for <code>index</code>.
	 */
	protected IntervalIterator getComposedIntervalIterator( final Index index ) {
		return indexIterator == null ? new OrIntervalIterator( index ) : new OrIndexIntervalIterator( index );
	}

	/** An interval iterator of nondecreasing disjoint intervals
	 * obtained ORing the output of a set of iterators with
	 * the same property.
	 */
	private class OrIntervalIterator extends AbstractCompositeIntervalIterator {
		/** The index of this iterator. */
		final Index index;
		/** A heap-based indirect priority queue used to keep track of the currently scanned intervals. */
		private final ObjectHeapSemiIndirectPriorityQueue<Interval> intervalQueue;
		/** The left extreme of the last returned interval, or {@link Integer#MIN_VALUE} after a {@link #reset()}. */
		private int lastLeft;
		/** An array to hold the front of the interval queue. */
		private final int[] intervalFront;

		/** Creates a new OR interval iterator.
		 *
		 * @param index the index of the iterator.
		 */
		public OrIntervalIterator( final Index index ) {
			super( n );
			// We just set up some internal data, but we perform no initialisation.
			this.index = index;
			intervalQueue = new ObjectHeapSemiIndirectPriorityQueue<Interval>( curr, Intervals.ENDS_BEFORE_OR_IS_SUFFIX );
			intervalFront = new int[ n ];
		}

		/** Rebuilds the interval queue from the component iterators listed in <code>front</code>.
		 *
		 * <p>For each such component a fresh interval iterator on {@link #index} is obtained;
		 * components returning {@link IntervalIterators#TRUE} or having no interval are not enqueued.
		 */
		public void reset() throws IOException {
			lastLeft = Integer.MIN_VALUE;
			next = null;
			intervalQueue.clear();

			for ( int i = computeFront(), k; i-- != 0; ) {
				k = front[ i ];
				intervalIterator[ k ] = documentIterator[ k ].intervalIterator( index );
				// TRUE iterators are never advanced; the others are enqueued only if they yield a first interval.
				if ( intervalIterator[ k ] != IntervalIterators.TRUE && ( curr[ k ] = intervalIterator[ k ].nextInterval() ) != null ) intervalQueue.enqueue( k );
			}
		}

		/** Adds to the given set the terms reported by every component iterator
		 * currently in the front of the interval queue.
		 *
		 * @param terms the set that will receive the terms.
		 */
		public void intervalTerms( final LongSet terms ) {
			final int[] queueFront = this.intervalFront;
			final int frontSize = intervalQueue.front( queueFront );
			for ( int i = frontSize; i-- != 0; ) intervalIterator[ queueFront[ i ] ].intervalTerms( terms );
		}

		/** Advances the queue until its top interval has a left extreme strictly greater
		 * than {@link #lastLeft}.
		 *
		 * @return the index of the component holding the next interval, or -1 if there is none.
		 */
		private int hasNextInternal() throws IOException {
			if ( intervalQueue.isEmpty() ) return -1;

			int first;

			while ( curr[ first = intervalQueue.first() ].left <= lastLeft ) {
				// The top interval cannot be returned: replace it with the next one from the same component...
				if ( ( curr[ first ] = intervalIterator[ first ].nextInterval() ) != null ) intervalQueue.changed();
				else {
					// ...or drop the component if it is exhausted.
					intervalQueue.dequeue();
					if ( intervalQueue.isEmpty() ) return -1;
				}
			}

			return first;
		}

		/** Returns whether there is a next interval, wrapping any {@link IOException} in a {@link RuntimeException}. */
		public boolean hasNext() {
			try {
				return hasNextInternal() != -1;
			}
			catch ( IOException e ) {
				throw new RuntimeException( e );
			}
		}

		/** Returns the next interval, or <code>null</code> if there are no more intervals. */
		public Interval nextInterval() throws IOException {
			final int first = hasNextInternal();
			if ( first == -1 ) return null;
			lastLeft = curr[ first ].left;
			return curr[ first ];
		}

		/** Returns the minimum extent among the components in <code>front</code> that
		 * currently hold an interval, or {@link Integer#MAX_VALUE} if there is none.
		 */
		public int extent() {
			int e = Integer.MAX_VALUE;
			for ( int i = computeFront(), k; i-- != 0; ) {
				k = front[ i ];
				if ( curr[ k ] != null ) e = Math.min( e, intervalIterator[ k ].extent() );
			}

			return e;
		}
	}

	/** An optimised interval iterator with the same semantics as the generic
	 * interval iterator of {@link OrDocumentIterator}, but using just {@link IndexIterator#positionArray()}.
	 */
	protected class OrIndexIntervalIterator extends AbstractCompositeIndexIntervalIterator implements IntervalIterator {
		@SuppressWarnings({ "unused", "hiding" })
		private final static boolean DEBUG = false;
		private final static boolean ASSERTS = false;

		/** The index of this iterator. */
		final Index index;
		/** A heap-based semi-indirect priority queue used to keep track of the currently scanned positions. */
		private final IntHeapSemiIndirectPriorityQueue positionQueue;
		/** An array to hold the front of the position queue. */
		private final int[] positionFront;
		/** The last returned singleton interval, or {@link Integer#MIN_VALUE} after a {@link #reset()}. */
		private int last;

		/** Creates a new OR index interval iterator.
		 *
		 * @param index the index of the iterator.
		 */
		public OrIndexIntervalIterator( final Index index ) {
			super( n );
			this.index = index;
			positionQueue = new IntHeapSemiIndirectPriorityQueue( curr );
			positionFront = new int[ n ];
		}

		/** Rebuilds the position queue from the index iterators listed in <code>front</code>.
		 *
		 * <p>Only index iterators whose index is {@link #index} and has positions are enqueued,
		 * each starting from the first element of its position array.
		 */
		public void reset() throws IOException {
			last = Integer.MIN_VALUE;
			positionQueue.clear();

			for ( int i = computeFront(), k; i-- != 0; ) {
				k = front[ i ];
				if ( indexIterator[ k ].index() == index && indexIterator[ k ].index().hasPositions ) {
					position[ k ] = indexIterator[ k ].positionArray();
					count[ k ] = indexIterator[ k ].count();
					curr[ k ] = position[ k ][ currPos[ k ] = 0 ];
					positionQueue.enqueue( k );
				}
			}

			// NOTE(review): first() is invoked even if nothing was enqueued; the (disabled) assertion
			// suggests callers guarantee at least one matching component -- confirm.
			if ( ASSERTS ) assert ! positionQueue.isEmpty();
			next = Interval.valueOf( curr[ positionQueue.first() ] );
		}

		/** Adds to the given set the term number of every index iterator
		 * currently in the front of the position queue.
		 *
		 * @param terms the set that will receive the term numbers.
		 */
		public void intervalTerms( final LongSet terms ) {
			final int[] queueFront = this.positionFront;
			final int frontSize = positionQueue.front( queueFront );
			for ( int i = frontSize; i-- != 0; ) terms.add( indexIterator[ queueFront[ i ] ].termNumber() );
		}

		/** Advances the queue until its top position differs from {@link #last}.
		 *
		 * @return the index of the component holding the next position, or -1 if there is none.
		 */
		private int hasNextInternal() {
			if ( positionQueue.isEmpty() ) return -1;

			int first;

			while ( curr[ first = positionQueue.first() ] == last ) {
				if ( ++currPos[ first ] == count[ first ] ) {
					// This component has no more positions.
					positionQueue.dequeue();
					if ( positionQueue.isEmpty() ) return -1;
				}
				else {
					curr[ first ] = position[ first ][ currPos[ first ] ];
					positionQueue.changed();
				}
			}

			return first;
		}

		/** Returns whether there is a next (singleton) interval. */
		public boolean hasNext() {
			return hasNextInternal() != -1;
		}

		/** Returns the next singleton interval, or <code>null</code> if there are no more positions. */
		public Interval nextInterval() throws IOException {
			final int first = hasNextInternal();
			if ( first == -1 ) return null;
			return Interval.valueOf( last = curr[ first ] );
		}

		/** Returns 1, as this iterator returns singleton intervals only. */
		public int extent() {
			return 1;
		}
	}

	/** Disposes all component document iterators.
	 *
	 * <p>Every component is disposed even if some of them fail; the first
	 * {@link IOException} encountered, if any, is rethrown once all components
	 * have been processed (later I/O failures are discarded).
	 *
	 * @throws IOException if disposing a component iterator fails.
	 */
	public void dispose() throws IOException {
		IOException firstFailure = null;
		for ( final DocumentIterator d : documentIterator ) {
			try {
				d.dispose();
			}
			catch ( IOException e ) {
				// Keep going so that the remaining iterators are not leaked.
				if ( firstFailure == null ) firstFailure = e;
			}
		}
		if ( firstFailure != null ) throw firstFailure;
	}
}





© 2015 - 2025 Weber Informatics LLC | Privacy Policy