All downloads are free. The search and download features use the official Maven repository.

src.it.unimi.dsi.big.mg4j.search.AbstractUnionDocumentIterator Maven / Gradle / Ivy

Go to download

MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections written in Java. The big version is a fork of the original MG4J that can handle more than 2^31 terms and documents.

The newest version!
package it.unimi.dsi.big.mg4j.search;

/*		 
 * MG4J: Managing Gigabytes for Java (big)
 *
 * Copyright (C) 2003-2011 Paolo Boldi and Sebastiano Vigna 
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 3 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 */

import it.unimi.dsi.big.mg4j.index.Index;
import it.unimi.dsi.big.mg4j.search.visitor.DocumentIteratorVisitor;
import it.unimi.dsi.fastutil.IndirectPriorityQueue;
import it.unimi.dsi.fastutil.longs.LongHeapSemiIndirectPriorityQueue;
import it.unimi.dsi.fastutil.objects.Reference2ReferenceArrayMap;
import it.unimi.dsi.fastutil.objects.Reference2ReferenceMap;
import it.unimi.dsi.fastutil.objects.Reference2ReferenceMaps;

import java.io.IOException;
import java.util.Iterator;

/**  A document iterator on documents, generating the union of the documents returned
 * by a number of document iterators.
 * 
 * 

The pattern of this class is the same as that of {@link AbstractIntersectionDocumentIterator}. * Additionally, this class provides a mechanism that makes accessible the set of component * document iterators that are {@linkplain #computeFront() positioned on the current document}. */ public abstract class AbstractUnionDocumentIterator extends AbstractCompositeDocumentIterator { private final static boolean DEBUG = false; //private final static boolean ASSERTS = false; /** A heap-based semi-indirect priority queue used to keep track of the currently scanned integers. */ final protected LongHeapSemiIndirectPriorityQueue queue; /** The {@link IndirectPriorityQueue#front(int[])} of {@link #queue}, if {@link #frontSize} is not -1. */ final protected int[] front; /** The reference array used for the queue. */ final protected long[] refArray; /** A map from indices to interval iterators. */ final private Reference2ReferenceArrayMap intervalIterators; /** A map from indices to the iterators returned for the current document. The key set may * not contain an index because the related iterator has never been requested. Moreover, * the iterator in this map for a given index may differ from the one in {@link #intervalIterators} * because it could be {@link IntervalIterators#TRUE} (in fact, in that case it may even * happen that {@link #intervalIterators} does not contain the index). */ final private Reference2ReferenceArrayMap currentIterators; /** An unmodifiable wrapper around {@link #currentIterators}. */ final private Reference2ReferenceMap unmodifiableCurrentIterators; /** The number of valid entries in {@link #front}, or -1 if the front has not been computed for the current document. */ protected int frontSize = -1; /** Creates a new document iterator that computes the OR of the given array of iterators. * @param documentIterator the iterators to be joined. * @throws IOException */ protected AbstractUnionDocumentIterator( final DocumentIterator... 
documentIterator ) throws IOException { super( documentIterator ); this.refArray = new long[ n ]; queue = new LongHeapSemiIndirectPriorityQueue( refArray ); intervalIterators = new Reference2ReferenceArrayMap( indices.size() ); currentIterators = new Reference2ReferenceArrayMap( indices.size() ); unmodifiableCurrentIterators = Reference2ReferenceMaps.unmodifiable( currentIterators ); // Only add to the queue nonempty iterators... for ( int i = 0; i < n; i++ ) if ( ( refArray[ i ] = documentIterator[ i ].nextDocument() ) != -1 ) queue.enqueue( i ); // If queue is empty, the process is over if ( queue.isEmpty() ) curr = END_OF_LIST; front = new int[ queue.size() ]; } public long skipTo( final long n ) throws IOException { if ( curr >= n ) return curr; currentIterators.clear(); frontSize = -1; // Invalidate front int first; long res; while( refArray[ first = queue.first() ] < n ) { // Cannot advance the minimum if ( ( res = documentIterator[ first ].skipTo( n ) ) == END_OF_LIST ) { // Remove it queue.dequeue(); // If nothing else remains, we are done if ( queue.isEmpty() ) return curr = END_OF_LIST; } else { // Advance the top element, and signal this fact to the queue refArray[ first ] = res; queue.changed(); } } return curr = refArray[ first ]; } public long nextDocument() throws IOException { if ( curr == END_OF_LIST ) return -1; final long c = refArray[ queue.first() ]; // On the first call, the queue should not be advanced. 
if ( curr == -1 ) return curr = c; currentIterators.clear(); frontSize = -1; // Invalidate front // The least element int first; // Advance all elements equal to the least one while( refArray[ first = queue.first() ] == c ) { if ( ( refArray[ first ] = documentIterator[ first ].nextDocument() ) != - 1 ) queue.changed(); else { // Remove it queue.dequeue(); // If nothing else remains, we are done if ( queue.isEmpty() ) { curr = END_OF_LIST; return -1; } } } return curr = refArray[ first ]; } /** Forces computation of the current front, returning the number of indices it contains. * *

After a call to this method, * the first elements of {@link #front} contain * the indices of the {@linkplain AbstractCompositeDocumentIterator#documentIterator component document iterators} * that are positioned on the current document. If the front has already been * computed for the current document, this method has no side effects. * * @return the size of the current front (the number of valid entries in {@link #front}). */ protected int computeFront() { if ( frontSize == -1 ) frontSize = queue.front( front ); return frontSize; } public Reference2ReferenceMap intervalIterators() throws IOException { final Iterator i = indices.iterator(); while( i.hasNext() ) intervalIterator( i.next() ); return unmodifiableCurrentIterators; } public IntervalIterator intervalIterator( final Index index ) throws IOException { ensureOnADocument(); if ( DEBUG ) System.err.println( this + ".intervalIterator(" + index + ")" ); if ( ! indices.contains( index ) ) return IntervalIterators.FALSE; IntervalIterator intervalIterator; // If the iterator has been created and it's ready, we just return it. if ( ( intervalIterator = currentIterators.get( index ) ) != null ) return intervalIterator; int t = 0, f = 0, c = computeFront(); /* We count the number of TRUE and FALSE iterators. In the case of index iterators, we can avoid * the check and just rely on the index internals. * * If all iterators are FALSE, we return FALSE. Else if all remaining iterators are TRUE * we return TRUE. */ IntervalIterator soleIterator = null; if ( indexIterator == null ) for( int i = c; i -- != 0; ) { intervalIterator = documentIterator[ front[ i ] ].intervalIterator( index ); if ( intervalIterator == IntervalIterators.TRUE ) t++; else if ( intervalIterator == IntervalIterators.FALSE ) f++; else if ( soleIterator == null ) soleIterator = intervalIterator; } else for( int i = c; i -- != 0; ) { final Index indexIteratorIndex = indexIterator[ front[ i ] ].index(); if ( indexIteratorIndex != index ) f++; else if ( ! 
indexIteratorIndex.hasPositions ) t++; else if ( soleIterator == null ) soleIterator = indexIterator[ front[ i ] ].intervalIterator( index ); } if ( f == c ) intervalIterator = IntervalIterators.FALSE; else if ( f + t == c ) intervalIterator = IntervalIterators.TRUE; else if ( f + t < c - 1 ) { intervalIterator = intervalIterators.get( index ); if ( intervalIterator == null ) intervalIterators.put( index, intervalIterator = getComposedIntervalIterator( index ) ); intervalIterator.reset(); } else intervalIterator = soleIterator; currentIterators.put( index, intervalIterator ); return intervalIterator; } abstract protected IntervalIterator getComposedIntervalIterator( Index index ); /** Invokes {@link #acceptOnTruePaths(DocumentIteratorVisitor)} only on component * iterators positioned on the current document. * * @param visitor a visitor. * @return true if the visit should continue. * @throws IOException */ @Override public T acceptOnTruePaths( DocumentIteratorVisitor visitor ) throws IOException { if ( ! visitor.visitPre( this ) ) return null; final int s = computeFront(); final T[] a = visitor.newArray( s ); if ( a == null ) { for( int i = 0; i < s; i++ ) if ( documentIterator[ front[ i ] ].acceptOnTruePaths( visitor ) == null ) return null; } else { for( int i = 0; i < s; i++ ) if ( ( a[ i ] = documentIterator[ front[ i ] ].acceptOnTruePaths( visitor ) ) == null ) return null; } return visitor.visitPost( this, a ); } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy