All Downloads are FREE. Search and download functionalities are using the official Maven repository.

src.it.unimi.dsi.big.mg4j.index.MultiTermIndexIterator Maven / Gradle / Ivy

Go to download

MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections written in Java. The big version is a fork of the original MG4J that can handle more than 2^31 terms and documents.

The newest version!
package it.unimi.dsi.big.mg4j.index;

/*		 
 * MG4J: Managing Gigabytes for Java (big)
 *
 * Copyright (C) 2003-2011 Paolo Boldi and Sebastiano Vigna 
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 3 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 */

import it.unimi.dsi.big.mg4j.index.payload.Payload;
import it.unimi.dsi.big.mg4j.search.AbstractCompositeDocumentIterator;
import it.unimi.dsi.big.mg4j.search.AbstractUnionDocumentIterator;
import it.unimi.dsi.big.mg4j.search.DocumentIterator;
import it.unimi.dsi.big.mg4j.search.IntervalIterator;
import it.unimi.dsi.big.mg4j.search.OrDocumentIterator;
import it.unimi.dsi.big.mg4j.search.score.BM25Scorer;
import it.unimi.dsi.big.mg4j.search.score.Scorer;
import it.unimi.dsi.big.mg4j.search.visitor.DocumentIteratorVisitor;
import it.unimi.dsi.fastutil.ints.IntArrays;
import it.unimi.dsi.fastutil.ints.IntHeapSemiIndirectPriorityQueue;
import it.unimi.dsi.fastutil.ints.IntIterator;
import it.unimi.dsi.fastutil.ints.IntIterators;
import it.unimi.dsi.fastutil.longs.LongSet;
import it.unimi.dsi.fastutil.objects.ObjectHeapIndirectPriorityQueue;
import it.unimi.dsi.util.Interval;

import java.io.IOException;

/** A virtual {@linkplain IndexIterator index iterator} that merges several component index iterators.
*
* 

This class adds to {@link it.unimi.dsi.big.mg4j.search.AbstractUnionDocumentIterator} * an interval iterator generating the OR of the intervals returned for each of the documents involved. * The main difference with an {@link OrDocumentIterator} built on the same array of component iterators * is that this class implements {@link IndexIterator} and hence provides a {@link #count()} (the sum * of counts of those component iterators positioned on the current document) and a {@link #frequency()}. The * latter is by default the maximum frequency of a component iterator, but it can be set * at {@link MultiTermIndexIterator#getInstance(long, Index, IndexIterator[]) construction time}. * *

The main raison d'être of this class is support for query expansion: a blind application * of {@link OrDocumentIterator} to an array of index iterators would mislead {@linkplain Scorer scorers} such as {@link BM25Scorer} * because low-frequency terms (e.g., hapax legomena) would be responsible for most of the score. * *

Note that {@linkplain DocumentIteratorVisitor} has a {@linkplain DocumentIteratorVisitor#visit(IndexIterator) visit method for generic index iterator} * and a {@linkplain DocumentIteratorVisitor#visit(MultiTermIndexIterator) visit method for instances of this class}. * This approach provides additional flexibility—a scorer, for instance, might treat an instance of * this class as a standard {@link IndexIterator}, or it might choose to {@linkplain #front(IndexIterator[]) query which terms actually appear} * and do something more sophisticated (for instance, using {@linkplain DocumentIterator#weight() weights}). */ public class MultiTermIndexIterator extends AbstractUnionDocumentIterator implements IndexIterator { @SuppressWarnings("unused") private static final boolean ASSERTS = false; /** Value to be used for term frequency, or {@link Long#MIN_VALUE} to use the max; in any case, this attribute is used to cache * frequency after the first call to {@link #frequency()}. */ private long frequency; /** The term of this iterator. */ protected String term; /** The id of this iterator. */ protected int id; /** The count of the last returned document. */ private int count = -1; /** Whether all underlying index iterators have counts. */ private final boolean hasCounts; /** Whether all underlying index iterators have positions. */ private final boolean hasPositions; /** Returns an index iterator that merges the given array of iterators. * This method requires that at least one iterator is provided. The frequency is computed as a max, * and {@link #index()} will return the result of the same method on the first iterator. * * @param indexIterator the iterators to be joined (at least one). * @return a merged index iterator. * @throws IllegalArgumentException if no iterators were provided. */ public static IndexIterator getInstance( final IndexIterator... 
indexIterator ) throws IOException { return getInstance( Long.MIN_VALUE, indexIterator ); } /** Returns an index iterator that merges the given array of iterators. * *

Note that the special case of the empty and of the singleton arrays * are handled efficiently. The frequency is computed as a max, and * {@link #index()} will return index. * * @param index the index that wil be returned by {@link #index()}. * @param indexIterator the iterators to be joined. * @return a merged index iterator. */ public static IndexIterator getInstance( final Index index, final IndexIterator... indexIterator ) throws IOException { return getInstance( Long.MIN_VALUE, index, indexIterator ); } /** Returns an index iterator that merges the given array of iterators. * This method requires that at least one iterator is provided. * * @param defaultFrequency the default term frequency (or {@link Integer#MIN_VALUE} for the max). * @param indexIterator the iterators to be joined (at least one). * @return a merged index iterator. * @throws IllegalArgumentException if no iterators were provided, or they run on different indices. */ public static IndexIterator getInstance( final long defaultFrequency, final IndexIterator... indexIterator ) throws IOException { if ( indexIterator.length == 0 ) throw new IllegalArgumentException(); return getInstance( defaultFrequency, indexIterator[ 0 ].index(), indexIterator ); } /** Returns an index iterator that merges the given array of iterators. * *

Note that the special case of the empty and of the singleton arrays * are handled efficiently. * * @param defaultFrequency the default term frequency (or {@link Integer#MIN_VALUE} for the max). * @param index the index that wil be returned by {@link #index()}. * @param indexIterator the iterators to be joined. * @return a merged index iterator. * @throws IllegalArgumentException if there is some iterator on an index different from index. */ public static IndexIterator getInstance( final long defaultFrequency, final Index index, final IndexIterator... indexIterator ) throws IOException { if ( indexIterator.length == 0 ) return index.getEmptyIndexIterator(); if ( indexIterator.length == 1 ) return indexIterator[ 0 ]; return new MultiTermIndexIterator( defaultFrequency, indexIterator ); } /** Creates a new document iterator that merges the given array of iterators. * * @param defaultFrequency the default term frequency (or {@link Integer#MIN_VALUE} for the max). * @param indexIterator the iterators to be joined. */ @SuppressWarnings("cast") protected MultiTermIndexIterator( final long defaultFrequency, final IndexIterator... indexIterator ) throws IOException { super( (DocumentIterator[]) indexIterator ); this.frequency = defaultFrequency; boolean havePositions = true, haveCounts = true; for( IndexIterator i: indexIterator ) { if ( ! i.index().hasCounts ) haveCounts = false; if ( ! i.index().hasPositions ) havePositions = false; } hasCounts = haveCounts; hasPositions = havePositions; } protected IntervalIterator getComposedIntervalIterator( final Index index ) { return new MultiTermIntervalIterator(); } @Override public long skipTo( final long n ) throws IOException { if ( curr >= n ) return curr; // We invalidate count before calling the superclass method. count = -1; return super.skipTo( n ); } public long nextDocument() throws IOException { // We invalidate count before calling the superclass method. 
count = -1; return super.nextDocument(); } /** The count is the sum of counts of those component iterators positioned on the current document. * * @return the sum of counts. */ public int count() throws IOException { ensureOnADocument(); if ( ! hasCounts ) throw new IllegalStateException( "Some of the underlying iterators do not have counts" ); if ( count == -1 ) { int count = 0; for ( int i = computeFront(); i-- != 0; ) count += indexIterator[ front[ i ] ].count(); this.count = count; } return count; } /** Fills the given array with the index iterators composing the current front. * *

This method is essentially a safe exposure of the {@linkplain ObjectHeapIndirectPriorityQueue#front(int[]) front of the queue} * merging the component {@linkplain IndexIterator index iterators}. * After a call to {@link #nextDocument()}, you can use this method to know * which terms actually appear in the current document. You can use the public * field {@link AbstractCompositeDocumentIterator#n} to size the argument * array appropriately. * * @param indexIterator an array, at least large as the number of component index iterators, * that will be partially filled with the index iterators corresponding to terms appearing in the current document. * @return the number of iterators written into indexIterator. */ public int front( final IndexIterator[] indexIterator ) { final int s = computeFront(); for( int i = s; i-- != 0; ) indexIterator[ i ] = this.indexIterator[ front[ i ] ]; return s; } /** The frequency is either the default frequency set at construction time, or the maximum frequency of the component iterators. * * @return the frequency. */ public long frequency() throws IOException { if ( frequency != Long.MIN_VALUE ) return frequency; long frequency = Long.MIN_VALUE; for ( int i = n; i-- != 0; ) frequency = Math.max( frequency, indexIterator[ i ].frequency() ); return this.frequency = frequency; // caching it! } public IndexIterator term( final CharSequence term ) { this.term = term == null ? null : term.toString(); return this; } public String term() { return term; } public long termNumber() { // TODO: this is not particularly sensible return indexIterator[ 0 ].termNumber(); } public IndexIterator id( final int id ) { this.id = id; return this; } public int id() { return id; } public Index index() { return soleIndex; } /** This method is not implemented by this class. */ public Payload payload() { throw new UnsupportedOperationException(); } public int[] positionArray() throws IOException { if ( ! 
hasPositions ) throw new IllegalStateException( "Some of the underlying iterators do not have positions" ); // If the front contains a single element, we can just use its position array. if ( computeFront() == 1 ) return indexIterator[ front[ 0 ] ].positionArray(); final MultiTermIntervalIterator multiTermIntervalIterator = (MultiTermIntervalIterator)intervalIterator(); multiTermIntervalIterator.drain(); return multiTermIntervalIterator.cache; } public IntIterator positions() throws IOException { return IntIterators.wrap( positionArray(), 0, count ); } public int positions( int[] position ) throws IOException { int c = count; if ( position.length < c ) return -c; final int[] cache = positionArray(); for( int i = c; i-- != 0; ) position[ i ] = cache[ i ]; return c; } @Override public IndexIterator weight( final double weight ) { super.weight( weight ); return this; } @Override public T accept( DocumentIteratorVisitor visitor ) throws IOException { return visitor.visit( this ); } @Override public T acceptOnTruePaths( DocumentIteratorVisitor visitor ) throws IOException { return visitor.visit( this ); } public T acceptDeep( DocumentIteratorVisitor visitor ) throws IOException { return super.accept( visitor ); } public T acceptDeepOnTruePaths( DocumentIteratorVisitor visitor ) throws IOException { return super.accept( visitor ); } /** An optimised interval iterator with the same semantics as that implemented * by {@link OrDocumentIterator}, but not allowing duplicate positions. * *

This iterator provides an additional {@link #drain()} method that exhausts the * merge queue, leaving however the returned elements in the {@link #cache} array. Moreover, * the internal state of the iterator is modified so that it continues to behave normally, * returning however its results from {@link #cache}. In this way we can easily provide * efficient implementations for {@link IndexIterator#positions()}, {@link IndexIterator#positionArray()}, * and {@link IndexIterator#positions(int[])}. */ private class MultiTermIntervalIterator extends AbstractCompositeIndexIntervalIterator implements IntervalIterator { @SuppressWarnings({ "unused" }) private final static boolean DEBUG = false; @SuppressWarnings("hiding") private final static boolean ASSERTS = false; /** A heap-based indirect priority queue used to keep track of the currently scanned positions. */ private final IntHeapSemiIndirectPriorityQueue positionQueue; /** The cached results of this iterator. */ public int[] cache; /** The number of results emitted by this iterator since the last call to {@link #reset()}. */ private int emitted; /** The number of results extracted in {@link #cache} since the last call to {@link #reset()}. */ private int extracted; public MultiTermIntervalIterator() { super( n ); positionQueue = new IntHeapSemiIndirectPriorityQueue( curr ); cache = new int[ 4 ]; } public void reset() throws IOException { emitted = extracted = 0; next = null; positionQueue.clear(); for ( int i = computeFront(), k; i-- != 0; ) { k = front[ i ]; position[ k ] = indexIterator[ k ].positionArray(); count[ k ] = indexIterator[ k ].count(); curr[ k ] = position[ k ][ 0 ]; currPos[ k ] = 0; positionQueue.enqueue( k ); } if ( ASSERTS ) assert ! 
positionQueue.isEmpty(); } public void intervalTerms( final LongSet terms ) { // TODO: this is not particularly sensible terms.add( indexIterator[ 0 ].termNumber() ); } public Interval nextInterval() { if ( next != null ) { final Interval result = next; next = null; return result; } if ( emitted < extracted ) return Interval.valueOf( cache[ emitted++ ] ); if ( positionQueue.isEmpty() ) return null; final int first = positionQueue.first(); if ( extracted == cache.length ) cache = IntArrays.grow( cache, extracted + 1 ); cache[ extracted++ ] = curr[ first ]; if ( ++currPos[ first ] < count[ first ] ) { curr[ first ] = position[ first ][ currPos[ first ] ]; positionQueue.changed(); if ( curr[ positionQueue.first() ] == cache[ extracted - 1 ] ) throw new IllegalArgumentException( "Duplicate positions in " + this ); } else positionQueue.dequeue(); return Interval.valueOf( cache[ emitted++ ] ); } public int extent() { return 1; } /** Drains all elements from the queue, stores them in {@link #cache} and * restores {@link #emitted} so that the iterators continues to work transparently. */ public void drain() { final int emittedNow = emitted - ( next != null ? 1 : 0 ); next = null; while( nextInterval() != null ); emitted = emittedNow; } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy