All Downloads are FREE. Search and download functionalities are using the official Maven repository.

src.it.unimi.dsi.big.mg4j.search.AlignDocumentIterator Maven / Gradle / Ivy

Go to download

MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections written in Java. The big version is a fork of the original MG4J that can handle more than 2^31 terms and documents.

The newest version!
package it.unimi.dsi.big.mg4j.search;

/*		 
 * MG4J: Managing Gigabytes for Java (big)
 *
 * Copyright (C) 2008-2011 Sebastiano Vigna 
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 3 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 */

import it.unimi.dsi.big.mg4j.index.Index;
import it.unimi.dsi.big.mg4j.index.IndexIterator;
import it.unimi.dsi.big.mg4j.search.visitor.DocumentIteratorVisitor;
import it.unimi.dsi.fastutil.longs.LongSet;
import it.unimi.dsi.fastutil.objects.Reference2ReferenceArrayMap;
import it.unimi.dsi.fastutil.objects.Reference2ReferenceMap;
import it.unimi.dsi.fastutil.objects.Reference2ReferenceMaps;
import it.unimi.dsi.fastutil.objects.ReferenceSet;
import it.unimi.dsi.util.Interval;

import java.io.IOException;


/** A document iterator that aligns the results of two iterators over
 * different indices.
 *
 * <p>This class is an example of cross-index computation. As in the case of an
 * {@link AndDocumentIterator}, we intersect the posting lists. However, once
 * we get to the index level, we actually return just intervals that appear in
 * all component iterators. Of course, this is meaningful only if all
 * indices represent different views on the same data, a typical example being
 * semantic tagging.
 *
 * <p>An instance of this class exposes a single interval iterator associated with
 * the index of the first component iterator, as all interval iterators
 * are exhausted during the computation of their intersection.
 * Correspondingly, a call to {@link IntervalIterator#intervalTerms(LongSet)} just
 * returns the terms related to the first component iterator.
 */
public class AlignDocumentIterator extends AbstractDocumentIterator {
	private final static boolean DEBUG = false;

	/** The first operand, to be aligned. */
	final private DocumentIterator firstIterator;
	/** The second operand, to be used to align the first operand. */
	final private DocumentIterator secondIterator;
	/** {@link #firstIterator}, if both operands are {@linkplain IndexIterator index iterators}; <code>null</code> otherwise. */
	final private IndexIterator firstIndexIterator;
	/** {@link #secondIterator}, if both operands are {@linkplain IndexIterator index iterators}; <code>null</code> otherwise. */
	final private IndexIterator secondIndexIterator;
	/** The sole index involved in this iterator (the only index of {@link #firstIterator}). */
	final private Index index;
	/** A singleton map filled on demand by {@link #intervalIterators()}. */
	final private Reference2ReferenceMap<Index,IntervalIterator> currentIterators;
	/** An unmodifiable wrapper around {@link #currentIterators}. */
	final private Reference2ReferenceMap<Index,IntervalIterator> unmodifiableCurrentIterators;
	/** The (lazily created, reused) aligning interval iterator associated with this document iterator, or <code>null</code>. */
	private IntervalIterator intervalIterator;
	/** The iterator returned for the current document, if any, or <code>null</code>. */
	private IntervalIterator currentIterator;

	/** Returns a document iterator that aligns the first iterator to the second.
	 *
	 * @param firstIterator the iterator to be aligned.
	 * @param secondIterator the iterator used to align <code>firstIterator</code>.
	 *
	 * @return a document iterator that computes the alignment of <code>firstIterator</code> on <code>secondIterator</code>.
	 * @throws IllegalArgumentException if either iterator does not involve exactly one index.
	 */
	public static DocumentIterator getInstance( final DocumentIterator firstIterator, final DocumentIterator secondIterator ) {
		return new AlignDocumentIterator( firstIterator, secondIterator );
	}

	/** Creates a new aligning iterator.
	 *
	 * @param firstIterator the iterator to be aligned.
	 * @param secondIterator the iterator used to align <code>firstIterator</code>.
	 * @throws IllegalArgumentException if either iterator does not involve exactly one index.
	 */
	protected AlignDocumentIterator( final DocumentIterator firstIterator, final DocumentIterator secondIterator ) {
		this.firstIterator = firstIterator;
		this.secondIterator = secondIterator;
		// The position-array fast path is usable only when both operands are index iterators.
		if ( firstIterator instanceof IndexIterator && secondIterator instanceof IndexIterator ) {
			firstIndexIterator = (IndexIterator)firstIterator;
			secondIndexIterator = (IndexIterator)secondIterator;
		}
		else firstIndexIterator = secondIndexIterator = null;
		if ( firstIterator.indices().size() != 1 || secondIterator.indices().size() != 1 ) throw new IllegalArgumentException( "You can align single-index iterators only" );
		index = firstIterator.indices().iterator().next();
		currentIterators = new Reference2ReferenceArrayMap<Index,IntervalIterator>( 1 );
		unmodifiableCurrentIterators = Reference2ReferenceMaps.unmodifiable( currentIterators );
	}

	/** Returns the indices of the first component iterator. */
	public ReferenceSet<Index> indices() {
		return firstIterator.indices();
	}

	/** Advances to the next document on which both operands are positioned and
	 * for which the aligned interval iterator has at least one interval.
	 *
	 * @return the document pointer, or -1 if no such document exists.
	 */
	public long nextDocument() throws IOException {
		currentIterator = null;
		long first;
		if ( ( first = firstIterator.nextDocument() ) != -1 ) {
			long second = -1; // This forces a call to secondIterator.skipTo( first ).
			for( ;; ) {
				if ( first < second ) {
					if ( ( first = firstIterator.skipTo( second ) ) == END_OF_LIST ) break;
				}
				else if ( second < first ) {
					if ( ( second = secondIterator.skipTo( first ) ) == END_OF_LIST ) break;
				}
				else {
					// Both operands are on the same document: accept it only if the alignment is nonempty.
					curr = first;
					if ( intervalIterator().hasNext() ) return first;
					currentIterator = null;
					if ( ( first = firstIterator.nextDocument() ) == -1 ) break;
				}
			}
		}

		curr = END_OF_LIST;
		return -1;
	}

	/** Returns true if both component iterators may have further documents. */
	public boolean mayHaveNext() {
		return firstIterator.mayHaveNext() && secondIterator.mayHaveNext();
	}

	/** Skips to the first document with pointer greater than or equal to <code>n</code>
	 * on which both operands are positioned and for which the aligned interval
	 * iterator has at least one interval.
	 *
	 * @param n a document pointer.
	 * @return the current document if it is already greater than or equal to <code>n</code>;
	 * otherwise the document reached, or <code>END_OF_LIST</code> if no such document exists.
	 */
	public long skipTo( final long n ) throws IOException {
		if ( curr >= n ) return curr;

		currentIterator = null;
		long first;
		long second;
		if ( ( first = firstIterator.skipTo( n ) ) == END_OF_LIST ) return curr = END_OF_LIST;
		second = -1; // This forces a call to secondIterator.skipTo( first ).

		for( ;; ) {
			if ( first < second ) {
				if ( ( first = firstIterator.skipTo( second ) ) == END_OF_LIST ) return curr = END_OF_LIST;
			}
			else if ( second < first ) {
				if ( ( second = secondIterator.skipTo( first ) ) == END_OF_LIST ) return curr = END_OF_LIST;
			}
			else {
				// Both operands are on the same document: accept it only if the alignment is nonempty.
				curr = first;
				if ( intervalIterator().hasNext() ) return first;
				currentIterator = null;
				if ( ( first = firstIterator.nextDocument() ) == -1 ) return curr = END_OF_LIST;
			}
		}
	}

	/** Returns the interval iterator for the sole index of this iterator. */
	public IntervalIterator intervalIterator() throws IOException {
		return intervalIterator( index );
	}

	/** Returns an unmodifiable singleton map from the sole index of this iterator to its interval iterator. */
	public Reference2ReferenceMap<Index,IntervalIterator> intervalIterators() throws IOException {
		currentIterators.put( index, intervalIterator() );
		return unmodifiableCurrentIterators;
	}

	/** Returns the interval iterator of this document iterator for the given index.
	 *
	 * @param index an index.
	 * @return {@link IntervalIterators#FALSE} if <code>index</code> is not the index of this
	 * iterator; otherwise the (cached) interval iterator for the current document.
	 */
	public IntervalIterator intervalIterator( final Index index ) throws IOException {
		if ( DEBUG ) System.err.println( this + ".intervalIterator(" + index + ")" );
		ensureOnADocument();
		if ( index != this.index ) return IntervalIterators.FALSE;

		// If the iterator has been created and it's ready, we just return it.
		if ( currentIterator != null ) return currentIterator;

		final IntervalIterator firstIntervalIterator = firstIterator.intervalIterator(), secondIntervalIterator = secondIterator.intervalIterator();
		// The TRUE/FALSE placeholders align only in the TRUE/TRUE case.
		if ( secondIntervalIterator == IntervalIterators.FALSE ) return currentIterator = IntervalIterators.FALSE;
		if ( secondIntervalIterator == IntervalIterators.TRUE ) return currentIterator = firstIntervalIterator == IntervalIterators.TRUE ? IntervalIterators.TRUE : IntervalIterators.FALSE;
		if ( firstIntervalIterator == IntervalIterators.TRUE ) return currentIterator = IntervalIterators.FALSE;

		// The aligning iterator is created once and reset for each document.
		if ( intervalIterator == null ) intervalIterator = firstIndexIterator == null ? new AlignIntervalIterator() : new AlignIndexIntervalIterator();

		intervalIterator.reset();
		return currentIterator = intervalIterator;
	}

	/** Disposes both component iterators.
	 *
	 * <p>The second iterator is disposed even if disposing the first one throws an exception.
	 */
	public void dispose() throws IOException {
		try {
			firstIterator.dispose();
		}
		finally {
			secondIterator.dispose();
		}
	}

	/** Visits this iterator and both component iterators.
	 *
	 * @param visitor the visitor.
	 * @return the result of {@link DocumentIteratorVisitor#visitPost(DocumentIterator, Object[])}, or
	 * <code>null</code> if the visit was interrupted.
	 */
	public <T> T accept( final DocumentIteratorVisitor<T> visitor ) throws IOException {
		if ( ! visitor.visitPre( this ) ) return null;
		final T[] a = visitor.newArray( 2 );
		if ( a == null ) {
			// The visitor does not collect results: we just propagate interruptions.
			if ( firstIterator.accept( visitor ) == null ) return null;
			if ( secondIterator.accept( visitor ) == null ) return null;
		}
		else {
			if ( ( a[ 0 ] = firstIterator.accept( visitor ) ) == null ) return null;
			if ( ( a[ 1 ] = secondIterator.accept( visitor ) ) == null ) return null;
		}

		return visitor.visitPost( this, a );
	}

	/** Visits this iterator and the first component iterator only (the second
	 * operand is not visited by this method).
	 *
	 * @param visitor the visitor.
	 * @return the result of {@link DocumentIteratorVisitor#visitPost(DocumentIterator, Object[])}, or
	 * <code>null</code> if the visit was interrupted.
	 */
	public <T> T acceptOnTruePaths( final DocumentIteratorVisitor<T> visitor ) throws IOException {
		if ( ! visitor.visitPre( this ) ) return null;
		final T[] a = visitor.newArray( 1 );
		if ( a == null ) {
			if ( firstIterator.acceptOnTruePaths( visitor ) == null ) return null;
		}
		else {
			if ( ( a[ 0 ] = firstIterator.acceptOnTruePaths( visitor ) ) == null ) return null;
		}

		return visitor.visitPost( this, a );
	}

	/** An interval iterator returning the intersection of the component interval iterators. */
	private class AlignIntervalIterator extends AbstractIntervalIterator implements IntervalIterator {
		/** The interval iterator of the first iterator. */
		private IntervalIterator firstIntervalIterator;
		/** The interval iterator of the second iterator. */
		private IntervalIterator secondIntervalIterator;
		/** Whether the scan is over. */
		private boolean endOfProcess;

		public void reset() throws IOException {
			next = null;
			endOfProcess = false;
			firstIntervalIterator = firstIterator.intervalIterator();
			secondIntervalIterator = secondIterator.intervalIterator();
		}

		public void intervalTerms( final LongSet terms ) {
			firstIntervalIterator.intervalTerms( terms );
		}

		/** Returns the next interval returned by both component interval iterators, or <code>null</code> if there is none. */
		public Interval nextInterval() throws IOException {
			// Return the pending interval, if any.
			if ( next != null ) {
				final Interval result = next;
				next = null;
				return result;
			}

			if ( endOfProcess ) return null;

			Interval firstInterval = firstIntervalIterator.nextInterval();
			Interval secondInterval = secondIntervalIterator.nextInterval();

			if ( firstInterval == null || secondInterval == null ) {
				endOfProcess = true;
				return null;
			}

			// Advance the iterator that is behind (by left extreme) until the two intervals coincide.
			while ( ! firstInterval.equals( secondInterval ) ) {
				if ( firstInterval.left <= secondInterval.left ) {
					if ( ( firstInterval = firstIntervalIterator.nextInterval() ) == null ) {
						endOfProcess = true;
						return null;
					}
				}
				else {
					if ( ( secondInterval = secondIntervalIterator.nextInterval() ) == null ) {
						endOfProcess = true;
						return null;
					}
				}
			}

			return firstInterval;
		}

		public int extent() {
			return firstIntervalIterator.extent();
		}

		public String toString() {
			return getClass().getSimpleName() + "(" + firstIterator + ", " + secondIterator + ")";
		}
	}

	/** An interval iterator returning the intersection of the position lists of two index iterators. */
	private class AlignIndexIntervalIterator extends AbstractIntervalIterator implements IntervalIterator {
		/** Whether the scan is over. */
		private boolean endOfProcess;
		/** The positions of the first iterator. */
		private int[] firstPosition;
		/** The positions of the second iterator. */
		private int[] secondPosition;
		/** The count of the first iterator. */
		private int firstCount;
		/** The count of the second iterator. */
		private int secondCount;
		/** The index of the current position in {@link #firstPosition}. */
		private int firstCurr;
		/** The index of the current position in {@link #secondPosition}. */
		private int secondCurr;

		public void reset() throws IOException {
			next = null;
			endOfProcess = false;
			firstPosition = firstIndexIterator.positionArray();
			secondPosition = secondIndexIterator.positionArray();
			firstCount = firstIndexIterator.count();
			secondCount = secondIndexIterator.count();
			firstCurr = secondCurr = -1;
		}

		public void intervalTerms( final LongSet terms ) {
			terms.add( firstIndexIterator.termNumber() );
		}

		/** Returns the next position common to both position lists as a singleton interval, or <code>null</code> if there is none. */
		public Interval nextInterval() {
			// Return the pending interval, if any.
			if ( next != null ) {
				final Interval result = next;
				next = null;
				return result;
			}

			if ( endOfProcess ) return null;

			final int[] firstPosition = this.firstPosition, secondPosition = this.secondPosition;

			// Move both cursors past the last returned position (or onto the first one).
			if ( ++firstCurr == firstCount || ++secondCurr == secondCount ) {
				endOfProcess = true;
				return null;
			}

			// Advance the cursor pointing at the smaller position until the two positions coincide.
			while ( firstPosition[ firstCurr ] != secondPosition[ secondCurr ] ) {
				if ( firstPosition[ firstCurr ] < secondPosition[ secondCurr ] ) {
					if ( ++firstCurr == firstCount ) {
						endOfProcess = true;
						return null;
					}
				}
				else {
					if ( ++secondCurr == secondCount ) {
						endOfProcess = true;
						return null;
					}
				}
			}

			return Interval.valueOf( secondPosition[ secondCurr ] );
		}

		public int extent() {
			return 1;
		}

		public String toString() {
			return getClass().getSimpleName() + "(" + firstIterator + ", " + secondIterator + ")";
		}
	}

	public String toString() {
		return getClass().getSimpleName() + "(" + firstIterator + ", " + secondIterator + ")";
	}
}





© 2015 - 2025 Weber Informatics LLC | Privacy Policy