All Downloads are FREE. Search and download functionalities are using the official Maven repository.

src.it.unimi.di.mg4j.index.remote.RemoteIndexReader Maven / Gradle / Ivy

Go to download

MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections written in Java.

There is a newer version: 5.2.2
Show newest version
package it.unimi.di.mg4j.index.remote;

/*		 
 * MG4J: Managing Gigabytes for Java
 *
 * Copyright (C) 2006-2012 Sebastiano Vigna 
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 3 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 */

import it.unimi.di.mg4j.index.AbstractIndexIterator;
import it.unimi.di.mg4j.index.AbstractIndexReader;
import it.unimi.di.mg4j.index.Index;
import it.unimi.di.mg4j.index.IndexIntervalIterator;
import it.unimi.di.mg4j.index.IndexIterator;
import it.unimi.di.mg4j.index.IndexReader;
import it.unimi.di.mg4j.index.payload.Payload;
import it.unimi.di.mg4j.search.IntervalIterator;
import it.unimi.di.mg4j.search.IntervalIterators;
import it.unimi.dsi.Util;
import it.unimi.dsi.fastutil.objects.Reference2ReferenceMap;
import it.unimi.dsi.fastutil.objects.Reference2ReferenceMaps;
import it.unimi.dsi.fastutil.objects.ReferenceSet;
import it.unimi.dsi.io.InputBitStream;
import it.unimi.dsi.io.OutputBitStream;
import it.unimi.dsi.lang.MutableString;

import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.Socket;

import org.apache.log4j.Logger;


/** An index reader for {@link it.unimi.di.mg4j.index.remote.RemoteIndex}.
 * 
 * @author Sebastiano Vigna
 * @author Alessandro Arrabito
 */

public class RemoteIndexReader extends AbstractIndexReader {
	final private static Logger LOGGER = Util.getLogger( RemoteIndexReader.class );
		
	private static final boolean ASSERTS = false; 

	private static final byte DOCUMENTS_BY_NAME = 0;
	private static final byte DOCUMENTS_BY_INDEX = 1;
	private static final byte PREFETCH = 2;
	private static final byte CLOSE = 3;
	private static final byte DISPOSE = 4;
	private static final byte SKIP_TO = 5;
			
	/** The index we refer to. */
	protected final Index index;
	/** The remote server connection used to call the index server.*/
	protected final RemoteIndexServerConnection connection;
	/** The index iterator associated with this reader.*/
	protected final RemoteIndexReaderIndexIterator remoteIndexIterator;
	/** The input stream in {@link #connection}, cached. */
	protected final DataInputStream inputStream;
	/** The output stream in {@link #connection}, cached. */
	protected final DataOutputStream outputStream;
	
	public RemoteIndexReader( final RemoteIndex index, final int bufferSize ) throws IOException {		
		this.index = index;
		connection = new RemoteIndexServerConnection( index.socketAddress, IndexServer.GET_INDEX_READER );
		inputStream = connection.inputStream;
		outputStream = connection.outputStream;
		remoteIndexIterator = new RemoteIndexReaderIndexIterator( bufferSize );
	}
	
	public void close() throws IOException, IllegalStateException {
        super.close();            
        remoteIndexIterator.flush();
		outputStream.writeByte( RemoteIndexReader.CLOSE );
		outputStream.flush();
		connection.close();
	}

	protected void finalize() throws Throwable {
		try {
			if ( ! connection.socket.isClosed() ) {
				LOGGER.warn( "This " + this.getClass().getName() + " [" + toString() + "] should have been closed." );
				close();
			}
		}
		finally {
			super.finalize();
		}
	}
	
	public IndexIterator documents( final int termNumber ) throws IOException {
		remoteIndexIterator.flush();
		outputStream.writeByte( RemoteIndexReader.DOCUMENTS_BY_INDEX );
		outputStream.writeInt( termNumber );
		outputStream.flush();
		remoteIndexIterator.term( null );
		// Read frequency
		remoteIndexIterator.reset( inputStream.readInt() );
		remoteIndexIterator.prefetchDocs( false );
		return remoteIndexIterator;
	}

	public IndexIterator documents( final CharSequence term ) throws IOException {
		remoteIndexIterator.flush();
		outputStream.writeByte( RemoteIndexReader.DOCUMENTS_BY_NAME );
		new MutableString( term ).writeSelfDelimUTF8( (OutputStream)outputStream );
		outputStream.flush();
		remoteIndexIterator.term( term );
		// Read frequency
		remoteIndexIterator.reset( inputStream.readInt() );
		remoteIndexIterator.prefetchDocs( false );
		return remoteIndexIterator;
	}


	/** An index iterator based on a remote index reader.
	 * 
	 * 

Each remote index reader creates exactly one instance of this class. The instance * is reused upon calls to {@link IndexReader#documents(int)}. * *

The internal state is unfortunately quite complicated by the necessity of grabbing data * from the socket as lazily as possible. * *

Basically, an instance of this class can be in one of three states: *

    *
  • If {@link #exhausted} is true, then there are no more items to be returned * and there is no more data coming from the socket. If you need to force * this state, you can call {@link #flush()}, which discards the remaining data * coming from the socket and set {@link #exhausted} (this is necessary, for instance, * each time you reuse the iterator). *
  • Otherwise, if {@link #last} is -1 the iterator is brand new, {@link #next} is -1, * too, and the socket input stream has been filled but never read. *
  • Otherwise, if {@link #next} is greater than or equal to zero, then {@link #next} * is the next document pointer to be returned, and it has been just read from the * socket input stream (for instance, if there are counts the socket input stream is * positioned just before the count). *
  • Finally, if {@link #next} is -1 then {@link #last} is the last document pointer * returned, and the socket input stream is positioned exactly before the next document * pointer to be returned, or over the end-of-block marker. *
*/ private class RemoteIndexReaderIndexIterator extends AbstractIndexIterator implements IndexIterator { /** The number of byte requested with a single request to an index server. */ private final int bufferSize; /** The next document pointer to be returned, or -1 if the iterator has to be advanced. */ private int next; /** The last document pointer returned. */ private int last; /** The frequency of the current term. */ private int frequency; /** Whether this iterator has been exhausted. */ private boolean exhausted; /** The current payload, or null. */ protected Payload payload; /** The current count. */ protected int count; /** The current positions. */ protected final int[] position; /** The index of the next position to be returned by {@link #nextPosition}. */ protected int currentPosition; public RemoteIndexReaderIndexIterator ( final int bufferSize ) { this.bufferSize = bufferSize; this.position = new int[ Math.max( 0, index.maxCount ) ]; this.exhausted = true; // To avoid flushing the first time } public Index index() { return keyIndex; } public void flush() throws IOException { if ( ! exhausted ) { while( inputStream.readInt() >= 0 ); inputStream.readBoolean(); if ( ASSERTS ) assert inputStream.available() == 0; exhausted = true; } } public void reset( final int frequency ) { this.frequency = frequency; exhausted = false; next = last = -1; } /** Prefetches a batch of document data from the server. * @param alreadyOnFirst will be passed as an argument to the remote call. */ public void prefetchDocs( final boolean alreadyOnFirst ) throws IOException { outputStream.writeByte( RemoteIndexReader.PREFETCH ); outputStream.writeBoolean( alreadyOnFirst ); outputStream.writeInt( bufferSize ); outputStream.flush(); } /** Tries to advance the remote iterator. * *

After a call to this method returning -1, the prefetched data is exhausted * and {@link #exhausted} is true. Otherwise, the input stream of the connection is * positioned just before counts and position of the returned document pointer. * * @return -1 if there are no more elements, the next pointer otherwise. The value returned * in stored in {@link #next}. */ private int advance() { if ( next >= 0 ) return next; try { next = inputStream.readInt(); if ( next < 0 ) { if ( inputStream.readBoolean() ) prefetchDocs( false ); else { exhausted = true; return -1; } next = inputStream.readInt(); if ( ASSERTS ) assert next >= 0; } return next; } catch ( Exception e ) { throw new RuntimeException( e ); } } public int document() { if ( last < 0 ) throw new IllegalStateException(); return last; } public int skipTo( final int p ) { try { if ( p <= last ) return last; if ( exhausted ) return END_OF_LIST; // First we check whether we can skip inside the local buffer. if ( next < 0 ) next = inputStream.readInt(); while( next >= 0 && next < p ) { if ( index.hasCounts ) { count = inputStream.readInt(); if ( index.hasPositions ) for ( int i = 0; i < count; i++ ) inputStream.readInt(); } next = inputStream.readInt(); } //System.err.println( "Out of loop: " + next ); if ( next >= 0 ) return nextDocument(); if ( exhausted = ! 
inputStream.readBoolean() ) return END_OF_LIST; if ( ASSERTS ) assert inputStream.available() == 0; int result; outputStream.writeByte( RemoteIndexReader.SKIP_TO ); outputStream.writeInt( p ); outputStream.flush(); result = inputStream.readInt(); //System.err.println( "Skip to " + p + " result: " + result ); if ( result == END_OF_LIST ) { exhausted = true; return END_OF_LIST; } prefetchDocs( true ); //System.err.println( "Prefetch completed" ); next = result; return nextDocument(); } catch ( Exception e ) { throw new RuntimeException( e ); } } public boolean mayHaveNext() { if ( exhausted ) return false; if ( next < 0 ) next = advance(); return next >= 0; } public int nextDocument() { if ( ! mayHaveNext() ) return END_OF_LIST; last = next; next = -1; try { // TODO: this is *very rough* and preliminary if ( index.hasPayloads ) payload.read( new InputBitStream( inputStream, 0 ) ); if ( index.hasCounts ) { count = inputStream.readInt(); if ( index.hasPositions ) { currentPosition = 0; for ( int i = 0; i < count; i++ ) position[ i ] = inputStream.readInt(); } } } catch ( IOException e ) { throw new RuntimeException( e ); } return last; } // TODO: implement skip efficiently (also in bit stream readers) @Override public void dispose() throws IOException { close(); } private final IndexIntervalIterator intervalIterator = index.hasPositions? 
new IndexIntervalIterator( this ) : null; private final Index keyIndex = RemoteIndexReader.this.index.keyIndex; private final Reference2ReferenceMap singletonIntervalIterator = Reference2ReferenceMaps.singleton( keyIndex, (IntervalIterator)intervalIterator ); @Override public int frequency() { return frequency; } @Override public Payload payload() { return payload; } @Override public int count() { return count; } @Override public int nextPosition() throws IOException { if ( currentPosition == count ) return END_OF_POSITIONS; return position[ currentPosition++ ]; } @Override public ReferenceSet indices() { return index.singletonSet; } @Override public IntervalIterator intervalIterator( final Index index ) { if ( ! index.hasPositions ) throw new UnsupportedOperationException( "Index " + index + " does not contain positions" ); return index == keyIndex ? intervalIterator : IntervalIterators.FALSE; } @Override public IntervalIterator intervalIterator() { return intervalIterator; } @Override public Reference2ReferenceMap intervalIterators() { return singletonIntervalIterator; } public int termNumber() { throw new UnsupportedOperationException(); } } public static class ServerThread extends it.unimi.di.mg4j.index.remote.ServerThread { @SuppressWarnings("hiding") private final static Logger LOGGER = Util.getLogger( ServerThread.class ); private final static boolean DEBUG = false; /** The index we refer to. */ private final Index index; /** The remoted index reader. */ private final IndexReader indexReader; /** The current index iterator . 
*/ private IndexIterator indexIterator; public ServerThread( final Socket socket, final Index index ) throws IOException { super( socket ); this.index = index; this.indexReader = index.getReader(); } public void run() { try { int command; for(;;) { try { command = inputStream.readByte(); } catch ( IOException e ) { LOGGER.warn( "Socket has been probably closed", e ); return; } if ( DEBUG ) LOGGER.debug( "Received remote command: " + command ); switch ( command ) { case RemoteIndexReader.CLOSE: indexReader.close(); // We don't close the socket--the caller should return; case RemoteIndexReader.DOCUMENTS_BY_INDEX: indexIterator = indexReader.documents( inputStream.readInt() ); outputStream.writeInt( indexIterator.frequency() ); outputStream.flush(); break; case RemoteIndexReader.DOCUMENTS_BY_NAME: indexIterator = indexReader.documents( new MutableString().readSelfDelimUTF8( (InputStream)inputStream ) ); outputStream.writeInt( indexIterator.frequency() ); outputStream.flush(); break; case RemoteIndexReader.SKIP_TO: outputStream.writeInt( indexIterator.skipTo( inputStream.readInt() ) ); outputStream.flush(); break; case RemoteIndexReader.PREFETCH: /* When alreadyOfFirst is true, the caller does not really want * to get the first document pointer, as it has it already * got somehow (e.g., by skipping). */ boolean alreadyOnFirst = inputStream.readBoolean(); int count, bufSize = inputStream.readInt(); for ( int i = 0; ( indexIterator.mayHaveNext() || alreadyOnFirst && i == 0 ) && bufSize > 0; i++ ) { if ( i > 0 || ! 
alreadyOnFirst ) { outputStream.writeInt( indexIterator.nextDocument() ); bufSize--; } if ( index.hasPayloads ) { // TODO: this is *very rough* & preliminary OutputBitStream obs = new OutputBitStream( outputStream ); indexIterator.payload().write( obs ); obs.flush(); } if ( index.hasCounts ) { outputStream.writeInt( count = indexIterator.count() ); bufSize--; if ( index.hasPositions ) { bufSize -= count; while( count-- != 0 ) outputStream.writeInt( indexIterator.nextPosition() ); } } } outputStream.writeInt( -1 ); // End marker outputStream.writeBoolean( indexIterator.mayHaveNext() ); // A peek farther. outputStream.flush(); //System.err.println( "Prefetch completed" ); break; case RemoteIndexReader.DISPOSE: indexIterator.dispose(); // We don't close the socket--the caller should return; default: LOGGER.error( "Unknown remote command: " + command ); } } } catch ( EOFException e ) { LOGGER.warn( "The socket has been closed" ); } catch ( Exception e ) { LOGGER.fatal( e, e ); } } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy