src.it.unimi.dsi.big.mg4j.document.InputStreamDocumentSequence Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of mg4j-big Show documentation
MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections written in Java. The big version is a fork of the original MG4J that can handle more than 2^31 terms and documents.
The newest version!
package it.unimi.dsi.big.mg4j.document;

/*		 
 * MG4J: Managing Gigabytes for Java (big)
 *
 * Copyright (C) 2005-2011 Sebastiano Vigna 
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 3 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, see .
 *
 */

import it.unimi.dsi.fastutil.io.FastBufferedInputStream;
import it.unimi.dsi.fastutil.objects.Reference2ObjectArrayMap;
import it.unimi.dsi.big.mg4j.document.PropertyBasedDocumentFactory.MetadataKeys;

import java.io.IOException;
import java.io.InputStream;

/** A document sequence obtained by breaking an input stream at a specified separator. 
 * 
 * This document sequences blindly passes to the indexer sequences of characters read
 * in a specified encoding and separated by a specified byte. 
 */

public class InputStreamDocumentSequence extends FastBufferedInputStream implements DocumentSequence {
	/** The byte separating documents in the input stream. */
	private final int separator;
	/** The factory used to return {@link Document}s. */
	private final DocumentFactory factory;
	/** If true, the last returned stream has been exhausted. This variable
	 * will be reset by a call to nextDocument(). */
	private boolean eof = true;
	/** The sequence will not return more than this number of documents. */
	private final int maxDocs;
	
	/** Creates a new document sequence based on a given input stream and separator; the
	 * sequence will not return more than the given number of documents.
	 * 
	 * @param inputStream the input stream containing all documents.
	 * @param separator the separator.
	 * @param factory the factory that will be used to create documents.
	 * @param maxDocs the maximum number of documents returned.
	 */
	
	public InputStreamDocumentSequence( final InputStream inputStream, final int separator, final DocumentFactory factory, final int maxDocs ) {
		super( inputStream );
		this.separator = separator;
		this.factory = factory;
		this.maxDocs = maxDocs;
	}
	
	/** Creates a new document sequence based on a given input stream and separator.
	 * 
	 * @param inputStream the input stream containing all documents.
	 * @param separator the separator.
	 * @param factory the factory that will be used to create documents.
	 */
	
	public InputStreamDocumentSequence( final InputStream inputStream, final int separator, final DocumentFactory factory ) {
		this( inputStream, separator, factory, Integer.MAX_VALUE );
	}

	public DocumentIterator iterator() {
		final Reference2ObjectArrayMap,Object> metadata = new Reference2ObjectArrayMap,Object>( 2 );
		
		return new DocumentIterator() {
			private int i;
			private Document last;
			
			public Document nextDocument() throws IOException {
				if ( last != null ) last.close();
				if ( i >= maxDocs ) return last = null;
				// If eof is not true, the caller did not exhaust the current document. We do it, however.
				if ( ! eof ) InputStreamDocumentSequence.this.close();
				eof = false;
				final String documentIndex = Integer.toString( i++ );
				metadata.put( MetadataKeys.TITLE, documentIndex );
				metadata.put( MetadataKeys.URI, documentIndex );

				return last = InputStreamDocumentSequence.this.noMoreBytes() ? null : factory.getDocument( InputStreamDocumentSequence.this, metadata );
			}

			public void close() {}
		};
	}

	public DocumentFactory factory() {
		return factory;
	}
	
	public boolean noMoreBytes() throws IOException {
		if ( avail > 0 ) return false;
		
    	avail = is.read( buffer );
    	if ( avail <= 0 ) {
    		// Ooops, there's nothing more to read. Let us set up eof and return.
    		avail = 0;
    		eof = true;
    		return true;
    	}
    	pos = 0;
    	return false;
	}
	
	public int read() throws IOException {
		if ( eof ) return -1;
		final int nextByte = super.read();

		if ( nextByte == separator || nextByte == -1 ) {
			eof = true;
			return -1;
		}
		
		return nextByte;
	}
	
	public int read( final byte[] b ) throws IOException {
		if ( eof ) return -1;
		return read( b, 0, b.length );
	}
	
    public int read( final byte[] b, int offset, int length ) throws IOException {
    	if ( eof ) return -1;
    	if ( length == 0 ) return 0;
    	
    	final int startOffset = offset;
    	int i, l;
    	
        for(;;) {
        	l = Math.min( length, avail );
       		// We scan the buffer for the separator, copying elements in the mean time. 
       		for( i = 0; i < l; i++ ) {
       			if ( buffer[ pos + i ] == separator ) break;
       			b[ offset + i ] = buffer[ pos + i ];
       		}

        	pos += i;
        	avail -= i;
        	
        	offset += i;
        	length -= i;

        	// If we were able to read enough characters, it's over.
        	if ( length == 0 ) return offset - startOffset;
        	// Otherwise, if we found a separator we return.
        	if ( i < l ) {
        		// This will set up eof.
        		read();
        		return offset - startOffset != 0 ? offset - startOffset : -1;
        	}
        	// Finally, in the last case (i == avail) we try to fill the buffer.
        	if ( noMoreBytes() ) return offset - startOffset != 0 ? offset - startOffset : -1;
        }
    }

	public void mark( final int readlimit ) {}

	public boolean markSupported() {
		return false;
	}
	
	// TODO: rewrite skip()
	public long skip( final long skip ) {
		throw new UnsupportedOperationException();
	}
	
	@Deprecated
	public void reset() {
		throw new UnsupportedOperationException();
	}
	
	public void flush() {
		throw new UnsupportedOperationException();
	}
	
	public void close() throws IOException {
		if ( ! eof ) while( read() != -1 );
		super.close();
	}

	public void filename( CharSequence filename ) throws IOException {
	}
}