src.it.unimi.dsi.big.mg4j.document.TRECDocumentCollection Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of mg4j-big Show documentation
MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections written in Java. The big version is a fork of the original MG4J that can handle more than 2^31 terms and documents.
The newest version!
package it.unimi.dsi.big.mg4j.document;

/*		 
 * MG4J: Managing Gigabytes for Java (big)
 *
 * Copyright (C) 2006-2011 Sebastiano Vigna
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 3 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, see .
 *
 */

import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.io.FastBufferedInputStream;
import it.unimi.dsi.fastutil.objects.ObjectArrayList;
import it.unimi.dsi.fastutil.objects.ObjectArrays;
import it.unimi.dsi.fastutil.objects.ObjectBigArrayBigList;
import it.unimi.dsi.fastutil.objects.ObjectIterator;
import it.unimi.dsi.fastutil.objects.Reference2ObjectArrayMap;
import it.unimi.dsi.fastutil.objects.Reference2ObjectMap;
import it.unimi.dsi.io.SegmentedInputStream;
import it.unimi.dsi.logging.ProgressLogger;
import it.unimi.dsi.big.mg4j.document.PropertyBasedDocumentFactory.MetadataKeys;
import it.unimi.dsi.big.mg4j.util.MG4JClassParser;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.io.UnsupportedEncodingException;
import java.lang.reflect.InvocationTargetException;
import java.util.Arrays;
import java.util.zip.GZIPInputStream;

import org.apache.commons.io.IOUtils;
import org.apache.log4j.Logger;

import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.Switch;
import com.martiansoftware.jsap.UnflaggedOption;

/** A collection for the TREC GOV2 data set.
 * 
 * The documents are stored as a set of descriptors, representing the (possibly gzipped) file
 * they are contained in and the start and stop position in that file. To manage
 * descriptors later we rely on {@link SegmentedInputStream}.
 * 
 * 
To interpret a file, we read up to <DOC> and place a start
 * marker there, we advance to the header and store the URI. An intermediate
 * marker is placed at the end of the doc header tag and a stop marker just
 * before </DOC>.
 * 
 * 
The resulting {@link SegmentedInputStream} has two segments
 * per document. By using a {@link it.unimi.dsi.big.mg4j.document.CompositeDocumentFactory}, the
 * first segment is parsed by a {@link it.unimi.dsi.big.mg4j.document.TRECHeaderDocumentFactory},
 * whereas the second segment is parsed by a user-provided factory—usually,
 * an {@link it.unimi.dsi.big.mg4j.document.HtmlDocumentFactory}.
 * 
 * 
The collection provides both sequential access to all documents via the
 * iterator and random access to a given document. However, the two operations
 * are performed very differently as the sequential operation is much more
 * efficient than calling {@link #document(long)} repeatedly.
 * 
 * @author Alessio Orlandi
 * @author Luca Natali
 */
public class TRECDocumentCollection extends AbstractDocumentCollection implements Serializable {

	private static final Logger LOGGER = Logger.getLogger( TRECDocumentCollection.class );
	private static final long serialVersionUID = -4251461013312968454L;
	
	private static final boolean DEBUG = false;
	/** Default buffer size, set up after some experiments. */
	public static final String DEFAULT_BUFFER_SIZE = "64Ki";

	/** The list of the files containing the documents. */
	private String[] file;
	/** Whether the files in {@link #file} are gzipped. */
	private final boolean useGzip;
	/** The document factory. */
	protected DocumentFactory factory;
	/** The list of document descriptors.  We assume that descriptors within the same file are contiguous */
	protected transient ObjectBigArrayBigList descriptors;
	/** The buffer size. */
	private final int bufferSize;
	/** The last returned stream. */
	private SegmentedInputStream lastStream;

	/** A compact description of the location and of the internal segmentation of
	 * a TREC document inside a file. 
	 */
	
	private static class TRECDocumentDescriptor implements Cloneable {
		/** A reference to the file containing this document. */
		public int fileIndex;
		/** The starting position of this document in the file. */
		public long startMarker;
		/** The starting position of the content of this document in the file. */
		public int intermediateMarkerDiff;
		/** The ending position. */
		public int stopMarkerDiff;

		// TODO: this computation should be moved in the caller
		public TRECDocumentDescriptor(int findex, long start, long intermediateMarker, long stop) {
			this.fileIndex = findex;
			this.startMarker = start;
			this.intermediateMarkerDiff = (int) (intermediateMarker - start);
			this.stopMarkerDiff = (int) (stop - start);

		}

		public TRECDocumentDescriptor(int findex, long start,
				int intermediateMarkerDiff, int stopMarkerDiff) {
			this.fileIndex = findex;
			this.startMarker = start;
			this.intermediateMarkerDiff = intermediateMarkerDiff;
			this.stopMarkerDiff = stopMarkerDiff;
		}

		public final long[] toSegments() {
			return new long[] { startMarker, startMarker + intermediateMarkerDiff, stopMarkerDiff + startMarker };

		}

		public Object clone() {
			return new TRECDocumentDescriptor(this.fileIndex, this.startMarker,
					this.startMarker + this.intermediateMarkerDiff,
					this.stopMarkerDiff + this.startMarker);
		}

	}

	protected final static byte[] DOC_OPEN, DOC_CLOSE, DOCNO_OPEN, DOCNO_CLOSE, DOCHDR_OPEN, DOCHDR_CLOSE;
	
	static {
		try {
			DOC_OPEN = "".getBytes( "ASCII" );
			DOC_CLOSE = "".getBytes( "ASCII" );
			DOCNO_OPEN = "".getBytes( "ASCII" );
			DOCNO_CLOSE = "".getBytes( "ASCII" );
			DOCHDR_OPEN = "".getBytes( "ASCII" );
			DOCHDR_CLOSE = "".getBytes( "ASCII" );
		}
		catch ( UnsupportedEncodingException cantHappen ) {
			throw new RuntimeException( cantHappen );
		}
	}
	
	protected static boolean equals( byte[] a, int len, byte[] b ) {
		if ( len != b.length ) return false;
		while( len-- != 0 ) if ( a[ len ] != b[ len ] ) return false;
		return true;
	}

	byte buffer[] = new byte[ 8 * 1024 ];

	private void parseContent( int fileIndex, InputStream is ) throws IOException {
		long currStart, currStop, currInter, oldPos;
		boolean pastHeader = false, startedBlock = false;

		LOGGER.debug( "Processing file " + fileIndex + " (" + file[ fileIndex ] + ")" );

		FastBufferedInputStream fbis = new FastBufferedInputStream( is, bufferSize );

		currStart = 0; // make java compiler happy.
		currInter = 0;
		oldPos = 0;
		
		int l;
		
		while ( ( l = fbis.readLine( buffer ) ) != -1 ) {
			if ( l == buffer.length ) {
				// We filled the buffer, which means we have a very very long line. Let's skip it.
				while ( ( l = fbis.readLine( buffer ) ) == buffer.length );
			}
			else {
				if ( !startedBlock && equals( buffer, l, DOC_OPEN ) ) {
					currStart = oldPos;
					startedBlock = true; // Start of the current block (includes  marker)
				}
				else if ( startedBlock && equals( buffer, l, DOC_CLOSE ) ) {
					currStop = oldPos;
					if ( DEBUG ) LOGGER.debug( "Setting markers <" + currStart + "," + currInter + ", " + currStop + ">" );
					descriptors.add( new TRECDocumentDescriptor( fileIndex, currStart, currInter, currStop ) );
					startedBlock = pastHeader = false;
				}
				else if ( startedBlock && !pastHeader && equals( buffer, l, DOCHDR_CLOSE ) ) {
					currInter = fbis.position();
					pastHeader = true;
				}
				oldPos = fbis.position();
			}
		}

		fbis.close();
	}

	/**
	 * Copy constructor (that is, the one used by {@link #copy()}. Just
	 * initializes final fields
	 */
	protected TRECDocumentCollection( String[] file, DocumentFactory factory, ObjectBigArrayBigList descriptors, int bufferSize, boolean useGzip ) {
		this.useGzip = useGzip;
		this.file = file;
		this.bufferSize = bufferSize;
		this.factory = factory;
		this.descriptors = descriptors;
	}

	public TRECDocumentCollection copy() {
		return new TRECDocumentCollection( file, factory.copy(), descriptors, bufferSize, useGzip );
	}

	private final InputStream openFileStream( String fileName ) throws IOException {
		final InputStream s = new FileInputStream( fileName );
		if ( useGzip ) return new GZIPInputStream( s );
		else return s;
	}

	/** Creates a new TREC collection by parsing the given files.
	 * 
	 * @param file an array of file names containing documents in TREC GOV2 format.
	 * @param factory the document factory (usually, a composite one).
	 * @param bufferSize the buffer size.
	 * @param useGzip true iff the files are gzipped.
	 */
	public TRECDocumentCollection( String[] file, DocumentFactory factory, int bufferSize, boolean useGzip ) throws IOException {
		this.file = file;
		this.factory = factory;
		this.bufferSize = bufferSize;
		this.descriptors = new ObjectBigArrayBigList();
		this.useGzip = useGzip;

		final ProgressLogger progressLogger = new ProgressLogger( LOGGER );
		progressLogger.expectedUpdates = file.length;
		progressLogger.itemsName = "files";

		progressLogger.start( "Parsing " + ( useGzip ? "GZip" : "plain" ) + " files" );

		for ( int i = 0; i < file.length; i++ ) {
			parseContent( i, openFileStream( file[ i ] ) );
			progressLogger.update();
		}

		progressLogger.done();
	}

	public long size() {
		return descriptors.size64();
	}

	public Document document( long n ) throws IOException {
		Reference2ObjectMap,Object> metadata = metadata( n );
		return factory.getDocument( stream( n ), metadata );
	}

	public InputStream stream( final long n ) throws IOException {
		// Creates a Segmented Input Stream with only one segment in (the requested one).
		ensureDocumentIndex( n );
		IOUtils.closeQuietly( lastStream );
		final TRECDocumentDescriptor descr = descriptors.get( n );
		return lastStream = new SegmentedInputStream( openFileStream( file[ descr.fileIndex ] ), descr.toSegments() );
	}

	public Reference2ObjectMap,Object> metadata( final long index ) {
		ensureDocumentIndex( index );
		final Reference2ObjectArrayMap, Object> metadata = new Reference2ObjectArrayMap, Object>( 4 );

		metadata.put( MetadataKeys.URI, "Document #" + index );
		return metadata;
	}

	public DocumentFactory factory() {
		return this.factory;
	}

	public void close() throws IOException {
		super.close();
		if ( lastStream != null ) lastStream.close();
		descriptors = null;
	}

	/**
	 * Merges a new collection in this one, by rebuilding the gzFile array and
	 * appending the other object one, concatenating the descriptors while
	 * rebuilding all.
	 * 
	 * It is supposed that the passed object contains no duplicates for the
	 * local collection.
	 */
	public void merge( TRECDocumentCollection other ) {
		int oldLength = this.file.length;

		this.file = ObjectArrays.ensureCapacity( this.file, this.file.length + other.file.length );
		System.arraycopy( other.file, 0, this.file, oldLength, other.file.length );

		ObjectIterator iter = other.descriptors.iterator();
		while ( iter.hasNext() ) {
			final TRECDocumentDescriptor tdd = (TRECDocumentDescriptor)iter.next().clone();
			tdd.fileIndex += oldLength;
			this.descriptors.add( tdd );
		}
	}

	public DocumentIterator iterator() throws IOException {
		return new AbstractDocumentIterator() {
			/**
			 * An iterator returning the descriptors of the documents in the
			 * enveloping collection.
			 */
			private final ObjectIterator descriptorIterator = descriptors.iterator();
			/** The current stream. */
			private SegmentedInputStream siStream;
			/** The current document. */
			private int currentDocument = 0;
			/** The last returned document. */
			private Document last;
			/** The first descriptor of the next file, if any, or null if nextFile() has never been called. */
			private TRECDocumentDescriptor firstNextDescriptor;
			
			private boolean nextFile() throws FileNotFoundException, IOException {
				if ( size() == 0 ) return false;
				IOUtils.closeQuietly( siStream );
				if ( ! descriptorIterator.hasNext() ) return false;

				/*
				 * We assume documents contained in the same gzip file are
				 * contiguous so we collect all of them until we find a different
				 * file index.
				 */
				TRECDocumentDescriptor currentDescriptor = firstNextDescriptor != null ? firstNextDescriptor : descriptorIterator.next();
				int currentFileIndex = currentDescriptor.fileIndex;

				if ( DEBUG ) LOGGER.debug( "Skipping to contents file " + currentFileIndex + " (" + file[ currentFileIndex ] + ")" );

				/*
				 * We create the segmented input stream with all just collected
				 * descriptors
				 */
				siStream = new SegmentedInputStream( openFileStream( file[ currentFileIndex ] ) );

				do {
					siStream.addBlock( currentDescriptor.toSegments() );
					if ( ! descriptorIterator.hasNext() ) break;
					currentDescriptor = descriptorIterator.next();
				} while ( currentDescriptor.fileIndex == currentFileIndex );

				firstNextDescriptor = currentDescriptor; // The last assignment will be meaningless, but it won't be used anyway
				return true;
			}

			public Document nextDocument() throws IOException {
				/* If necessary, skip to the next segment, else, try skipping to the next gzip file. */
				if ( DEBUG ) LOGGER.debug( "nextDocument() has been called " );
				
				if ( last != null ) {
					last.close();
					if ( ! siStream.hasMoreBlocks() ) {
						if ( ! nextFile() ) return last = null;
					}
					else siStream.nextBlock();
				}
				else if ( ! nextFile() ) return null; // First call
				
				return last = factory.getDocument( siStream, metadata( currentDocument++ ) );
			}

			public void close() throws IOException {
				if ( siStream != null ) {
					if ( last != null ) last.close();
					super.close();
					siStream.close();
					siStream = null;
				}
			}
		};
	}

	private void readObject( final ObjectInputStream s ) throws IOException, ClassNotFoundException {
		s.defaultReadObject();

		final long size = s.readLong();
		final ObjectBigArrayBigList descriptors = new ObjectBigArrayBigList();
		descriptors.ensureCapacity( size );
		for ( int i = 0; i < size; i++ )
			descriptors.add( new TRECDocumentDescriptor( s.readInt(), s.readLong(), s.readInt(), s.readInt() ) );
		this.descriptors = descriptors;
	}

	private void writeObject(final ObjectOutputStream s) throws IOException {
		s.defaultWriteObject();
		s.writeLong(descriptors.size64());

		for (TRECDocumentDescriptor descriptor : descriptors) {
			s.writeInt(descriptor.fileIndex);
			s.writeLong(descriptor.startMarker);
			s.writeInt(descriptor.intermediateMarkerDiff);
			s.writeInt(descriptor.stopMarkerDiff);
		}
	}

	public static void main( final String[] arg ) throws IOException, JSAPException, InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException {

		SimpleJSAP jsap = new SimpleJSAP(
				TRECDocumentCollection.class.getName(), "Saves a serialised TREC document collection based on a set of file names (which will be sorted lexicographically).",
				new Parameter[] {
						new FlaggedOption( "factory", MG4JClassParser.getParser(), IdentityDocumentFactory.class.getName(), JSAP.NOT_REQUIRED, 'f', "factory", "A document factory with a standard constructor." ),
						new FlaggedOption( "property", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'p', "property", "A 'key=value' specification, or the name of a property file" ).setAllowMultipleDeclarations( true ),
						new Switch( "gzipped", 'z', "gzipped", "The files are gzipped." ),
						new Switch( "unsorted", 'u', "unsorted", "Keep the file list unsorted." ),
						new FlaggedOption( "bufferSize", JSAP.INTSIZE_PARSER, DEFAULT_BUFFER_SIZE, JSAP.NOT_REQUIRED, 'b', "buffer-size", "The size of an I/O buffer." ),
						new UnflaggedOption( "collection", JSAP.STRING_PARSER, JSAP.REQUIRED, "The filename for the serialised collection." ),
						new UnflaggedOption( "file", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, JSAP.GREEDY, "A list of files that will be indexed. If missing, a list of files will be read from standard input." ) 
		} );

		JSAPResult jsapResult = jsap.parse( arg );
		if ( jsap.messagePrinted() ) return;

		final DocumentFactory userFactory = PropertyBasedDocumentFactory.getInstance( jsapResult.getClass( "factory" ), jsapResult.getStringArray( "property" ) );

		String[] file = jsapResult.getStringArray( "file" );
		if ( file.length == 0 ) {
			final ObjectArrayList files = new ObjectArrayList();
			BufferedReader bufferedReader = new BufferedReader( new InputStreamReader( System.in ) );
			String s;
			while ( ( s = bufferedReader.readLine() ) != null ) files.add( s );
			file = files.toArray( new String[ 0 ] );
		}

		// To avoid problems with find and similar utilities, we sort the file names
		if ( !jsapResult.getBoolean( "unsorted" ) ) Arrays.sort( file );

		final DocumentFactory composite = CompositeDocumentFactory.getFactory( new TRECHeaderDocumentFactory(), userFactory );

		if ( file.length == 0 ) System.err.println( "WARNING: empty file set." );
		BinIO.storeObject( new TRECDocumentCollection( file, composite, jsapResult.getInt( "bufferSize" ), jsapResult.getBoolean( "gzipped" ) ), jsapResult.getString( "collection" ) );
	}
}