src.it.unimi.dsi.big.mg4j.document.SimpleCompressedDocumentCollection Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of mg4j-big Show documentation
MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections written in Java. The big version is a fork of the original MG4J that can handle more than 2^31 terms and documents.
The newest version!
package it.unimi.dsi.big.mg4j.document;

/*		 
 * MG4J: Managing Gigabytes for Java (big)
 *
 * Copyright (C) 2009-2011 Sebastiano Vigna 
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 3 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 */


import it.unimi.dsi.fastutil.ints.AbstractIntComparator;
import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.ints.IntArrays;
import it.unimi.dsi.fastutil.io.FastBufferedInputStream;
import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
import it.unimi.dsi.fastutil.longs.AbstractLongIterator;
import it.unimi.dsi.fastutil.objects.ObjectArrayList;
import it.unimi.dsi.fastutil.objects.Reference2ObjectMap;
import it.unimi.dsi.fastutil.objects.Reference2ObjectMaps;
import it.unimi.dsi.io.ByteBufferInputStream;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.InputBitStream;
import it.unimi.dsi.io.NullInputStream;
import it.unimi.dsi.io.OutputBitStream;
import it.unimi.dsi.io.WordReader;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.big.mg4j.document.DocumentFactory.FieldType;
import it.unimi.dsi.big.mg4j.tool.Scan.VirtualDocumentFragment;
import it.unimi.dsi.big.mg4j.util.parser.callback.AnchorExtractor;
import it.unimi.dsi.sux4j.util.EliasFanoMonotoneLongBigList;

import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.Serializable;
import java.lang.reflect.InvocationTargetException;
import java.nio.channels.FileChannel.MapMode;
import java.util.NoSuchElementException;
import java.util.zip.ZipFile;

import org.apache.commons.configuration.ConfigurationException;
import org.apache.commons.io.IOUtils;

import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.UnflaggedOption;

/** A basic, compressed document collection that can be easily built at indexing time.
 *
 * Instances of this class record virtual and non-text fields just like {@link ZipDocumentCollection}—that is,
 * in a zip file. However, text fields are recorded in a simple but highly efficient format. Terms (and nonterms) are numbered globally
 * in an increasing way as they are met. While we scan each document, we keep track of frequencies for a limited number of terms:
 * terms are encoded with their frequency rank if we know their statistics, or by a special code derived from their
 * global number if we have no statistics about them. Every number involved is written in delta code.
 * 
 * <p>A collection can be exact or approximated: in the latter case, nonwords will not be recorded, and will
 * be turned into spaces when decompressing.
 * 
 * <p>An instance of this collection will be, as any other collection, serialised on a file, but it will refer to several other files
 * that are derived from the instance basename. Please use {@link AbstractDocumentSequence#load(CharSequence)}
 * to load instances of this collection.
 * 
 * This class suffers the same scalability problem of {@link ZipDocumentCollection} if you compress non-text or virtual fields. Text
 * compression, on the other hand, is extremely efficient and scalable.
 *  
 * @author Sebastiano Vigna
 */

public class SimpleCompressedDocumentCollection extends AbstractDocumentCollection implements Serializable {
	private static final long serialVersionUID = 1L;

	private static final boolean DEBUG = false;
	protected static final boolean ASSERTS = false;

	/** Standard extension for the file containing encoded documents. */
	public static final String DOCUMENTS_EXTENSION = ".documents";
	/** Standard extension for the file containing document offsets stored as δ-encoded gaps. */
	public static final String DOCUMENT_OFFSETS_EXTENSION = ".docoffsets";
	/** Standard extension for the file containing terms in {@link MutableString#writeSelfDelimUTF8(java.io.OutputStream)} format. */
	public static final String TERMS_EXTENSION  = ".terms";
	/** Standard extension for the file containing term offsets stored as δ-encoded gaps. */
	public static final String TERM_OFFSETS_EXTENSION = ".termoffsets";
	/** Standard extension for the file containing nonterms in {@link MutableString#writeSelfDelimUTF8(java.io.OutputStream)} format. */
	public static final String NONTERMS_EXTENSION  = ".nonterms";
	/** Standard extension for the file containing nonterm offsets stored as δ-encoded gaps. */
	public static final String NONTERM_OFFSETS_EXTENSION = ".nontermoffsets";
	/** Standard extension for the stats file. */
	public static final String STATS_EXTENSION = ".stats";

	/** The basename of this collection. */
	private final String basename;
	/** Whether this collection is exact (i.e., whether it stores nonwords). */
	private final boolean exact;
	/** The number of documents in this collection. */
	private final long documents;
	/** The number of terms in this collection. */
	private final long terms;
	/** The number of nonterms in this collection, or -1 if {@link #exact} is false. */
	private final long nonTerms;
	/** The document offsets. */
	private transient EliasFanoMonotoneLongBigList docOffsets;
	/** The term offsets. */
	private transient EliasFanoMonotoneLongBigList termOffsets;
	/** The nonterm offsets, or null if {@link #exact} is false. */
	private transient EliasFanoMonotoneLongBigList nonTermOffsets;
	/** The input bit stream for documents. */
	private transient InputBitStream documentsInputBitStream;
	/** The input bit stream for terms. */
	private transient FastBufferedInputStream termsInputStream;
	/** The input bit stream for nonterms, or null if {@link #exact} is false. */
	private transient FastBufferedInputStream nonTermsInputStream;
	/** A frequency keeper used to decompress document terms. */
	private transient FrequencyCodec termsFrequencyKeeper;
	/** A frequency keeper used to decompress document nonterms, or null if {@link #exact} is false. */
	private transient FrequencyCodec nonTermsFrequencyKeeper;
	/** The underlying factory. */
	private final DocumentFactory factory;
	/** Whether this collection contains non-text or virtual fields. */
	private final boolean hasNonText;
	/** The zip file used to store non-text and virtual fields if {@link #hasNonText} is true, or  null if this collection does not store such fields. */
	private transient ZipFile zipFile;
	/** The input stream obtained by memory-mapping the file containing documents, or null. */
	private transient ByteBufferInputStream documentsByteBufferInputStream;
	/** The input stream obtained by memory-mapping the file containing terms, or null. */
	private transient ByteBufferInputStream termsByteBufferInputStream;
	/** The input stream obtained by memory-mapping the file containing nonterms, or null. */
	private transient ByteBufferInputStream nonTermsByteBufferInputStream;
	/** True if ancillary files have been all correctly opened. */
	private boolean fileOpenOk;
	/** True if all memory mappings have been obtained. */
	private boolean fileMappingOk;

	/** A lazy iterator over offsets stored as δ-encoded gaps: each returned value is the sum of all gaps read so far. */
	private static final class OffsetsLongIterator extends AbstractLongIterator {
		/** The bit stream the gaps are read from. */
		private final InputBitStream ibs;
		/** The overall number of values this iterator returns. */
		private final long numberOfItems;
		/** The number of values returned so far. */
		private long currIndex;
		/** The running sum of the gaps read so far. */
		private long currValue;

		/** Creates a new iterator.
		 *
		 * @param ibs the bit stream containing the δ-encoded gaps.
		 * @param numberOfItems the number of gaps to be read from <code>ibs</code>.
		 */
		public OffsetsLongIterator( InputBitStream ibs, long numberOfItems ) {
			this.numberOfItems = numberOfItems;
			this.ibs = ibs;
		}

		public boolean hasNext() {
			return numberOfItems > currIndex;
		}

		/** Reads the next gap and returns the updated running sum.
		 *
		 * @return the next offset.
		 * @throws NoSuchElementException if all items have already been returned.
		 * @throws RuntimeException wrapping any {@link IOException} thrown by the underlying bit stream.
		 */
		@Override
		public long nextLong() {
			if ( currIndex >= numberOfItems ) throw new NoSuchElementException();
			// The index is advanced before reading, so it moves on even if the read fails.
			currIndex++;
			final long gap;
			try {
				gap = ibs.readDelta();
			}
			catch ( IOException e ) {
				throw new RuntimeException( e );
			}
			currValue += gap;
			return currValue;
		}
	}

	/** A simple codec for integers that remaps frequent numbers to smaller numbers.
	 *
	 * <p>The codec keeps a bounded queue of recently seen symbols together with their
	 * frequencies; {@link #oldSymbol(int)} moves a symbol towards the front of the queue when its frequency
	 * increases. A symbol found in the queue is coded by its current position; a symbol not in the
	 * queue is coded by adding the current queue size to it. {@link #encode(int)} and {@link #decode(int)}
	 * perform the same queue updates, so an encoder and a decoder fed the same sequence stay in sync.
	 */
	protected static class FrequencyCodec {
		/** The size of the symbol queue. */
		private final static int MAX_QUEUE_SIZE = 2048;
		/** The symbol queue. */
		private final int[] queue;
		/** An array parallel to  {@link  #queue} containing frequencies. */
		private final int[] freq; 
		/** A map from input symbols to positions in {@link #queue}. */
		private final Int2IntOpenHashMap code2Pos;
		/** The current size of {@link #queue}. */
		private int queueSize;
		
		/** Creates a new codec with an empty queue; the map returns -1 for unknown symbols. */
		public FrequencyCodec() {
			code2Pos = new Int2IntOpenHashMap();
			code2Pos.defaultReturnValue( -1 );
			queue = new int[ MAX_QUEUE_SIZE ];
			freq = new int[ MAX_QUEUE_SIZE ];
		}
		
		/** Empties the queue and the symbol-to-position map. */
		public void reset() {
			queueSize = 0;
			code2Pos.clear();
		}
		
		/** Appends a symbol that is not currently tracked at the end of the queue with frequency one,
		 * evicting part of the tail of the queue first if the queue is full.
		 *
		 * @param symbol the symbol to be added.
		 */
		private final void newSymbol( final int symbol ) {
			if ( queueSize == MAX_QUEUE_SIZE ) {
				// Queue filled up. First, we guarantee that there are elements with frequency one.
				// NOTE(review): the loop starts from the last index, so the divisor freq[ MAX_QUEUE_SIZE - 1 ] becomes 1
				// at the first iteration and all remaining divisions are no-ops: only the last entry actually changes.
				// Encoder and decoder both run this exact code (and data already written presumably relies on it),
				// so this must not be "fixed" on one side only -- confirm before touching.
				if ( freq[ MAX_QUEUE_SIZE -1 ] != 1 ) for( int j = MAX_QUEUE_SIZE; j-- != 0; ) freq[ j ] /= freq[ MAX_QUEUE_SIZE - 1 ];
				// Then, we remove half of them.
				int j = MAX_QUEUE_SIZE;
				// After this loop j is the last position with frequency greater than one, or -1 if there is none.
				while( j-- != 0 ) if ( freq[ j ] > 1 ) break;
				// Forget the second half of the positions following j.
				for( int k = j + ( MAX_QUEUE_SIZE - j ) / 2; k < MAX_QUEUE_SIZE; k++ ) {
					if ( ASSERTS ) assert freq[ k ] == 1; 
					code2Pos.remove( queue[ k ] );
				}
				queueSize = j + ( MAX_QUEUE_SIZE - j ) / 2;
			}
			
			// Now we know that we have space.
			if ( ASSERTS ) assert queueSize < MAX_QUEUE_SIZE;
			code2Pos.put( symbol, queueSize );
			queue[ queueSize ] = symbol;
			freq[ queueSize ] = 1;
			queueSize++;
		}
		
		/** Increments the frequency of the symbol at a given queue position, swapping it with the
		 * first entry of the run of equal frequencies it belongs to.
		 *
		 * @param pos the current position in the queue of the symbol.
		 */
		private final void oldSymbol( final int pos ) {
			// Term already in list
			// Find term to exchange for change of frequency
			int ex = pos;
			// Walk left over the run of entries having the same frequency as the entry at pos.
			while( ex >= 0 && freq[ ex ] == freq[ pos ] ) ex--;
			++ex;
			freq[ pos ]++;
			// Exchange
			int t = queue[ pos ];
			queue[ pos ] = queue[ ex ];
			queue[ ex ] = t;
			t = freq[ pos ];
			freq[ pos ] = freq[ ex ];
			freq[ ex ] = t;
			// Both swapped symbols have a new position.
			code2Pos.put( queue[ ex ], ex );
			code2Pos.put( queue[ pos ], pos );
		}
		
		/** Encodes a symbol, returning a (hopefully smaller) symbol.
		 * 
		 * <p>A symbol in the queue is encoded by its position before the frequency update; any
		 * other symbol is encoded by itself plus the queue size before the symbol is added.
		 * 
		 * @param symbol the input symbol.
		 * @return the output symbol.
		 */
		public int encode( final int symbol ) {
			final int pos = code2Pos.get( symbol );
			if ( pos == -1 ) {
				// The result must be computed before newSymbol() changes the queue size.
				final int result = queueSize + symbol;
				newSymbol( symbol );
				return result;
			}
			else {
				if ( DEBUG ) System.err.println( "Symbol " + symbol + " in list; writing " + pos  + " " + code2Pos + " "  + IntArrayList.wrap( queue, queueSize ) + " " + IntArrayList.wrap( freq, queueSize ) );
				oldSymbol( pos );
				return pos;
			}
		}

		/** Decodes a symbol, returning the original symbol.
		 * 
		 * <p>A value smaller than the current queue size is a queue position; any other
		 * value is the original symbol plus the current queue size.
		 * 
		 * @param symbol a symbol an encoded file.
		 * @return the corresponding original input symbol.
		 */
		public int decode( final int symbol ) {
			
			if ( symbol < queueSize ) {
				// The symbol must be fetched before oldSymbol() possibly moves it.
				final int result = queue[ symbol ];
				oldSymbol( symbol );
				return result;
			}
			else {
				int term = symbol - queueSize;
				newSymbol( term );
				return term;
			}
		}
	}
	
	/** Creates a collection sharing already-loaded offset lists (used by {@link #copy()}).
	 *
	 * <p>The number of documents, terms and nonterms is derived from the size of the respective
	 * offset lists, each of which contains one more element than the number of items. The collection
	 * is exact if and only if <code>nonTermOffsets</code> is not <code>null</code>.
	 *
	 * @param basename the basename of the collection.
	 * @param factory the underlying factory.
	 * @param docOffsets the document offsets.
	 * @param termOffsets the term offsets.
	 * @param nonTermOffsets the nonterm offsets, or <code>null</code> for an approximated collection.
	 * @param documentsByteBufferInputStream the memory-mapped documents, or <code>null</code>.
	 * @param termsByteBufferInputStream the memory-mapped terms, or <code>null</code>.
	 * @param nonTermsByteBufferInputStream the memory-mapped nonterms, or <code>null</code>.
	 */
	private SimpleCompressedDocumentCollection( String basename, DocumentFactory factory,
			EliasFanoMonotoneLongBigList docOffsets, EliasFanoMonotoneLongBigList termOffsets, EliasFanoMonotoneLongBigList nonTermOffsets, ByteBufferInputStream documentsByteBufferInputStream,
			ByteBufferInputStream termsByteBufferInputStream, ByteBufferInputStream nonTermsByteBufferInputStream ) {
		this.basename = basename;
		this.documents = docOffsets.size64() - 1;
		this.terms = termOffsets.size64() - 1;
		this.exact = nonTermOffsets != null;
		// The number of nonterms must come from the nonterm offsets (it used to be taken from the term offsets).
		this.nonTerms = exact ? nonTermOffsets.size64() - 1 : -1;
		this.docOffsets = docOffsets;
		this.termOffsets = termOffsets;
		this.nonTermOffsets = nonTermOffsets;
		this.factory = factory;
		this.termsFrequencyKeeper = new FrequencyCodec();
		this.nonTermsFrequencyKeeper = exact ? new FrequencyCodec() : null;
		this.documentsByteBufferInputStream = documentsByteBufferInputStream;
		this.termsByteBufferInputStream = termsByteBufferInputStream;
		this.nonTermsByteBufferInputStream = nonTermsByteBufferInputStream;
		this.hasNonText = hasNonText( factory );

	}

	/** Creates a collection that only carries descriptive data: no offsets, streams, codecs or zip file are attached.
	 *
	 * @param basename the basename of the collection.
	 * @param documents the number of documents.
	 * @param terms the number of terms.
	 * @param nonTerms the number of nonterms.
	 * @param exact whether the collection is exact.
	 * @param factory the underlying factory.
	 */
	protected SimpleCompressedDocumentCollection( final String basename, final long documents, final long terms, final long nonTerms, final boolean exact, final DocumentFactory factory ) {
		// Descriptive state.
		this.basename = basename;
		this.factory = factory;
		this.hasNonText = hasNonText( factory );
		this.exact = exact;
		this.documents = documents;
		this.terms = terms;
		this.nonTerms = nonTerms;

		// Nothing is opened or loaded by this constructor.
		this.docOffsets = null;
		this.termOffsets = null;
		this.nonTermOffsets = null;
		this.documentsInputBitStream = null;
		this.termsInputStream = null;
		this.nonTermsInputStream = null;
		this.zipFile = null;
		this.termsFrequencyKeeper = null;
		this.nonTermsFrequencyKeeper = null;

		// NOTE(review): presumably this marks the instance as closed for the superclass, as it holds no resources -- confirm.
		try {
			super.close();
		}
		catch ( IOException cantHappen ) {
			throw new RuntimeException( cantHappen );
		}
	}

	/** Checks whether a factory declares at least one field whose type is not {@link FieldType#TEXT}.
	 *
	 * @param factory a document factory.
	 * @return true if some field of <code>factory</code> is not a text field.
	 */
	private static boolean hasNonText( final DocumentFactory factory ) {
		boolean found = false;
		int field = factory.numberOfFields();
		// All fields are examined, from last to first.
		while ( field != 0 ) {
			field--;
			if ( factory.fieldType( field ) != FieldType.TEXT ) found = true;
		}
		return found;
	}

	/** Memory-maps the term file and, for exact collections, the nonterm file.
	 *
	 * <p>On success {@link #fileMappingOk} is set. Whether nonterms are mapped is decided by {@link #exact}
	 * (the same condition used when opening the nonterm stream), so the result does not depend on whether
	 * the nonterm offsets have already been loaded.
	 *
	 * @param basename the basename from which the names of the files to be mapped are derived.
	 * @param rethrow if true, I/O errors are propagated; otherwise they are ignored, so that
	 * a later call to {@link #filename(CharSequence)} can retry with the right location.
	 * @throws IOException if a file cannot be mapped and <code>rethrow</code> is true.
	 */
	private void initMappings( final String basename, final boolean rethrow ) throws IOException {
		try {
			// TODO: This is too risky: we will have to make it optional at some point
			// documentsByteBufferInputStream = ByteBufferInputStream.map( new FileInputStream( basename + DOCUMENTS_EXTENSION ).getChannel(), MapMode.READ_ONLY );
			termsByteBufferInputStream = mapReadOnly( basename + TERMS_EXTENSION );
			nonTermsByteBufferInputStream = exact ? mapReadOnly( basename + NONTERMS_EXTENSION ) : null;
			fileMappingOk = true;
		}
		catch( IOException e ) {
			// We leave the possibility for a filename() to fix the problem and map the files.
			if ( rethrow ) throw e;
		}
	}

	/** Maps a file read-only and closes the stream used to obtain the mapping.
	 *
	 * @param filename the name of the file to be mapped.
	 * @return an input stream over the mapped content of the file.
	 * @throws IOException if the file cannot be opened or mapped.
	 */
	private static ByteBufferInputStream mapReadOnly( final String filename ) throws IOException {
		final FileInputStream fis = new FileInputStream( filename );
		try {
			return ByteBufferInputStream.map( fis.getChannel(), MapMode.READ_ONLY );
		}
		finally {
			// A mapping, once established, does not depend on the channel used to create it, so we can release the descriptor.
			fis.close();
		}
	}
	
	/** Loads the document, term and (if {@link #nonTerms} is nonnegative) nonterm offsets.
	 *
	 * <p>The upper bound passed for each list is the length of the corresponding data file plus one;
	 * for documents the length is expressed in bits, for terms and nonterms in bytes.
	 *
	 * @param basename the basename from which the names of the files are derived.
	 * @param rethrow if true, I/O errors are propagated; otherwise they are ignored.
	 * @throws IOException if some offsets cannot be loaded and <code>rethrow</code> is true.
	 */
	private void loadOffsets( final String basename, final boolean rethrow ) throws IOException {
		try {
			final long documentsLengthInBits = new File( basename + DOCUMENTS_EXTENSION ).length() * Byte.SIZE;
			docOffsets = loadOffsetsSuccinctly( basename + DOCUMENT_OFFSETS_EXTENSION, documents, documentsLengthInBits + 1 );

			final long termsLengthInBytes = new File( basename + TERMS_EXTENSION ).length();
			termOffsets = loadOffsetsSuccinctly( basename + TERM_OFFSETS_EXTENSION, terms, termsLengthInBytes + 1 );

			if ( nonTerms < 0 ) nonTermOffsets = null;
			else {
				final long nonTermsLengthInBytes = new File( basename + NONTERMS_EXTENSION ).length();
				nonTermOffsets = loadOffsetsSuccinctly( basename + NONTERM_OFFSETS_EXTENSION, nonTerms, nonTermsLengthInBytes + 1 );
			}
		}
		catch( IOException e ) {
			// When not rethrowing, a later filename() call gets the chance to load the right files.
			if ( ! rethrow ) return;
			throw e;
		}
	}
			
	/** Opens the streams over documents, terms and (for exact collections) nonterms, and the zip file if needed.
	 *
	 * <p>Each stream is built over the corresponding memory mapping, if available, or over the file
	 * otherwise. On success {@link #fileOpenOk} is set.
	 *
	 * @param basename the basename from which the names of the files are derived.
	 * @param rethrow if true, I/O errors are propagated; otherwise they are ignored.
	 * @throws IOException if some file cannot be opened and <code>rethrow</code> is true.
	 */
	private void initFiles(  final String basename, final boolean rethrow ) throws IOException {
		try {
			if ( documentsByteBufferInputStream == null ) documentsInputBitStream = new InputBitStream( basename + DOCUMENTS_EXTENSION );
			else documentsInputBitStream = new InputBitStream( documentsByteBufferInputStream );

			final InputStream termsSource = termsByteBufferInputStream == null ? new FileInputStream( basename + TERMS_EXTENSION ) : termsByteBufferInputStream;
			termsInputStream = new FastBufferedInputStream( termsSource );

			if ( exact ) {
				final InputStream nonTermsSource = nonTermsByteBufferInputStream == null ? new FileInputStream( basename + NONTERMS_EXTENSION ) : nonTermsByteBufferInputStream;
				nonTermsInputStream = new FastBufferedInputStream( nonTermsSource );
			}
			else nonTermsInputStream = null;

			if ( hasNonText ) zipFile = new ZipFile(  basename + ZipDocumentCollection.ZIP_EXTENSION );
			else zipFile = null;

			fileOpenOk = true;
		}
		catch( IOException e ) {
			// When not rethrowing, a later filename() call gets the chance to open the right files.
			if ( ! rethrow ) return;
			throw e;
		}
	}
	
	/** Checks that the ancillary files have been opened.
	 *
	 * @throws IllegalStateException if {@link #fileOpenOk} is false.
	 */
	private void ensureFiles() {
		if ( fileOpenOk ) return;
		final String collectionClassName = SimpleCompressedDocumentCollection.class.getSimpleName();
		final String loaderClassName = AbstractDocumentSequence.class.getName();
		throw new IllegalStateException( "Some of the files used by this " + collectionClassName + " have not been loaded correctly; please use " + loaderClassName + ".load() or call filename() after deserialising this instance" );
	}
	
	/** Loads a list of offsets stored as δ-encoded gaps into an Elias–Fano list.
	 *
	 * <p>The file is expected to contain <code>numberOfItems</code>&nbsp;+&nbsp;1 gaps.
	 * The bit stream is closed in all cases, including when building the list fails.
	 *
	 * @param filename the name of the file containing the gaps.
	 * @param numberOfItems the number of items the offsets refer to.
	 * @param upperBound the upper bound passed to the Elias–Fano list.
	 * @return the list of offsets.
	 * @throws IOException if the file cannot be opened or closed.
	 */
	private static EliasFanoMonotoneLongBigList loadOffsetsSuccinctly( final CharSequence filename, final long numberOfItems, final long upperBound ) throws IOException {
		final InputBitStream ibs = new InputBitStream( filename.toString() );
		try {
			return new EliasFanoMonotoneLongBigList( numberOfItems + 1, upperBound, new OffsetsLongIterator( ibs, numberOfItems + 1 ) );
		}
		finally {
			// The iterator wraps read errors in runtime exceptions: we must not leak the stream in that case.
			ibs.close();
		}
	}
		
	/** Retries mapping and opening the ancillary files, looking for them in the directory of the given file.
	 *
	 * <p>The location used is the parent directory of <code>filename</code> combined with the
	 * basename of this collection. Nothing happens if mappings and files are already in place;
	 * otherwise, this time errors are propagated.
	 *
	 * @param filename the name of the file this instance was loaded from.
	 * @throws IOException if some file cannot be mapped, loaded or opened.
	 */
	@Override
	public void filename( CharSequence filename ) throws IOException {
		if ( fileMappingOk && fileOpenOk ) return;

		final File directory = new File( filename.toString() ).getParentFile();
		final String resolvedBasename = new File( directory, basename ).toString();

		if ( ! fileMappingOk ) initMappings( resolvedBasename, true );
		if ( fileOpenOk ) return;
		loadOffsets( resolvedBasename, true );
		initFiles( resolvedBasename, true );
	}
	
	/** Returns a copy of this collection with its own factory copy, streams and codecs.
	 *
	 * <p>The copy receives copies of the memory-mapped streams (where present), and then
	 * loads offsets and opens files again using the basename of this collection.
	 *
	 * @return a copy of this collection.
	 * @throws IllegalStateException if the ancillary files of this collection have not been opened.
	 * @throws RuntimeException wrapping any {@link IOException} thrown while setting up the copy.
	 */
	public DocumentCollection copy() {
		ensureFiles();
		try {
			final DocumentFactory factoryCopy = factory.copy();
			final ByteBufferInputStream documentsMapping = documentsByteBufferInputStream == null ? null : documentsByteBufferInputStream.copy();
			final ByteBufferInputStream termsMapping = termsByteBufferInputStream == null ? null : termsByteBufferInputStream.copy();
			final ByteBufferInputStream nonTermsMapping = nonTermsByteBufferInputStream == null ? null : nonTermsByteBufferInputStream.copy();

			final SimpleCompressedDocumentCollection result = new SimpleCompressedDocumentCollection( basename, factoryCopy,
					docOffsets, termOffsets, nonTermOffsets, documentsMapping, termsMapping, nonTermsMapping );
			result.loadOffsets( basename, true );
			result.initFiles( basename, true );
			return result;
		}
		catch ( IOException e ) {
			throw new RuntimeException( e );
		}
	}

	/** Reads a string stored as a δ-coded length followed by that many ζ-coded (with parameter 7) characters.
	 *
	 * @param ibs the bit stream to read from.
	 * @param s a mutable string that will be emptied and filled with the characters read.
	 * @return <code>s</code>.
	 * @throws IOException if the underlying stream does.
	 */
	private static MutableString readSelfDelimitedUtf8String( final InputBitStream ibs, final MutableString s ) throws IOException {
		s.length( 0 );
		int remaining = ibs.readDelta();
		while ( remaining != 0 ) {
			remaining--;
			final char c = (char)ibs.readZeta( 7 );
			s.append( c );
		}
		return s;
	}
	
	
	/** Returns the document of given index.
	 *
	 * <p>URI and title are read immediately; field content is decoded lazily by
	 * {@link Document#content(int)}, which reads from the document stream of this collection
	 * (and from the zip entry of the document for non-text fields). Fields must be
	 * requested in increasing order.
	 *
	 * @param index the index of a document.
	 * @return the document of given index.
	 * @throws IOException if the underlying streams do.
	 * @throws IllegalStateException if the ancillary files of this collection have not been opened.
	 */
	public Document document( long index ) throws IOException {
		ensureDocumentIndex( index );
		ensureFiles();
		documentsInputBitStream.position( docOffsets.getLong( index ) );
		// Non-text and virtual fields of a document live in the zip entry named after the document index.
		final DataInputStream nonTextDataInputStream = hasNonText ? new DataInputStream( new FastBufferedInputStream( zipFile.getInputStream( zipFile.getEntry( Long.toString( index ) ) ) ) ) : null;
		final MutableString uri = readSelfDelimitedUtf8String( documentsInputBitStream, new MutableString() );
		final MutableString title = readSelfDelimitedUtf8String( documentsInputBitStream, new MutableString() );

		return new AbstractDocument() {
			/** The buffer in which the content of text fields is rebuilt. */
			final MutableString fieldContent = new MutableString();
			
			/** A document obtained from the factory on an empty stream, used only to get word readers. */
			@SuppressWarnings("unchecked")
			final Document fakeDocument = factory.getDocument( NullInputStream.getInstance(), Reference2ObjectMaps.EMPTY_MAP );
			
			/** The index of the next field that can be read. */
			int nextField = 0;

			/** Returns the content of a field, skipping any field in between.
			 *
			 * @param field the index of a field, not smaller than that of the next unread field.
			 * @return a reader for text fields, a list of fragments for virtual fields, and the deserialised object otherwise.
			 * @throws IllegalStateException if <code>field</code> has already been read or skipped.
			 */
			public Object content( int field ) throws IOException {
				final FieldType fieldType = factory.fieldType( field );

				if ( nextField > field ) throw new IllegalStateException();
				// Skip fields
				final MutableString s = new MutableString();
				int len;
				while( nextField < field ) {
					// How to skip depends on the type of the field being skipped, not on the type of the requested field.
					switch( factory.fieldType( nextField ) ) {
					case TEXT:
						len = documentsInputBitStream.readDelta();
						// Exact collections store a nonterm after each term.
						if ( exact ) len *= 2;
						documentsInputBitStream.skipDeltas( len );
						break;
					case VIRTUAL:
						final int nfrag = nonTextDataInputStream.readInt();
						// Each fragment is a pair of strings.
						for ( int i = 0; i < 2 * nfrag; i++ ) MutableString.skipSelfDelimUTF8( nonTextDataInputStream );
						break;
					default:
						try { new ObjectInputStream( nonTextDataInputStream ).readObject(); } catch ( ClassNotFoundException e ) { throw new RuntimeException( e ); }
					}
					nextField++;
				}
				
				// Read field
				nextField++;

				switch( fieldType ) {
				case TEXT:
					len = documentsInputBitStream.readDelta();
					fieldContent.length( 0 );

					// Frequency statistics are local to each field.
					termsFrequencyKeeper.reset();
					if ( exact ) nonTermsFrequencyKeeper.reset();

					while( len-- != 0 ) {
						termsInputStream.position( termOffsets.getLong( termsFrequencyKeeper.decode( documentsInputBitStream.readDelta() ) ) );
						s.readSelfDelimUTF8( termsInputStream );
						fieldContent.append( s );
						if ( exact ) {
							nonTermsInputStream.position( nonTermOffsets.getLong( nonTermsFrequencyKeeper.decode( documentsInputBitStream.readDelta() ) ) );
							s.readSelfDelimUTF8( nonTermsInputStream );
							fieldContent.append( s );
						}
						else fieldContent.append( ' ');
					}
					return new FastBufferedReader( fieldContent );
				case VIRTUAL:
					final int nfrag = nonTextDataInputStream.readInt();
					MutableString doc = new MutableString();
					MutableString text = new MutableString();
					VirtualDocumentFragment[] fragArray = new VirtualDocumentFragment[ nfrag ];
					for ( int i = 0; i < nfrag; i++ ) {
						doc.readSelfDelimUTF8( (InputStream)nonTextDataInputStream );
						text.readSelfDelimUTF8( (InputStream)nonTextDataInputStream );
						fragArray[ i ] = new AnchorExtractor.Anchor( doc.copy(), text.copy() );
					}
					return new ObjectArrayList( fragArray );

				default:
					try { return new ObjectInputStream( nonTextDataInputStream ).readObject(); } catch ( ClassNotFoundException e ) { throw new RuntimeException( e ); }
				}
			
				
			}

			public CharSequence title() {
				return title;
			}

			/** Returns the URI of this document, or <code>null</code> if the stored URI is empty. */
			public CharSequence uri() {
				return uri.length() == 0 ? null : uri;
			}

			/** Returns the word reader of the factory for text and virtual fields, and <code>null</code> otherwise. */
			public WordReader wordReader( int field ) {
				switch( factory.fieldType( field ) ) {
				case TEXT:
				case VIRTUAL: return fakeDocument.wordReader( field );
				default: return null;
				}
			}
			
			public void close() throws IOException {
				super.close();
				if ( hasNonText ) nonTextDataInputStream.close();
			}

		};
	}

	
	public Reference2ObjectMap,Object> metadata( long index ) throws IOException {
		throw new UnsupportedOperationException();
	}

	
	/** Returns the number of documents in this collection.
	 *
	 * @return the number of documents in this collection.
	 */
	public long size() {
		return documents;
	}

	
	/** Unsupported by this collection.
	 *
	 * @param index the index of a document.
	 * @return never returns normally.
	 * @throws UnsupportedOperationException always.
	 */
	public InputStream stream( long index ) throws IOException {
		throw new UnsupportedOperationException();
	}
	
	/** Closes this collection, releasing the document, term and nonterm streams and the zip file, if any.
	 *
	 * <p>The term streams and the zip file are released even if closing the document stream fails.
	 *
	 * @throws IOException if the superclass or the document stream fail to close.
	 */
	public void close() throws IOException {
		try {
			super.close();
			if ( documentsInputBitStream != null ) documentsInputBitStream.close();
		}
		finally {
			IOUtils.closeQuietly( termsInputStream );
			IOUtils.closeQuietly( nonTermsInputStream );
			if ( zipFile != null ) {
				try {
					zipFile.close();
				}
				catch ( IOException ignored ) {
					// Closed quietly, like the term streams above: the file was only read.
				}
			}
		}
	}

	/** Returns the factory underlying this collection.
	 *
	 * @return the factory underlying this collection.
	 */
	public DocumentFactory factory() {
		return factory;
	}

	/** Restores the transient state after deserialisation.
	 *
	 * <p>Offsets, mappings and files are looked for using the bare basename; I/O errors are
	 * ignored at this stage, as a subsequent call to {@link #filename(CharSequence)} can retry.
	 *
	 * @param s the stream this instance is being read from.
	 */
	private void readObject( final ObjectInputStream s ) throws IOException, ClassNotFoundException {
		s.defaultReadObject();

		// Best effort: failures here are not rethrown.
		final boolean rethrow = false;
		this.loadOffsets( basename, rethrow );
		this.initMappings( basename, rethrow );
		this.initFiles( basename, rethrow );

		// The codecs are transient, too. The nonterm codec is left null for approximated collections.
		this.termsFrequencyKeeper = new FrequencyCodec();
		this.nonTermsFrequencyKeeper = exact ? new FrequencyCodec() : null;
	}
		
	
	/** Unfinished, experimental method: rewrites the document file renumbering terms (and nonterms) by decreasing frequency.
	 *
	 * <p><strong>Warning</strong>: this method replaces the document file and truncates the term file
	 * without writing the permuted terms or any offsets, so it leaves the collection unusable.
	 *
	 * <p>NOTE(review): the values read from the document file are used directly as term/nonterm indices,
	 * whereas {@link #document(long)} passes them through a {@link FrequencyCodec}; moreover, only
	 * <code>numberOfFields() - 1</code> fields per document are scanned, and all of them are treated as text.
	 * These aspects look wrong but depend on the intended design, so they are left untouched -- confirm before completing this method.
	 *
	 * @param basename the filename of a serialised collection.
	 * @throws IOException if an I/O error occurs, or the new document file cannot be moved in place.
	 */
	public static void optimize( final CharSequence basename ) throws IOException, ClassNotFoundException {
		final SimpleCompressedDocumentCollection collection = (SimpleCompressedDocumentCollection)AbstractDocumentCollection.load( basename );
		final long[] termFrequency = new long[ (int)collection.terms ];
		final long[] nonTermFrequency = collection.exact ? new long[ (int)collection.nonTerms ] : null;
		final DocumentFactory factory = collection.factory;
		final boolean exact = collection.exact;
		final MutableString s = new MutableString();
		File newDocumentsFile;

		try {
			final InputBitStream documentsIbs = collection.documentsInputBitStream;

			// First pass: compute frequencies.
			documentsIbs.position( 0 );
			for( int i = (int)collection.documents; i-- != 0; ) {
				readSelfDelimitedUtf8String( documentsIbs, s ); // Skip URI
				readSelfDelimitedUtf8String( documentsIbs, s ); // Skip title
				for( int f = factory.numberOfFields() - 1; f-- !=0; ) {
					int len = documentsIbs.readDelta();
					while( len-- != 0 ) {
						termFrequency[ documentsIbs.readDelta() ]++;
						if ( exact ) nonTermFrequency[ documentsIbs.readDelta() ]++;
					}
				}
			}

			// Nonterms are ranked by their own frequencies (they used to be ranked, and sized, by term frequencies).
			final int[] invTermPerm = frequencyRank( termFrequency );
			final int[] invNonTermPerm = exact ? frequencyRank( nonTermFrequency ) : null;

			// Second pass: rewrite the documents using the new numbering.
			newDocumentsFile = File.createTempFile( SimpleCompressedDocumentCollection.class.getSimpleName(), "temp", new File( basename.toString() ).getParentFile() );
			final OutputBitStream newDocumentsObs = new OutputBitStream( newDocumentsFile );
			try {
				documentsIbs.position( 0 );
				for( int i = (int)collection.documents; i-- != 0; ) {
					readSelfDelimitedUtf8String( documentsIbs, s ); // Copy URI
					SimpleCompressedDocumentCollectionBuilder.writeSelfDelimitedUtf8String( newDocumentsObs, s );
					readSelfDelimitedUtf8String( documentsIbs, s ); // Copy title
					SimpleCompressedDocumentCollectionBuilder.writeSelfDelimitedUtf8String( newDocumentsObs, s );
					for( int f = factory.numberOfFields() - 1; f-- !=0; ) {
						int len = documentsIbs.readDelta();
						newDocumentsObs.writeDelta( len );
						while( len-- != 0 ) {
							newDocumentsObs.writeDelta( invTermPerm[ documentsIbs.readDelta() ] );
							if ( exact ) newDocumentsObs.writeDelta( invNonTermPerm[ documentsIbs.readDelta() ] );
						}
					}
				}
			}
			finally {
				newDocumentsObs.close();
			}
		}
		finally {
			// Release the files of the collection before replacing them.
			collection.close();
		}

		final File documentsFile = new File( basename + DOCUMENTS_EXTENSION );
		documentsFile.delete();
		if ( ! newDocumentsFile.renameTo( documentsFile ) ) throw new IOException( "Cannot rename " + newDocumentsFile + " to " + documentsFile );

		final MutableString term[] = new MutableString[ (int)collection.terms ];
		final FastBufferedInputStream termsStream = new FastBufferedInputStream( new FileInputStream( basename + TERMS_EXTENSION ) ) ;
		try {
			for( int i = 0; i < term.length; i++ ) term[ i ] = new MutableString().readSelfDelimUTF8( termsStream );
		}
		finally {
			termsStream.close();
		}

		// NOTE(review): as in the original code, this truncates the term file and writes nothing: writing the
		// permuted terms (and the new offsets) is still to be implemented. The stream is now at least closed.
		new FastBufferedOutputStream( new FileOutputStream( basename + TERMS_EXTENSION ) ).close();
	}

	/** Computes, for each index of an array of frequencies, its rank in the ordering by decreasing frequency.
	 *
	 * @param frequency an array of frequencies.
	 * @return an array <code>rank</code> such that <code>rank[ i ]</code> is the position of index <code>i</code>
	 * after sorting the indices by decreasing frequency.
	 */
	private static int[] frequencyRank( final long[] frequency ) {
		final int[] perm = new int[ frequency.length ];
		for( int i = perm.length; i-- != 0; ) perm[ i ] = i;
		IntArrays.quickSort( perm, 0, perm.length, new AbstractIntComparator() {
			public int compare( int arg0, int arg1 ) {
				return frequency[ arg1 ] < frequency[ arg0 ] ? -1 : frequency[ arg1 ] == frequency[ arg0 ] ? 0 : 1;
			}
		});

		final int[] rank = new int[ perm.length ];
		for( int i = rank.length; i-- != 0; ) rank[ perm[ i ] ] = i;
		return rank;
	}

	/** Command-line entry point: runs {@link #optimize(CharSequence)} on the collection whose filename is given as argument.
	 *
	 * @param arg the command-line arguments; the only (required) argument is the filename of the collection.
	 */
	public static void main( final String[] arg ) throws IOException, JSAPException, InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException, ConfigurationException, ClassNotFoundException {

		// The usage message must name this class (it used to name FileSetDocumentCollection).
		SimpleJSAP jsap = new SimpleJSAP( SimpleCompressedDocumentCollection.class.getName(), "Optimises a simple compressed document collection.",
				new Parameter[] {
					new UnflaggedOption( "basename", JSAP.STRING_PARSER, JSAP.REQUIRED, "The filename of the collection." ),
				}
		);
		
		JSAPResult jsapResult = jsap.parse( arg );
		// Nothing to do if parsing failed or help was requested.
		if ( jsap.messagePrinted() ) return;
		
		optimize( jsapResult.getString( "basename" ) );
	}

}