it.unimi.dsi.util.ImmutableExternalPrefixMap Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of dsi-utils Show documentation
Blazegraph modifications to the DSI utilities. These are forked from version 1.10.0 under LGPLv2.1.
There is a newer version: 2.1.4
package it.unimi.dsi.util;

/*		 
 * DSI utilities
 *
 * Copyright (C) 2005-2009 Sebastiano Vigna 
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 2.1 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 */

import it.unimi.dsi.bits.BitVector;
import it.unimi.dsi.bits.PrefixCoderTransformationStrategy;
import it.unimi.dsi.compression.Decoder;
import it.unimi.dsi.compression.HuTuckerCodec;
import it.unimi.dsi.compression.PrefixCodec;
import it.unimi.dsi.compression.PrefixCoder;
import it.unimi.dsi.fastutil.booleans.BooleanIterator;
import it.unimi.dsi.fastutil.chars.Char2IntOpenHashMap;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.objects.AbstractObjectIterator;
import it.unimi.dsi.fastutil.objects.ObjectArrayList;
import it.unimi.dsi.fastutil.objects.ObjectIterator;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.FileLinesCollection;
import it.unimi.dsi.io.InputBitStream;
import it.unimi.dsi.io.OutputBitStream;
import it.unimi.dsi.lang.MutableString;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.zip.GZIPInputStream;

import org.apache.commons.io.IOUtils;

import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.Switch;
import com.martiansoftware.jsap.UnflaggedOption;
import com.martiansoftware.jsap.stringparsers.ForNameStringParser;

// TODO: implement interfaces correctly (e.g., using the iterator)

/** An immutable prefix map mostly stored in external memory.
 *
 * An {@link it.unimi.dsi.util.ImmutableExternalPrefixMap} compresses words using
 * a {@link it.unimi.dsi.compression.HuTuckerCodec} and approximates
 * intervals using an {@link it.unimi.dsi.util.ImmutableBinaryTrie} that uses the same codec.
 * 
 * This class releases on a dump stream most of the data that
 * would be contained in the corresponding internal-memory map. 
 * More precisely, each 
 * block (with user-definable length, possibly the size of a basic disk I/O operation)
 * is filled as much as possible with strings front coded and compressed with a 
 * {@link it.unimi.dsi.compression.HuTuckerCodec}. 
 * Each block starts with the length of the first string in unary, followed by the encoding of the
 * string. Then, for each string we write in unary the length of the common prefix (in characters)
 * with the previous string, the length of the remaining suffix (in characters)
 * and finally the encoded suffix. Note that if the encoding of a string is longer than a block, the string will occupy more than one block.
 * 
 * 
 * <p>We keep track using an {@link ImmutableBinaryTrie}
 * of the strings at the start of each block: thus, we are able to retrieve the interval corresponding
 * to a given prefix by calling {@link ImmutableBinaryTrie#getApproximatedInterval(BooleanIterator) getApproximatedInterval()}
 * and scanning at most two blocks.
 * 
 * 
 * <h3>Self-contained or non-self-contained</h3>
 * 
 * There are two kinds of external prefix maps: self-contained and non-self-contained.
 * In the first case, you get a serialised object that you can load at any time. The dump
 * stream is serialised with the object and expanded at each deserialisation in the Java temporary directory.
 * If you deserialise a map several times, you will get correspondingly many copies of
 * the dump stream in the temporary directory. The dump streams are deleted when the JVM
 * exits. This mechanism is not very efficient, but since this class implements several
 * interfaces it is essential that clients can make the thing work in a standard way.
 * 
 * 
 * <p>Alternatively, you can give at creation time a filename for the dump stream. 
 * The resulting non-self-contained external prefix map
 * can be serialised, but after deserialisation 
 * you need to set back the {@linkplain #setDumpStream(CharSequence) dump stream filename}
 * or even directly the {@linkplain #setDumpStream(InputBitStream) dump stream} (for instance, to 
 * an {@linkplain it.unimi.dsi.io.InputBitStream#InputBitStream(byte[]) input bit stream
 * wrapping a byte array where the dump stream has been loaded}). You can deserialise many
 * copies of an external prefix map, letting all copies share the same dump stream. 
 * 
 * 
 * <p>This data structure is not synchronised, and concurrent reads may cause problems
 * because of clashes in the usage of the underlying input bit stream. It would not
 * be a good idea in any case to open a new stream for each caller, as that would 
 * certainly lead to disk thrashing. 
 * 
 * 
 * <p>The {@linkplain #main(String[]) main method} of this class
 * helps in building large external prefix maps.
 * 
 * @author Sebastiano Vigna
 * @since 0.9.3
 */
public class ImmutableExternalPrefixMap extends AbstractPrefixMap implements Serializable {
	/** If true, the constructor prints block-alignment and block-delimiter diagnostics on standard error. */
	final private static boolean DEBUG = false;
	/** If true, the constructor checks its block-alignment invariant with an <code>assert</code>. */
	final private static boolean ASSERTS = false;

	public static final long serialVersionUID = 1L;
	
	/** The standard block size (in bytes). */
	public final static int STD_BLOCK_SIZE = 1024;
	
	/** The in-memory data structure used to approximate intervals; it is <code>null</code> if the map
	 * was built from an empty term collection. */
	final protected ImmutableBinaryTrie intervalApproximator;
	/** The block size of this map (in bits). */
	final protected long blockSize;
	/** A decoder used to read data from the dump stream; it is <code>null</code> if the map
	 * was built from an empty term collection. */
	final protected Decoder decoder;
	/** A map (given by an array) from symbols in the coder to characters. */
	final protected char[] symbol2char;
	/** A map from characters to symbols of the coder; its default return value is -1. */
	final protected Char2IntOpenHashMap char2symbol;
	/** The number of terms in this map. */
	final protected int size;
	/** The index of the first word in each block, plus an additional entry containing {@link #size}. */
	final protected int[] blockStart;
	/** An array parallel to {@link #blockStart} giving the offset in blocks in the dump file
	 * of the corresponding word in {@link #blockStart}. If there are no overflows, this will just
	 * be an initial segment of the natural numbers, but overflows cause jumps. It has one entry
	 * less than {@link #blockStart}, as the final entry of the latter does not start a block. */
	final protected int[] blockOffset;
	/** Whether this map is self-contained. */
	final protected boolean selfContained;
	/** The length in bytes of the dump stream, both for serialisation purposes and for minimal checks. */
	final private long dumpStreamLength;
	/** The filename of the temporary dump stream, or of the dump stream created by the constructor or by readObject(). */
	private transient String tempDumpStreamFilename;
	/** If true, the creation of the last DumpStreamIterator was not
	 * followed by a call to any get method. */
	protected transient boolean iteratorIsUsable;
	/** A reference to the dump stream; being transient, it is <code>null</code> after deserialisation
	 * of a non-self-contained map until one of the <code>setDumpStream()</code> methods is called. */
	protected transient InputBitStream dumpStream;
	
	/** map external map.
	 * 
	 * 
This constructor does not assume that strings returned by terms.iterator()
	 * will be distinct. Thus, it can be safely used with {@link FileLinesCollection}.
	 * 
	 * @param terms an iterable whose iterator will enumerate in lexicographical order the terms for the map.
	 * @param blockSizeInBytes the block size (in bytes).
	 * @param dumpStreamFilename the name of the dump stream, or null for a map
	 * with an automatic dump stream.
	 */
	
	public ImmutableExternalPrefixMap( final Iterable terms, final int blockSizeInBytes, final CharSequence dumpStreamFilename ) throws IOException {
		this.blockSize = blockSizeInBytes * 8;
		this.selfContained = dumpStreamFilename == null;
		// First of all, we gather frequencies for all Unicode characters
		int[] frequency = new int[ Character.MAX_VALUE + 1 ]; 
		int maxWordLength = 0;
		CharSequence s;
		int count = 0;

		final MutableString prevTerm = new MutableString();

		for( Iterator i = terms.iterator(); i.hasNext(); ) {
			s = i.next();
			maxWordLength = Math.max( s.length(), maxWordLength );
			for( int j = s.length(); j-- != 0; ) frequency[ s.charAt( j ) ]++;
			if ( count > 0 && prevTerm.compareTo( s ) >= 0 ) throw new IllegalArgumentException( "The provided term collection is not sorted, or contains duplicates [" + prevTerm + ", " + s + "]" );
			count++;
			prevTerm.replace( s );
		}
		
		size = count;
		
		// Then, we compute the number of actually used characters
		count = 0;
		for( int i = frequency.length; i-- != 0; ) if ( frequency[ i ] != 0 ) count++;

		/* Now we remap used characters in f, building at the same time maps from 
		 * symbol to characters and from characters to symbols. */
		
		int[] packedFrequency = new int[ count ];
		symbol2char = new char[ count ];
		char2symbol = new Char2IntOpenHashMap( count );
		char2symbol.defaultReturnValue( -1 );
		
		for( int i = frequency.length, k = count; i-- != 0; ) {
			if ( frequency[ i ] != 0 ) {
				packedFrequency[ --k ] = frequency[ i ];
				symbol2char[ k ] = (char)i;
				char2symbol.put( (char)i, k );
			}
		}
		
		char2symbol.trim();
		
		// We now build the coder used to code the strings
		
		final PrefixCoder prefixCoder;
		final PrefixCodec codec;
		final BitVector[] codeWord;

		if ( packedFrequency.length != 0 ) {
			codec = new HuTuckerCodec( packedFrequency );
			prefixCoder = codec.coder();
			decoder = codec.decoder();
			codeWord = prefixCoder.codeWords();
		}
		else {
			// This handles the case of a collection without words
			codec = null;
			prefixCoder = null;
			decoder = null;
			codeWord = null;
		}
		
		packedFrequency = frequency = null;

		// We now compress all strings using the given codec mixed with front coding
		final OutputBitStream output;
		if ( selfContained ) {
			final File temp = File.createTempFile( this.getClass().getName(), ".dump" );
			temp.deleteOnExit();
			tempDumpStreamFilename = temp.toString();
			output = new OutputBitStream( temp, blockSizeInBytes );
		}
		else output = new OutputBitStream( tempDumpStreamFilename = dumpStreamFilename.toString(), blockSizeInBytes );
		
		// This array will contain the delimiting words (the ones at the start of each block)
		boolean isDelimiter;
		
		int length, prevTermLength = 0, bits;
		int prefixLength = 0, termCount = 0;
		int currBuffer = 0;
		
		final IntArrayList blockStarts = new IntArrayList();
		final IntArrayList blockOffsets = new IntArrayList();
		final ObjectArrayList delimiters = new ObjectArrayList();
		prevTerm.length( 0 );
		
		for( Iterator i = terms.iterator(); i.hasNext(); ) {
			s = (CharSequence) i.next();
			length = s.length();

			isDelimiter = false;
			
			// We compute the common prefix and the number of bits that are necessary to code the next term.
			bits = 0;
			for( prefixLength = 0; prefixLength < length && prefixLength < prevTermLength && prevTerm.charAt( prefixLength ) == s.charAt( prefixLength ); prefixLength++ );
			for( int j = prefixLength; j < length; j++ ) bits += codeWord[ char2symbol.get( s.charAt( j ) ) ].size();
			
			//if ( bits + length + 1 > blockSize ) throw new IllegalArgumentException( "The string \"" + s + "\" is too long to be encoded with block size " + blockSizeInBytes );
			
			// If the next term would overflow the block, and we are not at the start of a block, we align.
			if ( output.writtenBits() % blockSize != 0 && output.writtenBits() / blockSize != ( output.writtenBits() + ( length - prefixLength + 1 ) + ( prefixLength + 1 ) + bits - 1 ) / blockSize ) {
				// We align by writing 0es.
				if ( DEBUG ) System.err.println( "Aligning away " + ( blockSize - output.writtenBits() % blockSize ) + " bits..." );
				for( int j = (int)( blockSize - output.writtenBits() % blockSize ); j-- != 0; ) output.writeBit( 0 );
				if ( ASSERTS ) assert output.writtenBits() % blockSize == 0;
			}

			if ( output.writtenBits() % blockSize == 0 ) {
				isDelimiter = true;
				prefixLength = 0;
				blockOffsets.add( (int)( output.writtenBits() / blockSize ) );
			}
			
			// Note that delimiters do not get the prefix length, as it's 0.
			if ( ! isDelimiter ) output.writeUnary( prefixLength );
			output.writeUnary( length - prefixLength );

			// Write the next coded suffix on output.
			for( int j = prefixLength; j < length; j++ ) {
				BitVector c = codeWord[ char2symbol.get( s.charAt( j ) ) ];
				for( int k = 0; k < c.size(); k++ ) output.writeBit( c.getBoolean( k ) );
			}
			
			if ( isDelimiter ) {
				if ( DEBUG ) System.err.println( "First string of block " + blockStarts.size() + ": " + termCount + " (" + s + ")" );
				// The current word starts a new block
				blockStarts.add( termCount );
				// We do not want to rely on s being immutable.
				delimiters.add( new MutableString( s ) );
			}
			
			currBuffer = 1 - currBuffer;
			prevTerm.replace( s );
			prevTermLength = length;
			termCount++;
		}
		
		output.align();
		dumpStreamLength = output.writtenBits() / 8;
		output.close();
		
		intervalApproximator = prefixCoder == null ? null : new ImmutableBinaryTrie( delimiters, new PrefixCoderTransformationStrategy( prefixCoder, char2symbol, false ) );

		blockStarts.add( size );
		blockStart = blockStarts.toIntArray();
		blockOffset = blockOffsets.toIntArray();
		
		// We use a buffer of the same size of a block, hoping in fast I/O. */
		dumpStream = new InputBitStream( tempDumpStreamFilename, blockSizeInBytes );
	}

	/** Creates an external map with block size {@link #STD_BLOCK_SIZE} and specified dump stream.
	 * 
	 * <p>This constructor does not assume that the objects returned by <code>terms.iterator()</code>
	 * are distinct instances (the constructor it delegates to copies each term before fetching the next one).
	 * Thus, it can be safely used with {@link FileLinesCollection}. The terms are scanned more than once,
	 * and an {@link IllegalArgumentException} is thrown if they are not sorted or contain duplicates.
	 * 
	 * @param terms a collection whose iterator will enumerate in lexicographical order the terms for the map.
	 * @param dumpStreamFilename the name of the dump stream, or <code>null</code> for a map
	 * with an automatic dump stream.
	 */
	
	public ImmutableExternalPrefixMap( final Iterable terms, final CharSequence dumpStreamFilename ) throws IOException {
		this( terms, STD_BLOCK_SIZE, dumpStreamFilename );
	}

	/** Creates a self-contained external map with specified block size.
	 * 
	 * <p>This constructor does not assume that the objects returned by <code>terms.iterator()</code>
	 * are distinct instances (the constructor it delegates to copies each term before fetching the next one).
	 * Thus, it can be safely used with {@link FileLinesCollection}. The terms are scanned more than once,
	 * and an {@link IllegalArgumentException} is thrown if they are not sorted or contain duplicates.
	 * 
	 * @param terms a collection whose iterator will enumerate in lexicographical order the terms for the map.
	 * @param blockSizeInBytes the block size (in bytes).
	 */
	
	public ImmutableExternalPrefixMap( final Iterable terms, final int blockSizeInBytes ) throws IOException {
		this( terms, blockSizeInBytes, null );
	}
	
	/** Creates a self-contained external prefix map with block size {@link #STD_BLOCK_SIZE}.
	 * 
	 * <p>This constructor does not assume that the objects returned by <code>terms.iterator()</code>
	 * are distinct instances (the constructor it ultimately delegates to copies each term before fetching the next one).
	 * Thus, it can be safely used with {@link FileLinesCollection}. The terms are scanned more than once,
	 * and an {@link IllegalArgumentException} is thrown if they are not sorted or contain duplicates.
	 * 
	 * @param terms a collection whose iterator will enumerate in lexicographical order the terms for the map.
	 */
	
	public ImmutableExternalPrefixMap( final Iterable terms ) throws IOException {
		this( terms, null );
	}

	/** Closes the current dump stream, if there is one, discarding any I/O error. */
	private void safelyCloseDumpStream() {
		final InputBitStream current = this.dumpStream;
		if ( current == null ) return;
		try {
			current.close();
		} 
		catch ( IOException ignored ) {
			// Best effort: the stream is being discarded anyway.
		}
	}
	
	/** Checks that this map is not self-contained.
	 * 
	 * @throws IllegalStateException if this map is self-contained.
	 */
	private void ensureNotSelfContained() {
		if ( ! selfContained ) return;
		throw new IllegalStateException( "You cannot set the dump file of a self-contained external prefix map" );
	}
	
	/** Checks whether every character of the given sequence has a symbol in the coder.
	 * 
	 * @param s a character sequence.
	 * @return true if all characters of <code>s</code> are keys of {@link #char2symbol}
	 * (in particular, if <code>s</code> is empty).
	 */
	private boolean isEncodable( final CharSequence s ) {
		final int length = s.length();
		for( int pos = 0; pos < length; pos++ ) {
			if ( ! char2symbol.containsKey( s.charAt( pos ) ) ) return false;
		}
		return true;
	}

	
	
	/** Sets the dump stream of this external prefix map to a given filename.
	 *
	 * <p>This method sets the dump file used by this map, and should be only
	 * called after deserialisation, providing exactly the file generated at
	 * creation time. Essentially anything can happen if you do not follow the rules.
	 *
	 * <p>Note that this method will attempt to close the old stream, if present,
	 * but only after the new file has been checked and opened successfully: if this
	 * method throws an exception, the map is left untouched.
	 *   
	 * @param dumpStreamFilename the name of the dump file.
	 * @throws IllegalStateException if this map is self-contained.
	 * @throws IllegalArgumentException if the length of the file differs from the length of the original dump stream.
	 * @throws FileNotFoundException if the file cannot be opened.
	 * @see #setDumpStream(InputBitStream)
	 */
	
	public void setDumpStream( final CharSequence dumpStreamFilename ) throws FileNotFoundException{
		ensureNotSelfContained();
		final String filename = dumpStreamFilename.toString();
		final long newLength = new File( filename ).length();
		if ( newLength != dumpStreamLength )
			throw new IllegalArgumentException( "The size of the new dump file (" + newLength + ") does not match the original length (" + dumpStreamLength + ")" );
		// We open the new stream first, so that a failure does not leave us with a closed stream.
		final InputBitStream newDumpStream = new InputBitStream( filename, (int)( blockSize / 8 ) );
		safelyCloseDumpStream();
		iteratorIsUsable = false;
		dumpStream = newDumpStream;
	}

	
	/** Sets the dump stream of this external prefix map to a given input bit stream.
	 *
	 * <p>This method sets the dump file used by this map, and should be only
	 * called after deserialisation, providing a repositionable stream containing
	 * exactly the file generated at
	 * creation time. Essentially anything can happen if you do not follow the rules.
	 *  
	 * <p>Using this method you can load an external prefix map in core memory, enjoying
	 * the compactness of the data structure, but getting much more speed. 
	 * 
	 * <p>Note that this method will attempt to close the old stream, if present
	 * (unless it is the very stream being set).
	 *   
	 * @param dumpStream a repositionable input bit stream containing exactly the dump stream generated
	 * at creation time.
	 * @throws IllegalStateException if this map is self-contained.
	 * @see #setDumpStream(CharSequence)
	 */
	public void setDumpStream( final InputBitStream dumpStream ) {
		ensureNotSelfContained();
		// Closing the stream we are about to install would leave the map with an unusable stream.
		if ( dumpStream != this.dumpStream ) safelyCloseDumpStream();
		iteratorIsUsable = false;
		this.dumpStream = dumpStream;
	}

	/** Checks that a dump stream is available.
	 * 
	 * @throws IllegalStateException if no dump stream is currently set.
	 */
	private void ensureStream() {
		if ( dumpStream != null ) return;
		throw new IllegalStateException( "This external prefix map has been deserialised, but no dump stream has been set" );
	}
	
	/** Returns the interval of indices of the terms that start with the given prefix.
	 * 
	 * <p>The interval approximator provides a range of blocks; the first matching term
	 * is then looked for by scanning the first block of the range, and the last matching term by scanning
	 * the last block of the range. This method repositions the dump stream, thus
	 * invalidating any iterator previously returned by this map. 
	 * 
	 * @param prefix a prefix.
	 * @return the interval of indices of terms extending <code>prefix</code>, or
	 * {@link Intervals#EMPTY_INTERVAL} if there is no such term.
	 * @throws IllegalStateException if no dump stream is currently set.
	 * @throws RuntimeException wrapping any {@link IOException} thrown while reading the dump stream.
	 */
	public Interval getInterval( final CharSequence prefix ) {
		ensureStream();
		// If prefix contains any character not coded by the prefix coder, we can return the empty interval.
		if ( ! isEncodable( prefix ) ) return Intervals.EMPTY_INTERVAL;
		// A map built from an empty collection has no interval approximator (and no terms at all).
		if ( intervalApproximator == null ) return Intervals.EMPTY_INTERVAL;

		// We recover the left extremes of the intervals where extensions of prefix could possibly lie.
		Interval interval = intervalApproximator.getApproximatedInterval( prefix );

		if ( interval == Intervals.EMPTY_INTERVAL ) return interval;
		try {
			dumpStream.position( blockOffset[ interval.left ] * blockSize );
			// Resets the count of read bits.
			dumpStream.readBits( 0 );
			iteratorIsUsable = false;
			MutableString s = new MutableString();
			// A negative prefix length marks the first term of a block, which has no stored prefix length.
			int suffixLength, prefixLength = -1, count = blockStart[ interval.left ], blockEnd = blockStart[ interval.left + 1 ], start = -1, end = -1;

			/* We scan the dump file, stopping if we exhaust the block */
			while( count < blockEnd ) {
				if ( prefixLength < 0 ) prefixLength = 0;
				else prefixLength = dumpStream.readUnary();
				suffixLength = dumpStream.readUnary();
				s.delete( prefixLength, s.length() );
				s.length( prefixLength + suffixLength );
				for( int i = 0; i < suffixLength; i++ ) s.charAt( i + prefixLength, symbol2char[ decoder.decode( dumpStream ) ] );
				if ( s.startsWith( prefix ) ) {
					start = count;
					break; 
				}
				count++;
			}
			
			/* If we did not find our string, there are two possibilities: if the
			 * interval contains one point, there is no string extending prefix. But
			 * if  the interval  is larger, the first string of the second block in the
			 * interval must be an extension of prefix. */
			if ( start < 0 && interval.length() == 1 ) return Intervals.EMPTY_INTERVAL;
			else start = count;
			
			end = start + 1;

			/* If the interval contains more than one point, the last string with
			 * given prefix is necessarily contained in the last block, and we
			 * must restart the search process. */
			if ( interval.length() > 1  ) {
				dumpStream.position( blockOffset[ interval.right ] * blockSize );
				dumpStream.readBits( 0 );
				s.length( 0 );
				end = blockStart[ interval.right ];
				blockEnd = blockStart[ interval.right + 1 ];
				prefixLength = -1;
			}
			
			/* We now scan forward until we find a string that does not extend prefix, or we exhaust the block. */
			while( end < blockEnd ) {
				if ( prefixLength < 0 ) prefixLength = 0;
				else prefixLength = dumpStream.readUnary();
				suffixLength = dumpStream.readUnary();
				s.delete( prefixLength, s.length() );
				s.length( prefixLength + suffixLength );
				for( int i = 0; i < suffixLength; i++ ) s.charAt( i + prefixLength, symbol2char[ decoder.decode( dumpStream ) ] );
				if ( ! s.startsWith( prefix ) ) break;
				end++;
			}
			
			return Interval.valueOf( start, end - 1 );
		} catch (IOException rethrow ) {
			throw new RuntimeException( rethrow );
		}
		
	}
	
	/** Writes the term of given index into the provided mutable string.
	 * 
	 * <p>The block containing the term is located by binary search on {@link #blockStart},
	 * and then decoded from its first term up to the requested one. This method repositions the dump stream, thus
	 * invalidating any iterator previously returned by this map.
	 * 
	 * @param index the index of a term.
	 * @param s a mutable string that will be filled with the term.
	 * @return <code>s</code>.
	 * @throws IllegalStateException if no dump stream is currently set.
	 * @throws RuntimeException wrapping any {@link IOException} thrown while reading the dump stream.
	 */
	protected MutableString getTerm( final int index, final MutableString s ) {
		ensureStream();
		// We perform a binary search to find the block to which the term belongs.
		final int searchResult = Arrays.binarySearch( blockStart, index );
		// A negative result encodes the insertion point: the block we want is the one just before it.
		final int block = searchResult < 0 ? - searchResult - 2 : searchResult;

		try {
			dumpStream.position( blockOffset[ block ] * blockSize );
			dumpStream.readBits( 0 );
			iteratorIsUsable = false;

			// We decode all terms from the start of the block to the one requested (inclusive).
			int remaining = index - blockStart[ block ] + 1;
			boolean firstInBlock = true;
			while( remaining-- != 0 ) {
				final int prefixLength;
				// The first term of a block has no stored prefix length.
				if ( firstInBlock ) {
					prefixLength = 0;
					firstInBlock = false;
				}
				else prefixLength = dumpStream.readUnary();
				final int suffixLength = dumpStream.readUnary();
				s.delete( prefixLength, s.length() );
				s.length( prefixLength + suffixLength );
				for( int pos = prefixLength; pos < prefixLength + suffixLength; pos++ ) s.charAt( pos, symbol2char[ decoder.decode( dumpStream ) ] );
			}
			
			return s;
		}
		catch( IOException rethrow ) {
			throw new RuntimeException( rethrow );
		}
	}

	/** Returns the index of a term.
	 * 
	 * <p>This method repositions the dump stream, thus
	 * invalidating any iterator previously returned by this map.
	 * 
	 * @param o a term (it must be a {@link CharSequence}).
	 * @return the index of <code>o</code> in this map, or -1 if <code>o</code> is not in the map.
	 * @throws IllegalStateException if no dump stream is currently set.
	 * @throws RuntimeException wrapping any {@link IOException} thrown while reading the dump stream.
	 */
	private long getIndex( final Object o ) {
		final CharSequence term = (CharSequence)o;
		ensureStream();
		// If term contains any character not coded by the prefix coder, we can return -1
		if ( ! isEncodable( term ) ) return -1;
		// A map built from an empty collection has no interval approximator (and no terms at all).
		if ( intervalApproximator == null ) return -1;

		/* If term is in the map, any string extending term must follow term. Thus,
		 * term can be in the map only if it can be found in the left block
		 * of an approximated interval for itself. */
		Interval interval = intervalApproximator.getApproximatedInterval( term );
		if ( interval == Intervals.EMPTY_INTERVAL ) return -1;
		try {
			dumpStream.position( blockOffset[ interval.left ] * blockSize );
			// Resets the count of read bits.
			dumpStream.readBits( 0 );
			iteratorIsUsable = false;
			MutableString s = new MutableString();
			// A negative prefix length marks the first term of a block, which has no stored prefix length.
			int suffixLength, prefixLength = -1, count = blockStart[ interval.left ], blockEnd = blockStart[ interval.left + 1 ];

			/* We scan the dump file, stopping if we exhaust the block */
			while( count < blockEnd ) {
				if ( prefixLength < 0 ) prefixLength = 0;
				else prefixLength = dumpStream.readUnary();
				suffixLength = dumpStream.readUnary();
				s.delete( prefixLength, s.length() );
				s.length( prefixLength + suffixLength );
				for( int i = 0; i < suffixLength; i++ ) s.charAt( i + prefixLength, symbol2char[ decoder.decode( dumpStream ) ] );
				if ( s.equals( term ) ) return count;
				count++;
			}
			
			return -1;
		}
		catch (IOException rethrow ) {
			throw new RuntimeException( rethrow );
		}
	}
	

	/** Checks whether a term belongs to this map.
	 * 
	 * @param term a term.
	 * @return true if <code>term</code> is in this map.
	 */
	public boolean containsKey( final Object term ) {
		final long position = getIndex( term );
		return position != -1;
	}
	
	/** Returns the index of a term.
	 * 
	 * @param o a term.
	 * @return the index of <code>o</code> in this map, or the default return value if <code>o</code> is not in the map.
	 */
	public long getLong( final Object o ) {
		final long position = getIndex( o );
		if ( position == -1 ) return defRetValue;
		return position;
	}

	/** An iterator over the dump stream. It does not use the interval approximator: it just scans the file
	 * sequentially. The same mutable string is returned by every call to {@link #next()}. */
	
	private final class DumpStreamIterator extends AbstractObjectIterator {
		/** The current block being enumerated (-1 before the first term is read). */
		private int currBlock = -1;
		/** The index of next term that will be returned. */
		private int index;
		/** The mutable string used to return the result. */
		final MutableString s = new MutableString();

		/** Rewinds the dump stream and marks the iterator as usable. */
		private DumpStreamIterator() {
			try {
				dumpStream.position( 0 );
			}
			catch ( IOException e ) {
				throw new RuntimeException( e );
			}
			// Resets the count of read bits, which is used to detect block boundaries.
			dumpStream.readBits( 0 );
			iteratorIsUsable = true;
		}
		
		public boolean hasNext() {
			if ( iteratorIsUsable ) return index < size;
			throw new IllegalStateException( "Get methods of this map have caused a stream repositioning" );
		}

		public CharSequence next() {
			if ( ! hasNext() ) throw new NoSuchElementException();
			try {
				final boolean startsBlock = index == blockStart[ currBlock + 1 ];
				final int prefixLength;
				if ( startsBlock ) {
					// We skip the padding up to the next block boundary, if any.
					final long bitsIntoBlock = dumpStream.readBits() % blockSize;
					if ( bitsIntoBlock != 0 ) dumpStream.skip( blockSize - bitsIntoBlock );
					currBlock++;
					// The first term of a block has no stored prefix length.
					prefixLength = 0;
				}
				else {
					prefixLength = dumpStream.readUnary();
				}
				final int suffixLength = dumpStream.readUnary();
				final int termLength = prefixLength + suffixLength;
				s.delete( prefixLength, s.length() );
				s.length( termLength );
				for ( int pos = prefixLength; pos < termLength; pos++ ) {
					s.charAt( pos, symbol2char[ decoder.decode( dumpStream ) ] );
				}
				index++;
				return s;
			}
			catch ( IOException e ) {
				throw new RuntimeException( e );
			}
		}

	}
	
	/** Returns an iterator over the map.
	 * 
	 * <p>The iterator returned by this method scans directly the dump stream. 
	 * 
	 * <p>Note that the returned iterator uses the same stream as all get methods. Once any such method
	 * has been called, further calls to the methods of the iterator
	 * will produce an {@link IllegalStateException}.
	 * 
	 * @return an iterator over the map that just scans the dump stream.
	 * @throws IllegalStateException if no dump stream is currently set.
	 */
	
	public ObjectIterator iterator() {
		// Without this check, a deserialised map with no dump stream would throw a NullPointerException.
		ensureStream();
		return new DumpStreamIterator();
	}
	
	/** Returns the number of terms in this map.
	 * 
	 * @return the number of terms in this map.
	 */
	public int size() {
		return size;
	}

	/** Serialises this map; if the map is self-contained, the content of the dump stream
	 * is appended to the default serialised form.
	 * 
	 * @param s the object output stream.
	 * @throws IOException if an I/O error occurs while writing or while reading the dump stream.
	 */
	private void writeObject( final ObjectOutputStream s ) throws IOException {
		s.defaultWriteObject();
		if ( selfContained ) {
			final FileInputStream fis = new FileInputStream( tempDumpStreamFilename );
			// The input stream is closed even if copying fails, so that no file descriptor is leaked.
			try {
				IOUtils.copy( fis, s );
			}
			finally {
				fis.close();
			}
		}
	}

	/** Deserialises this map; if the map is self-contained, the dump stream following the default
	 * serialised form is expanded into a new temporary file (deleted when the JVM exits), which is then opened.
	 * Otherwise, no dump stream is set.
	 * 
	 * @param s the object input stream.
	 * @throws IOException if an I/O error occurs while reading or while writing the temporary file.
	 * @throws ClassNotFoundException if default deserialisation fails to find a class.
	 */
	private void readObject( final ObjectInputStream s ) throws IOException, ClassNotFoundException {
		s.defaultReadObject();
		if ( selfContained ) {
			final File temp = File.createTempFile( this.getClass().getName(), ".dump" );
			temp.deleteOnExit();
			tempDumpStreamFilename = temp.toString();
			// TODO: propose Jakarta CopyUtils extension with length control and refactor.
			final FileOutputStream fos = new FileOutputStream( temp );
			// The output stream is closed even if copying fails, so that no file descriptor is leaked.
			try {
				final byte[] b = new byte[ 64 * 1024 ];
				int len;
				while( ( len = s.read( b ) ) >= 0 ) fos.write( b, 0, len );
			}
			finally {
				fos.close();
			}
			dumpStream = new InputBitStream( temp, (int)( blockSize / 8 ) );
		}
	}

	/** Builds an external prefix map from a list of terms and stores it in serialised form.
	 * 
	 * <p>Terms are read either from standard input or from a file, either as a
	 * newline-separated list (optionally gzip'd) or as a serialised {@link List}.
	 * 
	 * @param arg the command-line arguments.
	 */
	@SuppressWarnings("unchecked")
	public static void main( final String[] arg ) throws ClassNotFoundException, IOException, JSAPException, SecurityException, NoSuchMethodException {

		final Parameter[] options = new Parameter[] {
			new FlaggedOption( "blockSize", JSAP.INTSIZE_PARSER, ( STD_BLOCK_SIZE / 1024 ) + "Ki", JSAP.NOT_REQUIRED, 'b', "block-size", "The size of a block in the dump stream." ),
			new Switch( "serialised", 's', "serialised", "The data source (file or standard input) provides a serialised java.util.List of terms." ),
			new Switch( "zipped", 'z', "zipped", "Standard input is compressed in gzip format." ),
			new FlaggedOption( "termFile", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'o', "offline", "Read terms from this file instead of standard input." ),
			new FlaggedOption( "encoding", ForNameStringParser.getParser( Charset.class ), "UTF-8", JSAP.NOT_REQUIRED, 'e', "encoding", "The term list encoding." ),
			new UnflaggedOption( "map", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The filename for the serialised map." ),
			new UnflaggedOption( "dump", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, JSAP.NOT_GREEDY, "An optional dump stream (the resulting map will not be self-contained)." )
		};

		final SimpleJSAP jsap = new SimpleJSAP( ImmutableExternalPrefixMap.class.getName(), "Builds an external map reading from standard input a newline-separated list of terms or a serialised term list. If the dump stream name is not specified, the map will be self-contained.", options );

		final JSAPResult jsapResult = jsap.parse( arg );
		// If the parser printed a message (help or errors), there is nothing else to do.
		if ( jsap.messagePrinted() ) return;
		
		final String termFile = jsapResult.getString( "termFile" );
		final Charset encoding = (Charset)jsapResult.getObject( "encoding" );
		final boolean zipped = jsapResult.getBoolean( "zipped" );
		final boolean serialised = jsapResult.getBoolean( "serialised" );

		if ( zipped && serialised ) throw new IllegalArgumentException( "The zipped and serialised options are incompatible" );

		final Collection termList;

		if ( serialised ) {
			// The terms come as a serialised list, either from a file or from standard input.
			final Object loaded;
			if ( termFile != null ) loaded = BinIO.loadObject( termFile );
			else loaded = BinIO.loadObject( System.in );
			termList = (List)loaded;
		}
		else if ( termFile != null ) {
			// The lines of the file are scanned lazily.
			termList = new FileLinesCollection( termFile, encoding.name(), zipped );
		}
		else {
			// Standard input cannot be read twice, so we load all terms into memory.
			final ObjectArrayList list = new ObjectArrayList();
			final FastBufferedReader terms = new FastBufferedReader( new InputStreamReader( 
					zipped ? new GZIPInputStream( System.in ) : System.in, encoding.name() ) );
			final MutableString term = new MutableString();
			while( terms.readLine( term ) != null ) list.add( term.copy() );
			terms.close();
			termList = list;
		}

		final int blockSizeInBytes = jsapResult.getInt( "blockSize" );
		final String dumpStreamFilename = jsapResult.getString( "dump" );
		final ImmutableExternalPrefixMap map = new ImmutableExternalPrefixMap( termList, blockSizeInBytes, dumpStreamFilename );
		BinIO.storeObject( map, jsapResult.getString( "map" ) );
	}
}