/*
 * Source of it.unimi.di.archive4j.SequentialBitstreamArchive, from the archive4j artifact
 * (available for Maven / Gradle / Ivy).
 *
 * Archive4J is a suite of tools to compactly store the term/count information of a document collection.
 * Note: a newer version of archive4j (1.3.3) is available.
 */
package it.unimi.di.archive4j;

/*
 * Copyright (C) 2008-2013 Alessio Orlandi and Sebastiano Vigna
 *
 *  This program is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU General Public License as published by the Free
 *  Software Foundation; either version 2 of the License, or (at your option)
 *  any later version.
 *
 *  This program is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 */
import it.unimi.di.archive4j.SequentialBitstreamArchive.CompressionFlags.Coding;
import it.unimi.di.archive4j.SequentialBitstreamArchive.CompressionFlags.Component;
import it.unimi.di.archive4j.tool.Preprocess;
import it.unimi.dsi.bits.Fast;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.io.FastBufferedInputStream;
import it.unimi.dsi.fastutil.io.FastMultiByteArrayInputStream;
import it.unimi.dsi.fastutil.objects.AbstractObjectIterator;
import it.unimi.dsi.io.InputBitStream;
import it.unimi.dsi.util.Properties;
import it.unimi.dsi.util.StringMap;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.net.URI;
import java.util.Collections;
import java.util.EnumMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;


/** An {@link Archive} implementation providing sequential access only.
 * 
 * @see RandomAccessBitstreamArchive
 * @see BitstreamArchiveWriter
 * 
 * @author Alessio Orlandi
 * @author Sebastiano Vigna
 */

public class SequentialBitstreamArchive implements Archive {
	/**
	 * Class representing compression flags for much of the data in this archive.
	 *
	 * <p>A set of flags is a map from {@linkplain Component components} to
	 * {@linkplain Coding codings}; a component without an entry has no coding.
	 */
	public static class CompressionFlags {

		/** Each component of the data file or frequency file. */
		public static enum Component {
			/** The sizes of summaries (the number of terms/count pairs in the summary). */
			SIZES,
			/** The document lengths (in words). */
			DOCLENGTHS,
			/** The terms in the summary (actually, the term gaps). */
			TERMS,
			/** The counts in the summary (actually, reverse-order gaps). */
			COUNTS
		}

		/** The codings that can be associated with a {@link Component}. */
		public static enum Coding {
			UNARY, NIBBLE, SHIFTED_GAMMA, GAMMA, DELTA, ZETA_2, ZETA_3, ZETA_4, ZETA_5
		}

		/**
		 * Default codings (an unmodifiable map).
		 */
		public static final Map<Component, Coding> DEFAULT;

		static {
			// Populate first, then publish only the read-only view.
			final Map<Component, Coding> map = new EnumMap<Component, Coding>( Component.class );
			map.put( Component.SIZES, Coding.ZETA_3 );
			map.put( Component.DOCLENGTHS, Coding.DELTA );
			map.put( Component.TERMS, Coding.ZETA_2 );
			map.put( Component.COUNTS, Coding.GAMMA );
			DEFAULT = Collections.unmodifiableMap( map );
		}

		/**
		 * Decodes a set of strings of the form {@link Component}:{@link Coding} into a map.
		 * 
		 * @param codingSpec an array of strings of the form {@link Component}:{@link Coding}.
		 * @return a map obtained by parsing <code>codingSpec</code>.
		 * @throws IllegalArgumentException if a string is not of the form above, or names
		 * an unknown component or coding.
		 */
		public static Map<Component, Coding> valueOf( final String[] codingSpec ) {
			return valueOf( codingSpec, null );
		}

		/**
		 * Decodes a set of strings of the form {@link Component}:{@link Coding} into a map, patching
		 * missing components with a given default map. Components with coding <code>NONE</code>
		 * (case-insensitive) are removed from the map.
		 * 
		 * @param flag an array of strings of the form {@link Component}:{@link Coding}; leading
		 * and trailing whitespace of each string is ignored.
		 * @param defaultMap a map containing defaults, or null.
		 * @return a new, modifiable map obtained by parsing <code>flag</code>; missing components are taken
		 * from <code>defaultMap</code>.
		 * @throws IllegalArgumentException if a string is not of the form above, or names
		 * an unknown component or coding.
		 */
		public static Map<Component, Coding> valueOf( final String[] flag, final Map<Component, Coding> defaultMap ) {
			final EnumMap<Component, Coding> m = new EnumMap<Component, Coding>( Component.class );
			if ( defaultMap != null ) m.putAll( defaultMap );
			
			for ( int i = 0; i < flag.length; i++ ) {
				final String[] spec = flag[ i ].trim().split( ":" );
				if ( spec.length != 2 ) throw new IllegalArgumentException( "Bad format: " + flag[ i ] );

				final Component component = Component.valueOf( spec[ 0 ] );
				// NONE disables a component: drop its entry (possibly inherited from the defaults)
				// rather than storing a null value, so that the key does not show up in the map.
				if ( "NONE".equalsIgnoreCase( spec[ 1 ] ) ) m.remove( component );
				else m.put( component, Coding.valueOf( spec[ 1 ] ) );
			}
			return m;
		}
	}

	/**
	 * Property keys used by this class in addition to those
	 * in {@link ArchiveLoader.PropertyKeys}.
	 */
	public static enum PropertyKeys {
		/** A coding specification of the form {@link Component}:{@link Coding}; it may appear more than once. */
		CODING,

		/** The version of the archive, used for backward compatibility. */
		VERSION
	}

	/** The standard archive extension. */
	public static final String ARCHIVE_EXTENSION = ".archive";

	/** The standard permutation extension. */
	public static final String PERM_EXTENSION = ".permutation";

	/** The input bit stream for the data file. If {@link #fmbais} is not null, it
	 * wraps it; otherwise, it refers directly to a file named {@link #basename} + {@link #ARCHIVE_EXTENSION}.
	 * When the archive is closed it is nullified. */
	protected InputBitStream data;

	/** If not null, the in-memory stream upon which {@link #data} is based. */
	protected final FastMultiByteArrayInputStream fmbais;
	
	/** An optional list of URIs that will be used to create the URI associated to each summary, or null. */
	protected final List<? extends CharSequence> uriList;

	/** The number of document summaries in this archive. */
	protected final int numberOfDocuments;

	/** The number of terms in this archive. */
	protected final int numberOfTerms;

	/** The number of words in the documents summarized by this archive. */
	protected final long numberOfWords;

	/** The map from frequency rank to terms. */
	protected final int[] rank2Term;

	/** The frequency of each term, or null if no frequencies are available. */
	protected final int[] frequency;

	/** The codings of this archive; a component without a coding has no entry (or a null value). */
	protected final Map<Component, Coding> codings;

	/** The version of this archive; it selects the format used to decode document records. */
	private final int version;

	/** The coding for summary sizes. */
	private final Coding sizeCoding;

	/** The coding for document lengths, or null if lengths are not stored. */
	private final Coding lengthCoding;

	/** The coding for term gaps. */
	private final Coding termCoding;

	/** The coding for count gaps, or null if counts are not stored. */
	private final Coding countCoding;

	/** The basename of this archive. */
	protected CharSequence basename;

	/** Creates a new sequential bitstream archive.
	 * 
	 * @param basename the basename of the archive. 
	 * @param rank2Term the permutation from rank to terms.
	 * @param properties the properties of the archive.
	 * @param uriList an optional list of URIs that will be used to associate a URI to each summary, or null.
	 * @param frequency the term frequencies.
	 */
	protected SequentialBitstreamArchive( final CharSequence basename, final int[] rank2Term, final Properties properties, final List uriList, final int[] frequency ) throws IOException {
		this.numberOfDocuments = properties.getInt( ArchiveLoader.PropertyKeys.DOCUMENTS );
		this.numberOfTerms = properties.getInt( ArchiveLoader.PropertyKeys.TERMS );
		this.numberOfWords = properties.getLong( ArchiveLoader.PropertyKeys.WORDS );
		if ( properties.containsKey( ArchiveLoader.PropertyKeys.INMEMORY ) && properties.getBoolean( ArchiveLoader.PropertyKeys.INMEMORY ) ) {
			FastBufferedInputStream fbis = new FastBufferedInputStream( new FileInputStream( basename + ARCHIVE_EXTENSION ) );
			this.data = new InputBitStream( fmbais = new FastMultiByteArrayInputStream( fbis, fbis.length() ) );
		}
		else {
			fmbais = null;
			this.data = new InputBitStream( basename + ARCHIVE_EXTENSION );
		}
		this.uriList = uriList;
		this.rank2Term = rank2Term;
		this.frequency = frequency;
		this.version = properties.getInt( PropertyKeys.VERSION );
		this.basename = basename;

		this.codings = CompressionFlags.valueOf( properties.getStringArray( PropertyKeys.CODING ) );

		this.lengthCoding = codings.get( Component.DOCLENGTHS );
		this.sizeCoding = codings.get( Component.SIZES );
		this.termCoding = codings.get( Component.TERMS );
		this.countCoding = codings.get( Component.COUNTS );
		
		if ( uriList != null && uriList.size() != numberOfDocuments ) throw new IllegalArgumentException( "The URI list contains " + uriList.size() + " URIs, but the archive contains " + numberOfDocuments + " documents." );
	}

	/** Creates a new archive from an existing one.
	 * 
	 * <p>All metadata (permutation, frequencies, codings, URI list, etc.) are taken by
	 * reference from the given archive, but the new archive gets its own data stream:
	 * a new stream over the same file, or a new in-memory stream built from the one of the given archive.
	 * 
	 * @param archive the archive to be copied.
	 * @throws IOException if the new data stream cannot be created.
	 */
	protected SequentialBitstreamArchive( SequentialBitstreamArchive archive ) throws IOException {
		// Global data.
		this.basename = archive.basename;
		this.version = archive.version;
		this.numberOfDocuments = archive.numberOfDocuments;
		this.numberOfTerms = archive.numberOfTerms;
		this.numberOfWords = archive.numberOfWords;

		// Structures taken by reference.
		this.rank2Term = archive.rank2Term;
		this.frequency = archive.frequency;
		this.uriList = archive.uriList;
		this.codings = archive.codings;
		this.sizeCoding = archive.sizeCoding;
		this.lengthCoding = archive.lengthCoding;
		this.termCoding = archive.termCoding;
		this.countCoding = archive.countCoding;

		// A data stream of our own.
		if ( archive.fmbais != null ) {
			this.fmbais = new FastMultiByteArrayInputStream( archive.fmbais );
			this.data = new InputBitStream( fmbais );
		}
		else {
			this.fmbais = null;
			this.data = new InputBitStream( basename + ARCHIVE_EXTENSION );
		}
	}

	/** Checks that this archive has not been closed.
	 * 
	 * @throws IllegalStateException if the data stream has been nullified.
	 */
	protected void ensureOpen() throws IllegalStateException {
		if ( data != null ) return;
		throw new IllegalStateException( "The archive has been closed" );
	}

	/** Closes this archive, releasing its data stream. 
	 * 
	 * <p>Closing an archive that is already closed has no effect. The archive is
	 * marked as closed even if closing the underlying stream fails.
	 * 
	 * @throws IOException if closing the data stream fails.
	 */
	public void close() throws IOException {
		final InputBitStream stream = data;
		if ( stream == null ) return;
		// Nullify first, so that a failing close() cannot leave a half-closed stream in use.
		data = null;
		stream.close();
	}

	/** Returns the number of words in the documents summarized by this archive.
	 * 
	 * @return the number of words.
	 */
	public long numberOfWords() {
		return this.numberOfWords;
	}

	/** Reads a single integer from a bit stream using the given coding.
	 * 
	 * @param stream the stream to read from.
	 * @param coding the coding of the integer to be read.
	 * @return the integer read.
	 * @throws UnsupportedOperationException if the coding is not handled.
	 */
	private final int readInt( final InputBitStream stream, final Coding coding ) throws IOException {
		final int value;
		switch ( coding ) {
		case GAMMA:
			value = stream.readGamma();
			break;
		case SHIFTED_GAMMA:
			value = stream.readShiftedGamma();
			break;
		case DELTA:
			value = stream.readDelta();
			break;
		case ZETA_2:
			value = stream.readZeta( 2 );
			break;
		case ZETA_3:
			value = stream.readZeta( 3 );
			break;
		case ZETA_4:
			value = stream.readZeta( 4 );
			break;
		case ZETA_5:
			value = stream.readZeta( 5 );
			break;
		case UNARY:
			value = stream.readUnary();
			break;
		case NIBBLE:
			value = stream.readNibble();
			break;
		default:
			throw new UnsupportedOperationException( "Coding " + coding + " is not known." );
		}
		return value;
	}

	/** Reads a sequence of integers from a bit stream using the given coding.
	 * 
	 * <p>Unary and nibble codes are read one integer at a time; all other
	 * codings use the bulk-reading methods of the stream.
	 * 
	 * @param stream the stream to read from.
	 * @param coding the coding of the integers to be read.
	 * @param where the array that will be filled, starting from index 0.
	 * @param n the number of integers to read.
	 * @throws UnsupportedOperationException if the coding is not handled.
	 */
	private final void readInts( final InputBitStream stream, final Coding coding, int[] where, int n ) throws IOException {
		switch ( coding ) {
		case GAMMA:
			stream.readGammas( where, n );
			return;
		case SHIFTED_GAMMA:
			stream.readShiftedGammas( where, n );
			return;
		case DELTA:
			stream.readDeltas( where, n );
			return;
		case ZETA_2:
			stream.readZetas( 2, where, n );
			return;
		case ZETA_3:
			stream.readZetas( 3, where, n );
			return;
		case ZETA_4:
			stream.readZetas( 4, where, n );
			return;
		case ZETA_5:
			stream.readZetas( 5, where, n );
			return;
		case UNARY:
			for ( int k = 0; k < n; k++ ) where[ k ] = stream.readUnary();
			return;
		case NIBBLE:
			for ( int k = 0; k < n; k++ ) where[ k ] = stream.readNibble();
			return;
		default:
			throw new UnsupportedOperationException( "Coding " + coding + " is not known." );
		}
	}

	/**
	 * Reads the document record beginning at the current position of the data stream
	 * and builds an {@link ArrayDocumentSummary} representing it.
	 * 
	 * <p>The record is made of an optional document length (only if the archive version is
	 * greater than one and a length coding is defined), the number of terms, the term gaps
	 * and, if a count coding is defined, the counts. Terms are stored as gaps between ranks,
	 * and are mapped back through the rank-to-term permutation. The way counts are decoded
	 * depends on the archive version.
	 * 
	 * @param id the identifier that will be assigned to the summary; it is also used to
	 * index the URI list, if any.
	 * @return the summary for the record just read.
	 */
	protected ArrayDocumentSummary readCurrentDocument( final int id ) throws IOException {
		final boolean hasCounts = countCoding != null;

		// Record header: optional document length (-1 if absent), then the number of terms.
		final int documentLength;
		if ( version > 1 && lengthCoding != null ) documentLength = readInt( data, lengthCoding );
		else documentLength = -1;
		final int size = readInt( data, sizeCoding );

		// Raw payload.
		final int[] terms = new int[ size ];
		final int[] count = hasCounts ? new int[ size ] : null;
		readInts( data, termCoding, terms, size );
		if ( hasCounts ) readInts( data, countCoding, count, size );

		// Turn rank gaps into ranks, and ranks into terms.
		int previousRank = -1;
		for ( int i = 0; i < size; i++ ) {
			final int rank = terms[ i ] + previousRank + 1;
			terms[ i ] = rank2Term[ rank ];
			previousRank = rank;
		}

		if ( hasCounts ) {
			if ( version > 1 ) {
				// Counts are stored as differences with the previous count; from version 3
				// the first count is stored decremented by one instead.
				int previousCount = 0;
				for ( int i = 0; i < size; i++ ) {
					if ( version > 2 && i == 0 ) count[ i ]++;
					else count[ i ] = Fast.nat2int( count[ i ] ) + previousCount;
					previousCount = count[ i ];
				}

				// Counts are stored in reverse order.
				for ( int lo = 0, hi = size - 1; lo < hi; lo++, hi-- ) {
					final int swapped = count[ lo ];
					count[ lo ] = count[ hi ];
					count[ hi ] = swapped;
				}
			}
			else {
				// Oldest format: each count is stored decremented by one, in order.
				for ( int i = 0; i < size; i++ ) count[ i ]++;
			}
		}

		final URI uri;
		if ( uriList == null ) uri = null;
		else uri = URI.create( uriList.get( id ).toString() );

		return new ArrayDocumentSummary( terms, count, id, uri, documentLength, false );
	}

	/**
	 * Returns the codings used by this archive as an unmodifiable map.
	 * 
	 * <p>The returned map is a read-only view of the internal map of this archive.
	 * 
	 * @return an unmodifiable view of the codings used by this archive.
	 */
	public Map getCodings() {
		final Map readOnly = Collections.unmodifiableMap( codings );
		return readOnly;
	}

	/**
	 * Returns the permutation mapping frequency ranks to terms.
	 * 
	 * <p>The array returned is the one used internally by this archive, not a copy.
	 * 
	 * @return the rank-to-term permutation.
	 */
	public int[] getPermutation() {
		return this.rank2Term;
	}

	/** Returns an iterator over the document summaries of this archive, in the order in which they are stored.
	 * 
	 * <p>The data stream of this archive is repositioned at its start, and the iterator reads 
	 * from that same stream: thus, creating a new iterator affects the stream used by
	 * iterators obtained previously from this archive.
	 * 
	 * <p>An {@link IOException} raised while reading is rethrown wrapped in a {@link RuntimeException}.
	 * 
	 * @return an iterator over the document summaries of this archive.
	 * @throws IllegalStateException if the archive has been closed.
	 */
	public Iterator iterator() {
		ensureOpen();

		// Always restart from the first record.
		try {
			data.position( 0 );
		}
		catch ( IOException e ) {
			throw new RuntimeException( e );
		}

		return new AbstractObjectIterator() {
			/** The number of summaries returned so far. */
			private int returned;

			public boolean hasNext() {
				return returned < numberOfDocuments;
			}

			public ArrayDocumentSummary next() {
				ensureOpen();
				if ( returned >= numberOfDocuments ) throw new NoSuchElementException();

				returned++;
				try {
					// Each record is preceded by the delta-coded document identifier.
					final int id = data.readDelta();
					return readCurrentDocument( id );
				}
				catch ( IOException e ) {
					throw new RuntimeException( e );
				}
			}
		};
	}

	/** Returns the number of document summaries in this archive.
	 * 
	 * @return the number of document summaries.
	 */
	public int numberOfDocuments() {
		return this.numberOfDocuments;
	}

	/** Returns the number of terms in this archive.
	 * 
	 * @return the number of terms.
	 */
	public int numberOfTerms() {
		return this.numberOfTerms;
	}

	/** Returns the frequency of a term.
	 * 
	 * @param term the index of a term.
	 * @return the frequency of the given term.
	 * @throws UnsupportedOperationException if this archive has no frequency data.
	 */
	public int frequency( int term ) {
		if ( frequency != null ) return frequency[ term ];
		throw new UnsupportedOperationException();
	}

	/** Returns false, as this archive provides sequential access only.
	 * 
	 * @return false.
	 */
	public boolean hasRandomAccess() {
		final boolean randomAccess = false;
		return randomAccess;
	}

	/** Loads γ-coded frequencies, if they exist. 
	 * 
	 * <p>Frequencies are read from the file named <code>basename</code> +
	 * {@link Preprocess#FREQUENCIES_EXTENSION}. The file is closed even if reading fails.
	 * 
	 * @param basename the archive basename.
	 * @param numTerms the number of frequencies to read.
	 * @return an array of <code>numTerms</code> frequencies, or null if the frequency file does not exist.
	 * @throws IOException if the frequency file exists but cannot be read.
	 */
	protected static int[] loadFrequencies( CharSequence basename, int numTerms ) throws IOException {
		final String filename = basename + Preprocess.FREQUENCIES_EXTENSION;
		if ( ! new File( filename ).exists() ) return null;

		final int[] frequency = new int[ numTerms ];
		final InputBitStream frequencyFile = new InputBitStream( filename );
		try {
			frequencyFile.readGammas( frequency, frequency.length );
		}
		finally {
			// Release the file even if the frequencies cannot be read.
			frequencyFile.close();
		}
		return frequency;
	}

	/** Returns a {@link SequentialBitstreamArchive} obtained by loading the archive with given basename and optional URI list.
	 * 
	 * <p>The rank-to-term permutation is loaded from <code>basename</code> + {@link #PERM_EXTENSION},
	 * and term frequencies are loaded if the corresponding file exists.
	 * 
	 * @param basename the archive basename.
	 * @param properties the archive properties.
	 * @param uriFilename the filename of a URI list, or null; the file must contain either
	 * a serialized {@link StringMap} supporting {@link StringMap#list()}, or a serialized {@link List} of {@link CharSequence}s.
	 * @return the {@link SequentialBitstreamArchive} with given basename and URI list.
	 */
	@SuppressWarnings("unchecked")
	public static SequentialBitstreamArchive getInstance( CharSequence basename, Properties properties, CharSequence uriFilename ) throws IOException, ClassNotFoundException {
		// NOTE(review): the URI file is deserialized as a Java object; it should come from a trusted source.
		Object uriData = null;
		if ( uriFilename != null ) uriData = BinIO.loadObject( uriFilename );

		final int[] permutation = BinIO.loadInts( basename + PERM_EXTENSION );

		// The URI list is either the deserialized list itself, or the list view of the deserialized string map.
		final List uris;
		if ( uriData == null ) uris = null;
		else if ( uriData instanceof StringMap ) uris = ( (StringMap)uriData ).list();
		else uris = (List)uriData;

		final int[] frequencies = loadFrequencies( basename, properties.getInt( ArchiveLoader.PropertyKeys.TERMS ) );

		return new SequentialBitstreamArchive( basename, permutation, properties, uris, frequencies );
	}

	/** Unsupported: this archive provides sequential access only.
	 * 
	 * @param id a document identifier (ignored).
	 * @return never returns normally.
	 * @throws UnsupportedOperationException always.
	 */
	public ArrayDocumentSummary getDocumentById( final int id ) throws IOException {
		final String reason = "This bitstream archive does not support random access.";
		throw new UnsupportedOperationException( reason );
	}

	/** Unsupported: this archive provides sequential access only.
	 * 
	 * @param index a document index (ignored).
	 * @return never returns normally.
	 * @throws UnsupportedOperationException always.
	 */
	public ArrayDocumentSummary getDocumentByIndex( final int index ) throws IOException {
		final String reason = "This bitstream archive does not support random access.";
		throw new UnsupportedOperationException( reason );
	}

	/** Returns a copy of this archive with its own data stream.
	 * 
	 * <p>An {@link IOException} raised while creating the copy is rethrown
	 * wrapped in a {@link RuntimeException}.
	 * 
	 * @return a copy of this archive.
	 */
	public SequentialBitstreamArchive copy() {
		final SequentialBitstreamArchive duplicate;
		try {
			duplicate = new SequentialBitstreamArchive( this );
		}
		catch ( IOException e ) {
			throw new RuntimeException( e );
		}
		return duplicate;
	}
}