![JAR search and dependency download from the Maven repository](/logo.png)
src.it.unimi.di.archive4j.SequentialBitstreamArchive Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of archive4j Show documentation
Show all versions of archive4j Show documentation
Archive4J is a suite of tools for compactly storing the term/count information of a document collection.
package it.unimi.di.archive4j;
/*
* Copyright (C) 2008-2013 Alessio Orlandi and Sebastiano Vigna
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
*/
import it.unimi.di.archive4j.SequentialBitstreamArchive.CompressionFlags.Coding;
import it.unimi.di.archive4j.SequentialBitstreamArchive.CompressionFlags.Component;
import it.unimi.di.archive4j.tool.Preprocess;
import it.unimi.dsi.bits.Fast;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.io.FastBufferedInputStream;
import it.unimi.dsi.fastutil.io.FastMultiByteArrayInputStream;
import it.unimi.dsi.fastutil.objects.AbstractObjectIterator;
import it.unimi.dsi.io.InputBitStream;
import it.unimi.dsi.util.Properties;
import it.unimi.dsi.util.StringMap;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.net.URI;
import java.util.Collections;
import java.util.EnumMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
/** An {@link Archive} implementation providing sequential access only.
*
* @see RandomAccessBitstreamArchive
* @see BitstreamArchiveWriter
*
* @author Alessio Orlandi
* @author Sebastiano Vigna
*/
public class SequentialBitstreamArchive implements Archive {
/**
 * Class representing compression flags for much of the data in this archive.
 *
 * <p>A set of flags is a map from {@link Component}s to {@link Coding}s; a component
 * that has no coding is simply absent from the map.
 */
public static class CompressionFlags {
    /** Each component of the data file or frequency file. */
    public static enum Component {
        /** The sizes of summaries (the number of terms/count pairs in the summary). */
        SIZES,
        /** The document lengths (in words). */
        DOCLENGTHS,
        /** The terms in the summary (actually, the term gaps). */
        TERMS,
        /** The counts in the summary (actually, reverse-order gaps). */
        COUNTS
    }

    /** The instantaneous codes that can be associated with a {@link Component}. */
    public static enum Coding {
        UNARY, NIBBLE, SHIFTED_GAMMA, GAMMA, DELTA, ZETA_2, ZETA_3, ZETA_4, ZETA_5;
    }

    /**
     * Default codings (an unmodifiable map).
     */
    public static final Map<Component, Coding> DEFAULT;

    static {
        // Fill the backing map first, then publish only the read-only wrapper.
        final Map<Component, Coding> map = new EnumMap<Component, Coding>( Component.class );
        map.put( Component.SIZES, Coding.ZETA_3 );
        map.put( Component.DOCLENGTHS, Coding.DELTA );
        map.put( Component.TERMS, Coding.ZETA_2 );
        map.put( Component.COUNTS, Coding.GAMMA );
        DEFAULT = Collections.unmodifiableMap( map );
    }

    /**
     * Decodes a set of strings of the form {@link Component}:{@link Coding} into a map.
     *
     * @param codingSpec an array of strings of the form {@link Component}:{@link Coding}.
     * @return a map obtained by parsing <code>codingSpec</code>.
     * @throws IllegalArgumentException if a string is not of the form <code>COMPONENT:CODING</code>,
     * or names an unknown component or coding.
     */
    public static Map<Component, Coding> valueOf( final String[] codingSpec ) {
        return valueOf( codingSpec, null );
    }

    /**
     * Decodes a set of strings of the form {@link Component}:{@link Coding} into a map, patching
     * missing components with a given default map. Components with coding <code>NONE</code>
     * (case-insensitive) are removed from the map.
     *
     * @param flag an array of strings of the form {@link Component}:{@link Coding}.
     * @param defaultMap a map containing defaults, or <code>null</code>.
     * @return a new, modifiable map obtained by parsing <code>flag</code>; missing components are taken
     * from <code>defaultMap</code>.
     * @throws IllegalArgumentException if a string is not of the form <code>COMPONENT:CODING</code>,
     * or names an unknown component or coding.
     */
    public static Map<Component, Coding> valueOf( final String[] flag, final Map<Component, Coding> defaultMap ) {
        final EnumMap<Component, Coding> m = new EnumMap<Component, Coding>( Component.class );
        if ( defaultMap != null ) m.putAll( defaultMap );
        for ( final String rawSpec : flag ) {
            final String[] spec = rawSpec.trim().split( ":" );
            if ( spec.length != 2 ) throw new IllegalArgumentException( "Bad format: " + rawSpec );
            final Component component = Component.valueOf( spec[ 0 ] );
            // An EnumMap accepts null values, so storing null would leave the key present
            // (containsKey() true, entry visible when iterating); actually drop the component.
            if ( "NONE".equalsIgnoreCase( spec[ 1 ] ) ) m.remove( component );
            else m.put( component, Coding.valueOf( spec[ 1 ] ) );
        }
        return m;
    }
}
/**
 * Property keys specific to bitstream archives, complementing
 * those of {@link ArchiveLoader.PropertyKeys}.
 */
public static enum PropertyKeys {
    /** A {@link Component}:{@link Coding} pair. Multiple instances are allowed. */
    CODING,

    /** The version of the archive. Used for backward compatibility. */
    VERSION
}
/** The standard archive extension. */
public static final String ARCHIVE_EXTENSION = ".archive";
/** The standard permutation extension. */
public static final String PERM_EXTENSION = ".permutation";
/** The input bit stream for the data file. If {@link #fmbais} is not null
, it
* wraps it; otherwise, it refers directly to a file named {@link #basename} + {@link #ARCHIVE_EXTENSION}).
* When the archive is closed it is nullified. */
protected InputBitStream data;
/** If not null
, the in-memory stream upon which {@link #data} is based. */
protected final FastMultiByteArrayInputStream fmbais;
/** An optional list of URIs that will be used to create the URI associated to each summary. */
protected final List extends CharSequence> uriList;
/** The number of document summaries in this archive */
protected final int numberOfDocuments;
/** The number of terms in this archive. */
protected final int numberOfTerms;
/** The number of words in the documents summarized by this archive */
protected final long numberOfWords;
/** The map from frequency rank to terms. */
protected final int[] rank2Term;
/** The frequency of each term. */
protected final int[] frequency;
/** The codings of this archive. */
protected final Map codings;
/** The version of this archive. */
private int version;
/** The coding for summary sizes. */
private final Coding sizeCoding;
/** The coding for document lengths. */
private final Coding lengthCoding;
/** The coding for term gaps. */
private final Coding termCoding;
/** The coding for count gaps. */
private final Coding countCoding;
/** The basename of this archive. */
protected CharSequence basename;
/** Creates a new sequential bitstream archive.
*
* @param basename the basename of the archive.
* @param rank2Term the permutation from rank to terms.
* @param properties the properties of the archive.
* @param uriList an optional list of URIs that will be used to associate a URI to each summary, or null
.
* @param frequency the term frequencies.
*/
protected SequentialBitstreamArchive( final CharSequence basename, final int[] rank2Term, final Properties properties, final List extends CharSequence> uriList, final int[] frequency ) throws IOException {
this.numberOfDocuments = properties.getInt( ArchiveLoader.PropertyKeys.DOCUMENTS );
this.numberOfTerms = properties.getInt( ArchiveLoader.PropertyKeys.TERMS );
this.numberOfWords = properties.getLong( ArchiveLoader.PropertyKeys.WORDS );
if ( properties.containsKey( ArchiveLoader.PropertyKeys.INMEMORY ) && properties.getBoolean( ArchiveLoader.PropertyKeys.INMEMORY ) ) {
FastBufferedInputStream fbis = new FastBufferedInputStream( new FileInputStream( basename + ARCHIVE_EXTENSION ) );
this.data = new InputBitStream( fmbais = new FastMultiByteArrayInputStream( fbis, fbis.length() ) );
}
else {
fmbais = null;
this.data = new InputBitStream( basename + ARCHIVE_EXTENSION );
}
this.uriList = uriList;
this.rank2Term = rank2Term;
this.frequency = frequency;
this.version = properties.getInt( PropertyKeys.VERSION );
this.basename = basename;
this.codings = CompressionFlags.valueOf( properties.getStringArray( PropertyKeys.CODING ) );
this.lengthCoding = codings.get( Component.DOCLENGTHS );
this.sizeCoding = codings.get( Component.SIZES );
this.termCoding = codings.get( Component.TERMS );
this.countCoding = codings.get( Component.COUNTS );
if ( uriList != null && uriList.size() != numberOfDocuments ) throw new IllegalArgumentException( "The URI list contains " + uriList.size() + " URIs, but the archive contains " + numberOfDocuments + " documents." );
}
/** Creates a new sequential bitstream archive that duplicates a given one.
 *
 * <p>Counters, tables, codings and the URI list are taken by reference from
 * <code>archive</code>; only the bit stream over the data is created anew.
 *
 * @param archive the archive to be duplicated.
 * @throws IOException if the archive file cannot be reopened.
 */
protected SequentialBitstreamArchive( SequentialBitstreamArchive archive ) throws IOException {
    // Descriptive data.
    this.basename = archive.basename;
    this.version = archive.version;
    this.numberOfDocuments = archive.numberOfDocuments;
    this.numberOfTerms = archive.numberOfTerms;
    this.numberOfWords = archive.numberOfWords;

    // Shared tables.
    this.rank2Term = archive.rank2Term;
    this.frequency = archive.frequency;
    this.uriList = archive.uriList;

    // Codings.
    this.codings = archive.codings;
    this.sizeCoding = archive.sizeCoding;
    this.lengthCoding = archive.lengthCoding;
    this.termCoding = archive.termCoding;
    this.countCoding = archive.countCoding;

    // A fresh stream, either over a new in-memory stream built from the source's one
    // (NOTE(review): presumably sharing its backing arrays -- confirm against fastutil) or over the file.
    if ( archive.fmbais == null ) {
        this.fmbais = null;
        this.data = new InputBitStream( archive.basename + ARCHIVE_EXTENSION );
    }
    else {
        this.fmbais = new FastMultiByteArrayInputStream( archive.fmbais );
        this.data = new InputBitStream( this.fmbais );
    }
}
/** Checks that this archive has not been closed.
 *
 * @throws IllegalStateException if {@link #data} is <code>null</code>, that is, the archive has been closed.
 */
protected void ensureOpen() throws IllegalStateException {
    if ( data != null ) return;
    throw new IllegalStateException( "The archive has been closed" );
}
/** Closes this archive, releasing the underlying bit stream.
 *
 * <p>Closing an already closed archive has no effect. The archive is marked as closed
 * even if closing the underlying stream fails.
 *
 * @throws IOException if the underlying bit stream cannot be closed.
 */
public void close() throws IOException {
    final InputBitStream stream = data;
    if ( stream == null ) return;
    // Nullify first, so that a failing close() still leaves the archive in the closed state.
    data = null;
    stream.close();
}
/** Returns the number of words in the documents summarized by this archive.
 *
 * @return the number of words, as read from the archive properties.
 */
public long numberOfWords() {
    return this.numberOfWords;
}
/** Reads a single integer from a bit stream using the given coding.
 *
 * @param stream the bit stream to read from.
 * @param coding the coding of the integer.
 * @return the integer read.
 * @throws UnsupportedOperationException if <code>coding</code> is not handled.
 */
private final int readInt( final InputBitStream stream, final Coding coding ) throws IOException {
    final int value;
    switch ( coding ) {
        case GAMMA:
            value = stream.readGamma();
            break;
        case SHIFTED_GAMMA:
            value = stream.readShiftedGamma();
            break;
        case DELTA:
            value = stream.readDelta();
            break;
        case ZETA_2:
            value = stream.readZeta( 2 );
            break;
        case ZETA_3:
            value = stream.readZeta( 3 );
            break;
        case ZETA_4:
            value = stream.readZeta( 4 );
            break;
        case ZETA_5:
            value = stream.readZeta( 5 );
            break;
        case NIBBLE:
            value = stream.readNibble();
            break;
        case UNARY:
            value = stream.readUnary();
            break;
        default:
            throw new UnsupportedOperationException( "Coding " + coding + " is not known." );
    }
    return value;
}
/** Reads a sequence of integers from a bit stream using the given coding.
 *
 * <p>Codings for which the stream offers a bulk method use it; unary and nibble
 * codes are read one value at a time.
 *
 * @param stream the bit stream to read from.
 * @param coding the coding of the integers.
 * @param where the array that will be filled, starting from index 0.
 * @param n the number of integers to read.
 * @throws UnsupportedOperationException if <code>coding</code> is not handled.
 */
private final void readInts( final InputBitStream stream, final Coding coding, int[] where, int n ) throws IOException {
    switch ( coding ) {
        case GAMMA:
            stream.readGammas( where, n );
            return;
        case SHIFTED_GAMMA:
            stream.readShiftedGammas( where, n );
            return;
        case DELTA:
            stream.readDeltas( where, n );
            return;
        case ZETA_2:
            stream.readZetas( 2, where, n );
            return;
        case ZETA_3:
            stream.readZetas( 3, where, n );
            return;
        case ZETA_4:
            stream.readZetas( 4, where, n );
            return;
        case ZETA_5:
            stream.readZetas( 5, where, n );
            return;
        case UNARY: {
            int k = 0;
            while ( k < n ) {
                where[ k ] = stream.readUnary();
                k++;
            }
            return;
        }
        case NIBBLE: {
            int k = 0;
            while ( k < n ) {
                where[ k ] = stream.readNibble();
                k++;
            }
            return;
        }
        default:
            throw new UnsupportedOperationException( "Coding " + coding + " is not known." );
    }
}
/**
 * Reads the document record beginning at the current position of {@link #data} and builds an
 * {@link ArrayDocumentSummary} object representing it.
 *
 * <p>The record is read in this order: the document length (only if the archive version is
 * greater than one and a length coding is set; otherwise -1 is used), the summary size,
 * the term gaps and, if a count coding is set, the counts.
 *
 * @param id the identifier to be given to the summary; it is also used as an index into the URI list, if any.
 * @return the summary of the current document.
 */
protected ArrayDocumentSummary readCurrentDocument( final int id ) throws IOException {
    final boolean hasCounts = countCoding != null;

    // Raw data, in the order in which it appears in the stream.
    final int documentLength;
    if ( version > 1 && lengthCoding != null ) documentLength = readInt( data, lengthCoding );
    else documentLength = -1;

    final int summarySize = readInt( data, sizeCoding );
    final int[] terms = new int[ summarySize ];
    final int[] count = hasCounts ? new int[ summarySize ] : null;
    readInts( data, termCoding, terms, summarySize );
    if ( hasCounts ) readInts( data, countCoding, count, summarySize );

    // Terms are stored as gaps between ranks: rebuild each rank, then map it to its term.
    int previousRank = -1;
    for ( int k = 0; k < summarySize; k++ ) {
        final int rank = terms[ k ] + previousRank + 1;
        previousRank = rank;
        terms[ k ] = rank2Term[ rank ];
    }

    if ( hasCounts ) {
        if ( version > 1 ) {
            // Counts are stored as signed differences from the previous count; from version 3 on
            // the first value is instead incremented by one.
            int running = 0;
            for ( int k = 0; k < summarySize; k++ ) {
                if ( version > 2 && k == 0 ) count[ k ]++;
                else count[ k ] = Fast.nat2int( count[ k ] ) + running;
                running = count[ k ];
            }
            // The counts were stored in reverse order: put them back in term order.
            for ( int lo = 0, hi = summarySize - 1; lo < hi; lo++, hi-- ) {
                final int swapped = count[ lo ];
                count[ lo ] = count[ hi ];
                count[ hi ] = swapped;
            }
        }
        else {
            // Oldest format: each value is just incremented by one.
            for ( int k = 0; k < summarySize; k++ ) count[ k ]++;
        }
    }

    final URI uri = uriList == null ? null : URI.create( uriList.get( id ).toString() );
    return new ArrayDocumentSummary( terms, count, id, uri, documentLength, false );
}
/**
 * Returns an unmodifiable view of the codings used by this archive.
 *
 * <p>No copy is made: the returned map wraps the map held by this archive.
 *
 * @return an unmodifiable view of the codings used by this archive.
 */
public Map<Component, Coding> getCodings() {
    return Collections.unmodifiableMap( codings );
}
/**
 * Returns the rank-to-term permutation.
 *
 * <p>The array held by this archive is returned as is, without copying.
 *
 * @return the rank-to-term permutation.
 */
public int[] getPermutation() {
    return this.rank2Term;
}
/** Returns an iterator over the summaries of this archive, in the order in which they are stored.
 *
 * <p>The data stream is repositioned at its start. The returned iterator reads from the
 * stream of this archive, so it fails with an {@link IllegalStateException} once the archive is closed.
 * An {@link IOException} raised while reading is rethrown wrapped in a {@link RuntimeException}.
 *
 * @return an iterator returning {@link ArrayDocumentSummary} instances.
 */
// NOTE(review): raw types are kept because the element type declared by Archive is not visible here.
public Iterator iterator() {
    ensureOpen();
    try {
        data.position( 0 );
    }
    catch ( IOException e ) {
        throw new RuntimeException( e );
    }

    return new AbstractObjectIterator() {
        /** The number of summaries returned so far. */
        private int returned;

        public boolean hasNext() {
            return returned < numberOfDocuments;
        }

        public ArrayDocumentSummary next() {
            ensureOpen();
            if ( !hasNext() ) throw new NoSuchElementException();
            returned++;
            try {
                // Each record starts with the delta-coded identifier of the document.
                final int documentId = data.readDelta();
                return readCurrentDocument( documentId );
            }
            catch ( IOException e ) {
                throw new RuntimeException( e );
            }
        }
    };
}
/** Returns the number of document summaries in this archive.
 *
 * @return the number of documents, as read from the archive properties.
 */
public int numberOfDocuments() {
    return this.numberOfDocuments;
}
/** Returns the number of terms in this archive.
 *
 * @return the number of terms, as read from the archive properties.
 */
public int numberOfTerms() {
    return this.numberOfTerms;
}
/** Returns the frequency of a term.
 *
 * @param term the index of a term.
 * @return the frequency recorded for <code>term</code>.
 * @throws UnsupportedOperationException if this archive has no frequency data.
 */
public int frequency( int term ) {
    final int[] table = this.frequency;
    if ( table != null ) return table[ term ];
    throw new UnsupportedOperationException();
}
/** Returns whether this archive supports random access.
 *
 * @return always <code>false</code>: this implementation is sequential only.
 */
public boolean hasRandomAccess() {
    final boolean randomAccess = false;
    return randomAccess;
}
/** Loads γ-coded frequencies, if they exist.
 *
 * @param basename the archive basename; the frequencies are looked for in the file
 * named <code>basename</code> + {@link Preprocess#FREQUENCIES_EXTENSION}.
 * @param numTerms the number of frequencies to read.
 * @return an array of <code>numTerms</code> frequencies, or <code>null</code> if the file does not exist.
 * @throws IOException if the file exists but cannot be read.
 */
protected static int[] loadFrequencies( CharSequence basename, int numTerms ) throws IOException {
    final String filename = basename + Preprocess.FREQUENCIES_EXTENSION;
    if ( !new File( filename ).exists() ) return null;

    final int[] frequency = new int[ numTerms ];
    final InputBitStream frequencyFile = new InputBitStream( filename );
    try {
        frequencyFile.readGammas( frequency, frequency.length );
    }
    finally {
        // Release the file even when reading fails (e.g., on a truncated file).
        frequencyFile.close();
    }
    return frequency;
}
/** Returns a {@link SequentialBitstreamArchive} obtained by loading with given basename and optional URI list.
*
* @param basename the archive basename.
* @param properties the archive properties.
* @param uriFilename the filename of a URI list, or null
; the file must contained either
* a {@link StringMap} supporting {@link StringMap#list()}, or a {@link List} of {@link CharSequence}s.
* @return the {@link SequentialBitstreamArchive} with given basename and URI list.
*/
@SuppressWarnings("unchecked")
public static SequentialBitstreamArchive getInstance( CharSequence basename, Properties properties, CharSequence uriFilename ) throws IOException, ClassNotFoundException {
final Object uriData = uriFilename == null ? null : BinIO.loadObject( uriFilename );
return new SequentialBitstreamArchive( basename, BinIO.loadInts( basename + PERM_EXTENSION ), properties,
uriData == null ? null : uriData instanceof StringMap ? ((StringMap extends CharSequence>)uriData).list() : (List extends CharSequence>)uriData, loadFrequencies( basename, properties.getInt( ArchiveLoader.PropertyKeys.TERMS ) ) );
}
/** Unsupported operation: this archive provides sequential access only.
 *
 * @param id a document identifier (ignored).
 * @return never returns normally.
 * @throws UnsupportedOperationException always.
 */
public ArrayDocumentSummary getDocumentById( final int id ) throws IOException {
    final String message = "This bitstream archive does not support random access.";
    throw new UnsupportedOperationException( message );
}
/** Unsupported operation: this archive provides sequential access only.
 *
 * @param index a document index (ignored).
 * @return never returns normally.
 * @throws UnsupportedOperationException always.
 */
public ArrayDocumentSummary getDocumentByIndex( final int index ) throws IOException {
    final String message = "This bitstream archive does not support random access.";
    throw new UnsupportedOperationException( message );
}
/** Returns a duplicate of this archive, built with the copy constructor.
 *
 * @return a new archive duplicating this one.
 * @throws RuntimeException wrapping the {@link IOException} raised if the duplicate cannot be created.
 */
public SequentialBitstreamArchive copy() {
    final SequentialBitstreamArchive duplicate;
    try {
        duplicate = new SequentialBitstreamArchive( this );
    }
    catch ( IOException e ) {
        throw new RuntimeException( e );
    }
    return duplicate;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy