![JAR search and dependency download from the Maven repository](/logo.png)
src.it.unimi.di.archive4j.scratch.GapLengths Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of archive4j Show documentation
Show all versions of archive4j Show documentation
Archive4J is a suite of tools for compactly storing term/count information about a document collection.
package it.unimi.di.archive4j.scratch;
import it.unimi.di.archive4j.SequentialBitstreamArchive.CompressionFlags;
import it.unimi.di.archive4j.SequentialBitstreamArchive.CompressionFlags.Coding;
import it.unimi.di.archive4j.SequentialBitstreamArchive.CompressionFlags.Component;
import it.unimi.di.archive4j.SequentialBitstreamArchive.PropertyKeys;
import it.unimi.dsi.io.InputBitStream;
import it.unimi.dsi.logging.ProgressLogger;
import it.unimi.dsi.util.Properties;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Map;
/**
 * Scratch command-line tool that scans a sequential bitstream archive and writes a
 * histogram of the integers stored in the {@link Component#TERMS} component.
 *
 * <p>Given a basename, the tool reads <code>basename.properties</code> to discover the
 * codings in use and the <code>terms</code> and <code>documents</code> properties, then
 * reads <code>basename.archive</code> document by document. For every integer <var>v</var>
 * read from the term component, slot <var>v</var> of a counter array is incremented.
 * Finally the counters from index 0 up to the largest index with a nonzero count are
 * written, one per line, to the output file.
 *
 * <p>NOTE(review): the class and variable names suggest that the term component stores
 * gaps between successive term indices; this is not visible from this file and should be
 * confirmed against the archive writer.
 */
public class GapLengths {

    /**
     * Reads a single integer from a bit stream using the given coding.
     *
     * @param stream the bit stream to read from.
     * @param coding the coding selecting which read method of the stream is invoked.
     * @return the integer read.
     * @throws IOException if the underlying read fails.
     * @throws UnsupportedOperationException if <code>coding</code> is not one of the codings handled here.
     */
    protected final static int readInt( final InputBitStream stream, final Coding coding ) throws IOException {
        switch ( coding ) {
        case UNARY:
            return stream.readUnary();
        case GAMMA:
            return stream.readGamma();
        case DELTA:
            return stream.readDelta();
        case SHIFTED_GAMMA:
            return stream.readShiftedGamma();
        case ZETA_2:
            return stream.readZeta( 2 );
        case ZETA_3:
            return stream.readZeta( 3 );
        case ZETA_4:
            return stream.readZeta( 4 );
        case ZETA_5:
            return stream.readZeta( 5 );
        case NIBBLE:
            return stream.readNibble();
        default:
            throw new UnsupportedOperationException( "Coding " + coding + " is not known. " );
        }
    }

    /**
     * Reads <code>n</code> integers from a bit stream using the given coding.
     *
     * <p>Unary and nibble codings are read one value at a time into
     * <code>where[ 0 ]</code>&hellip;<code>where[ n - 1 ]</code>; all other codings
     * delegate to the corresponding bulk-read method of the stream.
     *
     * @param stream the bit stream to read from.
     * @param coding the coding selecting which read method of the stream is invoked.
     * @param where the array receiving the integers.
     * @param n the number of integers to read.
     * @throws IOException if the underlying read fails.
     * @throws UnsupportedOperationException if <code>coding</code> is not one of the codings handled here.
     */
    protected final static void readInts( final InputBitStream stream, final Coding coding, int[] where, int n ) throws IOException {
        switch ( coding ) {
        case UNARY:
            for ( int i = 0; i < n; i++ )
                where[ i ] = stream.readUnary();
            break;
        case GAMMA:
            stream.readGammas( where, n );
            break;
        case DELTA:
            stream.readDeltas( where, n );
            break;
        case SHIFTED_GAMMA:
            stream.readShiftedGammas( where, n );
            break;
        case NIBBLE:
            for ( int i = 0; i < n; i++ )
                where[ i ] = stream.readNibble();
            break;
        case ZETA_2:
            stream.readZetas( 2, where, n );
            break;
        case ZETA_3:
            stream.readZetas( 3, where, n );
            break;
        case ZETA_4:
            stream.readZetas( 4, where, n );
            break;
        case ZETA_5:
            stream.readZetas( 5, where, n );
            break;
        default:
            throw new UnsupportedOperationException( "Coding " + coding + " is not known. " );
        }
    }

    /**
     * Scans an archive and writes the histogram of the values found in its term component.
     *
     * @param args <code>args[ 0 ]</code> is the archive basename (the files
     * <code>basename.properties</code> and <code>basename.archive</code> are read);
     * <code>args[ 1 ]</code> is the name of the output file.
     * @throws IllegalArgumentException if fewer than two arguments are given.
     * @throws IOException if reading the archive or writing the output fails.
     * @throws Exception if loading the properties fails.
     */
    public static void main( String[] args ) throws Exception {
        // Fail before the (potentially long) scan rather than after it.
        if ( args.length < 2 ) throw new IllegalArgumentException( "Usage: " + GapLengths.class.getName() + " <basename> <output file>" );
        final String basename = args[ 0 ];
        final String outputName = args[ 1 ];

        final Properties p = new Properties( basename + ".properties" );
        final Map<Component, Coding> codings = CompressionFlags.valueOf( p.getStringArray( PropertyKeys.CODING ), CompressionFlags.DEFAULT );
        final Coding lengthCoding = codings.get( Component.DOCLENGTHS );
        final Coding termlengthCoding = codings.get( Component.SIZES );
        final Coding termCoding = codings.get( Component.TERMS );
        final Coding countCoding = codings.get( Component.COUNTS );

        // All three arrays are sized by the "terms" property; values read from the term
        // component are used as indices into gaps, so they are assumed to be smaller than it.
        final int totTerms = p.getInt( "terms" );
        final int[] gaps = new int[ totTerms ];
        final int[] terms = new int[ totTerms ];
        final int[] count = new int[ totTerms ];

        final int ndocs = p.getInt( "documents" );
        final ProgressLogger log = new ProgressLogger();
        log.expectedUpdates = ndocs;
        log.itemsName = "documents";
        log.displayFreeMemory = false;
        log.start();

        final InputBitStream ibs = new InputBitStream( basename + ".archive" );
        try {
            for ( int i = 0; i < ndocs; i++, log.lightUpdate() ) {
                ibs.readDelta(); // ID
                readInt( ibs, lengthCoding ); // Document length: read only to advance the stream.
                final int nTerms = readInt( ibs, termlengthCoding );
                readInts( ibs, termCoding, terms, nTerms );
                readInts( ibs, countCoding, count, nTerms ); // Counts: read only to advance the stream.
                for ( int j = 0; j < nTerms; j++ ) {
                    gaps[ terms[ j ] ]++;
                }
            }
        }
        finally {
            ibs.close();
        }
        log.stop();

        // Index of the last nonzero counter (0 if there is none).
        int maxgap = 0;
        for ( int i = 0; i < gaps.length; i++ )
            if ( gaps[ i ] > 0 ) maxgap = i;
        // One past the last nonzero counter, clamped so that an empty array prints nothing
        // instead of failing on gaps[ 0 ].
        final int limit = Math.min( maxgap + 1, gaps.length );
        // Arrays.sort( gaps, 0, maxgap );

        final PrintWriter pw = new PrintWriter( outputName );
        try {
            for ( int i = 0; i < limit; i++ )
                pw.println( gaps[ i ] );
            // PrintWriter never throws on write errors: surface them explicitly.
            if ( pw.checkError() ) throw new IOException( "An error occurred while writing " + outputName );
        }
        finally {
            pw.close();
        }
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy