All downloads are free. The search and download functionality uses the official Maven repository.

src.it.unimi.di.archive4j.scratch.GapLengths Maven / Gradle / Ivy

Go to download

Archive4J is a suite of tools for compactly storing the term/count information of a document collection.

There is a newer version: 1.3.3
Show newest version
package it.unimi.di.archive4j.scratch;

import it.unimi.di.archive4j.SequentialBitstreamArchive.CompressionFlags;
import it.unimi.di.archive4j.SequentialBitstreamArchive.CompressionFlags.Coding;
import it.unimi.di.archive4j.SequentialBitstreamArchive.CompressionFlags.Component;
import it.unimi.di.archive4j.SequentialBitstreamArchive.PropertyKeys;
import it.unimi.dsi.io.InputBitStream;
import it.unimi.dsi.logging.ProgressLogger;
import it.unimi.dsi.util.Properties;

import java.io.IOException;
import java.io.PrintWriter;
import java.util.Map;


/**
 * Scratch tool that scans a sequential bitstream archive and writes, one number per line,
 * how many times each value of the {@link Component#TERMS} component was read across all
 * documents.
 *
 * <p>Usage: <code>GapLengths &lt;basename&gt; &lt;output&gt;</code>, where
 * <code>&lt;basename&gt;.properties</code> and <code>&lt;basename&gt;.archive</code> are the input
 * files and <code>&lt;output&gt;</code> is the text file to be written.
 */
public class GapLengths {


	/**
	 * Reads one integer from a bit stream using the given coding.
	 *
	 * @param stream the bit stream to read from.
	 * @param coding the coding that selects which <code>read*</code> method of the stream is called.
	 * @return the integer returned by the selected stream method.
	 * @throws IOException if the underlying stream method throws it.
	 * @throws UnsupportedOperationException if <code>coding</code> is not one of the codings handled here.
	 */
	protected final static int readInt( final InputBitStream stream, final Coding coding ) throws IOException {
		switch ( coding ) {
		case UNARY:
			return stream.readUnary();
		case GAMMA:
			return stream.readGamma();
		case DELTA:
			return stream.readDelta();
		case SHIFTED_GAMMA:
			return stream.readShiftedGamma();
		case ZETA_2:
			return stream.readZeta( 2 );
		case ZETA_3:
			return stream.readZeta( 3 );
		case ZETA_4:
			return stream.readZeta( 4 );
		case ZETA_5:
			return stream.readZeta( 5 );
		case NIBBLE:
			return stream.readNibble();
		default:
			throw new UnsupportedOperationException( "Coding " + coding + " is not known. " );
		}
	}


	/**
	 * Reads <code>n</code> integers from a bit stream using the given coding and stores them in
	 * <code>where[ 0 ]</code>&hellip;<code>where[ n - 1 ]</code>.
	 *
	 * <p>Unary and nibble codings are read one value at a time; the other codings delegate to the
	 * corresponding bulk method of the stream.
	 *
	 * @param stream the bit stream to read from.
	 * @param coding the coding that selects which <code>read*</code> method of the stream is called.
	 * @param where the array that receives the values.
	 * @param n the number of values to read.
	 * @throws IOException if the underlying stream method throws it.
	 * @throws UnsupportedOperationException if <code>coding</code> is not one of the codings handled here.
	 */
	protected final static void readInts( final InputBitStream stream, final Coding coding, int[] where, int n ) throws IOException {
		switch ( coding ) {
		case UNARY:
			for ( int i = 0; i < n; i++ )
				where[ i ] = stream.readUnary();
			break;
		case GAMMA:
			stream.readGammas( where, n );
			break;
		case DELTA:
			stream.readDeltas( where, n );
			break;
		case SHIFTED_GAMMA:
			stream.readShiftedGammas( where, n );
			break;
		case NIBBLE:
			for ( int i = 0; i < n; i++ )
				where[ i ] = stream.readNibble();
			break;
		case ZETA_2:
			stream.readZetas( 2, where, n );
			break;
		case ZETA_3:
			stream.readZetas( 3, where, n );
			break;
		case ZETA_4:
			stream.readZetas( 4, where, n );
			break;
		case ZETA_5:
			stream.readZetas( 5, where, n );
			break;
		default:
			throw new UnsupportedOperationException( "Coding " + coding + " is not known. " );
		}
	}


	/**
	 * Scans every document record of the archive and writes the per-value occurrence counts of the
	 * TERMS component to the output file, in index order, up to and including the last index with a
	 * nonzero count.
	 *
	 * <p>For each document the following is read, in this order: a delta-coded value (the document
	 * ID, which is skipped), one value in the DOCLENGTHS coding (skipped), the number of terms in the
	 * SIZES coding, and then that many values in the TERMS coding followed by that many values in the
	 * COUNTS coding.
	 *
	 * @param args <code>args[ 0 ]</code> is the archive basename, <code>args[ 1 ]</code> the output file.
	 * @throws IllegalArgumentException if fewer than two arguments are given.
	 * @throws Exception if reading the properties or the archive, or writing the output, fails.
	 */
	public static void main( String[] args ) throws Exception {
		// Fail before scanning the archive rather than after, when args[ 1 ] is first needed.
		if ( args.length < 2 ) throw new IllegalArgumentException( "Usage: " + GapLengths.class.getName() + " <basename> <output>" );

		final Properties p = new Properties( args[ 0 ] + ".properties" );
		final Map<Component,Coding> codings = CompressionFlags.valueOf( p.getStringArray( PropertyKeys.CODING ), CompressionFlags.DEFAULT );
		final Coding lengthCoding = codings.get( Component.DOCLENGTHS );
		final Coding termlengthCoding = codings.get( Component.SIZES );
		final Coding termCoding = codings.get( Component.TERMS );
		final Coding countCoding = codings.get( Component.COUNTS );

		// NOTE(review): all three arrays are sized by the "terms" property; this assumes that no
		// document has more entries than that, and that no TERMS value reaches it. Confirm
		// against the archive format.
		final int totTerms = p.getInt( "terms" );
		final int[] gaps = new int[ totTerms ];
		final int[] terms = new int[ totTerms ];
		final int[] count = new int[ totTerms ];

		final int ndocs = p.getInt( "documents" );

		final ProgressLogger log = new ProgressLogger();
		log.expectedUpdates = ndocs;
		log.itemsName = "documents";
		log.displayFreeMemory = false;

		log.start();

		final InputBitStream ibs = new InputBitStream( args[ 0 ] + ".archive" );
		try {
			for ( int i = 0; i < ndocs; i++, log.lightUpdate() ) {
				ibs.readDelta(); // ID
				readInt( ibs, lengthCoding );

				final int nTerms = readInt( ibs, termlengthCoding );

				readInts( ibs, termCoding, terms, nTerms );
				// The counts are not used, but they must be consumed to reach the next record.
				readInts( ibs, countCoding, count, nTerms );

				for ( int j = 0; j < nTerms; j++ ) {
					gaps[ terms[ j ] ]++;
				}
			}
		}
		finally {
			ibs.close();
		}

		log.stop();

		// Index of the last nonzero counter (0 if there is none).
		int last = 0;
		for ( int i = 0; i < gaps.length; i++ )
			if ( gaps[ i ] > 0 ) last = i;
		// Number of lines to print; clamped so that an empty array prints nothing instead of overrunning.
		final int maxgap = Math.min( last + 1, gaps.length );

		final PrintWriter pw = new PrintWriter( args[ 1 ] );
		try {
			for ( int i = 0; i < maxgap; i++ )
				pw.println( gaps[ i ] );
			// PrintWriter swallows I/O errors; surface them explicitly.
			if ( pw.checkError() ) throw new IOException( "Error while writing " + args[ 1 ] );
		}
		finally {
			pw.close();
		}
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy