it.unimi.di.archive4j.scratch.EstimatePositions (archive4j)
Archive4J is a suite of tools to compactly store term/count information of a document collection.
package it.unimi.di.archive4j.scratch;
/*
* Copyright (C) 2006-2013 Paolo Boldi, Massimo Santini and Sebastiano Vigna
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
*/
import it.unimi.di.big.mg4j.document.AbstractDocumentSequence;
import it.unimi.di.big.mg4j.document.Document;
import it.unimi.di.big.mg4j.document.DocumentIterator;
import it.unimi.di.big.mg4j.document.DocumentSequence;
import it.unimi.di.big.mg4j.index.TermProcessor;
import it.unimi.di.archive4j.SequentialBitstreamArchive.CompressionFlags.Coding;
import it.unimi.di.archive4j.tool.Preprocess;
import it.unimi.dsi.fastutil.Hash;
import it.unimi.dsi.fastutil.ints.Int2IntLinkedOpenHashMap;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.objects.ObjectOpenHashSet;
import it.unimi.dsi.io.FileLinesCollection;
import it.unimi.dsi.io.NullOutputStream;
import it.unimi.dsi.io.OutputBitStream;
import it.unimi.dsi.io.WordReader;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.lang.ObjectParser;
import it.unimi.dsi.logging.ProgressLogger;
import it.unimi.dsi.util.StringMap;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.Reader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import cern.colt.Sorting;
import cern.colt.function.IntComparator;
import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.Switch;
import com.martiansoftware.jsap.UnflaggedOption;
/**
* Estimates the cost, in bits, of coding term positions for a document collection, attributing the cost to individual terms.
*
* @author Alessio Orlandi
*/
public class EstimatePositions {
protected static final Logger LOGGER = LoggerFactory.getLogger( EstimatePositions.class );
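/** A writer that simulates coding positions onto a bit stream backed by a NullOutputStream,
* charging the bits produced for each occurrence to the corresponding term and counting
* occurrences of unknown ('missing') terms separately. */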
protected static class EstimatingWriter {
final NullOutputStream nos = NullOutputStream.getInstance();
final OutputBitStream obs = new OutputBitStream( nos, 0 );
/** Term to global-count rank. */
final protected int term2Rank[];
/** Terms in the current document. */
@SuppressWarnings("hiding")
private IntArrayList terms = new IntArrayList();
/** Count of each term in the current document. */
protected Int2IntLinkedOpenHashMap termsMap = new Int2IntLinkedOpenHashMap();
private boolean twopass, local;
/** Counters for each term (bits charged to dictionary terms, occurrences for the 'missing' term), and the index of the 'missing' term counter. */
public int counters[], missingTerm;
public long written = 0, words;
protected Coding coding;
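/** Creates a new estimating writer.
*
* @param counts the global count of each term, indexed by term number.
* @param twopass if true, each document is buffered and positions are coded by in-document rank (two passes).
* @param local if true, the in-document rank is based on per-document counts rather than global counts (implies two passes).
* @param coding the coding used in two-pass mode (one-pass mode always uses gamma coding).
*/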
public EstimatingWriter( final long[] counts, boolean twopass, boolean local, Coding coding ) {
this.twopass = twopass || local;
this.local = local;
this.coding = coding;
final int rank2Term[] = new int[ counts.length ];
for ( int i = counts.length; i-- != 0; )
rank2Term[ i ] = i;
Sorting.quickSort( rank2Term, 0, rank2Term.length, new IntComparator() {
// Sort in descending order using counts (comparison avoids overflow of the long difference).
public int compare( final int a, final int b ) {
return counts[ b ] < counts[ a ] ? -1 : counts[ a ] < counts[ b ] ? 1 : 0;
}
} );
term2Rank = new int[ counts.length ];
for ( int i = rank2Term.length; i-- != 0; )
term2Rank[ rank2Term[ i ] ] = i;
counters = new int[ counts.length + 1 ];
missingTerm = counts.length;
}
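/** Returns the number of passes over each document required by this writer: two in two-pass (or local) mode, one otherwise. */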
public int numberOfPasses() {
return ( twopass ? 2 : 1 );
}
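/** Appends a term occurrence of the current document: in one-pass mode the global rank of the term is
* gamma-coded immediately and its cost is charged to the term; in two-pass mode the term is buffered
* until the next call to nextDocument(). Negative terms are counted as occurrences of the 'missing' term. */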
public void append( final int term ) throws IOException {
/* Two pass: we first need T/F information, so we collect the term. */
if ( twopass ) {
terms.add( term );
}
else // one pass: just count gamma-coding the current term.
{
if ( term < 0 ) counters[ missingTerm ]++;
else {
obs.writeGamma( term2Rank[ term ] );
final long writtenNow = obs.writtenBits();
counters[ term2Rank[ term ] ] += writtenNow - written;
written = writtenNow;
words++;
}
}
}
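/** Completes the current document in two-pass mode (a no-op otherwise): computes in-document frequencies,
* sorts the distinct terms locally or globally, codes the position of each term in the sorted list with the
* chosen coding and charges the resulting bits, once per occurrence, to the term. */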
public void nextDocument() throws IOException {
if ( !twopass ) return;
termsMap.clear();
// Build T/F, unsorted.
for ( int t : terms )
termsMap.put( t, termsMap.get( t ) + 1 );
final int[] tpTerms = termsMap.keySet().toIntArray();
/* Sort tpTerms according to the chosen criterion (local or global frequency rank). */
if ( local ) twoPassLocal( tpTerms );
else twoPassGlobal( tpTerms );
if ( LOGGER.isDebugEnabled() ) LOGGER.debug( "Writing " + tpTerms.length + " terms covering " + terms.size() + " positions" );
for ( int i = 0; i < tpTerms.length; i++ ) {
final int freq = termsMap.get( tpTerms[ i ] );
if ( tpTerms[ i ] < 0 ) counters[ missingTerm ] += freq;
else {
// Each position is represented by the term rank in the sorted list.
obs.writtenBits( 0 );
switch ( coding ) {
case DELTA:
obs.writeDelta( i );
break;
case GAMMA:
obs.writeGamma( i );
break;
case NIBBLE:
obs.writeNibble( i );
break;
case UNARY: // FIXME This is a hack: unary is approximated by a minimal binary coding.
obs.writeMinimalBinary( i, tpTerms.length + 1 );
break;
case ZETA_2:
obs.writeZeta( i, 2 );
break;
case ZETA_3:
obs.writeZeta( i, 3 );
break;
case ZETA_4:
obs.writeZeta( i, 4 );
break;
case ZETA_5:
obs.writeZeta( i, 5 );
break;
case SHIFTED_GAMMA:
obs.writeShiftedGamma( i );
break;
}
final int quantity = (int)obs.writtenBits() * freq;
counters[ term2Rank[ tpTerms[ i ] ] ] += quantity;
written += quantity;
words += freq;
}
}
terms.clear();
}
protected void twoPassLocal( final int[] tpTerms ) {
/* Renumbers terms locally: sorts them by descending in-document count. */
Sorting.quickSort( tpTerms, 0, tpTerms.length, new IntComparator() {
public int compare( final int a, final int b ) {
return termsMap.get( b ) - termsMap.get( a );
}
} );
}
protected void twoPassGlobal( final int[] tpTerms ) {
/* Renumbers terms globally: sorts them by descending global count (missing terms first). */
Sorting.quickSort( tpTerms, 0, tpTerms.length, new IntComparator() {
public int compare( final int a, final int b ) {
return ( a < 0 ? -1 : term2Rank[ a ] ) - ( b < 0 ? -1 : term2Rank[ b ] );
}
} );
}
};
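/* Static fields shared between main(), buildTF() and run(). */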
protected static TermProcessor termProcessor;
protected static ObjectOpenHashSet removed;
protected static StringMap terms;
protected static StringMap urls;
protected static DocumentSequence sequence;
protected static int[] allCount;
protected static final MutableString wordString = new MutableString(), nonWordString = new MutableString();
@SuppressWarnings("unchecked")
public static void main( String[] args ) throws Exception {
SimpleJSAP jsap = new SimpleJSAP( EstimatePositions.class.getName(), "Estimates the cost, in bits, of coding term positions for a document collection.",
new Parameter[] {
new Switch( "twopass", '2', "two-pass", "Use indirect referral of terms into counting (2-pass parsing)" ),
new Switch( "local", 'l', "local", "Local count re-sorting." ),
new FlaggedOption( "coding", JSAP.STRING_PARSER, "UNARY", JSAP.NOT_REQUIRED, 'c', "coding",
"Specify code (unary,gamma,shifted_gamma,nibble,delta,zeta_K) for a given component in the form Component:Coding" ).setAllowMultipleDeclarations( true ),
new Switch( "sorted", 's', "sorted", "Tells if we are guaranteed that input document are sorted (guarantees random access)" ),
new FlaggedOption( "stringmap", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'm', "stringmap",
"Specify an alternative string map different from preprocessed one (if any)" ),
new Switch( "ascii", 'A', "ascii", "Make (debugging) output in ASCII format on stdout" ),
new FlaggedOption( "urlmap", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'u', "urlmap", "The filename for the graph URLs dictionary" ),
new FlaggedOption( "sequence", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'S', "sequence", "The filename for the serialized collection containing documents" ),
new FlaggedOption( "termprocessor", new ObjectParser( TermProcessor.class,
new String[] { "it.unimi.di.big.mg4j.index", "it.unimi.di.big.mg4j.snowball", "it.unimi.dsi.law.archive" }, new String[] { "getInstance", "" } ),
"it.unimi.di.big.mg4j.index.NullTermProcessor", JSAP.NOT_REQUIRED, 't', "termprocessor", "The term processor to apply" ),
new FlaggedOption( "indexedField", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'I', "indexed-field", "The field of the document factory that will be indexed" ),
new Switch( "generate", 'g', "generate", "Generates preprocessing data automatically." ),
new FlaggedOption( "preprocessed", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'p', "preprocessed", "Preprocessed data basename" ),
new FlaggedOption( "numberOfDocuments", JSAP.INTEGER_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'n', "Number of documents in the archive" ),
new UnflaggedOption( "output", JSAP.STRING_PARSER, JSAP.REQUIRED, "The filename for the output data file" )
} );
JSAPResult jsapResult = jsap.parse( args );
if ( jsap.messagePrinted() ) return;
if ( jsapResult.userSpecified( "generate" ) && jsapResult.userSpecified( "preprocessed" ) ) {
LOGGER.error( "Options 'generate' and 'input' are complementary." );
return;
}
else if ( !jsapResult.userSpecified( "generate" ) && !jsapResult.userSpecified( "preprocessed" ) ) {
LOGGER.error( "Either one option between 'generate' and 'input' must be specified." );
return;
}
sequence = AbstractDocumentSequence.load( jsapResult.getString( "sequence" ) );
urls = ( !jsapResult.userSpecified( "urlmap" ) ? null : (StringMap)BinIO.loadObject( jsapResult.getString( "urlmap" ) ) );
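// FIXME: no "removelist" option is declared among the JSAP parameters above, and the resulting set is not used elsewhere.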
if ( jsapResult.userSpecified( "removelist" ) ) {
final FileLinesCollection lines = new FileLinesCollection( jsapResult.getString( "removelist" ), "UTF-8" );
removed = new ObjectOpenHashSet( lines.size(), Hash.VERY_FAST_LOAD_FACTOR );
for ( MutableString s : lines )
removed.add( s.copy() );
}
termProcessor = (TermProcessor)jsapResult.getObject( "termprocessor" );
String preprocessBasename;
if ( jsapResult.userSpecified( "generate" ) ) {
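// FIXME: Preprocess.run() is invoked with the plain output basename, while the preprocessed files are loaded below from the "-gpp" basename.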
preprocessBasename = jsapResult.getString( "output" ) + "-gpp";
Preprocess.run( jsapResult.getString( "output" ), sequence, termProcessor, jsapResult.getString( "indexedField" ) );
}
else preprocessBasename = jsapResult.getString( "preprocessed" );
terms = (StringMap)BinIO.loadObject( ( jsapResult.userSpecified( "stringmap" ) ? jsapResult.getString( "stringmap" ) : preprocessBasename + Preprocess.TERMS_EXTENSION ) );
allCount = new int[ terms.size() ];
final EstimatingWriter writer = new EstimatingWriter( BinIO.loadLongs( preprocessBasename + Preprocess.COUNTS_EXTENSION ), jsapResult.getBoolean( "twopass" ),
jsapResult.getBoolean( "local" ), Coding.valueOf( jsapResult.getString( "coding" ) ) );
final ProgressLogger pl = new ProgressLogger( LOGGER );
pl.itemsName = "documents";
pl.logInterval = ProgressLogger.TEN_SECONDS;
pl.displayFreeMemory = true;
pl.start( "Parsing documents..." );
run( pl, writer, sequence.factory().fieldIndex( jsapResult.getString( "indexedField" ) ) );
pl.done();
sequence.close();
PrintWriter pw = new PrintWriter( jsapResult.getString( "output" ) );
pw.println( "Total number of written bytes: " + writer.written / 8 );
pw.println( "Instances for the 'missing' term (over " + writer.words + " words): " + writer.counters[ writer.missingTerm ] );
pw.println( "Counters for single term bits (by frequency-rank descending order) " );
pw.println( "----" );
for ( int i = 0; i < writer.counters.length - 1; i++ )
pw.println( i + " " + writer.counters[ i ] );
pw.close();
}
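/** Parses a single document through the given word reader, applying the term processor to each word and
* appending the corresponding term number (a negative value for words rejected by the processor or not
* found in the term map) to the writer. */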
protected final static void buildTF( WordReader reader, EstimatingWriter writer ) throws IOException {
// Copying fields speeds up the computation;
final TermProcessor processor = termProcessor;
final StringMap map = terms;
final MutableString word = wordString, nonWord = nonWordString;
while ( reader.next( word, nonWord ) ) {
if ( word.length() == 0 ) continue;
if ( !processor.processTerm( word ) ) writer.append( -1 );
else writer.append( (int)map.getLong( word ) );
}
}
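/** Scans the document sequence, resolving each document identifier (through the URL map, if present),
* and feeds the words of the given factory field to the writer, closing each document with a call to
* the writer's nextDocument() method. */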
public static void run( ProgressLogger pl, EstimatingWriter writer, int field ) throws Exception {
final DocumentIterator dIterator = sequence.iterator();
Document d;
int urlId = 0;
if ( field == -1 ) throw new IllegalArgumentException();
while ( ( d = dIterator.nextDocument() ) != null ) {
int docId = (int)( urls != null ? urls.getLong( d.uri() ) : urlId++ );
if ( docId < 0 ) {
LOGGER.info( "Skipping document with unknown id for uri : " + d.uri() );
d.close();
continue;
}
else LOGGER.debug( "Indexing field " + field + " from " + d.uri() );
final WordReader wReader = d.wordReader( field );
wReader.setReader( (Reader)d.content( field ) );
buildTF( wReader, writer );
writer.nextDocument();
if ( pl != null ) pl.lightUpdate();
d.close();
}
}
}
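For reference, a minimal invocation sketch (not part of the original documentation): collection.sequence, the field name text, the preprocessed basename collection-pp (produced beforehand by it.unimi.di.archive4j.tool.Preprocess) and positions.txt are placeholders, and the archive4j, MG4J, DSI, fastutil, JSAP and Colt jars are assumed to be on the classpath.

java it.unimi.di.archive4j.scratch.EstimatePositions -2 -c GAMMA -I text -S collection.sequence -p collection-pp positions.txt

This performs the two-pass estimate with GAMMA coding on the serialized document sequence collection.sequence and writes the per-term bit counters to positions.txt.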