src.it.unimi.di.mg4j.document.SimpleCompressedDocumentCollectionBuilder Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of mg4j Show documentation
MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections written in Java.
package it.unimi.di.mg4j.document;
/*
* MG4J: Managing Gigabytes for Java
*
* Copyright (C) 2009-2012 Sebastiano Vigna
*
* This library is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 3 of the License, or (at your option)
* any later version.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*
*/
import it.unimi.dsi.Util;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import it.unimi.dsi.fastutil.objects.ObjectList;
import it.unimi.dsi.io.OutputBitStream;
import it.unimi.dsi.io.WordReader;
import it.unimi.dsi.lang.MutableString;
import it.unimi.di.mg4j.document.DocumentFactory.FieldType;
import it.unimi.di.mg4j.document.SimpleCompressedDocumentCollection.FrequencyCodec;
import it.unimi.di.mg4j.tool.Scan;
import it.unimi.di.mg4j.tool.Scan.VirtualDocumentFragment;
import java.io.DataOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.io.PrintStream;
import java.io.Reader;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.output.CountingOutputStream;
/** A builder for {@linkplain SimpleCompressedDocumentCollection simple compressed document collections}.
*
* @author Sebastiano Vigna
*/
public class SimpleCompressedDocumentCollectionBuilder implements DocumentCollectionBuilder {
/** The factory of the base document sequence. */
private final DocumentFactory factory;
/** Whether will are building an exact collection (i.e., whether it stores nonwords). */
private final boolean exact;
/** A frequency keeper used to compress document terms. */
private final FrequencyCodec termsFrequencyKeeper;
/** A frequency keeper used to compress document nonterms, or null
if {@link #exact} is false. */
private final FrequencyCodec nonTermsFrequencyKeeper;
/** The basename of the builder. */
private String basename;
/** The basename of current collection. */
private String basenameSuffix;
/** The output bit stream for documents. */
private OutputBitStream documentsOutputBitStream;
/** The output stream for terms. */
private CountingOutputStream termsOutputStream;
/** The output stream for nonterms, or null
if {@link #exact} is false. */
private CountingOutputStream nonTermsOutputStream;
/** The output bit stream for document offsets. */
private OutputBitStream documentOffsetsObs;
/** The output bit stream for term offsets. */
private OutputBitStream termOffsetsObs;
/** The output bit stream for nonterms offsets, or null
if {@link #exact} is false. */
private OutputBitStream nonTermOffsetsObs;
/** A temporary cache for the content of a field as a list of global term numbers. If the collection is exact, it alternates terms and nonterms. */
private IntArrayList fieldContent;
/** The map from term to global term numbers, in order of appearance. */
private Object2IntOpenHashMap terms;
/** The map from term to global nonterm numbers, in order of appearance, or null
if {@link #exact} is false. */
private Object2IntOpenHashMap nonTerms;
/** The number of documents indexed so far. */
private int documents;
/** The number of words indexed so far. */
private long words;
/** The number of fields indexed so far. */
private long fields;
/** The number of bits used to code words. */
private long bitsForWords;
/** The number of bits used to code nonwords. */
private long bitsForNonWords;
/** The number of bits used to code field lengths (the number of words/nonwords pairs). */
private long bitsForFieldLengths;
/** The number of bits used to code URIs. */
private long bitsForUris;
/** The number of bits used to code document titles. */
private long bitsForTitles;
/** Whether we are compressing non-text or virtual fields. */
private boolean hasNonText;
/** The zip output stream used to store non-text and virtual fields if {@link #hasNonText} is true, or null
otherwise. */
private ZipOutputStream nonTextZipOutputStream;
/** {@link #nonTextZipOutputStream} wrapped in a {@link DataOutputStream}. */
private DataOutputStream nonTextZipDataOutputStream;
public SimpleCompressedDocumentCollectionBuilder( final String basename, final DocumentFactory factory, final boolean exact ) {
this.basename = basename;
this.factory = factory;
this.exact = exact;
this.termsFrequencyKeeper = new SimpleCompressedDocumentCollection.FrequencyCodec();
this.nonTermsFrequencyKeeper = exact ? new SimpleCompressedDocumentCollection.FrequencyCodec() : null;
boolean hasNonText = false;
for( int i = factory.numberOfFields(); i-- != 0; ) hasNonText |= factory.fieldType( i ) != FieldType.TEXT;
this.hasNonText = hasNonText;
terms = new Object2IntOpenHashMap( Scan.INITIAL_TERM_MAP_SIZE );
terms.defaultReturnValue( -1 );
if ( exact ) {
nonTerms = new Object2IntOpenHashMap( Scan.INITIAL_TERM_MAP_SIZE );
nonTerms.defaultReturnValue( -1 );
}
else nonTerms = null;
}
public String basename() {
return basename;
}
public void open( final CharSequence suffix ) throws IOException {
basenameSuffix = basename + suffix;
documentsOutputBitStream = new OutputBitStream( basenameSuffix + SimpleCompressedDocumentCollection.DOCUMENTS_EXTENSION );
termsOutputStream = new CountingOutputStream( new FastBufferedOutputStream( new FileOutputStream( basenameSuffix + SimpleCompressedDocumentCollection.TERMS_EXTENSION ) ) );
nonTermsOutputStream = exact ? new CountingOutputStream( new FastBufferedOutputStream( new FileOutputStream( basenameSuffix + SimpleCompressedDocumentCollection.NONTERMS_EXTENSION ) ) ) : null;
documentOffsetsObs = new OutputBitStream( basenameSuffix + SimpleCompressedDocumentCollection.DOCUMENT_OFFSETS_EXTENSION );
termOffsetsObs = new OutputBitStream( basenameSuffix + SimpleCompressedDocumentCollection.TERM_OFFSETS_EXTENSION );
nonTermOffsetsObs = exact? new OutputBitStream( basenameSuffix + SimpleCompressedDocumentCollection.NONTERM_OFFSETS_EXTENSION ) : null;
fieldContent = new IntArrayList();
if ( hasNonText ) nonTextZipDataOutputStream = new DataOutputStream( nonTextZipOutputStream = new ZipOutputStream( new FastBufferedOutputStream( new FileOutputStream( basenameSuffix + ZipDocumentCollection.ZIP_EXTENSION ) ) ) );
terms.clear();
terms.trim( Scan.INITIAL_TERM_MAP_SIZE );
if ( exact ) {
nonTerms.clear();
nonTerms.trim( Scan.INITIAL_TERM_MAP_SIZE );
}
words = fields = bitsForWords = bitsForNonWords = bitsForFieldLengths = bitsForUris = bitsForTitles = documents = 0;
// First offset
documentOffsetsObs.writeDelta( 0 );
termOffsetsObs.writeDelta( 0 );
if ( exact ) nonTermOffsetsObs.writeDelta( 0 );
}
public void add( MutableString word, MutableString nonWord ) throws IOException {
int t = terms.getInt( word );
if ( t == -1 ) {
terms.put( word.copy(), t = terms.size() );
termsOutputStream.resetByteCount();
word.writeSelfDelimUTF8( termsOutputStream );
termOffsetsObs.writeLongDelta( termsOutputStream.getByteCount() );
}
fieldContent.add( t );
if ( exact ) {
t = nonTerms.getInt( nonWord );
if ( t == -1 ) {
nonTerms.put( nonWord.copy(), t = nonTerms.size() );
nonTermsOutputStream.resetByteCount();
nonWord.writeSelfDelimUTF8( nonTermsOutputStream );
nonTermOffsetsObs.writeLongDelta( nonTermsOutputStream.getByteCount() );
}
fieldContent.add( t );
}
}
public void close() throws IOException {
documentsOutputBitStream.close();
termsOutputStream.close();
IOUtils.closeQuietly( nonTermsOutputStream );
documentOffsetsObs.close();
termOffsetsObs.close();
if ( nonTermOffsetsObs != null ) nonTermOffsetsObs.close();
if ( hasNonText ) {
if ( documents == 0 ) nonTextZipOutputStream.putNextEntry( new ZipEntry( "dummy" ) );
nonTextZipDataOutputStream.close();
}
final SimpleCompressedDocumentCollection simpleCompressedDocumentCollection = new SimpleCompressedDocumentCollection( basenameSuffix, documents, terms.size(), nonTerms != null ? nonTerms.size() : -1, exact, factory );
BinIO.storeObject( simpleCompressedDocumentCollection, basenameSuffix + DocumentCollection.DEFAULT_EXTENSION );
simpleCompressedDocumentCollection.close();
final PrintStream stats = new PrintStream( new FileOutputStream ( basenameSuffix + SimpleCompressedDocumentCollection.STATS_EXTENSION ) );
final long overallBits = bitsForTitles + bitsForUris + bitsForFieldLengths + bitsForWords + bitsForNonWords;
stats.println( "Documents: " + Util.format( documents ) + " (" + Util.format( overallBits ) + ", " + Util.format( overallBits / (double)documents ) + " bits per document)" );
stats.println( "Terms: " + Util.format( terms.size() ) + " (" + Util.format( words ) + " words, " + Util.format( bitsForWords ) + " bits, " + Util.format( bitsForWords / (double)words ) + " bits per word)" );
if ( exact ) stats.println( "Nonterms: " + Util.format( nonTerms.size() ) + " (" + Util.format( words ) + " nonwords, " + Util.format( bitsForNonWords ) + " bits, " + Util.format( bitsForNonWords / (double)words ) + " bits per nonword)" );
stats.println( "Bits for field lengths: " + Util.format( bitsForFieldLengths ) + " (" + Util.format( bitsForFieldLengths / (double)fields ) + " bits per field)" );
stats.println( "Bits for URIs: " + Util.format( bitsForUris ) + " (" + Util.format( bitsForUris / (double)documents ) + " bits per URI)" );
stats.println( "Bits for titles: " + Util.format( bitsForTitles ) + " (" + Util.format( bitsForTitles / (double)documents ) + " bits per title)" );
stats.close();
}
public void endDocument() throws IOException {
documentOffsetsObs.writeLongDelta( documentsOutputBitStream.writtenBits() );
if ( hasNonText ) nonTextZipOutputStream.closeEntry();
}
public void endTextField() throws IOException {
final int size = fieldContent.size();
words += size / ( exact ? 2 : 1 );
bitsForFieldLengths += documentsOutputBitStream.writeDelta( size / ( exact ? 2 : 1 ) );
termsFrequencyKeeper.reset();
if ( exact ) {
nonTermsFrequencyKeeper.reset();
for( int i = 0; i < size; i += 2 ) {
bitsForWords += documentsOutputBitStream.writeDelta( termsFrequencyKeeper.encode( fieldContent.getInt( i ) ) );
bitsForNonWords += documentsOutputBitStream.writeDelta( nonTermsFrequencyKeeper.encode( fieldContent.getInt( i + 1 ) ) );
}
}
else for( int i = 0; i < size; i++ ) bitsForWords += documentsOutputBitStream.writeDelta( termsFrequencyKeeper.encode( fieldContent.getInt( i ) ) );
}
public void nonTextField( Object o ) throws IOException {
final ObjectOutputStream oos = new ObjectOutputStream( nonTextZipDataOutputStream );
oos.writeObject( o );
oos.flush();
}
public static int writeSelfDelimitedUtf8String( final OutputBitStream obs, final CharSequence s ) throws IOException {
final int len = s.length();
int bits = 0;
bits += obs.writeDelta( len );
for( int i = 0; i < len; i++ ) bits += obs.writeZeta( s.charAt( i ), 7 );
return bits;
}
public void startDocument( CharSequence title, CharSequence uri ) throws IOException {
documentsOutputBitStream.writtenBits( 0 );
bitsForUris += writeSelfDelimitedUtf8String( documentsOutputBitStream, uri == null ? "" : uri );
bitsForTitles += writeSelfDelimitedUtf8String( documentsOutputBitStream, title == null ? "" : title );
if ( hasNonText ) {
final ZipEntry currEntry = new ZipEntry( Integer.toString( documents ) );
nonTextZipOutputStream.putNextEntry( currEntry );
}
documents++;
}
public void startTextField() {
fieldContent.size( 0 );
fields++;
}
public void virtualField( final ObjectList fragments ) throws IOException {
nonTextZipDataOutputStream.writeInt( fragments.size() );
for ( VirtualDocumentFragment fragment: fragments ) {
fragment.documentSpecifier().writeSelfDelimUTF8( nonTextZipOutputStream );
fragment.text().writeSelfDelimUTF8( nonTextZipOutputStream );
}
}
@SuppressWarnings("unchecked")
public void build( final DocumentSequence inputSequence ) throws IOException {
final DocumentIterator docIt = inputSequence.iterator();
if ( factory != inputSequence.factory() ) throw new IllegalStateException( "The factory provided by the constructor does not correspond to the factory of the input sequence" );
final int numberOfFields = factory.numberOfFields();
WordReader wordReader;
MutableString word = new MutableString();
MutableString nonWord = new MutableString();
open( "" );
for (;;) {
Document document = docIt.nextDocument();
if ( document == null ) break;
startDocument( document.title(), document.uri() );
for ( int field = 0; field < numberOfFields; field++ ) {
Object content = document.content( field );
if ( factory.fieldType( field ) == FieldType.TEXT ) {
startTextField();
wordReader = document.wordReader( field );
wordReader.setReader( (Reader)content );
while ( wordReader.next( word, nonWord ) ) add( word, nonWord );
endTextField();
}
else if ( factory.fieldType( field ) == FieldType.VIRTUAL ) virtualField( (ObjectList)content );
else nonTextField( content );
}
document.close();
endDocument();
}
docIt.close();
close();
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy