/*
 * ![JAR search and dependency download from the Maven repository](/logo.png)
 * src.it.unimi.dsi.big.mg4j.index.IndexWriter Maven / Gradle / Ivy
 * Show all versions of mg4j-big Show documentation
 */
package it.unimi.dsi.big.mg4j.index;
/*
* MG4J: Managing Gigabytes for Java (big)
*
* Copyright (C) 2005-2011 Paolo Boldi and Sebastiano Vigna
*
* This library is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 3 of the License, or (at your option)
* any later version.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*
*/
import it.unimi.dsi.io.OutputBitStream;
import it.unimi.dsi.big.mg4j.index.payload.Payload;
import it.unimi.dsi.util.Properties;
import java.io.IOException;
import java.io.PrintStream;
/** An interface for classes that generate indices.
 *
 * <p>Implementations of this interface are used to write inverted lists in
 * sequential order, as follows:
 * <ul>
 * <li>to create a new inverted list, you must call {@link #newInvertedList()};
 * <li>then, you must specify the frequency using {@link #writeFrequency(long)};
 * <li>the document records follow; before writing a new document record, you must call {@link #newDocumentRecord()};
 * note that, all in all, the number of calls to {@link #newDocumentRecord()} must be equal to the frequency;
 * <li>for each document record, you must supply the information needed for the index you are building
 * ({@linkplain #writeDocumentPointer(OutputBitStream, long) pointer},
 * {@linkplain #writePayload(OutputBitStream, Payload) payload},
 * {@linkplain #writePositionCount(OutputBitStream, int) count}, and
 * {@linkplain #writeDocumentPositions(OutputBitStream, int[], int, int, int) positions}, in this order).
 * </ul>
 *
 * <p>{@link #newDocumentRecord()} returns an {@link OutputBitStream} that must be used to write the document-record data.
 * Note that there is no guarantee that the returned {@link OutputBitStream} coincides with the
 * underlying bit stream. Moreover, there is no guarantee as to when the bits will be actually
 * written on the underlying stream, except that when starting a new inverted list, the previous
 * inverted list, if any, will be written onto the underlying stream.
 *
 * @author Paolo Boldi
 * @author Sebastiano Vigna
 * @since 1.2
 */
public interface IndexWriter {
/** Starts a new inverted list. The previous inverted list, if any, is actually written
 * to the underlying bit stream.
 *
 * @return the position (in bits) of the underlying bit stream where the new inverted
 * list starts.
 * @throws IllegalStateException if too few records were written for the previous inverted
 * list.
 * @throws IOException if an I/O error occurs.
 */
long newInvertedList() throws IOException;
/** Writes the frequency.
 *
 * @param frequency the (positive) number of document records that this inverted list will contain.
 * @return the number of bits written.
 * @throws IOException if an I/O error occurs.
 */
int writeFrequency( final long frequency ) throws IOException;
/** Starts a new document record.
 *
 * <p>This method must be called exactly <var>f</var> times, where <var>f</var> is the frequency specified with
 * {@link #writeFrequency(long)}.
 *
 * @return the output bit stream where the next document record data should be written.
 * @throws IllegalStateException if too many records were written for the current inverted list,
 * or if there is no current inverted list.
 * @throws IOException if an I/O error occurs.
 */
OutputBitStream newDocumentRecord() throws IOException;
/** Writes a document pointer.
 *
 * <p>This method must be called immediately after {@link #newDocumentRecord()}.
 *
 * @param out the output bit stream where the pointer will be written.
 * @param pointer the document pointer.
 * @return the number of bits written (note that this is a <code>long</code>, whereas the
 * other <code>write</code> methods of this interface return an <code>int</code>).
 * @throws IOException if an I/O error occurs.
 */
long writeDocumentPointer( final OutputBitStream out, final long pointer ) throws IOException;
/** Writes the payload for the current document.
 *
 * <p>This method must be called immediately after {@link #writeDocumentPointer(OutputBitStream, long)}.
 *
 * @param out the output bit stream where the payload will be written.
 * @param payload the payload.
 * @return the number of bits written.
 * @throws IOException if an I/O error occurs.
 */
int writePayload( final OutputBitStream out, final Payload payload ) throws IOException;
/** Writes the count of the occurrences of the current term in the current document to the given {@link OutputBitStream}.
 *
 * @param out the output stream where the count should be written.
 * @param count the count.
 * @return the number of bits written.
 * @throws IOException if an I/O error occurs.
 */
int writePositionCount( final OutputBitStream out, final int count ) throws IOException;
/** Writes the positions of the occurrences of the current term in the current document to the given {@link OutputBitStream}.
 *
 * @param out the output stream where the occurrences should be written.
 * @param occ the position vector (a sequence of strictly increasing natural numbers).
 * @param offset the first valid entry in <code>occ</code>.
 * @param len the number of valid entries in <code>occ</code>.
 * @param docSize the size of the current document (only for Golomb and interpolative coding; you can safely pass -1 otherwise).
 * @return the number of bits written.
 * @throws IllegalStateException if there is no current inverted list.
 * @throws IOException if an I/O error occurs.
 */
int writeDocumentPositions( final OutputBitStream out, final int[] occ, final int offset, final int len, final int docSize ) throws IOException;
/** Returns the overall number of bits written onto the underlying stream(s).
 *
 * @return the number of bits written, according to the variables keeping statistical records.
 */
long writtenBits();
/** Returns properties of the index generated by this index writer.
 *
 * <p>This method should only be called after {@link #close()}.
 * It returns a new {@linkplain Properties property object}
 * containing values for (whenever appropriate)
 * {@link Index.PropertyKeys#DOCUMENTS}, {@link Index.PropertyKeys#TERMS},
 * {@link Index.PropertyKeys#POSTINGS}, {@link Index.PropertyKeys#MAXCOUNT},
 * {@link Index.PropertyKeys#INDEXCLASS}, {@link Index.PropertyKeys#CODING}, {@link Index.PropertyKeys#PAYLOADCLASS},
 * {@link BitStreamIndex.PropertyKeys#SKIPQUANTUM}, and {@link BitStreamIndex.PropertyKeys#SKIPHEIGHT}.
 *
 * @return a new set of properties for the just created index.
 */
Properties properties();
/** Closes this index writer, completing the index creation process and releasing all resources.
 *
 * @throws IllegalStateException if too few records were written for the last inverted list.
 * @throws IOException if an I/O error occurs.
 */
void close() throws IOException;
/** Writes to the given print stream statistical information about the index just built.
 * This method must be called after {@link #close()}.
 *
 * @param stats a print stream where statistical information will be written.
 */
void printStats( final PrintStream stats );
}