
// org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat (Maven / Gradle / Ivy)
/*
* COPIED FROM APACHE LUCENE 4.7.2
*
* Git URL: git@github.com:apache/lucene.git, tag: releases/lucene-solr/4.7.2, path: lucene/core/src/java
*
* (see https://issues.apache.org/jira/browse/OAK-10786 for details)
*/
package org.apache.lucene.codecs.lucene40;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.codecs.BlockTreeTermsReader;
import org.apache.lucene.codecs.BlockTreeTermsWriter;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.index.DocsEnum; // javadocs
import org.apache.lucene.index.FieldInfo.IndexOptions; // javadocs
import org.apache.lucene.index.FieldInfos; // javadocs
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.DataOutput; // javadocs
import org.apache.lucene.util.fst.FST; // javadocs
/**
* Lucene 4.0 Postings format.
*
* Files:
*
* - .tim: Term Dictionary
* - .tip: Term Index
* - .frq: Frequencies
* - .prx: Positions
*
*
* Term Dictionary
*
* The .tim file contains the list of terms in each
* field along with per-term statistics (such as docfreq)
* and pointers to the frequencies, positions and
* skip data in the .frq and .prx files.
* See {@link BlockTreeTermsWriter} for more details on the format.
*
*
* NOTE: The term dictionary can plug into different postings implementations:
* the postings writer/reader are actually responsible for encoding
* and decoding the Postings Metadata and Term Metadata sections described here:
*
* - Postings Metadata --> Header, SkipInterval, MaxSkipLevels, SkipMinimum
* - Term Metadata --> FreqDelta, SkipDelta?, ProxDelta?
*
* - Header --> {@link CodecUtil#writeHeader CodecHeader}
* - SkipInterval,MaxSkipLevels,SkipMinimum --> {@link DataOutput#writeInt Uint32}
* - SkipDelta,FreqDelta,ProxDelta --> {@link DataOutput#writeVLong VLong}
*
* Notes:
*
* - Header is a {@link CodecUtil#writeHeader CodecHeader} storing the version information
* for the postings.
* - SkipInterval is the fraction of TermDocs stored in skip tables. It is used to accelerate
* {@link DocsEnum#advance(int)}. Larger values result in smaller indexes, greater
* acceleration, but fewer accelerable cases, while smaller values result in bigger indexes,
* less acceleration (in case of a small value for MaxSkipLevels) and more accelerable cases.
*
* - MaxSkipLevels is the max. number of skip levels stored for each term in the .frq file. A
* low value results in smaller indexes but less acceleration, a larger value results in
* slightly larger indexes but greater acceleration. See format of .frq file for more
* information about skip levels.
* - SkipMinimum is the minimum document frequency a term must have in order to write any
* skip data at all.
* - FreqDelta determines the position of this term's TermFreqs within the .frq
* file. In particular, it is the difference between the position of this term's
* data in that file and the position of the previous term's data (or zero, for
* the first term in the block).
* - ProxDelta determines the position of this term's TermPositions within the
* .prx file. In particular, it is the difference between the position of this
* term's data in that file and the position of the previous term's data (or zero,
* for the first term in the block). For fields that omit position data, this will
* be 0 since prox information is not stored.
* - SkipDelta determines the position of this term's SkipData within the .frq
* file. In particular, it is the number of bytes after TermFreqs that the
* SkipData starts. In other words, it is the length of the TermFreq data.
* SkipDelta is only stored if DocFreq is not smaller than SkipMinimum.
*
*
* Term Index
* The .tip file contains an index into the term dictionary, so that it can be
* accessed randomly. See {@link BlockTreeTermsWriter} for more details on the format.
*
* Frequencies
* The .frq file contains the lists of documents which contain each term, along
* with the frequency of the term in that document (except when frequencies are
* omitted: {@link IndexOptions#DOCS_ONLY}).
*
* - FreqFile (.frq) --> Header, <TermFreqs, SkipData?>^TermCount
* - Header --> {@link CodecUtil#writeHeader CodecHeader}
* - TermFreqs --> <TermFreq>^DocFreq
* - TermFreq --> DocDelta[, Freq?]
* - SkipData --> <<SkipLevelLength, SkipLevel>^(NumSkipLevels-1),
* SkipLevel> <SkipDatum>
* - SkipLevel --> <SkipDatum>^(DocFreq/(SkipInterval^(Level +
* 1)))
* - SkipDatum -->
* DocSkip,PayloadLength?,OffsetLength?,FreqSkip,ProxSkip,SkipChildLevelPointer?
* - DocDelta,Freq,DocSkip,PayloadLength,OffsetLength,FreqSkip,ProxSkip --> {@link DataOutput#writeVInt VInt}
* - SkipChildLevelPointer --> {@link DataOutput#writeVLong VLong}
*
* TermFreqs are ordered by term (the term is implicit, from the term dictionary).
* TermFreq entries are ordered by increasing document number.
* DocDelta: if frequencies are indexed, this determines both the document
* number and the frequency. In particular, DocDelta/2 is the difference between
* this document number and the previous document number (or zero when this is the
* first document in a TermFreqs). When DocDelta is odd, the frequency is one.
* When DocDelta is even, the frequency is read as another VInt. If frequencies
* are omitted, DocDelta contains the gap (not multiplied by 2) between document
* numbers and no frequency information is stored.
* For example, the TermFreqs for a term which occurs once in document seven
* and three times in document eleven, with frequencies indexed, would be the
* following sequence of VInts:
* 15, 8, 3
* If frequencies were omitted ({@link IndexOptions#DOCS_ONLY}) it would be this
* sequence of VInts instead:
* 7,4
* DocSkip records the document number before every SkipInterval-th
* document in TermFreqs. If payloads and offsets are disabled for the term's field, then
* DocSkip represents the difference from the previous value in the sequence. If
* payloads and/or offsets are enabled for the term's field, then DocSkip/2 represents the
* difference from the previous value in the sequence. In this case when
* DocSkip is odd, then PayloadLength and/or OffsetLength are stored indicating the length of
* the last payload/offset before the SkipInterval-th document in TermPositions.
* PayloadLength indicates the length of the last payload.
* OffsetLength indicates the length of the last offset (endOffset-startOffset).
*
* FreqSkip and ProxSkip record the position of every SkipInterval-th
* entry in FreqFile and ProxFile, respectively. File positions are relative to
* the start of TermFreqs and Positions, to the previous SkipDatum in the
* sequence.
* For example, if DocFreq=35 and SkipInterval=16, then there are two SkipData
* entries, containing the 15th and 31st document numbers
* in TermFreqs. The first FreqSkip names the number of bytes after the beginning
* of TermFreqs that the 16th SkipDatum starts, and the second the
* number of bytes after that that the 32nd starts. The first ProxSkip
* names the number of bytes after the beginning of Positions that the 16th
* SkipDatum starts, and the second the number of bytes after that
* that the 32nd starts.
* Each term can have multiple skip levels. The amount of skip levels for a
* term is NumSkipLevels = Min(MaxSkipLevels,
* floor(log(DocFreq)/log(SkipInterval))). The number of SkipData entries for a
* skip level is DocFreq/(SkipInterval^(Level + 1)), whereas the lowest skip level
* is Level=0.
* Example: SkipInterval = 4, MaxSkipLevels = 2, DocFreq = 35. Then skip level 0
* has 8 SkipData entries, containing the 3rd, 7th,
* 11th, 15th, 19th, 23rd,
* 27th, and 31st document numbers in TermFreqs. Skip level
* 1 has 2 SkipData entries, containing the 15th and 31st
* document numbers in TermFreqs.
* The SkipData entries on all upper levels > 0 contain a SkipChildLevelPointer
* referencing the corresponding SkipData entry in level-1. In the example, entry 15
* on level 1 has a pointer to entry 15 on level 0, and entry 31 on level 1 has a
* pointer to entry 31 on level 0.
*
*
* Positions
* The .prx file contains the lists of positions that each term occurs at
* within documents. Note that fields omitting positional data do not store
* anything into this file, and if all fields in the index omit positional data
* then the .prx file will not exist.
*
* - ProxFile (.prx) --> Header, <TermPositions>^TermCount
* - Header --> {@link CodecUtil#writeHeader CodecHeader}
* - TermPositions --> <Positions>^DocFreq
* - Positions --> <PositionDelta,PayloadLength?,OffsetDelta?,OffsetLength?,PayloadData?>^Freq
* - PositionDelta,OffsetDelta,OffsetLength,PayloadLength --> {@link DataOutput#writeVInt VInt}
* - PayloadData --> {@link DataOutput#writeByte byte}^PayloadLength
*
* TermPositions are ordered by term (the term is implicit, from the term dictionary).
* Positions entries are ordered by increasing document number (the document
* number is implicit from the .frq file).
* PositionDelta is, if payloads are disabled for the term's field, the
* difference between the position of the current occurrence in the document and
* the previous occurrence (or zero, if this is the first occurrence in this
* document). If payloads are enabled for the term's field, then PositionDelta/2
* is the difference between the current and the previous position. If payloads
* are enabled and PositionDelta is odd, then PayloadLength is stored, indicating
* the length of the payload at the current term position.
* For example, the TermPositions for a term which occurs as the fourth term in
* one document, and as the fifth and ninth term in a subsequent document, would
* be the following sequence of VInts (payloads disabled):
* 4, 5, 4
* PayloadData is metadata associated with the current term position. If
* PayloadLength is stored at the current position, then it indicates the length
* of this payload. If PayloadLength is not stored, then this payload has the same
* length as the payload at the previous position.
* OffsetDelta/2 is the difference between this position's startOffset from the
* previous occurrence (or zero, if this is the first occurrence in this document).
* If OffsetDelta is odd, then the length (endOffset-startOffset) differs from the
* previous occurrence and an OffsetLength follows. Offset data is only written for
* {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}.
*
* @deprecated Only for reading old 4.0 segments */
// TODO: this class could be created by wrapping
// BlockTreeTermsDict around Lucene40PostingsBaseFormat; ie
// we should not duplicate the code from that class here:
@Deprecated
public class Lucene40PostingsFormat extends PostingsFormat {

  /** Extension of freq postings file */
  static final String FREQ_EXTENSION = "frq";

  /** Extension of prox postings file */
  static final String PROX_EXTENSION = "prx";

  /** minimum items (terms or sub-blocks) per block for BlockTree */
  protected final int minBlockSize;

  /** maximum items (terms or sub-blocks) per block for BlockTree */
  protected final int maxBlockSize;

  /**
   * Creates {@code Lucene40PostingsFormat} with default settings, i.e.
   * {@link BlockTreeTermsWriter#DEFAULT_MIN_BLOCK_SIZE} and
   * {@link BlockTreeTermsWriter#DEFAULT_MAX_BLOCK_SIZE}.
   */
  public Lucene40PostingsFormat() {
    this(BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
  }

  /**
   * Creates {@code Lucene40PostingsFormat} with custom values for
   * {@code minBlockSize} and {@code maxBlockSize} passed to block terms
   * dictionary. The format is registered under the name {@code "Lucene40"}.
   *
   * @param minBlockSize minimum items per block; asserted to be greater than 1
   * @param maxBlockSize maximum items per block
   * @see BlockTreeTermsWriter#BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int)
   */
  private Lucene40PostingsFormat(int minBlockSize, int maxBlockSize) {
    super("Lucene40");
    this.minBlockSize = minBlockSize;
    assert minBlockSize > 1;
    this.maxBlockSize = maxBlockSize;
  }

  /**
   * Writing is not supported by this format.
   *
   * @throws UnsupportedOperationException always
   */
  @Override
  public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
    throw new UnsupportedOperationException("this codec can only be used for reading");
  }

  /**
   * Opens a {@link Lucene40PostingsReader} for the segment described by
   * {@code state} and wraps it in a {@link BlockTreeTermsReader}.
   *
   * <p>If the terms reader cannot be constructed, the postings reader is closed
   * before the failure propagates. A failure raised by that close is recorded
   * as a suppressed exception on the original failure instead of replacing it,
   * so the root cause is never lost. On success the postings reader is not
   * closed here; it has been passed to the returned producer.
   *
   * @param state read state supplying directory, field infos, segment info,
   *              IO context, segment suffix and terms index divisor
   * @return the terms reader backed by the Lucene 4.0 postings reader
   * @throws IOException if opening either reader fails
   */
  @Override
  public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
    PostingsReaderBase postings = new Lucene40PostingsReader(
        state.directory,
        state.fieldInfos,
        state.segmentInfo,
        state.context,
        state.segmentSuffix);
    try {
      return new BlockTreeTermsReader(
          state.directory,
          state.fieldInfos,
          state.segmentInfo,
          postings,
          state.context,
          state.segmentSuffix,
          state.termsIndexDivisor);
    } catch (Throwable primary) {
      // Release the postings reader, but keep the original failure as the one
      // that propagates; a secondary close failure must not mask it.
      try {
        postings.close();
      } catch (Throwable closeFailure) {
        // addSuppressed rejects self-suppression, so guard against it.
        if (closeFailure != primary) {
          primary.addSuppressed(closeFailure);
        }
      }
      throw primary;
    }
  }

  /** Returns the format name followed by the configured block sizes. */
  @Override
  public String toString() {
    return getName() + "(minBlockSize=" + minBlockSize + " maxBlockSize=" + maxBlockSize + ")";
  }
}