// org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat Maven / Gradle / Ivy
//
// Go to download
/*
 * COPIED FROM APACHE LUCENE 4.7.2
 *
 * Git URL: git@github.com:apache/lucene.git, tag: releases/lucene-solr/4.7.2, path: lucene/core/src/java
 *
 * (see https://issues.apache.org/jira/browse/OAK-10786 for details)
 */

package org.apache.lucene.codecs.lucene40;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.codecs.BlockTreeTermsReader;
import org.apache.lucene.codecs.BlockTreeTermsWriter;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.index.DocsEnum; // javadocs
import org.apache.lucene.index.FieldInfo.IndexOptions; // javadocs
import org.apache.lucene.index.FieldInfos; // javadocs
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.DataOutput; // javadocs
import org.apache.lucene.util.fst.FST; // javadocs

/** 
 * Lucene 4.0 Postings format.
 * 
 * Files:
 * 
 * 
 *   .tim: Term Dictionary
 *   .tip: Term Index
 *   .frq: Frequencies
 *   .prx: Positions
 * 
 * 
 * 
 * 
 * 
 * Term Dictionary
 *
 * The .tim file contains the list of terms in each
 * field along with per-term statistics (such as docfreq)
 * and pointers to the frequencies, positions and
 * skip data in the .frq and .prx files.
 * See {@link BlockTreeTermsWriter} for more details on the format.
 * 
 *
 * NOTE: The term dictionary can plug into different postings implementations:
 * the postings writer/reader are actually responsible for encoding 
 * and decoding the Postings Metadata and Term Metadata sections described here:
 * 
 *    Postings Metadata --> Header, SkipInterval, MaxSkipLevels, SkipMinimum
 *    Term Metadata --> FreqDelta, SkipDelta?, ProxDelta?
 *    
 *    Header --> {@link CodecUtil#writeHeader CodecHeader}
 *    SkipInterval,MaxSkipLevels,SkipMinimum --> {@link DataOutput#writeInt Uint32}
 *    SkipDelta,FreqDelta,ProxDelta --> {@link DataOutput#writeVLong VLong}
 * 
 * Notes:
 * 
 *    Header is a {@link CodecUtil#writeHeader CodecHeader} storing the version information
 *        for the postings.
 *    SkipInterval is the fraction of TermDocs stored in skip tables. It is used to accelerate 
 *        {@link DocsEnum#advance(int)}. Larger values result in smaller indexes, greater 
 *        acceleration, but fewer accelerable cases, while smaller values result in bigger indexes, 
 *        less acceleration (in case of a small value for MaxSkipLevels) and more accelerable cases.
 *        
 *    MaxSkipLevels is the max. number of skip levels stored for each term in the .frq file. A 
 *        low value results in smaller indexes but less acceleration, a larger value results in 
 *        slightly larger indexes but greater acceleration. See format of .frq file for more 
 *        information about skip levels.
 *    SkipMinimum is the minimum document frequency a term must have in order to write any 
 *        skip data at all.
 *    FreqDelta determines the position of this term's TermFreqs within the .frq
 *        file. In particular, it is the difference between the position of this term's
 *        data in that file and the position of the previous term's data (or zero, for
 *        the first term in the block).
 *    ProxDelta determines the position of this term's TermPositions within the
 *        .prx file. In particular, it is the difference between the position of this
 *        term's data in that file and the position of the previous term's data (or zero,
 *        for the first term in the block). For fields that omit position data, this will
 *        be 0 since prox information is not stored.
 *    SkipDelta determines the position of this term's SkipData within the .frq
 *        file. In particular, it is the number of bytes after TermFreqs that the
 *        SkipData starts. In other words, it is the length of the TermFreq data.
 *        SkipDelta is only stored if DocFreq is not smaller than SkipMinimum.
 * 
 * 
 * Term Index
 * The .tip file contains an index into the term dictionary, so that it can be 
 * accessed randomly.  See {@link BlockTreeTermsWriter} for more details on the format.
 * 
 * Frequencies
 * The .frq file contains the lists of documents which contain each term, along
 * with the frequency of the term in that document (except when frequencies are
 * omitted: {@link IndexOptions#DOCS_ONLY}).
 * 
 *   FreqFile (.frq) --> Header, <TermFreqs, SkipData?> ^TermCount
 *   Header --> {@link CodecUtil#writeHeader CodecHeader}
 *   TermFreqs --> <TermFreq> ^DocFreq
 *   TermFreq --> DocDelta[, Freq?]
 *   SkipData --> <<SkipLevelLength, SkipLevel>
 *       ^{NumSkipLevels-1}, SkipLevel> <SkipDatum>
 *   SkipLevel --> <SkipDatum> ^{DocFreq/(SkipInterval^(Level +
 *       1))}
 *   SkipDatum -->
 *       DocSkip,PayloadLength?,OffsetLength?,FreqSkip,ProxSkip,SkipChildLevelPointer?
 *   DocDelta,Freq,DocSkip,PayloadLength,OffsetLength,FreqSkip,ProxSkip --> {@link DataOutput#writeVInt VInt}
 *   SkipChildLevelPointer --> {@link DataOutput#writeVLong VLong}
 * 
 * TermFreqs are ordered by term (the term is implicit, from the term dictionary).
 * TermFreq entries are ordered by increasing document number.
 * DocDelta: if frequencies are indexed, this determines both the document
 * number and the frequency. In particular, DocDelta/2 is the difference between
 * this document number and the previous document number (or zero when this is the
 * first document in a TermFreqs). When DocDelta is odd, the frequency is one.
 * When DocDelta is even, the frequency is read as another VInt. If frequencies
 * are omitted, DocDelta contains the gap (not multiplied by 2) between document
 * numbers and no frequency information is stored.
 * For example, the TermFreqs for a term which occurs once in document seven
 * and three times in document eleven, with frequencies indexed, would be the
 * following sequence of VInts:
 * 15, 8, 3
 * If frequencies were omitted ({@link IndexOptions#DOCS_ONLY}) it would be this
 * sequence of VInts instead:
 * 7,4
 * DocSkip records the document number before every SkipInterval ^th
 * document in TermFreqs. If payloads and offsets are disabled for the term's field, then
 * DocSkip represents the difference from the previous value in the sequence. If
 * payloads and/or offsets are enabled for the term's field, then DocSkip/2 represents the
 * difference from the previous value in the sequence. In this case when
 * DocSkip is odd, then PayloadLength and/or OffsetLength are stored indicating the length of 
 * the last payload/offset before the SkipInterval^th document in TermPositions.
 * PayloadLength indicates the length of the last payload.
 * OffsetLength indicates the length of the last offset (endOffset-startOffset).
 * 
 * FreqSkip and ProxSkip record the position of every SkipInterval ^th
 * entry in FreqFile and ProxFile, respectively. File positions are relative to
 * the start of TermFreqs and Positions, to the previous SkipDatum in the
 * sequence.
 * For example, if DocFreq=35 and SkipInterval=16, then there are two SkipData
 * entries, containing the 15 ^th and 31 ^st document numbers
 * in TermFreqs. The first FreqSkip names the number of bytes after the beginning
 * of TermFreqs that the 16 ^th SkipDatum starts, and the second the
 * number of bytes after that that the 32 ^nd starts. The first ProxSkip
 * names the number of bytes after the beginning of Positions that the 16
 * ^th SkipDatum starts, and the second the number of bytes after that
 * that the 32 ^nd starts.
 * Each term can have multiple skip levels. The number of skip levels for a
 * term is NumSkipLevels = Min(MaxSkipLevels,
 * floor(log(DocFreq)/log(SkipInterval))). The number of SkipData entries for a
 * skip level is DocFreq/(SkipInterval^(Level + 1)), where the lowest skip level
 * is Level=0.
 *
 * Example: SkipInterval = 4, MaxSkipLevels = 2, DocFreq = 35. Then skip level 0
 * has 8 SkipData entries, containing the 3^rd, 7^th,
 * 11^th, 15^th, 19^th, 23^rd,
 * 27^th, and 31^st document numbers in TermFreqs. Skip level
 * 1 has 2 SkipData entries, containing the 15^th and 31^st
 * document numbers in TermFreqs.
 *
 * The SkipData entries on all upper levels > 0 contain a SkipChildLevelPointer
 * referencing the corresponding SkipData entry in level-1. In the example,
 * entry 15 on level 1 has a pointer to entry 15 on level 0 and entry 31 on level 1
 * has a pointer to entry 31 on level 0.
 * 
 * 
 * Positions
 * The .prx file contains the lists of positions that each term occurs at
 * within documents. Note that fields omitting positional data do not store
 * anything into this file, and if all fields in the index omit positional data
 * then the .prx file will not exist.
 * 
 *   ProxFile (.prx) --> Header, <TermPositions> ^TermCount
 *   Header --> {@link CodecUtil#writeHeader CodecHeader}
 *   TermPositions --> <Positions> ^DocFreq
 *   Positions --> <PositionDelta,PayloadLength?,OffsetDelta?,OffsetLength?,PayloadData?> ^Freq
 *   PositionDelta,OffsetDelta,OffsetLength,PayloadLength --> {@link DataOutput#writeVInt VInt}
 *   PayloadData --> {@link DataOutput#writeByte byte}^{PayloadLength}
 * 
 * TermPositions are ordered by term (the term is implicit, from the term dictionary).
 * Positions entries are ordered by increasing document number (the document
 * number is implicit from the .frq file).
 * PositionDelta is, if payloads are disabled for the term's field, the
 * difference between the position of the current occurrence in the document and
 * the previous occurrence (or zero, if this is the first occurrence in this
 * document). If payloads are enabled for the term's field, then PositionDelta/2
 * is the difference between the current and the previous position. If payloads
 * are enabled and PositionDelta is odd, then PayloadLength is stored, indicating
 * the length of the payload at the current term position.
 * For example, the TermPositions for a term which occurs as the fourth term in
 * one document, and as the fifth and ninth term in a subsequent document, would
 * be the following sequence of VInts (payloads disabled):
 * 4, 5, 4
 * PayloadData is metadata associated with the current term position. If
 * PayloadLength is stored at the current position, then it indicates the length
 * of this payload. If PayloadLength is not stored, then this payload has the same
 * length as the payload at the previous position.
 * OffsetDelta/2 is the difference between this position's startOffset from the
 * previous occurrence (or zero, if this is the first occurrence in this document).
 * If OffsetDelta is odd, then the length (endOffset-startOffset) differs from the
 * previous occurrence and an OffsetLength follows. Offset data is only written for
 * {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}.
 * 
 *  @deprecated Only for reading old 4.0 segments */

// TODO: this class could be created by wrapping
// BlockTreeTermsDict around Lucene40PostingsBaseFormat; ie
// we should not duplicate the code from that class here:
@Deprecated
public class Lucene40PostingsFormat extends PostingsFormat {

  /** minimum items (terms or sub-blocks) per block for BlockTree */
  protected final int minBlockSize;
  /** maximum items (terms or sub-blocks) per block for BlockTree */
  protected final int maxBlockSize;

  /** Creates {@code Lucene40PostingsFormat} with default
   *  settings. */
  public Lucene40PostingsFormat() {
    this(BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE);
  }

  /** Creates {@code Lucene40PostingsFormat} with custom
   *  values for {@code minBlockSize} and {@code
   *  maxBlockSize} passed to block terms dictionary.
   *  The format is registered under the name {@code "Lucene40"}.
   *  @param minBlockSize minimum items per block; asserted to be greater than 1
   *  @param maxBlockSize maximum items per block
   *  @see BlockTreeTermsWriter#BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int) */
  private Lucene40PostingsFormat(int minBlockSize, int maxBlockSize) {
    super("Lucene40");
    assert minBlockSize > 1 : "minBlockSize must be > 1, got " + minBlockSize;
    this.minBlockSize = minBlockSize;
    this.maxBlockSize = maxBlockSize;
  }

  /**
   * Writing is not supported by this format.
   *
   * @throws UnsupportedOperationException always
   */
  @Override
  public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
    throw new UnsupportedOperationException("this codec can only be used for reading");
  }

  /**
   * Opens a {@link Lucene40PostingsReader} for the segment described by
   * {@code state} and wraps it in a {@link BlockTreeTermsReader}.
   *
   * <p>If the terms reader cannot be constructed, the postings reader opened
   * here is closed before the original failure propagates to the caller.
   *
   * @param state segment read state supplying the directory, field infos,
   *        segment info, IO context, segment suffix and terms index divisor
   * @return the terms reader backed by the postings reader
   * @throws IOException if either reader fails to open
   */
  @Override
  public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
    PostingsReaderBase postings = new Lucene40PostingsReader(state.directory, state.fieldInfos, state.segmentInfo, state.context, state.segmentSuffix);

    boolean success = false;
    try {
      FieldsProducer ret = new BlockTreeTermsReader(
                                                    state.directory,
                                                    state.fieldInfos,
                                                    state.segmentInfo,
                                                    postings,
                                                    state.context,
                                                    state.segmentSuffix,
                                                    state.termsIndexDivisor);
      success = true;
      return ret;
    } finally {
      if (!success) {
        // We only get here while an exception from the constructor above is
        // already propagating. A failure of this best-effort cleanup must not
        // replace that root-cause exception, so it is deliberately ignored.
        try {
          postings.close();
        } catch (Throwable ignored) {
          // intentionally ignored: the original exception is the one to report
        }
      }
    }
  }

  /** Extension of freq postings file */
  static final String FREQ_EXTENSION = "frq";

  /** Extension of prox postings file */
  static final String PROX_EXTENSION = "prx";

  /** Returns the format name followed by the configured block sizes. */
  @Override
  public String toString() {
    return getName() + "(minBlockSize=" + minBlockSize + " maxBlockSize=" + maxBlockSize + ")";
  }
}