org.apache.lucene.backward_codecs.lucene50.Lucene50PostingsFormat Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.backward_codecs.lucene50;

import java.io.IOException;
import org.apache.lucene.backward_codecs.lucene40.blocktree.Lucene40BlockTreeTermsReader;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.MultiLevelSkipListWriter;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.PostingsReaderBase;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.TermState;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.packed.PackedInts;

/**
 * Lucene 5.0 postings format, which encodes postings in packed integer blocks for fast decode.
 *
 * Basic idea:
 *
 * 

 *   Packed Blocks and VInt Blocks:
 *       In packed blocks, integers are encoded with the same bit width ({@link PackedInts packed
 *       format}): the block size (i.e. number of integers inside block) is fixed (currently 128).
 *       Additionally blocks that are all the same value are encoded in an optimized way.
 *       
In VInt blocks, integers are encoded as {@link DataOutput#writeVInt VInt}: the block
 *       size is variable.
 *   
Block structure:
 *       When the postings are long enough, Lucene50PostingsFormat will try to encode most
 *       integer data as a packed block.
 *       
Take a term with 259 documents as an example, the first 256 document ids are encoded as
 *       two packed blocks, while the remaining 3 are encoded as one VInt block.
 *       
Different kinds of data are always encoded separately into different packed blocks, but
 *       may possibly be interleaved into the same VInt block.
 *       
This strategy is applied to pairs: <document number, frequency>, <position,
 *       payload length>, <position, offset start, offset length>, and <position,
 *       payload length, offsetstart, offset length>.
 *   
Skipdata settings:
 *       The structure of skip table is quite similar to previous version of Lucene. Skip
 *       interval is the same as block size, and each skip entry points to the beginning of each
 *       block. However, for the first block, skip data is omitted.
 *   
Positions, Payloads, and Offsets:
 *       A position is an integer indicating where the term occurs within one document. A payload
 *       is a blob of metadata associated with current position. An offset is a pair of integers
 *       indicating the tokenized start/end offsets for given term in current position: it is
 *       essentially a specialized payload.
 *       
When payloads and offsets are not omitted, numPositions==numPayloads==numOffsets
 *       (assuming a null payload contributes one count). As mentioned in block structure, it is
 *       possible to encode these three either combined or separately.
 *       
In all cases, payloads and offsets are stored together. When encoded as a packed block,
 *       position data is separated out as .pos, while payloads and offsets are encoded in .pay
 *       (payload metadata will also be stored directly in .pay). When encoded as VInt blocks, all
 *       these three are stored interleaved into the .pos (so is payload metadata).
 *       
With this strategy, the majority of payload and offset data will be outside .pos file.
 *       So for queries that require only position data, running on a full index with payloads and
 *       offsets, this reduces disk pre-fetches.
 * 
 *
 * Files and detailed format:
 *
 * 

 *   .tim: Term Dictionary
 *   
.tip: Term Index
 *   
.doc: Frequencies and Skip Data
 *   
.pos: Positions
 *   
.pay: Payloads and Offsets
 * 
 *
 * 
 *
 * 
 *   Term Dictionary
 *       The .tim file contains the list of terms in each field along with per-term statistics
 *       (such as docfreq) and pointers to the frequencies, positions, payload and skip data in the
 *       .doc, .pos, and .pay files. See {@code BlockTreeTermsWriter} for more details on the
 *       format.
 *       
NOTE: The term dictionary can plug into different postings implementations: the postings
 *       writer/reader are actually responsible for encoding and decoding the PostingsHeader and
 *       TermMetadata sections described here:
 *       

 *         PostingsHeader --> Header, PackedBlockSize
 *         
TermMetadata --> (DocFPDelta|SingletonDocID), PosFPDelta?, PosVIntBlockFPDelta?,
 *             PayFPDelta?, SkipFPDelta?
 *         
Header, --> {@link CodecUtil#writeIndexHeader IndexHeader}
 *         
PackedBlockSize, SingletonDocID --> {@link DataOutput#writeVInt VInt}
 *         
DocFPDelta, PosFPDelta, PayFPDelta, PosVIntBlockFPDelta, SkipFPDelta --> {@link
 *             DataOutput#writeVLong VLong}
 *         
Footer --> {@link CodecUtil#writeFooter CodecFooter}
 *       
 *       Notes:
 *       

 *         Header is a {@link CodecUtil#writeIndexHeader IndexHeader} storing the version
 *             information for the postings.
 *         
PackedBlockSize is the fixed block size for packed blocks. In packed block, bit width
 *             is determined by the largest integer. Smaller block size result in smaller variance
 *             among width of integers hence smaller indexes. Larger block size result in more
 *             efficient bulk i/o hence better acceleration. This value should always be a multiple
 *             of 64, currently fixed as 128 as a tradeoff. It is also the skip interval used to
 *             accelerate {@link org.apache.lucene.index.PostingsEnum#advance(int)}.
 *         
DocFPDelta determines the position of this term's TermFreqs within the .doc file. In
 *             particular, it is the difference of file offset between this term's data and previous
 *             term's data (or zero, for the first term in the block).On disk it is stored as the
 *             difference from previous value in sequence.
 *         
PosFPDelta determines the position of this term's TermPositions within the .pos file.
 *             While PayFPDelta determines the position of this term's <TermPayloads,
 *             TermOffsets?> within the .pay file. Similar to DocFPDelta, it is the difference
 *             between two file positions (or neglected, for fields that omit payloads and offsets).
 *         
PosVIntBlockFPDelta determines the position of this term's last TermPosition in last
 *             pos packed block within the .pos file. It is synonym for PayVIntBlockFPDelta or
 *             OffsetVIntBlockFPDelta. This is actually used to indicate whether it is necessary to
 *             load following payloads and offsets from .pos instead of .pay. Every time a new block
 *             of positions are to be loaded, the PostingsReader will use this value to check
 *             whether current block is packed format or VInt. When packed format, payloads and
 *             offsets are fetched from .pay, otherwise from .pos. (this value is neglected when
 *             total number of positions i.e. totalTermFreq is less or equal to PackedBlockSize).
 *         
SkipFPDelta determines the position of this term's SkipData within the .doc file. In
 *             particular, it is the length of the TermFreq data. SkipDelta is only stored if
 *             DocFreq is not smaller than SkipMinimum (i.e. 128 in Lucene50PostingsFormat).
 *         
SingletonDocID is an optimization when a term only appears in one document. In this
 *             case, instead of writing a file pointer to the .doc file (DocFPDelta), and then a
 *             VIntBlock at that location, the single document ID is written to the term dictionary.
 *       
 * 
 *
 * 
 *
 * 
 *   Term Index
 *       The .tip file contains an index into the term dictionary, so that it can be accessed
 *       randomly. See {@code BlockTreeTermsWriter} for more details on the format.
 * 
 *
 * 
 *
 * 
 *   Frequencies and Skip Data
 *       The .doc file contains the lists of documents which contain each term, along with the
 *       frequency of the term in that document (except when frequencies are omitted: {@link
 *       IndexOptions#DOCS}). It also saves skip data to the beginning of each packed or VInt block,
 *       when the length of document list is larger than packed block size.
 *       

 *         docFile(.doc) --> Header, <TermFreqs, SkipData?>^TermCount, Footer
 *         
Header --> {@link CodecUtil#writeIndexHeader IndexHeader}
 *         
TermFreqs --> <PackedBlock> ^{PackedDocBlockNum}, VIntBlock?
 *         
PackedBlock --> PackedDocDeltaBlock, PackedFreqBlock?
 *         
VIntBlock --> <DocDelta[,
 *             Freq?]>^{DocFreq-PackedBlockSize*PackedDocBlockNum}
 *         
SkipData --> <<SkipLevelLength, SkipLevel> ^{NumSkipLevels-1},
 *             SkipLevel>, SkipDatum?
 *         
SkipLevel --> <SkipDatum> ^{TrimmedDocFreq/(PackedBlockSize^(Level +
 *             1))}
 *         
SkipDatum --> DocSkip, DocFPSkip, <PosFPSkip, PosBlockOffset, PayLength?,
 *             PayFPSkip?>?, SkipChildLevelPointer?
 *         
PackedDocDeltaBlock, PackedFreqBlock --> {@link PackedInts PackedInts}
 *         
DocDelta, Freq, DocSkip, DocFPSkip, PosFPSkip, PosBlockOffset, PayByteUpto, PayFPSkip
 *             --> {@link DataOutput#writeVInt VInt}
 *         
SkipChildLevelPointer --> {@link DataOutput#writeVLong VLong}
 *         
Footer --> {@link CodecUtil#writeFooter CodecFooter}
 *       
 *       Notes:
 *       

 *         PackedDocDeltaBlock is theoretically generated from two steps:
 *             
 *               Calculate the difference between each document number and previous one, and get
 *                   a d-gaps list (for the first document, use absolute value);
 *               
For those d-gaps from first one to
 *                   PackedDocBlockNum*PackedBlockSize^th, separately encode as packed
 *                   blocks.
 *             
 *             If frequencies are not omitted, PackedFreqBlock will be generated without d-gap step.
 *         
VIntBlock stores remaining d-gaps (along with frequencies when possible) with a
 *             format that encodes DocDelta and Freq:
 *             DocDelta: if frequencies are indexed, this determines both the document number and
 *             the frequency. In particular, DocDelta/2 is the difference between this document
 *             number and the previous document number (or zero when this is the first document in a
 *             TermFreqs). When DocDelta is odd, the frequency is one. When DocDelta is even, the
 *             frequency is read as another VInt. If frequencies are omitted, DocDelta contains the
 *             gap (not multiplied by 2) between document numbers and no frequency information is
 *             stored.
 *             
For example, the TermFreqs for a term which occurs once in document seven and
 *             three times in document eleven, with frequencies indexed, would be the following
 *             sequence of VInts:
 *             
15, 8, 3
 *             
If frequencies were omitted ({@link IndexOptions#DOCS}) it would be this sequence
 *             of VInts instead:
 *             
7,4
 *         
PackedDocBlockNum is the number of packed blocks for current term's docids or
 *             frequencies. In particular, PackedDocBlockNum = floor(DocFreq/PackedBlockSize)
 *         
TrimmedDocFreq = DocFreq % PackedBlockSize == 0 ? DocFreq - 1 : DocFreq. We use this
 *             trick since the definition of skip entry is a little different from base interface.
 *             In {@link MultiLevelSkipListWriter}, skip data is assumed to be saved for
 *             skipInterval^th, 2*skipInterval^th ... posting in the list.
 *             However, in Lucene50PostingsFormat, the skip data is saved for
 *             skipInterval+1^th, 2*skipInterval+1^th ... posting
 *             (skipInterval==PackedBlockSize in this case). When DocFreq is multiple of
 *             PackedBlockSize, MultiLevelSkipListWriter will expect one more skip data than
 *             Lucene50SkipWriter.
 *         
SkipDatum is the metadata of one skip entry. For the first block (no matter packed or
 *             VInt), it is omitted.
 *         
DocSkip records the document number of every PackedBlockSize^th document
 *             number in the postings (i.e. last document number in each packed block). On disk it
 *             is stored as the difference from previous value in the sequence.
 *         
DocFPSkip records the file offsets of each block (excluding )posting at
 *             PackedBlockSize+1^th, 2*PackedBlockSize+1^th ... , in DocFile.
 *             The file offsets are relative to the start of current term's TermFreqs. On disk it is
 *             also stored as the difference from previous SkipDatum in the sequence.
 *         
Since positions and payloads are also block encoded, the skip should skip to related
 *             block first, then fetch the values according to in-block offset. PosFPSkip and
 *             PayFPSkip record the file offsets of related block in .pos and .pay, respectively.
 *             While PosBlockOffset indicates which value to fetch inside the related block
 *             (PayBlockOffset is unnecessary since it is always equal to PosBlockOffset). Same as
 *             DocFPSkip, the file offsets are relative to the start of current term's TermFreqs,
 *             and stored as a difference sequence.
 *         
PayByteUpto indicates the start offset of the current payload. It is equivalent to
 *             the sum of the payload lengths in the current block up to PosBlockOffset
 *       
 * 
 *
 * 
 *
 * 
 *   Positions
 *       The .pos file contains the lists of positions that each term occurs at within documents.
 *       It also sometimes stores part of payloads and offsets for speedup.
 *       

 *         PosFile(.pos) --> Header, <TermPositions> ^TermCount, Footer
 *         
Header --> {@link CodecUtil#writeIndexHeader IndexHeader}
 *         
TermPositions --> <PackedPosDeltaBlock> ^{PackedPosBlockNum},
 *             VIntBlock?
 *         
VIntBlock --> <PositionDelta[, PayloadLength?], PayloadData?, OffsetDelta?,
 *             OffsetLength?>^PosVIntCount
 *         
PackedPosDeltaBlock --> {@link PackedInts PackedInts}
 *         
PositionDelta, OffsetDelta, OffsetLength --> {@link DataOutput#writeVInt VInt}
 *         
PayloadData --> {@link DataOutput#writeByte byte}^PayLength
 *         
Footer --> {@link CodecUtil#writeFooter CodecFooter}
 *       
 *       Notes:
 *       

 *         TermPositions are order by term (terms are implicit, from the term dictionary), and
 *             position values for each term document pair are incremental, and ordered by document
 *             number.
 *         
PackedPosBlockNum is the number of packed blocks for current term's positions,
 *             payloads or offsets. In particular, PackedPosBlockNum =
 *             floor(totalTermFreq/PackedBlockSize)
 *         
PosVIntCount is the number of positions encoded as VInt format. In particular,
 *             PosVIntCount = totalTermFreq - PackedPosBlockNum*PackedBlockSize
 *         
The procedure how PackedPosDeltaBlock is generated is the same as PackedDocDeltaBlock
 *             in chapter Frequencies and Skip Data.
 *         
PositionDelta is, if payloads are disabled for the term's field, the difference
 *             between the position of the current occurrence in the document and the previous
 *             occurrence (or zero, if this is the first occurrence in this document). If payloads
 *             are enabled for the term's field, then PositionDelta/2 is the difference between the
 *             current and the previous position. If payloads are enabled and PositionDelta is odd,
 *             then PayloadLength is stored, indicating the length of the payload at the current
 *             term position.
 *         
For example, the TermPositions for a term which occurs as the fourth term in one
 *             document, and as the fifth and ninth term in a subsequent document, would be the
 *             following sequence of VInts (payloads disabled):
 *             4, 5, 4
 *         
PayloadData is metadata associated with the current term position. If PayloadLength
 *             is stored at the current position, then it indicates the length of this payload. If
 *             PayloadLength is not stored, then this payload has the same length as the payload at
 *             the previous position.
 *         
OffsetDelta/2 is the difference between this position's startOffset from the previous
 *             occurrence (or zero, if this is the first occurrence in this document). If
 *             OffsetDelta is odd, then the length (endOffset-startOffset) differs from the previous
 *             occurrence and an OffsetLength follows. Offset data is only written for {@link
 *             IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}.
 *       
 * 
 *
 * 
 *
 * 
 *   Payloads and Offsets
 *       The .pay file will store payloads and offsets associated with certain term-document
 *       positions. Some payloads and offsets will be separated out into .pos file, for performance
 *       reasons.
 *       

 *         PayFile(.pay): --> Header, <TermPayloads, TermOffsets?>
 *             ^TermCount, Footer
 *         
Header --> {@link CodecUtil#writeIndexHeader IndexHeader}
 *         
TermPayloads --> <PackedPayLengthBlock, SumPayLength, PayData>
 *             ^{PackedPayBlockNum}
 *         
TermOffsets --> <PackedOffsetStartDeltaBlock, PackedOffsetLengthBlock>
 *             ^{PackedPayBlockNum}
 *         
PackedPayLengthBlock, PackedOffsetStartDeltaBlock, PackedOffsetLengthBlock -->
 *             {@link PackedInts PackedInts}
 *         
SumPayLength --> {@link DataOutput#writeVInt VInt}
 *         
PayData --> {@link DataOutput#writeByte byte}^SumPayLength
 *         
Footer --> {@link CodecUtil#writeFooter CodecFooter}
 *       
 *       Notes:
 *       

 *         The order of TermPayloads/TermOffsets will be the same as TermPositions, note that
 *             part of payload/offsets are stored in .pos.
 *         
The procedure how PackedPayLengthBlock and PackedOffsetLengthBlock are generated is
 *             the same as PackedFreqBlock in chapter Frequencies and Skip
 *             Data. While PackedStartDeltaBlock follows a same procedure as
 *             PackedDocDeltaBlock.
 *         
PackedPayBlockNum is always equal to PackedPosBlockNum, for the same term. It is also
 *             synonym for PackedOffsetBlockNum.
 *         
SumPayLength is the total length of payloads written within one block, should be the
 *             sum of PayLengths in one packed block.
 *         
PayLength in PackedPayLengthBlock is the length of each payload associated with the
 *             current position.
 *       
 * 
 *
 * @lucene.experimental
 */
public class Lucene50PostingsFormat extends PostingsFormat {

  /**
   * Filename extension for document number, frequencies, and skip data. See chapter: Frequencies and Skip Data
   */
  public static final String DOC_EXTENSION = "doc";

  /** Filename extension for positions. See chapter: Positions */
  public static final String POS_EXTENSION = "pos";

  /**
   * Filename extension for payloads and offsets. See chapter: Payloads and
   * Offsets
   */
  public static final String PAY_EXTENSION = "pay";

  /**
   * Expert: The maximum number of skip levels. Smaller values result in slightly smaller indexes,
   * but slower skipping in big posting lists.
   */
  static final int MAX_SKIP_LEVELS = 10;

  static final String TERMS_CODEC = "Lucene50PostingsWriterTerms";
  static final String DOC_CODEC = "Lucene50PostingsWriterDoc";
  static final String POS_CODEC = "Lucene50PostingsWriterPos";
  static final String PAY_CODEC = "Lucene50PostingsWriterPay";

  // Increment version to change it
  static final int VERSION_START = 0;
  static final int VERSION_IMPACT_SKIP_DATA = 1;
  static final int VERSION_CURRENT = VERSION_IMPACT_SKIP_DATA;

  /** Fixed packed block size, number of integers encoded in a single packed block. */
  // NOTE: must be multiple of 64 because of PackedInts long-aligned encoding/decoding
  public static final int BLOCK_SIZE = 128;

  /** Creates {@code Lucene50PostingsFormat} with default settings. */
  public Lucene50PostingsFormat() {
    super("Lucene50");
  }

  @Override
  public String toString() {
    return getName() + "(blocksize=" + BLOCK_SIZE + ")";
  }

  @Override
  public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
    throw new UnsupportedOperationException("Old formats can't be used for writing");
  }

  @Override
  public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
    PostingsReaderBase postingsReader = new Lucene50PostingsReader(state);
    boolean success = false;
    try {
      FieldsProducer ret = new Lucene40BlockTreeTermsReader(postingsReader, state);
      success = true;
      return ret;
    } finally {
      if (!success) {
        IOUtils.closeWhileHandlingException(postingsReader);
      }
    }
  }

  /**
   * Holds all state required for {@link Lucene50PostingsReader} to produce a {@link
   * org.apache.lucene.index.PostingsEnum} without re-seeking the terms dict.
   *
   * @lucene.internal
   */
  public static final class IntBlockTermState extends BlockTermState {
    /** file pointer to the start of the doc ids enumeration, in {@link #DOC_EXTENSION} file */
    public long docStartFP;
    /** file pointer to the start of the positions enumeration, in {@link #POS_EXTENSION} file */
    public long posStartFP;
    /** file pointer to the start of the payloads enumeration, in {@link #PAY_EXTENSION} file */
    public long payStartFP;
    /**
     * file offset for the start of the skip list, relative to docStartFP, if there are more than
     * {@link #BLOCK_SIZE} docs; otherwise -1
     */
    public long skipOffset;
    /**
     * file offset for the last position in the last block, if there are more than {@link
     * #BLOCK_SIZE} positions; otherwise -1
     */
    public long lastPosBlockOffset;
    /**
     * docid when there is a single pulsed posting, otherwise -1. freq is always implicitly
     * totalTermFreq in this case.
     */
    public int singletonDocID;

    /** Sole constructor. */
    public IntBlockTermState() {
      skipOffset = -1;
      lastPosBlockOffset = -1;
      singletonDocID = -1;
    }

    @Override
    public IntBlockTermState clone() {
      IntBlockTermState other = new IntBlockTermState();
      other.copyFrom(this);
      return other;
    }

    @Override
    public void copyFrom(TermState _other) {
      super.copyFrom(_other);
      IntBlockTermState other = (IntBlockTermState) _other;
      docStartFP = other.docStartFP;
      posStartFP = other.posStartFP;
      payStartFP = other.payStartFP;
      lastPosBlockOffset = other.lastPosBlockOffset;
      skipOffset = other.skipOffset;
      singletonDocID = other.singletonDocID;
    }

    @Override
    public String toString() {
      return super.toString()
          + " docStartFP="
          + docStartFP
          + " posStartFP="
          + posStartFP
          + " payStartFP="
          + payStartFP
          + " lastPosBlockOffset="
          + lastPosBlockOffset
          + " singletonDocID="
          + singletonDocID;
    }
  }
}