/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.memory;


import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.NormsProducer;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;

/** 
 * FST-based term dict, using ord as FST output.
 *
 * The FST holds the mapping between &lt;term, ord&gt;, and
 * term's metadata is delta encoded into a single byte block.
 *
 * Typically the byte block consists of four parts:
 * 1. term statistics: docFreq, totalTermFreq;
 * 2. monotonic long[], e.g. the pointer to the postings list for that term;
 * 3. generic byte[], e.g. other information customized by postings base.
 * 4. single-level skip list to speed up metadata decoding by ord.
 *
 *
 * <p>
 * Files:
 * <ul>
 *  <li><tt>.tix</tt>: <a href="#Termindex">Term Index</a></li>
 *  <li><tt>.tbk</tt>: <a href="#Termblock">Term Block</a></li>
 * </ul>
 *
 * <a name="Termindex"></a>
 * <h3>Term Index</h3>
 * <p>
 *  The .tix contains a list of FSTs, one for each field.
 *  The FST maps a term to its corresponding ord in the current field.
 * </p>
 * <ul>
 *  <li>TermIndex(.tix) --&gt; Header, TermFST<sup>NumFields</sup>, Footer</li>
 *  <li>TermFST --&gt; {@link FST FST&lt;long&gt;}</li>
 *  <li>Header --&gt; {@link CodecUtil#writeIndexHeader IndexHeader}</li>
 *  <li>Footer --&gt; {@link CodecUtil#writeFooter CodecFooter}</li>
 * </ul>
 *
 * <p>Notes:</p>
 * <ul>
 *  <li>
 *   Since terms are already sorted before writing to <a href="#Termblock">Term Block</a>,
 *   their ords can be directly used to seek term metadata from the term block.
 *  </li>
 * </ul>
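 * <p>
 * For illustration only (not part of the format spec): once a field's FST has been
 * loaded, a term's ord can be resolved with Lucene's FST utilities. A minimal
 * sketch, assuming {@code fst} is the field's loaded FST; the real seek logic
 * lives in FSTOrdTermsReader:
 * <pre>{@code
 * Long ord = Util.get(fst, new BytesRef("lucene"));
 * if (ord != null) {
 *   // ord indexes directly into this field's stats/metadata blocks
 * }
 * }</pre>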

 * <a name="Termblock"></a>
 * <h3>Term Block</h3>
 * <p>
 *  The .tbk contains all the statistics and metadata for terms, along with a field summary (e.g.
 *  per-field data like the number of documents in the current field). For each field, there are four blocks:
 * </p>
 * <ul>
 *  <li>statistics bytes block: contains term statistics;</li>
 *  <li>metadata longs block: delta-encodes the monotonic part of the metadata;</li>
 *  <li>metadata bytes block: encodes other parts of the metadata;</li>
 *  <li>skip block: contains skip data, to speed up metadata seeking and decoding</li>
 * </ul>
 *
 * <p>File Format:</p>
 * <ul>
 *  <li>TermBlock(.tbk) --&gt; Header, <i>PostingsHeader</i>, FieldSummary, DirOffset</li>
 *  <li>FieldSummary --&gt; NumFields, &lt;FieldNumber, NumTerms, SumTotalTermFreq?, SumDocFreq,
 *      DocCount, LongsSize, DataBlock&gt;<sup>NumFields</sup>, Footer</li>
 *  <li>DataBlock --&gt; StatsBlockLength, MetaLongsBlockLength, MetaBytesBlockLength,
 *      SkipBlock, StatsBlock, MetaLongsBlock, MetaBytesBlock</li>
 *  <li>SkipBlock --&gt; &lt;StatsFPDelta, MetaLongsSkipFPDelta, MetaBytesSkipFPDelta,
 *      MetaLongsSkipDelta<sup>LongsSize</sup>&gt;<sup>NumTerms</sup></li>
 *  <li>StatsBlock --&gt; &lt;DocFreq[Same?], (TotalTermFreq-DocFreq)?&gt;<sup>NumTerms</sup></li>
 *  <li>MetaLongsBlock --&gt; &lt;LongDelta<sup>LongsSize</sup>, BytesSize&gt;<sup>NumTerms</sup></li>
 *  <li>MetaBytesBlock --&gt; Byte<sup>MetaBytesBlockLength</sup></li>
 *  <li>Header --&gt; {@link CodecUtil#writeIndexHeader IndexHeader}</li>
 *  <li>DirOffset --&gt; {@link DataOutput#writeLong Uint64}</li>
 *  <li>NumFields, FieldNumber, DocCount, DocFreq, LongsSize --&gt; {@link DataOutput#writeVInt VInt}</li>
 *  <li>NumTerms, SumTotalTermFreq, SumDocFreq, StatsBlockLength, MetaLongsBlockLength, MetaBytesBlockLength,
 *      StatsFPDelta, MetaLongsSkipFPDelta, MetaBytesSkipFPDelta, MetaLongsSkipDelta, TotalTermFreq,
 *      LongDelta --&gt; {@link DataOutput#writeVLong VLong}</li>
 *  <li>Footer --&gt; {@link CodecUtil#writeFooter CodecFooter}</li>
 * </ul>
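 * <p>
 * As a sketch of the MetaLongsBlock encoding above (this mirrors the logic in
 * {@code TermsWriter#finishTerm} below): each of the LongsSize metadata longs is
 * written as a delta against the previous term's value, followed by the size of
 * the term's slice of the MetaBytesBlock:
 * <pre>{@code
 * for (int i = 0; i < longsSize; i++) {
 *   metaLongsOut.writeVLong(longs[i] - lastLongs[i]); // LongDelta
 *   lastLongs[i] = longs[i];
 * }
 * // BytesSize: length of this term's generic metadata bytes
 * metaLongsOut.writeVLong(metaBytesOut.getFilePointer() - lastMetaBytesFP);
 * }</pre>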

 * <p>Notes:</p>
 * <ul>
 *  <li>
 *   The format of PostingsHeader and MetaBytes is customized by the specific postings implementation:
 *   they contain arbitrary per-file data (such as parameters or versioning information), and per-term data
 *   (non-monotonic values like pulsed postings data).
 *  </li>
 *  <li>
 *   During initialization the reader loads all the blocks into memory. SkipBlock is decoded, so that during a seek
 *   the term dict can look up file pointers directly. StatsFPDelta, MetaLongsSkipFPDelta, etc. are the file offsets
 *   for every SkipInterval'th term. MetaLongsSkipDelta is the difference from the previous one, which gives
 *   the value of the preceding metadata longs for every SkipInterval'th term.
 *  </li>
 *  <li>
 *   DocFreq is the count of documents containing the term. TotalTermFreq is the total number of occurrences of the term.
 *   These two values are usually equal for long-tail terms, so one bit is stolen from DocFreq to flag that case,
 *   allowing the encoding of TotalTermFreq to be omitted.
 *  </li>
 * </ul>
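 * <p>
 * A sketch of that StatsBlock trick (mirroring {@code TermsWriter#finishTerm} below):
 * the low bit of the written value flags the {@code totalTermFreq == docFreq} case.
 * <pre>{@code
 * if (totalTermFreq == docFreq) {
 *   statsOut.writeVInt(docFreq << 1 | 1);  // Same? bit set: no extra VLong
 * } else {
 *   statsOut.writeVInt(docFreq << 1);
 *   statsOut.writeVLong(totalTermFreq - docFreq);
 * }
 * }</pre>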
 *
 * @lucene.experimental
 */
public class FSTOrdTermsWriter extends FieldsConsumer {
  static final String TERMS_INDEX_EXTENSION = "tix";
  static final String TERMS_BLOCK_EXTENSION = "tbk";
  static final String TERMS_CODEC_NAME = "FSTOrdTerms";
  static final String TERMS_INDEX_CODEC_NAME = "FSTOrdIndex";

  public static final int VERSION_START = 2;
  public static final int VERSION_CURRENT = VERSION_START;
  public static final int SKIP_INTERVAL = 8;

  final PostingsWriterBase postingsWriter;
  final FieldInfos fieldInfos;
  final int maxDoc;
  final List<FieldMetaData> fields = new ArrayList<>();
  IndexOutput blockOut = null;
  IndexOutput indexOut = null;

  public FSTOrdTermsWriter(SegmentWriteState state, PostingsWriterBase postingsWriter) throws IOException {
    final String termsIndexFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_INDEX_EXTENSION);
    final String termsBlockFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_BLOCK_EXTENSION);

    this.postingsWriter = postingsWriter;
    this.fieldInfos = state.fieldInfos;
    this.maxDoc = state.segmentInfo.maxDoc();

    boolean success = false;
    try {
      this.indexOut = state.directory.createOutput(termsIndexFileName, state.context);
      this.blockOut = state.directory.createOutput(termsBlockFileName, state.context);
      CodecUtil.writeIndexHeader(indexOut, TERMS_INDEX_CODEC_NAME, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
      CodecUtil.writeIndexHeader(blockOut, TERMS_CODEC_NAME, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
      this.postingsWriter.init(blockOut, state);
      success = true;
    } finally {
      if (!success) {
        IOUtils.closeWhileHandlingException(indexOut, blockOut);
      }
    }
  }
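  // Usage sketch (illustrative only, not part of this class): a PostingsFormat's
  // fieldsConsumer() would typically pair this writer with a PostingsWriterBase
  // delegate, which close() below also closes:
  //
  //   PostingsWriterBase pw = ...;  // the codec's postings writer (assumed)
  //   FieldsConsumer consumer = new FSTOrdTermsWriter(state, pw);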
  @Override
  public void write(Fields fields, NormsProducer norms) throws IOException {
    for (String field : fields) {
      Terms terms = fields.terms(field);
      if (terms == null) {
        continue;
      }
      FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
      boolean hasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
      TermsEnum termsEnum = terms.iterator();
      TermsWriter termsWriter = new TermsWriter(fieldInfo);

      long sumTotalTermFreq = 0;
      long sumDocFreq = 0;
      FixedBitSet docsSeen = new FixedBitSet(maxDoc);

      while (true) {
        BytesRef term = termsEnum.next();
        if (term == null) {
          break;
        }
        BlockTermState termState = postingsWriter.writeTerm(term, termsEnum, docsSeen, norms);
        if (termState != null) {
          termsWriter.finishTerm(term, termState);
          sumTotalTermFreq += termState.totalTermFreq;
          sumDocFreq += termState.docFreq;
        }
      }

      termsWriter.finish(hasFreq ? sumTotalTermFreq : -1, sumDocFreq, docsSeen.cardinality());
    }
  }

  @Override
  public void close() throws IOException {
    if (blockOut != null) {
      boolean success = false;
      try {
        final long blockDirStart = blockOut.getFilePointer();

        // write field summary
        blockOut.writeVInt(fields.size());
        for (FieldMetaData field : fields) {
          blockOut.writeVInt(field.fieldInfo.number);
          blockOut.writeVLong(field.numTerms);
          if (field.fieldInfo.getIndexOptions() != IndexOptions.DOCS) {
            blockOut.writeVLong(field.sumTotalTermFreq);
          }
          blockOut.writeVLong(field.sumDocFreq);
          blockOut.writeVInt(field.docCount);
          blockOut.writeVInt(field.longsSize);
          blockOut.writeVLong(field.statsOut.getFilePointer());
          blockOut.writeVLong(field.metaLongsOut.getFilePointer());
          blockOut.writeVLong(field.metaBytesOut.getFilePointer());
          field.skipOut.writeTo(blockOut);
          field.statsOut.writeTo(blockOut);
          field.metaLongsOut.writeTo(blockOut);
          field.metaBytesOut.writeTo(blockOut);
          field.dict.save(indexOut);
        }
        writeTrailer(blockOut, blockDirStart);
        CodecUtil.writeFooter(indexOut);
        CodecUtil.writeFooter(blockOut);
        success = true;
      } finally {
        if (success) {
          IOUtils.close(blockOut, indexOut, postingsWriter);
        } else {
          IOUtils.closeWhileHandlingException(blockOut, indexOut, postingsWriter);
        }
        blockOut = null;
      }
    }
  }

  private void writeTrailer(IndexOutput out, long dirStart) throws IOException {
    out.writeLong(dirStart);
  }

  private static class FieldMetaData {
    public FieldInfo fieldInfo;
    public long numTerms;
    public long sumTotalTermFreq;
    public long sumDocFreq;
    public int docCount;
    public int longsSize;
    public FST<Long> dict;

    // TODO: block encode each part

    // vint encode next skip point (fully decoded when reading)
    public RAMOutputStream skipOut;
    // vint encode df, (ttf-df)
    public RAMOutputStream statsOut;
    // vint encode monotonic long[] and length for corresponding byte[]
    public RAMOutputStream metaLongsOut;
    // generic byte[]
    public RAMOutputStream metaBytesOut;
  }
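  // Per-field writer: accumulates term stats and metadata into in-memory
  // streams while building the term->ord FST; every SKIP_INTERVAL terms,
  // bufferSkip() records file-pointer and metadata-long deltas so the reader
  // can later decode metadata by ord without scanning a whole block.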
  final class TermsWriter {
    private final Builder<Long> builder;
    private final PositiveIntOutputs outputs;
    private final FieldInfo fieldInfo;
    private final int longsSize;
    private long numTerms;

    private final IntsRefBuilder scratchTerm = new IntsRefBuilder();
    private final RAMOutputStream statsOut = new RAMOutputStream();
    private final RAMOutputStream metaLongsOut = new RAMOutputStream();
    private final RAMOutputStream metaBytesOut = new RAMOutputStream();

    private final RAMOutputStream skipOut = new RAMOutputStream();
    private long lastBlockStatsFP;
    private long lastBlockMetaLongsFP;
    private long lastBlockMetaBytesFP;
    private long[] lastBlockLongs;

    private long[] lastLongs;
    private long lastMetaBytesFP;

    TermsWriter(FieldInfo fieldInfo) {
      this.numTerms = 0;
      this.fieldInfo = fieldInfo;
      this.longsSize = postingsWriter.setField(fieldInfo);
      this.outputs = PositiveIntOutputs.getSingleton();
      this.builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);

      this.lastBlockStatsFP = 0;
      this.lastBlockMetaLongsFP = 0;
      this.lastBlockMetaBytesFP = 0;
      this.lastBlockLongs = new long[longsSize];

      this.lastLongs = new long[longsSize];
      this.lastMetaBytesFP = 0;
    }

    public void finishTerm(BytesRef text, BlockTermState state) throws IOException {
      if (numTerms > 0 && numTerms % SKIP_INTERVAL == 0) {
        bufferSkip();
      }
      // write term meta data into fst
      final long[] longs = new long[longsSize];
      final long delta = state.totalTermFreq - state.docFreq;
      if (state.totalTermFreq > 0) {
        if (delta == 0) {
          statsOut.writeVInt(state.docFreq << 1 | 1);
        } else {
          statsOut.writeVInt(state.docFreq << 1);
          statsOut.writeVLong(state.totalTermFreq - state.docFreq);
        }
      } else {
        statsOut.writeVInt(state.docFreq);
      }
      postingsWriter.encodeTerm(longs, metaBytesOut, fieldInfo, state, true);
      for (int i = 0; i < longsSize; i++) {
        metaLongsOut.writeVLong(longs[i] - lastLongs[i]);
        lastLongs[i] = longs[i];
      }
      metaLongsOut.writeVLong(metaBytesOut.getFilePointer() - lastMetaBytesFP);

      builder.add(Util.toIntsRef(text, scratchTerm), numTerms);
      numTerms++;

      lastMetaBytesFP = metaBytesOut.getFilePointer();
    }

    public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException {
      if (numTerms > 0) {
        final FieldMetaData metadata = new FieldMetaData();
        metadata.fieldInfo = fieldInfo;
        metadata.numTerms = numTerms;
        metadata.sumTotalTermFreq = sumTotalTermFreq;
        metadata.sumDocFreq = sumDocFreq;
        metadata.docCount = docCount;
        metadata.longsSize = longsSize;
        metadata.skipOut = skipOut;
        metadata.statsOut = statsOut;
        metadata.metaLongsOut = metaLongsOut;
        metadata.metaBytesOut = metaBytesOut;
        metadata.dict = builder.finish();
        fields.add(metadata);
      }
    }

    private void bufferSkip() throws IOException {
      skipOut.writeVLong(statsOut.getFilePointer() - lastBlockStatsFP);
      skipOut.writeVLong(metaLongsOut.getFilePointer() - lastBlockMetaLongsFP);
      skipOut.writeVLong(metaBytesOut.getFilePointer() - lastBlockMetaBytesFP);
      for (int i = 0; i < longsSize; i++) {
        skipOut.writeVLong(lastLongs[i] - lastBlockLongs[i]);
      }
      lastBlockStatsFP = statsOut.getFilePointer();
      lastBlockMetaLongsFP = metaLongsOut.getFilePointer();
      lastBlockMetaBytesFP = metaBytesOut.getFilePointer();
      System.arraycopy(lastLongs, 0, lastBlockLongs, 0, longsSize);
    }
  }
}