/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.memory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;
/**
* FST-based term dict, using ord as FST output.
*
 * The FST holds the mapping between &lt;term, ord&gt;, and the
 * term's metadata is delta-encoded into a single byte block
 * (see the build sketch after the following list).
*
 * Typically the byte block consists of four parts:
 * 1. term statistics: docFreq, totalTermFreq;
 * 2. monotonic long[], e.g. the pointer to the postings list for that term;
 * 3. generic byte[], e.g. other information customized by the postings implementation;
 * 4. single-level skip list to speed up metadata decoding by ord.
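 *
 * For illustration, here is a minimal sketch (not part of this class) of building such a
 * term -&gt; ord FST with Lucene's FST API; the literal terms and variable names are made up:
 * <pre>{@code
 * PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
 * Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
 * IntsRefBuilder scratch = new IntsRefBuilder();
 * long ord = 0;
 * // terms must be added in sorted order, as they are during a segment flush
 * for (BytesRef term : new BytesRef[] {
 *     new BytesRef("bar"), new BytesRef("baz"), new BytesRef("foo")}) {
 *   builder.add(Util.toIntsRef(term, scratch), ord++);
 * }
 * FST<Long> fst = builder.finish();
 * }</pre>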
 *
* Files:
*
* - .tix: Term Index
* - .tbk: Term Block
 *
* Term Index
*
 * The .tix contains a list of FSTs, one for each field.
 * Each FST maps a term to its ord (its position in the sorted term list) in the current field.
 *
 * (In the grammar below, X^N denotes N copies of X, ? marks an optional item, and
 * &lt; ... &gt; groups items that repeat together.)
 *
 * - TermIndex(.tix) --> Header, TermFST^NumFields, Footer
 * - TermFST --> {@link FST FST&lt;Long&gt;}
 * - Header --> {@link CodecUtil#writeIndexHeader IndexHeader}
 * - Footer --> {@link CodecUtil#writeFooter CodecFooter}
 *
* Notes:
*
 * - Since terms are already sorted before they are written to the Term Block,
 *   their ords can be used directly to seek term metadata in the Term Block.
 *
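 * On the read side (a hedged sketch with assumed variable names, not this codec's actual
 * reader code), the same FST answers lookups in both directions:
 * <pre>{@code
 * // term -> ord: exact-match lookup; null if the term is absent
 * Long ord = Util.get(fst, new BytesRef("foo"));
 * // ord -> term: walk the FST by output, valid because ords are monotonically increasing
 * IntsRef term = Util.getByOutput(fst, 2);  // ord 2 is "foo" in the sketch above
 * }</pre>
 *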
* Term Block
*
 * The .tbk contains all the statistics and metadata for terms, along with a field summary
 * (per-field data such as the number of documents in the current field). For each field, there are four blocks:
*
* - statistics bytes block: contains term statistics;
* - metadata longs block: delta-encodes monotonic part of metadata;
* - metadata bytes block: encodes other parts of metadata;
 * - skip block: contains skip data, to speed up metadata seeking and decoding.
 *
* File Format:
*
 * - TermBlock(.tbk) --> Header, PostingsHeader, FieldSummary, DirOffset, Footer
 * - FieldSummary --> NumFields, &lt; FieldNumber, NumTerms, SumTotalTermFreq?, SumDocFreq,
 *   DocCount, LongsSize, DataBlock &gt;^NumFields
 * - DataBlock --> StatsBlockLength, MetaLongsBlockLength, MetaBytesBlockLength,
 *   SkipBlock, StatsBlock, MetaLongsBlock, MetaBytesBlock
 * - SkipBlock --> &lt; StatsFPDelta, MetaLongsSkipFPDelta, MetaBytesSkipFPDelta,
 *   MetaLongsSkipDelta^LongsSize &gt;^NumTerms
 * - StatsBlock --> &lt; DocFreq[Same?], (TotalTermFreq-DocFreq)? &gt;^NumTerms
 * - MetaLongsBlock --> &lt; LongDelta^LongsSize, BytesSize &gt;^NumTerms
 * - MetaBytesBlock --> Byte^MetaBytesBlockLength
 * - Header --> {@link CodecUtil#writeIndexHeader IndexHeader}
 * - DirOffset --> {@link DataOutput#writeLong Uint64}
 * - NumFields, FieldNumber, DocCount, DocFreq, LongsSize --> {@link DataOutput#writeVInt VInt}
 * - NumTerms, SumTotalTermFreq, SumDocFreq, StatsBlockLength, MetaLongsBlockLength,
 *   MetaBytesBlockLength, StatsFPDelta, MetaLongsSkipFPDelta, MetaBytesSkipFPDelta,
 *   MetaLongsSkipDelta, TotalTermFreq, LongDelta --> {@link DataOutput#writeVLong VLong}
 * - Footer --> {@link CodecUtil#writeFooter CodecFooter}
*
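 * The VInt/VLong encodings referenced above are Lucene's variable-length integer formats:
 * 7 bits of payload per byte, with the high bit as a continuation flag. A standalone
 * sketch, using {@link org.apache.lucene.store.ByteArrayDataOutput} (buffer size and
 * values are arbitrary):
 * <pre>{@code
 * byte[] buffer = new byte[16];
 * ByteArrayDataOutput out = new ByteArrayDataOutput(buffer);
 * out.writeVInt(127);   // 0x7F: fits in one byte
 * out.writeVInt(128);   // 0x80 0x01: needs two bytes
 * assert out.getPosition() == 3;
 * }</pre>
 *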
* Notes:
*
 * - The format of PostingsHeader and MetaBytes is customized by the specific postings
 *   implementation: they hold arbitrary per-file data (such as parameters or versioning
 *   information) and per-term data (non-monotonic data, like pulsed postings data).
 *
 * - During initialization the reader loads all the blocks into memory and decodes the
 *   SkipBlock up front, so that on a seek the term dict can look up file pointers directly.
 *   StatsFPDelta, MetaLongsSkipFPDelta, etc. are the file offsets for every SKIP_INTERVAL-th
 *   term, each stored as a delta from the previous entry. MetaLongsSkipDelta likewise records,
 *   for every SKIP_INTERVAL-th term, the change in the metadata longs since the previous
 *   skip point.
 *
 * - DocFreq is the count of documents containing the term, and TotalTermFreq is the total
 *   number of occurrences of the term. These two values are usually equal for long-tail terms,
 *   so one bit is stolen from DocFreq to flag that case and the encoding of TotalTermFreq
 *   can be omitted (see the sketch after these notes).
*
*
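 * A minimal sketch of the stats encoding described in the last note, mirroring the
 * encode path in {@code finishTerm} below (this is the branch taken when the field
 * indexes term frequencies); the decode side and the {@code out}/{@code in} streams
 * (a DataOutput/DataInput pair) are assumed for illustration:
 * <pre>{@code
 * // encode: the low bit of DocFreq flags docFreq == totalTermFreq
 * if (totalTermFreq == docFreq) {
 *   out.writeVInt((docFreq << 1) | 1);        // flag set: TotalTermFreq omitted
 * } else {
 *   out.writeVInt(docFreq << 1);              // flag clear
 *   out.writeVLong(totalTermFreq - docFreq);  // always > 0 here
 * }
 *
 * // decode
 * int code = in.readVInt();
 * int docFreq = code >>> 1;
 * long totalTermFreq = ((code & 1) != 0) ? docFreq : docFreq + in.readVLong();
 * }</pre>
 *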
* @lucene.experimental
*/
public class FSTOrdTermsWriter extends FieldsConsumer {
static final String TERMS_INDEX_EXTENSION = "tix";
static final String TERMS_BLOCK_EXTENSION = "tbk";
static final String TERMS_CODEC_NAME = "FSTOrdTerms";
static final String TERMS_INDEX_CODEC_NAME = "FSTOrdIndex";
public static final int VERSION_START = 2;
public static final int VERSION_CURRENT = VERSION_START;
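  // one skip entry is buffered for every SKIP_INTERVAL-th term (see TermsWriter#bufferSkip)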
public static final int SKIP_INTERVAL = 8;
final PostingsWriterBase postingsWriter;
final FieldInfos fieldInfos;
final int maxDoc;
  final List<FieldMetaData> fields = new ArrayList<>();
IndexOutput blockOut = null;
IndexOutput indexOut = null;
public FSTOrdTermsWriter(SegmentWriteState state, PostingsWriterBase postingsWriter) throws IOException {
final String termsIndexFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_INDEX_EXTENSION);
final String termsBlockFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_BLOCK_EXTENSION);
this.postingsWriter = postingsWriter;
this.fieldInfos = state.fieldInfos;
this.maxDoc = state.segmentInfo.maxDoc();
boolean success = false;
try {
this.indexOut = state.directory.createOutput(termsIndexFileName, state.context);
this.blockOut = state.directory.createOutput(termsBlockFileName, state.context);
CodecUtil.writeIndexHeader(indexOut, TERMS_INDEX_CODEC_NAME, VERSION_CURRENT,
state.segmentInfo.getId(), state.segmentSuffix);
CodecUtil.writeIndexHeader(blockOut, TERMS_CODEC_NAME, VERSION_CURRENT,
state.segmentInfo.getId(), state.segmentSuffix);
this.postingsWriter.init(blockOut, state);
success = true;
} finally {
if (!success) {
IOUtils.closeWhileHandlingException(indexOut, blockOut);
}
}
}
@Override
public void write(Fields fields) throws IOException {
for(String field : fields) {
Terms terms = fields.terms(field);
if (terms == null) {
continue;
}
FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
boolean hasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
TermsEnum termsEnum = terms.iterator();
TermsWriter termsWriter = new TermsWriter(fieldInfo);
long sumTotalTermFreq = 0;
long sumDocFreq = 0;
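      // one bit per document: tracks docs seen for this field, for the docCount statistic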
FixedBitSet docsSeen = new FixedBitSet(maxDoc);
while (true) {
BytesRef term = termsEnum.next();
if (term == null) {
break;
}
BlockTermState termState = postingsWriter.writeTerm(term, termsEnum, docsSeen);
if (termState != null) {
termsWriter.finishTerm(term, termState);
sumTotalTermFreq += termState.totalTermFreq;
sumDocFreq += termState.docFreq;
}
}
termsWriter.finish(hasFreq ? sumTotalTermFreq : -1, sumDocFreq, docsSeen.cardinality());
}
}
@Override
public void close() throws IOException {
if (blockOut != null) {
boolean success = false;
try {
final long blockDirStart = blockOut.getFilePointer();
// write field summary
blockOut.writeVInt(fields.size());
for (FieldMetaData field : fields) {
blockOut.writeVInt(field.fieldInfo.number);
blockOut.writeVLong(field.numTerms);
if (field.fieldInfo.getIndexOptions() != IndexOptions.DOCS) {
blockOut.writeVLong(field.sumTotalTermFreq);
}
blockOut.writeVLong(field.sumDocFreq);
blockOut.writeVInt(field.docCount);
blockOut.writeVInt(field.longsSize);
blockOut.writeVLong(field.statsOut.getFilePointer());
blockOut.writeVLong(field.metaLongsOut.getFilePointer());
blockOut.writeVLong(field.metaBytesOut.getFilePointer());
field.skipOut.writeTo(blockOut);
field.statsOut.writeTo(blockOut);
field.metaLongsOut.writeTo(blockOut);
field.metaBytesOut.writeTo(blockOut);
field.dict.save(indexOut);
}
writeTrailer(blockOut, blockDirStart);
CodecUtil.writeFooter(indexOut);
CodecUtil.writeFooter(blockOut);
success = true;
} finally {
if (success) {
IOUtils.close(blockOut, indexOut, postingsWriter);
} else {
IOUtils.closeWhileHandlingException(blockOut, indexOut, postingsWriter);
}
blockOut = null;
}
}
}
private void writeTrailer(IndexOutput out, long dirStart) throws IOException {
out.writeLong(dirStart);
}
private static class FieldMetaData {
public FieldInfo fieldInfo;
public long numTerms;
public long sumTotalTermFreq;
public long sumDocFreq;
public int docCount;
public int longsSize;
    public FST<Long> dict;
// TODO: block encode each part
// vint encode next skip point (fully decoded when reading)
public RAMOutputStream skipOut;
// vint encode df, (ttf-df)
public RAMOutputStream statsOut;
// vint encode monotonic long[] and length for corresponding byte[]
public RAMOutputStream metaLongsOut;
// generic byte[]
public RAMOutputStream metaBytesOut;
}
final class TermsWriter {
    private final Builder<Long> builder;
private final PositiveIntOutputs outputs;
private final FieldInfo fieldInfo;
private final int longsSize;
private long numTerms;
private final IntsRefBuilder scratchTerm = new IntsRefBuilder();
private final RAMOutputStream statsOut = new RAMOutputStream();
private final RAMOutputStream metaLongsOut = new RAMOutputStream();
private final RAMOutputStream metaBytesOut = new RAMOutputStream();
private final RAMOutputStream skipOut = new RAMOutputStream();
private long lastBlockStatsFP;
private long lastBlockMetaLongsFP;
private long lastBlockMetaBytesFP;
private long[] lastBlockLongs;
private long[] lastLongs;
private long lastMetaBytesFP;
TermsWriter(FieldInfo fieldInfo) {
this.numTerms = 0;
this.fieldInfo = fieldInfo;
this.longsSize = postingsWriter.setField(fieldInfo);
this.outputs = PositiveIntOutputs.getSingleton();
this.builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
this.lastBlockStatsFP = 0;
this.lastBlockMetaLongsFP = 0;
this.lastBlockMetaBytesFP = 0;
this.lastBlockLongs = new long[longsSize];
this.lastLongs = new long[longsSize];
this.lastMetaBytesFP = 0;
}
public void finishTerm(BytesRef text, BlockTermState state) throws IOException {
if (numTerms > 0 && numTerms % SKIP_INTERVAL == 0) {
bufferSkip();
}
      // buffer this term's stats and metadata; the term itself goes into the FST below, with ord as output
      final long[] longs = new long[longsSize];
final long delta = state.totalTermFreq - state.docFreq;
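      // the low bit of docFreq flags whether totalTermFreq == docFreq (delta == 0),
      // so the extra vlong can be omitted for the common long-tail case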
if (state.totalTermFreq > 0) {
if (delta == 0) {
statsOut.writeVInt(state.docFreq<<1|1);
} else {
statsOut.writeVInt(state.docFreq<<1);
statsOut.writeVLong(state.totalTermFreq-state.docFreq);
}
} else {
statsOut.writeVInt(state.docFreq);
}
postingsWriter.encodeTerm(longs, metaBytesOut, fieldInfo, state, true);
for (int i = 0; i < longsSize; i++) {
metaLongsOut.writeVLong(longs[i] - lastLongs[i]);
lastLongs[i] = longs[i];
}
metaLongsOut.writeVLong(metaBytesOut.getFilePointer() - lastMetaBytesFP);
builder.add(Util.toIntsRef(text, scratchTerm), numTerms);
numTerms++;
lastMetaBytesFP = metaBytesOut.getFilePointer();
}
public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException {
if (numTerms > 0) {
final FieldMetaData metadata = new FieldMetaData();
metadata.fieldInfo = fieldInfo;
metadata.numTerms = numTerms;
metadata.sumTotalTermFreq = sumTotalTermFreq;
metadata.sumDocFreq = sumDocFreq;
metadata.docCount = docCount;
metadata.longsSize = longsSize;
metadata.skipOut = skipOut;
metadata.statsOut = statsOut;
metadata.metaLongsOut = metaLongsOut;
metadata.metaBytesOut = metaBytesOut;
metadata.dict = builder.finish();
fields.add(metadata);
}
}
private void bufferSkip() throws IOException {
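      // invoked every SKIP_INTERVAL terms: record each stream's file-pointer delta and the
      // change in metadata longs since the previous skip point, so a reader can jump close
      // to any ord and decode forward from there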
skipOut.writeVLong(statsOut.getFilePointer() - lastBlockStatsFP);
skipOut.writeVLong(metaLongsOut.getFilePointer() - lastBlockMetaLongsFP);
skipOut.writeVLong(metaBytesOut.getFilePointer() - lastBlockMetaBytesFP);
for (int i = 0; i < longsSize; i++) {
skipOut.writeVLong(lastLongs[i] - lastBlockLongs[i]);
}
lastBlockStatsFP = statsOut.getFilePointer();
lastBlockMetaLongsFP = metaLongsOut.getFilePointer();
lastBlockMetaBytesFP = metaBytesOut.getFilePointer();
System.arraycopy(lastLongs, 0, lastBlockLongs, 0, longsSize);
}
}
}