All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.lucene.codecs.memory.FSTTermsWriter Maven / Gradle / Ivy

There is a newer version: 9.11.1
Show newest version
package org.apache.lucene.codecs.memory;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.util.List;
import java.util.ArrayList;
import java.util.Comparator;

import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.store.RAMOutputStream;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.Util;
import org.apache.lucene.codecs.BlockTermState;
import org.apache.lucene.codecs.PostingsWriterBase;
import org.apache.lucene.codecs.PostingsConsumer;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.TermsConsumer;
import org.apache.lucene.codecs.TermStats;
import org.apache.lucene.codecs.CodecUtil;

/**
 * FST-based term dict, using metadata as FST output.
 *
 * The FST directly holds the mapping between <term, metadata>.
 *
 * Term metadata consists of three parts:
 * 1. term statistics: docFreq, totalTermFreq;
 * 2. monotonic long[], e.g. the pointer to the postings list for that term;
 * 3. generic byte[], e.g. other information need by postings reader.
 *
 * 

* File: *

*

* * *

Term Dictionary

*

* The .tst contains a list of FSTs, one for each field. * The FST maps a term to its corresponding statistics (e.g. docfreq) * and metadata (e.g. information for postings list reader like file pointer * to postings list). *

*

* Typically the metadata is separated into two parts: *

    *
  • * Monotonical long array: Some metadata will always be ascending in order * with the corresponding term. This part is used by FST to share outputs between arcs. *
  • *
  • * Generic byte array: Used to store non-monotonic metadata. *
  • *
*

* * File format: *
    *
  • TermsDict(.tst) --> Header, PostingsHeader, FieldSummary, DirOffset
  • *
  • FieldSummary --> NumFields, <FieldNumber, NumTerms, SumTotalTermFreq?, * SumDocFreq, DocCount, LongsSize, TermFST >NumFields
  • *
  • TermFST --> {@link FST FST<TermData>}
  • *
  • TermData --> Flag, BytesSize?, LongDeltaLongsSize?, ByteBytesSize?, * < DocFreq[Same?], (TotalTermFreq-DocFreq) > ?
  • *
  • Header --> {@link CodecUtil#writeHeader CodecHeader}
  • *
  • DirOffset --> {@link DataOutput#writeLong Uint64}
  • *
  • DocFreq, LongsSize, BytesSize, NumFields, * FieldNumber, DocCount --> {@link DataOutput#writeVInt VInt}
  • *
  • TotalTermFreq, NumTerms, SumTotalTermFreq, SumDocFreq, LongDelta --> * {@link DataOutput#writeVLong VLong}
  • *
*

Notes:

*
    *
  • * The format of PostingsHeader and generic meta bytes are customized by the specific postings implementation: * they contain arbitrary per-file data (such as parameters or versioning information), and per-term data * (non-monotonic ones like pulsed postings data). *
  • *
  • * The format of TermData is determined by FST, typically monotonic metadata will be dense around shallow arcs, * while in deeper arcs only generic bytes and term statistics exist. *
  • *
  • * The byte Flag is used to indicate which part of metadata exists on current arc. Specially the monotonic part * is omitted when it is an array of 0s. *
  • *
  • * Since LongsSize is per-field fixed, it is only written once in field summary. *
  • *
* * @lucene.experimental */ public class FSTTermsWriter extends FieldsConsumer { static final String TERMS_EXTENSION = "tmp"; static final String TERMS_CODEC_NAME = "FST_TERMS_DICT"; public static final int TERMS_VERSION_START = 0; public static final int TERMS_VERSION_CURRENT = TERMS_VERSION_START; final PostingsWriterBase postingsWriter; final FieldInfos fieldInfos; final IndexOutput out; final List fields = new ArrayList(); public FSTTermsWriter(SegmentWriteState state, PostingsWriterBase postingsWriter) throws IOException { final String termsFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_EXTENSION); this.postingsWriter = postingsWriter; this.fieldInfos = state.fieldInfos; this.out = state.directory.createOutput(termsFileName, state.context); boolean success = false; try { writeHeader(out); this.postingsWriter.init(out); success = true; } finally { if (!success) { IOUtils.closeWhileHandlingException(out); } } } private void writeHeader(IndexOutput out) throws IOException { CodecUtil.writeHeader(out, TERMS_CODEC_NAME, TERMS_VERSION_CURRENT); } private void writeTrailer(IndexOutput out, long dirStart) throws IOException { out.writeLong(dirStart); } @Override public TermsConsumer addField(FieldInfo field) throws IOException { return new TermsWriter(field); } @Override public void close() throws IOException { IOException ioe = null; try { // write field summary final long dirStart = out.getFilePointer(); out.writeVInt(fields.size()); for (FieldMetaData field : fields) { out.writeVInt(field.fieldInfo.number); out.writeVLong(field.numTerms); if (field.fieldInfo.getIndexOptions() != IndexOptions.DOCS_ONLY) { out.writeVLong(field.sumTotalTermFreq); } out.writeVLong(field.sumDocFreq); out.writeVInt(field.docCount); out.writeVInt(field.longsSize); field.dict.save(out); } writeTrailer(out, dirStart); } catch (IOException ioe2) { ioe = ioe2; } finally { IOUtils.closeWhileHandlingException(ioe, out, postingsWriter); } } private static class FieldMetaData { public final FieldInfo fieldInfo; public final long numTerms; public final long sumTotalTermFreq; public final long sumDocFreq; public final int docCount; public final int longsSize; public final FST dict; public FieldMetaData(FieldInfo fieldInfo, long numTerms, long sumTotalTermFreq, long sumDocFreq, int docCount, int longsSize, FST fst) { this.fieldInfo = fieldInfo; this.numTerms = numTerms; this.sumTotalTermFreq = sumTotalTermFreq; this.sumDocFreq = sumDocFreq; this.docCount = docCount; this.longsSize = longsSize; this.dict = fst; } } final class TermsWriter extends TermsConsumer { private final Builder builder; private final FSTTermOutputs outputs; private final FieldInfo fieldInfo; private final int longsSize; private long numTerms; private final IntsRef scratchTerm = new IntsRef(); private final RAMOutputStream statsWriter = new RAMOutputStream(); private final RAMOutputStream metaWriter = new RAMOutputStream(); TermsWriter(FieldInfo fieldInfo) { this.numTerms = 0; this.fieldInfo = fieldInfo; this.longsSize = postingsWriter.setField(fieldInfo); this.outputs = new FSTTermOutputs(fieldInfo, longsSize); this.builder = new Builder(FST.INPUT_TYPE.BYTE1, outputs); } @Override public Comparator getComparator() { return BytesRef.getUTF8SortedAsUnicodeComparator(); } @Override public PostingsConsumer startTerm(BytesRef text) throws IOException { postingsWriter.startTerm(); return postingsWriter; } @Override public void finishTerm(BytesRef text, TermStats stats) throws IOException { // write term meta data into fst final BlockTermState state = postingsWriter.newTermState(); final FSTTermOutputs.TermData meta = new FSTTermOutputs.TermData(); meta.longs = new long[longsSize]; meta.bytes = null; meta.docFreq = state.docFreq = stats.docFreq; meta.totalTermFreq = state.totalTermFreq = stats.totalTermFreq; postingsWriter.finishTerm(state); postingsWriter.encodeTerm(meta.longs, metaWriter, fieldInfo, state, true); final int bytesSize = (int)metaWriter.getFilePointer(); if (bytesSize > 0) { meta.bytes = new byte[bytesSize]; metaWriter.writeTo(meta.bytes, 0); metaWriter.reset(); } builder.add(Util.toIntsRef(text, scratchTerm), meta); numTerms++; } @Override public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException { // save FST dict if (numTerms > 0) { final FST fst = builder.finish(); fields.add(new FieldMetaData(fieldInfo, numTerms, sumTotalTermFreq, sumDocFreq, docCount, longsSize, fst)); } } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy