All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.lucene.codecs.blockterms.VariableGapTermsIndexWriter Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.blockterms;

import java.io.IOException;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.TermStats;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;

/**
 * Selects index terms according to provided pluggable {@link IndexTermSelector}, and stores them in
 * a prefix trie that's loaded entirely in RAM stored as an FST. This terms index only supports
 * unsigned byte term sort order (unicode codepoint order when the bytes are UTF8).
 *
 * @lucene.experimental
 */
public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {
  protected IndexOutput metaOut;
  protected IndexOutput out;

  /** Extension of terms index file */
  static final String TERMS_INDEX_EXTENSION = "tiv";

  /** Extension of terms meta file */
  static final String TERMS_META_EXTENSION = "tmv";

  static final String META_CODEC_NAME = "VariableGapTermsMeta";
  static final String CODEC_NAME = "VariableGapTermsIndex";
  static final int VERSION_START = 4;
  static final int VERSION_CURRENT = VERSION_START;

  @SuppressWarnings("unused")
  private final FieldInfos fieldInfos; // unread

  private final IndexTermSelector policy;

  /**
   * Hook for selecting which terms should be placed in the terms index.
   *
   * 

{@link #newField} is called at the start of each new field, and {@link #isIndexTerm} for * each term in that field. * * @lucene.experimental */ public abstract static class IndexTermSelector { /** * Called sequentially on every term being written, returning true if this term should be * indexed */ public abstract boolean isIndexTerm(BytesRef term, TermStats stats); /** Called when a new field is started. */ public abstract void newField(FieldInfo fieldInfo); } /** Same policy as {@link FixedGapTermsIndexWriter} */ public static final class EveryNTermSelector extends IndexTermSelector { private int count; private final int interval; public EveryNTermSelector(int interval) { this.interval = interval; // First term is first indexed term: count = interval; } @Override public boolean isIndexTerm(BytesRef term, TermStats stats) { if (count >= interval) { count = 1; return true; } else { count++; return false; } } @Override public void newField(FieldInfo fieldInfo) { count = interval; } } /** * Sets an index term when docFreq >= docFreqThresh, or every interval terms. This should * reduce seek time to high docFreq terms. */ public static final class EveryNOrDocFreqTermSelector extends IndexTermSelector { private int count; private final int docFreqThresh; private final int interval; public EveryNOrDocFreqTermSelector(int docFreqThresh, int interval) { this.interval = interval; this.docFreqThresh = docFreqThresh; // First term is first indexed term: count = interval; } @Override public boolean isIndexTerm(BytesRef term, TermStats stats) { if (stats.docFreq() >= docFreqThresh || count >= interval) { count = 1; return true; } else { count++; return false; } } @Override public void newField(FieldInfo fieldInfo) { count = interval; } } // TODO: it'd be nice to let the FST builder prune based // on term count of each node (the prune1/prune2 that it // accepts), and build the index based on that. This // should result in a more compact terms index, more like // a prefix trie than the other selectors, because it // only stores enough leading bytes to get down to N // terms that may complete that prefix. It becomes // "deeper" when terms are dense, and "shallow" when they // are less dense. // // However, it's not easy to make that work this this // API, because that pruning doesn't immediately know on // seeing each term whether that term will be a seek point // or not. It requires some non-causality in the API, ie // only on seeing some number of future terms will the // builder decide which past terms are seek points. // Somehow the API'd need to be able to return a "I don't // know" value, eg like a Future, which only later on is // flipped (frozen) to true or false. // // We could solve this with a 2-pass approach, where the // first pass would build an FSA (no outputs) solely to // determine which prefixes are the 'leaves' in the // pruning. The 2nd pass would then look at this prefix // trie to mark the seek points and build the FST mapping // to the true output. // // But, one downside to this approach is that it'd result // in uneven index term selection. EG with prune1=10, the // resulting index terms could be as frequent as every 10 // terms or as rare as every * 10 (eg 2560), // in the extremes. public VariableGapTermsIndexWriter(SegmentWriteState state, IndexTermSelector policy) throws IOException { fieldInfos = state.fieldInfos; this.policy = policy; final String metaFileName = IndexFileNames.segmentFileName( state.segmentInfo.name, state.segmentSuffix, TERMS_META_EXTENSION); final String indexFileName = IndexFileNames.segmentFileName( state.segmentInfo.name, state.segmentSuffix, TERMS_INDEX_EXTENSION); boolean success = false; try { metaOut = state.directory.createOutput(metaFileName, state.context); out = state.directory.createOutput(indexFileName, state.context); CodecUtil.writeIndexHeader( metaOut, META_CODEC_NAME, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); CodecUtil.writeIndexHeader( out, CODEC_NAME, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); success = true; } finally { if (!success) { IOUtils.closeWhileHandlingException(this); } } } @Override public FieldWriter addField(FieldInfo field, long termsFilePointer) throws IOException { //// System.out.println("VGW: field=" + field.name); policy.newField(field); return new FSTFieldWriter(field, termsFilePointer); } /** * NOTE: if your codec does not sort in unicode code point order, you must override this method, * to simply return indexedTerm.length. */ protected int indexedTermPrefixLength(final BytesRef priorTerm, final BytesRef indexedTerm) { // As long as codec sorts terms in unicode codepoint // order, we can safely strip off the non-distinguishing // suffix to save RAM in the loaded terms index. final int idxTermOffset = indexedTerm.offset; final int priorTermOffset = priorTerm.offset; final int limit = Math.min(priorTerm.length, indexedTerm.length); for (int byteIdx = 0; byteIdx < limit; byteIdx++) { if (priorTerm.bytes[priorTermOffset + byteIdx] != indexedTerm.bytes[idxTermOffset + byteIdx]) { return byteIdx + 1; } } return Math.min(1 + priorTerm.length, indexedTerm.length); } private class FSTFieldWriter extends FieldWriter { private final FSTCompiler fstCompiler; private final PositiveIntOutputs fstOutputs; private final long startTermsFilePointer; final FieldInfo fieldInfo; FST fst; private final BytesRefBuilder lastTerm = new BytesRefBuilder(); private boolean first = true; public FSTFieldWriter(FieldInfo fieldInfo, long termsFilePointer) throws IOException { this.fieldInfo = fieldInfo; fstOutputs = PositiveIntOutputs.getSingleton(); fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, fstOutputs).build(); //// System.out.println("VGW: field=" + fieldInfo.name); // Always put empty string in fstCompiler.add(new IntsRef(), termsFilePointer); startTermsFilePointer = termsFilePointer; } @Override public boolean checkIndexTerm(BytesRef text, TermStats stats) throws IOException { // System.out.println("VGW: index term=" + text.utf8ToString()); // NOTE: we must force the first term per field to be // indexed, in case policy doesn't: if (policy.isIndexTerm(text, stats) || first) { first = false; // System.out.println(" YES"); return true; } else { lastTerm.copyBytes(text); return false; } } private final IntsRefBuilder scratchIntsRef = new IntsRefBuilder(); @Override public void add(BytesRef text, TermStats stats, long termsFilePointer) throws IOException { if (text.length == 0) { // We already added empty string in ctor assert termsFilePointer == startTermsFilePointer; return; } final int lengthSave = text.length; text.length = indexedTermPrefixLength(lastTerm.get(), text); try { fstCompiler.add(Util.toIntsRef(text, scratchIntsRef), termsFilePointer); } finally { text.length = lengthSave; } lastTerm.copyBytes(text); } @Override public void finish(long termsFilePointer) throws IOException { fst = FST.fromFSTReader(fstCompiler.compile(), fstCompiler.getFSTReader()); if (fst != null) { metaOut.writeInt(fieldInfo.number); metaOut.writeVLong(out.getFilePointer()); fst.save(metaOut, out); } } } @Override public void close() throws IOException { try { if (metaOut != null) { metaOut.writeInt(-1); CodecUtil.writeFooter(metaOut); } if (out != null) { CodecUtil.writeFooter(out); } } finally { try { IOUtils.close(out, metaOut); } finally { out = null; metaOut = null; } } } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy