// org.apache.lucene.codecs.blockterms.VariableGapTermsIndexWriter (lucene-codecs)
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.blockterms;
import java.io.IOException;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.TermStats;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;
/**
* Selects index terms according to provided pluggable {@link IndexTermSelector}, and stores them in
* a prefix trie that's loaded entirely in RAM stored as an FST. This terms index only supports
* unsigned byte term sort order (unicode codepoint order when the bytes are UTF8).
*
* @lucene.experimental
*/
public class VariableGapTermsIndexWriter extends TermsIndexWriterBase {

  /** Output for the terms metadata file ({@code .tmv}); set to null once {@link #close} runs. */
  protected IndexOutput metaOut;

  /** Output for the terms index file ({@code .tiv}); set to null once {@link #close} runs. */
  protected IndexOutput out;

  /** Extension of terms index file */
  static final String TERMS_INDEX_EXTENSION = "tiv";

  /** Extension of terms meta file */
  static final String TERMS_META_EXTENSION = "tmv";

  static final String META_CODEC_NAME = "VariableGapTermsMeta";
  static final String CODEC_NAME = "VariableGapTermsIndex";
  static final int VERSION_START = 4;
  static final int VERSION_CURRENT = VERSION_START;

  @SuppressWarnings("unused")
  private final FieldInfos fieldInfos; // unread

  /** Pluggable policy that decides which terms become index (seek) points. */
  private final IndexTermSelector policy;

  /**
   * Hook for selecting which terms should be placed in the terms index.
   *
   * <p>{@link #newField} is called at the start of each new field, and {@link #isIndexTerm} for
   * each term in that field.
   *
   * @lucene.experimental
   */
  public abstract static class IndexTermSelector {
    /**
     * Called sequentially on every term being written, returning true if this term should be
     * indexed
     */
    public abstract boolean isIndexTerm(BytesRef term, TermStats stats);

    /** Called when a new field is started. */
    public abstract void newField(FieldInfo fieldInfo);
  }

  /** Same policy as {@link FixedGapTermsIndexWriter} */
  public static final class EveryNTermSelector extends IndexTermSelector {
    private int count;
    private final int interval;

    /**
     * Creates a selector that indexes every {@code interval}'th term.
     *
     * @param interval gap between indexed terms; the first term of each field is always indexed
     */
    public EveryNTermSelector(int interval) {
      this.interval = interval;
      // First term is first indexed term:
      count = interval;
    }

    @Override
    public boolean isIndexTerm(BytesRef term, TermStats stats) {
      if (count >= interval) {
        count = 1;
        return true;
      } else {
        count++;
        return false;
      }
    }

    @Override
    public void newField(FieldInfo fieldInfo) {
      // Reset so the first term of the new field is indexed:
      count = interval;
    }
  }

  /**
   * Sets an index term when docFreq &gt;= docFreqThresh, or every interval terms. This should
   * reduce seek time to high docFreq terms.
   */
  public static final class EveryNOrDocFreqTermSelector extends IndexTermSelector {
    private int count;
    private final int docFreqThresh;
    private final int interval;

    /**
     * Creates a selector that indexes every {@code interval}'th term, plus any term whose docFreq
     * is at least {@code docFreqThresh}.
     *
     * @param docFreqThresh terms at or above this docFreq are always indexed
     * @param interval gap between indexed terms otherwise
     */
    public EveryNOrDocFreqTermSelector(int docFreqThresh, int interval) {
      this.interval = interval;
      this.docFreqThresh = docFreqThresh;
      // First term is first indexed term:
      count = interval;
    }

    @Override
    public boolean isIndexTerm(BytesRef term, TermStats stats) {
      if (stats.docFreq() >= docFreqThresh || count >= interval) {
        count = 1;
        return true;
      } else {
        count++;
        return false;
      }
    }

    @Override
    public void newField(FieldInfo fieldInfo) {
      // Reset so the first term of the new field is indexed:
      count = interval;
    }
  }

  // TODO: it'd be nice to let the FST builder prune based
  // on term count of each node (the prune1/prune2 that it
  // accepts), and build the index based on that.  This
  // should result in a more compact terms index, more like
  // a prefix trie than the other selectors, because it
  // only stores enough leading bytes to get down to N
  // terms that may complete that prefix.  It becomes
  // "deeper" when terms are dense, and "shallow" when they
  // are less dense.
  //
  // However, it's not easy to make that work with this
  // API, because that pruning doesn't immediately know on
  // seeing each term whether that term will be a seek point
  // or not.  It requires some non-causality in the API, ie
  // only on seeing some number of future terms will the
  // builder decide which past terms are seek points.
  // Somehow the API'd need to be able to return a "I don't
  // know" value, eg like a Future, which only later on is
  // flipped (frozen) to true or false.
  //
  // We could solve this with a 2-pass approach, where the
  // first pass would build an FSA (no outputs) solely to
  // determine which prefixes are the 'leaves' in the
  // pruning.  The 2nd pass would then look at this prefix
  // trie to mark the seek points and build the FST mapping
  // to the true output.
  //
  // But, one downside to this approach is that it'd result
  // in uneven index term selection.  EG with prune1=10, the
  // resulting index terms could be as frequent as every 10
  // terms or as rare as every 256 * 10 (eg 2560) terms,
  // in the extremes.

  /**
   * Creates the writer, opening the terms-meta and terms-index outputs for the given segment and
   * writing their index headers. On any failure, everything opened so far is closed via {@link
   * #close()} (exceptions from the cleanup are suppressed).
   *
   * @param state segment being written; supplies directory, name, suffix and IO context
   * @param policy decides which terms become seek points
   * @throws IOException if either output cannot be created or written
   */
  public VariableGapTermsIndexWriter(SegmentWriteState state, IndexTermSelector policy)
      throws IOException {
    fieldInfos = state.fieldInfos;
    this.policy = policy;
    final String metaFileName =
        IndexFileNames.segmentFileName(
            state.segmentInfo.name, state.segmentSuffix, TERMS_META_EXTENSION);
    final String indexFileName =
        IndexFileNames.segmentFileName(
            state.segmentInfo.name, state.segmentSuffix, TERMS_INDEX_EXTENSION);
    boolean success = false;
    try {
      metaOut = state.directory.createOutput(metaFileName, state.context);
      out = state.directory.createOutput(indexFileName, state.context);
      CodecUtil.writeIndexHeader(
          metaOut,
          META_CODEC_NAME,
          VERSION_CURRENT,
          state.segmentInfo.getId(),
          state.segmentSuffix);
      CodecUtil.writeIndexHeader(
          out, CODEC_NAME, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
      success = true;
    } finally {
      if (!success) {
        // Don't leak a half-opened pair of outputs if header writing failed:
        IOUtils.closeWhileHandlingException(this);
      }
    }
  }

  @Override
  public FieldWriter addField(FieldInfo field, long termsFilePointer) throws IOException {
    //// System.out.println("VGW: field=" + field.name);
    // Let the policy reset its per-field state before the field's terms stream in:
    policy.newField(field);
    return new FSTFieldWriter(field, termsFilePointer);
  }

  /**
   * NOTE: if your codec does not sort in unicode code point order, you must override this method,
   * to simply return indexedTerm.length.
   *
   * @param priorTerm the term written immediately before {@code indexedTerm}
   * @param indexedTerm the term being added to the index
   * @return number of leading bytes of {@code indexedTerm} that must be kept so it still sorts
   *     strictly after {@code priorTerm}
   */
  protected int indexedTermPrefixLength(final BytesRef priorTerm, final BytesRef indexedTerm) {
    // As long as codec sorts terms in unicode codepoint
    // order, we can safely strip off the non-distinguishing
    // suffix to save RAM in the loaded terms index.
    final int idxTermOffset = indexedTerm.offset;
    final int priorTermOffset = priorTerm.offset;
    final int limit = Math.min(priorTerm.length, indexedTerm.length);
    for (int byteIdx = 0; byteIdx < limit; byteIdx++) {
      if (priorTerm.bytes[priorTermOffset + byteIdx]
          != indexedTerm.bytes[idxTermOffset + byteIdx]) {
        // First differing byte: keep it plus everything before it.
        return byteIdx + 1;
      }
    }
    // priorTerm is a prefix of indexedTerm (or vice versa): keep one byte past the shared prefix,
    // clamped to indexedTerm's own length.
    return Math.min(1 + priorTerm.length, indexedTerm.length);
  }

  /** Per-field writer that accumulates selected index terms into an FST of term → file pointer. */
  private class FSTFieldWriter extends FieldWriter {
    private final FSTCompiler<Long> fstCompiler;
    private final PositiveIntOutputs fstOutputs;
    private final long startTermsFilePointer;

    final FieldInfo fieldInfo;
    FST<Long> fst;
    // Last non-indexed term seen; used to compute the shortest distinguishing prefix of the next
    // indexed term.
    private final BytesRefBuilder lastTerm = new BytesRefBuilder();
    private boolean first = true;

    public FSTFieldWriter(FieldInfo fieldInfo, long termsFilePointer) throws IOException {
      this.fieldInfo = fieldInfo;
      fstOutputs = PositiveIntOutputs.getSingleton();
      fstCompiler = new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, fstOutputs).build();
      //// System.out.println("VGW: field=" + fieldInfo.name);

      // Always put empty string in
      fstCompiler.add(new IntsRef(), termsFilePointer);
      startTermsFilePointer = termsFilePointer;
    }

    @Override
    public boolean checkIndexTerm(BytesRef text, TermStats stats) throws IOException {
      // System.out.println("VGW: index term=" + text.utf8ToString());
      // NOTE: we must force the first term per field to be
      // indexed, in case policy doesn't:
      if (policy.isIndexTerm(text, stats) || first) {
        first = false;
        // System.out.println("  YES");
        return true;
      } else {
        // Remember the skipped term: the next indexed term only needs enough leading bytes to
        // sort after it (see indexedTermPrefixLength).
        lastTerm.copyBytes(text);
        return false;
      }
    }

    private final IntsRefBuilder scratchIntsRef = new IntsRefBuilder();

    @Override
    public void add(BytesRef text, TermStats stats, long termsFilePointer) throws IOException {
      if (text.length == 0) {
        // We already added empty string in ctor
        assert termsFilePointer == startTermsFilePointer;
        return;
      }
      final int lengthSave = text.length;
      // Temporarily truncate to the shortest prefix that still distinguishes this term from the
      // previous one, so the FST stores fewer bytes:
      text.length = indexedTermPrefixLength(lastTerm.get(), text);
      try {
        fstCompiler.add(Util.toIntsRef(text, scratchIntsRef), termsFilePointer);
      } finally {
        text.length = lengthSave;
      }
      lastTerm.copyBytes(text);
    }

    @Override
    public void finish(long termsFilePointer) throws IOException {
      fst = FST.fromFSTReader(fstCompiler.compile(), fstCompiler.getFSTReader());
      if (fst != null) {
        // Record field number and where this field's FST starts in the index file, then save the
        // FST (metadata to metaOut, data to out):
        metaOut.writeInt(fieldInfo.number);
        metaOut.writeVLong(out.getFilePointer());
        fst.save(metaOut, out);
      }
    }
  }

  @Override
  public void close() throws IOException {
    try {
      if (metaOut != null) {
        // -1 is the end-of-fields sentinel read back by the index reader:
        metaOut.writeInt(-1);
        CodecUtil.writeFooter(metaOut);
      }
      if (out != null) {
        CodecUtil.writeFooter(out);
      }
    } finally {
      try {
        IOUtils.close(out, metaOut);
      } finally {
        // Null out so a second close() is a no-op:
        out = null;
        metaOut = null;
      }
    }
  }
}