All downloads are free. The search and download functionality uses the official Maven repository.

org.apache.lucene.codecs.blockterms.FixedGapTermsIndexWriter Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.blockterms;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.TermStats;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.store.ByteBuffersDataOutput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.packed.MonotonicBlockPackedWriter;
import org.apache.lucene.util.packed.PackedInts;

/**
 * Selects every Nth term as an index term, and holds term bytes (mostly) fully expanded in memory.
 * This terms index supports seeking by ord. See {@link VariableGapTermsIndexWriter} for a more
 * memory efficient terms index that does not support seeking by ord.
 *
 * <p>Layout of the file produced by this writer, in write order:
 *
 * <ol>
 *   <li>codec index header, then three vInts: the term index interval, {@link
 *       PackedInts#VERSION_CURRENT} and {@link #BLOCKSIZE};
 *   <li>for each field: the indexed term prefix bytes, then the packed term addresses, then the
 *       packed term offsets;
 *   <li>on {@link #close()}: the number of fields that have at least one index term, one metadata
 *       entry per such field, the file pointer where that directory starts (as a long), and the
 *       codec footer.
 * </ol>
 *
 * @lucene.experimental
 */
public class FixedGapTermsIndexWriter extends TermsIndexWriterBase {
  /** Output for the terms index file; set to {@code null} once {@link #close()} has run. */
  protected IndexOutput out;

  /** Extension of terms index file */
  static final String TERMS_INDEX_EXTENSION = "tii";

  static final String CODEC_NAME = "FixedGapTermsIndex";
  static final int VERSION_START = 4;
  static final int VERSION_CURRENT = VERSION_START;

  /** Block size handed to every {@link MonotonicBlockPackedWriter} and recorded in the header. */
  static final int BLOCKSIZE = 4096;

  /** Every {@code termIndexInterval}-th term of a field is selected as an index term. */
  private final int termIndexInterval;

  /** Interval used by {@link #FixedGapTermsIndexWriter(SegmentWriteState)}. */
  public static final int DEFAULT_TERM_INDEX_INTERVAL = 32;

  /** One writer per {@link #addField} call, in call order; consumed by {@link #close()}. */
  private final List<SimpleFieldWriter> fields = new ArrayList<>();

  /**
   * Creates a writer that uses {@link #DEFAULT_TERM_INDEX_INTERVAL}.
   *
   * @param state segment write state supplying the directory, segment name, suffix and context
   * @throws IOException if the index file cannot be created or its header cannot be written
   */
  public FixedGapTermsIndexWriter(SegmentWriteState state) throws IOException {
    this(state, DEFAULT_TERM_INDEX_INTERVAL);
  }

  /**
   * Creates the terms index output and writes its header.
   *
   * @param state segment write state supplying the directory, segment name, suffix and context
   * @param termIndexInterval gap between indexed terms; must be positive
   * @throws IllegalArgumentException if {@code termIndexInterval <= 0}
   * @throws IOException if the index file cannot be created or its header cannot be written
   */
  public FixedGapTermsIndexWriter(SegmentWriteState state, int termIndexInterval)
      throws IOException {
    if (termIndexInterval <= 0) {
      throw new IllegalArgumentException("invalid termIndexInterval: " + termIndexInterval);
    }
    this.termIndexInterval = termIndexInterval;
    final String indexFileName =
        IndexFileNames.segmentFileName(
            state.segmentInfo.name, state.segmentSuffix, TERMS_INDEX_EXTENSION);
    out = state.directory.createOutput(indexFileName, state.context);
    boolean success = false;
    try {
      CodecUtil.writeIndexHeader(
          out, CODEC_NAME, VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix);
      out.writeVInt(termIndexInterval);
      out.writeVInt(PackedInts.VERSION_CURRENT);
      out.writeVInt(BLOCKSIZE);
      success = true;
    } finally {
      if (!success) {
        // Header write failed: close the output without masking the original exception.
        IOUtils.closeWhileHandlingException(out);
      }
    }
  }

  /**
   * Registers a new field and returns the writer that will receive its index terms.
   *
   * @param field the field being written
   * @param termsFilePointer terms file pointer at the start of this field; index term addresses
   *     are stored as deltas against it
   * @return the per-field writer, which is also retained for the directory written on close
   */
  @Override
  public FieldWriter addField(FieldInfo field, long termsFilePointer) {
    SimpleFieldWriter writer = new SimpleFieldWriter(field, termsFilePointer);
    fields.add(writer);
    return writer;
  }

  /**
   * NOTE: if your codec does not sort in unicode code point order, you must override this method,
   * to simply return indexedTerm.length.
   *
   * @param priorTerm the term immediately preceding {@code indexedTerm}
   * @param indexedTerm the term selected as an index term
   * @return number of leading bytes of {@code indexedTerm} to store in the index
   */
  protected int indexedTermPrefixLength(final BytesRef priorTerm, final BytesRef indexedTerm) {
    // As long as codec sorts terms in unicode codepoint
    // order, we can safely strip off the non-distinguishing
    // suffix to save RAM in the loaded terms index.
    return StringHelper.sortKeyLength(priorTerm, indexedTerm);
  }

  /**
   * Per-field writer: streams indexed term prefixes straight to {@link #out} and buffers the
   * packed term addresses and term offsets until {@link #finish(long)}.
   */
  private class SimpleFieldWriter extends FieldWriter {
    final FieldInfo fieldInfo;

    /** Number of index terms added so far. */
    int numIndexTerms;

    /** File pointer of {@link #out} when this field writer was created. */
    final long indexStart;

    /** Terms file pointer passed in at creation; base for the stored address deltas. */
    final long termsStart;

    /** File pointer of {@link #out} where the packed term addresses begin. */
    long packedIndexStart;

    /** File pointer of {@link #out} where the packed term offsets begin. */
    long packedOffsetsStart;

    /** Number of terms offered to {@link #checkIndexTerm}, indexed or not. */
    private long numTerms;

    private ByteBuffersDataOutput offsetsBuffer = ByteBuffersDataOutput.newResettableInstance();
    private MonotonicBlockPackedWriter termOffsets =
        new MonotonicBlockPackedWriter(offsetsBuffer, BLOCKSIZE);

    /** Running total of prefix bytes written for this field. */
    private long currentOffset;

    private ByteBuffersDataOutput addressBuffer = ByteBuffersDataOutput.newResettableInstance();
    private MonotonicBlockPackedWriter termAddresses =
        new MonotonicBlockPackedWriter(addressBuffer, BLOCKSIZE);

    /** Most recently remembered term, used to compute the next index term's prefix length. */
    private final BytesRefBuilder lastTerm = new BytesRefBuilder();

    SimpleFieldWriter(FieldInfo fieldInfo, long termsFilePointer) {
      this.fieldInfo = fieldInfo;
      indexStart = out.getFilePointer();
      termsStart = termsFilePointer;
      // we write terms+1 offsets, term n's length is n+1 - n
      try {
        termOffsets.add(0L);
      } catch (IOException bogus) {
        // Rethrown unchecked because this constructor declares no checked exceptions.
        throw new RuntimeException(bogus);
      }
    }

    /**
     * Decides whether {@code text} is an index term: the first term of the field and every
     * {@code termIndexInterval}-th term after it are.
     *
     * @return {@code true} if the caller should pass this term to {@link #add}
     */
    @Override
    public boolean checkIndexTerm(BytesRef text, TermStats stats) throws IOException {
      // First term is first indexed term:
      if (0 == (numTerms++ % termIndexInterval)) {
        return true;
      } else {
        if (0 == numTerms % termIndexInterval) {
          // save last term just before next index term so we
          // can compute wasted suffix
          lastTerm.copyBytes(text);
        }
        return false;
      }
    }

    /**
     * Records one index term: writes its distinguishing prefix to {@link #out} and buffers its
     * terms-file address delta and cumulative prefix offset.
     *
     * @param text the index term
     * @param stats term statistics (not used by this implementation)
     * @param termsFilePointer terms file pointer for this term
     */
    @Override
    public void add(BytesRef text, TermStats stats, long termsFilePointer) throws IOException {
      final int indexedTermLength;
      if (numIndexTerms == 0) {
        // no previous term: no bytes to write
        indexedTermLength = 0;
      } else {
        indexedTermLength = indexedTermPrefixLength(lastTerm.get(), text);
      }

      // write only the min prefix that shows the diff
      // against prior term
      out.writeBytes(text.bytes, text.offset, indexedTermLength);

      // save delta terms pointer
      termAddresses.add(termsFilePointer - termsStart);

      // save term length (in bytes)
      assert indexedTermLength <= Short.MAX_VALUE;
      currentOffset += indexedTermLength;
      termOffsets.add(currentOffset);

      lastTerm.copyBytes(text);
      numIndexTerms++;
    }

    /**
     * Flushes the buffered term addresses and term offsets to {@link #out}, recording where each
     * section starts, and releases the buffers.
     *
     * @param termsFilePointer not used by this implementation
     */
    @Override
    public void finish(long termsFilePointer) throws IOException {

      // write primary terms dict offsets
      packedIndexStart = out.getFilePointer();

      // addresses were recorded as deltas against termsStart
      termAddresses.finish();
      addressBuffer.copyTo(out);

      packedOffsetsStart = out.getFilePointer();

      // write offsets into the byte[] terms
      termOffsets.finish();
      offsetsBuffer.copyTo(out);

      // our referrer holds onto us, while other fields are
      // being written, so don't tie up this RAM:
      termOffsets = termAddresses = null;
      addressBuffer = null;
      offsetsBuffer = null;
    }
  }

  /**
   * Writes the field directory, trailer and codec footer, then closes the output. Fields that
   * never received an index term are omitted from the directory. Calling this again after the
   * output has been released is a no-op.
   *
   * @throws IOException if writing the directory or closing the output fails
   */
  @Override
  public void close() throws IOException {
    if (out != null) {
      boolean success = false;
      try {
        final long dirStart = out.getFilePointer();

        int nonNullFieldCount = 0;
        for (SimpleFieldWriter field : fields) {
          if (field.numIndexTerms > 0) {
            nonNullFieldCount++;
          }
        }

        out.writeVInt(nonNullFieldCount);
        for (SimpleFieldWriter field : fields) {
          if (field.numIndexTerms > 0) {
            out.writeVInt(field.fieldInfo.number);
            out.writeVInt(field.numIndexTerms);
            out.writeVLong(field.termsStart);
            out.writeVLong(field.indexStart);
            out.writeVLong(field.packedIndexStart);
            out.writeVLong(field.packedOffsetsStart);
          }
        }
        writeTrailer(dirStart);
        CodecUtil.writeFooter(out);
        success = true;
      } finally {
        if (success) {
          IOUtils.close(out);
        } else {
          // Already failing: close without masking the original exception.
          IOUtils.closeWhileHandlingException(out);
        }
        out = null;
      }
    }
  }

  /** Writes the file pointer of the field directory as a long. */
  private void writeTrailer(long dirStart) throws IOException {
    out.writeLong(dirStart);
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy