/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.blockterms;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Accountables;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.PagedBytes;
import org.apache.lucene.util.packed.MonotonicBlockPackedReader;

/**
 * TermsIndexReader for simple indexes that record every Nth term.
 *
 * @see FixedGapTermsIndexWriter
 * @lucene.experimental
 */
public class FixedGapTermsIndexReader extends TermsIndexReaderBase {
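
  // A minimal usage sketch (hypothetical, for illustration only; the
  // SegmentReadState and FieldInfo wiring comes from the surrounding codec,
  // and "target" is an arbitrary term):
  //
  //   TermsIndexReaderBase indexReader = new FixedGapTermsIndexReader(state);
  //   FieldIndexEnum indexEnum = indexReader.getFieldEnum(fieldInfo);
  //   long termsDictFP = indexEnum.seek(new BytesRef("target"));
  //   // termsDictFP now points into the main terms dictionary, at the block
  //   // holding the greatest indexed term <= "target".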

  // NOTE: long is overkill here, but we use this in a
  // number of places to multiply out the actual ord, and we
  // will overflow int during those multiplies.  So to avoid
  // having to upgrade each multiply to long in multiple
  // places (error-prone), we use long here:
  private final long indexInterval;

  private final int packedIntsVersion;
  private final int blocksize;

  private static final int PAGED_BYTES_BITS = 15; // 2^15 = 32 KB pages in the shared term byte[]

  // all fields share this single logical byte[]
  private final PagedBytes.Reader termBytesReader;

  final HashMap<String, FieldIndexData> fields = new HashMap<>();

  public FixedGapTermsIndexReader(SegmentReadState state) throws IOException {
    final PagedBytes termBytes = new PagedBytes(PAGED_BYTES_BITS);

    String fileName =
        IndexFileNames.segmentFileName(
            state.segmentInfo.name,
            state.segmentSuffix,
            FixedGapTermsIndexWriter.TERMS_INDEX_EXTENSION);
    final IndexInput in = state.directory.openInput(fileName, state.context);

    boolean success = false;

    try {

      CodecUtil.checkIndexHeader(
          in,
          FixedGapTermsIndexWriter.CODEC_NAME,
          FixedGapTermsIndexWriter.VERSION_CURRENT,
          FixedGapTermsIndexWriter.VERSION_CURRENT,
          state.segmentInfo.getId(),
          state.segmentSuffix);

      // the index is consumed entirely at open time, so verify the checksum up front
      CodecUtil.checksumEntireFile(in);

      indexInterval = in.readVInt();
      if (indexInterval < 1) {
        throw new CorruptIndexException("invalid indexInterval: " + indexInterval, in);
      }
      packedIntsVersion = in.readVInt();
      blocksize = in.readVInt();

      seekDir(in);

      // Read directory
      final int numFields = in.readVInt();
      if (numFields < 0) {
        throw new CorruptIndexException("invalid numFields: " + numFields, in);
      }
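      // Each directory entry records: field number, number of index terms, the
      // field's start in the main terms dict, and three pointers into this file
      // (raw term bytes, packed terms-dict offsets, packed term-byte offsets).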
      // System.out.println("FGR: init seg=" + segment + " div=" + indexDivisor + " nF=" + numFields);
      for (int i = 0; i < numFields; i++) {
        final int field = in.readVInt();
        // TODO: change this to a vLong if we fix the writer to support > 2B index terms
        final long numIndexTerms = in.readVInt();
        if (numIndexTerms < 0) {
          throw new CorruptIndexException("invalid numIndexTerms: " + numIndexTerms, in);
        }
        final long termsStart = in.readVLong();
        final long indexStart = in.readVLong();
        final long packedIndexStart = in.readVLong();
        final long packedOffsetsStart = in.readVLong();
        if (packedIndexStart < indexStart) {
          throw new CorruptIndexException(
              "invalid packedIndexStart: "
                  + packedIndexStart
                  + " indexStart: "
                  + indexStart
                  + " numIndexTerms: "
                  + numIndexTerms,
              in);
        }
        final FieldInfo fieldInfo = state.fieldInfos.fieldInfo(field);
        FieldIndexData previous =
            fields.put(
                fieldInfo.name,
                new FieldIndexData(
                    in,
                    termBytes,
                    indexStart,
                    termsStart,
                    packedIndexStart,
                    packedOffsetsStart,
                    numIndexTerms));
        if (previous != null) {
          throw new CorruptIndexException("duplicate field: " + fieldInfo.name, in);
        }
      }
      success = true;
    } finally {
      if (success) {
        IOUtils.close(in);
      } else {
        IOUtils.closeWhileHandlingException(in);
      }
      termBytesReader = termBytes.freeze(true);
    }
  }

  private class IndexEnum extends FieldIndexEnum {
    private final FieldIndexData fieldIndex;
    private final BytesRef term = new BytesRef();
    private long ord;

    public IndexEnum(FieldIndexData fieldIndex) {
      this.fieldIndex = fieldIndex;
    }

    @Override
    public BytesRef term() {
      return term;
    }

    @Override
    public long seek(BytesRef target) {
      long lo = 0; // binary search
      long hi = fieldIndex.numIndexTerms - 1;

      while (hi >= lo) {
        long mid = (lo + hi) >>> 1;

        final long offset = fieldIndex.termOffsets.get(mid);
        final int length = (int) (fieldIndex.termOffsets.get(1 + mid) - offset);
        termBytesReader.fillSlice(term, fieldIndex.termBytesStart + offset, length);

        int delta = target.compareTo(term);
        if (delta < 0) {
          hi = mid - 1;
        } else if (delta > 0) {
          lo = mid + 1;
        } else {
          assert mid >= 0;
          ord = mid * indexInterval;
          return fieldIndex.termsStart + fieldIndex.termsDictOffsets.get(mid);
        }
      }

      if (hi < 0) {
        assert hi == -1;
        hi = 0;
      }
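      // no exact match: hi is the floor entry, the greatest index term less
      // than target (or the first index term, after the clamp above)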

      final long offset = fieldIndex.termOffsets.get(hi);
      final int length = (int) (fieldIndex.termOffsets.get(1 + hi) - offset);
      termBytesReader.fillSlice(term, fieldIndex.termBytesStart + offset, length);

      ord = hi * indexInterval;
      return fieldIndex.termsStart + fieldIndex.termsDictOffsets.get(hi);
    }

    @Override
    public long next() {
      final long idx = 1 + (ord / indexInterval);
      if (idx >= fieldIndex.numIndexTerms) {
        return -1;
      }
      ord += indexInterval;

      final long offset = fieldIndex.termOffsets.get(idx);
      final int length = (int) (fieldIndex.termOffsets.get(1 + idx) - offset);
      termBytesReader.fillSlice(term, fieldIndex.termBytesStart + offset, length);
      return fieldIndex.termsStart + fieldIndex.termsDictOffsets.get(idx);
    }

    @Override
    public long ord() {
      return ord;
    }

    @Override
    public long seek(long ord) {
      long idx = ord / indexInterval;
      // caller must ensure ord is in bounds
      assert idx < fieldIndex.numIndexTerms;
      final long offset = fieldIndex.termOffsets.get(idx);
      final int length = (int) (fieldIndex.termOffsets.get(1 + idx) - offset);
      termBytesReader.fillSlice(term, fieldIndex.termBytesStart + offset, length);
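      // snap to the indexed term at or before the requested ord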
      this.ord = idx * indexInterval;
      return fieldIndex.termsStart + fieldIndex.termsDictOffsets.get(idx);
    }
  }

  @Override
  public boolean supportsOrd() {
    return true;
  }

  private final class FieldIndexData implements Accountable {
    // where this field's terms begin in the packed byte[]
    // data
    final long termBytesStart;

    // offset into index termBytes
    final MonotonicBlockPackedReader termOffsets;

    // index pointers into main terms dict
    final MonotonicBlockPackedReader termsDictOffsets;

    final long numIndexTerms;
    final long termsStart;

    public FieldIndexData(
        IndexInput in,
        PagedBytes termBytes,
        long indexStart,
        long termsStart,
        long packedIndexStart,
        long packedOffsetsStart,
        long numIndexTerms)
        throws IOException {

      this.termsStart = termsStart;
      termBytesStart = termBytes.getPointer();

      IndexInput clone = in.clone();
      clone.seek(indexStart);

      this.numIndexTerms = numIndexTerms;
      assert this.numIndexTerms > 0 : "numIndexTerms=" + numIndexTerms;

      // slurp in the images from disk:
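      // Layout per field: [indexStart, packedIndexStart) holds the raw term
      // bytes; two monotonic packed structures follow, holding the terms-dict
      // file pointers and then the term-byte start offsets.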

      try (clone) {
        final long numTermBytes = packedIndexStart - indexStart;
        termBytes.copy(clone, numTermBytes);

        // records offsets into main terms dict file
        termsDictOffsets =
            MonotonicBlockPackedReader.of(clone, packedIntsVersion, blocksize, numIndexTerms);

        // records offsets into byte[] term data
        termOffsets =
            MonotonicBlockPackedReader.of(clone, packedIntsVersion, blocksize, 1 + numIndexTerms);
      }
    }

    @Override
    public long ramBytesUsed() {
      return ((termOffsets != null) ? termOffsets.ramBytesUsed() : 0)
          + ((termsDictOffsets != null) ? termsDictOffsets.ramBytesUsed() : 0);
    }

    @Override
    public Collection<Accountable> getChildResources() {
      List<Accountable> resources = new ArrayList<>();
      if (termOffsets != null) {
        resources.add(Accountables.namedAccountable("term lengths", termOffsets));
      }
      if (termsDictOffsets != null) {
        resources.add(Accountables.namedAccountable("offsets", termsDictOffsets));
      }
      return Collections.unmodifiableList(resources);
    }

    @Override
    public String toString() {
      return "FixedGapTermIndex(indexterms=" + numIndexTerms + ")";
    }
  }

  @Override
  public FieldIndexEnum getFieldEnum(FieldInfo fieldInfo) {
    return new IndexEnum(fields.get(fieldInfo.name));
  }

  @Override
  public void close() throws IOException {} // nothing to do: the input was fully consumed and closed at construction

  private void seekDir(IndexInput input) throws IOException {
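    // the writer stores the directory's start offset as a fixed-width long
    // immediately before the codec footer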
    input.seek(input.length() - CodecUtil.footerLength() - 8);
    long dirOffset = input.readLong();
    input.seek(dirOffset);
  }

  @Override
  public String toString() {
    return getClass().getSimpleName()
        + "(fields="
        + fields.size()
        + ",interval="
        + indexInterval
        + ")";
  }
}