/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.blockterms;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Accountables;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.PagedBytes;
import org.apache.lucene.util.packed.MonotonicBlockPackedReader;
/**
* TermsIndexReader for simple indexes that store every Nth term.
*
* @see FixedGapTermsIndexWriter
* @lucene.experimental
*/
public class FixedGapTermsIndexReader extends TermsIndexReaderBase {
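// Typical wiring (illustrative sketch only; the concrete postings format and exact
// constructor signatures vary by Lucene version): a block-based postings format's
// fieldsProducer opens this index reader and hands it to BlockTermsReader, e.g.
//
//   PostingsReaderBase postingsReader = ...; // the codec's postings reader for this segment
//   TermsIndexReaderBase indexReader = new FixedGapTermsIndexReader(state);
//   FieldsProducer fields = new BlockTermsReader(indexReader, postingsReader, state);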
// NOTE: long is overkill here, but we use this in a
// number of places to multiply out the actual ord, and we
// will overflow int during those multiplies. So to avoid
// having to upgrade each multiplication to long in multiple
// places (error prone), we use long here:
private final long indexInterval;
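// packed-ints format version and block size recorded by the writer; used below to
// decode the monotonic packed offset readers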
private final int packedIntsVersion;
private final int blocksize;
private final static int PAGED_BYTES_BITS = 15;
// all fields share this single logical byte[]
private final PagedBytes.Reader termBytesReader;
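// per-field index data, keyed by field name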
final HashMap<String,FieldIndexData> fields = new HashMap<>();
public FixedGapTermsIndexReader(SegmentReadState state) throws IOException {
final PagedBytes termBytes = new PagedBytes(PAGED_BYTES_BITS);
String fileName = IndexFileNames.segmentFileName(state.segmentInfo.name,
state.segmentSuffix,
FixedGapTermsIndexWriter.TERMS_INDEX_EXTENSION);
final IndexInput in = state.directory.openInput(fileName, state.context);
boolean success = false;
try {
CodecUtil.checkIndexHeader(in, FixedGapTermsIndexWriter.CODEC_NAME,
FixedGapTermsIndexWriter.VERSION_CURRENT,
FixedGapTermsIndexWriter.VERSION_CURRENT,
state.segmentInfo.getId(), state.segmentSuffix);
CodecUtil.checksumEntireFile(in);
indexInterval = in.readVInt();
if (indexInterval < 1) {
throw new CorruptIndexException("invalid indexInterval: " + indexInterval, in);
}
packedIntsVersion = in.readVInt();
blocksize = in.readVInt();
seekDir(in);
// Read directory
final int numFields = in.readVInt();
if (numFields < 0) {
throw new CorruptIndexException("invalid numFields: " + numFields, in);
}
//System.out.println("FGR: init seg=" + segment + " div=" + indexDivisor + " nF=" + numFields);
for(int i=0;i<numFields;i++) {
final int field = in.readVInt();
final long numIndexTerms = in.readVInt(); // TODO: change this to a vLong if we fix writer to support > 2B index terms
if (numIndexTerms < 0) {
throw new CorruptIndexException("invalid numIndexTerms: " + numIndexTerms, in);
}
final long termsStart = in.readVLong();
final long indexStart = in.readVLong();
final long packedIndexStart = in.readVLong();
final long packedOffsetsStart = in.readVLong();
if (packedIndexStart < indexStart) {
throw new CorruptIndexException("invalid packedIndexStart: " + packedIndexStart + " indexStart: " + indexStart + "numIndexTerms: " + numIndexTerms, in);
}
final FieldInfo fieldInfo = state.fieldInfos.fieldInfo(field);
FieldIndexData previous = fields.put(fieldInfo.name, new FieldIndexData(in, termBytes, indexStart, termsStart, packedIndexStart, packedOffsetsStart, numIndexTerms));
if (previous != null) {
throw new CorruptIndexException("duplicate field: " + fieldInfo.name, in);
}
}
success = true;
} finally {
if (success) {
IOUtils.close(in);
} else {
IOUtils.closeWhileHandlingException(in);
}
termBytesReader = termBytes.freeze(true);
}
}
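// Enum over one field's in-memory index terms; seek() binary-searches the sampled
// terms and reports the corresponding file pointer into the main terms dictionary.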
private class IndexEnum extends FieldIndexEnum {
private final FieldIndexData fieldIndex;
private final BytesRef term = new BytesRef();
private long ord;
public IndexEnum(FieldIndexData fieldIndex) {
this.fieldIndex = fieldIndex;
}
@Override
public BytesRef term() {
return term;
}
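// Binary search for the largest index term <= target (clamped to the first index
// term when target sorts before it); sets term/ord and returns the file pointer
// into the main terms dict for that index term.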
@Override
public long seek(BytesRef target) {
long lo = 0; // binary search
long hi = fieldIndex.numIndexTerms - 1;
while (hi >= lo) {
long mid = (lo + hi) >>> 1;
final long offset = fieldIndex.termOffsets.get(mid);
final int length = (int) (fieldIndex.termOffsets.get(1+mid) - offset);
termBytesReader.fillSlice(term, fieldIndex.termBytesStart + offset, length);
int delta = target.compareTo(term);
if (delta < 0) {
hi = mid - 1;
} else if (delta > 0) {
lo = mid + 1;
} else {
assert mid >= 0;
ord = mid*indexInterval;
return fieldIndex.termsStart + fieldIndex.termsDictOffsets.get(mid);
}
}
if (hi < 0) {
assert hi == -1;
hi = 0;
}
final long offset = fieldIndex.termOffsets.get(hi);
final int length = (int) (fieldIndex.termOffsets.get(1+hi) - offset);
termBytesReader.fillSlice(term, fieldIndex.termBytesStart + offset, length);
ord = hi*indexInterval;
return fieldIndex.termsStart + fieldIndex.termsDictOffsets.get(hi);
}
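// Advances to the next sampled index term; returns -1 when exhausted, otherwise the
// terms-dict file pointer of the new index term.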
@Override
public long next() {
final long idx = 1 + (ord / indexInterval);
if (idx >= fieldIndex.numIndexTerms) {
return -1;
}
ord += indexInterval;
final long offset = fieldIndex.termOffsets.get(idx);
final int length = (int) (fieldIndex.termOffsets.get(1+idx) - offset);
termBytesReader.fillSlice(term, fieldIndex.termBytesStart + offset, length);
return fieldIndex.termsStart + fieldIndex.termsDictOffsets.get(idx);
}
@Override
public long ord() {
return ord;
}
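// Seeks to the index term covering the given ord (ord / indexInterval); the caller
// guarantees the ord is within bounds.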
@Override
public long seek(long ord) {
long idx = ord / indexInterval;
// caller must ensure ord is in bounds
assert idx < fieldIndex.numIndexTerms;
final long offset = fieldIndex.termOffsets.get(idx);
final int length = (int) (fieldIndex.termOffsets.get(1+idx) - offset);
termBytesReader.fillSlice(term, fieldIndex.termBytesStart + offset, length);
this.ord = idx * indexInterval;
return fieldIndex.termsStart + fieldIndex.termsDictOffsets.get(idx);
}
}
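// Ords are supported: index terms are sampled every indexInterval terms, so an ord
// maps directly to an index slot.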
@Override
public boolean supportsOrd() {
return true;
}
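// Per-field index state: where this field's term bytes start in the shared
// PagedBytes, plus monotonic packed readers for term offsets and terms-dict file
// pointers.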
private final class FieldIndexData implements Accountable {
// where this field's terms begin in the packed byte[]
// data
final long termBytesStart;
// offset into index termBytes
final MonotonicBlockPackedReader termOffsets;
// index pointers into main terms dict
final MonotonicBlockPackedReader termsDictOffsets;
final long numIndexTerms;
final long termsStart;
public FieldIndexData(IndexInput in, PagedBytes termBytes, long indexStart, long termsStart, long packedIndexStart, long packedOffsetsStart, long numIndexTerms) throws IOException {
this.termsStart = termsStart;
termBytesStart = termBytes.getPointer();
IndexInput clone = in.clone();
clone.seek(indexStart);
this.numIndexTerms = numIndexTerms;
assert this.numIndexTerms > 0: "numIndexTerms=" + numIndexTerms;
// slurp in the images from disk:
try {
final long numTermBytes = packedIndexStart - indexStart;
termBytes.copy(clone, numTermBytes);
// records offsets into main terms dict file
termsDictOffsets = MonotonicBlockPackedReader.of(clone, packedIntsVersion, blocksize, numIndexTerms, false);
// records offsets into byte[] term data
termOffsets = MonotonicBlockPackedReader.of(clone, packedIntsVersion, blocksize, 1+numIndexTerms, false);
} finally {
clone.close();
}
}
@Override
public long ramBytesUsed() {
return ((termOffsets!=null)? termOffsets.ramBytesUsed() : 0) +
((termsDictOffsets!=null)? termsDictOffsets.ramBytesUsed() : 0);
}
@Override
public Collection<Accountable> getChildResources() {
List<Accountable> resources = new ArrayList<>();
if (termOffsets != null) {
resources.add(Accountables.namedAccountable("term lengths", termOffsets));
}
if (termsDictOffsets != null) {
resources.add(Accountables.namedAccountable("offsets", termsDictOffsets));
}
return Collections.unmodifiableList(resources);
}
@Override
public String toString() {
return "FixedGapTermIndex(indexterms=" + numIndexTerms + ")";
}
}
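// Creates a new index enum for the given field; the field is expected to be present
// in the fields map.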
@Override
public FieldIndexEnum getFieldEnum(FieldInfo fieldInfo) {
return new IndexEnum(fields.get(fieldInfo.name));
}
@Override
public void close() throws IOException {}
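// The writer records the directory start as a fixed 8-byte long just before the
// codec footer; read it and seek there.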
private void seekDir(IndexInput input) throws IOException {
input.seek(input.length() - CodecUtil.footerLength() - 8);
long dirOffset = input.readLong();
input.seek(dirOffset);
}
@Override
public long ramBytesUsed() {
long sizeInBytes = ((termBytesReader!=null)? termBytesReader.ramBytesUsed() : 0);
for(FieldIndexData entry : fields.values()) {
sizeInBytes += entry.ramBytesUsed();
}
return sizeInBytes;
}
@Override
public Collection<Accountable> getChildResources() {
return Accountables.namedAccountables("field", fields);
}
@Override
public String toString() {
return getClass().getSimpleName() + "(fields=" + fields.size() + ",interval=" + indexInterval + ")";
}
}