All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.lucene.codecs.memory.DirectDocValuesProducer Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.codecs.memory;


import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicLong;

import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.RandomAccessOrds;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Accountables;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.RamUsageEstimator;

/**
 * Reader for {@link DirectDocValuesFormat}
 */

class DirectDocValuesProducer extends DocValuesProducer {
  // Per-field metadata parsed from the meta file, keyed by field name
  // (just file pointers and minimal stuff). Populated by readFields().
  private final Map<String,NumericEntry> numerics = new HashMap<>();
  private final Map<String,BinaryEntry> binaries = new HashMap<>();
  private final Map<String,SortedEntry> sorteds = new HashMap<>();
  private final Map<String,SortedSetEntry> sortedSets = new HashMap<>();
  private final Map<String,SortedNumericEntry> sortedNumerics = new HashMap<>();
  // Input over the data file; readers work on clones of it.
  private final IndexInput data;
  
  // RAM-resident instances we have already loaded, keyed by field name.
  // Entries are only added when this producer is not a merge instance.
  private final Map<String,NumericRawValues> numericInstances = new HashMap<>();
  private final Map<String,BinaryRawValues> binaryInstances = new HashMap<>();
  private final Map<String,SortedRawValues> sortedInstances = new HashMap<>();
  private final Map<String,SortedSetRawValues> sortedSetInstances = new HashMap<>();
  private final Map<String,SortedNumericRawValues> sortedNumericInstances = new HashMap<>();
  private final Map<String,FixedBitSet> docsWithFieldInstances = new HashMap<>();
  
  // Number of field entries counted by readFields(); reported by toString().
  private final int numEntries;
  
  // Taken from state.segmentInfo.maxDoc() when the segment is opened.
  private final int maxDoc;
  // Running counter returned by ramBytesUsed(); grows as instances are cached.
  private final AtomicLong ramBytesUsed;
  // Format version returned by the header check of the metadata file.
  private final int version;
  
  // true only for instances built by the merge copy constructor; such
  // instances never cache the values they load.
  private final boolean merging;
  
  // Entry type tags read from the metadata stream by readFields().
  static final byte NUMBER = 0;
  static final byte BYTES = 1;
  static final byte SORTED = 2;
  static final byte SORTED_SET = 3;
  static final byte SORTED_SET_SINGLETON = 4;
  static final byte SORTED_NUMERIC = 5;
  static final byte SORTED_NUMERIC_SINGLETON = 6;

  // Version bounds handed to CodecUtil.checkIndexHeader for both files.
  static final int VERSION_START = 3;
  static final int VERSION_CURRENT = VERSION_START;
  
  /**
   * Copy constructor used to build a merge instance. The new producer starts
   * with a snapshot of {@code original}'s metadata and already-loaded
   * instances, works on a clone of its data input, and is flagged as
   * {@code merging} so that it never caches anything it loads itself.
   * The caller must hold the monitor of {@code original}.
   */
  DirectDocValuesProducer(DirectDocValuesProducer original) throws IOException {
    assert Thread.holdsLock(original);

    // scalar state
    merging = true;
    version = original.version;
    maxDoc = original.maxDoc;
    numEntries = original.numEntries;
    ramBytesUsed = new AtomicLong(original.ramBytesUsed.get());

    // clone of the original's data input
    data = original.data.clone();

    // per-field metadata
    numerics.putAll(original.numerics);
    binaries.putAll(original.binaries);
    sorteds.putAll(original.sorteds);
    sortedSets.putAll(original.sortedSets);
    sortedNumerics.putAll(original.sortedNumerics);

    // instances the original had loaded at the time of the copy
    numericInstances.putAll(original.numericInstances);
    binaryInstances.putAll(original.binaryInstances);
    sortedInstances.putAll(original.sortedInstances);
    sortedSetInstances.putAll(original.sortedSetInstances);
    sortedNumericInstances.putAll(original.sortedNumericInstances);
    docsWithFieldInstances.putAll(original.docsWithFieldInstances);
  }
    
  /**
   * Opens the producer for a segment. The metadata file is read completely
   * (header, field entries, footer) and then closed; the data file is opened,
   * its header version is compared with the metadata version, and it is kept
   * open in {@code data}.
   *
   * @param state         segment being opened (directory, names, field infos, context)
   * @param dataCodec     codec name expected in the data file header
   * @param dataExtension file extension of the data file
   * @param metaCodec     codec name expected in the metadata file header
   * @param metaExtension file extension of the metadata file
   * @throws IOException if either file cannot be read or fails validation,
   *         including a {@link CorruptIndexException} on a version mismatch
   */
  DirectDocValuesProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
    maxDoc = state.segmentInfo.maxDoc();
    merging = false;

    // read in the entries from the metadata file.
    final String metaFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension);
    final ChecksumIndexInput metaIn = state.directory.openChecksumInput(metaFileName, state.context);
    ramBytesUsed = new AtomicLong(RamUsageEstimator.shallowSizeOfInstance(getClass()));
    boolean metaOk = false;
    try {
      version = CodecUtil.checkIndexHeader(metaIn, metaCodec, VERSION_START, VERSION_CURRENT,
                                           state.segmentInfo.getId(), state.segmentSuffix);
      numEntries = readFields(metaIn, state.fieldInfos);

      CodecUtil.checkFooter(metaIn);
      metaOk = true;
    } finally {
      if (!metaOk) {
        // an exception is already propagating: do not let close() mask it
        IOUtils.closeWhileHandlingException(metaIn);
      } else {
        IOUtils.close(metaIn);
      }
    }

    final String dataFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension);
    this.data = state.directory.openInput(dataFileName, state.context);
    boolean dataOk = false;
    try {
      final int dataVersion = CodecUtil.checkIndexHeader(data, dataCodec, VERSION_START, VERSION_CURRENT,
                                                         state.segmentInfo.getId(), state.segmentSuffix);
      if (dataVersion != version) {
        throw new CorruptIndexException("Format versions mismatch: meta=" + version + ", data=" + dataVersion, data);
      }

      // NOTE: checksumming every byte of the data file on open is too costly,
      // so only the structure of the checksum footer is verified here
      // (FOOTER_MAGIC + algorithmID). That is cheap and still catches some
      // forms of corruption such as file truncation.
      CodecUtil.retrieveChecksum(data);

      dataOk = true;
    } finally {
      if (dataOk == false) {
        IOUtils.closeWhileHandlingException(this.data);
      }
    }
  }

  /**
   * Reads one numeric entry from the metadata stream. Field order on the
   * stream: offset (long), count (int), missingOffset (long), missingBytes
   * (long, present only when missingOffset is not -1), byteWidth (byte).
   */
  private NumericEntry readNumericEntry(IndexInput meta) throws IOException {
    final NumericEntry numeric = new NumericEntry();
    numeric.offset = meta.readLong();
    numeric.count = meta.readInt();
    numeric.missingOffset = meta.readLong();
    // the missing-bytes length is only stored when a missing offset is present
    numeric.missingBytes = (numeric.missingOffset == -1) ? 0 : meta.readLong();
    numeric.byteWidth = meta.readByte();
    return numeric;
  }

  /**
   * Reads one binary entry from the metadata stream. Field order on the
   * stream: offset (long), numBytes (int), count (int), missingOffset (long),
   * missingBytes (long, present only when missingOffset is not -1).
   */
  private BinaryEntry readBinaryEntry(IndexInput meta) throws IOException {
    final BinaryEntry binary = new BinaryEntry();
    binary.offset = meta.readLong();
    binary.numBytes = meta.readInt();
    binary.count = meta.readInt();
    binary.missingOffset = meta.readLong();
    // the missing-bytes length is only stored when a missing offset is present
    binary.missingBytes = (binary.missingOffset == -1) ? 0 : meta.readLong();
    return binary;
  }

  /**
   * Reads a sorted entry: a numeric entry (doc to ord) immediately followed
   * by a binary entry (the values).
   */
  private SortedEntry readSortedEntry(IndexInput meta) throws IOException {
    final SortedEntry sorted = new SortedEntry();
    // stream order matters: the numeric part precedes the binary part
    final NumericEntry ordPart = readNumericEntry(meta);
    final BinaryEntry valuePart = readBinaryEntry(meta);
    sorted.docToOrd = ordPart;
    sorted.values = valuePart;
    return sorted;
  }

  /**
   * Reads a sorted-set entry. For the non-singleton form the stream starts
   * with a numeric entry for the doc-to-ord addresses; the singleton form
   * omits it and {@code docToOrdAddress} stays null. Both forms then carry a
   * numeric entry (ords) and a binary entry (values).
   */
  private SortedSetEntry readSortedSetEntry(IndexInput meta, boolean singleton) throws IOException {
    final SortedSetEntry sortedSet = new SortedSetEntry();
    if (!singleton) {
      sortedSet.docToOrdAddress = readNumericEntry(meta);
    }
    sortedSet.ords = readNumericEntry(meta);
    sortedSet.values = readBinaryEntry(meta);
    return sortedSet;
  }
  
  /**
   * Reads a sorted-numeric entry. The non-singleton form starts with a
   * numeric entry for the doc-to-address mapping; the singleton form omits it
   * and {@code docToAddress} stays null. Both forms then carry the numeric
   * entry for the values.
   */
  private SortedNumericEntry readSortedNumericEntry(IndexInput meta, boolean singleton) throws IOException {
    final SortedNumericEntry sortedNumeric = new SortedNumericEntry();
    if (!singleton) {
      sortedNumeric.docToAddress = readNumericEntry(meta);
    }
    sortedNumeric.values = readNumericEntry(meta);
    return sortedNumeric;
  }

  /**
   * Parses all field entries from the metadata stream into the per-type
   * metadata maps, keyed by field name. The stream is a sequence of
   * (field number vInt, type byte, type-specific entry) records terminated by
   * a field number of -1.
   *
   * @param meta  metadata input positioned just after the header
   * @param infos field infos used to resolve field numbers to names
   * @return the number of field entries read
   * @throws CorruptIndexException if a field number is unknown to
   *         {@code infos} or an entry type tag is not recognized
   * @throws IOException on any read failure
   */
  private int readFields(IndexInput meta, FieldInfos infos) throws IOException {
    int numEntries = 0;
    int fieldNumber = meta.readVInt();
    while (fieldNumber != -1) {
      numEntries++;
      FieldInfo info = infos.fieldInfo(fieldNumber);
      if (info == null) {
        // Without this check an unknown field number surfaces as a bare
        // NullPointerException on info.name below.
        throw new CorruptIndexException("invalid field number: " + fieldNumber, meta);
      }
      int fieldType = meta.readByte();
      switch (fieldType) {
        case NUMBER:
          numerics.put(info.name, readNumericEntry(meta));
          break;
        case BYTES:
          binaries.put(info.name, readBinaryEntry(meta));
          break;
        case SORTED: {
          SortedEntry entry = readSortedEntry(meta);
          sorteds.put(info.name, entry);
          // the values of a sorted field are also registered as a binary entry
          binaries.put(info.name, entry.values);
          break;
        }
        case SORTED_SET:
        case SORTED_SET_SINGLETON: {
          SortedSetEntry entry = readSortedSetEntry(meta, fieldType == SORTED_SET_SINGLETON);
          sortedSets.put(info.name, entry);
          binaries.put(info.name, entry.values);
          break;
        }
        case SORTED_NUMERIC:
        case SORTED_NUMERIC_SINGLETON: {
          SortedNumericEntry entry = readSortedNumericEntry(meta, fieldType == SORTED_NUMERIC_SINGLETON);
          sortedNumerics.put(info.name, entry);
          break;
        }
        default:
          throw new CorruptIndexException("invalid entry type: " + fieldType + ", field= " + info.name, meta);
      }
      fieldNumber = meta.readVInt();
    }
    return numEntries;
  }

  /** Returns the current value of the running RAM-usage counter. */
  @Override
  public long ramBytesUsed() {
    final long currentBytes = ramBytesUsed.get();
    return currentBytes;
  }
  
  /**
   * Returns an unmodifiable list of named accountables, one group per kind of
   * cached instance (numeric, binary, sorted, sorted set, sorted numeric and
   * missing bitsets). Synchronized because it reads the instance caches.
   */
  @Override
  public synchronized Collection<Accountable> getChildResources() {
    List<Accountable> resources = new ArrayList<>();
    resources.addAll(Accountables.namedAccountables("numeric field", numericInstances));
    resources.addAll(Accountables.namedAccountables("binary field", binaryInstances));
    resources.addAll(Accountables.namedAccountables("sorted field", sortedInstances));
    resources.addAll(Accountables.namedAccountables("sorted set field", sortedSetInstances));
    resources.addAll(Accountables.namedAccountables("sorted numeric field", sortedNumericInstances));
    resources.addAll(Accountables.namedAccountables("missing bitset field", docsWithFieldInstances));
    return Collections.unmodifiableList(resources);
  }
  
  /** Returns the simple class name followed by the number of field entries. */
  @Override
  public String toString() {
    final StringBuilder description = new StringBuilder(getClass().getSimpleName());
    description.append("(entries=").append(numEntries).append(")");
    return description.toString();
  }

  /**
   * Checksums the entire data file. Runs on a clone so the position of the
   * shared {@code data} input is left untouched.
   */
  @Override
  public void checkIntegrity() throws IOException {
    final IndexInput dataCopy = data.clone();
    CodecUtil.checksumEntireFile(dataCopy);
  }

  /**
   * Returns the numeric doc values for {@code field}, loading them on first
   * use. Loaded values are cached and added to the RAM counter unless this is
   * a merge instance, in which case every miss triggers a fresh load.
   */
  @Override
  public synchronized NumericDocValues getNumeric(FieldInfo field) throws IOException {
    final NumericRawValues cached = numericInstances.get(field.name);
    if (cached != null) {
      return cached.numerics;
    }

    // Lazy load
    final NumericRawValues loaded = loadNumeric(numerics.get(field.name));
    if (merging == false) {
      numericInstances.put(field.name, loaded);
      ramBytesUsed.addAndGet(loaded.ramBytesUsed());
    }
    return loaded.numerics;
  }
  
  private NumericRawValues loadNumeric(NumericEntry entry) throws IOException {
    NumericRawValues ret = new NumericRawValues();
    IndexInput data = this.data.clone();
    data.seek(entry.offset + entry.missingBytes);
    switch (entry.byteWidth) {
    case 1:
      {
        final byte[] values = new byte[entry.count];
        data.readBytes(values, 0, entry.count);
        ret.bytesUsed = RamUsageEstimator.sizeOf(values);
        ret.numerics = new NumericDocValues() {
          @Override
          public long get(int idx) {
            return values[idx];
          }
        };
        return ret;
      }

    case 2:
      {
        final short[] values = new short[entry.count];
        for(int i=0;i> 3];
          for (int i = 0; i < bits.length; i++) {
            bits[i] = data.readLong();
          }
          instance = new FixedBitSet(bits, maxDoc);
          if (!merging) {
            docsWithFieldInstances.put(field.name, instance);
            ramBytesUsed.addAndGet(instance.ramBytesUsed());
          }
        }
      }
      return instance;
    }
  }
  
  /**
   * Returns the bits marking which documents have a value for {@code field}.
   * Numeric and binary fields resolve them through getMissingBits using the
   * entry's missing offset and length; sorted, sorted-set and sorted-numeric
   * fields derive them from the doc values via {@code DocValues.docsWithValue}.
   *
   * @throws AssertionError if the field's doc-values type is none of the above
   */
  @Override
  public Bits getDocsWithField(FieldInfo field) throws IOException {
    switch (field.getDocValuesType()) {
      case NUMERIC: {
        final NumericEntry numericEntry = numerics.get(field.name);
        return getMissingBits(field, numericEntry.missingOffset, numericEntry.missingBytes);
      }
      case BINARY: {
        final BinaryEntry binaryEntry = binaries.get(field.name);
        return getMissingBits(field, binaryEntry.missingOffset, binaryEntry.missingBytes);
      }
      case SORTED:
        return DocValues.docsWithValue(getSorted(field), maxDoc);
      case SORTED_NUMERIC:
        return DocValues.docsWithValue(getSortedNumeric(field), maxDoc);
      case SORTED_SET:
        return DocValues.docsWithValue(getSortedSet(field), maxDoc);
      default:
        throw new AssertionError();
    }
  }

  /**
   * Creates a merge instance through the copy constructor. Synchronized so
   * that the copy constructor's lock assertion holds.
   */
  @Override
  public synchronized DocValuesProducer getMergeInstance() throws IOException {
    final DirectDocValuesProducer mergeCopy = new DirectDocValuesProducer(this);
    return mergeCopy;
  }

  /** Closes the underlying data input. */
  @Override
  public void close() throws IOException {
    this.data.close();
  }

  /**
   * Holder for loaded binary values that reports its RAM usage:
   * a byte array plus an optional address array.
   */
  static class BinaryRawValues implements Accountable {
    byte[] bytes;
    // may be null; both accounting methods skip it in that case
    int[] address;
    
    /** Returns the size of {@code bytes} plus, when present, {@code address}. */
    @Override
    public long ramBytesUsed() {
      long bytesUsed = RamUsageEstimator.sizeOf(bytes);
      if (address != null) {
        bytesUsed += RamUsageEstimator.sizeOf(address);
      }
      return bytesUsed;
    }
    
    /** Returns an unmodifiable breakdown: "addresses" (if present), then "bytes". */
    @Override
    public Collection<Accountable> getChildResources() {
      List<Accountable> resources = new ArrayList<>();
      if (address != null) {
        resources.add(Accountables.namedAccountable("addresses", RamUsageEstimator.sizeOf(address)));
      }
      resources.add(Accountables.namedAccountable("bytes", RamUsageEstimator.sizeOf(bytes)));
      return Collections.unmodifiableList(resources);
    }

    @Override
    public String toString() {
      return getClass().getSimpleName();
    }
  }
  
  /**
   * Pairs a loaded {@link NumericDocValues} view with the number of bytes
   * recorded for it by the loader.
   */
  static class NumericRawValues implements Accountable {
    NumericDocValues numerics;
    // filled in by the loader alongside numerics
    long bytesUsed;

    /** Returns the simple class name. */
    @Override
    public String toString() {
      final Class<?> self = getClass();
      return self.getSimpleName();
    }

    /** Returns the byte count stored in {@code bytesUsed}. */
    @Override
    public long ramBytesUsed() {
      return this.bytesUsed;
    }
  }

  /**
   * Holder for a loaded sorted field; all accounting is delegated to the
   * doc-to-ord numeric values.
   */
  static class SortedRawValues implements Accountable {
    NumericRawValues docToOrd;

    /** Returns the RAM usage reported by {@code docToOrd}. */
    @Override
    public long ramBytesUsed() {
      return docToOrd.ramBytesUsed();
    }

    /** Returns the child resources reported by {@code docToOrd}. */
    @Override
    public Collection<Accountable> getChildResources() {
      return docToOrd.getChildResources();
    }
    
    @Override
    public String toString() {
      return getClass().getSimpleName();
    }
  }
  
  /**
   * Holder for a loaded sorted-numeric field: the values plus an optional
   * doc-to-address mapping.
   */
  static class SortedNumericRawValues implements Accountable {
    // may be null; both accounting methods skip it in that case
    NumericRawValues docToAddress;
    NumericRawValues values;
    
    /** Returns the usage of {@code values} plus, when present, {@code docToAddress}. */
    @Override
    public long ramBytesUsed() {
      long bytesUsed = values.ramBytesUsed();
      if (docToAddress != null) {
        bytesUsed += docToAddress.ramBytesUsed();
      }
      return bytesUsed;
    }
    
    /** Returns an unmodifiable breakdown: "addresses" (if present), then "values". */
    @Override
    public Collection<Accountable> getChildResources() {
      List<Accountable> resources = new ArrayList<>();
      if (docToAddress != null) {
        resources.add(Accountables.namedAccountable("addresses", docToAddress));
      }
      resources.add(Accountables.namedAccountable("values", values));
      return Collections.unmodifiableList(resources);
    }
    
    @Override
    public String toString() {
      return getClass().getSimpleName();
    }
  }

  /**
   * Holder for a loaded sorted-set field: the ordinals plus an optional
   * doc-to-ord-address mapping.
   */
  static class SortedSetRawValues implements Accountable {
    // may be null; both accounting methods skip it in that case
    NumericRawValues docToOrdAddress;
    NumericRawValues ords;

    /** Returns the usage of {@code ords} plus, when present, {@code docToOrdAddress}. */
    @Override
    public long ramBytesUsed() {
      long bytesUsed = ords.ramBytesUsed();
      if (docToOrdAddress != null) {
        bytesUsed += docToOrdAddress.ramBytesUsed();
      }
      return bytesUsed;
    }

    /** Returns an unmodifiable breakdown: "addresses" (if present), then "ordinals". */
    @Override
    public Collection<Accountable> getChildResources() {
      List<Accountable> resources = new ArrayList<>();
      if (docToOrdAddress != null) {
        resources.add(Accountables.namedAccountable("addresses", docToOrdAddress));
      }
      resources.add(Accountables.namedAccountable("ordinals", ords));
      return Collections.unmodifiableList(resources);
    }
    
    @Override
    public String toString() {
      return getClass().getSimpleName();
    }
  }

  /**
   * Metadata for one block of numeric values, filled in by readNumericEntry
   * from the metadata stream.
   */
  static class NumericEntry {
    // Read as a long; loadNumeric seeks the data input to offset + missingBytes.
    long offset;
    // Read as an int; used by loadNumeric as the length of the values array.
    int count;
    // Read as a long; -1 means no missingBytes value follows on the stream.
    long missingOffset;
    // Read as a long when missingOffset != -1, otherwise set to 0.
    long missingBytes;
    // Read as a byte; loadNumeric switches on it to pick the array element type.
    byte byteWidth;
    // NOTE(review): never assigned by readNumericEntry; appears unused here.
    int packedIntsVersion;
  }

  /**
   * Metadata for one block of binary values, filled in by readBinaryEntry
   * from the metadata stream.
   */
  static class BinaryEntry {
    // Read as a long (first field of the entry on the stream).
    long offset;
    // Read as a long; -1 means no missingBytes value follows on the stream.
    long missingOffset;
    // Read as a long when missingOffset != -1, otherwise set to 0.
    long missingBytes;
    // Read as an int (third field of the entry on the stream).
    int count;
    // Read as an int (second field of the entry on the stream).
    int numBytes;
    // NOTE(review): the four fields below are never assigned by
    // readBinaryEntry; they appear unused in this file - confirm before removing.
    int minLength;
    int maxLength;
    int packedIntsVersion;
    int blockSize;
  }
  
  /**
   * Metadata for a sorted field, filled in by readSortedEntry: a numeric
   * entry followed on the stream by a binary entry.
   */
  static class SortedEntry {
    // Numeric entry read first from the stream.
    NumericEntry docToOrd;
    // Binary entry read second; readFields also registers it in the binaries map.
    BinaryEntry values;
  }

  /** Metadata for a sorted-set field, filled in by readSortedSetEntry. */
  static class SortedSetEntry {
    // Only read for the non-singleton form; left null for singleton entries.
    NumericEntry docToOrdAddress;
    // Numeric entry read for both forms.
    NumericEntry ords;
    // Binary entry read last; readFields also registers it in the binaries map.
    BinaryEntry values;
  }
  
  /** Metadata for a sorted-numeric field, filled in by readSortedNumericEntry. */
  static class SortedNumericEntry {
    // Only read for the non-singleton form; left null for singleton entries.
    NumericEntry docToAddress;
    // Numeric entry read for both forms.
    NumericEntry values;
  }
  
  // NOTE(review): this class is not referenced by any code visible in this
  // file; it may be a leftover - confirm before relying on or removing it.
  static class FSTEntry {
    long offset;
    long numOrds;
  }
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy