org.apache.lucene.codecs.lucene54.Lucene54DocValuesProducer Maven / Gradle / Ivy
The newest version!
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.lucene54;
import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.codecs.lucene54.Lucene54DocValuesConsumer.NumberType;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.RandomAccessOrds;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.SortedNumericDocValues;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.RandomAccessInput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.Accountables;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.LongValues;
import org.apache.lucene.util.PagedBytes;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.packed.DirectMonotonicReader;
import org.apache.lucene.util.packed.DirectReader;
import org.apache.lucene.util.packed.MonotonicBlockPackedReader;
import static org.apache.lucene.codecs.lucene54.Lucene54DocValuesFormat.*;
/** reader for {@link Lucene54DocValuesFormat} */
final class Lucene54DocValuesProducer extends DocValuesProducer implements Closeable {
// Per-field metadata read from the meta file, keyed by field name.
private final Map<String,NumericEntry> numerics = new HashMap<>();
private final Map<String,BinaryEntry> binaries = new HashMap<>();
private final Map<String,SortedSetEntry> sortedSets = new HashMap<>();
private final Map<String,SortedSetEntry> sortedNumerics = new HashMap<>();
private final Map<String,NumericEntry> ords = new HashMap<>();
private final Map<String,NumericEntry> ordIndexes = new HashMap<>();
// number of field entries counted while reading the meta file
private final int numFields;
// running total reported by ramBytesUsed()
private final AtomicLong ramBytesUsed;
private final IndexInput data;
private final int maxDoc;
// memory-resident structures
private final Map<String,MonotonicBlockPackedReader> addressInstances = new HashMap<>();
private final Map<String,ReverseTermsIndex> reverseIndexInstances = new HashMap<>();
private final Map<String,DirectMonotonicReader.Meta> directAddressesMeta = new HashMap<>();
// true for instances created by the copy constructor; such instances do not cache lazily loaded structures
private final boolean merging;
/**
 * Copy constructor, clone for merge: when merging we don't do any instances.put()s, so
 * structures loaded lazily by this copy are not cached on it.
 * <p>
 * All metadata maps are copied, the data input is cloned, and the RAM accounting starts from
 * the original's current value. The caller must hold the monitor of {@code original}.
 *
 * @param original the producer to copy
 */
Lucene54DocValuesProducer(Lucene54DocValuesProducer original) throws IOException {
  assert Thread.holdsLock(original);
  numerics.putAll(original.numerics);
  binaries.putAll(original.binaries);
  sortedSets.putAll(original.sortedSets);
  sortedNumerics.putAll(original.sortedNumerics);
  ords.putAll(original.ords);
  ordIndexes.putAll(original.ordIndexes);
  numFields = original.numFields;
  ramBytesUsed = new AtomicLong(original.ramBytesUsed.get());
  data = original.data.clone();
  maxDoc = original.maxDoc;
  addressInstances.putAll(original.addressInstances);
  reverseIndexInstances.putAll(original.reverseIndexInstances);
  // The size of the direct-addresses metadata is already part of the ramBytesUsed value copied
  // above, so carry the map over too; otherwise getChildResources() would omit it on the copy.
  directAddressesMeta.putAll(original.directAddressesMeta);
  merging = true;
}
/**
 * expert: instantiates a new reader.
 * <p>
 * Reads every field entry from the metadata file (checking its header and footer), then opens
 * the data file and checks its header, that its version equals the metadata version, and the
 * structure of its checksum footer. If any data-file check fails the data input is closed
 * before the exception propagates.
 */
Lucene54DocValuesProducer(SegmentReadState state, String dataCodec, String dataExtension, String metaCodec, String metaExtension) throws IOException {
  final String metaFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, metaExtension);
  this.maxDoc = state.segmentInfo.maxDoc();
  this.merging = false;
  this.ramBytesUsed = new AtomicLong(RamUsageEstimator.shallowSizeOfInstance(getClass()));

  int metaVersion = -1;
  int fieldCount = -1;
  // read in the entries from the metadata file.
  try (ChecksumIndexInput metaIn = state.directory.openChecksumInput(metaFileName, state.context)) {
    Throwable failure = null;
    try {
      metaVersion = CodecUtil.checkIndexHeader(
          metaIn,
          metaCodec,
          Lucene54DocValuesFormat.VERSION_START,
          Lucene54DocValuesFormat.VERSION_CURRENT,
          state.segmentInfo.getId(),
          state.segmentSuffix);
      fieldCount = readFields(metaIn, state.fieldInfos);
    } catch (Throwable t) {
      failure = t;
    } finally {
      // always validate the footer, handing over whatever went wrong while reading
      CodecUtil.checkFooter(metaIn, failure);
    }
  }
  this.numFields = fieldCount;

  final String dataFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, dataExtension);
  this.data = state.directory.openInput(dataFileName, state.context);
  boolean opened = false;
  try {
    final int dataVersion = CodecUtil.checkIndexHeader(
        data,
        dataCodec,
        Lucene54DocValuesFormat.VERSION_START,
        Lucene54DocValuesFormat.VERSION_CURRENT,
        state.segmentInfo.getId(),
        state.segmentSuffix);
    if (metaVersion != dataVersion) {
      throw new CorruptIndexException("Format versions mismatch: meta=" + metaVersion + ", data=" + dataVersion, data);
    }

    // NOTE: data file is too costly to verify checksum against all the bytes on open,
    // but for now we at least verify proper structure of the checksum footer: which looks
    // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption
    // such as file truncation.
    CodecUtil.retrieveChecksum(data);
    opened = true;
  } finally {
    if (!opened) {
      IOUtils.closeWhileHandlingException(this.data);
    }
  }
}
/**
 * Reads the two metadata entries that make up a sorted field: a binary entry, stored in
 * {@code binaries}, followed by a numeric entry, stored in {@code ords}. Each entry must be
 * prefixed by this field's number and the expected type byte.
 *
 * @throws CorruptIndexException if a prefix does not match
 */
private void readSortedField(FieldInfo info, IndexInput meta) throws IOException {
  // sorted = binary + numeric
  if (meta.readVInt() != info.number || meta.readByte() != Lucene54DocValuesFormat.BINARY) {
    throw new CorruptIndexException("sorted entry for field: " + info.name + " is corrupt", meta);
  }
  final BinaryEntry binaryEntry = readBinaryEntry(info, meta);
  binaries.put(info.name, binaryEntry);

  if (meta.readVInt() != info.number || meta.readByte() != Lucene54DocValuesFormat.NUMERIC) {
    throw new CorruptIndexException("sorted entry for field: " + info.name + " is corrupt", meta);
  }
  final NumericEntry ordsEntry = readNumericEntry(info, meta);
  ords.put(info.name, ordsEntry);
}
/**
 * Reads the three metadata entries of a sorted-set field stored with addresses: a binary
 * entry ({@code binaries}), a numeric entry ({@code ords}) and a second numeric entry
 * ({@code ordIndexes}). Each entry must be prefixed by this field's number and the expected
 * type byte.
 *
 * @throws CorruptIndexException if a prefix does not match
 */
private void readSortedSetFieldWithAddresses(FieldInfo info, IndexInput meta) throws IOException {
  // sortedset = binary + numeric (addresses) + ordIndex
  if (meta.readVInt() != info.number || meta.readByte() != Lucene54DocValuesFormat.BINARY) {
    throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta);
  }
  final BinaryEntry binaryEntry = readBinaryEntry(info, meta);
  binaries.put(info.name, binaryEntry);

  if (meta.readVInt() != info.number || meta.readByte() != Lucene54DocValuesFormat.NUMERIC) {
    throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta);
  }
  final NumericEntry ordsEntry = readNumericEntry(info, meta);
  ords.put(info.name, ordsEntry);

  if (meta.readVInt() != info.number || meta.readByte() != Lucene54DocValuesFormat.NUMERIC) {
    throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta);
  }
  final NumericEntry ordIndexEntry = readNumericEntry(info, meta);
  ordIndexes.put(info.name, ordIndexEntry);
}
/**
 * Reads the two metadata entries of a table-encoded sorted-set field: a binary entry
 * ({@code binaries}) followed by a numeric entry ({@code ords}). Each entry must be prefixed
 * by this field's number and the expected type byte.
 *
 * @throws CorruptIndexException if a prefix does not match
 */
private void readSortedSetFieldWithTable(FieldInfo info, IndexInput meta) throws IOException {
  // sortedset table = binary + ordset table + ordset index
  if (meta.readVInt() != info.number || meta.readByte() != Lucene54DocValuesFormat.BINARY) {
    throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta);
  }
  final BinaryEntry binaryEntry = readBinaryEntry(info, meta);
  binaries.put(info.name, binaryEntry);

  if (meta.readVInt() != info.number || meta.readByte() != Lucene54DocValuesFormat.NUMERIC) {
    throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta);
  }
  final NumericEntry ordsEntry = readNumericEntry(info, meta);
  ords.put(info.name, ordsEntry);
}
/**
 * Reads field entries from {@code meta} until the terminating field number {@code -1} and
 * registers each one in the map matching its doc-values type. Composite types (sorted,
 * sorted set, sorted numeric) consume several consecutive entries.
 *
 * @return the number of top-level field entries read
 * @throws CorruptIndexException if a field number is unknown to {@code infos}, a type byte is
 *         not recognised, or a sub-entry prefix does not match the field
 */
private int readFields(IndexInput meta, FieldInfos infos) throws IOException {
  int fieldCount = 0;
  for (int fieldNumber = meta.readVInt(); fieldNumber != -1; fieldNumber = meta.readVInt()) {
    fieldCount++;
    final FieldInfo info = infos.fieldInfo(fieldNumber);
    if (info == null) {
      // trickier to validate more: because we use multiple entries for "composite" types like sortedset, etc.
      throw new CorruptIndexException("Invalid field number: " + fieldNumber, meta);
    }
    final byte type = meta.readByte();
    if (type == Lucene54DocValuesFormat.NUMERIC) {
      numerics.put(info.name, readNumericEntry(info, meta));
    } else if (type == Lucene54DocValuesFormat.BINARY) {
      final BinaryEntry binaryEntry = readBinaryEntry(info, meta);
      binaries.put(info.name, binaryEntry);
    } else if (type == Lucene54DocValuesFormat.SORTED) {
      readSortedField(info, meta);
    } else if (type == Lucene54DocValuesFormat.SORTED_SET) {
      final SortedSetEntry setEntry = readSortedSetEntry(meta);
      sortedSets.put(info.name, setEntry);
      switch (setEntry.format) {
        case SORTED_WITH_ADDRESSES:
          readSortedSetFieldWithAddresses(info, meta);
          break;
        case SORTED_SET_TABLE:
          readSortedSetFieldWithTable(info, meta);
          break;
        case SORTED_SINGLE_VALUED:
          // single-valued sorted set is stored as a plain sorted field
          if (meta.readVInt() != fieldNumber || meta.readByte() != Lucene54DocValuesFormat.SORTED) {
            throw new CorruptIndexException("sortedset entry for field: " + info.name + " is corrupt", meta);
          }
          readSortedField(info, meta);
          break;
        default:
          throw new AssertionError();
      }
    } else if (type == Lucene54DocValuesFormat.SORTED_NUMERIC) {
      final SortedSetEntry setEntry = readSortedSetEntry(meta);
      sortedNumerics.put(info.name, setEntry);
      switch (setEntry.format) {
        case SORTED_WITH_ADDRESSES: {
          // values entry followed by the ord index entry
          if (meta.readVInt() != fieldNumber || meta.readByte() != Lucene54DocValuesFormat.NUMERIC) {
            throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta);
          }
          numerics.put(info.name, readNumericEntry(info, meta));
          if (meta.readVInt() != fieldNumber || meta.readByte() != Lucene54DocValuesFormat.NUMERIC) {
            throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta);
          }
          final NumericEntry ordIndexEntry = readNumericEntry(info, meta);
          ordIndexes.put(info.name, ordIndexEntry);
          break;
        }
        case SORTED_SET_TABLE: {
          if (meta.readVInt() != info.number || meta.readByte() != Lucene54DocValuesFormat.NUMERIC) {
            throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta);
          }
          final NumericEntry ordsEntry = readNumericEntry(info, meta);
          ords.put(info.name, ordsEntry);
          break;
        }
        case SORTED_SINGLE_VALUED:
          if (meta.readVInt() != fieldNumber || meta.readByte() != Lucene54DocValuesFormat.NUMERIC) {
            throw new CorruptIndexException("sortednumeric entry for field: " + info.name + " is corrupt", meta);
          }
          numerics.put(info.name, readNumericEntry(info, meta));
          break;
        default:
          throw new AssertionError();
      }
    } else {
      throw new CorruptIndexException("invalid type: " + type, meta);
    }
  }
  return fieldCount;
}
/**
 * Reads one numeric metadata entry for {@code info} from {@code meta}.
 * <p>
 * The common prefix (format, missing offset, offset, count) is followed by format-specific
 * fields and a trailing end offset. A SPARSE_COMPRESSED entry additionally carries monotonic
 * metadata for its doc IDs and a nested numeric entry (read recursively) for the non-missing
 * values. Memory held by tables and monotonic metadata is added to {@code ramBytesUsed}.
 *
 * @param info the field the entry belongs to
 * @param meta the metadata input, positioned at the start of the entry
 * @return the populated entry
 * @throws CorruptIndexException if the format is unknown or a value is out of range
 */
private NumericEntry readNumericEntry(FieldInfo info, IndexInput meta) throws IOException {
  NumericEntry entry = new NumericEntry();
  entry.format = meta.readVInt();
  entry.missingOffset = meta.readLong();
  if (entry.format == SPARSE_COMPRESSED) {
    // sparse bits need a bit more metadata
    entry.numDocsWithValue = meta.readVLong();
    final int blockShift = meta.readVInt();
    entry.monotonicMeta = DirectMonotonicReader.loadMeta(meta, entry.numDocsWithValue, blockShift);
    ramBytesUsed.addAndGet(entry.monotonicMeta.ramBytesUsed());
    directAddressesMeta.put(info.name, entry.monotonicMeta);
  }
  entry.offset = meta.readLong();
  entry.count = meta.readVLong();
  switch (entry.format) {
    case CONST_COMPRESSED: {
      entry.minValue = meta.readLong();
      if (entry.count > Integer.MAX_VALUE) {
        // currently just a limitation e.g. of bits interface and so on.
        throw new CorruptIndexException("illegal CONST_COMPRESSED count: " + entry.count, meta);
      }
      break;
    }
    case GCD_COMPRESSED: {
      entry.minValue = meta.readLong();
      entry.gcd = meta.readLong();
      entry.bitsPerValue = meta.readVInt();
      break;
    }
    case TABLE_COMPRESSED: {
      final int uniqueValues = meta.readVInt();
      // a negative size can only come from a corrupt file; report it as corruption rather
      // than letting the array allocation below fail with NegativeArraySizeException
      if (uniqueValues < 0 || uniqueValues > 256) {
        throw new CorruptIndexException("TABLE_COMPRESSED cannot have more than 256 distinct values, got=" + uniqueValues, meta);
      }
      entry.table = new long[uniqueValues];
      for (int i = 0; i < uniqueValues; ++i) {
        entry.table[i] = meta.readLong();
      }
      ramBytesUsed.addAndGet(RamUsageEstimator.sizeOf(entry.table));
      entry.bitsPerValue = meta.readVInt();
      break;
    }
    case DELTA_COMPRESSED: {
      entry.minValue = meta.readLong();
      entry.bitsPerValue = meta.readVInt();
      break;
    }
    case MONOTONIC_COMPRESSED: {
      final int blockShift = meta.readVInt();
      entry.monotonicMeta = DirectMonotonicReader.loadMeta(meta, maxDoc + 1, blockShift);
      ramBytesUsed.addAndGet(entry.monotonicMeta.ramBytesUsed());
      directAddressesMeta.put(info.name, entry.monotonicMeta);
      break;
    }
    case SPARSE_COMPRESSED: {
      final byte numberType = meta.readByte();
      switch (numberType) {
        case 0:
          entry.numberType = NumberType.VALUE;
          break;
        case 1:
          entry.numberType = NumberType.ORDINAL;
          break;
        default:
          throw new CorruptIndexException("Number type can only be 0 or 1, got=" + numberType, meta);
      }
      // now read the numeric entry for non-missing values
      final int fieldNumber = meta.readVInt();
      if (fieldNumber != info.number) {
        throw new CorruptIndexException("Field numbers mismatch: " + fieldNumber + " != " + info.number, meta);
      }
      final int dvFormat = meta.readByte();
      if (dvFormat != NUMERIC) {
        throw new CorruptIndexException("Formats mismatch: " + dvFormat + " != " + NUMERIC, meta);
      }
      entry.nonMissingValues = readNumericEntry(info, meta);
      break;
    }
    default:
      throw new CorruptIndexException("Unknown format: " + entry.format, meta);
  }
  entry.endOffset = meta.readLong();
  return entry;
}
/**
 * Reads one binary metadata entry for {@code info} from {@code meta}: the common prefix
 * (format, missing offset, min/max length, count, offset) followed by format-specific fields.
 * For the variable-length uncompressed format the address metadata is loaded, accounted for
 * in {@code ramBytesUsed} and registered in {@code directAddressesMeta}.
 *
 * @return the populated entry
 * @throws CorruptIndexException if the format is unknown
 */
private BinaryEntry readBinaryEntry(FieldInfo info, IndexInput meta) throws IOException {
  final BinaryEntry result = new BinaryEntry();
  result.format = meta.readVInt();
  result.missingOffset = meta.readLong();
  result.minLength = meta.readVInt();
  result.maxLength = meta.readVInt();
  result.count = meta.readVLong();
  result.offset = meta.readLong();

  if (result.format == BINARY_PREFIX_COMPRESSED) {
    result.addressesOffset = meta.readLong();
    result.packedIntsVersion = meta.readVInt();
    result.blockSize = meta.readVInt();
    result.reverseIndexOffset = meta.readLong();
  } else if (result.format == BINARY_VARIABLE_UNCOMPRESSED) {
    result.addressesOffset = meta.readLong();
    final int shift = meta.readVInt();
    result.addressesMeta = DirectMonotonicReader.loadMeta(meta, result.count + 1, shift);
    ramBytesUsed.addAndGet(result.addressesMeta.ramBytesUsed());
    directAddressesMeta.put(info.name, result.addressesMeta);
    result.addressesEndOffset = meta.readLong();
  } else if (result.format != BINARY_FIXED_UNCOMPRESSED) {
    // fixed-length entries carry no extra metadata; anything else is unknown
    throw new CorruptIndexException("Unknown format: " + result.format, meta);
  }
  return result;
}
/**
 * Reads a sorted-set (or sorted-numeric) header entry from {@code meta}.
 * <p>
 * For the SORTED_SET_TABLE format the dictionary table and the per-set lengths are read; the
 * lengths are turned into cumulative offsets so that set {@code i} spans
 * {@code table[tableOffsets[i] .. tableOffsets[i + 1])}. Both arrays are accounted for in
 * {@code ramBytesUsed}.
 *
 * @param meta the metadata input, positioned at the start of the entry
 * @return the populated entry
 * @throws CorruptIndexException if the format is unknown or a table size is out of range
 */
SortedSetEntry readSortedSetEntry(IndexInput meta) throws IOException {
  SortedSetEntry entry = new SortedSetEntry();
  entry.format = meta.readVInt();
  if (entry.format == SORTED_SET_TABLE) {
    final int totalTableLength = meta.readInt();
    // readInt() can yield a negative value on a corrupt file; report it as corruption rather
    // than letting the array allocation below fail with NegativeArraySizeException
    if (totalTableLength < 0 || totalTableLength > 256) {
      throw new CorruptIndexException("SORTED_SET_TABLE cannot have more than 256 values in its dictionary, got=" + totalTableLength, meta);
    }
    entry.table = new long[totalTableLength];
    for (int i = 0; i < totalTableLength; ++i) {
      entry.table[i] = meta.readLong();
    }
    ramBytesUsed.addAndGet(RamUsageEstimator.sizeOf(entry.table));
    final int tableSize = meta.readInt();
    if (tableSize < 0 || tableSize > totalTableLength + 1) { // +1 because of the empty set
      throw new CorruptIndexException("SORTED_SET_TABLE cannot have more set ids than ords in its dictionary, got " + totalTableLength + " ords and " + tableSize + " sets", meta);
    }
    // tableOffsets[0] stays 0; every following slot adds the length of the previous set
    entry.tableOffsets = new int[tableSize + 1];
    for (int i = 1; i < entry.tableOffsets.length; ++i) {
      entry.tableOffsets[i] = entry.tableOffsets[i - 1] + meta.readInt();
    }
    ramBytesUsed.addAndGet(RamUsageEstimator.sizeOf(entry.tableOffsets));
  } else if (entry.format != SORTED_SINGLE_VALUED && entry.format != SORTED_WITH_ADDRESSES) {
    throw new CorruptIndexException("Unknown format: " + entry.format, meta);
  }
  return entry;
}
/** Looks up the numeric entry registered under the field's name and returns a reader for it. */
@Override
public NumericDocValues getNumeric(FieldInfo field) throws IOException {
  final NumericEntry numericEntry = numerics.get(field.name);
  final LongValues reader = getNumeric(numericEntry);
  return reader;
}
/** Returns the current value of the running RAM usage counter. */
@Override
public long ramBytesUsed() {
  final long bytes = this.ramBytesUsed.get();
  return bytes;
}
/**
 * Returns an unmodifiable snapshot of the memory-resident structures held by this producer:
 * the cached address readers, the cached reverse term indexes and the direct-addresses
 * metadata, each labelled with a descriptive prefix.
 */
@Override
public synchronized Collection<Accountable> getChildResources() {
  List<Accountable> resources = new ArrayList<>();
  resources.addAll(Accountables.namedAccountables("addresses field", addressInstances));
  resources.addAll(Accountables.namedAccountables("reverse index field", reverseIndexInstances));
  resources.addAll(Accountables.namedAccountables("direct addresses meta field", directAddressesMeta));
  return Collections.unmodifiableList(resources);
}
/** Verifies the data file by running {@code CodecUtil.checksumEntireFile} over it. */
@Override
public void checkIntegrity() throws IOException {
  final IndexInput dataFile = this.data;
  CodecUtil.checksumEntireFile(dataFile);
}
/** Returns the simple class name followed by the number of fields, e.g. {@code Name(fields=3)}. */
@Override
public String toString() {
  final StringBuilder description = new StringBuilder();
  description.append(getClass().getSimpleName());
  description.append("(fields=").append(numFields).append(")");
  return description.toString();
}
/**
 * Builds a reader for the given numeric entry according to its format:
 * constant (value where the live bits are set, 0 elsewhere), delta ({@code minValue + packed}),
 * GCD ({@code minValue + gcd * packed}), table lookup, or sparse (values for docs that have
 * one, a missing value of -1 for ordinals / 0 for values otherwise).
 * Any other format results in an {@link AssertionError}.
 */
LongValues getNumeric(NumericEntry entry) throws IOException {
  switch (entry.format) {
    case CONST_COMPRESSED: {
      final long constantValue = entry.minValue;
      final Bits docsWithValue = getLiveBits(entry.missingOffset, (int) entry.count);
      return new LongValues() {
        @Override
        public long get(long index) {
          if (docsWithValue.get((int) index)) {
            return constantValue;
          }
          return 0;
        }
      };
    }
    case DELTA_COMPRESSED: {
      final RandomAccessInput packedSlice = this.data.randomAccessSlice(entry.offset, entry.endOffset - entry.offset);
      final long base = entry.minValue;
      final LongValues deltas = DirectReader.getInstance(packedSlice, entry.bitsPerValue, 0);
      return new LongValues() {
        @Override
        public long get(long id) {
          return deltas.get(id) + base;
        }
      };
    }
    case GCD_COMPRESSED: {
      final RandomAccessInput packedSlice = this.data.randomAccessSlice(entry.offset, entry.endOffset - entry.offset);
      final long base = entry.minValue;
      final long factor = entry.gcd;
      final LongValues quotients = DirectReader.getInstance(packedSlice, entry.bitsPerValue, 0);
      return new LongValues() {
        @Override
        public long get(long id) {
          return quotients.get(id) * factor + base;
        }
      };
    }
    case TABLE_COMPRESSED: {
      final RandomAccessInput packedSlice = this.data.randomAccessSlice(entry.offset, entry.endOffset - entry.offset);
      final long[] lookup = entry.table;
      final LongValues tableOrds = DirectReader.getInstance(packedSlice, entry.bitsPerValue, 0);
      return new LongValues() {
        @Override
        public long get(long id) {
          final int slot = (int) tableOrds.get(id);
          return lookup[slot];
        }
      };
    }
    case SPARSE_COMPRESSED: {
      final SparseBits sparseDocs = getSparseLiveBits(entry);
      final LongValues presentValues = getNumeric(entry.nonMissingValues);
      final long valueWhenMissing;
      switch (entry.numberType) {
        case VALUE:
          valueWhenMissing = 0L;
          break;
        case ORDINAL:
          valueWhenMissing = -1L;
          break;
        default:
          throw new AssertionError();
      }
      return new SparseLongValues(sparseDocs, presentValues, valueWhenMissing);
    }
    default:
      throw new AssertionError();
  }
}
/**
 * {@link Bits} implementation backed by a list of doc IDs: {@code get(docId)} reports whether
 * {@code docId} occurs in {@code docIds}.
 * <p>
 * The instance keeps a mutable cursor ({@code index}, {@code docId}, {@code nextDocId}) between
 * calls. Lookups at or after the cursor search forward from it; a lookup before the cursor
 * resets it to the start first.
 * NOTE(review): the binary search presumes {@code docIds} is strictly increasing - confirm
 * against the writer.
 */
static class SparseBits implements Bits {
// maxDoc: exclusive upper bound reported by length(); docIDsLength: number of entries in docIds;
// firstDocId: docIds.get(0), or maxDoc when docIds is empty
final long maxDoc, docIDsLength, firstDocId;
final LongValues docIds;
long index; // index of docId in docIds
long docId; // doc ID at index
long nextDocId; // doc ID at (index+1)
/**
 * @param maxDoc must be greater than the last element of {@code docIDs} (when there is one)
 * @param docIDsLength number of entries readable from {@code docIDs}
 * @param docIDs the doc IDs that have a value
 * @throws IllegalArgumentException if {@code maxDoc} is not greater than the last doc ID
 */
SparseBits(long maxDoc, long docIDsLength, LongValues docIDs) {
if (docIDsLength > 0 && maxDoc <= docIDs.get(docIDsLength - 1)) {
throw new IllegalArgumentException("maxDoc must be > the last element of docIDs");
}
this.maxDoc = maxDoc;
this.docIDsLength = docIDsLength;
this.docIds = docIDs;
this.firstDocId = docIDsLength == 0 ? maxDoc : docIDs.get(0);
reset();
}
/** Moves the cursor before the first entry: no current doc ID, next doc ID is the first one. */
private void reset() {
index = -1;
this.docId = -1;
this.nextDocId = firstDocId;
}
/** Gallop forward and stop as soon as an index is found that is greater than
* the given docId. {@code index} will store an index that stores a value
* that is <= {@code docId} while the return value will give an index
* that stores a value that is > {@code docId}. These indices can then be
* used to binary search. */
private long gallop(long docId) {
// step the cursor onto the entry that used to be "next"
index++;
this.docId = nextDocId;
long hiIndex = index + 1;
while (true) {
if (hiIndex >= docIDsLength) {
// ran past the end of docIds: clamp and use maxDoc as the upper sentinel
hiIndex = docIDsLength;
nextDocId = maxDoc;
break;
}
final long hiDocId = docIds.get(hiIndex);
if (hiDocId > docId) {
// found an upper bound
nextDocId = hiDocId;
break;
}
// hiDocId <= docId: it becomes the new lower bound, then probe further ahead
final long delta = hiIndex - index;
index = hiIndex;
this.docId = hiDocId;
hiIndex += delta << 1; // double the step each time
}
return hiIndex;
}
/**
 * Narrows the range between {@code index} (value <= docId) and {@code hiIndex}
 * (value > docId) until they are adjacent, updating {@code docId} and {@code nextDocId}
 * along the way.
 */
private void binarySearch(long hiIndex, long docId) {
while (index + 1 < hiIndex) {
final long midIndex = (index + hiIndex) >>> 1;
final long midDocId = docIds.get(midIndex);
if (midDocId > docId) {
hiIndex = midIndex;
nextDocId = midDocId;
} else {
index = midIndex;
this.docId = midDocId;
}
}
}
/**
 * Assertion helper: checks that the cursor brackets {@code docId} and that the cached doc IDs
 * agree with {@code docIds} at {@code index} and {@code nextIndex}. Always returns true so it
 * can be wrapped in an {@code assert}.
 */
private boolean checkInvariants(long nextIndex, long docId) {
assert this.docId <= docId;
assert this.nextDocId > docId;
assert (index == -1 && this.docId == -1) || this.docId == docIds.get(index);
assert (nextIndex == docIDsLength && nextDocId == maxDoc) || nextDocId == docIds.get(nextIndex);
return true;
}
/** Positions the cursor for {@code docId}: gallop to bracket it, then binary search inside the bracket. */
private void exponentialSearch(long docId) {
// seek forward by doubling the interval on each iteration
final long hiIndex = gallop(docId);
assert checkInvariants(hiIndex, docId);
// now perform the actual binary search
binarySearch(hiIndex, docId);
}
/**
 * Returns true if {@code docId} is one of the stored doc IDs. After the call {@code index} is
 * the position of the greatest stored doc ID that is <= {@code docId}, or -1 if there is none.
 */
boolean get(final long docId) {
if (docId < this.docId) {
// reading doc IDs backward, go back to the start
reset();
}
if (docId >= nextDocId) {
exponentialSearch(docId);
}
assert checkInvariants(index + 1, docId);
return docId == this.docId;
}
@Override
public boolean get(int index) {
return get((long) index);
}
/** Returns {@code maxDoc} narrowed to an int. */
@Override
public int length() {
return (int) maxDoc;
}
}
/**
 * {@link LongValues} view over sparsely stored values: documents flagged by
 * {@code docsWithField} read their value from {@code values} at the position the bit lookup
 * left in {@code docsWithField.index}; all other documents yield {@code missingValue}.
 */
static class SparseLongValues extends LongValues {

  final SparseBits docsWithField;
  final LongValues values;
  final long missingValue;

  SparseLongValues(SparseBits docsWithField, LongValues values, long missingValue) {
    this.docsWithField = docsWithField;
    this.values = values;
    this.missingValue = missingValue;
  }

  @Override
  public long get(long docId) {
    // the lookup below also positions docsWithField.index on the matching entry
    final boolean hasValue = docsWithField.get(docId);
    return hasValue ? values.get(docsWithField.index) : missingValue;
  }
}
/**
 * Returns the binary doc values of {@code field}, dispatching on the stored entry format
 * (fixed-length, variable-length or prefix-compressed).
 */
@Override
public BinaryDocValues getBinary(FieldInfo field) throws IOException {
  final BinaryEntry binaryEntry = binaries.get(field.name);
  final int format = binaryEntry.format;
  if (format == BINARY_FIXED_UNCOMPRESSED) {
    return getFixedBinary(field, binaryEntry);
  }
  if (format == BINARY_VARIABLE_UNCOMPRESSED) {
    return getVariableBinary(field, binaryEntry);
  }
  if (format == BINARY_PREFIX_COMPRESSED) {
    return getCompressedBinary(field, binaryEntry);
  }
  throw new AssertionError();
}
/**
 * Returns a reader for fixed-length binary values: value {@code id} occupies
 * {@code maxLength} bytes starting at {@code id * maxLength} inside the field's slice.
 * Every call to {@code get} fills and returns the same shared {@link BytesRef}.
 */
private BinaryDocValues getFixedBinary(FieldInfo field, final BinaryEntry bytes) throws IOException {
  final IndexInput valuesSlice = this.data.slice("fixed-binary", bytes.offset, bytes.count * bytes.maxLength);
  final BytesRef shared = new BytesRef(bytes.maxLength);
  final byte[] target = shared.bytes;
  shared.length = bytes.maxLength;
  final int valueLength = shared.length;
  return new LongBinaryDocValues() {
    @Override
    public BytesRef get(long id) {
      try {
        valuesSlice.seek(id * valueLength);
        valuesSlice.readBytes(target, 0, target.length);
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
      return shared;
    }
  };
}
/**
 * Returns a reader for variable-length binary values: value {@code id} spans the bytes between
 * {@code addresses.get(id)} and {@code addresses.get(id + 1)} inside the field's slice.
 * Every call to {@code get} fills and returns the same shared {@link BytesRef}.
 */
private BinaryDocValues getVariableBinary(FieldInfo field, final BinaryEntry bytes) throws IOException {
  final long addressesLength = bytes.addressesEndOffset - bytes.addressesOffset;
  final RandomAccessInput addressSlice = this.data.randomAccessSlice(bytes.addressesOffset, addressesLength);
  final LongValues startAddresses = DirectMonotonicReader.getInstance(bytes.addressesMeta, addressSlice);
  final IndexInput valuesSlice = this.data.slice("var-binary", bytes.offset, bytes.addressesOffset - bytes.offset);
  final BytesRef shared = new BytesRef(Math.max(0, bytes.maxLength));
  final byte[] target = shared.bytes;
  return new LongBinaryDocValues() {
    @Override
    public BytesRef get(long id) {
      final long from = startAddresses.get(id);
      final long to = startAddresses.get(id + 1);
      final int valueLength = (int) (to - from);
      try {
        valuesSlice.seek(from);
        valuesSlice.readBytes(target, 0, valueLength);
        shared.length = valueLength;
        return shared;
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }
  };
}
/**
 * returns an address instance for prefix-compressed binary values.
 * A previously cached reader for the field is reused; otherwise one is loaded from the data
 * file and, unless this is a merge instance, cached and added to the RAM accounting.
 */
private synchronized MonotonicBlockPackedReader getIntervalInstance(FieldInfo field, BinaryEntry bytes) throws IOException {
  final MonotonicBlockPackedReader cached = addressInstances.get(field.name);
  if (cached != null) {
    return cached;
  }
  data.seek(bytes.addressesOffset);
  final long intervalCount = (bytes.count + INTERVAL_MASK) >>> INTERVAL_SHIFT;
  final MonotonicBlockPackedReader loaded =
      MonotonicBlockPackedReader.of(data, bytes.packedIntsVersion, bytes.blockSize, intervalCount, false);
  if (!merging) {
    addressInstances.put(field.name, loaded);
    ramBytesUsed.addAndGet(loaded.ramBytesUsed() + RamUsageEstimator.NUM_BYTES_INT);
  }
  return loaded;
}
/**
 * returns a reverse lookup instance for prefix-compressed binary values.
 * A previously cached index for the field is reused; otherwise the term addresses and the
 * term bytes are loaded from the data file and, unless this is a merge instance, the index is
 * cached and added to the RAM accounting.
 */
private synchronized ReverseTermsIndex getReverseIndexInstance(FieldInfo field, BinaryEntry bytes) throws IOException {
  final ReverseTermsIndex cached = reverseIndexInstances.get(field.name);
  if (cached != null) {
    return cached;
  }
  final ReverseTermsIndex loaded = new ReverseTermsIndex();
  data.seek(bytes.reverseIndexOffset);
  final long addressCount = (bytes.count + REVERSE_INTERVAL_MASK) >>> REVERSE_INTERVAL_SHIFT;
  loaded.termAddresses =
      MonotonicBlockPackedReader.of(data, bytes.packedIntsVersion, bytes.blockSize, addressCount, false);
  // the term bytes follow the addresses, prefixed by their total length
  final long termBytesLength = data.readVLong();
  final PagedBytes termBytes = new PagedBytes(15);
  termBytes.copy(data, termBytesLength);
  loaded.terms = termBytes.freeze(true);
  if (!merging) {
    reverseIndexInstances.put(field.name, loaded);
    ramBytesUsed.addAndGet(loaded.ramBytesUsed());
  }
  return loaded;
}
/**
 * Returns a reader for prefix-compressed binary values, built from the field's interval
 * addresses, its reverse terms index and a slice of the data file covering the terms.
 */
private BinaryDocValues getCompressedBinary(FieldInfo field, final BinaryEntry bytes) throws IOException {
  final MonotonicBlockPackedReader intervalAddresses = getIntervalInstance(field, bytes);
  final ReverseTermsIndex reverseIndex = getReverseIndexInstance(field, bytes);
  assert intervalAddresses.size() > 0; // we don't have to handle empty case
  final long termsLength = bytes.addressesOffset - bytes.offset;
  final IndexInput termsSlice = data.slice("terms", bytes.offset, termsLength);
  return new CompressedBinaryDocValues(bytes, intervalAddresses, reverseIndex, termsSlice);
}
/**
 * Returns sorted doc values for {@code field}: per-document ordinals come from the field's
 * numeric "ords" entry and terms from its binary entry. Term lookup and enumeration are
 * delegated to the binary reader when it is a {@code CompressedBinaryDocValues}.
 */
@Override
public SortedDocValues getSorted(FieldInfo field) throws IOException {
  final int termCount = (int) binaries.get(field.name).count;
  final BinaryDocValues terms = getBinary(field);
  final NumericEntry ordsEntry = ords.get(field.name);
  final LongValues docToOrd = getNumeric(ordsEntry);
  return new SortedDocValues() {

    @Override
    public int getOrd(int docID) {
      return (int) docToOrd.get(docID);
    }

    @Override
    public BytesRef lookupOrd(int ord) {
      return terms.get(ord);
    }

    @Override
    public int getValueCount() {
      return termCount;
    }

    @Override
    public int lookupTerm(BytesRef key) {
      if (!(terms instanceof CompressedBinaryDocValues)) {
        return super.lookupTerm(key);
      }
      return (int) ((CompressedBinaryDocValues) terms).lookupTerm(key);
    }

    @Override
    public TermsEnum termsEnum() {
      if (!(terms instanceof CompressedBinaryDocValues)) {
        return super.termsEnum();
      }
      return ((CompressedBinaryDocValues) terms).getTermsEnum();
    }
  };
}
/**
 * returns an address instance for sortedset ordinal lists, read through a random-access slice
 * of the data file spanning the entry's offset range and decoded with its monotonic metadata.
 */
private LongValues getOrdIndexInstance(FieldInfo field, NumericEntry entry) throws IOException {
  final long sliceLength = entry.endOffset - entry.offset;
  final RandomAccessInput addressSlice = this.data.randomAccessSlice(entry.offset, sliceLength);
  return DirectMonotonicReader.getInstance(entry.monotonicMeta, addressSlice);
}
/**
 * Returns sorted-numeric doc values for {@code field}, depending on how the field was stored:
 * single-valued (wrapped numeric values plus docs-with-field bits), with addresses (a value
 * stream sliced per document by an ord index), or table-encoded (each document's ordinal
 * selects a range of a small in-memory table).
 */
@Override
public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException {
  final SortedSetEntry setEntry = sortedNumerics.get(field.name);
  switch (setEntry.format) {
    case SORTED_SINGLE_VALUED: {
      NumericEntry numericEntry = numerics.get(field.name);
      final LongValues values = getNumeric(numericEntry);
      final Bits docsWithField;
      if (numericEntry.format != SPARSE_COMPRESSED) {
        docsWithField = getLiveBits(numericEntry.missingOffset, maxDoc);
      } else {
        // sparse values already carry their own docs-with-field bits
        docsWithField = ((SparseLongValues) values).docsWithField;
      }
      return DocValues.singleton(values, docsWithField);
    }
    case SORTED_WITH_ADDRESSES: {
      NumericEntry numericEntry = numerics.get(field.name);
      final LongValues valueStream = getNumeric(numericEntry);
      final LongValues docToStart = getOrdIndexInstance(field, ordIndexes.get(field.name));
      return new SortedNumericDocValues() {
        long from;
        long to;

        @Override
        public void setDocument(int doc) {
          from = docToStart.get(doc);
          to = docToStart.get(doc + 1L);
        }

        @Override
        public long valueAt(int index) {
          return valueStream.get(from + index);
        }

        @Override
        public int count() {
          return (int) (to - from);
        }
      };
    }
    case SORTED_SET_TABLE: {
      NumericEntry ordsEntry = ords.get(field.name);
      final LongValues docToSet = getNumeric(ordsEntry);
      final long[] valueTable = setEntry.table;
      final int[] setOffsets = setEntry.tableOffsets;
      return new SortedNumericDocValues() {
        int from;
        int to;

        @Override
        public void setDocument(int doc) {
          final int setId = (int) docToSet.get(doc);
          from = setOffsets[setId];
          to = setOffsets[setId + 1];
        }

        @Override
        public long valueAt(int index) {
          return valueTable[from + index];
        }

        @Override
        public int count() {
          return to - from;
        }
      };
    }
    default:
      throw new AssertionError();
  }
}
/**
 * Returns sorted-set doc values for {@code field}, dispatching on the stored format:
 * single-valued fields wrap the sorted doc values, the others use the address-based or
 * table-based reader.
 */
@Override
public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException {
  final SortedSetEntry setEntry = sortedSets.get(field.name);
  final int format = setEntry.format;
  if (format == SORTED_SINGLE_VALUED) {
    final SortedDocValues single = getSorted(field);
    return DocValues.singleton(single);
  }
  if (format == SORTED_WITH_ADDRESSES) {
    return getSortedSetWithAddresses(field);
  }
  if (format == SORTED_SET_TABLE) {
    return getSortedSetTable(field, setEntry);
  }
  throw new AssertionError();
}
/**
 * Returns sorted-set doc values whose per-document ordinals are a contiguous range of an
 * ordinal stream; the range of document {@code d} is
 * {@code [ordIndex.get(d), ordIndex.get(d + 1))}.
 */
private SortedSetDocValues getSortedSetWithAddresses(FieldInfo field) throws IOException {
  final long termCount = binaries.get(field.name).count;
  // we keep the byte[]s and list of ords on disk, these could be large
  final LongBinaryDocValues terms = (LongBinaryDocValues) getBinary(field);
  final LongValues ordStream = getNumeric(ords.get(field.name));
  // but the addresses to the ord stream are in RAM
  final LongValues docToStart = getOrdIndexInstance(field, ordIndexes.get(field.name));
  return new RandomAccessOrds() {
    long startOffset;
    long offset;
    long endOffset;

    @Override
    public void setDocument(int docID) {
      final long first = docToStart.get(docID);
      startOffset = first;
      offset = first;
      endOffset = docToStart.get(docID + 1L);
    }

    @Override
    public long nextOrd() {
      if (offset == endOffset) {
        return NO_MORE_ORDS;
      }
      final long current = ordStream.get(offset);
      offset++;
      return current;
    }

    @Override
    public long ordAt(int index) {
      return ordStream.get(startOffset + index);
    }

    @Override
    public int cardinality() {
      return (int) (endOffset - startOffset);
    }

    @Override
    public BytesRef lookupOrd(long ord) {
      return terms.get(ord);
    }

    @Override
    public long getValueCount() {
      return termCount;
    }

    @Override
    public long lookupTerm(BytesRef key) {
      if (!(terms instanceof CompressedBinaryDocValues)) {
        return super.lookupTerm(key);
      }
      return ((CompressedBinaryDocValues) terms).lookupTerm(key);
    }

    @Override
    public TermsEnum termsEnum() {
      if (!(terms instanceof CompressedBinaryDocValues)) {
        return super.termsEnum();
      }
      return ((CompressedBinaryDocValues) terms).getTermsEnum();
    }
  };
}
/**
 * Builds sorted-set doc values for the table format.
 *
 * <p>Each document stores a single number that indexes {@code ss.tableOffsets}; the
 * document's ordinals are the entries of {@code ss.table} in the half-open range
 * {@code [tableOffsets[n], tableOffsets[n + 1])}.
 *
 * @param field field to read; its name keys the binary and ord metadata
 * @param ss the field's sorted-set entry, supplying the table and its offsets
 * @return a random-access view over the per-document ordinals
 * @throws IOException if one of the underlying readers cannot be opened
 */
private SortedSetDocValues getSortedSetTable(FieldInfo field, SortedSetEntry ss) throws IOException {
  final String name = field.name;
  final long valueCount = binaries.get(name).count;
  final LongBinaryDocValues binary = (LongBinaryDocValues) getBinary(field);
  final LongValues ordinals = getNumeric(ords.get(name));
  final long[] table = ss.table;
  final int[] offsets = ss.tableOffsets;
  // resolved once up front: non-null only when the terms are a CompressedBinaryDocValues
  final CompressedBinaryDocValues compressed =
      binary instanceof CompressedBinaryDocValues ? (CompressedBinaryDocValues) binary : null;

  return new RandomAccessOrds() {
    // [first, limit) is the current document's range in the table;
    // cursor is the position of the next ord handed out by nextOrd()
    int first;
    int cursor;
    int limit;

    @Override
    public void setDocument(int docID) {
      final int slot = (int) ordinals.get(docID);
      final int begin = offsets[slot];
      first = begin;
      cursor = begin;
      limit = offsets[slot + 1];
    }

    @Override
    public long nextOrd() {
      return cursor == limit ? NO_MORE_ORDS : table[cursor++];
    }

    @Override
    public long ordAt(int index) {
      return table[first + index];
    }

    @Override
    public int cardinality() {
      return limit - first;
    }

    @Override
    public BytesRef lookupOrd(long ord) {
      return binary.get(ord);
    }

    @Override
    public long getValueCount() {
      return valueCount;
    }

    @Override
    public long lookupTerm(BytesRef key) {
      // the compressed terms provide their own reverse lookup
      return compressed != null ? compressed.lookupTerm(key) : super.lookupTerm(key);
    }

    @Override
    public TermsEnum termsEnum() {
      // the compressed terms provide their own enumeration
      return compressed != null ? compressed.getTermsEnum() : super.termsEnum();
    }
  };
}
/**
 * Returns the docs-with-field bits stored at {@code offset}.
 *
 * <p>The two sentinel offsets {@code ALL_MISSING} and {@code ALL_LIVE} produce constant
 * bit sets. Any other offset is treated as the start of a bitset of
 * {@code ceil(count / 8)} bytes inside {@code data}, where bit {@code i} is bit
 * {@code i & 7} of byte {@code i >> 3}.
 *
 * @param offset sentinel value or start of the bitset within {@code data}
 * @param count number of bits the returned instance reports as its length
 * @return a view over the bits; an {@link IOException} raised while reading a bit is
 *     rethrown wrapped in a {@link RuntimeException}
 * @throws IOException if the slice over the bitset cannot be created
 */
private Bits getLiveBits(final long offset, final int count) throws IOException {
  if (offset == ALL_MISSING) {
    return new Bits.MatchNoBits(count);
  }
  if (offset == ALL_LIVE) {
    return new Bits.MatchAllBits(count);
  }

  // round the bit count up to whole bytes; the long arithmetic avoids int overflow
  final int numBytes = (int) ((count + 7L) >>> 3);
  final RandomAccessInput slice = data.randomAccessSlice(offset, numBytes);
  return new Bits() {
    @Override
    public int length() {
      return count;
    }

    @Override
    public boolean get(int index) {
      final byte packed;
      try {
        packed = slice.readByte(index >> 3);
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
      final int mask = 1 << (index & 7);
      return (packed & mask) != 0;
    }
  };
}
/**
 * Builds the docs-with-field bits for a sparse numeric entry.
 *
 * <p>The bytes of {@code data} between {@code entry.missingOffset} and
 * {@code entry.offset} are decoded with {@link DirectMonotonicReader} using
 * {@code entry.monotonicMeta}, and the result is handed to {@link SparseBits}
 * as the doc IDs.
 *
 * @param entry numeric entry describing the doc-ID region
 * @return sparse bits over {@code maxDoc} documents
 * @throws IOException if the slice or the monotonic reader cannot be created
 */
private SparseBits getSparseLiveBits(NumericEntry entry) throws IOException {
  final long regionStart = entry.missingOffset;
  final long regionLength = entry.offset - regionStart;
  final RandomAccessInput region = this.data.randomAccessSlice(regionStart, regionLength);
  final LongValues docIds = DirectMonotonicReader.getInstance(entry.monotonicMeta, region);
  return new SparseBits(maxDoc, entry.numDocsWithValue, docIds);
}
/**
 * Returns the bits telling which documents have a value for {@code field}.
 *
 * <p>Numeric and binary fields read their bits from the metadata entry (sparse
 * numeric entries use the sparse representation). The sorted variants derive the
 * bits from the doc values themselves via {@code DocValues.docsWithValue}.
 *
 * @param field field to inspect
 * @return bits over {@code maxDoc} documents
 * @throws IOException if an underlying reader fails
 * @throws AssertionError if the field's doc-values type is not one handled here
 */
@Override
public Bits getDocsWithField(FieldInfo field) throws IOException {
  switch (field.getDocValuesType()) {
    case NUMERIC: {
      final NumericEntry numeric = numerics.get(field.name);
      return numeric.format == SPARSE_COMPRESSED
          ? getSparseLiveBits(numeric)
          : getLiveBits(numeric.missingOffset, maxDoc);
    }
    case BINARY: {
      final BinaryEntry binary = binaries.get(field.name);
      return getLiveBits(binary.missingOffset, maxDoc);
    }
    case SORTED:
      return DocValues.docsWithValue(getSorted(field), maxDoc);
    case SORTED_NUMERIC:
      return DocValues.docsWithValue(getSortedNumeric(field), maxDoc);
    case SORTED_SET:
      return DocValues.docsWithValue(getSortedSet(field), maxDoc);
    default:
      throw new AssertionError();
  }
}
/**
 * Returns a new {@code Lucene54DocValuesProducer} built from this instance.
 *
 * @return a fresh producer constructed with {@code this} as its argument
 * @throws IOException if the new producer cannot be constructed
 */
@Override
public synchronized DocValuesProducer getMergeInstance() throws IOException {
  final Lucene54DocValuesProducer mergeInstance = new Lucene54DocValuesProducer(this);
  return mergeInstance;
}
/**
 * Closes the {@code data} input held by this producer.
 *
 * @throws IOException if closing the input fails
 */
@Override
public void close() throws IOException {
  this.data.close();
}
/** Metadata entry for a numeric docvalues field. */
static class NumericEntry {

  // ---- location of the data ----

  /** Offset to the bitset representing docsWithField, or -1 if no documents have missing values. */
  long missingOffset;

  /** Offset to the actual numeric values. */
  public long offset;

  /** End offset to the actual numeric values. */
  public long endOffset;

  // ---- encoding ----

  /** Encoding format of this entry; readers compare it against the format constants. */
  int format;

  /** Bits per value used to pack the numeric values. */
  public int bitsPerValue;

  /** Count of values written. */
  public long count;

  /** Monotonic meta. */
  public DirectMonotonicReader.Meta monotonicMeta;

  // NOTE(review): the next three look like parameters of the delta / gcd / table
  // encodings, but their use is outside this part of the file; confirm.
  long minValue;
  long gcd;
  long[] table;

  // ---- sparse compression ----

  /** For sparse compression: the value count handed to the sparse docs-with-field bits. */
  long numDocsWithValue;

  /** For sparse compression. */
  NumericEntry nonMissingValues;

  NumberType numberType;

  /** Instances are only created from within the enclosing producer. */
  private NumericEntry() {}
}
/** Metadata entry for a binary docvalues field. */
static class BinaryEntry {

  // ---- location of the data ----

  /** Offset to the bitset representing docsWithField, or -1 if no documents have missing values. */
  long missingOffset;

  /** Offset to the actual binary values. */
  long offset;

  // ---- encoding ----

  /** Encoding format of this entry. */
  int format;

  /** Count of values written. */
  public long count;

  // NOTE(review): presumably the shortest and longest value length in bytes;
  // maxLength is what sizes the term buffers of the compressed terms enum.
  int minLength;
  int maxLength;

  // ---- addressing ----

  /** Offset to the addressing data that maps a value to its slice of the byte[]. */
  public long addressesOffset;

  /** End offset of the addressing data that maps a value to its slice of the byte[]. */
  public long addressesEndOffset;

  /** Meta data for addresses. */
  public DirectMonotonicReader.Meta addressesMeta;

  /** Offset to the reverse index. */
  public long reverseIndexOffset;

  /** Packed ints version used to encode addressing information. */
  public int packedIntsVersion;

  /** Packed ints blocksize. */
  public int blockSize;

  /** Instances are only created from within the enclosing producer. */
  private BinaryEntry() {}
}
/** Metadata entry for a sorted-set docvalues field. */
static class SortedSetEntry {

  /** Encoding format; selects how the sorted-set values are decoded. */
  int format;

  /** Table format only: the values that the ranges in {@link #tableOffsets} point into. */
  long[] table;

  /**
   * Table format only: entry {@code n} and entry {@code n + 1} delimit the half-open
   * range of {@link #table} that belongs to slot {@code n}.
   */
  int[] tableOffsets;

  /** Instances are only created from within the enclosing producer. */
  private SortedSetEntry() {}
}
// Internally, complex doc values (sorted / sorted-set) are composed from simpler
// ones, which requires addressing binary values by a long id rather than an int docID.
abstract static class LongBinaryDocValues extends BinaryDocValues {

  /** Returns the value stored under the given {@code long} identifier. */
  abstract BytesRef get(long id);

  /** Widens {@code docID} to a {@code long} and delegates to {@link #get(long)}. */
  @Override
  public final BytesRef get(int docID) {
    final long id = docID;
    return get(id);
  }
}
/**
 * Reverse index over terms, used for reverse lookup to narrow a search to a small
 * range of blocks.
 *
 * <p>{@link #terms} holds the bytes of the indexed terms and {@link #termAddresses}
 * holds the address of each indexed term within those bytes.
 */
static class ReverseTermsIndex implements Accountable {
  /** Address of each indexed term within {@link #terms}. */
  public MonotonicBlockPackedReader termAddresses;

  /** Bytes of the indexed terms. */
  public PagedBytes.Reader terms;

  /** Returns the combined RAM usage of the term addresses and the term bytes. */
  @Override
  public long ramBytesUsed() {
    return termAddresses.ramBytesUsed() + terms.ramBytesUsed();
  }

  /**
   * Returns the two child resources, "term bytes" followed by "term addresses",
   * as an unmodifiable collection.
   */
  @Override
  public Collection<Accountable> getChildResources() {
    // Typed as Accountable rather than raw, so the override matches the interface
    // without unchecked conversions.
    List<Accountable> resources = new ArrayList<>();
    resources.add(Accountables.namedAccountable("term bytes", terms));
    resources.add(Accountables.namedAccountable("term addresses", termAddresses));
    return Collections.unmodifiableList(resources);
  }

  @Override
  public String toString() {
    return getClass().getSimpleName() + "(size=" + termAddresses.size() + ")";
  }
}
/**
 * Binary doc values for the compressed case, with a few additional operations for
 * more efficient reverse lookup ({@code lookupTerm}) and enumeration
 * ({@code getTermsEnum}).
 *
 * <p>{@code get(long)} and {@code lookupTerm} both drive the single shared
 * {@code termsEnum}, which reads from the {@code data} input passed to the
 * constructor. {@code getTermsEnum()} instead builds a fresh enum over
 * {@code data.clone()}.
 */
static final class CompressedBinaryDocValues extends LongBinaryDocValues {
// total number of terms (bytes.count)
final long numValues;
// number of entries in addresses; seekCeil uses it as the upper bound on block indexes
final long numIndexValues;
// bytes.maxLength; sizes the term buffers of every terms enum
final int maxTermLength;
// maps a block index to the position of that block in data
final MonotonicBlockPackedReader addresses;
final IndexInput data;
// shared enum backing get(long) and lookupTerm(BytesRef)
final CompressedBinaryTermsEnum termsEnum;
// reverse index: term bytes and their addresses, searched by binarySearchIndex
final PagedBytes.Reader reverseTerms;
final MonotonicBlockPackedReader reverseAddresses;
// number of entries in the reverse index
final long numReverseIndexValues;
/**
 * Copies the sizing information from {@code bytes}, keeps references to the block
 * addresses, the reverse index and the data input, and creates the shared terms enum
 * directly over {@code data} (not a clone).
 */
public CompressedBinaryDocValues(BinaryEntry bytes, MonotonicBlockPackedReader addresses, ReverseTermsIndex index, IndexInput data) throws IOException {
this.maxTermLength = bytes.maxLength;
this.numValues = bytes.count;
this.addresses = addresses;
this.numIndexValues = addresses.size();
this.data = data;
this.reverseTerms = index.terms;
this.reverseAddresses = index.termAddresses;
this.numReverseIndexValues = reverseAddresses.size();
this.termsEnum = getTermsEnum(data);
}
/**
 * Positions the shared enum on ordinal {@code id} and returns its term.
 * The returned BytesRef is the enum's own reusable instance, so its content changes
 * on the next call that moves the shared enum. An IOException is rethrown wrapped in
 * a RuntimeException.
 */
@Override
public BytesRef get(long id) {
try {
termsEnum.seekExact(id);
return termsEnum.term();
} catch (IOException e) {
throw new RuntimeException(e);
}
}
/**
 * Reverse lookup of {@code key} through the shared enum.
 * Returns the ordinal when the term is found; {@code -ord-1} of the first greater term
 * when it is not found; and {@code -numValues-1} when every term is smaller than the key.
 * An IOException is rethrown wrapped in a RuntimeException.
 */
long lookupTerm(BytesRef key) {
try {
switch (termsEnum.seekCeil(key)) {
case FOUND: return termsEnum.ord();
case NOT_FOUND: return -termsEnum.ord()-1;
default: return -numValues-1;
}
} catch (IOException bogus) {
throw new RuntimeException(bogus);
}
}
/**
 * Returns a new terms enum reading from a clone of {@code data}, so it does not
 * disturb the position of the shared enum. An IOException is rethrown wrapped in a
 * RuntimeException.
 */
TermsEnum getTermsEnum() {
try {
return getTermsEnum(data.clone());
} catch (IOException e) {
throw new RuntimeException(e);
}
}
// creates an enum that reads from (and repositions) the given input
private CompressedBinaryTermsEnum getTermsEnum(IndexInput input) throws IOException {
return new CompressedBinaryTermsEnum(input);
}
/**
 * Terms enum over the block-encoded term dictionary. Each block starts with a header
 * (first term in full, then the address table for the remaining terms of the block);
 * the remaining terms are stored as a prefix length shared with the block's first term
 * plus suffix bytes.
 */
class CompressedBinaryTermsEnum extends TermsEnum {
// ordinal of the current term; -1 until the enum has been positioned
private long currentOrd = -1;
// offset to the start of the current block
private long currentBlockStart;
private final IndexInput input;
// delta from currentBlockStart to start of each term
private final int offsets[] = new int[INTERVAL_COUNT];
// raw address bytes from the block header (one or two bytes per address)
private final byte buffer[] = new byte[2*INTERVAL_COUNT-1];
// current term; reused and returned by term()/next()
private final BytesRef term = new BytesRef(maxTermLength);
// first term of the current block, the base for prefix sharing
private final BytesRef firstTerm = new BytesRef(maxTermLength);
// scratch space for reading reverse-index terms in binarySearchIndex
private final BytesRef scratch = new BytesRef();
// starts at the beginning of the input
CompressedBinaryTermsEnum(IndexInput input) throws IOException {
this.input = input;
input.seek(0);
}
// reads the block header at the current input position: the first term (vInt length
// + bytes), then the address table; afterwards the input is positioned right after
// the header, and that position is remembered as currentBlockStart
private void readHeader() throws IOException {
firstTerm.length = input.readVInt();
input.readBytes(firstTerm.bytes, 0, firstTerm.length);
input.readBytes(buffer, 0, INTERVAL_COUNT-1);
// a leading -1 byte marks the two-byte address encoding
if (buffer[0] == -1) {
readShortAddresses();
} else {
readByteAddresses();
}
currentBlockStart = input.getFilePointer();
}
// read single byte addresses: each is delta - 2
// (shared prefix byte and length > 0 are both implicit)
private void readByteAddresses() throws IOException {
int addr = 0;
for (int i = 1; i < offsets.length; i++) {
// buffer[i-1] is treated as unsigned; offsets[] holds the running sum
addr += 2 + (buffer[i-1] & 0xFF);
offsets[i] = addr;
}
}
// read double byte addresses: each is delta - 2
// (shared prefix byte and length > 0 are both implicit)
private void readShortAddresses() throws IOException {
// buffer[0] holds the marker byte; the remaining address bytes are appended after
// the INTERVAL_COUNT-1 bytes readHeader already read
input.readBytes(buffer, INTERVAL_COUNT-1, INTERVAL_COUNT);
int addr = 0;
for (int i = 1; i < offsets.length; i++) {
int x = i<<1;
// high byte at x-1, low byte at x.
// NOTE(review): the high byte is shifted without masking, so a negative high byte
// would sign-extend; presumably deltas never get that large -- confirm against the writer.
addr += 2 + ((buffer[x-1] << 8) | (buffer[x] & 0xFF));
offsets[i] = addr;
}
}
// set term to the first term
private void readFirstTerm() throws IOException {
term.length = firstTerm.length;
System.arraycopy(firstTerm.bytes, firstTerm.offset, term.bytes, 0, term.length);
}
// read term at offset, delta encoded from first term
private void readTerm(int offset) throws IOException {
// one byte: number of leading bytes shared with the block's first term
int start = input.readByte() & 0xFF;
System.arraycopy(firstTerm.bytes, firstTerm.offset, term.bytes, 0, start);
// stored entry size minus the prefix-length byte just consumed
int suffix = offsets[offset] - offsets[offset-1] - 1;
input.readBytes(term.bytes, start, suffix);
term.length = start + suffix;
}
// advances to the next ordinal, reading sequentially from the current input position;
// returns null once the ordinal reaches numValues
@Override
public BytesRef next() throws IOException {
currentOrd++;
if (currentOrd >= numValues) {
return null;
} else {
// position of the term within its block
int offset = (int) (currentOrd & INTERVAL_MASK);
if (offset == 0) {
// switch to next block
readHeader();
readFirstTerm();
} else {
readTerm(offset);
}
return term;
}
}
// binary search reverse index to find smaller
// range of blocks to search.
// returns the index of an exact match, otherwise the index of the last entry that
// sorts before text (-1 when text sorts before every entry)
long binarySearchIndex(BytesRef text) throws IOException {
long low = 0;
long high = numReverseIndexValues - 1;
while (low <= high) {
long mid = (low + high) >>> 1;
reverseTerms.fill(scratch, reverseAddresses.get(mid));
int cmp = scratch.compareTo(text);
if (cmp < 0) {
low = mid + 1;
} else if (cmp > 0) {
high = mid - 1;
} else {
return mid;
}
}
return high;
}
// binary search against first term in block range
// to find term's block.
// returns the block whose first term equals text, otherwise the last block whose first
// term sorts before text (low - 1 when there is none). Overwrites term and moves input.
long binarySearchBlock(BytesRef text, long low, long high) throws IOException {
while (low <= high) {
long mid = (low + high) >>> 1;
input.seek(addresses.get(mid));
term.length = input.readVInt();
input.readBytes(term.bytes, 0, term.length);
int cmp = term.compareTo(text);
if (cmp < 0) {
low = mid + 1;
} else if (cmp > 0) {
high = mid - 1;
} else {
return mid;
}
}
return high;
}
// positions the enum on the smallest term >= text: FOUND on an exact match, NOT_FOUND
// when positioned on a greater term, END when every term is smaller
@Override
public SeekStatus seekCeil(BytesRef text) throws IOException {
// locate block: narrow to block range with index, then search blocks
final long block;
long indexPos = binarySearchIndex(text);
if (indexPos < 0) {
// text sorts before every reverse-index entry: start from the first block
block = 0;
} else {
// the reverse-index entry covers a run of blocks starting at low
long low = indexPos << BLOCK_INTERVAL_SHIFT;
long high = Math.min(numIndexValues - 1, low + BLOCK_INTERVAL_MASK);
// binarySearchBlock may return low - 1; never go below low
block = Math.max(low, binarySearchBlock(text, low, high));
}
// position before block, then scan to term.
input.seek(addresses.get(block));
// one before the block's first ordinal, so the first next() reads the block header
currentOrd = (block << INTERVAL_SHIFT) - 1;
while (next() != null) {
int cmp = term.compareTo(text);
if (cmp == 0) {
return SeekStatus.FOUND;
} else if (cmp > 0) {
return SeekStatus.NOT_FOUND;
}
}
return SeekStatus.END;
}
// positions the enum on the given ordinal; the block header is re-read only when the
// ordinal lies in a different block than the current one
@Override
public void seekExact(long ord) throws IOException {
long block = ord >>> INTERVAL_SHIFT;
// with currentOrd == -1 the unsigned shift yields a large value, so the first seek
// always loads a header
if (block != currentOrd >>> INTERVAL_SHIFT) {
// switch to different block
input.seek(addresses.get(block));
readHeader();
}
currentOrd = ord;
int offset = (int) (ord & INTERVAL_MASK);
if (offset == 0) {
readFirstTerm();
} else {
// jump straight to the term's entry inside the block
input.seek(currentBlockStart + offsets[offset-1]);
readTerm(offset);
}
}
// returns the reusable current-term instance
@Override
public BytesRef term() throws IOException {
return term;
}
// returns the ordinal of the current term
@Override
public long ord() throws IOException {
return currentOrd;
}
// not supported by this enum
@Override
public int docFreq() throws IOException {
throw new UnsupportedOperationException();
}
// always -1
@Override
public long totalTermFreq() throws IOException {
return -1;
}
// not supported by this enum
@Override
public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException {
throw new UnsupportedOperationException();
}
}
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy