
java.org.apache.lucene.util.bkd.BKDReader Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of lucene Show documentation
Libraries for Elasticsearch
The newest version!
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.util.bkd;
import java.io.IOException;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.PointValues;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.MathUtil;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.StringHelper;
/** Handles intersection of a multi-dimensional shape in byte[] space with a block KD-tree previously written with {@link BKDWriter}.
*
* @lucene.experimental */
public final class BKDReader extends PointValues implements Accountable {
// Node ID of the first leaf in the implicit binary-heap node numbering (== numLeaves):
final int leafNodeOffset;
// How many dimensions each indexed point has:
final int numDims;
// Bytes used to encode a single dimension of a single point:
final int bytesPerDim;
// Total number of leaf blocks in the tree:
final int numLeaves;
// Underlying index input; cloned per intersection (see getIntersectState):
final IndexInput in;
// Maximum number of points stored in one leaf block:
final int maxPointsInLeafNode;
// Global per-dimension minimum/maximum packed values, read from the index header:
final byte[] minPackedValue;
final byte[] maxPackedValue;
// Total point count as recorded in the index header:
final long pointCount;
// Document count as recorded in the index header:
final int docCount;
// Index-format version decoded from the codec header:
final int version;
// Derived: numDims * bytesPerDim:
protected final int packedBytesLength;
// Used for 6.4.0+ index format:
final byte[] packedIndex;
// Used for Legacy (pre-6.4.0) index format, to hold a compact form of the index:
final private byte[] splitPackedValues;
// Bytes per entry in splitPackedValues (bytesPerDim, plus 1 when the split dim is stored explicitly):
final int bytesPerIndexEntry;
// Legacy format only: file pointer to the start of each leaf block (null for packed format):
final long[] leafBlockFPs;
/** Caller must pre-seek the provided {@link IndexInput} to the index location that {@link BKDWriter#finish} returned.
 *
 * <p>NOTE(review): the extracted source was garbled (HTML-stripped) at the min/max validation
 * loop and the legacy leaf-FP decode loop; both restored from the upstream Lucene BKDReader.
 *
 * @param in input positioned at the BKD index; retained (not cloned) for later leaf reads
 * @throws IOException if the header or index cannot be read
 * @throws CorruptIndexException if a dim's minPackedValue exceeds its maxPackedValue */
public BKDReader(IndexInput in) throws IOException {
  version = CodecUtil.checkHeader(in, BKDWriter.CODEC_NAME, BKDWriter.VERSION_START, BKDWriter.VERSION_CURRENT);
  numDims = in.readVInt();
  maxPointsInLeafNode = in.readVInt();
  bytesPerDim = in.readVInt();
  // 1D indexes since VERSION_IMPLICIT_SPLIT_DIM_1D omit the (always 0) split dim byte:
  bytesPerIndexEntry = numDims == 1 && version >= BKDWriter.VERSION_IMPLICIT_SPLIT_DIM_1D ? bytesPerDim : bytesPerDim + 1;
  packedBytesLength = numDims * bytesPerDim;

  // Read index:
  numLeaves = in.readVInt();
  assert numLeaves > 0;
  leafNodeOffset = numLeaves;

  minPackedValue = new byte[packedBytesLength];
  maxPackedValue = new byte[packedBytesLength];

  in.readBytes(minPackedValue, 0, packedBytesLength);
  in.readBytes(maxPackedValue, 0, packedBytesLength);

  // Sanity-check the global bounds per dimension:
  for(int dim=0;dim<numDims;dim++) {
    if (StringHelper.compare(bytesPerDim, minPackedValue, dim*bytesPerDim, maxPackedValue, dim*bytesPerDim) > 0) {
      throw new CorruptIndexException("minPackedValue " + new BytesRef(minPackedValue) + " is > maxPackedValue " + new BytesRef(maxPackedValue) + " for dim=" + dim, in);
    }
  }

  pointCount = in.readVLong();
  docCount = in.readVInt();

  if (version >= BKDWriter.VERSION_PACKED_INDEX) {
    // 6.4.0+ packed index: a single opaque byte[] decoded lazily by PackedIndexTree:
    int numBytes = in.readVInt();
    packedIndex = new byte[numBytes];
    in.readBytes(packedIndex, 0, numBytes);
    leafBlockFPs = null;
    splitPackedValues = null;
  } else {
    // legacy un-packed index
    splitPackedValues = new byte[bytesPerIndexEntry*numLeaves];
    in.readBytes(splitPackedValues, 0, splitPackedValues.length);

    // Read the file pointers to the start of each leaf block; they are delta-coded on disk:
    long[] leafBlockFPs = new long[numLeaves];
    long lastFP = 0;
    for(int i=0;i<numLeaves;i++) {
      long delta = in.readVLong();
      leafBlockFPs[i] = lastFP + delta;
      lastFP += delta;
    }

    // Possibly rotate the leaf block FPs when the last tree level is only partially filled
    // (1D case), so that heap node numbering lines up with on-disk leaf order:
    if (numDims == 1 && numLeaves > 1) {
      int levelCount = 2;
      while (true) {
        if (numLeaves >= levelCount && numLeaves <= 2*levelCount) {
          int lastLevel = 2*(numLeaves - levelCount);
          assert lastLevel >= 0;
          if (lastLevel != 0) {
            // Last level is partially filled, so we must rotate the leaf FPs to match. We do this here, after loading
            // at read-time, so that we can still delta code them on disk at write:
            long[] newLeafBlockFPs = new long[numLeaves];
            System.arraycopy(leafBlockFPs, lastLevel, newLeafBlockFPs, 0, leafBlockFPs.length - lastLevel);
            System.arraycopy(leafBlockFPs, 0, newLeafBlockFPs, leafBlockFPs.length - lastLevel, lastLevel);
            leafBlockFPs = newLeafBlockFPs;
          }
          break;
        }
        levelCount *= 2;
      }
    }

    this.leafBlockFPs = leafBlockFPs;
    packedIndex = null;
  }

  this.in = in;
}
/** Returns the smallest file pointer of any leaf block in this tree. */
long getMinLeafBlockFP() {
  if (packedIndex == null) {
    // Legacy format: scan the explicit leaf FP array for the smallest pointer.
    long smallest = Long.MAX_VALUE;
    for (int i = 0; i < leafBlockFPs.length; i++) {
      if (leafBlockFPs[i] < smallest) {
        smallest = leafBlockFPs[i];
      }
    }
    return smallest;
  }
  // Packed format: the index begins with the left-most leaf's FP as a vLong.
  return new ByteArrayDataInput(packedIndex).readVLong();
}
/** Used to walk the in-heap index
*
* @lucene.internal */
public abstract class IndexTree implements Cloneable {
// Current node in implicit binary-heap numbering: root is 1, children of n are 2n and 2n+1:
protected int nodeID;
// level is 1-based so that we can do level-1 w/o checking each time:
protected int level;
// Split dimension of the current node; -1 after pop (only valid after pushLeft/pushRight):
protected int splitDim;
// Per-level scratch buffers (lazily allocated) holding a packed split value for each level
// on the current root-to-node path:
protected final byte[][] splitPackedValueStack;
protected IndexTree() {
int treeDepth = getTreeDepth();
splitPackedValueStack = new byte[treeDepth+1][];
nodeID = 1;
level = 1;
splitPackedValueStack[level] = new byte[packedBytesLength];
}
// Descend to the left child (nodeID*2), allocating that level's scratch buffer on first use:
public void pushLeft() {
nodeID *= 2;
level++;
if (splitPackedValueStack[level] == null) {
splitPackedValueStack[level] = new byte[packedBytesLength];
}
}
/** Clone, but you are not allowed to pop up past the point where the clone happened. */
public abstract IndexTree clone();
// Descend to the right child (nodeID*2+1), allocating that level's scratch buffer on first use:
public void pushRight() {
nodeID = nodeID * 2 + 1;
level++;
if (splitPackedValueStack[level] == null) {
splitPackedValueStack[level] = new byte[packedBytesLength];
}
}
// Return to the parent node; splitDim becomes invalid (-1) until the next push:
public void pop() {
nodeID /= 2;
level--;
splitDim = -1;
//System.out.println("  pop nodeID=" + nodeID);
}
// In heap numbering all node IDs >= leafNodeOffset are leaves:
public boolean isLeafNode() {
return nodeID >= leafNodeOffset;
}
// A leaf slot can be missing when the last tree level is only partially filled;
// only the first leafNodeOffset leaf slots actually exist:
public boolean nodeExists() {
return nodeID - leafNodeOffset < leafNodeOffset;
}
public int getNodeID() {
return nodeID;
}
// Scratch buffer for the current level; callers write the split value for this cell into it:
public byte[] getSplitPackedValue() {
assert isLeafNode() == false;
assert splitPackedValueStack[level] != null: "level=" + level;
return splitPackedValueStack[level];
}
/** Only valid after pushLeft or pushRight, not pop! */
public int getSplitDim() {
assert isLeafNode() == false;
return splitDim;
}
/** Only valid after pushLeft or pushRight, not pop! */
public abstract BytesRef getSplitDimValue();
/** Only valid after pushLeft or pushRight, not pop! */
public abstract long getLeafBlockFP();
/** Return the number of leaves below the current node. */
public int getNumLeaves() {
// Walk down the left and right spines to the leaf level; the two leaf IDs bound
// the contiguous range of leaves under this node:
int leftMostLeafNode = nodeID;
while (leftMostLeafNode < leafNodeOffset) {
leftMostLeafNode = leftMostLeafNode * 2;
}
int rightMostLeafNode = nodeID;
while (rightMostLeafNode < leafNodeOffset) {
rightMostLeafNode = rightMostLeafNode * 2 + 1;
}
final int numLeaves;
if (rightMostLeafNode >= leftMostLeafNode) {
// both are on the same level
numLeaves = rightMostLeafNode - leftMostLeafNode + 1;
} else {
// left is one level deeper than right
numLeaves = rightMostLeafNode - leftMostLeafNode + 1 + leafNodeOffset;
}
assert numLeaves == getNumLeavesSlow(nodeID) : numLeaves + " " + getNumLeavesSlow(nodeID);
return numLeaves;
}
// for assertions: O(numLeaves) recursive count used to cross-check getNumLeaves
private int getNumLeavesSlow(int node) {
if (node >= 2 * leafNodeOffset) {
return 0;
} else if (node >= leafNodeOffset) {
return 1;
} else {
final int leftCount = getNumLeavesSlow(node * 2);
final int rightCount = getNumLeavesSlow(node * 2 + 1);
return leftCount + rightCount;
}
}
}
/** Reads the original simple yet heap-heavy index format */
private final class LegacyIndexTree extends IndexTree {
// File pointer of the current leaf block; -1 while positioned on an inner node:
private long leafBlockFP;
// Scratch holding the current node's split value for its split dimension:
private final byte[] splitDimValue = new byte[bytesPerDim];
// Reusable ref returned by getSplitDimValue(); permanently aimed at splitDimValue:
private final BytesRef scratch = new BytesRef();
public LegacyIndexTree() {
setNodeData();
scratch.bytes = splitDimValue;
scratch.length = bytesPerDim;
}
@Override
public LegacyIndexTree clone() {
// Copies walker position; only the current level's split value buffer is cloned,
// so (per the IndexTree.clone contract) the clone must not pop above this point:
LegacyIndexTree index = new LegacyIndexTree();
index.nodeID = nodeID;
index.level = level;
index.splitDim = splitDim;
index.leafBlockFP = leafBlockFP;
index.splitPackedValueStack[index.level] = splitPackedValueStack[index.level].clone();
return index;
}
@Override
public void pushLeft() {
super.pushLeft();
setNodeData();
}
@Override
public void pushRight() {
super.pushRight();
setNodeData();
}
// Refreshes leafBlockFP/splitDim/splitDimValue for the node we just moved to:
private void setNodeData() {
if (isLeafNode()) {
leafBlockFP = leafBlockFPs[nodeID - leafNodeOffset];
splitDim = -1;
} else {
leafBlockFP = -1;
// Each inner node owns bytesPerIndexEntry bytes in splitPackedValues:
int address = nodeID * bytesPerIndexEntry;
if (numDims == 1) {
splitDim = 0;
if (version < BKDWriter.VERSION_IMPLICIT_SPLIT_DIM_1D) {
// skip over wastefully encoded 0 splitDim:
assert splitPackedValues[address] == 0;
address++;
}
} else {
// First byte of the entry is the split dimension:
splitDim = splitPackedValues[address++] & 0xff;
}
System.arraycopy(splitPackedValues, address, splitDimValue, 0, bytesPerDim);
}
}
@Override
public long getLeafBlockFP() {
assert isLeafNode();
return leafBlockFP;
}
@Override
public BytesRef getSplitDimValue() {
assert isLeafNode() == false;
return scratch;
}
@Override
public void pop() {
super.pop();
// Invalidate the cached leaf FP; it only applies to the node we just left:
leafBlockFP = -1;
}
}
/** Reads the new packed byte[] index format which can be up to ~63% smaller than the legacy index format on 20M NYC taxis tests. This
* format takes advantage of the limited access pattern to the BKD tree at search time, i.e. starting at the root node and recursing
* downwards one child at a time. */
private final class PackedIndexTree extends IndexTree {
// used to read the packed byte[]
private final ByteArrayDataInput in;
// holds the minimum (left most) leaf block file pointer for each level we've recursed to:
private final long[] leafBlockFPStack;
// holds the address, in the packed byte[] index, of the left-node of each level:
private final int[] leftNodePositions;
// holds the address, in the packed byte[] index, of the right-node of each level:
private final int[] rightNodePositions;
// holds the splitDim for each level:
private final int[] splitDims;
// true if the per-dim delta we read for the node at this level is a negative offset vs. the last split on this dim; this is a packed
// 2D array, i.e. to access array[level][dim] you read from negativeDeltas[level*numDims+dim]. this will be true if the last time we
// split on this dimension, we next pushed to the left sub-tree:
private final boolean[] negativeDeltas;
// holds the packed per-level split values; the intersect method uses this to save the cell min/max as it recurses:
private final byte[][] splitValuesStack;
// scratch value to return from getPackedValue:
private final BytesRef scratch;
public PackedIndexTree() {
// All stacks are sized to the maximum possible recursion depth:
int treeDepth = getTreeDepth();
leafBlockFPStack = new long[treeDepth+1];
leftNodePositions = new int[treeDepth+1];
rightNodePositions = new int[treeDepth+1];
splitValuesStack = new byte[treeDepth+1][];
splitDims = new int[treeDepth+1];
negativeDeltas = new boolean[numDims*(treeDepth+1)];
in = new ByteArrayDataInput(packedIndex);
splitValuesStack[0] = new byte[packedBytesLength];
// Decode the root node (readNodeData reads at leafBlockFPStack[level-1]; level starts at 1):
readNodeData(false);
scratch = new BytesRef();
scratch.length = bytesPerDim;
}
@Override
public PackedIndexTree clone() {
// Copies only the current level's decoded state; per the IndexTree.clone contract
// the clone must not pop above the level at which it was taken:
PackedIndexTree index = new PackedIndexTree();
index.nodeID = nodeID;
index.level = level;
index.splitDim = splitDim;
index.leafBlockFPStack[level] = leafBlockFPStack[level];
index.leftNodePositions[level] = leftNodePositions[level];
index.rightNodePositions[level] = rightNodePositions[level];
index.splitValuesStack[index.level] = splitValuesStack[index.level].clone();
System.arraycopy(negativeDeltas, level*numDims, index.negativeDeltas, level*numDims, numDims);
index.splitDims[level] = splitDims[level];
return index;
}
@Override
public void pushLeft() {
// Capture the child's byte address before super.pushLeft() bumps level:
int nodePosition = leftNodePositions[level];
super.pushLeft();
// Child inherits the parent's per-dim delta signs, then records that we went left:
System.arraycopy(negativeDeltas, (level-1)*numDims, negativeDeltas, level*numDims, numDims);
assert splitDim != -1;
negativeDeltas[level*numDims+splitDim] = true;
in.setPosition(nodePosition);
readNodeData(true);
}
@Override
public void pushRight() {
int nodePosition = rightNodePositions[level];
super.pushRight();
System.arraycopy(negativeDeltas, (level-1)*numDims, negativeDeltas, level*numDims, numDims);
assert splitDim != -1;
negativeDeltas[level*numDims+splitDim] = false;
in.setPosition(nodePosition);
readNodeData(false);
}
@Override
public void pop() {
super.pop();
// Restore the parent's splitDim from the per-level stack (base class set it to -1):
splitDim = splitDims[level];
}
@Override
public long getLeafBlockFP() {
assert isLeafNode(): "nodeID=" + nodeID + " is not a leaf";
return leafBlockFPStack[level];
}
@Override
public BytesRef getSplitDimValue() {
assert isLeafNode() == false;
// Aim the reusable scratch at this level's packed split values, at the split dim's slice:
scratch.bytes = splitValuesStack[level];
scratch.offset = splitDim * bytesPerDim;
return scratch;
}
// Decodes the node at the current `in` position into this level's stacks. `isLeft` tells
// us whether the leaf-FP delta is implicit (left child shares the parent's minimum FP):
private void readNodeData(boolean isLeft) {
leafBlockFPStack[level] = leafBlockFPStack[level-1];
// read leaf block FP delta
if (isLeft == false) {
leafBlockFPStack[level] += in.readVLong();
}
if (isLeafNode()) {
splitDim = -1;
} else {
// read split dim, prefix, firstDiffByteDelta encoded as int:
int code = in.readVInt();
splitDim = code % numDims;
splitDims[level] = splitDim;
code /= numDims;
int prefix = code % (1+bytesPerDim);
int suffix = bytesPerDim - prefix;
if (splitValuesStack[level] == null) {
splitValuesStack[level] = new byte[packedBytesLength];
}
// Start from the parent's split values, then patch in this node's suffix:
System.arraycopy(splitValuesStack[level-1], 0, splitValuesStack[level], 0, packedBytesLength);
if (suffix > 0) {
int firstDiffByteDelta = code / (1+bytesPerDim);
if (negativeDeltas[level*numDims + splitDim]) {
firstDiffByteDelta = -firstDiffByteDelta;
}
int oldByte = splitValuesStack[level][splitDim*bytesPerDim+prefix] & 0xFF;
splitValuesStack[level][splitDim*bytesPerDim+prefix] = (byte) (oldByte + firstDiffByteDelta);
in.readBytes(splitValuesStack[level], splitDim*bytesPerDim+prefix+1, suffix-1);
} else {
// our split value is == last split value in this dim, which can happen when there are many duplicate values
}
// Inner-inner nodes store the byte length of their left subtree so we can compute
// where the right child begins; nodes whose children are leaves omit it:
int leftNumBytes;
if (nodeID * 2 < leafNodeOffset) {
leftNumBytes = in.readVInt();
} else {
leftNumBytes = 0;
}
leftNodePositions[level] = in.getPosition();
rightNodePositions[level] = leftNodePositions[level] + leftNumBytes;
}
}
}
/** Upper bound on the depth of the binary tree (1-based levels), used to size per-level stacks. */
private int getTreeDepth() {
  // A balanced tree over N leaves has floor(log2(N)) internal levels; +1 for the leaf
  // level itself, and +1 more because MathUtil.log floors the logarithm (e.g. 5 leaves
  // need a depth-4 tree even though floor(log2(5)) == 2).
  final int floorLog2 = MathUtil.log(numLeaves, 2);
  return floorLog2 + 2;
}
/** Used to track all state for a single call to {@link #intersect}. */
public static final class IntersectState {
  final IndexInput in;
  final int[] scratchDocIDs;
  final byte[] scratchPackedValue;
  final int[] commonPrefixLengths;
  final IntersectVisitor visitor;
  public final IndexTree index;

  public IntersectState(IndexInput in, int numDims,
                        int packedBytesLength,
                        int maxPointsInLeafNode,
                        IntersectVisitor visitor,
                        IndexTree indexVisitor) {
    this.in = in;
    this.visitor = visitor;
    this.index = indexVisitor;
    // Scratch buffers are sized once here so that visiting leaves allocates nothing:
    this.commonPrefixLengths = new int[numDims];
    this.scratchDocIDs = new int[maxPointsInLeafNode];
    this.scratchPackedValue = new byte[packedBytesLength];
  }
}
@Override
public void intersect(IntersectVisitor visitor) throws IOException {
// Walk the whole tree starting from the global bounds read at open time:
intersect(getIntersectState(visitor), minPackedValue, maxPackedValue);
}
@Override
public long estimatePointCount(IntersectVisitor visitor) {
// Same traversal entry point as intersect(), but estimating rather than visiting:
return estimatePointCount(getIntersectState(visitor), minPackedValue, maxPackedValue);
}
/** Fast path: this is called when the query box fully encompasses all cells under this node. */
private void addAll(IntersectState state, boolean grown) throws IOException {
  if (grown == false) {
    // Notify the visitor once with the total point count under this node, if it fits an int:
    final long pointLimit = (long) maxPointsInLeafNode * state.index.getNumLeaves();
    if (pointLimit <= Integer.MAX_VALUE) { // could be >MAX_VALUE if there are more than 2B points in total
      state.visitor.grow((int) pointLimit);
      grown = true;
    }
  }

  if (state.index.isLeafNode() == false) {
    // Inner node: recurse into both subtrees.
    state.index.pushLeft();
    addAll(state, grown);
    state.index.pop();

    state.index.pushRight();
    addAll(state, grown);
    state.index.pop();
  } else {
    assert grown;
    if (state.index.nodeExists()) {
      visitDocIDs(state.in, state.index.getLeafBlockFP(), state.visitor);
    }
    // TODO: we can assert that the first value here in fact matches what the index claimed?
  }
}
/** Create a new {@link IntersectState} */
public IntersectState getIntersectState(IntersectVisitor visitor) {
  // Choose the index walker matching whichever on-disk format was loaded at open time:
  final IndexTree indexTree =
      (packedIndex != null) ? new PackedIndexTree() : new LegacyIndexTree();
  return new IntersectState(in.clone(), numDims,
                            packedBytesLength,
                            maxPointsInLeafNode,
                            visitor,
                            indexTree);
}
/** Visits all docIDs and packed values in a single leaf block */
public void visitLeafBlockValues(IndexTree index, IntersectState state) throws IOException {
// Leaf node; scan and filter all points in this block:
int count = readDocIDs(state.in, index.getLeafBlockFP(), state.scratchDocIDs);
// Again, this time reading values and checking with the visitor
visitDocValues(state.commonPrefixLengths, state.scratchPackedValue, state.in, state.scratchDocIDs, count, state.visitor);
}
/** Streams only the docIDs of one leaf block straight into the visitor (no values). */
private void visitDocIDs(IndexInput in, long blockFP, IntersectVisitor visitor) throws IOException {
  // Position at the start of the leaf block:
  in.seek(blockFP);
  // The block begins with how many points it stores:
  final int count = in.readVInt();
  // No need to call grow(), it has been called up-front
  if (version >= BKDWriter.VERSION_COMPRESSED_DOC_IDS) {
    DocIdsWriter.readInts(in, count, visitor);
  } else {
    DocIdsWriter.readInts32(in, count, visitor);
  }
}
/** Reads one leaf block's docIDs into {@code docIDs} and returns how many were read. */
int readDocIDs(IndexInput in, long blockFP, int[] docIDs) throws IOException {
  in.seek(blockFP);
  // The block begins with how many points it stores:
  final int count = in.readVInt();
  final boolean compressedIds = version >= BKDWriter.VERSION_COMPRESSED_DOC_IDS;
  if (compressedIds) {
    DocIdsWriter.readInts(in, count, docIDs);
  } else {
    DocIdsWriter.readInts32(in, count, docIDs);
  }
  return count;
}
/** Reads one leaf block's packed values and feeds each (docID, value) pair to the visitor. */
void visitDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException {
  visitor.grow(count);
  readCommonPrefixes(commonPrefixLengths, scratchPackedValue, in);

  // Older indexes never run-length compress a dimension; signal that with -1:
  final int compressedDim;
  if (version < BKDWriter.VERSION_COMPRESSED_VALUES) {
    compressedDim = -1;
  } else {
    compressedDim = readCompressedDim(in);
  }

  if (compressedDim >= 0) {
    visitCompressedDocValues(commonPrefixLengths, scratchPackedValue, in, docIDs, count, visitor, compressedDim);
  } else {
    visitRawDocValues(commonPrefixLengths, scratchPackedValue, in, docIDs, count, visitor);
  }
}
// Just read suffixes for every dimension
private void visitRawDocValues(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in, int[] docIDs, int count, IntersectVisitor visitor) throws IOException {
for (int i = 0; i < count; ++i) {
for(int dim=0;dim= numDims) {
throw new CorruptIndexException("Got compressedDim="+compressedDim, in);
}
return compressedDim;
}
private void readCommonPrefixes(int[] commonPrefixLengths, byte[] scratchPackedValue, IndexInput in) throws IOException {
for(int dim=0;dim 0) {
in.readBytes(scratchPackedValue, dim*bytesPerDim, prefix);
}
//System.out.println("R: " + dim + " of " + numDims + " prefix=" + prefix);
}
}
private void intersect(IntersectState state, byte[] cellMinPacked, byte[] cellMaxPacked) throws IOException {
/*
System.out.println("\nR: intersect nodeID=" + state.index.getNodeID());
for(int dim=0;dim
Related Artifacts
Related Groups
-->
© 2015 - 2025 Weber Informatics LLC | Privacy Policy