All Downloads are FREE. Search and download functionality uses the official Maven repository.

org.apache.cassandra.index.sai.disk.v1.bbtree.BlockBalancedTreeWalker Maven / Gradle / Ivy

Go to download

The Apache Cassandra Project develops a highly scalable second-generation distributed database, bringing together Dynamo's fully distributed design and Bigtable's ColumnFamily-based data model.

There is a newer version: 5.0.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.cassandra.index.sai.disk.v1.bbtree;

import java.io.Closeable;
import java.io.IOException;
import java.util.Arrays;

import javax.annotation.concurrent.NotThreadSafe;

import com.google.common.annotations.VisibleForTesting;

import org.agrona.collections.IntArrayList;
import org.apache.cassandra.index.sai.disk.io.IndexInputReader;
import org.apache.cassandra.index.sai.disk.v1.SAICodecUtils;
import org.apache.cassandra.io.util.FileHandle;
import org.apache.cassandra.io.util.FileUtils;
import org.apache.cassandra.io.util.RandomAccessReader;
import org.apache.cassandra.utils.ByteArrayUtil;
import org.apache.cassandra.utils.ObjectSizes;
import org.apache.cassandra.utils.Throwables;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.BytesRef;

/**
 * Base reader for a block balanced tree previously written with {@link BlockBalancedTreeWriter}.
 * 

* Holds the index tree on heap and enables its traversal via {@link #traverse(TraversalCallback)}. */ public class BlockBalancedTreeWalker implements Closeable { final FileHandle treeIndexFile; final int bytesPerValue; final int numLeaves; final int treeDepth; final byte[] minPackedValue; final byte[] maxPackedValue; final long valueCount; final int maxValuesInLeafNode; final byte[] packedIndex; final long memoryUsage; BlockBalancedTreeWalker(FileHandle treeIndexFile, long treeIndexRoot) { this.treeIndexFile = treeIndexFile; try (RandomAccessReader reader = treeIndexFile.createReader(); IndexInput indexInput = IndexInputReader.create(reader)) { SAICodecUtils.validate(indexInput); indexInput.seek(treeIndexRoot); maxValuesInLeafNode = indexInput.readVInt(); bytesPerValue = indexInput.readVInt(); // Read index: numLeaves = indexInput.readVInt(); assert numLeaves > 0; treeDepth = indexInput.readVInt(); minPackedValue = new byte[bytesPerValue]; maxPackedValue = new byte[bytesPerValue]; indexInput.readBytes(minPackedValue, 0, bytesPerValue); indexInput.readBytes(maxPackedValue, 0, bytesPerValue); if (ByteArrayUtil.compareUnsigned(minPackedValue, 0, maxPackedValue, 0, bytesPerValue) > 0) { String message = String.format("Min packed value %s is > max packed value %s.", new BytesRef(minPackedValue), new BytesRef(maxPackedValue)); throw new CorruptIndexException(message, indexInput); } valueCount = indexInput.readVLong(); int numBytes = indexInput.readVInt(); packedIndex = new byte[numBytes]; indexInput.readBytes(packedIndex, 0, numBytes); memoryUsage = ObjectSizes.sizeOfArray(packedIndex) + ObjectSizes.sizeOfArray(minPackedValue) + ObjectSizes.sizeOfArray(maxPackedValue); } catch (Throwable t) { FileUtils.closeQuietly(treeIndexFile); throw Throwables.unchecked(t); } } @VisibleForTesting public BlockBalancedTreeWalker(DataInput indexInput, long treeIndexRoot) throws IOException { treeIndexFile = null; indexInput.skipBytes(treeIndexRoot); maxValuesInLeafNode = 
indexInput.readVInt(); bytesPerValue = indexInput.readVInt(); // Read index: numLeaves = indexInput.readVInt(); assert numLeaves > 0; treeDepth = indexInput.readVInt(); minPackedValue = new byte[bytesPerValue]; maxPackedValue = new byte[bytesPerValue]; indexInput.readBytes(minPackedValue, 0, bytesPerValue); indexInput.readBytes(maxPackedValue, 0, bytesPerValue); if (ByteArrayUtil.compareUnsigned(minPackedValue, 0, maxPackedValue, 0, bytesPerValue) > 0) { String message = String.format("Min packed value %s is > max packed value %s.", new BytesRef(minPackedValue), new BytesRef(maxPackedValue)); throw new CorruptIndexException(message, indexInput); } valueCount = indexInput.readVLong(); int numBytes = indexInput.readVInt(); packedIndex = new byte[numBytes]; indexInput.readBytes(packedIndex, 0, numBytes); memoryUsage = ObjectSizes.sizeOfArray(packedIndex) + ObjectSizes.sizeOfArray(minPackedValue) + ObjectSizes.sizeOfArray(maxPackedValue); } public long memoryUsage() { return memoryUsage; } public TraversalState newTraversalState() { return new TraversalState(); } @Override public void close() { FileUtils.closeQuietly(treeIndexFile); } void traverse(TraversalCallback callback) { traverse(newTraversalState(), callback, new IntArrayList()); } private void traverse(TraversalState state, TraversalCallback callback, IntArrayList pathToRoot) { if (state.atLeafNode()) { // In the unbalanced case it's possible the left most node only has one child: if (state.nodeExists()) { callback.onLeaf(state.nodeID, state.getLeafBlockFP(), pathToRoot); } } else { IntArrayList currentPath = new IntArrayList(); currentPath.addAll(pathToRoot); currentPath.add(state.nodeID); state.pushLeft(); traverse(state, callback, currentPath); state.pop(); state.pushRight(); traverse(state, callback, currentPath); state.pop(); } } interface TraversalCallback { void onLeaf(int leafNodeID, long leafBlockFP, IntArrayList pathToRoot); } /** * This maintains the state for a traversal of the packed index. 
It is loaded once and can be resused * by calling the reset method. *

* The packed index is a packed representation of a balanced tree and takes the form of a packed array of * file pointer / split value pairs. Both the file pointers and split values are prefix compressed by tree level * requiring us to maintain a stack of values for each level in the tree. The stack size is always the tree depth. *

* The tree is traversed by recursively following the left and then right subtrees under the current node. For the * following tree (split values in square brackets): *

     *        1[16]
     *       / \
     *      /   \
     *     2[8]  3[24]
     *    / \   / \
     *   4   5 6   7
     * 
* The traversal will be 1 -> 2 -> 4 -> 5 -> 3 -> 6 -> 7 with nodes 4, 5, 6 & 7 being leaf nodes. *

* Assuming the full range of values in the tree is 0 -> 32, the non-leaf nodes will represent the following * values: *

     *         1[0-32]
     *        /      \
     *    2[0-16]   3[16-32]
     * 
*/ @NotThreadSafe final class TraversalState { // used to read the packed index byte[] final ByteArrayDataInput dataInput; // holds the minimum (left most) leaf block file pointer for each level we've recursed to: final long[] leafBlockFPStack; // holds the address, in the packed byte[] index, of the left-node of each level: final int[] leftNodePositions; // holds the address, in the packed byte[] index, of the right-node of each level: final int[] rightNodePositions; // holds the packed per-level split values; the run method uses this to save the cell min/max as it recurses: final byte[][] splitValuesStack; int nodeID; int level; @VisibleForTesting int maxLevel; private TraversalState() { nodeID = 1; level = 0; leafBlockFPStack = new long[treeDepth]; leftNodePositions = new int[treeDepth]; rightNodePositions = new int[treeDepth]; splitValuesStack = new byte[treeDepth][]; this.dataInput = new ByteArrayDataInput(packedIndex); readNodeData(false); } public void pushLeft() { int nodePosition = leftNodePositions[level]; nodeID *= 2; level++; maxLevel = Math.max(maxLevel, level); dataInput.setPosition(nodePosition); readNodeData(true); } public void pushRight() { int nodePosition = rightNodePositions[level]; nodeID = nodeID * 2 + 1; level++; maxLevel = Math.max(maxLevel, level); dataInput.setPosition(nodePosition); readNodeData(false); } public void pop() { nodeID /= 2; level--; } public boolean atLeafNode() { return nodeID >= numLeaves; } public boolean nodeExists() { return nodeID - numLeaves < numLeaves; } public long getLeafBlockFP() { return leafBlockFPStack[level]; } public byte[] getSplitValue() { assert !atLeafNode(); return splitValuesStack[level]; } private void readNodeData(boolean isLeft) { leafBlockFPStack[level] = level == 0 ? 
0 : leafBlockFPStack[level - 1]; // read leaf block FP delta if (!isLeft) leafBlockFPStack[level] += dataInput.readVLong(); if (!atLeafNode()) { // read prefix, firstDiffByteDelta encoded as int: int code = dataInput.readVInt(); int prefix = code % (1 + bytesPerValue); int suffix = bytesPerValue - prefix; pushSplitValueStack(); if (suffix > 0) { int firstDiffByteDelta = code / (1 + bytesPerValue); // If we are pushing to the left subtree then the delta will be negative if (isLeft) firstDiffByteDelta = -firstDiffByteDelta; int oldByte = splitValuesStack[level][prefix] & 0xFF; splitValuesStack[level][prefix] = (byte) (oldByte + firstDiffByteDelta); dataInput.readBytes(splitValuesStack[level], prefix + 1, suffix - 1); } int leftNumBytes = nodeID * 2 < numLeaves ? dataInput.readVInt() : 0; leftNodePositions[level] = dataInput.getPosition(); rightNodePositions[level] = leftNodePositions[level] + leftNumBytes; } } private void pushSplitValueStack() { if (splitValuesStack[level] == null) splitValuesStack[level] = new byte[bytesPerValue]; if (level == 0) Arrays.fill(splitValuesStack[level], (byte) 0); else System.arraycopy(splitValuesStack[level - 1], 0, splitValuesStack[level], 0, bytesPerValue); } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy