/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.cassandra.index.sasi.disk;
import java.io.*;
import java.nio.ByteBuffer;
import java.util.*;
import java.util.stream.Collectors;
import org.apache.cassandra.db.DecoratedKey;
import org.apache.cassandra.index.sasi.Term;
import org.apache.cassandra.index.sasi.plan.Expression;
import org.apache.cassandra.index.sasi.plan.Expression.Op;
import org.apache.cassandra.index.sasi.utils.MappedBuffer;
import org.apache.cassandra.index.sasi.utils.RangeUnionIterator;
import org.apache.cassandra.index.sasi.utils.AbstractIterator;
import org.apache.cassandra.index.sasi.utils.RangeIterator;
import org.apache.cassandra.db.marshal.AbstractType;
import org.apache.cassandra.io.FSReadError;
import org.apache.cassandra.io.util.ChannelProxy;
import org.apache.cassandra.io.util.FileUtils;
import org.apache.cassandra.utils.ByteBufferUtil;
import org.apache.cassandra.utils.FBUtilities;
import com.google.common.base.Function;
import com.google.common.collect.Iterables;
import com.google.common.collect.Iterators;
import com.google.common.collect.PeekingIterator;
import static org.apache.cassandra.index.sasi.disk.OnDiskBlock.SearchResult;
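/**
 * A read-only, memory-mapped view over a single SASI index file.
 *
 * The file begins with a header (descriptor version, term size, min/max term,
 * min/max partition key, index mode and the marked-partials flag), followed by
 * data blocks, optional super-blocks and pointer levels; the last 8 bytes of
 * the file hold the offset of the level index (see the constructor below).
 *
 * A minimal usage sketch, assuming a UTF-8 comparator; the file path and the
 * {@code readKeyAt} function (mapping a key offset to its DecoratedKey) are
 * hypothetical and must be supplied by the caller:
 *
 * <pre>{@code
 * try (OnDiskIndex index = new OnDiskIndex(new File("/tmp/SI_example.db"),
 *                                          UTF8Type.instance,
 *                                          offset -> readKeyAt(offset)))
 * {
 *     for (OnDiskIndex.DataTerm term : index)
 *         System.out.println(term.getTerm());
 * }
 * }</pre>
 */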
public class OnDiskIndex implements Iterable<OnDiskIndex.DataTerm>, Closeable
{
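/**
 * Direction of term iteration relative to the on-disk term order: DESC steps
 * forward through the file (+1) starting from the expression's lower bound,
 * while ASC steps backward (-1) starting from the upper bound.
 */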
public enum IteratorOrder
{
DESC(1), ASC(-1);
public final int step;
IteratorOrder(int step)
{
this.step = step;
}
public int startAt(OnDiskBlock block, Expression e)
{
switch (this)
{
case DESC:
return e.lower == null
? 0
: startAt(block.search(e.validator, e.lower.value), e.lower.inclusive);
case ASC:
return e.upper == null
? block.termCount() - 1
: startAt(block.search(e.validator, e.upper.value), e.upper.inclusive);
default:
throw new IllegalArgumentException("Unknown order: " + this);
}
}
public int startAt(SearchResult found, boolean inclusive)
{
switch (this)
{
case DESC:
if (found.cmp < 0)
return found.index + 1;
return inclusive || found.cmp != 0 ? found.index : found.index + 1;
case ASC:
if (found.cmp < 0) // search term was bigger than the whole data set
return found.index;
return inclusive && (found.cmp == 0 || found.cmp < 0) ? found.index : found.index - 1;
default:
throw new IllegalArgumentException("Unknown order: " + this);
}
}
}
public final Descriptor descriptor;
protected final OnDiskIndexBuilder.Mode mode;
protected final OnDiskIndexBuilder.TermSize termSize;
protected final AbstractType<?> comparator;
protected final MappedBuffer indexFile;
protected final long indexSize;
protected final boolean hasMarkedPartials;
protected final Function<Long, DecoratedKey> keyFetcher;
protected final String indexPath;
protected final PointerLevel[] levels;
protected final DataLevel dataLevel;
protected final ByteBuffer minTerm, maxTerm, minKey, maxKey;
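/**
 * Memory-maps the given index file and reads its header: the descriptor
 * version, term size, min/max term, min/max partition key, index mode and
 * the partial-term marker flag, followed by the level index whose offset
 * is stored in the last 8 bytes of the file.
 */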
@SuppressWarnings("resource")
public OnDiskIndex(File index, AbstractType<?> cmp, Function<Long, DecoratedKey> keyReader)
{
keyFetcher = keyReader;
comparator = cmp;
indexPath = index.getAbsolutePath();
RandomAccessFile backingFile = null;
try
{
backingFile = new RandomAccessFile(index, "r");
descriptor = new Descriptor(backingFile.readUTF());
termSize = OnDiskIndexBuilder.TermSize.of(backingFile.readShort());
minTerm = ByteBufferUtil.readWithShortLength(backingFile);
maxTerm = ByteBufferUtil.readWithShortLength(backingFile);
minKey = ByteBufferUtil.readWithShortLength(backingFile);
maxKey = ByteBufferUtil.readWithShortLength(backingFile);
mode = OnDiskIndexBuilder.Mode.mode(backingFile.readUTF());
hasMarkedPartials = backingFile.readBoolean();
indexSize = backingFile.length();
indexFile = new MappedBuffer(new ChannelProxy(indexPath, backingFile.getChannel()));
// start of the levels
indexFile.position(indexFile.getLong(indexSize - 8));
int numLevels = indexFile.getInt();
levels = new PointerLevel[numLevels];
for (int i = 0; i < levels.length; i++)
{
int blockCount = indexFile.getInt();
levels[i] = new PointerLevel(indexFile.position(), blockCount);
indexFile.position(indexFile.position() + blockCount * 8);
}
int blockCount = indexFile.getInt();
dataLevel = new DataLevel(indexFile.position(), blockCount);
}
catch (IOException e)
{
throw new FSReadError(e, index);
}
finally
{
FileUtils.closeQuietly(backingFile);
}
}
public boolean hasMarkedPartials()
{
return hasMarkedPartials;
}
public OnDiskIndexBuilder.Mode mode()
{
return mode;
}
public ByteBuffer minTerm()
{
return minTerm;
}
public ByteBuffer maxTerm()
{
return maxTerm;
}
public ByteBuffer minKey()
{
return minKey;
}
public ByteBuffer maxKey()
{
return maxKey;
}
public DataTerm min()
{
return dataLevel.getBlock(0).getTerm(0);
}
public DataTerm max()
{
DataBlock block = dataLevel.getBlock(dataLevel.blockCount - 1);
return block.getTerm(block.termCount() - 1);
}
/**
 * Search the index file for rows matching the given expression.
 *
 * @param exp The expression to use for the query.
 *
 * @return Iterator over the rows of all terms in the given range (may be {@code null} when nothing matches).
 */
public RangeIterator<Long, Token> search(Expression exp)
{
assert mode.supports(exp.getOp());
if (exp.getOp() == Expression.Op.PREFIX && mode == OnDiskIndexBuilder.Mode.CONTAINS && !hasMarkedPartials)
throw new UnsupportedOperationException("prefix queries in CONTAINS mode are not supported by this index");
// optimization in case a single term is requested from the index;
// we don't need to build an additional union iterator
if (exp.getOp() == Op.EQ)
{
DataTerm term = getTerm(exp.lower.value);
return term == null ? null : term.getTokens();
}
// convert single NOT_EQ to range with exclusion
final Expression expression = (exp.getOp() != Op.NOT_EQ)
? exp
: new Expression(exp).setOp(Op.RANGE)
.setLower(new Expression.Bound(minTerm, true))
.setUpper(new Expression.Bound(maxTerm, true))
.addExclusion(exp.lower.value);
List<ByteBuffer> exclusions = new ArrayList<>(expression.exclusions.size());
Iterables.addAll(exclusions, expression.exclusions.stream().filter(exclusion -> {
// accept only exclusions which are in the bounds of lower/upper
return !(expression.lower != null && comparator.compare(exclusion, expression.lower.value) < 0)
&& !(expression.upper != null && comparator.compare(exclusion, expression.upper.value) > 0);
}).collect(Collectors.toList()));
Collections.sort(exclusions, comparator);
if (exclusions.size() == 0)
return searchRange(expression);
List<Expression> ranges = new ArrayList<>(exclusions.size());
// calculate range splits based on the sorted exclusions
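// e.g. a RANGE over (a, z) with sorted exclusions [d, q] is split into
// (a, d), (d, q) and (q, z); each exclusion becomes a non-inclusive
// bound, so the excluded values themselves are never matched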
Iterator<ByteBuffer> exclusionsIterator = exclusions.iterator();
Expression.Bound min = expression.lower, max = null;
while (exclusionsIterator.hasNext())
{
max = new Expression.Bound(exclusionsIterator.next(), false);
ranges.add(new Expression(expression).setOp(Op.RANGE).setLower(min).setUpper(max));
min = max;
}
assert max != null;
ranges.add(new Expression(expression).setOp(Op.RANGE).setLower(max).setUpper(expression.upper));
RangeUnionIterator.Builder<Long, Token> builder = RangeUnionIterator.builder();
for (Expression e : ranges)
{
@SuppressWarnings("resource")
RangeIterator<Long, Token> range = searchRange(e);
if (range != null)
builder.add(range);
}
return builder.build();
}
private RangeIterator<Long, Token> searchRange(Expression range)
{
Expression.Bound lower = range.lower;
Expression.Bound upper = range.upper;
int lowerBlock = lower == null ? 0 : getDataBlock(lower.value);
int upperBlock = upper == null
? dataLevel.blockCount - 1
// optimization so we don't have to fetch upperBlock when query has lower == upper
: (lower != null && comparator.compare(lower.value, upper.value) == 0) ? lowerBlock : getDataBlock(upper.value);
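// a term-by-term scan (searchPoint) is enough when the match is confined to
// one or two blocks; only SPARSE indexes with a wider span benefit from
// merging whole-block and super-block token trees (block-based searchRange)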
return (mode != OnDiskIndexBuilder.Mode.SPARSE || lowerBlock == upperBlock || upperBlock - lowerBlock <= 1)
? searchPoint(lowerBlock, range)
: searchRange(lowerBlock, lower, upperBlock, upper);
}
private RangeIterator<Long, Token> searchRange(int lowerBlock, Expression.Bound lower, int upperBlock, Expression.Bound upper)
{
// if lower is at the beginning of the block that means we can just do a single iterator per block
SearchResult lowerPosition = (lower == null) ? null : searchIndex(lower.value, lowerBlock);
SearchResult upperPosition = (upper == null) ? null : searchIndex(upper.value, upperBlock);
RangeUnionIterator.Builder<Long, Token> builder = RangeUnionIterator.builder();
// optimistically assume that first and last blocks are full block reads, saves at least 3 'else' conditions
int firstFullBlockIdx = lowerBlock, lastFullBlockIdx = upperBlock;
// 'lower' doesn't cover the whole block so we need to do a partial iteration
// Two reasons why that can happen:
// - 'lower' is not the first element of the block
// - 'lower' is first element but it's not inclusive in the query
if (lowerPosition != null && (lowerPosition.index > 0 || !lower.inclusive))
{
DataBlock block = dataLevel.getBlock(lowerBlock);
int start = (lower.inclusive || lowerPosition.cmp != 0) ? lowerPosition.index : lowerPosition.index + 1;
builder.add(block.getRange(start, block.termCount()));
firstFullBlockIdx = lowerBlock + 1;
}
if (upperPosition != null)
{
DataBlock block = dataLevel.getBlock(upperBlock);
int lastIndex = block.termCount() - 1;
// The same as with 'lower', but here we need to check if the upper bound is the last element of the block,
// which means that we only have to get individual results if:
// - it *is not* the last element, or
// - it *is* but shouldn't be included (dictated by upper.inclusive)
if (upperPosition.index != lastIndex || !upper.inclusive)
{
int end = (upperPosition.cmp < 0 || (upperPosition.cmp == 0 && upper.inclusive))
? upperPosition.index + 1 : upperPosition.index;
builder.add(block.getRange(0, end));
lastFullBlockIdx = upperBlock - 1;
}
}
int totalSuperBlocks = (lastFullBlockIdx - firstFullBlockIdx) / OnDiskIndexBuilder.SUPER_BLOCK_SIZE;
// if there are no super-blocks, we can simply read all of the block iterators in sequence
if (totalSuperBlocks == 0)
{
for (int i = firstFullBlockIdx; i <= lastFullBlockIdx; i++)
builder.add(dataLevel.getBlock(i).getBlockIndex().iterator(keyFetcher));
return builder.build();
}
// first get all of the blocks which are aligned before the first super-block in the sequence,
// e.g. if the block range was (1, 9) and super-block-size = 4, we need to read blocks 1, 2 and 3
// individually; 4 - 7 are covered by the super-block, and 8, 9 are the remainder.
int superBlockAlignedStart = firstFullBlockIdx == 0 ? 0 : (int) FBUtilities.align(firstFullBlockIdx, OnDiskIndexBuilder.SUPER_BLOCK_SIZE);
for (int blockIdx = firstFullBlockIdx; blockIdx < Math.min(superBlockAlignedStart, lastFullBlockIdx); blockIdx++)
builder.add(getBlockIterator(blockIdx));
// now read all of the super-blocks matched by the request; continuing the previous example,
// that's the super-block with index 1 (which covers blocks 4 to 7)
int superBlockIdx = superBlockAlignedStart / OnDiskIndexBuilder.SUPER_BLOCK_SIZE;
for (int offset = 0; offset < totalSuperBlocks - 1; offset++)
builder.add(dataLevel.getSuperBlock(superBlockIdx++).iterator());
// now it's time for the remainder read; in the same example that's blocks 8 and 9, because
// we have overshot the last super-block but didn't request enough blocks to cover the next one.
int lastCoveredBlock = superBlockIdx * OnDiskIndexBuilder.SUPER_BLOCK_SIZE;
for (int offset = 0; offset <= (lastFullBlockIdx - lastCoveredBlock); offset++)
builder.add(getBlockIterator(lastCoveredBlock + offset));
return builder.build();
}
private RangeIterator<Long, Token> searchPoint(int lowerBlock, Expression expression)
{
Iterator<DataTerm> terms = new TermIterator(lowerBlock, expression, IteratorOrder.DESC);
RangeUnionIterator.Builder<Long, Token> builder = RangeUnionIterator.builder();
while (terms.hasNext())
{
try
{
builder.add(terms.next().getTokens());
}
finally
{
expression.checkpoint();
}
}
return builder.build();
}
private RangeIterator<Long, Token> getBlockIterator(int blockIdx)
{
DataBlock block = dataLevel.getBlock(blockIdx);
return (block.hasCombinedIndex)
? block.getBlockIndex().iterator(keyFetcher)
: block.getRange(0, block.termCount());
}
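/**
 * Returns an iterator over the terms of this index, starting at the block
 * which covers the given query term. With DESC order the query becomes a
 * lower bound and terms are walked forward through the file; with ASC it
 * becomes an upper bound and terms are walked backward.
 */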
public Iterator<DataTerm> iteratorAt(ByteBuffer query, IteratorOrder order, boolean inclusive)
{
Expression e = new Expression("", comparator);
Expression.Bound bound = new Expression.Bound(query, inclusive);
switch (order)
{
case DESC:
e.setLower(bound);
break;
case ASC:
e.setUpper(bound);
break;
default:
throw new IllegalArgumentException("Unknown order: " + order);
}
return new TermIterator(levels.length == 0 ? 0 : getBlockIdx(findPointer(query), query), e, order);
}
private int getDataBlock(ByteBuffer query)
{
return levels.length == 0 ? 0 : getBlockIdx(findPointer(query), query);
}
public Iterator<DataTerm> iterator()
{
return new TermIterator(0, new Expression("", comparator), IteratorOrder.DESC);
}
public void close() throws IOException
{
FileUtils.closeQuietly(indexFile);
}
private PointerTerm findPointer(ByteBuffer query)
{
PointerTerm ptr = null;
for (PointerLevel level : levels)
{
if ((ptr = level.getPointer(ptr, query)) == null)
return null;
}
return ptr;
}
private DataTerm getTerm(ByteBuffer query)
{
SearchResult term = searchIndex(query, getDataBlock(query));
return term.cmp == 0 ? term.result : null;
}
private SearchResult searchIndex(ByteBuffer query, int blockIdx)
{
return dataLevel.getBlock(blockIdx).search(comparator, query);
}
private int getBlockIdx(PointerTerm ptr, ByteBuffer query)
{
int blockIdx = 0;
if (ptr != null)
{
int cmp = ptr.compareTo(comparator, query);
blockIdx = (cmp == 0 || cmp > 0) ? ptr.getBlock() : ptr.getBlock() + 1;
}
return blockIdx;
}
protected class PointerLevel extends Level<PointerBlock>
{
public PointerLevel(long offset, int count)
{
super(offset, count);
}
public PointerTerm getPointer(PointerTerm parent, ByteBuffer query)
{
return getBlock(getBlockIdx(parent, query)).search(comparator, query).result;
}
protected PointerBlock cast(MappedBuffer block)
{
return new PointerBlock(block);
}
}
protected class DataLevel extends Level<DataBlock>
{
protected final int superBlockCnt;
protected final long superBlocksOffset;
public DataLevel(long offset, int count)
{
super(offset, count);
long baseOffset = blockOffsets + blockCount * 8;
superBlockCnt = indexFile.getInt(baseOffset);
superBlocksOffset = baseOffset + 4;
}
protected DataBlock cast(MappedBuffer block)
{
return new DataBlock(block);
}
public OnDiskSuperBlock getSuperBlock(int idx)
{
assert idx < superBlockCnt : String.format("requested index %d is greater than super block count %d", idx, superBlockCnt);
long blockOffset = indexFile.getLong(superBlocksOffset + idx * 8);
return new OnDiskSuperBlock(indexFile.duplicate().position(blockOffset));
}
}
protected class OnDiskSuperBlock
{
private final TokenTree tokenTree;
public OnDiskSuperBlock(MappedBuffer buffer)
{
tokenTree = new TokenTree(descriptor, buffer);
}
public RangeIterator<Long, Token> iterator()
{
return tokenTree.iterator(keyFetcher);
}
}
protected abstract class Level<T extends OnDiskBlock>
{
protected final long blockOffsets;
protected final int blockCount;
public Level(long offsets, int count)
{
this.blockOffsets = offsets;
this.blockCount = count;
}
public T getBlock(int idx) throws FSReadError
{
assert idx >= 0 && idx < blockCount;
// calculate the block offset and move there
// (long is intentional; we just need an mmap implementation which supports long positions)
long blockOffset = indexFile.getLong(blockOffsets + idx * 8);
return cast(indexFile.duplicate().position(blockOffset));
}
protected abstract T cast(MappedBuffer block);
}
protected class DataBlock extends OnDiskBlock<DataTerm>
{
public DataBlock(MappedBuffer data)
{
super(descriptor, data, BlockType.DATA);
}
protected DataTerm cast(MappedBuffer data)
{
return new DataTerm(data, termSize, getBlockIndex());
}
public RangeIterator<Long, Token> getRange(int start, int end)
{
RangeUnionIterator.Builder<Long, Token> builder = RangeUnionIterator.builder();
NavigableMap<Long, Token> sparse = new TreeMap<>();
for (int i = start; i < end; i++)
{
DataTerm term = getTerm(i);
if (term.isSparse())
{
NavigableMap<Long, Token> tokens = term.getSparseTokens();
for (Map.Entry<Long, Token> t : tokens.entrySet())
{
Token token = sparse.get(t.getKey());
if (token == null)
sparse.put(t.getKey(), t.getValue());
else
token.merge(t.getValue());
}
}
else
{
builder.add(term.getTokens());
}
}
PrefetchedTokensIterator prefetched = sparse.isEmpty() ? null : new PrefetchedTokensIterator(sparse);
if (builder.rangeCount() == 0)
return prefetched;
builder.add(prefetched);
return builder.build();
}
}
protected class PointerBlock extends OnDiskBlock<PointerTerm>
{
public PointerBlock(MappedBuffer block)
{
super(descriptor, block, BlockType.POINTER);
}
protected PointerTerm cast(MappedBuffer data)
{
return new PointerTerm(data, termSize, hasMarkedPartials);
}
}
public class DataTerm extends Term implements Comparable<DataTerm>
{
private final TokenTree perBlockIndex;
protected DataTerm(MappedBuffer content, OnDiskIndexBuilder.TermSize size, TokenTree perBlockIndex)
{
super(content, size, hasMarkedPartials);
this.perBlockIndex = perBlockIndex;
}
public RangeIterator<Long, Token> getTokens()
{
final long blockEnd = FBUtilities.align(content.position(), OnDiskIndexBuilder.BLOCK_SIZE);
if (isSparse())
return new PrefetchedTokensIterator(getSparseTokens());
long offset = blockEnd + 4 + content.getInt(getDataOffset() + 1);
return new TokenTree(descriptor, indexFile.duplicate().position(offset)).iterator(keyFetcher);
}
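/**
 * The first byte at the term's data offset is a token count: a positive
 * value means the term is sparse and is followed by that many 8-byte token
 * entries, each resolved against the per-block token tree (see
 * getSparseTokens()); zero means the byte is instead followed by a relative
 * offset to the term's own TokenTree (see getTokens()).
 */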
public boolean isSparse()
{
return content.get(getDataOffset()) > 0;
}
public NavigableMap<Long, Token> getSparseTokens()
{
long ptrOffset = getDataOffset();
byte size = content.get(ptrOffset);
assert size > 0;
NavigableMap<Long, Token> individualTokens = new TreeMap<>();
for (int i = 0; i < size; i++)
{
Token token = perBlockIndex.get(content.getLong(ptrOffset + 1 + (8 * i)), keyFetcher);
assert token != null;
individualTokens.put(token.get(), token);
}
return individualTokens;
}
public int compareTo(DataTerm other)
{
return other == null ? 1 : compareTo(comparator, other.getTerm());
}
}
protected static class PointerTerm extends Term
{
public PointerTerm(MappedBuffer content, OnDiskIndexBuilder.TermSize size, boolean hasMarkedPartials)
{
super(content, size, hasMarkedPartials);
}
public int getBlock()
{
return content.getInt(getDataOffset());
}
}
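/**
 * A RangeIterator over tokens that have already been materialized in memory
 * (used for sparse terms), backed by a sorted map so that skipping to a
 * token is just a tailMap lookup.
 */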
private static class PrefetchedTokensIterator extends RangeIterator<Long, Token>
{
private final NavigableMap<Long, Token> tokens;
private PeekingIterator<Token> currentIterator;
public PrefetchedTokensIterator(NavigableMap<Long, Token> tokens)
{
super(tokens.firstKey(), tokens.lastKey(), tokens.size());
this.tokens = tokens;
this.currentIterator = Iterators.peekingIterator(tokens.values().iterator());
}
protected Token computeNext()
{
return currentIterator != null && currentIterator.hasNext()
? currentIterator.next()
: endOfData();
}
protected void performSkipTo(Long nextToken)
{
currentIterator = Iterators.peekingIterator(tokens.tailMap(nextToken, true).values().iterator());
}
public void close() throws IOException
{
endOfData();
}
}
public AbstractType<?> getComparator()
{
return comparator;
}
public String getIndexPath()
{
return indexPath;
}
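/**
 * Iterates terms block by block in the direction dictated by the given
 * IteratorOrder, lazily checking the expression's bounds: the lower bound
 * is only checked until it is first satisfied, and the upper bound only
 * for blocks whose last term doesn't already satisfy it.
 */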
private class TermIterator extends AbstractIterator<DataTerm>
{
private final Expression e;
private final IteratorOrder order;
protected OnDiskBlock<DataTerm> currentBlock;
protected int blockIndex, offset;
private boolean checkLower = true, checkUpper = true;
public TermIterator(int startBlock, Expression expression, IteratorOrder order)
{
this.e = expression;
this.order = order;
this.blockIndex = startBlock;
nextBlock();
}
protected DataTerm computeNext()
{
for (;;)
{
if (currentBlock == null)
return endOfData();
if (offset >= 0 && offset < currentBlock.termCount())
{
DataTerm currentTerm = currentBlock.getTerm(nextOffset());
// in PREFIX mode we need to step over all of the partial terms
// encountered by the query until the upper bound tells us to stop
if (e.getOp() == Op.PREFIX && currentTerm.isPartial())
continue;
// haven't reached the start of the query range yet, so let's
// keep skipping terms until the lower bound is satisfied
if (checkLower && !e.isLowerSatisfiedBy(currentTerm))
continue;
// flip the flag right on the first bounds match
// to avoid expensive comparisons
checkLower = false;
if (checkUpper && !e.isUpperSatisfiedBy(currentTerm))
return endOfData();
return currentTerm;
}
nextBlock();
}
}
protected void nextBlock()
{
currentBlock = null;
if (blockIndex < 0 || blockIndex >= dataLevel.blockCount)
return;
currentBlock = dataLevel.getBlock(nextBlockIndex());
offset = checkLower ? order.startAt(currentBlock, e) : currentBlock.minOffset(order);
// check the last term of the new block right away: if the expression's
// upper bound is satisfied by it, we can avoid doing any expensive
// upper-bound checks for the rest of that block.
checkUpper = e.hasUpper() && !e.isUpperSatisfiedBy(currentBlock.getTerm(currentBlock.maxOffset(order)));
}
protected int nextBlockIndex()
{
int current = blockIndex;
blockIndex += order.step;
return current;
}
protected int nextOffset()
{
int current = offset;
offset += order.step;
return current;
}
}
}