/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.cassandra.index.sasi.disk;

import java.io.*;
import java.nio.ByteBuffer;
import java.util.*;
import java.util.stream.Collectors;

import org.apache.cassandra.db.DecoratedKey;
import org.apache.cassandra.index.sasi.Term;
import org.apache.cassandra.index.sasi.plan.Expression;
import org.apache.cassandra.index.sasi.plan.Expression.Op;
import org.apache.cassandra.index.sasi.utils.MappedBuffer;
import org.apache.cassandra.index.sasi.utils.RangeUnionIterator;
import org.apache.cassandra.index.sasi.utils.AbstractIterator;
import org.apache.cassandra.index.sasi.utils.RangeIterator;
import org.apache.cassandra.db.marshal.AbstractType;
import org.apache.cassandra.io.FSReadError;
import org.apache.cassandra.io.util.ChannelProxy;
import org.apache.cassandra.io.util.FileUtils;
import org.apache.cassandra.utils.ByteBufferUtil;
import org.apache.cassandra.utils.FBUtilities;

import com.google.common.base.Function;
import com.google.common.collect.Iterables;
import com.google.common.collect.Iterators;
import com.google.common.collect.PeekingIterator;

import static org.apache.cassandra.index.sasi.disk.OnDiskBlock.SearchResult;

public class OnDiskIndex implements Iterable<OnDiskIndex.DataTerm>, Closeable
{
    public enum IteratorOrder
    {
        DESC(1), ASC(-1);
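
        // note: DESC steps forward (+1) from the expression's lower bound, while
        // ASC steps backward (-1) from its upper bound (see startAt below)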

        public final int step;

        IteratorOrder(int step)
        {
            this.step = step;
        }

        public int startAt(OnDiskBlock<DataTerm> block, Expression e)
        {
            switch (this)
            {
                case DESC:
                    return e.lower == null
                            ? 0
                            : startAt(block.search(e.validator, e.lower.value), e.lower.inclusive);

                case ASC:
                    return e.upper == null
                            ? block.termCount() - 1
                            : startAt(block.search(e.validator, e.upper.value), e.upper.inclusive);

                default:
                    throw new IllegalArgumentException("Unknown order: " + this);
            }
        }

        public int startAt(SearchResult<DataTerm> found, boolean inclusive)
        {
            switch (this)
            {
                case DESC:
                    if (found.cmp < 0)
                        return found.index + 1;

                    return inclusive || found.cmp != 0 ? found.index : found.index + 1;

                case ASC:
                    if (found.cmp < 0) // search term was bigger than the whole data set
                        return found.index;
                    return inclusive && (found.cmp == 0 || found.cmp < 0) ? found.index : found.index - 1;

                default:
                    throw new IllegalArgumentException("Unknown order: " + this);
            }
        }
    }

    public final Descriptor descriptor;
    protected final OnDiskIndexBuilder.Mode mode;
    protected final OnDiskIndexBuilder.TermSize termSize;

    protected final AbstractType<?> comparator;
    protected final MappedBuffer indexFile;
    protected final long indexSize;
    protected final boolean hasMarkedPartials;

    protected final Function<Long, DecoratedKey> keyFetcher;

    protected final String indexPath;

    protected final PointerLevel[] levels;
    protected final DataLevel dataLevel;

    protected final ByteBuffer minTerm, maxTerm, minKey, maxKey;

    @SuppressWarnings("resource")
    public OnDiskIndex(File index, AbstractType<?> cmp, Function<Long, DecoratedKey> keyReader)
    {
        keyFetcher = keyReader;

        comparator = cmp;
        indexPath = index.getAbsolutePath();

        RandomAccessFile backingFile = null;
        try
        {
            backingFile = new RandomAccessFile(index, "r");

            descriptor = new Descriptor(backingFile.readUTF());

            termSize = OnDiskIndexBuilder.TermSize.of(backingFile.readShort());

            minTerm = ByteBufferUtil.readWithShortLength(backingFile);
            maxTerm = ByteBufferUtil.readWithShortLength(backingFile);

            minKey = ByteBufferUtil.readWithShortLength(backingFile);
            maxKey = ByteBufferUtil.readWithShortLength(backingFile);

            mode = OnDiskIndexBuilder.Mode.mode(backingFile.readUTF());
            hasMarkedPartials = backingFile.readBoolean();

            indexSize = backingFile.length();
            indexFile = new MappedBuffer(new ChannelProxy(indexPath, backingFile.getChannel()));

            // start of the levels
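            // (file layout, as read below) the trailing 8 bytes of the file hold the
            // offset of the level index: an int level count, then for each pointer
            // level an int block count followed by blockCount 8-byte block offsets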
            indexFile.position(indexFile.getLong(indexSize - 8));

            int numLevels = indexFile.getInt();
            levels = new PointerLevel[numLevels];
            for (int i = 0; i < levels.length; i++)
            {
                int blockCount = indexFile.getInt();
                levels[i] = new PointerLevel(indexFile.position(), blockCount);
                indexFile.position(indexFile.position() + blockCount * 8);
            }

            int blockCount = indexFile.getInt();
            dataLevel = new DataLevel(indexFile.position(), blockCount);
        }
        catch (IOException e)
        {
            throw new FSReadError(e, index);
        }
        finally
        {
            FileUtils.closeQuietly(backingFile);
        }
    }

    public boolean hasMarkedPartials()
    {
        return hasMarkedPartials;
    }

    public OnDiskIndexBuilder.Mode mode()
    {
        return mode;
    }

    public ByteBuffer minTerm()
    {
        return minTerm;
    }

    public ByteBuffer maxTerm()
    {
        return maxTerm;
    }

    public ByteBuffer minKey()
    {
        return minKey;
    }

    public ByteBuffer maxKey()
    {
        return maxKey;
    }

    public DataTerm min()
    {
        return dataLevel.getBlock(0).getTerm(0);
    }

    public DataTerm max()
    {
        DataBlock block = dataLevel.getBlock(dataLevel.blockCount - 1);
        return block.getTerm(block.termCount() - 1);
    }

    /**
     * Search the index file for rows matching all of the terms in the given expression.
     *
     * @param exp The expression to use for the query.
     *
     * @return A range iterator over the tokens of all rows matching the given expression,
     *         or null if no terms match.
     */
    public RangeIterator<Long, Token> search(Expression exp)
    {
        assert mode.supports(exp.getOp());

        if (exp.getOp() == Expression.Op.PREFIX && mode == OnDiskIndexBuilder.Mode.CONTAINS && !hasMarkedPartials)
            throw new UnsupportedOperationException("prefix queries in CONTAINS mode are not supported by this index");

        // optimization for the case where a single term is requested from the index;
        // there is no need to build an additional union iterator
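        // e.g. (illustrative) a predicate such as age = 5 resolves to a single
        // DataTerm whose token tree is returned directly below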
        if (exp.getOp() == Op.EQ)
        {
            DataTerm term = getTerm(exp.lower.value);
            return term == null ? null : term.getTokens();
        }

        // convert single NOT_EQ to range with exclusion
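        // e.g. (illustrative) age != 5 over an index spanning [minTerm, maxTerm]
        // becomes RANGE [minTerm, maxTerm] (both bounds inclusive) with 5 as an exclusion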
        final Expression expression = (exp.getOp() != Op.NOT_EQ)
                                        ? exp
                                        : new Expression(exp).setOp(Op.RANGE)
                                                .setLower(new Expression.Bound(minTerm, true))
                                                .setUpper(new Expression.Bound(maxTerm, true))
                                                .addExclusion(exp.lower.value);

        List<ByteBuffer> exclusions = new ArrayList<>(expression.exclusions.size());

        Iterables.addAll(exclusions, expression.exclusions.stream().filter(exclusion -> {
            // accept only exclusions which are in the bounds of lower/upper
            return !(expression.lower != null && comparator.compare(exclusion, expression.lower.value) < 0)
                && !(expression.upper != null && comparator.compare(exclusion, expression.upper.value) > 0);
        }).collect(Collectors.toList()));

        Collections.sort(exclusions, comparator);

        if (exclusions.size() == 0)
            return searchRange(expression);

        List<Expression> ranges = new ArrayList<>(exclusions.size());

        // calculate range splits based on the sorted exclusions
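        // e.g. (illustrative) lower = a, upper = z with sorted exclusions {m, q}
        // produces sub-ranges [a, m), (m, q) and (q, z]; each exclusion becomes
        // an exclusive bound shared by the two adjacent sub-ranges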
        Iterator<ByteBuffer> exclusionsIterator = exclusions.iterator();

        Expression.Bound min = expression.lower, max = null;
        while (exclusionsIterator.hasNext())
        {
            max = new Expression.Bound(exclusionsIterator.next(), false);
            ranges.add(new Expression(expression).setOp(Op.RANGE).setLower(min).setUpper(max));
            min = max;
        }

        assert max != null;
        ranges.add(new Expression(expression).setOp(Op.RANGE).setLower(max).setUpper(expression.upper));

        RangeUnionIterator.Builder<Long, Token> builder = RangeUnionIterator.builder();
        for (Expression e : ranges)
        {
            @SuppressWarnings("resource")
            RangeIterator<Long, Token> range = searchRange(e);
            if (range != null)
                builder.add(range);
        }

        return builder.build();
    }

    private RangeIterator<Long, Token> searchRange(Expression range)
    {
        Expression.Bound lower = range.lower;
        Expression.Bound upper = range.upper;

        int lowerBlock = lower == null ? 0 : getDataBlock(lower.value);
        int upperBlock = upper == null
                ? dataLevel.blockCount - 1
                // optimization so we don't have to fetch upperBlock when query has lower == upper
                : (lower != null && comparator.compare(lower.value, upper.value) == 0) ? lowerBlock : getDataBlock(upper.value);

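        // term-by-term iteration from the lower block handles everything except a
        // SPARSE index whose match spans more than two blocks; only in that case
        // are whole blocks (and super-blocks) read directly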
        return (mode != OnDiskIndexBuilder.Mode.SPARSE || lowerBlock == upperBlock || upperBlock - lowerBlock <= 1)
                ? searchPoint(lowerBlock, range)
                : searchRange(lowerBlock, lower, upperBlock, upper);
    }

    private RangeIterator<Long, Token> searchRange(int lowerBlock, Expression.Bound lower, int upperBlock, Expression.Bound upper)
    {
        // if 'lower' is at the beginning of the block, we can simply do a single iterator per block
        SearchResult<DataTerm> lowerPosition = (lower == null) ? null : searchIndex(lower.value, lowerBlock);
        SearchResult<DataTerm> upperPosition = (upper == null) ? null : searchIndex(upper.value, upperBlock);

        RangeUnionIterator.Builder<Long, Token> builder = RangeUnionIterator.builder();

        // optimistically assume that first and last blocks are full block reads, saves at least 3 'else' conditions
        int firstFullBlockIdx = lowerBlock, lastFullBlockIdx = upperBlock;

        // 'lower' doesn't cover the whole block, so we need to do a partial iteration.
        // Two reasons why that can happen:
        //   - 'lower' is not the first element of the block
        //   - 'lower' is the first element but is not inclusive in the query
        if (lowerPosition != null && (lowerPosition.index > 0 || !lower.inclusive))
        {
            DataBlock block = dataLevel.getBlock(lowerBlock);
            int start = (lower.inclusive || lowerPosition.cmp != 0) ? lowerPosition.index : lowerPosition.index + 1;

            builder.add(block.getRange(start, block.termCount()));
            firstFullBlockIdx = lowerBlock + 1;
        }

        if (upperPosition != null)
        {
            DataBlock block = dataLevel.getBlock(upperBlock);
            int lastIndex = block.termCount() - 1;

            // The same as with 'lower', but here we need to check if 'upper' is the last element of the block,
            // which means that we only have to get individual results if:
            //  - it *is not* the last element, or
            //  - it *is* but shouldn't be included (dictated by upper.inclusive)
            if (upperPosition.index != lastIndex || !upper.inclusive)
            {
                int end = (upperPosition.cmp < 0 || (upperPosition.cmp == 0 && upper.inclusive))
                                ? upperPosition.index + 1 : upperPosition.index;

                builder.add(block.getRange(0, end));
                lastFullBlockIdx = upperBlock - 1;
            }
        }

        int totalSuperBlocks = (lastFullBlockIdx - firstFullBlockIdx) / OnDiskIndexBuilder.SUPER_BLOCK_SIZE;

        // if there are no super-blocks, we can simply read all of the block iterators in sequence
        if (totalSuperBlocks == 0)
        {
            for (int i = firstFullBlockIdx; i <= lastFullBlockIdx; i++)
                builder.add(dataLevel.getBlock(i).getBlockIndex().iterator(keyFetcher));

            return builder.build();
        }

        // first get all of the blocks which are aligned before the first super-block in the sequence,
        // e.g. if the block range was (1, 9) and super-block-size = 4, we need to read blocks 1, 2 and 3
        // individually; 4 - 7 are covered by the super-block; 8 and 9 are the remainder.

        int superBlockAlignedStart = firstFullBlockIdx == 0 ? 0 : (int) FBUtilities.align(firstFullBlockIdx, OnDiskIndexBuilder.SUPER_BLOCK_SIZE);
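        // e.g. (illustrative) align(1, 4) rounds firstFullBlockIdx = 1 up to the next
        // super-block boundary, 4, so the loop below reads blocks 1..3 individually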
        for (int blockIdx = firstFullBlockIdx; blockIdx < Math.min(superBlockAlignedStart, lastFullBlockIdx); blockIdx++)
            builder.add(getBlockIterator(blockIdx));

        // now read all of the super-blocks matched by the request; continuing the previous example,
        // that's the super-block with index 1 (which covers blocks 4 to 7)

        int superBlockIdx = superBlockAlignedStart / OnDiskIndexBuilder.SUPER_BLOCK_SIZE;
        for (int offset = 0; offset < totalSuperBlocks - 1; offset++)
            builder.add(dataLevel.getSuperBlock(superBlockIdx++).iterator());

        // finally, do a remainder read; in the example above that's blocks 8 and 9, which lie
        // past the last block covered by a super-block but don't fill the next super-block.

        int lastCoveredBlock = superBlockIdx * OnDiskIndexBuilder.SUPER_BLOCK_SIZE;
        for (int offset = 0; offset <= (lastFullBlockIdx - lastCoveredBlock); offset++)
            builder.add(getBlockIterator(lastCoveredBlock + offset));

        return builder.build();
    }

    private RangeIterator<Long, Token> searchPoint(int lowerBlock, Expression expression)
    {
        Iterator<DataTerm> terms = new TermIterator(lowerBlock, expression, IteratorOrder.DESC);
        RangeUnionIterator.Builder<Long, Token> builder = RangeUnionIterator.builder();

        while (terms.hasNext())
        {
            try
            {
                builder.add(terms.next().getTokens());
            }
            finally
            {
                expression.checkpoint();
            }
        }

        return builder.build();
    }

    private RangeIterator<Long, Token> getBlockIterator(int blockIdx)
    {
        DataBlock block = dataLevel.getBlock(blockIdx);
        return (block.hasCombinedIndex)
                ? block.getBlockIndex().iterator(keyFetcher)
                : block.getRange(0, block.termCount());
    }

    public Iterator<DataTerm> iteratorAt(ByteBuffer query, IteratorOrder order, boolean inclusive)
    {
        Expression e = new Expression("", comparator);
        Expression.Bound bound = new Expression.Bound(query, inclusive);

        switch (order)
        {
            case DESC:
                e.setLower(bound);
                break;

            case ASC:
                e.setUpper(bound);
                break;

            default:
                throw new IllegalArgumentException("Unknown order: " + order);
        }

        return new TermIterator(levels.length == 0 ? 0 : getBlockIdx(findPointer(query), query), e, order);
    }

    private int getDataBlock(ByteBuffer query)
    {
        return levels.length == 0 ? 0 : getBlockIdx(findPointer(query), query);
    }

    public Iterator<DataTerm> iterator()
    {
        return new TermIterator(0, new Expression("", comparator), IteratorOrder.DESC);
    }

    public void close() throws IOException
    {
        FileUtils.closeQuietly(indexFile);
    }

    private PointerTerm findPointer(ByteBuffer query)
    {
        PointerTerm ptr = null;
        for (PointerLevel level : levels)
        {
            if ((ptr = level.getPointer(ptr, query)) == null)
                return null;
        }

        return ptr;
    }

    private DataTerm getTerm(ByteBuffer query)
    {
        SearchResult<DataTerm> term = searchIndex(query, getDataBlock(query));
        return term.cmp == 0 ? term.result : null;
    }

    private SearchResult<DataTerm> searchIndex(ByteBuffer query, int blockIdx)
    {
        return dataLevel.getBlock(blockIdx).search(comparator, query);
    }

    private int getBlockIdx(PointerTerm ptr, ByteBuffer query)
    {
        int blockIdx = 0;
        if (ptr != null)
        {
            int cmp = ptr.compareTo(comparator, query);
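            // a pointer term >= query means the query belongs to the pointer's block,
            // otherwise it falls into the following block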
            blockIdx = (cmp == 0 || cmp > 0) ? ptr.getBlock() : ptr.getBlock() + 1;
        }

        return blockIdx;
    }

    protected class PointerLevel extends Level<PointerBlock>
    {
        public PointerLevel(long offset, int count)
        {
            super(offset, count);
        }

        public PointerTerm getPointer(PointerTerm parent, ByteBuffer query)
        {
            return getBlock(getBlockIdx(parent, query)).search(comparator, query).result;
        }

        protected PointerBlock cast(MappedBuffer block)
        {
            return new PointerBlock(block);
        }
    }

    protected class DataLevel extends Level<DataBlock>
    {
        protected final int superBlockCnt;
        protected final long superBlocksOffset;

        public DataLevel(long offset, int count)
        {
            super(offset, count);
            long baseOffset = blockOffsets + blockCount * 8;
            superBlockCnt = indexFile.getInt(baseOffset);
            superBlocksOffset = baseOffset + 4;
        }

        protected DataBlock cast(MappedBuffer block)
        {
            return new DataBlock(block);
        }

        public OnDiskSuperBlock getSuperBlock(int idx)
        {
            assert idx < superBlockCnt : String.format("requested index %d is greater than super block count %d", idx, superBlockCnt);
            long blockOffset = indexFile.getLong(superBlocksOffset + idx * 8);
            return new OnDiskSuperBlock(indexFile.duplicate().position(blockOffset));
        }
    }

    protected class OnDiskSuperBlock
    {
        private final TokenTree tokenTree;

        public OnDiskSuperBlock(MappedBuffer buffer)
        {
            tokenTree = new TokenTree(descriptor, buffer);
        }

        public RangeIterator<Long, Token> iterator()
        {
            return tokenTree.iterator(keyFetcher);
        }
    }

    protected abstract class Level<T extends OnDiskBlock>
    {
        protected final long blockOffsets;
        protected final int blockCount;

        public Level(long offsets, int count)
        {
            this.blockOffsets = offsets;
            this.blockCount = count;
        }

        public T getBlock(int idx) throws FSReadError
        {
            assert idx >= 0 && idx < blockCount;

            // calculate the block offset and move there
            // (long is intentional, we just need an mmap implementation which supports long positions)
            long blockOffset = indexFile.getLong(blockOffsets + idx * 8);
            return cast(indexFile.duplicate().position(blockOffset));
        }

        protected abstract T cast(MappedBuffer block);
    }

    protected class DataBlock extends OnDiskBlock<DataTerm>
    {
        public DataBlock(MappedBuffer data)
        {
            super(descriptor, data, BlockType.DATA);
        }

        protected DataTerm cast(MappedBuffer data)
        {
            return new DataTerm(data, termSize, getBlockIndex());
        }

        public RangeIterator<Long, Token> getRange(int start, int end)
        {
            RangeUnionIterator.Builder<Long, Token> builder = RangeUnionIterator.builder();
            NavigableMap<Long, Token> sparse = new TreeMap<>();

            for (int i = start; i < end; i++)
            {
                DataTerm term = getTerm(i);

                if (term.isSparse())
                {
                    NavigableMap<Long, Token> tokens = term.getSparseTokens();
                    for (Map.Entry<Long, Token> t : tokens.entrySet())
                    {
                        Token token = sparse.get(t.getKey());
                        if (token == null)
                            sparse.put(t.getKey(), t.getValue());
                        else
                            token.merge(t.getValue());
                    }
                }
                else
                {
                    builder.add(term.getTokens());
                }
            }

            PrefetchedTokensIterator prefetched = sparse.isEmpty() ? null : new PrefetchedTokensIterator(sparse);

            if (builder.rangeCount() == 0)
                return prefetched;

            builder.add(prefetched);
            return builder.build();
        }
    }

    protected class PointerBlock extends OnDiskBlock<PointerTerm>
    {
        public PointerBlock(MappedBuffer block)
        {
            super(descriptor, block, BlockType.POINTER);
        }

        protected PointerTerm cast(MappedBuffer data)
        {
            return new PointerTerm(data, termSize, hasMarkedPartials);
        }
    }

    public class DataTerm extends Term implements Comparable<DataTerm>
    {
        private final TokenTree perBlockIndex;

        protected DataTerm(MappedBuffer content, OnDiskIndexBuilder.TermSize size, TokenTree perBlockIndex)
        {
            super(content, size, hasMarkedPartials);
            this.perBlockIndex = perBlockIndex;
        }

        public RangeIterator<Long, Token> getTokens()
        {
            final long blockEnd = FBUtilities.align(content.position(), OnDiskIndexBuilder.BLOCK_SIZE);

            if (isSparse())
                return new PrefetchedTokensIterator(getSparseTokens());

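            // a non-sparse term stores a 4-byte value (right after the sparse-count
            // byte) which locates, past the end of the current block, the shared
            // TokenTree holding this term's tokens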
            long offset = blockEnd + 4 + content.getInt(getDataOffset() + 1);
            return new TokenTree(descriptor, indexFile.duplicate().position(offset)).iterator(keyFetcher);
        }

        public boolean isSparse()
        {
            return content.get(getDataOffset()) > 0;
        }

        public NavigableMap<Long, Token> getSparseTokens()
        {
            long ptrOffset = getDataOffset();

            byte size = content.get(ptrOffset);

            assert size > 0;
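            // (layout) a sparse entry is a 1-byte count followed by `size` 8-byte
            // token values, each looked up in the per-block TokenTree below,
            // hence the ptrOffset + 1 + (8 * i) stride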

            NavigableMap<Long, Token> individualTokens = new TreeMap<>();
            for (int i = 0; i < size; i++)
            {
                Token token = perBlockIndex.get(content.getLong(ptrOffset + 1 + (8 * i)), keyFetcher);

                assert token != null;
                individualTokens.put(token.get(), token);
            }

            return individualTokens;
        }

        public int compareTo(DataTerm other)
        {
            return other == null ? 1 : compareTo(comparator, other.getTerm());
        }
    }

    protected static class PointerTerm extends Term
    {
        public PointerTerm(MappedBuffer content, OnDiskIndexBuilder.TermSize size, boolean hasMarkedPartials)
        {
            super(content, size, hasMarkedPartials);
        }

        public int getBlock()
        {
            return content.getInt(getDataOffset());
        }
    }

    private static class PrefetchedTokensIterator extends RangeIterator<Long, Token>
    {
        private final NavigableMap<Long, Token> tokens;
        private PeekingIterator<Token> currentIterator;

        public PrefetchedTokensIterator(NavigableMap<Long, Token> tokens)
        {
            super(tokens.firstKey(), tokens.lastKey(), tokens.size());
            this.tokens = tokens;
            this.currentIterator = Iterators.peekingIterator(tokens.values().iterator());
        }

        protected Token computeNext()
        {
            return currentIterator != null && currentIterator.hasNext()
                    ? currentIterator.next()
                    : endOfData();
        }

        protected void performSkipTo(Long nextToken)
        {
            currentIterator = Iterators.peekingIterator(tokens.tailMap(nextToken, true).values().iterator());
        }

        public void close() throws IOException
        {
            endOfData();
        }
    }

    public AbstractType<?> getComparator()
    {
        return comparator;
    }

    public String getIndexPath()
    {
        return indexPath;
    }

    private class TermIterator extends AbstractIterator<DataTerm>
    {
        private final Expression e;
        private final IteratorOrder order;

        protected OnDiskBlock<DataTerm> currentBlock;
        protected int blockIndex, offset;

        private boolean checkLower = true, checkUpper = true;

        public TermIterator(int startBlock, Expression expression, IteratorOrder order)
        {
            this.e = expression;
            this.order = order;
            this.blockIndex = startBlock;

            nextBlock();
        }

        protected DataTerm computeNext()
        {
            for (;;)
            {
                if (currentBlock == null)
                    return endOfData();

                if (offset >= 0 && offset < currentBlock.termCount())
                {
                    DataTerm currentTerm = currentBlock.getTerm(nextOffset());

                    // in PREFIX mode we need to step over all of the partial terms
                    // encountered by the query until the upper bound tells us to stop
                    if (e.getOp() == Op.PREFIX && currentTerm.isPartial())
                        continue;

                    // haven't reached the start of the query range yet, so
                    // keep skipping terms until the lower bound is satisfied
                    if (checkLower && !e.isLowerSatisfiedBy(currentTerm))
                        continue;

                    // flip the flag right on the first bounds match
                    // to avoid expensive comparisons
                    checkLower = false;

                    if (checkUpper && !e.isUpperSatisfiedBy(currentTerm))
                        return endOfData();

                    return currentTerm;
                }

                nextBlock();
            }
        }

        protected void nextBlock()
        {
            currentBlock = null;

            if (blockIndex < 0 || blockIndex >= dataLevel.blockCount)
                return;

            currentBlock = dataLevel.getBlock(nextBlockIndex());
            offset = checkLower ? order.startAt(currentBlock, e) : currentBlock.minOffset(order);

            // check the last term of the new block right away: if it satisfies the
            // expression's upper bound, we can avoid doing any expensive upper
            // bound checks for the rest of that block.
            checkUpper = e.hasUpper() && !e.isUpperSatisfiedBy(currentBlock.getTerm(currentBlock.maxOffset(order)));
        }

        protected int nextBlockIndex()
        {
            int current = blockIndex;
            blockIndex += order.step;
            return current;
        }

        protected int nextOffset()
        {
            int current = offset;
            offset += order.step;
            return current;
        }
    }
}
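
/*
 * Illustrative usage sketch: the file name, key-fetcher and token consumer
 * below are hypothetical, and Expression construction is elided.
 *
 *   Function<Long, DecoratedKey> keyFetcher = offset -> readKeyAt(offset);
 *   OnDiskIndex index = new OnDiskIndex(new File("SI_age_idx.db"),
 *                                       Int32Type.instance, keyFetcher);
 *
 *   Expression expression = ...; // an equality or range predicate on the column
 *   RangeIterator<Long, Token> results = index.search(expression); // may be null
 *   while (results != null && results.hasNext())
 *       consume(results.next());
 *
 *   FileUtils.closeQuietly(results);
 *   FileUtils.closeQuietly(index);
 */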