org.apache.cassandra.io.tries.Walker Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of cassandra-all Show documentation
The Apache Cassandra Project develops a highly scalable second-generation distributed database, bringing together Dynamo's fully distributed design and Bigtable's ColumnFamily-based data model.
There is a newer version: 5.0.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.cassandra.io.tries;

import java.io.IOException;
import java.io.PrintStream;
import java.nio.ByteBuffer;
import java.util.Arrays;
import javax.annotation.concurrent.NotThreadSafe;

import org.apache.cassandra.io.sstable.format.Version;
import org.apache.cassandra.io.util.PageAware;
import org.apache.cassandra.io.util.Rebufferer;
import org.apache.cassandra.io.util.Rebufferer.BufferHolder;
import org.apache.cassandra.utils.bytecomparable.ByteComparable;
import org.apache.cassandra.utils.bytecomparable.ByteSource;
import org.apache.lucene.util.ArrayUtil;

/**
 * Thread-unsafe trie walking helper. This is analogous to {@link org.apache.cassandra.io.util.RandomAccessReader} for
 * tries -- takes an on-disk trie accessible via a supplied Rebufferer and lets user seek to nodes and work with them.
 * 
 * Assumes data was written using page-aware builder and thus no node crosses a page and thus a buffer boundary.
 * 

 * See {@code org/apache/cassandra/io/sstable/format/bti/BtiFormat.md} for a description of the mechanisms of writing
 * and reading an on-disk trie.
 */
@NotThreadSafe
public class Walker> implements AutoCloseable
{
    /** Value used to indicate a branch (e.g. lesser/greaterBranch) does not exist. */
    public static int NONE = TrieNode.NONE;

    private final Rebufferer source;
    protected final long root;

    // State relating to current node.
    private BufferHolder bh;    // from Rebufferer
    private int offset;         // offset of current node within buf
    protected TrieNode nodeType;  // type of current node
    protected ByteBuffer buf;   // buffer containing the data
    protected long position;    // file position of current node

    // State relating to searches.
    protected long greaterBranch;
    protected long lesserBranch;

    // Version of the byte comparable conversion to use
    public static final ByteComparable.Version BYTE_COMPARABLE_VERSION = ByteComparable.Version.OSS50;

    /**
     * Creates a walker. Rebufferer must be aligned and with a buffer size that is at least 4k.
     */
    public Walker(Rebufferer source, long root)
    {
        this.source = source;
        this.root = root;
        try
        {
            bh = source.rebuffer(root);
            buf = bh.buffer();
        }
        catch (RuntimeException ex)
        {
            if (bh != null) bh.release();
            source.closeReader();
            throw ex;
        }
    }

    public void close()
    {
        bh.release();
        source.closeReader();
    }

    protected final void go(long position)
    {
        long curOffset = position - bh.offset();
        if (curOffset < 0 || curOffset >= buf.limit())
        {
            bh.release();
            bh = Rebufferer.EMPTY; // prevents double release if the call below fails
            bh = source.rebuffer(position);
            buf = bh.buffer();
            curOffset = position - bh.offset();
            assert curOffset >= 0 && curOffset < buf.limit() : String.format("Invalid offset: %d, buf: %s, bh: %s", curOffset, buf, bh);
        }
        this.offset = (int) curOffset;
        this.position = position;
        nodeType = TrieNode.at(buf, (int) curOffset);
    }

    protected final int payloadFlags()
    {
        return nodeType.payloadFlags(buf, offset);
    }

    protected final boolean hasPayload()
    {
        return payloadFlags() != 0;
    }

    protected final int payloadPosition()
    {
        return nodeType.payloadPosition(buf, offset);
    }

    protected final int search(int transitionByte)
    {
        return nodeType.search(buf, offset, transitionByte);
    }

    protected final long transition(int childIndex)
    {
        return nodeType.transition(buf, offset, position, childIndex);
    }

    protected final long lastTransition()
    {
        return nodeType.lastTransition(buf, offset, position);
    }

    protected final long greaterTransition(int searchIndex, long defaultValue)
    {
        return nodeType.greaterTransition(buf, offset, position, searchIndex, defaultValue);
    }

    protected final long lesserTransition(int searchIndex, long defaultValue)
    {
        return nodeType.lesserTransition(buf, offset, position, searchIndex, defaultValue);
    }

    protected final int transitionByte(int childIndex)
    {
        return nodeType.transitionByte(buf, offset, childIndex);
    }

    protected final int transitionRange()
    {
        return nodeType.transitionRange(buf, offset);
    }

    protected final boolean hasChildren()
    {
        return transitionRange() > 0;
    }

    protected final void goMax(long pos)
    {
        go(pos);
        while (true)
        {
            long lastChild = lastTransition();
            if (lastChild == NONE)
                return;
            go(lastChild);
        }
    }

    protected final void goMin(long pos)
    {
        go(pos);
        while (true)
        {
            int payloadBits = payloadFlags();
            if (payloadBits > 0)
                return;

            long firstChild = transition(0);
            if (firstChild == NONE)
                return;
            go(firstChild);
        }
    }

    public interface Extractor
    {
        RESULT extract(VALUE walker, int payloadPosition, int payloadFlags) throws IOException;
    }

    /**
     * Follows the given key while there are transitions in the trie for it.
     *
     * @return the first unmatched byte of the key, may be {@link ByteSource#END_OF_STREAM}
     */
    public int follow(ByteComparable key)
    {
        ByteSource stream = key.asComparableBytes(BYTE_COMPARABLE_VERSION);
        go(root);
        while (true)
        {
            int b = stream.next();
            int childIndex = search(b);

            if (childIndex < 0)
                return b;

            go(transition(childIndex));
        }
    }

    /**
     * Follows the trie for a given key, remembering the closest greater branch.
     * On return the walker is positioned at the longest prefix that matches the input (with or without payload), and
     * min(greaterBranch) is the immediate greater neighbour.
     *
     * @return the first unmatched byte of the key, may be {@link ByteSource#END_OF_STREAM}
     */
    public int followWithGreater(ByteComparable key)
    {
        greaterBranch = NONE;

        ByteSource stream = key.asComparableBytes(BYTE_COMPARABLE_VERSION);
        go(root);
        while (true)
        {
            int b = stream.next();
            int searchIndex = search(b);

            greaterBranch = greaterTransition(searchIndex, greaterBranch);
            if (searchIndex < 0)
                return b;

            go(transition(searchIndex));
        }
    }

    /**
     * Follows the trie for a given key, remembering the closest lesser branch.
     * On return the walker is positioned at the longest prefix that matches the input (with or without payload), and
     * max(lesserBranch) is the immediate lesser neighbour.
     *
     * @return the first unmatched byte of the key, may be {@link ByteSource#END_OF_STREAM}
     */
    public int followWithLesser(ByteComparable key)
    {
        lesserBranch = NONE;

        ByteSource stream = key.asComparableBytes(BYTE_COMPARABLE_VERSION);
        go(root);
        while (true)
        {
            int b = stream.next();
            int searchIndex = search(b);

            lesserBranch = lesserTransition(searchIndex, lesserBranch);

            if (searchIndex < 0)
                return b;

            go(transition(searchIndex));
        }
    }


    /**
     * Takes a prefix of the given key. The prefix is in the sense of a separator key match, i.e. it is only
     * understood as valid if there are no greater entries in the trie (e.g. data at 'a' is ignored if 'ab' or 'abba'
     * is in the trie when looking for 'abc' or 'ac', but accepted when looking for 'aa').
     * In order to not have to go back to data that may have exited cache, payloads are extracted when the node is
     * visited (instead of saving the node's position), which requires an extractor to be passed as parameter.
     * @throws IOException 
     */
    @SuppressWarnings("unchecked")
    public  RESULT prefix(ByteComparable key, Extractor extractor) throws IOException
    {
        RESULT payload = null;

        ByteSource stream = key.asComparableBytes(BYTE_COMPARABLE_VERSION);
        go(root);
        while (true)
        {
            int b = stream.next();
            int childIndex = search(b);

            if (childIndex > 0)
                payload = null;
            else
            {
                int payloadBits = payloadFlags();
                if (payloadBits > 0)
                    payload = extractor.extract((CONCRETE) this, payloadPosition(), payloadBits);
                if (childIndex < 0)
                    return payload;
            }

            go(transition(childIndex));
        }
    }

    /**
     * Follows the trie for a given key, taking a prefix (in the sense above) and searching for neighboring values.
     * On return min(greaterBranch) and max(lesserBranch) are the immediate non-prefix neighbours for the sought value.
     * 
     * Note: in a separator trie the closest smaller neighbour can be another prefix of the given key. This method
     * does not take that into account. E.g. if trie contains "abba", "as" and "ask", looking for "asking" will find
     * "ask" as the match, but max(lesserBranch) will point to "abba" instead of the correct "as". This problem can
     * only occur if there is a valid prefix match.
     * @throws IOException 
     */
    @SuppressWarnings("unchecked")
    public  RESULT prefixAndNeighbours(ByteComparable key, Extractor extractor) throws IOException
    {
        RESULT payload = null;
        greaterBranch = NONE;
        lesserBranch = NONE;

        ByteSource stream = key.asComparableBytes(BYTE_COMPARABLE_VERSION);
        go(root);
        while (true)
        {
            int b = stream.next();
            int searchIndex = search(b);

            greaterBranch = greaterTransition(searchIndex, greaterBranch);

            if (searchIndex == -1 || searchIndex == 0)
            {
                int payloadBits = payloadFlags();
                if (payloadBits > 0)
                    payload = extractor.extract((CONCRETE) this, payloadPosition(), payloadBits);
            }
            else
            {
                lesserBranch = lesserTransition(searchIndex, lesserBranch);
                payload = null;
            }

            if (searchIndex < 0)
                return payload;

            go(transition(searchIndex));
        }
    }

    public ByteComparable getMaxTerm()
    {
        TransitionBytesCollector collector = new TransitionBytesCollector();
        go(root);
        while (true)
        {
            int lastIdx = transitionRange() - 1;
            long lastChild = transition(lastIdx);
            if (lastIdx < 0)
            {
                return collector.toByteComparable();
            }
            collector.add(transitionByte(lastIdx));
            go(lastChild);
        }
    }

    public ByteComparable getMinTerm()
    {
        TransitionBytesCollector collector = new TransitionBytesCollector();
        go(root);
        while (true)
        {
            if (hasPayload())
            {
                return collector.toByteComparable();
            }
            collector.add(transitionByte(0));
            go(transition(0));
        }
    }

    /**
     * To be used only in analysis.
     */
    protected int nodeTypeOrdinal()
    {
        return nodeType.ordinal;
    }

    /**
     * To be used only in analysis.
     */
    protected int nodeSize()
    {
        return payloadPosition() - offset;
    }

    public interface PayloadToString
    {
        String payloadAsString(ByteBuffer buf, int payloadPos, int payloadFlags, Version version) throws IOException;
    }

    public void dumpTrie(PrintStream out, PayloadToString payloadReader, Version version) throws IOException
    {
        out.print("ROOT");
        dumpTrie(out, payloadReader, root, "", version);
    }

    private void dumpTrie(PrintStream out, PayloadToString payloadReader, long node, String indent, Version version) throws IOException
    {
        go(node);
        int bits = payloadFlags();
        out.format(" %s@%x %s%n", nodeType.toString(), node, bits == 0 ? "" : payloadReader.payloadAsString(buf, payloadPosition(), bits, version));
        int range = transitionRange();
        for (int i = 0; i < range; ++i)
        {
            long child = transition(i);
            if (child == NONE)
                continue;
            out.format("%s%02x %s>", indent, transitionByte(i), PageAware.pageStart(position) == PageAware.pageStart(child) ? "--" : "==");
            dumpTrie(out, payloadReader, child, indent + "  ", version);
            go(node);
        }
    }

    @Override
    public String toString()
    {
        return String.format("[Trie Walker - NodeType: %s, source: %s, buffer: %s, buffer file offset: %d, Node buffer offset: %d, Node file position: %d]",
                             nodeType, source, buf, bh.offset(), offset, position);
    }

    public static class TransitionBytesCollector
    {
        protected byte[] bytes = new byte[32];
        protected int pos = 0;

        public void add(int b)
        {
            if (pos == bytes.length)
            {
                bytes = ArrayUtil.grow(bytes, pos + 1);
            }
            bytes[pos++] = (byte) b;
        }

        public void pop()
        {
            assert pos >= 0;
            pos--;
        }

        public ByteComparable toByteComparable()
        {
            if (pos <= 0)
                return null;
            byte[] value = new byte[pos];
            System.arraycopy(bytes, 0, value, 0, pos);
            return v -> ByteSource.fixedLength(value, 0, value.length);
        }

        @Override
        public String toString()
        {
            return String.format("[Bytes %s, pos %d]", Arrays.toString(bytes), pos);
        }
    }
}