All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.cassandra.index.sai.memory.TrieMemoryIndex Maven / Gradle / Ivy

Go to download

The Apache Cassandra Project develops a highly scalable second-generation distributed database, bringing together Dynamo's fully distributed design and Bigtable's ColumnFamily-based data model.

There is a newer version: 5.0.0
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.cassandra.index.sai.memory;

import java.nio.ByteBuffer;
import java.util.Iterator;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.SortedSet;
import java.util.concurrent.atomic.LongAdder;
import java.util.function.Function;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import io.netty.util.concurrent.FastThreadLocal;
import org.apache.cassandra.db.Clustering;
import org.apache.cassandra.db.DecoratedKey;
import org.apache.cassandra.db.PartitionPosition;
import org.apache.cassandra.db.memtable.TrieMemtable;
import org.apache.cassandra.db.tries.InMemoryTrie;
import org.apache.cassandra.db.tries.Trie;
import org.apache.cassandra.dht.AbstractBounds;
import org.apache.cassandra.index.sai.QueryContext;
import org.apache.cassandra.index.sai.StorageAttachedIndex;
import org.apache.cassandra.index.sai.analyzer.AbstractAnalyzer;
import org.apache.cassandra.index.sai.disk.format.IndexDescriptor;
import org.apache.cassandra.index.sai.disk.v1.segment.SegmentMetadata;
import org.apache.cassandra.index.sai.iterators.KeyRangeIterator;
import org.apache.cassandra.index.sai.plan.Expression;
import org.apache.cassandra.index.sai.utils.IndexIdentifier;
import org.apache.cassandra.index.sai.utils.PrimaryKey;
import org.apache.cassandra.index.sai.utils.PrimaryKeys;
import org.apache.cassandra.utils.Pair;
import org.apache.cassandra.utils.bytecomparable.ByteComparable;
import org.apache.cassandra.utils.bytecomparable.ByteSource;
import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse;

/**
 * This is an in-memory index using the {@link InMemoryTrie} to store a {@link ByteComparable}
 * representation of the indexed values. Data is stored on-heap or off-heap and follows the
 * settings of the {@link TrieMemtable} to determine where.
 */
public class TrieMemoryIndex extends MemoryIndex
{
    private static final Logger logger = LoggerFactory.getLogger(TrieMemoryIndex.class);
    private static final int MAX_RECURSIVE_KEY_LENGTH = 128;

    private final InMemoryTrie data;
    private final PrimaryKeysReducer primaryKeysReducer;
    private final boolean isLiteral;

    private ByteBuffer minTerm;
    private ByteBuffer maxTerm;

    public TrieMemoryIndex(StorageAttachedIndex index)
    {
        super(index);
        this.data = new InMemoryTrie<>(TrieMemtable.BUFFER_TYPE);
        this.primaryKeysReducer = new PrimaryKeysReducer();
        // The use of the analyzer is within a synchronized block so can be considered thread-safe
        this.isLiteral = index.termType().isLiteral();
    }

    /**
     * Adds an index value to the in-memory index
     *
     * @param key partition key for the indexed value
     * @param clustering clustering for the indexed value
     * @param value indexed value
     * @return amount of heap allocated by the new value
     */
    @Override
    public synchronized long add(DecoratedKey key, Clustering clustering, ByteBuffer value)
    {
        value = index.termType().asIndexBytes(value);
        final PrimaryKey primaryKey = index.hasClustering() ? index.keyFactory().create(key, clustering)
                                                            : index.keyFactory().create(key);
        final long initialSizeOnHeap = data.sizeOnHeap();
        final long initialSizeOffHeap = data.sizeOffHeap();
        final long reducerHeapSize = primaryKeysReducer.heapAllocations();

        if (index.hasAnalyzer())
        {
            AbstractAnalyzer analyzer = index.analyzer();
            try
            {
                analyzer.reset(value);
                while (analyzer.hasNext())
                {
                    addTerm(primaryKey, analyzer.next());
                }
            }
            finally
            {
                analyzer.end();
            }
        }
        else
        {
            addTerm(primaryKey, value);
        }
        long onHeap = data.sizeOnHeap();
        long offHeap = data.sizeOffHeap();
        long heapAllocations = primaryKeysReducer.heapAllocations();
        return (onHeap - initialSizeOnHeap) + (offHeap - initialSizeOffHeap) + (heapAllocations - reducerHeapSize);
    }

    @Override
    public long update(DecoratedKey key, Clustering clustering, ByteBuffer oldValue, ByteBuffer newValue)
    {
        throw new UnsupportedOperationException();
    }

    /**
     * Search for an expression in the in-memory index within the {@link AbstractBounds} defined
     * by keyRange. This can either be an exact match or a range match.
     * 

* @param expression the {@link Expression} to search for * @param keyRange the {@link AbstractBounds} containing the key range to restrict the search to * @return a {@link KeyRangeIterator} containing the search results */ public KeyRangeIterator search(QueryContext queryContext, Expression expression, AbstractBounds keyRange) { if (logger.isTraceEnabled()) logger.trace("Searching memtable index on expression '{}'...", expression); switch (expression.getIndexOperator()) { case EQ: case CONTAINS_KEY: case CONTAINS_VALUE: return exactMatch(expression, keyRange); case RANGE: return rangeMatch(expression, keyRange); default: throw new IllegalArgumentException("Unsupported expression: " + expression); } } /** * Returns an {@link Iterator} over the entire dataset contained in the trie. This is used * when the index is flushed to disk. * * @return the iterator containing the trie data */ @Override public Iterator> iterator() { Iterator> iterator = data.entrySet().iterator(); return new Iterator<>() { @Override public boolean hasNext() { return iterator.hasNext(); } @Override public Pair next() { Map.Entry entry = iterator.next(); return Pair.create(decode(entry.getKey()), entry.getValue()); } }; } @Override public SegmentMetadata.ComponentMetadataMap writeDirect(IndexDescriptor indexDescriptor, IndexIdentifier indexIdentifier, Function postingTransformer) { throw new UnsupportedOperationException(); } @Override public boolean isEmpty() { return minTerm == null; } @Override public ByteBuffer getMinTerm() { return minTerm; } @Override public ByteBuffer getMaxTerm() { return maxTerm; } private void addTerm(PrimaryKey primaryKey, ByteBuffer term) { if (index.validateMaxTermSize(primaryKey.partitionKey(), term, false)) { setMinMaxTerm(term.duplicate()); final ByteComparable comparableBytes = asComparableBytes(term); try { if (term.limit() <= MAX_RECURSIVE_KEY_LENGTH) { data.putRecursive(comparableBytes, primaryKey, primaryKeysReducer); } else { data.apply(Trie.singleton(comparableBytes, primaryKey), primaryKeysReducer); } } catch (InMemoryTrie.SpaceExhaustedException e) { throw new RuntimeException(e); } } } private void setMinMaxTerm(ByteBuffer term) { assert term != null; minTerm = index.termType().min(term, minTerm); maxTerm = index.termType().max(term, maxTerm); } private ByteComparable asComparableBytes(ByteBuffer input) { return isLiteral ? version -> terminated(ByteSource.of(input, version)) : version -> index.termType().asComparableBytes(input, version); } private ByteComparable decode(ByteComparable term) { return isLiteral ? version -> ByteSourceInverse.unescape(ByteSource.peekable(term.asComparableBytes(version))) : term; } private ByteSource terminated(ByteSource src) { return new ByteSource() { boolean done = false; @Override public int next() { if (done) return END_OF_STREAM; int n = src.next(); if (n != END_OF_STREAM) return n; done = true; return ByteSource.TERMINATOR; } }; } private KeyRangeIterator exactMatch(Expression expression, AbstractBounds keyRange) { ByteComparable comparableMatch = expression.lower() == null ? ByteComparable.EMPTY : asComparableBytes(expression.lower().value.encoded); PrimaryKeys primaryKeys = data.get(comparableMatch); return primaryKeys == null ? KeyRangeIterator.empty() : new FilteringInMemoryKeyRangeIterator(primaryKeys.keys(), keyRange); } private static class Collector { private static final int MINIMUM_QUEUE_SIZE = 128; // Maintain the last queue size used on this index to use for the next range match. // This allows for receiving a stream of wide range queries where the queue size // is larger than we would want to default the size to. // TODO Investigate using a decaying histogram here to avoid the effect of outliers. private static final FastThreadLocal lastQueueSize = new FastThreadLocal<>() { protected Integer initialValue() { return MINIMUM_QUEUE_SIZE; } }; PrimaryKey minimumKey = null; PrimaryKey maximumKey = null; final PriorityQueue mergedKeys = new PriorityQueue<>(lastQueueSize.get()); final AbstractBounds keyRange; public Collector(AbstractBounds keyRange) { this.keyRange = keyRange; } public void processContent(PrimaryKeys keys) { if (keys.isEmpty()) return; SortedSet primaryKeys = keys.keys(); // shortcut to avoid generating iterator if (primaryKeys.size() == 1) { processKey(primaryKeys.first()); return; } // skip entire partition keys if they don't overlap if (!keyRange.right.isMinimum() && primaryKeys.first().partitionKey().compareTo(keyRange.right) > 0 || primaryKeys.last().partitionKey().compareTo(keyRange.left) < 0) return; primaryKeys.forEach(this::processKey); } public void updateLastQueueSize() { lastQueueSize.set(Math.max(MINIMUM_QUEUE_SIZE, mergedKeys.size())); } private void processKey(PrimaryKey key) { if (keyRange.contains(key.partitionKey())) { mergedKeys.add(key); minimumKey = minimumKey == null ? key : key.compareTo(minimumKey) < 0 ? key : minimumKey; maximumKey = maximumKey == null ? key : key.compareTo(maximumKey) > 0 ? key : maximumKey; } } } private KeyRangeIterator rangeMatch(Expression expression, AbstractBounds keyRange) { ByteComparable lowerBound, upperBound; boolean lowerInclusive, upperInclusive; if (expression.lower() != null) { lowerBound = asComparableBytes(expression.lower().value.encoded); lowerInclusive = expression.lower().inclusive; } else { lowerBound = ByteComparable.EMPTY; lowerInclusive = false; } if (expression.upper() != null) { upperBound = asComparableBytes(expression.upper().value.encoded); upperInclusive = expression.upper().inclusive; } else { upperBound = null; upperInclusive = false; } Collector cd = new Collector(keyRange); data.subtrie(lowerBound, lowerInclusive, upperBound, upperInclusive) .values() .forEach(cd::processContent); if (cd.mergedKeys.isEmpty()) { return KeyRangeIterator.empty(); } cd.updateLastQueueSize(); return new InMemoryKeyRangeIterator(cd.minimumKey, cd.maximumKey, cd.mergedKeys); } private static class PrimaryKeysReducer implements InMemoryTrie.UpsertTransformer { private final LongAdder heapAllocations = new LongAdder(); @Override public PrimaryKeys apply(PrimaryKeys existing, PrimaryKey neww) { if (existing == null) { existing = new PrimaryKeys(); heapAllocations.add(existing.unsharedHeapSize()); } heapAllocations.add(existing.add(neww)); return existing; } long heapAllocations() { return heapAllocations.longValue(); } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy