org.apache.cassandra.index.internal.composites.CompositesSearcher Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of cassandra-all Show documentation
The Apache Cassandra Project develops a highly scalable second-generation distributed database, bringing together Dynamo's fully distributed design and Bigtable's ColumnFamily-based data model.
There is a newer version: 5.0.2
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.cassandra.index.internal.composites;

import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.List;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.cassandra.config.CFMetaData;
import org.apache.cassandra.db.*;
import org.apache.cassandra.db.filter.ClusteringIndexNamesFilter;
import org.apache.cassandra.db.filter.DataLimits;
import org.apache.cassandra.db.filter.RowFilter;
import org.apache.cassandra.db.partitions.UnfilteredPartitionIterator;
import org.apache.cassandra.db.rows.*;
import org.apache.cassandra.db.transform.Transformation;
import org.apache.cassandra.index.internal.CassandraIndex;
import org.apache.cassandra.index.internal.CassandraIndexSearcher;
import org.apache.cassandra.index.internal.IndexEntry;
import org.apache.cassandra.utils.btree.BTreeSet;
import org.apache.cassandra.utils.concurrent.OpOrder;


public class CompositesSearcher extends CassandraIndexSearcher
{
    private static final Logger logger = LoggerFactory.getLogger(CompositesSearcher.class);

    public CompositesSearcher(ReadCommand command,
                              RowFilter.Expression expression,
                              CassandraIndex index)
    {
        super(command, expression, index);
    }

    private boolean isMatchingEntry(DecoratedKey partitionKey, IndexEntry entry, ReadCommand command)
    {
        return command.selectsKey(partitionKey) && command.selectsClustering(partitionKey, entry.indexedEntryClustering);
    }

    protected UnfilteredPartitionIterator queryDataFromIndex(final DecoratedKey indexKey,
                                                             final RowIterator indexHits,
                                                             final ReadCommand command,
                                                             final ReadOrderGroup orderGroup)
    {
        assert indexHits.staticRow() == Rows.EMPTY_STATIC_ROW;

        return new UnfilteredPartitionIterator()
        {
            private IndexEntry nextEntry;

            private UnfilteredRowIterator next;

            public boolean isForThrift()
            {
                return command.isForThrift();
            }

            public CFMetaData metadata()
            {
                return command.metadata();
            }

            public boolean hasNext()
            {
                return prepareNext();
            }

            public UnfilteredRowIterator next()
            {
                if (next == null)
                    prepareNext();

                UnfilteredRowIterator toReturn = next;
                next = null;
                return toReturn;
            }

            private boolean prepareNext()
            {
                while (true)
                {
                    if (next != null)
                        return true;

                    if (nextEntry == null)
                    {
                        if (!indexHits.hasNext())
                            return false;

                        nextEntry = index.decodeEntry(indexKey, indexHits.next());
                    }

                    // Gather all index hits belonging to the same partition and query the data for those hits.
                    // TODO: it's much more efficient to do 1 read for all hits to the same partition than doing
                    // 1 read per index hit. However, this basically mean materializing all hits for a partition
                    // in memory so we should consider adding some paging mechanism. However, index hits should
                    // be relatively small so it's much better than the previous code that was materializing all
                    // *data* for a given partition.
                    BTreeSet.Builder clusterings = BTreeSet.builder(index.baseCfs.getComparator());
                    List entries = new ArrayList<>();
                    DecoratedKey partitionKey = index.baseCfs.decorateKey(nextEntry.indexedKey);

                    while (nextEntry != null && partitionKey.getKey().equals(nextEntry.indexedKey))
                    {
                        // We're queried a slice of the index, but some hits may not match some of the clustering column constraints
                        if (isMatchingEntry(partitionKey, nextEntry, command))
                        {
                            clusterings.add(nextEntry.indexedEntryClustering);
                            entries.add(nextEntry);
                        }

                        nextEntry = indexHits.hasNext() ? index.decodeEntry(indexKey, indexHits.next()) : null;
                    }

                    // Because we've eliminated entries that don't match the clustering columns, it's possible we added nothing
                    if (clusterings.isEmpty())
                        continue;

                    // Query the gathered index hits. We still need to filter stale hits from the resulting query.
                    ClusteringIndexNamesFilter filter = new ClusteringIndexNamesFilter(clusterings.build(), false);
                    SinglePartitionReadCommand dataCmd = SinglePartitionReadCommand.create(isForThrift(),
                                                                                           index.baseCfs.metadata,
                                                                                           command.nowInSec(),
                                                                                           command.columnFilter(),
                                                                                           command.rowFilter(),
                                                                                           DataLimits.NONE,
                                                                                           partitionKey,
                                                                                           filter,
                                                                                           null);
                    @SuppressWarnings("resource") // We close right away if empty, and if it's assign to next it will be called either
                    // by the next caller of next, or through closing this iterator is this come before.
                    UnfilteredRowIterator dataIter =
                        filterStaleEntries(dataCmd.queryMemtableAndDisk(index.baseCfs,
                                                                        orderGroup.baseReadOpOrderGroup()),
                                           indexKey.getKey(),
                                           entries,
                                           orderGroup.writeOpOrderGroup(),
                                           command.nowInSec());

                    if (dataIter.isEmpty())
                    {
                        dataIter.close();
                        continue;
                    }

                    next = dataIter;
                    return true;
                }
            }

            public void remove()
            {
                throw new UnsupportedOperationException();
            }

            public void close()
            {
                indexHits.close();
                if (next != null)
                    next.close();
            }
        };
    }

    private void deleteAllEntries(final List entries, final OpOrder.Group writeOp, final int nowInSec)
    {
        entries.forEach(entry ->
            index.deleteStaleEntry(entry.indexValue,
                                     entry.indexClustering,
                                     new DeletionTime(entry.timestamp, nowInSec),
                                     writeOp));
    }

    private UnfilteredRowIterator filterStaleEntries(UnfilteredRowIterator dataIter,
                                                     final ByteBuffer indexValue,
                                                     final List entries,
                                                     final OpOrder.Group writeOp,
                                                     final int nowInSec)
    {
        // collect stale index entries and delete them when we close this iterator
        final List staleEntries = new ArrayList<>();

        // if there is a partition level delete in the base table, we need to filter
        // any index entries which would be shadowed by it
        if (!dataIter.partitionLevelDeletion().isLive())
        {
            DeletionTime deletion = dataIter.partitionLevelDeletion();
            entries.forEach(e -> {
                if (deletion.deletes(e.timestamp))
                    staleEntries.add(e);
            });
        }

        ClusteringComparator comparator = dataIter.metadata().comparator;
        class Transform extends Transformation
        {
            private int entriesIdx;

            @Override
            public Row applyToRow(Row row)
            {
                IndexEntry entry = findEntry(row.clustering());
                if (!index.isStale(row, indexValue, nowInSec))
                    return row;

                staleEntries.add(entry);
                return null;
            }

            private IndexEntry findEntry(Clustering clustering)
            {
                assert entriesIdx < entries.size();
                while (entriesIdx < entries.size())
                {
                    IndexEntry entry = entries.get(entriesIdx++);
                    Clustering indexedEntryClustering = entry.indexedEntryClustering;
                    // The entries are in clustering order. So that the requested entry should be the
                    // next entry, the one at 'entriesIdx'. However, we can have stale entries, entries
                    // that have no corresponding row in the base table typically because of a range
                    // tombstone or partition level deletion. Delete such stale entries.
                    int cmp = comparator.compare(indexedEntryClustering, clustering);
                    assert cmp <= 0; // this would means entries are not in clustering order, which shouldn't happen
                    if (cmp == 0)
                        return entry;
                    else
                    {
                        // COMPACT COMPOSITE tables support null values in there clustering key but
                        // those tables do not support static columns. By consequence if a table
                        // has some static columns and all its clustering key elements are null
                        // it means that the partition exists and contains only static data
                       if (!dataIter.metadata().hasStaticColumns() || !containsOnlyNullValues(indexedEntryClustering))
                           staleEntries.add(entry);
                    }
                }
                // entries correspond to the rows we've queried, so we shouldn't have a row that has no corresponding entry.
                throw new AssertionError();
            }

            private boolean containsOnlyNullValues(Clustering indexedEntryClustering)
            {
                int i = 0;
                for (; i < indexedEntryClustering.size() && indexedEntryClustering.get(i) == null; i++);
                return i == indexedEntryClustering.size();
            }

            @Override
            public void onPartitionClose()
            {
                deleteAllEntries(staleEntries, writeOp, nowInSec);
            }
        }

        return Transformation.apply(dataIter, new Transform());
    }
}