org.apache.cassandra.service.reads.range.ScanAllRangesCommandIterator Maven / Gradle / Ivy

Show more of this group Show more artifacts with this name
Show all versions of cassandra-all Show documentation
The Apache Cassandra Project develops a highly scalable second-generation distributed database, bringing together Dynamo's fully distributed design and Bigtable's ColumnFamily-based data model.
The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.cassandra.service.reads.range;

import java.util.HashSet;
import java.util.Set;

import com.google.common.base.Preconditions;

import org.apache.cassandra.db.ConsistencyLevel;
import org.apache.cassandra.db.EmptyIterators;
import org.apache.cassandra.db.Keyspace;
import org.apache.cassandra.db.PartitionRangeReadCommand;
import org.apache.cassandra.db.ReadCommand;
import org.apache.cassandra.db.partitions.PartitionIterator;
import org.apache.cassandra.index.Index;
import org.apache.cassandra.locator.EndpointsForRange;
import org.apache.cassandra.locator.InetAddressAndPort;
import org.apache.cassandra.locator.ReplicaPlan;
import org.apache.cassandra.locator.ReplicaPlans;
import org.apache.cassandra.net.Message;
import org.apache.cassandra.net.MessagingService;
import org.apache.cassandra.service.reads.DataResolver;
import org.apache.cassandra.service.reads.ReadCallback;
import org.apache.cassandra.service.reads.repair.NoopReadRepair;
import org.apache.cassandra.tracing.Tracing;
import org.apache.cassandra.utils.CloseableIterator;

/**
 * A custom {@link RangeCommandIterator} that queries all replicas required by consistency level at once with data range
 * specify in {@link PartitionRangeReadCommand}.
 * 
 * This is to speed up {@link Index.QueryPlan#isTopK()} queries that needs to find global top-k rows in the cluster, because
 * existing {@link RangeCommandIterator} has to execute a top-k search per vnode range which is wasting resources.
 * 

 * The implementation combines the replica plans for each data range into a single shared replica plan. This results in
 * queries using reconciliation where it may not be expected. This is handled in the {@link DataResolver} for top-K queries
 * so any usage for queries other that top-K should bear this in mind.
 * 
 * It is important to note that this implementation can only be used with {@link ConsistencyLevel#ONE} and {@link ConsistencyLevel#LOCAL_ONE}
 */
public class ScanAllRangesCommandIterator extends RangeCommandIterator
{
    private final Keyspace keyspace;

    ScanAllRangesCommandIterator(Keyspace keyspace, CloseableIterator replicaPlans,
                                 PartitionRangeReadCommand command,
                                 int totalRangeCount,
                                 long queryStartNanoTime)
    {
        super(replicaPlans, command, totalRangeCount, totalRangeCount, totalRangeCount, queryStartNanoTime);
        Preconditions.checkState(command.isTopK());

        this.keyspace = keyspace;
    }

    @Override
    protected PartitionIterator sendNextRequests()
    {
        // get all replicas to contact
        Set replicasToQuery = null;
        ConsistencyLevel consistencyLevel = null;
        while (replicaPlans.hasNext())
        {
            if (replicasToQuery == null)
                replicasToQuery = new HashSet<>();

            ReplicaPlan.ForRangeRead replicaPlan = replicaPlans.next();
            replicasToQuery.addAll(replicaPlan.contacts().endpoints());
            consistencyLevel = replicaPlan.consistencyLevel();
        }

        if (replicasToQuery == null || replicasToQuery.isEmpty())
            return EmptyIterators.partition();

        ReplicaPlan.ForRangeRead plan = ReplicaPlans.forFullRangeRead(keyspace, consistencyLevel, command.dataRange().keyRange(), replicasToQuery, totalRangeCount);
        ReplicaPlan.SharedForRangeRead sharedReplicaPlan = ReplicaPlan.shared(plan);
        DataResolver resolver = new DataResolver<>(command, sharedReplicaPlan, NoopReadRepair.instance, queryStartNanoTime, false);
        ReadCallback handler = new ReadCallback<>(resolver, command, sharedReplicaPlan, queryStartNanoTime);

        int nodes = 0;
        for (InetAddressAndPort endpoint : replicasToQuery)
        {
            Tracing.trace("Enqueuing request to {}", endpoint);
            Message message = command.createMessage(false);
            MessagingService.instance().sendWithCallback(message, endpoint, handler);
            nodes++;
        }

        rangesQueried += plan.vnodeCount();
        batchesRequested++;

        Tracing.trace("Submitted scanning all ranges requests to {} nodes", nodes);

        // skip read-repair for top-k query because data mismatch may be caused by top-k algorithm instead of actual inconsistency.
        return new SingleRangeResponse(resolver, handler, NoopReadRepair.instance);
    }
}