All downloads are FREE. Search and download functionality uses the official Maven repository.

org.apache.cassandra.service.reads.repair.AbstractReadRepair Maven / Gradle / Ivy

Go to download

The Apache Cassandra Project develops a highly scalable second-generation distributed database, bringing together Dynamo's fully distributed design and Bigtable's ColumnFamily-based data model.

There is a newer version: 5.0.2
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.cassandra.service.reads.repair;

import java.util.function.Consumer;

import com.codahale.metrics.Meter;
import com.google.common.base.Preconditions;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.cassandra.concurrent.Stage;
import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.db.ColumnFamilyStore;
import org.apache.cassandra.db.ConsistencyLevel;
import org.apache.cassandra.db.Keyspace;
import org.apache.cassandra.db.ReadCommand;
import org.apache.cassandra.db.SinglePartitionReadCommand;
import org.apache.cassandra.db.partitions.PartitionIterator;
import org.apache.cassandra.exceptions.ReadTimeoutException;
import org.apache.cassandra.locator.Endpoints;
import org.apache.cassandra.locator.Replica;
import org.apache.cassandra.locator.ReplicaPlan;
import org.apache.cassandra.metrics.ReadRepairMetrics;
import org.apache.cassandra.net.Message;
import org.apache.cassandra.net.MessagingService;
import org.apache.cassandra.service.StorageProxy;
import org.apache.cassandra.service.reads.DataResolver;
import org.apache.cassandra.service.reads.DigestResolver;
import org.apache.cassandra.service.reads.ReadCallback;
import org.apache.cassandra.tracing.Tracing;

import static java.util.concurrent.TimeUnit.MICROSECONDS;

public abstract class AbstractReadRepair, P extends ReplicaPlan.ForRead>
        implements ReadRepair
{
    protected static final Logger logger = LoggerFactory.getLogger(AbstractReadRepair.class);

    protected final ReadCommand command;
    protected final long queryStartNanoTime;
    protected final ReplicaPlan.Shared replicaPlan;
    protected final ColumnFamilyStore cfs;

    private volatile DigestRepair digestRepair = null;

    private static class DigestRepair, P extends ReplicaPlan.ForRead>
    {
        private final DataResolver dataResolver;
        private final ReadCallback readCallback;
        private final Consumer resultConsumer;

        public DigestRepair(DataResolver dataResolver, ReadCallback readCallback, Consumer resultConsumer)
        {
            this.dataResolver = dataResolver;
            this.readCallback = readCallback;
            this.resultConsumer = resultConsumer;
        }
    }

    public AbstractReadRepair(ReadCommand command,
                              ReplicaPlan.Shared replicaPlan,
                              long queryStartNanoTime)
    {
        this.command = command;
        this.queryStartNanoTime = queryStartNanoTime;
        this.replicaPlan = replicaPlan;
        this.cfs = Keyspace.openAndGetStore(command.metadata());
    }

    protected P replicaPlan()
    {
        return replicaPlan.get();
    }

    void sendReadCommand(Replica to, ReadCallback readCallback, boolean speculative, boolean trackRepairedStatus)
    {
        ReadCommand command = this.command;
        
        if (to.isSelf())
        {
            Stage.READ.maybeExecuteImmediately(new StorageProxy.LocalReadRunnable(command, readCallback, trackRepairedStatus));
            return;
        }

        if (to.isTransient())
        {
            // It's OK to send queries to transient nodes during RR, as we may have contacted them for their data request initially
            // So long as we don't use these to generate repair mutations, we're fine, and this is enforced by requiring
            // ReadOnlyReadRepair for transient keyspaces.
            command = command.copyAsTransientQuery(to);
        }

        if (Tracing.isTracing())
        {
            String type;
            if (speculative) type = to.isFull() ? "speculative full" : "speculative transient";
            else type = to.isFull() ? "full" : "transient";
            Tracing.trace("Enqueuing {} data read to {}", type, to);
        }

        Message message = command.createMessage(trackRepairedStatus && to.isFull());
        MessagingService.instance().sendWithCallback(message, to.endpoint(), readCallback);
    }

    abstract Meter getRepairMeter();

    // digestResolver isn't used here because we resend read requests to all participants
    public void startRepair(DigestResolver digestResolver, Consumer resultConsumer)
    {
        getRepairMeter().mark();

        /*
         * When repaired data tracking is enabled, a digest will be created from data reads from repaired SSTables.
         * The digests from each replica can then be compared on the coordinator to detect any divergence in their
         * repaired datasets. In this context, an SSTable is considered repaired if it is marked repaired or has a 
         * pending repair session which has been committed. In addition to the digest, a set of ids for any pending but 
         * as yet uncommitted repair sessions is recorded and returned to the coordinator. This is to help reduce false 
         * positives caused by compaction lagging which can leave sstables from committed sessions in the pending state
         * for a time.
         */
        boolean trackRepairedStatus = DatabaseDescriptor.getRepairedDataTrackingForPartitionReadsEnabled();

        // Do a full data read to resolve the correct response (and repair node that need be)
        DataResolver resolver = new DataResolver<>(command, replicaPlan, this, queryStartNanoTime, trackRepairedStatus);
        ReadCallback readCallback = new ReadCallback<>(resolver, command, replicaPlan, queryStartNanoTime);

        digestRepair = new DigestRepair<>(resolver, readCallback, resultConsumer);

        // if enabled, request additional info about repaired data from any full replicas
        for (Replica replica : replicaPlan().contacts())
        {
            sendReadCommand(replica, readCallback, false, trackRepairedStatus);
        }

        ReadRepairDiagnostics.startRepair(this, replicaPlan(), digestResolver);
    }

    public void awaitReads() throws ReadTimeoutException
    {
        DigestRepair repair = digestRepair;
        if (repair == null)
            return;

        try
        {
            repair.readCallback.awaitResults();
        }
        catch (ReadTimeoutException e)
        {
            ReadRepairMetrics.timedOut.mark();
            if (logger.isDebugEnabled() )
                logger.debug("Timed out merging read repair responses", e);
            throw e;
        }
        repair.resultConsumer.accept(digestRepair.dataResolver.resolve());
    }

    private boolean shouldSpeculate()
    {
        ConsistencyLevel consistency = replicaPlan().consistencyLevel();
        ConsistencyLevel speculativeCL = consistency.isDatacenterLocal() ? ConsistencyLevel.LOCAL_QUORUM : ConsistencyLevel.QUORUM;
        return  consistency != ConsistencyLevel.EACH_QUORUM
                && consistency.satisfies(speculativeCL, replicaPlan.get().replicationStrategy())
                && cfs.sampleReadLatencyMicros <= command.getTimeout(MICROSECONDS);
    }

    public void maybeSendAdditionalReads()
    {
        Preconditions.checkState(command instanceof SinglePartitionReadCommand,
                                 "maybeSendAdditionalReads can only be called for SinglePartitionReadCommand");
        DigestRepair repair = digestRepair;
        if (repair == null)
            return;

        if (shouldSpeculate() && !repair.readCallback.await(cfs.sampleReadLatencyMicros, MICROSECONDS))
        {
            Replica uncontacted = replicaPlan().firstUncontactedCandidate(replica -> true);
            if (uncontacted == null)
                return;

            replicaPlan.addToContacts(uncontacted);
            sendReadCommand(uncontacted, repair.readCallback, true, false);
            ReadRepairMetrics.speculatedRead.mark();
            ReadRepairDiagnostics.speculatedRead(this, uncontacted.endpoint(), replicaPlan());
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy