org.apache.cassandra.service.AbstractReadExecutor Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of cassandra-all Show documentation
Show all versions of cassandra-all Show documentation
Palantir open source project
The newest version!
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.cassandra.service;
import java.net.InetAddress;
import java.util.Collection;
import java.util.List;
import java.util.Optional;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.TimeUnit;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.Iterables;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.cassandra.concurrent.KeyspaceAwareSepQueue;
import org.apache.cassandra.concurrent.Stage;
import org.apache.cassandra.concurrent.StageManager;
import org.apache.cassandra.config.CFMetaData.SpeculativeRetry.RetryType;
import org.apache.cassandra.config.Schema;
import org.apache.cassandra.config.ReadRepairDecision;
import org.apache.cassandra.db.ColumnFamilyStore;
import org.apache.cassandra.db.ConsistencyLevel;
import org.apache.cassandra.db.ReadCommand;
import org.apache.cassandra.db.ReadResponse;
import org.apache.cassandra.db.Row;
import org.apache.cassandra.db.Keyspace;
import org.apache.cassandra.exceptions.ReadFailureException;
import org.apache.cassandra.exceptions.ReadTimeoutException;
import org.apache.cassandra.exceptions.UnavailableException;
import org.apache.cassandra.metrics.PredictedSpeculativeRetryPerformanceMetrics;
import org.apache.cassandra.metrics.ReadRepairMetrics;
import org.apache.cassandra.net.MessageOut;
import org.apache.cassandra.net.MessagingService;
import org.apache.cassandra.service.StorageProxy.LocalReadRunnable;
import org.apache.cassandra.tracing.TraceState;
import org.apache.cassandra.tracing.Tracing;
import org.apache.cassandra.utils.FBUtilities;
/**
* Sends a read request to the replicas needed to satisfy a given ConsistencyLevel.
*
* Optionally, may perform additional requests to provide redundancy against replica failure:
* AlwaysSpeculatingReadExecutor will always send a request to one extra replica, while
* SpeculatingReadExecutor will wait until it looks like the original request is in danger
* of timing out before performing extra reads.
*/
public abstract class AbstractReadExecutor
{
private static final Logger logger = LoggerFactory.getLogger(AbstractReadExecutor.class);
/** The read being executed; data requests send it as-is, digest requests send a digest copy of it. */
protected final ReadCommand command;
/** Replicas this executor may contact. */
protected final List<InetAddress> targetReplicas;
/** Resolver handed to the callback; constructed from the keyspace name, the key and the target replica count. */
protected final RowDigestResolver resolver;
/** Callback registered with every request this executor sends; {@code get()} delegates to it. */
protected final ReadCallback<ReadResponse, Row> handler;
/** Tracing state obtained from Tracing.instance at construction time; null-checked before every use. */
protected final TraceState traceState;
/** Column family store of the table being read; source of speculation latency and metrics. */
protected final ColumnFamilyStore cfs;
/**
 * Latencies in nanoseconds. Local-read timings are appended by makeRequests; the queue is also handed to the
 * read callback (presumably so it can record remote latencies — confirm against ReadCallback).
 */
protected final ConcurrentLinkedQueue<Long> latencies;
/**
 * Stores the command, replicas and column family store, and builds the resolver and the read callback that
 * every request sent by this executor reports to.
 *
 * @param command the read to execute
 * @param consistencyLevel consistency level passed to the read callback
 * @param targetReplicas replicas this executor may contact
 * @param cfs column family store of the table being read
 */
AbstractReadExecutor(ReadCommand command, ConsistencyLevel consistencyLevel, List<InetAddress> targetReplicas, ColumnFamilyStore cfs)
{
    this.command = command;
    this.targetReplicas = targetReplicas;
    this.cfs = cfs;
    this.resolver = new RowDigestResolver(command.ksName, command.key, targetReplicas.size());
    this.traceState = Tracing.instance.get();
    this.latencies = new ConcurrentLinkedQueue<>();
    // The callback receives the latency queue as well, so both sides write into the same collection.
    this.handler = new ReadCallback<>(resolver, consistencyLevel, command, targetReplicas, Optional.of(latencies));
}
/**
 * Tells whether a replica address is this node's own broadcast address.
 *
 * @param replica address of the replica to test
 * @return true when {@code replica} equals FBUtilities.getBroadcastAddress()
 */
@VisibleForTesting
boolean isLocalRequest(InetAddress replica)
{
    final boolean targetsThisNode = replica.equals(FBUtilities.getBroadcastAddress());
    return targetsThisNode;
}
/**
 * Sends the executor's command, unmodified, to each of the given endpoints.
 *
 * @param endpoints replicas to request data from
 */
protected void makeDataRequests(Iterable<InetAddress> endpoints)
{
    makeRequests(command, endpoints);
}
/**
 * Sends a copy of the executor's command, flagged as a digest query, to each of the given endpoints.
 *
 * @param endpoints replicas to request digests from
 */
protected void makeDigestRequests(Iterable<InetAddress> endpoints)
{
    makeRequests(command.copy().setIsDigestQuery(true), endpoints);
}
/**
 * Sends {@code readCommand} to each of the given endpoints with {@code handler} as the callback.
 * Remote endpoints are messaged first; if the local node is among the endpoints its read is submitted
 * afterwards, so a potentially blocking local read cannot stall the remote requests.
 *
 * @param readCommand the command sent to remote endpoints (data or digest form)
 * @param endpoints replicas to contact
 */
private void makeRequests(ReadCommand readCommand, Iterable<InetAddress> endpoints)
{
    // The label is the same for every trace/log line of this call, so compute it once.
    String requestKind = readCommand.isDigestQuery() ? "digest" : "data";
    MessageOut<ReadCommand> message = null;
    boolean hasLocalEndpoint = false;
    for (InetAddress endpoint : endpoints)
    {
        if (isLocalRequest(endpoint))
        {
            hasLocalEndpoint = true;
            continue;
        }
        if (traceState != null)
            traceState.trace("reading {} from {}", requestKind, endpoint);
        logger.trace("reading {} from {}", requestKind, endpoint);
        // A single message instance is shared by all remote endpoints; it is created on first use.
        if (message == null)
            message = readCommand.createMessage();
        // Handler adds remote requests latencies to list
        MessagingService.instance().sendRRWithFailure(message, endpoint, handler);
    }
    // We delay the local (potentially blocking) read till the end to avoid stalling remote requests.
    if (hasLocalEndpoint)
    {
        long localStart = System.nanoTime();
        logger.trace("reading {} locally", requestKind);
        KeyspaceAwareSepQueue.setCurrentKeyspace(command.ksName);
        // NOTE(review): the local runnable (and its stage) is built from the executor's `command`, not from
        // `readCommand`, so a digest request that includes the local node hands the original command to the
        // local read — confirm this is intended.
        StageManager.getStage(stage(command)).maybeExecuteImmediately(new LocalReadRunnable(command, handler));
        // Records the time spent inside maybeExecuteImmediately as the local latency sample.
        latencies.add(System.nanoTime() - localStart);
    }
    logger.trace("measured read latencies {} ns", latencies);
}
/**
 * Chooses the stage a local read runs on.
 *
 * @param command the read about to be executed locally
 * @return {@code Stage.READ_CHEAP} when the command reports itself as cheap, {@code Stage.READ} otherwise
 */
private static Stage stage(ReadCommand command)
{
    return command.isCheap() ? Stage.READ_CHEAP : Stage.READ;
}
/**
 * Perform additional requests if it looks like the original will time out. May block while it waits
 * to see if the original requests are answered first.
 *
 * Of the executors in this file, NeverSpeculatingReadExecutor and AlwaysSpeculatingReadExecutor implement
 * this as a no-op; SpeculatingReadExecutor waits on the read callback for up to the column family's
 * sampled latency and, if that wait is not satisfied, sends one more request to the last target replica.
 */
public abstract void maybeTryAdditionalReplicas();
/**
 * Get the replicas involved in the [finished] request.
 *
 * @return target replicas + the extra replica, *IF* we speculated.
 */
public abstract Collection<InetAddress> getContactedReplicas();
/**
 * Send the initial set of requests.
 *
 * The executors in this file issue data requests to the first one or two replicas they start with and
 * digest requests to the remaining ones. The result is obtained afterwards through get().
 */
public abstract void executeAsync();
/**
 * Returns the resolved row by delegating to the read callback. The call blocks until success or
 * timeout, so callers wanting speculation must invoke maybeTryAdditionalReplicas beforehand.
 *
 * @return the row produced by the read callback
 * @throws ReadFailureException propagated from the read callback
 * @throws ReadTimeoutException propagated from the read callback
 * @throws DigestMismatchException propagated from the read callback
 */
public Row get() throws ReadFailureException, ReadTimeoutException, DigestMismatchException
{
    final Row resolved = handler.get();
    return resolved;
}
/**
 * Hands every metrics tracker returned by getPredSpecRetryMetrics() the column family store, the
 * latencies recorded by this executor and the last target replica, letting each tracker decide
 * whether to write.
 *
 * NOTE(review): the previous description said each tracker compares the observed latency with its
 * threshold and, when exceeded, adds the retry endpoint's p99 latency as the "predicted performance" —
 * confirm against PredictedSpeculativeRetryPerformanceMetrics.maybeWriteMetrics.
 */
public void writePredictedSpeculativeRetryPerformanceMetrics()
{
    // The last target replica is looked up before the trackers are fetched, as in the original ordering.
    final InetAddress lastTarget = Iterables.getLast(targetReplicas);
    final List<PredictedSpeculativeRetryPerformanceMetrics> trackers = getPredSpecRetryMetrics();
    for (PredictedSpeculativeRetryPerformanceMetrics tracker : trackers)
    {
        tracker.maybeWriteMetrics(cfs, this.latencies, lastTarget);
    }
}
/**
 * @return the metrics trackers that writePredictedSpeculativeRetryPerformanceMetrics should update for
 * this kind of executor
 */
protected abstract List<PredictedSpeculativeRetryPerformanceMetrics> getPredSpecRetryMetrics();
/**
 * Builds the executor matching the table's speculative retry setting and the replicas currently available.
 *
 * @param command the read to execute
 * @param consistencyLevel consistency level the read must satisfy
 * @return an executor appropriate for the configured speculative read policy
 * @throws UnavailableException if there are not enough live replicas for the consistency level
 */
public static AbstractReadExecutor getReadExecutor(ReadCommand command, ConsistencyLevel consistencyLevel) throws UnavailableException
{
    Keyspace keyspace = Keyspace.open(command.ksName);
    List<InetAddress> allReplicas = StorageProxy.getLiveSortedEndpoints(keyspace, command.key);
    ReadRepairDecision repairDecision = Schema.instance.getCFMetaData(command.ksName, command.cfName).newReadRepairDecision();
    List<InetAddress> targetReplicas = consistencyLevel.filterForQuery(keyspace, allReplicas, repairDecision);
    // Throw UAE early if we don't have enough replicas.
    consistencyLevel.assureSufficientLiveNodes(keyspace, targetReplicas);
    // Looked up once from the already-open keyspace and reused for both metrics and executor construction.
    ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(command.cfName);
    if (repairDecision != ReadRepairDecision.NONE)
    {
        Tracing.trace("Read-repair {}", repairDecision);
        ReadRepairMetrics.attempted.mark();
        cfs.metric.attemptedReadRepairs.mark();
    }
    RetryType retryType = cfs.metadata.getSpeculativeRetry().type;
    // Speculative retry is disabled *OR* there are simply no extra replicas to speculate.
    if (retryType == RetryType.NONE || consistencyLevel.blockFor(keyspace) == allReplicas.size())
        return new NeverSpeculatingReadExecutor(command, consistencyLevel, targetReplicas, cfs);
    if (targetReplicas.size() == allReplicas.size())
    {
        // CL.ALL, RRD.GLOBAL or RRD.DC_LOCAL and a single-DC.
        // We are going to contact every node anyway, so ask for 2 full data requests instead of 1, for redundancy
        // (same amount of requests in total, but we turn 1 digest request into a full blown data request).
        return new AlwaysSpeculatingReadExecutor(cfs, command, consistencyLevel, targetReplicas);
    }
    // RRD.NONE or RRD.DC_LOCAL w/ multiple DCs.
    // targetReplicas is strictly smaller than allReplicas here, so this index is in range.
    InetAddress extraReplica = allReplicas.get(targetReplicas.size());
    // With repair decision DC_LOCAL all replicas/target replicas may be in different order, so
    // we might have to find a replacement that's not already in targetReplicas.
    if (repairDecision == ReadRepairDecision.DC_LOCAL && targetReplicas.contains(extraReplica))
    {
        for (InetAddress address : allReplicas)
        {
            if (!targetReplicas.contains(address))
            {
                extraReplica = address;
                break;
            }
        }
    }
    // The extra replica goes last; the speculating executors rely on that position.
    targetReplicas.add(extraReplica);
    if (retryType == RetryType.ALWAYS)
        return new AlwaysSpeculatingReadExecutor(cfs, command, consistencyLevel, targetReplicas);
    else // PERCENTILE or CUSTOM.
        return new SpeculatingReadExecutor(cfs, command, consistencyLevel, targetReplicas);
}
@VisibleForTesting
static class NeverSpeculatingReadExecutor extends AbstractReadExecutor
{
protected static final List specRetryPerformanceMetrics =
PredictedSpeculativeRetryPerformanceMetrics.createMetricsByThresholds(NeverSpeculatingReadExecutor.class);
public NeverSpeculatingReadExecutor(ReadCommand command, ConsistencyLevel consistencyLevel, List targetReplicas, ColumnFamilyStore cfs)
{
super(command, consistencyLevel, targetReplicas, cfs);
}
public void executeAsync()
{
makeDataRequests(targetReplicas.subList(0, 1));
if (targetReplicas.size() > 1)
makeDigestRequests(targetReplicas.subList(1, targetReplicas.size()));
}
public void maybeTryAdditionalReplicas()
{
// no-op
}
public Collection getContactedReplicas()
{
return targetReplicas;
}
protected List getPredSpecRetryMetrics() {
return specRetryPerformanceMetrics;
}
}
@VisibleForTesting
static class SpeculatingReadExecutor extends AbstractReadExecutor
{
protected static final List specRetryPerformanceMetrics =
PredictedSpeculativeRetryPerformanceMetrics.createMetricsByThresholds(SpeculatingReadExecutor.class);
private volatile boolean speculated = false;
public SpeculatingReadExecutor(ColumnFamilyStore cfs,
ReadCommand command,
ConsistencyLevel consistencyLevel,
List targetReplicas)
{
super(command, consistencyLevel, targetReplicas, cfs);
}
public void executeAsync()
{
// if CL + RR result in covering all replicas, getReadExecutor forces AlwaysSpeculating. So we know
// that the last replica in our list is "extra."
List initialReplicas = targetReplicas.subList(0, targetReplicas.size() - 1);
if (handler.blockfor < initialReplicas.size())
{
// We're hitting additional targets for read repair. Since our "extra" replica is the least-
// preferred by the snitch, we do an extra data read to start with against a replica more
// likely to reply; better to let RR fail than the entire query.
makeDataRequests(initialReplicas.subList(0, 2));
if (initialReplicas.size() > 2)
makeDigestRequests(initialReplicas.subList(2, initialReplicas.size()));
}
else
{
// not doing read repair; all replies are important, so it doesn't matter which nodes we
// perform data reads against vs digest.
makeDataRequests(initialReplicas.subList(0, 1));
if (initialReplicas.size() > 1)
makeDigestRequests(initialReplicas.subList(1, initialReplicas.size()));
}
}
public void maybeTryAdditionalReplicas()
{
// no latency information, or we're overloaded
if (cfs.sampleLatencyNanos > TimeUnit.MILLISECONDS.toNanos(command.getTimeout()))
return;
if (!handler.await(cfs.sampleLatencyNanos, TimeUnit.NANOSECONDS))
{
// Could be waiting on the data, or on enough digests.
ReadCommand retryCommand = command;
if (resolver.getData() != null)
retryCommand = command.copy().setIsDigestQuery(true);
InetAddress extraReplica = Iterables.getLast(targetReplicas);
if (traceState != null)
traceState.trace("speculating read retry on {}", extraReplica);
logger.trace("speculating read retry on {}", extraReplica);
MessagingService.instance().sendRRWithFailure(retryCommand.createMessage(), extraReplica, handler);
speculated = true;
cfs.metric.speculativeRetries.inc();
}
}
public Collection getContactedReplicas()
{
return speculated
? targetReplicas
: targetReplicas.subList(0, targetReplicas.size() - 1);
}
protected List getPredSpecRetryMetrics() {
return specRetryPerformanceMetrics;
}
}
/**
 * Executor that contacts every target replica up front, sending data requests to the first two (or the
 * only one) and digest requests to the rest, and counts a speculative retry on every execution.
 */
@VisibleForTesting
static class AlwaysSpeculatingReadExecutor extends AbstractReadExecutor
{
    protected static final List<PredictedSpeculativeRetryPerformanceMetrics> specRetryPerformanceMetrics =
        PredictedSpeculativeRetryPerformanceMetrics.createMetricsByThresholds(AlwaysSpeculatingReadExecutor.class);

    public AlwaysSpeculatingReadExecutor(ColumnFamilyStore cfs,
                                         ReadCommand command,
                                         ConsistencyLevel consistencyLevel,
                                         List<InetAddress> targetReplicas)
    {
        super(command, consistencyLevel, targetReplicas, cfs);
    }

    @Override
    public void maybeTryAdditionalReplicas()
    {
        // no-op
    }

    /** @return all target replicas; this executor contacts every one of them in executeAsync */
    @Override
    public Collection<InetAddress> getContactedReplicas()
    {
        return targetReplicas;
    }

    /** Sends data requests to the first two target replicas (one if there is only one), digests to the rest. */
    @Override
    public void executeAsync()
    {
        makeDataRequests(targetReplicas.subList(0, targetReplicas.size() > 1 ? 2 : 1));
        if (targetReplicas.size() > 2)
            makeDigestRequests(targetReplicas.subList(2, targetReplicas.size()));
        cfs.metric.speculativeRetries.inc();
    }

    @Override
    protected List<PredictedSpeculativeRetryPerformanceMetrics> getPredSpecRetryMetrics()
    {
        return specRetryPerformanceMetrics;
    }
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy