All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.cassandra.service.paxos.ContentionStrategy Maven / Gradle / Ivy

Go to download

The Apache Cassandra Project develops a highly scalable second-generation distributed database, bringing together Dynamo's fully distributed design and Bigtable's ColumnFamily-based data model.

There is a newer version: 5.0.2
Show newest version
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.cassandra.service.paxos;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableMap;

import com.codahale.metrics.Snapshot;
import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.db.ConsistencyLevel;
import org.apache.cassandra.db.DecoratedKey;
import org.apache.cassandra.schema.TableMetadata;
import org.apache.cassandra.tracing.Tracing;
import org.apache.cassandra.utils.ByteBufferUtil;
import org.apache.cassandra.utils.NoSpamLogger;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.DoubleSupplier;
import java.util.function.LongBinaryOperator;
import java.util.function.Supplier;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static java.lang.Double.parseDouble;
import static java.lang.Integer.parseInt;
import static java.lang.Math.*;
import static java.util.Arrays.stream;
import static java.util.concurrent.TimeUnit.*;
import static org.apache.cassandra.config.DatabaseDescriptor.*;
import static org.apache.cassandra.metrics.ClientRequestsMetricsHolder.casReadMetrics;
import static org.apache.cassandra.metrics.ClientRequestsMetricsHolder.casWriteMetrics;
import static org.apache.cassandra.utils.Clock.Global.nanoTime;
import static org.apache.cassandra.utils.Clock.waitUntil;

/**
 * 

A strategy for making back-off decisions for Paxos operations that fail to make progress because of other paxos operations. * The strategy is defined by four factors:

    *
  • {@link #min} *
  • {@link #max} *
  • {@link #minDelta} *
  • {@link #waitRandomizer} *
* *

The first three represent time periods, and may be defined dynamically based on a simple calculation over:

    *
  • {@code pX()} recent experienced latency distribution for successful operations, * e.g. {@code p50(rw)} the maximum of read and write median latencies, * {@code p999(r)} the 99.9th percentile of read latencies *
  • {@code attempts} the number of failed attempts made by the operation so far *
  • {@code constant} a user provided floating point constant *
* *

Their calculation may take any of these forms *

  • constant {@code $constant$[mu]s} *
  • dynamic constant {@code pX() * constant} *
  • dynamic linear {@code pX() * constant * attempts} *
  • dynamic exponential {@code pX() * constant ^ attempts} * *

    Furthermore, the dynamic calculations can be bounded with a min/max, like so: * {@code min[mu]s <= dynamic expr <= max[mu]s} * * e.g. *

  • {@code 10ms <= p50(rw)*0.66} *
  • {@code 10ms <= p95(rw)*1.8^attempts <= 100ms} *
  • {@code 5ms <= p50(rw)*0.5} * *

    These calculations are put together to construct a range from which we draw a random number. * The period we wait for {@code X} will be drawn so that {@code min <= X < max}. * *

    With the constraint that {@code max} must be {@code minDelta} greater than {@code min}, * but no greater than its expression-defined maximum. {@code max} will be increased up until * this point, after which {@code min} will be decreased until this gap is imposed. * *

    The {@link #waitRandomizer} property specifies the manner in which a random value is drawn from the range. * It is defined using one of the following specifiers: *

  • uniform *
  • exp($power$) or exponential($power$) *
  • qexp($power$) or qexponential($power$) or quantizedexponential($power$) * * The uniform specifier is self-explanatory, selecting all values in the range with equal probability. * The exponential specifier draws values towards the end of the range with higher probability, raising * a floating point number in the range [0..1.0) to the power provided, and translating the resulting value * to a uniform value in the range. * The quantized exponential specifier partitions the range into {@code attempts} buckets, then applies the pure * exponential approach to draw values from [0..attempts), before drawing a uniform value from the corresponding bucket * *

    Finally, there is also a {@link #traceAfterAttempts} property that permits initiating tracing of operations * that experience a certain minimum number of failed paxos rounds due to contention. A setting of 0 or 1 will initiate * a trace session after the first failed ballot. */ public class ContentionStrategy { private static final Logger logger = LoggerFactory.getLogger(ContentionStrategy.class); private static final Pattern BOUND = Pattern.compile( "(?0|[0-9]+[mu]s)" + "|((?0|[0-9]+[mu]s) *<= *)?" + "(p(?[0-9]+)\\((?r|w|rw|wr)\\)|(?0|[0-9]+[mu]s))" + "\\s*([*]\\s*(?[0-9.]+)?\\s*(?[*^]\\s*attempts)?)?" + "( *<= *(?0|[0-9]+[mu]s))?"); private static final Pattern TIME = Pattern.compile( "0|([0-9]+)ms|([0-9]+)us"); private static final Pattern RANDOMIZER = Pattern.compile( "uniform|exp(onential)?[(](?[0-9.]+)[)]|q(uantized)?exp(onential)?[(](?[0-9.]+)[)]"); private static final String DEFAULT_WAIT_RANDOMIZER = "qexp(1.5)"; // at least 0ms, and at least 66% of median latency private static final String DEFAULT_MIN = "0 <= p50(rw)*0.66"; // at least 0ms, and at least 66% of median latency private static final String DEFAULT_MAX = "10ms <= p95(rw)*1.8^attempts <= 100ms"; // p95 latency with exponential back-off at rate of 1.8^attempts private static final String DEFAULT_MIN_DELTA = "5ms <= p50(rw)*0.5"; // at least 5ms, and at least 50% of median latency private static volatile ContentionStrategy current; // Factories can be useful for testing purposes, to supply custom implementations of selectors and modifiers. final static LatencySelectorFactory selectors = new LatencySelectorFactory(){}; final static LatencyModifierFactory modifiers = new LatencyModifierFactory(){}; final static WaitRandomizerFactory randomizers = new WaitRandomizerFactory(){}; static { current = new ContentionStrategy(defaultWaitRandomizer(), defaultMinWait(), defaultMaxWait(), defaultMinDelta(), Integer.MAX_VALUE); } static interface LatencyModifierFactory { default LatencyModifier identity() { return (l, a) -> l; } default LatencyModifier multiply(double constant) { return (l, a) -> saturatedCast(l * constant); } default LatencyModifier multiplyByAttempts(double multiply) { return (l, a) -> saturatedCast(l * multiply * a); } default LatencyModifier multiplyByAttemptsExp(double base) { return (l, a) -> saturatedCast(l * pow(base, a)); } } static interface LatencySupplier { abstract long get(double percentile); } static interface LatencySelector { abstract long select(LatencySupplier readLatencyHistogram, LatencySupplier writeLatencyHistogram); } static interface LatencySelectorFactory { default LatencySelector constant(long latency) { return (read, write) -> latency; } default LatencySelector read(double percentile) { return (read, write) -> read.get(percentile); } default LatencySelector write(double percentile) { return (read, write) -> write.get(percentile); } default LatencySelector maxReadWrite(double percentile) { return (read, write) -> max(read.get(percentile), write.get(percentile)); } } static interface LatencyModifier { long modify(long latency, int attempts); } static interface WaitRandomizer { abstract long wait(long min, long max, int attempts); } static interface WaitRandomizerFactory { default LongBinaryOperator uniformLongSupplier() { return (min, max) -> ThreadLocalRandom.current().nextLong(min, max); } // DO NOT USE METHOD HANDLES (want to fetch afresh each time) default DoubleSupplier uniformDoubleSupplier() { return () -> ThreadLocalRandom.current().nextDouble(); } default WaitRandomizer uniform() { return new Uniform(uniformLongSupplier()); } default WaitRandomizer exponential(double power) { return new Exponential(uniformLongSupplier(), uniformDoubleSupplier(), power); } default WaitRandomizer quantizedExponential(double power) { return new QuantizedExponential(uniformLongSupplier(), uniformDoubleSupplier(), power); } static class Uniform implements WaitRandomizer { final LongBinaryOperator uniformLong; public Uniform(LongBinaryOperator uniformLong) { this.uniformLong = uniformLong; } @Override public long wait(long min, long max, int attempts) { return uniformLong.applyAsLong(min, max); } } static abstract class AbstractExponential implements WaitRandomizer { final LongBinaryOperator uniformLong; final DoubleSupplier uniformDouble; final double power; public AbstractExponential(LongBinaryOperator uniformLong, DoubleSupplier uniformDouble, double power) { this.uniformLong = uniformLong; this.uniformDouble = uniformDouble; this.power = power; } } static class Exponential extends AbstractExponential { public Exponential(LongBinaryOperator uniformLong, DoubleSupplier uniformDouble, double power) { super(uniformLong, uniformDouble, power); } @Override public long wait(long min, long max, int attempts) { if (attempts == 1) return uniformLong.applyAsLong(min, max); double p = uniformDouble.getAsDouble(); long delta = max - min; delta *= Math.pow(p, power); return max - delta; } } static class QuantizedExponential extends AbstractExponential { public QuantizedExponential(LongBinaryOperator uniformLong, DoubleSupplier uniformDouble, double power) { super(uniformLong, uniformDouble, power); } @Override public long wait(long min, long max, int attempts) { long quanta = (max - min) / attempts; if (attempts == 1 || quanta == 0) return uniformLong.applyAsLong(min, max); double p = uniformDouble.getAsDouble(); int base = (int) (attempts * Math.pow(p, power)); return max - ThreadLocalRandom.current().nextLong(quanta * base, quanta * (base + 1)); } } } static class SnapshotAndTime { final long validUntil; final Snapshot snapshot; SnapshotAndTime(long validUntil, Snapshot snapshot) { this.validUntil = validUntil; this.snapshot = snapshot; } } static class TimeLimitedLatencySupplier extends AtomicReference implements LatencySupplier { final Supplier snapshotSupplier; final long validForNanos; TimeLimitedLatencySupplier(Supplier snapshotSupplier, long time, TimeUnit units) { this.snapshotSupplier = snapshotSupplier; this.validForNanos = units.toNanos(time); } private Snapshot getSnapshot() { long now = nanoTime(); SnapshotAndTime cur = get(); if (cur != null && cur.validUntil > now) return cur.snapshot; Snapshot newSnapshot = snapshotSupplier.get(); SnapshotAndTime next = new SnapshotAndTime(now + validForNanos, newSnapshot); if (compareAndSet(cur, next)) return next.snapshot; return accumulateAndGet(next, (a, b) -> a.validUntil > b.validUntil ? a : b).snapshot; } @Override public long get(double percentile) { return (long)getSnapshot().getValue(percentile); } } static class Bound { final long min, max, onFailure; final LatencyModifier modifier; final LatencySelector selector; final LatencySupplier reads, writes; Bound(long min, long max, long onFailure, LatencyModifier modifier, LatencySelector selector) { Preconditions.checkArgument(min<=max, "min (%s) must be less than or equal to max (%s)", min, max); this.min = min; this.max = max; this.onFailure = onFailure; this.modifier = modifier; this.selector = selector; this.reads = new TimeLimitedLatencySupplier(casReadMetrics.latency::getSnapshot, 10L, SECONDS); this.writes = new TimeLimitedLatencySupplier(casWriteMetrics.latency::getSnapshot, 10L, SECONDS); } long get(int attempts) { try { long base = selector.select(reads, writes); return max(min, min(max, modifier.modify(base, attempts))); } catch (Throwable t) { NoSpamLogger.getLogger(logger, 1L, MINUTES).info("", t); return onFailure; } } public String toString() { return "Bound{" + "min=" + min + ", max=" + max + ", onFailure=" + onFailure + ", modifier=" + modifier + ", selector=" + selector + '}'; } } final WaitRandomizer waitRandomizer; final Bound min, max, minDelta; final int traceAfterAttempts; public ContentionStrategy(String waitRandomizer, String min, String max, String minDelta, int traceAfterAttempts) { this.waitRandomizer = parseWaitRandomizer(waitRandomizer); this.min = parseBound(min, true); this.max = parseBound(max, false); this.minDelta = parseBound(minDelta, true); this.traceAfterAttempts = traceAfterAttempts; } public enum Type { READ("Contended Paxos Read"), WRITE("Contended Paxos Write"), REPAIR("Contended Paxos Repair"); final String traceTitle; final String lowercase; Type(String traceTitle) { this.traceTitle = traceTitle; this.lowercase = name().toLowerCase(); } } long computeWaitUntilForContention(int attempts, TableMetadata table, DecoratedKey partitionKey, ConsistencyLevel consistency, Type type) { if (attempts >= traceAfterAttempts && !Tracing.isTracing()) { Tracing.instance.newSession(Tracing.TraceType.QUERY); Tracing.instance.begin(type.traceTitle, ImmutableMap.of( "keyspace", table.keyspace, "table", table.name, "partitionKey", table.partitionKeyType.getString(partitionKey.getKey()), "consistency", consistency.name(), "kind", type.lowercase )); logger.info("Tracing contended paxos {} for key {} on {}.{} with trace id {}", type.lowercase, ByteBufferUtil.bytesToHex(partitionKey.getKey()), table.keyspace, table.name, Tracing.instance.getSessionId()); } long minWaitMicros = min.get(attempts); long maxWaitMicros = max.get(attempts); long minDeltaMicros = minDelta.get(attempts); if (minWaitMicros + minDeltaMicros > maxWaitMicros) { maxWaitMicros = minWaitMicros + minDeltaMicros; if (maxWaitMicros > this.max.max) { maxWaitMicros = this.max.max; minWaitMicros = max(this.min.min, min(this.min.max, maxWaitMicros - minDeltaMicros)); } } long wait = waitRandomizer.wait(minWaitMicros, maxWaitMicros, attempts); return nanoTime() + MICROSECONDS.toNanos(wait); } boolean doWaitForContention(long deadline, int attempts, TableMetadata table, DecoratedKey partitionKey, ConsistencyLevel consistency, Type type) { long until = computeWaitUntilForContention(attempts, table, partitionKey, consistency, type); if (until >= deadline) return false; try { waitUntil(until); } catch (InterruptedException e) { Thread.currentThread().interrupt(); return false; } return true; } static boolean waitForContention(long deadline, int attempts, TableMetadata table, DecoratedKey partitionKey, ConsistencyLevel consistency, Type type) { return current.doWaitForContention(deadline, attempts, table, partitionKey, consistency, type); } static long waitUntilForContention(int attempts, TableMetadata table, DecoratedKey partitionKey, ConsistencyLevel consistency, Type type) { return current.computeWaitUntilForContention(attempts, table, partitionKey, consistency, type); } static class ParsedStrategy { final String waitRandomizer, min, max, minDelta; final ContentionStrategy strategy; ParsedStrategy(String waitRandomizer, String min, String max, String minDelta, ContentionStrategy strategy) { this.waitRandomizer = waitRandomizer; this.min = min; this.max = max; this.minDelta = minDelta; this.strategy = strategy; } } @VisibleForTesting static ParsedStrategy parseStrategy(String spec) { String[] args = spec.split(","); String waitRandomizer = find(args, "random"); String min = find(args, "min"); String max = find(args, "max"); String minDelta = find(args, "delta"); String trace = find(args, "trace"); if (waitRandomizer == null) waitRandomizer = defaultWaitRandomizer(); if (min == null) min = defaultMinWait(); if (max == null) max = defaultMaxWait(); if (minDelta == null) minDelta = defaultMinDelta(); int traceAfterAttempts = trace == null ? current.traceAfterAttempts: Integer.parseInt(trace); ContentionStrategy strategy = new ContentionStrategy(waitRandomizer, min, max, minDelta, traceAfterAttempts); return new ParsedStrategy(waitRandomizer, min, max, minDelta, strategy); } public static void setStrategy(String spec) { ParsedStrategy parsed = parseStrategy(spec); current = parsed.strategy; setPaxosContentionWaitRandomizer(parsed.waitRandomizer); setPaxosContentionMinWait(parsed.min); setPaxosContentionMaxWait(parsed.max); setPaxosContentionMinDelta(parsed.minDelta); } public static String getStrategySpec() { return "min=" + defaultMinWait() + ",max=" + defaultMaxWait() + ",delta=" + defaultMinDelta() + ",random=" + defaultWaitRandomizer() + ",trace=" + current.traceAfterAttempts; } private static String find(String[] args, String param) { return stream(args).filter(s -> s.startsWith(param + '=')) .map(s -> s.substring(param.length() + 1)) .findFirst().orElse(null); } private static LatencySelector parseLatencySelector(Matcher m, LatencySelectorFactory selectors) { String perc = m.group("perc"); if (perc == null) return selectors.constant(parseInMicros(m.group("constbase"))); double percentile = parseDouble("0." + perc); String rw = m.group("rw"); if (rw.length() == 2) return selectors.maxReadWrite(percentile); else if ("r".equals(rw)) return selectors.read(percentile); else return selectors.write(percentile); } private static LatencyModifier parseLatencyModifier(Matcher m, LatencyModifierFactory modifiers) { String mod = m.group("mod"); if (mod == null) return modifiers.identity(); double modifier = parseDouble(mod); String modkind = m.group("modkind"); if (modkind == null) return modifiers.multiply(modifier); if (modkind.startsWith("*")) return modifiers.multiplyByAttempts(modifier); else if (modkind.startsWith("^")) return modifiers.multiplyByAttemptsExp(modifier); else throw new IllegalArgumentException("Unrecognised attempt modifier: " + modkind); } static long saturatedCast(double v) { if (v > Long.MAX_VALUE) return Long.MAX_VALUE; return (long) v; } static WaitRandomizer parseWaitRandomizer(String input) { return parseWaitRandomizer(input, randomizers); } static WaitRandomizer parseWaitRandomizer(String input, WaitRandomizerFactory randomizers) { Matcher m = RANDOMIZER.matcher(input); if (!m.matches()) throw new IllegalArgumentException(input + " does not match" + RANDOMIZER); String exp; exp = m.group("exp"); if (exp != null) return randomizers.exponential(Double.parseDouble(exp)); exp = m.group("qexp"); if (exp != null) return randomizers.quantizedExponential(Double.parseDouble(exp)); return randomizers.uniform(); } static Bound parseBound(String input, boolean isMin) { return parseBound(input, isMin, selectors, modifiers); } @VisibleForTesting static Bound parseBound(String input, boolean isMin, LatencySelectorFactory selectors, LatencyModifierFactory modifiers) { Matcher m = BOUND.matcher(input); if (!m.matches()) throw new IllegalArgumentException(input + " does not match " + BOUND); String maybeConst = m.group("const"); if (maybeConst != null) { long v = parseInMicros(maybeConst); return new Bound(v, v, v, modifiers.identity(), selectors.constant(v)); } long min = parseInMicros(m.group("min"), 0); long max = parseInMicros(m.group("max"), maxQueryTimeoutMicros() / 2); return new Bound(min, max, isMin ? min : max, parseLatencyModifier(m, modifiers), parseLatencySelector(m, selectors)); } private static long parseInMicros(String input, long orElse) { if (input == null) return orElse; return parseInMicros(input); } private static long parseInMicros(String input) { Matcher m = TIME.matcher(input); if (!m.matches()) throw new IllegalArgumentException(input + " does not match " + TIME); String text; if (null != (text = m.group(1))) return parseInt(text) * 1000; else if (null != (text = m.group(2))) return parseInt(text); else return 0; } @VisibleForTesting static String defaultWaitRandomizer() { return orElse(DatabaseDescriptor::getPaxosContentionWaitRandomizer, DEFAULT_WAIT_RANDOMIZER); } @VisibleForTesting static String defaultMinWait() { return orElse(DatabaseDescriptor::getPaxosContentionMinWait, DEFAULT_MIN); } @VisibleForTesting static String defaultMaxWait() { return orElse(DatabaseDescriptor::getPaxosContentionMaxWait, DEFAULT_MAX); } @VisibleForTesting static String defaultMinDelta() { return orElse(DatabaseDescriptor::getPaxosContentionMinDelta, DEFAULT_MIN_DELTA); } @VisibleForTesting static long maxQueryTimeoutMicros() { return max(max(getCasContentionTimeout(MICROSECONDS), getWriteRpcTimeout(MICROSECONDS)), getReadRpcTimeout(MICROSECONDS)); } private static String orElse(Supplier get, String orElse) { String result = get.get(); return result != null ? result : orElse; } }





  • © 2015 - 2024 Weber Informatics LLC | Privacy Policy