All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.palantir.atlasdb.keyvalue.cassandra.CassandraRequestExceptionHandler Maven / Gradle / Ivy

The newest version!
/*
 * (c) Copyright 2018 Palantir Technologies Inc. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.palantir.atlasdb.keyvalue.cassandra;

import com.google.common.annotations.VisibleForTesting;
import com.palantir.atlasdb.keyvalue.api.InsufficientConsistencyException;
import com.palantir.atlasdb.keyvalue.cassandra.pool.CassandraServer;
import com.palantir.common.exception.AtlasDbDependencyException;
import com.palantir.logsafe.SafeArg;
import com.palantir.logsafe.UnsafeArg;
import com.palantir.logsafe.logger.SafeLogger;
import com.palantir.logsafe.logger.SafeLoggerFactory;
import java.net.SocketTimeoutException;
import java.time.Duration;
import java.util.NoSuchElementException;
import java.util.Optional;
import java.util.concurrent.ThreadLocalRandom;
import java.util.function.Supplier;
import org.apache.cassandra.thrift.InvalidRequestException;
import org.apache.cassandra.thrift.TimedOutException;
import org.apache.cassandra.thrift.UnavailableException;
import org.apache.thrift.transport.TTransportException;

class CassandraRequestExceptionHandler {
    private static final SafeLogger log = SafeLoggerFactory.get(CassandraRequestExceptionHandler.class);

    private final Supplier maxTriesSameHost;
    private final Supplier maxTriesTotal;
    private final Supplier useConservativeHandler;
    private final Blacklist blacklist;
    private final Optional overrideStrategyForTest;

    CassandraRequestExceptionHandler(
            Supplier maxTriesSameHost,
            Supplier maxTriesTotal,
            Supplier useConservativeHandler,
            Blacklist blacklist) {
        this.maxTriesSameHost = maxTriesSameHost;
        this.maxTriesTotal = maxTriesTotal;
        this.useConservativeHandler = useConservativeHandler;
        this.blacklist = blacklist;
        this.overrideStrategyForTest = Optional.empty();
    }

    private CassandraRequestExceptionHandler(
            Supplier maxTriesSameHost, Supplier maxTriesTotal, Blacklist blacklist) {
        this.maxTriesSameHost = maxTriesSameHost;
        this.maxTriesTotal = maxTriesTotal;
        this.useConservativeHandler = () -> true;
        this.blacklist = blacklist;
        this.overrideStrategyForTest = Optional.of(NoBackoffForTesting.INSTANCE);
    }

    static CassandraRequestExceptionHandler withNoBackoffForTest(
            Supplier maxTriesSameHost, Supplier maxTriesTotal, Blacklist blacklist) {
        return new CassandraRequestExceptionHandler(maxTriesSameHost, maxTriesTotal, blacklist);
    }

    @SuppressWarnings("unchecked")
     void handleExceptionFromRequest(
            RetryableCassandraRequest req, CassandraServer serverTried, Exception ex) throws K {
        if (!isRetryable(ex)) {
            throw (K) ex;
        }

        RequestExceptionHandlerStrategy strategy = getStrategy();
        req.triedOnHost(serverTried);
        req.registerException(ex);
        int numberOfAttempts = req.getNumberOfAttempts();
        int numberOfAttemptsOnHost = req.getNumberOfAttemptsOnHost(serverTried);

        if (numberOfAttempts >= maxTriesTotal.get()) {
            throw logAndThrowException(numberOfAttempts, numberOfAttemptsOnHost, ex, serverTried, req);
        }

        if (shouldBlacklist(ex, numberOfAttemptsOnHost)) {
            blacklist.add(serverTried);
        }

        logNumberOfAttempts(ex, numberOfAttempts);
        handleBackoff(req, serverTried, ex, strategy);
        handleRetryOnDifferentHosts(req, serverTried, ex, strategy);
    }

    @VisibleForTesting
    RequestExceptionHandlerStrategy getStrategy() {
        return overrideStrategyForTest.orElseGet(this::resolveStrategy);
    }

    private RequestExceptionHandlerStrategy resolveStrategy() {
        if (useConservativeHandler.get()) {
            return Conservative.INSTANCE;
        } else {
            return LegacyExceptionHandler.INSTANCE;
        }
    }

    private static AtlasDbDependencyException logAndThrowException(
            int numberOfAttempts,
            int numberOfAttemptsOnHost,
            Exception ex,
            CassandraServer serverTried,
            RetryableCassandraRequest req) {
        log.warn(
                "Tried to connect to cassandra {} times. Exception message was: {} : {}",
                SafeArg.of("numTries", numberOfAttempts),
                SafeArg.of("numTriesOnHost", numberOfAttemptsOnHost),
                SafeArg.of("exceptionClass", ex.getClass().getTypeName()),
                SafeArg.of("serverHostNameTried", serverTried.cassandraHostName()),
                UnsafeArg.of("exceptionMessage", ex.getMessage()));
        throw req.throwLimitReached();
    }

    private void logNumberOfAttempts(Exception ex, int numberOfAttempts) {
        // Only log the actual exception the first time
        if (numberOfAttempts > 1) {
            log.debug(
                    "Error occurred talking to cassandra. Attempt {} of {}. Exception message was: {} : {}",
                    SafeArg.of("numTries", numberOfAttempts),
                    SafeArg.of("maxTotalTries", maxTriesTotal.get()),
                    SafeArg.of("exceptionClass", ex.getClass().getTypeName()),
                    UnsafeArg.of("exceptionMessage", ex.getMessage()));
        } else {
            log.debug(
                    "Error occurred talking to cassandra. Attempt {} of {}.",
                    SafeArg.of("numTries", numberOfAttempts),
                    SafeArg.of("maxTotalTries", maxTriesTotal.get()),
                    ex);
        }
    }

    @VisibleForTesting
    boolean shouldBlacklist(Exception ex, int numberOfAttempts) {
        return isConnectionException(ex)
                && numberOfAttempts >= maxTriesSameHost.get()
                && !isExceptionNotImplicatingThisParticularNode(ex);
    }

    private  void handleBackoff(
            RetryableCassandraRequest req,
            CassandraServer serverTried,
            Exception ex,
            RequestExceptionHandlerStrategy strategy) {
        if (!shouldBackoff(ex, strategy)) {
            return;
        }

        long backOffPeriod = strategy.getBackoffPeriod(req.getNumberOfAttemptsOnHost(serverTried));
        log.info(
                "Retrying a query, {}, with backoff of {}ms, intended for host {}.",
                UnsafeArg.of("queryString", req.getFunction().toString()),
                SafeArg.of("sleepDuration", backOffPeriod),
                SafeArg.of("cassandraHost", serverTried.cassandraHostName()),
                SafeArg.of("proxy", CassandraLogHelper.host(serverTried.proxy())));

        try {
            Thread.sleep(backOffPeriod);
        } catch (InterruptedException i) {
            Thread.currentThread().interrupt();
            throw new RuntimeException(i);
        }
    }

    @VisibleForTesting
    boolean shouldBackoff(Exception ex, RequestExceptionHandlerStrategy strategy) {
        return strategy.shouldBackoff(ex);
    }

    private  void handleRetryOnDifferentHosts(
            RetryableCassandraRequest req,
            CassandraServer serverTried,
            Exception ex,
            RequestExceptionHandlerStrategy strategy) {
        if (shouldRetryOnDifferentHost(ex, req.getNumberOfAttemptsOnHost(serverTried), strategy)) {
            log.info(
                    "Retrying a query intended for host {} on a different host.",
                    SafeArg.of("cassandraHost", serverTried.cassandraHostName()),
                    SafeArg.of("proxy", CassandraLogHelper.host(serverTried.proxy())));
            req.giveUpOnPreferredHost();
        }
    }

    @VisibleForTesting
    boolean shouldRetryOnDifferentHost(Exception ex, int numberOfAttempts, RequestExceptionHandlerStrategy strategy) {
        return strategy.shouldRetryOnDifferentHost(ex, maxTriesSameHost.get(), numberOfAttempts);
    }

    // Determine the behavior we want from each type of exception.

    @VisibleForTesting
    static boolean isRetryable(Exception ex) {
        return isConnectionException(ex)
                || isTransientException(ex)
                || isIndicativeOfCassandraLoad(ex)
                || isFastFailoverException(ex);
    }

    // Group exceptions by type.

    static boolean isConnectionException(Throwable ex) {
        return ex != null
                // tcp socket timeout, possibly indicating network flake, long GC, or restarting server.
                && (ex instanceof SocketTimeoutException
                        || ex instanceof CassandraClientFactory.ClientCreationFailedException
                        || isConnectionException(ex.getCause()));
    }

    private static boolean isTransientException(Throwable ex) {
        return ex != null
                // There's a problem with the connection to Cassandra.
                && (ex instanceof TTransportException || isTransientException(ex.getCause()));
    }

    static boolean isIndicativeOfCassandraLoad(Throwable ex) {
        // TODO (jkong): Make NoSuchElementException its own thing - that is NOT necessarily indicative of C* load.
        return ex != null
                // pool for this node is fully in use
                && (ex instanceof NoSuchElementException
                        // Cassandra timeout. Maybe took too long to CAS, or Cassandra is under load.
                        || ex instanceof TimedOutException
                        // remote cassandra node couldn't talk to enough other remote cassandra nodes to answer
                        || ex instanceof UnavailableException
                        // Not enough Cassandra nodes are up.
                        || ex instanceof InsufficientConsistencyException
                        || isIndicativeOfCassandraLoad(ex.getCause()));
    }

    static boolean isFastFailoverException(Throwable ex) {
        return ex != null
                // underlying cassandra table does not exist. The table might exist on other cassandra nodes.
                && (ex instanceof InvalidRequestException || isFastFailoverException(ex.getCause()));
    }

    static boolean isExceptionNotImplicatingThisParticularNode(Throwable ex) {
        // client pool has no more elements to give, so this service node has too many requests in flight.
        return ex instanceof NoSuchElementException
                // Other nodes are down. Technically this node might be isolated from a functional quorum via a network
                // partition but we consider that to be an edge case.
                || ex instanceof InsufficientConsistencyException
                || isIndicativeOfCassandraLoad(ex.getCause());
    }

    @VisibleForTesting
    interface RequestExceptionHandlerStrategy {
        boolean shouldBackoff(Exception ex);

        long getBackoffPeriod(int numberOfAttempts);

        /**
         * Exceptions that cause a host to be blacklisted shouldn't be retried on another host. As number of
         * retries are recorded per request, and we want the number of retries on that specific host to exceed a
         * determined value before blacklisting.
         */
        boolean shouldRetryOnDifferentHost(Exception ex, int maxTriesSameHost, int numberOfAttempts);
    }

    private static final class LegacyExceptionHandler implements RequestExceptionHandlerStrategy {
        private static final RequestExceptionHandlerStrategy INSTANCE = new LegacyExceptionHandler();

        private static final long BACKOFF_DURATION = Duration.ofSeconds(1).toMillis();
        private static final int FIVE_HUNDRED = 500;

        @Override
        public boolean shouldBackoff(Exception ex) {
            return isConnectionException(ex) || isIndicativeOfCassandraLoad(ex);
        }

        @Override
        public long getBackoffPeriod(int numberOfAttempts) {
            // And value between -500 and +500ms to backoff to better spread load on failover
            return numberOfAttempts * BACKOFF_DURATION
                    + (ThreadLocalRandom.current().nextInt(2 * FIVE_HUNDRED) - FIVE_HUNDRED);
        }

        @Override
        public boolean shouldRetryOnDifferentHost(Exception ex, int maxTriesSameHost, int numberOfAttempts) {
            return isFastFailoverException(ex) || numberOfAttempts >= maxTriesSameHost;
        }
    }

    private static class Conservative implements RequestExceptionHandlerStrategy {
        private static final RequestExceptionHandlerStrategy INSTANCE = new Conservative();
        private static final long MAX_BACKOFF = Duration.ofSeconds(30).toMillis();
        private static final double UNCERTAINTY = 0.5;
        private static final double BACKOFF_COEFF = 100;

        private static final long MAX_EXPECTED_BACKOFF = (long) (MAX_BACKOFF / (UNCERTAINTY + 1));

        @Override
        public boolean shouldBackoff(Exception ex) {
            return !isFastFailoverException(ex);
        }

        @Override
        public long getBackoffPeriod(int numberOfAttempts) {
            double randomCoeff = ThreadLocalRandom.current().nextDouble(1 - UNCERTAINTY, 1 + UNCERTAINTY);
            return (long) (Math.min(BACKOFF_COEFF * Math.pow(2, numberOfAttempts), (double) MAX_EXPECTED_BACKOFF)
                    * randomCoeff);
        }

        @Override
        public boolean shouldRetryOnDifferentHost(Exception ex, int maxTriesSameHost, int numberOfAttempts) {
            return isFastFailoverException(ex)
                    || isIndicativeOfCassandraLoad(ex)
                    || numberOfAttempts >= maxTriesSameHost;
        }
    }

    private static final class NoBackoffForTesting extends Conservative {
        private static final RequestExceptionHandlerStrategy INSTANCE = new NoBackoffForTesting();

        @Override
        public long getBackoffPeriod(int numberOfAttempts) {
            return 0;
        }
    }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy