// NOTE(review): the lines below were scraper boilerplate from a Maven-repository
// download page, not part of the original source; converted to a comment so the
// file remains valid Java. Source artifact:
// com.palantir.atlasdb.keyvalue.cassandra.CassandraKeyValueServiceImpl (newest version).
/*
 * (c) Copyright 2018 Palantir Technologies Inc. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.palantir.atlasdb.keyvalue.cassandra;

import com.codahale.metrics.Counter;
import com.datastax.driver.core.exceptions.DriverInternalError;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Functions;
import com.google.common.base.Predicates;
import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterables;
import com.google.common.collect.Iterators;
import com.google.common.collect.ListMultimap;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimap;
import com.google.common.collect.Multimaps;
import com.google.common.collect.SetMultimap;
import com.google.common.collect.Sets;
import com.google.common.util.concurrent.Futures;
import com.google.common.util.concurrent.ListenableFuture;
import com.google.common.util.concurrent.UncheckedExecutionException;
import com.google.protobuf.ByteString;
import com.palantir.async.initializer.AsyncInitializer;
import com.palantir.atlasdb.AtlasDbConstants;
import com.palantir.atlasdb.AtlasDbMetricNames.CellFilterMetrics;
import com.palantir.atlasdb.CassandraTopologyValidationMetrics;
import com.palantir.atlasdb.cassandra.CassandraKeyValueServiceConfig;
import com.palantir.atlasdb.cassandra.CassandraKeyValueServiceRuntimeConfig;
import com.palantir.atlasdb.cassandra.CassandraMutationTimestampProvider;
import com.palantir.atlasdb.cassandra.CassandraMutationTimestampProviders;
import com.palantir.atlasdb.cassandra.CassandraServersConfigs.CassandraServersConfig;
import com.palantir.atlasdb.encoding.PtBytes;
import com.palantir.atlasdb.keyvalue.api.AsyncKeyValueService;
import com.palantir.atlasdb.keyvalue.api.BatchColumnRangeSelection;
import com.palantir.atlasdb.keyvalue.api.CandidateCellForSweeping;
import com.palantir.atlasdb.keyvalue.api.CandidateCellForSweepingRequest;
import com.palantir.atlasdb.keyvalue.api.Cell;
import com.palantir.atlasdb.keyvalue.api.CheckAndSetCompatibility;
import com.palantir.atlasdb.keyvalue.api.CheckAndSetException;
import com.palantir.atlasdb.keyvalue.api.CheckAndSetRequest;
import com.palantir.atlasdb.keyvalue.api.ClusterAvailabilityStatus;
import com.palantir.atlasdb.keyvalue.api.ColumnSelection;
import com.palantir.atlasdb.keyvalue.api.ImmutableCandidateCellForSweepingRequest;
import com.palantir.atlasdb.keyvalue.api.InsufficientConsistencyException;
import com.palantir.atlasdb.keyvalue.api.KeyAlreadyExistsException;
import com.palantir.atlasdb.keyvalue.api.KeyValueService;
import com.palantir.atlasdb.keyvalue.api.MultiCheckAndSetException;
import com.palantir.atlasdb.keyvalue.api.MultiCheckAndSetRequest;
import com.palantir.atlasdb.keyvalue.api.RangeRequest;
import com.palantir.atlasdb.keyvalue.api.RangeRequests;
import com.palantir.atlasdb.keyvalue.api.RetryLimitReachedException;
import com.palantir.atlasdb.keyvalue.api.RowColumnRangeIterator;
import com.palantir.atlasdb.keyvalue.api.RowResult;
import com.palantir.atlasdb.keyvalue.api.TableReference;
import com.palantir.atlasdb.keyvalue.api.TimestampRangeDelete;
import com.palantir.atlasdb.keyvalue.api.Value;
import com.palantir.atlasdb.keyvalue.cassandra.CassandraClientPoolImpl.StartupChecks;
import com.palantir.atlasdb.keyvalue.cassandra.CassandraKeyValueServices.ColumnAndTimestamp;
import com.palantir.atlasdb.keyvalue.cassandra.CassandraKeyValueServices.StartTsResultsCollector;
import com.palantir.atlasdb.keyvalue.cassandra.CassandraVerifier.CassandraVerifierConfig;
import com.palantir.atlasdb.keyvalue.cassandra.RowColumnRangeExtractor.RowColumnRangeResult;
import com.palantir.atlasdb.keyvalue.cassandra.async.client.creation.ClusterFactory.CassandraClusterConfig;
import com.palantir.atlasdb.keyvalue.cassandra.cas.CheckAndSetRunner;
import com.palantir.atlasdb.keyvalue.cassandra.paging.RowGetter;
import com.palantir.atlasdb.keyvalue.cassandra.pool.CassandraServer;
import com.palantir.atlasdb.keyvalue.cassandra.sweep.CandidateRowForSweeping;
import com.palantir.atlasdb.keyvalue.cassandra.sweep.CandidateRowsForSweepingIterator;
import com.palantir.atlasdb.keyvalue.cassandra.thrift.MutationMap;
import com.palantir.atlasdb.keyvalue.cassandra.thrift.SlicePredicates;
import com.palantir.atlasdb.keyvalue.cassandra.thrift.SlicePredicates.Limit;
import com.palantir.atlasdb.keyvalue.cassandra.thrift.SlicePredicates.Range;
import com.palantir.atlasdb.keyvalue.impl.AbstractKeyValueService;
import com.palantir.atlasdb.keyvalue.impl.Cells;
import com.palantir.atlasdb.keyvalue.impl.CheckAndSetResult;
import com.palantir.atlasdb.keyvalue.impl.IterablePartitioner;
import com.palantir.atlasdb.keyvalue.impl.KeyValueServices;
import com.palantir.atlasdb.keyvalue.impl.LocalRowColumnRangeIterator;
import com.palantir.atlasdb.logging.LoggingArgs;
import com.palantir.atlasdb.table.description.TableMetadata;
import com.palantir.atlasdb.util.AnnotatedCallable;
import com.palantir.atlasdb.util.AnnotationType;
import com.palantir.atlasdb.util.AtlasDbMetrics;
import com.palantir.atlasdb.util.MetricsManager;
import com.palantir.atlasdb.util.MetricsManagers;
import com.palantir.common.annotation.Idempotent;
import com.palantir.common.base.ClosableIterator;
import com.palantir.common.base.ClosableIterators;
import com.palantir.common.base.FunctionCheckedException;
import com.palantir.common.base.Throwables;
import com.palantir.common.exception.AtlasDbDependencyException;
import com.palantir.common.exception.PalantirRuntimeException;
import com.palantir.common.streams.KeyedStream;
import com.palantir.logsafe.Preconditions;
import com.palantir.logsafe.SafeArg;
import com.palantir.logsafe.logger.SafeLogger;
import com.palantir.logsafe.logger.SafeLoggerFactory;
import com.palantir.refreshable.Refreshable;
import com.palantir.tritium.metrics.registry.TaggedMetricRegistry;
import com.palantir.util.paging.AbstractPagingIterable;
import com.palantir.util.paging.SimpleTokenBackedResultsPage;
import com.palantir.util.paging.TokenBackedBasicResultsPage;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.IdentityHashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;
import javax.annotation.Nullable;
import one.util.streamex.EntryStream;
import org.apache.cassandra.thrift.CASResult;
import org.apache.cassandra.thrift.CfDef;
import org.apache.cassandra.thrift.Column;
import org.apache.cassandra.thrift.ColumnOrSuperColumn;
import org.apache.cassandra.thrift.ConsistencyLevel;
import org.apache.cassandra.thrift.Deletion;
import org.apache.cassandra.thrift.KeyPredicate;
import org.apache.cassandra.thrift.KsDef;
import org.apache.cassandra.thrift.Mutation;
import org.apache.cassandra.thrift.SlicePredicate;
import org.apache.thrift.TException;

/**
 * Each service can have one or many C* KVS.
 * For each C* KVS, it maintains a list of active nodes, and the client connections attached to each node:
 *
 *   n1 -> c1, c2, c3
 *   n2 -> c5, c4, c9
 *   n3 -> [N C* thrift client connections]
 *
* Where {n1, n2, n3} are the active nodes in the C* cluster. Also each * node contains the clients which are attached to the node. * if some nodes are down, and the change can be detected through active hosts, * and these inactive nodes will be removed afterwards. */ @SuppressWarnings({"FinalClass", "Not final for mocking in tests"}) public class CassandraKeyValueServiceImpl extends AbstractKeyValueService implements CassandraKeyValueService { @VisibleForTesting class InitializingWrapper extends AsyncInitializer implements AutoDelegate_CassandraKeyValueService { @Override public CassandraKeyValueServiceImpl delegate() { checkInitialized(); return CassandraKeyValueServiceImpl.this; } @Override public Collection getDelegates() { return ImmutableList.of(delegate()); } @Override protected void tryInitialize() { CassandraKeyValueServiceImpl.this.tryInitialize(); } @Override public boolean supportsCheckAndSet() { return CassandraKeyValueServiceImpl.this.supportsCheckAndSet(); } @Override public CheckAndSetCompatibility getCheckAndSetCompatibility() { return CassandraKeyValueServiceImpl.this.getCheckAndSetCompatibility(); } @Override public boolean shouldTriggerCompactions() { return CassandraKeyValueServiceImpl.this.shouldTriggerCompactions(); } @Override public CassandraClientPool getClientPool() { return CassandraKeyValueServiceImpl.this.getClientPool(); } @Override protected String getInitializingClassName() { return "CassandraKeyValueService"; } @Override public void close() { cancelInitialization(CassandraKeyValueServiceImpl.this::close); } } static final ConsistencyLevel WRITE_CONSISTENCY = ConsistencyLevel.EACH_QUORUM; static final ConsistencyLevel DELETE_CONSISTENCY = ConsistencyLevel.ALL; private final SafeLogger log; private final MetricsManager metricsManager; private final CassandraKeyValueServiceConfig config; private final CassandraClientPool clientPool; private final ReadConsistencyProvider readConsistencyProvider = new ReadConsistencyProvider(); private 
final TracingQueryRunner queryRunner; private final WrappingQueryRunner wrappingQueryRunner; private final CellLoader cellLoader; private final AsyncKeyValueService asyncKeyValueService; private final RangeLoader rangeLoader; private final TaskRunner taskRunner; private final CellValuePutter cellValuePutter; private final CassandraTableMetadata tableMetadata; private final CassandraTableCreator cassandraTableCreator; private final CassandraTableDropper cassandraTableDropper; private final CassandraTableTruncator cassandraTableTruncator; private final CheckAndSetRunner checkAndSetRunner; private final CassandraTables cassandraTables; private final InitializingWrapper wrapper = new InitializingWrapper(); private final CassandraMutationTimestampProvider mutationTimestampProvider; private final Refreshable runtimeConfig; private final CassandraVerifierConfig verifierConfig; private final Function, ResultsExtractor> extractorFactory; public static CassandraKeyValueService createForTesting( CassandraKeyValueServiceConfig config, Refreshable runtimeConfig) { MetricsManager metricsManager = MetricsManagers.createForTests(); CassandraClientPool clientPool = CassandraClientPoolImpl.createImplForTest( metricsManager, config, runtimeConfig, StartupChecks.RUN, new Blacklist( config, runtimeConfig.map(CassandraKeyValueServiceRuntimeConfig::unresponsiveHostBackoffTimeSeconds)), CassandraTopologyValidator.create( CassandraTopologyValidationMetrics.of(metricsManager.getTaggedRegistry()), runtimeConfig), new CassandraAbsentHostTracker(config.consecutiveAbsencesBeforePoolRemoval())); return createOrShutdownClientPool( metricsManager, config, runtimeConfig, clientPool, CassandraMutationTimestampProviders.legacyModeForTestsOnly(), SafeLoggerFactory.get(CassandraKeyValueService.class), AtlasDbConstants.DEFAULT_INITIALIZE_ASYNC); } public static CassandraKeyValueService create( MetricsManager metricsManager, CassandraKeyValueServiceConfig config, Refreshable runtimeConfig, 
CassandraMutationTimestampProvider mutationTimestampProvider) { return create( metricsManager, config, runtimeConfig, mutationTimestampProvider, AtlasDbConstants.DEFAULT_INITIALIZE_ASYNC); } public static CassandraKeyValueService create( MetricsManager metricsManager, CassandraKeyValueServiceConfig config, Refreshable runtimeConfig, CassandraMutationTimestampProvider mutationTimestampProvider, CassandraClientPool clientPool) { return createOrShutdownClientPool( metricsManager, config, runtimeConfig, clientPool, mutationTimestampProvider, SafeLoggerFactory.get(CassandraKeyValueService.class), AtlasDbConstants.DEFAULT_INITIALIZE_ASYNC); } public static CassandraKeyValueService create( MetricsManager metricsManager, CassandraKeyValueServiceConfig config, Refreshable runtimeConfig, CassandraMutationTimestampProvider mutationTimestampProvider, boolean initializeAsync) { return create( metricsManager, config, runtimeConfig, mutationTimestampProvider, SafeLoggerFactory.get(CassandraKeyValueService.class), initializeAsync); } @VisibleForTesting static CassandraKeyValueService create( MetricsManager metricsManager, CassandraKeyValueServiceConfig config, Refreshable runtimeConfig, CassandraMutationTimestampProvider mutationTimestampProvider, SafeLogger log) { return create( metricsManager, config, runtimeConfig, mutationTimestampProvider, log, AtlasDbConstants.DEFAULT_INITIALIZE_ASYNC); } @VisibleForTesting static CassandraKeyValueService create( MetricsManager metricsManager, CassandraKeyValueServiceConfig config, Refreshable runtimeConfig, CassandraMutationTimestampProvider mutationTimestampProvider, SafeLogger log, boolean initializeAsync) { CassandraClientPool clientPool = CassandraClientPoolImpl.create(metricsManager, config, runtimeConfig, initializeAsync); return createOrShutdownClientPool( metricsManager, config, runtimeConfig, clientPool, mutationTimestampProvider, log, initializeAsync); } private static CassandraKeyValueService createOrShutdownClientPool( 
MetricsManager metricsManager, CassandraKeyValueServiceConfig config, Refreshable runtimeConfig, CassandraClientPool clientPool, CassandraMutationTimestampProvider mutationTimestampProvider, SafeLogger log, boolean initializeAsync) { try { return createWithCqlClient( metricsManager, config, runtimeConfig, clientPool, mutationTimestampProvider, log, initializeAsync); } catch (Exception e) { log.warn("Error occurred in creating Cassandra KVS. Now attempting to shut down client pool...", e); try { clientPool.shutdown(); log.info("Cassandra client pool shut down."); } catch (RuntimeException internalException) { log.info("An error occurred whilst shutting down the Cassandra client pool", internalException); throw internalException; } throw Throwables.rewrapAndThrowUncheckedException(e); } } private static CassandraKeyValueService createWithCqlClient( MetricsManager metricsManager, CassandraKeyValueServiceConfig config, Refreshable runtimeConfig, CassandraClientPool clientPool, CassandraMutationTimestampProvider mutationTimestampProvider, SafeLogger log, boolean initializeAsync) { try { CassandraClusterConfig clusterConfig = CassandraClusterConfig.of(config, runtimeConfig.get()); AsyncKeyValueService asyncKeyValueService = config.asyncKeyValueServiceFactory() .constructAsyncKeyValueService( metricsManager, config.getKeyspaceOrThrow(), clusterConfig, runtimeConfig.map(CassandraKeyValueServiceRuntimeConfig::servers), initializeAsync); return createAndInitialize( metricsManager, config, runtimeConfig, clientPool, asyncKeyValueService, mutationTimestampProvider, log, initializeAsync); } catch (Exception e) { log.warn("Exception during async KVS creation.", e); throw Throwables.unwrapAndThrowAtlasDbDependencyException(e); } } private static CassandraKeyValueService createAndInitialize( MetricsManager metricsManager, CassandraKeyValueServiceConfig config, Refreshable runtimeConfig, CassandraClientPool clientPool, AsyncKeyValueService asyncKeyValueService, 
CassandraMutationTimestampProvider mutationTimestampProvider, SafeLogger log, boolean initializeAsync) { Counter notLatestVisibleValueCellFilterCounter = // register counter once and reuse metricsManager.registerOrGetCounter(ValueExtractor.class, CellFilterMetrics.NOT_LATEST_VISIBLE_VALUE); Function, ResultsExtractor> extractorFactory = cellValueMap -> new ValueExtractor(metricsManager, cellValueMap, notLatestVisibleValueCellFilterCounter); CassandraKeyValueServiceImpl keyValueService = new CassandraKeyValueServiceImpl( log, metricsManager, config, asyncKeyValueService, runtimeConfig, clientPool, mutationTimestampProvider, extractorFactory); keyValueService.wrapper.initialize(initializeAsync); return keyValueService.wrapper.isInitialized() ? keyValueService : keyValueService.wrapper; } private CassandraKeyValueServiceImpl( SafeLogger log, MetricsManager metricsManager, CassandraKeyValueServiceConfig config, AsyncKeyValueService asyncKeyValueService, Refreshable runtimeConfig, CassandraClientPool clientPool, CassandraMutationTimestampProvider mutationTimestampProvider, Function, ResultsExtractor> extractorFactory) { super(createBlockingThreadpool(config, runtimeConfig.get().servers(), metricsManager)); this.log = log; this.metricsManager = metricsManager; this.config = config; this.clientPool = clientPool; this.asyncKeyValueService = asyncKeyValueService; this.mutationTimestampProvider = mutationTimestampProvider; this.queryRunner = new TracingQueryRunner(log, () -> runtimeConfig.get().tracing()); this.wrappingQueryRunner = new WrappingQueryRunner(queryRunner); this.cassandraTables = new CassandraTables(clientPool, config); this.taskRunner = new TaskRunner(executor); this.cellLoader = CellLoader.create(clientPool, wrappingQueryRunner, taskRunner, runtimeConfig); this.rangeLoader = new RangeLoader(clientPool, queryRunner, readConsistencyProvider, extractorFactory); this.cellValuePutter = new CellValuePutter( runtimeConfig, clientPool, taskRunner, wrappingQueryRunner, 
mutationTimestampProvider::getSweepSentinelWriteTimestamp); this.checkAndSetRunner = new CheckAndSetRunner(queryRunner); this.tableMetadata = new CassandraTableMetadata(rangeLoader, cassandraTables, clientPool, wrappingQueryRunner); this.cassandraTableCreator = new CassandraTableCreator(clientPool, config); this.cassandraTableTruncator = new CassandraTableTruncator(queryRunner, clientPool); this.cassandraTableDropper = new CassandraTableDropper(config, clientPool, tableMetadata, cassandraTableTruncator); this.runtimeConfig = runtimeConfig; this.verifierConfig = CassandraVerifierConfig.of(config, runtimeConfig.get()); this.extractorFactory = extractorFactory; } private static ExecutorService createBlockingThreadpool( CassandraKeyValueServiceConfig config, CassandraServersConfig serversConfig, MetricsManager metricsManager) { return config.thriftExecutorServiceFactory() .orElseGet(() -> instrumentedFixedThreadPoolSupplier( serversConfig, config.poolSize(), config.maxConnectionBurstSize(), metricsManager.getTaggedRegistry())) .get(); } private static Supplier instrumentedFixedThreadPoolSupplier( CassandraServersConfig serversConfig, int poolSize, int maxConnectionBurstSize, TaggedMetricRegistry registry) { return () -> { int numberOfThriftHosts = serversConfig.numberOfThriftHosts(); int corePoolSize = poolSize * numberOfThriftHosts; int maxPoolSize = maxConnectionBurstSize * numberOfThriftHosts; return createThreadPoolWithoutSpans("Atlas Cassandra KVS", corePoolSize, maxPoolSize); }; } @Override public boolean isInitialized() { return wrapper.isInitialized(); } protected void initialize(boolean asyncInitialize) { wrapper.initialize(asyncInitialize); } private void tryInitialize() { createTable(AtlasDbConstants.DEFAULT_METADATA_TABLE, AtlasDbConstants.EMPTY_TABLE_METADATA); lowerConsistencyWhenSafe(); upgradeFromOlderInternalSchema(); CassandraKeyValueServices.warnUserInInitializationIfClusterAlreadyInInconsistentState(clientPool, config); } @VisibleForTesting void 
upgradeFromOlderInternalSchema() { try { Map metadataForTables = getMetadataForTables(); final Collection updatedCfs = Lists.newArrayListWithExpectedSize(metadataForTables.size()); List knownCfs = clientPool.runWithRetry(client -> client.describe_keyspace(config.getKeyspaceOrThrow()).getCf_defs()); for (CfDef clusterSideCf : knownCfs) { TableReference tableRef = CassandraKeyValueServices.tableReferenceFromCfDef(clusterSideCf); Optional relevantMetadata = lookupClusterSideMetadata(metadataForTables, tableRef); if (relevantMetadata.isPresent()) { byte[] clusterSideMetadata = relevantMetadata.get(); CfDef clientSideCf = getCfForTable(tableRef, clusterSideMetadata, config.gcGraceSeconds()); if (!ColumnFamilyDefinitions.isMatchingCf(clientSideCf, clusterSideCf)) { // mismatch; we have changed how we generate schema since we last persisted log.warn("Upgrading table {} to new internal Cassandra schema", LoggingArgs.tableRef(tableRef)); updatedCfs.add(clientSideCf); } } else if (!HiddenTables.isHidden(tableRef)) { // Possible to get here from a race condition with another service starting up // and performing schema upgrades concurrent with us doing this check log.error( "Found a table {} that did not have persisted" + " AtlasDB metadata. If you recently did a Palantir update, try waiting until" + " schema upgrades are completed on all backend CLIs/services etc and restarting" + " this service. If this error re-occurs on subsequent attempted startups, please" + " contact Palantir support.", LoggingArgs.tableRef(tableRef)); } } // we are racing another service to do these same operations here, but they are idempotent / safe Map emptyMetadataUpdate = ImmutableMap.of(); if (!updatedCfs.isEmpty()) { putMetadataAndMaybeAlterTables(true, emptyMetadataUpdate, updatedCfs); log.info("New table-related settings were applied on startup!!"); } else { log.info("No tables are being upgraded on startup. 
No updated table-related settings found."); } } catch (TException e) { log.error( "Couldn't upgrade from an older internal Cassandra schema. New table-related settings may not have" + " taken effect.", e); } } private static Optional lookupClusterSideMetadata( Map metadataForTables, TableReference tableRef) { return Optional.ofNullable(metadataForTables.get(tableRef)) .or(() -> Maps.filterEntries(metadataForTables, entry -> matchingIgnoreCase(entry.getKey(), tableRef)) .values() .stream() .findAny()); } private void lowerConsistencyWhenSafe() { Set dcs; Map strategyOptions; try { dcs = clientPool.runWithRetry(client -> CassandraVerifier.sanityCheckDatacenters(client, verifierConfig)); KsDef ksDef = clientPool.runWithRetry(client -> client.describe_keyspace(config.getKeyspaceOrThrow())); strategyOptions = new HashMap<>(ksDef.getStrategy_options()); if (dcs.size() == 1) { String dc = dcs.iterator().next(); if (strategyOptions.get(dc) != null) { int currentRf = Integer.parseInt(strategyOptions.get(dc)); if (currentRf == runtimeConfig.get().replicationFactor()) { if (currentRf == 2 && config.clusterMeetsNormalConsistencyGuarantees()) { log.info("Setting Read Consistency to ONE, as cluster has only one datacenter at RF2."); readConsistencyProvider.lowerConsistencyLevelToOne(); } } } } } catch (TException e) { return; } } /** * Gets values from the key-value store. Requires a quorum of Cassandra nodes to be reachable. * * @param tableRef the name of the table to retrieve values from. * @param rows set containing the rows to retrieve values for. * @param selection specifies the set of columns to fetch. * @param startTs specifies the maximum timestamp (exclusive) at which to * retrieve each rows's value. * @return map of retrieved values. Values which do not exist (either * because they were deleted or never created in the first place) * are simply not returned. * @throws AtlasDbDependencyException if fewer than a quorum of Cassandra nodes are reachable. 
* @throws IllegalArgumentException if any of the requests were invalid * (e.g., attempting to retrieve values from a non-existent table). */ @Override public Map getRows( TableReference tableRef, Iterable rows, ColumnSelection selection, long startTs) { if (!selection.allColumnsSelected()) { return getRowsForSpecificColumns(tableRef, rows, selection, startTs); } Set>> rowsByHost = HostPartitioner.partitionByHost( clientPool, rows, Functions.identity()) .entrySet(); List>> tasks = new ArrayList<>(rowsByHost.size()); for (final Map.Entry> hostAndRows : rowsByHost) { tasks.add(AnnotatedCallable.wrapWithThreadName( AnnotationType.PREPEND, "Atlas getRows " + hostAndRows.getValue().size() + " rows from " + tableRef + " on " + hostAndRows.getKey().cassandraHostName(), () -> getRowsForSingleHost(hostAndRows.getKey(), tableRef, hostAndRows.getValue(), startTs))); } List> perHostResults = taskRunner.runAllTasksCancelOnFailure(tasks); Map result = Maps.newHashMapWithExpectedSize(Iterables.size(rows)); for (Map perHostResult : perHostResults) { result.putAll(perHostResult); } return result; } private Map getRowsForSingleHost( final CassandraServer host, final TableReference tableRef, final List rows, final long startTs) { try { int rowCount = 0; final Map result = new HashMap<>(); int fetchBatchCount = runtimeConfig.get().fetchBatchCount(); for (final List batch : Lists.partition(rows, fetchBatchCount)) { rowCount += batch.size(); result.putAll(getAllCellsForRows(host, tableRef, batch, startTs)); } if (rowCount > fetchBatchCount) { log.warn( "Rebatched in getRows a call to {} that attempted to multiget {} rows; " + "this may indicate overly-large batching on a higher level.\n{}", LoggingArgs.tableRef(tableRef), SafeArg.of("rowCount", rowCount), SafeArg.of("stacktrace", CassandraKeyValueServices.getFilteredStackTrace("com.palantir"))); } return ImmutableMap.copyOf(result); } catch (Exception e) { throw Throwables.unwrapAndThrowAtlasDbDependencyException(e); } } private Map 
getAllCellsForRows( final CassandraServer host, final TableReference tableRef, final List rows, final long startTs) throws Exception { ListMultimap result = ArrayListMultimap.create(rows.size(), 1); List query = rows.stream() .map(row -> keyPredicate( ByteBuffer.wrap(row), allPredicateWithLimit(runtimeConfig.get().fetchReadLimitPerRow()))) .collect(Collectors.toList()); while (!query.isEmpty()) { query = EntryStream.of(getForKeyPredicates(host, tableRef, query, startTs)) .filterValues(cells -> !cells.isEmpty()) .peekKeyValue(result::putAll) .mapKeyValue((row, cells) -> keyPredicate(row, getNextLexicographicalSlicePredicate(cells))) .collect(Collectors.toList()); } ResultsExtractor extractor = extractorFactory.apply(Maps.newHashMapWithExpectedSize(result.size())); extractor.extractResults(Multimaps.asMap(result), startTs, ColumnSelection.all()); return extractor.asMap(); } private static KeyPredicate keyPredicate(ByteBuffer row, SlicePredicate predicate) { return new KeyPredicate().setKey(row).setPredicate(predicate); } private static SlicePredicate allPredicateWithLimit(int limit) { return SlicePredicates.create(Range.ALL, Limit.of(limit)); } private Map> getForKeyPredicates( final CassandraServer host, final TableReference tableRef, List query, final long startTs) throws Exception { return clientPool.runWithRetryOnServer( host, new FunctionCheckedException>, Exception>() { @Override public Map> apply(CassandraClient client) throws Exception { if (log.isTraceEnabled()) { log.trace( "Requesting {} cells from {} starting at timestamp {} on {} " + "as part of fetching cells for key predicates.", SafeArg.of("cells", query.size()), LoggingArgs.tableRef(tableRef), SafeArg.of("startTs", startTs), SafeArg.of("host", host)); } Map>> results = wrappingQueryRunner.multiget_multislice( "getRows", client, tableRef, query, readConsistencyProvider.getConsistency(tableRef)); return Maps.transformValues(results, CellLoader::flattenReadOnlyLists); } @Override public String 
toString() { return "multiget_multislice(" + host.cassandraHostName() + ", " + tableRef + ", " + query.size() + " cells)"; } }); } private SlicePredicate getNextLexicographicalSlicePredicate(List columns) { Preconditions.checkState(!columns.isEmpty(), "Columns was empty. This is probably an AtlasDb bug"); Column lastColumn = columns.get(columns.size() - 1).getColumn(); ColumnAndTimestamp columnNameAndTimestamp = CassandraKeyValueServices.decomposeColumn(lastColumn.name); ByteBuffer nextLexicographicColumn = CassandraKeyValueServices.makeCompositeBuffer( RangeRequests.nextLexicographicName(columnNameAndTimestamp.columnName()), Long.MAX_VALUE); return SlicePredicates.create( Range.of(nextLexicographicColumn, Range.UNBOUND_END), Limit.of(runtimeConfig.get().fetchReadLimitPerRow())); } private static List wrap(List arrays) { List byteBuffers = new ArrayList<>(arrays.size()); for (byte[] r : arrays) { byteBuffers.add(ByteBuffer.wrap(r)); } return byteBuffers; } private Map getRowsForSpecificColumns( final TableReference tableRef, final Iterable rows, ColumnSelection selection, final long startTs) { Preconditions.checkArgument(!selection.allColumnsSelected(), "Must select specific columns"); Collection selectedColumns = selection.getSelectedColumns(); Set cells = Sets.newHashSetWithExpectedSize(selectedColumns.size() * Iterables.size(rows)); for (byte[] row : rows) { for (byte[] col : selectedColumns) { cells.add(Cell.create(row, col)); } } StartTsResultsCollector collector = new StartTsResultsCollector(startTs, extractorFactory); cellLoader.loadWithTs( "getRows", tableRef, cells, startTs, false, collector, readConsistencyProvider.getConsistency(tableRef)); return collector.getCollectedResults(); } /** * Gets values from the key-value store. Requires a quorum of Cassandra nodes to be reachable. * * @param tableRef the name of the table to retrieve values from. 
* @param timestampByCell specifies, for each row, the maximum timestamp (exclusive) at which to * retrieve that rows's value. * @return map of retrieved values. Values which do not exist (either * because they were deleted or never created in the first place) * are simply not returned. * @throws AtlasDbDependencyException if fewer than a quorum of Cassandra nodes are reachable. * @throws IllegalArgumentException if any of the requests were invalid * (e.g., attempting to retrieve values from a non-existent table). */ @Override public Map get(TableReference tableRef, Map timestampByCell) { if (timestampByCell.isEmpty()) { log.info("Attempted get on '{}' table with empty cells", LoggingArgs.tableRef(tableRef)); return ImmutableMap.of(); } try { Long firstTs = timestampByCell.values().iterator().next(); if (Iterables.all(timestampByCell.values(), Predicates.equalTo(firstTs))) { return get("get", tableRef, timestampByCell.keySet(), firstTs); } SetMultimap cellsByTs = Multimaps.invertFrom(Multimaps.forMap(timestampByCell), HashMultimap.create()); ImmutableMap.Builder builder = ImmutableMap.builder(); for (long ts : cellsByTs.keySet()) { StartTsResultsCollector collector = new StartTsResultsCollector(ts, extractorFactory); cellLoader.loadWithTs( "get", tableRef, cellsByTs.get(ts), ts, false, collector, readConsistencyProvider.getConsistency(tableRef)); builder.putAll(collector.getCollectedResults()); } return builder.buildOrThrow(); } catch (Exception e) { throw Throwables.unwrapAndThrowAtlasDbDependencyException(e); } } private Map get( String kvsMethodName, TableReference tableRef, Set cells, long maxTimestampExclusive) { StartTsResultsCollector collector = new StartTsResultsCollector(maxTimestampExclusive, extractorFactory); cellLoader.loadWithTs( kvsMethodName, tableRef, cells, maxTimestampExclusive, false, collector, readConsistencyProvider.getConsistency(tableRef)); return collector.getCollectedResults(); } /** * Gets values from the key-value store for the 
specified rows and column range as separate iterators for each row. * Requires a quorum of Cassandra nodes to be reachable, otherwise, the returned iterators will throw an * {@link AtlasDbDependencyException} when their methods are called. * * @param tableRef the name of the table to retrieve values from. * @param rows set containing the rows to retrieve values for. Behavior is undefined if {@code rows} * contains duplicates (as defined by {@link Arrays#equals(byte[], byte[])}). * @param batchColumnRangeSelection specifies the column range and the per-row batchSize to fetch. * @param timestamp specifies the maximum timestamp (exclusive) at which to retrieve each rows's value. * @return map of row names to {@link RowColumnRangeIterator}. Each {@link RowColumnRangeIterator} can iterate over * the values that are spanned by the {@code batchColumnRangeSelection} in increasing order by column name. * @throws IllegalArgumentException if {@code rows} contains duplicates. */ @Override public Map getRowsColumnRange( TableReference tableRef, Iterable rows, BatchColumnRangeSelection batchColumnRangeSelection, long timestamp) { Set>> rowsByHost = HostPartitioner.partitionByHost( clientPool, rows, Functions.identity()) .entrySet(); List>> tasks = new ArrayList<>(rowsByHost.size()); for (final Map.Entry> hostAndRows : rowsByHost) { tasks.add(AnnotatedCallable.wrapWithThreadName( AnnotationType.PREPEND, "Atlas getRowsColumnRange " + hostAndRows.getValue().size() + " rows from " + tableRef + " on " + hostAndRows.getKey().cassandraHostName(), () -> getRowsColumnRangeIteratorForSingleHost( hostAndRows.getKey(), tableRef, hostAndRows.getValue(), batchColumnRangeSelection, timestamp))); } List> perHostResults = taskRunner.runAllTasksCancelOnFailure(tasks); Map result = Maps.newHashMapWithExpectedSize(Iterables.size(rows)); for (Map perHostResult : perHostResults) { result.putAll(perHostResult); } return result; } private Map getRowsColumnRangeIteratorForSingleHost( CassandraServer 
host, TableReference tableRef, List rows, BatchColumnRangeSelection batchColumnRangeSelection, long startTs) { try { RowColumnRangeResult firstPage = getRowsColumnRangeForSingleHost(host, tableRef, rows, batchColumnRangeSelection, startTs); Map> results = firstPage.getResults(); Map rowsToLastCompositeColumns = firstPage.getRowsToLastCompositeColumns(); IdentityHashMap incompleteRowsToNextColumns = new IdentityHashMap<>(); for (Map.Entry e : rowsToLastCompositeColumns.entrySet()) { byte[] row = e.getKey(); byte[] col = CassandraKeyValueServices.decomposeColumnName(e.getValue()) .columnName(); // If we read a version of the cell before our start timestamp, it will be the most recent version // readable to us and we can continue to the next column. Otherwise we have to continue reading // this column. Map rowResult = results.get(row); boolean completedCell = (rowResult != null) && rowResult.containsKey(Cell.create(row, col)); boolean endOfRange = isEndOfColumnRange( completedCell, col, firstPage.getRowsToRawColumnCount().get(row), batchColumnRangeSelection); if (!endOfRange) { byte[] nextCol = getNextColumnRangeColumn(completedCell, col); incompleteRowsToNextColumns.put(row, nextCol); } } Map ret = Maps.newHashMapWithExpectedSize(rows.size()); for (byte[] row : rowsToLastCompositeColumns.keySet()) { Iterator> resultIterator; Map result = results.get(row); if (result != null) { resultIterator = result.entrySet().iterator(); } else { resultIterator = Collections.emptyIterator(); } byte[] nextCol = incompleteRowsToNextColumns.get(row); if (nextCol == null) { ret.put(row, new LocalRowColumnRangeIterator(resultIterator)); } else { BatchColumnRangeSelection newColumnRange = BatchColumnRangeSelection.create( nextCol, batchColumnRangeSelection.getEndCol(), batchColumnRangeSelection.getBatchHint()); ret.put( row, new LocalRowColumnRangeIterator(Iterators.concat( resultIterator, getRowColumnRange(host, tableRef, row, newColumnRange, startTs)))); } } // We saw no Cassandra 
results at all for these rows, so the entire column range is empty for these rows. for (byte[] row : firstPage.getEmptyRows()) { ret.put(row, new LocalRowColumnRangeIterator(Collections.emptyIterator())); } return ret; } catch (Exception e) { throw Throwables.unwrapAndThrowAtlasDbDependencyException(e); } } private RowColumnRangeResult getRowsColumnRangeForSingleHost( CassandraServer host, TableReference tableRef, List rows, BatchColumnRangeSelection batchColumnRangeSelection, long startTs) { try { return clientPool.runWithRetryOnServer( host, new FunctionCheckedException() { @Override public RowColumnRangeResult apply(CassandraClient client) throws Exception { Range range = createColumnRange( batchColumnRangeSelection.getStartCol(), batchColumnRangeSelection.getEndCol(), startTs); Limit limit = Limit.of(batchColumnRangeSelection.getBatchHint()); SlicePredicate pred = SlicePredicates.create(range, limit); Map> results = wrappingQueryRunner.multiget( "getRowsColumnRange", client, tableRef, wrap(rows), pred, readConsistencyProvider.getConsistency(tableRef)); return RowColumnRangeExtractor.extract(rows, results, startTs, metricsManager); } @Override public String toString() { return "multiget_slice(" + tableRef.getQualifiedName() + ", " + rows.size() + " rows, " + batchColumnRangeSelection.getBatchHint() + " max columns)"; } }); } catch (Exception e) { throw Throwables.unwrapAndThrowAtlasDbDependencyException(e); } } private Iterator> getRowColumnRange( CassandraServer host, TableReference tableRef, byte[] row, BatchColumnRangeSelection batchColumnRangeSelection, long startTs) { return ClosableIterators.wrapWithEmptyClose( new AbstractPagingIterable< Map.Entry, TokenBackedBasicResultsPage, byte[]>>() { @Override protected TokenBackedBasicResultsPage, byte[]> getFirstPage() throws Exception { return page(batchColumnRangeSelection.getStartCol()); } @Override protected TokenBackedBasicResultsPage, byte[]> getNextPage( TokenBackedBasicResultsPage, byte[]> previous) throws 
Exception { return page(previous.getTokenForNextPage()); } TokenBackedBasicResultsPage, byte[]> page(final byte[] startCol) throws Exception { return clientPool.runWithRetryOnServer( host, new FunctionCheckedException< CassandraClient, TokenBackedBasicResultsPage, byte[]>, Exception>() { @Override public TokenBackedBasicResultsPage, byte[]> apply( CassandraClient client) throws Exception { Range range = createColumnRange( startCol, batchColumnRangeSelection.getEndCol(), startTs); Limit limit = Limit.of(batchColumnRangeSelection.getBatchHint()); SlicePredicate pred = SlicePredicates.create(range, limit); ByteBuffer rowByteBuffer = ByteBuffer.wrap(row); Map> results = wrappingQueryRunner.multiget( "getRowsColumnRange", client, tableRef, ImmutableList.of(rowByteBuffer), pred, readConsistencyProvider.getConsistency(tableRef)); if (results.isEmpty()) { return SimpleTokenBackedResultsPage.create( startCol, ImmutableList.of(), false); } List values = Iterables.getOnlyElement(results.values()); if (values.isEmpty()) { return SimpleTokenBackedResultsPage.create( startCol, ImmutableList.of(), false); } // May be empty if all results are at ts > startTs Map ret = RowColumnRangeExtractor.extract( ImmutableList.of(row), results, startTs, metricsManager) .getResults() .getOrDefault(row, Collections.emptyMap()); ColumnOrSuperColumn lastColumn = values.get(values.size() - 1); byte[] lastCol = CassandraKeyValueServices.decomposeColumnName( lastColumn.getColumn()) .columnName(); // Same idea as the getRows case to handle seeing only newer entries of a column boolean completedCell = ret.get(Cell.create(row, lastCol)) != null; if (isEndOfColumnRange( completedCell, lastCol, values.size(), batchColumnRangeSelection)) { return SimpleTokenBackedResultsPage.create(lastCol, ret.entrySet(), false); } byte[] nextCol = getNextColumnRangeColumn(completedCell, lastCol); return SimpleTokenBackedResultsPage.create(nextCol, ret.entrySet(), true); } @Override public String toString() { return 
"multiget_slice(" + tableRef.getQualifiedName() + ", single row, " + batchColumnRangeSelection.getBatchHint() + " batch hint)"; } }); } }.iterator()); } private static boolean isEndOfColumnRange( boolean completedCell, byte[] lastCol, int numRawResults, BatchColumnRangeSelection columnRangeSelection) { return (numRawResults < columnRangeSelection.getBatchHint()) || (completedCell && (RangeRequests.isLastRowName(lastCol) || Arrays.equals( RangeRequests.nextLexicographicName(lastCol), columnRangeSelection.getEndCol()))); } private static byte[] getNextColumnRangeColumn(boolean completedCell, byte[] lastCol) { if (!completedCell) { return lastCol; } else { return RangeRequests.nextLexicographicName(lastCol); } } private static Range createColumnRange(byte[] startColOrEmpty, byte[] endColExlusiveOrEmpty, long startTs) { ByteBuffer start = startColOrEmpty.length == 0 ? Range.UNBOUND_START : Range.startOfColumn(startColOrEmpty, startTs); ByteBuffer end = endColExlusiveOrEmpty.length == 0 ? Range.UNBOUND_END : Range.endOfColumnIncludingSentinels(RangeRequests.previousLexicographicName(endColExlusiveOrEmpty)); return Range.of(start, end); } /** * Puts values into the key-value store. This call does not guarantee atomicity across cells. * On failure, it is possible that some of the requests have succeeded (without having been rolled * back). Similarly, concurrent batched requests may interleave. *

* Requires a quorum of Cassandra nodes to be reachable. * * @param tableRef the name of the table to put values into. * @param values map containing the key-value entries to put. * @param timestamp must be non-negative and not equal to {@link Long#MAX_VALUE} * @throws AtlasDbDependencyException if fewer than a quorum of Cassandra nodes are reachable. */ @Override public void put(final TableReference tableRef, final Map values, final long timestamp) { try { cellValuePutter.put( "put", tableRef, KeyValueServices.toConstantTimestampValues(values.entrySet(), timestamp)); } catch (Exception e) { throw Throwables.unwrapAndThrowAtlasDbDependencyException(e); } } /** * Puts values into the key-value store with individually specified timestamps. This call does not * guarantee atomicity across cells. On failure, it is possible that some of the requests have succeeded * (without having been rolled back). Similarly, concurrent batched requests may interleave. *

* Requires a quorum of Cassandra nodes to be reachable. * * @param tableRef the name of the table to put values into. * @param values map containing the key-value entries to put with * non-negative timestamps less than {@link Long#MAX_VALUE}. * @throws AtlasDbDependencyException if fewer than a quorum of Cassandra nodes are reachable. */ @Override public void putWithTimestamps(TableReference tableRef, Multimap values) { try { cellValuePutter.put("putWithTimestamps", tableRef, values.entries()); } catch (Exception e) { throw Throwables.unwrapAndThrowAtlasDbDependencyException(e); } } @Override protected int getMultiPutBatchCount() { return runtimeConfig.get().mutationBatchCount(); } /** * Puts values into the key-value store. This call does not guarantee atomicity across cells. * On failure, it is possible that some of the requests have succeeded (without having been rolled * back). Similarly, concurrent batched requests may interleave. *

* Overridden to batch more intelligently than the default implementation. *

* Requires a quorum of Cassandra nodes to be reachable. * * @param valuesByTable map containing the key-value entries to put by table. * @param timestamp must be non-negative and not equal to {@link Long#MAX_VALUE} * @throws AtlasDbDependencyException if fewer than a quorum of Cassandra nodes are reachable. */ @Override public void multiPut(Map> valuesByTable, long timestamp) throws KeyAlreadyExistsException { List flattened = new ArrayList<>(); for (Map.Entry> tableAndValues : valuesByTable.entrySet()) { for (Map.Entry entry : tableAndValues.getValue().entrySet()) { flattened.add(new TableCellAndValue(tableAndValues.getKey(), entry.getKey(), entry.getValue())); } } Map> partitionedByHost = HostPartitioner.partitionByHost(clientPool, flattened, TableCellAndValue::extractRowName); List> callables = new ArrayList<>(); for (Map.Entry> entry : partitionedByHost.entrySet()) { callables.addAll(getMultiPutTasksForSingleHost(entry.getKey(), entry.getValue(), timestamp)); } taskRunner.runAllTasksCancelOnFailure(callables); } private List> getMultiPutTasksForSingleHost( final CassandraServer host, Collection values, final long timestamp) { Iterable> partitioned = IterablePartitioner.partitionByCountAndBytes( values, getMultiPutBatchCount(), getMultiPutBatchSizeBytes(), extractTableNames(values).toString(), TableCellAndValue::getSize); List> tasks = new ArrayList<>(); for (final List batch : partitioned) { final Set tableRefs = extractTableNames(batch); tasks.add(AnnotatedCallable.wrapWithThreadName( AnnotationType.PREPEND, "Atlas multiPut of " + batch.size() + " cells into " + tableRefs + " on " + host.cassandraHostName(), () -> multiPutForSingleHostInternal(host, tableRefs, batch, timestamp))); } return tasks; } private static Set extractTableNames(Iterable tableCellAndValues) { Set tableRefs = new HashSet<>(); for (TableCellAndValue tableCellAndValue : tableCellAndValues) { tableRefs.add(tableCellAndValue.tableRef); } return tableRefs; } private Void 
multiPutForSingleHostInternal( final CassandraServer host, final Set tableRefs, final List batch, long timestamp) throws Exception { final MutationMap mutationMap = convertToMutations(batch, timestamp); return clientPool.runWithRetryOnServer(host, new FunctionCheckedException() { @Override public Void apply(CassandraClient client) throws Exception { return wrappingQueryRunner.batchMutate("multiPut", client, tableRefs, mutationMap, WRITE_CONSISTENCY); } @Override public String toString() { return "batch_mutate(" + host.cassandraHostName() + ", " + tableRefs + ", " + batch.size() + " values)"; } }); } private static MutationMap convertToMutations(List batch, long timestamp) { MutationMap mutationMap = new MutationMap(); for (TableCellAndValue tableCellAndValue : batch) { Cell cell = tableCellAndValue.cell; Column col = CassandraKeyValueServices.createColumn(cell, Value.create(tableCellAndValue.value, timestamp)); ColumnOrSuperColumn colOrSup = new ColumnOrSuperColumn(); colOrSup.setColumn(col); Mutation mutation = new Mutation(); mutation.setColumn_or_supercolumn(colOrSup); mutationMap.addMutationForCell(cell, tableCellAndValue.tableRef, mutation); } return mutationMap; } /** * Truncate a table in the key-value store. *

* This is preferred to dropping and re-adding a table, as live schema changes can * be a complicated topic for distributed databases. *

* Requires all Cassandra nodes to be reachable. * * @param tableRef the name of the table to truncate. * @throws AtlasDbDependencyException if not all Cassandra nodes are reachable. * @throws RuntimeException if the table does not exist. */ @Override public void truncateTable(final TableReference tableRef) { truncateTables(ImmutableSet.of(tableRef)); } /** * Truncates tables in the key-value store. *

* This can be slightly faster than repeatedly truncating individual tables. *

* Requires all Cassandra nodes to be reachable. * * @param tablesToTruncate set od tables to truncate. * @throws AtlasDbDependencyException if not all Cassandra nodes are reachable. * @throws RuntimeException if the table does not exist. */ @Override public void truncateTables(final Set tablesToTruncate) { cassandraTableTruncator.truncateTables(tablesToTruncate); } /** * Deletes values from the key-value store. *

* Requires all Cassandra nodes to be up and available, otherwise throws an PalantirRuntimeException. * * @param tableRef the name of the table to delete values from. * @param keys map containing the keys to delete values for. * @throws PalantirRuntimeException if not all hosts respond successfully. */ @Override public void delete(TableReference tableRef, Multimap keys) { new CellDeleter( clientPool, wrappingQueryRunner, DELETE_CONSISTENCY, mutationTimestampProvider.getDeletionTimestampOperatorForBatchDelete()) .delete(tableRef, keys); } @VisibleForTesting CfDef getCfForTable(TableReference tableRef, byte[] rawMetadata, int gcGraceSeconds) { return ColumnFamilyDefinitions.getCfDef(config.getKeyspaceOrThrow(), tableRef, gcGraceSeconds, rawMetadata); } // TODO(unknown): after cassandra change: handle multiRanges @Override @Idempotent public Map, byte[]>> getFirstBatchForRanges( TableReference tableRef, Iterable rangeRequests, long timestamp) { int concurrency = config.rangesConcurrency(); return KeyValueServices.getFirstBatchForRangesUsingGetRangeConcurrent( executor, this, tableRef, rangeRequests, timestamp, concurrency); } // TODO(unknown): after cassandra change: handle reverse ranges // TODO(unknown): after cassandra change: handle column filtering /** * For each row in the specified range, returns the most recent version strictly before timestamp. Requires a * quorum of Cassandra nodes to be reachable. *

* Remember to close any {@link ClosableIterator}s you get in a finally block. * * @param rangeRequest the range to load. * @param timestamp specifies the maximum timestamp (exclusive) at which to retrieve each row's value. * @throws AtlasDbDependencyException if fewer than a quorum of Cassandra nodes are reachable. */ @Override @Idempotent public ClosableIterator> getRange( TableReference tableRef, RangeRequest rangeRequest, long timestamp) { return rangeLoader.getRange(tableRef, rangeRequest, timestamp); } /** * Gets timestamp values from the key-value store. For each row, this returns all associated * timestamps < given_ts. *

* This method has stronger consistency guarantees than regular read requests. This must return all timestamps * stored anywhere in the system (because of sweep). Unless all nodes are up and available, this method will * throw an InsufficientConsistencyException. * * @param tableRef the name of the table to read from. * @param rangeRequest the range to load. * @param timestamp the maximum timestamp to load. * @throws InsufficientConsistencyException if not all hosts respond successfully. */ @Override @Idempotent public ClosableIterator>> getRangeOfTimestamps( TableReference tableRef, RangeRequest rangeRequest, long timestamp) { CandidateCellForSweepingRequest request = ImmutableCandidateCellForSweepingRequest.builder() .startRowInclusive(rangeRequest.getStartInclusive()) .maxTimestampExclusive(timestamp) .shouldCheckIfLatestValueIsEmpty(false) .shouldDeleteGarbageCollectionSentinels(true) .build(); return getCandidateRowsForSweeping("getRangeOfTimestamps", tableRef, request) .flatMap(rows -> rows) .map(CandidateRowForSweeping::toRowResult) .stopWhen(rowResult -> !rangeRequest.inRange(rowResult.getRowName())); } @Override public ClosableIterator> getCandidateCellsForSweeping( TableReference tableRef, CandidateCellForSweepingRequest request) { return getCandidateRowsForSweeping("getCandidateCellsForSweeping", tableRef, request) .map(rows -> rows.stream() .map(CandidateRowForSweeping::cells) .flatMap(List::stream) .collect(Collectors.toList())); } private ClosableIterator> getCandidateRowsForSweeping( String kvsMethodName, TableReference tableRef, CandidateCellForSweepingRequest request) { RowGetter rowGetter = new RowGetter(clientPool, queryRunner, ConsistencyLevel.ALL, tableRef); return new CandidateRowsForSweepingIterator( (iteratorTableRef, cells, maxTimestampExclusive) -> get(kvsMethodName, iteratorTableRef, cells, maxTimestampExclusive), newInstrumentedCqlExecutor(), rowGetter, tableRef, request, 
runtimeConfig.map(CassandraKeyValueServiceRuntimeConfig::sweepReadThreads)); } /** * Returns a sorted list of row keys in the specified range; see * {@link CassandraKeyValueService#getRowKeysInRange(TableReference, byte[], byte[], int)}. *

* Implementation specific: this method specifically does not read any of the columns and can therefore be used * in the presence of wide rows. However, as a side-effect, it may return row where the row only contains Cassandra * tombstones. */ @Override public List getRowKeysInRange(TableReference tableRef, byte[] startRow, byte[] endRow, int maxResults) { RowGetter rowGetter = new RowGetter(clientPool, queryRunner, ConsistencyLevel.QUORUM, tableRef); return rowGetter.getRowKeysInRange(startRow, endRow, maxResults); } private CqlExecutor newInstrumentedCqlExecutor() { return AtlasDbMetrics.instrument( metricsManager.getRegistry(), CqlExecutor.class, new CqlExecutorImpl(clientPool, ConsistencyLevel.ALL)); } /** * Drop the table, and also delete its table metadata. Requires a quorum of Cassandra nodes to be reachable. * * @param tableRef the name of the table to drop. * @throws AtlasDbDependencyException if fewer than a quorum of Cassandra nodes are reachable, or the cluster * cannot come to an agreement on schema versions. Note that this method is not atomic: if quorum is lost during * its execution or Cassandra nodes fail to settle on a schema version after the Cassandra schema is mutated, we * may drop the tables, but fail to to persist the changes to the _metadata table. * @throws UncheckedExecutionException if there are multiple schema mutation lock tables. */ @Override public void dropTable(final TableReference tableRef) { dropTables(ImmutableSet.of(tableRef)); } /** * Drop the tables, and also delete their table metadata. Requires a quorum of Cassandra nodes to be reachable. *

* Main gains here vs. dropTable: * - problems excepting, we will basically be serializing a rapid series of schema changes * through a single host checked out from the client pool, so reduced chance of schema disagreement issues * - client-side in-memory lock to prevent misbehaving callers from shooting themselves in the foot * - one less round trip * * @param tablesToDrop the set of tables to drop. * @throws AtlasDbDependencyException if fewer than a quorum of Cassandra nodes are reachable, or the cluster * cannot come to an agreement on schema versions. Note that this method is not atomic: if quorum is lost during * its execution or Cassandra nodes fail to settle on a schema version after the Cassandra schema is mutated, we * may drop the tables, but fail to to persist the changes to the _metadata table. * @throws UncheckedExecutionException if there are multiple schema mutation lock tables. */ @Override public void dropTables(final Set tablesToDrop) { cassandraTableDropper.dropTables(tablesToDrop); } /** * Creates a table with the specified name. If the table already exists, no action is performed * (the table is left in its current state). Requires a quorum of Cassandra nodes to be reachable. * * @param tableRef the name of the table to create. * @param metadata the metadata of the table to create. * @throws AtlasDbDependencyException if fewer than a quorum of Cassandra nodes are reachable, or the cluster * cannot come to an agreement on schema versions. Note that this method is not atomic: if quorum is lost during * its execution or Cassandra nodes fail to settle on a schema version after the Cassandra schema is mutated, we * may fail to persist the changes to the _metadata table. * @throws UncheckedExecutionException if there are multiple schema mutation lock tables. */ @Override public void createTable(final TableReference tableRef, final byte[] metadata) { createTables(ImmutableMap.of(tableRef, metadata)); } /** * Creates a table with the specified name. 
If the table already exists, no action is performed * (the table is left in its current state). *

* Requires a quorum of Cassandra nodes to be up and available. *

* Main gains here vs. createTable: * - problems excepting, we will basically be serializing a rapid series of schema changes * through a single host checked out from the client pool, so reduced chance of schema disagreement issues * - client-side in-memory lock to prevent misbehaving callers from shooting themselves in the foot * - one less round trip *

* createTables(existingTable, newMetadata) can perform a metadata-only update. Additionally, it is possible * that this metadata-only update performs a schema mutation by altering the CFDef (e. g., user changes metadata * of existing table to have new compression block size). This does not require the schema mutation lock, as it * does not alter the CfId * * @param tablesToMetadata a mapping of names of tables to create to their respective metadata. * @throws AtlasDbDependencyException if fewer than a quorum of Cassandra nodes are reachable, or the cluster * cannot come to an agreement on schema versions. Note that this method is not atomic: if quorum is lost during * its execution or Cassandra nodes fail to settle on a schema version after the Cassandra schema is mutated, we * may fail to persist the changes to the _metadata table. * @throws UncheckedExecutionException if there are multiple schema mutation lock tables. */ @Override public void createTables(final Map tablesToMetadata) { Map tablesToCreate = tableMetadata.filterOutExistingTables(tablesToMetadata); Map tablesToAlter = tableMetadata.filterOutNoOpMetadataChanges(tablesToMetadata); boolean onlyMetadataChangesAreForNewTables = tablesToAlter.keySet().equals(tablesToCreate.keySet()); boolean putMetadataWillNeedASchemaChange = !onlyMetadataChangesAreForNewTables; if (!tablesToCreate.isEmpty()) { LoggingArgs.SafeAndUnsafeTableReferences safeAndUnsafe = LoggingArgs.tableRefs(tablesToCreate.keySet()); log.info("Creating tables {} and {}", safeAndUnsafe.safeTableRefs(), safeAndUnsafe.unsafeTableRefs()); cassandraTableCreator.createTables(tablesToCreate); } internalPutMetadataForTables(tablesToAlter, putMetadataWillNeedASchemaChange); } /** * Return the list of tables stored in this key value service. Requires a quorum of Cassandra nodes to be reachable * and agree on schema versions. *

* This will not contain the names of any hidden tables (e. g., the _metadata table). * * @return a set of TableReferences (table names) for all the visible tables * @throws AtlasDbDependencyException if fewer than a quorum of Cassandra nodes are reachable, or the cluster * cannot come to an agreement on schema versions. */ @Override public Set getAllTableNames() { return cassandraTables .getTableReferencesWithoutFiltering() .filter(tr -> !HiddenTables.isHidden(tr)) .collect(Collectors.toSet()); } /** * Gets the metadata for a given table. Do not use this method to see if a table exists as it can return false * positives. Requires a quorum of Cassandra nodes to be reachable. * * @param tableRef the name of the table to get metadata for. * @return a byte array representing the metadata for the table. Array is empty if no table * with the given name exists. Consider {@link TableMetadata#BYTES_HYDRATOR} for hydrating. * @throws AtlasDbDependencyException if fewer than a quorum of Cassandra nodes are reachable. 
*/ @Override public byte[] getMetadataForTable(TableReference tableRef) { // try and get with a single-key lookup String lowerCaseTableName = tableRef.getQualifiedName().toLowerCase(Locale.ROOT); Map rows = getRows( AtlasDbConstants.DEFAULT_METADATA_TABLE, ImmutableSet.of(lowerCaseTableName.getBytes(StandardCharsets.UTF_8)), ColumnSelection.all(), Long.MAX_VALUE); if (!rows.isEmpty()) { return Iterables.getOnlyElement(rows.values()).getContents(); } // if unsuccessful with fast code-path, we need to check if this table exists but was written at a key // before we started enforcing only writing lower-case canonicalised versions of keys return Optional.ofNullable(getMetadataForTables().get(tableRef)).orElse(AtlasDbConstants.EMPTY_TABLE_METADATA); } private static boolean matchingIgnoreCase(@Nullable TableReference t1, TableReference t2) { if (t1 != null) { return t1.getQualifiedName().equalsIgnoreCase(t2.getQualifiedName()); } else { return t2 == null; } } /** * Gets the metadata for all non-hidden tables. Requires a quorum of Cassandra nodes to be reachable. * * @return a mapping of table names to their respective metadata in form of a byte array. Consider * {@link TableMetadata#BYTES_HYDRATOR} for hydrating. * @throws AtlasDbDependencyException if fewer than a quorum of Cassandra nodes are available. */ @Override public Map getMetadataForTables() { return tableMetadata.getMetadataForTables(); } /** * Records the specified metadata for a given table. Requires a quorum of Cassandra nodes to be reachable. * * @param tableRef the name of the table to record metadata for. * @param meta a byte array representing the metadata to record. * @throws AtlasDbDependencyException if fewer than a quorum of Cassandra nodes are reachable, or the cluster * cannot come to an agreement on schema versions. 
Note that this method is not atomic: if quorum is lost during * its execution or Cassandra nodes fail to settle on a schema version after the Cassandra schema is mutated, we * may fail to persist the changes to the _metadata table. */ @Override public void putMetadataForTable(final TableReference tableRef, final byte[] meta) { putMetadataForTables(ImmutableMap.of(tableRef, meta)); } /** * For each specified table records the respective metadata. Requires a quorum of Cassandra nodes to be reachable. * * @param tableRefToMetadata a mapping from each table's name to the respective byte array representing * the metadata to record. * @throws AtlasDbDependencyException if fewer than a quorum of Cassandra nodes are reachable, or the cluster * cannot come to an agreement on schema versions. Note that this method is not atomic: if quorum is lost during * its execution or Cassandra nodes fail to settle on a schema version after the Cassandra schema is mutated, we * may fail to persist the changes to the _metadata table. */ @Override public void putMetadataForTables(final Map tableRefToMetadata) { internalPutMetadataForTables(tableRefToMetadata, true); } @SuppressWarnings("checkstyle:RegexpSinglelineJava") private void internalPutMetadataForTables( Map tableRefToMetadata, boolean possiblyNeedToPerformSettingsChanges) { if (tableRefToMetadata.isEmpty()) { return; } Map tableRefToNewCell = Maps.transformEntries( tableRefToMetadata, (tableRef, metadata) -> CassandraKeyValueServices.getMetadataCell(tableRef)); Map tableRefToOldCell = Maps.transformEntries( tableRefToMetadata, (tableRef, metadata) -> CassandraKeyValueServices.getOldMetadataCell(tableRef)); // technically we're racing other nodes from here on, during an update period, // but the penalty for not caring is just some superfluous schema mutations and a // few dead rows in the metadata table. 
Map existingMetadataAtNewName = get( AtlasDbConstants.DEFAULT_METADATA_TABLE, tableRefToNewCell.values().stream() .collect(Collectors.toMap(Functions.identity(), Functions.constant(Long.MAX_VALUE)))); Map existingMetadataAtOldName = get( AtlasDbConstants.DEFAULT_METADATA_TABLE, tableRefToOldCell.values().stream() .collect(Collectors.toMap(Functions.identity(), Functions.constant(Long.MAX_VALUE)))); final Map updatedMetadata = new HashMap<>(); final Set updatedCfs = new HashSet<>(); tableRefToNewCell.forEach((tableRef, newCell) -> { if (existingMetadataAtNewName.containsKey(newCell)) { if (metadataIsDifferent( existingMetadataAtNewName.get(newCell).getContents(), tableRefToMetadata.get(tableRef))) { // found existing metadata at new name, but we're performing an update updatedMetadata.put(newCell, tableRefToMetadata.get(tableRef)); updatedCfs.add(getCfForTable(tableRef, tableRefToMetadata.get(tableRef), config.gcGraceSeconds())); } } else if (existingMetadataAtOldName.containsKey(tableRefToOldCell.get(tableRef))) { if (metadataIsDifferent( existingMetadataAtOldName .get(tableRefToOldCell.get(tableRef)) .getContents(), tableRefToMetadata.get(tableRef))) { // found existing metadata at old name, but we're performing an update updatedMetadata.put(tableRefToOldCell.get(tableRef), tableRefToMetadata.get(tableRef)); updatedCfs.add(getCfForTable(tableRef, tableRefToMetadata.get(tableRef), config.gcGraceSeconds())); } } else { // didn't find an existing metadata at old or new names, this is completely new; // thus, let's write it out with the new format updatedMetadata.put(tableRefToNewCell.get(tableRef), tableRefToMetadata.get(tableRef)); updatedCfs.add(getCfForTable(tableRef, tableRefToMetadata.get(tableRef), config.gcGraceSeconds())); } }); if (!updatedMetadata.isEmpty()) { putMetadataAndMaybeAlterTables(possiblyNeedToPerformSettingsChanges, updatedMetadata, updatedCfs); } } private static boolean metadataIsDifferent(byte[] existingMetadata, byte[] requestMetadata) { 
return !Arrays.equals(existingMetadata, requestMetadata); } private void putMetadataAndMaybeAlterTables( boolean possiblyNeedToPerformSettingsChanges, Map newMetadata, Collection updatedCfs) { try { clientPool.runWithRetry(client -> { if (possiblyNeedToPerformSettingsChanges) { for (CfDef cf : updatedCfs) { client.system_update_column_family(cf); } CassandraKeyValueServices.waitForSchemaVersions( config.schemaMutationTimeoutMillis(), client, schemaChangeDescriptionForPutMetadataForTables(updatedCfs)); } // Done with actual schema mutation, push the metadata put(AtlasDbConstants.DEFAULT_METADATA_TABLE, newMetadata, System.currentTimeMillis()); return null; }); } catch (Exception e) { throw Throwables.unwrapAndThrowAtlasDbDependencyException(e); } } private static String schemaChangeDescriptionForPutMetadataForTables(Collection updatedCfs) { String tables = updatedCfs.stream() .map(CassandraKeyValueServices::tableReferenceFromCfDef) .map(Object::toString) .collect(Collectors.toList()) .toString(); return String.format( "after updating the column family for tables %s in a call to put metadata for tables", tables); } @Override public void deleteRange(final TableReference tableRef, final RangeRequest range) { if (range.equals(RangeRequest.all())) { try { cassandraTableTruncator.truncateTables(ImmutableSet.of(tableRef)); } catch (AtlasDbDependencyException e) { log.info( "Tried to make a deleteRange({}, RangeRequest.all())" + " into a more garbage-cleanup friendly truncate(), but this failed.", LoggingArgs.tableRef(tableRef), e); super.deleteRange(tableRef, range); } } else if (isForSingleRow(range.getStartInclusive(), range.getEndExclusive())) { try { long timestamp = mutationTimestampProvider.getRemoveTimestamp(); byte[] row = range.getStartInclusive(); clientPool.runWithRetry(client -> { client.remove("deleteRange", tableRef, row, timestamp, DELETE_CONSISTENCY); return null; }); } catch (RetryLimitReachedException e) { throw 
CassandraUtils.wrapInIceForDeleteOrRethrow(e); } catch (TException e) { throw Throwables.unwrapAndThrowAtlasDbDependencyException(e); } } else { super.deleteRange(tableRef, range); } } private static boolean isForSingleRow(byte[] startInclusive, byte[] endExclusive) { if (startInclusive.length == 0 || endExclusive.length == 0) { return false; } return Arrays.equals(endExclusive, RangeRequests.nextLexicographicName(startInclusive)); } @Override public void deleteRows(TableReference tableRef, Iterable rows) { Set actualKeys = StreamSupport.stream(rows.spliterator(), false) .map(ByteBuffer::wrap) .collect(Collectors.toSet()); if (actualKeys.isEmpty()) { return; } long timestamp = mutationTimestampProvider.getRemoveTimestamp(); Map>> mutationMap = KeyedStream.of(actualKeys) .map(row -> new Deletion().setTimestamp(timestamp)) .map(deletion -> new Mutation().setDeletion(deletion)) .map(mutation -> keyMutationMapByColumnFamily(tableRef, mutation)) .collectToMap(); try { clientPool.runWithRetry(client -> { client.batch_mutate("deleteRows", mutationMap, DELETE_CONSISTENCY); return null; }); } catch (RetryLimitReachedException e) { throw CassandraUtils.wrapInIceForDeleteOrRethrow(e); } catch (TException e) { throw Throwables.unwrapAndThrowAtlasDbDependencyException(e); } } private static Map> keyMutationMapByColumnFamily( TableReference tableRef, Mutation mutation) { return ImmutableMap.of(AbstractKeyValueService.internalTableName(tableRef), ImmutableList.of(mutation)); } @Override public void deleteAllTimestamps(TableReference tableRef, Map deletes) { new CellRangeDeleter( clientPool, wrappingQueryRunner, DELETE_CONSISTENCY, mutationTimestampProvider::getRangeTombstoneTimestamp) .deleteAllTimestamps(tableRef, deletes); } /** * Performs non-destructive cleanup when the KVS is no longer needed. 
*/ @Override public void close() { clientPool.shutdown(); asyncKeyValueService.close(); super.close(); } /** * Adds a value with timestamp = Value.INVALID_VALUE_TIMESTAMP to each of the given cells. If * a value already exists at that time stamp, nothing is written for that cell. *

* Requires a quorum of Cassandra nodes to be reachable. * * @param tableRef the name of the table to add the value to. * @param cells a set of cells to store the values in. * @throws AtlasDbDependencyException if fewer than a quorum of Cassandra nodes are reachable. */ @Override public void addGarbageCollectionSentinelValues(TableReference tableRef, Iterable cells) { try { final Value value = Value.create(PtBytes.EMPTY_BYTE_ARRAY, Value.INVALID_VALUE_TIMESTAMP); cellValuePutter.putWithOverriddenTimestamps( "addGarbageCollectionSentinelValues", tableRef, Iterables.transform(cells, cell -> Maps.immutableEntry(cell, value))); } catch (Exception e) { throw Throwables.unwrapAndThrowAtlasDbDependencyException(e); } } /** * Gets timestamp values from the key-value store. For each cell, this returns all associated * timestamps < given_ts. *

* This method has stronger consistency guarantees than regular read requests. This must return * all timestamps stored anywhere in the system (because of sweep). Unless all nodes are up and available, this * method will throw a PalantirRuntimeException. * * @param tableRef the name of the table to retrieve timestamps from. * @param cells set containg cells to retrieve timestamps for. * @param ts maximum timestamp to get (exclusive). * @return multimap of timestamps by cell * @throws AtlasDbDependencyException if not all Cassandra nodes are reachable. */ @Override public Multimap getAllTimestamps(TableReference tableRef, Set cells, long ts) { return cellLoader.getAllTimestamps(tableRef, cells, ts, DELETE_CONSISTENCY); } /** * Puts values into the key-value store. This call does not guarantee * atomicity across cells. On failure, it is possible that some of the requests will * have succeeded (without having been rolled back). Similarly, concurrent batched requests may * interleave. However, concurrent writes to the same Cell will not both report success. * One of them will throw {@link KeyAlreadyExistsException}. *

* Requires a quorum of Cassandra nodes to be reachable. * * @param tableRef the name of the table to put values into. * @param values map containing the key-value entries to put. * @throws AtlasDbDependencyException if fewer than a quorum of Cassandra nodes are reachable. * @throws KeyAlreadyExistsException if you are putting a Cell with the same timestamp as one that already exists. */ @Override public void putUnlessExists(final TableReference tableRef, final Map values) throws KeyAlreadyExistsException { try { Optional failure = clientPool.runWithRetry(client -> { Map> partitionedEntries = partitionPerRow(values); for (Map.Entry> partition : partitionedEntries.entrySet()) { CASResult casResult = putUnlessExistsSinglePartition(tableRef, client, partition.getKey(), partition.getValue()); if (!casResult.isSuccess()) { return Optional.of(new KeyAlreadyExistsException( "The cells in the table already exist.", casResult.getCurrent_values().stream() .map(column -> Cell.create( partition.getKey().toByteArray(), CassandraKeyValueServices.decomposeColumn(column.bufferForName()) .columnName())) .collect(Collectors.toList()), LoggingArgs.tableRef(tableRef))); } } return Optional.empty(); }); failure.ifPresent(exception -> { throw exception; }); } catch (KeyAlreadyExistsException e) { throw e; } catch (Exception e) { throw Throwables.unwrapAndThrowAtlasDbDependencyException(e); } } @Override public void setOnce(TableReference tableRef, Map values) { try { cellValuePutter.set( "setOnce", tableRef, KeyValueServices.toConstantTimestampValues(values.entrySet(), AtlasDbConstants.TRANSACTION_TS)); } catch (Exception e) { throw Throwables.unwrapAndThrowAtlasDbDependencyException(e); } } public static Map> partitionPerRow(Map values) { return values.entrySet().stream() .collect(Collectors.groupingBy( entry -> ByteString.copyFrom(entry.getKey().getRowName()), Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue))); } private static CASResult putUnlessExistsSinglePartition( 
TableReference tableRef, CassandraClient client, ByteString row, Map partition) throws TException { return client.put_unless_exists( tableRef, ByteBuffer.wrap(row.toByteArray()), partition.entrySet().stream() .map(CassandraKeyValueServiceImpl::prepareColumnForPutUnlessExists) .collect(Collectors.toList()), ConsistencyLevel.SERIAL, WRITE_CONSISTENCY); } private static Column prepareColumnForPutUnlessExists(Map.Entry insertion) { return new Column(CassandraKeyValueServices.makeCompositeBuffer( insertion.getKey().getColumnName(), // Atlas timestamp CassandraConstants.CAS_TABLE_TIMESTAMP)) // Cassandra timestamp .setTimestamp(CassandraConstants.CAS_TABLE_TIMESTAMP) .setValue(insertion.getValue()); } @Override public CheckAndSetCompatibility getCheckAndSetCompatibility() { return CheckAndSetCompatibility.supportedBuilder() .supportsMultiCheckAndSetOperations(true) .supportsDetailOnFailure(true) .consistentOnFailure(false) .build(); } /** * Performs a check-and-set into the key-value store. * Please see {@link CheckAndSetRequest} for information about how to create this request, * and {@link KeyValueService} for more detailed documentation. *

* Does not require all Cassandra nodes to be up and available, works as long as quorum is achieved. * * @param request the request, including table, cell, old value and new value. * @throws CheckAndSetException if the stored value for the cell was not as expected. */ @Override public void checkAndSet(final CheckAndSetRequest request) throws CheckAndSetException { try { CheckAndSetResult casResult = clientPool.runWithRetry(client -> checkAndSetRunner.executeCheckAndSet(client, request)); if (!casResult.successful()) { List currentValues = casResult.existingValues().stream() .map(ByteString::toByteArray) .collect(Collectors.toList()); throw new CheckAndSetException( request.cell(), request.table(), request.oldValue().orElse(null), currentValues); } } catch (CheckAndSetException e) { throw e; } catch (Exception e) { throw Throwables.unwrapAndThrowAtlasDbDependencyException(e); } } /** * Performs a check-and-set for multiple cells in a row into the key-value store. * Please see {@link MultiCheckAndSetRequest} for information about how to create this request, * and {@link KeyValueService} for more detailed documentation. *

* If the call completes successfully, then you know that the old cells initially had the values you expected. * In this case, you can be sure that all your cells have been updated to their new values. * If the old cells initially did not have the values you expected, none of the cells will be updated and * {@link MultiCheckAndSetException} will be thrown. * Reads concurrent with this operation will not see a partial update. *

* Another thing to note is that the check operation will **only be performed on values of cells that are declared * in the set of expected values** i.e. the check operation DOES NOT take updates into account. *

* Does not require all Cassandra nodes to be up and available, works as long as quorum is achieved. * * @param request the request, including table, rowName, old values and new values. * @throws MultiCheckAndSetException if the stored values for the cells were not as expected. */ @Override public void multiCheckAndSet(MultiCheckAndSetRequest request) throws MultiCheckAndSetException { TableReference tableRef = request.tableRef(); ByteBuffer row = ByteBuffer.wrap(request.rowName()); List oldCol = request.expected().entrySet().stream() .map(CassandraKeyValueServiceImpl::prepareColumnForPutUnlessExists) .collect(Collectors.toList()); List newCol = request.updates().entrySet().stream() .map(CassandraKeyValueServiceImpl::prepareColumnForPutUnlessExists) .collect(Collectors.toList()); try { CASResult casResult = clientPool.runWithRetry(client -> client.cas(tableRef, row, oldCol, newCol, ConsistencyLevel.SERIAL, ConsistencyLevel.EACH_QUORUM)); if (!casResult.isSuccess()) { Map currentValues = KeyedStream.of(casResult.getCurrent_values()) .mapKeys(column -> Cell.create( request.rowName(), CassandraKeyValueServices.decomposeColumn(column.bufferForName()) .columnName())) .map(Column::getValue) .collectToMap(); throw new MultiCheckAndSetException( LoggingArgs.tableRef(tableRef), request.rowName(), request.expected(), currentValues); } } catch (MultiCheckAndSetException e) { throw e; } catch (Exception e) { log.error("Error while executing multi-checkAndSet operation.", e); throw Throwables.unwrapAndThrowAtlasDbDependencyException(e); } } @Override public void compactInternally(TableReference tableRef) { log.info( "Called compactInternally on {}, but this is a no-op for Cassandra KVS." 
+ "Cassandra should eventually decide to compact this table for itself.", LoggingArgs.tableRef(tableRef)); } @Override public ClusterAvailabilityStatus getClusterAvailabilityStatus() { ClusterAvailabilityStatus clusterStatus = getStatusByRunningOperationsOnEachHost(); if (isClusterQuorumAvaialble(clusterStatus) && !doesConfigReplicationFactorMatchWithCluster()) { return ClusterAvailabilityStatus.TERMINAL; } return clusterStatus; } @Override public boolean sweepsEntriesInStrictlyNonDecreasingFashion() { return true; } private static boolean isClusterQuorumAvaialble(ClusterAvailabilityStatus clusterStatus) { return clusterStatus.equals(ClusterAvailabilityStatus.ALL_AVAILABLE) || clusterStatus.equals(ClusterAvailabilityStatus.QUORUM_AVAILABLE); } private boolean doesConfigReplicationFactorMatchWithCluster() { return clientPool.runWithRetry(client -> { try { CassandraVerifier.currentRfOnKeyspaceMatchesDesiredRf(client, verifierConfig); return true; } catch (Exception e) { log.warn("The config and Cassandra cluster do not agree on the replication factor.", e); return false; } }); } private ClusterAvailabilityStatus getStatusByRunningOperationsOnEachHost() { int countUnreachableNodes = 0; for (CassandraServer server : clientPool.getCurrentPools().keySet()) { try { clientPool.runOnCassandraServer(server, CassandraVerifier.healthCheck); if (!partitionerIsValid(server)) { return ClusterAvailabilityStatus.TERMINAL; } } catch (Exception e) { countUnreachableNodes++; } } return getNodeAvailabilityStatus(countUnreachableNodes); } private boolean partitionerIsValid(CassandraServer host) { try { clientPool.runOnCassandraServer(host, clientPool.getValidatePartitioner()); return true; } catch (Exception e) { return false; } } private ClusterAvailabilityStatus getNodeAvailabilityStatus(int countUnreachableNodes) { if (countUnreachableNodes == 0) { return ClusterAvailabilityStatus.ALL_AVAILABLE; } else if (isQuorumAvailable(countUnreachableNodes)) { return 
ClusterAvailabilityStatus.QUORUM_AVAILABLE; } else { return ClusterAvailabilityStatus.NO_QUORUM_AVAILABLE; } } private boolean isQuorumAvailable(int countUnreachableNodes) { int replicationFactor = runtimeConfig.get().replicationFactor(); return countUnreachableNodes < (replicationFactor + 1) / 2; } @Override public CassandraClientPool getClientPool() { return clientPool; } @Override public TracingQueryRunner getTracingQueryRunner() { return queryRunner; } @Override public CassandraTables getCassandraTables() { return cassandraTables; } @Override public boolean performanceIsSensitiveToTombstones() { return true; } /** * Asynchronously gets values from the cassandra key-value store. * * @param tableRef the name of the table to retrieve values from. * @param timestampByCell specifies, for each row, the maximum timestamp (exclusive) at which to * retrieve that rows's value. * @return listenable future map of retrieved values. Values which do not exist (either * because they were deleted or never created in the first place) * are simply not returned. */ @Override public ListenableFuture> getAsync(TableReference tableRef, Map timestampByCell) { if (timestampByCell.isEmpty()) { log.info("Attempted get with no specified cells", LoggingArgs.tableRef(tableRef)); return Futures.immediateFuture(ImmutableMap.of()); } if (asyncKeyValueService.isValid()) { try { return Futures.catching( asyncKeyValueService.getAsync(tableRef, timestampByCell), IllegalStateException.class, e -> { log.warn( "CQL Client closed during getAsync. Delegating to synchronous get. 
This should be" + " very rare, and only happen once after the Cassandra Server list has" + " changed.", e); return this.get(tableRef, timestampByCell); }, executor); } catch (IllegalStateException | DriverInternalError e) { // If the container is closed, or we've reloaded into an invalid ThrowingCqlClient, after testing for // validity return Futures.immediateFuture(this.get(tableRef, timestampByCell)); } } else { return Futures.immediateFuture(this.get(tableRef, timestampByCell)); } } private static class TableCellAndValue { private static byte[] extractRowName(TableCellAndValue input) { return input.cell.getRowName(); } private static Long getSize(TableCellAndValue input) { return input.value.length + Cells.getApproxSizeOfCell(input.cell); } private final TableReference tableRef; private final Cell cell; private final byte[] value; TableCellAndValue(TableReference tableRef, Cell cell, byte[] value) { this.tableRef = tableRef; this.cell = cell; this.value = value; } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy