// NOTE(review): the lines below were scraper boilerplate from a Maven-repository
// download page, not part of the original source; converted to a comment so the
// file remains valid Java. Source artifact:
// com.palantir.atlasdb.keyvalue.cassandra.CassandraKeyValueServiceImpl (newest version).
/*
 * (c) Copyright 2018 Palantir Technologies Inc. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.palantir.atlasdb.keyvalue.cassandra;

import com.codahale.metrics.Counter;
import com.datastax.driver.core.exceptions.DriverInternalError;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Functions;
import com.google.common.base.Predicates;
import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterables;
import com.google.common.collect.Iterators;
import com.google.common.collect.ListMultimap;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimap;
import com.google.common.collect.Multimaps;
import com.google.common.collect.SetMultimap;
import com.google.common.collect.Sets;
import com.google.common.util.concurrent.Futures;
import com.google.common.util.concurrent.ListenableFuture;
import com.google.common.util.concurrent.UncheckedExecutionException;
import com.google.protobuf.ByteString;
import com.palantir.async.initializer.AsyncInitializer;
import com.palantir.atlasdb.AtlasDbConstants;
import com.palantir.atlasdb.AtlasDbMetricNames.CellFilterMetrics;
import com.palantir.atlasdb.CassandraTopologyValidationMetrics;
import com.palantir.atlasdb.cassandra.CassandraKeyValueServiceConfig;
import com.palantir.atlasdb.cassandra.CassandraKeyValueServiceRuntimeConfig;
import com.palantir.atlasdb.cassandra.CassandraMutationTimestampProvider;
import com.palantir.atlasdb.cassandra.CassandraMutationTimestampProviders;
import com.palantir.atlasdb.cassandra.CassandraServersConfigs.CassandraServersConfig;
import com.palantir.atlasdb.encoding.PtBytes;
import com.palantir.atlasdb.keyvalue.api.AsyncKeyValueService;
import com.palantir.atlasdb.keyvalue.api.BatchColumnRangeSelection;
import com.palantir.atlasdb.keyvalue.api.CandidateCellForSweeping;
import com.palantir.atlasdb.keyvalue.api.CandidateCellForSweepingRequest;
import com.palantir.atlasdb.keyvalue.api.Cell;
import com.palantir.atlasdb.keyvalue.api.CheckAndSetCompatibility;
import com.palantir.atlasdb.keyvalue.api.CheckAndSetException;
import com.palantir.atlasdb.keyvalue.api.CheckAndSetRequest;
import com.palantir.atlasdb.keyvalue.api.ClusterAvailabilityStatus;
import com.palantir.atlasdb.keyvalue.api.ColumnSelection;
import com.palantir.atlasdb.keyvalue.api.ImmutableCandidateCellForSweepingRequest;
import com.palantir.atlasdb.keyvalue.api.InsufficientConsistencyException;
import com.palantir.atlasdb.keyvalue.api.KeyAlreadyExistsException;
import com.palantir.atlasdb.keyvalue.api.KeyValueService;
import com.palantir.atlasdb.keyvalue.api.MultiCheckAndSetException;
import com.palantir.atlasdb.keyvalue.api.MultiCheckAndSetRequest;
import com.palantir.atlasdb.keyvalue.api.RangeRequest;
import com.palantir.atlasdb.keyvalue.api.RangeRequests;
import com.palantir.atlasdb.keyvalue.api.RetryLimitReachedException;
import com.palantir.atlasdb.keyvalue.api.RowColumnRangeIterator;
import com.palantir.atlasdb.keyvalue.api.RowResult;
import com.palantir.atlasdb.keyvalue.api.TableReference;
import com.palantir.atlasdb.keyvalue.api.TimestampRangeDelete;
import com.palantir.atlasdb.keyvalue.api.Value;
import com.palantir.atlasdb.keyvalue.cassandra.CassandraClientPoolImpl.StartupChecks;
import com.palantir.atlasdb.keyvalue.cassandra.CassandraKeyValueServices.ColumnAndTimestamp;
import com.palantir.atlasdb.keyvalue.cassandra.CassandraKeyValueServices.StartTsResultsCollector;
import com.palantir.atlasdb.keyvalue.cassandra.CassandraVerifier.CassandraVerifierConfig;
import com.palantir.atlasdb.keyvalue.cassandra.RowColumnRangeExtractor.RowColumnRangeResult;
import com.palantir.atlasdb.keyvalue.cassandra.async.client.creation.ClusterFactory.CassandraClusterConfig;
import com.palantir.atlasdb.keyvalue.cassandra.cas.CheckAndSetRunner;
import com.palantir.atlasdb.keyvalue.cassandra.paging.RowGetter;
import com.palantir.atlasdb.keyvalue.cassandra.pool.CassandraServer;
import com.palantir.atlasdb.keyvalue.cassandra.sweep.CandidateRowForSweeping;
import com.palantir.atlasdb.keyvalue.cassandra.sweep.CandidateRowsForSweepingIterator;
import com.palantir.atlasdb.keyvalue.cassandra.thrift.MutationMap;
import com.palantir.atlasdb.keyvalue.cassandra.thrift.SlicePredicates;
import com.palantir.atlasdb.keyvalue.cassandra.thrift.SlicePredicates.Limit;
import com.palantir.atlasdb.keyvalue.cassandra.thrift.SlicePredicates.Range;
import com.palantir.atlasdb.keyvalue.impl.AbstractKeyValueService;
import com.palantir.atlasdb.keyvalue.impl.Cells;
import com.palantir.atlasdb.keyvalue.impl.CheckAndSetResult;
import com.palantir.atlasdb.keyvalue.impl.IterablePartitioner;
import com.palantir.atlasdb.keyvalue.impl.KeyValueServices;
import com.palantir.atlasdb.keyvalue.impl.LocalRowColumnRangeIterator;
import com.palantir.atlasdb.logging.LoggingArgs;
import com.palantir.atlasdb.table.description.TableMetadata;
import com.palantir.atlasdb.util.AnnotatedCallable;
import com.palantir.atlasdb.util.AnnotationType;
import com.palantir.atlasdb.util.AtlasDbMetrics;
import com.palantir.atlasdb.util.MetricsManager;
import com.palantir.atlasdb.util.MetricsManagers;
import com.palantir.common.annotation.Idempotent;
import com.palantir.common.base.ClosableIterator;
import com.palantir.common.base.ClosableIterators;
import com.palantir.common.base.FunctionCheckedException;
import com.palantir.common.base.Throwables;
import com.palantir.common.exception.AtlasDbDependencyException;
import com.palantir.common.exception.PalantirRuntimeException;
import com.palantir.common.streams.KeyedStream;
import com.palantir.logsafe.Preconditions;
import com.palantir.logsafe.SafeArg;
import com.palantir.logsafe.logger.SafeLogger;
import com.palantir.logsafe.logger.SafeLoggerFactory;
import com.palantir.refreshable.Refreshable;
import com.palantir.tritium.metrics.registry.TaggedMetricRegistry;
import com.palantir.util.paging.AbstractPagingIterable;
import com.palantir.util.paging.SimpleTokenBackedResultsPage;
import com.palantir.util.paging.TokenBackedBasicResultsPage;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.IdentityHashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;
import javax.annotation.Nullable;
import one.util.streamex.EntryStream;
import org.apache.cassandra.thrift.CASResult;
import org.apache.cassandra.thrift.CfDef;
import org.apache.cassandra.thrift.Column;
import org.apache.cassandra.thrift.ColumnOrSuperColumn;
import org.apache.cassandra.thrift.ConsistencyLevel;
import org.apache.cassandra.thrift.Deletion;
import org.apache.cassandra.thrift.KeyPredicate;
import org.apache.cassandra.thrift.KsDef;
import org.apache.cassandra.thrift.Mutation;
import org.apache.cassandra.thrift.SlicePredicate;
import org.apache.thrift.TException;

/**
 * Each service can have one or many C* KVS.
 * For each C* KVS, it maintains a list of active nodes, and the client connections attached to each node:
 *
 *   n1 -> c1, c2, c3
 *   n2 -> c5, c4, c9
 *   n3 -> [N C* thrift client connections]
 *
* Where {n1, n2, n3} are the active nodes in the C* cluster. Also each * node contains the clients which are attached to the node. * if some nodes are down, and the change can be detected through active hosts, * and these inactive nodes will be removed afterwards. */ @SuppressWarnings({"FinalClass", "Not final for mocking in tests"}) public class CassandraKeyValueServiceImpl extends AbstractKeyValueService implements CassandraKeyValueService { @VisibleForTesting class InitializingWrapper extends AsyncInitializer implements AutoDelegate_CassandraKeyValueService { @Override public CassandraKeyValueServiceImpl delegate() { checkInitialized(); return CassandraKeyValueServiceImpl.this; } @Override public Collection getDelegates() { return ImmutableList.of(delegate()); } @Override protected void tryInitialize() { CassandraKeyValueServiceImpl.this.tryInitialize(); } @Override public boolean supportsCheckAndSet() { return CassandraKeyValueServiceImpl.this.supportsCheckAndSet(); } @Override public CheckAndSetCompatibility getCheckAndSetCompatibility() { return CassandraKeyValueServiceImpl.this.getCheckAndSetCompatibility(); } @Override public boolean shouldTriggerCompactions() { return CassandraKeyValueServiceImpl.this.shouldTriggerCompactions(); } @Override public CassandraClientPool getClientPool() { return CassandraKeyValueServiceImpl.this.getClientPool(); } @Override protected String getInitializingClassName() { return "CassandraKeyValueService"; } @Override public void close() { cancelInitialization(CassandraKeyValueServiceImpl.this::close); } } static final ConsistencyLevel WRITE_CONSISTENCY = ConsistencyLevel.EACH_QUORUM; static final ConsistencyLevel DELETE_CONSISTENCY = ConsistencyLevel.ALL; private final SafeLogger log; private final MetricsManager metricsManager; private final CassandraKeyValueServiceConfig config; private final CassandraClientPool clientPool; private final ReadConsistencyProvider readConsistencyProvider = new ReadConsistencyProvider(); private 
final TracingQueryRunner queryRunner; private final WrappingQueryRunner wrappingQueryRunner; private final CellLoader cellLoader; private final AsyncKeyValueService asyncKeyValueService; private final RangeLoader rangeLoader; private final TaskRunner taskRunner; private final CellValuePutter cellValuePutter; private final CassandraTableMetadata tableMetadata; private final CassandraTableCreator cassandraTableCreator; private final CassandraTableDropper cassandraTableDropper; private final CassandraTableTruncator cassandraTableTruncator; private final CheckAndSetRunner checkAndSetRunner; private final CassandraTables cassandraTables; private final InitializingWrapper wrapper = new InitializingWrapper(); private final CassandraMutationTimestampProvider mutationTimestampProvider; private final Refreshable runtimeConfig; private final CassandraVerifierConfig verifierConfig; private final Function, ResultsExtractor> extractorFactory; public static CassandraKeyValueService createForTesting( CassandraKeyValueServiceConfig config, Refreshable runtimeConfig) { MetricsManager metricsManager = MetricsManagers.createForTests(); CassandraClientPool clientPool = CassandraClientPoolImpl.createImplForTest( metricsManager, config, runtimeConfig, StartupChecks.RUN, new Blacklist( config, runtimeConfig.map(CassandraKeyValueServiceRuntimeConfig::unresponsiveHostBackoffTimeSeconds)), CassandraTopologyValidator.create( CassandraTopologyValidationMetrics.of(metricsManager.getTaggedRegistry()), runtimeConfig), new CassandraAbsentHostTracker(config.consecutiveAbsencesBeforePoolRemoval())); return createOrShutdownClientPool( metricsManager, config, runtimeConfig, clientPool, CassandraMutationTimestampProviders.legacyModeForTestsOnly(), SafeLoggerFactory.get(CassandraKeyValueService.class), AtlasDbConstants.DEFAULT_INITIALIZE_ASYNC); } public static CassandraKeyValueService create( MetricsManager metricsManager, CassandraKeyValueServiceConfig config, Refreshable runtimeConfig, 
CassandraMutationTimestampProvider mutationTimestampProvider) { return create( metricsManager, config, runtimeConfig, mutationTimestampProvider, AtlasDbConstants.DEFAULT_INITIALIZE_ASYNC); } public static CassandraKeyValueService create( MetricsManager metricsManager, CassandraKeyValueServiceConfig config, Refreshable runtimeConfig, CassandraMutationTimestampProvider mutationTimestampProvider, CassandraClientPool clientPool) { return createOrShutdownClientPool( metricsManager, config, runtimeConfig, clientPool, mutationTimestampProvider, SafeLoggerFactory.get(CassandraKeyValueService.class), AtlasDbConstants.DEFAULT_INITIALIZE_ASYNC); } public static CassandraKeyValueService create( MetricsManager metricsManager, CassandraKeyValueServiceConfig config, Refreshable runtimeConfig, CassandraMutationTimestampProvider mutationTimestampProvider, boolean initializeAsync) { return create( metricsManager, config, runtimeConfig, mutationTimestampProvider, SafeLoggerFactory.get(CassandraKeyValueService.class), initializeAsync); } @VisibleForTesting static CassandraKeyValueService create( MetricsManager metricsManager, CassandraKeyValueServiceConfig config, Refreshable runtimeConfig, CassandraMutationTimestampProvider mutationTimestampProvider, SafeLogger log) { return create( metricsManager, config, runtimeConfig, mutationTimestampProvider, log, AtlasDbConstants.DEFAULT_INITIALIZE_ASYNC); } @VisibleForTesting static CassandraKeyValueService create( MetricsManager metricsManager, CassandraKeyValueServiceConfig config, Refreshable runtimeConfig, CassandraMutationTimestampProvider mutationTimestampProvider, SafeLogger log, boolean initializeAsync) { CassandraClientPool clientPool = CassandraClientPoolImpl.create(metricsManager, config, runtimeConfig, initializeAsync); return createOrShutdownClientPool( metricsManager, config, runtimeConfig, clientPool, mutationTimestampProvider, log, initializeAsync); } private static CassandraKeyValueService createOrShutdownClientPool( 
MetricsManager metricsManager, CassandraKeyValueServiceConfig config, Refreshable runtimeConfig, CassandraClientPool clientPool, CassandraMutationTimestampProvider mutationTimestampProvider, SafeLogger log, boolean initializeAsync) { try { return createWithCqlClient( metricsManager, config, runtimeConfig, clientPool, mutationTimestampProvider, log, initializeAsync); } catch (Exception e) { log.warn("Error occurred in creating Cassandra KVS. Now attempting to shut down client pool...", e); try { clientPool.shutdown(); log.info("Cassandra client pool shut down."); } catch (RuntimeException internalException) { log.info("An error occurred whilst shutting down the Cassandra client pool", internalException); throw internalException; } throw Throwables.rewrapAndThrowUncheckedException(e); } } private static CassandraKeyValueService createWithCqlClient( MetricsManager metricsManager, CassandraKeyValueServiceConfig config, Refreshable runtimeConfig, CassandraClientPool clientPool, CassandraMutationTimestampProvider mutationTimestampProvider, SafeLogger log, boolean initializeAsync) { try { CassandraClusterConfig clusterConfig = CassandraClusterConfig.of(config, runtimeConfig.get()); AsyncKeyValueService asyncKeyValueService = config.asyncKeyValueServiceFactory() .constructAsyncKeyValueService( metricsManager, config.getKeyspaceOrThrow(), clusterConfig, runtimeConfig.map(CassandraKeyValueServiceRuntimeConfig::servers), initializeAsync); return createAndInitialize( metricsManager, config, runtimeConfig, clientPool, asyncKeyValueService, mutationTimestampProvider, log, initializeAsync); } catch (Exception e) { log.warn("Exception during async KVS creation.", e); throw Throwables.unwrapAndThrowAtlasDbDependencyException(e); } } private static CassandraKeyValueService createAndInitialize( MetricsManager metricsManager, CassandraKeyValueServiceConfig config, Refreshable runtimeConfig, CassandraClientPool clientPool, AsyncKeyValueService asyncKeyValueService, 
CassandraMutationTimestampProvider mutationTimestampProvider, SafeLogger log, boolean initializeAsync) { Counter notLatestVisibleValueCellFilterCounter = // register counter once and reuse metricsManager.registerOrGetCounter(ValueExtractor.class, CellFilterMetrics.NOT_LATEST_VISIBLE_VALUE); Function, ResultsExtractor> extractorFactory = cellValueMap -> new ValueExtractor(metricsManager, cellValueMap, notLatestVisibleValueCellFilterCounter); CassandraKeyValueServiceImpl keyValueService = new CassandraKeyValueServiceImpl( log, metricsManager, config, asyncKeyValueService, runtimeConfig, clientPool, mutationTimestampProvider, extractorFactory); keyValueService.wrapper.initialize(initializeAsync); return keyValueService.wrapper.isInitialized() ? keyValueService : keyValueService.wrapper; } private CassandraKeyValueServiceImpl( SafeLogger log, MetricsManager metricsManager, CassandraKeyValueServiceConfig config, AsyncKeyValueService asyncKeyValueService, Refreshable runtimeConfig, CassandraClientPool clientPool, CassandraMutationTimestampProvider mutationTimestampProvider, Function, ResultsExtractor> extractorFactory) { super(createBlockingThreadpool(config, runtimeConfig.get().servers(), metricsManager)); this.log = log; this.metricsManager = metricsManager; this.config = config; this.clientPool = clientPool; this.asyncKeyValueService = asyncKeyValueService; this.mutationTimestampProvider = mutationTimestampProvider; this.queryRunner = new TracingQueryRunner(log, () -> runtimeConfig.get().tracing()); this.wrappingQueryRunner = new WrappingQueryRunner(queryRunner); this.cassandraTables = new CassandraTables(clientPool, config); this.taskRunner = new TaskRunner(executor); this.cellLoader = CellLoader.create(clientPool, wrappingQueryRunner, taskRunner, runtimeConfig); this.rangeLoader = new RangeLoader(clientPool, queryRunner, readConsistencyProvider, extractorFactory); this.cellValuePutter = new CellValuePutter( runtimeConfig, clientPool, taskRunner, wrappingQueryRunner, 
mutationTimestampProvider::getSweepSentinelWriteTimestamp); this.checkAndSetRunner = new CheckAndSetRunner(queryRunner); this.tableMetadata = new CassandraTableMetadata(rangeLoader, cassandraTables, clientPool, wrappingQueryRunner); this.cassandraTableCreator = new CassandraTableCreator(clientPool, config); this.cassandraTableTruncator = new CassandraTableTruncator(queryRunner, clientPool); this.cassandraTableDropper = new CassandraTableDropper(config, clientPool, tableMetadata, cassandraTableTruncator); this.runtimeConfig = runtimeConfig; this.verifierConfig = CassandraVerifierConfig.of(config, runtimeConfig.get()); this.extractorFactory = extractorFactory; } private static ExecutorService createBlockingThreadpool( CassandraKeyValueServiceConfig config, CassandraServersConfig serversConfig, MetricsManager metricsManager) { return config.thriftExecutorServiceFactory() .orElseGet(() -> instrumentedFixedThreadPoolSupplier( serversConfig, config.poolSize(), config.maxConnectionBurstSize(), metricsManager.getTaggedRegistry())) .get(); } private static Supplier instrumentedFixedThreadPoolSupplier( CassandraServersConfig serversConfig, int poolSize, int maxConnectionBurstSize, TaggedMetricRegistry registry) { return () -> { int numberOfThriftHosts = serversConfig.numberOfThriftHosts(); int corePoolSize = poolSize * numberOfThriftHosts; int maxPoolSize = maxConnectionBurstSize * numberOfThriftHosts; return createThreadPoolWithoutSpans("Atlas Cassandra KVS", corePoolSize, maxPoolSize); }; } @Override public boolean isInitialized() { return wrapper.isInitialized(); } protected void initialize(boolean asyncInitialize) { wrapper.initialize(asyncInitialize); } private void tryInitialize() { createTable(AtlasDbConstants.DEFAULT_METADATA_TABLE, AtlasDbConstants.EMPTY_TABLE_METADATA); lowerConsistencyWhenSafe(); upgradeFromOlderInternalSchema(); CassandraKeyValueServices.warnUserInInitializationIfClusterAlreadyInInconsistentState(clientPool, config); } @VisibleForTesting void 
upgradeFromOlderInternalSchema() { try { Map metadataForTables = getMetadataForTables(); final Collection updatedCfs = Lists.newArrayListWithExpectedSize(metadataForTables.size()); List knownCfs = clientPool.runWithRetry(client -> client.describe_keyspace(config.getKeyspaceOrThrow()).getCf_defs()); for (CfDef clusterSideCf : knownCfs) { TableReference tableRef = CassandraKeyValueServices.tableReferenceFromCfDef(clusterSideCf); Optional relevantMetadata = lookupClusterSideMetadata(metadataForTables, tableRef); if (relevantMetadata.isPresent()) { byte[] clusterSideMetadata = relevantMetadata.get(); CfDef clientSideCf = getCfForTable(tableRef, clusterSideMetadata, config.gcGraceSeconds()); if (!ColumnFamilyDefinitions.isMatchingCf(clientSideCf, clusterSideCf)) { // mismatch; we have changed how we generate schema since we last persisted log.warn("Upgrading table {} to new internal Cassandra schema", LoggingArgs.tableRef(tableRef)); updatedCfs.add(clientSideCf); } } else if (!HiddenTables.isHidden(tableRef)) { // Possible to get here from a race condition with another service starting up // and performing schema upgrades concurrent with us doing this check log.error( "Found a table {} that did not have persisted" + " AtlasDB metadata. If you recently did a Palantir update, try waiting until" + " schema upgrades are completed on all backend CLIs/services etc and restarting" + " this service. If this error re-occurs on subsequent attempted startups, please" + " contact Palantir support.", LoggingArgs.tableRef(tableRef)); } } // we are racing another service to do these same operations here, but they are idempotent / safe Map emptyMetadataUpdate = ImmutableMap.of(); if (!updatedCfs.isEmpty()) { putMetadataAndMaybeAlterTables(true, emptyMetadataUpdate, updatedCfs); log.info("New table-related settings were applied on startup!!"); } else { log.info("No tables are being upgraded on startup. 
No updated table-related settings found."); } } catch (TException e) { log.error( "Couldn't upgrade from an older internal Cassandra schema. New table-related settings may not have" + " taken effect.", e); } } private static Optional lookupClusterSideMetadata( Map metadataForTables, TableReference tableRef) { return Optional.ofNullable(metadataForTables.get(tableRef)) .or(() -> Maps.filterEntries(metadataForTables, entry -> matchingIgnoreCase(entry.getKey(), tableRef)) .values() .stream() .findAny()); } private void lowerConsistencyWhenSafe() { Set dcs; Map strategyOptions; try { dcs = clientPool.runWithRetry(client -> CassandraVerifier.sanityCheckDatacenters(client, verifierConfig)); KsDef ksDef = clientPool.runWithRetry(client -> client.describe_keyspace(config.getKeyspaceOrThrow())); strategyOptions = new HashMap<>(ksDef.getStrategy_options()); if (dcs.size() == 1) { String dc = dcs.iterator().next(); if (strategyOptions.get(dc) != null) { int currentRf = Integer.parseInt(strategyOptions.get(dc)); if (currentRf == runtimeConfig.get().replicationFactor()) { if (currentRf == 2 && config.clusterMeetsNormalConsistencyGuarantees()) { log.info("Setting Read Consistency to ONE, as cluster has only one datacenter at RF2."); readConsistencyProvider.lowerConsistencyLevelToOne(); } } } } } catch (TException e) { return; } } /** * Gets values from the key-value store. Requires a quorum of Cassandra nodes to be reachable. * * @param tableRef the name of the table to retrieve values from. * @param rows set containing the rows to retrieve values for. * @param selection specifies the set of columns to fetch. * @param startTs specifies the maximum timestamp (exclusive) at which to * retrieve each rows's value. * @return map of retrieved values. Values which do not exist (either * because they were deleted or never created in the first place) * are simply not returned. * @throws AtlasDbDependencyException if fewer than a quorum of Cassandra nodes are reachable. 
* @throws IllegalArgumentException if any of the requests were invalid * (e.g., attempting to retrieve values from a non-existent table). */ @Override public Map getRows( TableReference tableRef, Iterable rows, ColumnSelection selection, long startTs) { if (!selection.allColumnsSelected()) { return getRowsForSpecificColumns(tableRef, rows, selection, startTs); } Set>> rowsByHost = HostPartitioner.partitionByHost( clientPool, rows, Functions.identity()) .entrySet(); List>> tasks = new ArrayList<>(rowsByHost.size()); for (final Map.Entry> hostAndRows : rowsByHost) { tasks.add(AnnotatedCallable.wrapWithThreadName( AnnotationType.PREPEND, "Atlas getRows " + hostAndRows.getValue().size() + " rows from " + tableRef + " on " + hostAndRows.getKey().cassandraHostName(), () -> getRowsForSingleHost(hostAndRows.getKey(), tableRef, hostAndRows.getValue(), startTs))); } List> perHostResults = taskRunner.runAllTasksCancelOnFailure(tasks); Map result = Maps.newHashMapWithExpectedSize(Iterables.size(rows)); for (Map perHostResult : perHostResults) { result.putAll(perHostResult); } return result; } private Map getRowsForSingleHost( final CassandraServer host, final TableReference tableRef, final List rows, final long startTs) { try { int rowCount = 0; final Map result = new HashMap<>(); int fetchBatchCount = runtimeConfig.get().fetchBatchCount(); for (final List batch : Lists.partition(rows, fetchBatchCount)) { rowCount += batch.size(); result.putAll(getAllCellsForRows(host, tableRef, batch, startTs)); } if (rowCount > fetchBatchCount) { log.warn( "Rebatched in getRows a call to {} that attempted to multiget {} rows; " + "this may indicate overly-large batching on a higher level.\n{}", LoggingArgs.tableRef(tableRef), SafeArg.of("rowCount", rowCount), SafeArg.of("stacktrace", CassandraKeyValueServices.getFilteredStackTrace("com.palantir"))); } return ImmutableMap.copyOf(result); } catch (Exception e) { throw Throwables.unwrapAndThrowAtlasDbDependencyException(e); } } private Map 
getAllCellsForRows( final CassandraServer host, final TableReference tableRef, final List rows, final long startTs) throws Exception { ListMultimap result = ArrayListMultimap.create(rows.size(), 1); List query = rows.stream() .map(row -> keyPredicate( ByteBuffer.wrap(row), allPredicateWithLimit(runtimeConfig.get().fetchReadLimitPerRow()))) .collect(Collectors.toList()); while (!query.isEmpty()) { query = EntryStream.of(getForKeyPredicates(host, tableRef, query, startTs)) .filterValues(cells -> !cells.isEmpty()) .peekKeyValue(result::putAll) .mapKeyValue((row, cells) -> keyPredicate(row, getNextLexicographicalSlicePredicate(cells))) .collect(Collectors.toList()); } ResultsExtractor extractor = extractorFactory.apply(Maps.newHashMapWithExpectedSize(result.size())); extractor.extractResults(Multimaps.asMap(result), startTs, ColumnSelection.all()); return extractor.asMap(); } private static KeyPredicate keyPredicate(ByteBuffer row, SlicePredicate predicate) { return new KeyPredicate().setKey(row).setPredicate(predicate); } private static SlicePredicate allPredicateWithLimit(int limit) { return SlicePredicates.create(Range.ALL, Limit.of(limit)); } private Map> getForKeyPredicates( final CassandraServer host, final TableReference tableRef, List query, final long startTs) throws Exception { return clientPool.runWithRetryOnServer( host, new FunctionCheckedException>, Exception>() { @Override public Map> apply(CassandraClient client) throws Exception { if (log.isTraceEnabled()) { log.trace( "Requesting {} cells from {} starting at timestamp {} on {} " + "as part of fetching cells for key predicates.", SafeArg.of("cells", query.size()), LoggingArgs.tableRef(tableRef), SafeArg.of("startTs", startTs), SafeArg.of("host", host)); } Map>> results = wrappingQueryRunner.multiget_multislice( "getRows", client, tableRef, query, readConsistencyProvider.getConsistency(tableRef)); return Maps.transformValues(results, CellLoader::flattenReadOnlyLists); } @Override public String 
toString() { return "multiget_multislice(" + host.cassandraHostName() + ", " + tableRef + ", " + query.size() + " cells)"; } }); } private SlicePredicate getNextLexicographicalSlicePredicate(List columns) { Preconditions.checkState(!columns.isEmpty(), "Columns was empty. This is probably an AtlasDb bug"); Column lastColumn = columns.get(columns.size() - 1).getColumn(); ColumnAndTimestamp columnNameAndTimestamp = CassandraKeyValueServices.decomposeColumn(lastColumn.name); ByteBuffer nextLexicographicColumn = CassandraKeyValueServices.makeCompositeBuffer( RangeRequests.nextLexicographicName(columnNameAndTimestamp.columnName()), Long.MAX_VALUE); return SlicePredicates.create( Range.of(nextLexicographicColumn, Range.UNBOUND_END), Limit.of(runtimeConfig.get().fetchReadLimitPerRow())); } private static List wrap(List arrays) { List byteBuffers = new ArrayList<>(arrays.size()); for (byte[] r : arrays) { byteBuffers.add(ByteBuffer.wrap(r)); } return byteBuffers; } private Map getRowsForSpecificColumns( final TableReference tableRef, final Iterable rows, ColumnSelection selection, final long startTs) { Preconditions.checkArgument(!selection.allColumnsSelected(), "Must select specific columns"); Collection selectedColumns = selection.getSelectedColumns(); Set cells = Sets.newHashSetWithExpectedSize(selectedColumns.size() * Iterables.size(rows)); for (byte[] row : rows) { for (byte[] col : selectedColumns) { cells.add(Cell.create(row, col)); } } StartTsResultsCollector collector = new StartTsResultsCollector(startTs, extractorFactory); cellLoader.loadWithTs( "getRows", tableRef, cells, startTs, false, collector, readConsistencyProvider.getConsistency(tableRef)); return collector.getCollectedResults(); } /** * Gets values from the key-value store. Requires a quorum of Cassandra nodes to be reachable. * * @param tableRef the name of the table to retrieve values from. 
* @param timestampByCell specifies, for each row, the maximum timestamp (exclusive) at which to * retrieve that rows's value. * @return map of retrieved values. Values which do not exist (either * because they were deleted or never created in the first place) * are simply not returned. * @throws AtlasDbDependencyException if fewer than a quorum of Cassandra nodes are reachable. * @throws IllegalArgumentException if any of the requests were invalid * (e.g., attempting to retrieve values from a non-existent table). */ @Override public Map get(TableReference tableRef, Map timestampByCell) { if (timestampByCell.isEmpty()) { log.info("Attempted get on '{}' table with empty cells", LoggingArgs.tableRef(tableRef)); return ImmutableMap.of(); } try { Long firstTs = timestampByCell.values().iterator().next(); if (Iterables.all(timestampByCell.values(), Predicates.equalTo(firstTs))) { return get("get", tableRef, timestampByCell.keySet(), firstTs); } SetMultimap cellsByTs = Multimaps.invertFrom(Multimaps.forMap(timestampByCell), HashMultimap.create()); ImmutableMap.Builder builder = ImmutableMap.builder(); for (long ts : cellsByTs.keySet()) { StartTsResultsCollector collector = new StartTsResultsCollector(ts, extractorFactory); cellLoader.loadWithTs( "get", tableRef, cellsByTs.get(ts), ts, false, collector, readConsistencyProvider.getConsistency(tableRef)); builder.putAll(collector.getCollectedResults()); } return builder.buildOrThrow(); } catch (Exception e) { throw Throwables.unwrapAndThrowAtlasDbDependencyException(e); } } private Map get( String kvsMethodName, TableReference tableRef, Set cells, long maxTimestampExclusive) { StartTsResultsCollector collector = new StartTsResultsCollector(maxTimestampExclusive, extractorFactory); cellLoader.loadWithTs( kvsMethodName, tableRef, cells, maxTimestampExclusive, false, collector, readConsistencyProvider.getConsistency(tableRef)); return collector.getCollectedResults(); } /** * Gets values from the key-value store for the 
specified rows and column range as separate iterators for each row. * Requires a quorum of Cassandra nodes to be reachable, otherwise, the returned iterators will throw an * {@link AtlasDbDependencyException} when their methods are called. * * @param tableRef the name of the table to retrieve values from. * @param rows set containing the rows to retrieve values for. Behavior is undefined if {@code rows} * contains duplicates (as defined by {@link Arrays#equals(byte[], byte[])}). * @param batchColumnRangeSelection specifies the column range and the per-row batchSize to fetch. * @param timestamp specifies the maximum timestamp (exclusive) at which to retrieve each rows's value. * @return map of row names to {@link RowColumnRangeIterator}. Each {@link RowColumnRangeIterator} can iterate over * the values that are spanned by the {@code batchColumnRangeSelection} in increasing order by column name. * @throws IllegalArgumentException if {@code rows} contains duplicates. */ @Override public Map getRowsColumnRange( TableReference tableRef, Iterable rows, BatchColumnRangeSelection batchColumnRangeSelection, long timestamp) { Set>> rowsByHost = HostPartitioner.partitionByHost( clientPool, rows, Functions.identity()) .entrySet(); List>> tasks = new ArrayList<>(rowsByHost.size()); for (final Map.Entry> hostAndRows : rowsByHost) { tasks.add(AnnotatedCallable.wrapWithThreadName( AnnotationType.PREPEND, "Atlas getRowsColumnRange " + hostAndRows.getValue().size() + " rows from " + tableRef + " on " + hostAndRows.getKey().cassandraHostName(), () -> getRowsColumnRangeIteratorForSingleHost( hostAndRows.getKey(), tableRef, hostAndRows.getValue(), batchColumnRangeSelection, timestamp))); } List> perHostResults = taskRunner.runAllTasksCancelOnFailure(tasks); Map result = Maps.newHashMapWithExpectedSize(Iterables.size(rows)); for (Map perHostResult : perHostResults) { result.putAll(perHostResult); } return result; } private Map getRowsColumnRangeIteratorForSingleHost( CassandraServer 
host, TableReference tableRef, List rows, BatchColumnRangeSelection batchColumnRangeSelection, long startTs) { try { RowColumnRangeResult firstPage = getRowsColumnRangeForSingleHost(host, tableRef, rows, batchColumnRangeSelection, startTs); Map> results = firstPage.getResults(); Map rowsToLastCompositeColumns = firstPage.getRowsToLastCompositeColumns(); IdentityHashMap incompleteRowsToNextColumns = new IdentityHashMap<>(); for (Map.Entry e : rowsToLastCompositeColumns.entrySet()) { byte[] row = e.getKey(); byte[] col = CassandraKeyValueServices.decomposeColumnName(e.getValue()) .columnName(); // If we read a version of the cell before our start timestamp, it will be the most recent version // readable to us and we can continue to the next column. Otherwise we have to continue reading // this column. Map rowResult = results.get(row); boolean completedCell = (rowResult != null) && rowResult.containsKey(Cell.create(row, col)); boolean endOfRange = isEndOfColumnRange( completedCell, col, firstPage.getRowsToRawColumnCount().get(row), batchColumnRangeSelection); if (!endOfRange) { byte[] nextCol = getNextColumnRangeColumn(completedCell, col); incompleteRowsToNextColumns.put(row, nextCol); } } Map ret = Maps.newHashMapWithExpectedSize(rows.size()); for (byte[] row : rowsToLastCompositeColumns.keySet()) { Iterator> resultIterator; Map result = results.get(row); if (result != null) { resultIterator = result.entrySet().iterator(); } else { resultIterator = Collections.emptyIterator(); } byte[] nextCol = incompleteRowsToNextColumns.get(row); if (nextCol == null) { ret.put(row, new LocalRowColumnRangeIterator(resultIterator)); } else { BatchColumnRangeSelection newColumnRange = BatchColumnRangeSelection.create( nextCol, batchColumnRangeSelection.getEndCol(), batchColumnRangeSelection.getBatchHint()); ret.put( row, new LocalRowColumnRangeIterator(Iterators.concat( resultIterator, getRowColumnRange(host, tableRef, row, newColumnRange, startTs)))); } } // We saw no Cassandra 
results at all for these rows, so the entire column range is empty for these rows. for (byte[] row : firstPage.getEmptyRows()) { ret.put(row, new LocalRowColumnRangeIterator(Collections.emptyIterator())); } return ret; } catch (Exception e) { throw Throwables.unwrapAndThrowAtlasDbDependencyException(e); } } private RowColumnRangeResult getRowsColumnRangeForSingleHost( CassandraServer host, TableReference tableRef, List rows, BatchColumnRangeSelection batchColumnRangeSelection, long startTs) { try { return clientPool.runWithRetryOnServer( host, new FunctionCheckedException() { @Override public RowColumnRangeResult apply(CassandraClient client) throws Exception { Range range = createColumnRange( batchColumnRangeSelection.getStartCol(), batchColumnRangeSelection.getEndCol(), startTs); Limit limit = Limit.of(batchColumnRangeSelection.getBatchHint()); SlicePredicate pred = SlicePredicates.create(range, limit); Map> results = wrappingQueryRunner.multiget( "getRowsColumnRange", client, tableRef, wrap(rows), pred, readConsistencyProvider.getConsistency(tableRef)); return RowColumnRangeExtractor.extract(rows, results, startTs, metricsManager); } @Override public String toString() { return "multiget_slice(" + tableRef.getQualifiedName() + ", " + rows.size() + " rows, " + batchColumnRangeSelection.getBatchHint() + " max columns)"; } }); } catch (Exception e) { throw Throwables.unwrapAndThrowAtlasDbDependencyException(e); } } private Iterator> getRowColumnRange( CassandraServer host, TableReference tableRef, byte[] row, BatchColumnRangeSelection batchColumnRangeSelection, long startTs) { return ClosableIterators.wrapWithEmptyClose( new AbstractPagingIterable< Map.Entry, TokenBackedBasicResultsPage, byte[]>>() { @Override protected TokenBackedBasicResultsPage, byte[]> getFirstPage() throws Exception { return page(batchColumnRangeSelection.getStartCol()); } @Override protected TokenBackedBasicResultsPage, byte[]> getNextPage( TokenBackedBasicResultsPage, byte[]> previous) throws 
Exception { return page(previous.getTokenForNextPage()); } TokenBackedBasicResultsPage, byte[]> page(final byte[] startCol) throws Exception { return clientPool.runWithRetryOnServer( host, new FunctionCheckedException< CassandraClient, TokenBackedBasicResultsPage, byte[]>, Exception>() { @Override public TokenBackedBasicResultsPage, byte[]> apply( CassandraClient client) throws Exception { Range range = createColumnRange( startCol, batchColumnRangeSelection.getEndCol(), startTs); Limit limit = Limit.of(batchColumnRangeSelection.getBatchHint()); SlicePredicate pred = SlicePredicates.create(range, limit); ByteBuffer rowByteBuffer = ByteBuffer.wrap(row); Map> results = wrappingQueryRunner.multiget( "getRowsColumnRange", client, tableRef, ImmutableList.of(rowByteBuffer), pred, readConsistencyProvider.getConsistency(tableRef)); if (results.isEmpty()) { return SimpleTokenBackedResultsPage.create( startCol, ImmutableList.of(), false); } List values = Iterables.getOnlyElement(results.values()); if (values.isEmpty()) { return SimpleTokenBackedResultsPage.create( startCol, ImmutableList.of(), false); } // May be empty if all results are at ts > startTs Map ret = RowColumnRangeExtractor.extract( ImmutableList.of(row), results, startTs, metricsManager) .getResults() .getOrDefault(row, Collections.emptyMap()); ColumnOrSuperColumn lastColumn = values.get(values.size() - 1); byte[] lastCol = CassandraKeyValueServices.decomposeColumnName( lastColumn.getColumn()) .columnName(); // Same idea as the getRows case to handle seeing only newer entries of a column boolean completedCell = ret.get(Cell.create(row, lastCol)) != null; if (isEndOfColumnRange( completedCell, lastCol, values.size(), batchColumnRangeSelection)) { return SimpleTokenBackedResultsPage.create(lastCol, ret.entrySet(), false); } byte[] nextCol = getNextColumnRangeColumn(completedCell, lastCol); return SimpleTokenBackedResultsPage.create(nextCol, ret.entrySet(), true); } @Override public String toString() { return 
"multiget_slice(" + tableRef.getQualifiedName() + ", single row, " + batchColumnRangeSelection.getBatchHint() + " batch hint)"; } }); } }.iterator()); } private static boolean isEndOfColumnRange( boolean completedCell, byte[] lastCol, int numRawResults, BatchColumnRangeSelection columnRangeSelection) { return (numRawResults < columnRangeSelection.getBatchHint()) || (completedCell && (RangeRequests.isLastRowName(lastCol) || Arrays.equals( RangeRequests.nextLexicographicName(lastCol), columnRangeSelection.getEndCol()))); } private static byte[] getNextColumnRangeColumn(boolean completedCell, byte[] lastCol) { if (!completedCell) { return lastCol; } else { return RangeRequests.nextLexicographicName(lastCol); } } private static Range createColumnRange(byte[] startColOrEmpty, byte[] endColExlusiveOrEmpty, long startTs) { ByteBuffer start = startColOrEmpty.length == 0 ? Range.UNBOUND_START : Range.startOfColumn(startColOrEmpty, startTs); ByteBuffer end = endColExlusiveOrEmpty.length == 0 ? Range.UNBOUND_END : Range.endOfColumnIncludingSentinels(RangeRequests.previousLexicographicName(endColExlusiveOrEmpty)); return Range.of(start, end); } /** * Puts values into the key-value store. This call does not guarantee atomicity across cells. * On failure, it is possible that some of the requests have succeeded (without having been rolled * back). Similarly, concurrent batched requests may interleave. *

* Requires a quorum of Cassandra nodes to be reachable. * * @param tableRef the name of the table to put values into. * @param values map containing the key-value entries to put. * @param timestamp must be non-negative and not equal to {@link Long#MAX_VALUE} * @throws AtlasDbDependencyException if fewer than a quorum of Cassandra nodes are reachable. */ @Override public void put(final TableReference tableRef, final Map values, final long timestamp) { try { cellValuePutter.put( "put", tableRef, KeyValueServices.toConstantTimestampValues(values.entrySet(), timestamp)); } catch (Exception e) { throw Throwables.unwrapAndThrowAtlasDbDependencyException(e); } } /** * Puts values into the key-value store with individually specified timestamps. This call does not * guarantee atomicity across cells. On failure, it is possible that some of the requests have succeeded * (without having been rolled back). Similarly, concurrent batched requests may interleave. *

* Requires a quorum of Cassandra nodes to be reachable. * * @param tableRef the name of the table to put values into. * @param values map containing the key-value entries to put with * non-negative timestamps less than {@link Long#MAX_VALUE}. * @throws AtlasDbDependencyException if fewer than a quorum of Cassandra nodes are reachable. */ @Override public void putWithTimestamps(TableReference tableRef, Multimap values) { try { cellValuePutter.put("putWithTimestamps", tableRef, values.entries()); } catch (Exception e) { throw Throwables.unwrapAndThrowAtlasDbDependencyException(e); } } @Override protected int getMultiPutBatchCount() { return runtimeConfig.get().mutationBatchCount(); } /** * Puts values into the key-value store. This call does not guarantee atomicity across cells. * On failure, it is possible that some of the requests have succeeded (without having been rolled * back). Similarly, concurrent batched requests may interleave. *

* Overridden to batch more intelligently than the default implementation. *

* Requires a quorum of Cassandra nodes to be reachable. * * @param valuesByTable map containing the key-value entries to put by table. * @param timestamp must be non-negative and not equal to {@link Long#MAX_VALUE} * @throws AtlasDbDependencyException if fewer than a quorum of Cassandra nodes are reachable. */ @Override public void multiPut(Map> valuesByTable, long timestamp) throws KeyAlreadyExistsException { List flattened = new ArrayList<>(); for (Map.Entry> tableAndValues : valuesByTable.entrySet()) { for (Map.Entry entry : tableAndValues.getValue().entrySet()) { flattened.add(new TableCellAndValue(tableAndValues.getKey(), entry.getKey(), entry.getValue())); } } Map> partitionedByHost = HostPartitioner.partitionByHost(clientPool, flattened, TableCellAndValue::extractRowName); List> callables = new ArrayList<>(); for (Map.Entry> entry : partitionedByHost.entrySet()) { callables.addAll(getMultiPutTasksForSingleHost(entry.getKey(), entry.getValue(), timestamp)); } taskRunner.runAllTasksCancelOnFailure(callables); } private List> getMultiPutTasksForSingleHost( final CassandraServer host, Collection values, final long timestamp) { Iterable> partitioned = IterablePartitioner.partitionByCountAndBytes( values, getMultiPutBatchCount(), getMultiPutBatchSizeBytes(), extractTableNames(values).toString(), TableCellAndValue::getSize); List> tasks = new ArrayList<>(); for (final List batch : partitioned) { final Set tableRefs = extractTableNames(batch); tasks.add(AnnotatedCallable.wrapWithThreadName( AnnotationType.PREPEND, "Atlas multiPut of " + batch.size() + " cells into " + tableRefs + " on " + host.cassandraHostName(), () -> multiPutForSingleHostInternal(host, tableRefs, batch, timestamp))); } return tasks; } private static Set extractTableNames(Iterable tableCellAndValues) { Set tableRefs = new HashSet<>(); for (TableCellAndValue tableCellAndValue : tableCellAndValues) { tableRefs.add(tableCellAndValue.tableRef); } return tableRefs; } private Void 
multiPutForSingleHostInternal( final CassandraServer host, final Set tableRefs, final List batch, long timestamp) throws Exception { final MutationMap mutationMap = convertToMutations(batch, timestamp); return clientPool.runWithRetryOnServer(host, new FunctionCheckedException() { @Override public Void apply(CassandraClient client) throws Exception { return wrappingQueryRunner.batchMutate("multiPut", client, tableRefs, mutationMap, WRITE_CONSISTENCY); } @Override public String toString() { return "batch_mutate(" + host.cassandraHostName() + ", " + tableRefs + ", " + batch.size() + " values)"; } }); } private static MutationMap convertToMutations(List batch, long timestamp) { MutationMap mutationMap = new MutationMap(); for (TableCellAndValue tableCellAndValue : batch) { Cell cell = tableCellAndValue.cell; Column col = CassandraKeyValueServices.createColumn(cell, Value.create(tableCellAndValue.value, timestamp)); ColumnOrSuperColumn colOrSup = new ColumnOrSuperColumn(); colOrSup.setColumn(col); Mutation mutation = new Mutation(); mutation.setColumn_or_supercolumn(colOrSup); mutationMap.addMutationForCell(cell, tableCellAndValue.tableRef, mutation); } return mutationMap; } /** * Truncate a table in the key-value store. *

* This is preferred to dropping and re-adding a table, as live schema changes can * be a complicated topic for distributed databases. *

* Requires all Cassandra nodes to be reachable. * * @param tableRef the name of the table to truncate. * @throws AtlasDbDependencyException if not all Cassandra nodes are reachable. * @throws RuntimeException if the table does not exist. */ @Override public void truncateTable(final TableReference tableRef) { truncateTables(ImmutableSet.of(tableRef)); } /** * Truncates tables in the key-value store. *

* This can be slightly faster than repeatedly truncating individual tables. *

* Requires all Cassandra nodes to be reachable. * * @param tablesToTruncate set od tables to truncate. * @throws AtlasDbDependencyException if not all Cassandra nodes are reachable. * @throws RuntimeException if the table does not exist. */ @Override public void truncateTables(final Set tablesToTruncate) { cassandraTableTruncator.truncateTables(tablesToTruncate); } /** * Deletes values from the key-value store. *

* Requires all Cassandra nodes to be up and available, otherwise throws an PalantirRuntimeException. * * @param tableRef the name of the table to delete values from. * @param keys map containing the keys to delete values for. * @throws PalantirRuntimeException if not all hosts respond successfully. */ @Override public void delete(TableReference tableRef, Multimap keys) { new CellDeleter( clientPool, wrappingQueryRunner, DELETE_CONSISTENCY, mutationTimestampProvider.getDeletionTimestampOperatorForBatchDelete()) .delete(tableRef, keys); } @VisibleForTesting CfDef getCfForTable(TableReference tableRef, byte[] rawMetadata, int gcGraceSeconds) { return ColumnFamilyDefinitions.getCfDef(config.getKeyspaceOrThrow(), tableRef, gcGraceSeconds, rawMetadata); } // TODO(unknown): after cassandra change: handle multiRanges @Override @Idempotent public Map, byte[]>> getFirstBatchForRanges( TableReference tableRef, Iterable rangeRequests, long timestamp) { int concurrency = config.rangesConcurrency(); return KeyValueServices.getFirstBatchForRangesUsingGetRangeConcurrent( executor, this, tableRef, rangeRequests, timestamp, concurrency); } // TODO(unknown): after cassandra change: handle reverse ranges // TODO(unknown): after cassandra change: handle column filtering /** * For each row in the specified range, returns the most recent version strictly before timestamp. Requires a * quorum of Cassandra nodes to be reachable. *

* Remember to close any {@link ClosableIterator}s you get in a finally block. * * @param rangeRequest the range to load. * @param timestamp specifies the maximum timestamp (exclusive) at which to retrieve each row's value. * @throws AtlasDbDependencyException if fewer than a quorum of Cassandra nodes are reachable. */ @Override @Idempotent public ClosableIterator> getRange( TableReference tableRef, RangeRequest rangeRequest, long timestamp) { return rangeLoader.getRange(tableRef, rangeRequest, timestamp); } /** * Gets timestamp values from the key-value store. For each row, this returns all associated * timestamps < given_ts. *

* This method has stronger consistency guarantees than regular read requests. This must return all timestamps * stored anywhere in the system (because of sweep). Unless all nodes are up and available, this method will * throw an InsufficientConsistencyException. * * @param tableRef the name of the table to read from. * @param rangeRequest the range to load. * @param timestamp the maximum timestamp to load. * @throws InsufficientConsistencyException if not all hosts respond successfully. */ @Override @Idempotent public ClosableIterator>> getRangeOfTimestamps( TableReference tableRef, RangeRequest rangeRequest, long timestamp) { CandidateCellForSweepingRequest request = ImmutableCandidateCellForSweepingRequest.builder() .startRowInclusive(rangeRequest.getStartInclusive()) .maxTimestampExclusive(timestamp) .shouldCheckIfLatestValueIsEmpty(false) .shouldDeleteGarbageCollectionSentinels(true) .build(); return getCandidateRowsForSweeping("getRangeOfTimestamps", tableRef, request) .flatMap(rows -> rows) .map(CandidateRowForSweeping::toRowResult) .stopWhen(rowResult -> !rangeRequest.inRange(rowResult.getRowName())); } @Override public ClosableIterator> getCandidateCellsForSweeping( TableReference tableRef, CandidateCellForSweepingRequest request) { return getCandidateRowsForSweeping("getCandidateCellsForSweeping", tableRef, request) .map(rows -> rows.stream() .map(CandidateRowForSweeping::cells) .flatMap(List::stream) .collect(Collectors.toList())); } private ClosableIterator> getCandidateRowsForSweeping( String kvsMethodName, TableReference tableRef, CandidateCellForSweepingRequest request) { RowGetter rowGetter = new RowGetter(clientPool, queryRunner, ConsistencyLevel.ALL, tableRef); return new CandidateRowsForSweepingIterator( (iteratorTableRef, cells, maxTimestampExclusive) -> get(kvsMethodName, iteratorTableRef, cells, maxTimestampExclusive), newInstrumentedCqlExecutor(), rowGetter, tableRef, request, 
runtimeConfig.map(CassandraKeyValueServiceRuntimeConfig::sweepReadThreads)); } /** * Returns a sorted list of row keys in the specified range; see * {@link CassandraKeyValueService#getRowKeysInRange(TableReference, byte[], byte[], int)}. *

* Implementation specific: this method specifically does not read any of the columns and can therefore be used * in the presence of wide rows. However, as a side-effect, it may return row where the row only contains Cassandra * tombstones. */ @Override public List getRowKeysInRange(TableReference tableRef, byte[] startRow, byte[] endRow, int maxResults) { RowGetter rowGetter = new RowGetter(clientPool, queryRunner, ConsistencyLevel.QUORUM, tableRef); return rowGetter.getRowKeysInRange(startRow, endRow, maxResults); } private CqlExecutor newInstrumentedCqlExecutor() { return AtlasDbMetrics.instrument( metricsManager.getRegistry(), CqlExecutor.class, new CqlExecutorImpl(clientPool, ConsistencyLevel.ALL)); } /** * Drop the table, and also delete its table metadata. Requires a quorum of Cassandra nodes to be reachable. * * @param tableRef the name of the table to drop. * @throws AtlasDbDependencyException if fewer than a quorum of Cassandra nodes are reachable, or the cluster * cannot come to an agreement on schema versions. Note that this method is not atomic: if quorum is lost during * its execution or Cassandra nodes fail to settle on a schema version after the Cassandra schema is mutated, we * may drop the tables, but fail to to persist the changes to the _metadata table. * @throws UncheckedExecutionException if there are multiple schema mutation lock tables. */ @Override public void dropTable(final TableReference tableRef) { dropTables(ImmutableSet.of(tableRef)); } /** * Drop the tables, and also delete their table metadata. Requires a quorum of Cassandra nodes to be reachable. *

* Main gains here vs. dropTable: * - problems excepting, we will basically be serializing a rapid series of schema changes * through a single host checked out from the client pool, so reduced chance of schema disagreement issues * - client-side in-memory lock to prevent misbehaving callers from shooting themselves in the foot * - one less round trip * * @param tablesToDrop the set of tables to drop. * @throws AtlasDbDependencyException if fewer than a quorum of Cassandra nodes are reachable, or the cluster * cannot come to an agreement on schema versions. Note that this method is not atomic: if quorum is lost during * its execution or Cassandra nodes fail to settle on a schema version after the Cassandra schema is mutated, we * may drop the tables, but fail to to persist the changes to the _metadata table. * @throws UncheckedExecutionException if there are multiple schema mutation lock tables. */ @Override public void dropTables(final Set tablesToDrop) { cassandraTableDropper.dropTables(tablesToDrop); } /** * Creates a table with the specified name. If the table already exists, no action is performed * (the table is left in its current state). Requires a quorum of Cassandra nodes to be reachable. * * @param tableRef the name of the table to create. * @param metadata the metadata of the table to create. * @throws AtlasDbDependencyException if fewer than a quorum of Cassandra nodes are reachable, or the cluster * cannot come to an agreement on schema versions. Note that this method is not atomic: if quorum is lost during * its execution or Cassandra nodes fail to settle on a schema version after the Cassandra schema is mutated, we * may fail to persist the changes to the _metadata table. * @throws UncheckedExecutionException if there are multiple schema mutation lock tables. */ @Override public void createTable(final TableReference tableRef, final byte[] metadata) { createTables(ImmutableMap.of(tableRef, metadata)); } /** * Creates a table with the specified name. 
If the table already exists, no action is performed * (the table is left in its current state). *

* Requires a quorum of Cassandra nodes to be up and available. *

* Main gains here vs. createTable: * - problems excepting, we will basically be serializing a rapid series of schema changes * through a single host checked out from the client pool, so reduced chance of schema disagreement issues * - client-side in-memory lock to prevent misbehaving callers from shooting themselves in the foot * - one less round trip *

* createTables(existingTable, newMetadata) can perform a metadata-only update. Additionally, it is possible * that this metadata-only update performs a schema mutation by altering the CFDef (e. g., user changes metadata * of existing table to have new compression block size). This does not require the schema mutation lock, as it * does not alter the CfId * * @param tablesToMetadata a mapping of names of tables to create to their respective metadata. * @throws AtlasDbDependencyException if fewer than a quorum of Cassandra nodes are reachable, or the cluster * cannot come to an agreement on schema versions. Note that this method is not atomic: if quorum is lost during * its execution or Cassandra nodes fail to settle on a schema version after the Cassandra schema is mutated, we * may fail to persist the changes to the _metadata table. * @throws UncheckedExecutionException if there are multiple schema mutation lock tables. */ @Override public void createTables(final Map tablesToMetadata) { Map tablesToCreate = tableMetadata.filterOutExistingTables(tablesToMetadata); Map tablesToAlter = tableMetadata.filterOutNoOpMetadataChanges(tablesToMetadata); boolean onlyMetadataChangesAreForNewTables = tablesToAlter.keySet().equals(tablesToCreate.keySet()); boolean putMetadataWillNeedASchemaChange = !onlyMetadataChangesAreForNewTables; if (!tablesToCreate.isEmpty()) { LoggingArgs.SafeAndUnsafeTableReferences safeAndUnsafe = LoggingArgs.tableRefs(tablesToCreate.keySet()); log.info("Creating tables {} and {}", safeAndUnsafe.safeTableRefs(), safeAndUnsafe.unsafeTableRefs()); cassandraTableCreator.createTables(tablesToCreate); } internalPutMetadataForTables(tablesToAlter, putMetadataWillNeedASchemaChange); } /** * Return the list of tables stored in this key value service. Requires a quorum of Cassandra nodes to be reachable * and agree on schema versions. *

* This will not contain the names of any hidden tables (e. g., the _metadata table). * * @return a set of TableReferences (table names) for all the visible tables * @throws AtlasDbDependencyException if fewer than a quorum of Cassandra nodes are reachable, or the cluster * cannot come to an agreement on schema versions. */ @Override public Set getAllTableNames() { return cassandraTables .getTableReferencesWithoutFiltering() .filter(tr -> !HiddenTables.isHidden(tr)) .collect(Collectors.toSet()); } /** * Gets the metadata for a given table. Do not use this method to see if a table exists as it can return false * positives. Requires a quorum of Cassandra nodes to be reachable. * * @param tableRef the name of the table to get metadata for. * @return a byte array representing the metadata for the table. Array is empty if no table * with the given name exists. Consider {@link TableMetadata#BYTES_HYDRATOR} for hydrating. * @throws AtlasDbDependencyException if fewer than a quorum of Cassandra nodes are reachable. 
*/ @Override public byte[] getMetadataForTable(TableReference tableRef) { // try and get with a single-key lookup String lowerCaseTableName = tableRef.getQualifiedName().toLowerCase(Locale.ROOT); Map rows = getRows( AtlasDbConstants.DEFAULT_METADATA_TABLE, ImmutableSet.of(lowerCaseTableName.getBytes(StandardCharsets.UTF_8)), ColumnSelection.all(), Long.MAX_VALUE); if (!rows.isEmpty()) { return Iterables.getOnlyElement(rows.values()).getContents(); } // if unsuccessful with fast code-path, we need to check if this table exists but was written at a key // before we started enforcing only writing lower-case canonicalised versions of keys return Optional.ofNullable(getMetadataForTables().get(tableRef)).orElse(AtlasDbConstants.EMPTY_TABLE_METADATA); } private static boolean matchingIgnoreCase(@Nullable TableReference t1, TableReference t2) { if (t1 != null) { return t1.getQualifiedName().equalsIgnoreCase(t2.getQualifiedName()); } else { return t2 == null; } } /** * Gets the metadata for all non-hidden tables. Requires a quorum of Cassandra nodes to be reachable. * * @return a mapping of table names to their respective metadata in form of a byte array. Consider * {@link TableMetadata#BYTES_HYDRATOR} for hydrating. * @throws AtlasDbDependencyException if fewer than a quorum of Cassandra nodes are available. */ @Override public Map getMetadataForTables() { return tableMetadata.getMetadataForTables(); } /** * Records the specified metadata for a given table. Requires a quorum of Cassandra nodes to be reachable. * * @param tableRef the name of the table to record metadata for. * @param meta a byte array representing the metadata to record. * @throws AtlasDbDependencyException if fewer than a quorum of Cassandra nodes are reachable, or the cluster * cannot come to an agreement on schema versions. 
Note that this method is not atomic: if quorum is lost during * its execution or Cassandra nodes fail to settle on a schema version after the Cassandra schema is mutated, we * may fail to persist the changes to the _metadata table. */ @Override public void putMetadataForTable(final TableReference tableRef, final byte[] meta) { putMetadataForTables(ImmutableMap.of(tableRef, meta)); } /** * For each specified table records the respective metadata. Requires a quorum of Cassandra nodes to be reachable. * * @param tableRefToMetadata a mapping from each table's name to the respective byte array representing * the metadata to record. * @throws AtlasDbDependencyException if fewer than a quorum of Cassandra nodes are reachable, or the cluster * cannot come to an agreement on schema versions. Note that this method is not atomic: if quorum is lost during * its execution or Cassandra nodes fail to settle on a schema version after the Cassandra schema is mutated, we * may fail to persist the changes to the _metadata table. */ @Override public void putMetadataForTables(final Map tableRefToMetadata) { internalPutMetadataForTables(tableRefToMetadata, true); } @SuppressWarnings("checkstyle:RegexpSinglelineJava") private void internalPutMetadataForTables( Map tableRefToMetadata, boolean possiblyNeedToPerformSettingsChanges) { if (tableRefToMetadata.isEmpty()) { return; } Map tableRefToNewCell = Maps.transformEntries( tableRefToMetadata, (tableRef, metadata) -> CassandraKeyValueServices.getMetadataCell(tableRef)); Map tableRefToOldCell = Maps.transformEntries( tableRefToMetadata, (tableRef, metadata) -> CassandraKeyValueServices.getOldMetadataCell(tableRef)); // technically we're racing other nodes from here on, during an update period, // but the penalty for not caring is just some superfluous schema mutations and a // few dead rows in the metadata table. 
Map existingMetadataAtNewName = get( AtlasDbConstants.DEFAULT_METADATA_TABLE, tableRefToNewCell.values().stream() .collect(Collectors.toMap(Functions.identity(), Functions.constant(Long.MAX_VALUE)))); Map existingMetadataAtOldName = get( AtlasDbConstants.DEFAULT_METADATA_TABLE, tableRefToOldCell.values().stream() .collect(Collectors.toMap(Functions.identity(), Functions.constant(Long.MAX_VALUE)))); final Map updatedMetadata = new HashMap<>(); final Set updatedCfs = new HashSet<>(); tableRefToNewCell.forEach((tableRef, newCell) -> { if (existingMetadataAtNewName.containsKey(newCell)) { if (metadataIsDifferent( existingMetadataAtNewName.get(newCell).getContents(), tableRefToMetadata.get(tableRef))) { // found existing metadata at new name, but we're performing an update updatedMetadata.put(newCell, tableRefToMetadata.get(tableRef)); updatedCfs.add(getCfForTable(tableRef, tableRefToMetadata.get(tableRef), config.gcGraceSeconds())); } } else if (existingMetadataAtOldName.containsKey(tableRefToOldCell.get(tableRef))) { if (metadataIsDifferent( existingMetadataAtOldName .get(tableRefToOldCell.get(tableRef)) .getContents(), tableRefToMetadata.get(tableRef))) { // found existing metadata at old name, but we're performing an update updatedMetadata.put(tableRefToOldCell.get(tableRef), tableRefToMetadata.get(tableRef)); updatedCfs.add(getCfForTable(tableRef, tableRefToMetadata.get(tableRef), config.gcGraceSeconds())); } } else { // didn't find an existing metadata at old or new names, this is completely new; // thus, let's write it out with the new format updatedMetadata.put(tableRefToNewCell.get(tableRef), tableRefToMetadata.get(tableRef)); updatedCfs.add(getCfForTable(tableRef, tableRefToMetadata.get(tableRef), config.gcGraceSeconds())); } }); if (!updatedMetadata.isEmpty()) { putMetadataAndMaybeAlterTables(possiblyNeedToPerformSettingsChanges, updatedMetadata, updatedCfs); } } private static boolean metadataIsDifferent(byte[] existingMetadata, byte[] requestMetadata) { 
return !Arrays.equals(existingMetadata, requestMetadata); } private void putMetadataAndMaybeAlterTables( boolean possiblyNeedToPerformSettingsChanges, Map newMetadata, Collection updatedCfs) { try { clientPool.runWithRetry(client -> { if (possiblyNeedToPerformSettingsChanges) { for (CfDef cf : updatedCfs) { client.system_update_column_family(cf); } CassandraKeyValueServices.waitForSchemaVersions( config.schemaMutationTimeoutMillis(), client, schemaChangeDescriptionForPutMetadataForTables(updatedCfs)); } // Done with actual schema mutation, push the metadata put(AtlasDbConstants.DEFAULT_METADATA_TABLE, newMetadata, System.currentTimeMillis()); return null; }); } catch (Exception e) { throw Throwables.unwrapAndThrowAtlasDbDependencyException(e); } } private static String schemaChangeDescriptionForPutMetadataForTables(Collection updatedCfs) { String tables = updatedCfs.stream() .map(CassandraKeyValueServices::tableReferenceFromCfDef) .map(Object::toString) .collect(Collectors.toList()) .toString(); return String.format( "after updating the column family for tables %s in a call to put metadata for tables", tables); } @Override public void deleteRange(final TableReference tableRef, final RangeRequest range) { if (range.equals(RangeRequest.all())) { try { cassandraTableTruncator.truncateTables(ImmutableSet.of(tableRef)); } catch (AtlasDbDependencyException e) { log.info( "Tried to make a deleteRange({}, RangeRequest.all())" + " into a more garbage-cleanup friendly truncate(), but this failed.", LoggingArgs.tableRef(tableRef), e); super.deleteRange(tableRef, range); } } else if (isForSingleRow(range.getStartInclusive(), range.getEndExclusive())) { try { long timestamp = mutationTimestampProvider.getRemoveTimestamp(); byte[] row = range.getStartInclusive(); clientPool.runWithRetry(client -> { client.remove("deleteRange", tableRef, row, timestamp, DELETE_CONSISTENCY); return null; }); } catch (RetryLimitReachedException e) { throw 
CassandraUtils.wrapInIceForDeleteOrRethrow(e); } catch (TException e) { throw Throwables.unwrapAndThrowAtlasDbDependencyException(e); } } else { super.deleteRange(tableRef, range); } } private static boolean isForSingleRow(byte[] startInclusive, byte[] endExclusive) { if (startInclusive.length == 0 || endExclusive.length == 0) { return false; } return Arrays.equals(endExclusive, RangeRequests.nextLexicographicName(startInclusive)); } @Override public void deleteRows(TableReference tableRef, Iterable rows) { Set actualKeys = StreamSupport.stream(rows.spliterator(), false) .map(ByteBuffer::wrap) .collect(Collectors.toSet()); if (actualKeys.isEmpty()) { return; } long timestamp = mutationTimestampProvider.getRemoveTimestamp(); Map>> mutationMap = KeyedStream.of(actualKeys) .map(row -> new Deletion().setTimestamp(timestamp)) .map(deletion -> new Mutation().setDeletion(deletion)) .map(mutation -> keyMutationMapByColumnFamily(tableRef, mutation)) .collectToMap(); try { clientPool.runWithRetry(client -> { client.batch_mutate("deleteRows", mutationMap, DELETE_CONSISTENCY); return null; }); } catch (RetryLimitReachedException e) { throw CassandraUtils.wrapInIceForDeleteOrRethrow(e); } catch (TException e) { throw Throwables.unwrapAndThrowAtlasDbDependencyException(e); } } private static Map> keyMutationMapByColumnFamily( TableReference tableRef, Mutation mutation) { return ImmutableMap.of(AbstractKeyValueService.internalTableName(tableRef), ImmutableList.of(mutation)); } @Override public void deleteAllTimestamps(TableReference tableRef, Map deletes) { new CellRangeDeleter( clientPool, wrappingQueryRunner, DELETE_CONSISTENCY, mutationTimestampProvider::getRangeTombstoneTimestamp) .deleteAllTimestamps(tableRef, deletes); } /** * Performs non-destructive cleanup when the KVS is no longer needed. 
*/ @Override public void close() { clientPool.shutdown(); asyncKeyValueService.close(); super.close(); } /** * Adds a value with timestamp = Value.INVALID_VALUE_TIMESTAMP to each of the given cells. If * a value already exists at that time stamp, nothing is written for that cell. *

* Requires a quorum of Cassandra nodes to be reachable. * * @param tableRef the name of the table to add the value to. * @param cells a set of cells to store the values in. * @throws AtlasDbDependencyException if fewer than a quorum of Cassandra nodes are reachable. */ @Override public void addGarbageCollectionSentinelValues(TableReference tableRef, Iterable cells) { try { final Value value = Value.create(PtBytes.EMPTY_BYTE_ARRAY, Value.INVALID_VALUE_TIMESTAMP); cellValuePutter.putWithOverriddenTimestamps( "addGarbageCollectionSentinelValues", tableRef, Iterables.transform(cells, cell -> Maps.immutableEntry(cell, value))); } catch (Exception e) { throw Throwables.unwrapAndThrowAtlasDbDependencyException(e); } } /** * Gets timestamp values from the key-value store. For each cell, this returns all associated * timestamps < given_ts. *

* This method has stronger consistency guarantees than regular read requests. This must return * all timestamps stored anywhere in the system (because of sweep). Unless all nodes are up and available, this * method will throw a PalantirRuntimeException. * * @param tableRef the name of the table to retrieve timestamps from. * @param cells set containg cells to retrieve timestamps for. * @param ts maximum timestamp to get (exclusive). * @return multimap of timestamps by cell * @throws AtlasDbDependencyException if not all Cassandra nodes are reachable. */ @Override public Multimap getAllTimestamps(TableReference tableRef, Set cells, long ts) { return cellLoader.getAllTimestamps(tableRef, cells, ts, DELETE_CONSISTENCY); } /** * Puts values into the key-value store. This call does not guarantee * atomicity across cells. On failure, it is possible that some of the requests will * have succeeded (without having been rolled back). Similarly, concurrent batched requests may * interleave. However, concurrent writes to the same Cell will not both report success. * One of them will throw {@link KeyAlreadyExistsException}. *

* Requires a quorum of Cassandra nodes to be reachable. * * @param tableRef the name of the table to put values into. * @param values map containing the key-value entries to put. * @throws AtlasDbDependencyException if fewer than a quorum of Cassandra nodes are reachable. * @throws KeyAlreadyExistsException if you are putting a Cell with the same timestamp as one that already exists. */ @Override public void putUnlessExists(final TableReference tableRef, final Map values) throws KeyAlreadyExistsException { try { Optional failure = clientPool.runWithRetry(client -> { Map> partitionedEntries = partitionPerRow(values); for (Map.Entry> partition : partitionedEntries.entrySet()) { CASResult casResult = putUnlessExistsSinglePartition(tableRef, client, partition.getKey(), partition.getValue()); if (!casResult.isSuccess()) { return Optional.of(new KeyAlreadyExistsException( "The cells in the table already exist.", casResult.getCurrent_values().stream() .map(column -> Cell.create( partition.getKey().toByteArray(), CassandraKeyValueServices.decomposeColumn(column.bufferForName()) .columnName())) .collect(Collectors.toList()), LoggingArgs.tableRef(tableRef))); } } return Optional.empty(); }); failure.ifPresent(exception -> { throw exception; }); } catch (KeyAlreadyExistsException e) { throw e; } catch (Exception e) { throw Throwables.unwrapAndThrowAtlasDbDependencyException(e); } } @Override public void setOnce(TableReference tableRef, Map values) { try { cellValuePutter.set( "setOnce", tableRef, KeyValueServices.toConstantTimestampValues(values.entrySet(), AtlasDbConstants.TRANSACTION_TS)); } catch (Exception e) { throw Throwables.unwrapAndThrowAtlasDbDependencyException(e); } } public static Map> partitionPerRow(Map values) { return values.entrySet().stream() .collect(Collectors.groupingBy( entry -> ByteString.copyFrom(entry.getKey().getRowName()), Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue))); } private static CASResult putUnlessExistsSinglePartition( 
TableReference tableRef, CassandraClient client, ByteString row, Map partition) throws TException { return client.put_unless_exists( tableRef, ByteBuffer.wrap(row.toByteArray()), partition.entrySet().stream() .map(CassandraKeyValueServiceImpl::prepareColumnForPutUnlessExists) .collect(Collectors.toList()), ConsistencyLevel.SERIAL, WRITE_CONSISTENCY); } private static Column prepareColumnForPutUnlessExists(Map.Entry insertion) { return new Column(CassandraKeyValueServices.makeCompositeBuffer( insertion.getKey().getColumnName(), // Atlas timestamp CassandraConstants.CAS_TABLE_TIMESTAMP)) // Cassandra timestamp .setTimestamp(CassandraConstants.CAS_TABLE_TIMESTAMP) .setValue(insertion.getValue()); } @Override public CheckAndSetCompatibility getCheckAndSetCompatibility() { return CheckAndSetCompatibility.supportedBuilder() .supportsMultiCheckAndSetOperations(true) .supportsDetailOnFailure(true) .consistentOnFailure(false) .build(); } /** * Performs a check-and-set into the key-value store. * Please see {@link CheckAndSetRequest} for information about how to create this request, * and {@link KeyValueService} for more detailed documentation. *

* Does not require all Cassandra nodes to be up and available, works as long as quorum is achieved. * * @param request the request, including table, cell, old value and new value. * @throws CheckAndSetException if the stored value for the cell was not as expected. */ @Override public void checkAndSet(final CheckAndSetRequest request) throws CheckAndSetException { try { CheckAndSetResult casResult = clientPool.runWithRetry(client -> checkAndSetRunner.executeCheckAndSet(client, request)); if (!casResult.successful()) { List currentValues = casResult.existingValues().stream() .map(ByteString::toByteArray) .collect(Collectors.toList()); throw new CheckAndSetException( request.cell(), request.table(), request.oldValue().orElse(null), currentValues); } } catch (CheckAndSetException e) { throw e; } catch (Exception e) { throw Throwables.unwrapAndThrowAtlasDbDependencyException(e); } } /** * Performs a check-and-set for multiple cells in a row into the key-value store. * Please see {@link MultiCheckAndSetRequest} for information about how to create this request, * and {@link KeyValueService} for more detailed documentation. *

* If the call completes successfully, then you know that the old cells initially had the values you expected. * In this case, you can be sure that all your cells have been updated to their new values. * If the old cells initially did not have the values you expected, none of the cells will be updated and * {@link MultiCheckAndSetException} will be thrown. * Reads concurrent with this operation will not see a partial update. *

* Another thing to note is that the check operation will **only be performed on values of cells that are declared * in the set of expected values** i.e. the check operation DOES NOT take updates into account. *

* Does not require all Cassandra nodes to be up and available, works as long as quorum is achieved. * * @param request the request, including table, rowName, old values and new values. * @throws MultiCheckAndSetException if the stored values for the cells were not as expected. */ @Override public void multiCheckAndSet(MultiCheckAndSetRequest request) throws MultiCheckAndSetException { TableReference tableRef = request.tableRef(); ByteBuffer row = ByteBuffer.wrap(request.rowName()); List oldCol = request.expected().entrySet().stream() .map(CassandraKeyValueServiceImpl::prepareColumnForPutUnlessExists) .collect(Collectors.toList()); List newCol = request.updates().entrySet().stream() .map(CassandraKeyValueServiceImpl::prepareColumnForPutUnlessExists) .collect(Collectors.toList()); try { CASResult casResult = clientPool.runWithRetry(client -> client.cas(tableRef, row, oldCol, newCol, ConsistencyLevel.SERIAL, ConsistencyLevel.EACH_QUORUM)); if (!casResult.isSuccess()) { Map currentValues = KeyedStream.of(casResult.getCurrent_values()) .mapKeys(column -> Cell.create( request.rowName(), CassandraKeyValueServices.decomposeColumn(column.bufferForName()) .columnName())) .map(Column::getValue) .collectToMap(); throw new MultiCheckAndSetException( LoggingArgs.tableRef(tableRef), request.rowName(), request.expected(), currentValues); } } catch (MultiCheckAndSetException e) { throw e; } catch (Exception e) { log.error("Error while executing multi-checkAndSet operation.", e); throw Throwables.unwrapAndThrowAtlasDbDependencyException(e); } } @Override public void compactInternally(TableReference tableRef) { log.info( "Called compactInternally on {}, but this is a no-op for Cassandra KVS." 
+ "Cassandra should eventually decide to compact this table for itself.", LoggingArgs.tableRef(tableRef)); } @Override public ClusterAvailabilityStatus getClusterAvailabilityStatus() { ClusterAvailabilityStatus clusterStatus = getStatusByRunningOperationsOnEachHost(); if (isClusterQuorumAvaialble(clusterStatus) && !doesConfigReplicationFactorMatchWithCluster()) { return ClusterAvailabilityStatus.TERMINAL; } return clusterStatus; } @Override public boolean sweepsEntriesInStrictlyNonDecreasingFashion() { return true; } private static boolean isClusterQuorumAvaialble(ClusterAvailabilityStatus clusterStatus) { return clusterStatus.equals(ClusterAvailabilityStatus.ALL_AVAILABLE) || clusterStatus.equals(ClusterAvailabilityStatus.QUORUM_AVAILABLE); } private boolean doesConfigReplicationFactorMatchWithCluster() { return clientPool.runWithRetry(client -> { try { CassandraVerifier.currentRfOnKeyspaceMatchesDesiredRf(client, verifierConfig); return true; } catch (Exception e) { log.warn("The config and Cassandra cluster do not agree on the replication factor.", e); return false; } }); } private ClusterAvailabilityStatus getStatusByRunningOperationsOnEachHost() { int countUnreachableNodes = 0; for (CassandraServer server : clientPool.getCurrentPools().keySet()) { try { clientPool.runOnCassandraServer(server, CassandraVerifier.healthCheck); if (!partitionerIsValid(server)) { return ClusterAvailabilityStatus.TERMINAL; } } catch (Exception e) { countUnreachableNodes++; } } return getNodeAvailabilityStatus(countUnreachableNodes); } private boolean partitionerIsValid(CassandraServer host) { try { clientPool.runOnCassandraServer(host, clientPool.getValidatePartitioner()); return true; } catch (Exception e) { return false; } } private ClusterAvailabilityStatus getNodeAvailabilityStatus(int countUnreachableNodes) { if (countUnreachableNodes == 0) { return ClusterAvailabilityStatus.ALL_AVAILABLE; } else if (isQuorumAvailable(countUnreachableNodes)) { return 
ClusterAvailabilityStatus.QUORUM_AVAILABLE; } else { return ClusterAvailabilityStatus.NO_QUORUM_AVAILABLE; } } private boolean isQuorumAvailable(int countUnreachableNodes) { int replicationFactor = runtimeConfig.get().replicationFactor(); return countUnreachableNodes < (replicationFactor + 1) / 2; } @Override public CassandraClientPool getClientPool() { return clientPool; } @Override public TracingQueryRunner getTracingQueryRunner() { return queryRunner; } @Override public CassandraTables getCassandraTables() { return cassandraTables; } @Override public boolean performanceIsSensitiveToTombstones() { return true; } /** * Asynchronously gets values from the cassandra key-value store. * * @param tableRef the name of the table to retrieve values from. * @param timestampByCell specifies, for each row, the maximum timestamp (exclusive) at which to * retrieve that rows's value. * @return listenable future map of retrieved values. Values which do not exist (either * because they were deleted or never created in the first place) * are simply not returned. */ @Override public ListenableFuture> getAsync(TableReference tableRef, Map timestampByCell) { if (timestampByCell.isEmpty()) { log.info("Attempted get with no specified cells", LoggingArgs.tableRef(tableRef)); return Futures.immediateFuture(ImmutableMap.of()); } if (asyncKeyValueService.isValid()) { try { return Futures.catching( asyncKeyValueService.getAsync(tableRef, timestampByCell), IllegalStateException.class, e -> { log.warn( "CQL Client closed during getAsync. Delegating to synchronous get. 
This should be" + " very rare, and only happen once after the Cassandra Server list has" + " changed.", e); return this.get(tableRef, timestampByCell); }, executor); } catch (IllegalStateException | DriverInternalError e) { // If the container is closed, or we've reloaded into an invalid ThrowingCqlClient, after testing for // validity return Futures.immediateFuture(this.get(tableRef, timestampByCell)); } } else { return Futures.immediateFuture(this.get(tableRef, timestampByCell)); } } private static class TableCellAndValue { private static byte[] extractRowName(TableCellAndValue input) { return input.cell.getRowName(); } private static Long getSize(TableCellAndValue input) { return input.value.length + Cells.getApproxSizeOfCell(input.cell); } private final TableReference tableRef; private final Cell cell; private final byte[] value; TableCellAndValue(TableReference tableRef, Cell cell, byte[] value) { this.tableRef = tableRef; this.cell = cell; this.value = value; } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy