/*
* (c) Copyright 2018 Palantir Technologies Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.palantir.atlasdb.keyvalue.cassandra;
import com.codahale.metrics.Counter;
import com.datastax.driver.core.exceptions.DriverInternalError;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Functions;
import com.google.common.base.Predicates;
import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterables;
import com.google.common.collect.Iterators;
import com.google.common.collect.ListMultimap;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimap;
import com.google.common.collect.Multimaps;
import com.google.common.collect.SetMultimap;
import com.google.common.collect.Sets;
import com.google.common.util.concurrent.Futures;
import com.google.common.util.concurrent.ListenableFuture;
import com.google.common.util.concurrent.UncheckedExecutionException;
import com.google.protobuf.ByteString;
import com.palantir.async.initializer.AsyncInitializer;
import com.palantir.atlasdb.AtlasDbConstants;
import com.palantir.atlasdb.AtlasDbMetricNames.CellFilterMetrics;
import com.palantir.atlasdb.CassandraTopologyValidationMetrics;
import com.palantir.atlasdb.cassandra.CassandraKeyValueServiceConfig;
import com.palantir.atlasdb.cassandra.CassandraKeyValueServiceRuntimeConfig;
import com.palantir.atlasdb.cassandra.CassandraMutationTimestampProvider;
import com.palantir.atlasdb.cassandra.CassandraMutationTimestampProviders;
import com.palantir.atlasdb.cassandra.CassandraServersConfigs.CassandraServersConfig;
import com.palantir.atlasdb.encoding.PtBytes;
import com.palantir.atlasdb.keyvalue.api.AsyncKeyValueService;
import com.palantir.atlasdb.keyvalue.api.BatchColumnRangeSelection;
import com.palantir.atlasdb.keyvalue.api.CandidateCellForSweeping;
import com.palantir.atlasdb.keyvalue.api.CandidateCellForSweepingRequest;
import com.palantir.atlasdb.keyvalue.api.Cell;
import com.palantir.atlasdb.keyvalue.api.CheckAndSetCompatibility;
import com.palantir.atlasdb.keyvalue.api.CheckAndSetException;
import com.palantir.atlasdb.keyvalue.api.CheckAndSetRequest;
import com.palantir.atlasdb.keyvalue.api.ClusterAvailabilityStatus;
import com.palantir.atlasdb.keyvalue.api.ColumnSelection;
import com.palantir.atlasdb.keyvalue.api.ImmutableCandidateCellForSweepingRequest;
import com.palantir.atlasdb.keyvalue.api.InsufficientConsistencyException;
import com.palantir.atlasdb.keyvalue.api.KeyAlreadyExistsException;
import com.palantir.atlasdb.keyvalue.api.KeyValueService;
import com.palantir.atlasdb.keyvalue.api.MultiCheckAndSetException;
import com.palantir.atlasdb.keyvalue.api.MultiCheckAndSetRequest;
import com.palantir.atlasdb.keyvalue.api.RangeRequest;
import com.palantir.atlasdb.keyvalue.api.RangeRequests;
import com.palantir.atlasdb.keyvalue.api.RetryLimitReachedException;
import com.palantir.atlasdb.keyvalue.api.RowColumnRangeIterator;
import com.palantir.atlasdb.keyvalue.api.RowResult;
import com.palantir.atlasdb.keyvalue.api.TableReference;
import com.palantir.atlasdb.keyvalue.api.TimestampRangeDelete;
import com.palantir.atlasdb.keyvalue.api.Value;
import com.palantir.atlasdb.keyvalue.cassandra.CassandraClientPoolImpl.StartupChecks;
import com.palantir.atlasdb.keyvalue.cassandra.CassandraKeyValueServices.ColumnAndTimestamp;
import com.palantir.atlasdb.keyvalue.cassandra.CassandraKeyValueServices.StartTsResultsCollector;
import com.palantir.atlasdb.keyvalue.cassandra.CassandraVerifier.CassandraVerifierConfig;
import com.palantir.atlasdb.keyvalue.cassandra.RowColumnRangeExtractor.RowColumnRangeResult;
import com.palantir.atlasdb.keyvalue.cassandra.async.client.creation.ClusterFactory.CassandraClusterConfig;
import com.palantir.atlasdb.keyvalue.cassandra.cas.CheckAndSetRunner;
import com.palantir.atlasdb.keyvalue.cassandra.paging.RowGetter;
import com.palantir.atlasdb.keyvalue.cassandra.pool.CassandraServer;
import com.palantir.atlasdb.keyvalue.cassandra.sweep.CandidateRowForSweeping;
import com.palantir.atlasdb.keyvalue.cassandra.sweep.CandidateRowsForSweepingIterator;
import com.palantir.atlasdb.keyvalue.cassandra.thrift.MutationMap;
import com.palantir.atlasdb.keyvalue.cassandra.thrift.SlicePredicates;
import com.palantir.atlasdb.keyvalue.cassandra.thrift.SlicePredicates.Limit;
import com.palantir.atlasdb.keyvalue.cassandra.thrift.SlicePredicates.Range;
import com.palantir.atlasdb.keyvalue.impl.AbstractKeyValueService;
import com.palantir.atlasdb.keyvalue.impl.Cells;
import com.palantir.atlasdb.keyvalue.impl.CheckAndSetResult;
import com.palantir.atlasdb.keyvalue.impl.IterablePartitioner;
import com.palantir.atlasdb.keyvalue.impl.KeyValueServices;
import com.palantir.atlasdb.keyvalue.impl.LocalRowColumnRangeIterator;
import com.palantir.atlasdb.logging.LoggingArgs;
import com.palantir.atlasdb.table.description.TableMetadata;
import com.palantir.atlasdb.util.AnnotatedCallable;
import com.palantir.atlasdb.util.AnnotationType;
import com.palantir.atlasdb.util.AtlasDbMetrics;
import com.palantir.atlasdb.util.MetricsManager;
import com.palantir.atlasdb.util.MetricsManagers;
import com.palantir.common.annotation.Idempotent;
import com.palantir.common.base.ClosableIterator;
import com.palantir.common.base.ClosableIterators;
import com.palantir.common.base.FunctionCheckedException;
import com.palantir.common.base.Throwables;
import com.palantir.common.exception.AtlasDbDependencyException;
import com.palantir.common.exception.PalantirRuntimeException;
import com.palantir.common.streams.KeyedStream;
import com.palantir.logsafe.Preconditions;
import com.palantir.logsafe.SafeArg;
import com.palantir.logsafe.logger.SafeLogger;
import com.palantir.logsafe.logger.SafeLoggerFactory;
import com.palantir.refreshable.Refreshable;
import com.palantir.tritium.metrics.registry.TaggedMetricRegistry;
import com.palantir.util.paging.AbstractPagingIterable;
import com.palantir.util.paging.SimpleTokenBackedResultsPage;
import com.palantir.util.paging.TokenBackedBasicResultsPage;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.IdentityHashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;
import javax.annotation.Nullable;
import one.util.streamex.EntryStream;
import org.apache.cassandra.thrift.CASResult;
import org.apache.cassandra.thrift.CfDef;
import org.apache.cassandra.thrift.Column;
import org.apache.cassandra.thrift.ColumnOrSuperColumn;
import org.apache.cassandra.thrift.ConsistencyLevel;
import org.apache.cassandra.thrift.Deletion;
import org.apache.cassandra.thrift.KeyPredicate;
import org.apache.cassandra.thrift.KsDef;
import org.apache.cassandra.thrift.Mutation;
import org.apache.cassandra.thrift.SlicePredicate;
import org.apache.thrift.TException;
/**
* Each service can have one or more C* KVS instances.
* For each C* KVS, it maintains a list of active nodes and the client connections attached to each node:
*
* n1->c1, c2, c3
* n2->c5, c4, c9
* n3->[N C* thrift client connections]
*
* where {n1, n2, n3} are the active nodes in the C* cluster and each node holds the clients attached to it.
* If some nodes go down, the change is detected via the set of active hosts, and the inactive nodes are
* removed afterwards.
*/
@SuppressWarnings({"FinalClass", "Not final for mocking in tests"})
public class CassandraKeyValueServiceImpl extends AbstractKeyValueService implements CassandraKeyValueService {
@VisibleForTesting
class InitializingWrapper extends AsyncInitializer implements AutoDelegate_CassandraKeyValueService {
@Override
public CassandraKeyValueServiceImpl delegate() {
checkInitialized();
return CassandraKeyValueServiceImpl.this;
}
@Override
public Collection<? extends KeyValueService> getDelegates() {
return ImmutableList.of(delegate());
}
@Override
protected void tryInitialize() {
CassandraKeyValueServiceImpl.this.tryInitialize();
}
@Override
public boolean supportsCheckAndSet() {
return CassandraKeyValueServiceImpl.this.supportsCheckAndSet();
}
@Override
public CheckAndSetCompatibility getCheckAndSetCompatibility() {
return CassandraKeyValueServiceImpl.this.getCheckAndSetCompatibility();
}
@Override
public boolean shouldTriggerCompactions() {
return CassandraKeyValueServiceImpl.this.shouldTriggerCompactions();
}
@Override
public CassandraClientPool getClientPool() {
return CassandraKeyValueServiceImpl.this.getClientPool();
}
@Override
protected String getInitializingClassName() {
return "CassandraKeyValueService";
}
@Override
public void close() {
cancelInitialization(CassandraKeyValueServiceImpl.this::close);
}
}
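// Writes require a quorum of replicas in each datacenter; deletes go to all replicas, since a replica that
// missed a delete could otherwise allow the data to be resurrected.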
static final ConsistencyLevel WRITE_CONSISTENCY = ConsistencyLevel.EACH_QUORUM;
static final ConsistencyLevel DELETE_CONSISTENCY = ConsistencyLevel.ALL;
private final SafeLogger log;
private final MetricsManager metricsManager;
private final CassandraKeyValueServiceConfig config;
private final CassandraClientPool clientPool;
private final ReadConsistencyProvider readConsistencyProvider = new ReadConsistencyProvider();
private final TracingQueryRunner queryRunner;
private final WrappingQueryRunner wrappingQueryRunner;
private final CellLoader cellLoader;
private final AsyncKeyValueService asyncKeyValueService;
private final RangeLoader rangeLoader;
private final TaskRunner taskRunner;
private final CellValuePutter cellValuePutter;
private final CassandraTableMetadata tableMetadata;
private final CassandraTableCreator cassandraTableCreator;
private final CassandraTableDropper cassandraTableDropper;
private final CassandraTableTruncator cassandraTableTruncator;
private final CheckAndSetRunner checkAndSetRunner;
private final CassandraTables cassandraTables;
private final InitializingWrapper wrapper = new InitializingWrapper();
private final CassandraMutationTimestampProvider mutationTimestampProvider;
private final Refreshable<CassandraKeyValueServiceRuntimeConfig> runtimeConfig;
private final CassandraVerifierConfig verifierConfig;
private final Function<Map<Cell, Value>, ResultsExtractor<Value>> extractorFactory;
public static CassandraKeyValueService createForTesting(
CassandraKeyValueServiceConfig config, Refreshable<CassandraKeyValueServiceRuntimeConfig> runtimeConfig) {
MetricsManager metricsManager = MetricsManagers.createForTests();
CassandraClientPool clientPool = CassandraClientPoolImpl.createImplForTest(
metricsManager,
config,
runtimeConfig,
StartupChecks.RUN,
new Blacklist(
config,
runtimeConfig.map(CassandraKeyValueServiceRuntimeConfig::unresponsiveHostBackoffTimeSeconds)),
CassandraTopologyValidator.create(
CassandraTopologyValidationMetrics.of(metricsManager.getTaggedRegistry()), runtimeConfig),
new CassandraAbsentHostTracker(config.consecutiveAbsencesBeforePoolRemoval()));
return createOrShutdownClientPool(
metricsManager,
config,
runtimeConfig,
clientPool,
CassandraMutationTimestampProviders.legacyModeForTestsOnly(),
SafeLoggerFactory.get(CassandraKeyValueService.class),
AtlasDbConstants.DEFAULT_INITIALIZE_ASYNC);
}
public static CassandraKeyValueService create(
MetricsManager metricsManager,
CassandraKeyValueServiceConfig config,
Refreshable<CassandraKeyValueServiceRuntimeConfig> runtimeConfig,
CassandraMutationTimestampProvider mutationTimestampProvider) {
return create(
metricsManager,
config,
runtimeConfig,
mutationTimestampProvider,
AtlasDbConstants.DEFAULT_INITIALIZE_ASYNC);
}
public static CassandraKeyValueService create(
MetricsManager metricsManager,
CassandraKeyValueServiceConfig config,
Refreshable<CassandraKeyValueServiceRuntimeConfig> runtimeConfig,
CassandraMutationTimestampProvider mutationTimestampProvider,
CassandraClientPool clientPool) {
return createOrShutdownClientPool(
metricsManager,
config,
runtimeConfig,
clientPool,
mutationTimestampProvider,
SafeLoggerFactory.get(CassandraKeyValueService.class),
AtlasDbConstants.DEFAULT_INITIALIZE_ASYNC);
}
public static CassandraKeyValueService create(
MetricsManager metricsManager,
CassandraKeyValueServiceConfig config,
Refreshable<CassandraKeyValueServiceRuntimeConfig> runtimeConfig,
CassandraMutationTimestampProvider mutationTimestampProvider,
boolean initializeAsync) {
return create(
metricsManager,
config,
runtimeConfig,
mutationTimestampProvider,
SafeLoggerFactory.get(CassandraKeyValueService.class),
initializeAsync);
}
@VisibleForTesting
static CassandraKeyValueService create(
MetricsManager metricsManager,
CassandraKeyValueServiceConfig config,
Refreshable<CassandraKeyValueServiceRuntimeConfig> runtimeConfig,
CassandraMutationTimestampProvider mutationTimestampProvider,
SafeLogger log) {
return create(
metricsManager,
config,
runtimeConfig,
mutationTimestampProvider,
log,
AtlasDbConstants.DEFAULT_INITIALIZE_ASYNC);
}
@VisibleForTesting
static CassandraKeyValueService create(
MetricsManager metricsManager,
CassandraKeyValueServiceConfig config,
Refreshable<CassandraKeyValueServiceRuntimeConfig> runtimeConfig,
CassandraMutationTimestampProvider mutationTimestampProvider,
SafeLogger log,
boolean initializeAsync) {
CassandraClientPool clientPool =
CassandraClientPoolImpl.create(metricsManager, config, runtimeConfig, initializeAsync);
return createOrShutdownClientPool(
metricsManager, config, runtimeConfig, clientPool, mutationTimestampProvider, log, initializeAsync);
}
private static CassandraKeyValueService createOrShutdownClientPool(
MetricsManager metricsManager,
CassandraKeyValueServiceConfig config,
Refreshable<CassandraKeyValueServiceRuntimeConfig> runtimeConfig,
CassandraClientPool clientPool,
CassandraMutationTimestampProvider mutationTimestampProvider,
SafeLogger log,
boolean initializeAsync) {
try {
return createWithCqlClient(
metricsManager, config, runtimeConfig, clientPool, mutationTimestampProvider, log, initializeAsync);
} catch (Exception e) {
log.warn("Error occurred in creating Cassandra KVS. Now attempting to shut down client pool...", e);
try {
clientPool.shutdown();
log.info("Cassandra client pool shut down.");
} catch (RuntimeException internalException) {
log.info("An error occurred whilst shutting down the Cassandra client pool", internalException);
throw internalException;
}
throw Throwables.rewrapAndThrowUncheckedException(e);
}
}
private static CassandraKeyValueService createWithCqlClient(
MetricsManager metricsManager,
CassandraKeyValueServiceConfig config,
Refreshable<CassandraKeyValueServiceRuntimeConfig> runtimeConfig,
CassandraClientPool clientPool,
CassandraMutationTimestampProvider mutationTimestampProvider,
SafeLogger log,
boolean initializeAsync) {
try {
CassandraClusterConfig clusterConfig = CassandraClusterConfig.of(config, runtimeConfig.get());
AsyncKeyValueService asyncKeyValueService = config.asyncKeyValueServiceFactory()
.constructAsyncKeyValueService(
metricsManager,
config.getKeyspaceOrThrow(),
clusterConfig,
runtimeConfig.map(CassandraKeyValueServiceRuntimeConfig::servers),
initializeAsync);
return createAndInitialize(
metricsManager,
config,
runtimeConfig,
clientPool,
asyncKeyValueService,
mutationTimestampProvider,
log,
initializeAsync);
} catch (Exception e) {
log.warn("Exception during async KVS creation.", e);
throw Throwables.unwrapAndThrowAtlasDbDependencyException(e);
}
}
private static CassandraKeyValueService createAndInitialize(
MetricsManager metricsManager,
CassandraKeyValueServiceConfig config,
Refreshable<CassandraKeyValueServiceRuntimeConfig> runtimeConfig,
CassandraClientPool clientPool,
AsyncKeyValueService asyncKeyValueService,
CassandraMutationTimestampProvider mutationTimestampProvider,
SafeLogger log,
boolean initializeAsync) {
Counter notLatestVisibleValueCellFilterCounter = // register counter once and reuse
metricsManager.registerOrGetCounter(ValueExtractor.class, CellFilterMetrics.NOT_LATEST_VISIBLE_VALUE);
Function<Map<Cell, Value>, ResultsExtractor<Value>> extractorFactory = cellValueMap ->
new ValueExtractor(metricsManager, cellValueMap, notLatestVisibleValueCellFilterCounter);
CassandraKeyValueServiceImpl keyValueService = new CassandraKeyValueServiceImpl(
log,
metricsManager,
config,
asyncKeyValueService,
runtimeConfig,
clientPool,
mutationTimestampProvider,
extractorFactory);
keyValueService.wrapper.initialize(initializeAsync);
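// If initialization completed synchronously, hand back the concrete service; otherwise return the wrapper,
// which delegates once asynchronous initialization finishes.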
return keyValueService.wrapper.isInitialized() ? keyValueService : keyValueService.wrapper;
}
private CassandraKeyValueServiceImpl(
SafeLogger log,
MetricsManager metricsManager,
CassandraKeyValueServiceConfig config,
AsyncKeyValueService asyncKeyValueService,
Refreshable<CassandraKeyValueServiceRuntimeConfig> runtimeConfig,
CassandraClientPool clientPool,
CassandraMutationTimestampProvider mutationTimestampProvider,
Function<Map<Cell, Value>, ResultsExtractor<Value>> extractorFactory) {
super(createBlockingThreadpool(config, runtimeConfig.get().servers(), metricsManager));
this.log = log;
this.metricsManager = metricsManager;
this.config = config;
this.clientPool = clientPool;
this.asyncKeyValueService = asyncKeyValueService;
this.mutationTimestampProvider = mutationTimestampProvider;
this.queryRunner = new TracingQueryRunner(log, () -> runtimeConfig.get().tracing());
this.wrappingQueryRunner = new WrappingQueryRunner(queryRunner);
this.cassandraTables = new CassandraTables(clientPool, config);
this.taskRunner = new TaskRunner(executor);
this.cellLoader = CellLoader.create(clientPool, wrappingQueryRunner, taskRunner, runtimeConfig);
this.rangeLoader = new RangeLoader(clientPool, queryRunner, readConsistencyProvider, extractorFactory);
this.cellValuePutter = new CellValuePutter(
runtimeConfig,
clientPool,
taskRunner,
wrappingQueryRunner,
mutationTimestampProvider::getSweepSentinelWriteTimestamp);
this.checkAndSetRunner = new CheckAndSetRunner(queryRunner);
this.tableMetadata = new CassandraTableMetadata(rangeLoader, cassandraTables, clientPool, wrappingQueryRunner);
this.cassandraTableCreator = new CassandraTableCreator(clientPool, config);
this.cassandraTableTruncator = new CassandraTableTruncator(queryRunner, clientPool);
this.cassandraTableDropper =
new CassandraTableDropper(config, clientPool, tableMetadata, cassandraTableTruncator);
this.runtimeConfig = runtimeConfig;
this.verifierConfig = CassandraVerifierConfig.of(config, runtimeConfig.get());
this.extractorFactory = extractorFactory;
}
private static ExecutorService createBlockingThreadpool(
CassandraKeyValueServiceConfig config,
CassandraServersConfig serversConfig,
MetricsManager metricsManager) {
return config.thriftExecutorServiceFactory()
.orElseGet(() -> instrumentedFixedThreadPoolSupplier(
serversConfig,
config.poolSize(),
config.maxConnectionBurstSize(),
metricsManager.getTaggedRegistry()))
.get();
}
private static Supplier<ExecutorService> instrumentedFixedThreadPoolSupplier(
CassandraServersConfig serversConfig,
int poolSize,
int maxConnectionBurstSize,
TaggedMetricRegistry registry) {
return () -> {
int numberOfThriftHosts = serversConfig.numberOfThriftHosts();
int corePoolSize = poolSize * numberOfThriftHosts;
int maxPoolSize = maxConnectionBurstSize * numberOfThriftHosts;
return createThreadPoolWithoutSpans("Atlas Cassandra KVS", corePoolSize, maxPoolSize);
};
}
@Override
public boolean isInitialized() {
return wrapper.isInitialized();
}
protected void initialize(boolean asyncInitialize) {
wrapper.initialize(asyncInitialize);
}
private void tryInitialize() {
createTable(AtlasDbConstants.DEFAULT_METADATA_TABLE, AtlasDbConstants.EMPTY_TABLE_METADATA);
lowerConsistencyWhenSafe();
upgradeFromOlderInternalSchema();
CassandraKeyValueServices.warnUserInInitializationIfClusterAlreadyInInconsistentState(clientPool, config);
}
@VisibleForTesting
void upgradeFromOlderInternalSchema() {
try {
Map<TableReference, byte[]> metadataForTables = getMetadataForTables();
final Collection<CfDef> updatedCfs = Lists.newArrayListWithExpectedSize(metadataForTables.size());
List<CfDef> knownCfs = clientPool.runWithRetry(client ->
client.describe_keyspace(config.getKeyspaceOrThrow()).getCf_defs());
for (CfDef clusterSideCf : knownCfs) {
TableReference tableRef = CassandraKeyValueServices.tableReferenceFromCfDef(clusterSideCf);
Optional<byte[]> relevantMetadata = lookupClusterSideMetadata(metadataForTables, tableRef);
if (relevantMetadata.isPresent()) {
byte[] clusterSideMetadata = relevantMetadata.get();
CfDef clientSideCf = getCfForTable(tableRef, clusterSideMetadata, config.gcGraceSeconds());
if (!ColumnFamilyDefinitions.isMatchingCf(clientSideCf, clusterSideCf)) {
// mismatch; we have changed how we generate schema since we last persisted
log.warn("Upgrading table {} to new internal Cassandra schema", LoggingArgs.tableRef(tableRef));
updatedCfs.add(clientSideCf);
}
} else if (!HiddenTables.isHidden(tableRef)) {
// Possible to get here from a race condition with another service starting up
// and performing schema upgrades concurrent with us doing this check
log.error(
"Found a table {} that did not have persisted"
+ " AtlasDB metadata. If you recently did a Palantir update, try waiting until"
+ " schema upgrades are completed on all backend CLIs/services etc and restarting"
+ " this service. If this error re-occurs on subsequent attempted startups, please"
+ " contact Palantir support.",
LoggingArgs.tableRef(tableRef));
}
}
// we are racing another service to do these same operations here, but they are idempotent / safe
Map<TableReference, byte[]> emptyMetadataUpdate = ImmutableMap.of();
if (!updatedCfs.isEmpty()) {
putMetadataAndMaybeAlterTables(true, emptyMetadataUpdate, updatedCfs);
log.info("New table-related settings were applied on startup!!");
} else {
log.info("No tables are being upgraded on startup. No updated table-related settings found.");
}
} catch (TException e) {
log.error(
"Couldn't upgrade from an older internal Cassandra schema. New table-related settings may not have"
+ " taken effect.",
e);
}
}
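// Fall back to a case-insensitive match in case the cluster-side table name differs from the client-side
// reference only in casing.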
private static Optional<byte[]> lookupClusterSideMetadata(
Map<TableReference, byte[]> metadataForTables, TableReference tableRef) {
return Optional.ofNullable(metadataForTables.get(tableRef))
.or(() -> Maps.filterEntries(metadataForTables, entry -> matchingIgnoreCase(entry.getKey(), tableRef))
.values()
.stream()
.findAny());
}
private void lowerConsistencyWhenSafe() {
Set<String> dcs;
Map<String, String> strategyOptions;
try {
dcs = clientPool.runWithRetry(client -> CassandraVerifier.sanityCheckDatacenters(client, verifierConfig));
KsDef ksDef = clientPool.runWithRetry(client -> client.describe_keyspace(config.getKeyspaceOrThrow()));
strategyOptions = new HashMap<>(ksDef.getStrategy_options());
if (dcs.size() == 1) {
String dc = dcs.iterator().next();
if (strategyOptions.get(dc) != null) {
int currentRf = Integer.parseInt(strategyOptions.get(dc));
if (currentRf == runtimeConfig.get().replicationFactor()) {
if (currentRf == 2 && config.clusterMeetsNormalConsistencyGuarantees()) {
log.info("Setting Read Consistency to ONE, as cluster has only one datacenter at RF2.");
readConsistencyProvider.lowerConsistencyLevelToOne();
}
}
}
}
} catch (TException e) {
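// Could not inspect the cluster's datacenters or keyspace; leave the read consistency unchanged.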
return;
}
}
/**
* Gets values from the key-value store. Requires a quorum of Cassandra nodes to be reachable.
*
* @param tableRef the name of the table to retrieve values from.
* @param rows set containing the rows to retrieve values for.
* @param selection specifies the set of columns to fetch.
* @param startTs specifies the maximum timestamp (exclusive) at which to
* retrieve each row's value.
* @return map of retrieved values. Values which do not exist (either
* because they were deleted or never created in the first place)
* are simply not returned.
* @throws AtlasDbDependencyException if fewer than a quorum of Cassandra nodes are reachable.
* @throws IllegalArgumentException if any of the requests were invalid
* (e.g., attempting to retrieve values from a non-existent table).
*/
@Override
public Map<Cell, Value> getRows(
TableReference tableRef, Iterable<byte[]> rows, ColumnSelection selection, long startTs) {
if (!selection.allColumnsSelected()) {
return getRowsForSpecificColumns(tableRef, rows, selection, startTs);
}
Set<Map.Entry<CassandraServer, List<byte[]>>> rowsByHost = HostPartitioner.partitionByHost(
clientPool, rows, Functions.identity())
.entrySet();
List<Callable<Map<Cell, Value>>> tasks = new ArrayList<>(rowsByHost.size());
for (final Map.Entry<CassandraServer, List<byte[]>> hostAndRows : rowsByHost) {
tasks.add(AnnotatedCallable.wrapWithThreadName(
AnnotationType.PREPEND,
"Atlas getRows " + hostAndRows.getValue().size() + " rows from " + tableRef + " on "
+ hostAndRows.getKey().cassandraHostName(),
() -> getRowsForSingleHost(hostAndRows.getKey(), tableRef, hostAndRows.getValue(), startTs)));
}
List<Map<Cell, Value>> perHostResults = taskRunner.runAllTasksCancelOnFailure(tasks);
Map<Cell, Value> result = Maps.newHashMapWithExpectedSize(Iterables.size(rows));
for (Map<Cell, Value> perHostResult : perHostResults) {
result.putAll(perHostResult);
}
return result;
}
private Map<Cell, Value> getRowsForSingleHost(
final CassandraServer host, final TableReference tableRef, final List<byte[]> rows, final long startTs) {
try {
int rowCount = 0;
final Map<Cell, Value> result = new HashMap<>();
int fetchBatchCount = runtimeConfig.get().fetchBatchCount();
for (final List<byte[]> batch : Lists.partition(rows, fetchBatchCount)) {
rowCount += batch.size();
result.putAll(getAllCellsForRows(host, tableRef, batch, startTs));
}
if (rowCount > fetchBatchCount) {
log.warn(
"Rebatched in getRows a call to {} that attempted to multiget {} rows; "
+ "this may indicate overly-large batching on a higher level.\n{}",
LoggingArgs.tableRef(tableRef),
SafeArg.of("rowCount", rowCount),
SafeArg.of("stacktrace", CassandraKeyValueServices.getFilteredStackTrace("com.palantir")));
}
return ImmutableMap.copyOf(result);
} catch (Exception e) {
throw Throwables.unwrapAndThrowAtlasDbDependencyException(e);
}
}
private Map<Cell, Value> getAllCellsForRows(
final CassandraServer host, final TableReference tableRef, final List<byte[]> rows, final long startTs)
throws Exception {
ListMultimap<ByteBuffer, ColumnOrSuperColumn> result = ArrayListMultimap.create(rows.size(), 1);
List<KeyPredicate> query = rows.stream()
.map(row -> keyPredicate(
ByteBuffer.wrap(row),
allPredicateWithLimit(runtimeConfig.get().fetchReadLimitPerRow())))
.collect(Collectors.toList());
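// Page through each row's columns: rows that returned data are re-queried starting just past the last
// column seen, until every row comes back empty.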
while (!query.isEmpty()) {
query = EntryStream.of(getForKeyPredicates(host, tableRef, query, startTs))
.filterValues(cells -> !cells.isEmpty())
.peekKeyValue(result::putAll)
.mapKeyValue((row, cells) -> keyPredicate(row, getNextLexicographicalSlicePredicate(cells)))
.collect(Collectors.toList());
}
ResultsExtractor<Value> extractor = extractorFactory.apply(Maps.newHashMapWithExpectedSize(result.size()));
extractor.extractResults(Multimaps.asMap(result), startTs, ColumnSelection.all());
return extractor.asMap();
}
private static KeyPredicate keyPredicate(ByteBuffer row, SlicePredicate predicate) {
return new KeyPredicate().setKey(row).setPredicate(predicate);
}
private static SlicePredicate allPredicateWithLimit(int limit) {
return SlicePredicates.create(Range.ALL, Limit.of(limit));
}
private Map<ByteBuffer, List<ColumnOrSuperColumn>> getForKeyPredicates(
final CassandraServer host, final TableReference tableRef, List<KeyPredicate> query, final long startTs)
throws Exception {
return clientPool.runWithRetryOnServer(
host,
new FunctionCheckedException<CassandraClient, Map<ByteBuffer, List<ColumnOrSuperColumn>>, Exception>() {
@Override
public Map<ByteBuffer, List<ColumnOrSuperColumn>> apply(CassandraClient client) throws Exception {
if (log.isTraceEnabled()) {
log.trace(
"Requesting {} cells from {} starting at timestamp {} on {} "
+ "as part of fetching cells for key predicates.",
SafeArg.of("cells", query.size()),
LoggingArgs.tableRef(tableRef),
SafeArg.of("startTs", startTs),
SafeArg.of("host", host));
}
Map<ByteBuffer, List<List<ColumnOrSuperColumn>>> results =
wrappingQueryRunner.multiget_multislice(
"getRows",
client,
tableRef,
query,
readConsistencyProvider.getConsistency(tableRef));
return Maps.transformValues(results, CellLoader::flattenReadOnlyLists);
}
@Override
public String toString() {
return "multiget_multislice(" + host.cassandraHostName() + ", " + tableRef + ", " + query.size()
+ " cells)";
}
});
}
private SlicePredicate getNextLexicographicalSlicePredicate(List<ColumnOrSuperColumn> columns) {
Preconditions.checkState(!columns.isEmpty(), "Columns was empty. This is probably an AtlasDb bug");
Column lastColumn = columns.get(columns.size() - 1).getColumn();
ColumnAndTimestamp columnNameAndTimestamp = CassandraKeyValueServices.decomposeColumn(lastColumn.name);
ByteBuffer nextLexicographicColumn = CassandraKeyValueServices.makeCompositeBuffer(
RangeRequests.nextLexicographicName(columnNameAndTimestamp.columnName()), Long.MAX_VALUE);
return SlicePredicates.create(
Range.of(nextLexicographicColumn, Range.UNBOUND_END),
Limit.of(runtimeConfig.get().fetchReadLimitPerRow()));
}
private static List<ByteBuffer> wrap(List<byte[]> arrays) {
List<ByteBuffer> byteBuffers = new ArrayList<>(arrays.size());
for (byte[] r : arrays) {
byteBuffers.add(ByteBuffer.wrap(r));
}
return byteBuffers;
}
private Map<Cell, Value> getRowsForSpecificColumns(
final TableReference tableRef, final Iterable<byte[]> rows, ColumnSelection selection, final long startTs) {
Preconditions.checkArgument(!selection.allColumnsSelected(), "Must select specific columns");
Collection<byte[]> selectedColumns = selection.getSelectedColumns();
Set<Cell> cells = Sets.newHashSetWithExpectedSize(selectedColumns.size() * Iterables.size(rows));
for (byte[] row : rows) {
for (byte[] col : selectedColumns) {
cells.add(Cell.create(row, col));
}
}
StartTsResultsCollector collector = new StartTsResultsCollector(startTs, extractorFactory);
cellLoader.loadWithTs(
"getRows",
tableRef,
cells,
startTs,
false,
collector,
readConsistencyProvider.getConsistency(tableRef));
return collector.getCollectedResults();
}
/**
* Gets values from the key-value store. Requires a quorum of Cassandra nodes to be reachable.
*
* @param tableRef the name of the table to retrieve values from.
* @param timestampByCell specifies, for each cell, the maximum timestamp (exclusive) at which to
* retrieve that cell's value.
* @return map of retrieved values. Values which do not exist (either
* because they were deleted or never created in the first place)
* are simply not returned.
* @throws AtlasDbDependencyException if fewer than a quorum of Cassandra nodes are reachable.
* @throws IllegalArgumentException if any of the requests were invalid
* (e.g., attempting to retrieve values from a non-existent table).
*/
@Override
public Map<Cell, Value> get(TableReference tableRef, Map<Cell, Long> timestampByCell) {
if (timestampByCell.isEmpty()) {
log.info("Attempted get on '{}' table with empty cells", LoggingArgs.tableRef(tableRef));
return ImmutableMap.of();
}
try {
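// Fast path: when every cell is requested at the same timestamp, issue a single batched load.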
Long firstTs = timestampByCell.values().iterator().next();
if (Iterables.all(timestampByCell.values(), Predicates.equalTo(firstTs))) {
return get("get", tableRef, timestampByCell.keySet(), firstTs);
}
SetMultimap<Long, Cell> cellsByTs =
Multimaps.invertFrom(Multimaps.forMap(timestampByCell), HashMultimap.create());
ImmutableMap.Builder<Cell, Value> builder = ImmutableMap.builder();
for (long ts : cellsByTs.keySet()) {
StartTsResultsCollector collector = new StartTsResultsCollector(ts, extractorFactory);
cellLoader.loadWithTs(
"get",
tableRef,
cellsByTs.get(ts),
ts,
false,
collector,
readConsistencyProvider.getConsistency(tableRef));
builder.putAll(collector.getCollectedResults());
}
return builder.buildOrThrow();
} catch (Exception e) {
throw Throwables.unwrapAndThrowAtlasDbDependencyException(e);
}
}
private Map<Cell, Value> get(
String kvsMethodName, TableReference tableRef, Set<Cell> cells, long maxTimestampExclusive) {
StartTsResultsCollector collector = new StartTsResultsCollector(maxTimestampExclusive, extractorFactory);
cellLoader.loadWithTs(
kvsMethodName,
tableRef,
cells,
maxTimestampExclusive,
false,
collector,
readConsistencyProvider.getConsistency(tableRef));
return collector.getCollectedResults();
}
/**
* Gets values from the key-value store for the specified rows and column range as separate iterators for each row.
* Requires a quorum of Cassandra nodes to be reachable, otherwise, the returned iterators will throw an
* {@link AtlasDbDependencyException} when their methods are called.
*
* @param tableRef the name of the table to retrieve values from.
* @param rows set containing the rows to retrieve values for. Behavior is undefined if {@code rows}
* contains duplicates (as defined by {@link Arrays#equals(byte[], byte[])}).
* @param batchColumnRangeSelection specifies the column range and the per-row batchSize to fetch.
* @param timestamp specifies the maximum timestamp (exclusive) at which to retrieve each row's value.
* @return map of row names to {@link RowColumnRangeIterator}. Each {@link RowColumnRangeIterator} can iterate over
* the values that are spanned by the {@code batchColumnRangeSelection} in increasing order by column name.
* @throws IllegalArgumentException if {@code rows} contains duplicates.
*/
@Override
public Map<byte[], RowColumnRangeIterator> getRowsColumnRange(
TableReference tableRef,
Iterable<byte[]> rows,
BatchColumnRangeSelection batchColumnRangeSelection,
long timestamp) {
Set<Map.Entry<CassandraServer, List<byte[]>>> rowsByHost = HostPartitioner.partitionByHost(
clientPool, rows, Functions.identity())
.entrySet();
List<Callable<Map<byte[], RowColumnRangeIterator>>> tasks = new ArrayList<>(rowsByHost.size());
for (final Map.Entry<CassandraServer, List<byte[]>> hostAndRows : rowsByHost) {
tasks.add(AnnotatedCallable.wrapWithThreadName(
AnnotationType.PREPEND,
"Atlas getRowsColumnRange " + hostAndRows.getValue().size() + " rows from " + tableRef + " on "
+ hostAndRows.getKey().cassandraHostName(),
() -> getRowsColumnRangeIteratorForSingleHost(
hostAndRows.getKey(),
tableRef,
hostAndRows.getValue(),
batchColumnRangeSelection,
timestamp)));
}
List<Map<byte[], RowColumnRangeIterator>> perHostResults = taskRunner.runAllTasksCancelOnFailure(tasks);
Map<byte[], RowColumnRangeIterator> result = Maps.newHashMapWithExpectedSize(Iterables.size(rows));
for (Map<byte[], RowColumnRangeIterator> perHostResult : perHostResults) {
result.putAll(perHostResult);
}
return result;
}
private Map<byte[], RowColumnRangeIterator> getRowsColumnRangeIteratorForSingleHost(
CassandraServer host,
TableReference tableRef,
List<byte[]> rows,
BatchColumnRangeSelection batchColumnRangeSelection,
long startTs) {
try {
RowColumnRangeResult firstPage =
getRowsColumnRangeForSingleHost(host, tableRef, rows, batchColumnRangeSelection, startTs);
Map<byte[], Map<Cell, Value>> results = firstPage.getResults();
Map<byte[], Column> rowsToLastCompositeColumns = firstPage.getRowsToLastCompositeColumns();
IdentityHashMap<byte[], byte[]> incompleteRowsToNextColumns = new IdentityHashMap<>();
for (Map.Entry<byte[], Column> e : rowsToLastCompositeColumns.entrySet()) {
byte[] row = e.getKey();
byte[] col = CassandraKeyValueServices.decomposeColumnName(e.getValue())
.columnName();
// If we read a version of the cell before our start timestamp, it will be the most recent version
// readable to us and we can continue to the next column. Otherwise we have to continue reading
// this column.
Map<Cell, Value> rowResult = results.get(row);
boolean completedCell = (rowResult != null) && rowResult.containsKey(Cell.create(row, col));
boolean endOfRange = isEndOfColumnRange(
completedCell, col, firstPage.getRowsToRawColumnCount().get(row), batchColumnRangeSelection);
if (!endOfRange) {
byte[] nextCol = getNextColumnRangeColumn(completedCell, col);
incompleteRowsToNextColumns.put(row, nextCol);
}
}
Map<byte[], RowColumnRangeIterator> ret = Maps.newHashMapWithExpectedSize(rows.size());
for (byte[] row : rowsToLastCompositeColumns.keySet()) {
Iterator<Map.Entry<Cell, Value>> resultIterator;
Map<Cell, Value> result = results.get(row);
if (result != null) {
resultIterator = result.entrySet().iterator();
} else {
resultIterator = Collections.emptyIterator();
}
byte[] nextCol = incompleteRowsToNextColumns.get(row);
if (nextCol == null) {
ret.put(row, new LocalRowColumnRangeIterator(resultIterator));
} else {
BatchColumnRangeSelection newColumnRange = BatchColumnRangeSelection.create(
nextCol, batchColumnRangeSelection.getEndCol(), batchColumnRangeSelection.getBatchHint());
ret.put(
row,
new LocalRowColumnRangeIterator(Iterators.concat(
resultIterator, getRowColumnRange(host, tableRef, row, newColumnRange, startTs))));
}
}
// We saw no Cassandra results at all for these rows, so the entire column range is empty for these rows.
for (byte[] row : firstPage.getEmptyRows()) {
ret.put(row, new LocalRowColumnRangeIterator(Collections.emptyIterator()));
}
return ret;
} catch (Exception e) {
throw Throwables.unwrapAndThrowAtlasDbDependencyException(e);
}
}
private RowColumnRangeResult getRowsColumnRangeForSingleHost(
CassandraServer host,
TableReference tableRef,
List<byte[]> rows,
BatchColumnRangeSelection batchColumnRangeSelection,
long startTs) {
try {
return clientPool.runWithRetryOnServer(
host, new FunctionCheckedException<CassandraClient, RowColumnRangeResult, Exception>() {
@Override
public RowColumnRangeResult apply(CassandraClient client) throws Exception {
Range range = createColumnRange(
batchColumnRangeSelection.getStartCol(),
batchColumnRangeSelection.getEndCol(),
startTs);
Limit limit = Limit.of(batchColumnRangeSelection.getBatchHint());
SlicePredicate pred = SlicePredicates.create(range, limit);
Map<ByteBuffer, List<ColumnOrSuperColumn>> results = wrappingQueryRunner.multiget(
"getRowsColumnRange",
client,
tableRef,
wrap(rows),
pred,
readConsistencyProvider.getConsistency(tableRef));
return RowColumnRangeExtractor.extract(rows, results, startTs, metricsManager);
}
@Override
public String toString() {
return "multiget_slice(" + tableRef.getQualifiedName() + ", "
+ rows.size() + " rows, " + batchColumnRangeSelection.getBatchHint()
+ " max columns)";
}
});
} catch (Exception e) {
throw Throwables.unwrapAndThrowAtlasDbDependencyException(e);
}
}
private Iterator<Map.Entry<Cell, Value>> getRowColumnRange(
CassandraServer host,
TableReference tableRef,
byte[] row,
BatchColumnRangeSelection batchColumnRangeSelection,
long startTs) {
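// Lazily pages through the remainder of the column range for this row, one batch-hint-sized slice at a time.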
return ClosableIterators.wrapWithEmptyClose(
new AbstractPagingIterable<
Map.Entry<Cell, Value>, TokenBackedBasicResultsPage<Map.Entry<Cell, Value>, byte[]>>() {
@Override
protected TokenBackedBasicResultsPage<Map.Entry<Cell, Value>, byte[]> getFirstPage()
throws Exception {
return page(batchColumnRangeSelection.getStartCol());
}
@Override
protected TokenBackedBasicResultsPage<Map.Entry<Cell, Value>, byte[]> getNextPage(
TokenBackedBasicResultsPage<Map.Entry<Cell, Value>, byte[]> previous) throws Exception {
return page(previous.getTokenForNextPage());
}
TokenBackedBasicResultsPage<Map.Entry<Cell, Value>, byte[]> page(final byte[] startCol)
throws Exception {
return clientPool.runWithRetryOnServer(
host,
new FunctionCheckedException<
CassandraClient,
TokenBackedBasicResultsPage<Map.Entry<Cell, Value>, byte[]>,
Exception>() {
@Override
public TokenBackedBasicResultsPage<Map.Entry<Cell, Value>, byte[]> apply(
CassandraClient client) throws Exception {
Range range = createColumnRange(
startCol, batchColumnRangeSelection.getEndCol(), startTs);
Limit limit = Limit.of(batchColumnRangeSelection.getBatchHint());
SlicePredicate pred = SlicePredicates.create(range, limit);
ByteBuffer rowByteBuffer = ByteBuffer.wrap(row);
Map<ByteBuffer, List<ColumnOrSuperColumn>> results =
wrappingQueryRunner.multiget(
"getRowsColumnRange",
client,
tableRef,
ImmutableList.of(rowByteBuffer),
pred,
readConsistencyProvider.getConsistency(tableRef));
if (results.isEmpty()) {
return SimpleTokenBackedResultsPage.create(
startCol, ImmutableList.of(), false);
}
List<ColumnOrSuperColumn> values = Iterables.getOnlyElement(results.values());
if (values.isEmpty()) {
return SimpleTokenBackedResultsPage.create(
startCol, ImmutableList.of(), false);
}
// May be empty if all results are at ts > startTs
Map<Cell, Value> ret = RowColumnRangeExtractor.extract(
ImmutableList.of(row), results, startTs, metricsManager)
.getResults()
.getOrDefault(row, Collections.emptyMap());
ColumnOrSuperColumn lastColumn = values.get(values.size() - 1);
byte[] lastCol = CassandraKeyValueServices.decomposeColumnName(
lastColumn.getColumn())
.columnName();
// Same idea as the getRows case to handle seeing only newer entries of a column
boolean completedCell = ret.get(Cell.create(row, lastCol)) != null;
if (isEndOfColumnRange(
completedCell, lastCol, values.size(), batchColumnRangeSelection)) {
return SimpleTokenBackedResultsPage.create(lastCol, ret.entrySet(), false);
}
byte[] nextCol = getNextColumnRangeColumn(completedCell, lastCol);
return SimpleTokenBackedResultsPage.create(nextCol, ret.entrySet(), true);
}
@Override
public String toString() {
return "multiget_slice(" + tableRef.getQualifiedName() + ", single row, "
+ batchColumnRangeSelection.getBatchHint() + " batch hint)";
}
});
}
}.iterator());
}
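// The range is exhausted when Cassandra returned fewer raw columns than the batch hint, or when the last cell
// was fully read and no column remains before the (exclusive) end of the range.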
private static boolean isEndOfColumnRange(
boolean completedCell, byte[] lastCol, int numRawResults, BatchColumnRangeSelection columnRangeSelection) {
return (numRawResults < columnRangeSelection.getBatchHint())
|| (completedCell
&& (RangeRequests.isLastRowName(lastCol)
|| Arrays.equals(
RangeRequests.nextLexicographicName(lastCol),
columnRangeSelection.getEndCol())));
}
private static byte[] getNextColumnRangeColumn(boolean completedCell, byte[] lastCol) {
if (!completedCell) {
return lastCol;
} else {
return RangeRequests.nextLexicographicName(lastCol);
}
}
private static Range createColumnRange(byte[] startColOrEmpty, byte[] endColExclusiveOrEmpty, long startTs) {
ByteBuffer start =
startColOrEmpty.length == 0 ? Range.UNBOUND_START : Range.startOfColumn(startColOrEmpty, startTs);
ByteBuffer end = endColExclusiveOrEmpty.length == 0
? Range.UNBOUND_END
: Range.endOfColumnIncludingSentinels(RangeRequests.previousLexicographicName(endColExclusiveOrEmpty));
return Range.of(start, end);
}
/**
* Puts values into the key-value store. This call does not guarantee atomicity across cells.
* On failure, it is possible that some of the requests have succeeded (without having been rolled
* back). Similarly, concurrent batched requests may interleave.
*
* Requires a quorum of Cassandra nodes to be reachable.
*
* @param tableRef the name of the table to put values into.
* @param values map containing the key-value entries to put.
* @param timestamp must be non-negative and not equal to {@link Long#MAX_VALUE}
* @throws AtlasDbDependencyException if fewer than a quorum of Cassandra nodes are reachable.
*/
@Override
public void put(final TableReference tableRef, final Map<Cell, byte[]> values, final long timestamp) {
try {
cellValuePutter.put(
"put", tableRef, KeyValueServices.toConstantTimestampValues(values.entrySet(), timestamp));
} catch (Exception e) {
throw Throwables.unwrapAndThrowAtlasDbDependencyException(e);
}
}
/**
* Puts values into the key-value store with individually specified timestamps. This call does not
* guarantee atomicity across cells. On failure, it is possible that some of the requests have succeeded
* (without having been rolled back). Similarly, concurrent batched requests may interleave.
*
* Requires a quorum of Cassandra nodes to be reachable.
*
* @param tableRef the name of the table to put values into.
* @param values map containing the key-value entries to put with
* non-negative timestamps less than {@link Long#MAX_VALUE}.
* @throws AtlasDbDependencyException if fewer than a quorum of Cassandra nodes are reachable.
*/
@Override
public void putWithTimestamps(TableReference tableRef, Multimap<Cell, Value> values) {
try {
cellValuePutter.put("putWithTimestamps", tableRef, values.entries());
} catch (Exception e) {
throw Throwables.unwrapAndThrowAtlasDbDependencyException(e);
}
}
@Override
protected int getMultiPutBatchCount() {
return runtimeConfig.get().mutationBatchCount();
}
/**
* Puts values into the key-value store. This call does not guarantee atomicity across cells.
* On failure, it is possible that some of the requests have succeeded (without having been rolled
* back). Similarly, concurrent batched requests may interleave.
*
* Overridden to batch more intelligently than the default implementation.
*
* Requires a quorum of Cassandra nodes to be reachable.
*
* @param valuesByTable map containing the key-value entries to put by table.
* @param timestamp must be non-negative and not equal to {@link Long#MAX_VALUE}
* @throws AtlasDbDependencyException if fewer than a quorum of Cassandra nodes are reachable.
*/
@Override
public void multiPut(Map<TableReference, ? extends Map<Cell, byte[]>> valuesByTable, long timestamp)
throws KeyAlreadyExistsException {
List<TableCellAndValue> flattened = new ArrayList<>();
for (Map.Entry<TableReference, ? extends Map<Cell, byte[]>> tableAndValues : valuesByTable.entrySet()) {
for (Map.Entry<Cell, byte[]> entry : tableAndValues.getValue().entrySet()) {
flattened.add(new TableCellAndValue(tableAndValues.getKey(), entry.getKey(), entry.getValue()));
}
}
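// Group the cells by the Cassandra host that owns their row, so each batch_mutate call targets a single host.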
Map<CassandraServer, List<TableCellAndValue>> partitionedByHost =
HostPartitioner.partitionByHost(clientPool, flattened, TableCellAndValue::extractRowName);
List<Callable<Void>> callables = new ArrayList<>();
for (Map.Entry<CassandraServer, List<TableCellAndValue>> entry : partitionedByHost.entrySet()) {
callables.addAll(getMultiPutTasksForSingleHost(entry.getKey(), entry.getValue(), timestamp));
}
taskRunner.runAllTasksCancelOnFailure(callables);
}
private List<Callable<Void>> getMultiPutTasksForSingleHost(
final CassandraServer host, Collection<TableCellAndValue> values, final long timestamp) {
Iterable<List<TableCellAndValue>> partitioned = IterablePartitioner.partitionByCountAndBytes(
values,
getMultiPutBatchCount(),
getMultiPutBatchSizeBytes(),
extractTableNames(values).toString(),
TableCellAndValue::getSize);
List<Callable<Void>> tasks = new ArrayList<>();
for (final List<TableCellAndValue> batch : partitioned) {
final Set<TableReference> tableRefs = extractTableNames(batch);
tasks.add(AnnotatedCallable.wrapWithThreadName(
AnnotationType.PREPEND,
"Atlas multiPut of " + batch.size() + " cells into " + tableRefs + " on "
+ host.cassandraHostName(),
() -> multiPutForSingleHostInternal(host, tableRefs, batch, timestamp)));
}
return tasks;
}
private static Set<TableReference> extractTableNames(Iterable<TableCellAndValue> tableCellAndValues) {
Set<TableReference> tableRefs = new HashSet<>();
for (TableCellAndValue tableCellAndValue : tableCellAndValues) {
tableRefs.add(tableCellAndValue.tableRef);
}
return tableRefs;
}
private Void multiPutForSingleHostInternal(
final CassandraServer host,
final Set<TableReference> tableRefs,
final List<TableCellAndValue> batch,
long timestamp)
throws Exception {
final MutationMap mutationMap = convertToMutations(batch, timestamp);
return clientPool.runWithRetryOnServer(host, new FunctionCheckedException<CassandraClient, Void, Exception>() {
@Override
public Void apply(CassandraClient client) throws Exception {
return wrappingQueryRunner.batchMutate("multiPut", client, tableRefs, mutationMap, WRITE_CONSISTENCY);
}
@Override
public String toString() {
return "batch_mutate(" + host.cassandraHostName() + ", " + tableRefs + ", " + batch.size() + " values)";
}
});
}
private static MutationMap convertToMutations(List<TableCellAndValue> batch, long timestamp) {
MutationMap mutationMap = new MutationMap();
for (TableCellAndValue tableCellAndValue : batch) {
Cell cell = tableCellAndValue.cell;
Column col = CassandraKeyValueServices.createColumn(cell, Value.create(tableCellAndValue.value, timestamp));
ColumnOrSuperColumn colOrSup = new ColumnOrSuperColumn();
colOrSup.setColumn(col);
Mutation mutation = new Mutation();
mutation.setColumn_or_supercolumn(colOrSup);
mutationMap.addMutationForCell(cell, tableCellAndValue.tableRef, mutation);
}
return mutationMap;
}
/**
* Truncate a table in the key-value store.
*
* This is preferred to dropping and re-adding a table, as live schema changes can
* be a complicated topic for distributed databases.
*
* Requires all Cassandra nodes to be reachable.
*
* @param tableRef the name of the table to truncate.
* @throws AtlasDbDependencyException if not all Cassandra nodes are reachable.
* @throws RuntimeException if the table does not exist.
*/
@Override
public void truncateTable(final TableReference tableRef) {
truncateTables(ImmutableSet.of(tableRef));
}
/**
* Truncates tables in the key-value store.
*
* This can be slightly faster than repeatedly truncating individual tables.
*
* Requires all Cassandra nodes to be reachable.
*
* @param tablesToTruncate set of tables to truncate.
* @throws AtlasDbDependencyException if not all Cassandra nodes are reachable.
* @throws RuntimeException if the table does not exist.
*/
@Override
public void truncateTables(final Set<TableReference> tablesToTruncate) {
cassandraTableTruncator.truncateTables(tablesToTruncate);
}
/**
* Deletes values from the key-value store.
*
* Requires all Cassandra nodes to be up and available; otherwise, this throws a PalantirRuntimeException.
*
* @param tableRef the name of the table to delete values from.
* @param keys map containing the keys to delete values for.
* @throws PalantirRuntimeException if not all hosts respond successfully.
*/
@Override
public void delete(TableReference tableRef, Multimap<Cell, Long> keys) {
new CellDeleter(
clientPool,
wrappingQueryRunner,
DELETE_CONSISTENCY,
mutationTimestampProvider.getDeletionTimestampOperatorForBatchDelete())
.delete(tableRef, keys);
}
@VisibleForTesting
CfDef getCfForTable(TableReference tableRef, byte[] rawMetadata, int gcGraceSeconds) {
return ColumnFamilyDefinitions.getCfDef(config.getKeyspaceOrThrow(), tableRef, gcGraceSeconds, rawMetadata);
}
// TODO(unknown): after cassandra change: handle multiRanges
@Override
@Idempotent
public Map<RangeRequest, TokenBackedBasicResultsPage<RowResult<Value>, byte[]>> getFirstBatchForRanges(
TableReference tableRef, Iterable<RangeRequest> rangeRequests, long timestamp) {
int concurrency = config.rangesConcurrency();
return KeyValueServices.getFirstBatchForRangesUsingGetRangeConcurrent(
executor, this, tableRef, rangeRequests, timestamp, concurrency);
}
// TODO(unknown): after cassandra change: handle reverse ranges
// TODO(unknown): after cassandra change: handle column filtering
/**
* For each row in the specified range, returns the most recent version strictly before timestamp. Requires a
* quorum of Cassandra nodes to be reachable.
*
* Remember to close any {@link ClosableIterator}s you get in a finally block.
*
* @param rangeRequest the range to load.
* @param timestamp specifies the maximum timestamp (exclusive) at which to retrieve each row's value.
* @throws AtlasDbDependencyException if fewer than a quorum of Cassandra nodes are reachable.
*/
@Override
@Idempotent
public ClosableIterator<RowResult<Value>> getRange(
TableReference tableRef, RangeRequest rangeRequest, long timestamp) {
return rangeLoader.getRange(tableRef, rangeRequest, timestamp);
}
/**
* Gets timestamp values from the key-value store. For each row, this returns all associated
* timestamps < given_ts.
*
* This method has stronger consistency guarantees than regular read requests. This must return all timestamps
* stored anywhere in the system (because of sweep). Unless all nodes are up and available, this method will
* throw an InsufficientConsistencyException.
*
* @param tableRef the name of the table to read from.
* @param rangeRequest the range to load.
* @param timestamp the maximum timestamp to load.
* @throws InsufficientConsistencyException if not all hosts respond successfully.
*/
@Override
@Idempotent
public ClosableIterator<RowResult<Set<Long>>> getRangeOfTimestamps(
TableReference tableRef, RangeRequest rangeRequest, long timestamp) {
CandidateCellForSweepingRequest request = ImmutableCandidateCellForSweepingRequest.builder()
.startRowInclusive(rangeRequest.getStartInclusive())
.maxTimestampExclusive(timestamp)
.shouldCheckIfLatestValueIsEmpty(false)
.shouldDeleteGarbageCollectionSentinels(true)
.build();
return getCandidateRowsForSweeping("getRangeOfTimestamps", tableRef, request)
.flatMap(rows -> rows)
.map(CandidateRowForSweeping::toRowResult)
.stopWhen(rowResult -> !rangeRequest.inRange(rowResult.getRowName()));
}
@Override
public ClosableIterator<List<CandidateCellForSweeping>> getCandidateCellsForSweeping(
TableReference tableRef, CandidateCellForSweepingRequest request) {
return getCandidateRowsForSweeping("getCandidateCellsForSweeping", tableRef, request)
.map(rows -> rows.stream()
.map(CandidateRowForSweeping::cells)
.flatMap(List::stream)
.collect(Collectors.toList()));
}
private ClosableIterator<List<CandidateRowForSweeping>> getCandidateRowsForSweeping(
String kvsMethodName, TableReference tableRef, CandidateCellForSweepingRequest request) {
RowGetter rowGetter = new RowGetter(clientPool, queryRunner, ConsistencyLevel.ALL, tableRef);
return new CandidateRowsForSweepingIterator(
(iteratorTableRef, cells, maxTimestampExclusive) ->
get(kvsMethodName, iteratorTableRef, cells, maxTimestampExclusive),
newInstrumentedCqlExecutor(),
rowGetter,
tableRef,
request,
runtimeConfig.map(CassandraKeyValueServiceRuntimeConfig::sweepReadThreads));
}
/**
* Returns a sorted list of row keys in the specified range; see
* {@link CassandraKeyValueService#getRowKeysInRange(TableReference, byte[], byte[], int)}.
*
* Implementation specific: this method specifically does not read any of the columns and can therefore be used
* in the presence of wide rows. However, as a side-effect, it may return row where the row only contains Cassandra
* tombstones.
*/
@Override
public List<byte[]> getRowKeysInRange(TableReference tableRef, byte[] startRow, byte[] endRow, int maxResults) {
RowGetter rowGetter = new RowGetter(clientPool, queryRunner, ConsistencyLevel.QUORUM, tableRef);
return rowGetter.getRowKeysInRange(startRow, endRow, maxResults);
}
private CqlExecutor newInstrumentedCqlExecutor() {
return AtlasDbMetrics.instrument(
metricsManager.getRegistry(), CqlExecutor.class, new CqlExecutorImpl(clientPool, ConsistencyLevel.ALL));
}
/**
* Drop the table, and also delete its table metadata. Requires a quorum of Cassandra nodes to be reachable.
*
* @param tableRef the name of the table to drop.
* @throws AtlasDbDependencyException if fewer than a quorum of Cassandra nodes are reachable, or the cluster
* cannot come to an agreement on schema versions. Note that this method is not atomic: if quorum is lost during
* its execution or Cassandra nodes fail to settle on a schema version after the Cassandra schema is mutated, we
* may drop the tables, but fail to persist the changes to the _metadata table.
* @throws UncheckedExecutionException if there are multiple schema mutation lock tables.
*/
@Override
public void dropTable(final TableReference tableRef) {
dropTables(ImmutableSet.of(tableRef));
}
/**
* Drop the tables, and also delete their table metadata. Requires a quorum of Cassandra nodes to be reachable.
*
* Main gains here vs. dropTable:
* - barring problems, we serialize the rapid series of schema changes through a single host checked out
* from the client pool, reducing the chance of schema disagreement issues
* - a client-side in-memory lock prevents misbehaving callers from shooting themselves in the foot
* - one fewer round trip
*
* @param tablesToDrop the set of tables to drop.
* @throws AtlasDbDependencyException if fewer than a quorum of Cassandra nodes are reachable, or the cluster
* cannot come to an agreement on schema versions. Note that this method is not atomic: if quorum is lost during
* its execution or Cassandra nodes fail to settle on a schema version after the Cassandra schema is mutated, we
* may drop the tables, but fail to persist the changes to the _metadata table.
* @throws UncheckedExecutionException if there are multiple schema mutation lock tables.
*/
@Override
public void dropTables(final Set<TableReference> tablesToDrop) {
cassandraTableDropper.dropTables(tablesToDrop);
}
/**
* Creates a table with the specified name. If the table already exists, no action is performed
* (the table is left in its current state). Requires a quorum of Cassandra nodes to be reachable.
*
* @param tableRef the name of the table to create.
* @param metadata the metadata of the table to create.
* @throws AtlasDbDependencyException if fewer than a quorum of Cassandra nodes are reachable, or the cluster
* cannot come to an agreement on schema versions. Note that this method is not atomic: if quorum is lost during
* its execution or Cassandra nodes fail to settle on a schema version after the Cassandra schema is mutated, we
* may fail to persist the changes to the _metadata table.
* @throws UncheckedExecutionException if there are multiple schema mutation lock tables.
*/
@Override
public void createTable(final TableReference tableRef, final byte[] metadata) {
createTables(ImmutableMap.of(tableRef, metadata));
}
/**
* Creates a table with the specified name. If the table already exists, no action is performed
* (the table is left in its current state).
*
* Requires a quorum of Cassandra nodes to be up and available.
*
* Main gains here vs. createTable:
* - barring problems, we serialize the rapid series of schema changes through a single host checked out
* from the client pool, reducing the chance of schema disagreement issues
* - a client-side in-memory lock prevents misbehaving callers from shooting themselves in the foot
* - one fewer round trip
*
* createTables(existingTable, newMetadata) can perform a metadata-only update. Additionally, it is possible
* that this metadata-only update performs a schema mutation by altering the CfDef (e.g., a user changes the
* metadata of an existing table to use a new compression block size). This does not require the schema mutation
* lock, as it does not alter the CfId.
*
* @param tablesToMetadata a mapping of names of tables to create to their respective metadata.
* @throws AtlasDbDependencyException if fewer than a quorum of Cassandra nodes are reachable, or the cluster
* cannot come to an agreement on schema versions. Note that this method is not atomic: if quorum is lost during
* its execution or Cassandra nodes fail to settle on a schema version after the Cassandra schema is mutated, we
* may fail to persist the changes to the _metadata table.
* @throws UncheckedExecutionException if there are multiple schema mutation lock tables.
*/
@Override
public void createTables(final Map<TableReference, byte[]> tablesToMetadata) {
Map<TableReference, byte[]> tablesToCreate = tableMetadata.filterOutExistingTables(tablesToMetadata);
Map<TableReference, byte[]> tablesToAlter = tableMetadata.filterOutNoOpMetadataChanges(tablesToMetadata);
boolean onlyMetadataChangesAreForNewTables = tablesToAlter.keySet().equals(tablesToCreate.keySet());
boolean putMetadataWillNeedASchemaChange = !onlyMetadataChangesAreForNewTables;
if (!tablesToCreate.isEmpty()) {
LoggingArgs.SafeAndUnsafeTableReferences safeAndUnsafe = LoggingArgs.tableRefs(tablesToCreate.keySet());
log.info("Creating tables {} and {}", safeAndUnsafe.safeTableRefs(), safeAndUnsafe.unsafeTableRefs());
cassandraTableCreator.createTables(tablesToCreate);
}
internalPutMetadataForTables(tablesToAlter, putMetadataWillNeedASchemaChange);
}
/**
* Return the list of tables stored in this key value service. Requires a quorum of Cassandra nodes to be reachable
* and agree on schema versions.
*
* This will not contain the names of any hidden tables (e.g., the _metadata table).
*
* @return a set of TableReferences (table names) for all the visible tables
* @throws AtlasDbDependencyException if fewer than a quorum of Cassandra nodes are reachable, or the cluster
* cannot come to an agreement on schema versions.
*/
@Override
public Set<TableReference> getAllTableNames() {
return cassandraTables
.getTableReferencesWithoutFiltering()
.filter(tr -> !HiddenTables.isHidden(tr))
.collect(Collectors.toSet());
}
/**
* Gets the metadata for a given table. Do not use this method to see if a table exists as it can return false
* positives. Requires a quorum of Cassandra nodes to be reachable.
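*
* <p>A minimal hydration sketch (illustrative; assumes the table exists with non-empty metadata and that
* {@code kvs} is an already-initialised instance of this class):
* <pre>{@code
* byte[] rawMetadata = kvs.getMetadataForTable(tableRef);
* TableMetadata hydrated = TableMetadata.BYTES_HYDRATOR.hydrateFromBytes(rawMetadata);
* }</pre>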
*
* @param tableRef the name of the table to get metadata for.
* @return a byte array representing the metadata for the table. Array is empty if no table
* with the given name exists. Consider {@link TableMetadata#BYTES_HYDRATOR} for hydrating.
* @throws AtlasDbDependencyException if fewer than a quorum of Cassandra nodes are reachable.
*/
@Override
public byte[] getMetadataForTable(TableReference tableRef) {
// try and get with a single-key lookup
String lowerCaseTableName = tableRef.getQualifiedName().toLowerCase(Locale.ROOT);
Map<Cell, Value> rows = getRows(
AtlasDbConstants.DEFAULT_METADATA_TABLE,
ImmutableSet.of(lowerCaseTableName.getBytes(StandardCharsets.UTF_8)),
ColumnSelection.all(),
Long.MAX_VALUE);
if (!rows.isEmpty()) {
return Iterables.getOnlyElement(rows.values()).getContents();
}
// if unsuccessful with fast code-path, we need to check if this table exists but was written at a key
// before we started enforcing only writing lower-case canonicalised versions of keys
return Optional.ofNullable(getMetadataForTables().get(tableRef)).orElse(AtlasDbConstants.EMPTY_TABLE_METADATA);
}
private static boolean matchingIgnoreCase(@Nullable TableReference t1, TableReference t2) {
if (t1 != null) {
return t1.getQualifiedName().equalsIgnoreCase(t2.getQualifiedName());
} else {
return t2 == null;
}
}
/**
* Gets the metadata for all non-hidden tables. Requires a quorum of Cassandra nodes to be reachable.
*
* @return a mapping of table names to their respective metadata in form of a byte array. Consider
* {@link TableMetadata#BYTES_HYDRATOR} for hydrating.
* @throws AtlasDbDependencyException if fewer than a quorum of Cassandra nodes are available.
*/
@Override
public Map<TableReference, byte[]> getMetadataForTables() {
return tableMetadata.getMetadataForTables();
}
/**
* Records the specified metadata for a given table. Requires a quorum of Cassandra nodes to be reachable.
*
* @param tableRef the name of the table to record metadata for.
* @param meta a byte array representing the metadata to record.
* @throws AtlasDbDependencyException if fewer than a quorum of Cassandra nodes are reachable, or the cluster
* cannot come to an agreement on schema versions. Note that this method is not atomic: if quorum is lost during
* its execution or Cassandra nodes fail to settle on a schema version after the Cassandra schema is mutated, we
* may fail to persist the changes to the _metadata table.
*/
@Override
public void putMetadataForTable(final TableReference tableRef, final byte[] meta) {
putMetadataForTables(ImmutableMap.of(tableRef, meta));
}
/**
* For each specified table records the respective metadata. Requires a quorum of Cassandra nodes to be reachable.
*
* @param tableRefToMetadata a mapping from each table's name to the respective byte array representing
* the metadata to record.
* @throws AtlasDbDependencyException if fewer than a quorum of Cassandra nodes are reachable, or the cluster
* cannot come to an agreement on schema versions. Note that this method is not atomic: if quorum is lost during
* its execution or Cassandra nodes fail to settle on a schema version after the Cassandra schema is mutated, we
* may fail to persist the changes to the _metadata table.
*/
@Override
public void putMetadataForTables(final Map<TableReference, byte[]> tableRefToMetadata) {
internalPutMetadataForTables(tableRefToMetadata, true);
}
@SuppressWarnings("checkstyle:RegexpSinglelineJava")
private void internalPutMetadataForTables(
Map<TableReference, byte[]> tableRefToMetadata, boolean possiblyNeedToPerformSettingsChanges) {
if (tableRefToMetadata.isEmpty()) {
return;
}
Map<TableReference, Cell> tableRefToNewCell = Maps.transformEntries(
tableRefToMetadata, (tableRef, metadata) -> CassandraKeyValueServices.getMetadataCell(tableRef));
Map<TableReference, Cell> tableRefToOldCell = Maps.transformEntries(
tableRefToMetadata, (tableRef, metadata) -> CassandraKeyValueServices.getOldMetadataCell(tableRef));
// technically we're racing other nodes from here on, during an update period,
// but the penalty for not caring is just some superfluous schema mutations and a
// few dead rows in the metadata table.
Map<Cell, Value> existingMetadataAtNewName = get(
AtlasDbConstants.DEFAULT_METADATA_TABLE,
tableRefToNewCell.values().stream()
.collect(Collectors.toMap(Functions.identity(), Functions.constant(Long.MAX_VALUE))));
Map<Cell, Value> existingMetadataAtOldName = get(
AtlasDbConstants.DEFAULT_METADATA_TABLE,
tableRefToOldCell.values().stream()
.collect(Collectors.toMap(Functions.identity(), Functions.constant(Long.MAX_VALUE))));
final Map<Cell, byte[]> updatedMetadata = new HashMap<>();
final Set<CfDef> updatedCfs = new HashSet<>();
tableRefToNewCell.forEach((tableRef, newCell) -> {
if (existingMetadataAtNewName.containsKey(newCell)) {
if (metadataIsDifferent(
existingMetadataAtNewName.get(newCell).getContents(), tableRefToMetadata.get(tableRef))) {
// found existing metadata at new name, but we're performing an update
updatedMetadata.put(newCell, tableRefToMetadata.get(tableRef));
updatedCfs.add(getCfForTable(tableRef, tableRefToMetadata.get(tableRef), config.gcGraceSeconds()));
}
} else if (existingMetadataAtOldName.containsKey(tableRefToOldCell.get(tableRef))) {
if (metadataIsDifferent(
existingMetadataAtOldName
.get(tableRefToOldCell.get(tableRef))
.getContents(),
tableRefToMetadata.get(tableRef))) {
// found existing metadata at old name, but we're performing an update
updatedMetadata.put(tableRefToOldCell.get(tableRef), tableRefToMetadata.get(tableRef));
updatedCfs.add(getCfForTable(tableRef, tableRefToMetadata.get(tableRef), config.gcGraceSeconds()));
}
} else {
// didn't find an existing metadata at old or new names, this is completely new;
// thus, let's write it out with the new format
updatedMetadata.put(tableRefToNewCell.get(tableRef), tableRefToMetadata.get(tableRef));
updatedCfs.add(getCfForTable(tableRef, tableRefToMetadata.get(tableRef), config.gcGraceSeconds()));
}
});
if (!updatedMetadata.isEmpty()) {
putMetadataAndMaybeAlterTables(possiblyNeedToPerformSettingsChanges, updatedMetadata, updatedCfs);
}
}
private static boolean metadataIsDifferent(byte[] existingMetadata, byte[] requestMetadata) {
return !Arrays.equals(existingMetadata, requestMetadata);
}
private void putMetadataAndMaybeAlterTables(
boolean possiblyNeedToPerformSettingsChanges, Map<Cell, byte[]> newMetadata, Collection<CfDef> updatedCfs) {
try {
clientPool.runWithRetry(client -> {
if (possiblyNeedToPerformSettingsChanges) {
for (CfDef cf : updatedCfs) {
client.system_update_column_family(cf);
}
CassandraKeyValueServices.waitForSchemaVersions(
config.schemaMutationTimeoutMillis(),
client,
schemaChangeDescriptionForPutMetadataForTables(updatedCfs));
}
// Done with actual schema mutation, push the metadata
put(AtlasDbConstants.DEFAULT_METADATA_TABLE, newMetadata, System.currentTimeMillis());
return null;
});
} catch (Exception e) {
throw Throwables.unwrapAndThrowAtlasDbDependencyException(e);
}
}
private static String schemaChangeDescriptionForPutMetadataForTables(Collection<CfDef> updatedCfs) {
String tables = updatedCfs.stream()
.map(CassandraKeyValueServices::tableReferenceFromCfDef)
.map(Object::toString)
.collect(Collectors.toList())
.toString();
return String.format(
"after updating the column family for tables %s in a call to put metadata for tables", tables);
}
@Override
public void deleteRange(final TableReference tableRef, final RangeRequest range) {
if (range.equals(RangeRequest.all())) {
try {
cassandraTableTruncator.truncateTables(ImmutableSet.of(tableRef));
} catch (AtlasDbDependencyException e) {
log.info(
"Tried to make a deleteRange({}, RangeRequest.all())"
+ " into a more garbage-cleanup friendly truncate(), but this failed.",
LoggingArgs.tableRef(tableRef),
e);
super.deleteRange(tableRef, range);
}
} else if (isForSingleRow(range.getStartInclusive(), range.getEndExclusive())) {
try {
long timestamp = mutationTimestampProvider.getRemoveTimestamp();
byte[] row = range.getStartInclusive();
clientPool.runWithRetry(client -> {
client.remove("deleteRange", tableRef, row, timestamp, DELETE_CONSISTENCY);
return null;
});
} catch (RetryLimitReachedException e) {
throw CassandraUtils.wrapInIceForDeleteOrRethrow(e);
} catch (TException e) {
throw Throwables.unwrapAndThrowAtlasDbDependencyException(e);
}
} else {
super.deleteRange(tableRef, range);
}
}
private static boolean isForSingleRow(byte[] startInclusive, byte[] endExclusive) {
if (startInclusive.length == 0 || endExclusive.length == 0) {
return false;
}
return Arrays.equals(endExclusive, RangeRequests.nextLexicographicName(startInclusive));
}
@Override
public void deleteRows(TableReference tableRef, Iterable<byte[]> rows) {
Set<ByteBuffer> actualKeys = StreamSupport.stream(rows.spliterator(), false)
.map(ByteBuffer::wrap)
.collect(Collectors.toSet());
if (actualKeys.isEmpty()) {
return;
}
long timestamp = mutationTimestampProvider.getRemoveTimestamp();
Map<ByteBuffer, Map<String, List<Mutation>>> mutationMap = KeyedStream.of(actualKeys)
.map(row -> new Deletion().setTimestamp(timestamp))
.map(deletion -> new Mutation().setDeletion(deletion))
.map(mutation -> keyMutationMapByColumnFamily(tableRef, mutation))
.collectToMap();
try {
clientPool.runWithRetry(client -> {
client.batch_mutate("deleteRows", mutationMap, DELETE_CONSISTENCY);
return null;
});
} catch (RetryLimitReachedException e) {
throw CassandraUtils.wrapInIceForDeleteOrRethrow(e);
} catch (TException e) {
throw Throwables.unwrapAndThrowAtlasDbDependencyException(e);
}
}
private static Map<String, List<Mutation>> keyMutationMapByColumnFamily(
TableReference tableRef, Mutation mutation) {
return ImmutableMap.of(AbstractKeyValueService.internalTableName(tableRef), ImmutableList.of(mutation));
}
@Override
public void deleteAllTimestamps(TableReference tableRef, Map<Cell, TimestampRangeDelete> deletes) {
new CellRangeDeleter(
clientPool,
wrappingQueryRunner,
DELETE_CONSISTENCY,
mutationTimestampProvider::getRangeTombstoneTimestamp)
.deleteAllTimestamps(tableRef, deletes);
}
/**
* Performs non-destructive cleanup when the KVS is no longer needed.
*/
@Override
public void close() {
clientPool.shutdown();
asyncKeyValueService.close();
super.close();
}
/**
* Adds a value with timestamp = Value.INVALID_VALUE_TIMESTAMP to each of the given cells. If
* a value already exists at that time stamp, nothing is written for that cell.
*
* Requires a quorum of Cassandra nodes to be reachable.
*
* @param tableRef the name of the table to add the value to.
* @param cells a set of cells to store the values in.
* @throws AtlasDbDependencyException if fewer than a quorum of Cassandra nodes are reachable.
*/
@Override
public void addGarbageCollectionSentinelValues(TableReference tableRef, Iterable<Cell> cells) {
try {
final Value value = Value.create(PtBytes.EMPTY_BYTE_ARRAY, Value.INVALID_VALUE_TIMESTAMP);
cellValuePutter.putWithOverriddenTimestamps(
"addGarbageCollectionSentinelValues",
tableRef,
Iterables.transform(cells, cell -> Maps.immutableEntry(cell, value)));
} catch (Exception e) {
throw Throwables.unwrapAndThrowAtlasDbDependencyException(e);
}
}
/**
* Gets timestamp values from the key-value store. For each cell, this returns all associated
* timestamps < given_ts.
*
* This method has stronger consistency guarantees than regular read requests. This must return
* all timestamps stored anywhere in the system (because of sweep). Unless all nodes are up and available, this
* method will throw a PalantirRuntimeException.
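*
* <p>A minimal usage sketch (illustrative; the cell and the maximum timestamp are hypothetical, and
* {@code kvs} is an already-initialised instance of this class):
* <pre>{@code
* Cell cell = Cell.create(PtBytes.toBytes("row"), PtBytes.toBytes("col"));
* Multimap<Cell, Long> timestamps = kvs.getAllTimestamps(tableRef, ImmutableSet.of(cell), 100L);
* }</pre>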
*
* @param tableRef the name of the table to retrieve timestamps from.
* @param cells set containing cells to retrieve timestamps for.
* @param ts maximum timestamp to get (exclusive).
* @return multimap of timestamps by cell
* @throws AtlasDbDependencyException if not all Cassandra nodes are reachable.
*/
@Override
public Multimap<Cell, Long> getAllTimestamps(TableReference tableRef, Set<Cell> cells, long ts) {
return cellLoader.getAllTimestamps(tableRef, cells, ts, DELETE_CONSISTENCY);
}
/**
* Puts values into the key-value store. This call does not guarantee
* atomicity across cells. On failure, it is possible that some of the requests will
* have succeeded (without having been rolled back). Similarly, concurrent batched requests may
* interleave. However, concurrent writes to the same Cell will not both report success.
* One of them will throw {@link KeyAlreadyExistsException}.
*
* Requires a quorum of Cassandra nodes to be reachable.
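*
* <p>A minimal usage sketch (illustrative; the cell and value are hypothetical, and {@code kvs} is an
* already-initialised instance of this class):
* <pre>{@code
* Cell cell = Cell.create(PtBytes.toBytes("row"), PtBytes.toBytes("col"));
* try {
*     kvs.putUnlessExists(tableRef, ImmutableMap.of(cell, PtBytes.toBytes("value")));
* } catch (KeyAlreadyExistsException e) {
*     // another writer already created this cell
* }
* }</pre>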
*
* @param tableRef the name of the table to put values into.
* @param values map containing the key-value entries to put.
* @throws AtlasDbDependencyException if fewer than a quorum of Cassandra nodes are reachable.
* @throws KeyAlreadyExistsException if you are putting a Cell with the same timestamp as one that already exists.
*/
@Override
public void putUnlessExists(final TableReference tableRef, final Map<Cell, byte[]> values)
throws KeyAlreadyExistsException {
try {
Optional<KeyAlreadyExistsException> failure = clientPool.runWithRetry(client -> {
Map<ByteString, Map<Cell, byte[]>> partitionedEntries = partitionPerRow(values);
for (Map.Entry<ByteString, Map<Cell, byte[]>> partition : partitionedEntries.entrySet()) {
CASResult casResult =
putUnlessExistsSinglePartition(tableRef, client, partition.getKey(), partition.getValue());
if (!casResult.isSuccess()) {
return Optional.of(new KeyAlreadyExistsException(
"The cells in the table already exist.",
casResult.getCurrent_values().stream()
.map(column -> Cell.create(
partition.getKey().toByteArray(),
CassandraKeyValueServices.decomposeColumn(column.bufferForName())
.columnName()))
.collect(Collectors.toList()),
LoggingArgs.tableRef(tableRef)));
}
}
return Optional.empty();
});
failure.ifPresent(exception -> {
throw exception;
});
} catch (KeyAlreadyExistsException e) {
throw e;
} catch (Exception e) {
throw Throwables.unwrapAndThrowAtlasDbDependencyException(e);
}
}
@Override
public void setOnce(TableReference tableRef, Map<Cell, byte[]> values) {
try {
cellValuePutter.set(
"setOnce",
tableRef,
KeyValueServices.toConstantTimestampValues(values.entrySet(), AtlasDbConstants.TRANSACTION_TS));
} catch (Exception e) {
throw Throwables.unwrapAndThrowAtlasDbDependencyException(e);
}
}
public static Map<ByteString, Map<Cell, byte[]>> partitionPerRow(Map<Cell, byte[]> values) {
return values.entrySet().stream()
.collect(Collectors.groupingBy(
entry -> ByteString.copyFrom(entry.getKey().getRowName()),
Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)));
}
private static CASResult putUnlessExistsSinglePartition(
TableReference tableRef, CassandraClient client, ByteString row, Map<Cell, byte[]> partition)
throws TException {
return client.put_unless_exists(
tableRef,
ByteBuffer.wrap(row.toByteArray()),
partition.entrySet().stream()
.map(CassandraKeyValueServiceImpl::prepareColumnForPutUnlessExists)
.collect(Collectors.toList()),
ConsistencyLevel.SERIAL,
WRITE_CONSISTENCY);
}
private static Column prepareColumnForPutUnlessExists(Map.Entry<Cell, byte[]> insertion) {
return new Column(CassandraKeyValueServices.makeCompositeBuffer(
insertion.getKey().getColumnName(),
// Atlas timestamp
CassandraConstants.CAS_TABLE_TIMESTAMP))
// Cassandra timestamp
.setTimestamp(CassandraConstants.CAS_TABLE_TIMESTAMP)
.setValue(insertion.getValue());
}
@Override
public CheckAndSetCompatibility getCheckAndSetCompatibility() {
return CheckAndSetCompatibility.supportedBuilder()
.supportsMultiCheckAndSetOperations(true)
.supportsDetailOnFailure(true)
.consistentOnFailure(false)
.build();
}
/**
* Performs a check-and-set into the key-value store.
* Please see {@link CheckAndSetRequest} for information about how to create this request,
* and {@link KeyValueService} for more detailed documentation.
*
* Does not require all Cassandra nodes to be up and available, works as long as quorum is achieved.
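*
* <p>A minimal usage sketch (illustrative; assumes the {@code CheckAndSetRequest.singleCell} factory, and
* the cell, old value and new value are all hypothetical; {@code kvs} is an already-initialised instance
* of this class):
* <pre>{@code
* Cell cell = Cell.create(PtBytes.toBytes("row"), PtBytes.toBytes("col"));
* CheckAndSetRequest request =
*         CheckAndSetRequest.singleCell(tableRef, cell, PtBytes.toBytes("old"), PtBytes.toBytes("new"));
* kvs.checkAndSet(request);
* }</pre>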
*
* @param request the request, including table, cell, old value and new value.
* @throws CheckAndSetException if the stored value for the cell was not as expected.
*/
@Override
public void checkAndSet(final CheckAndSetRequest request) throws CheckAndSetException {
try {
CheckAndSetResult<ByteString> casResult =
clientPool.runWithRetry(client -> checkAndSetRunner.executeCheckAndSet(client, request));
if (!casResult.successful()) {
List<byte[]> currentValues = casResult.existingValues().stream()
.map(ByteString::toByteArray)
.collect(Collectors.toList());
throw new CheckAndSetException(
request.cell(), request.table(), request.oldValue().orElse(null), currentValues);
}
} catch (CheckAndSetException e) {
throw e;
} catch (Exception e) {
throw Throwables.unwrapAndThrowAtlasDbDependencyException(e);
}
}
/**
* Performs a check-and-set for multiple cells in a row into the key-value store.
* Please see {@link MultiCheckAndSetRequest} for information about how to create this request,
* and {@link KeyValueService} for more detailed documentation.
*
* If the call completes successfully, then you know that the old cells initially had the values you expected.
* In this case, you can be sure that all your cells have been updated to their new values.
* If the old cells initially did not have the values you expected, none of the cells will be updated and
* {@link MultiCheckAndSetException} will be thrown.
* Reads concurrent with this operation will not see a partial update.
*
* Another thing to note is that the check operation will only be performed on values of cells that are declared
* in the set of expected values, i.e., the check operation does not take updates into account.
*
* Does not require all Cassandra nodes to be up and available, works as long as quorum is achieved.
*
* @param request the request, including table, rowName, old values and new values.
* @throws MultiCheckAndSetException if the stored values for the cells were not as expected.
*/
@Override
public void multiCheckAndSet(MultiCheckAndSetRequest request) throws MultiCheckAndSetException {
TableReference tableRef = request.tableRef();
ByteBuffer row = ByteBuffer.wrap(request.rowName());
List<Column> oldCol = request.expected().entrySet().stream()
.map(CassandraKeyValueServiceImpl::prepareColumnForPutUnlessExists)
.collect(Collectors.toList());
List<Column> newCol = request.updates().entrySet().stream()
.map(CassandraKeyValueServiceImpl::prepareColumnForPutUnlessExists)
.collect(Collectors.toList());
try {
CASResult casResult = clientPool.runWithRetry(client ->
client.cas(tableRef, row, oldCol, newCol, ConsistencyLevel.SERIAL, ConsistencyLevel.EACH_QUORUM));
if (!casResult.isSuccess()) {
Map<Cell, byte[]> currentValues = KeyedStream.of(casResult.getCurrent_values())
.mapKeys(column -> Cell.create(
request.rowName(),
CassandraKeyValueServices.decomposeColumn(column.bufferForName())
.columnName()))
.map(Column::getValue)
.collectToMap();
throw new MultiCheckAndSetException(
LoggingArgs.tableRef(tableRef), request.rowName(), request.expected(), currentValues);
}
} catch (MultiCheckAndSetException e) {
throw e;
} catch (Exception e) {
log.error("Error while executing multi-checkAndSet operation.", e);
throw Throwables.unwrapAndThrowAtlasDbDependencyException(e);
}
}
@Override
public void compactInternally(TableReference tableRef) {
log.info(
"Called compactInternally on {}, but this is a no-op for Cassandra KVS."
+ "Cassandra should eventually decide to compact this table for itself.",
LoggingArgs.tableRef(tableRef));
}
@Override
public ClusterAvailabilityStatus getClusterAvailabilityStatus() {
ClusterAvailabilityStatus clusterStatus = getStatusByRunningOperationsOnEachHost();
if (isClusterQuorumAvailable(clusterStatus) && !doesConfigReplicationFactorMatchWithCluster()) {
return ClusterAvailabilityStatus.TERMINAL;
}
return clusterStatus;
}
@Override
public boolean sweepsEntriesInStrictlyNonDecreasingFashion() {
return true;
}
private static boolean isClusterQuorumAvailable(ClusterAvailabilityStatus clusterStatus) {
return clusterStatus.equals(ClusterAvailabilityStatus.ALL_AVAILABLE)
|| clusterStatus.equals(ClusterAvailabilityStatus.QUORUM_AVAILABLE);
}
private boolean doesConfigReplicationFactorMatchWithCluster() {
return clientPool.runWithRetry(client -> {
try {
CassandraVerifier.currentRfOnKeyspaceMatchesDesiredRf(client, verifierConfig);
return true;
} catch (Exception e) {
log.warn("The config and Cassandra cluster do not agree on the replication factor.", e);
return false;
}
});
}
private ClusterAvailabilityStatus getStatusByRunningOperationsOnEachHost() {
int countUnreachableNodes = 0;
for (CassandraServer server : clientPool.getCurrentPools().keySet()) {
try {
clientPool.runOnCassandraServer(server, CassandraVerifier.healthCheck);
if (!partitionerIsValid(server)) {
return ClusterAvailabilityStatus.TERMINAL;
}
} catch (Exception e) {
countUnreachableNodes++;
}
}
return getNodeAvailabilityStatus(countUnreachableNodes);
}
private boolean partitionerIsValid(CassandraServer host) {
try {
clientPool.runOnCassandraServer(host, clientPool.getValidatePartitioner());
return true;
} catch (Exception e) {
return false;
}
}
private ClusterAvailabilityStatus getNodeAvailabilityStatus(int countUnreachableNodes) {
if (countUnreachableNodes == 0) {
return ClusterAvailabilityStatus.ALL_AVAILABLE;
} else if (isQuorumAvailable(countUnreachableNodes)) {
return ClusterAvailabilityStatus.QUORUM_AVAILABLE;
} else {
return ClusterAvailabilityStatus.NO_QUORUM_AVAILABLE;
}
}
private boolean isQuorumAvailable(int countUnreachableNodes) {
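// Worked example (assumption: replication factor comes from runtime config as below): with a replication
// factor of 3, (3 + 1) / 2 == 2, so at most 1 node may be unreachable for a quorum to remain available.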
int replicationFactor = runtimeConfig.get().replicationFactor();
return countUnreachableNodes < (replicationFactor + 1) / 2;
}
@Override
public CassandraClientPool getClientPool() {
return clientPool;
}
@Override
public TracingQueryRunner getTracingQueryRunner() {
return queryRunner;
}
@Override
public CassandraTables getCassandraTables() {
return cassandraTables;
}
@Override
public boolean performanceIsSensitiveToTombstones() {
return true;
}
/**
* Asynchronously gets values from the Cassandra key-value store.
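*
* <p>A minimal usage sketch (illustrative; the cell and timestamp are hypothetical and {@code kvs} is an
* already-initialised instance of this class):
* <pre>{@code
* Cell cell = Cell.create(PtBytes.toBytes("row"), PtBytes.toBytes("col"));
* ListenableFuture<Map<Cell, Value>> future = kvs.getAsync(tableRef, ImmutableMap.of(cell, 100L));
* Map<Cell, Value> values = Futures.getUnchecked(future); // or register a callback instead of blocking
* }</pre>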
*
* @param tableRef the name of the table to retrieve values from.
* @param timestampByCell specifies, for each cell, the maximum timestamp (exclusive) at which to
* retrieve that cell's value.
* @return a listenable future of a map of retrieved values. Values which do not exist (either
* because they were deleted or were never created in the first place)
* are simply not returned.
*/
@Override
public ListenableFuture<Map<Cell, Value>> getAsync(TableReference tableRef, Map<Cell, Long> timestampByCell) {
if (timestampByCell.isEmpty()) {
log.info("Attempted get with no specified cells", LoggingArgs.tableRef(tableRef));
return Futures.immediateFuture(ImmutableMap.of());
}
if (asyncKeyValueService.isValid()) {
try {
return Futures.catching(
asyncKeyValueService.getAsync(tableRef, timestampByCell),
IllegalStateException.class,
e -> {
log.warn(
"CQL Client closed during getAsync. Delegating to synchronous get. This should be"
+ " very rare, and only happen once after the Cassandra Server list has"
+ " changed.",
e);
return this.get(tableRef, timestampByCell);
},
executor);
} catch (IllegalStateException | DriverInternalError e) {
// The container may have been closed, or we may have reloaded into an invalid ThrowingCqlClient after
// testing for validity; fall back to a synchronous get.
return Futures.immediateFuture(this.get(tableRef, timestampByCell));
}
} else {
return Futures.immediateFuture(this.get(tableRef, timestampByCell));
}
}
private static class TableCellAndValue {
private static byte[] extractRowName(TableCellAndValue input) {
return input.cell.getRowName();
}
private static Long getSize(TableCellAndValue input) {
return input.value.length + Cells.getApproxSizeOfCell(input.cell);
}
private final TableReference tableRef;
private final Cell cell;
private final byte[] value;
TableCellAndValue(TableReference tableRef, Cell cell, byte[] value) {
this.tableRef = tableRef;
this.cell = cell;
this.value = value;
}
}
}