/*
* (c) Copyright 2018 Palantir Technologies Inc. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.palantir.atlasdb.keyvalue.dbkvs.impl;
import com.google.common.base.Function;
import com.google.common.base.Functions;
import com.google.common.base.Stopwatch;
import com.google.common.base.Suppliers;
import com.google.common.collect.AbstractIterator;
import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.Collections2;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSortedMap;
import com.google.common.collect.Iterables;
import com.google.common.collect.Iterators;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimap;
import com.google.common.collect.Multimaps;
import com.google.common.collect.Ordering;
import com.google.common.collect.Sets;
import com.google.common.util.concurrent.Atomics;
import com.google.common.util.concurrent.Futures;
import com.google.common.util.concurrent.ListenableFuture;
import com.palantir.async.initializer.AsyncInitializer;
import com.palantir.atlasdb.AtlasDbConstants;
import com.palantir.atlasdb.keyvalue.api.BatchColumnRangeSelection;
import com.palantir.atlasdb.keyvalue.api.CandidateCellForSweeping;
import com.palantir.atlasdb.keyvalue.api.CandidateCellForSweepingRequest;
import com.palantir.atlasdb.keyvalue.api.Cell;
import com.palantir.atlasdb.keyvalue.api.CheckAndSetCompatibility;
import com.palantir.atlasdb.keyvalue.api.CheckAndSetException;
import com.palantir.atlasdb.keyvalue.api.CheckAndSetRequest;
import com.palantir.atlasdb.keyvalue.api.ClusterAvailabilityStatus;
import com.palantir.atlasdb.keyvalue.api.ColumnRangeSelection;
import com.palantir.atlasdb.keyvalue.api.ColumnSelection;
import com.palantir.atlasdb.keyvalue.api.KeyAlreadyExistsException;
import com.palantir.atlasdb.keyvalue.api.KeyValueService;
import com.palantir.atlasdb.keyvalue.api.MultiCheckAndSetException;
import com.palantir.atlasdb.keyvalue.api.MultiCheckAndSetRequest;
import com.palantir.atlasdb.keyvalue.api.RangeRequest;
import com.palantir.atlasdb.keyvalue.api.RangeRequests;
import com.palantir.atlasdb.keyvalue.api.RowColumnRangeIterator;
import com.palantir.atlasdb.keyvalue.api.RowResult;
import com.palantir.atlasdb.keyvalue.api.TableReference;
import com.palantir.atlasdb.keyvalue.api.TimestampRangeDelete;
import com.palantir.atlasdb.keyvalue.api.Value;
import com.palantir.atlasdb.keyvalue.dbkvs.DbKeyValueServiceConfig;
import com.palantir.atlasdb.keyvalue.dbkvs.DdlConfig;
import com.palantir.atlasdb.keyvalue.dbkvs.H2DdlConfig;
import com.palantir.atlasdb.keyvalue.dbkvs.ImmutablePostgresDdlConfig;
import com.palantir.atlasdb.keyvalue.dbkvs.OracleDdlConfig;
import com.palantir.atlasdb.keyvalue.dbkvs.OracleTableNameGetter;
import com.palantir.atlasdb.keyvalue.dbkvs.OracleTableNameGetterImpl;
import com.palantir.atlasdb.keyvalue.dbkvs.PostgresDdlConfig;
import com.palantir.atlasdb.keyvalue.dbkvs.impl.batch.AccumulatorStrategies;
import com.palantir.atlasdb.keyvalue.dbkvs.impl.batch.BatchingStrategies;
import com.palantir.atlasdb.keyvalue.dbkvs.impl.batch.BatchingTaskRunner;
import com.palantir.atlasdb.keyvalue.dbkvs.impl.batch.ImmediateSingleBatchTaskRunner;
import com.palantir.atlasdb.keyvalue.dbkvs.impl.batch.ParallelTaskRunner;
import com.palantir.atlasdb.keyvalue.dbkvs.impl.oracle.OracleCellTsPageLoader;
import com.palantir.atlasdb.keyvalue.dbkvs.impl.oracle.OracleGetRange;
import com.palantir.atlasdb.keyvalue.dbkvs.impl.oracle.OracleOverflowValueLoader;
import com.palantir.atlasdb.keyvalue.dbkvs.impl.postgres.DbkvsVersionException;
import com.palantir.atlasdb.keyvalue.dbkvs.impl.postgres.PostgresCellTsPageLoader;
import com.palantir.atlasdb.keyvalue.dbkvs.impl.postgres.PostgresGetRange;
import com.palantir.atlasdb.keyvalue.dbkvs.impl.postgres.PostgresPrefixedTableNames;
import com.palantir.atlasdb.keyvalue.dbkvs.impl.ranges.DbKvsGetRange;
import com.palantir.atlasdb.keyvalue.dbkvs.impl.ranges.DbKvsGetRanges;
import com.palantir.atlasdb.keyvalue.dbkvs.impl.sweep.CellTsPairLoader;
import com.palantir.atlasdb.keyvalue.dbkvs.impl.sweep.DbKvsGetCandidateCellsForSweeping;
import com.palantir.atlasdb.keyvalue.dbkvs.util.DbKvsPartitioners;
import com.palantir.atlasdb.keyvalue.impl.AbstractKeyValueService;
import com.palantir.atlasdb.keyvalue.impl.Cells;
import com.palantir.atlasdb.keyvalue.impl.IterablePartitioner;
import com.palantir.atlasdb.keyvalue.impl.LocalRowColumnRangeIterator;
import com.palantir.atlasdb.logging.LoggingArgs;
import com.palantir.atlasdb.spi.SharedResourcesConfig;
import com.palantir.atlasdb.tracing.TraceStatistics;
import com.palantir.common.annotation.Output;
import com.palantir.common.base.ClosableIterator;
import com.palantir.common.base.ClosableIterators;
import com.palantir.common.base.Throwables;
import com.palantir.common.collect.Maps2;
import com.palantir.common.concurrent.BlockingWorkerPool;
import com.palantir.common.concurrent.PTExecutors;
import com.palantir.common.concurrent.SharedFixedExecutors;
import com.palantir.exception.PalantirSqlException;
import com.palantir.logsafe.Preconditions;
import com.palantir.logsafe.SafeArg;
import com.palantir.logsafe.UnsafeArg;
import com.palantir.logsafe.exceptions.SafeNullPointerException;
import com.palantir.logsafe.logger.SafeLogger;
import com.palantir.logsafe.logger.SafeLoggerFactory;
import com.palantir.nexus.db.sql.AgnosticLightResultRow;
import com.palantir.nexus.db.sql.AgnosticResultRow;
import com.palantir.nexus.db.sql.AgnosticResultSet;
import com.palantir.nexus.db.sql.SqlConnection;
import com.palantir.nylon.threads.ThreadNames;
import com.palantir.util.crypto.Sha256Hash;
import com.palantir.util.paging.AbstractPagingIterable;
import com.palantir.util.paging.SimpleTokenBackedResultsPage;
import com.palantir.util.paging.TokenBackedBasicResultsPage;
import java.sql.Connection;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.IdentityHashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;
import java.util.NavigableMap;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.SortedMap;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Supplier;
import java.util.stream.Collectors;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
public final class DbKvs extends AbstractKeyValueService implements DbKeyValueService {
private static final SafeLogger log = SafeLoggerFactory.get(DbKvs.class);
public static final String ROW = "row_name";
public static final String COL = "col_name";
public static final String TIMESTAMP = "ts";
public static final String VAL = "val";
public static final long DEFAULT_GET_RANGE_OF_TS_BATCH = 1_000_000L;
private long maxRangeOfTimestampsBatchSize = DEFAULT_GET_RANGE_OF_TS_BATCH;
private final DdlConfig config;
private final DbTableFactory dbTables;
private final SqlConnectionSupplier connections;
private final BatchingTaskRunner batchingQueryRunner;
private final OverflowValueLoader overflowValueLoader;
private final DbKvsGetRange getRangeStrategy;
private final DbKvsGetCandidateCellsForSweeping getCandidateCellsForSweepingStrategy;
private final InitializingWrapper wrapper = new InitializingWrapper();
public static DbKeyValueService create(DbKeyValueServiceConfig config, SqlConnectionSupplier sqlConnSupplier) {
return create(config, sqlConnSupplier, AtlasDbConstants.DEFAULT_INITIALIZE_ASYNC);
}
public static DbKeyValueService create(
DbKeyValueServiceConfig config, SqlConnectionSupplier sqlConnSupplier, boolean initializeAsync) {
DbKvs dbKvs = createNoInit(config.ddl(), sqlConnSupplier, config.sharedResourcesConfig());
dbKvs.wrapper.initialize(initializeAsync);
return dbKvs.wrapper.isInitialized() ? dbKvs : dbKvs.wrapper;
}
/**
* Creates (but does not initialize) a SQL-backed (Postgres or Oracle) key value store. This method should
* not be used directly; it is exposed only to support legacy software. Prefer ConnectionManagerAwareDbKvs,
* which instantiates a properly initialized DbKvs via the create methods above.
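*
* <p>For illustration only (the configuration and connection supplier construction is application specific
* and is elided here):
* <pre>{@code
* DbKeyValueServiceConfig config = ...;       // application-provided configuration
* SqlConnectionSupplier connections = ...;    // application-provided connection supplier
* KeyValueService kvs = DbKvs.create(config, connections);
* }</pre>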
*/
public static DbKvs createNoInit(
DdlConfig config,
SqlConnectionSupplier connections,
Optional<SharedResourcesConfig> sharedResourcesConfig) {
ExecutorService executor = SharedFixedExecutors.createOrGetShared(
"Atlas Relational KVS",
config.poolSize(),
sharedResourcesConfig.map(SharedResourcesConfig::sharedKvsExecutorSize));
return config.accept(new DdlConfig.Visitor<>() {
@Override
public DbKvs visit(PostgresDdlConfig postgresDdlConfig) {
return createPostgres(executor, postgresDdlConfig, connections);
}
@Override
public DbKvs visit(H2DdlConfig h2DdlConfig) {
PostgresDdlConfig postgresDdlConfig =
ImmutablePostgresDdlConfig.builder().from(h2DdlConfig).build();
return createPostgres(executor, postgresDdlConfig, connections);
}
@Override
public DbKvs visit(OracleDdlConfig oracleDdlConfig) {
return createOracle(executor, oracleDdlConfig, connections);
}
});
}
private static DbKvs createPostgres(
ExecutorService executor, PostgresDdlConfig config, SqlConnectionSupplier connections) {
PostgresPrefixedTableNames prefixedTableNames = new PostgresPrefixedTableNames(config);
DbTableFactory tableFactory = new PostgresDbTableFactory(config, prefixedTableNames);
TableMetadataCache tableMetadataCache = new TableMetadataCache(tableFactory);
CellTsPairLoader cellTsPairLoader = new PostgresCellTsPageLoader(prefixedTableNames, connections);
return new DbKvs(
executor,
config,
tableFactory,
connections,
new ParallelTaskRunner(
newFixedThreadPool(config.poolSize()), config.fetchBatchSize(), config.poolQosSize()),
(conns, tbl, ids) -> Collections.emptyMap(), // no overflow on postgres
new PostgresGetRange(prefixedTableNames, connections, tableMetadataCache),
new DbKvsGetCandidateCellsForSweeping(cellTsPairLoader));
}
private static DbKvs createOracle(
ExecutorService executor, OracleDdlConfig oracleDdlConfig, SqlConnectionSupplier connections) {
OracleTableNameGetter tableNameGetter = OracleTableNameGetterImpl.createDefault(oracleDdlConfig);
OraclePrefixedTableNames prefixedTableNames = new OraclePrefixedTableNames(tableNameGetter);
TableValueStyleCacheImpl valueStyleCache = new TableValueStyleCacheImpl();
DbTableFactory tableFactory = new OracleDbTableFactory(
oracleDdlConfig, tableNameGetter, prefixedTableNames, valueStyleCache, executor);
TableMetadataCache tableMetadataCache = new TableMetadataCache(tableFactory);
OverflowValueLoader overflowValueLoader = new OracleOverflowValueLoader(oracleDdlConfig, tableNameGetter);
DbKvsGetRange getRange = new OracleGetRange(
connections,
overflowValueLoader,
tableNameGetter,
valueStyleCache,
tableMetadataCache,
oracleDdlConfig);
CellTsPairLoader cellTsPageLoader =
new OracleCellTsPageLoader(connections, tableNameGetter, valueStyleCache, oracleDdlConfig);
return new DbKvs(
executor,
oracleDdlConfig,
new OracleDbTableFactory(
oracleDdlConfig,
tableNameGetter,
prefixedTableNames,
valueStyleCache,
PTExecutors.newSingleThreadScheduledExecutor()),
connections,
new ImmediateSingleBatchTaskRunner(),
overflowValueLoader,
getRange,
new DbKvsGetCandidateCellsForSweeping(cellTsPageLoader));
}
private DbKvs(
ExecutorService executor,
DdlConfig config,
DbTableFactory dbTables,
SqlConnectionSupplier connections,
BatchingTaskRunner batchingQueryRunner,
OverflowValueLoader overflowValueLoader,
DbKvsGetRange getRangeStrategy,
DbKvsGetCandidateCellsForSweeping getCandidateCellsForSweepingStrategy) {
super(executor, config.poolQosSize());
this.config = config;
this.dbTables = dbTables;
this.connections = connections;
this.batchingQueryRunner = batchingQueryRunner;
this.overflowValueLoader = overflowValueLoader;
this.getRangeStrategy = getRangeStrategy;
this.getCandidateCellsForSweepingStrategy = getCandidateCellsForSweepingStrategy;
}
private static ExecutorService newFixedThreadPool(int maxPoolSize) {
return PTExecutors.newFixedThreadPool(maxPoolSize, "Atlas DbKvs reader");
}
private void init() {
checkDatabaseVersion();
databaseSpecificInitialization();
createMetadataTable();
}
private void databaseSpecificInitialization() {
runInitialization(new Function<DbTableInitializer, Void>() {
@Nullable
@Override
public Void apply(@Nonnull DbTableInitializer initializer) {
initializer.createUtilityTables();
return null;
}
});
}
private void createMetadataTable() {
runInitialization((Function<DbTableInitializer, Void>) initializer -> {
initializer.createMetadataTable(config.metadataTable().getQualifiedName());
return null;
});
}
@Override
public void close() {
super.close();
dbTables.close();
connections.close();
batchingQueryRunner.close();
}
@Override
public Map<Cell, Value> getRows(
TableReference tableRef, Iterable<byte[]> rows, ColumnSelection columnSelection, long timestamp) {
return getRowsBatching(tableRef, rows, columnSelection, timestamp);
}
@Override
public Map<Cell, Value> get(TableReference tableRef, Map<Cell, Long> timestampByCell) {
return batchingQueryRunner.runTask(
timestampByCell,
BatchingStrategies.forMap(),
AccumulatorStrategies.forMap(),
cellBatch -> runReadAndExtractResults(tableRef, table -> table.getLatestCells(cellBatch, true)));
}
private Map<Cell, Value> getRowsBatching(
TableReference tableRef, Iterable<byte[]> rows, ColumnSelection columnSelection, long timestamp) {
return batchingQueryRunner.runTask(
rows,
BatchingStrategies.forIterable(),
AccumulatorStrategies.forMap(),
rowBatch -> runReadAndExtractResults(
tableRef, table -> table.getLatestRows(rowBatch, columnSelection, timestamp, true)));
}
private Map<Cell, Value> runReadAndExtractResults(
TableReference tableRef, Function<DbReadTable, ClosableIterator<AgnosticLightResultRow>> query) {
return runRead(tableRef, table -> extractResults(table, tableRef, query.apply(table)));
}
@SuppressWarnings("deprecation")
private Map<Cell, Value> extractResults(
DbReadTable table, TableReference tableRef, ClosableIterator<AgnosticLightResultRow> rows) {
Map<Cell, Value> results = new HashMap<>();
Map<Cell, OverflowValue> overflowResults = new HashMap<>();
try (ClosableIterator<AgnosticLightResultRow> iter = rows) {
boolean hasOverflow = table.hasOverflowValues();
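// A cell may appear in multiple result rows; keep only the entry with the greatest timestamp. Overflow
// pointers are collected separately and resolved in bulk once the result set has been drained.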
while (iter.hasNext()) {
AgnosticLightResultRow row = iter.next();
Cell cell = Cell.create(row.getBytes(ROW), row.getBytes(COL));
TraceStatistics.incBytesRead(cell.getRowName().length);
TraceStatistics.incBytesRead(cell.getColumnName().length);
Long overflowId = hasOverflow ? row.getLongObject("overflow") : null;
if (overflowId == null) {
Value value = Value.create(row.getBytes(VAL), row.getLong(TIMESTAMP));
TraceStatistics.incBytesRead(value.getContents().length);
Value oldValue = results.put(cell, value);
if (oldValue != null && oldValue.getTimestamp() > value.getTimestamp()) {
results.put(cell, oldValue);
}
} else {
// Note: the bytes read for overflow values are tracked when fetching the actual value, this
// just pulls a pointer out of the DB (two longs)
TraceStatistics.incBytesRead(2 * 8);
OverflowValue ov = ImmutableOverflowValue.of(row.getLong(TIMESTAMP), overflowId);
OverflowValue oldOv = overflowResults.put(cell, ov);
if (oldOv != null && oldOv.ts() > ov.ts()) {
overflowResults.put(cell, oldOv);
}
}
}
}
fillOverflowValues(table.getConnectionSupplier(), tableRef, overflowResults, results);
return results;
}
@Override
public Map<Cell, Long> getLatestTimestamps(TableReference tableRef, Map<Cell, Long> timestampByCell) {
return batchingQueryRunner.runTask(
timestampByCell,
BatchingStrategies.forMap(),
AccumulatorStrategies.forMap(),
cellBatch -> runRead(tableRef, table -> doGetLatestTimestamps(table, cellBatch)));
}
private static Map<Cell, Long> doGetLatestTimestamps(DbReadTable table, Map<Cell, Long> timestampByCell) {
try (ClosableIterator<AgnosticLightResultRow> iter = table.getLatestCells(timestampByCell, false)) {
Map<Cell, Long> results = new HashMap<>();
while (iter.hasNext()) {
AgnosticLightResultRow row = iter.next();
Cell cell = Cell.create(row.getBytes(ROW), row.getBytes(COL));
long ts = row.getLong(TIMESTAMP);
Long oldTs = results.put(cell, ts);
if (oldTs != null && oldTs > ts) {
results.put(cell, oldTs);
}
}
return results;
}
}
public Function<Map.Entry<Cell, byte[]>, Long> getByteSizingFunction() {
return entry -> Cells.getApproxSizeOfCell(entry.getKey()) + entry.getValue().length;
}
public Function<Map.Entry<Cell, Value>, Long> getValueSizingFunction() {
return entry ->
Cells.getApproxSizeOfCell(entry.getKey()) + entry.getValue().getContents().length;
}
/**
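* Sorts the values for each table (adjacent keys are cheaper to store together), partitions them by count
* and byte size, and writes the resulting batches in parallel on the KVS executor. If the calling thread is
* interrupted while waiting for the submitted batches, the interrupt flag is restored and a RuntimeException
* is thrown.
*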
* @see com.palantir.atlasdb.keyvalue.api.KeyValueService#multiPut(java.util.Map, long)
*/
@Override
public void multiPut(Map<TableReference, ? extends Map<Cell, byte[]>> valuesByTable, final long timestamp)
throws KeyAlreadyExistsException {
List<Callable<Void>> callables = new ArrayList<>();
for (Map.Entry<TableReference, ? extends Map<Cell, byte[]>> e : valuesByTable.entrySet()) {
final TableReference table = e.getKey();
// We sort here because some key value stores are more efficient if you store adjacent keys together.
NavigableMap<Cell, byte[]> sortedMap = ImmutableSortedMap.copyOf(e.getValue());
Iterable<List<Map.Entry<Cell, byte[]>>> partitions = IterablePartitioner.partitionByCountAndBytes(
sortedMap.entrySet(),
getMultiPutBatchCount(),
getMultiPutBatchSizeBytes(),
table,
entry -> entry == null ? 0 : entry.getValue().length + Cells.getApproxSizeOfCell(entry.getKey()));
for (final List<Map.Entry<Cell, byte[]>> p : partitions) {
callables.add(() -> {
String originalName = Thread.currentThread().getName();
ThreadNames.setThreadName(
Thread.currentThread(), "Atlas multiPut of " + p.size() + " cells into " + table);
try {
put(table, Maps2.fromEntries(p), timestamp);
return null;
} finally {
ThreadNames.setThreadName(Thread.currentThread(), originalName);
}
});
}
}
BlockingWorkerPool<Void> pool = new BlockingWorkerPool<>(executor, executorQosSize);
try {
for (Callable<Void> callable : callables) {
pool.submitCallable(callable);
}
pool.waitForSubmittedTasks();
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
throw new RuntimeException(e);
}
}
@Override
public void put(TableReference tableRef, Map<Cell, byte[]> values, long timestamp)
throws KeyAlreadyExistsException {
put(tableRef, values, timestamp, true);
}
private void put(TableReference tableRef, Map<Cell, byte[]> values, long timestamp, boolean idempotent) {
Iterable<List<Map.Entry<Cell, byte[]>>> batches = IterablePartitioner.partitionByCountAndBytes(
values.entrySet(),
config.mutationBatchCount(),
config.mutationBatchSizeBytes(),
tableRef,
getByteSizingFunction());
runReadWrite(tableRef, (readTable, writeTable) -> {
for (List<Map.Entry<Cell, byte[]>> batch : batches) {
try {
writeTable.put(batch, timestamp);
} catch (KeyAlreadyExistsException e) {
if (idempotent) {
putIfNotUpdate(readTable, writeTable, tableRef, batch, timestamp, e);
} else {
throw e;
}
}
}
return null;
});
}
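// Recovery path for puts that hit a primary key violation: re-read the latest cells, drop batch entries
// whose stored value already matches (an idempotent replay), throw KeyAlreadyExistsException on any true
// conflict, and re-write whatever remains.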
private void putIfNotUpdate(
DbReadTable readTable,
DbWriteTable writeTable,
TableReference tableRef,
List<Map.Entry<Cell, Value>> batch,
KeyAlreadyExistsException ex) {
Map<Cell, Long> timestampByCell = new HashMap<>();
for (Map.Entry<Cell, Value> entry : batch) {
timestampByCell.put(entry.getKey(), entry.getValue().getTimestamp() + 1);
}
Map<Cell, Value> results = extractResults(readTable, tableRef, readTable.getLatestCells(timestampByCell, true));
ListIterator<Map.Entry<Cell, Value>> iter = batch.listIterator();
while (iter.hasNext()) {
Map.Entry<Cell, Value> entry = iter.next();
Cell key = entry.getKey();
Value value = entry.getValue();
if (results.containsKey(key)) {
if (results.get(key).equals(value)) {
iter.remove();
} else {
throw new KeyAlreadyExistsException(
"primary key violation", ex, UnsafeArg.of("key", key), UnsafeArg.of("value", value));
}
}
}
writeTable.put(batch);
}
private void putIfNotUpdate(
DbReadTable readTable,
DbWriteTable writeTable,
TableReference tableRef,
List<Map.Entry<Cell, byte[]>> batch,
long timestamp,
KeyAlreadyExistsException ex) {
List<Map.Entry<Cell, Value>> batchValues = Lists.transform(
batch, input -> Maps.immutableEntry(input.getKey(), Value.create(input.getValue(), timestamp)));
putIfNotUpdate(readTable, writeTable, tableRef, batchValues, ex);
}
@Override
public void putWithTimestamps(TableReference tableRef, Multimap<Cell, Value> cellValues)
throws KeyAlreadyExistsException {
Iterable<List<Map.Entry<Cell, Value>>> batches = IterablePartitioner.partitionByCountAndBytes(
cellValues.entries(),
config.mutationBatchCount(),
config.mutationBatchSizeBytes(),
tableRef,
getValueSizingFunction());
runReadWrite(tableRef, (readTable, writeTable) -> {
for (List<Map.Entry<Cell, Value>> batch : batches) {
try {
writeTable.put(batch);
} catch (KeyAlreadyExistsException e) {
putIfNotUpdate(readTable, writeTable, tableRef, batch, e);
}
}
return null;
});
}
@Override
public void putUnlessExists(TableReference tableRef, Map<Cell, byte[]> values) throws KeyAlreadyExistsException {
put(tableRef, values, AtlasDbConstants.TRANSACTION_TS, false);
}
@Override
public void setOnce(TableReference tableRef, Map<Cell, byte[]> values) {
throw new UnsupportedOperationException();
}
@Override
public void checkAndSet(CheckAndSetRequest checkAndSetRequest) throws CheckAndSetException {
if (checkAndSetRequest.oldValue().isPresent()) {
executeCheckAndSet(checkAndSetRequest);
} else {
executePutUnlessExists(checkAndSetRequest);
}
}
@Override
public void multiCheckAndSet(MultiCheckAndSetRequest multiCheckAndSetRequest) throws MultiCheckAndSetException {
throw new UnsupportedOperationException("DbKvs does not support multi-checkAndSet operation!");
}
private void executeCheckAndSet(CheckAndSetRequest request) {
Preconditions.checkArgument(request.oldValue().isPresent());
runWrite(request.table(), table -> {
//noinspection OptionalGetWithoutIsPresent
table.update(
request.cell(),
AtlasDbConstants.TRANSACTION_TS,
request.oldValue().get(),
request.newValue());
return null;
});
}
private void executePutUnlessExists(CheckAndSetRequest checkAndSetRequest) {
try {
Map<Cell, byte[]> value = ImmutableMap.of(checkAndSetRequest.cell(), checkAndSetRequest.newValue());
putUnlessExists(checkAndSetRequest.table(), value);
} catch (KeyAlreadyExistsException e) {
throw new CheckAndSetException("Value unexpectedly present when running check and set", e);
}
}
@Override
public void delete(TableReference tableRef, Multimap<Cell, Long> keys) {
// QA-86494: We sort our deletes here because we have seen oracle deadlock errors here.
ImmutableList<Map.Entry<Cell, Long>> sorted = ORDERING.immutableSortedCopy(keys.entries());
Iterable<List<Map.Entry<Cell, Long>>> partitions = IterablePartitioner.partitionByCountAndBytes(
sorted,
10000,
getMultiPutBatchSizeBytes(),
tableRef,
entry -> Cells.getApproxSizeOfCell(entry.getKey()) + 8);
runWriteForceAutocommit(tableRef, (Function<DbWriteTable, Void>) table -> {
for (List<Map.Entry<Cell, Long>> partition : partitions) {
table.delete(partition);
}
return null;
});
}
private static final Ordering<Map.Entry<Cell, Long>> ORDERING = Ordering.from((entry1, entry2) -> {
int comparison = Ordering.natural().compare(entry1.getKey(), entry2.getKey());
if (comparison == 0) {
comparison = Ordering.natural().compare(entry1.getValue(), entry2.getValue());
}
return comparison;
});
@Override
public void deleteRange(TableReference tableRef, RangeRequest range) {
runWriteForceAutocommit(tableRef, (Function<DbWriteTable, Void>) table -> {
table.delete(range);
return null;
});
}
@Override
public void deleteAllTimestamps(TableReference tableRef, Map<Cell, TimestampRangeDelete> deletes) {
runWriteForceAutocommit(tableRef, (Function<DbWriteTable, Void>) table -> {
table.deleteAllTimestamps(deletes);
return null;
});
}
@Override
public Map<RangeRequest, TokenBackedBasicResultsPage<RowResult<Value>, byte[]>> getFirstBatchForRanges(
TableReference tableRef, Iterable<RangeRequest> rangeRequests, long timestamp) {
return new DbKvsGetRanges(this, dbTables.getDbType(), connections, dbTables.getPrefixedTableNames())
.getFirstBatchForRanges(tableRef, rangeRequests, timestamp);
}
@Override
public ClosableIterator<RowResult<Value>> getRange(
TableReference tableRef, RangeRequest rangeRequest, long timestamp) {
return ClosableIterators.wrapWithEmptyClose(getRangeStrategy.getRange(tableRef, rangeRequest, timestamp));
}
public void setMaxRangeOfTimestampsBatchSize(long newValue) {
maxRangeOfTimestampsBatchSize = newValue;
}
public long getMaxRangeOfTimestampsBatchSize() {
return maxRangeOfTimestampsBatchSize;
}
/**
* @param tableRef the name of the table to read from.
* @param rangeRequest the range to load.
* @param timestamp the maximum timestamp to load.
*
* @return Each row that has fewer than maxRangeOfTimestampsBatchSize entries is guaranteed to be returned in a
* single RowResult. If a row has more than maxRangeOfTimestampsBatchSize results, it will potentially be split
* into multiple RowResults, by finishing the current column; see example below. Note that:
* 1) this may cause a RowResult to have more than maxRangeOfTimestampsBatchSize entries
* 2) this may still finish a row, in which case there is going to be only one RowResult for that row.
* It is, furthermore, guaranteed that the columns will be read in ascending order
*
* E.g., for the following table, rangeRequest taking all rows in ascending order,
* maxRangeOfTimestampsBatchSize == 5, and timestamp 10:
*
* a | b | c | d
* ------------------------------------------------
* a | (1, 2, 3) | (1, 2, 3) | (4, 5, 6) | (4, 5, 6)|
* ------------------------------------------------
* b | (1, 3, 5) | - | (1) | - |
* ------------------------------------------------
* c | (1, 2) | (1, 2) | (4, 5, 6) | (4, 5, 6)|
* ------------------------------------------------
* d | (1, 3, 5) | - | (1, 2, 3) | - |
* ------------------------------------------------
* e | (1, 3) | - | - | - |
* ------------------------------------------------
*
* The RowResults will be:
* 1. (a, (a -> 1, 2, 3; b -> 1, 2, 3))
* 2. (a, (c -> 4, 5, 6; d -> 4, 5, 6))
*
* 3. (b, (a -> 1, 3, 5; b -> 1))
*
* 4. (c, (a -> 1, 2; b -> 1, 2; c -> 4, 5, 6))
* 5. (c, (d -> 4, 5, 6))
*
* 6. (d, (a -> 1, 3, 5; c -> 1, 2, 3))
*
* 7. (e, (a -> 1, 3))
*/
@Override
public ClosableIterator<RowResult<Set<Long>>> getRangeOfTimestamps(
TableReference tableRef, RangeRequest rangeRequest, long timestamp) {
Iterable<RowResult<Set<Long>>> rows =
new AbstractPagingIterable<
RowResult<Set<Long>>, TokenBackedBasicResultsPage<RowResult<Set<Long>>, Token>>() {
@Override
protected TokenBackedBasicResultsPage<RowResult<Set<Long>>, Token> getFirstPage() {
return getTimestampsPage(
tableRef, rangeRequest, timestamp, maxRangeOfTimestampsBatchSize, Tokens.INITIAL);
}
@Override
protected TokenBackedBasicResultsPage<RowResult<Set<Long>>, Token> getNextPage(
TokenBackedBasicResultsPage<RowResult<Set<Long>>, Token> previous) {
Token token = previous.getTokenForNextPage();
RangeRequest newRange = rangeRequest
.getBuilder()
.startRowInclusive(token.row())
.build();
return getTimestampsPage(tableRef, newRange, timestamp, maxRangeOfTimestampsBatchSize, token);
}
};
return ClosableIterators.wrapWithEmptyClose(rows.iterator());
}
@Override
public ClosableIterator<List<CandidateCellForSweeping>> getCandidateCellsForSweeping(
TableReference tableRef, CandidateCellForSweepingRequest request) {
return ClosableIterators.wrapWithEmptyClose(
getCandidateCellsForSweepingStrategy.getCandidateCellsForSweeping(tableRef, request));
}
private TokenBackedBasicResultsPage<RowResult<Set<Long>>, Token> getTimestampsPage(
TableReference tableRef, RangeRequest range, long timestamp, long batchSize, Token token) {
Stopwatch watch = Stopwatch.createStarted();
try {
return runRead(tableRef, table -> getTimestampsPageInternal(table, range, timestamp, batchSize, token));
} finally {
log.debug(
"Call to KVS.getTimestampsPage on table {} took {} ms.",
LoggingArgs.tableRef(tableRef),
SafeArg.of("elapsed", watch.elapsed(TimeUnit.MILLISECONDS)));
}
}
private TokenBackedBasicResultsPage<RowResult<Set<Long>>, Token> getTimestampsPageInternal(
DbReadTable table, RangeRequest range, long timestamp, long batchSize, Token token) {
Set<byte[]> rows = Collections.newSetFromMap(new IdentityHashMap<>());
int maxRows = getMaxRowsFromBatchHint(range.getBatchHint());
try (ClosableIterator<AgnosticLightResultRow> rangeResults = table.getRange(range, timestamp, maxRows)) {
while (rows.size() < maxRows && rangeResults.hasNext()) {
byte[] rowName = rangeResults.next().getBytes(ROW);
if (rowName != null) {
rows.add(rowName);
}
}
if (rows.isEmpty()) {
return SimpleTokenBackedResultsPage.create(null, ImmutableList.of(), false);
}
}
ColumnSelection cols = range.getColumnNames().isEmpty()
? ColumnSelection.all()
: ColumnSelection.create(range.getColumnNames());
TimestampsByCellResultWithToken result =
getTimestampsByCell(table, rows, cols, timestamp, batchSize, range.isReverse(), token);
NavigableMap<byte[], SortedMap<byte[], Set<Long>>> cellsByRow =
Cells.breakCellsUpByRow(Multimaps.asMap(result.entries));
if (range.isReverse()) {
cellsByRow = cellsByRow.descendingMap();
}
List<RowResult<Set<Long>>> finalResults = cellsByRow.entrySet().stream()
.map(entry -> RowResult.create(entry.getKey(), entry.getValue()))
.collect(Collectors.toList());
return SimpleTokenBackedResultsPage.create(result.getToken(), finalResults, result.mayHaveMoreResults());
}
private TimestampsByCellResultWithToken getTimestampsByCell(
DbReadTable table,
Iterable<byte[]> rows,
ColumnSelection columns,
long timestamp,
long batchSize,
boolean isReverse,
Token token) {
try (ClosableIterator<AgnosticLightResultRow> iterator =
table.getAllRows(rows, columns, timestamp, false, DbReadTable.Order.fromBoolean(isReverse))) {
return TimestampsByCellResultWithToken.create(iterator, token, batchSize, isReverse);
}
}
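// Eagerly loads the first page of columns for every requested row, then lazily chains additional pages
// (starting just after the last column seen) for any row whose first page may be incomplete.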
@Override
public Map<byte[], RowColumnRangeIterator> getRowsColumnRange(
TableReference tableRef,
Iterable<byte[]> rows,
BatchColumnRangeSelection batchColumnRangeSelection,
long timestamp) {
List<byte[]> rowList = ImmutableList.copyOf(rows);
Map<byte[], List<Map.Entry<Cell, Value>>> firstPage =
getFirstRowsColumnRangePage(tableRef, rowList, batchColumnRangeSelection, timestamp);
Map<byte[], RowColumnRangeIterator> ret = Maps.newHashMapWithExpectedSize(rowList.size());
for (Map.Entry<byte[], List<Map.Entry<Cell, Value>>> e : firstPage.entrySet()) {
List<Map.Entry<Cell, Value>> results = e.getValue();
if (results.isEmpty()) {
ret.put(e.getKey(), new LocalRowColumnRangeIterator(e.getValue().iterator()));
continue;
}
byte[] lastCol = results.get(results.size() - 1).getKey().getColumnName();
RowColumnRangeIterator firstPageIter =
new LocalRowColumnRangeIterator(e.getValue().iterator());
if (isEndOfColumnRange(lastCol, batchColumnRangeSelection.getEndCol())) {
ret.put(e.getKey(), firstPageIter);
} else {
byte[] nextCol = RangeRequests.nextLexicographicName(lastCol);
BatchColumnRangeSelection nextColumnRangeSelection = BatchColumnRangeSelection.create(
nextCol, batchColumnRangeSelection.getEndCol(), batchColumnRangeSelection.getBatchHint());
Iterator<Map.Entry<Cell, Value>> nextPagesIter =
getRowColumnRange(tableRef, e.getKey(), nextColumnRangeSelection, timestamp);
ret.put(e.getKey(), new LocalRowColumnRangeIterator(Iterators.concat(firstPageIter, nextPagesIter)));
}
}
return ret;
}
@Override
public RowColumnRangeIterator getRowsColumnRange(
TableReference tableRef,
Iterable<byte[]> rows,
ColumnRangeSelection columnRangeSelection,
int cellBatchHint,
long timestamp) {
List<byte[]> rowList = ImmutableList.copyOf(rows);
Map<Sha256Hash, byte[]> rowHashesToBytes = Maps.uniqueIndex(rowList, Sha256Hash::computeHash);
Map<Sha256Hash, Integer> columnCountByRowHash =
getColumnCounts(tableRef, rowList, columnRangeSelection, timestamp);