package com.bazaarvoice.emodb.sor.db.astyanax;
import com.bazaarvoice.emodb.common.api.impl.LimitCounter;
import com.bazaarvoice.emodb.common.cassandra.CqlDriverConfiguration;
import com.bazaarvoice.emodb.common.cassandra.cqldriver.AdaptiveResultSet;
import com.bazaarvoice.emodb.common.uuid.TimeUUIDs;
import com.bazaarvoice.emodb.sor.api.Change;
import com.bazaarvoice.emodb.sor.api.Compaction;
import com.bazaarvoice.emodb.sor.api.ReadConsistency;
import com.bazaarvoice.emodb.sor.api.UnknownTableException;
import com.bazaarvoice.emodb.sor.db.DAOUtils;
import com.bazaarvoice.emodb.sor.db.DataReaderDAO;
import com.bazaarvoice.emodb.sor.db.Key;
import com.bazaarvoice.emodb.sor.db.MultiTableScanOptions;
import com.bazaarvoice.emodb.sor.db.MultiTableScanResult;
import com.bazaarvoice.emodb.sor.db.Record;
import com.bazaarvoice.emodb.sor.db.RecordEntryRawMetadata;
import com.bazaarvoice.emodb.sor.db.ScanRange;
import com.bazaarvoice.emodb.sor.db.ScanRangeSplits;
import com.bazaarvoice.emodb.sor.db.cql.CachingRowGroupIterator;
import com.bazaarvoice.emodb.sor.db.cql.CqlForMultiGets;
import com.bazaarvoice.emodb.sor.db.cql.CqlForScans;
import com.bazaarvoice.emodb.sor.db.cql.CqlReaderDAODelegate;
import com.bazaarvoice.emodb.sor.db.cql.RowGroupResultSetIterator;
import com.bazaarvoice.emodb.sor.db.test.DeltaClusteringKey;
import com.bazaarvoice.emodb.table.db.DroppedTableException;
import com.bazaarvoice.emodb.table.db.Table;
import com.bazaarvoice.emodb.table.db.TableSet;
import com.bazaarvoice.emodb.table.db.astyanax.AstyanaxStorage;
import com.bazaarvoice.emodb.table.db.astyanax.AstyanaxTable;
import com.bazaarvoice.emodb.table.db.astyanax.Placement;
import com.bazaarvoice.emodb.table.db.astyanax.PlacementCache;
import com.bazaarvoice.emodb.table.db.eventregistry.StorageReaderDAO;
import com.codahale.metrics.Meter;
import com.codahale.metrics.MetricRegistry;
import com.codahale.metrics.Timer;
import com.codahale.metrics.annotation.Timed;
import com.datastax.driver.core.CodecRegistry;
import com.datastax.driver.core.ConsistencyLevel;
import com.datastax.driver.core.ProtocolVersion;
import com.datastax.driver.core.ResultSet;
import com.datastax.driver.core.Row;
import com.datastax.driver.core.Session;
import com.datastax.driver.core.Statement;
import com.datastax.driver.core.querybuilder.QueryBuilder;
import com.datastax.driver.core.querybuilder.Select;
import com.datastax.driver.core.utils.MoreFutures;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Supplier;
import com.google.common.base.Suppliers;
import com.google.common.collect.AbstractIterator;
import com.google.common.collect.BoundType;
import com.google.common.collect.FluentIterable;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterators;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimap;
import com.google.common.collect.Ordering;
import com.google.common.collect.PeekingIterator;
import com.google.common.collect.Range;
import com.google.common.util.concurrent.Futures;
import com.google.common.util.concurrent.ListenableFuture;
import com.google.inject.Inject;
import com.netflix.astyanax.model.ByteBufferRange;
import com.netflix.astyanax.util.ByteBufferRangeImpl;
import org.apache.cassandra.utils.ByteBufferUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.annotation.Nullable;
import java.nio.ByteBuffer;
import java.time.Instant;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Spliterators;
import java.util.UUID;
import java.util.concurrent.TimeoutException;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
import static com.datastax.driver.core.querybuilder.QueryBuilder.asc;
import static com.datastax.driver.core.querybuilder.QueryBuilder.desc;
import static com.datastax.driver.core.querybuilder.QueryBuilder.eq;
import static com.datastax.driver.core.querybuilder.QueryBuilder.gt;
import static com.datastax.driver.core.querybuilder.QueryBuilder.gte;
import static com.datastax.driver.core.querybuilder.QueryBuilder.in;
import static com.datastax.driver.core.querybuilder.QueryBuilder.lt;
import static com.datastax.driver.core.querybuilder.QueryBuilder.lte;
import static com.datastax.driver.core.querybuilder.QueryBuilder.token;
import static com.google.common.base.Preconditions.checkArgument;
import static java.util.Objects.requireNonNull;
import static java.util.Optional.ofNullable;
// Delegates to AstyanaxReaderDAO for non-CQL stuff
// Once we transition fully, we will stop delegating to Astyanax
public class CqlBlockedDataReaderDAO implements DataReaderDAO, StorageReaderDAO {
private final Logger _log = LoggerFactory.getLogger(CqlBlockedDataReaderDAO.class);
/**
* Depending on the placement and type of data being queried (delta or delta history) the names of the
* columns being queried can change. However, by querying the columns in a fixed, well-known order in each
* {@link QueryBuilder#select()} the results can be efficiently read by position rather than name.
*/
private static final int ROW_KEY_RESULT_SET_COLUMN = 0;
private static final int CHANGE_ID_RESULT_SET_COLUMN = 1;
private static final int VALUE_RESULT_SET_COLUMN = 2;
private static final int BLOCK_RESULT_SET_COLUMN = 3;
private final DataReaderDAO _astyanaxReaderDAO;
private final ChangeEncoder _changeEncoder;
private final PlacementCache _placementCache;
private final CqlDriverConfiguration _driverConfig;
private final Meter _randomReadMeter;
private final Timer _readBatchTimer;
private final DAOUtils _daoUtils;
private final int _deltaPrefixLength;
// Support AB testing of various uses of the CQL driver versus the older but (at this point) more vetted Astyanax driver.
private volatile Supplier<Boolean> _useCqlForMultiGets = Suppliers.ofInstance(true);
private volatile Supplier<Boolean> _useCqlForScans = Suppliers.ofInstance(true);
@Inject
public CqlBlockedDataReaderDAO(@CqlReaderDAODelegate DataReaderDAO delegate, PlacementCache placementCache,
CqlDriverConfiguration driverConfig, ChangeEncoder changeEncoder,
MetricRegistry metricRegistry, DAOUtils daoUtils, @PrefixLength int deltaPrefixLength) {
_astyanaxReaderDAO = requireNonNull(delegate, "delegate");
_placementCache = placementCache;
_driverConfig = driverConfig;
_changeEncoder = changeEncoder;
_randomReadMeter = metricRegistry.meter(getMetricName("random-reads"));
_readBatchTimer = metricRegistry.timer(getMetricName("readBatch"));
_deltaPrefixLength = deltaPrefixLength;
_daoUtils = daoUtils;
}
private String getMetricName(String name) {
return MetricRegistry.name("bv.emodb.sor", "CqlDataReaderDAO", name);
}
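// For example, getMetricName("random-reads") resolves to "bv.emodb.sor.CqlDataReaderDAO.random-reads",
// since MetricRegistry.name(...) simply joins its arguments with dots.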
// Since AB testing of CQL driver is temporary until proven out don't change the constructor to support this feature.
// Inject the AB testing flags independently. This will make backing these settings out easier in the future.
@Inject
public void setUseCqlforMultiGets(@CqlForMultiGets Supplier<Boolean> useCqlForMultiGets) {
_useCqlForMultiGets = requireNonNull(useCqlForMultiGets, "useCqlForMultiGets");
}
@Inject
public void setUseCqlforScans(@CqlForScans Supplier<Boolean> useCqlForScans) {
_useCqlForScans = requireNonNull(useCqlForScans, "useCqlForScans");
}
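// Illustrative sketch (not part of this file): the @CqlForMultiGets and @CqlForScans suppliers are expected
// to be bound by a Guice module elsewhere; the exact wiring below is an assumption, shown only for context:
//
//   bind(new TypeLiteral<Supplier<Boolean>>() {})
//       .annotatedWith(CqlForMultiGets.class)
//       .toInstance(Suppliers.ofInstance(true));
//   bind(new TypeLiteral<Supplier<Boolean>>() {})
//       .annotatedWith(CqlForScans.class)
//       .toInstance(Suppliers.ofInstance(true));
//
// Binding a dynamic Supplier instead of a constant allows the AB-test flags to be flipped at runtime.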
/**
* This CQL-based read method works for a row with 64 deltas of 3 MB each. The same read with the AstyanaxDataReaderDAO
* would give Thrift frame errors.
*/
@Override
public Record read(Key key, ReadConsistency consistency) {
requireNonNull(key, "key");
requireNonNull(consistency, "consistency");
AstyanaxTable table = (AstyanaxTable) key.getTable();
AstyanaxStorage storage = table.getReadStorage();
DeltaPlacement placement = (DeltaPlacement) storage.getPlacement();
ByteBuffer rowKey = storage.getRowKey(key.getKey());
return read(key, rowKey, consistency, placement);
}
@Override
public Iterator<Record> readAll(Collection<Key> keys, final ReadConsistency consistency) {
if (!_useCqlForMultiGets.get()) {
return _astyanaxReaderDAO.readAll(keys, consistency);
}
requireNonNull(keys, "keys");
requireNonNull(consistency, "consistency");
// Group the keys by placement. Each placement will result in a separate set of queries. Dedup keys.
Multimap<DeltaPlacement, Key> placementMap = HashMultimap.create();
for (Key key : keys) {
AstyanaxTable table = (AstyanaxTable) key.getTable();
AstyanaxStorage storage = table.getReadStorage();
placementMap.put((DeltaPlacement) storage.getPlacement(), key);
}
// Return an iterator that will loop over the placements and perform a query for each placement and
// return the resulting decoded rows.
return touch(Iterators.concat(Iterators.transform(placementMap.asMap().entrySet().iterator(),
entry -> readBatch(entry.getKey(), entry.getValue(), consistency))));
}
@Override
public String getPlacementCluster(String placementName) {
requireNonNull(placementName, "placement");
DeltaPlacement placement = (DeltaPlacement) _placementCache.get(placementName);
return placement.getKeyspace().getClusterName();
}
private Record read(Key key, ByteBuffer rowKey, ReadConsistency consistency, DeltaPlacement placement) {
requireNonNull(key, "key");
requireNonNull(consistency, "consistency");
BlockedDeltaTableDDL tableDDL = placement.getBlockedDeltaTableDDL();
Statement statement = selectDeltaFrom(tableDDL)
.where(eq(tableDDL.getRowKeyColumnName(), rowKey))
.setConsistencyLevel(SorConsistencies.toCql(consistency));
// Track metrics
_randomReadMeter.mark();
Iterator<Iterable<Row>> groupedRows = deltaQuery(placement, statement, true, "Failed to read record %s", key);
Iterable<Row> rows;
if (groupedRows.hasNext()) {
rows = groupedRows.next();
} else {
rows = ImmutableList.of();
}
// Convert the results into a Record object, lazily fetching the rest of the columns as necessary.
return newRecordFromCql(key, rows, placement, ByteBufferUtil.bytesToHex(rowKey));
}
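// For reference, the statement built above corresponds roughly to the following CQL; the actual table and
// column names come from the placement's BlockedDeltaTableDDL and are placeholders here:
//
//   SELECT rowkey, changeid, value, block FROM <delta_table> WHERE rowkey = <rowKey>;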
/**
* Synchronously executes the provided statement. The statement must query the delta table as returned from
* {@link com.bazaarvoice.emodb.sor.db.astyanax.DeltaPlacement#getBlockedDeltaTableDDL()}
*/
private Iterator<Iterable<Row>> deltaQuery(DeltaPlacement placement, Statement statement, boolean singleRow,
String errorContext, Object... errorContextArgs) {
return doDeltaQuery(placement, statement, singleRow, false, errorContext, errorContextArgs);
}
/**
* Asynchronously executes the provided statement. Although the iterator is returned immediately the actual results
* may still be loading in the background. The statement must query the delta table as returned from
* {@link com.bazaarvoice.emodb.sor.db.astyanax.DeltaPlacement#getBlockedDeltaTableDDL()}
*/
private Iterator<Iterable<Row>> deltaQueryAsync(DeltaPlacement placement, Statement statement, boolean singleRow,
String errorContext, Object... errorContextArgs) {
return doDeltaQuery(placement, statement, singleRow, true, errorContext, errorContextArgs);
}
private Iterator<Iterable<Row>> doDeltaQuery(DeltaPlacement placement, Statement statement, boolean singleRow, boolean async,
String errorContext, Object... errorContextArgs) {
// Set the fetch size and prefetch limits depending on whether the query is for a single row or multiple rows.
int fetchSize = singleRow ? _driverConfig.getSingleRowFetchSize() : _driverConfig.getMultiRowFetchSize();
int prefetchLimit = singleRow ? _driverConfig.getSingleRowPrefetchLimit() : _driverConfig.getMultiRowPrefetchLimit();
Session session = placement.getKeyspace().getCqlSession();
DeltaRowGroupResultSetIterator deltaRowGroupResultSetIterator;
if (async) {
ListenableFuture<ResultSet> resultSetFuture = AdaptiveResultSet.executeAdaptiveQueryAsync(session, statement, fetchSize);
deltaRowGroupResultSetIterator = new DeltaRowGroupResultSetIterator(
resultSetFuture, prefetchLimit, placement, statement.getConsistencyLevel());
Futures.addCallback(resultSetFuture, new MoreFutures.FailureCallback<ResultSet>() {
@Override
public void onFailure(Throwable t) {
_log.error(String.format(errorContext, errorContextArgs), t);
}
});
} else {
try {
ResultSet resultSet = AdaptiveResultSet.executeAdaptiveQuery(session, statement, fetchSize);
deltaRowGroupResultSetIterator = new DeltaRowGroupResultSetIterator(
resultSet, prefetchLimit, placement, statement.getConsistencyLevel());
} catch (Throwable t) {
_log.error(String.format(errorContext, errorContextArgs), t);
throw t;
}
}
return new CachingRowGroupIterator(deltaRowGroupResultSetIterator, _driverConfig.getRecordCacheSize(), _driverConfig.getRecordSoftCacheSize());
}
/**
* Creates a Record instance for a given key and list of rows. All rows must be from the same Cassandra row;
* in other words, it is expected that row.getBytesUnsafe(ROW_KEY_RESULT_SET_COLUMN) returns the same value for
* each row in rows.
*/
private Record newRecordFromCql(Key key, Iterable<Row> rows, Placement placement, String rowKey) {
Session session = placement.getKeyspace().getCqlSession();
ProtocolVersion protocolVersion = session.getCluster().getConfiguration().getProtocolOptions().getProtocolVersion();
CodecRegistry codecRegistry = session.getCluster().getConfiguration().getCodecRegistry();
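// The three decoders below each call rows.iterator() independently. This is safe because the row group is
// typically backed by CachingRowGroupIterator, which caches the first rows (and soft-caches the rest) so
// they can be re-read without issuing another query.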
Iterator<Map.Entry<DeltaClusteringKey, Change>> changeIter = decodeChangesFromCql(new CqlDeltaIterator(rows.iterator(), BLOCK_RESULT_SET_COLUMN, CHANGE_ID_RESULT_SET_COLUMN, VALUE_RESULT_SET_COLUMN, false, _deltaPrefixLength, protocolVersion, codecRegistry, rowKey));
Iterator<Map.Entry<DeltaClusteringKey, Compaction>> compactionIter = decodeCompactionsFromCql(new CqlDeltaIterator(rows.iterator(), BLOCK_RESULT_SET_COLUMN, CHANGE_ID_RESULT_SET_COLUMN, VALUE_RESULT_SET_COLUMN, false, _deltaPrefixLength, protocolVersion, codecRegistry, rowKey));
Iterator<RecordEntryRawMetadata> rawMetadataIter = rawMetadataFromCql(new CqlDeltaIterator(rows.iterator(), BLOCK_RESULT_SET_COLUMN, CHANGE_ID_RESULT_SET_COLUMN, VALUE_RESULT_SET_COLUMN, false, _deltaPrefixLength, protocolVersion, codecRegistry, rowKey));
return new RecordImpl(key, compactionIter, changeIter, rawMetadataIter);
}
/**
* Converts a list of rows into Change instances.
*/
private Iterator<Map.Entry<DeltaClusteringKey, Change>> decodeChangesFromCql(final Iterator<StitchedRow> iter) {
return Iterators.transform(iter, row ->
Maps.immutableEntry(new DeltaClusteringKey(getChangeId(row), row.getNumBlocks()), _changeEncoder.decodeChange(getChangeId(row), _daoUtils.skipPrefix(getValue(row)))));
}
/**
* Like {@link #decodeChangesFromCql(java.util.Iterator)} except filtered to only include compactions.
*/
private Iterator<Map.Entry<DeltaClusteringKey, Compaction>> decodeCompactionsFromCql(final Iterator<StitchedRow> iter) {
return new AbstractIterator<Map.Entry<DeltaClusteringKey, Compaction>>() {
@Override
protected Map.Entry<DeltaClusteringKey, Compaction> computeNext() {
while (iter.hasNext()) {
StitchedRow row = iter.next();
Compaction compaction = _changeEncoder.decodeCompaction(_daoUtils.skipPrefix(getValue(row)));
if (compaction != null) {
return Maps.immutableEntry(new DeltaClusteringKey(getChangeId(row),row.getNumBlocks()), compaction);
}
}
return endOfData();
}
};
}
/**
* Converts the rows from the provided iterator into raw metadata.
*/
private Iterator<RecordEntryRawMetadata> rawMetadataFromCql(final Iterator<StitchedRow> iter) {
return Iterators.transform(iter, row -> new RecordEntryRawMetadata()
.withTimestamp(TimeUUIDs.getTimeMillis(getChangeId(row)))
.withSize(_daoUtils.skipPrefix(getValue(row)).remaining()));
}
/**
* Read a batch of keys that all belong to the same placement (ColumnFamily).
*/
private Iterator<Record> readBatch(final DeltaPlacement placement, final Collection<Key> keys, final ReadConsistency consistency) {
requireNonNull(keys, "keys");
// Convert the keys to ByteBuffer Cassandra row keys
List<Map.Entry<ByteBuffer, Key>> rowKeys = Lists.newArrayListWithCapacity(keys.size());
for (Key key : keys) {
AstyanaxTable table = (AstyanaxTable) key.getTable();
AstyanaxStorage storage = table.getReadStorage();
rowKeys.add(Maps.immutableEntry(storage.getRowKey(key.getKey()), key));
}
// Sort the keys by their byte array encoding to get some locality w/queries.
Collections.sort(rowKeys, Ordering.natural().onResultOf(entry -> entry.getKey()));
// Group them into batches. Cassandra may have to seek each row so prefer smaller batches.
List<List<Map.Entry<ByteBuffer, Key>>> batches = Lists.partition(rowKeys, _driverConfig.getMaxRandomRowsBatchSize());
// This algorithm is arranged such that rows are returned in pages of the configured fetch size. The rows are grouped
// into row groups by common row key. The first RECORD_CACHE_SIZE rows are cached for the row group
// and any remaining rows are cached using soft references. This places an upper bound on the memory
// requirements needed while iterating. If at any time a soft reference is lost C* is re-queried to
// fetch the missing columns.
return Iterators.concat(Iterators.transform(batches.iterator(),
rowKeySubset -> {
Timer.Context timerCtx = _readBatchTimer.time();
try {
return rowQuery(rowKeySubset, consistency, placement);
} finally {
timerCtx.stop();
}
}));
}
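// For example, assuming a max random-rows batch size of 50 (the actual value comes from
// CqlDriverConfiguration), a readAll() of 120 keys in one placement issues three IN-clause queries covering
// 50, 50 and 20 row keys respectively.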
/**
* Returns an iterator for the Records keyed by the provided row keys. An empty record is returned for any
* key which does not have a corresponding row in C*.
*/
private Iterator<Record> rowQuery(final List<Map.Entry<ByteBuffer, Key>> rowKeys, final ReadConsistency consistency,
final DeltaPlacement placement) {
List<ByteBuffer> keys = Lists.newArrayListWithCapacity(rowKeys.size());
final Map<ByteBuffer, Key> rawKeyMap = Maps.newHashMap();
for (Map.Entry<ByteBuffer, Key> entry : rowKeys) {
keys.add(entry.getKey());
rawKeyMap.put(entry.getKey(), entry.getValue());
}
BlockedDeltaTableDDL tableDDL = placement.getBlockedDeltaTableDDL();
Statement statement = selectDeltaFrom(tableDDL)
.where(in(tableDDL.getRowKeyColumnName(), keys))
.setConsistencyLevel(SorConsistencies.toCql(consistency));
Iterator<Iterable<Row>> rowGroups = deltaQueryAsync(placement, statement, false, "Failed to read records %s", rawKeyMap.values());
return Iterators.concat(
// First iterator reads the row groups found and transforms them to Records
Iterators.transform(rowGroups, rows -> {
ByteBuffer keyBytes = getRawKeyFromRowGroup(rows);
Key key = rawKeyMap.remove(keyBytes);
assert key != null : "Query returned row with a key out of bound";
return newRecordFromCql(key, rows, placement, ByteBufferUtil.bytesToHex(keyBytes));
}),
// Second iterator returns an empty Record for each key queried but not found.
new AbstractIterator<Record>() {
private Iterator<Key> _nonExistentKeyIterator;
@Override
protected Record computeNext() {
// Lazily return an empty record for each key not found in the previous iterator.
// rawKeyMap.iterator() must not be called until the first iterator is completely spent.
if (_nonExistentKeyIterator == null) {
_nonExistentKeyIterator = rawKeyMap.values().iterator();
}
return _nonExistentKeyIterator.hasNext() ?
emptyRecord(_nonExistentKeyIterator.next()) :
endOfData();
}
});
}
/**
* Returns a select statement builder for a {@link TableDDL} with the columns ordered in the order set by
* {@link #ROW_KEY_RESULT_SET_COLUMN}, {@link #CHANGE_ID_RESULT_SET_COLUMN}, and {@link #VALUE_RESULT_SET_COLUMN}.
*/
private Select selectFrom(TableDDL tableDDL) {
return QueryBuilder.select()
.column(tableDDL.getRowKeyColumnName()) // ROW_KEY_RESULT_SET_COLUMN
.column(tableDDL.getChangeIdColumnName()) // CHANGE_ID_RESULT_SET_COLUMN
.column(tableDDL.getValueColumnName()) // VALUE_RESULT_SET_COLUMN
.from(tableDDL.getTableMetadata());
}
private Select selectDeltaFrom(BlockedDeltaTableDDL tableDDL) {
return QueryBuilder.select()
.column(tableDDL.getRowKeyColumnName()) // ROW_KEY_RESULT_SET_COLUMN
.column(tableDDL.getChangeIdColumnName()) // CHANGE_ID_RESULT_SET_COLUMN
.column(tableDDL.getValueColumnName()) // VALUE_RESULT_SET_COLUMN
.column(tableDDL.getBlockColumnName()) // BLOCK_RESULT_SET_COLUMN
.from(tableDDL.getTableMetadata());
}
private ByteBuffer getKey(Row row) {
return row.getBytesUnsafe(ROW_KEY_RESULT_SET_COLUMN);
}
private UUID getChangeId(Row row) {
return row.getUUID(CHANGE_ID_RESULT_SET_COLUMN);
}
private int getBlock(Row row) {
return row.getInt(BLOCK_RESULT_SET_COLUMN);
}
private ByteBuffer getValue(Row row) {
return row.getBytesUnsafe(VALUE_RESULT_SET_COLUMN);
}
/**
* A few notes on this method:
*
* - All rows in the row group have the same key, so choosing the first row is safe.
* - The rowGroup will always contain at least one row.
* - The row group has at least the first row in hard cache, so iterating to the first row will never
* result in a new CQL query.
*
*/
private ByteBuffer getRawKeyFromRowGroup(Iterable<Row> rowGroup) {
Iterator<Row> iter = rowGroup.iterator();
// Sanity check
assert iter.hasNext() : "Row group should never contain zero rows";
return getKey(iter.next());
}
/**
* Similar to {@link #getRawKeyFromRowGroup(Iterable)} except it may be used where the row group contains no rows,
* in which case it returns null.
*/
private ByteBuffer getRawKeyFromRowGroupOrNull(Iterable<Row> filteredRowGroup) {
Iterator<Row> iter = filteredRowGroup.iterator();
return iter.hasNext() ? getKey(iter.next()) : null;
}
private <T> Iterator<T> touch(Iterator<T> iter) {
// Could return a Guava PeekingIterator after "if (iter.hasNext()) iter.peek()", but simply calling hasNext()
// is sufficient for the iterator implementations used by this DAO class...
iter.hasNext();
return iter;
}
@Timed(name = "bv.emodb.sor.CqlDataReaderDAO.scan", absolute = true)
@Override
public Iterator<Record> scan(Table tbl, @Nullable String fromKeyExclusive, final LimitCounter ignore_limit,
final ReadConsistency consistency) {
// Note: The LimitCounter is passed in as an artifact of Astyanax batching and was used as a mechanism to
// control paging. The CQL driver natively performs this functionality so it is not used here. The caller
// will apply limit boundaries on the results from this method.
if (!_useCqlForScans.get()) {
return _astyanaxReaderDAO.scan(tbl, fromKeyExclusive, ignore_limit, consistency);
}
requireNonNull(tbl, "table");
requireNonNull(consistency, "consistency");
final AstyanaxTable table = (AstyanaxTable) tbl;
AstyanaxStorage storage = table.getReadStorage();
final DeltaPlacement placement = (DeltaPlacement) storage.getPlacement();
// Loop over all the range prefixes (2^shardsLog2 of them) and, for each, execute Cassandra queries to
// page through the records with that prefix.
final Iterator<ByteBufferRange> scanIter = storage.scanIterator(fromKeyExclusive);
return touch(Iterators.concat(new AbstractIterator<Iterator<Record>>() {
@Override
protected Iterator<Record> computeNext() {
if (scanIter.hasNext()) {
ByteBufferRange keyRange = scanIter.next();
return recordScan(placement, table, keyRange, consistency);
}
return endOfData();
}
}));
}
@Override
public Iterator<Record> getSplit(Table tbl, String split, @Nullable String fromKeyExclusive, LimitCounter ignore_limit,
ReadConsistency consistency) {
// Note: The LimitCounter is passed in as an artifact of Astyanax batching and was used as a mechanism to
// control paging. The CQL driver natively performs this functionality so it is not used here. The caller
// will apply limit boundaries on the results from this method.
if (!_useCqlForScans.get()) {
return _astyanaxReaderDAO.getSplit(tbl, split, fromKeyExclusive, ignore_limit, consistency);
}
requireNonNull(tbl, "table");
requireNonNull(split, "split");
requireNonNull(consistency, "consistency");
ByteBufferRange splitRange = SplitFormat.decode(split);
AstyanaxTable table = (AstyanaxTable) tbl;
AstyanaxStorage storage = getStorageForSplit(table, splitRange);
DeltaPlacement placement = (DeltaPlacement) storage.getPlacement();
ByteBufferRange keyRange = storage.getSplitRange(splitRange, fromKeyExclusive, split);
// The fromKeyExclusive might be equal to the end token of the split. If so, there's nothing to return.
if (keyRange.getStart().equals(keyRange.getEnd())) {
return Collections.emptyIterator();
}
return recordScan(placement, table, keyRange, consistency);
}
/**
* Scans a range of keys and returns an iterator containing each row's columns as an iterable.
*/
private Iterator<Iterable<Row>> rowScan(DeltaPlacement placement, @Nullable AstyanaxTable table, ByteBufferRange keyRange,
ReadConsistency consistency) {
ByteBuffer startToken = keyRange.getStart();
ByteBuffer endToken = keyRange.getEnd();
// Note: if Cassandra is asked to perform a token range query where start >= end it will wrap
// around which is absolutely *not* what we want.
checkArgument(AstyanaxStorage.compareKeys(startToken, endToken) < 0, "Cannot scan rows which loop from maximum- to minimum-token");
BlockedDeltaTableDDL tableDDL = placement.getBlockedDeltaTableDDL();
Statement statement = selectDeltaFrom(tableDDL)
.where(gt(token(tableDDL.getRowKeyColumnName()), startToken))
.and(lte(token(tableDDL.getRowKeyColumnName()), endToken))
.setConsistencyLevel(SorConsistencies.toCql(consistency));
return deltaQueryAsync(placement, statement, false, "Failed to scan token range [%s, %s] for %s",
ByteBufferUtil.bytesToHex(startToken), ByteBufferUtil.bytesToHex(endToken),
table != null ? table : "multiple tables");
}
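// The statement above corresponds roughly to the following CQL (placeholder names; the actual names come
// from the BlockedDeltaTableDDL):
//
//   SELECT rowkey, changeid, value, block FROM <delta_table>
//   WHERE token(rowkey) > <startToken> AND token(rowkey) <= <endToken>;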
/**
* Similar to {@link #rowScan(DeltaPlacement, AstyanaxTable, com.netflix.astyanax.model.ByteBufferRange, com.bazaarvoice.emodb.sor.api.ReadConsistency)}
* except this method converts each C* row into a Record.
*/
private Iterator<Record> recordScan(DeltaPlacement placement, AstyanaxTable table, ByteBufferRange keyRange,
ReadConsistency consistency) {
Iterator<Iterable<Row>> rowGroups = rowScan(placement, table, keyRange, consistency);
return decodeRows(rowGroups, table, placement);
}
/**
* Converts each group of rows from a single C* wide row into a Record.
*/
private Iterator<Record> decodeRows(Iterator<Iterable<Row>> rowGroups, final AstyanaxTable table, Placement placement) {
return Iterators.transform(rowGroups, rowGroup -> {
String key = AstyanaxStorage.getContentKey(getRawKeyFromRowGroup(rowGroup));
return newRecordFromCql(new Key(table, key), rowGroup, placement, ByteBufferUtil.bytesToHex(getRawKeyFromRowGroupOrNull(rowGroup)));
});
}
@Override
public Iterator<MultiTableScanResult> multiTableScan(final MultiTableScanOptions query, final TableSet tables,
final LimitCounter limit, final ReadConsistency consistency, @Nullable Instant cutoffTime) {
if (!_useCqlForScans.get()) {
return _astyanaxReaderDAO.multiTableScan(query, tables, limit, consistency, cutoffTime);
}
requireNonNull(query, "query");
String placementName = requireNonNull(query.getPlacement(), "placement");
final DeltaPlacement placement = (DeltaPlacement) _placementCache.get(placementName);
ScanRange scanRange = ofNullable(query.getScanRange()).orElse(ScanRange.all());
// Since the range may wrap from high to low end of the token range we need to unwrap it
List<ScanRange> ranges = scanRange.unwrapped();
return touch(FluentIterable.from(ranges)
.transformAndConcat(rowRange -> scanMultiTableRows(
tables, placement, rowRange.asByteBufferRange(), limit, query.isIncludeDeletedTables(),
query.isIncludeMirrorTables(), consistency, cutoffTime))
.iterator());
}
/**
* Decodes rows returned by scanning across tables.
*/
private Iterable<MultiTableScanResult> scanMultiTableRows(
final TableSet tables, final DeltaPlacement placement, final ByteBufferRange rowRange,
final LimitCounter limit, final boolean includeDroppedTables, final boolean includeMirrorTables,
final ReadConsistency consistency, final Instant cutoffTime) {
// Avoid pinning multiple decoded rows in memory at once.
return () -> limit.limit(new AbstractIterator<MultiTableScanResult>() {
private PeekingIterator<Iterable<Row>> _iter = Iterators.peekingIterator(
rowScan(placement, null, rowRange, consistency));
private long _lastTableUuid = -1;
private AstyanaxTable _table = null;
private boolean _droppedTable;
private boolean _primaryTable;
@Override
protected MultiTableScanResult computeNext() {
while (_iter.hasNext()) {
// Get the next rows from the grouping iterator. All rows in the returned Iterable
// are from the same Cassandra wide row (in other words, they share the same key).
final Iterable<Row> rows = _iter.next();
// Filter the rows if a cutoff time is specified.
Iterable<Row> filteredRows = rows;
if (cutoffTime != null) {
filteredRows = getFilteredRows(rows, cutoffTime);
}
// Convert the filteredRows into a Record object
ByteBuffer rowKey = getRawKeyFromRowGroupOrNull(filteredRows);
// rowKey can be null if all the rows of the Cassandra record fall at or after the cutoff time. In that case ignore the record and continue.
if (rowKey == null) {
continue;
}
long tableUuid = AstyanaxStorage.getTableUuid(rowKey);
if (_lastTableUuid != tableUuid) {
_lastTableUuid = tableUuid;
try {
_table = (AstyanaxTable) tables.getByUuid(tableUuid);
} catch (UnknownTableException e) {
_table = AstyanaxTable.createUnknown(tableUuid, placement, e.getTable());
} catch (DroppedTableException e) {
_table = AstyanaxTable.createUnknown(tableUuid, placement, e.getPriorTable());
}
_droppedTable = _table.isUnknownTable();
_primaryTable = _table.getReadStorage().hasUUID(tableUuid);
}
// Skip dropped and mirror tables if configured
if ((!includeDroppedTables && _droppedTable) || (!includeMirrorTables && !_primaryTable)) {
_iter = skipToNextTable(tableUuid);
continue;
}
int shardId = AstyanaxStorage.getShardId(rowKey);
String key = AstyanaxStorage.getContentKey(rowKey);
Record record = newRecordFromCql(new Key(_table, key), filteredRows, placement, ByteBufferUtil.bytesToHex(rowKey));
return new MultiTableScanResult(rowKey, shardId, tableUuid, _droppedTable, record);
}
return endOfData();
}
private PeekingIterator<Iterable<Row>> skipToNextTable(long tableUuid) {
// Iterate over the next 10 row groups first to check for a table switch. This avoids starting a new range
// query if the number of rows in the undesired table is small.
int skipLimit = 10;
Iterable<Row> rowGroup = null;
while (skipLimit != 0 && _iter.hasNext()) {
rowGroup = _iter.peek();
ByteBuffer rawKey = getRawKeyFromRowGroup(rowGroup);
long nextTableUuid = AstyanaxStorage.getTableUuid(rawKey);
if (nextTableUuid != tableUuid) {
// This is the first row of a new table
return _iter;
} else {
_iter.next();
skipLimit -= 1;
}
}
if (_iter.hasNext()) {
// Skip the table entirely by starting a new query on the next possible table
assert rowGroup != null;
int shardId = AstyanaxStorage.getShardId(getRawKeyFromRowGroup(rowGroup));
ByteBuffer nextPossibleTableStart = AstyanaxStorage.getRowKeyRaw(shardId, tableUuid + 1, "");
ByteBuffer end = rowRange.getEnd();
if (AstyanaxStorage.compareKeys(nextPossibleTableStart, end) < 0) {
// We haven't reached the last end boundary of the original range scan
ByteBufferRange updatedRange = new ByteBufferRangeImpl(nextPossibleTableStart, end, -1, false);
return Iterators.peekingIterator(rowScan(placement, null, updatedRange, consistency));
}
}
return Iterators.peekingIterator(Collections.emptyIterator());
}
});
}
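// Note on skipToNextTable() above: up to 10 row groups from an excluded (dropped or mirror) table are
// consumed in place; if the excluded table is longer than that, a fresh token-range query is started at
// getRowKeyRaw(shardId, tableUuid + 1, ""), i.e. the first possible row key of the next table UUID within
// the same shard, provided that key is still inside the original scan range.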
private AstyanaxStorage getStorageForSplit(AstyanaxTable table, ByteBufferRange splitRange) {
// During a table move, after the internal copy is complete getSplits() will return split IDs that point to
// the new storage location (table.getReadStorage()) but must still support old split IDs from the old
// storage location for a while.
if (!table.getReadStorage().contains(splitRange.getStart())) {
for (AstyanaxStorage storage : table.getWriteStorage()) {
if (storage.contains(splitRange.getStart()) && storage.getReadsAllowed()) {
return storage;
}
}
}
return table.getReadStorage();
}
/**
* Implementation of {@link RowGroupResultSetIterator} that reads row groups from a delta table.
*/
private class DeltaRowGroupResultSetIterator extends RowGroupResultSetIterator {
private final DeltaPlacement _placement;
private final ConsistencyLevel _consistency;
private DeltaRowGroupResultSetIterator(ResultSet resultSet, int prefetchLimit,
DeltaPlacement placement, ConsistencyLevel consistency) {
super(resultSet, prefetchLimit);
_placement = placement;
_consistency = consistency;
}
private DeltaRowGroupResultSetIterator(ListenableFuture<ResultSet> resultSetFuture, int prefetchLimit,
DeltaPlacement placement, ConsistencyLevel consistency) {
super(resultSetFuture, prefetchLimit);
_placement = placement;
_consistency = consistency;
}
@Override
protected Object getKeyForRow(Row row) {
return CqlBlockedDataReaderDAO.this.getKey(row);
}
@Override
protected ResultSet queryRowGroupRowsAfter(Row row) {
Statement statement = selectDeltaFrom(_placement.getBlockedDeltaTableDDL())
.where(eq(_placement.getBlockedDeltaTableDDL().getRowKeyColumnName(), getKey(row)))
.and(gt(ImmutableList.of(_placement.getBlockedDeltaTableDDL().getChangeIdColumnName(), _placement.getBlockedDeltaTableDDL().getBlockColumnName()),
ImmutableList.of(getChangeId(row), getBlock(row))))
.orderBy(asc(_placement.getBlockedDeltaTableDDL().getChangeIdColumnName()))
.setConsistencyLevel(_consistency);
return AdaptiveResultSet.executeAdaptiveQuery(_placement.getKeyspace().getCqlSession(), statement, _driverConfig.getSingleRowFetchSize());
}
}
/**
* Reads columns from the delta or delta history table. The range of columns, order, and limit can be
* parameterized.
*/
private ResultSet columnScan(DeltaPlacement placement, TableDDL tableDDL, ByteBuffer rowKey, Range<RangeTimeUUID> columnRange,
boolean ascending, ConsistencyLevel consistency) {
Select.Where where = (tableDDL == placement.getBlockedDeltaTableDDL() ? selectDeltaFrom(placement.getBlockedDeltaTableDDL()) : selectFrom(tableDDL))
.where(eq(tableDDL.getRowKeyColumnName(), rowKey));
if (columnRange.hasLowerBound()) {
if (columnRange.lowerBoundType() == BoundType.CLOSED) {
where = where.and(gte(tableDDL.getChangeIdColumnName(), columnRange.lowerEndpoint().getUuid()));
} else {
where = where.and(gt(tableDDL.getChangeIdColumnName(), columnRange.lowerEndpoint().getUuid()));
}
}
if (columnRange.hasUpperBound()) {
if (columnRange.upperBoundType() == BoundType.CLOSED) {
where = where.and(lte(tableDDL.getChangeIdColumnName(), columnRange.upperEndpoint().getUuid()));
} else {
where = where.and(lt(tableDDL.getChangeIdColumnName(), columnRange.upperEndpoint().getUuid()));
}
}
Statement statement = where
.orderBy(ascending ? asc(tableDDL.getChangeIdColumnName()) : desc(tableDDL.getChangeIdColumnName()))
.setConsistencyLevel(consistency);
return AdaptiveResultSet.executeAdaptiveQuery(placement.getKeyspace().getCqlSession(), statement, _driverConfig.getSingleRowFetchSize());
}
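// Illustratively, a scan with a closed lower bound and an open upper bound over the delta table corresponds
// roughly to the following CQL (placeholder names; the actual names come from the table DDL):
//
//   SELECT rowkey, changeid, value, block FROM <delta_table>
//   WHERE rowkey = <rowKey> AND changeid >= <lower> AND changeid < <upper>
//   ORDER BY changeid ASC;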
@Override
public Iterator<Change> readTimeline(Key key, boolean includeContentData, UUID start, UUID end,
boolean reversed, long limit, ReadConsistency readConsistency) {
requireNonNull(key, "key");
checkArgument(limit > 0, "Limit must be >0");
requireNonNull(readConsistency, "consistency");
// Even though the API allows for a long limit, CQL only supports integer values. Anything longer than MAX_INT
// is impractical given that a single Cassandra record must practically hold less than 2G rows since a wide row
// cannot be larger than 2G bytes.
int scaledLimit = (int) Math.min(Integer.MAX_VALUE, limit);
AstyanaxTable table = (AstyanaxTable) key.getTable();
AstyanaxStorage storage = table.getReadStorage();
DeltaPlacement placement = (DeltaPlacement) storage.getPlacement();
ByteBuffer rowKey = storage.getRowKey(key.getKey());
Range<RangeTimeUUID> columnRange = toRange(start, end, reversed);
ConsistencyLevel consistency = SorConsistencies.toCql(readConsistency);
// Read Delta and Compaction objects
Iterator<Change> deltas = Collections.emptyIterator();
if (includeContentData) {
TableDDL deltaDDL = placement.getBlockedDeltaTableDDL();
ProtocolVersion protocolVersion = placement.getKeyspace().getCqlSession().getCluster().getConfiguration().getProtocolOptions().getProtocolVersion();
CodecRegistry codecRegistry = placement.getKeyspace().getCqlSession().getCluster().getConfiguration().getCodecRegistry();
deltas = decodeDeltaColumns(Iterators.limit(new CqlDeltaIterator(columnScan(placement, deltaDDL, rowKey, columnRange, !reversed, consistency).iterator(), BLOCK_RESULT_SET_COLUMN, CHANGE_ID_RESULT_SET_COLUMN, VALUE_RESULT_SET_COLUMN, reversed, _deltaPrefixLength, protocolVersion, codecRegistry, ByteBufferUtil.bytesToHex(rowKey)), scaledLimit));
}
// Read History objects
Iterator<Change> deltaHistory = Collections.emptyIterator();
TableDDL deltaHistoryDDL = placement.getDeltaHistoryTableDDL();
deltaHistory = decodeColumns(Iterators.limit(columnScan(placement, deltaHistoryDDL, rowKey, columnRange, !reversed, consistency).iterator(), scaledLimit));
return touch(MergeIterator.merge(deltas, deltaHistory, reversed));
}
@Override
public Iterator<Change> getExistingHistories(Key key, UUID start, UUID end, ReadConsistency readConsistency) {
AstyanaxTable table = (AstyanaxTable) key.getTable();
AstyanaxStorage storage = table.getReadStorage();
ByteBuffer rowKey = storage.getRowKey(key.getKey());
DeltaPlacement placement = (DeltaPlacement) storage.getPlacement();
Range<RangeTimeUUID> columnRange = toRange(start, end, true);
ConsistencyLevel consistency = SorConsistencies.toCql(readConsistency);
TableDDL deltaHistoryDDL = placement.getDeltaHistoryTableDDL();
return decodeColumns(columnScan(placement, deltaHistoryDDL, rowKey, columnRange, false, consistency).iterator());
}
/**
* Transforms the provided Row iterator into a {@link Change} iterator.
*/
private Iterator<Change> decodeColumns(Iterator<Row> iter) {
return Iterators.transform(iter, row -> _changeEncoder.decodeChange(getChangeId(row), getValue(row)));
}
private Iterator<Change> decodeDeltaColumns(Iterator<StitchedRow> iter) {
return Iterators.transform(iter, row -> _changeEncoder.decodeChange(getChangeId(row), _daoUtils.skipPrefix(getValue(row))));
}
/**
* Converts a pair of TimeUUID endpoints into a {@link Range} of {@link RangeTimeUUID}s. Both endpoints
* are considered closed; that is, they are included in the range.
*/
private Range<RangeTimeUUID> toRange(@Nullable UUID start, @Nullable UUID end, boolean reversed) {
// If the range is reversed then start and end will also be reversed and must therefore be swapped.
if (reversed) {
UUID tmp = start;
start = end;
end = tmp;
}
if (start == null) {
if (end == null) {
return Range.all();
} else {
return Range.atMost(new RangeTimeUUID(end));
}
} else if (end == null) {
return Range.atLeast(new RangeTimeUUID(start));
}
return Range.closed(new RangeTimeUUID(start), new RangeTimeUUID(end));
}
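// For example, toRange(a, b, false) yields the closed range [a, b], while toRange(a, b, true) yields [b, a]
// because the endpoints arrive in iteration order and are swapped back. A null endpoint leaves that side of
// the range unbounded, and toRange(null, null, ...) yields Range.all().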
/**
* {@link Range} requires a comparable type. This class thinly encapsulates a UUID and sorts it as a TimeUUID.
*/
private static class RangeTimeUUID implements Comparable<RangeTimeUUID> {
private final UUID _uuid;
private RangeTimeUUID(UUID uuid) {
_uuid = uuid;
}
@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (!(o instanceof RangeTimeUUID)) {
return false;
}
return _uuid.equals(((RangeTimeUUID) o)._uuid);
}
@Override
public int hashCode() {
return _uuid.hashCode();
}
@Override
public int compareTo(RangeTimeUUID o) {
return TimeUUIDs.compare(_uuid, o._uuid);
}
private UUID getUuid() {
return _uuid;
}
}
/**
* Helper method to return a record with no rows.
*/
private Record emptyRecord(Key key) {
return new RecordImpl(key,
Collections.emptyIterator(),
Collections.emptyIterator(),
Collections.emptyIterator());
}
@VisibleForTesting
public static Iterable<Row> getFilteredRows(Iterable<Row> rows, Instant cutoffTime) {
if (cutoffTime == null) {
return rows;
}
return () -> Iterators.filter(rows.iterator(), row -> (TimeUUIDs.getTimeMillis(row.getUUID(CHANGE_ID_RESULT_SET_COLUMN)) < cutoffTime.toEpochMilli()));
}
// The following methods rely on the Cassandra thrift call describe_splits_ex() to split
// a token range into portions of approximately equal size. There is currently no equivalent client-side
// support for this call using CQL. Therefore they must always defer to the Astyanax implementation.
@Override
public List<String> getSplits(Table table, int recordsPerSplit, int localResplits) throws TimeoutException {
return _astyanaxReaderDAO.getSplits(table, recordsPerSplit, localResplits);
}
@Override
public ScanRangeSplits getScanRangeSplits(String placement, int desiredRecordsPerSplit, Optional<ScanRange> subrange) {
return _astyanaxReaderDAO.getScanRangeSplits(placement, desiredRecordsPerSplit, subrange);
}
@Override
public long count(Table table, ReadConsistency consistency) {
return _astyanaxReaderDAO.count(table, consistency);
}
@Override
public long count(Table table, @Nullable Integer limit, ReadConsistency consistency) {
return _astyanaxReaderDAO.count(table, limit, consistency);
}
@Override
public Stream<String> getKeysForStorage(AstyanaxStorage storage) {
// Loop over all the range prefixes (2^shardsLog2 of them) and, for each, execute Cassandra queries to
// page through the rowkeys with that prefix.
final DeltaPlacement placement = (DeltaPlacement) storage.getPlacement();
BlockedDeltaTableDDL tableDDL = placement.getBlockedDeltaTableDDL();
return StreamSupport.stream(Spliterators.spliteratorUnknownSize(storage.scanIterator(null), 0), false)
.map(keyRange ->
QueryBuilder.select()
.distinct()
.column(tableDDL.getRowKeyColumnName())
.from(tableDDL.getTableMetadata())
.where(gt(token(tableDDL.getRowKeyColumnName()), keyRange.getStart()))
.and(lte(token(tableDDL.getRowKeyColumnName()), keyRange.getEnd()))
.setConsistencyLevel(ConsistencyLevel.ALL))
.flatMap(statement ->
StreamSupport.stream(
Spliterators.spliteratorUnknownSize(
deltaQuery(placement, statement, false, "Failed to scan keys for storage %s", storage.toString()),
0),
false))
.map(this::getRawKeyFromRowGroup)
.map(AstyanaxStorage::getContentKey);
}
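// As with rowScan(), each statement above is roughly "SELECT DISTINCT rowkey FROM <delta_table>
// WHERE token(rowkey) > <start> AND token(rowkey) <= <end>" (placeholder names), executed at
// ConsistencyLevel.ALL, presumably so that no keys are missed due to replica lag.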
}