com.bazaarvoice.emodb.sor.db.astyanax.CqlBlockedDataReaderDAO Maven / Gradle / Ivy
package com.bazaarvoice.emodb.sor.db.astyanax;
import com.bazaarvoice.emodb.common.api.impl.LimitCounter;
import com.bazaarvoice.emodb.common.cassandra.CqlDriverConfiguration;
import com.bazaarvoice.emodb.common.cassandra.cqldriver.AdaptiveResultSet;
import com.bazaarvoice.emodb.common.uuid.TimeUUIDs;
import com.bazaarvoice.emodb.sor.api.Change;
import com.bazaarvoice.emodb.sor.api.Compaction;
import com.bazaarvoice.emodb.sor.api.ReadConsistency;
import com.bazaarvoice.emodb.sor.api.UnknownTableException;
import com.bazaarvoice.emodb.sor.db.DAOUtils;
import com.bazaarvoice.emodb.sor.db.DataReaderDAO;
import com.bazaarvoice.emodb.sor.db.Key;
import com.bazaarvoice.emodb.sor.db.MultiTableScanOptions;
import com.bazaarvoice.emodb.sor.db.MultiTableScanResult;
import com.bazaarvoice.emodb.sor.db.Record;
import com.bazaarvoice.emodb.sor.db.RecordEntryRawMetadata;
import com.bazaarvoice.emodb.sor.db.ScanRange;
import com.bazaarvoice.emodb.sor.db.ScanRangeSplits;
import com.bazaarvoice.emodb.sor.db.cql.CachingRowGroupIterator;
import com.bazaarvoice.emodb.sor.db.cql.CqlForMultiGets;
import com.bazaarvoice.emodb.sor.db.cql.CqlForScans;
import com.bazaarvoice.emodb.sor.db.cql.CqlReaderDAODelegate;
import com.bazaarvoice.emodb.sor.db.cql.RowGroupResultSetIterator;
import com.bazaarvoice.emodb.sor.db.test.DeltaClusteringKey;
import com.bazaarvoice.emodb.table.db.DroppedTableException;
import com.bazaarvoice.emodb.table.db.Table;
import com.bazaarvoice.emodb.table.db.TableSet;
import com.bazaarvoice.emodb.table.db.astyanax.AstyanaxStorage;
import com.bazaarvoice.emodb.table.db.astyanax.AstyanaxTable;
import com.bazaarvoice.emodb.table.db.astyanax.Placement;
import com.bazaarvoice.emodb.table.db.astyanax.PlacementCache;
import com.bazaarvoice.emodb.table.db.eventregistry.StorageReaderDAO;
import com.codahale.metrics.Meter;
import com.codahale.metrics.MetricRegistry;
import com.codahale.metrics.Timer;
import com.codahale.metrics.annotation.Timed;
import com.datastax.driver.core.CodecRegistry;
import com.datastax.driver.core.ConsistencyLevel;
import com.datastax.driver.core.ProtocolVersion;
import com.datastax.driver.core.ResultSet;
import com.datastax.driver.core.Row;
import com.datastax.driver.core.Session;
import com.datastax.driver.core.Statement;
import com.datastax.driver.core.querybuilder.QueryBuilder;
import com.datastax.driver.core.querybuilder.Select;
import com.datastax.driver.core.utils.MoreFutures;
import org.apache.cassandra.utils.ByteBufferUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.annotation.Nullable;
import java.nio.ByteBuffer;
import java.time.Instant;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Spliterators;
import java.util.UUID;
import java.util.concurrent.TimeoutException;
import static com.datastax.driver.core.querybuilder.QueryBuilder.asc;
import static com.datastax.driver.core.querybuilder.QueryBuilder.desc;
import static com.datastax.driver.core.querybuilder.QueryBuilder.eq;
import static;
import static com.datastax.driver.core.querybuilder.QueryBuilder.gte;
import static;
import static;
import static com.datastax.driver.core.querybuilder.QueryBuilder.lte;
import static com.datastax.driver.core.querybuilder.QueryBuilder.token;
import static;
import static java.util.Objects.requireNonNull;
import static java.util.Optional.ofNullable;
// Delegates to AstyanaxReaderDAO for non-CQL stuff
// Once we transition fully, we will stop delegating to Astyanax
public class CqlBlockedDataReaderDAO implements DataReaderDAO, StorageReaderDAO {
// Logger for this DAO; used by doDeltaQuery to report failed CQL queries.
private final Logger _log = LoggerFactory.getLogger(CqlBlockedDataReaderDAO.class);
/**
 * Depending on the placement and type of data being queried (delta or delta history) the names of the
 * columns being queried can change. However, by querying the columns in a fixed, well-known order in each
 * {@link QueryBuilder#select()} the results can be efficiently read by position rather than name.
 */
// Positional indexes of the columns in the result sets produced by the select builders in this class
// (selectFrom / selectDeltaFrom annotate each column with the matching constant).
// Index of the row key column; read by getKey(Row).
private static final int ROW_KEY_RESULT_SET_COLUMN = 0;
// Index of the change ID column.
private static final int CHANGE_ID_RESULT_SET_COLUMN = 1;
// Index of the value column; read by getValue(Row).
private static final int VALUE_RESULT_SET_COLUMN = 2;
// Index of the block column; only selected by selectDeltaFrom and read by getBlock(Row).
private static final int BLOCK_RESULT_SET_COLUMN = 3;
// Delegate used for readAll/scan/getSplit/multiTableScan whenever the corresponding CQL flag below is off.
private final DataReaderDAO _astyanaxReaderDAO;
// Decodes raw column values into changes and compactions.
private final ChangeEncoder _changeEncoder;
// Resolves a placement name to its placement (see getPlacementCluster and multiTableScan).
private final PlacementCache _placementCache;
// Source of fetch sizes, prefetch limits, record cache sizes and the random-row batch size.
private final CqlDriverConfiguration _driverConfig;
// Meter registered under the "random-reads" metric name in the constructor.
private final Meter _randomReadMeter;
// Timer registered under the "readBatch" metric name; started for each batch in readBatch.
private final Timer _readBatchTimer;
// Provides skipPrefix(), applied to values before they are decoded as changes or compactions.
private final DAOUtils _daoUtils;
// Passed through to every CqlDeltaIterator created by this class.
private final int _deltaPrefixLength;
// Support AB testing of various uses of the CQL driver versus the older but (at this point) more vetted Astyanax driver.
// Both flags default to true (use CQL) and can be swapped at runtime through the setters, hence volatile.
private volatile Supplier _useCqlForMultiGets = Suppliers.ofInstance(true);
private volatile Supplier _useCqlForScans = Suppliers.ofInstance(true);
// Creates the DAO. Only the delegate is null-checked here; the other collaborators are stored exactly as given.
// Two metrics are registered on the supplied registry: a "random-reads" meter and a "readBatch" timer,
// both named through getMetricName.
public CqlBlockedDataReaderDAO(@CqlReaderDAODelegate DataReaderDAO delegate, PlacementCache placementCache,
CqlDriverConfiguration driverConfig, ChangeEncoder changeEncoder,
MetricRegistry metricRegistry, DAOUtils daoUtils, @PrefixLength int deltaPrefixLength) {
_astyanaxReaderDAO = requireNonNull(delegate, "delegate");
_placementCache = placementCache;
_driverConfig = driverConfig;
_changeEncoder = changeEncoder;
_randomReadMeter = metricRegistry.meter(getMetricName("random-reads"));
_readBatchTimer = metricRegistry.timer(getMetricName("readBatch"));
_deltaPrefixLength = deltaPrefixLength;
_daoUtils = daoUtils;
// Builds the full metric name for the given short name from the parts "bv.emodb.sor" and "CqlDataReaderDAO".
// NOTE(review): the return expression has lost its call target before the opening quote; presumably this was
// MetricRegistry.name(...) -- confirm against the upstream source before compiling.
private String getMetricName(String name) {
return"bv.emodb.sor", "CqlDataReaderDAO", name);
// Since AB testing of CQL driver is temporary until proven out don't change the constructor to support this feature.
// Inject the AB testing flags independently. This will make backing these settings out easier in the future.
// Replaces the supplier that readAll consults to choose between the CQL path and the Astyanax delegate.
// Throws NullPointerException if the supplier is null.
public void setUseCqlforMultiGets(@CqlForMultiGets Supplier useCqlForMultiGets) {
_useCqlForMultiGets = requireNonNull(useCqlForMultiGets, "useCqlForMultiGets");
// Replaces the supplier that scan, getSplit and multiTableScan consult to choose between the CQL path and
// the Astyanax delegate. Throws NullPointerException if the supplier is null.
public void setUseCqlforScans(@CqlForScans Supplier useCqlForScans) {
_useCqlForScans = requireNonNull(useCqlForScans, "useCqlForScans");
* This CQL based read method works for a row with 64 deltas of 3 MB each. The same read with the AstyanaxDataReaderDAO
* would give Thrift frame errors.
// Reads a single record. Resolves the key's table to its read storage, derives the placement and the
// Cassandra row key from that storage, then hands off to the private read overload.
// Both arguments must be non-null.
public Record read(Key key, ReadConsistency consistency) {
requireNonNull(key, "key");
requireNonNull(consistency, "consistency");
// The casts assume the table and placement implementations used by this DAO.
AstyanaxTable table = (AstyanaxTable) key.getTable();
AstyanaxStorage storage = table.getReadStorage();
DeltaPlacement placement = (DeltaPlacement) storage.getPlacement();
ByteBuffer rowKey = storage.getRowKey(key.getKey());
return read(key, rowKey, consistency, placement);
// Reads the records for a collection of keys. When the multi-get flag is off the call is forwarded unchanged
// to the Astyanax delegate; note that this check happens before the arguments are null-checked.
// Otherwise the keys are grouped by placement and each group is read lazily via readBatch.
public Iterator readAll(Collection keys, final ReadConsistency consistency) {
if (!_useCqlForMultiGets.get()) {
return _astyanaxReaderDAO.readAll(keys, consistency);
requireNonNull(keys, "keys");
requireNonNull(consistency, "consistency");
// Group the keys by placement. Each placement will result in a separate set of queries. Dedup keys.
// (A HashMultimap stores each placement/key pair at most once, which is what performs the dedup.)
Multimap placementMap = HashMultimap.create();
for (Key key : keys) {
AstyanaxTable table = (AstyanaxTable) key.getTable();
AstyanaxStorage storage = table.getReadStorage();
placementMap.put((DeltaPlacement) storage.getPlacement(), key);
// Return an iterator that will loop over the placements and perform a query for each placement and
// return the resulting decoded rows.
return touch(Iterators.concat(Iterators.transform(placementMap.asMap().entrySet().iterator(),
entry -> readBatch(entry.getKey(), entry.getValue(), consistency))));
// Returns the cluster name of the keyspace backing the named placement.
// The placement is looked up in the placement cache; placementName must be non-null.
public String getPlacementCluster(String placementName) {
requireNonNull(placementName, "placement");
DeltaPlacement placement = (DeltaPlacement) _placementCache.get(placementName);
return placement.getKeyspace().getClusterName();
// Reads one record by its raw row key: selects the delta columns for that row key from the placement's
// blocked delta table, runs the query synchronously as a single-row query, and builds a Record from the
// first row group returned (or from an empty list when the query yields no groups).
private Record read(Key key, ByteBuffer rowKey, ReadConsistency consistency, DeltaPlacement placement) {
requireNonNull(key, "key");
requireNonNull(consistency, "consistency");
BlockedDeltaTableDDL tableDDL = placement.getBlockedDeltaTableDDL();
// NOTE(review): the statement chain below is not terminated in this listing; trailing calls (presumably
// the consistency level) and the metric update announced by "Track metrics" appear to be missing.
Statement statement = selectDeltaFrom(tableDDL)
.where(eq(tableDDL.getRowKeyColumnName(), rowKey))
// Track metrics
Iterator> groupedRows = deltaQuery(placement, statement, true, "Failed to read record %s", key);
Iterable rows;
if (groupedRows.hasNext()) {
// NOTE(review): the right-hand side is missing here; presumably the next row group is taken from
// groupedRows -- confirm against the upstream source.
rows =;
} else {
rows = ImmutableList.of();
// Convert the results into a Record object, lazily fetching the rest of the columns as necessary.
return newRecordFromCql(key, rows, placement, ByteBufferUtil.bytesToHex(rowKey));
* Synchronously executes the provided statement. The statement must query the delta table as returned from
* {@link com.bazaarvoice.emodb.sor.db.astyanax.DeltaPlacement#getBlockedDeltaTableDDL()}
// Synchronous variant: forwards to doDeltaQuery with async=false.
// errorContext is a format string that is combined with errorContextArgs when a failure is logged.
private Iterator> deltaQuery(DeltaPlacement placement, Statement statement, boolean singleRow,
String errorContext, Object... errorContextArgs) {
return doDeltaQuery(placement, statement, singleRow, false, errorContext, errorContextArgs);
* Asynchronously executes the provided statement. Although the iterator is returned immediately the actual results
* may still be loading in the background. The statement must query the delta table as returned from
* {@link com.bazaarvoice.emodb.sor.db.astyanax.DeltaPlacement#getBlockedDeltaTableDDL()}
// Asynchronous variant: forwards to doDeltaQuery with async=true.
// errorContext is a format string that is combined with errorContextArgs when a failure is logged.
private Iterator> deltaQueryAsync(DeltaPlacement placement, Statement statement, boolean singleRow,
String errorContext, Object... errorContextArgs) {
return doDeltaQuery(placement, statement, singleRow, true, errorContext, errorContextArgs);
// Executes a delta-table statement and returns its rows grouped by row key.
//   singleRow - selects the single-row vs. multi-row fetch size and prefetch limit from the driver config
//   async     - when true the query is issued through executeAdaptiveQueryAsync and the iterator is built
//               around the future; when false the query is executed before the iterator is built
// Failures are logged with String.format(errorContext, errorContextArgs). On the synchronous path the
// throwable is rethrown after logging; on the asynchronous path it is only logged from the callback.
private Iterator> doDeltaQuery(DeltaPlacement placement, Statement statement, boolean singleRow, boolean async,
String errorContext, Object... errorContextArgs) {
// Set the fetch size and prefetch limits depending on whether the query is for a single row or multiple rows.
int fetchSize = singleRow ? _driverConfig.getSingleRowFetchSize() : _driverConfig.getMultiRowFetchSize();
int prefetchLimit = singleRow ? _driverConfig.getSingleRowPrefetchLimit() : _driverConfig.getMultiRowPrefetchLimit();
Session session = placement.getKeyspace().getCqlSession();
DeltaRowGroupResultSetIterator deltaRowGroupResultSetIterator;
if (async) {
ListenableFuture resultSetFuture = AdaptiveResultSet.executeAdaptiveQueryAsync(session, statement, fetchSize);
deltaRowGroupResultSetIterator = new DeltaRowGroupResultSetIterator(
resultSetFuture, prefetchLimit, placement, statement.getConsistencyLevel());
// Log-only callback: it observes the failure but does not propagate it from here.
Futures.addCallback(resultSetFuture, new MoreFutures.FailureCallback() {
public void onFailure(Throwable t) {
_log.error(String.format(errorContext, errorContextArgs), t);
} else {
try {
ResultSet resultSet = AdaptiveResultSet.executeAdaptiveQuery(session, statement, fetchSize);
deltaRowGroupResultSetIterator = new DeltaRowGroupResultSetIterator(
resultSet, prefetchLimit, placement, statement.getConsistencyLevel());
} catch (Throwable t) {
_log.error(String.format(errorContext, errorContextArgs), t);
throw t;
// Wrap the grouping iterator with the configured record cache and soft-reference cache sizes.
return new CachingRowGroupIterator(deltaRowGroupResultSetIterator, _driverConfig.getRecordCacheSize(), _driverConfig.getRecordSoftCacheSize());
* Creates a Record instance for a given key and list of rows. All rows must be from the same Cassandra row;
* in other words, it is expected that row.getBytesUnsafe(ROW_KEY_RESULT_SET_COLUMN) returns the same value for
* each row in rows.
// Builds a RecordImpl for the key from the rows of one row group.
// rows.iterator() is called three times, once each for the change, compaction and raw-metadata views, so
// the Iterable must support being iterated more than once. rowKey is the hex form of the raw row key and
// is only passed on to CqlDeltaIterator.
private Record newRecordFromCql(Key key, Iterable rows, Placement placement, String rowKey) {
Session session = placement.getKeyspace().getCqlSession();
// Protocol version and codec registry are taken from the session's cluster configuration and handed to
// each CqlDeltaIterator.
ProtocolVersion protocolVersion = session.getCluster().getConfiguration().getProtocolOptions().getProtocolVersion();
CodecRegistry codecRegistry = session.getCluster().getConfiguration().getCodecRegistry();
Iterator> changeIter = decodeChangesFromCql(new CqlDeltaIterator(rows.iterator(), BLOCK_RESULT_SET_COLUMN, CHANGE_ID_RESULT_SET_COLUMN, VALUE_RESULT_SET_COLUMN, false, _deltaPrefixLength, protocolVersion, codecRegistry, rowKey));
Iterator> compactionIter = decodeCompactionsFromCql(new CqlDeltaIterator(rows.iterator(), BLOCK_RESULT_SET_COLUMN, CHANGE_ID_RESULT_SET_COLUMN, VALUE_RESULT_SET_COLUMN, false, _deltaPrefixLength, protocolVersion, codecRegistry, rowKey));
Iterator rawMetadataIter = rawMetadataFromCql(new CqlDeltaIterator(rows.iterator(), BLOCK_RESULT_SET_COLUMN, CHANGE_ID_RESULT_SET_COLUMN, VALUE_RESULT_SET_COLUMN, false, _deltaPrefixLength, protocolVersion, codecRegistry, rowKey));
return new RecordImpl(key, compactionIter, changeIter, rawMetadataIter);
* Converts a list of rows into Change instances.
// Lazily maps each row to an entry whose key is a DeltaClusteringKey (change ID + number of blocks) and
// whose value is the change decoded from the row's value after skipPrefix() has been applied.
private Iterator> decodeChangesFromCql(final Iterator iter) {
return Iterators.transform(iter, row ->
Maps.immutableEntry(new DeltaClusteringKey(getChangeId(row), row.getNumBlocks()), _changeEncoder.decodeChange(getChangeId(row), _daoUtils.skipPrefix(getValue(row)))));
* Like {@link #decodeChangesFromCql(java.util.Iterator)} except filtered to only include compactions.
// Lazily yields one entry (DeltaClusteringKey -> Compaction) for every row whose value decodes to a
// compaction; rows for which decodeCompaction returns null are skipped.
private Iterator> decodeCompactionsFromCql(final Iterator iter) {
return new AbstractIterator>() {
protected Map.Entry computeNext() {
while (iter.hasNext()) {
// NOTE(review): the right-hand side is missing in this listing; presumably the next element of iter.
StitchedRow row =;
Compaction compaction = _changeEncoder.decodeCompaction(_daoUtils.skipPrefix(getValue(row)));
if (compaction != null) {
return Maps.immutableEntry(new DeltaClusteringKey(getChangeId(row),row.getNumBlocks()), compaction);
// Source iterator exhausted without finding another compaction.
return endOfData();
* Converts the rows from the provided iterator into raw metadata.
// Lazily maps each row to a RecordEntryRawMetadata instance.
// NOTE(review): the expression is cut off after the constructor call in this listing; whatever populated
// the metadata from the row is not visible here.
private Iterator rawMetadataFromCql(final Iterator iter) {
return Iterators.transform(iter, row -> new RecordEntryRawMetadata()
* Read a batch of keys that all belong to the same placement (ColumnFamily).
// Reads the records for keys that all live in the given placement. The keys are converted to raw row keys,
// sorted, partitioned into batches of at most getMaxRandomRowsBatchSize() entries, and each batch is
// queried lazily (and timed with _readBatchTimer) as the returned iterator is consumed.
private Iterator readBatch(final DeltaPlacement placement, final Collection keys, final ReadConsistency consistency) {
requireNonNull(keys, "keys");
// Convert the keys to ByteBuffer Cassandra row keys
List> rowKeys = Lists.newArrayListWithCapacity(keys.size());
for (Key key : keys) {
AstyanaxTable table = (AstyanaxTable) key.getTable();
AstyanaxStorage storage = table.getReadStorage();
// Keep the original Key next to its raw row key so results can be mapped back later.
rowKeys.add(Maps.immutableEntry(storage.getRowKey(key.getKey()), key));
// Sort the keys by their byte array encoding to get some locality w/queries.
Collections.sort(rowKeys, Ordering.natural().onResultOf(entry -> entry.getKey()));
// Group them into batches. Cassandra may have to seek each row so prefer smaller batches.
List>> batches = Lists.partition(rowKeys, _driverConfig.getMaxRandomRowsBatchSize());
// This algorithm is arranged such that rows are returned in pages with size _fetchSize. The rows are grouped
// into row groups by common row key. The first RECORD_CACHE_SIZE rows are cached for the row group
// and any remaining rows are cached using soft references. This places an upper bound on the memory
// requirements needed while iterating. If at any time a soft reference is lost C* is re-queried to
// fetch the missing columns.
return Iterators.concat(Iterators.transform(batches.iterator(),
rowKeySubset -> {
Timer.Context timerCtx = _readBatchTimer.time();
try {
return rowQuery(rowKeySubset, consistency, placement);
// NOTE(review): the body of the finally block is missing in this listing; presumably it stops timerCtx.
} finally {
* Returns an iterator for the Records keyed by the provided row keys. An empty record is returned for any
* key which does not have a corresponding row in C*.
// Queries one batch of row keys asynchronously and returns a Record per key: first the records for the row
// groups Cassandra returned, then an empty record for every requested key that produced no row group.
// rawKeyMap tracks the keys still unaccounted for; entries are removed as their row groups are consumed.
private Iterator rowQuery(final List> rowKeys, final ReadConsistency consistency,
final DeltaPlacement placement) {
// NOTE(review): nothing visible in this listing adds elements to `keys` before it is used in the IN
// clause below; a keys.add(...) inside the loop appears to have been lost -- confirm upstream.
List keys = Lists.newArrayListWithCapacity(rowKeys.size());
final Map rawKeyMap = Maps.newHashMap();
for (Map.Entry entry : rowKeys) {
rawKeyMap.put(entry.getKey(), entry.getValue());
BlockedDeltaTableDDL tableDDL = placement.getBlockedDeltaTableDDL();
// NOTE(review): the statement chain is not terminated in this listing; trailing calls are missing.
Statement statement = selectDeltaFrom(tableDDL)
.where(in(tableDDL.getRowKeyColumnName(), keys))
Iterator> rowGroups = deltaQueryAsync(placement, statement, false, "Failed to read records %s", rawKeyMap.values());
return Iterators.concat(
// First iterator reads the row groups found and transforms them to Records
Iterators.transform(rowGroups, rows -> {
ByteBuffer keyBytes = getRawKeyFromRowGroup(rows);
// Removing the entry marks this key as found so the second iterator will not emit it again.
Key key = rawKeyMap.remove(keyBytes);
assert key != null : "Query returned row with a key out of bound";
return newRecordFromCql(key, rows, placement, ByteBufferUtil.bytesToHex(keyBytes));
// Second iterator returns an empty Record for each key queried but not found.
new AbstractIterator() {
private Iterator _nonExistentKeyIterator;
protected Record computeNext() {
// Lazily return an empty record for each key not found in the previous iterator.
// rawKeyMap.iterator() must not be called until the first iterator is completely spent.
if (_nonExistentKeyIterator == null) {
_nonExistentKeyIterator = rawKeyMap.values().iterator();
// NOTE(review): both branches of this conditional are truncated in this listing.
return _nonExistentKeyIterator.hasNext() ?
emptyRecord( :
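/**
 * Returns a select statement builder for a {@link TableDDL} with the columns ordered in the order set by
 * {@link #ROW_KEY_RESULT_SET_COLUMN}, {@link #CHANGE_ID_RESULT_SET_COLUMN} and {@link #VALUE_RESULT_SET_COLUMN}.
 */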
// Builds a select over the row key, change ID and value columns of the table, in that fixed order, so
// that results can be read by the positional *_RESULT_SET_COLUMN constants.
// NOTE(review): the start of the builder chain (the select call) and its end (presumably the from clause)
// are missing in this listing.
private Select selectFrom(TableDDL tableDDL) {
.column(tableDDL.getRowKeyColumnName()) // ROW_KEY_RESULT_SET_COLUMN
.column(tableDDL.getChangeIdColumnName()) // CHANGE_ID_RESULT_SET_COLUMN
.column(tableDDL.getValueColumnName()) // VALUE_RESULT_SET_COLUMN
// Same column order as selectFrom, plus the block column as the fourth column (BLOCK_RESULT_SET_COLUMN).
// NOTE(review): the start of the builder chain (the select call) and its end (presumably the from clause)
// are missing in this listing.
private Select selectDeltaFrom(BlockedDeltaTableDDL tableDDL) {
.column(tableDDL.getRowKeyColumnName()) // ROW_KEY_RESULT_SET_COLUMN
.column(tableDDL.getChangeIdColumnName()) // CHANGE_ID_RESULT_SET_COLUMN
.column(tableDDL.getValueColumnName()) // VALUE_RESULT_SET_COLUMN
.column(tableDDL.getBlockColumnName()) // BLOCK_ID_RESULT_SET COLUMN
// Returns the raw row key bytes of the row, read by position without going through a codec.
private ByteBuffer getKey(Row row) {
return row.getBytesUnsafe(ROW_KEY_RESULT_SET_COLUMN);
// Returns the change ID of the row.
// NOTE(review): the method body is missing in this listing; presumably it reads the UUID at
// CHANGE_ID_RESULT_SET_COLUMN -- confirm against the upstream source.
private UUID getChangeId(Row row) {
// Returns the block number of the row, read by position from the block column.
private int getBlock(Row row) {
return row.getInt(BLOCK_RESULT_SET_COLUMN);
// Returns the raw value bytes of the row, read by position without going through a codec.
private ByteBuffer getValue(Row row) {
return row.getBytesUnsafe(VALUE_RESULT_SET_COLUMN);
* A few notes on this method:
* - All rows in the row group have the same key, so choosing the first row is safe.
* - The rowGroup will always contain at least one row.
* - The row group has at least the first row in hard cache, so iterating to the first row will never
* result in a new CQL query.
// Returns the raw row key of a row group, taken from the group's first row.
// An empty group is only caught by the assertion, i.e. only when assertions are enabled.
private ByteBuffer getRawKeyFromRowGroup(Iterable rowGroup) {
Iterator iter = rowGroup.iterator();
// Sanity check
assert iter.hasNext() : "Row group should never contain zero rows";
// NOTE(review): the argument to getKey is missing in this listing; presumably the first row from iter.
return getKey(;
/**
 * Similar to {@link #getRawKeyFromRowGroup(Iterable)} but for use where the row group may contain no rows,
 * in which case null is returned.
 */
// Returns the raw row key taken from the first row of the group, or null when the group has no rows.
private ByteBuffer getRawKeyFromRowGroupOrNull(Iterable filteredRowGroup) {
Iterator iter = filteredRowGroup.iterator();
// NOTE(review): the argument to getKey is missing in this listing; presumably the first row from iter.
return iter.hasNext() ? getKey( : null;
// Returns the same iterator instance that was passed in.
// NOTE(review): the comment below refers to calling hasNext(), but no such call is visible in this
// listing; presumably an iter.hasNext() statement preceded the return -- confirm upstream.
private Iterator touch(Iterator iter) {
// Could return a Guava PeekingIterator after "if (iter.hasNext()) iter.peek()", but simply calling hasNext()
// is sufficient for the iterator implementations used by this DAO class...
return iter;
// Scans all records of a table, optionally resuming after fromKeyExclusive. When the scan flag is off the
// call is forwarded unchanged to the Astyanax delegate (before any null checks). Otherwise the key ranges
// produced by the table's read storage are scanned one after another, lazily, via recordScan.
@Timed(name = "bv.emodb.sor.CqlDataReaderDAO.scan", absolute = true)
public Iterator scan(Table tbl, @Nullable String fromKeyExclusive, final LimitCounter ignore_limit,
final ReadConsistency consistency) {
// Note: The LimitCounter is passed in as an artifact of Astyanax batching and was used as a mechanism to
// control paging. The CQL driver natively performs this functionality so it is not used here. The caller
// will apply limit boundaries on the results from this method.
if (!_useCqlForScans.get()) {
return _astyanaxReaderDAO.scan(tbl, fromKeyExclusive, ignore_limit, consistency);
requireNonNull(tbl, "table");
requireNonNull(consistency, "consistency");
final AstyanaxTable table = (AstyanaxTable) tbl;
AstyanaxStorage storage = table.getReadStorage();
final DeltaPlacement placement = (DeltaPlacement) storage.getPlacement();
// Loop over all the range prefixes (2^shardsLog2 of them) and, for each, execute Cassandra queries to
// page through the records with that prefix.
final Iterator scanIter = storage.scanIterator(fromKeyExclusive);
return touch(Iterators.concat(new AbstractIterator>() {
protected Iterator computeNext() {
if (scanIter.hasNext()) {
// NOTE(review): the right-hand side is missing in this listing; presumably the next range from scanIter.
ByteBufferRange keyRange =;
return recordScan(placement, table, keyRange, consistency);
return endOfData();
// Returns the records in one split of a table, optionally resuming after fromKeyExclusive. When the scan
// flag is off the call is forwarded unchanged to the Astyanax delegate (before any null checks). Otherwise
// the split ID is decoded to a range, the storage owning that range is chosen via getStorageForSplit, and
// the resulting key range is scanned with recordScan.
public Iterator getSplit(Table tbl, String split, @Nullable String fromKeyExclusive, LimitCounter ignore_limit,
ReadConsistency consistency) {
// Note: The LimitCounter is passed in as an artifact of Astyanax batching and was used as a mechanism to
// control paging. The CQL driver natively performs this functionality so it is not used here. The caller
// will apply limit boundaries on the results from this method.
if (!_useCqlForScans.get()) {
return _astyanaxReaderDAO.getSplit(tbl, split, fromKeyExclusive, ignore_limit, consistency);
requireNonNull(tbl, "table");
requireNonNull(split, "split");
requireNonNull(consistency, "consistency");
ByteBufferRange splitRange = SplitFormat.decode(split);
AstyanaxTable table = (AstyanaxTable) tbl;
AstyanaxStorage storage = getStorageForSplit(table, splitRange);
DeltaPlacement placement = (DeltaPlacement) storage.getPlacement();
ByteBufferRange keyRange = storage.getSplitRange(splitRange, fromKeyExclusive, split);
// The fromKeyExclusive might be equal to the end token of the split. If so, there's nothing to return.
if (keyRange.getStart().equals(keyRange.getEnd())) {
return Collections.emptyIterator();
return recordScan(placement, table, keyRange, consistency);
* Scans a range of keys and returns an iterator containing each row's columns as an iterable.
// Asynchronously scans the token range (start, end] of the placement's blocked delta table and returns
// the rows grouped by row key. table is used only for the error message and may be null when the scan
// spans multiple tables. Throws IllegalArgumentException unless start compares strictly below end.
private Iterator> rowScan(DeltaPlacement placement, @Nullable AstyanaxTable table, ByteBufferRange keyRange,
ReadConsistency consistency) {
ByteBuffer startToken = keyRange.getStart();
ByteBuffer endToken = keyRange.getEnd();
// Note: if Cassandra is asked to perform a token range query where start >= end it will wrap
// around which is absolutely *not* what we want.
checkArgument(AstyanaxStorage.compareKeys(startToken, endToken) < 0, "Cannot scan rows which loop from maximum- to minimum-token");
BlockedDeltaTableDDL tableDDL = placement.getBlockedDeltaTableDDL();
// Start is exclusive (gt) and end is inclusive (lte).
// NOTE(review): the statement chain is not terminated in this listing; trailing calls are missing.
Statement statement = selectDeltaFrom(tableDDL)
.where(gt(token(tableDDL.getRowKeyColumnName()), startToken))
.and(lte(token(tableDDL.getRowKeyColumnName()), endToken))
return deltaQueryAsync(placement, statement, false, "Failed to scan token range [%s, %s] for %s",
ByteBufferUtil.bytesToHex(startToken), ByteBufferUtil.bytesToHex(endToken),
table != null ? table : "multiple tables");
/**
 * Similar to {@link #rowScan(DeltaPlacement, AstyanaxTable, ByteBufferRange, ReadConsistency)}
 * except this method converts each C* row into a Record.
 */
// Scans the key range with rowScan and decodes every resulting row group into a Record of the given table.
private Iterator recordScan(DeltaPlacement placement, AstyanaxTable table, ByteBufferRange keyRange,
ReadConsistency consistency) {
Iterator> rowGroups = rowScan(placement, table, keyRange, consistency);
return decodeRows(rowGroups, table, placement);
* Converts rows from a single C* row to a Record.
// Lazily converts each row group into a Record: the content key is extracted from the group's raw row
// key, and the raw key's hex form is passed along to newRecordFromCql.
private Iterator decodeRows(Iterator> rowGroups, final AstyanaxTable table, Placement placement) {
return Iterators.transform(rowGroups, rowGroup -> {
String key = AstyanaxStorage.getContentKey(getRawKeyFromRowGroup(rowGroup));
return newRecordFromCql(new Key(table, key), rowGroup, placement, ByteBufferUtil.bytesToHex(getRawKeyFromRowGroupOrNull(rowGroup)));
// Scans a placement across all of its tables. When the scan flag is off the call is forwarded unchanged to
// the Astyanax delegate (before any null checks). Otherwise the query's scan range (the full range when
// none is given) is unwrapped into non-wrapping ranges and each is scanned with scanMultiTableRows.
public Iterator multiTableScan(final MultiTableScanOptions query, final TableSet tables,
final LimitCounter limit, final ReadConsistency consistency, @Nullable Instant cutoffTime) {
if (!_useCqlForScans.get()) {
return _astyanaxReaderDAO.multiTableScan(query, tables, limit, consistency, cutoffTime);
requireNonNull(query, "query");
String placementName = requireNonNull(query.getPlacement(), "placement");
final DeltaPlacement placement = (DeltaPlacement) _placementCache.get(placementName);
ScanRange scanRange = ofNullable(query.getScanRange()).orElse(ScanRange.all());
// Since the range may wrap from high to low end of the token range we need to unwrap it
List ranges = scanRange.unwrapped();
// NOTE(review): the end of this expression (presumably the conversion to an iterator) is missing in this listing.
return touch(FluentIterable.from(ranges)
.transformAndConcat(rowRange -> scanMultiTableRows(
tables, placement, rowRange.asByteBufferRange(), limit, query.isIncludeDeletedTables(),
query.isIncludeMirrorTables(), consistency, cutoffTime))
* Decodes rows returned by scanning across tables.
// Returns an Iterable whose iterator scans rowRange in the placement and yields one MultiTableScanResult
// per row group, subject to the supplied limit. The table for each row is resolved from the table UUID
// encoded in its row key and cached while consecutive rows belong to the same table. Dropped and mirror
// tables are skipped unless the corresponding include flag is set. When cutoffTime is non-null the rows of
// each group are first filtered with getFilteredRows.
private Iterable scanMultiTableRows(
final TableSet tables, final DeltaPlacement placement, final ByteBufferRange rowRange,
final LimitCounter limit, final boolean includeDroppedTables, final boolean includeMirrorTables,
final ReadConsistency consistency, final Instant cutoffTime) {
// Avoiding pinning multiple decoded rows into memory at once.
return () -> limit.limit(new AbstractIterator() {
private PeekingIterator> _iter = Iterators.peekingIterator(
rowScan(placement, null, rowRange, consistency));
// Cache of the most recently resolved table; -1 means no table has been resolved yet.
private long _lastTableUuid = -1;
private AstyanaxTable _table = null;
private boolean _droppedTable;
private boolean _primaryTable;
protected MultiTableScanResult computeNext() {
while (_iter.hasNext()) {
// Get the next rows from the grouping iterator. All rows in the returned Iterable
// are from the same Cassandra wide row (in other words, they share the same key).
// NOTE(review): the right-hand side is missing in this listing; presumably the next group from _iter.
final Iterable rows =;
// filter the rows if a cutOff time is specified.
Iterable filteredRows = rows;
if (cutoffTime != null) {
filteredRows = getFilteredRows(rows, cutoffTime);
// Convert the filteredRows into a Record object
ByteBuffer rowKey = getRawKeyFromRowGroupOrNull(filteredRows);
// rowKey can be null if "all" the rows of the cassandra record are after the cutoff time. In such case ignore that record and continue.
// NOTE(review): the body of this if (presumably a continue) is missing in this listing.
if (rowKey == null) {
long tableUuid = AstyanaxStorage.getTableUuid(rowKey);
// Only re-resolve the table when the UUID differs from the previous row's.
if (_lastTableUuid != tableUuid) {
_lastTableUuid = tableUuid;
try {
_table = (AstyanaxTable) tables.getByUuid(tableUuid);
} catch (UnknownTableException e) {
// Unknown and dropped tables are represented by a placeholder table created from the exception's info.
_table = AstyanaxTable.createUnknown(tableUuid, placement, e.getTable());
} catch (DroppedTableException e) {
_table = AstyanaxTable.createUnknown(tableUuid, placement, e.getPriorTable());
_droppedTable = _table.isUnknownTable();
_primaryTable = _table.getReadStorage().hasUUID(tableUuid);
// Skip dropped and mirror tables if configured
if ((!includeDroppedTables && _droppedTable) || (!includeMirrorTables && !_primaryTable)) {
_iter = skipToNextTable(tableUuid);
int shardId = AstyanaxStorage.getShardId(rowKey);
String key = AstyanaxStorage.getContentKey(rowKey);
Record record = newRecordFromCql(new Key(_table, key), filteredRows, placement, ByteBufferUtil.bytesToHex(rowKey));
return new MultiTableScanResult(rowKey, shardId, tableUuid, _droppedTable, record);
return endOfData();
// Advances past the remaining rows of the table identified by tableUuid and returns the iterator to
// continue with: the current one if a different table shows up within the next few row groups, a fresh
// range scan starting at the next possible table otherwise, or an empty iterator if nothing is left.
private PeekingIterator> skipToNextTable(long tableUuid) {
// Iterate over the next 10 row groups first to check for a table switch. This avoids starting a new range
// query if the number of rows in the undesired table is small.
int skipLimit = 10;
Iterable rowGroup = null;
while (skipLimit != 0 && _iter.hasNext()) {
rowGroup = _iter.peek();
ByteBuffer rawKey = getRawKeyFromRowGroup(rowGroup);
long nextTableUuid = AstyanaxStorage.getTableUuid(rawKey);
if (nextTableUuid != tableUuid) {
// This is the first row of a new table
return _iter;
// NOTE(review): the else body looks truncated in this listing; presumably it consumed the peeked
// row group from _iter before decrementing the counter.
} else {;
skipLimit -= 1;
if (_iter.hasNext()) {
// Skip the table entirely by starting a new query on the next possible table
assert rowGroup != null;
int shardId = AstyanaxStorage.getShardId(getRawKeyFromRowGroup(rowGroup));
// Smallest possible row key of the following table UUID within the same shard.
ByteBuffer nextPossibleTableStart = AstyanaxStorage.getRowKeyRaw(shardId, tableUuid + 1, "");
ByteBuffer end = rowRange.getEnd();
if (AstyanaxStorage.compareKeys(nextPossibleTableStart, end) < 0) {
// We haven't reached the last end boundary of the original range scan
ByteBufferRange updatedRange = new ByteBufferRangeImpl(nextPossibleTableStart, end, -1, false);
return Iterators.peekingIterator(rowScan(placement, null, updatedRange, consistency));
return Iterators.peekingIterator(Collections.emptyIterator());
// Picks the storage that a split range belongs to. If the table's read storage does not contain the start
// of the range, the first write storage that contains it and allows reads is returned; if none qualifies,
// the read storage is returned anyway.
private AstyanaxStorage getStorageForSplit(AstyanaxTable table, ByteBufferRange splitRange) {
// During a table move, after the internal copy is complete getSplits() will return split IDs that point to
// the new storage location (table.getReadStorage()) but must still support old split IDs from the old
// storage location for a while.
if (!table.getReadStorage().contains(splitRange.getStart())) {
for (AstyanaxStorage storage : table.getWriteStorage()) {
if (storage.contains(splitRange.getStart()) && storage.getReadsAllowed()) {
return storage;
return table.getReadStorage();
* Implementation of {@link RowGroupResultSetIterator} with implementations for reading from a delta table.
// Row-group iterator for the blocked delta table. Rows are grouped by their raw row key, and the rows that
// follow a given row of the same group can be re-queried from Cassandra on demand.
private class DeltaRowGroupResultSetIterator extends RowGroupResultSetIterator {
private final DeltaPlacement _placement;
// Stored from the constructor argument; not read by any code visible in this listing.
private final ConsistencyLevel _consistency;
// Constructor for a result set that is already available (synchronous query).
private DeltaRowGroupResultSetIterator(ResultSet resultSet, int prefetchLimit,
DeltaPlacement placement, ConsistencyLevel consistency) {
super(resultSet, prefetchLimit);
_placement = placement;
_consistency = consistency;
// Constructor for a result set that is still being fetched (asynchronous query).
private DeltaRowGroupResultSetIterator(ListenableFuture resultSetFuture, int prefetchLimit,
DeltaPlacement placement, ConsistencyLevel consistency) {
super(resultSetFuture, prefetchLimit);
_placement = placement;
_consistency = consistency;
// Rows are grouped by the raw row key of the enclosing DAO's result-set layout.
protected Object getKeyForRow(Row row) {
return CqlBlockedDataReaderDAO.this.getKey(row);
// Queries the rows of the same Cassandra row whose (change ID, block) pair is greater than that of the
// given row, using the single-row fetch size.
// NOTE(review): the statement chain is not terminated in this listing; trailing calls are missing.
protected ResultSet queryRowGroupRowsAfter(Row row) {
Statement statement = selectDeltaFrom(_placement.getBlockedDeltaTableDDL())
.where(eq(_placement.getBlockedDeltaTableDDL().getRowKeyColumnName(), getKey(row)))
.and(gt(ImmutableList.of(_placement.getBlockedDeltaTableDDL().getChangeIdColumnName(), _placement.getBlockedDeltaTableDDL().getBlockColumnName()),
ImmutableList.of(getChangeId(row), getBlock(row))))
return AdaptiveResultSet.executeAdaptiveQuery(_placement.getKeyspace().getCqlSession(), statement, _driverConfig.getSingleRowFetchSize());
* Reads columns from the delta or delta history table. The range of columns, order, and limit can be
* parameterized.
// Synchronously reads the columns of one row from the given table, restricted to columnRange on the change
// ID column and ordered by change ID (ascending or descending). The four-column delta select is used when
// tableDDL is the placement's blocked delta table; otherwise the three-column select is used.
private ResultSet columnScan(DeltaPlacement placement, TableDDL tableDDL, ByteBuffer rowKey, Range columnRange,
boolean ascending, ConsistencyLevel consistency) {
// Identity comparison: the caller is expected to pass the very DDL instance returned by the placement.
Select.Where where = (tableDDL == placement.getBlockedDeltaTableDDL() ? selectDeltaFrom(placement.getBlockedDeltaTableDDL()) : selectFrom(tableDDL))
.where(eq(tableDDL.getRowKeyColumnName(), rowKey));
// Translate the range's bounds into >=/> and <=/< predicates; a missing bound adds no predicate.
if (columnRange.hasLowerBound()) {
if (columnRange.lowerBoundType() == BoundType.CLOSED) {
where = where.and(gte(tableDDL.getChangeIdColumnName(), columnRange.lowerEndpoint().getUuid()));
} else {
where = where.and(gt(tableDDL.getChangeIdColumnName(), columnRange.lowerEndpoint().getUuid()));
if (columnRange.hasUpperBound()) {
if (columnRange.upperBoundType() == BoundType.CLOSED) {
where = where.and(lte(tableDDL.getChangeIdColumnName(), columnRange.upperEndpoint().getUuid()));
} else {
where = where.and(lt(tableDDL.getChangeIdColumnName(), columnRange.upperEndpoint().getUuid()));
// NOTE(review): the statement chain is not terminated in this listing; trailing calls (presumably
// applying the consistency argument) are missing.
Statement statement = where
.orderBy(ascending ? asc(tableDDL.getChangeIdColumnName()) : desc(tableDDL.getChangeIdColumnName()))
return AdaptiveResultSet.executeAdaptiveQuery(placement.getKeyspace().getCqlSession(), statement, _driverConfig.getSingleRowFetchSize());
// Returns the changes for a key between start and end, merging the delta table (only when
// includeContentData is true) with the delta history table. Each source is capped at the limit before the
// merge, and `reversed` selects descending order.
// Throws NullPointerException for a null key or consistency and IllegalArgumentException when limit <= 0.
public Iterator readTimeline(Key key, boolean includeContentData, UUID start, UUID end,
boolean reversed, long limit, ReadConsistency readConsistency) {
requireNonNull(key, "key");
checkArgument(limit > 0, "Limit must be >0");
requireNonNull(readConsistency, "consistency");
// Even though the API allows for a long limit CQL only supports integer values. Anything longer than MAX_INT
// is impractical given that a single Cassandra record must practically hold less than 2G rows since a wide row
// cannot be larger than 2G bytes.
int scaledLimit = (int) Math.min(Integer.MAX_VALUE, limit);
AstyanaxTable table = (AstyanaxTable) key.getTable();
AstyanaxStorage storage = table.getReadStorage();
DeltaPlacement placement = (DeltaPlacement) storage.getPlacement();
ByteBuffer rowKey = storage.getRowKey(key.getKey());
Range columnRange = toRange(start, end, reversed);
ConsistencyLevel consistency = SorConsistencies.toCql(readConsistency);
// Read Delta and Compaction objects
Iterator deltas = Collections.emptyIterator();
if (includeContentData) {
TableDDL deltaDDL = placement.getBlockedDeltaTableDDL();
ProtocolVersion protocolVersion = placement.getKeyspace().getCqlSession().getCluster().getConfiguration().getProtocolOptions().getProtocolVersion();
CodecRegistry codecRegistry = placement.getKeyspace().getCqlSession().getCluster().getConfiguration().getCodecRegistry();
// The limit is applied to the CqlDeltaIterator's output rather than to the raw result set rows.
deltas = decodeDeltaColumns(Iterators.limit(new CqlDeltaIterator(columnScan(placement, deltaDDL, rowKey, columnRange, !reversed, consistency).iterator(), BLOCK_RESULT_SET_COLUMN, CHANGE_ID_RESULT_SET_COLUMN, VALUE_RESULT_SET_COLUMN, reversed, _deltaPrefixLength, protocolVersion, codecRegistry, ByteBufferUtil.bytesToHex(rowKey)), scaledLimit));
// Read History objects
// The empty-iterator initial value is overwritten unconditionally two lines below.
Iterator deltaHistory = Collections.emptyIterator();
TableDDL deltaHistoryDDL = placement.getDeltaHistoryTableDDL();
deltaHistory = decodeColumns(Iterators.limit(columnScan(placement, deltaHistoryDDL, rowKey, columnRange, !reversed, consistency).iterator(), scaledLimit));
return touch(MergeIterator.merge(deltas, deltaHistory, reversed));
public Iterator getExistingHistories(Key key, UUID start, UUID end, ReadConsistency readConsistency) {
AstyanaxTable table = (AstyanaxTable) key.getTable();
AstyanaxStorage storage = table.getReadStorage();
ByteBuffer rowKey = storage.getRowKey(key.getKey());
DeltaPlacement placement = (DeltaPlacement) storage.getPlacement();
Range columnRange = toRange(start, end, true);
ConsistencyLevel consistency = SorConsistencies.toCql(readConsistency);
TableDDL deltaHistoryDDL = placement.getDeltaHistoryTableDDL();
return decodeColumns(columnScan(placement, deltaHistoryDDL, rowKey, columnRange, false, consistency).iterator());
* Transforms the provided Row iterator into a {@link Change} iterator.
private Iterator decodeColumns(Iterator iter) {
return Iterators.transform(iter, row -> _changeEncoder.decodeChange(getChangeId(row), getValue(row)));
private Iterator decodeDeltaColumns(Iterator iter) {
return Iterators.transform(iter, row -> _changeEncoder.decodeChange(getChangeId(row), _daoUtils.skipPrefix(getValue(row))));
* Converts a TimeUUID set of endpoints into a {@link Range}. of {@link RangeTimeUUID}s. Both end points
* are considered closed; that is, they are included in the range.
private Range toRange(@Nullable UUID start, @Nullable UUID end, boolean reversed) {
// If the range is reversed then start and end will also be reversed and must therefore be swapped.
if (reversed) {
UUID tmp = start;
start = end;
end = tmp;
if (start == null) {
if (end == null) {
return Range.all();
} else {
return Range.atMost(new RangeTimeUUID(end));
} else if (end == null) {
return Range.atLeast(new RangeTimeUUID(start));
return Range.closed(new RangeTimeUUID(start), new RangeTimeUUID(end));
* {@link Range} needs comparable type. This class thinly encapsulates a UUID and sorts as a TimeUUID.
private static class RangeTimeUUID implements Comparable {
private final UUID _uuid;
private RangeTimeUUID(UUID uuid) {
_uuid = uuid;
public boolean equals(Object o) {
if (this == o) {
return true;
if (!(o instanceof RangeTimeUUID)) {
return false;
return _uuid.equals(((RangeTimeUUID) o)._uuid);
public int hashCode() {
return _uuid.hashCode();
public int compareTo(RangeTimeUUID o) {
return, o._uuid);
private UUID getUuid() {
return _uuid;
* Helper method to return a record with no rows.
// Builds a Record for the given key by constructing a RecordImpl.
// NOTE(review): the RecordImpl argument list is cut off after "key," in this copy of the file, so the
// remaining constructor arguments cannot be confirmed from here - restore them from the original source.
private Record emptyRecord(Key key) {
return new RecordImpl(key,
public static Iterable getFilteredRows(Iterable rows, Instant cutoffTime) {
if (cutoffTime == null) {
return rows;
return () -> Iterators.filter(rows.iterator(), row -> (TimeUUIDs.getTimeMillis(row.getUUID(CHANGE_ID_RESULT_SET_COLUMN)) < cutoffTime.toEpochMilli()));
// The following methods rely on using the Cassandra thrift call describe_splits_ex() to split
// a token range into portions of approximately equal size.  There is currently no equivalent client-side
// support for this call using CQL.  Therefore they must always defer to the Astyanax implementation.
public List getSplits(Table table, int recordsPerSplit, int localResplits) throws TimeoutException {
return _astyanaxReaderDAO.getSplits(table, recordsPerSplit, localResplits);
public ScanRangeSplits getScanRangeSplits(String placement, int desiredRecordsPerSplit, Optional subrange) {
return _astyanaxReaderDAO.getScanRangeSplits(placement, desiredRecordsPerSplit, subrange);
public long count(Table table, ReadConsistency consistency) {
return _astyanaxReaderDAO.count(table, consistency);
public long count(Table table, @Nullable Integer limit, ReadConsistency consistency) {
return _astyanaxReaderDAO.count(table, limit, consistency);
public Stream getKeysForStorage(AstyanaxStorage storage) {
// Loop over all the range prefixes (2^shardsLog2 of them) and, for each, execute Cassandra queries to
// page through the rowkeys with that prefix.
final DeltaPlacement placement = (DeltaPlacement) storage.getPlacement();
BlockedDeltaTableDDL tableDDL = placement.getBlockedDeltaTableDDL();
return, 0), false)
.map(keyRange ->
.where(gt(token(tableDDL.getRowKeyColumnName()), keyRange.getStart()))
.and(lte(token(tableDDL.getRowKeyColumnName()), keyRange.getEnd()))
.flatMap(statement ->
deltaQuery(placement, statement, false, "Failed to scan keys for storage %s", storage.toString()),
© 2015 - 2025 Weber Informatics LLC | Privacy Policy