package com.bazaarvoice.emodb.sor.db.astyanax;
import com.bazaarvoice.emodb.common.api.impl.LimitCounter;
import com.bazaarvoice.emodb.common.cassandra.CqlDriverConfiguration;
import com.bazaarvoice.emodb.common.cassandra.cqldriver.AdaptiveResultSet;
import com.bazaarvoice.emodb.common.uuid.TimeUUIDs;
import com.bazaarvoice.emodb.sor.api.Change;
import com.bazaarvoice.emodb.sor.api.Compaction;
import com.bazaarvoice.emodb.sor.api.ReadConsistency;
import com.bazaarvoice.emodb.sor.api.UnknownTableException;
import com.bazaarvoice.emodb.sor.db.DAOUtils;
import com.bazaarvoice.emodb.sor.db.DataReaderDAO;
import com.bazaarvoice.emodb.sor.db.Key;
import com.bazaarvoice.emodb.sor.db.MultiTableScanOptions;
import com.bazaarvoice.emodb.sor.db.MultiTableScanResult;
import com.bazaarvoice.emodb.sor.db.Record;
import com.bazaarvoice.emodb.sor.db.RecordEntryRawMetadata;
import com.bazaarvoice.emodb.sor.db.ScanRange;
import com.bazaarvoice.emodb.sor.db.ScanRangeSplits;
import com.bazaarvoice.emodb.sor.db.cql.CachingRowGroupIterator;
import com.bazaarvoice.emodb.sor.db.cql.CqlForMultiGets;
import com.bazaarvoice.emodb.sor.db.cql.CqlForScans;
import com.bazaarvoice.emodb.sor.db.cql.CqlReaderDAODelegate;
import com.bazaarvoice.emodb.sor.db.cql.RowGroupResultSetIterator;
import com.bazaarvoice.emodb.sor.db.test.DeltaClusteringKey;
import com.bazaarvoice.emodb.table.db.DroppedTableException;
import com.bazaarvoice.emodb.table.db.Table;
import com.bazaarvoice.emodb.table.db.TableSet;
import com.bazaarvoice.emodb.table.db.astyanax.AstyanaxStorage;
import com.bazaarvoice.emodb.table.db.astyanax.AstyanaxTable;
import com.bazaarvoice.emodb.table.db.astyanax.Placement;
import com.bazaarvoice.emodb.table.db.astyanax.PlacementCache;
import com.bazaarvoice.emodb.table.db.eventregistry.StorageReaderDAO;
import com.codahale.metrics.Meter;
import com.codahale.metrics.MetricRegistry;
import com.codahale.metrics.Timer;
import com.codahale.metrics.annotation.Timed;
import com.datastax.driver.core.CodecRegistry;
import com.datastax.driver.core.ConsistencyLevel;
import com.datastax.driver.core.ProtocolVersion;
import com.datastax.driver.core.ResultSet;
import com.datastax.driver.core.Row;
import com.datastax.driver.core.Session;
import com.datastax.driver.core.Statement;
import com.datastax.driver.core.querybuilder.QueryBuilder;
import com.datastax.driver.core.querybuilder.Select;
import com.datastax.driver.core.utils.MoreFutures;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Supplier;
import com.google.common.base.Suppliers;
import com.google.common.collect.AbstractIterator;
import com.google.common.collect.BoundType;
import com.google.common.collect.FluentIterable;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterators;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimap;
import com.google.common.collect.Ordering;
import com.google.common.collect.PeekingIterator;
import com.google.common.collect.Range;
import com.google.common.util.concurrent.Futures;
import com.google.common.util.concurrent.ListenableFuture;
import com.google.inject.Inject;
import com.netflix.astyanax.model.ByteBufferRange;
import com.netflix.astyanax.util.ByteBufferRangeImpl;
import org.apache.cassandra.utils.ByteBufferUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.annotation.Nullable;
import java.nio.ByteBuffer;
import java.time.Instant;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Spliterators;
import java.util.UUID;
import java.util.concurrent.TimeoutException;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
import static com.datastax.driver.core.querybuilder.QueryBuilder.asc;
import static com.datastax.driver.core.querybuilder.QueryBuilder.desc;
import static com.datastax.driver.core.querybuilder.QueryBuilder.eq;
import static com.datastax.driver.core.querybuilder.QueryBuilder.gt;
import static com.datastax.driver.core.querybuilder.QueryBuilder.gte;
import static com.datastax.driver.core.querybuilder.QueryBuilder.in;
import static com.datastax.driver.core.querybuilder.QueryBuilder.lt;
import static com.datastax.driver.core.querybuilder.QueryBuilder.lte;
import static com.datastax.driver.core.querybuilder.QueryBuilder.token;
import static com.google.common.base.Preconditions.checkArgument;
import static java.util.Objects.requireNonNull;
import static java.util.Optional.ofNullable;
// Delegates to AstyanaxReaderDAO for non-CQL stuff
// Once we transition fully, we will stop delegating to Astyanax
public class CqlBlockedDataReaderDAO implements DataReaderDAO, StorageReaderDAO {
private final Logger _log = LoggerFactory.getLogger(CqlBlockedDataReaderDAO.class);
/**
* Depending on the placement and type of data being queried (delta or delta history) the names of the
* columns being queried can change. However, by querying the columns in a fixed, well-known order in each
* {@link QueryBuilder#select()} the results can be efficiently read by position rather than name.
*/
private static final int ROW_KEY_RESULT_SET_COLUMN = 0;
private static final int CHANGE_ID_RESULT_SET_COLUMN = 1;
private static final int VALUE_RESULT_SET_COLUMN = 2;
private static final int BLOCK_RESULT_SET_COLUMN = 3;
private final DataReaderDAO _astyanaxReaderDAO;
private final ChangeEncoder _changeEncoder;
private final PlacementCache _placementCache;
private final CqlDriverConfiguration _driverConfig;
private final Meter _randomReadMeter;
private final Timer _readBatchTimer;
private final DAOUtils _daoUtils;
private final int _deltaPrefixLength;
// Support AB testing of various uses of the CQL driver versus the older but (at this point) more vetted Astyanax driver.
private volatile Supplier<Boolean> _useCqlForMultiGets = Suppliers.ofInstance(true);
private volatile Supplier<Boolean> _useCqlForScans = Suppliers.ofInstance(true);
@Inject
public CqlBlockedDataReaderDAO(@CqlReaderDAODelegate DataReaderDAO delegate, PlacementCache placementCache,
CqlDriverConfiguration driverConfig, ChangeEncoder changeEncoder,
MetricRegistry metricRegistry, DAOUtils daoUtils, @PrefixLength int deltaPrefixLength) {
_astyanaxReaderDAO = requireNonNull(delegate, "delegate");
_placementCache = placementCache;
_driverConfig = driverConfig;
_changeEncoder = changeEncoder;
_randomReadMeter = metricRegistry.meter(getMetricName("random-reads"));
_readBatchTimer = metricRegistry.timer(getMetricName("readBatch"));
_deltaPrefixLength = deltaPrefixLength;
_daoUtils = daoUtils;
}
private String getMetricName(String name) {
return MetricRegistry.name("bv.emodb.sor", "CqlDataReaderDAO", name);
}
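// For example, getMetricName("random-reads") resolves to "bv.emodb.sor.CqlDataReaderDAO.random-reads",
// since MetricRegistry.name(...) simply joins its arguments with dots.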
// Since AB testing of CQL driver is temporary until proven out don't change the constructor to support this feature.
// Inject the AB testing flags independently. This will make backing these settings out easier in the future.
@Inject
public void setUseCqlforMultiGets(@CqlForMultiGets Supplier<Boolean> useCqlForMultiGets) {
_useCqlForMultiGets = requireNonNull(useCqlForMultiGets, "useCqlForMultiGets");
}
@Inject
public void setUseCqlforScans(@CqlForScans Supplier<Boolean> useCqlForScans) {
_useCqlForScans = requireNonNull(useCqlForScans, "useCqlForScans");
}
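// Illustrative sketch (not part of this file): the @CqlForMultiGets and @CqlForScans suppliers are expected
// to be bound by a Guice module elsewhere; the exact wiring below is an assumption, shown only for context:
//
//   bind(new TypeLiteral<Supplier<Boolean>>() {})
//       .annotatedWith(CqlForMultiGets.class)
//       .toInstance(Suppliers.ofInstance(true));
//   bind(new TypeLiteral<Supplier<Boolean>>() {})
//       .annotatedWith(CqlForScans.class)
//       .toInstance(Suppliers.ofInstance(true));
//
// Binding a dynamic Supplier instead of a constant allows the AB-test flags to be flipped at runtime.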
/**
* This CQL-based read method works for a row with 64 deltas of 3 MB each. The same read with the AstyanaxDataReaderDAO
* would give Thrift frame errors.
*/
@Override
public Record read(Key key, ReadConsistency consistency) {
requireNonNull(key, "key");
requireNonNull(consistency, "consistency");
AstyanaxTable table = (AstyanaxTable) key.getTable();
AstyanaxStorage storage = table.getReadStorage();
DeltaPlacement placement = (DeltaPlacement) storage.getPlacement();
ByteBuffer rowKey = storage.getRowKey(key.getKey());
return read(key, rowKey, consistency, placement);
}
@Override
public Iterator<Record> readAll(Collection<Key> keys, final ReadConsistency consistency) {
if (!_useCqlForMultiGets.get()) {
return _astyanaxReaderDAO.readAll(keys, consistency);
}
requireNonNull(keys, "keys");
requireNonNull(consistency, "consistency");
// Group the keys by placement. Each placement will result in a separate set of queries. Dedup keys.
Multimap<DeltaPlacement, Key> placementMap = HashMultimap.create();
for (Key key : keys) {
AstyanaxTable table = (AstyanaxTable) key.getTable();
AstyanaxStorage storage = table.getReadStorage();
placementMap.put((DeltaPlacement) storage.getPlacement(), key);
}
// Return an iterator that will loop over the placements and perform a query for each placement and
// return the resulting decoded rows.
return touch(Iterators.concat(Iterators.transform(placementMap.asMap().entrySet().iterator(),
entry -> readBatch(entry.getKey(), entry.getValue(), consistency))));
}
@Override
public String getPlacementCluster(String placementName) {
requireNonNull(placementName, "placement");
DeltaPlacement placement = (DeltaPlacement) _placementCache.get(placementName);
return placement.getKeyspace().getClusterName();
}
private Record read(Key key, ByteBuffer rowKey, ReadConsistency consistency, DeltaPlacement placement) {
requireNonNull(key, "key");
requireNonNull(consistency, "consistency");
BlockedDeltaTableDDL tableDDL = placement.getBlockedDeltaTableDDL();
Statement statement = selectDeltaFrom(tableDDL)
.where(eq(tableDDL.getRowKeyColumnName(), rowKey))
.setConsistencyLevel(SorConsistencies.toCql(consistency));
// Track metrics
_randomReadMeter.mark();
Iterator<Iterable<Row>> groupedRows = deltaQuery(placement, statement, true, "Failed to read record %s", key);
Iterable<Row> rows;
if (groupedRows.hasNext()) {
rows = groupedRows.next();
} else {
rows = ImmutableList.of();
}
// Convert the results into a Record object, lazily fetching the rest of the columns as necessary.
return newRecordFromCql(key, rows, placement, ByteBufferUtil.bytesToHex(rowKey));
}
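// For reference, the statement built above corresponds roughly to the following CQL; the actual table and
// column names come from the placement's BlockedDeltaTableDDL and are placeholders here:
//
//   SELECT rowkey, changeid, value, block FROM <delta_table> WHERE rowkey = <rowKey>;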
/**
* Synchronously executes the provided statement. The statement must query the delta table as returned from
* {@link com.bazaarvoice.emodb.sor.db.astyanax.DeltaPlacement#getBlockedDeltaTableDDL()}
*/
private Iterator<Iterable<Row>> deltaQuery(DeltaPlacement placement, Statement statement, boolean singleRow,
String errorContext, Object... errorContextArgs) {
return doDeltaQuery(placement, statement, singleRow, false, errorContext, errorContextArgs);
}
/**
* Asynchronously executes the provided statement. Although the iterator is returned immediately the actual results
* may still be loading in the background. The statement must query the delta table as returned from
* {@link com.bazaarvoice.emodb.sor.db.astyanax.DeltaPlacement#getBlockedDeltaTableDDL()}
*/
private Iterator<Iterable<Row>> deltaQueryAsync(DeltaPlacement placement, Statement statement, boolean singleRow,
String errorContext, Object... errorContextArgs) {
return doDeltaQuery(placement, statement, singleRow, true, errorContext, errorContextArgs);
}
private Iterator<Iterable<Row>> doDeltaQuery(DeltaPlacement placement, Statement statement, boolean singleRow, boolean async,
String errorContext, Object... errorContextArgs) {
// Set the fetch size and prefetch limits depending on whether the query is for a single row or multiple rows.
int fetchSize = singleRow ? _driverConfig.getSingleRowFetchSize() : _driverConfig.getMultiRowFetchSize();
int prefetchLimit = singleRow ? _driverConfig.getSingleRowPrefetchLimit() : _driverConfig.getMultiRowPrefetchLimit();
Session session = placement.getKeyspace().getCqlSession();
DeltaRowGroupResultSetIterator deltaRowGroupResultSetIterator;
if (async) {
ListenableFuture<ResultSet> resultSetFuture = AdaptiveResultSet.executeAdaptiveQueryAsync(session, statement, fetchSize);
deltaRowGroupResultSetIterator = new DeltaRowGroupResultSetIterator(
resultSetFuture, prefetchLimit, placement, statement.getConsistencyLevel());
Futures.addCallback(resultSetFuture, new MoreFutures.FailureCallback<ResultSet>() {
@Override
public void onFailure(Throwable t) {
_log.error(String.format(errorContext, errorContextArgs), t);
}
});
} else {
try {
ResultSet resultSet = AdaptiveResultSet.executeAdaptiveQuery(session, statement, fetchSize);
deltaRowGroupResultSetIterator = new DeltaRowGroupResultSetIterator(
resultSet, prefetchLimit, placement, statement.getConsistencyLevel());
} catch (Throwable t) {
_log.error(String.format(errorContext, errorContextArgs), t);
throw t;
}
}
return new CachingRowGroupIterator(deltaRowGroupResultSetIterator, _driverConfig.getRecordCacheSize(), _driverConfig.getRecordSoftCacheSize());
}
/**
* Creates a Record instance for a given key and list of rows. All rows must be from the same Cassandra row;
* in other words, it is expected that row.getBytesUnsafe(ROW_KEY_RESULT_SET_COLUMN) returns the same value for
* each row in rows.
*/
private Record newRecordFromCql(Key key, Iterable<Row> rows, Placement placement, String rowKey) {
Session session = placement.getKeyspace().getCqlSession();
ProtocolVersion protocolVersion = session.getCluster().getConfiguration().getProtocolOptions().getProtocolVersion();
CodecRegistry codecRegistry = session.getCluster().getConfiguration().getCodecRegistry();
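// The three decoders below each call rows.iterator() independently. This is safe because the row group is
// typically backed by CachingRowGroupIterator, which caches the first rows (and soft-caches the rest) so
// they can be re-read without issuing another query.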
Iterator<Map.Entry<DeltaClusteringKey, Change>> changeIter = decodeChangesFromCql(new CqlDeltaIterator(rows.iterator(), BLOCK_RESULT_SET_COLUMN, CHANGE_ID_RESULT_SET_COLUMN, VALUE_RESULT_SET_COLUMN, false, _deltaPrefixLength, protocolVersion, codecRegistry, rowKey));
Iterator<Map.Entry<DeltaClusteringKey, Compaction>> compactionIter = decodeCompactionsFromCql(new CqlDeltaIterator(rows.iterator(), BLOCK_RESULT_SET_COLUMN, CHANGE_ID_RESULT_SET_COLUMN, VALUE_RESULT_SET_COLUMN, false, _deltaPrefixLength, protocolVersion, codecRegistry, rowKey));
Iterator<RecordEntryRawMetadata> rawMetadataIter = rawMetadataFromCql(new CqlDeltaIterator(rows.iterator(), BLOCK_RESULT_SET_COLUMN, CHANGE_ID_RESULT_SET_COLUMN, VALUE_RESULT_SET_COLUMN, false, _deltaPrefixLength, protocolVersion, codecRegistry, rowKey));
return new RecordImpl(key, compactionIter, changeIter, rawMetadataIter);
}
/**
* Converts a list of rows into Change instances.
*/
private Iterator<Map.Entry<DeltaClusteringKey, Change>> decodeChangesFromCql(final Iterator<StitchedRow> iter) {
return Iterators.transform(iter, row ->
Maps.immutableEntry(new DeltaClusteringKey(getChangeId(row), row.getNumBlocks()), _changeEncoder.decodeChange(getChangeId(row), _daoUtils.skipPrefix(getValue(row)))));
}
/**
* Like {@link #decodeChangesFromCql(java.util.Iterator)} except filtered to only include compactions.
*/
private Iterator<Map.Entry<DeltaClusteringKey, Compaction>> decodeCompactionsFromCql(final Iterator<StitchedRow> iter) {
return new AbstractIterator<Map.Entry<DeltaClusteringKey, Compaction>>() {
@Override
protected Map.Entry<DeltaClusteringKey, Compaction> computeNext() {
while (iter.hasNext()) {
StitchedRow row = iter.next();
Compaction compaction = _changeEncoder.decodeCompaction(_daoUtils.skipPrefix(getValue(row)));
if (compaction != null) {
return Maps.immutableEntry(new DeltaClusteringKey(getChangeId(row),row.getNumBlocks()), compaction);
}
}
return endOfData();
}
};
}
/**
* Converts the rows from the provided iterator into raw metadata.
*/
private Iterator<RecordEntryRawMetadata> rawMetadataFromCql(final Iterator<StitchedRow> iter) {
return Iterators.transform(iter, row -> new RecordEntryRawMetadata()
.withTimestamp(TimeUUIDs.getTimeMillis(getChangeId(row)))
.withSize(_daoUtils.skipPrefix(getValue(row)).remaining()));
}
/**
* Read a batch of keys that all belong to the same placement (ColumnFamily).
*/
private Iterator<Record> readBatch(final DeltaPlacement placement, final Collection<Key> keys, final ReadConsistency consistency) {
requireNonNull(keys, "keys");
// Convert the keys to ByteBuffer Cassandra row keys
List<Map.Entry<ByteBuffer, Key>> rowKeys = Lists.newArrayListWithCapacity(keys.size());
for (Key key : keys) {
AstyanaxTable table = (AstyanaxTable) key.getTable();
AstyanaxStorage storage = table.getReadStorage();
rowKeys.add(Maps.immutableEntry(storage.getRowKey(key.getKey()), key));
}
// Sort the keys by their byte array encoding to get some locality w/queries.
Collections.sort(rowKeys, Ordering.natural().onResultOf(entry -> entry.getKey()));
// Group them into batches. Cassandra may have to seek each row so prefer smaller batches.
List<List<Map.Entry<ByteBuffer, Key>>> batches = Lists.partition(rowKeys, _driverConfig.getMaxRandomRowsBatchSize());
// This algorithm is arranged such that rows are returned in pages of the configured fetch size. The rows are grouped
// into row groups by common row key. The first RECORD_CACHE_SIZE rows are cached for the row group
// and any remaining rows are cached using soft references. This places an upper bound on the memory
// requirements needed while iterating. If at any time a soft reference is lost C* is re-queried to
// fetch the missing columns.
return Iterators.concat(Iterators.transform(batches.iterator(),
rowKeySubset -> {
Timer.Context timerCtx = _readBatchTimer.time();
try {
return rowQuery(rowKeySubset, consistency, placement);
} finally {
timerCtx.stop();
}
}));
}
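// For example, assuming a max random-rows batch size of 50 (the actual value comes from
// CqlDriverConfiguration), a readAll() of 120 keys in one placement issues three IN-clause queries covering
// 50, 50 and 20 row keys respectively.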
/**
* Returns an iterator for the Records keyed by the provided row keys. An empty record is returned for any
* key which does not have a corresponding row in C*.
*/
private Iterator<Record> rowQuery(final List<Map.Entry<ByteBuffer, Key>> rowKeys, final ReadConsistency consistency,
final DeltaPlacement placement) {
List<ByteBuffer> keys = Lists.newArrayListWithCapacity(rowKeys.size());
final Map<ByteBuffer, Key> rawKeyMap = Maps.newHashMap();
for (Map.Entry<ByteBuffer, Key> entry : rowKeys) {
keys.add(entry.getKey());
rawKeyMap.put(entry.getKey(), entry.getValue());
}
BlockedDeltaTableDDL tableDDL = placement.getBlockedDeltaTableDDL();
Statement statement = selectDeltaFrom(tableDDL)
.where(in(tableDDL.getRowKeyColumnName(), keys))
.setConsistencyLevel(SorConsistencies.toCql(consistency));
Iterator<Iterable<Row>> rowGroups = deltaQueryAsync(placement, statement, false, "Failed to read records %s", rawKeyMap.values());
return Iterators.concat(
// First iterator reads the row groups found and transforms them to Records
Iterators.transform(rowGroups, rows -> {
ByteBuffer keyBytes = getRawKeyFromRowGroup(rows);
Key key = rawKeyMap.remove(keyBytes);
assert key != null : "Query returned row with a key out of bound";
return newRecordFromCql(key, rows, placement, ByteBufferUtil.bytesToHex(keyBytes));
}),
// Second iterator returns an empty Record for each key queried but not found.
new AbstractIterator<Record>() {
private Iterator<Key> _nonExistentKeyIterator;
@Override
protected Record computeNext() {
// Lazily return an empty record for each key not found in the previous iterator.
// rawKeyMap.iterator() must not be called until the first iterator is completely spent.
if (_nonExistentKeyIterator == null) {
_nonExistentKeyIterator = rawKeyMap.values().iterator();
}
return _nonExistentKeyIterator.hasNext() ?
emptyRecord(_nonExistentKeyIterator.next()) :
endOfData();
}
});
}
/**
* Returns a select statement builder for a {@link TableDDL} with the columns ordered in the order set by
* {@link #ROW_KEY_RESULT_SET_COLUMN}, {@link #CHANGE_ID_RESULT_SET_COLUMN}, and {@link #VALUE_RESULT_SET_COLUMN}.
*/
private Select selectFrom(TableDDL tableDDL) {
return QueryBuilder.select()
.column(tableDDL.getRowKeyColumnName()) // ROW_KEY_RESULT_SET_COLUMN
.column(tableDDL.getChangeIdColumnName()) // CHANGE_ID_RESULT_SET_COLUMN
.column(tableDDL.getValueColumnName()) // VALUE_RESULT_SET_COLUMN
.from(tableDDL.getTableMetadata());
}
private Select selectDeltaFrom(BlockedDeltaTableDDL tableDDL) {
return QueryBuilder.select()
.column(tableDDL.getRowKeyColumnName()) // ROW_KEY_RESULT_SET_COLUMN
.column(tableDDL.getChangeIdColumnName()) // CHANGE_ID_RESULT_SET_COLUMN
.column(tableDDL.getValueColumnName()) // VALUE_RESULT_SET_COLUMN
.column(tableDDL.getBlockColumnName()) // BLOCK_RESULT_SET_COLUMN
.from(tableDDL.getTableMetadata());
}
private ByteBuffer getKey(Row row) {
return row.getBytesUnsafe(ROW_KEY_RESULT_SET_COLUMN);
}
private UUID getChangeId(Row row) {
return row.getUUID(CHANGE_ID_RESULT_SET_COLUMN);
}
private int getBlock(Row row) {
return row.getInt(BLOCK_RESULT_SET_COLUMN);
}
private ByteBuffer getValue(Row row) {
return row.getBytesUnsafe(VALUE_RESULT_SET_COLUMN);
}
/**
* A few notes on this method:
*
* - All rows in the row group have the same key, so choosing the first row is safe.
* - The rowGroup will always contain at least one row.
* - The row group has at least the first row in hard cache, so iterating to the first row will never
* result in a new CQL query.
*
*/
private ByteBuffer getRawKeyFromRowGroup(Iterable<Row> rowGroup) {
Iterator<Row> iter = rowGroup.iterator();
// Sanity check
assert iter.hasNext() : "Row group should never contain zero rows";
return getKey(iter.next());
}
/**
* Similar to {@link #getRawKeyFromRowGroup(Iterable)} except it may be used where the row group contains no rows,
* in which case it returns null.
*/
private ByteBuffer getRawKeyFromRowGroupOrNull(Iterable<Row> filteredRowGroup) {
Iterator<Row> iter = filteredRowGroup.iterator();
return iter.hasNext() ? getKey(iter.next()) : null;
}
private <T> Iterator<T> touch(Iterator<T> iter) {
// Could return a Guava PeekingIterator after "if (iter.hasNext()) iter.peek()", but simply calling hasNext()
// is sufficient for the iterator implementations used by this DAO class...
iter.hasNext();
return iter;
}
@Timed(name = "bv.emodb.sor.CqlDataReaderDAO.scan", absolute = true)
@Override
public Iterator<Record> scan(Table tbl, @Nullable String fromKeyExclusive, final LimitCounter ignore_limit,
final ReadConsistency consistency) {
// Note: The LimitCounter is passed in as an artifact of Astyanax batching and was used as a mechanism to
// control paging. The CQL driver natively performs this functionality so it is not used here. The caller
// will apply limit boundaries on the results from this method.
if (!_useCqlForScans.get()) {
return _astyanaxReaderDAO.scan(tbl, fromKeyExclusive, ignore_limit, consistency);
}
requireNonNull(tbl, "table");
requireNonNull(consistency, "consistency");
final AstyanaxTable table = (AstyanaxTable) tbl;
AstyanaxStorage storage = table.getReadStorage();
final DeltaPlacement placement = (DeltaPlacement) storage.getPlacement();
// Loop over all the range prefixes (2^shardsLog2 of them) and, for each, execute Cassandra queries to
// page through the records with that prefix.
final Iterator<ByteBufferRange> scanIter = storage.scanIterator(fromKeyExclusive);
return touch(Iterators.concat(new AbstractIterator<Iterator<Record>>() {
@Override
protected Iterator<Record> computeNext() {
if (scanIter.hasNext()) {
ByteBufferRange keyRange = scanIter.next();
return recordScan(placement, table, keyRange, consistency);
}
return endOfData();
}
}));
}
@Override
public Iterator<Record> getSplit(Table tbl, String split, @Nullable String fromKeyExclusive, LimitCounter ignore_limit,
ReadConsistency consistency) {
// Note: The LimitCounter is passed in as an artifact of Astyanax batching and was used as a mechanism to
// control paging. The CQL driver natively performs this functionality so it is not used here. The caller
// will apply limit boundaries on the results from this method.
if (!_useCqlForScans.get()) {
return _astyanaxReaderDAO.getSplit(tbl, split, fromKeyExclusive, ignore_limit, consistency);
}
requireNonNull(tbl, "table");
requireNonNull(split, "split");
requireNonNull(consistency, "consistency");
ByteBufferRange splitRange = SplitFormat.decode(split);
AstyanaxTable table = (AstyanaxTable) tbl;
AstyanaxStorage storage = getStorageForSplit(table, splitRange);
DeltaPlacement placement = (DeltaPlacement) storage.getPlacement();
ByteBufferRange keyRange = storage.getSplitRange(splitRange, fromKeyExclusive, split);
// The fromKeyExclusive might be equal to the end token of the split. If so, there's nothing to return.
if (keyRange.getStart().equals(keyRange.getEnd())) {
return Collections.emptyIterator();
}
return recordScan(placement, table, keyRange, consistency);
}
/**
* Scans a range of keys and returns an iterator containing each row's columns as an iterable.
*/
private Iterator<Iterable<Row>> rowScan(DeltaPlacement placement, @Nullable AstyanaxTable table, ByteBufferRange keyRange,
ReadConsistency consistency) {
ByteBuffer startToken = keyRange.getStart();
ByteBuffer endToken = keyRange.getEnd();
// Note: if Cassandra is asked to perform a token range query where start >= end it will wrap
// around which is absolutely *not* what we want.
checkArgument(AstyanaxStorage.compareKeys(startToken, endToken) < 0, "Cannot scan rows which loop from maximum- to minimum-token");
BlockedDeltaTableDDL tableDDL = placement.getBlockedDeltaTableDDL();
Statement statement = selectDeltaFrom(tableDDL)
.where(gt(token(tableDDL.getRowKeyColumnName()), startToken))
.and(lte(token(tableDDL.getRowKeyColumnName()), endToken))
.setConsistencyLevel(SorConsistencies.toCql(consistency));
return deltaQueryAsync(placement, statement, false, "Failed to scan token range [%s, %s] for %s",
ByteBufferUtil.bytesToHex(startToken), ByteBufferUtil.bytesToHex(endToken),
table != null ? table : "multiple tables");
}
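// The statement above corresponds roughly to the following CQL (placeholder names; the actual names come
// from the BlockedDeltaTableDDL):
//
//   SELECT rowkey, changeid, value, block FROM <delta_table>
//   WHERE token(rowkey) > <startToken> AND token(rowkey) <= <endToken>;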
/**
* Similar to {@link #rowScan(DeltaPlacement, AstyanaxTable, com.netflix.astyanax.model.ByteBufferRange, com.bazaarvoice.emodb.sor.api.ReadConsistency)}
* except this method converts each C* row into a Record.
*/
private Iterator<Record> recordScan(DeltaPlacement placement, AstyanaxTable table, ByteBufferRange keyRange,
ReadConsistency consistency) {
Iterator<Iterable<Row>> rowGroups = rowScan(placement, table, keyRange, consistency);
return decodeRows(rowGroups, table, placement);
}
/**
* Converts each group of rows from a single C* wide row into a Record.
*/
private Iterator<Record> decodeRows(Iterator<Iterable<Row>> rowGroups, final AstyanaxTable table, Placement placement) {
return Iterators.transform(rowGroups, rowGroup -> {
String key = AstyanaxStorage.getContentKey(getRawKeyFromRowGroup(rowGroup));
return newRecordFromCql(new Key(table, key), rowGroup, placement, ByteBufferUtil.bytesToHex(getRawKeyFromRowGroupOrNull(rowGroup)));
});
}
@Override
public Iterator<MultiTableScanResult> multiTableScan(final MultiTableScanOptions query, final TableSet tables,
final LimitCounter limit, final ReadConsistency consistency, @Nullable Instant cutoffTime) {
if (!_useCqlForScans.get()) {
return _astyanaxReaderDAO.multiTableScan(query, tables, limit, consistency, cutoffTime);
}
requireNonNull(query, "query");
String placementName = requireNonNull(query.getPlacement(), "placement");
final DeltaPlacement placement = (DeltaPlacement) _placementCache.get(placementName);
ScanRange scanRange = ofNullable(query.getScanRange()).orElse(ScanRange.all());
// Since the range may wrap from high to low end of the token range we need to unwrap it
List<ScanRange> ranges = scanRange.unwrapped();
return touch(FluentIterable.from(ranges)
.transformAndConcat(rowRange -> scanMultiTableRows(
tables, placement, rowRange.asByteBufferRange(), limit, query.isIncludeDeletedTables(),
query.isIncludeMirrorTables(), consistency, cutoffTime))
.iterator());
}
/**
* Decodes rows returned by scanning across tables.
*/
private Iterable<MultiTableScanResult> scanMultiTableRows(
final TableSet tables, final DeltaPlacement placement, final ByteBufferRange rowRange,
final LimitCounter limit, final boolean includeDroppedTables, final boolean includeMirrorTables,
final ReadConsistency consistency, final Instant cutoffTime) {
// Avoid pinning multiple decoded rows in memory at once.
return () -> limit.limit(new AbstractIterator<MultiTableScanResult>() {
private PeekingIterator<Iterable<Row>> _iter = Iterators.peekingIterator(
rowScan(placement, null, rowRange, consistency));
private long _lastTableUuid = -1;
private AstyanaxTable _table = null;
private boolean _droppedTable;
private boolean _primaryTable;
@Override
protected MultiTableScanResult computeNext() {
while (_iter.hasNext()) {
// Get the next rows from the grouping iterator. All rows in the returned Iterable
// are from the same Cassandra wide row (in other words, they share the same key).
final Iterable<Row> rows = _iter.next();
// Filter the rows if a cutoff time is specified.
Iterable<Row> filteredRows = rows;
if (cutoffTime != null) {
filteredRows = getFilteredRows(rows, cutoffTime);
}
// Convert the filteredRows into a Record object
ByteBuffer rowKey = getRawKeyFromRowGroupOrNull(filteredRows);
// rowKey can be null if all the rows of the Cassandra record fall at or after the cutoff time. In that case ignore the record and continue.
if (rowKey == null) {
continue;
}
long tableUuid = AstyanaxStorage.getTableUuid(rowKey);
if (_lastTableUuid != tableUuid) {
_lastTableUuid = tableUuid;
try {
_table = (AstyanaxTable) tables.getByUuid(tableUuid);
} catch (UnknownTableException e) {
_table = AstyanaxTable.createUnknown(tableUuid, placement, e.getTable());
} catch (DroppedTableException e) {
_table = AstyanaxTable.createUnknown(tableUuid, placement, e.getPriorTable());
}
_droppedTable = _table.isUnknownTable();
_primaryTable = _table.getReadStorage().hasUUID(tableUuid);
}
// Skip dropped and mirror tables if configured
if ((!includeDroppedTables && _droppedTable) || (!includeMirrorTables && !_primaryTable)) {
_iter = skipToNextTable(tableUuid);
continue;
}
int shardId = AstyanaxStorage.getShardId(rowKey);
String key = AstyanaxStorage.getContentKey(rowKey);
Record record = newRecordFromCql(new Key(_table, key), filteredRows, placement, ByteBufferUtil.bytesToHex(rowKey));
return new MultiTableScanResult(rowKey, shardId, tableUuid, _droppedTable, record);
}
return endOfData();
}
private PeekingIterator<Iterable<Row>> skipToNextTable(long tableUuid) {
// Iterate over the next 10 row groups first to check for a table switch. This avoids starting a new range
// query if the number of rows in the undesired table is small.
int skipLimit = 10;
Iterable<Row> rowGroup = null;
while (skipLimit != 0 && _iter.hasNext()) {
rowGroup = _iter.peek();
ByteBuffer rawKey = getRawKeyFromRowGroup(rowGroup);
long nextTableUuid = AstyanaxStorage.getTableUuid(rawKey);
if (nextTableUuid != tableUuid) {
// This is the first row of a new table
return _iter;
} else {
_iter.next();
skipLimit -= 1;
}
}
if (_iter.hasNext()) {
// Skip the table entirely by starting a new query on the next possible table
assert rowGroup != null;
int shardId = AstyanaxStorage.getShardId(getRawKeyFromRowGroup(rowGroup));
ByteBuffer nextPossibleTableStart = AstyanaxStorage.getRowKeyRaw(shardId, tableUuid + 1, "");
ByteBuffer end = rowRange.getEnd();
if (AstyanaxStorage.compareKeys(nextPossibleTableStart, end) < 0) {
// We haven't reached the last end boundary of the original range scan
ByteBufferRange updatedRange = new ByteBufferRangeImpl(nextPossibleTableStart, end, -1, false);
return Iterators.peekingIterator(rowScan(placement, null, updatedRange, consistency));
}
}
return Iterators.peekingIterator(Collections.emptyIterator());
}
});
}
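// Note on skipToNextTable() above: up to 10 row groups from an excluded (dropped or mirror) table are
// consumed in place; if the excluded table is longer than that, a fresh token-range query is started at
// getRowKeyRaw(shardId, tableUuid + 1, ""), i.e. the first possible row key of the next table UUID within
// the same shard, provided that key is still inside the original scan range.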
private AstyanaxStorage getStorageForSplit(AstyanaxTable table, ByteBufferRange splitRange) {
// During a table move, after the internal copy is complete getSplits() will return split IDs that point to
// the new storage location (table.getReadStorage()) but must still support old split IDs from the old
// storage location for a while.
if (!table.getReadStorage().contains(splitRange.getStart())) {
for (AstyanaxStorage storage : table.getWriteStorage()) {
if (storage.contains(splitRange.getStart()) && storage.getReadsAllowed()) {
return storage;
}
}
}
return table.getReadStorage();
}
/**
* Implementation of {@link RowGroupResultSetIterator} that reads row groups from a delta table.
*/
private class DeltaRowGroupResultSetIterator extends RowGroupResultSetIterator {
private final DeltaPlacement _placement;
private final ConsistencyLevel _consistency;
private DeltaRowGroupResultSetIterator(ResultSet resultSet, int prefetchLimit,
DeltaPlacement placement, ConsistencyLevel consistency) {
super(resultSet, prefetchLimit);
_placement = placement;
_consistency = consistency;
}
private DeltaRowGroupResultSetIterator(ListenableFuture<ResultSet> resultSetFuture, int prefetchLimit,
DeltaPlacement placement, ConsistencyLevel consistency) {
super(resultSetFuture, prefetchLimit);
_placement = placement;
_consistency = consistency;
}
@Override
protected Object getKeyForRow(Row row) {
return CqlBlockedDataReaderDAO.this.getKey(row);
}
@Override
protected ResultSet queryRowGroupRowsAfter(Row row) {
Statement statement = selectDeltaFrom(_placement.getBlockedDeltaTableDDL())
.where(eq(_placement.getBlockedDeltaTableDDL().getRowKeyColumnName(), getKey(row)))
.and(gt(ImmutableList.of(_placement.getBlockedDeltaTableDDL().getChangeIdColumnName(), _placement.getBlockedDeltaTableDDL().getBlockColumnName()),
ImmutableList.of(getChangeId(row), getBlock(row))))
.orderBy(asc(_placement.getBlockedDeltaTableDDL().getChangeIdColumnName()))
.setConsistencyLevel(_consistency);
return AdaptiveResultSet.executeAdaptiveQuery(_placement.getKeyspace().getCqlSession(), statement, _driverConfig.getSingleRowFetchSize());
}
}
/**
* Reads columns from the delta or delta history table. The range of columns, order, and limit can be
* parameterized.
*/
private ResultSet columnScan(DeltaPlacement placement, TableDDL tableDDL, ByteBuffer rowKey, Range<RangeTimeUUID> columnRange,
boolean ascending, ConsistencyLevel consistency) {
Select.Where where = (tableDDL == placement.getBlockedDeltaTableDDL() ? selectDeltaFrom(placement.getBlockedDeltaTableDDL()) : selectFrom(tableDDL))
.where(eq(tableDDL.getRowKeyColumnName(), rowKey));
if (columnRange.hasLowerBound()) {
if (columnRange.lowerBoundType() == BoundType.CLOSED) {
where = where.and(gte(tableDDL.getChangeIdColumnName(), columnRange.lowerEndpoint().getUuid()));
} else {
where = where.and(gt(tableDDL.getChangeIdColumnName(), columnRange.lowerEndpoint().getUuid()));
}
}
if (columnRange.hasUpperBound()) {
if (columnRange.upperBoundType() == BoundType.CLOSED) {
where = where.and(lte(tableDDL.getChangeIdColumnName(), columnRange.upperEndpoint().getUuid()));
} else {
where = where.and(lt(tableDDL.getChangeIdColumnName(), columnRange.upperEndpoint().getUuid()));
}
}
Statement statement = where
.orderBy(ascending ? asc(tableDDL.getChangeIdColumnName()) : desc(tableDDL.getChangeIdColumnName()))
.setConsistencyLevel(consistency);
return AdaptiveResultSet.executeAdaptiveQuery(placement.getKeyspace().getCqlSession(), statement, _driverConfig.getSingleRowFetchSize());
}
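// Illustratively, a scan with a closed lower bound and an open upper bound over the delta table corresponds
// roughly to the following CQL (placeholder names; the actual names come from the table DDL):
//
//   SELECT rowkey, changeid, value, block FROM <delta_table>
//   WHERE rowkey = <rowKey> AND changeid >= <lower> AND changeid < <upper>
//   ORDER BY changeid ASC;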
@Override
public Iterator<Change> readTimeline(Key key, boolean includeContentData, UUID start, UUID end,
boolean reversed, long limit, ReadConsistency readConsistency) {
requireNonNull(key, "key");
checkArgument(limit > 0, "Limit must be >0");
requireNonNull(readConsistency, "consistency");
// Even though the API allows for a long limit, CQL only supports integer values. Anything longer than MAX_INT
// is impractical given that a single Cassandra record must practically hold less than 2G rows since a wide row
// cannot be larger than 2G bytes.
int scaledLimit = (int) Math.min(Integer.MAX_VALUE, limit);
AstyanaxTable table = (AstyanaxTable) key.getTable();
AstyanaxStorage storage = table.getReadStorage();
DeltaPlacement placement = (DeltaPlacement) storage.getPlacement();
ByteBuffer rowKey = storage.getRowKey(key.getKey());
Range<RangeTimeUUID> columnRange = toRange(start, end, reversed);
ConsistencyLevel consistency = SorConsistencies.toCql(readConsistency);
// Read Delta and Compaction objects
Iterator<Change> deltas = Collections.emptyIterator();
if (includeContentData) {
TableDDL deltaDDL = placement.getBlockedDeltaTableDDL();
ProtocolVersion protocolVersion = placement.getKeyspace().getCqlSession().getCluster().getConfiguration().getProtocolOptions().getProtocolVersion();
CodecRegistry codecRegistry = placement.getKeyspace().getCqlSession().getCluster().getConfiguration().getCodecRegistry();
deltas = decodeDeltaColumns(Iterators.limit(new CqlDeltaIterator(columnScan(placement, deltaDDL, rowKey, columnRange, !reversed, consistency).iterator(), BLOCK_RESULT_SET_COLUMN, CHANGE_ID_RESULT_SET_COLUMN, VALUE_RESULT_SET_COLUMN, reversed, _deltaPrefixLength, protocolVersion, codecRegistry, ByteBufferUtil.bytesToHex(rowKey)), scaledLimit));
}
// Read History objects
Iterator<Change> deltaHistory = Collections.emptyIterator();
TableDDL deltaHistoryDDL = placement.getDeltaHistoryTableDDL();
deltaHistory = decodeColumns(Iterators.limit(columnScan(placement, deltaHistoryDDL, rowKey, columnRange, !reversed, consistency).iterator(), scaledLimit));
return touch(MergeIterator.merge(deltas, deltaHistory, reversed));
}
@Override
public Iterator<Change> getExistingHistories(Key key, UUID start, UUID end, ReadConsistency readConsistency) {
AstyanaxTable table = (AstyanaxTable) key.getTable();
AstyanaxStorage storage = table.getReadStorage();
ByteBuffer rowKey = storage.getRowKey(key.getKey());
DeltaPlacement placement = (DeltaPlacement) storage.getPlacement();
Range<RangeTimeUUID> columnRange = toRange(start, end, true);
ConsistencyLevel consistency = SorConsistencies.toCql(readConsistency);
TableDDL deltaHistoryDDL = placement.getDeltaHistoryTableDDL();
return decodeColumns(columnScan(placement, deltaHistoryDDL, rowKey, columnRange, false, consistency).iterator());
}
/**
* Transforms the provided Row iterator into a {@link Change} iterator.
*/
private Iterator<Change> decodeColumns(Iterator<Row> iter) {
return Iterators.transform(iter, row -> _changeEncoder.decodeChange(getChangeId(row), getValue(row)));
}
private Iterator<Change> decodeDeltaColumns(Iterator<StitchedRow> iter) {
return Iterators.transform(iter, row -> _changeEncoder.decodeChange(getChangeId(row), _daoUtils.skipPrefix(getValue(row))));
}
/**
* Converts a pair of TimeUUID endpoints into a {@link Range} of {@link RangeTimeUUID}s. Both endpoints
* are considered closed; that is, they are included in the range.
*/
private Range<RangeTimeUUID> toRange(@Nullable UUID start, @Nullable UUID end, boolean reversed) {
// If the range is reversed then start and end will also be reversed and must therefore be swapped.
if (reversed) {
UUID tmp = start;
start = end;
end = tmp;
}
if (start == null) {
if (end == null) {
return Range.all();
} else {
return Range.atMost(new RangeTimeUUID(end));
}
} else if (end == null) {
return Range.atLeast(new RangeTimeUUID(start));
}
return Range.closed(new RangeTimeUUID(start), new RangeTimeUUID(end));
}
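// For example, toRange(a, b, false) yields the closed range [a, b], while toRange(a, b, true) yields [b, a]
// because the endpoints arrive in iteration order and are swapped back. A null endpoint leaves that side of
// the range unbounded, and toRange(null, null, ...) yields Range.all().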
/**
* {@link Range} requires a comparable type. This class thinly encapsulates a UUID and sorts it as a TimeUUID.
*/
private static class RangeTimeUUID implements Comparable<RangeTimeUUID> {
private final UUID _uuid;
private RangeTimeUUID(UUID uuid) {
_uuid = uuid;
}
@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (!(o instanceof RangeTimeUUID)) {
return false;
}
return _uuid.equals(((RangeTimeUUID) o)._uuid);
}
@Override
public int hashCode() {
return _uuid.hashCode();
}
@Override
public int compareTo(RangeTimeUUID o) {
return TimeUUIDs.compare(_uuid, o._uuid);
}
private UUID getUuid() {
return _uuid;
}
}
/**
* Helper method to return a record with no rows.
*/
private Record emptyRecord(Key key) {
return new RecordImpl(key,
Collections.emptyIterator(),
Collections.emptyIterator(),
Collections.emptyIterator());
}
@VisibleForTesting
public static Iterable<Row> getFilteredRows(Iterable<Row> rows, Instant cutoffTime) {
if (cutoffTime == null) {
return rows;
}
return () -> Iterators.filter(rows.iterator(), row -> (TimeUUIDs.getTimeMillis(row.getUUID(CHANGE_ID_RESULT_SET_COLUMN)) < cutoffTime.toEpochMilli()));
}
// The following methods rely on the Cassandra thrift call describe_splits_ex() to split
// a token range into portions of approximately equal size. There is currently no equivalent client-side
// support for this call using CQL. Therefore they must always defer to the Astyanax implementation.
@Override
public List<String> getSplits(Table table, int recordsPerSplit, int localResplits) throws TimeoutException {
return _astyanaxReaderDAO.getSplits(table, recordsPerSplit, localResplits);
}
@Override
public ScanRangeSplits getScanRangeSplits(String placement, int desiredRecordsPerSplit, Optional<ScanRange> subrange) {
return _astyanaxReaderDAO.getScanRangeSplits(placement, desiredRecordsPerSplit, subrange);
}
@Override
public long count(Table table, ReadConsistency consistency) {
return _astyanaxReaderDAO.count(table, consistency);
}
@Override
public long count(Table table, @Nullable Integer limit, ReadConsistency consistency) {
return _astyanaxReaderDAO.count(table, limit, consistency);
}
@Override
public Stream<String> getKeysForStorage(AstyanaxStorage storage) {
// Loop over all the range prefixes (2^shardsLog2 of them) and, for each, execute Cassandra queries to
// page through the rowkeys with that prefix.
final DeltaPlacement placement = (DeltaPlacement) storage.getPlacement();
BlockedDeltaTableDDL tableDDL = placement.getBlockedDeltaTableDDL();
return StreamSupport.stream(Spliterators.spliteratorUnknownSize(storage.scanIterator(null), 0), false)
.map(keyRange ->
QueryBuilder.select()
.distinct()
.column(tableDDL.getRowKeyColumnName())
.from(tableDDL.getTableMetadata())
.where(gt(token(tableDDL.getRowKeyColumnName()), keyRange.getStart()))
.and(lte(token(tableDDL.getRowKeyColumnName()), keyRange.getEnd()))
.setConsistencyLevel(ConsistencyLevel.ALL))
.flatMap(statement ->
StreamSupport.stream(
Spliterators.spliteratorUnknownSize(
deltaQuery(placement, statement, false, "Failed to scan keys for storage %s", storage.toString()),
0),
false))
.map(this::getRawKeyFromRowGroup)
.map(AstyanaxStorage::getContentKey);
}
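// As with rowScan(), each statement above is roughly "SELECT DISTINCT rowkey FROM <delta_table>
// WHERE token(rowkey) > <start> AND token(rowkey) <= <end>" (placeholder names), executed at
// ConsistencyLevel.ALL, presumably so that no keys are missed due to replica lag.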
}