package com.bazaarvoice.emodb.sor.db.astyanax;

import com.bazaarvoice.emodb.common.api.impl.LimitCounter;
import com.bazaarvoice.emodb.common.cassandra.CassandraKeyspace;
import com.bazaarvoice.emodb.common.cassandra.astyanax.KeyspaceUtil;
import com.bazaarvoice.emodb.common.cassandra.nio.BufferUtils;
import com.bazaarvoice.emodb.common.uuid.TimeUUIDs;
import com.bazaarvoice.emodb.sor.api.Change;
import com.bazaarvoice.emodb.sor.api.Compaction;
import com.bazaarvoice.emodb.sor.api.ReadConsistency;
import com.bazaarvoice.emodb.sor.api.UnknownTableException;
import com.bazaarvoice.emodb.sor.core.AbstractBatchReader;
import com.bazaarvoice.emodb.sor.db.DAOUtils;
import com.bazaarvoice.emodb.sor.db.DataReaderDAO;
import com.bazaarvoice.emodb.sor.db.HistoryMigrationScanResult;
import com.bazaarvoice.emodb.sor.db.Key;
import com.bazaarvoice.emodb.sor.db.MigrationScanResult;
import com.bazaarvoice.emodb.sor.db.MultiTableScanOptions;
import com.bazaarvoice.emodb.sor.db.MultiTableScanResult;
import com.bazaarvoice.emodb.sor.db.Record;
import com.bazaarvoice.emodb.sor.db.RecordEntryRawMetadata;
import com.bazaarvoice.emodb.sor.db.ScanRange;
import com.bazaarvoice.emodb.sor.db.ScanRangeSplits;
import com.bazaarvoice.emodb.sor.db.test.DeltaClusteringKey;
import com.bazaarvoice.emodb.table.db.DroppedTableException;
import com.bazaarvoice.emodb.table.db.Table;
import com.bazaarvoice.emodb.table.db.TableSet;
import com.bazaarvoice.emodb.table.db.astyanax.AstyanaxStorage;
import com.bazaarvoice.emodb.table.db.astyanax.AstyanaxTable;
import com.bazaarvoice.emodb.table.db.astyanax.PlacementCache;
import com.codahale.metrics.Meter;
import com.codahale.metrics.MetricRegistry;
import com.codahale.metrics.Timer;
import com.codahale.metrics.annotation.Timed;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Function;
import com.google.common.base.Predicates;
import com.google.common.base.Throwables;
import com.google.common.collect.AbstractIterator;
import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.FluentIterable;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;
import com.google.common.collect.Iterators;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimap;
import com.google.common.collect.Multimaps;
import com.google.common.collect.Ordering;
import com.google.common.collect.PeekingIterator;
import com.google.inject.Inject;
import com.netflix.astyanax.CassandraOperationType;
import com.netflix.astyanax.Execution;
import com.netflix.astyanax.Keyspace;
import com.netflix.astyanax.connectionpool.ConnectionContext;
import com.netflix.astyanax.connectionpool.ConnectionPool;
import com.netflix.astyanax.connectionpool.OperationResult;
import com.netflix.astyanax.connectionpool.TokenRange;
import com.netflix.astyanax.connectionpool.exceptions.ConnectionException;
import com.netflix.astyanax.connectionpool.exceptions.IsTimeoutException;
import com.netflix.astyanax.connectionpool.impl.TokenRangeImpl;
import com.netflix.astyanax.model.ByteBufferRange;
import com.netflix.astyanax.model.CfSplit;
import com.netflix.astyanax.model.Column;
import com.netflix.astyanax.model.ColumnFamily;
import com.netflix.astyanax.model.ColumnList;
import com.netflix.astyanax.model.Row;
import com.netflix.astyanax.model.Rows;
import com.netflix.astyanax.shallows.EmptyKeyspaceTracerFactory;
import com.netflix.astyanax.thrift.AbstractKeyspaceOperationImpl;
import com.netflix.astyanax.util.ByteBufferRangeImpl;
import com.netflix.astyanax.util.RangeBuilder;
import org.apache.cassandra.dht.ByteOrderedPartitioner;
import org.apache.cassandra.dht.Token;
import org.apache.cassandra.thrift.Cassandra;
import org.apache.cassandra.thrift.EndpointDetails;
import org.apache.cassandra.utils.ByteBufferUtil;
import org.apache.thrift.transport.TTransportException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import javax.annotation.Nullable;
import java.nio.ByteBuffer;
import java.time.Instant;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Deque;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.UUID;
import java.util.concurrent.TimeoutException;

import static com.google.common.base.Preconditions.checkArgument;
import static java.util.Objects.requireNonNull;
import static java.util.Optional.ofNullable;

/**
 * Cassandra implementation of {@link DataReaderDAO} that uses the Netflix Astyanax client library.
 */
public class AstyanaxBlockedDataReaderDAO implements DataReaderDAO, DataCopyReaderDAO, AstyanaxKeyScanner {

    private final Logger _log = LoggerFactory.getLogger(AstyanaxBlockedDataReaderDAO.class);

    private static final int MAX_RANDOM_ROWS_BATCH = 50;
    private static final int MAX_SCAN_ROWS_BATCH = 250;
    private static final int SCAN_ROW_BATCH_INCREMENT = 50;
    private static final int MAX_COLUMNS_BATCH = 50;
    private static final int MAX_COLUMN_SCAN_BATCH = 250;

    private static final Token.TokenFactory _tokenFactory = new ByteOrderedPartitioner().getTokenFactory();
    private static final ByteBufferRange _maxColumnsRange = new RangeBuilder().setLimit(MAX_COLUMNS_BATCH).build();

    private final ChangeEncoder _changeEncoder;
    private final PlacementCache _placementCache;
    private final Timer _readBatchTimer;
    private final Timer _scanBatchTimer;
    private final Meter _randomReadMeter;
    private final Meter _scanReadMeter;
    private final Meter _largeRowReadMeter;
    private final Meter _copyMeter;
    private final DAOUtils _daoUtils;
    private final int _deltaPrefixLength;

    @Inject
    public AstyanaxBlockedDataReaderDAO(PlacementCache placementCache, ChangeEncoder changeEncoder, MetricRegistry metricRegistry,
                                        DAOUtils daoUtils, @PrefixLength int deltaPrefixLength) {
        checkArgument(deltaPrefixLength > 0, "delta prefix length must be > 0");

        _placementCache = placementCache;
        _changeEncoder = changeEncoder;
        _readBatchTimer = metricRegistry.timer(getMetricName("readBatch"));
        _scanBatchTimer = metricRegistry.timer(getMetricName("scanBatch"));
        _randomReadMeter = metricRegistry.meter(getMetricName("random-reads"));
        _scanReadMeter = metricRegistry.meter(getMetricName("scan-reads"));
        _largeRowReadMeter = metricRegistry.meter(getMetricName("large-row-reads"));
        _copyMeter = metricRegistry.meter(getMetricName("copy"));
        _daoUtils = daoUtils;
        _deltaPrefixLength = deltaPrefixLength;
    }
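
    // Instances are normally created via dependency injection (note the @Inject constructor above).  A hand-wired
    // construction sketch, with hypothetical collaborator instances and an arbitrary prefix length of 4, would be:
    //
    //   AstyanaxBlockedDataReaderDAO dao = new AstyanaxBlockedDataReaderDAO(
    //           placementCache, changeEncoder, new MetricRegistry(), daoUtils, 4);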

    private String getMetricName(String name) {
        return MetricRegistry.name("bv.emodb.sor", "AstyanaxDataReaderDAO", name);
    }

    @Timed(name = "bv.emodb.sor.AstyanaxDataReaderDAO.count", absolute = true)
    @Override
    public long count(Table table, ReadConsistency consistency) {
        return count(table, null, consistency);
    }

    @Timed(name = "bv.emodb.sor.AstyanaxDataReaderDAO.count", absolute = true)
    @Override
    public long count(Table tbl, @Nullable Integer limit, ReadConsistency consistency) {
        requireNonNull(tbl, "table");
        requireNonNull(consistency, "consistency");

        // The current implementation scans through every row in the table.  It's very expensive for large tables.
        // Given a limit, count up to the limit, and then estimate for the remaining range splits.

        AstyanaxTable table = (AstyanaxTable) tbl;
        AstyanaxStorage storage = table.getReadStorage();

        // Range query all the shards and count the number of rows in each.
        long count = 0;
        Iterator<String> it = scanKeys(storage, consistency);
        while (it.hasNext()) {
            String fromKey = it.next();
            count++;
            if (limit != null && count > limit) {
                // Clients may just want to distinguish "a few" vs. "lots".  Calculate an exact count up to 'limit'
                // then estimate anything larger by adding the estimated sizes for the remaining splits.
                count += approximateCount(table, consistency, fromKey);
                return count;
            }
        }

        return count;
    }
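
    // Illustrative usage sketch (caller names are hypothetical): a client that only needs to distinguish small
    // tables from large ones can pass a limit so that the key scan stops early and the remainder is estimated:
    //
    //   long approx = dao.count(table, 10_000, ReadConsistency.STRONG);
    //   boolean isLarge = approx > 10_000;   // values above the limit are estimates, not exact counts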

    private long approximateCount(Table tbl, ReadConsistency consistency, String fromKey) {
        requireNonNull(tbl, "table");
        requireNonNull(consistency, "consistency");

        long count = 0;
        List<CfSplit> cfSplits = getCfSplits(tbl, 10000, fromKey);
        for (CfSplit split : cfSplits) {
            count += split.getRowCount();
        }

        return count;
    }

    @Timed(name = "bv.emodb.sor.AstyanaxDataReaderDAO.read", absolute = true)
    @Override
    public Record read(Key key, ReadConsistency consistency) {
        requireNonNull(key, "key");
        requireNonNull(consistency, "consistency");

        AstyanaxTable table = (AstyanaxTable) key.getTable();
        AstyanaxStorage storage = table.getReadStorage();
        DeltaPlacement placement = (DeltaPlacement) storage.getPlacement();
        ByteBuffer rowKey = storage.getRowKey(key.getKey());

        // Query for Delta & Compaction info, just the first 50 columns for now.
        ColumnList<DeltaKey> columns = execute(placement.getKeyspace()
                        .prepareQuery(placement.getBlockedDeltaColumnFamily(), SorConsistencies.toAstyanax(consistency))
                        .getKey(rowKey)
                        .withColumnRange(_maxColumnsRange),
                "read record at placement %s, table %s, key %s",
                placement.getName(), table.getName(), key.getKey());

        // Track metrics
        _randomReadMeter.mark();

        // Convert the results into a Record object, lazily fetching the rest of the columns as necessary.
        return newRecord(key, rowKey, columns, _maxColumnsRange.getLimit(), consistency, null);
    }

    @Timed(name = "bv.emodb.sor.AstyanaxDataReaderDAO.readAll", absolute = true)
    @Override
    public Iterator<Record> readAll(Collection<Key> keys, final ReadConsistency consistency) {
        requireNonNull(keys, "keys");
        requireNonNull(consistency, "consistency");

        // Group the keys by placement.  Each placement will result in a separate set of queries.  Dedup keys.
        Multimap<DeltaPlacement, Key> placementMap = HashMultimap.create();
        for (Key key : keys) {
            AstyanaxTable table = (AstyanaxTable) key.getTable();
            AstyanaxStorage storage = table.getReadStorage();
            placementMap.put((DeltaPlacement) storage.getPlacement(), key);
        }

        // Return an iterator that will loop over the placements and perform a query for each placement and
        // return the resulting decoded rows.
        return touch(Iterators.concat(Iterators.transform(placementMap.asMap().entrySet().iterator(),
                new Function<Map.Entry<DeltaPlacement, Collection<Key>>, Iterator<Record>>() {
                    @Override
                    public Iterator<Record> apply(Map.Entry<DeltaPlacement, Collection<Key>> entry) {
                        return readBatch(entry.getKey(), entry.getValue(), consistency);
                    }
                })));
    }

    /**
     * Read a batch of keys that all belong to the same placement (ColumnFamily).
     */
    private Iterator<Record> readBatch(final DeltaPlacement placement, Collection<Key> keys, final ReadConsistency consistency) {
        requireNonNull(keys, "keys");

        // Convert the keys to ByteBuffer Cassandra row keys
        List<Map.Entry<ByteBuffer, Key>> rowKeys = Lists.newArrayListWithCapacity(keys.size());
        for (Key key : keys) {
            AstyanaxTable table = (AstyanaxTable) key.getTable();
            AstyanaxStorage storage = table.getReadStorage();
            rowKeys.add(Maps.immutableEntry(storage.getRowKey(key.getKey()), key));
        }

        // Sort the keys by their byte array encoding to get some locality w/queries.
        Collections.sort(rowKeys, Ordering.natural().onResultOf(entryKeyFunction()));

        // Group them into batches.  Cassandra may have to seek to each row so prefer smaller batches.
        List<List<Map.Entry<ByteBuffer, Key>>> batches = Lists.partition(rowKeys, MAX_RANDOM_ROWS_BATCH);

        // This algorithm is arranged such that only one row of raw decoded changes is pinned in memory at a time.
        // If there are lots of rows with large #s of deltas our memory use should be bounded by the size of the
        // single row with the most/largest deltas + the largest raw thrift byte buffers for a single query.
        return Iterators.concat(Iterators.transform(batches.iterator(),
                new Function<List<Map.Entry<ByteBuffer, Key>>, Iterator<Record>>() {
                    @Override
                    public Iterator<Record> apply(List<Map.Entry<ByteBuffer, Key>> rowKeys) {
                        Timer.Context timerCtx = _readBatchTimer.time();
                        try {
                            return rowQuery(placement, rowKeys, consistency);
                        } finally {
                            timerCtx.stop();
                        }
                    }
                }));
    }
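
    // Worked example of the batching above: 120 keys in a single placement are sorted by row key and then
    // partitioned into ceil(120 / MAX_RANDOM_ROWS_BATCH) = 3 batches of 50, 50 and 20 keys, each of which is
    // issued as one getKeySlice() multi-get query against Cassandra by rowQuery().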

    @Override
    public Iterator<Change> readTimeline(Key key, boolean includeContentData, UUID start, UUID end, boolean reversed,
                                         long limit, ReadConsistency consistency) {
        requireNonNull(key, "key");
        checkArgument(limit > 0, "Limit must be >0");
        requireNonNull(consistency, "consistency");

        AstyanaxTable table = (AstyanaxTable) key.getTable();
        AstyanaxStorage storage = table.getReadStorage();
        DeltaPlacement placement = (DeltaPlacement) storage.getPlacement();
        ByteBuffer rowKey = storage.getRowKey(key.getKey());

        // Read Delta and Compaction objects
        Iterator<Change> deltas = Collections.emptyIterator();
        if (includeContentData) {
            ColumnFamily<ByteBuffer, DeltaKey> cf = placement.getBlockedDeltaColumnFamily();
            DeltaKey deltaStart = start != null ? new DeltaKey(start, 0) : null;
            DeltaKey deltaEnd = end != null ? new DeltaKey(end, Integer.MAX_VALUE) : null;
            deltas = decodeDeltaColumns(new LimitCounter(limit).limit(
                    new AstyanaxDeltaIterator(
                            columnScan(rowKey, placement, cf, deltaStart, deltaEnd, reversed, _deltaKeyInc, Long.MAX_VALUE, 0, consistency),
                            reversed, _deltaPrefixLength, ByteBufferUtil.bytesToHex(rowKey))));
        }

        // Read History objects
        Iterator<Change> deltaHistory = Collections.emptyIterator();
        ColumnFamily<ByteBuffer, UUID> deltaHistoryCf = placement.getDeltaHistoryColumnFamily();
        deltaHistory = decodeColumns(columnScan(rowKey, placement, deltaHistoryCf, start, end, reversed, _uuidInc, limit, 0, consistency));

        return touch(MergeIterator.merge(deltas, deltaHistory, reversed));
    }

    @Override
    public Iterator<Change> getExistingHistories(Key key, UUID start, UUID end, ReadConsistency consistency) {
        AstyanaxTable table = (AstyanaxTable) key.getTable();
        AstyanaxStorage storage = table.getReadStorage();
        ByteBuffer rowKey = storage.getRowKey(key.getKey());
        DeltaPlacement placement = (DeltaPlacement) storage.getPlacement();
        ColumnFamily<ByteBuffer, UUID> cf = placement.getDeltaHistoryColumnFamily();
        return decodeColumns(columnScan(rowKey, placement, cf, start, end, true, _uuidInc, MAX_COLUMN_SCAN_BATCH, 0, consistency));
    }

    @Timed(name = "bv.emodb.sor.AstyanaxDataReaderDAO.scan", absolute = true)
    @Override
    public Iterator<Record> scan(Table tbl, @Nullable String fromKeyExclusive, final LimitCounter limit, final ReadConsistency consistency) {
        requireNonNull(tbl, "table");
        requireNonNull(limit, "limit");
        requireNonNull(consistency, "consistency");

        final AstyanaxTable table = (AstyanaxTable) tbl;
        AstyanaxStorage storage = table.getReadStorage();
        final DeltaPlacement placement = (DeltaPlacement) storage.getPlacement();

        // Loop over all the range prefixes (2^shardsLog2 of them) and, for each, execute Cassandra queries to
        // page through the records with that prefix.
        final Iterator<ByteBufferRange> scanIter = storage.scanIterator(fromKeyExclusive);
        return touch(Iterators.concat(new AbstractIterator<Iterator<Record>>() {
            @Override
            protected Iterator<Record> computeNext() {
                if (scanIter.hasNext()) {
                    ByteBufferRange keyRange = scanIter.next();
                    return decodeRows(
                            rowScan(placement, keyRange, _maxColumnsRange, limit, consistency),
                            table, _maxColumnsRange.getLimit(), consistency);
                }
                return endOfData();
            }
        }));
    }

    @Override
    public Iterator<String> scanKeys(AstyanaxStorage storage, final ReadConsistency consistency) {
        requireNonNull(storage, "storage");
        requireNonNull(consistency, "consistency");

        final DeltaPlacement placement = (DeltaPlacement) storage.getPlacement();

        // We just want row keys, but get at least one column so we can ignore range ghosts.
        final ByteBufferRange columnRange = new RangeBuilder().setLimit(1).build();
        final LimitCounter unlimited = LimitCounter.max();

        // Loop over all the range prefixes (2^shardsLog2 of them) and, for each, execute Cassandra queries to
        // page through the records with that prefix.
        final Iterator<ByteBufferRange> scanIter = storage.scanIterator(null);
        return touch(Iterators.concat(new AbstractIterator<Iterator<String>>() {
            @Override
            protected Iterator<String> computeNext() {
                if (scanIter.hasNext()) {
                    ByteBufferRange keyRange = scanIter.next();
                    return decodeKeys(rowScan(placement, keyRange, columnRange, unlimited, consistency));
                }
                return endOfData();
            }
        }));
    }

    // Manually split the token ranges using ByteOrderedPartitioner's midpoint method
    @VisibleForTesting
    public List<Token> resplitLocally(String startToken, String endToken, int numResplits) {
        List<Token> splitTokens = ImmutableList.of(_tokenFactory.fromString(startToken), _tokenFactory.fromString(endToken));
        for (int i = 0; i < numResplits; i++) {
            List<Token> newTokens = new ArrayList<>(splitTokens.size() * 2 - 1);
            for (int j = 0; j < splitTokens.size() - 1; j++) {
                newTokens.add(splitTokens.get(j));
                newTokens.add(ByteOrderedPartitioner.instance.midpoint(splitTokens.get(j), splitTokens.get(j + 1)));
            }
            newTokens.add(splitTokens.get(splitTokens.size() - 1));
            splitTokens = newTokens;
        }
        return splitTokens;
    }
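
    // Worked example of the resplitting above: each pass replaces every adjacent pair of tokens with that pair
    // plus its midpoint, so a single start/end token pair grows to 2^numResplits + 1 tokens, i.e. 2^numResplits
    // sub-ranges:
    //
    //   numResplits = 0  ->  2 tokens, 1 sub-range
    //   numResplits = 1  ->  3 tokens, 2 sub-ranges
    //   numResplits = 3  ->  9 tokens, 8 sub-ranges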

    @Timed(name = "bv.emodb.sor.AstyanaxDataReaderDAO.getSplits", absolute = true)
    @Override
    public List<String> getSplits(Table tbl, int recordsPerSplit, int localResplits) throws TimeoutException {
        requireNonNull(tbl, "table");
        checkArgument(recordsPerSplit > 0);
        checkArgument(localResplits >= 0);

        try {
            List<String> splits = new ArrayList<>();
            List<CfSplit> cfSplits = getCfSplits(tbl, recordsPerSplit);
            for (CfSplit split : cfSplits) {

                List<Token> splitTokens = resplitLocally(split.getStartToken(), split.getEndToken(), localResplits);

                for (int i = 0; i < splitTokens.size() - 1; i++) {
                    splits.add(SplitFormat.encode(new ByteBufferRangeImpl(_tokenFactory.toByteArray(splitTokens.get(i)),
                            _tokenFactory.toByteArray(splitTokens.get(i + 1)), -1, false)));
                }

            }
            // Randomize the splits so, if processed somewhat in parallel, requests distribute around the ring.
            Collections.shuffle(splits);
            return splits;
        } catch (Exception e) {
            if (isTimeoutException(e)) {
                throw new TimeoutException();
            } else {
                throw Throwables.propagate(e);
            }
        }
    }

    private List<CfSplit> getCfSplits(Table tbl, int desiredRecordsPerSplit) {
        return getCfSplits(tbl, desiredRecordsPerSplit, null);
    }

    private List<CfSplit> getCfSplits(Table tbl, int desiredRecordsPerSplit, @Nullable String fromKey) {
        requireNonNull(tbl, "table");

        AstyanaxTable table = (AstyanaxTable) tbl;
        AstyanaxStorage storage = table.getReadStorage();
        DeltaPlacement placement = (DeltaPlacement) storage.getPlacement();
        Keyspace keyspace = placement.getKeyspace().getAstyanaxKeyspace();
        ColumnFamily<ByteBuffer, DeltaKey> cf = placement.getBlockedDeltaColumnFamily();

        // Create at least one split per shard, perhaps more if a shard is large.
        List<CfSplit> splits = Lists.newArrayList();
        Iterator<ByteBufferRange> it = storage.scanIterator(fromKey);
        Collection<TokenRange> allTokenRanges = describeCassandraTopology(keyspace).values();
        while (it.hasNext()) {
            ByteBufferRange keyRange = it.next();

            String start = toTokenString(keyRange.getStart());
            String end = toTokenString(keyRange.getEnd());

            splits.addAll(getCfSplits(keyspace, cf, start, end, desiredRecordsPerSplit, allTokenRanges));
        }
        return splits;
    }

    private List<CfSplit> getCfSplits(Keyspace keyspace, ColumnFamily<ByteBuffer, DeltaKey> cf, String start,
                                      String end, int desiredRecordsPerSplit, Iterable<TokenRange> allTokenRanges) {
        // There is a hole in the describeSplitsEx() call where if the call is routed to a Cassandra node which does
        // not have a replica of the requested token range then it will return a single split equivalent to the requested
        // range.  To accommodate this each query is routed to a host that is verified to have a replica of the range.

        ScanRange splitRange = ScanRange.create(parseTokenString(start), parseTokenString(end));
        List<CfSplit> cfSplits = Lists.newArrayList();

        // Iterate over the entire ring to find the token ranges which overlap with the provided range
        for (TokenRange hostTokenRange : allTokenRanges) {
            ScanRange hostSplitRange = ScanRange.create(
                    parseTokenString(hostTokenRange.getStartToken()),
                    parseTokenString(hostTokenRange.getEndToken()));

            // Use the intersection to determine if there is overlap
            for (ScanRange intersection : splitRange.intersection(hostSplitRange)) {
                // Try once on each host until splits are returned

                List<CfSplit> intersectionSplits = null;
                for (Iterator<String> hosts = hostTokenRange.getEndpoints().iterator(); hosts.hasNext() && intersectionSplits == null; ) {
                    String host = hosts.next();
                    try {
                        intersectionSplits = KeyspaceUtil.pin(keyspace).toHost(host)
                                .describeSplitsEx(cf.getName(), toTokenString(intersection.getFrom()),
                                        toTokenString(intersection.getTo()),
                                        desiredRecordsPerSplit, intersection.getFrom());
                    } catch (ConnectionException e) {
                        // If there is another host to try then do so, otherwise raise the exception
                        if (!hosts.hasNext()) {
                            throw Throwables.propagate(e);
                        }
                    }
                }

                assert intersectionSplits != null : "Exception would have been thrown if no host had responded successfully";
                cfSplits.addAll(intersectionSplits);
            }
        }

        return cfSplits;
    }

    @Timed(name = "bv.emodb.sor.AstyanaxDataReaderDAO.getSplit", absolute = true)
    @Override
    public Iterator<Record> getSplit(Table tbl, String split, @Nullable String fromKeyExclusive, LimitCounter limit, ReadConsistency consistency) {
        requireNonNull(tbl, "table");
        requireNonNull(split, "split");
        requireNonNull(limit, "limit");
        requireNonNull(consistency, "consistency");

        ByteBufferRange splitRange = SplitFormat.decode(split);

        AstyanaxTable table = (AstyanaxTable) tbl;
        AstyanaxStorage storage = getStorageForSplit(table, splitRange);
        DeltaPlacement placement = (DeltaPlacement) storage.getPlacement();

        ByteBufferRange keyRange = storage.getSplitRange(splitRange, fromKeyExclusive, split);
        // The fromKeyExclusive might be equal to the end token of the split.  If so, there's nothing to return.
        if (keyRange.getStart().equals(keyRange.getEnd())) {
            return Collections.emptyIterator();
        }

        // In contrast to the scan() method, scan a single range prefix (the one associated with this split).
        return touch(decodeRows(
                rowScan(placement, keyRange, _maxColumnsRange, limit, consistency),
                table, _maxColumnsRange.getLimit(), consistency));
    }

    private AstyanaxStorage getStorageForSplit(AstyanaxTable table, ByteBufferRange splitRange) {
        // During a table move, after the internal copy is complete getSplits() will return split IDs that point to
        // the new storage location (table.getReadStorage()) but must still support old split IDs from the old
        // storage location for a while.
        if (!table.getReadStorage().contains(splitRange.getStart())) {
            for (AstyanaxStorage storage : table.getWriteStorage()) {
                if (storage.contains(splitRange.getStart()) && storage.getReadsAllowed()) {
                    return storage;
                }
            }
        }
        return table.getReadStorage();
    }

    /**
     * Gets the topology for a Cassandra keyspace as a Multimap, where the keys identify a rack (or availability zone
     * in Amazon) and the values are the token ranges for each host in that rack.  For example, for a well distributed
     * ring of 12 hosts and a replication factor of 3 this method would return a Multimap with 3 keys and each key would
     * contain 4 token ranges.
     */
    private Multimap<String, TokenRange> describeCassandraTopology(final Keyspace keyspace) {
        try {
            @SuppressWarnings("unchecked")
            ConnectionPool<Cassandra.Client> connectionPool = (ConnectionPool<Cassandra.Client>) keyspace.getConnectionPool();

            return connectionPool.executeWithFailover(
                    new AbstractKeyspaceOperationImpl<Multimap<String, TokenRange>>(EmptyKeyspaceTracerFactory.getInstance().newTracer(CassandraOperationType.DESCRIBE_RING), keyspace.getKeyspaceName()) {
                        @Override
                        protected Multimap<String, TokenRange> internalExecute(Cassandra.Client client, ConnectionContext state)
                                throws Exception {
                            Multimap<String, TokenRange> racks = ArrayListMultimap.create();
                            for (org.apache.cassandra.thrift.TokenRange tokenRange : client.describe_local_ring(getKeyspace())) {
                                // The final local endpoint "owns" the token range, the rest are for replication
                                EndpointDetails endpointDetails = Iterables.getLast(tokenRange.getEndpoint_details());
                                racks.put(endpointDetails.getRack(),
                                        new TokenRangeImpl(tokenRange.getStart_token(), tokenRange.getEnd_token(), tokenRange.getEndpoints()));
                            }
                            return Multimaps.unmodifiableMultimap(racks);
                        }
                    },
                    keyspace.getConfig().getRetryPolicy().duplicate()).getResult();
        } catch (ConnectionException e) {
            throw Throwables.propagate(e);
        }
    }
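
    // For the 12-host, replication-factor-3 example in the javadoc above, the returned multimap is shaped
    // roughly as follows (rack names and token boundaries are hypothetical):
    //
    //   "rack-1" -> [ (t0, t1], (t3, t4], (t6, t7], (t9, t10] ]
    //   "rack-2" -> [ (t1, t2], (t4, t5], (t7, t8], (t10, t11] ]
    //   "rack-3" -> [ (t2, t3], (t5, t6], (t8, t9], (t11, t0] ]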

    @Override
    public ScanRangeSplits getScanRangeSplits(String placementName, int desiredRecordsPerSplit, Optional<ScanRange> subrange) {
        requireNonNull(placementName, "placement");
        checkArgument(desiredRecordsPerSplit >= 0, "Min records per split too low");

        DeltaPlacement placement = (DeltaPlacement) _placementCache.get(placementName);
        CassandraKeyspace keyspace = placement.getKeyspace();
        ColumnFamily<ByteBuffer, DeltaKey> cf = placement.getBlockedDeltaColumnFamily();

        // Get the topology so the splits can be grouped by rack
        Multimap<String, TokenRange> racks = describeCassandraTopology(keyspace.getAstyanaxKeyspace());
        Collection<TokenRange> allTokenRanges = racks.values();
        ScanRangeSplits.Builder builder = ScanRangeSplits.builder();

        for (Map.Entry<String, Collection<TokenRange>> entry : racks.asMap().entrySet()) {
            String rack = entry.getKey();

            Collection<TokenRange> tokenRanges = entry.getValue();
            for (TokenRange tokenRange : tokenRanges) {
                if (subrange.isPresent()) {
                    // Find the intersecting token ranges (if any) and add the splits for the intersection
                    ByteBuffer rangeStart = parseTokenString(tokenRange.getStartToken());
                    ByteBuffer rangeEnd = parseTokenString(tokenRange.getEndToken());

                    List<ScanRange> intersections = ScanRange.create(rangeStart, rangeEnd).intersection(subrange.get());
                    for (ScanRange scanRange : intersections) {
                        TokenRange intersectingTokenRange = new TokenRangeImpl(
                                toTokenString(scanRange.getFrom()), toTokenString(scanRange.getTo()), tokenRange.getEndpoints());

                        addScanRangeSplitsForTokenRange(keyspace, cf, rack, intersectingTokenRange,
                                desiredRecordsPerSplit, allTokenRanges, builder);
                    }
                } else {
                    // Add splits for the entire token range
                    addScanRangeSplitsForTokenRange(keyspace, cf, rack, tokenRange, desiredRecordsPerSplit,
                            allTokenRanges, builder);
                }
            }
        }

        return builder.build();
    }

    private void addScanRangeSplitsForTokenRange(CassandraKeyspace keyspace, ColumnFamily<ByteBuffer, DeltaKey> cf, String rack,
                                                 TokenRange tokenRange, int desiredRecordsPerSplit, Iterable<TokenRange> allTokenRanges,
                                                 ScanRangeSplits.Builder builder) {
        // Split the token range into sub-ranges with approximately the desired number of records per split
        String rangeStart = tokenRange.getStartToken();

        Deque<ScanRangeSplitWorkItem> splitWorkQueue = new LinkedList<>();
        splitWorkQueue.push(new AstyanaxBlockedDataReaderDAO.ScanRangeSplitWorkItem(tokenRange, desiredRecordsPerSplit, true));

        AstyanaxBlockedDataReaderDAO.ScanRangeSplitWorkItem splitWork;
        while ((splitWork = splitWorkQueue.poll()) != null) {
            try {
                List<CfSplit> splits = getCfSplits(
                        keyspace.getAstyanaxKeyspace(), cf, splitWork.range.getStartToken(), splitWork.range.getEndToken(),
                        splitWork.desiredRecordsPerSplit, allTokenRanges);

                for (CfSplit split : splits) {
                    if (splitWork.desiredRecordsPerSplit <= desiredRecordsPerSplit) {
                        ByteBuffer begin = parseTokenString(split.getStartToken());
                        ByteBuffer finish = parseTokenString(split.getEndToken());
                        builder.addScanRange(rack, rangeStart, ScanRange.create(begin, finish));
                    } else {
                        // This work item was for a larger-than-desired split created due to a previous timeout.
                        // Add the split back to the work queue to be split again at a smaller size.  Note that
                        // retryOnTimeout is set to false since we've already established that growing and subdividing
                        // it won't help.
                        TokenRange newWorkTokenRange = new TokenRangeImpl(split.getStartToken(), split.getEndToken(), splitWork.range.getEndpoints());
                        int newWorkDesiredSize = splitWork.desiredRecordsPerSplit / 10;
                        splitWorkQueue.push(
                                new AstyanaxBlockedDataReaderDAO.ScanRangeSplitWorkItem(newWorkTokenRange, newWorkDesiredSize, false));
                        _log.debug("Decreasing scan range split to {} for keyspace {} and range {}", newWorkDesiredSize, keyspace.getName(), newWorkTokenRange);
                    }
                }
            } catch (Exception e) {
                if (isTimeoutException(e)) {
                    if (splitWork.retryOnTimeout) {
                        // Try again with 10 times the desired number of records per split, up to a reasonable maximum
                        int retryDesiredRecordsPerSplit = (int) Math.min(splitWork.desiredRecordsPerSplit * 10L, Integer.MAX_VALUE);
                        boolean retryOnTimeout = retryDesiredRecordsPerSplit < desiredRecordsPerSplit * 1000 && retryDesiredRecordsPerSplit != Integer.MAX_VALUE;
                        splitWorkQueue.push(new AstyanaxBlockedDataReaderDAO.ScanRangeSplitWorkItem(splitWork.range, retryDesiredRecordsPerSplit, retryOnTimeout));
                        _log.debug("Increasing scan range split to {} for keyspace {} and range {}", retryDesiredRecordsPerSplit, keyspace.getName(), splitWork.range);
                    } else {
                        // Either we've already grown the token range to the maximum size we're willing to try
                        // or we've already succeeded at the larger split size but are still timing out at the smaller one.
                        // Either way our best choice at this point is to return the over-sized range.  The caller will
                        // have to adjust around this later.
                        ByteBuffer begin = parseTokenString(splitWork.range.getStartToken());
                        ByteBuffer finish = parseTokenString(splitWork.range.getEndToken());
                        builder.addScanRange(rack, rangeStart, ScanRange.create(begin, finish));
                        _log.warn("Unable to generate scan range split below {} for keyspace {} and range {}",
                                splitWork.desiredRecordsPerSplit, keyspace.getName(), splitWork.range);
                    }
                } else {
                    throw Throwables.propagate(e);
                }
            }
        }
    }
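
    // Worked example of the timeout handling above (numbers are illustrative): with desiredRecordsPerSplit
    // = 10,000, a timeout retries the range at 100,000 records per split, then 1,000,000, then 10,000,000
    // (= desiredRecordsPerSplit * 1000, at which point retryOnTimeout becomes false and a further timeout
    // emits the over-sized range as-is).  Once a larger size succeeds, each over-sized split is pushed back
    // onto the work queue at one tenth of its size until it is back at or below the original target.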

    private class ScanRangeSplitWorkItem {
        final TokenRange range;
        final int desiredRecordsPerSplit;
        final boolean retryOnTimeout;

        public ScanRangeSplitWorkItem(TokenRange range, int desiredRecordsPerSplit, boolean retryOnTimeout) {
            this.range = range;
            this.desiredRecordsPerSplit = desiredRecordsPerSplit;
            this.retryOnTimeout = retryOnTimeout;
        }
    }

    @Override
    public String getPlacementCluster(String placementName) {
        requireNonNull(placementName, "placement");

        DeltaPlacement placement = (DeltaPlacement) _placementCache.get(placementName);
        return placement.getKeyspace().getClusterName();
    }

    /**
     * Queries for records across multiple tables.  Data is returned in order by shard then table.  If the caller
     * wants all of the rows for a table they will need to stitch the data together.
     */
    @Override
    public Iterator<MultiTableScanResult> multiTableScan(final MultiTableScanOptions query, final TableSet tables,
                                                         final LimitCounter limit, final ReadConsistency consistency, @Nullable Instant cutoffTime) {
        requireNonNull(query, "query");
        String placementName = requireNonNull(query.getPlacement(), "placement");
        final DeltaPlacement placement = (DeltaPlacement) _placementCache.get(placementName);

        ScanRange scanRange = ofNullable(query.getScanRange()).orElse(ScanRange.all());

        // Since the range may wrap from high to low end of the token range we need to unwrap it
        List<ScanRange> ranges = scanRange.unwrapped();

        return touch(FluentIterable.from(ranges)
                .transformAndConcat(new Function<ScanRange, Iterable<MultiTableScanResult>>() {
                    @Override
                    public Iterable<MultiTableScanResult> apply(final ScanRange rowRange) {
                        return new Iterable<MultiTableScanResult>() {
                            @Override
                            public Iterator<MultiTableScanResult> iterator() {
                                return scanMultiTableRows(
                                        tables, placement, rowRange.asByteBufferRange(), limit, query.isIncludeDeletedTables(),
                                        query.isIncludeMirrorTables(), _maxColumnsRange.getLimit(), consistency, cutoffTime);
                            }
                        };
                    }
                })
                .iterator());

    }
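
    // Illustrative sketch of the unwrapping above (assuming ScanRange.unwrapped() splits a wrapping range at
    // the token-ring boundary): a range that wraps from a high token back around to a low one, e.g.
    // (0xF000.., 0x1000..), is scanned as two non-wrapping ranges, (0xF000.., MAX] and [MIN, 0x1000..), so
    // that no individual Cassandra range query wraps around the ring (see the note in rowScan() below).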

    @Override
    public Iterator<MigrationScanResult> getDeltasForStorage(AstyanaxStorage source) {
        DeltaPlacement sourcePlacement = (DeltaPlacement) source.getPlacement();
        ColumnFamily<ByteBuffer, DeltaKey> sourceCf = sourcePlacement.getBlockedDeltaColumnFamily();

        Iterator<ByteBufferRange> scanIter = source.scanIterator(null);

        return Iterators.concat(Iterators.transform(scanIter, keyRange -> {
            Iterator<Row<ByteBuffer, DeltaKey>> rows =
                    rowScan(sourcePlacement, sourceCf, keyRange, _maxColumnsRange, LimitCounter.max(), ReadConsistency.STRONG);

            return Iterators.concat(Iterators.transform(rows, row -> {
                ColumnList<DeltaKey> columns = row.getColumns();
                Iterator<Column<DeltaKey>> concatColumns = columns.iterator();
                if (columns.size() >= _maxColumnsRange.getLimit()) {
                    DeltaKey lastColumn = row.getColumns().getColumnByIndex(columns.size() - 1).getName();
                    concatColumns = Iterators.concat(concatColumns, columnScan(row.getRawKey(), sourcePlacement, sourceCf, lastColumn, null,
                            false, _deltaKeyInc, Long.MAX_VALUE, 1, ReadConsistency.STRONG));
                }

                Iterator<StitchedColumn> uuidColumns = new AstyanaxDeltaIterator(concatColumns, false, _deltaPrefixLength, ByteBufferUtil.bytesToHex(row.getRawKey()));

                return Iterators.transform(uuidColumns, column -> new MigrationScanResult(row.getRawKey(), column.getName(), _daoUtils.skipPrefix(column.getByteBufferValue())));
            }));
        }));
    }

    @Override
    public Iterator<HistoryMigrationScanResult> getHistoriesForStorage(AstyanaxStorage source) {

        DeltaPlacement placement = (DeltaPlacement) source.getPlacement();
        ColumnFamily<ByteBuffer, UUID> cf = placement.getDeltaHistoryColumnFamily();

        return Iterators.concat(Iterators.transform(source.scanIterator(null), keyRange -> {
            Iterator<Row<ByteBuffer, UUID>> rows =
                    rowScan(placement, cf, keyRange, _maxColumnsRange, LimitCounter.max(), ReadConsistency.STRONG);

            return Iterators.concat(Iterators.transform(rows, row -> {
                ColumnList<UUID> columns = row.getColumns();
                Iterator<Column<UUID>> concatColumns = columns.iterator();
                if (columns.size() >= _maxColumnsRange.getLimit()) {
                    UUID lastColumn = row.getColumns().getColumnByIndex(columns.size() - 1).getName();
                    concatColumns = Iterators.concat(concatColumns, columnScan(row.getRawKey(), placement, cf, lastColumn, null,
                            false, _uuidInc, Long.MAX_VALUE, 1, ReadConsistency.STRONG));
                }
                return Iterators.transform(concatColumns, column -> new HistoryMigrationScanResult(row.getRawKey(), column.getName(), column.getByteBufferValue(), column.getTtl()));
            }));
        }));
    }

    /**
     * Queries for rows given an enumerated list of Cassandra row keys.
     */
    private Iterator<Record> rowQuery(DeltaPlacement placement,
                                      List<Map.Entry<ByteBuffer, Key>> keys,
                                      ReadConsistency consistency) {
        // Build the list of row IDs to query for.
        List<ByteBuffer> rowIds = Lists.transform(keys, entryKeyFunction());

        // Query for Delta & Compaction info, just the first 50 columns for now.
        final Rows<ByteBuffer, DeltaKey> rows = execute(placement.getKeyspace()
                        .prepareQuery(placement.getBlockedDeltaColumnFamily(), SorConsistencies.toAstyanax(consistency))
                        .getKeySlice(rowIds)
                        .withColumnRange(_maxColumnsRange),
                "query %d keys from placement %s", rowIds.size(), placement.getName());

        // Track metrics
        _randomReadMeter.mark(rowIds.size());

        // Return an iterator that decodes the row results, avoiding pinning multiple decoded rows into memory at once.
        return decodeRows(keys, rows, _maxColumnsRange.getLimit(), consistency);
    }

    /**
     * Scans for rows within the specified range, exclusive on start and inclusive on end.
     */
    private Iterator<Row<ByteBuffer, DeltaKey>> rowScan(final DeltaPlacement placement,
                                                        final ByteBufferRange rowRange,
                                                        final ByteBufferRange columnRange,
                                                        final LimitCounter limit,
                                                        final ReadConsistency consistency) {
        return rowScan(placement, placement.getBlockedDeltaColumnFamily(), rowRange, columnRange, limit, consistency);
    }

    /**
     * Scans for rows within the specified range, exclusive on start and inclusive on end.
     */
    private <P> Iterator<Row<ByteBuffer, P>> rowScan(final DeltaPlacement placement,
                                                     final ColumnFamily<ByteBuffer, P> columnFamily,
                                                     final ByteBufferRange rowRange,
                                                     final ByteBufferRange columnRange,
                                                     final LimitCounter limit,
                                                     final ReadConsistency consistency) {
        // In the first batch request no more than 50 rows.
        int initialBatchSize = (int) Math.min(limit.remaining(), 50);

        return new AbstractBatchReader<Row<ByteBuffer, P>>(1, initialBatchSize, MAX_SCAN_ROWS_BATCH, SCAN_ROW_BATCH_INCREMENT) {

            private ByteBuffer _rangeStart = rowRange.getStart();
            private final ByteBuffer _rangeEnd = rowRange.getEnd();
            private int _minimumLimit = 1;
            private boolean _done;

            @Override
            protected boolean hasNextBatch() {
                return !_done;
            }

            @Override
            protected Iterator<Row<ByteBuffer, P>> nextBatch(int batchSize) throws Exception {
                // Note: if Cassandra is asked to perform a token range query where start >= end it will wrap
                // around which is absolutely *not* what we want since it could return data for another table.
                if (_done || BufferUtils.compareUnsigned(_rangeStart, _rangeEnd) >= 0) {
                    _done = true;
                    return Collections.emptyIterator();
                }

                Timer.Context timer = _scanBatchTimer.time();
                try {
                    int adjustedBatchSize = (int) Math.min(Math.max(limit.remaining(), _minimumLimit), batchSize);
                    // Increase the minimum limit a bit each time around so if we start encountering lots of range
                    // ghosts we eventually scan through them at a reasonable rate.
                    _minimumLimit = Math.min(_minimumLimit + 3, MAX_SCAN_ROWS_BATCH);

                    // Pass token strings to get exclusive start behavior, to support 'fromBlobIdExclusive'.
                    String startToken = toTokenString(_rangeStart);
                    String endToken = toTokenString(_rangeEnd);
                    Rows<ByteBuffer, P> rows = execute(placement.getKeyspace()
                                    .prepareQuery(columnFamily, SorConsistencies.toAstyanax(consistency))
                                    .getKeyRange(null, null, startToken, endToken, adjustedBatchSize)
                                    .withColumnRange(columnRange),
                            "scan rows in placement %s, column family %s from %s to %s",
                            placement.getName(), columnFamily.getName(), startToken, endToken);

                    if (rows.size() >= adjustedBatchSize) {
                        // Save the last row key so we can use it as the start (exclusive) if we must query to get more data.
                        _rangeStart = rows.getRowByIndex(rows.size() - 1).getKey();
                        // If that row key was the end of our range then we're done.
                        _done = _rangeStart.equals(_rangeEnd);
                    } else {
                        // If we got fewer rows than we asked for, another query won't find more rows.
                        _done = true;
                    }

                    // Track metrics
                    _scanReadMeter.mark(rows.size());

                    // Return the rows.  Filter out range ghosts (deleted rows with no columns)
                    final Iterator<Row<ByteBuffer, P>> rowIter = rows.iterator();
                    return new AbstractIterator<Row<ByteBuffer, P>>() {
                        @Override
                        protected Row<ByteBuffer, P> computeNext() {
                            while (rowIter.hasNext()) {
                                Row<ByteBuffer, P> row = rowIter.next();
                                if (!row.getColumns().isEmpty()) {
                                    return row;
                                }
                            }
                            return endOfData();
                        }
                    };
                } finally {
                    timer.stop();
                }
            }

            @Override
            protected boolean isTimeoutException(Exception e) {
                return AstyanaxBlockedDataReaderDAO.this.isTimeoutException(e);
            }

            @Override
            protected boolean isDataSizeException(Exception e) {
                for (Throwable t : Throwables.getCausalChain(e)) {
                    // If the root cause is a thrift frame size overflow then the current batch size returned too
                    // much data.  Unfortunately there is no specific exception thrown for this so we have to
                    // check for the generic exception type, TTransportException, and then narrow by the message.
                    //
                    // Sample message:
                    // Frame size (17339288) larger than max length (16384000)!
                    if (t instanceof TTransportException) {
                        String message = t.getMessage();
                        if (message != null && message.startsWith("Frame size") && message.contains("larger than max length")) {
                            return true;
                        }
                    }
                }
                return false;
            }
        };
    }

    private boolean isTimeoutException(Exception e) {
        return Iterables.tryFind(Throwables.getCausalChain(e), Predicates.instanceOf(IsTimeoutException.class)).isPresent();
    }

    /**
     * Scans a single row for columns within the specified range, inclusive or exclusive on start based on whether
     * page is non-zero, and inclusive on end.
     */
    private <C> Iterator<Column<C>> columnScan(final ByteBuffer rowKey,
                                               final DeltaPlacement placement,
                                               final ColumnFamily<ByteBuffer, C> columnFamily,
                                               final C start,
                                               final C end,
                                               final boolean reversed,
                                               final ColumnInc<C> columnInc,
                                               final long limit,
                                               final long page,
                                               final ReadConsistency consistency) {
        return Iterators.concat(new AbstractIterator<Iterator<Column<C>>>() {
            private C _from = start;
            private long _remaining = limit;
            private long _page = page;

            @Override
            protected Iterator<Column<C>> computeNext() {
                if (_remaining <= 0) {
                    return endOfData();
                }

                // For page N+1, treat "_from" as exclusive.  Since Cassandra doesn't support exclusive column ranges
                // bump the from value up to the next possible time UUID (assumes from != null when page != 0).
                if (_page > 0) {
                    if (_from.equals(end)) {
                        return endOfData();
                    }
                    _from = reversed ? columnInc.previous(_from) : columnInc.next(_from);
                    if (_from == null) {
                        return endOfData();
                    }
                }

                // Execute the query
                int batchSize = (int) Math.min(_remaining, MAX_COLUMN_SCAN_BATCH);
                ColumnList<C> columns = execute(placement.getKeyspace()
                                .prepareQuery(columnFamily, SorConsistencies.toAstyanax(consistency))
                                .getKey(rowKey)
                                .withColumnRange(_from, end, reversed, batchSize),
                        "scan columns in placement %s, column family %s, row %s, from %s to %s",
                        placement.getName(), columnFamily.getName(), rowKey, start, end);

                // Update state for the next iteration.
                if (columns.size() >= batchSize) {
                    // Save the last column key so we can use it as the start (exclusive) if we must query to get more data.
                    _from = columns.getColumnByIndex(columns.size() - 1).getName();
                    _remaining = _remaining - columns.size();
                    _page++;
                } else {
                    // If we got fewer columns than we asked for, another query won't find more columns.
                    _remaining = 0;
                }

                // Track metrics.  For rows w/more than 50 columns, count subsequent reads w/_largeRowReadMeter.
                (_page == 0 ? _randomReadMeter : _largeRowReadMeter).mark();

                return columns.iterator();
            }
        });
    }

    private interface ColumnInc<C> {
        C previous(C col);

        C next(C col);
    }

    private static final ColumnInc<UUID> _uuidInc = new ColumnInc<UUID>() {
        @Override
        public UUID previous(UUID col) {
            return TimeUUIDs.getPrevious(col);
        }

        @Override
        public UUID next(UUID col) {
            return TimeUUIDs.getNext(col);
        }
    };

    private static final ColumnInc<DeltaKey> _deltaKeyInc = new ColumnInc<DeltaKey>() {
        @Override
        public DeltaKey previous(DeltaKey col) {
            if (col.getBlock() == 0) {
                return new DeltaKey(_uuidInc.previous(col.getChangeId()), Integer.MAX_VALUE);
            }
            return new DeltaKey(col.getChangeId(), col.getBlock() - 1);
        }

        @Override
        public DeltaKey next(DeltaKey col) {
            if (col.getBlock() == Integer.MAX_VALUE) {
                return new DeltaKey(_uuidInc.next(col.getChangeId()), 0);
            }
            return new DeltaKey(col.getChangeId(), col.getBlock() + 1);
        }
    };

    /**
     * Decodes row keys returned by scanning a table.
     */
    private Iterator<String> decodeKeys(final Iterator<Row<ByteBuffer, DeltaKey>> iter) {
        return new AbstractIterator<String>() {
            @Override
            protected String computeNext() {
                while (iter.hasNext()) {
                    Row<ByteBuffer, DeltaKey> row = iter.next();
                    if (!row.getColumns().isEmpty()) {
                        // Ignore range ghosts
                        return AstyanaxStorage.getContentKey(row.getRawKey());
                    }
                }
                return endOfData();
            }
        };
    }

    /**
     * Decodes rows returned by querying for a specific set of rows.
     */
    private Iterator<Record> decodeRows(List<Map.Entry<ByteBuffer, Key>> keys, final Rows<ByteBuffer, DeltaKey> rows,
                                        final int largeRowThreshold, final ReadConsistency consistency) {
        // Avoiding pinning multiple decoded rows into memory at once.
        return Iterators.transform(keys.iterator(), new Function<Map.Entry<ByteBuffer, Key>, Record>() {
            @Override
            public Record apply(Map.Entry<ByteBuffer, Key> entry) {
                Row<ByteBuffer, DeltaKey> row = rows.getRow(entry.getKey());
                if (row == null) {
                    return emptyRecord(entry.getValue());
                }
                // Convert the results into a Record object, lazily fetching the rest of the columns as necessary.
                return newRecord(entry.getValue(), row.getRawKey(), row.getColumns(), largeRowThreshold, consistency, null);
            }
        });
    }

    /**
     * Decodes rows returned by scanning a table.
     */
    private Iterator<Record> decodeRows(Iterator<Row<ByteBuffer, DeltaKey>> iter, final AstyanaxTable table,
                                        final int largeRowThreshold, final ReadConsistency consistency) {
        // Avoiding pinning multiple decoded rows into memory at once.
        return Iterators.transform(iter, new Function<Row<ByteBuffer, DeltaKey>, Record>() {
            @Override
            public Record apply(Row<ByteBuffer, DeltaKey> row) {
                // Convert the results into a Record object, lazily fetching the rest of the columns as necessary.
                String key = AstyanaxStorage.getContentKey(row.getRawKey());
                return newRecord(new Key(table, key), row.getRawKey(), row.getColumns(), largeRowThreshold, consistency, null);
            }
        });
    }

    /**
     * Decodes rows returned by scanning across tables.
     */
    private Iterator<MultiTableScanResult> scanMultiTableRows(
            final TableSet tables, final DeltaPlacement placement, final ByteBufferRange rowRange,
            final LimitCounter limit, final boolean includeDroppedTables, final boolean includeMirrorTables,
            final int largeRowThreshold, final ReadConsistency consistency, @Nullable final Instant cutoffTime) {

        // Avoiding pinning multiple decoded rows into memory at once.
        return limit.limit(new AbstractIterator<MultiTableScanResult>() {
            private PeekingIterator<Row<ByteBuffer, DeltaKey>> _iter = Iterators.peekingIterator(
                    rowScan(placement, rowRange, _maxColumnsRange, LimitCounter.max(), consistency));

            private long _lastTableUuid = -1;
            private AstyanaxTable _table = null;
            private boolean _droppedTable;
            private boolean _primaryTable;

            @Override
            protected MultiTableScanResult computeNext() {
                while (_iter.hasNext()) {
                    Row<ByteBuffer, DeltaKey> row = _iter.next();
                    ColumnList<DeltaKey> rowColumns = row.getColumns();

                    // Convert the results into a Record object, lazily fetching the rest of the columns as necessary.
                    ByteBuffer rowKey = row.getRawKey();
                    long tableUuid = AstyanaxStorage.getTableUuid(rowKey);
                    if (_lastTableUuid != tableUuid) {
                        _lastTableUuid = tableUuid;
                        try {
                            _table = (AstyanaxTable) tables.getByUuid(tableUuid);
                        } catch (UnknownTableException e) {
                            _table = AstyanaxTable.createUnknown(tableUuid, placement, e.getTable());
                        } catch (DroppedTableException e) {
                            _table = AstyanaxTable.createUnknown(tableUuid, placement, e.getPriorTable());
                        }
                        _droppedTable = _table.isUnknownTable();
                        _primaryTable = _table.getReadStorage().hasUUID(tableUuid);
                    }

                    // Skip dropped and mirror tables if configured
                    if ((!includeDroppedTables && _droppedTable) || (!includeMirrorTables && !_primaryTable)) {
                        _iter = skipToNextTable(tableUuid);
                        continue;
                    }

                    int shardId = AstyanaxStorage.getShardId(rowKey);
                    String key = AstyanaxStorage.getContentKey(rowKey);
                    Record record = newRecord(new Key(_table, key), rowKey, rowColumns, largeRowThreshold, consistency, cutoffTime);

                    return new MultiTableScanResult(rowKey, shardId, tableUuid, _droppedTable, record);
                }

                return endOfData();
            }

            private PeekingIterator<Row<ByteBuffer, DeltaKey>> skipToNextTable(long tableUuid) {
                // Iterate over the next 50 rows first to check for a table switch.  This avoids starting a new range
                // query if the number of rows in the undesired table is small.
                int skipLimit = 50;
                Row<ByteBuffer, DeltaKey> row = null;

                while (skipLimit != 0 && _iter.hasNext()) {
                    row = _iter.peek();
                    long nextTableUuid = AstyanaxStorage.getTableUuid(row.getRawKey());
                    if (nextTableUuid != tableUuid) {
                        // This is the first row of a new table
                        return _iter;
                    } else {
                        _iter.next();
                        skipLimit -= 1;
                    }
                }

                if (_iter.hasNext()) {
                    // Skip the table entirely by starting a new query on the next possible table
                    assert row != null;
                    int shardId = AstyanaxStorage.getShardId(row.getRawKey());
                    ByteBuffer nextPossibleTableStart = AstyanaxStorage.getRowKeyRaw(shardId, tableUuid + 1, "");
                    ByteBuffer end = rowRange.getEnd();

                    if (AstyanaxStorage.compareKeys(nextPossibleTableStart, end) < 0) {
                        // We haven't reached the last end boundary of the original range scan
                        ByteBufferRange updatedRange = new ByteBufferRangeImpl(nextPossibleTableStart, end, -1, false);
                        return Iterators.peekingIterator(
                                rowScan(placement, updatedRange, _maxColumnsRange, LimitCounter.max(), consistency));
                    }
                }

                return Iterators.peekingIterator(Collections.emptyIterator());
            }
        });
    }

    private Record newRecord(Key key, ByteBuffer rowKey, ColumnList<DeltaKey> columns, int largeRowThreshold,
                             ReadConsistency consistency, @Nullable final Instant cutoffTime) {

        Iterator<Column<DeltaKey>> changeIter = getFilteredColumnIter(columns.iterator(), cutoffTime);
        Iterator<Column<DeltaKey>> compactionIter = getFilteredColumnIter(columns.iterator(), cutoffTime);
        Iterator<Column<DeltaKey>> rawMetadataIter = getFilteredColumnIter(columns.iterator(), cutoffTime);

        if (columns.size() >= largeRowThreshold) {
            // A large row such that the first query likely returned only a subset of all the columns.  Lazily fetch
            // the rest while ensuring we never load all columns into memory at the same time.  The current
            // Compactor+Resolver implementation must scan the row twice: once to find compaction records and once to
            // find deltas.  So we must call columnScan() twice, once for each.
            DeltaKey lastColumn = columns.getColumnByIndex(columns.size() - 1).getName();

            AstyanaxTable table = (AstyanaxTable) key.getTable();
            AstyanaxStorage storage = table.getReadStorage();
            DeltaPlacement placement = (DeltaPlacement) storage.getPlacement();
            ColumnFamily<ByteBuffer, DeltaKey> columnFamily = placement.getBlockedDeltaColumnFamily();

            // Execute the same scan 3 times, returning 3 iterators that process the results in different ways.  In
            // practice at most two of the iterators are actually consumed (one or more is ignored) so the columnScan
            // should avoid actually doing any work until the first item is fetched from the iterator.
            changeIter = Iterators.concat(changeIter, getFilteredColumnIter(columnScan(rowKey, placement, columnFamily, lastColumn, null,
                    false, _deltaKeyInc, Long.MAX_VALUE, 1, consistency), cutoffTime));
            compactionIter = Iterators.concat(compactionIter, getFilteredColumnIter(columnScan(rowKey, placement, columnFamily, lastColumn, null,
                    false, _deltaKeyInc, Long.MAX_VALUE, 1, consistency), cutoffTime));
            rawMetadataIter = Iterators.concat(rawMetadataIter, getFilteredColumnIter(columnScan(rowKey, placement, columnFamily, lastColumn, null,
                    false, _deltaKeyInc, Long.MAX_VALUE, 1, consistency), cutoffTime));
        }

        Iterator<Map.Entry<DeltaClusteringKey, Change>> deltaChangeIter = decodeChanges(
                new AstyanaxDeltaIterator(changeIter, false, _deltaPrefixLength, ByteBufferUtil.bytesToHex(rowKey)));
        Iterator<Map.Entry<DeltaClusteringKey, Compaction>> deltaCompactionIter = decodeCompactions(
                new AstyanaxDeltaIterator(compactionIter, false, _deltaPrefixLength, ByteBufferUtil.bytesToHex(rowKey)));
        Iterator<RecordEntryRawMetadata> deltaRawMetadataIter = rawMetadata(
                new AstyanaxDeltaIterator(rawMetadataIter, false, _deltaPrefixLength, ByteBufferUtil.bytesToHex(rowKey)));

        return new RecordImpl(key, deltaCompactionIter, deltaChangeIter, deltaRawMetadataIter);
    }

    private Record emptyRecord(Key key) {
        return new RecordImpl(key,
                Collections.emptyIterator(),
                Collections.emptyIterator(),
                Collections.emptyIterator());
    }

    private Iterator<Change> decodeColumns(Iterator<Column<UUID>> iter) {
        return Iterators.transform(iter, column -> _changeEncoder.decodeChange(column.getName(), column.getByteBufferValue()));
    }

    private Iterator<Change> decodeDeltaColumns(Iterator<StitchedColumn> iter) {
        return Iterators.transform(iter, column -> _changeEncoder.decodeChange(column.getName(), _daoUtils.skipPrefix(column.getByteBufferValue())));
    }

    private Iterator<Map.Entry<DeltaClusteringKey, Change>> decodeChanges(final Iterator<StitchedColumn> iter) {
        return Iterators.transform(iter, new Function<StitchedColumn, Map.Entry<DeltaClusteringKey, Change>>() {
            @Override
            public Map.Entry<DeltaClusteringKey, Change> apply(StitchedColumn column) {
                Change change = _changeEncoder.decodeChange(column.getName(), _daoUtils.skipPrefix(column.getByteBufferValue()));
                return Maps.immutableEntry(new DeltaClusteringKey(column.getName(), column.getNumBlocks()), change);
            }
        });
    }

    private Iterator<Map.Entry<DeltaClusteringKey, Compaction>> decodeCompactions(final Iterator<StitchedColumn> iter) {
        return new AbstractIterator<Map.Entry<DeltaClusteringKey, Compaction>>() {
            @Override
            protected Map.Entry<DeltaClusteringKey, Compaction> computeNext() {
                while (iter.hasNext()) {
                    StitchedColumn column = iter.next();
                    Compaction compaction = _changeEncoder.decodeCompaction(_daoUtils.skipPrefix(column.getByteBufferValue()));
                    if (compaction != null) {
                        return Maps.immutableEntry(new DeltaClusteringKey(column.getName(), column.getNumBlocks()), compaction);
                    }
                }
                return endOfData();
            }
        };
    }

    private Iterator<RecordEntryRawMetadata> rawMetadata(final Iterator<? extends Column<UUID>> iter) {
        return Iterators.transform(iter, new Function<Column<UUID>, RecordEntryRawMetadata>() {
            @Override
            public RecordEntryRawMetadata apply(Column<UUID> column) {
                return new RecordEntryRawMetadata()
                        .withTimestamp(TimeUUIDs.getTimeMillis(column.getName()))
                        .withSize(_daoUtils.skipPrefix(column.getByteBufferValue()).remaining());
            }
        });
    }

    private <R> R execute(Execution<R> execution, String operation, Object... operationArguments) {
        OperationResult<R> operationResult;
        try {
            operationResult = execution.execute();
        } catch (ConnectionException e) {
            for (int i = 0; i < operationArguments.length; i++) {
                if (operationArguments[i] instanceof ByteBuffer) {
                    operationArguments[i] = ByteBufferUtil.bytesToHex((ByteBuffer) operationArguments[i]);
                }
            }
            String message = "Failed to " + String.format(operation, operationArguments);
            throw new RuntimeException(message, e);
        }
        return operationResult.getResult();
    }

    private String toTokenString(ByteBuffer bytes) {
        return _tokenFactory.toString(_tokenFactory.fromByteArray(bytes));
    }

    private ByteBuffer parseTokenString(String string) {
        return _tokenFactory.toByteArray(_tokenFactory.fromString(string));
    }

    /**
     * Force computation of the first item in an iterator so metrics calculations for a method reflect the cost of
     * the first batch of results.
     */
    private <T> Iterator<T> touch(Iterator<T> iter) {
        // Could return a Guava PeekingIterator after "if (iter.hasNext()) iter.peek()", but simply calling hasNext()
        // is sufficient for the iterator implementations used by this DAO class...
        iter.hasNext();
        return iter;
    }

    private <T> Function<Map.Entry<ByteBuffer, T>, ByteBuffer> entryKeyFunction() {
        return new Function<Map.Entry<ByteBuffer, T>, ByteBuffer>() {
            @Override
            public ByteBuffer apply(Map.Entry<ByteBuffer, T> entry) {
                return entry.getKey();
            }
        };
    }

    @VisibleForTesting
    public static Iterator<Column<DeltaKey>> getFilteredColumnIter(Iterator<Column<DeltaKey>> columnIter, @Nullable Instant cutoffTime) {
        if (cutoffTime == null) {
            return columnIter;
        }
        return Iterators.filter(columnIter, column -> (TimeUUIDs.getTimeMillis(column.getName().getChangeId()) < cutoffTime.toEpochMilli()));
    }
}