package com.bazaarvoice.emodb.sor.db.astyanax;
import com.bazaarvoice.emodb.common.api.impl.LimitCounter;
import com.bazaarvoice.emodb.common.cassandra.CassandraKeyspace;
import com.bazaarvoice.emodb.common.cassandra.astyanax.KeyspaceUtil;
import com.bazaarvoice.emodb.common.cassandra.nio.BufferUtils;
import com.bazaarvoice.emodb.common.uuid.TimeUUIDs;
import com.bazaarvoice.emodb.sor.api.Change;
import com.bazaarvoice.emodb.sor.api.Compaction;
import com.bazaarvoice.emodb.sor.api.ReadConsistency;
import com.bazaarvoice.emodb.sor.api.UnknownTableException;
import com.bazaarvoice.emodb.sor.core.AbstractBatchReader;
import com.bazaarvoice.emodb.sor.db.DAOUtils;
import com.bazaarvoice.emodb.sor.db.DataReaderDAO;
import com.bazaarvoice.emodb.sor.db.HistoryMigrationScanResult;
import com.bazaarvoice.emodb.sor.db.Key;
import com.bazaarvoice.emodb.sor.db.MigrationScanResult;
import com.bazaarvoice.emodb.sor.db.MultiTableScanOptions;
import com.bazaarvoice.emodb.sor.db.MultiTableScanResult;
import com.bazaarvoice.emodb.sor.db.Record;
import com.bazaarvoice.emodb.sor.db.RecordEntryRawMetadata;
import com.bazaarvoice.emodb.sor.db.ScanRange;
import com.bazaarvoice.emodb.sor.db.ScanRangeSplits;
import com.bazaarvoice.emodb.sor.db.test.DeltaClusteringKey;
import com.bazaarvoice.emodb.table.db.DroppedTableException;
import com.bazaarvoice.emodb.table.db.Table;
import com.bazaarvoice.emodb.table.db.TableSet;
import com.bazaarvoice.emodb.table.db.astyanax.AstyanaxStorage;
import com.bazaarvoice.emodb.table.db.astyanax.AstyanaxTable;
import com.bazaarvoice.emodb.table.db.astyanax.PlacementCache;
import com.codahale.metrics.Meter;
import com.codahale.metrics.MetricRegistry;
import com.codahale.metrics.Timer;
import com.codahale.metrics.annotation.Timed;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Function;
import com.google.common.base.Predicates;
import com.google.common.base.Throwables;
import com.google.common.collect.AbstractIterator;
import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.FluentIterable;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;
import com.google.common.collect.Iterators;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Multimap;
import com.google.common.collect.Multimaps;
import com.google.common.collect.Ordering;
import com.google.common.collect.PeekingIterator;
import com.google.inject.Inject;
import com.netflix.astyanax.CassandraOperationType;
import com.netflix.astyanax.Execution;
import com.netflix.astyanax.Keyspace;
import com.netflix.astyanax.connectionpool.ConnectionContext;
import com.netflix.astyanax.connectionpool.ConnectionPool;
import com.netflix.astyanax.connectionpool.OperationResult;
import com.netflix.astyanax.connectionpool.TokenRange;
import com.netflix.astyanax.connectionpool.exceptions.ConnectionException;
import com.netflix.astyanax.connectionpool.exceptions.IsTimeoutException;
import com.netflix.astyanax.connectionpool.impl.TokenRangeImpl;
import com.netflix.astyanax.model.ByteBufferRange;
import com.netflix.astyanax.model.CfSplit;
import com.netflix.astyanax.model.Column;
import com.netflix.astyanax.model.ColumnFamily;
import com.netflix.astyanax.model.ColumnList;
import com.netflix.astyanax.model.Row;
import com.netflix.astyanax.model.Rows;
import com.netflix.astyanax.shallows.EmptyKeyspaceTracerFactory;
import com.netflix.astyanax.thrift.AbstractKeyspaceOperationImpl;
import com.netflix.astyanax.util.ByteBufferRangeImpl;
import com.netflix.astyanax.util.RangeBuilder;
import org.apache.cassandra.dht.ByteOrderedPartitioner;
import org.apache.cassandra.dht.Token;
import org.apache.cassandra.thrift.Cassandra;
import org.apache.cassandra.thrift.EndpointDetails;
import org.apache.cassandra.utils.ByteBufferUtil;
import org.apache.thrift.transport.TTransportException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.annotation.Nullable;
import java.nio.ByteBuffer;
import java.time.Instant;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Deque;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.UUID;
import java.util.concurrent.TimeoutException;
import static com.google.common.base.Preconditions.checkArgument;
import static java.util.Objects.requireNonNull;
import static java.util.Optional.ofNullable;
/**
* Cassandra implementation of {@link DataReaderDAO} that uses the Netflix Astyanax client library.
*/
public class AstyanaxBlockedDataReaderDAO implements DataReaderDAO, DataCopyReaderDAO, AstyanaxKeyScanner {
private final Logger _log = LoggerFactory.getLogger(AstyanaxBlockedDataReaderDAO.class);
private static final int MAX_RANDOM_ROWS_BATCH = 50;
private static final int MAX_SCAN_ROWS_BATCH = 250;
private static final int SCAN_ROW_BATCH_INCREMENT = 50;
private static final int MAX_COLUMNS_BATCH = 50;
private static final int MAX_COLUMN_SCAN_BATCH = 250;
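// Token factory for converting row key ByteBuffers to/from ByteOrderedPartitioner token strings, and the
// default column slice applied to the first read of any row; rows with more columns than this limit are
// paged in lazily via columnScan().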
private static final Token.TokenFactory _tokenFactory = new ByteOrderedPartitioner().getTokenFactory();
private static final ByteBufferRange _maxColumnsRange = new RangeBuilder().setLimit(MAX_COLUMNS_BATCH).build();
private final ChangeEncoder _changeEncoder;
private final PlacementCache _placementCache;
private final Timer _readBatchTimer;
private final Timer _scanBatchTimer;
private final Meter _randomReadMeter;
private final Meter _scanReadMeter;
private final Meter _largeRowReadMeter;
private final Meter _copyMeter;
private final DAOUtils _daoUtils;
private final int _deltaPrefixLength;
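/*
 * Construction sketch (not part of the original source): in EmoDB these dependencies are normally wired
 * by Guice through the @Inject constructor below; the placementCache, changeEncoder, metricRegistry,
 * daoUtils, deltaPrefixLength and someTable variables here are assumed to be supplied by the caller.
 *
 *   DataReaderDAO dao = new AstyanaxBlockedDataReaderDAO(
 *           placementCache, changeEncoder, metricRegistry, daoUtils, deltaPrefixLength);
 *   Record record = dao.read(new Key(someTable, "some-document-id"), ReadConsistency.STRONG);
 */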
@Inject
public AstyanaxBlockedDataReaderDAO(PlacementCache placementCache, ChangeEncoder changeEncoder, MetricRegistry metricRegistry,
DAOUtils daoUtils, @PrefixLength int deltaPrefixLength) {
checkArgument(deltaPrefixLength > 0, "delta prefix length must be > 0");
_placementCache = placementCache;
_changeEncoder = changeEncoder;
_readBatchTimer = metricRegistry.timer(getMetricName("readBatch"));
_scanBatchTimer = metricRegistry.timer(getMetricName("scanBatch"));
_randomReadMeter = metricRegistry.meter(getMetricName("random-reads"));
_scanReadMeter = metricRegistry.meter(getMetricName("scan-reads"));
_largeRowReadMeter = metricRegistry.meter(getMetricName("large-row-reads"));
_copyMeter = metricRegistry.meter(getMetricName("copy"));
_daoUtils = daoUtils;
_deltaPrefixLength = deltaPrefixLength;
}
private String getMetricName(String name) {
return MetricRegistry.name("bv.emodb.sor", "AstyanaxDataReaderDAO", name);
}
@Timed(name = "bv.emodb.sor.AstyanaxDataReaderDAO.count", absolute = true)
@Override
public long count(Table table, ReadConsistency consistency) {
return count(table, null, consistency);
}
@Timed(name = "bv.emodb.sor.AstyanaxDataReaderDAO.count", absolute = true)
@Override
public long count(Table tbl, @Nullable Integer limit, ReadConsistency consistency) {
requireNonNull(tbl, "table");
requireNonNull(consistency, "consistency");
// The current implementation scans through every row in the table. It's very expensive for large tables.
// Given a limit, count up to the limit, and then estimate for the remaining range splits.
AstyanaxTable table = (AstyanaxTable) tbl;
AstyanaxStorage storage = table.getReadStorage();
// Range query all the shards and count the number of rows in each.
long count = 0;
Iterator<String> it = scanKeys(storage, consistency);
while (it.hasNext()) {
String fromKey = it.next();
count++;
if (limit != null && count > limit) {
// Clients may just want to distinguish "a few" vs. "lots". Calculate an exact count up to 'limit'
// then estimate anything larger by adding the estimated sizes for the remaining splits.
count += approximateCount(table, consistency, fromKey);
return count;
}
}
return count;
}
private long approximateCount(Table tbl, ReadConsistency consistency, String fromKey) {
requireNonNull(tbl, "table");
requireNonNull(consistency, "consistency");
long count = 0;
List<CfSplit> cfSplits = getCfSplits(tbl, 10000, fromKey);
for (CfSplit split : cfSplits) {
count += split.getRowCount();
}
return count;
}
@Timed(name = "bv.emodb.sor.AstyanaxDataReaderDAO.read", absolute = true)
@Override
public Record read(Key key, ReadConsistency consistency) {
requireNonNull(key, "key");
requireNonNull(consistency, "consistency");
AstyanaxTable table = (AstyanaxTable) key.getTable();
AstyanaxStorage storage = table.getReadStorage();
DeltaPlacement placement = (DeltaPlacement) storage.getPlacement();
ByteBuffer rowKey = storage.getRowKey(key.getKey());
// Query for Delta & Compaction info, just the first 50 columns for now.
ColumnList<DeltaKey> columns = execute(placement.getKeyspace()
.prepareQuery(placement.getBlockedDeltaColumnFamily(), SorConsistencies.toAstyanax(consistency))
.getKey(rowKey)
.withColumnRange(_maxColumnsRange),
"read record at placement %s, table %s, key %s",
placement.getName(), table.getName(), key.getKey());
// Track metrics
_randomReadMeter.mark();
// Convert the results into a Record object, lazily fetching the rest of the columns as necessary.
return newRecord(key, rowKey, columns, _maxColumnsRange.getLimit(), consistency, null);
}
@Timed(name = "bv.emodb.sor.AstyanaxDataReaderDAO.readAll", absolute = true)
@Override
public Iterator<Record> readAll(Collection<Key> keys, final ReadConsistency consistency) {
requireNonNull(keys, "keys");
requireNonNull(consistency, "consistency");
// Group the keys by placement. Each placement will result in a separate set of queries. Dedup keys.
Multimap<DeltaPlacement, Key> placementMap = HashMultimap.create();
for (Key key : keys) {
AstyanaxTable table = (AstyanaxTable) key.getTable();
AstyanaxStorage storage = table.getReadStorage();
placementMap.put((DeltaPlacement) storage.getPlacement(), key);
}
// Return an iterator that will loop over the placements and perform a query for each placement and
// return the resulting decoded rows.
return touch(Iterators.concat(Iterators.transform(placementMap.asMap().entrySet().iterator(),
new Function<Map.Entry<DeltaPlacement, Collection<Key>>, Iterator<Record>>() {
@Override
public Iterator<Record> apply(Map.Entry<DeltaPlacement, Collection<Key>> entry) {
return readBatch(entry.getKey(), entry.getValue(), consistency);
}
})));
}
/**
* Read a batch of keys that all belong to the same placement (ColumnFamily).
*/
private Iterator<Record> readBatch(final DeltaPlacement placement, Collection<Key> keys, final ReadConsistency consistency) {
requireNonNull(keys, "keys");
// Convert the keys to ByteBuffer Cassandra row keys
List<Map.Entry<ByteBuffer, Key>> rowKeys = Lists.newArrayListWithCapacity(keys.size());
for (Key key : keys) {
AstyanaxTable table = (AstyanaxTable) key.getTable();
AstyanaxStorage storage = table.getReadStorage();
rowKeys.add(Maps.immutableEntry(storage.getRowKey(key.getKey()), key));
}
// Sort the keys by their byte array encoding to get some locality w/queries.
Collections.sort(rowKeys, Ordering.natural().onResultOf(entryKeyFunction()));
// Group them into batches. Cassandra may have to seek to each row so prefer smaller batches.
List<List<Map.Entry<ByteBuffer, Key>>> batches = Lists.partition(rowKeys, MAX_RANDOM_ROWS_BATCH);
// This algorithm is arranged such that only one row of raw decoded changes is pinned in memory at a time.
// If there are lots of rows with large #s of deltas our memory use should be bounded by the size of the
// single row with the most/largest deltas + the largest raw thrift byte buffers for a single query.
return Iterators.concat(Iterators.transform(batches.iterator(),
new Function<List<Map.Entry<ByteBuffer, Key>>, Iterator<Record>>() {
@Override
public Iterator<Record> apply(List<Map.Entry<ByteBuffer, Key>> rowKeys) {
Timer.Context timerCtx = _readBatchTimer.time();
try {
return rowQuery(placement, rowKeys, consistency);
} finally {
timerCtx.stop();
}
}
}));
}
@Override
public Iterator<Change> readTimeline(Key key, boolean includeContentData, UUID start, UUID end, boolean reversed,
long limit, ReadConsistency consistency) {
requireNonNull(key, "key");
checkArgument(limit > 0, "Limit must be >0");
requireNonNull(consistency, "consistency");
AstyanaxTable table = (AstyanaxTable) key.getTable();
AstyanaxStorage storage = table.getReadStorage();
DeltaPlacement placement = (DeltaPlacement) storage.getPlacement();
ByteBuffer rowKey = storage.getRowKey(key.getKey());
// Read Delta and Compaction objects
Iterator<Change> deltas = Collections.emptyIterator();
if (includeContentData) {
ColumnFamily<ByteBuffer, DeltaKey> cf = placement.getBlockedDeltaColumnFamily();
DeltaKey deltaStart = start != null ? new DeltaKey(start, 0) : null;
DeltaKey deltaEnd = end != null ? new DeltaKey(end, Integer.MAX_VALUE) : null;
deltas = decodeDeltaColumns(new LimitCounter(limit).limit(new AstyanaxDeltaIterator(columnScan(rowKey, placement, cf, deltaStart, deltaEnd, reversed, _deltaKeyInc, Long.MAX_VALUE, 0, consistency), reversed, _deltaPrefixLength, ByteBufferUtil.bytesToHex((rowKey)))));
}
// Read History objects
Iterator<Change> deltaHistory = Collections.emptyIterator();
ColumnFamily<ByteBuffer, UUID> deltaHistoryCf = placement.getDeltaHistoryColumnFamily();
deltaHistory = decodeColumns(columnScan(rowKey, placement, deltaHistoryCf, start, end, reversed, _uuidInc, limit, 0, consistency));
return touch(MergeIterator.merge(deltas, deltaHistory, reversed));
}
@Override
public Iterator<Change> getExistingHistories(Key key, UUID start, UUID end, ReadConsistency consistency) {
AstyanaxTable table = (AstyanaxTable) key.getTable();
AstyanaxStorage storage = table.getReadStorage();
ByteBuffer rowKey = storage.getRowKey(key.getKey());
DeltaPlacement placement = (DeltaPlacement) storage.getPlacement();
ColumnFamily<ByteBuffer, UUID> cf = placement.getDeltaHistoryColumnFamily();
return decodeColumns(columnScan(rowKey, placement, cf, start, end, true, _uuidInc, MAX_COLUMN_SCAN_BATCH, 0, consistency));
}
@Timed(name = "bv.emodb.sor.AstyanaxDataReaderDAO.scan", absolute = true)
@Override
public Iterator<Record> scan(Table tbl, @Nullable String fromKeyExclusive, final LimitCounter limit, final ReadConsistency consistency) {
requireNonNull(tbl, "table");
requireNonNull(limit, "limit");
requireNonNull(consistency, "consistency");
final AstyanaxTable table = (AstyanaxTable) tbl;
AstyanaxStorage storage = table.getReadStorage();
final DeltaPlacement placement = (DeltaPlacement) storage.getPlacement();
// Loop over all the range prefixes (2^shardsLog2 of them) and, for each, execute Cassandra queries to
// page through the records with that prefix.
final Iterator<ByteBufferRange> scanIter = storage.scanIterator(fromKeyExclusive);
return touch(Iterators.concat(new AbstractIterator<Iterator<Record>>() {
@Override
protected Iterator<Record> computeNext() {
if (scanIter.hasNext()) {
ByteBufferRange keyRange = scanIter.next();
return decodeRows(
rowScan(placement, keyRange, _maxColumnsRange, limit, consistency),
table, _maxColumnsRange.getLimit(), consistency);
}
return endOfData();
}
}));
}
@Override
public Iterator<String> scanKeys(AstyanaxStorage storage, final ReadConsistency consistency) {
requireNonNull(storage, "storage");
requireNonNull(consistency, "consistency");
final DeltaPlacement placement = (DeltaPlacement) storage.getPlacement();
// We just want row keys, but get at least one column so we can ignore range ghosts.
final ByteBufferRange columnRange = new RangeBuilder().setLimit(1).build();
final LimitCounter unlimited = LimitCounter.max();
// Loop over all the range prefixes (2^shardsLog2 of them) and, for each, execute Cassandra queries to
// page through the records with that prefix.
final Iterator<ByteBufferRange> scanIter = storage.scanIterator(null);
return touch(Iterators.concat(new AbstractIterator<Iterator<String>>() {
@Override
protected Iterator<String> computeNext() {
if (scanIter.hasNext()) {
ByteBufferRange keyRange = scanIter.next();
return decodeKeys(rowScan(placement, keyRange, columnRange, unlimited, consistency));
}
return endOfData();
}
}));
}
// Manually split the token ranges using ByteOrderedPartitioner's midpoint method
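// Each pass inserts a midpoint between every adjacent pair of tokens, roughly doubling the number of sub-ranges.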
@VisibleForTesting
public List<Token> resplitLocally(String startToken, String endToken, int numResplits) {
List<Token> splitTokens = ImmutableList.of(_tokenFactory.fromString(startToken), _tokenFactory.fromString(endToken));
for (int i = 0; i < numResplits; i++) {
List<Token> newTokens = new ArrayList<>(splitTokens.size() * 2 - 1);
for (int j = 0; j < splitTokens.size() - 1; j++) {
newTokens.add(splitTokens.get(j));
newTokens.add(ByteOrderedPartitioner.instance.midpoint(splitTokens.get(j), splitTokens.get(j + 1)));
}
newTokens.add(splitTokens.get(splitTokens.size() - 1));
splitTokens = newTokens;
}
return splitTokens;
}
@Timed(name = "bv.emodb.sor.AstyanaxDataReaderDAO.getSplits", absolute = true)
@Override
public List<String> getSplits(Table tbl, int recordsPerSplit, int localResplits) throws TimeoutException {
requireNonNull(tbl, "table");
checkArgument(recordsPerSplit > 0);
checkArgument(localResplits >= 0);
try {
List<String> splits = new ArrayList<>();
List<CfSplit> cfSplits = getCfSplits(tbl, recordsPerSplit);
for (CfSplit split : cfSplits) {
List<Token> splitTokens = resplitLocally(split.getStartToken(), split.getEndToken(), localResplits);
for (int i = 0; i < splitTokens.size() - 1; i++) {
splits.add(SplitFormat.encode(new ByteBufferRangeImpl(_tokenFactory.toByteArray(splitTokens.get(i)),
_tokenFactory.toByteArray(splitTokens.get(i + 1)), -1, false)));
}
}
// Randomize the splits so, if processed somewhat in parallel, requests distribute around the ring.
Collections.shuffle(splits);
return splits;
} catch (Exception e) {
if (isTimeoutException(e)) {
throw new TimeoutException();
} else {
throw Throwables.propagate(e);
}
}
}
private List<CfSplit> getCfSplits(Table tbl, int desiredRecordsPerSplit) {
return getCfSplits(tbl, desiredRecordsPerSplit, null);
}
private List<CfSplit> getCfSplits(Table tbl, int desiredRecordsPerSplit, @Nullable String fromKey) {
requireNonNull(tbl, "table");
AstyanaxTable table = (AstyanaxTable) tbl;
AstyanaxStorage storage = table.getReadStorage();
DeltaPlacement placement = (DeltaPlacement) storage.getPlacement();
Keyspace keyspace = placement.getKeyspace().getAstyanaxKeyspace();
ColumnFamily<ByteBuffer, DeltaKey> cf = placement.getBlockedDeltaColumnFamily();
// Create at least one split per shard, perhaps more if a shard is large.
List<CfSplit> splits = Lists.newArrayList();
Iterator<ByteBufferRange> it = storage.scanIterator(fromKey);
Collection<TokenRange> allTokenRanges = describeCassandraTopology(keyspace).values();
while (it.hasNext()) {
ByteBufferRange keyRange = it.next();
String start = toTokenString(keyRange.getStart());
String end = toTokenString(keyRange.getEnd());
splits.addAll(getCfSplits(keyspace, cf, start, end, desiredRecordsPerSplit, allTokenRanges));
}
return splits;
}
private List<CfSplit> getCfSplits(Keyspace keyspace, ColumnFamily<ByteBuffer, DeltaKey> cf, String start,
String end, int desiredRecordsPerSplit, Iterable<TokenRange> allTokenRanges) {
// There is a hole in the describeSplitsEx() call: if the call is routed to a Cassandra node which does
// not have a replica of the requested token range, it will return a single split equivalent to the requested
// range. To accommodate this, each query is routed to a host that is verified to have a replica of the range.
ScanRange splitRange = ScanRange.create(parseTokenString(start), parseTokenString(end));
List<CfSplit> cfSplits = Lists.newArrayList();
// Iterate over the entire ring to find the token ranges which overlap with the provided range
for (TokenRange hostTokenRange : allTokenRanges) {
ScanRange hostSplitRange = ScanRange.create(
parseTokenString(hostTokenRange.getStartToken()),
parseTokenString(hostTokenRange.getEndToken()));
// Use the intersection to determine if there is overlap
for (ScanRange intersection : splitRange.intersection(hostSplitRange)) {
// Try once on each host until splits are returned
List<CfSplit> intersectionSplits = null;
for (Iterator<String> hosts = hostTokenRange.getEndpoints().iterator(); hosts.hasNext() && intersectionSplits == null; ) {
String host = hosts.next();
try {
intersectionSplits = KeyspaceUtil.pin(keyspace).toHost(host)
.describeSplitsEx(cf.getName(), toTokenString(intersection.getFrom()),
toTokenString(intersection.getTo()),
desiredRecordsPerSplit, intersection.getFrom());
} catch (ConnectionException e) {
// If there is another host to try then do so, otherwise raise the exception
if (!hosts.hasNext()) {
throw Throwables.propagate(e);
}
}
}
assert intersectionSplits != null : "Exception would have been thrown if no host had responded successfully";
cfSplits.addAll(intersectionSplits);
}
}
return cfSplits;
}
@Timed(name = "bv.emodb.sor.AstyanaxDataReaderDAO.getSplit", absolute = true)
@Override
public Iterator<Record> getSplit(Table tbl, String split, @Nullable String fromKeyExclusive, LimitCounter limit, ReadConsistency consistency) {
requireNonNull(tbl, "table");
requireNonNull(split, "split");
requireNonNull(limit, "limit");
requireNonNull(consistency, "consistency");
ByteBufferRange splitRange = SplitFormat.decode(split);
AstyanaxTable table = (AstyanaxTable) tbl;
AstyanaxStorage storage = getStorageForSplit(table, splitRange);
DeltaPlacement placement = (DeltaPlacement) storage.getPlacement();
ByteBufferRange keyRange = storage.getSplitRange(splitRange, fromKeyExclusive, split);
// The fromKeyExclusive might be equal to the end token of the split. If so, there's nothing to return.
if (keyRange.getStart().equals(keyRange.getEnd())) {
return Collections.emptyIterator();
}
// In contrast to the scan() method, scan a single range prefix (the one associated with this split).
return touch(decodeRows(
rowScan(placement, keyRange, _maxColumnsRange, limit, consistency),
table, _maxColumnsRange.getLimit(), consistency));
}
private AstyanaxStorage getStorageForSplit(AstyanaxTable table, ByteBufferRange splitRange) {
// During a table move, after the internal copy is complete getSplits() will return split IDs that point to
// the new storage location (table.getReadStorage()) but must still support old split IDs from the old
// storage location for a while.
if (!table.getReadStorage().contains(splitRange.getStart())) {
for (AstyanaxStorage storage : table.getWriteStorage()) {
if (storage.contains(splitRange.getStart()) && storage.getReadsAllowed()) {
return storage;
}
}
}
return table.getReadStorage();
}
/**
* Gets the topology for a Cassandra keyspace as a Multimap, where the keys identify a rack (or availability zone
* in Amazon) and the values are the token ranges for each host in that rack. For example, for a well distributed
* ring of 12 hosts and a replication factor of 3 this method would return a Multimap with 3 keys and each key would
* contain 4 token ranges.
*/
private Multimap<String, TokenRange> describeCassandraTopology(final Keyspace keyspace) {
try {
@SuppressWarnings("unchecked")
ConnectionPool<Cassandra.Client> connectionPool = (ConnectionPool<Cassandra.Client>) keyspace.getConnectionPool();
return connectionPool.executeWithFailover(
new AbstractKeyspaceOperationImpl<Multimap<String, TokenRange>>(EmptyKeyspaceTracerFactory.getInstance().newTracer(CassandraOperationType.DESCRIBE_RING), keyspace.getKeyspaceName()) {
@Override
protected Multimap<String, TokenRange> internalExecute(Cassandra.Client client, ConnectionContext state)
throws Exception {
Multimap<String, TokenRange> racks = ArrayListMultimap.create();
for (org.apache.cassandra.thrift.TokenRange tokenRange : client.describe_local_ring(getKeyspace())) {
// The final local endpoint "owns" the token range; the rest are replicas.
EndpointDetails endpointDetails = Iterables.getLast(tokenRange.getEndpoint_details());
racks.put(endpointDetails.getRack(),
new TokenRangeImpl(tokenRange.getStart_token(), tokenRange.getEnd_token(), tokenRange.getEndpoints()));
}
return Multimaps.unmodifiableMultimap(racks);
}
},
keyspace.getConfig().getRetryPolicy().duplicate()).getResult();
} catch (ConnectionException e) {
throw Throwables.propagate(e);
}
}
@Override
public ScanRangeSplits getScanRangeSplits(String placementName, int desiredRecordsPerSplit, Optional<ScanRange> subrange) {
requireNonNull(placementName, "placement");
checkArgument(desiredRecordsPerSplit >= 0, "Min records per split too low");
DeltaPlacement placement = (DeltaPlacement) _placementCache.get(placementName);
CassandraKeyspace keyspace = placement.getKeyspace();
ColumnFamily<ByteBuffer, DeltaKey> cf = placement.getBlockedDeltaColumnFamily();
// Get the topology so the splits can be grouped by rack
Multimap<String, TokenRange> racks = describeCassandraTopology(keyspace.getAstyanaxKeyspace());
Collection<TokenRange> allTokenRanges = racks.values();
ScanRangeSplits.Builder builder = ScanRangeSplits.builder();
for (Map.Entry<String, Collection<TokenRange>> entry : racks.asMap().entrySet()) {
String rack = entry.getKey();
Collection<TokenRange> tokenRanges = entry.getValue();
for (TokenRange tokenRange : tokenRanges) {
if (subrange.isPresent()) {
// Find the intersecting token ranges (if any) and add the splits for the intersection
ByteBuffer rangeStart = parseTokenString(tokenRange.getStartToken());
ByteBuffer rangeEnd = parseTokenString(tokenRange.getEndToken());
List<ScanRange> intersections = ScanRange.create(rangeStart, rangeEnd).intersection(subrange.get());
for (ScanRange scanRange : intersections) {
TokenRange intersectingTokenRange = new TokenRangeImpl(
toTokenString(scanRange.getFrom()), toTokenString(scanRange.getTo()), tokenRange.getEndpoints());
addScanRangeSplitsForTokenRange(keyspace, cf, rack, intersectingTokenRange,
desiredRecordsPerSplit, allTokenRanges, builder);
}
} else {
// Add splits for the entire token range
addScanRangeSplitsForTokenRange(keyspace, cf, rack, tokenRange, desiredRecordsPerSplit,
allTokenRanges, builder);
}
}
}
return builder.build();
}
private void addScanRangeSplitsForTokenRange(CassandraKeyspace keyspace, ColumnFamily<ByteBuffer, DeltaKey> cf, String rack,
TokenRange tokenRange, int desiredRecordsPerSplit, Iterable<TokenRange> allTokenRanges,
ScanRangeSplits.Builder builder) {
// Split the token range into sub-ranges with approximately the desired number of records per split
String rangeStart = tokenRange.getStartToken();
Deque<ScanRangeSplitWorkItem> splitWorkQueue = new LinkedList<>();
splitWorkQueue.push(new AstyanaxBlockedDataReaderDAO.ScanRangeSplitWorkItem(tokenRange, desiredRecordsPerSplit, true));
AstyanaxBlockedDataReaderDAO.ScanRangeSplitWorkItem splitWork;
while ((splitWork = splitWorkQueue.poll()) != null) {
try {
List<CfSplit> splits = getCfSplits(
keyspace.getAstyanaxKeyspace(), cf, splitWork.range.getStartToken(), splitWork.range.getEndToken(),
splitWork.desiredRecordsPerSplit, allTokenRanges);
for (CfSplit split : splits) {
if (splitWork.desiredRecordsPerSplit <= desiredRecordsPerSplit) {
ByteBuffer begin = parseTokenString(split.getStartToken());
ByteBuffer finish = parseTokenString(split.getEndToken());
builder.addScanRange(rack, rangeStart, ScanRange.create(begin, finish));
} else {
// This work item was for a larger-than-desired split created due to a previous timeout.
// Add the split back to the work queue to be split again at a smaller size. Note that
// retryOnTimeout is set to false since we've already established that growing and subdividing
// it won't help.
TokenRange newWorkTokenRange = new TokenRangeImpl(split.getStartToken(), split.getEndToken(), splitWork.range.getEndpoints());
int newWorkDesiredSize = splitWork.desiredRecordsPerSplit / 10;
splitWorkQueue.push(
new AstyanaxBlockedDataReaderDAO.ScanRangeSplitWorkItem(newWorkTokenRange, newWorkDesiredSize, false));
_log.debug("Decreasing scan range split to {} for keyspace {} and range {}", newWorkDesiredSize, keyspace.getName(), newWorkTokenRange);
}
}
} catch (Exception e) {
if (isTimeoutException(e)) {
if (splitWork.retryOnTimeout) {
// Try again with 10 times the desired number of records per split, up to a reasonable maximum
int retryDesiredRecordsPerSplit = (int) Math.min(splitWork.desiredRecordsPerSplit * 10L, Integer.MAX_VALUE);
boolean retryOnTimeout = retryDesiredRecordsPerSplit < desiredRecordsPerSplit * 1000 && retryDesiredRecordsPerSplit != Integer.MAX_VALUE;
splitWorkQueue.push(new AstyanaxBlockedDataReaderDAO.ScanRangeSplitWorkItem(splitWork.range, retryDesiredRecordsPerSplit, retryOnTimeout));
_log.debug("Increasing scan range split to {} for keyspace {} and range {}", retryDesiredRecordsPerSplit, keyspace.getName(), splitWork.range);
} else {
// Either we've already grown the token range to the maximum size we're willing to try
// or we've already succeeded at the larger split size but are still timing out at the smaller one.
// Either way our best choice at this point is to return the over-sized range. The caller will
// have to adjust around this later.
ByteBuffer begin = parseTokenString(splitWork.range.getStartToken());
ByteBuffer finish = parseTokenString(splitWork.range.getEndToken());
builder.addScanRange(rack, rangeStart, ScanRange.create(begin, finish));
_log.warn("Unable to generate scan range split below {} for keyspace {} and range {}",
splitWork.desiredRecordsPerSplit, keyspace.getName(), splitWork.range);
}
} else {
throw Throwables.propagate(e);
}
}
}
}
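/**
 * Work-queue entry for the adaptive split sizing above: the token range still to be split, the record
 * count to request per split, and whether a timeout should be retried with a larger split size before
 * giving up.
 */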
private class ScanRangeSplitWorkItem {
final TokenRange range;
final int desiredRecordsPerSplit;
final boolean retryOnTimeout;
public ScanRangeSplitWorkItem(TokenRange range, int desiredRecordsPerSplit, boolean retryOnTimeout) {
this.range = range;
this.desiredRecordsPerSplit = desiredRecordsPerSplit;
this.retryOnTimeout = retryOnTimeout;
}
}
@Override
public String getPlacementCluster(String placementName) {
requireNonNull(placementName, "placement");
DeltaPlacement placement = (DeltaPlacement) _placementCache.get(placementName);
return placement.getKeyspace().getClusterName();
}
/**
* Queries for records across multiple tables. Data is returned in order by shard then table. If the caller
* wants all rows for a table, it must stitch the data together.
*/
@Override
public Iterator<MultiTableScanResult> multiTableScan(final MultiTableScanOptions query, final TableSet tables,
final LimitCounter limit, final ReadConsistency consistency, @Nullable Instant cutoffTime) {
requireNonNull(query, "query");
String placementName = requireNonNull(query.getPlacement(), "placement");
final DeltaPlacement placement = (DeltaPlacement) _placementCache.get(placementName);
ScanRange scanRange = ofNullable(query.getScanRange()).orElse(ScanRange.all());
// Since the range may wrap from high to low end of the token range we need to unwrap it
List<ScanRange> ranges = scanRange.unwrapped();
return touch(FluentIterable.from(ranges)
.transformAndConcat(new Function<ScanRange, Iterable<MultiTableScanResult>>() {
@Override
public Iterable<MultiTableScanResult> apply(final ScanRange rowRange) {
return new Iterable<MultiTableScanResult>() {
@Override
public Iterator<MultiTableScanResult> iterator() {
return scanMultiTableRows(
tables, placement, rowRange.asByteBufferRange(), limit, query.isIncludeDeletedTables(),
query.isIncludeMirrorTables(), _maxColumnsRange.getLimit(), consistency, cutoffTime);
}
};
}
})
.iterator());
}
@Override
public Iterator<? extends MigrationScanResult> getDeltasForStorage(AstyanaxStorage source) {
DeltaPlacement sourcePlacement = (DeltaPlacement) source.getPlacement();
ColumnFamily<ByteBuffer, DeltaKey> sourceCf = sourcePlacement.getBlockedDeltaColumnFamily();
Iterator<ByteBufferRange> scanIter = source.scanIterator(null);
return Iterators.concat(Iterators.transform(scanIter, keyRange -> {
Iterator<Row<ByteBuffer, DeltaKey>> rows =
rowScan(sourcePlacement, sourceCf, keyRange, _maxColumnsRange, LimitCounter.max(), ReadConsistency.STRONG);
return Iterators.concat(Iterators.transform(rows, row -> {
ColumnList<DeltaKey> columns = row.getColumns();
Iterator<Column<DeltaKey>> concatColumns = columns.iterator();
if (columns.size() >= _maxColumnsRange.getLimit()) {
DeltaKey lastColumn = row.getColumns().getColumnByIndex(columns.size() - 1).getName();
concatColumns = Iterators.concat(concatColumns, columnScan(row.getRawKey(), sourcePlacement, sourceCf, lastColumn, null,
false, _deltaKeyInc, Long.MAX_VALUE, 1, ReadConsistency.STRONG));
}
Iterator<StitchedColumn> uuidColumns = new AstyanaxDeltaIterator(concatColumns, false, _deltaPrefixLength, ByteBufferUtil.bytesToHex(row.getRawKey()));
return Iterators.transform(uuidColumns, column -> new MigrationScanResult(row.getRawKey(), column.getName(), _daoUtils.skipPrefix(column.getByteBufferValue())));
}));
}));
}
@Override
public Iterator<? extends HistoryMigrationScanResult> getHistoriesForStorage(AstyanaxStorage source) {
DeltaPlacement placement = (DeltaPlacement) source.getPlacement();
ColumnFamily<ByteBuffer, UUID> cf = placement.getDeltaHistoryColumnFamily();
return Iterators.concat(Iterators.transform(source.scanIterator(null), keyRange -> {
Iterator<Row<ByteBuffer, UUID>> rows =
rowScan(placement, cf, keyRange, _maxColumnsRange, LimitCounter.max(), ReadConsistency.STRONG);
return Iterators.concat(Iterators.transform(rows, row -> {
ColumnList<UUID> columns = row.getColumns();
Iterator<Column<UUID>> concatColumns = columns.iterator();
if (columns.size() >= _maxColumnsRange.getLimit()) {
UUID lastColumn = row.getColumns().getColumnByIndex(columns.size() - 1).getName();
concatColumns = Iterators.concat(concatColumns, columnScan(row.getRawKey(), placement, cf, lastColumn, null,
false, _uuidInc, Long.MAX_VALUE, 1, ReadConsistency.STRONG));
}
return Iterators.transform(concatColumns, column -> new HistoryMigrationScanResult(row.getRawKey(), column.getName(), column.getByteBufferValue(), column.getTtl()));
}));
}));
}
/**
* Queries for rows given an enumerated list of Cassandra row keys.
*/
private Iterator<Record> rowQuery(DeltaPlacement placement,
List<Map.Entry<ByteBuffer, Key>> keys,
ReadConsistency consistency) {
// Build the list of row IDs to query for.
List<ByteBuffer> rowIds = Lists.transform(keys, entryKeyFunction());
// Query for Delta & Compaction info, just the first 50 columns for now.
final Rows<ByteBuffer, DeltaKey> rows = execute(placement.getKeyspace()
.prepareQuery(placement.getBlockedDeltaColumnFamily(), SorConsistencies.toAstyanax(consistency))
.getKeySlice(rowIds)
.withColumnRange(_maxColumnsRange),
"query %d keys from placement %s", rowIds.size(), placement.getName());
// Track metrics
_randomReadMeter.mark(rowIds.size());
// Return an iterator that decodes the row results, avoiding pinning multiple decoded rows into memory at once.
return decodeRows(keys, rows, _maxColumnsRange.getLimit(), consistency);
}
/**
* Scans for rows within the specified range, exclusive on start and inclusive on end.
*/
private Iterator<Row<ByteBuffer, DeltaKey>> rowScan(final DeltaPlacement placement,
final ByteBufferRange rowRange,
final ByteBufferRange columnRange,
final LimitCounter limit,
final ReadConsistency consistency) {
return rowScan(placement, placement.getBlockedDeltaColumnFamily(), rowRange, columnRange, limit, consistency);
}
/**
* Scans for rows within the specified range, exclusive on start and inclusive on end.
*/
private <C> Iterator<Row<ByteBuffer, C>> rowScan(final DeltaPlacement placement,
final ColumnFamily<ByteBuffer, C> columnFamily,
final ByteBufferRange rowRange,
final ByteBufferRange columnRange,
final LimitCounter limit,
final ReadConsistency consistency) {
// In the first batch request no more than 50 rows.
int initialBatchSize = (int) Math.min(limit.remaining(), 50);
return new AbstractBatchReader<Row<ByteBuffer, C>>(1, initialBatchSize, MAX_SCAN_ROWS_BATCH, SCAN_ROW_BATCH_INCREMENT) {
private ByteBuffer _rangeStart = rowRange.getStart();
private final ByteBuffer _rangeEnd = rowRange.getEnd();
private int _minimumLimit = 1;
private boolean _done;
@Override
protected boolean hasNextBatch() {
return !_done;
}
@Override
protected Iterator<Row<ByteBuffer, C>> nextBatch(int batchSize)
throws Exception {
// Note: if Cassandra is asked to perform a token range query where start >= end it will wrap
// around which is absolutely *not* what we want since it could return data for another table.
if (_done || BufferUtils.compareUnsigned(_rangeStart, _rangeEnd) >= 0) {
_done = true;
return Collections.emptyIterator();
}
Timer.Context timer = _scanBatchTimer.time();
try {
int adjustedBatchSize = (int) Math.min(Math.max(limit.remaining(), _minimumLimit), batchSize);
// Increase the minimum limit a bit each time around so if we start encountering lots of range
// ghosts we eventually scan through them at a reasonable rate.
_minimumLimit = Math.min(_minimumLimit + 3, MAX_SCAN_ROWS_BATCH);
// Pass token strings to get exclusive start behavior, to support 'fromBlobIdExclusive'.
String startToken = toTokenString(_rangeStart);
String endToken = toTokenString(_rangeEnd);
Rows<ByteBuffer, C> rows = execute(placement.getKeyspace()
.prepareQuery(columnFamily, SorConsistencies.toAstyanax(consistency))
.getKeyRange(null, null, startToken, endToken, adjustedBatchSize)
.withColumnRange(columnRange),
"scan rows in placement %s, column family %s from %s to %s",
placement.getName(), columnFamily.getName(), startToken, endToken);
if (rows.size() >= adjustedBatchSize) {
// Save the last row key so we can use it as the start (exclusive) if we must query to get more data.
_rangeStart = rows.getRowByIndex(rows.size() - 1).getKey();
// If that row key was the end of our range then we're done.
_done = _rangeStart.equals(_rangeEnd);
} else {
// If we got fewer rows than we asked for, another query won't find more rows.
_done = true;
}
// Track metrics
_scanReadMeter.mark(rows.size());
// Return the rows. Filter out range ghosts (deleted rows with no columns)
final Iterator<Row<ByteBuffer, C>> rowIter = rows.iterator();
return new AbstractIterator<Row<ByteBuffer, C>>() {
@Override
protected Row<ByteBuffer, C> computeNext() {
while (rowIter.hasNext()) {
Row<ByteBuffer, C> row = rowIter.next();
if (!row.getColumns().isEmpty()) {
return row;
}
}
return endOfData();
}
};
} finally {
timer.stop();
}
}
@Override
protected boolean isTimeoutException(Exception e) {
return AstyanaxBlockedDataReaderDAO.this.isTimeoutException(e);
}
@Override
protected boolean isDataSizeException(Exception e) {
for (Throwable t : Throwables.getCausalChain(e)) {
// If the root cause is a thrift frame size overflow then the current batch size returned too
// much data. Unfortunately there is no specific exception thrown for this so we have to
// check for the generic exception type, TTransportException, and then narrow by the message.
//
// Sample message:
// Frame size (17339288) larger than max length (16384000)!
if (t instanceof TTransportException) {
String message = t.getMessage();
if (message != null &&
message.startsWith("Frame size") &&
message.contains("larger than max length")) {
return true;
}
}
}
return false;
}
};
}
private boolean isTimeoutException(Exception e) {
return Iterables.tryFind(Throwables.getCausalChain(e), Predicates.instanceOf(IsTimeoutException.class)).isPresent();
}
/**
 * Scans a single row for columns within the specified range, inclusive or exclusive on start based on
 * whether {@code page} is non-zero, and inclusive on end.
 */
private <C> Iterator<Column<C>> columnScan(final ByteBuffer rowKey,
final DeltaPlacement placement,
final ColumnFamily<ByteBuffer, C> columnFamily,
final C start,
final C end,
final boolean reversed,
final ColumnInc<C> columnInc,
final long limit,
final long page,
final ReadConsistency consistency) {
return Iterators.concat(new AbstractIterator<Iterator<Column<C>>>() {
private C _from = start;
private long _remaining = limit;
private long _page = page;
@Override
protected Iterator<Column<C>> computeNext() {
if (_remaining <= 0) {
return endOfData();
}
// For page N+1, treat "_from" as exclusive. Since Cassandra doesn't support exclusive column ranges
// bump the from value up to the next possible time UUID (assumes from != null when page != 0).
if (_page > 0) {
if (_from.equals(end)) {
return endOfData();
}
_from = reversed ? columnInc.previous(_from) : columnInc.next(_from);
if (_from == null) {
return endOfData();
}
}
// Execute the query
int batchSize = (int) Math.min(_remaining, MAX_COLUMN_SCAN_BATCH);
ColumnList<C> columns = execute(placement.getKeyspace()
.prepareQuery(columnFamily, SorConsistencies.toAstyanax(consistency))
.getKey(rowKey)
.withColumnRange(_from, end, reversed, batchSize),
"scan columns in placement %s, column family %s, row %s, from %s to %s",
placement.getName(), columnFamily.getName(), rowKey, start, end);
// Update state for the next iteration.
if (columns.size() >= batchSize) {
// Save the last column key so we can use it as the start (exclusive) if we must query to get more data.
_from = columns.getColumnByIndex(columns.size() - 1).getName();
_remaining = _remaining - columns.size();
_page++;
} else {
// If we got fewer columns than we asked for, another query won't find more columns.
_remaining = 0;
}
// Track metrics. For rows w/more than 50 columns, count subsequent reads w/_largeRowReadMeter.
(_page == 0 ? _randomReadMeter : _largeRowReadMeter).mark();
return columns.iterator();
}
});
}
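/**
 * Computes the column value immediately before or after a given column in clustering order. Used by
 * columnScan() to emulate an exclusive start bound, since Cassandra column slices are inclusive.
 */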
private interface ColumnInc<C> {
C previous(C col);
C next(C col);
}
private static final ColumnInc<UUID> _uuidInc = new ColumnInc<UUID>() {
@Override
public UUID previous(UUID col) {
return TimeUUIDs.getPrevious(col);
}
@Override
public UUID next(UUID col) {
return TimeUUIDs.getNext(col);
}
};
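// DeltaKey columns sort by (changeId, block), so the adjacent value increments or decrements the block
// number and rolls over to the neighboring time UUID at the block boundaries.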
private static final ColumnInc<DeltaKey> _deltaKeyInc = new ColumnInc<DeltaKey>() {
@Override
public DeltaKey previous(DeltaKey col) {
if (col.getBlock() == 0) {
return new DeltaKey(_uuidInc.previous(col.getChangeId()), Integer.MAX_VALUE);
}
return new DeltaKey(col.getChangeId(), col.getBlock() - 1);
}
@Override
public DeltaKey next(DeltaKey col) {
if (col.getBlock() == Integer.MAX_VALUE) {
return new DeltaKey(_uuidInc.next(col.getChangeId()), 0);
}
return new DeltaKey(col.getChangeId(), col.getBlock() + 1);
}
};
/**
* Decodes row keys returned by scanning a table.
*/
private Iterator<String> decodeKeys(final Iterator<Row<ByteBuffer, DeltaKey>> iter) {
return new AbstractIterator<String>() {
@Override
protected String computeNext() {
while (iter.hasNext()) {
Row<ByteBuffer, DeltaKey> row = iter.next();
if (!row.getColumns().isEmpty()) { // Ignore range ghosts
return AstyanaxStorage.getContentKey(row.getRawKey());
}
}
return endOfData();
}
};
}
/**
* Decodes rows returned by querying for a specific set of rows.
*/
private Iterator<Record> decodeRows(List<Map.Entry<ByteBuffer, Key>> keys, final Rows<ByteBuffer, DeltaKey> rows,
final int largeRowThreshold, final ReadConsistency consistency) {
// Avoiding pinning multiple decoded rows into memory at once.
return Iterators.transform(keys.iterator(), new Function<Map.Entry<ByteBuffer, Key>, Record>() {
@Override
public Record apply(Map.Entry<ByteBuffer, Key> entry) {
Row<ByteBuffer, DeltaKey> row = rows.getRow(entry.getKey());
if (row == null) {
return emptyRecord(entry.getValue());
}
// Convert the results into a Record object, lazily fetching the rest of the columns as necessary.
return newRecord(entry.getValue(), row.getRawKey(), row.getColumns(), largeRowThreshold, consistency, null);
}
});
}
/**
* Decodes rows returned by scanning a table.
*/
private Iterator<Record> decodeRows(Iterator<Row<ByteBuffer, DeltaKey>> iter, final AstyanaxTable table,
final int largeRowThreshold, final ReadConsistency consistency) {
// Avoiding pinning multiple decoded rows into memory at once.
return Iterators.transform(iter, new Function<Row<ByteBuffer, DeltaKey>, Record>() {
@Override
public Record apply(Row<ByteBuffer, DeltaKey> row) {
// Convert the results into a Record object, lazily fetching the rest of the columns as necessary.
String key = AstyanaxStorage.getContentKey(row.getRawKey());
return newRecord(new Key(table, key), row.getRawKey(), row.getColumns(), largeRowThreshold, consistency, null);
}
});
}
/**
* Decodes rows returned by scanning across tables.
*/
private Iterator<MultiTableScanResult> scanMultiTableRows(
final TableSet tables, final DeltaPlacement placement, final ByteBufferRange rowRange,
final LimitCounter limit, final boolean includeDroppedTables, final boolean includeMirrorTables,
final int largeRowThreshold, final ReadConsistency consistency, @Nullable final Instant cutoffTime) {
// Avoiding pinning multiple decoded rows into memory at once.
return limit.limit(new AbstractIterator<MultiTableScanResult>() {
private PeekingIterator<Row<ByteBuffer, DeltaKey>> _iter = Iterators.peekingIterator(
rowScan(placement, rowRange, _maxColumnsRange, LimitCounter.max(), consistency));
private long _lastTableUuid = -1;
private AstyanaxTable _table = null;
private boolean _droppedTable;
private boolean _primaryTable;
@Override
protected MultiTableScanResult computeNext() {
while (_iter.hasNext()) {
Row<ByteBuffer, DeltaKey> row = _iter.next();
ColumnList<DeltaKey> rowColumns = row.getColumns();
// Convert the results into a Record object, lazily fetching the rest of the columns as necessary.
ByteBuffer rowKey = row.getRawKey();
long tableUuid = AstyanaxStorage.getTableUuid(rowKey);
if (_lastTableUuid != tableUuid) {
_lastTableUuid = tableUuid;
try {
_table = (AstyanaxTable) tables.getByUuid(tableUuid);
} catch (UnknownTableException e) {
_table = AstyanaxTable.createUnknown(tableUuid, placement, e.getTable());
} catch (DroppedTableException e) {
_table = AstyanaxTable.createUnknown(tableUuid, placement, e.getPriorTable());
}
_droppedTable = _table.isUnknownTable();
_primaryTable = _table.getReadStorage().hasUUID(tableUuid);
}
// Skip dropped and mirror tables if configured
if ((!includeDroppedTables && _droppedTable) || (!includeMirrorTables && !_primaryTable)) {
_iter = skipToNextTable(tableUuid);
continue;
}
int shardId = AstyanaxStorage.getShardId(rowKey);
String key = AstyanaxStorage.getContentKey(rowKey);
Record record = newRecord(new Key(_table, key), rowKey, rowColumns, largeRowThreshold, consistency, cutoffTime);
return new MultiTableScanResult(rowKey, shardId, tableUuid, _droppedTable, record);
}
return endOfData();
}
private PeekingIterator<Row<ByteBuffer, DeltaKey>> skipToNextTable(long tableUuid) {
// Iterate over the next 50 rows first to check for a table switch. This avoids starting a new range
// query if the number of rows in the undesired table is small.
int skipLimit = 50;
Row<ByteBuffer, DeltaKey> row = null;
while (skipLimit != 0 && _iter.hasNext()) {
row = _iter.peek();
long nextTableUuid = AstyanaxStorage.getTableUuid(row.getRawKey());
if (nextTableUuid != tableUuid) {
// This is the first row of a new table
return _iter;
} else {
_iter.next();
skipLimit -= 1;
}
}
if (_iter.hasNext()) {
// Skip the table entirely by starting a new query on the next possible table
assert row != null;
int shardId = AstyanaxStorage.getShardId(row.getRawKey());
ByteBuffer nextPossibleTableStart = AstyanaxStorage.getRowKeyRaw(shardId, tableUuid + 1, "");
ByteBuffer end = rowRange.getEnd();
if (AstyanaxStorage.compareKeys(nextPossibleTableStart, end) < 0) {
// We haven't reached the last end boundary of the original range scan
ByteBufferRange updatedRange = new ByteBufferRangeImpl(nextPossibleTableStart, end, -1, false);
return Iterators.peekingIterator(
rowScan(placement, updatedRange, _maxColumnsRange, LimitCounter.max(), consistency));
}
}
return Iterators.peekingIterator(Collections.emptyIterator());
}
});
}
private Record newRecord(Key key, ByteBuffer rowKey, ColumnList<DeltaKey> columns, int largeRowThreshold, ReadConsistency consistency, @Nullable final Instant cutoffTime) {
Iterator<Column<DeltaKey>> changeIter = getFilteredColumnIter(columns.iterator(), cutoffTime);
Iterator<Column<DeltaKey>> compactionIter = getFilteredColumnIter(columns.iterator(), cutoffTime);
Iterator<Column<DeltaKey>> rawMetadataIter = getFilteredColumnIter(columns.iterator(), cutoffTime);
if (columns.size() >= largeRowThreshold) {
// A large row such that the first query likely returned only a subset of all the columns. Lazily fetch
// the rest while ensuring we never load all columns into memory at the same time. The current
// Compactor+Resolver implementation must scan the row twice: once to find compaction records and once to
// find deltas. So we must call columnScan() twice, once for each.
DeltaKey lastColumn = columns.getColumnByIndex(columns.size() - 1).getName();
AstyanaxTable table = (AstyanaxTable) key.getTable();
AstyanaxStorage storage = table.getReadStorage();
DeltaPlacement placement = (DeltaPlacement) storage.getPlacement();
ColumnFamily<ByteBuffer, DeltaKey> columnFamily = placement.getBlockedDeltaColumnFamily();
// Execute the same scan 3 times, returning 3 iterators that process the results in different ways. In
// practice at most two of the iterators are actually consumed (one or more is ignored) so the columnScan
// should avoid actually doing any work until the first item is fetched from the iterator.
changeIter = Iterators.concat(changeIter,
getFilteredColumnIter(columnScan(rowKey, placement, columnFamily, lastColumn, null, false, _deltaKeyInc, Long.MAX_VALUE, 1, consistency), cutoffTime));
compactionIter = Iterators.concat(compactionIter,
getFilteredColumnIter(columnScan(rowKey, placement, columnFamily, lastColumn, null, false, _deltaKeyInc, Long.MAX_VALUE, 1, consistency), cutoffTime));
rawMetadataIter = Iterators.concat(rawMetadataIter,
getFilteredColumnIter(columnScan(rowKey, placement, columnFamily, lastColumn, null, false, _deltaKeyInc, Long.MAX_VALUE, 1, consistency), cutoffTime));
}
Iterator<Map.Entry<DeltaClusteringKey, Change>> deltaChangeIter = decodeChanges(new AstyanaxDeltaIterator(changeIter, false, _deltaPrefixLength, ByteBufferUtil.bytesToHex(rowKey)));
Iterator<Map.Entry<DeltaClusteringKey, Compaction>> deltaCompactionIter = decodeCompactions(new AstyanaxDeltaIterator(compactionIter, false, _deltaPrefixLength, ByteBufferUtil.bytesToHex(rowKey)));
Iterator<RecordEntryRawMetadata> deltaRawMetadataIter = rawMetadata(new AstyanaxDeltaIterator(rawMetadataIter, false, _deltaPrefixLength, ByteBufferUtil.bytesToHex(rowKey)));
return new RecordImpl(key, deltaCompactionIter, deltaChangeIter, deltaRawMetadataIter);
}
private Record emptyRecord(Key key) {
return new RecordImpl(key,
Collections.emptyIterator(),
Collections.emptyIterator(),
Collections.emptyIterator());
}
private Iterator<Change> decodeColumns(Iterator<Column<UUID>> iter) {
return Iterators.transform(iter, column -> _changeEncoder.decodeChange(column.getName(), column.getByteBufferValue()));
}
private Iterator<Change> decodeDeltaColumns(Iterator<StitchedColumn> iter) {
return Iterators.transform(iter, column -> _changeEncoder.decodeChange(column.getName(), _daoUtils.skipPrefix(column.getByteBufferValue())));
}
private Iterator<Map.Entry<DeltaClusteringKey, Change>> decodeChanges(final Iterator<StitchedColumn> iter) {
return Iterators.transform(iter, new Function<StitchedColumn, Map.Entry<DeltaClusteringKey, Change>>() {
@Override
public Map.Entry<DeltaClusteringKey, Change> apply(StitchedColumn column) {
Change change = _changeEncoder.decodeChange(column.getName(), _daoUtils.skipPrefix(column.getByteBufferValue()));
return Maps.immutableEntry(new DeltaClusteringKey(column.getName(), column.getNumBlocks()), change);
}
});
}
private Iterator<Map.Entry<DeltaClusteringKey, Compaction>> decodeCompactions(final Iterator<StitchedColumn> iter) {
return new AbstractIterator<Map.Entry<DeltaClusteringKey, Compaction>>() {
@Override
protected Map.Entry<DeltaClusteringKey, Compaction> computeNext() {
while (iter.hasNext()) {
StitchedColumn column = iter.next();
Compaction compaction = _changeEncoder.decodeCompaction(_daoUtils.skipPrefix(column.getByteBufferValue()));
if (compaction != null) {
return Maps.immutableEntry(new DeltaClusteringKey(column.getName(), column.getNumBlocks()), compaction);
}
}
return endOfData();
}
};
}
private Iterator<RecordEntryRawMetadata> rawMetadata(final Iterator<StitchedColumn> iter) {
return Iterators.transform(iter, new Function<Column<UUID>, RecordEntryRawMetadata>() {
@Override
public RecordEntryRawMetadata apply(Column<UUID> column) {
return new RecordEntryRawMetadata()
.withTimestamp(TimeUUIDs.getTimeMillis(column.getName()))
.withSize(_daoUtils.skipPrefix(column.getByteBufferValue()).remaining());
}
});
}
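/**
 * Executes an Astyanax query and returns its result, rethrowing any ConnectionException as an unchecked
 * exception whose message is built from the supplied format string (ByteBuffer arguments are hex-encoded
 * first so the failed row key is human-readable).
 */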
private <R> R execute(Execution<R> execution, String operation, Object... operationArguments) {
OperationResult<R> operationResult;
try {
operationResult = execution.execute();
} catch (ConnectionException e) {
for (int i = 0; i < operationArguments.length; i++) {
if (operationArguments[i] instanceof ByteBuffer) {
operationArguments[i] = ByteBufferUtil.bytesToHex((ByteBuffer) operationArguments[i]);
}
}
String message = "Failed to " + String.format(operation, operationArguments);
throw new RuntimeException(message, e);
}
return operationResult.getResult();
}
private String toTokenString(ByteBuffer bytes) {
return _tokenFactory.toString(_tokenFactory.fromByteArray(bytes));
}
private ByteBuffer parseTokenString(String string) {
return _tokenFactory.toByteArray(_tokenFactory.fromString(string));
}
/**
* Force computation of the first item in an iterator so metrics calculations for a method reflect the cost of
* the first batch of results.
*/
private Iterator touch(Iterator iter) {
// Could return a Guava PeekingIterator after "if (iter.hasNext()) iter.peek()", but simply calling hasNext()
// is sufficient for the iterator implementations used by this DAO class...
iter.hasNext();
return iter;
}
private <T> Function<Map.Entry<ByteBuffer, T>, ByteBuffer> entryKeyFunction() {
return new Function<Map.Entry<ByteBuffer, T>, ByteBuffer>() {
@Override
public ByteBuffer apply(Map.Entry<ByteBuffer, T> entry) {
return entry.getKey();
}
};
}
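/**
 * Filters out columns whose change ID timestamp is at or after the cutoff, so scans that supply a
 * cutoff time (see multiTableScan()) only observe writes made before that instant.
 */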
@VisibleForTesting
public static Iterator<Column<DeltaKey>> getFilteredColumnIter(Iterator<Column<DeltaKey>> columnIter, @Nullable Instant cutoffTime) {
if (cutoffTime == null) {
return columnIter;
}
return Iterators.filter(columnIter, column -> (TimeUUIDs.getTimeMillis(column.getName().getChangeId()) < cutoffTime.toEpochMilli()));
}
}