package com.bazaarvoice.emodb.sor.db.astyanax;
import com.bazaarvoice.emodb.common.api.Ttls;
import com.bazaarvoice.emodb.common.cassandra.CassandraKeyspace;
import com.bazaarvoice.emodb.sor.api.Compaction;
import com.bazaarvoice.emodb.sor.api.DeltaSizeLimitException;
import com.bazaarvoice.emodb.sor.api.History;
import com.bazaarvoice.emodb.sor.api.ReadConsistency;
import com.bazaarvoice.emodb.sor.api.WriteConsistency;
import com.bazaarvoice.emodb.sor.core.HistoryStore;
import com.bazaarvoice.emodb.sor.db.DAOUtils;
import com.bazaarvoice.emodb.sor.db.DataWriterDAO;
import com.bazaarvoice.emodb.sor.db.RecordUpdate;
import com.bazaarvoice.emodb.sor.db.test.DeltaClusteringKey;
import com.bazaarvoice.emodb.sor.delta.Delta;
import com.bazaarvoice.emodb.sor.delta.Literal;
import com.bazaarvoice.emodb.sor.delta.MapDelta;
import com.bazaarvoice.emodb.table.db.Table;
import com.bazaarvoice.emodb.table.db.astyanax.AstyanaxStorage;
import com.bazaarvoice.emodb.table.db.astyanax.AstyanaxTable;
import com.bazaarvoice.emodb.table.db.astyanax.DataPurgeDAO;
import com.bazaarvoice.emodb.table.db.astyanax.FullConsistencyTimeProvider;
import com.bazaarvoice.emodb.table.db.consistency.HintsConsistencyTimeProvider;
import com.codahale.metrics.Meter;
import com.codahale.metrics.MetricRegistry;
import com.codahale.metrics.annotation.Timed;
import com.google.common.base.Predicates;
import com.google.common.base.Throwables;
import com.google.common.collect.Collections2;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.io.ByteStreams;
import com.google.common.io.CountingOutputStream;
import com.google.inject.Inject;
import com.netflix.astyanax.ColumnListMutation;
import com.netflix.astyanax.Execution;
import com.netflix.astyanax.MutationBatch;
import com.netflix.astyanax.connectionpool.OperationResult;
import com.netflix.astyanax.connectionpool.exceptions.ConnectionException;
import com.netflix.astyanax.model.ConsistencyLevel;
import com.netflix.astyanax.serializers.StringSerializer;
import com.netflix.astyanax.thrift.AbstractThriftMutationBatchImpl;
import org.apache.cassandra.thrift.Cassandra;
import org.apache.commons.lang3.StringUtils;
import org.apache.thrift.TException;
import org.apache.thrift.protocol.TBinaryProtocol;
import org.apache.thrift.transport.TIOStreamTransport;
import org.apache.thrift.transport.TTransportException;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.Collection;
import java.util.EnumSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.UUID;
import static java.util.Objects.hash;
import static java.util.Objects.requireNonNull;
/**
* Cassandra implementation of {@link DataWriterDAO} that uses the Netflix Astyanax client library.
*/
public class AstyanaxDataWriterDAO implements DataWriterDAO, DataPurgeDAO {
private static final int MAX_BATCH_SIZE = 100;
private static final int MAX_PENDING_SIZE = 200;
// Must match thrift_framed_transport_size_in_mb value from cassandra.yaml
private static final int MAX_THRIFT_FRAMED_TRANSPORT_SIZE = 15 * 1024 * 1024;
// Because of the thrift framed transport size conservatively limit the size of deltas
// to allow ample room for additional metadata and protocol overhead.
private static final int MAX_DELTA_SIZE = 10 * 1024 * 1024; // 10 MB delta limit, measured in UTF-8 bytes
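// Rough headroom arithmetic (illustrative only, not enforced elsewhere): a 10 MB delta inside a 15 MB
// frame leaves roughly 5 MB per mutation batch for row keys, column names, change metadata and thrift
// protocol overhead.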
private final AstyanaxKeyScanner _keyScanner;
private final DataWriterDAO _cqlWriterDAO;
private final ChangeEncoder _changeEncoder;
private final Meter _updateMeter;
private final Meter _oversizeUpdateMeter;
private final FullConsistencyTimeProvider _fullConsistencyTimeProvider;
private final DAOUtils _daoUtils;
private final String _deltaPrefix;
private final int _deltaPrefixLength;
// The difference between the full consistency and "raw" consistency providers is that full consistency also
// includes a minimum lag of 5 minutes, whereas the "raw" consistency timestamp just gives us the last known
// good FCT, which could be less than 5 minutes old.
// We use this for efficiency reasons; the only use case right now is to delete "compaction-owned" deltas once
// we know that compaction is within FCT.
private final HintsConsistencyTimeProvider _rawConsistencyTimeProvider;
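// For illustration of the distinction above (numbers assumed): if the last known good FCT is 2 minutes old,
// the raw provider reports that 2-minute-old timestamp directly, while the full consistency provider holds
// the reported timestamp back by at least the 5 minute minimum lag.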
private final HistoryStore _historyStore;
@Inject
public AstyanaxDataWriterDAO(@AstyanaxWriterDAODelegate DataWriterDAO delegate, AstyanaxKeyScanner keyScanner,
FullConsistencyTimeProvider fullConsistencyTimeProvider, HistoryStore historyStore,
HintsConsistencyTimeProvider rawConsistencyTimeProvider,
ChangeEncoder changeEncoder, MetricRegistry metricRegistry,
DAOUtils daoUtils, @BlockSize int deltaBlockSize,
@PrefixLength int deltaPrefixLength) {
_cqlWriterDAO = requireNonNull(delegate, "delegate");
_keyScanner = requireNonNull(keyScanner, "keyScanner");
_fullConsistencyTimeProvider = requireNonNull(fullConsistencyTimeProvider, "fullConsistencyTimeProvider");
_rawConsistencyTimeProvider = requireNonNull(rawConsistencyTimeProvider, "rawConsistencyTimeProvider");
_historyStore = requireNonNull(historyStore, "historyStore");
_changeEncoder = requireNonNull(changeEncoder, "changeEncoder");
_updateMeter = metricRegistry.meter(getMetricName("updates"));
_oversizeUpdateMeter = metricRegistry.meter(getMetricName("oversizeUpdates"));
_daoUtils = daoUtils;
_deltaPrefix = StringUtils.repeat('0', deltaPrefixLength);
_deltaPrefixLength = deltaPrefixLength;
}
private String getMetricName(String name) {
return MetricRegistry.name("bv.emodb.sor", "AstyanaxDataWriterDAO", name);
}
@Override
public long getFullConsistencyTimestamp(Table tbl) {
return getFullConsistencyTimestamp((AstyanaxTable)tbl, _fullConsistencyTimeProvider);
}
@Override
public long getRawConsistencyTimestamp(Table tbl) {
return getFullConsistencyTimestamp((AstyanaxTable)tbl, _rawConsistencyTimeProvider);
}
private long getFullConsistencyTimestamp(AstyanaxTable tbl, FullConsistencyTimeProvider fullConsistencyTimeProvider) {
// Compaction runs off the "read" storage. If there are multiple back-end write storage configurations,
// we don't care whether the secondary is falling behind, only the primary that we read from matters.
DeltaPlacement placement = (DeltaPlacement) tbl.getReadStorage().getPlacement();
String clusterName = placement.getKeyspace().getClusterName();
return fullConsistencyTimeProvider.getMaxTimeStamp(clusterName);
}
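/**
* Writes a stream of updates, grouping them by (placement, consistency) so that each Cassandra mutation
* batch targets a single keyspace at a single consistency level, and flushing whenever a group reaches
* MAX_BATCH_SIZE rows or MAX_PENDING_SIZE updates are queued overall.
*
* Illustrative caller-side sketch (the listener name is hypothetical):
* <pre>{@code
* Iterator<RecordUpdate> updates = ...;  // one delta per record to write
* dataWriterDAO.updateAll(updates, databusUpdateListener);
* }</pre>
*/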
@Timed(name = "bv.emodb.sor.AstyanaxDataWriterDAO.updateAll", absolute = true)
@Override
public void updateAll(Iterator<RecordUpdate> updates, UpdateListener listener) {
Map<BatchKey, List<BatchUpdate>> batchMap = Maps.newLinkedHashMap();
int numPending = 0;
// Group the updates by distinct placement and consistency since a Cassandra mutation only works
// with a single keyspace and consistency at a time.
while (updates.hasNext()) {
RecordUpdate update = updates.next();
AstyanaxTable table = (AstyanaxTable) update.getTable();
for (AstyanaxStorage storage : table.getWriteStorage()) {
DeltaPlacement placement = (DeltaPlacement) storage.getPlacement();
BatchKey batchKey = new BatchKey(placement, update.getConsistency());
List<BatchUpdate> batch = batchMap.get(batchKey);
if (batch == null) {
batchMap.put(batchKey, batch = Lists.newArrayList());
}
batch.add(new BatchUpdate(storage, update));
numPending++;
// Flush this batch if it's bigger than the maximum mutation we want to send to Cassandra. Alternatively,
// don't queue more than MAX_PENDING_SIZE updates in memory at a time, to keep max mem usage down. Go
// ahead and flush all the batches at once, even if some are still small, in order to avoid potentially
// extreme re-ordering of writes (say a batch contains 1 record in placement A followed by 100k records in
// placement B, we shouldn't delay writing A until after all B records).
if (batch.size() >= MAX_BATCH_SIZE || numPending >= MAX_PENDING_SIZE) {
writeAll(batchMap, listener);
batchMap.clear();
numPending = 0;
}
}
}
// Flush final batches.
writeAll(batchMap, listener);
}
private void writeAll(Map<BatchKey, List<BatchUpdate>> batchMap, UpdateListener listener) {
for (Map.Entry<BatchKey, List<BatchUpdate>> entry : batchMap.entrySet()) {
write(entry.getKey(), entry.getValue(), listener);
}
}
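/**
* Writes a single encoded delta as a sequence of blocks: each block becomes one column whose name is a
* (changeId, blockIndex) {@link DeltaKey}, so large deltas span multiple columns within the same row.
*/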
private void putBlockedDeltaColumn(ColumnListMutation<DeltaKey> mutation, UUID changeId, ByteBuffer encodedDelta) {
List<ByteBuffer> blocks = _daoUtils.getDeltaBlocks(encodedDelta);
for (int i = 0; i < blocks.size(); i++) {
mutation.putColumn(new DeltaKey(changeId, i), blocks.get(i));
}
}
private void write(BatchKey batchKey, List<BatchUpdate> updates, UpdateListener listener) {
// Invoke the configured listener. This is used to write events to the databus.
listener.beforeWrite(Collections2.transform(updates, BatchUpdate::getUpdate));
DeltaPlacement placement = batchKey.getPlacement();
MutationBatch mutation = placement.getKeyspace().prepareMutationBatch(batchKey.getConsistency());
int approxMutationSize = 0;
int updateCount = 0;
for (BatchUpdate batchUpdate : updates) {
AstyanaxStorage storage = batchUpdate.getStorage();
RecordUpdate update = batchUpdate.getUpdate();
ByteBuffer rowKey = storage.getRowKey(update.getKey());
Delta delta = update.getDelta();
String deltaString = delta.toString();
Set<String> tags = update.getTags();
// Set any change flags which may make reading this delta back more efficient. Currently the only case
// for this is for a literal map delta.
EnumSet<ChangeFlag> changeFlags = EnumSet.noneOf(ChangeFlag.class);
if (delta.isConstant()) {
changeFlags.add(ChangeFlag.CONSTANT_DELTA);
}
if (delta instanceof MapDelta || (delta instanceof Literal && ((Literal) delta).getValue() instanceof Map)) {
changeFlags.add(ChangeFlag.MAP_DELTA);
}
// Regardless of migration stage, we still encode both delta versions.
// The values are encoded in a flexible format that allows versioning of the strings.
ByteBuffer encodedBlockDelta = stringToByteBuffer(_changeEncoder.encodeDelta(deltaString, changeFlags, tags, new StringBuilder(_deltaPrefix)).toString());
ByteBuffer encodedDelta = encodedBlockDelta.duplicate();
encodedDelta.position(encodedDelta.position() + _deltaPrefixLength);
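// Layout sketch of the two views built above (a prefix length of 4 is shown for illustration only):
//   encodedBlockDelta: [ "0000" prefix ][ encoded delta payload ]  <- split into blocks and written below
//   encodedDelta:                       [ encoded delta payload ]  <- same bytes with the prefix skipped
// encodedDelta is a duplicate of encodedBlockDelta, so both views share the underlying bytes without copying.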
int blockDeltaSize = encodedBlockDelta.remaining();
UUID changeId = update.getChangeId();
// Validate sizes of individual deltas
if (blockDeltaSize > MAX_DELTA_SIZE) {
_oversizeUpdateMeter.mark();
throw new DeltaSizeLimitException("Delta exceeds size limit of " + MAX_DELTA_SIZE + ": " + blockDeltaSize, blockDeltaSize);
}
// Perform a quick validation that the size of the mutation batch as a whole won't exceed the thrift threshold.
// This validation is inexact and overly-conservative but it is cheap and fast.
if (!mutation.isEmpty() && approxMutationSize + blockDeltaSize > MAX_DELTA_SIZE) {
// Adding the next row may exceed the Thrift threshold. Check definitively now. This is fairly expensive
// which is why we don't do it unless the cheap check above passes.
MutationBatch potentiallyOversizeMutation = placement.getKeyspace().prepareMutationBatch(batchKey.getConsistency());
potentiallyOversizeMutation.mergeShallow(mutation);
putBlockedDeltaColumn(potentiallyOversizeMutation.withRow(placement.getBlockedDeltaColumnFamily(), rowKey), changeId, encodedBlockDelta);
if (getMutationBatchSize(potentiallyOversizeMutation) >= MAX_THRIFT_FRAMED_TRANSPORT_SIZE) {
// Execute the mutation batch now. As a side-effect this empties the mutation batch
// so we can continue using the same instance.
execute(mutation, "batch update %d records in placement %s", updateCount, placement.getName());
approxMutationSize = 0;
updateCount = 0;
}
}
putBlockedDeltaColumn(mutation.withRow(placement.getBlockedDeltaColumnFamily(), rowKey), changeId, encodedBlockDelta);
approxMutationSize += blockDeltaSize;
updateCount += 1;
}
execute(mutation, "batch update %d records in placement %s", updateCount, placement.getName());
// Invoke the configured listener. This is used to write audits.
listener.afterWrite(Collections2.transform(updates, BatchUpdate::getUpdate));
_updateMeter.mark(updates.size());
}
private ByteBuffer stringToByteBuffer(String str) {
return StringSerializer.get().toByteBuffer(str);
}
/**
* We need to make sure that compaction is written *before* the compacted deltas are deleted.
* This should be a synchronous operation.
*/
@Timed(name = "bv.emodb.sor.AstyanaxDataWriterDAO.compact", absolute = true)
@Override
public void compact(Table tbl, String key, UUID compactionKey, Compaction compaction, UUID changeId,
Delta delta, Collection<DeltaClusteringKey> changesToDelete, List<History> historyList, WriteConsistency consistency) {
// delegate to CQL Writer for double compaction writing
_cqlWriterDAO.compact(tbl, key, compactionKey, compaction, changeId, delta, changesToDelete, historyList, consistency);
}
@Timed (name = "bv.emodb.sorAstyanaxDataWriterDAO.storeCompactedDeltas", absolute = true)
@Override
public void storeCompactedDeltas(Table tbl, String key, List<History> histories, WriteConsistency consistency) {
requireNonNull(tbl, "table");
requireNonNull(key, "key");
requireNonNull(histories, "histories");
requireNonNull(consistency, "consistency");
AstyanaxTable table = (AstyanaxTable) tbl;
for (AstyanaxStorage storage : table.getWriteStorage()) {
DeltaPlacement placement = (DeltaPlacement) storage.getPlacement();
CassandraKeyspace keyspace = placement.getKeyspace();
ByteBuffer rowKey = storage.getRowKey(key);
MutationBatch mutation = keyspace.prepareMutationBatch(SorConsistencies.toAstyanax(consistency));
ColumnListMutation<UUID> rowMutation = mutation.withRow(placement.getDeltaHistoryColumnFamily(), rowKey);
for (History history : histories) {
rowMutation.putColumn(history.getChangeId(),
_changeEncoder.encodeHistory(history),
Ttls.toSeconds(_historyStore.getHistoryTtl(), 1, null));
}
execute(mutation, "store %d compacted deltas for placement %s, table %s, key %s",
histories.size(), placement.getName(), table.getName(), key);
}
}
@Timed(name = "bv.emodb.sor.AstyanaxDataWriterDAO.purgeUnsafe", absolute = true)
@Override
public void purgeUnsafe(Table tbl) {
requireNonNull(tbl, "table");
AstyanaxTable table = (AstyanaxTable) tbl;
for (AstyanaxStorage storage : table.getWriteStorage()) {
purge(storage, noop());
}
}
// DataPurgeDAO
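/**
* Scans every key in the given storage and deletes its row from the blocked-delta column family,
* flushing in chunks of 100 row deletions per mutation and invoking {@code progress} before each flush.
*/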
@Override
public void purge(AstyanaxStorage storage, Runnable progress) {
DeltaPlacement placement = (DeltaPlacement) storage.getPlacement();
CassandraKeyspace keyspace = placement.getKeyspace();
// Scan all the shards and delete all the rows we find.
MutationBatch mutation = keyspace.prepareMutationBatch(SorConsistencies.toAstyanax(WriteConsistency.STRONG));
Iterator<String> keyIter = _keyScanner.scanKeys(storage, ReadConsistency.STRONG);
while (keyIter.hasNext()) {
ByteBuffer rowKey = storage.getRowKey(keyIter.next());
mutation.withRow(placement.getBlockedDeltaColumnFamily(), rowKey).delete();
if (mutation.getRowCount() >= 100) {
progress.run();
execute(mutation, "purge %d records from placement %s", mutation.getRowCount(), placement.getName());
mutation.discardMutations();
}
}
if (!mutation.isEmpty()) {
progress.run();
execute(mutation, "purge %d records from placement %s", mutation.getRowCount(), placement.getName());
}
}
private <R> R execute(Execution<R> execution, String operation, Object... operationArguments) {
OperationResult<R> operationResult;
try {
operationResult = execution.execute();
} catch (ConnectionException e) {
String message = String.format(operation, operationArguments);
if (isThriftFramedTransportSizeOverrun(execution, e)) {
throw new ThriftFramedTransportSizeException("Thrift request too large to " + message, e);
}
throw new RuntimeException("Failed to " + message, e);
}
return operationResult.getResult();
}
private boolean isThriftFramedTransportSizeOverrun(Execution<?> execution, ConnectionException exception) {
// Thrift framed transport size overruns don't have an explicit exception, but they fall under the general
// umbrella of "unknown" thrift transport exceptions.
Optional<Throwable> thriftException =
Iterables.tryFind(Throwables.getCausalChain(exception), Predicates.instanceOf(TTransportException.class))
.transform(java.util.Optional::of)
.or(java.util.Optional.empty());
//noinspection ThrowableResultOfMethodCallIgnored
if (!thriftException.isPresent() || ((TTransportException) thriftException.get()).getType() != TTransportException.UNKNOWN) {
return false;
}
return execution instanceof MutationBatch &&
getMutationBatchSize((MutationBatch) execution) >= MAX_THRIFT_FRAMED_TRANSPORT_SIZE;
}
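/**
* Estimates the serialized size of a mutation batch by writing its thrift batch_mutate arguments through a
* counting stream that discards the bytes; the resulting count approximates what the Cassandra thrift
* transport would receive, excluding framing overhead.
*/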
private int getMutationBatchSize(MutationBatch mutation) {
assert mutation instanceof AbstractThriftMutationBatchImpl : "MutationBatch is not an instance of AbstractThriftMutationBatchImpl";
try (CountingOutputStream countingOut = new CountingOutputStream(ByteStreams.nullOutputStream())) {
TIOStreamTransport transport = new TIOStreamTransport(countingOut);
Cassandra.batch_mutate_args args = new Cassandra.batch_mutate_args();
args.setMutation_map(((AbstractThriftMutationBatchImpl) mutation).getMutationMap());
args.write(new TBinaryProtocol(transport));
return (int) countingOut.getCount();
} catch (TException | IOException e) {
throw Throwables.propagate(e);
}
}
private Runnable noop() {
return () -> { /* Do nothing */ };
}
/** Key used for grouping batches of update operations for execution. */
private static class BatchKey {
private final DeltaPlacement _placement;
private final ConsistencyLevel _consistency;
BatchKey(DeltaPlacement placement, WriteConsistency consistency) {
_placement = placement;
_consistency = SorConsistencies.toAstyanax(consistency);
}
DeltaPlacement getPlacement() {
return _placement;
}
ConsistencyLevel getConsistency() {
return _consistency;
}
@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (!(o instanceof BatchKey)) {
return false;
}
BatchKey batchKey = (BatchKey) o;
return _consistency == batchKey.getConsistency() &&
_placement.equals(batchKey.getPlacement());
}
@Override
public int hashCode() {
return hash(_placement, _consistency);
}
}
/** Value used for grouping batches of update operations for execution. */
private static class BatchUpdate {
private final AstyanaxStorage _storage;
private final RecordUpdate _update;
BatchUpdate(AstyanaxStorage storage, RecordUpdate record) {
_storage = storage;
_update = record;
}
AstyanaxStorage getStorage() {
return _storage;
}
RecordUpdate getUpdate() {
return _update;
}
}
}