/*
 * Copyright 2024 Responsive Computing, Inc.
 *
 * This source code is licensed under the Responsive Business Source License Agreement v1.0
 * available at:
 *
 * https://www.responsive.dev/legal/responsive-bsl-10
 *
 * This software requires a valid Commercial License Key for production use. Trial and commercial
 * licenses can be obtained at https://www.responsive.dev
 */

package dev.responsive.kafka.internal.db;

import static com.datastax.oss.driver.api.querybuilder.QueryBuilder.bindMarker;
import static dev.responsive.kafka.internal.db.ColumnName.DATA_KEY;
import static dev.responsive.kafka.internal.db.ColumnName.DATA_VALUE;
import static dev.responsive.kafka.internal.db.ColumnName.OFFSET;
import static dev.responsive.kafka.internal.db.ColumnName.PARTITION_KEY;
import static dev.responsive.kafka.internal.db.ColumnName.ROW_TYPE;
import static dev.responsive.kafka.internal.db.ColumnName.TIMESTAMP;
import static dev.responsive.kafka.internal.db.ColumnName.TTL_SECONDS;
import static dev.responsive.kafka.internal.stores.ResponsiveStoreRegistration.NO_COMMITTED_OFFSET;

import com.datastax.oss.driver.api.core.cql.BoundStatement;
import com.datastax.oss.driver.api.core.cql.PreparedStatement;
import com.datastax.oss.driver.api.core.cql.Row;
import com.datastax.oss.driver.api.core.type.DataTypes;
import com.datastax.oss.driver.api.querybuilder.QueryBuilder;
import com.datastax.oss.driver.api.querybuilder.SchemaBuilder;
import com.datastax.oss.driver.api.querybuilder.schema.CreateTableWithOptions;
import dev.responsive.kafka.api.stores.TtlProvider.TtlDuration;
import dev.responsive.kafka.internal.db.spec.RemoteTableSpec;
import dev.responsive.kafka.internal.stores.TtlResolver;
import java.nio.ByteBuffer;
import java.time.Instant;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import javax.annotation.CheckReturnValue;
import org.apache.kafka.common.utils.Bytes;
import org.apache.kafka.streams.state.KeyValueIterator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

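/**
 * A {@link RemoteKVTable} for "fact" data, i.e. data that is intended to be
 * written once and not overwritten, which lets the data table be compacted
 * with TWCS and lets ttl enforcement be delegated to the remote store where
 * possible. Committed offsets live in a separate metadata table (the table
 * name suffixed with {@code _md}); see the note in {@link #create} for why
 * the metadata cannot share the data table.
 */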
public class CassandraFactTable implements RemoteKVTable<BoundStatement> {

  private static final Logger LOG = LoggerFactory.getLogger(
      CassandraFactTable.class);

  private final String name;
  private final CassandraClient client;
  private final Optional<TtlResolver<?, ?>> ttlResolver;

  private final PreparedStatement get;
  private final PreparedStatement getWithTimestamp;
  private final PreparedStatement insert;
  private final PreparedStatement insertWithTtl;
  private final PreparedStatement delete;
  private final PreparedStatement fetchOffset;
  private final PreparedStatement setOffset;

  public CassandraFactTable(
      final String name,
      final CassandraClient client,
      final Optional<TtlResolver<?, ?>> ttlResolver,
      final PreparedStatement get,
      final PreparedStatement getWithTimestamp,
      final PreparedStatement insert,
      final PreparedStatement insertWithTtl,
      final PreparedStatement delete,
      final PreparedStatement fetchOffset,
      final PreparedStatement setOffset
  ) {
    this.name = name;
    this.client = client;
    this.ttlResolver = ttlResolver;
    this.get = get;
    this.getWithTimestamp = getWithTimestamp;
    this.insert = insert;
    this.insertWithTtl = insertWithTtl;
    this.delete = delete;
    this.fetchOffset = fetchOffset;
    this.setOffset = setOffset;
  }

  public static CassandraFactTable create(
      final RemoteTableSpec spec,
      final CassandraClient client
  ) {
    final String name = spec.tableName();
    final var ttlResolver = spec.ttlResolver();
    LOG.info("Creating fact data table {} in remote store.", name);

    final CreateTableWithOptions createTable = spec.applyDefaultOptions(
        createTable(name, ttlResolver)
    );

    // We keep the metadata separate from the main table for the fact schema. This is
    // acceptable because we don't use the metadata for fencing operations at all, and
    // writes to it do not need to be atomic (transactional with the original table).
    // We cannot effectively use the same table (as we do with the normal
    // KeyValueSchema) because TWCS (TimeWindowCompactionStrategy) cannot properly
    // compact SSTables if there are any overwrites, which there are for the metadata
    // columns.
    final CreateTableWithOptions createMetadataTable = SchemaBuilder
        .createTable(metadataTable(name))
        .ifNotExists()
        .withPartitionKey(ROW_TYPE.column(), DataTypes.TINYINT)
        .withPartitionKey(PARTITION_KEY.column(), DataTypes.INT)
        .withColumn(OFFSET.column(), DataTypes.BIGINT);

    client.execute(createTable.build());
    client.execute(createMetadataTable.build());

    final var insert = client.prepare(
        QueryBuilder
            .insertInto(name)
            .value(ROW_TYPE.column(), RowType.DATA_ROW.literal())
            .value(DATA_KEY.column(), bindMarker(DATA_KEY.bind()))
            .value(TIMESTAMP.column(), bindMarker(TIMESTAMP.bind()))
            .value(DATA_VALUE.column(), bindMarker(DATA_VALUE.bind()))
            .build(),
        QueryOp.WRITE
    );

    final var insertWithTtl = client.prepare(
        QueryBuilder
            .insertInto(name)
            .value(ROW_TYPE.column(), RowType.DATA_ROW.literal())
            .value(DATA_KEY.column(), bindMarker(DATA_KEY.bind()))
            .value(TIMESTAMP.column(), bindMarker(TIMESTAMP.bind()))
            .value(DATA_VALUE.column(), bindMarker(DATA_VALUE.bind()))
            .usingTtl(bindMarker(TTL_SECONDS.bind()))
            .build(),
        QueryOp.WRITE
    );

    final var get = client.prepare(
        QueryBuilder
            .selectFrom(name)
            .columns(DATA_VALUE.column())
            .where(ROW_TYPE.relation().isEqualTo(RowType.DATA_ROW.literal()))
            .where(DATA_KEY.relation().isEqualTo(bindMarker(DATA_KEY.bind())))
            .where(TIMESTAMP.relation().isGreaterThanOrEqualTo(bindMarker(TIMESTAMP.bind())))
            // ALLOW FILTERING is OK b/c the query only scans one partition
            // (it actually only returns a single value)
            .allowFiltering()
            .build(),
        QueryOp.READ
    );

    final var getWithTimestamp = client.prepare(
        QueryBuilder
            .selectFrom(name)
            .columns(DATA_VALUE.column(), TIMESTAMP.column())
            .where(ROW_TYPE.relation().isEqualTo(RowType.DATA_ROW.literal()))
            .where(DATA_KEY.relation().isEqualTo(bindMarker(DATA_KEY.bind())))
            // ALLOW FILTERING is OK b/c the query only scans one partition
            // (it actually only returns a single value)
            .allowFiltering()
            .build(),
        QueryOp.READ
    );

    final var delete = client.prepare(
        QueryBuilder
            .deleteFrom(name)
            .where(ROW_TYPE.relation().isEqualTo(RowType.DATA_ROW.literal()))
            .where(DATA_KEY.relation().isEqualTo(bindMarker(DATA_KEY.bind())))
            .build(),
        QueryOp.WRITE
    );

    final var fetchOffset = client.prepare(
        QueryBuilder
            .selectFrom(metadataTable(name))
            .column(OFFSET.column())
            .where(ROW_TYPE.relation().isEqualTo(RowType.METADATA_ROW.literal()))
            .where(PARTITION_KEY.relation().isEqualTo(bindMarker(PARTITION_KEY.bind())))
            .build(),
        QueryOp.READ
    );

    final var setOffset = client.prepare(
        QueryBuilder
            .update(metadataTable(name))
            .setColumn(OFFSET.column(), bindMarker(OFFSET.bind()))
            .where(ROW_TYPE.relation().isEqualTo(RowType.METADATA_ROW.literal()))
            .where(PARTITION_KEY.relation().isEqualTo(bindMarker(PARTITION_KEY.bind())))
            .build(),
        QueryOp.WRITE
    );

    return new CassandraFactTable(
        name,
        client,
        ttlResolver,
        get,
        getWithTimestamp,
        insert,
        insertWithTtl,
        delete,
        fetchOffset,
        setOffset
    );
  }

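  /**
   * Builds the data table schema. Assuming the identifiers in {@link ColumnName}
   * mirror the enum names, the built statement corresponds roughly to the
   * following CQL, with the default_time_to_live clause only present when a
   * finite default ttl is configured:
   *
   * <pre>
   * CREATE TABLE IF NOT EXISTS [tableName] (
   *   rowtype   tinyint,
   *   datakey   blob,
   *   timestamp timestamp,
   *   value     blob,
   *   PRIMARY KEY ((rowtype, datakey))
   * ) WITH default_time_to_live = [defaultTtlSeconds];
   * </pre>
   */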
  private static CreateTableWithOptions createTable(
      final String tableName,
      final Optional<TtlResolver<?, ?>> ttlResolver
  ) {
    final var baseOptions = SchemaBuilder
        .createTable(tableName)
        .ifNotExists()
        .withPartitionKey(ROW_TYPE.column(), DataTypes.TINYINT)
        .withPartitionKey(DATA_KEY.column(), DataTypes.BLOB)
        .withColumn(TIMESTAMP.column(), DataTypes.TIMESTAMP)
        .withColumn(DATA_VALUE.column(), DataTypes.BLOB);

    if (ttlResolver.isPresent() && ttlResolver.get().defaultTtl().isFinite()) {
      return baseOptions.withDefaultTimeToLiveSeconds(
          (int) ttlResolver.get().defaultTtl().toSeconds());
    } else {
      return baseOptions;
    }
  }

  @Override
  public String name() {
    return name;
  }

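  /**
   * Registers this kafka partition in the metadata table, seeding its offset
   * with {@code NO_COMMITTED_OFFSET}. The insert uses {@code IF NOT EXISTS}
   * (a lightweight transaction), so re-initializing a partition never clobbers
   * an offset that was already committed.
   */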
  @Override
  public CassandraFactFlushManager init(
      final int kafkaPartition
  ) {
    client.execute(
        QueryBuilder.insertInto(metadataTable(name))
            .value(ROW_TYPE.column(), RowType.METADATA_ROW.literal())
            .value(PARTITION_KEY.column(), PARTITION_KEY.literal(kafkaPartition))
            .value(OFFSET.column(), OFFSET.literal(NO_COMMITTED_OFFSET))
            .ifNotExists()
            .build()
    );

    return new CassandraFactFlushManager(this, client, kafkaPartition);
  }

  @Override
  public long fetchOffset(final int kafkaPartition) {
    final BoundStatement bound = fetchOffset
        .bind()
        .setInt(PARTITION_KEY.bind(), kafkaPartition);
    final List<Row> result = client.execute(bound).all();

    if (result.size() > 1) {
      throw new IllegalStateException(String.format(
          "Expected at most one offset row for %s[%s] but got %d",
          name, kafkaPartition, result.size()));
    } else if (result.isEmpty()) {
      return NO_COMMITTED_OFFSET;
    } else {
      final long offset = result.get(0).getLong(OFFSET.column());
      LOG.info("Got offset for {}[{}]: {}", name, kafkaPartition, offset);
      return offset;
    }
  }

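  /**
   * Builds, but does not execute, the statement that records the committed
   * offset for {@code kafkaPartition}; the caller (typically the flush manager
   * returned from {@link #init}) is responsible for executing it.
   */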
  public BoundStatement setOffset(
      final int kafkaPartition,
      final long offset
  ) {
    LOG.info("Setting offset in metadata table {} for {}[{}] to {}",
             metadataTable(name), name, kafkaPartition, offset);
    return setOffset
        .bind()
        .setInt(PARTITION_KEY.bind(), kafkaPartition)
        .setLong(OFFSET.bind(), offset);
  }

  @Override
  public long approximateNumEntries(final int kafkaPartition) {
    throw new UnsupportedOperationException(
        "approximateNumEntries is not supported on fact tables");
  }

  @Override
  @CheckReturnValue
  public BoundStatement delete(
      final int kafkaPartition,
      final Bytes key
  ) {
    return delete
        .bind()
        .setByteBuffer(DATA_KEY.bind(), ByteBuffer.wrap(key.get()));
  }

  @Override
  @CheckReturnValue
  public BoundStatement insert(
      final int kafkaPartition,
      final Bytes key,
      final byte[] value,
      final long epochMillis
  ) {
    if (ttlResolver.isPresent()) {
      final Optional<TtlDuration> rowTtl = ttlResolver.get().computeTtl(key, value);

      // If the user happens to return the same ttl value as the default, skip
      // applying it at the row level, since a row-level ttl is less efficient
      // in Scylla
      if (rowTtl.isPresent() && !rowTtl.get().equals(ttlResolver.get().defaultTtl())) {

        // You can set the row ttl to 0 in Scylla to apply no/infinite ttl
        final int rowTtlOverrideSeconds = rowTtl.get().isFinite()
            ? (int) rowTtl.get().toSeconds()
            : 0;

        return insertWithTtl
            .bind()
            .setByteBuffer(DATA_KEY.bind(), ByteBuffer.wrap(key.get()))
            .setByteBuffer(DATA_VALUE.bind(), ByteBuffer.wrap(value))
            .setInstant(TIMESTAMP.bind(), Instant.ofEpochMilli(epochMillis))
            .setInt(TTL_SECONDS.bind(), rowTtlOverrideSeconds);
      }
    }

    return insert
        .bind()
        .setByteBuffer(DATA_KEY.bind(), ByteBuffer.wrap(key.get()))
        .setByteBuffer(DATA_VALUE.bind(), ByteBuffer.wrap(value))
        .setInstant(TIMESTAMP.bind(), Instant.ofEpochMilli(epochMillis));
  }

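  /**
   * Looks up the value for {@code key}, enforcing ttl in one of three ways:
   * no filtering when there is no ttl, server-side filtering on the timestamp
   * column when the ttl is known without the value (key-based or default-only
   * ttl), and client-side post-filtering when the ttl depends on the value
   * itself, which must be fetched before the row can be aged out.
   */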
  @Override
  public byte[] get(final int kafkaPartition, final Bytes key, long streamTimeMs) {
    if (ttlResolver.isEmpty()) {
      return simpleGet(key);
    } else if (ttlResolver.get().needsValueToComputeTtl()) {
      return postFilterGet(key, streamTimeMs);
    } else {
      final TtlDuration ttl = ttlResolver.get().resolveTtl(key, null);
      if (ttl.isFinite()) {
        final long minValidTimeMs = streamTimeMs - ttl.toMillis();
        return preFilterGet(key, minValidTimeMs);
      } else {
        return simpleGet(key);
      }
    }
  }

  /**
   * Simple "get" with no ttl filtering, for when the ttl is infinite or there is no ttl at all
   */
  private byte[] simpleGet(final Bytes key) {
    // Delegate to preFilterGet with a min valid timestamp of -1, which excludes
    // nothing; it's not worth having a third "get" PreparedStatement without the
    // gte(timestamp) filter
    return preFilterGet(key, -1L);
  }

  /**
   * A "get" with server-side filtering on ttl. Used when the ttl can be computed
   * from the key alone, when only the default ttl applies, or (via
   * {@link #simpleGet}) when there is no ttl at all.
   */
  private byte[] preFilterGet(final Bytes key, final long minValidTimeMs) {
    final BoundStatement getQuery = get
        .bind()
        .setByteBuffer(DATA_KEY.bind(), ByteBuffer.wrap(key.get()))
        .setInstant(TIMESTAMP.bind(), Instant.ofEpochMilli(minValidTimeMs));

    final List<Row> result = client.execute(getQuery).all();

    if (result.size() > 1) {
      throw new IllegalStateException("Received multiple results for the same key");
    } else if (result.isEmpty()) {
      return null;
    } else {
      return getValueFromRow(result.get(0));
    }
  }

  private byte[] postFilterGet(final Bytes key, long streamTimeMs) {
    final BoundStatement getQuery = getWithTimestamp
        .bind()
        .setByteBuffer(DATA_KEY.bind(), ByteBuffer.wrap(key.get()));

    final List<Row> result = client.execute(getQuery).all();

    if (result.size() > 1) {
      throw new IllegalStateException("Received multiple results for the same key");
    } else if (result.isEmpty()) {
      return null;
    }

    final Row rowResult = result.get(0);
    final byte[] value = getValueFromRow(rowResult);
    final TtlDuration ttl = ttlResolver.get().resolveTtl(key, value);

    if (ttl.isFinite()) {
      final long minValidTsFromValue = streamTimeMs - ttl.toMillis();
      final long recordTs = rowResult.getInstant(TIMESTAMP.column()).toEpochMilli();
      if (recordTs < minValidTsFromValue) {
        return null;
      }
    }

    return value;
  }

  private byte[] getValueFromRow(final Row row) {
    return Objects.requireNonNull(row.getByteBuffer(DATA_VALUE.column())).array();
  }

  @Override
  public KeyValueIterator<Bytes, byte[]> range(
      final int kafkaPartition,
      final Bytes from,
      final Bytes to,
      long streamTimeMs
  ) {
    throw new UnsupportedOperationException("range scans are not supported on fact tables.");
  }

  @Override
  public KeyValueIterator<Bytes, byte[]> all(
      final int kafkaPartition,
      long streamTimeMs
  ) {
    throw new UnsupportedOperationException("all is not supported on fact tables");
  }

  private static String metadataTable(final String tableName) {
    return tableName + "_md";
  }

}