package com.snowflake.kafka.connector.internal.streaming;
import static com.snowflake.kafka.connector.SnowflakeSinkConnectorConfig.BUFFER_SIZE_BYTES_DEFAULT;
import static com.snowflake.kafka.connector.SnowflakeSinkConnectorConfig.SNOWFLAKE_ROLE;
import static com.snowflake.kafka.connector.internal.streaming.StreamingUtils.STREAMING_BUFFER_COUNT_RECORDS_DEFAULT;
import static com.snowflake.kafka.connector.internal.streaming.StreamingUtils.STREAMING_BUFFER_FLUSH_TIME_DEFAULT_SEC;
import static com.snowflake.kafka.connector.internal.streaming.TopicPartitionChannel.NO_OFFSET_TOKEN_REGISTERED_IN_SNOWFLAKE;
import com.codahale.metrics.MetricRegistry;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Strings;
import com.snowflake.kafka.connector.SnowflakeSinkConnectorConfig;
import com.snowflake.kafka.connector.Utils;
import com.snowflake.kafka.connector.dlq.KafkaRecordErrorReporter;
import com.snowflake.kafka.connector.internal.KCLogger;
import com.snowflake.kafka.connector.internal.SnowflakeConnectionService;
import com.snowflake.kafka.connector.internal.SnowflakeErrors;
import com.snowflake.kafka.connector.internal.SnowflakeSinkService;
import com.snowflake.kafka.connector.internal.metrics.MetricsJmxReporter;
import com.snowflake.kafka.connector.internal.telemetry.SnowflakeTelemetryService;
import com.snowflake.kafka.connector.records.RecordService;
import com.snowflake.kafka.connector.records.SnowflakeMetadataConfig;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import java.util.Optional;
import net.snowflake.ingest.streaming.SnowflakeStreamingIngestClient;
import org.apache.kafka.common.TopicPartition;
import org.apache.kafka.connect.sink.SinkRecord;
import org.apache.kafka.connect.sink.SinkTaskContext;
/**
* This is per-task configuration. A task can be assigned multiple partitions. The major methods
* are startPartition, insert, getOffset and close.
*
* <p>StartPartition: Called when partitions are assigned. Responsible for generating the POJOs.
*
* <p>Insert and getOffset are called when {@link
* com.snowflake.kafka.connector.SnowflakeSinkTask#put(Collection)} and {@link
* com.snowflake.kafka.connector.SnowflakeSinkTask#preCommit(Map)} APIs are called.
*
* <p>This implementation of SinkService uses Snowpipe Streaming (streaming ingestion).
*
* <p>Hence this initializes, opens and closes the channel. The StreamingIngestChannel resides inside
* {@link TopicPartitionChannel} which is per partition.
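*
* <p>A minimal sketch of how the owning SinkTask typically drives this service; the task wiring
* and the variable names outside this class are illustrative:
*
* <pre>{@code
* SnowflakeSinkService service = new SnowflakeSinkServiceV2(conn, connectorConfig);
* service.startPartition(tableName, new TopicPartition("orders", 0)); // open(): create channel
* service.insert(records);                                            // put(): buffer and flush
* long offset = service.getOffset(new TopicPartition("orders", 0));   // preCommit(): safe offset
* service.close(partitions);                                          // rebalance: close channels
* }</pre>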
*/
public class SnowflakeSinkServiceV2 implements SnowflakeSinkService {
private static final KCLogger LOGGER = new KCLogger(SnowflakeSinkServiceV2.class.getName());
// The next three values are thresholds after which we will call the insertRows API
// Set in config (Time based flush) in seconds
private long flushTimeSeconds;
// Set in config (buffer size based flush) in bytes
private long fileSizeBytes;
// Set in config (Threshold before we call insertRows API) corresponds to # of
// records in kafka
private long recordNum;
// Used to connect to Snowflake, could be null during testing
private final SnowflakeConnectionService conn;
private final RecordService recordService;
private final SnowflakeTelemetryService telemetryService;
private Map<String, String> topicToTableMap;
// Behavior to be set at connector start. (For tombstone records)
private SnowflakeSinkConnectorConfig.BehaviorOnNullValues behaviorOnNullValues;
// Default is true unless the configuration provides false.
// If this is true, we will enable MBeans for required classes and emit JMX metrics for monitoring
private boolean enableCustomJMXMonitoring = SnowflakeSinkConnectorConfig.JMX_OPT_DEFAULT;
private MetricsJmxReporter metricsJmxReporter;
/**
* Fetching this from {@link org.apache.kafka.connect.sink.SinkTaskContext}'s {@link
* org.apache.kafka.connect.sink.ErrantRecordReporter}
*/
private KafkaRecordErrorReporter kafkaRecordErrorReporter;
/* SinkTaskContext has access to all methods/APIs available to talk to Kafka Connect runtime*/
private SinkTaskContext sinkTaskContext;
// ------ Streaming Ingest ------ //
// needs URL, username, p8 private key, role name
private SnowflakeStreamingIngestClient streamingIngestClient;
// Config set in JSON
private final Map<String, String> connectorConfig;
private boolean enableSchematization;
/**
* Key is formulated in {@link #partitionChannelKey(String, String, int)}
*
* <p>Value is the Streaming Ingest Channel implementation (wrapped around TopicPartitionChannel)
*/
private final Map<String, TopicPartitionChannel> partitionsToChannel;
// Cache for schema evolution
private final Map<String, Boolean> tableName2SchemaEvolutionPermission;
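/**
* Creates the sink service with buffer thresholds initialized to the streaming defaults. The
* streaming ingest client is obtained from {@link StreamingClientProvider}.
*
* @param conn connection to Snowflake; must be non-null and open
* @param connectorConfig connector configuration as provided to the task
*/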
public SnowflakeSinkServiceV2(
SnowflakeConnectionService conn, Map<String, String> connectorConfig) {
if (conn == null || conn.isClosed()) {
throw SnowflakeErrors.ERROR_5010.getException();
}
this.fileSizeBytes = StreamingUtils.STREAMING_BUFFER_BYTES_DEFAULT;
this.recordNum = StreamingUtils.STREAMING_BUFFER_COUNT_RECORDS_DEFAULT;
this.flushTimeSeconds = StreamingUtils.STREAMING_BUFFER_FLUSH_TIME_DEFAULT_SEC;
this.conn = conn;
this.telemetryService = conn.getTelemetryClient();
this.recordService = new RecordService(this.telemetryService);
this.topicToTableMap = new HashMap<>();
// Setting the default value in constructor
// meaning it will not ignore null values (tombstone records won't be ignored/filtered)
this.behaviorOnNullValues = SnowflakeSinkConnectorConfig.BehaviorOnNullValues.DEFAULT;
this.connectorConfig = connectorConfig;
this.enableSchematization =
this.recordService.setAndGetEnableSchematizationFromConfig(this.connectorConfig);
this.streamingIngestClient =
StreamingClientProvider.getStreamingClientProviderInstance()
.getClient(this.connectorConfig);
this.partitionsToChannel = new HashMap<>();
this.tableName2SchemaEvolutionPermission = new HashMap<>();
// jmx
String connectorName =
Strings.isNullOrEmpty(this.conn.getConnectorName())
? "default_connector"
: this.conn.getConnectorName();
this.metricsJmxReporter = new MetricsJmxReporter(new MetricRegistry(), connectorName);
}
@VisibleForTesting
public SnowflakeSinkServiceV2(
long flushTimeSeconds,
long fileSizeBytes,
long recordNum,
SnowflakeConnectionService conn,
RecordService recordService,
SnowflakeTelemetryService telemetryService,
Map<String, String> topicToTableMap,
SnowflakeSinkConnectorConfig.BehaviorOnNullValues behaviorOnNullValues,
boolean enableCustomJMXMonitoring,
KafkaRecordErrorReporter kafkaRecordErrorReporter,
SinkTaskContext sinkTaskContext,
SnowflakeStreamingIngestClient streamingIngestClient,
Map<String, String> connectorConfig,
boolean enableSchematization,
Map<String, TopicPartitionChannel> partitionsToChannel) {
this.flushTimeSeconds = flushTimeSeconds;
this.fileSizeBytes = fileSizeBytes;
this.recordNum = recordNum;
this.conn = conn;
this.recordService = recordService;
this.telemetryService = telemetryService;
this.topicToTableMap = topicToTableMap;
this.behaviorOnNullValues = behaviorOnNullValues;
this.enableCustomJMXMonitoring = enableCustomJMXMonitoring;
this.kafkaRecordErrorReporter = kafkaRecordErrorReporter;
this.sinkTaskContext = sinkTaskContext;
this.connectorConfig = connectorConfig;
// The client is always fetched from the shared provider (the injected client parameter is unused).
this.streamingIngestClient =
StreamingClientProvider.getStreamingClientProviderInstance()
.getClient(this.connectorConfig);
this.enableSchematization = enableSchematization;
this.partitionsToChannel = partitionsToChannel;
this.tableName2SchemaEvolutionPermission = new HashMap<>();
if (this.topicToTableMap != null) {
this.topicToTableMap.forEach(
(topic, tableName) -> {
populateSchemaEvolutionPermissions(tableName);
});
}
}
/**
* Creates the table if it doesn't exist in Snowflake.
*
* <p>Initializes the channel and the partitionsToChannel map with a new instance of {@link
* TopicPartitionChannel}
*
* @param tableName destination table name
* @param topicPartition TopicPartition passed from Kafka
*/
@Override
public void startPartition(String tableName, TopicPartition topicPartition) {
// the table should be present before opening a channel so let's do a table existence check here
createTableIfNotExists(tableName);
// Create channel for the given partition
createStreamingChannelForTopicPartition(
tableName, topicPartition, tableName2SchemaEvolutionPermission.get(tableName));
}
/**
* Initializes multiple channels and the partitionsToChannel map with new instances of {@link
* TopicPartitionChannel}
*
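* <p>A hedged usage sketch; the topic, table and partition values are illustrative:
*
* <pre>{@code
* Map<String, String> topic2Table = new HashMap<>();
* topic2Table.put("orders", "ORDERS_TABLE");
* service.startPartitions(
*     Arrays.asList(new TopicPartition("orders", 0), new TopicPartition("orders", 1)),
*     topic2Table);
* }</pre>
*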
* @param partitions collection of topic partition
* @param topic2Table map of topic to table name
*/
@Override
public void startPartitions(
Collection<TopicPartition> partitions, Map<String, String> topic2Table) {
partitions.forEach(
tp -> {
String tableName = Utils.tableName(tp.topic(), topic2Table);
createTableIfNotExists(tableName);
createStreamingChannelForTopicPartition(
tableName, tp, tableName2SchemaEvolutionPermission.get(tableName));
});
}
/**
* Always opens a new channel and creates a new instance of TopicPartitionChannel.
*
* <p>This is essentially a blind write to partitionsToChannel, i.e. we do not check whether an
* entry is already present.
*/
private void createStreamingChannelForTopicPartition(
final String tableName,
final TopicPartition topicPartition,
boolean hasSchemaEvolutionPermission) {
final String partitionChannelKey =
partitionChannelKey(
conn.getConnectorName(), topicPartition.topic(), topicPartition.partition());
// Create new instance of TopicPartitionChannel which will always open the channel.
partitionsToChannel.put(
partitionChannelKey,
new TopicPartitionChannel(
this.streamingIngestClient,
topicPartition,
partitionChannelKey, // Streaming channel name
tableName,
hasSchemaEvolutionPermission,
new StreamingBufferThreshold(this.flushTimeSeconds, this.fileSizeBytes, this.recordNum),
this.connectorConfig,
this.kafkaRecordErrorReporter,
this.sinkTaskContext,
this.conn,
this.recordService,
this.conn.getTelemetryClient(),
this.enableCustomJMXMonitoring,
this.metricsJmxReporter));
}
/**
* Inserts the given records into the buffer and eventually calls the insertRows API once a buffer
* threshold is reached.
*
* <p>TODO: SNOW-473896 - Please note we will do away with the buffering logic in future commits.
*
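* <p>Illustrative call path; the task-side variable names are assumptions:
*
* <pre>{@code
* // inside SnowflakeSinkTask#put(Collection<SinkRecord>)
* sinkService.insert(records); // buffers per partition; flushes on count/bytes thresholds
* // a time-based flush is also evaluated for every owned partition on each call
* }</pre>
*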
* @param records records coming from Kafka. Please note, they are not necessarily from a single
* topic and partition; it depends on the Kafka Connect worker node, which can consume from
* multiple topics and multiple partitions
*/
@Override
public void insert(Collection<SinkRecord> records) {
// note that records can be empty, but we still need to check for a time-based flush
for (SinkRecord record : records) {
// check if need to handle null value records
if (recordService.shouldSkipNullValue(record, behaviorOnNullValues)) {
continue;
}
// While inserting into buffer, we will check for count threshold and buffered bytes
// threshold.
insert(record);
}
// check all partitions to see if they need to be flushed based on time
for (TopicPartitionChannel partitionChannel : partitionsToChannel.values()) {
// Time based flushing
partitionChannel.insertBufferedRecordsIfFlushTimeThresholdReached();
}
}
/**
* Inserts an individual record into the buffer. It fetches the TopicPartitionChannel from the
* map, and then that partition (streaming channel) calls its respective insertRows API
*
* @param record record content
*/
@Override
public void insert(SinkRecord record) {
String partitionChannelKey =
partitionChannelKey(this.conn.getConnectorName(), record.topic(), record.kafkaPartition());
// init a new topic partition if it's not present in the cache or if the channel is closed
if (!partitionsToChannel.containsKey(partitionChannelKey)
|| partitionsToChannel.get(partitionChannelKey).isChannelClosed()) {
LOGGER.warn(
"Topic: {} Partition: {} hasn't been initialized by OPEN function",
record.topic(),
record.kafkaPartition());
startPartition(
Utils.tableName(record.topic(), this.topicToTableMap),
new TopicPartition(record.topic(), record.kafkaPartition()));
}
TopicPartitionChannel channelPartition = partitionsToChannel.get(partitionChannelKey);
channelPartition.insertRecordToBuffer(record);
}
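/**
* Returns the offset that is safe to commit back to Kafka for the given partition. If no channel
* has been opened for the partition, returns {@code NO_OFFSET_TOKEN_REGISTERED_IN_SNOWFLAKE}.
*/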
@Override
public long getOffset(TopicPartition topicPartition) {
String partitionChannelKey =
partitionChannelKey(
conn.getConnectorName(), topicPartition.topic(), topicPartition.partition());
if (partitionsToChannel.containsKey(partitionChannelKey)) {
long offset = partitionsToChannel.get(partitionChannelKey).getOffsetSafeToCommitToKafka();
partitionsToChannel.get(partitionChannelKey).setLatestConsumerOffset(offset);
return offset;
} else {
LOGGER.warn(
"Topic: {} Partition: {} hasn't been initialized to get offset",
topicPartition.topic(),
topicPartition.partition());
return NO_OFFSET_TOKEN_REGISTERED_IN_SNOWFLAKE;
}
}
@Override
public int getPartitionCount() {
return partitionsToChannel.size();
}
@Override
public void callAllGetOffset() {
// undefined
}
@Override
public void closeAll() {
partitionsToChannel.forEach(
(partitionChannelKey, topicPartitionChannel) -> {
LOGGER.info("Closing partition channel:{}", partitionChannelKey);
topicPartitionChannel.closeChannel();
});
partitionsToChannel.clear();
StreamingClientProvider.getStreamingClientProviderInstance()
.closeClient(this.streamingIngestClient);
}
/**
* This function is called during rebalance.
*
* <p>All the channels are closed. The client is still active. Upon rebalance (inside {@link
* com.snowflake.kafka.connector.SnowflakeSinkTask#open(Collection)}) we will reopen the channels.
*
* <p>We will wipe the partitionsToChannel cache so that in {@link
* com.snowflake.kafka.connector.SnowflakeSinkTask#open(Collection)} we reinstantiate and fetch the
* offsetToken
*
* @param partitions a list of topic partition
*/
@Override
public void close(Collection<TopicPartition> partitions) {
partitions.forEach(
topicPartition -> {
final String partitionChannelKey =
partitionChannelKey(
conn.getConnectorName(), topicPartition.topic(), topicPartition.partition());
TopicPartitionChannel topicPartitionChannel =
partitionsToChannel.get(partitionChannelKey);
// Check for null since it's possible that something goes wrong even before the
// channels are created
if (topicPartitionChannel != null) {
topicPartitionChannel.closeChannel();
}
LOGGER.info(
"Closing partitionChannel:{}, partition:{}, topic:{}",
topicPartitionChannel == null ? null : topicPartitionChannel.getChannelName(),
topicPartition.topic(),
topicPartition.partition());
partitionsToChannel.remove(partitionChannelKey);
});
LOGGER.info(
"Closing {} partitions and remaining partitions which are not closed are:{}, with size:{}",
partitions.size(),
partitionsToChannel.keySet().toString(),
partitionsToChannel.size());
}
@Override
public void setIsStoppedToTrue() {}
/* Undefined */
@Override
public boolean isClosed() {
return false;
}
@Override
public void setRecordNumber(long num) {
if (num < 0) {
LOGGER.error("number of record in each file is {}, it is negative, reset to 0", num);
this.recordNum = STREAMING_BUFFER_COUNT_RECORDS_DEFAULT;
} else {
this.recordNum = num;
LOGGER.info("Set number of records for buffer threshold to {}", num);
}
}
/**
* This is the buffer size in bytes, since this is streaming ingestion
*
* @param size in bytes - a non-negative long representing the size of the internal buffer used
* for flushing.
*/
@Override
public void setFileSize(long size) {
if (size < SnowflakeSinkConnectorConfig.BUFFER_SIZE_BYTES_MIN) {
LOGGER.error(
"Buffer size is {} bytes, it is smaller than the minimum buffer "
+ "size {} bytes, reset to the default buffer size",
size,
SnowflakeSinkConnectorConfig.BUFFER_SIZE_BYTES_MIN);
this.fileSizeBytes = BUFFER_SIZE_BYTES_DEFAULT;
} else {
this.fileSizeBytes = size;
LOGGER.info("set buffer size limitation to {} bytes", size);
}
}
@Override
public void setTopic2TableMap(Map<String, String> topicToTableMap) {
this.topicToTableMap = topicToTableMap;
}
@Override
public void setFlushTime(long time) {
if (time < StreamingUtils.STREAMING_BUFFER_FLUSH_TIME_MINIMUM_SEC) {
LOGGER.error(
"flush time is {} seconds, it is smaller than the minimum "
+ "flush time {} seconds, reset to the default flush time",
time,
StreamingUtils.STREAMING_BUFFER_FLUSH_TIME_MINIMUM_SEC);
this.flushTimeSeconds = STREAMING_BUFFER_FLUSH_TIME_DEFAULT_SEC;
} else {
this.flushTimeSeconds = time;
LOGGER.info("set flush time to {} seconds", time);
}
}
@Override
public void setMetadataConfig(SnowflakeMetadataConfig configMap) {
this.recordService.setMetadataConfig(configMap);
}
@Override
public long getRecordNumber() {
return this.recordNum;
}
@Override
public long getFlushTime() {
return this.flushTimeSeconds;
}
/**
* This is more of a size in bytes of buffered records. It doesn't necessarily translate to the
* size of files created by Streaming Ingest, since those are compressed, so there is no 1:1
* mapping.
*/
@Override
public long getFileSize() {
return this.fileSizeBytes;
}
@Override
public void setBehaviorOnNullValuesConfig(
SnowflakeSinkConnectorConfig.BehaviorOnNullValues behavior) {
this.behaviorOnNullValues = behavior;
}
@Override
public void setCustomJMXMetrics(boolean enableJMX) {
this.enableCustomJMXMonitoring = enableJMX;
}
@Override
public SnowflakeSinkConnectorConfig.BehaviorOnNullValues getBehaviorOnNullValuesConfig() {
return this.behaviorOnNullValues;
}
/* Set this to send records to DLQ. */
@Override
public void setErrorReporter(KafkaRecordErrorReporter kafkaRecordErrorReporter) {
this.kafkaRecordErrorReporter = kafkaRecordErrorReporter;
}
@Override
public void setSinkTaskContext(SinkTaskContext sinkTaskContext) {
this.sinkTaskContext = sinkTaskContext;
}
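/**
* Returns the {@link MetricRegistry} of the channel identified by the given key, or {@link
* Optional#empty()} if no such channel is present in the cache.
*
* @param partitionChannelKey key formed in {@link #partitionChannelKey(String, String, int)}
*/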
@Override
public Optional<MetricRegistry> getMetricRegistry(String partitionChannelKey) {
return this.partitionsToChannel.containsKey(partitionChannelKey)
? Optional.of(
this.partitionsToChannel
.get(partitionChannelKey)
.getSnowflakeTelemetryChannelStatus()
.getMetricsJmxReporter()
.getMetricRegistry())
: Optional.empty();
}
/**
* Gets a unique identifier consisting of connector name, topic name and partition number.
*
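* <p>Example (the names are illustrative): {@code partitionChannelKey("myConnector", "orders", 0)}
* returns {@code "myConnector_orders_0"}.
*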
* @param connectorName Connector name is always unique. (Two connectors with the same name won't
* be allowed by the Connect framework.) Note: customers can run a connector with the same name
* in different Connect runtimes (like DEV or PROD)
* @param topic topic name
* @param partition partition number
* @return combination of connector name, topic and partition
*/
@VisibleForTesting
public static String partitionChannelKey(String connectorName, String topic, int partition) {
return connectorName + "_" + topic + "_" + partition;
}
/* Used for testing */
@VisibleForTesting
SnowflakeStreamingIngestClient getStreamingIngestClient() {
return StreamingClientProvider.getStreamingClientProviderInstance()
.getClient(this.connectorConfig);
}
/**
* Used for testing only
*
* @param topicPartitionChannelKey see {@link #partitionChannelKey(String, String, int)} for the
* key format
* @return TopicPartitionChannel wrapped in an Optional if present in the partitionsToChannel map,
* else an empty Optional
*/
@VisibleForTesting
protected Optional<TopicPartitionChannel> getTopicPartitionChannelFromCacheKey(
final String topicPartitionChannelKey) {
return Optional.ofNullable(this.partitionsToChannel.get(topicPartitionChannelKey));
}
// ------ Streaming Ingest Related Functions ------ //
private void createTableIfNotExists(final String tableName) {
if (this.conn.tableExist(tableName)) {
if (!this.enableSchematization) {
if (this.conn.isTableCompatible(tableName)) {
LOGGER.info("Using existing table {}.", tableName);
} else {
throw SnowflakeErrors.ERROR_5003.getException(
"table name: " + tableName, this.telemetryService);
}
} else {
this.conn.appendMetaColIfNotExist(tableName);
}
} else {
LOGGER.info("Creating new table {}.", tableName);
if (this.enableSchematization) {
// Always create the table with RECORD_METADATA only and rely on schema evolution to update
// the schema
this.conn.createTableWithOnlyMetadataColumn(tableName);
} else {
this.conn.createTable(tableName);
}
}
// Populate schema evolution cache if needed
populateSchemaEvolutionPermissions(tableName);
}
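/**
* Caches whether the configured role has schema evolution permission on the given table. The
* permission check runs only when schematization is enabled; otherwise false is cached.
*/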
private void populateSchemaEvolutionPermissions(String tableName) {
if (!tableName2SchemaEvolutionPermission.containsKey(tableName)) {
if (enableSchematization) {
tableName2SchemaEvolutionPermission.put(
tableName,
conn != null
&& conn.hasSchemaEvolutionPermission(
tableName, connectorConfig.get(SNOWFLAKE_ROLE)));
} else {
tableName2SchemaEvolutionPermission.put(tableName, false);
}
}
}
}