package com.snowflake.kafka.connector.internal.streaming;
import static com.snowflake.kafka.connector.SnowflakeSinkConnectorConfig.BUFFER_SIZE_BYTES_DEFAULT;
import static com.snowflake.kafka.connector.SnowflakeSinkConnectorConfig.SNOWFLAKE_ROLE;
import static com.snowflake.kafka.connector.internal.streaming.StreamingUtils.STREAMING_BUFFER_COUNT_RECORDS_DEFAULT;
import static com.snowflake.kafka.connector.internal.streaming.StreamingUtils.STREAMING_BUFFER_FLUSH_TIME_DEFAULT_SEC;
import static com.snowflake.kafka.connector.internal.streaming.TopicPartitionChannel.NO_OFFSET_TOKEN_REGISTERED_IN_SNOWFLAKE;
import com.codahale.metrics.MetricRegistry;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Strings;
import com.snowflake.kafka.connector.SnowflakeSinkConnectorConfig;
import com.snowflake.kafka.connector.Utils;
import com.snowflake.kafka.connector.dlq.KafkaRecordErrorReporter;
import com.snowflake.kafka.connector.internal.KCLogger;
import com.snowflake.kafka.connector.internal.SnowflakeConnectionService;
import com.snowflake.kafka.connector.internal.SnowflakeErrors;
import com.snowflake.kafka.connector.internal.SnowflakeSinkService;
import com.snowflake.kafka.connector.internal.metrics.MetricsJmxReporter;
import com.snowflake.kafka.connector.internal.telemetry.SnowflakeTelemetryService;
import com.snowflake.kafka.connector.records.RecordService;
import com.snowflake.kafka.connector.records.SnowflakeMetadataConfig;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import java.util.Optional;
import net.snowflake.ingest.streaming.SnowflakeStreamingIngestClient;
import org.apache.kafka.common.TopicPartition;
import org.apache.kafka.connect.sink.SinkRecord;
import org.apache.kafka.connect.sink.SinkTaskContext;
/**
* This is per-task configuration. A task can be assigned multiple partitions. The major methods
* are startPartition, insert, getOffset and close.
*
* <p>StartPartition: Called when partitions are assigned. Responsible for generating the POJOs.
*
* <p>Insert and getOffset are called when {@link
* com.snowflake.kafka.connector.SnowflakeSinkTask#put(Collection)} and {@link
* com.snowflake.kafka.connector.SnowflakeSinkTask#preCommit(Map)} APIs are called.
*
* <p>This implementation of SinkService uses Snowpipe Streaming (streaming ingestion).
*
* <p>Hence this initializes, opens and closes the channel. The StreamingIngestChannel resides inside
* {@link TopicPartitionChannel} which is per partition.
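*
* <p>A minimal sketch of how the owning SinkTask typically drives this service; the task wiring
* and the variable names outside this class are illustrative:
*
* <pre>{@code
* SnowflakeSinkService service = new SnowflakeSinkServiceV2(conn, connectorConfig);
* service.startPartition(tableName, new TopicPartition("orders", 0)); // open(): create channel
* service.insert(records);                                            // put(): buffer and flush
* long offset = service.getOffset(new TopicPartition("orders", 0));   // preCommit(): safe offset
* service.close(partitions);                                          // rebalance: close channels
* }</pre>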
*/
public class SnowflakeSinkServiceV2 implements SnowflakeSinkService {
private static final KCLogger LOGGER = new KCLogger(SnowflakeSinkServiceV2.class.getName());
// The next three values are thresholds after which we will call the insertRows API
// Set in config (Time based flush) in seconds
private long flushTimeSeconds;
// Set in config (buffer size based flush) in bytes
private long fileSizeBytes;
// Set in config (Threshold before we call insertRows API) corresponds to # of
// records in kafka
private long recordNum;
// Used to connect to Snowflake, could be null during testing
private final SnowflakeConnectionService conn;
private final RecordService recordService;
private final SnowflakeTelemetryService telemetryService;
private Map<String, String> topicToTableMap;
// Behavior to be set at connector start. (For tombstone records)
private SnowflakeSinkConnectorConfig.BehaviorOnNullValues behaviorOnNullValues;
// Default is true unless the configuration provides false.
// If this is true, we will enable MBeans for required classes and emit JMX metrics for monitoring
private boolean enableCustomJMXMonitoring = SnowflakeSinkConnectorConfig.JMX_OPT_DEFAULT;
private MetricsJmxReporter metricsJmxReporter;
/**
* Fetching this from {@link org.apache.kafka.connect.sink.SinkTaskContext}'s {@link
* org.apache.kafka.connect.sink.ErrantRecordReporter}
*/
private KafkaRecordErrorReporter kafkaRecordErrorReporter;
/* SinkTaskContext has access to all methods/APIs available to talk to Kafka Connect runtime*/
private SinkTaskContext sinkTaskContext;
// ------ Streaming Ingest ------ //
// needs URL, username, p8 private key, role name
private SnowflakeStreamingIngestClient streamingIngestClient;
// Config set in JSON
private final Map<String, String> connectorConfig;
private boolean enableSchematization;
/**
* Key is formulated in {@link #partitionChannelKey(String, String, int)}
*
* <p>Value is the Streaming Ingest Channel implementation (wrapped around TopicPartitionChannel)
*/
private final Map<String, TopicPartitionChannel> partitionsToChannel;
// Cache for schema evolution
private final Map<String, Boolean> tableName2SchemaEvolutionPermission;
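/**
* Creates the sink service with buffer thresholds initialized to the streaming defaults. The
* streaming ingest client is obtained from {@link StreamingClientProvider}.
*
* @param conn connection to Snowflake; must be non-null and open
* @param connectorConfig connector configuration as provided to the task
*/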
public SnowflakeSinkServiceV2(
SnowflakeConnectionService conn, Map<String, String> connectorConfig) {
if (conn == null || conn.isClosed()) {
throw SnowflakeErrors.ERROR_5010.getException();
}
this.fileSizeBytes = StreamingUtils.STREAMING_BUFFER_BYTES_DEFAULT;
this.recordNum = StreamingUtils.STREAMING_BUFFER_COUNT_RECORDS_DEFAULT;
this.flushTimeSeconds = StreamingUtils.STREAMING_BUFFER_FLUSH_TIME_DEFAULT_SEC;
this.conn = conn;
this.telemetryService = conn.getTelemetryClient();
this.recordService = new RecordService(this.telemetryService);
this.topicToTableMap = new HashMap<>();
// Setting the default value in constructor
// meaning it will not ignore null values (tombstone records won't be ignored/filtered)
this.behaviorOnNullValues = SnowflakeSinkConnectorConfig.BehaviorOnNullValues.DEFAULT;
this.connectorConfig = connectorConfig;
this.enableSchematization =
this.recordService.setAndGetEnableSchematizationFromConfig(this.connectorConfig);
this.streamingIngestClient =
StreamingClientProvider.getStreamingClientProviderInstance()
.getClient(this.connectorConfig);
this.partitionsToChannel = new HashMap<>();
this.tableName2SchemaEvolutionPermission = new HashMap<>();
// jmx
String connectorName =
Strings.isNullOrEmpty(this.conn.getConnectorName())
? "default_connector"
: this.conn.getConnectorName();
this.metricsJmxReporter = new MetricsJmxReporter(new MetricRegistry(), connectorName);
}
@VisibleForTesting
public SnowflakeSinkServiceV2(
long flushTimeSeconds,
long fileSizeBytes,
long recordNum,
SnowflakeConnectionService conn,
RecordService recordService,
SnowflakeTelemetryService telemetryService,
Map<String, String> topicToTableMap,
SnowflakeSinkConnectorConfig.BehaviorOnNullValues behaviorOnNullValues,
boolean enableCustomJMXMonitoring,
KafkaRecordErrorReporter kafkaRecordErrorReporter,
SinkTaskContext sinkTaskContext,
SnowflakeStreamingIngestClient streamingIngestClient,
Map<String, String> connectorConfig,
boolean enableSchematization,
Map<String, TopicPartitionChannel> partitionsToChannel) {
this.flushTimeSeconds = flushTimeSeconds;
this.fileSizeBytes = fileSizeBytes;
this.recordNum = recordNum;
this.conn = conn;
this.recordService = recordService;
this.telemetryService = telemetryService;
this.topicToTableMap = topicToTableMap;
this.behaviorOnNullValues = behaviorOnNullValues;
this.enableCustomJMXMonitoring = enableCustomJMXMonitoring;
this.kafkaRecordErrorReporter = kafkaRecordErrorReporter;
this.sinkTaskContext = sinkTaskContext;
this.connectorConfig = connectorConfig;
// The client is always fetched from the shared provider (the injected client parameter is unused).
this.streamingIngestClient =
StreamingClientProvider.getStreamingClientProviderInstance()
.getClient(this.connectorConfig);
this.enableSchematization = enableSchematization;
this.partitionsToChannel = partitionsToChannel;
this.tableName2SchemaEvolutionPermission = new HashMap<>();
if (this.topicToTableMap != null) {
this.topicToTableMap.forEach(
(topic, tableName) -> {
populateSchemaEvolutionPermissions(tableName);
});
}
}
/**
* Creates the table if it doesn't exist in Snowflake.
*
* <p>Initializes the channel and the partitionsToChannel map with a new instance of {@link
* TopicPartitionChannel}
*
* @param tableName destination table name
* @param topicPartition TopicPartition passed from Kafka
*/
@Override
public void startPartition(String tableName, TopicPartition topicPartition) {
// the table should be present before opening a channel so let's do a table existence check here
createTableIfNotExists(tableName);
// Create channel for the given partition
createStreamingChannelForTopicPartition(
tableName, topicPartition, tableName2SchemaEvolutionPermission.get(tableName));
}
/**
* Initializes multiple channels and the partitionsToChannel map with new instances of {@link
* TopicPartitionChannel}
*
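* <p>A hedged usage sketch; the topic, table and partition values are illustrative:
*
* <pre>{@code
* Map<String, String> topic2Table = new HashMap<>();
* topic2Table.put("orders", "ORDERS_TABLE");
* service.startPartitions(
*     Arrays.asList(new TopicPartition("orders", 0), new TopicPartition("orders", 1)),
*     topic2Table);
* }</pre>
*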
* @param partitions collection of topic partition
* @param topic2Table map of topic to table name
*/
@Override
public void startPartitions(
Collection<TopicPartition> partitions, Map<String, String> topic2Table) {
partitions.forEach(
tp -> {
String tableName = Utils.tableName(tp.topic(), topic2Table);
createTableIfNotExists(tableName);
createStreamingChannelForTopicPartition(
tableName, tp, tableName2SchemaEvolutionPermission.get(tableName));
});
}
/**
* Always opens a new channel and creates a new instance of TopicPartitionChannel.
*
* <p>This is essentially a blind write to partitionsToChannel, i.e. we do not check whether an
* entry is already present.
*/
private void createStreamingChannelForTopicPartition(
final String tableName,
final TopicPartition topicPartition,
boolean hasSchemaEvolutionPermission) {
final String partitionChannelKey =
partitionChannelKey(
conn.getConnectorName(), topicPartition.topic(), topicPartition.partition());
// Create new instance of TopicPartitionChannel which will always open the channel.
partitionsToChannel.put(
partitionChannelKey,
new TopicPartitionChannel(
this.streamingIngestClient,
topicPartition,
partitionChannelKey, // Streaming channel name
tableName,
hasSchemaEvolutionPermission,
new StreamingBufferThreshold(this.flushTimeSeconds, this.fileSizeBytes, this.recordNum),
this.connectorConfig,
this.kafkaRecordErrorReporter,
this.sinkTaskContext,
this.conn,
this.recordService,
this.conn.getTelemetryClient(),
this.enableCustomJMXMonitoring,
this.metricsJmxReporter));
}
/**
* Inserts the given records into the buffer and eventually calls the insertRows API once a buffer
* threshold is reached.
*
* <p>TODO: SNOW-473896 - Please note we will do away with the buffering logic in future commits.
*
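* <p>Illustrative call path; the task-side variable names are assumptions:
*
* <pre>{@code
* // inside SnowflakeSinkTask#put(Collection<SinkRecord>)
* sinkService.insert(records); // buffers per partition; flushes on count/bytes thresholds
* // a time-based flush is also evaluated for every owned partition on each call
* }</pre>
*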
* @param records records coming from Kafka. Please note, they are not necessarily from a single
* topic and partition; it depends on the Kafka Connect worker node, which can consume from
* multiple topics and multiple partitions
*/
@Override
public void insert(Collection<SinkRecord> records) {
// note that records can be empty, but we still need to check for a time-based flush
for (SinkRecord record : records) {
// check if need to handle null value records
if (recordService.shouldSkipNullValue(record, behaviorOnNullValues)) {
continue;
}
// While inserting into buffer, we will check for count threshold and buffered bytes
// threshold.
insert(record);
}
// check all partitions to see if they need to be flushed based on time
for (TopicPartitionChannel partitionChannel : partitionsToChannel.values()) {
// Time based flushing
partitionChannel.insertBufferedRecordsIfFlushTimeThresholdReached();
}
}
/**
* Inserts an individual record into the buffer. It fetches the TopicPartitionChannel from the
* map, and then that partition (streaming channel) calls its respective insertRows API
*
* @param record record content
*/
@Override
public void insert(SinkRecord record) {
String partitionChannelKey =
partitionChannelKey(this.conn.getConnectorName(), record.topic(), record.kafkaPartition());
// init a new topic partition if it's not present in the cache or if the channel is closed
if (!partitionsToChannel.containsKey(partitionChannelKey)
|| partitionsToChannel.get(partitionChannelKey).isChannelClosed()) {
LOGGER.warn(
"Topic: {} Partition: {} hasn't been initialized by OPEN function",
record.topic(),
record.kafkaPartition());
startPartition(
Utils.tableName(record.topic(), this.topicToTableMap),
new TopicPartition(record.topic(), record.kafkaPartition()));
}
TopicPartitionChannel channelPartition = partitionsToChannel.get(partitionChannelKey);
channelPartition.insertRecordToBuffer(record);
}
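/**
* Returns the offset that is safe to commit back to Kafka for the given partition. If no channel
* has been opened for the partition, returns {@code NO_OFFSET_TOKEN_REGISTERED_IN_SNOWFLAKE}.
*/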
@Override
public long getOffset(TopicPartition topicPartition) {
String partitionChannelKey =
partitionChannelKey(
conn.getConnectorName(), topicPartition.topic(), topicPartition.partition());
if (partitionsToChannel.containsKey(partitionChannelKey)) {
long offset = partitionsToChannel.get(partitionChannelKey).getOffsetSafeToCommitToKafka();
partitionsToChannel.get(partitionChannelKey).setLatestConsumerOffset(offset);
return offset;
} else {
LOGGER.warn(
"Topic: {} Partition: {} hasn't been initialized to get offset",
topicPartition.topic(),
topicPartition.partition());
return NO_OFFSET_TOKEN_REGISTERED_IN_SNOWFLAKE;
}
}
@Override
public int getPartitionCount() {
return partitionsToChannel.size();
}
@Override
public void callAllGetOffset() {
// undefined
}
@Override
public void closeAll() {
partitionsToChannel.forEach(
(partitionChannelKey, topicPartitionChannel) -> {
LOGGER.info("Closing partition channel:{}", partitionChannelKey);
topicPartitionChannel.closeChannel();
});
partitionsToChannel.clear();
StreamingClientProvider.getStreamingClientProviderInstance()
.closeClient(this.streamingIngestClient);
}
/**
* This function is called during rebalance.
*
* <p>All the channels are closed. The client is still active. Upon rebalance (inside {@link
* com.snowflake.kafka.connector.SnowflakeSinkTask#open(Collection)}) we will reopen the channels.
*
* <p>We will wipe the partitionsToChannel cache so that in {@link
* com.snowflake.kafka.connector.SnowflakeSinkTask#open(Collection)} we reinstantiate and fetch the
* offsetToken
*
* @param partitions a list of topic partition
*/
@Override
public void close(Collection<TopicPartition> partitions) {
partitions.forEach(
topicPartition -> {
final String partitionChannelKey =
partitionChannelKey(
conn.getConnectorName(), topicPartition.topic(), topicPartition.partition());
TopicPartitionChannel topicPartitionChannel =
partitionsToChannel.get(partitionChannelKey);
// Check for null since it's possible that something goes wrong even before the
// channels are created
if (topicPartitionChannel != null) {
topicPartitionChannel.closeChannel();
}
LOGGER.info(
"Closing partitionChannel:{}, partition:{}, topic:{}",
topicPartitionChannel == null ? null : topicPartitionChannel.getChannelName(),
topicPartition.topic(),
topicPartition.partition());
partitionsToChannel.remove(partitionChannelKey);
});
LOGGER.info(
"Closing {} partitions and remaining partitions which are not closed are:{}, with size:{}",
partitions.size(),
partitionsToChannel.keySet().toString(),
partitionsToChannel.size());
}
@Override
public void setIsStoppedToTrue() {}
/* Undefined */
@Override
public boolean isClosed() {
return false;
}
@Override
public void setRecordNumber(long num) {
if (num < 0) {
LOGGER.error("number of record in each file is {}, it is negative, reset to 0", num);
this.recordNum = STREAMING_BUFFER_COUNT_RECORDS_DEFAULT;
} else {
this.recordNum = num;
LOGGER.info("Set number of records for buffer threshold to {}", num);
}
}
/**
* This is the buffer size in bytes, since this is streaming ingestion
*
* @param size in bytes - a non-negative long representing the size of the internal buffer used
* for flushing.
*/
@Override
public void setFileSize(long size) {
if (size < SnowflakeSinkConnectorConfig.BUFFER_SIZE_BYTES_MIN) {
LOGGER.error(
"Buffer size is {} bytes, it is smaller than the minimum buffer "
+ "size {} bytes, reset to the default buffer size",
size,
SnowflakeSinkConnectorConfig.BUFFER_SIZE_BYTES_MIN);
this.fileSizeBytes = BUFFER_SIZE_BYTES_DEFAULT;
} else {
this.fileSizeBytes = size;
LOGGER.info("set buffer size limitation to {} bytes", size);
}
}
@Override
public void setTopic2TableMap(Map<String, String> topicToTableMap) {
this.topicToTableMap = topicToTableMap;
}
@Override
public void setFlushTime(long time) {
if (time < StreamingUtils.STREAMING_BUFFER_FLUSH_TIME_MINIMUM_SEC) {
LOGGER.error(
"flush time is {} seconds, it is smaller than the minimum "
+ "flush time {} seconds, reset to the default flush time",
time,
StreamingUtils.STREAMING_BUFFER_FLUSH_TIME_MINIMUM_SEC);
this.flushTimeSeconds = STREAMING_BUFFER_FLUSH_TIME_DEFAULT_SEC;
} else {
this.flushTimeSeconds = time;
LOGGER.info("set flush time to {} seconds", time);
}
}
@Override
public void setMetadataConfig(SnowflakeMetadataConfig configMap) {
this.recordService.setMetadataConfig(configMap);
}
@Override
public long getRecordNumber() {
return this.recordNum;
}
@Override
public long getFlushTime() {
return this.flushTimeSeconds;
}
/**
* This is more of a size in bytes of buffered records. It doesn't necessarily translate to the
* size of files created by Streaming Ingest, since those are compressed, so there is no 1:1
* mapping.
*/
@Override
public long getFileSize() {
return this.fileSizeBytes;
}
@Override
public void setBehaviorOnNullValuesConfig(
SnowflakeSinkConnectorConfig.BehaviorOnNullValues behavior) {
this.behaviorOnNullValues = behavior;
}
@Override
public void setCustomJMXMetrics(boolean enableJMX) {
this.enableCustomJMXMonitoring = enableJMX;
}
@Override
public SnowflakeSinkConnectorConfig.BehaviorOnNullValues getBehaviorOnNullValuesConfig() {
return this.behaviorOnNullValues;
}
/* Set this to send records to DLQ. */
@Override
public void setErrorReporter(KafkaRecordErrorReporter kafkaRecordErrorReporter) {
this.kafkaRecordErrorReporter = kafkaRecordErrorReporter;
}
@Override
public void setSinkTaskContext(SinkTaskContext sinkTaskContext) {
this.sinkTaskContext = sinkTaskContext;
}
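/**
* Returns the {@link MetricRegistry} of the channel identified by the given key, or {@link
* Optional#empty()} if no such channel is present in the cache.
*
* @param partitionChannelKey key formed in {@link #partitionChannelKey(String, String, int)}
*/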
@Override
public Optional<MetricRegistry> getMetricRegistry(String partitionChannelKey) {
return this.partitionsToChannel.containsKey(partitionChannelKey)
? Optional.of(
this.partitionsToChannel
.get(partitionChannelKey)
.getSnowflakeTelemetryChannelStatus()
.getMetricsJmxReporter()
.getMetricRegistry())
: Optional.empty();
}
/**
* Gets a unique identifier consisting of connector name, topic name and partition number.
*
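* <p>Example (the names are illustrative): {@code partitionChannelKey("myConnector", "orders", 0)}
* returns {@code "myConnector_orders_0"}.
*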
* @param connectorName Connector name is always unique. (Two connectors with the same name won't
* be allowed by the Connect framework.) Note: customers can run a connector with the same name
* in different Connect runtimes (like DEV or PROD)
* @param topic topic name
* @param partition partition number
* @return combination of connector name, topic and partition
*/
@VisibleForTesting
public static String partitionChannelKey(String connectorName, String topic, int partition) {
return connectorName + "_" + topic + "_" + partition;
}
/* Used for testing */
@VisibleForTesting
SnowflakeStreamingIngestClient getStreamingIngestClient() {
return StreamingClientProvider.getStreamingClientProviderInstance()
.getClient(this.connectorConfig);
}
/**
* Used for testing only
*
* @param topicPartitionChannelKey see {@link #partitionChannelKey(String, String, int)} for the
* key format
* @return TopicPartitionChannel wrapped in an Optional if present in the partitionsToChannel map,
* else an empty Optional
*/
@VisibleForTesting
protected Optional<TopicPartitionChannel> getTopicPartitionChannelFromCacheKey(
final String topicPartitionChannelKey) {
return Optional.ofNullable(this.partitionsToChannel.get(topicPartitionChannelKey));
}
// ------ Streaming Ingest Related Functions ------ //
private void createTableIfNotExists(final String tableName) {
if (this.conn.tableExist(tableName)) {
if (!this.enableSchematization) {
if (this.conn.isTableCompatible(tableName)) {
LOGGER.info("Using existing table {}.", tableName);
} else {
throw SnowflakeErrors.ERROR_5003.getException(
"table name: " + tableName, this.telemetryService);
}
} else {
this.conn.appendMetaColIfNotExist(tableName);
}
} else {
LOGGER.info("Creating new table {}.", tableName);
if (this.enableSchematization) {
// Always create the table with RECORD_METADATA only and rely on schema evolution to update
// the schema
this.conn.createTableWithOnlyMetadataColumn(tableName);
} else {
this.conn.createTable(tableName);
}
}
// Populate schema evolution cache if needed
populateSchemaEvolutionPermissions(tableName);
}
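/**
* Caches whether the configured role has schema evolution permission on the given table. The
* permission check runs only when schematization is enabled; otherwise false is cached.
*/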
private void populateSchemaEvolutionPermissions(String tableName) {
if (!tableName2SchemaEvolutionPermission.containsKey(tableName)) {
if (enableSchematization) {
tableName2SchemaEvolutionPermission.put(
tableName,
conn != null
&& conn.hasSchemaEvolutionPermission(
tableName, connectorConfig.get(SNOWFLAKE_ROLE)));
} else {
tableName2SchemaEvolutionPermission.put(tableName, false);
}
}
}
}