/*
* Copyright (c) 2021-2024 Snowflake Computing Inc. All rights reserved.
*/
package net.snowflake.ingest.streaming.internal;
import static net.snowflake.ingest.utils.Constants.INSERT_THROTTLE_MAX_RETRY_COUNT;
import static net.snowflake.ingest.utils.Constants.RESPONSE_SUCCESS;
import static net.snowflake.ingest.utils.ParameterProvider.MAX_MEMORY_LIMIT_IN_BYTES_DEFAULT;
import com.google.common.annotations.VisibleForTesting;
import java.time.ZoneId;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.CompletableFuture;
import java.util.stream.Collectors;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import net.snowflake.ingest.streaming.DropChannelRequest;
import net.snowflake.ingest.streaming.InsertValidationResponse;
import net.snowflake.ingest.streaming.OffsetTokenVerificationFunction;
import net.snowflake.ingest.streaming.OpenChannelRequest;
import net.snowflake.ingest.streaming.SnowflakeStreamingIngestChannel;
import net.snowflake.ingest.utils.ErrorCode;
import net.snowflake.ingest.utils.Logging;
import net.snowflake.ingest.utils.SFException;
import net.snowflake.ingest.utils.Utils;
import org.apache.parquet.column.ParquetProperties;
/**
* The first version of implementation for SnowflakeStreamingIngestChannel
*
* @param <T> type of column data ({@link ParquetChunkData})
*/
class SnowflakeStreamingIngestChannelInternal<T> implements SnowflakeStreamingIngestChannel {
private static final Logging logger = new Logging(SnowflakeStreamingIngestChannelInternal.class);
// this context contains channel immutable identification and encryption attributes
private final ChannelFlushContext channelFlushContext;
// Reference to the row buffer
private final RowBuffer<T> rowBuffer;
private final long insertThrottleIntervalInMs;
private final int insertThrottleThresholdInBytes;
private final int insertThrottleThresholdInPercentage;
private final long maxMemoryLimitInBytes;
// Indicates whether the channel is closed
private volatile boolean isClosed;
// Reference to the client that owns this channel
private final SnowflakeStreamingIngestClientInternal<T> owningClient;
// State of the channel that will be shared with its underlying buffer
private final ChannelRuntimeState channelState;
// Internal map of column name -> column properties
private final Map<String, ColumnProperties> tableColumns;
// The latest cause of channel invalidation
private String invalidationCause;
private final MemoryInfoProvider memoryInfoProvider;
private volatile long freeMemoryInBytes = 0;
/** Default constructor */
SnowflakeStreamingIngestChannelInternal(
String name,
String dbName,
String schemaName,
String tableName,
String endOffsetToken,
Long channelSequencer,
Long rowSequencer,
@Nonnull SnowflakeStreamingIngestClientInternal<T> client,
String encryptionKey,
Long encryptionKeyId,
OpenChannelRequest.OnErrorOption onErrorOption,
ZoneId defaultTimezone,
OffsetTokenVerificationFunction offsetTokenVerificationFunction,
ParquetProperties.WriterVersion parquetWriterVersion) {
this.isClosed = false;
this.owningClient = client;
this.insertThrottleIntervalInMs =
this.owningClient.getParameterProvider().getInsertThrottleIntervalInMs();
this.insertThrottleThresholdInBytes =
this.owningClient.getParameterProvider().getInsertThrottleThresholdInBytes();
this.insertThrottleThresholdInPercentage =
this.owningClient.getParameterProvider().getInsertThrottleThresholdInPercentage();
this.maxMemoryLimitInBytes =
this.owningClient.getParameterProvider().getMaxMemoryLimitInBytes();
this.memoryInfoProvider = MemoryInfoProviderFromRuntime.getInstance();
this.channelFlushContext =
new ChannelFlushContext(
name, dbName, schemaName, tableName, channelSequencer, encryptionKey, encryptionKeyId);
this.channelState = new ChannelRuntimeState(endOffsetToken, rowSequencer, true);
this.rowBuffer =
AbstractRowBuffer.createRowBuffer(
onErrorOption,
defaultTimezone,
client.getParameterProvider().getBlobFormatVersion(),
getFullyQualifiedName(),
this::collectRowSize,
channelState,
new ClientBufferParameters(owningClient, parquetWriterVersion),
offsetTokenVerificationFunction,
parquetWriterVersion,
owningClient.getTelemetryService());
this.tableColumns = new HashMap<>();
logger.logInfo(
"Channel={} created for table={}",
this.channelFlushContext.getName(),
this.channelFlushContext.getTableName());
}
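// Usage sketch (illustrative): this internal class is not constructed directly by applications.
// A channel is normally obtained through the public client API, roughly as below; the client,
// database, schema, table and channel names here are placeholder assumptions.
//
//   SnowflakeStreamingIngestClient ingestClient =
//       SnowflakeStreamingIngestClientFactory.builder("MY_CLIENT").setProperties(props).build();
//   OpenChannelRequest request =
//       OpenChannelRequest.builder("MY_CHANNEL")
//           .setDBName("MY_DB")
//           .setSchemaName("MY_SCHEMA")
//           .setTableName("MY_TABLE")
//           .setOnErrorOption(OpenChannelRequest.OnErrorOption.CONTINUE)
//           .build();
//   SnowflakeStreamingIngestChannel channel = ingestClient.openChannel(request);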
/**
* Get the fully qualified channel name
*
* @return fully qualified name of the channel, in the format of
* dbName.schemaName.tableName.channelName
*/
@Override
public String getFullyQualifiedName() {
return channelFlushContext.getFullyQualifiedName();
}
/**
* Get the name of the channel
*
* @return name of the channel
*/
@Override
public String getName() {
return this.channelFlushContext.getName();
}
@Override
public String getDBName() {
return this.channelFlushContext.getDbName();
}
@Override
public String getSchemaName() {
return this.channelFlushContext.getSchemaName();
}
@Override
public String getTableName() {
return this.channelFlushContext.getTableName();
}
Long getChannelSequencer() {
return this.channelFlushContext.getChannelSequencer();
}
/** @return current state of the channel */
@VisibleForTesting
ChannelRuntimeState getChannelState() {
return this.channelState;
}
/**
* Get the fully qualified table name that the channel belongs to
*
* @return fully qualified table name, in the format of dbName.schemaName.tableName
*/
@Override
public String getFullyQualifiedTableName() {
return channelFlushContext.getFullyQualifiedTableName();
}
/**
* Get all the data needed to build the blob during flush
*
* @return a ChannelData object
*/
ChannelData<T> getData() {
ChannelData<T> data = this.rowBuffer.flush();
if (data != null) {
data.setChannelContext(channelFlushContext);
}
return data;
}
/** @return a boolean to indicate whether the channel is valid or not */
@Override
public boolean isValid() {
return this.channelState.isValid();
}
/** Mark the channel as invalid, and release resources */
void invalidate(String message, String invalidationCause) {
this.channelState.invalidate();
this.invalidationCause = invalidationCause;
this.rowBuffer.close("invalidate");
logger.logWarn(
"Channel is invalidated, name={}, channel sequencer={}, row sequencer={}, message={}",
getFullyQualifiedName(),
channelFlushContext.getChannelSequencer(),
channelState.getRowSequencer(),
message);
}
/** @return a boolean to indicate whether the channel is closed or not */
@Override
public boolean isClosed() {
return this.isClosed;
}
/** Mark the channel as closed */
void markClosed() {
this.isClosed = true;
logger.logInfo(
"Channel is marked as closed, name={}, channel sequencer={}, row sequencer={}",
getFullyQualifiedName(),
channelFlushContext.getChannelSequencer(),
channelState.getRowSequencer());
}
/**
* Flush all data in memory to persistent storage and register with a Snowflake table
*
* @param closing whether the flush is called as part of channel closing
* @return future which will be completed when the flushed data is registered
*/
CompletableFuture<Void> flush(boolean closing) {
// Skip this check for closing because we need to set the channel to closed first and then flush
// in case there are any leftover rows
if (isClosed() && !closing) {
throw new SFException(ErrorCode.CLOSED_CHANNEL, getFullyQualifiedName());
}
// Simply return if there is no data in the channel; this might not work if we support a public
// flush API since there could be a concurrent insert at the same time
if (this.rowBuffer.getSize() == 0) {
return CompletableFuture.completedFuture(null);
}
return this.owningClient.flush(false);
}
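// Illustrative note on the closing flag: close() marks the channel closed first and only then
// flushes leftover rows, so the closed-channel check above must be bypassed in that one case.
// Sketch of the resulting behavior:
//
//   channel.markClosed();
//   channel.flush(false); // throws SFException(ErrorCode.CLOSED_CHANNEL)
//   channel.flush(true);  // allowed during close(), flushes any remaining buffered rows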
/**
* Close the channel (this will flush in-flight buffered data)
*
* @return future which will be complete when the channel is closed
*/
@Override
public CompletableFuture<Void> close() {
return this.close(false);
}
@Override
public CompletableFuture<Void> close(boolean drop) {
checkValidation();
if (isClosed()) {
return CompletableFuture.completedFuture(null);
}
markClosed();
return flush(true)
.thenRunAsync(
() -> {
List<SnowflakeStreamingIngestChannelInternal<?>> uncommittedChannels =
this.owningClient.verifyChannelsAreFullyCommitted(
Collections.singletonList(this));
this.rowBuffer.close("close");
this.owningClient.removeChannelIfSequencersMatch(this);
// Throw an exception if the channel is invalid or has any uncommitted rows
if (!isValid() || !uncommittedChannels.isEmpty()) {
throw new SFException(
ErrorCode.CHANNELS_WITH_UNCOMMITTED_ROWS,
uncommittedChannels.stream()
.map(SnowflakeStreamingIngestChannelInternal::getFullyQualifiedName)
.collect(Collectors.toList()));
}
if (drop) {
DropChannelRequest.DropChannelRequestBuilder builder =
DropChannelRequest.builder(this.getChannelContext().getName())
.setDBName(this.getDBName())
.setTableName(this.getTableName())
.setSchemaName(this.getSchemaName());
this.owningClient.dropChannel(
new DropChannelVersionRequest(builder, this.getChannelSequencer()));
}
});
}
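// Usage sketch (illustrative): through the public SnowflakeStreamingIngestChannel interface,
// callers typically block on the returned future; a failed future surfaces invalidation or
// uncommitted-row errors as an SFException.
//
//   channel.close().get();                // flush, verify all rows are committed, then unregister
//   channel.close(/* drop */ true).get(); // additionally drop the server-side channel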
/**
* Set up the column fields and vectors using the column metadata from the server
*
* @param columns list of column metadata for the target table, as returned by the server
*/
// TODO: need to verify with the table schema when supporting sub-columns
void setupSchema(List<ColumnMetadata> columns) {
logger.logDebug("Setup schema for channel={}, schema={}", getFullyQualifiedName(), columns);
this.rowBuffer.setupSchema(columns);
columns.forEach(c -> tableColumns.putIfAbsent(c.getName(), new ColumnProperties(c)));
}
/**
* --------------------------------------------------------------------------------------------
* Insert one row into the channel
* --------------------------------------------------------------------------------------------
*/
/**
* The row is represented using a Map where the key is the column name and the value is the column value
*
* @param row object data to write
* @param offsetToken offset of given row, used for replay in case of failures
* @return insert response that possibly contains errors because of insertion failures
* @throws SFException when the channel is invalid or closed
*/
@Override
public InsertValidationResponse insertRow(Map<String, Object> row, String offsetToken) {
return insertRows(Collections.singletonList(row), offsetToken, offsetToken);
}
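// Usage sketch (illustrative) via the public interface; the column names and offset token below
// are placeholder assumptions.
//
//   Map<String, Object> row = new HashMap<>();
//   row.put("C_ID", 42);
//   row.put("C_NAME", "snowflake");
//   InsertValidationResponse response = channel.insertRow(row, "offset_42");
//   if (response.hasErrors()) {
//     throw response.getInsertErrors().get(0).getException();
//   }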
/**
* --------------------------------------------------------------------------------------------
* Insert a batch of rows into the channel
* --------------------------------------------------------------------------------------------
*/
/**
* Insert a batch of rows into the channel, where each row is represented using a Map whose key is
* the column name and whose value is the column value. See {@link
* SnowflakeStreamingIngestChannel#insertRow(Map, String)} for more information about accepted
* values.
*
* @param rows object data to write
* @param startOffsetToken start offset of the batch/row-set
* @param endOffsetToken end offset of the batch/row-set, used for replay in case of failures. It
*     could be null if you don't plan on replaying or can't replay
* @return insert response that possibly contains errors because of insertion failures
*/
@Override
public InsertValidationResponse insertRows(
Iterable<Map<String, Object>>