net.snowflake.ingest.streaming.internal.ParquetRowBuffer
/*
* Copyright (c) 2022 Snowflake Computing Inc. All rights reserved.
*/
package net.snowflake.ingest.streaming.internal;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.nio.charset.StandardCharsets;
import java.time.ZoneId;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.function.Consumer;
import net.snowflake.client.jdbc.internal.google.common.collect.Sets;
import net.snowflake.ingest.connection.TelemetryService;
import net.snowflake.ingest.streaming.OffsetTokenVerificationFunction;
import net.snowflake.ingest.streaming.OpenChannelRequest;
import net.snowflake.ingest.utils.Constants;
import net.snowflake.ingest.utils.ErrorCode;
import net.snowflake.ingest.utils.SFException;
import org.apache.parquet.hadoop.BdecParquetWriter;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.Type;
/**
* The buffer in the Streaming Ingest channel that holds the unflushed rows; these rows will be
* converted to Parquet format for faster processing.
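*
* <p>Rows are staged either as plain Java object lists (the {@code data} field) or directly
* inside a {@link BdecParquetWriter} when Parquet internal buffering is enabled; see
* {@code createFileWriter()} below.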
*/
public class ParquetRowBuffer extends AbstractRowBuffer {
private static final String PARQUET_MESSAGE_TYPE_NAME = "bdec";
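/* Map from a column's internal name to its Parquet column descriptor (schema index and primitive type). */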
private final Map<String, ParquetColumn> fieldIndex;
/* map that contains metadata like typeinfo for columns and other information needed by the server scanner */
private final Map<String, String> metadata;
/* Unflushed rows buffered as Java objects. Used when Parquet internal buffering (the memory optimization) is disabled. */
private final List<List<Object>> data;
/* BDEC Parquet writer. It is used to buffer unflushed data in Parquet internal buffers instead of using Java objects */
private BdecParquetWriter bdecParquetWriter;
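/* In-memory output stream that backs the BDEC Parquet writer when internal buffering is enabled. */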
private ByteArrayOutputStream fileOutput;
private final List<List<Object>> tempData;
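/* Parquet message schema built from the channel's column metadata in setupSchema(). */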
private MessageType schema;
/** Construct a ParquetRowBuffer object. */
ParquetRowBuffer(
OpenChannelRequest.OnErrorOption onErrorOption,
ZoneId defaultTimezone,
String fullyQualifiedChannelName,
Consumer<Float> rowSizeMetric,
ChannelRuntimeState channelRuntimeState,
ClientBufferParameters clientBufferParameters,
OffsetTokenVerificationFunction offsetTokenVerificationFunction,
TelemetryService telemetryService) {
super(
onErrorOption,
defaultTimezone,
fullyQualifiedChannelName,
rowSizeMetric,
channelRuntimeState,
clientBufferParameters,
offsetTokenVerificationFunction,
telemetryService);
this.fieldIndex = new HashMap<>();
this.metadata = new HashMap<>();
this.data = new ArrayList<>();
this.tempData = new ArrayList<>();
}
@Override
public void setupSchema(List<ColumnMetadata> columns) {
fieldIndex.clear();
metadata.clear();
metadata.put("sfVer", "1,1");
List<Type> parquetTypes = new ArrayList<>();
int id = 1;
for (ColumnMetadata column : columns) {
validateColumnCollation(column);
ParquetTypeGenerator.ParquetTypeInfo typeInfo =
ParquetTypeGenerator.generateColumnParquetTypeInfo(column, id);
parquetTypes.add(typeInfo.getParquetType());
this.metadata.putAll(typeInfo.getMetadata());
int columnIndex = parquetTypes.size() - 1;
fieldIndex.put(
column.getInternalName(),
new ParquetColumn(column, columnIndex, typeInfo.getPrimitiveTypeName()));
if (!column.getNullable()) {
addNonNullableFieldName(column.getInternalName());
}
this.statsMap.put(
column.getInternalName(),
new RowBufferStats(column.getName(), column.getCollation(), column.getOrdinal()));
if (onErrorOption == OpenChannelRequest.OnErrorOption.ABORT
|| onErrorOption == OpenChannelRequest.OnErrorOption.SKIP_BATCH) {
this.tempStatsMap.put(
column.getInternalName(),
new RowBufferStats(column.getName(), column.getCollation(), column.getOrdinal()));
}
id++;
}
schema = new MessageType(PARQUET_MESSAGE_TYPE_NAME, parquetTypes);
createFileWriter();
tempData.clear();
data.clear();
}
/** Create BDEC file writer if Parquet memory optimization is enabled. */
private void createFileWriter() {
fileOutput = new ByteArrayOutputStream();
try {
if (clientBufferParameters.getEnableParquetInternalBuffering()) {
bdecParquetWriter =
new BdecParquetWriter(
fileOutput,
schema,
metadata,
channelFullyQualifiedName,
clientBufferParameters.getMaxChunkSizeInBytes(),
clientBufferParameters.getBdecParquetCompression());
} else {
this.bdecParquetWriter = null;
}
data.clear();
} catch (IOException e) {
throw new SFException(ErrorCode.INTERNAL_ERROR, "cannot create parquet writer", e);
}
}
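/** Check whether the schema built by {@code setupSchema} contains a column with the given internal name. */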
@Override
boolean hasColumn(String name) {
return fieldIndex.containsKey(name);
}
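/**
* Add a single row to the buffer; the converted column values are handed to {@link
* #writeRow(List)} for buffering, and the provided per-column stats map is updated along the way.
*
* @return the size in bytes of the added row
*/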
@Override
float addRow(
Map<String, Object> row,
int bufferedRowIndex,
Map<String, RowBufferStats> statsMap,
Set<String> formattedInputColumnNames,
final long insertRowIndex) {
return addRow(row, this::writeRow, statsMap, formattedInputColumnNames, insertRowIndex);
}
/* Hand one converted row to the buffer: either stream it into the BDEC Parquet writer's internal buffers or keep it as Java objects until flush. */
void writeRow(List<Object> row) {
if (clientBufferParameters.getEnableParquetInternalBuffering()) {
bdecParquetWriter.writeRow(row);
} else {
data.add(row);
}
}