/*
 * Copyright (c) 2022 Snowflake Computing Inc. All rights reserved.
 */

package net.snowflake.ingest.streaming.internal;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.nio.charset.StandardCharsets;
import java.time.ZoneId;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.function.Consumer;
import net.snowflake.client.jdbc.internal.google.common.collect.Sets;
import net.snowflake.ingest.connection.TelemetryService;
import net.snowflake.ingest.streaming.OffsetTokenVerificationFunction;
import net.snowflake.ingest.streaming.OpenChannelRequest;
import net.snowflake.ingest.utils.Constants;
import net.snowflake.ingest.utils.ErrorCode;
import net.snowflake.ingest.utils.SFException;
import org.apache.parquet.hadoop.BdecParquetWriter;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.Type;

/**
 * The buffer in the Streaming Ingest channel that holds the un-flushed rows; these rows will be
 * converted to Parquet format for faster processing.
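 *
 * <p>Illustrative sketch of how the channel internals might drive this buffer; the runtime state
 * and buffer parameters are assumed to be supplied by the SDK:
 *
 * <pre>{@code
 * ParquetRowBuffer buffer =
 *     new ParquetRowBuffer(
 *         OpenChannelRequest.OnErrorOption.CONTINUE,
 *         ZoneId.of("UTC"),
 *         "db.schema.table.channel",
 *         rowSize -> {},           // row size metric consumer
 *         channelRuntimeState,     // assumed: provided by the owning channel
 *         clientBufferParameters,  // assumed: client buffer configuration
 *         null,                    // no offset token verification function
 *         null);                   // no telemetry service
 * buffer.setupSchema(columns);     // columns: List<ColumnMetadata> from the open-channel response
 * }</pre>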
 */
public class ParquetRowBuffer extends AbstractRowBuffer<ParquetChunkData> {
  private static final String PARQUET_MESSAGE_TYPE_NAME = "bdec";

  private final Map<String, ParquetColumn> fieldIndex;

  /* Map that contains metadata such as column type info and other information needed by the server scanner */
  private final Map<String, String> metadata;

  /* Unflushed rows as Java objects. Used when the Parquet memory optimization is disabled. */
  private final List<List<Object>> data;

  /* BDEC Parquet writer. It is used to buffer unflushed data in Parquet internal buffers instead of using Java objects */
  private BdecParquetWriter bdecParquetWriter;

  private ByteArrayOutputStream fileOutput;
  private final List<List<Object>> tempData;

  private MessageType schema;

  /** Construct a ParquetRowBuffer object. */
  ParquetRowBuffer(
      OpenChannelRequest.OnErrorOption onErrorOption,
      ZoneId defaultTimezone,
      String fullyQualifiedChannelName,
      Consumer<Float> rowSizeMetric,
      ChannelRuntimeState channelRuntimeState,
      ClientBufferParameters clientBufferParameters,
      OffsetTokenVerificationFunction offsetTokenVerificationFunction,
      TelemetryService telemetryService) {
    super(
        onErrorOption,
        defaultTimezone,
        fullyQualifiedChannelName,
        rowSizeMetric,
        channelRuntimeState,
        clientBufferParameters,
        offsetTokenVerificationFunction,
        telemetryService);
    this.fieldIndex = new HashMap<>();
    this.metadata = new HashMap<>();
    this.data = new ArrayList<>();
    this.tempData = new ArrayList<>();
  }

  @Override
  public void setupSchema(List<ColumnMetadata> columns) {
    fieldIndex.clear();
    metadata.clear();
    metadata.put("sfVer", "1,1");
    List<Type> parquetTypes = new ArrayList<>();
    int id = 1;
    for (ColumnMetadata column : columns) {
      validateColumnCollation(column);
      ParquetTypeGenerator.ParquetTypeInfo typeInfo =
          ParquetTypeGenerator.generateColumnParquetTypeInfo(column, id);
      parquetTypes.add(typeInfo.getParquetType());
      this.metadata.putAll(typeInfo.getMetadata());
      int columnIndex = parquetTypes.size() - 1;
      fieldIndex.put(
          column.getInternalName(),
          new ParquetColumn(column, columnIndex, typeInfo.getPrimitiveTypeName()));
      if (!column.getNullable()) {
        addNonNullableFieldName(column.getInternalName());
      }
      this.statsMap.put(
          column.getInternalName(),
          new RowBufferStats(column.getName(), column.getCollation(), column.getOrdinal()));

      if (onErrorOption == OpenChannelRequest.OnErrorOption.ABORT
          || onErrorOption == OpenChannelRequest.OnErrorOption.SKIP_BATCH) {
        this.tempStatsMap.put(
            column.getInternalName(),
            new RowBufferStats(column.getName(), column.getCollation(), column.getOrdinal()));
      }

      id++;
    }
    schema = new MessageType(PARQUET_MESSAGE_TYPE_NAME, parquetTypes);
    createFileWriter();
    tempData.clear();
    data.clear();
  }

  /** Create BDEC file writer if Parquet memory optimization is enabled. */
  private void createFileWriter() {
    fileOutput = new ByteArrayOutputStream();
    try {
      if (clientBufferParameters.getEnableParquetInternalBuffering()) {
        bdecParquetWriter =
            new BdecParquetWriter(
                fileOutput,
                schema,
                metadata,
                channelFullyQualifiedName,
                clientBufferParameters.getMaxChunkSizeInBytes(),
                clientBufferParameters.getBdecParquetCompression());
      } else {
        this.bdecParquetWriter = null;
      }
      data.clear();
    } catch (IOException e) {
      throw new SFException(ErrorCode.INTERNAL_ERROR, "cannot create parquet writer", e);
    }
  }

  @Override
  boolean hasColumn(String name) {
    return fieldIndex.containsKey(name);
  }

  @Override
  float addRow(
      Map<String, Object> row,
      int bufferedRowIndex,
      Map<String, RowBufferStats> statsMap,
      Set<String> formattedInputColumnNames,
      final long insertRowIndex) {
    return addRow(row, this::writeRow, statsMap, formattedInputColumnNames, insertRowIndex);
  }

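  /**
   * Write a single indexed row either directly into the BDEC Parquet writer (when Parquet internal
   * buffering is enabled) or into the in-memory list of Java objects.
   */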
  void writeRow(List<Object> row) {
    if (clientBufferParameters.getEnableParquetInternalBuffering()) {
      bdecParquetWriter.writeRow(row);
    } else {
      data.add(row);
    }
  }

  @Override
  float addTempRow(
      Map<String, Object> row,
      int curRowIndex,
      Map<String, RowBufferStats> statsMap,
      Set<String> formattedInputColumnNames,
      long insertRowIndex) {
    return addRow(row, tempData::add, statsMap, formattedInputColumnNames, insertRowIndex);
  }

  /**
   * Adds a row to the parquet buffer.
   *
   * @param row row to add
   * @param out internal buffer to add to
   * @param statsMap column stats map
   * @param inputColumnNames list of input column names after formatting
   * @param insertRowsCurrIndex Row index of the input Rows passed in {@link
   *     net.snowflake.ingest.streaming.SnowflakeStreamingIngestChannel#insertRows(Iterable,
   *     String)}
   * @return row size
   */
  private float addRow(
      Map<String, Object> row,
      Consumer<List<Object>> out,
      Map<String, RowBufferStats> statsMap,
      Set<String> inputColumnNames,
      long insertRowsCurrIndex) {
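    // Values are stored positionally; each column's value goes into the slot given by its index in
    // the Parquet schema.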
    Object[] indexedRow = new Object[fieldIndex.size()];
    float size = 0F;

    // Create new empty stats just for the current row.
    Map<String, RowBufferStats> forkedStatsMap = new HashMap<>();

    for (Map.Entry<String, Object> entry : row.entrySet()) {
      String key = entry.getKey();
      Object value = entry.getValue();
      String columnName = LiteralQuoteUtils.unquoteColumnName(key);
      ParquetColumn parquetColumn = fieldIndex.get(columnName);
      int colIndex = parquetColumn.index;
      RowBufferStats forkedStats = statsMap.get(columnName).forkEmpty();
      forkedStatsMap.put(columnName, forkedStats);
      ColumnMetadata column = parquetColumn.columnMetadata;
      ParquetValueParser.ParquetBufferValue valueWithSize =
          ParquetValueParser.parseColumnValueToParquet(
              value, column, parquetColumn.type, forkedStats, defaultTimezone, insertRowsCurrIndex);
      indexedRow[colIndex] = valueWithSize.getValue();
      size += valueWithSize.getSize();
    }

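    // Round the fractional size estimate up to whole bytes before enforcing the per-row size limit.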
    long rowSizeRoundedUp = Double.valueOf(Math.ceil(size)).longValue();

    if (rowSizeRoundedUp > clientBufferParameters.getMaxAllowedRowSizeInBytes()) {
      throw new SFException(
          ErrorCode.MAX_ROW_SIZE_EXCEEDED,
          String.format(
              "rowSizeInBytes:%.3f, maxAllowedRowSizeInBytes:%d, rowIndex:%d",
              size, clientBufferParameters.getMaxAllowedRowSizeInBytes(), insertRowsCurrIndex));
    }

    out.accept(Arrays.asList(indexedRow));

    // All input values passed validation; iterate over the columns again and combine their
    // existing statistics with the forked statistics for the current row.
    for (Map.Entry<String, RowBufferStats> forkedColStats : forkedStatsMap.entrySet()) {
      String columnName = forkedColStats.getKey();
      statsMap.put(
          columnName,
          RowBufferStats.getCombinedStats(statsMap.get(columnName), forkedColStats.getValue()));
    }

    // Increment the null count for columns missing from the input map
    for (String columnName : Sets.difference(this.fieldIndex.keySet(), inputColumnNames)) {
      statsMap.get(columnName).incCurrentNullCount();
    }
    return size;
  }

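  /** Move rows collected by {@link #addTempRow} from the temporary buffer into the actual buffer. */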
  @Override
  void moveTempRowsToActualBuffer(int tempRowCount) {
    tempData.forEach(this::writeRow);
  }

  @Override
  void clearTempRows() {
    tempData.clear();
  }

  @Override
  boolean hasColumns() {
    return !fieldIndex.isEmpty();
  }

  @Override
  Optional<ParquetChunkData> getSnapshot(final String filePath) {
    // We insert the filename in the file itself as metadata so that streams can work on replicated
    // mixed tables. For a more detailed discussion on the topic see SNOW-561447 and
    // http://go/streams-on-replicated-mixed-tables
    metadata.put(Constants.PRIMARY_FILE_ID_KEY, StreamingIngestUtils.getShortname(filePath));

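    // When rows are buffered as Java objects (Parquet memory optimization disabled), copy them
    // into the snapshot; otherwise the buffered data already lives inside the BDEC Parquet writer.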
    List<List<Object>> oldData = new ArrayList<>();
    if (!clientBufferParameters.getEnableParquetInternalBuffering()) {
      data.forEach(r -> oldData.add(new ArrayList<>(r)));
    }
    return bufferedRowCount <= 0
        ? Optional.empty()
        : Optional.of(new ParquetChunkData(oldData, bdecParquetWriter, fileOutput, metadata));
  }

  /** Used only for testing. */
  @Override
  Object getVectorValueAt(String column, int index) {
    if (data == null) {
      return null;
    }
    int colIndex = fieldIndex.get(column).index;
    Object value = data.get(index).get(colIndex);
    ColumnMetadata columnMetadata = fieldIndex.get(column).columnMetadata;
    String physicalTypeStr = columnMetadata.getPhysicalType();
    ColumnPhysicalType physicalType = ColumnPhysicalType.valueOf(physicalTypeStr);
    String logicalTypeStr = columnMetadata.getLogicalType();
    ColumnLogicalType logicalType = ColumnLogicalType.valueOf(logicalTypeStr);
    if (logicalType == ColumnLogicalType.FIXED) {
      if (physicalType == ColumnPhysicalType.SB1) {
        value = ((Integer) value).byteValue();
      }
      if (physicalType == ColumnPhysicalType.SB2) {
        value = ((Integer) value).shortValue();
      }
      if (physicalType == ColumnPhysicalType.SB16) {
        value = new BigDecimal(new BigInteger((byte[]) value), columnMetadata.getScale());
      }
    }
    if (logicalType == ColumnLogicalType.BINARY && value != null) {
      value = value instanceof String ? ((String) value).getBytes(StandardCharsets.UTF_8) : value;
    }
    return value;
  }

  @Override
  int getTempRowCount() {
    return tempData.size();
  }

  @Override
  void reset() {
    super.reset();
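    // Recreate the file output and BDEC writer (if enabled) so the next batch starts from a clean state.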
    createFileWriter();
    data.clear();
  }

  /** Close the row buffer by releasing its internal resources. */
  @Override
  void closeInternal() {
    if (bdecParquetWriter != null) {
      try {
        bdecParquetWriter.close();
      } catch (IOException e) {
        throw new SFException(ErrorCode.INTERNAL_ERROR, "Failed to close parquet writer", e);
      }
    }
  }

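  /**
   * Create a {@link ParquetFlusher} that serializes buffered chunk data using this buffer's schema
   * and the configured chunk size and compression settings.
   */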
  @Override
  public Flusher<ParquetChunkData> createFlusher() {
    return new ParquetFlusher(
        schema,
        clientBufferParameters.getEnableParquetInternalBuffering(),
        clientBufferParameters.getMaxChunkSizeInBytes(),
        clientBufferParameters.getBdecParquetCompression());
  }

  private static class ParquetColumn {
    final ColumnMetadata columnMetadata;
    final int index;
    final PrimitiveType.PrimitiveTypeName type;

    private ParquetColumn(
        ColumnMetadata columnMetadata, int index, PrimitiveType.PrimitiveTypeName type) {
      this.columnMetadata = columnMetadata;
      this.index = index;
      this.type = type;
    }
  }
}