/*
 * Copyright (c) 2022 Snowflake Computing Inc. All rights reserved.
 */

package org.apache.parquet.hadoop;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import net.snowflake.ingest.utils.Constants;
import net.snowflake.ingest.utils.ErrorCode;
import net.snowflake.ingest.utils.SFException;
import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.ParquetProperties;
import org.apache.parquet.column.values.factory.DefaultV1ValuesWriterFactory;
import org.apache.parquet.crypto.FileEncryptionProperties;
import org.apache.parquet.hadoop.api.WriteSupport;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.io.DelegatingPositionOutputStream;
import org.apache.parquet.io.OutputFile;
import org.apache.parquet.io.ParquetEncodingException;
import org.apache.parquet.io.PositionOutputStream;
import org.apache.parquet.io.api.Binary;
import org.apache.parquet.io.api.RecordConsumer;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType;

/**
 * BDEC specific parquet writer.
 *
 * <p>Resides in parquet package because it uses {@link InternalParquetRecordWriter} and {@link
 * CodecFactory} that are package private.
 */
public class BdecParquetWriter implements AutoCloseable {
  private final InternalParquetRecordWriter<List<Object>> writer;
  private final CodecFactory codecFactory;

  private long rowsWritten = 0;

  /**
   * Creates a BDEC specific parquet writer.
   *
   * @param stream output
   * @param schema row schema
   * @param extraMetaData extra metadata
   * @param channelName name of the channel that is using the writer
   * @throws IOException
   */
  public BdecParquetWriter(
      ByteArrayOutputStream stream,
      MessageType schema,
      Map<String, String> extraMetaData,
      String channelName,
      long maxChunkSizeInBytes,
      Constants.BdecParquetCompression bdecParquetCompression)
      throws IOException {
    OutputFile file = new ByteArrayOutputFile(stream, maxChunkSizeInBytes);
    ParquetProperties encodingProps = createParquetProperties();
    Configuration conf = new Configuration();
    WriteSupport<List<Object>> writeSupport =
        new BdecWriteSupport(schema, extraMetaData, channelName);
    WriteSupport.WriteContext writeContext = writeSupport.init(conf);
    ParquetFileWriter fileWriter =
        new ParquetFileWriter(
            file,
            schema,
            ParquetFileWriter.Mode.CREATE,
            Constants.MAX_BLOB_SIZE_IN_BYTES * 2,
            ParquetWriter.MAX_PADDING_SIZE_DEFAULT,
            encodingProps.getColumnIndexTruncateLength(),
            encodingProps.getStatisticsTruncateLength(),
            encodingProps.getPageWriteChecksumEnabled(),
            (FileEncryptionProperties) null);
    fileWriter.start();

    /*
     * Internally the parquet writer initialises CodecFactory with the configured page size. We set
     * the page size to the max chunk size, which is quite big in general. CodecFactory allocates a
     * byte buffer of that size on the heap during initialisation.
     *
     * If we use the Parquet writer for buffering, there will be one writer per channel on each
     * flush. The memory will be allocated for each writer at the beginning even if we don't write
     * anything with it, which is the case when parquet writer buffering is enabled. Hence, to avoid
     * huge memory allocations, we have to internally initialise CodecFactory with
     * `ParquetWriter.DEFAULT_PAGE_SIZE`, as usually happens. To get access to this internal
     * initialisation, we have to place the BdecParquetWriter class in the parquet.hadoop package.
     */
    codecFactory = new CodecFactory(conf, ParquetWriter.DEFAULT_PAGE_SIZE);

    @SuppressWarnings("deprecation") // Parquet does not support the new one now
    CodecFactory.BytesCompressor compressor =
        codecFactory.getCompressor(bdecParquetCompression.getCompressionCodec());

    writer =
        new InternalParquetRecordWriter<>(
            fileWriter,
            writeSupport,
            schema,
            writeContext.getExtraMetaData(),
            Constants.MAX_BLOB_SIZE_IN_BYTES * 2,
            compressor,
            true,
            encodingProps);
  }

  /** @return List of row counts per block stored in the parquet footer */
  public List<Long> getRowCountsFromFooter() {
    final List<Long> blockRowCounts = new ArrayList<>();
    for (BlockMetaData metadata : writer.getFooter().getBlocks()) {
      blockRowCounts.add(metadata.getRowCount());
    }
    return blockRowCounts;
  }

  public void writeRow(List<Object> row) {
    try {
      writer.write(row);
      rowsWritten++;
    } catch (InterruptedException | IOException e) {
      throw new SFException(ErrorCode.INTERNAL_ERROR, "parquet row write failed", e);
    }
  }

  public long getRowsWritten() {
    return rowsWritten;
  }

  @Override
  public void close() throws IOException {
    try {
      writer.close();
    } catch (InterruptedException e) {
      throw new IOException(e);
    } finally {
      codecFactory.release();
    }
  }

  private static ParquetProperties createParquetProperties() {
    /*
     * There are two main limitations on the server side that we have to overcome by tweaking
     * Parquet limits:
     *
     * 1. The scanner supports only the case when the row number in all pages is the same. Remember
     * that a page has data from only one column.
     *
     * 2. The scanner supports only one row group per Parquet file.
     *
     * We can't guarantee that each page will have the same number of rows, because we have no
     * internal control over the Parquet lib. That's why, to satisfy 1., we generate one page per
     * column.
     *
     * To satisfy 1. and 2., we disable the check that decides when to flush buffered rows to row
     * groups and pages. The check happens after a configurable row count; by setting it to
     * Integer.MAX_VALUE, the Parquet lib never performs the check and flushes all buffered rows to
     * one row group on close(). The same check decides when to flush a row group to a page, so by
     * disabling it we also skip all page limits and flush all buffered data of the row group to one
     * page.
     *
     * TODO: Remove the enforcements of single row group SNOW-738040 and single page (per column)
     * SNOW-737331.
     * TODO: Revisit block and page size estimates after limitation (1) is removed SNOW-738614.
     */
    return ParquetProperties.builder()
        // PARQUET_2_0 uses Encoding.DELTA_BYTE_ARRAY for byte arrays (e.g. SF sb16);
        // the server side does not support it yet. TODO: SNOW-657238
        .withWriterVersion(ParquetProperties.WriterVersion.PARQUET_1_0)
        .withValuesWriterFactory(new DefaultV1ValuesWriterFactory())
        // the dictionary encoding (Encoding.*_DICTIONARY) is not supported by the server side
        // scanner yet
        .withDictionaryEncoding(false)
        .withPageRowCountLimit(Integer.MAX_VALUE)
        .withMinRowCountForPageSizeCheck(Integer.MAX_VALUE)
        .build();
  }

  /**
   * A parquet specific file output implementation.
   *
   * <p>This class is implemented as the parquet library API requires, mostly to create our {@link
   * ByteArrayDelegatingPositionOutputStream} implementation.
   */
  private static class ByteArrayOutputFile implements OutputFile {
    private final ByteArrayOutputStream stream;
    private final long maxChunkSizeInBytes;

    private ByteArrayOutputFile(ByteArrayOutputStream stream, long maxChunkSizeInBytes) {
      this.stream = stream;
      this.maxChunkSizeInBytes = maxChunkSizeInBytes;
    }

    @Override
    public PositionOutputStream create(long blockSizeHint) throws IOException {
      stream.reset();
      return new ByteArrayDelegatingPositionOutputStream(stream);
    }

    @Override
    public PositionOutputStream createOrOverwrite(long blockSizeHint) throws IOException {
      return create(blockSizeHint);
    }

    @Override
    public boolean supportsBlockSize() {
      return false;
    }

    @Override
    public long defaultBlockSize() {
      return maxChunkSizeInBytes;
    }
  }

  /**
   * A parquet specific output stream implementation.
   *
   * <p>This class is implemented as the parquet library API requires, mostly to wrap our BDEC
   * output {@link ByteArrayOutputStream}.
   */
  private static class ByteArrayDelegatingPositionOutputStream
      extends DelegatingPositionOutputStream {
    private final ByteArrayOutputStream stream;

    public ByteArrayDelegatingPositionOutputStream(ByteArrayOutputStream stream) {
      super(stream);
      this.stream = stream;
    }

    @Override
    public long getPos() {
      return stream.size();
    }
  }

  /**
   * A parquet specific write support implementation.
   *
   * <p>This class is implemented as the parquet library API requires, mostly to serialize user
   * column values depending on type into the Parquet {@link RecordConsumer} in {@link
   * BdecWriteSupport#write(List)}.
   */
  private static class BdecWriteSupport extends WriteSupport<List<Object>> {
    MessageType schema;
    RecordConsumer recordConsumer;
    Map<String, String> extraMetadata;
    private final String channelName;

    // TODO SNOW-672156: support specifying encodings and compression
    BdecWriteSupport(MessageType schema, Map<String, String> extraMetadata, String channelName) {
      this.schema = schema;
      this.extraMetadata = extraMetadata;
      this.channelName = channelName;
    }

    @Override
    public WriteContext init(Configuration config) {
      return new WriteContext(schema, extraMetadata);
    }

    @Override
    public void prepareForWrite(RecordConsumer recordConsumer) {
      this.recordConsumer = recordConsumer;
    }

    @Override
    public void write(List<Object> values) {
      List<ColumnDescriptor> cols = schema.getColumns();
      if (values.size() != cols.size()) {
        throw new ParquetEncodingException(
            "Invalid input data in channel '"
                + channelName
                + "'. Expecting "
                + cols.size()
                + " columns. Input had "
                + values.size()
                + " columns ("
                + cols
                + ") : "
                + values);
      }

      recordConsumer.startMessage();
      for (int i = 0; i < cols.size(); ++i) {
        Object val = values.get(i);
        // A null value is simply not written to the consumer, which Parquet records as NULL.
        if (val != null) {
          String fieldName = cols.get(i).getPath()[0];
          recordConsumer.startField(fieldName, i);
          PrimitiveType.PrimitiveTypeName typeName =
              cols.get(i).getPrimitiveType().getPrimitiveTypeName();
          switch (typeName) {
            case BOOLEAN:
              recordConsumer.addBoolean((boolean) val);
              break;
            case FLOAT:
              recordConsumer.addFloat((float) val);
              break;
            case DOUBLE:
              recordConsumer.addDouble((double) val);
              break;
            case INT32:
              recordConsumer.addInteger((int) val);
              break;
            case INT64:
              recordConsumer.addLong((long) val);
              break;
            case BINARY:
              Binary binVal =
                  val instanceof String
                      ? Binary.fromString((String) val)
                      : Binary.fromConstantByteArray((byte[]) val);
              recordConsumer.addBinary(binVal);
              break;
            case FIXED_LEN_BYTE_ARRAY:
              Binary binary = Binary.fromConstantByteArray((byte[]) val);
              recordConsumer.addBinary(binary);
              break;
            default:
              throw new ParquetEncodingException(
                  "Unsupported column type: " + cols.get(i).getPrimitiveType());
          }
          recordConsumer.endField(fieldName, i);
        }
      }
      recordConsumer.endMessage();
    }
  }
}
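Below is a minimal usage sketch, not part of the SDK source. It assumes a two-column schema built with Parquet's Types builder, an illustrative 16 MB max chunk size, a made-up channel name "example_channel", and the GZIP constant of the SDK's Constants.BdecParquetCompression enum; substitute your channel's actual schema, sizes, and compression codec.

// A minimal usage sketch of BdecParquetWriter; names and sizes below are illustrative assumptions.
import java.io.ByteArrayOutputStream;
import java.util.Arrays;
import java.util.HashMap;
import net.snowflake.ingest.utils.Constants;
import org.apache.parquet.hadoop.BdecParquetWriter;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.Types;

public class BdecParquetWriterExample {
  public static void main(String[] args) throws Exception {
    // Two-column schema: a required INT32 "id" and an optional BINARY "name".
    // BdecWriteSupport converts String values to Binary via Binary.fromString.
    MessageType schema =
        Types.buildMessage()
            .required(PrimitiveType.PrimitiveTypeName.INT32)
            .named("id")
            .optional(PrimitiveType.PrimitiveTypeName.BINARY)
            .named("name")
            .named("bdec_example");

    ByteArrayOutputStream stream = new ByteArrayOutputStream();
    BdecParquetWriter writer =
        new BdecParquetWriter(
            stream,
            schema,
            new HashMap<>(), // extra footer metadata
            "example_channel", // channel name, used here only in error messages
            16 * 1024 * 1024, // illustrative max chunk size in bytes (assumption)
            Constants.BdecParquetCompression.GZIP); // assumed compression constant

    // One value per column, in schema order; null leaves the optional column NULL.
    writer.writeRow(Arrays.<Object>asList(1, "Alice"));
    writer.writeRow(Arrays.<Object>asList(2, null));
    writer.close();

    // After close() the stream holds a complete Parquet file with a single row group.
    byte[] parquetBytes = stream.toByteArray();
    System.out.println(
        "rows written: " + writer.getRowsWritten() + ", bytes: " + parquetBytes.length);
  }
}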