/*
* Copyright (c) 2022-2024 Snowflake Computing Inc. All rights reserved.
*/
package org.apache.parquet.hadoop;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import net.snowflake.ingest.utils.Constants;
import net.snowflake.ingest.utils.ErrorCode;
import net.snowflake.ingest.utils.SFException;
import org.apache.hadoop.conf.Configuration;
import org.apache.parquet.column.ParquetProperties;
import org.apache.parquet.column.values.factory.DefaultValuesWriterFactory;
import org.apache.parquet.crypto.FileEncryptionProperties;
import org.apache.parquet.hadoop.api.WriteSupport;
import org.apache.parquet.hadoop.metadata.BlockMetaData;
import org.apache.parquet.hadoop.metadata.ColumnChunkMetaData;
import org.apache.parquet.io.DelegatingPositionOutputStream;
import org.apache.parquet.io.OutputFile;
import org.apache.parquet.io.ParquetEncodingException;
import org.apache.parquet.io.PositionOutputStream;
import org.apache.parquet.io.api.Binary;
import org.apache.parquet.io.api.RecordConsumer;
import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.Type;
/**
 * Snowflake-specific Parquet writer; supports BDEC files for FDN tables and Parquet files for
 * Iceberg tables.
 *
 * <p>Resides in the parquet.hadoop package because it uses {@link InternalParquetRecordWriter} and
 * {@link CodecFactory}, which are package-private.
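 *
 * <p>A minimal usage sketch (illustrative only; the schema, chunk size, compression codec, and
 * writer version below are assumed placeholder values, and exception handling is omitted):
 *
 * <pre>{@code
 * MessageType schema =
 *     MessageTypeParser.parseMessageType("message example { optional binary c1 (UTF8); }");
 * ByteArrayOutputStream stream = new ByteArrayOutputStream();
 * SnowflakeParquetWriter writer =
 *     new SnowflakeParquetWriter(
 *         stream,
 *         schema,
 *         Collections.emptyMap(),
 *         "example_channel",
 *         32 * 1024 * 1024, // assumed max chunk size in bytes
 *         Optional.of(1), // allow at most one row group per file
 *         Constants.BdecParquetCompression.GZIP,
 *         ParquetProperties.WriterVersion.PARQUET_1_0,
 *         false);
 * writer.writeRow(Collections.singletonList((Object) "a value"));
 * writer.close();
 * }</pre>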
*/
public class SnowflakeParquetWriter implements AutoCloseable {
  private final InternalParquetRecordWriter<List<Object>> writer;
private final CodecFactory codecFactory;
  // Optional cap on the max number of row groups allowed per file; if it is exceeded, we throw
  private final Optional<Integer> maxRowGroups;
private final ParquetProperties.WriterVersion writerVersion;
private final boolean enableDictionaryEncoding;
private long rowsWritten = 0;
  /**
   * Creates a Snowflake-specific Parquet writer.
   *
   * @param stream output stream the Parquet file is written to
   * @param schema row schema
   * @param extraMetaData extra metadata to store in the file footer
   * @param channelName name of the channel that is using the writer
   * @param maxChunkSizeInBytes maximum chunk size in bytes
   * @param maxRowGroups optional cap on the max number of row groups allowed per file; if it is
   *     exceeded, an exception is thrown
   * @param bdecParquetCompression compression codec to use for the data pages
   * @param writerVersion Parquet writer version
   * @param enableDictionaryEncoding whether dictionary encoding is enabled
   * @throws IOException if the underlying file writer cannot be started
   */
public SnowflakeParquetWriter(
ByteArrayOutputStream stream,
MessageType schema,
      Map<String, String> extraMetaData,
String channelName,
long maxChunkSizeInBytes,
      Optional<Integer> maxRowGroups,
Constants.BdecParquetCompression bdecParquetCompression,
ParquetProperties.WriterVersion writerVersion,
boolean enableDictionaryEncoding)
throws IOException {
OutputFile file = new ByteArrayOutputFile(stream, maxChunkSizeInBytes);
this.maxRowGroups = maxRowGroups;
this.writerVersion = writerVersion;
this.enableDictionaryEncoding = enableDictionaryEncoding;
ParquetProperties encodingProps = createParquetProperties();
Configuration conf = new Configuration();
    WriteSupport<List<Object>> writeSupport =
new SnowflakeWriteSupport(schema, extraMetaData, channelName);
WriteSupport.WriteContext writeContext = writeSupport.init(conf);
ParquetFileWriter fileWriter =
new ParquetFileWriter(
file,
schema,
ParquetFileWriter.Mode.CREATE,
Constants.MAX_BLOB_SIZE_IN_BYTES * 2,
ParquetWriter.MAX_PADDING_SIZE_DEFAULT,
encodingProps.getColumnIndexTruncateLength(),
encodingProps.getStatisticsTruncateLength(),
encodingProps.getPageWriteChecksumEnabled(),
(FileEncryptionProperties) null);
fileWriter.start();
    /*
    Internally, the Parquet writer initialises CodecFactory with the configured page size.
    We set the page size to the max chunk size, which is generally quite big.
    CodecFactory allocates a byte buffer of that size on the heap during initialisation.
    If we use the Parquet writer for buffering, there will be one writer per channel on each flush.
    The memory is allocated for each writer up front, even if nothing is ever written with that writer,
    which is the case when Parquet writer buffering is enabled.
    Hence, to avoid huge memory allocations, we initialise CodecFactory ourselves with `ParquetWriter.DEFAULT_PAGE_SIZE`, as normally happens.
    To get access to this internal initialisation, this class has to live in the parquet.hadoop package.
    */
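    // Illustrative contrast (assumed values): ParquetWriter.DEFAULT_PAGE_SIZE is 1 MB, whereas the
    // max chunk size passed to this writer can be hundreds of MB; pre-allocating the latter for
    // every open channel's writer would multiply heap usage by the number of channels.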
codecFactory = new CodecFactory(conf, ParquetWriter.DEFAULT_PAGE_SIZE);
@SuppressWarnings("deprecation") // Parquet does not support the new one now
CodecFactory.BytesCompressor compressor =
codecFactory.getCompressor(bdecParquetCompression.getCompressionCodec());
writer =
new InternalParquetRecordWriter<>(
fileWriter,
writeSupport,
schema,
writeContext.getExtraMetaData(),
Constants.MAX_BLOB_SIZE_IN_BYTES * 2,
compressor,
true,
encodingProps);
}
/** @return List of row counts per block stored in the parquet footer */
  public List<Long> getRowCountsFromFooter() {
if (maxRowGroups.isPresent() && writer.getFooter().getBlocks().size() > maxRowGroups.get()) {
throw new SFException(
ErrorCode.INTERNAL_ERROR,
String.format(
"Expecting only %d row group in the parquet file, but found %d",
maxRowGroups.get(), writer.getFooter().getBlocks().size()));
}
    final List<Long> blockRowCounts = new ArrayList<>();
for (BlockMetaData metadata : writer.getFooter().getBlocks()) {
blockRowCounts.add(metadata.getRowCount());
}
return blockRowCounts;
}
/** @return extended metadata size (page index size + bloom filter size) */
public long getExtendedMetadataSize() {
long extendedMetadataSize = 0;
for (BlockMetaData metadata : writer.getFooter().getBlocks()) {
for (ColumnChunkMetaData column : metadata.getColumns()) {
extendedMetadataSize +=
(column.getColumnIndexReference() != null
? column.getColumnIndexReference().getLength()
: 0)
+ (column.getOffsetIndexReference() != null
? column.getOffsetIndexReference().getLength()
: 0)
+ (column.getBloomFilterLength() == -1 ? 0 : column.getBloomFilterLength());
}
}
return extendedMetadataSize;
}
public void writeRow(List