/**
* Resides in the parquet package because it uses {@link InternalParquetRecordWriter} and {@link
* CodecFactory} that are package private.
*/
public class BdecParquetWriter implements AutoCloseable {
private final InternalParquetRecordWriter<List<Object>> writer;
private final CodecFactory codecFactory;
private long rowsWritten = 0;
/**
* Creates a BDEC specific parquet writer.
*
* <p>An illustrative usage sketch follows the constructor below.
*
* @param stream output stream
* @param schema row schema
* @param extraMetaData extra metadata
* @param channelName name of the channel that is using the writer
* @param maxChunkSizeInBytes maximum chunk size in bytes
* @param bdecParquetCompression compression codec used for the parquet data
* @throws IOException if the parquet writer cannot be set up
*/
public BdecParquetWriter(
ByteArrayOutputStream stream,
MessageType schema,
Map<String, String> extraMetaData,
String channelName,
long maxChunkSizeInBytes,
Constants.BdecParquetCompression bdecParquetCompression)
throws IOException {
OutputFile file = new ByteArrayOutputFile(stream, maxChunkSizeInBytes);
ParquetProperties encodingProps = createParquetProperties();
Configuration conf = new Configuration();
WriteSupport<List<Object>> writeSupport =
new BdecWriteSupport(schema, extraMetaData, channelName);
WriteSupport.WriteContext writeContext = writeSupport.init(conf);
ParquetFileWriter fileWriter =
new ParquetFileWriter(
file,
schema,
ParquetFileWriter.Mode.CREATE,
Constants.MAX_BLOB_SIZE_IN_BYTES * 2,
ParquetWriter.MAX_PADDING_SIZE_DEFAULT,
encodingProps.getColumnIndexTruncateLength(),
encodingProps.getStatisticsTruncateLength(),
encodingProps.getPageWriteChecksumEnabled(),
(FileEncryptionProperties) null);
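// start() writes the parquet magic bytes at the beginning of the output file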
fileWriter.start();
/*
Internally, the parquet writer initialises CodecFactory with the configured page size.
We set the page size to the max chunk size, which is generally quite large.
CodecFactory allocates a byte buffer of that size on the heap during initialisation.
If we use the parquet writer for buffering, there is one writer per channel on each flush,
and that memory is allocated up front for every writer even when nothing is written with it,
which is exactly the case when parquet writer buffering is enabled.
Hence, to avoid huge memory allocations, we initialise CodecFactory ourselves with
`ParquetWriter.DEFAULT_PAGE_SIZE`, as would normally happen.
Needing access to this internal initialisation is why the BdecParquetWriter class has to live
in the parquet.hadoop package.
*/
codecFactory = new CodecFactory(conf, ParquetWriter.DEFAULT_PAGE_SIZE);
@SuppressWarnings("deprecation") // Parquet does not support the new one now
CodecFactory.BytesCompressor compressor =
codecFactory.getCompressor(bdecParquetCompression.getCompressionCodec());
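// The record writer buffers incoming rows in memory and flushes them to fileWriter as row groups.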
writer =
new InternalParquetRecordWriter<>(
fileWriter,
writeSupport,
schema,
writeContext.getExtraMetaData(),
Constants.MAX_BLOB_SIZE_IN_BYTES * 2, // row group size
compressor,
true, // validating: check written records against the schema
encodingProps);
}
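/*
Illustrative usage sketch (not part of the original class). It shows how a caller might drive
this writer; the schema and row values, the channel name, the 16 MB chunk size and the GZIP
codec below are assumptions for the example only.

ByteArrayOutputStream stream = new ByteArrayOutputStream();
BdecParquetWriter writer =
    new BdecParquetWriter(
        stream,
        schema,
        new HashMap<>(),
        "example_channel",
        16 * 1024 * 1024,
        Constants.BdecParquetCompression.GZIP);
writer.writeRow(Arrays.asList(1, "a")); // one value per column of the schema
writer.close(); // flushes buffered rows and writes the parquet footer
List<Long> rowCounts = writer.getRowCountsFromFooter();
*/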
/** @return List of row counts per block stored in the parquet footer */
public List<Long> getRowCountsFromFooter() {
final List<Long> blockRowCounts = new ArrayList<>();
for (BlockMetaData metadata : writer.getFooter().getBlocks()) {
blockRowCounts.add(metadata.getRowCount());
}
return blockRowCounts;
}
public void writeRow(List<Object> row) {