/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.parquet.hadoop;

import java.io.Closeable;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.bytes.ByteBufferAllocator;
import org.apache.parquet.column.ParquetProperties;
import org.apache.parquet.column.ParquetProperties.WriterVersion;
import org.apache.parquet.compression.CompressionCodecFactory;
import org.apache.parquet.conf.HadoopParquetConfiguration;
import org.apache.parquet.conf.ParquetConfiguration;
import org.apache.parquet.crypto.FileEncryptionProperties;
import org.apache.parquet.hadoop.api.WriteSupport;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.hadoop.util.ConfigurationUtil;
import org.apache.parquet.hadoop.util.HadoopOutputFile;
import org.apache.parquet.io.OutputFile;
import org.apache.parquet.schema.MessageType;

/**
 * Write records to a Parquet file.
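 *
 * <p>A minimal usage sketch, assuming the {@code Group}-based {@code ExampleParquetWriter}
 * shipped in parquet-hadoop's example module:
 *
 * <pre>{@code
 * MessageType schema = MessageTypeParser.parseMessageType(
 *     "message point { required int32 x; required int32 y; }");
 *
 * try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(new Path("points.parquet"))
 *     .withType(schema)
 *     .withConf(new Configuration())
 *     .withCompressionCodec(CompressionCodecName.SNAPPY)
 *     .build()) {
 *   Group point = new SimpleGroupFactory(schema).newGroup();
 *   point.add("x", 1);
 *   point.add("y", 2);
 *   writer.write(point);
 * }
 * }</pre>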
 */
public class ParquetWriter<T> implements Closeable {

  public static final int DEFAULT_BLOCK_SIZE = 128 * 1024 * 1024;
  public static final int DEFAULT_PAGE_SIZE = ParquetProperties.DEFAULT_PAGE_SIZE;
  public static final CompressionCodecName DEFAULT_COMPRESSION_CODEC_NAME = CompressionCodecName.UNCOMPRESSED;
  public static final boolean DEFAULT_IS_DICTIONARY_ENABLED = ParquetProperties.DEFAULT_IS_DICTIONARY_ENABLED;
  public static final boolean DEFAULT_IS_VALIDATING_ENABLED = false;
  public static final WriterVersion DEFAULT_WRITER_VERSION = ParquetProperties.DEFAULT_WRITER_VERSION;

  public static final String OBJECT_MODEL_NAME_PROP = "writer.model.name";

  // max size (bytes) to write as padding and the min size of a row group
  public static final int MAX_PADDING_SIZE_DEFAULT = 8 * 1024 * 1024; // 8MB

  private final InternalParquetRecordWriter<T> writer;
  private final CompressionCodecFactory codecFactory;

  /**
   * Create a new ParquetWriter.
   * (with dictionary encoding enabled and validation off)
   *
   * @param file                 the file to create
   * @param writeSupport         the implementation to write a record to a RecordConsumer
   * @param compressionCodecName the compression codec to use
   * @param blockSize            the block size threshold
   * @param pageSize             the page size threshold
   * @throws IOException if there is an error while writing
   * @deprecated will be removed in 2.0.0
   */
  @Deprecated
  public ParquetWriter(
      Path file,
      WriteSupport<T> writeSupport,
      CompressionCodecName compressionCodecName,
      int blockSize,
      int pageSize)
      throws IOException {
    this(
        file,
        writeSupport,
        compressionCodecName,
        blockSize,
        pageSize,
        DEFAULT_IS_DICTIONARY_ENABLED,
        DEFAULT_IS_VALIDATING_ENABLED);
  }

  /**
   * Create a new ParquetWriter.
   *
   * @param file                 the file to create
   * @param writeSupport         the implementation to write a record to a RecordConsumer
   * @param compressionCodecName the compression codec to use
   * @param blockSize            the block size threshold
   * @param pageSize             the page size threshold (both data and dictionary)
   * @param enableDictionary     to turn dictionary encoding on
   * @param validating           to turn on validation using the schema
   * @throws IOException if there is an error while writing
   * @deprecated will be removed in 2.0.0
   */
  @Deprecated
  public ParquetWriter(
      Path file,
      WriteSupport<T> writeSupport,
      CompressionCodecName compressionCodecName,
      int blockSize,
      int pageSize,
      boolean enableDictionary,
      boolean validating)
      throws IOException {
    this(file, writeSupport, compressionCodecName, blockSize, pageSize, pageSize, enableDictionary, validating);
  }

  /**
   * Create a new ParquetWriter.
   *
   * @param file                 the file to create
   * @param writeSupport         the implementation to write a record to a RecordConsumer
   * @param compressionCodecName the compression codec to use
   * @param blockSize            the block size threshold
   * @param pageSize             the page size threshold
   * @param dictionaryPageSize   the page size threshold for the dictionary pages
   * @param enableDictionary     to turn dictionary encoding on
   * @param validating           to turn on validation using the schema
   * @throws IOException if there is an error while writing
   * @deprecated will be removed in 2.0.0
   */
  @Deprecated
  public ParquetWriter(
      Path file,
      WriteSupport<T> writeSupport,
      CompressionCodecName compressionCodecName,
      int blockSize,
      int pageSize,
      int dictionaryPageSize,
      boolean enableDictionary,
      boolean validating)
      throws IOException {
    this(
        file,
        writeSupport,
        compressionCodecName,
        blockSize,
        pageSize,
        dictionaryPageSize,
        enableDictionary,
        validating,
        DEFAULT_WRITER_VERSION);
  }

  /**
   * Create a new ParquetWriter.
   *
   * <p>Directly instantiates a Hadoop {@link org.apache.hadoop.conf.Configuration} which reads
   * configuration from the classpath.
   *
   * @param file                 the file to create
   * @param writeSupport         the implementation to write a record to a RecordConsumer
   * @param compressionCodecName the compression codec to use
   * @param blockSize            the block size threshold
   * @param pageSize             the page size threshold
   * @param dictionaryPageSize   the page size threshold for the dictionary pages
   * @param enableDictionary     to turn dictionary encoding on
   * @param validating           to turn on validation using the schema
   * @param writerVersion        version of parquetWriter from {@link ParquetProperties.WriterVersion}
   * @throws IOException if there is an error while writing
   * @deprecated will be removed in 2.0.0
   */
  @Deprecated
  public ParquetWriter(
      Path file,
      WriteSupport<T> writeSupport,
      CompressionCodecName compressionCodecName,
      int blockSize,
      int pageSize,
      int dictionaryPageSize,
      boolean enableDictionary,
      boolean validating,
      WriterVersion writerVersion)
      throws IOException {
    this(
        file,
        writeSupport,
        compressionCodecName,
        blockSize,
        pageSize,
        dictionaryPageSize,
        enableDictionary,
        validating,
        writerVersion,
        new Configuration());
  }

  /**
   * Create a new ParquetWriter.
   *
   * @param file                 the file to create
   * @param writeSupport         the implementation to write a record to a RecordConsumer
   * @param compressionCodecName the compression codec to use
   * @param blockSize            the block size threshold
   * @param pageSize             the page size threshold
   * @param dictionaryPageSize   the page size threshold for the dictionary pages
   * @param enableDictionary     to turn dictionary encoding on
   * @param validating           to turn on validation using the schema
   * @param writerVersion        version of parquetWriter from {@link ParquetProperties.WriterVersion}
   * @param conf                 Hadoop configuration to use while accessing the filesystem
   * @throws IOException if there is an error while writing
   * @deprecated will be removed in 2.0.0
   */
  @Deprecated
  public ParquetWriter(
      Path file,
      WriteSupport<T> writeSupport,
      CompressionCodecName compressionCodecName,
      int blockSize,
      int pageSize,
      int dictionaryPageSize,
      boolean enableDictionary,
      boolean validating,
      WriterVersion writerVersion,
      Configuration conf)
      throws IOException {
    this(
        file,
        ParquetFileWriter.Mode.CREATE,
        writeSupport,
        compressionCodecName,
        blockSize,
        pageSize,
        dictionaryPageSize,
        enableDictionary,
        validating,
        writerVersion,
        conf);
  }

  /**
   * Create a new ParquetWriter.
   *
   * @param file                 the file to create
   * @param mode                 file creation mode
   * @param writeSupport         the implementation to write a record to a RecordConsumer
   * @param compressionCodecName the compression codec to use
   * @param blockSize            the block size threshold
   * @param pageSize             the page size threshold
   * @param dictionaryPageSize   the page size threshold for the dictionary pages
   * @param enableDictionary     to turn dictionary encoding on
   * @param validating           to turn on validation using the schema
   * @param writerVersion        version of parquetWriter from {@link ParquetProperties.WriterVersion}
   * @param conf                 Hadoop configuration to use while accessing the filesystem
   * @throws IOException if there is an error while writing
   * @deprecated will be removed in 2.0.0
   */
  @Deprecated
  public ParquetWriter(
      Path file,
      ParquetFileWriter.Mode mode,
      WriteSupport<T> writeSupport,
      CompressionCodecName compressionCodecName,
      int blockSize,
      int pageSize,
      int dictionaryPageSize,
      boolean enableDictionary,
      boolean validating,
      WriterVersion writerVersion,
      Configuration conf)
      throws IOException {
    this(
        HadoopOutputFile.fromPath(file, conf),
        mode,
        writeSupport,
        compressionCodecName,
        blockSize,
        validating,
        conf,
        MAX_PADDING_SIZE_DEFAULT,
        ParquetProperties.builder()
            .withPageSize(pageSize)
            .withDictionaryPageSize(dictionaryPageSize)
            .withDictionaryEncoding(enableDictionary)
            .withWriterVersion(writerVersion)
            .build(),
        null);
  }

  /**
   * Create a new ParquetWriter. The default block size is 128 MB. The default
   * page size is 1 MB. Default compression is no compression. Dictionary encoding is disabled.
   *
   * @param file         the file to create
   * @param writeSupport the implementation to write a record to a RecordConsumer
   * @throws IOException if there is an error while writing
   * @deprecated will be removed in 2.0.0
   */
  @Deprecated
  public ParquetWriter(Path file, WriteSupport<T> writeSupport) throws IOException {
    this(file, writeSupport, DEFAULT_COMPRESSION_CODEC_NAME, DEFAULT_BLOCK_SIZE, DEFAULT_PAGE_SIZE);
  }

  @Deprecated
  public ParquetWriter(Path file, Configuration conf, WriteSupport<T> writeSupport) throws IOException {
    this(
        file,
        writeSupport,
        DEFAULT_COMPRESSION_CODEC_NAME,
        DEFAULT_BLOCK_SIZE,
        DEFAULT_PAGE_SIZE,
        DEFAULT_PAGE_SIZE,
        DEFAULT_IS_DICTIONARY_ENABLED,
        DEFAULT_IS_VALIDATING_ENABLED,
        DEFAULT_WRITER_VERSION,
        conf);
  }
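  // Descriptive note (not in the original source): the package-private constructors
  // below form a delegation chain. The Hadoop Configuration variant wraps its conf in
  // a HadoopParquetConfiguration, the ParquetConfiguration variant builds a default
  // CodecFactory, and both end in the primary constructor that opens the file.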
  ParquetWriter(
      OutputFile file,
      ParquetFileWriter.Mode mode,
      WriteSupport<T> writeSupport,
      CompressionCodecName compressionCodecName,
      long rowGroupSize,
      boolean validating,
      Configuration conf,
      int maxPaddingSize,
      ParquetProperties encodingProps,
      FileEncryptionProperties encryptionProperties)
      throws IOException {
    this(
        file,
        mode,
        writeSupport,
        compressionCodecName,
        rowGroupSize,
        validating,
        new HadoopParquetConfiguration(conf),
        maxPaddingSize,
        encodingProps,
        encryptionProperties);
  }

  ParquetWriter(
      OutputFile file,
      ParquetFileWriter.Mode mode,
      WriteSupport<T> writeSupport,
      CompressionCodecName compressionCodecName,
      long rowGroupSize,
      boolean validating,
      ParquetConfiguration conf,
      int maxPaddingSize,
      ParquetProperties encodingProps,
      FileEncryptionProperties encryptionProperties)
      throws IOException {
    this(
        file,
        mode,
        writeSupport,
        compressionCodecName,
        new CodecFactory(conf, encodingProps.getPageSizeThreshold()),
        rowGroupSize,
        validating,
        conf,
        maxPaddingSize,
        encodingProps,
        encryptionProperties);
  }

  ParquetWriter(
      OutputFile file,
      ParquetFileWriter.Mode mode,
      WriteSupport<T> writeSupport,
      CompressionCodecName compressionCodecName,
      CompressionCodecFactory codecFactory,
      long rowGroupSize,
      boolean validating,
      ParquetConfiguration conf,
      int maxPaddingSize,
      ParquetProperties encodingProps,
      FileEncryptionProperties encryptionProperties)
      throws IOException {
    WriteSupport.WriteContext writeContext = writeSupport.init(conf);
    MessageType schema = writeContext.getSchema();

    // encryptionProperties could be built from the implementation of EncryptionPropertiesFactory when it is
    // attached.
    if (encryptionProperties == null) {
      encryptionProperties = EncryptionPropertiesHelper.createEncryptionProperties(conf, file, writeContext);
    }

    ParquetFileWriter fileWriter = new ParquetFileWriter(
        file, schema, mode, rowGroupSize, maxPaddingSize, encryptionProperties, encodingProps);
    fileWriter.start();

    this.codecFactory = codecFactory;
    CompressionCodecFactory.BytesInputCompressor compressor = codecFactory.getCompressor(compressionCodecName);

    final Map<String, String> extraMetadata;
    if (encodingProps.getExtraMetaData() == null
        || encodingProps.getExtraMetaData().isEmpty()) {
      extraMetadata = writeContext.getExtraMetaData();
    } else {
      extraMetadata = new HashMap<>(writeContext.getExtraMetaData());

      encodingProps.getExtraMetaData().forEach((metadataKey, metadataValue) -> {
        if (metadataKey.equals(OBJECT_MODEL_NAME_PROP)) {
          throw new IllegalArgumentException(
              "Cannot overwrite metadata key " + OBJECT_MODEL_NAME_PROP + ". Please use another key name.");
        }

        if (extraMetadata.put(metadataKey, metadataValue) != null) {
          throw new IllegalArgumentException(
              "Duplicate metadata key " + metadataKey + ". Please use another key name.");
        }
      });
    }

    this.writer = new InternalParquetRecordWriter<T>(
        fileWriter, writeSupport, schema, extraMetadata, rowGroupSize, compressor, validating, encodingProps);
  }

  public void write(T object) throws IOException {
    try {
      writer.write(object);
    } catch (InterruptedException e) {
      throw new IOException(e);
    }
  }

  @Override
  public void close() throws IOException {
    try {
      writer.close();
    } catch (InterruptedException e) {
      throw new IOException(e);
    } finally {
      // release after the writer closes in case it is used for a last flush
      codecFactory.release();
    }
  }

  /**
   * @return the ParquetMetadata written to the (closed) file.
   */
  public ParquetMetadata getFooter() {
    return writer.getFooter();
  }

  /**
   * @return the total size of data written to the file and buffered in memory
   */
  public long getDataSize() {
    return writer.getDataSize();
  }

  /**
   * An abstract builder class for ParquetWriter instances.
   *
   * <p>Object models should extend this builder to provide writer configuration
   * options.
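   *
   * <p>An illustrative sketch of such a subclass; {@code StringWriteSupport} is a
   * hypothetical {@link WriteSupport} implementation for single-string records:
   *
   * <pre>{@code
   * public static class StringBuilder extends ParquetWriter.Builder<String, StringBuilder> {
   *   public StringBuilder(Path path) {
   *     super(path);
   *   }
   *
   *   @Override
   *   protected StringBuilder self() {
   *     return this;
   *   }
   *
   *   @Override
   *   protected WriteSupport<String> getWriteSupport(Configuration conf) {
   *     return new StringWriteSupport(); // hypothetical WriteSupport
   *   }
   * }
   * }</pre>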
   *
   * @param <T>    The type of objects written by the constructed ParquetWriter.
   * @param <SELF> The type of this builder that is returned by builder methods
   */
  public abstract static class Builder<T, SELF extends Builder<T, SELF>> {
    private OutputFile file = null;
    private Path path = null;
    private FileEncryptionProperties encryptionProperties = null;
    private ParquetConfiguration conf = null;
    private ParquetFileWriter.Mode mode;
    private CompressionCodecFactory codecFactory = null;

    private CompressionCodecName codecName = DEFAULT_COMPRESSION_CODEC_NAME;
    private long rowGroupSize = DEFAULT_BLOCK_SIZE;
    private int maxPaddingSize = MAX_PADDING_SIZE_DEFAULT;
    private boolean enableValidation = DEFAULT_IS_VALIDATING_ENABLED;
    private ParquetProperties.Builder encodingPropsBuilder = ParquetProperties.builder();

    protected Builder(Path path) {
      this.path = path;
    }

    protected Builder(OutputFile path) {
      this.file = path;
    }

    /**
     * @return this as the correct subclass of ParquetWriter.Builder.
     */
    protected abstract SELF self();

    /**
     * @param conf a configuration
     * @return an appropriate WriteSupport for the object model.
     * @deprecated Use {@link #getWriteSupport(ParquetConfiguration)} instead
     */
    @Deprecated
    protected abstract WriteSupport<T> getWriteSupport(Configuration conf);

    /**
     * @param conf a configuration
     * @return an appropriate WriteSupport for the object model.
     */
    protected WriteSupport<T> getWriteSupport(ParquetConfiguration conf) {
      return getWriteSupport(ConfigurationUtil.createHadoopConfiguration(conf));
    }

    /**
     * Set the {@link Configuration} used by the constructed writer.
     *
     * @param conf a {@code Configuration}
     * @return this builder for method chaining.
     */
    public SELF withConf(Configuration conf) {
      this.conf = new HadoopParquetConfiguration(conf);
      return self();
    }

    /**
     * Set the {@link ParquetConfiguration} used by the constructed writer.
     *
     * @param conf a {@code ParquetConfiguration}
     * @return this builder for method chaining.
     */
    public SELF withConf(ParquetConfiguration conf) {
      this.conf = conf;
      return self();
    }

    /**
     * Set the {@link ParquetFileWriter.Mode write mode} used when creating the
     * backing file for this writer.
     *
     * @param mode a {@code ParquetFileWriter.Mode}
     * @return this builder for method chaining.
     */
    public SELF withWriteMode(ParquetFileWriter.Mode mode) {
      this.mode = mode;
      return self();
    }

    /**
     * Set the {@link CompressionCodecName compression codec} used by the
     * constructed writer.
     *
     * @param codecName a {@code CompressionCodecName}
     * @return this builder for method chaining.
     */
    public SELF withCompressionCodec(CompressionCodecName codecName) {
      this.codecName = codecName;
      return self();
    }

    /**
     * Set the {@link CompressionCodecFactory codec factory} used by the
     * constructed writer.
     *
     * @param codecFactory a {@link CompressionCodecFactory}
     * @return this builder for method chaining.
     */
    public SELF withCodecFactory(CompressionCodecFactory codecFactory) {
      this.codecFactory = codecFactory;
      return self();
    }

    /**
     * Set the {@link FileEncryptionProperties file encryption properties} used by the
     * constructed writer.
     *
     * @param encryptionProperties a {@code FileEncryptionProperties}
     * @return this builder for method chaining.
     */
    public SELF withEncryption(FileEncryptionProperties encryptionProperties) {
      this.encryptionProperties = encryptionProperties;
      return self();
    }

    /**
     * Set the Parquet format row group size used by the constructed writer.
     *
     * @param rowGroupSize an integer size in bytes
     * @return this builder for method chaining.
     * @deprecated Use {@link #withRowGroupSize(long)} instead
     */
    @Deprecated
    public SELF withRowGroupSize(int rowGroupSize) {
      return withRowGroupSize((long) rowGroupSize);
    }

    /**
     * Set the Parquet format row group size used by the constructed writer.
     *
     * @param rowGroupSize an integer size in bytes
     * @return this builder for method chaining.
     */
    public SELF withRowGroupSize(long rowGroupSize) {
      this.rowGroupSize = rowGroupSize;
      return self();
    }

    /**
     * Set the Parquet format page size used by the constructed writer.
     *
     * @param pageSize an integer size in bytes
     * @return this builder for method chaining.
     */
    public SELF withPageSize(int pageSize) {
      encodingPropsBuilder.withPageSize(pageSize);
      return self();
    }

    /**
     * Sets the Parquet format page row count limit used by the constructed writer.
     *
     * @param rowCount limit for the number of rows stored in a page
     * @return this builder for method chaining
     */
    public SELF withPageRowCountLimit(int rowCount) {
      encodingPropsBuilder.withPageRowCountLimit(rowCount);
      return self();
    }

    /**
     * Set the Parquet format dictionary page size used by the constructed
     * writer.
     *
     * @param dictionaryPageSize an integer size in bytes
     * @return this builder for method chaining.
     */
    public SELF withDictionaryPageSize(int dictionaryPageSize) {
      encodingPropsBuilder.withDictionaryPageSize(dictionaryPageSize);
      return self();
    }

    /**
     * Set the maximum amount of padding, in bytes, that will be used to align
     * row groups with blocks in the underlying filesystem. If the underlying
     * filesystem is not a block filesystem like HDFS, this has no effect.
     *
     * @param maxPaddingSize an integer size in bytes
     * @return this builder for method chaining.
     */
    public SELF withMaxPaddingSize(int maxPaddingSize) {
      this.maxPaddingSize = maxPaddingSize;
      return self();
    }

    /**
     * Enables dictionary encoding for the constructed writer.
     *
     * @return this builder for method chaining.
     */
    public SELF enableDictionaryEncoding() {
      encodingPropsBuilder.withDictionaryEncoding(true);
      return self();
    }

    /**
     * Enable or disable dictionary encoding for the constructed writer.
     *
     * @param enableDictionary whether dictionary encoding should be enabled
     * @return this builder for method chaining.
     */
    public SELF withDictionaryEncoding(boolean enableDictionary) {
      encodingPropsBuilder.withDictionaryEncoding(enableDictionary);
      return self();
    }

    /**
     * Enable or disable BYTE_STREAM_SPLIT encoding for floating-point columns.
     *
     * @param enableByteStreamSplit whether BYTE_STREAM_SPLIT encoding should be enabled
     * @return this builder for method chaining.
     */
    public SELF withByteStreamSplitEncoding(boolean enableByteStreamSplit) {
      encodingPropsBuilder.withByteStreamSplitEncoding(enableByteStreamSplit);
      return self();
    }

    /**
     * Enable or disable dictionary encoding of the specified column for the constructed writer.
     *
     * @param columnPath       the path of the column (dot-string)
     * @param enableDictionary whether dictionary encoding should be enabled
     * @return this builder for method chaining.
     */
    public SELF withDictionaryEncoding(String columnPath, boolean enableDictionary) {
      encodingPropsBuilder.withDictionaryEncoding(columnPath, enableDictionary);
      return self();
    }

    /**
     * Enables validation for the constructed writer.
     *
     * @return this builder for method chaining.
     */
    public SELF enableValidation() {
      this.enableValidation = true;
      return self();
    }

    /**
     * Enable or disable validation for the constructed writer.
     *
     * @param enableValidation whether validation should be enabled
     * @return this builder for method chaining.
     */
    public SELF withValidation(boolean enableValidation) {
      this.enableValidation = enableValidation;
      return self();
    }

    /**
     * Set the {@link WriterVersion format version} used by the constructed
     * writer.
     *
     * @param version a {@code WriterVersion}
     * @return this builder for method chaining.
     */
    public SELF withWriterVersion(WriterVersion version) {
      encodingPropsBuilder.withWriterVersion(version);
      return self();
    }

    /**
     * Enables writing page level checksums for the constructed writer.
     *
     * @return this builder for method chaining.
     */
    public SELF enablePageWriteChecksum() {
      encodingPropsBuilder.withPageWriteChecksumEnabled(true);
      return self();
    }

    /**
     * Enables or disables writing page level checksums for the constructed writer.
     *
     * @param enablePageWriteChecksum whether page checksums should be written out
     * @return this builder for method chaining.
     */
    public SELF withPageWriteChecksumEnabled(boolean enablePageWriteChecksum) {
      encodingPropsBuilder.withPageWriteChecksumEnabled(enablePageWriteChecksum);
      return self();
    }

    /**
     * Set max Bloom filter bytes for related columns.
     *
     * @param maxBloomFilterBytes the max bytes of a Bloom filter bitset for a column
     * @return this builder for method chaining
     */
    public SELF withMaxBloomFilterBytes(int maxBloomFilterBytes) {
      encodingPropsBuilder.withMaxBloomFilterBytes(maxBloomFilterBytes);
      return self();
    }

    /**
     * Sets the NDV (number of distinct values) for the specified column.
     *
     * @param columnPath the path of the column (dot-string)
     * @param ndv        the NDV of the column
     * @return this builder for method chaining.
     */
    public SELF withBloomFilterNDV(String columnPath, long ndv) {
      encodingPropsBuilder.withBloomFilterNDV(columnPath, ndv);
      return self();
    }

    /**
     * Sets the false positive probability (FPP) of the Bloom filter for the specified column.
     *
     * @param columnPath the path of the column (dot-string)
     * @param fpp        the false positive probability
     * @return this builder for method chaining
     */
    public SELF withBloomFilterFPP(String columnPath, double fpp) {
      encodingPropsBuilder.withBloomFilterFPP(columnPath, fpp);
      return self();
    }

    /**
     * When the NDV (number of distinct values) for a specified column is not set, whether to use an
     * {@code AdaptiveBloomFilter} to automatically adjust the Bloom filter size according to
     * {@code parquet.bloom.filter.max.bytes}.
     *
     * @param enabled whether to use an adaptive Bloom filter
     * @return this builder for method chaining
     */
    public SELF withAdaptiveBloomFilterEnabled(boolean enabled) {
      encodingPropsBuilder.withAdaptiveBloomFilterEnabled(enabled);
      return self();
    }

    /**
     * When {@code AdaptiveBloomFilter} is enabled, sets how many Bloom filter candidates to use
     * for the specified column.
     *
     * @param columnPath the path of the column (dot-string)
     * @param number     the number of candidates
     * @return this builder for method chaining
     */
    public SELF withBloomFilterCandidateNumber(String columnPath, int number) {
      encodingPropsBuilder.withBloomFilterCandidatesNumber(columnPath, number);
      return self();
    }

    /**
     * Sets the Bloom filter enabled/disabled.
     *
     * @param enabled whether to write Bloom filters
     * @return this builder for method chaining
     */
    public SELF withBloomFilterEnabled(boolean enabled) {
      encodingPropsBuilder.withBloomFilterEnabled(enabled);
      return self();
    }

    /**
     * Sets the Bloom filter enabled/disabled for the specified column. If not set for the column
     * specifically, the default enabled/disabled state takes effect. See
     * {@link #withBloomFilterEnabled(boolean)}.
     *
     * @param columnPath the path of the column (dot-string)
     * @param enabled    whether to write a Bloom filter for the column
     * @return this builder for method chaining
     */
    public SELF withBloomFilterEnabled(String columnPath, boolean enabled) {
      encodingPropsBuilder.withBloomFilterEnabled(columnPath, enabled);
      return self();
    }

    /**
     * Sets the minimum number of rows to write before a page size check is done.
     *
     * @param min writes at least {@code min} rows before invoking a page size check
     * @return this builder for method chaining
     */
    public SELF withMinRowCountForPageSizeCheck(int min) {
      encodingPropsBuilder.withMinRowCountForPageSizeCheck(min);
      return self();
    }

    /**
     * Sets the maximum number of rows to write before a page size check is done.
     *
     * @param max makes a page size check after {@code max} rows have been written
     * @return this builder for method chaining
     */
    public SELF withMaxRowCountForPageSizeCheck(int max) {
      encodingPropsBuilder.withMaxRowCountForPageSizeCheck(max);
      return self();
    }

    /**
     * Sets the length to be used for truncating binary values in a binary column index.
     *
     * @param length the length to truncate to
     * @return this builder for method chaining
     */
    public SELF withColumnIndexTruncateLength(int length) {
      encodingPropsBuilder.withColumnIndexTruncateLength(length);
      return self();
    }

    /**
     * Sets the length which the min/max binary values in row groups are truncated to.
     *
     * @param length the length to truncate to
     * @return this builder for method chaining
     */
    public SELF withStatisticsTruncateLength(int length) {
      encodingPropsBuilder.withStatisticsTruncateLength(length);
      return self();
    }

    /**
     * Sets additional metadata entries to be included in the file footer.
     *
     * @param extraMetaData a Map of additional stringly-typed metadata entries
     * @return this builder for method chaining
     */
    public SELF withExtraMetaData(Map<String, String> extraMetaData) {
      encodingPropsBuilder.withExtraMetaData(extraMetaData);
      return self();
    }

    /**
     * Sets the ByteBuffer allocator instance to be used for allocating memory for writing.
     *
     * @param allocator the allocator instance
     * @return this builder for method chaining
     */
    public SELF withAllocator(ByteBufferAllocator allocator) {
      encodingPropsBuilder.withAllocator(allocator);
      return self();
    }

    /**
     * Set a property that will be available to the read path. For writers that use a Hadoop
     * configuration, this is the recommended way to add configuration values.
     *
     * @param property a String property name
     * @param value    a String property value
     * @return this builder for method chaining.
     */
    public SELF config(String property, String value) {
      if (conf == null) {
        conf = new HadoopParquetConfiguration();
      }
      conf.set(property, value);
      return self();
    }

    /**
     * Sets the statistics enabled/disabled for the specified column. All column statistics are enabled by default.
     *
     * @param columnPath the path of the column (dot-string)
     * @param enabled    whether to calculate statistics for the column
     * @return this builder for method chaining
     */
    public SELF withStatisticsEnabled(String columnPath, boolean enabled) {
      encodingPropsBuilder.withStatisticsEnabled(columnPath, enabled);
      return self();
    }

    /**
     * Sets whether statistics are enabled globally. When disabled, statistics will not be collected
     * for any column unless explicitly enabled for specific columns.
     *
     * @param enabled whether to collect statistics globally
     * @return this builder for method chaining
     */
    public SELF withStatisticsEnabled(boolean enabled) {
      encodingPropsBuilder.withStatisticsEnabled(enabled);
      return self();
    }

    /**
     * Sets the size statistics enabled/disabled for the specified column. All column size
     * statistics are enabled by default.
     *
     * @param columnPath the path of the column (dot-string)
     * @param enabled    whether to collect size statistics for the column
     * @return this builder for method chaining
     */
    public SELF withSizeStatisticsEnabled(String columnPath, boolean enabled) {
      encodingPropsBuilder.withSizeStatisticsEnabled(columnPath, enabled);
      return self();
    }

    /**
     * Sets whether size statistics are enabled globally. When disabled, size statistics will not be collected
     * for any column unless explicitly enabled for specific columns.
     *
     * @param enabled whether to collect size statistics globally
     * @return this builder for method chaining
     */
    public SELF withSizeStatisticsEnabled(boolean enabled) {
      encodingPropsBuilder.withSizeStatisticsEnabled(enabled);
      return self();
    }

    /**
     * Build a {@link ParquetWriter} with the accumulated configuration.
     *
     * @return a configured {@code ParquetWriter} instance.
     * @throws IOException if there is an error while creating the writer
     */
    public ParquetWriter<T> build() throws IOException {
      if (conf == null) {
        conf = new HadoopParquetConfiguration();
      }
      ParquetProperties encodingProps = encodingPropsBuilder.build();
      if (codecFactory == null) {
        codecFactory = new CodecFactory(conf, encodingProps.getPageSizeThreshold());
      }

      return new ParquetWriter<>(
          (file != null)
              ? file
              : HadoopOutputFile.fromPath(path, ConfigurationUtil.createHadoopConfiguration(conf)),
          mode,
          getWriteSupport(conf),
          codecName,
          codecFactory,
          rowGroupSize,
          enableValidation,
          conf,
          maxPaddingSize,
          encodingProps,
          encryptionProperties);
    }
  }
}




